OpenCL kernel 函数 该如何调试?

我接触 OpenCL 的时间不长,很多东西都是现学的,还有很多不懂的地方。
我写了个OpenCL的示例:在float型数组中找到最大值或最小值。
麻烦帮我看一下问题出在哪里,原因是什么?或者告诉我 kernel 函数该如何调试,比如怎样把它的输入和输出打印出来?

#include <stdio.h>
#include <stdlib.h>

#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif

#define MEM_SIZE (32)
#define MAX_SOURCE_SIZE (0x100000)

int main()
{
    cl_device_id device_id = NULL;
    cl_context context = NULL;
    cl_command_queue command_queue = NULL;
    cl_mem memobj[2] = { NULL };
    cl_program program = NULL;
    cl_kernel kernel = NULL;
    cl_platform_id platform_id = NULL;
    cl_uint ret_num_devices;
    cl_uint ret_num_platforms;
    cl_int ret;

    float array[MEM_SIZE];
    float result;
    printf("array:");
    for(int i = 0; i<MEM_SIZE ; i++)
    {
        array[i] = i;
        printf("%f ",array[i]);
    }
    printf("\n");
    FILE *fp;
    char fileName[] = "./findmax.cl";
    char *source_str;
    size_t source_size;

    /* 打开文件*/
    fp = fopen(fileName, "r");
    if (!fp) {
    fprintf(stderr, "Failed to load kernel.\n");
    exit(1);
    }
    source_str = (char*)malloc(MAX_SOURCE_SIZE);
    source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
    fclose( fp );

    /* 设备获取 */
    ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
    ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices);

    /* 上下文创建 */
    context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);

    /* 工作队列创建 */
    command_queue = clCreateCommandQueue(context, device_id, 0, &ret);

    /* 创建项目 */
    program = clCreateProgramWithSource(context, 1, (const char **)&source_str,
                                      (const size_t *)&source_size, &ret);

    /* 编译 */
    ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);

    /* 创建内核 */
    kernel = clCreateKernel(program, "findmax", &ret);

    /* 内存对象创建 */
    memobj[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,MEM_SIZE * sizeof(float), array, &ret);
    memobj[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,sizeof(float), NULL, &ret);


    /* 设置内核参数 */
    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)memobj[0]);
    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)memobj[1]);

    /* OpenCL内核执行 */
    size_t globalWorkSize = MEM_SIZE ;
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &globalWorkSize, NULL, 0, NULL, NULL);

    /* 读取Buffer */
    ret = clEnqueueReadBuffer(command_queue, memobj[1], CL_TRUE, 0,
                              sizeof(float),&result, 0, NULL, NULL);

    /* 結果显示 */
    printf("Result:%f\n",result);

    /* 終了処理 */
    ret = clFlush(command_queue);
    ret = clFinish(command_queue);
    ret = clReleaseKernel(kernel);
    ret = clReleaseProgram(program);
    ret = clReleaseMemObject(memobj[0]);
    ret = clReleaseMemObject(memobj[1]);
    ret = clReleaseCommandQueue(command_queue);
    ret = clReleaseContext(context);

    free(source_str);

    return 0;
}

kernel 代码(findmax.cl):

 /*
  * Stores the largest element of a[] in *result.
  *
  * a:      input values; assumed to hold get_global_size(0) elements, i.e.
  *         the kernel is launched with one work-item per element.
  * result: receives the maximum.
  *
  * Only work-item 0 scans the array. Letting every work-item write *result
  * (as a naive version would) is a data race, and reading a[gid + 1] in the
  * last work-item runs past the end of the buffer. A serial scan is slow for
  * large inputs but always correct; a parallel version needs a local-memory
  * reduction per work-group plus a second pass over the partial results.
  *
  * Debugging hint: OpenCL C 1.2 and later provide printf() inside kernels,
  * e.g. printf("gid=%d a=%f\n", (int)get_global_id(0), a[get_global_id(0)]);
  */
 __kernel void vector_add(global const float *a, global float *result)
 {
     /* Every other work-item has nothing to do. */
     if (get_global_id(0) != 0)
         return;

     size_t count = get_global_size(0);
     float best = a[0];

     for (size_t i = 1; i < count; i++)
         best = fmax(best, a[i]);

     *result = best;
 }

img