初学cuda,简单程序无输出结果

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include
#include
#include

typedef struct
{
BYTE r;
BYTE b;
BYTE g;
}RGB;

global void func(BYTE *gray, RGB img[][500], int width)
{
int i = threadIdx.x;
for (int j = 0; j < width; j++)
{
gray[(img[i][j].r * 38 + img[i][j].g * 75 + img[i][j].b * 15) >> 7]++;
}
}

int main()
{
BITMAPFILEHEADER fileHeader;
BITMAPINFOHEADER infoHeader;
FILE* pfin;
BYTE gray[256] = { 0 };
RGB img[256][256] = { 0 };

cudaError_t cudaStatus;

pfin = fopen("1.bmp", "rb");
//ReadtheBitmapfileheader;
fread(&fileHeader, sizeof(BITMAPFILEHEADER), 1, pfin);
//ReadtheBitmapinfoheader;
fread(&infoHeader, sizeof(BITMAPINFOHEADER), 1, pfin);
//the size of the picture
int size = infoHeader.biWidth*infoHeader.biHeight;
//read the file
fread(img, sizeof(RGB), size, pfin);
int width = infoHeader.biWidth;

BYTE *dev_gray;

cudaMalloc((void **)&dev_gray,width*sizeof(BYTE));

func<<<1,width>>>(dev_gray, img, width);

cudaMemcpy(gray, dev_gray, width*sizeof(BYTE),cudaMemcpyDeviceToHost);

cudaFree(dev_gray);

getchar();
return 0;

}

想要的结果是结果在gray数组中 但无结果 是内存的问题还是什么问题
应该如何修改呢

https://zhidao.baidu.com/question/552384161.html