在 CUDA 核函数的执行配置（<<<...>>>）中加入 cudaStream 参数后，编译时出现 "unrecognized token" 错误

核函数定义

 // Per-voxel kernel: for every element `pos` of a width x height x depth volume
 // it evaluates
 //     hu = 0.5 * (1 + (2 / PI) * atan(phi[pos] / epsilon))
 // and stores Hu_gpu[pos] = hu and I_gpu[pos] = hu * img_gpu[pos].
 // (The formula has the shape of a smoothed step function of phi -- presumably a
 // regularized Heaviside of a level-set function; confirm against the caller.)
 //
 // Expected launch layout (taken from the indexing below):
 //   blockDim.x -> x (column), one thread per column, so blockDim.x >= width
 //   gridDim.x  -> y (row)
 //   gridDim.y  -> z (slice)
 // No shared memory is used. Threads that fall outside width/height/depth exit
 // without touching memory, so an oversized launch is harmless.
 //
 // Parameters:
 //   Hu_gpu, I_gpu     output arrays, written at the flat index x + (y + z*height)*width
 //   img_gpu, phi_gpu  input arrays, read at the same flat index
 //   epsilon           divisor applied to phi before atan
 //   width, height     extents used to build the flat index
 //   depth             number of slices addressable from the given base pointers
 //
 // PI is expected to be defined elsewhere in the translation unit.
 __global__
void local_binary_fit_part1_kernel(
float *Hu_gpu,
float *I_gpu,
const float *img_gpu,
const float *phi_gpu,
const float epsilon,
const int width,
const int height,
const int depth
)
{
    const int x = threadIdx.x;
    const int y = blockIdx.x;
    const int z = blockIdx.y;

    // Bounds guard: protects against launches whose block/grid extents exceed
    // the volume extents passed in.
    if (x >= width || y >= height || z >= depth) {
        return;
    }

    // Build the flat index in size_t so large volumes do not overflow int.
    const size_t pos = (static_cast<size_t>(z) * height + y) * width + x;

    // Keep the whole expression in single precision: float literals and atanf
    // avoid the silent promotion to double caused by 0.5, 2 / PI and atan.
    const float two_over_pi = 2.0f / static_cast<float>(PI);
    const float hu = 0.5f * (1.0f + two_over_pi * atanf(phi_gpu[pos] / epsilon));

    Hu_gpu[pos] = hu;
    I_gpu[pos] = hu * img_gpu[pos];
}

在C函数中调用该核函数

    // Process the volume as two slabs along depth, each on its own stream:
    // slab 0 holds the first depth/2 slices, slab 1 holds the remainder.
    // For each slab: upload phi and img, then run the kernel on that slab.
    // NOTE(review): cudaMemcpyAsync presumably needs phi/img to be pinned host
    // memory to actually overlap with other work -- confirm how they are allocated.
    cudaStream_t stream0;
    cudaStream_t stream1;
    cudaStreamCreate(&stream0);
    cudaStreamCreate(&stream1);

    int div1 = depth / 2;
    int div2 = depth - div1;

    // Element counts and byte sizes of the two slabs; slab 1 starts
    // slab0Elems elements after the base of every array.
    const int slab0Elems = div1 * height * width;
    const int slab1Elems = div2 * height * width;
    const size_t slab0Bytes = slab0Elems * sizeof(float);
    const size_t slab1Bytes = slab1Elems * sizeof(float);

    // One block per (row, slice) pair; each block is launched with `width` threads.
    dim3 grid1(height, div1);
    dim3 grid2(height, div2);

    // Host-to-device uploads, issued alternately on the two streams.
    cudaMemcpyAsync(phi_gpu, phi, slab0Bytes, cudaMemcpyHostToDevice, stream0);
    cudaMemcpyAsync(phi_gpu + slab0Elems, phi + slab0Elems, slab1Bytes, cudaMemcpyHostToDevice, stream1);
    cudaMemcpyAsync(img_gpu, img, slab0Bytes, cudaMemcpyHostToDevice, stream0);
    cudaMemcpyAsync(img_gpu + slab0Elems, img + slab0Elems, slab1Bytes, cudaMemcpyHostToDevice, stream1);

    // Execution configuration: <<<grid, block, dynamic shared memory bytes, stream>>>.
    local_binary_fit_part1_kernel<<<grid1, width, 0, stream0>>>(
        Hu_gpu, I_gpu, img_gpu, phi_gpu,
        epsilon, width, height, depth);
    local_binary_fit_part1_kernel<<<grid2, width, 0, stream1>>>(
        Hu_gpu + slab0Elems, I_gpu + slab0Elems, img_gpu + slab0Elems, phi_gpu + slab0Elems,
        epsilon, width, height, depth);

    // Wait for both slabs to finish before releasing the streams.
    cudaStreamSynchronize(stream0);
    cudaStreamSynchronize(stream1);
    cudaStreamDestroy(stream0);
    cudaStreamDestroy(stream1);

编译时出现如下错误：
error : unrecognized token

如改成

 // Same launch with only grid and block given in the execution configuration;
 // the dynamic shared memory size and the stream are left at their defaults.
 local_binary_fit_part1_kernel <<<grid1, width>>>(Hu_gpu, I_gpu, img_gpu, phi_gpu, epsilon, width, height, depth);

则能编译通过，这是什么原因？

http://blog.csdn.net/a925907195/article/details/39500915