CUDA完成统计图像像素和

最近要写一个归一化互相关求两张图片相似度的项目,要用CUDA完成
在第一步统计像素和就卡住了。。
代码如下


//图像大小
__device__ __constant__ int d_Imgsize[2];

const int threadsPerBlock = 1024;

__device__ int d_partial_sum[1024];

__global__ void calTotalPixVal(uchar* input) {
    __shared__ int partialSum[threadsPerBlock];
    //确定索引
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    int tid = threadIdx.x;

    //传global memory数据到shared memory
    partialSum[tid] = s_sum[i];

    //传输同步
    __syncthreads();

    //在共享存储器中进行规约
    for (int stride = blockDim.x / 2; stride > 0; stride /= 2)
    {
        if (tid < stride)
            partialSum[tid] += partialSum[tid + stride];
        __syncthreads();
    }
    //将当前block的计算结果写回输出数组
    if (tid == 0)
        d_partial_sum[blockIdx.x] = partialSum[0];
    
}

uchar* ImgProcess(Mat m_srcImg,Mat s_srcImg) {

    int height = m_srcImg.rows;
    int width = m_srcImg.cols;

    //待测图和模板图UCHAR序列
    uchar* m_src = m_srcImg.data;
    uchar* s_src = s_srcImg.data;

    //图像大小
    int Imgsize[2] = { height,width };
    int memsize = height * width;

    //开辟显存
    cudaMalloc((void**)&d_m_src, memsize * sizeof(uchar));
    cudaMalloc((void**)&d_s_src, memsize * sizeof(uchar));
    cudaMalloc((void**)&s_sum,  sizeof(int));

    cudaMemcpyToSymbol(d_Imgsize, Imgsize, 2 * sizeof(int), cudaMemcpyHostToDevice);

    cudaMemcpy(d_m_src, m_src, memsize * sizeof(uchar), cudaMemcpyHostToDevice);
    cudaMemcpy(d_s_src, s_src, memsize * sizeof(uchar), cudaMemcpyHostToDevice);

    //sum接受像素和
    int sum[64] = { 0 };

    cudaMemcpyToSymbol(s_sum, sum,tile_nums * sizeof(int), cudaMemcpyHostToDevice);

    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    int numThreads = prop.maxThreadsPerBlock;
    int numBlocks = (int)(height * width / numThreads);
    
    calTotalPixVal << <numBlocks, numThreads >> > (d_m_src);
    //cudaThreadSynchronize();

    int h_s_sum[1024];
    cudaMemcpyFromSymbol(h_s_sum, d_partial_sum,1024* sizeof(int), cudaMemcpyDeviceToHost);
    cout << h_s_sum[0] << endl;
    cout << h_s_sum[1] << endl;
    cout << h_s_sum[2] << endl;
    cout << h_s_sum[3] << endl;
    
    cudaFree(s_sum);
    

    return m_srcImg.data;
}

输出结果如下

1040
0
1024
0

这个结果显然不对,求告知错误在哪里,或者有CUDA像素求和的完整代码