最近要写一个归一化互相关求两张图片相似度的项目,要用CUDA完成
在第一步统计像素和就卡住了。。
代码如下
//图像大小
__device__ __constant__ int d_Imgsize[2];
const int threadsPerBlock = 1024;
__device__ int d_partial_sum[1024];
__global__ void calTotalPixVal(uchar* input) {
__shared__ int partialSum[threadsPerBlock];
//确定索引
int i = threadIdx.x + blockIdx.x * blockDim.x;
int tid = threadIdx.x;
//传global memory数据到shared memory
partialSum[tid] = s_sum[i];
//传输同步
__syncthreads();
//在共享存储器中进行规约
for (int stride = blockDim.x / 2; stride > 0; stride /= 2)
{
if (tid < stride)
partialSum[tid] += partialSum[tid + stride];
__syncthreads();
}
//将当前block的计算结果写回输出数组
if (tid == 0)
d_partial_sum[blockIdx.x] = partialSum[0];
}
uchar* ImgProcess(Mat m_srcImg,Mat s_srcImg) {
int height = m_srcImg.rows;
int width = m_srcImg.cols;
//待测图和模板图UCHAR序列
uchar* m_src = m_srcImg.data;
uchar* s_src = s_srcImg.data;
//图像大小
int Imgsize[2] = { height,width };
int memsize = height * width;
//开辟显存
cudaMalloc((void**)&d_m_src, memsize * sizeof(uchar));
cudaMalloc((void**)&d_s_src, memsize * sizeof(uchar));
cudaMalloc((void**)&s_sum, sizeof(int));
cudaMemcpyToSymbol(d_Imgsize, Imgsize, 2 * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_m_src, m_src, memsize * sizeof(uchar), cudaMemcpyHostToDevice);
cudaMemcpy(d_s_src, s_src, memsize * sizeof(uchar), cudaMemcpyHostToDevice);
//sum接受像素和
int sum[64] = { 0 };
cudaMemcpyToSymbol(s_sum, sum,tile_nums * sizeof(int), cudaMemcpyHostToDevice);
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
int numThreads = prop.maxThreadsPerBlock;
int numBlocks = (int)(height * width / numThreads);
calTotalPixVal << <numBlocks, numThreads >> > (d_m_src);
//cudaThreadSynchronize();
int h_s_sum[1024];
cudaMemcpyFromSymbol(h_s_sum, d_partial_sum,1024* sizeof(int), cudaMemcpyDeviceToHost);
cout << h_s_sum[0] << endl;
cout << h_s_sum[1] << endl;
cout << h_s_sum[2] << endl;
cout << h_s_sum[3] << endl;
cudaFree(s_sum);
return m_srcImg.data;
}
输出结果如下
1040
0
1024
0
这个结果显然不对,求告知错误在哪里,或者有CUDA像素求和的完整代码