#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda.h>
#include <iostream>
#include <math.h>
#include <stdio.h>
#include <complex.h>
#include "cuComplex.h"
#include <typeinfo> // header needed for printing variable types
#include <time.h>
typedef cuDoubleComplex complexd;
using namespace std;
#define pi acos(-1)
#define CHECK(res) if(res!=cudaSuccess){printf("CUDA error: %s\n",cudaGetErrorString(res));exit(-1);}
int N = 70; // number of array elements
int Len = 2400; // number of samples
int theta_N = 1800; // number of angle steps
__device__ __host__ complexd operator*(complexd a, complexd b) { return cuCmul(a,b); }
__device__ __host__ complexd operator+(complexd a, complexd b) { return cuCadd(a,b); }
__device__ __host__ complexd operator/(complexd a, complexd b) { return cuCdiv(a,b); }
__device__ __host__ complexd exp_(complexd arg) // complex exponential: e^(x+iy) = e^x*(cos y + i sin y)
{
complexd res; // complex result
double s, c;
double e = exp(arg.x);
sincos(arg.y, &s, &c);
res.x = c * e;
res.y = s * e;
return res;
}
/* CUDA kernel */
__global__ void beamforming_nb(complexd* sig_out, complexd* sig_in,complexd* time_delay,int theta_N,int Len,int N)
{
int row = threadIdx.y + blockDim.y * blockIdx.y;
int col = threadIdx.x + blockDim.x * blockIdx.x;
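// note: temp below is never initialised, so the accumulation starts from garbage; the rewrite further down initialises it to {0,0}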
complexd temp;
complexd add_;
if (row < theta_N && col < Len)
{
for (int i = 0; i < N; i++)
{
// the theta_N x Len matrix sig_out accumulates time_delay(row, i) * sig_in(i, col)
add_ = time_delay[row * N + i] * sig_in[col + i * Len];
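// note: the __syncthreads() calls below sit inside a conditional branch; that is undefined
// behaviour as soon as some threads of a block skip the guard (e.g. with a rounded-up grid),
// and they serve no purpose here since each thread accumulates privately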
__syncthreads();
temp = temp + add_;
__syncthreads();
}
sig_out[row * Len + col] = temp;
temp.x = 0;
temp.y = 0;
}
}
int main(int argc, char ** argv)
{
/* initial parameter definitions */
const double c = 1500; // speed of sound in the medium (m/s)
const double T = 1; // sampling duration (s)
const int FS = 2400; // sampling frequency (Hz)
auto LEN = T*FS; // number of samples
double t[Len]; // time axis
for(int i = 0;i<Len;i++)
{
t[i] = i/LEN;
//cout <<t[i]<<endl; // verified
}
const double f0 = 300; // signal frequency 300 Hz
const double d = 0.27; // sensor spacing 0.27 m
const double deg2rad = pi/180; // cos takes radians
const double theta = 60; // target bearing in degrees
const double theta_rad = theta * deg2rad; // target bearing in radians
cudaError_t res;
/* source signal definition */
complexd sig_[Len]; // source signal, length = sampling frequency * sampling time
for(int i = 0;i<Len;i++)
{
complexd temp{0,2*pi*f0*t[i]};
sig_[i] = exp_(temp); // source signal
//cout << cuCreal(sig_[i]) << '+' << cuCimag(sig_[i])<< 'i' <<'\n'; // verified
}
/* apply the steering vector */
complexd sig[N*Len]; // noise-free received signal
for(int i = 0;i<N;i++)
{
complexd steer{0,2*pi*f0*cos(theta_rad)*i*d/c}; // steering vector
// cout << cuCreal(exp_(steer)) << '+' << cuCimag(exp_(steer))<< 'i' << endl; // verified
for(int j = 0;j<Len;j++)
{
sig[i*Len+j] = sig_[j] * exp_(steer); // noise-free signal, N*Len
// cout << cuCreal(sig[i*Len+j]) << '+' << cuCimag(sig[i*Len+j])<< 'i' <<'\n'; // verified
}
}
/* compute and apply the per-angle delay compensation (theta step) */
double theta_n = 1800;
complexd t_delay[theta_N*N];
for(int i = 0;i<theta_N;i++)
{
for(int j = 0 ;j<N;j++)
{
complexd tao{0,j*2*pi*f0*cos(i*(180/theta_n)*deg2rad)*d/c};
t_delay[i*N + j] = exp_(cuConj(tao)); // N compensations for the i-th angle; overall a theta_N*N matrix
}
}
/* CPU computation */
double ttt;
clock_t at, bt;
at = clock();
complexd *h_pt;
h_pt = (complexd *)malloc(theta_N*Len*sizeof(complexd));
complexd h_temp;
for(int i = 0 ; i < theta_N; i ++ )
{
for(int j = 0; j < Len ; j ++)
{
for(int k = 0 ; k <N; k ++)
{
h_temp = h_temp + t_delay[i * N + k] * sig[k * Len + j];
}
h_pt[i * Len + j] = h_temp;
h_temp.x = 0;
h_temp.y = 0;
}
}
bt = clock();
ttt = double(bt-at)/CLOCKS_PER_SEC;
cout << ttt << "s" << endl;
/* CUDA acceleration */
complexd *sig_in;
complexd *time_delay;
complexd *sig_out;
res = cudaMalloc((void**)&sig_in,N*Len*sizeof(complexd));CHECK(res)
res = cudaMalloc((void**)&sig_out,theta_N*Len*sizeof(complexd));CHECK(res)
res = cudaMalloc((void**)&time_delay,theta_N*N*sizeof(complexd));CHECK(res)
res = cudaMemcpy(sig_in,sig,N*Len*sizeof(complexd),cudaMemcpyHostToDevice);CHECK(res)
res = cudaMemcpy(time_delay,t_delay,theta_N*N*sizeof(complexd),cudaMemcpyHostToDevice);CHECK(res)
dim3 threadsPerBlocks(32,32); // first try the case where the number of angle samples matches the sampling frequency
dim3 numBlocks((Len)/threadsPerBlocks.x,(theta_N)/threadsPerBlocks.y);
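// note: integer division truncates: numBlocks.y = 1800/32 = 56, so rows 1792-1799
// of sig_out are never computed (this is the coverage problem discussed below)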
/*
The Jetson Orin module includes: an NVIDIA Ampere-architecture GPU with up to
2048 CUDA cores and up to 64 Tensor cores, and up to 12 Arm A78AE CPU cores
*/
double tt;
clock_t a, b;
a = clock();
beamforming_nb<<<numBlocks,threadsPerBlocks>>>(sig_out,sig_in,time_delay,theta_N,Len,N);
cudaDeviceSynchronize();
b = clock();
tt = double(b-a)/CLOCKS_PER_SEC;
cout << tt << "s" << endl;
complexd *pt_sig = NULL; // host output buffer
pt_sig = (complexd*)malloc(theta_N*Len*sizeof(complexd)); // output data
res = cudaMemcpy(pt_sig,sig_out,theta_N*Len*sizeof(complexd),cudaMemcpyDeviceToHost);CHECK(res)
cout << cuCreal(pt_sig[(2399)])<< '+' << cuCimag(pt_sig[(2399)]) << 'i' << '\n';
cout << cuCreal(h_pt[(2400-1)])<< '+' << cuCimag(h_pt[(2400-1)]) << 'i' << '\n';
// -3.74895500 + 1.3897567i
cudaFree(sig_in);
cudaFree(time_delay);
cudaFree(sig_out);
free(pt_sig);
free(h_pt);
return 0;
}
Output:
24.4865s (CPU)
0.000353s (GPU)
-3.74896+13.8978i (GPU, element 2399)
-3.74896+1.38976i (CPU, element 2399)
I would like to ask why the result computed on the GPU differs from the result computed on the CPU. Checking the output shows that part of the data matches, but, for example, the 2400th output value does not. I can guarantee that the CPU result is correct, but I cannot find the cause of the error on the GPU side. Any help would be appreciated!
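A first check worth doing in this situation is to confirm that the kernel launch itself succeeded, since a failed launch silently leaves sig_out untouched. A minimal sketch using only standard CUDA runtime calls (the variable names here are illustrative):
cudaError_t launch_err = cudaGetLastError(); // reports invalid launch configurations
if (launch_err != cudaSuccess) printf("launch error: %s\n", cudaGetErrorString(launch_err));
cudaError_t sync_err = cudaDeviceSynchronize(); // reports errors raised while the kernel ran
if (sync_err != cudaSuccess) printf("kernel error: %s\n", cudaGetErrorString(sync_err));
Note that an undersized grid is not an error from the runtime's point of view, so a clean launch does not rule out the coverage problem described next.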
The problem is now solved. It was in the thread allocation: theta_N and Len are 1800 and 2400 respectively. If the thread block is (32,32,1), then 1800/32 = 56.25 and the system truncates it to 56, so the number of thread indices covering the matrix rows is smaller than theta_N and the computation goes wrong. Following an online tutorial, I then changed the numBlocks line to: dim3 numBlocks((Len + threadsPerBlocks.x - 1)/threadsPerBlocks.x,(theta_N + threadsPerBlocks.y - 1)/threadsPerBlocks.y); with this, (1800 + 32 - 1)/32 = 57 blocks, but the computation was still wrong, so I concluded that the row and column counts of the matrix must be evenly divisible by the thread block for the computation to come out right. I therefore rewrote the code in the end:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda.h>
#include <iostream>
#include <math.h>
#include <stdio.h>
#include <complex.h>
#include "cuComplex.h"
#include <typeinfo> // header needed for printing variable types
#include <time.h>
typedef cuDoubleComplex complexd;
using namespace std;
#define pi acos(-1)
#define CHECK(res) if(res!=cudaSuccess){printf("CUDA error: %s\n",cudaGetErrorString(res));exit(-1);}
int N = 70; // number of array elements
int Len = 2400; // number of samples
int theta_N = 1800; // number of angle steps
__device__ __host__ complexd operator*(complexd a, complexd b) { return cuCmul(a,b); }
__device__ __host__ complexd operator+(complexd a, complexd b) { return cuCadd(a,b); }
__device__ __host__ complexd operator/(complexd a, complexd b) { return cuCdiv(a,b); }
// __device__ __host__ complexd operator+=(complexd a, complexd b) // buggy (returns a new value instead of updating the left operand), left unfixed
// {
// complexd res;
// res = cuCadd(a,b);
// return res;
// }
__device__ __host__ complexd exp_(complexd arg) // complex exponential: e^(x+iy) = e^x*(cos y + i sin y)
{
complexd res; // complex result
double s, c;
double e = exp(arg.x);
sincos(arg.y, &s, &c);
res.x = c * e;
res.y = s * e;
return res;
}
/* CUDA kernel */
__global__ void beamforming_nb(complexd* sig_out, complexd* sig_in,complexd* time_delay,int theta_N,int Len,int N)
{
int row = threadIdx.x + blockDim.x * blockIdx.x;
int col = threadIdx.y + blockDim.y * blockIdx.y; // note: CUDA's index order is X > Y > Z (x varies fastest)
// dim3 threadsPerBlocks(32, 32);
// dim3 numBlocks((theta_N + threadsPerBlocks.x -1)/threadsPerBlocks.x,(Len + threadsPerBlocks.y -1)/threadsPerBlocks.y);
complexd temp{0,0};
complexd add_{0,0};
if (row < theta_N && col < Len) // what about using two separate ifs?
{
for (int i = 0; i < N; i++)
{
// the theta_N x Len matrix sig_out accumulates time_delay(row, i) * sig_in(i, col)
temp = temp + time_delay[row * N + i] * sig_in[col + i * Len];
}
sig_out[row * Len + col] = temp;
temp = add_;
}
/*
Open questions:
1. Parallelising the product of the theta_N*N matrix with the N*Len matrix;  answer: solved, the two outer for loops are replaced by if (row < theta_N && col < Len), see success.cu
2. Computing arrays larger than the thread count of a single block;  answer: can be ignored for now
3. How CUDA blocks and threads are scheduled;  answer: needs further study.
4. Fitting more of the computation into the CUDA cores;
5. How to loop inside a CUDA kernel.  answer: done with the if guard
*/
}
int main(int argc, char ** argv)
{
/* initial parameter definitions */
const double c = 1500; // speed of sound in the medium (m/s)
//const int N = 70; // number of sensors
const double T = 1; // sampling duration (s)
const int FS = 2400; // sampling frequency (Hz)
auto LEN = T*FS; // number of samples
double t[Len]; // time axis
for(int i = 0;i<Len;i++)
{
t[i] = i/LEN;
// cout <<t[i]<<endl; // verified
}
const double f0 = 300; // signal frequency 300 Hz
const double d = 0.27; // sensor spacing 0.27 m
const double deg2rad = pi/180; // cos takes radians
// cout << pi << endl; // verified
const double theta = 60; // target bearing in degrees
const double theta_rad = theta * deg2rad; // target bearing in radians
cudaError_t res;
/* source signal definition */
complexd sig_[Len]; // source signal, length = sampling frequency * sampling time
for(int i = 0;i<Len;i++)
{
complexd temp{0,2*pi*f0*t[i]};
sig_[i] = exp_(temp); // source signal
// cout << cuCreal(sig_[i]) << '+' << cuCimag(sig_[i])<< 'i' <<'\n'; // verified
}
/* apply the steering vector */
complexd sig[N*Len]; // noise-free received signal
for(int i = 0;i<N;i++)
{
complexd steer{0,2*pi*f0*cos(theta_rad)*i*d/c}; // steering vector
// cout << cuCreal(exp_(steer)) << '+' << cuCimag(exp_(steer))<< 'i' << endl; // verified
for(int j = 0;j<Len;j++)
{
sig[i*Len+j] = sig_[j] * exp_(steer); // noise-free signal, N*Len
// cout << cuCreal(sig[i*Len+j]) << '+' << cuCimag(sig[i*Len+j])<< 'i' <<'\n'; // verified
}
}
/* add noise (placeholder, not implemented) */
/* compute and apply the per-angle delay compensation (theta step) */
double theta_n = 1800;
complexd t_delay[theta_N*N];
for(int i = 0;i<theta_N;i++)
{
for(int j = 0 ;j<N;j++)
{
complexd tao{0,j*2*pi*f0*cos(i*(180/theta_n)*deg2rad)*d/c};
// cout << j*2*pi*f0*cos(i*(180/theta_n)*deg2rad)*d/c << '\t' <<j <<endl; // verified
t_delay[i*N + j] = exp_(cuConj(tao)); // N compensations for the i-th angle; overall a theta_N*N matrix
// cout << cuCreal(exp_(cuConj(tao))) << '+' << cuCimag(exp_(cuConj(tao)))<< 'i' <<'\n'; // verified
}
}
/* CPU computation */
double ttt;
clock_t at, bt;
at = clock();
complexd *h_pt;
h_pt = (complexd *)malloc(theta_N*Len*sizeof(complexd));
complexd h_temp{0,0};
complexd a_temp{0,0};
for(int i = 0 ; i < theta_N; i ++ )
{
for(int j = 0; j < Len ; j ++)
{
for(int k = 0 ; k <N; k ++)
{
h_temp = h_temp + t_delay[i * N + k] * sig[k * Len + j]; // (i,k) x (k,j) -> (i,j)
}
h_pt[i * Len + j] = h_temp;
h_temp = a_temp;
}
}
bt = clock();
ttt = double(bt-at)/CLOCKS_PER_SEC;
cout << ttt << "s" << endl;
/* CUDA acceleration */
complexd *sig_in;
complexd *time_delay;
complexd *sig_out;
res = cudaMalloc((void**)&sig_in,N*Len*sizeof(complexd));CHECK(res)
res = cudaMalloc((void**)&sig_out,theta_N*Len*sizeof(complexd));CHECK(res)
res = cudaMalloc((void**)&time_delay,theta_N*N*sizeof(complexd));CHECK(res)
res = cudaMemcpy(sig_in,sig,N*Len*sizeof(complexd),cudaMemcpyHostToDevice);CHECK(res)
res = cudaMemcpy(time_delay,t_delay,theta_N*N*sizeof(complexd),cudaMemcpyHostToDevice);CHECK(res)
dim3 threadsPerBlocks(24,24);
// dim3 numBlocks((Len+threadsPerBlocks.x-1)/threadsPerBlocks.x,(theta_N+threadsPerBlocks.y-1)/threadsPerBlocks.y);
dim3 numBlocks((theta_N)/threadsPerBlocks.x,(Len)/threadsPerBlocks.y);
// cout << numBlocks.x << "," << numBlocks.y << endl;
/*
The Jetson Orin module includes: an NVIDIA Ampere-architecture GPU with up to
2048 CUDA cores and up to 64 Tensor cores, and up to 12 Arm A78AE CPU cores
*/
double tt;
clock_t a, b;
a = clock();
beamforming_nb<<<numBlocks,threadsPerBlocks>>>(sig_out,sig_in,time_delay,theta_N,Len,N); // (grid, block)
cudaDeviceSynchronize();
b = clock();
tt = double(b-a)/CLOCKS_PER_SEC;
cout << tt << "s" << endl;
complexd *pt_sig = NULL; // host output buffer
pt_sig = (complexd*)malloc(theta_N*Len*sizeof(complexd)); // output data
res = cudaMemcpy(pt_sig,sig_out,theta_N*Len*sizeof(complexd),cudaMemcpyDeviceToHost);CHECK(res)
bool is_right = true;
for(int i = 0 ; i <theta_N;i++)
{
for(int j = 0 ; j < Len;j++)
{
// if(cuCreal(pt_sig[(i*Len+j)]) != cuCreal(h_pt[i*Len+j]) || cuCimag(pt_sig[(i*Len+j)]) != cuCimag(h_pt[i*Len+j]) )
if((fabs(cuCreal(pt_sig[(i*Len+j)]) - cuCreal(h_pt[(i*Len+j)])) > 1e-8) || (fabs(cuCimag(pt_sig[(i*Len+j)]) - cuCimag(h_pt[(i*Len+j)])) > 1e-8)) // fabs so that negative differences are caught too
{
is_right = false;
// cout << "GPU:" << cuCreal(pt_sig[(i*Len+j)]) - cuCreal(h_pt[(i*Len+j)])<< '+' << cuCimag(pt_sig[(i*Len+j)]) - cuCimag(h_pt[(i*Len+j)])<< 'i' << '\n';
cout << "第" << i*Len+j << "个" << "数据不正确" << endl;
cout << "GPU:" << cuCreal(pt_sig[(i*Len+j)])<< '+' << cuCimag(pt_sig[(i*Len+j)]) << 'i' << '\n';
cout << "CPU:" << cuCreal(h_pt[(i*Len+j)])<< '+' << cuCimag(h_pt[(i*Len+j)]) << 'i' << '\n';
}
}
}
printf("The result is %s!\n",is_right?"right":"false");
cout << cuCreal(pt_sig[(0)])<< '+' << cuCimag(pt_sig[(0)]) << 'i' << '\n';
cout << cuCreal(h_pt[(0)])<< '+' << cuCimag(h_pt[(0)]) << 'i' << '\n';
cudaFree(sig_in);
cudaFree(time_delay);
cudaFree(sig_out);
free(pt_sig);
free(h_pt);
return 0;
}
The main changes: the thread block and grid sizes were re-derived, the col and row indexing was swapped so that it matches the matrix layout, and the __syncthreads() calls were removed (I had originally assumed the wrong results were a thread-synchronisation problem). Finally the output was compared element by element against the CPU computation, and the result now checks out as correct.
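For reference, the pattern usually recommended is to round the grid size up with ceiling division and let the in-kernel bounds guard discard the surplus threads; with the rewritten kernel above (zero-initialised temp, the if (row < theta_N && col < Len) guard, and no __syncthreads() in a divergent path) this also works when the matrix dimensions do not divide the block size evenly. A minimal sketch under those assumptions:
dim3 block(32, 32);
dim3 grid((theta_N + block.x - 1) / block.x, // ceil(1800/32) = 57 blocks cover the rows
(Len + block.y - 1) / block.y); // ceil(2400/32) = 75 blocks cover the columns
beamforming_nb<<<grid, block>>>(sig_out, sig_in, time_delay, theta_N, Len, N);
As a side note on the timing: recording cudaEvent_t markers with cudaEventRecord and reading cudaEventElapsedTime is the more precise idiom for kernel timing than clock(), although with cudaDeviceSynchronize() before reading the host clock the figure above is a usable estimate.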
You can only debug and analyse it step by step yourself; if the data are wrong, the logic must be wrong somewhere, or the logic is right but the coding is wrong. https://blog.csdn.net/wd1603926823/article/details/77451433 I went through the same slow debugging with both OCL and CUDA; with the same algorithm, the CPU and CUDA versions must agree exactly. https://blog.csdn.net/wd1603926823/article/details/120157432
Sometimes, unless you log "the called function's name plus its argument values, the argument values on entry, the intermediate values, the value about to be returned, and the function name plus arguments plus return value back at the call site" to a file, there is simply no way to find where the problem is; catching all kinds of exceptions, logging to the screen, single-stepping, setting breakpoints, generating core or dmp files, and so on, all fail. For writing logs to a file, refer to the following:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef _MSC_VER
#pragma warning(disable:4996)
#include <windows.h>
#include <io.h>
#else
#include <unistd.h>
#include <sys/time.h>
#include <pthread.h>
#define CRITICAL_SECTION pthread_mutex_t
#define _vsnprintf vsnprintf
#endif
//Log{
#define MAXLOGSIZE 20000000
#define MAXLINSIZE 16000
#include <time.h>
#include <sys/timeb.h>
#include <stdarg.h>
char logfilename1[]="MyLog1.log";
char logfilename2[]="MyLog2.log";
static char logstr[MAXLINSIZE+1];
char datestr[16];
char timestr[16];
char mss[4];
CRITICAL_SECTION cs_log;
FILE *flog;
#ifdef _MSC_VER
void Lock(CRITICAL_SECTION *l) {
EnterCriticalSection(l);
}
void Unlock(CRITICAL_SECTION *l) {
LeaveCriticalSection(l);
}
#else
void Lock(CRITICAL_SECTION *l) {
pthread_mutex_lock(l);
}
void Unlock(CRITICAL_SECTION *l) {
pthread_mutex_unlock(l);
}
#endif
void LogV(const char *pszFmt,va_list argp) {
struct tm *now;
struct timeb tb;
if (NULL==pszFmt||0==pszFmt[0]) return;
_vsnprintf(logstr,MAXLINSIZE,pszFmt,argp);
ftime(&tb);
now=localtime(&tb.time);
sprintf(datestr,"%04d-%02d-%02d",now->tm_year+1900,now->tm_mon+1,now->tm_mday);
sprintf(timestr,"%02d:%02d:%02d",now->tm_hour ,now->tm_min ,now->tm_sec );
sprintf(mss,"%03d",tb.millitm);
printf("%s %s.%s %s",datestr,timestr,mss,logstr);
flog=fopen(logfilename1,"a");
if (NULL!=flog) {
fprintf(flog,"%s %s.%s %s",datestr,timestr,mss,logstr);
if (ftell(flog)>MAXLOGSIZE) {
fclose(flog);
if (rename(logfilename1,logfilename2)) {
remove(logfilename2);
rename(logfilename1,logfilename2);
}
} else {
fclose(flog);
}
}
}
void Log(const char *pszFmt,...) {
va_list argp;
Lock(&cs_log);
va_start(argp,pszFmt);
LogV(pszFmt,argp);
va_end(argp);
Unlock(&cs_log);
}
//Log}
int main(int argc,char * argv[]) {
int i;
#ifdef _MSC_VER
InitializeCriticalSection(&cs_log);
#else
pthread_mutex_init(&cs_log,NULL);
#endif
for (i=0;i<10000;i++) {
Log("This is a Log %04d from FILE:%s LINE:%d\n",i, __FILE__, __LINE__);
}
#ifdef _MSC_VER
DeleteCriticalSection(&cs_log);
#else
pthread_mutex_destroy(&cs_log);
#endif
return 0;
}
//add lines 1-79 to the very top of the .c or .cpp file that contains your main
//add lines 82-86 at the start of your main function
//add lines 90-94 just before the end of your main function
//wherever you want to log, write to the file MyLog1.log following the pattern of line 88