float Forward_Solution_t(double* lattice, double* lattice_result_pointer, unsigned int size, unsigned int ponit, unsigned int velue,
double alpha[], double a[], double d[])
{
// CUDA event objects
cudaEvent_t start, end;
// create the CUDA events
cudaEventCreate(&start);
cudaEventCreate(&end);
// record the start time
cudaEventRecord(start, 0);
//split the data into tiers for processing
unsigned int tier_c;
if (ponit < 2000) {
tier_c = ponit;
}
else {
tier_c = 2000;//each tier processes 2000 groups of data (tier_c*velue values)
}
int data = tier_c*velue;//amount of data processed per tier
//DH parameters
double* d_alpha;
double* d_a;
double* d_d;
//copy data
double* lattice_tier_in;//temporary host array (input)
double* lattice_tier_out;//temporary host array (output)
double* data_in;//temporary device array (copied in)
double* data_out;//temporary device array (copied out)
unsigned int tier;//tier index
double* data_temp;//temporary array (stores the data)
//thread-block configuration
int gridSize;//number of blocks needed
int blockSize;//number of threads needed
unsigned int thread = 800;//threads assigned to each block
//
bool tier_bool = false;
//allocate host memory
data = tier_c*velue;//amount of data processed per tier
lattice_tier_in = new double[data];
lattice_tier_out = new double[data];
data_temp = new double[data];
//allocate device memory
cudaMalloc((void**)&data_in, data * sizeof(double));
cudaMalloc((void**)&data_out, data * sizeof(double));
cudaMalloc((void**)&d_alpha, velue * sizeof(double));
cudaMalloc((void**)&d_a, velue * sizeof(double));
cudaMalloc((void**)&d_d, velue * sizeof(double));
//configure the thread blocks
if (tier_c>thread) {
gridSize = (tier_c + thread - 1) / thread;
blockSize = thread;
}
else {
blockSize = tier_c;
gridSize = 1;
}
//process the data tier by tier
int loop_v = (ponit + tier_c - 1) / tier_c;//number of loop iterations
for (tier = 0; tier < loop_v; tier++) {
tier_bool = (tier == (ponit / tier_c));//is this the last tier?
if (tier_bool) {
//free memory
delete[] lattice_tier_in;
lattice_tier_in = nullptr;
delete[] lattice_tier_out;
lattice_tier_out = nullptr;
delete[] data_temp;
//device
cudaFree(data_in);
data_in = nullptr;
cudaFree(data_out);
data_out = nullptr;
//last tier
tier_c = ponit%tier_c;//number of groups in the last tier
data = tier_c*velue;//amount of data in the last tier
lattice_tier_in = new double[data];
lattice_tier_out = new double[data];
data_temp = new double[data];
//allocate device memory
cudaMalloc((void**)&data_in, data * sizeof(double));
cudaMalloc((void**)&data_out, data * sizeof(double));
cudaMalloc((void**)&d_alpha, velue * sizeof(double));
cudaMalloc((void**)&d_a, velue * sizeof(double));
cudaMalloc((void**)&d_d, velue * sizeof(double));
//configure the thread blocks
if (tier_c>thread) {
gridSize = (tier_c + thread - 1) / thread;
blockSize = thread;
}
else {
blockSize = tier_c;
gridSize = 1;
}
}
//initialize the input array for this tier
for (int i = 0; i < data; i++) {
lattice_tier_in[i] = lattice[tier*2000*6 + i];
}
//execute the call
//HostToDevice
HANDLE_ERROR(cudaMemcpy(data_in, lattice_tier_in, data * sizeof(double), cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaMemcpy(data_out, lattice_tier_out, data * sizeof(double), cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaMemcpy(d_alpha, alpha, velue * sizeof(double), cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaMemcpy(d_a, a, velue * sizeof(double), cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaMemcpy(d_d, d, velue * sizeof(double), cudaMemcpyHostToDevice));
//launch the parallel kernel
Forward_Solution_kernel<<<gridSize, blockSize>>>(data_in, data_out, tier_c, d_alpha, d_a, d_d);
//check the most recent kernel error
cudaError_t error = cudaGetLastError();
printf("CUDA error: %s %d\n", cudaGetErrorString(error),tier);
// wait for all work on the device to finish
cudaDeviceSynchronize();
//cudaMemcpyDeviceToHost
HANDLE_ERROR(cudaMemcpy(lattice_tier_out, data_out, data * sizeof(double), cudaMemcpyDeviceToHost));
//store the processed data
for (int i = 0; i < data; i++) {
data_temp[i] = lattice_tier_out[i];
}
for (int i = 0; i < data; i++) {
lattice_result_pointer[tier*data + i] = data_temp[i];
}
}//end of the loop
//free memory
delete[] lattice_tier_in;
lattice_tier_in = nullptr;
delete[] lattice_tier_out;
lattice_tier_out = nullptr;
delete[] data_temp;
//host
/*cudaFreeHost(lattice_tier_in);
cudaFreeHost(lattice_tier_out);*/
//device
cudaFree(data_in);
data_in = nullptr;
cudaFree(data_out);
data_out = nullptr;
// record the end time
cudaEventRecord(end, 0);
cudaEventSynchronize(end);
// compute the elapsed time (in milliseconds)
float elapsedTime;
cudaEventElapsedTime(&elapsedTime, start, end);
// destroy the CUDA events
cudaEventDestroy(start);
cudaEventDestroy(end);
return elapsedTime;
}
The data size can only be scaled up to 5000 groups; anything larger produces an error. I have been hunting for a memory problem, and the tiered loop was written precisely to narrow it down. At first only 2000 groups could be computed, and I did not expect the error to persist even after splitting the data into tiers.
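With that splitting, the offsets the loop above uses work out as follows (a hand check only, assuming ponit = 10000 and velue = 6, which is what the hard-coded 2000*6 read stride implies):

// per-tier sizes for ponit = 10000, velue = 6 (assumed values for this hand check)
// tier_c = 2000, data = tier_c * velue = 12000, loop_v = 5
// read : lattice[tier*2000*6 + i]              -> indices tier*12000 .. tier*12000 + 11999
// write: lattice_result_pointer[tier*data + i] -> indices tier*12000 .. tier*12000 + 11999
// so both host buffers must hold at least loop_v * data = 60000 doubles to stay in bounds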
Below is the error output from the current code when run with 10,000 groups of data.
In that output, line 474 and lines 458-462 refer to:
//cudaMemcpyDeviceToHost
HANDLE_ERROR(cudaMemcpy(lattice_tier_out, data_out, data * sizeof(double), cudaMemcpyDeviceToHost));
//HostToDevice
HANDLE_ERROR(cudaMemcpy(data_in, lattice_tier_in, data * sizeof(double), cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaMemcpy(data_out, lattice_tier_out, data * sizeof(double), cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaMemcpy(d_alpha, alpha, velue * sizeof(double), cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaMemcpy(d_a, a, velue * sizeof(double), cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaMemcpy(d_d, d, velue * sizeof(double), cudaMemcpyHostToDevice));
The first two loop iterations export results, and those results are correct; the problem appears on the third iteration.
As a beginner, I would like to know the cause of this problem and how to fix it.
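For reference, a minimal per-tier error-checking sketch (not the original code, reusing the HANDLE_ERROR macro already used above); it reports an asynchronous kernel error at the synchronization point instead of letting it surface at the next cudaMemcpy:

// launch, then check both the launch itself and the kernel's asynchronous errors
Forward_Solution_kernel<<<gridSize, blockSize>>>(data_in, data_out, tier_c, d_alpha, d_a, d_d);
HANDLE_ERROR(cudaGetLastError());       // launch-configuration errors
HANDLE_ERROR(cudaDeviceSynchronize());  // errors raised while the kernel was running (e.g. an illegal address)
HANDLE_ERROR(cudaMemcpy(lattice_tier_out, data_out, data * sizeof(double), cudaMemcpyDeviceToHost));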
https://blog.csdn.net/captainAAAjohn/article/details/118162508