This is the CSR + CSR addition I wrote with the cuSPARSE library. Do I still need to write a kernel function for it, and could this algorithm even be fitted into a kernel?
#include <stdio.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cusparse.h>
#include <iostream>
using namespace std;
// iDivUp: round-up integer division
int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }
// CUDA error checking
void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
{
if (code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) { exit(code); }
}
}
#define gpuErrchk(ans) gpuAssert((ans), __FILE__, __LINE__)
static const char* _cusparseGetErrorEnum(cusparseStatus_t error)
{
switch (error)
{
case CUSPARSE_STATUS_SUCCESS:
return "CUSPARSE_STATUS_SUCCESS";
case CUSPARSE_STATUS_NOT_INITIALIZED:
return "CUSPARSE_STATUS_NOT_INITIALIZED";
case CUSPARSE_STATUS_ALLOC_FAILED:
return "CUSPARSE_STATUS_ALLOC_FAILED";
case CUSPARSE_STATUS_INVALID_VALUE:
return "CUSPARSE_STATUS_INVALID_VALUE";
case CUSPARSE_STATUS_ARCH_MISMATCH:
return "CUSPARSE_STATUS_ARCH_MISMATCH";
case CUSPARSE_STATUS_MAPPING_ERROR:
return "CUSPARSE_STATUS_MAPPING_ERROR";
case CUSPARSE_STATUS_EXECUTION_FAILED:
return "CUSPARSE_STATUS_EXECUTION_FAILED";
case CUSPARSE_STATUS_INTERNAL_ERROR:
return "CUSPARSE_STATUS_INTERNAL_ERROR";
case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
case CUSPARSE_STATUS_ZERO_PIVOT:
return "CUSPARSE_STATUS_ZERO_PIVOT";
}
return "<unknown>";
}
inline void __cusparseSafeCall(cusparseStatus_t err, const char* file, const int line)
{
if (CUSPARSE_STATUS_SUCCESS != err) {
fprintf(stderr, "CUSPARSE error in file '%s', line %d, error %s\nterminating!\n", file, line,
_cusparseGetErrorEnum(err));
assert(0);
}
}
#define cusparseSafeCall(err) __cusparseSafeCall(err, __FILE__, __LINE__)
int main()
{
// Initialize cuSPARSE
cusparseHandle_t handle; cusparseSafeCall(cusparseCreate(&handle));
// Initialize the matrix descriptors
cusparseMatDescr_t descrA, descrB, descrC;
cusparseSafeCall(cusparseCreateMatDescr(&descrA));
cusparseSafeCall(cusparseCreateMatDescr(&descrB));
cusparseSafeCall(cusparseCreateMatDescr(&descrC));
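// cusparseCreateMatDescr defaults to CUSPARSE_MATRIX_TYPE_GENERAL and
// CUSPARSE_INDEX_BASE_ZERO, which matches the zero-based CSR arrays below.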
const int M = 2; // number of rows
const int N = 2; // number of columns
const int nnzA = 4; // number of non-zeros in A
const int nnzB = 4; // number of non-zeros in B
// CSR representation of matrix A
int h_csrRowPtrA[M + 1] = { 0, 2, 4 };
int h_csrColIndA[nnzA] = { 0, 1, 0, 1 };
float h_csrValA[nnzA] = { 1, 1, 1, 1 };
// CSR representation of matrix B
int h_csrRowPtrB[M + 1] = { 0, 2, 4 };
int h_csrColIndB[nnzB] = { 0, 1, 0, 1 };
float h_csrValB[nnzB] = { 1, 1, 1, 1 };
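// Both A and B are the dense all-ones 2x2 matrix [[1,1],[1,1]], so with
// alpha = beta = 1 the expected result C = A + B has values {2,2,2,2},
// column indices {0,1,0,1} and row pointers {0,2,4}.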
// Allocate the three CSR arrays of matrix A and matrix B on the GPU
float* d_csrValA; gpuErrchk(cudaMalloc(&d_csrValA, nnzA * sizeof(float)));
int* d_csrRowPtrA; gpuErrchk(cudaMalloc(&d_csrRowPtrA, (M + 1) * sizeof(int)));
int* d_csrColIndA; gpuErrchk(cudaMalloc(&d_csrColIndA, nnzA * sizeof(int)));
float* d_csrValB; gpuErrchk(cudaMalloc(&d_csrValB, nnzB * sizeof(float)));
int* d_csrRowPtrB; gpuErrchk(cudaMalloc(&d_csrRowPtrB, (M + 1) * sizeof(int)));
int* d_csrColIndB; gpuErrchk(cudaMalloc(&d_csrColIndB, nnzB * sizeof(int)));
gpuErrchk(cudaMemcpy(d_csrValA, h_csrValA, nnzA * sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_csrRowPtrA, h_csrRowPtrA, (M + 1) * sizeof(int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_csrColIndA, h_csrColIndA, nnzA * sizeof(int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_csrValB, h_csrValB, nnzB * sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_csrRowPtrB, h_csrRowPtrB, (M + 1) * sizeof(int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_csrColIndB, h_csrColIndB, nnzB * sizeof(int), cudaMemcpyHostToDevice));
// Sum matrix A and matrix B: C = alpha*A + beta*B
int baseC = 0, nnzC = 0;
char* buffer = NULL;
// nnzTotalDevHostPtr points to host memory
float alpha = 1.f, beta = 1.f;
size_t BufferSizeInBytes;
int* nnzTotalDevHostPtr = &nnzC;
cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_HOST);
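// With CUSPARSE_POINTER_MODE_HOST the scalar arguments (alpha, beta and the nnz
// count written through nnzTotalDevHostPtr) live in host memory, so nnzC can be
// read directly once cusparseXcsrgeam2Nnz has returned.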
int* d_csrRowPtrC = NULL, * d_csrColIndC = NULL; float* d_csrValC = NULL;
gpuErrchk(cudaMalloc(&d_csrRowPtrC, sizeof(int) * (M + 1)));
cusparseSafeCall(cusparseScsrgeam2_bufferSizeExt(handle, M, N,
&alpha,
descrA, nnzA, d_csrValA, d_csrRowPtrA, d_csrColIndA,
&beta,
descrB, nnzB, d_csrValB, d_csrRowPtrB, d_csrColIndB,
descrC, d_csrValC, d_csrRowPtrC, d_csrColIndC,
&BufferSizeInBytes));
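// csrgeam2 requires a user-allocated workspace; the *_bufferSizeExt query above
// only reports how many bytes that workspace must hold.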
gpuErrchk(cudaMalloc((void**)&buffer, sizeof(char) * BufferSizeInBytes));
cusparseSafeCall(cusparseXcsrgeam2Nnz(handle, M, N, descrA, nnzA, d_csrRowPtrA, d_csrColIndA, descrB, nnzB, d_csrRowPtrB, d_csrColIndB, descrC, d_csrRowPtrC,
nnzTotalDevHostPtr, buffer));
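// First pass: cusparseXcsrgeam2Nnz fills d_csrRowPtrC and reports the total
// non-zero count of C, so d_csrColIndC and d_csrValC can be sized before the
// second pass computes the actual entries.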
if (NULL != nnzTotalDevHostPtr)
{
nnzC = *nnzTotalDevHostPtr;
}
else
{
gpuErrchk(cudaMemcpy(&nnzC, d_csrRowPtrC + M, sizeof(int), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(&baseC, d_csrRowPtrC, sizeof(int), cudaMemcpyDeviceToHost));
nnzC -= baseC;
}
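// (The fallback above is only needed with CUSPARSE_POINTER_MODE_DEVICE, where the
// count has to be rebuilt from the last and first entries of d_csrRowPtrC; in host
// pointer mode nnzC has already been written directly.)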
gpuErrchk(cudaMalloc(&d_csrColIndC, sizeof(int) * nnzC));
gpuErrchk(cudaMalloc(&d_csrValC, sizeof(float) * nnzC));
cusparseSafeCall(cusparseScsrgeam2(handle, M, N,
&alpha,
descrA, nnzA, d_csrValA, d_csrRowPtrA, d_csrColIndA,
&beta,
descrB, nnzB, d_csrValB, d_csrRowPtrB, d_csrColIndB,
descrC, d_csrValC, d_csrRowPtrC, d_csrColIndC,
buffer));
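// Second pass: cusparseScsrgeam2 writes the column indices and values of
// C = alpha*A + beta*B into the arrays allocated above.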
// Copy all of matrix C back to the host
int* csrRowPtrC = NULL;
int* csrColIndC = NULL;
float* csrValC = NULL;
csrRowPtrC = (int*)malloc(sizeof(int) * (M + 1));
csrColIndC = (int*)malloc(sizeof(int) * nnzC);
csrValC = (float*)malloc(sizeof(float) * nnzC);
assert(NULL != csrRowPtrC);
assert(NULL != csrColIndC);
assert(NULL != csrValC);
gpuErrchk((cudaMemcpy(csrRowPtrC, d_csrRowPtrC, sizeof(int) * (M + 1), cudaMemcpyDeviceToHost)));
gpuErrchk((cudaMemcpy(csrColIndC, d_csrColIndC, sizeof(int) * nnzC, cudaMemcpyDeviceToHost)));
gpuErrchk((cudaMemcpy(csrValC, d_csrValC, sizeof(float) * nnzC, cudaMemcpyDeviceToHost)));
cout << "csrValC:" << endl;
for (int i = 0; i < nnzC - baseC; i++)
{
cout << csrValC[i] << ",";
}
cout << endl;
cout << "csrColIndC:" << endl;
for (int i = 0; i < nnzC - baseC; i++)
{
cout << csrColIndC[i] << ",";
}
cout << endl;
cout << "csrRowPtrC:" << endl;
for (int i = 0; i < M + 1; i++)
{
cout << csrRowPtrC[i] << ",";
}
cout << endl;
if (buffer) cudaFree(buffer);
if (d_csrValA) cudaFree(d_csrValA);
if (d_csrColIndA) cudaFree(d_csrColIndA);
if (d_csrRowPtrA) cudaFree(d_csrRowPtrA);
if (d_csrValB) cudaFree(d_csrValB);
if (d_csrColIndB) cudaFree(d_csrColIndB);
if (d_csrRowPtrB) cudaFree(d_csrRowPtrB);
if (d_csrValC) cudaFree(d_csrValC);
if (d_csrColIndC) cudaFree(d_csrColIndC);
if (d_csrRowPtrC) cudaFree(d_csrRowPtrC);
if (csrRowPtrC) free(csrRowPtrC);
if (csrColIndC) free(csrColIndC);
if (csrValC) free(csrValC);
if (handle) cusparseDestroy(handle);
if (descrA) cusparseDestroyMatDescr(descrA);
if (descrB) cusparseDestroyMatDescr(descrB);
if (descrC) cusparseDestroyMatDescr(descrC);
return 0;
}
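As for the question itself: cuSPARSE routines such as cusparseScsrgeam2 are host-side API calls and cannot be invoked from inside a __global__ kernel, so when the library is used no extra kernel has to be written. A hand-written kernel is only simple in special cases; the sketch below is my own illustration (not part of the program above) and assumes A and B share exactly the same sparsity pattern, as they do in this example, so that C can reuse A's row pointers and column indices and only the values are summed element-wise. For general sparsity patterns you would have to reproduce by hand the two-pass scheme (count the non-zeros per row, then fill the values) that csrgeam2 already implements.

// Minimal sketch: element-wise add of two CSR matrices that share one sparsity pattern.
// valA/valB/valC are device arrays of length nnz; C keeps A's rowPtr and colInd.
__global__ void csrAddSamePattern(const float* valA, const float* valB, float* valC, int nnz)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < nnz)
        valC[i] = valA[i] + valB[i];
}
// Launched from host code with hypothetical device arrays dA, dB, dC, e.g.:
// csrAddSamePattern<<<iDivUp(nnz, 256), 256>>>(dA, dB, dC, nnz);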