CSRmatrix matMulti(float alpha,
int height_A, int width_A, size_t nnz_A, const float* elem_A, const int* colInd_A, const int* rowPtr_A,
int height_B, int width_B, size_t nnz_B, const float* elem_B, const int* colInd_B, const int* rowPtr_B)
{
if (width_A != height_B){
fprintf(stderr, "矩阵无法相乘\n");
exit(EXIT_FAILURE);
}
if (nnz_A > 0 && nnz_B > 0) {
float beta = 0.0f;
size_t rowPtrSize_A = (size_t)height_A + 1;
size_t rowPtrSize_B = (size_t)height_B + 1;
cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
cusparseOperation_t opB = CUSPARSE_OPERATION_NON_TRANSPOSE;
cudaDataType computeType = CUDA_R_32F;
int* rowPtr_A_d, * colInd_A_d, * rowPtr_B_d, * colInd_B_d, * rowPtr_C_d, * colInd_C_d;
float* elem_A_d, * elem_B_d, * elem_C_d;
// allocate A
checkCuda(cudaMalloc((void**)&rowPtr_A_d, (size_t)rowPtrSize_A * sizeof(int)));
checkCuda(cudaMalloc((void**)&colInd_A_d, (size_t)nnz_A * sizeof(int)));
checkCuda(cudaMalloc((void**)&elem_A_d, (size_t)nnz_A * sizeof(float)));
// allocate B
checkCuda(cudaMalloc((void**)&rowPtr_B_d, (size_t)rowPtrSize_B * sizeof(int)));
checkCuda(cudaMalloc((void**)&colInd_B_d, (size_t)nnz_B * sizeof(int)));
checkCuda(cudaMalloc((void**)&elem_B_d, (size_t)nnz_B * sizeof(float)));
// allocate C rowPtr
checkCuda(cudaMalloc((void**)&rowPtr_C_d, (size_t)rowPtrSize_A * sizeof(int)));
// copy A
checkCuda(cudaMemcpy(rowPtr_A_d, rowPtr_A, (size_t)rowPtrSize_A * sizeof(int), cudaMemcpyHostToDevice));
checkCuda(cudaMemcpy(colInd_A_d, colInd_A, (size_t)nnz_A * sizeof(int), cudaMemcpyHostToDevice));
checkCuda(cudaMemcpy(elem_A_d, elem_A, (size_t)nnz_A * sizeof(float), cudaMemcpyHostToDevice));
// copy B
checkCuda(cudaMemcpy(rowPtr_B_d, rowPtr_B, (size_t)rowPtrSize_B * sizeof(int), cudaMemcpyHostToDevice));
checkCuda(cudaMemcpy(colInd_B_d, colInd_B, (size_t)nnz_B * sizeof(int), cudaMemcpyHostToDevice));
checkCuda(cudaMemcpy(elem_B_d, elem_B, (size_t)nnz_B * sizeof(float), cudaMemcpyHostToDevice));
// CUSPARSE APIs
cusparseHandle_t handle = NULL;
cusparseSpMatDescr_t matA, matB, matC;
void* dBuffer1 = NULL, * dBuffer2 = NULL;
size_t bufferSize1 = 0, bufferSize2 = 0;
checkCusparse(cusparseCreate(&handle));
// Create sparse matrix A in CSR format
checkCusparse(cusparseCreateCsr(&matA, height_A, width_A, nnz_A, rowPtr_A_d, colInd_A_d, elem_A_d,
CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F));
checkCusparse(cusparseCreateCsr(&matB, height_B, width_B, nnz_B, rowPtr_B_d, colInd_B_d, elem_B_d,
CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F));
checkCusparse(cusparseCreateCsr(&matC, height_A, width_B, 0, NULL, NULL, NULL,
CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F));
// SpGEMM Computation
cusparseSpGEMMDescr_t spgemmDesc;
checkCusparse(cusparseSpGEMM_createDescr(&spgemmDesc));
// ask bufferSize1 bytes for external memory
checkCusparse(cusparseSpGEMM_workEstimation(handle, opA, opB, &alpha, matA, matB, &beta, matC,
computeType, CUSPARSE_SPGEMM_DEFAULT, spgemmDesc, &bufferSize1, NULL));
cudaMalloc((void**)&dBuffer1, bufferSize1);
// inspect the matrices A and B to understand the memory requiremnent for
// the next step
cusparseSpGEMM_workEstimation(handle, opA, opB, &alpha, matA, matB, &beta, matC,
computeType, CUSPARSE_SPGEMM_DEFAULT, spgemmDesc, &bufferSize1, dBuffer1);
// ask bufferSize2 bytes for external memory
cusparseSpGEMM_compute(handle, opA, opB, &alpha, matA, matB, &beta, matC, computeType, CUSPARSE_SPGEMM_DEFAULT, spgemmDesc, &bufferSize2, NULL);
cudaMalloc((void**)&dBuffer2, bufferSize2);
// compute the intermediate product of A * B
cusparseSpGEMM_compute(handle, opA, opB, &alpha, matA, matB, &beta, matC,
computeType, CUSPARSE_SPGEMM_DEFAULT, spgemmDesc, &bufferSize2, dBuffer2);
// get matrix C non-zero entries C_num_nnz1
int64_t height_C, width_C, nnz_C;
cusparseSpMatGetSize(matC, &height_C, &width_C, &nnz_C);
if (nnz_C == 0) {
return CSRmatrix(height_A, width_B);
}
// allocate matrix C
cudaMalloc((void**)&colInd_C_d, (size_t)nnz_C * sizeof(int));
cudaMalloc((void**)&elem_C_d, (size_t)nnz_C * sizeof(float));
// update matC with the new pointers
cusparseCsrSetPointers(matC, rowPtr_C_d, colInd_C_d, elem_C_d);
// copy the final products to the matrix C
cusparseSpGEMM_copy(handle, opA, opB, &alpha, matA, matB, &beta, matC,
computeType, CUSPARSE_SPGEMM_DEFAULT, spgemmDesc);
// destroy matrix/vector descriptors
checkCusparse(cusparseSpGEMM_destroyDescr(spgemmDesc));
checkCusparse(cusparseDestroySpMat(matA));
checkCusparse(cusparseDestroySpMat(matB));
checkCusparse(cusparseDestroySpMat(matC));
checkCusparse(cusparseDestroy(handle));
vector<int> rowPtr_C(rowPtrSize_A);
vector<int> colInd_C(nnz_C);
vector<float> elem_C(nnz_C);
checkCuda(cudaMemcpy(rowPtr_C.data(), rowPtr_C_d, (size_t)rowPtrSize_A * sizeof(int), cudaMemcpyDeviceToHost));
checkCuda(cudaMemcpy(colInd_C.data(), colInd_C_d, (size_t)nnz_C * sizeof(int), cudaMemcpyDeviceToHost));
checkCuda(cudaMemcpy(elem_C.data(), elem_C_d, (size_t)nnz_C * sizeof(float), cudaMemcpyDeviceToHost));
// device memory deallocation
checkCuda(cudaFree(dBuffer1));
checkCuda(cudaFree(dBuffer2));
checkCuda(cudaFree(rowPtr_A_d));
checkCuda(cudaFree(colInd_A_d));
checkCuda(cudaFree(elem_A_d));
checkCuda(cudaFree(rowPtr_B_d));
checkCuda(cudaFree(colInd_B_d));
checkCuda(cudaFree(elem_B_d));
checkCuda(cudaFree(rowPtr_C_d));
checkCuda(cudaFree(colInd_C_d));
checkCuda(cudaFree(elem_C_d));
return CSRmatrix(height_A, width_B, nnz_C, elem_C, colInd_C, rowPtr_C);
}
return CSRmatrix(height_A, width_B);
}
C++ CUDA API failed at line 394 with error: an illegal memory access was encount