求解决C++ CUDA API failed at line 394 with error: an illegal memory access was encount


CSRmatrix matMulti(float alpha,
                   int height_A, int width_A, size_t nnz_A, const float* elem_A, const int* colInd_A, const int* rowPtr_A, 
                   int height_B, int width_B, size_t nnz_B, const float* elem_B, const int* colInd_B, const int* rowPtr_B) 
{

    if (width_A != height_B){
        fprintf(stderr, "矩阵无法相乘\n");
        exit(EXIT_FAILURE);
    }

    if (nnz_A > 0 && nnz_B > 0) {
        float beta = 0.0f;
        size_t rowPtrSize_A = (size_t)height_A + 1;
        size_t rowPtrSize_B = (size_t)height_B + 1;

        cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
        cusparseOperation_t opB = CUSPARSE_OPERATION_NON_TRANSPOSE;
        cudaDataType        computeType = CUDA_R_32F;

        int* rowPtr_A_d, * colInd_A_d, * rowPtr_B_d, * colInd_B_d, * rowPtr_C_d, * colInd_C_d;
        float* elem_A_d, * elem_B_d, * elem_C_d;
        // allocate A
        checkCuda(cudaMalloc((void**)&rowPtr_A_d, (size_t)rowPtrSize_A * sizeof(int)));
        checkCuda(cudaMalloc((void**)&colInd_A_d, (size_t)nnz_A * sizeof(int)));
        checkCuda(cudaMalloc((void**)&elem_A_d, (size_t)nnz_A * sizeof(float)));
        // allocate B
        checkCuda(cudaMalloc((void**)&rowPtr_B_d, (size_t)rowPtrSize_B * sizeof(int)));
        checkCuda(cudaMalloc((void**)&colInd_B_d, (size_t)nnz_B * sizeof(int)));
        checkCuda(cudaMalloc((void**)&elem_B_d, (size_t)nnz_B * sizeof(float)));
        // allocate C rowPtr
        checkCuda(cudaMalloc((void**)&rowPtr_C_d, (size_t)rowPtrSize_A * sizeof(int)));

        // copy A
        checkCuda(cudaMemcpy(rowPtr_A_d, rowPtr_A, (size_t)rowPtrSize_A * sizeof(int), cudaMemcpyHostToDevice));
        checkCuda(cudaMemcpy(colInd_A_d, colInd_A, (size_t)nnz_A * sizeof(int), cudaMemcpyHostToDevice));
        checkCuda(cudaMemcpy(elem_A_d, elem_A, (size_t)nnz_A * sizeof(float), cudaMemcpyHostToDevice));
        // copy B
        checkCuda(cudaMemcpy(rowPtr_B_d, rowPtr_B, (size_t)rowPtrSize_B * sizeof(int), cudaMemcpyHostToDevice));
        checkCuda(cudaMemcpy(colInd_B_d, colInd_B, (size_t)nnz_B * sizeof(int), cudaMemcpyHostToDevice));
        checkCuda(cudaMemcpy(elem_B_d, elem_B, (size_t)nnz_B * sizeof(float), cudaMemcpyHostToDevice));
           
        // CUSPARSE APIs
        cusparseHandle_t     handle = NULL;
        cusparseSpMatDescr_t matA, matB, matC;
        void* dBuffer1 = NULL, * dBuffer2 = NULL;
        size_t bufferSize1 = 0, bufferSize2 = 0;
        checkCusparse(cusparseCreate(&handle));
        // Create sparse matrix A in CSR format
        checkCusparse(cusparseCreateCsr(&matA, height_A, width_A, nnz_A, rowPtr_A_d, colInd_A_d, elem_A_d,
                                        CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F));
        checkCusparse(cusparseCreateCsr(&matB, height_B, width_B, nnz_B, rowPtr_B_d, colInd_B_d, elem_B_d,
                                        CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F));
        checkCusparse(cusparseCreateCsr(&matC, height_A, width_B, 0, NULL, NULL, NULL,
                                        CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F));
         
        // SpGEMM Computation
        cusparseSpGEMMDescr_t spgemmDesc;
        checkCusparse(cusparseSpGEMM_createDescr(&spgemmDesc));

        // ask bufferSize1 bytes for external memory
        checkCusparse(cusparseSpGEMM_workEstimation(handle, opA, opB, &alpha, matA, matB, &beta, matC,
                                                    computeType, CUSPARSE_SPGEMM_DEFAULT, spgemmDesc, &bufferSize1, NULL));
        cudaMalloc((void**)&dBuffer1, bufferSize1);
        // inspect the matrices A and B to understand the memory requiremnent for
        // the next step
        cusparseSpGEMM_workEstimation(handle, opA, opB, &alpha, matA, matB, &beta, matC, 
                                      computeType, CUSPARSE_SPGEMM_DEFAULT, spgemmDesc, &bufferSize1, dBuffer1);

        // ask bufferSize2 bytes for external memory
        cusparseSpGEMM_compute(handle, opA, opB, &alpha, matA, matB, &beta, matC, computeType, CUSPARSE_SPGEMM_DEFAULT, spgemmDesc, &bufferSize2, NULL);
        cudaMalloc((void**)&dBuffer2, bufferSize2);

        // compute the intermediate product of A * B
        cusparseSpGEMM_compute(handle, opA, opB, &alpha, matA, matB, &beta, matC, 
                               computeType, CUSPARSE_SPGEMM_DEFAULT, spgemmDesc, &bufferSize2, dBuffer2);
        // get matrix C non-zero entries C_num_nnz1
        int64_t height_C, width_C, nnz_C;
        cusparseSpMatGetSize(matC, &height_C, &width_C, &nnz_C);

        if (nnz_C == 0) {
            return CSRmatrix(height_A, width_B);
        }

        // allocate matrix C
        cudaMalloc((void**)&colInd_C_d, (size_t)nnz_C * sizeof(int));
        cudaMalloc((void**)&elem_C_d, (size_t)nnz_C * sizeof(float));
        // update matC with the new pointers
        cusparseCsrSetPointers(matC, rowPtr_C_d, colInd_C_d, elem_C_d);

        // copy the final products to the matrix C
        cusparseSpGEMM_copy(handle, opA, opB, &alpha, matA, matB, &beta, matC,
                            computeType, CUSPARSE_SPGEMM_DEFAULT, spgemmDesc);

        // destroy matrix/vector descriptors
        checkCusparse(cusparseSpGEMM_destroyDescr(spgemmDesc));
        checkCusparse(cusparseDestroySpMat(matA));
        checkCusparse(cusparseDestroySpMat(matB));
        checkCusparse(cusparseDestroySpMat(matC));
        checkCusparse(cusparseDestroy(handle));

        vector<int> rowPtr_C(rowPtrSize_A);
        vector<int> colInd_C(nnz_C);
        vector<float> elem_C(nnz_C);

        checkCuda(cudaMemcpy(rowPtr_C.data(), rowPtr_C_d, (size_t)rowPtrSize_A * sizeof(int), cudaMemcpyDeviceToHost));
        checkCuda(cudaMemcpy(colInd_C.data(), colInd_C_d, (size_t)nnz_C * sizeof(int), cudaMemcpyDeviceToHost));
        checkCuda(cudaMemcpy(elem_C.data(), elem_C_d, (size_t)nnz_C * sizeof(float), cudaMemcpyDeviceToHost));
        
        // device memory deallocation
        checkCuda(cudaFree(dBuffer1));
        checkCuda(cudaFree(dBuffer2));
        checkCuda(cudaFree(rowPtr_A_d));
        checkCuda(cudaFree(colInd_A_d));
        checkCuda(cudaFree(elem_A_d));
        checkCuda(cudaFree(rowPtr_B_d));
        checkCuda(cudaFree(colInd_B_d));
        checkCuda(cudaFree(elem_B_d));
        checkCuda(cudaFree(rowPtr_C_d));
        checkCuda(cudaFree(colInd_C_d));
        checkCuda(cudaFree(elem_C_d));
        
        return  CSRmatrix(height_A, width_B, nnz_C, elem_C, colInd_C, rowPtr_C);
    }
    
    return CSRmatrix(height_A, width_B);
    
}

C++ CUDA API failed at line 394 with error: an illegal memory access was encount