我用 cuSPARSE 库实现了 CSR+CSR 相加,但如果我有多个 CSR 矩阵需要相加,应该怎么处理??可加分

下面是我调用 cuSPARSE 库实现的 CSR+CSR 相加。还需要自己写核函数吗?如果需要,这个算法能套进核函数里调用吗?

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

#include <cuda_runtime.h>
#include <cusparse.h>

#include <iostream>
using namespace std;

//iDivUp 函数
// Integer ceiling division: number of size-b chunks needed to cover a
// (commonly used to compute a CUDA grid size from an element count).
int iDivUp(int a, int b)
{
    int quotient = a / b;
    if (a % b != 0)
        quotient += 1;
    return quotient;
}

//Cuda 错误检查
// CUDA runtime error check: print a human-readable message tagged with the
// given source location; optionally terminate the process with the error code.
void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
{
    if (code == cudaSuccess)
        return;

    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort)
        exit(code);
}

// Check a CUDA runtime call, reporting the CALLER's file/line on failure.
// This must be a macro (the canonical idiom): as a function it would always
// report this wrapper's own location, which is useless for debugging.
// Call sites are unchanged: gpuErrchk(cudaMalloc(...));
#define gpuErrchk(ans) gpuAssert((ans), __FILE__, __LINE__)

// Map a cusparseStatus_t to the name of its enumerator, for error reporting.
// Unrecognized values yield "<unknown>".
static const char* _cusparseGetErrorEnum(cusparseStatus_t error)
{
    switch (error)
    {
    case CUSPARSE_STATUS_SUCCESS:                   return "CUSPARSE_STATUS_SUCCESS";
    case CUSPARSE_STATUS_NOT_INITIALIZED:           return "CUSPARSE_STATUS_NOT_INITIALIZED";
    case CUSPARSE_STATUS_ALLOC_FAILED:              return "CUSPARSE_STATUS_ALLOC_FAILED";
    case CUSPARSE_STATUS_INVALID_VALUE:             return "CUSPARSE_STATUS_INVALID_VALUE";
    case CUSPARSE_STATUS_ARCH_MISMATCH:             return "CUSPARSE_STATUS_ARCH_MISMATCH";
    case CUSPARSE_STATUS_MAPPING_ERROR:             return "CUSPARSE_STATUS_MAPPING_ERROR";
    case CUSPARSE_STATUS_EXECUTION_FAILED:          return "CUSPARSE_STATUS_EXECUTION_FAILED";
    case CUSPARSE_STATUS_INTERNAL_ERROR:            return "CUSPARSE_STATUS_INTERNAL_ERROR";
    case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
    case CUSPARSE_STATUS_ZERO_PIVOT:                return "CUSPARSE_STATUS_ZERO_PIVOT";
    default:                                        return "<unknown>";
    }
}

// cuSPARSE status check: on failure, print the failing location and the
// status enumerator name, then abort via assert(0).
//
// BUG FIX: the original printed __FILE__/__LINE__ of THIS function instead of
// the `file`/`line` parameters, so every error report pointed here. It also
// carried stray '\' line-continuations left over from a macro it was copied
// from; those are removed.
inline void __cusparseSafeCall(cusparseStatus_t err, const char* file, const int line)
{
    if (CUSPARSE_STATUS_SUCCESS != err) {
        fprintf(stderr, "CUSPARSE error in file '%s', line %d, error %s\nterminating!\n",
                file, line, _cusparseGetErrorEnum(err));
        assert(0);
    }
}

// Check a cuSPARSE call, reporting the CALLER's file/line on failure.
// Must be a macro: the original function expanded __FILE__/__LINE__ at its own
// definition, so every error message pointed to this line instead of the call
// site. Call sites are unchanged: cusparseSafeCall(cusparseCreate(&handle));
#define cusparseSafeCall(err) __cusparseSafeCall((err), __FILE__, __LINE__)

// Demo: compute C = alpha*A + beta*B for two sparse matrices in CSR format
// using the cuSPARSE csrgeam2 three-step workflow:
//   1) query scratch-buffer size, 2) count nnz(C) and build C's row pointers,
//   3) compute C's column indices and values.
// To sum many CSR matrices, run this pairwise in a loop (C = A+B, then C = C+D, ...).
int main()
{
    // Create the cuSPARSE library handle.
    cusparseHandle_t handle;
    cusparseSafeCall(cusparseCreate(&handle));

    // Matrix descriptors (defaults: general matrix, zero-based indexing).
    cusparseMatDescr_t descrA, descrB, descrC;
    cusparseSafeCall(cusparseCreateMatDescr(&descrA));
    cusparseSafeCall(cusparseCreateMatDescr(&descrB));
    cusparseSafeCall(cusparseCreateMatDescr(&descrC));

    const int M = 2;        // number of rows
    const int N = 2;        // number of columns
    const int nnzA = 4;     // non-zeros in A
    const int nnzB = 4;     // non-zeros in B

    // Matrix A in CSR form (dense 2x2 of ones).
    int   h_csrRowPtrA[M + 1] = { 0, 2, 4 };
    int   h_csrColIndA[nnzA]  = { 0, 1, 0, 1 };
    float h_csrValA[nnzA]     = { 1, 1, 1, 1 };

    // Matrix B in CSR form (dense 2x2 of ones).
    int   h_csrRowPtrB[M + 1] = { 0, 2, 4 };
    int   h_csrColIndB[nnzB]  = { 0, 1, 0, 1 };
    float h_csrValB[nnzB]     = { 1, 1, 1, 1 };

    // Device copies of the CSR triplets of A and B.
    float* d_csrValA;       gpuErrchk(cudaMalloc(&d_csrValA, nnzA * sizeof(float)));
    int* d_csrRowPtrA;      gpuErrchk(cudaMalloc(&d_csrRowPtrA, (M + 1) * sizeof(int)));
    int* d_csrColIndA;      gpuErrchk(cudaMalloc(&d_csrColIndA, nnzA * sizeof(int)));

    float* d_csrValB;       gpuErrchk(cudaMalloc(&d_csrValB, nnzB * sizeof(float)));
    int* d_csrRowPtrB;      gpuErrchk(cudaMalloc(&d_csrRowPtrB, (M + 1) * sizeof(int)));
    int* d_csrColIndB;      gpuErrchk(cudaMalloc(&d_csrColIndB, nnzB * sizeof(int)));

    gpuErrchk(cudaMemcpy(d_csrValA, h_csrValA, nnzA * sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_csrRowPtrA, h_csrRowPtrA, (M + 1) * sizeof(int), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_csrColIndA, h_csrColIndA, nnzA * sizeof(int), cudaMemcpyHostToDevice));

    gpuErrchk(cudaMemcpy(d_csrValB, h_csrValB, nnzB * sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_csrRowPtrB, h_csrRowPtrB, (M + 1) * sizeof(int), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_csrColIndB, h_csrColIndB, nnzB * sizeof(int), cudaMemcpyHostToDevice));

    // ---- C = alpha*A + beta*B -------------------------------------------
    float alpha = 1.f, beta = 1.f;
    // BUG FIX: these are ints; the original initialized them with NULL.
    int baseC = 0, nnzC = 0;
    // With POINTER_MODE_HOST, Xcsrgeam2Nnz writes nnz(C) through this host pointer.
    int* nnzTotalDevHostPtr = &nnzC;
    cusparseSafeCall(cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_HOST));

    int* d_csrRowPtrC = NULL;
    int* d_csrColIndC = NULL;
    float* d_csrValC = NULL;
    gpuErrchk(cudaMalloc(&d_csrRowPtrC, sizeof(int) * (M + 1)));

    // Step 1: query the scratch-buffer size.
    // BUG FIX: the geam2 API takes (handle, m, n, ...) with m = rows, so M goes
    // first (the original passed N, M — only harmless here because M == N).
    // BUG FIX: the return status was previously unchecked.
    size_t bufferSizeInBytes = 0;
    cusparseSafeCall(cusparseScsrgeam2_bufferSizeExt(handle, M, N,
        &alpha,
        descrA, nnzA, d_csrValA, d_csrRowPtrA, d_csrColIndA,
        &beta,
        descrB, nnzB, d_csrValB, d_csrRowPtrB, d_csrColIndB,
        descrC, d_csrValC, d_csrRowPtrC, d_csrColIndC,
        &bufferSizeInBytes));
    void* buffer = NULL;
    gpuErrchk(cudaMalloc(&buffer, bufferSizeInBytes));

    // Step 2: compute nnz(C) and fill d_csrRowPtrC.
    cusparseSafeCall(cusparseXcsrgeam2Nnz(handle, M, N,
        descrA, nnzA, d_csrRowPtrA, d_csrColIndA,
        descrB, nnzB, d_csrRowPtrB, d_csrColIndB,
        descrC, d_csrRowPtrC,
        nnzTotalDevHostPtr, buffer));
    if (NULL != nnzTotalDevHostPtr)
    {
        nnzC = *nnzTotalDevHostPtr;
    }
    else
    {
        // Fallback: derive nnz(C) from the row-pointer array on the device.
        gpuErrchk(cudaMemcpy(&nnzC, d_csrRowPtrC + M, sizeof(int), cudaMemcpyDeviceToHost));
        gpuErrchk(cudaMemcpy(&baseC, d_csrRowPtrC, sizeof(int), cudaMemcpyDeviceToHost));
        nnzC -= baseC;
    }

    // Step 3: allocate C's remaining arrays and compute the sum.
    gpuErrchk(cudaMalloc(&d_csrColIndC, sizeof(int) * nnzC));
    // BUG FIX: the value array holds float, not double.
    gpuErrchk(cudaMalloc(&d_csrValC, sizeof(float) * nnzC));
    cusparseSafeCall(cusparseScsrgeam2(handle, M, N,
        &alpha,
        descrA, nnzA, d_csrValA, d_csrRowPtrA, d_csrColIndA,
        &beta,
        descrB, nnzB, d_csrValB, d_csrRowPtrB, d_csrColIndB,
        descrC, d_csrValC, d_csrRowPtrC, d_csrColIndC,
        buffer));

    // Copy the CSR arrays of C back to the host.
    int* csrRowPtrC = (int*)malloc(sizeof(int) * (M + 1));
    int* csrColIndC = (int*)malloc(sizeof(int) * nnzC);
    float* csrValC = (float*)malloc(sizeof(float) * nnzC);

    assert(NULL != csrRowPtrC);
    assert(NULL != csrColIndC);
    assert(NULL != csrValC);

    gpuErrchk(cudaMemcpy(csrRowPtrC, d_csrRowPtrC, sizeof(int) * (M + 1), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(csrColIndC, d_csrColIndC, sizeof(int) * nnzC, cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(csrValC, d_csrValC, sizeof(float) * nnzC, cudaMemcpyDeviceToHost));

    // Print the result (nnzC already excludes the index base in both branches).
    cout << "csrValC:" << endl;
    for (int i = 0; i < nnzC; i++)
    {
        cout << csrValC[i] << ",";
    }
    cout << endl;

    cout << "csrColIndC:" << endl;
    for (int i = 0; i < nnzC; i++)
    {
        cout << csrColIndC[i] << ",";
    }
    cout << endl;

    cout << "csrRowPtrC:" << endl;
    for (int i = 0; i < M + 1; i++)
    {
        cout << csrRowPtrC[i] << ",";
    }
    cout << endl;

    // Release all resources (BUG FIX: the scratch buffer was never freed).
    if (buffer) cudaFree(buffer);
    if (d_csrValA) cudaFree(d_csrValA);
    if (d_csrColIndA) cudaFree(d_csrColIndA);
    if (d_csrRowPtrA) cudaFree(d_csrRowPtrA);
    if (d_csrValB) cudaFree(d_csrValB);
    if (d_csrColIndB) cudaFree(d_csrColIndB);
    if (d_csrRowPtrB) cudaFree(d_csrRowPtrB);
    if (d_csrValC) cudaFree(d_csrValC);
    if (d_csrColIndC) cudaFree(d_csrColIndC);
    if (d_csrRowPtrC) cudaFree(d_csrRowPtrC);
    if (csrRowPtrC) free(csrRowPtrC);
    if (csrColIndC) free(csrColIndC);
    if (csrValC) free(csrValC);
    if (handle) cusparseDestroy(handle);
    if (descrA) cusparseDestroyMatDescr(descrA);
    if (descrB) cusparseDestroyMatDescr(descrB);
    if (descrC) cusparseDestroyMatDescr(descrC);
    return 0;
}