csr矩阵加法，我写完后的输出很奇怪，引用cusparse进行求解（Cuda)

有大佬知道我哪里错了吗？？

#include <stdio.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cusparse.h>
#include<iostream>
using namespace std;

int main()
{
   //初始化 cuSparse library
   cusparseHandle_t handle = NULL;
   cusparseMatDescr_t descrA, descrB, descrC = NULL;
   cusparseStatus_t status = CUSPARSE_STATUS_SUCCESS;
   cudaError_t cudaStat1 = cudaSuccess;
   cudaError_t cudaStat2 = cudaSuccess;
   cudaError_t cudaStat3 = cudaSuccess;
   cudaError_t cudaStat4 = cudaSuccess;
   status = cusparseCreate(&handle);
   assert(CUSPARSE_STATUS_SUCCESS == status);

   //初始化矩阵描述符
   status = cusparseCreateMatDescr(&descrA);
   assert(CUSPARSE_STATUS_SUCCESS == status);
   status = cusparseCreateMatDescr(&descrB);
   assert(CUSPARSE_STATUS_SUCCESS == status);
   status = cusparseCreateMatDescr(&descrC);
   assert(CUSPARSE_STATUS_SUCCESS == status);

   const int n = 3; // --- 行数与列数
   const int nnzA = 4; // --- matrix A 的非零个数
   const int nnzB = 4; // --- matrix B 的非零个数
   //matrix A 的CSR格式
   const int csrRowPtrA[n + 1] = { 1, 2, 4, 5 };
   const int csrColIndA[nnzA] = { 1, 2, 3, 3 };
   const double csrValA[nnzA] = { 1.0, 2.0, 4.0, 5.0 };

   //matrix B 的CSR格式
   const int csrRowPtrB[n + 1] = { 1, 1, 1, 1 };
   const int csrColIndB[nnzB] = { 1, 2, 2, 3 };
   const double csrValB[nnzB] = { 1.0, 3.0, 4.0, 5.0 };

   //在Cuda上分配matrix A与matrix B的内存及数据转移
   int* d_csrRowPtrA;
   cudaStat1 = cudaMalloc(&d_csrRowPtrA, sizeof(int) * (n + 1));
   assert(cudaSuccess == cudaStat1);
   cudaStat1 = cudaMemcpy(d_csrRowPtrA, csrRowPtrA, sizeof(int) * (n + 1), cudaMemcpyHostToDevice);
   assert(cudaSuccess == cudaStat1);
   int* d_csrColIndA;
   cudaStat2 = cudaMalloc(&d_csrColIndA, sizeof(int) * nnzA);
   assert(cudaSuccess == cudaStat2);
   cudaStat2 = cudaMemcpy(d_csrColIndA, csrColIndA, sizeof(int) * nnzA, cudaMemcpyHostToDevice);
   assert(cudaSuccess == cudaStat2);
   double* d_csrValA;
   cudaStat3 = cudaMalloc(&d_csrValA, sizeof(double) * nnzA);
   assert(cudaSuccess == cudaStat3);
   cudaStat3 = cudaMemcpy(d_csrValA, csrValA, sizeof(double) * nnzA, cudaMemcpyHostToDevice);
   assert(cudaSuccess == cudaStat3);

   int* d_csrRowPtrB;
   cudaStat1 = cudaMalloc(&d_csrRowPtrB, sizeof(int) * (n + 1));
   assert(cudaSuccess == cudaStat1);
   cudaStat1 = cudaMemcpy(d_csrRowPtrB, csrRowPtrB, sizeof(int) * (n + 1), cudaMemcpyHostToDevice);
   assert(cudaSuccess == cudaStat1);
   int* d_csrColIndB;
   cudaStat2 = cudaMalloc(&d_csrColIndB, sizeof(int) * nnzB);
   assert(cudaSuccess == cudaStat2);
   cudaStat2 = cudaMemcpy(d_csrColIndB, csrColIndB, sizeof(int) * nnzB, cudaMemcpyHostToDevice);
   assert(cudaSuccess == cudaStat2);
   double* d_csrValB;
   cudaStat3 = cudaMalloc(&d_csrValB, sizeof(double) * nnzB);
   assert(cudaSuccess == cudaStat3);
   cudaStat3 = cudaMemcpy(d_csrValB, csrValB, sizeof(double) * nnzB, cudaMemcpyHostToDevice);
   assert(cudaSuccess == cudaStat3);

   //matrix A与matrix B 求和
   int baseC, nnzC;
   double alpha = 1.f, beta = 1.f;
   size_t BufferSizeInBytes;
   char* buffer = NULL;
   int* nnzTotalDevHostPtr = &nnzC;
   cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_HOST);
   int* d_csrRowPtrC = NULL, * d_csrColIndC = NULL; double* d_csrValC = NULL;
   cudaStat1 = cudaMalloc(&d_csrRowPtrC, sizeof(int) * (n + 1));
   assert(cudaSuccess == cudaStat1);

   cusparseDcsrgeam2_bufferSizeExt(handle, n, n,
       &alpha,
       descrA, nnzA, d_csrValA, d_csrRowPtrA, d_csrColIndA,
       &beta,
       descrB, nnzB, d_csrValB, d_csrRowPtrB, d_csrColIndB,
       descrC, d_csrValC, d_csrRowPtrC, d_csrColIndC,
       &BufferSizeInBytes);
   cudaMalloc((void**)&buffer, sizeof(char) * BufferSizeInBytes);
   status = cusparseXcsrgeam2Nnz(handle, n, n, descrA, nnzA, d_csrRowPtrA, d_csrColIndA, descrB, nnzB, d_csrRowPtrB, d_csrColIndB, descrC, d_csrRowPtrC,
       nnzTotalDevHostPtr, buffer);
   if (NULL != nnzTotalDevHostPtr)
   {
       nnzC = *nnzTotalDevHostPtr;
   }
   else
   {
       cudaMemcpy(&nnzC, d_csrRowPtrC + n, sizeof(int), cudaMemcpyDeviceToHost);
       cudaMemcpy(&baseC, d_csrRowPtrC, sizeof(int), cudaMemcpyDeviceToHost);
       nnzC -= baseC;
   }
   cudaMalloc(&d_csrColIndC, sizeof(int) * nnzC);
   cudaMalloc(&d_csrValC, sizeof(double) * nnzC);
   status = cusparseDcsrgeam2(
       handle, n, n,
       &alpha,
       descrA, nnzA, d_csrValA, d_csrRowPtrA, d_csrColIndA,
       &beta,
       descrB, nnzB, d_csrValB, d_csrRowPtrB, d_csrColIndB,
       descrC, d_csrValC, d_csrRowPtrC, d_csrColIndC,
       buffer);

   //将矩阵C的所有资料往CPU上回传
   int* csrRowPtrC = NULL;
   int* csrColIndC = NULL;
   float* csrValC = NULL;
   csrRowPtrC = (int*)malloc(sizeof(int) * (n + 1));
   csrColIndC = (int*)malloc(sizeof(int) * nnzC);
   csrValC = (float*)malloc(sizeof(float) * nnzC);
   assert(NULL != csrRowPtrC);
   assert(NULL != csrColIndC);
   assert(NULL != csrValC);
   cudaStat1 = cudaMemcpy(csrRowPtrC, d_csrRowPtrC, sizeof(int) * (n + 1),cudaMemcpyDeviceToHost);
   cudaStat2 = cudaMemcpy(csrColIndC, d_csrColIndC, sizeof(int) * nnzC,cudaMemcpyDeviceToHost);
   cudaStat3 = cudaMemcpy(csrValC, d_csrValC, sizeof(float) * nnzC,cudaMemcpyDeviceToHost);
   assert(cudaSuccess == cudaStat1);
   assert(cudaSuccess == cudaStat2);
   assert(cudaSuccess == cudaStat3);

   cout << *csrValC << ",";
   cout << endl;
   cout << *csrColIndC << ",";
   cout << endl;
   cout << *csrRowPtrC << ",";
   cout << endl;

return 0;

}

知道哪里有问题吗？？