用 cuBLAS 库求矩阵范数,求各位大神看看代码哪里不对

// Device allocation sized for a single int; zero-filled below.
void *device_numofduanyuan;
// cuBLAS context. It must be created with cublasCreate before any cuBLAS
// routine is given this handle.
cublasHandle_t handler;
// Start/stop event pairs named for cuBLAS and CULA; presumably used for GPU
// timing further down (no cudaEventRecord is visible in this excerpt).
cudaEvent_t cublas_start,cublas_stop,cula_start,cula_stop;
clock_t begin,end;

// Host clock sample taken before any GPU setup.
begin=clock();
cudaEventCreate(&cublas_start);
cudaEventCreate(&cublas_stop);
cudaEventCreate(&cula_start);
cudaEventCreate(&cula_stop);

// Create the cuBLAS context. This call used to be commented out and named a
// different variable, which left `handler` uninitialized when it was later
// passed to cuBLAS. cublasCreate returns a cublasStatus_t that should be
// compared against CUBLAS_STATUS_SUCCESS, and the handle should be released
// with cublasDestroy(handler) during cleanup.
cublasCreate(&handler);
cudaMalloc((void **)&device_numofduanyuan,sizeof(int));
cudaMemset(device_numofduanyuan,0,sizeof(int));

// For each image row i, gathers that row from every band into matVt, copies
// matVt and matMt to the device, and calls cublasSnrm2 on the device copy.
// NOTE(review): the braces opened by this `if` and by the outer `for` are not
// closed within the visible excerpt.
if (type==3||type==0||type==1||type==2)
{

    // Earlier raw-pointer version, kept for reference:
    //float *matVt,*matMt,*matOMEGA;
    //matVt=(float *)malloc(sizeof(float)*bands*width);  // matVt: one row of data for all bands;
    //matMt=(float *)malloc(sizeof(float)*height*width);
    //matOMEGA=(float *)malloc(sizeof(float)*bands*numofduanyuan);

    // Host matrices; matrix_f and Init_fmatrix are defined outside this
    // excerpt, so their storage layout and initial contents are not known here.
    matrix_f matVt,matMt,matOMEGA;
    Init_fmatrix(matVt,bands,width);
    Init_fmatrix(matMt,height,width);
    Init_fmatrix(matOMEGA,bands,numofduanyuan);

    // host_data is reinterpreted as an array of floats.
    float *temp_data=(float *)host_data;
    float *dev_matVt,*dev_matMt,*dev_matOMEGA;
    // NOTE(review): only `bands` floats are allocated for dev_matVt, but the
    // cudaMemcpy in the loop below writes width*bands floats into it, which
    // overruns the allocation. The size here most likely needs to be
    // sizeof(float)*bands*width.
    cudaMalloc((void**)&dev_matVt,sizeof(float)*bands);
    cudaMalloc((void**)&dev_matMt,sizeof(float)*height*width);
    cudaMalloc((void**)&dev_matOMEGA,sizeof(float)*bands*numofduanyuan);

// NOTE(review): this pragma precedes a host loop whose trip count is a runtime
// value; it is presumably ineffective here - confirm and consider removing.
#pragma unroll
for (int i=0;i<height;i++)
{
for (int j=0;j<bands;j++)
{
for (int k=0;k<width;k++)
{
// Source index: plane j (width*height floats each), row i, column k.
int pos=i*width+k+j*(width*height);
//float temp=host_data[pos];
matVt.mat[j][k]=temp_data[pos];
}
}
// NOTE(review): matVt.mat is indexed as mat[j][k]. If `mat` is a table of row
// pointers, this copies pointer values rather than the float data, and each
// row would need its own copy (or a contiguous staging buffer). Verify against
// the matrix_f definition.
cudaMemcpy(dev_matVt,matVt.mat,sizeof(float)*width*bands,cudaMemcpyHostToDevice);
// NOTE(review): matMt is not written anywhere in the visible code after
// Init_fmatrix, yet it is re-uploaded on every iteration of the i loop.
cudaMemcpy(dev_matMt,matMt.mat,sizeof(float)*height*width,cudaMemcpyHostToDevice);
// NOTE(review) on this call:
//  - `handler` must have been initialized with cublasCreate beforehand.
//  - The 4th argument (incx) is the stride between consecutive elements,
//    counted in elements, not bytes; sizeof(float) makes it 4. Use 1 for a
//    contiguous vector.
//  - The 5th argument is where the result is stored. dev_matMt is device
//    memory, while cuBLAS uses host pointer mode by default; either pass the
//    address of a host float or select CUBLAS_POINTER_MODE_DEVICE with
//    cublasSetPointerMode first.
//  - The result lands at the same address on every iteration, so earlier
//    results are overwritten.
//  - The returned cublasStatus_t (and every cudaError_t above) is ignored.
cublasSnrm2(handler,bands,dev_matVt,sizeof(float),dev_matMt);

http://zhidao.baidu.com/link?url=MxNfYOilcsvOMZ4RJHfPDXWGK0KI5dZFSdGIIoEq04RIqPv2_40UjgU3z-vZJAeONnbl1QmuqBx34zJQZn18idkQy7jzpX5SdL4i2gFPkwu