CUDA 矩阵乘法在 Nsight 中调试时结果正确，但直接运行时结果矩阵中有近一半的元素是 4e-8 这样的数，哪里出错了？

/*
 * Compares two n x n row-major matrices element by element and prints
 * summary statistics of the relative error of `a` against reference `b`.
 *
 *   a, lda : matrix under test and its leading dimension (elements per row)
 *   b, ldb : reference matrix and its leading dimension
 *   n      : number of rows and columns to compare
 *
 * Elements whose reference value is exactly 0 are skipped, because the
 * relative error is undefined there; they still count in the denominator
 * of the printed average (n * n).
 *
 * Prints two lines:
 *   ccount:<elements with relative error < 1>, fcount:<all other elements>
 *   Max error: <largest relative error> Average error: <sum / (n * n)>
 */
void compare_mat(const float* a, int lda, const float* b, int ldb, int n) {
    float max_err = 0;
    double err_sum = 0;  /* double: a float sum drifts over ~n*n additions */
    int ccount = 0;
    int fcount = 0;

    for (int i = 0; i < n; i++) {
        /* size_t row offsets avoid int overflow in i * lda for large n */
        const float* a_row = a + (size_t)i * lda;
        const float* b_row = b + (size_t)i * ldb;
        for (int j = 0; j < n; j++) {
            if (b_row[j] == 0)
                continue;
            float err = (float)fabs((a_row[j] - b_row[j]) / b_row[j]);
            /*
             * Test "err < 1" rather than "err >= 1": every comparison with
             * NaN is false, so a NaN result must land in the failure branch
             * instead of being counted as correct.
             */
            if (err < 1) {
                ccount += 1;
            } else {
                fcount += 1;
            }
            //printf("%g,%g,%g\n", a_row[j], b_row[j], err);
            if (max_err < err)
                max_err = err;
            err_sum += err;
        }
    }

    /* Guard the n <= 0 case so the average is printed as 0, not 0/0. */
    const double cells = (double)n * (double)n;
    printf("ccount:%d, fcount:%d\n", ccount, fcount);
    printf("Max error: %g Average error: %g\n", max_err,
           cells > 0 ? err_sum / cells : 0.0);
}

#define NUM_THREADS 256

/*
 * Naive matrix multiply kernel: c = a * b for n x n row-major matrices.
 *
 * Launch layout: 1-D grid of 1-D blocks. Each thread computes one output
 * element; the flat thread index is split into (row, column) as
 * (idx / n, idx % n). The launch must supply at least n * n threads;
 * surplus threads fall outside the row bound and do nothing.
 *
 *   a, lda : left operand and its leading dimension (elements per row)
 *   b, ldb : right operand and its leading dimension
 *   c, ldc : output matrix and its leading dimension
 *   n      : matrix dimension
 *
 * No shared memory is used.
 */
__global__ static void matMultCUDA(const float* a, size_t lda,
                                   const float* b, size_t ldb,
                                   float* c, size_t ldc, int n) {
    /* Nothing to compute, and this avoids dividing by zero below. */
    if (n <= 0)
        return;

    /* 64-bit flat index: blockIdx.x * blockDim.x can exceed INT_MAX. */
    const size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    const size_t row = idx / (size_t)n;
    const size_t column = idx % (size_t)n;  /* always < n by construction */

    if (row < (size_t)n) {
        float t = 0.0f;
        for (int i = 0; i < n; i++) {
            t += a[row * lda + i] * b[i * ldb + column];
        }
        c[row * ldc + column] = t;
    }
}
/*
 * Reports a failed CUDA runtime call on stderr.
 * Returns true when `err` is cudaSuccess, false otherwise.
 */
static bool cudaCallOk(cudaError_t err, const char* what) {
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
        return false;
    }
    return true;
}

/*
 * Host wrapper: computes c = a * b for n x n row-major matrices on the GPU.
 *
 *   a, lda : left operand (host memory) and its leading dimension
 *   b, ldb : right operand (host memory) and its leading dimension
 *   c, ldc : output matrix (host memory) and its leading dimension
 *   n      : matrix dimension
 *
 * Returns the clock() ticks spent on allocation, transfers, the kernel
 * and cleanup.
 *
 * Every CUDA call is checked. On any failure a message is written to
 * stderr, the remaining GPU steps are skipped and `c` is left untouched.
 * Without these checks a failed run would silently hand back whatever
 * the caller's buffer already contained.
 */
clock_t matmultCUDA(const float* a, int lda, const float* b, int ldb, float* c, int ldc, int n) {
    if (n <= 0)
        return (clock_t)0;  /* nothing to compute; a zero-block grid is invalid */

    float *ac = NULL, *bc = NULL, *cc = NULL;
    const size_t rowBytes = sizeof(float) * (size_t)n;
    const size_t matBytes = rowBytes * (size_t)n;
    const clock_t start = clock();

    bool ok = cudaCallOk(cudaMalloc((void**)&ac, matBytes), "cudaMalloc(ac)")
           && cudaCallOk(cudaMalloc((void**)&bc, matBytes), "cudaMalloc(bc)")
           && cudaCallOk(cudaMalloc((void**)&cc, matBytes), "cudaMalloc(cc)");

    /* Pack the (possibly padded) host rows into dense n-wide device rows. */
    ok = ok
      && cudaCallOk(cudaMemcpy2D(ac, rowBytes, a, sizeof(float) * lda, rowBytes, n,
                                 cudaMemcpyHostToDevice), "cudaMemcpy2D(a)")
      && cudaCallOk(cudaMemcpy2D(bc, rowBytes, b, sizeof(float) * ldb, rowBytes, n,
                                 cudaMemcpyHostToDevice), "cudaMemcpy2D(b)");

    int blocks = (n + NUM_THREADS - 1) / NUM_THREADS;
    printf("%d,%d\n", blocks, NUM_THREADS);

    if (ok) {
        /* blocks * n blocks of NUM_THREADS threads: at least n * n threads. */
        matMultCUDA<<<blocks * n, NUM_THREADS>>>(ac, n, bc, n, cc, n, n);
        /* Launch errors only surface via cudaGetLastError(); execution
           errors surface at the next synchronizing call. */
        ok = cudaCallOk(cudaGetLastError(), "matMultCUDA launch")
          && cudaCallOk(cudaDeviceSynchronize(), "matMultCUDA execution");
    }

    ok = ok
      && cudaCallOk(cudaMemcpy2D(c, sizeof(float) * ldc, cc, rowBytes, rowBytes, n,
                                 cudaMemcpyDeviceToHost), "cudaMemcpy2D(c)");

    /* cudaFree(NULL) is a no-op, so partial allocation is safe here. */
    cudaFree(ac);
    cudaFree(bc);
    cudaFree(cc);

    return clock() - start;
}

/*
 * Driver: generates two n x n matrices with matgen, multiplies them with
 * matmultCUDA and with matmult, compares the two results with compare_mat
 * and prints the time returned by matmultCUDA.
 *
 * Returns 0 on the normal path and when InitCUDA() fails (the original
 * behavior), and 1 when host memory cannot be allocated.
 */
int main() {
    float *a, *b, *c, *d;  /* host buffers: inputs a, b; results c and d */
    int n = 1000;

    if (!InitCUDA())
        return 0;

    const size_t matBytes = sizeof(float) * n * n;
    a = (float*)malloc(matBytes);
    b = (float*)malloc(matBytes);
    c = (float*)malloc(matBytes);
    d = (float*)malloc(matBytes);
    if (a == NULL || b == NULL || c == NULL || d == NULL) {
        fprintf(stderr, "Failed to allocate host matrices\n");
        free(a);  /* free(NULL) is a no-op */
        free(b);
        free(c);
        free(d);
        return 1;
    }

    srand(0);  /* fixed seed, presumably consumed by matgen */
    matgen(a, n, n);
    matgen(b, n, n);

    /* Named `elapsed`: a local called `time` would shadow time(). */
    clock_t elapsed = matmultCUDA(a, n, b, n, c, n, n);
    matmult(a, n, b, n, d, n, n);
    compare_mat(c, n, d, n, n);

    double sec = (double)elapsed / CLOCKS_PER_SEC;
    printf("Time used: %.2f (%.2lf GFLOPS)\n", sec, 2.0 * n * n * n / (sec * 1E9));

    free(a);
    free(b);
    free(c);
    free(d);
    system("pause");
    return 0;
}

你好，我是有问必答小助手。非常抱歉，本次您提出的有问必答问题，技术专家团超时未能为您做出解答。

本次提问扣除的有问必答次数，将会以问答VIP体验卡（1次有问必答机会、商城购买实体图书享受95折优惠）的形式为您补发到账户。

因为有问必答VIP体验卡有效期仅有1天，您在需要使用的时候【私信】联系我，我会为您补发。