#include<bits/stdc++.h>
using namespace std;
#define large 1000
double a[large][large]={0.0};
double b[large][large]={0.0};
double c[large][large]={0.0};
int main(void){
#pragma omp parallel for num_threads(64)
for(int i=0;i<large;i++){
for(int j=0;j<large;j++){
a[i][j]=(double)rand()/(double)RAND_MAX;
b[i][j]=(double)rand()/(double)RAND_MAX;
}
}
clock_t start=clock();
#pragma omp parallel for num_threads(64)
for(int i=0;i<large;i++){
for(int j=0;j<large;j++){
for(int k=0;k<large;k++){
c[i][j]+=a[i][k]*b[k][j];
}
}
}
clock_t finish=clock();
cout<<(double)((finish-start)/1e6)<<endl;
return 0;
}
只对单层循环进行优化,嵌套的多层循环优化效果不佳,可转换为单层循环先
#pragma omp parallel for num_threads(64) 64个核心?
您使用了不正确的算法以ijk顺序乘以矩阵。
c[i][j]+=a[i][k]*b[k][j];
for(i=0; i<large; ++i){
for(k=0; k<large; ++k){
double r = a[i][k];
for(j=0; j<large; ++j){
c[i][j] += r * b[k][j];
}
}
}