提问:最近在做 RC4 算法密钥流的概率统计,用多个线程并行运行 RC4 算法,统计密钥流中各字节值出现的次数。但是串行和并行得到的概率分布出现了较大的偏差,怀疑是统计次数时线程太多,导致用 atomicAdd 函数累加时出现错误。这种情况要怎么解决才好?
以下是部分代码:
// RC4 key-scheduling algorithm: permutes the 256-byte state S in place,
// driven by the bytes of Key.
//   S   - 256-entry state array; the caller is expected to have filled it
//         before the call (the kernel `use` stores S[b] = b).
//   Key - key bytes; indices 0 .. KEY_LEN-1 are read.
__device__ void KSA(unsigned char *S, unsigned char *Key){
    // j is an unsigned char, so the running sum wraps modulo 256 on its own.
    unsigned char j = 0;
    #pragma unroll 256
    for(int i = 0; i < 256; i++){
        j = j + S[i] + Key[i % KEY_LEN];
        // The index is advanced by the loop header rather than by an `i++`
        // inside the Swap arguments: Swap is defined elsewhere, and if it is
        // a macro an argument with a side effect would be evaluated more
        // than once.
        Swap(S[i], S[j]);
    }
}
// RC4 pseudo-random generation: produces 128 keystream bytes from the state S
// and records each of them in a histogram.
//   S             - 256-byte state (as left by KSA); modified in place.
//   cipher_static - counters indexed as [position * 256 + byte value]; indices
//                   0 .. 128*256-1 can be touched, so the buffer must hold at
//                   least that many ints.
// The counters are incremented with atomicAdd because every thread that is
// handed the same cipher_static pointer updates the same counters.
__device__ void PRGA(unsigned char *S, int *cipher_static){
    // Both indices are unsigned char, so they wrap modulo 256 automatically.
    unsigned char i = 0;
    unsigned char j = 0;
    for(int k = 0; k < 128; k++){
        i = i + 1;
        j = j + S[i];
        Swap(S[i], S[j]);
        // Keystream byte for position k is S[(S[i] + S[j]) mod 256].
        const unsigned char temp = S[i] + S[j];
        atomicAdd(&cipher_static[k * 256 + S[temp]], 1);
    }
}   // closes PRGA; this brace was missing, which left the function unterminated
// Kernel: every thread runs 2^16 RC4 instances, each with a freshly drawn
// random key, and adds the keystream bytes produced by PRGA to the shared
// histogram cipher_static.
//   globalState   - cuRAND generator states; entry threadIdx.x is read, so at
//                   least blockDim.x initialised states are required.
//   cipher_static - histogram buffer forwarded to PRGA.
// Launch layout: 1-D grid of 1-D blocks (only the .x components are used).
__global__ void use(curandState *globalState, int *cipher_static)
{
    unsigned char key[KEY_LEN];
    unsigned char S[256];

    curandState localState = globalState[threadIdx.x];
    // Threads with the same threadIdx.x in different blocks load the SAME
    // generator state. Without further action all blocks would draw identical
    // keys, so the histogram would contain gridDim.x copies of the same
    // samples instead of independent ones. skipahead_sequence() moves this
    // thread's private copy forward by a whole number of subsequences, giving
    // each block its own non-overlapping stream without needing more states.
    // NOTE(review): assumes the states were set up with curand_init using the
    // array index as sequence number; the offset blockIdx.x * blockDim.x then
    // yields the subsequence that matches the global thread id - confirm
    // against the (not visible) initialisation code.
    skipahead_sequence((unsigned long long)blockIdx.x * blockDim.x, &localState);

    for(int round = 0; round < (1<<16); round++){
        // Reset the state to the identity permutation for every new key.
        #pragma unroll 256
        for(int b = 0; b < 256; b++){
            S[b] = b;
        }
        // Draw a new random key; N is defined elsewhere in the file.
        for (int i = 0; i < KEY_LEN; i++){
            key[i] = (curand(&localState)) % N;
        }
        KSA(S,key);
        PRGA(S,cipher_static);
    }
}
主机端的调用函数如下:
// Reports a failed CUDA runtime call on stdout.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool data_gen_ok(cudaError_t status, const char *what)
{
    if (status == cudaSuccess) {
        return true;
    }
    printf("data_gen: %s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Runs the `use` kernel on a zeroed histogram of PLAINTEXT_LENGTH * 256
// counters and copies the counters back to the host.
//   result - host buffer that receives the counters; may be null, in which
//            case nothing is copied back.
//            NOTE(review): assumed to hold PLAINTEXT_LENGTH * 256 ints -
//            confirm against the caller.
// Side effects: frees the global devStates and resets the device, so no
// device allocation made before this call is usable afterwards.
void data_gen(int *result)
{
    const size_t size  = size_t(PLAINTEXT_LENGTH) * size_t(256);
    const size_t bytes = size * sizeof(int);
    int *cipher_static = 0;

    bool ok = data_gen_ok(cudaMalloc(&cipher_static, bytes), "cudaMalloc");
    // Zero the counters directly on the device; no host staging buffer needed.
    ok = ok && data_gen_ok(cudaMemset(cipher_static, 0, bytes), "cudaMemset");

    if (ok) {
        use <<< 128, 128 >>> (devStates, cipher_static);
        // A launch does not return an error by itself: launch-configuration
        // errors show up in cudaGetLastError(), faults during execution at
        // the next synchronising call. Without these checks a kernel that
        // was aborted (e.g. by a display watchdog on this long-running
        // kernel) silently produces wrong counts.
        ok = data_gen_ok(cudaGetLastError(), "kernel launch")
          && data_gen_ok(cudaDeviceSynchronize(), "kernel execution");
    }

    if (ok && result) {
        data_gen_ok(cudaMemcpy(result, cipher_static, bytes, cudaMemcpyDeviceToHost),
                    "cudaMemcpy (device to host)");
    }

    // Clean-up happens on every path, including after an error.
    cudaFree(devStates);
    cudaFree(cipher_static);
    cudaDeviceReset();
}
理论结果如下:
将 block 数和 thread 数改成 <<<1,1>>> 就可以得到正常结果,说明加密函数本身没有问题。这种情况该如何修改?