关于 CUDA 并行计数的问题：使用 atomicAdd 统计时得到错误结果，如何解决？

提问：最近在做 RC4 算法密钥流的概率统计，用不同的线程分别运行 RC4 算法，并统计各密钥流字节出现的次数。但是串行与并行得到的概率分布出现了较大的偏差，怀疑是线程太多，导致统计次数时使用的 atomicAdd 累加出现错误。这种情况要怎么解决才好？
以下是部分代码：

/**
 * RC4 key-scheduling step: permutes the 256-entry state array S in place
 * according to Key.
 *
 * @param S    256-byte state array; the caller fills it before the call
 *             and it is modified in place.
 * @param Key  key bytes; indices 0 .. KEY_LEN-1 are read cyclically.
 *
 * Swap and KEY_LEN are defined elsewhere in the file.
 */
__device__ void KSA(unsigned char *S, unsigned char *Key){
    unsigned char j = 0;   // 8-bit accumulator: wraps modulo 256 on purpose

    // The loop counter doubles as the RC4 index "i" (it takes exactly the
    // values 0..255), so no separate counter is needed and no side effect
    // such as i++ is buried inside a Swap argument, which would be
    // evaluated more than once if Swap is implemented as a macro.
    #pragma unroll 256
    for(int i = 0; i < 256; i++){
        j = j + S[i] + Key[i % KEY_LEN];
        Swap(S[i], S[j]);
    }
}
/**
 * RC4 keystream generation with on-the-fly statistics.
 *
 * Produces the first 128 keystream bytes from the scheduled state S and,
 * for each position k, increments the counter of the byte value that was
 * generated at that position.
 *
 * @param S              256-byte RC4 state (already processed by the key
 *                       schedule); modified in place.
 * @param cipher_static  counter table indexed as [k * 256 + byte], so it
 *                       must hold at least 128 * 256 ints. The table is
 *                       updated with atomicAdd, so concurrent callers may
 *                       pass the same table.
 */
__device__ void PRGA(unsigned char *S, int *cipher_static){

    // 8-bit indices: arithmetic wraps modulo 256, as RC4 requires.
    unsigned char i = 0;
    unsigned char j = 0;
    unsigned char temp;

    // An unroll hint (#pragma unroll 128) was commented out in the original.
    for(int k = 0; k < 128; k++){
        i = i + 1;
        j =  j + S[i];
        Swap(S[i],S[j]);
        temp = S[i] + S[j];
        // S[temp] is the keystream byte for position k.
        atomicAdd(&(cipher_static[k*256 + S[temp]]), 1);
    }
}   // this closing brace was missing from the pasted snippet
/**
 * Kernel: each thread runs 2^16 RC4 instances with freshly drawn random
 * keys and accumulates the keystream statistics into cipher_static.
 *
 * Launch layout: 1-D grid of 1-D blocks (only the .x components are used).
 *
 * @param globalState    cuRAND states, read at index threadIdx.x, so at
 *                       least blockDim.x entries are required. They must be
 *                       initialised before the launch (set-up code is not
 *                       part of this snippet).
 * @param cipher_static  device counter table shared by all threads; see
 *                       PRGA for the layout.
 */
__global__ void use(curandState *globalState, int *cipher_static)
{

    unsigned char key[KEY_LEN];
    curandState localState = globalState[threadIdx.x];

    // FIX: the state is selected by threadIdx.x only, so without further
    // action thread t of EVERY block would replay exactly the same random
    // stream, generate exactly the same keys and add the same counts.
    // The histogram would then contain far fewer independent samples than
    // the launch size suggests, which skews the measured distribution
    // (the atomicAdd itself is not at fault).
    // Moving each block ahead by blockDim.x whole subsequences gives every
    // thread of the grid its own non-overlapping stream while still
    // requiring only blockDim.x stored states. Block 0 skips nothing.
    skipahead_sequence((unsigned long long)blockIdx.x * blockDim.x, &localState);

    for(int round = 0; round < (1<<16); round++){

        // Fresh identity permutation for every RC4 instance.
        unsigned char S[256];
        #pragma unroll 256
        for(int b = 0; b < 256; b++){
            S[b] = b;
        }
        // Draw a new random key. N is defined elsewhere.
        // NOTE(review): presumably N == 256 (one full byte per key
        // position) -- confirm; any other value restricts the key bytes.
        for (int i = 0; i < KEY_LEN; i++){
            key[i] = (curand(&localState)) % N;
        }
        KSA(S,key);
        PRGA(S,cipher_static);
    }
}

运行主函数是

/**
 * Host driver: allocates and zeroes the device counter table, launches the
 * statistics kernel on a 128 x 128 grid, copies the counters back to the
 * host and releases every resource it acquired.
 *
 * @param result  NOTE(review): this parameter is never written in the
 *                visible code; the counters only reach the local buffer
 *                cipher_stats. Whatever is supposed to be returned to the
 *                caller has to be derived from cipher_stats before that
 *                buffer is released -- confirm the intended contract.
 *
 * PLAINTEXT_LENGTH, devStates and the kernel `use` are defined elsewhere.
 */
void data_gen(int *result)
{
    const size_t size  = size_t(PLAINTEXT_LENGTH) * size_t(256);
    const size_t bytes = size * sizeof(int);

    int *cipher_static = nullptr;          // device-side counters

    // Host-side counters; the trailing () value-initialises them to zero,
    // so the buffer is well defined even if the copy-back is skipped.
    int *cipher_stats = new int[size]();

    // Every runtime call is checked and the remaining steps are skipped
    // after the first failure, instead of launching a kernel on an
    // unallocated or uninitialised buffer.
    cudaError_t status = cudaMalloc(&cipher_static, bytes);

    if (status == cudaSuccess) {
        // Zero the counters directly on the device.
        status = cudaMemset(cipher_static, 0, bytes);
    }

    if (status == cudaSuccess) {
        use <<< 128, 128 >>> (devStates,cipher_static);
        // A kernel launch returns nothing; configuration errors are only
        // visible through cudaGetLastError().
        status = cudaGetLastError();
    }

    if (status == cudaSuccess) {
        // Blocking copy: also waits for the kernel and reports any error
        // raised while it was running.
        status = cudaMemcpy(cipher_stats, cipher_static, bytes, cudaMemcpyDeviceToHost);
    }

    // NOTE(review): `status` holds the first failure, if any. Report it
    // (e.g. via cudaGetErrorString(status)) with whatever logging facility
    // the rest of the program uses.
    (void)status;

    // On success the counters are available in cipher_stats at this point.

    cudaFree(devStates);
    cudaFree(cipher_static);               // harmless when still null
    delete[] cipher_stats;                 // was leaked before
    cudaDeviceReset();
}

运行结果及报错内容

理论结果如下：

实际结果如下：

将 block 数和每个 block 的线程数改成 <<<1,1>>> 就可以得到正常结果，说明加密函数本身没有问题。这种情况该如何修改？