CUDA并行计算为什么输出全为零

在CUDA中,我将h1[0]和h2[0]拷贝回CPU后输出是正常的,但从del1[0]开始输出就全为零了。我之前是用变量来存储这些参数并进行计算的,结果为零时我以为是寄存器满了;后来改用数组来存储计算,仍然有同样的问题。是因为在计算过程中不能同时读取多个数组或变量么?
遇到的现象和发生背景,请写出第一个错误信息
用代码块功能插入代码,请勿粘贴截图。 不用代码块回答率下降 50%
运行结果及详细报错内容
我的解答思路和尝试过的方法,不写自己思路的,回答率下降 60%
/*
 * One-sided, three-sample slope estimate used at the first and last node.
 *
 * hNear / sNear are the width and secant slope of the interval that touches
 * the end node; hFar / sFar belong to the next interval inwards.
 * The raw estimate is then limited:
 *   - if it points the opposite way to the adjacent secant, it is set to 0;
 *   - if the two secants change sign and the estimate exceeds three times the
 *     adjacent secant in magnitude, it is clamped to 3 * sNear.
 */
static __device__ __forceinline__ float PretreatmentEndSlope(float hNear, float hFar,
                                                             float sNear, float sFar)
{
    float d = ((2.0f * hNear + hFar) * sNear - hNear * sFar) / (hNear + hFar);
    if (d * sNear < 0.0f)
    {
        d = 0.0f;
    }
    else if ((sNear * sFar < 0.0f) && (fabsf(d) > fabsf(3.0f * sNear)))
    {
        d = 3.0f * sNear;
    }
    return d;
}

/*
 * Pretreatment - per-node slope estimation over the samples
 * (DevX[i], DevY[i]), i in [0, Len). The formulas correspond to the
 * shape-preserving PCHIP derivative estimate (weighted harmonic mean of the
 * neighbouring secant slopes in the interior, limited three-point formula at
 * both ends).
 *
 * Launch layout: 1-D grid of 1-D blocks of any size; the kernel walks the
 * nodes with a grid-stride loop, so every node is processed exactly once
 * regardless of the launch configuration. No shared memory and no
 * inter-thread synchronisation is used; each thread writes only index i of
 * every output array.
 *
 * Parameters (every array is indexed with i in [0, Len)):
 *   DevX, DevY  in   sample abscissae / ordinates. Adjacent abscissae must
 *                    differ, otherwise the secant slopes divide by zero.
 *   Len         in   number of samples. Fewer than 3 samples cannot feed the
 *                    three-point stencils, so the kernel returns without
 *                    writing anything in that case.
 *   h1, h2      out  widths of the two intervals used at node i
 *                    (left/right of the node in the interior; the first two
 *                    or last two intervals at the ends).
 *   del1, del2  out  secant slopes over those two intervals.
 *   hs, w1, w2,
 *   dmax, dmin  out  interior-node intermediates; written ONLY for interior
 *                    nodes whose two secant slopes have the same sign.
 *   OutputD     out  estimated derivative at node i; 0 at interior nodes
 *                    where the secant slopes differ in sign or one is zero.
 */
__global__ void Pretreatment(float *DevX, float *DevY,  int Len,float *h1, float *h2, float *w1, float *w2, float *hs, float *del1, float *del2,float *dmax,float*dmin, float *OutputD)
{
    // Both end stencils touch three consecutive samples; with fewer than
    // three the indices i + 2 / i - 2 would fall outside the input arrays.
    if (Len < 3)
    {
        return;
    }

    const int stride = blockDim.x * gridDim.x;
    const int firstNode = blockIdx.x * blockDim.x + threadIdx.x;

    for (int i = firstNode; i < Len; i += stride)
    {
        const bool isFirst = (i == 0);
        const bool isLast = (i == Len - 1);

        // Index of the left-most sample of the three-sample window:
        // [0,1,2] at the first node, [Len-3,Len-2,Len-1] at the last node,
        // [i-1,i,i+1] everywhere else.
        const int base = isFirst ? 0 : (isLast ? Len - 3 : i - 1);

        const float widthA = DevX[base + 1] - DevX[base];
        const float widthB = DevX[base + 2] - DevX[base + 1];
        const float slopeA = (DevY[base + 1] - DevY[base]) / widthA;
        const float slopeB = (DevY[base + 2] - DevY[base + 1]) / widthB;

        // The interior branch previously stored the left width into OutputD
        // and never wrote h1[i]; all four values are now stored for every node.
        h1[i] = widthA;
        h2[i] = widthB;
        del1[i] = slopeA;
        del2[i] = slopeB;

        float d;
        if (isFirst)
        {
            // Interval A touches the first node.
            d = PretreatmentEndSlope(widthA, widthB, slopeA, slopeB);
        }
        else if (isLast)
        {
            // Interval B touches the last node.
            d = PretreatmentEndSlope(widthB, widthA, slopeB, slopeA);
        }
        else if (slopeA * slopeB > 0.0f)
        {
            // Same-sign secants: weighted harmonic mean of the two slopes.
            const float widthSum = widthA + widthB;
            const float weightA = (widthA + widthSum) / (3.0f * widthSum);
            const float weightB = (widthB + widthSum) / (3.0f * widthSum);
            const float slopeMax = fmaxf(fabsf(slopeA), fabsf(slopeB));
            const float slopeMin = fminf(fabsf(slopeA), fabsf(slopeB));

            hs[i] = widthSum;
            w1[i] = weightA;
            w2[i] = weightB;
            dmax[i] = slopeMax;
            dmin[i] = slopeMin;

            d = slopeMin / (weightA * (slopeA / slopeMax) + weightB * (slopeB / slopeMax));
        }
        else
        {
            // Sign change or flat segment: the data has a local extremum
            // here, so the slope is forced to zero.
            d = 0.0f;
        }

        OutputD[i] = d;
    }
}