并行计算 cross(叉积)

如何在 GPU 上并行计算张量沿第 dim 维的 cross(叉积)?
// Computes the 3-D cross product output = x1 x x2 along one tensor dimension.
//
// Each loop iteration handles one 3-vector: it reads the three components of
// x1 and x2 (spaced x1stride / x2stride elements apart) and writes the three
// components of the result (spaced ostride elements apart).
//
// Parameters:
//   size      number of 3-vectors to process. NOTE(review): assumed to be
//             outer * inner (i.e. numel / 3), not the total element count --
//             confirm against the launch site.
//   output    destination buffer.
//   x1, x2    input buffers.
//   x1stride, x2stride, ostride
//             distance, in elements, between consecutive components along the
//             cross dimension for x1, x2 and output respectively.
//
// Offset computation -- ASSUMPTION, confirm with the caller: all three tensors
// are dense and share the logical shape [outer, 3, inner], so that
// x1stride == x2stride == ostride == inner (the product of the sizes of the
// dimensions after the cross dimension). Under that layout the vector with
// linear index pos = outer_idx * inner + inner_idx starts at element
//   outer_idx * 3 * inner + inner_idx.
// Arbitrary (permuted / broadcast) layouts cannot be addressed with a single
// stride per tensor; they would need the full shape and stride arrays.
template <typename T>
__global__ void Cross(const size_t size, T* output, const T* x1, const T* x2, const int x1stride,
                      const int x2stride, const int ostride) {
  // A non-positive stride cannot describe the assumed layout and would make
  // the index decomposition below divide by zero.
  if (ostride <= 0) {
    return;
  }
  const size_t inner = static_cast<size_t>(ostride);

  // Widen before multiplying so the index math is done in size_t rather than
  // in 32-bit unsigned arithmetic.
  const size_t first = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  const size_t step = static_cast<size_t>(blockDim.x) * gridDim.x;

  for (size_t pos = first; pos < size; pos += step) {
    // Split pos into (outer_idx, inner_idx) and skip the 3 components that
    // every preceding outer slice occupies.
    const size_t offsets = (pos / inner) * 3 * inner + pos % inner;

    T* out_row = output + offsets;
    const T* x1_row = x1 + offsets;
    const T* x2_row = x2 + offsets;

    // Load every component before storing anything, so the result stays
    // correct even if output aliases one of the inputs.
    const T a0 = x1_row[0 * x1stride];
    const T a1 = x1_row[1 * x1stride];
    const T a2 = x1_row[2 * x1stride];
    const T b0 = x2_row[0 * x2stride];
    const T b1 = x2_row[1 * x2stride];
    const T b2 = x2_row[2 * x2stride];

    out_row[0 * ostride] = a1 * b2 - a2 * b1;
    out_row[1 * ostride] = a2 * b0 - a0 * b2;
    out_row[2 * ostride] = a0 * b1 - a1 * b0;
  }
}
其中 stride 是张量在第 dim 维上的步长(以元素为单位)。
那么 offsets 应该如何计算呢?