如何在 GPU 上并行计算张量沿第 dim 维(该维长度为 3)的叉积(cross product)?
// Computes 3-component cross products along one axis of a tensor:
//   output[..., :, ...] = x1[..., :, ...] x x2[..., :, ...]
//
// Parameters:
//   size      number of cross products to compute (one per loop iteration).
//   output    destination buffer; three components are written per product.
//   x1, x2    source buffers; three components are read from each per product.
//   x1stride, x2stride, ostride
//             distance, in elements, between consecutive components of one
//             3-vector in the respective buffer.
//
// Offset computation -- assumptions to confirm against the caller:
//   The signature carries no shape information, so the start offset of each
//   3-vector is derived from the stride alone. This is valid when all three
//   tensors are dense (contiguous) and identically shaped, in which case
//   x1stride == x2stride == ostride == product of the sizes of the axes that
//   follow the cross axis, and `size` == (total element count) / 3.
//   Splitting pos into outer = pos / stride and inner = pos % stride, the first
//   component lives at outer * 3 * stride + inner.
//   ostride must be > 0. Broadcast or non-contiguous inputs would need
//   per-tensor offsets supplied by the caller instead.
//
// Launch layout: 1-D grid of 1-D blocks; any grid/block size is accepted
// because each thread advances by the total thread count of the grid.
template <typename T>
__global__ void Cross(const size_t size, T* output, const T* x1, const T* x2, const int x1stride,
                      const int x2stride, const int ostride) {
  // Widen before multiplying: blockIdx.x * blockDim.x is otherwise evaluated in
  // 32-bit unsigned arithmetic and can wrap for large launches.
  const size_t grid_step = static_cast<size_t>(blockDim.x) * gridDim.x;
  const size_t inner_size = static_cast<size_t>(ostride);

  // 64-bit signed strides so that 2 * stride cannot overflow int.
  const long long s1 = x1stride;
  const long long s2 = x2stride;
  const long long so = ostride;

  for (size_t pos = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x; pos < size;
       pos += grid_step) {
    // Decompose the flat product index into (outer, inner) around the cross axis.
    const size_t outer = pos / inner_size;
    const size_t inner = pos - outer * inner_size;
    const size_t offsets = outer * 3 * inner_size + inner;

    T* out_row = output + offsets;
    const T* x1_row = x1 + offsets;
    const T* x2_row = x2 + offsets;

    // Read every operand before the first store so that an in-place call
    // (output aliasing x1 or x2) still sees the original input values.
    const T a0 = x1_row[0 * s1];
    const T a1 = x1_row[1 * s1];
    const T a2 = x1_row[2 * s1];
    const T b0 = x2_row[0 * s2];
    const T b1 = x2_row[1 * s2];
    const T b2 = x2_row[2 * s2];

    out_row[0 * so] = a1 * b2 - a2 * b1;
    out_row[1 * so] = a2 * b0 - a0 * b2;
    out_row[2 * so] = a0 * b1 - a1 * b0;
  }
}
其中 x1stride、x2stride、ostride 分别是 x1、x2、output 在第 dim 维上的步长(以元素个数计)。
那么,每个线程处理第 pos 个叉积时,起始偏移 offsets 应该如何计算?