以下是我的出错代码的所在函数:
float sseAdd(vectorops){
size_t cntbuf = ops.size();
float __declspec(align(16)) s = 0; // 求和变量.
size_t nBlockWidth = 4; // 块宽. SSE寄存器能一次处理4个float.
size_t cntBlock = cntbuf / nBlockWidth; // 块数.
size_t cntRem = cntbuf % nBlockWidth; // 剩余数量.
__m128 xfsSum = _mm_setzero_ps(); // 求和变量。[SSE] 赋初值0
__m128 xfsSum1 = _mm_setzero_ps();
__m128 xfsSum2 = _mm_setzero_ps();
__m128 xfsSum3 = _mm_setzero_ps();
__m128 xfsLoad;
__m128 xfsLoad1;
__m128 xfsLoad2;
__m128 xfsLoad3; // 加载.
const float __declspec(align(16))* q; // 将SSE变量上的多个数值合并时所用指针.
// SSE批量处理.
for (int i = 0; i < cntBlock-16; )
{
//cout << "this is " << i << endl;
xfsLoad = _mm_load_ps(&ops.at(i*4)); // [SSE] 加载
xfsLoad1 = _mm_load_ps(&ops.at((i+1) * 4)); // [SSE] 加载
xfsLoad2 = _mm_load_ps(&ops.at((i+2) * 4)); // [SSE] 加载
xfsLoad3 = _mm_load_ps(&ops.at((i+3) * 4)); // [SSE] 加载
xfsSum = _mm_add_ps(xfsSum, xfsLoad); // [SSE] 单精浮点紧缩加法
xfsSum1 = _mm_add_ps(xfsSum1, xfsLoad1);
xfsSum2 = _mm_add_ps(xfsSum2, xfsLoad2);
xfsSum3 = _mm_add_ps(xfsSum3, xfsLoad3);
i = i + 4;
}
xfsSum = _mm_add_ps(xfsSum, xfsSum1); // 两两合并(0~1).
xfsSum2 = _mm_add_ps(xfsSum2, xfsSum3); // 两两合并(2~3).
xfsSum = _mm_add_ps(xfsSum, xfsSum2);
// 合并.
q = (const float __declspec(align(16))*)&xfsSum;
s = q[0] + q[1] + q[2] + q[3];
// 处理剩下的.
for (int i = 0; i<cntRem; i++)
{
s += ops.at(cntbuf - 1 - i);
}
return s;
};
为什么我在debug下可以运行,release下会报这个错呢?刚接触sse,不太熟悉,求行内大牛帮忙解答一下