据我所知转换MMX 32位MMX内在不再允许__m64。所以,我有很大的麻烦升级这块code上交所。有人告诉我,在另一个堆栈溢出后发表我的code。或许这练习将帮助其他人。
我注释掉_mm_empty认为是正确的事情。我发现,喜欢在emmintrin.h所有其他__m128i opertions功能,但事情仍然是错误的。
原来的32位功能code:
DWORD CSumInsideHorizontalTask :: InternalDoWork()
{
////////////////////////////////////////////////// //////////
//获取本地变量重新presenting从原来的调用参数
USHORT * arrayIn = m_taskdata.arrayIn;
USHORT arrayLen0 = m_taskdata.arrayLen0;
USHORT arrayLen1 = m_taskdata.arrayLen1;
USHORT *内核= m_taskdata.kernel;
USHORT kernelLen = m_taskdata.kernelLen;
uint32_t的* norm_r = m_taskdata.norm_r;
USHORT * outputArray = m_taskdata.outputArray;
USHORT * interArray = m_taskdata.interArray;
////////////////////////////////////////////////// //////////
USHORT tailLength =(USHORT)((kernelLen - 1)/ 2);
_ASSERTE(interArray);
// USHORT *船头= NULL; //当前行
// USHORT * pInterRow = NULL; //在interarray当前行
INT_PTR lpRow =(INT_PTR)arrayIn; //整数指针算术
INT_PTR lpInterRow =(INT_PTR)interArray; //整数指针算术
INT_PTR rowstride = sizeof的(USHORT)* arrayLen1;
INT_PTR lpKernel;
//调整为非零启动
lpRow + = m_nRowStart * rowstride;
lpInterRow + = m_nRowStart * rowstride;
//要处理的只有那些(边)的像素所需要的肠子循环条件
const int的knLeftEdgeMax = kernelLen - 1 - tailLength; //从0到左侧边缘的结束
const int的knRightEdgeStart = arrayLen1 - kernelLen + 1 + tailLength;
INT_PTR lpInterRowInside; //使用这个工作的边缘内
INT H,I;
UINT相加,分;
uint32_t的规范= norm_r [kernelLen-1]; //总是处理完整的内核
INT_PTR lpInnerPixels; //使用这个简化的数学指针在内核中环
INT_PTR cbLeftEdgeStride = 2 * knLeftEdgeMax;
//使用这个为MMX优化
INT fourcount = kernelLen / 4;
INT remainingcount = kernelLen%4;
INT mmxcount = 4 * fourcount; //这是其中剩余的处理
INT loopcount = 0; //使用快速循环测试
_mm_empty();
__m64 ACCU,温度;
__m64移位= _m_from_int(32);
对于(H = m_nRowStart; H< m_nRowEnd; H ++)//为每行
{
//跳过左边缘
lpInterRowInside = lpInterRow + cbLeftEdgeStride;
对于(i = knLeftEdgeMax; I< knRightEdgeStart;我++)//为每个边缘内
{
总和= 0;
分= 0;
lpKernel =(INT_PTR)内核;
lpInnerPixels = lpRow +((ⅰ - tailLength)其中;&小于1); //这是我们开始的行
// MMX优化
ACCU = _mm_setzero_si64(); //零累加器
//矢量处理
为(loopcount = fourcount; loopcount = 0;!loopcount--)// //为可以被处理为一个向量中的每个内核项
{
//总和+ =(UINT)(*(内核+ J)*(arrayIn + H * arrayLen1 + I - tailLength + J));
// _m_pmaddwd:4 * 16位乘加,得到的两个32位= A0 * B0 + A1 * B1; A2 * B2 + A3 * B3]
// _mm_add_pi32 / _m_paddd:2 * 32位插件
临时= _m_pmaddwd(*(__ M64 *)lpKernel,*(__ M64 *)lpInnerPixels);
ACCU = _mm_add_pi32(ACCU,温度); //每个双字都有一个部分和
lpKernel + = 8; lpInnerPixels + = 8;
} //循环内核
//复制HI-DWORD MM0到LO-DWORD的MM1,再总结MMO + MM1
//最后结果存储在变量ACCU
ACCU = _mm_add_pi32(ACCU,_mm_srl_si64(ACCU,移位)); //组合来自上层的结果,下双字
总和= _m_to_int(ACCU); //将MMX结果总结
//标量
对于(loopcount = remainingcount; loopcount = 0;!loopcount--)//为无法被处理成一个向量中每个内核项目
{
//总和+ =(UINT)(*(内核+ J)*(arrayIn + H * arrayLen1 + I - tailLength + J));
总和+ =(UINT)((*(USHORT *)lpKernel)*(USHORT *)(lpInnerPixels));
//点++;
lpKernel + = 2; lpInnerPixels + = 2;
} //循环内核
// *(interArray + H * arrayLen1 + I)=(USHORT)(总和/ *(norm_r +点 - 1));
*(USHORT *)lpInterRowInside =(USHORT)(总和/标准);
lpInterRowInside + = 2; //移动到下一列的sizeof(USHORT)
} //为每列
lpRow + = rowstride; //移动到下一行(H * arrayLen1)
lpInterRow + = rowstride;
} //对每行
_mm_empty();
返回0;
}
64位尝试:
DWORD CSumInsideHorizontalTask :: InternalDoWork()
{
////////////////////////////////////////////////// //////////
//获取本地变量重新presenting从原来的调用参数
USHORT * arrayIn = m_taskdata.arrayIn;
USHORT arrayLen0 = m_taskdata.arrayLen0;
USHORT arrayLen1 = m_taskdata.arrayLen1;
USHORT *内核= m_taskdata.kernel;
USHORT kernelLen = m_taskdata.kernelLen;
uint32_t的* norm_r = m_taskdata.norm_r;
USHORT * outputArray = m_taskdata.outputArray;
USHORT * interArray = m_taskdata.interArray;
////////////////////////////////////////////////// //////////
USHORT tailLength =(USHORT)((kernelLen - 1)/ 2);
_ASSERTE(interArray);
// USHORT *船头= NULL; //当前行
// USHORT * pInterRow = NULL; //在interarray当前行
INT_PTR lpRow =(INT_PTR)arrayIn; //整数指针算术
INT_PTR lpInterRow =(INT_PTR)interArray; //整数指针算术
INT_PTR rowstride = sizeof的(USHORT)* arrayLen1;
INT_PTR lpKernel;
//调整为非零启动
lpRow + = m_nRowStart * rowstride;
lpInterRow + = m_nRowStart * rowstride;
//要处理的只有那些(边)的像素所需要的肠子循环条件
const int的knLeftEdgeMax = kernelLen - 1 - tailLength; //从0到左侧边缘的结束
const int的knRightEdgeStart = arrayLen1 - kernelLen + 1 + tailLength;
INT_PTR lpInterRowInside; //使用这个工作的边缘内
INT H,I;
UINT相加,分;
uint32_t的规范= norm_r [kernelLen-1]; //总是处理完整的内核
INT_PTR lpInnerPixels; //使用这个简化的数学指针在内核中环
INT_PTR cbLeftEdgeStride = 2 * knLeftEdgeMax;
//使用这个为MMX优化
INT fourcount = kernelLen / 4;
INT remainingcount = kernelLen%4;
INT mmxcount = 4 * fourcount; //这是其中剩余的处理
INT loopcount = 0; //使用快速循环测试
// _ mm_empty();
__m128i ACCU,温度;
__m128i移位= _mm_cvtsi32_si128(32);
对于(H = m_nRowStart; H< m_nRowEnd; H ++)//为每行
{
//跳过左边缘
lpInterRowInside = lpInterRow + cbLeftEdgeStride;
对于(i = knLeftEdgeMax; I< knRightEdgeStart;我++)//为每个边缘内
{
总和= 0;
分= 0;
lpKernel =(INT_PTR)内核;
lpInnerPixels = lpRow +((ⅰ - tailLength)其中;&小于1); //这是我们开始的行
// MMX优化
ACCU = _mm_setzero_si128(); //零累加器
//矢量处理
为(loopcount = fourcount; loopcount = 0;!loopcount--)// //为可以被处理为一个向量中的每个内核项
{
//总和+ =(UINT)(*(内核+ J)*(arrayIn + H * arrayLen1 + I - tailLength + J));
// _m_pmaddwd:4 * 16位乘加,得到的两个32位= A0 * B0 + A1 * B1; A2 * B2 + A3 * B3]
// _mm_add_pi32 / _m_paddd:2 * 32位插件
//临时= _m_pmaddwd(*(__ m128i *)lpKernel,*(__ m128i *)lpInnerPixels);
临时= _mm_madd_epi16(*(__ m128i *)lpKernel,*(__ m128i *)lpInnerPixels);
ACCU = _mm_add_epi32(ACCU,温度); //每个双字都有一个部分和
lpKernel + = 8; lpInnerPixels + = 8;
} //循环内核
//复制HI-DWORD MM0到LO-DWORD的MM1,再总结MMO + MM1
//最后结果存储在变量ACCU
ACCU = _mm_add_epi32(ACCU,_mm_sll_epi64(ACCU,移位)); //组合来自上层的结果,下双字
总和= _mm_cvtsi128_si32(ACCU); //将MMX结果总结
//标量
对于(loopcount = remainingcount; loopcount = 0;!loopcount--)//为无法被处理成一个向量中每个内核项目
{
//总和+ =(UINT)(*(内核+ J)*(arrayIn + H * arrayLen1 + I - tailLength + J));
总和+ =(UINT)((*(USHORT *)lpKernel)*(USHORT *)(lpInnerPixels));
//点++;
lpKernel + = 2; lpInnerPixels + = 2;
} //循环内核
// *(interArray + H * arrayLen1 + I)=(USHORT)(总和/ *(norm_r +点 - 1));
*(USHORT *)lpInterRowInside =(USHORT)(总和/标准);
lpInterRowInside + = 2; //移动到下一列的sizeof(USHORT)
} //为每列
lpRow + = rowstride; //移动到下一行(H * arrayLen1)
lpInterRow + = rowstride;
} //对每行
// _ mm_empty();
返回0;
}
解决方案
通过上面的评论中提到修复了所有的问题。 这里是最后的64位工作证卷积code:
DWORD CSumInsideHorizontalTask :: InternalDoWork()
{
////////////////////////////////////////////////// //////////
//获取本地变量重新presenting从原来的调用参数
USHORT * arrayIn = m_taskdata.arrayIn;
USHORT arrayLen0 = m_taskdata.arrayLen0;
USHORT arrayLen1 = m_taskdata.arrayLen1;
USHORT *内核= m_taskdata.kernel;
USHORT kernelLen = m_taskdata.kernelLen;
uint32_t的* norm_r = m_taskdata.norm_r;
USHORT * outputArray = m_taskdata.outputArray;
USHORT * interArray = m_taskdata.interArray;
////////////////////////////////////////////////// //////////
USHORT tailLength =(USHORT)((kernelLen - 1)/ 2);
_ASSERTE(interArray);
// USHORT *船头= NULL; //当前行
// USHORT * pInterRow = NULL; //在interarray当前行
INT_PTR lpRow =(INT_PTR)arrayIn; //整数指针算术
INT_PTR lpInterRow =(INT_PTR)interArray; //整数指针算术
INT_PTR rowstride = sizeof的(USHORT)* arrayLen1;
INT_PTR lpKernel;
//调整为非零启动
lpRow + = m_nRowStart * rowstride;
lpInterRow + = m_nRowStart * rowstride;
//要处理的只有那些(边)的像素所需要的肠子循环条件
const int的knLeftEdgeMax = kernelLen - 1 - tailLength; //从0到左侧边缘的结束
const int的knRightEdgeStart = arrayLen1 - kernelLen + 1 + tailLength;
INT_PTR lpInterRowInside; //使用这个工作的边缘内
INT H,I;
UINT相加,分;
uint32_t的规范= norm_r [kernelLen-1]; //总是处理完整的内核
INT_PTR lpInnerPixels; //使用这个简化的数学指针在内核中环
INT_PTR cbLeftEdgeStride = 2 * knLeftEdgeMax;
//使用这个为MMX优化
INT fourcount = kernelLen / 4;
INT remainingcount = kernelLen%4;
INT mmxcount = 4 * fourcount; //这是其中剩余的处理
INT loopcount = 0; //使用快速循环测试
// _ mm_empty();
__m128i ACCU,温度,mlpkernel,mlpInnerPixels;
__m128i移位= _mm_cvtsi32_si128(32);
对于(H = m_nRowStart; H< m_nRowEnd; H ++)//为每行
{
//跳过左边缘
lpInterRowInside = lpInterRow + cbLeftEdgeStride;
对于(i = knLeftEdgeMax; I< knRightEdgeStart;我++)//为每个边缘内
{
总和= 0;
分= 0;
lpKernel =(INT_PTR)内核;
lpInnerPixels = lpRow +((ⅰ - tailLength)其中;&小于1); //这是我们开始的行
// MMX优化
ACCU = _mm_setzero_si128(); //零累加器
//矢量处理
为(loopcount = fourcount; loopcount = 0;!loopcount--)// //为可以被处理为一个向量中的每个内核项
{
//总和+ =(UINT)(*(内核+ J)*(arrayIn + H * arrayLen1 + I - tailLength + J));
// _m_pmaddwd:4 * 16位乘加,得到的两个32位= A0 * B0 + A1 * B1; A2 * B2 + A3 * B3]
// _mm_add_pi32 / _m_paddd:2 * 32位插件
//临时= _m_pmaddwd(*(__ m128i *)lpKernel,*(__ m128i *)lpInnerPixels);
// mlpkernel = _mm_cvtsi32_si128(lpKernel);
mlpkernel = _mm_cvtsi64_si128(*(__的Int64 *)lpKernel);
mlpInnerPixels = _mm_cvtsi64_si128(*(__的Int64 *)lpInnerPixels);
临时= _mm_madd_epi16(mlpkernel,mlpInnerPixels);
ACCU = _mm_add_epi32(ACCU,温度); //每个双字都有一个部分和
lpKernel + = 8; lpInnerPixels + = 8;
} //循环内核
//复制HI-DWORD MM0到LO-DWORD的MM1,再总结MMO + MM1
//最后结果存储在变量ACCU
ACCU = _mm_add_epi32(ACCU,_mm_srl_epi64(ACCU,移位)); //组合来自上层的结果,下双字
总和= _mm_cvtsi128_si32(ACCU); //将MMX结果总结
//标量
对于(loopcount = remainingcount; loopcount = 0;!loopcount--)//为无法被处理成一个向量中每个内核项目
{
//总和+ =(UINT)(*(内核+ J)*(arrayIn + H * arrayLen1 + I - tailLength + J));
总和+ =(UINT)((*(USHORT *)lpKernel)*(USHORT *)(lpInnerPixels));
//点++;
lpKernel + = 2; lpInnerPixels + = 2;
} //循环内核
// *(interArray + H * arrayLen1 + I)=(USHORT)(总和/ *(norm_r +点 - 1));
*(USHORT *)lpInterRowInside =(USHORT)(总和/标准);
lpInterRowInside + = 2; //移动到下一列的sizeof(USHORT)
} //为每列
lpRow + = rowstride; //移动到下一行(H * arrayLen1)
lpInterRow + = rowstride;
} //对每行
// _ mm_empty();
返回0;
}
I understand converting MMX 32bit mmx intrinsics no longer allows the __m64. So I was having great trouble upgrading this piece of code to SSE. I was told on another stack-Overflow post to post my code. Perhaps this exercise will help others as well.
I commented out '_mm_empty' thinking that was the right thing to do. I found like functions in the emmintrin.h for all the other __m128i opertions, but something is still wrong.
original 32-bit function code:
DWORD CSumInsideHorizontalTask::InternalDoWork()
{
////////////////////////////////////////////////////////////
// get local vars representing parameters from original call
ushort* arrayIn = m_taskdata.arrayIn;
ushort arrayLen0 = m_taskdata.arrayLen0;
ushort arrayLen1 = m_taskdata.arrayLen1;
ushort* kernel = m_taskdata.kernel;
ushort kernelLen = m_taskdata.kernelLen;
uint32_t* norm_r = m_taskdata.norm_r;
ushort* outputArray = m_taskdata.outputArray;
ushort* interArray = m_taskdata.interArray;
////////////////////////////////////////////////////////////
ushort tailLength = (ushort)((kernelLen - 1) / 2);
_ASSERTE(interArray);
//ushort* pRow = NULL; // the current row
//ushort* pInterRow = NULL; // the current row in the interarray
INT_PTR lpRow = (INT_PTR)arrayIn; // for integer pointer arithmatic
INT_PTR lpInterRow = (INT_PTR)interArray; // for integer pointer arithmatic
INT_PTR rowstride = sizeof(ushort)*arrayLen1;
INT_PTR lpKernel;
// adjust for non-zero start
lpRow += m_nRowStart*rowstride;
lpInterRow += m_nRowStart*rowstride;
// want to process only those (edge) pixels that need the innner loop condition
const int knLeftEdgeMax = kernelLen - 1 - tailLength; // go from 0 to the end of the left edge
const int knRightEdgeStart = arrayLen1 - kernelLen + 1 + tailLength;
INT_PTR lpInterRowInside; // use this to work inside the edges
int h, i;
uint sum, points;
uint32_t norm = norm_r[kernelLen-1]; // always process the full kernel
INT_PTR lpInnerPixels; // use this to simplify the pointer math in the kernel loop
INT_PTR cbLeftEdgeStride = 2*knLeftEdgeMax;
// use this for MMX optimizations
int fourcount = kernelLen/4;
int remainingcount = kernelLen%4;
int mmxcount = 4*fourcount; // this is where the remainder is handled
int loopcount = 0; // use the for fast looping tests
_mm_empty();
__m64 accu, temp;
__m64 shifter = _m_from_int(32);
for (h=m_nRowStart; h < m_nRowEnd; h++) // for each row
{
// skip over left edge
lpInterRowInside = lpInterRow + cbLeftEdgeStride;
for (i = knLeftEdgeMax; i < knRightEdgeStart; i++) // for each inside the edges
{
sum = 0;
points = 0;
lpKernel = (INT_PTR)kernel;
lpInnerPixels = lpRow + ((i - tailLength)<<1); // this is where we start in the row
// MMX Optimizations
accu = _mm_setzero_si64(); // zero the accumulator
// VECTOR processing
for (loopcount = fourcount; loopcount != 0; loopcount--) // // for each kernel item that can be processed as a vector
{
//sum += (uint)(*(kernel + j) * *(arrayIn + h * arrayLen1 + i - tailLength + j));
// _m_pmaddwd: : 4*16bit multiply-add, resulting two 32bits = [a0*b0+a1*b1 ; a2*b2+a3*b3]
// _mm_add_pi32/_m_paddd: 2*32bit add
temp = _m_pmaddwd(*(__m64*)lpKernel, *(__m64*)lpInnerPixels);
accu = _mm_add_pi32(accu, temp); // each double word has a partial sum
lpKernel += 8; lpInnerPixels += 8;
} // loop over the kernel
// copy hi-dword of mm0 to lo-dword of mm1, then sum mmo+mm1
// and finally store the result into the variable "accu"
accu = _mm_add_pi32(accu, _mm_srl_si64(accu, shifter)); // combine results from upper and lower double words
sum = _m_to_int(accu); // move mmx result to sum
// SCALAR
for (loopcount = remainingcount; loopcount != 0; loopcount--) // for each kernel item that couldn't be processed as a vector
{
//sum += (uint)(*(kernel + j) * *(arrayIn + h * arrayLen1 + i - tailLength + j));
sum += (uint)((*(ushort*)lpKernel) * *(ushort*)(lpInnerPixels));
//points++;
lpKernel += 2; lpInnerPixels += 2;
} // loop over the kernel
//*(interArray + h * arrayLen1 + i) = (ushort)(sum / *(norm_r + points - 1));
*(ushort*)lpInterRowInside = (ushort)(sum/norm);
lpInterRowInside += 2; // move to next column sizeof(ushort)
} // for each column
lpRow += rowstride; // move to next row ( h * arrayLen1 )
lpInterRow += rowstride;
} // for each row
_mm_empty();
return 0;
}
64 Bit Attempt:
DWORD CSumInsideHorizontalTask::InternalDoWork()
{
////////////////////////////////////////////////////////////
// get local vars representing parameters from original call
ushort* arrayIn = m_taskdata.arrayIn;
ushort arrayLen0 = m_taskdata.arrayLen0;
ushort arrayLen1 = m_taskdata.arrayLen1;
ushort* kernel = m_taskdata.kernel;
ushort kernelLen = m_taskdata.kernelLen;
uint32_t* norm_r = m_taskdata.norm_r;
ushort* outputArray = m_taskdata.outputArray;
ushort* interArray = m_taskdata.interArray;
////////////////////////////////////////////////////////////
ushort tailLength = (ushort)((kernelLen - 1) / 2);
_ASSERTE(interArray);
//ushort* pRow = NULL; // the current row
//ushort* pInterRow = NULL; // the current row in the interarray
INT_PTR lpRow = (INT_PTR)arrayIn; // for integer pointer arithmatic
INT_PTR lpInterRow = (INT_PTR)interArray; // for integer pointer arithmatic
INT_PTR rowstride = sizeof(ushort)*arrayLen1;
INT_PTR lpKernel;
// adjust for non-zero start
lpRow += m_nRowStart*rowstride;
lpInterRow += m_nRowStart*rowstride;
// want to process only those (edge) pixels that need the innner loop condition
const int knLeftEdgeMax = kernelLen - 1 - tailLength; // go from 0 to the end of the left edge
const int knRightEdgeStart = arrayLen1 - kernelLen + 1 + tailLength;
INT_PTR lpInterRowInside; // use this to work inside the edges
int h, i;
uint sum, points;
uint32_t norm = norm_r[kernelLen-1]; // always process the full kernel
INT_PTR lpInnerPixels; // use this to simplify the pointer math in the kernel loop
INT_PTR cbLeftEdgeStride = 2*knLeftEdgeMax;
// use this for MMX optimizations
int fourcount = kernelLen/4;
int remainingcount = kernelLen%4;
int mmxcount = 4*fourcount; // this is where the remainder is handled
int loopcount = 0; // use the for fast looping tests
//_mm_empty();
__m128i accu, temp;
__m128i shifter = _mm_cvtsi32_si128(32);
for (h=m_nRowStart; h < m_nRowEnd; h++) // for each row
{
// skip over left edge
lpInterRowInside = lpInterRow + cbLeftEdgeStride;
for (i = knLeftEdgeMax; i < knRightEdgeStart; i++) // for each inside the edges
{
sum = 0;
points = 0;
lpKernel = (INT_PTR)kernel;
lpInnerPixels = lpRow + ((i - tailLength)<<1); // this is where we start in the row
// MMX Optimizations
accu = _mm_setzero_si128(); // zero the accumulator
// VECTOR processing
for (loopcount = fourcount; loopcount != 0; loopcount--) // // for each kernel item that can be processed as a vector
{
//sum += (uint)(*(kernel + j) * *(arrayIn + h * arrayLen1 + i - tailLength + j));
// _m_pmaddwd: : 4*16bit multiply-add, resulting two 32bits = [a0*b0+a1*b1 ; a2*b2+a3*b3]
// _mm_add_pi32/_m_paddd: 2*32bit add
//temp = _m_pmaddwd(*(__m128i*)lpKernel, *(__m128i*)lpInnerPixels);
temp = _mm_madd_epi16(*(__m128i*)lpKernel, *(__m128i*)lpInnerPixels);
accu = _mm_add_epi32(accu, temp); // each double word has a partial sum
lpKernel += 8; lpInnerPixels += 8;
} // loop over the kernel
// copy hi-dword of mm0 to lo-dword of mm1, then sum mmo+mm1
// and finally store the result into the variable "accu"
accu = _mm_add_epi32(accu, _mm_sll_epi64(accu, shifter)); // combine results from upper and lower double words
sum = _mm_cvtsi128_si32(accu); // move mmx result to sum
// SCALAR
for (loopcount = remainingcount; loopcount != 0; loopcount--) // for each kernel item that couldn't be processed as a vector
{
//sum += (uint)(*(kernel + j) * *(arrayIn + h * arrayLen1 + i - tailLength + j));
sum += (uint)((*(ushort*)lpKernel) * *(ushort*)(lpInnerPixels));
//points++;
lpKernel += 2; lpInnerPixels += 2;
} // loop over the kernel
//*(interArray + h * arrayLen1 + i) = (ushort)(sum / *(norm_r + points - 1));
*(ushort*)lpInterRowInside = (ushort)(sum/norm);
lpInterRowInside += 2; // move to next column sizeof(ushort)
} // for each column
lpRow += rowstride; // move to next row ( h * arrayLen1 )
lpInterRow += rowstride;
} // for each row
//_mm_empty();
return 0;
}
解决方案
With all the issues fixed mentioned above in the comments. Here is the final working x64 SSE Convolution code:
DWORD CSumInsideHorizontalTask::InternalDoWork()
{
////////////////////////////////////////////////////////////
// get local vars representing parameters from original call
ushort* arrayIn = m_taskdata.arrayIn;
ushort arrayLen0 = m_taskdata.arrayLen0;
ushort arrayLen1 = m_taskdata.arrayLen1;
ushort* kernel = m_taskdata.kernel;
ushort kernelLen = m_taskdata.kernelLen;
uint32_t* norm_r = m_taskdata.norm_r;
ushort* outputArray = m_taskdata.outputArray;
ushort* interArray = m_taskdata.interArray;
////////////////////////////////////////////////////////////
ushort tailLength = (ushort)((kernelLen - 1) / 2);
_ASSERTE(interArray);
//ushort* pRow = NULL; // the current row
//ushort* pInterRow = NULL; // the current row in the interarray
INT_PTR lpRow = (INT_PTR)arrayIn; // for integer pointer arithmatic
INT_PTR lpInterRow = (INT_PTR)interArray; // for integer pointer arithmatic
INT_PTR rowstride = sizeof(ushort)*arrayLen1;
INT_PTR lpKernel;
// adjust for non-zero start
lpRow += m_nRowStart*rowstride;
lpInterRow += m_nRowStart*rowstride;
// want to process only those (edge) pixels that need the innner loop condition
const int knLeftEdgeMax = kernelLen - 1 - tailLength; // go from 0 to the end of the left edge
const int knRightEdgeStart = arrayLen1 - kernelLen + 1 + tailLength;
INT_PTR lpInterRowInside; // use this to work inside the edges
int h, i;
uint sum, points;
uint32_t norm = norm_r[kernelLen-1]; // always process the full kernel
INT_PTR lpInnerPixels; // use this to simplify the pointer math in the kernel loop
INT_PTR cbLeftEdgeStride = 2*knLeftEdgeMax;
// use this for MMX optimizations
int fourcount = kernelLen/4;
int remainingcount = kernelLen%4;
int mmxcount = 4*fourcount; // this is where the remainder is handled
int loopcount = 0; // use the for fast looping tests
//_mm_empty();
__m128i accu, temp, mlpkernel, mlpInnerPixels;
__m128i shifter = _mm_cvtsi32_si128(32);
for (h=m_nRowStart; h < m_nRowEnd; h++) // for each row
{
// skip over left edge
lpInterRowInside = lpInterRow + cbLeftEdgeStride;
for (i = knLeftEdgeMax; i < knRightEdgeStart; i++) // for each inside the edges
{
sum = 0;
points = 0;
lpKernel = (INT_PTR)kernel;
lpInnerPixels = lpRow + ((i - tailLength)<<1); // this is where we start in the row
// MMX Optimizations
accu = _mm_setzero_si128(); // zero the accumulator
// VECTOR processing
for (loopcount = fourcount; loopcount != 0; loopcount--) // // for each kernel item that can be processed as a vector
{
//sum += (uint)(*(kernel + j) * *(arrayIn + h * arrayLen1 + i - tailLength + j));
// _m_pmaddwd: : 4*16bit multiply-add, resulting two 32bits = [a0*b0+a1*b1 ; a2*b2+a3*b3]
// _mm_add_pi32/_m_paddd: 2*32bit add
//temp = _m_pmaddwd(*(__m128i*)lpKernel, *(__m128i*)lpInnerPixels);
//mlpkernel = _mm_cvtsi32_si128(lpKernel);
mlpkernel = _mm_cvtsi64_si128(*(__int64*)lpKernel);
mlpInnerPixels = _mm_cvtsi64_si128(*(__int64*)lpInnerPixels);
temp = _mm_madd_epi16(mlpkernel, mlpInnerPixels);
accu = _mm_add_epi32(accu, temp); // each double word has a partial sum
lpKernel += 8; lpInnerPixels += 8;
} // loop over the kernel
// copy hi-dword of mm0 to lo-dword of mm1, then sum mmo+mm1
// and finally store the result into the variable "accu"
accu = _mm_add_epi32(accu, _mm_srl_epi64(accu, shifter)); // combine results from upper and lower double words
sum = _mm_cvtsi128_si32(accu); // move mmx result to sum
// SCALAR
for (loopcount = remainingcount; loopcount != 0; loopcount--) // for each kernel item that couldn't be processed as a vector
{
//sum += (uint)(*(kernel + j) * *(arrayIn + h * arrayLen1 + i - tailLength + j));
sum += (uint)((*(ushort*)lpKernel) * *(ushort*)(lpInnerPixels));
//points++;
lpKernel += 2; lpInnerPixels += 2;
} // loop over the kernel
//*(interArray + h * arrayLen1 + i) = (ushort)(sum / *(norm_r + points - 1));
*(ushort*)lpInterRowInside = (ushort)(sum/norm);
lpInterRowInside += 2; // move to next column sizeof(ushort)
} // for each column
lpRow += rowstride; // move to next row ( h * arrayLen1 )
lpInterRow += rowstride;
} // for each row
//_mm_empty();
return 0;
}