我在把MMX内部函数（intrinsics）转换到x64（SSE）时哪里做错了？标签：函数、错误、MMX、SSE

2023-09-08 10:37:17 作者:不如一刀致命来得痛快

据我所知，把32位MMX内部函数代码转换到x64时，不再允许使用__m64类型。因此，我在把这段代码升级到SSE时遇到了很大的麻烦。有人在另一个Stack Overflow帖子里建议我把代码贴出来。或许这个练习也能帮到其他人。

我注释掉了_mm_empty，认为这样做是正确的。对于其他所有操作，我都在emmintrin.h中找到了对应的__m128i函数，但结果仍然有问题。

原来的32位函数代码：

  DWORD CSumInsideHorizo​​ntalTask​​ :: InternalDoWork()
{
    ////////////////////////////////////////////////// //////////
    //获取本地变量重新presenting从原来的调用参数
    USHORT * arrayIn = m_taskdata.arrayIn;
    USHORT arrayLen0 = m_taskdata.arrayLen0;
    USHORT arrayLen1 = m_taskdata.arrayLen1;
    USHORT *内核= m_taskdata.kernel;
    USHORT kernelLen = m_taskdata.kernelLen;
    uint32_t的* norm_r = m_taskdata.norm_r;
    USHORT * outputArray = m_taskdata.outputArray;

    USHORT * interArray = m_taskdata.interArray;
    ////////////////////////////////////////////////// //////////

    USHORT tailLength =(USHORT)((kernelLen  -  1)/ 2);

    _ASSERTE(interArray);

    // USHORT *船头= NULL; //当前行
    // USHORT * pInterRow = NULL; //在interarray当前行

    INT_PTR lpRow =(INT_PTR)arrayIn; //整数指针算术
    INT_PTR lpInterRow =(INT_PTR)interArray; //整数指针算术
    INT_PTR rowstride = sizeof的(USHORT)* arrayLen1;
    INT_PTR lpKernel;

    //调整为非零启动
    lpRow + = m_nRowStart * rowstride;
    lpInterRow + = m_nRowStart * rowstride;

    //要处理的只有那些(边)的像素所需要的肠子循环条件
    const int的knLeftEdgeMax = kernelLen  -  1  -  tailLength; //从0到左侧边缘的结束
    const int的knRightEdgeStart = arrayLen1  -  kernelLen + 1 + tailLength;
    INT_PTR lpInterRowInside; //使用这个工作的边缘内

    INT H,I;
    UINT相加,分;
    uint32_t的规范= norm_r [kernelLen-1]; //总是处理完整的内核
    INT_PTR lpInnerPixels; //使用这个简化的数学指针在内核中环
    INT_PTR cbLeftEdgeStride = 2 * knLeftEdgeMax;

    //使用这个为MMX优化
    INT fourcount = kernelLen / 4;
    INT remainingcount = kernelLen%4;
    INT mmxcount = 4 * fourcount; //这是其中剩余的处理
    INT loopcount = 0; //使用快速循环测试

    _mm_empty();
    __m64 ACCU,温度;
    __m64移位= _m_from_int(32);

    对于(H = m_nRowStart; H< m_nRowEnd; H ++)//为每行
    {
        //跳过左边缘
        lpInterRowInside = lpInterRow + cbLeftEdgeStride;

        对于(i = knLeftEdgeMax; I< knRightEdgeStart;我++)//为每个边缘内
        {
            总和= 0;
            分= 0;
            lpKernel =(INT_PTR)内核;

            lpInnerPixels = lpRow +((ⅰ -  tailLength)其中;&小于1); //这是我们开始的行

            // MMX优化
            ACCU = _mm_setzero_si64(); //零累加器

            //矢量处理
            为(loopcount = fourcount; loopcount = 0;!loopcount--)// //为可以被处理为一个向量中的每个内核项
            {
                //总和+ =(UINT)(*(内核+ J)*(arrayIn + H * arrayLen1 + I  -  tailLength + J));

                // _m_pmaddwd:4 * 16位乘加,得到的两个32位= A0 * B0 + A1 * B1; A2 * B2 + A3 * B3]
                // _mm_add_pi32 / _m_paddd:2 * 32位插件
                临时= _m_pmaddwd(*(__ M64 *)lpKernel,*(__ M64 *)lpInnerPixels);

                ACCU = _mm_add_pi32(ACCU,温度); //每个双字都有一个部分和

                lpKernel + = 8; lpInnerPixels + = 8;

            } //循环内核

            //复制HI-DWORD MM0到LO-DWORD的MM1,再总结MMO + MM1
            //最后结果存储在变量ACCU
            ACCU = _mm_add_pi32(ACCU,_mm_srl_si64(ACCU,移位)); //组合来自上层的结果,下双字

            总和= _m_to_int(ACCU); //将MMX结果总结

            //标量
            对于(loopcount = remainingcount; loopcount = 0;!loopcount--)//为无法被处理成一个向量中每个内核项目
            {
                //总和+ =(UINT)(*(内核+ J)*(arrayIn + H * arrayLen1 + I  -  tailLength + J));
                总和+ =(UINT)((*(USHORT *)lpKernel)*(USHORT *)(lpInnerPixels));
                //点​​++;
                lpKernel + = 2; lpInnerPixels + = 2;
            } //循环内核


            // *(interArray + H * arrayLen1 + I)=(USHORT)(总和/ *(norm_r +点 -  1));

            *(USHORT *)lpInterRowInside =(USHORT)(总和/标准);
            lpInterRowInside + = 2; //移动到下一列的sizeof(USHORT)
        } //为每列


        lpRow + = rowstride; //移动到下一行(H * arrayLen1)
        lpInterRow + = rowstride;


    } //对每行

    _mm_empty();

    返回0;

}
 

64位尝试:

  DWORD CSumInsideHorizo​​ntalTask​​ :: InternalDoWork()
{
    ////////////////////////////////////////////////// //////////
    //获取本地变量重新presenting从原来的调用参数
    USHORT * arrayIn = m_taskdata.arrayIn;
    USHORT arrayLen0 = m_taskdata.arrayLen0;
    USHORT arrayLen1 = m_taskdata.arrayLen1;
    USHORT *内核= m_taskdata.kernel;
    USHORT kernelLen = m_taskdata.kernelLen;
    uint32_t的* norm_r = m_taskdata.norm_r;
    USHORT * outputArray = m_taskdata.outputArray;

    USHORT * interArray = m_taskdata.interArray;
    ////////////////////////////////////////////////// //////////

    USHORT tailLength =(USHORT)((kernelLen  -  1)/ 2);

    _ASSERTE(interArray);

    // USHORT *船头= NULL; //当前行
    // USHORT * pInterRow = NULL; //在interarray当前行

    INT_PTR lpRow =(INT_PTR)arrayIn; //整数指针算术
    INT_PTR lpInterRow =(INT_PTR)interArray; //整数指针算术
    INT_PTR rowstride = sizeof的(USHORT)* arrayLen1;
    INT_PTR lpKernel;

    //调整为非零启动
    lpRow + = m_nRowStart * rowstride;
    lpInterRow + = m_nRowStart * rowstride;


    //要处理的只有那些(边)的像素所需要的肠子循环条件
    const int的knLeftEdgeMax = kernelLen  -  1  -  tailLength; //从0到左侧边缘的结束
    const int的knRightEdgeStart = arrayLen1  -  kernelLen + 1 + tailLength;
    INT_PTR lpInterRowInside; //使用这个工作的边缘内

    INT H,I;
    UINT相加,分;
    uint32_t的规范= norm_r [kernelLen-1]; //总是处理完整的内核
    INT_PTR lpInnerPixels; //使用这个简化的数学指针在内核中环
    INT_PTR cbLeftEdgeStride = 2 * knLeftEdgeMax;

    //使用这个为MMX优化
    INT fourcount = kernelLen / 4;
    INT remainingcount = kernelLen%4;
    INT mmxcount = 4 * fourcount; //这是其中剩余的处理
    INT loopcount = 0; //使用快速循环测试

    // _ mm_empty();
    __m128i ACCU,温度;
    __m128i移位= _mm_cvtsi32_si128(32);

    对于(H = m_nRowStart; H< m_nRowEnd; H ++)//为每行
    {
        //跳过左边缘
        lpInterRowInside = lpInterRow + cbLeftEdgeStride;

        对于(i = knLeftEdgeMax; I< knRightEdgeStart;我++)//为每个边缘内
        {
            总和= 0;
            分= 0;
            lpKernel =(INT_PTR)内核;

            lpInnerPixels = lpRow +((ⅰ -  tailLength)其中;&小于1); //这是我们开始的行

            // MMX优化
            ACCU = _mm_setzero_si128(); //零累加器

            //矢量处理
            为(loopcount = fourcount; loopcount = 0;!loopcount--)// //为可以被处理为一个向量中的每个内核项
            {
                //总和+ =(UINT)(*(内核+ J)*(arrayIn + H * arrayLen1 + I  -  tailLength + J));

                // _m_pmaddwd:4 * 16位乘加,得到的两个32位= A0 * B0 + A1 * B1; A2 * B2 + A3 * B3]
                // _mm_add_pi32 / _m_paddd:2 * 32位插件
                //临时= _m_pmaddwd(*(__ m128i *)lpKernel,*(__ m128i *)lpInnerPixels);
                临时= _mm_madd_epi16(*(__ m128i *)lpKernel,*(__ m128i *)lpInnerPixels);

                ACCU = _mm_add_epi32(ACCU,温度); //每个双字都有一个部分和

                lpKernel + = 8; lpInnerPixels + = 8;

            } //循环内核

            //复制HI-DWORD MM0到LO-DWORD的MM1,再总结MMO + MM1
            //最后结果存储在变量ACCU
            ACCU = _mm_add_epi32(ACCU,_mm_sll_epi64(ACCU,移位)); //组合来自上层的结果,下双字

            总和= _mm_cvtsi128_si32(ACCU); //将MMX结果总结

            //标量
            对于(loopcount = remainingcount; loopcount = 0;!loopcount--)//为无法被处理成一个向量中每个内核项目
            {
                //总和+ =(UINT)(*(内核+ J)*(arrayIn + H * arrayLen1 + I  -  tailLength + J));
                总和+ =(UINT)((*(USHORT *)lpKernel)*(USHORT *)(lpInnerPixels));
                //点​​++;
                lpKernel + = 2; lpInnerPixels + = 2;
            } //循环内核


            // *(interArray + H * arrayLen1 + I)=(USHORT)(总和/ *(norm_r +点 -  1));

            *(USHORT *)lpInterRowInside =(USHORT)(总和/标准);
            lpInterRowInside + = 2; //移动到下一列的sizeof(USHORT)
        } //为每列


        lpRow + = rowstride; //移动到下一行(H * arrayLen1)
        lpInterRow + = rowstride;


    } //对每行

    // _ mm_empty();

    返回0;

}
 

解决方案

上面评论中提到的所有问题都已修复。下面是最终可以正常工作的x64 SSE卷积代码：

  DWORD CSumInsideHorizo​​ntalTask​​ :: InternalDoWork()
{
////////////////////////////////////////////////// //////////
//获取本地变量重新presenting从原来的调用参数
USHORT * arrayIn = m_taskdata.arrayIn;
USHORT arrayLen0 = m_taskdata.arrayLen0;
USHORT arrayLen1 = m_taskdata.arrayLen1;
USHORT *内核= m_taskdata.kernel;
USHORT kernelLen = m_taskdata.kernelLen;
uint32_t的* norm_r = m_taskdata.norm_r;
USHORT * outputArray = m_taskdata.outputArray;

USHORT * interArray = m_taskdata.interArray;
////////////////////////////////////////////////// //////////

USHORT tailLength =(USHORT)((kernelLen  -  1)/ 2);

_ASSERTE(interArray);

// USHORT *船头= NULL; //当前行
// USHORT * pInterRow = NULL; //在interarray当前行

INT_PTR lpRow =(INT_PTR)arrayIn; //整数指针算术
INT_PTR lpInterRow =(INT_PTR)interArray; //整数指针算术
INT_PTR rowstride = sizeof的(USHORT)* arrayLen1;
INT_PTR lpKernel;

//调整为非零启动
lpRow + = m_nRowStart * rowstride;
lpInterRow + = m_nRowStart * rowstride;


//要处理的只有那些(边)的像素所需要的肠子循环条件
const int的knLeftEdgeMax = kernelLen  -  1  -  tailLength; //从0到左侧边缘的结束
const int的knRightEdgeStart = arrayLen1  -  kernelLen + 1 + tailLength;
INT_PTR lpInterRowInside; //使用这个工作的边缘内

INT H,I;
UINT相加,分;
uint32_t的规范= norm_r [kernelLen-1]; //总是处理完整的内核
INT_PTR lpInnerPixels; //使用这个简化的数学指针在内核中环
INT_PTR cbLeftEdgeStride = 2 * knLeftEdgeMax;

//使用这个为MMX优化
INT fourcount = kernelLen / 4;
INT remainingcount = kernelLen%4;
INT mmxcount = 4 * fourcount; //这是其中剩余的处理
INT loopcount = 0; //使用快速循环测试

// _ mm_empty();
__m128i ACCU,温度,mlpkernel,mlpInnerPixels;
__m128i移位= _mm_cvtsi32_si128(32);

对于(H = m_nRowStart; H< m_nRowEnd; H ++)//为每行
{
    //跳过左边缘
    lpInterRowInside = lpInterRow + cbLeftEdgeStride;

    对于(i = knLeftEdgeMax; I< knRightEdgeStart;我++)//为每个边缘内
    {
        总和= 0;
        分= 0;
        lpKernel =(INT_PTR)内核;

        lpInnerPixels = lpRow +((ⅰ -  tailLength)其中;&小于1); //这是我们开始的行

        // MMX优化
        ACCU = _mm_setzero_si128(); //零累加器

        //矢量处理
        为(loopcount = fourcount; loopcount = 0;!loopcount--)// //为可以被处理为一个向量中的每个内核项
        {
            //总和+ =(UINT)(*(内核+ J)*(arrayIn + H * arrayLen1 + I  -  tailLength + J));

            // _m_pmaddwd:4 * 16位乘加,得到的两个32位= A0 * B0 + A1 * B1; A2 * B2 + A3 * B3]
            // _mm_add_pi32 / _m_paddd:2 * 32位插件
            //临时= _m_pmaddwd(*(__ m128i *)lpKernel,*(__ m128i *)lpInnerPixels);
            // mlpkernel = _mm_cvtsi32_si128(lpKernel);
            mlpkernel = _mm_cvtsi64_si128(*(__的Int64 *)lpKernel);
            mlpInnerPixels = _mm_cvtsi64_si128(*(__的Int64 *)lpInnerPixels);
            临时= _mm_madd_epi16(mlpkernel,mlpInnerPixels);

            ACCU = _mm_add_epi32(ACCU,温度); //每个双字都有一个部分和

            lpKernel + = 8; lpInnerPixels + = 8;

        } //循环内核

        //复制HI-DWORD MM0到LO-DWORD的MM1,再总结MMO + MM1
        //最后结果存储在变量ACCU
        ACCU = _mm_add_epi32(ACCU,_mm_srl_epi64(ACCU,移位)); //组合来自上层的结果,下双字

        总和= _mm_cvtsi128_si32(ACCU); //将MMX结果总结

        //标量
        对于(loopcount = remainingcount; loopcount = 0;!loopcount--)//为无法被处理成一个向量中每个内核项目
        {
            //总和+ =(UINT)(*(内核+ J)*(arrayIn + H * arrayLen1 + I  -  tailLength + J));
            总和+ =(UINT)((*(USHORT *)lpKernel)*(USHORT *)(lpInnerPixels));
            //点​​++;
            lpKernel + = 2; lpInnerPixels + = 2;
        } //循环内核


        // *(interArray + H * arrayLen1 + I)=(USHORT)(总和/ *(norm_r +点 -  1));

        *(USHORT *)lpInterRowInside =(USHORT)(总和/标准);
        lpInterRowInside + = 2; //移动到下一列的sizeof(USHORT)
    } //为每列


    lpRow + = rowstride; //移动到下一行(H * arrayLen1)
    lpInterRow + = rowstride;


} //对每行

// _ mm_empty();

返回0;

}
 

I understand that when converting 32-bit MMX intrinsics code to x64, the __m64 type is no longer allowed, so I was having great trouble upgrading this piece of code to SSE. I was told in another Stack Overflow post to post my code. Perhaps this exercise will help others as well.

I commented out '_mm_empty', thinking that was the right thing to do. I found equivalent functions in emmintrin.h for all the other __m128i operations, but something is still wrong.

original 32-bit function code:

// Horizontal 1-D convolution of rows [m_nRowStart, m_nRowEnd) of arrayIn with
// `kernel`, restricted to the interior columns [knLeftEdgeMax, knRightEdgeStart)
// where the whole kernel fits inside the row. Each result is the kernel-weighted
// sum divided by norm_r[kernelLen-1] and is stored in interArray at the same
// row/column. This variant uses 64-bit MMX intrinsics (__m64), four 16-bit
// taps per vector step. Always returns 0.
DWORD CSumInsideHorizontalTask::InternalDoWork()
{
    ////////////////////////////////////////////////////////////
    // Local copies of the task parameters.
    // NOTE(review): arrayLen0 and outputArray are copied but never read below.
    ushort* arrayIn     = m_taskdata.arrayIn;
    ushort arrayLen0    = m_taskdata.arrayLen0;
    ushort arrayLen1    = m_taskdata.arrayLen1;
    ushort* kernel      = m_taskdata.kernel;
    ushort kernelLen    = m_taskdata.kernelLen;
    uint32_t* norm_r        = m_taskdata.norm_r;
    ushort* outputArray = m_taskdata.outputArray;

    ushort* interArray = m_taskdata.interArray;
    ////////////////////////////////////////////////////////////

    // Half-width of the kernel (number of taps on one side of the centre).
    ushort tailLength = (ushort)((kernelLen - 1) / 2);

    _ASSERTE(interArray);

    // Addresses are kept as integers so all offsets below are in BYTES,
    // not in ushort elements.
    INT_PTR lpRow = (INT_PTR)arrayIn; // byte address of the current input row
    INT_PTR lpInterRow = (INT_PTR)interArray; // byte address of the current output row
    INT_PTR rowstride = sizeof(ushort)*arrayLen1; // bytes per row
    INT_PTR lpKernel;

    // Skip ahead to the first row this task is responsible for.
    lpRow += m_nRowStart*rowstride;
    lpInterRow += m_nRowStart*rowstride;

    // Only the interior columns are processed here; the loops below never
    // touch columns left of knLeftEdgeMax or at/after knRightEdgeStart.
    const int knLeftEdgeMax = kernelLen - 1 - tailLength; // first interior column
    const int knRightEdgeStart = arrayLen1 - kernelLen + 1 + tailLength; // one past the last interior column
    INT_PTR lpInterRowInside; // byte address of the current output pixel

    int h, i;
    uint sum, points; // NOTE(review): points is only ever set to 0, never read
    uint32_t norm = norm_r[kernelLen-1]; // divisor for a full-length kernel
    INT_PTR lpInnerPixels; // byte address of the input pixel under the current kernel tap
    INT_PTR cbLeftEdgeStride = 2*knLeftEdgeMax; // byte offset of the first interior column (2 bytes per ushort)

    // Split the kernel into groups of four taps (one __m64 each) plus a remainder.
    int fourcount = kernelLen/4; // number of 4-tap vector steps
    int remainingcount = kernelLen%4; // taps left for the scalar loop
    int mmxcount = 4*fourcount; // taps covered by the vector loop (not read below)
    int loopcount = 0; // down-counter shared by the vector and scalar loops

    // Reset the MMX state before using the MMX registers.
    _mm_empty();
    __m64 accu, temp;
    __m64 shifter = _m_from_int(32); // shift count used to bring the high dword down

    for (h=m_nRowStart; h < m_nRowEnd; h++) // for each row
    {
        // Start writing at the first interior column of this row.
        lpInterRowInside = lpInterRow + cbLeftEdgeStride;

        for (i = knLeftEdgeMax; i < knRightEdgeStart; i++) // for each interior column
        {
            sum = 0;
            points = 0;
            lpKernel = (INT_PTR)kernel;

            // First input pixel covered by the kernel; <<1 converts the
            // element index to a byte offset.
            lpInnerPixels = lpRow + ((i - tailLength)<<1);

            accu = _mm_setzero_si64(); // zero the two 32-bit partial sums

            // VECTOR part: four taps per iteration.
            // Scalar equivalent of the whole kernel loop:
            //   sum += (uint)(*(kernel + j) * *(arrayIn + h * arrayLen1 + i - tailLength + j));
            for (loopcount = fourcount; loopcount != 0; loopcount--)
            {
                // _m_pmaddwd: 4 x 16-bit multiply-add giving two 32-bit lanes
                //   [a0*b0 + a1*b1 ; a2*b2 + a3*b3]
                // NOTE(review): pmaddwd treats its 16-bit inputs as SIGNED, while
                // the data here is ushort; values above 32767 would be read as
                // negative. Confirm the expected input range.
                temp = _m_pmaddwd(*(__m64*)lpKernel, *(__m64*)lpInnerPixels);

                accu = _mm_add_pi32(accu, temp); // accumulate both 32-bit lanes

                lpKernel += 8; lpInnerPixels += 8; // advance 4 ushorts = 8 bytes

            } // vector loop over the kernel

            // Horizontal add: shift the high dword down by 32 bits and add it
            // to the low dword, so the low dword of accu holds the total.
            accu = _mm_add_pi32(accu, _mm_srl_si64(accu, shifter));

            sum = _m_to_int(accu); // take the low 32 bits as the running sum

            // SCALAR part: the kernelLen % 4 taps the vector loop could not cover.
            for (loopcount = remainingcount; loopcount != 0; loopcount--)
            {
                sum += (uint)((*(ushort*)lpKernel) * *(ushort*)(lpInnerPixels));
                lpKernel += 2; lpInnerPixels += 2; // advance one ushort
            } // scalar loop over the kernel


            // Normalise and store; equivalent to
            //   *(interArray + h * arrayLen1 + i) = (ushort)(sum / norm);
            *(ushort*)lpInterRowInside = (ushort)(sum/norm);
            lpInterRowInside += 2; // next output column (sizeof(ushort) bytes)
        } // for each column


        lpRow += rowstride; // next input row
        lpInterRow += rowstride; // next output row


    } // for each row

    // Reset the MMX state again once the MMX work is finished.
    _mm_empty();

    return 0;

}

64 Bit Attempt:

// First attempt at porting the MMX horizontal convolution to SSE2 (__m128i).
// It processes rows [m_nRowStart, m_nRowEnd) of arrayIn over the interior
// columns [knLeftEdgeMax, knRightEdgeStart) and writes sum/norm to interArray.
// Always returns 0.
// NOTE(review): this version is reported as not working; the suspicious spots
// are marked with NOTE(review) comments in the vector loop and the reduction.
DWORD CSumInsideHorizontalTask::InternalDoWork()
{
    ////////////////////////////////////////////////////////////
    // Local copies of the task parameters.
    // NOTE(review): arrayLen0 and outputArray are copied but never read below.
    ushort* arrayIn     = m_taskdata.arrayIn;
    ushort arrayLen0    = m_taskdata.arrayLen0;
    ushort arrayLen1    = m_taskdata.arrayLen1;
    ushort* kernel      = m_taskdata.kernel;
    ushort kernelLen    = m_taskdata.kernelLen;
    uint32_t* norm_r        = m_taskdata.norm_r;
    ushort* outputArray = m_taskdata.outputArray;

    ushort* interArray = m_taskdata.interArray;
    ////////////////////////////////////////////////////////////

    // Half-width of the kernel (number of taps on one side of the centre).
    ushort tailLength = (ushort)((kernelLen - 1) / 2);

    _ASSERTE(interArray);

    // Addresses are kept as integers so all offsets below are in BYTES,
    // not in ushort elements.
    INT_PTR lpRow = (INT_PTR)arrayIn; // byte address of the current input row
    INT_PTR lpInterRow = (INT_PTR)interArray; // byte address of the current output row
    INT_PTR rowstride = sizeof(ushort)*arrayLen1; // bytes per row
    INT_PTR lpKernel;

    // Skip ahead to the first row this task is responsible for.
    lpRow += m_nRowStart*rowstride;
    lpInterRow += m_nRowStart*rowstride;


    // Only the interior columns are processed here.
    const int knLeftEdgeMax = kernelLen - 1 - tailLength; // first interior column
    const int knRightEdgeStart = arrayLen1 - kernelLen + 1 + tailLength; // one past the last interior column
    INT_PTR lpInterRowInside; // byte address of the current output pixel

    int h, i;
    uint sum, points; // NOTE(review): points is only ever set to 0, never read
    uint32_t norm = norm_r[kernelLen-1]; // divisor for a full-length kernel
    INT_PTR lpInnerPixels; // byte address of the input pixel under the current kernel tap
    INT_PTR cbLeftEdgeStride = 2*knLeftEdgeMax; // byte offset of the first interior column

    // The kernel is still split into groups of FOUR taps plus a remainder,
    // exactly as in the 64-bit MMX layout.
    int fourcount = kernelLen/4; // number of vector steps
    int remainingcount = kernelLen%4; // taps left for the scalar loop
    int mmxcount = 4*fourcount; // taps covered by the vector loop (not read below)
    int loopcount = 0; // down-counter shared by the vector and scalar loops

    // No _mm_empty() here: the MMX state reset was commented out for this port.
    __m128i accu, temp;
    __m128i shifter = _mm_cvtsi32_si128(32); // shift count of 32 bits

    for (h=m_nRowStart; h < m_nRowEnd; h++) // for each row
    {
        // Start writing at the first interior column of this row.
        lpInterRowInside = lpInterRow + cbLeftEdgeStride;

        for (i = knLeftEdgeMax; i < knRightEdgeStart; i++) // for each interior column
        {
            sum = 0;
            points = 0;
            lpKernel = (INT_PTR)kernel;

            // First input pixel covered by the kernel; <<1 converts the
            // element index to a byte offset.
            lpInnerPixels = lpRow + ((i - tailLength)<<1);

            accu = _mm_setzero_si128(); // zero the four 32-bit partial sums

            // VECTOR part.
            // Scalar equivalent of the whole kernel loop:
            //   sum += (uint)(*(kernel + j) * *(arrayIn + h * arrayLen1 + i - tailLength + j));
            for (loopcount = fourcount; loopcount != 0; loopcount--)
            {
                // NOTE(review): *(__m128i*) reads 16 bytes (eight ushorts), but
                // the loop count is kernelLen/4 and the pointers advance by only
                // 8 bytes, so each step multiplies four taps too many and the
                // reads overlap / run past the kernel.
                // NOTE(review): dereferencing a __m128i* normally requires a
                // 16-byte aligned address; lpInnerPixels moves in 2-byte steps,
                // so this presumably faults on most columns. Confirm.
                temp = _mm_madd_epi16(*(__m128i*)lpKernel, *(__m128i*)lpInnerPixels);

                accu = _mm_add_epi32(accu, temp); // accumulate the 32-bit lanes

                lpKernel += 8; lpInnerPixels += 8; // advance 4 ushorts = 8 bytes

            } // vector loop over the kernel

            // Intended horizontal add of the partial sums.
            // NOTE(review): this shifts LEFT (_mm_sll_epi64), which moves the low
            // dword up; the low dword read by _mm_cvtsi128_si32 below therefore
            // never receives the upper partial sum. The MMX code this was ported
            // from shifted right.
            accu = _mm_add_epi32(accu, _mm_sll_epi64(accu, shifter));

            sum = _mm_cvtsi128_si32(accu); // take the low 32 bits as the running sum

            // SCALAR part: the kernelLen % 4 taps the vector loop did not cover.
            for (loopcount = remainingcount; loopcount != 0; loopcount--)
            {
                sum += (uint)((*(ushort*)lpKernel) * *(ushort*)(lpInnerPixels));
                lpKernel += 2; lpInnerPixels += 2; // advance one ushort
            } // scalar loop over the kernel


            // Normalise and store; equivalent to
            //   *(interArray + h * arrayLen1 + i) = (ushort)(sum / norm);
            *(ushort*)lpInterRowInside = (ushort)(sum/norm);
            lpInterRowInside += 2; // next output column (sizeof(ushort) bytes)
        } // for each column


        lpRow += rowstride; // next input row
        lpInterRow += rowstride; // next output row


    } // for each row

    return 0;

}

解决方案

With all the issues mentioned in the comments above fixed, here is the final working x64 SSE convolution code:

// Horizontal 1-D convolution (x64 / SSE2 version).
//
// For every row in [m_nRowStart, m_nRowEnd) and every interior column in
// [knLeftEdgeMax, knRightEdgeStart) -- the columns where the whole kernel fits
// inside the row -- computes
//     interArray[h * arrayLen1 + i] =
//         (ushort)( sum_j kernel[j] * arrayIn[h * arrayLen1 + i - tailLength + j]
//                   / norm_r[kernelLen - 1] )
// Edge columns are not touched by this function.
//
// Returns 0 in all cases. An empty kernel or a zero normalisation factor is
// treated as "nothing to do" (asserted in debug builds) instead of reading
// norm_r[-1] or dividing by zero.
DWORD CSumInsideHorizontalTask::InternalDoWork()
{
    // Local copies of the task parameters that are actually used.
    const ushort*   arrayIn    = m_taskdata.arrayIn;
    const ushort    arrayLen1  = m_taskdata.arrayLen1;
    const ushort*   kernel     = m_taskdata.kernel;
    const ushort    kernelLen  = m_taskdata.kernelLen;
    const uint32_t* norm_r     = m_taskdata.norm_r;
    ushort*         interArray = m_taskdata.interArray;

    _ASSERTE(interArray);
    _ASSERTE(kernelLen > 0);

    // kernelLen == 0 would index norm_r[-1] and start left of the row.
    if (kernelLen == 0)
        return 0;

    const uint32_t norm = norm_r[kernelLen - 1]; // divisor for a full-length kernel
    _ASSERTE(norm != 0);

    // Avoid an integer divide-by-zero in the store below.
    if (norm == 0)
        return 0;

    // Half-width of the kernel (taps on one side of the centre).
    const ushort tailLength = (ushort)((kernelLen - 1) / 2);

    // Interior column range: the kernel fits entirely inside the row here.
    const int knLeftEdgeMax    = kernelLen - 1 - tailLength;             // first interior column
    const int knRightEdgeStart = arrayLen1 - kernelLen + 1 + tailLength; // one past the last

    // Four 16-bit taps are handled per vector step; the rest go through the
    // scalar loop.
    const int fourcount      = kernelLen / 4;
    const int remainingcount = kernelLen % 4;

    // First row handled by this task. Typed pointers are used, so offsets are
    // in ushort elements rather than bytes.
    const ushort* pRow      = arrayIn    + (INT_PTR)m_nRowStart * arrayLen1;
    ushort*       pInterRow = interArray + (INT_PTR)m_nRowStart * arrayLen1;

    for (int h = m_nRowStart; h < m_nRowEnd; h++) // for each row
    {
        ushort* pOut = pInterRow + knLeftEdgeMax; // skip the left edge

        for (int i = knLeftEdgeMax; i < knRightEdgeStart; i++) // for each interior column
        {
            const ushort* pKernel = kernel;
            const ushort* pPixels = pRow + (i - tailLength); // first pixel under the kernel

            __m128i accu = _mm_setzero_si128();

            // VECTOR part: 4 taps per iteration.
            for (int n = fourcount; n != 0; n--)
            {
                // _mm_loadl_epi64 reads exactly 8 bytes (four ushorts) into the
                // low half of the register and zeroes the high half; it has no
                // alignment requirement, which matters because pPixels moves in
                // 2-byte steps.
                const __m128i k4 = _mm_loadl_epi64((const __m128i*)pKernel);
                const __m128i p4 = _mm_loadl_epi64((const __m128i*)pPixels);

                // Low two 32-bit lanes become [k0*p0 + k1*p1 ; k2*p2 + k3*p3];
                // the high lanes stay zero.
                // NOTE(review): _mm_madd_epi16 treats its inputs as SIGNED 16-bit
                // values, while the scalar loop below treats them as unsigned.
                // Results only agree if kernel and pixel values stay <= 32767;
                // confirm the data range.
                accu = _mm_add_epi32(accu, _mm_madd_epi16(k4, p4));

                pKernel += 4;
                pPixels += 4;
            }

            // Horizontal add: bring lane 1 down onto lane 0 and add, so that
            // lane 0 holds the total of both partial sums.
            accu = _mm_add_epi32(accu, _mm_srli_epi64(accu, 32));

            uint sum = (uint)_mm_cvtsi128_si32(accu);

            // SCALAR part: the kernelLen % 4 taps the vector loop did not cover.
            // Multiply as unsigned: ushort operands would otherwise be promoted
            // to signed int, where 65535 * 65535 overflows.
            for (int n = remainingcount; n != 0; n--)
            {
                sum += (uint)*pKernel * (uint)*pPixels;
                ++pKernel;
                ++pPixels;
            }

            *pOut++ = (ushort)(sum / norm); // normalise and store
        } // for each column

        pRow      += arrayLen1; // next input row
        pInterRow += arrayLen1; // next output row
    } // for each row

    // No _mm_empty() needed: only XMM registers are used, not the MMX state.
    return 0;
}

 
精彩推荐
图片推荐