我做了错误的转换我MMX内部函数到x64（SSE）？函数、错误、我做了、MMX

2023-09-08 10:37:17 作者：不如一刀致命来得痛快

据我所知转换MMX 32位MMX内在不再允许__m64。所以，我有很大的麻烦升级这块code上交所。有人告诉我，在另一个堆栈溢出后发表我的code。或许这练习将帮助其他人。

我注释掉_mm_empty认为是正确的事情。我发现，喜欢在emmintrin.h所有其他__m128i opertions功能，但事情仍然是错误的。

原来的32位功能code：

  DWORD CSumInsideHorizontalTask :: InternalDoWork（）
{
    ////////////////////////////////////////////////// //////////
    //获取本地变量重新presenting从原来的调用参数
    USHORT * arrayIn = m_taskdata.arrayIn;
    USHORT arrayLen0 = m_taskdata.arrayLen0;
    USHORT arrayLen1 = m_taskdata.arrayLen1;
    USHORT *内核= m_taskdata.kernel;
    USHORT kernelLen = m_taskdata.kernelLen;
    uint32_t的* norm_r = m_taskdata.norm_r;
    USHORT * outputArray = m_taskdata.outputArray;

    USHORT * interArray = m_taskdata.interArray;
    ////////////////////////////////////////////////// //////////

    USHORT tailLength =（USHORT）（（kernelLen  -  1）/ 2）;

    _ASSERTE（interArray）;

    // USHORT *船头= NULL; //当前行
    // USHORT * pInterRow = NULL; //在interarray当前行

    INT_PTR lpRow =（INT_PTR）arrayIn; //整数指针算术
    INT_PTR lpInterRow =（INT_PTR）interArray; //整数指针算术
    INT_PTR rowstride = sizeof的（USHORT）* arrayLen1;
    INT_PTR lpKernel;

    //调整为非零启动
    lpRow + = m_nRowStart * rowstride;
    lpInterRow + = m_nRowStart * rowstride;

    //要处理的只有那些（边）的像素所需要的肠子循环条件
    const int的knLeftEdgeMax = kernelLen  -  1  -  tailLength; //从0到左侧边缘的结束
    const int的knRightEdgeStart = arrayLen1  -  kernelLen + 1 + tailLength;
    INT_PTR lpInterRowInside; //使用这个工作的边缘内

    INT H，I;
    UINT相加，分;
    uint32_t的规范= norm_r [kernelLen-1]; //总是处理完整的内核
    INT_PTR lpInnerPixels; //使用这个简化的数学指针在内核中环
    INT_PTR cbLeftEdgeStride = 2 * knLeftEdgeMax;

    //使用这个为MMX优化
    INT fourcount = kernelLen / 4;
    INT remainingcount = kernelLen％4;
    INT mmxcount = 4 * fourcount; //这是其中剩余的处理
    INT loopcount = 0; //使用快速循环测试

    _mm_empty（）;
    __m64 ACCU，温度;
    __m64移位= _m_from_int（32）;

    对于（H = m_nRowStart; H＆LT; m_nRowEnd; H ++）//为每行
    {
        //跳过左边缘
        lpInterRowInside = lpInterRow + cbLeftEdgeStride;

        对于（i = knLeftEdgeMax; I＆LT; knRightEdgeStart;我++）//为每个边缘内
        {
            总和= 0;
            分= 0;
            lpKernel =（INT_PTR）内核;

            lpInnerPixels = lpRow +（（ⅰ -  tailLength）其中;＆小于1）; //这是我们开始的行

            // MMX优化
            ACCU = _mm_setzero_si64（）; //零累加器

            //矢量处理
            为（loopcount = fourcount; loopcount = 0;！loopcount--）// //为可以被处理为一个向量中的每个内核项
            {
                //总和+ =（UINT）（*（内核+ J）*（arrayIn + H * arrayLen1 + I  -  tailLength + J））;

                // _m_pmaddwd：4 * 16位乘加，得到的两个32位= A0 * B0 + A1 * B1; A2 * B2 + A3 * B3]
                // _mm_add_pi32 / _m_paddd：2 * 32位插件
                临时= _m_pmaddwd（*（__ M64 *）lpKernel，*（__ M64 *）lpInnerPixels）;

                ACCU = _mm_add_pi32（ACCU，温度）; //每个双字都有一个部分和

                lpKernel + = 8; lpInnerPixels + = 8;

            } //循环内核

            //复制HI-DWORD MM0到LO-DWORD的MM1，再总结MMO + MM1
            //最后结果存储在变量ACCU
            ACCU = _mm_add_pi32（ACCU，_mm_srl_si64（ACCU，移位））; //组合来自上层的结果，下双字

            总和= _m_to_int（ACCU）; //将MMX结果总结

            //标量
            对于（loopcount = remainingcount; loopcount = 0;！loopcount--）//为无法被处理成一个向量中每个内核项目
            {
                //总和+ =（UINT）（*（内核+ J）*（arrayIn + H * arrayLen1 + I  -  tailLength + J））;
                总和+ =（UINT）（（*（USHORT *）lpKernel）*（USHORT *）（lpInnerPixels））;
                //点++;
                lpKernel + = 2; lpInnerPixels + = 2;
            } //循环内核


            // *（interArray + H * arrayLen1 + I）=（USHORT）（总和/ *（norm_r +点 -  1））;

            *（USHORT *）lpInterRowInside =（USHORT）（总和/标准）;
            lpInterRowInside + = 2; //移动到下一列的sizeof（USHORT）
        } //为每列


        lpRow + = rowstride; //移动到下一行（H * arrayLen1）
        lpInterRow + = rowstride;


    } //对每行

    _mm_empty（）;

    返回0;

}

64位尝试：

  DWORD CSumInsideHorizontalTask :: InternalDoWork（）
{
    ////////////////////////////////////////////////// //////////
    //获取本地变量重新presenting从原来的调用参数
    USHORT * arrayIn = m_taskdata.arrayIn;
    USHORT arrayLen0 = m_taskdata.arrayLen0;
    USHORT arrayLen1 = m_taskdata.arrayLen1;
    USHORT *内核= m_taskdata.kernel;
    USHORT kernelLen = m_taskdata.kernelLen;
    uint32_t的* norm_r = m_taskdata.norm_r;
    USHORT * outputArray = m_taskdata.outputArray;

    USHORT * interArray = m_taskdata.interArray;
    ////////////////////////////////////////////////// //////////

    USHORT tailLength =（USHORT）（（kernelLen  -  1）/ 2）;

    _ASSERTE（interArray）;

    // USHORT *船头= NULL; //当前行
    // USHORT * pInterRow = NULL; //在interarray当前行

    INT_PTR lpRow =（INT_PTR）arrayIn; //整数指针算术
    INT_PTR lpInterRow =（INT_PTR）interArray; //整数指针算术
    INT_PTR rowstride = sizeof的（USHORT）* arrayLen1;
    INT_PTR lpKernel;

    //调整为非零启动
    lpRow + = m_nRowStart * rowstride;
    lpInterRow + = m_nRowStart * rowstride;


    //要处理的只有那些（边）的像素所需要的肠子循环条件
    const int的knLeftEdgeMax = kernelLen  -  1  -  tailLength; //从0到左侧边缘的结束
    const int的knRightEdgeStart = arrayLen1  -  kernelLen + 1 + tailLength;
    INT_PTR lpInterRowInside; //使用这个工作的边缘内

    INT H，I;
    UINT相加，分;
    uint32_t的规范= norm_r [kernelLen-1]; //总是处理完整的内核
    INT_PTR lpInnerPixels; //使用这个简化的数学指针在内核中环
    INT_PTR cbLeftEdgeStride = 2 * knLeftEdgeMax;

    //使用这个为MMX优化
    INT fourcount = kernelLen / 4;
    INT remainingcount = kernelLen％4;
    INT mmxcount = 4 * fourcount; //这是其中剩余的处理
    INT loopcount = 0; //使用快速循环测试

    // _ mm_empty（）;
    __m128i ACCU，温度;
    __m128i移位= _mm_cvtsi32_si128（32）;

    对于（H = m_nRowStart; H＆LT; m_nRowEnd; H ++）//为每行
    {
        //跳过左边缘
        lpInterRowInside = lpInterRow + cbLeftEdgeStride;

        对于（i = knLeftEdgeMax; I＆LT; knRightEdgeStart;我++）//为每个边缘内
        {
            总和= 0;
            分= 0;
            lpKernel =（INT_PTR）内核;

            lpInnerPixels = lpRow +（（ⅰ -  tailLength）其中;＆小于1）; //这是我们开始的行

            // MMX优化
            ACCU = _mm_setzero_si128（）; //零累加器

            //矢量处理
            为（loopcount = fourcount; loopcount = 0;！loopcount--）// //为可以被处理为一个向量中的每个内核项
            {
                //总和+ =（UINT）（*（内核+ J）*（arrayIn + H * arrayLen1 + I  -  tailLength + J））;

                // _m_pmaddwd：4 * 16位乘加，得到的两个32位= A0 * B0 + A1 * B1; A2 * B2 + A3 * B3]
                // _mm_add_pi32 / _m_paddd：2 * 32位插件
                //临时= _m_pmaddwd（*（__ m128i *）lpKernel，*（__ m128i *）lpInnerPixels）;
                临时= _mm_madd_epi16（*（__ m128i *）lpKernel，*（__ m128i *）lpInnerPixels）;

                ACCU = _mm_add_epi32（ACCU，温度）; //每个双字都有一个部分和

                lpKernel + = 8; lpInnerPixels + = 8;

            } //循环内核

            //复制HI-DWORD MM0到LO-DWORD的MM1，再总结MMO + MM1
            //最后结果存储在变量ACCU
            ACCU = _mm_add_epi32（ACCU，_mm_sll_epi64（ACCU，移位））; //组合来自上层的结果，下双字

            总和= _mm_cvtsi128_si32（ACCU）; //将MMX结果总结

            //标量
            对于（loopcount = remainingcount; loopcount = 0;！loopcount--）//为无法被处理成一个向量中每个内核项目
            {
                //总和+ =（UINT）（*（内核+ J）*（arrayIn + H * arrayLen1 + I  -  tailLength + J））;
                总和+ =（UINT）（（*（USHORT *）lpKernel）*（USHORT *）（lpInnerPixels））;
                //点++;
                lpKernel + = 2; lpInnerPixels + = 2;
            } //循环内核


            // *（interArray + H * arrayLen1 + I）=（USHORT）（总和/ *（norm_r +点 -  1））;

            *（USHORT *）lpInterRowInside =（USHORT）（总和/标准）;
            lpInterRowInside + = 2; //移动到下一列的sizeof（USHORT）
        } //为每列


        lpRow + = rowstride; //移动到下一行（H * arrayLen1）
        lpInterRow + = rowstride;


    } //对每行

    // _ mm_empty（）;

    返回0;

}

解决方案如何将excel格式的电话转换成手机电话簿

通过上面的评论中提到修复了所有的问题。这里是最后的64位工作证卷积code：

  DWORD CSumInsideHorizontalTask :: InternalDoWork（）
{
////////////////////////////////////////////////// //////////
//获取本地变量重新presenting从原来的调用参数
USHORT * arrayIn = m_taskdata.arrayIn;
USHORT arrayLen0 = m_taskdata.arrayLen0;
USHORT arrayLen1 = m_taskdata.arrayLen1;
USHORT *内核= m_taskdata.kernel;
USHORT kernelLen = m_taskdata.kernelLen;
uint32_t的* norm_r = m_taskdata.norm_r;
USHORT * outputArray = m_taskdata.outputArray;

USHORT * interArray = m_taskdata.interArray;
////////////////////////////////////////////////// //////////

USHORT tailLength =（USHORT）（（kernelLen  -  1）/ 2）;

_ASSERTE（interArray）;

// USHORT *船头= NULL; //当前行
// USHORT * pInterRow = NULL; //在interarray当前行

INT_PTR lpRow =（INT_PTR）arrayIn; //整数指针算术
INT_PTR lpInterRow =（INT_PTR）interArray; //整数指针算术
INT_PTR rowstride = sizeof的（USHORT）* arrayLen1;
INT_PTR lpKernel;

//调整为非零启动
lpRow + = m_nRowStart * rowstride;
lpInterRow + = m_nRowStart * rowstride;


//要处理的只有那些（边）的像素所需要的肠子循环条件
const int的knLeftEdgeMax = kernelLen  -  1  -  tailLength; //从0到左侧边缘的结束
const int的knRightEdgeStart = arrayLen1  -  kernelLen + 1 + tailLength;
INT_PTR lpInterRowInside; //使用这个工作的边缘内

INT H，I;
UINT相加，分;
uint32_t的规范= norm_r [kernelLen-1]; //总是处理完整的内核
INT_PTR lpInnerPixels; //使用这个简化的数学指针在内核中环
INT_PTR cbLeftEdgeStride = 2 * knLeftEdgeMax;

//使用这个为MMX优化
INT fourcount = kernelLen / 4;
INT remainingcount = kernelLen％4;
INT mmxcount = 4 * fourcount; //这是其中剩余的处理
INT loopcount = 0; //使用快速循环测试

// _ mm_empty（）;
__m128i ACCU，温度，mlpkernel，mlpInnerPixels;
__m128i移位= _mm_cvtsi32_si128（32）;

对于（H = m_nRowStart; H＆LT; m_nRowEnd; H ++）//为每行
{
    //跳过左边缘
    lpInterRowInside = lpInterRow + cbLeftEdgeStride;

    对于（i = knLeftEdgeMax; I＆LT; knRightEdgeStart;我++）//为每个边缘内
    {
        总和= 0;
        分= 0;
        lpKernel =（INT_PTR）内核;

        lpInnerPixels = lpRow +（（ⅰ -  tailLength）其中;＆小于1）; //这是我们开始的行

        // MMX优化
        ACCU = _mm_setzero_si128（）; //零累加器

        //矢量处理
        为（loopcount = fourcount; loopcount = 0;！loopcount--）// //为可以被处理为一个向量中的每个内核项
        {
            //总和+ =（UINT）（*（内核+ J）*（arrayIn + H * arrayLen1 + I  -  tailLength + J））;

            // _m_pmaddwd：4 * 16位乘加，得到的两个32位= A0 * B0 + A1 * B1; A2 * B2 + A3 * B3]
            // _mm_add_pi32 / _m_paddd：2 * 32位插件
            //临时= _m_pmaddwd（*（__ m128i *）lpKernel，*（__ m128i *）lpInnerPixels）;
            // mlpkernel = _mm_cvtsi32_si128（lpKernel）;
            mlpkernel = _mm_cvtsi64_si128（*（__的Int64 *）lpKernel）;
            mlpInnerPixels = _mm_cvtsi64_si128（*（__的Int64 *）lpInnerPixels）;
            临时= _mm_madd_epi16（mlpkernel，mlpInnerPixels）;

            ACCU = _mm_add_epi32（ACCU，温度）; //每个双字都有一个部分和

            lpKernel + = 8; lpInnerPixels + = 8;

        } //循环内核

        //复制HI-DWORD MM0到LO-DWORD的MM1，再总结MMO + MM1
        //最后结果存储在变量ACCU
        ACCU = _mm_add_epi32（ACCU，_mm_srl_epi64（ACCU，移位））; //组合来自上层的结果，下双字

        总和= _mm_cvtsi128_si32（ACCU）; //将MMX结果总结

        //标量
        对于（loopcount = remainingcount; loopcount = 0;！loopcount--）//为无法被处理成一个向量中每个内核项目
        {
            //总和+ =（UINT）（*（内核+ J）*（arrayIn + H * arrayLen1 + I  -  tailLength + J））;
            总和+ =（UINT）（（*（USHORT *）lpKernel）*（USHORT *）（lpInnerPixels））;
            //点++;
            lpKernel + = 2; lpInnerPixels + = 2;
        } //循环内核


        // *（interArray + H * arrayLen1 + I）=（USHORT）（总和/ *（norm_r +点 -  1））;

        *（USHORT *）lpInterRowInside =（USHORT）（总和/标准）;
        lpInterRowInside + = 2; //移动到下一列的sizeof（USHORT）
    } //为每列


    lpRow + = rowstride; //移动到下一行（H * arrayLen1）
    lpInterRow + = rowstride;


} //对每行

// _ mm_empty（）;

返回0;

}

I understand converting MMX 32bit mmx intrinsics no longer allows the __m64. So I was having great trouble upgrading this piece of code to SSE. I was told on another stack-Overflow post to post my code. Perhaps this exercise will help others as well.

I commented out '_mm_empty' thinking that was the right thing to do. I found like functions in the emmintrin.h for all the other __m128i opertions, but something is still wrong.

original 32-bit function code:

DWORD CSumInsideHorizontalTask::InternalDoWork()
{
    ////////////////////////////////////////////////////////////
    // get local vars representing parameters from original call
    ushort* arrayIn     = m_taskdata.arrayIn;
    ushort arrayLen0    = m_taskdata.arrayLen0;
    ushort arrayLen1    = m_taskdata.arrayLen1;
    ushort* kernel      = m_taskdata.kernel;
    ushort kernelLen    = m_taskdata.kernelLen;
    uint32_t* norm_r        = m_taskdata.norm_r;
    ushort* outputArray = m_taskdata.outputArray;

    ushort* interArray = m_taskdata.interArray;
    ////////////////////////////////////////////////////////////

    ushort tailLength = (ushort)((kernelLen - 1) / 2);

    _ASSERTE(interArray);

    //ushort* pRow = NULL; // the current row
    //ushort* pInterRow = NULL; // the current row in the interarray

    INT_PTR lpRow = (INT_PTR)arrayIn; // for integer pointer arithmatic
    INT_PTR lpInterRow = (INT_PTR)interArray; // for integer pointer arithmatic 
    INT_PTR rowstride = sizeof(ushort)*arrayLen1;
    INT_PTR lpKernel;

    // adjust for non-zero start
    lpRow += m_nRowStart*rowstride;
    lpInterRow += m_nRowStart*rowstride;

    // want to process only those (edge) pixels that need the innner loop condition 
    const int knLeftEdgeMax = kernelLen - 1 - tailLength; // go from 0 to the end of the left edge
    const int knRightEdgeStart = arrayLen1 - kernelLen + 1 + tailLength;
    INT_PTR lpInterRowInside; // use this to work inside the edges

    int h, i;
    uint sum, points;
    uint32_t norm = norm_r[kernelLen-1]; // always process the full kernel
    INT_PTR lpInnerPixels; // use this to simplify the pointer math in the kernel loop
    INT_PTR cbLeftEdgeStride = 2*knLeftEdgeMax;

    // use this for MMX optimizations
    int fourcount = kernelLen/4;
    int remainingcount = kernelLen%4;
    int mmxcount = 4*fourcount; // this is where the remainder is handled
    int loopcount = 0; // use the for fast looping tests

    _mm_empty();
    __m64 accu, temp;
    __m64 shifter = _m_from_int(32);

    for (h=m_nRowStart; h < m_nRowEnd; h++) // for each row
    {
        // skip over left edge
        lpInterRowInside = lpInterRow + cbLeftEdgeStride; 

        for (i = knLeftEdgeMax; i < knRightEdgeStart; i++) // for each inside the edges
        {
            sum = 0;
            points = 0;
            lpKernel = (INT_PTR)kernel;

            lpInnerPixels = lpRow + ((i - tailLength)<<1); // this is where we start in the row

            // MMX Optimizations
            accu = _mm_setzero_si64(); // zero the accumulator

            // VECTOR processing
            for (loopcount = fourcount; loopcount != 0; loopcount--) // // for each kernel item that can be processed as a vector
            {
                //sum += (uint)(*(kernel + j) * *(arrayIn + h * arrayLen1 + i - tailLength + j));

                // _m_pmaddwd: : 4*16bit multiply-add, resulting two 32bits = [a0*b0+a1*b1 ; a2*b2+a3*b3]
                // _mm_add_pi32/_m_paddd: 2*32bit add 
                temp = _m_pmaddwd(*(__m64*)lpKernel, *(__m64*)lpInnerPixels);

                accu = _mm_add_pi32(accu, temp); // each double word has a partial sum

                lpKernel += 8; lpInnerPixels += 8;

            } // loop over the kernel

            // copy hi-dword of mm0 to lo-dword of mm1, then sum mmo+mm1
            // and finally store the result into the variable "accu"
            accu = _mm_add_pi32(accu, _mm_srl_si64(accu, shifter)); // combine results from upper and lower double words

            sum = _m_to_int(accu); // move mmx result to sum

            // SCALAR
            for (loopcount = remainingcount; loopcount != 0; loopcount--) // for each kernel item that couldn't be processed as a vector
            {
                //sum += (uint)(*(kernel + j) * *(arrayIn + h * arrayLen1 + i - tailLength + j));
                sum += (uint)((*(ushort*)lpKernel) * *(ushort*)(lpInnerPixels));
                //points++;
                lpKernel += 2; lpInnerPixels += 2;
            } // loop over the kernel


            //*(interArray + h * arrayLen1 + i) = (ushort)(sum / *(norm_r + points - 1));

            *(ushort*)lpInterRowInside = (ushort)(sum/norm);
            lpInterRowInside += 2; // move to next column sizeof(ushort)
        } // for each column


        lpRow += rowstride; // move to next row ( h * arrayLen1 )
        lpInterRow += rowstride;


    } // for each row

    _mm_empty();

    return 0;

}

64 Bit Attempt:

DWORD CSumInsideHorizontalTask::InternalDoWork()
{
    ////////////////////////////////////////////////////////////
    // get local vars representing parameters from original call
    ushort* arrayIn     = m_taskdata.arrayIn;
    ushort arrayLen0    = m_taskdata.arrayLen0;
    ushort arrayLen1    = m_taskdata.arrayLen1;
    ushort* kernel      = m_taskdata.kernel;
    ushort kernelLen    = m_taskdata.kernelLen;
    uint32_t* norm_r        = m_taskdata.norm_r;
    ushort* outputArray = m_taskdata.outputArray;

    ushort* interArray = m_taskdata.interArray;
    ////////////////////////////////////////////////////////////

    ushort tailLength = (ushort)((kernelLen - 1) / 2);

    _ASSERTE(interArray);

    //ushort* pRow = NULL; // the current row
    //ushort* pInterRow = NULL; // the current row in the interarray

    INT_PTR lpRow = (INT_PTR)arrayIn; // for integer pointer arithmatic
    INT_PTR lpInterRow = (INT_PTR)interArray; // for integer pointer arithmatic 
    INT_PTR rowstride = sizeof(ushort)*arrayLen1;
    INT_PTR lpKernel;

    // adjust for non-zero start
    lpRow += m_nRowStart*rowstride;
    lpInterRow += m_nRowStart*rowstride;


    // want to process only those (edge) pixels that need the innner loop condition 
    const int knLeftEdgeMax = kernelLen - 1 - tailLength; // go from 0 to the end of the left edge
    const int knRightEdgeStart = arrayLen1 - kernelLen + 1 + tailLength;
    INT_PTR lpInterRowInside; // use this to work inside the edges

    int h, i;
    uint sum, points;
    uint32_t norm = norm_r[kernelLen-1]; // always process the full kernel
    INT_PTR lpInnerPixels; // use this to simplify the pointer math in the kernel loop
    INT_PTR cbLeftEdgeStride = 2*knLeftEdgeMax;

    // use this for MMX optimizations
    int fourcount = kernelLen/4;
    int remainingcount = kernelLen%4;
    int mmxcount = 4*fourcount; // this is where the remainder is handled
    int loopcount = 0; // use the for fast looping tests

    //_mm_empty();
    __m128i accu, temp;
    __m128i shifter = _mm_cvtsi32_si128(32);

    for (h=m_nRowStart; h < m_nRowEnd; h++) // for each row
    {
        // skip over left edge
        lpInterRowInside = lpInterRow + cbLeftEdgeStride; 

        for (i = knLeftEdgeMax; i < knRightEdgeStart; i++) // for each inside the edges
        {
            sum = 0;
            points = 0;
            lpKernel = (INT_PTR)kernel;

            lpInnerPixels = lpRow + ((i - tailLength)<<1); // this is where we start in the row

            // MMX Optimizations
            accu = _mm_setzero_si128(); // zero the accumulator

            // VECTOR processing
            for (loopcount = fourcount; loopcount != 0; loopcount--) // // for each kernel item that can be processed as a vector
            {
                //sum += (uint)(*(kernel + j) * *(arrayIn + h * arrayLen1 + i - tailLength + j));

                // _m_pmaddwd: : 4*16bit multiply-add, resulting two 32bits = [a0*b0+a1*b1 ; a2*b2+a3*b3]
                // _mm_add_pi32/_m_paddd: 2*32bit add 
                //temp = _m_pmaddwd(*(__m128i*)lpKernel, *(__m128i*)lpInnerPixels);
                temp = _mm_madd_epi16(*(__m128i*)lpKernel, *(__m128i*)lpInnerPixels);

                accu = _mm_add_epi32(accu, temp); // each double word has a partial sum

                lpKernel += 8; lpInnerPixels += 8;

            } // loop over the kernel

            // copy hi-dword of mm0 to lo-dword of mm1, then sum mmo+mm1
            // and finally store the result into the variable "accu"
            accu = _mm_add_epi32(accu, _mm_sll_epi64(accu, shifter)); // combine results from upper and lower double words

            sum = _mm_cvtsi128_si32(accu); // move mmx result to sum

            // SCALAR
            for (loopcount = remainingcount; loopcount != 0; loopcount--) // for each kernel item that couldn't be processed as a vector
            {
                //sum += (uint)(*(kernel + j) * *(arrayIn + h * arrayLen1 + i - tailLength + j));
                sum += (uint)((*(ushort*)lpKernel) * *(ushort*)(lpInnerPixels));
                //points++;
                lpKernel += 2; lpInnerPixels += 2;
            } // loop over the kernel


            //*(interArray + h * arrayLen1 + i) = (ushort)(sum / *(norm_r + points - 1));

            *(ushort*)lpInterRowInside = (ushort)(sum/norm);
            lpInterRowInside += 2; // move to next column sizeof(ushort)
        } // for each column


        lpRow += rowstride; // move to next row ( h * arrayLen1 )
        lpInterRow += rowstride;


    } // for each row

    //_mm_empty();

    return 0;

}

解决方案

With all the issues fixed mentioned above in the comments. Here is the final working x64 SSE Convolution code:

DWORD CSumInsideHorizontalTask::InternalDoWork()
{
////////////////////////////////////////////////////////////
// get local vars representing parameters from original call
ushort* arrayIn     = m_taskdata.arrayIn;
ushort arrayLen0    = m_taskdata.arrayLen0;
ushort arrayLen1    = m_taskdata.arrayLen1;
ushort* kernel      = m_taskdata.kernel;
ushort kernelLen    = m_taskdata.kernelLen;
uint32_t* norm_r        = m_taskdata.norm_r;
ushort* outputArray = m_taskdata.outputArray;

ushort* interArray = m_taskdata.interArray;
////////////////////////////////////////////////////////////

ushort tailLength = (ushort)((kernelLen - 1) / 2);

_ASSERTE(interArray);

//ushort* pRow = NULL; // the current row
//ushort* pInterRow = NULL; // the current row in the interarray

INT_PTR lpRow = (INT_PTR)arrayIn; // for integer pointer arithmatic
INT_PTR lpInterRow = (INT_PTR)interArray; // for integer pointer arithmatic 
INT_PTR rowstride = sizeof(ushort)*arrayLen1;
INT_PTR lpKernel;

// adjust for non-zero start
lpRow += m_nRowStart*rowstride;
lpInterRow += m_nRowStart*rowstride;


// want to process only those (edge) pixels that need the innner loop condition 
const int knLeftEdgeMax = kernelLen - 1 - tailLength; // go from 0 to the end of the left edge
const int knRightEdgeStart = arrayLen1 - kernelLen + 1 + tailLength;
INT_PTR lpInterRowInside; // use this to work inside the edges

int h, i;
uint sum, points;
uint32_t norm = norm_r[kernelLen-1]; // always process the full kernel
INT_PTR lpInnerPixels; // use this to simplify the pointer math in the kernel loop
INT_PTR cbLeftEdgeStride = 2*knLeftEdgeMax;

// use this for MMX optimizations
int fourcount = kernelLen/4;
int remainingcount = kernelLen%4;
int mmxcount = 4*fourcount; // this is where the remainder is handled
int loopcount = 0; // use the for fast looping tests

//_mm_empty();
__m128i accu, temp, mlpkernel, mlpInnerPixels;
__m128i shifter = _mm_cvtsi32_si128(32);

for (h=m_nRowStart; h < m_nRowEnd; h++) // for each row
{
    // skip over left edge
    lpInterRowInside = lpInterRow + cbLeftEdgeStride; 

    for (i = knLeftEdgeMax; i < knRightEdgeStart; i++) // for each inside the edges
    {
        sum = 0;
        points = 0;
        lpKernel = (INT_PTR)kernel;

        lpInnerPixels = lpRow + ((i - tailLength)<<1); // this is where we start in the row

        // MMX Optimizations
        accu = _mm_setzero_si128(); // zero the accumulator

        // VECTOR processing
        for (loopcount = fourcount; loopcount != 0; loopcount--) // // for each kernel item that can be processed as a vector
        {
            //sum += (uint)(*(kernel + j) * *(arrayIn + h * arrayLen1 + i - tailLength + j));

            // _m_pmaddwd: : 4*16bit multiply-add, resulting two 32bits = [a0*b0+a1*b1 ; a2*b2+a3*b3]
            // _mm_add_pi32/_m_paddd: 2*32bit add 
            //temp = _m_pmaddwd(*(__m128i*)lpKernel, *(__m128i*)lpInnerPixels);
            //mlpkernel = _mm_cvtsi32_si128(lpKernel);
            mlpkernel = _mm_cvtsi64_si128(*(__int64*)lpKernel);
            mlpInnerPixels = _mm_cvtsi64_si128(*(__int64*)lpInnerPixels);
            temp = _mm_madd_epi16(mlpkernel, mlpInnerPixels);

            accu = _mm_add_epi32(accu, temp); // each double word has a partial sum

            lpKernel += 8; lpInnerPixels += 8;

        } // loop over the kernel

        // copy hi-dword of mm0 to lo-dword of mm1, then sum mmo+mm1
        // and finally store the result into the variable "accu"
        accu = _mm_add_epi32(accu, _mm_srl_epi64(accu, shifter)); // combine results from upper and lower double words

        sum = _mm_cvtsi128_si32(accu); // move mmx result to sum

        // SCALAR
        for (loopcount = remainingcount; loopcount != 0; loopcount--) // for each kernel item that couldn't be processed as a vector
        {
            //sum += (uint)(*(kernel + j) * *(arrayIn + h * arrayLen1 + i - tailLength + j));
            sum += (uint)((*(ushort*)lpKernel) * *(ushort*)(lpInnerPixels));
            //points++;
            lpKernel += 2; lpInnerPixels += 2;
        } // loop over the kernel


        //*(interArray + h * arrayLen1 + i) = (ushort)(sum / *(norm_r + points - 1));

        *(ushort*)lpInterRowInside = (ushort)(sum/norm);
        lpInterRowInside += 2; // move to next column sizeof(ushort)
    } // for each column


    lpRow += rowstride; // move to next row ( h * arrayLen1 )
    lpInterRow += rowstride;


} // for each row

//_mm_empty();

return 0;

}

上一篇：与Mac OS X的64位处理器上运行32位汇编器上、OS、Mac

下一篇：WebGL的：如何使一个对象的一部分透明？透明、对象、WebGL

相关推荐

精彩图集

精彩推荐

图片推荐