/*vaan_idct.cpp:  The Ivec version of the iDCT routine.
********************************************************************
*
*                    Copyright (c) 1998 Intel Corporation
*    
*    THIS SOURCE CODE IS PROVIDED "AS IS" WITH NO WARRANTIES WHATSOEVER, 
*    INCLUDING ANY WARRANTY OF MERCHANTABILITY, NONINFRINGEMENT, FITNESS FOR ANY 
*    PARTICULAR PURPOSE, OR ANY WARRANTY OTHERWISE ARISING OUT OF ANY PROPOSAL, 
*    SPECIFICATION OR SAMPLE. Intel disclaims all liability, including liability 
*    for infringement of any proprietary rights, relating to use of information 
*    in this specification. No license, express or implied, by estoppel or 
*    otherwise, to any intellectual property rights is granted herein, other 
*    than a royalty-free copyright license to use, copy, modify, distribute, and 
*    otherwise dispose of this source code and resulting object code in any 
*    format, for execution on Intel Architecture processors.
*
*    Microprocessors may contain design defects or errors known as errata which
*    may cause the product to deviate from published specifications. Current 
*    characterized errata for Intel microprocessors are available on request.
*
*
********************************************************************
*/

// NOTE: This function must be compiled with an Intrinsics-aware
// C++ compiler. (e.g. the Intel C/C++ Compiler, version 3.1,
// available later in 1998)

#include "aan_idct.h"

/*F*
////////////////////////////////////////////////////////////////////////////
// Name:		MMXIvec_iDCT8x8AAN
//
// Purpose:		Performs an inverse DCT on an 8x8 block.
//				Optimized assembly code using MMX(TM) technology.
//
// Context:		The AAN (Arai, Agui, and Nakajima) algorithm from
//				Trans. IEICE, vol. E 71(11), 1095-1097, Nov. 1988 is used.
//				This implementation using MMX(TM) Ivec classes was developed
//				by Intel Corporation.
//
// Returns:		None.
//
// Parameters:
//				coef_block	Input:  A set of 1 DC and 63 AC coefficients.
//							Ouput:  An 8x8 raster of image values.
//
////////////////////////////////////////////////////////////////////////////
*F*/
void MMXIvec_iDCT8x8AAN(short *coef_block)
{
    // Variables
    __m64 *ptr64;       // Access the data 64 bits at a time

	Is16vec4 row1, row2, row3, row4;
	Is16vec4 zeroes, ones, twos, threes, fours, fives, sixes, sevens;
	Is16vec4 lower_hi, lower_lo, upper_hi, upper_lo;
	Is16vec4 stg0, stg1, stg2, stg3, stg4, stg5, stg6, stg7;

	Is16vec4 stg21, stg23, stg24, stg25, stg26;
	Is16vec4 stg10, stg11, stg12, stg13, stg15, stg16, stg17;
	Is16vec4 stg33;

	Iu8vec8 tmpub, M64Const_zero;
	Is16vec4 tmps;


    // Given an 8x8 matrix of shorts in the following configuration:
    //          a0 a1 a2 a3 a4 a5 a6 a7
    //          b0 b1 b2 b3 b4 b5 b6 b7
    //          c0 c1 c2 c3 c4 c5 c6 c7
    //          d0 d1 d2 d3 d4 d5 d6 d7
    //          e0 e1 e2 e3 e4 e5 e6 e7
    //          f0 f1 f2 f3 f4 f5 f6 f7
    //          g0 g1 g2 g3 g4 g5 g6 g7
    //          h0 h1 h2 h3 h4 h5 h6 h7
    //
    // Do the columns
    // Break the column calculations into two iterations of a loop, one for the
    // left half of the matrix and the other for the top.  
    // No transposition need occur for the columnar arithmetic.
    ptr64 = (__m64 *) coef_block; 
    
    // Process the columns with this loop.
    // Loop will only execute twice, once for top half of matrix, once for bottom.
    // Read in half of matrix
        zeroes  = (Is16vec4)*(ptr64 + 0);
        ones    = (Is16vec4)*(ptr64 + 2);
        twos    = (Is16vec4)*(ptr64 + 4);
        threes  = (Is16vec4)*(ptr64 + 6);
        fours   = (Is16vec4)*(ptr64 + 8);
        fives   = (Is16vec4)*(ptr64 + 10);
        sixes   = (Is16vec4)*(ptr64 + 12);
        sevens  = (Is16vec4)*(ptr64 + 14);
	
    // Remember, all of the following calculations are SIMD.
    //
    //		stg0 = lptr[0] + lptr[32];
    //		stg4 = lptr[0] - lptr[32];
    //		stg1  = lptr[8] + lptr[56];
    //		stg7  = lptr[8] - lptr[56];
    //		stg6 = lptr[16] - lptr[48];
    //		stg2 = lptr[16] + lptr[48];
    //		stg3  = lptr[40] - lptr[24];
    //		stg5  = lptr[40] + lptr[24];
    //
        stg0 = zeroes + fours;
        stg4 = zeroes - fours;
        stg1 = ones + sevens;
        stg7 = ones - sevens;
        stg6 = twos - sixes;
        stg2 = twos + sixes;
        stg3 = fives - threes;
        stg5 = fives + threes;

    //		stg15 = stg1 - stg5;
    //		stg11 = stg1 + stg5;
        stg15 = stg1 - stg5;
        stg11 = stg1 + stg5;

    //		stg12 = stg0 + stg2;
    //		stg10 = stg0 - stg2;
        stg12 = stg0 + stg2;
        stg10 = stg0 - stg2;

    //		stg16 = SCALEM(b1*stg6) - stg2;
    //		stg24 = stg4 + stg16;
    //		stg26 = stg4 - stg16;
        tmps = stg6 << 2;
		tmps = mul_high (tmps, M64Const_xm1);
        stg16 = tmps - stg2;

        stg24 = stg4 + stg16;
        stg26 = stg4 - stg16;

    //		stg13 = SCALEM(b5*(stg3-stg7));     // Changed to stg3+stg7 per ASM implementation
    //		stg17 = SCALEM(b4*stg7) - stg13;
    //		stg21 = stg17 - stg11;              // Changed to stg23-stg11 per ASM
        tmps = stg3 + stg7;
        tmps <<= 2;
		stg13 = mul_high (tmps, M64Const_xm4);

        tmps = stg7 << 2;
		tmps = mul_high (tmps, M64Const_xm3);
        stg17 = tmps - stg13;

    //		stg23 = stg13 - SCALEM(b2*stg3);    // Changed to + per ASM 
        tmps = stg3 << 3;
		tmps = mul_high (tmps, M64Const_xm2);
        stg23 = stg13 + tmps;

        stg21 = stg23 - stg11;

    //		stg25 = SCALEM(b3*stg15) - stg21;
    //		stg33 = stg25 + stg23;              // Changed to stg17+stg25 per ASM
        tmps = stg15 << 2;
		tmps = mul_high (tmps, M64Const_xm1);
        stg25 = tmps - stg21;

        stg33 = stg17 + stg25;

    //		lptr[0]  = (stg12 + stg11);
    //		lptr[8]  = (stg24 + stg21);
    //		lptr[16] = (stg26 + stg25);
    //		lptr[24] = (stg10 - stg33);
    //		lptr[32] = (stg10 + stg33);
    //		lptr[40] = (stg26 - stg25);
    //		lptr[48] = (stg24 - stg21);
    //		lptr[56] = (stg12 - stg11);
        zeroes = stg12 + stg11;
        ones = stg24 + stg21;
        twos = stg26 + stg25;
        threes = stg10 - stg33;
        fours = stg10 + stg33;
        fives = stg26 - stg25;
        sixes = stg24 - stg21;
        sevens = stg12 - stg11;

        // Write out half of matrix
        *(ptr64 + 0) = zeroes;
        *(ptr64 + 2) = ones;
        *(ptr64 + 4) = twos;
        *(ptr64 + 6) = threes;
        *(ptr64 + 8) = fours;
        *(ptr64 + 10) = fives;
        *(ptr64 + 12) = sixes;
        *(ptr64 + 14) = sevens;

        ptr64 += 1;             // Move to right half of matrix

        // Read in half of matrix
        zeroes  = (Is16vec4)*(ptr64 + 0);
        ones    = (Is16vec4)*(ptr64 + 2);
        twos    = (Is16vec4)*(ptr64 + 4);
        threes  = (Is16vec4)*(ptr64 + 6);
        fours   = (Is16vec4)*(ptr64 + 8);
        fives   = (Is16vec4)*(ptr64 + 10);
        sixes   = (Is16vec4)*(ptr64 + 12);
        sevens  = (Is16vec4)*(ptr64 + 14);
	
    // Remember, all of the following calculations are SIMD.
    //
    //		stg0 = lptr[0] + lptr[32];
    //		stg4 = lptr[0] - lptr[32];
    //		stg1  = lptr[8] + lptr[56];
    //		stg7  = lptr[8] - lptr[56];
    //		stg6 = lptr[16] - lptr[48];
    //		stg2 = lptr[16] + lptr[48];
    //		stg3  = lptr[40] - lptr[24];
    //		stg5  = lptr[40] + lptr[24];
    //
        stg0 = zeroes + fours;
        stg4 = zeroes - fours;
        stg1 = ones + sevens;
        stg7 = ones - sevens;
        stg6 = twos - sixes;
        stg2 = twos + sixes;
        stg3 = fives - threes;
        stg5 = fives + threes;

    //		stg15 = stg1 - stg5;
    //		stg11 = stg1 + stg5;
        stg15 = stg1 - stg5;
        stg11 = stg1 + stg5;

    //		stg12 = stg0 + stg2;
    //		stg10 = stg0 - stg2;
        stg12 = stg0 + stg2;
        stg10 = stg0 - stg2;

    //		stg16 = SCALEM(b1*stg6) - stg2;
    //		stg24 = stg4 + stg16;
    //		stg26 = stg4 - stg16;
        tmps = stg6 << 2;
		tmps = mul_high (tmps, M64Const_xm1);
        stg16 = tmps - stg2;

        stg24 = stg4 + stg16;
        stg26 = stg4 - stg16;

    //		stg13 = SCALEM(b5*(stg3-stg7));     // Changed to stg3+stg7 per ASM implementation
    //		stg17 = SCALEM(b4*stg7) - stg13;
    //		stg21 = stg17 - stg11;              // Changed to stg23-stg11 per ASM
        tmps = stg3 + stg7;
        tmps <<= 2;
		stg13 = mul_high (tmps, M64Const_xm4);

        tmps = stg7 << 2;
		tmps = mul_high (tmps, M64Const_xm3);
        stg17 = tmps - stg13;

    //		stg23 = stg13 - SCALEM(b2*stg3);    // Changed to + per ASM 
        tmps = stg3 << 3;
		tmps = mul_high (tmps, M64Const_xm2);
        stg23 = stg13 + tmps;

        stg21 = stg23 - stg11;

    //		stg25 = SCALEM(b3*stg15) - stg21;
    //		stg33 = stg25 + stg23;              // Changed to stg17+stg25 per ASM
        tmps = stg15 << 2;
		tmps = mul_high (tmps, M64Const_xm1);
        stg25 = tmps - stg21;

        stg33 = stg17 + stg25;

    //		lptr[0]  = (stg12 + stg11);
    //		lptr[8]  = (stg24 + stg21);
    //		lptr[16] = (stg26 + stg25);
    //		lptr[24] = (stg10 - stg33);
    //		lptr[32] = (stg10 + stg33);
    //		lptr[40] = (stg26 - stg25);
    //		lptr[48] = (stg24 - stg21);
    //		lptr[56] = (stg12 - stg11);
        zeroes = stg12 + stg11;
        ones = stg24 + stg21;
        twos = stg26 + stg25;
        threes = stg10 - stg33;
        fours = stg10 + stg33;
        fives = stg26 - stg25;
        sixes = stg24 - stg21;
        sevens = stg12 - stg11;

        // Write out half of matrix
        *(ptr64 + 0) = zeroes;
        *(ptr64 + 2) = ones;
        *(ptr64 + 4) = twos;
        *(ptr64 + 6) = threes;
        *(ptr64 + 8) = fours;
        *(ptr64 + 10) = fives;
        *(ptr64 + 12) = sixes;
        *(ptr64 + 14) = sevens;

    // Transpose each of the four quadrants to isolate common column components
    // For example, quadrant 1:
    //          a0 b0 c0 d0
    //          a1 b1 c1 d1
    //          a2 b2 c2 d2
    //          a3 b3 c3 b3
    //
    // This allows for SIMD Row operations.  Work on 1/2 (top/bottom) of the
    // matrix at a time.

    // Break the row calculations into two iterations of a loop, one for the
    // top half of the matrix and the other for the bottom.  At the end of two
    // iterations, the matrix will be returned to its proper order for the
    // vertical calculations.
    ptr64 = (__m64 *) coef_block; 
    
    // Process the rows with this loop.
    // Loop will only execute twice, once for top half of matrix, once for bottom.
        // Read in 1st quadrant
        row1 = (Is16vec4)*(ptr64 + 0);
        row2 = (Is16vec4)*(ptr64 + 2);
        row3 = (Is16vec4)*(ptr64 + 4);
        row4 = (Is16vec4)*(ptr64 + 6);

        // Transpose to isolate rows
        lower_lo = unpack_low (row1, row3);
        lower_hi = unpack_high (row1, row3);
        upper_lo = unpack_low (row2, row4);
        upper_hi = unpack_high (row2, row4);

        zeroes = unpack_low (lower_lo, upper_lo);
        ones = unpack_high (lower_lo, upper_lo);
        twos = unpack_low (lower_hi, upper_hi);
        threes = unpack_high (lower_hi, upper_hi);

        // Read in 2nd quadrant
        row1 = (Is16vec4)*(ptr64 + 1);
        row2 = (Is16vec4)*(ptr64 + 3);
        row3 = (Is16vec4)*(ptr64 + 5);
        row4 = (Is16vec4)*(ptr64 + 7);

        // Transpose to isolate rows
        lower_lo = unpack_low (row1, row3);
        lower_hi = unpack_high (row1, row3);
        upper_lo = unpack_low (row2, row4);
        upper_hi = unpack_high (row2, row4);

        fours = unpack_low (lower_lo, upper_lo);
        fives = unpack_high (lower_lo, upper_lo);
        sixes = unpack_low (lower_hi, upper_hi);
        sevens = unpack_high (lower_hi, upper_hi);

    // Remember, all of the following calculations are SIMD.
    //
    //		stg0 = lptr[0] + lptr[32];
    //		stg4 = lptr[0] - lptr[32];
    //		stg1  = lptr[8] + lptr[56];
    //		stg7  = lptr[8] - lptr[56];
    //		stg6 = lptr[16] - lptr[48];
    //		stg2 = lptr[16] + lptr[48];
    //		stg3  = lptr[40] - lptr[24];
    //		stg5  = lptr[40] + lptr[24];
    //
        stg0 = zeroes + fours;
        stg4 = zeroes - fours;
        stg1 = ones + sevens;
        stg7 = ones - sevens;
        stg6 = twos - sixes;
        stg2 = twos + sixes;
        stg3 = fives - threes;
        stg5 = fives + threes;

    //		stg15 = stg1 - stg5;
    //		stg11 = stg1 + stg5;
        stg15 = stg1 - stg5;
        stg11 = stg1 + stg5;

    //		stg12 = stg0 + stg2;
    //		stg10 = stg0 - stg2;
        stg12 = stg0 + stg2;
        stg10 = stg0 - stg2;

    //		stg16 = SCALEM(b1*stg6) - stg2;
    //		stg24 = stg4 + stg16;
    //		stg26 = stg4 - stg16;
        tmps = stg6 << 2;
		tmps = mul_high (tmps, M64Const_xm1);
        stg16 = tmps - stg2;

        stg24 = stg4 + stg16;
        stg26 = stg4 - stg16;

    //		stg13 = SCALEM(b5*(stg3-stg7));     // Changed to stg3+stg7 per ASM implementation
    //		stg17 = SCALEM(b4*stg7) - stg13;
    //		stg21 = stg17 - stg11;              // Changed to stg23-stg11 per ASM
        tmps = stg3 + stg7;
        tmps <<= 2;
		stg13 = mul_high (tmps, M64Const_xm4);

        tmps = stg7 << 2;
		tmps = mul_high (tmps, M64Const_xm3);
        stg17 = tmps - stg13;

    //		stg23 = stg13 - SCALEM(b2*stg3);    // Changed to + per ASM 
        tmps = stg3 << 3;
		tmps = mul_high (tmps, M64Const_xm2);
        stg23 = stg13 + tmps;

        stg21 = stg23 - stg11;

    //		stg25 = SCALEM(b3*stg15) - stg21;
    //		stg33 = stg25 + stg23;              // Changed to stg17+stg25 per ASM
        tmps = stg15 << 2;
		tmps = mul_high (tmps, M64Const_xm1);
        stg25 = tmps - stg21;

        stg33 = stg17 + stg25;

    //		lptr[0]  = SHIFT_AND_BOUND(stg12 + stg11);
    //		lptr[8]  = SHIFT_AND_BOUND(stg24 + stg21);
    //		lptr[16] = SHIFT_AND_BOUND(stg26 + stg25);
    //		lptr[24] = SHIFT_AND_BOUND(stg10 - stg33);
    //		lptr[32] = SHIFT_AND_BOUND(stg10 + stg33);
    //		lptr[40] = SHIFT_AND_BOUND(stg26 - stg25);
    //		lptr[48] = SHIFT_AND_BOUND(stg24 - stg21);
    //		lptr[56] = SHIFT_AND_BOUND(stg12 - stg11);
        M64Const_zero = (Iu8vec8)_m_from_int (0);

        zeroes = stg12 + stg11;
        tmps = zeroes >> 5;
        tmps = tmps + M64Const_128;
        tmpub = packu_sat (tmps, tmps);
		// result - implicit conversion from Iu8vec8 to Is16vec4
        zeroes = unpack_low (tmpub, M64Const_zero); 

        ones = stg24 + stg21;
        tmps = ones >> 5;
        tmps = tmps + M64Const_128;
        tmpub = packu_sat (tmps, tmps);
		// result - implicit conversion from Iu8vec8 to Is16vec4
        ones = unpack_low (tmpub, M64Const_zero);

        twos = stg26 + stg25;
        tmps = twos >> 5;
        tmps = tmps + M64Const_128;
        tmpub = packu_sat (tmps, tmps);
		// result - implicit conversion from Iu8vec8 to Is16vec4
        twos = unpack_low (tmpub, M64Const_zero);

        threes = stg10 - stg33;
        tmps = threes >> 5;
        tmps = tmps + M64Const_128;
        tmpub = packu_sat (tmps, tmps);
		// result - implicit conversion from Iu8vec8 to Is16vec4
        threes = unpack_low (tmpub, M64Const_zero);

        fours = stg10 + stg33;
        tmps = fours >> 5;
        tmps = tmps + M64Const_128;
        tmpub = packu_sat (tmps, tmps);
		// result - implicit conversion from Iu8vec8 to Is16vec4
        fours = unpack_low (tmpub, M64Const_zero);

        fives = stg26 - stg25;
        tmps = fives >> 5;
        tmps = tmps + M64Const_128;
        tmpub = packu_sat (tmps, tmps);
		// result - implicit conversion from Iu8vec8 to Is16vec4
        fives = unpack_low (tmpub, M64Const_zero);

        sixes = stg24 - stg21;
        tmps = sixes >> 5;
        tmps = tmps + M64Const_128;
        tmpub = packu_sat (tmps, tmps);
		// result - implicit conversion from Iu8vec8 to Is16vec4
        sixes = unpack_low (tmpub, M64Const_zero);

        sevens = stg12 - stg11;
        tmps = sevens >> 5;
        tmps = tmps + M64Const_128;
        tmpub = packu_sat (tmps, tmps);
		// result - implicit conversion from Iu8vec8 to Is16vec4
        sevens = unpack_low (tmpub, M64Const_zero);

        // Write out first quadrant
        lower_lo = unpack_low (zeroes, ones);
        lower_hi = unpack_high (zeroes, ones);
        upper_lo = unpack_low (twos, threes);
        upper_hi = unpack_high (twos, threes);

		// Arguments of unpacks are Is16vec4 type.
		// Need to cast to Is32vec2 so we use correct unpack intrinsics
        row1 = unpack_low ((Is32vec2)lower_lo, (Is32vec2)upper_lo);
        row2 = unpack_high ((Is32vec2)lower_lo, (Is32vec2)upper_lo);
        row3 = unpack_low ((Is32vec2)lower_hi, (Is32vec2)upper_hi);
        row4 = unpack_high ((Is32vec2)lower_hi, (Is32vec2)upper_hi);

        *(ptr64+0) = row1;
        *(ptr64+2) = row2;
        *(ptr64+4) = row3;
        *(ptr64+6) = row4;
        
        // Write out second quadrant
        lower_lo = unpack_low (fours, fives);
        lower_hi = unpack_high (fours, fives);
        upper_lo = unpack_low (sixes, sevens);
        upper_hi = unpack_high (sixes, sevens);

		// Arguments of unpacks are Is16vec4 type.
		// Need to cast to Is32vec2 so we use correct unpack intrinsics
        row1 = unpack_low ((Is32vec2)lower_lo, (Is32vec2)upper_lo);
        row2 = unpack_high ((Is32vec2)lower_lo, (Is32vec2)upper_lo);
        row3 = unpack_low ((Is32vec2)lower_hi, (Is32vec2)upper_hi);
        row4 = unpack_high ((Is32vec2)lower_hi, (Is32vec2)upper_hi);

        *(ptr64+1) = row1;
        *(ptr64+3) = row2;
        *(ptr64+5) = row3;
        *(ptr64+7) = row4;
        
        ptr64 += 8;             // Move to bottom half of matrix

        // Read in 1st quadrant
        row1 = (Is16vec4)*(ptr64 + 0);
        row2 = (Is16vec4)*(ptr64 + 2);
        row3 = (Is16vec4)*(ptr64 + 4);
        row4 = (Is16vec4)*(ptr64 + 6);

        // Transpose to isolate rows
        lower_lo = unpack_low (row1, row3);
        lower_hi = unpack_high (row1, row3);
        upper_lo = unpack_low (row2, row4);
        upper_hi = unpack_high (row2, row4);

        zeroes = unpack_low (lower_lo, upper_lo);
        ones = unpack_high (lower_lo, upper_lo);
        twos = unpack_low (lower_hi, upper_hi);
        threes = unpack_high (lower_hi, upper_hi);

        // Read in 2nd quadrant
        row1 = (Is16vec4)*(ptr64 + 1);
        row2 = (Is16vec4)*(ptr64 + 3);
        row3 = (Is16vec4)*(ptr64 + 5);
        row4 = (Is16vec4)*(ptr64 + 7);

        // Transpose to isolate rows
        lower_lo = unpack_low (row1, row3);
        lower_hi = unpack_high (row1, row3);
        upper_lo = unpack_low (row2, row4);
        upper_hi = unpack_high (row2, row4);

        fours = unpack_low (lower_lo, upper_lo);
        fives = unpack_high (lower_lo, upper_lo);
        sixes = unpack_low (lower_hi, upper_hi);
        sevens = unpack_high (lower_hi, upper_hi);

    // Remember, all of the following calculations are SIMD.
    //
    //		stg0 = lptr[0] + lptr[32];
    //		stg4 = lptr[0] - lptr[32];
    //		stg1  = lptr[8] + lptr[56];
    //		stg7  = lptr[8] - lptr[56];
    //		stg6 = lptr[16] - lptr[48];
    //		stg2 = lptr[16] + lptr[48];
    //		stg3  = lptr[40] - lptr[24];
    //		stg5  = lptr[40] + lptr[24];
    //
        stg0 = zeroes + fours;
        stg4 = zeroes - fours;
        stg1 = ones + sevens;
        stg7 = ones - sevens;
        stg6 = twos - sixes;
        stg2 = twos + sixes;
        stg3 = fives - threes;
        stg5 = fives + threes;

    //		stg15 = stg1 - stg5;
    //		stg11 = stg1 + stg5;
        stg15 = stg1 - stg5;
        stg11 = stg1 + stg5;

    //		stg12 = stg0 + stg2;
    //		stg10 = stg0 - stg2;
        stg12 = stg0 + stg2;
        stg10 = stg0 - stg2;

    //		stg16 = SCALEM(b1*stg6) - stg2;
    //		stg24 = stg4 + stg16;
    //		stg26 = stg4 - stg16;
        tmps = stg6 << 2;
	tmps = mul_high (tmps, M64Const_xm1);
        stg16 = tmps - stg2;

        stg24 = stg4 + stg16;
        stg26 = stg4 - stg16;

    //		stg13 = SCALEM(b5*(stg3-stg7));     // Changed to stg3+stg7 per ASM implementation
    //		stg17 = SCALEM(b4*stg7) - stg13;
    //		stg21 = stg17 - stg11;              // Changed to stg23-stg11 per ASM
        tmps = stg3 + stg7;
        tmps = tmps << 2;
		stg13 = mul_high (tmps, M64Const_xm4);

        tmps = stg7 << 2;
		tmps = mul_high (tmps, M64Const_xm3);
        stg17 = tmps - stg13;

    //		stg23 = stg13 - SCALEM(b2*stg3);    // Changed to + per ASM 
        tmps = stg3 << 3;
		tmps = mul_high (tmps, M64Const_xm2);
        stg23 = stg13 + tmps;

        stg21 = stg23 - stg11;

    //		stg25 = SCALEM(b3*stg15) - stg21;
    //		stg33 = stg25 + stg23;              // Changed to stg17+stg25 per ASM
        tmps = stg15 << 2;
		tmps = mul_high (tmps, M64Const_xm1);
        stg25 = tmps - stg21;

        stg33 = stg17 + stg25;

    //		lptr[0]  = SHIFT_AND_BOUND(stg12 + stg11);
    //		lptr[8]  = SHIFT_AND_BOUND(stg24 + stg21);
    //		lptr[16] = SHIFT_AND_BOUND(stg26 + stg25);
    //		lptr[24] = SHIFT_AND_BOUND(stg10 - stg33);
    //		lptr[32] = SHIFT_AND_BOUND(stg10 + stg33);
    //		lptr[40] = SHIFT_AND_BOUND(stg26 - stg25);
    //		lptr[48] = SHIFT_AND_BOUND(stg24 - stg21);
    //		lptr[56] = SHIFT_AND_BOUND(stg12 - stg11);
        M64Const_zero = (Iu8vec8)_m_from_int (0);

        zeroes = stg12 + stg11;
        tmps = zeroes >> 5;
        tmps = tmps + M64Const_128;
        tmpub = packu_sat (tmps, tmps);
		// result - implicit conversion from Iu8vec8 to Is16vec4
        zeroes = unpack_low (tmpub, M64Const_zero);

        ones = stg24 + stg21;
        tmps = ones >> 5;
        tmps = tmps + M64Const_128;
        tmpub = packu_sat (tmps, tmps);
		// result - implicit conversion from Iu8vec8 to Is16vec4
        ones = unpack_low (tmpub, M64Const_zero);

        twos = stg26 + stg25;
        tmps = twos >> 5;
        tmps = tmps + M64Const_128;
        tmpub = packu_sat (tmps, tmps);
		// result - implicit conversion from Iu8vec8 to Is16vec4
        twos = unpack_low (tmpub, M64Const_zero);

        threes = stg10 - stg33;
        tmps = threes >> 5;
        tmps = tmps + M64Const_128;
        tmpub = packu_sat (tmps, tmps);
		// result - implicit conversion from Iu8vec8 to Is16vec4
        threes = unpack_low (tmpub, M64Const_zero);

        fours = stg10 + stg33;
        tmps = fours >> 5;
        tmps = tmps + M64Const_128;
        tmpub = packu_sat (tmps, tmps);
		// result - implicit conversion from Iu8vec8 to Is16vec4
        fours = unpack_low (tmpub, M64Const_zero);

        fives = stg26 - stg25;
        tmps = fives >> 5;
        tmps = tmps + M64Const_128;
        tmpub = packu_sat (tmps, tmps);
		// result - implicit conversion from Iu8vec8 to Is16vec4
        fives = unpack_low (tmpub, M64Const_zero);

        sixes = stg24 - stg21;
        tmps = sixes >> 5;
        tmps = tmps + M64Const_128;
        tmpub = packu_sat (tmps, tmps);
		// result - implicit conversion from Iu8vec8 to Is16vec4
        sixes = unpack_low (tmpub, M64Const_zero);

        sevens = stg12 - stg11;
        tmps = sevens >> 5;
        tmps = tmps + M64Const_128;
        tmpub = packu_sat (tmps, tmps);
		// result - implicit conversion from Iu8vec8 to Is16vec4
        sevens = unpack_low (tmpub, M64Const_zero);

        // Write out first quadrant
        lower_lo = unpack_low (zeroes, ones);
        lower_hi = unpack_high (zeroes, ones);
        upper_lo = unpack_low(twos, threes);
        upper_hi = unpack_high (twos, threes);

		// Arguments of unpacks are Is16vec4 type.
		// Need to cast to Is32vec2 so we use correct unpack intrinsics
        row1 = unpack_low ((Is32vec2)lower_lo, (Is32vec2)upper_lo);
        row2 = unpack_high ((Is32vec2)lower_lo, (Is32vec2)upper_lo);
        row3 = unpack_low ((Is32vec2)lower_hi, (Is32vec2)upper_hi);
        row4 = unpack_high ((Is32vec2)lower_hi, (Is32vec2)upper_hi);

        *(ptr64+0) = row1;
        *(ptr64+2) = row2;
        *(ptr64+4) = row3;
        *(ptr64+6) = row4;
        
        // Write out second quadrant
        lower_lo = unpack_low (fours, fives);
        lower_hi = unpack_high (fours, fives);
        upper_lo = unpack_low (sixes, sevens);
        upper_hi = unpack_high (sixes, sevens);

		// Arguments of unpacks are Is16vec4 type.
		// Need to cast to Is32vec2 so we use correct unpack intrinsics
        row1 = unpack_low ((Is32vec2)lower_lo, (Is32vec2)upper_lo);
        row2 = unpack_high ((Is32vec2)lower_lo, (Is32vec2)upper_lo);
        row3 = unpack_low ((Is32vec2)lower_hi, (Is32vec2)upper_hi);
        row4 = unpack_high ((Is32vec2)lower_hi, (Is32vec2)upper_hi);

        *(ptr64+1) = row1;
        *(ptr64+3) = row2;
        *(ptr64+5) = row3;
        *(ptr64+7) = row4;

} // end of MMXIvec_iDCT8x8AAN


