/*!
 ***************************************************************************
 * \file transform.c
 *
 * \brief
 *    Transform functions
 *
 * \author
 *    Main contributors (see contributors.h for copyright, address and affiliation details)
 *      - Alexis Michael Tourapis
 * \date
 *    01. July 2007
 ***************************************************************************
 */

#include "global.h"
#include "transform.h"
#include <emmintrin.h>   /* SSE2 intrinsics (__m128i, _mm_*) used by the _M_X64 path below */

void forward4x4(int **block, int **tblock, int pos_y, int pos_x)
{
  int i, ii;
  int tmp[16];
  int *pTmp = tmp, *pblock;
  int p0, p1, p2, p3;
  int t0, t1, t2, t3;

  // Horizontal
  for (i = pos_y; i < pos_y + BLOCK_SIZE; i++)
  {
    pblock = &block[i][pos_x];
    p0 = *(pblock++);
    p1 = *(pblock++);
    p2 = *(pblock++);
    p3 = *(pblock  );

    t0 = p0 + p3;
    t1 = p1 + p2;
    t2 = p1 - p2;
    t3 = p0 - p3;

    *(pTmp++) = t0 + t1;
    *(pTmp++) = (t3 << 1) + t2;
    *(pTmp++) = t0 - t1;
    *(pTmp++) = t3 - (t2 << 1);
  }

  // Vertical
  for (i = 0; i < BLOCK_SIZE; i++)
  {
    pTmp = tmp + i;
    p0 = *pTmp;
    p1 = *(pTmp += BLOCK_SIZE);
    p2 = *(pTmp += BLOCK_SIZE);
    p3 = *(pTmp += BLOCK_SIZE);

    t0 = p0 + p3;
    t1 = p1 + p2;
    t2 = p1 - p2;
    t3 = p0 - p3;

    ii = pos_x + i;
    tblock[pos_y    ][ii] = t0 + t1;
    tblock[pos_y + 1][ii] = t2 + (t3 << 1);
    tblock[pos_y + 2][ii] = t0 - t1;
    tblock[pos_y + 3][ii] = t3 - (t2 << 1);
  }
}

static void inverse4x4(const h264_short_block_t tblock, h264_short_block_t block, int pos_y, int pos_x)
{
  int i;
  short tmp[16];
  short *pTmp = tmp;
  int p0, p1, p2, p3;
  int t0, t1, t2, t3;

  // Horizontal
  for (i = 0; i < BLOCK_SIZE; i++)
  {
    t0 = tblock[i][0];
    t1 = tblock[i][1];
    t2 = tblock[i][2];
    t3 = tblock[i][3];

    p0 = t0 + t2;
    p1 = t0 - t2;
    p2 = (t1 >> 1) - t3;
    p3 = t1 + (t3 >> 1);

    *(pTmp++) = p0 + p3;
    *(pTmp++) = p1 + p2;
    *(pTmp++) = p1 - p2;
    *(pTmp++) = p0 - p3;
  }

  // Vertical
  for (i = 0; i < BLOCK_SIZE; i++)
  {
    pTmp = tmp + i;
    t0 = *pTmp;
    t1 = *(pTmp += BLOCK_SIZE);
    t2 = *(pTmp += BLOCK_SIZE);
    t3 = *(pTmp += BLOCK_SIZE);

    p0 = t0 + t2;
    p1 = t0 - t2;
    p2 = (t1 >> 1) - t3;
    p3 = t1 + (t3 >> 1);

    block[0][i] = p0 + p3;
    block[1][i] = p1 + p2;
    block[2][i] = p1 - p2;
    block[3][i] = p0 - p3;
  }
}

#ifdef _M_IX86
// benski> this exists just for conformance testing.
// not used in production code
static void inverse4x4_sse2_x86(const h264_short_macroblock_t tblock, h264_short_macroblock_t block, int pos_y, int pos_x)
{
  __asm
  {
    mov edx, pos_y
    shl edx, 4           // 16 step stride
    add edx, pos_x
    shl edx, 1           // * sizeof(short)

    // eax: pointer to the start of tblock (offset by passed pos_y, pos_x)
    mov eax, edx
    add eax, tblock

    // esi: results
    mov esi, edx
    add esi, block

    // load 4x4 matrix
    movq mm0, MMWORD PTR 0[eax]
    movq mm1, MMWORD PTR 32[eax]
    movq mm2, MMWORD PTR 64[eax]
    movq mm3, MMWORD PTR 96[eax]

    // rotate 4x4 matrix
    movq mm4, mm0        // p0 = mm4 (copy)
    punpcklwd mm0, mm2   // r0 = mm0
    punpckhwd mm4, mm2   // r2 = mm4
    movq mm5, mm1        // p1 = mm5 (copy)
    punpcklwd mm1, mm3   // r1 = mm1
    punpckhwd mm5, mm3   // r3 = mm5
    movq mm6, mm0        // r0 = mm6 (copy)
    punpcklwd mm0, mm1   // t0 = mm0
    punpckhwd mm6, mm1   // t1 = mm6
    movq mm1, mm4        // r2 = mm1 (copy)
    punpcklwd mm1, mm5   // t2 = mm1
    punpckhwd mm4, mm5   // t3 = mm4

    /* register state: mm0: t0   mm1: t2   mm4: t3   mm6: t1 */

    /* p0 = t0 + t2;  p1 = t0 - t2;  p2 = (t1 >> 1) - t3;  p3 = t1 + (t3 >> 1); */
    movq mm2, mm0        // mm2 = t0 (copy)
    paddw mm0, mm1       // mm0 = p0
    psubw mm2, mm1       // mm2 = p1, mm1 available
    movq mm5, mm6        // mm5 = t1 (copy)
    psraw mm5, 1         // mm5 = (t1 >> 1)
    psubw mm5, mm4       // mm5 = p2
    psraw mm4, 1         // mm4 = (t3 >> 1)
    paddw mm6, mm4       // mm6 = p3

    /* register state: mm0: p0   mm2: p1   mm5: p2   mm6: p3 */

    /* *(pTmp++) = p0 + p3;  *(pTmp++) = p1 + p2;  *(pTmp++) = p1 - p2;  *(pTmp++) = p0 - p3; */
    movq mm3, mm0        // mm3 = p0 (copy)
    paddw mm0, mm6       // mm0 = r0
    movq mm1, mm2        // mm1 = p1 (copy)
    paddw mm1, mm5       // mm1 = r1
    psubw mm2, mm5       // mm2 = r2, mm5 available
    psubw mm3, mm6       // mm3 = r3

    /* register state: mm0: r0   mm1: r1   mm2: r2   mm3: r3 */

    // rotate 4x4 matrix to set up for vertical
    movq mm4, mm0        // r0 = mm4 (copy)
    punpcklwd mm0, mm2   // p0 = mm0
    punpckhwd mm4, mm2   // p2 = mm4
    movq mm5, mm1        // r1 = mm5 (copy)
    punpcklwd mm1, mm3   // p1 = mm1
    punpckhwd mm5, mm3   // p3 = mm5
    movq mm6, mm0        // p0 = mm6 (copy)
    punpcklwd mm0, mm1   // t0 = mm0
    punpckhwd mm6, mm1   // t1 = mm6
    movq mm1, mm4        // p2 = mm1 (copy)
    punpcklwd mm1, mm5   // t2 = mm1
    punpckhwd mm4, mm5   // t3 = mm4

    /* register state: mm0: t0   mm1: t2   mm4: t3   mm6: t1 */

    /* p0 = t0 + t2;  p1 = t0 - t2;  p2 = (t1 >> 1) - t3;  p3 = t1 + (t3 >> 1); */
    movq mm2, mm0        // mm2 = t0 (copy)
    paddw mm0, mm1       // mm0 = p0
    psubw mm2, mm1       // mm2 = p1, mm1 available
    movq mm5, mm6        // mm5 = t1 (copy)
    psraw mm5, 1         // mm5 = (t1 >> 1)
    psubw mm5, mm4       // mm5 = p2
    psraw mm4, 1         // mm4 = (t3 >> 1)
    paddw mm6, mm4       // mm6 = p3

    /* register state: mm0: p0   mm2: p1   mm5: p2   mm6: p3 */

    /* *(pTmp++) = p0 + p3;  *(pTmp++) = p1 + p2;  *(pTmp++) = p1 - p2;  *(pTmp++) = p0 - p3; */
    movq mm3, mm0        // mm3 = p0 (copy)
    paddw mm0, mm6       // mm0 = r0
    movq mm1, mm2        // mm1 = p1 (copy)
    paddw mm1, mm5       // mm1 = r1
    psubw mm2, mm5       // mm2 = r2, mm5 available
    psubw mm3, mm6       // mm3 = r3

    /* register state: mm0: r0   mm1: r1   mm2: r2   mm3: r3 */

    movq MMWORD PTR 0[esi], mm0
    movq MMWORD PTR 32[esi], mm1
    movq MMWORD PTR 64[esi], mm2
    movq MMWORD PTR 96[esi], mm3
  }
}
#endif

static void sample_reconstruct(h264_imgpel_macroblock_t curImg, const h264_imgpel_macroblock_t mpr, const h264_short_block_t tblock, int joff, int mb_x, int max_imgpel_value)
{
#ifdef _M_IX86
  __asm
  {
    // mm0 : constant value 32
    mov edx, 0x00200020
    movd mm0, edx
    punpckldq mm0, mm0

    // ecx: y offset
    mov ecx, joff
    shl ecx, 4           // imgpel stuff is going to be 16 byte stride
    add ecx, mb_x

    // eax: curImg
    mov eax, curImg
    add eax, ecx

    // edx: mpr
    mov edx, mpr
    add edx, ecx

    // ecx: tblock (which is short, not byte)
    mov ecx, tblock

    // mm7: zero
    pxor mm7, mm7

    // load coefficients
    movq mm1, MMWORD PTR 0[ecx]
    movq mm2, MMWORD PTR 8[ecx]
    movq mm3, MMWORD PTR 16[ecx]
    movq mm4, MMWORD PTR 24[ecx]

    paddw mm1, mm0       // rres + 32
    paddw mm2, mm0       // rres + 32
    paddw mm3, mm0       // rres + 32
    paddw mm0, mm4       // rres + 32
    psraw mm1, 6         // (rres + 32) >> 6
    psraw mm2, 6         // (rres + 32) >> 6
    psraw mm3, 6         // (rres + 32) >> 6
    psraw mm0, 6         // (rres + 32) >> 6
    // mm1-mm3: tblock[0] - tblock[2], mm0: tblock[3]

    // convert mpr from unsigned char to short
    movd mm4, DWORD PTR 0[edx]
    movd mm5, DWORD PTR 16[edx]
    movd mm6, DWORD PTR 32[edx]
    punpcklbw mm4, mm7
    punpcklbw mm5, mm7
    punpcklbw mm6, mm7

    paddsw mm4, mm1      // pred_row + rres_row
    movd mm1, DWORD PTR 48[edx]   // reuse mm1 for mpr[3]
    paddsw mm5, mm2      // pred_row + rres_row
    punpcklbw mm1, mm7
    paddsw mm6, mm3      // pred_row + rres_row
    paddsw mm1, mm0      // pred_row + rres_row

    // results in mm4, mm5, mm6, mm1
    // move back to 8 bit
    packuswb mm4, mm7
    packuswb mm5, mm7
    packuswb mm6, mm7
    packuswb mm1, mm7

    movd DWORD PTR 0[eax], mm4
    movd DWORD PTR 16[eax], mm5
    movd DWORD PTR 32[eax], mm6
    movd DWORD PTR 48[eax], mm1
  }
#else
  int i, j;

  for (j = 0; j < BLOCK_SIZE; j++)
  {
    for (i = 0; i < BLOCK_SIZE; i++)
    {
      /* plain-C equivalent of the MMX path above:
         (rres + 32) >> 6, add the prediction, clip to [0, max_imgpel_value] */
      int rec = ((tblock[j][i] + 32) >> 6) + mpr[j + joff][i + mb_x];
      if (rec < 0)
        rec = 0;
      else if (rec > max_imgpel_value)
        rec = max_imgpel_value;
      curImg[j + joff][i + mb_x] = (unsigned char) rec;
    }
  }
#endif
}

#ifdef _M_IX86
/*
 * NOTE: the signature, const32 declaration and prologue of this routine
 * (pointer set-up, coefficient loads and the first 4x4 rotation) were missing
 * from this copy of the file.  They are reconstructed below to mirror
 * inverse4x4_sse2_x86() and sample_reconstruct() above; the function name
 * itrans4x4_mmx is assumed.  The remainder of the __asm block is original.
 */
static void itrans4x4_mmx(const h264_short_block_t tblock, const h264_imgpel_macroblock_t mb_pred, h264_imgpel_macroblock_t mb_rec, int pos_x, int pos_y)
{
  static const unsigned __int64 const32 = 0x0020002000200020;   // four words of 32

  __asm
  {
    // ecx: mb_pred, edx: mb_rec (both offset by pos_y, pos_x; 16 byte stride)
    mov ecx, pos_y
    shl ecx, 4
    add ecx, pos_x
    mov edx, ecx
    add ecx, mb_pred
    add edx, mb_rec

    // eax: tblock coefficients (4x4 shorts, contiguous rows)
    mov eax, tblock

    // load 4x4 matrix
    movq mm0, MMWORD PTR 0[eax]
    movq mm1, MMWORD PTR 8[eax]
    movq mm2, MMWORD PTR 16[eax]
    movq mm3, MMWORD PTR 24[eax]

    // rotate 4x4 matrix
    movq mm4, mm0        // p0 = mm4 (copy)
    punpcklwd mm0, mm2   // r0 = mm0
    punpckhwd mm4, mm2   // r2 = mm4
    movq mm5, mm1        // p1 = mm5 (copy)
    punpcklwd mm1, mm3   // r1 = mm1
    punpckhwd mm5, mm3   // r3 = mm5
    movq mm6, mm0        // r0 = mm6 (copy)
    punpcklwd mm0, mm1   // t0 = mm0
    punpckhwd mm6, mm1   // t1 = mm6
    movq mm1, mm4        // r2 = mm1 (copy)
    punpcklwd mm1, mm5   // t2 = mm1
    punpckhwd mm4, mm5   // t3 = mm4

    /* register state: mm0: t0   mm1: t2   mm4: t3   mm6: t1 */

    /* p0 = t0 + t2;  p1 = t0 - t2;  p2 = (t1 >> 1) - t3;  p3 = t1 + (t3 >> 1); */
    movq mm2, mm0        // mm2 = t0 (copy)
    paddw mm0, mm1       // mm0 = p0
    psubw mm2, mm1       // mm2 = p1, mm1 available
    movq mm5, mm6        // mm5 = t1 (copy)
    psraw mm5, 1         // mm5 = (t1 >> 1)
    psubw mm5, mm4       // mm5 = p2
    psraw mm4, 1         // mm4 = (t3 >> 1)
    paddw mm6, mm4       // mm6 = p3

    /* register state: mm0: p0   mm2: p1   mm5: p2   mm6: p3 */

    /* *(pTmp++) = p0 + p3;  *(pTmp++) = p1 + p2;  *(pTmp++) = p1 - p2;  *(pTmp++) = p0 - p3; */
    movq mm3, mm0        // mm3 = p0 (copy)
    paddw mm0, mm6       // mm0 = r0
    movq mm1, mm2        // mm1 = p1 (copy)
    paddw mm1, mm5       // mm1 = r1
    psubw mm2, mm5       // mm2 = r2, mm5 available
    psubw mm3, mm6       // mm3 = r3

    /* register state: mm0: r0   mm1: r1   mm2: r2   mm3: r3 */

    // rotate 4x4 matrix to set up for vertical
    movq mm4, mm0        // r0 = mm4 (copy)
    punpcklwd mm0, mm2   // p0 = mm0
    punpckhwd mm4, mm2   // p2 = mm4
    movq mm5, mm1        // r1 = mm5 (copy)
    punpcklwd mm1, mm3   // p1 = mm1
    punpckhwd mm5, mm3   // p3 = mm5
    movq mm6, mm0        // p0 = mm6 (copy)
    punpcklwd mm0, mm1   // t0 = mm0
    punpckhwd mm6, mm1   // t1 = mm6
    movq mm1, mm4        // p2 = mm1 (copy)
    punpcklwd mm1, mm5   // t2 = mm1
    punpckhwd mm4, mm5   // t3 = mm4

    /* register state: mm0: t0   mm1: t2   mm4: t3   mm6: t1 */

    /* p0 = t0 + t2;  p1 = t0 - t2;  p2 = (t1 >> 1) - t3;  p3 = t1 + (t3 >> 1); */
    movq mm2, mm0        // mm2 = t0 (copy)
    paddw mm0, mm1       // mm0 = p0
    psubw mm2, mm1       // mm2 = p1, mm1 available
    movq mm5, mm6        // mm5 = t1 (copy)
    psraw mm5, 1         // mm5 = (t1 >> 1)
    psubw mm5, mm4       // mm5 = p2
    psraw mm4, 1         // mm4 = (t3 >> 1)
    paddw mm6, mm4       // mm6 = p3

    /* register state: mm0: p0   mm2: p1   mm5: p2   mm6: p3 */

    /* *(pTmp++) = p0 + p3;  *(pTmp++) = p1 + p2;  *(pTmp++) = p1 - p2;  *(pTmp++) = p0 - p3; */
    movq mm3, mm0        // mm3 = p0 (copy)
    paddw mm0, mm6       // mm0 = r0
    movq mm1, mm2        // mm1 = p1 (copy)
    paddw mm1, mm5       // mm1 = r1
    psubw mm2, mm5       // mm2 = r2, mm5 available
    psubw mm3, mm6       // mm3 = r3

    /* register state: mm0: r0   mm1: r1   mm2: r2   mm3: r3 */

    /* --- 4x4 iDCT done, now time to combine with mpr --- */

    // mm7 : constant value 32
    movq mm7, const32

    paddw mm0, mm7       // rres + 32
    psraw mm0, 6         // (rres + 32) >> 6
    paddw mm1, mm7       // rres + 32
    psraw mm1, 6         // (rres + 32) >> 6
    paddw mm2, mm7       // rres + 32
    psraw mm2, 6         // (rres + 32) >> 6
    paddw mm3, mm7       // rres + 32
    psraw mm3, 6         // (rres + 32) >> 6

    pxor mm7, mm7

    // convert mpr from unsigned char to short
    movd mm4, DWORD PTR 0[ecx]
    movd mm5, DWORD PTR 16[ecx]
    movd mm6, DWORD PTR 32[ecx]
    punpcklbw mm4, mm7
    punpcklbw mm5, mm7
    punpcklbw mm6, mm7

    paddsw mm4, mm0      // pred_row + rres_row
    movd mm0, DWORD PTR 48[ecx]   // reuse mm0 for mpr[3]
    paddsw mm5, mm1      // pred_row + rres_row
    punpcklbw mm0, mm7
    paddsw mm6, mm2      // pred_row + rres_row
    paddsw mm0, mm3      // pred_row + rres_row

    // results in mm4, mm5, mm6, mm0
    // move back to 8 bit
    packuswb mm4, mm7
    packuswb mm5, mm7
    packuswb mm6, mm7
    packuswb mm0, mm7

    movd DWORD PTR 0[edx], mm4
    movd DWORD PTR 16[edx], mm5
    movd DWORD PTR 32[edx], mm6
    movd DWORD PTR 48[edx], mm0
  }
}
#elif defined(_M_X64)
static void itrans4x4_sse2(const h264_int_macroblock_t tblock, const h264_imgpel_macroblock_t mb_pred, h264_imgpel_macroblock_t mb_rec, int pos_x, int pos_y)
{
  __declspec(align(32)) static const int const32[4] = {32, 32, 32, 32};
  __m128i p0, p1, p2, p3;
  __m128i t0, t1, t2, t3;
  __m128i r0, r1, r2, r3;
  __m128i c32, zero;

  // horizontal
  // load registers in vertical mode, we'll rotate them next
  p0 = _mm_loadu_si128((__m128i *)&tblock[pos_y][pos_x]);     // 00 01 02 03
  p1 = _mm_loadu_si128((__m128i *)&tblock[pos_y+1][pos_x]);   // 10 11 12 13
  p2 = _mm_loadu_si128((__m128i *)&tblock[pos_y+2][pos_x]);   // 20 21 22 23
  p3 = _mm_loadu_si128((__m128i *)&tblock[pos_y+3][pos_x]);   // 30 31 32 33

  // rotate 4x4 matrix
  r0 = _mm_unpacklo_epi32(p0, p2);   // 00 20 01 21
  r1 = _mm_unpacklo_epi32(p1, p3);   // 10 30 11 31
  r2 = _mm_unpackhi_epi32(p0, p2);   // 02 22 03 23
  r3 = _mm_unpackhi_epi32(p1, p3);   // 12 32 13 33

  t0 = _mm_unpacklo_epi32(r0, r1);   // 00 10 20 30
  t1 = _mm_unpackhi_epi32(r0, r1);   // 01 11 21 31
  t2 = _mm_unpacklo_epi32(r2, r3);   // 02 12 22 32
  t3 = _mm_unpackhi_epi32(r2, r3);   // 03 13 23 33

  p0 = _mm_add_epi32(t0, t2);        // t0 + t2;
  p1 = _mm_sub_epi32(t0, t2);        // t0 - t2;
  p2 = _mm_srai_epi32(t1, 1);        // t1 >> 1
  p2 = _mm_sub_epi32(p2, t3);        // (t1 >> 1) - t3;
  p3 = _mm_srai_epi32(t3, 1);        // (t3 >> 1)
  p3 = _mm_add_epi32(p3, t1);        // t1 + (t3 >> 1);

  t0 = _mm_add_epi32(p0, p3);        // p0 + p3;
  t1 = _mm_add_epi32(p1, p2);        // p1 + p2;
  t2 = _mm_sub_epi32(p1, p2);        // p1 - p2;
  t3 = _mm_sub_epi32(p0, p3);        // p0 - p3;

  // rotate 4x4 matrix to set up for vertical
  r0 = _mm_unpacklo_epi32(t0, t2);
  r1 = _mm_unpacklo_epi32(t1, t3);
  r2 = _mm_unpackhi_epi32(t0, t2);
  r3 = _mm_unpackhi_epi32(t1, t3);

  t0 = _mm_unpacklo_epi32(r0, r1);
  t1 = _mm_unpackhi_epi32(r0, r1);
  t2 = _mm_unpacklo_epi32(r2, r3);
  t3 = _mm_unpackhi_epi32(r2, r3);

  // vertical
  p0 = _mm_add_epi32(t0, t2);        // t0 + t2;
  p3 = _mm_srai_epi32(t3, 1);        // (t3 >> 1)
  p3 = _mm_add_epi32(p3, t1);        // t1 + (t3 >> 1);
  r0 = _mm_add_epi32(p0, p3);        // p0 + p3;
  r3 = _mm_sub_epi32(p0, p3);        // p0 - p3;

  p1 = _mm_sub_epi32(t0, t2);        // t0 - t2;
  p2 = _mm_srai_epi32(t1, 1);        // t1 >> 1
  p2 = _mm_sub_epi32(p2, t3);        // (t1 >> 1) - t3;
  r1 = _mm_add_epi32(p1, p2);        // p1 + p2;
  r2 = _mm_sub_epi32(p1, p2);        // p1 - p2;

  c32 = _mm_load_si128((const __m128i *)const32);
  zero = _mm_setzero_si128();

  // (x + 32) >> 6
  r0 = _mm_add_epi32(r0, c32);
  r0 = _mm_srai_epi32(r0, 6);
  r1 = _mm_add_epi32(r1, c32);
  r1 = _mm_srai_epi32(r1, 6);
  r2 = _mm_add_epi32(r2, c32);
  r2 = _mm_srai_epi32(r2, 6);
  r3 = _mm_add_epi32(r3, c32);
  r3 = _mm_srai_epi32(r3, 6);

  // convert to 16bit values
  r0 = _mm_packs_epi32(r0, r1);
  r2 = _mm_packs_epi32(r2, r3);

  // convert mpr from unsigned char to short
  p0 = _mm_cvtsi32_si128(*(int32_t *)&mb_pred[pos_y][pos_x]);
  p1 = _mm_cvtsi32_si128(*(int32_t *)&mb_pred[pos_y+1][pos_x]);
  p0 = _mm_unpacklo_epi32(p0, p1);
  p0 = _mm_unpacklo_epi8(p0, zero);   // convert to short
  r0 = _mm_add_epi16(r0, p0);

  p0 = _mm_cvtsi32_si128(*(int32_t *)&mb_pred[pos_y+2][pos_x]);
  p1 = _mm_cvtsi32_si128(*(int32_t *)&mb_pred[pos_y+3][pos_x]);
  p0 = _mm_unpacklo_epi32(p0, p1);
  p0 = _mm_unpacklo_epi8(p0, zero);   // convert to short
  r2 = _mm_add_epi16(r2, p0);

  r0 = _mm_packus_epi16(r0, r2);      // convert to unsigned char

  *(int32_t *)&mb_rec[pos_y][pos_x] = _mm_cvtsi128_si32(r0);
  r0 = _mm_srli_si128(r0, 4);
  *(int32_t *)&mb_rec[pos_y+1][pos_x] = _mm_cvtsi128_si32(r0);
  r0 = _mm_srli_si128(r0, 4);
  *(int32_t *)&mb_rec[pos_y+2][pos_x] = _mm_cvtsi128_si32(r0);
  r0 = _mm_srli_si128(r0, 4);
  *(int32_t *)&mb_rec[pos_y+3][pos_x] = _mm_cvtsi128_si32(r0);
}
#endif

void itrans4x4_c(const h264_short_block_t tblock, const h264_imgpel_macroblock_t mb_pred, h264_imgpel_macroblock_t mb_rec, int pos_x, int pos_y)
{
  inverse4x4(tblock, (h264_short_block_row_t *)tblock, pos_y, pos_x);
  sample_reconstruct(mb_rec, mb_pred, tblock, pos_y, pos_x, 255);
}

void ihadamard4x4(int block[4][4])
{
  int i;
  int tmp[16];
  int *pTmp = tmp;
  int p0, p1, p2, p3;
  int t0, t1, t2, t3;

  // Horizontal
  for (i = 0; i < BLOCK_SIZE; i++)
  {
    t0 = block[i][0];
    t1 = block[i][1];
    t2 = block[i][2];
    t3 = block[i][3];

    p0 = t0 + t2;
    p1 = t0 - t2;
    p2 = t1 - t3;
    p3 = t1 + t3;

    *(pTmp++) = p0 + p3;
    *(pTmp++) = p1 + p2;
    *(pTmp++) = p1 - p2;
    *(pTmp++) = p0 - p3;
  }

  // Vertical
  for (i = 0; i < BLOCK_SIZE; i++)
  {
    pTmp = tmp + i;
    t0 = *pTmp;
    t1 = *(pTmp += BLOCK_SIZE);
    t2 = *(pTmp += BLOCK_SIZE);
    t3 = *(pTmp += BLOCK_SIZE);

    p0 = t0 + t2;
    p1 = t0 - t2;
    p2 = t1 - t3;
    p3 = t1 + t3;

    block[0][i] = p0 + p3;
    block[1][i] = p1 + p2;
    block[2][i] = p1 - p2;
    block[3][i] = p0 - p3;
  }
}

void ihadamard4x2(int **tblock, int **block)
{
  int i;
  int tmp[8];
  int *pTmp = tmp;
  int p0, p1, p2, p3;
  int t0, t1, t2, t3;

  // Horizontal
  *(pTmp++) = tblock[0][0] + tblock[1][0];
  *(pTmp++) = tblock[0][1] + tblock[1][1];
  *(pTmp++) = tblock[0][2] + tblock[1][2];
  *(pTmp++) = tblock[0][3] + tblock[1][3];
  *(pTmp++) = tblock[0][0] - tblock[1][0];
  *(pTmp++) = tblock[0][1] - tblock[1][1];
  *(pTmp++) = tblock[0][2] - tblock[1][2];
  *(pTmp  ) = tblock[0][3] - tblock[1][3];

  // Vertical
  pTmp = tmp;
  for (i = 0; i < 2; i++)
  {
    p0 = *(pTmp++);
    p1 = *(pTmp++);
    p2 = *(pTmp++);
    p3 = *(pTmp++);

    t0 = p0 + p2;
    t1 = p0 - p2;
    t2 = p1 - p3;
    t3 = p1 + p3;

    // coefficients (transposed)
    block[0][i] = t0 + t3;
    block[1][i] = t1 + t2;
    block[2][i] = t1 - t2;
    block[3][i] = t0 - t3;
  }
}

// following function performs 8 additions, 8 assignments. Should be a bit faster
void ihadamard2x2(int tblock[4], int block[4])
{
  int t0, t1, t2, t3;

  t0 = tblock[0] + tblock[1];
  t1 = tblock[0] - tblock[1];
  t2 = tblock[2] + tblock[3];
  t3 = tblock[2] - tblock[3];

  block[0] = (t0 + t2);
  block[1] = (t1 + t3);
  block[2] = (t0 - t2);
  block[3] = (t1 - t3);
}
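
/*
 * Illustrative usage sketch (not part of the original file): exercises the
 * plain-C inverse path (inverse4x4 + sample_reconstruct via itrans4x4_c) on a
 * single DC-only 4x4 block.  It assumes the h264_* typedefs from transform.h /
 * global.h are plain 2D arrays (4x4 shorts and 16x16 pels), as their use above
 * suggests; the TRANSFORM_SELFTEST guard is hypothetical, so this never builds
 * by default.
 */
#ifdef TRANSFORM_SELFTEST
#include <stdio.h>
#include <string.h>

int main(void)
{
  h264_short_block_t coeff;          /* de-quantized coefficients for one 4x4 block */
  h264_imgpel_macroblock_t pred;     /* 16x16 prediction */
  h264_imgpel_macroblock_t rec;      /* 16x16 reconstruction */
  int i, j;

  memset(coeff, 0, sizeof(coeff));
  memset(pred, 128, sizeof(pred));   /* flat mid-grey prediction */
  coeff[0][0] = 64;                  /* DC-only residual: the inverse transform yields 64 per
                                        sample, and (64 + 32) >> 6 == 1 after rounding */

  itrans4x4_c(coeff, pred, rec, 0, 0);

  for (j = 0; j < 4; j++)
  {
    for (i = 0; i < 4; i++)
      printf("%4d", rec[j][i]);      /* expect 129 in every position of the 4x4 block */
    printf("\n");
  }
  return 0;
}
#endif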