; winamp/Src/h264dec/ldecod/src/prediction.asm

.686
.XMM
.model FLAT
; structure member offsets mirrored from the decoder's C headers
copy_image_data_16x16_stride@OptimizedFunctions = 32
dec_picture@VideoParameters = 698192
p_Slice@MacroBlock = 0
plane_images@StorablePicture = 158512
mb_rec@Slice = 1696
mb_pred@Slice = 928
cof@Slice = 2464
CONST SEGMENT
align 16
const32 DW 020H, 020H, 020H, 020H, 020H, 020H, 020H, 020H ; rounding term for (x + 32) >> 6
CONST ENDS
;
; weighted_bi_prediction4x4(mb_pred, block_l0, wp_scale_l0, wp_scale_l1,
;                           wp_offset, weight_denom)
; Blends the two 4x4 blocks of 8-bit samples in block_l0 and mb_pred with
; explicit weights and writes the clipped result back over mb_pred
; (16-byte row stride).
;
PUBLIC _weighted_bi_prediction4x4
_TEXT SEGMENT
mb_pred = 4
block_l0 = 8
wp_scale_l0 = 12
wp_scale_l1 = 16
wp_offset = 20
weight_denom = 24
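; A reference C sketch of the computation (reconstructed from the assembly,
; not taken from the original source; clip_uint8 is a hypothetical helper
; standing in for the packuswb saturation):
;
;   int round = (weight_denom > 0) ? 1 << (weight_denom - 1) : 0;
;   for (int j = 0; j < 4; j++)
;     for (int i = 0; i < 4; i++) {
;       int v = block_l0[j][i] * wp_scale_l0 + mb_pred[j][i] * wp_scale_l1;
;       v = (weight_denom > 0) ? (v + round) >> weight_denom
;                              : v << -weight_denom;
;       mb_pred[j][i] = clip_uint8(v + wp_offset);
;     }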
_weighted_bi_prediction4x4 PROC ; COMDAT
mov eax, DWORD PTR weight_denom[esp]
pxor mm0, mm0
pshufw mm1, MMWORD PTR wp_scale_l0[esp], 0
test eax, eax
pshufw mm2, MMWORD PTR wp_scale_l1[esp], 0
pshufw mm3, MMWORD PTR wp_offset[esp], 0
jle BI_PRED4x4@LEFT_SHIFT ; weight_denom <= 0: scale up with a left shift
movd mm4, eax
lea ecx, DWORD PTR [eax-1] ; ecx = weight_denom - 1
mov edx, 1
shl edx, cl ; edx = 1 << (weight_denom - 1), the rounding term
movd mm5, edx
mov eax, mb_pred[esp]
mov edx, block_l0[esp]
pshufw mm5, mm5, 0
movd mm6, DWORD PTR 0[edx] ; block_l0
movd mm7, DWORD PTR 0[eax] ; mb_pred
punpcklbw mm6, mm0
punpcklbw mm7, mm0
pmullw mm6, mm1
pmullw mm7, mm2
paddw mm6, mm7
movd mm7, DWORD PTR 16[eax] ; mb_pred
paddw mm6, mm5
psraw mm6, mm4
paddw mm6, mm3
packuswb mm6, mm6
movd DWORD PTR 0[eax], mm6
movd mm6, DWORD PTR 16[edx] ; block_l0
punpcklbw mm6, mm0
punpcklbw mm7, mm0
pmullw mm6, mm1
pmullw mm7, mm2
paddw mm6, mm7
movd mm7, DWORD PTR 32[eax] ; mb_pred
paddw mm6, mm5
psraw mm6, mm4
paddw mm6, mm3
packuswb mm6, mm6
movd DWORD PTR 16[eax], mm6
movd mm6, DWORD PTR 32[edx] ; block_l0
punpcklbw mm6, mm0
punpcklbw mm7, mm0
pmullw mm6, mm1
pmullw mm7, mm2
paddw mm6, mm7
movd mm7, DWORD PTR 48[eax] ; mb_pred
paddw mm6, mm5
psraw mm6, mm4
paddw mm6, mm3
packuswb mm6, mm6
movd DWORD PTR 32[eax], mm6
movd mm6, DWORD PTR 48[edx] ; block_l0
punpcklbw mm6, mm0
punpcklbw mm7, mm0
pmullw mm6, mm1
pmullw mm7, mm2
paddw mm6, mm7
paddw mm6, mm5
psraw mm6, mm4
paddw mm6, mm3
packuswb mm6, mm6
movd DWORD PTR 48[eax], mm6
ret 0
BI_PRED4x4@LEFT_SHIFT:
neg eax ; weight_denom <= 0, so shift left by -weight_denom
movd mm4, eax
mov eax, mb_pred[esp]
mov edx, block_l0[esp]
movd mm6, DWORD PTR 0[edx] ; block_l0
movd mm7, DWORD PTR 0[eax] ; mb_pred
punpcklbw mm6, mm0
punpcklbw mm7, mm0
pmullw mm6, mm1
pmullw mm7, mm2
paddw mm6, mm7
movd mm7, DWORD PTR 16[eax] ; mb_pred
psllw mm6, mm4
paddw mm6, mm3
packuswb mm6, mm6
movd DWORD PTR 0[eax], mm6
movd mm6, DWORD PTR 16[edx] ; block_l0
punpcklbw mm6, mm0
punpcklbw mm7, mm0
pmullw mm6, mm1
pmullw mm7, mm2
paddw mm6, mm7
movd mm7, DWORD PTR 32[eax] ; mb_pred
psllw mm6, mm4
paddw mm6, mm3
packuswb mm6, mm6
movd DWORD PTR 16[eax], mm6
movd mm6, DWORD PTR 32[edx] ; block_l0
punpcklbw mm6, mm0
punpcklbw mm7, mm0
pmullw mm6, mm1
pmullw mm7, mm2
paddw mm6, mm7
movd mm7, DWORD PTR 48[eax] ; mb_pred
psllw mm6, mm4
paddw mm6, mm3
packuswb mm6, mm6
movd DWORD PTR 32[eax], mm6
movd mm6, DWORD PTR 48[edx] ; block_l0
punpcklbw mm6, mm0
punpcklbw mm7, mm0
pmullw mm6, mm1
pmullw mm7, mm2
paddw mm6, mm7
psllw mm6, mm4
paddw mm6, mm3
packuswb mm6, mm6
movd DWORD PTR 48[eax], mm6
ret 0
_weighted_bi_prediction4x4 ENDP
_TEXT ENDS
PUBLIC _itrans4x4_mmx
_TEXT SEGMENT
_tblock$ = 4 ; size = 4
_mb_pred$ = 8 ; size = 4
_mb_rec$ = 12 ; size = 4
_pos_x$ = 16 ; size = 4
_pos_y$ = 20 ; size = 4
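; The per-row / per-column core inverse transform implemented below is, as a
; C sketch mirroring the butterfly comments (t[] is one row or column of
; 16-bit coefficients):
;
;   int p0 = t[0] + t[2];
;   int p1 = t[0] - t[2];
;   int p2 = (t[1] >> 1) - t[3];
;   int p3 = t[1] + (t[3] >> 1);
;   r[0] = p0 + p3;  r[1] = p1 + p2;
;   r[2] = p1 - p2;  r[3] = p0 - p3;
;
; Each final residual is then rounded with (r + 32) >> 6 and added to the
; 8-bit prediction with unsigned saturation (packuswb).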
_itrans4x4_mmx PROC ; COMDAT
mov edx, DWORD PTR _pos_y$[esp]
shl edx, 4
add edx, DWORD PTR _pos_x$[esp]
mov eax, DWORD PTR _tblock$[esp]
mov ecx, DWORD PTR _mb_pred$[esp]
add ecx, edx
add edx, DWORD PTR _mb_rec$[esp]
_itrans4x4_mmx_direct PROC ; COMDAT
; load 4x4 matrix
movq mm0, MMWORD PTR 0[eax]
movq mm1, MMWORD PTR 8[eax]
movq mm2, MMWORD PTR 16[eax]
movq mm3, MMWORD PTR 24[eax]
; rotate 4x4 matrix
movq mm4, mm0 ; p0 = mm4 (copy)
punpcklwd mm0, mm2 ; r0 = mm0
punpckhwd mm4, mm2 ; r2 = mm4
movq mm5, mm1 ; p1 = mm5 (copy)
punpcklwd mm1, mm3 ; r1 = mm1
punpckhwd mm5, mm3 ; r3 = mm5
movq mm6, mm0 ; r0 = mm6 (copy)
punpcklwd mm0, mm1 ; t0 = mm0
punpckhwd mm6, mm1 ; t1 = mm6
movq mm1, mm4 ; r2 = mm1 (copy)
punpcklwd mm1, mm5 ; t2 = mm1
punpckhwd mm4, mm5 ; t3 = mm4
movq mm2, mm0 ; mm2 = t0 (copy)
paddw mm0, mm1 ; mm0 = p0
psubw mm2, mm1 ; mm2 = p1, mm1 available
movq mm5, mm6 ; mm5 = t1 (copy)
psraw mm5, 1 ; mm5 = (t1 >> 1)
psubw mm5, mm4 ; mm5 = p2
psraw mm4, 1 ; mm4 = (t3 >> 1)
paddw mm6, mm4 ; mm6 = p3
movq mm3, mm0 ; mm3 = p0 (copy)
paddw mm0, mm6 ; mm0 = r0
movq mm1, mm2 ; mm1 = p1 (copy)
paddw mm1, mm5 ; mm1 = r1
psubw mm2, mm5 ; mm2 = r2, mm5 available
psubw mm3, mm6 ; mm3 = r3
; rotate 4x4 matrix to set up for vertical
movq mm4, mm0 ; r0 = mm4 (copy)
punpcklwd mm0, mm2 ; p0 = mm0
punpckhwd mm4, mm2 ; p2 = mm4
movq mm5, mm1 ; r1 = mm5 (copy)
punpcklwd mm1, mm3 ; p1 = mm1
punpckhwd mm5, mm3 ; p3 = mm5
movq mm6, mm0 ; p0 = mm6 (copy)
punpcklwd mm0, mm1 ; t0 = mm0
punpckhwd mm6, mm1 ; t1 = mm6
movq mm1, mm4 ; p2 = mm1 (copy)
punpcklwd mm1, mm5 ; t2 = mm1
punpckhwd mm4, mm5 ; t3 = mm4
movq mm2, mm0 ; mm2 = t0 (copy)
paddw mm0, mm1 ; mm0 = p0
psubw mm2, mm1 ; mm2 = p1, mm1 available
movq mm5, mm6 ; mm5 = t1 (copy)
psraw mm5, 1 ; mm5 = (t1 >> 1)
psubw mm5, mm4 ; mm5 = p2
psraw mm4, 1 ; mm4 = (t3 >> 1)
paddw mm6, mm4 ; mm6 = p3
movq mm3, mm0 ; mm3 = p0 (copy)
paddw mm0, mm6 ; mm0 = r0
movq mm1, mm2 ; mm1 = p1 (copy)
paddw mm1, mm5 ; mm1 = r1
psubw mm2, mm5 ; mm2 = r2, mm5 available
psubw mm3, mm6 ; mm3 = r3
; --- 4x4 iDCT done, now time to combine with mpr ---
movq mm7, MMWORD PTR const32
paddw mm0, mm7 ; rres + 32
psraw mm0, 6 ; (rres + 32) >> 6
paddw mm1, mm7 ; rres + 32
psraw mm1, 6 ; (rres + 32) >> 6
paddw mm2, mm7 ; rres + 32
psraw mm2, 6 ; (rres + 32) >> 6
paddw mm3, mm7 ; rres + 32
psraw mm3, 6 ; (rres + 32) >> 6
pxor mm7, mm7
; convert mpr from unsigned char to short
movd mm4, DWORD PTR 0[ecx]
movd mm5, DWORD PTR 16[ecx]
movd mm6, DWORD PTR 32[ecx]
punpcklbw mm4, mm7
punpcklbw mm5, mm7
punpcklbw mm6, mm7
paddsw mm4, mm0 ; pred_row + rres_row
movd mm0, DWORD PTR 48[ecx] ; reuse mm0 for mpr[3]
paddsw mm5, mm1 ; pred_row + rres_row
punpcklbw mm0, mm7
paddsw mm6, mm2 ; pred_row + rres_row
paddsw mm0, mm3 ; pred_row + rres_row
; results in mm4, mm5, mm6, mm0
; move back to 8 bit
packuswb mm4, mm7
packuswb mm5, mm7
packuswb mm6, mm7
packuswb mm0, mm7
movd DWORD PTR 0[edx], mm4
movd DWORD PTR 16[edx], mm5
movd DWORD PTR 32[edx], mm6
movd DWORD PTR 48[edx], mm0
ret 0
_itrans4x4_mmx_direct ENDP
_itrans4x4_mmx ENDP
_TEXT ENDS
EXTRN _itrans_sp:PROC
EXTRN _Inv_Residual_trans_4x4:PROC
PUBLIC _iMBtrans4x4
EXTRN _opt:BYTE
_TEXT SEGMENT
_currSlice$ = -4 ; size = 4
_mb_rec$166704 = 8 ; size = 4
_currMB$ = 8 ; size = 4
_curr_img$ = 12 ; size = 4
_pl$ = 8 ; second parameter
_smb$ = 16 ; size = 4
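; Overall control flow, per the interleaved source comments below (a sketch,
; not the verbatim original; x/y step over the sixteen 4x4 sub-blocks):
;
;   VideoImage *curr_img = pl ? dec_picture->imgUV[pl - 1] : dec_picture->imgY;
;   if (smb)
;     itrans_sp(blocks[k], mb_pred, currMB, pl, x, y);      // all 16 blocks
;   else if (currMB->is_lossless)
;     Inv_Residual_trans_4x4(currMB, pl, x, y);             // all 16 blocks
;   else
;     itrans4x4(blocks[k], mb_pred, mb_rec, x, y);          // all 16 blocks
;   opt_copy_image_data_16x16_stride(curr_img, currMB->pix_x, currMB->pix_y,
;                                    currSlice->mb_rec[pl]);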
_iMBtrans4x4 PROC
push ecx
push ebx
push ebp
push esi
STACKOFFSET = 16
; 408 : VideoImage *curr_img = pl ? dec_picture->imgUV[pl - 1]: dec_picture->imgY;
mov esi, DWORD PTR _pl$[esp+STACKOFFSET]
push edi
STACKOFFSET = STACKOFFSET + 4
mov edi, DWORD PTR _currMB$[esp+16]
mov ebp, DWORD PTR [edi+p_Slice@MacroBlock] ; ebp: currMB->p_Slice
mov eax, DWORD PTR [edi+4]
mov eax, DWORD PTR [eax+dec_picture@VideoParameters] ; eax: p_Vid->dec_picture;
mov DWORD PTR _currSlice$[esp+20], ebp
mov ecx, DWORD PTR [eax+esi*4+plane_images@StorablePicture]
mov DWORD PTR _curr_img$[esp+16], ecx
cmp DWORD PTR _smb$[esp+16], 0 ; if (smb)
; 413 : {
; 414 : h264_short_block_t *blocks = currSlice->cof4[pl];
; 415 : const h264_imgpel_macroblock_row_t *mb_pred=currSlice->mb_pred[pl];
; 416 :
; 417 : itrans_sp(blocks[0], mb_pred, currMB, pl, 0, 0);
je $LN4@iMBtrans4x
push 0
push 0
mov eax, esi
shl eax, 9
lea ebx, DWORD PTR [eax+ebp+cof@Slice]
mov ecx, esi
shl ecx, 8
lea ebp, DWORD PTR [ecx+ebp+mb_pred@Slice]
push esi
push ebp
push ebx
mov eax, edi
call _itrans_sp
; 418 : itrans_sp(blocks[1], mb_pred, currMB, pl, 4, 0);
push 0
push 4
push esi
lea edx, DWORD PTR [ebx+32]
push ebp
push edx
mov eax, edi
call _itrans_sp
; 419 : itrans_sp(blocks[2], mb_pred, currMB, pl, 0, 4);
push 4
push 0
push esi
lea eax, DWORD PTR [ebx+64]
push ebp
push eax
mov eax, edi
call _itrans_sp
; 420 : itrans_sp(blocks[3], mb_pred, currMB, pl, 4, 4);
push 4
push 4
push esi
lea ecx, DWORD PTR [ebx+96]
push ebp
push ecx
mov eax, edi
call _itrans_sp
add esp, 80 ; 00000050H
; 421 : itrans_sp(blocks[4], mb_pred, currMB, pl, 8, 0);
push 0
push 8
push esi
lea edx, DWORD PTR [ebx+128]
push ebp
push edx
mov eax, edi
call _itrans_sp
; 422 : itrans_sp(blocks[5], mb_pred, currMB, pl, 12, 0);
push 0
push 12 ; 0000000cH
push esi
lea eax, DWORD PTR [ebx+160]
push ebp
push eax
mov eax, edi
call _itrans_sp
; 423 : itrans_sp(blocks[6], mb_pred, currMB, pl, 8, 4);
push 4
push 8
push esi
lea ecx, DWORD PTR [ebx+192]
push ebp
push ecx
mov eax, edi
call _itrans_sp
; 424 : itrans_sp(blocks[7], mb_pred, currMB, pl, 12, 4);
push 4
push 12 ; 0000000cH
push esi
lea edx, DWORD PTR [ebx+224]
push ebp
push edx
mov eax, edi
call _itrans_sp
add esp, 80 ; 00000050H
; 425 : itrans_sp(blocks[8], mb_pred, currMB, pl, 0, 8);
push 8
push 0
push esi
lea eax, DWORD PTR [ebx+256]
push ebp
push eax
mov eax, edi
call _itrans_sp
; 426 : itrans_sp(blocks[9], mb_pred, currMB, pl, 4, 8);
push 8
push 4
push esi
push ebp
lea ecx, DWORD PTR [ebx+288]
push ecx
mov eax, edi
call _itrans_sp
; 427 : itrans_sp(blocks[10], mb_pred, currMB, pl, 0, 12);
push 12 ; 0000000cH
push 0
push esi
lea edx, DWORD PTR [ebx+320]
push ebp
push edx
mov eax, edi
call _itrans_sp
; 428 : itrans_sp(blocks[11], mb_pred, currMB, pl, 4, 12);
push 12 ; 0000000cH
push 4
push esi
lea eax, DWORD PTR [ebx+352]
push ebp
push eax
mov eax, edi
call _itrans_sp
add esp, 80 ; 00000050H
; 429 : itrans_sp(blocks[12], mb_pred, currMB, pl, 8, 8);
push 8
push 8
push esi
lea ecx, DWORD PTR [ebx+384]
push ebp
push ecx
mov eax, edi
call _itrans_sp
; 430 : itrans_sp(blocks[13], mb_pred, currMB, pl, 12, 8);
push 8
push 12 ; 0000000cH
push esi
lea edx, DWORD PTR [ebx+416]
push ebp
push edx
mov eax, edi
call _itrans_sp
; 431 : itrans_sp(blocks[14], mb_pred, currMB, pl, 8, 12);
push 12 ; 0000000cH
push 8
push esi
lea eax, DWORD PTR [ebx+448]
push ebp
push eax
mov eax, edi
call _itrans_sp
; 432 : itrans_sp(blocks[15], mb_pred, currMB, pl, 12, 12);
push 12 ; 0000000cH
push 12 ; 0000000cH
push esi
add ebx, 480 ; 000001e0H
push ebp
push ebx
mov eax, edi
call _itrans_sp
mov ebp, DWORD PTR _currSlice$[esp+100]
add esp, 80 ; 00000050H
jmp COPY_16x16
$LN4@iMBtrans4x:
; 433 : }
; 434 : else if (currMB->is_lossless)
cmp DWORD PTR [edi+84], 0
je $LN2@iMBtrans4x
push 0
push 0
; 435 : {
; 436 : Inv_Residual_trans_4x4(currMB, pl, 0, 0);
push esi
push edi
call _Inv_Residual_trans_4x4
; 437 : Inv_Residual_trans_4x4(currMB, pl, 4, 0);
push 0
push 4
push esi
push edi
call _Inv_Residual_trans_4x4
; 438 : Inv_Residual_trans_4x4(currMB, pl, 0, 4);
push 4
push 0
push esi
push edi
call _Inv_Residual_trans_4x4
; 439 : Inv_Residual_trans_4x4(currMB, pl, 4, 4);
push 4
push 4
push esi
push edi
call _Inv_Residual_trans_4x4
add esp, 64 ; 00000040H
; 440 : Inv_Residual_trans_4x4(currMB, pl, 8, 0);
push 0
push 8
push esi
push edi
call _Inv_Residual_trans_4x4
; 441 : Inv_Residual_trans_4x4(currMB, pl, 12, 0);
push 0
push 12 ; 0000000cH
push esi
push edi
call _Inv_Residual_trans_4x4
; 442 : Inv_Residual_trans_4x4(currMB, pl, 8, 4);
push 4
push 8
push esi
push edi
call _Inv_Residual_trans_4x4
; 443 : Inv_Residual_trans_4x4(currMB, pl, 12, 4);
push 4
push 12 ; 0000000cH
push esi
push edi
call _Inv_Residual_trans_4x4
add esp, 64 ; 00000040H
; 444 : Inv_Residual_trans_4x4(currMB, pl, 0, 8);
push 8
push 0
push esi
push edi
call _Inv_Residual_trans_4x4
; 445 : Inv_Residual_trans_4x4(currMB, pl, 4, 8);
push 8
push 4
push esi
push edi
call _Inv_Residual_trans_4x4
; 446 : Inv_Residual_trans_4x4(currMB, pl, 0, 12);
push 12 ; 0000000cH
push 0
push esi
push edi
call _Inv_Residual_trans_4x4
; 447 : Inv_Residual_trans_4x4(currMB, pl, 4, 12);
push 12 ; 0000000cH
push 4
push esi
push edi
call _Inv_Residual_trans_4x4
add esp, 64 ; 00000040H
; 448 : Inv_Residual_trans_4x4(currMB, pl, 8, 8);
push 8
push 8
push esi
push edi
call _Inv_Residual_trans_4x4
; 449 : Inv_Residual_trans_4x4(currMB, pl, 12, 8);
push 8
push 12 ; 0000000cH
push esi
push edi
call _Inv_Residual_trans_4x4
; 450 : Inv_Residual_trans_4x4(currMB, pl, 8, 12);
push 12 ; 0000000cH
push 8
push esi
push edi
call _Inv_Residual_trans_4x4
; 451 : Inv_Residual_trans_4x4(currMB, pl, 12, 12);
push 12 ; 0000000cH
push 12 ; 0000000cH
push esi
push edi
call _Inv_Residual_trans_4x4
add esp, 64 ; 00000040H
; 452 : }
; 453 : else
jmp COPY_16x16
$LN2@iMBtrans4x:
; 454 : {
; 455 : const h264_short_block_t *blocks = currSlice->cof4[pl];
; 456 : const h264_imgpel_macroblock_row_t *mb_pred=currSlice->mb_pred[pl];
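; Blocks in cof4 are laid out in H.264 4x4 scan order (four 4x4 blocks per
; 8x8 quadrant), so sweeping the macroblock in raster order visits indices
; 0,1,4,5 / 2,3,6,7 / 8,9,12,13 / 10,11,14,15 -- hence the block offsets
; below (each 4x4 block of shorts is 32 bytes).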
mov edx, esi
mov ecx, esi
shl edx, 8
shl ecx, 9
lea eax, DWORD PTR [edx+ebp]
lea ebx, DWORD PTR [ecx+ebp+cof@Slice]
; 457 : h264_imgpel_macroblock_row_t *mb_rec = currSlice->mb_rec[pl];
; put things in registers that itrans4x4_mmx_direct wants
lea edx, [eax + mb_rec@Slice]; mb_rec
lea ecx, [eax + mb_pred@Slice] ; mb_pred
mov eax, ebx ; blocks
call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[0], mb_pred, mb_rec, 0, 0);
lea edx, [edx+4]
lea ecx, [ecx+4]
lea eax, [ebx+32]
call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[1], mb_pred, mb_rec, 4, 0);
lea edx, [edx+4]
lea ecx, [ecx+4]
lea eax, [ebx+128]
call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[4], mb_pred, mb_rec, 8, 0);
lea edx, [edx+4]
lea ecx, [ecx+4]
lea eax, [ebx+160]
call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[5], mb_pred, mb_rec, 12, 0);
; second row
lea edx, [edx+52]
lea ecx, [ecx+52]
lea eax, [ebx+64]
call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[2], mb_pred, mb_rec, 0, 4);
lea edx, [edx+4]
lea ecx, [ecx+4]
lea eax, [ebx+96]
call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[3], mb_pred, mb_rec, 4, 4);
lea edx, [edx+4]
lea ecx, [ecx+4]
lea eax, [ebx+192]
call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[6], mb_pred, mb_rec, 8, 4);
lea edx, [edx+4]
lea ecx, [ecx+4]
lea eax, [ebx+224]
call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[7], mb_pred, mb_rec, 12, 4);
; third row
lea edx, [edx+52]
lea ecx, [ecx+52]
lea eax, [ebx+256]
call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[8], mb_pred, mb_rec, 0, 8);
lea edx, [edx+4]
lea ecx, [ecx+4]
lea eax, [ebx+288]
call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[9], mb_pred, mb_rec, 4, 8);
lea edx, [edx+4]
lea ecx, [ecx+4]
lea eax, [ebx+384]
call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[12], mb_pred, mb_rec, 8, 8);
lea edx, [edx+4]
lea ecx, [ecx+4]
lea eax, [ebx+416]
call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[13], mb_pred, mb_rec, 12, 8);
; fourth row
lea edx, [edx+52]
lea ecx, [ecx+52]
lea eax, [ebx+320]
call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[10], mb_pred, mb_rec, 0, 12);
lea edx, [edx+4]
lea ecx, [ecx+4]
lea eax, [ebx+352]
call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[11], mb_pred, mb_rec, 4, 12);
lea edx, [edx+4]
lea ecx, [ecx+4]
lea eax, [ebx+448]
call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[14], mb_pred, mb_rec, 8, 12);
lea edx, [edx+4]
lea ecx, [ecx+4]
lea eax, [ebx+480]
call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[15], mb_pred, mb_rec, 12, 12);
COPY_16x16:
; construct picture from 4x4 blocks
; opt_copy_image_data_16x16_stride(curr_img, currMB->pix_x, currMB->pix_y, currSlice->mb_rec[pl]);
mov eax, DWORD PTR [edi+40]
mov ecx, DWORD PTR [edi+36]
shl esi, 8
lea edx, DWORD PTR [esi+ebp+mb_rec@Slice]
push edx
mov edx, DWORD PTR _curr_img$[esp+20]
push eax
push ecx
push edx
call DWORD PTR _opt+copy_image_data_16x16_stride@OptimizedFunctions
add esp, 16 ; 00000010H
pop edi
pop esi
pop ebp
pop ebx
pop ecx
ret 0
_iMBtrans4x4 ENDP
_TEXT ENDS
_TEXT SEGMENT
ALIGN 2
PUBLIC _itrans8x8_sse2
_itrans8x8_sse2 PROC NEAR
; parameter 1(mb_rec): 8 + ebp
; parameter 2(mb_pred): 12 + ebp
; parameter 3(block): 16 + ebp
; parameter 4(pos_x): 20 + ebp
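; The ";;;" lines below appear to be the original SSE2 intrinsics source
; carried along by the compiler listing. Rows of mb_rec and mb_pred are 16
; bytes apart; only the 8 pixels starting at pos_x in each row are touched.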
push ebp
mov ebp, esp
and esp, -16
sub esp, 176
mov edx, DWORD PTR [ebp+20]
mov ecx, DWORD PTR [ebp+8] ; ecx: mb_rec
add ecx, edx
add edx, DWORD PTR [ebp+12] ; edx: mb_pred
mov eax, DWORD PTR [ebp+16] ; eax: block
;;; __m128i a0, a1, a2, a3;
;;; __m128i p0, p1, p2, p3, p4, p5 ,p6, p7;
;;; __m128i b0, b1, b2, b3, b4, b5, b6, b7;
;;; __m128i r0, r1, r2, r3, r4, r5, r6, r7;
;;; __m128i const32, zero;
;;; __declspec(align(32)) static const int16_t c32[8] = {32, 32, 32, 32, 32, 32, 32, 32};
;;; __m128i pred0, pred1;
;;;
;;; const32 = _mm_load_si128((const __m128i *)c32);
movdqa xmm0, XMMWORD PTR const32
;;; zero = _mm_setzero_si128();
;;;
;;; // Horizontal
;;; b0 = _mm_load_si128((__m128i *)(block[0]));
movdqa xmm4, XMMWORD PTR [eax]
;;; b1 = _mm_load_si128((__m128i *)(block[1]));
movdqa xmm7, XMMWORD PTR [eax+16]
;;; b2 = _mm_load_si128((__m128i *)(block[2]));
movdqa xmm5, XMMWORD PTR [eax+32]
;;; b3 = _mm_load_si128((__m128i *)(block[3]));
movdqa xmm3, XMMWORD PTR [eax+48]
;;; b4 = _mm_load_si128((__m128i *)(block[4]));
movdqa xmm6, XMMWORD PTR [eax+64]
;;; b5 = _mm_load_si128((__m128i *)(block[5]));
;;; b6 = _mm_load_si128((__m128i *)(block[6]));
movdqa xmm1, XMMWORD PTR [eax+96]
;;; b7 = _mm_load_si128((__m128i *)(block[7]));
movdqa xmm2, XMMWORD PTR [eax+112]
movdqa XMMWORD PTR [esp], xmm0
movdqa xmm0, XMMWORD PTR [eax+80]
movdqa XMMWORD PTR [esp+16], xmm2
;;;
;;; /* rotate 8x8 (ugh) */
;;; r0 = _mm_unpacklo_epi16(b0, b2);
movdqa xmm2, xmm4
punpcklwd xmm2, xmm5
;;; r1 = _mm_unpacklo_epi16(b1, b3);
;;; r2 = _mm_unpackhi_epi16(b0, b2);
punpckhwd xmm4, xmm5
;;; r3 = _mm_unpackhi_epi16(b1, b3);
;;; r4 = _mm_unpacklo_epi16(b4, b6);
;;; r5 = _mm_unpacklo_epi16(b5, b7);
movdqa xmm5, xmm0
movdqa XMMWORD PTR [esp+32], xmm2
movdqa xmm2, xmm7
punpcklwd xmm2, xmm3
punpckhwd xmm7, xmm3
movdqa xmm3, xmm6
punpcklwd xmm3, xmm1
movdqa XMMWORD PTR [esp+48], xmm3
movdqa xmm3, XMMWORD PTR [esp+16]
punpcklwd xmm5, xmm3
;;; r6 = _mm_unpackhi_epi16(b4, b6);
punpckhwd xmm6, xmm1
;;; r7 = _mm_unpackhi_epi16(b5, b7);
punpckhwd xmm0, xmm3
;;;
;;; b0 = _mm_unpacklo_epi16(r0, r1);
movdqa xmm3, XMMWORD PTR [esp+32]
movdqa xmm1, xmm3
punpcklwd xmm1, xmm2
;;; b1 = _mm_unpackhi_epi16(r0, r1);
punpckhwd xmm3, xmm2
;;; b2 = _mm_unpacklo_epi16(r2, r3);
movdqa xmm2, xmm4
punpcklwd xmm2, xmm7
;;; b3 = _mm_unpackhi_epi16(r2, r3);
punpckhwd xmm4, xmm7
movdqa XMMWORD PTR [esp+64], xmm4
;;; b4 = _mm_unpacklo_epi16(r4, r5);
movdqa xmm4, XMMWORD PTR [esp+48]
movdqa xmm7, xmm4
punpcklwd xmm7, xmm5
;;; b5 = _mm_unpackhi_epi16(r4, r5);
punpckhwd xmm4, xmm5
;;; b6 = _mm_unpacklo_epi16(r6, r7);
movdqa xmm5, xmm6
punpcklwd xmm5, xmm0
;;; b7 = _mm_unpackhi_epi16(r6, r7);
punpckhwd xmm6, xmm0
;;;
;;; p0 = _mm_unpacklo_epi64(b0, b4);
movdqa xmm0, xmm1
punpcklqdq xmm0, xmm7
;;; p1 = _mm_unpackhi_epi64(b0, b4);
punpckhqdq xmm1, xmm7
movdqa XMMWORD PTR [esp+16], xmm1
;;; p2 = _mm_unpacklo_epi64(b1, b5);
movdqa xmm1, xmm3
punpcklqdq xmm1, xmm4
;;; p3 = _mm_unpackhi_epi64(b1, b5);
;;; p4 = _mm_unpacklo_epi64(b2, b6);
;;; p5 = _mm_unpackhi_epi64(b2, b6);
;;; p6 = _mm_unpacklo_epi64(b3, b7);
;;; p7 = _mm_unpackhi_epi64(b3, b7);
;;;
;;; /* perform approx DCT */
;;; a0 = _mm_add_epi16(p0, p4); // p0 + p4
;;; a1 = _mm_sub_epi16(p0, p4); // p0 - p4
;;; r0 = _mm_srai_epi16(p2, 1); // p2 >> 1
movdqa xmm7, xmm1
psraw xmm7, 1
punpckhqdq xmm3, xmm4
movdqa XMMWORD PTR [esp+32], xmm3
movdqa xmm3, xmm2
punpcklqdq xmm3, xmm5
punpckhqdq xmm2, xmm5
movdqa xmm5, XMMWORD PTR [esp+64]
movdqa xmm4, xmm5
punpcklqdq xmm4, xmm6
punpckhqdq xmm5, xmm6
movdqa xmm6, xmm0
paddw xmm6, xmm3
psubw xmm0, xmm3
;;; a2 = _mm_sub_epi16(p6, r0); // p6 - (p2 >> 1)
movdqa xmm3, xmm4
;;; r0 = _mm_srai_epi16(p6, 1); // p6 >> 1
psraw xmm4, 1
psubw xmm3, xmm7
;;; a3 = _mm_add_epi16(p2, r0); //p2 + (p6 >> 1)
paddw xmm1, xmm4
;;;
;;; b0 = _mm_add_epi16(a0, a3); // a0 + a3;
movdqa xmm4, xmm6
;;; b2 = _mm_sub_epi16(a1, a2); // a1 - a2;
movdqa xmm7, xmm0
paddw xmm4, xmm1
psubw xmm7, xmm3
movdqa XMMWORD PTR [esp+48], xmm7
;;; b4 = _mm_add_epi16(a1, a2); // a1 + a2;
paddw xmm0, xmm3
movdqa XMMWORD PTR [esp+80], xmm0
;;; b6 = _mm_sub_epi16(a0, a3); // a0 - a3;
;;;
;;; //-p3 + p5 - p7 - (p7 >> 1);
;;; r0 = _mm_srai_epi16(p7, 1); // p7 >> 1
;;; a0 = _mm_sub_epi16(p5, p3); // p5 - p3
movdqa xmm0, XMMWORD PTR [esp+32]
psubw xmm6, xmm1
movdqa xmm1, xmm5
psraw xmm1, 1
movdqa xmm3, xmm2
;;; a0 = _mm_sub_epi16(a0, p7); // (-p3 + p5) - p7
;;; a0 = _mm_sub_epi16(a0, r0); // (-p3 + p5 - p7) - (p7 >> 1)
;;;
;;; //p1 + p7 - p3 - (p3 >> 1);
;;; r0 = _mm_srai_epi16(p3, 1); // (p3 >> 1)
movdqa xmm7, xmm0
movdqa XMMWORD PTR [esp+96], xmm6
;;; a1 = _mm_add_epi16(p1, p7); // p1 + p7
movdqa xmm6, XMMWORD PTR [esp+16]
psubw xmm3, xmm0
psubw xmm3, xmm5
psraw xmm7, 1
psubw xmm3, xmm1
movdqa xmm1, xmm6
paddw xmm1, xmm5
;;; a1 = _mm_sub_epi16(a1, p3); // (p1 + p7) - p3
psubw xmm1, xmm0
;;; a1 = _mm_sub_epi16(a1, r0); // (p1 + p7 - p3) - (p3>>1)
psubw xmm1, xmm7
;;;
;;; // -p1 + p7 + p5 + (p5 >> 1);
;;; r0 = _mm_srai_epi16(p5, 1); // (p5 >> 1)
movdqa xmm7, xmm2
psraw xmm7, 1
;;; a2 = _mm_sub_epi16(p7, p1); // p7 - p1
psubw xmm5, xmm6
;;; a2 = _mm_add_epi16(a2, p5); // -p1 + p7 + p5
paddw xmm5, xmm2
;;; a2 = _mm_add_epi16(a2, r0); // (-p1 + p7 + p5) + (p5 >> 1)
paddw xmm5, xmm7
;;;
;;; // p3 + p5 + p1 + (p1 >> 1);
;;; a3 = _mm_add_epi16(p3, p5); // p3+p5
paddw xmm0, xmm2
;;; a3 = _mm_add_epi16(a3, p1); // p3 + p5 + p1
;;; p1 = _mm_srai_epi16(p1, 1); // p1 >> 1
;;; a3 = _mm_add_epi16(a3, p1); //p3 + p5 + p1 + (p1 >> 1)
;;;
;;; r0 = _mm_srai_epi16(a3, 2); // a3>>2
;;; b1 = _mm_add_epi16(a0, r0); //a0 + (a3>>2);
;;; r0 = _mm_srai_epi16(a2, 2); // a2>>2
;;; b3 = _mm_add_epi16(a1, r0); // a1 + (a2>>2);
;;; a1 = _mm_srai_epi16(a1, 2); // all done with a1, so this is safe
;;; b5 = _mm_sub_epi16(a2, a1); //a2 - (a1>>2);
;;; a0 = _mm_srai_epi16(a0, 2); // all done with a0, so this is safe
;;; b7 = _mm_sub_epi16(a3, a0); //a3 - (a0>>2);
;;;
;;; p0 = _mm_add_epi16(b0, b7); // b0 + b7;
;;; p1 = _mm_sub_epi16(b2, b5); // b2 - b5;
;;; p2 = _mm_add_epi16(b4, b3); // b4 + b3;
;;; p3 = _mm_add_epi16(b6, b1); // b6 + b1;
movdqa xmm2, XMMWORD PTR [esp+96]
paddw xmm0, xmm6
psraw xmm6, 1
paddw xmm0, xmm6
movdqa xmm7, xmm0
movdqa xmm6, xmm5
psraw xmm7, 2
paddw xmm7, xmm3
psraw xmm6, 2
paddw xmm6, xmm1
psraw xmm1, 2
psubw xmm5, xmm1
movdqa xmm1, xmm4
psraw xmm3, 2
psubw xmm0, xmm3
movdqa xmm3, XMMWORD PTR [esp+80]
movdqa XMMWORD PTR [esp+32], xmm0
;;; p4 = _mm_sub_epi16(b6, b1); // b6 - b1;
;;; p5 = _mm_sub_epi16(b4, b3); // b4 - b3;
;;; p6 = _mm_add_epi16(b2, b5); // b2 + b5;
;;; p7 = _mm_sub_epi16(b0, b7); // b0 - b7;
psubw xmm4, XMMWORD PTR [esp+32]
paddw xmm1, xmm0
movdqa XMMWORD PTR [esp+112], xmm1
movdqa xmm1, XMMWORD PTR [esp+48]
movdqa xmm0, xmm1
psubw xmm0, xmm5
movdqa XMMWORD PTR [esp+16], xmm0
movdqa xmm0, xmm3
paddw xmm0, xmm6
psubw xmm3, xmm6
movdqa XMMWORD PTR [esp+128], xmm0
;;;
;;; /* rotate 8x8 (ugh) */
;;; r0 = _mm_unpacklo_epi16(p0, p2);
movdqa xmm6, XMMWORD PTR [esp+128]
movdqa xmm0, xmm2
paddw xmm0, xmm7
psubw xmm2, xmm7
paddw xmm1, xmm5
movdqa xmm5, XMMWORD PTR [esp+112]
movdqa XMMWORD PTR [esp+144], xmm4
movdqa xmm4, xmm5
punpcklwd xmm4, xmm6
;;; r1 = _mm_unpacklo_epi16(p1, p3);
;;; r2 = _mm_unpackhi_epi16(p0, p2);
punpckhwd xmm5, xmm6
;;; r3 = _mm_unpackhi_epi16(p1, p3);
;;; r4 = _mm_unpacklo_epi16(p4, p6);
;;; r5 = _mm_unpacklo_epi16(p5, p7);
movdqa xmm6, xmm3
movdqa XMMWORD PTR [esp+64], xmm4
movdqa xmm4, XMMWORD PTR [esp+16]
movdqa xmm7, xmm4
punpcklwd xmm7, xmm0
punpckhwd xmm4, xmm0
movdqa xmm0, xmm2
punpcklwd xmm0, xmm1
movdqa XMMWORD PTR [esp+128], xmm0
movdqa xmm0, XMMWORD PTR [esp+144]
punpcklwd xmm6, xmm0
;;; r6 = _mm_unpackhi_epi16(p4, p6);
punpckhwd xmm2, xmm1
;;; r7 = _mm_unpackhi_epi16(p5, p7);
;;;
;;; b0 = _mm_unpacklo_epi16(r0, r1);
movdqa xmm1, XMMWORD PTR [esp+64]
punpckhwd xmm3, xmm0
movdqa xmm0, xmm1
punpcklwd xmm0, xmm7
;;; b1 = _mm_unpackhi_epi16(r0, r1);
punpckhwd xmm1, xmm7
;;; b2 = _mm_unpacklo_epi16(r2, r3);
movdqa xmm7, xmm5
punpcklwd xmm7, xmm4
;;; b3 = _mm_unpackhi_epi16(r2, r3);
punpckhwd xmm5, xmm4
movdqa XMMWORD PTR [esp+112], xmm5
;;; b4 = _mm_unpacklo_epi16(r4, r5);
movdqa xmm5, XMMWORD PTR [esp+128]
movdqa xmm4, xmm5
punpcklwd xmm4, xmm6
;;; b5 = _mm_unpackhi_epi16(r4, r5);
punpckhwd xmm5, xmm6
;;; b6 = _mm_unpacklo_epi16(r6, r7);
movdqa xmm6, xmm2
punpcklwd xmm6, xmm3
;;; b7 = _mm_unpackhi_epi16(r6, r7);
punpckhwd xmm2, xmm3
;;;
;;; p0 = _mm_unpacklo_epi64(b0, b4);
movdqa xmm3, xmm0
punpcklqdq xmm3, xmm4
;;; p1 = _mm_unpackhi_epi64(b0, b4);
punpckhqdq xmm0, xmm4
movdqa XMMWORD PTR [esp+144], xmm0
;;; p2 = _mm_unpacklo_epi64(b1, b5);
;;; p3 = _mm_unpackhi_epi64(b1, b5);
;;; p4 = _mm_unpacklo_epi64(b2, b6);
;;; p5 = _mm_unpackhi_epi64(b2, b6);
;;; p6 = _mm_unpacklo_epi64(b3, b7);
movdqa xmm0, XMMWORD PTR [esp+112]
movdqa xmm4, xmm1
punpcklqdq xmm4, xmm5
punpckhqdq xmm1, xmm5
movdqa XMMWORD PTR [esp+64], xmm1
movdqa xmm1, xmm7
movdqa xmm5, xmm0
punpcklqdq xmm1, xmm6
punpckhqdq xmm7, xmm6
;;; p7 = _mm_unpackhi_epi64(b3, b7);
;;;
;;;
;;; /* Vertical */
;;;
;;; a0 = _mm_add_epi16(p0, p4); // p0 + p4
;;; a1 = _mm_sub_epi16(p0, p4); // p0 - p4
;;; r0 = _mm_srai_epi16(p2, 1); // p2 >> 1
movdqa xmm6, xmm4
psraw xmm6, 1
punpcklqdq xmm5, xmm2
punpckhqdq xmm0, xmm2
movdqa xmm2, xmm3
paddw xmm2, xmm1
psubw xmm3, xmm1
;;; a2 = _mm_sub_epi16(p6, r0); // p6 - (p2 >> 1)
movdqa xmm1, xmm5
;;; r0 = _mm_srai_epi16(p6, 1); // p6 >> 1
psraw xmm5, 1
psubw xmm1, xmm6
;;; a3 = _mm_add_epi16(p2, r0); //p2 + (p6 >> 1)
paddw xmm4, xmm5
;;;
;;; b0 = _mm_add_epi16(a0, a3); // a0 + a3;
movdqa xmm5, xmm2
;;; b2 = _mm_sub_epi16(a1, a2); // a1 - a2;
movdqa xmm6, xmm3
paddw xmm5, xmm4
psubw xmm6, xmm1
movdqa XMMWORD PTR [esp+128], xmm6
;;; b4 = _mm_add_epi16(a1, a2); // a1 + a2;
;;; b6 = _mm_sub_epi16(a0, a3); // a0 - a3;
;;;
;;; //-p3 + p5 - p7 - (p7 >> 1);
;;; r0 = _mm_srai_epi16(p7, 1); // p7 >> 1
;;; a0 = _mm_sub_epi16(p5, p3); // p5 - p3
movdqa xmm6, XMMWORD PTR [esp+64]
paddw xmm3, xmm1
movdqa XMMWORD PTR [esp+80], xmm3
psubw xmm2, xmm4
movdqa xmm1, xmm0
psraw xmm1, 1
movdqa xmm3, xmm7
movdqa XMMWORD PTR [esp+96], xmm2
psubw xmm3, xmm6
;;; a0 = _mm_sub_epi16(a0, p7); // (-p3 + p5) - p7
psubw xmm3, xmm0
;;; a0 = _mm_sub_epi16(a0, r0); // (-p3 + p5 - p7) - (p7 >> 1)
;;;
;;; //p1 + p7 - p3 - (p3 >> 1);
;;; r0 = _mm_srai_epi16(p3, 1); // (p3 >> 1)
movdqa xmm2, xmm6
psraw xmm2, 1
psubw xmm3, xmm1
;;; a1 = _mm_add_epi16(p1, p7); // p1 + p7
movdqa xmm1, XMMWORD PTR [esp+144]
movdqa xmm4, xmm1
paddw xmm4, xmm0
;;; a1 = _mm_sub_epi16(a1, p3); // (p1 + p7) - p3
psubw xmm4, xmm6
;;; a1 = _mm_sub_epi16(a1, r0); // (p1 + p7 - p3) - (p3>>1)
psubw xmm4, xmm2
;;;
;;; // -p1 + p7 + p5 + (p5 >> 1);
;;; r0 = _mm_srai_epi16(p5, 1); // (p5 >> 1)
movdqa xmm2, xmm7
psraw xmm2, 1
;;; a2 = _mm_sub_epi16(p7, p1); // p7 - p1
psubw xmm0, xmm1
;;; a2 = _mm_add_epi16(a2, p5); // -p1 + p7 + p5
paddw xmm0, xmm7
;;; a2 = _mm_add_epi16(a2, r0); // (-p1 + p7 + p5) + (p5 >> 1)
paddw xmm0, xmm2
;;;
;;; // p3 + p5 + p1 + (p1 >> 1);
;;; r0 = _mm_srai_epi16(p1, 1); // p1 >> 1
movdqa xmm2, xmm1
psraw xmm2, 1
;;; a3 = _mm_add_epi16(p3, p5); // p3+p5
paddw xmm6, xmm7
;;; a3 = _mm_add_epi16(a3, p1); // p3 + p5 + p1
;;; a3 = _mm_add_epi16(a3, r0); //p3 + p5 + p1 + (p1 >> 1)
;;;
;;; r0 = _mm_srai_epi16(a3, 2); // a3>>2
;;; b1 = _mm_add_epi16(a0, r0); //a0 + (a3>>2);
;;; r0 = _mm_srai_epi16(a2, 2); // a2>>2
;;; b3 = _mm_add_epi16(a1, r0); // a1 + (a2>>2);
;;; a1 = _mm_srai_epi16(a1, 2); // all done with a1, so this is safe
;;; b5 = _mm_sub_epi16(a2, a1); //a2 - (a1>>2);
;;; a0 = _mm_srai_epi16(a0, 2); // all done with a0, so this is safe
;;; b7 = _mm_sub_epi16(a3, a0); //a3 - (a0>>2);
;;;
;;; r0 = _mm_add_epi16(b0, b7); // b0 + b7;
;;; r1 = _mm_sub_epi16(b2, b5); // b2 - b5;
movdqa xmm7, XMMWORD PTR [esp+128]
paddw xmm6, xmm1
paddw xmm6, xmm2
movdqa xmm1, xmm6
psraw xmm1, 2
movdqa xmm2, xmm0
paddw xmm1, xmm3
psraw xmm2, 2
paddw xmm2, xmm4
psraw xmm4, 2
psubw xmm0, xmm4
psraw xmm3, 2
psubw xmm6, xmm3
movdqa XMMWORD PTR [esp+64], xmm6
movdqa xmm3, xmm5
;;; r2 = _mm_add_epi16(b4, b3); // b4 + b3;
;;; r3 = _mm_add_epi16(b6, b1); // b6 + b1;
;;; r4 = _mm_sub_epi16(b6, b1); // b6 - b1;
;;; r5 = _mm_sub_epi16(b4, b3); // b4 - b3;
;;; r6 = _mm_add_epi16(b2, b5); // b2 + b5;
;;; r7 = _mm_sub_epi16(b0, b7); // b0 - b7;
psubw xmm5, XMMWORD PTR [esp+64]
paddw xmm3, xmm6
movdqa XMMWORD PTR [esp+144], xmm3
movdqa xmm3, xmm7
psubw xmm3, xmm0
movdqa XMMWORD PTR [esp+48], xmm3
movdqa xmm3, XMMWORD PTR [esp+80]
movdqa xmm4, xmm3
paddw xmm4, xmm2
psubw xmm3, xmm2
;;;
;;;
;;; // add in prediction values
;;; pred0 = _mm_loadl_epi64((__m128i *)(&mb_pred[0][pos_x]));
;;; pred1 = _mm_loadl_epi64((__m128i *)(&mb_pred[1][pos_x]));
;;; // (x + 32) >> 6
;;; r0 = _mm_adds_epi16(r0, const32);
movdqa xmm2, XMMWORD PTR const32
movdqa XMMWORD PTR [esp+16], xmm4
movdqa xmm4, XMMWORD PTR [esp+96]
movdqa xmm6, xmm4
paddw xmm6, xmm1
psubw xmm4, xmm1
;;; r0 = _mm_srai_epi16(r0, 6);
;;; r1 = _mm_adds_epi16(r1, const32);
movdqa xmm1, XMMWORD PTR [esp+48]
paddw xmm7, xmm0
movdqa xmm0, XMMWORD PTR [esp+144]
movdqa XMMWORD PTR [esp+128], xmm7
;;; r1 = _mm_srai_epi16(r1, 6);
;;; pred0 = _mm_unpacklo_epi8(pred0, zero); // convert to short
;;; pred1 = _mm_unpacklo_epi8(pred1, zero); // convert to short
movq xmm7, QWORD PTR [edx+16]
movdqa XMMWORD PTR [esp+32], xmm5
paddsw xmm0, xmm2
psraw xmm0, 6
paddsw xmm1, xmm2
pxor xmm2, xmm2
punpcklbw xmm7, xmm2
movq xmm5, QWORD PTR [edx]
punpcklbw xmm5, xmm2
psraw xmm1, 6
;;; pred0 = _mm_adds_epi16(pred0, r0);
;;; pred1 = _mm_adds_epi16(pred1, r1);
paddsw xmm7, xmm1
paddsw xmm5, xmm0
;;;
;;; pred0 = _mm_packus_epi16(pred0, pred1); // convert to unsigned char
packuswb xmm5, xmm7
;;;
;;; // store
;;; _mm_storel_epi64((__m128i *)(&mb_rec[0][pos_x]), pred0);
movdqa xmm0, XMMWORD PTR [esp+32]
movdqa xmm2, XMMWORD PTR [esp+128]
movq QWORD PTR [ecx], xmm5
;;; // TODO: if mb_pred was converted to 4 8x8 blocks, we could store more easily.
;;; pred0 = _mm_srli_si128(pred0, 8);
psrldq xmm5, 8
;;; _mm_storel_epi64((__m128i *)(&mb_rec[1][pos_x]), pred0);
movq QWORD PTR [ecx+16], xmm5
;;;
;;; /* --- */
;;;
;;; pred0 = _mm_loadl_epi64((__m128i *)(&mb_pred[2][pos_x]));
movq xmm1, QWORD PTR [edx+32]
;;; pred1 = _mm_loadl_epi64((__m128i *)(&mb_pred[3][pos_x]));
;;; // (x + 32) >> 6
;;; r2 = _mm_adds_epi16(r2, const32);
movdqa xmm5, XMMWORD PTR [esp]
movdqa XMMWORD PTR [esp+32], xmm0 ;
;;; r2 = _mm_srai_epi16(r2, 6);
;;; r3 = _mm_adds_epi16(r3, const32);
paddsw xmm6, xmm5
;;; r3 = _mm_srai_epi16(r3, 6);
psraw xmm6, 6
;;; pred0 = _mm_unpacklo_epi8(pred0, zero); // convert to short
pxor xmm7, xmm7
punpcklbw xmm1, xmm7
movdqa xmm0, XMMWORD PTR [esp+16]
paddsw xmm0, xmm5
psraw xmm0, 6
;;; pred1 = _mm_unpacklo_epi8(pred1, zero); // convert to short
;;; pred0 = _mm_adds_epi16(pred0, r2);
paddsw xmm1, xmm0
;;; pred1 = _mm_adds_epi16(pred1, r3);
;;;
;;; pred0 = _mm_packus_epi16(pred0, pred1); // convert to unsigned char
;;;
;;; // store
;;; _mm_storel_epi64((__m128i *)(&mb_rec[2][pos_x]), pred0);
movdqa xmm0, XMMWORD PTR [esp+32]
movq xmm5, QWORD PTR [edx+48]
punpcklbw xmm5, xmm7
paddsw xmm5, xmm6
packuswb xmm1, xmm5
movq QWORD PTR [ecx+32], xmm1
;;; // TODO: if mb_pred was converted to 4 8x8 blocks, we could store more easily.
;;; pred0 = _mm_srli_si128(pred0, 8);
psrldq xmm1, 8
;;; _mm_storel_epi64((__m128i *)(&mb_rec[3][pos_x]), pred0);
movq QWORD PTR [ecx+48], xmm1
;;;
;;; /* --- */
;;;
;;; pred0 = _mm_loadl_epi64((__m128i *)(&mb_pred[4][pos_x]));
movq xmm7, QWORD PTR [edx+64]
;;; pred1 = _mm_loadl_epi64((__m128i *)(&mb_pred[5][pos_x]));
movq xmm6, QWORD PTR [edx+80]
;;; // (x + 32) >> 6
;;; r4 = _mm_adds_epi16(r4, const32);
;;; r4 = _mm_srai_epi16(r4, 6);
;;; r5 = _mm_adds_epi16(r5, const32);
;;; r5 = _mm_srai_epi16(r5, 6);
;;; pred0 = _mm_unpacklo_epi8(pred0, zero); // convert to short
pxor xmm5, xmm5
punpcklbw xmm7, xmm5
;;; pred1 = _mm_unpacklo_epi8(pred1, zero); // convert to short
punpcklbw xmm6, xmm5
movdqa xmm1, XMMWORD PTR [esp]
paddsw xmm4, xmm1
psraw xmm4, 6
paddsw xmm3, xmm1
psraw xmm3, 6
;;; pred0 = _mm_adds_epi16(pred0, r4);
paddsw xmm7, xmm4
;;; pred1 = _mm_adds_epi16(pred1, r5);
paddsw xmm6, xmm3
;;;
;;; pred0 = _mm_packus_epi16(pred0, pred1); // convert to unsigned char
packuswb xmm7, xmm6
;;;
;;; // store
;;; _mm_storel_epi64((__m128i *)(&mb_rec[4][pos_x]), pred0);
movq QWORD PTR [ecx+64], xmm7
;;; // TODO: if mb_pred was converted to 4 8x8 blocks, we could store more easily.
;;; pred0 = _mm_srli_si128(pred0, 8);
psrldq xmm7, 8
;;; _mm_storel_epi64((__m128i *)(&mb_rec[5][pos_x]), pred0);
movq QWORD PTR [ecx+80], xmm7
;;;
;;; /* --- */
;;;
;;; pred0 = _mm_loadl_epi64((__m128i *)(&mb_pred[6][pos_x]));
movq xmm5, QWORD PTR [edx+96]
;;; pred1 = _mm_loadl_epi64((__m128i *)(&mb_pred[7][pos_x]));
movq xmm4, QWORD PTR [edx+112]
;;; // (x + 32) >> 6
;;; r6 = _mm_adds_epi16(r6, const32);
;;; r6 = _mm_srai_epi16(r6, 6);
;;; r7 = _mm_adds_epi16(r7, const32);
;;; r7 = _mm_srai_epi16(r7, 6);
;;; pred0 = _mm_unpacklo_epi8(pred0, zero); // convert to short
pxor xmm3, xmm3
punpcklbw xmm5, xmm3
;;; pred1 = _mm_unpacklo_epi8(pred1, zero); // convert to short
punpcklbw xmm4, xmm3
movdqa xmm1, XMMWORD PTR [esp]
paddsw xmm2, xmm1
psraw xmm2, 6
paddsw xmm0, xmm1
psraw xmm0, 6
;;; pred0 = _mm_adds_epi16(pred0, r6);
paddsw xmm5, xmm2
;;; pred1 = _mm_adds_epi16(pred1, r7);
paddsw xmm4, xmm0
;;;
;;; pred0 = _mm_packus_epi16(pred0, pred1); // convert to unsigned char
packuswb xmm5, xmm4
;;;
;;; // store
;;; _mm_storel_epi64((__m128i *)&mb_rec[6][pos_x], pred0);
movq QWORD PTR [ecx+96], xmm5
;;; // TODO: if mb_pred was converted to 4 8x8 blocks, we could store more easily.
;;; pred0 = _mm_srli_si128(pred0, 8);
psrldq xmm5, 8
;;; _mm_storel_epi64((__m128i *)&mb_rec[7][pos_x], pred0);
movq QWORD PTR [ecx+112], xmm5
mov esp, ebp
pop ebp
ret
ALIGN 2
_itrans8x8_sse2 ENDP
END