; ----------------------------------------------------------------------
; H.264 decoder SIMD kernels (MASM, x86-32, flat model, MMX/SSE2)
; weighted bi-prediction, 4x4/8x8 inverse transforms, MB reconstruction
; ----------------------------------------------------------------------
.686
|
|
.XMM
|
|
.model FLAT
|
|
|
|
copy_image_data_16x16_stride@OptimizedFunctions = 32
|
|
dec_picture@VideoParameters = 698192
|
|
p_Slice@MacroBlock = 0
|
|
plane_images@StorablePicture = 158512
|
|
mb_rec@Slice = 1696
|
|
mb_pred@Slice = 928
|
|
cof@Slice = 2464
|
|
|
|
CONST SEGMENT
|
|
align 16
|
|
const32 DW 020H, 020H, 020H, 020H, 020H, 020H, 020H, 020H
|
|
CONST ENDS
|
|
|
|
;
|
|
;
|
|
;
|
|
;
|
|
|
|
PUBLIC _weighted_bi_prediction4x4
|
|
_TEXT SEGMENT
|
|
mb_pred = 4
|
|
block_l0 = 8
|
|
wp_scale_l0 = 12
|
|
wp_scale_l1 = 16
|
|
wp_offset = 20
|
|
weight_denom = 24
|
|
;-----------------------------------------------------------------------
; void weighted_bi_prediction4x4(imgpel *mb_pred, imgpel *block_l0,
;                                int wp_scale_l0, int wp_scale_l1,
;                                int wp_offset, int weight_denom)
; cdecl, x86-32, MMX + pshufw (SSE integer ext).
; In-place weighted bi-prediction of one 4x4 block (rows 16 bytes apart):
;   weight_denom > 0:
;     mb_pred[i] = clip255(((block_l0[i]*wp_scale_l0 + mb_pred[i]*wp_scale_l1
;                           + (1 << (weight_denom-1))) >> weight_denom) + wp_offset)
;   weight_denom <= 0:
;     mb_pred[i] = clip255(((block_l0[i]*wp_scale_l0 + mb_pred[i]*wp_scale_l1)
;                           << -weight_denom) + wp_offset)
; Clobbers: eax, ecx, edx, mm0-mm7, flags.  No emms here (caller's duty,
; consistent with the rest of this file).
;
; BUGFIX: row 2's result was stored to offset 0 in both paths (clobbering
; row 0 and leaving row 2 stale); it must go to offset 32.
;-----------------------------------------------------------------------
_weighted_bi_prediction4x4 PROC ; COMDAT
mov eax, DWORD PTR weight_denom[esp]
pxor mm0, mm0 ; mm0 = 0 (byte->word unpack helper)
pshufw mm1, MMWORD PTR wp_scale_l0[esp], 0 ; mm1 = wp_scale_l0 in all 4 words
test eax, eax
pshufw mm2, MMWORD PTR wp_scale_l1[esp], 0 ; mm2 = wp_scale_l1 in all 4 words
pshufw mm3, MMWORD PTR wp_offset[esp], 0 ; mm3 = wp_offset in all 4 words
jle BI_PRED4x4@LEFT_SHIFT ; denom <= 0 -> left-shift path

movd mm4, eax ; mm4 = weight_denom (psraw count)
lea ecx, DWORD PTR [eax-1]
mov edx, 1
shl edx, cl ; edx = 1 << (denom-1), rounding term
movd mm5, edx
mov eax, mb_pred[esp]
mov edx, block_l0[esp]
pshufw mm5, mm5, 0 ; broadcast rounding term

; row 0
movd mm6, DWORD PTR 0[edx] ; block_l0
movd mm7, DWORD PTR 0[eax] ; mb_pred
punpcklbw mm6, mm0
punpcklbw mm7, mm0
pmullw mm6, mm1
pmullw mm7, mm2
paddw mm6, mm7
movd mm7, DWORD PTR 16[eax] ; early-load mb_pred row 1
paddw mm6, mm5
psraw mm6, mm4
paddw mm6, mm3
packuswb mm6, mm6
movd DWORD PTR 0[eax], mm6

; row 1
movd mm6, DWORD PTR 16[edx] ; block_l0
punpcklbw mm6, mm0
punpcklbw mm7, mm0
pmullw mm6, mm1
pmullw mm7, mm2
paddw mm6, mm7
movd mm7, DWORD PTR 32[eax] ; early-load mb_pred row 2
paddw mm6, mm5
psraw mm6, mm4
paddw mm6, mm3
packuswb mm6, mm6
movd DWORD PTR 16[eax], mm6

; row 2
movd mm6, DWORD PTR 32[edx] ; block_l0
punpcklbw mm6, mm0
punpcklbw mm7, mm0
pmullw mm6, mm1
pmullw mm7, mm2
paddw mm6, mm7
movd mm7, DWORD PTR 48[eax] ; early-load mb_pred row 3
paddw mm6, mm5
psraw mm6, mm4
paddw mm6, mm3
packuswb mm6, mm6
movd DWORD PTR 32[eax], mm6 ; BUGFIX: was 0[eax]

; row 3
movd mm6, DWORD PTR 48[edx] ; block_l0
punpcklbw mm6, mm0
punpcklbw mm7, mm0
pmullw mm6, mm1
pmullw mm7, mm2
paddw mm6, mm7
paddw mm6, mm5
psraw mm6, mm4
paddw mm6, mm3
packuswb mm6, mm6
movd DWORD PTR 48[eax], mm6
ret 0

BI_PRED4x4@LEFT_SHIFT:
neg eax ; shift amount = -weight_denom >= 0
movd mm4, eax ; mm4 = psllw count
mov eax, mb_pred[esp]
mov edx, block_l0[esp]

; row 0
movd mm6, DWORD PTR 0[edx] ; block_l0
movd mm7, DWORD PTR 0[eax] ; mb_pred
punpcklbw mm6, mm0
punpcklbw mm7, mm0
pmullw mm6, mm1
pmullw mm7, mm2
paddw mm6, mm7
movd mm7, DWORD PTR 16[eax] ; early-load mb_pred row 1
psllw mm6, mm4
paddw mm6, mm3
packuswb mm6, mm6
movd DWORD PTR 0[eax], mm6

; row 1
movd mm6, DWORD PTR 16[edx] ; block_l0
punpcklbw mm6, mm0
punpcklbw mm7, mm0
pmullw mm6, mm1
pmullw mm7, mm2
paddw mm6, mm7
movd mm7, DWORD PTR 32[eax] ; early-load mb_pred row 2
psllw mm6, mm4
paddw mm6, mm3
packuswb mm6, mm6
movd DWORD PTR 16[eax], mm6

; row 2
movd mm6, DWORD PTR 32[edx] ; block_l0
punpcklbw mm6, mm0
punpcklbw mm7, mm0
pmullw mm6, mm1
pmullw mm7, mm2
paddw mm6, mm7
movd mm7, DWORD PTR 48[eax] ; early-load mb_pred row 3
psllw mm6, mm4
paddw mm6, mm3
packuswb mm6, mm6
movd DWORD PTR 32[eax], mm6 ; BUGFIX: was 0[eax]

; row 3
movd mm6, DWORD PTR 48[edx] ; block_l0
punpcklbw mm6, mm0
punpcklbw mm7, mm0
pmullw mm6, mm1
pmullw mm7, mm2
paddw mm6, mm7
psllw mm6, mm4
paddw mm6, mm3
packuswb mm6, mm6
movd DWORD PTR 48[eax], mm6
ret 0
_weighted_bi_prediction4x4 ENDP
|
|
_TEXT ENDS
|
|
|
|
PUBLIC _itrans4x4_mmx
|
|
_TEXT SEGMENT
|
|
_tblock$ = 4 ; size = 4
|
|
_mb_pred$ = 8 ; size = 4
|
|
_mb_rec$ = 12 ; size = 4
|
|
_pos_x$ = 16 ; size = 4
|
|
_pos_y$ = 20 ; size = 4
|
|
;-----------------------------------------------------------------------
; void itrans4x4_mmx(int16_t *tblock, imgpel *mb_pred, imgpel *mb_rec,
;                    int pos_x, int pos_y)   (cdecl)
; H.264 4x4 inverse core transform (MMX) plus reconstruction:
;   mb_rec[..] = clip255(mb_pred[..] + ((idct4x4(tblock) + 32) >> 6))
; The wrapper computes the byte offset pos_y*16 + pos_x (both pixel
; buffers have a 16-byte row stride — TODO confirm against the struct
; layout) and FALLS THROUGH into the register-based entry
; _itrans4x4_mmx_direct:
;   eax = tblock (int16 rows 8 bytes apart)
;   ecx = mb_pred + offset
;   edx = mb_rec  + offset
; Clobbers: eax..edx, mm0-mm7, flags.  No emms here (caller's duty).
;-----------------------------------------------------------------------
_itrans4x4_mmx PROC ; COMDAT

mov edx, DWORD PTR _pos_y$[esp]
shl edx, 4 ; pos_y * 16 (row stride)
add edx, DWORD PTR _pos_x$[esp] ; + pos_x -> byte offset into MB
mov eax, DWORD PTR _tblock$[esp]
mov ecx, DWORD PTR _mb_pred$[esp]
add ecx, edx ; ecx = &mb_pred[pos_y][pos_x]
add edx, DWORD PTR _mb_rec$[esp] ; edx = &mb_rec[pos_y][pos_x]
; fall through into the core (no ret, no jump)
_itrans4x4_mmx_direct PROC ; COMDAT
; load 4x4 matrix (one 4-word row per register)
movq mm0, MMWORD PTR 0[eax]
movq mm1, MMWORD PTR 8[eax]
movq mm2, MMWORD PTR 16[eax]
movq mm3, MMWORD PTR 24[eax]

; rotate (transpose) 4x4 matrix: rows -> columns for horizontal pass
movq mm4, mm0 ; p0 = mm4 (copy)
punpcklwd mm0, mm2 ; r0 = mm0
punpckhwd mm4, mm2 ; r2 = mm4
movq mm5, mm1 ; p1 = mm5 (copy)
punpcklwd mm1, mm3 ; r1 = mm1
punpckhwd mm5, mm3 ; r3 = mm5
movq mm6, mm0 ; r0 = mm6 (copy)
punpcklwd mm0, mm1 ; t0 = mm0
punpckhwd mm6, mm1 ; t1 = mm6
movq mm1, mm4 ; r2 = mm1 (copy)
punpcklwd mm1, mm5 ; t2 = mm1
punpckhwd mm4, mm5 ; t3 = mm4

; horizontal butterfly: p0=t0+t2, p1=t0-t2, p2=(t1>>1)-t3, p3=t1+(t3>>1)
movq mm2, mm0 ; mm2 = t0 (copy)
paddw mm0, mm1 ; mm0 = p0
psubw mm2, mm1 ; mm2 = p1, mm1 available
movq mm5, mm6 ; mm5 = t1 (copy)
psraw mm5, 1 ; mm5 = (t1 >> 1)
psubw mm5, mm4 ; mm5 = p2
psraw mm4, 1 ; mm4 = (t3 >> 1)
paddw mm6, mm4 ; mm6 = p3

; recombine: r0=p0+p3, r1=p1+p2, r2=p1-p2, r3=p0-p3
movq mm3, mm0 ; mm3 = p0 (copy)
paddw mm0, mm6 ; mm0 = r0
movq mm1, mm2 ; mm1 = p1 (copy)
paddw mm1, mm5 ; mm1 = r1
psubw mm2, mm5 ; mm2 = r2, mm5 available
psubw mm3, mm6 ; mm3 = r3

; rotate 4x4 matrix to set up for vertical pass (same transpose network)
movq mm4, mm0 ; r0 = mm4 (copy)
punpcklwd mm0, mm2 ; p0 = mm0
punpckhwd mm4, mm2 ; p2 = mm4
movq mm5, mm1 ; r1 = mm5 (copy)
punpcklwd mm1, mm3 ; p1 = mm1
punpckhwd mm5, mm3 ; p3 = mm5
movq mm6, mm0 ; p0 = mm6 (copy)
punpcklwd mm0, mm1 ; t0 = mm0
punpckhwd mm6, mm1 ; t1 = mm6
movq mm1, mm4 ; p2 = mm1 (copy)
punpcklwd mm1, mm5 ; t2 = mm1
punpckhwd mm4, mm5 ; t3 = mm4

; vertical butterfly (identical structure to the horizontal pass)
movq mm2, mm0 ; mm2 = t0 (copy)
paddw mm0, mm1 ; mm0 = p0
psubw mm2, mm1 ; mm2 = p1, mm1 available
movq mm5, mm6 ; mm5 = t1 (copy)
psraw mm5, 1 ; mm5 = (t1 >> 1)
psubw mm5, mm4 ; mm5 = p2
psraw mm4, 1 ; mm4 = (t3 >> 1)
paddw mm6, mm4 ; mm6 = p3
movq mm3, mm0 ; mm3 = p0 (copy)
paddw mm0, mm6 ; mm0 = r0
movq mm1, mm2 ; mm1 = p1 (copy)
paddw mm1, mm5 ; mm1 = r1
psubw mm2, mm5 ; mm2 = r2, mm5 available
psubw mm3, mm6 ; mm3 = r3


; --- 4x4 iDCT done, now time to combine with mpr ---

movq mm7, MMWORD PTR const32 ; rounding constant: 8 x 0x20

paddw mm0, mm7 ; rres + 32
psraw mm0, 6 ; (rres + 32) >> 6
paddw mm1, mm7 ; rres + 32
psraw mm1, 6 ; (rres + 32) >> 6
paddw mm2, mm7 ; rres + 32
psraw mm2, 6 ; (rres + 32) >> 6
paddw mm3, mm7 ; rres + 32
psraw mm3, 6 ; (rres + 32) >> 6

pxor mm7, mm7 ; mm7 = 0 for unpack/pack below

; convert mpr from unsigned char to short
movd mm4, DWORD PTR 0[ecx]
movd mm5, DWORD PTR 16[ecx]
movd mm6, DWORD PTR 32[ecx]
punpcklbw mm4, mm7
punpcklbw mm5, mm7
punpcklbw mm6, mm7
paddsw mm4, mm0 ; pred_row + rres_row
movd mm0, DWORD PTR 48[ecx] ; reuse mm0 for mpr[3]
paddsw mm5, mm1 ; pred_row + rres_row
punpcklbw mm0, mm7
paddsw mm6, mm2 ; pred_row + rres_row
paddsw mm0, mm3 ; pred_row + rres_row
; results in mm4, mm5, mm6, mm0

; move back to 8 bit (packuswb saturates to [0,255] = clip)
packuswb mm4, mm7
packuswb mm5, mm7
packuswb mm6, mm7
packuswb mm0, mm7
movd DWORD PTR 0[edx], mm4
movd DWORD PTR 16[edx], mm5
movd DWORD PTR 32[edx], mm6
movd DWORD PTR 48[edx], mm0
ret 0

_itrans4x4_mmx_direct ENDP
_itrans4x4_mmx ENDP
|
|
_TEXT ENDS
|
|
|
|
EXTRN _itrans_sp:PROC
|
|
EXTRN _Inv_Residual_trans_4x4:PROC
|
|
PUBLIC _iMBtrans4x4
|
|
EXTRN _opt:BYTE
|
|
_TEXT SEGMENT
|
|
_currSlice$ = -4 ; size = 4
|
|
_mb_rec$166704 = 8 ; size = 4
|
|
_currMB$ = 8 ; size = 4
|
|
_curr_img$ = 12 ; size = 4
|
|
_pl$ = 8 ; second parameter
|
|
_smb$ = 16 ; size = 4
|
|
;-----------------------------------------------------------------------
; void iMBtrans4x4(Macroblock *currMB, int pl, int smb)   (cdecl)
; Inverse-transforms all sixteen 4x4 blocks of one macroblock plane
; into currSlice->mb_rec[pl], then copies the finished 16x16 block into
; the picture via opt.copy_image_data_16x16_stride.
; Three paths:
;   smb != 0            -> _itrans_sp per 4x4 block (SP/SI slices)
;   currMB->is_lossless -> _Inv_Residual_trans_4x4 per 4x4 block
;   otherwise           -> _itrans4x4_mmx_direct per 4x4 block
; Register/stack notes:
;   - "push ecx" is not a save: its slot is reused as the _currSlice$
;     local (written via _currSlice$[esp+20]).
;   - curr_img is stored over the pl argument slot (_curr_img$ alias).
;   - ebp holds currSlice; the smb path clobbers it (reused as the
;     mb_pred pointer) and reloads it from the local before COPY_16x16.
;   - cof@Slice rows are 512 bytes per plane (pl << 9); mb_pred/mb_rec
;     are 256 bytes per plane (pl << 8); each 4x4 coeff block is 32 B.
;   - _itrans_sp / _Inv_Residual_trans_4x4 appear to preserve
;     ebx/ebp/esi/edi across calls -- NOTE(review): custom convention,
;     confirm in their definitions.
;-----------------------------------------------------------------------
_iMBtrans4x4 PROC
push ecx ; reserve slot reused as _currSlice$ local
push ebx
push ebp
push esi
STACKOFFSET = 16
; 408 : VideoImage *curr_img = pl ? dec_picture->imgUV[pl - 1]: dec_picture->imgY;

mov esi, DWORD PTR _pl$[esp+STACKOFFSET] ; esi = pl for the whole function
push edi
STACKOFFSET = STACKOFFSET + 4
mov edi, DWORD PTR _currMB$[esp+16] ; edi = currMB for the whole function
mov ebp, DWORD PTR [edi+p_Slice@MacroBlock] ; ebp: currMB->p_Slice
mov eax, DWORD PTR [edi+4] ; currMB->p_Vid (offset 4 -- TODO confirm)
mov eax, DWORD PTR [eax+dec_picture@VideoParameters] ; eax: p_Vid->dec_picture;
mov DWORD PTR _currSlice$[esp+20], ebp ; spill currSlice into the ecx slot
mov ecx, DWORD PTR [eax+esi*4+plane_images@StorablePicture] ; plane image [pl]
mov DWORD PTR _curr_img$[esp+16], ecx ; store curr_img over the pl arg slot

cmp DWORD PTR _smb$[esp+16], 0 ; if (smb)
; 413 : {
; 414 : h264_short_block_t *blocks = currSlice->cof4[pl];
; 415 : const h264_imgpel_macroblock_row_t *mb_pred=currSlice->mb_pred[pl];
; 416 :
; 417 : itrans_sp(blocks[0], mb_pred, currMB, pl, 0, 0);

je $LN4@iMBtrans4x
push 0
push 0
mov eax, esi
shl eax, 9 ; pl * 512 -> cof plane offset
lea ebx, DWORD PTR [eax+ebp+cof@Slice] ; ebx = blocks = &currSlice->cof4[pl]
mov ecx, esi
shl ecx, 8 ; pl * 256 -> pixel plane offset
lea ebp, DWORD PTR [ecx+ebp+mb_pred@Slice] ; ebp = mb_pred (currSlice ptr now gone)
push esi
push ebp
push ebx
mov eax, edi ; currMB also in eax (register arg -- TODO confirm convention)
call _itrans_sp

; 418 : itrans_sp(blocks[1], mb_pred, currMB, pl, 4, 0);

push 0
push 4
push esi
lea edx, DWORD PTR [ebx+32]
push ebp
push edx
mov eax, edi
call _itrans_sp

; 419 : itrans_sp(blocks[2], mb_pred, currMB, pl, 0, 4);

push 4
push 0
push esi
lea eax, DWORD PTR [ebx+64]
push ebp
push eax
mov eax, edi
call _itrans_sp

; 420 : itrans_sp(blocks[3], mb_pred, currMB, pl, 4, 4);

push 4
push 4
push esi
lea ecx, DWORD PTR [ebx+96]
push ebp
push ecx
mov eax, edi
call _itrans_sp
add esp, 80 ; pop 4 calls x 5 dword args   ; 00000050H

; 421 : itrans_sp(blocks[4], mb_pred, currMB, pl, 8, 0);

push 0
push 8
push esi
lea edx, DWORD PTR [ebx+128]
push ebp
push edx
mov eax, edi
call _itrans_sp

; 422 : itrans_sp(blocks[5], mb_pred, currMB, pl, 12, 0);

push 0
push 12 ; 0000000cH
push esi
lea eax, DWORD PTR [ebx+160]
push ebp
push eax
mov eax, edi
call _itrans_sp

; 423 : itrans_sp(blocks[6], mb_pred, currMB, pl, 8, 4);

push 4
push 8
push esi
lea ecx, DWORD PTR [ebx+192]
push ebp
push ecx
mov eax, edi
call _itrans_sp

; 424 : itrans_sp(blocks[7], mb_pred, currMB, pl, 12, 4);

push 4
push 12 ; 0000000cH
push esi
lea edx, DWORD PTR [ebx+224]
push ebp
push edx
mov eax, edi
call _itrans_sp
add esp, 80 ; 00000050H

; 425 : itrans_sp(blocks[8], mb_pred, currMB, pl, 0, 8);

push 8
push 0
push esi
lea eax, DWORD PTR [ebx+256]
push ebp
push eax
mov eax, edi
call _itrans_sp

; 426 : itrans_sp(blocks[9], mb_pred, currMB, pl, 4, 8);

push 8
push 4
push esi
push ebp
lea ecx, DWORD PTR [ebx+288]
push ecx
mov eax, edi
call _itrans_sp

; 427 : itrans_sp(blocks[10], mb_pred, currMB, pl, 0, 12);

push 12 ; 0000000cH
push 0
push esi
lea edx, DWORD PTR [ebx+320]
push ebp
push edx
mov eax, edi
call _itrans_sp

; 428 : itrans_sp(blocks[11], mb_pred, currMB, pl, 4, 12);

push 12 ; 0000000cH
push 4
push esi
lea eax, DWORD PTR [ebx+352]
push ebp
push eax
mov eax, edi
call _itrans_sp
add esp, 80 ; 00000050H

; 429 : itrans_sp(blocks[12], mb_pred, currMB, pl, 8, 8);

push 8
push 8
push esi
lea ecx, DWORD PTR [ebx+384]
push ebp
push ecx
mov eax, edi
call _itrans_sp

; 430 : itrans_sp(blocks[13], mb_pred, currMB, pl, 12, 8);

push 8
push 12 ; 0000000cH
push esi
lea edx, DWORD PTR [ebx+416]
push ebp
push edx
mov eax, edi
call _itrans_sp

; 431 : itrans_sp(blocks[14], mb_pred, currMB, pl, 8, 12);

push 12 ; 0000000cH
push 8
push esi
lea eax, DWORD PTR [ebx+448]
push ebp
push eax
mov eax, edi
call _itrans_sp

; 432 : itrans_sp(blocks[15], mb_pred, currMB, pl, 12, 12);

push 12 ; 0000000cH
push 12 ; 0000000cH
push esi
add ebx, 480 ; blocks[15] -- ebx not needed afterwards   ; 000001e0H
push ebp
push ebx
mov eax, edi
call _itrans_sp
mov ebp, DWORD PTR _currSlice$[esp+100] ; reload currSlice (ebp was mb_pred)
add esp, 80 ; 00000050H
jmp COPY_16x16

$LN4@iMBtrans4x:

; 433 : }
; 434 : else if (currMB->is_lossless)

cmp DWORD PTR [edi+84], 0 ; currMB->is_lossless (offset 84 -- TODO confirm)
je $LN2@iMBtrans4x

push 0
push 0

; 435 : {
; 436 : Inv_Residual_trans_4x4(currMB, pl, 0, 0);

push esi
push edi
call _Inv_Residual_trans_4x4

; 437 : Inv_Residual_trans_4x4(currMB, pl, 4, 0);

push 0
push 4
push esi
push edi
call _Inv_Residual_trans_4x4

; 438 : Inv_Residual_trans_4x4(currMB, pl, 0, 4);

push 4
push 0
push esi
push edi
call _Inv_Residual_trans_4x4

; 439 : Inv_Residual_trans_4x4(currMB, pl, 4, 4);

push 4
push 4
push esi
push edi
call _Inv_Residual_trans_4x4
add esp, 64 ; pop 4 calls x 4 dword args   ; 00000040H

; 440 : Inv_Residual_trans_4x4(currMB, pl, 8, 0);

push 0
push 8
push esi
push edi
call _Inv_Residual_trans_4x4

; 441 : Inv_Residual_trans_4x4(currMB, pl, 12, 0);

push 0
push 12 ; 0000000cH
push esi
push edi
call _Inv_Residual_trans_4x4

; 442 : Inv_Residual_trans_4x4(currMB, pl, 8, 4);

push 4
push 8
push esi
push edi
call _Inv_Residual_trans_4x4

; 443 : Inv_Residual_trans_4x4(currMB, pl, 12, 4);

push 4
push 12 ; 0000000cH
push esi
push edi
call _Inv_Residual_trans_4x4
add esp, 64 ; 00000040H

; 444 : Inv_Residual_trans_4x4(currMB, pl, 0, 8);

push 8
push 0
push esi
push edi
call _Inv_Residual_trans_4x4

; 445 : Inv_Residual_trans_4x4(currMB, pl, 4, 8);

push 8
push 4
push esi
push edi
call _Inv_Residual_trans_4x4

; 446 : Inv_Residual_trans_4x4(currMB, pl, 0, 12);

push 12 ; 0000000cH
push 0
push esi
push edi
call _Inv_Residual_trans_4x4

; 447 : Inv_Residual_trans_4x4(currMB, pl, 4, 12);

push 12 ; 0000000cH
push 4
push esi
push edi
call _Inv_Residual_trans_4x4
add esp, 64 ; 00000040H

; 448 : Inv_Residual_trans_4x4(currMB, pl, 8, 8);

push 8
push 8
push esi
push edi
call _Inv_Residual_trans_4x4

; 449 : Inv_Residual_trans_4x4(currMB, pl, 12, 8);

push 8
push 12 ; 0000000cH
push esi
push edi
call _Inv_Residual_trans_4x4

; 450 : Inv_Residual_trans_4x4(currMB, pl, 8, 12);

push 12 ; 0000000cH
push 8
push esi
push edi
call _Inv_Residual_trans_4x4

; 451 : Inv_Residual_trans_4x4(currMB, pl, 12, 12);

push 12 ; 0000000cH
push 12 ; 0000000cH
push esi
push edi
call _Inv_Residual_trans_4x4
add esp, 64 ; 00000040H

; 452 : }
; 453 : else

jmp COPY_16x16
$LN2@iMBtrans4x:

; 454 : {
; 455 : const h264_short_block_t *blocks = currSlice->cof4[pl];
; 456 : const h264_imgpel_macroblock_row_t *mb_pred=currSlice->mb_pred[pl];

mov edx, esi
mov ecx, esi
shl edx, 8 ; pl * 256 (pixel plane)
shl ecx, 9 ; pl * 512 (coeff plane)
lea eax, DWORD PTR [edx+ebp] ; currSlice + pixel-plane offset
lea ebx, DWORD PTR [ecx+ebp+cof@Slice] ; ebx = blocks

; 457 : h264_imgpel_macroblock_row_t *mb_rec = currSlice->mb_rec[pl];

; put things in registers that itrans4x4_mmx_direct wants
; (eax = coeff block, ecx = mb_pred + offs, edx = mb_rec + offs;
;  edx/ecx advance +4 per column, +52 to wrap to the next 4-pel row)
lea edx, [eax + mb_rec@Slice]; mb_rec
lea ecx, [eax + mb_pred@Slice] ; mb_pred
mov eax, ebx ; blocks
call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[0], mb_pred, mb_rec, 0, 0);

lea edx, [edx+4]
lea ecx, [ecx+4]
lea eax, [ebx+32]
call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[1], mb_pred, mb_rec, 4, 0);

lea edx, [edx+4]
lea ecx, [ecx+4]
lea eax, [ebx+128]
call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[4], mb_pred, mb_rec, 8, 0);

lea edx, [edx+4]
lea ecx, [ecx+4]
lea eax, [ebx+160]
call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[5], mb_pred, mb_rec, 12, 0);

; second row (block indices follow H.264 8x8-quadrant zig-zag order)
lea edx, [edx+52]
lea ecx, [ecx+52]
lea eax, [ebx+64]
call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[2], mb_pred, mb_rec, 0, 4);

lea edx, [edx+4]
lea ecx, [ecx+4]
lea eax, [ebx+96]
call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[3], mb_pred, mb_rec, 4, 4);

lea edx, [edx+4]
lea ecx, [ecx+4]
lea eax, [ebx+192]
call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[6], mb_pred, mb_rec, 8, 4);

lea edx, [edx+4]
lea ecx, [ecx+4]
lea eax, [ebx+224]
call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[7], mb_pred, mb_rec, 12, 4);

; third row
lea edx, [edx+52]
lea ecx, [ecx+52]
lea eax, [ebx+256]
call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[8], mb_pred, mb_rec, 0, 8);

lea edx, [edx+4]
lea ecx, [ecx+4]
lea eax, [ebx+288]
call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[9], mb_pred, mb_rec, 4, 8);

lea edx, [edx+4]
lea ecx, [ecx+4]
lea eax, [ebx+384]
call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[12], mb_pred, mb_rec, 8, 8);

lea edx, [edx+4]
lea ecx, [ecx+4]
lea eax, [ebx+416]
call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[13], mb_pred, mb_rec, 12, 8);

; fourth row
lea edx, [edx+52]
lea ecx, [ecx+52]
lea eax, [ebx+320]
call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[10], mb_pred, mb_rec, 0, 12);

lea edx, [edx+4]
lea ecx, [ecx+4]
lea eax, [ebx+352]
call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[11], mb_pred, mb_rec, 4, 12);

lea edx, [edx+4]
lea ecx, [ecx+4]
lea eax, [ebx+448]
call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[14], mb_pred, mb_rec, 8, 12);

lea edx, [edx+4]
lea ecx, [ecx+4]
lea eax, [ebx+480]
call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[15], mb_pred, mb_rec, 12, 12);

COPY_16x16:

; construct picture from 4x4 blocks
; opt_copy_image_data_16x16_stride(curr_img, currMB->pix_x, currMB->pix_y, currSlice->mb_rec[pl]);

mov eax, DWORD PTR [edi+40] ; currMB->pix_y (offset 40 -- TODO confirm)
mov ecx, DWORD PTR [edi+36] ; currMB->pix_x (offset 36 -- TODO confirm)
shl esi, 8 ; pl * 256
lea edx, DWORD PTR [esi+ebp+mb_rec@Slice] ; &currSlice->mb_rec[pl]
push edx
mov edx, DWORD PTR _curr_img$[esp+20] ; reload curr_img local
push eax
push ecx
push edx
call DWORD PTR _opt+copy_image_data_16x16_stride@OptimizedFunctions
add esp, 16 ; 00000010H
pop edi ; restore callee-saved regs in reverse push order
pop esi
pop ebp
pop ebx
pop ecx ; discard _currSlice$ local slot
ret 0
_iMBtrans4x4 ENDP
|
|
_TEXT ENDS
|
|
|
|
_TEXT SEGMENT
|
|
ALIGN 2
|
|
PUBLIC _itrans8x8_sse2
|
|
_itrans8x8_sse2 PROC NEAR
|
|
; parameter 1(mb_rec): 8 + ebp
|
|
; parameter 2(mb_pred): 12 + ebp
|
|
; parameter 3(block): 16 + ebp
|
|
; parameter 4(pos_x): 20 + ebp
|
|
push ebp
|
|
mov ebp, esp
|
|
and esp, -16
|
|
sub esp, 176
|
|
mov edx, DWORD PTR [ebp+20]
|
|
mov ecx, DWORD PTR [ebp+8] ; ecx: mb_rec
|
|
add ecx, edx
|
|
add edx, DWORD PTR [ebp+12] ; edx: mb_pred
|
|
mov eax, DWORD PTR [ebp+16] ; eax: block
|
|
|
|
;;; __m128i a0, a1, a2, a3;
|
|
;;; __m128i p0, p1, p2, p3, p4, p5 ,p6, p7;
|
|
;;; __m128i b0, b1, b2, b3, b4, b5, b6, b7;
|
|
;;; __m128i r0, r1, r2, r3, r4, r5, r6, r7;
|
|
;;; __m128i const32, zero;
|
|
;;; __declspec(align(32)) static const int16_t c32[8] = {32, 32, 32, 32, 32, 32, 32, 32};
|
|
;;; __m128i pred0, pred1;
|
|
;;;
|
|
;;; const32 = _mm_load_si128((const __m128i *)c32);
|
|
|
|
movdqa xmm0, XMMWORD PTR const32
|
|
|
|
;;; zero = _mm_setzero_si128();
|
|
;;;
|
|
;;; // Horizontal
|
|
;;; b0 = _mm_load_si128((__m128i *)(block[0]));
|
|
|
|
movdqa xmm4, XMMWORD PTR [eax]
|
|
|
|
;;; b1 = _mm_load_si128((__m128i *)(block[1]));
|
|
|
|
movdqa xmm7, XMMWORD PTR [eax+16]
|
|
|
|
;;; b2 = _mm_load_si128((__m128i *)(block[2]));
|
|
|
|
movdqa xmm5, XMMWORD PTR [eax+32]
|
|
|
|
;;; b3 = _mm_load_si128((__m128i *)(block[3]));
|
|
|
|
movdqa xmm3, XMMWORD PTR [eax+48]
|
|
|
|
;;; b4 = _mm_load_si128((__m128i *)(block[4]));
|
|
|
|
movdqa xmm6, XMMWORD PTR [eax+64]
|
|
|
|
;;; b5 = _mm_load_si128((__m128i *)(block[5]));
|
|
;;; b6 = _mm_load_si128((__m128i *)(block[6]));
|
|
|
|
movdqa xmm1, XMMWORD PTR [eax+96]
|
|
|
|
;;; b7 = _mm_load_si128((__m128i *)(block[7]));
|
|
|
|
movdqa xmm2, XMMWORD PTR [eax+112]
|
|
movdqa XMMWORD PTR [esp], xmm0
|
|
movdqa xmm0, XMMWORD PTR [eax+80]
|
|
movdqa XMMWORD PTR [esp+16], xmm2
|
|
|
|
;;;
|
|
;;; /* rotate 8x8 (ugh) */
|
|
;;; r0 = _mm_unpacklo_epi16(b0, b2);
|
|
|
|
movdqa xmm2, xmm4
|
|
punpcklwd xmm2, xmm5
|
|
|
|
;;; r1 = _mm_unpacklo_epi16(b1, b3);
|
|
;;; r2 = _mm_unpackhi_epi16(b0, b2);
|
|
|
|
punpckhwd xmm4, xmm5
|
|
|
|
;;; r3 = _mm_unpackhi_epi16(b1, b3);
|
|
;;; r4 = _mm_unpacklo_epi16(b4, b6);
|
|
;;; r5 = _mm_unpacklo_epi16(b5, b7);
|
|
|
|
movdqa xmm5, xmm0
|
|
movdqa XMMWORD PTR [esp+32], xmm2
|
|
movdqa xmm2, xmm7
|
|
punpcklwd xmm2, xmm3
|
|
punpckhwd xmm7, xmm3
|
|
movdqa xmm3, xmm6
|
|
punpcklwd xmm3, xmm1
|
|
movdqa XMMWORD PTR [esp+48], xmm3
|
|
movdqa xmm3, XMMWORD PTR [esp+16]
|
|
punpcklwd xmm5, xmm3
|
|
|
|
;;; r6 = _mm_unpackhi_epi16(b4, b6);
|
|
|
|
punpckhwd xmm6, xmm1
|
|
;;; r7 = _mm_unpackhi_epi16(b5, b7);
|
|
|
|
punpckhwd xmm0, xmm3
|
|
|
|
;;;
|
|
;;; b0 = _mm_unpacklo_epi16(r0, r1);
|
|
|
|
movdqa xmm3, XMMWORD PTR [esp+32]
|
|
movdqa xmm1, xmm3
|
|
punpcklwd xmm1, xmm2
|
|
|
|
;;; b1 = _mm_unpackhi_epi16(r0, r1);
|
|
|
|
punpckhwd xmm3, xmm2
|
|
|
|
;;; b2 = _mm_unpacklo_epi16(r2, r3);
|
|
|
|
movdqa xmm2, xmm4
|
|
punpcklwd xmm2, xmm7
|
|
|
|
;;; b3 = _mm_unpackhi_epi16(r2, r3);
|
|
|
|
punpckhwd xmm4, xmm7
|
|
movdqa XMMWORD PTR [esp+64], xmm4
|
|
|
|
;;; b4 = _mm_unpacklo_epi16(r4, r5);
|
|
|
|
movdqa xmm4, XMMWORD PTR [esp+48]
|
|
movdqa xmm7, xmm4
|
|
punpcklwd xmm7, xmm5
|
|
|
|
;;; b5 = _mm_unpackhi_epi16(r4, r5);
|
|
|
|
punpckhwd xmm4, xmm5
|
|
|
|
;;; b6 = _mm_unpacklo_epi16(r6, r7);
|
|
|
|
movdqa xmm5, xmm6
|
|
punpcklwd xmm5, xmm0
|
|
|
|
;;; b7 = _mm_unpackhi_epi16(r6, r7);
|
|
|
|
punpckhwd xmm6, xmm0
|
|
|
|
;;;
|
|
;;; p0 = _mm_unpacklo_epi64(b0, b4);
|
|
|
|
movdqa xmm0, xmm1
|
|
punpcklqdq xmm0, xmm7
|
|
|
|
;;; p1 = _mm_unpackhi_epi64(b0, b4);
|
|
|
|
punpckhqdq xmm1, xmm7
|
|
movdqa XMMWORD PTR [esp+16], xmm1
|
|
|
|
;;; p2 = _mm_unpacklo_epi64(b1, b5);
|
|
|
|
movdqa xmm1, xmm3
|
|
punpcklqdq xmm1, xmm4
|
|
|
|
;;; p3 = _mm_unpackhi_epi64(b1, b5);
|
|
;;; p4 = _mm_unpacklo_epi64(b2, b6);
|
|
;;; p5 = _mm_unpackhi_epi64(b2, b6);
|
|
;;; p6 = _mm_unpacklo_epi64(b3, b7);
|
|
;;; p7 = _mm_unpackhi_epi64(b3, b7);
|
|
;;;
|
|
;;; /* perform approx DCT */
|
|
;;; a0 = _mm_add_epi16(p0, p4); // p0 + p4
|
|
;;; a1 = _mm_sub_epi16(p0, p4); // p0 - p4
|
|
;;; r0 = _mm_srai_epi16(p2, 1); // p2 >> 1
|
|
|
|
movdqa xmm7, xmm1
|
|
psraw xmm7, 1
|
|
punpckhqdq xmm3, xmm4
|
|
movdqa XMMWORD PTR [esp+32], xmm3
|
|
movdqa xmm3, xmm2
|
|
punpcklqdq xmm3, xmm5
|
|
punpckhqdq xmm2, xmm5
|
|
movdqa xmm5, XMMWORD PTR [esp+64]
|
|
movdqa xmm4, xmm5
|
|
punpcklqdq xmm4, xmm6
|
|
punpckhqdq xmm5, xmm6
|
|
movdqa xmm6, xmm0
|
|
paddw xmm6, xmm3
|
|
psubw xmm0, xmm3
|
|
|
|
;;; a2 = _mm_sub_epi16(p6, r0); // p6 - (p2 >> 1)
|
|
|
|
movdqa xmm3, xmm4
|
|
|
|
;;; r0 = _mm_srai_epi16(p6, 1); // p6 >> 1
|
|
|
|
psraw xmm4, 1
|
|
psubw xmm3, xmm7
|
|
|
|
;;; a3 = _mm_add_epi16(p2, r0); //p2 + (p6 >> 1)
|
|
|
|
paddw xmm1, xmm4
|
|
|
|
;;;
|
|
;;; b0 = _mm_add_epi16(a0, a3); // a0 + a3;
|
|
|
|
movdqa xmm4, xmm6
|
|
|
|
;;; b2 = _mm_sub_epi16(a1, a2); // a1 - a2;
|
|
|
|
movdqa xmm7, xmm0
|
|
paddw xmm4, xmm1
|
|
psubw xmm7, xmm3
|
|
movdqa XMMWORD PTR [esp+48], xmm7
|
|
|
|
;;; b4 = _mm_add_epi16(a1, a2); // a1 + a2;
|
|
|
|
paddw xmm0, xmm3
|
|
movdqa XMMWORD PTR [esp+80], xmm0
|
|
|
|
;;; b6 = _mm_sub_epi16(a0, a3); // a0 - a3;
|
|
;;;
|
|
;;; //-p3 + p5 - p7 - (p7 >> 1);
|
|
;;; r0 = _mm_srai_epi16(p7, 1); // p7 >> 1
|
|
;;; a0 = _mm_sub_epi16(p5, p3); // p5 - p3
|
|
|
|
movdqa xmm0, XMMWORD PTR [esp+32]
|
|
psubw xmm6, xmm1
|
|
movdqa xmm1, xmm5
|
|
psraw xmm1, 1
|
|
movdqa xmm3, xmm2
|
|
|
|
;;; a0 = _mm_sub_epi16(a0, p7); // (-p3 + p5) - p7
|
|
;;; a0 = _mm_sub_epi16(a0, r0); // (-p3 + p5 - p7) - (p7 >> 1)
|
|
;;;
|
|
;;; //p1 + p7 - p3 - (p3 >> 1);
|
|
;;; r0 = _mm_srai_epi16(p3, 1); // (p3 >> 1)
|
|
|
|
movdqa xmm7, xmm0
|
|
movdqa XMMWORD PTR [esp+96], xmm6
|
|
|
|
;;; a1 = _mm_add_epi16(p1, p7); // p1 + p7
|
|
|
|
movdqa xmm6, XMMWORD PTR [esp+16]
|
|
psubw xmm3, xmm0
|
|
psubw xmm3, xmm5
|
|
psraw xmm7, 1
|
|
psubw xmm3, xmm1
|
|
movdqa xmm1, xmm6
|
|
paddw xmm1, xmm5
|
|
|
|
;;; a1 = _mm_sub_epi16(a1, p3); // (p1 + p7) - p3
|
|
|
|
psubw xmm1, xmm0
|
|
|
|
;;; a1 = _mm_sub_epi16(a1, r0); // (p1 + p7 - p3) - (p3>>1)
|
|
|
|
psubw xmm1, xmm7
|
|
|
|
;;;
|
|
;;; // -p1 + p7 + p5 + (p5 >> 1);
|
|
;;; r0 = _mm_srai_epi16(p5, 1); // (p5 >> 1)
|
|
|
|
movdqa xmm7, xmm2
|
|
psraw xmm7, 1
|
|
|
|
;;; a2 = _mm_sub_epi16(p7, p1); // p7 - p1
|
|
|
|
psubw xmm5, xmm6
|
|
|
|
;;; a2 = _mm_add_epi16(a2, p5); // -p1 + p7 + p5
|
|
|
|
paddw xmm5, xmm2
|
|
|
|
;;; a2 = _mm_add_epi16(a2, r0); // (-p1 + p7 + p5) + (p5 >> 1)
|
|
|
|
paddw xmm5, xmm7
|
|
|
|
;;;
|
|
;;; // p3 + p5 + p1 + (p1 >> 1);
|
|
;;; a3 = _mm_add_epi16(p3, p5); // p3+p5
|
|
|
|
paddw xmm0, xmm2
|
|
|
|
;;; a3 = _mm_add_epi16(a3, p1); // p3 + p5 + p1
|
|
;;; p1 = _mm_srai_epi16(p1, 1); // p1 >> 1
|
|
;;; a3 = _mm_add_epi16(a3, p1); //p3 + p5 + p1 + (p1 >> 1)
|
|
;;;
|
|
;;; r0 = _mm_srai_epi16(a3, 2); // a3>>2
|
|
;;; b1 = _mm_add_epi16(a0, r0); //a0 + (a3>>2);
|
|
;;; r0 = _mm_srai_epi16(a2, 2); // a2>>2
|
|
;;; b3 = _mm_add_epi16(a1, r0); // a1 + (a2>>2);
|
|
;;; a1 = _mm_srai_epi16(a1, 2); // all done with a1, so this is safe
|
|
;;; b5 = _mm_sub_epi16(a2, a1); //a2 - (a1>>2);
|
|
;;; a0 = _mm_srai_epi16(a0, 2); // all done with a0, so this is safe
|
|
;;; b7 = _mm_sub_epi16(a3, a0); //a3 - (a0>>2);
|
|
;;;
|
|
;;; p0 = _mm_add_epi16(b0, b7); // b0 + b7;
|
|
;;; p1 = _mm_sub_epi16(b2, b5); // b2 - b5;
|
|
;;; p2 = _mm_add_epi16(b4, b3); // b4 + b3;
|
|
;;; p3 = _mm_add_epi16(b6, b1); // b6 + b1;
|
|
|
|
movdqa xmm2, XMMWORD PTR [esp+96]
|
|
paddw xmm0, xmm6
|
|
psraw xmm6, 1
|
|
paddw xmm0, xmm6
|
|
movdqa xmm7, xmm0
|
|
movdqa xmm6, xmm5
|
|
psraw xmm7, 2
|
|
paddw xmm7, xmm3
|
|
psraw xmm6, 2
|
|
paddw xmm6, xmm1
|
|
psraw xmm1, 2
|
|
psubw xmm5, xmm1
|
|
movdqa xmm1, xmm4
|
|
psraw xmm3, 2
|
|
psubw xmm0, xmm3
|
|
movdqa xmm3, XMMWORD PTR [esp+80]
|
|
movdqa XMMWORD PTR [esp+32], xmm0
|
|
|
|
;;; p4 = _mm_sub_epi16(b6, b1); // b6 - b1;
|
|
;;; p5 = _mm_sub_epi16(b4, b3); // b4 - b3;
|
|
;;; p6 = _mm_add_epi16(b2, b5); // b2 + b5;
|
|
;;; p7 = _mm_sub_epi16(b0, b7); // b0 - b7;
|
|
|
|
psubw xmm4, XMMWORD PTR [esp+32]
|
|
paddw xmm1, xmm0
|
|
movdqa XMMWORD PTR [esp+112], xmm1
|
|
movdqa xmm1, XMMWORD PTR [esp+48]
|
|
movdqa xmm0, xmm1
|
|
psubw xmm0, xmm5
|
|
movdqa XMMWORD PTR [esp+16], xmm0
|
|
movdqa xmm0, xmm3
|
|
paddw xmm0, xmm6
|
|
psubw xmm3, xmm6
|
|
movdqa XMMWORD PTR [esp+128], xmm0
|
|
|
|
;;;
|
|
;;; /* rotate 8x8 (ugh) */
|
|
;;; r0 = _mm_unpacklo_epi16(p0, p2);
|
|
|
|
movdqa xmm6, XMMWORD PTR [esp+128]
|
|
movdqa xmm0, xmm2
|
|
paddw xmm0, xmm7
|
|
psubw xmm2, xmm7
|
|
paddw xmm1, xmm5
|
|
movdqa xmm5, XMMWORD PTR [esp+112]
|
|
movdqa XMMWORD PTR [esp+144], xmm4
|
|
movdqa xmm4, xmm5
|
|
punpcklwd xmm4, xmm6
|
|
|
|
;;; r1 = _mm_unpacklo_epi16(p1, p3);
|
|
;;; r2 = _mm_unpackhi_epi16(p0, p2);
|
|
|
|
punpckhwd xmm5, xmm6
|
|
|
|
;;; r3 = _mm_unpackhi_epi16(p1, p3);
|
|
;;; r4 = _mm_unpacklo_epi16(p4, p6);
|
|
;;; r5 = _mm_unpacklo_epi16(p5, p7);
|
|
|
|
movdqa xmm6, xmm3
|
|
movdqa XMMWORD PTR [esp+64], xmm4
|
|
movdqa xmm4, XMMWORD PTR [esp+16]
|
|
movdqa xmm7, xmm4
|
|
punpcklwd xmm7, xmm0
|
|
punpckhwd xmm4, xmm0
|
|
movdqa xmm0, xmm2
|
|
punpcklwd xmm0, xmm1
|
|
movdqa XMMWORD PTR [esp+128], xmm0
|
|
movdqa xmm0, XMMWORD PTR [esp+144]
|
|
punpcklwd xmm6, xmm0
|
|
|
|
;;; r6 = _mm_unpackhi_epi16(p4, p6);
|
|
|
|
punpckhwd xmm2, xmm1
|
|
|
|
;;; r7 = _mm_unpackhi_epi16(p5, p7);
|
|
;;;
|
|
;;; b0 = _mm_unpacklo_epi16(r0, r1);
|
|
|
|
movdqa xmm1, XMMWORD PTR [esp+64]
|
|
punpckhwd xmm3, xmm0
|
|
movdqa xmm0, xmm1
|
|
punpcklwd xmm0, xmm7
|
|
|
|
;;; b1 = _mm_unpackhi_epi16(r0, r1);
|
|
|
|
punpckhwd xmm1, xmm7
|
|
|
|
;;; b2 = _mm_unpacklo_epi16(r2, r3);
|
|
|
|
movdqa xmm7, xmm5
|
|
punpcklwd xmm7, xmm4
|
|
|
|
;;; b3 = _mm_unpackhi_epi16(r2, r3);
|
|
|
|
punpckhwd xmm5, xmm4
|
|
movdqa XMMWORD PTR [esp+112], xmm5
|
|
|
|
;;; b4 = _mm_unpacklo_epi16(r4, r5);
|
|
|
|
movdqa xmm5, XMMWORD PTR [esp+128]
|
|
movdqa xmm4, xmm5
|
|
punpcklwd xmm4, xmm6
|
|
|
|
;;; b5 = _mm_unpackhi_epi16(r4, r5);
|
|
|
|
punpckhwd xmm5, xmm6
|
|
|
|
;;; b6 = _mm_unpacklo_epi16(r6, r7);
|
|
|
|
movdqa xmm6, xmm2
|
|
punpcklwd xmm6, xmm3
|
|
|
|
;;; b7 = _mm_unpackhi_epi16(r6, r7);
|
|
|
|
punpckhwd xmm2, xmm3
|
|
|
|
;;;
|
|
;;; p0 = _mm_unpacklo_epi64(b0, b4);
|
|
|
|
movdqa xmm3, xmm0
|
|
punpcklqdq xmm3, xmm4
|
|
|
|
;;; p1 = _mm_unpackhi_epi64(b0, b4);
|
|
|
|
punpckhqdq xmm0, xmm4
|
|
movdqa XMMWORD PTR [esp+144], xmm0
|
|
|
|
;;; p2 = _mm_unpacklo_epi64(b1, b5);
|
|
;;; p3 = _mm_unpackhi_epi64(b1, b5);
|
|
;;; p4 = _mm_unpacklo_epi64(b2, b6);
|
|
;;; p5 = _mm_unpackhi_epi64(b2, b6);
|
|
;;; p6 = _mm_unpacklo_epi64(b3, b7);
|
|
|
|
movdqa xmm0, XMMWORD PTR [esp+112]
|
|
movdqa xmm4, xmm1
|
|
punpcklqdq xmm4, xmm5
|
|
punpckhqdq xmm1, xmm5
|
|
movdqa XMMWORD PTR [esp+64], xmm1
|
|
movdqa xmm1, xmm7
|
|
movdqa xmm5, xmm0
|
|
punpcklqdq xmm1, xmm6
|
|
punpckhqdq xmm7, xmm6
|
|
|
|
;;; p7 = _mm_unpackhi_epi64(b3, b7);
|
|
;;;
|
|
;;;
|
|
;;; /* Vertical */
|
|
;;;
|
|
;;; a0 = _mm_add_epi16(p0, p4); // p0 + p4
|
|
;;; a1 = _mm_sub_epi16(p0, p4); // p0 - p4
|
|
;;; r0 = _mm_srai_epi16(p2, 1); // p2 >> 1
|
|
|
|
movdqa xmm6, xmm4
|
|
psraw xmm6, 1
|
|
punpcklqdq xmm5, xmm2
|
|
punpckhqdq xmm0, xmm2
|
|
movdqa xmm2, xmm3
|
|
paddw xmm2, xmm1
|
|
psubw xmm3, xmm1
|
|
|
|
;;; a2 = _mm_sub_epi16(p6, r0); // p6 - (p2 >> 1)
|
|
|
|
movdqa xmm1, xmm5
|
|
|
|
;;; r0 = _mm_srai_epi16(p6, 1); // p6 >> 1
|
|
|
|
psraw xmm5, 1
|
|
psubw xmm1, xmm6
|
|
|
|
;;; a3 = _mm_add_epi16(p2, r0); //p2 + (p6 >> 1)
|
|
|
|
paddw xmm4, xmm5
|
|
|
|
;;;
|
|
;;; b0 = _mm_add_epi16(a0, a3); // a0 + a3;
|
|
|
|
movdqa xmm5, xmm2
|
|
|
|
;;; b2 = _mm_sub_epi16(a1, a2); // a1 - a2;
|
|
|
|
movdqa xmm6, xmm3
|
|
paddw xmm5, xmm4
|
|
psubw xmm6, xmm1
|
|
movdqa XMMWORD PTR [esp+128], xmm6
|
|
|
|
;;; b4 = _mm_add_epi16(a1, a2); // a1 + a2;
|
|
;;; b6 = _mm_sub_epi16(a0, a3); // a0 - a3;
|
|
;;;
|
|
;;; //-p3 + p5 - p7 - (p7 >> 1);
|
|
;;; r0 = _mm_srai_epi16(p7, 1); // p7 >> 1
|
|
;;; a0 = _mm_sub_epi16(p5, p3); // p5 - p3
|
|
|
|
movdqa xmm6, XMMWORD PTR [esp+64]
|
|
paddw xmm3, xmm1
|
|
movdqa XMMWORD PTR [esp+80], xmm3
|
|
psubw xmm2, xmm4
|
|
movdqa xmm1, xmm0
|
|
psraw xmm1, 1
|
|
movdqa xmm3, xmm7
|
|
movdqa XMMWORD PTR [esp+96], xmm2
|
|
psubw xmm3, xmm6
|
|
|
|
;;; a0 = _mm_sub_epi16(a0, p7); // (-p3 + p5) - p7
|
|
|
|
psubw xmm3, xmm0
|
|
|
|
;;; a0 = _mm_sub_epi16(a0, r0); // (-p3 + p5 - p7) - (p7 >> 1)
|
|
;;;
|
|
;;; //p1 + p7 - p3 - (p3 >> 1);
|
|
;;; r0 = _mm_srai_epi16(p3, 1); // (p3 >> 1)
|
|
|
|
movdqa xmm2, xmm6
|
|
psraw xmm2, 1
|
|
psubw xmm3, xmm1
|
|
|
|
;;; a1 = _mm_add_epi16(p1, p7); // p1 + p7
|
|
|
|
movdqa xmm1, XMMWORD PTR [esp+144]
|
|
movdqa xmm4, xmm1
|
|
paddw xmm4, xmm0
|
|
|
|
;;; a1 = _mm_sub_epi16(a1, p3); // (p1 + p7) - p3
|
|
|
|
psubw xmm4, xmm6
|
|
|
|
;;; a1 = _mm_sub_epi16(a1, r0); // (p1 + p7 - p3) - (p3>>1)
|
|
|
|
psubw xmm4, xmm2
|
|
|
|
;;;
|
|
;;; // -p1 + p7 + p5 + (p5 >> 1);
|
|
;;; r0 = _mm_srai_epi16(p5, 1); // (p5 >> 1)
|
|
|
|
movdqa xmm2, xmm7
|
|
psraw xmm2, 1
|
|
|
|
;;; a2 = _mm_sub_epi16(p7, p1); // p7 - p1
|
|
|
|
psubw xmm0, xmm1
|
|
|
|
;;; a2 = _mm_add_epi16(a2, p5); // -p1 + p7 + p5
|
|
|
|
paddw xmm0, xmm7
|
|
|
|
;;; a2 = _mm_add_epi16(a2, r0); // (-p1 + p7 + p5) + (p5 >> 1)
|
|
|
|
paddw xmm0, xmm2
|
|
|
|
;;;
|
|
;;; // p3 + p5 + p1 + (p1 >> 1);
|
|
;;; r0 = _mm_srai_epi16(p1, 1); // p1 >> 1
|
|
|
|
movdqa xmm2, xmm1
|
|
psraw xmm2, 1
|
|
|
|
;;; a3 = _mm_add_epi16(p3, p5); // p3+p5
|
|
|
|
paddw xmm6, xmm7
|
|
|
|
;;; a3 = _mm_add_epi16(a3, p1); // p3 + p5 + p1
|
|
;;; a3 = _mm_add_epi16(a3, r0); //p3 + p5 + p1 + (p1 >> 1)
|
|
;;;
|
|
;;; r0 = _mm_srai_epi16(a3, 2); // a3>>2
|
|
;;; b1 = _mm_add_epi16(a0, r0); //a0 + (a3>>2);
|
|
;;; r0 = _mm_srai_epi16(a2, 2); // a2>>2
|
|
;;; b3 = _mm_add_epi16(a1, r0); // a1 + (a2>>2);
|
|
;;; a1 = _mm_srai_epi16(a1, 2); // all done with a1, so this is safe
|
|
;;; b5 = _mm_sub_epi16(a2, a1); //a2 - (a1>>2);
|
|
;;; a0 = _mm_srai_epi16(a0, 2); // all done with a0, so this is safe
|
|
;;; b7 = _mm_sub_epi16(a3, a0); //a3 - (a0>>2);
|
|
;;;
|
|
;;; r0 = _mm_add_epi16(b0, b7); // b0 + b7;
|
|
;;; r1 = _mm_sub_epi16(b2, b5); // b2 - b5;
|
|
|
|
movdqa xmm7, XMMWORD PTR [esp+128]
|
|
paddw xmm6, xmm1
|
|
paddw xmm6, xmm2
|
|
movdqa xmm1, xmm6
|
|
psraw xmm1, 2
|
|
movdqa xmm2, xmm0
|
|
paddw xmm1, xmm3
|
|
psraw xmm2, 2
|
|
paddw xmm2, xmm4
|
|
psraw xmm4, 2
|
|
psubw xmm0, xmm4
|
|
psraw xmm3, 2
|
|
psubw xmm6, xmm3
|
|
movdqa XMMWORD PTR [esp+64], xmm6
|
|
movdqa xmm3, xmm5
|
|
|
|
;;; r2 = _mm_add_epi16(b4, b3); // b4 + b3;
|
|
;;; r3 = _mm_add_epi16(b6, b1); // b6 + b1;
|
|
;;; r4 = _mm_sub_epi16(b6, b1); // b6 - b1;
|
|
;;; r5 = _mm_sub_epi16(b4, b3); // b4 - b3;
|
|
;;; r6 = _mm_add_epi16(b2, b5); // b2 + b5;
|
|
;;; r7 = _mm_sub_epi16(b0, b7); // b0 - b7;
|
|
|
|
psubw xmm5, XMMWORD PTR [esp+64]
|
|
paddw xmm3, xmm6
|
|
movdqa XMMWORD PTR [esp+144], xmm3
|
|
movdqa xmm3, xmm7
|
|
psubw xmm3, xmm0
|
|
movdqa XMMWORD PTR [esp+48], xmm3
|
|
movdqa xmm3, XMMWORD PTR [esp+80]
|
|
movdqa xmm4, xmm3
|
|
paddw xmm4, xmm2
|
|
psubw xmm3, xmm2
|
|
|
|
;;;
|
|
;;;
|
|
;;; // add in prediction values
|
|
;;; pred0 = _mm_loadl_epi64((__m128i *)(&mb_pred[0][pos_x]));
|
|
;;; pred1 = _mm_loadl_epi64((__m128i *)(&mb_pred[1][pos_x]));
|
|
;;; // (x + 32) >> 6
|
|
;;; r0 = _mm_adds_epi16(r0, const32);
|
|
|
|
movdqa xmm2, XMMWORD PTR const32
|
|
movdqa XMMWORD PTR [esp+16], xmm4
|
|
movdqa xmm4, XMMWORD PTR [esp+96]
|
|
movdqa xmm6, xmm4
|
|
paddw xmm6, xmm1
|
|
psubw xmm4, xmm1
|
|
|
|
;;; r0 = _mm_srai_epi16(r0, 6);
|
|
;;; r1 = _mm_adds_epi16(r1, const32);
|
|
|
|
movdqa xmm1, XMMWORD PTR [esp+48]
|
|
paddw xmm7, xmm0
|
|
movdqa xmm0, XMMWORD PTR [esp+144]
|
|
movdqa XMMWORD PTR [esp+128], xmm7
|
|
|
|
;;; r1 = _mm_srai_epi16(r1, 6);
|
|
;;; pred0 = _mm_unpacklo_epi8(pred0, zero); // convert to short
|
|
;;; pred1 = _mm_unpacklo_epi8(pred1, zero); // convert to short
|
|
|
|
movq xmm7, QWORD PTR [edx+16]
|
|
movdqa XMMWORD PTR [esp+32], xmm5
|
|
paddsw xmm0, xmm2
|
|
psraw xmm0, 6
|
|
paddsw xmm1, xmm2
|
|
pxor xmm2, xmm2
|
|
punpcklbw xmm7, xmm2
|
|
movq xmm5, QWORD PTR [edx]
|
|
punpcklbw xmm5, xmm2
|
|
psraw xmm1, 6
|
|
|
|
;;; pred0 = _mm_adds_epi16(pred0, r0);
|
|
;;; pred1 = _mm_adds_epi16(pred1, r1);
|
|
|
|
paddsw xmm7, xmm1
|
|
paddsw xmm5, xmm0
|
|
|
|
;;;
|
|
;;; pred0 = _mm_packus_epi16(pred0, pred1); // convert to unsigned char
|
|
|
|
packuswb xmm5, xmm7
|
|
|
|
;;;
|
|
;;; // store
|
|
;;; _mm_storel_epi64((__m128i *)(&mb_rec[0][pos_x]), pred0);
|
|
|
|
movdqa xmm0, XMMWORD PTR [esp+32]
|
|
movdqa xmm2, XMMWORD PTR [esp+128]
|
|
movq QWORD PTR [ecx], xmm5
|
|
|
|
;;; // TODO: if mb_pred was converted to 4 8x8 blocks, we could store more easily.
|
|
;;; pred0 = _mm_srli_si128(pred0, 8);
|
|
|
|
psrldq xmm5, 8
|
|
|
|
;;; _mm_storel_epi64((__m128i *)(&mb_rec[1][pos_x]), pred0);
|
|
|
|
movq QWORD PTR [ecx+16], xmm5
|
|
|
|
;;;
|
|
;;; /* --- */
|
|
;;;
|
|
;;; pred0 = _mm_loadl_epi64((__m128i *)(&mb_pred[2][pos_x]));
|
|
|
|
movq xmm1, QWORD PTR [edx+32]
|
|
|
|
;;; pred1 = _mm_loadl_epi64((__m128i *)(&mb_pred[3][pos_x]));
|
|
;;; // (x + 32) >> 6
|
|
;;; r2 = _mm_adds_epi16(r2, const32);
|
|
|
|
movdqa xmm5, XMMWORD PTR [esp]
|
|
movdqa XMMWORD PTR [esp+32], xmm0 ;
|
|
|
|
;;; r2 = _mm_srai_epi16(r2, 6);
|
|
;;; r3 = _mm_adds_epi16(r3, const32);
|
|
|
|
paddsw xmm6, xmm5
|
|
|
|
;;; r3 = _mm_srai_epi16(r3, 6);
|
|
|
|
psraw xmm6, 6
|
|
|
|
;;; pred0 = _mm_unpacklo_epi8(pred0, zero); // convert to short
|
|
|
|
pxor xmm7, xmm7
|
|
punpcklbw xmm1, xmm7
|
|
movdqa xmm0, XMMWORD PTR [esp+16]
|
|
paddsw xmm0, xmm5
|
|
psraw xmm0, 6
|
|
|
|
;;; pred1 = _mm_unpacklo_epi8(pred1, zero); // convert to short
|
|
;;; pred0 = _mm_adds_epi16(pred0, r2);
|
|
|
|
paddsw xmm1, xmm0
|
|
|
|
;;; pred1 = _mm_adds_epi16(pred1, r3);
|
|
;;;
|
|
;;; pred0 = _mm_packus_epi16(pred0, pred1); // convert to unsigned char
|
|
;;;
|
|
;;; // store
|
|
;;; _mm_storel_epi64((__m128i *)(&mb_rec[2][pos_x]), pred0);
|
|
|
|
movdqa xmm0, XMMWORD PTR [esp+32]
|
|
movq xmm5, QWORD PTR [edx+48]
|
|
punpcklbw xmm5, xmm7
|
|
paddsw xmm5, xmm6
|
|
packuswb xmm1, xmm5
|
|
movq QWORD PTR [ecx+32], xmm1
|
|
|
|
;;; // TODO: if mb_pred was converted to 4 8x8 blocks, we could store more easily.
|
|
;;; pred0 = _mm_srli_si128(pred0, 8);
|
|
|
|
psrldq xmm1, 8
|
|
|
|
;;; _mm_storel_epi64((__m128i *)(&mb_rec[3][pos_x]), pred0);
|
|
|
|
movq QWORD PTR [ecx+48], xmm1
|
|
|
|
;;;
|
|
;;; /* --- */
|
|
;;;
|
|
;;; pred0 = _mm_loadl_epi64((__m128i *)(&mb_pred[4][pos_x]));
|
|
|
|
movq xmm7, QWORD PTR [edx+64]
|
|
|
|
;;; pred1 = _mm_loadl_epi64((__m128i *)(&mb_pred[5][pos_x]));
|
|
|
|
movq xmm6, QWORD PTR [edx+80]
|
|
|
|
;;; // (x + 32) >> 6
|
|
;;; r4 = _mm_adds_epi16(r4, const32);
|
|
;;; r4 = _mm_srai_epi16(r4, 6);
|
|
;;; r5 = _mm_adds_epi16(r5, const32);
|
|
;;; r5 = _mm_srai_epi16(r5, 6);
|
|
;;; pred0 = _mm_unpacklo_epi8(pred0, zero); // convert to short
|
|
|
|
pxor xmm5, xmm5
|
|
punpcklbw xmm7, xmm5
|
|
|
|
;;; pred1 = _mm_unpacklo_epi8(pred1, zero); // convert to short
|
|
|
|
punpcklbw xmm6, xmm5
|
|
movdqa xmm1, XMMWORD PTR [esp]
|
|
paddsw xmm4, xmm1
|
|
psraw xmm4, 6
|
|
paddsw xmm3, xmm1
|
|
psraw xmm3, 6
|
|
|
|
;;; pred0 = _mm_adds_epi16(pred0, r4);
|
|
|
|
paddsw xmm7, xmm4
|
|
|
|
;;; pred1 = _mm_adds_epi16(pred1, r5);
|
|
|
|
paddsw xmm6, xmm3
|
|
|
|
;;;
|
|
;;; pred0 = _mm_packus_epi16(pred0, pred1); // convert to unsigned char
|
|
|
|
packuswb xmm7, xmm6
|
|
|
|
;;;
|
|
;;; // store
|
|
;;; _mm_storel_epi64((__m128i *)(&mb_rec[4][pos_x]), pred0);
|
|
|
|
movq QWORD PTR [ecx+64], xmm7
|
|
|
|
;;; // TODO: if mb_pred was converted to 4 8x8 blocks, we could store more easily.
|
|
;;; pred0 = _mm_srli_si128(pred0, 8);
|
|
|
|
psrldq xmm7, 8
|
|
|
|
;;; _mm_storel_epi64((__m128i *)(&mb_rec[5][pos_x]), pred0);
|
|
|
|
movq QWORD PTR [ecx+80], xmm7
|
|
|
|
|
|
;;;
|
|
;;; /* --- */
|
|
;;;
|
|
;;; pred0 = _mm_loadl_epi64((__m128i *)(&mb_pred[6][pos_x]));
|
|
|
|
movq xmm5, QWORD PTR [edx+96]
|
|
|
|
;;; pred1 = _mm_loadl_epi64((__m128i *)(&mb_pred[7][pos_x]));
|
|
|
|
movq xmm4, QWORD PTR [edx+112]
|
|
|
|
;;; // (x + 32) >> 6
|
|
;;; r6 = _mm_adds_epi16(r6, const32);
|
|
;;; r6 = _mm_srai_epi16(r6, 6);
|
|
;;; r7 = _mm_adds_epi16(r7, const32);
|
|
;;; r7 = _mm_srai_epi16(r7, 6);
|
|
;;; pred0 = _mm_unpacklo_epi8(pred0, zero); // convert to short
|
|
|
|
pxor xmm3, xmm3
|
|
punpcklbw xmm5, xmm3
|
|
|
|
;;; pred1 = _mm_unpacklo_epi8(pred1, zero); // convert to short
|
|
|
|
punpcklbw xmm4, xmm3
|
|
movdqa xmm1, XMMWORD PTR [esp]
|
|
paddsw xmm2, xmm1
|
|
psraw xmm2, 6
|
|
paddsw xmm0, xmm1
|
|
psraw xmm0, 6
|
|
|
|
;;; pred0 = _mm_adds_epi16(pred0, r6);
|
|
|
|
paddsw xmm5, xmm2
|
|
|
|
;;; pred1 = _mm_adds_epi16(pred1, r7);
|
|
|
|
paddsw xmm4, xmm0
|
|
|
|
;;;
|
|
;;; pred0 = _mm_packus_epi16(pred0, pred1); // convert to unsigned char
|
|
|
|
packuswb xmm5, xmm4
|
|
|
|
;;;
|
|
;;; // store
|
|
;;; _mm_storel_epi64((__m128i *)&mb_rec[6][pos_x], pred0);
|
|
|
|
movq QWORD PTR [ecx+96], xmm5
|
|
|
|
;;; // TODO: if mb_pred was converted to 4 8x8 blocks, we could store more easily.
|
|
;;; pred0 = _mm_srli_si128(pred0, 8);
|
|
|
|
psrldq xmm5, 8
|
|
|
|
;;; _mm_storel_epi64((__m128i *)&mb_rec[7][pos_x], pred0);
|
|
|
|
movq QWORD PTR [ecx+112], xmm5
|
|
mov esp, ebp
|
|
pop ebp
|
|
ret
|
|
ALIGN 2
|
|
_itrans8x8_sse2 ENDP
|
|
|
|
|
|
END |