|
|
@@ -142,6 +142,9 @@ filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12 |
|
|
|
filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11

; 16-bit multipliers for the VP8 inverse DCT, replicated four times so that a
; single 8-byte load fills every word lane of an MMX register.
;   20091 is applied as  x + ((x * 20091) >> 16)   (pmulhw followed by paddw).
;   17734 is 35468 / 2:  35468 does not fit in a signed 16-bit word, and pmulhw
;   is a signed multiply, so the input is doubled first and multiplied by half
;   the constant:  ((2 * x) * 17734) >> 16.
pw_20091: times 4 dw 20091
pw_17734: times 4 dw 17734

cextern pw_3
cextern pw_4
cextern pw_64
|
|
@@ -923,6 +926,92 @@ cglobal vp8_idct_dc_add_sse4, 3, 3, 6 |
|
|
|
pextrd [r1+r2], xmm2, 3 |
|
|
|
RET |
|
|
|
|
|
|
|
;----------------------------------------------------------------------------- |
|
|
|
; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride); |
|
|
|
;----------------------------------------------------------------------------- |
|
|
|
|
|
|
|
; SUMSUB a, b, tmp
;   a   <- a + b
;   b   <- b - a        (a as it was on entry)
;   tmp <- a on entry   (scratch register, clobbered)
; Packed 16-bit arithmetic (paddw/psubw), so each lane wraps on overflow.
; NOTE(review): no use of this macro is visible in this part of the file (the
; 1-D transform macro calls SUMSUB_BA instead) - confirm it is still needed.
%macro SUMSUB 3
    mova      %3, %1            ; save the incoming a for the subtraction
    paddw     %1, %2            ; a = a + b
    psubw     %2, %3            ; b = b - (incoming a)
%endmacro
|
|
|
|
|
|
|
; VP8_MULTIPLY_SUMSUB a, b, t0, t1
;   With  mul_20091(x) = x + ((x * 20091) >> 16)
;         mul_35468(x) = ((2 * x) * 17734) >> 16
;   computes, from the values a and b hold on entry:
;         a <- mul_35468(a) - mul_20091(b)
;         b <- mul_20091(a) + mul_35468(b)
;   t0 and t1 are scratch registers and are clobbered.
; Preconditions: m6 holds words of 20091 and m7 holds words of 17734.
; The doubling for mul_35468 is a 16-bit shift (psllw), so it only equals
; (x * 35468) >> 16 while 2 * x still fits in a signed 16-bit word.
%macro VP8_MULTIPLY_SUMSUB 4
    mova      %3, %1
    mova      %4, %2
    pmulhw    %3, m6            ; t0 = (a * 20091) >> 16
    pmulhw    %4, m6            ; t1 = (b * 20091) >> 16
    paddw     %3, %1            ; t0 = mul_20091(a)
    paddw     %4, %2            ; t1 = mul_20091(b)
    psllw     %1, 1             ; a = 2 * a, so that 17734 acts as 35468
    psllw     %2, 1             ; b = 2 * b
    pmulhw    %1, m7            ; a = mul_35468(a)
    pmulhw    %2, m7            ; b = mul_35468(b)
    psubw     %1, %4            ; a = mul_35468(a) - mul_20091(b)
    paddw     %2, %3            ; b = mul_35468(b) + mul_20091(a)
%endmacro
|
|
|
|
|
|
|
; VP8_IDCT_TRANSFORM4x4_1D i0, i1, i2, i3, t0, t1
;   Arguments are register *numbers*; the macro prefixes them with "m".
;   Intermediate values, from the inputs held in m<i0>..m<i3>:
;       x0 = i0 + i2
;       x1 = i0 - i2
;       x2 = mul_35468(i1) - mul_20091(i3)
;       x3 = mul_20091(i1) + mul_35468(i3)
;   Results, after the two register renames at the end:
;       %1 = x0 + x3  (tmp0)      %2 = x1 + x2  (tmp1)
;       %3 = x1 - x2  (tmp2)      %4 = x0 - x3  (tmp3)
;   %5 and %6 name scratch registers.
; Preconditions: m6/m7 hold the constant words 20091/17734.
; SUMSUB_BA is defined outside this excerpt; the per-line notes below follow
; from the contract stated above (first operand = sum, second = difference).
%macro VP8_IDCT_TRANSFORM4x4_1D 6
    SUMSUB_BA           m%3, m%1, m%5           ; %3 = x0,      %1 = x1
    VP8_MULTIPLY_SUMSUB m%2, m%4, m%5, m%6      ; %2 = x2,      %4 = x3
    SUMSUB_BA           m%4, m%3, m%5           ; %4 = x0 + x3, %3 = x0 - x3
    SUMSUB_BA           m%2, m%1, m%5           ; %2 = x1 + x2, %1 = x1 - x2
    SWAP                %4, %1                  ; %1 = tmp0, %4 = tmp2
    SWAP                %4, %3                  ; %3 = tmp2, %4 = tmp3
%endmacro
|
|
|
|
|
|
|
; TRANSPOSE4x4 a, b, c, d, t
;   Arguments are register *numbers*. Transposes a 4x4 table of 16-bit words
;   held one row per register (A in m<a>, B in m<b>, C in m<c>, D in m<d>),
;   as laid out when each register holds four words (MMX registers).
;   The unpack sequence leaves col0/col1/col2/col3 in the registers that
;   entered as %1/%4/%5/%3. The three SWAPs then rename them, so on exit
;   %1, %2, %3, %4 name col0, col1, col2, col3 and %5 names a leftover
;   scratch register.
; NOTE(review): no use of this macro is visible in this part of the file (the
; function below calls TRANSPOSE4x4W) - confirm it is still needed.
%macro TRANSPOSE4x4 5
    mova      m%5, m%1
    punpcklwd m%1, m%2          ; A0 B0 A1 B1
    punpckhwd m%5, m%2          ; A2 B2 A3 B3
    mova      m%2, m%3
    punpcklwd m%3, m%4          ; C0 D0 C1 D1
    punpckhwd m%2, m%4          ; C2 D2 C3 D3
    mova      m%4, m%1
    punpckldq m%1, m%3          ; col0 = A0 B0 C0 D0
    punpckhdq m%4, m%3          ; col1 = A1 B1 C1 D1
    mova      m%3, m%5
    punpckldq m%5, m%2          ; col2 = A2 B2 C2 D2
    punpckhdq m%3, m%2          ; col3 = A3 B3 C3 D3
    SWAP      %4, %2            ; %2 = col1
    SWAP      %4, %5            ; %4 = col2 (for now), %5 = scratch
    SWAP      %4, %3            ; %3 = col2, %4 = col3
%endmacro
|
|
|
|
|
|
|
INIT_MMX
; MMX version of vp8_idct_add: runs a 4x4 inverse transform on `block` and
; hands the result to STORE_DIFFx2 for the rows at dst, dst + stride,
; dst + 2*stride and dst + 3*stride.
;   r0 = dst, r1 = block, r2 = stride  (argument order of the C prototype)
; NOTE(review): the prototype declares stride as int, yet r2 is used as a
; full-width address component; on x86-64 confirm that its upper half is
; guaranteed to be extended, or widen it at function entry.
cglobal vp8_idct_add_mmx, 3, 3
    ; load the 16 coefficients, four words per register
    movq      m0, [r1]
    movq      m1, [r1+8]
    movq      m2, [r1+16]
    movq      m3, [r1+24]
    ; constants required by VP8_IDCT_TRANSFORM4x4_1D
    movq      m6, [pw_20091]
    movq      m7, [pw_17734]

    ; two 1-D passes with a transpose after each one
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W            0, 1, 2, 3, 4
    ; pw_4 is an external symbol, presumably words of 4: adding it to the
    ; first input of the second pass biases every output (all of them include
    ; x0 or x1, which include m0), which looks like rounding for the shift by
    ; 3 passed to STORE_DIFFx2 - confirm against that macro.
    paddw     m0, [pw_4]
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W            0, 1, 2, 3, 4

    ; store: m4 is cleared for STORE_DIFFx2; m6/m7 are free again and are
    ; passed as its temporaries. r1 (block) is no longer needed, so it is
    ; reused to address the third destination row.
    pxor      m4, m4
    lea       r1, [r0+r2*2]
    STORE_DIFFx2 m0, m1, m6, m7, m4, 3, r0, r2
    STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2

    RET
|
|
|
|
|
|
|
;----------------------------------------------------------------------------- |
|
|
|
; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16]) |
|
|
|
;----------------------------------------------------------------------------- |
|
|
|