
vp9: use registers for constant loading where possible.

Ronald S. Bultje · 10 years ago · tags/n3.0
commit e578638382

1 changed file with 146 additions and 122 deletions:
  libavcodec/x86/vp9itxfm_16bpp.asm (+146, -122)
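
The change, in one sentence: the 14-bit mask [pd_3fff] and the rounding constant [pd_8192] used to be fetched from memory inside every multiply/round macro; the macros now take them as optional arguments, so each x86-64 function (with 16 XMM registers available) loads them once into spare registers and passes those in, while x86-32 keeps using memory operands. A minimal sketch of the pattern, with names taken from the patch; it assumes the pd_8192/pd_3fff constants and the x86inc/cglobal framework of this file and is not itself part of the diff:

%if ARCH_X86_64
    mova           m10, [pd_8192]    ; 14-bit rounding constant, loaded once per function
    mova           m11, [pd_3fff]    ; low-14-bit mask, loaded once per function
%define reg_rnd  m10
%define reg_mask m11
%else
%define reg_rnd  [pd_8192]           ; x86-32 only has 8 XMM registers: keep memory operands
%define reg_mask [pd_3fff]
%endif
    ; the 1D transform macros then receive the constants as arguments, e.g.:
    IDCT8_1D       blockq, reg_rnd, reg_mask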

@@ -345,9 +345,9 @@ IADST4_FN iadst, IADST4, iadst, IADST4
 ;
 ; dst1 = src1 * coef1 + src2 * coef2 + rnd >> 14
 ; dst2 = src1 * coef2 - src2 * coef1 + rnd >> 14
-%macro SUMSUB_MUL 6 ; src/dst 1-2, tmp1-2, coef1-2
-pand m%3, m%1, [pd_3fff]
-pand m%4, m%2, [pd_3fff]
+%macro SUMSUB_MUL 6-8 [pd_8192], [pd_3fff] ; src/dst 1-2, tmp1-2, coef1-2, rnd, mask
+pand m%3, m%1, %8
+pand m%4, m%2, %8
 psrad m%1, 14
 psrad m%2, 14
 packssdw m%4, m%2
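
The "6-8 [pd_8192], [pd_3fff]" form above is NASM's optional-parameter syntax: the macro takes six to eight arguments, and omitted trailing arguments fall back to the listed defaults. That is what keeps the pre-existing call sites source-compatible while new call sites pass preloaded registers. A tiny illustration with a hypothetical macro (ADD_RND is not in the file; the idea is the same):

%macro ADD_RND 2-3 [pd_8192] ; dst, src, rnd (defaults to the memory constant)
    paddd          m%1, m%2
    paddd          m%1, %3
%endmacro
; ADD_RND 0, 1        expands %3 to [pd_8192], exactly as before the patch
; ADD_RND 0, 1, m10   expands %3 to a preloaded register
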
@@ -358,20 +358,20 @@ IADST4_FN iadst, IADST4, iadst, IADST4
 pmaddwd m%1, m%2, [pw_%6_%5]
 pmaddwd m%4, [pw_m%5_%6]
 pmaddwd m%2, [pw_m%5_%6]
-paddd m%3, [pd_8192]
-paddd m%4, [pd_8192]
+paddd m%3, %7
+paddd m%4, %7
 psrad m%3, 14
 psrad m%4, 14
 paddd m%1, m%3
 paddd m%2, m%4
 %endmacro

-%macro IDCT4_12BPP_1D 0-6 0, 1, 2, 3, 4, 5
-SUMSUB_MUL %1, %3, %5, %6, 11585, 11585
-SUMSUB_MUL %2, %4, %5, %6, 15137, 6270
-SUMSUB_BA d, %2, %1, %5
-SUMSUB_BA d, %4, %3, %5
-SWAP %2, %4, %1
+%macro IDCT4_12BPP_1D 0-8 [pd_8192], [pd_3fff], 0, 1, 2, 3, 4, 5 ; rnd, mask, in/out0-3, tmp0-1
+SUMSUB_MUL %3, %5, %7, %8, 11585, 11585, %1, %2
+SUMSUB_MUL %4, %6, %7, %8, 15137, 6270, %1, %2
+SUMSUB_BA d, %4, %3, %7
+SUMSUB_BA d, %6, %5, %7
+SWAP %4, %6, %3
 %endmacro

 %macro STORE_4x4 6 ; tmp1-2, reg1-2, min, max
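
For reference, the pand/psrad/pmaddwd sequence in SUMSUB_MUL is a 14-bit split of each 32-bit input so that 16-bit multiply-adds can be used: with lo = src & 0x3fff (the pd_3fff mask) and hi = src >> 14,

$$(c_1\,\mathrm{src}_1 + c_2\,\mathrm{src}_2 + 8192) \gg 14 \;=\; (c_1\,\mathrm{hi}_1 + c_2\,\mathrm{hi}_2) + \big((c_1\,\mathrm{lo}_1 + c_2\,\mathrm{lo}_2 + 8192) \gg 14\big)$$

which is the paddd/psrad-by-14 tail above; pd_8192 is the rounding term (half of 2^14) and pd_3fff is the mask. Those two constants are touched in every butterfly, which is why caching them in registers pays off.
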
@@ -433,10 +433,12 @@ cglobal vp9_idct_idct_4x4_add_12, 4, 4, 6, dst, stride, block, eob
 mova m1, [blockq+1*16]
 mova m2, [blockq+2*16]
 mova m3, [blockq+3*16]
+mova m6, [pd_8192]
+mova m7, [pd_3fff]

-IDCT4_12BPP_1D
+IDCT4_12BPP_1D m6, m7
 TRANSPOSE4x4D 0, 1, 2, 3, 4
-IDCT4_12BPP_1D
+IDCT4_12BPP_1D m6, m7

 pxor m4, m4
 ZERO_BLOCK blockq, 16, 4, m4
@@ -445,7 +447,8 @@ cglobal vp9_idct_idct_4x4_add_12, 4, 4, 6, dst, stride, block, eob
 DEFINE_ARGS dst, stride, stride3
 lea stride3q, [strideq*3]
 mova m5, [pw_4095]
-ROUND_AND_STORE_4x4 0, 1, 2, 3, m4, m5, [pd_8], 4
+mova m6, [pd_8]
+ROUND_AND_STORE_4x4 0, 1, 2, 3, m4, m5, m6, 4
 RET

 %macro SCRATCH 3-4
@@ -473,21 +476,32 @@ cglobal vp9_idct_idct_4x4_add_12, 4, 4, 6, dst, stride, block, eob
 %endif
 %endmacro

+%macro PRELOAD 2-3
+%if ARCH_X86_64
+mova m%1, [%2]
+%if %0 == 3
+%define reg_%3 m%1
+%endif
+%elif %0 == 3
+%define reg_%3 [%2]
+%endif
+%endmacro
+
 ; out0 = 5283 * in0 + 13377 + in1 + 15212 * in2 + 9929 * in3 + rnd >> 14
 ; out1 = 9929 * in0 + 13377 * in1 - 5283 * in2 - 15282 * in3 + rnd >> 14
 ; out2 = 13377 * in0 - 13377 * in2 + 13377 * in3 + rnd >> 14
 ; out3 = 15212 * in0 - 13377 * in1 + 9929 * in2 - 5283 * in3 + rnd >> 14
-%macro IADST4_12BPP_1D 0
-pand m4, m0, [pd_3fff]
-pand m5, m1, [pd_3fff]
+%macro IADST4_12BPP_1D 0-2 [pd_8192], [pd_3fff] ; rnd, mask
+pand m4, m0, %2
+pand m5, m1, %2
 psrad m0, 14
 psrad m1, 14
 packssdw m5, m1
 packssdw m4, m0
 punpckhwd m1, m4, m5
 punpcklwd m4, m5
-pand m5, m2, [pd_3fff]
-pand m6, m3, [pd_3fff]
+pand m5, m2, %2
+pand m6, m3, %2
 psrad m2, 14
 psrad m3, 14
 packssdw m6, m3
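
The PRELOAD macro added above is the per-architecture switch: given an XMM register number, a constant label and an alias, it either materialises the constant in that register or leaves it in memory. Roughly, for a call the patch makes later in this file (comments only, as a sketch):

; PRELOAD 10, pd_8192, rnd
; x86-64:  mova m10, [pd_8192]   and   %define reg_rnd m10        -> register operand from here on
; x86-32:  no instruction emitted,     %define reg_rnd [pd_8192]  -> memory operand, as before
; so a later "paddd mN, reg_rnd" touches memory only on x86-32.
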
@@ -501,29 +515,35 @@ cglobal vp9_idct_idct_4x4_add_12, 4, 4, 6, dst, stride, block, eob
 ; m4/5 have the low bits of 0,1,2,3
 ; m0/2/6/7 are free

-pmaddwd m7, reg_b, [pw_15212_9929]
-pmaddwd m6, m4, [pw_5283_13377]
-pmaddwd m2, m3, [pw_15212_9929]
-pmaddwd m0, reg_a, [pw_5283_13377]
+mova m2, [pw_15212_9929]
+mova m0, [pw_5283_13377]
+pmaddwd m7, m2, reg_b
+pmaddwd m6, m4, m0
+pmaddwd m2, m3
+pmaddwd m0, reg_a
 paddd m6, m7
 paddd m0, m2
-pmaddwd m7, reg_b, [pw_m13377_13377]
-pmaddwd m2, m4, [pw_13377_0]
-pmaddwd m1, m3, [pw_m13377_13377]
-pmaddwd m5, reg_a, [pw_13377_0]
+mova m1, [pw_m13377_13377]
+mova m5, [pw_13377_0]
+pmaddwd m7, m1, reg_b
+pmaddwd m2, m4, m5
+pmaddwd m1, m3
+pmaddwd m5, reg_a
 paddd m2, m7
 paddd m1, m5
-paddd m6, [pd_8192]
-paddd m2, [pd_8192]
+paddd m6, %1
+paddd m2, %1
 psrad m6, 14
 psrad m2, 14
 paddd m0, m6 ; t0
 paddd m2, m1 ; t2

-pmaddwd m1, reg_b, [pw_m5283_m15212]
-pmaddwd m6, m4, [pw_9929_13377]
-pmaddwd m7, m3, [pw_m5283_m15212]
-pmaddwd m5, reg_a, [pw_9929_13377]
+mova m7, [pw_m5283_m15212]
+mova m5, [pw_9929_13377]
+pmaddwd m1, m7, reg_b
+pmaddwd m6, m4, m5
+pmaddwd m7, m3
+pmaddwd m5, reg_a
 paddd m6, m1
 paddd m7, m5
 UNSCRATCH 5, 9, rsp+1*mmsize, b
@@ -534,8 +554,8 @@ cglobal vp9_idct_idct_4x4_add_12, 4, 4, 6, dst, stride, block, eob
 pmaddwd m1, [pw_15212_m13377]
 paddd m4, m5
 paddd m3, m1
-paddd m6, [pd_8192]
-paddd m4, [pd_8192]
+paddd m6, %1
+paddd m4, %1
 psrad m6, 14
 psrad m4, 14
 paddd m7, m6 ; t1
@@ -545,15 +565,17 @@ cglobal vp9_idct_idct_4x4_add_12, 4, 4, 6, dst, stride, block, eob
 %endmacro

 %macro IADST4_12BPP_FN 4
-cglobal vp9_%1_%3_4x4_add_12, 3, 3, 10, 2 * ARCH_X86_32 * mmsize, dst, stride, block, eob
+cglobal vp9_%1_%3_4x4_add_12, 3, 3, 12, 2 * ARCH_X86_32 * mmsize, dst, stride, block, eob
 mova m0, [blockq+0*16]
 mova m1, [blockq+1*16]
 mova m2, [blockq+2*16]
 mova m3, [blockq+3*16]

-%2_12BPP_1D
+PRELOAD 10, pd_8192, rnd
+PRELOAD 11, pd_3fff, mask
+%2_12BPP_1D reg_rnd, reg_mask
 TRANSPOSE4x4D 0, 1, 2, 3, 4
-%4_12BPP_1D
+%4_12BPP_1D reg_rnd, reg_mask

 pxor m4, m4
 ZERO_BLOCK blockq, 16, 4, m4
@@ -562,7 +584,8 @@ cglobal vp9_%1_%3_4x4_add_12, 3, 3, 10, 2 * ARCH_X86_32 * mmsize, dst, stride, b
 DEFINE_ARGS dst, stride, stride3
 lea stride3q, [strideq*3]
 mova m5, [pw_4095]
-ROUND_AND_STORE_4x4 0, 1, 2, 3, m4, m5, [pd_8], 4
+mova m6, [pd_8]
+ROUND_AND_STORE_4x4 0, 1, 2, 3, m4, m5, m6, 4
 RET
 %endmacro

@@ -573,30 +596,30 @@ IADST4_12BPP_FN iadst, IADST4, iadst, IADST4

 ; the following line has not been executed at the end of this macro:
 ; UNSCRATCH 6, 8, rsp+%3*mmsize
-%macro IDCT8_1D 1-3 2 * mmsize, 17 ; src, src_stride, stack_offset
-mova m0, [%1+0*%2]
-mova m2, [%1+2*%2]
-mova m4, [%1+4*%2]
-mova m6, [%1+6*%2]
-IDCT4_12BPP_1D 0, 2, 4, 6, 1, 3 ; m0/2/4/6 have t0/1/2/3
-SCRATCH 4, 8, rsp+(%3+0)*mmsize
-SCRATCH 6, 9, rsp+(%3+1)*mmsize
-mova m1, [%1+1*%2]
-mova m3, [%1+3*%2]
-mova m5, [%1+5*%2]
-mova m7, [%1+7*%2]
-SUMSUB_MUL 1, 7, 4, 6, 16069, 3196 ; m1=t7a, m7=t4a
-SUMSUB_MUL 5, 3, 4, 6, 9102, 13623 ; m5=t6a, m3=t5a
-SUMSUB_BA d, 3, 7, 4 ; m3=t4, m7=t5a
-SUMSUB_BA d, 5, 1, 4 ; m5=t7, m1=t6a
-SUMSUB_MUL 1, 7, 4, 6, 11585, 11585 ; m1=t6, m7=t5
-SUMSUB_BA d, 5, 0, 4 ; m5=out0, m0=out7
-SUMSUB_BA d, 1, 2, 4 ; m1=out1, m2=out6
-UNSCRATCH 4, 8, rsp+(%3+0)*mmsize
-UNSCRATCH 6, 9, rsp+(%3+1)*mmsize
-SCRATCH 2, 8, rsp+(%3+0)*mmsize
-SUMSUB_BA d, 7, 4, 2 ; m7=out2, m4=out5
-SUMSUB_BA d, 3, 6, 2 ; m3=out3, m6=out4
+%macro IDCT8_1D 1-5 [pd_8192], [pd_3fff], 2 * mmsize, 17 ; src, rnd, mask, src_stride, stack_offset
+mova m0, [%1+0*%4]
+mova m2, [%1+2*%4]
+mova m4, [%1+4*%4]
+mova m6, [%1+6*%4]
+IDCT4_12BPP_1D %2, %3, 0, 2, 4, 6, 1, 3 ; m0/2/4/6 have t0/1/2/3
+SCRATCH 4, 8, rsp+(%5+0)*mmsize
+SCRATCH 6, 9, rsp+(%5+1)*mmsize
+mova m1, [%1+1*%4]
+mova m3, [%1+3*%4]
+mova m5, [%1+5*%4]
+mova m7, [%1+7*%4]
+SUMSUB_MUL 1, 7, 4, 6, 16069, 3196, %2, %3 ; m1=t7a, m7=t4a
+SUMSUB_MUL 5, 3, 4, 6, 9102, 13623, %2, %3 ; m5=t6a, m3=t5a
+SUMSUB_BA d, 3, 7, 4 ; m3=t4, m7=t5a
+SUMSUB_BA d, 5, 1, 4 ; m5=t7, m1=t6a
+SUMSUB_MUL 1, 7, 4, 6, 11585, 11585, %2, %3 ; m1=t6, m7=t5
+SUMSUB_BA d, 5, 0, 4 ; m5=out0, m0=out7
+SUMSUB_BA d, 1, 2, 4 ; m1=out1, m2=out6
+UNSCRATCH 4, 8, rsp+(%5+0)*mmsize
+UNSCRATCH 6, 9, rsp+(%5+1)*mmsize
+SCRATCH 2, 8, rsp+(%5+0)*mmsize
+SUMSUB_BA d, 7, 4, 2 ; m7=out2, m4=out5
+SUMSUB_BA d, 3, 6, 2 ; m3=out3, m6=out4
 SWAP 0, 5, 4, 6, 2, 7
 %endmacro

@@ -613,23 +636,12 @@ IADST4_12BPP_FN iadst, IADST4, iadst, IADST4
 mova [%6+%7*1], m%2
 %endmacro

-%macro PRELOAD 2-3
-%if ARCH_X86_64
-mova m%1, [%2]
-%if %0 == 3
-%define reg_%3 m%1
-%endif
-%elif %0 == 3
-%define reg_%3 [%2]
-%endif
-%endmacro
-
 ; FIXME we can use the intermediate storage (rsp[0-15]) on x86-32 for temp
 ; storage also instead of allocating two more stack spaces. This doesn't
 ; matter much but it's something...
 INIT_XMM sse2
-cglobal vp9_idct_idct_8x8_add_10, 4, 6 + ARCH_X86_64, 10, \
-17 * mmsize + 2 * ARCH_X86_32 * mmsize, \
+cglobal vp9_idct_idct_8x8_add_10, 4, 6 + ARCH_X86_64, 14, \
+16 * mmsize + 3 * ARCH_X86_32 * mmsize, \
 dst, stride, block, eob
 mova m0, [pw_1023]
 cmp eobd, 1
@@ -654,7 +666,7 @@ cglobal vp9_idct_idct_8x8_add_10, 4, 6 + ARCH_X86_64, 10, \
 RET

 .idctfull:
-mova [rsp+16*mmsize], m0
+SCRATCH 0, 12, rsp+16*mmsize, max
 DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
 %if ARCH_X86_64
 mov dstbakq, dstq
@@ -669,8 +681,11 @@ cglobal vp9_idct_idct_8x8_add_10, 4, 6 + ARCH_X86_64, 10, \
 mov skipd, 2
 sub skipd, cntd
 mov ptrq, rsp
+PRELOAD 10, pd_8192, rnd
+PRELOAD 11, pd_3fff, mask
+PRELOAD 13, pd_16, srnd
 .loop_1:
-IDCT8_1D blockq
+IDCT8_1D blockq, reg_rnd, reg_mask

 TRANSPOSE4x4D 0, 1, 2, 3, 6
 mova [ptrq+ 0*mmsize], m0
@@ -709,14 +724,15 @@ cglobal vp9_idct_idct_8x8_add_10, 4, 6 + ARCH_X86_64, 10, \
 mov cntd, 2
 mov ptrq, rsp
 .loop_2:
-IDCT8_1D ptrq
+IDCT8_1D ptrq, reg_rnd, reg_mask

 pxor m6, m6
-PRELOAD 9, rsp+16*mmsize, max
-ROUND_AND_STORE_4x4 0, 1, 2, 3, m6, reg_max, [pd_16], 5
+ROUND_AND_STORE_4x4 0, 1, 2, 3, m6, reg_max, reg_srnd, 5
 lea dstq, [dstq+strideq*4]
 UNSCRATCH 0, 8, rsp+17*mmsize
-ROUND_AND_STORE_4x4 4, 5, 0, 7, m6, reg_max, [pd_16], 5
+UNSCRATCH 1, 12, rsp+16*mmsize, max
+UNSCRATCH 2, 13, pd_16, srnd
+ROUND_AND_STORE_4x4 4, 5, 0, 7, m6, m1, m2, 5
 add ptrq, 16
 %if ARCH_X86_64
 lea dstq, [dstbakq+8]
@@ -763,8 +779,8 @@ cglobal vp9_idct_idct_8x8_add_10, 4, 6 + ARCH_X86_64, 10, \
 %endmacro

 INIT_XMM sse2
-cglobal vp9_idct_idct_8x8_add_12, 4, 6 + ARCH_X86_64, 10, \
-17 * mmsize + 2 * ARCH_X86_32 * mmsize, \
+cglobal vp9_idct_idct_8x8_add_12, 4, 6 + ARCH_X86_64, 14, \
+16 * mmsize + 3 * ARCH_X86_32 * mmsize, \
 dst, stride, block, eob
 mova m0, [pw_4095]
 cmp eobd, 1
@@ -791,9 +807,9 @@ cglobal vp9_idct_idct_8x8_add_12, 4, 6 + ARCH_X86_64, 10, \
 ;
 ; dst1[hi]:dst3[lo] = src1 * coef1 + src2 * coef2
 ; dst2[hi]:dst4[lo] = src1 * coef2 - src2 * coef1
-%macro SUMSUB_MUL_D 6 ; src/dst 1-2, dst3-4, coef1-2
-pand m%3, m%1, [pd_3fff]
-pand m%4, m%2, [pd_3fff]
+%macro SUMSUB_MUL_D 6-7 [pd_3fff] ; src/dst 1-2, dst3-4, coef1-2, mask
+pand m%3, m%1, %7
+pand m%4, m%2, %7
 psrad m%1, 14
 psrad m%2, 14
 packssdw m%4, m%2
@@ -808,11 +824,11 @@ cglobal vp9_idct_idct_8x8_add_12, 4, 6 + ARCH_X86_64, 10, \

 ; dst1 = src2[hi]:src4[lo] + src1[hi]:src3[lo] + rnd >> 14
 ; dst2 = src2[hi]:src4[lo] - src1[hi]:src3[lo] + rnd >> 14
-%macro SUMSUB_PACK_D 5 ; src/dst 1-2, src3-4, tmp
+%macro SUMSUB_PACK_D 5-6 [pd_8192] ; src/dst 1-2, src3-4, tmp, rnd
 SUMSUB_BA d, %1, %2, %5
 SUMSUB_BA d, %3, %4, %5
-paddd m%3, [pd_8192]
-paddd m%4, [pd_8192]
+paddd m%3, %6
+paddd m%4, %6
 psrad m%3, 14
 psrad m%4, 14
 paddd m%1, m%3
@@ -830,17 +846,17 @@ cglobal vp9_idct_idct_8x8_add_12, 4, 6 + ARCH_X86_64, 10, \

 ; the following line has not been executed at the end of this macro:
 ; UNSCRATCH 6, 8, rsp+17*mmsize
-%macro IADST8_1D 1 ; src
+%macro IADST8_1D 1-3 [pd_8192], [pd_3fff] ; src, rnd, mask
 mova m0, [%1+ 0*mmsize]
 mova m3, [%1+ 6*mmsize]
 mova m4, [%1+ 8*mmsize]
 mova m7, [%1+14*mmsize]
-SUMSUB_MUL_D 7, 0, 1, 2, 16305, 1606 ; m7/1=t0a, m0/2=t1a
-SUMSUB_MUL_D 3, 4, 5, 6, 10394, 12665 ; m3/5=t4a, m4/6=t5a
+SUMSUB_MUL_D 7, 0, 1, 2, 16305, 1606, %3 ; m7/1=t0a, m0/2=t1a
+SUMSUB_MUL_D 3, 4, 5, 6, 10394, 12665, %3 ; m3/5=t4a, m4/6=t5a
 SCRATCH 0, 8, rsp+17*mmsize
-SUMSUB_PACK_D 3, 7, 5, 1, 0 ; m3=t0, m7=t4
+SUMSUB_PACK_D 3, 7, 5, 1, 0, %2 ; m3=t0, m7=t4
 UNSCRATCH 0, 8, rsp+17*mmsize
-SUMSUB_PACK_D 4, 0, 6, 2, 1 ; m4=t1, m0=t5
+SUMSUB_PACK_D 4, 0, 6, 2, 1, %2 ; m4=t1, m0=t5

 SCRATCH 3, 8, rsp+17*mmsize
 SCRATCH 4, 9, rsp+18*mmsize
@@ -851,26 +867,26 @@ cglobal vp9_idct_idct_8x8_add_12, 4, 6 + ARCH_X86_64, 10, \
 mova m2, [%1+ 4*mmsize]
 mova m5, [%1+10*mmsize]
 mova m6, [%1+12*mmsize]
-SUMSUB_MUL_D 5, 2, 3, 4, 14449, 7723 ; m5/8=t2a, m2/9=t3a
-SUMSUB_MUL_D 1, 6, 7, 0, 4756, 15679 ; m1/10=t6a, m6/11=t7a
+SUMSUB_MUL_D 5, 2, 3, 4, 14449, 7723, %3 ; m5/8=t2a, m2/9=t3a
+SUMSUB_MUL_D 1, 6, 7, 0, 4756, 15679, %3 ; m1/10=t6a, m6/11=t7a
 SCRATCH 2, 12, rsp+21*mmsize
-SUMSUB_PACK_D 1, 5, 7, 3, 2 ; m1=t2, m5=t6
+SUMSUB_PACK_D 1, 5, 7, 3, 2, %2 ; m1=t2, m5=t6
 UNSCRATCH 2, 12, rsp+21*mmsize
-SUMSUB_PACK_D 6, 2, 0, 4, 3 ; m6=t3, m2=t7
+SUMSUB_PACK_D 6, 2, 0, 4, 3, %2 ; m6=t3, m2=t7

 UNSCRATCH 7, 10, rsp+19*mmsize
 UNSCRATCH 0, 11, rsp+20*mmsize
 SCRATCH 1, 10, rsp+19*mmsize
 SCRATCH 6, 11, rsp+20*mmsize

-SUMSUB_MUL_D 7, 0, 3, 4, 15137, 6270 ; m7/8=t4a, m0/9=t5a
-SUMSUB_MUL_D 2, 5, 1, 6, 6270, 15137 ; m2/10=t7a, m5/11=t6a
+SUMSUB_MUL_D 7, 0, 3, 4, 15137, 6270, %3 ; m7/8=t4a, m0/9=t5a
+SUMSUB_MUL_D 2, 5, 1, 6, 6270, 15137, %3 ; m2/10=t7a, m5/11=t6a
 SCRATCH 2, 12, rsp+21*mmsize
-SUMSUB_PACK_D 5, 7, 6, 3, 2 ; m5=-out1, m7=t6
+SUMSUB_PACK_D 5, 7, 6, 3, 2, %2 ; m5=-out1, m7=t6
 UNSCRATCH 2, 12, rsp+21*mmsize
 NEGD m5 ; m5=out1
-SUMSUB_PACK_D 2, 0, 1, 4, 3 ; m2=out6, m0=t7
-SUMSUB_MUL 7, 0, 3, 4, 11585, 11585 ; m7=out2, m0=-out5
+SUMSUB_PACK_D 2, 0, 1, 4, 3, %2 ; m2=out6, m0=t7
+SUMSUB_MUL 7, 0, 3, 4, 11585, 11585, %2, %3 ; m7=out2, m0=-out5
 NEGD m0 ; m0=out5

 UNSCRATCH 3, 8, rsp+17*mmsize
@@ -883,7 +899,7 @@ cglobal vp9_idct_idct_8x8_add_12, 4, 6 + ARCH_X86_64, 10, \
 SUMSUB_BA d, 1, 3, 2 ; m1=out0, m3=t2
 SUMSUB_BA d, 6, 4, 2 ; m6=-out7, m4=t3
 NEGD m6 ; m6=out7
-SUMSUB_MUL 3, 4, 2, 0, 11585, 11585 ; m3=-out3, m4=out4
+SUMSUB_MUL 3, 4, 2, 0, 11585, 11585, %2, %3 ; m3=-out3, m4=out4
 NEGD m3 ; m3=out3

 UNSCRATCH 0, 9, rsp+18*mmsize
@@ -899,7 +915,7 @@ cglobal vp9_%1_%3_8x8_add_10, 4, 6 + ARCH_X86_64, 16, \
 mova m0, [pw_1023]

 .body:
-mova [rsp+16*mmsize], m0
+SCRATCH 0, 13, rsp+16*mmsize, max
 DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
 %if ARCH_X86_64
 mov dstbakq, dstq
@@ -914,8 +930,10 @@ cglobal vp9_%1_%3_8x8_add_10, 4, 6 + ARCH_X86_64, 16, \
 mov skipd, 2
 sub skipd, cntd
 mov ptrq, rsp
+PRELOAD 14, pd_8192, rnd
+PRELOAD 15, pd_3fff, mask
 .loop_1:
-%2_1D blockq
+%2_1D blockq, reg_rnd, reg_mask

 TRANSPOSE4x4D 0, 1, 2, 3, 6
 mova [ptrq+ 0*mmsize], m0
@@ -954,14 +972,16 @@ cglobal vp9_%1_%3_8x8_add_10, 4, 6 + ARCH_X86_64, 16, \
 mov cntd, 2
 mov ptrq, rsp
 .loop_2:
-%4_1D ptrq
+%4_1D ptrq, reg_rnd, reg_mask

 pxor m6, m6
-PRELOAD 9, rsp+16*mmsize, max
-ROUND_AND_STORE_4x4 0, 1, 2, 3, m6, reg_max, [pd_16], 5
+PRELOAD 9, pd_16, srnd
+ROUND_AND_STORE_4x4 0, 1, 2, 3, m6, reg_max, reg_srnd, 5
 lea dstq, [dstq+strideq*4]
 UNSCRATCH 0, 8, rsp+17*mmsize
-ROUND_AND_STORE_4x4 4, 5, 0, 7, m6, reg_max, [pd_16], 5
+UNSCRATCH 1, 13, rsp+16*mmsize, max
+UNSCRATCH 2, 9, pd_16, srnd
+ROUND_AND_STORE_4x4 4, 5, 0, 7, m6, m1, m2, 5
 add ptrq, 16
 %if ARCH_X86_64
 lea dstq, [dstbakq+8]
@@ -989,7 +1009,7 @@ IADST8_FN iadst, IADST8, idct, IDCT8, col
 IADST8_FN iadst, IADST8, iadst, IADST8, default

 %macro IDCT16_1D 1-4 4 * mmsize, 65, 67 ; src, src_stride, stack_offset, mm32bit_stack_offset
-IDCT8_1D %1, %2 * 2, %4 ; m0-3=t0-3a, m4-5/m8|r67/m7=t4-7
+IDCT8_1D %1, [pd_8192], [pd_3fff], %2 * 2, %4 ; m0-3=t0-3a, m4-5/m8|r67/m7=t4-7
 ; SCRATCH 6, 8, rsp+(%4+0)*mmsize ; t6
 SCRATCH 0, 15, rsp+(%4+7)*mmsize ; t0a
 SCRATCH 1, 14, rsp+(%4+6)*mmsize ; t1a
@@ -1186,7 +1206,9 @@ cglobal vp9_idct_idct_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \
 ROUND_AND_STORE_4x4 0, 1, 2, 3, m7, [rsp+64*mmsize], [pd_32], 6
 lea dstq, [dstq+strideq*4]
 mova m0, [rsp+65*mmsize]
-ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, [rsp+64*mmsize], [pd_32], 6
+mova m1, [rsp+64*mmsize]
+mova m2, [pd_32]
+ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, m1, m2, 6

 %if ARCH_X86_64
 DEFINE_ARGS dstbak, stride, block, cnt, ptr, stride3, dst
@@ -1194,10 +1216,10 @@ cglobal vp9_idct_idct_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \
 mov dstq, dstm
 %endif
 UNSCRATCH 0, 8, rsp+67*mmsize
-UNSCRATCH 1, 9, rsp+68*mmsize
-UNSCRATCH 2, 10, rsp+69*mmsize
+UNSCRATCH 4, 9, rsp+68*mmsize
+UNSCRATCH 5, 10, rsp+69*mmsize
 UNSCRATCH 3, 11, rsp+70*mmsize
-ROUND_AND_STORE_4x4 0, 1, 2, 3, m7, [rsp+64*mmsize], [pd_32], 6
+ROUND_AND_STORE_4x4 0, 4, 5, 3, m7, m1, m2, 6
 %if ARCH_X86_64
 DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
 lea dstq, [dstbakq+stride3q*4]
@@ -1208,7 +1230,7 @@ cglobal vp9_idct_idct_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \
 UNSCRATCH 5, 13, rsp+72*mmsize
 UNSCRATCH 6, 14, rsp+73*mmsize
 UNSCRATCH 0, 15, rsp+74*mmsize
-ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, [rsp+64*mmsize], [pd_32], 6
+ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, m1, m2, 6

 add ptrq, mmsize
 %if ARCH_X86_64
@@ -1501,7 +1523,9 @@ cglobal vp9_%1_%4_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \
 ROUND_AND_STORE_4x4 0, 1, 2, 3, m7, [rsp+64*mmsize], [pd_32], 6
 lea dstq, [dstq+strideq*4]
 mova m0, [rsp+65*mmsize]
-ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, [rsp+64*mmsize], [pd_32], 6
+mova m1, [rsp+64*mmsize]
+mova m2, [pd_32]
+ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, m1, m2, 6

 %if ARCH_X86_64
 DEFINE_ARGS dstbak, stride, block, cnt, ptr, stride3, dst
@@ -1509,10 +1533,10 @@ cglobal vp9_%1_%4_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \
 mov dstq, dstm
 %endif
 UNSCRATCH 0, 8, rsp+(%6+0)*mmsize
-UNSCRATCH 1, 9, rsp+(%6+1)*mmsize
-UNSCRATCH 2, 10, rsp+(%6+2)*mmsize
+UNSCRATCH 4, 9, rsp+(%6+1)*mmsize
+UNSCRATCH 5, 10, rsp+(%6+2)*mmsize
 UNSCRATCH 3, 11, rsp+(%6+3)*mmsize
-ROUND_AND_STORE_4x4 0, 1, 2, 3, m7, [rsp+64*mmsize], [pd_32], 6
+ROUND_AND_STORE_4x4 0, 4, 5, 3, m7, m1, m2, 6
 %if ARCH_X86_64
 DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
 lea dstq, [dstbakq+stride3q*4]
@@ -1523,7 +1547,7 @@ cglobal vp9_%1_%4_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \
 UNSCRATCH 5, 13, rsp+(%6+5)*mmsize
 UNSCRATCH 6, 14, rsp+(%6+6)*mmsize
 UNSCRATCH 0, 15, rsp+(%6+7)*mmsize
-ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, [rsp+64*mmsize], [pd_32], 6
+ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, m1, m2, 6

 add ptrq, mmsize
 %if ARCH_X86_64

