Mainly ported from 8-bit H.264 predict. Some code ported from x264. LGPL ok by author.
Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
tags/n0.9
| @@ -29,11 +29,19 @@ SECTION_RODATA | |||||
| SECTION .text | SECTION .text | ||||
| cextern pw_16 | |||||
| cextern pw_8 | cextern pw_8 | ||||
| cextern pw_4 | cextern pw_4 | ||||
| cextern pw_2 | cextern pw_2 | ||||
| cextern pw_1 | cextern pw_1 | ||||
; Constant tables for the 10-bit prediction routines in this file.
; "pw_" tables hold 8 words (16-bit lanes), "pd_" tables hold 4 dwords (32-bit lanes).
;
; pw_m32101234: per-column weights -3..4. pred8x8_plane uses it twice: as the
;               pmaddwd weights applied to the top row, and as the pmullw factor
;               that scales the horizontal gradient b for each output column.
| pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4 | |||||
; pw_m3: -3 in every word; pred8x8_plane multiplies the vertical gradient c by it
;        to form the starting row term.
| pw_m3: times 8 dw -3 | |||||
; pw_pixel_max: (1 << 10) - 1 = 1023 in every word; handed to CLIPW as the upper
;               bound in pred8x8_plane.
| pw_pixel_max: times 8 dw ((1 << 10)-1) | |||||
; pw_512: 512 (half of 1 << 10) in every word; the value the *_128_dc routines
;         store into every row.
| pw_512: times 8 dw 512 | |||||
; pd_17 / pd_16: multiplier and rounding term of the (17*x + 16) >> 5 scaling that
;                pred8x8_plane applies to H and V.
| pd_17: times 4 dd 17 | |||||
| pd_16: times 4 dd 16 | |||||
| ; dest, left, right, src | ; dest, left, right, src | ||||
| ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2 | ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2 | ||||
| %macro PRED4x4_LOWPASS 4 | %macro PRED4x4_LOWPASS 4 | ||||
| @@ -464,7 +472,92 @@ PRED8x8_TOP_DC mmxext, pshufw | |||||
| INIT_XMM | INIT_XMM | ||||
| PRED8x8_TOP_DC sse2 , pshuflw | PRED8x8_TOP_DC sse2 , pshuflw | ||||
| ;----------------------------------------------------------------------------- | |||||
| ; void pred8x8_plane(pixel *src, int stride) | |||||
| ;----------------------------------------------------------------------------- | |||||
; 8x8 plane prediction on 16-bit samples (SSE2). For column x and row y (0..7)
; the code below stores
;     clip(( 16*(src[7*stride-1] + src[-stride+7]) + b*(x-3) + c*(y-3) + 16 ) >> 5)
; with b = (17*H + 16) >> 5 and c = (17*V + 16) >> 5, where H is a weighted
; difference along the row above the block and V the same along the column to
; its left. src[...] indices in the comments are in samples; r1 is used as a
; byte offset between rows.
;
; NOTE(review): r0 = src and r1 = stride is taken from the prototype above and
; relies on the cglobal macro's argument mapping; cglobal, HADDD, SPLATW, CLIPW
; and REP_RET are defined outside this chunk, so their described effect below is
; inferred from how their results are used -- confirm against their definitions.
;
; Register roles:
;   r0      = src - stride (row above the block) until the final "add r0, r1"
;   r2      = 3*stride, later reused as the row counter
;   r3      = r0 + 4*stride
;   r4d     = V accumulator, r5d/r6d = scratch for left-column samples
;   m2      = H, then (b, c), then b*(x-3) per column
;   m0      = 16*(src[7*stride-1] + src[-stride+7]) in every word
;   m4 / m5 = c / running row term c*(y-3) + 16
;   m1 / m3 = zero / pw_pixel_max (clip bounds)
| INIT_XMM | |||||
| cglobal pred8x8_plane_10_sse2, 2,7,7 | |||||
| sub r0, r1 | |||||
| lea r2, [r1+r1*2] | |||||
| lea r3, [r0+r1*4] | |||||
; H, part 1: weight the eight samples of the row above by -3..4 (pmaddwd sums
; adjacent pairs into dwords); HADDD presumably folds the four dwords into one.
| mova m2, [r0] | |||||
| pmaddwd m2, [pw_m32101234] | |||||
| HADDD m2, m1 | |||||
; H, part 2: the dword at r0-4 holds src[-stride-2] (low word) and src[-stride-1]
; (high word). Shifting right by 14 leaves 4*src[-stride-1] in the low word as
; long as src[-stride-2] < 1 << 14, which holds if samples are 10-bit.
| movd m0, [r0-4] | |||||
| psrld m0, 14 | |||||
| psubw m2, m0 ; H | |||||
; Corner term: word 1 of each loaded dword is the sample of interest
; (src[7*stride-1] and src[-stride+7]); word 1 is broadcast further down.
| movd m0, [r3+r1*4-4] | |||||
| movd m1, [r0+12] | |||||
| paddw m0, m1 | |||||
| psllw m0, 4 ; 16*(src[7*stride-1] + src[-stride+7]) | |||||
; V = 1*(src[4*stride-1] - src[2*stride-1]) + 2*(src[5*stride-1] - src[1*stride-1])
;   + 3*(src[6*stride-1] - src[0*stride-1]) + 4*(src[7*stride-1] - src[-stride-1]),
; accumulated in r4d with lea providing the *2, *3 and *4 scalings.
| movzx r4d, word [r3+r1*1-2] ; src[4*stride-1] | |||||
| movzx r5d, word [r0+r2*1-2] ; src[2*stride-1] | |||||
| sub r4d, r5d | |||||
| movzx r6d, word [r3+r1*2-2] ; src[5*stride-1] | |||||
| movzx r5d, word [r0+r1*2-2] ; src[1*stride-1] | |||||
| sub r6d, r5d | |||||
| lea r4d, [r4+r6*2] | |||||
| movzx r5d, word [r3+r2*1-2] ; src[6*stride-1] | |||||
| movzx r6d, word [r0+r1*1-2] ; src[0*stride-1] | |||||
| sub r5d, r6d | |||||
| lea r5d, [r5+r5*2] | |||||
| add r4d, r5d | |||||
| movzx r6d, word [r3+r1*4-2] ; src[7*stride-1] | |||||
| movzx r5d, word [r0+r1*0-2] ; src[ -stride-1] | |||||
| sub r6d, r5d | |||||
| lea r4d, [r4+r6*4] | |||||
| movd m3, r4d ; V | |||||
; Pack H and V into dwords 0 and 1. pd_17 is (17, 0) as a word pair, so pmaddwd
; multiplies only the signed low word of each dword by 17; then round and shift.
| punpckldq m2, m3 | |||||
| pmaddwd m2, [pd_17] | |||||
| paddd m2, [pd_16] | |||||
| psrad m2, 5 ; b, c | |||||
| mova m3, [pw_pixel_max] | |||||
| pxor m1, m1 | |||||
; SPLATW presumably broadcasts the selected word: word 1 of m0 (corner term),
; word 2 of m2 (low half of dword 1 = c), word 0 of m2 (b).
| SPLATW m0, m0, 1 | |||||
| SPLATW m4, m2, 2 | |||||
| SPLATW m2, m2, 0 | |||||
| pmullw m2, [pw_m32101234] ; b | |||||
| pmullw m5, m4, [pw_m3] ; c | |||||
| paddw m5, [pw_16] | |||||
; Eight rows; r0 is advanced from the row above the block to the block's first row.
| mov r2d, 8 | |||||
| add r0, r1 | |||||
| .loop: | |||||
; Row value = column term + row term + corner term (saturating adds), >> 5,
; then CLIPW against m1 (zero) and m3 (pw_pixel_max) before the store.
| paddsw m6, m2, m5 | |||||
| paddsw m6, m0 | |||||
| psraw m6, 5 | |||||
| CLIPW m6, m1, m3 | |||||
| mova [r0], m6 | |||||
; Step the row term by c and move to the next row.
| paddw m5, m4 | |||||
| add r0, r1 | |||||
| dec r2d | |||||
| jg .loop | |||||
| REP_RET | |||||
| ;----------------------------------------------------------------------------- | |||||
| ; void pred8x8l_128_dc(pixel *src, int has_topleft, int has_topright, int stride) | |||||
| ;----------------------------------------------------------------------------- | |||||
; Fills eight rows with the constant 512 (pw_512); no neighbouring samples are
; read. %1 is the instruction-set suffix appended to the function name.
;
; has_topleft / has_topright are never read: r1 is overwritten with 3*stride and
; r2 with src + 4*stride so that all eight rows can be addressed from r0 and r2.
; NOTE(review): r0 = src and r3 = stride is taken from the prototype above and
; relies on cglobal's argument mapping. MOV8 is defined outside this chunk;
; it presumably stores one 8-sample row from the given register(s) -- confirm.
| %macro PRED8x8L_128_DC 1 | |||||
| cglobal pred8x8l_128_dc_10_%1, 4,4 | |||||
| mova m0, [pw_512] | |||||
| lea r1, [r3+r3*2] | |||||
| lea r2, [r0+r3*4] | |||||
; Rows 0-3 are addressed from r0, rows 4-7 from r2.
| MOV8 r0+r3*0, m0, m0 | |||||
| MOV8 r0+r3*1, m0, m0 | |||||
| MOV8 r0+r3*2, m0, m0 | |||||
| MOV8 r0+r1*1, m0, m0 | |||||
| MOV8 r2+r3*0, m0, m0 | |||||
| MOV8 r2+r3*1, m0, m0 | |||||
| MOV8 r2+r3*2, m0, m0 | |||||
| MOV8 r2+r1*1, m0, m0 | |||||
| RET | |||||
| %endmacro | |||||
; Instantiate the macro once per register width: pred8x8l_128_dc_10_mmxext
; under INIT_MMX and pred8x8l_128_dc_10_sse2 under INIT_XMM.
| INIT_MMX | |||||
| PRED8x8L_128_DC mmxext | |||||
| INIT_XMM | |||||
| PRED8x8L_128_DC sse2 | |||||
| ;----------------------------------------------------------------------------- | ;----------------------------------------------------------------------------- | ||||
| ; void pred8x8l_top_dc(pixel *src, int has_topleft, int has_topright, int stride) | ; void pred8x8l_top_dc(pixel *src, int has_topleft, int has_topright, int stride) | ||||
| @@ -1258,7 +1351,7 @@ cglobal pred16x16_horizontal_10_%1, 2,3 | |||||
| MOV16 r0+r1*1, m1, m1, m1, m1 | MOV16 r0+r1*1, m1, m1, m1, m1 | ||||
| lea r0, [r0+r1*2] | lea r0, [r0+r1*2] | ||||
| dec r2 | dec r2 | ||||
| jge .vloop | |||||
| jg .vloop | |||||
| REP_RET | REP_RET | ||||
| %endmacro | %endmacro | ||||
| @@ -1266,3 +1359,139 @@ INIT_MMX | |||||
| PRED16x16_HORIZONTAL mmxext | PRED16x16_HORIZONTAL mmxext | ||||
| INIT_XMM | INIT_XMM | ||||
| PRED16x16_HORIZONTAL sse2 | PRED16x16_HORIZONTAL sse2 | ||||
| ;----------------------------------------------------------------------------- | |||||
| ; void pred16x16_dc(pixel *src, int stride) | |||||
| ;----------------------------------------------------------------------------- | |||||
; 16x16 DC prediction on 16-bit samples: every output sample is
;     (sum of the 16 samples above + sum of the 16 samples to the left + 16) >> 5.
; %1 is the instruction-set suffix appended to the function name.
;
; NOTE(review): r0 = src and r1 = stride is taken from the prototype above and
; relies on cglobal's argument mapping. HADDW, SPLATW, MOV16 and REP_RET are
; defined outside this chunk: HADDW presumably reduces the words of m0 to one
; sum (m2 as scratch), SPLATW presumably broadcasts the low word, and MOV16
; presumably stores one 16-sample row -- confirm against their definitions.
; The sums are kept in 16-bit lanes, which fits if samples are at most 10 bits.
| %macro PRED16x16_DC 1 | |||||
| cglobal pred16x16_dc_10_%1, 2,7 | |||||
; Keep the block pointer in r4 for the store loop; r0 walks the neighbours.
| mov r4, r0 | |||||
| sub r0, r1 | |||||
; Top row: 32 bytes starting at src - stride, which takes two loads with
; 16-byte registers and four with 8-byte registers.
| mova m0, [r0+0] | |||||
| paddw m0, [r0+mmsize] | |||||
| %if mmsize==8 | |||||
| paddw m0, [r0+16] | |||||
| paddw m0, [r0+24] | |||||
| %endif | |||||
| HADDW m0, m2 | |||||
; Left column: r0 now points one sample left of the row above, so r0+stride is
; src[-1]. Two accumulators (r3d, r5d) take alternate rows; the %rep block
; advances two rows per repetition, covering 16 rows in total.
| sub r0, 2 | |||||
| movzx r3d, word [r0+r1*1] | |||||
| movzx r5d, word [r0+r1*2] | |||||
| %rep 7 | |||||
| lea r0, [r0+r1*2] | |||||
| movzx r2d, word [r0+r1*1] | |||||
| add r3d, r2d | |||||
| movzx r2d, word [r0+r1*2] | |||||
| add r5d, r2d | |||||
| %endrep | |||||
; Left sum plus the rounding term 16, then combined with the top sum and
; divided by 32.
| lea r3d, [r3+r5+16] | |||||
| movd m1, r3d | |||||
| paddw m0, m1 | |||||
| psrlw m0, 5 | |||||
| SPLATW m0, m0 | |||||
; 8 iterations x 2 rows = 16 rows.
| mov r3d, 8 | |||||
| .loop: | |||||
| MOV16 r4+r1*0, m0, m0, m0, m0 | |||||
| MOV16 r4+r1*1, m0, m0, m0, m0 | |||||
| lea r4, [r4+r1*2] | |||||
| dec r3d | |||||
| jg .loop | |||||
| REP_RET | |||||
| %endmacro | |||||
; Instantiate pred16x16_dc_10_mmxext (INIT_MMX) and pred16x16_dc_10_sse2 (INIT_XMM).
| INIT_MMX | |||||
| PRED16x16_DC mmxext | |||||
| INIT_XMM | |||||
| PRED16x16_DC sse2 | |||||
| ;----------------------------------------------------------------------------- | |||||
| ; void pred16x16_top_dc(pixel *src, int stride) | |||||
| ;----------------------------------------------------------------------------- | |||||
; 16x16 DC prediction from the top neighbours only: every output sample is
;     (sum of the 16 samples in the row above + 8) >> 4.
; %1 is the instruction-set suffix appended to the function name.
;
; NOTE(review): r0 = src and r1 = stride is taken from the prototype above and
; relies on cglobal's argument mapping. HADDW, SPLATW, MOV16 and REP_RET are
; defined outside this chunk; HADDW presumably reduces the words of m0 to one
; sum, SPLATW presumably broadcasts it, MOV16 presumably stores one 16-sample
; row -- confirm against their definitions.
| %macro PRED16x16_TOP_DC 1 | |||||
| cglobal pred16x16_top_dc_10_%1, 2,3 | |||||
; r0 = row above the block; it is not moved back, so the store loop below
; addresses rows at +1 and +2 strides from it.
| sub r0, r1 | |||||
; 32 bytes of top row: two loads with 16-byte registers, four with 8-byte ones.
| mova m0, [r0+0] | |||||
| paddw m0, [r0+mmsize] | |||||
| %if mmsize==8 | |||||
| paddw m0, [r0+16] | |||||
| paddw m0, [r0+24] | |||||
| %endif | |||||
| HADDW m0, m2 | |||||
; Broadcast first, then round and divide by 16 in every lane.
| SPLATW m0, m0 | |||||
| paddw m0, [pw_8] | |||||
| psrlw m0, 4 | |||||
; 8 iterations x 2 rows = 16 rows.
| mov r2d, 8 | |||||
| .loop: | |||||
| MOV16 r0+r1*1, m0, m0, m0, m0 | |||||
| MOV16 r0+r1*2, m0, m0, m0, m0 | |||||
| lea r0, [r0+r1*2] | |||||
| dec r2d | |||||
| jg .loop | |||||
| REP_RET | |||||
| %endmacro | |||||
; Instantiate pred16x16_top_dc_10_mmxext (INIT_MMX) and pred16x16_top_dc_10_sse2 (INIT_XMM).
| INIT_MMX | |||||
| PRED16x16_TOP_DC mmxext | |||||
| INIT_XMM | |||||
| PRED16x16_TOP_DC sse2 | |||||
| ;----------------------------------------------------------------------------- | |||||
| ; void pred16x16_left_dc(pixel *src, int stride) | |||||
| ;----------------------------------------------------------------------------- | |||||
; 16x16 DC prediction from the left neighbours only: every output sample is
;     (sum of the 16 samples in the column left of the block + 8) >> 4.
; %1 is the instruction-set suffix appended to the function name.
;
; NOTE(review): r0 = src and r1 = stride is taken from the prototype above and
; relies on cglobal's argument mapping. SPLATW, MOV16 and REP_RET are defined
; outside this chunk; SPLATW presumably broadcasts the low word and MOV16
; presumably stores one 16-sample row -- confirm against their definitions.
| %macro PRED16x16_LEFT_DC 1 | |||||
| cglobal pred16x16_left_dc_10_%1, 2,7 | |||||
; Keep the block pointer in r4 for the store loop; r0 steps back one 16-bit
; sample so that [r0] is src[-1].
| mov r4, r0 | |||||
| sub r0, 2 | |||||
; Two accumulators (r5d, r6d) take alternate rows; with the seven repetitions
; below, 2 + 7*2 = 16 rows are summed.
| movzx r5d, word [r0+r1*0] | |||||
| movzx r6d, word [r0+r1*1] | |||||
| %rep 7 | |||||
| lea r0, [r0+r1*2] | |||||
| movzx r2d, word [r0+r1*0] | |||||
| movzx r3d, word [r0+r1*1] | |||||
| add r5d, r2d | |||||
| add r6d, r3d | |||||
| %endrep | |||||
; Combine both partial sums, round and divide by 16 in the integer unit.
| lea r2d, [r5+r6+8] | |||||
| shr r2d, 4 | |||||
| movd m0, r2d | |||||
| SPLATW m0, m0 | |||||
; 8 iterations x 2 rows = 16 rows.
| mov r3d, 8 | |||||
| .loop: | |||||
| MOV16 r4+r1*0, m0, m0, m0, m0 | |||||
| MOV16 r4+r1*1, m0, m0, m0, m0 | |||||
| lea r4, [r4+r1*2] | |||||
| dec r3d | |||||
| jg .loop | |||||
| REP_RET | |||||
| %endmacro | |||||
; Instantiate pred16x16_left_dc_10_mmxext (INIT_MMX) and pred16x16_left_dc_10_sse2 (INIT_XMM).
| INIT_MMX | |||||
| PRED16x16_LEFT_DC mmxext | |||||
| INIT_XMM | |||||
| PRED16x16_LEFT_DC sse2 | |||||
| ;----------------------------------------------------------------------------- | |||||
| ; void pred16x16_128_dc(pixel *src, int stride) | |||||
| ;----------------------------------------------------------------------------- | |||||
; Fills 16 rows with the constant 512 (pw_512); no neighbouring samples are
; read. %1 is the instruction-set suffix appended to the function name.
;
; NOTE(review): r0 = src and r1 = stride is taken from the prototype above and
; relies on cglobal's argument mapping. MOV16 and REP_RET are defined outside
; this chunk; MOV16 presumably stores one 16-sample row -- confirm.
| %macro PRED16x16_128_DC 1 | |||||
| cglobal pred16x16_128_dc_10_%1, 2,3 | |||||
| mova m0, [pw_512] | |||||
; 8 iterations x 2 rows = 16 rows; r0 advances two rows per iteration.
| mov r2d, 8 | |||||
| .loop: | |||||
| MOV16 r0+r1*0, m0, m0, m0, m0 | |||||
| MOV16 r0+r1*1, m0, m0, m0, m0 | |||||
| lea r0, [r0+r1*2] | |||||
| dec r2d | |||||
| jg .loop | |||||
| REP_RET | |||||
| %endmacro | |||||
; Instantiate pred16x16_128_dc_10_mmxext (INIT_MMX) and pred16x16_128_dc_10_sse2 (INIT_XMM).
| INIT_MMX | |||||
| PRED16x16_128_DC mmxext | |||||
| INIT_XMM | |||||
| PRED16x16_128_DC sse2 | |||||
| @@ -47,6 +47,7 @@ PRED8x8(dc, 10, mmxext) | |||||
| PRED8x8(dc, 10, sse2) | PRED8x8(dc, 10, sse2) | ||||
| PRED8x8(top_dc, 10, mmxext) | PRED8x8(top_dc, 10, mmxext) | ||||
| PRED8x8(top_dc, 10, sse2) | PRED8x8(top_dc, 10, sse2) | ||||
| PRED8x8(plane, 10, sse2) | |||||
| PRED8x8(vertical, 10, sse2) | PRED8x8(vertical, 10, sse2) | ||||
| PRED8x8(horizontal, 10, sse2) | PRED8x8(horizontal, 10, sse2) | ||||
| @@ -55,6 +56,8 @@ void ff_pred8x8l_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, int has_tople | |||||
| PRED8x8L(dc, 10, sse2) | PRED8x8L(dc, 10, sse2) | ||||
| PRED8x8L(dc, 10, ssse3) | PRED8x8L(dc, 10, ssse3) | ||||
| PRED8x8L(128_dc, 10, mmxext) | |||||
| PRED8x8L(128_dc, 10, sse2) | |||||
| PRED8x8L(top_dc, 10, sse2) | PRED8x8L(top_dc, 10, sse2) | ||||
| PRED8x8L(top_dc, 10, ssse3) | PRED8x8L(top_dc, 10, ssse3) | ||||
| PRED8x8L(vertical, 10, sse2) | PRED8x8L(vertical, 10, sse2) | ||||
| @@ -73,6 +76,14 @@ PRED8x8L(horizontal_up, 10, ssse3) | |||||
| #define PRED16x16(TYPE, DEPTH, OPT)\ | #define PRED16x16(TYPE, DEPTH, OPT)\ | ||||
| void ff_pred16x16_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, int stride); | void ff_pred16x16_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, int stride); | ||||
| PRED16x16(dc, 10, mmxext) | |||||
| PRED16x16(dc, 10, sse2) | |||||
| PRED16x16(top_dc, 10, mmxext) | |||||
| PRED16x16(top_dc, 10, sse2) | |||||
| PRED16x16(128_dc, 10, mmxext) | |||||
| PRED16x16(128_dc, 10, sse2) | |||||
| PRED16x16(left_dc, 10, mmxext) | |||||
| PRED16x16(left_dc, 10, sse2) | |||||
| PRED16x16(vertical, 10, mmxext) | PRED16x16(vertical, 10, mmxext) | ||||
| PRED16x16(vertical, 10, sse2) | PRED16x16(vertical, 10, sse2) | ||||
| PRED16x16(horizontal, 10, mmxext) | PRED16x16(horizontal, 10, mmxext) | ||||
| @@ -289,6 +300,12 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth | |||||
| h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_10_mmxext; | h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_10_mmxext; | ||||
| h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_10_mmxext; | h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_10_mmxext; | ||||
| h->pred8x8l[DC_128_PRED ] = ff_pred8x8l_128_dc_10_mmxext; | |||||
| h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_10_mmxext; | |||||
| h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_10_mmxext; | |||||
| h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_10_mmxext; | |||||
| h->pred16x16[LEFT_DC_PRED8x8 ] = ff_pred16x16_left_dc_10_mmxext; | |||||
| h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vertical_10_mmxext; | h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vertical_10_mmxext; | ||||
| h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_10_mmxext; | h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_10_mmxext; | ||||
| } | } | ||||
| @@ -301,18 +318,24 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth | |||||
| h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_10_sse2; | h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_10_sse2; | ||||
| h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_10_sse2; | h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_10_sse2; | ||||
| h->pred8x8[PLANE_PRED8x8 ] = ff_pred8x8_plane_10_sse2; | |||||
| h->pred8x8[VERT_PRED8x8 ] = ff_pred8x8_vertical_10_sse2; | h->pred8x8[VERT_PRED8x8 ] = ff_pred8x8_vertical_10_sse2; | ||||
| h->pred8x8[HOR_PRED8x8 ] = ff_pred8x8_horizontal_10_sse2; | h->pred8x8[HOR_PRED8x8 ] = ff_pred8x8_horizontal_10_sse2; | ||||
| h->pred8x8l[VERT_PRED ] = ff_pred8x8l_vertical_10_sse2; | h->pred8x8l[VERT_PRED ] = ff_pred8x8l_vertical_10_sse2; | ||||
| h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_sse2; | h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_sse2; | ||||
| h->pred8x8l[DC_PRED ] = ff_pred8x8l_dc_10_sse2; | h->pred8x8l[DC_PRED ] = ff_pred8x8l_dc_10_sse2; | ||||
| h->pred8x8l[DC_128_PRED ] = ff_pred8x8l_128_dc_10_sse2; | |||||
| h->pred8x8l[TOP_DC_PRED ] = ff_pred8x8l_top_dc_10_sse2; | h->pred8x8l[TOP_DC_PRED ] = ff_pred8x8l_top_dc_10_sse2; | ||||
| h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_sse2; | h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_sse2; | ||||
| h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_sse2; | h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_sse2; | ||||
| h->pred8x8l[VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_10_sse2; | h->pred8x8l[VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_10_sse2; | ||||
| h->pred8x8l[HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_10_sse2; | h->pred8x8l[HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_10_sse2; | ||||
| h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_10_sse2; | |||||
| h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_10_sse2; | |||||
| h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_10_sse2; | |||||
| h->pred16x16[LEFT_DC_PRED8x8 ] = ff_pred16x16_left_dc_10_sse2; | |||||
| h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vertical_10_sse2; | h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vertical_10_sse2; | ||||
| h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_10_sse2; | h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_10_sse2; | ||||
| } | } | ||||