| @@ -29,11 +29,16 @@ pb_flip_short: db 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1 | |||
| SECTION .text | |||
| INIT_XMM ssse3 | |||
| cglobal hflip_byte, 3, 5, 3, src, dst, w, r, x | |||
| mova m0, [pb_flip_byte] | |||
| ; %1 = byte or short (function/mask name suffix), %2 = b or w (register size suffix used as r%2), %3 = element size in bytes (1 for byte, 2 for short) | |||
; HFLIP: each invocation emits one function named hflip_<%1>.
; Named operands declared on the cglobal line: src, dst, w (arguments), r and x (scratch).
; NOTE(review): this span is a diff rendering without +/- markers. Where two
; near-identical lines follow each other (hard-coded 1 / rb versus %3 / r%2),
; the hard-coded one appears to be the pre-change line and the parameterised
; one its replacement -- confirm against the real file. The "@@" rows are hunk
; headers, and the lines they elide are not visible here.
| %macro HFLIP 3 | |||
| cglobal hflip_%1, 3, 5, 3, src, dst, w, r, x | |||
; m0 = shuffle mask pb_flip_<%1>, used by both pshufb instructions in .loop0.
| mova m0, [pb_flip_%1] | |||
; x = 0; x is the running offset added to dst and subtracted from src.
| xor xq, xq | |||
| %if %3 == 1 | |||
; Byte variant: w is used as-is. movsxdifnidn presumably widens the 32-bit
; argument to 64 bits when needed (x86inc helper -- TODO confirm).
| movsxdifnidn wq, wd | |||
| %else ; short | |||
; Short variant: double w so that it counts bytes, matching the %3-byte steps
; of x below. A 32-bit write also clears the upper half of wq.
| add wd, wd | |||
| %endif | |||
; r = w mod (2 * mmsize): the bytes left over after whole two-register chunks.
| mov rq, wq | |||
| and rq, 2 * mmsize - 1 | |||
; Compare w against one full chunk. The branch that consumes this compare is
; elided by the hunk boundary below; the hand-expanded hflip_short later in
; this diff has "jl .loop1" / "sub wq, rq" at the corresponding position.
| cmp wq, 2 * mmsize | |||
| @@ -42,8 +47,8 @@ cglobal hflip_byte, 3, 5, 3, src, dst, w, r, x | |||
; Vector loop: handles 2 * mmsize bytes per iteration.
| .loop0: | |||
; Negate x temporarily so that src is addressed at src - x.
| neg xq | |||
| movu m1, [srcq + xq - mmsize + 1] | |||
| movu m2, [srcq + xq - 2 * mmsize + 1] | |||
; m1 = the mmsize bytes that end with the element at src - x;
; m2 = the mmsize bytes immediately before those.
| movu m1, [srcq + xq - mmsize + %3] | |||
| movu m2, [srcq + xq - 2 * mmsize + %3] | |||
; Reorder the bytes of each register according to the mask in m0.
| pshufb m1, m0 | |||
| pshufb m2, m0 | |||
; Restore x to its non-negated value for the dst-side addressing.
| neg xq | |||
| @@ -59,49 +64,17 @@ cglobal hflip_byte, 3, 5, 3, src, dst, w, r, x | |||
; Scalar loop: copies one element per iteration from src - x to dst + x.
| .loop1: | |||
| neg xq | |||
| mov rb, [srcq + xq] | |||
; Load one element (width selected by the %2 suffix) from src - x.
| mov r%2, [srcq + xq] | |||
| neg xq | |||
| mov [dstq + xq], rb | |||
| add xq, 1 | |||
; Store it at dst + x, then advance x by one element (%3 bytes).
| mov [dstq + xq], r%2 | |||
| add xq, %3 | |||
; Continue while x < w (signed comparison).
| cmp xq, wq | |||
| jl .loop1 | |||
| .end: | |||
| RET | |||
| %endmacro | |||
; hflip_short, hand-expanded: same sequence as the HFLIP macro with %1 = short,
; %2 = w, %3 = 2.
; NOTE(review): in this diff rendering these lines appear to be the removed
; pre-change body, while the INIT_XMM / HFLIP rows in the middle are the added
; replacement -- confirm against the real file.
| cglobal hflip_short, 3, 5, 3, src, dst, w, r, x | |||
; m0 = pb_flip_short shuffle mask.
| mova m0, [pb_flip_short] | |||
; x = 0.
| xor xq, xq | |||
; Double w so that it counts bytes (x advances by 2 per element below).
| add wd, wd | |||
; r = w mod (2 * mmsize): bytes left over after whole two-register chunks.
| mov rq, wq | |||
| and rq, 2 * mmsize - 1 | |||
; Less than one full chunk: go straight to the scalar loop.
| cmp wq, 2 * mmsize | |||
| jl .loop1 | |||
; Otherwise bound the vector loop at w - r, a multiple of 2 * mmsize.
| sub wq, rq | |||
; Vector loop: 2 * mmsize bytes per iteration.
| .loop0: | |||
; Negate x temporarily so that src is addressed at src - x.
| neg xq | |||
; m1 = the mmsize bytes ending with the 16-bit element at src - x;
; m2 = the mmsize bytes immediately before those.
| movu m1, [srcq + xq - mmsize + 2] | |||
| movu m2, [srcq + xq - 2 * mmsize + 2] | |||
; Reorder bytes per the mask in m0.
| pshufb m1, m0 | |||
| pshufb m2, m0 | |||
| neg xq | |||
; Store both registers at dst + x in ascending order.
| movu [dstq + xq ], m1 | |||
| movu [dstq + xq + mmsize], m2 | |||
| add xq, mmsize * 2 | |||
; Continue while x < w (signed comparison).
| cmp xq, wq | |||
| jl .loop0 | |||
; No leftover bytes: finished.
| cmp rq, 0 | |||
| je .end | |||
; Restore w to the full byte count so the scalar loop covers the remainder.
| add wq, rq | |||
; The next three rows select the XMM/SSSE3 register setup and instantiate the
; HFLIP macro once per element type (byte: suffix b, size 1; short: suffix w, size 2).
| INIT_XMM ssse3 | |||
| HFLIP byte, b, 1 | |||
| HFLIP short, w, 2 | |||
; Scalar loop: one 16-bit element per iteration, read at src - x, written at dst + x.
| .loop1: | |||
| neg xq | |||
| mov rw, [srcq + xq] | |||
| neg xq | |||
| mov [dstq + xq], rw | |||
; Advance by one element (2 bytes) and continue while x < w (signed comparison).
| add xq, 2 | |||
| cmp xq, wq | |||
| jl .loop1 | |||
| .end: | |||
| RET | |||