|
|
|
@@ -30,6 +30,7 @@ pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11 |
|
|
|
; 16 byte entries: -1 in lanes 0-1, 4-5, 8-9, 12-13 and the indices 1,1,5,5,
; 9,9,13,13 in the remaining lanes.
; NOTE(review): presumably a pshufb control (negative index = zeroed lane);
; no use of this table is visible in this part of the file - confirm.
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13 |
|
|
|
; 16 byte indices listing the byte pairs (14,15), (12,13), ... (0,1).
; NOTE(review): as a pshufb control this would reverse the order of the eight
; 16-bit words of a register; no use is visible in this part of the file.
pb_revwords: db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 |
|
|
|
; Four dwords, each holding 16384 (1 << 14).
pd_16384: times 4 dd 16384 |
|
|
|
; Byte indices 3,2,1,0 / 7,6,5,4 / 11,10,9,8 / 15,14,13,12: within every
; 32-bit lane the four bytes are listed in reverse order. The SSSE3
; bswap32_buf loads this table into m2 and passes it to pshufb as the control.
pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 |
|
|
|
|
|
|
|
SECTION_TEXT |
|
|
|
|
|
|
|
@@ -1178,3 +1179,125 @@ INIT_XMM sse |
|
|
|
BUTTERFLIES_FLOAT_INTERLEAVE |
|
|
|
INIT_YMM avx |
|
|
|
BUTTERFLIES_FLOAT_INTERLEAVE |
|
|
|
|
|
|
|
INIT_XMM sse2 |
|
|
|
;-----------------------------------------------------------------------------
; BSWAP_LOOPS_SSE2 <a|u>
;
; Vector part of the SSE2 byte-swap: handles the count in groups of eight
; dwords, then one optional group of four dwords.
;
; %1 = a (aligned) or u (unaligned); selects mova/movu for every vector load
;      AND store, so the "u" variant is safe when either pointer is not
;      16-byte aligned.
;
; In:    r0  = dst, r1 = src, r2d = number of 32-bit elements
; Out:   r0/r1 advanced past the elements handled here,
;        r2d = original element count (only bits 0-1 are still pending)
; Clobb: r3, m0-m3, flags
; Exits: falls through after the 4-element step, or jumps to the caller's
;        .left label when bit 2 of the count is clear.
;
; Only the low 32 bits of r2/r3 are used for the count: it is a C int, and
; the upper half of a 64-bit argument register is not defined for an int.
;-----------------------------------------------------------------------------
%macro BSWAP_LOOPS_SSE2 1
    mov      r3d, r2d                   ; keep the full count for the tails
    sar      r2d, 3                     ; r2d = number of 8-dword iterations
    jz       .left4_%1
.loop8_%1:
    mov%1    m0, [r1 +  0]
    mov%1    m1, [r1 + 16]
    pshuflw  m0, m0, 10110001b          ; swap the two words of dwords 0-1
    pshuflw  m1, m1, 10110001b
    pshufhw  m0, m0, 10110001b          ; swap the two words of dwords 2-3
    pshufhw  m1, m1, 10110001b
    mova     m2, m0
    mova     m3, m1
    psllw    m0, 8                      ; low byte of each word -> high byte
    psllw    m1, 8
    psrlw    m2, 8                      ; high byte of each word -> low byte
    psrlw    m3, 8
    por      m2, m0                     ; bytes swapped inside every word
    por      m3, m1
    mov%1    [r0 +  0], m2
    mov%1    [r0 + 16], m3
    add      r1, 32
    add      r0, 32
    dec      r2d
    jnz      .loop8_%1
.left4_%1:
    mov      r2d, r3d                   ; restore the count for the caller
    and      r3d, 4                     ; one more group of four dwords?
    jz       .left
    mov%1    m0, [r1]
    pshuflw  m0, m0, 10110001b
    pshufhw  m0, m0, 10110001b
    mova     m2, m0
    psllw    m0, 8
    psrlw    m2, 8
    por      m2, m0
    mov%1    [r0], m2
    add      r1, 16
    add      r0, 16
%endmacro

;-----------------------------------------------------------------------------
; void bswap32_buf(uint32_t *dst, const uint32_t *src, int w);
;
; SSE2 version: writes the byte-reversed value of each of the w 32-bit words
; at src to dst.
;
; In:    r0 = dst, r1 = src, r2d = w
; Clobb: r2, r3, m0-m3, flags
;
; The aligned code path is taken only when BOTH pointers are 16-byte aligned;
; otherwise the unaligned variant of the loops is used.
;-----------------------------------------------------------------------------
cglobal bswap32_buf, 3,4,5
    mov      r3, r1
    or       r3, r0                     ; merge the low bits of src and dst
    and      r3, 15
    jz       .start_align
    BSWAP_LOOPS_SSE2 u
    jmp      .left
.start_align:
    BSWAP_LOOPS_SSE2 a
.left:
    and      r2d, 3                     ; 0-3 elements left for the scalar loop
    jz       .end
.loop2:
    mov      r3d, [r1]
    bswap    r3d
    mov      [r0], r3d
    add      r1, 4
    add      r0, 4
    dec      r2d
    jnz      .loop2
.end:
    RET
|
|
|
|
|
|
|
;-----------------------------------------------------------------------------
; BSWAP_LOOPS_SSSE3 <a|u>
;
; Vector part of the SSSE3 byte-swap: handles the count in groups of eight
; dwords, then one optional group of four dwords.
;
; %1 = a (aligned) or u (unaligned); selects mova/movu for every vector load
;      AND store, so the "u" variant is safe when either pointer is not
;      16-byte aligned.
;
; In:    r0  = dst, r1 = src, r2d = number of 32-bit elements,
;        m2  = pshufb control (the caller loads pb_bswap32 into it)
; Out:   r0/r1 advanced past the elements handled here,
;        r2d = original element count (only bits 0-1 are still pending)
; Clobb: r3, m0, m1, flags
; Exits: falls through after the 4-element step, or jumps to the caller's
;        .left2 label when bit 2 of the count is clear.
;
; Only the low 32 bits of r2/r3 are used for the count: it is a C int, and
; the upper half of a 64-bit argument register is not defined for an int.
;-----------------------------------------------------------------------------
%macro BSWAP_LOOPS_SSSE3 1
    mov      r3d, r2d                   ; keep the full count for the tails
    sar      r2d, 3                     ; r2d = number of 8-dword iterations
    jz       .left4_%1
.loop8_%1:
    mov%1    m0, [r1 +  0]
    mov%1    m1, [r1 + 16]
    pshufb   m0, m2                     ; reverse the bytes of every dword
    pshufb   m1, m2
    mov%1    [r0 +  0], m0
    mov%1    [r0 + 16], m1
    add      r0, 32
    add      r1, 32
    dec      r2d
    jnz      .loop8_%1
.left4_%1:
    mov      r2d, r3d                   ; restore the count for the caller
    and      r3d, 4                     ; one more group of four dwords?
    jz       .left2
    mov%1    m0, [r1]
    pshufb   m0, m2
    mov%1    [r0], m0
    add      r1, 16
    add      r0, 16
%endmacro

INIT_XMM ssse3
;-----------------------------------------------------------------------------
; void bswap32_buf(uint32_t *dst, const uint32_t *src, int w);
;
; SSSE3 version: writes the byte-reversed value of each of the w 32-bit words
; at src to dst.
;
; In:    r0 = dst, r1 = src, r2d = w
; Clobb: r2, r3, m0-m2, flags
;
; The aligned code path is taken only when BOTH pointers are 16-byte aligned;
; otherwise the unaligned variant of the loops is used.
;-----------------------------------------------------------------------------
cglobal bswap32_buf, 3,4,3
    mov      r3, r1
    mova     m2, [pb_bswap32]           ; shuffle control shared by all steps
    or       r3, r0                     ; merge the low bits of src and dst
    and      r3, 15
    jz       .start_align
    BSWAP_LOOPS_SSSE3 u
    jmp      .left2
.start_align:
    BSWAP_LOOPS_SSSE3 a
.left2:
    mov      r3d, r2d                   ; r3d keeps bit 0 for the last step
    and      r2d, 2                     ; a pair of dwords left?
    jz       .left1
    movq     m0, [r1]                   ; 8-byte access, no alignment needed
    pshufb   m0, m2
    movq     [r0], m0
    add      r1, 8
    add      r0, 8
.left1:
    and      r3d, 1                     ; a single trailing dword left?
    jz       .end
    mov      r2d, [r1]
    bswap    r2d
    mov      [r0], r2d
.end:
    RET