|
- /*
- * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
- * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
- #include "libavutil/aarch64/asm.S"
- #include "neon.S"
-
- function ff_h264_idct_add_neon, export=1 // 4x4 inverse transform: x0 = dest pixels, x1 = 16 int16 coefficients (zeroed on return), w2 = dest row stride
- ld1 {v0.4H, v1.4H, v2.4H, v3.4H}, [x1] // v0..v3 = the four 4-coefficient groups, in memory order
- sxtw x2, w2 // widen the 32-bit stride so x2 can be used as a post-index register
- movi v30.8H, #0 // zero source for clearing the coefficients
- // first 1-D pass, interleaved with clearing the 32 coefficient bytes
- add v4.4H, v0.4H, v2.4H // v4 = v0 + v2
- sshr v16.4H, v1.4H, #1 // v16 = v1 >> 1
- st1 {v30.8H}, [x1], #16 // clear coefficient bytes 0..15
- sshr v17.4H, v3.4H, #1 // v17 = v3 >> 1
- st1 {v30.8H}, [x1], #16 // clear coefficient bytes 16..31, x1 is now its entry value + 32
- sub v5.4H, v0.4H, v2.4H // v5 = v0 - v2
- sub v6.4H, v16.4H, v3.4H // v6 = (v1 >> 1) - v3
- add v7.4H, v1.4H, v17.4H // v7 = v1 + (v3 >> 1)
- add v0.4H, v4.4H, v7.4H // v0 = v4 + v7
- add v1.4H, v5.4H, v6.4H // v1 = v5 + v6
- sub v2.4H, v5.4H, v6.4H // v2 = v5 - v6
- sub v3.4H, v4.4H, v7.4H // v3 = v4 - v7
- // transpose_4x4H comes from an included file, presumably it transposes v0..v3 using v4..v7 as scratch
- transpose_4x4H v0, v1, v2, v3, v4, v5, v6, v7
- // second 1-D pass, packed two results per 8-lane register and interleaved with loading the four dest rows
- add v4.4H, v0.4H, v2.4H // v4 low half = v0 + v2
- ld1 {v18.S}[0], [x0], x2 // dest row 0 into v18 lane 0
- sshr v16.4H, v3.4H, #1 // v16 = v3 >> 1
- sshr v17.4H, v1.4H, #1 // v17 = v1 >> 1
- ld1 {v18.S}[1], [x0], x2 // dest row 1 into v18 lane 1
- sub v5.4H, v0.4H, v2.4H // v5 = v0 - v2
- ld1 {v19.S}[1], [x0], x2 // dest row 2 into v19 lane 1, lanes are swapped to match v1 below
- add v6.4H, v16.4H, v1.4H // v6 low half = v1 + (v3 >> 1)
- ins v4.D[1], v5.D[0] // v4 = { v0 + v2, v0 - v2 }
- sub v7.4H, v17.4H, v3.4H // v7 = (v1 >> 1) - v3
- ld1 {v19.S}[0], [x0], x2 // dest row 3 into v19 lane 0
- ins v6.D[1], v7.D[0] // v6 = { v1 + (v3 >> 1), (v1 >> 1) - v3 }
- sub x0, x0, x2, lsl #2 // rewind x0 by four rows, back to row 0
- add v0.8H, v4.8H, v6.8H // v0 = residual rows 0 (low half) and 1 (high half)
- sub v1.8H, v4.8H, v6.8H // v1 = residual rows 3 (low half) and 2 (high half)
- // scale the residual with rounding, (x + 32) >> 6
- srshr v0.8H, v0.8H, #6
- srshr v1.8H, v1.8H, #6
- // add the zero-extended dest bytes to the residual
- uaddw v0.8H, v0.8H, v18.8B
- uaddw v1.8H, v1.8H, v19.8B
- // narrow to bytes with unsigned saturation
- sqxtun v0.8B, v0.8H
- sqxtun v1.8B, v1.8H
- // write the four rows back, v1 holds rows 3 and 2 in that lane order
- st1 {v0.S}[0], [x0], x2 // row 0
- st1 {v0.S}[1], [x0], x2 // row 1
- st1 {v1.S}[1], [x0], x2 // row 2
- st1 {v1.S}[0], [x0], x2 // row 3
- // undo the two post-increments so x1 leaves with its entry value
- sub x1, x1, #32
- ret
- endfunc
-
- function ff_h264_idct_dc_add_neon, export=1 // DC-only 4x4 case: adds the rounded first coefficient at x1 to every pixel of the 4x4 block at x0 (row stride w2) and clears that coefficient
- sxtw x2, w2 // widen the 32-bit stride so x2 can be used as a post-index register
- mov w3, #0 // zero used to clear the coefficient, w3 is clobbered
- ld1r {v2.8H}, [x1] // replicate the first int16 coefficient into all 8 lanes
- strh w3, [x1] // clear that coefficient in memory, x1 itself is not modified
- srshr v2.8H, v2.8H, #6 // rounded scale, (dc + 32) >> 6
- ld1 {v0.S}[0], [x0], x2 // dest row 0
- ld1 {v0.S}[1], [x0], x2 // dest row 1
- uaddw v3.8H, v2.8H, v0.8B // rows 0 and 1 plus dc, widened to 16 bits
- ld1 {v1.S}[0], [x0], x2 // dest row 2
- ld1 {v1.S}[1], [x0], x2 // dest row 3
- uaddw v4.8H, v2.8H, v1.8B // rows 2 and 3 plus dc, widened to 16 bits
- sqxtun v0.8B, v3.8H // narrow to bytes with unsigned saturation
- sqxtun v1.8B, v4.8H
- sub x0, x0, x2, lsl #2 // rewind x0 by four rows, back to row 0
- st1 {v0.S}[0], [x0], x2 // store row 0
- st1 {v0.S}[1], [x0], x2 // store row 1
- st1 {v1.S}[0], [x0], x2 // store row 2
- st1 {v1.S}[1], [x0], x2 // store row 3
- ret
- endfunc
-
- function ff_h264_idct_add16_neon, export=1 // walks 16 consecutive 4x4 coefficient blocks and calls the DC-only or full 4x4 routine on each one whose byte in the x4 table is nonzero
- mov x12, x30 // keep the return address, blr below overwrites x30
- mov x6, x0 // dest
- mov x5, x1 // block_offset
- mov x1, x2 // block
- mov w9, w3 // stride
- movrel x7, scan8 // x7 = cursor into the scan8 table (movrel is a macro from an included file)
- mov x10, #16 // blocks remaining
- movrel x13, X(ff_h264_idct_dc_add_neon) // x13 = DC-only routine
- movrel x14, X(ff_h264_idct_add_neon) // x14 = full 4x4 routine
- 1: mov w2, w9 // stride argument for the callee
- ldrb w3, [x7], #1 // w3 = next scan8 entry
- ldrsw x0, [x5], #4 // x0 = next 32-bit block_offset entry, sign-extended
- ldrb w3, [x4, w3, uxtw] // w3 = byte at x4[scan8 entry], presumably the nonzero-coefficient count of this block
- subs w3, w3, #1 // w3 = that byte - 1, flags are consumed by b.lt and ccmp below
- b.lt 2f // byte was 0, nothing to add for this block
- ldrsh w3, [x1] // w3 = first coefficient of the block
- add x0, x0, x6 // x0 = dest + block_offset entry
- ccmp w3, #0, #4, eq // if the byte was exactly 1 compare the first coefficient with 0, otherwise force Z = 1
- csel x15, x13, x14, ne // DC-only routine when byte == 1 and first coefficient != 0, full routine otherwise
- blr x15 // callee arguments are x0 = dest, x1 = block, w2 = stride
- 2: subs x10, x10, #1 // one block done
- add x1, x1, #32 // advance to the next block of 16 int16 coefficients, flags untouched
- b.ne 1b
- ret x12 // return through the saved address
- endfunc
-
- function ff_h264_idct_add16intra_neon, export=1 // walks 16 consecutive 4x4 coefficient blocks, a block is skipped only when both its x4 table byte and its first coefficient are zero
- mov x12, x30 // keep the return address, blr below overwrites x30
- mov x6, x0 // dest
- mov x5, x1 // block_offset
- mov x1, x2 // block
- mov w9, w3 // stride
- movrel x7, scan8 // x7 = cursor into the scan8 table (movrel is a macro from an included file)
- mov x10, #16 // blocks remaining
- movrel x13, X(ff_h264_idct_dc_add_neon) // x13 = DC-only routine
- movrel x14, X(ff_h264_idct_add_neon) // x14 = full 4x4 routine
- 1: mov w2, w9 // stride argument for the callee
- ldrb w3, [x7], #1 // w3 = next scan8 entry
- ldrsw x0, [x5], #4 // x0 = next 32-bit block_offset entry, sign-extended
- ldrb w3, [x4, w3, uxtw] // w3 = byte at x4[scan8 entry], presumably the nonzero-coefficient count of this block
- add x0, x0, x6 // x0 = dest + block_offset entry
- cmp w3, #0 // Z = 1 when the table byte is zero
- ldrsh w3, [x1] // w3 = first coefficient of the block, flags untouched
- csel x15, x13, x14, eq // table byte zero selects the DC-only routine, nonzero selects the full routine
- ccmp w3, #0, #0, eq // if the table byte was zero compare the first coefficient with 0, otherwise clear Z
- b.eq 2f // table byte and first coefficient both zero, nothing to add
- blr x15 // callee arguments are x0 = dest, x1 = block, w2 = stride
- 2: subs x10, x10, #1 // one block done
- add x1, x1, #32 // advance to the next block of 16 int16 coefficients, flags untouched
- b.ne 1b
- ret x12 // return through the saved address
- endfunc
-
- function ff_h264_idct_add8_neon, export=1 // processes 4x4 blocks with index 0..3 against the first pointer at x0 and 16..19 against the second, all relative to entry 16 of block_offset, block and scan8
- sub sp, sp, #0x40 // reserve 64 bytes of stack, only the first 16 are used
- stp x19, x20, [sp] // x19 and x20 are callee-saved and are used as loop state below
- mov x12, x30 // keep the return address, blr below overwrites x30
- ldp x6, x15, [x0] // dest[0], dest[1]
- add x5, x1, #16*4 // block_offset
- add x9, x2, #16*32 // block
- mov w19, w3 // stride
- movrel x13, X(ff_h264_idct_dc_add_neon) // x13 = DC-only routine
- movrel x14, X(ff_h264_idct_add_neon) // x14 = full 4x4 routine
- movrel x7, scan8+16 // x7 = scan8 table starting at entry 16 (movrel is a macro from an included file)
- mov x10, #0 // x10 = i, the block index relative to the bases above
- mov x11, #16 // constant used to jump i from 4 to 16
- 1: mov w2, w19 // stride argument for the callee
- ldrb w3, [x7, x10] // scan8[i]
- ldrsw x0, [x5, x10, lsl #2] // block_offset[i]
- ldrb w3, [x4, w3, uxtw] // nnzc[ scan8[i] ]
- add x0, x0, x6 // block_offset[i] + dst[j-1]
- add x1, x9, x10, lsl #5 // block + i * 16
- cmp w3, #0 // Z = 1 when the nnzc byte is zero
- ldrsh w3, [x1] // block[i*16]
- csel x20, x13, x14, eq // nnzc byte zero selects the DC-only routine, nonzero selects the full routine
- ccmp w3, #0, #0, eq // if the nnzc byte was zero compare block[i*16] with 0, otherwise clear Z
- b.eq 2f // nnzc byte and first coefficient both zero, nothing to add
- blr x20 // callee arguments are x0 = dest, x1 = block, w2 = stride
- 2: add x10, x10, #1 // next index
- cmp x10, #4 // finished the first group of four blocks?
- csel x10, x11, x10, eq // mov x10, #16
- csel x6, x15, x6, eq // and switch the destination base to dest[1]
- cmp x10, #20 // loop covers i = 0..3 and i = 16..19
- b.lt 1b
- ldp x19, x20, [sp] // restore the callee-saved registers
- add sp, sp, #0x40
- ret x12 // return through the saved address
- endfunc
-
- .macro idct8x8_cols pass // one 1-D 8-point pass on eight 8-lane int16 vectors: pass 0 reads in0..in7 from v24..v31 and leaves out6 in v18, pass 1 reads in6 from v18 and leaves out0..out7 in v24..v31
- .if \pass == 0
- va .req v18
- vb .req v30
- sshr v18.8H, v26.8H, #1 // in2 >> 1
- add v16.8H, v24.8H, v28.8H // a0 = in0 + in4
- ld1 {v30.8H, v31.8H}, [x1] // pass 0 loads in6 and in7 itself
- st1 {v19.8H}, [x1], #16 // overwrite in6 in memory with v19, which ff_h264_idct8_add_neon sets to zero beforehand
- st1 {v19.8H}, [x1], #16 // same for in7, x1 has advanced by 32 bytes
- sub v17.8H, v24.8H, v28.8H // a2 = in0 - in4
- sshr v19.8H, v30.8H, #1 // in6 >> 1, from here on v19 is ordinary scratch
- sub v18.8H, v18.8H, v30.8H // a4 = (in2 >> 1) - in6
- add v19.8H, v19.8H, v26.8H // a6 = (in6 >> 1) + in2
- .else
- va .req v30
- vb .req v18
- sshr v30.8H, v26.8H, #1 // in2 >> 1
- sshr v19.8H, v18.8H, #1 // in6 >> 1, in6 lives in v18 for this pass
- add v16.8H, v24.8H, v28.8H // a0 = in0 + in4
- sub v17.8H, v24.8H, v28.8H // a2 = in0 - in4
- sub v30.8H, v30.8H, v18.8H // a4 = (in2 >> 1) - in6
- add v19.8H, v19.8H, v26.8H // a6 = (in6 >> 1) + in2
- .endif
- add v26.8H, v17.8H, va.8H // b2 = a2 + a4
- sub v28.8H, v17.8H, va.8H // b4 = a2 - a4
- add v24.8H, v16.8H, v19.8H // b0 = a0 + a6
- sub vb.8H, v16.8H, v19.8H // b6 = a0 - a6, this overwrites the register that held in6
- sub v16.8H, v29.8H, v27.8H // start a1 with in5 - in3
- add v17.8H, v31.8H, v25.8H // start a3 with in7 + in1
- sub va.8H, v31.8H, v25.8H // start a5 with in7 - in1, a4 in va is no longer needed
- add v19.8H, v29.8H, v27.8H // start a7 with in5 + in3
- sub v16.8H, v16.8H, v31.8H // a1 partial = in5 - in3 - in7
- sub v17.8H, v17.8H, v27.8H // a3 partial = in7 + in1 - in3
- add va.8H, va.8H, v29.8H // a5 partial = in7 - in1 + in5
- add v19.8H, v19.8H, v25.8H // a7 partial = in5 + in3 + in1
- sshr v25.8H, v25.8H, #1 // in1 >> 1
- sshr v27.8H, v27.8H, #1 // in3 >> 1
- sshr v29.8H, v29.8H, #1 // in5 >> 1
- sshr v31.8H, v31.8H, #1 // in7 >> 1
- sub v16.8H, v16.8H, v31.8H // a1 = in5 - in3 - in7 - (in7 >> 1)
- sub v17.8H, v17.8H, v27.8H // a3 = in7 + in1 - in3 - (in3 >> 1)
- add va.8H, va.8H, v29.8H // a5 = in7 - in1 + in5 + (in5 >> 1)
- add v19.8H, v19.8H, v25.8H // a7 = in5 + in3 + in1 + (in1 >> 1)
- sshr v25.8H, v16.8H, #2 // a1 >> 2
- sshr v27.8H, v17.8H, #2 // a3 >> 2
- sshr v29.8H, va.8H, #2 // a5 >> 2
- sshr v31.8H, v19.8H, #2 // a7 >> 2
- sub v19.8H, v19.8H, v25.8H // b7 = a7 - (a1 >> 2)
- sub va.8H, v27.8H, va.8H // b5 = (a3 >> 2) - a5
- add v17.8H, v17.8H, v29.8H // b3 = a3 + (a5 >> 2)
- add v16.8H, v16.8H, v31.8H // b1 = a1 + (a7 >> 2)
- .if \pass == 0
- sub v31.8H, v24.8H, v19.8H // out7 = b0 - b7
- add v24.8H, v24.8H, v19.8H // out0 = b0 + b7
- add v25.8H, v26.8H, v18.8H // out1 = b2 + b5
- sub v18.8H, v26.8H, v18.8H // out6 = b2 - b5, left in v18 for this pass
- add v26.8H, v28.8H, v17.8H // out2 = b4 + b3
- add v27.8H, v30.8H, v16.8H // out3 = b6 + b1
- sub v29.8H, v28.8H, v17.8H // out5 = b4 - b3
- sub v28.8H, v30.8H, v16.8H // out4 = b6 - b1
- .else
- sub v31.8H, v24.8H, v19.8H // out7 = b0 - b7
- add v24.8H, v24.8H, v19.8H // out0 = b0 + b7
- add v25.8H, v26.8H, v30.8H // out1 = b2 + b5
- sub v30.8H, v26.8H, v30.8H // out6 = b2 - b5, in v30 for this pass
- add v26.8H, v28.8H, v17.8H // out2 = b4 + b3
- sub v29.8H, v28.8H, v17.8H // out5 = b4 - b3
- add v27.8H, v18.8H, v16.8H // out3 = b6 + b1
- sub v28.8H, v18.8H, v16.8H // out4 = b6 - b1
- .endif
- .unreq va
- .unreq vb
- .endm
-
- function ff_h264_idct8_add_neon, export=1 // 8x8 inverse transform: x0 = dest pixels, x1 = 64 int16 coefficients (zeroed on return), w2 = dest row stride
- movi v19.8H, #0 // zero source for clearing the coefficients, also used by idct8x8_cols 0
- sxtw x2, w2 // the stride is a 32-bit int and the upper half of x2 is not guaranteed, so widen it before x2 is used as a post-index register
- ld1 {v24.8H, v25.8H}, [x1] // coefficient vectors 0 and 1
- st1 {v19.8H}, [x1], #16 // clear each vector right after it has been loaded
- st1 {v19.8H}, [x1], #16
- ld1 {v26.8H, v27.8H}, [x1] // coefficient vectors 2 and 3
- st1 {v19.8H}, [x1], #16
- st1 {v19.8H}, [x1], #16
- ld1 {v28.8H, v29.8H}, [x1] // coefficient vectors 4 and 5
- st1 {v19.8H}, [x1], #16
- st1 {v19.8H}, [x1], #16
- // vectors 6 and 7 are loaded and cleared inside idct8x8_cols 0, which advances x1 by another 32 bytes
- idct8x8_cols 0
- transpose_8x8H v24, v25, v26, v27, v28, v29, v18, v31, v6, v7 // macro from an included file, presumably transposes the eight vectors with v6 and v7 as scratch, slot 6 is v18 after pass 0
- idct8x8_cols 1
- // residual is now in v24..v31: scale it with rounding while loading the eight dest rows
- mov x3, x0 // x3 = store cursor, x0 is consumed as the load cursor
- srshr v24.8H, v24.8H, #6 // (x + 32) >> 6
- ld1 {v0.8B}, [x0], x2 // dest row 0
- srshr v25.8H, v25.8H, #6
- ld1 {v1.8B}, [x0], x2 // dest row 1
- srshr v26.8H, v26.8H, #6
- ld1 {v2.8B}, [x0], x2 // dest row 2
- srshr v27.8H, v27.8H, #6
- ld1 {v3.8B}, [x0], x2 // dest row 3
- srshr v28.8H, v28.8H, #6
- ld1 {v4.8B}, [x0], x2 // dest row 4
- srshr v29.8H, v29.8H, #6
- ld1 {v5.8B}, [x0], x2 // dest row 5
- srshr v30.8H, v30.8H, #6
- ld1 {v6.8B}, [x0], x2 // dest row 6
- srshr v31.8H, v31.8H, #6
- ld1 {v7.8B}, [x0], x2 // dest row 7
- uaddw v24.8H, v24.8H, v0.8B // add the zero-extended dest bytes to the residual
- uaddw v25.8H, v25.8H, v1.8B
- uaddw v26.8H, v26.8H, v2.8B
- sqxtun v0.8B, v24.8H // narrow to bytes with unsigned saturation
- uaddw v27.8H, v27.8H, v3.8B
- sqxtun v1.8B, v25.8H
- uaddw v28.8H, v28.8H, v4.8B
- sqxtun v2.8B, v26.8H
- st1 {v0.8B}, [x3], x2 // store row 0
- uaddw v29.8H, v29.8H, v5.8B
- sqxtun v3.8B, v27.8H
- st1 {v1.8B}, [x3], x2 // store row 1
- uaddw v30.8H, v30.8H, v6.8B
- sqxtun v4.8B, v28.8H
- st1 {v2.8B}, [x3], x2 // store row 2
- uaddw v31.8H, v31.8H, v7.8B
- sqxtun v5.8B, v29.8H
- st1 {v3.8B}, [x3], x2 // store row 3
- sqxtun v6.8B, v30.8H
- sqxtun v7.8B, v31.8H
- st1 {v4.8B}, [x3], x2 // store row 4
- st1 {v5.8B}, [x3], x2 // store row 5
- st1 {v6.8B}, [x3], x2 // store row 6
- st1 {v7.8B}, [x3], x2 // store row 7
- // undo the 128 bytes of post-increments so x1 leaves with its entry value
- sub x1, x1, #128
- ret
- endfunc
-
- function ff_h264_idct8_dc_add_neon, export=1 // DC-only 8x8 case: adds the rounded first coefficient at x1 to every pixel of the 8x8 block at x0 (row stride w2) and clears that coefficient
- mov w3, #0 // zero used to clear the coefficient, w3 is clobbered
- sxtw x2, w2 // widen the 32-bit stride so x2 can be used as a post-index register
- ld1r {v31.8H}, [x1] // replicate the first int16 coefficient into all 8 lanes
- strh w3, [x1] // clear that coefficient in memory, x1 itself is not modified
- ld1 {v0.8B}, [x0], x2 // dest row 0
- srshr v31.8H, v31.8H, #6 // rounded scale, (dc + 32) >> 6
- ld1 {v1.8B}, [x0], x2 // dest row 1
- ld1 {v2.8B}, [x0], x2 // dest row 2
- uaddw v24.8H, v31.8H, v0.8B // row 0 plus dc, widened to 16 bits
- ld1 {v3.8B}, [x0], x2 // dest row 3
- uaddw v25.8H, v31.8H, v1.8B // row 1 plus dc
- ld1 {v4.8B}, [x0], x2 // dest row 4
- uaddw v26.8H, v31.8H, v2.8B // row 2 plus dc
- ld1 {v5.8B}, [x0], x2 // dest row 5
- uaddw v27.8H, v31.8H, v3.8B // row 3 plus dc
- ld1 {v6.8B}, [x0], x2 // dest row 6
- uaddw v28.8H, v31.8H, v4.8B // row 4 plus dc
- ld1 {v7.8B}, [x0], x2 // dest row 7
- uaddw v29.8H, v31.8H, v5.8B // row 5 plus dc
- uaddw v30.8H, v31.8H, v6.8B // row 6 plus dc
- uaddw v31.8H, v31.8H, v7.8B // row 7 plus dc, the last use of dc so v31 can be overwritten
- sqxtun v0.8B, v24.8H // narrow to bytes with unsigned saturation
- sqxtun v1.8B, v25.8H
- sqxtun v2.8B, v26.8H
- sqxtun v3.8B, v27.8H
- sub x0, x0, x2, lsl #3 // rewind x0 by eight rows, back to row 0
- st1 {v0.8B}, [x0], x2 // store row 0
- sqxtun v4.8B, v28.8H
- st1 {v1.8B}, [x0], x2 // store row 1
- sqxtun v5.8B, v29.8H
- st1 {v2.8B}, [x0], x2 // store row 2
- sqxtun v6.8B, v30.8H
- st1 {v3.8B}, [x0], x2 // store row 3
- sqxtun v7.8B, v31.8H
- st1 {v4.8B}, [x0], x2 // store row 4
- st1 {v5.8B}, [x0], x2 // store row 5
- st1 {v6.8B}, [x0], x2 // store row 6
- st1 {v7.8B}, [x0], x2 // store row 7
- ret
- endfunc
-
- function ff_h264_idct8_add4_neon, export=1 // walks four 8x8 coefficient blocks and calls the DC-only or full 8x8 routine on each one whose byte in the x4 table is nonzero
- mov x12, x30 // keep the return address, blr below overwrites x30
- mov x6, x0 // x6 = dest base
- mov x5, x1 // x5 = cursor into the 32-bit block_offset array
- mov x1, x2 // x1 = current coefficient block
- mov w2, w3 // w2 = stride, set once because neither callee changes w2
- movrel x7, scan8 // x7 = cursor into the scan8 table (movrel is a macro from an included file)
- mov w10, #16 // counts down in steps of 4, giving four iterations
- movrel x13, X(ff_h264_idct8_dc_add_neon) // x13 = DC-only routine
- movrel x14, X(ff_h264_idct8_add_neon) // x14 = full 8x8 routine
- 1: ldrb w9, [x7], #4 // w9 = every fourth scan8 entry
- ldrsw x0, [x5], #16 // x0 = every fourth block_offset entry, sign-extended
- ldrb w9, [x4, w9, UXTW] // w9 = byte at x4[scan8 entry], presumably the nonzero-coefficient count of this block
- subs w9, w9, #1 // w9 = that byte - 1, flags are consumed by b.lt and ccmp below
- b.lt 2f // byte was 0, nothing to add for this block
- ldrsh w11, [x1] // w11 = first coefficient of the block
- add x0, x6, x0 // x0 = dest + block_offset entry
- ccmp w11, #0, #4, eq // if the byte was exactly 1 compare the first coefficient with 0, otherwise force Z = 1
- csel x15, x13, x14, ne // DC-only routine when byte == 1 and first coefficient != 0, full routine otherwise
- blr x15 // callee arguments are x0 = dest, x1 = block, w2 = stride
- 2: subs w10, w10, #4 // one 8x8 block done
- add x1, x1, #128 // advance to the next block of 64 int16 coefficients, flags untouched
- b.ne 1b
- ret x12 // return through the saved address
- endfunc
-
- const scan8 // 48-entry byte table, each value spelled as x + y*8 with x in 4..7, the dispatch loops in this file use an entry as an index into the byte table passed in x4
- .byte 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8 // entries 0..15 use y = 1..4
- .byte 6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
- .byte 4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
- .byte 6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
- .byte 4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8 // entries 16..31 use y = 6..9, ff_h264_idct_add8_neon starts reading at entry 16
- .byte 6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
- .byte 4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
- .byte 6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
- .byte 4+11*8, 5+11*8, 4+12*8, 5+12*8 // entries 32..47 use y = 11..14
- .byte 6+11*8, 7+11*8, 6+12*8, 7+12*8
- .byte 4+13*8, 5+13*8, 4+14*8, 5+14*8
- .byte 6+13*8, 7+13*8, 6+14*8, 7+14*8
- endconst
|