Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
tags/n2.6
| @@ -35,21 +35,30 @@ DECLARE_ALIGNED(8, const uint64_t, ff_pw_15) = 0x000F000F000F000FULL; | |||||
| DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16) = { 0x0010001000100010ULL, 0x0010001000100010ULL }; | DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16) = { 0x0010001000100010ULL, 0x0010001000100010ULL }; | ||||
| DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17) = { 0x0011001100110011ULL, 0x0011001100110011ULL }; | DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17) = { 0x0011001100110011ULL, 0x0011001100110011ULL }; | ||||
| DECLARE_ALIGNED(16, const xmm_reg, ff_pw_18) = { 0x0012001200120012ULL, 0x0012001200120012ULL }; | DECLARE_ALIGNED(16, const xmm_reg, ff_pw_18) = { 0x0012001200120012ULL, 0x0012001200120012ULL }; | ||||
| DECLARE_ALIGNED(8, const uint64_t, ff_pw_20) = 0x0014001400140014ULL; | |||||
| DECLARE_ALIGNED(16, const xmm_reg, ff_pw_20) = { 0x0014001400140014ULL, 0x0014001400140014ULL }; | |||||
| DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32) = { 0x0020002000200020ULL, 0x0020002000200020ULL }; | DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32) = { 0x0020002000200020ULL, 0x0020002000200020ULL }; | ||||
| DECLARE_ALIGNED(8, const uint64_t, ff_pw_42) = 0x002A002A002A002AULL; | DECLARE_ALIGNED(8, const uint64_t, ff_pw_42) = 0x002A002A002A002AULL; | ||||
| DECLARE_ALIGNED(8, const uint64_t, ff_pw_53) = 0x0035003500350035ULL; | DECLARE_ALIGNED(8, const uint64_t, ff_pw_53) = 0x0035003500350035ULL; | ||||
| DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64) = { 0x0040004000400040ULL, 0x0040004000400040ULL }; | DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64) = { 0x0040004000400040ULL, 0x0040004000400040ULL }; | ||||
| DECLARE_ALIGNED(8, const uint64_t, ff_pw_96) = 0x0060006000600060ULL; | DECLARE_ALIGNED(8, const uint64_t, ff_pw_96) = 0x0060006000600060ULL; | ||||
| DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL; | DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL; | ||||
| DECLARE_ALIGNED(16, const xmm_reg, ff_pw_255) = { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL }; | |||||
| DECLARE_ALIGNED(32, const ymm_reg, ff_pw_255) = { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, | |||||
| 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL }; | |||||
| DECLARE_ALIGNED(32, const ymm_reg, ff_pw_256) = { 0x0100010001000100ULL, 0x0100010001000100ULL, | DECLARE_ALIGNED(32, const ymm_reg, ff_pw_256) = { 0x0100010001000100ULL, 0x0100010001000100ULL, | ||||
| 0x0100010001000100ULL, 0x0100010001000100ULL }; | 0x0100010001000100ULL, 0x0100010001000100ULL }; | ||||
| DECLARE_ALIGNED(16, const xmm_reg, ff_pw_512) = { 0x0200020002000200ULL, 0x0200020002000200ULL }; | |||||
| DECLARE_ALIGNED(32, const ymm_reg, ff_pw_512) = { 0x0200020002000200ULL, 0x0200020002000200ULL, | |||||
| 0x0200020002000200ULL, 0x0200020002000200ULL }; | |||||
| DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL }; | DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL }; | ||||
| DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1024) = { 0x0400040004000400ULL, 0x0400040004000400ULL }; | |||||
| DECLARE_ALIGNED(16, const xmm_reg, ff_pw_2048) = { 0x0800080008000800ULL, 0x0800080008000800ULL }; | |||||
| DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8192) = { 0x2000200020002000ULL, 0x2000200020002000ULL }; | |||||
| DECLARE_ALIGNED(32, const ymm_reg, ff_pw_1023) = { 0x03ff03ff03ff03ffULL, 0x03ff03ff03ff03ffULL, | |||||
| 0x03ff03ff03ff03ffULL, 0x03ff03ff03ff03ffULL}; | |||||
| DECLARE_ALIGNED(32, const ymm_reg, ff_pw_1024) = { 0x0400040004000400ULL, 0x0400040004000400ULL, | |||||
| 0x0400040004000400ULL, 0x0400040004000400ULL}; | |||||
| DECLARE_ALIGNED(32, const ymm_reg, ff_pw_2048) = { 0x0800080008000800ULL, 0x0800080008000800ULL, | |||||
| 0x0800080008000800ULL, 0x0800080008000800ULL }; | |||||
| DECLARE_ALIGNED(32, const ymm_reg, ff_pw_4096) = { 0x1000100010001000ULL, 0x1000100010001000ULL, | |||||
| 0x1000100010001000ULL, 0x1000100010001000ULL }; | |||||
| DECLARE_ALIGNED(32, const ymm_reg, ff_pw_8192) = { 0x2000200020002000ULL, 0x2000200020002000ULL, | |||||
| 0x2000200020002000ULL, 0x2000200020002000ULL }; | |||||
| DECLARE_ALIGNED(32, const ymm_reg, ff_pw_m1) = { 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL, | DECLARE_ALIGNED(32, const ymm_reg, ff_pw_m1) = { 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL, | ||||
| 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL }; | 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL }; | ||||
| @@ -63,6 +72,7 @@ DECLARE_ALIGNED(32, const ymm_reg, ff_pb_3) = { 0x0303030303030303ULL, 0x030 | |||||
| 0x0303030303030303ULL, 0x0303030303030303ULL }; | 0x0303030303030303ULL, 0x0303030303030303ULL }; | ||||
| DECLARE_ALIGNED(32, const xmm_reg, ff_pb_15) = { 0x0F0F0F0F0F0F0F0FULL, 0x0F0F0F0F0F0F0F0FULL }; | DECLARE_ALIGNED(32, const xmm_reg, ff_pb_15) = { 0x0F0F0F0F0F0F0F0FULL, 0x0F0F0F0F0F0F0F0FULL }; | ||||
| DECLARE_ALIGNED(16, const xmm_reg, ff_pb_80) = { 0x8080808080808080ULL, 0x8080808080808080ULL }; | DECLARE_ALIGNED(16, const xmm_reg, ff_pb_80) = { 0x8080808080808080ULL, 0x8080808080808080ULL }; | ||||
| DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FE) = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL }; | |||||
| DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC) = 0xFCFCFCFCFCFCFCFCULL; | DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC) = 0xFCFCFCFCFCFCFCFCULL; | ||||
| DECLARE_ALIGNED(16, const xmm_reg, ff_ps_neg) = { 0x8000000080000000ULL, 0x8000000080000000ULL }; | DECLARE_ALIGNED(16, const xmm_reg, ff_ps_neg) = { 0x8000000080000000ULL, 0x8000000080000000ULL }; | ||||
| @@ -35,18 +35,20 @@ extern const xmm_reg ff_pw_9; | |||||
| extern const uint64_t ff_pw_15; | extern const uint64_t ff_pw_15; | ||||
| extern const xmm_reg ff_pw_16; | extern const xmm_reg ff_pw_16; | ||||
| extern const xmm_reg ff_pw_18; | extern const xmm_reg ff_pw_18; | ||||
| extern const uint64_t ff_pw_20; | |||||
| extern const xmm_reg ff_pw_20; | |||||
| extern const xmm_reg ff_pw_32; | extern const xmm_reg ff_pw_32; | ||||
| extern const uint64_t ff_pw_42; | extern const uint64_t ff_pw_42; | ||||
| extern const uint64_t ff_pw_53; | extern const uint64_t ff_pw_53; | ||||
| extern const xmm_reg ff_pw_64; | extern const xmm_reg ff_pw_64; | ||||
| extern const uint64_t ff_pw_96; | extern const uint64_t ff_pw_96; | ||||
| extern const uint64_t ff_pw_128; | extern const uint64_t ff_pw_128; | ||||
| extern const xmm_reg ff_pw_255; | |||||
| extern const xmm_reg ff_pw_512; | |||||
| extern const xmm_reg ff_pw_1024; | |||||
| extern const xmm_reg ff_pw_2048; | |||||
| extern const xmm_reg ff_pw_8192; | |||||
| extern const ymm_reg ff_pw_255; | |||||
| extern const ymm_reg ff_pw_512; | |||||
| extern const ymm_reg ff_pw_1023; | |||||
| extern const ymm_reg ff_pw_1024; | |||||
| extern const ymm_reg ff_pw_2048; | |||||
| extern const ymm_reg ff_pw_4096; | |||||
| extern const ymm_reg ff_pw_8192; | |||||
| extern const ymm_reg ff_pw_m1; | extern const ymm_reg ff_pw_m1; | ||||
| extern const ymm_reg ff_pb_0; | extern const ymm_reg ff_pb_0; | ||||
| @@ -54,7 +56,7 @@ extern const ymm_reg ff_pb_1; | |||||
| extern const ymm_reg ff_pb_2; | extern const ymm_reg ff_pb_2; | ||||
| extern const ymm_reg ff_pb_3; | extern const ymm_reg ff_pb_3; | ||||
| extern const xmm_reg ff_pb_80; | extern const xmm_reg ff_pb_80; | ||||
| extern const xmm_reg ff_pb_F8; | |||||
| extern const xmm_reg ff_pb_FE; | |||||
| extern const uint64_t ff_pb_FC; | extern const uint64_t ff_pb_FC; | ||||
| extern const xmm_reg ff_ps_neg; | extern const xmm_reg ff_ps_neg; | ||||
| @@ -26,15 +26,13 @@ | |||||
| %include "libavutil/x86/x86util.asm" | %include "libavutil/x86/x86util.asm" | ||||
| SECTION_RODATA | |||||
| pw_pixel_max: times 8 dw ((1 << 10)-1) | |||||
| SECTION .text | SECTION .text | ||||
| cextern pw_2 | cextern pw_2 | ||||
| cextern pw_3 | cextern pw_3 | ||||
| cextern pw_4 | cextern pw_4 | ||||
| cextern pw_1023 | |||||
| %define pw_pixel_max pw_1023 | |||||
| ; out: %4 = |%1-%2|-%3 | ; out: %4 = |%1-%2|-%3 | ||||
| ; clobbers: %5 | ; clobbers: %5 | ||||
| @@ -26,11 +26,13 @@ | |||||
| SECTION_RODATA | SECTION_RODATA | ||||
| pw_pixel_max: times 8 dw ((1 << 10)-1) | |||||
| pd_32: times 4 dd 32 | pd_32: times 4 dd 32 | ||||
| SECTION .text | SECTION .text | ||||
| cextern pw_1023 | |||||
| %define pw_pixel_max pw_1023 | |||||
| ;----------------------------------------------------------------------------- | ;----------------------------------------------------------------------------- | ||||
| ; void ff_h264_idct_add_10(pixel *dst, int16_t *block, int stride) | ; void ff_h264_idct_add_10(pixel *dst, int16_t *block, int stride) | ||||
| ;----------------------------------------------------------------------------- | ;----------------------------------------------------------------------------- | ||||
| @@ -26,6 +26,8 @@ | |||||
| SECTION_RODATA | SECTION_RODATA | ||||
| cextern pw_1023 | |||||
| %define pw_pixel_max pw_1023 | |||||
| cextern pw_512 | cextern pw_512 | ||||
| cextern pw_16 | cextern pw_16 | ||||
| cextern pw_8 | cextern pw_8 | ||||
| @@ -35,7 +37,6 @@ cextern pw_1 | |||||
| pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4 | pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4 | ||||
| pw_m3: times 8 dw -3 | pw_m3: times 8 dw -3 | ||||
| pw_pixel_max: times 8 dw ((1 << 10)-1) | |||||
| pd_17: times 4 dd 17 | pd_17: times 4 dd 17 | ||||
| pd_16: times 4 dd 16 | pd_16: times 4 dd 16 | ||||
| @@ -26,12 +26,12 @@ | |||||
| SECTION_RODATA 32 | SECTION_RODATA 32 | ||||
| cextern pw_1023 | |||||
| %define pw_pixel_max pw_1023 | |||||
| cextern pw_16 | cextern pw_16 | ||||
| cextern pw_1 | cextern pw_1 | ||||
| cextern pb_0 | cextern pb_0 | ||||
| pw_pixel_max: times 8 dw ((1 << 10)-1) | |||||
| pad10: times 8 dw 10*1023 | pad10: times 8 dw 10*1023 | ||||
| pad20: times 8 dw 20*1023 | pad20: times 8 dw 20*1023 | ||||
| pad30: times 8 dw 30*1023 | pad30: times 8 dw 30*1023 | ||||
| @@ -26,11 +26,12 @@ | |||||
| SECTION_RODATA 32 | SECTION_RODATA 32 | ||||
| pw_pixel_max: times 8 dw ((1 << 10)-1) | |||||
| sq_1: dq 1 | sq_1: dq 1 | ||||
| dq 0 | dq 0 | ||||
| cextern pw_1 | cextern pw_1 | ||||
| cextern pw_1023 | |||||
| %define pw_pixel_max pw_1023 | |||||
| SECTION .text | SECTION .text | ||||
| @@ -26,8 +26,9 @@ | |||||
| SECTION_RODATA | SECTION_RODATA | ||||
| cextern pw_1023 | |||||
| %define pw_pixel_max_10 pw_1023 | |||||
| pw_pixel_max_12: times 8 dw ((1 << 12)-1) | pw_pixel_max_12: times 8 dw ((1 << 12)-1) | ||||
| pw_pixel_max_10: times 8 dw ((1 << 10)-1) | |||||
| pw_m2: times 8 dw -2 | pw_m2: times 8 dw -2 | ||||
| pd_1 : times 4 dd 1 | pd_1 : times 4 dd 1 | ||||
| @@ -21,14 +21,21 @@ | |||||
| %include "libavutil/x86/x86util.asm" | %include "libavutil/x86/x86util.asm" | ||||
| SECTION_RODATA 32 | SECTION_RODATA 32 | ||||
| pw_8: times 16 dw (1 << 9) | |||||
| pw_10: times 16 dw (1 << 11) | |||||
| pw_12: times 16 dw (1 << 13) | |||||
| cextern pw_255 | |||||
| cextern pw_512 | |||||
| cextern pw_2048 | |||||
| cextern pw_8192 | |||||
| cextern pw_1023 | |||||
| cextern pw_1024 | |||||
| cextern pw_4096 | |||||
| %define pw_8 pw_512 | |||||
| %define pw_10 pw_2048 | |||||
| %define pw_12 pw_8192 | |||||
| %define pw_bi_10 pw_1024 | |||||
| %define pw_bi_12 pw_4096 | |||||
| %define max_pixels_8 pw_255 | |||||
| %define max_pixels_10 pw_1023 | |||||
| pw_bi_8: times 16 dw (1 << 8) | pw_bi_8: times 16 dw (1 << 8) | ||||
| pw_bi_10: times 16 dw (1 << 10) | |||||
| pw_bi_12: times 16 dw (1 << 12) | |||||
| max_pixels_8: times 16 dw ((1 << 8)-1) | |||||
| max_pixels_10: times 16 dw ((1 << 10)-1) | |||||
| max_pixels_12: times 16 dw ((1 << 12)-1) | max_pixels_12: times 16 dw ((1 << 12)-1) | ||||
| cextern pd_1 | cextern pd_1 | ||||
| cextern pb_0 | cextern pb_0 | ||||
| @@ -20,11 +20,11 @@ | |||||
| ; */ | ; */ | ||||
| %include "libavutil/x86/x86util.asm" | %include "libavutil/x86/x86util.asm" | ||||
| SECTION_RODATA 32 | |||||
| max_pixels_10: times 16 dw ((1 << 10)-1) | |||||
| SECTION .text | |||||
| cextern pw_1023 | |||||
| %define max_pixels_10 pw_1023 | |||||
| SECTION .text | |||||
| ;the tr_add macros and functions were largely inspired by x264 project's code in the h264_idct.asm file | ;the tr_add macros and functions were largely inspired by x264 project's code in the h264_idct.asm file | ||||
| %macro TR_ADD_MMX_4_8 0 | %macro TR_ADD_MMX_4_8 0 | ||||
| @@ -23,7 +23,8 @@ | |||||
| SECTION_RODATA | SECTION_RODATA | ||||
| v210_enc_min_10: times 8 dw 0x4 | |||||
| cextern pw_4 | |||||
| %define v210_enc_min_10 pw_4 | |||||
| v210_enc_max_10: times 8 dw 0x3fb | v210_enc_max_10: times 8 dw 0x3fb | ||||
| v210_enc_luma_mult_10: dw 4,1,16,4,1,16,0,0 | v210_enc_luma_mult_10: dw 4,1,16,4,1,16,0,0 | ||||
| @@ -32,8 +33,10 @@ v210_enc_luma_shuf_10: db -1,0,1,-1,2,3,4,5,-1,6,7,-1,8,9,10,11 | |||||
| v210_enc_chroma_mult_10: dw 1,4,16,0,16,1,4,0 | v210_enc_chroma_mult_10: dw 1,4,16,0,16,1,4,0 | ||||
| v210_enc_chroma_shuf_10: db 0,1,8,9,-1,2,3,-1,10,11,4,5,-1,12,13,-1 | v210_enc_chroma_shuf_10: db 0,1,8,9,-1,2,3,-1,10,11,4,5,-1,12,13,-1 | ||||
| v210_enc_min_8: times 16 db 0x1 | |||||
| v210_enc_max_8: times 16 db 0xfe | |||||
| cextern pb_1 | |||||
| %define v210_enc_min_8 pb_1 | |||||
| cextern pb_FE | |||||
| %define v210_enc_max_8 pb_FE | |||||
| v210_enc_luma_shuf_8: db 6,-1,7,-1,8,-1,9,-1,10,-1,11,-1,-1,-1,-1,-1 | v210_enc_luma_shuf_8: db 6,-1,7,-1,8,-1,9,-1,10,-1,11,-1,-1,-1,-1,-1 | ||||
| v210_enc_luma_mult_8: dw 16,4,64,16,4,64,0,0 | v210_enc_luma_mult_8: dw 16,4,64,16,4,64,0,0 | ||||
| @@ -36,11 +36,11 @@ vp3_idct_data: times 8 dw 64277 | |||||
| pb_7: times 8 db 0x07 | pb_7: times 8 db 0x07 | ||||
| pb_1F: times 8 db 0x1f | pb_1F: times 8 db 0x1f | ||||
| pb_81: times 8 db 0x81 | pb_81: times 8 db 0x81 | ||||
| pb_FE: times 8 db 0xFE | |||||
| cextern pb_1 | cextern pb_1 | ||||
| cextern pb_3 | cextern pb_3 | ||||
| cextern pb_80 | cextern pb_80 | ||||
| cextern pb_FE | |||||
| cextern pw_8 | cextern pw_8 | ||||