Based on x264 code
Signed-off-by: James Almer <jamrial@gmail.com>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
tags/n2.2-rc1
| @@ -323,6 +323,7 @@ Optimization options (experts only): | |||||
| --disable-sse42 disable SSE4.2 optimizations | --disable-sse42 disable SSE4.2 optimizations | ||||
| --disable-avx disable AVX optimizations | --disable-avx disable AVX optimizations | ||||
| --disable-xop disable XOP optimizations | --disable-xop disable XOP optimizations | ||||
| --disable-fma3 disable FMA3 optimizations | |||||
| --disable-fma4 disable FMA4 optimizations | --disable-fma4 disable FMA4 optimizations | ||||
| --disable-avx2 disable AVX2 optimizations | --disable-avx2 disable AVX2 optimizations | ||||
| --disable-armv5te disable armv5te optimizations | --disable-armv5te disable armv5te optimizations | ||||
| @@ -1455,6 +1456,7 @@ ARCH_EXT_LIST_X86=' | |||||
| avx | avx | ||||
| avx2 | avx2 | ||||
| cpunop | cpunop | ||||
| fma3 | |||||
| fma4 | fma4 | ||||
| i686 | i686 | ||||
| mmx | mmx | ||||
| @@ -1828,6 +1830,7 @@ sse4_deps="ssse3" | |||||
| sse42_deps="sse4" | sse42_deps="sse4" | ||||
| avx_deps="sse42" | avx_deps="sse42" | ||||
| xop_deps="avx" | xop_deps="avx" | ||||
| fma3_deps="avx" | |||||
| fma4_deps="avx" | fma4_deps="avx" | ||||
| avx2_deps="avx" | avx2_deps="avx" | ||||
| @@ -4252,6 +4255,7 @@ EOF | |||||
| die "yasm/nasm not found or too old. Use --disable-yasm for a crippled build." | die "yasm/nasm not found or too old. Use --disable-yasm for a crippled build." | ||||
| check_yasm "vextractf128 xmm0, ymm0, 0" || disable avx_external avresample | check_yasm "vextractf128 xmm0, ymm0, 0" || disable avx_external avresample | ||||
| check_yasm "vpmacsdd xmm0, xmm1, xmm2, xmm3" || disable xop_external | check_yasm "vpmacsdd xmm0, xmm1, xmm2, xmm3" || disable xop_external | ||||
| check_yasm "vfmadd132ps ymm0, ymm1, ymm2" || disable fma3_external | |||||
| check_yasm "vfmaddps ymm0, ymm1, ymm2, ymm3" || disable fma4_external | check_yasm "vfmaddps ymm0, ymm1, ymm2, ymm3" || disable fma4_external | ||||
| check_yasm "CPU amdnop" && enable cpunop | check_yasm "CPU amdnop" && enable cpunop | ||||
| fi | fi | ||||
| @@ -4937,6 +4941,7 @@ if enabled x86; then | |||||
| echo "SSSE3 enabled ${ssse3-no}" | echo "SSSE3 enabled ${ssse3-no}" | ||||
| echo "AVX enabled ${avx-no}" | echo "AVX enabled ${avx-no}" | ||||
| echo "XOP enabled ${xop-no}" | echo "XOP enabled ${xop-no}" | ||||
| echo "FMA3 enabled ${fma3-no}" | |||||
| echo "FMA4 enabled ${fma4-no}" | echo "FMA4 enabled ${fma4-no}" | ||||
| echo "i686 features enabled ${i686-no}" | echo "i686 features enabled ${i686-no}" | ||||
| echo "CMOV is fast ${fast_cmov-no}" | echo "CMOV is fast ${fast_cmov-no}" | ||||
| @@ -91,6 +91,7 @@ int av_parse_cpu_flags(const char *s) | |||||
| #define CPUFLAG_SSE42 (AV_CPU_FLAG_SSE42 | CPUFLAG_SSE4) | #define CPUFLAG_SSE42 (AV_CPU_FLAG_SSE42 | CPUFLAG_SSE4) | ||||
| #define CPUFLAG_AVX (AV_CPU_FLAG_AVX | CPUFLAG_SSE42) | #define CPUFLAG_AVX (AV_CPU_FLAG_AVX | CPUFLAG_SSE42) | ||||
| #define CPUFLAG_XOP (AV_CPU_FLAG_XOP | CPUFLAG_AVX) | #define CPUFLAG_XOP (AV_CPU_FLAG_XOP | CPUFLAG_AVX) | ||||
| #define CPUFLAG_FMA3 (AV_CPU_FLAG_FMA3 | CPUFLAG_AVX) | |||||
| #define CPUFLAG_FMA4 (AV_CPU_FLAG_FMA4 | CPUFLAG_AVX) | #define CPUFLAG_FMA4 (AV_CPU_FLAG_FMA4 | CPUFLAG_AVX) | ||||
| #define CPUFLAG_AVX2 (AV_CPU_FLAG_AVX2 | CPUFLAG_AVX) | #define CPUFLAG_AVX2 (AV_CPU_FLAG_AVX2 | CPUFLAG_AVX) | ||||
| static const AVOption cpuflags_opts[] = { | static const AVOption cpuflags_opts[] = { | ||||
| @@ -111,6 +112,7 @@ int av_parse_cpu_flags(const char *s) | |||||
| { "sse4.2" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_SSE42 }, .unit = "flags" }, | { "sse4.2" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_SSE42 }, .unit = "flags" }, | ||||
| { "avx" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_AVX }, .unit = "flags" }, | { "avx" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_AVX }, .unit = "flags" }, | ||||
| { "xop" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_XOP }, .unit = "flags" }, | { "xop" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_XOP }, .unit = "flags" }, | ||||
| { "fma3" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_FMA3 }, .unit = "flags" }, | |||||
| { "fma4" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_FMA4 }, .unit = "flags" }, | { "fma4" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_FMA4 }, .unit = "flags" }, | ||||
| { "avx2" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_AVX2 }, .unit = "flags" }, | { "avx2" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_AVX2 }, .unit = "flags" }, | ||||
| { "3dnow" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_3DNOW }, .unit = "flags" }, | { "3dnow" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_3DNOW }, .unit = "flags" }, | ||||
| @@ -166,6 +168,7 @@ int av_parse_cpu_caps(unsigned *flags, const char *s) | |||||
| { "sse4.2" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_SSE42 }, .unit = "flags" }, | { "sse4.2" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_SSE42 }, .unit = "flags" }, | ||||
| { "avx" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_AVX }, .unit = "flags" }, | { "avx" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_AVX }, .unit = "flags" }, | ||||
| { "xop" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_XOP }, .unit = "flags" }, | { "xop" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_XOP }, .unit = "flags" }, | ||||
| { "fma3" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_FMA3 }, .unit = "flags" }, | |||||
| { "fma4" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_FMA4 }, .unit = "flags" }, | { "fma4" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_FMA4 }, .unit = "flags" }, | ||||
| { "avx2" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_AVX2 }, .unit = "flags" }, | { "avx2" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_AVX2 }, .unit = "flags" }, | ||||
| { "3dnow" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_3DNOW }, .unit = "flags" }, | { "3dnow" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_3DNOW }, .unit = "flags" }, | ||||
| @@ -279,6 +282,7 @@ static const struct { | |||||
| { AV_CPU_FLAG_SSE42, "sse4.2" }, | { AV_CPU_FLAG_SSE42, "sse4.2" }, | ||||
| { AV_CPU_FLAG_AVX, "avx" }, | { AV_CPU_FLAG_AVX, "avx" }, | ||||
| { AV_CPU_FLAG_XOP, "xop" }, | { AV_CPU_FLAG_XOP, "xop" }, | ||||
| { AV_CPU_FLAG_FMA3, "fma3" }, | |||||
| { AV_CPU_FLAG_FMA4, "fma4" }, | { AV_CPU_FLAG_FMA4, "fma4" }, | ||||
| { AV_CPU_FLAG_3DNOW, "3dnow" }, | { AV_CPU_FLAG_3DNOW, "3dnow" }, | ||||
| { AV_CPU_FLAG_3DNOWEXT, "3dnowext" }, | { AV_CPU_FLAG_3DNOWEXT, "3dnowext" }, | ||||
| @@ -51,6 +51,7 @@ | |||||
| // #define AV_CPU_FLAG_CMOV 0x1000 ///< supports cmov instruction | // #define AV_CPU_FLAG_CMOV 0x1000 ///< supports cmov instruction | ||||
| // #endif | // #endif | ||||
| #define AV_CPU_FLAG_AVX2 0x8000 ///< AVX2 functions: requires OS support even if YMM registers aren't used | #define AV_CPU_FLAG_AVX2 0x8000 ///< AVX2 functions: requires OS support even if YMM registers aren't used | ||||
| #define AV_CPU_FLAG_FMA3 0x10000 ///< Haswell FMA3 functions | |||||
| #define AV_CPU_FLAG_ALTIVEC 0x0001 ///< standard | #define AV_CPU_FLAG_ALTIVEC 0x0001 ///< standard | ||||
| @@ -131,8 +131,11 @@ int ff_get_cpu_flags_x86(void) | |||||
| if ((ecx & 0x18000000) == 0x18000000) { | if ((ecx & 0x18000000) == 0x18000000) { | ||||
| /* Check for OS support */ | /* Check for OS support */ | ||||
| xgetbv(0, eax, edx); | xgetbv(0, eax, edx); | ||||
| if ((eax & 0x6) == 0x6) | |||||
| if ((eax & 0x6) == 0x6) { | |||||
| rval |= AV_CPU_FLAG_AVX; | rval |= AV_CPU_FLAG_AVX; | ||||
| if (ecx&0x00001000) | |||||
| rval |= AV_CPU_FLAG_FMA3; | |||||
| } | |||||
| } | } | ||||
| #if HAVE_AVX2 | #if HAVE_AVX2 | ||||
| if (max_std_level >= 7) { | if (max_std_level >= 7) { | ||||
| @@ -38,6 +38,7 @@ | |||||
| #define X86_SSE42(flags) CPUEXT(flags, SSE42) | #define X86_SSE42(flags) CPUEXT(flags, SSE42) | ||||
| #define X86_AVX(flags) CPUEXT(flags, AVX) | #define X86_AVX(flags) CPUEXT(flags, AVX) | ||||
| #define X86_XOP(flags) CPUEXT(flags, XOP) | #define X86_XOP(flags) CPUEXT(flags, XOP) | ||||
| #define X86_FMA3(flags) CPUEXT(flags, FMA3) | |||||
| #define X86_FMA4(flags) CPUEXT(flags, FMA4) | #define X86_FMA4(flags) CPUEXT(flags, FMA4) | ||||
| #define X86_AVX2(flags) CPUEXT(flags, AVX2) | #define X86_AVX2(flags) CPUEXT(flags, AVX2) | ||||
| @@ -53,6 +54,7 @@ | |||||
| #define EXTERNAL_SSE42(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, SSE42) | #define EXTERNAL_SSE42(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, SSE42) | ||||
| #define EXTERNAL_AVX(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AVX) | #define EXTERNAL_AVX(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AVX) | ||||
| #define EXTERNAL_XOP(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, XOP) | #define EXTERNAL_XOP(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, XOP) | ||||
| #define EXTERNAL_FMA3(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, FMA3) | |||||
| #define EXTERNAL_FMA4(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, FMA4) | #define EXTERNAL_FMA4(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, FMA4) | ||||
| #define EXTERNAL_AVX2(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AVX2) | #define EXTERNAL_AVX2(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AVX2) | ||||
| @@ -68,6 +70,7 @@ | |||||
| #define INLINE_SSE42(flags) CPUEXT_SUFFIX(flags, _INLINE, SSE42) | #define INLINE_SSE42(flags) CPUEXT_SUFFIX(flags, _INLINE, SSE42) | ||||
| #define INLINE_AVX(flags) CPUEXT_SUFFIX(flags, _INLINE, AVX) | #define INLINE_AVX(flags) CPUEXT_SUFFIX(flags, _INLINE, AVX) | ||||
| #define INLINE_XOP(flags) CPUEXT_SUFFIX(flags, _INLINE, XOP) | #define INLINE_XOP(flags) CPUEXT_SUFFIX(flags, _INLINE, XOP) | ||||
| #define INLINE_FMA3(flags) CPUEXT_SUFFIX(flags, _INLINE, FMA3) | |||||
| #define INLINE_FMA4(flags) CPUEXT_SUFFIX(flags, _INLINE, FMA4) | #define INLINE_FMA4(flags) CPUEXT_SUFFIX(flags, _INLINE, FMA4) | ||||
| #define INLINE_AVX2(flags) CPUEXT_SUFFIX(flags, _INLINE, AVX2) | #define INLINE_AVX2(flags) CPUEXT_SUFFIX(flags, _INLINE, AVX2) | ||||