4x-6x faster on Sandy Bridge. Signed-off-by: Luca Barbato <lu_zero@gentoo.org> tags/n2.0
| @@ -200,7 +200,9 @@ int ff_lpc_calc_coefs(LPCContext *s, | |||||
| ref[i] = fabs(lpc[i][i]); | ref[i] = fabs(lpc[i][i]); | ||||
| } else if (lpc_type == FF_LPC_TYPE_CHOLESKY) { | } else if (lpc_type == FF_LPC_TYPE_CHOLESKY) { | ||||
| LLSModel m[2]; | LLSModel m[2]; | ||||
| double var[MAX_LPC_ORDER+1], av_uninit(weight); | |||||
| LOCAL_ALIGNED(32, double, var, [FFALIGN(MAX_LPC_ORDER+1,4)]); | |||||
| double av_uninit(weight); | |||||
| memset(var, 0, FFALIGN(MAX_LPC_ORDER+1,4)*sizeof(*var)); | |||||
| for(pass=0; pass<lpc_passes; pass++){ | for(pass=0; pass<lpc_passes; pass++){ | ||||
| avpriv_init_lls(&m[pass&1], max_order); | avpriv_init_lls(&m[pass&1], max_order); | ||||
| @@ -46,8 +46,8 @@ static void update_lls(LLSModel *m, double *var) | |||||
| void avpriv_solve_lls(LLSModel *m, double threshold, unsigned short min_order) | void avpriv_solve_lls(LLSModel *m, double threshold, unsigned short min_order) | ||||
| { | { | ||||
| int i, j, k; | int i, j, k; | ||||
| double (*factor)[MAX_VARS + 1] = (void *) &m->covariance[1][0]; | |||||
| double (*covar) [MAX_VARS + 1] = (void *) &m->covariance[1][1]; | |||||
| double (*factor)[MAX_VARS_ALIGN] = (void *) &m->covariance[1][0]; | |||||
| double (*covar) [MAX_VARS_ALIGN] = (void *) &m->covariance[1][1]; | |||||
| double *covar_y = m->covariance[0]; | double *covar_y = m->covariance[0]; | ||||
| int count = m->indep_count; | int count = m->indep_count; | ||||
| @@ -117,6 +117,8 @@ av_cold void avpriv_init_lls(LLSModel *m, int indep_count) | |||||
| m->indep_count = indep_count; | m->indep_count = indep_count; | ||||
| m->update_lls = update_lls; | m->update_lls = update_lls; | ||||
| m->evaluate_lls = evaluate_lls; | m->evaluate_lls = evaluate_lls; | ||||
| if (ARCH_X86) | |||||
| ff_init_lls_x86(m); | |||||
| } | } | ||||
| #if FF_API_LLS_PRIVATE | #if FF_API_LLS_PRIVATE | ||||
| @@ -154,7 +156,7 @@ int main(void) | |||||
| avpriv_init_lls(&m, 3); | avpriv_init_lls(&m, 3); | ||||
| for (i = 0; i < 100; i++) { | for (i = 0; i < 100; i++) { | ||||
| double var[4]; | |||||
| LOCAL_ALIGNED(32, double, var, [4]); | |||||
| double eval; | double eval; | ||||
| var[0] = (av_lfg_get(&lfg) / (double) UINT_MAX - 0.5) * 2; | var[0] = (av_lfg_get(&lfg) / (double) UINT_MAX - 0.5) * 2; | ||||
| @@ -23,9 +23,12 @@ | |||||
| #ifndef AVUTIL_LLS_H | #ifndef AVUTIL_LLS_H | ||||
| #define AVUTIL_LLS_H | #define AVUTIL_LLS_H | ||||
| #include "common.h" | |||||
| #include "mem.h" | |||||
| #include "version.h" | #include "version.h" | ||||
| #define MAX_VARS 32 | #define MAX_VARS 32 | ||||
| #define MAX_VARS_ALIGN FFALIGN(MAX_VARS+1,4) | |||||
| //FIXME avoid direct access to LLSModel from outside | //FIXME avoid direct access to LLSModel from outside | ||||
| @@ -33,26 +36,29 @@ | |||||
| * Linear least squares model. | * Linear least squares model. | ||||
| */ | */ | ||||
| typedef struct LLSModel { | typedef struct LLSModel { | ||||
| double covariance[MAX_VARS + 1][MAX_VARS + 1]; | |||||
| double coeff[MAX_VARS][MAX_VARS]; | |||||
| DECLARE_ALIGNED(32, double, covariance[MAX_VARS_ALIGN][MAX_VARS_ALIGN]); | |||||
| DECLARE_ALIGNED(32, double, coeff[MAX_VARS][MAX_VARS]); | |||||
| double variance[MAX_VARS]; | double variance[MAX_VARS]; | ||||
| int indep_count; | int indep_count; | ||||
| /** | /** | ||||
| * Take the outer-product of var[] with itself, and add to the covariance matrix. | * Take the outer-product of var[] with itself, and add to the covariance matrix. | ||||
| * @param m this context | * @param m this context | ||||
| * @param var training samples, starting with the value to be predicted | * @param var training samples, starting with the value to be predicted | ||||
| * Must be 32-byte aligned, and any padding elements must be initialized | |||||
| * (i.e. neither denormal nor NaN). | |||||
| */ | */ | ||||
| void (*update_lls)(struct LLSModel *m, double *var); | void (*update_lls)(struct LLSModel *m, double *var); | ||||
| /** | /** | ||||
| * Inner product of var[] and the LPC coefs. | * Inner product of var[] and the LPC coefs. | ||||
| * @param m this context | * @param m this context | ||||
| * @param var training samples, excluding the value to be predicted | |||||
| * @param var training samples, excluding the value to be predicted. May be unaligned. | |||||
| * @param order lpc order | * @param order lpc order | ||||
| */ | */ | ||||
| double (*evaluate_lls)(struct LLSModel *m, double *var, int order); | double (*evaluate_lls)(struct LLSModel *m, double *var, int order); | ||||
| } LLSModel; | } LLSModel; | ||||
| void avpriv_init_lls(LLSModel *m, int indep_count); | void avpriv_init_lls(LLSModel *m, int indep_count); | ||||
| void ff_init_lls_x86(LLSModel *m); | |||||
| void avpriv_solve_lls(LLSModel *m, double threshold, unsigned short min_order); | void avpriv_solve_lls(LLSModel *m, double threshold, unsigned short min_order); | ||||
| #if FF_API_LLS_PRIVATE | #if FF_API_LLS_PRIVATE | ||||
| @@ -1,6 +1,8 @@ | |||||
| OBJS += x86/cpu.o \ | OBJS += x86/cpu.o \ | ||||
| x86/float_dsp_init.o \ | x86/float_dsp_init.o \ | ||||
| x86/lls_init.o \ | |||||
| YASM-OBJS += x86/cpuid.o \ | YASM-OBJS += x86/cpuid.o \ | ||||
| x86/emms.o \ | x86/emms.o \ | ||||
| x86/float_dsp.o \ | x86/float_dsp.o \ | ||||
| x86/lls.o \ | |||||
| @@ -0,0 +1,196 @@ | |||||
| ;****************************************************************************** | |||||
| ;* linear least squares model | |||||
| ;* | |||||
| ;* Copyright (c) 2013 Loren Merritt | |||||
| ;* | |||||
| ;* This file is part of Libav. | |||||
| ;* | |||||
| ;* Libav is free software; you can redistribute it and/or | |||||
| ;* modify it under the terms of the GNU Lesser General Public | |||||
| ;* License as published by the Free Software Foundation; either | |||||
| ;* version 2.1 of the License, or (at your option) any later version. | |||||
| ;* | |||||
| ;* Libav is distributed in the hope that it will be useful, | |||||
| ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
| ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||||
| ;* Lesser General Public License for more details. | |||||
| ;* | |||||
| ;* You should have received a copy of the GNU Lesser General Public | |||||
| ;* License along with Libav; if not, write to the Free Software | |||||
| ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||||
| ;****************************************************************************** | |||||
| %include "x86util.asm" | |||||
| SECTION .text | |||||
| ; These constants and the struc below duplicate the layout declared in | |||||
| ; libavutil/lls.h and must be kept in sync with it by hand. | |||||
| ; NOTE(review): the C header defines MAX_VARS_ALIGN as FFALIGN(MAX_VARS+1,4); | |||||
| ; confirm that it evaluates to the same value as (MAX_VARS+4) here. | |||||
| %define MAX_VARS 32 | |||||
| %define MAX_VARS_ALIGN (MAX_VARS+4) | |||||
| ; Size in bytes of one covariance row (MAX_VARS_ALIGN 8-byte elements). | |||||
| %define COVAR_STRIDE MAX_VARS_ALIGN*8 | |||||
| ; Memory operand for 8-byte element x of the row that lies y rows after covarq. | |||||
| ; covarq is (re)defined by each function below. | |||||
| %define COVAR(x,y) [covarq + (x)*8 + (y)*COVAR_STRIDE] | |||||
| ; Field offsets of the leading LLSModel members (covariance is at offset 0). | |||||
| struc LLSModel | |||||
| .covariance: resq MAX_VARS_ALIGN*MAX_VARS_ALIGN | |||||
| .coeff: resq MAX_VARS*MAX_VARS | |||||
| .variance: resq MAX_VARS | |||||
| .indep_count: resd 1 | |||||
| endstruc | |||||
| ; ADDPD_MEM mem, reg: reg += mem, then write reg back to mem with mova. | |||||
| ; The register is clobbered (it ends up holding the updated sum). | |||||
| %macro ADDPD_MEM 2 | |||||
| %if cpuflag(avx) | |||||
| vaddpd %2, %1 | |||||
| %else | |||||
| addpd %2, %1 | |||||
| %endif | |||||
| mova %1, %2 | |||||
| %endmacro | |||||
| ; SSE2 version of update_lls; declared on the C side as | |||||
| ; void ff_update_lls_sse2(LLSModel *m, double *var). | |||||
| ; Multiplies pairs of var[] elements and accumulates the products into the | |||||
| ; covariance matrix, two doubles per register. Rows are handled four at a | |||||
| ; time; for each group the columns run from the diagonal to the end of var[]. | |||||
| ; i and j are negative offsets from var + indep_count that count up to zero. | |||||
| INIT_XMM sse2 | |||||
| %define movdqa movaps | |||||
| cglobal update_lls, 2,5,8, ctx, var, i, j, covar2 | |||||
| %define covarq ctxq | |||||
| mov id, [ctxq + LLSModel.indep_count] | |||||
| ; Move var to var[indep_count] and negate i, so [varq + iq*8] starts at var[0]. | |||||
| lea varq, [varq + iq*8] | |||||
| neg iq | |||||
| ; covarq aliases ctxq, i.e. the start of the covariance field (offset 0). | |||||
| ; covar2 remembers the diagonal position while covarq walks along the row. | |||||
| mov covar2q, covarq | |||||
| .loopi: | |||||
| ; Compute all 3 pairwise products of a 2x2 block that lies on the diagonal | |||||
| mova m1, [varq + iq*8] | |||||
| mova m3, [varq + iq*8 + 16] | |||||
| ; m4..m7 = var[i], var[i+1], var[i+2], var[i+3], each copied into both lanes | |||||
| pshufd m4, m1, q1010 | |||||
| pshufd m5, m1, q3232 | |||||
| pshufd m6, m3, q1010 | |||||
| pshufd m7, m3, q3232 | |||||
| mulpd m0, m1, m4 | |||||
| mulpd m1, m1, m5 | |||||
| ; covarq = diagonal position + 2 columns, so COVAR(-2,y) is the diagonal | |||||
| ; column pair on row y of this group. | |||||
| lea covarq, [covar2q + 16] | |||||
| ADDPD_MEM COVAR(-2,0), m0 | |||||
| ADDPD_MEM COVAR(-2,1), m1 | |||||
| ; j = i + 2; m3 already holds var[j], var[j+1] | |||||
| lea jq, [iq + 2] | |||||
| cmp jd, -2 | |||||
| jg .skip4x4 | |||||
| .loop4x4: | |||||
| ; Compute all 16 pairwise products of a 4x4 block | |||||
| ; first column pair (var[j], var[j+1]) against the four row values | |||||
| mulpd m0, m4, m3 | |||||
| mulpd m1, m5, m3 | |||||
| mulpd m2, m6, m3 | |||||
| mulpd m3, m3, m7 | |||||
| ADDPD_MEM COVAR(0,0), m0 | |||||
| ADDPD_MEM COVAR(0,1), m1 | |||||
| ADDPD_MEM COVAR(0,2), m2 | |||||
| ADDPD_MEM COVAR(0,3), m3 | |||||
| ; second column pair (var[j+2], var[j+3]) | |||||
| mova m3, [varq + jq*8 + 16] | |||||
| mulpd m0, m4, m3 | |||||
| mulpd m1, m5, m3 | |||||
| mulpd m2, m6, m3 | |||||
| mulpd m3, m3, m7 | |||||
| ADDPD_MEM COVAR(2,0), m0 | |||||
| ADDPD_MEM COVAR(2,1), m1 | |||||
| ADDPD_MEM COVAR(2,2), m2 | |||||
| ADDPD_MEM COVAR(2,3), m3 | |||||
| ; preload the column pair used by the next iteration or by the tail below | |||||
| mova m3, [varq + jq*8 + 32] | |||||
| add covarq, 32 | |||||
| add jq, 4 | |||||
| cmp jd, -2 | |||||
| jle .loop4x4 | |||||
| .skip4x4: | |||||
| ; If j <= 0 one column pair is left: a 2-column by 4-row block using m3. | |||||
| test jd, jd | |||||
| jg .skip2x4 | |||||
| mulpd m4, m3 | |||||
| mulpd m5, m3 | |||||
| mulpd m6, m3 | |||||
| mulpd m7, m3 | |||||
| ADDPD_MEM COVAR(0,0), m4 | |||||
| ADDPD_MEM COVAR(0,1), m5 | |||||
| ADDPD_MEM COVAR(0,2), m6 | |||||
| ADDPD_MEM COVAR(0,3), m7 | |||||
| .skip2x4: | |||||
| ; Next group: 4 rows down and 4 columns (32 bytes) right along the diagonal. | |||||
| add iq, 4 | |||||
| add covar2q, 4*COVAR_STRIDE+32 | |||||
| cmp id, -2 | |||||
| jle .loopi | |||||
| ; Rows still left (i <= 0) are handled one at a time, two columns each. | |||||
| test id, id | |||||
| jg .ret | |||||
| mov jq, iq | |||||
| ; From here on COVAR() is relative to the diagonal pointer itself. | |||||
| %define covarq covar2q | |||||
| .loop2x1: | |||||
| ; m0 = var[i] in both lanes, times the fixed column pair var[j], var[j+1] | |||||
| movsd m0, [varq + iq*8] | |||||
| movlhps m0, m0 | |||||
| mulpd m0, [varq + jq*8] | |||||
| ADDPD_MEM COVAR(0,0), m0 | |||||
| inc iq | |||||
| add covarq, COVAR_STRIDE | |||||
| test id, id | |||||
| jle .loop2x1 | |||||
| .ret: | |||||
| REP_RET | |||||
| ; AVX version of update_lls; declared on the C side as | |||||
| ; void ff_update_lls_avx(LLSModel *m, double *var). | |||||
| ; Same accumulation as the SSE2 routine, but with four doubles per register | |||||
| ; and non-negative indices: i is the row, j the column, count = indep_count. | |||||
| ; covarq (= ctxq) only advances by whole rows; the column offset is supplied | |||||
| ; through the x argument of COVAR(). | |||||
| INIT_YMM avx | |||||
| cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2 | |||||
| %define covarq ctxq | |||||
| mov countd, [ctxq + LLSModel.indep_count] | |||||
| ; count2 = count - 2 is the loop bound for the 4-wide steps | |||||
| lea count2d, [countq-2] | |||||
| xor id, id | |||||
| .loopi: | |||||
| ; Compute all 10 pairwise products of a 4x4 block that lies on the diagonal | |||||
| mova ymm1, [varq + iq*8] | |||||
| ; ymm4..ymm7 = var[i], var[i+1], var[i+2], var[i+3], each broadcast to all lanes | |||||
| vbroadcastsd ymm4, [varq + iq*8] | |||||
| vbroadcastsd ymm5, [varq + iq*8 + 8] | |||||
| vbroadcastsd ymm6, [varq + iq*8 + 16] | |||||
| vbroadcastsd ymm7, [varq + iq*8 + 24] | |||||
| ; xmm3 = upper half of ymm1 = var[i+2], var[i+3] | |||||
| vextractf128 xmm3, ymm1, 1 | |||||
| vmulpd ymm0, ymm1, ymm4 | |||||
| vmulpd ymm1, ymm1, ymm5 | |||||
| vmulpd xmm2, xmm3, xmm6 | |||||
| vmulpd xmm3, xmm3, xmm7 | |||||
| ; rows 0-1: four columns from i; rows 2-3: two columns from i+2 | |||||
| ADDPD_MEM COVAR(iq ,0), ymm0 | |||||
| ADDPD_MEM COVAR(iq ,1), ymm1 | |||||
| ADDPD_MEM COVAR(iq+2,2), xmm2 | |||||
| ADDPD_MEM COVAR(iq+2,3), xmm3 | |||||
| ; columns to the right of the diagonal block start at j = i + 4 | |||||
| lea jd, [iq + 4] | |||||
| cmp jd, count2d | |||||
| jg .skip4x4 | |||||
| .loop4x4: | |||||
| ; Compute all 16 pairwise products of a 4x4 block | |||||
| mova ymm3, [varq + jq*8] | |||||
| vmulpd ymm0, ymm3, ymm4 | |||||
| vmulpd ymm1, ymm3, ymm5 | |||||
| vmulpd ymm2, ymm3, ymm6 | |||||
| vmulpd ymm3, ymm3, ymm7 | |||||
| ADDPD_MEM COVAR(jq,0), ymm0 | |||||
| ADDPD_MEM COVAR(jq,1), ymm1 | |||||
| ADDPD_MEM COVAR(jq,2), ymm2 | |||||
| ADDPD_MEM COVAR(jq,3), ymm3 | |||||
| add jd, 4 | |||||
| cmp jd, count2d | |||||
| jle .loop4x4 | |||||
| .skip4x4: | |||||
| ; If j <= count one column pair is left: a 2-column by 4-row block, | |||||
| ; using the low halves of the broadcast registers. | |||||
| cmp jd, countd | |||||
| jg .skip2x4 | |||||
| mova xmm3, [varq + jq*8] | |||||
| vmulpd xmm0, xmm3, xmm4 | |||||
| vmulpd xmm1, xmm3, xmm5 | |||||
| vmulpd xmm2, xmm3, xmm6 | |||||
| vmulpd xmm3, xmm3, xmm7 | |||||
| ADDPD_MEM COVAR(jq,0), xmm0 | |||||
| ADDPD_MEM COVAR(jq,1), xmm1 | |||||
| ADDPD_MEM COVAR(jq,2), xmm2 | |||||
| ADDPD_MEM COVAR(jq,3), xmm3 | |||||
| .skip2x4: | |||||
| ; Next group of 4 rows | |||||
| add id, 4 | |||||
| add covarq, 4*COVAR_STRIDE | |||||
| cmp id, count2d | |||||
| jle .loopi | |||||
| ; Rows still left (i <= count) are handled one at a time, two columns each. | |||||
| cmp id, countd | |||||
| jg .ret | |||||
| mov jd, id | |||||
| .loop2x1: | |||||
| ; xmm0 = var[i] in both lanes, times the fixed column pair var[j], var[j+1] | |||||
| vmovddup xmm0, [varq + iq*8] | |||||
| vmulpd xmm0, [varq + jq*8] | |||||
| ADDPD_MEM COVAR(jq,0), xmm0 | |||||
| inc id | |||||
| add covarq, COVAR_STRIDE | |||||
| cmp id, countd | |||||
| jle .loop2x1 | |||||
| .ret: | |||||
| REP_RET | |||||
| @@ -0,0 +1,38 @@ | |||||
| /* | |||||
| * linear least squares model | |||||
| * | |||||
| * Copyright (c) 2013 Loren Merritt | |||||
| * | |||||
| * This file is part of Libav. | |||||
| * | |||||
| * Libav is free software; you can redistribute it and/or | |||||
| * modify it under the terms of the GNU Lesser General Public | |||||
| * License as published by the Free Software Foundation; either | |||||
| * version 2.1 of the License, or (at your option) any later version. | |||||
| * | |||||
| * Libav is distributed in the hope that it will be useful, | |||||
| * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||||
| * Lesser General Public License for more details. | |||||
| * | |||||
| * You should have received a copy of the GNU Lesser General Public | |||||
| * License along with Libav; if not, write to the Free Software | |||||
| * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||||
| */ | |||||
| #include "libavutil/lls.h" | |||||
| #include "libavutil/x86/cpu.h" | |||||
| void ff_update_lls_sse2(LLSModel *m, double *var); | |||||
| void ff_update_lls_avx(LLSModel *m, double *var); | |||||
| /** | |||||
| * Select the x86 SIMD implementation of update_lls for a model. | |||||
| * | |||||
| * AVX takes priority over SSE2; when neither is reported by | |||||
| * av_get_cpu_flags() the pointer already stored in the model is kept. | |||||
| * | |||||
| * @param m model whose update_lls function pointer may be replaced | |||||
| */ | |||||
| av_cold void ff_init_lls_x86(LLSModel *m) | |||||
| { | |||||
|     const int flags = av_get_cpu_flags(); | |||||
|     if (EXTERNAL_AVX(flags)) { | |||||
|         m->update_lls = ff_update_lls_avx; | |||||
|     } else if (EXTERNAL_SSE2(flags)) { | |||||
|         m->update_lls = ff_update_lls_sse2; | |||||
|     } | |||||
| } | |||||