|
|
|
@@ -68,6 +68,39 @@ SCALARPRODUCT |
|
|
|
; Select XMM registers with the SSE2 instruction set for the code that
; follows, then expand SCALARPRODUCT. Its definition is outside this excerpt;
; presumably it is the macro that emits the scalar-product functions
; (TODO confirm against the macro definition).
INIT_XMM sse2 |
|
|
|
SCALARPRODUCT |
|
|
|
|
|
|
|
INIT_XMM sse4
;-----------------------------------------------------------------------------
; int ff_scalarproduct_and_madd_int32(int16_t *v1, int32_t *v2, int16_t *v3,
;                                     int order, int mul)
;
; In a single pass over `order` elements:
;   - accumulates the dot product sum(v1[i] * v2[i]) in 32-bit lanes
;     (v1[i] is sign-extended from 16 to 32 bits; products and sums wrap
;     modulo 2^32), and
;   - updates v1 in place: v1[i] += mul * v3[i], in wrapping 16-bit
;     arithmetic.
; The dot product is computed from the values of v1 before the update.
;
; Returns: the dot product, in eax.
;
; Requirements that follow from the code below:
;   - v1 is loaded and stored with mova, so it must be aligned to mmsize
;     bytes; v2 and v3 are loaded with movu and may be unaligned.
;   - every iteration consumes 8 elements and the loop body always runs at
;     least once, so order must be a positive multiple of 8; anything else
;     reads and writes past the end of the arrays.
;   - only the low 16 bits of mul take part in the multiplication.
;
; Register roles:
;   m6     = four 32-bit partial sums of the dot product
;   m7     = low word of mul replicated into every 16-bit lane
;   orderq = negative byte offset into v1/v3, counting up towards zero
;            (2*orderq is the matching byte offset into v2)
;-----------------------------------------------------------------------------
cglobal scalarproduct_and_madd_int32, 4,4,8, v1, v2, v3, order, mul
    ; order is a 32-bit int. On x86-64 the upper half of the register that
    ; carries it is not guaranteed to be defined, so scale it through the
    ; 32-bit view: that write zero-extends into orderq. On x86-32 orderd and
    ; orderq name the same register, so nothing changes there.
    shl         orderd, 1               ; orderq = order * sizeof(int16_t)
    movd        m7, mulm
    SPLATW      m7, m7                  ; m7 = mul in every word lane
    pxor        m6, m6                  ; clear the accumulator

    ; Point each array at its end and index with a negative offset, so the
    ; flags set by the final `add` double as the loop condition.
    add         v1q, orderq
    lea         v2q, [v2q + 2*orderq]   ; int32_t elements: twice the bytes
    add         v3q, orderq
    neg         orderq
.loop:
    mova        m3, [v1q + orderq]      ; 8 x int16 from v1 (aligned)
    movu        m0, [v2q + 2*orderq]    ; v2, elements 0-3
    pmovsxwd    m4, m3                  ; v1 elements 0-3 -> int32
    movu        m1, [v2q + 2*orderq + mmsize] ; v2, elements 4-7
    movhlps     m5, m3                  ; bring v1 elements 4-7 to the low half
    movu        m2, [v3q + orderq]      ; 8 x int16 from v3
    pmovsxwd    m5, m5                  ; v1 elements 4-7 -> int32
    pmullw      m2, m7                  ; low 16 bits of mul * v3[i]
    pmulld      m0, m4                  ; v1[i] * v2[i], elements 0-3
    pmulld      m1, m5                  ; v1[i] * v2[i], elements 4-7
    paddw       m2, m3                  ; v1[i] + mul * v3[i]
    paddd       m6, m0
    paddd       m6, m1
    mova        [v1q + orderq], m2      ; write the updated v1 back
    add         orderq, 16              ; advance by 8 int16 elements
    jl          .loop                   ; continue while the offset is negative

    HADDD       m6, m0                  ; horizontal sum of the dwords (m0 is scratch)
    movd        eax, m6
    RET
|
|
|
|
|
|
|
%macro SCALARPRODUCT_LOOP 1 |
|
|
|
align 16 |
|
|
|
.loop%1: |
|
|
|
|