From f3e084909bff422f0c853507a82f92ff2efc0d28 Mon Sep 17 00:00:00 2001
From: Christophe GISQUET <christophe.gisquet@gmail.com>
Date: Wed, 18 Jan 2012 22:34:29 +0100
Subject: [PATCH] mpegaudio: replace memcpy by SIMD code

By replacing memcpy with an unrolled loop using the alignment knowledge
it has, some speedup can be obtained.

Before (gcc 4.6.1): ~400 cycles
After: ~370 cycles

Overall, around 2% speed increase when decoding a 2400s mp3 to f32le.

Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
---
 libavcodec/x86/mpegaudiodec_mmx.c | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/libavcodec/x86/mpegaudiodec_mmx.c b/libavcodec/x86/mpegaudiodec_mmx.c
index 06ffbca90a..f51a06d14c 100644
--- a/libavcodec/x86/mpegaudiodec_mmx.c
+++ b/libavcodec/x86/mpegaudiodec_mmx.c
@@ -106,7 +106,26 @@ static void apply_window_mp3(float *in, float *win, int *unused, float *out,
     float sum;
 
     /* copy to avoid wrap */
-    memcpy(in + 512, in, 32 * sizeof(*in));
+    __asm__ volatile(
+            "movaps    0(%0), %%xmm0   \n\t" \
+            "movaps   16(%0), %%xmm1   \n\t" \
+            "movaps   32(%0), %%xmm2   \n\t" \
+            "movaps   48(%0), %%xmm3   \n\t" \
+            "movaps   %%xmm0,   0(%1) \n\t" \
+            "movaps   %%xmm1,  16(%1) \n\t" \
+            "movaps   %%xmm2,  32(%1) \n\t" \
+            "movaps   %%xmm3,  48(%1) \n\t" \
+            "movaps   64(%0), %%xmm0   \n\t" \
+            "movaps   80(%0), %%xmm1   \n\t" \
+            "movaps   96(%0), %%xmm2   \n\t" \
+            "movaps  112(%0), %%xmm3   \n\t" \
+            "movaps   %%xmm0,  64(%1) \n\t" \
+            "movaps   %%xmm1,  80(%1) \n\t" \
+            "movaps   %%xmm2,  96(%1) \n\t" \
+            "movaps   %%xmm3, 112(%1) \n\t"
+            ::"r"(in), "r"(in+512)
+            :"memory"
+            );
 
     apply_window(in + 16, win     , win + 512, suma, sumc, 16);
     apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16);