diff --git a/configure.ac b/configure.ac index 84b66c2..0460bc6 100644 --- a/configure.ac +++ b/configure.ac @@ -17,7 +17,7 @@ dnl changes are made dnl --- JACK_MAJOR_VERSION=0 JACK_MINOR_VERSION=107 -JACK_MICRO_VERSION=1 +JACK_MICRO_VERSION=2 dnl --- dnl HOWTO: updating the jack protocol version diff --git a/drivers/oss/oss_driver.c b/drivers/oss/oss_driver.c index 0035161..e3a3578 100644 --- a/drivers/oss/oss_driver.c +++ b/drivers/oss/oss_driver.c @@ -1,7 +1,7 @@ /* OSS driver for Jack - Copyright (C) 2003-2005 Jussi Laako + Copyright (C) 2003-2007 Jussi Laako This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -456,6 +456,7 @@ static int oss_driver_start (oss_driver_t *driver) samplesize = sizeof(short); break; } + driver->trigger = 0; if (strcmp(indev, outdev) != 0) { if (driver->capture_channels > 0) @@ -529,6 +530,8 @@ static int oss_driver_start (oss_driver_t *driver) } if (infd >= 0 && outfd >= 0) { + ioctl(outfd, SNDCTL_DSP_SETTRIGGER, &driver->trigger); + driver->trigger = (PCM_ENABLE_INPUT|PCM_ENABLE_OUTPUT); if (ioctl(infd, SNDCTL_DSP_SETDUPLEX, 0) < 0) { if (errno != EINVAL) /* Dont care */ @@ -961,6 +964,13 @@ static void *io_thread (void *param) __FILE__, __LINE__); return NULL; } + if (driver->trigger) + { + /* don't care too much if this fails */ + memset(localbuf, 0x00, localsize); + write(driver->outfd, localbuf, localsize); + ioctl(driver->outfd, SNDCTL_DSP_SETTRIGGER, &driver->trigger); + } while (driver->run) { @@ -992,6 +1002,13 @@ static void *io_thread (void *param) jack_error("OSS: malloc() failed: %s@%i", __FILE__, __LINE__); return NULL; } + if (driver->trigger) + { + /* don't care too much if this fails */ + memset(localbuf, 0x00, localsize); + write(driver->outfd, localbuf, driver->outdevbufsize); + ioctl(driver->outfd, SNDCTL_DSP_SETTRIGGER, &driver->trigger); + } while (driver->run) { @@ -1119,6 +1136,7 @@ jack_driver_t * driver_initialize (jack_client_t *client, driver->indev = NULL; driver->outdev = NULL; driver->ignorehwbuf = 0; + driver->trigger = 0; pnode = params; while (pnode != NULL) diff --git a/drivers/oss/oss_driver.h b/drivers/oss/oss_driver.h index 6db70f3..bcb2bd3 100644 --- a/drivers/oss/oss_driver.h +++ b/drivers/oss/oss_driver.h @@ -1,7 +1,7 @@ /* OSS driver for Jack - Copyright (C) 2003-2005 Jussi Laako + Copyright (C) 2003-2007 Jussi Laako This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -62,6 +62,7 @@ typedef struct _oss_driver int outfd; int format; int ignorehwbuf; + int trigger; size_t indevbufsize; size_t outdevbufsize; diff --git a/jack/intsimd.h b/jack/intsimd.h index 8f80eae..0853be7 100644 --- a/jack/intsimd.h +++ b/jack/intsimd.h @@ -1,5 +1,5 @@ /* - Copyright (C) 2005 Jussi Laako + Copyright (C) 2005-2007 Jussi Laako This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -39,9 +39,16 @@ typedef v4sf * pv4sf; extern int cpu_type; +int have_3dnow (void); +int have_sse (void); +void x86_3dnow_copyf (float *, const float *, int); +void x86_3dnow_add2f (float *, const float *, int); +void x86_sse_copyf (float *, const float *, int); +void x86_sse_add2f (float *, const float *, int); + #endif /* ARCH_X86 */ -extern void jack_port_set_funcs (void); +void jack_port_set_funcs (void); #endif /* __jack_intsimd_h__ */ diff --git a/libjack/Makefile.am b/libjack/Makefile.am index 0181a59..601e3f7 100644 --- a/libjack/Makefile.am +++ b/libjack/Makefile.am @@ -23,7 +23,8 @@ SOURCE_FILES = \ time.c \ timestamps.c \ transclient.c \ - unlock.c + unlock.c \ + simd.c lib_LTLIBRARIES = libjack.la diff --git a/libjack/client.c b/libjack/client.c index b3956b4..e734e37 100644 --- a/libjack/client.c +++ b/libjack/client.c @@ -63,9 +63,6 @@ static pthread_mutex_t client_lock; static pthread_cond_t client_ready; -#ifdef ARCH_X86 -int cpu_type = 0; -#endif /* ARCH_X86 */ #define EVENT_POLL_INDEX 0 #define WAIT_POLL_INDEX 1 @@ -82,98 +79,7 @@ typedef struct { #ifdef ARCH_X86 -static int -have_3dnow () -{ - unsigned int res = 0; - -#ifdef __x86_64__ - asm volatile ("pushq %%rbx\n\t" : : : "memory"); -#else - asm volatile ("pushl %%ebx\n\t" : : : "memory"); -#endif - asm volatile ( - "movl $0x80000000, %%eax\n\t" \ - "cpuid\n\t" \ - "cmpl $0x80000001, %%eax\n\t" \ - "jl tdnow_prexit\n\t" \ - \ - "movl $0x80000001, %%eax\n\t" \ - "cpuid\n\t" \ - \ - "xorl %%eax, %%eax\n\t" \ - \ - "movl $1, %%ecx\n\t" \ - "shll $31, %%ecx\n\t" \ - "testl %%ecx, %%edx\n\t" \ - "jz tdnow_testexit\n\t" \ - "movl $1, %%eax\n\t" \ - \ - "movl $1, %%ecx\n\t" \ - "shll $30, %%ecx\n\t" \ - "testl %%ecx, %%edx\n\t" \ - "jz tdnow_testexit\n\t" \ - "movl $2, %%eax\n\t" \ - "jmp tdnow_testexit\n\t" \ - \ - "tdnow_prexit:\n\t" \ - "xorl %%eax, %%eax\n\t" \ - "tdnow_testexit:\n\t" - : "=a" (res) - : - : "ecx", "edx", "memory"); -#ifdef __x86_64__ - asm volatile ("popq %%rbx\n\t" : : : "memory"); -#else - asm volatile ("popl %%ebx\n\t" : : : "memory"); -#endif - return res; -} - -static int -have_sse () -{ - unsigned int res = 0; - -#ifdef __x86_64__ - asm volatile ("pushq %%rbx\n\t" : : : "memory"); -#else - asm volatile ("pushl %%ebx\n\t" : : : "memory"); -#endif - asm volatile ( - "movl $1, %%eax\n\t" \ - "cpuid\n\t" \ - \ - "xorl %%eax, %%eax\n\t" \ - \ - "movl $1, %%ebx\n\t" \ - "shll $25, %%ebx\n\t" \ - "testl %%ebx, %%edx\n\t" \ - "jz sse_testexit\n\t" \ - "movl $1, %%eax\n\t" \ - \ - "movl $1, %%ebx\n\t" \ - "shll $26, %%ebx\n\t" \ - "testl %%ebx, %%edx\n\t" \ - "jz sse_testexit\n\t" \ - "movl $2, %%eax\n\t" \ - \ - "movl $1, %%ebx\n\t" \ - "testl %%ebx, %%ecx\n\t" \ - "jz sse_testexit\n\t" \ - "movl $3, %%eax\n\t" \ - \ - "sse_testexit:\n\t" - : "=a" (res) - : - : "ecx", "edx", "memory"); -#ifdef __x86_64__ - asm volatile ("popq %%rbx\n\t" : : : "memory"); -#else - asm volatile ("popl %%ebx\n\t" : : : "memory"); -#endif - return res; -} +int cpu_type = 0; static void init_cpu () diff --git a/libjack/port.c b/libjack/port.c index e87cdf1..1d365f5 100644 --- a/libjack/port.c +++ b/libjack/port.c @@ -96,216 +96,6 @@ gen_mixf (float *dest, const float *src, int length) #ifdef ARCH_X86 -static void -x86_3dnow_add2f (float *dest, const float *src, int length) -{ - int i, n; - pv2sf m64p_dest = (pv2sf) dest; - pv2sf m64p_src = (pv2sf) src; - - n = (length >> 1); - for (i = 0; i < n; i++) - { - asm volatile ( - "movq %1, %%mm0\n\t" \ - "pfadd %2, %%mm0\n\t" \ - "movq %%mm0, %0\n\t" - : "=m" (m64p_dest[i]) - : "m0" (m64p_dest[i]), - "m" (m64p_src[i]) - : "mm0", "memory"); - } - if (n & 0x1) - { - asm volatile ( - "movd %1, %%mm0\n\t" \ - "movd %2, %%mm1\n\t" \ - "pfadd %%mm1, %%mm0\n\t" \ - "movd %%mm0, %0\n\t" - : "=m" (dest[length - 1]) - : "m0" (dest[length - 1]), - "m" (src[length - 1]) - : "mm0", "mm1", "memory"); - } - asm volatile ( - "femms\n\t" \ - "sfence\n\t"); -} - -static void -x86_3dnow_copyf (float *dest, const float *src, int length) -{ - int i, n1, n2; - pv2sf m64p_src = (pv2sf) src; - pv2sf m64p_dest = (pv2sf) dest; - - n1 = (length >> 4); - n2 = ((length & 0xf) >> 1); - for (i = 0; i < n1; i++) - { - asm volatile ("movq %0, %%mm0\n\t" - : : "m" (*m64p_src++) : "mm0", "memory"); - asm volatile ("movq %0, %%mm1\n\t" - : : "m" (*m64p_src++) : "mm1", "memory"); - asm volatile ("movq %0, %%mm2\n\t" - : : "m" (*m64p_src++) : "mm2", "memory"); - asm volatile ("movq %0, %%mm3\n\t" - : : "m" (*m64p_src++) : "mm3", "memory"); - asm volatile ("movq %0, %%mm4\n\t" - : : "m" (*m64p_src++) : "mm4", "memory"); - asm volatile ("movq %0, %%mm5\n\t" - : : "m" (*m64p_src++) : "mm5", "memory"); - asm volatile ("movq %0, %%mm6\n\t" - : : "m" (*m64p_src++) : "mm6", "memory"); - asm volatile ("movq %0, %%mm7\n\t" - : : "m" (*m64p_src++) : "xmm7", "memory"); - - asm volatile ("movq %%mm0, %0\n\t" - : "=m" (*m64p_dest++) : : "mm0", "memory"); - asm volatile ("movq %%mm1, %0\n\t" - : "=m" (*m64p_dest++) : : "mm1", "memory"); - asm volatile ("movq %%mm2, %0\n\t" - : "=m" (*m64p_dest++) : : "mm2", "memory"); - asm volatile ("movq %%mm3, %0\n\t" - : "=m" (*m64p_dest++) : : "mm3", "memory"); - asm volatile ("movq %%mm4, %0\n\t" - : "=m" (*m64p_dest++) : : "mm4", "memory"); - asm volatile ("movq %%mm5, %0\n\t" - : "=m" (*m64p_dest++) : : "mm5", "memory"); - asm volatile ("movq %%mm6, %0\n\t" - : "=m" (*m64p_dest++) : : "mm6", "memory"); - asm volatile ("movq %%mm7, %0\n\t" - : "=m" (*m64p_dest++) : : "mm7", "memory"); - } - for (i = 0; i < n2; i++) - { - asm volatile ( - "movq %1, %%mm0\n\t" \ - "movq %%mm0, %0\n\t" - : "=m" (*m64p_dest++) - : "m" (*m64p_src++) - : "mm0", "memory"); - } - if (length & 0x1) - { - asm volatile ( - "movd %1, %%mm0\n\t" \ - "movd %%mm0, %0\n\t" - : "=m" (dest[length - 1]) - : "m" (src[length - 1]) - : "mm0", "memory"); - } - asm volatile ( - "femms\n\t" \ - "sfence\n\t"); -} - -static void -x86_sse_copyf (float *dest, const float *src, int length) -{ - int i, n1, n2, si3; - pv4sf m128p_src = (pv4sf) src; - pv4sf m128p_dest = (pv4sf) dest; - - n1 = (length >> 5); - n2 = ((length & 0x1f) >> 2); - si3 = (length & ~0x3); - for (i = 0; i < n1; i++) - { - asm volatile ("movaps %0, %%xmm0\n\t" - : : "m" (*m128p_src++) : "xmm0", "memory"); - asm volatile ("movaps %0, %%xmm1\n\t" - : : "m" (*m128p_src++) : "xmm1", "memory"); - asm volatile ("movaps %0, %%xmm2\n\t" - : : "m" (*m128p_src++) : "xmm2", "memory"); - asm volatile ("movaps %0, %%xmm3\n\t" - : : "m" (*m128p_src++) : "xmm3", "memory"); - asm volatile ("movaps %0, %%xmm4\n\t" - : : "m" (*m128p_src++) : "xmm4", "memory"); - asm volatile ("movaps %0, %%xmm5\n\t" - : : "m" (*m128p_src++) : "xmm5", "memory"); - asm volatile ("movaps %0, %%xmm6\n\t" - : : "m" (*m128p_src++) : "xmm6", "memory"); - asm volatile ("movaps %0, %%xmm7\n\t" - : : "m" (*m128p_src++) : "xmm7", "memory"); - - asm volatile ("movaps %%xmm0, %0\n\t" - : "=m" (*m128p_dest++) : : "xmm0", "memory"); - asm volatile ("movaps %%xmm1, %0\n\t" - : "=m" (*m128p_dest++) : : "xmm1", "memory"); - asm volatile ("movaps %%xmm2, %0\n\t" - : "=m" (*m128p_dest++) : : "xmm2", "memory"); - asm volatile ("movaps %%xmm3, %0\n\t" - : "=m" (*m128p_dest++) : : "xmm3", "memory"); - asm volatile ("movaps %%xmm4, %0\n\t" - : "=m" (*m128p_dest++) : : "xmm4", "memory"); - asm volatile ("movaps %%xmm5, %0\n\t" - : "=m" (*m128p_dest++) : : "xmm5", "memory"); - asm volatile ("movaps %%xmm6, %0\n\t" - : "=m" (*m128p_dest++) : : "xmm6", "memory"); - asm volatile ("movaps %%xmm7, %0\n\t" - : "=m" (*m128p_dest++) : : "xmm7", "memory"); - } - for (i = 0; i < n2; i++) - { - asm volatile ( - "movaps %1, %%xmm0\n\t" \ - "movaps %%xmm0, %0\n\t" - : "=m" (*m128p_dest++) - : "m" (*m128p_src++) - : "xmm0", "memory"); - } - for (i = si3; i < length; i++) - { - asm volatile ( - "movss %1, %%xmm0\n\t" \ - "movss %%xmm0, %0\n\t" - : "=m" (dest[i]) - : "m" (src[i]) - : "xmm0", "memory"); - } -} - -static void -x86_sse_add2f (float *dest, const float *src, int length) -{ - int i, n, si2; - pv4sf m128p_src = (pv4sf) src; - pv4sf m128p_dest = (pv4sf) dest; - - if (__builtin_expect(((long) src & 0xf) || ((long) dest & 0xf), 0)) - { - /*fprintf(stderr, "x86_sse_add2f(): non aligned pointers!\n");*/ - si2 = 0; - goto sse_nonalign; - } - si2 = (length & ~0x3); - n = (length >> 2); - for (i = 0; i < n; i++) - { - asm volatile ( - "movaps %1, %%xmm0\n\t" \ - "addps %2, %%xmm0\n\t" \ - "movaps %%xmm0, %0\n\t" - : "=m" (m128p_dest[i]) - : "m0" (m128p_dest[i]), - "m" (m128p_src[i]) - : "xmm0", "memory"); - } -sse_nonalign: - for (i = si2; i < length; i++) - { - asm volatile ( - "movss %1, %%xmm0\n\t" \ - "addss %2, %%xmm0\n\t" \ - "movss %%xmm0, %0\n\t" - : "=m" (dest[i]) - : "m0" (dest[i]), - "m" (src[i]) - : "xmm0", "memory"); - } -} - void jack_port_set_funcs () { if (ARCH_X86_HAVE_SSE2(cpu_type)) { diff --git a/libjack/simd.c b/libjack/simd.c new file mode 100644 index 0000000..f899b0f --- /dev/null +++ b/libjack/simd.c @@ -0,0 +1,335 @@ +/* -*- mode: c; c-file-style: "bsd"; -*- */ +/* + Copyright (C) 2005-2007 Jussi Laako + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +*/ + + +#include +#include + +#ifdef USE_DYNSIMD + +#ifdef ARCH_X86 + +int +have_3dnow () +{ + unsigned int res = 0; + +#ifdef __x86_64__ + asm volatile ("pushq %%rbx\n\t" : : : "memory"); +#else + asm volatile ("pushl %%ebx\n\t" : : : "memory"); +#endif + asm volatile ( + "movl $0x80000000, %%eax\n\t" \ + "cpuid\n\t" \ + "cmpl $0x80000001, %%eax\n\t" \ + "jl tdnow_prexit\n\t" \ + \ + "movl $0x80000001, %%eax\n\t" \ + "cpuid\n\t" \ + \ + "xorl %%eax, %%eax\n\t" \ + \ + "movl $1, %%ecx\n\t" \ + "shll $31, %%ecx\n\t" \ + "testl %%ecx, %%edx\n\t" \ + "jz tdnow_testexit\n\t" \ + "movl $1, %%eax\n\t" \ + \ + "movl $1, %%ecx\n\t" \ + "shll $30, %%ecx\n\t" \ + "testl %%ecx, %%edx\n\t" \ + "jz tdnow_testexit\n\t" \ + "movl $2, %%eax\n\t" \ + "jmp tdnow_testexit\n\t" \ + \ + "tdnow_prexit:\n\t" \ + "xorl %%eax, %%eax\n\t" \ + "tdnow_testexit:\n\t" + : "=a" (res) + : + : "ecx", "edx", "memory"); +#ifdef __x86_64__ + asm volatile ("popq %%rbx\n\t" : : : "memory"); +#else + asm volatile ("popl %%ebx\n\t" : : : "memory"); +#endif + return res; +} + +int +have_sse () +{ + unsigned int res = 0; + +#ifdef __x86_64__ + asm volatile ("pushq %%rbx\n\t" : : : "memory"); +#else + asm volatile ("pushl %%ebx\n\t" : : : "memory"); +#endif + asm volatile ( + "movl $1, %%eax\n\t" \ + "cpuid\n\t" \ + \ + "xorl %%eax, %%eax\n\t" \ + \ + "movl $1, %%ebx\n\t" \ + "shll $25, %%ebx\n\t" \ + "testl %%ebx, %%edx\n\t" \ + "jz sse_testexit\n\t" \ + "movl $1, %%eax\n\t" \ + \ + "movl $1, %%ebx\n\t" \ + "shll $26, %%ebx\n\t" \ + "testl %%ebx, %%edx\n\t" \ + "jz sse_testexit\n\t" \ + "movl $2, %%eax\n\t" \ + \ + "movl $1, %%ebx\n\t" \ + "testl %%ebx, %%ecx\n\t" \ + "jz sse_testexit\n\t" \ + "movl $3, %%eax\n\t" \ + \ + "sse_testexit:\n\t" + : "=a" (res) + : + : "ecx", "edx", "memory"); +#ifdef __x86_64__ + asm volatile ("popq %%rbx\n\t" : : : "memory"); +#else + asm volatile ("popl %%ebx\n\t" : : : "memory"); +#endif + return res; +} + +void +x86_3dnow_copyf (float *dest, const float *src, int length) +{ + int i, n1, n2; + pv2sf m64p_src = (pv2sf) src; + pv2sf m64p_dest = (pv2sf) dest; + + n1 = (length >> 4); + n2 = ((length & 0xf) >> 1); + for (i = 0; i < n1; i++) + { + asm volatile ("movq %0, %%mm0\n\t" + : : "m" (*m64p_src++) : "mm0", "memory"); + asm volatile ("movq %0, %%mm1\n\t" + : : "m" (*m64p_src++) : "mm1", "memory"); + asm volatile ("movq %0, %%mm2\n\t" + : : "m" (*m64p_src++) : "mm2", "memory"); + asm volatile ("movq %0, %%mm3\n\t" + : : "m" (*m64p_src++) : "mm3", "memory"); + asm volatile ("movq %0, %%mm4\n\t" + : : "m" (*m64p_src++) : "mm4", "memory"); + asm volatile ("movq %0, %%mm5\n\t" + : : "m" (*m64p_src++) : "mm5", "memory"); + asm volatile ("movq %0, %%mm6\n\t" + : : "m" (*m64p_src++) : "mm6", "memory"); + asm volatile ("movq %0, %%mm7\n\t" + : : "m" (*m64p_src++) : "xmm7", "memory"); + + asm volatile ("movq %%mm0, %0\n\t" + : "=m" (*m64p_dest++) : : "mm0", "memory"); + asm volatile ("movq %%mm1, %0\n\t" + : "=m" (*m64p_dest++) : : "mm1", "memory"); + asm volatile ("movq %%mm2, %0\n\t" + : "=m" (*m64p_dest++) : : "mm2", "memory"); + asm volatile ("movq %%mm3, %0\n\t" + : "=m" (*m64p_dest++) : : "mm3", "memory"); + asm volatile ("movq %%mm4, %0\n\t" + : "=m" (*m64p_dest++) : : "mm4", "memory"); + asm volatile ("movq %%mm5, %0\n\t" + : "=m" (*m64p_dest++) : : "mm5", "memory"); + asm volatile ("movq %%mm6, %0\n\t" + : "=m" (*m64p_dest++) : : "mm6", "memory"); + asm volatile ("movq %%mm7, %0\n\t" + : "=m" (*m64p_dest++) : : "mm7", "memory"); + } + for (i = 0; i < n2; i++) + { + asm volatile ( + "movq %1, %%mm0\n\t" \ + "movq %%mm0, %0\n\t" + : "=m" (*m64p_dest++) + : "m" (*m64p_src++) + : "mm0", "memory"); + } + if (length & 0x1) + { + asm volatile ( + "movd %1, %%mm0\n\t" \ + "movd %%mm0, %0\n\t" + : "=m" (dest[length - 1]) + : "m" (src[length - 1]) + : "mm0", "memory"); + } + asm volatile ( + "femms\n\t" \ + "sfence\n\t"); +} + +void +x86_3dnow_add2f (float *dest, const float *src, int length) +{ + int i, n; + pv2sf m64p_dest = (pv2sf) dest; + pv2sf m64p_src = (pv2sf) src; + + n = (length >> 1); + for (i = 0; i < n; i++) + { + asm volatile ( + "movq %1, %%mm0\n\t" \ + "pfadd %2, %%mm0\n\t" \ + "movq %%mm0, %0\n\t" + : "=m" (m64p_dest[i]) + : "m0" (m64p_dest[i]), + "m" (m64p_src[i]) + : "mm0", "memory"); + } + if (n & 0x1) + { + asm volatile ( + "movd %1, %%mm0\n\t" \ + "movd %2, %%mm1\n\t" \ + "pfadd %%mm1, %%mm0\n\t" \ + "movd %%mm0, %0\n\t" + : "=m" (dest[length - 1]) + : "m0" (dest[length - 1]), + "m" (src[length - 1]) + : "mm0", "mm1", "memory"); + } + asm volatile ( + "femms\n\t" \ + "sfence\n\t"); +} + +void +x86_sse_copyf (float *dest, const float *src, int length) +{ + int i, n1, n2, si3; + pv4sf m128p_src = (pv4sf) src; + pv4sf m128p_dest = (pv4sf) dest; + + n1 = (length >> 5); + n2 = ((length & 0x1f) >> 2); + si3 = (length & ~0x3); + for (i = 0; i < n1; i++) + { + asm volatile ("movaps %0, %%xmm0\n\t" + : : "m" (*m128p_src++) : "xmm0", "memory"); + asm volatile ("movaps %0, %%xmm1\n\t" + : : "m" (*m128p_src++) : "xmm1", "memory"); + asm volatile ("movaps %0, %%xmm2\n\t" + : : "m" (*m128p_src++) : "xmm2", "memory"); + asm volatile ("movaps %0, %%xmm3\n\t" + : : "m" (*m128p_src++) : "xmm3", "memory"); + asm volatile ("movaps %0, %%xmm4\n\t" + : : "m" (*m128p_src++) : "xmm4", "memory"); + asm volatile ("movaps %0, %%xmm5\n\t" + : : "m" (*m128p_src++) : "xmm5", "memory"); + asm volatile ("movaps %0, %%xmm6\n\t" + : : "m" (*m128p_src++) : "xmm6", "memory"); + asm volatile ("movaps %0, %%xmm7\n\t" + : : "m" (*m128p_src++) : "xmm7", "memory"); + + asm volatile ("movaps %%xmm0, %0\n\t" + : "=m" (*m128p_dest++) : : "xmm0", "memory"); + asm volatile ("movaps %%xmm1, %0\n\t" + : "=m" (*m128p_dest++) : : "xmm1", "memory"); + asm volatile ("movaps %%xmm2, %0\n\t" + : "=m" (*m128p_dest++) : : "xmm2", "memory"); + asm volatile ("movaps %%xmm3, %0\n\t" + : "=m" (*m128p_dest++) : : "xmm3", "memory"); + asm volatile ("movaps %%xmm4, %0\n\t" + : "=m" (*m128p_dest++) : : "xmm4", "memory"); + asm volatile ("movaps %%xmm5, %0\n\t" + : "=m" (*m128p_dest++) : : "xmm5", "memory"); + asm volatile ("movaps %%xmm6, %0\n\t" + : "=m" (*m128p_dest++) : : "xmm6", "memory"); + asm volatile ("movaps %%xmm7, %0\n\t" + : "=m" (*m128p_dest++) : : "xmm7", "memory"); + } + for (i = 0; i < n2; i++) + { + asm volatile ( + "movaps %1, %%xmm0\n\t" \ + "movaps %%xmm0, %0\n\t" + : "=m" (*m128p_dest++) + : "m" (*m128p_src++) + : "xmm0", "memory"); + } + for (i = si3; i < length; i++) + { + asm volatile ( + "movss %1, %%xmm0\n\t" \ + "movss %%xmm0, %0\n\t" + : "=m" (dest[i]) + : "m" (src[i]) + : "xmm0", "memory"); + } +} + +void +x86_sse_add2f (float *dest, const float *src, int length) +{ + int i, n, si2; + pv4sf m128p_src = (pv4sf) src; + pv4sf m128p_dest = (pv4sf) dest; + + if (__builtin_expect(((long) src & 0xf) || ((long) dest & 0xf), 0)) + { + /*fprintf(stderr, "x86_sse_add2f(): non aligned pointers!\n");*/ + si2 = 0; + goto sse_nonalign; + } + si2 = (length & ~0x3); + n = (length >> 2); + for (i = 0; i < n; i++) + { + asm volatile ( + "movaps %1, %%xmm0\n\t" \ + "addps %2, %%xmm0\n\t" \ + "movaps %%xmm0, %0\n\t" + : "=m" (m128p_dest[i]) + : "m0" (m128p_dest[i]), + "m" (m128p_src[i]) + : "xmm0", "memory"); + } +sse_nonalign: + for (i = si2; i < length; i++) + { + asm volatile ( + "movss %1, %%xmm0\n\t" \ + "addss %2, %%xmm0\n\t" \ + "movss %%xmm0, %0\n\t" + : "=m" (dest[i]) + : "m0" (dest[i]), + "m" (src[i]) + : "xmm0", "memory"); + } +} + +#endif /* ARCH_X86 */ + +#endif /* USE_DYNSIMD */ +