- Move inline asm SIMD functionality to a separate source file. git-svn-id: svn+ssh://jackaudio.org/trunk/jack@1050 0c269be4-1314-0410-8aa9-9f06e86f4224 (tags/0.109.0)
| @@ -17,7 +17,7 @@ dnl changes are made | |||
| dnl --- | |||
| JACK_MAJOR_VERSION=0 | |||
| JACK_MINOR_VERSION=107 | |||
| JACK_MICRO_VERSION=1 | |||
| JACK_MICRO_VERSION=2 | |||
| dnl --- | |||
| dnl HOWTO: updating the jack protocol version | |||
| @@ -1,7 +1,7 @@ | |||
| /* | |||
| OSS driver for Jack | |||
| Copyright (C) 2003-2005 Jussi Laako <jussi@sonarnerd.net> | |||
| Copyright (C) 2003-2007 Jussi Laako <jussi@sonarnerd.net> | |||
| This program is free software; you can redistribute it and/or modify | |||
| it under the terms of the GNU General Public License as published by | |||
| @@ -456,6 +456,7 @@ static int oss_driver_start (oss_driver_t *driver) | |||
| samplesize = sizeof(short); | |||
| break; | |||
| } | |||
| driver->trigger = 0; | |||
| if (strcmp(indev, outdev) != 0) | |||
| { | |||
| if (driver->capture_channels > 0) | |||
| @@ -529,6 +530,8 @@ static int oss_driver_start (oss_driver_t *driver) | |||
| } | |||
| if (infd >= 0 && outfd >= 0) | |||
| { | |||
| ioctl(outfd, SNDCTL_DSP_SETTRIGGER, &driver->trigger); | |||
| driver->trigger = (PCM_ENABLE_INPUT|PCM_ENABLE_OUTPUT); | |||
| if (ioctl(infd, SNDCTL_DSP_SETDUPLEX, 0) < 0) | |||
| { | |||
| if (errno != EINVAL) /* Dont care */ | |||
| @@ -961,6 +964,13 @@ static void *io_thread (void *param) | |||
| __FILE__, __LINE__); | |||
| return NULL; | |||
| } | |||
| if (driver->trigger) | |||
| { | |||
| /* don't care too much if this fails */ | |||
| memset(localbuf, 0x00, localsize); | |||
| write(driver->outfd, localbuf, localsize); | |||
| ioctl(driver->outfd, SNDCTL_DSP_SETTRIGGER, &driver->trigger); | |||
| } | |||
| while (driver->run) | |||
| { | |||
| @@ -992,6 +1002,13 @@ static void *io_thread (void *param) | |||
| jack_error("OSS: malloc() failed: %s@%i", __FILE__, __LINE__); | |||
| return NULL; | |||
| } | |||
| if (driver->trigger) | |||
| { | |||
| /* don't care too much if this fails */ | |||
| memset(localbuf, 0x00, localsize); | |||
| write(driver->outfd, localbuf, driver->outdevbufsize); | |||
| ioctl(driver->outfd, SNDCTL_DSP_SETTRIGGER, &driver->trigger); | |||
| } | |||
| while (driver->run) | |||
| { | |||
| @@ -1119,6 +1136,7 @@ jack_driver_t * driver_initialize (jack_client_t *client, | |||
| driver->indev = NULL; | |||
| driver->outdev = NULL; | |||
| driver->ignorehwbuf = 0; | |||
| driver->trigger = 0; | |||
| pnode = params; | |||
| while (pnode != NULL) | |||
| @@ -1,7 +1,7 @@ | |||
| /* | |||
| OSS driver for Jack | |||
| Copyright (C) 2003-2005 Jussi Laako <jussi@sonarnerd.net> | |||
| Copyright (C) 2003-2007 Jussi Laako <jussi@sonarnerd.net> | |||
| This program is free software; you can redistribute it and/or modify | |||
| it under the terms of the GNU General Public License as published by | |||
| @@ -62,6 +62,7 @@ typedef struct _oss_driver | |||
| int outfd; | |||
| int format; | |||
| int ignorehwbuf; | |||
| int trigger; | |||
| size_t indevbufsize; | |||
| size_t outdevbufsize; | |||
| @@ -1,5 +1,5 @@ | |||
| /* | |||
| Copyright (C) 2005 Jussi Laako | |||
| Copyright (C) 2005-2007 Jussi Laako | |||
| This program is free software; you can redistribute it and/or modify | |||
| it under the terms of the GNU General Public License as published by | |||
| @@ -39,9 +39,16 @@ typedef v4sf * pv4sf; | |||
| extern int cpu_type; | |||
| int have_3dnow (void); | |||
| int have_sse (void); | |||
| void x86_3dnow_copyf (float *, const float *, int); | |||
| void x86_3dnow_add2f (float *, const float *, int); | |||
| void x86_sse_copyf (float *, const float *, int); | |||
| void x86_sse_add2f (float *, const float *, int); | |||
| #endif /* ARCH_X86 */ | |||
| extern void jack_port_set_funcs (void); | |||
| void jack_port_set_funcs (void); | |||
| #endif /* __jack_intsimd_h__ */ | |||
| @@ -23,7 +23,8 @@ SOURCE_FILES = \ | |||
| time.c \ | |||
| timestamps.c \ | |||
| transclient.c \ | |||
| unlock.c | |||
| unlock.c \ | |||
| simd.c | |||
| lib_LTLIBRARIES = libjack.la | |||
| @@ -63,9 +63,6 @@ | |||
| static pthread_mutex_t client_lock; | |||
| static pthread_cond_t client_ready; | |||
| #ifdef ARCH_X86 | |||
| int cpu_type = 0; | |||
| #endif /* ARCH_X86 */ | |||
| #define EVENT_POLL_INDEX 0 | |||
| #define WAIT_POLL_INDEX 1 | |||
| @@ -82,98 +79,7 @@ typedef struct { | |||
| #ifdef ARCH_X86 | |||
| static int | |||
| have_3dnow () | |||
| { | |||
| unsigned int res = 0; | |||
| #ifdef __x86_64__ | |||
| asm volatile ("pushq %%rbx\n\t" : : : "memory"); | |||
| #else | |||
| asm volatile ("pushl %%ebx\n\t" : : : "memory"); | |||
| #endif | |||
| asm volatile ( | |||
| "movl $0x80000000, %%eax\n\t" \ | |||
| "cpuid\n\t" \ | |||
| "cmpl $0x80000001, %%eax\n\t" \ | |||
| "jl tdnow_prexit\n\t" \ | |||
| \ | |||
| "movl $0x80000001, %%eax\n\t" \ | |||
| "cpuid\n\t" \ | |||
| \ | |||
| "xorl %%eax, %%eax\n\t" \ | |||
| \ | |||
| "movl $1, %%ecx\n\t" \ | |||
| "shll $31, %%ecx\n\t" \ | |||
| "testl %%ecx, %%edx\n\t" \ | |||
| "jz tdnow_testexit\n\t" \ | |||
| "movl $1, %%eax\n\t" \ | |||
| \ | |||
| "movl $1, %%ecx\n\t" \ | |||
| "shll $30, %%ecx\n\t" \ | |||
| "testl %%ecx, %%edx\n\t" \ | |||
| "jz tdnow_testexit\n\t" \ | |||
| "movl $2, %%eax\n\t" \ | |||
| "jmp tdnow_testexit\n\t" \ | |||
| \ | |||
| "tdnow_prexit:\n\t" \ | |||
| "xorl %%eax, %%eax\n\t" \ | |||
| "tdnow_testexit:\n\t" | |||
| : "=a" (res) | |||
| : | |||
| : "ecx", "edx", "memory"); | |||
| #ifdef __x86_64__ | |||
| asm volatile ("popq %%rbx\n\t" : : : "memory"); | |||
| #else | |||
| asm volatile ("popl %%ebx\n\t" : : : "memory"); | |||
| #endif | |||
| return res; | |||
| } | |||
| static int | |||
| have_sse () | |||
| { | |||
| unsigned int res = 0; | |||
| #ifdef __x86_64__ | |||
| asm volatile ("pushq %%rbx\n\t" : : : "memory"); | |||
| #else | |||
| asm volatile ("pushl %%ebx\n\t" : : : "memory"); | |||
| #endif | |||
| asm volatile ( | |||
| "movl $1, %%eax\n\t" \ | |||
| "cpuid\n\t" \ | |||
| \ | |||
| "xorl %%eax, %%eax\n\t" \ | |||
| \ | |||
| "movl $1, %%ebx\n\t" \ | |||
| "shll $25, %%ebx\n\t" \ | |||
| "testl %%ebx, %%edx\n\t" \ | |||
| "jz sse_testexit\n\t" \ | |||
| "movl $1, %%eax\n\t" \ | |||
| \ | |||
| "movl $1, %%ebx\n\t" \ | |||
| "shll $26, %%ebx\n\t" \ | |||
| "testl %%ebx, %%edx\n\t" \ | |||
| "jz sse_testexit\n\t" \ | |||
| "movl $2, %%eax\n\t" \ | |||
| \ | |||
| "movl $1, %%ebx\n\t" \ | |||
| "testl %%ebx, %%ecx\n\t" \ | |||
| "jz sse_testexit\n\t" \ | |||
| "movl $3, %%eax\n\t" \ | |||
| \ | |||
| "sse_testexit:\n\t" | |||
| : "=a" (res) | |||
| : | |||
| : "ecx", "edx", "memory"); | |||
| #ifdef __x86_64__ | |||
| asm volatile ("popq %%rbx\n\t" : : : "memory"); | |||
| #else | |||
| asm volatile ("popl %%ebx\n\t" : : : "memory"); | |||
| #endif | |||
| return res; | |||
| } | |||
| int cpu_type = 0; | |||
| static void | |||
| init_cpu () | |||
| @@ -96,216 +96,6 @@ gen_mixf (float *dest, const float *src, int length) | |||
| #ifdef ARCH_X86 | |||
| static void | |||
| x86_3dnow_add2f (float *dest, const float *src, int length) | |||
| { | |||
| int i, n; | |||
| pv2sf m64p_dest = (pv2sf) dest; | |||
| pv2sf m64p_src = (pv2sf) src; | |||
| n = (length >> 1); | |||
| for (i = 0; i < n; i++) | |||
| { | |||
| asm volatile ( | |||
| "movq %1, %%mm0\n\t" \ | |||
| "pfadd %2, %%mm0\n\t" \ | |||
| "movq %%mm0, %0\n\t" | |||
| : "=m" (m64p_dest[i]) | |||
| : "m0" (m64p_dest[i]), | |||
| "m" (m64p_src[i]) | |||
| : "mm0", "memory"); | |||
| } | |||
| if (n & 0x1) | |||
| { | |||
| asm volatile ( | |||
| "movd %1, %%mm0\n\t" \ | |||
| "movd %2, %%mm1\n\t" \ | |||
| "pfadd %%mm1, %%mm0\n\t" \ | |||
| "movd %%mm0, %0\n\t" | |||
| : "=m" (dest[length - 1]) | |||
| : "m0" (dest[length - 1]), | |||
| "m" (src[length - 1]) | |||
| : "mm0", "mm1", "memory"); | |||
| } | |||
| asm volatile ( | |||
| "femms\n\t" \ | |||
| "sfence\n\t"); | |||
| } | |||
| static void | |||
| x86_3dnow_copyf (float *dest, const float *src, int length) | |||
| { | |||
| int i, n1, n2; | |||
| pv2sf m64p_src = (pv2sf) src; | |||
| pv2sf m64p_dest = (pv2sf) dest; | |||
| n1 = (length >> 4); | |||
| n2 = ((length & 0xf) >> 1); | |||
| for (i = 0; i < n1; i++) | |||
| { | |||
| asm volatile ("movq %0, %%mm0\n\t" | |||
| : : "m" (*m64p_src++) : "mm0", "memory"); | |||
| asm volatile ("movq %0, %%mm1\n\t" | |||
| : : "m" (*m64p_src++) : "mm1", "memory"); | |||
| asm volatile ("movq %0, %%mm2\n\t" | |||
| : : "m" (*m64p_src++) : "mm2", "memory"); | |||
| asm volatile ("movq %0, %%mm3\n\t" | |||
| : : "m" (*m64p_src++) : "mm3", "memory"); | |||
| asm volatile ("movq %0, %%mm4\n\t" | |||
| : : "m" (*m64p_src++) : "mm4", "memory"); | |||
| asm volatile ("movq %0, %%mm5\n\t" | |||
| : : "m" (*m64p_src++) : "mm5", "memory"); | |||
| asm volatile ("movq %0, %%mm6\n\t" | |||
| : : "m" (*m64p_src++) : "mm6", "memory"); | |||
| asm volatile ("movq %0, %%mm7\n\t" | |||
| : : "m" (*m64p_src++) : "xmm7", "memory"); | |||
| asm volatile ("movq %%mm0, %0\n\t" | |||
| : "=m" (*m64p_dest++) : : "mm0", "memory"); | |||
| asm volatile ("movq %%mm1, %0\n\t" | |||
| : "=m" (*m64p_dest++) : : "mm1", "memory"); | |||
| asm volatile ("movq %%mm2, %0\n\t" | |||
| : "=m" (*m64p_dest++) : : "mm2", "memory"); | |||
| asm volatile ("movq %%mm3, %0\n\t" | |||
| : "=m" (*m64p_dest++) : : "mm3", "memory"); | |||
| asm volatile ("movq %%mm4, %0\n\t" | |||
| : "=m" (*m64p_dest++) : : "mm4", "memory"); | |||
| asm volatile ("movq %%mm5, %0\n\t" | |||
| : "=m" (*m64p_dest++) : : "mm5", "memory"); | |||
| asm volatile ("movq %%mm6, %0\n\t" | |||
| : "=m" (*m64p_dest++) : : "mm6", "memory"); | |||
| asm volatile ("movq %%mm7, %0\n\t" | |||
| : "=m" (*m64p_dest++) : : "mm7", "memory"); | |||
| } | |||
| for (i = 0; i < n2; i++) | |||
| { | |||
| asm volatile ( | |||
| "movq %1, %%mm0\n\t" \ | |||
| "movq %%mm0, %0\n\t" | |||
| : "=m" (*m64p_dest++) | |||
| : "m" (*m64p_src++) | |||
| : "mm0", "memory"); | |||
| } | |||
| if (length & 0x1) | |||
| { | |||
| asm volatile ( | |||
| "movd %1, %%mm0\n\t" \ | |||
| "movd %%mm0, %0\n\t" | |||
| : "=m" (dest[length - 1]) | |||
| : "m" (src[length - 1]) | |||
| : "mm0", "memory"); | |||
| } | |||
| asm volatile ( | |||
| "femms\n\t" \ | |||
| "sfence\n\t"); | |||
| } | |||
| static void | |||
| x86_sse_copyf (float *dest, const float *src, int length) | |||
| { | |||
| int i, n1, n2, si3; | |||
| pv4sf m128p_src = (pv4sf) src; | |||
| pv4sf m128p_dest = (pv4sf) dest; | |||
| n1 = (length >> 5); | |||
| n2 = ((length & 0x1f) >> 2); | |||
| si3 = (length & ~0x3); | |||
| for (i = 0; i < n1; i++) | |||
| { | |||
| asm volatile ("movaps %0, %%xmm0\n\t" | |||
| : : "m" (*m128p_src++) : "xmm0", "memory"); | |||
| asm volatile ("movaps %0, %%xmm1\n\t" | |||
| : : "m" (*m128p_src++) : "xmm1", "memory"); | |||
| asm volatile ("movaps %0, %%xmm2\n\t" | |||
| : : "m" (*m128p_src++) : "xmm2", "memory"); | |||
| asm volatile ("movaps %0, %%xmm3\n\t" | |||
| : : "m" (*m128p_src++) : "xmm3", "memory"); | |||
| asm volatile ("movaps %0, %%xmm4\n\t" | |||
| : : "m" (*m128p_src++) : "xmm4", "memory"); | |||
| asm volatile ("movaps %0, %%xmm5\n\t" | |||
| : : "m" (*m128p_src++) : "xmm5", "memory"); | |||
| asm volatile ("movaps %0, %%xmm6\n\t" | |||
| : : "m" (*m128p_src++) : "xmm6", "memory"); | |||
| asm volatile ("movaps %0, %%xmm7\n\t" | |||
| : : "m" (*m128p_src++) : "xmm7", "memory"); | |||
| asm volatile ("movaps %%xmm0, %0\n\t" | |||
| : "=m" (*m128p_dest++) : : "xmm0", "memory"); | |||
| asm volatile ("movaps %%xmm1, %0\n\t" | |||
| : "=m" (*m128p_dest++) : : "xmm1", "memory"); | |||
| asm volatile ("movaps %%xmm2, %0\n\t" | |||
| : "=m" (*m128p_dest++) : : "xmm2", "memory"); | |||
| asm volatile ("movaps %%xmm3, %0\n\t" | |||
| : "=m" (*m128p_dest++) : : "xmm3", "memory"); | |||
| asm volatile ("movaps %%xmm4, %0\n\t" | |||
| : "=m" (*m128p_dest++) : : "xmm4", "memory"); | |||
| asm volatile ("movaps %%xmm5, %0\n\t" | |||
| : "=m" (*m128p_dest++) : : "xmm5", "memory"); | |||
| asm volatile ("movaps %%xmm6, %0\n\t" | |||
| : "=m" (*m128p_dest++) : : "xmm6", "memory"); | |||
| asm volatile ("movaps %%xmm7, %0\n\t" | |||
| : "=m" (*m128p_dest++) : : "xmm7", "memory"); | |||
| } | |||
| for (i = 0; i < n2; i++) | |||
| { | |||
| asm volatile ( | |||
| "movaps %1, %%xmm0\n\t" \ | |||
| "movaps %%xmm0, %0\n\t" | |||
| : "=m" (*m128p_dest++) | |||
| : "m" (*m128p_src++) | |||
| : "xmm0", "memory"); | |||
| } | |||
| for (i = si3; i < length; i++) | |||
| { | |||
| asm volatile ( | |||
| "movss %1, %%xmm0\n\t" \ | |||
| "movss %%xmm0, %0\n\t" | |||
| : "=m" (dest[i]) | |||
| : "m" (src[i]) | |||
| : "xmm0", "memory"); | |||
| } | |||
| } | |||
| static void | |||
| x86_sse_add2f (float *dest, const float *src, int length) | |||
| { | |||
| int i, n, si2; | |||
| pv4sf m128p_src = (pv4sf) src; | |||
| pv4sf m128p_dest = (pv4sf) dest; | |||
| if (__builtin_expect(((long) src & 0xf) || ((long) dest & 0xf), 0)) | |||
| { | |||
| /*fprintf(stderr, "x86_sse_add2f(): non aligned pointers!\n");*/ | |||
| si2 = 0; | |||
| goto sse_nonalign; | |||
| } | |||
| si2 = (length & ~0x3); | |||
| n = (length >> 2); | |||
| for (i = 0; i < n; i++) | |||
| { | |||
| asm volatile ( | |||
| "movaps %1, %%xmm0\n\t" \ | |||
| "addps %2, %%xmm0\n\t" \ | |||
| "movaps %%xmm0, %0\n\t" | |||
| : "=m" (m128p_dest[i]) | |||
| : "m0" (m128p_dest[i]), | |||
| "m" (m128p_src[i]) | |||
| : "xmm0", "memory"); | |||
| } | |||
| sse_nonalign: | |||
| for (i = si2; i < length; i++) | |||
| { | |||
| asm volatile ( | |||
| "movss %1, %%xmm0\n\t" \ | |||
| "addss %2, %%xmm0\n\t" \ | |||
| "movss %%xmm0, %0\n\t" | |||
| : "=m" (dest[i]) | |||
| : "m0" (dest[i]), | |||
| "m" (src[i]) | |||
| : "xmm0", "memory"); | |||
| } | |||
| } | |||
| void jack_port_set_funcs () | |||
| { | |||
| if (ARCH_X86_HAVE_SSE2(cpu_type)) { | |||
| @@ -0,0 +1,335 @@ | |||
| /* -*- mode: c; c-file-style: "bsd"; -*- */ | |||
| /* | |||
| Copyright (C) 2005-2007 Jussi Laako | |||
| This program is free software; you can redistribute it and/or modify | |||
| it under the terms of the GNU Lesser General Public License as published by | |||
| the Free Software Foundation; either version 2.1 of the License, or | |||
| (at your option) any later version. | |||
| This program is distributed in the hope that it will be useful, | |||
| but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |||
| GNU Lesser General Public License for more details. | |||
| You should have received a copy of the GNU Lesser General Public License | |||
| along with this program; if not, write to the Free Software | |||
| Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | |||
| */ | |||
| #include <config.h> | |||
| #include <jack/intsimd.h> | |||
| #ifdef USE_DYNSIMD | |||
| #ifdef ARCH_X86 | |||
/*
 * Execute CPUID for `leaf`; on return regs[] holds EAX, EBX, ECX, EDX
 * (in that order).
 *
 * CPUID overwrites EBX/RBX, which the compiler may be relying on (it is
 * the GOT pointer in 32-bit PIC code).  The register is swapped into a
 * scratch operand and swapped back inside this single asm statement, so
 * the stack pointer is never moved behind the compiler's back and the
 * x86-64 red zone is left alone.
 */
static void
simd_cpuid (unsigned int leaf, unsigned int regs[4])
{
#ifdef __x86_64__
	__asm__ __volatile__ (
		"xchgq %%rbx, %q1\n\t"
		"cpuid\n\t"
		"xchgq %%rbx, %q1\n\t"
		: "=a" (regs[0]), "=&r" (regs[1]), "=c" (regs[2]), "=d" (regs[3])
		: "0" (leaf));
#else
	__asm__ __volatile__ (
		"xchgl %%ebx, %1\n\t"
		"cpuid\n\t"
		"xchgl %%ebx, %1\n\t"
		: "=a" (regs[0]), "=&r" (regs[1]), "=c" (regs[2]), "=d" (regs[3])
		: "0" (leaf));
#endif
}

/*
 * Report the 3DNow! level advertised by extended CPUID leaf 0x80000001.
 *
 * Returns 0 when that leaf is not implemented or EDX bit 31 is clear,
 * 1 when EDX bit 31 is set, and 2 when EDX bit 30 is set as well
 * (3DNow! and extended 3DNow! in AMD's CPUID layout).
 */
int
have_3dnow (void)
{
	unsigned int regs[4];

	simd_cpuid (0x80000000u, regs);
	/* The highest extended leaf must be compared unsigned: a CPU without
	   extended leaves may answer with a small value, which a signed
	   compare against 0x80000001 would accept. */
	if (regs[0] < 0x80000001u)
		return 0;

	simd_cpuid (0x80000001u, regs);
	if (!(regs[3] & (1u << 31)))
		return 0;
	if (!(regs[3] & (1u << 30)))
		return 1;
	return 2;
}

/*
 * Report the SSE level advertised by CPUID leaf 1.
 *
 * Returns 0 when EDX bit 25 is clear, 1 when only bit 25 is set,
 * 2 when EDX bit 26 is set as well, and 3 when ECX bit 0 is also set
 * (SSE, SSE2 and SSE3 in the standard feature flags).
 */
int
have_sse (void)
{
	unsigned int regs[4];

	simd_cpuid (1u, regs);
	if (!(regs[3] & (1u << 25)))
		return 0;
	if (!(regs[3] & (1u << 26)))
		return 1;
	if (!(regs[2] & 1u))
		return 2;
	return 3;
}
/*
 * Copy `length` floats from src to dest through the MMX registers.
 *
 * Blocks of 16 floats are moved with eight 64-bit loads followed by eight
 * 64-bit stores, the remaining pairs one quadword at a time, and an odd
 * final float with a 32-bit move.  The function always finishes with
 * femms and sfence.
 *
 * Each block is a single asm statement so that the loaded registers
 * cannot be disturbed between the loads and the stores, and every
 * register that is written is listed as clobbered.
 */
void
x86_3dnow_copyf (float *dest, const float *src, int length)
{
	const float *in = src;
	float *out = dest;
	int blocks = (length >> 4);
	int pairs = ((length & 0xf) >> 1);
	int i;

	for (i = 0; i < blocks; i++, in += 16, out += 16)
	{
		__asm__ __volatile__ (
			"movq   (%0), %%mm0\n\t"
			"movq  8(%0), %%mm1\n\t"
			"movq 16(%0), %%mm2\n\t"
			"movq 24(%0), %%mm3\n\t"
			"movq 32(%0), %%mm4\n\t"
			"movq 40(%0), %%mm5\n\t"
			"movq 48(%0), %%mm6\n\t"
			"movq 56(%0), %%mm7\n\t"
			"movq %%mm0,   (%1)\n\t"
			"movq %%mm1,  8(%1)\n\t"
			"movq %%mm2, 16(%1)\n\t"
			"movq %%mm3, 24(%1)\n\t"
			"movq %%mm4, 32(%1)\n\t"
			"movq %%mm5, 40(%1)\n\t"
			"movq %%mm6, 48(%1)\n\t"
			"movq %%mm7, 56(%1)\n\t"
			:
			: "r" (in), "r" (out)
			: "mm0", "mm1", "mm2", "mm3",
			  "mm4", "mm5", "mm6", "mm7", "memory");
	}

	for (i = 0; i < pairs; i++, in += 2, out += 2)
	{
		__asm__ __volatile__ (
			"movq (%0), %%mm0\n\t"
			"movq %%mm0, (%1)\n\t"
			:
			: "r" (in), "r" (out)
			: "mm0", "memory");
	}

	if (length & 0x1)
	{
		__asm__ __volatile__ (
			"movd %1, %%mm0\n\t"
			"movd %%mm0, %0\n\t"
			: "=m" (dest[length - 1])
			: "m" (src[length - 1])
			: "mm0", "memory");
	}

	/* Leave MMX state so later x87 code sees a clean FPU. */
	__asm__ __volatile__ (
		"femms\n\t"
		"sfence\n\t"
		: : : "memory");
}
/*
 * Add src to dest element-wise (dest[i] += src[i]) for `length` floats,
 * using 3DNow! pfadd on pairs of floats.
 *
 * Whole pairs are processed one quadword at a time; when `length` is odd
 * the final float is added on its own.  The tail is selected by the
 * parity of `length` itself, not of the pair count, so that the last
 * element is added exactly once for every length.  The function always
 * finishes with femms and sfence.
 */
void
x86_3dnow_add2f (float *dest, const float *src, int length)
{
	int pairs = (length >> 1);
	int i;

	for (i = 0; i < pairs; i++)
	{
		__asm__ __volatile__ (
			"movq (%0), %%mm0\n\t"
			"pfadd (%1), %%mm0\n\t"
			"movq %%mm0, (%0)\n\t"
			:
			: "r" (dest + 2 * i), "r" (src + 2 * i)
			: "mm0", "memory");
	}

	if (length & 0x1)
	{
		__asm__ __volatile__ (
			"movd (%0), %%mm0\n\t"
			"movd (%1), %%mm1\n\t"
			"pfadd %%mm1, %%mm0\n\t"
			"movd %%mm0, (%0)\n\t"
			:
			: "r" (dest + length - 1), "r" (src + length - 1)
			: "mm0", "mm1", "memory");
	}

	/* Leave MMX state so later x87 code sees a clean FPU. */
	__asm__ __volatile__ (
		"femms\n\t"
		"sfence\n\t"
		: : : "memory");
}
/*
 * Copy `length` floats from src to dest using SSE moves.
 *
 * When both pointers are 16-byte aligned, blocks of 32 floats are moved
 * with eight movaps loads followed by eight movaps stores, then the
 * remaining groups of four floats one register at a time; whatever is
 * left (at most three floats) is copied with movss.
 *
 * movaps faults on an address that is not 16-byte aligned, so when either
 * pointer is misaligned the whole buffer is copied with the scalar movss
 * loop instead, mirroring the fallback in x86_sse_add2f.
 */
void
x86_sse_copyf (float *dest, const float *src, int length)
{
	int scalar_from = 0;
	int i;

	if (__builtin_expect(!(((long) src & 0xf) || ((long) dest & 0xf)), 1))
	{
		const float *in = src;
		float *out = dest;
		int blocks = (length >> 5);
		int quads = ((length & 0x1f) >> 2);

		/* One asm per block: the loaded registers must not be touched
		   between the loads and the stores. */
		for (i = 0; i < blocks; i++, in += 32, out += 32)
		{
			__asm__ __volatile__ (
				"movaps    (%0), %%xmm0\n\t"
				"movaps  16(%0), %%xmm1\n\t"
				"movaps  32(%0), %%xmm2\n\t"
				"movaps  48(%0), %%xmm3\n\t"
				"movaps  64(%0), %%xmm4\n\t"
				"movaps  80(%0), %%xmm5\n\t"
				"movaps  96(%0), %%xmm6\n\t"
				"movaps 112(%0), %%xmm7\n\t"
				"movaps %%xmm0,    (%1)\n\t"
				"movaps %%xmm1,  16(%1)\n\t"
				"movaps %%xmm2,  32(%1)\n\t"
				"movaps %%xmm3,  48(%1)\n\t"
				"movaps %%xmm4,  64(%1)\n\t"
				"movaps %%xmm5,  80(%1)\n\t"
				"movaps %%xmm6,  96(%1)\n\t"
				"movaps %%xmm7, 112(%1)\n\t"
				:
				: "r" (in), "r" (out)
				: "xmm0", "xmm1", "xmm2", "xmm3",
				  "xmm4", "xmm5", "xmm6", "xmm7", "memory");
		}

		for (i = 0; i < quads; i++, in += 4, out += 4)
		{
			__asm__ __volatile__ (
				"movaps (%0), %%xmm0\n\t"
				"movaps %%xmm0, (%1)\n\t"
				:
				: "r" (in), "r" (out)
				: "xmm0", "memory");
		}

		/* Everything below the last multiple of four is done. */
		scalar_from = (length & ~0x3);
	}

	for (i = scalar_from; i < length; i++)
	{
		__asm__ __volatile__ (
			"movss %1, %%xmm0\n\t"
			"movss %%xmm0, %0\n\t"
			: "=m" (dest[i])
			: "m" (src[i])
			: "xmm0", "memory");
	}
}
/*
 * Add src to dest element-wise (dest[i] += src[i]) for `length` floats.
 *
 * With both pointers 16-byte aligned, groups of four floats are summed
 * with movaps/addps and only the leftover elements go through the scalar
 * movss/addss loop.  If either pointer is misaligned, the scalar loop
 * handles the whole buffer starting at index 0.
 */
void
x86_sse_add2f (float *dest, const float *src, int length)
{
	int misaligned = (((long) src & 0xf) || ((long) dest & 0xf));
	int scalar_from = 0;
	int idx;

	if (!__builtin_expect(misaligned, 0))
	{
		int quads = (length >> 2);

		for (idx = 0; idx < quads; idx++)
		{
			__asm__ __volatile__ (
				"movaps (%0), %%xmm0\n\t"
				"addps (%1), %%xmm0\n\t"
				"movaps %%xmm0, (%0)\n\t"
				:
				: "r" (dest + 4 * idx), "r" (src + 4 * idx)
				: "xmm0", "memory");
		}

		/* The vector loop covered everything below this index. */
		scalar_from = (length & ~0x3);
	}

	for (idx = scalar_from; idx < length; idx++)
	{
		__asm__ __volatile__ (
			"movss %0, %%xmm0\n\t"
			"addss %1, %%xmm0\n\t"
			"movss %%xmm0, %0\n\t"
			: "+m" (dest[idx])
			: "m" (src[idx])
			: "xmm0", "memory");
	}
}
| #endif /* ARCH_X86 */ | |||
| #endif /* USE_DYNSIMD */ | |||