- Move inline asm SIMD functionality to a separate source file. git-svn-id: svn+ssh://jackaudio.org/trunk/jack@1050 0c269be4-1314-0410-8aa9-9f06e86f4224 (tags/0.109.0)
| @@ -17,7 +17,7 @@ dnl changes are made | |||||
| dnl --- | dnl --- | ||||
| JACK_MAJOR_VERSION=0 | JACK_MAJOR_VERSION=0 | ||||
| JACK_MINOR_VERSION=107 | JACK_MINOR_VERSION=107 | ||||
| JACK_MICRO_VERSION=1 | |||||
| JACK_MICRO_VERSION=2 | |||||
| dnl --- | dnl --- | ||||
| dnl HOWTO: updating the jack protocol version | dnl HOWTO: updating the jack protocol version | ||||
| @@ -1,7 +1,7 @@ | |||||
| /* | /* | ||||
| OSS driver for Jack | OSS driver for Jack | ||||
| Copyright (C) 2003-2005 Jussi Laako <jussi@sonarnerd.net> | |||||
| Copyright (C) 2003-2007 Jussi Laako <jussi@sonarnerd.net> | |||||
| This program is free software; you can redistribute it and/or modify | This program is free software; you can redistribute it and/or modify | ||||
| it under the terms of the GNU General Public License as published by | it under the terms of the GNU General Public License as published by | ||||
| @@ -456,6 +456,7 @@ static int oss_driver_start (oss_driver_t *driver) | |||||
| samplesize = sizeof(short); | samplesize = sizeof(short); | ||||
| break; | break; | ||||
| } | } | ||||
| driver->trigger = 0; | |||||
| if (strcmp(indev, outdev) != 0) | if (strcmp(indev, outdev) != 0) | ||||
| { | { | ||||
| if (driver->capture_channels > 0) | if (driver->capture_channels > 0) | ||||
| @@ -529,6 +530,8 @@ static int oss_driver_start (oss_driver_t *driver) | |||||
| } | } | ||||
| if (infd >= 0 && outfd >= 0) | if (infd >= 0 && outfd >= 0) | ||||
| { | { | ||||
| ioctl(outfd, SNDCTL_DSP_SETTRIGGER, &driver->trigger); | |||||
| driver->trigger = (PCM_ENABLE_INPUT|PCM_ENABLE_OUTPUT); | |||||
| if (ioctl(infd, SNDCTL_DSP_SETDUPLEX, 0) < 0) | if (ioctl(infd, SNDCTL_DSP_SETDUPLEX, 0) < 0) | ||||
| { | { | ||||
| if (errno != EINVAL) /* Dont care */ | if (errno != EINVAL) /* Dont care */ | ||||
| @@ -961,6 +964,13 @@ static void *io_thread (void *param) | |||||
| __FILE__, __LINE__); | __FILE__, __LINE__); | ||||
| return NULL; | return NULL; | ||||
| } | } | ||||
| if (driver->trigger) | |||||
| { | |||||
| /* don't care too much if this fails */ | |||||
| memset(localbuf, 0x00, localsize); | |||||
| write(driver->outfd, localbuf, localsize); | |||||
| ioctl(driver->outfd, SNDCTL_DSP_SETTRIGGER, &driver->trigger); | |||||
| } | |||||
| while (driver->run) | while (driver->run) | ||||
| { | { | ||||
| @@ -992,6 +1002,13 @@ static void *io_thread (void *param) | |||||
| jack_error("OSS: malloc() failed: %s@%i", __FILE__, __LINE__); | jack_error("OSS: malloc() failed: %s@%i", __FILE__, __LINE__); | ||||
| return NULL; | return NULL; | ||||
| } | } | ||||
| if (driver->trigger) | |||||
| { | |||||
| /* don't care too much if this fails */ | |||||
| memset(localbuf, 0x00, localsize); | |||||
| write(driver->outfd, localbuf, driver->outdevbufsize); | |||||
| ioctl(driver->outfd, SNDCTL_DSP_SETTRIGGER, &driver->trigger); | |||||
| } | |||||
| while (driver->run) | while (driver->run) | ||||
| { | { | ||||
| @@ -1119,6 +1136,7 @@ jack_driver_t * driver_initialize (jack_client_t *client, | |||||
| driver->indev = NULL; | driver->indev = NULL; | ||||
| driver->outdev = NULL; | driver->outdev = NULL; | ||||
| driver->ignorehwbuf = 0; | driver->ignorehwbuf = 0; | ||||
| driver->trigger = 0; | |||||
| pnode = params; | pnode = params; | ||||
| while (pnode != NULL) | while (pnode != NULL) | ||||
| @@ -1,7 +1,7 @@ | |||||
| /* | /* | ||||
| OSS driver for Jack | OSS driver for Jack | ||||
| Copyright (C) 2003-2005 Jussi Laako <jussi@sonarnerd.net> | |||||
| Copyright (C) 2003-2007 Jussi Laako <jussi@sonarnerd.net> | |||||
| This program is free software; you can redistribute it and/or modify | This program is free software; you can redistribute it and/or modify | ||||
| it under the terms of the GNU General Public License as published by | it under the terms of the GNU General Public License as published by | ||||
| @@ -62,6 +62,7 @@ typedef struct _oss_driver | |||||
| int outfd; | int outfd; | ||||
| int format; | int format; | ||||
| int ignorehwbuf; | int ignorehwbuf; | ||||
| int trigger; | |||||
| size_t indevbufsize; | size_t indevbufsize; | ||||
| size_t outdevbufsize; | size_t outdevbufsize; | ||||
| @@ -1,5 +1,5 @@ | |||||
| /* | /* | ||||
| Copyright (C) 2005 Jussi Laako | |||||
| Copyright (C) 2005-2007 Jussi Laako | |||||
| This program is free software; you can redistribute it and/or modify | This program is free software; you can redistribute it and/or modify | ||||
| it under the terms of the GNU General Public License as published by | it under the terms of the GNU General Public License as published by | ||||
| @@ -39,9 +39,16 @@ typedef v4sf * pv4sf; | |||||
| extern int cpu_type; | extern int cpu_type; | ||||
| int have_3dnow (void); | |||||
| int have_sse (void); | |||||
| void x86_3dnow_copyf (float *, const float *, int); | |||||
| void x86_3dnow_add2f (float *, const float *, int); | |||||
| void x86_sse_copyf (float *, const float *, int); | |||||
| void x86_sse_add2f (float *, const float *, int); | |||||
| #endif /* ARCH_X86 */ | #endif /* ARCH_X86 */ | ||||
| extern void jack_port_set_funcs (void); | |||||
| void jack_port_set_funcs (void); | |||||
| #endif /* __jack_intsimd_h__ */ | #endif /* __jack_intsimd_h__ */ | ||||
| @@ -23,7 +23,8 @@ SOURCE_FILES = \ | |||||
| time.c \ | time.c \ | ||||
| timestamps.c \ | timestamps.c \ | ||||
| transclient.c \ | transclient.c \ | ||||
| unlock.c | |||||
| unlock.c \ | |||||
| simd.c | |||||
| lib_LTLIBRARIES = libjack.la | lib_LTLIBRARIES = libjack.la | ||||
| @@ -63,9 +63,6 @@ | |||||
| static pthread_mutex_t client_lock; | static pthread_mutex_t client_lock; | ||||
| static pthread_cond_t client_ready; | static pthread_cond_t client_ready; | ||||
| #ifdef ARCH_X86 | |||||
| int cpu_type = 0; | |||||
| #endif /* ARCH_X86 */ | |||||
| #define EVENT_POLL_INDEX 0 | #define EVENT_POLL_INDEX 0 | ||||
| #define WAIT_POLL_INDEX 1 | #define WAIT_POLL_INDEX 1 | ||||
| @@ -82,98 +79,7 @@ typedef struct { | |||||
| #ifdef ARCH_X86 | #ifdef ARCH_X86 | ||||
| static int | |||||
| have_3dnow () | |||||
| { | |||||
| unsigned int res = 0; | |||||
| #ifdef __x86_64__ | |||||
| asm volatile ("pushq %%rbx\n\t" : : : "memory"); | |||||
| #else | |||||
| asm volatile ("pushl %%ebx\n\t" : : : "memory"); | |||||
| #endif | |||||
| asm volatile ( | |||||
| "movl $0x80000000, %%eax\n\t" \ | |||||
| "cpuid\n\t" \ | |||||
| "cmpl $0x80000001, %%eax\n\t" \ | |||||
| "jl tdnow_prexit\n\t" \ | |||||
| \ | |||||
| "movl $0x80000001, %%eax\n\t" \ | |||||
| "cpuid\n\t" \ | |||||
| \ | |||||
| "xorl %%eax, %%eax\n\t" \ | |||||
| \ | |||||
| "movl $1, %%ecx\n\t" \ | |||||
| "shll $31, %%ecx\n\t" \ | |||||
| "testl %%ecx, %%edx\n\t" \ | |||||
| "jz tdnow_testexit\n\t" \ | |||||
| "movl $1, %%eax\n\t" \ | |||||
| \ | |||||
| "movl $1, %%ecx\n\t" \ | |||||
| "shll $30, %%ecx\n\t" \ | |||||
| "testl %%ecx, %%edx\n\t" \ | |||||
| "jz tdnow_testexit\n\t" \ | |||||
| "movl $2, %%eax\n\t" \ | |||||
| "jmp tdnow_testexit\n\t" \ | |||||
| \ | |||||
| "tdnow_prexit:\n\t" \ | |||||
| "xorl %%eax, %%eax\n\t" \ | |||||
| "tdnow_testexit:\n\t" | |||||
| : "=a" (res) | |||||
| : | |||||
| : "ecx", "edx", "memory"); | |||||
| #ifdef __x86_64__ | |||||
| asm volatile ("popq %%rbx\n\t" : : : "memory"); | |||||
| #else | |||||
| asm volatile ("popl %%ebx\n\t" : : : "memory"); | |||||
| #endif | |||||
| return res; | |||||
| } | |||||
| static int | |||||
| have_sse () | |||||
| { | |||||
| unsigned int res = 0; | |||||
| #ifdef __x86_64__ | |||||
| asm volatile ("pushq %%rbx\n\t" : : : "memory"); | |||||
| #else | |||||
| asm volatile ("pushl %%ebx\n\t" : : : "memory"); | |||||
| #endif | |||||
| asm volatile ( | |||||
| "movl $1, %%eax\n\t" \ | |||||
| "cpuid\n\t" \ | |||||
| \ | |||||
| "xorl %%eax, %%eax\n\t" \ | |||||
| \ | |||||
| "movl $1, %%ebx\n\t" \ | |||||
| "shll $25, %%ebx\n\t" \ | |||||
| "testl %%ebx, %%edx\n\t" \ | |||||
| "jz sse_testexit\n\t" \ | |||||
| "movl $1, %%eax\n\t" \ | |||||
| \ | |||||
| "movl $1, %%ebx\n\t" \ | |||||
| "shll $26, %%ebx\n\t" \ | |||||
| "testl %%ebx, %%edx\n\t" \ | |||||
| "jz sse_testexit\n\t" \ | |||||
| "movl $2, %%eax\n\t" \ | |||||
| \ | |||||
| "movl $1, %%ebx\n\t" \ | |||||
| "testl %%ebx, %%ecx\n\t" \ | |||||
| "jz sse_testexit\n\t" \ | |||||
| "movl $3, %%eax\n\t" \ | |||||
| \ | |||||
| "sse_testexit:\n\t" | |||||
| : "=a" (res) | |||||
| : | |||||
| : "ecx", "edx", "memory"); | |||||
| #ifdef __x86_64__ | |||||
| asm volatile ("popq %%rbx\n\t" : : : "memory"); | |||||
| #else | |||||
| asm volatile ("popl %%ebx\n\t" : : : "memory"); | |||||
| #endif | |||||
| return res; | |||||
| } | |||||
| int cpu_type = 0; | |||||
| static void | static void | ||||
| init_cpu () | init_cpu () | ||||
| @@ -96,216 +96,6 @@ gen_mixf (float *dest, const float *src, int length) | |||||
| #ifdef ARCH_X86 | #ifdef ARCH_X86 | ||||
| static void | |||||
| x86_3dnow_add2f (float *dest, const float *src, int length) | |||||
| { | |||||
| int i, n; | |||||
| pv2sf m64p_dest = (pv2sf) dest; | |||||
| pv2sf m64p_src = (pv2sf) src; | |||||
| n = (length >> 1); | |||||
| for (i = 0; i < n; i++) | |||||
| { | |||||
| asm volatile ( | |||||
| "movq %1, %%mm0\n\t" \ | |||||
| "pfadd %2, %%mm0\n\t" \ | |||||
| "movq %%mm0, %0\n\t" | |||||
| : "=m" (m64p_dest[i]) | |||||
| : "m0" (m64p_dest[i]), | |||||
| "m" (m64p_src[i]) | |||||
| : "mm0", "memory"); | |||||
| } | |||||
| if (n & 0x1) | |||||
| { | |||||
| asm volatile ( | |||||
| "movd %1, %%mm0\n\t" \ | |||||
| "movd %2, %%mm1\n\t" \ | |||||
| "pfadd %%mm1, %%mm0\n\t" \ | |||||
| "movd %%mm0, %0\n\t" | |||||
| : "=m" (dest[length - 1]) | |||||
| : "m0" (dest[length - 1]), | |||||
| "m" (src[length - 1]) | |||||
| : "mm0", "mm1", "memory"); | |||||
| } | |||||
| asm volatile ( | |||||
| "femms\n\t" \ | |||||
| "sfence\n\t"); | |||||
| } | |||||
| static void | |||||
| x86_3dnow_copyf (float *dest, const float *src, int length) | |||||
| { | |||||
| int i, n1, n2; | |||||
| pv2sf m64p_src = (pv2sf) src; | |||||
| pv2sf m64p_dest = (pv2sf) dest; | |||||
| n1 = (length >> 4); | |||||
| n2 = ((length & 0xf) >> 1); | |||||
| for (i = 0; i < n1; i++) | |||||
| { | |||||
| asm volatile ("movq %0, %%mm0\n\t" | |||||
| : : "m" (*m64p_src++) : "mm0", "memory"); | |||||
| asm volatile ("movq %0, %%mm1\n\t" | |||||
| : : "m" (*m64p_src++) : "mm1", "memory"); | |||||
| asm volatile ("movq %0, %%mm2\n\t" | |||||
| : : "m" (*m64p_src++) : "mm2", "memory"); | |||||
| asm volatile ("movq %0, %%mm3\n\t" | |||||
| : : "m" (*m64p_src++) : "mm3", "memory"); | |||||
| asm volatile ("movq %0, %%mm4\n\t" | |||||
| : : "m" (*m64p_src++) : "mm4", "memory"); | |||||
| asm volatile ("movq %0, %%mm5\n\t" | |||||
| : : "m" (*m64p_src++) : "mm5", "memory"); | |||||
| asm volatile ("movq %0, %%mm6\n\t" | |||||
| : : "m" (*m64p_src++) : "mm6", "memory"); | |||||
| asm volatile ("movq %0, %%mm7\n\t" | |||||
| : : "m" (*m64p_src++) : "xmm7", "memory"); | |||||
| asm volatile ("movq %%mm0, %0\n\t" | |||||
| : "=m" (*m64p_dest++) : : "mm0", "memory"); | |||||
| asm volatile ("movq %%mm1, %0\n\t" | |||||
| : "=m" (*m64p_dest++) : : "mm1", "memory"); | |||||
| asm volatile ("movq %%mm2, %0\n\t" | |||||
| : "=m" (*m64p_dest++) : : "mm2", "memory"); | |||||
| asm volatile ("movq %%mm3, %0\n\t" | |||||
| : "=m" (*m64p_dest++) : : "mm3", "memory"); | |||||
| asm volatile ("movq %%mm4, %0\n\t" | |||||
| : "=m" (*m64p_dest++) : : "mm4", "memory"); | |||||
| asm volatile ("movq %%mm5, %0\n\t" | |||||
| : "=m" (*m64p_dest++) : : "mm5", "memory"); | |||||
| asm volatile ("movq %%mm6, %0\n\t" | |||||
| : "=m" (*m64p_dest++) : : "mm6", "memory"); | |||||
| asm volatile ("movq %%mm7, %0\n\t" | |||||
| : "=m" (*m64p_dest++) : : "mm7", "memory"); | |||||
| } | |||||
| for (i = 0; i < n2; i++) | |||||
| { | |||||
| asm volatile ( | |||||
| "movq %1, %%mm0\n\t" \ | |||||
| "movq %%mm0, %0\n\t" | |||||
| : "=m" (*m64p_dest++) | |||||
| : "m" (*m64p_src++) | |||||
| : "mm0", "memory"); | |||||
| } | |||||
| if (length & 0x1) | |||||
| { | |||||
| asm volatile ( | |||||
| "movd %1, %%mm0\n\t" \ | |||||
| "movd %%mm0, %0\n\t" | |||||
| : "=m" (dest[length - 1]) | |||||
| : "m" (src[length - 1]) | |||||
| : "mm0", "memory"); | |||||
| } | |||||
| asm volatile ( | |||||
| "femms\n\t" \ | |||||
| "sfence\n\t"); | |||||
| } | |||||
| static void | |||||
| x86_sse_copyf (float *dest, const float *src, int length) | |||||
| { | |||||
| int i, n1, n2, si3; | |||||
| pv4sf m128p_src = (pv4sf) src; | |||||
| pv4sf m128p_dest = (pv4sf) dest; | |||||
| n1 = (length >> 5); | |||||
| n2 = ((length & 0x1f) >> 2); | |||||
| si3 = (length & ~0x3); | |||||
| for (i = 0; i < n1; i++) | |||||
| { | |||||
| asm volatile ("movaps %0, %%xmm0\n\t" | |||||
| : : "m" (*m128p_src++) : "xmm0", "memory"); | |||||
| asm volatile ("movaps %0, %%xmm1\n\t" | |||||
| : : "m" (*m128p_src++) : "xmm1", "memory"); | |||||
| asm volatile ("movaps %0, %%xmm2\n\t" | |||||
| : : "m" (*m128p_src++) : "xmm2", "memory"); | |||||
| asm volatile ("movaps %0, %%xmm3\n\t" | |||||
| : : "m" (*m128p_src++) : "xmm3", "memory"); | |||||
| asm volatile ("movaps %0, %%xmm4\n\t" | |||||
| : : "m" (*m128p_src++) : "xmm4", "memory"); | |||||
| asm volatile ("movaps %0, %%xmm5\n\t" | |||||
| : : "m" (*m128p_src++) : "xmm5", "memory"); | |||||
| asm volatile ("movaps %0, %%xmm6\n\t" | |||||
| : : "m" (*m128p_src++) : "xmm6", "memory"); | |||||
| asm volatile ("movaps %0, %%xmm7\n\t" | |||||
| : : "m" (*m128p_src++) : "xmm7", "memory"); | |||||
| asm volatile ("movaps %%xmm0, %0\n\t" | |||||
| : "=m" (*m128p_dest++) : : "xmm0", "memory"); | |||||
| asm volatile ("movaps %%xmm1, %0\n\t" | |||||
| : "=m" (*m128p_dest++) : : "xmm1", "memory"); | |||||
| asm volatile ("movaps %%xmm2, %0\n\t" | |||||
| : "=m" (*m128p_dest++) : : "xmm2", "memory"); | |||||
| asm volatile ("movaps %%xmm3, %0\n\t" | |||||
| : "=m" (*m128p_dest++) : : "xmm3", "memory"); | |||||
| asm volatile ("movaps %%xmm4, %0\n\t" | |||||
| : "=m" (*m128p_dest++) : : "xmm4", "memory"); | |||||
| asm volatile ("movaps %%xmm5, %0\n\t" | |||||
| : "=m" (*m128p_dest++) : : "xmm5", "memory"); | |||||
| asm volatile ("movaps %%xmm6, %0\n\t" | |||||
| : "=m" (*m128p_dest++) : : "xmm6", "memory"); | |||||
| asm volatile ("movaps %%xmm7, %0\n\t" | |||||
| : "=m" (*m128p_dest++) : : "xmm7", "memory"); | |||||
| } | |||||
| for (i = 0; i < n2; i++) | |||||
| { | |||||
| asm volatile ( | |||||
| "movaps %1, %%xmm0\n\t" \ | |||||
| "movaps %%xmm0, %0\n\t" | |||||
| : "=m" (*m128p_dest++) | |||||
| : "m" (*m128p_src++) | |||||
| : "xmm0", "memory"); | |||||
| } | |||||
| for (i = si3; i < length; i++) | |||||
| { | |||||
| asm volatile ( | |||||
| "movss %1, %%xmm0\n\t" \ | |||||
| "movss %%xmm0, %0\n\t" | |||||
| : "=m" (dest[i]) | |||||
| : "m" (src[i]) | |||||
| : "xmm0", "memory"); | |||||
| } | |||||
| } | |||||
| static void | |||||
| x86_sse_add2f (float *dest, const float *src, int length) | |||||
| { | |||||
| int i, n, si2; | |||||
| pv4sf m128p_src = (pv4sf) src; | |||||
| pv4sf m128p_dest = (pv4sf) dest; | |||||
| if (__builtin_expect(((long) src & 0xf) || ((long) dest & 0xf), 0)) | |||||
| { | |||||
| /*fprintf(stderr, "x86_sse_add2f(): non aligned pointers!\n");*/ | |||||
| si2 = 0; | |||||
| goto sse_nonalign; | |||||
| } | |||||
| si2 = (length & ~0x3); | |||||
| n = (length >> 2); | |||||
| for (i = 0; i < n; i++) | |||||
| { | |||||
| asm volatile ( | |||||
| "movaps %1, %%xmm0\n\t" \ | |||||
| "addps %2, %%xmm0\n\t" \ | |||||
| "movaps %%xmm0, %0\n\t" | |||||
| : "=m" (m128p_dest[i]) | |||||
| : "m0" (m128p_dest[i]), | |||||
| "m" (m128p_src[i]) | |||||
| : "xmm0", "memory"); | |||||
| } | |||||
| sse_nonalign: | |||||
| for (i = si2; i < length; i++) | |||||
| { | |||||
| asm volatile ( | |||||
| "movss %1, %%xmm0\n\t" \ | |||||
| "addss %2, %%xmm0\n\t" \ | |||||
| "movss %%xmm0, %0\n\t" | |||||
| : "=m" (dest[i]) | |||||
| : "m0" (dest[i]), | |||||
| "m" (src[i]) | |||||
| : "xmm0", "memory"); | |||||
| } | |||||
| } | |||||
| void jack_port_set_funcs () | void jack_port_set_funcs () | ||||
| { | { | ||||
| if (ARCH_X86_HAVE_SSE2(cpu_type)) { | if (ARCH_X86_HAVE_SSE2(cpu_type)) { | ||||
| @@ -0,0 +1,335 @@ | |||||
| /* -*- mode: c; c-file-style: "bsd"; -*- */ | |||||
| /* | |||||
| Copyright (C) 2005-2007 Jussi Laako | |||||
| This program is free software; you can redistribute it and/or modify | |||||
| it under the terms of the GNU Lesser General Public License as published by | |||||
| the Free Software Foundation; either version 2.1 of the License, or | |||||
| (at your option) any later version. | |||||
| This program is distributed in the hope that it will be useful, | |||||
| but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |||||
| GNU Lesser General Public License for more details. | |||||
| You should have received a copy of the GNU Lesser General Public License | |||||
| along with this program; if not, write to the Free Software | |||||
| Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | |||||
| */ | |||||
| #include <config.h> | |||||
| #include <jack/intsimd.h> | |||||
| #ifdef USE_DYNSIMD | |||||
| #ifdef ARCH_X86 | |||||
/*
 * Run the CPUID instruction for the given leaf (sub-leaf 0) and store the
 * resulting EAX, ECX and EDX values through the supplied pointers.
 *
 * CPUID overwrites EBX/RBX.  The register is saved into a scratch operand
 * and restored inside the SAME asm statement, so the compiler never sees a
 * modified stack pointer (a bare push/pop pair in separate statements can
 * overwrite the x86-64 red zone) and the PIC register on 32-bit builds is
 * left intact.
 */
static void
x86_cpuid (unsigned int leaf, unsigned int *eax, unsigned int *ecx,
	   unsigned int *edx)
{
	unsigned int a, b, c, d;

	__asm__ __volatile__ (
#ifdef __x86_64__
		"movq %%rbx, %q1\n\t"
		"cpuid\n\t"
		"xchgq %%rbx, %q1\n\t"
#else
		"movl %%ebx, %1\n\t"
		"cpuid\n\t"
		"xchgl %%ebx, %1\n\t"
#endif
		: "=a" (a), "=&r" (b), "=c" (c), "=d" (d)
		: "0" (leaf), "2" (0U));

	(void) b;	/* EBX result is not needed by any caller */
	*eax = a;
	*ecx = c;
	*edx = d;
}

/*
 * Detect AMD 3DNow! support.
 *
 * Returns 0 when extended CPUID leaf 0x80000001 is unavailable or EDX
 * bit 31 is clear, 1 when bit 31 (3DNow!) is set, and 2 when bit 30
 * (extended 3DNow!) is set as well.
 *
 * The maximum extended leaf is compared as an UNSIGNED value: a CPU that
 * reports a small number for leaf 0x80000000 must not be treated as
 * supporting leaf 0x80000001.
 */
int
have_3dnow (void)
{
	unsigned int max_ext_leaf, unused_eax, ecx, edx;

	x86_cpuid (0x80000000U, &max_ext_leaf, &ecx, &edx);
	if (max_ext_leaf < 0x80000001U)
	{
		return 0;
	}

	x86_cpuid (0x80000001U, &unused_eax, &ecx, &edx);
	if (!(edx & (1U << 31)))
	{
		return 0;
	}
	if (!(edx & (1U << 30)))
	{
		return 1;
	}
	return 2;
}

/*
 * Detect SSE support from CPUID leaf 1.
 *
 * Returns 0 when EDX bit 25 (SSE) is clear, 1 for SSE only, 2 when EDX
 * bit 26 (SSE2) is also set, and 3 when ECX bit 0 (SSE3) is set on top
 * of that.  Each level is only reported if all lower levels are present.
 */
int
have_sse (void)
{
	unsigned int unused_eax, ecx, edx;

	x86_cpuid (1U, &unused_eax, &ecx, &edx);
	if (!(edx & (1U << 25)))
	{
		return 0;
	}
	if (!(edx & (1U << 26)))
	{
		return 1;
	}
	if (!(ecx & 1U))
	{
		return 2;
	}
	return 3;
}
/*
 * Copy `length` floats from `src` to `dest` through the MMX registers.
 *
 * The bulk is moved 16 floats (eight 64-bit quadwords) at a time, the
 * remainder two floats at a time, and a final odd float with MOVD.
 * Each group is loaded completely before it is stored.
 *
 * All eight loads and stores of a 16-float group live in ONE asm
 * statement with every MMX register listed as clobbered, so the compiler
 * is never asked to keep values alive in mm0-mm7 between statements it
 * knows nothing about (the previous form also named "xmm7" instead of
 * "mm7" in one clobber list).
 *
 * FEMMS is issued at the end to leave MMX state before returning.
 */
void
x86_3dnow_copyf (float *dest, const float *src, int length)
{
	int k;
	const int blocks = (length >> 4);
	const int pairs = ((length & 0xf) >> 1);
	float *d = dest;
	const float *s = src;

	for (k = 0; k < blocks; k++)
	{
		__asm__ __volatile__ (
			"movq   (%1), %%mm0\n\t"
			"movq  8(%1), %%mm1\n\t"
			"movq 16(%1), %%mm2\n\t"
			"movq 24(%1), %%mm3\n\t"
			"movq 32(%1), %%mm4\n\t"
			"movq 40(%1), %%mm5\n\t"
			"movq 48(%1), %%mm6\n\t"
			"movq 56(%1), %%mm7\n\t"
			"movq %%mm0,   (%0)\n\t"
			"movq %%mm1,  8(%0)\n\t"
			"movq %%mm2, 16(%0)\n\t"
			"movq %%mm3, 24(%0)\n\t"
			"movq %%mm4, 32(%0)\n\t"
			"movq %%mm5, 40(%0)\n\t"
			"movq %%mm6, 48(%0)\n\t"
			"movq %%mm7, 56(%0)\n\t"
			:
			: "r" (d), "r" (s)
			: "mm0", "mm1", "mm2", "mm3",
			  "mm4", "mm5", "mm6", "mm7", "memory");
		d += 16;
		s += 16;
	}

	for (k = 0; k < pairs; k++)
	{
		__asm__ __volatile__ (
			"movq (%1), %%mm0\n\t"
			"movq %%mm0, (%0)\n\t"
			:
			: "r" (d), "r" (s)
			: "mm0", "memory");
		d += 2;
		s += 2;
	}

	/* odd element count: move the last single float */
	if (length & 0x1)
	{
		__asm__ __volatile__ (
			"movd %1, %%mm0\n\t"
			"movd %%mm0, %0\n\t"
			: "=m" (dest[length - 1])
			: "m" (src[length - 1])
			: "mm0", "memory");
	}

	__asm__ __volatile__ (
		"femms\n\t"
		"sfence\n\t"
		:
		:
		: "memory");
}
/*
 * Add `length` floats from `src` onto `dest` in place (dest[i] += src[i])
 * using 3DNow! PFADD.
 *
 * Floats are processed two at a time; when `length` is odd the final
 * element is added with a scalar MOVD/PFADD sequence.  The odd-tail test
 * is made on `length` itself: testing the pair count instead would add
 * the last element twice for lengths such as 2 or 6 and skip it for
 * lengths such as 5 or 9.
 *
 * FEMMS is issued at the end to leave MMX state before returning.
 */
void
x86_3dnow_add2f (float *dest, const float *src, int length)
{
	int k;
	const int pairs = (length >> 1);

	for (k = 0; k < pairs; k++)
	{
		__asm__ __volatile__ (
			"movq (%0), %%mm0\n\t"
			"pfadd (%1), %%mm0\n\t"
			"movq %%mm0, (%0)\n\t"
			:
			: "r" (dest + 2 * k), "r" (src + 2 * k)
			: "mm0", "memory");
	}

	/* odd element count: handle the one float the pair loop left over */
	if (length & 0x1)
	{
		__asm__ __volatile__ (
			"movd %0, %%mm0\n\t"
			"movd %1, %%mm1\n\t"
			"pfadd %%mm1, %%mm0\n\t"
			"movd %%mm0, %0\n\t"
			: "+m" (dest[length - 1])
			: "m" (src[length - 1])
			: "mm0", "mm1", "memory");
	}

	__asm__ __volatile__ (
		"femms\n\t"
		"sfence\n\t"
		:
		:
		: "memory");
}
/*
 * Copy `length` floats from `src` to `dest` using SSE moves.
 *
 * When both pointers are 16-byte aligned the bulk is moved with MOVAPS,
 * 32 floats (eight XMM registers) per step and then 4 floats per step;
 * whatever is left is moved one float at a time with MOVSS.
 *
 * MOVAPS faults on addresses that are not 16-byte aligned, so if either
 * pointer is misaligned the whole buffer goes through the scalar MOVSS
 * loop instead -- the same fallback x86_sse_add2f() uses.
 *
 * Each 32-float group is loaded and stored inside a single asm statement
 * so no values have to survive in XMM registers between statements.
 * A length of zero or less copies nothing.
 */
void
x86_sse_copyf (float *dest, const float *src, int length)
{
	int i = 0;
	const int aligned =
		((((unsigned long) src | (unsigned long) dest) & 0xf) == 0);

	if (__builtin_expect (aligned, 1))
	{
		/* `length - i` rather than `i + 32` keeps the test overflow-free */
		for (; length - i >= 32; i += 32)
		{
			__asm__ __volatile__ (
				"movaps    (%1), %%xmm0\n\t"
				"movaps  16(%1), %%xmm1\n\t"
				"movaps  32(%1), %%xmm2\n\t"
				"movaps  48(%1), %%xmm3\n\t"
				"movaps  64(%1), %%xmm4\n\t"
				"movaps  80(%1), %%xmm5\n\t"
				"movaps  96(%1), %%xmm6\n\t"
				"movaps 112(%1), %%xmm7\n\t"
				"movaps %%xmm0,    (%0)\n\t"
				"movaps %%xmm1,  16(%0)\n\t"
				"movaps %%xmm2,  32(%0)\n\t"
				"movaps %%xmm3,  48(%0)\n\t"
				"movaps %%xmm4,  64(%0)\n\t"
				"movaps %%xmm5,  80(%0)\n\t"
				"movaps %%xmm6,  96(%0)\n\t"
				"movaps %%xmm7, 112(%0)\n\t"
				:
				: "r" (dest + i), "r" (src + i)
				: "xmm0", "xmm1", "xmm2", "xmm3",
				  "xmm4", "xmm5", "xmm6", "xmm7", "memory");
		}

		for (; length - i >= 4; i += 4)
		{
			__asm__ __volatile__ (
				"movaps (%1), %%xmm0\n\t"
				"movaps %%xmm0, (%0)\n\t"
				:
				: "r" (dest + i), "r" (src + i)
				: "xmm0", "memory");
		}
	}

	/* scalar tail, or the entire buffer when the pointers are misaligned */
	for (; i < length; i++)
	{
		__asm__ __volatile__ (
			"movss %1, %%xmm0\n\t"
			"movss %%xmm0, %0\n\t"
			: "=m" (dest[i])
			: "m" (src[i])
			: "xmm0", "memory");
	}
}
/*
 * Add `length` floats from `src` onto `dest` in place (dest[i] += src[i])
 * using SSE.
 *
 * With both pointers 16-byte aligned, groups of four floats are added
 * with MOVAPS/ADDPS and only the remaining (length % 4) floats take the
 * scalar MOVSS/ADDSS path.  If either pointer is misaligned the packed
 * loop is skipped and every element is added by the scalar loop.
 */
void
x86_sse_add2f (float *dest, const float *src, int length)
{
	int k;
	int scalar_from = 0;	/* first index handled by the scalar loop */
	const int misaligned = (((long) src & 0xf) || ((long) dest & 0xf));

	if (!__builtin_expect (misaligned, 0))
	{
		const int quads = (length >> 2);

		for (k = 0; k < quads; k++)
		{
			__asm__ __volatile__ (
				"movaps (%0), %%xmm0\n\t"
				"addps (%1), %%xmm0\n\t"
				"movaps %%xmm0, (%0)\n\t"
				:
				: "r" (dest + 4 * k), "r" (src + 4 * k)
				: "xmm0", "memory");
		}
		scalar_from = (length & ~0x3);
	}
	/*else: x86_sse_add2f(): non aligned pointers, scalar path only */

	for (k = scalar_from; k < length; k++)
	{
		__asm__ __volatile__ (
			"movss %0, %%xmm0\n\t"
			"addss %1, %%xmm0\n\t"
			"movss %%xmm0, %0\n\t"
			: "+m" (dest[k])
			: "m" (src[k])
			: "xmm0", "memory");
	}
}
| #endif /* ARCH_X86 */ | |||||
| #endif /* USE_DYNSIMD */ | |||||