Browse Source

- Add support for proper triggering in OSS driver when in full duplex mode

- Move inline asm SIMD functionality to a separate source file


git-svn-id: svn+ssh://jackaudio.org/trunk/jack@1050 0c269be4-1314-0410-8aa9-9f06e86f4224
tags/0.109.0
sonarnerd 18 years ago
parent
commit
04df4321bd
8 changed files with 369 additions and 311 deletions
  1. +1
    -1
      configure.ac
  2. +19
    -1
      drivers/oss/oss_driver.c
  3. +2
    -1
      drivers/oss/oss_driver.h
  4. +9
    -2
      jack/intsimd.h
  5. +2
    -1
      libjack/Makefile.am
  6. +1
    -95
      libjack/client.c
  7. +0
    -210
      libjack/port.c
  8. +335
    -0
      libjack/simd.c

+ 1
- 1
configure.ac View File

@@ -17,7 +17,7 @@ dnl changes are made
dnl ---
JACK_MAJOR_VERSION=0
JACK_MINOR_VERSION=107
JACK_MICRO_VERSION=1
JACK_MICRO_VERSION=2

dnl ---
dnl HOWTO: updating the jack protocol version


+ 19
- 1
drivers/oss/oss_driver.c View File

@@ -1,7 +1,7 @@
/*

OSS driver for Jack
Copyright (C) 2003-2005 Jussi Laako <jussi@sonarnerd.net>
Copyright (C) 2003-2007 Jussi Laako <jussi@sonarnerd.net>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -456,6 +456,7 @@ static int oss_driver_start (oss_driver_t *driver)
samplesize = sizeof(short);
break;
}
driver->trigger = 0;
if (strcmp(indev, outdev) != 0)
{
if (driver->capture_channels > 0)
@@ -529,6 +530,8 @@ static int oss_driver_start (oss_driver_t *driver)
}
if (infd >= 0 && outfd >= 0)
{
ioctl(outfd, SNDCTL_DSP_SETTRIGGER, &driver->trigger);
driver->trigger = (PCM_ENABLE_INPUT|PCM_ENABLE_OUTPUT);
if (ioctl(infd, SNDCTL_DSP_SETDUPLEX, 0) < 0)
{
if (errno != EINVAL) /* Dont care */
@@ -961,6 +964,13 @@ static void *io_thread (void *param)
__FILE__, __LINE__);
return NULL;
}
if (driver->trigger)
{
/* don't care too much if this fails */
memset(localbuf, 0x00, localsize);
write(driver->outfd, localbuf, localsize);
ioctl(driver->outfd, SNDCTL_DSP_SETTRIGGER, &driver->trigger);
}

while (driver->run)
{
@@ -992,6 +1002,13 @@ static void *io_thread (void *param)
jack_error("OSS: malloc() failed: %s@%i", __FILE__, __LINE__);
return NULL;
}
if (driver->trigger)
{
/* don't care too much if this fails */
memset(localbuf, 0x00, localsize);
write(driver->outfd, localbuf, driver->outdevbufsize);
ioctl(driver->outfd, SNDCTL_DSP_SETTRIGGER, &driver->trigger);
}

while (driver->run)
{
@@ -1119,6 +1136,7 @@ jack_driver_t * driver_initialize (jack_client_t *client,
driver->indev = NULL;
driver->outdev = NULL;
driver->ignorehwbuf = 0;
driver->trigger = 0;

pnode = params;
while (pnode != NULL)


+ 2
- 1
drivers/oss/oss_driver.h View File

@@ -1,7 +1,7 @@
/*

OSS driver for Jack
Copyright (C) 2003-2005 Jussi Laako <jussi@sonarnerd.net>
Copyright (C) 2003-2007 Jussi Laako <jussi@sonarnerd.net>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -62,6 +62,7 @@ typedef struct _oss_driver
int outfd;
int format;
int ignorehwbuf;
int trigger;

size_t indevbufsize;
size_t outdevbufsize;


+ 9
- 2
jack/intsimd.h View File

@@ -1,5 +1,5 @@
/*
Copyright (C) 2005 Jussi Laako
Copyright (C) 2005-2007 Jussi Laako
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -39,9 +39,16 @@ typedef v4sf * pv4sf;

extern int cpu_type;

int have_3dnow (void);
int have_sse (void);
void x86_3dnow_copyf (float *, const float *, int);
void x86_3dnow_add2f (float *, const float *, int);
void x86_sse_copyf (float *, const float *, int);
void x86_sse_add2f (float *, const float *, int);

#endif /* ARCH_X86 */

extern void jack_port_set_funcs (void);
void jack_port_set_funcs (void);

#endif /* __jack_intsimd_h__ */


+ 2
- 1
libjack/Makefile.am View File

@@ -23,7 +23,8 @@ SOURCE_FILES = \
time.c \
timestamps.c \
transclient.c \
unlock.c
unlock.c \
simd.c

lib_LTLIBRARIES = libjack.la



+ 1
- 95
libjack/client.c View File

@@ -63,9 +63,6 @@

static pthread_mutex_t client_lock;
static pthread_cond_t client_ready;
#ifdef ARCH_X86
int cpu_type = 0;
#endif /* ARCH_X86 */

#define EVENT_POLL_INDEX 0
#define WAIT_POLL_INDEX 1
@@ -82,98 +79,7 @@ typedef struct {

#ifdef ARCH_X86

/*
 * Run CPUID for `leaf' and store the resulting EAX and EDX values.
 *
 * EBX/RBX is written by CPUID.  It is handled inside the same asm statement
 * (as an output on x86-64, swapped out and back on i386 where it may hold the
 * PIC base), so the compiler always sees a consistent register and stack
 * state.  Assumes the CPUID instruction itself exists on the running CPU.
 */
static void
tdnow_cpuid (unsigned int leaf, unsigned int *eax_out, unsigned int *edx_out)
{
	unsigned int a, b, c, d;

#ifdef __x86_64__
	__asm__ __volatile__ (
		"cpuid\n\t"
		: "=a" (a), "=b" (b), "=c" (c), "=d" (d)
		: "0" (leaf), "2" (0));
#else
	__asm__ __volatile__ (
		"xchgl %%ebx, %1\n\t"
		"cpuid\n\t"
		"xchgl %%ebx, %1\n\t"
		: "=a" (a), "=&r" (b), "=c" (c), "=d" (d)
		: "0" (leaf), "2" (0));
#endif
	(void) b;
	(void) c;
	*eax_out = a;
	*edx_out = d;
}

/*
 * Detect AMD 3DNow! support.
 *
 * Returns 0 when 3DNow! is not reported, 1 when bit 31 of EDX from extended
 * leaf 0x80000001 is set, and 2 when bit 30 of the same register is set as
 * well.
 */
static int
have_3dnow (void)
{
	unsigned int max_ext_leaf = 0;
	unsigned int ext_features = 0;
	unsigned int ignored = 0;

	tdnow_cpuid (0x80000000u, &max_ext_leaf, &ignored);

	/* Unsigned compare: a CPU without extended leaves may report a small
	 * value here, which a signed compare against 0x80000001 would treat
	 * as "large enough". */
	if (max_ext_leaf < 0x80000001u)
		return 0;

	tdnow_cpuid (0x80000001u, &ignored, &ext_features);

	if (!(ext_features & (1u << 31)))
		return 0;
	if (!(ext_features & (1u << 30)))
		return 1;
	return 2;
}

/*
 * Run CPUID for `leaf' and store the resulting ECX and EDX values.
 *
 * EBX/RBX is written by CPUID.  It is handled inside the same asm statement
 * (as an output on x86-64, swapped out and back on i386 where it may hold the
 * PIC base) instead of being pushed and popped in separate statements.
 * Assumes the CPUID instruction itself exists on the running CPU.
 */
static void
sse_cpuid (unsigned int leaf, unsigned int *ecx_out, unsigned int *edx_out)
{
	unsigned int a, b, c, d;

#ifdef __x86_64__
	__asm__ __volatile__ (
		"cpuid\n\t"
		: "=a" (a), "=b" (b), "=c" (c), "=d" (d)
		: "0" (leaf), "2" (0));
#else
	__asm__ __volatile__ (
		"xchgl %%ebx, %1\n\t"
		"cpuid\n\t"
		"xchgl %%ebx, %1\n\t"
		: "=a" (a), "=&r" (b), "=c" (c), "=d" (d)
		: "0" (leaf), "2" (0));
#endif
	(void) a;
	(void) b;
	*ecx_out = c;
	*edx_out = d;
}

/*
 * Detect the SSE level from CPUID leaf 1.
 *
 * Returns 0 when EDX bit 25 is clear, 1 when it is set, 2 when EDX bit 26 is
 * set as well, and 3 when ECX bit 0 is additionally set.  Each level requires
 * all the lower ones.
 */
static int
have_sse (void)
{
	unsigned int ecx_bits = 0;
	unsigned int edx_bits = 0;

	sse_cpuid (1u, &ecx_bits, &edx_bits);

	if (!(edx_bits & (1u << 25)))
		return 0;
	if (!(edx_bits & (1u << 26)))
		return 1;
	if (!(ecx_bits & 1u))
		return 2;
	return 3;
}
int cpu_type = 0;

static void
init_cpu ()


+ 0
- 210
libjack/port.c View File

@@ -96,216 +96,6 @@ gen_mixf (float *dest, const float *src, int length)

#ifdef ARCH_X86

/*
 * dest[i] += src[i] for i in [0, length), using 3DNow! packed adds.
 *
 * Samples are processed two at a time; an odd trailing sample is added on
 * its own.  The caller must only use this on CPUs that report 3DNow!.
 */
static void
x86_3dnow_add2f (float *dest, const float *src, int length)
{
	int pair;
	int pairs;

	if (length <= 0)
		return;

	pairs = (length >> 1);
	for (pair = 0; pair < pairs; pair++)
	{
		__asm__ __volatile__ (
			"movq (%0), %%mm0\n\t"
			"pfadd (%1), %%mm0\n\t"
			"movq %%mm0, (%0)\n\t"
			:
			: "r" (dest + 2 * pair), "r" (src + 2 * pair)
			: "mm0", "memory");
	}
	/* The leftover sample exists when the sample count is odd, not when
	 * the pair count is odd. */
	if (length & 0x1)
	{
		__asm__ __volatile__ (
			"movd (%0), %%mm0\n\t"
			"movd (%1), %%mm1\n\t"
			"pfadd %%mm1, %%mm0\n\t"
			"movd %%mm0, (%0)\n\t"
			:
			: "r" (dest + (length - 1)), "r" (src + (length - 1))
			: "mm0", "mm1", "memory");
	}
	/* leave MMX state and order the stores */
	__asm__ __volatile__ (
		"femms\n\t"
		"sfence\n\t"
		: : : "memory");
}

/*
 * Copy `length' floats from src to dest through the MMX registers.
 *
 * Blocks of 16 floats are moved with eight quadword loads followed by eight
 * quadword stores, then remaining pairs, then a single trailing float.
 * Each block is one asm statement so the registers cannot be reused by the
 * compiler between the loads and the stores.  The caller must only use this
 * on CPUs that report 3DNow! (femms is issued at the end).
 */
static void
x86_3dnow_copyf (float *dest, const float *src, int length)
{
	int done = 0;

	if (length <= 0)
		return;

	for (; length - done >= 16; done += 16)
	{
		__asm__ __volatile__ (
			"movq   (%1), %%mm0\n\t"
			"movq  8(%1), %%mm1\n\t"
			"movq 16(%1), %%mm2\n\t"
			"movq 24(%1), %%mm3\n\t"
			"movq 32(%1), %%mm4\n\t"
			"movq 40(%1), %%mm5\n\t"
			"movq 48(%1), %%mm6\n\t"
			"movq 56(%1), %%mm7\n\t"
			"movq %%mm0,   (%0)\n\t"
			"movq %%mm1,  8(%0)\n\t"
			"movq %%mm2, 16(%0)\n\t"
			"movq %%mm3, 24(%0)\n\t"
			"movq %%mm4, 32(%0)\n\t"
			"movq %%mm5, 40(%0)\n\t"
			"movq %%mm6, 48(%0)\n\t"
			"movq %%mm7, 56(%0)\n\t"
			:
			: "r" (dest + done), "r" (src + done)
			: "mm0", "mm1", "mm2", "mm3",
			  "mm4", "mm5", "mm6", "mm7", "memory");
	}
	for (; length - done >= 2; done += 2)
	{
		__asm__ __volatile__ (
			"movq (%1), %%mm0\n\t"
			"movq %%mm0, (%0)\n\t"
			:
			: "r" (dest + done), "r" (src + done)
			: "mm0", "memory");
	}
	if (length & 0x1)
	{
		__asm__ __volatile__ (
			"movd (%1), %%mm0\n\t"
			"movd %%mm0, (%0)\n\t"
			:
			: "r" (dest + (length - 1)), "r" (src + (length - 1))
			: "mm0", "memory");
	}
	/* leave MMX state and order the stores */
	__asm__ __volatile__ (
		"femms\n\t"
		"sfence\n\t"
		: : : "memory");
}

/*
 * Copy `length' floats from src to dest using SSE moves.
 *
 * When both pointers are 16-byte aligned, blocks of 32 floats and then
 * blocks of 4 floats are moved with movaps; whatever remains is moved one
 * float at a time with movss.  If either pointer is not 16-byte aligned the
 * whole copy uses the scalar path, because movaps faults on unaligned
 * addresses.
 */
static void
x86_sse_copyf (float *dest, const float *src, int length)
{
	int done = 0;

	if ((((unsigned long) src | (unsigned long) dest) & 0xf) == 0)
	{
		/* Eight loads then eight stores per statement, so the
		 * registers stay live only inside one asm block. */
		for (; length - done >= 32; done += 32)
		{
			__asm__ __volatile__ (
				"movaps    (%1), %%xmm0\n\t"
				"movaps  16(%1), %%xmm1\n\t"
				"movaps  32(%1), %%xmm2\n\t"
				"movaps  48(%1), %%xmm3\n\t"
				"movaps  64(%1), %%xmm4\n\t"
				"movaps  80(%1), %%xmm5\n\t"
				"movaps  96(%1), %%xmm6\n\t"
				"movaps 112(%1), %%xmm7\n\t"
				"movaps %%xmm0,    (%0)\n\t"
				"movaps %%xmm1,  16(%0)\n\t"
				"movaps %%xmm2,  32(%0)\n\t"
				"movaps %%xmm3,  48(%0)\n\t"
				"movaps %%xmm4,  64(%0)\n\t"
				"movaps %%xmm5,  80(%0)\n\t"
				"movaps %%xmm6,  96(%0)\n\t"
				"movaps %%xmm7, 112(%0)\n\t"
				:
				: "r" (dest + done), "r" (src + done)
				: "xmm0", "xmm1", "xmm2", "xmm3",
				  "xmm4", "xmm5", "xmm6", "xmm7", "memory");
		}
		for (; length - done >= 4; done += 4)
		{
			__asm__ __volatile__ (
				"movaps (%1), %%xmm0\n\t"
				"movaps %%xmm0, (%0)\n\t"
				:
				: "r" (dest + done), "r" (src + done)
				: "xmm0", "memory");
		}
	}
	for (; done < length; done++)
	{
		__asm__ __volatile__ (
			"movss (%1), %%xmm0\n\t"
			"movss %%xmm0, (%0)\n\t"
			:
			: "r" (dest + done), "r" (src + done)
			: "xmm0", "memory");
	}
}

static void
x86_sse_add2f (float *dest, const float *src, int length)
{
int i, n, si2;
pv4sf m128p_src = (pv4sf) src;
pv4sf m128p_dest = (pv4sf) dest;

if (__builtin_expect(((long) src & 0xf) || ((long) dest & 0xf), 0))
{
/*fprintf(stderr, "x86_sse_add2f(): non aligned pointers!\n");*/
si2 = 0;
goto sse_nonalign;
}
si2 = (length & ~0x3);
n = (length >> 2);
for (i = 0; i < n; i++)
{
asm volatile (
"movaps %1, %%xmm0\n\t" \
"addps %2, %%xmm0\n\t" \
"movaps %%xmm0, %0\n\t"
: "=m" (m128p_dest[i])
: "m0" (m128p_dest[i]),
"m" (m128p_src[i])
: "xmm0", "memory");
}
sse_nonalign:
for (i = si2; i < length; i++)
{
asm volatile (
"movss %1, %%xmm0\n\t" \
"addss %2, %%xmm0\n\t" \
"movss %%xmm0, %0\n\t"
: "=m" (dest[i])
: "m0" (dest[i]),
"m" (src[i])
: "xmm0", "memory");
}
}

void jack_port_set_funcs ()
{
if (ARCH_X86_HAVE_SSE2(cpu_type)) {


+ 335
- 0
libjack/simd.c View File

@@ -0,0 +1,335 @@
/* -*- mode: c; c-file-style: "bsd"; -*- */
/*
Copyright (C) 2005-2007 Jussi Laako
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

*/


#include <config.h>
#include <jack/intsimd.h>

#ifdef USE_DYNSIMD

#ifdef ARCH_X86

/*
 * Run CPUID for `leaf' and store the resulting EAX and EDX values.
 *
 * EBX/RBX is written by CPUID.  It is handled inside the same asm statement
 * (as an output on x86-64, swapped out and back on i386 where it may hold the
 * PIC base), so the compiler always sees a consistent register and stack
 * state.  Assumes the CPUID instruction itself exists on the running CPU.
 */
static void
tdnow_cpuid (unsigned int leaf, unsigned int *eax_out, unsigned int *edx_out)
{
	unsigned int a, b, c, d;

#ifdef __x86_64__
	__asm__ __volatile__ (
		"cpuid\n\t"
		: "=a" (a), "=b" (b), "=c" (c), "=d" (d)
		: "0" (leaf), "2" (0));
#else
	__asm__ __volatile__ (
		"xchgl %%ebx, %1\n\t"
		"cpuid\n\t"
		"xchgl %%ebx, %1\n\t"
		: "=a" (a), "=&r" (b), "=c" (c), "=d" (d)
		: "0" (leaf), "2" (0));
#endif
	(void) b;
	(void) c;
	*eax_out = a;
	*edx_out = d;
}

/*
 * Detect AMD 3DNow! support.
 *
 * Returns 0 when 3DNow! is not reported, 1 when bit 31 of EDX from extended
 * leaf 0x80000001 is set, and 2 when bit 30 of the same register is set as
 * well.
 */
int
have_3dnow (void)
{
	unsigned int max_ext_leaf = 0;
	unsigned int ext_features = 0;
	unsigned int ignored = 0;

	tdnow_cpuid (0x80000000u, &max_ext_leaf, &ignored);

	/* Unsigned compare: a CPU without extended leaves may report a small
	 * value here, which a signed compare against 0x80000001 would treat
	 * as "large enough". */
	if (max_ext_leaf < 0x80000001u)
		return 0;

	tdnow_cpuid (0x80000001u, &ignored, &ext_features);

	if (!(ext_features & (1u << 31)))
		return 0;
	if (!(ext_features & (1u << 30)))
		return 1;
	return 2;
}

/*
 * Run CPUID for `leaf' and store the resulting ECX and EDX values.
 *
 * EBX/RBX is written by CPUID.  It is handled inside the same asm statement
 * (as an output on x86-64, swapped out and back on i386 where it may hold the
 * PIC base) instead of being pushed and popped in separate statements.
 * Assumes the CPUID instruction itself exists on the running CPU.
 */
static void
sse_cpuid (unsigned int leaf, unsigned int *ecx_out, unsigned int *edx_out)
{
	unsigned int a, b, c, d;

#ifdef __x86_64__
	__asm__ __volatile__ (
		"cpuid\n\t"
		: "=a" (a), "=b" (b), "=c" (c), "=d" (d)
		: "0" (leaf), "2" (0));
#else
	__asm__ __volatile__ (
		"xchgl %%ebx, %1\n\t"
		"cpuid\n\t"
		"xchgl %%ebx, %1\n\t"
		: "=a" (a), "=&r" (b), "=c" (c), "=d" (d)
		: "0" (leaf), "2" (0));
#endif
	(void) a;
	(void) b;
	*ecx_out = c;
	*edx_out = d;
}

/*
 * Detect the SSE level from CPUID leaf 1.
 *
 * Returns 0 when EDX bit 25 is clear, 1 when it is set, 2 when EDX bit 26 is
 * set as well, and 3 when ECX bit 0 is additionally set.  Each level requires
 * all the lower ones.
 */
int
have_sse (void)
{
	unsigned int ecx_bits = 0;
	unsigned int edx_bits = 0;

	sse_cpuid (1u, &ecx_bits, &edx_bits);

	if (!(edx_bits & (1u << 25)))
		return 0;
	if (!(edx_bits & (1u << 26)))
		return 1;
	if (!(ecx_bits & 1u))
		return 2;
	return 3;
}

/*
 * Copy `length' floats from src to dest through the MMX registers.
 *
 * Blocks of 16 floats are moved with eight quadword loads followed by eight
 * quadword stores, then remaining pairs, then a single trailing float.
 * Each block is one asm statement so the registers cannot be reused by the
 * compiler between the loads and the stores.  The caller must only use this
 * on CPUs that report 3DNow! (femms is issued at the end).
 */
void
x86_3dnow_copyf (float *dest, const float *src, int length)
{
	int done = 0;

	if (length <= 0)
		return;

	for (; length - done >= 16; done += 16)
	{
		__asm__ __volatile__ (
			"movq   (%1), %%mm0\n\t"
			"movq  8(%1), %%mm1\n\t"
			"movq 16(%1), %%mm2\n\t"
			"movq 24(%1), %%mm3\n\t"
			"movq 32(%1), %%mm4\n\t"
			"movq 40(%1), %%mm5\n\t"
			"movq 48(%1), %%mm6\n\t"
			"movq 56(%1), %%mm7\n\t"
			"movq %%mm0,   (%0)\n\t"
			"movq %%mm1,  8(%0)\n\t"
			"movq %%mm2, 16(%0)\n\t"
			"movq %%mm3, 24(%0)\n\t"
			"movq %%mm4, 32(%0)\n\t"
			"movq %%mm5, 40(%0)\n\t"
			"movq %%mm6, 48(%0)\n\t"
			"movq %%mm7, 56(%0)\n\t"
			:
			: "r" (dest + done), "r" (src + done)
			: "mm0", "mm1", "mm2", "mm3",
			  "mm4", "mm5", "mm6", "mm7", "memory");
	}
	for (; length - done >= 2; done += 2)
	{
		__asm__ __volatile__ (
			"movq (%1), %%mm0\n\t"
			"movq %%mm0, (%0)\n\t"
			:
			: "r" (dest + done), "r" (src + done)
			: "mm0", "memory");
	}
	if (length & 0x1)
	{
		__asm__ __volatile__ (
			"movd (%1), %%mm0\n\t"
			"movd %%mm0, (%0)\n\t"
			:
			: "r" (dest + (length - 1)), "r" (src + (length - 1))
			: "mm0", "memory");
	}
	/* leave MMX state and order the stores */
	__asm__ __volatile__ (
		"femms\n\t"
		"sfence\n\t"
		: : : "memory");
}

/*
 * dest[i] += src[i] for i in [0, length), using 3DNow! packed adds.
 *
 * Samples are processed two at a time; an odd trailing sample is added on
 * its own.  The caller must only use this on CPUs that report 3DNow!.
 */
void
x86_3dnow_add2f (float *dest, const float *src, int length)
{
	int pair;
	int pairs;

	if (length <= 0)
		return;

	pairs = (length >> 1);
	for (pair = 0; pair < pairs; pair++)
	{
		__asm__ __volatile__ (
			"movq (%0), %%mm0\n\t"
			"pfadd (%1), %%mm0\n\t"
			"movq %%mm0, (%0)\n\t"
			:
			: "r" (dest + 2 * pair), "r" (src + 2 * pair)
			: "mm0", "memory");
	}
	/* The leftover sample exists when the sample count is odd, not when
	 * the pair count is odd. */
	if (length & 0x1)
	{
		__asm__ __volatile__ (
			"movd (%0), %%mm0\n\t"
			"movd (%1), %%mm1\n\t"
			"pfadd %%mm1, %%mm0\n\t"
			"movd %%mm0, (%0)\n\t"
			:
			: "r" (dest + (length - 1)), "r" (src + (length - 1))
			: "mm0", "mm1", "memory");
	}
	/* leave MMX state and order the stores */
	__asm__ __volatile__ (
		"femms\n\t"
		"sfence\n\t"
		: : : "memory");
}

/*
 * Copy `length' floats from src to dest using SSE moves.
 *
 * When both pointers are 16-byte aligned, blocks of 32 floats and then
 * blocks of 4 floats are moved with movaps; whatever remains is moved one
 * float at a time with movss.  If either pointer is not 16-byte aligned the
 * whole copy uses the scalar path, because movaps faults on unaligned
 * addresses.
 */
void
x86_sse_copyf (float *dest, const float *src, int length)
{
	int done = 0;

	if ((((unsigned long) src | (unsigned long) dest) & 0xf) == 0)
	{
		/* Eight loads then eight stores per statement, so the
		 * registers stay live only inside one asm block. */
		for (; length - done >= 32; done += 32)
		{
			__asm__ __volatile__ (
				"movaps    (%1), %%xmm0\n\t"
				"movaps  16(%1), %%xmm1\n\t"
				"movaps  32(%1), %%xmm2\n\t"
				"movaps  48(%1), %%xmm3\n\t"
				"movaps  64(%1), %%xmm4\n\t"
				"movaps  80(%1), %%xmm5\n\t"
				"movaps  96(%1), %%xmm6\n\t"
				"movaps 112(%1), %%xmm7\n\t"
				"movaps %%xmm0,    (%0)\n\t"
				"movaps %%xmm1,  16(%0)\n\t"
				"movaps %%xmm2,  32(%0)\n\t"
				"movaps %%xmm3,  48(%0)\n\t"
				"movaps %%xmm4,  64(%0)\n\t"
				"movaps %%xmm5,  80(%0)\n\t"
				"movaps %%xmm6,  96(%0)\n\t"
				"movaps %%xmm7, 112(%0)\n\t"
				:
				: "r" (dest + done), "r" (src + done)
				: "xmm0", "xmm1", "xmm2", "xmm3",
				  "xmm4", "xmm5", "xmm6", "xmm7", "memory");
		}
		for (; length - done >= 4; done += 4)
		{
			__asm__ __volatile__ (
				"movaps (%1), %%xmm0\n\t"
				"movaps %%xmm0, (%0)\n\t"
				:
				: "r" (dest + done), "r" (src + done)
				: "xmm0", "memory");
		}
	}
	for (; done < length; done++)
	{
		__asm__ __volatile__ (
			"movss (%1), %%xmm0\n\t"
			"movss %%xmm0, (%0)\n\t"
			:
			: "r" (dest + done), "r" (src + done)
			: "xmm0", "memory");
	}
}

void
x86_sse_add2f (float *dest, const float *src, int length)
{
int i, n, si2;
pv4sf m128p_src = (pv4sf) src;
pv4sf m128p_dest = (pv4sf) dest;

if (__builtin_expect(((long) src & 0xf) || ((long) dest & 0xf), 0))
{
/*fprintf(stderr, "x86_sse_add2f(): non aligned pointers!\n");*/
si2 = 0;
goto sse_nonalign;
}
si2 = (length & ~0x3);
n = (length >> 2);
for (i = 0; i < n; i++)
{
asm volatile (
"movaps %1, %%xmm0\n\t" \
"addps %2, %%xmm0\n\t" \
"movaps %%xmm0, %0\n\t"
: "=m" (m128p_dest[i])
: "m0" (m128p_dest[i]),
"m" (m128p_src[i])
: "xmm0", "memory");
}
sse_nonalign:
for (i = si2; i < length; i++)
{
asm volatile (
"movss %1, %%xmm0\n\t" \
"addss %2, %%xmm0\n\t" \
"movss %%xmm0, %0\n\t"
: "=m" (dest[i])
: "m0" (dest[i]),
"m" (src[i])
: "xmm0", "memory");
}
}

#endif /* ARCH_X86 */

#endif /* USE_DYNSIMD */


Loading…
Cancel
Save