/* -*- mode: c; c-file-style: "bsd"; -*- */
/*
    Copyright (C) 2005-2008 Jussi Laako

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License as published by
    the Free Software Foundation; either version 2.1 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

*/


#include <config.h>
#include "intsimd.h"

#ifdef USE_DYNSIMD

#ifdef ARCH_X86

/*
 * Detect AMD 3DNow! support via CPUID extended function 0x80000001.
 * Returns 0 for no 3DNow!, 1 for 3DNow! (EDX bit 31) and 2 for enhanced
 * 3DNow! (EDX bit 30).  EBX is saved and restored by hand around CPUID
 * because it may be reserved as the PIC register.
 */
int
have_3dnow ()
{
    unsigned int res = 0;

#ifdef __x86_64__
    asm volatile ("pushq %%rbx\n\t" : : : "memory");
#else
    asm volatile ("pushl %%ebx\n\t" : : : "memory");
#endif
    asm volatile (
        "movl $0x80000000, %%eax\n\t" \
        "cpuid\n\t" \
        "cmpl $0x80000001, %%eax\n\t" \
        "jl tdnow_prexit\n\t" \
        \
        "movl $0x80000001, %%eax\n\t" \
        "cpuid\n\t" \
        \
        "xorl %%eax, %%eax\n\t" \
        \
        "movl $1, %%ecx\n\t" \
        "shll $31, %%ecx\n\t" \
        "testl %%ecx, %%edx\n\t" \
        "jz tdnow_testexit\n\t" \
        "movl $1, %%eax\n\t" \
        \
        "movl $1, %%ecx\n\t" \
        "shll $30, %%ecx\n\t" \
        "testl %%ecx, %%edx\n\t" \
        "jz tdnow_testexit\n\t" \
        "movl $2, %%eax\n\t" \
        "jmp tdnow_testexit\n\t" \
        \
        "tdnow_prexit:\n\t" \
        "xorl %%eax, %%eax\n\t" \
        "tdnow_testexit:\n\t"
        : "=a" (res)
        :
        : "ecx", "edx", "memory");
#ifdef __x86_64__
    asm volatile ("popq %%rbx\n\t" : : : "memory");
#else
    asm volatile ("popl %%ebx\n\t" : : : "memory");
#endif
    return res;
}

/*
 * Detect SSE support via CPUID function 1.  Returns 0 for no SSE, 1 for
 * SSE (EDX bit 25), 2 for SSE2 (EDX bit 26) and 3 for SSE3 (ECX bit 0);
 * each level is only reported when the lower levels are present too.
 */
int
have_sse ()
{
    unsigned int res = 0;

#ifdef __x86_64__
    asm volatile ("pushq %%rbx\n\t" : : : "memory");
#else
    asm volatile ("pushl %%ebx\n\t" : : : "memory");
#endif
    asm volatile (
        "movl $1, %%eax\n\t" \
        "cpuid\n\t" \
        \
        "xorl %%eax, %%eax\n\t" \
        \
        "movl $1, %%ebx\n\t" \
        "shll $25, %%ebx\n\t" \
        "testl %%ebx, %%edx\n\t" \
        "jz sse_testexit\n\t" \
        "movl $1, %%eax\n\t" \
        \
        "movl $1, %%ebx\n\t" \
        "shll $26, %%ebx\n\t" \
        "testl %%ebx, %%edx\n\t" \
        "jz sse_testexit\n\t" \
        "movl $2, %%eax\n\t" \
        \
        "movl $1, %%ebx\n\t" \
        "testl %%ebx, %%ecx\n\t" \
        "jz sse_testexit\n\t" \
        "movl $3, %%eax\n\t" \
        \
        "sse_testexit:\n\t"
        : "=a" (res)
        :
        : "ecx", "edx", "memory");
#ifdef __x86_64__
    asm volatile ("popq %%rbx\n\t" : : : "memory");
#else
    asm volatile ("popl %%ebx\n\t" : : : "memory");
#endif
    return res;
}
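
/*
 * A minimal usage sketch, compiled out and not part of the original
 * file: a caller could pick a copy routine once at startup based on the
 * two detection functions above.  The name select_copyf and the plain C
 * fallback are hypothetical.
 */
#if 0
typedef void (*copyf_t) (float *, const float *, int);

static copyf_t
select_copyf (void)
{
    if (have_sse ())        /* 1 = SSE, 2 = SSE2, 3 = SSE3 */
        return x86_sse_copyf;
    if (have_3dnow ())      /* 1 = 3DNow!, 2 = enhanced 3DNow! */
        return x86_3dnow_copyf;
    return NULL;            /* caller falls back to a plain C loop */
}
#endif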

/*
 * Copy an array of floats with 3DNow!/MMX quadword moves: eight MMX
 * registers (16 floats) per unrolled iteration, then one float pair at
 * a time, then a final odd float.  femms clears the MMX state for FPU
 * use and sfence orders the stores.
 */
void
x86_3dnow_copyf (float *dest, const float *src, int length)
{
    int i, n1, n2;
    pv2sf m64p_src = (pv2sf) src;
    pv2sf m64p_dest = (pv2sf) dest;

    n1 = (length >> 4);             /* 16-float blocks */
    n2 = ((length & 0xf) >> 1);     /* remaining float pairs */
    for (i = 0; i < n1; i++)
    {
        asm volatile ("movq %0, %%mm0\n\t"
            : : "m" (*m64p_src++) : "mm0", "memory");
        asm volatile ("movq %0, %%mm1\n\t"
            : : "m" (*m64p_src++) : "mm1", "memory");
        asm volatile ("movq %0, %%mm2\n\t"
            : : "m" (*m64p_src++) : "mm2", "memory");
        asm volatile ("movq %0, %%mm3\n\t"
            : : "m" (*m64p_src++) : "mm3", "memory");
        asm volatile ("movq %0, %%mm4\n\t"
            : : "m" (*m64p_src++) : "mm4", "memory");
        asm volatile ("movq %0, %%mm5\n\t"
            : : "m" (*m64p_src++) : "mm5", "memory");
        asm volatile ("movq %0, %%mm6\n\t"
            : : "m" (*m64p_src++) : "mm6", "memory");
        asm volatile ("movq %0, %%mm7\n\t"
            : : "m" (*m64p_src++) : "mm7", "memory");

        asm volatile ("movq %%mm0, %0\n\t"
            : "=m" (*m64p_dest++) : : "mm0", "memory");
        asm volatile ("movq %%mm1, %0\n\t"
            : "=m" (*m64p_dest++) : : "mm1", "memory");
        asm volatile ("movq %%mm2, %0\n\t"
            : "=m" (*m64p_dest++) : : "mm2", "memory");
        asm volatile ("movq %%mm3, %0\n\t"
            : "=m" (*m64p_dest++) : : "mm3", "memory");
        asm volatile ("movq %%mm4, %0\n\t"
            : "=m" (*m64p_dest++) : : "mm4", "memory");
        asm volatile ("movq %%mm5, %0\n\t"
            : "=m" (*m64p_dest++) : : "mm5", "memory");
        asm volatile ("movq %%mm6, %0\n\t"
            : "=m" (*m64p_dest++) : : "mm6", "memory");
        asm volatile ("movq %%mm7, %0\n\t"
            : "=m" (*m64p_dest++) : : "mm7", "memory");
    }
    for (i = 0; i < n2; i++)
    {
        asm volatile (
            "movq %1, %%mm0\n\t" \
            "movq %%mm0, %0\n\t"
            : "=m" (*m64p_dest++)
            : "m" (*m64p_src++)
            : "mm0", "memory");
    }
    if (length & 0x1)
    {
        asm volatile (
            "movd %1, %%mm0\n\t" \
            "movd %%mm0, %0\n\t"
            : "=m" (dest[length - 1])
            : "m" (src[length - 1])
            : "mm0", "memory");
    }
    asm volatile (
        "femms\n\t" \
        "sfence\n\t");
}

/*
 * dest[i] += src[i] using 3DNow! pfadd on one float pair at a time,
 * with a scalar tail when length is odd.
 */
void
x86_3dnow_add2f (float *dest, const float *src, int length)
{
    int i, n;
    pv2sf m64p_dest = (pv2sf) dest;
    pv2sf m64p_src = (pv2sf) src;

    n = (length >> 1);
    for (i = 0; i < n; i++)
    {
        asm volatile (
            "movq %1, %%mm0\n\t" \
            "pfadd %2, %%mm0\n\t" \
            "movq %%mm0, %0\n\t"
            : "=m" (m64p_dest[i])
            : "m0" (m64p_dest[i]),
              "m" (m64p_src[i])
            : "mm0", "memory");
    }
    if (length & 0x1)   /* an odd length, not an odd pair count, leaves a tail */
    {
        asm volatile (
            "movd %1, %%mm0\n\t" \
            "movd %2, %%mm1\n\t" \
            "pfadd %%mm1, %%mm0\n\t" \
            "movd %%mm0, %0\n\t"
            : "=m" (dest[length - 1])
            : "m0" (dest[length - 1]),
              "m" (src[length - 1])
            : "mm0", "mm1", "memory");
    }
    asm volatile (
        "femms\n\t" \
        "sfence\n\t");
}

/*
 * Copy an array of floats with aligned SSE moves: eight XMM registers
 * (32 floats) per unrolled iteration, then one four-float vector at a
 * time, then a scalar tail.  movaps requires both pointers to be
 * 16-byte aligned.
 */
void
x86_sse_copyf (float *dest, const float *src, int length)
{
    int i, n1, n2, si3;
    pv4sf m128p_src = (pv4sf) src;
    pv4sf m128p_dest = (pv4sf) dest;

    n1 = (length >> 5);             /* 32-float blocks */
    n2 = ((length & 0x1f) >> 2);    /* remaining 4-float vectors */
    si3 = (length & ~0x3);          /* start index of the scalar tail */
    for (i = 0; i < n1; i++)
    {
        asm volatile ("movaps %0, %%xmm0\n\t"
            : : "m" (*m128p_src++) : "xmm0", "memory");
        asm volatile ("movaps %0, %%xmm1\n\t"
            : : "m" (*m128p_src++) : "xmm1", "memory");
        asm volatile ("movaps %0, %%xmm2\n\t"
            : : "m" (*m128p_src++) : "xmm2", "memory");
        asm volatile ("movaps %0, %%xmm3\n\t"
            : : "m" (*m128p_src++) : "xmm3", "memory");
        asm volatile ("movaps %0, %%xmm4\n\t"
            : : "m" (*m128p_src++) : "xmm4", "memory");
        asm volatile ("movaps %0, %%xmm5\n\t"
            : : "m" (*m128p_src++) : "xmm5", "memory");
        asm volatile ("movaps %0, %%xmm6\n\t"
            : : "m" (*m128p_src++) : "xmm6", "memory");
        asm volatile ("movaps %0, %%xmm7\n\t"
            : : "m" (*m128p_src++) : "xmm7", "memory");

        asm volatile ("movaps %%xmm0, %0\n\t"
            : "=m" (*m128p_dest++) : : "xmm0", "memory");
        asm volatile ("movaps %%xmm1, %0\n\t"
            : "=m" (*m128p_dest++) : : "xmm1", "memory");
        asm volatile ("movaps %%xmm2, %0\n\t"
            : "=m" (*m128p_dest++) : : "xmm2", "memory");
        asm volatile ("movaps %%xmm3, %0\n\t"
            : "=m" (*m128p_dest++) : : "xmm3", "memory");
        asm volatile ("movaps %%xmm4, %0\n\t"
            : "=m" (*m128p_dest++) : : "xmm4", "memory");
        asm volatile ("movaps %%xmm5, %0\n\t"
            : "=m" (*m128p_dest++) : : "xmm5", "memory");
        asm volatile ("movaps %%xmm6, %0\n\t"
            : "=m" (*m128p_dest++) : : "xmm6", "memory");
        asm volatile ("movaps %%xmm7, %0\n\t"
            : "=m" (*m128p_dest++) : : "xmm7", "memory");
    }
    for (i = 0; i < n2; i++)
    {
        asm volatile (
            "movaps %1, %%xmm0\n\t" \
            "movaps %%xmm0, %0\n\t"
            : "=m" (*m128p_dest++)
            : "m" (*m128p_src++)
            : "xmm0", "memory");
    }
    for (i = si3; i < length; i++)
    {
        asm volatile (
            "movss %1, %%xmm0\n\t" \
            "movss %%xmm0, %0\n\t"
            : "=m" (dest[i])
            : "m" (src[i])
            : "xmm0", "memory");
    }
}
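
/*
 * For reference, both copy routines above are semantically a plain
 * memcpy() over a float array; a compiled-out scalar equivalent, kept
 * only as documentation:
 */
#if 0
#include <string.h>

static void
scalar_copyf (float *dest, const float *src, int length)
{
    memcpy (dest, src, (size_t) length * sizeof (float));
}
#endif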

/*
 * dest[i] += src[i] using SSE addps on four floats at a time; a scalar
 * addss loop handles the tail, and the whole array when either pointer
 * is not 16-byte aligned.
 */
void
x86_sse_add2f (float *dest, const float *src, int length)
{
    int i, n, si2;
    pv4sf m128p_src = (pv4sf) src;
    pv4sf m128p_dest = (pv4sf) dest;

    if (__builtin_expect(((long) src & 0xf) || ((long) dest & 0xf), 0))
    {
        /*jack_error("x86_sse_add2f(): non aligned pointers!");*/
        si2 = 0;
        goto sse_nonalign;
    }
    si2 = (length & ~0x3);  /* start index of the scalar tail */
    n = (length >> 2);
    for (i = 0; i < n; i++)
    {
        asm volatile (
            "movaps %1, %%xmm0\n\t" \
            "addps %2, %%xmm0\n\t" \
            "movaps %%xmm0, %0\n\t"
            : "=m" (m128p_dest[i])
            : "m0" (m128p_dest[i]),
              "m" (m128p_src[i])
            : "xmm0", "memory");
    }
sse_nonalign:
    for (i = si2; i < length; i++)
    {
        asm volatile (
            "movss %1, %%xmm0\n\t" \
            "addss %2, %%xmm0\n\t" \
            "movss %%xmm0, %0\n\t"
            : "=m" (dest[i])
            : "m0" (dest[i]),
              "m" (src[i])
            : "xmm0", "memory");
    }
}
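
/*
 * Compiled-out scalar equivalent of x86_3dnow_add2f() and
 * x86_sse_add2f() above, kept only as documentation of the semantics:
 */
#if 0
static void
scalar_add2f (float *dest, const float *src, int length)
{
    int i;

    for (i = 0; i < length; i++)
        dest[i] += src[i];
}
#endif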

/*
 * Convert floats to ints: clamp each sample to [-1.0, 1.0], multiply by
 * scale and round-convert with cvtps2dq (an SSE2 instruction, so this
 * needs have_sse() >= 2).  "max" holds the lower clamp bound applied
 * with maxps and "min" the upper bound applied with minps.  length is
 * assumed to be a multiple of four.
 */
void x86_sse_f2i (int *dest, const float *src, int length, float scale)
{
    int i;
    static const float max[4] __attribute__((aligned(16))) =
        { -1.0F, -1.0F, -1.0F, -1.0F };
    static const float min[4] __attribute__((aligned(16))) =
        { 1.0F, 1.0F, 1.0F, 1.0F };
    float s[4] __attribute__((aligned(16)));

    s[0] = s[1] = s[2] = s[3] = scale;
    asm volatile (
        "movaps %0, %%xmm4\n\t" \
        "movaps %1, %%xmm5\n\t" \
        "movaps %2, %%xmm6\n\t"
        :
        : "m" (*max),
          "m" (*min),
          "m" (*s)
        : "xmm4", "xmm5", "xmm6");

    if (__builtin_expect((((long) dest & 0xf) || ((long) src & 0xf)), 0))
        goto sse_nonalign;
    for (i = 0; i < length; i += 4)
    {
        asm volatile (
            "movaps %1, %%xmm1\n\t" \
            "maxps %%xmm4, %%xmm1\n\t" \
            "minps %%xmm5, %%xmm1\n\t" \
            "mulps %%xmm6, %%xmm1\n\t" \
            "cvtps2dq %%xmm1, %%xmm0\n\t" \
            "movdqa %%xmm0, %0\n\t"
            : "=m" (dest[i])
            : "m" (src[i])
            : "xmm0", "xmm1", "xmm4", "xmm5", "xmm6", "memory");
    }
    return;

sse_nonalign:
    for (i = 0; i < length; i += 4)
    {
        asm volatile (
            "movups %1, %%xmm1\n\t" \
            "maxps %%xmm4, %%xmm1\n\t" \
            "minps %%xmm5, %%xmm1\n\t" \
            "mulps %%xmm6, %%xmm1\n\t" \
            "cvtps2dq %%xmm1, %%xmm0\n\t" \
            "movdqu %%xmm0, %0\n\t"
            : "=m" (dest[i])
            : "m" (src[i])
            : "xmm0", "xmm1", "xmm4", "xmm5", "xmm6", "memory");
    }
}
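
/*
 * Compiled-out scalar equivalent of x86_sse_f2i() above.  cvtps2dq
 * rounds according to MXCSR (round-to-nearest-even by default); lrintf()
 * mirrors that under the default rounding mode, an assumption of this
 * sketch rather than something the original code states.
 */
#if 0
#include <math.h>

static void
scalar_f2i (int *dest, const float *src, int length, float scale)
{
    int i;
    float x;

    for (i = 0; i < length; i++)
    {
        x = src[i];
        if (x < -1.0F)
            x = -1.0F;
        if (x > 1.0F)
            x = 1.0F;
        dest[i] = (int) lrintf (x * scale);
    }
}
#endif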

/*
 * Convert ints to floats: cvtdq2ps each vector of four samples and
 * multiply by scale (cvtdq2ps and movdqu are SSE2 instructions).
 * length is assumed to be a multiple of four.
 */
void x86_sse_i2f (float *dest, const int *src, int length, float scale)
{
    int i;
    float s[4] __attribute__((aligned(16)));

    s[0] = s[1] = s[2] = s[3] = scale;
    asm volatile (
        "movaps %0, %%xmm4\n\t"
        :
        : "m" (*s)
        : "xmm4" );

    if (__builtin_expect((((long) dest & 0xf) || ((long) src & 0xf)), 0))
        goto sse_nonalign;
    for (i = 0; i < length; i += 4)
    {
        asm volatile (
            "cvtdq2ps %1, %%xmm0\n\t" \
            "mulps %%xmm4, %%xmm0\n\t" \
            "movaps %%xmm0, %0\n\t"
            : "=m" (dest[i])
            : "m" (src[i])
            : "xmm0", "xmm4", "memory");
    }
    return;

sse_nonalign:
    for (i = 0; i < length; i += 4)
    {
        asm volatile (
            "movdqu %1, %%xmm1\n\t" \
            "cvtdq2ps %%xmm1, %%xmm0\n\t" \
            "mulps %%xmm4, %%xmm0\n\t" \
            "movups %%xmm0, %0\n\t"
            : "=m" (dest[i])
            : "m" (src[i])
            : "xmm0", "xmm1", "xmm4", "memory");
    }
}
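
/*
 * Compiled-out scalar equivalent of x86_sse_i2f() above, kept only as
 * documentation of the semantics:
 */
#if 0
static void
scalar_i2f (float *dest, const int *src, int length, float scale)
{
    int i;

    for (i = 0; i < length; i++)
        dest[i] = (float) src[i] * scale;
}
#endif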

#endif /* ARCH_X86 */

#endif /* USE_DYNSIMD */