jack1 codebase

/* -*- mode: c; c-file-style: "bsd"; -*- */
/*
    Copyright (C) 2005-2008 Jussi Laako

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License as published by
    the Free Software Foundation; either version 2.1 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/

#include <config.h>
#include <jack/intsimd.h>

#ifdef USE_DYNSIMD
#ifdef ARCH_X86
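
/*
 * Runtime CPU feature probes.  have_3dnow () executes cpuid with the
 * extended leaf 0x80000001 and tests EDX bit 31 (3DNow!) and bit 30
 * (extended 3DNow!); it returns 0 (not available), 1 (3DNow!) or
 * 2 (extended 3DNow!).  EBX is saved and restored by hand around the
 * cpuid because it may serve as the PIC base register and so cannot be
 * named in the clobber list.
 */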
int
have_3dnow ()
{
        unsigned int res = 0;

#ifdef __x86_64__
        asm volatile ("pushq %%rbx\n\t" : : : "memory");
#else
        asm volatile ("pushl %%ebx\n\t" : : : "memory");
#endif
        asm volatile (
                "movl $0x80000000, %%eax\n\t" \
                "cpuid\n\t" \
                "cmpl $0x80000001, %%eax\n\t" \
                "jl tdnow_prexit\n\t" \
                \
                "movl $0x80000001, %%eax\n\t" \
                "cpuid\n\t" \
                \
                "xorl %%eax, %%eax\n\t" \
                \
                "movl $1, %%ecx\n\t" \
                "shll $31, %%ecx\n\t" \
                "testl %%ecx, %%edx\n\t" \
                "jz tdnow_testexit\n\t" \
                "movl $1, %%eax\n\t" \
                \
                "movl $1, %%ecx\n\t" \
                "shll $30, %%ecx\n\t" \
                "testl %%ecx, %%edx\n\t" \
                "jz tdnow_testexit\n\t" \
                "movl $2, %%eax\n\t" \
                "jmp tdnow_testexit\n\t" \
                \
                "tdnow_prexit:\n\t" \
                "xorl %%eax, %%eax\n\t" \
                "tdnow_testexit:\n\t"
                : "=a" (res)
                :
                : "ecx", "edx", "memory");
#ifdef __x86_64__
        asm volatile ("popq %%rbx\n\t" : : : "memory");
#else
        asm volatile ("popl %%ebx\n\t" : : : "memory");
#endif
        return res;
}
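
/*
 * have_sse () queries cpuid leaf 1 and returns 0 (no SSE), 1 (SSE,
 * EDX bit 25), 2 (SSE2, EDX bit 26) or 3 (SSE3, ECX bit 0).  The
 * conversion routines further down use SSE2 instructions
 * (cvtps2dq/movdqa), so they require a return value of at least 2.
 */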
int
have_sse ()
{
        unsigned int res = 0;

#ifdef __x86_64__
        asm volatile ("pushq %%rbx\n\t" : : : "memory");
#else
        asm volatile ("pushl %%ebx\n\t" : : : "memory");
#endif
        asm volatile (
                "movl $1, %%eax\n\t" \
                "cpuid\n\t" \
                \
                "xorl %%eax, %%eax\n\t" \
                \
                "movl $1, %%ebx\n\t" \
                "shll $25, %%ebx\n\t" \
                "testl %%ebx, %%edx\n\t" \
                "jz sse_testexit\n\t" \
                "movl $1, %%eax\n\t" \
                \
                "movl $1, %%ebx\n\t" \
                "shll $26, %%ebx\n\t" \
                "testl %%ebx, %%edx\n\t" \
                "jz sse_testexit\n\t" \
                "movl $2, %%eax\n\t" \
                \
                "movl $1, %%ebx\n\t" \
                "testl %%ebx, %%ecx\n\t" \
                "jz sse_testexit\n\t" \
                "movl $3, %%eax\n\t" \
                \
                "sse_testexit:\n\t"
                : "=a" (res)
                :
                : "ecx", "edx", "memory");
#ifdef __x86_64__
        asm volatile ("popq %%rbx\n\t" : : : "memory");
#else
        asm volatile ("popl %%ebx\n\t" : : : "memory");
#endif
        return res;
}
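
/*
 * 3DNow!/MMX block copy.  The main loop streams 16 floats per
 * iteration (eight 64-bit movq pairs) through mm0-mm7, a second loop
 * copies the remaining two-float pairs, and a final movd handles an
 * odd trailing float.  femms clears the MMX state for subsequent x87
 * code and sfence orders the stores.
 */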
void
x86_3dnow_copyf (float *dest, const float *src, int length)
{
        int i, n1, n2;
        pv2sf m64p_src = (pv2sf) src;
        pv2sf m64p_dest = (pv2sf) dest;

        n1 = (length >> 4);
        n2 = ((length & 0xf) >> 1);
        for (i = 0; i < n1; i++)
        {
                asm volatile ("movq %0, %%mm0\n\t"
                        : : "m" (*m64p_src++) : "mm0", "memory");
                asm volatile ("movq %0, %%mm1\n\t"
                        : : "m" (*m64p_src++) : "mm1", "memory");
                asm volatile ("movq %0, %%mm2\n\t"
                        : : "m" (*m64p_src++) : "mm2", "memory");
                asm volatile ("movq %0, %%mm3\n\t"
                        : : "m" (*m64p_src++) : "mm3", "memory");
                asm volatile ("movq %0, %%mm4\n\t"
                        : : "m" (*m64p_src++) : "mm4", "memory");
                asm volatile ("movq %0, %%mm5\n\t"
                        : : "m" (*m64p_src++) : "mm5", "memory");
                asm volatile ("movq %0, %%mm6\n\t"
                        : : "m" (*m64p_src++) : "mm6", "memory");
                asm volatile ("movq %0, %%mm7\n\t"
                        : : "m" (*m64p_src++) : "mm7", "memory");
                asm volatile ("movq %%mm0, %0\n\t"
                        : "=m" (*m64p_dest++) : : "mm0", "memory");
                asm volatile ("movq %%mm1, %0\n\t"
                        : "=m" (*m64p_dest++) : : "mm1", "memory");
                asm volatile ("movq %%mm2, %0\n\t"
                        : "=m" (*m64p_dest++) : : "mm2", "memory");
                asm volatile ("movq %%mm3, %0\n\t"
                        : "=m" (*m64p_dest++) : : "mm3", "memory");
                asm volatile ("movq %%mm4, %0\n\t"
                        : "=m" (*m64p_dest++) : : "mm4", "memory");
                asm volatile ("movq %%mm5, %0\n\t"
                        : "=m" (*m64p_dest++) : : "mm5", "memory");
                asm volatile ("movq %%mm6, %0\n\t"
                        : "=m" (*m64p_dest++) : : "mm6", "memory");
                asm volatile ("movq %%mm7, %0\n\t"
                        : "=m" (*m64p_dest++) : : "mm7", "memory");
        }
        for (i = 0; i < n2; i++)
        {
                asm volatile (
                        "movq %1, %%mm0\n\t" \
                        "movq %%mm0, %0\n\t"
                        : "=m" (*m64p_dest++)
                        : "m" (*m64p_src++)
                        : "mm0", "memory");
        }
        if (length & 0x1)
        {
                asm volatile (
                        "movd %1, %%mm0\n\t" \
                        "movd %%mm0, %0\n\t"
                        : "=m" (dest[length - 1])
                        : "m" (src[length - 1])
                        : "mm0", "memory");
        }
        asm volatile (
                "femms\n\t" \
                "sfence\n\t");
}
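
/*
 * dest[i] += src[i] using 3DNow! pfadd on two floats at a time.  The
 * "m0" constraint ties the input to output operand 0, so each pfadd
 * reads and rewrites the same destination pair in memory.
 */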
void
x86_3dnow_add2f (float *dest, const float *src, int length)
{
        int i, n;
        pv2sf m64p_dest = (pv2sf) dest;
        pv2sf m64p_src = (pv2sf) src;

        n = (length >> 1);
        for (i = 0; i < n; i++)
        {
                asm volatile (
                        "movq %1, %%mm0\n\t" \
                        "pfadd %2, %%mm0\n\t" \
                        "movq %%mm0, %0\n\t"
                        : "=m" (m64p_dest[i])
                        : "m0" (m64p_dest[i]),
                          "m" (m64p_src[i])
                        : "mm0", "memory");
        }
        /* A single trailing element remains iff length is odd; testing
           "n & 0x1" here would be wrong (e.g. length == 5 gives n == 2,
           leaving one element but an even n). */
        if (length & 0x1)
        {
                asm volatile (
                        "movd %1, %%mm0\n\t" \
                        "movd %2, %%mm1\n\t" \
                        "pfadd %%mm1, %%mm0\n\t" \
                        "movd %%mm0, %0\n\t"
                        : "=m" (dest[length - 1])
                        : "m0" (dest[length - 1]),
                          "m" (src[length - 1])
                        : "mm0", "mm1", "memory");
        }
        asm volatile (
                "femms\n\t" \
                "sfence\n\t");
}
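
/*
 * SSE block copy.  The main loop moves 32 floats per iteration (eight
 * 128-bit movaps loads/stores) through xmm0-xmm7, a second loop copies
 * the remaining aligned groups of four, and a scalar movss loop
 * finishes from si3 (= length rounded down to a multiple of 4).
 * movaps faults on pointers that are not 16-byte aligned, so both
 * buffers must be aligned; unlike x86_sse_add2f () below, there is no
 * runtime alignment check here.
 */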
void
x86_sse_copyf (float *dest, const float *src, int length)
{
        int i, n1, n2, si3;
        pv4sf m128p_src = (pv4sf) src;
        pv4sf m128p_dest = (pv4sf) dest;

        n1 = (length >> 5);
        n2 = ((length & 0x1f) >> 2);
        si3 = (length & ~0x3);
        for (i = 0; i < n1; i++)
        {
                asm volatile ("movaps %0, %%xmm0\n\t"
                        : : "m" (*m128p_src++) : "xmm0", "memory");
                asm volatile ("movaps %0, %%xmm1\n\t"
                        : : "m" (*m128p_src++) : "xmm1", "memory");
                asm volatile ("movaps %0, %%xmm2\n\t"
                        : : "m" (*m128p_src++) : "xmm2", "memory");
                asm volatile ("movaps %0, %%xmm3\n\t"
                        : : "m" (*m128p_src++) : "xmm3", "memory");
                asm volatile ("movaps %0, %%xmm4\n\t"
                        : : "m" (*m128p_src++) : "xmm4", "memory");
                asm volatile ("movaps %0, %%xmm5\n\t"
                        : : "m" (*m128p_src++) : "xmm5", "memory");
                asm volatile ("movaps %0, %%xmm6\n\t"
                        : : "m" (*m128p_src++) : "xmm6", "memory");
                asm volatile ("movaps %0, %%xmm7\n\t"
                        : : "m" (*m128p_src++) : "xmm7", "memory");
                asm volatile ("movaps %%xmm0, %0\n\t"
                        : "=m" (*m128p_dest++) : : "xmm0", "memory");
                asm volatile ("movaps %%xmm1, %0\n\t"
                        : "=m" (*m128p_dest++) : : "xmm1", "memory");
                asm volatile ("movaps %%xmm2, %0\n\t"
                        : "=m" (*m128p_dest++) : : "xmm2", "memory");
                asm volatile ("movaps %%xmm3, %0\n\t"
                        : "=m" (*m128p_dest++) : : "xmm3", "memory");
                asm volatile ("movaps %%xmm4, %0\n\t"
                        : "=m" (*m128p_dest++) : : "xmm4", "memory");
                asm volatile ("movaps %%xmm5, %0\n\t"
                        : "=m" (*m128p_dest++) : : "xmm5", "memory");
                asm volatile ("movaps %%xmm6, %0\n\t"
                        : "=m" (*m128p_dest++) : : "xmm6", "memory");
                asm volatile ("movaps %%xmm7, %0\n\t"
                        : "=m" (*m128p_dest++) : : "xmm7", "memory");
        }
        for (i = 0; i < n2; i++)
        {
                asm volatile (
                        "movaps %1, %%xmm0\n\t" \
                        "movaps %%xmm0, %0\n\t"
                        : "=m" (*m128p_dest++)
                        : "m" (*m128p_src++)
                        : "xmm0", "memory");
        }
        for (i = si3; i < length; i++)
        {
                asm volatile (
                        "movss %1, %%xmm0\n\t" \
                        "movss %%xmm0, %0\n\t"
                        : "=m" (dest[i])
                        : "m" (src[i])
                        : "xmm0", "memory");
        }
}
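
/*
 * dest[i] += src[i] with SSE, four floats per addps.  If either
 * pointer is not 16-byte aligned, the vector loop is skipped entirely
 * (si2 = 0) and the whole buffer is processed by the scalar
 * movss/addss loop instead.
 */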
void
x86_sse_add2f (float *dest, const float *src, int length)
{
        int i, n, si2;
        pv4sf m128p_src = (pv4sf) src;
        pv4sf m128p_dest = (pv4sf) dest;

        if (__builtin_expect(((long) src & 0xf) || ((long) dest & 0xf), 0))
        {
                /*jack_error("x86_sse_add2f(): non aligned pointers!");*/
                si2 = 0;
                goto sse_nonalign;
        }
        si2 = (length & ~0x3);
        n = (length >> 2);
        for (i = 0; i < n; i++)
        {
                asm volatile (
                        "movaps %1, %%xmm0\n\t" \
                        "addps %2, %%xmm0\n\t" \
                        "movaps %%xmm0, %0\n\t"
                        : "=m" (m128p_dest[i])
                        : "m0" (m128p_dest[i]),
                          "m" (m128p_src[i])
                        : "xmm0", "memory");
        }
sse_nonalign:
        for (i = si2; i < length; i++)
        {
                asm volatile (
                        "movss %1, %%xmm0\n\t" \
                        "addss %2, %%xmm0\n\t" \
                        "movss %%xmm0, %0\n\t"
                        : "=m" (dest[i])
                        : "m0" (dest[i]),
                          "m" (src[i])
                        : "xmm0", "memory");
        }
}
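
/*
 * Float -> int conversion: clamp each sample to [-1.0, 1.0], multiply
 * by scale and convert with cvtps2dq (an SSE2 instruction, so
 * have_sse () must have returned at least 2).  The constant named
 * "max" feeds maxps (the lower clamp at -1.0) and "min" feeds minps
 * (the upper clamp at 1.0); the names refer to the instruction used,
 * not the value.  length is assumed to be a multiple of 4.
 */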
void x86_sse_f2i (int *dest, const float *src, int length, float scale)
{
        int i;
        static const float max[4] __attribute__((aligned(16))) =
                { -1.0F, -1.0F, -1.0F, -1.0F };
        static const float min[4] __attribute__((aligned(16))) =
                { 1.0F, 1.0F, 1.0F, 1.0F };
        float s[4] __attribute__((aligned(16)));

        s[0] = s[1] = s[2] = s[3] = scale;
        asm volatile (
                "movaps %0, %%xmm4\n\t" \
                "movaps %1, %%xmm5\n\t" \
                "movaps %2, %%xmm6\n\t"
                :
                : "m" (*max),
                  "m" (*min),
                  "m" (*s)
                : "xmm4", "xmm5", "xmm6");
        if (__builtin_expect((((long) dest & 0xf) || ((long) src & 0xf)), 0))
                goto sse_nonalign;
        for (i = 0; i < length; i += 4)
        {
                asm volatile (
                        "movaps %1, %%xmm1\n\t" \
                        "maxps %%xmm4, %%xmm1\n\t" \
                        "minps %%xmm5, %%xmm1\n\t" \
                        "mulps %%xmm6, %%xmm1\n\t" \
                        "cvtps2dq %%xmm1, %%xmm0\n\t" \
                        "movdqa %%xmm0, %0\n\t"
                        : "=m" (dest[i])
                        : "m" (src[i])
                        : "xmm0", "xmm1", "xmm4", "xmm5", "xmm6", "memory");
        }
        return;

sse_nonalign:
        for (i = 0; i < length; i += 4)
        {
                asm volatile (
                        "movups %1, %%xmm1\n\t" \
                        "maxps %%xmm4, %%xmm1\n\t" \
                        "minps %%xmm5, %%xmm1\n\t" \
                        "mulps %%xmm6, %%xmm1\n\t" \
                        "cvtps2dq %%xmm1, %%xmm0\n\t" \
                        "movdqu %%xmm0, %0\n\t"
                        : "=m" (dest[i])
                        : "m" (src[i])
                        : "xmm0", "xmm1", "xmm4", "xmm5", "xmm6", "memory");
        }
}
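
/*
 * Int -> float conversion: cvtdq2ps (SSE2) followed by a multiply with
 * the broadcast scale factor.  As in x86_sse_f2i (), unaligned buffers
 * fall back to movdqu/movups, and length is assumed to be a multiple
 * of 4.
 */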
void x86_sse_i2f (float *dest, const int *src, int length, float scale)
{
        int i;
        float s[4] __attribute__((aligned(16)));

        s[0] = s[1] = s[2] = s[3] = scale;
        asm volatile (
                "movaps %0, %%xmm4\n\t"
                :
                : "m" (*s)
                : "xmm4" );
        if (__builtin_expect((((long) dest & 0xf) || ((long) src & 0xf)), 0))
                goto sse_nonalign;
        for (i = 0; i < length; i += 4)
        {
                asm volatile (
                        "cvtdq2ps %1, %%xmm0\n\t" \
                        "mulps %%xmm4, %%xmm0\n\t" \
                        "movaps %%xmm0, %0\n\t"
                        : "=m" (dest[i])
                        : "m" (src[i])
                        : "xmm0", "xmm4", "memory");
        }
        return;

sse_nonalign:
        for (i = 0; i < length; i += 4)
        {
                asm volatile (
                        "movdqu %1, %%xmm1\n\t" \
                        "cvtdq2ps %%xmm1, %%xmm0\n\t" \
                        "mulps %%xmm4, %%xmm0\n\t" \
                        "movups %%xmm0, %0\n\t"
                        : "=m" (dest[i])
                        : "m" (src[i])
                        : "xmm0", "xmm1", "xmm4", "memory");
        }
}
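
/*
 * A minimal usage sketch, not part of the original file: one plausible
 * way a caller could combine the probes and routines above.  The
 * pointer names (copy_fn, add2_fn) and the selection policy are
 * hypothetical and do not reproduce jack1's actual dispatch code.
 */
static void (*copy_fn) (float *, const float *, int) = 0;
static void (*add2_fn) (float *, const float *, int) = 0;

static void
pick_simd_routines (void)
{
        if (have_sse ())
        {
                /* any nonzero SSE level suffices for copyf/add2f */
                copy_fn = x86_sse_copyf;
                add2_fn = x86_sse_add2f;
        }
        else if (have_3dnow ())
        {
                copy_fn = x86_3dnow_copyf;
                add2_fn = x86_3dnow_add2f;
        }
        /* otherwise both pointers stay 0 and the caller would fall
           back to plain C loops */
}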

#endif /* ARCH_X86 */
#endif /* USE_DYNSIMD */