You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

538 lines
16KB

  1. /*
  2. * resample_mmx.c
  3. * Copyright (C) 2003 Michael Niedermayer <michaelni@gmx.at>
  4. *
  5. * This file is part of a52dec, a free ATSC A-52 stream decoder.
  6. * See http://liba52.sourceforge.net/ for updates.
  7. *
  8. * a52dec is free software; you can redistribute it and/or modify
  9. * it under the terms of the GNU General Public License as published by
  10. * the Free Software Foundation; either version 2 of the License, or
  11. * (at your option) any later version.
  12. *
  13. * a52dec is distributed in the hope that it will be useful,
  14. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16. * GNU General Public License for more details.
  17. *
  18. * You should have received a copy of the GNU General Public License
  19. * along with this program; if not, write to the Free Software
  20. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21. */
  22. /* optimization TODO / NOTES
  23. movntq is slightly faster (0.5% with the current test.c benchmark)
  24. (but that's just test.c, so that needs to be tested in reality)
  25. and it would mean (C / MMX2 / MMX / 3DNOW) versions
  26. */
  /*
   * 64-bit constants that the inline assembly in this file loads by symbol
   * name through MANGLE(). They are never referenced from C code.
   * NOTE(review): attribute_used is presumably there to stop the compiler
   * from discarding them for that reason -- confirm against its definition.
   *
   * magicF2W: 0x43c00000 in both 32-bit lanes, which is the IEEE-754 bit
   *           pattern of 384.0f. Every resampler subtracts it (psubd) from the
   *           raw bits of the input floats before packing them to 16 bits.
   *           NOTE(review): presumably the input samples are biased by 384.0
   *           so that the integer difference is the 16-bit sample value --
   *           confirm against the caller.
   * wm1010:   mask that keeps 16-bit words 1 and 3 of a quadword.
   * wm0101:   mask that keeps 16-bit words 0 and 2 of a quadword.
   * wm1100:   mask that keeps the upper 32 bits of a quadword.
   */
  27. static uint64_t __attribute__((aligned(8))) attribute_used magicF2W= 0x43c0000043c00000LL;
  28. static uint64_t __attribute__((aligned(8))) attribute_used wm1010= 0xFFFF0000FFFF0000LL;
  29. static uint64_t __attribute__((aligned(8))) attribute_used wm0101= 0x0000FFFF0000FFFFLL;
  30. static uint64_t __attribute__((aligned(8))) attribute_used wm1100= 0xFFFFFFFF00000000LL;
  /*
   * Expand one plane of 256 samples into 256 interleaved 5-channel frames.
   *
   * _f:  input; the 32-bit values _f[0..255] are read as raw bit patterns.
   * s16: output; 5*256 int16_t values are written.
   *
   * Every output frame is {0, 0, 0, 0, sample}: the four leading slots are
   * zeroed and the converted sample goes into the fifth.
   *
   * The loop handles four samples per iteration (A, B, C, D in the comments
   * below). %esi runs from -512 to 0 in steps of 8 and is scaled by 2 for the
   * input address; %edi = 5 * %esi is the byte offset into the output.
   * magicF2W is subtracted from the raw bits and the result is packed to
   * int16_t with signed saturation (packssdw).
   *
   * The asm clobbers %esi, %edi and memory, uses mm0-mm7 without listing them
   * as clobbers, and ends with emms.
   *
   * Returns 5*256, the number of int16_t values written.
   */
  31. static int a52_resample_MONO_to_5_MMX(float * _f, int16_t * s16){
  32. int32_t * f = (int32_t *) _f;
  33. asm volatile(
  34. "movl $-512, %%esi \n\t"
  35. "movq "MANGLE(magicF2W)", %%mm7 \n\t"
  36. "movq "MANGLE(wm1100)", %%mm3 \n\t"
  37. "movq "MANGLE(wm0101)", %%mm4 \n\t"
  38. "movq "MANGLE(wm1010)", %%mm5 \n\t"
  39. "pxor %%mm6, %%mm6 \n\t"
  40. "1: \n\t"
  41. "movq (%1, %%esi, 2), %%mm0 \n\t"
  42. "movq 8(%1, %%esi, 2), %%mm1 \n\t"
  43. "leal (%%esi, %%esi, 4), %%edi \n\t" // edi = 5 * esi
  44. "psubd %%mm7, %%mm0 \n\t"
  45. "psubd %%mm7, %%mm1 \n\t"
  46. "packssdw %%mm1, %%mm0 \n\t" // mm0 = D C B A as int16
  47. "movq %%mm0, %%mm1 \n\t"
  48. "pand %%mm4, %%mm0 \n\t" // mm0 keeps A and C
  49. "pand %%mm5, %%mm1 \n\t" // mm1 keeps B and D
  50. "movq %%mm6, (%0, %%edi) \n\t" // 0 0 0 0
  51. "movd %%mm0, 8(%0, %%edi) \n\t" // A 0
  52. "pand %%mm3, %%mm0 \n\t"
  53. "movd %%mm6, 12(%0, %%edi) \n\t" // 0 0
  54. "movd %%mm1, 16(%0, %%edi) \n\t" // 0 B
  55. "pand %%mm3, %%mm1 \n\t"
  56. "movd %%mm6, 20(%0, %%edi) \n\t" // 0 0
  57. "movq %%mm0, 24(%0, %%edi) \n\t" // 0 0 C 0
  58. "movq %%mm1, 32(%0, %%edi) \n\t" // 0 0 0 D
  59. "addl $8, %%esi \n\t"
  60. " jnz 1b \n\t"
  61. "emms \n\t"
  62. :: "r" (s16+1280), "r" (f+256)
  63. :"%esi", "%edi", "memory"
  64. );
  65. return 5*256;
  66. }
  /*
   * Interleave two planes of 256 samples into 256 two-channel int16_t frames.
   *
   * _f:  input; plane 0 is _f[0..255] and plane 1 is _f[256..511], both read
   *      as raw 32-bit patterns.
   * s16: output; 2*256 int16_t values are written.
   *
   * Every output frame is {plane 0, plane 1}.
   *
   * The loop handles four samples per plane per iteration. %esi runs from
   * -1024 to 0 in steps of 16 and is the byte offset for both the input
   * planes and the output. magicF2W is subtracted from the raw bits and the
   * result is packed to int16_t with signed saturation (packssdw).
   *
   * The asm clobbers %esi and memory, uses MMX registers without listing them
   * as clobbers, and ends with emms. The SSE variant inside the block comment
   * below is disabled.
   *
   * Returns 2*256, the number of int16_t values written.
   */
  67. static int a52_resample_STEREO_to_2_MMX(float * _f, int16_t * s16){
  68. int32_t * f = (int32_t *) _f;
  69. /* benchmark scores are 0.3% better with SSE but we would need to set bias=0 and premultiply it
  70. #ifdef HAVE_SSE
  71. asm volatile(
  72. "movl $-1024, %%esi \n\t"
  73. "1: \n\t"
  74. "cvtps2pi (%1, %%esi), %%mm0 \n\t"
  75. "cvtps2pi 1024(%1, %%esi), %%mm2\n\t"
  76. "movq %%mm0, %%mm1 \n\t"
  77. "punpcklwd %%mm2, %%mm0 \n\t"
  78. "punpckhwd %%mm2, %%mm1 \n\t"
  79. "movq %%mm0, (%0, %%esi) \n\t"
  80. "movq %%mm1, 8(%0, %%esi) \n\t"
  81. "addl $16, %%esi \n\t"
  82. " jnz 1b \n\t"
  83. "emms \n\t"
  84. :: "r" (s16+512), "r" (f+256)
  85. :"%esi", "memory"
  86. );*/
  87. asm volatile(
  88. "movl $-1024, %%esi \n\t"
  89. "movq "MANGLE(magicF2W)", %%mm7 \n\t"
  90. "1: \n\t"
  91. "movq (%1, %%esi), %%mm0 \n\t"
  92. "movq 8(%1, %%esi), %%mm1 \n\t"
  93. "movq 1024(%1, %%esi), %%mm2 \n\t"
  94. "movq 1032(%1, %%esi), %%mm3 \n\t"
  95. "psubd %%mm7, %%mm0 \n\t"
  96. "psubd %%mm7, %%mm1 \n\t"
  97. "psubd %%mm7, %%mm2 \n\t"
  98. "psubd %%mm7, %%mm3 \n\t"
  99. "packssdw %%mm1, %%mm0 \n\t" // four int16 samples of plane 0
  100. "packssdw %%mm3, %%mm2 \n\t" // four int16 samples of plane 1
  101. "movq %%mm0, %%mm1 \n\t"
  102. "punpcklwd %%mm2, %%mm0 \n\t" // frames 0 and 1
  103. "punpckhwd %%mm2, %%mm1 \n\t" // frames 2 and 3
  104. "movq %%mm0, (%0, %%esi) \n\t"
  105. "movq %%mm1, 8(%0, %%esi) \n\t"
  106. "addl $16, %%esi \n\t"
  107. " jnz 1b \n\t"
  108. "emms \n\t"
  109. :: "r" (s16+512), "r" (f+256)
  110. :"%esi", "memory"
  111. );
  112. return 2*256;
  113. }
  /*
   * Interleave three planes of 256 samples into 256 five-channel int16_t
   * frames, zero-filling the two slots that have no source plane.
   *
   * _f:  input; plane k is _f[256*k .. 256*k+255] for k = 0..2, read as raw
   *      32-bit patterns.
   * s16: output; 5*256 int16_t values are written.
   *
   * Every output frame is {plane 0, plane 2, 0, 0, plane 1}.
   *
   * The loop handles four samples per plane per iteration. %esi runs from
   * -1024 to 0 in steps of 16 and is the input byte offset; %edi = 5 * %esi / 2
   * is the output byte offset. magicF2W is subtracted from the raw bits and
   * the result is packed to int16_t with signed saturation (packssdw).
   * Zero lanes are produced in three ways: mm6 is all zero; mm5 holds
   * magicF2W in the low lane only, so the zero high lane left by movd stays
   * zero; and a lane preloaded with magicF2W becomes zero once mm7 is
   * subtracted.
   *
   * The asm clobbers %esi, %edi and memory, uses mm0-mm7 without listing them
   * as clobbers, and ends with emms.
   *
   * Returns 5*256, the number of int16_t values written.
   */
  114. static int a52_resample_3F_to_5_MMX(float * _f, int16_t * s16){
  115. int32_t * f = (int32_t *) _f;
  116. asm volatile(
  117. "movl $-1024, %%esi \n\t"
  118. "movq "MANGLE(magicF2W)", %%mm7 \n\t"
  119. "pxor %%mm6, %%mm6 \n\t"
  120. "movq %%mm7, %%mm5 \n\t"
  121. "punpckldq %%mm6, %%mm5 \n\t" // mm5 = magicF2W in the low lane, 0 in the high lane
  122. "1: \n\t"
  123. "movd (%1, %%esi), %%mm0 \n\t"
  124. "punpckldq 2048(%1, %%esi), %%mm0\n\t"
  125. "movd 1024(%1, %%esi), %%mm1 \n\t"
  126. "punpckldq 4(%1, %%esi), %%mm1 \n\t"
  127. "movd 2052(%1, %%esi), %%mm2 \n\t"
  128. "movq %%mm7, %%mm3 \n\t"
  129. "punpckldq 1028(%1, %%esi), %%mm3\n\t"
  130. "movd 8(%1, %%esi), %%mm4 \n\t"
  131. "punpckldq 2056(%1, %%esi), %%mm4\n\t"
  132. "leal (%%esi, %%esi, 4), %%edi \n\t"
  133. "sarl $1, %%edi \n\t" // edi = 5 * esi / 2
  134. "psubd %%mm7, %%mm0 \n\t"
  135. "psubd %%mm7, %%mm1 \n\t"
  136. "psubd %%mm5, %%mm2 \n\t"
  137. "psubd %%mm7, %%mm3 \n\t"
  138. "psubd %%mm7, %%mm4 \n\t"
  139. "packssdw %%mm6, %%mm0 \n\t"
  140. "packssdw %%mm2, %%mm1 \n\t"
  141. "packssdw %%mm4, %%mm3 \n\t"
  142. "movq %%mm0, (%0, %%edi) \n\t"
  143. "movq %%mm1, 8(%0, %%edi) \n\t"
  144. "movq %%mm3, 16(%0, %%edi) \n\t"
  145. "movd 1032(%1, %%esi), %%mm1 \n\t"
  146. "punpckldq 12(%1, %%esi), %%mm1\n\t"
  147. "movd 2060(%1, %%esi), %%mm2 \n\t"
  148. "movq %%mm7, %%mm3 \n\t"
  149. "punpckldq 1036(%1, %%esi), %%mm3\n\t"
  150. "pxor %%mm0, %%mm0 \n\t"
  151. "psubd %%mm7, %%mm1 \n\t"
  152. "psubd %%mm5, %%mm2 \n\t"
  153. "psubd %%mm7, %%mm3 \n\t"
  154. "packssdw %%mm1, %%mm0 \n\t"
  155. "packssdw %%mm3, %%mm2 \n\t"
  156. "movq %%mm0, 24(%0, %%edi) \n\t"
  157. "movq %%mm2, 32(%0, %%edi) \n\t"
  158. "addl $16, %%esi \n\t"
  159. " jnz 1b \n\t"
  160. "emms \n\t"
  161. :: "r" (s16+1280), "r" (f+256)
  162. :"%esi", "%edi", "memory"
  163. );
  164. return 5*256;
  165. }
  /*
   * Interleave four planes of 256 samples into 256 four-channel int16_t frames.
   *
   * _f:  input; plane k is _f[256*k .. 256*k+255] for k = 0..3, read as raw
   *      32-bit patterns.
   * s16: output; 4*256 int16_t values are written.
   *
   * Every output frame is {plane 0, plane 1, plane 2, plane 3}.
   *
   * The loop handles four samples per plane per iteration. %esi runs from
   * -1024 to 0 in steps of 16; it is the input byte offset and, scaled by 2,
   * the output byte offset. magicF2W is subtracted from the raw bits and the
   * result is packed to int16_t with signed saturation (packssdw). The
   * planes are then transposed with word and dword unpacks.
   *
   * The asm clobbers %esi and memory, uses mm0-mm7 without listing them as
   * clobbers, and ends with emms.
   *
   * Returns 4*256, the number of int16_t values written.
   */
  166. static int a52_resample_2F_2R_to_4_MMX(float * _f, int16_t * s16){
  167. int32_t * f = (int32_t *) _f;
  168. asm volatile(
  169. "movl $-1024, %%esi \n\t"
  170. "movq "MANGLE(magicF2W)", %%mm7 \n\t"
  171. "1: \n\t"
  172. "movq (%1, %%esi), %%mm0 \n\t"
  173. "movq 8(%1, %%esi), %%mm1 \n\t"
  174. "movq 1024(%1, %%esi), %%mm2 \n\t"
  175. "movq 1032(%1, %%esi), %%mm3 \n\t"
  176. "psubd %%mm7, %%mm0 \n\t"
  177. "psubd %%mm7, %%mm1 \n\t"
  178. "psubd %%mm7, %%mm2 \n\t"
  179. "psubd %%mm7, %%mm3 \n\t"
  180. "packssdw %%mm1, %%mm0 \n\t" // four int16 samples of plane 0
  181. "packssdw %%mm3, %%mm2 \n\t" // four int16 samples of plane 1
  182. "movq 2048(%1, %%esi), %%mm3 \n\t"
  183. "movq 2056(%1, %%esi), %%mm4 \n\t"
  184. "movq 3072(%1, %%esi), %%mm5 \n\t"
  185. "movq 3080(%1, %%esi), %%mm6 \n\t"
  186. "psubd %%mm7, %%mm3 \n\t"
  187. "psubd %%mm7, %%mm4 \n\t"
  188. "psubd %%mm7, %%mm5 \n\t"
  189. "psubd %%mm7, %%mm6 \n\t"
  190. "packssdw %%mm4, %%mm3 \n\t" // four int16 samples of plane 2
  191. "packssdw %%mm6, %%mm5 \n\t" // four int16 samples of plane 3
  192. "movq %%mm0, %%mm1 \n\t"
  193. "movq %%mm3, %%mm4 \n\t"
  194. "punpcklwd %%mm2, %%mm0 \n\t"
  195. "punpckhwd %%mm2, %%mm1 \n\t"
  196. "punpcklwd %%mm5, %%mm3 \n\t"
  197. "punpckhwd %%mm5, %%mm4 \n\t"
  198. "movq %%mm0, %%mm2 \n\t"
  199. "movq %%mm1, %%mm5 \n\t"
  200. "punpckldq %%mm3, %%mm0 \n\t" // frame 0
  201. "punpckhdq %%mm3, %%mm2 \n\t" // frame 1
  202. "punpckldq %%mm4, %%mm1 \n\t" // frame 2
  203. "punpckhdq %%mm4, %%mm5 \n\t" // frame 3
  204. "movq %%mm0, (%0, %%esi,2) \n\t"
  205. "movq %%mm2, 8(%0, %%esi,2) \n\t"
  206. "movq %%mm1, 16(%0, %%esi,2) \n\t"
  207. "movq %%mm5, 24(%0, %%esi,2) \n\t"
  208. "addl $16, %%esi \n\t"
  209. " jnz 1b \n\t"
  210. "emms \n\t"
  211. :: "r" (s16+1024), "r" (f+256)
  212. :"%esi", "memory"
  213. );
  214. return 4*256;
  215. }
  /*
   * Interleave five planes of 256 samples into 256 five-channel int16_t frames.
   *
   * _f:  input; plane k is _f[256*k .. 256*k+255] for k = 0..4, read as raw
   *      32-bit patterns.
   * s16: output; 5*256 int16_t values are written.
   *
   * Every output frame is {plane 0, plane 2, plane 3, plane 4, plane 1}.
   *
   * The loop handles four samples per plane per iteration. %esi runs from
   * -1024 to 0 in steps of 16 and is the input byte offset; %edi = 5 * %esi / 2
   * is the output byte offset. Samples are gathered one at a time, already in
   * output order (movd + punpckldq). magicF2W is then subtracted from the raw
   * bits and the result is packed to int16_t with signed saturation
   * (packssdw).
   *
   * The asm clobbers %esi, %edi and memory, uses MMX registers without listing
   * them as clobbers, and ends with emms.
   *
   * Returns 5*256, the number of int16_t values written.
   */
  216. static int a52_resample_3F_2R_to_5_MMX(float * _f, int16_t * s16){
  217. int32_t * f = (int32_t *) _f;
  218. asm volatile(
  219. "movl $-1024, %%esi \n\t"
  220. "movq "MANGLE(magicF2W)", %%mm7 \n\t"
  221. "1: \n\t"
  222. "movd (%1, %%esi), %%mm0 \n\t"
  223. "punpckldq 2048(%1, %%esi), %%mm0\n\t"
  224. "movd 3072(%1, %%esi), %%mm1 \n\t"
  225. "punpckldq 4096(%1, %%esi), %%mm1\n\t"
  226. "movd 1024(%1, %%esi), %%mm2 \n\t"
  227. "punpckldq 4(%1, %%esi), %%mm2 \n\t"
  228. "movd 2052(%1, %%esi), %%mm3 \n\t"
  229. "punpckldq 3076(%1, %%esi), %%mm3\n\t"
  230. "movd 4100(%1, %%esi), %%mm4 \n\t"
  231. "punpckldq 1028(%1, %%esi), %%mm4\n\t"
  232. "movd 8(%1, %%esi), %%mm5 \n\t"
  233. "punpckldq 2056(%1, %%esi), %%mm5\n\t"
  234. "leal (%%esi, %%esi, 4), %%edi \n\t"
  235. "sarl $1, %%edi \n\t" // edi = 5 * esi / 2
  236. "psubd %%mm7, %%mm0 \n\t"
  237. "psubd %%mm7, %%mm1 \n\t"
  238. "psubd %%mm7, %%mm2 \n\t"
  239. "psubd %%mm7, %%mm3 \n\t"
  240. "psubd %%mm7, %%mm4 \n\t"
  241. "psubd %%mm7, %%mm5 \n\t"
  242. "packssdw %%mm1, %%mm0 \n\t"
  243. "packssdw %%mm3, %%mm2 \n\t"
  244. "packssdw %%mm5, %%mm4 \n\t"
  245. "movq %%mm0, (%0, %%edi) \n\t"
  246. "movq %%mm2, 8(%0, %%edi) \n\t"
  247. "movq %%mm4, 16(%0, %%edi) \n\t"
  248. "movd 3080(%1, %%esi), %%mm0 \n\t"
  249. "punpckldq 4104(%1, %%esi), %%mm0\n\t"
  250. "movd 1032(%1, %%esi), %%mm1 \n\t"
  251. "punpckldq 12(%1, %%esi), %%mm1\n\t"
  252. "movd 2060(%1, %%esi), %%mm2 \n\t"
  253. "punpckldq 3084(%1, %%esi), %%mm2\n\t"
  254. "movd 4108(%1, %%esi), %%mm3 \n\t"
  255. "punpckldq 1036(%1, %%esi), %%mm3\n\t"
  256. "psubd %%mm7, %%mm0 \n\t"
  257. "psubd %%mm7, %%mm1 \n\t"
  258. "psubd %%mm7, %%mm2 \n\t"
  259. "psubd %%mm7, %%mm3 \n\t"
  260. "packssdw %%mm1, %%mm0 \n\t"
  261. "packssdw %%mm3, %%mm2 \n\t"
  262. "movq %%mm0, 24(%0, %%edi) \n\t"
  263. "movq %%mm2, 32(%0, %%edi) \n\t"
  264. "addl $16, %%esi \n\t"
  265. " jnz 1b \n\t"
  266. "emms \n\t"
  267. :: "r" (s16+1280), "r" (f+256)
  268. :"%esi", "%edi", "memory"
  269. );
  270. return 5*256;
  271. }
  /*
   * Interleave two planes of 256 samples into 256 six-channel int16_t frames,
   * zero-filling the four leading slots of every frame.
   *
   * _f:  input; plane 0 is _f[0..255] and plane 1 is _f[256..511], both read
   *      as raw 32-bit patterns.
   * s16: output; 6*256 int16_t values are written.
   *
   * Every output frame is {0, 0, 0, 0, plane 1, plane 0}.
   *
   * The loop handles four samples per plane per iteration. %esi runs from
   * -1024 to 0 in steps of 16 and is the input byte offset; %edi = 3 * %esi is
   * the output byte offset. magicF2W is subtracted from the raw bits and the
   * result is packed to int16_t with signed saturation (packssdw). mm6 stays
   * zero and supplies the zero slots.
   *
   * The asm clobbers %esi, %edi and memory, uses MMX registers without listing
   * them as clobbers, and ends with emms.
   *
   * Returns 6*256, the number of int16_t values written.
   */
  272. static int a52_resample_MONO_LFE_to_6_MMX(float * _f, int16_t * s16){
  273. int32_t * f = (int32_t *) _f;
  274. asm volatile(
  275. "movl $-1024, %%esi \n\t"
  276. "movq "MANGLE(magicF2W)", %%mm7 \n\t"
  277. "pxor %%mm6, %%mm6 \n\t"
  278. "1: \n\t"
  279. "movq 1024(%1, %%esi), %%mm0 \n\t"
  280. "movq 1032(%1, %%esi), %%mm1 \n\t"
  281. "movq (%1, %%esi), %%mm2 \n\t"
  282. "movq 8(%1, %%esi), %%mm3 \n\t"
  283. "psubd %%mm7, %%mm0 \n\t"
  284. "psubd %%mm7, %%mm1 \n\t"
  285. "psubd %%mm7, %%mm2 \n\t"
  286. "psubd %%mm7, %%mm3 \n\t"
  287. "packssdw %%mm1, %%mm0 \n\t" // four int16 samples of plane 1
  288. "packssdw %%mm3, %%mm2 \n\t" // four int16 samples of plane 0
  289. "movq %%mm0, %%mm1 \n\t"
  290. "punpcklwd %%mm2, %%mm0 \n\t" // {plane 1, plane 0} pairs for samples 0 and 1
  291. "punpckhwd %%mm2, %%mm1 \n\t" // {plane 1, plane 0} pairs for samples 2 and 3
  292. "leal (%%esi, %%esi, 2), %%edi \n\t" // edi = 3 * esi
  293. "movq %%mm6, (%0, %%edi) \n\t"
  294. "movd %%mm0, 8(%0, %%edi) \n\t"
  295. "punpckhdq %%mm0, %%mm0 \n\t"
  296. "movq %%mm6, 12(%0, %%edi) \n\t"
  297. "movd %%mm0, 20(%0, %%edi) \n\t"
  298. "movq %%mm6, 24(%0, %%edi) \n\t"
  299. "movd %%mm1, 32(%0, %%edi) \n\t"
  300. "punpckhdq %%mm1, %%mm1 \n\t"
  301. "movq %%mm6, 36(%0, %%edi) \n\t"
  302. "movd %%mm1, 44(%0, %%edi) \n\t"
  303. "addl $16, %%esi \n\t"
  304. " jnz 1b \n\t"
  305. "emms \n\t"
  306. :: "r" (s16+1536), "r" (f+256)
  307. :"%esi", "%edi", "memory"
  308. );
  309. return 6*256;
  310. }
  /*
   * Interleave three planes of 256 samples into 256 six-channel int16_t
   * frames, zero-filling the three slots that have no source plane.
   *
   * _f:  input; plane k is _f[256*k .. 256*k+255] for k = 0..2, read as raw
   *      32-bit patterns.
   * s16: output; 6*256 int16_t values are written.
   *
   * Every output frame is {plane 1, plane 2, 0, 0, 0, plane 0}.
   *
   * The loop handles two samples per plane per iteration. %esi runs from
   * -1024 to 0 in steps of 8 and is the input byte offset; %edi = 3 * %esi is
   * the output byte offset. magicF2W is subtracted from the raw bits and the
   * result is packed to int16_t with signed saturation (packssdw).
   *
   * In the register comments the most significant word is on the left.
   * a/A are samples 0/1 of plane 1, b/B of plane 2, f/F of plane 0.
   *
   * The asm clobbers %esi, %edi and memory, uses MMX registers without listing
   * them as clobbers, and ends with emms.
   *
   * Returns 6*256, the number of int16_t values written.
   */
  311. static int a52_resample_STEREO_LFE_to_6_MMX(float * _f, int16_t * s16){
  312. int32_t * f = (int32_t *) _f;
  313. asm volatile(
  314. "movl $-1024, %%esi \n\t"
  315. "movq "MANGLE(magicF2W)", %%mm7 \n\t"
  316. "pxor %%mm6, %%mm6 \n\t"
  317. "1: \n\t"
  318. "movq 1024(%1, %%esi), %%mm0 \n\t"
  319. "movq 2048(%1, %%esi), %%mm1 \n\t"
  320. "movq (%1, %%esi), %%mm5 \n\t"
  321. "psubd %%mm7, %%mm0 \n\t"
  322. "psubd %%mm7, %%mm1 \n\t"
  323. "psubd %%mm7, %%mm5 \n\t"
  324. "leal (%%esi, %%esi, 2), %%edi \n\t"
  325. "pxor %%mm4, %%mm4 \n\t"
  326. "packssdw %%mm5, %%mm0 \n\t" // FfAa
  327. "packssdw %%mm4, %%mm1 \n\t" // 00Bb
  328. "punpckhwd %%mm0, %%mm4 \n\t" // F0f0
  329. "punpcklwd %%mm1, %%mm0 \n\t" // BAba
  330. "movq %%mm0, %%mm1 \n\t" // BAba
  /* mm3 is read here before it is ever written; the stale low dword (XX) is
     dropped by the punpckhdq two instructions further down. */
  331. "punpckldq %%mm4, %%mm3 \n\t" // f0XX
  332. "punpckldq %%mm6, %%mm0 \n\t" // 00ba
  333. "punpckhdq %%mm1, %%mm3 \n\t" // BAf0
  334. "movq %%mm0, (%0, %%edi) \n\t" // 00ba
  335. "punpckhdq %%mm4, %%mm0 \n\t" // F000
  336. "movq %%mm3, 8(%0, %%edi) \n\t" // BAf0
  337. "movq %%mm0, 16(%0, %%edi) \n\t" // F000
  338. "addl $8, %%esi \n\t"
  339. " jnz 1b \n\t"
  340. "emms \n\t"
  341. :: "r" (s16+1536), "r" (f+256)
  342. :"%esi", "%edi", "memory"
  343. );
  344. return 6*256;
  345. }
  /*
   * Interleave four planes of 256 samples into 256 six-channel int16_t frames,
   * zero-filling the two slots that have no source plane.
   *
   * _f:  input; plane k is _f[256*k .. 256*k+255] for k = 0..3, read as raw
   *      32-bit patterns.
   * s16: output; 6*256 int16_t values are written.
   *
   * Every output frame is {plane 1, plane 3, 0, 0, plane 2, plane 0}.
   *
   * The loop handles two samples per plane per iteration. %esi runs from
   * -1024 to 0 in steps of 8 and is the input byte offset; %edi = 3 * %esi is
   * the output byte offset. magicF2W is subtracted from the raw bits and the
   * result is packed to int16_t with signed saturation (packssdw).
   *
   * In the register comments the most significant word is on the left.
   * a/A are samples 0/1 of plane 1, b/B of plane 3, e/E of plane 2 and
   * f/F of plane 0.
   *
   * The asm clobbers %esi, %edi and memory, uses MMX registers without listing
   * them as clobbers, and ends with emms.
   *
   * Returns 6*256, the number of int16_t values written.
   */
  346. static int a52_resample_3F_LFE_to_6_MMX(float * _f, int16_t * s16){
  347. int32_t * f = (int32_t *) _f;
  348. asm volatile(
  349. "movl $-1024, %%esi \n\t"
  350. "movq "MANGLE(magicF2W)", %%mm7 \n\t"
  351. "pxor %%mm6, %%mm6 \n\t"
  352. "1: \n\t"
  353. "movq 1024(%1, %%esi), %%mm0 \n\t"
  354. "movq 3072(%1, %%esi), %%mm1 \n\t"
  355. "movq 2048(%1, %%esi), %%mm4 \n\t"
  356. "movq (%1, %%esi), %%mm5 \n\t"
  357. "psubd %%mm7, %%mm0 \n\t"
  358. "psubd %%mm7, %%mm1 \n\t"
  359. "psubd %%mm7, %%mm4 \n\t"
  360. "psubd %%mm7, %%mm5 \n\t"
  361. "leal (%%esi, %%esi, 2), %%edi \n\t"
  362. "packssdw %%mm4, %%mm0 \n\t" // EeAa
  363. "packssdw %%mm5, %%mm1 \n\t" // FfBb
  364. "movq %%mm0, %%mm2 \n\t" // EeAa
  365. "punpcklwd %%mm1, %%mm0 \n\t" // BAba
  366. "punpckhwd %%mm1, %%mm2 \n\t" // FEfe
  367. "movq %%mm0, %%mm1 \n\t" // BAba
  368. "punpckldq %%mm6, %%mm0 \n\t" // 00ba
  369. "punpckhdq %%mm1, %%mm1 \n\t" // BABA
  370. "movq %%mm0, (%0, %%edi) \n\t"
  371. "punpckhdq %%mm2, %%mm0 \n\t" // FE00
  372. "punpckldq %%mm1, %%mm2 \n\t" // BAfe
  373. "movq %%mm2, 8(%0, %%edi) \n\t"
  374. "movq %%mm0, 16(%0, %%edi) \n\t"
  375. "addl $8, %%esi \n\t"
  376. " jnz 1b \n\t"
  377. "emms \n\t"
  378. :: "r" (s16+1536), "r" (f+256)
  379. :"%esi", "%edi", "memory"
  380. );
  381. return 6*256;
  382. }
  /*
   * Interleave five planes of 256 samples into 256 six-channel int16_t frames,
   * zero-filling the one slot that has no source plane.
   *
   * _f:  input; plane k is _f[256*k .. 256*k+255] for k = 0..4, read as raw
   *      32-bit patterns.
   * s16: output; 6*256 int16_t values are written.
   *
   * Every output frame is {plane 1, plane 2, plane 3, plane 4, 0, plane 0}.
   *
   * The loop handles two samples per plane per iteration. %esi runs from
   * -1024 to 0 in steps of 8 and is the input byte offset; %edi = 3 * %esi is
   * the output byte offset. magicF2W is subtracted from the raw bits and the
   * result is packed to int16_t with signed saturation (packssdw).
   *
   * In the register comments the most significant word is on the left.
   * a/A are samples 0/1 of plane 1, b/B of plane 2, c/C of plane 3, d/D of
   * plane 4 and f/F of plane 0.
   *
   * The asm clobbers %esi, %edi and memory, uses MMX registers without listing
   * them as clobbers, and ends with emms.
   *
   * Returns 6*256, the number of int16_t values written.
   */
  383. static int a52_resample_2F_2R_LFE_to_6_MMX(float * _f, int16_t * s16){
  384. int32_t * f = (int32_t *) _f;
  385. asm volatile(
  386. "movl $-1024, %%esi \n\t"
  387. "movq "MANGLE(magicF2W)", %%mm7 \n\t"
  388. // "pxor %%mm6, %%mm6 \n\t"
  389. "1: \n\t"
  390. "movq 1024(%1, %%esi), %%mm0 \n\t"
  391. "movq 2048(%1, %%esi), %%mm1 \n\t"
  392. "movq 3072(%1, %%esi), %%mm2 \n\t"
  393. "movq 4096(%1, %%esi), %%mm3 \n\t"
  394. "movq (%1, %%esi), %%mm5 \n\t"
  395. "psubd %%mm7, %%mm0 \n\t"
  396. "psubd %%mm7, %%mm1 \n\t"
  397. "psubd %%mm7, %%mm2 \n\t"
  398. "psubd %%mm7, %%mm3 \n\t"
  399. "psubd %%mm7, %%mm5 \n\t"
  400. "leal (%%esi, %%esi, 2), %%edi \n\t"
  401. "packssdw %%mm2, %%mm0 \n\t" // CcAa
  402. "packssdw %%mm3, %%mm1 \n\t" // DdBb
  403. "packssdw %%mm5, %%mm5 \n\t" // FfFf
  404. "movq %%mm0, %%mm2 \n\t" // CcAa
  405. "punpcklwd %%mm1, %%mm0 \n\t" // BAba
  406. "punpckhwd %%mm1, %%mm2 \n\t" // DCdc
  407. "pxor %%mm4, %%mm4 \n\t" // 0000
  408. "punpcklwd %%mm5, %%mm4 \n\t" // F0f0
  409. "movq %%mm0, %%mm1 \n\t" // BAba
  410. "movq %%mm4, %%mm3 \n\t" // F0f0
  411. "punpckldq %%mm2, %%mm0 \n\t" // dcba
  412. "punpckhdq %%mm1, %%mm1 \n\t" // BABA
  413. "punpckldq %%mm1, %%mm4 \n\t" // BAf0
  414. "punpckhdq %%mm3, %%mm2 \n\t" // F0DC
  415. "movq %%mm0, (%0, %%edi) \n\t"
  416. "movq %%mm4, 8(%0, %%edi) \n\t"
  417. "movq %%mm2, 16(%0, %%edi) \n\t"
  418. "addl $8, %%esi \n\t"
  419. " jnz 1b \n\t"
  420. "emms \n\t"
  421. :: "r" (s16+1536), "r" (f+256)
  422. :"%esi", "%edi", "memory"
  423. );
  424. return 6*256;
  425. }
  /*
   * Interleave six planes of 256 samples into 256 six-channel int16_t frames.
   *
   * _f:  input; plane k is _f[256*k .. 256*k+255] for k = 0..5, read as raw
   *      32-bit patterns.
   * s16: output; 6*256 int16_t values are written.
   *
   * Every output frame is
   * {plane 1, plane 3, plane 4, plane 5, plane 2, plane 0}.
   *
   * The loop handles two samples per plane per iteration. %esi runs from
   * -1024 to 0 in steps of 8 and is the input byte offset; %edi = 3 * %esi is
   * the output byte offset. magicF2W is subtracted from the raw bits and the
   * result is packed to int16_t with signed saturation (packssdw).
   *
   * In the register comments the most significant word is on the left.
   * a/A are samples 0/1 of plane 1, b/B of plane 3, c/C of plane 4, d/D of
   * plane 5, e/E of plane 2 and f/F of plane 0.
   *
   * The asm clobbers %esi, %edi and memory, uses MMX registers without listing
   * them as clobbers, and ends with emms.
   *
   * Returns 6*256, the number of int16_t values written.
   */
  426. static int a52_resample_3F_2R_LFE_to_6_MMX(float * _f, int16_t * s16){
  427. int32_t * f = (int32_t *) _f;
  428. asm volatile(
  429. "movl $-1024, %%esi \n\t"
  430. "movq "MANGLE(magicF2W)", %%mm7 \n\t"
  431. // "pxor %%mm6, %%mm6 \n\t"
  432. "1: \n\t"
  433. "movq 1024(%1, %%esi), %%mm0 \n\t"
  434. "movq 3072(%1, %%esi), %%mm1 \n\t"
  435. "movq 4096(%1, %%esi), %%mm2 \n\t"
  436. "movq 5120(%1, %%esi), %%mm3 \n\t"
  437. "movq 2048(%1, %%esi), %%mm4 \n\t"
  438. "movq (%1, %%esi), %%mm5 \n\t"
  439. "psubd %%mm7, %%mm0 \n\t"
  440. "psubd %%mm7, %%mm1 \n\t"
  441. "psubd %%mm7, %%mm2 \n\t"
  442. "psubd %%mm7, %%mm3 \n\t"
  443. "psubd %%mm7, %%mm4 \n\t"
  444. "psubd %%mm7, %%mm5 \n\t"
  445. "leal (%%esi, %%esi, 2), %%edi \n\t"
  446. "packssdw %%mm2, %%mm0 \n\t" // CcAa
  447. "packssdw %%mm3, %%mm1 \n\t" // DdBb
  448. "packssdw %%mm4, %%mm4 \n\t" // EeEe
  449. "packssdw %%mm5, %%mm5 \n\t" // FfFf
  450. "movq %%mm0, %%mm2 \n\t" // CcAa
  451. "punpcklwd %%mm1, %%mm0 \n\t" // BAba
  452. "punpckhwd %%mm1, %%mm2 \n\t" // DCdc
  453. "punpcklwd %%mm5, %%mm4 \n\t" // FEfe
  454. "movq %%mm0, %%mm1 \n\t" // BAba
  455. "movq %%mm4, %%mm3 \n\t" // FEfe
  456. "punpckldq %%mm2, %%mm0 \n\t" // dcba
  457. "punpckhdq %%mm1, %%mm1 \n\t" // BABA
  458. "punpckldq %%mm1, %%mm4 \n\t" // BAfe
  459. "punpckhdq %%mm3, %%mm2 \n\t" // FEDC
  460. "movq %%mm0, (%0, %%edi) \n\t"
  461. "movq %%mm4, 8(%0, %%edi) \n\t"
  462. "movq %%mm2, 16(%0, %%edi) \n\t"
  463. "addl $8, %%esi \n\t"
  464. " jnz 1b \n\t"
  465. "emms \n\t"
  466. :: "r" (s16+1536), "r" (f+256)
  467. :"%esi", "%edi", "memory"
  468. );
  469. return 6*256;
  470. }
  471. static void* a52_resample_MMX(int flags, int ch){
  472. switch (flags) {
  473. case A52_MONO:
  474. if(ch==5) return a52_resample_MONO_to_5_MMX;
  475. break;
  476. case A52_CHANNEL:
  477. case A52_STEREO:
  478. case A52_DOLBY:
  479. if(ch==2) return a52_resample_STEREO_to_2_MMX;
  480. break;
  481. case A52_3F:
  482. if(ch==5) return a52_resample_3F_to_5_MMX;
  483. break;
  484. case A52_2F2R:
  485. if(ch==4) return a52_resample_2F_2R_to_4_MMX;
  486. break;
  487. case A52_3F2R:
  488. if(ch==5) return a52_resample_3F_2R_to_5_MMX;
  489. break;
  490. case A52_MONO | A52_LFE:
  491. if(ch==6) return a52_resample_MONO_LFE_to_6_MMX;
  492. break;
  493. case A52_CHANNEL | A52_LFE:
  494. case A52_STEREO | A52_LFE:
  495. case A52_DOLBY | A52_LFE:
  496. if(ch==6) return a52_resample_STEREO_LFE_to_6_MMX;
  497. break;
  498. case A52_3F | A52_LFE:
  499. if(ch==6) return a52_resample_3F_LFE_to_6_MMX;
  500. break;
  501. case A52_2F2R | A52_LFE:
  502. if(ch==6) return a52_resample_2F_2R_LFE_to_6_MMX;
  503. break;
  504. case A52_3F2R | A52_LFE:
  505. if(ch==6) return a52_resample_3F_2R_LFE_to_6_MMX;
  506. break;
  507. }
  508. return NULL;
  509. }