You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

519 lines
15KB

// MMX optimizations from Michael Niedermayer (michaelni@gmx.at) (under GPL)
/* optimization TODO / NOTES
   movntq is slightly faster (0.5% with the current test.c benchmark)
   (but that's just test.c, so it needs to be tested in reality)
   and it would mean (C / MMX2 / MMX / 3DNOW) versions
*/
  7. static uint64_t __attribute__((aligned(8))) attribute_used magicF2W= 0x43c0000043c00000LL;
  8. static uint64_t __attribute__((aligned(8))) attribute_used wm1010= 0xFFFF0000FFFF0000LL;
  9. static uint64_t __attribute__((aligned(8))) attribute_used wm0101= 0x0000FFFF0000FFFFLL;
  10. static uint64_t __attribute__((aligned(8))) attribute_used wm1100= 0xFFFFFFFF00000000LL;
  11. static int a52_resample_MONO_to_5_MMX(float * _f, int16_t * s16){
  12. int32_t * f = (int32_t *) _f;
  13. asm volatile(
  14. "movl $-512, %%esi \n\t"
  15. "movq "MANGLE(magicF2W)", %%mm7 \n\t"
  16. "movq "MANGLE(wm1100)", %%mm3 \n\t"
  17. "movq "MANGLE(wm0101)", %%mm4 \n\t"
  18. "movq "MANGLE(wm1010)", %%mm5 \n\t"
  19. "pxor %%mm6, %%mm6 \n\t"
  20. "1: \n\t"
  21. "movq (%1, %%esi, 2), %%mm0 \n\t"
  22. "movq 8(%1, %%esi, 2), %%mm1 \n\t"
  23. "leal (%%esi, %%esi, 4), %%edi \n\t"
  24. "psubd %%mm7, %%mm0 \n\t"
  25. "psubd %%mm7, %%mm1 \n\t"
  26. "packssdw %%mm1, %%mm0 \n\t"
  27. "movq %%mm0, %%mm1 \n\t"
  28. "pand %%mm4, %%mm0 \n\t"
  29. "pand %%mm5, %%mm1 \n\t"
  30. "movq %%mm6, (%0, %%edi) \n\t" // 0 0 0 0
  31. "movd %%mm0, 8(%0, %%edi) \n\t" // A 0
  32. "pand %%mm3, %%mm0 \n\t"
  33. "movd %%mm6, 12(%0, %%edi) \n\t" // 0 0
  34. "movd %%mm1, 16(%0, %%edi) \n\t" // 0 B
  35. "pand %%mm3, %%mm1 \n\t"
  36. "movd %%mm6, 20(%0, %%edi) \n\t" // 0 0
  37. "movq %%mm0, 24(%0, %%edi) \n\t" // 0 0 C 0
  38. "movq %%mm1, 32(%0, %%edi) \n\t" // 0 0 0 B
  39. "addl $8, %%esi \n\t"
  40. " jnz 1b \n\t"
  41. "emms \n\t"
  42. :: "r" (s16+1280), "r" (f+256)
  43. :"%esi", "%edi", "memory"
  44. );
  45. return 5*256;
  46. }
  47. static int a52_resample_STEREO_to_2_MMX(float * _f, int16_t * s16){
  48. int32_t * f = (int32_t *) _f;
  49. /* benchmark scores are 0.3% better with SSE but we would need to set bias=0 and premultiply it
  50. #ifdef HAVE_SSE
  51. asm volatile(
  52. "movl $-1024, %%esi \n\t"
  53. "1: \n\t"
  54. "cvtps2pi (%1, %%esi), %%mm0 \n\t"
  55. "cvtps2pi 1024(%1, %%esi), %%mm2\n\t"
  56. "movq %%mm0, %%mm1 \n\t"
  57. "punpcklwd %%mm2, %%mm0 \n\t"
  58. "punpckhwd %%mm2, %%mm1 \n\t"
  59. "movq %%mm0, (%0, %%esi) \n\t"
  60. "movq %%mm1, 8(%0, %%esi) \n\t"
  61. "addl $16, %%esi \n\t"
  62. " jnz 1b \n\t"
  63. "emms \n\t"
  64. :: "r" (s16+512), "r" (f+256)
  65. :"%esi", "memory"
  66. );*/
  67. asm volatile(
  68. "movl $-1024, %%esi \n\t"
  69. "movq "MANGLE(magicF2W)", %%mm7 \n\t"
  70. "1: \n\t"
  71. "movq (%1, %%esi), %%mm0 \n\t"
  72. "movq 8(%1, %%esi), %%mm1 \n\t"
  73. "movq 1024(%1, %%esi), %%mm2 \n\t"
  74. "movq 1032(%1, %%esi), %%mm3 \n\t"
  75. "psubd %%mm7, %%mm0 \n\t"
  76. "psubd %%mm7, %%mm1 \n\t"
  77. "psubd %%mm7, %%mm2 \n\t"
  78. "psubd %%mm7, %%mm3 \n\t"
  79. "packssdw %%mm1, %%mm0 \n\t"
  80. "packssdw %%mm3, %%mm2 \n\t"
  81. "movq %%mm0, %%mm1 \n\t"
  82. "punpcklwd %%mm2, %%mm0 \n\t"
  83. "punpckhwd %%mm2, %%mm1 \n\t"
  84. "movq %%mm0, (%0, %%esi) \n\t"
  85. "movq %%mm1, 8(%0, %%esi) \n\t"
  86. "addl $16, %%esi \n\t"
  87. " jnz 1b \n\t"
  88. "emms \n\t"
  89. :: "r" (s16+512), "r" (f+256)
  90. :"%esi", "memory"
  91. );
  92. return 2*256;
  93. }
  94. static int a52_resample_3F_to_5_MMX(float * _f, int16_t * s16){
  95. int32_t * f = (int32_t *) _f;
  96. asm volatile(
  97. "movl $-1024, %%esi \n\t"
  98. "movq "MANGLE(magicF2W)", %%mm7 \n\t"
  99. "pxor %%mm6, %%mm6 \n\t"
  100. "movq %%mm7, %%mm5 \n\t"
  101. "punpckldq %%mm6, %%mm5 \n\t"
  102. "1: \n\t"
  103. "movd (%1, %%esi), %%mm0 \n\t"
  104. "punpckldq 2048(%1, %%esi), %%mm0\n\t"
  105. "movd 1024(%1, %%esi), %%mm1 \n\t"
  106. "punpckldq 4(%1, %%esi), %%mm1 \n\t"
  107. "movd 2052(%1, %%esi), %%mm2 \n\t"
  108. "movq %%mm7, %%mm3 \n\t"
  109. "punpckldq 1028(%1, %%esi), %%mm3\n\t"
  110. "movd 8(%1, %%esi), %%mm4 \n\t"
  111. "punpckldq 2056(%1, %%esi), %%mm4\n\t"
  112. "leal (%%esi, %%esi, 4), %%edi \n\t"
  113. "sarl $1, %%edi \n\t"
  114. "psubd %%mm7, %%mm0 \n\t"
  115. "psubd %%mm7, %%mm1 \n\t"
  116. "psubd %%mm5, %%mm2 \n\t"
  117. "psubd %%mm7, %%mm3 \n\t"
  118. "psubd %%mm7, %%mm4 \n\t"
  119. "packssdw %%mm6, %%mm0 \n\t"
  120. "packssdw %%mm2, %%mm1 \n\t"
  121. "packssdw %%mm4, %%mm3 \n\t"
  122. "movq %%mm0, (%0, %%edi) \n\t"
  123. "movq %%mm1, 8(%0, %%edi) \n\t"
  124. "movq %%mm3, 16(%0, %%edi) \n\t"
  125. "movd 1032(%1, %%esi), %%mm1 \n\t"
  126. "punpckldq 12(%1, %%esi), %%mm1\n\t"
  127. "movd 2060(%1, %%esi), %%mm2 \n\t"
  128. "movq %%mm7, %%mm3 \n\t"
  129. "punpckldq 1036(%1, %%esi), %%mm3\n\t"
  130. "pxor %%mm0, %%mm0 \n\t"
  131. "psubd %%mm7, %%mm1 \n\t"
  132. "psubd %%mm5, %%mm2 \n\t"
  133. "psubd %%mm7, %%mm3 \n\t"
  134. "packssdw %%mm1, %%mm0 \n\t"
  135. "packssdw %%mm3, %%mm2 \n\t"
  136. "movq %%mm0, 24(%0, %%edi) \n\t"
  137. "movq %%mm2, 32(%0, %%edi) \n\t"
  138. "addl $16, %%esi \n\t"
  139. " jnz 1b \n\t"
  140. "emms \n\t"
  141. :: "r" (s16+1280), "r" (f+256)
  142. :"%esi", "%edi", "memory"
  143. );
  144. return 5*256;
  145. }
  146. static int a52_resample_2F_2R_to_4_MMX(float * _f, int16_t * s16){
  147. int32_t * f = (int32_t *) _f;
  148. asm volatile(
  149. "movl $-1024, %%esi \n\t"
  150. "movq "MANGLE(magicF2W)", %%mm7 \n\t"
  151. "1: \n\t"
  152. "movq (%1, %%esi), %%mm0 \n\t"
  153. "movq 8(%1, %%esi), %%mm1 \n\t"
  154. "movq 1024(%1, %%esi), %%mm2 \n\t"
  155. "movq 1032(%1, %%esi), %%mm3 \n\t"
  156. "psubd %%mm7, %%mm0 \n\t"
  157. "psubd %%mm7, %%mm1 \n\t"
  158. "psubd %%mm7, %%mm2 \n\t"
  159. "psubd %%mm7, %%mm3 \n\t"
  160. "packssdw %%mm1, %%mm0 \n\t"
  161. "packssdw %%mm3, %%mm2 \n\t"
  162. "movq 2048(%1, %%esi), %%mm3 \n\t"
  163. "movq 2056(%1, %%esi), %%mm4 \n\t"
  164. "movq 3072(%1, %%esi), %%mm5 \n\t"
  165. "movq 3080(%1, %%esi), %%mm6 \n\t"
  166. "psubd %%mm7, %%mm3 \n\t"
  167. "psubd %%mm7, %%mm4 \n\t"
  168. "psubd %%mm7, %%mm5 \n\t"
  169. "psubd %%mm7, %%mm6 \n\t"
  170. "packssdw %%mm4, %%mm3 \n\t"
  171. "packssdw %%mm6, %%mm5 \n\t"
  172. "movq %%mm0, %%mm1 \n\t"
  173. "movq %%mm3, %%mm4 \n\t"
  174. "punpcklwd %%mm2, %%mm0 \n\t"
  175. "punpckhwd %%mm2, %%mm1 \n\t"
  176. "punpcklwd %%mm5, %%mm3 \n\t"
  177. "punpckhwd %%mm5, %%mm4 \n\t"
  178. "movq %%mm0, %%mm2 \n\t"
  179. "movq %%mm1, %%mm5 \n\t"
  180. "punpckldq %%mm3, %%mm0 \n\t"
  181. "punpckhdq %%mm3, %%mm2 \n\t"
  182. "punpckldq %%mm4, %%mm1 \n\t"
  183. "punpckhdq %%mm4, %%mm5 \n\t"
  184. "movq %%mm0, (%0, %%esi,2) \n\t"
  185. "movq %%mm2, 8(%0, %%esi,2) \n\t"
  186. "movq %%mm1, 16(%0, %%esi,2) \n\t"
  187. "movq %%mm5, 24(%0, %%esi,2) \n\t"
  188. "addl $16, %%esi \n\t"
  189. " jnz 1b \n\t"
  190. "emms \n\t"
  191. :: "r" (s16+1024), "r" (f+256)
  192. :"%esi", "memory"
  193. );
  194. return 4*256;
  195. }
  196. static int a52_resample_3F_2R_to_5_MMX(float * _f, int16_t * s16){
  197. int32_t * f = (int32_t *) _f;
  198. asm volatile(
  199. "movl $-1024, %%esi \n\t"
  200. "movq "MANGLE(magicF2W)", %%mm7 \n\t"
  201. "1: \n\t"
  202. "movd (%1, %%esi), %%mm0 \n\t"
  203. "punpckldq 2048(%1, %%esi), %%mm0\n\t"
  204. "movd 3072(%1, %%esi), %%mm1 \n\t"
  205. "punpckldq 4096(%1, %%esi), %%mm1\n\t"
  206. "movd 1024(%1, %%esi), %%mm2 \n\t"
  207. "punpckldq 4(%1, %%esi), %%mm2 \n\t"
  208. "movd 2052(%1, %%esi), %%mm3 \n\t"
  209. "punpckldq 3076(%1, %%esi), %%mm3\n\t"
  210. "movd 4100(%1, %%esi), %%mm4 \n\t"
  211. "punpckldq 1028(%1, %%esi), %%mm4\n\t"
  212. "movd 8(%1, %%esi), %%mm5 \n\t"
  213. "punpckldq 2056(%1, %%esi), %%mm5\n\t"
  214. "leal (%%esi, %%esi, 4), %%edi \n\t"
  215. "sarl $1, %%edi \n\t"
  216. "psubd %%mm7, %%mm0 \n\t"
  217. "psubd %%mm7, %%mm1 \n\t"
  218. "psubd %%mm7, %%mm2 \n\t"
  219. "psubd %%mm7, %%mm3 \n\t"
  220. "psubd %%mm7, %%mm4 \n\t"
  221. "psubd %%mm7, %%mm5 \n\t"
  222. "packssdw %%mm1, %%mm0 \n\t"
  223. "packssdw %%mm3, %%mm2 \n\t"
  224. "packssdw %%mm5, %%mm4 \n\t"
  225. "movq %%mm0, (%0, %%edi) \n\t"
  226. "movq %%mm2, 8(%0, %%edi) \n\t"
  227. "movq %%mm4, 16(%0, %%edi) \n\t"
  228. "movd 3080(%1, %%esi), %%mm0 \n\t"
  229. "punpckldq 4104(%1, %%esi), %%mm0\n\t"
  230. "movd 1032(%1, %%esi), %%mm1 \n\t"
  231. "punpckldq 12(%1, %%esi), %%mm1\n\t"
  232. "movd 2060(%1, %%esi), %%mm2 \n\t"
  233. "punpckldq 3084(%1, %%esi), %%mm2\n\t"
  234. "movd 4108(%1, %%esi), %%mm3 \n\t"
  235. "punpckldq 1036(%1, %%esi), %%mm3\n\t"
  236. "psubd %%mm7, %%mm0 \n\t"
  237. "psubd %%mm7, %%mm1 \n\t"
  238. "psubd %%mm7, %%mm2 \n\t"
  239. "psubd %%mm7, %%mm3 \n\t"
  240. "packssdw %%mm1, %%mm0 \n\t"
  241. "packssdw %%mm3, %%mm2 \n\t"
  242. "movq %%mm0, 24(%0, %%edi) \n\t"
  243. "movq %%mm2, 32(%0, %%edi) \n\t"
  244. "addl $16, %%esi \n\t"
  245. " jnz 1b \n\t"
  246. "emms \n\t"
  247. :: "r" (s16+1280), "r" (f+256)
  248. :"%esi", "%edi", "memory"
  249. );
  250. return 5*256;
  251. }
  252. static int a52_resample_MONO_LFE_to_6_MMX(float * _f, int16_t * s16){
  253. int32_t * f = (int32_t *) _f;
  254. asm volatile(
  255. "movl $-1024, %%esi \n\t"
  256. "movq "MANGLE(magicF2W)", %%mm7 \n\t"
  257. "pxor %%mm6, %%mm6 \n\t"
  258. "1: \n\t"
  259. "movq 1024(%1, %%esi), %%mm0 \n\t"
  260. "movq 1032(%1, %%esi), %%mm1 \n\t"
  261. "movq (%1, %%esi), %%mm2 \n\t"
  262. "movq 8(%1, %%esi), %%mm3 \n\t"
  263. "psubd %%mm7, %%mm0 \n\t"
  264. "psubd %%mm7, %%mm1 \n\t"
  265. "psubd %%mm7, %%mm2 \n\t"
  266. "psubd %%mm7, %%mm3 \n\t"
  267. "packssdw %%mm1, %%mm0 \n\t"
  268. "packssdw %%mm3, %%mm2 \n\t"
  269. "movq %%mm0, %%mm1 \n\t"
  270. "punpcklwd %%mm2, %%mm0 \n\t"
  271. "punpckhwd %%mm2, %%mm1 \n\t"
  272. "leal (%%esi, %%esi, 2), %%edi \n\t"
  273. "movq %%mm6, (%0, %%edi) \n\t"
  274. "movd %%mm0, 8(%0, %%edi) \n\t"
  275. "punpckhdq %%mm0, %%mm0 \n\t"
  276. "movq %%mm6, 12(%0, %%edi) \n\t"
  277. "movd %%mm0, 20(%0, %%edi) \n\t"
  278. "movq %%mm6, 24(%0, %%edi) \n\t"
  279. "movd %%mm1, 32(%0, %%edi) \n\t"
  280. "punpckhdq %%mm1, %%mm1 \n\t"
  281. "movq %%mm6, 36(%0, %%edi) \n\t"
  282. "movd %%mm1, 44(%0, %%edi) \n\t"
  283. "addl $16, %%esi \n\t"
  284. " jnz 1b \n\t"
  285. "emms \n\t"
  286. :: "r" (s16+1536), "r" (f+256)
  287. :"%esi", "%edi", "memory"
  288. );
  289. return 6*256;
  290. }
  291. static int a52_resample_STEREO_LFE_to_6_MMX(float * _f, int16_t * s16){
  292. int32_t * f = (int32_t *) _f;
  293. asm volatile(
  294. "movl $-1024, %%esi \n\t"
  295. "movq "MANGLE(magicF2W)", %%mm7 \n\t"
  296. "pxor %%mm6, %%mm6 \n\t"
  297. "1: \n\t"
  298. "movq 1024(%1, %%esi), %%mm0 \n\t"
  299. "movq 2048(%1, %%esi), %%mm1 \n\t"
  300. "movq (%1, %%esi), %%mm5 \n\t"
  301. "psubd %%mm7, %%mm0 \n\t"
  302. "psubd %%mm7, %%mm1 \n\t"
  303. "psubd %%mm7, %%mm5 \n\t"
  304. "leal (%%esi, %%esi, 2), %%edi \n\t"
  305. "pxor %%mm4, %%mm4 \n\t"
  306. "packssdw %%mm5, %%mm0 \n\t" // FfAa
  307. "packssdw %%mm4, %%mm1 \n\t" // 00Bb
  308. "punpckhwd %%mm0, %%mm4 \n\t" // F0f0
  309. "punpcklwd %%mm1, %%mm0 \n\t" // BAba
  310. "movq %%mm0, %%mm1 \n\t" // BAba
  311. "punpckldq %%mm4, %%mm3 \n\t" // f0XX
  312. "punpckldq %%mm6, %%mm0 \n\t" // 00ba
  313. "punpckhdq %%mm1, %%mm3 \n\t" // BAf0
  314. "movq %%mm0, (%0, %%edi) \n\t" // 00ba
  315. "punpckhdq %%mm4, %%mm0 \n\t" // F000
  316. "movq %%mm3, 8(%0, %%edi) \n\t" // BAf0
  317. "movq %%mm0, 16(%0, %%edi) \n\t" // F000
  318. "addl $8, %%esi \n\t"
  319. " jnz 1b \n\t"
  320. "emms \n\t"
  321. :: "r" (s16+1536), "r" (f+256)
  322. :"%esi", "%edi", "memory"
  323. );
  324. return 6*256;
  325. }
  326. static int a52_resample_3F_LFE_to_6_MMX(float * _f, int16_t * s16){
  327. int32_t * f = (int32_t *) _f;
  328. asm volatile(
  329. "movl $-1024, %%esi \n\t"
  330. "movq "MANGLE(magicF2W)", %%mm7 \n\t"
  331. "pxor %%mm6, %%mm6 \n\t"
  332. "1: \n\t"
  333. "movq 1024(%1, %%esi), %%mm0 \n\t"
  334. "movq 3072(%1, %%esi), %%mm1 \n\t"
  335. "movq 2048(%1, %%esi), %%mm4 \n\t"
  336. "movq (%1, %%esi), %%mm5 \n\t"
  337. "psubd %%mm7, %%mm0 \n\t"
  338. "psubd %%mm7, %%mm1 \n\t"
  339. "psubd %%mm7, %%mm4 \n\t"
  340. "psubd %%mm7, %%mm5 \n\t"
  341. "leal (%%esi, %%esi, 2), %%edi \n\t"
  342. "packssdw %%mm4, %%mm0 \n\t" // EeAa
  343. "packssdw %%mm5, %%mm1 \n\t" // FfBb
  344. "movq %%mm0, %%mm2 \n\t" // EeAa
  345. "punpcklwd %%mm1, %%mm0 \n\t" // BAba
  346. "punpckhwd %%mm1, %%mm2 \n\t" // FEfe
  347. "movq %%mm0, %%mm1 \n\t" // BAba
  348. "punpckldq %%mm6, %%mm0 \n\t" // 00ba
  349. "punpckhdq %%mm1, %%mm1 \n\t" // BABA
  350. "movq %%mm0, (%0, %%edi) \n\t"
  351. "punpckhdq %%mm2, %%mm0 \n\t" // FE00
  352. "punpckldq %%mm1, %%mm2 \n\t" // BAfe
  353. "movq %%mm2, 8(%0, %%edi) \n\t"
  354. "movq %%mm0, 16(%0, %%edi) \n\t"
  355. "addl $8, %%esi \n\t"
  356. " jnz 1b \n\t"
  357. "emms \n\t"
  358. :: "r" (s16+1536), "r" (f+256)
  359. :"%esi", "%edi", "memory"
  360. );
  361. return 6*256;
  362. }
  363. static int a52_resample_2F_2R_LFE_to_6_MMX(float * _f, int16_t * s16){
  364. int32_t * f = (int32_t *) _f;
  365. asm volatile(
  366. "movl $-1024, %%esi \n\t"
  367. "movq "MANGLE(magicF2W)", %%mm7 \n\t"
  368. // "pxor %%mm6, %%mm6 \n\t"
  369. "1: \n\t"
  370. "movq 1024(%1, %%esi), %%mm0 \n\t"
  371. "movq 2048(%1, %%esi), %%mm1 \n\t"
  372. "movq 3072(%1, %%esi), %%mm2 \n\t"
  373. "movq 4096(%1, %%esi), %%mm3 \n\t"
  374. "movq (%1, %%esi), %%mm5 \n\t"
  375. "psubd %%mm7, %%mm0 \n\t"
  376. "psubd %%mm7, %%mm1 \n\t"
  377. "psubd %%mm7, %%mm2 \n\t"
  378. "psubd %%mm7, %%mm3 \n\t"
  379. "psubd %%mm7, %%mm5 \n\t"
  380. "leal (%%esi, %%esi, 2), %%edi \n\t"
  381. "packssdw %%mm2, %%mm0 \n\t" // CcAa
  382. "packssdw %%mm3, %%mm1 \n\t" // DdBb
  383. "packssdw %%mm5, %%mm5 \n\t" // FfFf
  384. "movq %%mm0, %%mm2 \n\t" // CcAa
  385. "punpcklwd %%mm1, %%mm0 \n\t" // BAba
  386. "punpckhwd %%mm1, %%mm2 \n\t" // DCdc
  387. "pxor %%mm4, %%mm4 \n\t" // 0000
  388. "punpcklwd %%mm5, %%mm4 \n\t" // F0f0
  389. "movq %%mm0, %%mm1 \n\t" // BAba
  390. "movq %%mm4, %%mm3 \n\t" // F0f0
  391. "punpckldq %%mm2, %%mm0 \n\t" // dcba
  392. "punpckhdq %%mm1, %%mm1 \n\t" // BABA
  393. "punpckldq %%mm1, %%mm4 \n\t" // BAf0
  394. "punpckhdq %%mm3, %%mm2 \n\t" // F0DC
  395. "movq %%mm0, (%0, %%edi) \n\t"
  396. "movq %%mm4, 8(%0, %%edi) \n\t"
  397. "movq %%mm2, 16(%0, %%edi) \n\t"
  398. "addl $8, %%esi \n\t"
  399. " jnz 1b \n\t"
  400. "emms \n\t"
  401. :: "r" (s16+1536), "r" (f+256)
  402. :"%esi", "%edi", "memory"
  403. );
  404. return 6*256;
  405. }
  406. static int a52_resample_3F_2R_LFE_to_6_MMX(float * _f, int16_t * s16){
  407. int32_t * f = (int32_t *) _f;
  408. asm volatile(
  409. "movl $-1024, %%esi \n\t"
  410. "movq "MANGLE(magicF2W)", %%mm7 \n\t"
  411. // "pxor %%mm6, %%mm6 \n\t"
  412. "1: \n\t"
  413. "movq 1024(%1, %%esi), %%mm0 \n\t"
  414. "movq 3072(%1, %%esi), %%mm1 \n\t"
  415. "movq 4096(%1, %%esi), %%mm2 \n\t"
  416. "movq 5120(%1, %%esi), %%mm3 \n\t"
  417. "movq 2048(%1, %%esi), %%mm4 \n\t"
  418. "movq (%1, %%esi), %%mm5 \n\t"
  419. "psubd %%mm7, %%mm0 \n\t"
  420. "psubd %%mm7, %%mm1 \n\t"
  421. "psubd %%mm7, %%mm2 \n\t"
  422. "psubd %%mm7, %%mm3 \n\t"
  423. "psubd %%mm7, %%mm4 \n\t"
  424. "psubd %%mm7, %%mm5 \n\t"
  425. "leal (%%esi, %%esi, 2), %%edi \n\t"
  426. "packssdw %%mm2, %%mm0 \n\t" // CcAa
  427. "packssdw %%mm3, %%mm1 \n\t" // DdBb
  428. "packssdw %%mm4, %%mm4 \n\t" // EeEe
  429. "packssdw %%mm5, %%mm5 \n\t" // FfFf
  430. "movq %%mm0, %%mm2 \n\t" // CcAa
  431. "punpcklwd %%mm1, %%mm0 \n\t" // BAba
  432. "punpckhwd %%mm1, %%mm2 \n\t" // DCdc
  433. "punpcklwd %%mm5, %%mm4 \n\t" // FEfe
  434. "movq %%mm0, %%mm1 \n\t" // BAba
  435. "movq %%mm4, %%mm3 \n\t" // FEfe
  436. "punpckldq %%mm2, %%mm0 \n\t" // dcba
  437. "punpckhdq %%mm1, %%mm1 \n\t" // BABA
  438. "punpckldq %%mm1, %%mm4 \n\t" // BAfe
  439. "punpckhdq %%mm3, %%mm2 \n\t" // FEDC
  440. "movq %%mm0, (%0, %%edi) \n\t"
  441. "movq %%mm4, 8(%0, %%edi) \n\t"
  442. "movq %%mm2, 16(%0, %%edi) \n\t"
  443. "addl $8, %%esi \n\t"
  444. " jnz 1b \n\t"
  445. "emms \n\t"
  446. :: "r" (s16+1536), "r" (f+256)
  447. :"%esi", "%edi", "memory"
  448. );
  449. return 6*256;
  450. }
  451. static void* a52_resample_MMX(int flags, int ch){
  452. switch (flags) {
  453. case A52_MONO:
  454. if(ch==5) return a52_resample_MONO_to_5_MMX;
  455. break;
  456. case A52_CHANNEL:
  457. case A52_STEREO:
  458. case A52_DOLBY:
  459. if(ch==2) return a52_resample_STEREO_to_2_MMX;
  460. break;
  461. case A52_3F:
  462. if(ch==5) return a52_resample_3F_to_5_MMX;
  463. break;
  464. case A52_2F2R:
  465. if(ch==4) return a52_resample_2F_2R_to_4_MMX;
  466. break;
  467. case A52_3F2R:
  468. if(ch==5) return a52_resample_3F_2R_to_5_MMX;
  469. break;
  470. case A52_MONO | A52_LFE:
  471. if(ch==6) return a52_resample_MONO_LFE_to_6_MMX;
  472. break;
  473. case A52_CHANNEL | A52_LFE:
  474. case A52_STEREO | A52_LFE:
  475. case A52_DOLBY | A52_LFE:
  476. if(ch==6) return a52_resample_STEREO_LFE_to_6_MMX;
  477. break;
  478. case A52_3F | A52_LFE:
  479. if(ch==6) return a52_resample_3F_LFE_to_6_MMX;
  480. break;
  481. case A52_2F2R | A52_LFE:
  482. if(ch==6) return a52_resample_2F_2R_LFE_to_6_MMX;
  483. break;
  484. case A52_3F2R | A52_LFE:
  485. if(ch==6) return a52_resample_3F_2R_LFE_to_6_MMX;
  486. break;
  487. }
  488. return NULL;
  489. }