You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1263 lines
43KB

  1. ;******************************************************************************
  2. ;* x86 optimized Format Conversion Utils
  3. ;* Copyright (c) 2008 Loren Merritt
  4. ;* Copyright (c) 2012 Justin Ruggles <justin.ruggles@gmail.com>
  5. ;*
  6. ;* This file is part of Libav.
  7. ;*
  8. ;* Libav is free software; you can redistribute it and/or
  9. ;* modify it under the terms of the GNU Lesser General Public
  10. ;* License as published by the Free Software Foundation; either
  11. ;* version 2.1 of the License, or (at your option) any later version.
  12. ;*
  13. ;* Libav is distributed in the hope that it will be useful,
  14. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16. ;* Lesser General Public License for more details.
  17. ;*
  18. ;* You should have received a copy of the GNU Lesser General Public
  19. ;* License along with Libav; if not, write to the Free Software
  20. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21. ;******************************************************************************
  22. %include "x86inc.asm"
  23. %include "x86util.asm"
  24. %include "util.asm"
SECTION_RODATA 32
; Scale factors, written as raw IEEE-754 single-precision bit patterns:
pf_s32_inv_scale: times 8 dd 0x30000000 ; 2^-31 : s32 -> float in [-1.0,1.0]
pf_s32_scale:     times 8 dd 0x4f000000 ; 2^31  : float in [-1.0,1.0] -> s32
pf_s16_inv_scale: times 4 dd 0x38000000 ; 2^-15 : s16 -> float in [-1.0,1.0]
pf_s16_scale:     times 4 dd 0x47000000 ; 2^15  : float in [-1.0,1.0] -> s16
; pshufb masks (-1 = zero the output byte); these pick word pairs into the
; high half of each dword, producing (sample << 16), i.e. s32-scaled values
pb_shuf_unpack_even:   db -1, -1, 0, 1, -1, -1, 2, 3, -1, -1,  8,  9, -1, -1, 10, 11
pb_shuf_unpack_odd:    db -1, -1, 4, 5, -1, -1, 6, 7, -1, -1, 12, 13, -1, -1, 14, 15
; word-granularity shuffle masks for (de)interleaving two channels
pb_interleave_words:   SHUFFLE_MASK_W 0, 4, 1, 5, 2, 6, 3, 7
pb_deinterleave_words: SHUFFLE_MASK_W 0, 2, 4, 6, 1, 3, 5, 7
; AND mask: zeroes the low (even-index) word of each dword, keeps the odd one
pw_zero_even: times 4 dw 0x0000, 0xffff
  35. SECTION_TEXT
  36. ;------------------------------------------------------------------------------
  37. ; void ff_conv_s16_to_s32(int32_t *dst, const int16_t *src, int len);
  38. ;------------------------------------------------------------------------------
INIT_XMM sse2
; Expand packed s16 samples to s32. Each 16-bit sample is placed in the high
; half of its 32-bit lane (value << 16), matching the s32 scaling convention
; used by the other converters in this file.
cglobal conv_s16_to_s32, 3,3,3, dst, src, len
    lea       lenq, [2*lend]            ; lenq = byte length of src (2 B/sample)
    lea       dstq, [dstq+2*lenq]       ; dst end (4 B/sample = 2*src bytes)
    add       srcq, lenq                ; src end
    neg       lenq                      ; negative offset counts up to zero
.loop:
    mova      m2, [srcq+lenq]
    pxor      m0, m0
    pxor      m1, m1
    punpcklwd m0, m2                    ; low 4 words -> dwords, sample in high half
    punpckhwd m1, m2                    ; high 4 words -> dwords
    mova  [dstq+2*lenq       ], m0
    mova  [dstq+2*lenq+mmsize], m1
    add       lenq, mmsize
    jl        .loop
    REP_RET
  56. ;------------------------------------------------------------------------------
  57. ; void ff_conv_s16_to_flt(float *dst, const int16_t *src, int len);
  58. ;------------------------------------------------------------------------------
%macro CONV_S16_TO_FLT 0
; Convert packed s16 samples to float scaled to [-1.0,1.0].
cglobal conv_s16_to_flt, 3,3,3, dst, src, len
    lea      lenq, [2*lend]             ; byte length of src
    add      srcq, lenq
    lea      dstq, [dstq + 2*lenq]      ; dst is 4 bytes/sample
    neg      lenq
    mova     m2, [pf_s16_inv_scale]     ; 2^-15 = 1/32768
    ALIGN 16
.loop:
    mova     m0, [srcq+lenq]
    S16_TO_S32_SX 0, 1                  ; sign-extend 8 words to dwords in m0/m1
                                        ; (plain s16 range; scaled by 2^-15 below)
    cvtdq2ps m0, m0
    cvtdq2ps m1, m1
    mulps    m0, m2                     ; scale to [-1.0,1.0]
    mulps    m1, m2
    mova  [dstq+2*lenq       ], m0
    mova  [dstq+2*lenq+mmsize], m1
    add      lenq, mmsize
    jl       .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16_TO_FLT
INIT_XMM sse4
CONV_S16_TO_FLT
  84. ;------------------------------------------------------------------------------
  85. ; void ff_conv_s32_to_s16(int16_t *dst, const int32_t *src, int len);
  86. ;------------------------------------------------------------------------------
%macro CONV_S32_TO_S16 0
; Convert packed s32 samples to s16 by keeping the top 16 bits of each sample.
cglobal conv_s32_to_s16, 3,3,4, dst, src, len
    lea      lenq, [2*lend]             ; byte length of dst (2 B/sample)
    lea      srcq, [srcq+2*lenq]        ; src is 4 bytes/sample
    add      dstq, lenq
    neg      lenq
.loop:
    mova     m0, [srcq+2*lenq         ]
    mova     m1, [srcq+2*lenq+  mmsize]
    mova     m2, [srcq+2*lenq+2*mmsize]
    mova     m3, [srcq+2*lenq+3*mmsize]
    psrad    m0, 16                     ; arithmetic shift: drop low 16 bits
    psrad    m1, 16
    psrad    m2, 16
    psrad    m3, 16
    packssdw m0, m1                     ; pack dwords -> words (values already
    packssdw m2, m3                     ; fit in s16, so no saturation occurs)
    mova  [dstq+lenq       ], m0
    mova  [dstq+lenq+mmsize], m2
    add      lenq, mmsize*2
    jl       .loop
%if mmsize == 8
    emms                                ; MMX version must reset the x87 state
    RET
%else
    REP_RET
%endif
%endmacro

INIT_MMX mmx
CONV_S32_TO_S16
INIT_XMM sse2
CONV_S32_TO_S16
  119. ;------------------------------------------------------------------------------
  120. ; void ff_conv_s32_to_flt(float *dst, const int32_t *src, int len);
  121. ;------------------------------------------------------------------------------
%macro CONV_S32_TO_FLT 0
; Convert packed s32 samples to float scaled to [-1.0,1.0].
cglobal conv_s32_to_flt, 3,3,3, dst, src, len
    lea      lenq, [4*lend]             ; byte length (4 bytes/sample both ways)
    add      srcq, lenq
    add      dstq, lenq
    neg      lenq
    mova     m0, [pf_s32_inv_scale]     ; 2^-31
    ALIGN 16
.loop:
    cvtdq2ps m1, [srcq+lenq       ]
    cvtdq2ps m2, [srcq+lenq+mmsize]
    mulps    m1, m1, m0                 ; 3-operand form: x86inc emits the 2-op
    mulps    m2, m2, m0                 ; SSE equivalent when dst == src1
    mova  [dstq+lenq       ], m1
    mova  [dstq+lenq+mmsize], m2
    add      lenq, mmsize*2
    jl       .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S32_TO_FLT
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
CONV_S32_TO_FLT
%endif
  147. ;------------------------------------------------------------------------------
  148. ; void ff_conv_flt_to_s16(int16_t *dst, const float *src, int len);
  149. ;------------------------------------------------------------------------------
INIT_XMM sse2
; Convert float samples in [-1.0,1.0] to s16, with saturation on overflow.
cglobal conv_flt_to_s16, 3,3,5, dst, src, len
    lea      lenq, [2*lend]             ; byte length of dst (2 B/sample)
    lea      srcq, [srcq+2*lenq]        ; src is 4 bytes/sample
    add      dstq, lenq
    neg      lenq
    mova     m4, [pf_s16_scale]         ; 2^15 = 32768.0
.loop:
    mova     m0, [srcq+2*lenq         ]
    mova     m1, [srcq+2*lenq+1*mmsize]
    mova     m2, [srcq+2*lenq+2*mmsize]
    mova     m3, [srcq+2*lenq+3*mmsize]
    mulps    m0, m4
    mulps    m1, m4
    mulps    m2, m4
    mulps    m3, m4
    cvtps2dq m0, m0                     ; float -> s32 (current rounding mode)
    cvtps2dq m1, m1
    cvtps2dq m2, m2
    cvtps2dq m3, m3
    packssdw m0, m1                     ; saturating pack to the s16 range
    packssdw m2, m3
    mova  [dstq+lenq       ], m0
    mova  [dstq+lenq+mmsize], m2
    add      lenq, mmsize*2
    jl       .loop
    REP_RET
  177. ;------------------------------------------------------------------------------
  178. ; void ff_conv_flt_to_s32(int32_t *dst, const float *src, int len);
  179. ;------------------------------------------------------------------------------
%macro CONV_FLT_TO_S32 0
; Convert float samples in [-1.0,1.0] to s32.
cglobal conv_flt_to_s32, 3,3,5, dst, src, len
    lea      lenq, [lend*4]             ; byte length (4 bytes/sample both ways)
    add      srcq, lenq
    add      dstq, lenq
    neg      lenq
    mova     m4, [pf_s32_scale]         ; 2^31
.loop:
    mulps    m0, m4, [srcq+lenq         ]
    mulps    m1, m4, [srcq+lenq+1*mmsize]
    mulps    m2, m4, [srcq+lenq+2*mmsize]
    mulps    m3, m4, [srcq+lenq+3*mmsize]
    cvtps2dq m0, m0                     ; float -> s32 (current rounding mode)
    cvtps2dq m1, m1
    cvtps2dq m2, m2
    cvtps2dq m3, m3
    mova  [dstq+lenq         ], m0
    mova  [dstq+lenq+1*mmsize], m1
    mova  [dstq+lenq+2*mmsize], m2
    mova  [dstq+lenq+3*mmsize], m3
    add      lenq, mmsize*4
    jl       .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_FLT_TO_S32
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
CONV_FLT_TO_S32
%endif
  210. ;------------------------------------------------------------------------------
  211. ; void ff_conv_s16p_to_s16_2ch(int16_t *dst, int16_t *const *src, int len,
  212. ; int channels);
  213. ;------------------------------------------------------------------------------
  214. %macro CONV_S16P_TO_S16_2CH 0
  215. cglobal conv_s16p_to_s16_2ch, 3,4,5, dst, src0, len, src1
  216. mov src1q, [src0q+gprsize]
  217. mov src0q, [src0q ]
  218. lea lenq, [2*lend]
  219. add src0q, lenq
  220. add src1q, lenq
  221. lea dstq, [dstq+2*lenq]
  222. neg lenq
  223. .loop
  224. mova m0, [src0q+lenq ]
  225. mova m1, [src1q+lenq ]
  226. mova m2, [src0q+lenq+mmsize]
  227. mova m3, [src1q+lenq+mmsize]
  228. SBUTTERFLY2 wd, 0, 1, 4
  229. SBUTTERFLY2 wd, 2, 3, 4
  230. mova [dstq+2*lenq+0*mmsize], m0
  231. mova [dstq+2*lenq+1*mmsize], m1
  232. mova [dstq+2*lenq+2*mmsize], m2
  233. mova [dstq+2*lenq+3*mmsize], m3
  234. add lenq, 2*mmsize
  235. jl .loop
  236. REP_RET
  237. %endmacro
  238. INIT_XMM sse2
  239. CONV_S16P_TO_S16_2CH
  240. %if HAVE_AVX_EXTERNAL
  241. INIT_XMM avx
  242. CONV_S16P_TO_S16_2CH
  243. %endif
  244. ;------------------------------------------------------------------------------
  245. ; void ff_conv_s16p_to_s16_6ch(int16_t *dst, int16_t *const *src, int len,
  246. ; int channels);
  247. ;------------------------------------------------------------------------------
  248. ;------------------------------------------------------------------------------
  249. ; NOTE: In the 6-channel functions, len could be used as an index on x86-64
  250. ; instead of just a counter, which would avoid incrementing the
  251. ; pointers, but the extra complexity and amount of code is not worth
  252. ; the small gain. On x86-32 there are not enough registers to use len
  253. ; as an index without keeping two of the pointers on the stack and
  254. ; loading them in each iteration.
  255. ;------------------------------------------------------------------------------
%macro CONV_S16P_TO_S16_6CH 0
; Interleave 6 planar s16 channels into packed s16.
%if ARCH_X86_64
cglobal conv_s16p_to_s16_6ch, 3,8,7, dst, src0, len, src1, src2, src3, src4, src5
%else
cglobal conv_s16p_to_s16_6ch, 2,7,7, dst, src0, src1, src2, src3, src4, src5
%define lend dword r2m                  ; x86-32: too few regs, keep len in memory
%endif
    mov      src1q, [src0q+1*gprsize]
    mov      src2q, [src0q+2*gprsize]
    mov      src3q, [src0q+3*gprsize]
    mov      src4q, [src0q+4*gprsize]
    mov      src5q, [src0q+5*gprsize]
    mov      src0q, [src0q]
    ; keep channels 1-5 as offsets relative to src0 so that only src0q
    ; needs to be incremented in the loop
    sub      src1q, src0q
    sub      src2q, src0q
    sub      src3q, src0q
    sub      src4q, src0q
    sub      src5q, src0q
.loop:
%if cpuflag(sse2slow)
    ; half-width path for CPUs where full-width SSE2 shuffles are slow
    movq     m0, [src0q      ]          ; m0 =  0,  6, 12, 18,  x,  x,  x,  x
    movq     m1, [src0q+src1q]          ; m1 =  1,  7, 13, 19,  x,  x,  x,  x
    movq     m2, [src0q+src2q]          ; m2 =  2,  8, 14, 20,  x,  x,  x,  x
    movq     m3, [src0q+src3q]          ; m3 =  3,  9, 15, 21,  x,  x,  x,  x
    movq     m4, [src0q+src4q]          ; m4 =  4, 10, 16, 22,  x,  x,  x,  x
    movq     m5, [src0q+src5q]          ; m5 =  5, 11, 17, 23,  x,  x,  x,  x
    ; unpack words:
    punpcklwd m0, m1                    ; m0 =  0,  1,  6,  7, 12, 13, 18, 19
    punpcklwd m2, m3                    ; m2 =  4,  5, 10, 11, 16, 17, 22, 23
    punpcklwd m4, m5                    ; m4 =  2,  3,  8,  9, 14, 15, 20, 21
    ; blend dwords
    shufps   m1, m0, m2, q2020          ; m1 =  0,  1, 12, 13,  2,  3, 14, 15
    shufps   m0, m4, q2031              ; m0 =  6,  7, 18, 19,  4,  5, 16, 17
    shufps   m2, m4, q3131              ; m2 =  8,  9, 20, 21, 10, 11, 22, 23
    ; shuffle dwords
    pshufd   m0, m0, q1302              ; m0 =  4,  5,  6,  7, 16, 17, 18, 19
    pshufd   m1, m1, q3120              ; m1 =  0,  1,  2,  3, 12, 13, 14, 15
    pshufd   m2, m2, q3120              ; m2 =  8,  9, 10, 11, 20, 21, 22, 23
    movq   [dstq+0*mmsize/2], m1
    movq   [dstq+1*mmsize/2], m0
    movq   [dstq+2*mmsize/2], m2
    movhps [dstq+3*mmsize/2], m1
    movhps [dstq+4*mmsize/2], m0
    movhps [dstq+5*mmsize/2], m2
    add      src0q, mmsize/2
    add      dstq, mmsize*3
    sub      lend, mmsize/4             ; samples consumed per channel
%else
    mova     m0, [src0q      ]          ; m0 =  0,  6, 12, 18, 24, 30, 36, 42
    mova     m1, [src0q+src1q]          ; m1 =  1,  7, 13, 19, 25, 31, 37, 43
    mova     m2, [src0q+src2q]          ; m2 =  2,  8, 14, 20, 26, 32, 38, 44
    mova     m3, [src0q+src3q]          ; m3 =  3,  9, 15, 21, 27, 33, 39, 45
    mova     m4, [src0q+src4q]          ; m4 =  4, 10, 16, 22, 28, 34, 40, 46
    mova     m5, [src0q+src5q]          ; m5 =  5, 11, 17, 23, 29, 35, 41, 47
    ; unpack words:
    SBUTTERFLY2 wd, 0, 1, 6             ; m0 =  0,  1,  6,  7, 12, 13, 18, 19
                                        ; m1 = 24, 25, 30, 31, 36, 37, 42, 43
    SBUTTERFLY2 wd, 2, 3, 6             ; m2 =  2,  3,  8,  9, 14, 15, 20, 21
                                        ; m3 = 26, 27, 32, 33, 38, 39, 44, 45
    SBUTTERFLY2 wd, 4, 5, 6             ; m4 =  4,  5, 10, 11, 16, 17, 22, 23
                                        ; m5 = 28, 29, 34, 35, 40, 41, 46, 47
    ; blend dwords
    shufps   m6, m0, m2, q2020          ; m6 =  0,  1, 12, 13,  2,  3, 14, 15
    shufps   m0, m4, q2031              ; m0 =  6,  7, 18, 19,  4,  5, 16, 17
    shufps   m2, m4, q3131              ; m2 =  8,  9, 20, 21, 10, 11, 22, 23
    SWAP 4,6                            ; m4 =  0,  1, 12, 13,  2,  3, 14, 15
    shufps   m6, m1, m3, q2020          ; m6 = 24, 25, 36, 37, 26, 27, 38, 39
    shufps   m1, m5, q2031              ; m1 = 30, 31, 42, 43, 28, 29, 40, 41
    shufps   m3, m5, q3131              ; m3 = 32, 33, 44, 45, 34, 35, 46, 47
    SWAP 5,6                            ; m5 = 24, 25, 36, 37, 26, 27, 38, 39
    ; shuffle dwords
    pshufd   m0, m0, q1302              ; m0 =  4,  5,  6,  7, 16, 17, 18, 19
    pshufd   m2, m2, q3120              ; m2 =  8,  9, 10, 11, 20, 21, 22, 23
    pshufd   m4, m4, q3120              ; m4 =  0,  1,  2,  3, 12, 13, 14, 15
    pshufd   m1, m1, q1302              ; m1 = 28, 29, 30, 31, 40, 41, 42, 43
    pshufd   m3, m3, q3120              ; m3 = 32, 33, 34, 35, 44, 45, 46, 47
    pshufd   m5, m5, q3120              ; m5 = 24, 25, 26, 27, 36, 37, 38, 39
    ; shuffle qwords
    punpcklqdq m6, m4, m0               ; m6 =  0,  1,  2,  3,  4,  5,  6,  7
    punpckhqdq m0, m2                   ; m0 = 16, 17, 18, 19, 20, 21, 22, 23
    shufps   m2, m4, q3210              ; m2 =  8,  9, 10, 11, 12, 13, 14, 15
    SWAP 4,6                            ; m4 =  0,  1,  2,  3,  4,  5,  6,  7
    punpcklqdq m6, m5, m1               ; m6 = 24, 25, 26, 27, 28, 29, 30, 31
    punpckhqdq m1, m3                   ; m1 = 40, 41, 42, 43, 44, 45, 46, 47
    shufps   m3, m5, q3210              ; m3 = 32, 33, 34, 35, 36, 37, 38, 39
    SWAP 5,6                            ; m5 = 24, 25, 26, 27, 28, 29, 30, 31
    mova  [dstq+0*mmsize], m4
    mova  [dstq+1*mmsize], m2
    mova  [dstq+2*mmsize], m0
    mova  [dstq+3*mmsize], m5
    mova  [dstq+4*mmsize], m3
    mova  [dstq+5*mmsize], m1
    add      src0q, mmsize
    add      dstq, mmsize*6
    sub      lend, mmsize/2             ; samples consumed per channel
%endif
    jg       .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16P_TO_S16_6CH
INIT_XMM sse2slow
CONV_S16P_TO_S16_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16P_TO_S16_6CH
%endif
  363. ;------------------------------------------------------------------------------
  364. ; void ff_conv_s16p_to_flt_2ch(float *dst, int16_t *const *src, int len,
  365. ; int channels);
  366. ;------------------------------------------------------------------------------
%macro CONV_S16P_TO_FLT_2CH 0
; Interleave 2 planar s16 channels and convert to packed float in [-1.0,1.0].
cglobal conv_s16p_to_flt_2ch, 3,4,6, dst, src0, len, src1
    lea      lenq, [2*lend]             ; byte length per channel
    mov      src1q, [src0q+gprsize]     ; src1 = src[1]
    mov      src0q, [src0q        ]     ; src0 = src[0]
    lea      dstq, [dstq+4*lenq]        ; dst: 2 channels * 4 bytes/sample
    add      src0q, lenq
    add      src1q, lenq
    neg      lenq
    mova     m5, [pf_s32_inv_scale]     ; 2^-31 (samples land in high 16 bits)
.loop:
    mova     m2, [src0q+lenq]           ; m2 =  0,  2,  4,  6,  8, 10, 12, 14
    mova     m4, [src1q+lenq]           ; m4 =  1,  3,  5,  7,  9, 11, 13, 15
    SBUTTERFLY2 wd, 2, 4, 3             ; m2 =  0,  1,  2,  3,  4,  5,  6,  7
                                        ; m4 =  8,  9, 10, 11, 12, 13, 14, 15
    pxor     m3, m3
    ; words go into the high halves of dwords (value << 16), hence the 2^-31
    ; scale above
    punpcklwd m0, m3, m2                ; m0 =      0,  1,  2,  3
    punpckhwd m1, m3, m2                ; m1 =      4,  5,  6,  7
    punpcklwd m2, m3, m4                ; m2 =      8,  9, 10, 11
    punpckhwd m3, m4                    ; m3 =     12, 13, 14, 15
    cvtdq2ps m0, m0
    cvtdq2ps m1, m1
    cvtdq2ps m2, m2
    cvtdq2ps m3, m3
    mulps    m0, m5
    mulps    m1, m5
    mulps    m2, m5
    mulps    m3, m5
    mova  [dstq+4*lenq         ], m0
    mova  [dstq+4*lenq+  mmsize], m1
    mova  [dstq+4*lenq+2*mmsize], m2
    mova  [dstq+4*lenq+3*mmsize], m3
    add      lenq, mmsize
    jl       .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16P_TO_FLT_2CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16P_TO_FLT_2CH
%endif
  409. ;------------------------------------------------------------------------------
  410. ; void ff_conv_s16p_to_flt_6ch(float *dst, int16_t *const *src, int len,
  411. ; int channels);
  412. ;------------------------------------------------------------------------------
%macro CONV_S16P_TO_FLT_6CH 0
; Interleave 6 planar s16 channels and convert to packed float in [-1.0,1.0].
%if ARCH_X86_64
cglobal conv_s16p_to_flt_6ch, 3,8,8, dst, src, len, src1, src2, src3, src4, src5
%else
cglobal conv_s16p_to_flt_6ch, 2,7,8, dst, src, src1, src2, src3, src4, src5
%define lend dword r2m                  ; x86-32: too few regs, keep len in memory
%endif
    mov      src1q, [srcq+1*gprsize]
    mov      src2q, [srcq+2*gprsize]
    mov      src3q, [srcq+3*gprsize]
    mov      src4q, [srcq+4*gprsize]
    mov      src5q, [srcq+5*gprsize]
    mov      srcq, [srcq]
    ; keep channels 1-5 as offsets from src[0] so only srcq advances
    sub      src1q, srcq
    sub      src2q, srcq
    sub      src3q, srcq
    sub      src4q, srcq
    sub      src5q, srcq
    mova     m7, [pf_s32_inv_scale]     ; 2^-31 (samples sit in high 16 bits)
%if cpuflag(ssse3)
    %define unpack_even m6
    mova     m6, [pb_shuf_unpack_even]
%if ARCH_X86_64
    %define unpack_odd m8
    mova     m8, [pb_shuf_unpack_odd]
%else
    ; x86-32 has only 8 xmm regs: use a memory operand for the odd mask
    %define unpack_odd [pb_shuf_unpack_odd]
%endif
%endif
.loop:
    movq     m0, [srcq      ]           ; m0 =  0,  6, 12, 18,  x,  x,  x,  x
    movq     m1, [srcq+src1q]           ; m1 =  1,  7, 13, 19,  x,  x,  x,  x
    movq     m2, [srcq+src2q]           ; m2 =  2,  8, 14, 20,  x,  x,  x,  x
    movq     m3, [srcq+src3q]           ; m3 =  3,  9, 15, 21,  x,  x,  x,  x
    movq     m4, [srcq+src4q]           ; m4 =  4, 10, 16, 22,  x,  x,  x,  x
    movq     m5, [srcq+src5q]           ; m5 =  5, 11, 17, 23,  x,  x,  x,  x
    ; unpack words:
    punpcklwd m0, m1                    ; m0 =  0,  1,  6,  7, 12, 13, 18, 19
    punpcklwd m2, m3                    ; m2 =  2,  3,  8,  9, 14, 15, 20, 21
    punpcklwd m4, m5                    ; m4 =  4,  5, 10, 11, 16, 17, 22, 23
    ; blend dwords
    shufps   m1, m4, m0, q3120          ; m1 =  4,  5, 16, 17,  6,  7, 18, 19
    shufps   m0, m2, q2020              ; m0 =  0,  1, 12, 13,  2,  3, 14, 15
    shufps   m2, m4, q3131              ; m2 =  8,  9, 20, 21, 10, 11, 22, 23
%if cpuflag(ssse3)
    ; pshufb masks zero the low bytes, leaving samples s32-scaled (<< 16)
    pshufb   m3, m0, unpack_odd         ; m3 =  12,  13,  14,  15
    pshufb   m0, unpack_even            ; m0 =   0,   1,   2,   3
    pshufb   m4, m1, unpack_odd         ; m4 =  16,  17,  18,  19
    pshufb   m1, unpack_even            ; m1 =   4,   5,   6,   7
    pshufb   m5, m2, unpack_odd         ; m5 =  20,  21,  22,  23
    pshufb   m2, unpack_even            ; m2 =   8,   9,  10,  11
%else
    ; shuffle dwords
    pshufd   m0, m0, q3120              ; m0 =  0,  1,  2,  3, 12, 13, 14, 15
    pshufd   m1, m1, q3120              ; m1 =  4,  5,  6,  7, 16, 17, 18, 19
    pshufd   m2, m2, q3120              ; m2 =  8,  9, 10, 11, 20, 21, 22, 23
    pxor     m6, m6                     ; convert s16 in m0-m2 to s32 in m0-m5
    punpcklwd m3, m6, m0                ; m3 =      0,  1,  2,  3
    punpckhwd m4, m6, m0                ; m4 =     12, 13, 14, 15
    punpcklwd m0, m6, m1                ; m0 =      4,  5,  6,  7
    punpckhwd m5, m6, m1                ; m5 =     16, 17, 18, 19
    punpcklwd m1, m6, m2                ; m1 =      8,  9, 10, 11
    punpckhwd m6, m2                    ; m6 =     20, 21, 22, 23
    SWAP 6,2,1,0,3,4,5                  ; swap registers 3,0,1,4,5,6 to 0,1,2,3,4,5
%endif
    cvtdq2ps m0, m0                     ; convert s32 to float
    cvtdq2ps m1, m1
    cvtdq2ps m2, m2
    cvtdq2ps m3, m3
    cvtdq2ps m4, m4
    cvtdq2ps m5, m5
    mulps    m0, m7                     ; scale float from s32 range to [-1.0,1.0]
    mulps    m1, m7
    mulps    m2, m7
    mulps    m3, m7
    mulps    m4, m7
    mulps    m5, m7
    mova  [dstq         ], m0
    mova  [dstq+  mmsize], m1
    mova  [dstq+2*mmsize], m2
    mova  [dstq+3*mmsize], m3
    mova  [dstq+4*mmsize], m4
    mova  [dstq+5*mmsize], m5
    add      srcq, mmsize/2
    add      dstq, mmsize*6
    sub      lend, mmsize/4             ; samples consumed per channel
    jg       .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16P_TO_FLT_6CH
INIT_XMM ssse3
CONV_S16P_TO_FLT_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16P_TO_FLT_6CH
%endif
  510. ;------------------------------------------------------------------------------
  511. ; void ff_conv_fltp_to_s16_2ch(int16_t *dst, float *const *src, int len,
  512. ; int channels);
  513. ;------------------------------------------------------------------------------
%macro CONV_FLTP_TO_S16_2CH 0
; Interleave 2 planar float channels and convert to packed s16 with saturation.
cglobal conv_fltp_to_s16_2ch, 3,4,3, dst, src0, len, src1
    lea      lenq, [4*lend]             ; byte length per channel
    mov      src1q, [src0q+gprsize]     ; src1 = src[1]
    mov      src0q, [src0q        ]     ; src0 = src[0]
    add      dstq, lenq                 ; dst: 2 channels * 2 bytes/sample
    add      src0q, lenq
    add      src1q, lenq
    neg      lenq
    mova     m2, [pf_s16_scale]         ; 2^15 = 32768.0
%if cpuflag(ssse3)
    mova     m3, [pb_interleave_words]
%endif
.loop:
    mulps    m0, m2, [src0q+lenq]       ; m0 =    0,    2,    4,    6
    mulps    m1, m2, [src1q+lenq]       ; m1 =    1,    3,    5,    7
    cvtps2dq m0, m0                     ; float -> s32
    cvtps2dq m1, m1
%if cpuflag(ssse3)
    packssdw m0, m1                     ; m0 = 0, 2, 4, 6, 1, 3, 5, 7
    pshufb   m0, m3                     ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
%else
    packssdw m0, m0                     ; m0 = 0, 2, 4, 6, x, x, x, x
    packssdw m1, m1                     ; m1 = 1, 3, 5, 7, x, x, x, x
    punpcklwd m0, m1                    ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
%endif
    mova  [dstq+lenq], m0
    add      lenq, mmsize
    jl       .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_FLTP_TO_S16_2CH
INIT_XMM ssse3
CONV_FLTP_TO_S16_2CH
  549. ;------------------------------------------------------------------------------
  550. ; void ff_conv_fltp_to_s16_6ch(int16_t *dst, float *const *src, int len,
  551. ; int channels);
  552. ;------------------------------------------------------------------------------
%macro CONV_FLTP_TO_S16_6CH 0
; Interleave 6 planar float channels and convert to packed s16 with saturation.
; The "sse" version works on MMX registers (cvtps2pi), hence the emms at exit.
%if ARCH_X86_64
cglobal conv_fltp_to_s16_6ch, 3,8,7, dst, src, len, src1, src2, src3, src4, src5
%else
cglobal conv_fltp_to_s16_6ch, 2,7,7, dst, src, src1, src2, src3, src4, src5
%define lend dword r2m                  ; x86-32: too few regs, keep len in memory
%endif
    mov      src1q, [srcq+1*gprsize]
    mov      src2q, [srcq+2*gprsize]
    mov      src3q, [srcq+3*gprsize]
    mov      src4q, [srcq+4*gprsize]
    mov      src5q, [srcq+5*gprsize]
    mov      srcq, [srcq]
    ; keep channels 1-5 as offsets from src[0] so only srcq advances
    sub      src1q, srcq
    sub      src2q, srcq
    sub      src3q, srcq
    sub      src4q, srcq
    sub      src5q, srcq
    ; explicit xmm6 so the scale is an xmm reg even in the MMX-register build
    movaps   xmm6, [pf_s16_scale]       ; 2^15 = 32768.0
.loop:
%if cpuflag(sse2)
    mulps    m0, m6, [srcq      ]
    mulps    m1, m6, [srcq+src1q]
    mulps    m2, m6, [srcq+src2q]
    mulps    m3, m6, [srcq+src3q]
    mulps    m4, m6, [srcq+src4q]
    mulps    m5, m6, [srcq+src5q]
    cvtps2dq m0, m0
    cvtps2dq m1, m1
    cvtps2dq m2, m2
    cvtps2dq m3, m3
    cvtps2dq m4, m4
    cvtps2dq m5, m5
    packssdw m0, m3                     ; m0 =  0,  6, 12, 18,  3,  9, 15, 21
    packssdw m1, m4                     ; m1 =  1,  7, 13, 19,  4, 10, 16, 22
    packssdw m2, m5                     ; m2 =  2,  8, 14, 20,  5, 11, 17, 23
    ; unpack words:
    movhlps  m3, m0                     ; m3 =  3,  9, 15, 21,  x,  x,  x,  x
    punpcklwd m0, m1                    ; m0 =  0,  1,  6,  7, 12, 13, 18, 19
    punpckhwd m1, m2                    ; m1 =  4,  5, 10, 11, 16, 17, 22, 23
    punpcklwd m2, m3                    ; m2 =  2,  3,  8,  9, 14, 15, 20, 21
    ; blend dwords:
    shufps   m3, m0, m2, q2020          ; m3 =  0,  1, 12, 13,  2,  3, 14, 15
    shufps   m0, m1, q2031              ; m0 =  6,  7, 18, 19,  4,  5, 16, 17
    shufps   m2, m1, q3131              ; m2 =  8,  9, 20, 21, 10, 11, 22, 23
    ; shuffle dwords:
    shufps   m1, m2, m3, q3120          ; m1 =  8,  9, 10, 11, 12, 13, 14, 15
    shufps   m3, m0, q0220              ; m3 =  0,  1,  2,  3,  4,  5,  6,  7
    shufps   m0, m2, q3113              ; m0 = 16, 17, 18, 19, 20, 21, 22, 23
    mova  [dstq+0*mmsize], m3
    mova  [dstq+1*mmsize], m1
    mova  [dstq+2*mmsize], m0
%else ; sse
    movlps   xmm0, [srcq      ]
    movlps   xmm1, [srcq+src1q]
    movlps   xmm2, [srcq+src2q]
    movlps   xmm3, [srcq+src3q]
    movlps   xmm4, [srcq+src4q]
    movlps   xmm5, [srcq+src5q]
    mulps    xmm0, xmm6
    mulps    xmm1, xmm6
    mulps    xmm2, xmm6
    mulps    xmm3, xmm6
    mulps    xmm4, xmm6
    mulps    xmm5, xmm6
    cvtps2pi mm0, xmm0                  ; float -> s32 into MMX registers
    cvtps2pi mm1, xmm1
    cvtps2pi mm2, xmm2
    cvtps2pi mm3, xmm3
    cvtps2pi mm4, xmm4
    cvtps2pi mm5, xmm5
    packssdw mm0, mm3                   ; m0 =  0,  6,  3,  9
    packssdw mm1, mm4                   ; m1 =  1,  7,  4, 10
    packssdw mm2, mm5                   ; m2 =  2,  8,  5, 11
    ; unpack words
    pshufw   mm3, mm0, q1032            ; m3 =  3,  9,  0,  6
    punpcklwd mm0, mm1                  ; m0 =  0,  1,  6,  7
    punpckhwd mm1, mm2                  ; m1 =  4,  5, 10, 11
    punpcklwd mm2, mm3                  ; m2 =  2,  3,  8,  9
    ; unpack dwords
    pshufw   mm3, mm0, q1032            ; m3 =  6,  7,  0,  1
    punpckldq mm0, mm2                  ; m0 =  0,  1,  2,  3 (final)
    punpckhdq mm2, mm1                  ; m2 =  8,  9, 10, 11 (final)
    punpckldq mm1, mm3                  ; m1 =  4,  5,  6,  7 (final)
    mova  [dstq+0*mmsize], mm0
    mova  [dstq+1*mmsize], mm1
    mova  [dstq+2*mmsize], mm2
%endif
    add      srcq, mmsize
    add      dstq, mmsize*3
    sub      lend, mmsize/4             ; samples consumed per channel
    jg       .loop
%if mmsize == 8
    emms                                ; reset x87 state after MMX use
    RET
%else
    REP_RET
%endif
%endmacro

INIT_MMX sse
CONV_FLTP_TO_S16_6CH
INIT_XMM sse2
CONV_FLTP_TO_S16_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_FLTP_TO_S16_6CH
%endif
  660. ;------------------------------------------------------------------------------
  661. ; void ff_conv_fltp_to_flt_2ch(float *dst, float *const *src, int len,
  662. ; int channels);
  663. ;------------------------------------------------------------------------------
  664. %macro CONV_FLTP_TO_FLT_2CH 0
  665. cglobal conv_fltp_to_flt_2ch, 3,4,5, dst, src0, len, src1
  666. mov src1q, [src0q+gprsize]
  667. mov src0q, [src0q]
  668. lea lenq, [4*lend]
  669. add src0q, lenq
  670. add src1q, lenq
  671. lea dstq, [dstq+2*lenq]
  672. neg lenq
  673. .loop
  674. mova m0, [src0q+lenq ]
  675. mova m1, [src1q+lenq ]
  676. mova m2, [src0q+lenq+mmsize]
  677. mova m3, [src1q+lenq+mmsize]
  678. SBUTTERFLYPS 0, 1, 4
  679. SBUTTERFLYPS 2, 3, 4
  680. mova [dstq+2*lenq+0*mmsize], m0
  681. mova [dstq+2*lenq+1*mmsize], m1
  682. mova [dstq+2*lenq+2*mmsize], m2
  683. mova [dstq+2*lenq+3*mmsize], m3
  684. add lenq, 2*mmsize
  685. jl .loop
  686. REP_RET
  687. %endmacro
  688. INIT_XMM sse
  689. CONV_FLTP_TO_FLT_2CH
  690. %if HAVE_AVX_EXTERNAL
  691. INIT_XMM avx
  692. CONV_FLTP_TO_FLT_2CH
  693. %endif
  694. ;-----------------------------------------------------------------------------
  695. ; void ff_conv_fltp_to_flt_6ch(float *dst, float *const *src, int len,
  696. ; int channels);
  697. ;-----------------------------------------------------------------------------
%macro CONV_FLTP_TO_FLT_6CH 0
; Interleave 6 planar float channels into packed float.
cglobal conv_fltp_to_flt_6ch, 2,8,7, dst, src, src1, src2, src3, src4, src5, len
%if ARCH_X86_64
    mov      lend, r2d
%else
    %define lend dword r2m              ; x86-32: keep len in its stack slot
%endif
    mov      src1q, [srcq+1*gprsize]
    mov      src2q, [srcq+2*gprsize]
    mov      src3q, [srcq+3*gprsize]
    mov      src4q, [srcq+4*gprsize]
    mov      src5q, [srcq+5*gprsize]
    mov      srcq, [srcq]
    ; keep channels 1-5 as offsets from src[0] so only srcq advances
    sub      src1q, srcq
    sub      src2q, srcq
    sub      src3q, srcq
    sub      src4q, srcq
    sub      src5q, srcq
.loop:
    mova     m0, [srcq      ]
    mova     m1, [srcq+src1q]
    mova     m2, [srcq+src2q]
    mova     m3, [srcq+src3q]
    mova     m4, [srcq+src4q]
    mova     m5, [srcq+src5q]
%if cpuflag(sse4)
    SBUTTERFLYPS 0, 1, 6
    SBUTTERFLYPS 2, 3, 6
    SBUTTERFLYPS 4, 5, 6
    blendps  m6, m4, m0, 1100b
    movlhps  m0, m2
    movhlps  m4, m2
    blendps  m2, m5, m1, 1100b
    movlhps  m1, m3
    movhlps  m5, m3
    movaps [dstq   ], m0
    movaps [dstq+16], m6
    movaps [dstq+32], m4
    movaps [dstq+48], m1
    movaps [dstq+64], m2
    movaps [dstq+80], m5
%else ; mmx
    SBUTTERFLY dq, 0, 1, 6
    SBUTTERFLY dq, 2, 3, 6
    SBUTTERFLY dq, 4, 5, 6
    movq  [dstq   ], m0
    movq  [dstq+ 8], m2
    movq  [dstq+16], m4
    movq  [dstq+24], m1
    movq  [dstq+32], m3
    movq  [dstq+40], m5
%endif
    add      srcq, mmsize
    add      dstq, mmsize*6
    sub      lend, mmsize/4             ; samples consumed per channel
    jg       .loop
%if mmsize == 8
    emms                                ; MMX version must reset x87 state
    RET
%else
    REP_RET
%endif
%endmacro

INIT_MMX mmx
CONV_FLTP_TO_FLT_6CH
INIT_XMM sse4
CONV_FLTP_TO_FLT_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_FLTP_TO_FLT_6CH
%endif
  769. ;------------------------------------------------------------------------------
  770. ; void ff_conv_s16_to_s16p_2ch(int16_t *const *dst, int16_t *src, int len,
  771. ; int channels);
  772. ;------------------------------------------------------------------------------
%macro CONV_S16_TO_S16P_2CH 0
; Deinterleave packed stereo s16 into 2 planar s16 channels.
cglobal conv_s16_to_s16p_2ch, 3,4,4, dst0, src, len, dst1
    lea      lenq, [2*lend]             ; byte length per channel
    mov      dst1q, [dst0q+gprsize]     ; dst1 = dst[1]
    mov      dst0q, [dst0q        ]     ; dst0 = dst[0]
    lea      srcq, [srcq+2*lenq]        ; src holds both channels
    add      dst0q, lenq
    add      dst1q, lenq
    neg      lenq
%if cpuflag(ssse3)
    mova     m3, [pb_deinterleave_words]
%endif
.loop:
    mova     m0, [srcq+2*lenq       ]   ; m0 =  0,  1,  2,  3,  4,  5,  6,  7
    mova     m1, [srcq+2*lenq+mmsize]   ; m1 =  8,  9, 10, 11, 12, 13, 14, 15
%if cpuflag(ssse3)
    pshufb   m0, m3                     ; m0 =  0,  2,  4,  6,  1,  3,  5,  7
    pshufb   m1, m3                     ; m1 =  8, 10, 12, 14,  9, 11, 13, 15
    SBUTTERFLY2 qdq, 0, 1, 2            ; m0 =  0,  2,  4,  6,  8, 10, 12, 14
                                        ; m1 =  1,  3,  5,  7,  9, 11, 13, 15
%else ; sse2
    pshuflw  m0, m0, q3120              ; m0 =  0,  2,  1,  3,  4,  5,  6,  7
    pshufhw  m0, m0, q3120              ; m0 =  0,  2,  1,  3,  4,  6,  5,  7
    pshuflw  m1, m1, q3120              ; m1 =  8, 10,  9, 11, 12, 13, 14, 15
    pshufhw  m1, m1, q3120              ; m1 =  8, 10,  9, 11, 12, 14, 13, 15
    DEINT2_PS 0, 1, 2                   ; m0 =  0,  2,  4,  6,  8, 10, 12, 14
                                        ; m1 =  1,  3,  5,  7,  9, 11, 13, 15
%endif
    mova  [dst0q+lenq], m0
    mova  [dst1q+lenq], m1
    add      lenq, mmsize
    jl       .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16_TO_S16P_2CH
INIT_XMM ssse3
CONV_S16_TO_S16P_2CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16_TO_S16P_2CH
%endif
  815. ;------------------------------------------------------------------------------
  816. ; void ff_conv_s16_to_s16p_6ch(int16_t *const *dst, int16_t *src, int len,
  817. ; int channels);
  818. ;------------------------------------------------------------------------------
%macro CONV_S16_TO_S16P_6CH 0
; Deinterleave packed 6-channel s16 into 6 planar s16 channels.
%if ARCH_X86_64
cglobal conv_s16_to_s16p_6ch, 3,8,5, dst, src, len, dst1, dst2, dst3, dst4, dst5
%else
cglobal conv_s16_to_s16p_6ch, 2,7,5, dst, src, dst1, dst2, dst3, dst4, dst5
%define lend dword r2m                  ; x86-32: too few regs, keep len in memory
%endif
    mov      dst1q, [dstq+  gprsize]
    mov      dst2q, [dstq+2*gprsize]
    mov      dst3q, [dstq+3*gprsize]
    mov      dst4q, [dstq+4*gprsize]
    mov      dst5q, [dstq+5*gprsize]
    mov      dstq, [dstq          ]
    ; keep channels 1-5 as offsets from dst[0] so only dstq advances
    sub      dst1q, dstq
    sub      dst2q, dstq
    sub      dst3q, dstq
    sub      dst4q, dstq
    sub      dst5q, dstq
.loop:
    mova     m0, [srcq+0*mmsize]        ; m0 =  0,  1,  2,  3,  4,  5,  6,  7
    mova     m3, [srcq+1*mmsize]        ; m3 =  8,  9, 10, 11, 12, 13, 14, 15
    mova     m2, [srcq+2*mmsize]        ; m2 = 16, 17, 18, 19, 20, 21, 22, 23
    PALIGNR  m1, m3, m0, 12, m4         ; m1 =  6,  7,  8,  9, 10, 11,  x,  x
    shufps   m3, m2, q1032              ; m3 = 12, 13, 14, 15, 16, 17, 18, 19
    psrldq   m2, 4                      ; m2 = 18, 19, 20, 21, 22, 23,  x,  x
    SBUTTERFLY2 wd, 0, 1, 4             ; m0 =  0,  6,  1,  7,  2,  8,  3,  9
                                        ; m1 =  4, 10,  5, 11,  x,  x,  x,  x
    SBUTTERFLY2 wd, 3, 2, 4             ; m3 = 12, 18, 13, 19, 14, 20, 15, 21
                                        ; m2 = 16, 22, 17, 23,  x,  x,  x,  x
    SBUTTERFLY2 dq, 0, 3, 4             ; m0 =  0,  6, 12, 18,  1,  7, 13, 19
                                        ; m3 =  2,  8, 14, 20,  3,  9, 15, 21
    punpckldq m1, m2                    ; m1 =  4, 10, 16, 22,  5, 11, 17, 23
    ; one channel's 4 samples per qword store
    movq   [dstq      ], m0
    movhps [dstq+dst1q], m0
    movq   [dstq+dst2q], m3
    movhps [dstq+dst3q], m3
    movq   [dstq+dst4q], m1
    movhps [dstq+dst5q], m1
    add      srcq, mmsize*3
    add      dstq, mmsize/2
    sub      lend, mmsize/4             ; samples consumed per channel
    jg       .loop
    REP_RET
%endmacro

%define PALIGNR PALIGNR_MMX
INIT_XMM sse2
CONV_S16_TO_S16P_6CH
%define PALIGNR PALIGNR_SSSE3
INIT_XMM ssse3
CONV_S16_TO_S16P_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16_TO_S16P_6CH
%endif
  873. ;------------------------------------------------------------------------------
  874. ; void ff_conv_s16_to_fltp_2ch(float *const *dst, int16_t *src, int len,
  875. ; int channels);
  876. ;------------------------------------------------------------------------------
  877. %macro CONV_S16_TO_FLTP_2CH 0
  878. cglobal conv_s16_to_fltp_2ch, 3,4,5, dst0, src, len, dst1
  879. lea lenq, [4*lend]
  880. mov dst1q, [dst0q+gprsize]
  881. mov dst0q, [dst0q ]
  882. add srcq, lenq
  883. add dst0q, lenq
  884. add dst1q, lenq
  885. neg lenq
  886. mova m3, [pf_s32_inv_scale]
  887. mova m4, [pw_zero_even]
  888. .loop:
  889. mova m1, [srcq+lenq]
  890. pslld m0, m1, 16
  891. pand m1, m4
  892. cvtdq2ps m0, m0
  893. cvtdq2ps m1, m1
  894. mulps m0, m0, m3
  895. mulps m1, m1, m3
  896. mova [dst0q+lenq], m0
  897. mova [dst1q+lenq], m1
  898. add lenq, mmsize
  899. jl .loop
  900. REP_RET
  901. %endmacro
  902. INIT_XMM sse2
  903. CONV_S16_TO_FLTP_2CH
  904. %if HAVE_AVX_EXTERNAL
  905. INIT_XMM avx
  906. CONV_S16_TO_FLTP_2CH
  907. %endif
  908. ;------------------------------------------------------------------------------
  909. ; void ff_conv_s16_to_fltp_6ch(float *const *dst, int16_t *src, int len,
  910. ; int channels);
  911. ;------------------------------------------------------------------------------
  912. %macro CONV_S16_TO_FLTP_6CH 0
  913. %if ARCH_X86_64
  914. cglobal conv_s16_to_fltp_6ch, 3,8,7, dst, src, len, dst1, dst2, dst3, dst4, dst5
  915. %else
  916. cglobal conv_s16_to_fltp_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5
  917. %define lend dword r2m
  918. %endif
  919. mov dst1q, [dstq+ gprsize]
  920. mov dst2q, [dstq+2*gprsize]
  921. mov dst3q, [dstq+3*gprsize]
  922. mov dst4q, [dstq+4*gprsize]
  923. mov dst5q, [dstq+5*gprsize]
  924. mov dstq, [dstq ]
  925. sub dst1q, dstq
  926. sub dst2q, dstq
  927. sub dst3q, dstq
  928. sub dst4q, dstq
  929. sub dst5q, dstq
  930. mova m6, [pf_s16_inv_scale]
  931. .loop:
  932. mova m0, [srcq+0*mmsize] ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
  933. mova m3, [srcq+1*mmsize] ; m3 = 8, 9, 10, 11, 12, 13, 14, 15
  934. mova m2, [srcq+2*mmsize] ; m2 = 16, 17, 18, 19, 20, 21, 22, 23
  935. PALIGNR m1, m3, m0, 12, m4 ; m1 = 6, 7, 8, 9, 10, 11, x, x
  936. shufps m3, m2, q1032 ; m3 = 12, 13, 14, 15, 16, 17, 18, 19
  937. psrldq m2, 4 ; m2 = 18, 19, 20, 21, 22, 23, x, x
  938. SBUTTERFLY2 wd, 0, 1, 4 ; m0 = 0, 6, 1, 7, 2, 8, 3, 9
  939. ; m1 = 4, 10, 5, 11, x, x, x, x
  940. SBUTTERFLY2 wd, 3, 2, 4 ; m3 = 12, 18, 13, 19, 14, 20, 15, 21
  941. ; m2 = 16, 22, 17, 23, x, x, x, x
  942. SBUTTERFLY2 dq, 0, 3, 4 ; m0 = 0, 6, 12, 18, 1, 7, 13, 19
  943. ; m3 = 2, 8, 14, 20, 3, 9, 15, 21
  944. punpckldq m1, m2 ; m1 = 4, 10, 16, 22, 5, 11, 17, 23
  945. S16_TO_S32_SX 0, 2 ; m0 = 0, 6, 12, 18
  946. ; m2 = 1, 7, 13, 19
  947. S16_TO_S32_SX 3, 4 ; m3 = 2, 8, 14, 20
  948. ; m4 = 3, 9, 15, 21
  949. S16_TO_S32_SX 1, 5 ; m1 = 4, 10, 16, 22
  950. ; m5 = 5, 11, 17, 23
  951. SWAP 1,2,3,4
  952. cvtdq2ps m0, m0
  953. cvtdq2ps m1, m1
  954. cvtdq2ps m2, m2
  955. cvtdq2ps m3, m3
  956. cvtdq2ps m4, m4
  957. cvtdq2ps m5, m5
  958. mulps m0, m6
  959. mulps m1, m6
  960. mulps m2, m6
  961. mulps m3, m6
  962. mulps m4, m6
  963. mulps m5, m6
  964. mova [dstq ], m0
  965. mova [dstq+dst1q], m1
  966. mova [dstq+dst2q], m2
  967. mova [dstq+dst3q], m3
  968. mova [dstq+dst4q], m4
  969. mova [dstq+dst5q], m5
  970. add srcq, mmsize*3
  971. add dstq, mmsize
  972. sub lend, mmsize/4
  973. jg .loop
  974. REP_RET
  975. %endmacro
  976. %define PALIGNR PALIGNR_MMX
  977. INIT_XMM sse2
  978. CONV_S16_TO_FLTP_6CH
  979. %define PALIGNR PALIGNR_SSSE3
  980. INIT_XMM ssse3
  981. CONV_S16_TO_FLTP_6CH
  982. INIT_XMM sse4
  983. CONV_S16_TO_FLTP_6CH
  984. %if HAVE_AVX_EXTERNAL
  985. INIT_XMM avx
  986. CONV_S16_TO_FLTP_6CH
  987. %endif
  988. ;------------------------------------------------------------------------------
  989. ; void ff_conv_flt_to_s16p_2ch(int16_t *const *dst, float *src, int len,
  990. ; int channels);
  991. ;------------------------------------------------------------------------------
  992. %macro CONV_FLT_TO_S16P_2CH 0
  993. cglobal conv_flt_to_s16p_2ch, 3,4,6, dst0, src, len, dst1
  994. lea lenq, [2*lend]
  995. mov dst1q, [dst0q+gprsize]
  996. mov dst0q, [dst0q ]
  997. lea srcq, [srcq+4*lenq]
  998. add dst0q, lenq
  999. add dst1q, lenq
  1000. neg lenq
  1001. mova m5, [pf_s16_scale]
  1002. .loop:
  1003. mova m0, [srcq+4*lenq ]
  1004. mova m1, [srcq+4*lenq+ mmsize]
  1005. mova m2, [srcq+4*lenq+2*mmsize]
  1006. mova m3, [srcq+4*lenq+3*mmsize]
  1007. DEINT2_PS 0, 1, 4
  1008. DEINT2_PS 2, 3, 4
  1009. mulps m0, m0, m5
  1010. mulps m1, m1, m5
  1011. mulps m2, m2, m5
  1012. mulps m3, m3, m5
  1013. cvtps2dq m0, m0
  1014. cvtps2dq m1, m1
  1015. cvtps2dq m2, m2
  1016. cvtps2dq m3, m3
  1017. packssdw m0, m2
  1018. packssdw m1, m3
  1019. mova [dst0q+lenq], m0
  1020. mova [dst1q+lenq], m1
  1021. add lenq, mmsize
  1022. jl .loop
  1023. REP_RET
  1024. %endmacro
  1025. INIT_XMM sse2
  1026. CONV_FLT_TO_S16P_2CH
  1027. %if HAVE_AVX_EXTERNAL
  1028. INIT_XMM avx
  1029. CONV_FLT_TO_S16P_2CH
  1030. %endif
  1031. ;------------------------------------------------------------------------------
  1032. ; void ff_conv_flt_to_s16p_6ch(int16_t *const *dst, float *src, int len,
  1033. ; int channels);
  1034. ;------------------------------------------------------------------------------
  1035. %macro CONV_FLT_TO_S16P_6CH 0
  1036. %if ARCH_X86_64
  1037. cglobal conv_flt_to_s16p_6ch, 3,8,7, dst, src, len, dst1, dst2, dst3, dst4, dst5
  1038. %else
  1039. cglobal conv_flt_to_s16p_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5
  1040. %define lend dword r2m
  1041. %endif
  1042. mov dst1q, [dstq+ gprsize]
  1043. mov dst2q, [dstq+2*gprsize]
  1044. mov dst3q, [dstq+3*gprsize]
  1045. mov dst4q, [dstq+4*gprsize]
  1046. mov dst5q, [dstq+5*gprsize]
  1047. mov dstq, [dstq ]
  1048. sub dst1q, dstq
  1049. sub dst2q, dstq
  1050. sub dst3q, dstq
  1051. sub dst4q, dstq
  1052. sub dst5q, dstq
  1053. mova m6, [pf_s16_scale]
  1054. .loop:
  1055. mulps m0, m6, [srcq+0*mmsize]
  1056. mulps m3, m6, [srcq+1*mmsize]
  1057. mulps m1, m6, [srcq+2*mmsize]
  1058. mulps m4, m6, [srcq+3*mmsize]
  1059. mulps m2, m6, [srcq+4*mmsize]
  1060. mulps m5, m6, [srcq+5*mmsize]
  1061. cvtps2dq m0, m0
  1062. cvtps2dq m1, m1
  1063. cvtps2dq m2, m2
  1064. cvtps2dq m3, m3
  1065. cvtps2dq m4, m4
  1066. cvtps2dq m5, m5
  1067. packssdw m0, m3 ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
  1068. packssdw m1, m4 ; m1 = 8, 9, 10, 11, 12, 13, 14, 15
  1069. packssdw m2, m5 ; m2 = 16, 17, 18, 19, 20, 21, 22, 23
  1070. PALIGNR m3, m1, m0, 12, m4 ; m3 = 6, 7, 8, 9, 10, 11, x, x
  1071. shufps m1, m2, q1032 ; m1 = 12, 13, 14, 15, 16, 17, 18, 19
  1072. psrldq m2, 4 ; m2 = 18, 19, 20, 21, 22, 23, x, x
  1073. SBUTTERFLY2 wd, 0, 3, 4 ; m0 = 0, 6, 1, 7, 2, 8, 3, 9
  1074. ; m3 = 4, 10, 5, 11, x, x, x, x
  1075. SBUTTERFLY2 wd, 1, 2, 4 ; m1 = 12, 18, 13, 19, 14, 20, 15, 21
  1076. ; m2 = 16, 22, 17, 23, x, x, x, x
  1077. SBUTTERFLY2 dq, 0, 1, 4 ; m0 = 0, 6, 12, 18, 1, 7, 13, 19
  1078. ; m1 = 2, 8, 14, 20, 3, 9, 15, 21
  1079. punpckldq m3, m2 ; m3 = 4, 10, 16, 22, 5, 11, 17, 23
  1080. movq [dstq ], m0
  1081. movhps [dstq+dst1q], m0
  1082. movq [dstq+dst2q], m1
  1083. movhps [dstq+dst3q], m1
  1084. movq [dstq+dst4q], m3
  1085. movhps [dstq+dst5q], m3
  1086. add srcq, mmsize*6
  1087. add dstq, mmsize/2
  1088. sub lend, mmsize/4
  1089. jg .loop
  1090. REP_RET
  1091. %endmacro
  1092. %define PALIGNR PALIGNR_MMX
  1093. INIT_XMM sse2
  1094. CONV_FLT_TO_S16P_6CH
  1095. %define PALIGNR PALIGNR_SSSE3
  1096. INIT_XMM ssse3
  1097. CONV_FLT_TO_S16P_6CH
  1098. %if HAVE_AVX_EXTERNAL
  1099. INIT_XMM avx
  1100. CONV_FLT_TO_S16P_6CH
  1101. %endif
  1102. ;------------------------------------------------------------------------------
  1103. ; void ff_conv_flt_to_fltp_2ch(float *const *dst, float *src, int len,
  1104. ; int channels);
  1105. ;------------------------------------------------------------------------------
  1106. %macro CONV_FLT_TO_FLTP_2CH 0
  1107. cglobal conv_flt_to_fltp_2ch, 3,4,3, dst0, src, len, dst1
  1108. lea lenq, [4*lend]
  1109. mov dst1q, [dst0q+gprsize]
  1110. mov dst0q, [dst0q ]
  1111. lea srcq, [srcq+2*lenq]
  1112. add dst0q, lenq
  1113. add dst1q, lenq
  1114. neg lenq
  1115. .loop:
  1116. mova m0, [srcq+2*lenq ]
  1117. mova m1, [srcq+2*lenq+mmsize]
  1118. DEINT2_PS 0, 1, 2
  1119. mova [dst0q+lenq], m0
  1120. mova [dst1q+lenq], m1
  1121. add lenq, mmsize
  1122. jl .loop
  1123. REP_RET
  1124. %endmacro
  1125. INIT_XMM sse
  1126. CONV_FLT_TO_FLTP_2CH
  1127. %if HAVE_AVX_EXTERNAL
  1128. INIT_XMM avx
  1129. CONV_FLT_TO_FLTP_2CH
  1130. %endif
  1131. ;------------------------------------------------------------------------------
  1132. ; void ff_conv_flt_to_fltp_6ch(float *const *dst, float *src, int len,
  1133. ; int channels);
  1134. ;------------------------------------------------------------------------------
  1135. %macro CONV_FLT_TO_FLTP_6CH 0
  1136. %if ARCH_X86_64
  1137. cglobal conv_flt_to_fltp_6ch, 3,8,7, dst, src, len, dst1, dst2, dst3, dst4, dst5
  1138. %else
  1139. cglobal conv_flt_to_fltp_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5
  1140. %define lend dword r2m
  1141. %endif
  1142. mov dst1q, [dstq+ gprsize]
  1143. mov dst2q, [dstq+2*gprsize]
  1144. mov dst3q, [dstq+3*gprsize]
  1145. mov dst4q, [dstq+4*gprsize]
  1146. mov dst5q, [dstq+5*gprsize]
  1147. mov dstq, [dstq ]
  1148. sub dst1q, dstq
  1149. sub dst2q, dstq
  1150. sub dst3q, dstq
  1151. sub dst4q, dstq
  1152. sub dst5q, dstq
  1153. .loop:
  1154. mova m0, [srcq+0*mmsize] ; m0 = 0, 1, 2, 3
  1155. mova m1, [srcq+1*mmsize] ; m1 = 4, 5, 6, 7
  1156. mova m2, [srcq+2*mmsize] ; m2 = 8, 9, 10, 11
  1157. mova m3, [srcq+3*mmsize] ; m3 = 12, 13, 14, 15
  1158. mova m4, [srcq+4*mmsize] ; m4 = 16, 17, 18, 19
  1159. mova m5, [srcq+5*mmsize] ; m5 = 20, 21, 22, 23
  1160. SBUTTERFLY2 dq, 0, 3, 6 ; m0 = 0, 12, 1, 13
  1161. ; m3 = 2, 14, 3, 15
  1162. SBUTTERFLY2 dq, 1, 4, 6 ; m1 = 4, 16, 5, 17
  1163. ; m4 = 6, 18, 7, 19
  1164. SBUTTERFLY2 dq, 2, 5, 6 ; m2 = 8, 20, 9, 21
  1165. ; m5 = 10, 22, 11, 23
  1166. SBUTTERFLY2 dq, 0, 4, 6 ; m0 = 0, 6, 12, 18
  1167. ; m4 = 1, 7, 13, 19
  1168. SBUTTERFLY2 dq, 3, 2, 6 ; m3 = 2, 8, 14, 20
  1169. ; m2 = 3, 9, 15, 21
  1170. SBUTTERFLY2 dq, 1, 5, 6 ; m1 = 4, 10, 16, 22
  1171. ; m5 = 5, 11, 17, 23
  1172. mova [dstq ], m0
  1173. mova [dstq+dst1q], m4
  1174. mova [dstq+dst2q], m3
  1175. mova [dstq+dst3q], m2
  1176. mova [dstq+dst4q], m1
  1177. mova [dstq+dst5q], m5
  1178. add srcq, mmsize*6
  1179. add dstq, mmsize
  1180. sub lend, mmsize/4
  1181. jg .loop
  1182. REP_RET
  1183. %endmacro
  1184. INIT_XMM sse2
  1185. CONV_FLT_TO_FLTP_6CH
  1186. %if HAVE_AVX_EXTERNAL
  1187. INIT_XMM avx
  1188. CONV_FLT_TO_FLTP_6CH
  1189. %endif