; NOTE(review): the following repository-viewer chrome was captured together
; with this file and is not part of the source:
;   "You can not select more than 25 topics ..." / "1269 lines" / "43KB"
  1. ;******************************************************************************
  2. ;* x86 optimized Format Conversion Utils
  3. ;* Copyright (c) 2008 Loren Merritt
  4. ;* Copyright (c) 2012 Justin Ruggles <justin.ruggles@gmail.com>
  5. ;*
  6. ;* This file is part of Libav.
  7. ;*
  8. ;* Libav is free software; you can redistribute it and/or
  9. ;* modify it under the terms of the GNU Lesser General Public
  10. ;* License as published by the Free Software Foundation; either
  11. ;* version 2.1 of the License, or (at your option) any later version.
  12. ;*
  13. ;* Libav is distributed in the hope that it will be useful,
  14. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16. ;* Lesser General Public License for more details.
  17. ;*
  18. ;* You should have received a copy of the GNU Lesser General Public
  19. ;* License along with Libav; if not, write to the Free Software
  20. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21. ;******************************************************************************
  22. %include "x86inc.asm"
  23. %include "x86util.asm"
  24. %include "util.asm"
SECTION_RODATA 32               ; 32-byte alignment: constants are also loaded as ymm

; Scale factors stored as raw IEEE-754 single-precision bit patterns:
pf_s32_inv_scale: times 8 dd 0x30000000     ; 2^-31 (s32 range -> [-1.0,1.0])
pf_s32_scale:     times 8 dd 0x4f000000     ; 2^31  ([-1.0,1.0] -> s32 range)
pf_s32_clip:      times 8 dd 0x4effffff     ; largest float below 2^31 (positive clip
                                            ; before float->s32 conversion)
pf_s16_inv_scale: times 4 dd 0x38000000     ; 2^-15 (s16 range -> [-1.0,1.0])
pf_s16_scale:     times 4 dd 0x47000000     ; 2^15 = 32768.0 ([-1.0,1.0] -> s16 range)

; pshufb masks (byte index -1 writes zero): expand selected s16 words into the
; high 16 bits of s32 dwords, i.e. value * 2^16 with a zeroed low half.
pb_shuf_unpack_even: db -1, -1, 0, 1, -1, -1, 2, 3, -1, -1, 8, 9, -1, -1, 10, 11
pb_shuf_unpack_odd:  db -1, -1, 4, 5, -1, -1, 6, 7, -1, -1, 12, 13, -1, -1, 14, 15

; word-granularity pshufb masks built by SHUFFLE_MASK_W (x86util.asm)
pb_interleave_words:   SHUFFLE_MASK_W 0, 4, 1, 5, 2, 6, 3, 7
pb_deinterleave_words: SHUFFLE_MASK_W 0, 2, 4, 6, 1, 3, 5, 7

; per-dword mask: low word zeroed, high word kept
pw_zero_even: times 4 dw 0x0000, 0xffff

SECTION_TEXT
;------------------------------------------------------------------------------
; void ff_conv_s16_to_s32(int32_t *dst, const int16_t *src, int len);
;
; Widen packed s16 to s32 by placing each sample in the high 16 bits of a
; dword (value * 2^16); the low 16 bits are zero.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal conv_s16_to_s32, 3,3,3, dst, src, len
    lea       lenq, [2*lend]            ; lenq = len * sizeof(int16_t)
    lea       dstq, [dstq+2*lenq]       ; point both pointers at their ends and
    add       srcq, lenq                ; count lenq up from -bytes to 0
    neg       lenq
.loop:
    mova      m2, [srcq+lenq]
    pxor      m0, m0
    pxor      m1, m1
    punpcklwd m0, m2                    ; zeros interleave below each sample:
    punpckhwd m1, m2                    ; sample ends up in the dword's high word
    mova      [dstq+2*lenq       ], m0
    mova      [dstq+2*lenq+mmsize], m1
    add       lenq, mmsize
    jl .loop
    REP_RET
;------------------------------------------------------------------------------
; void ff_conv_s16_to_flt(float *dst, const int16_t *src, int len);
;
; Convert packed s16 to float in [-1.0,1.0] (scale by 2^-15).
;------------------------------------------------------------------------------
%macro CONV_S16_TO_FLT 0
cglobal conv_s16_to_flt, 3,3,3, dst, src, len
    lea       lenq, [2*lend]            ; lenq = len * sizeof(int16_t)
    add       srcq, lenq
    lea       dstq, [dstq + 2*lenq]     ; dst is float, advances twice as fast
    neg       lenq
    mova      m2, [pf_s16_inv_scale]    ; 2^-15
    ALIGN 16
.loop:
    mova      m0, [srcq+lenq]
    S16_TO_S32_SX 0, 1                  ; s16 -> s32 sign extension into m0/m1
                                        ; (helper from util.asm; sse4 variant uses
                                        ; pmovsxwd — hence the two instantiations)
    cvtdq2ps  m0, m0
    cvtdq2ps  m1, m1
    mulps     m0, m2
    mulps     m1, m2
    mova      [dstq+2*lenq       ], m0
    mova      [dstq+2*lenq+mmsize], m1
    add       lenq, mmsize
    jl .loop
    REP_RET
%endmacro
INIT_XMM sse2
CONV_S16_TO_FLT
INIT_XMM sse4
CONV_S16_TO_FLT
;------------------------------------------------------------------------------
; void ff_conv_s32_to_s16(int16_t *dst, const int32_t *src, int len);
;
; Narrow packed s32 to s16 by keeping the 16 most-significant bits of each
; sample (arithmetic shift right by 16, then pack).
;------------------------------------------------------------------------------
%macro CONV_S32_TO_S16 0
cglobal conv_s32_to_s16, 3,3,4, dst, src, len
    lea       lenq, [2*lend]            ; lenq = len * sizeof(int16_t)
    lea       srcq, [srcq+2*lenq]       ; src is s32, advances twice as fast
    add       dstq, lenq
    neg       lenq
.loop:
    mova      m0, [srcq+2*lenq         ]
    mova      m1, [srcq+2*lenq+  mmsize]
    mova      m2, [srcq+2*lenq+2*mmsize]
    mova      m3, [srcq+2*lenq+3*mmsize]
    psrad     m0, 16                    ; keep high 16 bits of each s32
    psrad     m1, 16
    psrad     m2, 16
    psrad     m3, 16
    packssdw  m0, m1                    ; values already fit in s16 after the
    packssdw  m2, m3                    ; shift, so the saturation is a no-op
    mova      [dstq+lenq       ], m0
    mova      [dstq+lenq+mmsize], m2
    add       lenq, mmsize*2
    jl .loop
%if mmsize == 8
    emms                                ; mmx variant must clear the FPU tag word
    RET
%else
    REP_RET
%endif
%endmacro
INIT_MMX mmx
CONV_S32_TO_S16
INIT_XMM sse2
CONV_S32_TO_S16
;------------------------------------------------------------------------------
; void ff_conv_s32_to_flt(float *dst, const int32_t *src, int len);
;
; Convert packed s32 to float in [-1.0,1.0] (scale by 2^-31).
;------------------------------------------------------------------------------
%macro CONV_S32_TO_FLT 0
cglobal conv_s32_to_flt, 3,3,3, dst, src, len
    lea       lenq, [4*lend]            ; lenq = len * 4 (same size for s32/float)
    add       srcq, lenq
    add       dstq, lenq
    neg       lenq
    mova      m0, [pf_s32_inv_scale]    ; 2^-31
    ALIGN 16
.loop:
    cvtdq2ps  m1, [srcq+lenq       ]
    cvtdq2ps  m2, [srcq+lenq+mmsize]
    mulps     m1, m1, m0                ; 3-operand form: emulated by x86inc for
    mulps     m2, m2, m0                ; sse2, native vmulps for the avx build
    mova      [dstq+lenq       ], m1
    mova      [dstq+lenq+mmsize], m2
    add       lenq, mmsize*2
    jl .loop
    REP_RET
%endmacro
INIT_XMM sse2
CONV_S32_TO_FLT
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
CONV_S32_TO_FLT
%endif
;------------------------------------------------------------------------------
; void ff_conv_flt_to_s16(int16_t *dst, const float *src, int len);
;
; Convert float in [-1.0,1.0] to packed s16: scale by 2^15, convert to s32,
; then pack with signed saturation (packssdw clamps out-of-range input).
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal conv_flt_to_s16, 3,3,5, dst, src, len
    lea       lenq, [2*lend]            ; lenq = len * sizeof(int16_t)
    lea       srcq, [srcq+2*lenq]       ; src is float, advances twice as fast
    add       dstq, lenq
    neg       lenq
    mova      m4, [pf_s16_scale]        ; 32768.0
.loop:
    mova      m0, [srcq+2*lenq         ]
    mova      m1, [srcq+2*lenq+1*mmsize]
    mova      m2, [srcq+2*lenq+2*mmsize]
    mova      m3, [srcq+2*lenq+3*mmsize]
    mulps     m0, m4
    mulps     m1, m4
    mulps     m2, m4
    mulps     m3, m4
    cvtps2dq  m0, m0
    cvtps2dq  m1, m1
    cvtps2dq  m2, m2
    cvtps2dq  m3, m3
    packssdw  m0, m1                    ; saturating pack also performs clipping
    packssdw  m2, m3
    mova      [dstq+lenq       ], m0
    mova      [dstq+lenq+mmsize], m2
    add       lenq, mmsize*2
    jl .loop
    REP_RET
;------------------------------------------------------------------------------
; void ff_conv_flt_to_s32(int32_t *dst, const float *src, int len);
;
; Convert float in [-1.0,1.0] to packed s32: scale by 2^31, clip the positive
; side below 2^31 (a float >= 2^31 would convert to INT_MIN), then convert.
;------------------------------------------------------------------------------
%macro CONV_FLT_TO_S32 0
cglobal conv_flt_to_s32, 3,3,6, dst, src, len
    lea       lenq, [lend*4]            ; lenq = len * 4 (same size for float/s32)
    add       srcq, lenq
    add       dstq, lenq
    neg       lenq
    mova      m4, [pf_s32_scale]        ; 2^31
    mova      m5, [pf_s32_clip]         ; largest float < 2^31
.loop:
    mulps     m0, m4, [srcq+lenq         ]
    mulps     m1, m4, [srcq+lenq+1*mmsize]
    mulps     m2, m4, [srcq+lenq+2*mmsize]
    mulps     m3, m4, [srcq+lenq+3*mmsize]
    minps     m0, m0, m5                ; clip positive overflow before cvt
    minps     m1, m1, m5
    minps     m2, m2, m5
    minps     m3, m3, m5
    cvtps2dq  m0, m0
    cvtps2dq  m1, m1
    cvtps2dq  m2, m2
    cvtps2dq  m3, m3
    mova      [dstq+lenq         ], m0
    mova      [dstq+lenq+1*mmsize], m1
    mova      [dstq+lenq+2*mmsize], m2
    mova      [dstq+lenq+3*mmsize], m3
    add       lenq, mmsize*4
    jl .loop
    REP_RET
%endmacro
INIT_XMM sse2
CONV_FLT_TO_S32
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
CONV_FLT_TO_S32
%endif
  216. ;------------------------------------------------------------------------------
  217. ; void ff_conv_s16p_to_s16_2ch(int16_t *dst, int16_t *const *src, int len,
  218. ; int channels);
  219. ;------------------------------------------------------------------------------
  220. %macro CONV_S16P_TO_S16_2CH 0
  221. cglobal conv_s16p_to_s16_2ch, 3,4,5, dst, src0, len, src1
  222. mov src1q, [src0q+gprsize]
  223. mov src0q, [src0q ]
  224. lea lenq, [2*lend]
  225. add src0q, lenq
  226. add src1q, lenq
  227. lea dstq, [dstq+2*lenq]
  228. neg lenq
  229. .loop
  230. mova m0, [src0q+lenq ]
  231. mova m1, [src1q+lenq ]
  232. mova m2, [src0q+lenq+mmsize]
  233. mova m3, [src1q+lenq+mmsize]
  234. SBUTTERFLY2 wd, 0, 1, 4
  235. SBUTTERFLY2 wd, 2, 3, 4
  236. mova [dstq+2*lenq+0*mmsize], m0
  237. mova [dstq+2*lenq+1*mmsize], m1
  238. mova [dstq+2*lenq+2*mmsize], m2
  239. mova [dstq+2*lenq+3*mmsize], m3
  240. add lenq, 2*mmsize
  241. jl .loop
  242. REP_RET
  243. %endmacro
  244. INIT_XMM sse2
  245. CONV_S16P_TO_S16_2CH
  246. %if HAVE_AVX_EXTERNAL
  247. INIT_XMM avx
  248. CONV_S16P_TO_S16_2CH
  249. %endif
;------------------------------------------------------------------------------
; void ff_conv_s16p_to_s16_6ch(int16_t *dst, int16_t *const *src, int len,
;                              int channels);
;
; Interleave 6 planar s16 channels into one packed s16 stream.
;------------------------------------------------------------------------------
;------------------------------------------------------------------------------
; NOTE: In the 6-channel functions, len could be used as an index on x86-64
;       instead of just a counter, which would avoid incrementing the
;       pointers, but the extra complexity and amount of code is not worth
;       the small gain. On x86-32 there are not enough registers to use len
;       as an index without keeping two of the pointers on the stack and
;       loading them in each iteration.
;------------------------------------------------------------------------------
%macro CONV_S16P_TO_S16_6CH 0
%if ARCH_X86_64
cglobal conv_s16p_to_s16_6ch, 3,8,7, dst, src0, len, src1, src2, src3, src4, src5
%else
cglobal conv_s16p_to_s16_6ch, 2,7,7, dst, src0, src1, src2, src3, src4, src5
%define lend dword r2m              ; x86-32: not enough regs, keep len in memory
%endif
    mov       src1q, [src0q+1*gprsize]
    mov       src2q, [src0q+2*gprsize]
    mov       src3q, [src0q+3*gprsize]
    mov       src4q, [src0q+4*gprsize]
    mov       src5q, [src0q+5*gprsize]
    mov       src0q, [src0q]
    ; keep channels 1-5 as offsets relative to src0 so that only src0 needs to
    ; be incremented in the loop
    sub       src1q, src0q
    sub       src2q, src0q
    sub       src3q, src0q
    sub       src4q, src0q
    sub       src5q, src0q
.loop:
%if cpuflag(sse2slow)
    ; half-width variant for CPUs where full-width shuffles are slow:
    ; process 4 samples per channel with 64-bit loads/stores
    movq      m0, [src0q      ]         ; m0 =  0,  6, 12, 18,  x,  x,  x,  x
    movq      m1, [src0q+src1q]         ; m1 =  1,  7, 13, 19,  x,  x,  x,  x
    movq      m2, [src0q+src2q]         ; m2 =  2,  8, 14, 20,  x,  x,  x,  x
    movq      m3, [src0q+src3q]         ; m3 =  3,  9, 15, 21,  x,  x,  x,  x
    movq      m4, [src0q+src4q]         ; m4 =  4, 10, 16, 22,  x,  x,  x,  x
    movq      m5, [src0q+src5q]         ; m5 =  5, 11, 17, 23,  x,  x,  x,  x
    ; unpack words:
    punpcklwd m0, m1                    ; m0 =  0,  1,  6,  7, 12, 13, 18, 19
    punpcklwd m2, m3                    ; m2 =  4,  5, 10, 11, 16, 17, 22, 23
    punpcklwd m4, m5                    ; m4 =  2,  3,  8,  9, 14, 15, 20, 21
    ; blend dwords
    shufps    m1, m0, m2, q2020         ; m1 =  0,  1, 12, 13,  2,  3, 14, 15
    shufps    m0, m4, q2031             ; m0 =  6,  7, 18, 19,  4,  5, 16, 17
    shufps    m2, m4, q3131             ; m2 =  8,  9, 20, 21, 10, 11, 22, 23
    ; shuffle dwords
    pshufd    m0, m0, q1302             ; m0 =  4,  5,  6,  7, 16, 17, 18, 19
    pshufd    m1, m1, q3120             ; m1 =  0,  1,  2,  3, 12, 13, 14, 15
    pshufd    m2, m2, q3120             ; m2 =  8,  9, 10, 11, 20, 21, 22, 23
    movq      [dstq+0*mmsize/2], m1
    movq      [dstq+1*mmsize/2], m0
    movq      [dstq+2*mmsize/2], m2
    movhps    [dstq+3*mmsize/2], m1
    movhps    [dstq+4*mmsize/2], m0
    movhps    [dstq+5*mmsize/2], m2
    add       src0q, mmsize/2
    add       dstq, mmsize*3
    sub       lend, mmsize/4
%else
    ; full-width variant: 8 samples per channel per iteration
    mova      m0, [src0q      ]         ; m0 =  0,  6, 12, 18, 24, 30, 36, 42
    mova      m1, [src0q+src1q]         ; m1 =  1,  7, 13, 19, 25, 31, 37, 43
    mova      m2, [src0q+src2q]         ; m2 =  2,  8, 14, 20, 26, 32, 38, 44
    mova      m3, [src0q+src3q]         ; m3 =  3,  9, 15, 21, 27, 33, 39, 45
    mova      m4, [src0q+src4q]         ; m4 =  4, 10, 16, 22, 28, 34, 40, 46
    mova      m5, [src0q+src5q]         ; m5 =  5, 11, 17, 23, 29, 35, 41, 47
    ; unpack words:
    SBUTTERFLY2 wd, 0, 1, 6             ; m0 =  0,  1,  6,  7, 12, 13, 18, 19
                                        ; m1 = 24, 25, 30, 31, 36, 37, 42, 43
    SBUTTERFLY2 wd, 2, 3, 6             ; m2 =  2,  3,  8,  9, 14, 15, 20, 21
                                        ; m3 = 26, 27, 32, 33, 38, 39, 44, 45
    SBUTTERFLY2 wd, 4, 5, 6             ; m4 =  4,  5, 10, 11, 16, 17, 22, 23
                                        ; m5 = 28, 29, 34, 35, 40, 41, 46, 47
    ; blend dwords
    shufps    m6, m0, m2, q2020         ; m6 =  0,  1, 12, 13,  2,  3, 14, 15
    shufps    m0, m4, q2031             ; m0 =  6,  7, 18, 19,  4,  5, 16, 17
    shufps    m2, m4, q3131             ; m2 =  8,  9, 20, 21, 10, 11, 22, 23
    SWAP 4,6                            ; m4 =  0,  1, 12, 13,  2,  3, 14, 15
    shufps    m6, m1, m3, q2020         ; m6 = 24, 25, 36, 37, 26, 27, 38, 39
    shufps    m1, m5, q2031             ; m1 = 30, 31, 42, 43, 28, 29, 40, 41
    shufps    m3, m5, q3131             ; m3 = 32, 33, 44, 45, 34, 35, 46, 47
    SWAP 5,6                            ; m5 = 24, 25, 36, 37, 26, 27, 38, 39
    ; shuffle dwords
    pshufd    m0, m0, q1302             ; m0 =  4,  5,  6,  7, 16, 17, 18, 19
    pshufd    m2, m2, q3120             ; m2 =  8,  9, 10, 11, 20, 21, 22, 23
    pshufd    m4, m4, q3120             ; m4 =  0,  1,  2,  3, 12, 13, 14, 15
    pshufd    m1, m1, q1302             ; m1 = 28, 29, 30, 31, 40, 41, 42, 43
    pshufd    m3, m3, q3120             ; m3 = 32, 33, 34, 35, 44, 45, 46, 47
    pshufd    m5, m5, q3120             ; m5 = 24, 25, 26, 27, 36, 37, 38, 39
    ; shuffle qwords
    punpcklqdq m6, m4, m0               ; m6 =  0,  1,  2,  3,  4,  5,  6,  7
    punpckhqdq m0, m2                   ; m0 = 16, 17, 18, 19, 20, 21, 22, 23
    shufps    m2, m4, q3210             ; m2 =  8,  9, 10, 11, 12, 13, 14, 15
    SWAP 4,6                            ; m4 =  0,  1,  2,  3,  4,  5,  6,  7
    punpcklqdq m6, m5, m1               ; m6 = 24, 25, 26, 27, 28, 29, 30, 31
    punpckhqdq m1, m3                   ; m1 = 40, 41, 42, 43, 44, 45, 46, 47
    shufps    m3, m5, q3210             ; m3 = 32, 33, 34, 35, 36, 37, 38, 39
    SWAP 5,6                            ; m5 = 24, 25, 26, 27, 28, 29, 30, 31
    mova      [dstq+0*mmsize], m4
    mova      [dstq+1*mmsize], m2
    mova      [dstq+2*mmsize], m0
    mova      [dstq+3*mmsize], m5
    mova      [dstq+4*mmsize], m3
    mova      [dstq+5*mmsize], m1
    add       src0q, mmsize
    add       dstq, mmsize*6
    sub       lend, mmsize/2
%endif
    jg .loop
    REP_RET
%endmacro
INIT_XMM sse2
CONV_S16P_TO_S16_6CH
INIT_XMM sse2slow
CONV_S16P_TO_S16_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16P_TO_S16_6CH
%endif
;------------------------------------------------------------------------------
; void ff_conv_s16p_to_flt_2ch(float *dst, int16_t *const *src, int len,
;                              int channels);
;
; Interleave 2 planar s16 channels and convert to packed float in [-1.0,1.0].
; Samples are unpacked into the high word of each dword (value * 2^16), so the
; s32 inverse scale 2^-31 yields the s16 scaling 2^-15 overall.
;------------------------------------------------------------------------------
%macro CONV_S16P_TO_FLT_2CH 0
cglobal conv_s16p_to_flt_2ch, 3,4,6, dst, src0, len, src1
    lea       lenq, [2*lend]            ; lenq = per-channel byte length
    mov       src1q, [src0q+gprsize]    ; src1 = src[1]
    mov       src0q, [src0q        ]    ; src0 = src[0]
    lea       dstq, [dstq+4*lenq]       ; dst: 2 channels * float = 4x the bytes
    add       src0q, lenq
    add       src1q, lenq
    neg       lenq
    mova      m5, [pf_s32_inv_scale]    ; 2^-31
.loop:
    mova      m2, [src0q+lenq]          ; m2 =  0,  2,  4,  6,  8, 10, 12, 14
    mova      m4, [src1q+lenq]          ; m4 =  1,  3,  5,  7,  9, 11, 13, 15
    SBUTTERFLY2 wd, 2, 4, 3             ; m2 =  0,  1,  2,  3,  4,  5,  6,  7
                                        ; m4 =  8,  9, 10, 11, 12, 13, 14, 15
    pxor      m3, m3                    ; zeros go in the low word of each dword
    punpcklwd m0, m3, m2                ; m0 =  0,  1,  2,  3
    punpckhwd m1, m3, m2                ; m1 =  4,  5,  6,  7
    punpcklwd m2, m3, m4                ; m2 =  8,  9, 10, 11
    punpckhwd m3, m4                    ; m3 = 12, 13, 14, 15
    cvtdq2ps  m0, m0
    cvtdq2ps  m1, m1
    cvtdq2ps  m2, m2
    cvtdq2ps  m3, m3
    mulps     m0, m5
    mulps     m1, m5
    mulps     m2, m5
    mulps     m3, m5
    mova      [dstq+4*lenq         ], m0
    mova      [dstq+4*lenq+  mmsize], m1
    mova      [dstq+4*lenq+2*mmsize], m2
    mova      [dstq+4*lenq+3*mmsize], m3
    add       lenq, mmsize
    jl .loop
    REP_RET
%endmacro
INIT_XMM sse2
CONV_S16P_TO_FLT_2CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16P_TO_FLT_2CH
%endif
;------------------------------------------------------------------------------
; void ff_conv_s16p_to_flt_6ch(float *dst, int16_t *const *src, int len,
;                              int channels);
;
; Interleave 6 planar s16 channels and convert to packed float in [-1.0,1.0].
; As in the 2ch version, samples are expanded into the high word of each dword
; and scaled by 2^-31 (net scaling 2^-15).
;------------------------------------------------------------------------------
%macro CONV_S16P_TO_FLT_6CH 0
%if ARCH_X86_64
cglobal conv_s16p_to_flt_6ch, 3,8,8, dst, src, len, src1, src2, src3, src4, src5
%else
cglobal conv_s16p_to_flt_6ch, 2,7,8, dst, src, src1, src2, src3, src4, src5
%define lend dword r2m              ; x86-32: not enough regs, keep len in memory
%endif
    mov       src1q, [srcq+1*gprsize]
    mov       src2q, [srcq+2*gprsize]
    mov       src3q, [srcq+3*gprsize]
    mov       src4q, [srcq+4*gprsize]
    mov       src5q, [srcq+5*gprsize]
    mov       srcq, [srcq]
    ; keep channels 1-5 as offsets from src0; only srcq is incremented
    sub       src1q, srcq
    sub       src2q, srcq
    sub       src3q, srcq
    sub       src4q, srcq
    sub       src5q, srcq
    mova      m7, [pf_s32_inv_scale]    ; 2^-31
%if cpuflag(ssse3)
    %define unpack_even m6
    mova      m6, [pb_shuf_unpack_even]
%if ARCH_X86_64
    %define unpack_odd m8
    mova      m8, [pb_shuf_unpack_odd]
%else
    ; x86-32 has only 8 xmm regs; read the odd mask from memory each time
    %define unpack_odd [pb_shuf_unpack_odd]
%endif
%endif
.loop:
    movq      m0, [srcq      ]          ; m0 =  0,  6, 12, 18,  x,  x,  x,  x
    movq      m1, [srcq+src1q]          ; m1 =  1,  7, 13, 19,  x,  x,  x,  x
    movq      m2, [srcq+src2q]          ; m2 =  2,  8, 14, 20,  x,  x,  x,  x
    movq      m3, [srcq+src3q]          ; m3 =  3,  9, 15, 21,  x,  x,  x,  x
    movq      m4, [srcq+src4q]          ; m4 =  4, 10, 16, 22,  x,  x,  x,  x
    movq      m5, [srcq+src5q]          ; m5 =  5, 11, 17, 23,  x,  x,  x,  x
    ; unpack words:
    punpcklwd m0, m1                    ; m0 =  0,  1,  6,  7, 12, 13, 18, 19
    punpcklwd m2, m3                    ; m2 =  2,  3,  8,  9, 14, 15, 20, 21
    punpcklwd m4, m5                    ; m4 =  4,  5, 10, 11, 16, 17, 22, 23
    ; blend dwords
    shufps    m1, m4, m0, q3120         ; m1 =  4,  5, 16, 17,  6,  7, 18, 19
    shufps    m0, m2, q2020             ; m0 =  0,  1, 12, 13,  2,  3, 14, 15
    shufps    m2, m4, q3131             ; m2 =  8,  9, 20, 21, 10, 11, 22, 23
%if cpuflag(ssse3)
    ; pshufb both sorts the words and expands them into dword high halves
    pshufb    m3, m0, unpack_odd        ; m3 = 12, 13, 14, 15
    pshufb    m0, unpack_even           ; m0 =  0,  1,  2,  3
    pshufb    m4, m1, unpack_odd        ; m4 = 16, 17, 18, 19
    pshufb    m1, unpack_even           ; m1 =  4,  5,  6,  7
    pshufb    m5, m2, unpack_odd        ; m5 = 20, 21, 22, 23
    pshufb    m2, unpack_even           ; m2 =  8,  9, 10, 11
%else
    ; shuffle dwords
    pshufd    m0, m0, q3120             ; m0 =  0,  1,  2,  3, 12, 13, 14, 15
    pshufd    m1, m1, q3120             ; m1 =  4,  5,  6,  7, 16, 17, 18, 19
    pshufd    m2, m2, q3120             ; m2 =  8,  9, 10, 11, 20, 21, 22, 23
    pxor      m6, m6                    ; convert s16 in m0-m2 to s32 in m0-m5
    punpcklwd m3, m6, m0                ; m3 =  0,  1,  2,  3
    punpckhwd m4, m6, m0                ; m4 = 12, 13, 14, 15
    punpcklwd m0, m6, m1                ; m0 =  4,  5,  6,  7
    punpckhwd m5, m6, m1                ; m5 = 16, 17, 18, 19
    punpcklwd m1, m6, m2                ; m1 =  8,  9, 10, 11
    punpckhwd m6, m2                    ; m6 = 20, 21, 22, 23
    SWAP 6,2,1,0,3,4,5                  ; swap registers 3,0,1,4,5,6 to 0,1,2,3,4,5
%endif
    cvtdq2ps  m0, m0                    ; convert s32 to float
    cvtdq2ps  m1, m1
    cvtdq2ps  m2, m2
    cvtdq2ps  m3, m3
    cvtdq2ps  m4, m4
    cvtdq2ps  m5, m5
    mulps     m0, m7                    ; scale float from s32 range to [-1.0,1.0]
    mulps     m1, m7
    mulps     m2, m7
    mulps     m3, m7
    mulps     m4, m7
    mulps     m5, m7
    mova      [dstq         ], m0
    mova      [dstq+  mmsize], m1
    mova      [dstq+2*mmsize], m2
    mova      [dstq+3*mmsize], m3
    mova      [dstq+4*mmsize], m4
    mova      [dstq+5*mmsize], m5
    add       srcq, mmsize/2
    add       dstq, mmsize*6
    sub       lend, mmsize/4
    jg .loop
    REP_RET
%endmacro
INIT_XMM sse2
CONV_S16P_TO_FLT_6CH
INIT_XMM ssse3
CONV_S16P_TO_FLT_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16P_TO_FLT_6CH
%endif
;------------------------------------------------------------------------------
; void ff_conv_fltp_to_s16_2ch(int16_t *dst, float *const *src, int len,
;                              int channels);
;
; Interleave 2 planar float channels and convert to packed s16 (scale by 2^15;
; packssdw saturates out-of-range samples).
;------------------------------------------------------------------------------
%macro CONV_FLTP_TO_S16_2CH 0
cglobal conv_fltp_to_s16_2ch, 3,4,3, dst, src0, len, src1
    lea       lenq, [4*lend]            ; lenq = per-channel byte length (float)
    mov       src1q, [src0q+gprsize]    ; src1 = src[1]
    mov       src0q, [src0q        ]    ; src0 = src[0]
    add       dstq, lenq                ; dst bytes = 2ch * s16 = same as 1ch float
    add       src0q, lenq
    add       src1q, lenq
    neg       lenq
    mova      m2, [pf_s16_scale]        ; 32768.0
%if cpuflag(ssse3)
    mova      m3, [pb_interleave_words]
%endif
.loop:
    mulps     m0, m2, [src0q+lenq]      ; m0 =    0,    2,    4,    6
    mulps     m1, m2, [src1q+lenq]      ; m1 =    1,    3,    5,    7
    cvtps2dq  m0, m0
    cvtps2dq  m1, m1
%if cpuflag(ssse3)
    packssdw  m0, m1                    ; m0 = 0, 2, 4, 6, 1, 3, 5, 7
    pshufb    m0, m3                    ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
%else
    packssdw  m0, m0                    ; m0 = 0, 2, 4, 6, x, x, x, x
    packssdw  m1, m1                    ; m1 = 1, 3, 5, 7, x, x, x, x
    punpcklwd m0, m1                    ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
%endif
    mova      [dstq+lenq], m0
    add       lenq, mmsize
    jl .loop
    REP_RET
%endmacro
INIT_XMM sse2
CONV_FLTP_TO_S16_2CH
INIT_XMM ssse3
CONV_FLTP_TO_S16_2CH
;------------------------------------------------------------------------------
; void ff_conv_fltp_to_s16_6ch(int16_t *dst, float *const *src, int len,
;                              int channels);
;
; Interleave 6 planar float channels and convert to packed s16 (scale by 2^15,
; saturating pack). The "sse" variant runs with mmx registers (INIT_MMX sse)
; and uses cvtps2pi, which writes an mm register — hence the emms at the end.
;------------------------------------------------------------------------------
%macro CONV_FLTP_TO_S16_6CH 0
%if ARCH_X86_64
cglobal conv_fltp_to_s16_6ch, 3,8,7, dst, src, len, src1, src2, src3, src4, src5
%else
cglobal conv_fltp_to_s16_6ch, 2,7,7, dst, src, src1, src2, src3, src4, src5
%define lend dword r2m              ; x86-32: not enough regs, keep len in memory
%endif
    mov       src1q, [srcq+1*gprsize]
    mov       src2q, [srcq+2*gprsize]
    mov       src3q, [srcq+3*gprsize]
    mov       src4q, [srcq+4*gprsize]
    mov       src5q, [srcq+5*gprsize]
    mov       srcq, [srcq]
    ; keep channels 1-5 as offsets from src0; only srcq is incremented
    sub       src1q, srcq
    sub       src2q, srcq
    sub       src3q, srcq
    sub       src4q, srcq
    sub       src5q, srcq
    movaps    xmm6, [pf_s16_scale]      ; xmm6 even in the mmx build: mulps below
                                        ; always works on xmm registers
.loop:
%if cpuflag(sse2)
    mulps     m0, m6, [srcq      ]
    mulps     m1, m6, [srcq+src1q]
    mulps     m2, m6, [srcq+src2q]
    mulps     m3, m6, [srcq+src3q]
    mulps     m4, m6, [srcq+src4q]
    mulps     m5, m6, [srcq+src5q]
    cvtps2dq  m0, m0
    cvtps2dq  m1, m1
    cvtps2dq  m2, m2
    cvtps2dq  m3, m3
    cvtps2dq  m4, m4
    cvtps2dq  m5, m5
    packssdw  m0, m3                    ; m0 =  0,  6, 12, 18,  3,  9, 15, 21
    packssdw  m1, m4                    ; m1 =  1,  7, 13, 19,  4, 10, 16, 22
    packssdw  m2, m5                    ; m2 =  2,  8, 14, 20,  5, 11, 17, 23
    ; unpack words:
    movhlps   m3, m0                    ; m3 =  3,  9, 15, 21,  x,  x,  x,  x
    punpcklwd m0, m1                    ; m0 =  0,  1,  6,  7, 12, 13, 18, 19
    punpckhwd m1, m2                    ; m1 =  4,  5, 10, 11, 16, 17, 22, 23
    punpcklwd m2, m3                    ; m2 =  2,  3,  8,  9, 14, 15, 20, 21
    ; blend dwords:
    shufps    m3, m0, m2, q2020         ; m3 =  0,  1, 12, 13,  2,  3, 14, 15
    shufps    m0, m1, q2031             ; m0 =  6,  7, 18, 19,  4,  5, 16, 17
    shufps    m2, m1, q3131             ; m2 =  8,  9, 20, 21, 10, 11, 22, 23
    ; shuffle dwords:
    shufps    m1, m2, m3, q3120         ; m1 =  8,  9, 10, 11, 12, 13, 14, 15
    shufps    m3, m0, q0220             ; m3 =  0,  1,  2,  3,  4,  5,  6,  7
    shufps    m0, m2, q3113             ; m0 = 16, 17, 18, 19, 20, 21, 22, 23
    mova      [dstq+0*mmsize], m3
    mova      [dstq+1*mmsize], m1
    mova      [dstq+2*mmsize], m0
%else ; sse
    ; 4 samples per channel; float math in xmm, integer shuffles in mm
    movlps    xmm0, [srcq      ]
    movlps    xmm1, [srcq+src1q]
    movlps    xmm2, [srcq+src2q]
    movlps    xmm3, [srcq+src3q]
    movlps    xmm4, [srcq+src4q]
    movlps    xmm5, [srcq+src5q]
    mulps     xmm0, xmm6
    mulps     xmm1, xmm6
    mulps     xmm2, xmm6
    mulps     xmm3, xmm6
    mulps     xmm4, xmm6
    mulps     xmm5, xmm6
    cvtps2pi  mm0, xmm0                 ; float -> s32 into mm registers
    cvtps2pi  mm1, xmm1
    cvtps2pi  mm2, xmm2
    cvtps2pi  mm3, xmm3
    cvtps2pi  mm4, xmm4
    cvtps2pi  mm5, xmm5
    packssdw  mm0, mm3                  ; m0 =  0,  6,  3,  9
    packssdw  mm1, mm4                  ; m1 =  1,  7,  4, 10
    packssdw  mm2, mm5                  ; m2 =  2,  8,  5, 11
    ; unpack words
    pshufw    mm3, mm0, q1032           ; m3 =  3,  9,  0,  6
    punpcklwd mm0, mm1                  ; m0 =  0,  1,  6,  7
    punpckhwd mm1, mm2                  ; m1 =  4,  5, 10, 11
    punpcklwd mm2, mm3                  ; m2 =  2,  3,  8,  9
    ; unpack dwords
    pshufw    mm3, mm0, q1032           ; m3 =  6,  7,  0,  1
    punpckldq mm0, mm2                  ; m0 =  0,  1,  2,  3 (final)
    punpckhdq mm2, mm1                  ; m2 =  8,  9, 10, 11 (final)
    punpckldq mm1, mm3                  ; m1 =  4,  5,  6,  7 (final)
    mova      [dstq+0*mmsize], mm0
    mova      [dstq+1*mmsize], mm1
    mova      [dstq+2*mmsize], mm2
%endif
    add       srcq, mmsize
    add       dstq, mmsize*3
    sub       lend, mmsize/4
    jg .loop
%if mmsize == 8
    emms                                ; leave mmx state clean for the FPU
    RET
%else
    REP_RET
%endif
%endmacro
INIT_MMX sse
CONV_FLTP_TO_S16_6CH
INIT_XMM sse2
CONV_FLTP_TO_S16_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_FLTP_TO_S16_6CH
%endif
  666. ;------------------------------------------------------------------------------
  667. ; void ff_conv_fltp_to_flt_2ch(float *dst, float *const *src, int len,
  668. ; int channels);
  669. ;------------------------------------------------------------------------------
  670. %macro CONV_FLTP_TO_FLT_2CH 0
  671. cglobal conv_fltp_to_flt_2ch, 3,4,5, dst, src0, len, src1
  672. mov src1q, [src0q+gprsize]
  673. mov src0q, [src0q]
  674. lea lenq, [4*lend]
  675. add src0q, lenq
  676. add src1q, lenq
  677. lea dstq, [dstq+2*lenq]
  678. neg lenq
  679. .loop
  680. mova m0, [src0q+lenq ]
  681. mova m1, [src1q+lenq ]
  682. mova m2, [src0q+lenq+mmsize]
  683. mova m3, [src1q+lenq+mmsize]
  684. SBUTTERFLYPS 0, 1, 4
  685. SBUTTERFLYPS 2, 3, 4
  686. mova [dstq+2*lenq+0*mmsize], m0
  687. mova [dstq+2*lenq+1*mmsize], m1
  688. mova [dstq+2*lenq+2*mmsize], m2
  689. mova [dstq+2*lenq+3*mmsize], m3
  690. add lenq, 2*mmsize
  691. jl .loop
  692. REP_RET
  693. %endmacro
  694. INIT_XMM sse
  695. CONV_FLTP_TO_FLT_2CH
  696. %if HAVE_AVX_EXTERNAL
  697. INIT_XMM avx
  698. CONV_FLTP_TO_FLT_2CH
  699. %endif
;-----------------------------------------------------------------------------
; void ff_conv_fltp_to_flt_6ch(float *dst, float *const *src, int len,
;                              int channels);
;
; Interleave 6 planar float channels into one packed float stream.
;-----------------------------------------------------------------------------
%macro CONV_FLTP_TO_FLT_6CH 0
cglobal conv_fltp_to_flt_6ch, 2,8,7, dst, src, src1, src2, src3, src4, src5, len
%if ARCH_X86_64
    mov       lend, r2d                 ; x86-64: copy len into its own register
%else
%define lend dword r2m              ; x86-32: keep len in its stack slot
%endif
    mov       src1q, [srcq+1*gprsize]
    mov       src2q, [srcq+2*gprsize]
    mov       src3q, [srcq+3*gprsize]
    mov       src4q, [srcq+4*gprsize]
    mov       src5q, [srcq+5*gprsize]
    mov       srcq, [srcq]
    ; keep channels 1-5 as offsets from src0; only srcq is incremented
    sub       src1q, srcq
    sub       src2q, srcq
    sub       src3q, srcq
    sub       src4q, srcq
    sub       src5q, srcq
.loop:
    mova      m0, [srcq      ]
    mova      m1, [srcq+src1q]
    mova      m2, [srcq+src2q]
    mova      m3, [srcq+src3q]
    mova      m4, [srcq+src4q]
    mova      m5, [srcq+src5q]
%if cpuflag(sse4)
    SBUTTERFLYPS 0, 1, 6
    SBUTTERFLYPS 2, 3, 6
    SBUTTERFLYPS 4, 5, 6
    blendps   m6, m4, m0, 1100b
    movlhps   m0, m2
    movhlps   m4, m2
    blendps   m2, m5, m1, 1100b
    movlhps   m1, m3
    movhlps   m5, m3
    movaps    [dstq   ], m0
    movaps    [dstq+16], m6
    movaps    [dstq+32], m4
    movaps    [dstq+48], m1
    movaps    [dstq+64], m2
    movaps    [dstq+80], m5
%else ; mmx
    SBUTTERFLY dq, 0, 1, 6
    SBUTTERFLY dq, 2, 3, 6
    SBUTTERFLY dq, 4, 5, 6
    movq      [dstq   ], m0
    movq      [dstq+ 8], m2
    movq      [dstq+16], m4
    movq      [dstq+24], m1
    movq      [dstq+32], m3
    movq      [dstq+40], m5
%endif
    add       srcq, mmsize
    add       dstq, mmsize*6
    sub       lend, mmsize/4
    jg .loop
%if mmsize == 8
    emms                                ; mmx variant must clear the FPU tag word
    RET
%else
    REP_RET
%endif
%endmacro
INIT_MMX mmx
CONV_FLTP_TO_FLT_6CH
INIT_XMM sse4
CONV_FLTP_TO_FLT_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_FLTP_TO_FLT_6CH
%endif
;------------------------------------------------------------------------------
; void ff_conv_s16_to_s16p_2ch(int16_t *const *dst, int16_t *src, int len,
;                              int channels);
;
; De-interleave packed 2-channel s16 into two planar s16 buffers.
;------------------------------------------------------------------------------
%macro CONV_S16_TO_S16P_2CH 0
cglobal conv_s16_to_s16p_2ch, 3,4,4, dst0, src, len, dst1
    lea       lenq, [2*lend]            ; lenq = per-channel byte length
    mov       dst1q, [dst0q+gprsize]    ; dst1 = dst[1]
    mov       dst0q, [dst0q        ]    ; dst0 = dst[0]
    lea       srcq, [srcq+2*lenq]       ; src holds both channels: twice the bytes
    add       dst0q, lenq
    add       dst1q, lenq
    neg       lenq
%if cpuflag(ssse3)
    mova      m3, [pb_deinterleave_words]
%endif
.loop:
    mova      m0, [srcq+2*lenq       ]  ; m0 =  0,  1,  2,  3,  4,  5,  6,  7
    mova      m1, [srcq+2*lenq+mmsize]  ; m1 =  8,  9, 10, 11, 12, 13, 14, 15
%if cpuflag(ssse3)
    pshufb    m0, m3                    ; m0 =  0,  2,  4,  6,  1,  3,  5,  7
    pshufb    m1, m3                    ; m1 =  8, 10, 12, 14,  9, 11, 13, 15
    SBUTTERFLY2 qdq, 0, 1, 2            ; m0 =  0,  2,  4,  6,  8, 10, 12, 14
                                        ; m1 =  1,  3,  5,  7,  9, 11, 13, 15
%else ; sse2
    pshuflw   m0, m0, q3120             ; m0 =  0,  2,  1,  3,  4,  5,  6,  7
    pshufhw   m0, m0, q3120             ; m0 =  0,  2,  1,  3,  4,  6,  5,  7
    pshuflw   m1, m1, q3120             ; m1 =  8, 10,  9, 11, 12, 13, 14, 15
    pshufhw   m1, m1, q3120             ; m1 =  8, 10,  9, 11, 12, 14, 13, 15
    DEINT2_PS 0, 1, 2                   ; m0 =  0,  2,  4,  6,  8, 10, 12, 14
                                        ; m1 =  1,  3,  5,  7,  9, 11, 13, 15
%endif
    mova      [dst0q+lenq], m0
    mova      [dst1q+lenq], m1
    add       lenq, mmsize
    jl .loop
    REP_RET
%endmacro
INIT_XMM sse2
CONV_S16_TO_S16P_2CH
INIT_XMM ssse3
CONV_S16_TO_S16P_2CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16_TO_S16P_2CH
%endif
;------------------------------------------------------------------------------
; void ff_conv_s16_to_s16p_6ch(int16_t *const *dst, int16_t *src, int len,
;                              int channels);
;
; De-interleave packed 6-channel s16 into six planar s16 buffers,
; 4 samples per channel per iteration.
;------------------------------------------------------------------------------
%macro CONV_S16_TO_S16P_6CH 0
%if ARCH_X86_64
cglobal conv_s16_to_s16p_6ch, 3,8,5, dst, src, len, dst1, dst2, dst3, dst4, dst5
%else
cglobal conv_s16_to_s16p_6ch, 2,7,5, dst, src, dst1, dst2, dst3, dst4, dst5
%define lend dword r2m              ; x86-32: not enough regs, keep len in memory
%endif
    mov       dst1q, [dstq+  gprsize]
    mov       dst2q, [dstq+2*gprsize]
    mov       dst3q, [dstq+3*gprsize]
    mov       dst4q, [dstq+4*gprsize]
    mov       dst5q, [dstq+5*gprsize]
    mov       dstq, [dstq        ]
    ; keep channels 1-5 as offsets from dst0; only dstq is incremented
    sub       dst1q, dstq
    sub       dst2q, dstq
    sub       dst3q, dstq
    sub       dst4q, dstq
    sub       dst5q, dstq
.loop:
    mova      m0, [srcq+0*mmsize]       ; m0 =  0,  1,  2,  3,  4,  5,  6,  7
    mova      m3, [srcq+1*mmsize]       ; m3 =  8,  9, 10, 11, 12, 13, 14, 15
    mova      m2, [srcq+2*mmsize]       ; m2 = 16, 17, 18, 19, 20, 21, 22, 23
    PALIGNR   m1, m3, m0, 12, m4        ; m1 =  6,  7,  8,  9, 10, 11,  x,  x
    shufps    m3, m2, q1032             ; m3 = 12, 13, 14, 15, 16, 17, 18, 19
    psrldq    m2, 4                     ; m2 = 18, 19, 20, 21, 22, 23,  x,  x
    SBUTTERFLY2 wd, 0, 1, 4             ; m0 =  0,  6,  1,  7,  2,  8,  3,  9
                                        ; m1 =  4, 10,  5, 11,  x,  x,  x,  x
    SBUTTERFLY2 wd, 3, 2, 4             ; m3 = 12, 18, 13, 19, 14, 20, 15, 21
                                        ; m2 = 16, 22, 17, 23,  x,  x,  x,  x
    SBUTTERFLY2 dq, 0, 3, 4             ; m0 =  0,  6, 12, 18,  1,  7, 13, 19
                                        ; m3 =  2,  8, 14, 20,  3,  9, 15, 21
    punpckldq m1, m2                    ; m1 =  4, 10, 16, 22,  5, 11, 17, 23
    ; each qword now holds 4 consecutive samples of one channel
    movq      [dstq      ], m0
    movhps    [dstq+dst1q], m0
    movq      [dstq+dst2q], m3
    movhps    [dstq+dst3q], m3
    movq      [dstq+dst4q], m1
    movhps    [dstq+dst5q], m1
    add       srcq, mmsize*3
    add       dstq, mmsize/2
    sub       lend, mmsize/4
    jg .loop
    REP_RET
%endmacro
%define PALIGNR PALIGNR_MMX
INIT_XMM sse2
CONV_S16_TO_S16P_6CH
%define PALIGNR PALIGNR_SSSE3
INIT_XMM ssse3
CONV_S16_TO_S16P_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16_TO_S16P_6CH
%endif
;------------------------------------------------------------------------------
; void ff_conv_s16_to_fltp_2ch(float *const *dst, int16_t *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
; Deinterleave packed s16 stereo into two planar float buffers, scaling each
; sample to the [-1.0, 1.0) range.  All three pointers are advanced by a
; single negative offset (lenq) counting up to zero.
%macro CONV_S16_TO_FLTP_2CH 0
cglobal conv_s16_to_fltp_2ch, 3,4,5, dst0, src, len, dst1
    lea      lenq,  [4*lend]        ; byte size of one float output plane
                                    ; (= byte size of the whole s16 input)
    mov      dst1q, [dst0q+gprsize] ; dst1 = dst[1]
    mov      dst0q, [dst0q        ] ; dst0 = dst[0]
    add      srcq,  lenq            ; point everything at the buffer ends ...
    add      dst0q, lenq
    add      dst1q, lenq
    neg      lenq                   ; ... and index with a negative offset
    mova     m3, [pf_s32_inv_scale] ; float 2^-31: scales a s32 to [-1.0,1.0)
    mova     m4, [pw_zero_even]     ; word mask 0x0000,0xffff,... : clears the
                                    ; even (ch0) words, keeps the odd (ch1)
.loop:
    mova     m1, [srcq+lenq]        ; m1 = L0,R0,L1,R1,...
    pslld    m0, m1, 16             ; ch0 words moved to the high half of each
                                    ; dword => ch0 * 2^16 as s32
    pand     m1, m4                 ; ch1 words are already in the high half
                                    ; => ch1 * 2^16 as s32
    cvtdq2ps m0, m0
    cvtdq2ps m1, m1
    mulps    m0, m0, m3             ; (s16 * 2^16) * 2^-31 = s16 / 2^15
    mulps    m1, m1, m3
    mova  [dst0q+lenq], m0
    mova  [dst1q+lenq], m1
    add      lenq, mmsize
    jl .loop
    REP_RET
%endmacro
; Instantiate conv_s16_to_fltp_2ch for SSE2 and, when available, AVX.
INIT_XMM sse2
CONV_S16_TO_FLTP_2CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16_TO_FLTP_2CH
%endif
;------------------------------------------------------------------------------
; void ff_conv_s16_to_fltp_6ch(float *const *dst, int16_t *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
; Deinterleave packed s16 5.1 audio into six planar float buffers, scaling
; each sample to [-1.0, 1.0).  dst1..dst5 are converted to byte offsets from
; dst[0] so that a single pointer increment advances all six output planes.
%macro CONV_S16_TO_FLTP_6CH 0
%if ARCH_X86_64
cglobal conv_s16_to_fltp_6ch, 3,8,7, dst, src, len, dst1, dst2, dst3, dst4, dst5
%else
cglobal conv_s16_to_fltp_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5
%define lend dword r2m          ; x86-32: not enough GPRs, keep len in memory
%endif
    mov      dst1q, [dstq+  gprsize]
    mov      dst2q, [dstq+2*gprsize]
    mov      dst3q, [dstq+3*gprsize]
    mov      dst4q, [dstq+4*gprsize]
    mov      dst5q, [dstq+5*gprsize]
    mov      dstq,  [dstq          ]
    sub      dst1q, dstq            ; dstNq = dst[N] - dst[0]
    sub      dst2q, dstq
    sub      dst3q, dstq
    sub      dst4q, dstq
    sub      dst5q, dstq
    mova     m6, [pf_s16_inv_scale] ; float 2^-15: scales a s16 to [-1.0,1.0)
.loop:
    ; Load 24 samples (4 per channel) and transpose from channel-interleaved
    ; order to one register per channel.
    mova     m0, [srcq+0*mmsize]    ; m0 =  0,  1,  2,  3,  4,  5,  6,  7
    mova     m3, [srcq+1*mmsize]    ; m3 =  8,  9, 10, 11, 12, 13, 14, 15
    mova     m2, [srcq+2*mmsize]    ; m2 = 16, 17, 18, 19, 20, 21, 22, 23
    PALIGNR  m1, m3, m0, 12, m4     ; m1 =  6,  7,  8,  9, 10, 11,  x,  x
    shufps   m3, m2, q1032          ; m3 = 12, 13, 14, 15, 16, 17, 18, 19
    psrldq   m2, 4                  ; m2 = 18, 19, 20, 21, 22, 23,  x,  x
    SBUTTERFLY2 wd, 0, 1, 4         ; m0 =  0,  6,  1,  7,  2,  8,  3,  9
                                    ; m1 =  4, 10,  5, 11,  x,  x,  x,  x
    SBUTTERFLY2 wd, 3, 2, 4         ; m3 = 12, 18, 13, 19, 14, 20, 15, 21
                                    ; m2 = 16, 22, 17, 23,  x,  x,  x,  x
    SBUTTERFLY2 dq, 0, 3, 4         ; m0 =  0,  6, 12, 18,  1,  7, 13, 19
                                    ; m3 =  2,  8, 14, 20,  3,  9, 15, 21
    punpckldq m1, m2                ; m1 =  4, 10, 16, 22,  5, 11, 17, 23
    ; Sign-extend each half-register of 4 s16 to a full register of 4 s32.
    S16_TO_S32_SX 0, 2              ; m0 =  0,  6, 12, 18
                                    ; m2 =  1,  7, 13, 19
    S16_TO_S32_SX 3, 4              ; m3 =  2,  8, 14, 20
                                    ; m4 =  3,  9, 15, 21
    S16_TO_S32_SX 1, 5              ; m1 =  4, 10, 16, 22
                                    ; m5 =  5, 11, 17, 23
    SWAP 1,2,3,4                    ; restore ascending channel order m0..m5
    cvtdq2ps m0, m0
    cvtdq2ps m1, m1
    cvtdq2ps m2, m2
    cvtdq2ps m3, m3
    cvtdq2ps m4, m4
    cvtdq2ps m5, m5
    mulps    m0, m6                 ; scale to [-1.0, 1.0)
    mulps    m1, m6
    mulps    m2, m6
    mulps    m3, m6
    mulps    m4, m6
    mulps    m5, m6
    mova  [dstq      ], m0
    mova  [dstq+dst1q], m1
    mova  [dstq+dst2q], m2
    mova  [dstq+dst3q], m3
    mova  [dstq+dst4q], m4
    mova  [dstq+dst5q], m5
    add      srcq, mmsize*3         ; consumed 24 s16 samples
    add      dstq, mmsize           ; wrote 4 floats per channel
    sub      lend, mmsize/4         ; 4 samples per channel per iteration
    jg .loop
    REP_RET
%endmacro
; Instantiate conv_s16_to_fltp_6ch for each instruction-set level.
; SSE2 uses the MMX-style PALIGNR emulation; SSSE3+ use the real palignr.
; The SSE4 build presumably benefits from a different S16_TO_S32_SX
; expansion (macro defined elsewhere) -- confirm in x86util.
%define PALIGNR PALIGNR_MMX
INIT_XMM sse2
CONV_S16_TO_FLTP_6CH
%define PALIGNR PALIGNR_SSSE3
INIT_XMM ssse3
CONV_S16_TO_FLTP_6CH
INIT_XMM sse4
CONV_S16_TO_FLTP_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16_TO_FLTP_6CH
%endif
;------------------------------------------------------------------------------
; void ff_conv_flt_to_s16p_2ch(int16_t *const *dst, float *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
; Deinterleave packed float stereo into two planar s16 buffers, scaling by
; 2^15 and saturating to the s16 range.
%macro CONV_FLT_TO_S16P_2CH 0
cglobal conv_flt_to_s16p_2ch, 3,4,6, dst0, src, len, dst1
    lea      lenq,  [2*lend]        ; byte size of one s16 output plane
    mov      dst1q, [dst0q+gprsize] ; dst1 = dst[1]
    mov      dst0q, [dst0q        ] ; dst0 = dst[0]
    lea      srcq,  [srcq+4*lenq]   ; interleaved float src spans 4x the bytes
    add      dst0q, lenq            ; point at buffer ends, index negatively
    add      dst1q, lenq
    neg      lenq
    mova     m5, [pf_s16_scale]     ; float 2^15
.loop:
    mova     m0, [srcq+4*lenq         ]
    mova     m1, [srcq+4*lenq+  mmsize]
    mova     m2, [srcq+4*lenq+2*mmsize]
    mova     m3, [srcq+4*lenq+3*mmsize]
    DEINT2_PS 0, 1, 4               ; split L/R: m0 = ch0, m1 = ch1
    DEINT2_PS 2, 3, 4               ; m2 = ch0, m3 = ch1
    mulps    m0, m0, m5             ; scale to s16 range
    mulps    m1, m1, m5
    mulps    m2, m2, m5
    mulps    m3, m3, m5
    cvtps2dq m0, m0                 ; float -> s32
    cvtps2dq m1, m1
    cvtps2dq m2, m2
    cvtps2dq m3, m3
    packssdw m0, m2                 ; s32 -> s16 with signed saturation
    packssdw m1, m3
    mova  [dst0q+lenq], m0
    mova  [dst1q+lenq], m1
    add      lenq, mmsize
    jl .loop
    REP_RET
%endmacro
; Instantiate conv_flt_to_s16p_2ch for SSE2 and, when available, AVX.
INIT_XMM sse2
CONV_FLT_TO_S16P_2CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_FLT_TO_S16P_2CH
%endif
;------------------------------------------------------------------------------
; void ff_conv_flt_to_s16p_6ch(int16_t *const *dst, float *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
; Deinterleave packed float 5.1 audio into six planar s16 buffers, scaling by
; 2^15 with signed saturation.  dst1..dst5 are converted to byte offsets from
; dst[0] so one pointer increment advances all six output planes.
%macro CONV_FLT_TO_S16P_6CH 0
%if ARCH_X86_64
cglobal conv_flt_to_s16p_6ch, 3,8,7, dst, src, len, dst1, dst2, dst3, dst4, dst5
%else
cglobal conv_flt_to_s16p_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5
%define lend dword r2m          ; x86-32: not enough GPRs, keep len in memory
%endif
    mov      dst1q, [dstq+  gprsize]
    mov      dst2q, [dstq+2*gprsize]
    mov      dst3q, [dstq+3*gprsize]
    mov      dst4q, [dstq+4*gprsize]
    mov      dst5q, [dstq+5*gprsize]
    mov      dstq,  [dstq          ]
    sub      dst1q, dstq            ; dstNq = dst[N] - dst[0]
    sub      dst2q, dstq
    sub      dst3q, dstq
    sub      dst4q, dstq
    sub      dst5q, dstq
    mova     m6, [pf_s16_scale]     ; float 2^15
.loop:
    ; Scale 24 floats (4 per channel) and convert to s32.
    mulps    m0, m6, [srcq+0*mmsize]
    mulps    m3, m6, [srcq+1*mmsize]
    mulps    m1, m6, [srcq+2*mmsize]
    mulps    m4, m6, [srcq+3*mmsize]
    mulps    m2, m6, [srcq+4*mmsize]
    mulps    m5, m6, [srcq+5*mmsize]
    cvtps2dq m0, m0
    cvtps2dq m1, m1
    cvtps2dq m2, m2
    cvtps2dq m3, m3
    cvtps2dq m4, m4
    cvtps2dq m5, m5
    ; Pack to s16 with saturation, then transpose interleaved -> planar.
    packssdw m0, m3                 ; m0 =  0,  1,  2,  3,  4,  5,  6,  7
    packssdw m1, m4                 ; m1 =  8,  9, 10, 11, 12, 13, 14, 15
    packssdw m2, m5                 ; m2 = 16, 17, 18, 19, 20, 21, 22, 23
    PALIGNR  m3, m1, m0, 12, m4     ; m3 =  6,  7,  8,  9, 10, 11,  x,  x
    shufps   m1, m2, q1032          ; m1 = 12, 13, 14, 15, 16, 17, 18, 19
    psrldq   m2, 4                  ; m2 = 18, 19, 20, 21, 22, 23,  x,  x
    SBUTTERFLY2 wd, 0, 3, 4         ; m0 =  0,  6,  1,  7,  2,  8,  3,  9
                                    ; m3 =  4, 10,  5, 11,  x,  x,  x,  x
    SBUTTERFLY2 wd, 1, 2, 4         ; m1 = 12, 18, 13, 19, 14, 20, 15, 21
                                    ; m2 = 16, 22, 17, 23,  x,  x,  x,  x
    SBUTTERFLY2 dq, 0, 1, 4         ; m0 =  0,  6, 12, 18,  1,  7, 13, 19
                                    ; m1 =  2,  8, 14, 20,  3,  9, 15, 21
    punpckldq m3, m2                ; m3 =  4, 10, 16, 22,  5, 11, 17, 23
    movq   [dstq      ], m0         ; store 4 s16 samples (8 bytes) / channel
    movhps [dstq+dst1q], m0
    movq   [dstq+dst2q], m1
    movhps [dstq+dst3q], m1
    movq   [dstq+dst4q], m3
    movhps [dstq+dst5q], m3
    add      srcq, mmsize*6         ; consumed 24 floats
    add      dstq, mmsize/2         ; wrote 8 bytes per channel
    sub      lend, mmsize/4         ; 4 samples per channel per iteration
    jg .loop
    REP_RET
%endmacro
; Instantiate conv_flt_to_s16p_6ch for each instruction-set level.
; SSE2 uses the MMX-style PALIGNR emulation; SSSE3+ use the real palignr.
%define PALIGNR PALIGNR_MMX
INIT_XMM sse2
CONV_FLT_TO_S16P_6CH
%define PALIGNR PALIGNR_SSSE3
INIT_XMM ssse3
CONV_FLT_TO_S16P_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_FLT_TO_S16P_6CH
%endif
;------------------------------------------------------------------------------
; void ff_conv_flt_to_fltp_2ch(float *const *dst, float *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
; Deinterleave packed float stereo into two planar float buffers (no scaling).
%macro CONV_FLT_TO_FLTP_2CH 0
cglobal conv_flt_to_fltp_2ch, 3,4,3, dst0, src, len, dst1
    lea      lenq,  [4*lend]        ; byte size of one float output plane
    mov      dst1q, [dst0q+gprsize] ; dst1 = dst[1]
    mov      dst0q, [dst0q        ] ; dst0 = dst[0]
    lea      srcq,  [srcq+2*lenq]   ; interleaved src is twice one plane
    add      dst0q, lenq            ; point at buffer ends, index negatively
    add      dst1q, lenq
    neg      lenq
.loop:
    mova     m0, [srcq+2*lenq       ] ; L0,R0,L1,R1
    mova     m1, [srcq+2*lenq+mmsize] ; L2,R2,L3,R3
    DEINT2_PS 0, 1, 2               ; m0 = ch0 samples, m1 = ch1 samples
    mova  [dst0q+lenq], m0
    mova  [dst1q+lenq], m1
    add      lenq, mmsize
    jl .loop
    REP_RET
%endmacro
; Instantiate conv_flt_to_fltp_2ch for SSE and, when available, AVX.
INIT_XMM sse
CONV_FLT_TO_FLTP_2CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_FLT_TO_FLTP_2CH
%endif
;------------------------------------------------------------------------------
; void ff_conv_flt_to_fltp_6ch(float *const *dst, float *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
; Deinterleave packed float 5.1 audio into six planar float buffers via a
; 6x4 -> 4x6 register transpose.  dst1..dst5 are converted to byte offsets
; from dst[0] so one pointer increment advances all six output planes.
%macro CONV_FLT_TO_FLTP_6CH 0
%if ARCH_X86_64
cglobal conv_flt_to_fltp_6ch, 3,8,7, dst, src, len, dst1, dst2, dst3, dst4, dst5
%else
cglobal conv_flt_to_fltp_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5
%define lend dword r2m          ; x86-32: not enough GPRs, keep len in memory
%endif
    mov      dst1q, [dstq+  gprsize]
    mov      dst2q, [dstq+2*gprsize]
    mov      dst3q, [dstq+3*gprsize]
    mov      dst4q, [dstq+4*gprsize]
    mov      dst5q, [dstq+5*gprsize]
    mov      dstq,  [dstq          ]
    sub      dst1q, dstq            ; dstNq = dst[N] - dst[0]
    sub      dst2q, dstq
    sub      dst3q, dstq
    sub      dst4q, dstq
    sub      dst5q, dstq
.loop:
    ; Load 24 interleaved floats (4 per channel) ...
    mova     m0, [srcq+0*mmsize]    ; m0 =  0,  1,  2,  3
    mova     m1, [srcq+1*mmsize]    ; m1 =  4,  5,  6,  7
    mova     m2, [srcq+2*mmsize]    ; m2 =  8,  9, 10, 11
    mova     m3, [srcq+3*mmsize]    ; m3 = 12, 13, 14, 15
    mova     m4, [srcq+4*mmsize]    ; m4 = 16, 17, 18, 19
    mova     m5, [srcq+5*mmsize]    ; m5 = 20, 21, 22, 23
    ; ... and transpose so each register holds one channel's 4 samples.
    SBUTTERFLY2 dq, 0, 3, 6         ; m0 =  0, 12,  1, 13
                                    ; m3 =  2, 14,  3, 15
    SBUTTERFLY2 dq, 1, 4, 6         ; m1 =  4, 16,  5, 17
                                    ; m4 =  6, 18,  7, 19
    SBUTTERFLY2 dq, 2, 5, 6         ; m2 =  8, 20,  9, 21
                                    ; m5 = 10, 22, 11, 23
    SBUTTERFLY2 dq, 0, 4, 6         ; m0 =  0,  6, 12, 18
                                    ; m4 =  1,  7, 13, 19
    SBUTTERFLY2 dq, 3, 2, 6         ; m3 =  2,  8, 14, 20
                                    ; m2 =  3,  9, 15, 21
    SBUTTERFLY2 dq, 1, 5, 6         ; m1 =  4, 10, 16, 22
                                    ; m5 =  5, 11, 17, 23
    ; Note the store order follows where each channel landed after the
    ; butterflies: ch0..ch5 are in m0, m4, m3, m2, m1, m5.
    mova  [dstq      ], m0
    mova  [dstq+dst1q], m4
    mova  [dstq+dst2q], m3
    mova  [dstq+dst3q], m2
    mova  [dstq+dst4q], m1
    mova  [dstq+dst5q], m5
    add      srcq, mmsize*6         ; consumed 24 floats
    add      dstq, mmsize           ; wrote 4 floats per channel
    sub      lend, mmsize/4         ; 4 samples per channel per iteration
    jg .loop
    REP_RET
%endmacro
; Instantiate conv_flt_to_fltp_6ch for SSE2 and, when available, AVX.
INIT_XMM sse2
CONV_FLT_TO_FLTP_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_FLT_TO_FLTP_6CH
%endif