You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

1268 lines
43KB

  1. ;******************************************************************************
  2. ;* x86 optimized Format Conversion Utils
  3. ;* Copyright (c) 2008 Loren Merritt
  4. ;* Copyright (c) 2012 Justin Ruggles <justin.ruggles@gmail.com>
  5. ;*
  6. ;* This file is part of Libav.
  7. ;*
  8. ;* Libav is free software; you can redistribute it and/or
  9. ;* modify it under the terms of the GNU Lesser General Public
  10. ;* License as published by the Free Software Foundation; either
  11. ;* version 2.1 of the License, or (at your option) any later version.
  12. ;*
  13. ;* Libav is distributed in the hope that it will be useful,
  14. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16. ;* Lesser General Public License for more details.
  17. ;*
  18. ;* You should have received a copy of the GNU Lesser General Public
  19. ;* License along with Libav; if not, write to the Free Software
  20. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21. ;******************************************************************************
  22. %include "libavutil/x86/x86util.asm"
  23. %include "util.asm"
  24. SECTION_RODATA 32
  25. pf_s32_inv_scale: times 8 dd 0x30000000
  26. pf_s32_scale: times 8 dd 0x4f000000
  27. pf_s32_clip: times 8 dd 0x4effffff
  28. pf_s16_inv_scale: times 4 dd 0x38000000
  29. pf_s16_scale: times 4 dd 0x47000000
  30. pb_shuf_unpack_even: db -1, -1, 0, 1, -1, -1, 2, 3, -1, -1, 8, 9, -1, -1, 10, 11
  31. pb_shuf_unpack_odd: db -1, -1, 4, 5, -1, -1, 6, 7, -1, -1, 12, 13, -1, -1, 14, 15
  32. pb_interleave_words: SHUFFLE_MASK_W 0, 4, 1, 5, 2, 6, 3, 7
  33. pb_deinterleave_words: SHUFFLE_MASK_W 0, 2, 4, 6, 1, 3, 5, 7
  34. pw_zero_even: times 4 dw 0x0000, 0xffff
  35. SECTION_TEXT
  36. ;------------------------------------------------------------------------------
  37. ; void ff_conv_s16_to_s32(int32_t *dst, const int16_t *src, int len);
  38. ;------------------------------------------------------------------------------
  39. INIT_XMM sse2
  40. cglobal conv_s16_to_s32, 3,3,3, dst, src, len
  41. lea lenq, [2*lend]
  42. lea dstq, [dstq+2*lenq]
  43. add srcq, lenq
  44. neg lenq
  45. .loop:
  46. mova m2, [srcq+lenq]
  47. pxor m0, m0
  48. pxor m1, m1
  49. punpcklwd m0, m2
  50. punpckhwd m1, m2
  51. mova [dstq+2*lenq ], m0
  52. mova [dstq+2*lenq+mmsize], m1
  53. add lenq, mmsize
  54. jl .loop
  55. REP_RET
  56. ;------------------------------------------------------------------------------
  57. ; void ff_conv_s16_to_flt(float *dst, const int16_t *src, int len);
  58. ;------------------------------------------------------------------------------
  59. %macro CONV_S16_TO_FLT 0
  60. cglobal conv_s16_to_flt, 3,3,3, dst, src, len
  61. lea lenq, [2*lend]
  62. add srcq, lenq
  63. lea dstq, [dstq + 2*lenq]
  64. neg lenq
  65. mova m2, [pf_s16_inv_scale]
  66. ALIGN 16
  67. .loop:
  68. mova m0, [srcq+lenq]
  69. S16_TO_S32_SX 0, 1
  70. cvtdq2ps m0, m0
  71. cvtdq2ps m1, m1
  72. mulps m0, m2
  73. mulps m1, m2
  74. mova [dstq+2*lenq ], m0
  75. mova [dstq+2*lenq+mmsize], m1
  76. add lenq, mmsize
  77. jl .loop
  78. REP_RET
  79. %endmacro
  80. INIT_XMM sse2
  81. CONV_S16_TO_FLT
  82. INIT_XMM sse4
  83. CONV_S16_TO_FLT
  84. ;------------------------------------------------------------------------------
  85. ; void ff_conv_s32_to_s16(int16_t *dst, const int32_t *src, int len);
  86. ;------------------------------------------------------------------------------
  87. %macro CONV_S32_TO_S16 0
  88. cglobal conv_s32_to_s16, 3,3,4, dst, src, len
  89. lea lenq, [2*lend]
  90. lea srcq, [srcq+2*lenq]
  91. add dstq, lenq
  92. neg lenq
  93. .loop:
  94. mova m0, [srcq+2*lenq ]
  95. mova m1, [srcq+2*lenq+ mmsize]
  96. mova m2, [srcq+2*lenq+2*mmsize]
  97. mova m3, [srcq+2*lenq+3*mmsize]
  98. psrad m0, 16
  99. psrad m1, 16
  100. psrad m2, 16
  101. psrad m3, 16
  102. packssdw m0, m1
  103. packssdw m2, m3
  104. mova [dstq+lenq ], m0
  105. mova [dstq+lenq+mmsize], m2
  106. add lenq, mmsize*2
  107. jl .loop
  108. %if mmsize == 8
  109. emms
  110. RET
  111. %else
  112. REP_RET
  113. %endif
  114. %endmacro
  115. INIT_MMX mmx
  116. CONV_S32_TO_S16
  117. INIT_XMM sse2
  118. CONV_S32_TO_S16
  119. ;------------------------------------------------------------------------------
  120. ; void ff_conv_s32_to_flt(float *dst, const int32_t *src, int len);
  121. ;------------------------------------------------------------------------------
  122. %macro CONV_S32_TO_FLT 0
  123. cglobal conv_s32_to_flt, 3,3,3, dst, src, len
  124. lea lenq, [4*lend]
  125. add srcq, lenq
  126. add dstq, lenq
  127. neg lenq
  128. mova m0, [pf_s32_inv_scale]
  129. ALIGN 16
  130. .loop:
  131. cvtdq2ps m1, [srcq+lenq ]
  132. cvtdq2ps m2, [srcq+lenq+mmsize]
  133. mulps m1, m1, m0
  134. mulps m2, m2, m0
  135. mova [dstq+lenq ], m1
  136. mova [dstq+lenq+mmsize], m2
  137. add lenq, mmsize*2
  138. jl .loop
  139. REP_RET
  140. %endmacro
  141. INIT_XMM sse2
  142. CONV_S32_TO_FLT
  143. %if HAVE_AVX_EXTERNAL
  144. INIT_YMM avx
  145. CONV_S32_TO_FLT
  146. %endif
  147. ;------------------------------------------------------------------------------
  148. ; void ff_conv_flt_to_s16(int16_t *dst, const float *src, int len);
  149. ;------------------------------------------------------------------------------
  150. INIT_XMM sse2
  151. cglobal conv_flt_to_s16, 3,3,5, dst, src, len
  152. lea lenq, [2*lend]
  153. lea srcq, [srcq+2*lenq]
  154. add dstq, lenq
  155. neg lenq
  156. mova m4, [pf_s16_scale]
  157. .loop:
  158. mova m0, [srcq+2*lenq ]
  159. mova m1, [srcq+2*lenq+1*mmsize]
  160. mova m2, [srcq+2*lenq+2*mmsize]
  161. mova m3, [srcq+2*lenq+3*mmsize]
  162. mulps m0, m4
  163. mulps m1, m4
  164. mulps m2, m4
  165. mulps m3, m4
  166. cvtps2dq m0, m0
  167. cvtps2dq m1, m1
  168. cvtps2dq m2, m2
  169. cvtps2dq m3, m3
  170. packssdw m0, m1
  171. packssdw m2, m3
  172. mova [dstq+lenq ], m0
  173. mova [dstq+lenq+mmsize], m2
  174. add lenq, mmsize*2
  175. jl .loop
  176. REP_RET
  177. ;------------------------------------------------------------------------------
  178. ; void ff_conv_flt_to_s32(int32_t *dst, const float *src, int len);
  179. ;------------------------------------------------------------------------------
  180. %macro CONV_FLT_TO_S32 0
  181. cglobal conv_flt_to_s32, 3,3,6, dst, src, len
  182. lea lenq, [lend*4]
  183. add srcq, lenq
  184. add dstq, lenq
  185. neg lenq
  186. mova m4, [pf_s32_scale]
  187. mova m5, [pf_s32_clip]
  188. .loop:
  189. mulps m0, m4, [srcq+lenq ]
  190. mulps m1, m4, [srcq+lenq+1*mmsize]
  191. mulps m2, m4, [srcq+lenq+2*mmsize]
  192. mulps m3, m4, [srcq+lenq+3*mmsize]
  193. minps m0, m0, m5
  194. minps m1, m1, m5
  195. minps m2, m2, m5
  196. minps m3, m3, m5
  197. cvtps2dq m0, m0
  198. cvtps2dq m1, m1
  199. cvtps2dq m2, m2
  200. cvtps2dq m3, m3
  201. mova [dstq+lenq ], m0
  202. mova [dstq+lenq+1*mmsize], m1
  203. mova [dstq+lenq+2*mmsize], m2
  204. mova [dstq+lenq+3*mmsize], m3
  205. add lenq, mmsize*4
  206. jl .loop
  207. REP_RET
  208. %endmacro
  209. INIT_XMM sse2
  210. CONV_FLT_TO_S32
  211. %if HAVE_AVX_EXTERNAL
  212. INIT_YMM avx
  213. CONV_FLT_TO_S32
  214. %endif
  215. ;------------------------------------------------------------------------------
  216. ; void ff_conv_s16p_to_s16_2ch(int16_t *dst, int16_t *const *src, int len,
  217. ; int channels);
  218. ;------------------------------------------------------------------------------
  219. %macro CONV_S16P_TO_S16_2CH 0
  220. cglobal conv_s16p_to_s16_2ch, 3,4,5, dst, src0, len, src1
  221. mov src1q, [src0q+gprsize]
  222. mov src0q, [src0q ]
  223. lea lenq, [2*lend]
  224. add src0q, lenq
  225. add src1q, lenq
  226. lea dstq, [dstq+2*lenq]
  227. neg lenq
  228. .loop:
  229. mova m0, [src0q+lenq ]
  230. mova m1, [src1q+lenq ]
  231. mova m2, [src0q+lenq+mmsize]
  232. mova m3, [src1q+lenq+mmsize]
  233. SBUTTERFLY2 wd, 0, 1, 4
  234. SBUTTERFLY2 wd, 2, 3, 4
  235. mova [dstq+2*lenq+0*mmsize], m0
  236. mova [dstq+2*lenq+1*mmsize], m1
  237. mova [dstq+2*lenq+2*mmsize], m2
  238. mova [dstq+2*lenq+3*mmsize], m3
  239. add lenq, 2*mmsize
  240. jl .loop
  241. REP_RET
  242. %endmacro
  243. INIT_XMM sse2
  244. CONV_S16P_TO_S16_2CH
  245. %if HAVE_AVX_EXTERNAL
  246. INIT_XMM avx
  247. CONV_S16P_TO_S16_2CH
  248. %endif
  249. ;------------------------------------------------------------------------------
  250. ; void ff_conv_s16p_to_s16_6ch(int16_t *dst, int16_t *const *src, int len,
  251. ; int channels);
  252. ;------------------------------------------------------------------------------
  253. ;------------------------------------------------------------------------------
  254. ; NOTE: In the 6-channel functions, len could be used as an index on x86-64
  255. ; instead of just a counter, which would avoid incrementing the
  256. ; pointers, but the extra complexity and amount of code is not worth
  257. ; the small gain. On x86-32 there are not enough registers to use len
  258. ; as an index without keeping two of the pointers on the stack and
  259. ; loading them in each iteration.
  260. ;------------------------------------------------------------------------------
  261. %macro CONV_S16P_TO_S16_6CH 0
  262. %if ARCH_X86_64
  263. cglobal conv_s16p_to_s16_6ch, 3,8,7, dst, src0, len, src1, src2, src3, src4, src5
  264. %else
  265. cglobal conv_s16p_to_s16_6ch, 2,7,7, dst, src0, src1, src2, src3, src4, src5
  266. %define lend dword r2m
  267. %endif
  268. mov src1q, [src0q+1*gprsize]
  269. mov src2q, [src0q+2*gprsize]
  270. mov src3q, [src0q+3*gprsize]
  271. mov src4q, [src0q+4*gprsize]
  272. mov src5q, [src0q+5*gprsize]
  273. mov src0q, [src0q]
  274. sub src1q, src0q
  275. sub src2q, src0q
  276. sub src3q, src0q
  277. sub src4q, src0q
  278. sub src5q, src0q
  279. .loop:
  280. %if cpuflag(sse2slow)
  281. movq m0, [src0q ] ; m0 = 0, 6, 12, 18, x, x, x, x
  282. movq m1, [src0q+src1q] ; m1 = 1, 7, 13, 19, x, x, x, x
  283. movq m2, [src0q+src2q] ; m2 = 2, 8, 14, 20, x, x, x, x
  284. movq m3, [src0q+src3q] ; m3 = 3, 9, 15, 21, x, x, x, x
  285. movq m4, [src0q+src4q] ; m4 = 4, 10, 16, 22, x, x, x, x
  286. movq m5, [src0q+src5q] ; m5 = 5, 11, 17, 23, x, x, x, x
  287. ; unpack words:
  288. punpcklwd m0, m1 ; m0 = 0, 1, 6, 7, 12, 13, 18, 19
  289. punpcklwd m2, m3 ; m2 = 4, 5, 10, 11, 16, 17, 22, 23
  290. punpcklwd m4, m5 ; m4 = 2, 3, 8, 9, 14, 15, 20, 21
  291. ; blend dwords
  292. shufps m1, m0, m2, q2020 ; m1 = 0, 1, 12, 13, 2, 3, 14, 15
  293. shufps m0, m4, q2031 ; m0 = 6, 7, 18, 19, 4, 5, 16, 17
  294. shufps m2, m4, q3131 ; m2 = 8, 9, 20, 21, 10, 11, 22, 23
  295. ; shuffle dwords
  296. pshufd m0, m0, q1302 ; m0 = 4, 5, 6, 7, 16, 17, 18, 19
  297. pshufd m1, m1, q3120 ; m1 = 0, 1, 2, 3, 12, 13, 14, 15
  298. pshufd m2, m2, q3120 ; m2 = 8, 9, 10, 11, 20, 21, 22, 23
  299. movq [dstq+0*mmsize/2], m1
  300. movq [dstq+1*mmsize/2], m0
  301. movq [dstq+2*mmsize/2], m2
  302. movhps [dstq+3*mmsize/2], m1
  303. movhps [dstq+4*mmsize/2], m0
  304. movhps [dstq+5*mmsize/2], m2
  305. add src0q, mmsize/2
  306. add dstq, mmsize*3
  307. sub lend, mmsize/4
  308. %else
  309. mova m0, [src0q ] ; m0 = 0, 6, 12, 18, 24, 30, 36, 42
  310. mova m1, [src0q+src1q] ; m1 = 1, 7, 13, 19, 25, 31, 37, 43
  311. mova m2, [src0q+src2q] ; m2 = 2, 8, 14, 20, 26, 32, 38, 44
  312. mova m3, [src0q+src3q] ; m3 = 3, 9, 15, 21, 27, 33, 39, 45
  313. mova m4, [src0q+src4q] ; m4 = 4, 10, 16, 22, 28, 34, 40, 46
  314. mova m5, [src0q+src5q] ; m5 = 5, 11, 17, 23, 29, 35, 41, 47
  315. ; unpack words:
  316. SBUTTERFLY2 wd, 0, 1, 6 ; m0 = 0, 1, 6, 7, 12, 13, 18, 19
  317. ; m1 = 24, 25, 30, 31, 36, 37, 42, 43
  318. SBUTTERFLY2 wd, 2, 3, 6 ; m2 = 2, 3, 8, 9, 14, 15, 20, 21
  319. ; m3 = 26, 27, 32, 33, 38, 39, 44, 45
  320. SBUTTERFLY2 wd, 4, 5, 6 ; m4 = 4, 5, 10, 11, 16, 17, 22, 23
  321. ; m5 = 28, 29, 34, 35, 40, 41, 46, 47
  322. ; blend dwords
  323. shufps m6, m0, m2, q2020 ; m6 = 0, 1, 12, 13, 2, 3, 14, 15
  324. shufps m0, m4, q2031 ; m0 = 6, 7, 18, 19, 4, 5, 16, 17
  325. shufps m2, m4, q3131 ; m2 = 8, 9, 20, 21, 10, 11, 22, 23
  326. SWAP 4,6 ; m4 = 0, 1, 12, 13, 2, 3, 14, 15
  327. shufps m6, m1, m3, q2020 ; m6 = 24, 25, 36, 37, 26, 27, 38, 39
  328. shufps m1, m5, q2031 ; m1 = 30, 31, 42, 43, 28, 29, 40, 41
  329. shufps m3, m5, q3131 ; m3 = 32, 33, 44, 45, 34, 35, 46, 47
  330. SWAP 5,6 ; m5 = 24, 25, 36, 37, 26, 27, 38, 39
  331. ; shuffle dwords
  332. pshufd m0, m0, q1302 ; m0 = 4, 5, 6, 7, 16, 17, 18, 19
  333. pshufd m2, m2, q3120 ; m2 = 8, 9, 10, 11, 20, 21, 22, 23
  334. pshufd m4, m4, q3120 ; m4 = 0, 1, 2, 3, 12, 13, 14, 15
  335. pshufd m1, m1, q1302 ; m1 = 28, 29, 30, 31, 40, 41, 42, 43
  336. pshufd m3, m3, q3120 ; m3 = 32, 33, 34, 35, 44, 45, 46, 47
  337. pshufd m5, m5, q3120 ; m5 = 24, 25, 26, 27, 36, 37, 38, 39
  338. ; shuffle qwords
  339. punpcklqdq m6, m4, m0 ; m6 = 0, 1, 2, 3, 4, 5, 6, 7
  340. punpckhqdq m0, m2 ; m0 = 16, 17, 18, 19, 20, 21, 22, 23
  341. shufps m2, m4, q3210 ; m2 = 8, 9, 10, 11, 12, 13, 14, 15
  342. SWAP 4,6 ; m4 = 0, 1, 2, 3, 4, 5, 6, 7
  343. punpcklqdq m6, m5, m1 ; m6 = 24, 25, 26, 27, 28, 29, 30, 31
  344. punpckhqdq m1, m3 ; m1 = 40, 41, 42, 43, 44, 45, 46, 47
  345. shufps m3, m5, q3210 ; m3 = 32, 33, 34, 35, 36, 37, 38, 39
  346. SWAP 5,6 ; m5 = 24, 25, 26, 27, 28, 29, 30, 31
  347. mova [dstq+0*mmsize], m4
  348. mova [dstq+1*mmsize], m2
  349. mova [dstq+2*mmsize], m0
  350. mova [dstq+3*mmsize], m5
  351. mova [dstq+4*mmsize], m3
  352. mova [dstq+5*mmsize], m1
  353. add src0q, mmsize
  354. add dstq, mmsize*6
  355. sub lend, mmsize/2
  356. %endif
  357. jg .loop
  358. REP_RET
  359. %endmacro
  360. INIT_XMM sse2
  361. CONV_S16P_TO_S16_6CH
  362. INIT_XMM sse2slow
  363. CONV_S16P_TO_S16_6CH
  364. %if HAVE_AVX_EXTERNAL
  365. INIT_XMM avx
  366. CONV_S16P_TO_S16_6CH
  367. %endif
  368. ;------------------------------------------------------------------------------
  369. ; void ff_conv_s16p_to_flt_2ch(float *dst, int16_t *const *src, int len,
  370. ; int channels);
  371. ;------------------------------------------------------------------------------
  372. %macro CONV_S16P_TO_FLT_2CH 0
  373. cglobal conv_s16p_to_flt_2ch, 3,4,6, dst, src0, len, src1
  374. lea lenq, [2*lend]
  375. mov src1q, [src0q+gprsize]
  376. mov src0q, [src0q ]
  377. lea dstq, [dstq+4*lenq]
  378. add src0q, lenq
  379. add src1q, lenq
  380. neg lenq
  381. mova m5, [pf_s32_inv_scale]
  382. .loop:
  383. mova m2, [src0q+lenq] ; m2 = 0, 2, 4, 6, 8, 10, 12, 14
  384. mova m4, [src1q+lenq] ; m4 = 1, 3, 5, 7, 9, 11, 13, 15
  385. SBUTTERFLY2 wd, 2, 4, 3 ; m2 = 0, 1, 2, 3, 4, 5, 6, 7
  386. ; m4 = 8, 9, 10, 11, 12, 13, 14, 15
  387. pxor m3, m3
  388. punpcklwd m0, m3, m2 ; m0 = 0, 1, 2, 3
  389. punpckhwd m1, m3, m2 ; m1 = 4, 5, 6, 7
  390. punpcklwd m2, m3, m4 ; m2 = 8, 9, 10, 11
  391. punpckhwd m3, m4 ; m3 = 12, 13, 14, 15
  392. cvtdq2ps m0, m0
  393. cvtdq2ps m1, m1
  394. cvtdq2ps m2, m2
  395. cvtdq2ps m3, m3
  396. mulps m0, m5
  397. mulps m1, m5
  398. mulps m2, m5
  399. mulps m3, m5
  400. mova [dstq+4*lenq ], m0
  401. mova [dstq+4*lenq+ mmsize], m1
  402. mova [dstq+4*lenq+2*mmsize], m2
  403. mova [dstq+4*lenq+3*mmsize], m3
  404. add lenq, mmsize
  405. jl .loop
  406. REP_RET
  407. %endmacro
  408. INIT_XMM sse2
  409. CONV_S16P_TO_FLT_2CH
  410. %if HAVE_AVX_EXTERNAL
  411. INIT_XMM avx
  412. CONV_S16P_TO_FLT_2CH
  413. %endif
  414. ;------------------------------------------------------------------------------
  415. ; void ff_conv_s16p_to_flt_6ch(float *dst, int16_t *const *src, int len,
  416. ; int channels);
  417. ;------------------------------------------------------------------------------
  418. %macro CONV_S16P_TO_FLT_6CH 0
  419. %if ARCH_X86_64
  420. cglobal conv_s16p_to_flt_6ch, 3,8,8, dst, src, len, src1, src2, src3, src4, src5
  421. %else
  422. cglobal conv_s16p_to_flt_6ch, 2,7,8, dst, src, src1, src2, src3, src4, src5
  423. %define lend dword r2m
  424. %endif
  425. mov src1q, [srcq+1*gprsize]
  426. mov src2q, [srcq+2*gprsize]
  427. mov src3q, [srcq+3*gprsize]
  428. mov src4q, [srcq+4*gprsize]
  429. mov src5q, [srcq+5*gprsize]
  430. mov srcq, [srcq]
  431. sub src1q, srcq
  432. sub src2q, srcq
  433. sub src3q, srcq
  434. sub src4q, srcq
  435. sub src5q, srcq
  436. mova m7, [pf_s32_inv_scale]
  437. %if cpuflag(ssse3)
  438. %define unpack_even m6
  439. mova m6, [pb_shuf_unpack_even]
  440. %if ARCH_X86_64
  441. %define unpack_odd m8
  442. mova m8, [pb_shuf_unpack_odd]
  443. %else
  444. %define unpack_odd [pb_shuf_unpack_odd]
  445. %endif
  446. %endif
  447. .loop:
  448. movq m0, [srcq ] ; m0 = 0, 6, 12, 18, x, x, x, x
  449. movq m1, [srcq+src1q] ; m1 = 1, 7, 13, 19, x, x, x, x
  450. movq m2, [srcq+src2q] ; m2 = 2, 8, 14, 20, x, x, x, x
  451. movq m3, [srcq+src3q] ; m3 = 3, 9, 15, 21, x, x, x, x
  452. movq m4, [srcq+src4q] ; m4 = 4, 10, 16, 22, x, x, x, x
  453. movq m5, [srcq+src5q] ; m5 = 5, 11, 17, 23, x, x, x, x
  454. ; unpack words:
  455. punpcklwd m0, m1 ; m0 = 0, 1, 6, 7, 12, 13, 18, 19
  456. punpcklwd m2, m3 ; m2 = 2, 3, 8, 9, 14, 15, 20, 21
  457. punpcklwd m4, m5 ; m4 = 4, 5, 10, 11, 16, 17, 22, 23
  458. ; blend dwords
  459. shufps m1, m4, m0, q3120 ; m1 = 4, 5, 16, 17, 6, 7, 18, 19
  460. shufps m0, m2, q2020 ; m0 = 0, 1, 12, 13, 2, 3, 14, 15
  461. shufps m2, m4, q3131 ; m2 = 8, 9, 20, 21, 10, 11, 22, 23
  462. %if cpuflag(ssse3)
  463. pshufb m3, m0, unpack_odd ; m3 = 12, 13, 14, 15
  464. pshufb m0, unpack_even ; m0 = 0, 1, 2, 3
  465. pshufb m4, m1, unpack_odd ; m4 = 16, 17, 18, 19
  466. pshufb m1, unpack_even ; m1 = 4, 5, 6, 7
  467. pshufb m5, m2, unpack_odd ; m5 = 20, 21, 22, 23
  468. pshufb m2, unpack_even ; m2 = 8, 9, 10, 11
  469. %else
  470. ; shuffle dwords
  471. pshufd m0, m0, q3120 ; m0 = 0, 1, 2, 3, 12, 13, 14, 15
  472. pshufd m1, m1, q3120 ; m1 = 4, 5, 6, 7, 16, 17, 18, 19
  473. pshufd m2, m2, q3120 ; m2 = 8, 9, 10, 11, 20, 21, 22, 23
  474. pxor m6, m6 ; convert s16 in m0-m2 to s32 in m0-m5
  475. punpcklwd m3, m6, m0 ; m3 = 0, 1, 2, 3
  476. punpckhwd m4, m6, m0 ; m4 = 12, 13, 14, 15
  477. punpcklwd m0, m6, m1 ; m0 = 4, 5, 6, 7
  478. punpckhwd m5, m6, m1 ; m5 = 16, 17, 18, 19
  479. punpcklwd m1, m6, m2 ; m1 = 8, 9, 10, 11
  480. punpckhwd m6, m2 ; m6 = 20, 21, 22, 23
  481. SWAP 6,2,1,0,3,4,5 ; swap registers 3,0,1,4,5,6 to 0,1,2,3,4,5
  482. %endif
  483. cvtdq2ps m0, m0 ; convert s32 to float
  484. cvtdq2ps m1, m1
  485. cvtdq2ps m2, m2
  486. cvtdq2ps m3, m3
  487. cvtdq2ps m4, m4
  488. cvtdq2ps m5, m5
  489. mulps m0, m7 ; scale float from s32 range to [-1.0,1.0]
  490. mulps m1, m7
  491. mulps m2, m7
  492. mulps m3, m7
  493. mulps m4, m7
  494. mulps m5, m7
  495. mova [dstq ], m0
  496. mova [dstq+ mmsize], m1
  497. mova [dstq+2*mmsize], m2
  498. mova [dstq+3*mmsize], m3
  499. mova [dstq+4*mmsize], m4
  500. mova [dstq+5*mmsize], m5
  501. add srcq, mmsize/2
  502. add dstq, mmsize*6
  503. sub lend, mmsize/4
  504. jg .loop
  505. REP_RET
  506. %endmacro
  507. INIT_XMM sse2
  508. CONV_S16P_TO_FLT_6CH
  509. INIT_XMM ssse3
  510. CONV_S16P_TO_FLT_6CH
  511. %if HAVE_AVX_EXTERNAL
  512. INIT_XMM avx
  513. CONV_S16P_TO_FLT_6CH
  514. %endif
  515. ;------------------------------------------------------------------------------
  516. ; void ff_conv_fltp_to_s16_2ch(int16_t *dst, float *const *src, int len,
  517. ; int channels);
  518. ;------------------------------------------------------------------------------
  519. %macro CONV_FLTP_TO_S16_2CH 0
  520. cglobal conv_fltp_to_s16_2ch, 3,4,3, dst, src0, len, src1
  521. lea lenq, [4*lend]
  522. mov src1q, [src0q+gprsize]
  523. mov src0q, [src0q ]
  524. add dstq, lenq
  525. add src0q, lenq
  526. add src1q, lenq
  527. neg lenq
  528. mova m2, [pf_s16_scale]
  529. %if cpuflag(ssse3)
  530. mova m3, [pb_interleave_words]
  531. %endif
  532. .loop:
  533. mulps m0, m2, [src0q+lenq] ; m0 = 0, 2, 4, 6
  534. mulps m1, m2, [src1q+lenq] ; m1 = 1, 3, 5, 7
  535. cvtps2dq m0, m0
  536. cvtps2dq m1, m1
  537. %if cpuflag(ssse3)
  538. packssdw m0, m1 ; m0 = 0, 2, 4, 6, 1, 3, 5, 7
  539. pshufb m0, m3 ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
  540. %else
  541. packssdw m0, m0 ; m0 = 0, 2, 4, 6, x, x, x, x
  542. packssdw m1, m1 ; m1 = 1, 3, 5, 7, x, x, x, x
  543. punpcklwd m0, m1 ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
  544. %endif
  545. mova [dstq+lenq], m0
  546. add lenq, mmsize
  547. jl .loop
  548. REP_RET
  549. %endmacro
  550. INIT_XMM sse2
  551. CONV_FLTP_TO_S16_2CH
  552. INIT_XMM ssse3
  553. CONV_FLTP_TO_S16_2CH
  554. ;------------------------------------------------------------------------------
  555. ; void ff_conv_fltp_to_s16_6ch(int16_t *dst, float *const *src, int len,
  556. ; int channels);
  557. ;------------------------------------------------------------------------------
  558. %macro CONV_FLTP_TO_S16_6CH 0
  559. %if ARCH_X86_64
  560. cglobal conv_fltp_to_s16_6ch, 3,8,7, dst, src, len, src1, src2, src3, src4, src5
  561. %else
  562. cglobal conv_fltp_to_s16_6ch, 2,7,7, dst, src, src1, src2, src3, src4, src5
  563. %define lend dword r2m
  564. %endif
  565. mov src1q, [srcq+1*gprsize]
  566. mov src2q, [srcq+2*gprsize]
  567. mov src3q, [srcq+3*gprsize]
  568. mov src4q, [srcq+4*gprsize]
  569. mov src5q, [srcq+5*gprsize]
  570. mov srcq, [srcq]
  571. sub src1q, srcq
  572. sub src2q, srcq
  573. sub src3q, srcq
  574. sub src4q, srcq
  575. sub src5q, srcq
  576. movaps xmm6, [pf_s16_scale]
  577. .loop:
  578. %if cpuflag(sse2)
  579. mulps m0, m6, [srcq ]
  580. mulps m1, m6, [srcq+src1q]
  581. mulps m2, m6, [srcq+src2q]
  582. mulps m3, m6, [srcq+src3q]
  583. mulps m4, m6, [srcq+src4q]
  584. mulps m5, m6, [srcq+src5q]
  585. cvtps2dq m0, m0
  586. cvtps2dq m1, m1
  587. cvtps2dq m2, m2
  588. cvtps2dq m3, m3
  589. cvtps2dq m4, m4
  590. cvtps2dq m5, m5
  591. packssdw m0, m3 ; m0 = 0, 6, 12, 18, 3, 9, 15, 21
  592. packssdw m1, m4 ; m1 = 1, 7, 13, 19, 4, 10, 16, 22
  593. packssdw m2, m5 ; m2 = 2, 8, 14, 20, 5, 11, 17, 23
  594. ; unpack words:
  595. movhlps m3, m0 ; m3 = 3, 9, 15, 21, x, x, x, x
  596. punpcklwd m0, m1 ; m0 = 0, 1, 6, 7, 12, 13, 18, 19
  597. punpckhwd m1, m2 ; m1 = 4, 5, 10, 11, 16, 17, 22, 23
  598. punpcklwd m2, m3 ; m2 = 2, 3, 8, 9, 14, 15, 20, 21
  599. ; blend dwords:
  600. shufps m3, m0, m2, q2020 ; m3 = 0, 1, 12, 13, 2, 3, 14, 15
  601. shufps m0, m1, q2031 ; m0 = 6, 7, 18, 19, 4, 5, 16, 17
  602. shufps m2, m1, q3131 ; m2 = 8, 9, 20, 21, 10, 11, 22, 23
  603. ; shuffle dwords:
  604. shufps m1, m2, m3, q3120 ; m1 = 8, 9, 10, 11, 12, 13, 14, 15
  605. shufps m3, m0, q0220 ; m3 = 0, 1, 2, 3, 4, 5, 6, 7
  606. shufps m0, m2, q3113 ; m0 = 16, 17, 18, 19, 20, 21, 22, 23
  607. mova [dstq+0*mmsize], m3
  608. mova [dstq+1*mmsize], m1
  609. mova [dstq+2*mmsize], m0
  610. %else ; sse
  611. movlps xmm0, [srcq ]
  612. movlps xmm1, [srcq+src1q]
  613. movlps xmm2, [srcq+src2q]
  614. movlps xmm3, [srcq+src3q]
  615. movlps xmm4, [srcq+src4q]
  616. movlps xmm5, [srcq+src5q]
  617. mulps xmm0, xmm6
  618. mulps xmm1, xmm6
  619. mulps xmm2, xmm6
  620. mulps xmm3, xmm6
  621. mulps xmm4, xmm6
  622. mulps xmm5, xmm6
  623. cvtps2pi mm0, xmm0
  624. cvtps2pi mm1, xmm1
  625. cvtps2pi mm2, xmm2
  626. cvtps2pi mm3, xmm3
  627. cvtps2pi mm4, xmm4
  628. cvtps2pi mm5, xmm5
  629. packssdw mm0, mm3 ; m0 = 0, 6, 3, 9
  630. packssdw mm1, mm4 ; m1 = 1, 7, 4, 10
  631. packssdw mm2, mm5 ; m2 = 2, 8, 5, 11
  632. ; unpack words
  633. pshufw mm3, mm0, q1032 ; m3 = 3, 9, 0, 6
  634. punpcklwd mm0, mm1 ; m0 = 0, 1, 6, 7
  635. punpckhwd mm1, mm2 ; m1 = 4, 5, 10, 11
  636. punpcklwd mm2, mm3 ; m2 = 2, 3, 8, 9
  637. ; unpack dwords
  638. pshufw mm3, mm0, q1032 ; m3 = 6, 7, 0, 1
  639. punpckldq mm0, mm2 ; m0 = 0, 1, 2, 3 (final)
  640. punpckhdq mm2, mm1 ; m2 = 8, 9, 10, 11 (final)
  641. punpckldq mm1, mm3 ; m1 = 4, 5, 6, 7 (final)
  642. mova [dstq+0*mmsize], mm0
  643. mova [dstq+1*mmsize], mm1
  644. mova [dstq+2*mmsize], mm2
  645. %endif
  646. add srcq, mmsize
  647. add dstq, mmsize*3
  648. sub lend, mmsize/4
  649. jg .loop
  650. %if mmsize == 8
  651. emms
  652. RET
  653. %else
  654. REP_RET
  655. %endif
  656. %endmacro
  657. INIT_MMX sse
  658. CONV_FLTP_TO_S16_6CH
  659. INIT_XMM sse2
  660. CONV_FLTP_TO_S16_6CH
  661. %if HAVE_AVX_EXTERNAL
  662. INIT_XMM avx
  663. CONV_FLTP_TO_S16_6CH
  664. %endif
  665. ;------------------------------------------------------------------------------
  666. ; void ff_conv_fltp_to_flt_2ch(float *dst, float *const *src, int len,
  667. ; int channels);
  668. ;------------------------------------------------------------------------------
  669. %macro CONV_FLTP_TO_FLT_2CH 0
  670. cglobal conv_fltp_to_flt_2ch, 3,4,5, dst, src0, len, src1
  671. mov src1q, [src0q+gprsize]
  672. mov src0q, [src0q]
  673. lea lenq, [4*lend]
  674. add src0q, lenq
  675. add src1q, lenq
  676. lea dstq, [dstq+2*lenq]
  677. neg lenq
  678. .loop:
  679. mova m0, [src0q+lenq ]
  680. mova m1, [src1q+lenq ]
  681. mova m2, [src0q+lenq+mmsize]
  682. mova m3, [src1q+lenq+mmsize]
  683. SBUTTERFLYPS 0, 1, 4
  684. SBUTTERFLYPS 2, 3, 4
  685. mova [dstq+2*lenq+0*mmsize], m0
  686. mova [dstq+2*lenq+1*mmsize], m1
  687. mova [dstq+2*lenq+2*mmsize], m2
  688. mova [dstq+2*lenq+3*mmsize], m3
  689. add lenq, 2*mmsize
  690. jl .loop
  691. REP_RET
  692. %endmacro
  693. INIT_XMM sse
  694. CONV_FLTP_TO_FLT_2CH
  695. %if HAVE_AVX_EXTERNAL
  696. INIT_XMM avx
  697. CONV_FLTP_TO_FLT_2CH
  698. %endif
  699. ;-----------------------------------------------------------------------------
  700. ; void ff_conv_fltp_to_flt_6ch(float *dst, float *const *src, int len,
  701. ; int channels);
  702. ;-----------------------------------------------------------------------------
  703. %macro CONV_FLTP_TO_FLT_6CH 0
  704. cglobal conv_fltp_to_flt_6ch, 2,8,7, dst, src, src1, src2, src3, src4, src5, len
  705. %if ARCH_X86_64
  706. mov lend, r2d
  707. %else
  708. %define lend dword r2m
  709. %endif
  710. mov src1q, [srcq+1*gprsize]
  711. mov src2q, [srcq+2*gprsize]
  712. mov src3q, [srcq+3*gprsize]
  713. mov src4q, [srcq+4*gprsize]
  714. mov src5q, [srcq+5*gprsize]
  715. mov srcq, [srcq]
  716. sub src1q, srcq
  717. sub src2q, srcq
  718. sub src3q, srcq
  719. sub src4q, srcq
  720. sub src5q, srcq
  721. .loop:
  722. mova m0, [srcq ]
  723. mova m1, [srcq+src1q]
  724. mova m2, [srcq+src2q]
  725. mova m3, [srcq+src3q]
  726. mova m4, [srcq+src4q]
  727. mova m5, [srcq+src5q]
  728. %if cpuflag(sse4)
  729. SBUTTERFLYPS 0, 1, 6
  730. SBUTTERFLYPS 2, 3, 6
  731. SBUTTERFLYPS 4, 5, 6
  732. blendps m6, m4, m0, 1100b
  733. movlhps m0, m2
  734. movhlps m4, m2
  735. blendps m2, m5, m1, 1100b
  736. movlhps m1, m3
  737. movhlps m5, m3
  738. movaps [dstq ], m0
  739. movaps [dstq+16], m6
  740. movaps [dstq+32], m4
  741. movaps [dstq+48], m1
  742. movaps [dstq+64], m2
  743. movaps [dstq+80], m5
  744. %else ; mmx
  745. SBUTTERFLY dq, 0, 1, 6
  746. SBUTTERFLY dq, 2, 3, 6
  747. SBUTTERFLY dq, 4, 5, 6
  748. movq [dstq ], m0
  749. movq [dstq+ 8], m2
  750. movq [dstq+16], m4
  751. movq [dstq+24], m1
  752. movq [dstq+32], m3
  753. movq [dstq+40], m5
  754. %endif
  755. add srcq, mmsize
  756. add dstq, mmsize*6
  757. sub lend, mmsize/4
  758. jg .loop
  759. %if mmsize == 8
  760. emms
  761. RET
  762. %else
  763. REP_RET
  764. %endif
  765. %endmacro
  766. INIT_MMX mmx
  767. CONV_FLTP_TO_FLT_6CH
  768. INIT_XMM sse4
  769. CONV_FLTP_TO_FLT_6CH
  770. %if HAVE_AVX_EXTERNAL
  771. INIT_XMM avx
  772. CONV_FLTP_TO_FLT_6CH
  773. %endif
  774. ;------------------------------------------------------------------------------
  775. ; void ff_conv_s16_to_s16p_2ch(int16_t *const *dst, int16_t *src, int len,
  776. ; int channels);
  777. ;------------------------------------------------------------------------------
  778. %macro CONV_S16_TO_S16P_2CH 0
  779. cglobal conv_s16_to_s16p_2ch, 3,4,4, dst0, src, len, dst1
  780. lea lenq, [2*lend]
  781. mov dst1q, [dst0q+gprsize]
  782. mov dst0q, [dst0q ]
  783. lea srcq, [srcq+2*lenq]
  784. add dst0q, lenq
  785. add dst1q, lenq
  786. neg lenq
  787. %if cpuflag(ssse3)
  788. mova m3, [pb_deinterleave_words]
  789. %endif
  790. .loop:
  791. mova m0, [srcq+2*lenq ] ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
  792. mova m1, [srcq+2*lenq+mmsize] ; m1 = 8, 9, 10, 11, 12, 13, 14, 15
  793. %if cpuflag(ssse3)
  794. pshufb m0, m3 ; m0 = 0, 2, 4, 6, 1, 3, 5, 7
  795. pshufb m1, m3 ; m1 = 8, 10, 12, 14, 9, 11, 13, 15
  796. SBUTTERFLY2 qdq, 0, 1, 2 ; m0 = 0, 2, 4, 6, 8, 10, 12, 14
  797. ; m1 = 1, 3, 5, 7, 9, 11, 13, 15
  798. %else ; sse2
  799. pshuflw m0, m0, q3120 ; m0 = 0, 2, 1, 3, 4, 5, 6, 7
  800. pshufhw m0, m0, q3120 ; m0 = 0, 2, 1, 3, 4, 6, 5, 7
  801. pshuflw m1, m1, q3120 ; m1 = 8, 10, 9, 11, 12, 13, 14, 15
  802. pshufhw m1, m1, q3120 ; m1 = 8, 10, 9, 11, 12, 14, 13, 15
  803. DEINT2_PS 0, 1, 2 ; m0 = 0, 2, 4, 6, 8, 10, 12, 14
  804. ; m1 = 1, 3, 5, 7, 9, 11, 13, 15
  805. %endif
  806. mova [dst0q+lenq], m0
  807. mova [dst1q+lenq], m1
  808. add lenq, mmsize
  809. jl .loop
  810. REP_RET
  811. %endmacro
  812. INIT_XMM sse2
  813. CONV_S16_TO_S16P_2CH
  814. INIT_XMM ssse3
  815. CONV_S16_TO_S16P_2CH
  816. %if HAVE_AVX_EXTERNAL
  817. INIT_XMM avx
  818. CONV_S16_TO_S16P_2CH
  819. %endif
  820. ;------------------------------------------------------------------------------
  821. ; void ff_conv_s16_to_s16p_6ch(int16_t *const *dst, int16_t *src, int len,
  822. ; int channels);
  823. ;------------------------------------------------------------------------------
  824. %macro CONV_S16_TO_S16P_6CH 0
  825. %if ARCH_X86_64
  826. cglobal conv_s16_to_s16p_6ch, 3,8,5, dst, src, len, dst1, dst2, dst3, dst4, dst5
  827. %else
  828. cglobal conv_s16_to_s16p_6ch, 2,7,5, dst, src, dst1, dst2, dst3, dst4, dst5
  829. %define lend dword r2m
  830. %endif
  831. mov dst1q, [dstq+ gprsize]
  832. mov dst2q, [dstq+2*gprsize]
  833. mov dst3q, [dstq+3*gprsize]
  834. mov dst4q, [dstq+4*gprsize]
  835. mov dst5q, [dstq+5*gprsize]
  836. mov dstq, [dstq ]
  837. sub dst1q, dstq
  838. sub dst2q, dstq
  839. sub dst3q, dstq
  840. sub dst4q, dstq
  841. sub dst5q, dstq
  842. .loop:
  843. mova m0, [srcq+0*mmsize] ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
  844. mova m3, [srcq+1*mmsize] ; m3 = 8, 9, 10, 11, 12, 13, 14, 15
  845. mova m2, [srcq+2*mmsize] ; m2 = 16, 17, 18, 19, 20, 21, 22, 23
  846. PALIGNR m1, m3, m0, 12, m4 ; m1 = 6, 7, 8, 9, 10, 11, x, x
  847. shufps m3, m2, q1032 ; m3 = 12, 13, 14, 15, 16, 17, 18, 19
  848. psrldq m2, 4 ; m2 = 18, 19, 20, 21, 22, 23, x, x
  849. SBUTTERFLY2 wd, 0, 1, 4 ; m0 = 0, 6, 1, 7, 2, 8, 3, 9
  850. ; m1 = 4, 10, 5, 11, x, x, x, x
  851. SBUTTERFLY2 wd, 3, 2, 4 ; m3 = 12, 18, 13, 19, 14, 20, 15, 21
  852. ; m2 = 16, 22, 17, 23, x, x, x, x
  853. SBUTTERFLY2 dq, 0, 3, 4 ; m0 = 0, 6, 12, 18, 1, 7, 13, 19
  854. ; m3 = 2, 8, 14, 20, 3, 9, 15, 21
  855. punpckldq m1, m2 ; m1 = 4, 10, 16, 22, 5, 11, 17, 23
  856. movq [dstq ], m0
  857. movhps [dstq+dst1q], m0
  858. movq [dstq+dst2q], m3
  859. movhps [dstq+dst3q], m3
  860. movq [dstq+dst4q], m1
  861. movhps [dstq+dst5q], m1
  862. add srcq, mmsize*3
  863. add dstq, mmsize/2
  864. sub lend, mmsize/4
  865. jg .loop
  866. REP_RET
  867. %endmacro
  868. %define PALIGNR PALIGNR_MMX
  869. INIT_XMM sse2
  870. CONV_S16_TO_S16P_6CH
  871. %define PALIGNR PALIGNR_SSSE3
  872. INIT_XMM ssse3
  873. CONV_S16_TO_S16P_6CH
  874. %if HAVE_AVX_EXTERNAL
  875. INIT_XMM avx
  876. CONV_S16_TO_S16P_6CH
  877. %endif
  878. ;------------------------------------------------------------------------------
  879. ; void ff_conv_s16_to_fltp_2ch(float *const *dst, int16_t *src, int len,
  880. ; int channels);
  881. ;------------------------------------------------------------------------------
  ; Deinterleave packed stereo s16 into two planar float planes, normalized to
  ; [-1.0,1.0). One xmm register of input (4 frames) per loop iteration.
  882. %macro CONV_S16_TO_FLTP_2CH 0
  883. cglobal conv_s16_to_fltp_2ch, 3,4,5, dst0, src, len, dst1
  884. lea lenq, [4*lend]                 ; lenq = len*sizeof(float) = bytes per plane
  885. mov dst1q, [dst0q+gprsize]         ; dst1 = dst[1]
  886. mov dst0q, [dst0q ]                ; dst0 = dst[0]
  887. add srcq, lenq                     ; src is also lenq bytes (2ch * 2 bytes/frame)
  888. add dst0q, lenq                    ; point all pointers at their ends and run
  889. add dst1q, lenq                    ; a negative offset up towards zero
  890. neg lenq
  891. mova m3, [pf_s32_inv_scale]        ; 0x30000000 = 2^-31 (samples end up in the
                                         ; high 16 bits, so this divides by 32768)
  892. mova m4, [pw_zero_even]            ; per-dword mask 0xffff0000: keeps odd words
  893. .loop:
  894. mova m1, [srcq+lenq]               ; 4 frames: L0 R0 L1 R1 L2 R2 L3 R3
  895. pslld m0, m1, 16                   ; m0 = ch0 words moved to the high half
  896. pand m1, m4                        ; m1 = ch1 words, already in the high half
  897. cvtdq2ps m0, m0                    ; (sample << 16) as s32 -> float
  898. cvtdq2ps m1, m1
  899. mulps m0, m0, m3                   ; * 2^-31 == sample / 32768.0
  900. mulps m1, m1, m3
  901. mova [dst0q+lenq], m0
  902. mova [dst1q+lenq], m1
  903. add lenq, mmsize
  904. jl .loop
  905. REP_RET
  906. %endmacro
  907. INIT_XMM sse2
  908. CONV_S16_TO_FLTP_2CH
  909. %if HAVE_AVX_EXTERNAL
  910. INIT_XMM avx
  911. CONV_S16_TO_FLTP_2CH
  912. %endif
  913. ;------------------------------------------------------------------------------
  914. ; void ff_conv_s16_to_fltp_6ch(float *const *dst, int16_t *src, int len,
  915. ; int channels);
  916. ;------------------------------------------------------------------------------
  ; Deinterleave 6-channel packed s16 into 6 planar float planes scaled to
  ; [-1.0,1.0): transpose 24 samples (4 frames), sign-extend words to dwords,
  ; convert to float and multiply by 1/32768.
  917. %macro CONV_S16_TO_FLTP_6CH 0
  918. %if ARCH_X86_64
  919. cglobal conv_s16_to_fltp_6ch, 3,8,7, dst, src, len, dst1, dst2, dst3, dst4, dst5
  920. %else
  921. cglobal conv_s16_to_fltp_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5
  922. %define lend dword r2m             ; x86-32: too few GPRs, read len from its stack slot
  923. %endif
  924. mov dst1q, [dstq+ gprsize]         ; load the 6 per-channel output pointers
  925. mov dst2q, [dstq+2*gprsize]
  926. mov dst3q, [dstq+3*gprsize]
  927. mov dst4q, [dstq+4*gprsize]
  928. mov dst5q, [dstq+5*gprsize]
  929. mov dstq, [dstq ]
  930. sub dst1q, dstq                    ; planes 1-5 become offsets from plane 0 so
  931. sub dst2q, dstq                    ; only dstq needs to advance in the loop
  932. sub dst3q, dstq
  933. sub dst4q, dstq
  934. sub dst5q, dstq
  935. mova m6, [pf_s16_inv_scale]        ; 0x38000000 = 2^-15 = 1.0/32768.0
  936. .loop:
  937. mova m0, [srcq+0*mmsize] ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
  938. mova m3, [srcq+1*mmsize] ; m3 = 8, 9, 10, 11, 12, 13, 14, 15
  939. mova m2, [srcq+2*mmsize] ; m2 = 16, 17, 18, 19, 20, 21, 22, 23
  940. PALIGNR m1, m3, m0, 12, m4 ; m1 = 6, 7, 8, 9, 10, 11, x, x
  941. shufps m3, m2, q1032 ; m3 = 12, 13, 14, 15, 16, 17, 18, 19
  942. psrldq m2, 4 ; m2 = 18, 19, 20, 21, 22, 23, x, x
  943. SBUTTERFLY2 wd, 0, 1, 4 ; m0 = 0, 6, 1, 7, 2, 8, 3, 9
  944. ; m1 = 4, 10, 5, 11, x, x, x, x
  945. SBUTTERFLY2 wd, 3, 2, 4 ; m3 = 12, 18, 13, 19, 14, 20, 15, 21
  946. ; m2 = 16, 22, 17, 23, x, x, x, x
  947. SBUTTERFLY2 dq, 0, 3, 4 ; m0 = 0, 6, 12, 18, 1, 7, 13, 19
  948. ; m3 = 2, 8, 14, 20, 3, 9, 15, 21
  949. punpckldq m1, m2 ; m1 = 4, 10, 16, 22, 5, 11, 17, 23
  950. S16_TO_S32_SX 0, 2 ; m0 = 0, 6, 12, 18
  951. ; m2 = 1, 7, 13, 19
  952. S16_TO_S32_SX 3, 4 ; m3 = 2, 8, 14, 20
  953. ; m4 = 3, 9, 15, 21
  954. S16_TO_S32_SX 1, 5 ; m1 = 4, 10, 16, 22
  955. ; m5 = 5, 11, 17, 23
  956. SWAP 1,2,3,4                       ; rotate so m0..m5 hold channels 0..5 in order
  957. cvtdq2ps m0, m0
  958. cvtdq2ps m1, m1
  959. cvtdq2ps m2, m2
  960. cvtdq2ps m3, m3
  961. cvtdq2ps m4, m4
  962. cvtdq2ps m5, m5
  963. mulps m0, m6                       ; normalize each channel to [-1.0,1.0)
  964. mulps m1, m6
  965. mulps m2, m6
  966. mulps m3, m6
  967. mulps m4, m6
  968. mulps m5, m6
  969. mova [dstq ], m0
  970. mova [dstq+dst1q], m1
  971. mova [dstq+dst2q], m2
  972. mova [dstq+dst3q], m3
  973. mova [dstq+dst4q], m4
  974. mova [dstq+dst5q], m5
  975. add srcq, mmsize*3                 ; consumed 24 s16 samples = 4 frames
  976. add dstq, mmsize                   ; wrote 4 floats per plane
  977. sub lend, mmsize/4                 ; len counts frames: 4 per iteration
  978. jg .loop
  979. REP_RET
  980. %endmacro
  981. %define PALIGNR PALIGNR_MMX        ; SSE2 build: PALIGNR emulated without SSSE3
  982. INIT_XMM sse2
  983. CONV_S16_TO_FLTP_6CH
  984. %define PALIGNR PALIGNR_SSSE3      ; SSSE3+ builds use the native palignr insn
  985. INIT_XMM ssse3
  986. CONV_S16_TO_FLTP_6CH
  987. INIT_XMM sse4
  988. CONV_S16_TO_FLTP_6CH
  989. %if HAVE_AVX_EXTERNAL
  990. INIT_XMM avx
  991. CONV_S16_TO_FLTP_6CH
  992. %endif
  993. ;------------------------------------------------------------------------------
  994. ; void ff_conv_flt_to_s16p_2ch(int16_t *const *dst, float *src, int len,
  995. ; int channels);
  996. ;------------------------------------------------------------------------------
  ; Deinterleave packed stereo float into two planar s16 planes: scale by
  ; 32768, convert to s32, then pack with signed saturation. 8 frames/iter.
  997. %macro CONV_FLT_TO_S16P_2CH 0
  998. cglobal conv_flt_to_s16p_2ch, 3,4,6, dst0, src, len, dst1
  999. lea lenq, [2*lend]                 ; lenq = len*sizeof(int16_t) = bytes per plane
  1000. mov dst1q, [dst0q+gprsize]        ; dst1 = dst[1]
  1001. mov dst0q, [dst0q ]               ; dst0 = dst[0]
  1002. lea srcq, [srcq+4*lenq]           ; src spans 4*lenq bytes (2ch of float)
  1003. add dst0q, lenq                   ; point pointers at their ends and run a
  1004. add dst1q, lenq                   ; negative offset up towards zero
  1005. neg lenq
  1006. mova m5, [pf_s16_scale]           ; 0x47000000 = 32768.0
  1007. .loop:
  1008. mova m0, [srcq+4*lenq ]           ; 16 floats = 8 interleaved frames
  1009. mova m1, [srcq+4*lenq+ mmsize]
  1010. mova m2, [srcq+4*lenq+2*mmsize]
  1011. mova m3, [srcq+4*lenq+3*mmsize]
  1012. DEINT2_PS 0, 1, 4                 ; DEINT2_PS (util.asm) splits even/odd lanes:
  1013. DEINT2_PS 2, 3, 4                 ; m0/m2 = ch0, m1/m3 = ch1
  1014. mulps m0, m0, m5                  ; scale to s16 range
  1015. mulps m1, m1, m5
  1016. mulps m2, m2, m5
  1017. mulps m3, m3, m5
  1018. cvtps2dq m0, m0                   ; float -> s32 (rounds per MXCSR)
  1019. cvtps2dq m1, m1
  1020. cvtps2dq m2, m2
  1021. cvtps2dq m3, m3
  1022. packssdw m0, m2                   ; s32 -> s16 with signed saturation (clips
  1023. packssdw m1, m3                   ; out-of-range floats)
  1024. mova [dst0q+lenq], m0
  1025. mova [dst1q+lenq], m1
  1026. add lenq, mmsize
  1027. jl .loop
  1028. REP_RET
  1029. %endmacro
  1030. INIT_XMM sse2
  1031. CONV_FLT_TO_S16P_2CH
  1032. %if HAVE_AVX_EXTERNAL
  1033. INIT_XMM avx
  1034. CONV_FLT_TO_S16P_2CH
  1035. %endif
  1036. ;------------------------------------------------------------------------------
  1037. ; void ff_conv_flt_to_s16p_6ch(int16_t *const *dst, float *src, int len,
  1038. ; int channels);
  1039. ;------------------------------------------------------------------------------
  ; Deinterleave 6-channel packed float into 6 planar s16 planes: scale by
  ; 32768, convert, pack with saturation, then run the same 24-sample word
  ; transpose used by the s16 6ch converters. 4 frames per iteration.
  1040. %macro CONV_FLT_TO_S16P_6CH 0
  1041. %if ARCH_X86_64
  1042. cglobal conv_flt_to_s16p_6ch, 3,8,7, dst, src, len, dst1, dst2, dst3, dst4, dst5
  1043. %else
  1044. cglobal conv_flt_to_s16p_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5
  1045. %define lend dword r2m            ; x86-32: too few GPRs, read len from its stack slot
  1046. %endif
  1047. mov dst1q, [dstq+ gprsize]        ; load the 6 per-channel output pointers
  1048. mov dst2q, [dstq+2*gprsize]
  1049. mov dst3q, [dstq+3*gprsize]
  1050. mov dst4q, [dstq+4*gprsize]
  1051. mov dst5q, [dstq+5*gprsize]
  1052. mov dstq, [dstq ]
  1053. sub dst1q, dstq                   ; planes 1-5 become offsets from plane 0 so
  1054. sub dst2q, dstq                   ; only dstq needs to advance in the loop
  1055. sub dst3q, dstq
  1056. sub dst4q, dstq
  1057. sub dst5q, dstq
  1058. mova m6, [pf_s16_scale]           ; 0x47000000 = 32768.0
  1059. .loop:
  1060. mulps m0, m6, [srcq+0*mmsize]     ; scale 24 floats (4 frames) to s16 range
  1061. mulps m3, m6, [srcq+1*mmsize]
  1062. mulps m1, m6, [srcq+2*mmsize]
  1063. mulps m4, m6, [srcq+3*mmsize]
  1064. mulps m2, m6, [srcq+4*mmsize]
  1065. mulps m5, m6, [srcq+5*mmsize]
  1066. cvtps2dq m0, m0                   ; float -> s32 (rounds per MXCSR)
  1067. cvtps2dq m1, m1
  1068. cvtps2dq m2, m2
  1069. cvtps2dq m3, m3
  1070. cvtps2dq m4, m4
  1071. cvtps2dq m5, m5
  1072. packssdw m0, m3 ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
  1073. packssdw m1, m4 ; m1 = 8, 9, 10, 11, 12, 13, 14, 15
  1074. packssdw m2, m5 ; m2 = 16, 17, 18, 19, 20, 21, 22, 23
  1075. PALIGNR m3, m1, m0, 12, m4 ; m3 = 6, 7, 8, 9, 10, 11, x, x
  1076. shufps m1, m2, q1032 ; m1 = 12, 13, 14, 15, 16, 17, 18, 19
  1077. psrldq m2, 4 ; m2 = 18, 19, 20, 21, 22, 23, x, x
  1078. SBUTTERFLY2 wd, 0, 3, 4 ; m0 = 0, 6, 1, 7, 2, 8, 3, 9
  1079. ; m3 = 4, 10, 5, 11, x, x, x, x
  1080. SBUTTERFLY2 wd, 1, 2, 4 ; m1 = 12, 18, 13, 19, 14, 20, 15, 21
  1081. ; m2 = 16, 22, 17, 23, x, x, x, x
  1082. SBUTTERFLY2 dq, 0, 1, 4 ; m0 = 0, 6, 12, 18, 1, 7, 13, 19
  1083. ; m1 = 2, 8, 14, 20, 3, 9, 15, 21
  1084. punpckldq m3, m2 ; m3 = 4, 10, 16, 22, 5, 11, 17, 23
  ; sample k belongs to channel k%6; store 8-byte halves per plane
  1085. movq [dstq ], m0                  ; ch0
  1086. movhps [dstq+dst1q], m0           ; ch1
  1087. movq [dstq+dst2q], m1             ; ch2
  1088. movhps [dstq+dst3q], m1           ; ch3
  1089. movq [dstq+dst4q], m3             ; ch4
  1090. movhps [dstq+dst5q], m3           ; ch5
  1091. add srcq, mmsize*6                ; consumed 24 floats = 4 frames
  1092. add dstq, mmsize/2                ; wrote 4 s16 samples per plane
  1093. sub lend, mmsize/4                ; len counts frames: 4 per iteration
  1094. jg .loop
  1095. REP_RET
  1096. %endmacro
  1097. %define PALIGNR PALIGNR_MMX       ; SSE2 build: PALIGNR emulated without SSSE3
  1098. INIT_XMM sse2
  1099. CONV_FLT_TO_S16P_6CH
  1100. %define PALIGNR PALIGNR_SSSE3     ; SSSE3+ builds use the native palignr insn
  1101. INIT_XMM ssse3
  1102. CONV_FLT_TO_S16P_6CH
  1103. %if HAVE_AVX_EXTERNAL
  1104. INIT_XMM avx
  1105. CONV_FLT_TO_S16P_6CH
  1106. %endif
  1107. ;------------------------------------------------------------------------------
  1108. ; void ff_conv_flt_to_fltp_2ch(float *const *dst, float *src, int len,
  1109. ; int channels);
  1110. ;------------------------------------------------------------------------------
  ; Deinterleave packed stereo float into two planar float planes; pure
  ; shuffle, no scaling. 8 floats (4 frames) per iteration.
  1111. %macro CONV_FLT_TO_FLTP_2CH 0
  1112. cglobal conv_flt_to_fltp_2ch, 3,4,3, dst0, src, len, dst1
  1113. lea lenq, [4*lend]                ; lenq = len*sizeof(float) = bytes per plane
  1114. mov dst1q, [dst0q+gprsize]        ; dst1 = dst[1]
  1115. mov dst0q, [dst0q ]               ; dst0 = dst[0]
  1116. lea srcq, [srcq+2*lenq]           ; src spans 2*lenq bytes (both channels)
  1117. add dst0q, lenq                   ; point pointers at their ends and run a
  1118. add dst1q, lenq                   ; negative offset up towards zero
  1119. neg lenq
  1120. .loop:
  1121. mova m0, [srcq+2*lenq ]           ; L0 R0 L1 R1
  1122. mova m1, [srcq+2*lenq+mmsize]     ; L2 R2 L3 R3
  1123. DEINT2_PS 0, 1, 2                 ; DEINT2_PS (util.asm) splits even/odd lanes:
                                        ; m0 = ch0, m1 = ch1
  1124. mova [dst0q+lenq], m0
  1125. mova [dst1q+lenq], m1
  1126. add lenq, mmsize
  1127. jl .loop
  1128. REP_RET
  1129. %endmacro
  1130. INIT_XMM sse
  1131. CONV_FLT_TO_FLTP_2CH
  1132. %if HAVE_AVX_EXTERNAL
  1133. INIT_XMM avx
  1134. CONV_FLT_TO_FLTP_2CH
  1135. %endif
  1136. ;------------------------------------------------------------------------------
  1137. ; void ff_conv_flt_to_fltp_6ch(float *const *dst, float *src, int len,
  1138. ; int channels);
  1139. ;------------------------------------------------------------------------------
  ; Deinterleave 6-channel packed float into 6 planar float planes: a pure
  ; 4x6 dword transpose built from punpckl/punpckh butterflies, no scaling.
  ; 24 floats (4 frames) per iteration.
  1140. %macro CONV_FLT_TO_FLTP_6CH 0
  1141. %if ARCH_X86_64
  1142. cglobal conv_flt_to_fltp_6ch, 3,8,7, dst, src, len, dst1, dst2, dst3, dst4, dst5
  1143. %else
  1144. cglobal conv_flt_to_fltp_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5
  1145. %define lend dword r2m            ; x86-32: too few GPRs, read len from its stack slot
  1146. %endif
  1147. mov dst1q, [dstq+ gprsize]        ; load the 6 per-channel output pointers
  1148. mov dst2q, [dstq+2*gprsize]
  1149. mov dst3q, [dstq+3*gprsize]
  1150. mov dst4q, [dstq+4*gprsize]
  1151. mov dst5q, [dstq+5*gprsize]
  1152. mov dstq, [dstq ]
  1153. sub dst1q, dstq                   ; planes 1-5 become offsets from plane 0 so
  1154. sub dst2q, dstq                   ; only dstq needs to advance in the loop
  1155. sub dst3q, dstq
  1156. sub dst4q, dstq
  1157. sub dst5q, dstq
  1158. .loop:
  1159. mova m0, [srcq+0*mmsize] ; m0 = 0, 1, 2, 3
  1160. mova m1, [srcq+1*mmsize] ; m1 = 4, 5, 6, 7
  1161. mova m2, [srcq+2*mmsize] ; m2 = 8, 9, 10, 11
  1162. mova m3, [srcq+3*mmsize] ; m3 = 12, 13, 14, 15
  1163. mova m4, [srcq+4*mmsize] ; m4 = 16, 17, 18, 19
  1164. mova m5, [srcq+5*mmsize] ; m5 = 20, 21, 22, 23
  1165. SBUTTERFLY2 dq, 0, 3, 6 ; m0 = 0, 12, 1, 13
  1166. ; m3 = 2, 14, 3, 15
  1167. SBUTTERFLY2 dq, 1, 4, 6 ; m1 = 4, 16, 5, 17
  1168. ; m4 = 6, 18, 7, 19
  1169. SBUTTERFLY2 dq, 2, 5, 6 ; m2 = 8, 20, 9, 21
  1170. ; m5 = 10, 22, 11, 23
  1171. SBUTTERFLY2 dq, 0, 4, 6 ; m0 = 0, 6, 12, 18
  1172. ; m4 = 1, 7, 13, 19
  1173. SBUTTERFLY2 dq, 3, 2, 6 ; m3 = 2, 8, 14, 20
  1174. ; m2 = 3, 9, 15, 21
  1175. SBUTTERFLY2 dq, 1, 5, 6 ; m1 = 4, 10, 16, 22
  1176. ; m5 = 5, 11, 17, 23
  ; sample k belongs to channel k%6, so the stores below are channels 0..5
  1177. mova [dstq ], m0                  ; ch0
  1178. mova [dstq+dst1q], m4             ; ch1
  1179. mova [dstq+dst2q], m3             ; ch2
  1180. mova [dstq+dst3q], m2             ; ch3
  1181. mova [dstq+dst4q], m1             ; ch4
  1182. mova [dstq+dst5q], m5             ; ch5
  1183. add srcq, mmsize*6                ; consumed 24 floats = 4 frames
  1184. add dstq, mmsize                  ; wrote 4 floats per plane
  1185. sub lend, mmsize/4                ; len counts frames: 4 per iteration
  1186. jg .loop
  1187. REP_RET
  1188. %endmacro
  1189. INIT_XMM sse2
  1190. CONV_FLT_TO_FLTP_6CH
  1191. %if HAVE_AVX_EXTERNAL
  1192. INIT_XMM avx
  1193. CONV_FLT_TO_FLTP_6CH
  1194. %endif