;******************************************************************************
;* x86 optimized Format Conversion Utils
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2012 Justin Ruggles <justin.ruggles@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"
%include "util.asm"

SECTION_RODATA 32
pf_s32_inv_scale: times 8 dd 0x30000000 ; 2^-31 as float
pf_s32_scale:     times 8 dd 0x4f000000 ; 2^31 as float
pf_s32_clip:      times 8 dd 0x4effffff ; largest float below 2^31 (2147483520.0)
pf_s16_inv_scale: times 4 dd 0x38000000 ; 2^-15 as float
pf_s16_scale:     times 4 dd 0x47000000 ; 2^15 as float (32768.0)
pb_shuf_unpack_even:   db -1, -1, 0, 1, -1, -1, 2, 3, -1, -1, 8, 9, -1, -1, 10, 11
pb_shuf_unpack_odd:    db -1, -1, 4, 5, -1, -1, 6, 7, -1, -1, 12, 13, -1, -1, 14, 15
pb_interleave_words:   SHUFFLE_MASK_W 0, 4, 1, 5, 2, 6, 3, 7
pb_deinterleave_words: SHUFFLE_MASK_W 0, 2, 4, 6, 1, 3, 5, 7
pw_zero_even:          times 4 dw 0x0000, 0xffff

SECTION_TEXT

;------------------------------------------------------------------------------
; void ff_conv_s16_to_s32(int32_t *dst, const int16_t *src, int len);
;------------------------------------------------------------------------------
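; Scalar reference for the loop below (a sketch inferred from the code, not
; part of the original source): the zero/unpack pair places each 16-bit
; sample in the high half of a 32-bit word.
;     for (i = 0; i < len; i++)
;         dst[i] = (int32_t)src[i] << 16;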
INIT_XMM sse2
cglobal conv_s16_to_s32, 3,3,3, dst, src, len
    lea        lenq, [2*lend]
    lea        dstq, [dstq+2*lenq]
    add        srcq, lenq
    neg        lenq
.loop:
    mova       m2, [srcq+lenq]
    pxor       m0, m0
    pxor       m1, m1
    punpcklwd  m0, m2
    punpckhwd  m1, m2
    mova       [dstq+2*lenq       ], m0
    mova       [dstq+2*lenq+mmsize], m1
    add        lenq, mmsize
    jl .loop
    REP_RET

;------------------------------------------------------------------------------
; void ff_conv_s16_to_flt(float *dst, const int16_t *src, int len);
;------------------------------------------------------------------------------
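; Scalar reference (sketch; not from the original source). S16_TO_S32_SX
; sign-extends each sample to 32 bits before the int->float conversion:
;     for (i = 0; i < len; i++)
;         dst[i] = src[i] * (1.0f / 32768.0f);  // pf_s16_inv_scale = 2^-15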
%macro CONV_S16_TO_FLT 0
cglobal conv_s16_to_flt, 3,3,3, dst, src, len
    lea        lenq, [2*lend]
    add        srcq, lenq
    lea        dstq, [dstq+2*lenq]
    neg        lenq
    mova       m2, [pf_s16_inv_scale]
    ALIGN 16
.loop:
    mova       m0, [srcq+lenq]
    S16_TO_S32_SX 0, 1
    cvtdq2ps   m0, m0
    cvtdq2ps   m1, m1
    mulps      m0, m2
    mulps      m1, m2
    mova       [dstq+2*lenq       ], m0
    mova       [dstq+2*lenq+mmsize], m1
    add        lenq, mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16_TO_FLT
INIT_XMM sse4
CONV_S16_TO_FLT

;------------------------------------------------------------------------------
; void ff_conv_s32_to_s16(int16_t *dst, const int32_t *src, int len);
;------------------------------------------------------------------------------
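; Scalar reference (sketch; not from the original source). The arithmetic
; shift keeps the top 16 bits, so packssdw never actually saturates here:
;     for (i = 0; i < len; i++)
;         dst[i] = src[i] >> 16;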
%macro CONV_S32_TO_S16 0
cglobal conv_s32_to_s16, 3,3,4, dst, src, len
    lea        lenq, [2*lend]
    lea        srcq, [srcq+2*lenq]
    add        dstq, lenq
    neg        lenq
.loop:
    mova       m0, [srcq+2*lenq         ]
    mova       m1, [srcq+2*lenq+  mmsize]
    mova       m2, [srcq+2*lenq+2*mmsize]
    mova       m3, [srcq+2*lenq+3*mmsize]
    psrad      m0, 16
    psrad      m1, 16
    psrad      m2, 16
    psrad      m3, 16
    packssdw   m0, m1
    packssdw   m2, m3
    mova       [dstq+lenq       ], m0
    mova       [dstq+lenq+mmsize], m2
    add        lenq, mmsize*2
    jl .loop
%if mmsize == 8
    emms
    RET
%else
    REP_RET
%endif
%endmacro

INIT_MMX mmx
CONV_S32_TO_S16
INIT_XMM sse2
CONV_S32_TO_S16

;------------------------------------------------------------------------------
; void ff_conv_s32_to_flt(float *dst, const int32_t *src, int len);
;------------------------------------------------------------------------------
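; Scalar reference (sketch; not from the original source):
;     for (i = 0; i < len; i++)
;         dst[i] = src[i] * (1.0f / 2147483648.0f);  // pf_s32_inv_scale = 2^-31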
%macro CONV_S32_TO_FLT 0
cglobal conv_s32_to_flt, 3,3,3, dst, src, len
    lea        lenq, [4*lend]
    add        srcq, lenq
    add        dstq, lenq
    neg        lenq
    mova       m0, [pf_s32_inv_scale]
    ALIGN 16
.loop:
    cvtdq2ps   m1, [srcq+lenq       ]
    cvtdq2ps   m2, [srcq+lenq+mmsize]
    mulps      m1, m1, m0
    mulps      m2, m2, m0
    mova       [dstq+lenq       ], m1
    mova       [dstq+lenq+mmsize], m2
    add        lenq, mmsize*2
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S32_TO_FLT
INIT_YMM avx
CONV_S32_TO_FLT

;------------------------------------------------------------------------------
; void ff_conv_flt_to_s16(int16_t *dst, const float *src, int len);
;------------------------------------------------------------------------------
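; Scalar reference (sketch; not from the original source). cvtps2dq rounds to
; nearest and packssdw saturates to the s16 range, which lrintf() and
; av_clip_int16() stand in for here:
;     for (i = 0; i < len; i++)
;         dst[i] = av_clip_int16(lrintf(src[i] * 32768.0f));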
INIT_XMM sse2
cglobal conv_flt_to_s16, 3,3,5, dst, src, len
    lea        lenq, [2*lend]
    lea        srcq, [srcq+2*lenq]
    add        dstq, lenq
    neg        lenq
    mova       m4, [pf_s16_scale]
.loop:
    mova       m0, [srcq+2*lenq         ]
    mova       m1, [srcq+2*lenq+1*mmsize]
    mova       m2, [srcq+2*lenq+2*mmsize]
    mova       m3, [srcq+2*lenq+3*mmsize]
    mulps      m0, m4
    mulps      m1, m4
    mulps      m2, m4
    mulps      m3, m4
    cvtps2dq   m0, m0
    cvtps2dq   m1, m1
    cvtps2dq   m2, m2
    cvtps2dq   m3, m3
    packssdw   m0, m1
    packssdw   m2, m3
    mova       [dstq+lenq       ], m0
    mova       [dstq+lenq+mmsize], m2
    add        lenq, mmsize*2
    jl .loop
    REP_RET

;------------------------------------------------------------------------------
; void ff_conv_flt_to_s32(int32_t *dst, const float *src, int len);
;------------------------------------------------------------------------------
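; Scalar reference (sketch; not from the original source). pf_s32_clip is the
; largest float below 2^31; without the minps, cvtps2dq would turn positive
; overflow into INT32_MIN. Negative overflow already saturates correctly.
;     for (i = 0; i < len; i++)
;         dst[i] = (int32_t)lrintf(fminf(src[i] * 2147483648.0f, 2147483520.0f));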
%macro CONV_FLT_TO_S32 0
cglobal conv_flt_to_s32, 3,3,6, dst, src, len
    lea        lenq, [lend*4]
    add        srcq, lenq
    add        dstq, lenq
    neg        lenq
    mova       m4, [pf_s32_scale]
    mova       m5, [pf_s32_clip]
.loop:
    mulps      m0, m4, [srcq+lenq         ]
    mulps      m1, m4, [srcq+lenq+1*mmsize]
    mulps      m2, m4, [srcq+lenq+2*mmsize]
    mulps      m3, m4, [srcq+lenq+3*mmsize]
    minps      m0, m0, m5
    minps      m1, m1, m5
    minps      m2, m2, m5
    minps      m3, m3, m5
    cvtps2dq   m0, m0
    cvtps2dq   m1, m1
    cvtps2dq   m2, m2
    cvtps2dq   m3, m3
    mova       [dstq+lenq         ], m0
    mova       [dstq+lenq+1*mmsize], m1
    mova       [dstq+lenq+2*mmsize], m2
    mova       [dstq+lenq+3*mmsize], m3
    add        lenq, mmsize*4
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_FLT_TO_S32
INIT_YMM avx
CONV_FLT_TO_S32

;------------------------------------------------------------------------------
; void ff_conv_s16p_to_s16_2ch(int16_t *dst, int16_t *const *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
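; Scalar reference (sketch; not from the original source): interleave two
; planar channels into one packed stream.
;     for (i = 0; i < len; i++)
;         for (ch = 0; ch < 2; ch++)
;             dst[2*i+ch] = src[ch][i];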
%macro CONV_S16P_TO_S16_2CH 0
cglobal conv_s16p_to_s16_2ch, 3,4,5, dst, src0, len, src1
    mov        src1q, [src0q+gprsize]
    mov        src0q, [src0q]
    lea        lenq, [2*lend]
    add        src0q, lenq
    add        src1q, lenq
    lea        dstq, [dstq+2*lenq]
    neg        lenq
.loop:
    mova       m0, [src0q+lenq       ]
    mova       m1, [src1q+lenq       ]
    mova       m2, [src0q+lenq+mmsize]
    mova       m3, [src1q+lenq+mmsize]
    SBUTTERFLY2 wd, 0, 1, 4
    SBUTTERFLY2 wd, 2, 3, 4
    mova       [dstq+2*lenq+0*mmsize], m0
    mova       [dstq+2*lenq+1*mmsize], m1
    mova       [dstq+2*lenq+2*mmsize], m2
    mova       [dstq+2*lenq+3*mmsize], m3
    add        lenq, 2*mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16P_TO_S16_2CH
INIT_XMM avx
CONV_S16P_TO_S16_2CH

;------------------------------------------------------------------------------
; void ff_conv_s16p_to_s16_6ch(int16_t *dst, int16_t *const *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
;------------------------------------------------------------------------------
; NOTE: In the 6-channel functions, len could be used as an index on x86-64
;       instead of just a counter, which would avoid incrementing the
;       pointers, but the extra complexity and amount of code is not worth
;       the small gain. On x86-32 there are not enough registers to use len
;       as an index without keeping two of the pointers on the stack and
;       loading them in each iteration.
;------------------------------------------------------------------------------
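; Scalar reference for the 6-channel interleave below (sketch; not from the
; original source):
;     for (i = 0; i < len; i++)
;         for (ch = 0; ch < 6; ch++)
;             dst[6*i+ch] = src[ch][i];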
%macro CONV_S16P_TO_S16_6CH 0
%if ARCH_X86_64
cglobal conv_s16p_to_s16_6ch, 3,8,7, dst, src0, len, src1, src2, src3, src4, src5
%else
cglobal conv_s16p_to_s16_6ch, 2,7,7, dst, src0, src1, src2, src3, src4, src5
%define lend dword r2m
%endif
    mov        src1q, [src0q+1*gprsize]
    mov        src2q, [src0q+2*gprsize]
    mov        src3q, [src0q+3*gprsize]
    mov        src4q, [src0q+4*gprsize]
    mov        src5q, [src0q+5*gprsize]
    mov        src0q, [src0q]
    sub        src1q, src0q
    sub        src2q, src0q
    sub        src3q, src0q
    sub        src4q, src0q
    sub        src5q, src0q
.loop:
%if cpuflag(sse2slow)
    movq       m0, [src0q      ]  ; m0 =  0,  6, 12, 18,  x,  x,  x,  x
    movq       m1, [src0q+src1q]  ; m1 =  1,  7, 13, 19,  x,  x,  x,  x
    movq       m2, [src0q+src2q]  ; m2 =  2,  8, 14, 20,  x,  x,  x,  x
    movq       m3, [src0q+src3q]  ; m3 =  3,  9, 15, 21,  x,  x,  x,  x
    movq       m4, [src0q+src4q]  ; m4 =  4, 10, 16, 22,  x,  x,  x,  x
    movq       m5, [src0q+src5q]  ; m5 =  5, 11, 17, 23,  x,  x,  x,  x
    ; unpack words:
    punpcklwd  m0, m1             ; m0 =  0,  1,  6,  7, 12, 13, 18, 19
    punpcklwd  m2, m3             ; m2 =  2,  3,  8,  9, 14, 15, 20, 21
    punpcklwd  m4, m5             ; m4 =  4,  5, 10, 11, 16, 17, 22, 23
    ; blend dwords
    shufps     m1, m0, m2, q2020  ; m1 =  0,  1, 12, 13,  2,  3, 14, 15
    shufps     m0, m4, q2031      ; m0 =  6,  7, 18, 19,  4,  5, 16, 17
    shufps     m2, m4, q3131      ; m2 =  8,  9, 20, 21, 10, 11, 22, 23
    ; shuffle dwords
    pshufd     m0, m0, q1302      ; m0 =  4,  5,  6,  7, 16, 17, 18, 19
    pshufd     m1, m1, q3120      ; m1 =  0,  1,  2,  3, 12, 13, 14, 15
    pshufd     m2, m2, q3120      ; m2 =  8,  9, 10, 11, 20, 21, 22, 23
    movq       [dstq+0*mmsize/2], m1
    movq       [dstq+1*mmsize/2], m0
    movq       [dstq+2*mmsize/2], m2
    movhps     [dstq+3*mmsize/2], m1
    movhps     [dstq+4*mmsize/2], m0
    movhps     [dstq+5*mmsize/2], m2
    add        src0q, mmsize/2
    add        dstq, mmsize*3
    sub        lend, mmsize/4
%else
    mova       m0, [src0q      ]  ; m0 =  0,  6, 12, 18, 24, 30, 36, 42
    mova       m1, [src0q+src1q]  ; m1 =  1,  7, 13, 19, 25, 31, 37, 43
    mova       m2, [src0q+src2q]  ; m2 =  2,  8, 14, 20, 26, 32, 38, 44
    mova       m3, [src0q+src3q]  ; m3 =  3,  9, 15, 21, 27, 33, 39, 45
    mova       m4, [src0q+src4q]  ; m4 =  4, 10, 16, 22, 28, 34, 40, 46
    mova       m5, [src0q+src5q]  ; m5 =  5, 11, 17, 23, 29, 35, 41, 47
    ; unpack words:
    SBUTTERFLY2 wd, 0, 1, 6       ; m0 =  0,  1,  6,  7, 12, 13, 18, 19
                                  ; m1 = 24, 25, 30, 31, 36, 37, 42, 43
    SBUTTERFLY2 wd, 2, 3, 6       ; m2 =  2,  3,  8,  9, 14, 15, 20, 21
                                  ; m3 = 26, 27, 32, 33, 38, 39, 44, 45
    SBUTTERFLY2 wd, 4, 5, 6       ; m4 =  4,  5, 10, 11, 16, 17, 22, 23
                                  ; m5 = 28, 29, 34, 35, 40, 41, 46, 47
    ; blend dwords
    shufps     m6, m0, m2, q2020  ; m6 =  0,  1, 12, 13,  2,  3, 14, 15
    shufps     m0, m4, q2031      ; m0 =  6,  7, 18, 19,  4,  5, 16, 17
    shufps     m2, m4, q3131      ; m2 =  8,  9, 20, 21, 10, 11, 22, 23
    SWAP 4,6                      ; m4 =  0,  1, 12, 13,  2,  3, 14, 15
    shufps     m6, m1, m3, q2020  ; m6 = 24, 25, 36, 37, 26, 27, 38, 39
    shufps     m1, m5, q2031      ; m1 = 30, 31, 42, 43, 28, 29, 40, 41
    shufps     m3, m5, q3131      ; m3 = 32, 33, 44, 45, 34, 35, 46, 47
    SWAP 5,6                      ; m5 = 24, 25, 36, 37, 26, 27, 38, 39
    ; shuffle dwords
    pshufd     m0, m0, q1302      ; m0 =  4,  5,  6,  7, 16, 17, 18, 19
    pshufd     m2, m2, q3120      ; m2 =  8,  9, 10, 11, 20, 21, 22, 23
    pshufd     m4, m4, q3120      ; m4 =  0,  1,  2,  3, 12, 13, 14, 15
    pshufd     m1, m1, q1302      ; m1 = 28, 29, 30, 31, 40, 41, 42, 43
    pshufd     m3, m3, q3120      ; m3 = 32, 33, 34, 35, 44, 45, 46, 47
    pshufd     m5, m5, q3120      ; m5 = 24, 25, 26, 27, 36, 37, 38, 39
    ; shuffle qwords
    punpcklqdq m6, m4, m0         ; m6 =  0,  1,  2,  3,  4,  5,  6,  7
    punpckhqdq m0, m2             ; m0 = 16, 17, 18, 19, 20, 21, 22, 23
    shufps     m2, m4, q3210      ; m2 =  8,  9, 10, 11, 12, 13, 14, 15
    SWAP 4,6                      ; m4 =  0,  1,  2,  3,  4,  5,  6,  7
    punpcklqdq m6, m5, m1         ; m6 = 24, 25, 26, 27, 28, 29, 30, 31
    punpckhqdq m1, m3             ; m1 = 40, 41, 42, 43, 44, 45, 46, 47
    shufps     m3, m5, q3210      ; m3 = 32, 33, 34, 35, 36, 37, 38, 39
    SWAP 5,6                      ; m5 = 24, 25, 26, 27, 28, 29, 30, 31
    mova       [dstq+0*mmsize], m4
    mova       [dstq+1*mmsize], m2
    mova       [dstq+2*mmsize], m0
    mova       [dstq+3*mmsize], m5
    mova       [dstq+4*mmsize], m3
    mova       [dstq+5*mmsize], m1
    add        src0q, mmsize
    add        dstq, mmsize*6
    sub        lend, mmsize/2
%endif
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16P_TO_S16_6CH
INIT_XMM sse2slow
CONV_S16P_TO_S16_6CH
INIT_XMM avx
CONV_S16P_TO_S16_6CH

;------------------------------------------------------------------------------
; void ff_conv_s16p_to_flt_2ch(float *dst, int16_t *const *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
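; Scalar reference (sketch; not from the original source). The samples are
; unpacked into the high halves of dwords (value << 16), so the 2^-31 scale
; nets out to a division by 32768:
;     for (i = 0; i < len; i++)
;         for (ch = 0; ch < 2; ch++)
;             dst[2*i+ch] = src[ch][i] * (1.0f / 32768.0f);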
%macro CONV_S16P_TO_FLT_2CH 0
cglobal conv_s16p_to_flt_2ch, 3,4,6, dst, src0, len, src1
    lea        lenq, [2*lend]
    mov        src1q, [src0q+gprsize]
    mov        src0q, [src0q]
    lea        dstq, [dstq+4*lenq]
    add        src0q, lenq
    add        src1q, lenq
    neg        lenq
    mova       m5, [pf_s32_inv_scale]
.loop:
    mova       m2, [src0q+lenq]   ; m2 =  0,  2,  4,  6,  8, 10, 12, 14
    mova       m4, [src1q+lenq]   ; m4 =  1,  3,  5,  7,  9, 11, 13, 15
    SBUTTERFLY2 wd, 2, 4, 3       ; m2 =  0,  1,  2,  3,  4,  5,  6,  7
                                  ; m4 =  8,  9, 10, 11, 12, 13, 14, 15
    pxor       m3, m3
    punpcklwd  m0, m3, m2         ; m0 =  0,  1,  2,  3
    punpckhwd  m1, m3, m2         ; m1 =  4,  5,  6,  7
    punpcklwd  m2, m3, m4         ; m2 =  8,  9, 10, 11
    punpckhwd  m3, m4             ; m3 = 12, 13, 14, 15
    cvtdq2ps   m0, m0
    cvtdq2ps   m1, m1
    cvtdq2ps   m2, m2
    cvtdq2ps   m3, m3
    mulps      m0, m5
    mulps      m1, m5
    mulps      m2, m5
    mulps      m3, m5
    mova       [dstq+4*lenq         ], m0
    mova       [dstq+4*lenq+  mmsize], m1
    mova       [dstq+4*lenq+2*mmsize], m2
    mova       [dstq+4*lenq+3*mmsize], m3
    add        lenq, mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16P_TO_FLT_2CH
INIT_XMM avx
CONV_S16P_TO_FLT_2CH

;------------------------------------------------------------------------------
; void ff_conv_s16p_to_flt_6ch(float *dst, int16_t *const *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
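; Scalar reference (sketch; not from the original source):
;     for (i = 0; i < len; i++)
;         for (ch = 0; ch < 6; ch++)
;             dst[6*i+ch] = src[ch][i] * (1.0f / 32768.0f);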
%macro CONV_S16P_TO_FLT_6CH 0
%if ARCH_X86_64
cglobal conv_s16p_to_flt_6ch, 3,8,8, dst, src, len, src1, src2, src3, src4, src5
%else
cglobal conv_s16p_to_flt_6ch, 2,7,8, dst, src, src1, src2, src3, src4, src5
%define lend dword r2m
%endif
    mov        src1q, [srcq+1*gprsize]
    mov        src2q, [srcq+2*gprsize]
    mov        src3q, [srcq+3*gprsize]
    mov        src4q, [srcq+4*gprsize]
    mov        src5q, [srcq+5*gprsize]
    mov        srcq, [srcq]
    sub        src1q, srcq
    sub        src2q, srcq
    sub        src3q, srcq
    sub        src4q, srcq
    sub        src5q, srcq
    mova       m7, [pf_s32_inv_scale]
%if cpuflag(ssse3)
%define unpack_even m6
    mova       m6, [pb_shuf_unpack_even]
%if ARCH_X86_64
%define unpack_odd m8
    mova       m8, [pb_shuf_unpack_odd]
%else
%define unpack_odd [pb_shuf_unpack_odd]
%endif
%endif
.loop:
    movq       m0, [srcq      ]   ; m0 =  0,  6, 12, 18,  x,  x,  x,  x
    movq       m1, [srcq+src1q]   ; m1 =  1,  7, 13, 19,  x,  x,  x,  x
    movq       m2, [srcq+src2q]   ; m2 =  2,  8, 14, 20,  x,  x,  x,  x
    movq       m3, [srcq+src3q]   ; m3 =  3,  9, 15, 21,  x,  x,  x,  x
    movq       m4, [srcq+src4q]   ; m4 =  4, 10, 16, 22,  x,  x,  x,  x
    movq       m5, [srcq+src5q]   ; m5 =  5, 11, 17, 23,  x,  x,  x,  x
    ; unpack words:
    punpcklwd  m0, m1             ; m0 =  0,  1,  6,  7, 12, 13, 18, 19
    punpcklwd  m2, m3             ; m2 =  2,  3,  8,  9, 14, 15, 20, 21
    punpcklwd  m4, m5             ; m4 =  4,  5, 10, 11, 16, 17, 22, 23
    ; blend dwords
    shufps     m1, m4, m0, q3120  ; m1 =  4,  5, 16, 17,  6,  7, 18, 19
    shufps     m0, m2, q2020      ; m0 =  0,  1, 12, 13,  2,  3, 14, 15
    shufps     m2, m4, q3131      ; m2 =  8,  9, 20, 21, 10, 11, 22, 23
%if cpuflag(ssse3)
    pshufb     m3, m0, unpack_odd   ; m3 = 12, 13, 14, 15
    pshufb     m0, unpack_even      ; m0 =  0,  1,  2,  3
    pshufb     m4, m1, unpack_odd   ; m4 = 16, 17, 18, 19
    pshufb     m1, unpack_even      ; m1 =  4,  5,  6,  7
    pshufb     m5, m2, unpack_odd   ; m5 = 20, 21, 22, 23
    pshufb     m2, unpack_even      ; m2 =  8,  9, 10, 11
%else
    ; shuffle dwords
    pshufd     m0, m0, q3120      ; m0 =  0,  1,  2,  3, 12, 13, 14, 15
    pshufd     m1, m1, q3120      ; m1 =  4,  5,  6,  7, 16, 17, 18, 19
    pshufd     m2, m2, q3120      ; m2 =  8,  9, 10, 11, 20, 21, 22, 23
    pxor       m6, m6             ; convert s16 in m0-m2 to s32 in m0-m5
    punpcklwd  m3, m6, m0         ; m3 =  0,  1,  2,  3
    punpckhwd  m4, m6, m0         ; m4 = 12, 13, 14, 15
    punpcklwd  m0, m6, m1         ; m0 =  4,  5,  6,  7
    punpckhwd  m5, m6, m1         ; m5 = 16, 17, 18, 19
    punpcklwd  m1, m6, m2         ; m1 =  8,  9, 10, 11
    punpckhwd  m6, m2             ; m6 = 20, 21, 22, 23
    SWAP 6,2,1,0,3,4,5            ; swap registers 3,0,1,4,5,6 to 0,1,2,3,4,5
%endif
    cvtdq2ps   m0, m0             ; convert s32 to float
    cvtdq2ps   m1, m1
    cvtdq2ps   m2, m2
    cvtdq2ps   m3, m3
    cvtdq2ps   m4, m4
    cvtdq2ps   m5, m5
    mulps      m0, m7             ; scale float from s32 range to [-1.0,1.0]
    mulps      m1, m7
    mulps      m2, m7
    mulps      m3, m7
    mulps      m4, m7
    mulps      m5, m7
    mova       [dstq         ], m0
    mova       [dstq+  mmsize], m1
    mova       [dstq+2*mmsize], m2
    mova       [dstq+3*mmsize], m3
    mova       [dstq+4*mmsize], m4
    mova       [dstq+5*mmsize], m5
    add        srcq, mmsize/2
    add        dstq, mmsize*6
    sub        lend, mmsize/4
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16P_TO_FLT_6CH
INIT_XMM ssse3
CONV_S16P_TO_FLT_6CH
INIT_XMM avx
CONV_S16P_TO_FLT_6CH

;------------------------------------------------------------------------------
; void ff_conv_fltp_to_s16_2ch(int16_t *dst, float *const *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
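; Scalar reference (sketch; not from the original source):
;     for (i = 0; i < len; i++)
;         for (ch = 0; ch < 2; ch++)
;             dst[2*i+ch] = av_clip_int16(lrintf(src[ch][i] * 32768.0f));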
%macro CONV_FLTP_TO_S16_2CH 0
cglobal conv_fltp_to_s16_2ch, 3,4,3, dst, src0, len, src1
    lea        lenq, [4*lend]
    mov        src1q, [src0q+gprsize]
    mov        src0q, [src0q]
    add        dstq, lenq
    add        src0q, lenq
    add        src1q, lenq
    neg        lenq
    mova       m2, [pf_s16_scale]
%if cpuflag(ssse3)
    mova       m3, [pb_interleave_words]
%endif
.loop:
    mulps      m0, m2, [src0q+lenq]  ; m0 = 0, 2, 4, 6
    mulps      m1, m2, [src1q+lenq]  ; m1 = 1, 3, 5, 7
    cvtps2dq   m0, m0
    cvtps2dq   m1, m1
%if cpuflag(ssse3)
    packssdw   m0, m1             ; m0 = 0, 2, 4, 6, 1, 3, 5, 7
    pshufb     m0, m3             ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
%else
    packssdw   m0, m0             ; m0 = 0, 2, 4, 6, x, x, x, x
    packssdw   m1, m1             ; m1 = 1, 3, 5, 7, x, x, x, x
    punpcklwd  m0, m1             ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
%endif
    mova       [dstq+lenq], m0
    add        lenq, mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_FLTP_TO_S16_2CH
INIT_XMM ssse3
CONV_FLTP_TO_S16_2CH

;------------------------------------------------------------------------------
; void ff_conv_fltp_to_s16_6ch(int16_t *dst, float *const *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
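; Scalar reference (sketch; not from the original source). In the MMX/SSE
; path, cvtps2pi rounds the same way cvtps2dq does:
;     for (i = 0; i < len; i++)
;         for (ch = 0; ch < 6; ch++)
;             dst[6*i+ch] = av_clip_int16(lrintf(src[ch][i] * 32768.0f));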
%macro CONV_FLTP_TO_S16_6CH 0
%if ARCH_X86_64
cglobal conv_fltp_to_s16_6ch, 3,8,7, dst, src, len, src1, src2, src3, src4, src5
%else
cglobal conv_fltp_to_s16_6ch, 2,7,7, dst, src, src1, src2, src3, src4, src5
%define lend dword r2m
%endif
    mov        src1q, [srcq+1*gprsize]
    mov        src2q, [srcq+2*gprsize]
    mov        src3q, [srcq+3*gprsize]
    mov        src4q, [srcq+4*gprsize]
    mov        src5q, [srcq+5*gprsize]
    mov        srcq, [srcq]
    sub        src1q, srcq
    sub        src2q, srcq
    sub        src3q, srcq
    sub        src4q, srcq
    sub        src5q, srcq
    movaps     xmm6, [pf_s16_scale]
.loop:
%if cpuflag(sse2)
    mulps      m0, m6, [srcq      ]
    mulps      m1, m6, [srcq+src1q]
    mulps      m2, m6, [srcq+src2q]
    mulps      m3, m6, [srcq+src3q]
    mulps      m4, m6, [srcq+src4q]
    mulps      m5, m6, [srcq+src5q]
    cvtps2dq   m0, m0
    cvtps2dq   m1, m1
    cvtps2dq   m2, m2
    cvtps2dq   m3, m3
    cvtps2dq   m4, m4
    cvtps2dq   m5, m5
    packssdw   m0, m3             ; m0 =  0,  6, 12, 18,  3,  9, 15, 21
    packssdw   m1, m4             ; m1 =  1,  7, 13, 19,  4, 10, 16, 22
    packssdw   m2, m5             ; m2 =  2,  8, 14, 20,  5, 11, 17, 23
    ; unpack words:
    movhlps    m3, m0             ; m3 =  3,  9, 15, 21,  x,  x,  x,  x
    punpcklwd  m0, m1             ; m0 =  0,  1,  6,  7, 12, 13, 18, 19
    punpckhwd  m1, m2             ; m1 =  4,  5, 10, 11, 16, 17, 22, 23
    punpcklwd  m2, m3             ; m2 =  2,  3,  8,  9, 14, 15, 20, 21
    ; blend dwords:
    shufps     m3, m0, m2, q2020  ; m3 =  0,  1, 12, 13,  2,  3, 14, 15
    shufps     m0, m1, q2031      ; m0 =  6,  7, 18, 19,  4,  5, 16, 17
    shufps     m2, m1, q3131      ; m2 =  8,  9, 20, 21, 10, 11, 22, 23
    ; shuffle dwords:
    shufps     m1, m2, m3, q3120  ; m1 =  8,  9, 10, 11, 12, 13, 14, 15
    shufps     m3, m0, q0220      ; m3 =  0,  1,  2,  3,  4,  5,  6,  7
    shufps     m0, m2, q3113      ; m0 = 16, 17, 18, 19, 20, 21, 22, 23
    mova       [dstq+0*mmsize], m3
    mova       [dstq+1*mmsize], m1
    mova       [dstq+2*mmsize], m0
%else ; sse
    movlps     xmm0, [srcq      ]
    movlps     xmm1, [srcq+src1q]
    movlps     xmm2, [srcq+src2q]
    movlps     xmm3, [srcq+src3q]
    movlps     xmm4, [srcq+src4q]
    movlps     xmm5, [srcq+src5q]
    mulps      xmm0, xmm6
    mulps      xmm1, xmm6
    mulps      xmm2, xmm6
    mulps      xmm3, xmm6
    mulps      xmm4, xmm6
    mulps      xmm5, xmm6
    cvtps2pi   mm0, xmm0
    cvtps2pi   mm1, xmm1
    cvtps2pi   mm2, xmm2
    cvtps2pi   mm3, xmm3
    cvtps2pi   mm4, xmm4
    cvtps2pi   mm5, xmm5
    packssdw   mm0, mm3           ; m0 =  0,  6,  3,  9
    packssdw   mm1, mm4           ; m1 =  1,  7,  4, 10
    packssdw   mm2, mm5           ; m2 =  2,  8,  5, 11
    ; unpack words
    pshufw     mm3, mm0, q1032    ; m3 =  3,  9,  0,  6
    punpcklwd  mm0, mm1           ; m0 =  0,  1,  6,  7
    punpckhwd  mm1, mm2           ; m1 =  4,  5, 10, 11
    punpcklwd  mm2, mm3           ; m2 =  2,  3,  8,  9
    ; unpack dwords
    pshufw     mm3, mm0, q1032    ; m3 =  6,  7,  0,  1
    punpckldq  mm0, mm2           ; m0 =  0,  1,  2,  3 (final)
    punpckhdq  mm2, mm1           ; m2 =  8,  9, 10, 11 (final)
    punpckldq  mm1, mm3           ; m1 =  4,  5,  6,  7 (final)
    mova       [dstq+0*mmsize], mm0
    mova       [dstq+1*mmsize], mm1
    mova       [dstq+2*mmsize], mm2
%endif
    add        srcq, mmsize
    add        dstq, mmsize*3
    sub        lend, mmsize/4
    jg .loop
%if mmsize == 8
    emms
    RET
%else
    REP_RET
%endif
%endmacro

INIT_MMX sse
CONV_FLTP_TO_S16_6CH
INIT_XMM sse2
CONV_FLTP_TO_S16_6CH
INIT_XMM avx
CONV_FLTP_TO_S16_6CH

;------------------------------------------------------------------------------
; void ff_conv_fltp_to_flt_2ch(float *dst, float *const *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
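; Scalar reference (sketch; not from the original source): plain interleave,
; no scaling.
;     for (i = 0; i < len; i++)
;         for (ch = 0; ch < 2; ch++)
;             dst[2*i+ch] = src[ch][i];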
%macro CONV_FLTP_TO_FLT_2CH 0
cglobal conv_fltp_to_flt_2ch, 3,4,5, dst, src0, len, src1
    mov        src1q, [src0q+gprsize]
    mov        src0q, [src0q]
    lea        lenq, [4*lend]
    add        src0q, lenq
    add        src1q, lenq
    lea        dstq, [dstq+2*lenq]
    neg        lenq
.loop:
    mova       m0, [src0q+lenq       ]
    mova       m1, [src1q+lenq       ]
    mova       m2, [src0q+lenq+mmsize]
    mova       m3, [src1q+lenq+mmsize]
    SBUTTERFLYPS 0, 1, 4
    SBUTTERFLYPS 2, 3, 4
    mova       [dstq+2*lenq+0*mmsize], m0
    mova       [dstq+2*lenq+1*mmsize], m1
    mova       [dstq+2*lenq+2*mmsize], m2
    mova       [dstq+2*lenq+3*mmsize], m3
    add        lenq, 2*mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse
CONV_FLTP_TO_FLT_2CH
INIT_XMM avx
CONV_FLTP_TO_FLT_2CH

;------------------------------------------------------------------------------
; void ff_conv_fltp_to_flt_6ch(float *dst, float *const *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
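; Scalar reference (sketch; not from the original source):
;     for (i = 0; i < len; i++)
;         for (ch = 0; ch < 6; ch++)
;             dst[6*i+ch] = src[ch][i];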
%macro CONV_FLTP_TO_FLT_6CH 0
cglobal conv_fltp_to_flt_6ch, 2,8,7, dst, src, src1, src2, src3, src4, src5, len
%if ARCH_X86_64
    mov        lend, r2d
%else
%define lend dword r2m
%endif
    mov        src1q, [srcq+1*gprsize]
    mov        src2q, [srcq+2*gprsize]
    mov        src3q, [srcq+3*gprsize]
    mov        src4q, [srcq+4*gprsize]
    mov        src5q, [srcq+5*gprsize]
    mov        srcq, [srcq]
    sub        src1q, srcq
    sub        src2q, srcq
    sub        src3q, srcq
    sub        src4q, srcq
    sub        src5q, srcq
.loop:
    mova       m0, [srcq      ]
    mova       m1, [srcq+src1q]
    mova       m2, [srcq+src2q]
    mova       m3, [srcq+src3q]
    mova       m4, [srcq+src4q]
    mova       m5, [srcq+src5q]
%if cpuflag(sse4)
    SBUTTERFLYPS 0, 1, 6
    SBUTTERFLYPS 2, 3, 6
    SBUTTERFLYPS 4, 5, 6
    blendps    m6, m4, m0, 1100b
    movlhps    m0, m2
    movhlps    m4, m2
    blendps    m2, m5, m1, 1100b
    movlhps    m1, m3
    movhlps    m5, m3
    movaps     [dstq   ], m0
    movaps     [dstq+16], m6
    movaps     [dstq+32], m4
    movaps     [dstq+48], m1
    movaps     [dstq+64], m2
    movaps     [dstq+80], m5
%else ; mmx
    SBUTTERFLY dq, 0, 1, 6
    SBUTTERFLY dq, 2, 3, 6
    SBUTTERFLY dq, 4, 5, 6
    movq       [dstq   ], m0
    movq       [dstq+ 8], m2
    movq       [dstq+16], m4
    movq       [dstq+24], m1
    movq       [dstq+32], m3
    movq       [dstq+40], m5
%endif
    add        srcq, mmsize
    add        dstq, mmsize*6
    sub        lend, mmsize/4
    jg .loop
%if mmsize == 8
    emms
    RET
%else
    REP_RET
%endif
%endmacro

INIT_MMX mmx
CONV_FLTP_TO_FLT_6CH
INIT_XMM sse4
CONV_FLTP_TO_FLT_6CH
INIT_XMM avx
CONV_FLTP_TO_FLT_6CH

;------------------------------------------------------------------------------
; void ff_conv_s16_to_s16p_2ch(int16_t *const *dst, int16_t *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
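; Scalar reference (sketch; not from the original source): split one packed
; stereo stream into two planes.
;     for (i = 0; i < len; i++)
;         for (ch = 0; ch < 2; ch++)
;             dst[ch][i] = src[2*i+ch];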
%macro CONV_S16_TO_S16P_2CH 0
cglobal conv_s16_to_s16p_2ch, 3,4,4, dst0, src, len, dst1
    lea        lenq, [2*lend]
    mov        dst1q, [dst0q+gprsize]
    mov        dst0q, [dst0q]
    lea        srcq, [srcq+2*lenq]
    add        dst0q, lenq
    add        dst1q, lenq
    neg        lenq
%if cpuflag(ssse3)
    mova       m3, [pb_deinterleave_words]
%endif
.loop:
    mova       m0, [srcq+2*lenq       ]  ; m0 =  0,  1,  2,  3,  4,  5,  6,  7
    mova       m1, [srcq+2*lenq+mmsize]  ; m1 =  8,  9, 10, 11, 12, 13, 14, 15
%if cpuflag(ssse3)
    pshufb     m0, m3             ; m0 =  0,  2,  4,  6,  1,  3,  5,  7
    pshufb     m1, m3             ; m1 =  8, 10, 12, 14,  9, 11, 13, 15
    SBUTTERFLY2 qdq, 0, 1, 2      ; m0 =  0,  2,  4,  6,  8, 10, 12, 14
                                  ; m1 =  1,  3,  5,  7,  9, 11, 13, 15
%else ; sse2
    pshuflw    m0, m0, q3120      ; m0 =  0,  2,  1,  3,  4,  5,  6,  7
    pshufhw    m0, m0, q3120      ; m0 =  0,  2,  1,  3,  4,  6,  5,  7
    pshuflw    m1, m1, q3120      ; m1 =  8, 10,  9, 11, 12, 13, 14, 15
    pshufhw    m1, m1, q3120      ; m1 =  8, 10,  9, 11, 12, 14, 13, 15
    DEINT2_PS  0, 1, 2            ; m0 =  0,  2,  4,  6,  8, 10, 12, 14
                                  ; m1 =  1,  3,  5,  7,  9, 11, 13, 15
%endif
    mova       [dst0q+lenq], m0
    mova       [dst1q+lenq], m1
    add        lenq, mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16_TO_S16P_2CH
INIT_XMM ssse3
CONV_S16_TO_S16P_2CH
INIT_XMM avx
CONV_S16_TO_S16P_2CH

;------------------------------------------------------------------------------
; void ff_conv_s16_to_s16p_6ch(int16_t *const *dst, int16_t *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
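; Scalar reference (sketch; not from the original source):
;     for (i = 0; i < len; i++)
;         for (ch = 0; ch < 6; ch++)
;             dst[ch][i] = src[6*i+ch];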
%macro CONV_S16_TO_S16P_6CH 0
%if ARCH_X86_64
cglobal conv_s16_to_s16p_6ch, 3,8,5, dst, src, len, dst1, dst2, dst3, dst4, dst5
%else
cglobal conv_s16_to_s16p_6ch, 2,7,5, dst, src, dst1, dst2, dst3, dst4, dst5
%define lend dword r2m
%endif
    mov        dst1q, [dstq+  gprsize]
    mov        dst2q, [dstq+2*gprsize]
    mov        dst3q, [dstq+3*gprsize]
    mov        dst4q, [dstq+4*gprsize]
    mov        dst5q, [dstq+5*gprsize]
    mov        dstq, [dstq]
    sub        dst1q, dstq
    sub        dst2q, dstq
    sub        dst3q, dstq
    sub        dst4q, dstq
    sub        dst5q, dstq
.loop:
    mova       m0, [srcq+0*mmsize]      ; m0 =  0,  1,  2,  3,  4,  5,  6,  7
    mova       m3, [srcq+1*mmsize]      ; m3 =  8,  9, 10, 11, 12, 13, 14, 15
    mova       m2, [srcq+2*mmsize]      ; m2 = 16, 17, 18, 19, 20, 21, 22, 23
    PALIGNR    m1, m3, m0, 12, m4       ; m1 =  6,  7,  8,  9, 10, 11,  x,  x
    shufps     m3, m2, q1032            ; m3 = 12, 13, 14, 15, 16, 17, 18, 19
    psrldq     m2, 4                    ; m2 = 18, 19, 20, 21, 22, 23,  x,  x
    SBUTTERFLY2 wd, 0, 1, 4             ; m0 =  0,  6,  1,  7,  2,  8,  3,  9
                                        ; m1 =  4, 10,  5, 11,  x,  x,  x,  x
    SBUTTERFLY2 wd, 3, 2, 4             ; m3 = 12, 18, 13, 19, 14, 20, 15, 21
                                        ; m2 = 16, 22, 17, 23,  x,  x,  x,  x
    SBUTTERFLY2 dq, 0, 3, 4             ; m0 =  0,  6, 12, 18,  1,  7, 13, 19
                                        ; m3 =  2,  8, 14, 20,  3,  9, 15, 21
    punpckldq  m1, m2                   ; m1 =  4, 10, 16, 22,  5, 11, 17, 23
    movq       [dstq      ], m0
    movhps     [dstq+dst1q], m0
    movq       [dstq+dst2q], m3
    movhps     [dstq+dst3q], m3
    movq       [dstq+dst4q], m1
    movhps     [dstq+dst5q], m1
    add        srcq, mmsize*3
    add        dstq, mmsize/2
    sub        lend, mmsize/4
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16_TO_S16P_6CH
INIT_XMM ssse3
CONV_S16_TO_S16P_6CH
INIT_XMM avx
CONV_S16_TO_S16P_6CH

;------------------------------------------------------------------------------
; void ff_conv_s16_to_fltp_2ch(float *const *dst, int16_t *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
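; Scalar reference (sketch; not from the original source). pw_zero_even keeps
; the odd words (channel 1) in the dword high halves, pslld does the same for
; channel 0, and the 2^-31 scale turns the implicit << 16 into a division by
; 32768:
;     for (i = 0; i < len; i++)
;         for (ch = 0; ch < 2; ch++)
;             dst[ch][i] = src[2*i+ch] * (1.0f / 32768.0f);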
%macro CONV_S16_TO_FLTP_2CH 0
cglobal conv_s16_to_fltp_2ch, 3,4,5, dst0, src, len, dst1
    lea        lenq, [4*lend]
    mov        dst1q, [dst0q+gprsize]
    mov        dst0q, [dst0q]
    add        srcq, lenq
    add        dst0q, lenq
    add        dst1q, lenq
    neg        lenq
    mova       m3, [pf_s32_inv_scale]
    mova       m4, [pw_zero_even]
.loop:
    mova       m1, [srcq+lenq]
    pslld      m0, m1, 16
    pand       m1, m4
    cvtdq2ps   m0, m0
    cvtdq2ps   m1, m1
    mulps      m0, m0, m3
    mulps      m1, m1, m3
    mova       [dst0q+lenq], m0
    mova       [dst1q+lenq], m1
    add        lenq, mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16_TO_FLTP_2CH
INIT_XMM avx
CONV_S16_TO_FLTP_2CH

;------------------------------------------------------------------------------
; void ff_conv_s16_to_fltp_6ch(float *const *dst, int16_t *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
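; Scalar reference (sketch; not from the original source):
;     for (i = 0; i < len; i++)
;         for (ch = 0; ch < 6; ch++)
;             dst[ch][i] = src[6*i+ch] * (1.0f / 32768.0f);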
%macro CONV_S16_TO_FLTP_6CH 0
%if ARCH_X86_64
cglobal conv_s16_to_fltp_6ch, 3,8,7, dst, src, len, dst1, dst2, dst3, dst4, dst5
%else
cglobal conv_s16_to_fltp_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5
%define lend dword r2m
%endif
    mov        dst1q, [dstq+  gprsize]
    mov        dst2q, [dstq+2*gprsize]
    mov        dst3q, [dstq+3*gprsize]
    mov        dst4q, [dstq+4*gprsize]
    mov        dst5q, [dstq+5*gprsize]
    mov        dstq, [dstq]
    sub        dst1q, dstq
    sub        dst2q, dstq
    sub        dst3q, dstq
    sub        dst4q, dstq
    sub        dst5q, dstq
    mova       m6, [pf_s16_inv_scale]
.loop:
    mova       m0, [srcq+0*mmsize]      ; m0 =  0,  1,  2,  3,  4,  5,  6,  7
    mova       m3, [srcq+1*mmsize]      ; m3 =  8,  9, 10, 11, 12, 13, 14, 15
    mova       m2, [srcq+2*mmsize]      ; m2 = 16, 17, 18, 19, 20, 21, 22, 23
    PALIGNR    m1, m3, m0, 12, m4       ; m1 =  6,  7,  8,  9, 10, 11,  x,  x
    shufps     m3, m2, q1032            ; m3 = 12, 13, 14, 15, 16, 17, 18, 19
    psrldq     m2, 4                    ; m2 = 18, 19, 20, 21, 22, 23,  x,  x
    SBUTTERFLY2 wd, 0, 1, 4             ; m0 =  0,  6,  1,  7,  2,  8,  3,  9
                                        ; m1 =  4, 10,  5, 11,  x,  x,  x,  x
    SBUTTERFLY2 wd, 3, 2, 4             ; m3 = 12, 18, 13, 19, 14, 20, 15, 21
                                        ; m2 = 16, 22, 17, 23,  x,  x,  x,  x
    SBUTTERFLY2 dq, 0, 3, 4             ; m0 =  0,  6, 12, 18,  1,  7, 13, 19
                                        ; m3 =  2,  8, 14, 20,  3,  9, 15, 21
    punpckldq  m1, m2                   ; m1 =  4, 10, 16, 22,  5, 11, 17, 23
    S16_TO_S32_SX 0, 2                  ; m0 =  0,  6, 12, 18
                                        ; m2 =  1,  7, 13, 19
    S16_TO_S32_SX 3, 4                  ; m3 =  2,  8, 14, 20
                                        ; m4 =  3,  9, 15, 21
    S16_TO_S32_SX 1, 5                  ; m1 =  4, 10, 16, 22
                                        ; m5 =  5, 11, 17, 23
    SWAP 1,2,3,4
    cvtdq2ps   m0, m0
    cvtdq2ps   m1, m1
    cvtdq2ps   m2, m2
    cvtdq2ps   m3, m3
    cvtdq2ps   m4, m4
    cvtdq2ps   m5, m5
    mulps      m0, m6
    mulps      m1, m6
    mulps      m2, m6
    mulps      m3, m6
    mulps      m4, m6
    mulps      m5, m6
    mova       [dstq      ], m0
    mova       [dstq+dst1q], m1
    mova       [dstq+dst2q], m2
    mova       [dstq+dst3q], m3
    mova       [dstq+dst4q], m4
    mova       [dstq+dst5q], m5
    add        srcq, mmsize*3
    add        dstq, mmsize
    sub        lend, mmsize/4
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16_TO_FLTP_6CH
INIT_XMM ssse3
CONV_S16_TO_FLTP_6CH
INIT_XMM sse4
CONV_S16_TO_FLTP_6CH
INIT_XMM avx
CONV_S16_TO_FLTP_6CH

;------------------------------------------------------------------------------
; void ff_conv_flt_to_s16p_2ch(int16_t *const *dst, float *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
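; Scalar reference (sketch; not from the original source):
;     for (i = 0; i < len; i++)
;         for (ch = 0; ch < 2; ch++)
;             dst[ch][i] = av_clip_int16(lrintf(src[2*i+ch] * 32768.0f));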
%macro CONV_FLT_TO_S16P_2CH 0
cglobal conv_flt_to_s16p_2ch, 3,4,6, dst0, src, len, dst1
    lea        lenq, [2*lend]
    mov        dst1q, [dst0q+gprsize]
    mov        dst0q, [dst0q]
    lea        srcq, [srcq+4*lenq]
    add        dst0q, lenq
    add        dst1q, lenq
    neg        lenq
    mova       m5, [pf_s16_scale]
.loop:
    mova       m0, [srcq+4*lenq         ]
    mova       m1, [srcq+4*lenq+  mmsize]
    mova       m2, [srcq+4*lenq+2*mmsize]
    mova       m3, [srcq+4*lenq+3*mmsize]
    DEINT2_PS  0, 1, 4
    DEINT2_PS  2, 3, 4
    mulps      m0, m0, m5
    mulps      m1, m1, m5
    mulps      m2, m2, m5
    mulps      m3, m3, m5
    cvtps2dq   m0, m0
    cvtps2dq   m1, m1
    cvtps2dq   m2, m2
    cvtps2dq   m3, m3
    packssdw   m0, m2
    packssdw   m1, m3
    mova       [dst0q+lenq], m0
    mova       [dst1q+lenq], m1
    add        lenq, mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_FLT_TO_S16P_2CH
INIT_XMM avx
CONV_FLT_TO_S16P_2CH

;------------------------------------------------------------------------------
; void ff_conv_flt_to_s16p_6ch(int16_t *const *dst, float *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
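; Scalar reference (sketch; not from the original source):
;     for (i = 0; i < len; i++)
;         for (ch = 0; ch < 6; ch++)
;             dst[ch][i] = av_clip_int16(lrintf(src[6*i+ch] * 32768.0f));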
%macro CONV_FLT_TO_S16P_6CH 0
%if ARCH_X86_64
cglobal conv_flt_to_s16p_6ch, 3,8,7, dst, src, len, dst1, dst2, dst3, dst4, dst5
%else
cglobal conv_flt_to_s16p_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5
%define lend dword r2m
%endif
    mov        dst1q, [dstq+  gprsize]
    mov        dst2q, [dstq+2*gprsize]
    mov        dst3q, [dstq+3*gprsize]
    mov        dst4q, [dstq+4*gprsize]
    mov        dst5q, [dstq+5*gprsize]
    mov        dstq, [dstq]
    sub        dst1q, dstq
    sub        dst2q, dstq
    sub        dst3q, dstq
    sub        dst4q, dstq
    sub        dst5q, dstq
    mova       m6, [pf_s16_scale]
.loop:
    mulps      m0, m6, [srcq+0*mmsize]
    mulps      m3, m6, [srcq+1*mmsize]
    mulps      m1, m6, [srcq+2*mmsize]
    mulps      m4, m6, [srcq+3*mmsize]
    mulps      m2, m6, [srcq+4*mmsize]
    mulps      m5, m6, [srcq+5*mmsize]
    cvtps2dq   m0, m0
    cvtps2dq   m1, m1
    cvtps2dq   m2, m2
    cvtps2dq   m3, m3
    cvtps2dq   m4, m4
    cvtps2dq   m5, m5
    packssdw   m0, m3                   ; m0 =  0,  1,  2,  3,  4,  5,  6,  7
    packssdw   m1, m4                   ; m1 =  8,  9, 10, 11, 12, 13, 14, 15
    packssdw   m2, m5                   ; m2 = 16, 17, 18, 19, 20, 21, 22, 23
    PALIGNR    m3, m1, m0, 12, m4       ; m3 =  6,  7,  8,  9, 10, 11,  x,  x
    shufps     m1, m2, q1032            ; m1 = 12, 13, 14, 15, 16, 17, 18, 19
    psrldq     m2, 4                    ; m2 = 18, 19, 20, 21, 22, 23,  x,  x
    SBUTTERFLY2 wd, 0, 3, 4             ; m0 =  0,  6,  1,  7,  2,  8,  3,  9
                                        ; m3 =  4, 10,  5, 11,  x,  x,  x,  x
    SBUTTERFLY2 wd, 1, 2, 4             ; m1 = 12, 18, 13, 19, 14, 20, 15, 21
                                        ; m2 = 16, 22, 17, 23,  x,  x,  x,  x
    SBUTTERFLY2 dq, 0, 1, 4             ; m0 =  0,  6, 12, 18,  1,  7, 13, 19
                                        ; m1 =  2,  8, 14, 20,  3,  9, 15, 21
    punpckldq  m3, m2                   ; m3 =  4, 10, 16, 22,  5, 11, 17, 23
    movq       [dstq      ], m0
    movhps     [dstq+dst1q], m0
    movq       [dstq+dst2q], m1
    movhps     [dstq+dst3q], m1
    movq       [dstq+dst4q], m3
    movhps     [dstq+dst5q], m3
    add        srcq, mmsize*6
    add        dstq, mmsize/2
    sub        lend, mmsize/4
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_FLT_TO_S16P_6CH
INIT_XMM ssse3
CONV_FLT_TO_S16P_6CH
INIT_XMM avx
CONV_FLT_TO_S16P_6CH

;------------------------------------------------------------------------------
; void ff_conv_flt_to_fltp_2ch(float *const *dst, float *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
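; Scalar reference (sketch; not from the original source):
;     for (i = 0; i < len; i++)
;         for (ch = 0; ch < 2; ch++)
;             dst[ch][i] = src[2*i+ch];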
%macro CONV_FLT_TO_FLTP_2CH 0
cglobal conv_flt_to_fltp_2ch, 3,4,3, dst0, src, len, dst1
    lea        lenq, [4*lend]
    mov        dst1q, [dst0q+gprsize]
    mov        dst0q, [dst0q]
    lea        srcq, [srcq+2*lenq]
    add        dst0q, lenq
    add        dst1q, lenq
    neg        lenq
.loop:
    mova       m0, [srcq+2*lenq       ]
    mova       m1, [srcq+2*lenq+mmsize]
    DEINT2_PS  0, 1, 2
    mova       [dst0q+lenq], m0
    mova       [dst1q+lenq], m1
    add        lenq, mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse
CONV_FLT_TO_FLTP_2CH
INIT_XMM avx
CONV_FLT_TO_FLTP_2CH

;------------------------------------------------------------------------------
; void ff_conv_flt_to_fltp_6ch(float *const *dst, float *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
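; Scalar reference (sketch; not from the original source):
;     for (i = 0; i < len; i++)
;         for (ch = 0; ch < 6; ch++)
;             dst[ch][i] = src[6*i+ch];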
%macro CONV_FLT_TO_FLTP_6CH 0
%if ARCH_X86_64
cglobal conv_flt_to_fltp_6ch, 3,8,7, dst, src, len, dst1, dst2, dst3, dst4, dst5
%else
cglobal conv_flt_to_fltp_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5
%define lend dword r2m
%endif
    mov        dst1q, [dstq+  gprsize]
    mov        dst2q, [dstq+2*gprsize]
    mov        dst3q, [dstq+3*gprsize]
    mov        dst4q, [dstq+4*gprsize]
    mov        dst5q, [dstq+5*gprsize]
    mov        dstq, [dstq]
    sub        dst1q, dstq
    sub        dst2q, dstq
    sub        dst3q, dstq
    sub        dst4q, dstq
    sub        dst5q, dstq
.loop:
    mova       m0, [srcq+0*mmsize]  ; m0 =  0,  1,  2,  3
    mova       m1, [srcq+1*mmsize]  ; m1 =  4,  5,  6,  7
    mova       m2, [srcq+2*mmsize]  ; m2 =  8,  9, 10, 11
    mova       m3, [srcq+3*mmsize]  ; m3 = 12, 13, 14, 15
    mova       m4, [srcq+4*mmsize]  ; m4 = 16, 17, 18, 19
    mova       m5, [srcq+5*mmsize]  ; m5 = 20, 21, 22, 23
    SBUTTERFLY2 dq, 0, 3, 6         ; m0 =  0, 12,  1, 13
                                    ; m3 =  2, 14,  3, 15
    SBUTTERFLY2 dq, 1, 4, 6         ; m1 =  4, 16,  5, 17
                                    ; m4 =  6, 18,  7, 19
    SBUTTERFLY2 dq, 2, 5, 6         ; m2 =  8, 20,  9, 21
                                    ; m5 = 10, 22, 11, 23
    SBUTTERFLY2 dq, 0, 4, 6         ; m0 =  0,  6, 12, 18
                                    ; m4 =  1,  7, 13, 19
    SBUTTERFLY2 dq, 3, 2, 6         ; m3 =  2,  8, 14, 20
                                    ; m2 =  3,  9, 15, 21
    SBUTTERFLY2 dq, 1, 5, 6         ; m1 =  4, 10, 16, 22
                                    ; m5 =  5, 11, 17, 23
    mova       [dstq      ], m0
    mova       [dstq+dst1q], m4
    mova       [dstq+dst2q], m3
    mova       [dstq+dst3q], m2
    mova       [dstq+dst4q], m1
    mova       [dstq+dst5q], m5
    add        srcq, mmsize*6
    add        dstq, mmsize
    sub        lend, mmsize/4
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_FLT_TO_FLTP_6CH
INIT_XMM avx
CONV_FLT_TO_FLTP_6CH