You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

521 lines
14KB

  1. ;******************************************************************************
  2. ;* x86 optimized channel mixing
  3. ;* Copyright (c) 2012 Justin Ruggles <justin.ruggles@gmail.com>
  4. ;*
  5. ;* This file is part of Libav.
  6. ;*
  7. ;* Libav is free software; you can redistribute it and/or
  8. ;* modify it under the terms of the GNU Lesser General Public
  9. ;* License as published by the Free Software Foundation; either
  10. ;* version 2.1 of the License, or (at your option) any later version.
  11. ;*
  12. ;* Libav is distributed in the hope that it will be useful,
  13. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. ;* Lesser General Public License for more details.
  16. ;*
  17. ;* You should have received a copy of the GNU Lesser General Public
  18. ;* License along with Libav; if not, write to the Free Software
  19. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. ;******************************************************************************
  21. %include "x86util.asm"
  22. %include "util.asm"
  23. SECTION_TEXT
  24. ;-----------------------------------------------------------------------------
  25. ; void ff_mix_2_to_1_fltp_flt(float **src, float **matrix, int len,
  26. ; int out_ch, int in_ch);
  27. ;-----------------------------------------------------------------------------
  %macro MIX_2_TO_1_FLTP_FLT 0
  ; Emits mix_2_to_1_fltp_flt for the instruction set selected by the preceding
  ; INIT_XMM/INIT_YMM.  Channel 0 is overwritten in place with
  ;     src[0][i] * matrix[0][0] + src[1][i] * matrix[0][1]
  ; Two vectors (2*mmsize bytes) are processed per pass and len is only tested
  ; at the bottom of the loop, so one pass always runs.
  ; NOTE(review): both planes are accessed with mova - presumably they must be
  ; mmsize-aligned and padded to a multiple of the step; confirm with the caller.
  cglobal mix_2_to_1_fltp_flt, 3,4,6, src, matrix, len, src1
      ; m4 / m5 = the two coefficients of output row 0, replicated in every lane
      mov       matrixq, [matrixq]
      VBROADCASTSS   m4, [matrixq]
      VBROADCASTSS   m5, [matrixq+4]

      ; srcq = channel 0 base, src1q = byte distance from channel 0 to channel 1,
      ; so a single advancing pointer addresses both planes
      mov         src1q, [srcq+gprsize]
      mov          srcq, [srcq]
      sub         src1q, srcq

      ALIGN 16
  .mix:
      ; first vector
      mulps          m0, m4, [srcq]
      mulps          m1, m5, [srcq+src1q]
      addps          m0, m0, m1
      ; second vector
      mulps          m2, m4, [srcq+mmsize]
      mulps          m3, m5, [srcq+src1q+mmsize]
      addps          m2, m2, m3

      mova  [srcq],        m0
      mova  [srcq+mmsize], m2

      add          srcq, mmsize*2
      sub          lend, mmsize*2/4         ; floats consumed per pass
      jg .mix
      REP_RET
  %endmacro

  ; SSE build always; 256-bit AVX build only when the assembler supports AVX
  INIT_XMM sse
  MIX_2_TO_1_FLTP_FLT
  %if HAVE_AVX_EXTERNAL
  INIT_YMM avx
  MIX_2_TO_1_FLTP_FLT
  %endif
  57. ;-----------------------------------------------------------------------------
  58. ; void ff_mix_2_to_1_s16p_flt(int16_t **src, float **matrix, int len,
  59. ; int out_ch, int in_ch);
  60. ;-----------------------------------------------------------------------------
  %macro MIX_2_TO_1_S16P_FLT 0
  ; Emits mix_2_to_1_s16p_flt for the current instruction set.  One vector of
  ; int16 samples per channel is widened to int32, converted to float, scaled
  ; by matrix[0][0] / matrix[0][1], summed, converted back to int32 and packed
  ; to int16 with signed saturation.  The result replaces channel 0 in place.
  ; The loop is bottom-tested, so one pass always runs.
  ; NOTE(review): S16_TO_S32_SX comes from util.asm; it is assumed to leave the
  ; two sign-extended halves in the registers named by its arguments.
  cglobal mix_2_to_1_s16p_flt, 3,4,6, src, matrix, len, src1
      ; m4 / m5 = float coefficients for channel 0 / channel 1
      mov       matrixq, [matrixq]
      VBROADCASTSS   m4, [matrixq]
      VBROADCASTSS   m5, [matrixq+4]

      ; srcq = channel 0 base, src1q = byte offset of channel 1 relative to it
      mov         src1q, [srcq+gprsize]
      mov          srcq, [srcq]
      sub         src1q, srcq

      ALIGN 16
  .mix:
      ; channel 0 -> m0/m1 as scaled floats
      mova           m0, [srcq]
      S16_TO_S32_SX   0, 1
      cvtdq2ps       m0, m0
      cvtdq2ps       m1, m1
      mulps          m0, m4
      mulps          m1, m4

      ; channel 1 -> m2/m3 as scaled floats
      mova           m2, [srcq+src1q]
      S16_TO_S32_SX   2, 3
      cvtdq2ps       m2, m2
      cvtdq2ps       m3, m3
      mulps          m2, m5
      mulps          m3, m5

      ; sum, then back to saturated int16
      addps          m0, m2
      addps          m1, m3
      cvtps2dq       m0, m0
      cvtps2dq       m1, m1
      packssdw       m0, m1
      mova       [srcq], m0

      add          srcq, mmsize
      sub          lend, mmsize/2           ; int16 samples consumed per pass
      jg .mix
      REP_RET
  %endmacro

  INIT_XMM sse2
  MIX_2_TO_1_S16P_FLT
  INIT_XMM sse4
  MIX_2_TO_1_S16P_FLT
  98. ;-----------------------------------------------------------------------------
  99. ; void ff_mix_2_to_1_s16p_q8(int16_t **src, int16_t **matrix, int len,
  100. ; int out_ch, int in_ch);
  101. ;-----------------------------------------------------------------------------
  INIT_XMM sse2
  ; Integer-only 2-to-1 mix: channel 0 is replaced in place by
  ;     sat16((src[0][i] * matrix[0][0] + src[1][i] * matrix[0][1]) >> 8)
  ; i.e. the int16 coefficients are used as fixed-point values with 8
  ; fractional bits.  The loop is bottom-tested, so one pass always runs.
  cglobal mix_2_to_1_s16p_q8, 3,4,6, src, matrix, len, src1
      ; One 32-bit load fetches both int16 coefficients.  After the splat and
      ; the interleave with zero:
      ;   m4 = (c0, 0, c0, 0, ...)    m5 = (c1, 0, c1, 0, ...)
      ; so pmaddwd against a (s, s) pair yields s*c + s*0 = s*c as int32.
      mov       matrixq, [matrixq]
      movd           m4, [matrixq]
      movd           m5, [matrixq]
      SPLATW         m4, m4, 0
      SPLATW         m5, m5, 1
      pxor           m0, m0
      punpcklwd      m4, m0
      punpcklwd      m5, m0

      ; srcq = channel 0 base, src1q = byte offset of channel 1 relative to it
      mov         src1q, [srcq+gprsize]
      mov          srcq, [srcq]
      sub         src1q, srcq

      ALIGN 16
  .mix:
      ; channel 0: duplicate every sample into a word pair, multiply by c0
      mova           m0, [srcq]
      punpckhwd      m1, m0, m0
      punpcklwd      m0, m0
      pmaddwd        m0, m4
      pmaddwd        m1, m4

      ; channel 1: same treatment with c1
      mova           m2, [srcq+src1q]
      punpckhwd      m3, m2, m2
      punpcklwd      m2, m2
      pmaddwd        m2, m5
      pmaddwd        m3, m5

      ; sum, drop the 8 fractional bits, saturate to int16
      paddd          m0, m2
      paddd          m1, m3
      psrad          m0, 8
      psrad          m1, 8
      packssdw       m0, m1
      mova       [srcq], m0

      add          srcq, mmsize
      sub          lend, mmsize/2           ; int16 samples consumed per pass
      jg .mix
      REP_RET
  137. ;-----------------------------------------------------------------------------
  138. ; void ff_mix_1_to_2_fltp_flt(float **src, float **matrix, int len,
  139. ; int out_ch, int in_ch);
  140. ;-----------------------------------------------------------------------------
  %macro MIX_1_TO_2_FLTP_FLT 0
  ; Emits mix_1_to_2_fltp_flt for the current instruction set.  Each input
  ; sample of plane 0 is read once and written back twice:
  ;     src[0][i] = in * matrix[0][0]
  ;     src[1][i] = in * matrix[1][0]
  ; The loop is bottom-tested, so one pass always runs.
  cglobal mix_1_to_2_fltp_flt, 3,5,4, src0, matrix0, len, src1, matrix1
      ; m2 / m3 = first coefficient of output row 0 / row 1, replicated
      mov      matrix1q, [matrix0q+gprsize]
      mov      matrix0q, [matrix0q]
      VBROADCASTSS   m2, [matrix0q]
      VBROADCASTSS   m3, [matrix1q]

      ; src0q = plane 0 base, src1q = byte offset of plane 1 relative to it
      mov         src1q, [src0q+gprsize]
      mov         src0q, [src0q]
      sub         src1q, src0q

      ALIGN 16
  .mix:
      mova           m0, [src0q]
      mulps          m1, m0, m3             ; must precede the in-place scale of m0
      mulps          m0, m0, m2
      mova  [src0q],       m0
      mova  [src0q+src1q], m1

      add         src0q, mmsize
      sub          lend, mmsize/4           ; floats consumed per pass
      jg .mix
      REP_RET
  %endmacro

  ; SSE build always; 256-bit AVX build only when the assembler supports AVX
  INIT_XMM sse
  MIX_1_TO_2_FLTP_FLT
  %if HAVE_AVX_EXTERNAL
  INIT_YMM avx
  MIX_1_TO_2_FLTP_FLT
  %endif
  168. ;-----------------------------------------------------------------------------
  169. ; void ff_mix_1_to_2_s16p_flt(int16_t **src, float **matrix, int len,
  170. ; int out_ch, int in_ch);
  171. ;-----------------------------------------------------------------------------
  %macro MIX_1_TO_2_S16P_FLT 0
  ; Emits mix_1_to_2_s16p_flt for the current instruction set.  One vector of
  ; int16 input is widened and converted to float, scaled by matrix[0][0] for
  ; output plane 0 and by matrix[1][0] for output plane 1, then converted back
  ; and packed to int16 with signed saturation.  Plane 0 is both the input and
  ; the first output.  The loop is bottom-tested, so one pass always runs.
  ; NOTE(review): S16_TO_S32_SX comes from util.asm; it is assumed to leave the
  ; two sign-extended halves in the registers named by its arguments.
  cglobal mix_1_to_2_s16p_flt, 3,5,6, src0, matrix0, len, src1, matrix1
      ; m4 / m5 = coefficient for output plane 0 / plane 1, replicated
      mov      matrix1q, [matrix0q+gprsize]
      mov      matrix0q, [matrix0q]
      VBROADCASTSS   m4, [matrix0q]
      VBROADCASTSS   m5, [matrix1q]

      ; src0q = plane 0 base, src1q = byte offset of plane 1 relative to it
      mov         src1q, [src0q+gprsize]
      mov         src0q, [src0q]
      sub         src1q, src0q

      ALIGN 16
  .mix:
      ; m0 / m2 = the two halves of the input as floats
      mova           m0, [src0q]
      S16_TO_S32_SX   0, 2
      cvtdq2ps       m0, m0
      cvtdq2ps       m2, m2

      ; plane 1 products first: they need the unscaled input in m0 / m2
      mulps          m1, m0, m5
      mulps          m3, m2, m5
      mulps          m0, m0, m4
      mulps          m2, m2, m4

      ; plane 0 result
      cvtps2dq       m0, m0
      cvtps2dq       m2, m2
      packssdw       m0, m2
      ; plane 1 result
      cvtps2dq       m1, m1
      cvtps2dq       m3, m3
      packssdw       m1, m3

      mova  [src0q],       m0
      mova  [src0q+src1q], m1

      add         src0q, mmsize
      sub          lend, mmsize/2           ; int16 samples consumed per pass
      jg .mix
      REP_RET
  %endmacro

  INIT_XMM sse2
  MIX_1_TO_2_S16P_FLT
  INIT_XMM sse4
  MIX_1_TO_2_S16P_FLT
  %if HAVE_AVX_EXTERNAL
  INIT_XMM avx
  MIX_1_TO_2_S16P_FLT
  %endif
  212. ;-----------------------------------------------------------------------------
  213. ; void ff_mix_3_8_to_1_2_fltp/s16p_flt(float/int16_t **src, float **matrix,
  214. ; int len, int out_ch, int in_ch);
  215. ;-----------------------------------------------------------------------------
  %macro MIX_3_8_TO_1_2_FLT 3 ; %1 = in channels, %2 = out channels, %3 = s16p or fltp
  ; Emits mix_<in>_to_<out>_<fmt>_flt for the instruction set selected by the
  ; preceding INIT_XMM/INIT_YMM.  Every input plane is multiplied by its float
  ; coefficient from matrix row 0 (and row 1 for stereo) and accumulated; the
  ; sums are written back over input plane 0 (and plane 1 for stereo).
  ;
  ; Coefficients are splat once before the loop.  As many as fit are kept in
  ; vector registers; the remainder lives in an aligned area on the stack.
  ;
  ; define some names to make the code clearer
  %assign  in_channels %1
  %assign out_channels %2
  %assign stereo out_channels - 1
  %ifidn %3, s16p
      %assign is_s16 1
  %else
      %assign is_s16 0
  %endif

  ; determine how many matrix elements must go on the stack vs. mmregs
  ; needed_mmregs = working registers used by the loop body itself
  ; (accumulators, loaded samples and the scratch register passed to fmaddps)
  %assign matrix_elements in_channels * out_channels
  %if is_s16
      %if stereo
          %assign needed_mmregs 7
      %else
          %assign needed_mmregs 5
      %endif
  %else
      %if stereo
          %assign needed_mmregs 4
      %else
          %assign needed_mmregs 3
      %endif
  %endif
  %assign matrix_elements_mm num_mmregs - needed_mmregs
  %if matrix_elements < matrix_elements_mm
      %assign matrix_elements_mm matrix_elements
  %endif
  %if matrix_elements_mm < matrix_elements
      %assign matrix_elements_stack matrix_elements - matrix_elements_mm
  %else
      %assign matrix_elements_stack 0
  %endif

  cglobal mix_%1_to_%2_%3_flt, 3,in_channels+2,needed_mmregs+matrix_elements_mm, src0, src1, len, src2, src3, src4, src5, src6, src7

  ; get aligned stack space if needed
  %if matrix_elements_stack > 0
  %if mmsize == 32
      ; ymm needs 32-byte alignment: keep the original rsp in the one gp
      ; register that is allocated but not bound to a src pointer
      %assign bkpreg %1 + 1
      %define bkpq r %+ bkpreg %+ q
      mov           bkpq, rsp
      and           rsp, ~(mmsize-1)
      sub           rsp, matrix_elements_stack * mmsize
  %else
      %assign matrix_stack_size matrix_elements_stack * mmsize
      %assign pad matrix_stack_size + (mmsize - gprsize) - (stack_offset & (mmsize - gprsize))
      ; On x86-32 with 7 or 8 input channels some src pointers are spilled to
      ; memory.  The src5m/src6m/src7m names produced by cglobal resolve to
      ; the stack slots of call arguments 7-9, which this 5-argument function
      ; does not own, so storing there would corrupt the caller's frame.
      ; Reserve 16 extra bytes right above the matrix area and redirect the
      ; names to that private space instead (16 keeps the xmm alignment).
      ; This branch is always taken for that configuration: 8 xmm registers
      ; cannot hold 7+ coefficients next to the working registers.
      %if ARCH_X86_32 && in_channels >= 7
      %assign pad pad + 0x10
      %define src5m [rsp+matrix_stack_size+0]
      %define src6m [rsp+matrix_stack_size+4]
      %define src7m [rsp+matrix_stack_size+8]
      %endif
      SUB           rsp, pad
  %endif
  %endif

  ; load matrix pointers
  ; (r1 and r3 are reused for src1/src2 once the coefficients have been read)
  %define matrix0q r1q
  %define matrix1q r3q
  %if stereo
      mov      matrix1q, [matrix0q+gprsize]
  %endif
      mov      matrix0q, [matrix0q]

  ; define matrix coeff names
  ; mx_<row>_<ch> expands to either a vector register or a stack slot;
  ; mx_stack_<row>_<ch> records which of the two it is
  %assign %%i 0
  %assign %%j needed_mmregs
  %rep in_channels
      %if %%i >= matrix_elements_mm
          CAT_XDEFINE mx_stack_0_, %%i, 1
          CAT_XDEFINE mx_0_, %%i, [rsp+(%%i-matrix_elements_mm)*mmsize]
      %else
          CAT_XDEFINE mx_stack_0_, %%i, 0
          CAT_XDEFINE mx_0_, %%i, m %+ %%j
          %assign %%j %%j+1
      %endif
      %assign %%i %%i+1
  %endrep
  %if stereo
  %assign %%i 0
  %rep in_channels
      %if in_channels + %%i >= matrix_elements_mm
          CAT_XDEFINE mx_stack_1_, %%i, 1
          CAT_XDEFINE mx_1_, %%i, [rsp+(in_channels+%%i-matrix_elements_mm)*mmsize]
      %else
          CAT_XDEFINE mx_stack_1_, %%i, 0
          CAT_XDEFINE mx_1_, %%i, m %+ %%j
          %assign %%j %%j+1
      %endif
      %assign %%i %%i+1
  %endrep
  %endif

  ; load/splat matrix coeffs
  ; stack-resident coefficients are splat in m0 first and then stored
  %assign %%i 0
  %rep in_channels
      %if mx_stack_0_ %+ %%i
          VBROADCASTSS m0, [matrix0q+4*%%i]
          mova  mx_0_ %+ %%i, m0
      %else
          VBROADCASTSS mx_0_ %+ %%i, [matrix0q+4*%%i]
      %endif
      %if stereo
      %if mx_stack_1_ %+ %%i
          VBROADCASTSS m0, [matrix1q+4*%%i]
          mova  mx_1_ %+ %%i, m0
      %else
          VBROADCASTSS mx_1_ %+ %%i, [matrix1q+4*%%i]
      %endif
      %endif
      %assign %%i %%i+1
  %endrep

  ; load channel pointers to registers as offsets from the first channel pointer
  ; len is converted to a byte count, every pointer is advanced to the end of
  ; its plane and lenq is negated, so the loop indexes [ptr+lenq] while lenq
  ; counts up towards zero
  %if ARCH_X86_64
      movsxd       lenq, r2d
  %endif
      shl          lenq, 2-is_s16
  %assign %%i 1
  %rep (in_channels - 1)
      %if ARCH_X86_32 && in_channels >= 7 && %%i >= 5
      ; no register left: build the pointer in src5q and spill it
      mov         src5q, [src0q+%%i*gprsize]
      add         src5q, lenq
      mov         src %+ %%i %+ m, src5q
      %else
      mov         src %+ %%i %+ q, [src0q+%%i*gprsize]
      add         src %+ %%i %+ q, lenq
      %endif
      %assign %%i %%i+1
  %endrep
      mov         src0q, [src0q]
      add         src0q, lenq
      neg          lenq
  .loop:
  ; for x86-32 with 7-8 channels we do not have enough gp registers for all src
  ; pointers, so we have to load some of them from the stack each time
  %define copy_src_from_stack ARCH_X86_32 && in_channels >= 7 && %%i >= 5
  %if is_s16
      ; mix with s16p input
      ; m0/m1 accumulate output 0, m2/m3 accumulate output 1 (stereo only)
      mova           m0, [src0q+lenq]
      S16_TO_S32_SX   0, 1
      cvtdq2ps       m0, m0
      cvtdq2ps       m1, m1
      %if stereo
      mulps          m2, m0, mx_1_0
      mulps          m3, m1, mx_1_0
      %endif
      mulps          m0, m0, mx_0_0
      mulps          m1, m1, mx_0_0
  %assign %%i 1
  %rep (in_channels - 1)
      %if copy_src_from_stack
          %define src_ptr src5q
      %else
          %define src_ptr src %+ %%i %+ q
      %endif
      %if stereo
      %if copy_src_from_stack
      mov       src_ptr, src %+ %%i %+ m
      %endif
      mova           m4, [src_ptr+lenq]
      S16_TO_S32_SX   4, 5
      cvtdq2ps       m4, m4
      cvtdq2ps       m5, m5
      fmaddps        m2, m4, mx_1_ %+ %%i, m2, m6
      fmaddps        m3, m5, mx_1_ %+ %%i, m3, m6
      fmaddps        m0, m4, mx_0_ %+ %%i, m0, m4
      fmaddps        m1, m5, mx_0_ %+ %%i, m1, m5
      %else
      %if copy_src_from_stack
      mov       src_ptr, src %+ %%i %+ m
      %endif
      mova           m2, [src_ptr+lenq]
      S16_TO_S32_SX   2, 3
      cvtdq2ps       m2, m2
      cvtdq2ps       m3, m3
      fmaddps        m0, m2, mx_0_ %+ %%i, m0, m4
      fmaddps        m1, m3, mx_0_ %+ %%i, m1, m4
      %endif
      %assign %%i %%i+1
  %endrep
      ; convert back to int16 with signed saturation and store
      %if stereo
      cvtps2dq       m2, m2
      cvtps2dq       m3, m3
      packssdw       m2, m3
      mova [src1q+lenq], m2
      %endif
      cvtps2dq       m0, m0
      cvtps2dq       m1, m1
      packssdw       m0, m1
      mova [src0q+lenq], m0
  %else
      ; mix with fltp input
      ; m0 accumulates output 0, m1 accumulates output 1 (stereo only)
      %if stereo || mx_stack_0_0
      mova           m0, [src0q+lenq]
      %endif
      %if stereo
      mulps          m1, m0, mx_1_0
      %endif
      %if stereo || mx_stack_0_0
      mulps          m0, m0, mx_0_0
      %else
      mulps          m0, [src0q+lenq], mx_0_0
      %endif
  %assign %%i 1
  %rep (in_channels - 1)
      %if copy_src_from_stack
          %define src_ptr src5q
          mov   src_ptr, src %+ %%i %+ m
      %else
          %define src_ptr src %+ %%i %+ q
      %endif
      ; avoid extra load for mono if matrix is in a mm register
      %if stereo || mx_stack_0_ %+ %%i
      mova           m2, [src_ptr+lenq]
      %endif
      %if stereo
      fmaddps        m1, m2, mx_1_ %+ %%i, m1, m3
      %endif
      %if stereo || mx_stack_0_ %+ %%i
      fmaddps        m0, m2, mx_0_ %+ %%i, m0, m2
      %else
      fmaddps        m0, mx_0_ %+ %%i, [src_ptr+lenq], m0, m1
      %endif
      %assign %%i %%i+1
  %endrep
      mova [src0q+lenq], m0
      %if stereo
      mova [src1q+lenq], m1
      %endif
  %endif

      add          lenq, mmsize
      jl .loop
  ; restore stack pointer
  %if matrix_elements_stack > 0
  %if mmsize == 32
      mov           rsp, bkpq
  %else
      ADD           rsp, pad
  %endif
  %endif
  ; zero ymm high halves
  %if mmsize == 32
      vzeroupper
  %endif
      RET
  %endmacro
  %macro MIX_3_8_TO_1_2_FLT_FUNCS 0
  ; Instantiates MIX_3_8_TO_1_2_FLT for every input channel count from 3 to 8,
  ; for mono and stereo output, for both sample formats and for each
  ; instruction set the build supports.
  %assign %%nch 3
  %rep 6
      ; --- baseline: SSE for float planes, SSE2 and SSE4 for int16 planes ---
      INIT_XMM sse
      MIX_3_8_TO_1_2_FLT %%nch, 1, fltp
      MIX_3_8_TO_1_2_FLT %%nch, 2, fltp
      INIT_XMM sse2
      MIX_3_8_TO_1_2_FLT %%nch, 1, s16p
      MIX_3_8_TO_1_2_FLT %%nch, 2, s16p
      INIT_XMM sse4
      MIX_3_8_TO_1_2_FLT %%nch, 1, s16p
      MIX_3_8_TO_1_2_FLT %%nch, 2, s16p

      ; do not use ymm AVX or FMA4 in x86-32 for 6 or more channels due to stack alignment issues
      ; --- AVX: float planes use ymm where allowed, int16 planes stay xmm ---
      %if HAVE_AVX_EXTERNAL
          %if ARCH_X86_64 == 0 && %%nch >= 6
              INIT_XMM avx
          %else
              INIT_YMM avx
          %endif
          MIX_3_8_TO_1_2_FLT %%nch, 1, fltp
          MIX_3_8_TO_1_2_FLT %%nch, 2, fltp
          INIT_XMM avx
          MIX_3_8_TO_1_2_FLT %%nch, 1, s16p
          MIX_3_8_TO_1_2_FLT %%nch, 2, s16p
      %endif

      ; --- FMA4: same register-width rule as AVX ---
      %if HAVE_FMA4_EXTERNAL
          %if ARCH_X86_64 == 0 && %%nch >= 6
              INIT_XMM fma4
          %else
              INIT_YMM fma4
          %endif
          MIX_3_8_TO_1_2_FLT %%nch, 1, fltp
          MIX_3_8_TO_1_2_FLT %%nch, 2, fltp
          INIT_XMM fma4
          MIX_3_8_TO_1_2_FLT %%nch, 1, s16p
          MIX_3_8_TO_1_2_FLT %%nch, 2, s16p
      %endif

      %assign %%nch %%nch+1
  %endrep
  %endmacro

  MIX_3_8_TO_1_2_FLT_FUNCS