You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

513 lines
14KB

  1. ;******************************************************************************
  2. ;* x86 optimized channel mixing
  3. ;* Copyright (c) 2012 Justin Ruggles <justin.ruggles@gmail.com>
  4. ;*
  5. ;* This file is part of Libav.
  6. ;*
  7. ;* Libav is free software; you can redistribute it and/or
  8. ;* modify it under the terms of the GNU Lesser General Public
  9. ;* License as published by the Free Software Foundation; either
  10. ;* version 2.1 of the License, or (at your option) any later version.
  11. ;*
  12. ;* Libav is distributed in the hope that it will be useful,
  13. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. ;* Lesser General Public License for more details.
  16. ;*
  17. ;* You should have received a copy of the GNU Lesser General Public
  18. ;* License along with Libav; if not, write to the Free Software
  19. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. ;******************************************************************************
  21. %include "libavutil/x86/x86util.asm"
  22. %include "util.asm"
  23. SECTION_TEXT
  ;-----------------------------------------------------------------------------
  ; void ff_mix_2_to_1_fltp_flt(float **src, float **matrix, int len,
  ;                             int out_ch, int in_ch);
  ;
  ; In-place 2 -> 1 channel mix of planar float samples:
  ;     src[0][i] = src[0][i] * matrix[0][0] + src[1][i] * matrix[0][1]
  ;
  ; Only the first three arguments are loaded into registers.
  ; Each iteration handles two vectors (mmsize*2/4 floats) per channel and the
  ; loop body always executes at least once.
  ; NOTE(review): the mova accesses presumably require mmsize-aligned buffers
  ; and a len padded to a multiple of mmsize*2/4 -- confirm against callers.
  ;-----------------------------------------------------------------------------
  %macro MIX_2_TO_1_FLTP_FLT 0
  cglobal mix_2_to_1_fltp_flt, 3,4,6, src, matrix, len, src1
      mov         src1q, [srcq+gprsize]         ; src1 = src[1]
      mov          srcq, [srcq]                 ; src  = src[0], also the output
      sub         src1q, srcq                   ; keep src[1] as an offset from src[0]
      mov       matrixq, [matrixq]              ; matrix = matrix[0]
      VBROADCASTSS   m4, [matrixq]              ; m4 = splat(matrix[0][0])
      VBROADCASTSS   m5, [matrixq+4]            ; m5 = splat(matrix[0][1])
      ALIGN 16
  .loop:
      mulps          m0, m4, [srcq]
      mulps          m1, m5, [srcq+src1q]
      mulps          m2, m4, [srcq+mmsize]
      mulps          m3, m5, [srcq+src1q+mmsize]
      addps          m0, m0, m1
      addps          m2, m2, m3
      mova  [srcq], m0
      mova  [srcq+mmsize], m2
      add          srcq, mmsize*2
      sub          lend, mmsize*2/4             ; samples consumed per iteration
      jg .loop
  ; Clear the upper YMM halves explicitly in the YMM build, as
  ; MIX_3_8_TO_1_2_FLT does, so that returning to SSE code does not depend on
  ; the epilogue macro doing it. Redundant but harmless if the epilogue macro
  ; emits vzeroupper as well.
  %if mmsize == 32
      vzeroupper
      RET
  %else
      REP_RET
  %endif
  %endmacro

  INIT_XMM sse
  MIX_2_TO_1_FLTP_FLT
  INIT_YMM avx
  MIX_2_TO_1_FLTP_FLT
  ;-----------------------------------------------------------------------------
  ; void ff_mix_2_to_1_s16p_flt(int16_t **src, float **matrix, int len,
  ;                             int out_ch, int in_ch);
  ;
  ; In-place 2 -> 1 channel mix of planar int16 samples using float
  ; coefficients. Per sample:
  ;     src[0][i] = src[0][i] * matrix[0][0] + src[1][i] * matrix[0][1]
  ; computed in float, converted back to int32 and packed to int16 with signed
  ; saturation. One vector (mmsize/2 samples) per channel is handled per
  ; iteration.
  ;-----------------------------------------------------------------------------
  %macro MIX_2_TO_1_S16P_FLT 0
  cglobal mix_2_to_1_s16p_flt, 3,4,6, src, matrix, len, src1
      ; coefficients for the single output row, one splat register each
      mov       matrixq, [matrixq]
      VBROADCASTSS   m4, [matrixq]              ; matrix[0][0]
      VBROADCASTSS   m5, [matrixq+4]            ; matrix[0][1]
      ; channel pointers; channel 1 is addressed relative to channel 0
      mov         src1q, [srcq+gprsize]
      mov          srcq, [srcq]
      sub         src1q, srcq
      ALIGN 16
  .loop:
      ; channel 0: widen to int32 (S16_TO_S32_SX is defined outside this
      ; block), convert to float and scale
      mova           m0, [srcq]
      S16_TO_S32_SX   0, 1
      cvtdq2ps       m0, m0
      cvtdq2ps       m1, m1
      mulps          m0, m4
      mulps          m1, m4
      ; channel 1: same treatment with the second coefficient
      mova           m2, [srcq+src1q]
      S16_TO_S32_SX   2, 3
      cvtdq2ps       m2, m2
      cvtdq2ps       m3, m3
      mulps          m2, m5
      mulps          m3, m5
      ; sum both channels, go back to int16 and overwrite channel 0
      addps          m0, m2
      addps          m1, m3
      cvtps2dq       m0, m0
      cvtps2dq       m1, m1
      packssdw       m0, m1
      mova  [srcq], m0
      add          srcq, mmsize
      sub          lend, mmsize/2               ; int16 samples per vector
      jg .loop
      REP_RET
  %endmacro

  INIT_XMM sse2
  MIX_2_TO_1_S16P_FLT
  INIT_XMM sse4
  MIX_2_TO_1_S16P_FLT
  ;-----------------------------------------------------------------------------
  ; void ff_mix_2_to_1_s16p_q8(int16_t **src, int16_t **matrix, int len,
  ;                            int out_ch, int in_ch);
  ;
  ; In-place 2 -> 1 channel mix of planar int16 samples using int16
  ; coefficients. Per sample:
  ;     src[0][i] = (src[0][i] * matrix[0][0] + src[1][i] * matrix[0][1]) >> 8
  ; The sum is formed in 32 bits, shifted arithmetically and packed to int16
  ; with signed saturation. One vector (mmsize/2 samples) per channel is
  ; handled per iteration.
  ;-----------------------------------------------------------------------------
  INIT_XMM sse2
  cglobal mix_2_to_1_s16p_q8, 3,4,6, src, matrix, len, src1
      ; Build the pmaddwd multipliers: after the zero interleave every dword
      ; of m4 / m5 is the word pair (coefficient, 0).
      mov       matrixq, [matrixq]
      movd           m4, [matrixq]
      SPLATW         m4, m4, 0                  ; word 0 = matrix[0][0]
      movd           m5, [matrixq]
      SPLATW         m5, m5, 1                  ; word 1 = matrix[0][1]
      pxor           m0, m0
      punpcklwd      m4, m0
      punpcklwd      m5, m0
      ; channel pointers; channel 1 is addressed relative to channel 0
      mov         src1q, [srcq+gprsize]
      mov          srcq, [srcq]
      sub         src1q, srcq
      ALIGN 16
  .loop:
      ; channel 0: duplicate each sample into a word pair, then multiply-add
      ; against (coefficient, 0) to get sample * coefficient per dword
      mova           m0, [srcq]
      punpckhwd      m1, m0, m0
      punpcklwd      m0, m0
      pmaddwd        m0, m4
      pmaddwd        m1, m4
      ; channel 1
      mova           m2, [srcq+src1q]
      punpckhwd      m3, m2, m2
      punpcklwd      m2, m2
      pmaddwd        m2, m5
      pmaddwd        m3, m5
      ; accumulate, drop the 8 fractional bits, saturate and store in place
      paddd          m0, m2
      paddd          m1, m3
      psrad          m0, 8
      psrad          m1, 8
      packssdw       m0, m1
      mova  [srcq], m0
      add          srcq, mmsize
      sub          lend, mmsize/2               ; int16 samples per vector
      jg .loop
      REP_RET
  ;-----------------------------------------------------------------------------
  ; void ff_mix_1_to_2_fltp_flt(float **src, float **matrix, int len,
  ;                             int out_ch, int in_ch);
  ;
  ; 1 -> 2 channel mix of planar float samples. The input is read from src[0]
  ; and both outputs are written through the src pointers:
  ;     src[1][i] = src[0][i] * matrix[1][0]
  ;     src[0][i] = src[0][i] * matrix[0][0]
  ;
  ; Only the first three arguments are loaded into registers.
  ; One vector (mmsize/4 floats) is handled per iteration and the loop body
  ; always executes at least once.
  ; NOTE(review): the mova accesses presumably require mmsize-aligned buffers
  ; and a len padded to a multiple of mmsize/4 -- confirm against callers.
  ;-----------------------------------------------------------------------------
  %macro MIX_1_TO_2_FLTP_FLT 0
  cglobal mix_1_to_2_fltp_flt, 3,5,4, src0, matrix0, len, src1, matrix1
      mov         src1q, [src0q+gprsize]        ; src1 = src[1]
      mov         src0q, [src0q]                ; src0 = src[0] (input and output 0)
      sub         src1q, src0q                  ; keep src[1] as an offset from src[0]
      mov      matrix1q, [matrix0q+gprsize]     ; matrix1 = matrix[1]
      mov      matrix0q, [matrix0q]             ; matrix0 = matrix[0]
      VBROADCASTSS   m2, [matrix0q]             ; m2 = splat(matrix[0][0])
      VBROADCASTSS   m3, [matrix1q]             ; m3 = splat(matrix[1][0])
      ALIGN 16
  .loop:
      mova           m0, [src0q]
      mulps          m1, m0, m3                 ; output 1, computed before m0 is overwritten
      mulps          m0, m0, m2                 ; output 0
      mova  [src0q], m0
      mova  [src0q+src1q], m1
      add         src0q, mmsize
      sub          lend, mmsize/4               ; floats per vector
      jg .loop
  ; Clear the upper YMM halves explicitly in the YMM build, as
  ; MIX_3_8_TO_1_2_FLT does, so that returning to SSE code does not depend on
  ; the epilogue macro doing it. Redundant but harmless if the epilogue macro
  ; emits vzeroupper as well.
  %if mmsize == 32
      vzeroupper
      RET
  %else
      REP_RET
  %endif
  %endmacro

  INIT_XMM sse
  MIX_1_TO_2_FLTP_FLT
  INIT_YMM avx
  MIX_1_TO_2_FLTP_FLT
  ;-----------------------------------------------------------------------------
  ; void ff_mix_1_to_2_s16p_flt(int16_t **src, float **matrix, int len,
  ;                             int out_ch, int in_ch);
  ;
  ; 1 -> 2 channel mix of planar int16 samples using float coefficients. The
  ; input is read from src[0]; the results are written to src[0] (scaled by
  ; matrix[0][0]) and src[1] (scaled by matrix[1][0]) after conversion back to
  ; int32 and packing to int16 with signed saturation. One vector (mmsize/2
  ; samples) is handled per iteration.
  ;-----------------------------------------------------------------------------
  %macro MIX_1_TO_2_S16P_FLT 0
  cglobal mix_1_to_2_s16p_flt, 3,5,6, src0, matrix0, len, src1, matrix1
      ; one splatted coefficient per output channel
      mov      matrix1q, [matrix0q+gprsize]
      mov      matrix0q, [matrix0q]
      VBROADCASTSS   m4, [matrix0q]             ; matrix[0][0]
      VBROADCASTSS   m5, [matrix1q]             ; matrix[1][0]
      ; channel pointers; channel 1 is addressed relative to channel 0
      mov         src1q, [src0q+gprsize]
      mov         src0q, [src0q]
      sub         src1q, src0q
      ALIGN 16
  .loop:
      ; load the input vector, widen to int32 (S16_TO_S32_SX is defined
      ; outside this block) and convert both halves to float
      mova           m0, [src0q]
      S16_TO_S32_SX   0, 2
      cvtdq2ps       m0, m0
      cvtdq2ps       m2, m2
      ; output 1 goes to m1/m3 first, while m0/m2 still hold the input
      mulps          m1, m0, m5
      mulps          m3, m2, m5
      ; output 0 replaces the input in m0/m2
      mulps          m0, m0, m4
      mulps          m2, m2, m4
      ; output 0 -> int16, stored over the input channel
      cvtps2dq       m0, m0
      cvtps2dq       m2, m2
      packssdw       m0, m2
      mova  [src0q], m0
      ; output 1 -> int16, stored to the second channel
      cvtps2dq       m1, m1
      cvtps2dq       m3, m3
      packssdw       m1, m3
      mova  [src0q+src1q], m1
      add         src0q, mmsize
      sub          lend, mmsize/2               ; int16 samples per vector
      jg .loop
      REP_RET
  %endmacro

  INIT_XMM sse2
  MIX_1_TO_2_S16P_FLT
  INIT_XMM sse4
  MIX_1_TO_2_S16P_FLT
  INIT_XMM avx
  MIX_1_TO_2_S16P_FLT
  206. ;-----------------------------------------------------------------------------
  207. ; void ff_mix_3_8_to_1_2_fltp/s16p_flt(float/int16_t **src, float **matrix,
  208. ; int len, int out_ch, int in_ch);
  209. ;-----------------------------------------------------------------------------
  210. %macro MIX_3_8_TO_1_2_FLT 3 ; %1 = in channels, %2 = out channels, %3 = s16p or fltp
  ; Emits one function, mix_<%1>_to_<%2>_<%3>_flt, for the instruction set that
  ; is currently selected with INIT_XMM / INIT_YMM. The generated code mixes
  ; in_channels planar input channels into one or two output channels using
  ; float coefficients; the result is stored in place over channel 0 (and over
  ; channel 1 in the two-output case). The first argument register holds the
  ; array of channel pointers and the second the array of matrix row pointers;
  ; both are dereferenced below.
  211. ; define some names to make the code clearer
  212. %assign in_channels %1
  213. %assign out_channels %2
  ; stereo is 0 for one output channel and 1 for two
  214. %assign stereo out_channels - 1
  215. %ifidn %3, s16p
  216. %assign is_s16 1
  217. %else
  218. %assign is_s16 0
  219. %endif
  220. ; determine how many matrix elements must go on the stack vs. mmregs
  221. %assign matrix_elements in_channels * out_channels
  ; needed_mmregs = vector registers set aside for sample data and scratch in
  ; the loop below (m0-m6 stereo s16p, m0-m4 mono s16p, m0-m3 stereo fltp,
  ; m0-m2 mono fltp). Coefficient registers are numbered from needed_mmregs up.
  222. %if is_s16
  223. %if stereo
  224. %assign needed_mmregs 7
  225. %else
  226. %assign needed_mmregs 5
  227. %endif
  228. %else
  229. %if stereo
  230. %assign needed_mmregs 4
  231. %else
  232. %assign needed_mmregs 3
  233. %endif
  234. %endif
  ; matrix_elements_mm = min(registers left over, matrix_elements); whatever
  ; does not fit is counted in matrix_elements_stack and kept in stack slots
  ; of mmsize bytes each
  235. %assign matrix_elements_mm num_mmregs - needed_mmregs
  236. %if matrix_elements < matrix_elements_mm
  237. %assign matrix_elements_mm matrix_elements
  238. %endif
  239. %if matrix_elements_mm < matrix_elements
  240. %assign matrix_elements_stack matrix_elements - matrix_elements_mm
  241. %else
  242. %assign matrix_elements_stack 0
  243. %endif
  ; 3 arguments are loaded, in_channels+2 general-purpose registers are
  ; requested, and the vector register count covers data + coefficient regs
  244. cglobal mix_%1_to_%2_%3_flt, 3,in_channels+2,needed_mmregs+matrix_elements_mm, src0, src1, len, src2, src3, src4, src5, src6, src7
  245. ; get aligned stack space if needed
  246. %if matrix_elements_stack > 0
  247. %if mmsize == 32
  ; YMM build: keep the original rsp in general-purpose register number
  ; in_channels+1 (the last one requested above), then align rsp down to
  ; mmsize by hand and reserve the spill slots
  248. %assign bkpreg %1 + 1
  249. %define bkpq r %+ bkpreg %+ q
  250. mov bkpq, rsp
  251. and rsp, ~(mmsize-1)
  252. sub rsp, matrix_elements_stack * mmsize
  253. %else
  ; XMM build: reserve the spill slots plus padding derived from stack_offset.
  ; NOTE(review): the padding presumably leaves rsp mmsize-aligned given the
  ; ABI's alignment at function entry -- confirm against x86inc.
  254. %assign pad matrix_elements_stack * mmsize + (mmsize - gprsize) - (stack_offset & (mmsize - gprsize))
  255. SUB rsp, pad
  256. %endif
  257. %endif
  258. ; load matrix pointers
  ; r1 (the matrix argument) and r3 serve as row pointers here; they are the
  ; registers cglobal named src1 and src2 and only receive channel pointers
  ; after all coefficients have been loaded
  259. %define matrix0q r1q
  260. %define matrix1q r3q
  261. %if stereo
  262. mov matrix1q, [matrix0q+gprsize]
  263. %endif
  264. mov matrix0q, [matrix0q]
  265. ; define matrix coeff names
  ; mx_<o>_<c> expands to the location (vector register or stack slot) of the
  ; coefficient for output o and input channel c; mx_stack_<o>_<c> is 1 when
  ; that location is a stack slot. %%j is the next free coefficient register.
  266. %assign %%i 0
  267. %assign %%j needed_mmregs
  268. %rep in_channels
  269. %if %%i >= matrix_elements_mm
  270. CAT_XDEFINE mx_stack_0_, %%i, 1
  271. CAT_XDEFINE mx_0_, %%i, [rsp+(%%i-matrix_elements_mm)*mmsize]
  272. %else
  273. CAT_XDEFINE mx_stack_0_, %%i, 0
  274. CAT_XDEFINE mx_0_, %%i, m %+ %%j
  275. %assign %%j %%j+1
  276. %endif
  277. %assign %%i %%i+1
  278. %endrep
  ; second output row: its elements are numbered after the first row's, so
  ; the register/stack decision uses in_channels + %%i
  279. %if stereo
  280. %assign %%i 0
  281. %rep in_channels
  282. %if in_channels + %%i >= matrix_elements_mm
  283. CAT_XDEFINE mx_stack_1_, %%i, 1
  284. CAT_XDEFINE mx_1_, %%i, [rsp+(in_channels+%%i-matrix_elements_mm)*mmsize]
  285. %else
  286. CAT_XDEFINE mx_stack_1_, %%i, 0
  287. CAT_XDEFINE mx_1_, %%i, m %+ %%j
  288. %assign %%j %%j+1
  289. %endif
  290. %assign %%i %%i+1
  291. %endrep
  292. %endif
  293. ; load/splat matrix coeffs
  ; register-resident coefficients are broadcast directly; stack-resident
  ; ones are broadcast into m0 and then stored to their slot
  294. %assign %%i 0
  295. %rep in_channels
  296. %if mx_stack_0_ %+ %%i
  297. VBROADCASTSS m0, [matrix0q+4*%%i]
  298. mova mx_0_ %+ %%i, m0
  299. %else
  300. VBROADCASTSS mx_0_ %+ %%i, [matrix0q+4*%%i]
  301. %endif
  302. %if stereo
  303. %if mx_stack_1_ %+ %%i
  304. VBROADCASTSS m0, [matrix1q+4*%%i]
  305. mova mx_1_ %+ %%i, m0
  306. %else
  307. VBROADCASTSS mx_1_ %+ %%i, [matrix1q+4*%%i]
  308. %endif
  309. %endif
  310. %assign %%i %%i+1
  311. %endrep
  312. ; load channel pointers to registers as offsets from the first channel pointer
  ; len is sign-extended to 64 bits on x86-64 and converted to a byte count:
  ; shifted left by 2 for float samples, by 1 for int16 samples
  313. %if ARCH_X86_64
  314. movsxd lenq, r2d
  315. %endif
  316. shl lenq, 2-is_s16
  ; every channel pointer is moved to the end of its buffer; lenq is negated
  ; afterwards so that [ptr+lenq] walks forward while lenq counts up to zero
  317. %assign %%i 1
  318. %rep (in_channels - 1)
  319. %if ARCH_X86_32 && in_channels >= 7 && %%i >= 5
  ; x86-32 with 7-8 channels: compute the pointer in src5q and write it back
  ; to the memory form of that argument (src<i>m)
  320. mov src5q, [src0q+%%i*gprsize]
  321. add src5q, lenq
  322. mov src %+ %%i %+ m, src5q
  323. %else
  324. mov src %+ %%i %+ q, [src0q+%%i*gprsize]
  325. add src %+ %%i %+ q, lenq
  326. %endif
  327. %assign %%i %%i+1
  328. %endrep
  329. mov src0q, [src0q]
  330. add src0q, lenq
  331. neg lenq
  332. .loop:
  333. ; for x86-32 with 7-8 channels we do not have enough gp registers for all src
  334. ; pointers, so we have to load some of them from the stack each time
  ; (this is a %define, so %%i is substituted where the name is used, i.e.
  ; with the channel index current at that point in the %rep loops below)
  335. %define copy_src_from_stack ARCH_X86_32 && in_channels >= 7 && %%i >= 5
  336. %if is_s16
  337. ; mix with s16p input
  ; channel 0: load int16 samples, widen to int32 in m0/m1 (S16_TO_S32_SX is
  ; defined elsewhere), convert to float and scale; m0/m1 accumulate output 0
  ; and, when stereo, m2/m3 accumulate output 1
  338. mova m0, [src0q+lenq]
  339. S16_TO_S32_SX 0, 1
  340. cvtdq2ps m0, m0
  341. cvtdq2ps m1, m1
  342. %if stereo
  343. mulps m2, m0, mx_1_0
  344. mulps m3, m1, mx_1_0
  345. %endif
  346. mulps m0, m0, mx_0_0
  347. mulps m1, m1, mx_0_0
  ; remaining channels are accumulated with fmaddps.
  ; NOTE(review): fmaddps is a 5-operand macro defined elsewhere; presumably
  ; dst = a*b + c with the last operand usable as scratch -- confirm.
  348. %assign %%i 1
  349. %rep (in_channels - 1)
  350. %if copy_src_from_stack
  351. %define src_ptr src5q
  352. %else
  353. %define src_ptr src %+ %%i %+ q
  354. %endif
  355. %if stereo
  356. %if copy_src_from_stack
  357. mov src_ptr, src %+ %%i %+ m
  358. %endif
  359. mova m4, [src_ptr+lenq]
  360. S16_TO_S32_SX 4, 5
  361. cvtdq2ps m4, m4
  362. cvtdq2ps m5, m5
  ; output 1 first (scratch m6), then output 0, where the no longer needed
  ; sample registers m4/m5 double as scratch
  363. fmaddps m2, m4, mx_1_ %+ %%i, m2, m6
  364. fmaddps m3, m5, mx_1_ %+ %%i, m3, m6
  365. fmaddps m0, m4, mx_0_ %+ %%i, m0, m4
  366. fmaddps m1, m5, mx_0_ %+ %%i, m1, m5
  367. %else
  368. %if copy_src_from_stack
  369. mov src_ptr, src %+ %%i %+ m
  370. %endif
  371. mova m2, [src_ptr+lenq]
  372. S16_TO_S32_SX 2, 3
  373. cvtdq2ps m2, m2
  374. cvtdq2ps m3, m3
  375. fmaddps m0, m2, mx_0_ %+ %%i, m0, m4
  376. fmaddps m1, m3, mx_0_ %+ %%i, m1, m4
  377. %endif
  378. %assign %%i %%i+1
  379. %endrep
  ; convert the float sums back to int32, pack to int16 with signed
  ; saturation and store in place (output 1 over channel 1, output 0 over
  ; channel 0)
  380. %if stereo
  381. cvtps2dq m2, m2
  382. cvtps2dq m3, m3
  383. packssdw m2, m3
  384. mova [src1q+lenq], m2
  385. %endif
  386. cvtps2dq m0, m0
  387. cvtps2dq m1, m1
  388. packssdw m0, m1
  389. mova [src0q+lenq], m0
  390. %else
  391. ; mix with fltp input
  ; channel 0 is loaded into m0 only when it is needed twice (stereo) or when
  ; its coefficient is a stack operand; otherwise the sample vector is used
  ; directly as the memory operand of mulps. m0 accumulates output 0 and m1
  ; output 1.
  392. %if stereo || mx_stack_0_0
  393. mova m0, [src0q+lenq]
  394. %endif
  395. %if stereo
  396. mulps m1, m0, mx_1_0
  397. %endif
  398. %if stereo || mx_stack_0_0
  399. mulps m0, m0, mx_0_0
  400. %else
  401. mulps m0, [src0q+lenq], mx_0_0
  402. %endif
  403. %assign %%i 1
  404. %rep (in_channels - 1)
  405. %if copy_src_from_stack
  406. %define src_ptr src5q
  407. mov src_ptr, src %+ %%i %+ m
  408. %else
  409. %define src_ptr src %+ %%i %+ q
  410. %endif
  411. ; avoid extra load for mono if matrix is in a mm register
  412. %if stereo || mx_stack_0_ %+ %%i
  413. mova m2, [src_ptr+lenq]
  414. %endif
  415. %if stereo
  416. fmaddps m1, m2, mx_1_ %+ %%i, m1, m3
  417. %endif
  418. %if stereo || mx_stack_0_ %+ %%i
  419. fmaddps m0, m2, mx_0_ %+ %%i, m0, m2
  420. %else
  421. fmaddps m0, mx_0_ %+ %%i, [src_ptr+lenq], m0, m1
  422. %endif
  423. %assign %%i %%i+1
  424. %endrep
  425. mova [src0q+lenq], m0
  426. %if stereo
  427. mova [src1q+lenq], m1
  428. %endif
  429. %endif
  ; advance by one vector; lenq stays negative while samples remain
  430. add lenq, mmsize
  431. jl .loop
  432. ; restore stack pointer
  433. %if matrix_elements_stack > 0
  434. %if mmsize == 32
  435. mov rsp, bkpq
  436. %else
  437. ADD rsp, pad
  438. %endif
  439. %endif
  440. ; zero ymm high halves
  441. %if mmsize == 32
  442. vzeroupper
  443. %endif
  444. RET
  445. %endmacro
  ; Instantiates MIX_3_8_TO_1_2_FLT for every input channel count from 3 to 8,
  ; for mono and stereo output, for both sample formats, and for each
  ; supported instruction set. The FMA4 variants are emitted only when
  ; HAVE_FMA4_EXTERNAL is set.
  %macro MIX_3_8_TO_1_2_FLT_FUNCS 0
  %assign %%nch 3
  %rep 6
      ; baseline float versions
      INIT_XMM sse
      MIX_3_8_TO_1_2_FLT %%nch, 1, fltp
      MIX_3_8_TO_1_2_FLT %%nch, 2, fltp

      ; int16 versions need integer SIMD
      INIT_XMM sse2
      MIX_3_8_TO_1_2_FLT %%nch, 1, s16p
      MIX_3_8_TO_1_2_FLT %%nch, 2, s16p
      INIT_XMM sse4
      MIX_3_8_TO_1_2_FLT %%nch, 1, s16p
      MIX_3_8_TO_1_2_FLT %%nch, 2, s16p

      ; AVX float versions: full-width YMM except on x86-32 with 6 or more
      ; channels, where stack alignment issues force the XMM form
      %if ARCH_X86_64 || %%nch < 6
          INIT_YMM avx
      %else
          INIT_XMM avx
      %endif
      MIX_3_8_TO_1_2_FLT %%nch, 1, fltp
      MIX_3_8_TO_1_2_FLT %%nch, 2, fltp

      ; AVX int16 versions are always XMM
      INIT_XMM avx
      MIX_3_8_TO_1_2_FLT %%nch, 1, s16p
      MIX_3_8_TO_1_2_FLT %%nch, 2, s16p

      ; FMA4 versions follow the same register-width rules as AVX
      %if HAVE_FMA4_EXTERNAL
          %if ARCH_X86_64 || %%nch < 6
              INIT_YMM fma4
          %else
              INIT_XMM fma4
          %endif
          MIX_3_8_TO_1_2_FLT %%nch, 1, fltp
          MIX_3_8_TO_1_2_FLT %%nch, 2, fltp
          INIT_XMM fma4
          MIX_3_8_TO_1_2_FLT %%nch, 1, s16p
          MIX_3_8_TO_1_2_FLT %%nch, 2, s16p
      %endif

      %assign %%nch %%nch+1
  %endrep
  %endmacro

  MIX_3_8_TO_1_2_FLT_FUNCS