;******************************************************************************
;* x86 optimized channel mixing
;* Copyright (c) 2012 Justin Ruggles <justin.ruggles@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"
%include "util.asm"

SECTION_TEXT
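
; All functions mix in place: the result overwrites src[0] (and src[1] when
; there are two output channels).  len is the number of samples per channel;
; the out_ch and in_ch arguments are not referenced by these implementations.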

;-----------------------------------------------------------------------------
; void ff_mix_2_to_1_fltp_flt(float **src, float **matrix, int len,
;                             int out_ch, int in_ch);
;-----------------------------------------------------------------------------
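; In-place 2-to-1 downmix of planar float input:
;     src[0][i] = matrix[0][0] * src[0][i] + matrix[0][1] * src[1][i]
; Two vectors (2*mmsize bytes per channel) are processed per loop iteration.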
%macro MIX_2_TO_1_FLTP_FLT 0
cglobal mix_2_to_1_fltp_flt, 3,4,6, src, matrix, len, src1
    mov         src1q, [srcq+gprsize]
    mov          srcq, [srcq]
    sub         src1q, srcq
    mov       matrixq, [matrixq]
    VBROADCASTSS   m4, [matrixq]
    VBROADCASTSS   m5, [matrixq+4]
    ALIGN 16
.loop:
    mulps          m0, m4, [srcq]
    mulps          m1, m5, [srcq+src1q]
    mulps          m2, m4, [srcq+mmsize]
    mulps          m3, m5, [srcq+src1q+mmsize]
    addps          m0, m0, m1
    addps          m2, m2, m3
    mova        [srcq], m0
    mova [srcq+mmsize], m2
    add          srcq, mmsize*2
    sub          lend, mmsize*2/4
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse
MIX_2_TO_1_FLTP_FLT
INIT_YMM avx
MIX_2_TO_1_FLTP_FLT

;-----------------------------------------------------------------------------
; void ff_mix_2_to_1_s16p_flt(int16_t **src, float **matrix, int len,
;                             int out_ch, int in_ch);
;-----------------------------------------------------------------------------
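; Same 2-to-1 downmix with float coefficients, but for planar s16 input:
; samples are sign-extended to 32 bits, converted to float, scaled and
; summed, then converted back and packed to s16 with signed saturation.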
%macro MIX_2_TO_1_S16P_FLT 0
cglobal mix_2_to_1_s16p_flt, 3,4,6, src, matrix, len, src1
    mov         src1q, [srcq+gprsize]
    mov          srcq, [srcq]
    sub         src1q, srcq
    mov       matrixq, [matrixq]
    VBROADCASTSS   m4, [matrixq]
    VBROADCASTSS   m5, [matrixq+4]
    ALIGN 16
.loop:
    mova           m0, [srcq]
    mova           m2, [srcq+src1q]
    S16_TO_S32_SX   0, 1
    S16_TO_S32_SX   2, 3
    cvtdq2ps       m0, m0
    cvtdq2ps       m1, m1
    cvtdq2ps       m2, m2
    cvtdq2ps       m3, m3
    mulps          m0, m4
    mulps          m1, m4
    mulps          m2, m5
    mulps          m3, m5
    addps          m0, m2
    addps          m1, m3
    cvtps2dq       m0, m0
    cvtps2dq       m1, m1
    packssdw       m0, m1
    mova        [srcq], m0
    add          srcq, mmsize
    sub          lend, mmsize/2
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
MIX_2_TO_1_S16P_FLT
INIT_XMM sse4
MIX_2_TO_1_S16P_FLT

;-----------------------------------------------------------------------------
; void ff_mix_2_to_1_s16p_q8(int16_t **src, int16_t **matrix, int len,
;                            int out_ch, int in_ch);
;-----------------------------------------------------------------------------
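; Pure fixed-point 2-to-1 downmix: the matrix coefficients are Q8 (8.8
; fixed-point) int16 values.  Each sample is paired with itself, multiplied
; against a (coeff, 0) word pair with pmaddwd, the two channel products are
; summed in 32 bits, shifted right by 8 and packed back to s16 with saturation.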
INIT_XMM sse2
cglobal mix_2_to_1_s16p_q8, 3,4,6, src, matrix, len, src1
    mov         src1q, [srcq+gprsize]
    mov          srcq, [srcq]
    sub         src1q, srcq
    mov       matrixq, [matrixq]
    movd           m4, [matrixq]
    movd           m5, [matrixq]
    SPLATW         m4, m4, 0
    SPLATW         m5, m5, 1
    pxor           m0, m0
    punpcklwd      m4, m0
    punpcklwd      m5, m0
    ALIGN 16
.loop:
    mova           m0, [srcq]
    mova           m2, [srcq+src1q]
    punpckhwd      m1, m0, m0
    punpcklwd      m0, m0
    punpckhwd      m3, m2, m2
    punpcklwd      m2, m2
    pmaddwd        m0, m4
    pmaddwd        m1, m4
    pmaddwd        m2, m5
    pmaddwd        m3, m5
    paddd          m0, m2
    paddd          m1, m3
    psrad          m0, 8
    psrad          m1, 8
    packssdw       m0, m1
    mova        [srcq], m0
    add          srcq, mmsize
    sub          lend, mmsize/2
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void ff_mix_1_to_2_fltp_flt(float **src, float **matrix, int len,
;                             int out_ch, int in_ch);
;-----------------------------------------------------------------------------
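; Mono-to-stereo upmix of planar float input, in place:
;     src[1][i] = matrix[1][0] * src[0][i]
;     src[0][i] = matrix[0][0] * src[0][i]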
%macro MIX_1_TO_2_FLTP_FLT 0
cglobal mix_1_to_2_fltp_flt, 3,5,4, src0, matrix0, len, src1, matrix1
    mov         src1q, [src0q+gprsize]
    mov         src0q, [src0q]
    sub         src1q, src0q
    mov      matrix1q, [matrix0q+gprsize]
    mov      matrix0q, [matrix0q]
    VBROADCASTSS   m2, [matrix0q]
    VBROADCASTSS   m3, [matrix1q]
    ALIGN 16
.loop:
    mova           m0, [src0q]
    mulps          m1, m0, m3
    mulps          m0, m0, m2
    mova        [src0q], m0
    mova  [src0q+src1q], m1
    add         src0q, mmsize
    sub          lend, mmsize/4
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse
MIX_1_TO_2_FLTP_FLT
INIT_YMM avx
MIX_1_TO_2_FLTP_FLT

;-----------------------------------------------------------------------------
; void ff_mix_1_to_2_s16p_flt(int16_t **src, float **matrix, int len,
;                             int out_ch, int in_ch);
;-----------------------------------------------------------------------------
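; Mono-to-stereo upmix for planar s16 input with float coefficients; the
; samples take the same widen -> float -> scale -> saturate path as
; mix_2_to_1_s16p_flt above.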
%macro MIX_1_TO_2_S16P_FLT 0
cglobal mix_1_to_2_s16p_flt, 3,5,6, src0, matrix0, len, src1, matrix1
    mov         src1q, [src0q+gprsize]
    mov         src0q, [src0q]
    sub         src1q, src0q
    mov      matrix1q, [matrix0q+gprsize]
    mov      matrix0q, [matrix0q]
    VBROADCASTSS   m4, [matrix0q]
    VBROADCASTSS   m5, [matrix1q]
    ALIGN 16
.loop:
    mova           m0, [src0q]
    S16_TO_S32_SX   0, 2
    cvtdq2ps       m0, m0
    cvtdq2ps       m2, m2
    mulps          m1, m0, m5
    mulps          m0, m0, m4
    mulps          m3, m2, m5
    mulps          m2, m2, m4
    cvtps2dq       m0, m0
    cvtps2dq       m1, m1
    cvtps2dq       m2, m2
    cvtps2dq       m3, m3
    packssdw       m0, m2
    packssdw       m1, m3
    mova        [src0q], m0
    mova  [src0q+src1q], m1
    add         src0q, mmsize
    sub          lend, mmsize/2
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
MIX_1_TO_2_S16P_FLT
INIT_XMM sse4
MIX_1_TO_2_S16P_FLT
INIT_XMM avx
MIX_1_TO_2_S16P_FLT

;-----------------------------------------------------------------------------
; void ff_mix_3_8_to_1_2_fltp/s16p_flt(float/int16_t **src, float **matrix,
;                                      int len, int out_ch, int in_ch);
;-----------------------------------------------------------------------------
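; Generic in-place mix of 3-8 input channels into 1 or 2 output channels for
; planar float or planar s16 input with float coefficients:
;     out[o][i] = sum over c of matrix[o][c] * src[c][i]
; As many splatted matrix coefficients as possible are kept in the mm
; registers the inner loop does not need; the rest are spilled to an aligned
; stack area.  fmaddps is a multiply-accumulate helper macro from the
; included headers: a true FMA where the instruction set provides one,
; otherwise a mulps+addps pair using its last operand as a temporary.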
%macro MIX_3_8_TO_1_2_FLT 3 ; %1 = in channels, %2 = out channels, %3 = s16p or fltp
; define some names to make the code clearer
%assign  in_channels %1
%assign out_channels %2
%assign stereo out_channels - 1
%ifidn %3, s16p
    %assign is_s16 1
%else
    %assign is_s16 0
%endif
; determine how many matrix elements must go on the stack vs. mmregs
%assign matrix_elements in_channels * out_channels
%if is_s16
    %if stereo
        %assign needed_mmregs 7
    %else
        %assign needed_mmregs 5
    %endif
%else
    %if stereo
        %assign needed_mmregs 4
    %else
        %assign needed_mmregs 3
    %endif
%endif
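; needed_mmregs counts the scratch registers the inner loop uses (m0-m6 for
; stereo s16p, m0-m4 for mono s16p, m0-m3 for stereo fltp, m0-m2 for mono
; fltp); every remaining register holds a splatted matrix coefficient, and
; coefficients that do not fit go on the stack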
%assign matrix_elements_mm num_mmregs - needed_mmregs
%if matrix_elements < matrix_elements_mm
    %assign matrix_elements_mm matrix_elements
%endif
%if matrix_elements_mm < matrix_elements
    %assign matrix_elements_stack matrix_elements - matrix_elements_mm
%else
    %assign matrix_elements_stack 0
%endif
cglobal mix_%1_to_%2_%3_flt, 3,in_channels+2,needed_mmregs+matrix_elements_mm, src0, src1, len, src2, src3, src4, src5, src6, src7
; get aligned stack space if needed
%if matrix_elements_stack > 0
    %if mmsize == 32
        %assign bkpreg %1 + 1
        %define bkpq r %+ bkpreg %+ q
    mov          bkpq, rsp
    and           rsp, ~(mmsize-1)
    sub           rsp, matrix_elements_stack * mmsize
    %else
        %assign matrix_stack_size matrix_elements_stack * mmsize
        %assign pad matrix_stack_size + (mmsize - gprsize) - (stack_offset & (mmsize - gprsize))
        ; on x86-32 for 7 and 8 channels we need more stack space for src pointers
        %if ARCH_X86_32 && in_channels >= 7
            %assign pad pad + 0x10
            %define src5m [rsp+matrix_stack_size+0]
            %define src6m [rsp+matrix_stack_size+4]
            %define src7m [rsp+matrix_stack_size+8]
        %endif
    SUB           rsp, pad
    %endif
%endif
; load matrix pointers
%define matrix0q r1q
%define matrix1q r3q
%if stereo
    mov      matrix1q, [matrix0q+gprsize]
%endif
    mov      matrix0q, [matrix0q]
; define matrix coeff names
%assign %%i 0
%assign %%j needed_mmregs
%rep in_channels
    %if %%i >= matrix_elements_mm
        CAT_XDEFINE mx_stack_0_, %%i, 1
        CAT_XDEFINE mx_0_, %%i, [rsp+(%%i-matrix_elements_mm)*mmsize]
    %else
        CAT_XDEFINE mx_stack_0_, %%i, 0
        CAT_XDEFINE mx_0_, %%i, m %+ %%j
        %assign %%j %%j+1
    %endif
    %assign %%i %%i+1
%endrep
%if stereo
    %assign %%i 0
    %rep in_channels
        %if in_channels + %%i >= matrix_elements_mm
            CAT_XDEFINE mx_stack_1_, %%i, 1
            CAT_XDEFINE mx_1_, %%i, [rsp+(in_channels+%%i-matrix_elements_mm)*mmsize]
        %else
            CAT_XDEFINE mx_stack_1_, %%i, 0
            CAT_XDEFINE mx_1_, %%i, m %+ %%j
            %assign %%j %%j+1
        %endif
        %assign %%i %%i+1
    %endrep
%endif
; load/splat matrix coeffs
%assign %%i 0
%rep in_channels
    %if mx_stack_0_ %+ %%i
    VBROADCASTSS   m0, [matrix0q+4*%%i]
    mova  mx_0_ %+ %%i, m0
    %else
    VBROADCASTSS mx_0_ %+ %%i, [matrix0q+4*%%i]
    %endif
    %if stereo
        %if mx_stack_1_ %+ %%i
    VBROADCASTSS   m0, [matrix1q+4*%%i]
    mova  mx_1_ %+ %%i, m0
        %else
    VBROADCASTSS mx_1_ %+ %%i, [matrix1q+4*%%i]
        %endif
    %endif
    %assign %%i %%i+1
%endrep
; load channel pointers to registers as offsets from the first channel pointer
%if ARCH_X86_64
    movsxd       lenq, r2d
%endif
    shl          lenq, 2-is_s16
%assign %%i 1
%rep (in_channels - 1)
    %if ARCH_X86_32 && in_channels >= 7 && %%i >= 5
    mov         src5q, [src0q+%%i*gprsize]
    add         src5q, lenq
    mov  src %+ %%i %+ m, src5q
    %else
    mov  src %+ %%i %+ q, [src0q+%%i*gprsize]
    add  src %+ %%i %+ q, lenq
    %endif
    %assign %%i %%i+1
%endrep
    mov         src0q, [src0q]
    add         src0q, lenq
    neg          lenq
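; each channel pointer now points to the end of its plane and lenq holds the
; negative byte count, so [srcNq+lenq] walks forward through the buffers and
; the loop terminates when lenq reaches zero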
.loop:
; for x86-32 with 7-8 channels we do not have enough gp registers for all src
; pointers, so we have to load some of them from the stack each time
%define copy_src_from_stack ARCH_X86_32 && in_channels >= 7 && %%i >= 5
%if is_s16
    ; mix with s16p input
    mova           m0, [src0q+lenq]
    S16_TO_S32_SX   0, 1
    cvtdq2ps       m0, m0
    cvtdq2ps       m1, m1
%if stereo
    mulps          m2, m0, mx_1_0
    mulps          m3, m1, mx_1_0
%endif
    mulps          m0, m0, mx_0_0
    mulps          m1, m1, mx_0_0
%assign %%i 1
%rep (in_channels - 1)
    %if copy_src_from_stack
        %define src_ptr src5q
    %else
        %define src_ptr src %+ %%i %+ q
    %endif
    %if stereo
        %if copy_src_from_stack
    mov       src_ptr, src %+ %%i %+ m
        %endif
    mova           m4, [src_ptr+lenq]
    S16_TO_S32_SX   4, 5
    cvtdq2ps       m4, m4
    cvtdq2ps       m5, m5
    fmaddps        m2, m4, mx_1_ %+ %%i, m2, m6
    fmaddps        m3, m5, mx_1_ %+ %%i, m3, m6
    fmaddps        m0, m4, mx_0_ %+ %%i, m0, m4
    fmaddps        m1, m5, mx_0_ %+ %%i, m1, m5
    %else
        %if copy_src_from_stack
    mov       src_ptr, src %+ %%i %+ m
        %endif
    mova           m2, [src_ptr+lenq]
    S16_TO_S32_SX   2, 3
    cvtdq2ps       m2, m2
    cvtdq2ps       m3, m3
    fmaddps        m0, m2, mx_0_ %+ %%i, m0, m4
    fmaddps        m1, m3, mx_0_ %+ %%i, m1, m4
    %endif
    %assign %%i %%i+1
%endrep
%if stereo
    cvtps2dq       m2, m2
    cvtps2dq       m3, m3
    packssdw       m2, m3
    mova [src1q+lenq], m2
%endif
    cvtps2dq       m0, m0
    cvtps2dq       m1, m1
    packssdw       m0, m1
    mova [src0q+lenq], m0
%else
    ; mix with fltp input
%if stereo || mx_stack_0_0
    mova           m0, [src0q+lenq]
%endif
%if stereo
    mulps          m1, m0, mx_1_0
%endif
%if stereo || mx_stack_0_0
    mulps          m0, m0, mx_0_0
%else
    mulps          m0, [src0q+lenq], mx_0_0
%endif
%assign %%i 1
%rep (in_channels - 1)
    %if copy_src_from_stack
        %define src_ptr src5q
    mov       src_ptr, src %+ %%i %+ m
    %else
        %define src_ptr src %+ %%i %+ q
    %endif
    ; avoid extra load for mono if matrix is in a mm register
%if stereo || mx_stack_0_ %+ %%i
    mova           m2, [src_ptr+lenq]
%endif
%if stereo
    fmaddps        m1, m2, mx_1_ %+ %%i, m1, m3
%endif
%if stereo || mx_stack_0_ %+ %%i
    fmaddps        m0, m2, mx_0_ %+ %%i, m0, m2
%else
    fmaddps        m0, mx_0_ %+ %%i, [src_ptr+lenq], m0, m1
%endif
    %assign %%i %%i+1
%endrep
    mova [src0q+lenq], m0
%if stereo
    mova [src1q+lenq], m1
%endif
%endif
    add          lenq, mmsize
    jl .loop
; restore stack pointer
%if matrix_elements_stack > 0
    %if mmsize == 32
    mov           rsp, bkpq
    %else
    ADD           rsp, pad
    %endif
%endif
; zero ymm high halves
%if mmsize == 32
    vzeroupper
%endif
    RET
%endmacro
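
; instantiate ff_mix_<in>_to_<out>_{fltp,s16p}_flt for 3-8 input channels and
; 1-2 output channels, once per supported instruction set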
%macro MIX_3_8_TO_1_2_FLT_FUNCS 0
%assign %%i 3
%rep 6
    INIT_XMM sse
    MIX_3_8_TO_1_2_FLT %%i, 1, fltp
    MIX_3_8_TO_1_2_FLT %%i, 2, fltp
    INIT_XMM sse2
    MIX_3_8_TO_1_2_FLT %%i, 1, s16p
    MIX_3_8_TO_1_2_FLT %%i, 2, s16p
    INIT_XMM sse4
    MIX_3_8_TO_1_2_FLT %%i, 1, s16p
    MIX_3_8_TO_1_2_FLT %%i, 2, s16p
    ; do not use ymm AVX or FMA4 in x86-32 for 6 or more channels due to stack alignment issues
    %if ARCH_X86_64 || %%i < 6
        INIT_YMM avx
    %else
        INIT_XMM avx
    %endif
    MIX_3_8_TO_1_2_FLT %%i, 1, fltp
    MIX_3_8_TO_1_2_FLT %%i, 2, fltp
    INIT_XMM avx
    MIX_3_8_TO_1_2_FLT %%i, 1, s16p
    MIX_3_8_TO_1_2_FLT %%i, 2, s16p
    %if HAVE_FMA4_EXTERNAL
        %if ARCH_X86_64 || %%i < 6
            INIT_YMM fma4
        %else
            INIT_XMM fma4
        %endif
        MIX_3_8_TO_1_2_FLT %%i, 1, fltp
        MIX_3_8_TO_1_2_FLT %%i, 2, fltp
        INIT_XMM fma4
        MIX_3_8_TO_1_2_FLT %%i, 1, s16p
        MIX_3_8_TO_1_2_FLT %%i, 2, s16p
    %endif
    %assign %%i %%i+1
%endrep
%endmacro

MIX_3_8_TO_1_2_FLT_FUNCS