You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

239 lines
6.7KB

  1. ;******************************************************************************
  2. ;* x86 optimized channel mixing
  3. ;* Copyright (c) 2012 Justin Ruggles <justin.ruggles@gmail.com>
  4. ;*
  5. ;* This file is part of Libav.
  6. ;*
  7. ;* Libav is free software; you can redistribute it and/or
  8. ;* modify it under the terms of the GNU Lesser General Public
  9. ;* License as published by the Free Software Foundation; either
  10. ;* version 2.1 of the License, or (at your option) any later version.
  11. ;*
  12. ;* Libav is distributed in the hope that it will be useful,
  13. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. ;* Lesser General Public License for more details.
  16. ;*
  17. ;* You should have received a copy of the GNU Lesser General Public
  18. ;* License along with Libav; if not, write to the Free Software
  19. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. ;******************************************************************************
  21. %include "x86inc.asm"
  22. %include "x86util.asm"
  23. %include "util.asm"
  24. SECTION_TEXT
  ;-----------------------------------------------------------------------------
  ; void ff_mix_2_to_1_fltp_flt(float **src, float **matrix, int len,
  ;                             int out_ch, int in_ch);
  ;
  ; Planar float, two channels mixed down to one, result written in place:
  ;     src[0][i] = src[0][i] * matrix[0][0] + src[1][i] * matrix[0][1]
  ;
  ; Only src, matrix and len are loaded (3 args); out_ch/in_ch are not read.
  ; Each pass consumes 2*mmsize bytes (mmsize/2 floats) from both channels
  ; through mova/memory-operand accesses, and the body runs at least once
  ; because len is only tested at the bottom of the loop.
  ; NOTE(review): presumably the caller guarantees suitably aligned buffers
  ; and a len that is padded to the per-pass sample count -- confirm.
  ;-----------------------------------------------------------------------------
  %macro MIX_2_TO_1_FLTP_FLT 0
  cglobal mix_2_to_1_fltp_flt, 3,4,6, src, matrix, len, off1
      mov        matrixq, [matrixq]               ; matrixq = matrix[0]
      mov          off1q, [srcq+gprsize]          ; off1q   = src[1]
      mov           srcq, [srcq]                  ; srcq    = src[0]
      VBROADCASTSS    m4, [matrixq]               ; m4 = matrix[0][0] in all lanes
      VBROADCASTSS    m5, [matrixq+4]             ; m5 = matrix[0][1] in all lanes
      sub          off1q, srcq                    ; off1q = src[1] - src[0] (bytes)
      ALIGN 16
  .loop:
      ; scale two vectors of channel 0, then the matching vectors of channel 1
      mulps           m0, m4, [srcq]
      mulps           m2, m4, [srcq+mmsize]
      mulps           m1, m5, [srcq+off1q]
      mulps           m3, m5, [srcq+off1q+mmsize]
      addps           m0, m0, m1
      addps           m2, m2, m3
      ; all loads are done; overwrite channel 0 with the mix
      mova        [srcq], m0
      mova [srcq+mmsize], m2
      add           srcq, 2*mmsize
      sub           lend, 2*mmsize/4              ; floats handled this pass
      jg .loop
  %if mmsize != 32
      REP_RET
  %else
      vzeroupper                                  ; ymm registers were used
      RET
  %endif
  %endmacro

  INIT_XMM sse
  MIX_2_TO_1_FLTP_FLT
  %if HAVE_AVX
  INIT_YMM avx
  MIX_2_TO_1_FLTP_FLT
  %endif
  ;-----------------------------------------------------------------------------
  ; void ff_mix_2_to_1_s16p_flt(int16_t **src, float **matrix, int len,
  ;                             int out_ch, int in_ch);
  ;
  ; Planar int16, two channels mixed down to one with float coefficients.
  ; Samples are widened to 32 bits, converted to float, scaled by
  ; matrix[0][0] (channel 0) and matrix[0][1] (channel 1), summed, converted
  ; back with cvtps2dq and packed to int16 with signed saturation. The result
  ; overwrites src[0].
  ;
  ; Only src, matrix and len are loaded (3 args); out_ch/in_ch are not read.
  ; One pass handles mmsize bytes (mmsize/2 samples) per channel and the body
  ; runs at least once because len is only tested at the bottom of the loop.
  ; NOTE(review): presumably the caller guarantees aligned buffers and a
  ; padded len -- confirm.
  ;-----------------------------------------------------------------------------
  %macro MIX_2_TO_1_S16P_FLT 0
  cglobal mix_2_to_1_s16p_flt, 3,4,6, src, matrix, len, off1
      mov        matrixq, [matrixq]               ; matrixq = matrix[0]
      mov          off1q, [srcq+gprsize]          ; off1q   = src[1]
      mov           srcq, [srcq]                  ; srcq    = src[0]
      VBROADCASTSS    m4, [matrixq]               ; coefficient for channel 0
      VBROADCASTSS    m5, [matrixq+4]             ; coefficient for channel 1
      sub          off1q, srcq                    ; off1q = src[1] - src[0] (bytes)
      ALIGN 16
  .loop:
      ; channel 0: int16 -> int32 (m0/m1) -> float, scaled by m4
      mova            m0, [srcq]
      S16_TO_S32_SX    0, 1                       ; helper from the included util file
      cvtdq2ps        m0, m0
      cvtdq2ps        m1, m1
      mulps           m0, m4
      mulps           m1, m4
      ; channel 1: int16 -> int32 (m2/m3) -> float, scaled by m5
      mova            m2, [srcq+off1q]
      S16_TO_S32_SX    2, 3
      cvtdq2ps        m2, m2
      cvtdq2ps        m3, m3
      mulps           m2, m5
      mulps           m3, m5
      ; sum the channels and narrow back to int16
      addps           m0, m2
      addps           m1, m3
      cvtps2dq        m0, m0
      cvtps2dq        m1, m1
      packssdw        m0, m1
      mova        [srcq], m0
      add           srcq, mmsize
      sub           lend, mmsize/2                ; int16 samples handled this pass
      jg .loop
      REP_RET
  %endmacro

  INIT_XMM sse2
  MIX_2_TO_1_S16P_FLT
  INIT_XMM sse4
  MIX_2_TO_1_S16P_FLT
  ;-----------------------------------------------------------------------------
  ; void ff_mix_2_to_1_s16p_q8(int16_t **src, int16_t **matrix, int len,
  ;                            int out_ch, int in_ch);
  ;
  ; Planar int16, two channels mixed down to one with 16-bit integer
  ; coefficients. Each sample is multiplied by its channel coefficient
  ; (matrix[0][0] or matrix[0][1]) into a 32-bit product, the two products are
  ; added, arithmetically shifted right by 8 and packed to int16 with signed
  ; saturation. The result overwrites src[0].
  ;
  ; Only src, matrix and len are loaded (3 args); out_ch/in_ch are not read.
  ; One pass handles mmsize bytes (8 samples) per channel and the body runs at
  ; least once because len is only tested at the bottom of the loop.
  ; NOTE(review): presumably the caller guarantees aligned buffers and a
  ; padded len -- confirm.
  ;-----------------------------------------------------------------------------
  INIT_XMM sse2
  cglobal mix_2_to_1_s16p_q8, 3,4,6, src, matrix, len, off1
      mov        matrixq, [matrixq]               ; matrixq = matrix[0]
      mov          off1q, [srcq+gprsize]          ; off1q   = src[1]
      mov           srcq, [srcq]                  ; srcq    = src[0]
      pxor            m0, m0                      ; zero words for the interleave
      ; m4 = {c0, 0} word pairs so pmaddwd yields sample*c0 + sample*0
      movd            m4, [matrixq]
      SPLATW          m4, m4, 0
      punpcklwd       m4, m0
      ; m5 = {c1, 0} word pairs, built the same way from the second word
      movd            m5, [matrixq]
      SPLATW          m5, m5, 1
      punpcklwd       m5, m0
      sub          off1q, srcq                    ; off1q = src[1] - src[0] (bytes)
      ALIGN 16
  .loop:
      ; channel 0: duplicate every word, then multiply-accumulate against m4
      mova            m0, [srcq]
      punpckhwd       m1, m0, m0
      punpcklwd       m0, m0
      pmaddwd         m0, m4
      pmaddwd         m1, m4
      ; channel 1: same treatment against m5
      mova            m2, [srcq+off1q]
      punpckhwd       m3, m2, m2
      punpcklwd       m2, m2
      pmaddwd         m2, m5
      pmaddwd         m3, m5
      ; combine, drop the 8 fractional bits, narrow back to int16
      paddd           m0, m2
      paddd           m1, m3
      psrad           m0, 8
      psrad           m1, 8
      packssdw        m0, m1
      mova        [srcq], m0
      add           srcq, mmsize
      sub           lend, mmsize/2                ; int16 samples handled this pass
      jg .loop
      REP_RET
  ;-----------------------------------------------------------------------------
  ; void ff_mix_1_to_2_fltp_flt(float **src, float **matrix, int len,
  ;                             int out_ch, int in_ch);
  ;
  ; Planar float, one channel expanded to two. The input is read from src[0];
  ; the outputs are
  ;     src[0][i] = in[i] * matrix[0][0]
  ;     src[1][i] = in[i] * matrix[1][0]
  ; where in[i] is the value of src[0][i] before it is overwritten.
  ;
  ; Only src, matrix and len are loaded (3 args); out_ch/in_ch are not read.
  ; One pass handles mmsize bytes (mmsize/4 floats) and the body runs at least
  ; once because len is only tested at the bottom of the loop.
  ; NOTE(review): presumably the caller guarantees aligned buffers and a
  ; padded len -- confirm.
  ;-----------------------------------------------------------------------------
  %macro MIX_1_TO_2_FLTP_FLT 0
  cglobal mix_1_to_2_fltp_flt, 3,5,4, src0, matrix0, len, off1, mat1
      mov          mat1q, [matrix0q+gprsize]      ; mat1q    = matrix[1]
      mov       matrix0q, [matrix0q]              ; matrix0q = matrix[0]
      mov          off1q, [src0q+gprsize]         ; off1q    = src[1]
      mov          src0q, [src0q]                 ; src0q    = src[0]
      VBROADCASTSS    m2, [matrix0q]              ; gain for output channel 0
      VBROADCASTSS    m3, [mat1q]                 ; gain for output channel 1
      sub          off1q, src0q                   ; off1q = src[1] - src[0] (bytes)
      ALIGN 16
  .loop:
      mova            m1, [src0q]                 ; input vector
      mulps           m0, m1, m2                  ; channel 0 output
      mulps           m1, m1, m3                  ; channel 1 output
      mova       [src0q], m0
      mova [src0q+off1q], m1
      add          src0q, mmsize
      sub           lend, mmsize/4                ; floats handled this pass
      jg .loop
  %if mmsize != 32
      REP_RET
  %else
      vzeroupper                                  ; ymm registers were used
      RET
  %endif
  %endmacro

  INIT_XMM sse
  MIX_1_TO_2_FLTP_FLT
  %if HAVE_AVX
  INIT_YMM avx
  MIX_1_TO_2_FLTP_FLT
  %endif
  ;-----------------------------------------------------------------------------
  ; void ff_mix_1_to_2_s16p_flt(int16_t **src, float **matrix, int len,
  ;                             int out_ch, int in_ch);
  ;
  ; Planar int16, one channel expanded to two with float coefficients. Input
  ; samples from src[0] are widened to 32 bits and converted to float, then
  ; scaled by matrix[0][0] for the src[0] output and by matrix[1][0] for the
  ; src[1] output. Both results are converted back with cvtps2dq and packed
  ; to int16 with signed saturation.
  ;
  ; Only src, matrix and len are loaded (3 args); out_ch/in_ch are not read.
  ; One pass handles mmsize bytes (mmsize/2 samples) and the body runs at
  ; least once because len is only tested at the bottom of the loop.
  ; NOTE(review): presumably the caller guarantees aligned buffers and a
  ; padded len -- confirm.
  ;-----------------------------------------------------------------------------
  %macro MIX_1_TO_2_S16P_FLT 0
  cglobal mix_1_to_2_s16p_flt, 3,5,6, src0, matrix0, len, off1, mat1
      mov          mat1q, [matrix0q+gprsize]      ; mat1q    = matrix[1]
      mov       matrix0q, [matrix0q]              ; matrix0q = matrix[0]
      mov          off1q, [src0q+gprsize]         ; off1q    = src[1]
      mov          src0q, [src0q]                 ; src0q    = src[0]
      VBROADCASTSS    m4, [matrix0q]              ; gain for output channel 0
      VBROADCASTSS    m5, [mat1q]                 ; gain for output channel 1
      sub          off1q, src0q                   ; off1q = src[1] - src[0] (bytes)
      ALIGN 16
  .loop:
      ; input: int16 -> int32 (m0/m2) -> float
      mova            m0, [src0q]
      S16_TO_S32_SX    0, 2                       ; helper from the included util file
      cvtdq2ps        m0, m0
      cvtdq2ps        m2, m2
      ; channel 1 products go to fresh registers first so that m0/m2 still
      ; hold the unscaled input when the channel 0 products overwrite them
      mulps           m1, m0, m5
      mulps           m3, m2, m5
      mulps           m0, m0, m4
      mulps           m2, m2, m4
      ; channel 0: back to int16
      cvtps2dq        m0, m0
      cvtps2dq        m2, m2
      packssdw        m0, m2
      ; channel 1: back to int16
      cvtps2dq        m1, m1
      cvtps2dq        m3, m3
      packssdw        m1, m3
      mova       [src0q], m0
      mova [src0q+off1q], m1
      add          src0q, mmsize
      sub           lend, mmsize/2                ; int16 samples handled this pass
      jg .loop
      REP_RET
  %endmacro

  INIT_XMM sse2
  MIX_1_TO_2_S16P_FLT
  INIT_XMM sse4
  MIX_1_TO_2_S16P_FLT
  %if HAVE_AVX
  INIT_XMM avx
  MIX_1_TO_2_S16P_FLT
  %endif