;******************************************************************************
;* Copyright (c) 2012 Michael Niedermayer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
align 32
dw1: times 8  dd 1
w1:  times 16 dw 1

SECTION .text
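
; Mix two float input channels into one output channel:
;     out[i] = in1[i] * coeffp[index1] + in2[i] * coeffp[index2]
; %1 selects the aligned (a) or unaligned (u) variant; the aligned entry
; point branches to the unaligned code if any pointer is not mmsize-aligned.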
%macro MIX2_FLT 1
cglobal mix_2_1_%1_float, 7, 7, 6, out, in1, in2, coeffp, index1, index2, len
%ifidn %1, a
    test in1q, mmsize-1
        jne mix_2_1_float_u_int %+ SUFFIX
    test in2q, mmsize-1
        jne mix_2_1_float_u_int %+ SUFFIX
    test outq, mmsize-1
        jne mix_2_1_float_u_int %+ SUFFIX
%else
mix_2_1_float_u_int %+ SUFFIX:
%endif
    VBROADCASTSS m4, [coeffpq + 4*index1q]
    VBROADCASTSS m5, [coeffpq + 4*index2q]
    shl lend, 2
    add in1q, lenq
    add in2q, lenq
    add outq, lenq
    neg lenq
.next:
%ifidn %1, a
    mulps        m0, m4, [in1q + lenq         ]
    mulps        m1, m5, [in2q + lenq         ]
    mulps        m2, m4, [in1q + lenq + mmsize]
    mulps        m3, m5, [in2q + lenq + mmsize]
%else
    movu         m0, [in1q + lenq         ]
    movu         m1, [in2q + lenq         ]
    movu         m2, [in1q + lenq + mmsize]
    movu         m3, [in2q + lenq + mmsize]
    mulps        m0, m0, m4
    mulps        m1, m1, m5
    mulps        m2, m2, m4
    mulps        m3, m3, m5
%endif
    addps        m0, m0, m1
    addps        m2, m2, m3
    mov%1  [outq + lenq         ], m0
    mov%1  [outq + lenq + mmsize], m2
    add lenq, mmsize*2
        jl .next
    REP_RET
%endmacro
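
; Scale a single float channel by one coefficient:
;     out[i] = in[i] * coeffp[index]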
%macro MIX1_FLT 1
cglobal mix_1_1_%1_float, 5, 5, 3, out, in, coeffp, index, len
%ifidn %1, a
    test inq, mmsize-1
        jne mix_1_1_float_u_int %+ SUFFIX
    test outq, mmsize-1
        jne mix_1_1_float_u_int %+ SUFFIX
%else
mix_1_1_float_u_int %+ SUFFIX:
%endif
    VBROADCASTSS m2, [coeffpq + 4*indexq]
    shl lenq, 2
    add inq , lenq
    add outq, lenq
    neg lenq
.next:
%ifidn %1, a
    mulps        m0, m2, [inq + lenq         ]
    mulps        m1, m2, [inq + lenq + mmsize]
%else
    movu         m0, [inq + lenq         ]
    movu         m1, [inq + lenq + mmsize]
    mulps        m0, m0, m2
    mulps        m1, m1, m2
%endif
    mov%1  [outq + lenq         ], m0
    mov%1  [outq + lenq + mmsize], m1
    add lenq, mmsize*2
        jl .next
    REP_RET
%endmacro
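
; Fixed-point single-channel scale. Each 32-bit coefficient entry packs the
; 16-bit factor in its low half and the right-shift amount in its high half:
;     out[i] = (in[i] * coeff + rounding) >> shift
; The rounding term 1 << (shift - 1) is built from the w1 constant and folded
; into the pmaddwd via the interleaved (coeff, rounding) word pairs.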
%macro MIX1_INT16 1
cglobal mix_1_1_%1_int16, 5, 5, 6, out, in, coeffp, index, len
%ifidn %1, a
    test inq, mmsize-1
        jne mix_1_1_int16_u_int %+ SUFFIX
    test outq, mmsize-1
        jne mix_1_1_int16_u_int %+ SUFFIX
%else
mix_1_1_int16_u_int %+ SUFFIX:
%endif
    movd   m4, [coeffpq + 4*indexq]
    SPLATW m5, m4
    psllq  m4, 32
    psrlq  m4, 48
    mova   m0, [w1]
    psllw  m0, m4
    psrlw  m0, 1
    punpcklwd m5, m0
    add lenq, lenq
    add inq , lenq
    add outq, lenq
    neg lenq
.next:
    mov%1        m0, [inq + lenq         ]
    mov%1        m2, [inq + lenq + mmsize]
    mova         m1, m0
    mova         m3, m2
    punpcklwd    m0, [w1]
    punpckhwd    m1, [w1]
    punpcklwd    m2, [w1]
    punpckhwd    m3, [w1]
    pmaddwd      m0, m5
    pmaddwd      m1, m5
    pmaddwd      m2, m5
    pmaddwd      m3, m5
    psrad        m0, m4
    psrad        m1, m4
    psrad        m2, m4
    psrad        m3, m4
    packssdw     m0, m1
    packssdw     m2, m3
    mov%1  [outq + lenq         ], m0
    mov%1  [outq + lenq + mmsize], m2
    add lenq, mmsize*2
        jl .next
%if mmsize == 8
    emms
    RET
%else
    REP_RET
%endif
%endmacro
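
; Fixed-point two-channel mix:
;     out[i] = (in1[i] * coeff1 + in2[i] * coeff2 + rounding) >> shift
; coeff1/coeff2 come from coeffp[index1]/coeffp[index2]; as above, the shift
; is taken from the upper half of the first coefficient entry and the rounding
; term 1 << (shift - 1) is built from the dw1 constant and added before
; shifting back down.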
%macro MIX2_INT16 1
cglobal mix_2_1_%1_int16, 7, 7, 8, out, in1, in2, coeffp, index1, index2, len
%ifidn %1, a
    test in1q, mmsize-1
        jne mix_2_1_int16_u_int %+ SUFFIX
    test in2q, mmsize-1
        jne mix_2_1_int16_u_int %+ SUFFIX
    test outq, mmsize-1
        jne mix_2_1_int16_u_int %+ SUFFIX
%else
mix_2_1_int16_u_int %+ SUFFIX:
%endif
    movd   m4, [coeffpq + 4*index1q]
    movd   m6, [coeffpq + 4*index2q]
    SPLATW m5, m4
    SPLATW m6, m6
    psllq  m4, 32
    psrlq  m4, 48
    mova   m7, [dw1]
    pslld  m7, m4
    psrld  m7, 1
    punpcklwd m5, m6
    add lend, lend
    add in1q, lenq
    add in2q, lenq
    add outq, lenq
    neg lenq
.next:
    mov%1        m0, [in1q + lenq         ]
    mov%1        m2, [in2q + lenq         ]
    mova         m1, m0
    punpcklwd    m0, m2
    punpckhwd    m1, m2
    mov%1        m2, [in1q + lenq + mmsize]
    mov%1        m6, [in2q + lenq + mmsize]
    mova         m3, m2
    punpcklwd    m2, m6
    punpckhwd    m3, m6
    pmaddwd      m0, m5
    pmaddwd      m1, m5
    pmaddwd      m2, m5
    pmaddwd      m3, m5
    paddd        m0, m7
    paddd        m1, m7
    paddd        m2, m7
    paddd        m3, m7
    psrad        m0, m4
    psrad        m1, m4
    psrad        m2, m4
    psrad        m3, m4
    packssdw     m0, m1
    packssdw     m2, m3
    mov%1  [outq + lenq         ], m0
    mov%1  [outq + lenq + mmsize], m2
    add lenq, mmsize*2
        jl .next
%if mmsize == 8
    emms
    RET
%else
    REP_RET
%endif
%endmacro
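
; Instantiate unaligned (u) and aligned (a) variants of each mixer for the
; MMX, SSE, SSE2 and (when available) AVX instruction sets.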
INIT_MMX mmx
MIX1_INT16 u
MIX1_INT16 a
MIX2_INT16 u
MIX2_INT16 a

INIT_XMM sse
MIX2_FLT u
MIX2_FLT a
MIX1_FLT u
MIX1_FLT a

INIT_XMM sse2
MIX1_INT16 u
MIX1_INT16 a
MIX2_INT16 u
MIX2_INT16 a

%if HAVE_AVX_EXTERNAL
INIT_YMM avx
MIX2_FLT u
MIX2_FLT a
MIX1_FLT u
MIX1_FLT a
%endif