;*****************************************************************************
;* x86-optimized Float DSP functions
;*
;* Copyright 2006 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86util.asm"

SECTION .text

;-----------------------------------------------------------------------------
; void vector_fmul(float *dst, const float *src0, const float *src1, int len)
;-----------------------------------------------------------------------------
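; Roughly equivalent C (illustrative sketch, not part of the original source;
; assumes aligned buffers and len a multiple of 16, so the 64-byte unrolled
; loop below covers the whole array):
;     for (int i = 0; i < len; i++)
;         dst[i] = src0[i] * src1[i];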
%macro VECTOR_FMUL 0
cglobal vector_fmul, 4,4,2, dst, src0, src1, len
    lea   lenq, [lend*4 - 64]
ALIGN 16
.loop:
%assign a 0
%rep 32/mmsize
    mova  m0, [src0q + lenq + (a+0)*mmsize]
    mova  m1, [src0q + lenq + (a+1)*mmsize]
    mulps m0, m0, [src1q + lenq + (a+0)*mmsize]
    mulps m1, m1, [src1q + lenq + (a+1)*mmsize]
    mova  [dstq + lenq + (a+0)*mmsize], m0
    mova  [dstq + lenq + (a+1)*mmsize], m1
%assign a a+2
%endrep

    sub   lenq, 64
    jge   .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL
%endif

;------------------------------------------------------------------------------
; void ff_vector_fmac_scalar(float *dst, const float *src, float mul, int len)
;------------------------------------------------------------------------------
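; Roughly equivalent C (illustrative sketch, not part of the original source;
; assumes aligned buffers and len a multiple of 16):
;     for (int i = 0; i < len; i++)
;         dst[i] += src[i] * mul;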
%macro VECTOR_FMAC_SCALAR 0
%if UNIX64
cglobal vector_fmac_scalar, 3,3,3, dst, src, len
%else
cglobal vector_fmac_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
    VBROADCASTSS m0, mulm
%else
%if WIN64
    mova   xmm0, xmm2
%endif
    shufps xmm0, xmm0, 0
%if cpuflag(avx)
    vinsertf128 m0, m0, xmm0, 1
%endif
%endif
    lea   lenq, [lend*4-64]
.loop:
%assign a 0
%rep 32/mmsize
    mulps m1, m0, [srcq+lenq+(a+0)*mmsize]
    mulps m2, m0, [srcq+lenq+(a+1)*mmsize]
    addps m1, m1, [dstq+lenq+(a+0)*mmsize]
    addps m2, m2, [dstq+lenq+(a+1)*mmsize]
    mova  [dstq+lenq+(a+0)*mmsize], m1
    mova  [dstq+lenq+(a+1)*mmsize], m2
%assign a a+2
%endrep

    sub   lenq, 64
    jge   .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMAC_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMAC_SCALAR
%endif

;------------------------------------------------------------------------------
; void ff_vector_fmul_scalar(float *dst, const float *src, float mul, int len)
;------------------------------------------------------------------------------
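; Roughly equivalent C (illustrative sketch, not part of the original source;
; assumes aligned buffers and len a multiple of 4):
;     for (int i = 0; i < len; i++)
;         dst[i] = src[i] * mul;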
%macro VECTOR_FMUL_SCALAR 0
%if UNIX64
cglobal vector_fmul_scalar, 3,3,2, dst, src, len
%else
cglobal vector_fmul_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
    movss  m0, mulm
%elif WIN64
    SWAP 0, 2
%endif
    shufps m0, m0, 0
    lea    lenq, [lend*4-mmsize]
.loop:
    mova   m1, [srcq+lenq]
    mulps  m1, m0
    mova   [dstq+lenq], m1
    sub    lenq, mmsize
    jge    .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_SCALAR

;------------------------------------------------------------------------------
; void ff_vector_dmul_scalar(double *dst, const double *src, double mul,
;                            int len)
;------------------------------------------------------------------------------
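; Roughly equivalent C (illustrative sketch, not part of the original source;
; assumes aligned buffers and len a multiple of 4 doubles, 8 with AVX):
;     for (int i = 0; i < len; i++)
;         dst[i] = src[i] * mul;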
%macro VECTOR_DMUL_SCALAR 0
%if ARCH_X86_32
cglobal vector_dmul_scalar, 3,4,3, dst, src, mul, len, lenaddr
    mov lenq, lenaddrm
%elif UNIX64
cglobal vector_dmul_scalar, 3,3,3, dst, src, len
%else
cglobal vector_dmul_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
    VBROADCASTSD m0, mulm
%else
%if WIN64
    movlhps xmm2, xmm2
%if cpuflag(avx)
    vinsertf128 ymm2, ymm2, xmm2, 1
%endif
    SWAP 0, 2
%else
    movlhps xmm0, xmm0
%if cpuflag(avx)
    vinsertf128 ymm0, ymm0, xmm0, 1
%endif
%endif
%endif
    lea   lenq, [lend*8-2*mmsize]
.loop:
    mulpd m1, m0, [srcq+lenq       ]
    mulpd m2, m0, [srcq+lenq+mmsize]
    mova  [dstq+lenq       ], m1
    mova  [dstq+lenq+mmsize], m2
    sub   lenq, 2*mmsize
    jge   .loop
    REP_RET
%endmacro

INIT_XMM sse2
VECTOR_DMUL_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_DMUL_SCALAR
%endif

;-----------------------------------------------------------------------------
; vector_fmul_add(float *dst, const float *src0, const float *src1,
;                 const float *src2, int len)
;-----------------------------------------------------------------------------
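; Roughly equivalent C (illustrative sketch, not part of the original source;
; assumes aligned buffers and len a multiple of 8 floats, 16 with AVX):
;     for (int i = 0; i < len; i++)
;         dst[i] = src0[i] * src1[i] + src2[i];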
%macro VECTOR_FMUL_ADD 0
cglobal vector_fmul_add, 5,5,2, dst, src0, src1, src2, len
    lea   lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
    mova  m0,     [src0q + lenq]
    mova  m1,     [src0q + lenq + mmsize]
    mulps m0, m0, [src1q + lenq]
    mulps m1, m1, [src1q + lenq + mmsize]
    addps m0, m0, [src2q + lenq]
    addps m1, m1, [src2q + lenq + mmsize]
    mova  [dstq + lenq], m0
    mova  [dstq + lenq + mmsize], m1

    sub   lenq, 2*mmsize
    jge   .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_ADD
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_ADD
%endif

;-----------------------------------------------------------------------------
; void vector_fmul_reverse(float *dst, const float *src0, const float *src1,
;                          int len)
;-----------------------------------------------------------------------------
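; Roughly equivalent C (illustrative sketch, not part of the original source;
; assumes aligned buffers and len a multiple of 8 floats, 16 with AVX; src1 is
; walked forwards while dst/src0 are walked backwards from the end):
;     for (int i = 0; i < len; i++)
;         dst[i] = src0[i] * src1[len - i - 1];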
%macro VECTOR_FMUL_REVERSE 0
cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len
    lea    lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
%if cpuflag(avx)
    vmovaps     xmm0, [src1q + 16]
    vinsertf128 m0, m0, [src1q], 1
    vshufps     m0, m0, m0, q0123
    vmovaps     xmm1, [src1q + mmsize + 16]
    vinsertf128 m1, m1, [src1q + mmsize], 1
    vshufps     m1, m1, m1, q0123
%else
    mova   m0, [src1q]
    mova   m1, [src1q + mmsize]
    shufps m0, m0, q0123
    shufps m1, m1, q0123
%endif
    mulps  m0, m0, [src0q + lenq + mmsize]
    mulps  m1, m1, [src0q + lenq]
    mova   [dstq + lenq + mmsize], m0
    mova   [dstq + lenq], m1
    add    src1q, 2*mmsize
    sub    lenq,  2*mmsize
    jge    .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_REVERSE
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_REVERSE
%endif

; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
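; Roughly equivalent C (illustrative sketch, not part of the original source;
; assumes aligned buffers and len a positive multiple of 4):
;     float sum = 0.0f;
;     for (int i = 0; i < len; i++)
;         sum += v1[i] * v2[i];
;     return sum;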
INIT_XMM sse
cglobal scalarproduct_float, 3,3,2, v1, v2, offset
    neg   offsetq
    shl   offsetq, 2
    sub   v1q, offsetq
    sub   v2q, offsetq
    xorps xmm0, xmm0
.loop:
    movaps xmm1, [v1q+offsetq]
    mulps  xmm1, [v2q+offsetq]
    addps  xmm0, xmm1
    add    offsetq, 16
    js     .loop
    movhlps xmm1, xmm0
    addps   xmm0, xmm1
    movss   xmm1, xmm0
    shufps  xmm0, xmm0, 1
    addss   xmm0, xmm1
%if ARCH_X86_64 == 0
    movss  r0m,  xmm0
    fld    dword r0m
%endif
    RET

;-----------------------------------------------------------------------------
; void ff_butterflies_float(float *src0, float *src1, int len);
;-----------------------------------------------------------------------------
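; Roughly equivalent C (illustrative sketch, not part of the original source;
; assumes aligned buffers and len a multiple of 4):
;     for (int i = 0; i < len; i++) {
;         float t  = src0[i] - src1[i];
;         src0[i] += src1[i];
;         src1[i]  = t;
;     }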
INIT_XMM sse
cglobal butterflies_float, 3,3,3, src0, src1, len
%if ARCH_X86_64
    movsxd lenq, lend
%endif
    test   lenq, lenq
    jz     .end
    shl    lenq, 2
    add    src0q, lenq
    add    src1q, lenq
    neg    lenq
.loop:
    mova   m0, [src0q + lenq]
    mova   m1, [src1q + lenq]
    subps  m2, m0, m1
    addps  m0, m0, m1
    mova   [src1q + lenq], m2
    mova   [src0q + lenq], m0
    add    lenq, mmsize
    jl     .loop
.end:
    REP_RET