;*****************************************************************************
;* x86-optimized Float DSP functions
;*
;* Copyright 2006 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86util.asm"

SECTION .text

;-----------------------------------------------------------------------------
; void vector_fmul(float *dst, const float *src0, const float *src1, int len)
;-----------------------------------------------------------------------------
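; Each iteration multiplies 64 bytes (16 floats), with lenq counting down from
; the end of the arrays. There is no scalar tail, so len is expected to be a
; multiple of 16, and the aligned mova loads/stores require mmsize-aligned
; pointers.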
%macro VECTOR_FMUL 0
cglobal vector_fmul, 4,4,2, dst, src0, src1, len
    lea lenq, [lend*4 - 64]
ALIGN 16
.loop:
%assign a 0
%rep 32/mmsize
    mova    m0, [src0q + lenq + (a+0)*mmsize]
    mova    m1, [src0q + lenq + (a+1)*mmsize]
    mulps   m0, m0, [src1q + lenq + (a+0)*mmsize]
    mulps   m1, m1, [src1q + lenq + (a+1)*mmsize]
    mova    [dstq + lenq + (a+0)*mmsize], m0
    mova    [dstq + lenq + (a+1)*mmsize], m1
%assign a a+2
%endrep
    sub     lenq, 64
    jge     .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL
%endif

;------------------------------------------------------------------------------
; void ff_vector_fmac_scalar(float *dst, const float *src, float mul, int len)
;------------------------------------------------------------------------------
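; dst[i] += src[i] * mul. The scalar reaches the function differently per ABI:
; on UNIX64 it is already in xmm0, on WIN64 it is in xmm2 (hence the SWAP), and
; on x86-32 it is loaded from the stack via mulm. shufps then splats it across
; the low 128-bit lane, and vinsertf128 duplicates that into the high lane for
; the AVX/FMA3 versions.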
%macro VECTOR_FMAC_SCALAR 0
%if UNIX64
cglobal vector_fmac_scalar, 3,3,5, dst, src, len
%else
cglobal vector_fmac_scalar, 4,4,5, dst, src, mul, len
%endif
%if ARCH_X86_32
    VBROADCASTSS m0, mulm
%else
%if WIN64
    SWAP 0, 2
%endif
    shufps      xm0, xm0, 0
%if cpuflag(avx)
    vinsertf128  m0, m0, xm0, 1
%endif
%endif
    lea lenq, [lend*4-64]
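; 64 bytes per iteration: four xmm registers with SSE, but only two ymm
; registers with AVX/FMA3, hence the mmsize < 32 guards around the extra pair.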
.loop:
%if cpuflag(fma3)
    mova     m1,     [dstq+lenq]
    mova     m2,     [dstq+lenq+1*mmsize]
    fmaddps  m1, m0, [srcq+lenq], m1
    fmaddps  m2, m0, [srcq+lenq+1*mmsize], m2
%else ; cpuflag
    mulps    m1, m0, [srcq+lenq]
    mulps    m2, m0, [srcq+lenq+1*mmsize]
%if mmsize < 32
    mulps    m3, m0, [srcq+lenq+2*mmsize]
    mulps    m4, m0, [srcq+lenq+3*mmsize]
%endif ; mmsize
    addps    m1, m1, [dstq+lenq]
    addps    m2, m2, [dstq+lenq+1*mmsize]
%if mmsize < 32
    addps    m3, m3, [dstq+lenq+2*mmsize]
    addps    m4, m4, [dstq+lenq+3*mmsize]
%endif ; mmsize
%endif ; cpuflag
    mova    [dstq+lenq], m1
    mova    [dstq+lenq+1*mmsize], m2
%if mmsize < 32
    mova    [dstq+lenq+2*mmsize], m3
    mova    [dstq+lenq+3*mmsize], m4
%endif ; mmsize
    sub     lenq, 64
    jge     .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMAC_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMAC_SCALAR
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
VECTOR_FMAC_SCALAR
%endif

;------------------------------------------------------------------------------
; void ff_vector_fmul_scalar(float *dst, const float *src, float mul, int len)
;------------------------------------------------------------------------------
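; dst[i] = src[i] * mul, one mmsize-wide vector per iteration. The scalar is
; fetched the same way as in vector_fmac_scalar above and splatted with shufps;
; only an SSE version is instantiated.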
%macro VECTOR_FMUL_SCALAR 0
%if UNIX64
cglobal vector_fmul_scalar, 3,3,2, dst, src, len
%else
cglobal vector_fmul_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
    movss   m0, mulm
%elif WIN64
    SWAP 0, 2
%endif
    shufps  m0, m0, 0
    lea     lenq, [lend*4-mmsize]
.loop:
    mova    m1, [srcq+lenq]
    mulps   m1, m0
    mova    [dstq+lenq], m1
    sub     lenq, mmsize
    jge     .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_SCALAR

;------------------------------------------------------------------------------
; void ff_vector_dmul_scalar(double *dst, const double *src, double mul,
;                            int len)
;------------------------------------------------------------------------------
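; Double-precision dst[i] = src[i] * mul. movlhps duplicates the scalar into
; both halves of the xmm register and vinsertf128 extends it to the full ymm
; register for AVX. On x86-32 the double occupies two 4-byte argument slots,
; so the real len is read from the following slot (lenaddr).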
%macro VECTOR_DMUL_SCALAR 0
%if ARCH_X86_32
cglobal vector_dmul_scalar, 3,4,3, dst, src, mul, len, lenaddr
    mov lenq, lenaddrm
%elif UNIX64
cglobal vector_dmul_scalar, 3,3,3, dst, src, len
%else
cglobal vector_dmul_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
    VBROADCASTSD m0, mulm
%else
%if WIN64
    SWAP 0, 2
%endif
    movlhps     xm0, xm0
%if cpuflag(avx)
    vinsertf128 ym0, ym0, xm0, 1
%endif
%endif
    lea lenq, [lend*8-2*mmsize]
.loop:
    mulpd   m1, m0, [srcq+lenq       ]
    mulpd   m2, m0, [srcq+lenq+mmsize]
    mova    [dstq+lenq       ], m1
    mova    [dstq+lenq+mmsize], m2
    sub     lenq, 2*mmsize
    jge     .loop
    REP_RET
%endmacro

INIT_XMM sse2
VECTOR_DMUL_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_DMUL_SCALAR
%endif

;-----------------------------------------------------------------------------
; vector_fmul_add(float *dst, const float *src0, const float *src1,
;                 const float *src2, int len)
;-----------------------------------------------------------------------------
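; dst[i] = src0[i] * src1[i] + src2[i]. The FMA3 version fuses the multiply
; and add with fmaddps; the SSE/AVX fallback uses separate mulps/addps.
; Both process 2*mmsize bytes per iteration.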
%macro VECTOR_FMUL_ADD 0
cglobal vector_fmul_add, 5,5,4, dst, src0, src1, src2, len
    lea lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
    mova    m0, [src0q + lenq]
    mova    m1, [src0q + lenq + mmsize]
%if cpuflag(fma3)
    mova    m2,     [src2q + lenq]
    mova    m3,     [src2q + lenq + mmsize]
    fmaddps m0, m0, [src1q + lenq], m2
    fmaddps m1, m1, [src1q + lenq + mmsize], m3
%else
    mulps   m0, m0, [src1q + lenq]
    mulps   m1, m1, [src1q + lenq + mmsize]
    addps   m0, m0, [src2q + lenq]
    addps   m1, m1, [src2q + lenq + mmsize]
%endif
    mova    [dstq + lenq], m0
    mova    [dstq + lenq + mmsize], m1
    sub     lenq, 2*mmsize
    jge     .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_ADD
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_ADD
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
VECTOR_FMUL_ADD
%endif

;-----------------------------------------------------------------------------
; void vector_fmul_reverse(float *dst, const float *src0, const float *src1,
;                          int len)
;-----------------------------------------------------------------------------
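; dst[i] = src0[i] * src1[len - 1 - i]: src0/dst are indexed backwards through
; lenq while src1 advances forwards. shufps with q0123 reverses the four floats
; within each 128-bit lane; the AVX path additionally swaps the two lanes by
; loading the halves separately and recombining them with vinsertf128.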
%macro VECTOR_FMUL_REVERSE 0
cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len
    lea lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
%if cpuflag(avx)
    vmovaps     xmm0, [src1q + 16]
    vinsertf128 m0, m0, [src1q], 1
    vshufps     m0, m0, m0, q0123
    vmovaps     xmm1, [src1q + mmsize + 16]
    vinsertf128 m1, m1, [src1q + mmsize], 1
    vshufps     m1, m1, m1, q0123
%else
    mova    m0, [src1q]
    mova    m1, [src1q + mmsize]
    shufps  m0, m0, q0123
    shufps  m1, m1, q0123
%endif
    mulps   m0, m0, [src0q + lenq + mmsize]
    mulps   m1, m1, [src0q + lenq]
    mova    [dstq + lenq + mmsize], m0
    mova    [dstq + lenq], m1
    add     src1q, 2*mmsize
    sub     lenq,  2*mmsize
    jge     .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_REVERSE
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_REVERSE
%endif

; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
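; Accumulates four partial sums of v1[i]*v2[i] in xmm0, using a negative byte
; offset that counts up towards zero, then reduces them horizontally with
; movhlps/shufps after the loop.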
INIT_XMM sse
cglobal scalarproduct_float, 3,3,2, v1, v2, offset
    neg     offsetq
    shl     offsetq, 2
    sub     v1q, offsetq
    sub     v2q, offsetq
    xorps   xmm0, xmm0
.loop:
    movaps  xmm1, [v1q+offsetq]
    mulps   xmm1, [v2q+offsetq]
    addps   xmm0, xmm1
    add     offsetq, 16
    js      .loop
    movhlps xmm1, xmm0
    addps   xmm0, xmm1
    movss   xmm1, xmm0
    shufps  xmm0, xmm0, 1
    addss   xmm0, xmm1
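; 32-bit ABIs return floats on the x87 stack, so the result is bounced through
; memory with fld; on x86-64 it is simply left in xmm0.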
%if ARCH_X86_64 == 0
    movss   r0m,  xmm0
    fld     dword r0m
%endif
    RET

;-----------------------------------------------------------------------------
; void ff_butterflies_float(float *src0, float *src1, int len);
;-----------------------------------------------------------------------------
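; In-place butterfly: src0[i] += src1[i] and src1[i] = old src0[i] - src1[i].
; The pointers are advanced to the end and indexed with a negative counter so
; a single add/jl pair terminates the loop.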
INIT_XMM sse
cglobal butterflies_float, 3,3,3, src0, src1, len
%if ARCH_X86_64
    movsxd  lenq, lend
%endif
    test    lenq, lenq
    jz      .end
    shl     lenq, 2
    add     src0q, lenq
    add     src1q, lenq
    neg     lenq
.loop:
    mova    m0, [src0q + lenq]
    mova    m1, [src1q + lenq]
    subps   m2, m0, m1
    addps   m0, m0, m1
    mova    [src1q + lenq], m2
    mova    [src0q + lenq], m0
    add     lenq, mmsize
    jl      .loop
.end:
    REP_RET