;*****************************************************************************
;* x86-optimized Float DSP functions
;*
;* Copyright 2006 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86util.asm"

SECTION .text

;-----------------------------------------------------------------------------
; void vector_fmul(float *dst, const float *src0, const float *src1, int len)
;-----------------------------------------------------------------------------
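; Reference behaviour of the routine below, as a C-style sketch (illustrative
; only, not part of the original file; names follow the prototype above):
;     for (i = 0; i < len; i++)
;         dst[i] = src0[i] * src1[i];
; The SIMD loop handles 16 floats (64 bytes) per iteration and uses aligned
; loads/stores throughout.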
%macro VECTOR_FMUL 0
cglobal vector_fmul, 4,4,2, dst, src0, src1, len
    ; lenq becomes the byte offset of the last 64-byte block; the loop then
    ; walks backwards through the buffers, 16 floats per iteration
    lea       lenq, [lend*4 - 64]
ALIGN 16
.loop:
%assign a 0
%rep 32/mmsize
    mova      m0, [src0q + lenq + (a+0)*mmsize]
    mova      m1, [src0q + lenq + (a+1)*mmsize]
    mulps     m0, m0, [src1q + lenq + (a+0)*mmsize]
    mulps     m1, m1, [src1q + lenq + (a+1)*mmsize]
    mova      [dstq + lenq + (a+0)*mmsize], m0
    mova      [dstq + lenq + (a+1)*mmsize], m1
%assign a a+2
%endrep

    sub       lenq, 64
    jge       .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL
%endif

;------------------------------------------------------------------------------
; void ff_vector_fmac_scalar(float *dst, const float *src, float mul, int len)
;------------------------------------------------------------------------------
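; C-style sketch of the operation (illustrative only, not part of the original
; file; names follow the prototype above):
;     for (i = 0; i < len; i++)
;         dst[i] += src[i] * mul;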
%macro VECTOR_FMAC_SCALAR 0
%if UNIX64
; on UNIX64 the scalar mul already arrives in xmm0, so it needs no GPR slot
cglobal vector_fmac_scalar, 3,3,3, dst, src, len
%else
cglobal vector_fmac_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
    VBROADCASTSS m0, mulm
%else
%if WIN64
    SWAP 0, 2                    ; WIN64 passes the third argument (mul) in xmm2
%endif
    shufps       xm0, xm0, 0     ; splat mul across the low 128 bits
%if cpuflag(avx)
    vinsertf128   m0,  m0, xm0, 1 ; ... and into the upper lane for ymm
%endif
%endif
    lea         lenq, [lend*4-64]
.loop:
%assign a 0
%rep 32/mmsize
%if cpuflag(fma3)
    mova          m1,     [dstq+lenq+(a+0)*mmsize]
    mova          m2,     [dstq+lenq+(a+1)*mmsize]
    fmaddps       m1, m0, [srcq+lenq+(a+0)*mmsize], m1
    fmaddps       m2, m0, [srcq+lenq+(a+1)*mmsize], m2
%else
    mulps         m1, m0, [srcq+lenq+(a+0)*mmsize]
    mulps         m2, m0, [srcq+lenq+(a+1)*mmsize]
    addps         m1, m1, [dstq+lenq+(a+0)*mmsize]
    addps         m2, m2, [dstq+lenq+(a+1)*mmsize]
%endif
    mova  [dstq+lenq+(a+0)*mmsize], m1
    mova  [dstq+lenq+(a+1)*mmsize], m2
%assign a a+2
%endrep

    sub         lenq, 64
    jge         .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMAC_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMAC_SCALAR
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
VECTOR_FMAC_SCALAR
%endif

;------------------------------------------------------------------------------
; void ff_vector_fmul_scalar(float *dst, const float *src, float mul, int len)
;------------------------------------------------------------------------------
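; C-style sketch (illustrative only, not part of the original file):
;     for (i = 0; i < len; i++)
;         dst[i] = src[i] * mul;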
%macro VECTOR_FMUL_SCALAR 0
%if UNIX64
cglobal vector_fmul_scalar, 3,3,2, dst, src, len
%else
cglobal vector_fmul_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
    movss     m0, mulm
%elif WIN64
    SWAP 0, 2
%endif
    shufps    m0, m0, 0          ; broadcast mul to all four lanes
    lea     lenq, [lend*4-mmsize]
.loop:
    mova      m1, [srcq+lenq]
    mulps     m1, m0
    mova  [dstq+lenq], m1
    sub     lenq, mmsize
    jge     .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_SCALAR

;------------------------------------------------------------------------------
; void ff_vector_dmul_scalar(double *dst, const double *src, double mul,
;                            int len)
;------------------------------------------------------------------------------
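; C-style sketch (illustrative only, not part of the original file); same as
; ff_vector_fmul_scalar but operating on doubles:
;     for (i = 0; i < len; i++)
;         dst[i] = src[i] * mul;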
%macro VECTOR_DMUL_SCALAR 0
%if ARCH_X86_32
cglobal vector_dmul_scalar, 3,4,3, dst, src, mul, len, lenaddr
    ; mul is an 8-byte double on the x86_32 stack, so the real len argument
    ; lives one slot further along, named lenaddr here
    mov          lenq, lenaddrm
%elif UNIX64
cglobal vector_dmul_scalar, 3,3,3, dst, src, len
%else
cglobal vector_dmul_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
    VBROADCASTSD m0, mulm
%else
%if WIN64
    ; WIN64 passes the scalar in xmm2: duplicate the low double (and extend
    ; into the upper ymm lane on AVX), then move it into m0
    movlhps      xmm2, xmm2
%if cpuflag(avx)
    vinsertf128  ymm2, ymm2, xmm2, 1
%endif
    SWAP 0, 2
%else
    movlhps      xmm0, xmm0
%if cpuflag(avx)
    vinsertf128  ymm0, ymm0, xmm0, 1
%endif
%endif
%endif
    lea          lenq, [lend*8-2*mmsize]
.loop:
    mulpd        m1, m0, [srcq+lenq       ]
    mulpd        m2, m0, [srcq+lenq+mmsize]
    mova  [dstq+lenq       ], m1
    mova  [dstq+lenq+mmsize], m2
    sub          lenq, 2*mmsize
    jge          .loop
    REP_RET
%endmacro

INIT_XMM sse2
VECTOR_DMUL_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_DMUL_SCALAR
%endif

;-----------------------------------------------------------------------------
; vector_fmul_add(float *dst, const float *src0, const float *src1,
;                 const float *src2, int len)
;-----------------------------------------------------------------------------
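; C-style sketch (illustrative only, not part of the original file; names
; follow the prototype above):
;     for (i = 0; i < len; i++)
;         dst[i] = src0[i] * src1[i] + src2[i];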
%macro VECTOR_FMUL_ADD 0
cglobal vector_fmul_add, 5,5,4, dst, src0, src1, src2, len
    lea       lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
    mova      m0,   [src0q + lenq]
    mova      m1,   [src0q + lenq + mmsize]
%if cpuflag(fma3)
    ; fused path: m0/m1 = src0*src1 + src2 in a single instruction each
    mova      m2,   [src2q + lenq]
    mova      m3,   [src2q + lenq + mmsize]
    fmaddps   m0, m0, [src1q + lenq], m2
    fmaddps   m1, m1, [src1q + lenq + mmsize], m3
%else
    mulps     m0, m0, [src1q + lenq]
    mulps     m1, m1, [src1q + lenq + mmsize]
    addps     m0, m0, [src2q + lenq]
    addps     m1, m1, [src2q + lenq + mmsize]
%endif
    mova      [dstq + lenq], m0
    mova      [dstq + lenq + mmsize], m1

    sub       lenq, 2*mmsize
    jge       .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_ADD
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_ADD
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
VECTOR_FMUL_ADD
%endif

;-----------------------------------------------------------------------------
; void vector_fmul_reverse(float *dst, const float *src0, const float *src1,
;                          int len)
;-----------------------------------------------------------------------------
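; C-style sketch (illustrative only, not part of the original file; names
; follow the prototype above):
;     for (i = 0; i < len; i++)
;         dst[i] = src0[i] * src1[len - 1 - i];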
%macro VECTOR_FMUL_REVERSE 0
cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len
    ; src1 is read forwards from its start while dst/src0 are indexed
    ; backwards from their ends; each vector loaded from src1 is reversed
    ; before the multiply
    lea       lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
%if cpuflag(avx)
    vmovaps     xmm0, [src1q + 16]
    vinsertf128 m0, m0, [src1q], 1
    vshufps     m0, m0, m0, q0123
    vmovaps     xmm1, [src1q + mmsize + 16]
    vinsertf128 m1, m1, [src1q + mmsize], 1
    vshufps     m1, m1, m1, q0123
%else
    mova        m0, [src1q]
    mova        m1, [src1q + mmsize]
    shufps      m0, m0, q0123
    shufps      m1, m1, q0123
%endif
    mulps       m0, m0, [src0q + lenq + mmsize]
    mulps       m1, m1, [src0q + lenq]
    mova        [dstq + lenq + mmsize], m0
    mova        [dstq + lenq], m1
    add         src1q, 2*mmsize
    sub         lenq,  2*mmsize
    jge         .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_REVERSE
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_REVERSE
%endif

; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
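; C-style sketch (illustrative only, not part of the original file; names
; follow the prototype above):
;     float sum = 0;
;     for (i = 0; i < len; i++)
;         sum += v1[i] * v2[i];
;     return sum;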
INIT_XMM sse
cglobal scalarproduct_float, 3,3,2, v1, v2, offset
    ; the len argument is reused as a negative byte offset that counts up to 0
    neg   offsetq
    shl   offsetq, 2
    sub       v1q, offsetq
    sub       v2q, offsetq
    xorps    xmm0, xmm0
.loop:
    movaps   xmm1, [v1q+offsetq]
    mulps    xmm1, [v2q+offsetq]
    addps    xmm0, xmm1
    add   offsetq, 16
    js .loop
    ; horizontal sum of the four partial sums
    movhlps  xmm1, xmm0
    addps    xmm0, xmm1
    movss    xmm1, xmm0
    shufps   xmm0, xmm0, 1
    addss    xmm0, xmm1
%if ARCH_X86_64 == 0
    ; on x86_32 the float return value is expected in st0
    movss    r0m,  xmm0
    fld      dword r0m
%endif
    RET

;-----------------------------------------------------------------------------
; void ff_butterflies_float(float *src0, float *src1, int len);
;-----------------------------------------------------------------------------
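; C-style sketch (illustrative only, not part of the original file; names
; follow the prototype above):
;     for (i = 0; i < len; i++) {
;         float t  = src0[i] - src1[i];
;         src0[i] += src1[i];
;         src1[i]  = t;
;     }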
INIT_XMM sse
cglobal butterflies_float, 3,3,3, src0, src1, len
; in-place butterfly: src0[i] += src1[i]; src1[i] = old src0[i] - src1[i]
%if ARCH_X86_64
    movsxd    lenq, lend
%endif
    test      lenq, lenq
    jz .end
    shl       lenq, 2
    add      src0q, lenq
    add      src1q, lenq
    neg       lenq
.loop:
    mova        m0, [src0q + lenq]
    mova        m1, [src1q + lenq]
    subps       m2, m0, m1
    addps       m0, m0, m1
    mova [src1q + lenq], m2
    mova [src0q + lenq], m0
    add       lenq, mmsize
    jl .loop
.end:
    REP_RET