;*****************************************************************************
;* x86-optimized Float DSP functions
;*
;* Copyright 2006 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86util.asm"

SECTION .text

;-----------------------------------------------------------------------------
; void vector_fmul(float *dst, const float *src0, const float *src1, int len)
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL 0
cglobal vector_fmul, 4,4,2, dst, src0, src1, len
    lea       lenq, [lend*4 - 64]
ALIGN 16
.loop:
%assign a 0
%rep 32/mmsize
    mova      m0, [src0q + lenq + (a+0)*mmsize]
    mova      m1, [src0q + lenq + (a+1)*mmsize]
    mulps     m0, m0, [src1q + lenq + (a+0)*mmsize]
    mulps     m1, m1, [src1q + lenq + (a+1)*mmsize]
    mova      [dstq + lenq + (a+0)*mmsize], m0
    mova      [dstq + lenq + (a+1)*mmsize], m1
%assign a a+2
%endrep

    sub       lenq, 64
    jge       .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL
%endif

;------------------------------------------------------------------------------
; void ff_vector_fmac_scalar(float *dst, const float *src, float mul, int len)
;------------------------------------------------------------------------------
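; Scalar C sketch of the operation (reference only, not part of the build):
;     for (i = 0; i < len; i++)
;         dst[i] += src[i] * mul;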
%macro VECTOR_FMAC_SCALAR 0
%if UNIX64
cglobal vector_fmac_scalar, 3,3,5, dst, src, len
%else
cglobal vector_fmac_scalar, 4,4,5, dst, src, mul, len
%endif
%if ARCH_X86_32
    VBROADCASTSS m0, mulm
%else
%if WIN64
    SWAP 0, 2
%endif
    shufps       xm0, xm0, 0
%if cpuflag(avx)
    vinsertf128   m0, m0, xm0, 1
%endif
%endif
    lea         lenq, [lend*4-64]
.loop:
%if cpuflag(fma3)
    mova         m1, [dstq+lenq]
    mova         m2, [dstq+lenq+1*mmsize]
    fmaddps      m1, m0, [srcq+lenq], m1
    fmaddps      m2, m0, [srcq+lenq+1*mmsize], m2
%else ; cpuflag
    mulps        m1, m0, [srcq+lenq]
    mulps        m2, m0, [srcq+lenq+1*mmsize]
%if mmsize < 32
    mulps        m3, m0, [srcq+lenq+2*mmsize]
    mulps        m4, m0, [srcq+lenq+3*mmsize]
%endif ; mmsize
    addps        m1, m1, [dstq+lenq]
    addps        m2, m2, [dstq+lenq+1*mmsize]
%if mmsize < 32
    addps        m3, m3, [dstq+lenq+2*mmsize]
    addps        m4, m4, [dstq+lenq+3*mmsize]
%endif ; mmsize
%endif ; cpuflag
    mova  [dstq+lenq], m1
    mova  [dstq+lenq+1*mmsize], m2
%if mmsize < 32
    mova  [dstq+lenq+2*mmsize], m3
    mova  [dstq+lenq+3*mmsize], m4
%endif ; mmsize
    sub         lenq, 64
    jge         .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMAC_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMAC_SCALAR
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
VECTOR_FMAC_SCALAR
%endif

;------------------------------------------------------------------------------
; void ff_vector_fmul_scalar(float *dst, const float *src, float mul, int len)
;------------------------------------------------------------------------------
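; Scalar C sketch (reference only, not part of the build):
;     for (i = 0; i < len; i++)
;         dst[i] = src[i] * mul;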
%macro VECTOR_FMUL_SCALAR 0
%if UNIX64
cglobal vector_fmul_scalar, 3,3,2, dst, src, len
%else
cglobal vector_fmul_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
    movss     m0, mulm
%elif WIN64
    SWAP 0, 2
%endif
    shufps    m0, m0, 0
    lea     lenq, [lend*4-mmsize]
.loop:
    mova      m1, [srcq+lenq]
    mulps     m1, m0
    mova  [dstq+lenq], m1
    sub     lenq, mmsize
    jge     .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_SCALAR

;------------------------------------------------------------------------------
; void ff_vector_dmul_scalar(double *dst, const double *src, double mul,
;                            int len)
;------------------------------------------------------------------------------
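; Scalar C sketch (reference only, not part of the build); the same operation
; as vector_fmul_scalar, but on doubles:
;     for (i = 0; i < len; i++)
;         dst[i] = src[i] * mul;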
%macro VECTOR_DMUL_SCALAR 0
%if ARCH_X86_32
cglobal vector_dmul_scalar, 3,4,3, dst, src, mul, len, lenaddr
    mov         lenq, lenaddrm
%elif UNIX64
cglobal vector_dmul_scalar, 3,3,3, dst, src, len
%else
cglobal vector_dmul_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
    VBROADCASTSD  m0, mulm
%else
%if WIN64
    SWAP 0, 2
%endif
    movlhps      xm0, xm0
%if cpuflag(avx)
    vinsertf128  ym0, ym0, xm0, 1
%endif
%endif
    lea         lenq, [lend*8-2*mmsize]
.loop:
    mulpd         m1, m0, [srcq+lenq       ]
    mulpd         m2, m0, [srcq+lenq+mmsize]
    mova  [dstq+lenq       ], m1
    mova  [dstq+lenq+mmsize], m2
    sub         lenq, 2*mmsize
    jge         .loop
    REP_RET
%endmacro

INIT_XMM sse2
VECTOR_DMUL_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_DMUL_SCALAR
%endif

;-----------------------------------------------------------------------------
; vector_fmul_window(float *dst, const float *src0,
;                    const float *src1, const float *win, int len);
;-----------------------------------------------------------------------------
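; Roughly, a scalar C sketch of the windowed overlap step this vectorizes
; (reference only, not part of the build):
;     dst += len; win += len; src0 += len;
;     for (i = -len, j = len - 1; i < 0; i++, j--) {
;         float s0 = src0[i], s1 = src1[j];
;         float wi = win[i],  wj = win[j];
;         dst[i] = s0 * wj - s1 * wi;
;         dst[j] = s0 * wi + s1 * wj;
;     }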
%macro VECTOR_FMUL_WINDOW 0
cglobal vector_fmul_window, 5, 6, 6, dst, src0, src1, win, len, len1
    shl      lend, 2
    lea     len1q, [lenq - mmsize]
    add     src0q, lenq
    add      dstq, lenq
    add      winq, lenq
    neg      lenq
.loop:
    mova       m0, [winq  + lenq]
    mova       m4, [src0q + lenq]
%if cpuflag(sse)
    mova       m1, [winq  + len1q]
    mova       m5, [src1q + len1q]
    shufps     m1, m1, 0x1b
    shufps     m5, m5, 0x1b
    mova       m2, m0
    mova       m3, m1
    mulps      m2, m4
    mulps      m3, m5
    mulps      m1, m4
    mulps      m0, m5
    addps      m2, m3
    subps      m1, m0
    shufps     m2, m2, 0x1b
%else
    pswapd     m1, [winq  + len1q]
    pswapd     m5, [src1q + len1q]
    mova       m2, m0
    mova       m3, m1
    pfmul      m2, m4
    pfmul      m3, m5
    pfmul      m1, m4
    pfmul      m0, m5
    pfadd      m2, m3
    pfsub      m1, m0
    pswapd     m2, m2
%endif
    mova  [dstq + lenq ], m1
    mova  [dstq + len1q], m2
    sub     len1q, mmsize
    add      lenq, mmsize
    jl       .loop
    REP_RET
%endmacro

INIT_MMX 3dnowext
VECTOR_FMUL_WINDOW
INIT_XMM sse
VECTOR_FMUL_WINDOW

;-----------------------------------------------------------------------------
; vector_fmul_add(float *dst, const float *src0, const float *src1,
;                 const float *src2, int len)
;-----------------------------------------------------------------------------
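; Scalar C sketch (reference only, not part of the build):
;     for (i = 0; i < len; i++)
;         dst[i] = src0[i] * src1[i] + src2[i];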
%macro VECTOR_FMUL_ADD 0
cglobal vector_fmul_add, 5,5,4, dst, src0, src1, src2, len
    lea       lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
    mova       m0, [src0q + lenq]
    mova       m1, [src0q + lenq + mmsize]
%if cpuflag(fma3)
    mova       m2, [src2q + lenq]
    mova       m3, [src2q + lenq + mmsize]
    fmaddps    m0, m0, [src1q + lenq], m2
    fmaddps    m1, m1, [src1q + lenq + mmsize], m3
%else
    mulps      m0, m0, [src1q + lenq]
    mulps      m1, m1, [src1q + lenq + mmsize]
    addps      m0, m0, [src2q + lenq]
    addps      m1, m1, [src2q + lenq + mmsize]
%endif
    mova  [dstq + lenq], m0
    mova  [dstq + lenq + mmsize], m1

    sub      lenq, 2*mmsize
    jge      .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_ADD
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_ADD
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
VECTOR_FMUL_ADD
%endif

;-----------------------------------------------------------------------------
; void vector_fmul_reverse(float *dst, const float *src0, const float *src1,
;                          int len)
;-----------------------------------------------------------------------------
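; Scalar C sketch (reference only, not part of the build); src1 is read back
; to front:
;     for (i = 0; i < len; i++)
;         dst[i] = src0[i] * src1[len - 1 - i];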
%macro VECTOR_FMUL_REVERSE 0
cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len
    lea       lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
%if cpuflag(avx)
    vmovaps     xmm0, [src1q + 16]
    vinsertf128 m0, m0, [src1q], 1
    vshufps     m0, m0, m0, q0123
    vmovaps     xmm1, [src1q + mmsize + 16]
    vinsertf128 m1, m1, [src1q + mmsize], 1
    vshufps     m1, m1, m1, q0123
%else
    mova        m0, [src1q]
    mova        m1, [src1q + mmsize]
    shufps      m0, m0, q0123
    shufps      m1, m1, q0123
%endif
    mulps       m0, m0, [src0q + lenq + mmsize]
    mulps       m1, m1, [src0q + lenq]
    mova  [dstq + lenq + mmsize], m0
    mova  [dstq + lenq], m1
    add      src1q, 2*mmsize
    sub       lenq, 2*mmsize
    jge      .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_REVERSE
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_REVERSE
%endif

; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
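; Scalar C sketch (reference only, not part of the build). The cglobal below
; names the length argument "offset" because the register is reused as the
; running (negative) byte offset:
;     float p = 0.0f;
;     for (i = 0; i < len; i++)
;         p += v1[i] * v2[i];
;     return p;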
INIT_XMM sse
cglobal scalarproduct_float, 3,3,2, v1, v2, offset
    neg   offsetq
    shl   offsetq, 2
    sub       v1q, offsetq
    sub       v2q, offsetq
    xorps    xmm0, xmm0
.loop:
    movaps   xmm1, [v1q+offsetq]
    mulps    xmm1, [v2q+offsetq]
    addps    xmm0, xmm1
    add   offsetq, 16
    js       .loop
    movhlps  xmm1, xmm0
    addps    xmm0, xmm1
    movss    xmm1, xmm0
    shufps   xmm0, xmm0, 1
    addss    xmm0, xmm1
%if ARCH_X86_64 == 0
    movss     r0m, xmm0
    fld dword r0m
%endif
    RET

;-----------------------------------------------------------------------------
; void ff_butterflies_float(float *src0, float *src1, int len);
;-----------------------------------------------------------------------------
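; Scalar C sketch (reference only, not part of the build): an in-place
; add/subtract butterfly across the two buffers.
;     for (i = 0; i < len; i++) {
;         float t  = src0[i] - src1[i];
;         src0[i] += src1[i];
;         src1[i]  = t;
;     }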
INIT_XMM sse
cglobal butterflies_float, 3,3,3, src0, src1, len
%if ARCH_X86_64
    movsxd    lenq, lend
%endif
    test      lenq, lenq
    jz        .end
    shl       lenq, 2
    add      src0q, lenq
    add      src1q, lenq
    neg       lenq
.loop:
    mova        m0, [src0q + lenq]
    mova        m1, [src1q + lenq]
    subps       m2, m0, m1
    addps       m0, m0, m1
    mova  [src1q + lenq], m2
    mova  [src0q + lenq], m0
    add       lenq, mmsize
    jl        .loop
.end:
    REP_RET