;******************************************************************************
;* SSE-optimized functions for the DCA decoder
;* Copyright (C) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
pf_inv16:  times 4 dd 0x3D800000 ; 1/16

SECTION_TEXT

; void int8x8_fmul_int32_sse2(float *dst, const int8_t *src, int scale)
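; Scalar equivalent (a sketch of what the SIMD below computes):
;   float s = scale / 16.0f;
;   for (int i = 0; i < 8; i++)
;       dst[i] = src[i] * s;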
%macro INT8X8_FMUL_INT32 0
cglobal int8x8_fmul_int32, 3, 3, 5, dst, src, scale
    cvtsi2ss    m0, scalem
    mulss       m0, [pf_inv16]
    shufps      m0, m0, 0
%if cpuflag(sse2)
%if cpuflag(sse4)
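    ; SSE4.1: sign-extend four int8 values to int32 in a single instruction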
    pmovsxbd    m1, [srcq+0]
    pmovsxbd    m2, [srcq+4]
%else
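    ; pre-SSE4.1: duplicate each byte up to dword width, then sign-extend it
    ; with an arithmetic right shift by 24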
    movq        m1, [srcq]
    punpcklbw   m1, m1
    mova        m2, m1
    punpcklwd   m1, m1
    punpckhwd   m2, m2
    psrad       m1, 24
    psrad       m2, 24
%endif
    cvtdq2ps    m1, m1
    cvtdq2ps    m2, m2
%else
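    ; SSE1 fallback (x86_32 only): sign-extend in MMX registers and convert
    ; pairs of int32 to floats with cvtpi2ps; the emms below clears the MMX
    ; state again before the remaining float code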
    movd       mm0, [srcq+0]
    movd       mm1, [srcq+4]
    punpcklbw  mm0, mm0
    punpcklbw  mm1, mm1
    movq       mm2, mm0
    movq       mm3, mm1
    punpcklwd  mm0, mm0
    punpcklwd  mm1, mm1
    punpckhwd  mm2, mm2
    punpckhwd  mm3, mm3
    psrad      mm0, 24
    psrad      mm1, 24
    psrad      mm2, 24
    psrad      mm3, 24
    cvtpi2ps    m1, mm0
    cvtpi2ps    m2, mm1
    cvtpi2ps    m3, mm2
    cvtpi2ps    m4, mm3
    shufps      m0, m0, 0
    emms
    shufps      m1, m3, q1010
    shufps      m2, m4, q1010
%endif
    mulps       m1, m0
    mulps       m2, m0
    mova [dstq+ 0], m1
    mova [dstq+16], m2
    REP_RET
%endmacro
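; SSE2 is part of the x86_64 baseline ISA, so the MMX-assisted SSE1 version
; is only built for x86_32.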
%if ARCH_X86_32
INIT_XMM sse
INT8X8_FMUL_INT32
%endif

INIT_XMM sse2
INT8X8_FMUL_INT32

INIT_XMM sse4
INT8X8_FMUL_INT32
; FIR_LOOP: %1 selects the pass (0 = v0, coefficients read forward;
; 1 = v1, coefficients read backward); %2 = first input vector,
; %3 = optional second input vector
%macro FIR_LOOP 2-3
.loop%1:
%define va          m1
%define vb          m2
%if %1
%define OFFSET      0
%else
%define OFFSET      NUM_COEF*count
%endif
    ; for v0, incrementing and for v1, decrementing
    mova        va, [cf0q + OFFSET]
    mova        vb, [cf0q + OFFSET + 4*NUM_COEF]
%if %0 == 3
    mova        m4, [cf0q + OFFSET + mmsize]
    mova        m0, [cf0q + OFFSET + 4*NUM_COEF + mmsize]
%endif
    mulps       va, %2
    mulps       vb, %2
%if %0 == 3
    mulps       m4, %3
    mulps       m0, %3
    addps       va, m4
    addps       vb, m0
%endif
    ; va = va1 va2 va3 va4
    ; vb = vb1 vb2 vb3 vb4
%if %1
    SWAP        va, vb
%endif
    mova        m4, va
    unpcklps    va, vb      ; va1 vb1 va2 vb2
    unpckhps    m4, vb      ; va3 vb3 va4 vb4
    addps       m4, va      ; va1+3 vb1+3 va2+4 vb2+4
    movhlps     vb, m4      ; va2+4 vb2+4
    addps       vb, m4      ; va1+2+3+4 vb1+2+3+4
    movh    [outq + count], vb
%if %1
    sub       cf0q, 8*NUM_COEF
%endif
    add      count, 8
    jl .loop%1
%endmacro

; void dca_lfe_fir(float *out, float *in, float *coefs)
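; Each variant runs FIR_LOOP twice: the first pass fills out[0..NUM_OUT-1]
; walking the coefficients forward, the second fills the next NUM_OUT outputs
; walking them backward, reusing the reversed input vector(s).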
%macro DCA_LFE_FIR 1
cglobal dca_lfe_fir%1, 3, 3, 6-%1, out, in, cf0
%define IN1       m3
%define IN2       m5
%define count     inq
%define NUM_COEF  4*(2-%1)
%define NUM_OUT   32*(%1+1)
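    ; load the newest input samples and reverse their order
    ; (q0123 reverses the four floats within the register)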
    movu     IN1, [inq + 4 - 1*mmsize]
    shufps   IN1, IN1, q0123
%if %1 == 0
    movu     IN2, [inq + 4 - 2*mmsize]
    shufps   IN2, IN2, q0123
%endif
    mov    count, -4*NUM_OUT
    add     cf0q, 4*NUM_COEF*NUM_OUT
    add     outq, 4*NUM_OUT
    ; compute v0 first
%if %1 == 0
    FIR_LOOP   0, IN1, IN2
%else
    FIR_LOOP   0, IN1
%endif
    shufps   IN1, IN1, q0123
    mov    count, -4*NUM_OUT
    ; cf1 already correctly positioned
    add     outq, 4*NUM_OUT          ; outq now at out2
    sub     cf0q, 8*NUM_COEF
%if %1 == 0
    shufps   IN2, IN2, q0123
    FIR_LOOP   1, IN2, IN1
%else
    FIR_LOOP   1, IN1
%endif
    RET
%endmacro
INIT_XMM sse
DCA_LFE_FIR 0
DCA_LFE_FIR 1

INIT_XMM sse2
%macro INNER_LOOP 1
    ; reading backwards: ptr1 = synth_buf + j + i; ptr2 = synth_buf + j - i
    ;~ a += window[i + j]      * (-synth_buf[15 - i + j])
    ;~ b += window[i + j + 16] * (synth_buf[i + j])
    pshufd      m5, [ptr2 + j + (15 - 3) * 4], q0123
    mova        m6, [ptr1 + j]
%if ARCH_X86_64
    pshufd     m11, [ptr2 + j + (15 - 3) * 4 - mmsize], q0123
    mova       m12, [ptr1 + j + mmsize]
%endif
    mulps       m6, [win + %1 + j + 16 * 4]
    mulps       m5, [win + %1 + j]
%if ARCH_X86_64
    mulps      m12, [win + %1 + j + mmsize + 16 * 4]
    mulps      m11, [win + %1 + j + mmsize]
%endif
    addps       m2, m6
    subps       m1, m5
%if ARCH_X86_64
    addps       m8, m12
    subps       m7, m11
%endif
    ;~ c += window[i + j + 32] * (synth_buf[16 + i + j])
    ;~ d += window[i + j + 48] * (synth_buf[31 - i + j])
    pshufd      m6, [ptr2 + j + (31 - 3) * 4], q0123
    mova        m5, [ptr1 + j + 16 * 4]
%if ARCH_X86_64
    pshufd     m12, [ptr2 + j + (31 - 3) * 4 - mmsize], q0123
    mova       m11, [ptr1 + j + mmsize + 16 * 4]
%endif
    mulps       m5, [win + %1 + j + 32 * 4]
    mulps       m6, [win + %1 + j + 48 * 4]
%if ARCH_X86_64
    mulps      m11, [win + %1 + j + mmsize + 32 * 4]
    mulps      m12, [win + %1 + j + mmsize + 48 * 4]
%endif
    addps       m3, m5
    addps       m4, m6
%if ARCH_X86_64
    addps       m9, m11
    addps      m10, m12
%endif
    sub          j, 64 * 4
%endmacro

; void ff_synth_filter_inner_sse2(float *synth_buf, float synth_buf2[32],
;                                 const float window[512], float out[32],
;                                 intptr_t offset, float scale)
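; Per the INNER_LOOP comments above, for each i the inner loop accumulates
; over j:
;   a -= window[i + j]      * synth_buf[15 - i + j]
;   b += window[i + j + 16] * synth_buf[i + j]
;   c += window[i + j + 32] * synth_buf[16 + i + j]
;   d += window[i + j + 48] * synth_buf[31 - i + j]
; then writes out[i] = a * scale, out[i + 16] = b * scale and saves c and d
; back into synth_buf2. Four (eight on x86_64) values of i are processed per
; main-loop iteration.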
cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
                            synth_buf, synth_buf2, window, out, off, scale
%define scale m0
%if ARCH_X86_32 || WIN64
    movd     scale, scalem
; Make sure offset is in a register and not on the stack
%define OFFQ  r4q
%else
%define OFFQ  offq
%endif
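    ; broadcast scale to all four lanes (on non-WIN64 x86_64 the value
    ; already arrives in xmm0)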
    pshufd      m0, m0, 0
    ; prepare the inner counter limit for the first loop:
    ; ((480 - offset) & -64) * 4, in bytes
    mov        r5q, 480
    sub        r5q, offmp
    and        r5q, -64
    shl        r5q, 2
    mov       OFFQ, r5q
%define i r5q
    mov          i, 16 * 4 - (ARCH_X86_64 + 1) * mmsize ; main loop counter
%define buf2 synth_buf2q
%if ARCH_X86_32
    mov       buf2, synth_buf2mp
%endif
.mainloop:
    ; m1 = a  m2 = b  m3 = c  m4 = d
    pxor        m3, m3
    pxor        m4, m4
    mova        m1, [buf2 + i]
    mova        m2, [buf2 + i + 16 * 4]
%if ARCH_X86_32
%define ptr1 r0q
%define ptr2 r1q
%define win  r2q
%define j    r3q
    mov        win, windowm
    mov       ptr1, synth_bufm
    add        win, i
    add       ptr1, i
%else ; ARCH_X86_64
%define ptr1 r6q
%define ptr2 r7q ; must be loaded
%define win  r8q
%define j    r9q
    pxor        m9, m9
    pxor       m10, m10
    mova        m7, [buf2 + i + mmsize]
    mova        m8, [buf2 + i + mmsize + 16 * 4]
    lea        win, [windowq + i]
    lea       ptr1, [synth_bufq + i]
%endif
    mov       ptr2, synth_bufmp
    ; prepare the inner loop counter
    mov          j, OFFQ
    sub       ptr2, i
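    ; ptr2 now points at synth_buf - i; its loads in INNER_LOOP are
    ; element-reversed with pshufd q0123 to implement the [15 - i + j]
    ; and [31 - i + j] indexing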
.loop1:
    INNER_LOOP  0
    jge .loop1

    mov          j, 448 * 4
    sub          j, OFFQ
    jz .end
    sub       ptr1, j
    sub       ptr2, j
    add        win, OFFQ ; now at j-64, so define OFFSET
    sub          j, 64 * 4
.loop2:
    INNER_LOOP  64 * 4
    jge .loop2

.end:
%if ARCH_X86_32
    mov       buf2, synth_buf2m ; needed for the next iteration anyway
    mov       outq, outmp       ; outq aliases j, which is reset each iteration
%endif
    ;~ out[i]      = a * scale;
    ;~ out[i + 16] = b * scale;
    mulps       m1, scale
    mulps       m2, scale
%if ARCH_X86_64
    mulps       m7, scale
    mulps       m8, scale
%endif
    ;~ synth_buf2[i]      = c;
    ;~ synth_buf2[i + 16] = d;
    mova  [buf2 + i +  0 * 4], m3
    mova  [buf2 + i + 16 * 4], m4
%if ARCH_X86_64
    mova  [buf2 + i +  0 * 4 + mmsize], m9
    mova  [buf2 + i + 16 * 4 + mmsize], m10
%endif
    ;~ out[i]      = a;
    ;~ out[i + 16] = b;
    mova  [outq + i +  0 * 4], m1
    mova  [outq + i + 16 * 4], m2
%if ARCH_X86_64
    mova  [outq + i +  0 * 4 + mmsize], m7
    mova  [outq + i + 16 * 4 + mmsize], m8
%endif
    sub          i, (ARCH_X86_64 + 1) * mmsize
    jge .mainloop
    RET