;******************************************************************************
;* SSE-optimized functions for the DCA decoder
;* Copyright (C) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
pf_inv16:  times 4 dd 0x3D800000 ; 1/16
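; (0x3D800000 is the IEEE-754 single-precision bit pattern of 1.0/16)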

SECTION_TEXT

; void decode_hf(float dst[DCA_SUBBANDS][8], const int32_t vq_num[DCA_SUBBANDS],
;                const int8_t hf_vq[1024][32], intptr_t vq_offset,
;                int32_t scale[DCA_SUBBANDS][2], intptr_t start, intptr_t end)
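;
; Scalar reference, as a sketch of what each iteration below computes (the
; loop variables l and i are illustrative, not names from this file):
;   for (l = start; l < end; l++) {
;       const int8_t *ptr = &hf_vq[vq_num[l]][vq_offset];
;       float fscale = scale[l][0] * (1.0f / 16);
;       for (i = 0; i < 8; i++)
;           dst[l][i] = ptr[i] * fscale;
;   }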
%macro DECODE_HF 0
cglobal decode_hf, 6,6,5, dst, num, src, offset, scale, start, end
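    ; fold vq_offset into the table pointer, scale the start/end subband
    ; indices to byte offsets, and recycle offsetq (aliased DICT below) as
    ; the codebook-row scratch register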
    lea       srcq, [srcq + offsetq]
    shl     startq, 2
    mov    offsetd, endm
%define DICT offsetq
    shl    offsetq, 2
    mov       endm, offsetq
.loop:
%if ARCH_X86_64
    mov    offsetd, [scaleq + 2 * startq]
    cvtsi2ss    m0, offsetd
%else
    cvtsi2ss    m0, [scaleq + 2 * startq]
%endif
    mov    offsetd, [numq + startq]
    mulss       m0, [pf_inv16]
    shl       DICT, 5
    shufps      m0, m0, 0
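    ; sign-extend eight int8 codebook entries to two int32 vectors, then
    ; convert them to floats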
%if cpuflag(sse2)
%if cpuflag(sse4)
    pmovsxbd    m1, [srcq + DICT + 0]
    pmovsxbd    m2, [srcq + DICT + 4]
%else
    movq        m1, [srcq + DICT]
    punpcklbw   m1, m1
    mova        m2, m1
    punpcklwd   m1, m1
    punpckhwd   m2, m2
    psrad       m1, 24
    psrad       m2, 24
%endif
    cvtdq2ps    m1, m1
    cvtdq2ps    m2, m2
%else
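    ; plain SSE has no XMM integer ops, so widen in MMX registers and use
    ; cvtpi2ps to reach the float domain (hence the EMMS at .end)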
    movd       mm0, [srcq + DICT + 0]
    movd       mm1, [srcq + DICT + 4]
    punpcklbw  mm0, mm0
    punpcklbw  mm1, mm1
    movq       mm2, mm0
    movq       mm3, mm1
    punpcklwd  mm0, mm0
    punpcklwd  mm1, mm1
    punpckhwd  mm2, mm2
    punpckhwd  mm3, mm3
    psrad      mm0, 24
    psrad      mm1, 24
    psrad      mm2, 24
    psrad      mm3, 24
    cvtpi2ps    m1, mm0
    cvtpi2ps    m2, mm1
    cvtpi2ps    m3, mm2
    cvtpi2ps    m4, mm3
    shufps      m0, m0, 0
    shufps      m1, m3, q1010
    shufps      m2, m4, q1010
%endif
    mulps       m1, m0
    mulps       m2, m0
    mova  [dstq + 8 * startq +  0], m1
    mova  [dstq + 8 * startq + 16], m2
    add     startq, 4
    cmp     startq, endm
    jl .loop
.end:
%if notcpuflag(sse2)
    emms
%endif
    REP_RET
%endmacro

%if ARCH_X86_32
INIT_XMM sse
DECODE_HF
%endif

INIT_XMM sse2
DECODE_HF

INIT_XMM sse4
DECODE_HF

; %1 = v0/v1, %2 = in1, %3 = in2 (optional)
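; Each iteration accumulates the dot products of two consecutive outputs in
; va/vb, reduces them horizontally and stores the pair; %1 selects the v0
; (coefficients read forwards) or v1 (backwards) half of the filter.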
%macro FIR_LOOP 2-3
.loop%1:
%define va m1
%define vb m2
%if %1
%define OFFSET 0
%else
%define OFFSET NUM_COEF*count
%endif
    ; for v0, incrementing and for v1, decrementing
    mova        va, [cf0q + OFFSET]
    mova        vb, [cf0q + OFFSET + 4*NUM_COEF]
%if %0 == 3
    mova        m4, [cf0q + OFFSET + mmsize]
    mova        m0, [cf0q + OFFSET + 4*NUM_COEF + mmsize]
%endif
    mulps       va, %2
    mulps       vb, %2
%if %0 == 3
%if cpuflag(fma3)
    fmaddps     va, m4, %3, va
    fmaddps     vb, m0, %3, vb
%else
    mulps       m4, %3
    mulps       m0, %3
    addps       va, m4
    addps       vb, m0
%endif
%endif
    ; va = va1 va2 va3 va4
    ; vb = vb1 vb2 vb3 vb4
%if %1
    SWAP va, vb
%endif
    mova        m4, va
    unpcklps    va, vb ; va1 vb1 va2 vb2
    unpckhps    m4, vb ; va3 vb3 va4 vb4
    addps       m4, va ; va1+3 vb1+3 va2+4 vb2+4
    movhlps     vb, m4 ; va2+4 vb2+4
    addps       vb, m4 ; va1..4 vb1..4
    movlps [outq + count], vb
%if %1
    sub       cf0q, 8*NUM_COEF
%endif
    add      count, 8
    jl .loop%1
%endmacro

; void dca_lfe_fir(float *out, float *in, float *coefs)
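;
; Scalar reference, as a sketch (decifactor is 32 for dca_lfe_fir0 and 64 for
; dca_lfe_fir1; one decimated sample yields 2*decifactor interpolated ones):
;   float *out2 = out + decifactor;
;   const float *cf0 = coefs, *cf1 = coefs + 256;
;   for (k = 0; k < decifactor; k++) {
;       float v0 = 0, v1 = 0;
;       for (j = 0; j < 256 / decifactor; j++) {
;           v0 += in[-j] * *cf0++;  // coefficients read forwards
;           v1 += in[-j] * *--cf1;  // coefficients read backwards
;       }
;       out[k]  = v0;
;       out2[k] = v1;
;   }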
%macro DCA_LFE_FIR 1
cglobal dca_lfe_fir%1, 3,3,6-%1, out, in, cf0
%define IN1 m3
%define IN2 m5
%define count inq
%define NUM_COEF 4*(2-%1)
%define NUM_OUT 32*(%1+1)

    movu       IN1, [inq + 4 - 1*mmsize]
    shufps     IN1, IN1, q0123
%if %1 == 0
    movu       IN2, [inq + 4 - 2*mmsize]
    shufps     IN2, IN2, q0123
%endif

    mov      count, -4*NUM_OUT
    add       cf0q, 4*NUM_COEF*NUM_OUT
    add       outq, 4*NUM_OUT
    ; compute v0 first
%if %1 == 0
    FIR_LOOP    0, IN1, IN2
%else
    FIR_LOOP    0, IN1
%endif
    shufps     IN1, IN1, q0123
    mov      count, -4*NUM_OUT
    ; cf1 already correctly positioned
    add       outq, 4*NUM_OUT ; outq now at out2
    sub       cf0q, 8*NUM_COEF
%if %1 == 0
    shufps     IN2, IN2, q0123
    FIR_LOOP    1, IN2, IN1
%else
    FIR_LOOP    1, IN1
%endif
    RET
%endmacro

INIT_XMM sse
DCA_LFE_FIR 0
DCA_LFE_FIR 1

%if HAVE_FMA3_EXTERNAL
INIT_XMM fma3
DCA_LFE_FIR 0
%endif
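
; SETZERO: zero a register; PXOR on XMM registers requires SSE2 (so plain SSE
; falls back to XORPS), and under AVX the VEX three-operand XORPS form is used
; (x86inc folds it back to two operands on older targets)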
%macro SETZERO 1
%if cpuflag(sse2) && notcpuflag(avx)
    pxor        %1, %1
%else
    xorps       %1, %1, %1
%endif
%endmacro
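
; SHUF: load mmsize/4 floats and reverse their element order; under AVX the
; block is read from %2 - 16 and the two 128-bit lanes are swapped first, so
; the same call site covers eight elements. %3 is a scratch register.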
%macro SHUF 3
%if cpuflag(avx)
    mova        %3, [%2 - 16]
    vperm2f128  %1, %3, %3, 1
    vshufps     %1, %1, %1, q0123
%elif cpuflag(sse2)
    pshufd      %1, [%2], q0123
%else
    mova        %1, [%2]
    shufps      %1, %1, q0123
%endif
%endmacro

%macro INNER_LOOP 1
    ; reading backwards: ptr1 = synth_buf + j + i; ptr2 = synth_buf + j - i
    ;~ a += window[i + j]      * (-synth_buf[15 - i + j])
    ;~ b += window[i + j + 16] * ( synth_buf[i + j])
    SHUF        m5, ptr2 + j + (15 - 3) * 4, m6
    mova        m6, [ptr1 + j]
%if ARCH_X86_64
    SHUF       m11, ptr2 + j + (15 - 3) * 4 - mmsize, m12
    mova       m12, [ptr1 + j + mmsize]
%endif
%if cpuflag(fma3)
    fmaddps     m2, m6,  [win + %1 + j + 16 * 4], m2
    fnmaddps    m1, m5,  [win + %1 + j], m1
%if ARCH_X86_64
    fmaddps     m8, m12, [win + %1 + j + mmsize + 16 * 4], m8
    fnmaddps    m7, m11, [win + %1 + j + mmsize], m7
%endif
%else ; non-FMA
    mulps       m6, m6,  [win + %1 + j + 16 * 4]
    mulps       m5, m5,  [win + %1 + j]
%if ARCH_X86_64
    mulps      m12, m12, [win + %1 + j + mmsize + 16 * 4]
    mulps      m11, m11, [win + %1 + j + mmsize]
%endif
    addps       m2, m2, m6
    subps       m1, m1, m5
%if ARCH_X86_64
    addps       m8, m8, m12
    subps       m7, m7, m11
%endif
%endif ; cpuflag(fma3)
    ;~ c += window[i + j + 32] * (synth_buf[16 + i + j])
    ;~ d += window[i + j + 48] * (synth_buf[31 - i + j])
    SHUF        m6, ptr2 + j + (31 - 3) * 4, m5
    mova        m5, [ptr1 + j + 16 * 4]
%if ARCH_X86_64
    SHUF       m12, ptr2 + j + (31 - 3) * 4 - mmsize, m11
    mova       m11, [ptr1 + j + mmsize + 16 * 4]
%endif
%if cpuflag(fma3)
    fmaddps     m3, m5,  [win + %1 + j + 32 * 4], m3
    fmaddps     m4, m6,  [win + %1 + j + 48 * 4], m4
%if ARCH_X86_64
    fmaddps     m9, m11, [win + %1 + j + mmsize + 32 * 4], m9
    fmaddps    m10, m12, [win + %1 + j + mmsize + 48 * 4], m10
%endif
%else ; non-FMA
    mulps       m5, m5,  [win + %1 + j + 32 * 4]
    mulps       m6, m6,  [win + %1 + j + 48 * 4]
%if ARCH_X86_64
    mulps      m11, m11, [win + %1 + j + mmsize + 32 * 4]
    mulps      m12, m12, [win + %1 + j + mmsize + 48 * 4]
%endif
    addps       m3, m3, m5
    addps       m4, m4, m6
%if ARCH_X86_64
    addps       m9, m9, m11
    addps      m10, m10, m12
%endif
%endif ; cpuflag(fma3)
    sub          j, 64 * 4
%endmacro

; void ff_synth_filter_inner_<opt>(float *synth_buf, float synth_buf2[32],
;                                  const float window[512], float out[32],
;                                  intptr_t offset, float scale)
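;
; Scalar reference, as a sketch (the asm vectorizes the i loop, handling
; mmsize/4 values of i per pass and, on x86-64, two vectors at once; synth_buf
; already points offset floats into the ring buffer):
;   for (i = 0; i < 16; i++) {
;       float a = synth_buf2[i], b = synth_buf2[i + 16], c = 0, d = 0;
;       for (j = 0; j < 512 - offset; j += 64) {   // .loop1
;           a += window[i + j]      * -synth_buf[15 - i + j];
;           b += window[i + j + 16] *  synth_buf[     i + j];
;           c += window[i + j + 32] *  synth_buf[16 + i + j];
;           d += window[i + j + 48] *  synth_buf[31 - i + j];
;       }
;       for (; j < 512; j += 64) {                 // .loop2: the buffer wraps,
;           // same four accumulations with synth_buf indices offset by -512
;       }
;       out[i]             = a * scale;
;       out[i + 16]        = b * scale;
;       synth_buf2[i]      = c;
;       synth_buf2[i + 16] = d;
;   }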
%macro SYNTH_FILTER 0
cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
                            synth_buf, synth_buf2, window, out, off, scale
%define scale m0
%if ARCH_X86_32 || WIN64
%if cpuflag(sse2) && notcpuflag(avx)
    movd     scale, scalem
    SPLATD      m0
%else
    VBROADCASTSS m0, scalem
%endif
    ; Make sure offset is in a register and not on the stack
%define OFFQ r4q
%else
    SPLATD    xmm0
%if cpuflag(avx)
    vinsertf128 m0, m0, xmm0, 1
%endif
%define OFFQ offq
%endif
    ; prepare the counter limit for the first inner loop (.loop1):
    ; ((480 - offset) & -64) * 4
    mov        r5q, 480
    sub        r5q, offmp
    and        r5q, -64
    shl        r5q, 2
%if ARCH_X86_32 || notcpuflag(avx)
    mov       OFFQ, r5q
%define i r5q
    mov          i, 16 * 4 - (ARCH_X86_64 + 1) * mmsize ; main loop counter
%else
%define i 0
%define OFFQ r5q
%endif
%define buf2 synth_buf2q
%if ARCH_X86_32
    mov       buf2, synth_buf2mp
%endif
.mainloop:
    ; m1 = a  m2 = b  m3 = c  m4 = d
    SETZERO     m3
    SETZERO     m4
    mova        m1, [buf2 + i]
    mova        m2, [buf2 + i + 16 * 4]
%if ARCH_X86_32
%define ptr1 r0q
%define ptr2 r1q
%define win  r2q
%define j    r3q
    mov        win, windowm
    mov       ptr1, synth_bufm
%if ARCH_X86_32 || notcpuflag(avx)
    add        win, i
    add       ptr1, i
%endif
%else ; ARCH_X86_64
%define ptr1 r6q
%define ptr2 r7q ; must be loaded
%define win  r8q
%define j    r9q
    SETZERO     m9
    SETZERO    m10
    mova        m7, [buf2 + i + mmsize]
    mova        m8, [buf2 + i + mmsize + 16 * 4]
    lea        win, [windowq + i]
    lea       ptr1, [synth_bufq + i]
%endif
    mov       ptr2, synth_bufmp
    ; prepare the inner loop counter
    mov          j, OFFQ
%if ARCH_X86_32 || notcpuflag(avx)
    sub       ptr2, i
%endif
.loop1:
    INNER_LOOP  0
    jge .loop1

    mov          j, 448 * 4
    sub          j, OFFQ
    jz .end
    sub       ptr1, j
    sub       ptr2, j
    add        win, OFFQ ; now at j - 64, so define OFFSET
    sub          j, 64 * 4
.loop2:
    INNER_LOOP  64 * 4
    jge .loop2
.end:
%if ARCH_X86_32
    mov       buf2, synth_buf2m ; restore buf2 (clobbered by ptr2); needed
                                ; for the next iteration anyway
    mov       outq, outmp       ; restore outq, which doubles as j and will
                                ; be clobbered again next iteration
%endif
    ;~ out[i]      = a * scale;
    ;~ out[i + 16] = b * scale;
    mulps       m1, m1, scale
    mulps       m2, m2, scale
%if ARCH_X86_64
    mulps       m7, m7, scale
    mulps       m8, m8, scale
%endif
    ;~ synth_buf2[i]      = c;
    ;~ synth_buf2[i + 16] = d;
    mova  [buf2 + i +  0 * 4], m3
    mova  [buf2 + i + 16 * 4], m4
%if ARCH_X86_64
    mova  [buf2 + i +  0 * 4 + mmsize], m9
    mova  [buf2 + i + 16 * 4 + mmsize], m10
%endif
    ;~ out[i]      = a;
    ;~ out[i + 16] = b;
    mova  [outq + i +  0 * 4], m1
    mova  [outq + i + 16 * 4], m2
%if ARCH_X86_64
    mova  [outq + i +  0 * 4 + mmsize], m7
    mova  [outq + i + 16 * 4 + mmsize], m8
%endif
%if ARCH_X86_32 || notcpuflag(avx)
    sub          i, (ARCH_X86_64 + 1) * mmsize
    jge .mainloop
%endif
    RET
%endmacro

%if ARCH_X86_32
INIT_XMM sse
SYNTH_FILTER
%endif

INIT_XMM sse2
SYNTH_FILTER

INIT_YMM avx
SYNTH_FILTER

INIT_YMM fma3
SYNTH_FILTER