;******************************************************************************
;* SSE-optimized functions for the DCA decoder
;* Copyright (C) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"

SECTION_RODATA
pf_inv16:  times 4 dd 0x3D800000 ; 1/16

SECTION_TEXT
; void decode_hf(float dst[DCA_SUBBANDS][8], const int32_t vq_num[DCA_SUBBANDS],
;                const int8_t hf_vq[1024][32], intptr_t vq_offset,
;                int32_t scale[DCA_SUBBANDS][2], intptr_t start, intptr_t end)
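;
; Rough scalar equivalent, as a sketch (index arithmetic reconstructed from
; the code below, not quoted from the C reference):
;~ src = (const int8_t *)hf_vq + vq_offset;
;~ for (l = start; l < end; l++)
;~     for (i = 0; i < 8; i++)
;~         dst[l][i] = src[32 * vq_num[l] + i] * (scale[l][0] / 16.0f);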
%macro DECODE_HF 0
cglobal decode_hf, 6,6,5, dst, num, src, offset, scale, start, end
    lea       srcq, [srcq + offsetq]
    shl     startq, 2
    mov    offsetd, endm
%define DICT offsetq
    shl    offsetq, 2
    mov       endm, offsetq
.loop:
%if ARCH_X86_64
    mov    offsetd, [scaleq + 2 * startq]
    cvtsi2ss    m0, offsetd
%else
    cvtsi2ss    m0, [scaleq + 2 * startq]
%endif
    mov    offsetd, [numq + startq]
    mulss       m0, [pf_inv16]
    shl       DICT, 5                  ; vq_num[l] * 32: byte offset of the codebook row
    shufps      m0, m0, 0              ; broadcast scale[l][0] / 16 to all lanes
%if cpuflag(sse2)
%if cpuflag(sse4)
    pmovsxbd    m1, [srcq + DICT + 0]  ; sign-extend 4 int8 to int32
    pmovsxbd    m2, [srcq + DICT + 4]
%else
    movq        m1, [srcq + DICT]
    punpcklbw   m1, m1
    mova        m2, m1
    punpcklwd   m1, m1
    punpckhwd   m2, m2
    psrad       m1, 24                 ; sign-extend via arithmetic shift
    psrad       m2, 24
%endif
    cvtdq2ps    m1, m1
    cvtdq2ps    m2, m2
%else
    movd       mm0, [srcq + DICT + 0]
    movd       mm1, [srcq + DICT + 4]
    punpcklbw  mm0, mm0
    punpcklbw  mm1, mm1
    movq       mm2, mm0
    movq       mm3, mm1
    punpcklwd  mm0, mm0
    punpcklwd  mm1, mm1
    punpckhwd  mm2, mm2
    punpckhwd  mm3, mm3
    psrad      mm0, 24
    psrad      mm1, 24
    psrad      mm2, 24
    psrad      mm3, 24
    cvtpi2ps    m1, mm0
    cvtpi2ps    m2, mm1
    cvtpi2ps    m3, mm2
    cvtpi2ps    m4, mm3
    shufps      m0, m0, 0
    shufps      m1, m3, q1010
    shufps      m2, m4, q1010
%endif
    mulps       m1, m0
    mulps       m2, m0
    mova [dstq + 8 * startq +  0], m1
    mova [dstq + 8 * startq + 16], m2
    add     startq, 4
    cmp     startq, endm
    jl .loop
.end:
%if notcpuflag(sse2)
    emms                               ; the plain-SSE version used MMX registers
%endif
    REP_RET
%endmacro

; the plain-SSE version is only useful on x86_32; x86_64 always has SSE2
%if ARCH_X86_32
INIT_XMM sse
DECODE_HF
%endif

INIT_XMM sse2
DECODE_HF

INIT_XMM sse4
DECODE_HF
; %1=v0/v1  %2=in1  %3=in2
%macro FIR_LOOP 2-3
.loop%1:
%define va          m1
%define vb          m2
%if %1
%define OFFSET      0
%else
%define OFFSET      NUM_COEF*count
%endif
; for v0, incrementing and for v1, decrementing
    mova        va, [cf0q + OFFSET]
    mova        vb, [cf0q + OFFSET + 4*NUM_COEF]
%if %0 == 3
    mova        m4, [cf0q + OFFSET + mmsize]
    mova        m0, [cf0q + OFFSET + 4*NUM_COEF + mmsize]
%endif
    mulps       va, %2
    mulps       vb, %2
%if %0 == 3
    mulps       m4, %3
    mulps       m0, %3
    addps       va, m4
    addps       vb, m0
%endif
    ; va = va1 va2 va3 va4
    ; vb = vb1 vb2 vb3 vb4
%if %1
    SWAP        va, vb
%endif
    mova        m4, va
    unpcklps    va, vb ; va1 vb1 va2 vb2
    unpckhps    m4, vb ; va3 vb3 va4 vb4
    addps       m4, va ; va1+3 vb1+3 va2+4 vb2+4
    movhlps     vb, m4 ; va2+4 vb2+4
    addps       vb, m4 ; va1..4 vb1..4 in the low two lanes
    movh [outq + count], vb
%if %1
    sub       cf0q, 8*NUM_COEF
%endif
    add      count, 8
    jl .loop%1
%endmacro
; void dca_lfe_fir(float *out, float *in, float *coefs)
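;
; Interpolates 2*NUM_OUT samples from one decimated LFE sample: the first
; NUM_OUT outputs (v0) walk the coefficient table forwards, the last NUM_OUT
; (v1) walk it backwards. Sketch of the scalar form (reconstructed from the
; code below, not quoted from the C reference):
;~ for (k = 0; k < NUM_OUT; k++) {
;~     float v0 = 0, v1 = 0;
;~     for (j = 0; j < NUM_COEF; j++) {
;~         v0 += in[-j]               * coefs[NUM_COEF * k + j];
;~         v1 += in[j + 1 - NUM_COEF] * coefs[NUM_COEF * k + j];
;~     }
;~     out[k]                   = v0;
;~     out[2 * NUM_OUT - 1 - k] = v1;
;~ }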
%macro DCA_LFE_FIR 1
cglobal dca_lfe_fir%1, 3,3,6-%1, out, in, cf0
%define IN1       m3
%define IN2       m5
%define count     inq          ; inq is free once the inputs are loaded
%define NUM_COEF  4*(2-%1)
%define NUM_OUT   32*(%1+1)

    movu     IN1, [inq + 4 - 1*mmsize]
    shufps   IN1, IN1, q0123
%if %1 == 0
    movu     IN2, [inq + 4 - 2*mmsize]
    shufps   IN2, IN2, q0123
%endif

    mov    count, -4*NUM_OUT
    add     cf0q, 4*NUM_COEF*NUM_OUT
    add     outq, 4*NUM_OUT

    ; compute v0 first
%if %1 == 0
    FIR_LOOP   0, IN1, IN2
%else
    FIR_LOOP   0, IN1
%endif
    shufps   IN1, IN1, q0123
    mov    count, -4*NUM_OUT
    ; cf1 already correctly positioned
    add     outq, 4*NUM_OUT          ; outq now at out2
    sub     cf0q, 8*NUM_COEF
%if %1 == 0
    shufps   IN2, IN2, q0123
    FIR_LOOP   1, IN2, IN1
%else
    FIR_LOOP   1, IN1
%endif
    RET
%endmacro

INIT_XMM sse
DCA_LFE_FIR 0
DCA_LFE_FIR 1
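
; SETZERO %1: clear register %1, using an integer pxor where SSE2 is
; available outside of the AVX/FMA3 float variants, and xorps otherwise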
%macro SETZERO 1
%if cpuflag(sse2) && notcpuflag(avx)
    pxor          %1, %1
%else
    xorps         %1, %1, %1
%endif
%endmacro
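
; SHUF %1, %2, %3: load one register of floats, element order reversed, from
; the vector ending at byte %2 + 15; the AVX form reads its 32 bytes starting
; at [%2 - 16] and uses %3 as scratch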
%macro SHUF 3
%if cpuflag(avx)
    mova          %3, [%2 - 16]
    vperm2f128    %1, %3, %3, 1
    vshufps       %1, %1, %1, q0123
%elif cpuflag(sse2)
    pshufd        %1, [%2], q0123
%else
    mova          %1, [%2]
    shufps        %1, %1, q0123
%endif
%endmacro
%macro INNER_LOOP 1
    ; reading backwards:  ptr1 = synth_buf + j + i; ptr2 = synth_buf + j - i
    ;~ a += window[i + j]      * (-synth_buf[15 - i + j])
    ;~ b += window[i + j + 16] * (synth_buf[i + j])
    SHUF          m5, ptr2 + j + (15 - 3) * 4, m6
    mova          m6, [ptr1 + j]
%if ARCH_X86_64
    SHUF         m11, ptr2 + j + (15 - 3) * 4 - mmsize, m12
    mova         m12, [ptr1 + j + mmsize]
%endif
%if cpuflag(fma3)
    fmaddps       m2, m6,  [win + %1 + j + 16 * 4], m2
    fnmaddps      m1, m5,  [win + %1 + j], m1
%if ARCH_X86_64
    fmaddps       m8, m12, [win + %1 + j + mmsize + 16 * 4], m8
    fnmaddps      m7, m11, [win + %1 + j + mmsize], m7
%endif
%else ; non-FMA
    mulps         m6, m6,  [win + %1 + j + 16 * 4]
    mulps         m5, m5,  [win + %1 + j]
%if ARCH_X86_64
    mulps        m12, m12, [win + %1 + j + mmsize + 16 * 4]
    mulps        m11, m11, [win + %1 + j + mmsize]
%endif
    addps         m2, m2, m6
    subps         m1, m1, m5
%if ARCH_X86_64
    addps         m8, m8, m12
    subps         m7, m7, m11
%endif
%endif ; cpuflag(fma3)
    ;~ c += window[i + j + 32] * (synth_buf[16 + i + j])
    ;~ d += window[i + j + 48] * (synth_buf[31 - i + j])
    SHUF          m6, ptr2 + j + (31 - 3) * 4, m5
    mova          m5, [ptr1 + j + 16 * 4]
%if ARCH_X86_64
    SHUF         m12, ptr2 + j + (31 - 3) * 4 - mmsize, m11
    mova         m11, [ptr1 + j + mmsize + 16 * 4]
%endif
%if cpuflag(fma3)
    fmaddps       m3, m5,  [win + %1 + j + 32 * 4], m3
    fmaddps       m4, m6,  [win + %1 + j + 48 * 4], m4
%if ARCH_X86_64
    fmaddps       m9, m11, [win + %1 + j + mmsize + 32 * 4], m9
    fmaddps      m10, m12, [win + %1 + j + mmsize + 48 * 4], m10
%endif
%else ; non-FMA
    mulps         m5, m5,  [win + %1 + j + 32 * 4]
    mulps         m6, m6,  [win + %1 + j + 48 * 4]
%if ARCH_X86_64
    mulps        m11, m11, [win + %1 + j + mmsize + 32 * 4]
    mulps        m12, m12, [win + %1 + j + mmsize + 48 * 4]
%endif
    addps         m3, m3, m5
    addps         m4, m4, m6
%if ARCH_X86_64
    addps         m9, m9, m11
    addps        m10, m10, m12
%endif
%endif ; cpuflag(fma3)
    sub            j, 64 * 4
%endmacro
; void ff_synth_filter_inner_<opt>(float *synth_buf, float synth_buf2[32],
;                                  const float window[512], float out[32],
;                                  intptr_t offset, float scale)
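;
; Rough scalar equivalent, as a sketch (consistent with the ;~ annotations
; below; the second inner loop handles ring-buffer wrap-around; not quoted
; from the C reference):
;~ for (i = 0; i < 16; i++) {
;~     float a = synth_buf2[i], b = synth_buf2[i + 16], c = 0, d = 0;
;~     for (j = 0; j < 512 - offset; j += 64) {
;~         a += window[i + j]      * -synth_buf[15 - i + j];
;~         b += window[i + j + 16] *  synth_buf[i + j];
;~         c += window[i + j + 32] *  synth_buf[16 + i + j];
;~         d += window[i + j + 48] *  synth_buf[31 - i + j];
;~     }
;~     for (; j < 512; j += 64) { /* same terms, with synth_buf[... - 512] */ }
;~     out[i]             = a * scale;
;~     out[i + 16]        = b * scale;
;~     synth_buf2[i]      = c;
;~     synth_buf2[i + 16] = d;
;~ }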
%macro SYNTH_FILTER 0
cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
                            synth_buf, synth_buf2, window, out, off, scale
%define scale m0
%if ARCH_X86_32 || WIN64
%if cpuflag(sse2) && notcpuflag(avx)
    movd       scale, scalem
    SPLATD        m0
%else
    VBROADCASTSS  m0, scalem
%endif
; Make sure offset is in a register and not on the stack
%define OFFQ  r4q
%else
    SPLATD      xmm0
%if cpuflag(avx)
    vinsertf128   m0, m0, xmm0, 1
%endif
%define OFFQ  offq
%endif
    ; prepare inner counter limit 1
    mov          r5q, 480
    sub          r5q, offmp
    and          r5q, -64
    shl          r5q, 2
%if ARCH_X86_32 || notcpuflag(avx)
    mov         OFFQ, r5q
%define i r5q
    mov            i, 16 * 4 - (ARCH_X86_64 + 1) * mmsize ; main loop counter
%else
%define i 0
%define OFFQ r5q
%endif

%define buf2 synth_buf2q
%if ARCH_X86_32
    mov         buf2, synth_buf2mp
%endif
.mainloop:
    ; m1 = a  m2 = b  m3 = c  m4 = d
    SETZERO m3
    SETZERO m4
    mova          m1, [buf2 + i]
    mova          m2, [buf2 + i + 16 * 4]
%if ARCH_X86_32
%define ptr1 r0q
%define ptr2 r1q
%define win  r2q
%define j    r3q
    mov          win, windowm
    mov         ptr1, synth_bufm
%if ARCH_X86_32 || notcpuflag(avx)
    add          win, i
    add         ptr1, i
%endif
%else ; ARCH_X86_64
%define ptr1 r6q
%define ptr2 r7q ; must be loaded
%define win  r8q
%define j    r9q
    SETZERO m9
    SETZERO m10
    mova          m7, [buf2 + i + mmsize]
    mova          m8, [buf2 + i + mmsize + 16 * 4]
    lea          win, [windowq + i]
    lea         ptr1, [synth_bufq + i]
%endif
    mov         ptr2, synth_bufmp
    ; prepare the inner loop counter
    mov            j, OFFQ
%if ARCH_X86_32 || notcpuflag(avx)
    sub         ptr2, i
%endif
.loop1:
    INNER_LOOP 0
    jge .loop1

    mov            j, 448 * 4
    sub            j, OFFQ
    jz .end
    sub         ptr1, j
    sub         ptr2, j
    add          win, OFFQ ; now at j-64, so define OFFSET
    sub            j, 64 * 4
.loop2:
    INNER_LOOP 64 * 4
    jge .loop2
.end:
%if ARCH_X86_32
    mov         buf2, synth_buf2m ; needed for next iteration anyway
    mov         outq, outmp       ; j, which will be set again during it
%endif
    ;~ out[i]      = a * scale;
    ;~ out[i + 16] = b * scale;
    mulps         m1, m1, scale
    mulps         m2, m2, scale
%if ARCH_X86_64
    mulps         m7, m7, scale
    mulps         m8, m8, scale
%endif
    ;~ synth_buf2[i]      = c;
    ;~ synth_buf2[i + 16] = d;
    mova   [buf2 + i +  0 * 4], m3
    mova   [buf2 + i + 16 * 4], m4
%if ARCH_X86_64
    mova   [buf2 + i +  0 * 4 + mmsize], m9
    mova   [buf2 + i + 16 * 4 + mmsize], m10
%endif
    ;~ out[i]      = a;
    ;~ out[i + 16] = b;
    mova   [outq + i +  0 * 4], m1
    mova   [outq + i + 16 * 4], m2
%if ARCH_X86_64
    mova   [outq + i +  0 * 4 + mmsize], m7
    mova   [outq + i + 16 * 4 + mmsize], m8
%endif
%if ARCH_X86_32 || notcpuflag(avx)
    sub            i, (ARCH_X86_64 + 1) * mmsize
    jge .mainloop
%endif
    RET
%endmacro

%if ARCH_X86_32
INIT_XMM sse
SYNTH_FILTER
%endif

INIT_XMM sse2
SYNTH_FILTER

INIT_YMM avx
SYNTH_FILTER

INIT_YMM fma3
SYNTH_FILTER