;******************************************************************************
;* AAC Spectral Band Replication decoding functions
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
; mask equivalent for multiply by -1.0 1.0
ps_mask         times 2 dd 1<<31, 0
ps_mask2        times 2 dd 0, 1<<31
ps_neg          times 4 dd 1<<31
ps_noise0       times 2 dd  1.0,  0.0,
ps_noise2       times 2 dd -1.0,  0.0
ps_noise13      dd  0.0,  1.0, 0.0, -1.0
                dd  0.0, -1.0, 0.0,  1.0
                dd  0.0,  1.0, 0.0, -1.0
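; The ps_noise* vectors hold the per-element phi_sign patterns that the
; sbr_hf_apply_noise_* entry points below load into m0.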

cextern         sbr_noise_table

SECTION_TEXT

INIT_XMM sse
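; sbr_sum_square: returns the sum of x[i][0]^2 + x[i][1]^2 over the r1 complex
; (re,im) pairs pointed to by r0; the result is left in xmm0 (st0 on x86-32).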
cglobal sbr_sum_square, 2, 3, 6
    mov         r2, r1
    xorps       m0, m0
    xorps       m1, m1
    sar         r2, 3
    jz          .prepare
.loop:
    movu        m2, [r0 +  0]
    movu        m3, [r0 + 16]
    movu        m4, [r0 + 32]
    movu        m5, [r0 + 48]
    mulps       m2, m2
    mulps       m3, m3
    mulps       m4, m4
    mulps       m5, m5
    addps       m0, m2
    addps       m1, m3
    addps       m0, m4
    addps       m1, m5
    add         r0, 64
    dec         r2
    jnz         .loop
.prepare:
    and         r1, 7
    sar         r1, 1
    jz          .end
; len is a multiple of 2, thus there are at least 4 elements to process
.endloop:
    movu        m2, [r0]
    add         r0, 16
    mulps       m2, m2
    dec         r1
    addps       m0, m2
    jnz         .endloop
.end:
    addps       m0, m1
    movhlps     m2, m0
    addps       m0, m2
    movss       m1, m0
    shufps      m0, m0, 1
    addss       m0, m1
%if ARCH_X86_64 == 0
    movss       r0m, m0
    fld         dword r0m
%endif
    RET
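
; sbr_hf_g_filt: for each of the r3 elements, the complex sample X_high[m][ixh]
; (r1, one row of STEP bytes per m) is scaled by the scalar gain g_filt[m] (r2)
; and stored to r0. STEP is the byte size of one X_high row (40 re/im pairs).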
%define STEP  40*4*2
cglobal sbr_hf_g_filt, 5, 6, 5
    lea         r1, [r1 + 8*r4] ; offset by ixh elements into X_high
    mov         r5, r3
    and         r3, 0xFC
    lea         r2, [r2 + r3*4]
    lea         r0, [r0 + r3*8]
    neg         r3
    jz          .loop1
.loop4:
    movlps      m0, [r2 + 4*r3 + 0]
    movlps      m1, [r2 + 4*r3 + 8]
    movlps      m2, [r1 + 0*STEP]
    movlps      m3, [r1 + 2*STEP]
    movhps      m2, [r1 + 1*STEP]
    movhps      m3, [r1 + 3*STEP]
    unpcklps    m0, m0
    unpcklps    m1, m1
    mulps       m0, m2
    mulps       m1, m3
    movu        [r0 + 8*r3 +  0], m0
    movu        [r0 + 8*r3 + 16], m1
    add         r1, 4*STEP
    add         r3, 4
    jnz         .loop4
    and         r5, 3 ; number of single element loops
    jz          .end
.loop1: ; element 0 and 1 can be computed at the same time
    movss       m0, [r2]
    movlps      m2, [r1]
    unpcklps    m0, m0
    mulps       m2, m0
    movlps      [r0], m2
    add         r0, 8
    add         r2, 4
    add         r1, STEP
    dec         r5
    jnz         .loop1
.end:
    RET

; static void sbr_hf_gen_c(float (*X_high)[2], const float (*X_low)[2],
;                          const float alpha0[2], const float alpha1[2],
;                          float bw, int start, int end)
;
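; For each i in [start, end), using complex multiply-accumulate:
; X_high[i] = X_low[i] + bw*alpha0 * X_low[i-1] + bw*bw*alpha1 * X_low[i-2];
; the loop below produces two output samples per iteration.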
cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E
    ; load alpha factors
%define bw m0
%if ARCH_X86_64 == 0 || WIN64
    movss       bw, BWm
%endif
    movlps      m2, [alpha1q]
    movlps      m1, [alpha0q]
    shufps      bw, bw, 0
    mulps       m2, bw                 ; (a1[0] a1[1])*bw
    mulps       m1, bw                 ; (a0[0] a0[1])*bw    = (a2 a3)
    mulps       m2, bw                 ; (a1[0] a1[1])*bw*bw = (a0 a1)
    mova        m3, m1
    mova        m4, m2

    ; Set pointers
%if ARCH_X86_64 == 0 || WIN64
    ; start and end 6th and 7th args on stack
    mov         r2d, Sm
    mov         r3d, Em
%define start r2q
%define end   r3q
%else
; BW does not actually occupy a register, so shift by 1
%define start BWq
%define end   Sq
%endif
    sub         start, end             ; neg num of loops
    lea         X_highq, [X_highq + end*2*4]
    lea         X_lowq,  [X_lowq  + end*2*4 - 2*2*4]
    shl         start, 3               ; offset from num loops

    mova        m0, [X_lowq + start]
    shufps      m3, m3, q1111
    shufps      m4, m4, q1111
    xorps       m3, [ps_mask]
    shufps      m1, m1, q0000
    shufps      m2, m2, q0000
    xorps       m4, [ps_mask]
.loop2:
    movu        m7, [X_lowq + start + 8]    ; BbCc
    mova        m6, m0
    mova        m5, m7
    shufps      m0, m0, q2301              ; aAbB
    shufps      m7, m7, q2301              ; bBcC
    mulps       m0, m4
    mulps       m7, m3
    mulps       m6, m2
    mulps       m5, m1
    addps       m7, m0
    mova        m0, [X_lowq + start + 16]  ; CcDd
    addps       m7, m0
    addps       m6, m5
    addps       m7, m6
    mova        [X_highq + start], m7
    add         start, 16
    jnz         .loop2
    RET
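
; sbr_sum64x5: z[i] += z[i+64] + z[i+128] + z[i+192] + z[i+256] for i = 0..63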
cglobal sbr_sum64x5, 1,2,4,z
    lea         r1q, [zq+ 256]
.loop:
    mova        m0, [zq+   0]
    mova        m2, [zq+  16]
    mova        m1, [zq+ 256]
    mova        m3, [zq+ 272]
    addps       m0, [zq+ 512]
    addps       m2, [zq+ 528]
    addps       m1, [zq+ 768]
    addps       m3, [zq+ 784]
    addps       m0, [zq+1024]
    addps       m2, [zq+1040]
    addps       m0, m1
    addps       m2, m3
    mova        [zq   ], m0
    mova        [zq+16], m2
    add         zq, 32
    cmp         zq, r1q
    jne         .loop
    REP_RET

INIT_XMM sse
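; sbr_qmf_post_shuffle: W[k] = { -z[63-k], z[k] } for k = 0..31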
cglobal sbr_qmf_post_shuffle, 2,3,4,W,z
    lea         r2q, [zq + (64-4)*4]
    mova        m3, [ps_neg]
.loop:
    mova        m1, [zq]
    xorps       m0, m3, [r2q]
    shufps      m0, m0, m0, q0123
    unpcklps    m2, m0, m1
    unpckhps    m0, m0, m1
    mova        [Wq +  0], m2
    mova        [Wq + 16], m0
    add         Wq, 32
    sub         r2q, 16
    add         zq, 16
    cmp         zq, r2q
    jl          .loop
    REP_RET

INIT_XMM sse
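; sbr_neg_odd_64: flip the sign of the odd-indexed elements z[1], z[3], ..., z[63]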
cglobal sbr_neg_odd_64, 1,2,4,z
    lea         r1q, [zq+256]
.loop:
    mova        m0, [zq+ 0]
    mova        m1, [zq+16]
    mova        m2, [zq+32]
    mova        m3, [zq+48]
    xorps       m0, [ps_mask2]
    xorps       m1, [ps_mask2]
    xorps       m2, [ps_mask2]
    xorps       m3, [ps_mask2]
    mova        [zq+ 0], m0
    mova        [zq+16], m1
    mova        [zq+32], m2
    mova        [zq+48], m3
    add         zq, 64
    cmp         zq, r1q
    jne         .loop
    REP_RET

; sbr_qmf_deint_bfly(float *v, const float *src0, const float *src1)
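; For i = 0..63: v[i] = src0[i] - src1[63-i] and v[127-i] = src0[i] + src1[63-i]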
%macro SBR_QMF_DEINT_BFLY 0
cglobal sbr_qmf_deint_bfly, 3,5,8, v,src0,src1,vrev,c
    mov         cq, 64*4-2*mmsize
    lea         vrevq, [vq + 64*4]
.loop:
    mova        m0, [src0q+cq]
    mova        m1, [src1q]
    mova        m4, [src0q+cq+mmsize]
    mova        m5, [src1q+mmsize]
%if cpuflag(sse2)
    pshufd      m2, m0, q0123
    pshufd      m3, m1, q0123
    pshufd      m6, m4, q0123
    pshufd      m7, m5, q0123
%else
    shufps      m2, m0, m0, q0123
    shufps      m3, m1, m1, q0123
    shufps      m6, m4, m4, q0123
    shufps      m7, m5, m5, q0123
%endif
    addps       m5, m2
    subps       m0, m7
    addps       m1, m6
    subps       m4, m3
    mova        [vrevq], m1
    mova        [vrevq+mmsize], m5
    mova        [vq+cq], m0
    mova        [vq+cq+mmsize], m4
    add         src1q, 2*mmsize
    add         vrevq, 2*mmsize
    sub         cq, 2*mmsize
    jge         .loop
    REP_RET
%endmacro

INIT_XMM sse
SBR_QMF_DEINT_BFLY

INIT_XMM sse2
SBR_QMF_DEINT_BFLY

INIT_XMM sse2
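; sbr_qmf_pre_shuffle: z[64+2*k] = -z[64-k], z[64+2*k+1] = z[k+1] for k = 1..31;
; finally z[64] = z[0] and z[65] = z[1]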
cglobal sbr_qmf_pre_shuffle, 1,4,6,z
%define OFFSET  (32*4-2*mmsize)
    mov         r3q, OFFSET
    lea         r1q, [zq + (32+1)*4]
    lea         r2q, [zq + 64*4]
    mova        m5, [ps_neg]
.loop:
    movu        m0, [r1q]
    movu        m2, [r1q + mmsize]
    movu        m1, [zq + r3q + 4 + mmsize]
    movu        m3, [zq + r3q + 4]
    pxor        m2, m5
    pxor        m0, m5
    pshufd      m2, m2, q0123
    pshufd      m0, m0, q0123
    SBUTTERFLY  dq, 2, 3, 4
    SBUTTERFLY  dq, 0, 1, 4
    mova        [r2q + 2*r3q + 0*mmsize], m2
    mova        [r2q + 2*r3q + 1*mmsize], m3
    mova        [r2q + 2*r3q + 2*mmsize], m0
    mova        [r2q + 2*r3q + 3*mmsize], m1
    add         r1q, 2*mmsize
    sub         r3q, 2*mmsize
    jge         .loop
    movq        m2, [zq]
    movq        [r2q], m2
    REP_RET
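
; When built as PIC, the address of sbr_noise_table cannot be encoded directly
; in a memory operand, so an extra register is reserved to hold it.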
%ifdef PIC
%define NREGS 1
%if UNIX64
%define NOISE_TABLE r6q ; r5q is m_max
%else
%define NOISE_TABLE r5q
%endif
%else
%define NREGS 0
%define NOISE_TABLE sbr_noise_table
%endif

%macro LOAD_NST 1
%ifdef PIC
    lea         NOISE_TABLE, [%1]
    mova        m0, [kxq + NOISE_TABLE]
%else
    mova        m0, [kxq + %1]
%endif
%endmacro

INIT_XMM sse2
; sbr_hf_apply_noise_0(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
cglobal sbr_hf_apply_noise_0, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    mova        m0, [ps_noise0]
    jmp         apply_noise_main

; sbr_hf_apply_noise_1(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
cglobal sbr_hf_apply_noise_1, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    and         kxq, 1
    shl         kxq, 4
    LOAD_NST    ps_noise13
    jmp         apply_noise_main

; sbr_hf_apply_noise_2(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
cglobal sbr_hf_apply_noise_2, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    mova        m0, [ps_noise2]
    jmp         apply_noise_main

; sbr_hf_apply_noise_3(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
cglobal sbr_hf_apply_noise_3, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    and         kxq, 1
    shl         kxq, 4
    LOAD_NST    ps_noise13+16
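; Shared loop: for each m, Y[m] += s_m[m] * phi_sign when s_m[m] != 0,
; otherwise Y[m] += q_filt[m] * sbr_noise_table[noise]; the noise index
; advances by one per element, wrapping modulo 512.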
apply_noise_main:
%if ARCH_X86_64 == 0 || WIN64
    mov         kxd, m_maxm
%define count kxq
%else
%define count m_maxq
%endif
    dec         noiseq
    shl         count, 2
%ifdef PIC
    lea         NOISE_TABLE, [sbr_noise_table]
%endif
    lea         Yq, [Yq + 2*count]
    add         s_mq, count
    add         q_filtq, count
    shl         noiseq, 3
    pxor        m5, m5
    neg         count
.loop:
    mova        m1, [q_filtq + count]
    movu        m3, [noiseq + NOISE_TABLE + 1*mmsize]
    movu        m4, [noiseq + NOISE_TABLE + 2*mmsize]
    add         noiseq, 2*mmsize
    and         noiseq, 0x1ff<<3
    punpckhdq   m2, m1, m1
    punpckldq   m1, m1
    mulps       m1, m3 ; m1 = q_filt[m] * ff_sbr_noise_table[noise]
    mulps       m2, m4 ; m2 = q_filt[m] * ff_sbr_noise_table[noise]
    mova        m3, [s_mq + count]
    ; TODO: replace by a vpermd in AVX2
    punpckhdq   m4, m3, m3
    punpckldq   m3, m3
    pcmpeqd     m6, m3, m5 ; m6 == 0
    pcmpeqd     m7, m4, m5 ; m7 == 0
    mulps       m3, m0 ; s_m[m] * phi_sign
    mulps       m4, m0 ; s_m[m] * phi_sign
    pand        m1, m6
    pand        m2, m7
    movu        m6, [Yq + 2*count]
    movu        m7, [Yq + 2*count + mmsize]
    addps       m3, m1
    addps       m4, m2
    addps       m6, m3
    addps       m7, m4
    movu        [Yq + 2*count], m6
    movu        [Yq + 2*count + mmsize], m7
    add         count, mmsize
    jl          .loop
    RET