You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

306 lines
8.6KB

  1. ;******************************************************************************
  2. ;* AAC Spectral Band Replication decoding functions
  3. ;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
  4. ;*
  5. ;* This file is part of Libav.
  6. ;*
  7. ;* Libav is free software; you can redistribute it and/or
  8. ;* modify it under the terms of the GNU Lesser General Public
  9. ;* License as published by the Free Software Foundation; either
  10. ;* version 2.1 of the License, or (at your option) any later version.
  11. ;*
  12. ;* Libav is distributed in the hope that it will be useful,
  13. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. ;* Lesser General Public License for more details.
  16. ;*
  17. ;* You should have received a copy of the GNU Lesser General Public
  18. ;* License along with Libav; if not, write to the Free Software
  19. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. ;******************************************************************************
  21. %include "libavutil/x86/x86util.asm"
  22. SECTION_RODATA
  23. ; mask equivalent for multiply by -1.0 1.0
  24. ps_mask times 2 dd 1<<31, 0
  25. ps_mask2 times 2 dd 0, 1<<31
  26. ps_neg times 4 dd 1<<31
  27. SECTION .text
  28. INIT_XMM sse
  29. cglobal sbr_sum_square, 2, 3, 6
  30. mov r2, r1
  31. xorps m0, m0
  32. xorps m1, m1
  33. sar r2, 3
  34. jz .prepare
  35. .loop:
  36. movu m2, [r0 + 0]
  37. movu m3, [r0 + 16]
  38. movu m4, [r0 + 32]
  39. movu m5, [r0 + 48]
  40. mulps m2, m2
  41. mulps m3, m3
  42. mulps m4, m4
  43. mulps m5, m5
  44. addps m0, m2
  45. addps m1, m3
  46. addps m0, m4
  47. addps m1, m5
  48. add r0, 64
  49. dec r2
  50. jnz .loop
  51. .prepare:
  52. and r1, 7
  53. sar r1, 1
  54. jz .end
  55. ; len is a multiple of 2, thus there are at least 4 elements to process
  56. .endloop:
  57. movu m2, [r0]
  58. add r0, 16
  59. mulps m2, m2
  60. dec r1
  61. addps m0, m2
  62. jnz .endloop
  63. .end:
  64. addps m0, m1
  65. movhlps m2, m0
  66. addps m0, m2
  67. movss m1, m0
  68. shufps m0, m0, 1
  69. addss m0, m1
  70. %if ARCH_X86_64 == 0
  71. movss r0m, m0
  72. fld dword r0m
  73. %endif
  74. RET
  75. %define STEP 40*4*2
  76. cglobal sbr_hf_g_filt, 5, 6, 5
  77. lea r1, [r1 + 8*r4] ; offset by ixh elements into X_high
  78. mov r5, r3
  79. and r3, 0xFC
  80. lea r2, [r2 + r3*4]
  81. lea r0, [r0 + r3*8]
  82. neg r3
  83. jz .loop1
  84. .loop4:
  85. movlps m0, [r2 + 4*r3 + 0]
  86. movlps m1, [r2 + 4*r3 + 8]
  87. movlps m2, [r1 + 0*STEP]
  88. movlps m3, [r1 + 2*STEP]
  89. movhps m2, [r1 + 1*STEP]
  90. movhps m3, [r1 + 3*STEP]
  91. unpcklps m0, m0
  92. unpcklps m1, m1
  93. mulps m0, m2
  94. mulps m1, m3
  95. movu [r0 + 8*r3 + 0], m0
  96. movu [r0 + 8*r3 + 16], m1
  97. add r1, 4*STEP
  98. add r3, 4
  99. jnz .loop4
  100. and r5, 3 ; number of single element loops
  101. jz .end
  102. .loop1: ; element 0 and 1 can be computed at the same time
  103. movss m0, [r2]
  104. movlps m2, [r1]
  105. unpcklps m0, m0
  106. mulps m2, m0
  107. movlps [r0], m2
  108. add r0, 8
  109. add r2, 4
  110. add r1, STEP
  111. dec r5
  112. jnz .loop1
  113. .end:
  114. RET
  115. ; void ff_sbr_hf_gen_sse(float (*X_high)[2], const float (*X_low)[2],
  116. ; const float alpha0[2], const float alpha1[2],
  117. ; float bw, int start, int end)
  118. ;
  119. cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E
  120. ; load alpha factors
  121. %define bw m0
  122. %if ARCH_X86_64 == 0 || WIN64
  123. movss bw, BWm
  124. %endif
  125. movlps m2, [alpha1q]
  126. movlps m1, [alpha0q]
  127. shufps bw, bw, 0
  128. mulps m2, bw ; (a1[0] a1[1])*bw
  129. mulps m1, bw ; (a0[0] a0[1])*bw = (a2 a3)
  130. mulps m2, bw ; (a1[0] a1[1])*bw*bw = (a0 a1)
  131. mova m3, m1
  132. mova m4, m2
  133. mova m7, [ps_mask]
  134. ; Set pointers
  135. %if ARCH_X86_64 == 0 || WIN64
  136. ; start and end 6th and 7th args on stack
  137. mov r2d, Sm
  138. mov r3d, Em
  139. %define start r2q
  140. %define end r3q
  141. %else
  142. ; BW does not actually occupy a register, so shift by 1
  143. %define start BWq
  144. %define end Sq
  145. %endif
  146. sub start, end ; neg num of loops
  147. lea X_highq, [X_highq + end*2*4]
  148. lea X_lowq, [X_lowq + end*2*4 - 2*2*4]
  149. shl start, 3 ; offset from num loops
  150. mova m0, [X_lowq + start]
  151. movlhps m1, m1 ; (a2 a3 a2 a3)
  152. movlhps m2, m2 ; (a0 a1 a0 a1)
  153. shufps m3, m3, q0101 ; (a3 a2 a3 a2)
  154. shufps m4, m4, q0101 ; (a1 a0 a1 a0)
  155. xorps m3, m7 ; (-a3 a2 -a3 a2)
  156. xorps m4, m7 ; (-a1 a0 -a1 a0)
  157. .loop2:
  158. mova m5, m0
  159. mova m6, m0
  160. shufps m0, m0, q2200 ; {Xl[-2][0],",Xl[-1][0],"}
  161. shufps m5, m5, q3311 ; {Xl[-2][1],",Xl[-1][1],"}
  162. mulps m0, m2
  163. mulps m5, m4
  164. mova m7, m6
  165. addps m5, m0
  166. mova m0, [X_lowq + start + 2*2*4]
  167. shufps m6, m0, q0022 ; {Xl[-1][0],",Xl[0][0],"}
  168. shufps m7, m0, q1133 ; {Xl[-1][1],",Xl[1][1],"}
  169. mulps m6, m1
  170. mulps m7, m3
  171. addps m5, m6
  172. addps m7, m0
  173. addps m5, m7
  174. mova [X_highq + start], m5
  175. add start, 16
  176. jnz .loop2
  177. RET
  178. cglobal sbr_sum64x5, 1,2,4,z
  179. lea r1q, [zq+ 256]
  180. .loop:
  181. mova m0, [zq+ 0]
  182. mova m2, [zq+ 16]
  183. mova m1, [zq+ 256]
  184. mova m3, [zq+ 272]
  185. addps m0, [zq+ 512]
  186. addps m2, [zq+ 528]
  187. addps m1, [zq+ 768]
  188. addps m3, [zq+ 784]
  189. addps m0, [zq+1024]
  190. addps m2, [zq+1040]
  191. addps m0, m1
  192. addps m2, m3
  193. mova [zq], m0
  194. mova [zq+16], m2
  195. add zq, 32
  196. cmp zq, r1q
  197. jne .loop
  198. REP_RET
  199. INIT_XMM sse
  200. cglobal sbr_qmf_post_shuffle, 2,3,4,W,z
  201. lea r2q, [zq + (64-4)*4]
  202. mova m3, [ps_neg]
  203. .loop:
  204. mova m1, [zq]
  205. xorps m0, m3, [r2q]
  206. shufps m0, m0, m0, q0123
  207. unpcklps m2, m0, m1
  208. unpckhps m0, m0, m1
  209. mova [Wq + 0], m2
  210. mova [Wq + 16], m0
  211. add Wq, 32
  212. sub r2q, 16
  213. add zq, 16
  214. cmp zq, r2q
  215. jl .loop
  216. REP_RET
  217. INIT_XMM sse
  218. cglobal sbr_neg_odd_64, 1,2,4,z
  219. lea r1q, [zq+256]
  220. .loop:
  221. mova m0, [zq+ 0]
  222. mova m1, [zq+16]
  223. mova m2, [zq+32]
  224. mova m3, [zq+48]
  225. xorps m0, [ps_mask2]
  226. xorps m1, [ps_mask2]
  227. xorps m2, [ps_mask2]
  228. xorps m3, [ps_mask2]
  229. mova [zq+ 0], m0
  230. mova [zq+16], m1
  231. mova [zq+32], m2
  232. mova [zq+48], m3
  233. add zq, 64
  234. cmp zq, r1q
  235. jne .loop
  236. REP_RET
  237. INIT_XMM sse2
  238. ; void ff_sbr_qmf_deint_bfly_sse2(float *v, const float *src0, const float *src1)
  239. cglobal sbr_qmf_deint_bfly, 3,5,8, v,src0,src1,vrev,c
  240. mov cq, 64*4-2*mmsize
  241. lea vrevq, [vq + 64*4]
  242. .loop:
  243. mova m0, [src0q+cq]
  244. mova m1, [src1q]
  245. mova m2, [src0q+cq+mmsize]
  246. mova m3, [src1q+mmsize]
  247. pshufd m4, m0, q0123
  248. pshufd m5, m1, q0123
  249. pshufd m6, m2, q0123
  250. pshufd m7, m3, q0123
  251. addps m3, m4
  252. subps m0, m7
  253. addps m1, m6
  254. subps m2, m5
  255. mova [vrevq], m1
  256. mova [vrevq+mmsize], m3
  257. mova [vq+cq], m0
  258. mova [vq+cq+mmsize], m2
  259. add src1q, 2*mmsize
  260. add vrevq, 2*mmsize
  261. sub cq, 2*mmsize
  262. jge .loop
  263. REP_RET
  264. INIT_XMM sse2
  265. cglobal sbr_qmf_pre_shuffle, 1,4,6,z
  266. %define OFFSET (32*4-2*mmsize)
  267. mov r3q, OFFSET
  268. lea r1q, [zq + (32+1)*4]
  269. lea r2q, [zq + 64*4]
  270. mova m5, [ps_neg]
  271. .loop:
  272. movu m0, [r1q]
  273. movu m2, [r1q + mmsize]
  274. movu m1, [zq + r3q + 4 + mmsize]
  275. movu m3, [zq + r3q + 4]
  276. pxor m2, m5
  277. pxor m0, m5
  278. pshufd m2, m2, q0123
  279. pshufd m0, m0, q0123
  280. SBUTTERFLY dq, 2, 3, 4
  281. SBUTTERFLY dq, 0, 1, 4
  282. mova [r2q + 2*r3q + 0*mmsize], m2
  283. mova [r2q + 2*r3q + 1*mmsize], m3
  284. mova [r2q + 2*r3q + 2*mmsize], m0
  285. mova [r2q + 2*r3q + 3*mmsize], m1
  286. add r1q, 2*mmsize
  287. sub r3q, 2*mmsize
  288. jge .loop
  289. movq m2, [zq]
  290. movq [r2q], m2
  291. REP_RET