You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

181 lines
5.2KB

  1. ;******************************************************************************
  2. ;* AAC Spectral Band Replication decoding functions
  3. ;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
  4. ;*
  5. ;* This file is part of Libav.
  6. ;*
  7. ;* Libav is free software; you can redistribute it and/or
  8. ;* modify it under the terms of the GNU Lesser General Public
  9. ;* License as published by the Free Software Foundation; either
  10. ;* version 2.1 of the License, or (at your option) any later version.
  11. ;*
  12. ;* Libav is distributed in the hope that it will be useful,
  13. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. ;* Lesser General Public License for more details.
  16. ;*
  17. ;* You should have received a copy of the GNU Lesser General Public
  18. ;* License along with Libav; if not, write to the Free Software
  19. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. ;******************************************************************************
  21. %include "libavutil/x86/x86util.asm"
  22. SECTION_RODATA
  ; Sign-bit mask laid out as (1<<31, 0, 1<<31, 0). XORing a float vector with
  ; it flips the sign of elements 0 and 2, which is equivalent to multiplying
  ; the vector by (-1.0, 1.0, -1.0, 1.0).
  ps_mask:    dd 1<<31, 0, 1<<31, 0
  25. SECTION_TEXT
  26. INIT_XMM sse
  ;-----------------------------------------------------------------------------
  ; sbr_sum_square: returns the sum of the squares of a packed float array.
  ; Presumed C prototype (not visible in this file -- confirm against caller):
  ;     float sbr_sum_square(float (*x)[2], int n)
  ; In:    r0 = x, read with unaligned loads
  ;        r1 = n, count of float pairs (2*n floats are summed); only its low
  ;             32 bits are used
  ; Out:   sum in the low float of m0; on x86-32 it is additionally loaded
  ;        onto the x87 stack
  ; Note:  the tail loop consumes two pairs per pass, so an odd trailing pair
  ;        is never read; n is expected to be even and non-negative.
  ;-----------------------------------------------------------------------------
  cglobal sbr_sum_square, 2, 3, 6
      ; The counters are handled with 32-bit operations only: if n is a C int
      ; (presumed, see above) the upper half of r1 is undefined on x86-64 and
      ; must not leak into the loop counts.
      mov         r2d, r1d
      xorps       m0, m0              ; first partial-sum accumulator
      xorps       m1, m1              ; second partial-sum accumulator
      sar         r2d, 3              ; r2d = number of 8-pair (64-byte) chunks
      jz          .prepare
  .loop:
      ; 16 floats per pass, alternating between the two accumulators
      movu        m2, [r0 +  0]
      movu        m3, [r0 + 16]
      movu        m4, [r0 + 32]
      movu        m5, [r0 + 48]
      mulps       m2, m2
      mulps       m3, m3
      mulps       m4, m4
      mulps       m5, m5
      addps       m0, m2
      addps       m1, m3
      addps       m0, m4
      addps       m1, m5
      add         r0, 64
      dec         r2d
      jnz         .loop
  .prepare:
      and         r1d, 7              ; pairs left over after the 8-pair chunks
      sar         r1d, 1              ; r1d = number of 2-pair (16-byte) steps
      jz          .end
  ; len is a multiple of 2, thus there are at least 4 elements to process
  .endloop:
      movu        m2, [r0]
      add         r0, 16
      mulps       m2, m2
      dec         r1d
      addps       m0, m2              ; addps leaves the flags from dec intact
      jnz         .endloop
  .end:
      ; horizontal reduction of both accumulators into the low float of m0
      addps       m0, m1
      movhlps     m2, m0              ; low half of m2 = high half of m0
      addps       m0, m2
      movss       m1, m0
      shufps      m0, m0, 1           ; bring element 1 down to element 0
      addss       m0, m1
  %if ARCH_X86_64 == 0
      ; x86-32: bounce the result through the first argument's stack slot to
      ; get it onto the x87 stack
      movss       r0m, m0
      fld         dword r0m
  %endif
      RET
  ; Byte stride between successive X_high rows: 40 pairs * 2 floats * 4 bytes.
  %define STEP 40*4*2
  ;-----------------------------------------------------------------------------
  ; sbr_hf_g_filt: scales one float pair taken from each X_high row by the
  ; matching gain and stores it to Y:
  ;     Y[m] = X_high[m][ixh] * g_filt[m]      for m in [0, m_max)
  ; Presumed C prototype (not visible in this file -- confirm against caller):
  ;     void sbr_hf_g_filt(float (*Y)[2], const float (*X_high)[40][2],
  ;                        const float *g_filt, int m_max, intptr_t ixh)
  ; In:    r0 = Y, r1 = X_high, r2 = g_filt,
  ;        r3 = m_max (only the low 32 bits are used), r4 = ixh
  ; Notes: m_max <= 0 returns immediately without touching memory.
  ;-----------------------------------------------------------------------------
  cglobal sbr_hf_g_filt, 5, 6, 5
      test        r3d, r3d
      jle         .end                ; nothing to do for m_max <= 0
      lea         r1, [r1 + 8*r4]     ; offset by ixh elements into X_high
      ; Split m_max with 32-bit operations so that undefined upper register
      ; bits (m_max presumed to be an int) never reach either loop counter.
      mov         r5d, r3d
      and         r3d, -4             ; elements handled four at a time
      and         r5d, 3              ; number of single element loops
      lea         r2, [r2 + r3*4]     ; point past the 4-wide part of g_filt
      lea         r0, [r0 + r3*8]     ; point past the 4-wide part of Y
      neg         r3                  ; negative index counting up to zero
      jz          .tail               ; fewer than four elements in total
  .loop4:
      movlps      m0, [r2 + 4*r3 + 0] ; g[0] g[1]
      movlps      m1, [r2 + 4*r3 + 8] ; g[2] g[3]
      movlps      m2, [r1 + 0*STEP]   ; pair from row 0
      movlps      m3, [r1 + 2*STEP]   ; pair from row 2
      movhps      m2, [r1 + 1*STEP]   ; pair from row 1
      movhps      m3, [r1 + 3*STEP]   ; pair from row 3
      unpcklps    m0, m0              ; g[0] g[0] g[1] g[1]
      unpcklps    m1, m1              ; g[2] g[2] g[3] g[3]
      mulps       m0, m2
      mulps       m1, m3
      movu        [r0 + 8*r3 +  0], m0
      movu        [r0 + 8*r3 + 16], m1
      add         r1, 4*STEP
      add         r3, 4
      jnz         .loop4
  .tail:
      test        r5d, r5d
      jz          .end
  .loop1: ; element 0 and 1 can be computed at the same time
      movss       m0, [r2]
      movlps      m2, [r1]
      unpcklps    m0, m0              ; duplicate the gain for both floats
      mulps       m2, m0
      movlps      [r0], m2
      add         r0, 8
      add         r2, 4
      add         r1, STEP
      dec         r5d
      jnz         .loop1
  .end:
      RET
  ; static void sbr_hf_gen_c(float (*X_high)[2], const float (*X_low)[2],
  ;                          const float alpha0[2], const float alpha1[2],
  ;                          float bw, int start, int end)
  ;
  ; Per output pair i in [start, end), treating each float pair as a complex
  ; number, the loop below evaluates
  ;     X_high[i] = X_low[i] + X_low[i-1] * (alpha0*bw)
  ;                          + X_low[i-2] * (alpha1*bw*bw)
  ; Two pairs are produced per pass and the loop tests its condition at the
  ; bottom, so end - start must be positive and even. Aligned accesses (mova)
  ; are used on X_low[start-2 + 2k] and X_high[start + 2k], so those addresses
  ; must be 16-byte aligned.
  cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E
      ; load alpha factors
  %define bw m0
  %if ARCH_X86_64 == 0 || WIN64
      movss       bw, BWm
  %endif
      movlps      m2, [alpha1q]
      movlps      m1, [alpha0q]
      shufps      bw, bw, 0
      mulps       m2, bw              ; (a1[0] a1[1])*bw
      mulps       m1, bw              ; (a0[0] a0[1])*bw    = (a2 a3)
      mulps       m2, bw              ; (a1[0] a1[1])*bw*bw = (a0 a1)
      mova        m3, m1
      mova        m4, m2

      ; Set pointers
  %if ARCH_X86_64 == 0 || WIN64
      ; start and end 6th and 7th args on stack
      ; (on WIN64 the 32-bit loads zero-extend, which is only adequate for
      ; non-negative values -- presumably always the case; confirm with caller)
      mov         r2d, Sm
      mov         r3d, Em
  %define start r2q
  %define end   r3q
  %else
      ; BW does not actually occupy a register, so shift by 1
  %define start BWq
  %define end   Sq
      ; start/end are ints: the upper 32 bits of their registers are undefined
      ; on entry, so widen them before any 64-bit arithmetic or addressing.
      movsxd      start, BWd
      movsxd      end, Sd
  %endif
      sub         start, end          ; neg num of loops
      lea         X_highq, [X_highq + end*2*4]
      lea         X_lowq,  [X_lowq  + end*2*4 - 2*2*4]
      shl         start, 3            ; offset from num loops

      mova        m0, [X_lowq + start]    ; X_low[i-2], X_low[i-1]
      shufps      m3, m3, q1111       ; broadcast element 1 of alpha0*bw
      shufps      m4, m4, q1111       ; broadcast element 1 of alpha1*bw*bw
      xorps       m3, [ps_mask]       ; negate elements 0 and 2
      shufps      m1, m1, q0000       ; broadcast element 0 of alpha0*bw
      shufps      m2, m2, q0000       ; broadcast element 0 of alpha1*bw*bw
      xorps       m4, [ps_mask]       ; negate elements 0 and 2
  .loop2:
      movu        m7, [X_lowq + start + 8]    ; BbCc
      mova        m6, m0
      mova        m5, m7
      shufps      m0, m0, q2301               ; aAbB
      shufps      m7, m7, q2301               ; bBcC
      mulps       m0, m4
      mulps       m7, m3
      mulps       m6, m2
      mulps       m5, m1
      addps       m7, m0
      mova        m0, [X_lowq + start +16]    ; CcDd
      addps       m7, m0
      addps       m6, m5
      addps       m7, m6
      mova        [X_highq + start], m7
      add         start, 16
      jnz         .loop2
      RET