You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

296 lines
7.8KB

  1. ;*****************************************************************************
  2. ;* x86-optimized AC-3 DSP utils
  3. ;* Copyright (c) 2011 Justin Ruggles
  4. ;*
  5. ;* This file is part of Libav.
  6. ;*
  7. ;* Libav is free software; you can redistribute it and/or
  8. ;* modify it under the terms of the GNU Lesser General Public
  9. ;* License as published by the Free Software Foundation; either
  10. ;* version 2.1 of the License, or (at your option) any later version.
  11. ;*
  12. ;* Libav is distributed in the hope that it will be useful,
  13. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. ;* Lesser General Public License for more details.
  16. ;*
  17. ;* You should have received a copy of the GNU Lesser General Public
  18. ;* License along with Libav; if not, write to the Free Software
  19. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. ;******************************************************************************
  21. %include "x86inc.asm"
  22. %include "x86util.asm"
  SECTION_RODATA

  ; Packed single-precision scale factor: four copies of 16777216.0f (2^24,
  ; IEEE-754 bit pattern 0x4B800000).  The ff_float_to_fixed24() variants
  ; multiply every input vector by this constant before converting to integer.
  pf_1_24:
      dd 0x4B800000, 0x4B800000, 0x4B800000, 0x4B800000

  SECTION .text
  ;-----------------------------------------------------------------------------
  ; void ff_ac3_exponent_min(uint8_t *exp, int num_reuse_blocks, int nb_coefs)
  ;
  ; For every exponent position, replaces the byte in the first block with the
  ; unsigned minimum of that byte and the bytes at the same position in the
  ; num_reuse_blocks blocks that follow.  Consecutive blocks are 256 bytes
  ; apart.  Nothing is written when num_reuse_blocks is 0.
  ;
  ; nb_coefs is consumed mmsize bytes at a time and the outer loop runs until
  ; the remaining count is <= 0, so a trailing partial vector is processed as
  ; a whole vector.  exp is accessed with mova (assumes suitably aligned data
  ; - confirm against the caller).
  ;
  ; Macro argument %1 is the instruction-set suffix of the symbol name.
  ; PMINUB and LOOP_ALIGN must be defined before the macro is instantiated.
  ;-----------------------------------------------------------------------------
  %macro AC3_EXPONENT_MIN 1
  cglobal ac3_exponent_min_%1, 3,4,2, exp, reuse_blks, expn, offset
      ; Both counts are C ints.  Work on their 32-bit views so that whatever
      ; the caller left in the upper register halves on x86-64 cannot leak
      ; into the block offset or the loop test (a 32-bit write zero-extends).
      shl     reuse_blksd, 8          ; byte offset of the last block
      jz      .end                    ; no following blocks: nothing to merge
      LOOP_ALIGN
  .nextexp:
      mov     offsetq, reuse_blksq
      mova    m0, [expq+offsetq]      ; seed the minimum with the last block
      sub     offsetq, 256
      LOOP_ALIGN
  .nextblk:
      PMINUB  m0, [expq+offsetq], m1  ; m1 is handed to PMINUB as a temporary
      sub     offsetq, 256
      jae     .nextblk                ; leave once the offset borrows below 0
      mova    [expq], m0
      add     expq, mmsize
      sub     expnd, mmsize
      jg      .nextexp
  .end:
      REP_RET
  %endmacro

  ; MMX: PMINUB is provided by the PMINUB_MMX macro; no loop alignment.
  %define PMINUB PMINUB_MMX
  %define LOOP_ALIGN
  INIT_MMX
  AC3_EXPONENT_MIN mmx

  %ifdef HAVE_MMX2
  %define PMINUB PMINUB_MMXEXT
  %define LOOP_ALIGN ALIGN 16
  AC3_EXPONENT_MIN mmxext
  %endif

  %ifdef HAVE_SSE
  ; Set the helpers explicitly instead of inheriting whatever the previous
  ; conditional block happened to leave defined.
  %define PMINUB PMINUB_MMXEXT
  %define LOOP_ALIGN ALIGN 16
  INIT_XMM
  AC3_EXPONENT_MIN sse2
  %endif

  %undef PMINUB
  %undef LOOP_ALIGN
  ;-----------------------------------------------------------------------------
  ; int ff_ac3_max_msb_abs_int16(const int16_t *src, int len)
  ;
  ; Returns a 16-bit value computed from the absolute values of the elements
  ; of src.  Two different methods are used to calculate a valid result:
  ;   1) logical 'or' of abs of each element
  ;      This is used for ssse3 because of the pabsw instruction.
  ;      It is also used for mmx because of the lack of min/max instructions.
  ;   2) calculate min/max for the array, then or(abs(min),abs(max))
  ;      This is used for mmxext and sse2 because they have pminsw/pmaxsw.
  ;
  ; Every iteration reads two vectors (2*mmsize bytes = mmsize int16 values)
  ; and the loop body always executes at least once.
  ; NOTE(review): len is presumably a non-zero multiple of mmsize and src
  ; suitably aligned for mova/pabsw memory operands - confirm with callers.
  ;
  ; Macro arguments:
  ;   %1  instruction-set suffix of the symbol name (mmx/mmxext/sse2/ssse3)
  ;   %2  method: or_abs or min_max
  ; ABS2 must be defined before instantiation; PSHUFLW must be defined for
  ; every variant except mmx.
  ;-----------------------------------------------------------------------------
  %macro AC3_MAX_MSB_ABS_INT16 2
  cglobal ac3_max_msb_abs_int16_%1, 2,2,5, src, len
      pxor    m2, m2                  ; or_abs: accumulator, min_max: running min
      pxor    m3, m3                  ; min_max: running max
  .loop:
  %ifidn %2, min_max
      mova    m0, [srcq]
      mova    m1, [srcq+mmsize]
      pminsw  m2, m0
      pminsw  m2, m1
      pmaxsw  m3, m0
      pmaxsw  m3, m1
  %else ; or_abs
  %ifidn %1, mmx
      mova    m0, [srcq]
      mova    m1, [srcq+mmsize]
      ABS2    m0, m1, m3, m4          ; m3/m4 are passed as ABS2's extra operands
  %else ; ssse3
      ; using memory args is faster for ssse3
      pabsw   m0, [srcq]
      pabsw   m1, [srcq+mmsize]
  %endif
      por     m2, m0
      por     m2, m1
  %endif
      add     srcq, mmsize*2
      sub     lend, mmsize
      ja      .loop

  %ifidn %2, min_max
      ABS2    m2, m3, m0, m1          ; |min| and |max|
      por     m2, m3
  %endif

      ; Horizontal 'or': fold all 16-bit lanes of m2 into its lowest word.
  %if mmsize == 16
      movhlps m0, m2                  ; high 64 bits onto the low 64 bits
      por     m2, m0
  %endif
  %ifidn %1, mmx
      ; pshufw is an MMXEXT/SSE instruction and must not appear in the plain
      ; MMX variant; 64-bit logical shifts perform the same fold.
      mova    m0, m2
      psrlq   m0, 32                  ; words 2,3 -> words 0,1
      por     m2, m0
      mova    m0, m2
      psrlq   m0, 16                  ; word 1 -> word 0
      por     m2, m0
  %else
      PSHUFLW m0, m2, 0xe             ; words 2,3 -> words 0,1
      por     m2, m0
      PSHUFLW m0, m2, 0x1             ; word 1 -> word 0
      por     m2, m0
  %endif
      movd    eax, m2
      and     eax, 0xFFFF             ; only the lowest word holds the result
      RET
  %endmacro

  INIT_MMX
  %define ABS2 ABS2_MMX
  %define PSHUFLW pshufw
  AC3_MAX_MSB_ABS_INT16 mmx, or_abs
  %define ABS2 ABS2_MMX2
  AC3_MAX_MSB_ABS_INT16 mmxext, min_max
  INIT_XMM
  %define PSHUFLW pshuflw
  AC3_MAX_MSB_ABS_INT16 sse2, min_max
  %define ABS2 ABS2_SSSE3
  AC3_MAX_MSB_ABS_INT16 ssse3, or_abs
  ;-----------------------------------------------------------------------------
  ; AC3_SHIFT: common body of ff_ac3_lshift_int16() and ff_ac3_rshift_int32()
  ;
  ; Macro arguments:
  ;   %1  direction letter used in the symbol name (l or r)
  ;   %2  element width in bits (16 or 32)
  ;   %3  packed shift instruction applied to each vector
  ;   %4  instruction-set suffix of the symbol name
  ;
  ; The array at src is shifted in place, four vectors (4*mmsize bytes) per
  ; iteration; len is decremented by the matching number of elements.  The
  ; loop body always runs at least once.
  ; NOTE(review): len is presumably a non-zero multiple of the per-iteration
  ; element count and src aligned for mova - confirm with callers.
  ;-----------------------------------------------------------------------------
  %macro AC3_SHIFT 4
  cglobal ac3_%1shift_int%2_%4, 3,3,5, src, len, shift
      movd    m0, shiftd              ; shift count for the packed shifts
  .loop:
      ; load four consecutive vectors
      mova    m1, [srcq]
      mova    m2, [srcq+mmsize]
      mova    m3, [srcq+2*mmsize]
      mova    m4, [srcq+3*mmsize]

      ; shift every element by the count held in m0
      %3      m1, m0
      %3      m2, m0
      %3      m3, m0
      %3      m4, m0

      ; write the results back over the input
      mova    [srcq],          m1
      mova    [srcq+mmsize],   m2
      mova    [srcq+2*mmsize], m3
      mova    [srcq+3*mmsize], m4

      add     srcq, 4*mmsize
      sub     lend, (4*mmsize)/(%2/8) ; bytes per iteration / bytes per element
      ja      .loop
  .end:
      REP_RET
  %endmacro

  ;-----------------------------------------------------------------------------
  ; void ff_ac3_lshift_int16(int16_t *src, unsigned int len, unsigned int shift)
  ;-----------------------------------------------------------------------------
  INIT_MMX
  AC3_SHIFT l, 16, psllw, mmx
  INIT_XMM
  AC3_SHIFT l, 16, psllw, sse2

  ;-----------------------------------------------------------------------------
  ; void ff_ac3_rshift_int32(int32_t *src, unsigned int len, unsigned int shift)
  ;-----------------------------------------------------------------------------
  INIT_MMX
  AC3_SHIFT r, 32, psrad, mmx
  INIT_XMM
  AC3_SHIFT r, 32, psrad, sse2
  ;-----------------------------------------------------------------------------
  ; void ff_float_to_fixed24(int32_t *dst, const float *src, unsigned int len)
  ;
  ; Multiplies every float in src by 2^24 (pf_1_24) and stores the result in
  ; dst as a 32-bit integer.
  ;-----------------------------------------------------------------------------
  ; The 3DNow! version is not bit-identical because pf2id uses truncation rather
  ; than round-to-nearest.
  ;
  ; Eight floats (32 bytes) are converted per iteration and the loop body
  ; always runs at least once.
  ; NOTE(review): len is presumably a non-zero multiple of 8 - confirm with
  ; callers.
  INIT_MMX
  cglobal float_to_fixed24_3dnow, 3,3,0, dst, src, len
      movq    m0, [pf_1_24]           ; two copies of 2^24
  .loop:
      movq    m1, [srcq]
      movq    m2, [srcq+8]
      movq    m3, [srcq+16]
      movq    m4, [srcq+24]
      pfmul   m1, m0
      pfmul   m2, m0
      pfmul   m3, m0
      pfmul   m4, m0
      pf2id   m1, m1
      pf2id   m2, m2
      pf2id   m3, m3
      pf2id   m4, m4
      movq    [dstq],    m1
      movq    [dstq+8],  m2
      movq    [dstq+16], m3
      movq    [dstq+24], m4
      add     srcq, 32
      add     dstq, 32
      sub     lend, 8
      ja      .loop
      ; The MMX registers alias the x87 stack: leave MMX/3DNow! state before
      ; returning so floating-point code in the caller sees a clean x87 unit.
      femms
      RET
  ;-----------------------------------------------------------------------------
  ; SSE version of ff_float_to_fixed24(int32_t *dst, const float *src,
  ;                                    unsigned int len)
  ;
  ; Multiplies every float in src by 2^24 (pf_1_24) and stores the result in
  ; dst as a 32-bit integer.  Eight floats (32 bytes) are converted per
  ; iteration and the loop body always runs at least once.  src is read with
  ; movaps and therefore has to be 16-byte aligned.
  ; NOTE(review): len is presumably a non-zero multiple of 8 - confirm with
  ; callers.
  ;-----------------------------------------------------------------------------
  INIT_XMM
  cglobal float_to_fixed24_sse, 3,3,3, dst, src, len
      movaps   m0, [pf_1_24]
  .loop:
      movaps   m1, [srcq]
      movaps   m2, [srcq+16]
      mulps    m1, m0
      mulps    m2, m0
      ; cvtps2pi converts only the two low floats, so each XMM register is
      ; converted in two halves into MMX registers.
      cvtps2pi mm0, m1
      movhlps  m1, m1                 ; bring the high pair down
      cvtps2pi mm1, m1
      cvtps2pi mm2, m2
      movhlps  m2, m2
      cvtps2pi mm3, m2
      movq     [dstq],    mm0
      movq     [dstq+ 8], mm1
      movq     [dstq+16], mm2
      movq     [dstq+24], mm3
      add      srcq, 32
      add      dstq, 32
      sub      lend, 8
      ja       .loop
      ; mm0-mm3 were written above; the MMX registers alias the x87 stack, so
      ; clear the MMX state before returning to code that may use the FPU.
      emms
      RET
  ;-----------------------------------------------------------------------------
  ; SSE2 version of ff_float_to_fixed24(int32_t *dst, const float *src,
  ;                                     unsigned int len)
  ;
  ; Multiplies every float in src by 2^24 (pf_1_24) and converts it to a
  ; 32-bit integer with cvtps2dq.  src and dst are accessed with
  ; movaps/movdqa and therefore have to be 16-byte aligned.
  ;
  ; When the register m8 is defined (NOTE(review): presumably only on builds
  ; with 16 XMM registers, i.e. x86-64 - confirm in x86inc.asm) eight vectors
  ; (32 floats) are handled per iteration, otherwise four vectors (16 floats).
  ; The loop body always runs at least once.
  ; NOTE(review): len is presumably a non-zero multiple of the per-iteration
  ; float count - confirm with callers.
  ;-----------------------------------------------------------------------------
  INIT_XMM
  cglobal float_to_fixed24_sse2, 3,3,9, dst, src, len
      movaps   m0, [pf_1_24]
  .loop:
      movaps   m1, [srcq]
      movaps   m2, [srcq+16]
      movaps   m3, [srcq+32]
      movaps   m4, [srcq+48]
  %ifdef m8
      movaps   m5, [srcq+64]
      movaps   m6, [srcq+80]
      movaps   m7, [srcq+96]
      movaps   m8, [srcq+112]
  %endif
      mulps    m1, m0
      mulps    m2, m0
      mulps    m3, m0
      mulps    m4, m0
  %ifdef m8
      mulps    m5, m0
      mulps    m6, m0
      mulps    m7, m0
      mulps    m8, m0
  %endif
      cvtps2dq m1, m1
      cvtps2dq m2, m2
      cvtps2dq m3, m3
      cvtps2dq m4, m4
  %ifdef m8
      cvtps2dq m5, m5
      cvtps2dq m6, m6
      cvtps2dq m7, m7
      cvtps2dq m8, m8
  %endif
      movdqa   [dstq],     m1
      movdqa   [dstq+16],  m2
      movdqa   [dstq+32],  m3
      movdqa   [dstq+48],  m4
      ; len is a 32-bit C argument: count on lend, like the other variants, so
      ; the upper half of the register on x86-64 never takes part in the test.
  %ifdef m8
      movdqa   [dstq+64],  m5
      movdqa   [dstq+80],  m6
      movdqa   [dstq+96],  m7
      movdqa   [dstq+112], m8
      add      srcq, 128
      add      dstq, 128
      sub      lend, 32
  %else
      add      srcq, 64
      add      dstq, 64
      sub      lend, 16
  %endif
      ja       .loop
      REP_RET