You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

349 lines
9.4KB

  1. ;*****************************************************************************
  2. ;* x86-optimized AC-3 DSP utils
  3. ;* Copyright (c) 2011 Justin Ruggles
  4. ;*
  5. ;* This file is part of FFmpeg.
  6. ;*
  7. ;* FFmpeg is free software; you can redistribute it and/or
  8. ;* modify it under the terms of the GNU Lesser General Public
  9. ;* License as published by the Free Software Foundation; either
  10. ;* version 2.1 of the License, or (at your option) any later version.
  11. ;*
  12. ;* FFmpeg is distributed in the hope that it will be useful,
  13. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. ;* Lesser General Public License for more details.
  16. ;*
  17. ;* You should have received a copy of the GNU Lesser General Public
  18. ;* License along with FFmpeg; if not, write to the Free Software
  19. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. ;******************************************************************************
  21. %include "x86inc.asm"
  22. %include "x86util.asm"
  23. SECTION_RODATA
  24. ; 16777216.0f - used in ff_float_to_fixed24()
  25. pf_1_24: times 4 dd 0x4B800000
  26. ; used in ff_ac3_compute_mantissa_size()
  27. cextern ac3_bap_bits
  28. pw_bap_mul1: dw 21846, 21846, 0, 32768, 21846, 21846, 0, 32768
  29. pw_bap_mul2: dw 5, 7, 0, 7, 5, 7, 0, 7
  30. SECTION .text
  31. ;-----------------------------------------------------------------------------
  32. ; void ff_ac3_exponent_min(uint8_t *exp, int num_reuse_blocks, int nb_coefs)
  33. ;-----------------------------------------------------------------------------
  34. %macro AC3_EXPONENT_MIN 1
  35. cglobal ac3_exponent_min_%1, 3,4,2, exp, reuse_blks, expn, offset
  36. shl reuse_blksq, 8
  37. jz .end
  38. LOOP_ALIGN
  39. .nextexp:
  40. mov offsetq, reuse_blksq
  41. mova m0, [expq+offsetq]
  42. sub offsetq, 256
  43. LOOP_ALIGN
  44. .nextblk:
  45. PMINUB m0, [expq+offsetq], m1
  46. sub offsetq, 256
  47. jae .nextblk
  48. mova [expq], m0
  49. add expq, mmsize
  50. sub expnq, mmsize
  51. jg .nextexp
  52. .end:
  53. REP_RET
  54. %endmacro
  55. %define PMINUB PMINUB_MMX
  56. %define LOOP_ALIGN
  57. INIT_MMX
  58. AC3_EXPONENT_MIN mmx
  59. %ifdef HAVE_MMX2
  60. %define PMINUB PMINUB_MMXEXT
  61. %define LOOP_ALIGN ALIGN 16
  62. AC3_EXPONENT_MIN mmxext
  63. %endif
  64. %ifdef HAVE_SSE
  65. INIT_XMM
  66. AC3_EXPONENT_MIN sse2
  67. %endif
  68. %undef PMINUB
  69. %undef LOOP_ALIGN
  70. ;-----------------------------------------------------------------------------
  71. ; int ff_ac3_max_msb_abs_int16(const int16_t *src, int len)
  72. ;
  73. ; This function uses 2 different methods to calculate a valid result.
  74. ; 1) logical 'or' of abs of each element
  75. ; This is used for ssse3 because of the pabsw instruction.
  76. ; It is also used for mmx because of the lack of min/max instructions.
  77. ; 2) calculate min/max for the array, then or(abs(min),abs(max))
  78. ; This is used for mmxext and sse2 because they have pminsw/pmaxsw.
  79. ;-----------------------------------------------------------------------------
  80. %macro AC3_MAX_MSB_ABS_INT16 2
  81. cglobal ac3_max_msb_abs_int16_%1, 2,2,5, src, len
  82. pxor m2, m2
  83. pxor m3, m3
  84. .loop:
  85. %ifidn %2, min_max
  86. mova m0, [srcq]
  87. mova m1, [srcq+mmsize]
  88. pminsw m2, m0
  89. pminsw m2, m1
  90. pmaxsw m3, m0
  91. pmaxsw m3, m1
  92. %else ; or_abs
  93. %ifidn %1, mmx
  94. mova m0, [srcq]
  95. mova m1, [srcq+mmsize]
  96. ABS2 m0, m1, m3, m4
  97. %else ; ssse3
  98. ; using memory args is faster for ssse3
  99. pabsw m0, [srcq]
  100. pabsw m1, [srcq+mmsize]
  101. %endif
  102. por m2, m0
  103. por m2, m1
  104. %endif
  105. add srcq, mmsize*2
  106. sub lend, mmsize
  107. ja .loop
  108. %ifidn %2, min_max
  109. ABS2 m2, m3, m0, m1
  110. por m2, m3
  111. %endif
  112. %ifidn mmsize, 16
  113. movhlps m0, m2
  114. por m2, m0
  115. %endif
  116. PSHUFLW m0, m2, 0xe
  117. por m2, m0
  118. PSHUFLW m0, m2, 0x1
  119. por m2, m0
  120. movd eax, m2
  121. and eax, 0xFFFF
  122. RET
  123. %endmacro
  124. INIT_MMX
  125. %define ABS2 ABS2_MMX
  126. %define PSHUFLW pshufw
  127. AC3_MAX_MSB_ABS_INT16 mmx, or_abs
  128. %define ABS2 ABS2_MMX2
  129. AC3_MAX_MSB_ABS_INT16 mmxext, min_max
  130. INIT_XMM
  131. %define PSHUFLW pshuflw
  132. AC3_MAX_MSB_ABS_INT16 sse2, min_max
  133. %define ABS2 ABS2_SSSE3
  134. AC3_MAX_MSB_ABS_INT16 ssse3, or_abs
  135. ;-----------------------------------------------------------------------------
  136. ; macro used for ff_ac3_lshift_int16() and ff_ac3_rshift_int32()
  137. ;-----------------------------------------------------------------------------
  138. %macro AC3_SHIFT 4 ; l/r, 16/32, shift instruction, instruction set
  139. cglobal ac3_%1shift_int%2_%4, 3,3,5, src, len, shift
  140. movd m0, shiftd
  141. .loop:
  142. mova m1, [srcq ]
  143. mova m2, [srcq+mmsize ]
  144. mova m3, [srcq+mmsize*2]
  145. mova m4, [srcq+mmsize*3]
  146. %3 m1, m0
  147. %3 m2, m0
  148. %3 m3, m0
  149. %3 m4, m0
  150. mova [srcq ], m1
  151. mova [srcq+mmsize ], m2
  152. mova [srcq+mmsize*2], m3
  153. mova [srcq+mmsize*3], m4
  154. add srcq, mmsize*4
  155. sub lend, mmsize*32/%2
  156. ja .loop
  157. .end:
  158. REP_RET
  159. %endmacro
  160. ;-----------------------------------------------------------------------------
  161. ; void ff_ac3_lshift_int16(int16_t *src, unsigned int len, unsigned int shift)
  162. ;-----------------------------------------------------------------------------
  163. INIT_MMX
  164. AC3_SHIFT l, 16, psllw, mmx
  165. INIT_XMM
  166. AC3_SHIFT l, 16, psllw, sse2
  167. ;-----------------------------------------------------------------------------
  168. ; void ff_ac3_rshift_int32(int32_t *src, unsigned int len, unsigned int shift)
  169. ;-----------------------------------------------------------------------------
  170. INIT_MMX
  171. AC3_SHIFT r, 32, psrad, mmx
  172. INIT_XMM
  173. AC3_SHIFT r, 32, psrad, sse2
  174. ;-----------------------------------------------------------------------------
  175. ; void ff_float_to_fixed24(int32_t *dst, const float *src, unsigned int len)
  176. ;-----------------------------------------------------------------------------
  177. ; The 3DNow! version is not bit-identical because pf2id uses truncation rather
  178. ; than round-to-nearest.
  179. INIT_MMX
  180. cglobal float_to_fixed24_3dnow, 3,3,0, dst, src, len
  181. movq m0, [pf_1_24]
  182. .loop:
  183. movq m1, [srcq ]
  184. movq m2, [srcq+8 ]
  185. movq m3, [srcq+16]
  186. movq m4, [srcq+24]
  187. pfmul m1, m0
  188. pfmul m2, m0
  189. pfmul m3, m0
  190. pfmul m4, m0
  191. pf2id m1, m1
  192. pf2id m2, m2
  193. pf2id m3, m3
  194. pf2id m4, m4
  195. movq [dstq ], m1
  196. movq [dstq+8 ], m2
  197. movq [dstq+16], m3
  198. movq [dstq+24], m4
  199. add srcq, 32
  200. add dstq, 32
  201. sub lend, 8
  202. ja .loop
  203. REP_RET
  204. INIT_XMM
  205. cglobal float_to_fixed24_sse, 3,3,3, dst, src, len
  206. movaps m0, [pf_1_24]
  207. .loop:
  208. movaps m1, [srcq ]
  209. movaps m2, [srcq+16]
  210. mulps m1, m0
  211. mulps m2, m0
  212. cvtps2pi mm0, m1
  213. movhlps m1, m1
  214. cvtps2pi mm1, m1
  215. cvtps2pi mm2, m2
  216. movhlps m2, m2
  217. cvtps2pi mm3, m2
  218. movq [dstq ], mm0
  219. movq [dstq+ 8], mm1
  220. movq [dstq+16], mm2
  221. movq [dstq+24], mm3
  222. add srcq, 32
  223. add dstq, 32
  224. sub lend, 8
  225. ja .loop
  226. REP_RET
  227. INIT_XMM
  228. cglobal float_to_fixed24_sse2, 3,3,9, dst, src, len
  229. movaps m0, [pf_1_24]
  230. .loop:
  231. movaps m1, [srcq ]
  232. movaps m2, [srcq+16 ]
  233. movaps m3, [srcq+32 ]
  234. movaps m4, [srcq+48 ]
  235. %ifdef m8
  236. movaps m5, [srcq+64 ]
  237. movaps m6, [srcq+80 ]
  238. movaps m7, [srcq+96 ]
  239. movaps m8, [srcq+112]
  240. %endif
  241. mulps m1, m0
  242. mulps m2, m0
  243. mulps m3, m0
  244. mulps m4, m0
  245. %ifdef m8
  246. mulps m5, m0
  247. mulps m6, m0
  248. mulps m7, m0
  249. mulps m8, m0
  250. %endif
  251. cvtps2dq m1, m1
  252. cvtps2dq m2, m2
  253. cvtps2dq m3, m3
  254. cvtps2dq m4, m4
  255. %ifdef m8
  256. cvtps2dq m5, m5
  257. cvtps2dq m6, m6
  258. cvtps2dq m7, m7
  259. cvtps2dq m8, m8
  260. %endif
  261. movdqa [dstq ], m1
  262. movdqa [dstq+16 ], m2
  263. movdqa [dstq+32 ], m3
  264. movdqa [dstq+48 ], m4
  265. %ifdef m8
  266. movdqa [dstq+64 ], m5
  267. movdqa [dstq+80 ], m6
  268. movdqa [dstq+96 ], m7
  269. movdqa [dstq+112], m8
  270. add srcq, 128
  271. add dstq, 128
  272. sub lenq, 32
  273. %else
  274. add srcq, 64
  275. add dstq, 64
  276. sub lenq, 16
  277. %endif
  278. ja .loop
  279. REP_RET
  280. ;------------------------------------------------------------------------------
  281. ; int ff_ac3_compute_mantissa_size(uint16_t mant_cnt[6][16])
  282. ;------------------------------------------------------------------------------
  283. %macro PHADDD4 2 ; xmm src, xmm tmp
  284. movhlps %2, %1
  285. paddd %1, %2
  286. pshufd %2, %1, 0x1
  287. paddd %1, %2
  288. %endmacro
  289. INIT_XMM
  290. cglobal ac3_compute_mantissa_size_sse2, 1,2,4, mant_cnt, sum
  291. movdqa m0, [mant_cntq ]
  292. movdqa m1, [mant_cntq+ 1*16]
  293. paddw m0, [mant_cntq+ 2*16]
  294. paddw m1, [mant_cntq+ 3*16]
  295. paddw m0, [mant_cntq+ 4*16]
  296. paddw m1, [mant_cntq+ 5*16]
  297. paddw m0, [mant_cntq+ 6*16]
  298. paddw m1, [mant_cntq+ 7*16]
  299. paddw m0, [mant_cntq+ 8*16]
  300. paddw m1, [mant_cntq+ 9*16]
  301. paddw m0, [mant_cntq+10*16]
  302. paddw m1, [mant_cntq+11*16]
  303. pmaddwd m0, [ac3_bap_bits ]
  304. pmaddwd m1, [ac3_bap_bits+16]
  305. paddd m0, m1
  306. PHADDD4 m0, m1
  307. movd sumd, m0
  308. movdqa m3, [pw_bap_mul1]
  309. movhpd m0, [mant_cntq +2]
  310. movlpd m0, [mant_cntq+1*32+2]
  311. movhpd m1, [mant_cntq+2*32+2]
  312. movlpd m1, [mant_cntq+3*32+2]
  313. movhpd m2, [mant_cntq+4*32+2]
  314. movlpd m2, [mant_cntq+5*32+2]
  315. pmulhuw m0, m3
  316. pmulhuw m1, m3
  317. pmulhuw m2, m3
  318. paddusw m0, m1
  319. paddusw m0, m2
  320. pmaddwd m0, [pw_bap_mul2]
  321. PHADDD4 m0, m1
  322. movd eax, m0
  323. add eax, sumd
  324. RET