You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

458 lines
12KB

;*****************************************************************************
;* x86-optimized AC-3 DSP utils
;* Copyright (c) 2011 Justin Ruggles
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
  %include "libavutil/x86/x86util.asm"

SECTION_RODATA

; 16777216.0f (2^24 as an IEEE-754 single), one copy per dword lane;
; scale factor used in ff_float_to_fixed24()
pf_1_24:     dd 0x4B800000, 0x4B800000, 0x4B800000, 0x4B800000

; table and per-word multipliers used in ff_ac3_compute_mantissa_size()
cextern ac3_bap_bits
pw_bap_mul1: times 2 dw 21846, 21846, 0, 32768
pw_bap_mul2: times 2 dw     5,     7, 0,     7

; dword constants used in ff_ac3_extract_exponents()
pd_1:        dd   1,   1,   1,   1
pd_151:      dd 151, 151, 151, 151

SECTION .text
  ;-----------------------------------------------------------------------------
; void ff_ac3_exponent_min(uint8_t *exp, int num_reuse_blocks, int nb_coefs)
;
; For every byte position, replaces exp[i] with the unsigned minimum of
; exp[i], exp[i+256], ..., exp[i+256*num_reuse_blocks].
; Returns immediately when num_reuse_blocks is 0.
;
; mmsize bytes are handled per outer iteration and the loop runs until the
; remaining count is <= 0, so a final partial group is processed in full.
;
; Both int arguments are touched only through their 32-bit register names:
; on x86-64 the upper 32 bits of a register carrying an int argument are
; undefined, so 64-bit arithmetic on them could produce a wild offset or a
; wrong trip count.
;-----------------------------------------------------------------------------
%macro AC3_EXPONENT_MIN 0
cglobal ac3_exponent_min, 3, 4, 2, exp, reuse_blks, expn, offset
    shl  reuse_blksd, 8                 ; byte offset of the last block; the
                                        ; 32-bit write zero-extends the register
    jz .end                             ; nothing to merge
    LOOP_ALIGN
.nextexp:
    mov      offsetq, reuse_blksq       ; start from the last block ...
    mova          m0, [expq+offsetq]
    sub      offsetq, 256
    LOOP_ALIGN
.nextblk:
    PMINUB        m0, [expq+offsetq], m1 ; ... and fold in each earlier one
    sub      offsetq, 256
    jae .nextblk                        ; offset 0 (block 0) is still included
    mova      [expq], m0
    add         expq, mmsize
    sub        expnd, mmsize
    jg .nextexp
.end:
    REP_RET
%endmacro

; LOOP_ALIGN is empty for plain mmx and ALIGN 16 once the mmxext variant has
; been built; the sse2 variant uses whichever definition is in effect.
%define LOOP_ALIGN
INIT_MMX mmx
AC3_EXPONENT_MIN
%if HAVE_MMXEXT_EXTERNAL
%define LOOP_ALIGN ALIGN 16
INIT_MMX mmxext
AC3_EXPONENT_MIN
%endif
%if HAVE_SSE2_EXTERNAL
INIT_XMM sse2
AC3_EXPONENT_MIN
%endif
%undef LOOP_ALIGN
  ;-----------------------------------------------------------------------------
; int ff_ac3_max_msb_abs_int16(const int16_t *src, int len)
;
; Two different methods are used to produce a valid result:
; 1) 'or' together abs() of every element.
;    Chosen for ssse3 (pabsw is available) and for plain mmx (no min/max
;    instructions).
; 2) track min and max over the array, then or(abs(min), abs(max)).
;    Chosen for mmxext and sse2, which have pminsw/pmaxsw.
;-----------------------------------------------------------------------------

; OR_WORDS_HORIZ src, tmp
; 'or' the 4 (mmx) or 8 (xmm) words of src together; the result ends up in the
; low word of src. tmp is clobbered, and the other words of src hold partial
; results afterwards.
%macro OR_WORDS_HORIZ 2 ; src, tmp
%if cpuflag(sse2)
    movhlps     %2, %1                  ; high qword -> low qword of tmp
    por         %1, %2
    pshuflw     %2, %1, q0032           ; fold dword 1 onto dword 0
    por         %1, %2
    pshuflw     %2, %1, q0001           ; fold word 1 onto word 0
    por         %1, %2
%elif cpuflag(mmxext)
    pshufw      %2, %1, q0032
    por         %1, %2
    pshufw      %2, %1, q0001
    por         %1, %2
%else ; mmx: no word shuffle, use copy + shift instead
    movq        %2, %1
    psrlq       %2, 32
    por         %1, %2
    movq        %2, %1
    psrlq       %2, 16
    por         %1, %2
%endif
%endmacro
  ; AC3_MAX_MSB_ABS_INT16 method
; Emits ff_ac3_max_msb_abs_int16 for the current instruction set.
; method is min_max or or_abs. 2*mmsize bytes (mmsize int16 values) are
; consumed per iteration. The return value is the low 16 bits of the combined
; accumulator.
%macro AC3_MAX_MSB_ABS_INT16 1
cglobal ac3_max_msb_abs_int16, 2,2,5, src, len
    pxor        m2, m2                  ; or-accumulator / running minimum
    pxor        m3, m3                  ; running maximum (min_max only)
.loop:
%ifidn %1, min_max
    mova        m0, [srcq]
    mova        m1, [srcq+mmsize]
    pminsw      m2, m0
    pminsw      m2, m1
    pmaxsw      m3, m0
    pmaxsw      m3, m1
%elif cpuflag(ssse3) ; or_abs with pabsw
    ; using memory args is faster for ssse3
    pabsw       m0, [srcq]
    pabsw       m1, [srcq+mmsize]
    por         m2, m0
    por         m2, m1
%else ; or_abs without pabsw
    mova        m0, [srcq]
    mova        m1, [srcq+mmsize]
    ABS2        m0, m1, m3, m4          ; m3/m4 are scratch here
    por         m2, m0
    por         m2, m1
%endif
    add       srcq, mmsize*2
    sub       lend, mmsize
    ja .loop
%ifidn %1, min_max
    ; merge |min| and |max| into a single or-accumulator
    ABS2        m2, m3, m0, m1
    por         m2, m3
%endif
    OR_WORDS_HORIZ m2, m0
    movd       eax, m2
    and        eax, 0xFFFF              ; only the low word is the result
    RET
%endmacro

INIT_MMX mmx
AC3_MAX_MSB_ABS_INT16 or_abs
INIT_MMX mmxext
AC3_MAX_MSB_ABS_INT16 min_max
INIT_XMM sse2
AC3_MAX_MSB_ABS_INT16 min_max
INIT_XMM ssse3
AC3_MAX_MSB_ABS_INT16 or_abs
  ;-----------------------------------------------------------------------------
; macro used for ff_ac3_lshift_int16() and ff_ac3_rshift_int32()
;
; AC3_SHIFT dir, bits, insn
;   dir  - l or r, becomes part of the function name
;   bits - element width (16 or 32), becomes part of the function name and
;          sets how many elements one iteration accounts for
;   insn - packed shift instruction applied to every register
;
; The array is shifted in place, 4*mmsize bytes per iteration.
;-----------------------------------------------------------------------------
%macro AC3_SHIFT 3 ; l/r, 16/32, shift instruction
cglobal ac3_%1shift_int%2, 3, 3, 5, src, len, shift
    movd      m0, shiftd                ; shift count shared by all lanes
.loop:
    mova      m1, [srcq         ]
    mova      m2, [srcq+mmsize  ]
    mova      m3, [srcq+mmsize*2]
    mova      m4, [srcq+mmsize*3]
    %3        m1, m0
    %3        m2, m0
    %3        m3, m0
    %3        m4, m0
    mova  [srcq         ], m1
    mova  [srcq+mmsize  ], m2
    mova  [srcq+mmsize*2], m3
    mova  [srcq+mmsize*3], m4
    add     srcq, mmsize*4
    sub     lend, mmsize*32/%2          ; elements held in 4*mmsize bytes
    ja .loop
    REP_RET
%endmacro

;-----------------------------------------------------------------------------
; void ff_ac3_lshift_int16(int16_t *src, unsigned int len, unsigned int shift)
;-----------------------------------------------------------------------------
INIT_MMX mmx
AC3_SHIFT l, 16, psllw
INIT_XMM sse2
AC3_SHIFT l, 16, psllw

;-----------------------------------------------------------------------------
; void ff_ac3_rshift_int32(int32_t *src, unsigned int len, unsigned int shift)
;-----------------------------------------------------------------------------
INIT_MMX mmx
AC3_SHIFT r, 32, psrad
INIT_XMM sse2
AC3_SHIFT r, 32, psrad
  ;-----------------------------------------------------------------------------
; void ff_float_to_fixed24(int32_t *dst, const float *src, unsigned int len)
;
; dst[i] = (int32_t)(src[i] * 16777216.0f)
;-----------------------------------------------------------------------------

; The 3DNow! version is not bit-identical because pf2id uses truncation rather
; than round-to-nearest.
; 8 floats (4 MMX registers of 2 floats each) are converted per iteration.
INIT_MMX 3dnow
cglobal float_to_fixed24, 3, 3, 0, dst, src, len
    movq      m0, [pf_1_24]             ; scale factor in both lanes
.loop:
    movq      m1, [srcq   ]
    movq      m2, [srcq+8 ]
    movq      m3, [srcq+16]
    movq      m4, [srcq+24]
    pfmul     m1, m0
    pfmul     m2, m0
    pfmul     m3, m0
    pfmul     m4, m0
    pf2id     m1, m1
    pf2id     m2, m2
    pf2id     m3, m3
    pf2id     m4, m4
    movq  [dstq   ], m1
    movq  [dstq+8 ], m2
    movq  [dstq+16], m3
    movq  [dstq+24], m4
    add     srcq, 32
    add     dstq, 32
    sub     lend, 8
    ja .loop
    femms                               ; leave MMX state before returning
    RET
  ; SSE version of ff_float_to_fixed24(): the multiply is done in xmm registers,
; the float->int conversion goes through cvtps2pi, which only converts the low
; two floats into an MMX register, hence the movhlps to bring the upper pair
; down. 8 floats are converted per iteration.
INIT_XMM sse
cglobal float_to_fixed24, 3, 3, 3, dst, src, len
    movaps      m0, [pf_1_24]
.loop:
    movaps      m1, [srcq   ]
    movaps      m2, [srcq+16]
    mulps       m1, m0
    mulps       m2, m0
    cvtps2pi   mm0, m1                  ; floats 0-1
    movhlps     m1, m1
    cvtps2pi   mm1, m1                  ; floats 2-3
    cvtps2pi   mm2, m2                  ; floats 4-5
    movhlps     m2, m2
    cvtps2pi   mm3, m2                  ; floats 6-7
    movq  [dstq   ], mm0
    movq  [dstq+ 8], mm1
    movq  [dstq+16], mm2
    movq  [dstq+24], mm3
    add       srcq, 32
    add       dstq, 32
    sub       lend, 8
    ja .loop
    emms                                ; MMX registers were used for the output
    RET
  ; SSE2 version of ff_float_to_fixed24().
; Converts 16 floats per iteration, or 32 when registers m5-m8 exist
; (detected with %ifdef m8).
;
; len is an unsigned int, so the counter is updated through lend: on x86-64
; the upper 32 bits of the argument register are undefined and a 64-bit
; subtraction could keep the loop running far past the end of the buffers.
; This also matches the 3dnow and sse versions.
INIT_XMM sse2
cglobal float_to_fixed24, 3, 3, 9, dst, src, len
    movaps     m0, [pf_1_24]
.loop:
    movaps     m1, [srcq    ]
    movaps     m2, [srcq+16 ]
    movaps     m3, [srcq+32 ]
    movaps     m4, [srcq+48 ]
%ifdef m8
    movaps     m5, [srcq+64 ]
    movaps     m6, [srcq+80 ]
    movaps     m7, [srcq+96 ]
    movaps     m8, [srcq+112]
%endif
    mulps      m1, m0
    mulps      m2, m0
    mulps      m3, m0
    mulps      m4, m0
%ifdef m8
    mulps      m5, m0
    mulps      m6, m0
    mulps      m7, m0
    mulps      m8, m0
%endif
    cvtps2dq   m1, m1
    cvtps2dq   m2, m2
    cvtps2dq   m3, m3
    cvtps2dq   m4, m4
%ifdef m8
    cvtps2dq   m5, m5
    cvtps2dq   m6, m6
    cvtps2dq   m7, m7
    cvtps2dq   m8, m8
%endif
    movdqa  [dstq    ], m1
    movdqa  [dstq+16 ], m2
    movdqa  [dstq+32 ], m3
    movdqa  [dstq+48 ], m4
%ifdef m8
    movdqa  [dstq+64 ], m5
    movdqa  [dstq+80 ], m6
    movdqa  [dstq+96 ], m7
    movdqa  [dstq+112], m8
    add      srcq, 128
    add      dstq, 128
    sub      lend, 32
%else
    add      srcq, 64
    add      dstq, 64
    sub      lend, 16
%endif
    ja .loop
    REP_RET
  ;------------------------------------------------------------------------------
; int ff_ac3_compute_mantissa_size(uint16_t mant_cnt[6][16])
;------------------------------------------------------------------------------

; PHADDD4 src, tmp
; Horizontal add of the 4 dwords of an xmm register; the total is left in the
; low dword of src. tmp is clobbered.
%macro PHADDD4 2 ; xmm src, xmm tmp
    movhlps  %2, %1                     ; tmp.lo = src.hi
    paddd    %1, %2                     ; dwords 0,1 += dwords 2,3
    pshufd   %2, %1, 0x1                ; tmp[0] = src[1]
    paddd    %1, %2                     ; dword 0 = grand total
%endmacro
  ; Two partial bit counts are computed and added together:
;  1) the word-wise sum of all six 32-byte rows of mant_cnt, multiplied
;     element by element with ac3_bap_bits and summed;
;  2) the four words at byte offset 2 of every row, scaled with pmulhuw by
;     pw_bap_mul1 (x*21846>>16, x*21846>>16, 0, x*32768>>16), accumulated over
;     the rows, then multiplied by pw_bap_mul2 (5, 7, 0, 7) and summed.
INIT_XMM sse2
cglobal ac3_compute_mantissa_size, 1, 2, 4, mant_cnt, sum
    ; --- part 1: sum the rows, then dot product with ac3_bap_bits
    movdqa      m0, [mant_cntq      ]
    movdqa      m1, [mant_cntq+ 1*16]
    paddw       m0, [mant_cntq+ 2*16]
    paddw       m1, [mant_cntq+ 3*16]
    paddw       m0, [mant_cntq+ 4*16]
    paddw       m1, [mant_cntq+ 5*16]
    paddw       m0, [mant_cntq+ 6*16]
    paddw       m1, [mant_cntq+ 7*16]
    paddw       m0, [mant_cntq+ 8*16]
    paddw       m1, [mant_cntq+ 9*16]
    paddw       m0, [mant_cntq+10*16]
    paddw       m1, [mant_cntq+11*16]
    pmaddwd     m0, [ac3_bap_bits   ]
    pmaddwd     m1, [ac3_bap_bits+16]
    paddd       m0, m1
    PHADDD4     m0, m1
    movd      sumd, m0
    ; --- part 2: two rows per register, entries 1-4 of each row
    movdqa      m3, [pw_bap_mul1]
    movhpd      m0, [mant_cntq     +2]
    movlpd      m0, [mant_cntq+1*32+2]
    movhpd      m1, [mant_cntq+2*32+2]
    movlpd      m1, [mant_cntq+3*32+2]
    movhpd      m2, [mant_cntq+4*32+2]
    movlpd      m2, [mant_cntq+5*32+2]
    pmulhuw     m0, m3
    pmulhuw     m1, m3
    pmulhuw     m2, m3
    paddusw     m0, m1
    paddusw     m0, m2
    pmaddwd     m0, [pw_bap_mul2]
    PHADDD4     m0, m1
    movd       eax, m0
    add        eax, sumd                ; return part 1 + part 2
    RET
  ;------------------------------------------------------------------------------
; void ff_ac3_extract_exponents(uint8_t *exp, int32_t *coef, int nb_coefs)
;------------------------------------------------------------------------------

; PABSD src/dst [, tmp]
; Packed 32-bit absolute value, in place. tmp is only needed (and clobbered)
; when the pabsd instruction is not available.
%macro PABSD 1-2 ; src/dst, tmp
%if notcpuflag(ssse3)
    pxor     %2, %2
    pcmpgtd  %2, %1                     ; tmp = all ones where src < 0
    pxor     %1, %2                     ; conditional bitwise not ...
    psubd    %1, %2                     ; ... plus one: two's complement negate
%else ; ssse3: tmp is unused
    pabsd    %1, %1
%endif
%endmacro
  ; 3DNow! version of ff_ac3_extract_exponents(): 4 coefficients per iteration,
; handled as two MMX registers of 2 dwords each.
;
; For each coefficient: v = (|coef| << 1) | 1 is converted to float, the float
; is shifted right by 23 to expose its biased exponent, and the result is
; 151 - biased_exponent (151 = 127 + 24), saturated to an unsigned byte by the
; pack instructions.
;
; len is a signed int that is used as a 64-bit index, so it is sign-extended
; first: on x86-64 the upper 32 bits of the argument register are undefined.
%if HAVE_AMD3DNOW_EXTERNAL
INIT_MMX 3dnow
cglobal ac3_extract_exponents, 3, 3, 0, exp, coef, len
    movsxdifnidn lenq, lend
    add       expq, lenq                ; point both arrays at their end ...
    lea      coefq, [coefq+4*lenq]
    neg       lenq                      ; ... and count up from -len to 0
    movq        m3, [pd_1]
    movq        m4, [pd_151]
.loop:
    movq        m0, [coefq+4*lenq  ]
    movq        m1, [coefq+4*lenq+8]
    PABSD       m0, m2
    PABSD       m1, m2
    ; coefs 0-1
    pslld       m0, 1
    por         m0, m3
    pi2fd       m2, m0
    psrld       m2, 23
    movq        m0, m4
    psubd       m0, m2
    ; coefs 2-3
    pslld       m1, 1
    por         m1, m3
    pi2fd       m2, m1
    psrld       m2, 23
    movq        m1, m4
    psubd       m1, m2
    ; dword -> byte with saturation, then gather the 4 bytes in the low dword
    packssdw    m0, m0
    packuswb    m0, m0
    packssdw    m1, m1
    packuswb    m1, m1
    punpcklwd   m0, m1
    movd  [expq+lenq], m0
    add       lenq, 4
    jl .loop
    REP_RET
%endif
  ; AC3_EXTRACT_EXPONENTS
; Emits ff_ac3_extract_exponents for the current xmm instruction set;
; 4 coefficients are processed per iteration.
;
; len is a signed int that is used as a 64-bit index, so it is sign-extended
; first: on x86-64 the upper 32 bits of the argument register are undefined.
%macro AC3_EXTRACT_EXPONENTS 0
cglobal ac3_extract_exponents, 3, 3, 4, exp, coef, len
    movsxdifnidn lenq, lend
    add       expq, lenq                ; point both arrays at their end ...
    lea      coefq, [coefq+4*lenq]
    neg       lenq                      ; ... and count up from -len to 0
    mova        m2, [pd_1]
    mova        m3, [pd_151]
.loop:
    ; move 4 32-bit coefs to xmm0
    mova        m0, [coefq+4*lenq]
    ; absolute value
    PABSD       m0, m1
    ; convert to float and extract exponents
    pslld       m0, 1
    por         m0, m2                  ; force a non-zero value
    cvtdq2ps    m1, m0
    psrld       m1, 23                  ; biased float exponent
    mova        m0, m3
    psubd       m0, m1                  ; 151 - biased exponent
    ; move the lowest byte in each of 4 dwords to the low dword
    ; NOTE: We cannot just extract the low bytes with pshufb because the dword
    ;       result for 16777215 is -1 due to float inaccuracy. Using packuswb
    ;       clips this to 0, which is the correct exponent.
    packssdw    m0, m0
    packuswb    m0, m0
    movd  [expq+lenq], m0
    add       lenq, 4
    jl .loop
    REP_RET
%endmacro

%if HAVE_SSE2_EXTERNAL
INIT_XMM sse2
AC3_EXTRACT_EXPONENTS
%endif
%if HAVE_SSSE3_EXTERNAL
INIT_XMM ssse3
AC3_EXTRACT_EXPONENTS
%endif
  425. %endif