You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

553 lines
15KB

  1. ;*****************************************************************************
  2. ;* x86-optimized AC-3 DSP functions
  3. ;* Copyright (c) 2011 Justin Ruggles
  4. ;*
  5. ;* This file is part of Libav.
  6. ;*
  7. ;* Libav is free software; you can redistribute it and/or
  8. ;* modify it under the terms of the GNU Lesser General Public
  9. ;* License as published by the Free Software Foundation; either
  10. ;* version 2.1 of the License, or (at your option) any later version.
  11. ;*
  12. ;* Libav is distributed in the hope that it will be useful,
  13. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. ;* Lesser General Public License for more details.
  16. ;*
  17. ;* You should have received a copy of the GNU Lesser General Public
  18. ;* License along with Libav; if not, write to the Free Software
  19. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. ;******************************************************************************
  21. %include "libavutil/x86/x86util.asm"
  SECTION_RODATA

  ; 16777216.0f (2^24) in every lane: the scale factor applied by
  ; ff_float_to_fixed24() before the float -> int conversion.
  pf_1_24:     times 4 dd 0x4B800000

  ; Tables for ff_ac3_compute_mantissa_size().
  ; ac3_bap_bits is defined outside this file.
  cextern ac3_bap_bits
  ; pw_bap_mul1 holds pmulhuw factors (21846/65536 ~ 1/3, 32768/65536 = 1/2);
  ; pw_bap_mul2 holds the pmaddwd weights applied afterwards. Both repeat the
  ; same 4-word pattern for the two 64-bit halves of a register.
  pw_bap_mul1: times 2 dw 21846, 21846, 0, 32768
  pw_bap_mul2: times 2 dw 5, 7, 0, 7

  ; Constants for ff_ac3_extract_exponents().
  pd_1:        times 4 dd 1
  pd_151:      times 4 dd 151

  ; Constants for ff_apply_window_int16():
  ; byte-shuffle mask that reverses the order of 8 words, and the rounding
  ; term (1 << 14) added before the >> 15 in the 32-bit path.
  pb_revwords: SHUFFLE_MASK_W 7, 6, 5, 4, 3, 2, 1, 0
  pd_16384:    times 4 dd 16384
  35. SECTION .text
  36. ;-----------------------------------------------------------------------------
  37. ; void ff_ac3_exponent_min(uint8_t *exp, int num_reuse_blocks, int nb_coefs)
  38. ;-----------------------------------------------------------------------------
  ; Emits ff_ac3_exponent_min for the instruction set chosen by the preceding
  ; INIT_* line. For each group of mmsize bytes in exp[] it stores the byte-wise
  ; unsigned minimum over the blocks at offsets num_reuse_blocks*256 down to 0,
  ; then advances until nb_coefs bytes have been covered. Does nothing when
  ; num_reuse_blocks is 0.
  ; LOOP_ALIGN must be defined (possibly empty) when the macro is expanded.
  %macro AC3_EXPONENT_MIN 0
  cglobal ac3_exponent_min, 3, 4, 2, exp, reuse_blks, expn, blkoff
      shl         reuse_blksq, 8          ; blocks -> byte offset (256 bytes apart)
      jz          .done                   ; zero reuse blocks: leave exp[] as is
      LOOP_ALIGN
  .next_column:
      mov         blkoffq, reuse_blksq
      mova        m0, [expq+blkoffq]      ; start from the farthest block
      sub         blkoffq, 256
      LOOP_ALIGN
  .next_block:
      PMINUB      m0, [expq+blkoffq], m1  ; m1 is the temporary handed to PMINUB
      sub         blkoffq, 256
      jae         .next_block             ; borrow means offset 0 was just merged
      mova        [expq], m0
      add         expq, mmsize
      sub         expnq, mmsize
      jg          .next_column
  .done:
      REP_RET
  %endmacro

  ; mmx build: no loop alignment.
  %define LOOP_ALIGN
  INIT_MMX mmx
  AC3_EXPONENT_MIN
  ; mmxext build: loop heads aligned to 16 bytes.
  %if HAVE_MMXEXT_EXTERNAL
  %define LOOP_ALIGN ALIGN 16
  INIT_MMX mmxext
  AC3_EXPONENT_MIN
  %endif
  ; sse2 build.
  ; NOTE(review): this expansion uses whichever LOOP_ALIGN was defined last, so
  ; it is only aligned when the mmxext block above was assembled.
  %if HAVE_SSE2_EXTERNAL
  INIT_XMM sse2
  AC3_EXPONENT_MIN
  %endif
  %undef LOOP_ALIGN
  73. ;-----------------------------------------------------------------------------
  74. ; int ff_ac3_max_msb_abs_int16(const int16_t *src, int len)
  75. ;
  76. ; This function uses 2 different methods to calculate a valid result.
  77. ; 1) logical 'or' of abs of each element
  78. ; This is used for ssse3 because of the pabsw instruction.
  79. ; It is also used for mmx because of the lack of min/max instructions.
  80. ; 2) calculate min/max for the array, then or(abs(min),abs(max))
  81. ; This is used for mmxext and sse2 because they have pminsw/pmaxsw.
  82. ;-----------------------------------------------------------------------------
  ; Horizontal OR: combines all 4 (mmx) or 8 (xmm) words of %1 so that the low
  ; word of %1 holds their logical 'or'. %2 is clobbered as scratch; the other
  ; words of %1 are left with intermediate values.
  %macro OR_WORDS_HORIZ 2 ; src, tmp
  %if cpuflag(sse2)
      ; fold the high 64 bits onto the low 64 bits
      movhlps     %2, %1
      por         %1, %2
      ; then fold within the low 64 bits, same steps as the mmxext branch
      pshuflw     %2, %1, q0032
      por         %1, %2
      pshuflw     %2, %1, q0001
      por         %1, %2
  %elif cpuflag(mmxext)
      pshufw      %2, %1, q0032
      por         %1, %2
      pshufw      %2, %1, q0001
      por         %1, %2
  %else ; mmx
      ; no word shuffle available: shift a copy right by 32, then by 16 bits
      movq        %2, %1
      psrlq       %2, 32
      por         %1, %2
      movq        %2, %1
      psrlq       %2, 16
      por         %1, %2
  %endif
  %endmacro
  ; Emits ff_ac3_max_msb_abs_int16 using the method named by %1:
  ;   min_max - track the signed minimum (m2) and maximum (m3) of the input and
  ;             'or' their absolute values after the loop
  ;   or_abs  - 'or' the absolute value of every element into m2
  ; Each iteration consumes two registers' worth of input (mmsize words).
  ; The result is returned in eax, masked to 16 bits.
  %macro AC3_MAX_MSB_ABS_INT16 1
  cglobal ac3_max_msb_abs_int16, 2,2,5, src, len
      pxor        m2, m2
      pxor        m3, m3
  .next_pair:
  %ifidn %1, min_max
      mova        m0, [srcq]
      mova        m1, [srcq+mmsize]
      pminsw      m2, m0
      pminsw      m2, m1
      pmaxsw      m3, m0
      pmaxsw      m3, m1
  %else ; or_abs
  %if notcpuflag(ssse3)
      mova        m0, [srcq]
      mova        m1, [srcq+mmsize]
      ABS2        m0, m1, m3, m4      ; m3/m4 are passed as ABS2's temporaries
  %else ; ssse3
      ; using memory args is faster for ssse3
      pabsw       m0, [srcq]
      pabsw       m1, [srcq+mmsize]
  %endif
      por         m2, m0
      por         m2, m1
  %endif
      add         srcq, mmsize*2
      sub         lend, mmsize
      ja          .next_pair
  %ifidn %1, min_max
      ; combine |min| and |max| into a single register
      ABS2        m2, m3, m0, m1
      por         m2, m3
  %endif
      OR_WORDS_HORIZ m2, m0
      movd        eax, m2
      and         eax, 0xFFFF         ; only the low word is the result
      RET
  %endmacro

  ; mmx has no pminsw/pmaxsw, so it uses the or_abs method.
  INIT_MMX mmx
  AC3_MAX_MSB_ABS_INT16 or_abs
  ; mmxext and sse2 use the min/max method.
  INIT_MMX mmxext
  AC3_MAX_MSB_ABS_INT16 min_max
  INIT_XMM sse2
  AC3_MAX_MSB_ABS_INT16 min_max
  ; ssse3 has pabsw, so it uses the or_abs method.
  INIT_XMM ssse3
  AC3_MAX_MSB_ABS_INT16 or_abs
  151. ;-----------------------------------------------------------------------------
  152. ; macro used for ff_ac3_lshift_int16() and ff_ac3_rshift_int32()
  153. ;-----------------------------------------------------------------------------
  ; Emits ff_ac3_<%1>shift_int<%2>: shifts every element of src[] in place by
  ; `shift` bits using the packed shift instruction %3.
  ;   %1 - direction letter used in the function name (l or r)
  ;   %2 - element width in bits (16 or 32)
  ;   %3 - packed shift instruction applied to each register
  ; The instruction set comes from the preceding INIT_* line, not from a macro
  ; parameter. Each iteration processes four registers (mmsize*4 bytes), which
  ; is mmsize*32/%2 elements.
  %macro AC3_SHIFT 3 ; l/r, 16/32, shift instruction
  cglobal ac3_%1shift_int%2, 3, 3, 5, src, len, shift
      movd        m0, shiftd              ; shift count shared by all lanes
  .loop:
      mova        m1, [srcq         ]
      mova        m2, [srcq+mmsize  ]
      mova        m3, [srcq+mmsize*2]
      mova        m4, [srcq+mmsize*3]
      %3          m1, m0
      %3          m2, m0
      %3          m3, m0
      %3          m4, m0
      mova        [srcq         ], m1
      mova        [srcq+mmsize  ], m2
      mova        [srcq+mmsize*2], m3
      mova        [srcq+mmsize*3], m4
      add         srcq, mmsize*4
      sub         lend, mmsize*32/%2      ; elements handled per iteration
      ja          .loop
      REP_RET
  %endmacro
  176. ;-----------------------------------------------------------------------------
  177. ; void ff_ac3_lshift_int16(int16_t *src, unsigned int len, unsigned int shift)
  178. ;-----------------------------------------------------------------------------
  ; 16-bit left shift (psllw): mmx and sse2 builds
  INIT_MMX mmx
  AC3_SHIFT l, 16, psllw
  INIT_XMM sse2
  AC3_SHIFT l, 16, psllw

  ;-----------------------------------------------------------------------------
  ; void ff_ac3_rshift_int32(int32_t *src, unsigned int len, unsigned int shift)
  ;-----------------------------------------------------------------------------
  ; 32-bit arithmetic right shift (psrad): mmx and sse2 builds
  INIT_MMX mmx
  AC3_SHIFT r, 32, psrad
  INIT_XMM sse2
  AC3_SHIFT r, 32, psrad
  190. ;-----------------------------------------------------------------------------
  191. ; void ff_float_to_fixed24(int32_t *dst, const float *src, unsigned int len)
  192. ;-----------------------------------------------------------------------------
  ; 3DNow! build: multiplies each float by 2^24 and converts it to int32,
  ; eight values per iteration. It is not bit-identical to the other versions
  ; because pf2id truncates rather than rounding to nearest.
  INIT_MMX 3dnow
  cglobal float_to_fixed24, 3, 3, 0, dst, src, len
      movq        m0, [pf_1_24]           ; scale factor in both lanes
  .next8:
      movq        m1, [srcq   ]
      movq        m2, [srcq+8 ]
      movq        m3, [srcq+16]
      movq        m4, [srcq+24]
      pfmul       m1, m0
      pfmul       m2, m0
      pfmul       m3, m0
      pfmul       m4, m0
      pf2id       m1, m1
      pf2id       m2, m2
      pf2id       m3, m3
      pf2id       m4, m4
      movq        [dstq   ], m1
      movq        [dstq+8 ], m2
      movq        [dstq+16], m3
      movq        [dstq+24], m4
      add         srcq, 32
      add         dstq, 32
      sub         lend, 8
      ja          .next8
      femms                               ; leave MMX/3DNow! state before returning
      RET
  ; SSE build: scales eight floats by 2^24 in two xmm registers, then converts
  ; them two at a time into mmx registers with cvtps2pi (movhlps brings the
  ; upper pair of each xmm register down for the second conversion).
  INIT_XMM sse
  cglobal float_to_fixed24, 3, 3, 3, dst, src, len
      movaps      m0, [pf_1_24]
  .next8:
      movaps      m1, [srcq   ]
      movaps      m2, [srcq+16]
      mulps       m1, m0
      mulps       m2, m0
      cvtps2pi    mm0, m1                 ; floats 0-1
      movhlps     m1, m1
      cvtps2pi    mm1, m1                 ; floats 2-3
      cvtps2pi    mm2, m2                 ; floats 4-5
      movhlps     m2, m2
      cvtps2pi    mm3, m2                 ; floats 6-7
      movq        [dstq   ], mm0
      movq        [dstq+ 8], mm1
      movq        [dstq+16], mm2
      movq        [dstq+24], mm3
      add         srcq, 32
      add         dstq, 32
      sub         lend, 8
      ja          .next8
      emms                                ; mmx registers were used for the results
      RET
  ; SSE2 build: scales floats by 2^24 and converts them with cvtps2dq.
  ; When m8 is defined (extra xmm registers available) each iteration handles
  ; 32 floats in m1-m8; otherwise it handles 16 floats in m1-m4.
  ;
  ; len is declared as unsigned int, so the counter is updated through lend.
  ; The upper half of the 64-bit argument register is not guaranteed to be
  ; zero, and using lenq could make the loop overrun. The 3dnow and sse
  ; versions of this function also count with lend.
  INIT_XMM sse2
  cglobal float_to_fixed24, 3, 3, 9, dst, src, len
      movaps      m0, [pf_1_24]
  .loop:
      movaps      m1, [srcq    ]
      movaps      m2, [srcq+16 ]
      movaps      m3, [srcq+32 ]
      movaps      m4, [srcq+48 ]
  %ifdef m8
      movaps      m5, [srcq+64 ]
      movaps      m6, [srcq+80 ]
      movaps      m7, [srcq+96 ]
      movaps      m8, [srcq+112]
  %endif
      mulps       m1, m0
      mulps       m2, m0
      mulps       m3, m0
      mulps       m4, m0
  %ifdef m8
      mulps       m5, m0
      mulps       m6, m0
      mulps       m7, m0
      mulps       m8, m0
  %endif
      cvtps2dq    m1, m1
      cvtps2dq    m2, m2
      cvtps2dq    m3, m3
      cvtps2dq    m4, m4
  %ifdef m8
      cvtps2dq    m5, m5
      cvtps2dq    m6, m6
      cvtps2dq    m7, m7
      cvtps2dq    m8, m8
  %endif
      movdqa      [dstq    ], m1
      movdqa      [dstq+16 ], m2
      movdqa      [dstq+32 ], m3
      movdqa      [dstq+48 ], m4
  %ifdef m8
      movdqa      [dstq+64 ], m5
      movdqa      [dstq+80 ], m6
      movdqa      [dstq+96 ], m7
      movdqa      [dstq+112], m8
      add         srcq, 128
      add         dstq, 128
      sub         lend, 32                ; 8 registers * 4 floats
  %else
      add         srcq, 64
      add         dstq, 64
      sub         lend, 16                ; 4 registers * 4 floats
  %endif
      ja          .loop
      REP_RET
  298. ;------------------------------------------------------------------------------
  299. ; int ff_ac3_compute_mantissa_size(uint16_t mant_cnt[6][16])
  300. ;------------------------------------------------------------------------------
  ; Horizontal add: leaves the sum of the four dwords of %1 in its low dword.
  ; %2 is clobbered as scratch.
  %macro PHADDD4 2 ; xmm src, xmm tmp
      movhlps     %2, %1                  ; bring dwords 2-3 down next to 0-1
      paddd       %1, %2
      pshufd      %2, %1, 0x1             ; bring dword 1 down to dword 0
      paddd       %1, %2
  %endmacro
  ; Computes the mantissa size from mant_cnt[6][16] (uint16_t, 32 bytes per
  ; block) in two parts that are added together in eax:
  ;   1) the 16 counts are summed across the six blocks and multiplied by
  ;      ac3_bap_bits;
  ;   2) the four counts starting 2 bytes into each block are scaled per block
  ;      with pw_bap_mul1 (pmulhuw), added with saturation across blocks and
  ;      weighted with pw_bap_mul2.
  INIT_XMM sse2
  cglobal ac3_compute_mantissa_size, 1, 2, 4, mant_cnt, sum
      ; part 1: m0 accumulates the first 8 counts of each block, m1 the last 8
      movdqa      m0, [mant_cntq+0*32   ]
      movdqa      m1, [mant_cntq+0*32+16]
      paddw       m0, [mant_cntq+1*32   ]
      paddw       m1, [mant_cntq+1*32+16]
      paddw       m0, [mant_cntq+2*32   ]
      paddw       m1, [mant_cntq+2*32+16]
      paddw       m0, [mant_cntq+3*32   ]
      paddw       m1, [mant_cntq+3*32+16]
      paddw       m0, [mant_cntq+4*32   ]
      paddw       m1, [mant_cntq+4*32+16]
      paddw       m0, [mant_cntq+5*32   ]
      paddw       m1, [mant_cntq+5*32+16]
      pmaddwd     m0, [ac3_bap_bits   ]
      pmaddwd     m1, [ac3_bap_bits+16]
      paddd       m0, m1
      PHADDD4     m0, m1
      movd        sumd, m0
      ; part 2: two blocks per register, 4 words each, skipping word 0
      movdqa      m3, [pw_bap_mul1]
      movhpd      m0, [mant_cntq+0*32+2]
      movlpd      m0, [mant_cntq+1*32+2]
      movhpd      m1, [mant_cntq+2*32+2]
      movlpd      m1, [mant_cntq+3*32+2]
      movhpd      m2, [mant_cntq+4*32+2]
      movlpd      m2, [mant_cntq+5*32+2]
      pmulhuw     m0, m3
      pmulhuw     m1, m3
      pmulhuw     m2, m3
      paddusw     m0, m1
      paddusw     m0, m2
      pmaddwd     m0, [pw_bap_mul2]
      PHADDD4     m0, m1
      movd        eax, m0
      add         eax, sumd               ; total of both parts
      RET
  343. ;------------------------------------------------------------------------------
  344. ; void ff_ac3_extract_exponents(uint8_t *exp, int32_t *coef, int nb_coefs)
  345. ;------------------------------------------------------------------------------
  ; Packed absolute value of the dwords in %1, in place.
  ; With ssse3 the pabsd instruction is used and %2 is ignored; otherwise %2 is
  ; clobbered to build a sign mask.
  %macro PABSD 1-2 ; src/dst, unused
  %if cpuflag(ssse3)
      pabsd       %1, %1
  %else ; src/dst, tmp
      pxor        %2, %2
      pcmpgtd     %2, %1                  ; %2 = all ones where %1 is negative
      pxor        %1, %2                  ; invert the negative lanes ...
      psubd       %1, %2                  ; ... and add 1 to finish the negation
  %endif
  %endmacro
  ; Emits ff_ac3_extract_exponents: writes one exponent byte to exp[] for each
  ; 32-bit coefficient in coef[], four coefficients per iteration.
  ; Both pointers are advanced to the end of their arrays and len is negated,
  ; so a single index runs from -nb_coefs up to 0.
  %macro AC3_EXTRACT_EXPONENTS 0
  cglobal ac3_extract_exponents, 3, 3, 4, exp, coef, len
      add         expq, lenq
      lea         coefq, [coefq+4*lenq]
      neg         lenq
      mova        m2, [pd_1]
      mova        m3, [pd_151]
  .next4:
      ; load 4 coefficients and take their absolute value
      mova        m0, [coefq+4*lenq]
      PABSD       m0, m1
      ; form 2*|coef|+1, convert it to float and isolate the exponent field
      pslld       m0, 1
      por         m0, m2
      cvtdq2ps    m1, m0
      psrld       m1, 23
      ; exponent = 151 - float exponent field
      mova        m0, m3
      psubd       m0, m1
      ; Narrow the 4 dwords to 4 bytes in the low dword.
      ; NOTE: pshufb cannot simply pick the low bytes because the dword result
      ;       for 16777215 is -1 due to float inaccuracy. packuswb clips that
      ;       to 0, which is the correct exponent.
      packssdw    m0, m0
      packuswb    m0, m0
      movd        [expq+lenq], m0
      add         lenq, 4
      jl          .next4
      REP_RET
  %endmacro

  ; sse2 build (PABSD emulated)
  %if HAVE_SSE2_EXTERNAL
  INIT_XMM sse2
  AC3_EXTRACT_EXPONENTS
  %endif
  ; ssse3 build (PABSD uses pabsd)
  %if HAVE_SSSE3_EXTERNAL
  INIT_XMM ssse3
  AC3_EXTRACT_EXPONENTS
  %endif
  394. ;-----------------------------------------------------------------------------
  395. ; void ff_apply_window_int16(int16_t *output, const int16_t *input,
  396. ; const int16_t *window, unsigned int len)
  397. ;-----------------------------------------------------------------------------
  ; Reverses the order of the words in %1, in place.
  ;   ssse3 (not atom): one pshufb with the mask register given as %2
  ;   sse2:             reverse each 64-bit half, then swap the halves
  ;   mmxext:           one pshufw
  ; %2 is only read in the pshufb branch. No code is emitted for plain mmx.
  %macro REVERSE_WORDS 1-2
  %if cpuflag(ssse3) && notcpuflag(atom)
      pshufb      %1, %2
  %elif cpuflag(sse2)
      pshuflw     %1, %1, 0x1B
      pshufhw     %1, %1, 0x1B
      pshufd      %1, %1, 0x4E
  %elif cpuflag(mmxext)
      pshufw      %1, %1, 0x1B
  %endif
  %endmacro
  ; Packed 16-bit fixed-point multiply of %1 by %2, result in %1.
  ;   ssse3:  dst = ((dst * src) + (1<<14)) >> 15, via pmulhrsw; %3 is unused
  ;   mmxext: dst = (dst * src) >> 15 without rounding; %3 is clobbered
  %macro MUL16FIXED 3
  %if cpuflag(ssse3) ; dst, src, unused
      pmulhrsw    %1, %2
  %elif cpuflag(mmxext) ; dst, src, temp
      ; pmulhw yields bits 16-31 of the product, i.e. it drops one bit too
      ; many. Shift that left by 1 and take the missing bit (bit 15) from the
      ; pmullw result.
      mova        %3, %1
      pmulhw      %1, %2
      pmullw      %3, %2
      psrlw       %3, 15
      psllw       %1, 1
      por         %1, %3
  %endif
  %endmacro
  ; Emits one build of the int16 windowing function.
  ;   %1 != 0 -> apply_window_int16       (bitexact version)
  ;   %1 == 0 -> apply_window_int16_round
  ; Two registers are produced per iteration: one at offset2 (moving down) and
  ; one at offset (moving up). Both use the window register loaded at offset2;
  ; for the upper output its word order is reversed first.
  ; m5 holds pb_revwords for ssse3 (not atom), or pd_16384 when %1 is set.
  ; NOTE(review): offsetq is read by the first lea before any 32-bit write to
  ; it, so this relies on the 4th argument arriving zero-extended. Confirm
  ; against the C prototype and callers.
  %macro APPLY_WINDOW_INT16 1 ; %1 bitexact version
  %if %1
  cglobal apply_window_int16, 4,5,6, output, input, window, offset, offset2
  %else
  cglobal apply_window_int16_round, 4,5,6, output, input, window, offset, offset2
  %endif
      lea         offset2q, [offsetq-mmsize]
  %if cpuflag(ssse3) && notcpuflag(atom)
      mova        m5, [pb_revwords]
      ALIGN 16
  %elif %1
      mova        m5, [pd_16384]
  %endif
  .next_pair:
  %if cpuflag(ssse3)
      ; 16x16->16 multiplication done in place with pmulhrsw, without expanding
      ; to 32-bit. The ssse3 version is bit-identical.
      mova        m0, [windowq+offset2q]
      mova        m1, [ inputq+offset2q]
      pmulhrsw    m1, m0
      REVERSE_WORDS m0, m5
      pmulhrsw    m0, [ inputq+offsetq ]
      mova        [outputq+offset2q], m1
      mova        [outputq+offsetq ], m0
  %elif %1
      ; Expands 16-bit to 32-bit, multiplies by the window, adds 16384 for
      ; rounding, shifts right by 15 and packs back to words. Interleaving the
      ; window words into a zeroed register makes each pmaddwd pair evaluate to
      ; 0*x + window*input, so the stale contents of m1 do not matter.
      ; -- lower half --
      mova        m3, [windowq+offset2q]
      mova        m4, [ inputq+offset2q]
      pxor        m0, m0
      punpcklwd   m0, m3
      punpcklwd   m1, m4
      pmaddwd     m0, m1
      paddd       m0, m5
      psrad       m0, 15
      pxor        m2, m2
      punpckhwd   m2, m3
      punpckhwd   m1, m4
      pmaddwd     m2, m1
      paddd       m2, m5
      psrad       m2, 15
      packssdw    m0, m2
      mova        [outputq+offset2q], m0
      ; -- upper half, with the window reversed --
      REVERSE_WORDS m3
      mova        m4, [ inputq+offsetq]
      pxor        m0, m0
      punpcklwd   m0, m3
      punpcklwd   m1, m4
      pmaddwd     m0, m1
      paddd       m0, m5
      psrad       m0, 15
      pxor        m2, m2
      punpckhwd   m2, m3
      punpckhwd   m1, m4
      pmaddwd     m2, m1
      paddd       m2, m5
      psrad       m2, 15
      packssdw    m0, m2
      mova        [outputq+offsetq], m0
  %else
      ; 16x16->16 multiplication done in place without expanding to 32-bit.
      ; The mmxext and sse2 versions do not use rounding, and therefore are not
      ; bit-identical to the C version.
      mova        m0, [windowq+offset2q]
      mova        m1, [ inputq+offset2q]
      mova        m2, [ inputq+offsetq ]
      MUL16FIXED  m1, m0, m3
      REVERSE_WORDS m0
      MUL16FIXED  m2, m0, m3
      mova        [outputq+offset2q], m1
      mova        [outputq+offsetq ], m2
  %endif
      add         offsetd, mmsize
      sub         offset2d, mmsize
      jae         .next_pair              ; stop once offset2 goes below 0
      REP_RET
  %endmacro

  ; apply_window_int16_round builds
  INIT_MMX mmxext
  APPLY_WINDOW_INT16 0
  INIT_XMM sse2
  APPLY_WINDOW_INT16 0
  ; apply_window_int16 builds
  INIT_MMX mmxext
  APPLY_WINDOW_INT16 1
  INIT_XMM sse2
  APPLY_WINDOW_INT16 1
  INIT_XMM ssse3
  APPLY_WINDOW_INT16 1
  INIT_XMM ssse3, atom
  APPLY_WINDOW_INT16 1
  514. APPLY_WINDOW_INT16 1