You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

451 lines
12KB

  1. ;*****************************************************************************
  2. ;* x86-optimized AC-3 DSP utils
  3. ;* Copyright (c) 2011 Justin Ruggles
  4. ;*
  5. ;* This file is part of FFmpeg.
  6. ;*
  7. ;* FFmpeg is free software; you can redistribute it and/or
  8. ;* modify it under the terms of the GNU Lesser General Public
  9. ;* License as published by the Free Software Foundation; either
  10. ;* version 2.1 of the License, or (at your option) any later version.
  11. ;*
  12. ;* FFmpeg is distributed in the hope that it will be useful,
  13. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. ;* Lesser General Public License for more details.
  16. ;*
  17. ;* You should have received a copy of the GNU Lesser General Public
  18. ;* License along with FFmpeg; if not, write to the Free Software
  19. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. ;******************************************************************************
  21. %include "x86inc.asm"
  22. %include "x86util.asm"
  SECTION_RODATA

  ; ff_float_to_fixed24(): 16777216.0f (2^24) in every lane.
  pf_1_24:        times 4 dd 0x4B800000

  ; ff_ac3_compute_mantissa_size(): external table plus the two word
  ; multiplier vectors applied to the per-block counts.
  cextern ac3_bap_bits
  pw_bap_mul1:    dw 21846, 21846, 0, 32768, 21846, 21846, 0, 32768
  pw_bap_mul2:    dw     5,     7, 0,     7,     5,     7, 0,     7

  ; ff_ac3_extract_exponents(): dword constants and the byte-gather mask
  ; (bytes 0, 4, 8, 12) used by the pshufb path.
  pd_1:           times 4 dd 1
  pd_151:         times 4 dd 151
  pb_shuf_4dwb:   db 0, 4, 8, 12
  34. SECTION .text
  ;-----------------------------------------------------------------------------
  ; void ff_ac3_exponent_min(uint8_t *exp, int num_reuse_blocks, int nb_coefs)
  ;
  ; For each vector of mmsize bytes at exp, stores the unsigned byte-wise
  ; minimum of that vector and the vectors found 256, 512, ...,
  ; num_reuse_blocks*256 bytes further on.  Returns immediately when
  ; num_reuse_blocks is 0.
  ;
  ; %1 = suffix appended to the exported symbol name.
  ; PMINUB and LOOP_ALIGN must be %define'd before this macro is expanded.
  ;-----------------------------------------------------------------------------
  %macro AC3_EXPONENT_MIN 1
  cglobal ac3_exponent_min_%1, 3,4,2, exp, reuse_blks, expn, blk_off
      shl         reuse_blksq, 8              ; byte offset of the last block
      jz          .done                       ; zero blocks: nothing to merge
      LOOP_ALIGN
  .next_vector:
      mov         blk_offq, reuse_blksq
      mova        m0, [expq+blk_offq]         ; seed with the last block
      sub         blk_offq, 256
      LOOP_ALIGN
  .prev_block:
      PMINUB      m0, [expq+blk_offq], m1     ; m1 is handed over as a temporary
      sub         blk_offq, 256
      jae         .prev_block                 ; stop once the offset borrows below 0
      mova        [expq], m0
      add         expq, mmsize
      sub         expnq, mmsize
      jg          .next_vector
  .done:
      REP_RET
  %endmacro
  ; MMX build: PMINUB_MMX helper, no loop alignment.
  %define PMINUB      PMINUB_MMX
  %define LOOP_ALIGN
  INIT_MMX
  AC3_EXPONENT_MIN mmx

  ; MMXEXT build: switch helper and align the loop heads to 16 bytes.
  %ifdef HAVE_MMX2
    %define PMINUB      PMINUB_MMXEXT
    %define LOOP_ALIGN  ALIGN 16
    AC3_EXPONENT_MIN mmxext
  %endif

  ; SSE2 build: reuses whichever PMINUB / LOOP_ALIGN definitions are in effect.
  %ifdef HAVE_SSE
    INIT_XMM
    AC3_EXPONENT_MIN sse2
  %endif

  %undef PMINUB
  %undef LOOP_ALIGN
  ;-----------------------------------------------------------------------------
  ; int ff_ac3_max_msb_abs_int16(const int16_t *src, int len)
  ;
  ; Returns, in the low 16 bits of eax, a value built by OR-ing together
  ; absolute values derived from the array.  Two methods are used:
  ; 1) logical 'or' of abs of each element
  ;    This is used for ssse3 because of the pabsw instruction.
  ;    It is also used for mmx because of the lack of min/max instructions.
  ; 2) calculate min/max for the array, then or(abs(min),abs(max))
  ;    This is used for mmxext and sse2 because they have pminsw/pmaxsw.
  ;
  ; Each iteration consumes two vectors (mmsize*2 bytes, i.e. mmsize int16
  ; elements) with aligned loads and loops while len stays above 0.
  ;
  ; %1 = instruction-set suffix (mmx, mmxext, sse2, ssse3)
  ; %2 = method (min_max or or_abs)
  ; ABS2 must be %define'd before expansion; PSHUFLW as well for every
  ; variant except mmx.
  ;-----------------------------------------------------------------------------
  %macro AC3_MAX_MSB_ABS_INT16 2
  cglobal ac3_max_msb_abs_int16_%1, 2,2,5, src, len
      pxor        m2, m2                      ; min_max: running min / or_abs: accumulator
      pxor        m3, m3                      ; min_max: running max / mmx or_abs: scratch
  .loop:
  %ifidn %2, min_max
      mova        m0, [srcq]
      mova        m1, [srcq+mmsize]
      pminsw      m2, m0
      pminsw      m2, m1
      pmaxsw      m3, m0
      pmaxsw      m3, m1
  %else ; or_abs
  %ifidn %1, mmx
      mova        m0, [srcq]
      mova        m1, [srcq+mmsize]
      ABS2        m0, m1, m3, m4
  %else ; ssse3
      ; using memory args is faster for ssse3
      pabsw       m0, [srcq]
      pabsw       m1, [srcq+mmsize]
  %endif
      por         m2, m0
      por         m2, m1
  %endif
      add         srcq, mmsize*2
      sub         lend, mmsize
      ja          .loop
  %ifidn %2, min_max
      ABS2        m2, m3, m0, m1
      por         m2, m3
  %endif
      ; Fold every word of m2 into its lowest word.
  %ifidn %1, mmx
      ; pshufw is not part of the base MMX instruction set (it arrived with
      ; SSE / the AMD MMX extensions), so the plain-MMX build folds with
      ; 64-bit logical shifts instead.
      movq        m0, m2
      psrlq       m0, 32
      por         m2, m0                      ; w0 |= w2, w1 |= w3
      movq        m0, m2
      psrlq       m0, 16
      por         m2, m0                      ; w0 |= w1
  %else
  %ifidn mmsize, 16
      movhlps     m0, m2
      por         m2, m0
  %endif
      PSHUFLW     m0, m2, 0xe
      por         m2, m0
      PSHUFLW     m0, m2, 0x1
      por         m2, m0
  %endif
      movd        eax, m2
      and         eax, 0xFFFF
      RET
  %endmacro
  ; 64-bit register variants: word shuffle is pshufw.
  INIT_MMX
  %define ABS2        ABS2_MMX
  %define PSHUFLW     pshufw
  AC3_MAX_MSB_ABS_INT16 mmx, or_abs

  %define ABS2        ABS2_MMX2
  AC3_MAX_MSB_ABS_INT16 mmxext, min_max

  ; 128-bit register variants: word shuffle is pshuflw.
  INIT_XMM
  %define PSHUFLW     pshuflw
  AC3_MAX_MSB_ABS_INT16 sse2, min_max

  %define ABS2        ABS2_SSSE3
  AC3_MAX_MSB_ABS_INT16 ssse3, or_abs
  ;-----------------------------------------------------------------------------
  ; macro used for ff_ac3_lshift_int16() and ff_ac3_rshift_int32()
  ;
  ; %1 = direction letter used in the symbol name (l or r)
  ; %2 = element width in bits (16 or 32)
  ; %3 = packed shift instruction applied to every vector
  ; %4 = instruction-set suffix of the symbol name
  ;
  ; Shifts the buffer in place, four vectors (mmsize*4 bytes, which is
  ; mmsize*32/%2 elements) per iteration, using aligned loads and stores.
  ;-----------------------------------------------------------------------------
  %macro AC3_SHIFT 4 ; l/r, 16/32, shift instruction, instruction set
  cglobal ac3_%1shift_int%2_%4, 3,3,5, src, len, shift
      movd        m0, shiftd                  ; shift count for the packed shifts
  .loop:
      mova        m1, [srcq+mmsize*0]
      mova        m2, [srcq+mmsize*1]
      mova        m3, [srcq+mmsize*2]
      mova        m4, [srcq+mmsize*3]
      %3          m1, m0
      %3          m2, m0
      %3          m3, m0
      %3          m4, m0
      mova        [srcq+mmsize*0], m1
      mova        [srcq+mmsize*1], m2
      mova        [srcq+mmsize*2], m3
      mova        [srcq+mmsize*3], m4
      add         srcq, mmsize*4
      sub         lend, mmsize*32/%2          ; elements handled this iteration
      ja          .loop
      REP_RET
  %endmacro
  ;-----------------------------------------------------------------------------
  ; void ff_ac3_lshift_int16(int16_t *src, unsigned int len, unsigned int shift)
  ;
  ; In-place left shift of 16-bit words (psllw).
  ;-----------------------------------------------------------------------------
  INIT_MMX
  AC3_SHIFT l, 16, psllw, mmx

  INIT_XMM
  AC3_SHIFT l, 16, psllw, sse2

  ;-----------------------------------------------------------------------------
  ; void ff_ac3_rshift_int32(int32_t *src, unsigned int len, unsigned int shift)
  ;
  ; In-place arithmetic right shift of 32-bit dwords (psrad).
  ;-----------------------------------------------------------------------------
  INIT_MMX
  AC3_SHIFT r, 32, psrad, mmx

  INIT_XMM
  AC3_SHIFT r, 32, psrad, sse2
  ;-----------------------------------------------------------------------------
  ; void ff_float_to_fixed24(int32_t *dst, const float *src, unsigned int len)
  ;
  ; Multiplies each float by 2^24 (pf_1_24) and stores the result as int32.
  ; Eight floats (32 bytes) are converted per iteration.
  ;-----------------------------------------------------------------------------
  ; The 3DNow! version is not bit-identical because pf2id uses truncation rather
  ; than round-to-nearest.
  INIT_MMX
  cglobal float_to_fixed24_3dnow, 3,3,0, dst, src, len
      movq        m0, [pf_1_24]
  .loop:
      movq        m1, [srcq   ]
      movq        m2, [srcq+8 ]
      movq        m3, [srcq+16]
      movq        m4, [srcq+24]
      pfmul       m1, m0
      pfmul       m2, m0
      pfmul       m3, m0
      pfmul       m4, m0
      pf2id       m1, m1
      pf2id       m2, m2
      pf2id       m3, m3
      pf2id       m4, m4
      movq        [dstq   ], m1
      movq        [dstq+8 ], m2
      movq        [dstq+16], m3
      movq        [dstq+24], m4
      add         srcq, 32
      add         dstq, 32
      sub         lend, 8
      ja          .loop
      ; The MMX registers alias the x87 stack; clear the MMX state so the
      ; caller can run x87 floating-point code after this returns.
      femms
      RET
  ; SSE version of ff_float_to_fixed24(): multiplies by 2^24 in XMM registers,
  ; then converts two floats at a time into MMX registers with cvtps2pi.
  ; Eight floats (32 bytes) are converted per iteration.
  INIT_XMM
  cglobal float_to_fixed24_sse, 3,3,3, dst, src, len
      movaps      m0, [pf_1_24]
  .loop:
      movaps      m1, [srcq   ]
      movaps      m2, [srcq+16]
      mulps       m1, m0
      mulps       m2, m0
      cvtps2pi    mm0, m1                     ; floats 0-1
      movhlps     m1, m1
      cvtps2pi    mm1, m1                     ; floats 2-3
      cvtps2pi    mm2, m2                     ; floats 4-5
      movhlps     m2, m2
      cvtps2pi    mm3, m2                     ; floats 6-7
      movq        [dstq   ], mm0
      movq        [dstq+ 8], mm1
      movq        [dstq+16], mm2
      movq        [dstq+24], mm3
      add         srcq, 32
      add         dstq, 32
      sub         lend, 8
      ja          .loop
      ; cvtps2pi wrote mm0-mm3, which alias the x87 stack; clear the MMX
      ; state so the caller can run x87 floating-point code afterwards.
      emms
      RET
  ; SSE2 version of ff_float_to_fixed24(): multiplies by 2^24 and converts
  ; with cvtps2dq.  When m8 is defined, 32 floats (128 bytes) are handled per
  ; iteration; otherwise 16 floats (64 bytes).  All loads and stores are
  ; aligned.
  INIT_XMM
  cglobal float_to_fixed24_sse2, 3,3,9, dst, src, len
      movaps      m0, [pf_1_24]
  .loop:
      movaps      m1, [srcq    ]
      movaps      m2, [srcq+16 ]
      movaps      m3, [srcq+32 ]
      movaps      m4, [srcq+48 ]
  %ifdef m8
      movaps      m5, [srcq+64 ]
      movaps      m6, [srcq+80 ]
      movaps      m7, [srcq+96 ]
      movaps      m8, [srcq+112]
  %endif
      mulps       m1, m0
      mulps       m2, m0
      mulps       m3, m0
      mulps       m4, m0
  %ifdef m8
      mulps       m5, m0
      mulps       m6, m0
      mulps       m7, m0
      mulps       m8, m0
  %endif
      cvtps2dq    m1, m1
      cvtps2dq    m2, m2
      cvtps2dq    m3, m3
      cvtps2dq    m4, m4
  %ifdef m8
      cvtps2dq    m5, m5
      cvtps2dq    m6, m6
      cvtps2dq    m7, m7
      cvtps2dq    m8, m8
  %endif
      movdqa      [dstq    ], m1
      movdqa      [dstq+16 ], m2
      movdqa      [dstq+32 ], m3
      movdqa      [dstq+48 ], m4
  %ifdef m8
      movdqa      [dstq+64 ], m5
      movdqa      [dstq+80 ], m6
      movdqa      [dstq+96 ], m7
      movdqa      [dstq+112], m8
      add         srcq, 128
      add         dstq, 128
      ; len is a 32-bit unsigned int: count in the dword register so the
      ; upper half of the 64-bit register never takes part in the loop test.
      sub         lend, 32
  %else
      add         srcq, 64
      add         dstq, 64
      sub         lend, 16
  %endif
      ja          .loop
      REP_RET
  ;------------------------------------------------------------------------------
  ; int ff_ac3_compute_mantissa_size(uint16_t mant_cnt[6][16])
  ;
  ; Returns the sum of two partial results:
  ;   1) the 16 per-column totals over all 6 rows, each multiplied by the
  ;      matching word of ac3_bap_bits and added together;
  ;   2) columns 1-4 of every row, scaled with pw_bap_mul1 (high 16 bits of
  ;      the unsigned product), added with unsigned saturation, multiplied by
  ;      pw_bap_mul2 and added together.
  ;------------------------------------------------------------------------------

  ; Horizontal add of the four dwords of %1; the total ends up in the low
  ; dword of %1.  %2 is clobbered.
  %macro PHADDD4 2 ; xmm src, xmm tmp
      movhlps     %2, %1
      paddd       %1, %2
      pshufd      %2, %1, 0x1
      paddd       %1, %2
  %endmacro

  INIT_XMM
  cglobal ac3_compute_mantissa_size_sse2, 1,2,4, mant_cnt, sum
      ; --- part 1: column totals weighted by ac3_bap_bits ---
      movdqa      m0, [mant_cntq      ]       ; columns 0-7 accumulate in m0
      movdqa      m1, [mant_cntq+ 1*16]       ; columns 8-15 accumulate in m1
      paddw       m0, [mant_cntq+ 2*16]
      paddw       m1, [mant_cntq+ 3*16]
      paddw       m0, [mant_cntq+ 4*16]
      paddw       m1, [mant_cntq+ 5*16]
      paddw       m0, [mant_cntq+ 6*16]
      paddw       m1, [mant_cntq+ 7*16]
      paddw       m0, [mant_cntq+ 8*16]
      paddw       m1, [mant_cntq+ 9*16]
      paddw       m0, [mant_cntq+10*16]
      paddw       m1, [mant_cntq+11*16]
      pmaddwd     m0, [ac3_bap_bits   ]
      pmaddwd     m1, [ac3_bap_bits+16]
      paddd       m0, m1
      PHADDD4     m0, m1
      movd        sumd, m0

      ; --- part 2: columns 1-4 of each row, two rows per register ---
      movlpd      m0, [mant_cntq+1*32+2]
      movhpd      m0, [mant_cntq     +2]
      movlpd      m1, [mant_cntq+3*32+2]
      movhpd      m1, [mant_cntq+2*32+2]
      movlpd      m2, [mant_cntq+5*32+2]
      movhpd      m2, [mant_cntq+4*32+2]
      movdqa      m3, [pw_bap_mul1]
      pmulhuw     m0, m3
      pmulhuw     m1, m3
      pmulhuw     m2, m3
      paddusw     m0, m1
      paddusw     m0, m2
      pmaddwd     m0, [pw_bap_mul2]
      PHADDD4     m0, m1
      movd        eax, m0
      add         eax, sumd                   ; combine both partial results
      RET
  329. ;------------------------------------------------------------------------------
  330. ; void ff_ac3_extract_exponents(uint8_t *exp, int32_t *coef, int nb_coefs)
  331. ;------------------------------------------------------------------------------
  ; Packed 32-bit absolute value without pabsd.
  ; %1 = src/dst, %2 = tmp (left holding the per-lane sign mask).
  %macro PABSD_MMX 2 ; src/dst, tmp
      pxor        %2, %2
      pcmpgtd     %2, %1                      ; all ones where %1 < 0
      pxor        %1, %2                      ; conditional one's complement
      psubd       %1, %2                      ; subtracting the mask adds 1 there
  %endmacro

  ; Packed 32-bit absolute value using pabsd.
  ; The optional second argument is accepted and ignored so both macros can be
  ; invoked the same way.
  %macro PABSD_SSSE3 1-2 ; src/dst, unused
      pabsd       %1, %1
  %endmacro
  %ifdef HAVE_AMD3DNOW
  ; 3DNow! version of ff_ac3_extract_exponents(): four coefficients per
  ; iteration, two per MMX register.  For every coefficient the stored byte is
  ; 151 minus the float exponent field of (2*|coef| | 1).
  INIT_MMX
  cglobal ac3_extract_exponents_3dnow, 3,3,0, exp, coef, len
      ; Point both arrays at their ends and count len up from -len to 0.
      add         expq, lenq
      lea         coefq, [coefq+4*lenq]
      neg         lenq
      movq        m3, [pd_1]
      movq        m4, [pd_151]
  .next4:
      movq        m0, [coefq+4*lenq  ]
      movq        m1, [coefq+4*lenq+8]
      PABSD_MMX   m0, m2
      PABSD_MMX   m1, m2
      ; 2*|coef| | 1, so the converted value is never zero
      pslld       m0, 1
      pslld       m1, 1
      por         m0, m3
      por         m1, m3
      ; coefficients 0-1
      pi2fd       m2, m0
      psrld       m2, 23                      ; keep the exponent field
      movq        m0, m4
      psubd       m0, m2
      ; coefficients 2-3
      pi2fd       m2, m1
      psrld       m2, 23
      movq        m1, m4
      psubd       m1, m2
      ; narrow each dword to a byte and gather the four bytes in the low dword
      packssdw    m0, m0
      packuswb    m0, m0
      packssdw    m1, m1
      packuswb    m1, m1
      punpcklwd   m0, m1
      movd        [expq+lenq], m0
      add         lenq, 4
      jl          .next4
      REP_RET
  %endif
  ; Body of ff_ac3_extract_exponents() for XMM registers.
  ; %1 = instruction-set suffix; ssse3 selects the pshufb byte gather, any
  ;      other value the packssdw/packuswb path.
  ; PABSD must be %define'd before expansion.
  ; Four coefficients are converted per iteration with an aligned load; the
  ; stored byte is 151 minus the float exponent field of (2*|coef| | 1).
  %macro AC3_EXTRACT_EXPONENTS 1
  cglobal ac3_extract_exponents_%1, 3,3,5, exp, coef, len
      ; Point both arrays at their ends and count len up from -len to 0.
      add         expq, lenq
      lea         coefq, [coefq+4*lenq]
      neg         lenq
      mova        m2, [pd_1]
      mova        m3, [pd_151]
  %ifidn %1, ssse3
      movd        m4, [pb_shuf_4dwb]          ; byte indices 0, 4, 8, 12
  %endif
  .next4:
      ; load four 32-bit coefficients
      mova        m0, [coefq+4*lenq]
      ; absolute value
      PABSD       m0, m1
      ; 2*|coef| | 1, converted to float; then isolate the exponent field
      pslld       m0, 1
      por         m0, m2
      cvtdq2ps    m1, m0
      psrld       m1, 23
      mova        m0, m3
      psubd       m0, m1
      ; bring the low byte of each of the four dwords into the low dword
  %ifidn %1, ssse3
      pshufb      m0, m4
  %else
      packssdw    m0, m0
      packuswb    m0, m0
  %endif
      movd        [expq+lenq], m0
      add         lenq, 4
      jl          .next4
      REP_RET
  %endmacro
  ; XMM builds of ff_ac3_extract_exponents(); the ssse3 build is only emitted
  ; inside the HAVE_SSE guard.
  %ifdef HAVE_SSE
    INIT_XMM

    %define PABSD PABSD_MMX
    AC3_EXTRACT_EXPONENTS sse2

    %ifdef HAVE_SSSE3
      %define PABSD PABSD_SSSE3
      AC3_EXTRACT_EXPONENTS ssse3
    %endif
  %endif