You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

84 lines
3.1KB

  1. ;******************************************************************************
  2. ;* Vorbis x86 optimizations
  3. ;* Copyright (C) 2006 Loren Merritt <lorenm@u.washington.edu>
  4. ;*
  5. ;* This file is part of Libav.
  6. ;*
  7. ;* Libav is free software; you can redistribute it and/or
  8. ;* modify it under the terms of the GNU Lesser General Public
  9. ;* License as published by the Free Software Foundation; either
  10. ;* version 2.1 of the License, or (at your option) any later version.
  11. ;*
  12. ;* Libav is distributed in the hope that it will be useful,
  13. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. ;* Lesser General Public License for more details.
  16. ;*
  17. ;* You should have received a copy of the GNU Lesser General Public
  18. ;* License along with Libav; if not, write to the Free Software
  19. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. ;******************************************************************************
  21. %include "libavutil/x86/x86util.asm"
  22. SECTION_RODATA
  23. pdw_80000000: times 4 dd 0x80000000
  24. SECTION .text
  25. %if ARCH_X86_32
  26. INIT_MMX 3dnow
  27. cglobal vorbis_inverse_coupling, 3, 3, 6, mag, ang, block_size
  28. pxor m7, m7
  29. lea magq, [magq+block_sizeq*4]
  30. lea angq, [angq+block_sizeq*4]
  31. neg block_sizeq
  32. .loop:
  33. mova m0, [magq+block_sizeq*4]
  34. mova m1, [angq+block_sizeq*4]
  35. mova m2, m0
  36. mova m3, m1
  37. pfcmpge m2, m7 ; m <= 0.0
  38. pfcmpge m3, m7 ; a <= 0.0
  39. pslld m2, 31 ; keep only the sign bit
  40. pxor m1, m2
  41. mova m4, m3
  42. pand m3, m1
  43. pandn m4, m1
  44. pfadd m3, m0 ; a = m + ((a < 0) & (a ^ sign(m)))
  45. pfsub m0, m4 ; m = m + ((a > 0) & (a ^ sign(m)))
  46. mova [angq+block_sizeq*4], m3
  47. mova [magq+block_sizeq*4], m0
  48. add block_sizeq, 2
  49. jl .loop
  50. femms
  51. RET
  52. %endif
  53. INIT_XMM sse
  54. cglobal vorbis_inverse_coupling, 3, 4, 6, mag, ang, block_size, cntr
  55. mova m5, [pdw_80000000]
  56. xor cntrq, cntrq
  57. align 16
  58. .loop:
  59. mova m0, [magq+cntrq*4]
  60. mova m1, [angq+cntrq*4]
  61. xorps m2, m2
  62. xorps m3, m3
  63. cmpleps m2, m0 ; m <= 0.0
  64. cmpleps m3, m1 ; a <= 0.0
  65. andps m2, m5 ; keep only the sign bit
  66. xorps m1, m2
  67. mova m4, m3
  68. andps m3, m1
  69. andnps m4, m1
  70. addps m3, m0 ; a = m + ((a < 0) & (a ^ sign(m)))
  71. subps m0, m4 ; m = m + ((a > 0) & (a ^ sign(m)))
  72. mova [angq+cntrq*4], m3
  73. mova [magq+cntrq*4], m0
  74. add cntrq, 4
  75. cmp cntrq, block_sizeq
  76. jl .loop
  77. RET