You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

91 lines
2.4KB

  1. ;******************************************************************************
  2. ;* SSE-optimized functions for the DCA decoder
  3. ;* Copyright (C) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
  4. ;*
  5. ;* This file is part of Libav.
  6. ;*
  7. ;* Libav is free software; you can redistribute it and/or
  8. ;* modify it under the terms of the GNU Lesser General Public
  9. ;* License as published by the Free Software Foundation; either
  10. ;* version 2.1 of the License, or (at your option) any later version.
  11. ;*
  12. ;* Libav is distributed in the hope that it will be useful,
  13. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. ;* Lesser General Public License for more details.
  16. ;*
  17. ;* You should have received a copy of the GNU Lesser General Public
  18. ;* License along with Libav; if not, write to the Free Software
  19. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. ;******************************************************************************
  21. %include "libavutil/x86/x86util.asm"
  SECTION_RODATA

  ; Single-precision 1/16 (bit pattern 0x3D800000), stored four times in a row.
  ; The code below multiplies the integer scale factor by this constant.
  pf_inv16:   times 4 dd 0.0625

  SECTION_TEXT
  ;------------------------------------------------------------------------------
  ; void int8x8_fmul_int32(float *dst, const int8_t *src, int scale)
  ;
  ; Converts eight signed bytes to float and scales them:
  ;     dst[i] = (float)src[i] * ((float)scale * (1.0f / 16))    for i = 0..7
  ;
  ; In:    dst   - receives 8 floats (32 bytes); stored with mova, so it is
  ;                expected to be 16-byte aligned
  ;        src   - 8 signed bytes; the loads used here need no alignment
  ;        scale - integer scale factor
  ; Clobb: m0-m2 (plus m3-m4 and mm0-mm3 in the non-SSE2 build)
  ;
  ; One macro, three builds: the cpuflags set by the preceding INIT_XMM pick
  ; the SSE4 path (pmovsxbd), the SSE2 path (unpack + arithmetic shift) or the
  ; plain SSE path (integer work in MMX registers, then cvtpi2ps).
  ;------------------------------------------------------------------------------
  %macro INT8X8_FMUL_INT32 0
  cglobal int8x8_fmul_int32, 3,3,5, dst, src, scale
      ; m0 = { f, f, f, f } with f = (float)scale * (1/16)
      cvtsi2ss    m0, scalem
      mulss       m0, [pf_inv16]
      shufps      m0, m0, 0               ; broadcast lane 0 to all four lanes

  %if cpuflag(sse2)
  %if cpuflag(sse4)
      ; SSE4: sign-extend 4 bytes -> 4 dwords directly from memory
      pmovsxbd    m1, [srcq+0]            ; m1 = src[0..3] as int32
      pmovsxbd    m2, [srcq+4]            ; m2 = src[4..7] as int32
  %else
      ; SSE2: replicate each byte into all 4 bytes of a dword, then an
      ; arithmetic shift right by 24 leaves the sign-extended byte value
      movq        m1, [srcq]              ; low 8 bytes = src[0..7]
      punpcklbw   m1, m1                  ; each byte doubled into a word
      mova        m2, m1
      punpcklwd   m1, m1                  ; dwords built from src[0..3]
      punpckhwd   m2, m2                  ; dwords built from src[4..7]
      psrad       m1, 24
      psrad       m2, 24
  %endif
      cvtdq2ps    m1, m1                  ; int32 -> float
      cvtdq2ps    m2, m2
  %else
      ; SSE only: no integer ops on XMM registers, so the same unpack/shift
      ; trick is done in MMX registers, two dwords at a time
      movd        mm0, [srcq+0]           ; src[0..3]
      movd        mm1, [srcq+4]           ; src[4..7]
      punpcklbw   mm0, mm0
      punpcklbw   mm1, mm1
      movq        mm2, mm0
      movq        mm3, mm1
      punpcklwd   mm0, mm0                ; dwords from src[0], src[1]
      punpcklwd   mm1, mm1                ; dwords from src[4], src[5]
      punpckhwd   mm2, mm2                ; dwords from src[2], src[3]
      punpckhwd   mm3, mm3                ; dwords from src[6], src[7]
      psrad       mm0, 24
      psrad       mm1, 24
      psrad       mm2, 24
      psrad       mm3, 24
      cvtpi2ps    m1, mm0                 ; low two lanes = src[0], src[1]
      cvtpi2ps    m2, mm1                 ; low two lanes = src[4], src[5]
      cvtpi2ps    m3, mm2                 ; low two lanes = src[2], src[3]
      cvtpi2ps    m4, mm3                 ; low two lanes = src[6], src[7]
      ; m0 still holds the broadcast factor: nothing in this branch writes it,
      ; so no second broadcast is needed here
      emms                                ; leave MMX state before returning
      shufps      m1, m3, q1010           ; m1 = { src[0], src[1], src[2], src[3] }
      shufps      m2, m4, q1010           ; m2 = { src[4], src[5], src[6], src[7] }
  %endif

      ; scale and store all eight results
      mulps       m1, m0
      mulps       m2, m0
      mova        [dstq+ 0], m1
      mova        [dstq+16], m2
      REP_RET
  %endmacro
  ; Emit one int8x8_fmul_int32 variant per instruction-set level. INIT_XMM sets
  ; the cpuflags that the %if cpuflag(...) tests inside the macro look at.

  ; Plain SSE (MMX-assisted) variant: only built for 32-bit x86.
  ; NOTE(review): presumably because every x86-64 CPU already has SSE2 - confirm.
  %if ARCH_X86_32
      INIT_XMM sse
      INT8X8_FMUL_INT32
  %endif

  ; SSE2 variant
  INIT_XMM sse2
  INT8X8_FMUL_INT32

  ; SSE4 variant
  INIT_XMM sse4
  INT8X8_FMUL_INT32