You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

119 lines
3.0KB

  1. ;******************************************************************************
  2. ;* MMX/SSE2-optimized functions for the RV30 and RV40 decoders
  3. ;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
  4. ;*
  5. ;* This file is part of Libav.
  6. ;*
  7. ;* Libav is free software; you can redistribute it and/or
  8. ;* modify it under the terms of the GNU Lesser General Public
  9. ;* License as published by the Free Software Foundation; either
  10. ;* version 2.1 of the License, or (at your option) any later version.
  11. ;*
  12. ;* Libav is distributed in the hope that it will be useful,
  13. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. ;* Lesser General Public License for more details.
  16. ;*
  17. ;* You should have received a copy of the GNU Lesser General Public
  18. ;* License along with Libav; if not, write to the Free Software
  19. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. ;******************************************************************************
  21. %include "x86inc.asm"
  22. %include "x86util.asm"
  23. SECTION .text
  ;-----------------------------------------------------------------------------
  ; IDCT_DC_NOROUND reg
  ;   reg = (reg * 13*13*3) >> 11, using an arithmetic (sign-preserving) shift
  ;   and no rounding term.
  ;   In/out: %1 - general-purpose register holding the DC coefficient.
  ;   Clobbers: flags.
  ;-----------------------------------------------------------------------------
  %macro IDCT_DC_NOROUND 1
      imul    %1, %1, 13*13*3
      sar     %1, 11
  %endmacro
  ;-----------------------------------------------------------------------------
  ; IDCT_DC_ROUND reg
  ;   reg = (reg * 13*13 + 0x200) >> 10, using an arithmetic shift.
  ;   0x200 is half of 1 << 10, so the shift rounds to nearest.
  ;   In/out: %1 - general-purpose register holding the DC coefficient.
  ;   Clobbers: flags.
  ;-----------------------------------------------------------------------------
  %macro IDCT_DC_ROUND 1
      imul    %1, %1, 13*13
      add     %1, 1 << 9              ; rounding bias, == 0x200
      sar     %1, 10
  %endmacro
  ;-----------------------------------------------------------------------------
  ; rv34_idct <suffix>
  ;   Emits the function rv34_idct_<suffix>_mmx2, which takes one argument:
  ;     r0 = pointer to the coefficient block.
  ;   It reads the signed 16-bit value at [r0], scales it with whatever macro
  ;   IDCT_DC names at expansion time, broadcasts the low 16 bits of the result
  ;   to all four words of m0 and stores that quadword at r0+0, +8, +16 and +24
  ;   (32 bytes in total).
  ;   Scratch: r1, m0.
  ;-----------------------------------------------------------------------------
  %macro rv34_idct 1
  cglobal rv34_idct_%1_mmx2, 1, 2, 0
      movsx   r1, word [r0]           ; r1 = sign-extended first coefficient
      IDCT_DC r1
      movd    m0, r1
      pshufw  m0, m0, 0               ; replicate word 0 into all four words
  %assign %%offset 0
  %rep 4
      movq    [r0+%%offset], m0
  %assign %%offset %%offset+8
  %endrep
      REP_RET
  %endmacro
  ; Instantiate the DC-only IDCT twice with the MMX register mapping.
  ; IDCT_DC selects the scaling macro that rv34_idct expands.
  INIT_MMX

  ; Rounding variant: cglobal name rv34_idct_dc_mmx2
  %define IDCT_DC IDCT_DC_ROUND
  rv34_idct dc

  ; Non-rounding variant: cglobal name rv34_idct_dc_noround_mmx2
  %undef  IDCT_DC
  %define IDCT_DC IDCT_DC_NOROUND
  rv34_idct dc_noround
  ;-----------------------------------------------------------------------------
  ; ff_rv34_idct_dc_add_mmx(uint8_t *dst, int stride, int dc);
  ;
  ; Adds the rounded DC term (see IDCT_DC_ROUND) to four rows of unsigned
  ; bytes starting at dst, with unsigned saturation in both directions.
  ;   r0 = dst
  ;   r1 = stride, distance in bytes between consecutive rows
  ;   r2 = dc on entry; reused as the pointer to row 2 afterwards
  ; Each row is accessed with movh (4 bytes under INIT_MMX, per x86inc).
  ; Uses m0-m5. No emms is issued here.
  ;-----------------------------------------------------------------------------
  INIT_MMX
  cglobal rv34_idct_dc_add_mmx, 3, 3
      ; stride is a 32-bit int but is used as a full-width index register
      ; below; sign-extend it so negative strides address correctly on x86-64.
      ; (Expands to nothing on x86-32.)
      movsxdifnidn r1, r1d

      ; calculate DC
      IDCT_DC_ROUND r2
      pxor        m1, m1
      movd        m0, r2
      psubw       m1, m0              ; m1 = -dc (word lanes)
      packuswb    m0, m0              ; byte 0 = dc clamped to 0..255 (0 if dc < 0)
      packuswb    m1, m1              ; byte 0 = -dc clamped to 0..255 (0 if dc > 0)
      punpcklbw   m0, m0
      punpcklbw   m1, m1
      punpcklwd   m0, m0              ; low 4 bytes of m0 all equal byte 0
      punpcklwd   m1, m1              ; low 4 bytes of m1 all equal byte 0

      ; add DC: dst = sat(sat(dst + max(dc,0)) - max(-dc,0))
      lea         r2, [r0+r1*2]       ; r2 = address of row 2
      movh        m2, [r0]
      movh        m3, [r0+r1]
      movh        m4, [r2]
      movh        m5, [r2+r1]
      paddusb     m2, m0
      paddusb     m3, m0
      paddusb     m4, m0
      paddusb     m5, m0
      psubusb     m2, m1
      psubusb     m3, m1
      psubusb     m4, m1
      psubusb     m5, m1
      movh        [r0], m2
      movh        [r0+r1], m3
      movh        [r2], m4
      movh        [r2+r1], m5
      RET
  ;-----------------------------------------------------------------------------
  ; ff_rv34_idct_dc_add_sse4(uint8_t *dst, int stride, int dc);
  ;
  ; Adds the rounded DC term (see IDCT_DC_ROUND) to a 4x4 block of unsigned
  ; bytes at dst and stores the result saturated to 0..255.
  ;   r0 = dst
  ;   r1 = stride, distance in bytes between consecutive rows
  ;   r2 = dc on entry; reused as the pointer to row 2 afterwards
  ; Uses m0-m5 (declared as 6 vector registers to cglobal).
  ;-----------------------------------------------------------------------------
  INIT_XMM
  cglobal rv34_idct_dc_add_sse4, 3, 3, 6
      ; stride is a 32-bit int but is used as a full-width index register
      ; below; sign-extend it so negative strides address correctly on x86-64.
      ; (Expands to nothing on x86-32.)
      movsxdifnidn r1, r1d

      ; calculate DC
      IDCT_DC_ROUND r2
      pxor        m1, m1              ; m1 = 0, used to widen bytes to words
      movd        m0, r2

      ; load the four 4-byte rows, interleaved with the DC broadcast
      lea         r2, [r0+r1*2]       ; r2 = address of row 2
      movd        m2, [r0]
      movd        m3, [r0+r1]
      pshuflw     m0, m0, 0           ; DC word -> low four words
      movd        m4, [r2]
      movd        m5, [r2+r1]
      punpcklqdq  m0, m0              ; DC word -> all eight words

      ; pack rows 0+1 into m2 and rows 2+3 into m4, then widen to words
      punpckldq   m2, m3
      punpckldq   m4, m5
      punpcklbw   m2, m1
      punpcklbw   m4, m1

      ; add DC and narrow back to bytes with unsigned saturation
      paddw       m2, m0
      paddw       m4, m0
      packuswb    m2, m4              ; m2 = rows 0..3, one dword per row

      ; store one dword per row
      movd        [r0], m2
      pextrd      [r0+r1], m2, 1
      pextrd      [r2], m2, 2
      pextrd      [r2+r1], m2, 3
      RET