You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

197 lines
5.8KB

  ;******************************************************************************
  ;* MMX/SSE2-optimized functions for the RV30 and RV40 decoders
  ;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
  ;*
  ;* This file is part of FFmpeg.
  ;*
  ;* FFmpeg is free software; you can redistribute it and/or
  ;* modify it under the terms of the GNU Lesser General Public
  ;* License as published by the Free Software Foundation; either
  ;* version 2.1 of the License, or (at your option) any later version.
  ;*
  ;* FFmpeg is distributed in the hope that it will be useful,
  ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  ;* Lesser General Public License for more details.
  ;*
  ;* You should have received a copy of the GNU Lesser General Public
  ;* License along with FFmpeg; if not, write to the Free Software
  ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  ;******************************************************************************
  %include "libavutil/x86/x86util.asm"

  SECTION_RODATA

  ; Per-lane multipliers used by ROW_TRANSFORM: three consecutive qwords
  ; (offsets +0, +8, +16), each holding four identical 16-bit values.
  pw_row_coeffs:  dw  13,  13,  13,  13           ; +0
                  dw  17,  17,  17,  17           ; +8
                  dw   7,   7,   7,   7           ; +16

  ; Two dwords of 0x200; ROW_TRANSFORM leaves this in mm5 and COL_TRANSFORM
  ; adds it before its arithmetic shift right by 10.
  pd_512:         dd  0x200, 0x200

  ; pmaddwd multiplier rows, four words each. Rows +0 and +8 are the ones
  ; passed to COL_TRANSFORM by rv34_idct_add; rows +16 and +24 are not
  ; referenced by the code visible in this part of the file.
  pw_col_coeffs:  dw  13,  13,  13, -13           ; +0
                  dw  17,   7,   7, -17           ; +8
                  dw  13, -13,  13,  13           ; +16
                  dw  -7,  17, -17,  -7           ; +24

  SECTION .text
  ; IDCT_DC_NOROUND reg
  ;   DC-only scaling without a rounding term:
  ;       reg = (reg * 13*13*3) >> 11      (arithmetic shift)
  ;   reg is updated in place; imul/sar modify the flags.
  %macro IDCT_DC_NOROUND 1
      imul    %1, %1, 13*13*3
      sar     %1, 11
  %endmacro
  ; IDCT_DC_ROUND reg
  ;   DC-only scaling with rounding:
  ;       reg = (reg * 13*13 + 0x200) >> 10      (arithmetic shift)
  ;   reg is updated in place; imul/add/sar modify the flags.
  %macro IDCT_DC_ROUND 1
      imul    %1, %1, 13*13
      add     %1, 0x200
      sar     %1, 10
  %endmacro
  ; rv34_idct suffix
  ;   Emits the function rv34_idct_<suffix> (1 argument, 2 GPRs, no XMM
  ;   registers). The function reads the signed 16-bit value at [r0], scales
  ;   it with IDCT_DC, and writes the low 16 bits of the result to all
  ;   sixteen 16-bit slots of the 32 bytes starting at r0.
  ;
  ;   IDCT_DC must be %defined to one of the IDCT_DC_* macros before this
  ;   macro is invoked. pshufw is used, so the active instruction set has to
  ;   provide it.
  %macro rv34_idct 1
  cglobal rv34_idct_%1, 1, 2, 0
      movsx       r1, word [r0]           ; sign-extended first coefficient
      IDCT_DC     r1                      ; scale according to selected variant
      movd        m0, r1d
      pshufw      m0, m0, 0               ; replicate word 0 into all four lanes
      movq        [r0+ 0], m0
      movq        [r0+ 8], m0
      movq        [r0+16], m0
      movq        [r0+24], m0
      REP_RET
  %endmacro
  ; Instantiate the DC-only functions for MMXEXT: one using the rounding
  ; scale macro, one using the non-rounding one. IDCT_DC is re-pointed
  ; before each expansion because the rv34_idct macro body calls it by name.
  INIT_MMX mmxext

  %define IDCT_DC IDCT_DC_ROUND
  rv34_idct dc

  %define IDCT_DC IDCT_DC_NOROUND
  rv34_idct dc_noround
  ; ff_rv34_idct_dc_add_mmx(uint8_t *dst, int stride, int dc);
  ;
  ; Scales dc with IDCT_DC_ROUND and adds the result, with unsigned byte
  ; saturation, to the bytes moved by movh on each of four rows: dst,
  ; dst+stride, dst+2*stride and dst+3*stride.
  ;
  ; In:  r0 = dst, r1 = stride, r2 = dc (r2 is reused as dst + 2*stride)
  ; Uses m0-m5.
  ;
  ; NOTE(review): the prototype above says "int stride", but r1 is used at
  ; full register width in the address computations. Confirm that callers
  ; pass a pointer-sized stride on 64-bit targets.
  ; NOTE(review): movh comes from the included x86 macros; presumably it is
  ; a 4-byte move under INIT_MMX - confirm.
  INIT_MMX mmx
  cglobal rv34_idct_dc_add, 3, 3
      ; scaled DC, and its negation
      IDCT_DC_ROUND r2
      pxor        m1, m1
      movd        m0, r2d
      psubw       m1, m0                  ; word lanes of m1 = -m0

      ; A signed value cannot be added to unsigned bytes directly, so build
      ; two non-negative byte vectors: m0 = clamp(dc, 0, 255) and
      ; m1 = clamp(-dc, 0, 255). One of them is always zero.
      packuswb    m0, m0
      punpcklbw   m0, m0
      punpcklwd   m0, m0                  ; low four bytes all hold the clamped value

      packuswb    m1, m1
      punpcklbw   m1, m1
      punpcklwd   m1, m1

      ; fetch the four destination rows
      lea         r2, [r0+r1*2]
      movh        m2, [r0]
      movh        m3, [r0+r1]
      movh        m4, [r2]
      movh        m5, [r2+r1]

      ; dst += max(dc, 0); dst -= max(-dc, 0)   (both saturating)
      paddusb     m2, m0
      psubusb     m2, m1
      paddusb     m3, m0
      psubusb     m3, m1
      paddusb     m4, m0
      psubusb     m4, m1
      paddusb     m5, m0
      psubusb     m5, m1

      ; write the rows back
      movh        [r0], m2
      movh        [r0+r1], m3
      movh        [r2], m4
      movh        [r2+r1], m5
      RET
  ; ROW_TRANSFORM src
  ;   Loads the four 8-byte coefficient vectors b0..b3 from src+0/8/16/24,
  ;   overwrites those 32 bytes with zero, and runs the first transform pass
  ;   independently on each 16-bit lane:
  ;       z0 = 13*(b0 + b2)        z1 = 13*(b0 - b2)
  ;       z3 = 17*b1 +  7*b3       z2 =  7*b1 - 17*b3
  ;   Sums and differences use saturating word arithmetic (paddsw/psubsw);
  ;   products keep the low 16 bits (pmullw).
  ;
  ;   Output: mm0 = z0+z3, mm4 = z1+z2, mm6 = z1-z2, mm7 = z0-z3,
  ;           mm5 = rounder (pd_512).
  ;   mm1 and mm3 hold intermediates on exit; mm2 still holds b2.
  %macro ROW_TRANSFORM 1
      ; load the coefficients, then clear them in memory
      mova        mm0, [%1+ 0*8]          ; b0
      mova        mm1, [%1+ 1*8]          ; b1
      mova        mm2, [%1+ 2*8]          ; b2
      mova        mm3, [%1+ 3*8]          ; b3
      pxor        mm7, mm7
      mova        [%1+ 0*8], mm7
      mova        [%1+ 1*8], mm7
      mova        [%1+ 2*8], mm7
      mova        [%1+ 3*8], mm7

      ; even part
      mova        mm4, mm0
      mova        mm6, [pw_row_coeffs+ 0] ; 13 in every lane
      paddsw      mm0, mm2                ; b0 + b2
      psubsw      mm4, mm2                ; b0 - b2
      pmullw      mm0, mm6                ; z0
      pmullw      mm4, mm6                ; z1

      ; odd part
      mova        mm5, mm1
      pmullw      mm1, [pw_row_coeffs+ 8] ; 17*b1
      pmullw      mm5, [pw_row_coeffs+16] ;  7*b1
      mova        mm7, mm3
      pmullw      mm3, [pw_row_coeffs+ 8] ; 17*b3
      pmullw      mm7, [pw_row_coeffs+16] ;  7*b3
      paddsw      mm1, mm7                ; z3 = 17*b1 +  7*b3
      psubsw      mm5, mm3                ; z2 =  7*b1 - 17*b3

      ; butterflies
      mova        mm7, mm0
      mova        mm6, mm4
      paddsw      mm0, mm1                ; z0 + z3
      psubsw      mm7, mm1                ; z0 - z3
      paddsw      mm4, mm5                ; z1 + z2
      psubsw      mm6, mm5                ; z1 - z2

      mova        mm5, [pd_512]           ; rounder for the second pass
  %endmacro
  ; Helper for ff_rv34_idct_add_mmxext(uint8_t *dst, ptrdiff_t stride, int16_t *block),
  ; which is defined right after this macro.
  ;
  ; COL_TRANSFORM dst, row, coeffs02, coeffs13
  ;   %1  memory operand: four destination bytes, read, updated and stored back
  ;   %2  MMX register holding four 16-bit inputs c0..c3; overwritten
  ;   %3  pmaddwd multipliers applied to (c0, c2, c0, c2); register or memory
  ;   %4  pmaddwd multipliers applied to (c1, c3, c1, c3); register or memory
  ;
  ;   Expects the rounder in mm5 (both dwords). Clobbers mm1, mm2, mm3.
  ;   With the tables passed by rv34_idct_add the four results are
  ;   (z0+z3, z1+z2, z1-z2, z0-z3), each rounded, shifted right by 10, added
  ;   to the corresponding destination byte and clamped to 0..255.
  %macro COL_TRANSFORM 4
      pshufw      mm3, %2, 0xDD           ; c1, c3, c1, c3
      pshufw      %2,  %2, 0x88           ; c0, c2, c0, c2
      pmaddwd     %2,  %3                 ; 13*c0+13*c2 | 13*c0-13*c2 = z0 | z1
      pmaddwd     mm3, %4                 ; 17*c1+ 7*c3 |  7*c1-17*c3 = z3 | z2
      paddd       %2,  mm5                ; apply rounder to z0 | z1
      pshufw      mm1, %2,  0x4E          ; swap dwords: z1 | z0
      pshufw      mm2, mm3, 0x4E          ; swap dwords: z2 | z3
      paddd       %2,  mm3                ; z0+z3 | z1+z2
      psubd       mm1, mm2                ; z1-z2 | z0-z3
      movd        mm3, %1                 ; four destination bytes
      psrad       %2,  10
      pxor        mm2, mm2
      psrad       mm1, 10
      punpcklbw   mm3, mm2                ; widen destination bytes to words
      packssdw    %2,  mm1                ; four results as signed words
      paddw       %2,  mm3
      packuswb    %2,  %2                 ; clamp to unsigned bytes
      movd        %1,  %2
  %endmacro
  ; ff_rv34_idct_add_mmxext(uint8_t *dst, ptrdiff_t stride, int16_t *block);
  ;
  ; Runs ROW_TRANSFORM on the coefficient block (which also zeroes the
  ; block in memory), then COL_TRANSFORM once per output row, adding each
  ; result to four destination bytes at dst, dst+stride, dst+2*stride and
  ; dst+3*stride.
  ;
  ; In: d = dst, s = stride, b = block. d is advanced by 2*stride midway.
  ; Uses mm0-mm7.
  INIT_MMX mmxext
  cglobal rv34_idct_add, 3,3,0, d, s, b
      ROW_TRANSFORM       bq
      COL_TRANSFORM     [dq], mm0, [pw_col_coeffs+ 0], [pw_col_coeffs+ 8]
      ; mm0 has been consumed, so keep the first multiplier table in it
      mova               mm0, [pw_col_coeffs+ 0]
      COL_TRANSFORM  [dq+sq], mm4, mm0, [pw_col_coeffs+ 8]
      ; likewise mm4 is free now and can hold the second table
      mova               mm4, [pw_col_coeffs+ 8]
      lea                 dq, [dq + 2*sq]
      COL_TRANSFORM     [dq], mm6, mm0, mm4
      COL_TRANSFORM  [dq+sq], mm7, mm0, mm4
      ; Use the x86inc epilogue macro rather than a raw `ret`, matching the
      ; other functions in this file and staying correct if the register
      ; counts in the cglobal line are ever raised.
      RET
  ; ff_rv34_idct_dc_add_sse4(uint8_t *dst, int stride, int dc);
  ;
  ; Scales dc with IDCT_DC_ROUND and adds it to a 4x4 block of bytes (four
  ; bytes on each of the rows dst, dst+stride, dst+2*stride, dst+3*stride),
  ; clamping every result to 0..255.
  ;
  ; In:  r0 = dst, r1 = stride, r2 = dc (r2 is reused as dst + 2*stride)
  ; Uses m0-m5.
  ;
  ; NOTE(review): the prototype above says "int stride", but r1 is used at
  ; full register width in the address computations. Confirm that callers
  ; pass a pointer-sized stride on 64-bit targets.
  INIT_XMM sse4
  cglobal rv34_idct_dc_add, 3, 3, 6
      ; calculate DC and broadcast it to all eight word lanes
      IDCT_DC_ROUND r2
      movd        m0, r2d
      pshuflw     m0, m0, 0
      punpcklqdq  m0, m0

      ; load the four destination rows
      lea         r2, [r0+r1*2]
      movd        m2, [r0]
      movd        m3, [r0+r1]
      movd        m4, [r2]
      movd        m5, [r2+r1]

      ; pair up the rows and widen the bytes to words
      pxor        m1, m1
      punpckldq   m2, m3                  ; rows 0 and 1
      punpckldq   m4, m5                  ; rows 2 and 3
      punpcklbw   m2, m1
      punpcklbw   m4, m1

      ; add DC, then clamp back to unsigned bytes
      paddw       m2, m0
      paddw       m4, m0
      packuswb    m2, m4                  ; dwords 0..3 = rows 0..3

      ; store one dword per row
      movd        [r0], m2
      pextrd      [r0+r1], m2, 1
      pextrd      [r2], m2, 2
      pextrd      [r2+r1], m2, 3
      RET