You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

198 lines
5.8KB

  1. ;******************************************************************************
  2. ;* MMX/SSE2-optimized functions for the RV30 and RV40 decoders
  3. ;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
  4. ;*
  5. ;* This file is part of Libav.
  6. ;*
  7. ;* Libav is free software; you can redistribute it and/or
  8. ;* modify it under the terms of the GNU Lesser General Public
  9. ;* License as published by the Free Software Foundation; either
  10. ;* version 2.1 of the License, or (at your option) any later version.
  11. ;*
  12. ;* Libav is distributed in the hope that it will be useful,
  13. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. ;* Lesser General Public License for more details.
  16. ;*
  17. ;* You should have received a copy of the GNU Lesser General Public
  18. ;* License along with Libav; if not, write to the Free Software
  19. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. ;******************************************************************************
  21. %include "x86inc.asm"
  22. %include "x86util.asm"
  SECTION_RODATA

  ; Row-pass multipliers: three vectors of four identical words, stored back to
  ; back so that they are addressed as pw_row_coeffs+0 / +8 / +16.
  pw_row_coeffs:  times 4 dw 13           ; +0
                  times 4 dw 17           ; +8
                  times 4 dw  7           ; +16

  ; Two dwords of 512: ROW_TRANSFORM leaves this in mm5 and COL_TRANSFORM adds
  ; it to the 32-bit sums ahead of its arithmetic shift right by 10.
  pd_512:         times 2 dd 512

  ; Word quadruples fed to pmaddwd by the column pass. Only the vectors at +0
  ; and +8 are referenced by the code visible in this file section.
  pw_col_coeffs:  dw  13,  13,  13, -13   ; +0:  13*c0+13*c2 | 13*c0-13*c2
                  dw  17,   7,   7, -17   ; +8:  17*c1+ 7*c3 |  7*c1-17*c3
                  dw  13, -13,  13,  13   ; +16
                  dw  -7,  17, -17,  -7   ; +24

  SECTION .text
  ; IDCT_DC_NOROUND reg
  ; Scales a DC coefficient in place, without a rounding term:
  ;     reg = (reg * 13*13*3) >> 11        (arithmetic shift)
  ; The general-purpose register passed as %1 is overwritten; imul/sar also
  ; modify the flags.
  %macro IDCT_DC_NOROUND 1
      imul    %1, %1, 13*13*3
      sar     %1, 11
  %endmacro
  ; IDCT_DC_ROUND reg
  ; Scales a DC coefficient in place, rounding to nearest:
  ;     reg = (reg * 13*13 + 0x200) >> 10  (arithmetic shift)
  ; The general-purpose register passed as %1 is overwritten; the arithmetic
  ; instructions also modify the flags.
  %macro IDCT_DC_ROUND 1
      imul    %1, %1, 13*13
      add     %1, 0x200
      sar     %1, 10
  %endmacro
  ; rv34_idct suffix
  ; Emits a function rv34_idct_<suffix> (declared through cglobal) that takes
  ; one argument, r0 = pointer to a block of 16-bit coefficients.
  ;
  ; The first word of the block is sign-extended and scaled by whatever macro
  ; IDCT_DC is %defined to at expansion time; the low word of the result is then
  ; replicated into the first 32 bytes of the block (four 8-byte stores).
  ; r1 is used as scratch, m0 holds the replicated value.
  %macro rv34_idct 1
  cglobal rv34_idct_%1, 1, 2, 0
      movsx       r1, word [r0]
      IDCT_DC     r1
      movd        m0, r1d
      pshufw      m0, m0, 0           ; word 0 -> all four words of m0
  %assign %%offset 0
  %rep 4
      movq        [r0+%%offset], m0
  %assign %%offset %%offset+8
  %endrep
      REP_RET
  %endmacro

  INIT_MMX mmx2

  ; Variant using the rounded DC scaling.
  %define IDCT_DC IDCT_DC_ROUND
  rv34_idct dc

  ; Variant using the unrounded DC scaling.
  %define IDCT_DC IDCT_DC_NOROUND
  rv34_idct dc_noround
  ; ff_rv34_idct_dc_add_mmx(uint8_t *dst, int stride, int dc);
  ;
  ; Scales dc with IDCT_DC_ROUND and adds the result, with unsigned saturation,
  ; to the pixels at dst, dst+stride, dst+2*stride and dst+3*stride (one movh
  ; load/store per row).
  ;
  ; In:    r0 = dst, r1 = stride, r2 = dc
  ; Uses:  r2 (reused as dst + 2*stride), m0-m5
  ;
  ; A signed value cannot be added to unsigned bytes directly, so it is split
  ; into two saturated byte masks: m0 = max(dc, 0) and m1 = max(-dc, 0), each
  ; clamped to 255. Adding m0 and then subtracting m1, both saturating, yields
  ; the clamped sum.
  ;
  ; NOTE(review): the prototype above says "int stride", but r1 is used at full
  ; register width in the address computations without sign extension; confirm
  ; that the C declaration passes a pointer-sized stride.
  INIT_MMX mmx
  cglobal rv34_idct_dc_add, 3, 3
      ; scaled DC -> m0, its negation -> m1 (as words)
      IDCT_DC_ROUND r2
      movd        m0, r2d
      lea         r2, [r0+r1*2]       ; r2 is free again: address of row 2
      pxor        m1, m1
      psubw       m1, m0

      ; positive part: saturate to a byte and replicate over the low 4 bytes
      packuswb    m0, m0
      punpcklbw   m0, m0
      punpcklwd   m0, m0

      ; negative part, same treatment
      packuswb    m1, m1
      punpcklbw   m1, m1
      punpcklwd   m1, m1

      ; fetch all four rows before anything is written back
      movh        m2, [r0]
      movh        m3, [r0+r1]
      movh        m4, [r2]
      movh        m5, [r2+r1]

      ; dst + max(dc, 0) - max(-dc, 0), saturating at each step
      paddusb     m2, m0
      psubusb     m2, m1
      paddusb     m3, m0
      psubusb     m3, m1
      paddusb     m4, m0
      psubusb     m4, m1
      paddusb     m5, m0
      psubusb     m5, m1

      movh        [r0], m2
      movh        [r0+r1], m3
      movh        [r2], m4
      movh        [r2+r1], m5
      RET
  ; ROW_TRANSFORM base
  ; Loads four 8-byte vectors b0..b3 (four words each) from [base+0], [base+8],
  ; [base+16] and [base+24], overwrites those 32 bytes with zeros, and performs
  ; the row transform on all four word lanes in parallel:
  ;     z0 = 13*(b0 + b2)        z1 = 13*(b0 - b2)
  ;     z3 = 17*b1 +  7*b3       z2 =  7*b1 - 17*b3
  ; Additions/subtractions saturate (paddsw/psubsw); products keep their low
  ; 16 bits (pmullw).
  ;
  ; Output: mm0 = z0 + z3, mm4 = z1 + z2, mm6 = z1 - z2, mm7 = z0 - z3,
  ;         mm5 = rounder (pd_512)
  ; Clobbers: mm1, mm2, mm3
  %macro ROW_TRANSFORM 1
      ; fetch each input vector and clear its slot in the block
      pxor        mm7, mm7
      mova        mm0, [%1+ 0*8]
      mova        [%1+ 0*8], mm7
      mova        mm1, [%1+ 1*8]
      mova        [%1+ 1*8], mm7
      mova        mm2, [%1+ 2*8]
      mova        [%1+ 2*8], mm7
      mova        mm3, [%1+ 3*8]
      mova        [%1+ 3*8], mm7

      ; odd part, from b1 and b3
      mova        mm5, mm1
      mova        mm7, mm3
      pmullw      mm1, [pw_row_coeffs+ 8]     ; b1*17
      pmullw      mm3, [pw_row_coeffs+ 8]     ; b3*17
      pmullw      mm5, [pw_row_coeffs+16]     ; b1* 7
      pmullw      mm7, [pw_row_coeffs+16]     ; b3* 7
      paddsw      mm1, mm7                    ; z3 = b1*17 + b3* 7
      psubsw      mm5, mm3                    ; z2 = b1* 7 - b3*17

      ; even part, from b0 and b2
      mova        mm6, [pw_row_coeffs+ 0]
      mova        mm4, mm0
      paddsw      mm0, mm2                    ; b0 + b2
      psubsw      mm4, mm2                    ; b0 - b2
      pmullw      mm0, mm6                    ; z0 = 13*(b0 + b2)
      pmullw      mm4, mm6                    ; z1 = 13*(b0 - b2)

      ; final butterflies
      mova        mm7, mm0
      mova        mm6, mm4
      paddsw      mm0, mm1                    ; z0 + z3
      paddsw      mm4, mm5                    ; z1 + z2
      psubsw      mm7, mm1                    ; z0 - z3
      psubsw      mm6, mm5                    ; z1 - z2

      ; z2 is no longer needed: hand the rounder to the column pass in mm5
      mova        mm5, [pd_512]               ; 0x200
  %endmacro
  ; COL_TRANSFORM dst, row, even_coeffs, odd_coeffs
  ; Used by: ff_rv34_idct_add_mmx2(uint8_t *dst, ptrdiff_t stride, DCTELEM *block);
  ;
  ; Turns one vector of four row-transformed words into four output pixels and
  ; adds them to the destination.
  ;   %1  memory operand addressing the 4 destination bytes (read, then written)
  ;   %2  MMX register holding the words c0..c3; overwritten
  ;   %3  pmaddwd operand (13, 13, 13, -13), register or memory
  ;   %4  pmaddwd operand (17, 7, 7, -17), register or memory
  ;   mm5 must hold the rounder that is added before the shift by 10
  ; Clobbers: mm1, mm2, mm3
  %macro COL_TRANSFORM 4
      ; separate odd and even inputs, then form the four partial sums
      pshufw      mm3, %2, 0xDD       ; c1, c3, c1, c3
      pshufw      %2, %2, 0x88        ; c0, c2, c0, c2
      pmaddwd     %2, %3              ; 13*c0+13*c2 | 13*c0-13*c2 = z0 | z1
      pmaddwd     mm3, %4             ; 17*c1+ 7*c3 |  7*c1-17*c3 = z3 | z2
      paddd       %2, mm5             ; rounder goes into z0 and z1

      ; swapped copies feed the difference half of the butterfly
      pshufw      mm2, mm3, 0x4E      ; z2 | z3
      pshufw      mm1, %2, 0x4E       ; z1 | z0
      psubd       mm1, mm2            ; z1-z2 | z0-z3
      paddd       %2, mm3             ; z0+z3 | z1+z2

      ; scale down and narrow to four signed words
      psrad       mm1, 10
      psrad       %2, 10
      packssdw    %2, mm1

      ; widen the destination pixels to words, accumulate, clamp, store
      pxor        mm2, mm2
      movd        mm3, %1
      punpcklbw   mm3, mm2
      paddw       %2, mm3
      packuswb    %2, %2
      movd        %1, %2
  %endmacro
  ; rv34_idct_add: inverse transform of a 4x4 coefficient block, added to the
  ; destination pixels.
  ;
  ; In:  d = destination pointer, s = stride, b = coefficient block
  ;      (argument names come from the cglobal line; dq/sq/bq are the
  ;      corresponding full-width registers)
  ; Side effects: ROW_TRANSFORM zeroes the 32 bytes at b; the d register is
  ;      advanced by two rows halfway through.
  ; Uses mm0-mm7.
  INIT_MMX mmx2
  cglobal rv34_idct_add, 3,3,0, d, s, b
      ; Row pass: results in mm0, mm4, mm6, mm7; rounder in mm5.
      ROW_TRANSFORM       bq

      ; The first column pass consumes mm0, so both coefficient vectors are
      ; still read from memory here.
      COL_TRANSFORM     [dq], mm0, [pw_col_coeffs+ 0], [pw_col_coeffs+ 8]

      ; mm0 is free now: keep the even coefficients in it.
      mova               mm0, [pw_col_coeffs+ 0]
      COL_TRANSFORM  [dq+sq], mm4, mm0, [pw_col_coeffs+ 8]

      ; mm4 is free now: keep the odd coefficients in it.
      mova               mm4, [pw_col_coeffs+ 8]
      lea                 dq, [dq + 2*sq]     ; advance to the third row
      COL_TRANSFORM     [dq], mm6, mm0, mm4
      COL_TRANSFORM  [dq+sq], mm7, mm0, mm4

      ; Return through the x86inc epilogue macro, like the other functions in
      ; this file, so the epilogue stays correct if the register counts on
      ; the cglobal line are ever changed.
      RET
  ; ff_rv34_idct_dc_add_sse4(uint8_t *dst, int stride, int dc);
  ;
  ; Scales dc with IDCT_DC_ROUND and adds the result to four rows of four
  ; pixels at dst, dst+stride, dst+2*stride and dst+3*stride. The sums are
  ; formed as 16-bit words and clamped to bytes by packuswb.
  ;
  ; In:    r0 = dst, r1 = stride, r2 = dc
  ; Uses:  r2 (reused as dst + 2*stride), m0-m5
  ;
  ; NOTE(review): the prototype above says "int stride", but r1 is used at full
  ; register width in the address computations without sign extension; confirm
  ; that the C declaration passes a pointer-sized stride.
  INIT_XMM sse4
  cglobal rv34_idct_dc_add, 3, 3, 6
      ; calculate DC and replicate it into all eight words of m0
      IDCT_DC_ROUND r2
      movd        m0, r2d
      lea         r2, [r0+r1*2]       ; r2 is free again: address of row 2
      pshuflw     m0, m0, 0
      punpcklqdq  m0, m0

      ; load the four rows
      pxor        m1, m1
      movd        m2, [r0]
      movd        m3, [r0+r1]
      movd        m4, [r2]
      movd        m5, [r2+r1]

      ; rows 0 and 1: pair up, widen bytes to words, add DC
      punpckldq   m2, m3
      punpcklbw   m2, m1
      paddw       m2, m0

      ; rows 2 and 3: same
      punpckldq   m4, m5
      punpcklbw   m4, m1
      paddw       m4, m0

      ; clamp to bytes: one row per dword of m2
      packuswb    m2, m4

      ; store the four rows
      movd        [r0], m2
      pextrd      [r0+r1], m2, 1
      pextrd      [r2], m2, 2
      pextrd      [r2+r1], m2, 3
      RET