;******************************************************************************
;* SIMD-optimized HuffYUV functions
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; pshufb masks: a -1 byte zeroes the destination byte, any other value selects
; that source byte.  pb_zzzzzzzz77777777 and pb_7 are laid out back to back, so
; the 16-byte SSSE3 load of pb_zzzzzzzz77777777 also covers pb_7.
pb_f:                times 16 db 15
pb_zzzzzzzz77777777: times 8  db -1
pb_7:                times 8  db 7
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13

SECTION .text

; void ff_add_hfyu_median_pred_mmxext(uint8_t *dst, const uint8_t *top,
;                                     const uint8_t *diff, int w,
;                                     int *left, int *left_top)
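;
; Roughly the scalar logic this routine vectorizes (a readability sketch, not
; part of the build; median() is the median of its three arguments and all
; byte arithmetic wraps modulo 256):
;   l = *left; tl = *left_top;
;   for (i = 0; i < w; i++) {
;       l      = median(l, top[i], l + top[i] - tl) + diff[i];
;       tl     = top[i];
;       dst[i] = l;
;   }
;   *left = l; *left_top = tl;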
INIT_MMX mmxext
cglobal add_hfyu_median_pred, 6,6,0, dst, top, diff, w, left, left_top
    movq    mm0, [topq]
    movq    mm2, mm0
    movd    mm4, [left_topq]
    psllq   mm2, 8
    movq    mm1, mm0
    por     mm4, mm2            ; tl for the first block: *left_top, then top[0..6]
    movd    mm3, [leftq]
    psubb   mm0, mm4            ; t-tl
    add     dstq, wq
    add     topq, wq
    add     diffq, wq
    neg     wq
    jmp .skip
.loop:
    movq    mm4, [topq+wq]
    movq    mm0, mm4
    psllq   mm4, 8
    por     mm4, mm1            ; tl: low byte is the last top pixel of the previous block
    movq    mm1, mm0            ; t
    psubb   mm0, mm4            ; t-tl
.skip:
    movq    mm2, [diffq+wq]
; the prediction is serial in l (each output pixel becomes the next l),
; so the 8 bytes of the block are produced one at a time
%assign i 0
%rep 8
    movq    mm4, mm0
    paddb   mm4, mm3            ; t-tl+l
    movq    mm5, mm3
    pmaxub  mm3, mm1
    pminub  mm5, mm1
    pminub  mm3, mm4
    pmaxub  mm3, mm5            ; median
    paddb   mm3, mm2            ; +residual
%if i==0
    movq    mm7, mm3
    psllq   mm7, 56
%else
    movq    mm6, mm3
    psrlq   mm7, 8
    psllq   mm6, 56
    por     mm7, mm6            ; insert the new pixel as the top byte of the result
%endif
%if i<7
    psrlq   mm0, 8
    psrlq   mm1, 8
    psrlq   mm2, 8
%endif
%assign i i+1
%endrep
    movq    [dstq+wq], mm7
    add     wq, 8
    jl .loop
    movzx   r2d, byte [dstq-1]  ; *left = last decoded pixel
    mov     [leftq], r2d
    movzx   r2d, byte [topq-1]  ; *left_top = last top pixel
    mov     [left_topq], r2d
    RET

; Left prediction over one register per pass: a few shift/pshufb + add steps
; build a byte-wise prefix sum inside the register, and m0 carries the running
; left value (the last output pixel) over to the next pass.
%macro ADD_HFYU_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
    add     srcq, wq
    add     dstq, wq
    neg     wq
%%.loop:
%if %2
    mova    m1, [srcq+wq]
%else
    movu    m1, [srcq+wq]
%endif
    mova    m2, m1
    psllw   m1, 8
    paddb   m1, m2              ; sums of adjacent byte pairs
    mova    m2, m1
    pshufb  m1, m3
    paddb   m1, m2              ; prefix sums within each 4-byte group
    pshufb  m0, m5              ; broadcast the running left value
    mova    m2, m1
    pshufb  m1, m4
    paddb   m1, m2              ; prefix sums within each 8-byte group
%if mmsize == 16
    mova    m2, m1
    pshufb  m1, m6
    paddb   m1, m2              ; prefix sums over the full 16 bytes
%endif
    paddb   m0, m1              ; add the carried-in left value -> output pixels
%if %1
    mova    [dstq+wq], m0
%else
    movq    [dstq+wq], m0
    movhps  [dstq+wq+8], m0
%endif
    add     wq, mmsize
    jl %%.loop
    mov     eax, mmsize-1
    sub     eax, wd             ; index of the last valid pixel in the final register
    movd    m1, eax
    pshufb  m0, m1
    movd    eax, m0             ; return the last pixel in the low byte of eax
    RET
%endmacro

; int ff_add_hfyu_left_pred(uint8_t *dst, const uint8_t *src, int w, int left)
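;
; Rough scalar equivalent (a readability sketch, not part of the build): a
; running sum of the src bytes, seeded with `left` and wrapping modulo 256;
; the final sum is the return value.
;   acc = left;
;   for (i = 0; i < w; i++) {
;       acc    = (acc + src[i]) & 0xff;
;       dst[i] = acc;
;   }
;   return acc;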
INIT_MMX ssse3
cglobal add_hfyu_left_pred, 3,3,7, dst, src, w, left
.skip_prologue:
    mova    m5, [pb_7]
    mova    m4, [pb_zzzz3333zzzzbbbb]
    mova    m3, [pb_zz11zz55zz99zzdd]
    movd    m0, leftm
    psllq   m0, 56              ; seed the running left value in the top byte
    ADD_HFYU_LEFT_LOOP 1, 1

INIT_XMM ssse3
cglobal add_hfyu_left_pred_unaligned, 3,3,7, dst, src, w, left
    mova    m5, [pb_f]
    mova    m6, [pb_zzzzzzzz77777777]
    mova    m4, [pb_zzzz3333zzzzbbbb]
    mova    m3, [pb_zz11zz55zz99zzdd]
    movd    m0, leftm
    pslldq  m0, 15              ; seed the running left value in the top byte
    test    srcq, 15
    jnz .src_unaligned
    test    dstq, 15
    jnz .dst_unaligned
    ADD_HFYU_LEFT_LOOP 1, 1
.dst_unaligned:
    ADD_HFYU_LEFT_LOOP 0, 1
.src_unaligned:
    ADD_HFYU_LEFT_LOOP 0, 0