You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

174 lines
4.7KB

  1. ;******************************************************************************
  2. ;* x86 optimizations for PNG decoding
  3. ;*
  4. ;* Copyright (c) 2008 Loren Merritt <lorenm@u.washington.edu>
  5. ;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
  6. ;*
  7. ;* This file is part of Libav.
  8. ;*
  9. ;* Libav is free software; you can redistribute it and/or
  10. ;* modify it under the terms of the GNU Lesser General Public
  11. ;* License as published by the Free Software Foundation; either
  12. ;* version 2.1 of the License, or (at your option) any later version.
  13. ;*
  14. ;* Libav is distributed in the hope that it will be useful,
  15. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  17. ;* Lesser General Public License for more details.
  18. ;*
  19. ;* You should have received a copy of the GNU Lesser General Public
  20. ;* License along with Libav; if not, write to the Free Software
  21. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22. ;******************************************************************************
  23. %include "libavutil/x86/x86util.asm"
  24. SECTION_RODATA
  25. cextern pw_255
  26. SECTION .text
  ;------------------------------------------------------------------------------
  ; ADD_BYTES_FN <xmm_regs>
  ;   %1 = nr. of xmm registers used
  ;
  ; Emits add_bytes_l2(dst, src1, src2, w) for the instruction set chosen by the
  ; preceding INIT_MMX / INIT_XMM:
  ;
  ;     dst[i] = (src1[i] + src2[i]) & 0xff        for 0 <= i < w
  ;
  ; Register roles:
  ;   dst, src1, src2  buffer base pointers
  ;   wa               width argument; reused as the bound of the current vector
  ;                    loop and (as wab) as the byte temporary of the scalar loop
  ;   w                copy of the full width
  ;   i                byte offset shared by all loops
  ;
  ; All loop tests are signed (jl), so w <= 0 performs no work.
  ;
  ; The main loop uses mova and paddb with a memory operand; in the XMM build
  ; these need 16-byte aligned addresses, so all three buffers must be 16-byte
  ; aligned whenever w >= 32.
  ;
  ; The XMM build touches only XMM registers, so it returns with MMX/x87 state
  ; untouched. The MMX build issues no emms before returning.
  ; NOTE(review): confirm that callers of the MMX build clear MMX state.
  ;------------------------------------------------------------------------------
  %macro ADD_BYTES_FN 1
  cglobal add_bytes_l2, 4, 6, %1, dst, src1, src2, wa, w, i
  %if ARCH_X86_64
      movsxd             waq, wad                     ; sign-extend the 32-bit width
  %endif
      xor                 iq, iq                      ; i = 0

      ; main vector loop: 2*mmsize bytes per iteration
      mov                 wq, waq                     ; keep the full width in w
      and                waq, ~(mmsize*2-1)           ; bound = w rounded down to 2*mmsize
      jmp .end_v
  .loop_v:
      mova                m0, [src1q+iq]
      mova                m1, [src1q+iq+mmsize]
      paddb               m0, [src2q+iq]
      paddb               m1, [src2q+iq+mmsize]
      mova  [dstq+iq       ], m0
      mova  [dstq+iq+mmsize], m1
      add                 iq, mmsize*2
  .end_v:
      cmp                 iq, waq
      jl .loop_v

  %if mmsize == 16
      ; 8-byte loop for the part of the tail the 32-byte loop could not take.
      ; It works on the low halves of XMM registers (not on mm0) so that this
      ; function never dirties MMX/x87 state and therefore needs no emms.
      ; src2 is loaded with movq first because paddb with a memory operand
      ; would read 16 bytes and require 16-byte alignment.
      mov                waq, wq
      and                waq, ~7                      ; bound = w rounded down to 8
      jmp .end_l
  .loop_l:
      movq                m0, [src1q+iq]
      movq                m1, [src2q+iq]
      paddb               m0, m1
      movq  [dstq+iq       ], m0
      add                 iq, 8
  .end_l:
      cmp                 iq, waq
      jl .loop_l
  %endif

      ; scalar loop for leftover
      jmp .end_s
  .loop_s:
      mov                wab, [src1q+iq]              ; wa is free here: w holds the bound
      add                wab, [src2q+iq]
      mov          [dstq+iq], wab
      inc                 iq
  .end_s:
      cmp                 iq, wq
      jl .loop_s
      REP_RET
  %endmacro

  ; The MMX version is only built for 32-bit x86.
  %if ARCH_X86_32
  INIT_MMX mmx
  ADD_BYTES_FN 0
  %endif

  ; The SSE2 version uses m0 and m1, hence two xmm registers.
  INIT_XMM sse2
  ADD_BYTES_FN 2
  ;------------------------------------------------------------------------------
  ; ADD_PAETH_PRED_FN <xmm_regs>
  ;   %1 = number of xmm registers passed on to cglobal
  ;
  ; Emits add_png_paeth_prediction(dst, src, top, w, bpp) for the instruction
  ; set chosen by the preceding INIT_*. For every byte position x it handles:
  ;
  ;     a  = dst[x - bpp]            (left)
  ;     b  = top[x]                  (above)
  ;     c  = top[x - bpp]            (above-left)
  ;     pa = |b - c|,  pb = |a - c|,  pc = |a + b - 2*c|
  ;     pred   = a if pa <= min(pb, pc), else b if pb <= pc, else c
  ;     dst[x] = (src[x] + pred) & 0xff
  ;
  ; Each step handles mmsize/2 bytes (widened to words) and then advances by
  ; bpp. The outer loop repeats the whole row once per mmsize/2-byte slice of a
  ; pixel, from the highest slice down to slice 0.
  ;
  ; Register roles:
  ;   dst    running output pointer (an absolute address)
  ;   src    src - dst, so [srcq+dstq] addresses src at the current position
  ;   top    top - dst, so [topq+dstq] addresses top at the current position
  ;   end    last dst value for which another step is taken
  ;   cntr   index of the pixel slice being processed
  ;   [rsp]  saved (dst - bpp), the row start for every slice
  ;   m7     zero, used for byte -> word unpacking
  ;------------------------------------------------------------------------------
  %macro ADD_PAETH_PRED_FN 1
  cglobal add_png_paeth_prediction, 5, 7, %1, dst, src, top, w, bpp, end, cntr
  %if ARCH_X86_64
      movsxd            bppq, bppd                    ; both are 32-bit signed arguments
      movsxd              wq, wd
  %endif
      pxor                m7, m7                      ; m7 = 0
      lea               endq, [dstq+wq-(mmsize/2-1)]
      sub               srcq, dstq                    ; src and top become offsets from dst
      sub               topq, dstq
      sub               dstq, bppq                    ; start one pixel to the left
      lea              cntrq, [bppq-1]
      shr              cntrq, 2 + mmsize/16           ; highest slice index
      PUSH              dstq                          ; remember the row start

  .slice_loop:
      lea               dstq, [dstq+cntrq*(mmsize/2)] ; move to this slice of the pixel
      movh                m0, [dstq]                  ; a: left
      movh                m1, [topq+dstq]             ; becomes c on the first step
      punpcklbw           m0, m7
      punpcklbw           m1, m7
      add               dstq, bppq

  .pixel_loop:
      mova                m2, m1                      ; c = previous b
      movh                m1, [topq+dstq]             ; b
      mova                m3, m2
      punpcklbw           m1, m7
      mova                m4, m2
      psubw               m3, m1                      ; c - b
      psubw               m4, m0                      ; c - a
      mova                m5, m3
      paddw               m5, m4                      ; 2*c - a - b
  %if cpuflag(ssse3)
      pabsw               m3, m3                      ; pa
      pabsw               m4, m4                      ; pb
      pabsw               m5, m5                      ; pc
  %else ; !cpuflag(ssse3)
      ; |x| = max(x, 0 - x); m7 serves as scratch and is zeroed again afterwards
      psubw               m7, m5
      pmaxsw              m5, m7                      ; pc
      pxor                m6, m6
      pxor                m7, m7
      psubw               m6, m3
      psubw               m7, m4
      pmaxsw              m3, m6                      ; pa
      pmaxsw              m4, m7                      ; pb
      pxor                m7, m7
  %endif ; cpuflag(ssse3)
      mova                m6, m4
      pminsw              m6, m5                      ; min(pb, pc)
      pcmpgtw             m3, m6                      ; mask: pa > min(pb, pc), a not chosen
      pcmpgtw             m4, m5                      ; mask: pb > pc
      mova                m6, m4
      pand                m4, m3                      ; mask: choose c
      pandn               m6, m3                      ; mask: choose b
      pandn               m3, m0                      ; a where chosen, else 0
      movh                m0, [srcq+dstq]             ; residual bytes from src
      pand                m6, m1                      ; b where chosen, else 0
      pand                m2, m4                      ; c where chosen, else 0
      punpcklbw           m0, m7
      paddw               m0, m6
      paddw               m3, m2
      paddw               m0, m3                      ; src + pred
      pand                m0, [pw_255]                ; wrap to 8 bits; also the next a

      mova                m3, m0
      packuswb            m3, m3
      movh            [dstq], m3
      add               dstq, bppq
      cmp               dstq, endq
      jle .pixel_loop

      mov               dstq, [rsp]                   ; back to the row start (flags untouched)
      dec              cntrq
      jge .slice_loop                                 ; loop while the slice index is >= 0
      POP               dstq
      RET
  %endmacro

  ; Both versions run on MMX registers, so no xmm registers are requested.
  INIT_MMX mmxext
  ADD_PAETH_PRED_FN 0

  INIT_MMX ssse3
  ADD_PAETH_PRED_FN 0