You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

175 lines
4.7KB

  1. ;******************************************************************************
  2. ;* x86 optimizations for PNG decoding
  3. ;*
  4. ;* Copyright (c) 2008 Loren Merritt <lorenm@u.washington.edu>
  5. ;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
  6. ;*
  7. ;* This file is part of Libav.
  8. ;*
  9. ;* Libav is free software; you can redistribute it and/or
  10. ;* modify it under the terms of the GNU Lesser General Public
  11. ;* License as published by the Free Software Foundation; either
  12. ;* version 2.1 of the License, or (at your option) any later version.
  13. ;*
  14. ;* Libav is distributed in the hope that it will be useful,
  15. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  17. ;* Lesser General Public License for more details.
  18. ;*
  19. ;* You should have received a copy of the GNU Lesser General Public
  20. ;* License along with Libav; if not, write to the Free Software
  21. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22. ;******************************************************************************
  23. %include "x86inc.asm"
  24. %include "x86util.asm"
  ; Read-only data section. This file defines no constants of its own; it only
  ; imports pw_255, which add_png_paeth_prediction uses as an AND mask.
  SECTION_RODATA

  cextern pw_255

  ; All code below goes into a 16-byte aligned text section.
  section .text align=16
  ;------------------------------------------------------------------------------
  ; ADD_BYTES_FN xmm_regs
  ;
  ; Emits add_bytes_l2 for the instruction set selected by the preceding
  ; INIT_MMX / INIT_XMM. The generated function computes, with byte wrap-around,
  ;
  ;     for (i = 0; i < w; i++)
  ;         dst[i] = src1[i] + src2[i];
  ;
  ; %1 = nr. of xmm registers used (passed through to cglobal)
  ;
  ; Register roles (x86inc named registers):
  ;     dst, src1, src2 - buffer pointers (arguments 1-3)
  ;     wa              - argument 4, the width; reused as the per-stage loop
  ;                       limit and, in the scalar tail, as a byte scratch (wab)
  ;     w               - copy of the full width
  ;     i               - byte offset of the next element to process
  ;
  ; The width is compared as a signed value and every stage is entered through
  ; its comparison (jmp .end_*), so a zero or negative width does no work.
  ;
  ; The 8-byte stage of the XMM build uses XMM registers only, so the SSE2
  ; function never touches MMX/x87 state and needs no emms.
  ;
  ; NOTE(review): the main loop uses mova and memory-operand paddb, which for
  ; the XMM build presumably requires 16-byte aligned dst/src1/src2 - confirm
  ; against the callers.
  ;------------------------------------------------------------------------------
  %macro ADD_BYTES_FN 1
  cglobal add_bytes_l2, 4, 6, %1, dst, src1, src2, wa, w, i
  %if ARCH_X86_64
      movsxd             waq, wad             ; sign-extend the 32-bit width
  %endif
      xor                 iq, iq              ; i = 0

      ; main vector loop: 2*mmsize bytes per iteration
      mov                 wq, waq             ; keep the full width in w
      and                waq, ~(mmsize*2-1)   ; limit = w rounded down to 2*mmsize
      jmp .end_v
  .loop_v:
      mova                m0, [src1q+iq]
      mova                m1, [src1q+iq+mmsize]
      paddb               m0, [src2q+iq]
      paddb               m1, [src2q+iq+mmsize]
      mova  [dstq+iq       ], m0
      mova  [dstq+iq+mmsize], m1
      add                 iq, mmsize*2
  .end_v:
      cmp                 iq, waq
      jl .loop_v

  %if mmsize == 16
      ; 8-byte loop (XMM build only): handles what the 32-byte loop left over.
      ; Loads go through registers because a memory-operand paddb would read
      ; a full 16 bytes.
      mov                waq, wq
      and                waq, ~7              ; limit = w rounded down to 8
      jmp .end_l
  .loop_l:
      movq                m0, [src1q+iq]
      movq                m1, [src2q+iq]
      paddb               m0, m1
      movq         [dstq+iq], m0
      add                 iq, 8
  .end_l:
      cmp                 iq, waq
      jl .loop_l
  %endif

      ; scalar loop for leftover bytes; wa is free now, so its low byte
      ; serves as the accumulator
      jmp .end_s
  .loop_s:
      mov                wab, [src1q+iq]
      add                wab, [src2q+iq]
      mov          [dstq+iq], wab
      inc                 iq
  .end_s:
      cmp                 iq, wq
      jl .loop_s
      REP_RET
  %endmacro
  ; add_bytes_l2 variants. The MMX version is assembled only when
  ; ARCH_X86_32 is set; the SSE2 version is assembled unconditionally.
  ; The macro argument is the XMM register count forwarded to cglobal.
  %if ARCH_X86_32
  INIT_MMX mmx
  ADD_BYTES_FN 0
  %endif

  INIT_XMM sse2
  ADD_BYTES_FN 2
  ;------------------------------------------------------------------------------
  ; ADD_PAETH_PRED_FN xmm_regs
  ;
  ; Emits add_png_paeth_prediction for the selected instruction set. For every
  ; processed byte position x the function stores
  ;
  ;     dst[x] = (src[x] + pred(a, b, c)) & 255
  ;
  ; with a = dst[x - bpp] (left), b = top[x] (above), c = top[x - bpp]
  ; (above-left). With pa = |b - c|, pb = |a - c|, pc = |a + b - 2c|:
  ;
  ;     pred = a   if pa <= min(pb, pc)
  ;            c   else if pb > pc
  ;            b   otherwise
  ;
  ; %1 = nr. of xmm registers used (passed through to cglobal)
  ;
  ; Arguments: dst, src, top, w, bpp        Locals: end, cntr
  ;
  ; Loop structure:
  ;   - the inner loop handles one group of mmsize/2 bytes, then steps by bpp
  ;     and repeats while dst <= end (signed compare)
  ;   - the outer loop repeats this for each mmsize/2-byte group inside a
  ;     pixel, counting cntr from (bpp-1) >> (2 + mmsize/16) down to 0
  ;
  ; Register roles inside the inner loop (all unpacked to 16-bit words):
  ;     m0 = a (also the previous iteration's result)   m1 = b
  ;     m2 = c                                          m7 = zero
  ;
  ; The function reads dst and top starting bpp bytes before the passed
  ; pointers. One stack slot holds the rewound dst between outer iterations.
  ;
  ; NOTE(review): the loop bound compares addresses with a signed condition
  ; (jle) - confirm this is acceptable for all supported address ranges.
  ;------------------------------------------------------------------------------
  %macro ADD_PAETH_PRED_FN 1
  cglobal add_png_paeth_prediction, 5, 7, %1, dst, src, top, w, bpp, end, cntr
  %if ARCH_X86_64
      movsxd            bppq, bppd            ; sign-extend the 32-bit arguments
      movsxd              wq, wd
  %endif
      lea               endq, [dstq+wq-(mmsize/2-1)]
      sub               topq, dstq            ; top and src become offsets
      sub               srcq, dstq            ;   relative to dst
      sub               dstq, bppq            ; start one pixel to the left
      pxor                m7, m7              ; zero, for byte->word unpacking

      PUSH              dstq                  ; remember the rewound start
      lea              cntrq, [bppq-1]
      shr              cntrq, 2 + mmsize/16   ; index of the last byte group
  .group_loop:
      lea               dstq, [dstq+cntrq*(mmsize/2)]
      movh                m0, [dstq]          ; a: left neighbour
      movh                m1, [topq+dstq]     ; becomes c on the first iteration
      punpcklbw           m0, m7
      punpcklbw           m1, m7
      add               dstq, bppq            ; advance to the current pixel
  .pixel_loop:
      mova                m2, m1              ; c = previous b
      movh                m1, [topq+dstq]     ; b = byte(s) above
      mova                m3, m2
      punpcklbw           m1, m7
      mova                m4, m2
      psubw               m3, m1              ; m3 = c - b
      psubw               m4, m0              ; m4 = c - a
      mova                m5, m3
      paddw               m5, m4              ; m5 = 2c - a - b
  %if cpuflag(ssse3)
      pabsw               m3, m3              ; pa
      pabsw               m4, m4              ; pb
      pabsw               m5, m5              ; pc
  %else ; !cpuflag(ssse3)
      ; abs(x) computed as max(x, 0 - x); m7 is borrowed as scratch and
      ; cleared again at the end
      psubw               m7, m5
      pmaxsw              m5, m7              ; pc
      pxor                m6, m6
      pxor                m7, m7
      psubw               m6, m3
      psubw               m7, m4
      pmaxsw              m3, m6              ; pa
      pmaxsw              m4, m7              ; pb
      pxor                m7, m7
  %endif ; cpuflag(ssse3)
      mova                m6, m4
      pminsw              m6, m5              ; min(pb, pc)
      pcmpgtw             m3, m6              ; m3 = mask: pa > min(pb, pc)
      pcmpgtw             m4, m5              ; m4 = mask: pb > pc
      mova                m6, m4
      pand                m4, m3              ; m4 = mask selecting c
      pandn               m6, m3              ; m6 = mask selecting b
      pandn               m3, m0              ; m3 = a where a is selected, else 0
      movh                m0, [srcq+dstq]     ; residual from src
      pand                m6, m1              ; b where selected
      pand                m2, m4              ; c where selected
      punpcklbw           m0, m7
      paddw               m0, m6
      paddw               m3, m2
      paddw               m0, m3              ; src + chosen predictor
      pand                m0, [pw_255]        ; keep the low byte of each word
      mova                m3, m0              ; m0 stays live as the next a
      packuswb            m3, m3
      movh            [dstq], m3
      add               dstq, bppq
      cmp               dstq, endq
      jle .pixel_loop

      mov               dstq, [rsp]           ; reload the rewound start
      dec              cntrq
      jge .group_loop                         ; until cntr drops below zero
      POP               dstq
      RET
  %endmacro
  ; add_png_paeth_prediction variants. Both are built with INIT_MMX; the
  ; ssse3 one takes the pabsw path inside the macro, the mmx2 one takes the
  ; psubw/pmaxsw fallback.
  INIT_MMX mmx2
  ADD_PAETH_PRED_FN 0

  INIT_MMX ssse3
  ADD_PAETH_PRED_FN 0