You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

177 lines
4.5KB

  1. ;******************************************************************************
  2. ;* MMX optimized DSP utils
  3. ;* Copyright (c) 2008 Loren Merritt
  4. ;* Copyright (c) 2003-2013 Michael Niedermayer
  5. ;* Copyright (c) 2013 Daniel Kang
  6. ;*
  7. ;* This file is part of Libav.
  8. ;*
  9. ;* Libav is free software; you can redistribute it and/or
  10. ;* modify it under the terms of the GNU Lesser General Public
  11. ;* License as published by the Free Software Foundation; either
  12. ;* version 2.1 of the License, or (at your option) any later version.
  13. ;*
  14. ;* Libav is distributed in the hope that it will be useful,
  15. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  17. ;* Lesser General Public License for more details.
  18. ;*
  19. ;* You should have received a copy of the GNU Lesser General Public
  20. ;* License along with Libav; if not, write to the Free Software
  21. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22. ;******************************************************************************
  23. %include "libavutil/x86/x86util.asm"
  24. SECTION .text
  ;------------------------------------------------------------------------------
  ; op_avgh  reg, mem, tmp
  ;
  ; Half-register "avg" store: blends the pixels in reg with the pixels that
  ; are already stored at mem and writes the blend back to mem.
  ;   %1  register with the new pixels; overwritten with the averaged result
  ;   %2  destination memory operand (read first, then written)
  ;   %3  scratch register, clobbered
  ; The destination is fetched with a half-width load into %3 instead of being
  ; used directly as pavgb's memory operand.
  ;------------------------------------------------------------------------------
  %macro op_avgh 3
      movh        %3, %2              ; tmp = pixels currently at the destination
      pavgb       %1, %3              ; reg = byte-wise rounded average(reg, tmp)
      movh        %2, %1              ; store the half-width result
  %endmacro
  ;------------------------------------------------------------------------------
  ; op_avg  reg, mem
  ;
  ; Full-register "avg" store: averages reg with the pixels already at mem and
  ; writes the result back to mem.
  ;   %1  register with the new pixels; overwritten with the averaged result
  ;   %2  destination memory operand (read by pavgb, then written)
  ;------------------------------------------------------------------------------
  %macro op_avg 2
      pavgb       %1, %2              ; reg = byte-wise rounded average(reg, [mem])
      mova        %2, %1              ; store the full-width result
  %endmacro
  ;------------------------------------------------------------------------------
  ; op_puth  reg, mem [, tmp]
  ;
  ; Half-register "put" store: writes reg to mem without reading the destination.
  ;   %1  register with the pixels to store
  ;   %2  destination memory operand
  ;   %3  optional and unused; accepted so that call sites can pass the same
  ;       three operands they would pass to op_avgh
  ;------------------------------------------------------------------------------
  %macro op_puth 2-3
      movh        %2, %1              ; plain half-width store
  %endmacro
  ;------------------------------------------------------------------------------
  ; op_put  reg, mem
  ;
  ; Full-register "put" store: writes reg to mem without reading the destination.
  ;   %1  register with the pixels to store
  ;   %2  destination memory operand
  ;------------------------------------------------------------------------------
  %macro op_put 2
      mova        %2, %1              ; plain full-width store
  %endmacro
  ;------------------------------------------------------------------------------
  ; void {put,avg}_pixels4_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
  ;                                  int dstStride, int src1Stride, int h)
  ;
  ; For every row, averages 4 bytes of src1 with 4 bytes of src2 (pavgb, i.e.
  ; byte-wise rounded average) and passes the result to OP:
  ;   put -> op_puth : store the result to dst
  ;   avg -> op_avgh : average the result once more with what dst already holds
  ;
  ; Registers (in the argument order of the prototype above):
  ;   r0 = dst          r1 = src1            r2 = src2
  ;   r3 = dstStride    r4 = src1Stride      r5d = h
  ;   m0, m1 = row results, m2 = src2 row staging, m3 = scratch handed to OP
  ;
  ; src2 is consumed as a packed array: successive rows are 4 bytes apart.
  ;
  ; Row count: when h is odd one row is peeled off first. The main loop is then
  ; entered unconditionally, handles 4 rows per pass and only leaves when the
  ; counter hits exactly zero, so h must be a multiple of 4, or a multiple of 4
  ; plus one (and at least 4).
  ;
  ; All pixel loads are 4 bytes wide (movd). A full-width mova / pavgb-from-memory
  ; would fetch 8 bytes per row under INIT_MMX, i.e. 4 bytes beyond each src1 row
  ; and, for the last row, 4 bytes beyond the end of the packed src2 data.
  ;------------------------------------------------------------------------------
  %macro PIXELS4_L2 1
  %define OP op_%1h
  cglobal %1_pixels4_l2, 6,6
      movsxdifnidn r3, r3d            ; strides arrive as int: widen for addressing
      movsxdifnidn r4, r4d
      test        r5d, 1
      je          .loop               ; even h: go straight to the 4-row loop

      ; odd h: process a single leading row
      movd        m0, [r1]
      movd        m1, [r2]
      add         r1, r4
      add         r2, 4
      pavgb       m0, m1
      OP          m0, [r0], m3
      add         r0, r3
      dec         r5d

  .loop:
      ; rows 0 and 1 of this pass
      movd        m0, [r1]
      movd        m1, [r1+r4]
      lea         r1, [r1+2*r4]
      movd        m2, [r2]
      pavgb       m0, m2
      movd        m2, [r2+4]
      pavgb       m1, m2
      OP          m0, [r0], m3
      OP          m1, [r0+r3], m3
      lea         r0, [r0+2*r3]

      ; rows 2 and 3 of this pass
      movd        m0, [r1]
      movd        m1, [r1+r4]
      lea         r1, [r1+2*r4]
      movd        m2, [r2+8]
      pavgb       m0, m2
      movd        m2, [r2+12]
      pavgb       m1, m2
      OP          m0, [r0], m3
      OP          m1, [r0+r3], m3
      lea         r0, [r0+2*r3]

      add         r2, 16              ; 4 packed src2 rows consumed
      sub         r5d, 4              ; must stay directly before the branch (flags)
      jne         .loop
      REP_RET
  %endmacro

  ; Emit the put_ and avg_ variants with the MMX register set (mmxext suffix).
  INIT_MMX mmxext
  PIXELS4_L2 put
  PIXELS4_L2 avg
  ;------------------------------------------------------------------------------
  ; void {put,avg}_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
  ;                                  int dstStride, int src1Stride, int h)
  ;
  ; For every row, averages 8 bytes of src1 with 8 bytes of src2 (pavgb, i.e.
  ; byte-wise rounded average) and passes the result to OP:
  ;   put -> op_put : store the result to dst
  ;   avg -> op_avg : average the result once more with what dst already holds
  ;
  ; Registers (in the argument order of the prototype above):
  ;   r0 = dst          r1 = src1            r2 = src2
  ;   r3 = dstStride    r4 = src1Stride      r5d = h
  ;   m0, m1 = row results
  ;
  ; src2 is consumed as a packed array: successive rows are 8 bytes apart.
  ;
  ; Row count: when h is odd one row is peeled off first. The main loop is then
  ; entered unconditionally, handles 4 rows per pass and only leaves when the
  ; counter hits exactly zero, so h must be a multiple of 4, or a multiple of 4
  ; plus one (and at least 4).
  ;------------------------------------------------------------------------------
  %macro PIXELS8_L2 1
  %define OP op_%1
  cglobal %1_pixels8_l2, 6,6
      movsxdifnidn r3, r3d            ; strides arrive as int: widen for addressing
      movsxdifnidn r4, r4d
      test        r5d, 1
      jz          .loop               ; even h: go straight to the 4-row loop

      ; odd h: process a single leading row, then advance all three pointers
      mova        m0, [r1]
      mova        m1, [r2]
      pavgb       m0, m1
      OP          m0, [r0]
      add         r0, r3
      add         r1, r4
      add         r2, 8
      dec         r5d

  .loop:
      ; rows 0 and 1 of this pass
      mova        m0, [r1]
      mova        m1, [r1+r4]
      pavgb       m0, [r2]
      pavgb       m1, [r2+8]
      OP          m0, [r0]
      OP          m1, [r0+r3]
      lea         r1, [r1+2*r4]
      lea         r0, [r0+2*r3]

      ; rows 2 and 3 of this pass
      mova        m0, [r1]
      mova        m1, [r1+r4]
      pavgb       m0, [r2+16]
      pavgb       m1, [r2+24]
      OP          m0, [r0]
      OP          m1, [r0+r3]
      lea         r1, [r1+2*r4]
      lea         r0, [r0+2*r3]

      add         r2, 32              ; 4 packed src2 rows consumed
      sub         r5d, 4              ; must stay directly before the branch (flags)
      jnz         .loop
      REP_RET
  %endmacro

  ; Emit the put_ and avg_ variants with the MMX register set (mmxext suffix).
  INIT_MMX mmxext
  PIXELS8_L2 put
  PIXELS8_L2 avg
  ;------------------------------------------------------------------------------
  ; void {put,avg}_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
  ;                                   int dstStride, int src1Stride, int h)
  ;
  ; For every row, averages 16 bytes of src1 with 16 bytes of src2 (pavgb, i.e.
  ; byte-wise rounded average), handled as two 8-byte halves, and passes each
  ; half to OP:
  ;   put -> op_put : store the result to dst
  ;   avg -> op_avg : average the result once more with what dst already holds
  ;
  ; Registers (in the argument order of the prototype above):
  ;   r0 = dst          r1 = src1            r2 = src2
  ;   r3 = dstStride    r4 = src1Stride      r5d = h
  ;   m0 = left 8 bytes of the row, m1 = right 8 bytes of the row
  ;
  ; src2 is consumed as a packed array: successive rows are 16 bytes apart.
  ;
  ; Row count: when h is odd one row is peeled off first. The main loop is then
  ; entered unconditionally, handles 2 rows per pass and only leaves when the
  ; counter hits exactly zero, so h must be at least 2.
  ;------------------------------------------------------------------------------
  %macro PIXELS16_L2 1
  %define OP op_%1
  cglobal %1_pixels16_l2, 6,6
      movsxdifnidn r3, r3d            ; strides arrive as int: widen for addressing
      movsxdifnidn r4, r4d
      test        r5d, 1
      jz          .loop               ; even h: go straight to the 2-row loop

      ; odd h: process a single leading row, then advance all three pointers
      mova        m0, [r1]
      mova        m1, [r1+8]
      pavgb       m0, [r2]
      pavgb       m1, [r2+8]
      OP          m0, [r0]
      OP          m1, [r0+8]
      add         r0, r3
      add         r1, r4
      add         r2, 16
      dec         r5d

  .loop:
      ; first row of this pass
      mova        m0, [r1]
      mova        m1, [r1+8]
      pavgb       m0, [r2]
      pavgb       m1, [r2+8]
      OP          m0, [r0]
      OP          m1, [r0+8]
      add         r1, r4
      add         r0, r3

      ; second row of this pass
      mova        m0, [r1]
      mova        m1, [r1+8]
      pavgb       m0, [r2+16]
      pavgb       m1, [r2+24]
      OP          m0, [r0]
      OP          m1, [r0+8]
      add         r1, r4
      add         r0, r3

      add         r2, 32              ; 2 packed src2 rows consumed
      sub         r5d, 2              ; must stay directly before the branch (flags)
      jnz         .loop
      REP_RET
  %endmacro

  ; Emit the put_ and avg_ variants with the MMX register set (mmxext suffix).
  INIT_MMX mmxext
  PIXELS16_L2 put
  PIXELS16_L2 avg