You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

180 lines
4.6KB

  1. ;******************************************************************************
  2. ;* SIMD-optimized quarterpel functions
  3. ;* Copyright (c) 2008 Loren Merritt
  4. ;* Copyright (c) 2003-2013 Michael Niedermayer
  5. ;* Copyright (c) 2013 Daniel Kang
  6. ;*
  7. ;* This file is part of Libav.
  8. ;*
  9. ;* Libav is free software; you can redistribute it and/or
  10. ;* modify it under the terms of the GNU Lesser General Public
  11. ;* License as published by the Free Software Foundation; either
  12. ;* version 2.1 of the License, or (at your option) any later version.
  13. ;*
  14. ;* Libav is distributed in the hope that it will be useful,
  15. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  17. ;* Lesser General Public License for more details.
  18. ;*
  19. ;* You should have received a copy of the GNU Lesser General Public
  20. ;* License along with Libav; if not, write to the Free Software
  21. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22. ;******************************************************************************
  23. %include "libavutil/x86/x86util.asm"
  24. SECTION .text
  ; op_avgh  reg, mem, tmp
  ; "avg" store operator, half-register (movh) flavour.
  ;   %1  register holding the freshly computed pixels (overwritten with the
  ;       value that is stored)
  ;   %2  destination memory operand (read, then written)
  ;   %3  scratch register, clobbered
  ; The current destination contents are fetched into %3, averaged into %1
  ; with pavgb (unsigned byte average with rounding) and written back.
  ; NOTE(review): movh comes from the included x86util.asm helpers, which are
  ; not visible here - confirm the width it maps to for the active INIT_* mode.
  %macro op_avgh 3
      movh    %3, %2              ; tmp = current dst pixels
      pavgb   %1, %3              ; reg = rounded average(reg, tmp)
      movh    %2, %1              ; dst = reg
  %endmacro
  ; op_avg  reg, mem
  ; "avg" store operator, full-register (mova) flavour.
  ;   %1  register holding the freshly computed pixels (overwritten with the
  ;       value that is stored)
  ;   %2  destination memory operand (read by pavgb, then written)
  ; No scratch register is needed: pavgb takes the destination directly as
  ; its memory source operand.
  %macro op_avg 2
      pavgb   %1, %2              ; reg = rounded average(reg, dst)
      mova    %2, %1              ; dst = reg
  %endmacro
  ; op_puth  reg, mem [, tmp]
  ; "put" store operator, half-register (movh) flavour: writes %1 to %2.
  ; The optional third operand is accepted and ignored, so a call site can
  ; pass the same operand list it would pass to op_avgh.
  %macro op_puth 2-3
      movh    %2, %1              ; dst = reg
  %endmacro
  ; op_put  reg, mem
  ; "put" store operator, full-register (mova) flavour: writes %1 to %2
  ; without looking at the previous destination contents.
  %macro op_put 2
      mova    %2, %1              ; dst = reg
  %endmacro
  ; void ff_put/avg_pixels4_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
  ;                                   int dstStride, int src1Stride, int h)
  ;
  ; PIXELS4_L2 put|avg
  ; For each of the h rows, the 4 bytes at src1 and the 4 bytes at src2 are
  ; combined with pavgb (unsigned byte average with rounding) and handed to OP:
  ;   put -> op_puth: store the result to dst
  ;   avg -> op_avgh: average the result with the bytes already at dst, store
  ;
  ; Register roles:
  ;   r0 = dst   advanced by dstStride  (r3) per row
  ;   r1 = src1  advanced by src1Stride (r4) per row
  ;   r2 = src2  packed rows, advanced by 4 bytes per row
  ;   r5 = h     rows still to do
  ;   m0, m1     row accumulators; m2 = src2 scratch; m3 = scratch for OP
  ;
  ; Row count contract: when h is odd one row is peeled off first. The main
  ; loop then processes 4 rows per iteration and exits only when the counter
  ; hits exactly zero (sub/jne, no check on entry), so the number of rows left
  ; for the loop must be a non-zero multiple of 4.
  ;
  ; Every source access is a 4-byte movd. Loading a whole MMX register, or
  ; using pavgb with a memory operand, would fetch 8 bytes per 4-pixel row:
  ; 4 bytes beyond each src1 row and, on the last row, 4 bytes beyond the end
  ; of the packed src2 buffer. The stored result only ever depends on the low
  ; 4 bytes, so the narrower loads do not change the output.
  %macro PIXELS4_L2 1
  %define OP op_%1h
  cglobal %1_pixels4_l2, 6,6
      movsxdifnidn r3, r3d                ; widen the int strides for addressing
      movsxdifnidn r4, r4d
      test         r5d, 1
      je           .loop                  ; even h: nothing to peel
      ; odd h: handle a single row so the loop sees an even count
      movd         m0, [r1]
      movd         m1, [r2]
      add          r1, r4
      add          r2, 4
      pavgb        m0, m1
      OP           m0, [r0], m3
      add          r0, r3
      dec          r5d
  .loop:
      ; rows 0 and 1 of this group of four
      movd         m0, [r1]
      movd         m1, [r1+r4]
      lea          r1, [r1+2*r4]
      movd         m2, [r2]
      pavgb        m0, m2
      movd         m2, [r2+4]
      pavgb        m1, m2
      OP           m0, [r0], m3
      OP           m1, [r0+r3], m3
      lea          r0, [r0+2*r3]
      ; rows 2 and 3
      movd         m0, [r1]
      movd         m1, [r1+r4]
      lea          r1, [r1+2*r4]
      movd         m2, [r2+8]
      pavgb        m0, m2
      movd         m2, [r2+12]
      pavgb        m1, m2
      OP           m0, [r0], m3
      OP           m1, [r0+r3], m3
      lea          r0, [r0+2*r3]
      add          r2, 16                 ; 4 packed rows of 4 bytes consumed
      sub          r5d, 4                 ; must stay last: jne tests its flags
      jne          .loop
      REP_RET
  %endmacro

  INIT_MMX mmxext
  PIXELS4_L2 put
  PIXELS4_L2 avg
  ; void ff_put/avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
  ;                                   int dstStride, int src1Stride, int h)
  ;
  ; PIXELS8_L2 put|avg
  ; Produces h rows of one register each: a src1 row and a src2 row are merged
  ; with pavgb (unsigned byte average with rounding) and passed to OP, which
  ; is op_put (store) or op_avg (average with the existing dst row, store).
  ;
  ;   dst   steps by dstStride per row
  ;   src1  steps by src1Stride per row
  ;   src2  is packed and steps by 8 bytes per row
  ;
  ; An odd h is reduced by one peeled row; the unrolled loop then handles
  ; 4 rows per pass and leaves only when the counter reaches exactly zero, so
  ; the rows remaining for the loop must be a non-zero multiple of 4.
  ; Uses m0 and m1.
  %macro PIXELS8_L2 1
  %define OP op_%1
  cglobal %1_pixels8_l2, 6,6,0, dst, src1, src2, dstStride, src1Stride, h
      movsxdifnidn dstStrideq,  dstStrided
      movsxdifnidn src1Strideq, src1Strided
      test         hd, 1
      je           .loop                  ; h already even
      ; peel one row
      mova         m0, [src1q]
      mova         m1, [src2q]
      add          src1q, src1Strideq
      add          src2q, 8
      pavgb        m0, m1
      OP           m0, [dstq]
      add          dstq, dstStrideq
      dec          hd
  .loop:
      ; first row pair
      mova         m0, [src1q]
      mova         m1, [src1q+src1Strideq]
      lea          src1q, [src1q+2*src1Strideq]
      pavgb        m0, [src2q]
      pavgb        m1, [src2q+8]
      OP           m0, [dstq]
      OP           m1, [dstq+dstStrideq]
      lea          dstq, [dstq+2*dstStrideq]
      ; second row pair
      mova         m0, [src1q]
      mova         m1, [src1q+src1Strideq]
      lea          src1q, [src1q+2*src1Strideq]
      pavgb        m0, [src2q+16]
      pavgb        m1, [src2q+24]
      OP           m0, [dstq]
      OP           m1, [dstq+dstStrideq]
      lea          dstq, [dstq+2*dstStrideq]
      add          src2q, 32              ; four packed 8-byte rows consumed
      sub          hd, 4                  ; sets the flags jne tests
      jne          .loop
      REP_RET
  %endmacro

  INIT_MMX mmxext
  PIXELS8_L2 put
  PIXELS8_L2 avg
  ; void ff_put/avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
  ;                                    int dstStride, int src1Stride, int h)
  ;
  ; PIXELS16_L2 put|avg
  ; Each 16-byte row is processed as two registers (byte offsets 0 and 8):
  ; the src1 and src2 halves are merged with pavgb (unsigned byte average with
  ; rounding) and passed to OP, which is op_put (store) or op_avg (average
  ; with the existing dst bytes, store).
  ;
  ;   dst   steps by dstStride per row
  ;   src1  steps by src1Stride per row
  ;   src2  is packed and steps by 16 bytes per row
  ;
  ; An odd h is reduced by one peeled row; the loop then handles 2 rows per
  ; pass and leaves only when the counter reaches exactly zero, so the rows
  ; remaining for the loop must be a non-zero multiple of 2.
  ; Uses m0 and m1.
  %macro PIXELS16_L2 1
  %define OP op_%1
  cglobal %1_pixels16_l2, 6,6,0, dst, src1, src2, dstStride, src1Stride, h
      movsxdifnidn dstStrideq,  dstStrided
      movsxdifnidn src1Strideq, src1Strided
      test         hd, 1
      je           .loop                  ; h already even
      ; peel one row
      mova         m0, [src1q]
      mova         m1, [src1q+8]
      pavgb        m0, [src2q]
      pavgb        m1, [src2q+8]
      add          src1q, src1Strideq
      add          src2q, 16
      OP           m0, [dstq]
      OP           m1, [dstq+8]
      add          dstq, dstStrideq
      dec          hd
  .loop:
      ; first row of the pair
      mova         m0, [src1q]
      mova         m1, [src1q+8]
      add          src1q, src1Strideq
      pavgb        m0, [src2q]
      pavgb        m1, [src2q+8]
      OP           m0, [dstq]
      OP           m1, [dstq+8]
      add          dstq, dstStrideq
      ; second row of the pair
      mova         m0, [src1q]
      mova         m1, [src1q+8]
      add          src1q, src1Strideq
      pavgb        m0, [src2q+16]
      pavgb        m1, [src2q+24]
      OP           m0, [dstq]
      OP           m1, [dstq+8]
      add          dstq, dstStrideq
      add          src2q, 32              ; two packed 16-byte rows consumed
      sub          hd, 2                  ; sets the flags jne tests
      jne          .loop
      REP_RET
  %endmacro

  INIT_MMX mmxext
  PIXELS16_L2 put
  PIXELS16_L2 avg