You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

257 lines
6.3KB

  1. ;******************************************************************************
  2. ;* MMX optimized DSP utils
  3. ;* Copyright (c) 2008 Loren Merritt
  4. ;* Copyright (c) 2003-2013 Michael Niedermayer
  5. ;* Copyright (c) 2013 Daniel Kang
  6. ;*
  7. ;* This file is part of Libav.
  8. ;*
  9. ;* Libav is free software; you can redistribute it and/or
  10. ;* modify it under the terms of the GNU Lesser General Public
  11. ;* License as published by the Free Software Foundation; either
  12. ;* version 2.1 of the License, or (at your option) any later version.
  13. ;*
  14. ;* Libav is distributed in the hope that it will be useful,
  15. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  17. ;* Lesser General Public License for more details.
  18. ;*
  19. ;* You should have received a copy of the GNU Lesser General Public
  20. ;* License along with Libav; if not, write to the Free Software
  21. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22. ;******************************************************************************
  23. %include "libavutil/x86/x86util.asm"
  24. SECTION .text
  ; op_avgh  src_reg, dst_mem, tmp_reg
  ;   "avg" store for half-register-wide rows:
  ;     tmp_reg <- dst_mem                 (movh load)
  ;     src_reg <- pavgb(src_reg, tmp_reg) (byte-wise average)
  ;     dst_mem <- src_reg                 (movh store)
  ;   Clobbers src_reg and tmp_reg.
  %macro op_avgh 3
      movh        %3, %2
      pavgb       %1, %3
      movh        %2, %1
  %endmacro
  ; op_avg  src_reg, dst_mem
  ;   "avg" store for full-register-wide rows: byte-averages src_reg with the
  ;   current contents of dst_mem (pavgb with a memory operand) and writes the
  ;   result back to dst_mem with mova.  Clobbers src_reg.
  %macro op_avg 2
      pavgb       %1, %2
      mova        %2, %1
  %endmacro
  ; op_puth  src_reg, dst_mem [, tmp_reg]
  ;   "put" store for half-register-wide rows: writes src_reg to dst_mem with
  ;   movh.  The optional third argument is accepted and ignored so that call
  ;   sites can use the same argument list as op_avgh.
  %macro op_puth 2-3
      movh        %2, %1
  %endmacro
  ; op_put  src_reg, dst_mem
  ;   "put" store for full-register-wide rows: writes src_reg to dst_mem
  ;   with mova.
  %macro op_put 2
      mova        %2, %1
  %endmacro
  ;------------------------------------------------------------------------------
  ; void {put,avg}_pixels4_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
  ;                                  int dstStride, int src1Stride, int h)
  ;
  ; PIXELS4_L2 <put|avg> emits one function operating on 4-byte wide rows.
  ; For every row the 4 bytes of src1 are byte-averaged (pavgb) with the 4 bytes
  ; of src2 and the result is handed to OP (op_puth / op_avgh), which either
  ; stores it to dst or averages it with the bytes already in dst first.
  ;
  ;   dst  advances by dstStride  per row
  ;   src1 advances by src1Stride per row
  ;   src2 advances by 4 bytes    per row (rows are read back to back)
  ;
  ; An odd h is handled by one leading single-row step.  The main loop then
  ; consumes four rows per iteration and only exits when the counter reaches
  ; exactly zero, so the remaining row count must be a non-zero multiple of 4.
  ;
  ; All source loads are 4-byte movd loads into registers.  Using mova or a
  ; memory operand on pavgb here would fetch a full 8-byte MMX word, i.e. read
  ; 4 bytes beyond every 4-byte row (and beyond the last src2 row).
  ;
  ; Registers: m0/m1 = row results, m2/m4 = src2 rows, m3 = OP scratch.
  ;------------------------------------------------------------------------------
  %macro PIXELS4_L2 1
  %define OP op_%1h
  cglobal %1_pixels4_l2, 6,6,0, dst, src1, src2, dst_stride, src1_stride, h
      movsxdifnidn dst_strideq, dst_strided       ; int strides -> pointer width
      movsxdifnidn src1_strideq, src1_strided
      test         hd, 1
      je           .loop
      ; odd h: peel off a single row so the loop sees a multiple of 4 rows
      movd         m0, [src1q]
      movd         m1, [src2q]
      add          src1q, src1_strideq
      add          src2q, 4
      pavgb        m0, m1
      OP           m0, [dstq], m3
      add          dstq, dst_strideq
      dec          hd
  .loop:
      ; rows 0 and 1 of the current group of four
      movd         m0, [src1q]
      movd         m1, [src1q+src1_strideq]
      movd         m2, [src2q]
      movd         m4, [src2q+4]
      lea          src1q, [src1q+2*src1_strideq]
      pavgb        m0, m2
      pavgb        m1, m4
      OP           m0, [dstq], m3
      OP           m1, [dstq+dst_strideq], m3
      lea          dstq, [dstq+2*dst_strideq]
      ; rows 2 and 3
      movd         m0, [src1q]
      movd         m1, [src1q+src1_strideq]
      movd         m2, [src2q+8]
      movd         m4, [src2q+12]
      lea          src1q, [src1q+2*src1_strideq]
      pavgb        m0, m2
      pavgb        m1, m4
      OP           m0, [dstq], m3
      OP           m1, [dstq+dst_strideq], m3
      lea          dstq, [dstq+2*dst_strideq]
      add          src2q, 16                      ; 4 rows * 4 bytes
      sub          hd, 4
      jne          .loop
      REP_RET
  %endmacro

  INIT_MMX mmxext
  PIXELS4_L2 put
  PIXELS4_L2 avg
  ;------------------------------------------------------------------------------
  ; void {put,avg}_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
  ;                                  int dstStride, int src1Stride, int h)
  ;
  ; PIXELS8_L2 <put|avg> emits one function operating on 8-byte wide rows.
  ; Each row of src1 is byte-averaged (pavgb) with the matching row of src2 and
  ; passed to OP (op_put / op_avg) for the final store into dst.
  ;
  ;   dst  advances by dstStride  per row
  ;   src1 advances by src1Stride per row
  ;   src2 advances by 8 bytes    per row (rows are read back to back)
  ;
  ; An odd h is handled by one leading single-row step; the main loop then
  ; processes four rows per iteration and exits only on an exact zero count,
  ; so the remaining row count must be a non-zero multiple of 4.
  ;------------------------------------------------------------------------------
  %macro PIXELS8_L2 1
  %define OP op_%1
  cglobal %1_pixels8_l2, 6,6,0, dst, src1, src2, dst_stride, src1_stride, h
      movsxdifnidn dst_strideq, dst_strided       ; int strides -> pointer width
      movsxdifnidn src1_strideq, src1_strided
      test         hd, 1
      je           .loop
      ; odd h: peel off a single row
      mova         m0, [src1q]
      mova         m1, [src2q]
      add          src1q, src1_strideq
      add          src2q, 8
      pavgb        m0, m1
      OP           m0, [dstq]
      add          dstq, dst_strideq
      dec          hd
  .loop:
      ; rows 0 and 1 of the current group of four
      mova         m0, [src1q]
      mova         m1, [src1q+src1_strideq]
      lea          src1q, [src1q+2*src1_strideq]
      pavgb        m0, [src2q]
      pavgb        m1, [src2q+8]
      OP           m0, [dstq]
      OP           m1, [dstq+dst_strideq]
      lea          dstq, [dstq+2*dst_strideq]
      ; rows 2 and 3
      mova         m0, [src1q]
      mova         m1, [src1q+src1_strideq]
      lea          src1q, [src1q+2*src1_strideq]
      pavgb        m0, [src2q+16]
      pavgb        m1, [src2q+24]
      OP           m0, [dstq]
      OP           m1, [dstq+dst_strideq]
      lea          dstq, [dstq+2*dst_strideq]
      add          src2q, 32                      ; 4 rows * 8 bytes
      sub          hd, 4
      jne          .loop
      REP_RET
  %endmacro

  INIT_MMX mmxext
  PIXELS8_L2 put
  PIXELS8_L2 avg
  ;------------------------------------------------------------------------------
  ; void {put,avg}_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
  ;                                   int dstStride, int src1Stride, int h)
  ;
  ; PIXELS16_L2 <put|avg> emits one function operating on 16-byte wide rows,
  ; handled as two 8-byte halves (m0 = bytes 0-7, m1 = bytes 8-15).  Each half
  ; of src1 is byte-averaged (pavgb) with the matching half of src2 and passed
  ; to OP (op_put / op_avg) for the final store into dst.
  ;
  ;   dst  advances by dstStride  per row
  ;   src1 advances by src1Stride per row
  ;   src2 advances by 16 bytes   per row (rows are read back to back)
  ;
  ; An odd h is handled by one leading single-row step; the main loop then
  ; processes two rows per iteration and exits only on an exact zero count,
  ; so the remaining row count must be a non-zero multiple of 2.
  ;------------------------------------------------------------------------------
  %macro PIXELS16_L2 1
  %define OP op_%1
  cglobal %1_pixels16_l2, 6,6,0, dst, src1, src2, dst_stride, src1_stride, h
      movsxdifnidn dst_strideq, dst_strided       ; int strides -> pointer width
      movsxdifnidn src1_strideq, src1_strided
      test         hd, 1
      je           .loop
      ; odd h: peel off a single row
      mova         m0, [src1q]
      mova         m1, [src1q+8]
      pavgb        m0, [src2q]
      pavgb        m1, [src2q+8]
      add          src1q, src1_strideq
      add          src2q, 16
      OP           m0, [dstq]
      OP           m1, [dstq+8]
      add          dstq, dst_strideq
      dec          hd
  .loop:
      ; first row of the pair
      mova         m0, [src1q]
      mova         m1, [src1q+8]
      add          src1q, src1_strideq
      pavgb        m0, [src2q]
      pavgb        m1, [src2q+8]
      OP           m0, [dstq]
      OP           m1, [dstq+8]
      add          dstq, dst_strideq
      ; second row of the pair
      mova         m0, [src1q]
      mova         m1, [src1q+8]
      add          src1q, src1_strideq
      pavgb        m0, [src2q+16]
      pavgb        m1, [src2q+24]
      OP           m0, [dstq]
      OP           m1, [dstq+8]
      add          dstq, dst_strideq
      add          src2q, 32                      ; 2 rows * 16 bytes
      sub          hd, 2
      jne          .loop
      REP_RET
  %endmacro

  INIT_MMX mmxext
  PIXELS16_L2 put
  PIXELS16_L2 avg
  INIT_MMX mmxext
  ;------------------------------------------------------------------------------
  ; void {put,avg}_pixels{4,8}_mmxext(uint8_t *block, const uint8_t *pixels,
  ;                                   ptrdiff_t line_size, int h)
  ;
  ; PIXELS48 <put|avg>, <4|8> emits one function that copies (put) or
  ; byte-averages into the destination (avg) rows of 4 or 8 bytes.  block and
  ; pixels both advance by line_size per row.
  ;
  ; OP is movh for width 4 and mova for width 8.  Four rows are processed per
  ; iteration and the loop exits only on an exact zero count, so h must be a
  ; non-zero multiple of 4.
  ;
  ; For the 4-wide avg variant the destination rows are fetched through OP into
  ; m4-m7 before averaging: pavgb with a memory operand would read a full
  ; 8-byte MMX word, i.e. 4 bytes beyond each 4-byte destination row.
  ;
  ; NOTE(review): line_size is sign-extended from its low 32 bits although the
  ; prototype above declares it ptrdiff_t; kept as in the original -- confirm
  ; against the C declaration.
  ;------------------------------------------------------------------------------
  %macro PIXELS48 2
  %if %2 == 4
  %define OP movh
  %else
  %define OP mova
  %endif
  cglobal %1_pixels%2, 4,5,0, block, pixels, line_size, h, line_size3
      movsxdifnidn line_sizeq, line_sized
      lea          line_size3q, [line_sizeq*3]
  .loop:
      OP           m0, [pixelsq]
      OP           m1, [pixelsq+line_sizeq]
      OP           m2, [pixelsq+line_sizeq*2]
      OP           m3, [pixelsq+line_size3q]
      lea          pixelsq, [pixelsq+line_sizeq*4]
  %ifidn %1, avg
  %if %2 == 4
      ; load exactly 4 destination bytes per row, then average in registers
      OP           m4, [blockq]
      OP           m5, [blockq+line_sizeq]
      OP           m6, [blockq+line_sizeq*2]
      OP           m7, [blockq+line_size3q]
      pavgb        m0, m4
      pavgb        m1, m5
      pavgb        m2, m6
      pavgb        m3, m7
  %else
      pavgb        m0, [blockq]
      pavgb        m1, [blockq+line_sizeq]
      pavgb        m2, [blockq+line_sizeq*2]
      pavgb        m3, [blockq+line_size3q]
  %endif
  %endif
      OP           [blockq], m0
      OP           [blockq+line_sizeq], m1
      OP           [blockq+line_sizeq*2], m2
      OP           [blockq+line_size3q], m3
      sub          hd, 4
      lea          blockq, [blockq+line_sizeq*4]  ; lea leaves the flags of sub intact
      jne          .loop
      RET
  %endmacro

  PIXELS48 put, 4
  PIXELS48 avg, 4
  PIXELS48 put, 8
  PIXELS48 avg, 8
  INIT_XMM sse2
  ;------------------------------------------------------------------------------
  ; void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
  ;                        ptrdiff_t line_size, int h)
  ;
  ; Copies h rows of 16 bytes from pixels to block; both pointers advance by
  ; line_size per row.  Source rows are loaded with movu, destination rows are
  ; written with mova (presumably requiring 16-byte aligned block rows --
  ; verify against x86inc's definition of mova).
  ; Four rows are handled per iteration and the loop exits only on an exact
  ; zero count, so h must be a non-zero multiple of 4.
  ;------------------------------------------------------------------------------
  cglobal put_pixels16, 4,5,4, block, pixels, line_size, h, line_size3
      lea          line_size3q, [line_sizeq*3]
  .loop:
      movu         m0, [pixelsq]
      movu         m1, [pixelsq+line_sizeq]
      movu         m2, [pixelsq+line_sizeq*2]
      movu         m3, [pixelsq+line_size3q]
      lea          pixelsq, [pixelsq+line_sizeq*4]
      mova         [blockq], m0
      mova         [blockq+line_sizeq], m1
      mova         [blockq+line_sizeq*2], m2
      mova         [blockq+line_size3q], m3
      sub          hd, 4
      lea          blockq, [blockq+line_sizeq*4]  ; lea leaves the flags of sub intact
      jnz          .loop
      REP_RET
  ;------------------------------------------------------------------------------
  ; void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
  ;                        ptrdiff_t line_size, int h)
  ;
  ; For h rows of 16 bytes, replaces each row of block with the byte-wise
  ; average (pavgb) of that row and the matching row of pixels; both pointers
  ; advance by line_size per row.  Source rows are loaded with movu; block is
  ; accessed as a pavgb memory operand and written with mova (presumably
  ; requiring 16-byte aligned block rows -- verify against x86inc).
  ; Four rows are handled per iteration and the loop exits only on an exact
  ; zero count, so h must be a non-zero multiple of 4.
  ;------------------------------------------------------------------------------
  cglobal avg_pixels16, 4,5,4, block, pixels, line_size, h, line_size3
      lea          line_size3q, [line_sizeq*3]
  .loop:
      movu         m0, [pixelsq]
      movu         m1, [pixelsq+line_sizeq]
      movu         m2, [pixelsq+line_sizeq*2]
      movu         m3, [pixelsq+line_size3q]
      lea          pixelsq, [pixelsq+line_sizeq*4]
      pavgb        m0, [blockq]
      pavgb        m1, [blockq+line_sizeq]
      pavgb        m2, [blockq+line_sizeq*2]
      pavgb        m3, [blockq+line_size3q]
      mova         [blockq], m0
      mova         [blockq+line_sizeq], m1
      mova         [blockq+line_sizeq*2], m2
      mova         [blockq+line_size3q], m3
      sub          hd, 4
      lea          blockq, [blockq+line_sizeq*4]  ; lea leaves the flags of sub intact
      jnz          .loop
      REP_RET
  239. REP_RET