;******************************************************************************
;* SIMD-optimized halfpel functions
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"

SECTION_RODATA

cextern pb_1

SECTION .text
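; For all functions below, cglobal loads the four C arguments as
; r0 = block, r1 = pixels, r2 = line_size, r3 = h; r4 holds 2*line_size.
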
; void ff_put_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
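; Horizontal (x2) half-pel with rounding: every output byte is
; PAVGB(src[x], src[x+1]) = (a + b + 1) >> 1.  The loop is unrolled to
; handle four rows per iteration, so h is expected to be a multiple of 4.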
%macro PUT_PIXELS8_X2 0
cglobal put_pixels8_x2, 4,5
    lea     r4, [r2*2]
.loop:
    mova    m0, [r1]
    mova    m1, [r1+r2]
    PAVGB   m0, [r1+1]
    PAVGB   m1, [r1+r2+1]
    mova    [r0], m0
    mova    [r0+r2], m1
    add     r1, r4
    add     r0, r4
    mova    m0, [r1]
    mova    m1, [r1+r2]
    PAVGB   m0, [r1+1]
    PAVGB   m1, [r1+r2+1]
    add     r1, r4
    mova    [r0], m0
    mova    [r0+r2], m1
    add     r0, r4
    sub     r3d, 4
    jne     .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_PIXELS8_X2
INIT_MMX 3dnow
PUT_PIXELS8_X2

; void ff_put_pixels16_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
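; 16-pixel-wide version of put_pixels8_x2: each row is processed as two
; 8-byte MMX halves (offsets 0 and 8), again four rows per iteration.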
%macro PUT_PIXELS_16 0
cglobal put_pixels16_x2, 4,5
    lea     r4, [r2*2]
.loop:
    mova    m0, [r1]
    mova    m1, [r1+r2]
    mova    m2, [r1+8]
    mova    m3, [r1+r2+8]
    PAVGB   m0, [r1+1]
    PAVGB   m1, [r1+r2+1]
    PAVGB   m2, [r1+9]
    PAVGB   m3, [r1+r2+9]
    mova    [r0], m0
    mova    [r0+r2], m1
    mova    [r0+8], m2
    mova    [r0+r2+8], m3
    add     r1, r4
    add     r0, r4
    mova    m0, [r1]
    mova    m1, [r1+r2]
    mova    m2, [r1+8]
    mova    m3, [r1+r2+8]
    PAVGB   m0, [r1+1]
    PAVGB   m1, [r1+r2+1]
    PAVGB   m2, [r1+9]
    PAVGB   m3, [r1+r2+9]
    add     r1, r4
    mova    [r0], m0
    mova    [r0+r2], m1
    mova    [r0+8], m2
    mova    [r0+r2+8], m3
    add     r0, r4
    sub     r3d, 4
    jne     .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_PIXELS_16
INIT_MMX 3dnow
PUT_PIXELS_16

; void ff_put_no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
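; "No rounding" horizontal half-pel: PAVGB always rounds up, so one operand
; is first decremented with unsigned saturation (psubusb by pb_1), turning
; (a + b + 1) >> 1 into (a + b) >> 1 whenever a > 0.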
%macro PUT_NO_RND_PIXELS8_X2 0
cglobal put_no_rnd_pixels8_x2, 4,5
    mova    m6, [pb_1]
    lea     r4, [r2*2]
.loop:
    mova    m0, [r1]
    mova    m2, [r1+r2]
    mova    m1, [r1+1]
    mova    m3, [r1+r2+1]
    add     r1, r4
    psubusb m0, m6
    psubusb m2, m6
    PAVGB   m0, m1
    PAVGB   m2, m3
    mova    [r0], m0
    mova    [r0+r2], m2
    mova    m0, [r1]
    mova    m1, [r1+1]
    mova    m2, [r1+r2]
    mova    m3, [r1+r2+1]
    add     r0, r4
    add     r1, r4
    psubusb m0, m6
    psubusb m2, m6
    PAVGB   m0, m1
    PAVGB   m2, m3
    mova    [r0], m0
    mova    [r0+r2], m2
    add     r0, r4
    sub     r3d, 4
    jne     .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_NO_RND_PIXELS8_X2
INIT_MMX 3dnow
PUT_NO_RND_PIXELS8_X2

; void ff_put_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
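; Vertical (y2) half-pel with rounding: each output row is the PAVGB of two
; consecutive source rows; the most recent row is kept in a register so each
; source row is loaded only once.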
%macro PUT_PIXELS8_Y2 0
cglobal put_pixels8_y2, 4,5
    lea     r4, [r2*2]
    mova    m0, [r1]
    sub     r0, r2
.loop:
    mova    m1, [r1+r2]
    mova    m2, [r1+r4]
    add     r1, r4
    PAVGB   m0, m1
    PAVGB   m1, m2
    mova    [r0+r2], m0
    mova    [r0+r4], m1
    mova    m1, [r1+r2]
    mova    m0, [r1+r4]
    add     r0, r4
    add     r1, r4
    PAVGB   m2, m1
    PAVGB   m1, m0
    mova    [r0+r2], m2
    mova    [r0+r4], m1
    add     r0, r4
    sub     r3d, 4
    jne     .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_PIXELS8_Y2
INIT_MMX 3dnow
PUT_PIXELS8_Y2

; void ff_put_no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
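; Vertical half-pel without rounding: the middle row of each pair of outputs
; is biased down by 1 (psubusb with pb_1) before the PAVGBs, so the rounded
; averages become truncating ones.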
%macro PUT_NO_RND_PIXELS8_Y2 0
cglobal put_no_rnd_pixels8_y2, 4,5
    mova    m6, [pb_1]
    lea     r4, [r2+r2]
    mova    m0, [r1]
    sub     r0, r2
.loop:
    mova    m1, [r1+r2]
    mova    m2, [r1+r4]
    add     r1, r4
    psubusb m1, m6
    PAVGB   m0, m1
    PAVGB   m1, m2
    mova    [r0+r2], m0
    mova    [r0+r4], m1
    mova    m1, [r1+r2]
    mova    m0, [r1+r4]
    add     r0, r4
    add     r1, r4
    psubusb m1, m6
    PAVGB   m2, m1
    PAVGB   m1, m0
    mova    [r0+r2], m2
    mova    [r0+r4], m1
    add     r0, r4
    sub     r3d, 4
    jne     .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_NO_RND_PIXELS8_Y2
INIT_MMX 3dnow
PUT_NO_RND_PIXELS8_Y2

; void ff_avg_pixels8(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
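; Averaging store: the source pixels are PAVGB'd with the bytes already held
; in the destination block.  Only the 3dnow variant is instantiated here.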
%macro AVG_PIXELS8 0
cglobal avg_pixels8, 4,5
    lea     r4, [r2*2]
.loop:
    mova    m0, [r0]
    mova    m1, [r0+r2]
    PAVGB   m0, [r1]
    PAVGB   m1, [r1+r2]
    mova    [r0], m0
    mova    [r0+r2], m1
    add     r1, r4
    add     r0, r4
    mova    m0, [r0]
    mova    m1, [r0+r2]
    PAVGB   m0, [r1]
    PAVGB   m1, [r1+r2]
    add     r1, r4
    mova    [r0], m0
    mova    [r0+r2], m1
    add     r0, r4
    sub     r3d, 4
    jne     .loop
    REP_RET
%endmacro

INIT_MMX 3dnow
AVG_PIXELS8

; void ff_avg_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
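; Horizontal half-pel interpolation (rounded) whose result is then averaged
; with the existing destination bytes before being stored.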
%macro AVG_PIXELS8_X2 0
cglobal avg_pixels8_x2, 4,5
    lea     r4, [r2*2]
.loop:
    mova    m0, [r1]
    mova    m2, [r1+r2]
    PAVGB   m0, [r1+1]
    PAVGB   m2, [r1+r2+1]
    PAVGB   m0, [r0]
    PAVGB   m2, [r0+r2]
    add     r1, r4
    mova    [r0], m0
    mova    [r0+r2], m2
    mova    m0, [r1]
    mova    m2, [r1+r2]
    PAVGB   m0, [r1+1]
    PAVGB   m2, [r1+r2+1]
    add     r0, r4
    add     r1, r4
    PAVGB   m0, [r0]
    PAVGB   m2, [r0+r2]
    mova    [r0], m0
    mova    [r0+r2], m2
    add     r0, r4
    sub     r3d, 4
    jne     .loop
    REP_RET
%endmacro

INIT_MMX mmxext
AVG_PIXELS8_X2
INIT_MMX 3dnow
AVG_PIXELS8_X2

; void ff_avg_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
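; Vertical half-pel interpolation (rounded), averaged with the existing
; destination bytes before the store.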
%macro AVG_PIXELS8_Y2 0
cglobal avg_pixels8_y2, 4,5
    lea     r4, [r2*2]
    mova    m0, [r1]
    sub     r0, r2
.loop:
    mova    m1, [r1+r2]
    mova    m2, [r1+r4]
    add     r1, r4
    PAVGB   m0, m1
    PAVGB   m1, m2
    mova    m3, [r0+r2]
    mova    m4, [r0+r4]
    PAVGB   m0, m3
    PAVGB   m1, m4
    mova    [r0+r2], m0
    mova    [r0+r4], m1
    mova    m1, [r1+r2]
    mova    m0, [r1+r4]
    PAVGB   m2, m1
    PAVGB   m1, m0
    add     r0, r4
    add     r1, r4
    mova    m3, [r0+r2]
    mova    m4, [r0+r4]
    PAVGB   m2, m3
    PAVGB   m1, m4
    mova    [r0+r2], m2
    mova    [r0+r4], m1
    add     r0, r4
    sub     r3d, 4
    jne     .loop
    REP_RET
%endmacro

INIT_MMX mmxext
AVG_PIXELS8_Y2
INIT_MMX 3dnow
AVG_PIXELS8_Y2

; void ff_avg_pixels8_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
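; 2-D (xy2) half-pel for the avg case built from chained byte averages:
; rows are averaged horizontally, adjacent results vertically (with a pb_1
; bias applied to one row per iteration), and finally with the destination.
; Chained PAVGB only approximates the exact (a + b + c + d + 2) >> 2 filter.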
%macro AVG_PIXELS8_XY2 0
cglobal avg_pixels8_xy2, 4,5
    mova    m6, [pb_1]
    lea     r4, [r2*2]
    mova    m0, [r1]
    PAVGB   m0, [r1+1]
.loop:
    mova    m2, [r1+r4]
    mova    m1, [r1+r2]
    psubusb m2, m6
    PAVGB   m1, [r1+r2+1]
    PAVGB   m2, [r1+r4+1]
    add     r1, r4
    PAVGB   m0, m1
    PAVGB   m1, m2
    PAVGB   m0, [r0]
    PAVGB   m1, [r0+r2]
    mova    [r0], m0
    mova    [r0+r2], m1
    mova    m1, [r1+r2]
    mova    m0, [r1+r4]
    PAVGB   m1, [r1+r2+1]
    PAVGB   m0, [r1+r4+1]
    add     r0, r4
    add     r1, r4
    PAVGB   m2, m1
    PAVGB   m1, m0
    PAVGB   m2, [r0]
    PAVGB   m1, [r0+r2]
    mova    [r0], m2
    mova    [r0+r2], m1
    add     r0, r4
    sub     r3d, 4
    jne     .loop
    REP_RET
%endmacro

INIT_MMX mmxext
AVG_PIXELS8_XY2
INIT_MMX 3dnow
AVG_PIXELS8_XY2