You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

466 lines
11KB

  1. ;******************************************************************************
  2. ;* MMX optimized hpel functions
  3. ;*
  4. ;* This file is part of Libav.
  5. ;*
  6. ;* Libav is free software; you can redistribute it and/or
  7. ;* modify it under the terms of the GNU Lesser General Public
  8. ;* License as published by the Free Software Foundation; either
  9. ;* version 2.1 of the License, or (at your option) any later version.
  10. ;*
  11. ;* Libav is distributed in the hope that it will be useful,
  12. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. ;* Lesser General Public License for more details.
  15. ;*
  16. ;* You should have received a copy of the GNU Lesser General Public
  17. ;* License along with Libav; if not, write to the Free Software
  18. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19. ;******************************************************************************
  20. %include "libavutil/x86/x86util.asm"
  21. SECTION_RODATA
  22. cextern pb_1
  23. SECTION_TEXT
; void put_pixels8_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
; Horizontal half-pel copy: dst[x] = rounded_avg(src[x], src[x+1]) for an
; 8-byte-wide block.  Processes 4 rows per loop iteration (2x unrolled pairs);
; assumes h is a multiple of 4 (`sub r3d, 4 / jne`).
; Regs: r0 = block, r1 = pixels, r2 = line_size, r3d = h, r4 = 2*line_size.
%macro PUT_PIXELS8_X2 0
cglobal put_pixels8_x2, 4,5
    movsxdifnidn r2, r2d        ; sign-extend line_size to 64 bit if needed
    lea    r4, [r2*2]           ; r4 = 2 * line_size (row-pair stride)
.loop:
    mova   m0, [r1]             ; rows 0 and 1
    mova   m1, [r1+r2]
    PAVGB  m0, [r1+1]           ; rounded average with right neighbour
    PAVGB  m1, [r1+r2+1]        ; (PAVGB -> pavgb on mmxext, pavgusb on 3dnow)
    mova   [r0], m0
    mova   [r0+r2], m1
    add    r1, r4
    add    r0, r4
    mova   m0, [r1]             ; rows 2 and 3
    mova   m1, [r1+r2]
    PAVGB  m0, [r1+1]
    PAVGB  m1, [r1+r2+1]
    add    r1, r4
    mova   [r0], m0
    mova   [r0+r2], m1
    add    r0, r4
    sub    r3d, 4               ; 4 rows consumed per trip
    jne    .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_PIXELS8_X2
INIT_MMX 3dnow
PUT_PIXELS8_X2
; void put_pixels16_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
; Horizontal half-pel copy for a 16-byte-wide block: each 16-pixel row is
; handled as two 8-byte MMX halves (offsets +0 and +8).  Processes 4 rows per
; loop iteration; assumes h is a multiple of 4.
; Regs: r0 = block, r1 = pixels, r2 = line_size, r3d = h, r4 = 2*line_size.
%macro PUT_PIXELS_16 0
cglobal put_pixels16_x2, 4,5
    movsxdifnidn r2, r2d        ; sign-extend line_size to 64 bit if needed
    lea    r4, [r2*2]           ; r4 = 2 * line_size
.loop:
    mova   m0, [r1]             ; rows 0/1, left 8 bytes
    mova   m1, [r1+r2]
    mova   m2, [r1+8]           ; rows 0/1, right 8 bytes
    mova   m3, [r1+r2+8]
    PAVGB  m0, [r1+1]           ; rounded average with right neighbour
    PAVGB  m1, [r1+r2+1]
    PAVGB  m2, [r1+9]           ; +9 = +8 half shifted right by one pixel
    PAVGB  m3, [r1+r2+9]
    mova   [r0], m0
    mova   [r0+r2], m1
    mova   [r0+8], m2
    mova   [r0+r2+8], m3
    add    r1, r4
    add    r0, r4
    mova   m0, [r1]             ; rows 2/3, same pattern
    mova   m1, [r1+r2]
    mova   m2, [r1+8]
    mova   m3, [r1+r2+8]
    PAVGB  m0, [r1+1]
    PAVGB  m1, [r1+r2+1]
    PAVGB  m2, [r1+9]
    PAVGB  m3, [r1+r2+9]
    add    r1, r4
    mova   [r0], m0
    mova   [r0+r2], m1
    mova   [r0+8], m2
    mova   [r0+r2+8], m3
    add    r0, r4
    sub    r3d, 4               ; 4 rows per trip
    jne    .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_PIXELS_16
INIT_MMX 3dnow
PUT_PIXELS_16
; void put_no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
; Horizontal half-pel copy with no-rounding average.  PAVGB rounds up
; ((a+b+1)>>1); subtracting 1 (unsigned saturating) from one operand first
; yields the round-down average (a+b)>>1 — exact except when that operand is
; 0, where psubusb saturates.  4 rows per iteration; h assumed multiple of 4.
; Regs: r0 = block, r1 = pixels, r2 = line_size, r3d = h,
;       r4 = 2*line_size, m6 = pb_1 (all-bytes-1 constant).
%macro PUT_NO_RND_PIXELS8_X2 0
cglobal put_no_rnd_pixels8_x2, 4,5
    mova   m6, [pb_1]           ; bias constant: 0x01 in every byte
    movsxdifnidn r2, r2d
    lea    r4, [r2*2]
.loop:
    mova   m0, [r1]             ; rows 0/1 and their right neighbours
    mova   m2, [r1+r2]
    mova   m1, [r1+1]
    mova   m3, [r1+r2+1]
    add    r1, r4
    psubusb m0, m6              ; bias left operand down by 1 (saturating)
    psubusb m2, m6
    PAVGB  m0, m1               ; round-up avg of biased value == round-down avg
    PAVGB  m2, m3
    mova   [r0], m0
    mova   [r0+r2], m2
    mova   m0, [r1]             ; rows 2/3, same pattern
    mova   m1, [r1+1]
    mova   m2, [r1+r2]
    mova   m3, [r1+r2+1]
    add    r0, r4
    add    r1, r4
    psubusb m0, m6
    psubusb m2, m6
    PAVGB  m0, m1
    PAVGB  m2, m3
    mova   [r0], m0
    mova   [r0+r2], m2
    add    r0, r4
    sub    r3d, 4
    jne    .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_NO_RND_PIXELS8_X2
INIT_MMX 3dnow
PUT_NO_RND_PIXELS8_X2
; void put_no_rnd_pixels8_x2_exact(uint8_t *block, const uint8_t *pixels, int line_size, int h)
; Exact no-rounding horizontal half-pel copy, using the complement identity
; (a+b)>>1 == ~((~a + ~b + 1)>>1): complement both inputs, take the round-up
; average, complement the result.  Unlike the pb_1 variant this has no edge
; case at 0.  4 rows per iteration; `jg` exit tolerates h not positive after
; the subtract.
; Regs: r0 = block, r1 = pixels, r2 = line_size, r3d = h,
;       r4 = 3*line_size, m6 = all-ones mask (for pxor complement).
%macro PUT_NO_RND_PIXELS8_X2_EXACT 0
cglobal put_no_rnd_pixels8_x2_exact, 4,5
    movsxdifnidn r2, r2d
    lea    r4, [r2*3]           ; r4 = 3 * line_size (rows 0..3 addressable)
    pcmpeqb m6, m6              ; m6 = 0xFF.. (complement mask)
.loop:
    mova   m0, [r1]             ; rows 0/1 and right neighbours
    mova   m2, [r1+r2]
    mova   m1, [r1+1]
    mova   m3, [r1+r2+1]
    pxor   m0, m6               ; complement all four inputs
    pxor   m2, m6
    pxor   m1, m6
    pxor   m3, m6
    PAVGB  m0, m1               ; round-up avg of complements
    PAVGB  m2, m3
    pxor   m0, m6               ; complement back -> round-down avg
    pxor   m2, m6
    mova   [r0], m0
    mova   [r0+r2], m2
    mova   m0, [r1+r2*2]        ; rows 2/3, same pattern
    mova   m1, [r1+r2*2+1]
    mova   m2, [r1+r4]
    mova   m3, [r1+r4+1]
    pxor   m0, m6
    pxor   m1, m6
    pxor   m2, m6
    pxor   m3, m6
    PAVGB  m0, m1
    PAVGB  m2, m3
    pxor   m0, m6
    pxor   m2, m6
    mova   [r0+r2*2], m0
    mova   [r0+r4], m2
    lea    r1, [r1+r2*4]        ; advance both pointers by 4 rows
    lea    r0, [r0+r2*4]
    sub    r3d, 4
    jg     .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_NO_RND_PIXELS8_X2_EXACT
INIT_MMX 3dnow
PUT_NO_RND_PIXELS8_X2_EXACT
; void put_pixels8_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
; Vertical half-pel copy: dst row y = rounded_avg(src row y, src row y+1).
; Software-pipelined: m0 (and later m2) carries the previous source row across
; iterations so each row is loaded only once.  r0 is pre-decremented by one
; line so stores land at [r0+r2]/[r0+r4].  4 rows per iteration; h assumed
; multiple of 4.
; Regs: r0 = block - line_size, r1 = pixels, r2 = line_size, r3d = h,
;       r4 = 2*line_size.
%macro PUT_PIXELS8_Y2 0
cglobal put_pixels8_y2, 4,5
    movsxdifnidn r2, r2d
    lea    r4, [r2*2]
    mova   m0, [r1]             ; prime the pipeline with source row 0
    sub    r0, r2               ; bias dst back one row (stores use +r2/+r4)
.loop:
    mova   m1, [r1+r2]          ; next two source rows
    mova   m2, [r1+r4]
    add    r1, r4
    PAVGB  m0, m1               ; avg(prev row, row 1)
    PAVGB  m1, m2               ; avg(row 1, row 2)
    mova   [r0+r2], m0
    mova   [r0+r4], m1
    mova   m1, [r1+r2]          ; two more rows; m2 still holds last loaded row
    mova   m0, [r1+r4]          ; m0 becomes "previous row" for next iteration
    add    r0, r4
    add    r1, r4
    PAVGB  m2, m1
    PAVGB  m1, m0
    mova   [r0+r2], m2
    mova   [r0+r4], m1
    add    r0, r4
    sub    r3d, 4
    jne    .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_PIXELS8_Y2
INIT_MMX 3dnow
PUT_PIXELS8_Y2
; void put_no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
; Vertical half-pel copy with no-rounding average: the middle row of each
; pair is biased down by 1 (saturating) before the round-up PAVGB, giving the
; round-down average except where a source byte is 0 (psubusb saturation).
; Same software pipeline and dst pre-bias as put_pixels8_y2; 4 rows per
; iteration, h assumed multiple of 4.
; Regs: r0 = block - line_size, r1 = pixels, r2 = line_size, r3d = h,
;       r4 = 2*line_size, m6 = pb_1.
%macro PUT_NO_RND_PIXELS8_Y2 0
cglobal put_no_rnd_pixels8_y2, 4,5
    mova   m6, [pb_1]           ; bias constant: 0x01 in every byte
    movsxdifnidn r2, r2d
    lea    r4, [r2+r2]          ; r4 = 2 * line_size
    mova   m0, [r1]             ; prime pipeline with source row 0
    sub    r0, r2               ; stores use +r2/+r4 offsets
.loop:
    mova   m1, [r1+r2]
    mova   m2, [r1+r4]
    add    r1, r4
    psubusb m1, m6              ; bias shared middle row; used in both averages
    PAVGB  m0, m1
    PAVGB  m1, m2
    mova   [r0+r2], m0
    mova   [r0+r4], m1
    mova   m1, [r1+r2]
    mova   m0, [r1+r4]          ; m0 = previous row for the next iteration
    add    r0, r4
    add    r1, r4
    psubusb m1, m6
    PAVGB  m2, m1
    PAVGB  m1, m0
    mova   [r0+r2], m2
    mova   [r0+r4], m1
    add    r0, r4
    sub    r3d, 4
    jne    .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_NO_RND_PIXELS8_Y2
INIT_MMX 3dnow
PUT_NO_RND_PIXELS8_Y2
; void put_no_rnd_pixels8_y2_exact(uint8_t *block, const uint8_t *pixels, int line_size, int h)
; Exact no-rounding vertical half-pel copy via the complement identity
; (a+b)>>1 == ~((~a + ~b + 1)>>1).  Rows are kept in complemented form in
; m0/m1/m2 across the pipeline (note m0 is complemented once before the loop
; and the carried row is re-complemented each iteration), so every stored
; value is complemented back just before the mova.  4 rows per iteration;
; `jg` exit.
; Regs: r0 = block, r1 = pixels + line_size, r2 = line_size, r3d = h,
;       r4 = 3*line_size, m6 = all-ones mask.
%macro PUT_NO_RND_PIXELS8_Y2_EXACT 0
cglobal put_no_rnd_pixels8_y2_exact, 4,5
    movsxdifnidn r2, r2d
    lea    r4, [r2*3]
    mova   m0, [r1]             ; prime with source row 0
    pcmpeqb m6, m6              ; m6 = 0xFF.. (complement mask)
    add    r1, r2               ; r1 now points at source row 1
    pxor   m0, m6               ; keep the carried row complemented
.loop:
    mova   m1, [r1]
    mova   m2, [r1+r2]
    pxor   m1, m6               ; complement fresh rows
    pxor   m2, m6
    PAVGB  m0, m1               ; round-up avg of complements
    PAVGB  m1, m2
    pxor   m0, m6               ; complement back -> round-down avg
    pxor   m1, m6
    mova   [r0], m0
    mova   [r0+r2], m1
    mova   m1, [r1+r2*2]
    mova   m0, [r1+r4]          ; m0 = last row, becomes carry for next trip
    pxor   m1, m6
    pxor   m0, m6
    PAVGB  m2, m1               ; m2 still holds complemented middle row
    PAVGB  m1, m0
    pxor   m2, m6
    pxor   m1, m6
    mova   [r0+r2*2], m2
    mova   [r0+r4], m1
    lea    r1, [r1+r2*4]        ; advance 4 rows
    lea    r0, [r0+r2*4]
    sub    r3d, 4
    jg     .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_NO_RND_PIXELS8_Y2_EXACT
INIT_MMX 3dnow
PUT_NO_RND_PIXELS8_Y2_EXACT
; void avg_pixels8(uint8_t *block, const uint8_t *pixels, int line_size, int h)
; Full-pel averaging store: dst[x] = rounded_avg(dst[x], src[x]) for an
; 8-byte-wide block.  4 rows per iteration; h assumed multiple of 4.
; Only the 3dnow version is built here (NOTE(review): presumably the mmxext
; variant comes from elsewhere — confirm against the rest of the project).
; Regs: r0 = block, r1 = pixels, r2 = line_size, r3d = h, r4 = 2*line_size.
%macro AVG_PIXELS8 0
cglobal avg_pixels8, 4,5
    movsxdifnidn r2, r2d
    lea    r4, [r2*2]
.loop:
    mova   m0, [r0]             ; current destination rows 0/1
    mova   m1, [r0+r2]
    PAVGB  m0, [r1]             ; average in the source rows
    PAVGB  m1, [r1+r2]
    mova   [r0], m0
    mova   [r0+r2], m1
    add    r1, r4
    add    r0, r4
    mova   m0, [r0]             ; rows 2/3
    mova   m1, [r0+r2]
    PAVGB  m0, [r1]
    PAVGB  m1, [r1+r2]
    add    r1, r4
    mova   [r0], m0
    mova   [r0+r2], m1
    add    r0, r4
    sub    r3d, 4
    jne    .loop
    REP_RET
%endmacro

INIT_MMX 3dnow
AVG_PIXELS8
; void avg_pixels8_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
; Horizontal half-pel with averaging store: first compute the rounded
; horizontal average of src and src+1, then average the result with the
; existing destination.  4 rows per iteration; h assumed multiple of 4.
; Regs: r0 = block, r1 = pixels, r2 = line_size, r3d = h, r4 = 2*line_size.
%macro AVG_PIXELS8_X2 0
cglobal avg_pixels8_x2, 4,5
    movsxdifnidn r2, r2d
    lea    r4, [r2*2]
.loop:
    mova   m0, [r1]             ; rows 0/1
    mova   m2, [r1+r2]
    PAVGB  m0, [r1+1]           ; half-pel x average
    PAVGB  m2, [r1+r2+1]
    PAVGB  m0, [r0]             ; then blend with destination
    PAVGB  m2, [r0+r2]
    add    r1, r4
    mova   [r0], m0
    mova   [r0+r2], m2
    mova   m0, [r1]             ; rows 2/3, same pattern
    mova   m2, [r1+r2]
    PAVGB  m0, [r1+1]
    PAVGB  m2, [r1+r2+1]
    add    r0, r4
    add    r1, r4
    PAVGB  m0, [r0]
    PAVGB  m2, [r0+r2]
    mova   [r0], m0
    mova   [r0+r2], m2
    add    r0, r4
    sub    r3d, 4
    jne    .loop
    REP_RET
%endmacro

INIT_MMX mmxext
AVG_PIXELS8_X2
INIT_MMX 3dnow
AVG_PIXELS8_X2
; void avg_pixels8_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
; Vertical half-pel with averaging store: rounded vertical average of adjacent
; source rows, then averaged with the existing destination.  Same software
; pipeline (carried row in m0/m2) and dst pre-bias as put_pixels8_y2.
; 4 rows per iteration; h assumed multiple of 4.
; Regs: r0 = block - line_size, r1 = pixels, r2 = line_size, r3d = h,
;       r4 = 2*line_size.
%macro AVG_PIXELS8_Y2 0
cglobal avg_pixels8_y2, 4,5
    movsxdifnidn r2, r2d
    lea    r4, [r2*2]
    mova   m0, [r1]             ; prime pipeline with source row 0
    sub    r0, r2               ; stores use +r2/+r4 offsets
.loop:
    mova   m1, [r1+r2]
    mova   m2, [r1+r4]
    add    r1, r4
    PAVGB  m0, m1               ; vertical half-pel averages
    PAVGB  m1, m2
    mova   m3, [r0+r2]          ; existing destination rows
    mova   m4, [r0+r4]
    PAVGB  m0, m3               ; blend with destination
    PAVGB  m1, m4
    mova   [r0+r2], m0
    mova   [r0+r4], m1
    mova   m1, [r1+r2]          ; next two rows; m0 = carry for next trip
    mova   m0, [r1+r4]
    PAVGB  m2, m1
    PAVGB  m1, m0
    add    r0, r4
    add    r1, r4
    mova   m3, [r0+r2]
    mova   m4, [r0+r4]
    PAVGB  m2, m3
    PAVGB  m1, m4
    mova   [r0+r2], m2
    mova   [r0+r4], m1
    add    r0, r4
    sub    r3d, 4
    jne    .loop
    REP_RET
%endmacro

INIT_MMX mmxext
AVG_PIXELS8_Y2
INIT_MMX 3dnow
AVG_PIXELS8_Y2
  389. ; avg_pixels8_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  390. %macro AVG_PIXELS8_XY2 0
  391. cglobal avg_pixels8_xy2, 4,5
  392. mova m6, [pb_1]
  393. movsxdifnidn r2, r2d
  394. lea r4, [r2*2]
  395. mova m0, [r1]
  396. pavgb m0, [r1+1]
  397. .loop:
  398. mova m2, [r1+r4]
  399. mova m1, [r1+r2]
  400. psubusb m2, m6
  401. pavgb m1, [r1+r2+1]
  402. pavgb m2, [r1+r4+1]
  403. add r1, r4
  404. pavgb m0, m1
  405. pavgb m1, m2
  406. pavgb m0, [r0]
  407. pavgb m1, [r0+r2]
  408. mova [r0], m0
  409. mova [r0+r2], m1
  410. mova m1, [r1+r2]
  411. mova m0, [r1+r4]
  412. pavgb m1, [r1+r2+1]
  413. pavgb m0, [r1+r4+1]
  414. add r0, r4
  415. add r1, r4
  416. pavgb m2, m1
  417. pavgb m1, m0
  418. pavgb m2, [r0]
  419. pavgb m1, [r0+r2]
  420. mova [r0], m2
  421. mova [r0+r2], m1
  422. add r0, r4
  423. sub r3d, 4
  424. jne .loop
  425. REP_RET
  426. %endmacro
  427. INIT_MMX mmxext
  428. AVG_PIXELS8_XY2
  429. INIT_MMX 3dnow
  430. AVG_PIXELS8_XY2