; NOTE(review): removed web-scrape page artifacts that preceded this file
; (repository topic-selection hint, "503 lines", "12KB") — they are not part
; of the source and are not valid assembly.
;******************************************************************************
;*
;* Copyright (c) 2000-2001 Fabrice Bellard <fabrice@bellard.org>
;* Copyright (c) Nick Kurshev <nickols_k@mail.ru>
;* Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
;* Copyright (c) 2002 Zdenek Kabelac <kabi@informatics.muni.cz>
;* Copyright (c) 2013 Daniel Kang
;*
;* MMX optimized hpel functions
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
  27. %include "libavutil/x86/x86util.asm"
  28. SECTION_RODATA
  29. cextern pb_1
  30. SECTION_TEXT
  31. ; put_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
  32. %macro PUT_PIXELS8_X2 0
  33. cglobal put_pixels8_x2, 4,5
  34. lea r4, [r2*2]
  35. .loop:
  36. mova m0, [r1]
  37. mova m1, [r1+r2]
  38. PAVGB m0, [r1+1]
  39. PAVGB m1, [r1+r2+1]
  40. mova [r0], m0
  41. mova [r0+r2], m1
  42. add r1, r4
  43. add r0, r4
  44. mova m0, [r1]
  45. mova m1, [r1+r2]
  46. PAVGB m0, [r1+1]
  47. PAVGB m1, [r1+r2+1]
  48. add r1, r4
  49. mova [r0], m0
  50. mova [r0+r2], m1
  51. add r0, r4
  52. sub r3d, 4
  53. jne .loop
  54. REP_RET
  55. %endmacro
  56. INIT_MMX mmxext
  57. PUT_PIXELS8_X2
  58. INIT_MMX 3dnow
  59. PUT_PIXELS8_X2
  60. ; put_pixels16_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
  61. %macro PUT_PIXELS_16 0
  62. cglobal put_pixels16_x2, 4,5
  63. lea r4, [r2*2]
  64. .loop:
  65. mova m0, [r1]
  66. mova m1, [r1+r2]
  67. mova m2, [r1+8]
  68. mova m3, [r1+r2+8]
  69. PAVGB m0, [r1+1]
  70. PAVGB m1, [r1+r2+1]
  71. PAVGB m2, [r1+9]
  72. PAVGB m3, [r1+r2+9]
  73. mova [r0], m0
  74. mova [r0+r2], m1
  75. mova [r0+8], m2
  76. mova [r0+r2+8], m3
  77. add r1, r4
  78. add r0, r4
  79. mova m0, [r1]
  80. mova m1, [r1+r2]
  81. mova m2, [r1+8]
  82. mova m3, [r1+r2+8]
  83. PAVGB m0, [r1+1]
  84. PAVGB m1, [r1+r2+1]
  85. PAVGB m2, [r1+9]
  86. PAVGB m3, [r1+r2+9]
  87. add r1, r4
  88. mova [r0], m0
  89. mova [r0+r2], m1
  90. mova [r0+8], m2
  91. mova [r0+r2+8], m3
  92. add r0, r4
  93. sub r3d, 4
  94. jne .loop
  95. REP_RET
  96. %endmacro
  97. INIT_MMX mmxext
  98. PUT_PIXELS_16
  99. INIT_MMX 3dnow
  100. PUT_PIXELS_16
  101. ; put_no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
  102. %macro PUT_NO_RND_PIXELS8_X2 0
  103. cglobal put_no_rnd_pixels8_x2, 4,5
  104. mova m6, [pb_1]
  105. lea r4, [r2*2]
  106. .loop:
  107. mova m0, [r1]
  108. mova m2, [r1+r2]
  109. mova m1, [r1+1]
  110. mova m3, [r1+r2+1]
  111. add r1, r4
  112. psubusb m0, m6
  113. psubusb m2, m6
  114. PAVGB m0, m1
  115. PAVGB m2, m3
  116. mova [r0], m0
  117. mova [r0+r2], m2
  118. mova m0, [r1]
  119. mova m1, [r1+1]
  120. mova m2, [r1+r2]
  121. mova m3, [r1+r2+1]
  122. add r0, r4
  123. add r1, r4
  124. psubusb m0, m6
  125. psubusb m2, m6
  126. PAVGB m0, m1
  127. PAVGB m2, m3
  128. mova [r0], m0
  129. mova [r0+r2], m2
  130. add r0, r4
  131. sub r3d, 4
  132. jne .loop
  133. REP_RET
  134. %endmacro
  135. INIT_MMX mmxext
  136. PUT_NO_RND_PIXELS8_X2
  137. INIT_MMX 3dnow
  138. PUT_NO_RND_PIXELS8_X2
  139. ; put_no_rnd_pixels8_x2_exact(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
  140. %macro PUT_NO_RND_PIXELS8_X2_EXACT 0
  141. cglobal put_no_rnd_pixels8_x2_exact, 4,5
  142. lea r4, [r2*3]
  143. pcmpeqb m6, m6
  144. .loop:
  145. mova m0, [r1]
  146. mova m2, [r1+r2]
  147. mova m1, [r1+1]
  148. mova m3, [r1+r2+1]
  149. pxor m0, m6
  150. pxor m2, m6
  151. pxor m1, m6
  152. pxor m3, m6
  153. PAVGB m0, m1
  154. PAVGB m2, m3
  155. pxor m0, m6
  156. pxor m2, m6
  157. mova [r0], m0
  158. mova [r0+r2], m2
  159. mova m0, [r1+r2*2]
  160. mova m1, [r1+r2*2+1]
  161. mova m2, [r1+r4]
  162. mova m3, [r1+r4+1]
  163. pxor m0, m6
  164. pxor m1, m6
  165. pxor m2, m6
  166. pxor m3, m6
  167. PAVGB m0, m1
  168. PAVGB m2, m3
  169. pxor m0, m6
  170. pxor m2, m6
  171. mova [r0+r2*2], m0
  172. mova [r0+r4], m2
  173. lea r1, [r1+r2*4]
  174. lea r0, [r0+r2*4]
  175. sub r3d, 4
  176. jg .loop
  177. REP_RET
  178. %endmacro
  179. INIT_MMX mmxext
  180. PUT_NO_RND_PIXELS8_X2_EXACT
  181. INIT_MMX 3dnow
  182. PUT_NO_RND_PIXELS8_X2_EXACT
  183. ; put_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
  184. %macro PUT_PIXELS8_Y2 0
  185. cglobal put_pixels8_y2, 4,5
  186. lea r4, [r2*2]
  187. mova m0, [r1]
  188. sub r0, r2
  189. .loop:
  190. mova m1, [r1+r2]
  191. mova m2, [r1+r4]
  192. add r1, r4
  193. PAVGB m0, m1
  194. PAVGB m1, m2
  195. mova [r0+r2], m0
  196. mova [r0+r4], m1
  197. mova m1, [r1+r2]
  198. mova m0, [r1+r4]
  199. add r0, r4
  200. add r1, r4
  201. PAVGB m2, m1
  202. PAVGB m1, m0
  203. mova [r0+r2], m2
  204. mova [r0+r4], m1
  205. add r0, r4
  206. sub r3d, 4
  207. jne .loop
  208. REP_RET
  209. %endmacro
  210. INIT_MMX mmxext
  211. PUT_PIXELS8_Y2
  212. INIT_MMX 3dnow
  213. PUT_PIXELS8_Y2
  214. ; put_no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
  215. %macro PUT_NO_RND_PIXELS8_Y2 0
  216. cglobal put_no_rnd_pixels8_y2, 4,5
  217. mova m6, [pb_1]
  218. lea r4, [r2+r2]
  219. mova m0, [r1]
  220. sub r0, r2
  221. .loop:
  222. mova m1, [r1+r2]
  223. mova m2, [r1+r4]
  224. add r1, r4
  225. psubusb m1, m6
  226. PAVGB m0, m1
  227. PAVGB m1, m2
  228. mova [r0+r2], m0
  229. mova [r0+r4], m1
  230. mova m1, [r1+r2]
  231. mova m0, [r1+r4]
  232. add r0, r4
  233. add r1, r4
  234. psubusb m1, m6
  235. PAVGB m2, m1
  236. PAVGB m1, m0
  237. mova [r0+r2], m2
  238. mova [r0+r4], m1
  239. add r0, r4
  240. sub r3d, 4
  241. jne .loop
  242. REP_RET
  243. %endmacro
  244. INIT_MMX mmxext
  245. PUT_NO_RND_PIXELS8_Y2
  246. INIT_MMX 3dnow
  247. PUT_NO_RND_PIXELS8_Y2
  248. ; put_no_rnd_pixels8_y2_exact(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
  249. %macro PUT_NO_RND_PIXELS8_Y2_EXACT 0
  250. cglobal put_no_rnd_pixels8_y2_exact, 4,5
  251. lea r4, [r2*3]
  252. mova m0, [r1]
  253. pcmpeqb m6, m6
  254. add r1, r2
  255. pxor m0, m6
  256. .loop:
  257. mova m1, [r1]
  258. mova m2, [r1+r2]
  259. pxor m1, m6
  260. pxor m2, m6
  261. PAVGB m0, m1
  262. PAVGB m1, m2
  263. pxor m0, m6
  264. pxor m1, m6
  265. mova [r0], m0
  266. mova [r0+r2], m1
  267. mova m1, [r1+r2*2]
  268. mova m0, [r1+r4]
  269. pxor m1, m6
  270. pxor m0, m6
  271. PAVGB m2, m1
  272. PAVGB m1, m0
  273. pxor m2, m6
  274. pxor m1, m6
  275. mova [r0+r2*2], m2
  276. mova [r0+r4], m1
  277. lea r1, [r1+r2*4]
  278. lea r0, [r0+r2*4]
  279. sub r3d, 4
  280. jg .loop
  281. REP_RET
  282. %endmacro
  283. INIT_MMX mmxext
  284. PUT_NO_RND_PIXELS8_Y2_EXACT
  285. INIT_MMX 3dnow
  286. PUT_NO_RND_PIXELS8_Y2_EXACT
  287. ; avg_pixels8(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
  288. %macro AVG_PIXELS8 0
  289. cglobal avg_pixels8, 4,5
  290. lea r4, [r2*2]
  291. .loop:
  292. mova m0, [r0]
  293. mova m1, [r0+r2]
  294. PAVGB m0, [r1]
  295. PAVGB m1, [r1+r2]
  296. mova [r0], m0
  297. mova [r0+r2], m1
  298. add r1, r4
  299. add r0, r4
  300. mova m0, [r0]
  301. mova m1, [r0+r2]
  302. PAVGB m0, [r1]
  303. PAVGB m1, [r1+r2]
  304. add r1, r4
  305. mova [r0], m0
  306. mova [r0+r2], m1
  307. add r0, r4
  308. sub r3d, 4
  309. jne .loop
  310. REP_RET
  311. %endmacro
  312. INIT_MMX 3dnow
  313. AVG_PIXELS8
  314. ; avg_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
  315. %macro AVG_PIXELS8_X2 0
  316. cglobal avg_pixels8_x2, 4,5
  317. lea r4, [r2*2]
  318. .loop:
  319. mova m0, [r1]
  320. mova m2, [r1+r2]
  321. PAVGB m0, [r1+1]
  322. PAVGB m2, [r1+r2+1]
  323. PAVGB m0, [r0]
  324. PAVGB m2, [r0+r2]
  325. add r1, r4
  326. mova [r0], m0
  327. mova [r0+r2], m2
  328. mova m0, [r1]
  329. mova m2, [r1+r2]
  330. PAVGB m0, [r1+1]
  331. PAVGB m2, [r1+r2+1]
  332. add r0, r4
  333. add r1, r4
  334. PAVGB m0, [r0]
  335. PAVGB m2, [r0+r2]
  336. mova [r0], m0
  337. mova [r0+r2], m2
  338. add r0, r4
  339. sub r3d, 4
  340. jne .loop
  341. REP_RET
  342. %endmacro
  343. INIT_MMX mmxext
  344. AVG_PIXELS8_X2
  345. INIT_MMX 3dnow
  346. AVG_PIXELS8_X2
  347. ; avg_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
  348. %macro AVG_PIXELS8_Y2 0
  349. cglobal avg_pixels8_y2, 4,5
  350. lea r4, [r2*2]
  351. mova m0, [r1]
  352. sub r0, r2
  353. .loop:
  354. mova m1, [r1+r2]
  355. mova m2, [r1+r4]
  356. add r1, r4
  357. PAVGB m0, m1
  358. PAVGB m1, m2
  359. mova m3, [r0+r2]
  360. mova m4, [r0+r4]
  361. PAVGB m0, m3
  362. PAVGB m1, m4
  363. mova [r0+r2], m0
  364. mova [r0+r4], m1
  365. mova m1, [r1+r2]
  366. mova m0, [r1+r4]
  367. PAVGB m2, m1
  368. PAVGB m1, m0
  369. add r0, r4
  370. add r1, r4
  371. mova m3, [r0+r2]
  372. mova m4, [r0+r4]
  373. PAVGB m2, m3
  374. PAVGB m1, m4
  375. mova [r0+r2], m2
  376. mova [r0+r4], m1
  377. add r0, r4
  378. sub r3d, 4
  379. jne .loop
  380. REP_RET
  381. %endmacro
  382. INIT_MMX mmxext
  383. AVG_PIXELS8_Y2
  384. INIT_MMX 3dnow
  385. AVG_PIXELS8_Y2
  386. ; avg_pixels8_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
  387. %macro AVG_PIXELS8_XY2 0
  388. cglobal avg_pixels8_xy2, 4,5
  389. mova m6, [pb_1]
  390. lea r4, [r2*2]
  391. mova m0, [r1]
  392. pavgb m0, [r1+1]
  393. .loop:
  394. mova m2, [r1+r4]
  395. mova m1, [r1+r2]
  396. psubusb m2, m6
  397. pavgb m1, [r1+r2+1]
  398. pavgb m2, [r1+r4+1]
  399. add r1, r4
  400. pavgb m0, m1
  401. pavgb m1, m2
  402. pavgb m0, [r0]
  403. pavgb m1, [r0+r2]
  404. mova [r0], m0
  405. mova [r0+r2], m1
  406. mova m1, [r1+r2]
  407. mova m0, [r1+r4]
  408. pavgb m1, [r1+r2+1]
  409. pavgb m0, [r1+r4+1]
  410. add r0, r4
  411. add r1, r4
  412. pavgb m2, m1
  413. pavgb m1, m0
  414. pavgb m2, [r0]
  415. pavgb m1, [r0+r2]
  416. mova [r0], m2
  417. mova [r0+r2], m1
  418. add r0, r4
  419. sub r3d, 4
  420. jne .loop
  421. REP_RET
  422. %endmacro
  423. INIT_MMX mmxext
  424. AVG_PIXELS8_XY2
  425. INIT_MMX 3dnow
  426. AVG_PIXELS8_XY2
  427. INIT_XMM sse2
  428. ; void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
  429. cglobal put_pixels16, 4,5,4
  430. lea r4, [r2*3]
  431. .loop:
  432. movu m0, [r1]
  433. movu m1, [r1+r2]
  434. movu m2, [r1+r2*2]
  435. movu m3, [r1+r4]
  436. lea r1, [r1+r2*4]
  437. mova [r0], m0
  438. mova [r0+r2], m1
  439. mova [r0+r2*2], m2
  440. mova [r0+r4], m3
  441. sub r3d, 4
  442. lea r0, [r0+r2*4]
  443. jnz .loop
  444. REP_RET
  445. ; void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
  446. cglobal avg_pixels16, 4,5,4
  447. lea r4, [r2*3]
  448. .loop:
  449. movu m0, [r1]
  450. movu m1, [r1+r2]
  451. movu m2, [r1+r2*2]
  452. movu m3, [r1+r4]
  453. lea r1, [r1+r2*4]
  454. pavgb m0, [r0]
  455. pavgb m1, [r0+r2]
  456. pavgb m2, [r0+r2*2]
  457. pavgb m3, [r0+r4]
  458. mova [r0], m0
  459. mova [r0+r2], m1
  460. mova [r0+r2*2], m2
  461. mova [r0+r4], m3
  462. sub r3d, 4
  463. lea r0, [r0+r2*4]
  464. jnz .loop
  465. REP_RET