You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

560 lines
14KB

  1. ;******************************************************************************
  2. ;* quarterpel DSP functions
  3. ;*
  4. ;* Copyright (c) 2008 Loren Merritt
  5. ;*
  6. ;* This file is part of Libav.
  7. ;*
  8. ;* Libav is free software; you can redistribute it and/or
  9. ;* modify it under the terms of the GNU Lesser General Public
  10. ;* License as published by the Free Software Foundation; either
  11. ;* version 2.1 of the License, or (at your option) any later version.
  12. ;*
  13. ;* Libav is distributed in the hope that it will be useful,
  14. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16. ;* Lesser General Public License for more details.
  17. ;*
  18. ;* You should have received a copy of the GNU Lesser General Public
  19. ;* License along with Libav; if not, write to the Free Software
  20. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21. ;******************************************************************************
  22. %include "libavutil/x86/x86util.asm"
  23. SECTION_RODATA
  24. cextern pb_1
  25. cextern pw_3
  26. cextern pw_15
  27. cextern pw_16
  28. cextern pw_20
  29. SECTION .text
  ;-----------------------------------------------------------------------------
  ; void ff_put_no_rnd_pixels8_l2(uint8_t *dst, uint8_t *src1, uint8_t *src2,
  ;                               int dstStride, int src1Stride, int h)
  ;
  ; Writes h rows of 8 bytes, each row being ~PAVGB(~src1, ~src2).
  ; PAVGB (an x86util macro) is expected to be the pavgb-style average that
  ; rounds halves up; complementing both inputs and the result turns it into
  ; the round-down average.
  ;
  ; Register roles (x86inc numbering, arguments in declaration order):
  ;   r0 = dst     advances by r3 (dstStride) per row
  ;   r1 = src1    advances by r4 (src1Stride) per row
  ;   r2 = src2    advances by 8 bytes per row (read as consecutive 8-byte rows)
  ;   r5 = h       rows still to do
  ;   m6 = all-ones mask used for the complements
  ;
  ; An odd h is handled by peeling one row before the loop.  The loop does four
  ; rows per pass and leaves only when the counter reaches exactly zero, so the
  ; count entering the loop must be a positive multiple of 4.
  ;-----------------------------------------------------------------------------

  ; Two rows of the round-down average.  The argument is the byte offset, from
  ; r2, of the first of the two src2 rows.  Advances r0 and r1 by two rows and
  ; leaves r2 untouched.  Needs m6 = all ones; clobbers m0-m3.
  ; Both source rows are loaded before either destination row is stored.
  %macro NO_RND_AVG8_2ROWS 1
      mova         m0, [r1]
      add          r1, r4
      mova         m1, [r1]
      add          r1, r4
      mova         m2, [r2+%1]
      mova         m3, [r2+%1+8]
      pxor         m0, m6
      pxor         m1, m6
      pxor         m2, m6
      pxor         m3, m6
      PAVGB        m0, m2
      PAVGB        m1, m3
      pxor         m0, m6
      pxor         m1, m6
      mova         [r0], m0
      add          r0, r3
      mova         [r0], m1
      add          r0, r3
  %endmacro

  %macro PUT_NO_RND_PIXELS8_L2 0
  cglobal put_no_rnd_pixels8_l2, 6,6
      movsxdifnidn r3, r3d
      movsxdifnidn r4, r4d
      pcmpeqb      m6, m6               ; m6 = 0xFF in every byte
      test         r5d, 1
      jz           .loop                ; even h: straight to the 4-row loop

      ; odd h: peel a single row
      mova         m0, [r1]
      mova         m1, [r2]
      pxor         m0, m6
      pxor         m1, m6
      PAVGB        m0, m1
      pxor         m0, m6
      mova         [r0], m0
      add          r0, r3
      add          r1, r4
      add          r2, 8
      dec          r5d

  .loop:
      NO_RND_AVG8_2ROWS 0               ; src2 rows at r2+0  and r2+8
      NO_RND_AVG8_2ROWS 16              ; src2 rows at r2+16 and r2+24
      add          r2, 32               ; four 8-byte src2 rows consumed
      sub          r5d, 4
      jnz          .loop
      REP_RET
  %endmacro

  INIT_MMX mmxext
  PUT_NO_RND_PIXELS8_L2
  ;-----------------------------------------------------------------------------
  ; void ff_put_no_rnd_pixels16_l2(uint8_t *dst, uint8_t *src1, uint8_t *src2,
  ;                                int dstStride, int src1Stride, int h)
  ;
  ; 16-byte-wide counterpart of the 8-wide routine: every row is written as
  ; ~PAVGB(~src1, ~src2), handled as two 8-byte halves.
  ;
  ; Register roles (x86inc numbering, arguments in declaration order):
  ;   r0 = dst     advances by r3 (dstStride) per row
  ;   r1 = src1    advances by r4 (src1Stride) per row
  ;   r2 = src2    advances by 16 bytes per row (read as consecutive 16-byte rows)
  ;   r5 = h       rows still to do
  ;   m6 = all-ones mask used for the complements
  ;
  ; An odd h is handled by peeling one row before the loop.  The loop does two
  ; rows per pass and leaves only when the counter reaches exactly zero, so the
  ; count entering the loop must be a positive multiple of 2.
  ;-----------------------------------------------------------------------------

  ; One 16-byte row of the round-down average.  The argument is the byte offset
  ; of the src2 row from r2.  Advances r0 and r1 by one row and leaves r2
  ; untouched.  Needs m6 = all ones; clobbers m0-m3.
  %macro NO_RND_AVG16_ROW 1
      mova         m0, [r1]
      mova         m1, [r1+8]
      add          r1, r4
      mova         m2, [r2+%1]
      mova         m3, [r2+%1+8]
      pxor         m0, m6
      pxor         m1, m6
      pxor         m2, m6
      pxor         m3, m6
      PAVGB        m0, m2
      PAVGB        m1, m3
      pxor         m0, m6
      pxor         m1, m6
      mova         [r0], m0
      mova         [r0+8], m1
      add          r0, r3
  %endmacro

  %macro PUT_NO_RND_PIXELS16_l2 0
  cglobal put_no_rnd_pixels16_l2, 6,6
      movsxdifnidn r3, r3d
      movsxdifnidn r4, r4d
      pcmpeqb      m6, m6               ; m6 = 0xFF in every byte
      test         r5d, 1
      jz           .loop                ; even h: straight to the 2-row loop

      ; odd h: peel a single row
      NO_RND_AVG16_ROW 0
      add          r2, 16
      dec          r5d

  .loop:
      NO_RND_AVG16_ROW 0                ; src2 row at r2+0
      NO_RND_AVG16_ROW 16               ; src2 row at r2+16
      add          r2, 32               ; two 16-byte src2 rows consumed
      sub          r5d, 2
      jnz          .loop
      REP_RET
  %endmacro

  ; The same body is emitted once per instruction set; PAVGB is resolved by
  ; x86util at expansion time.
  INIT_MMX mmxext
  PUT_NO_RND_PIXELS16_l2
  INIT_MMX 3dnow
  PUT_NO_RND_PIXELS16_l2
  ;-----------------------------------------------------------------------------
  ; MPEG4_QPEL16_H_LOWPASS <prefix>
  ;
  ; Emits <prefix>_mpeg4_qpel16_h_lowpass: a horizontal 8-tap low-pass filter
  ; producing 16 output bytes per row.
  ;
  ;   out[i] = sat_u8(( 20*(x[i]  +x[i+1]) - 6*(x[i-1]+x[i+2])
  ;                   +  3*(x[i-2]+x[i+3]) -   (x[i-3]+x[i+4]) + PW_ROUND) >> 5)
  ;
  ; Per row the code loads 8 bytes at r1, r1+5 and r1+9, i.e. source bytes
  ; 0..16.  Taps that would fall outside that range are built with pshufw from
  ; in-range pixels, reflected about the edge (edge pixel repeated).
  ;
  ; Register roles as used below:
  ;   r0 = destination, stepped by r2 per row
  ;   r1 = source,      stepped by r3 per row
  ;   r4 = number of rows (loop runs until it reaches zero)
  ;   m7 = zero, for byte -> word unpacking
  ;
  ; PW_ROUND (rounding constant) and OP_MOV (store primitive, called as
  ; OP_MOV dst, value, scratch) must be %defined before instantiation.
  ; 16 bytes of stack are requested; only [rsp+8] is used, to park outputs 0..3.
  ;-----------------------------------------------------------------------------
  %macro MPEG4_QPEL16_H_LOWPASS 1
  cglobal %1_mpeg4_qpel16_h_lowpass, 5, 5, 0, 16
      movsxdifnidn r3, r3d
      movsxdifnidn r2, r2d
      pxor         m7, m7
  .loop:
      ; ---- outputs 0..3 (left edge reflected via pshufw) ----
      mova         m0, [r1]
      mova         m1, m0
      mova         m2, m0
      punpcklbw    m0, m7               ; m0 = x[0..3]
      punpckhbw    m1, m7               ; m1 = x[4..7]
      pshufw       m5, m0, 0x90         ; x[i-1], reflected
      pshufw       m6, m0, 0x41         ; x[i-2], reflected
      mova         m3, m2
      mova         m4, m2
      psllq        m2, 8
      psllq        m3, 16
      psllq        m4, 24
      punpckhbw    m2, m7               ; m2 = x[3..6]
      punpckhbw    m3, m7               ; m3 = x[2..5]
      punpckhbw    m4, m7               ; m4 = x[1..4]
      paddw        m5, m3
      paddw        m6, m2
      paddw        m5, m5
      psubw        m6, m5
      pshufw       m5, m0, 0x06         ; x[i-3], reflected
      pmullw       m6, [pw_3]
      paddw        m0, m4
      paddw        m5, m1
      pmullw       m0, [pw_20]
      psubw        m0, m5
      paddw        m6, [PW_ROUND]
      paddw        m0, m6
      psraw        m0, 5
      mova         [rsp+8], m0          ; park outputs 0..3 (words) while m0 is reused

      ; ---- outputs 4..7 ----
      mova         m0, [r1+5]
      mova         m5, m0
      mova         m6, m0
      psrlq        m0, 8
      psrlq        m5, 16
      punpcklbw    m0, m7
      punpcklbw    m5, m7
      paddw        m2, m0
      paddw        m3, m5
      paddw        m2, m2
      psubw        m3, m2
      mova         m2, m6
      psrlq        m6, 24
      punpcklbw    m2, m7
      punpcklbw    m6, m7
      pmullw       m3, [pw_3]
      paddw        m1, m2
      paddw        m4, m6
      pmullw       m1, [pw_20]
      psubw        m3, m4
      paddw        m1, [PW_ROUND]
      paddw        m3, m1
      psraw        m3, 5
      mova         m1, [rsp+8]
      packuswb     m1, m3               ; outputs 0..7 as saturated bytes
      OP_MOV       [r0], m1, m4

      ; ---- outputs 8..11 ----
      mova         m1, [r1+9]
      mova         m4, m1
      mova         m3, m1
      psrlq        m1, 8
      psrlq        m4, 16
      punpcklbw    m1, m7
      punpcklbw    m4, m7
      paddw        m5, m1
      paddw        m0, m4
      paddw        m5, m5
      psubw        m0, m5
      mova         m5, m3
      psrlq        m3, 24
      pmullw       m0, [pw_3]
      punpcklbw    m3, m7
      paddw        m2, m3
      psubw        m0, m2
      mova         m2, m5
      punpcklbw    m2, m7
      punpckhbw    m5, m7
      paddw        m6, m2
      pmullw       m6, [pw_20]
      paddw        m0, [PW_ROUND]
      paddw        m0, m6
      psraw        m0, 5

      ; ---- outputs 12..15 (right edge reflected via pshufw) ----
      paddw        m3, m5
      pshufw       m6, m5, 0xf9
      paddw        m6, m4
      pshufw       m4, m5, 0xbe
      pshufw       m5, m5, 0x6f
      paddw        m4, m1
      paddw        m5, m2
      paddw        m6, m6
      psubw        m4, m6
      pmullw       m3, [pw_20]
      pmullw       m4, [pw_3]
      psubw        m3, m5
      paddw        m4, [PW_ROUND]
      paddw        m4, m3
      psraw        m4, 5
      packuswb     m0, m4               ; outputs 8..15 as saturated bytes
      OP_MOV       [r0+8], m0, m4

      ; next row
      add          r0, r2
      add          r1, r3
      dec          r4d
      jnz          .loop
      REP_RET
  %endmacro
  ; Store primitive "put": write the value (second argument) to the destination
  ; (first argument) with a full-register move.  The optional third argument is
  ; accepted so the call shape matches AVG_OP; it is not used.
  %macro PUT_OP 2-3
      mova         %1, %2
  %endmacro
  ; Store primitive "avg": average the value (second argument) with what is
  ; already at the destination (first argument) using pavgb, then store the
  ; result back.  The third argument is a scratch register; it is declared
  ; optional but is used unconditionally, so callers must pass it.
  ; Clobbers the value register and the scratch register.
  %macro AVG_OP 2-3
      mova         %3, %1               ; current destination contents
      pavgb        %2, %3               ; blend with the new value
      mova         %1, %2               ; write back
  %endmacro
  ; Build the three 16-wide horizontal filters for MMXEXT.  Each variant picks
  ; its rounding constant (PW_ROUND) and store primitive (OP_MOV) right before
  ; the macro is expanded.
  INIT_MMX mmxext

  ; put: rounding constant pw_16, plain store
  %define PW_ROUND pw_16
  %define OP_MOV   PUT_OP
  MPEG4_QPEL16_H_LOWPASS put

  ; avg: rounding constant pw_16, store through AVG_OP
  %define PW_ROUND pw_16
  %define OP_MOV   AVG_OP
  MPEG4_QPEL16_H_LOWPASS avg

  ; put_no_rnd: rounding constant pw_15, plain store
  %define PW_ROUND pw_15
  %define OP_MOV   PUT_OP
  MPEG4_QPEL16_H_LOWPASS put_no_rnd
  ;-----------------------------------------------------------------------------
  ; MPEG4_QPEL8_H_LOWPASS <prefix>
  ;
  ; Emits <prefix>_mpeg4_qpel8_h_lowpass: a horizontal 8-tap low-pass filter
  ; producing 8 output bytes per row.
  ;
  ;   out[i] = sat_u8(( 20*(x[i]  +x[i+1]) - 6*(x[i-1]+x[i+2])
  ;                   +  3*(x[i-2]+x[i+3]) -   (x[i-3]+x[i+4]) + PW_ROUND) >> 5)
  ;
  ; Per row the code loads 8 bytes at r1 and a half register (movh) at r1+5.
  ; Taps outside the loaded range are built with pshufw from in-range pixels,
  ; reflected about the edge (edge pixel repeated).
  ;
  ; Register roles as used below:
  ;   r0 = destination, stepped by r2 per row
  ;   r1 = source,      stepped by r3 per row
  ;   r4 = number of rows (loop runs until it reaches zero)
  ;   m7 = zero, for byte -> word unpacking
  ;
  ; PW_ROUND (rounding constant) and OP_MOV (store primitive, called as
  ; OP_MOV dst, value, scratch) must be %defined before instantiation.
  ; 8 bytes of stack are requested from cglobal; nothing in this body
  ; references rsp.
  ;-----------------------------------------------------------------------------
  %macro MPEG4_QPEL8_H_LOWPASS 1
  cglobal %1_mpeg4_qpel8_h_lowpass, 5, 5, 0, 8
      movsxdifnidn r3, r3d
      movsxdifnidn r2, r2d
      pxor         m7, m7
  .loop:
      ; ---- outputs 0..3 (left edge reflected via pshufw) ----
      mova         m0, [r1]
      mova         m1, m0
      mova         m2, m0
      punpcklbw    m0, m7               ; m0 = x[0..3]
      punpckhbw    m1, m7               ; m1 = x[4..7]
      pshufw       m5, m0, 0x90         ; x[i-1], reflected
      pshufw       m6, m0, 0x41         ; x[i-2], reflected
      mova         m3, m2
      mova         m4, m2
      psllq        m2, 8
      psllq        m3, 16
      psllq        m4, 24
      punpckhbw    m2, m7               ; m2 = x[3..6]
      punpckhbw    m3, m7               ; m3 = x[2..5]
      punpckhbw    m4, m7               ; m4 = x[1..4]
      paddw        m5, m3
      paddw        m6, m2
      paddw        m5, m5
      psubw        m6, m5
      pshufw       m5, m0, 0x06         ; x[i-3], reflected
      pmullw       m6, [pw_3]
      paddw        m0, m4
      paddw        m5, m1
      pmullw       m0, [pw_20]
      psubw        m0, m5
      paddw        m6, [PW_ROUND]
      paddw        m0, m6
      psraw        m0, 5

      ; ---- outputs 4..7 (right edge reflected via pshufw) ----
      movh         m5, [r1+5]
      punpcklbw    m5, m7
      pshufw       m6, m5, 0xf9
      paddw        m1, m5
      paddw        m2, m6
      pshufw       m6, m5, 0xbe
      pshufw       m5, m5, 0x6f
      paddw        m3, m6
      paddw        m4, m5
      paddw        m2, m2
      psubw        m3, m2
      pmullw       m1, [pw_20]
      pmullw       m3, [pw_3]
      psubw        m3, m4
      paddw        m1, [PW_ROUND]
      paddw        m3, m1
      psraw        m3, 5
      packuswb     m0, m3               ; outputs 0..7 as saturated bytes
      OP_MOV       [r0], m0, m4

      ; next row
      add          r0, r2
      add          r1, r3
      dec          r4d
      jnz          .loop
      REP_RET
  %endmacro
  ; Build the three 8-wide horizontal filters for MMXEXT.  Each variant picks
  ; its rounding constant (PW_ROUND) and store primitive (OP_MOV) right before
  ; the macro is expanded.
  INIT_MMX mmxext

  ; put: rounding constant pw_16, plain store
  %define PW_ROUND pw_16
  %define OP_MOV   PUT_OP
  MPEG4_QPEL8_H_LOWPASS put

  ; avg: rounding constant pw_16, store through AVG_OP
  %define PW_ROUND pw_16
  %define OP_MOV   AVG_OP
  MPEG4_QPEL8_H_LOWPASS avg

  ; put_no_rnd: rounding constant pw_15, plain store
  %define PW_ROUND pw_15
  %define OP_MOV   PUT_OP
  MPEG4_QPEL8_H_LOWPASS put_no_rnd
  ;-----------------------------------------------------------------------------
  ; QPEL_V_LOW far_above, mid_above, near_above, next_below, dst
  ;
  ; One output row of the vertical 8-tap low-pass filter, one register of
  ; 16-bit pixels wide.
  ;
  ; On entry m0..m3 hold four consecutive source rows as words; m0 and m1 are
  ; the pair that receives the 20 tap.  The memory arguments supply the rest:
  ;   argument 1  row paired with argument 4 for the -1 tap
  ;   argument 2  row paired with m3         for the +3 tap
  ;   argument 3  row paired with m2         for the -6 tap
  ;   argument 4  the next row below; it is loaded into m0
  ;   argument 5  destination, written through OP_MOV with m7 as scratch
  ;
  ;   out = sat_u8(( 20*(m0+m1) - 6*(arg3+m2) + 3*(arg2+m3)
  ;                - (arg1+arg4) + PW_ROUND) >> 5)
  ;
  ; Clobbers m4-m6 and hands m7 to OP_MOV.  The closing SWAP rotates the
  ; register names so that m0..m3 again name four consecutive rows, one row
  ; further down, ready for the next invocation.
  ; PW_ROUND and OP_MOV must be %defined by the instantiating code.
  ;-----------------------------------------------------------------------------
  %macro QPEL_V_LOW 5
      paddw        m0, m1
      mova         m4, [pw_20]
      pmullw       m4, m0               ; m4 = 20 * (two centre rows)
      mova         m0, %4               ; bring in the next row below
      mova         m5, %1
      paddw        m5, m0
      psubw        m4, m5               ; ... minus the outermost pair
      mova         m5, %2
      mova         m6, %3
      paddw        m5, m3
      paddw        m6, m2
      paddw        m6, m6
      psubw        m5, m6
      pmullw       m5, [pw_3]           ; m5 = 3*pair - 6*pair
      paddw        m4, [PW_ROUND]
      paddw        m5, m4
      psraw        m5, 5
      packuswb     m5, m5
      OP_MOV       %5, m5, m7
      SWAP 0,1,2,3
  %endmacro
  ;-----------------------------------------------------------------------------
  ; MPEG4_QPEL16_V_LOWPASS <prefix>
  ;
  ; Emits <prefix>_mpeg4_qpel16_v_lowpass: the vertical 8-tap low-pass filter
  ; for a 16-pixel-wide, 16-row-high block.
  ;
  ; Register roles as used below:
  ;   r0 = destination        r2 = destination row step
  ;   r1 = source             r3 = source row step
  ;   r4 = loop counter       r5 = cursor into the stack buffer
  ;
  ; Pass 1 (.looph): 17 source rows of 16 bytes are widened to words and laid
  ;   out on the stack as four column strips.  A strip is 17 rows * 8 bytes =
  ;   0x88 bytes; the four strips start at rsp+0, +0x88, +0x110 and +0x198 and
  ;   together fill the 544 bytes requested from cglobal.
  ; Pass 2 (.loopv): for each strip, 16 QPEL_V_LOW steps write 16 output rows.
  ;   Rows that would lie above the first or below the last stored row are
  ;   replaced by in-strip rows reflected about that edge (note the repeated
  ;   offsets in the first three and last three invocations).
  ;
  ; r1 is dead as a source pointer after pass 1 and is reused to hold
  ; 4 - 14*r2.  Within a strip r0 moves down 14 rows, so adding r1 afterwards
  ; rewinds to the first row and steps 4 bytes to the right.
  ;-----------------------------------------------------------------------------
  %macro MPEG4_QPEL16_V_LOWPASS 1
  cglobal %1_mpeg4_qpel16_v_lowpass, 4, 6, 0, 544
      movsxdifnidn r2, r2d
      movsxdifnidn r3, r3d

      ; ---- pass 1: unpack 17 rows into four word strips on the stack ----
      mov          r4d, 17
      mov          r5, rsp
      pxor         m7, m7
  .looph:
      mova         m0, [r1]
      mova         m1, [r1]
      mova         m2, [r1+8]
      mova         m3, [r1+8]
      punpcklbw    m0, m7               ; columns  0..3
      punpckhbw    m1, m7               ; columns  4..7
      punpcklbw    m2, m7               ; columns  8..11
      punpckhbw    m3, m7               ; columns 12..15
      mova         [r5], m0
      mova         [r5+0x88], m1
      mova         [r5+0x110], m2
      mova         [r5+0x198], m3
      add          r5, 8
      add          r1, r3
      dec          r4d
      jnz          .looph

      ; ---- r1 := 4 - 14*r2 (per-strip destination fix-up) ----
      mov          r4d, 4               ; four strips to filter
      mov          r1, 4
      neg          r2
      lea          r1, [r1+r2*8]
      lea          r1, [r1+r2*4]
      lea          r1, [r1+r2*2]
      neg          r2
      mov          r5, rsp

      ; ---- pass 2: filter one strip per iteration ----
  .loopv:
      pxor         m7, m7               ; m7 is handed to OP_MOV as scratch below
      mova         m0, [r5+0x00]
      mova         m1, [r5+0x08]
      mova         m2, [r5+0x10]
      mova         m3, [r5+0x18]
      QPEL_V_LOW   [r5+0x10], [r5+0x08], [r5+0x00], [r5+0x20], [r0]
      QPEL_V_LOW   [r5+0x08], [r5+0x00], [r5+0x00], [r5+0x28], [r0+r2]
      lea          r0, [r0+r2*2]
      QPEL_V_LOW   [r5+0x00], [r5+0x00], [r5+0x08], [r5+0x30], [r0]
      QPEL_V_LOW   [r5+0x00], [r5+0x08], [r5+0x10], [r5+0x38], [r0+r2]
      lea          r0, [r0+r2*2]
      QPEL_V_LOW   [r5+0x08], [r5+0x10], [r5+0x18], [r5+0x40], [r0]
      QPEL_V_LOW   [r5+0x10], [r5+0x18], [r5+0x20], [r5+0x48], [r0+r2]
      lea          r0, [r0+r2*2]
      QPEL_V_LOW   [r5+0x18], [r5+0x20], [r5+0x28], [r5+0x50], [r0]
      QPEL_V_LOW   [r5+0x20], [r5+0x28], [r5+0x30], [r5+0x58], [r0+r2]
      lea          r0, [r0+r2*2]
      QPEL_V_LOW   [r5+0x28], [r5+0x30], [r5+0x38], [r5+0x60], [r0]
      QPEL_V_LOW   [r5+0x30], [r5+0x38], [r5+0x40], [r5+0x68], [r0+r2]
      lea          r0, [r0+r2*2]
      QPEL_V_LOW   [r5+0x38], [r5+0x40], [r5+0x48], [r5+0x70], [r0]
      QPEL_V_LOW   [r5+0x40], [r5+0x48], [r5+0x50], [r5+0x78], [r0+r2]
      lea          r0, [r0+r2*2]
      QPEL_V_LOW   [r5+0x48], [r5+0x50], [r5+0x58], [r5+0x80], [r0]
      QPEL_V_LOW   [r5+0x50], [r5+0x58], [r5+0x60], [r5+0x80], [r0+r2]
      lea          r0, [r0+r2*2]
      QPEL_V_LOW   [r5+0x58], [r5+0x60], [r5+0x68], [r5+0x78], [r0]
      QPEL_V_LOW   [r5+0x60], [r5+0x68], [r5+0x70], [r5+0x70], [r0+r2]

      add          r5, 0x88             ; next strip in the stack buffer
      add          r0, r1               ; back to the first row, 4 bytes right
      dec          r4d
      jnz          .loopv
      REP_RET
  %endmacro
  ; Half-register store primitive "put": write the value (second argument) to
  ; the destination (first argument) with movh.  The optional third argument is
  ; accepted so the call shape matches AVG_OPH; it is not used.
  %macro PUT_OPH 2-3
      movh         %1, %2
  %endmacro
  ; Half-register store primitive "avg": fetch what is at the destination
  ; (first argument) with movh, average it with the value (second argument)
  ; using pavgb, and store the result back with movh.  The third argument is a
  ; scratch register; it is declared optional but is used unconditionally, so
  ; callers must pass it.  Clobbers the value register and the scratch register.
  %macro AVG_OPH 2-3
      movh         %3, %1               ; current destination contents
      pavgb        %2, %3               ; blend with the new value
      movh         %1, %2               ; write back
  %endmacro
  ; Build the three 16-wide vertical filters for MMXEXT.  Each variant picks
  ; its rounding constant (PW_ROUND) and half-register store primitive (OP_MOV)
  ; right before the macro is expanded.
  INIT_MMX mmxext

  ; put: rounding constant pw_16, plain store
  %define PW_ROUND pw_16
  %define OP_MOV   PUT_OPH
  MPEG4_QPEL16_V_LOWPASS put

  ; avg: rounding constant pw_16, store through AVG_OPH
  %define PW_ROUND pw_16
  %define OP_MOV   AVG_OPH
  MPEG4_QPEL16_V_LOWPASS avg

  ; put_no_rnd: rounding constant pw_15, plain store
  %define PW_ROUND pw_15
  %define OP_MOV   PUT_OPH
  MPEG4_QPEL16_V_LOWPASS put_no_rnd
  ;-----------------------------------------------------------------------------
  ; MPEG4_QPEL8_V_LOWPASS <prefix>
  ;
  ; Emits <prefix>_mpeg4_qpel8_v_lowpass: the vertical 8-tap low-pass filter
  ; for an 8-pixel-wide, 8-row-high block.
  ;
  ; Register roles as used below:
  ;   r0 = destination        r2 = destination row step
  ;   r1 = source             r3 = source row step
  ;   r4 = loop counter       r5 = cursor into the stack buffer
  ;
  ; Pass 1 (.looph): 9 source rows of 8 bytes are widened to words and laid out
  ;   on the stack as two column strips of 9 rows * 8 bytes = 0x48 bytes, at
  ;   rsp+0 and rsp+0x48.  cglobal is asked for 288 bytes; this body touches
  ;   only the first 0x90 of them.
  ; Pass 2 (.loopv): for each strip, 8 QPEL_V_LOW steps write 8 output rows.
  ;   Rows that would lie above the first or below the last stored row are
  ;   replaced by in-strip rows reflected about that edge (note the repeated
  ;   offsets in the first three and last three invocations).
  ;
  ; r1 is dead as a source pointer after pass 1 and is reused to hold
  ; 4 - 6*r2.  Within a strip r0 moves down 6 rows, so adding r1 afterwards
  ; rewinds to the first row and steps 4 bytes to the right.
  ;-----------------------------------------------------------------------------
  %macro MPEG4_QPEL8_V_LOWPASS 1
  cglobal %1_mpeg4_qpel8_v_lowpass, 4, 6, 0, 288
      movsxdifnidn r2, r2d
      movsxdifnidn r3, r3d

      ; ---- pass 1: unpack 9 rows into two word strips on the stack ----
      mov          r4d, 9
      mov          r5, rsp
      pxor         m7, m7
  .looph:
      mova         m0, [r1]
      mova         m1, [r1]
      punpcklbw    m0, m7               ; columns 0..3
      punpckhbw    m1, m7               ; columns 4..7
      mova         [r5], m0
      mova         [r5+0x48], m1
      add          r5, 8
      add          r1, r3
      dec          r4d
      jnz          .looph

      ; ---- r1 := 4 - 6*r2 (per-strip destination fix-up) ----
      mov          r4d, 2               ; two strips to filter
      mov          r1, 4
      neg          r2
      lea          r1, [r1+r2*4]
      lea          r1, [r1+r2*2]
      neg          r2
      mov          r5, rsp

      ; ---- pass 2: filter one strip per iteration ----
  .loopv:
      pxor         m7, m7               ; m7 is handed to OP_MOV as scratch below
      mova         m0, [r5+0x00]
      mova         m1, [r5+0x08]
      mova         m2, [r5+0x10]
      mova         m3, [r5+0x18]
      QPEL_V_LOW   [r5+0x10], [r5+0x08], [r5+0x00], [r5+0x20], [r0]
      QPEL_V_LOW   [r5+0x08], [r5+0x00], [r5+0x00], [r5+0x28], [r0+r2]
      lea          r0, [r0+r2*2]
      QPEL_V_LOW   [r5+0x00], [r5+0x00], [r5+0x08], [r5+0x30], [r0]
      QPEL_V_LOW   [r5+0x00], [r5+0x08], [r5+0x10], [r5+0x38], [r0+r2]
      lea          r0, [r0+r2*2]
      QPEL_V_LOW   [r5+0x08], [r5+0x10], [r5+0x18], [r5+0x40], [r0]
      QPEL_V_LOW   [r5+0x10], [r5+0x18], [r5+0x20], [r5+0x40], [r0+r2]
      lea          r0, [r0+r2*2]
      QPEL_V_LOW   [r5+0x18], [r5+0x20], [r5+0x28], [r5+0x38], [r0]
      QPEL_V_LOW   [r5+0x20], [r5+0x28], [r5+0x30], [r5+0x30], [r0+r2]

      add          r5, 0x48             ; next strip in the stack buffer
      add          r0, r1               ; back to the first row, 4 bytes right
      dec          r4d
      jnz          .loopv
      REP_RET
  %endmacro
  ; Build the three 8-wide vertical filters for MMXEXT.  Each variant picks its
  ; rounding constant (PW_ROUND) and half-register store primitive (OP_MOV)
  ; right before the macro is expanded.
  INIT_MMX mmxext

  ; put: rounding constant pw_16, plain store
  %define PW_ROUND pw_16
  %define OP_MOV   PUT_OPH
  MPEG4_QPEL8_V_LOWPASS put

  ; avg: rounding constant pw_16, store through AVG_OPH
  %define PW_ROUND pw_16
  %define OP_MOV   AVG_OPH
  MPEG4_QPEL8_V_LOWPASS avg

  ; put_no_rnd: rounding constant pw_15, plain store
  %define PW_ROUND pw_15
  %define OP_MOV   PUT_OPH
  MPEG4_QPEL8_V_LOWPASS put_no_rnd