You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

559 lines
14KB

  1. ;******************************************************************************
  2. ;* mpeg4 qpel
  3. ;* Copyright (c) 2008 Loren Merritt
  4. ;*
  5. ;* This file is part of Libav.
  6. ;*
  7. ;* Libav is free software; you can redistribute it and/or
  8. ;* modify it under the terms of the GNU Lesser General Public
  9. ;* License as published by the Free Software Foundation; either
  10. ;* version 2.1 of the License, or (at your option) any later version.
  11. ;*
  12. ;* Libav is distributed in the hope that it will be useful,
  13. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. ;* Lesser General Public License for more details.
  16. ;*
  17. ;* You should have received a copy of the GNU Lesser General Public
  18. ;* License along with Libav; if not, write to the Free Software
  19. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. ;******************************************************************************
  21. %include "libavutil/x86/x86util.asm"
  22. SECTION_RODATA
  23. cextern pb_1
  24. cextern pw_3
  25. cextern pw_15
  26. cextern pw_16
  27. cextern pw_20
  28. SECTION_TEXT
  29. ; void ff_put_no_rnd_pixels8_l2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  ; For each of h rows, stores to dst the 8-byte average of one src1 row and one src2 row.
  ; Register use in the body (following the prototype above and the cglobal argument order):
  ;   r0 = dst, r1 = src1, r2 = src2, r3 = dstStride, r4 = src1Stride, r5d = h.
  ; src2 is read as a packed buffer: it advances by exactly 8 bytes per row, not by a stride.
  ; Both inputs and the result are bitwise inverted around PAVGB; with an average that
  ; rounds up, this produces an average that rounds down (the "no_rnd" in the name).
  ; NOTE(review): PAVGB is defined outside this file (x86util.asm); presumably it expands to
  ; pavgb or pavgusb depending on the selected instruction set -- confirm.
  ; The main loop emits 4 rows per pass and exits only when the counter reaches exactly zero,
  ; so h, after the optional leading odd row, has to be a positive multiple of 4.
  30. %macro PUT_NO_RND_PIXELS8_L2 0
  31. cglobal put_no_rnd_pixels8_l2, 6,6
  32. movsxdifnidn r4, r4d ; widen the int src1Stride for pointer arithmetic (x86inc helper)
  33. movsxdifnidn r3, r3d ; widen the int dstStride for pointer arithmetic (x86inc helper)
  34. pcmpeqb m6, m6 ; m6 = all ones; XOR with it is a bitwise NOT
  35. test r5d, 1
  36. je .loop ; h is even: skip the single-row prologue
  ; Odd h: handle one row up front so the remaining count is even.
  37. mova m0, [r1]
  38. mova m1, [r2]
  39. add r1, r4
  40. add r2, 8
  41. pxor m0, m6
  42. pxor m1, m6
  43. PAVGB m0, m1
  44. pxor m0, m6
  45. mova [r0], m0
  46. add r0, r3
  47. dec r5d
  48. .loop:
  ; Rows 0 and 1 of this pass: src1 rows in m0/m1, src2 rows in m2/m3.
  49. mova m0, [r1]
  50. add r1, r4
  51. mova m1, [r1]
  52. add r1, r4
  53. mova m2, [r2]
  54. mova m3, [r2+8]
  55. pxor m0, m6
  56. pxor m1, m6
  57. pxor m2, m6
  58. pxor m3, m6
  59. PAVGB m0, m2
  60. PAVGB m1, m3
  61. pxor m0, m6
  62. pxor m1, m6
  63. mova [r0], m0
  64. add r0, r3
  65. mova [r0], m1
  66. add r0, r3
  ; Rows 2 and 3 of this pass: same sequence, src2 read at offsets 16 and 24.
  67. mova m0, [r1]
  68. add r1, r4
  69. mova m1, [r1]
  70. add r1, r4
  71. mova m2, [r2+16]
  72. mova m3, [r2+24]
  73. pxor m0, m6
  74. pxor m1, m6
  75. pxor m2, m6
  76. pxor m3, m6
  77. PAVGB m0, m2
  78. PAVGB m1, m3
  79. pxor m0, m6
  80. pxor m1, m6
  81. mova [r0], m0
  82. add r0, r3
  83. mova [r0], m1
  84. add r0, r3
  85. add r2, 32 ; four packed 8-byte src2 rows consumed
  86. sub r5d, 4 ; four rows written in this pass
  87. jne .loop
  88. REP_RET
  89. %endmacro
  ; Expand PUT_NO_RND_PIXELS8_L2 once, with the mmxext instruction set selected.
  90. INIT_MMX mmxext
  91. PUT_NO_RND_PIXELS8_L2
  92. ; void ff_put_no_rnd_pixels16_l2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  ; 16-pixel-wide counterpart of the 8-pixel routine: each row is handled as two 8-byte halves.
  ; Register use in the body (following the prototype above and the cglobal argument order):
  ;   r0 = dst, r1 = src1, r2 = src2, r3 = dstStride, r4 = src1Stride, r5d = h.
  ; src2 is read as a packed buffer that advances by exactly 16 bytes per row.
  ; Inputs and result are inverted around PAVGB so that a round-up average becomes round-down.
  ; NOTE(review): PAVGB is defined outside this file (x86util.asm) -- confirm its expansion.
  ; The main loop emits 2 rows per pass and exits only when the counter reaches exactly zero,
  ; so h, after the optional leading odd row, has to be a positive even number.
  93. %macro PUT_NO_RND_PIXELS16_l2 0
  94. cglobal put_no_rnd_pixels16_l2, 6,6
  95. movsxdifnidn r3, r3d ; widen the int dstStride for pointer arithmetic (x86inc helper)
  96. movsxdifnidn r4, r4d ; widen the int src1Stride for pointer arithmetic (x86inc helper)
  97. pcmpeqb m6, m6 ; m6 = all ones; XOR with it is a bitwise NOT
  98. test r5d, 1
  99. je .loop ; h is even: skip the single-row prologue
  ; Odd h: handle one 16-byte row up front.
  100. mova m0, [r1]
  101. mova m1, [r1+8]
  102. mova m2, [r2]
  103. mova m3, [r2+8]
  104. pxor m0, m6
  105. pxor m1, m6
  106. pxor m2, m6
  107. pxor m3, m6
  108. PAVGB m0, m2
  109. PAVGB m1, m3
  110. pxor m0, m6
  111. pxor m1, m6
  112. add r1, r4
  113. add r2, 16
  114. mova [r0], m0
  115. mova [r0+8], m1
  116. add r0, r3
  117. dec r5d
  118. .loop:
  ; First row of this pass: left half in m0/m2, right half in m1/m3.
  119. mova m0, [r1]
  120. mova m1, [r1+8]
  121. add r1, r4
  122. mova m2, [r2]
  123. mova m3, [r2+8]
  124. pxor m0, m6
  125. pxor m1, m6
  126. pxor m2, m6
  127. pxor m3, m6
  128. PAVGB m0, m2
  129. PAVGB m1, m3
  130. pxor m0, m6
  131. pxor m1, m6
  132. mova [r0], m0
  133. mova [r0+8], m1
  134. add r0, r3
  ; Second row of this pass: src2 read at offsets 16 and 24.
  135. mova m0, [r1]
  136. mova m1, [r1+8]
  137. add r1, r4
  138. mova m2, [r2+16]
  139. mova m3, [r2+24]
  140. pxor m0, m6
  141. pxor m1, m6
  142. pxor m2, m6
  143. pxor m3, m6
  144. PAVGB m0, m2
  145. PAVGB m1, m3
  146. pxor m0, m6
  147. pxor m1, m6
  148. mova [r0], m0
  149. mova [r0+8], m1
  150. add r0, r3
  151. add r2, 32 ; two packed 16-byte src2 rows consumed
  152. sub r5d, 2 ; two rows written in this pass
  153. jne .loop
  154. REP_RET
  155. %endmacro
  ; Expand PUT_NO_RND_PIXELS16_l2 twice: once for mmxext and once for 3dnow.
  156. INIT_MMX mmxext
  157. PUT_NO_RND_PIXELS16_l2
  158. INIT_MMX 3dnow
  159. PUT_NO_RND_PIXELS16_l2
  ; MPEG4_QPEL16_H_LOWPASS <prefix>
  ; Emits <prefix>_mpeg4_qpel16_h_lowpass: a horizontal low-pass filter over rows of 16 pixels.
  ; Register use visible in the body:
  ;   r0 = destination pointer, r1 = source pointer, r2 = step added to r0 after each row,
  ;   r3 = step added to r1 after each row, r4d = number of rows (loop ends at exactly zero).
  ; Every output word is formed as 20*a - 6*b + 3*c - d, where a..d are each the sum of two
  ; unpacked source pixels, then [PW_ROUND] is added and the word is arithmetic-shifted right
  ; by 5. Words are packed to bytes with unsigned saturation and handed to OP_MOV.
  ; Per row the code reads 17 source bytes (8 bytes each at r1, r1+5 and r1+9) and never reads
  ; below r1; pshufw supplies the re-used pixels needed at both ends of the row.
  ; PW_ROUND and OP_MOV are not defined here; they must be %define'd before each expansion.
  ; [rsp+8] is used as an 8-byte spill slot (the cglobal line carries a trailing 16,
  ; presumably the stack size requested from x86inc -- confirm).
  160. %macro MPEG4_QPEL16_H_LOWPASS 1
  161. cglobal %1_mpeg4_qpel16_h_lowpass, 5, 5, 0, 16
  162. movsxdifnidn r2, r2d
  163. movsxdifnidn r3, r3d
  164. pxor m7, m7 ; m7 = 0, the high bytes when unpacking pixels to words
  165. .loop:
  ; ---- output words 0-3 (result in m0, spilled to the stack) ----
  166. mova m0, [r1]
  167. mova m1, m0
  168. mova m2, m0
  169. punpcklbw m0, m7 ; m0 = pixels 0..3 as words
  170. punpckhbw m1, m7 ; m1 = pixels 4..7 as words
  171. pshufw m5, m0, 0x90 ; m5 = words {0,0,1,2} of m0
  172. pshufw m6, m0, 0x41 ; m6 = words {1,0,0,1} of m0
  173. mova m3, m2
  174. mova m4, m2
  175. psllq m2, 8
  176. psllq m3, 16
  177. psllq m4, 24
  178. punpckhbw m2, m7 ; m2 = pixels 3..6 as words
  179. punpckhbw m3, m7 ; m3 = pixels 2..5 as words
  180. punpckhbw m4, m7 ; m4 = pixels 1..4 as words
  181. paddw m5, m3
  182. paddw m6, m2
  183. paddw m5, m5
  184. psubw m6, m5
  185. pshufw m5, m0, 6 ; m5 = words {2,1,0,0} of m0
  186. pmullw m6, [pw_3]
  187. paddw m0, m4
  188. paddw m5, m1
  189. pmullw m0, [pw_20]
  190. psubw m0, m5
  191. paddw m6, [PW_ROUND]
  192. paddw m0, m6
  193. psraw m0, 5
  194. mova [rsp+8], m0 ; spill words 0-3; m0 is reused below
  ; ---- output words 4-7 (result in m3); m1..m4 still hold terms from the section above ----
  195. mova m0, [r1+5]
  196. mova m5, m0
  197. mova m6, m0
  198. psrlq m0, 8
  199. psrlq m5, 16
  200. punpcklbw m0, m7 ; m0 = pixels 6..9 as words
  201. punpcklbw m5, m7 ; m5 = pixels 7..10 as words
  202. paddw m2, m0
  203. paddw m3, m5
  204. paddw m2, m2
  205. psubw m3, m2
  206. mova m2, m6
  207. psrlq m6, 24
  208. punpcklbw m2, m7 ; m2 = pixels 5..8 as words
  209. punpcklbw m6, m7 ; m6 = pixels 8..11 as words
  210. pmullw m3, [pw_3]
  211. paddw m1, m2
  212. paddw m4, m6
  213. pmullw m1, [pw_20]
  214. psubw m3, m4
  215. paddw m1, [PW_ROUND]
  216. paddw m3, m1
  217. psraw m3, 5
  218. mova m1, [rsp+8] ; reload words 0-3
  219. packuswb m1, m3 ; m1 = output bytes 0..7
  220. OP_MOV [r0], m1, m4 ; m4 is free here and serves as OP_MOV's scratch register
  ; ---- output words 8-11 (m0) and 12-15 (m4) ----
  221. mova m1, [r1+9]
  222. mova m4, m1
  223. mova m3, m1
  224. psrlq m1, 8
  225. psrlq m4, 16
  226. punpcklbw m1, m7 ; m1 = pixels 10..13 as words
  227. punpcklbw m4, m7 ; m4 = pixels 11..14 as words
  228. paddw m5, m1
  229. paddw m0, m4
  230. paddw m5, m5
  231. psubw m0, m5
  232. mova m5, m3
  233. psrlq m3, 24
  234. pmullw m0, [pw_3]
  235. punpcklbw m3, m7 ; m3 = pixels 12..15 as words
  236. paddw m2, m3
  237. psubw m0, m2
  238. mova m2, m5
  239. punpcklbw m2, m7 ; m2 = pixels 9..12 as words
  240. punpckhbw m5, m7 ; m5 = pixels 13..16 as words
  241. paddw m6, m2
  242. pmullw m6, [pw_20]
  243. paddw m0, [PW_ROUND]
  244. paddw m0, m6
  245. psraw m0, 5
  246. paddw m3, m5
  247. pshufw m6, m5, 0xf9 ; m6 = words {1,2,3,3} of m5
  248. paddw m6, m4
  249. pshufw m4, m5, 0xbe ; m4 = words {2,3,3,2} of m5
  250. pshufw m5, m5, 0x6f ; m5 = words {3,3,2,1} of m5
  251. paddw m4, m1
  252. paddw m5, m2
  253. paddw m6, m6
  254. psubw m4, m6
  255. pmullw m3, [pw_20]
  256. pmullw m4, [pw_3]
  257. psubw m3, m5
  258. paddw m4, [PW_ROUND]
  259. paddw m4, m3
  260. psraw m4, 5
  261. packuswb m0, m4 ; m0 = output bytes 8..15
  262. OP_MOV [r0+8], m0, m4 ; m4 was already consumed by the pack, so it can be scratch
  263. add r1, r3 ; next source row
  264. add r0, r2 ; next destination row
  265. dec r4d
  266. jne .loop
  267. REP_RET
  268. %endmacro
  ; PUT_OP dst, src [, scratch]
  ; Plain store of a full register: copies %2 to %1 with mova.
  ; The optional third argument is accepted for call-site compatibility with AVG_OP and ignored.
  269. %macro PUT_OP 2-3
  270. mova %1, %2
  271. %endmacro
  ; AVG_OP dst, src, scratch
  ; Averaging store of a full register: loads the current contents of %1 into %3,
  ; averages %2 with it using pavgb (result left in %2), and writes the result back to %1.
  ; Clobbers %2 and %3. Although declared as 2-3 parameters, the body always uses %3.
  272. %macro AVG_OP 2-3
  273. mova %3, %1
  274. pavgb %2, %3
  275. mova %1, %2
  276. %endmacro
  ; Expand MPEG4_QPEL16_H_LOWPASS three times for mmxext. Each expansion picks up the
  ; PW_ROUND and OP_MOV definitions that are current at that point:
  ;   put        -> rounding constant pw_16, plain store (PUT_OP)
  ;   avg        -> rounding constant pw_16, averaging store (AVG_OP)
  ;   put_no_rnd -> rounding constant pw_15, plain store (PUT_OP)
  277. INIT_MMX mmxext
  278. %define PW_ROUND pw_16
  279. %define OP_MOV PUT_OP
  280. MPEG4_QPEL16_H_LOWPASS put
  281. %define PW_ROUND pw_16
  282. %define OP_MOV AVG_OP
  283. MPEG4_QPEL16_H_LOWPASS avg
  284. %define PW_ROUND pw_15
  285. %define OP_MOV PUT_OP
  286. MPEG4_QPEL16_H_LOWPASS put_no_rnd
  ; MPEG4_QPEL8_H_LOWPASS <prefix>
  ; Emits <prefix>_mpeg4_qpel8_h_lowpass: a horizontal low-pass filter over rows of 8 pixels.
  ; Register use visible in the body:
  ;   r0 = destination pointer, r1 = source pointer, r2 = step added to r0 after each row,
  ;   r3 = step added to r1 after each row, r4d = number of rows (loop ends at exactly zero).
  ; Every output word is formed as 20*a - 6*b + 3*c - d, where a..d are each the sum of two
  ; unpacked source pixels, then [PW_ROUND] is added and the word is arithmetic-shifted right
  ; by 5. The 8 words are packed to bytes with unsigned saturation and handed to OP_MOV.
  ; PW_ROUND and OP_MOV are not defined here; they must be %define'd before each expansion.
  ; NOTE(review): the cglobal line carries a trailing 8 (presumably a stack size), but this
  ; body never addresses rsp -- confirm whether the reservation is needed.
  287. %macro MPEG4_QPEL8_H_LOWPASS 1
  288. cglobal %1_mpeg4_qpel8_h_lowpass, 5, 5, 0, 8
  289. movsxdifnidn r2, r2d
  290. movsxdifnidn r3, r3d
  291. pxor m7, m7 ; m7 = 0, the high bytes when unpacking pixels to words
  292. .loop:
  ; ---- output words 0-3 (result in m0) ----
  293. mova m0, [r1]
  294. mova m1, m0
  295. mova m2, m0
  296. punpcklbw m0, m7 ; m0 = pixels 0..3 as words
  297. punpckhbw m1, m7 ; m1 = pixels 4..7 as words
  298. pshufw m5, m0, 0x90 ; m5 = words {0,0,1,2} of m0
  299. pshufw m6, m0, 0x41 ; m6 = words {1,0,0,1} of m0
  300. mova m3, m2
  301. mova m4, m2
  302. psllq m2, 8
  303. psllq m3, 16
  304. psllq m4, 24
  305. punpckhbw m2, m7 ; m2 = pixels 3..6 as words
  306. punpckhbw m3, m7 ; m3 = pixels 2..5 as words
  307. punpckhbw m4, m7 ; m4 = pixels 1..4 as words
  308. paddw m5, m3
  309. paddw m6, m2
  310. paddw m5, m5
  311. psubw m6, m5
  312. pshufw m5, m0, 0x6 ; m5 = words {2,1,0,0} of m0
  313. pmullw m6, [pw_3]
  314. paddw m0, m4
  315. paddw m5, m1
  316. pmullw m0, [pw_20]
  317. psubw m0, m5
  318. paddw m6, [PW_ROUND]
  319. paddw m0, m6
  320. psraw m0, 5
  ; ---- output words 4-7 (result in m3); m1..m4 still hold terms from the section above ----
  ; NOTE(review): movh is an x86inc alias; for MMX it presumably loads 4 bytes (movd) -- confirm.
  321. movh m5, [r1+5]
  322. punpcklbw m5, m7 ; m5 = pixels 5..8 as words
  323. pshufw m6, m5, 0xf9 ; m6 = words {1,2,3,3} of m5
  324. paddw m1, m5
  325. paddw m2, m6
  326. pshufw m6, m5, 0xbe ; m6 = words {2,3,3,2} of m5
  327. pshufw m5, m5, 0x6f ; m5 = words {3,3,2,1} of m5
  328. paddw m3, m6
  329. paddw m4, m5
  330. paddw m2, m2
  331. psubw m3, m2
  332. pmullw m1, [pw_20]
  333. pmullw m3, [pw_3]
  334. psubw m3, m4
  335. paddw m1, [PW_ROUND]
  336. paddw m3, m1
  337. psraw m3, 5
  338. packuswb m0, m3 ; m0 = output bytes 0..7
  339. OP_MOV [r0], m0, m4 ; m4 is no longer needed and serves as OP_MOV's scratch register
  340. add r1, r3 ; next source row
  341. add r0, r2 ; next destination row
  342. dec r4d
  343. jne .loop
  344. REP_RET
  345. %endmacro
  ; Expand MPEG4_QPEL8_H_LOWPASS three times for mmxext. Each expansion picks up the
  ; PW_ROUND and OP_MOV definitions that are current at that point:
  ;   put        -> rounding constant pw_16, plain store (PUT_OP)
  ;   avg        -> rounding constant pw_16, averaging store (AVG_OP)
  ;   put_no_rnd -> rounding constant pw_15, plain store (PUT_OP)
  346. INIT_MMX mmxext
  347. %define PW_ROUND pw_16
  348. %define OP_MOV PUT_OP
  349. MPEG4_QPEL8_H_LOWPASS put
  350. %define PW_ROUND pw_16
  351. %define OP_MOV AVG_OP
  352. MPEG4_QPEL8_H_LOWPASS avg
  353. %define PW_ROUND pw_15
  354. %define OP_MOV PUT_OP
  355. MPEG4_QPEL8_H_LOWPASS put_no_rnd
  ; QPEL_V_LOW %1, %2, %3, %4, %5
  ; One step of the vertical filter on four 16-bit lanes.
  ;   m0..m3 : four rows already in registers (the callers in this file load consecutive rows)
  ;   %1..%4 : memory operands holding the other four rows that take part in this step
  ;   %5     : destination operand handed to OP_MOV
  ; Computes (20*(m0+m1) - 6*(m2+%3) + 3*(m3+%2) - (%1+%4) + [PW_ROUND]) >> 5 per lane,
  ; packs the words to bytes with unsigned saturation and passes the result to OP_MOV.
  ; Clobbers m4, m5 and m6; m0 is overwritten with %4; m7 is given to OP_MOV as scratch.
  ; PW_ROUND and OP_MOV must be %define'd before the macro is expanded.
  356. %macro QPEL_V_LOW 5
  357. paddw m0, m1
  358. mova m4, [pw_20]
  359. pmullw m4, m0 ; m4 = 20*(m0+m1)
  360. mova m0, %4 ; m0 now holds the row named by %4
  361. mova m5, %1
  362. paddw m5, m0
  363. psubw m4, m5 ; m4 -= (%1 + %4)
  364. mova m5, %2
  365. mova m6, %3
  366. paddw m5, m3
  367. paddw m6, m2
  368. paddw m6, m6
  369. psubw m5, m6
  370. pmullw m5, [pw_3] ; m5 = 3*(%2+m3) - 6*(%3+m2)
  371. paddw m4, [PW_ROUND]
  372. paddw m5, m4
  373. psraw m5, 5
  374. packuswb m5, m5 ; the 4 result bytes sit in the low half of m5
  375. OP_MOV %5, m5, m7
  ; NOTE(review): SWAP is an x86inc assembly-time register rename; presumably it rotates
  ; m0..m3 so that the row just loaded from %4 becomes m3 for the next step -- confirm.
  376. SWAP 0,1,2,3
  377. %endmacro
  ; MPEG4_QPEL16_V_LOWPASS <prefix>
  ; Emits <prefix>_mpeg4_qpel16_v_lowpass: the vertical low-pass filter for a 16x16 block.
  ; Register use visible in the body:
  ;   r0 = destination pointer, r1 = source pointer (later reused as a rewind offset),
  ;   r2 = destination row step, r3 = source row step, r4d = loop counter,
  ;   r5 = cursor into the scratch buffer that starts at rsp.
  ; Pass 1 (.looph): 17 source rows of 16 bytes are widened to 16-bit words and stored as four
  ;   strips, one per group of 4 columns. A strip holds 17 rows * 8 bytes = 0x88 bytes, so the
  ;   four strips fill 4 * 0x88 = 544 bytes, the figure given on the cglobal line.
  ; Pass 2 (.loopv): for each strip, 16 QPEL_V_LOW steps write 16 output rows of 4 pixels each.
  ;   Where a step would need a row outside the 17 stored ones, the operand lists below name
  ;   an in-range row again (offsets 0x0/0x8 at the top, 0x80/0x78/0x70 at the bottom).
  ; PW_ROUND and OP_MOV must be %define'd before the macro is expanded.
  378. %macro MPEG4_QPEL16_V_LOWPASS 1
  379. cglobal %1_mpeg4_qpel16_v_lowpass, 4, 6, 0, 544
  380. movsxdifnidn r2, r2d
  381. movsxdifnidn r3, r3d
  382. mov r4d, 17 ; number of source rows to unpack
  383. mov r5, rsp
  384. pxor m7, m7 ; m7 = 0, the high bytes when unpacking pixels to words
  385. .looph:
  386. mova m0, [r1]
  387. mova m1, [r1]
  388. mova m2, [r1+8]
  389. mova m3, [r1+8]
  390. punpcklbw m0, m7 ; columns 0..3
  391. punpckhbw m1, m7 ; columns 4..7
  392. punpcklbw m2, m7 ; columns 8..11
  393. punpckhbw m3, m7 ; columns 12..15
  394. mova [r5], m0 ; strip 0
  395. mova [r5+0x88], m1 ; strip 1
  396. mova [r5+0x110], m2 ; strip 2
  397. mova [r5+0x198], m3 ; strip 3
  398. add r5, 8
  399. add r1, r3
  400. dec r4d
  401. jne .looph
  402. ; NOTE: r1 CHANGES VALUES: r1 -> 4 - 14*dstStride
  ; r1 no longer points at the source. It becomes the offset that, after one strip, moves r0
  ; back up the 14 row steps it advanced and 4 bytes to the right.
  403. mov r4d, 4 ; four strips of 4 columns
  404. mov r1, 4
  405. neg r2
  406. lea r1, [r1+r2*8]
  407. lea r1, [r1+r2*4]
  408. lea r1, [r1+r2*2]
  409. neg r2 ; restore the positive row step
  410. mov r5, rsp
  411. .loopv:
  412. pxor m7, m7
  413. mova m0, [r5+ 0x0]
  414. mova m1, [r5+ 0x8]
  415. mova m2, [r5+0x10]
  416. mova m3, [r5+0x18]
  ; Two output rows per pair of steps, then r0 moves down two rows.
  417. QPEL_V_LOW [r5+0x10], [r5+ 0x8], [r5+ 0x0], [r5+0x20], [r0]
  418. QPEL_V_LOW [r5+ 0x8], [r5+ 0x0], [r5+ 0x0], [r5+0x28], [r0+r2]
  419. lea r0, [r0+r2*2]
  420. QPEL_V_LOW [r5+ 0x0], [r5+ 0x0], [r5+ 0x8], [r5+0x30], [r0]
  421. QPEL_V_LOW [r5+ 0x0], [r5+ 0x8], [r5+0x10], [r5+0x38], [r0+r2]
  422. lea r0, [r0+r2*2]
  423. QPEL_V_LOW [r5+ 0x8], [r5+0x10], [r5+0x18], [r5+0x40], [r0]
  424. QPEL_V_LOW [r5+0x10], [r5+0x18], [r5+0x20], [r5+0x48], [r0+r2]
  425. lea r0, [r0+r2*2]
  426. QPEL_V_LOW [r5+0x18], [r5+0x20], [r5+0x28], [r5+0x50], [r0]
  427. QPEL_V_LOW [r5+0x20], [r5+0x28], [r5+0x30], [r5+0x58], [r0+r2]
  428. lea r0, [r0+r2*2]
  429. QPEL_V_LOW [r5+0x28], [r5+0x30], [r5+0x38], [r5+0x60], [r0]
  430. QPEL_V_LOW [r5+0x30], [r5+0x38], [r5+0x40], [r5+0x68], [r0+r2]
  431. lea r0, [r0+r2*2]
  432. QPEL_V_LOW [r5+0x38], [r5+0x40], [r5+0x48], [r5+0x70], [r0]
  433. QPEL_V_LOW [r5+0x40], [r5+0x48], [r5+0x50], [r5+0x78], [r0+r2]
  434. lea r0, [r0+r2*2]
  435. QPEL_V_LOW [r5+0x48], [r5+0x50], [r5+0x58], [r5+0x80], [r0]
  436. QPEL_V_LOW [r5+0x50], [r5+0x58], [r5+0x60], [r5+0x80], [r0+r2]
  437. lea r0, [r0+r2*2]
  438. QPEL_V_LOW [r5+0x58], [r5+0x60], [r5+0x68], [r5+0x78], [r0]
  439. QPEL_V_LOW [r5+0x60], [r5+0x68], [r5+0x70], [r5+0x70], [r0+r2]
  440. add r5, 0x88 ; next strip in the scratch buffer
  441. add r0, r1 ; back to the first row, 4 pixels to the right
  442. dec r4d
  443. jne .loopv
  444. REP_RET
  445. %endmacro
  ; PUT_OPH dst, src [, scratch]
  ; Plain store using movh: copies %2 to %1.
  ; The optional third argument is accepted for call-site compatibility with AVG_OPH and ignored.
  ; NOTE(review): movh is an x86inc alias; for MMX it presumably moves 4 bytes (movd) -- confirm.
  446. %macro PUT_OPH 2-3
  447. movh %1, %2
  448. %endmacro
  ; AVG_OPH dst, src, scratch
  ; Averaging store using movh: loads the current contents of %1 into %3, averages %2 with it
  ; using pavgb (result left in %2), and writes the result back to %1 with movh.
  ; Clobbers %2 and %3. Although declared as 2-3 parameters, the body always uses %3.
  ; NOTE(review): movh is an x86inc alias; for MMX it presumably moves 4 bytes (movd) -- confirm.
  449. %macro AVG_OPH 2-3
  450. movh %3, %1
  451. pavgb %2, %3
  452. movh %1, %2
  453. %endmacro
  ; Expand MPEG4_QPEL16_V_LOWPASS three times for mmxext. Each expansion picks up the
  ; PW_ROUND and OP_MOV definitions that are current at that point:
  ;   put        -> rounding constant pw_16, plain movh store (PUT_OPH)
  ;   avg        -> rounding constant pw_16, averaging movh store (AVG_OPH)
  ;   put_no_rnd -> rounding constant pw_15, plain movh store (PUT_OPH)
  454. INIT_MMX mmxext
  455. %define PW_ROUND pw_16
  456. %define OP_MOV PUT_OPH
  457. MPEG4_QPEL16_V_LOWPASS put
  458. %define PW_ROUND pw_16
  459. %define OP_MOV AVG_OPH
  460. MPEG4_QPEL16_V_LOWPASS avg
  461. %define PW_ROUND pw_15
  462. %define OP_MOV PUT_OPH
  463. MPEG4_QPEL16_V_LOWPASS put_no_rnd
  ; MPEG4_QPEL8_V_LOWPASS <prefix>
  ; Emits <prefix>_mpeg4_qpel8_v_lowpass: the vertical low-pass filter for an 8x8 block.
  ; Register use visible in the body:
  ;   r0 = destination pointer, r1 = source pointer (later reused as a rewind offset),
  ;   r2 = destination row step, r3 = source row step, r4d = loop counter,
  ;   r5 = cursor into the scratch buffer that starts at rsp.
  ; Pass 1 (.looph): 9 source rows of 8 bytes are widened to 16-bit words and stored as two
  ;   strips, one per group of 4 columns. A strip holds 9 rows * 8 bytes = 0x48 bytes.
  ; Pass 2 (.loopv): for each strip, 8 QPEL_V_LOW steps write 8 output rows of 4 pixels each.
  ;   Where a step would need a row outside the 9 stored ones, the operand lists below name
  ;   an in-range row again (offsets 0x0/0x8 at the top, 0x40/0x38/0x30 at the bottom).
  ; PW_ROUND and OP_MOV must be %define'd before the macro is expanded.
  ; NOTE(review): the cglobal line gives 288, but the code below only addresses
  ; 2 * 0x48 = 144 bytes from rsp -- confirm whether the larger figure is intentional.
  464. %macro MPEG4_QPEL8_V_LOWPASS 1
  465. cglobal %1_mpeg4_qpel8_v_lowpass, 4, 6, 0, 288
  466. movsxdifnidn r2, r2d
  467. movsxdifnidn r3, r3d
  468. mov r4d, 9 ; number of source rows to unpack
  469. mov r5, rsp
  470. pxor m7, m7 ; m7 = 0, the high bytes when unpacking pixels to words
  471. .looph:
  472. mova m0, [r1]
  473. mova m1, [r1]
  474. punpcklbw m0, m7 ; columns 0..3
  475. punpckhbw m1, m7 ; columns 4..7
  476. mova [r5], m0 ; strip 0
  477. mova [r5+0x48], m1 ; strip 1
  478. add r5, 8
  479. add r1, r3
  480. dec r4d
  481. jne .looph
  482. ; NOTE: r1 CHANGES VALUES: r1 -> 4 - 6*dstStride
  ; r1 no longer points at the source. It becomes the offset that, after one strip, moves r0
  ; back up the 6 row steps it advanced and 4 bytes to the right.
  483. mov r4d, 2 ; two strips of 4 columns
  484. mov r1, 4
  485. neg r2
  486. lea r1, [r1+r2*4]
  487. lea r1, [r1+r2*2]
  488. neg r2 ; restore the positive row step
  489. mov r5, rsp
  490. .loopv:
  491. pxor m7, m7
  492. mova m0, [r5+ 0x0]
  493. mova m1, [r5+ 0x8]
  494. mova m2, [r5+0x10]
  495. mova m3, [r5+0x18]
  ; Two output rows per pair of steps, then r0 moves down two rows.
  496. QPEL_V_LOW [r5+0x10], [r5+ 0x8], [r5+ 0x0], [r5+0x20], [r0]
  497. QPEL_V_LOW [r5+ 0x8], [r5+ 0x0], [r5+ 0x0], [r5+0x28], [r0+r2]
  498. lea r0, [r0+r2*2]
  499. QPEL_V_LOW [r5+ 0x0], [r5+ 0x0], [r5+ 0x8], [r5+0x30], [r0]
  500. QPEL_V_LOW [r5+ 0x0], [r5+ 0x8], [r5+0x10], [r5+0x38], [r0+r2]
  501. lea r0, [r0+r2*2]
  502. QPEL_V_LOW [r5+ 0x8], [r5+0x10], [r5+0x18], [r5+0x40], [r0]
  503. QPEL_V_LOW [r5+0x10], [r5+0x18], [r5+0x20], [r5+0x40], [r0+r2]
  504. lea r0, [r0+r2*2]
  505. QPEL_V_LOW [r5+0x18], [r5+0x20], [r5+0x28], [r5+0x38], [r0]
  506. QPEL_V_LOW [r5+0x20], [r5+0x28], [r5+0x30], [r5+0x30], [r0+r2]
  507. add r5, 0x48 ; next strip in the scratch buffer
  508. add r0, r1 ; back to the first row, 4 pixels to the right
  509. dec r4d
  510. jne .loopv
  511. REP_RET
  512. %endmacro
  ; Expand MPEG4_QPEL8_V_LOWPASS three times for mmxext. Each expansion picks up the
  ; PW_ROUND and OP_MOV definitions that are current at that point:
  ;   put        -> rounding constant pw_16, plain movh store (PUT_OPH)
  ;   avg        -> rounding constant pw_16, averaging movh store (AVG_OPH)
  ;   put_no_rnd -> rounding constant pw_15, plain movh store (PUT_OPH)
  513. INIT_MMX mmxext
  514. %define PW_ROUND pw_16
  515. %define OP_MOV PUT_OPH
  516. MPEG4_QPEL8_V_LOWPASS put
  517. %define PW_ROUND pw_16
  518. %define OP_MOV AVG_OPH
  519. MPEG4_QPEL8_V_LOWPASS avg
  520. %define PW_ROUND pw_15
  521. %define OP_MOV PUT_OPH
  522. MPEG4_QPEL8_V_LOWPASS put_no_rnd