You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

561 lines
14KB

  1. ;******************************************************************************
  2. ;* mpeg4 qpel
  3. ;* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  4. ;* Copyright (c) 2008 Loren Merritt
  5. ;* Copyright (c) 2013 Daniel Kang
  6. ;*
  7. ;* This file is part of FFmpeg.
  8. ;*
  9. ;* FFmpeg is free software; you can redistribute it and/or
  10. ;* modify it under the terms of the GNU Lesser General Public
  11. ;* License as published by the Free Software Foundation; either
  12. ;* version 2.1 of the License, or (at your option) any later version.
  13. ;*
  14. ;* FFmpeg is distributed in the hope that it will be useful,
  15. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  17. ;* Lesser General Public License for more details.
  18. ;*
  19. ;* You should have received a copy of the GNU Lesser General Public
  20. ;* License along with FFmpeg; if not, write to the Free Software
  21. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22. ;******************************************************************************
  23. %include "libavutil/x86/x86util.asm"
  24. SECTION_RODATA
  25. cextern pb_1
  26. cextern pw_3
  27. cextern pw_15
  28. cextern pw_16
  29. cextern pw_20
  30. SECTION_TEXT
  31. ; put_no_rnd_pixels8_l2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  32. %macro PUT_NO_RND_PIXELS8_L2 0
  33. cglobal put_no_rnd_pixels8_l2, 6,6
  34. movsxdifnidn r4, r4d
  35. movsxdifnidn r3, r3d
  36. pcmpeqb m6, m6
  37. test r5d, 1
  38. je .loop
  39. mova m0, [r1]
  40. mova m1, [r2]
  41. add r1, r4
  42. add r2, 8
  43. pxor m0, m6
  44. pxor m1, m6
  45. PAVGB m0, m1
  46. pxor m0, m6
  47. mova [r0], m0
  48. add r0, r3
  49. dec r5d
  50. .loop:
  51. mova m0, [r1]
  52. add r1, r4
  53. mova m1, [r1]
  54. add r1, r4
  55. mova m2, [r2]
  56. mova m3, [r2+8]
  57. pxor m0, m6
  58. pxor m1, m6
  59. pxor m2, m6
  60. pxor m3, m6
  61. PAVGB m0, m2
  62. PAVGB m1, m3
  63. pxor m0, m6
  64. pxor m1, m6
  65. mova [r0], m0
  66. add r0, r3
  67. mova [r0], m1
  68. add r0, r3
  69. mova m0, [r1]
  70. add r1, r4
  71. mova m1, [r1]
  72. add r1, r4
  73. mova m2, [r2+16]
  74. mova m3, [r2+24]
  75. pxor m0, m6
  76. pxor m1, m6
  77. pxor m2, m6
  78. pxor m3, m6
  79. PAVGB m0, m2
  80. PAVGB m1, m3
  81. pxor m0, m6
  82. pxor m1, m6
  83. mova [r0], m0
  84. add r0, r3
  85. mova [r0], m1
  86. add r0, r3
  87. add r2, 32
  88. sub r5d, 4
  89. jne .loop
  90. REP_RET
  91. %endmacro
  92. INIT_MMX mmxext
  93. PUT_NO_RND_PIXELS8_L2
  94. ; put_no_rnd_pixels16_l2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  95. %macro PUT_NO_RND_PIXELS16_l2 0
  96. cglobal put_no_rnd_pixels16_l2, 6,6
  97. movsxdifnidn r3, r3d
  98. movsxdifnidn r4, r4d
  99. pcmpeqb m6, m6
  100. test r5d, 1
  101. je .loop
  102. mova m0, [r1]
  103. mova m1, [r1+8]
  104. mova m2, [r2]
  105. mova m3, [r2+8]
  106. pxor m0, m6
  107. pxor m1, m6
  108. pxor m2, m6
  109. pxor m3, m6
  110. PAVGB m0, m2
  111. PAVGB m1, m3
  112. pxor m0, m6
  113. pxor m1, m6
  114. add r1, r4
  115. add r2, 16
  116. mova [r0], m0
  117. mova [r0+8], m1
  118. add r0, r3
  119. dec r5d
  120. .loop:
  121. mova m0, [r1]
  122. mova m1, [r1+8]
  123. add r1, r4
  124. mova m2, [r2]
  125. mova m3, [r2+8]
  126. pxor m0, m6
  127. pxor m1, m6
  128. pxor m2, m6
  129. pxor m3, m6
  130. PAVGB m0, m2
  131. PAVGB m1, m3
  132. pxor m0, m6
  133. pxor m1, m6
  134. mova [r0], m0
  135. mova [r0+8], m1
  136. add r0, r3
  137. mova m0, [r1]
  138. mova m1, [r1+8]
  139. add r1, r4
  140. mova m2, [r2+16]
  141. mova m3, [r2+24]
  142. pxor m0, m6
  143. pxor m1, m6
  144. pxor m2, m6
  145. pxor m3, m6
  146. PAVGB m0, m2
  147. PAVGB m1, m3
  148. pxor m0, m6
  149. pxor m1, m6
  150. mova [r0], m0
  151. mova [r0+8], m1
  152. add r0, r3
  153. add r2, 32
  154. sub r5d, 2
  155. jne .loop
  156. REP_RET
  157. %endmacro
  158. INIT_MMX mmxext
  159. PUT_NO_RND_PIXELS16_l2
  160. INIT_MMX 3dnow
  161. PUT_NO_RND_PIXELS16_l2
  162. %macro MPEG4_QPEL16_H_LOWPASS 1
  163. cglobal %1_mpeg4_qpel16_h_lowpass, 5, 5, 0, 16
  164. movsxdifnidn r2, r2d
  165. movsxdifnidn r3, r3d
  166. pxor m7, m7
  167. .loop:
  168. mova m0, [r1]
  169. mova m1, m0
  170. mova m2, m0
  171. punpcklbw m0, m7
  172. punpckhbw m1, m7
  173. pshufw m5, m0, 0x90
  174. pshufw m6, m0, 0x41
  175. mova m3, m2
  176. mova m4, m2
  177. psllq m2, 8
  178. psllq m3, 16
  179. psllq m4, 24
  180. punpckhbw m2, m7
  181. punpckhbw m3, m7
  182. punpckhbw m4, m7
  183. paddw m5, m3
  184. paddw m6, m2
  185. paddw m5, m5
  186. psubw m6, m5
  187. pshufw m5, m0, 6
  188. pmullw m6, [pw_3]
  189. paddw m0, m4
  190. paddw m5, m1
  191. pmullw m0, [pw_20]
  192. psubw m0, m5
  193. paddw m6, [PW_ROUND]
  194. paddw m0, m6
  195. psraw m0, 5
  196. mova [rsp+8], m0
  197. mova m0, [r1+5]
  198. mova m5, m0
  199. mova m6, m0
  200. psrlq m0, 8
  201. psrlq m5, 16
  202. punpcklbw m0, m7
  203. punpcklbw m5, m7
  204. paddw m2, m0
  205. paddw m3, m5
  206. paddw m2, m2
  207. psubw m3, m2
  208. mova m2, m6
  209. psrlq m6, 24
  210. punpcklbw m2, m7
  211. punpcklbw m6, m7
  212. pmullw m3, [pw_3]
  213. paddw m1, m2
  214. paddw m4, m6
  215. pmullw m1, [pw_20]
  216. psubw m3, m4
  217. paddw m1, [PW_ROUND]
  218. paddw m3, m1
  219. psraw m3, 5
  220. mova m1, [rsp+8]
  221. packuswb m1, m3
  222. OP_MOV [r0], m1, m4
  223. mova m1, [r1+9]
  224. mova m4, m1
  225. mova m3, m1
  226. psrlq m1, 8
  227. psrlq m4, 16
  228. punpcklbw m1, m7
  229. punpcklbw m4, m7
  230. paddw m5, m1
  231. paddw m0, m4
  232. paddw m5, m5
  233. psubw m0, m5
  234. mova m5, m3
  235. psrlq m3, 24
  236. pmullw m0, [pw_3]
  237. punpcklbw m3, m7
  238. paddw m2, m3
  239. psubw m0, m2
  240. mova m2, m5
  241. punpcklbw m2, m7
  242. punpckhbw m5, m7
  243. paddw m6, m2
  244. pmullw m6, [pw_20]
  245. paddw m0, [PW_ROUND]
  246. paddw m0, m6
  247. psraw m0, 5
  248. paddw m3, m5
  249. pshufw m6, m5, 0xf9
  250. paddw m6, m4
  251. pshufw m4, m5, 0xbe
  252. pshufw m5, m5, 0x6f
  253. paddw m4, m1
  254. paddw m5, m2
  255. paddw m6, m6
  256. psubw m4, m6
  257. pmullw m3, [pw_20]
  258. pmullw m4, [pw_3]
  259. psubw m3, m5
  260. paddw m4, [PW_ROUND]
  261. paddw m4, m3
  262. psraw m4, 5
  263. packuswb m0, m4
  264. OP_MOV [r0+8], m0, m4
  265. add r1, r3
  266. add r0, r2
  267. dec r4d
  268. jne .loop
  269. REP_RET
  270. %endmacro
  271. %macro PUT_OP 2-3
  272. mova %1, %2
  273. %endmacro
  274. %macro AVG_OP 2-3
  275. mova %3, %1
  276. pavgb %2, %3
  277. mova %1, %2
  278. %endmacro
  279. INIT_MMX mmxext
  280. %define PW_ROUND pw_16
  281. %define OP_MOV PUT_OP
  282. MPEG4_QPEL16_H_LOWPASS put
  283. %define PW_ROUND pw_16
  284. %define OP_MOV AVG_OP
  285. MPEG4_QPEL16_H_LOWPASS avg
  286. %define PW_ROUND pw_15
  287. %define OP_MOV PUT_OP
  288. MPEG4_QPEL16_H_LOWPASS put_no_rnd
  289. %macro MPEG4_QPEL8_H_LOWPASS 1
  290. cglobal %1_mpeg4_qpel8_h_lowpass, 5, 5, 0, 8
  291. movsxdifnidn r2, r2d
  292. movsxdifnidn r3, r3d
  293. pxor m7, m7
  294. .loop:
  295. mova m0, [r1]
  296. mova m1, m0
  297. mova m2, m0
  298. punpcklbw m0, m7
  299. punpckhbw m1, m7
  300. pshufw m5, m0, 0x90
  301. pshufw m6, m0, 0x41
  302. mova m3, m2
  303. mova m4, m2
  304. psllq m2, 8
  305. psllq m3, 16
  306. psllq m4, 24
  307. punpckhbw m2, m7
  308. punpckhbw m3, m7
  309. punpckhbw m4, m7
  310. paddw m5, m3
  311. paddw m6, m2
  312. paddw m5, m5
  313. psubw m6, m5
  314. pshufw m5, m0, 0x6
  315. pmullw m6, [pw_3]
  316. paddw m0, m4
  317. paddw m5, m1
  318. pmullw m0, [pw_20]
  319. psubw m0, m5
  320. paddw m6, [PW_ROUND]
  321. paddw m0, m6
  322. psraw m0, 5
  323. movh m5, [r1+5]
  324. punpcklbw m5, m7
  325. pshufw m6, m5, 0xf9
  326. paddw m1, m5
  327. paddw m2, m6
  328. pshufw m6, m5, 0xbe
  329. pshufw m5, m5, 0x6f
  330. paddw m3, m6
  331. paddw m4, m5
  332. paddw m2, m2
  333. psubw m3, m2
  334. pmullw m1, [pw_20]
  335. pmullw m3, [pw_3]
  336. psubw m3, m4
  337. paddw m1, [PW_ROUND]
  338. paddw m3, m1
  339. psraw m3, 5
  340. packuswb m0, m3
  341. OP_MOV [r0], m0, m4
  342. add r1, r3
  343. add r0, r2
  344. dec r4d
  345. jne .loop
  346. REP_RET
  347. %endmacro
  348. INIT_MMX mmxext
  349. %define PW_ROUND pw_16
  350. %define OP_MOV PUT_OP
  351. MPEG4_QPEL8_H_LOWPASS put
  352. %define PW_ROUND pw_16
  353. %define OP_MOV AVG_OP
  354. MPEG4_QPEL8_H_LOWPASS avg
  355. %define PW_ROUND pw_15
  356. %define OP_MOV PUT_OP
  357. MPEG4_QPEL8_H_LOWPASS put_no_rnd
  358. %macro QPEL_V_LOW 5
  359. paddw m0, m1
  360. mova m4, [pw_20]
  361. pmullw m4, m0
  362. mova m0, %4
  363. mova m5, %1
  364. paddw m5, m0
  365. psubw m4, m5
  366. mova m5, %2
  367. mova m6, %3
  368. paddw m5, m3
  369. paddw m6, m2
  370. paddw m6, m6
  371. psubw m5, m6
  372. pmullw m5, [pw_3]
  373. paddw m4, [PW_ROUND]
  374. paddw m5, m4
  375. psraw m5, 5
  376. packuswb m5, m5
  377. OP_MOV %5, m5, m7
  378. SWAP 0,1,2,3
  379. %endmacro
  380. %macro MPEG4_QPEL16_V_LOWPASS 1
  381. cglobal %1_mpeg4_qpel16_v_lowpass, 4, 6, 0, 544
  382. movsxdifnidn r2, r2d
  383. movsxdifnidn r3, r3d
  384. mov r4d, 17
  385. mov r5, rsp
  386. pxor m7, m7
  387. .looph:
  388. mova m0, [r1]
  389. mova m1, [r1]
  390. mova m2, [r1+8]
  391. mova m3, [r1+8]
  392. punpcklbw m0, m7
  393. punpckhbw m1, m7
  394. punpcklbw m2, m7
  395. punpckhbw m3, m7
  396. mova [r5], m0
  397. mova [r5+0x88], m1
  398. mova [r5+0x110], m2
  399. mova [r5+0x198], m3
  400. add r5, 8
  401. add r1, r3
  402. dec r4d
  403. jne .looph
  404. ; NOTE: r1 CHANGES VALUES: r1 -> 4 - 14*dstStride
  405. mov r4d, 4
  406. mov r1, 4
  407. neg r2
  408. lea r1, [r1+r2*8]
  409. lea r1, [r1+r2*4]
  410. lea r1, [r1+r2*2]
  411. neg r2
  412. mov r5, rsp
  413. .loopv:
  414. pxor m7, m7
  415. mova m0, [r5+ 0x0]
  416. mova m1, [r5+ 0x8]
  417. mova m2, [r5+0x10]
  418. mova m3, [r5+0x18]
  419. QPEL_V_LOW [r5+0x10], [r5+ 0x8], [r5+ 0x0], [r5+0x20], [r0]
  420. QPEL_V_LOW [r5+ 0x8], [r5+ 0x0], [r5+ 0x0], [r5+0x28], [r0+r2]
  421. lea r0, [r0+r2*2]
  422. QPEL_V_LOW [r5+ 0x0], [r5+ 0x0], [r5+ 0x8], [r5+0x30], [r0]
  423. QPEL_V_LOW [r5+ 0x0], [r5+ 0x8], [r5+0x10], [r5+0x38], [r0+r2]
  424. lea r0, [r0+r2*2]
  425. QPEL_V_LOW [r5+ 0x8], [r5+0x10], [r5+0x18], [r5+0x40], [r0]
  426. QPEL_V_LOW [r5+0x10], [r5+0x18], [r5+0x20], [r5+0x48], [r0+r2]
  427. lea r0, [r0+r2*2]
  428. QPEL_V_LOW [r5+0x18], [r5+0x20], [r5+0x28], [r5+0x50], [r0]
  429. QPEL_V_LOW [r5+0x20], [r5+0x28], [r5+0x30], [r5+0x58], [r0+r2]
  430. lea r0, [r0+r2*2]
  431. QPEL_V_LOW [r5+0x28], [r5+0x30], [r5+0x38], [r5+0x60], [r0]
  432. QPEL_V_LOW [r5+0x30], [r5+0x38], [r5+0x40], [r5+0x68], [r0+r2]
  433. lea r0, [r0+r2*2]
  434. QPEL_V_LOW [r5+0x38], [r5+0x40], [r5+0x48], [r5+0x70], [r0]
  435. QPEL_V_LOW [r5+0x40], [r5+0x48], [r5+0x50], [r5+0x78], [r0+r2]
  436. lea r0, [r0+r2*2]
  437. QPEL_V_LOW [r5+0x48], [r5+0x50], [r5+0x58], [r5+0x80], [r0]
  438. QPEL_V_LOW [r5+0x50], [r5+0x58], [r5+0x60], [r5+0x80], [r0+r2]
  439. lea r0, [r0+r2*2]
  440. QPEL_V_LOW [r5+0x58], [r5+0x60], [r5+0x68], [r5+0x78], [r0]
  441. QPEL_V_LOW [r5+0x60], [r5+0x68], [r5+0x70], [r5+0x70], [r0+r2]
  442. add r5, 0x88
  443. add r0, r1
  444. dec r4d
  445. jne .loopv
  446. REP_RET
  447. %endmacro
  448. %macro PUT_OPH 2-3
  449. movh %1, %2
  450. %endmacro
  451. %macro AVG_OPH 2-3
  452. movh %3, %1
  453. pavgb %2, %3
  454. movh %1, %2
  455. %endmacro
  456. INIT_MMX mmxext
  457. %define PW_ROUND pw_16
  458. %define OP_MOV PUT_OPH
  459. MPEG4_QPEL16_V_LOWPASS put
  460. %define PW_ROUND pw_16
  461. %define OP_MOV AVG_OPH
  462. MPEG4_QPEL16_V_LOWPASS avg
  463. %define PW_ROUND pw_15
  464. %define OP_MOV PUT_OPH
  465. MPEG4_QPEL16_V_LOWPASS put_no_rnd
  466. %macro MPEG4_QPEL8_V_LOWPASS 1
  467. cglobal %1_mpeg4_qpel8_v_lowpass, 4, 6, 0, 288
  468. movsxdifnidn r2, r2d
  469. movsxdifnidn r3, r3d
  470. mov r4d, 9
  471. mov r5, rsp
  472. pxor m7, m7
  473. .looph:
  474. mova m0, [r1]
  475. mova m1, [r1]
  476. punpcklbw m0, m7
  477. punpckhbw m1, m7
  478. mova [r5], m0
  479. mova [r5+0x48], m1
  480. add r5, 8
  481. add r1, r3
  482. dec r4d
  483. jne .looph
  484. ; NOTE: r1 CHANGES VALUES: r1 -> 4 - 6*dstStride
  485. mov r4d, 2
  486. mov r1, 4
  487. neg r2
  488. lea r1, [r1+r2*4]
  489. lea r1, [r1+r2*2]
  490. neg r2
  491. mov r5, rsp
  492. .loopv:
  493. pxor m7, m7
  494. mova m0, [r5+ 0x0]
  495. mova m1, [r5+ 0x8]
  496. mova m2, [r5+0x10]
  497. mova m3, [r5+0x18]
  498. QPEL_V_LOW [r5+0x10], [r5+ 0x8], [r5+ 0x0], [r5+0x20], [r0]
  499. QPEL_V_LOW [r5+ 0x8], [r5+ 0x0], [r5+ 0x0], [r5+0x28], [r0+r2]
  500. lea r0, [r0+r2*2]
  501. QPEL_V_LOW [r5+ 0x0], [r5+ 0x0], [r5+ 0x8], [r5+0x30], [r0]
  502. QPEL_V_LOW [r5+ 0x0], [r5+ 0x8], [r5+0x10], [r5+0x38], [r0+r2]
  503. lea r0, [r0+r2*2]
  504. QPEL_V_LOW [r5+ 0x8], [r5+0x10], [r5+0x18], [r5+0x40], [r0]
  505. QPEL_V_LOW [r5+0x10], [r5+0x18], [r5+0x20], [r5+0x40], [r0+r2]
  506. lea r0, [r0+r2*2]
  507. QPEL_V_LOW [r5+0x18], [r5+0x20], [r5+0x28], [r5+0x38], [r0]
  508. QPEL_V_LOW [r5+0x20], [r5+0x28], [r5+0x30], [r5+0x30], [r0+r2]
  509. add r5, 0x48
  510. add r0, r1
  511. dec r4d
  512. jne .loopv
  513. REP_RET
  514. %endmacro
  515. INIT_MMX mmxext
  516. %define PW_ROUND pw_16
  517. %define OP_MOV PUT_OPH
  518. MPEG4_QPEL8_V_LOWPASS put
  519. %define PW_ROUND pw_16
  520. %define OP_MOV AVG_OPH
  521. MPEG4_QPEL8_V_LOWPASS avg
  522. %define PW_ROUND pw_15
  523. %define OP_MOV PUT_OPH
  524. MPEG4_QPEL8_V_LOWPASS put_no_rnd