;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 qpel code
;*****************************************************************************
;* Copyright (C) 2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA 32

cextern pw_16
cextern pw_1
cextern pb_0

pw_pixel_max: times 8 dw ((1 << 10)-1)
pad10: times 8 dw 10*1023
pad20: times 8 dw 20*1023
pad30: times 8 dw 30*1023
depad: times 4 dd 32*20*1023 + 512
depad2: times 8 dw 20*1023 + 16*1022 + 16
unpad: times 8 dw 16*1022/32 ; needs to be mod 16
tap1: times 4 dw 1, -5
tap2: times 4 dw 20, 20
tap3: times 4 dw -5, 1
pd_0f: times 4 dd 0xffff
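
; Editorial note on the pad/depad constants (inferred from their use below):
; unclipped two-pass filter intermediates of 10-bit pixels reach up to
; 42*1023, which overflows int16, so the first pass subtracts pad20
; (20*1023) to re-centre them. depad = 32*20*1023 + 512 removes that bias
; after the horizontal pmaddwd pass (the tap sum is 32) and folds in the
; +512 rounding term consumed by psrad 10; depad2/unpad perform the
; equivalent recovery on the psrlw 5 path used by mc12/mc32/mc21.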
SECTION .text

%macro AVG_MOV 2
pavgw %2, %1
mova %1, %2
%endmacro

%macro ADDW 3
%if mmsize == 8
paddw %1, %2
%else
movu %3, %2
paddw %1, %3
%endif
%endmacro

%macro FILT_H 4
paddw %1, %4
psubw %1, %2 ; a-b
psraw %1, 2 ; (a-b)/4
psubw %1, %2 ; (a-b)/4-b
paddw %1, %3 ; (a-b)/4-b+c
psraw %1, 2 ; ((a-b)/4-b+c)/4
paddw %1, %3 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
%endmacro
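
; For reference, FILT_H evaluates the (1,-5,20,20,-5,1)/32 half-pel kernel
; with adds and shifts only, given pre-folded symmetric sums
; a = x[-2]+x[3], b = x[-1]+x[2], c = x[0]+x[1] and %4 = [pw_16]. A C
; sketch of the same sequence (illustrative only, not part of the build):
;
;   static int filt_h(int a, int b, int c)
;   {
;       int t = ((a + 16 - b) >> 2) - b + c; // paddw, psubw, psraw 2, psubw, paddw
;       return (t >> 2) + c;                 // psraw 2, paddw; caller does >> 1 + clip
;   }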
%macro PRELOAD_V 0
lea r3, [r2*3]
sub r1, r3
movu m0, [r1+r2]
movu m1, [r1+r2*2]
add r1, r3
movu m2, [r1]
movu m3, [r1+r2]
movu m4, [r1+r2*2]
add r1, r3
%endmacro

%macro FILT_V 8
movu %6, [r1]
paddw %1, %6
mova %7, %2
paddw %7, %5
mova %8, %3
paddw %8, %4
FILT_H %1, %7, %8, [pw_16]
psraw %1, 1
CLIPW %1, [pb_0], [pw_pixel_max]
%endmacro
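
; FILT_V folds six consecutive rows into the symmetric sums FILT_H expects
; (a = row[-2]+row[+3], b = row[-1]+row[+2], c = row[0]+row[+1]), then
; rounds, halves (psraw 1) and clips to the 10-bit range.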
%macro MC 1
%define OP_MOV mova
INIT_MMX
%1 mmxext, put, 4
INIT_XMM
%1 sse2 , put, 8
%define OP_MOV AVG_MOV
INIT_MMX
%1 mmxext, avg, 4
INIT_XMM
%1 sse2 , avg, 8
%endmacro

%macro MCAxA 8
%if ARCH_X86_64
%ifnidn %1,mmxext
MCAxA_OP %1,%2,%3,%4,%5,%6,%7,%8
%endif
%else
MCAxA_OP %1,%2,%3,%4,%5,%6,%7,%8
%endif
%endmacro

%macro MCAxA_OP 8
%if ARCH_X86_32
cglobal %2_h264_qpel%5_%3_10_%1, %6,%7,%8
call stub_%2_h264_qpel%4_%3_10_%1
mov r0, r0m
mov r1, r1m
add r0, %4*2
add r1, %4*2
call stub_%2_h264_qpel%4_%3_10_%1
mov r0, r0m
mov r1, r1m
lea r0, [r0+r2*%4]
lea r1, [r1+r2*%4]
call stub_%2_h264_qpel%4_%3_10_%1
mov r0, r0m
mov r1, r1m
lea r0, [r0+r2*%4+%4*2]
lea r1, [r1+r2*%4+%4*2]
call stub_%2_h264_qpel%4_%3_10_%1
RET
%else ; ARCH_X86_64
cglobal %2_h264_qpel%5_%3_10_%1, %6,%7 + 2,%8
mov r%7, r0
%assign p1 %7+1
mov r %+ p1, r1
call stub_%2_h264_qpel%4_%3_10_%1
lea r0, [r%7+%4*2]
lea r1, [r %+ p1+%4*2]
call stub_%2_h264_qpel%4_%3_10_%1
lea r0, [r%7+r2*%4]
lea r1, [r %+ p1+r2*%4]
call stub_%2_h264_qpel%4_%3_10_%1
lea r0, [r%7+r2*%4+%4*2]
lea r1, [r %+ p1+r2*%4+%4*2]
%if UNIX64 == 0 ; fall through to function
call stub_%2_h264_qpel%4_%3_10_%1
RET
%endif
%endif
%endmacro
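
; MCAxA_OP synthesizes the 2Nx2N function from four NxN stub calls, one per
; quadrant (+N pixels right, +N rows down, and both). On UNIX64 the fourth
; call is omitted and execution falls through into the NxN function emitted
; right after it, whose stub's ret returns straight to our caller.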
;cpu, put/avg, mc, 4/8, ...
%macro cglobal_mc 7
%assign i %4*2
MCAxA %1, %2, %3, %4, i, %5,%6,%7
cglobal %2_h264_qpel%4_%3_10_%1, %5,%6,%7
%if UNIX64 == 0 ; no prologue or epilogue for UNIX64
call stub_%2_h264_qpel%4_%3_10_%1
RET
%endif
stub_%2_h264_qpel%4_%3_10_%1:
%endmacro
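
; cglobal_mc emits the 2Nx2N wrapper (via MCAxA), the public NxN symbol,
; and the bare stub_* body shared by both. On UNIX64 the public symbol
; needs no prologue/epilogue and simply falls through into the stub;
; elsewhere it wraps the stub in a call + RET.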
;-----------------------------------------------------------------------------
; void h264_qpel_mc00(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro COPY4 0
movu m0, [r1 ]
OP_MOV [r0 ], m0
movu m0, [r1+r2 ]
OP_MOV [r0+r2 ], m0
movu m0, [r1+r2*2]
OP_MOV [r0+r2*2], m0
movu m0, [r1+r3 ]
OP_MOV [r0+r3 ], m0
%endmacro

%macro MC00 1
INIT_MMX
cglobal_mc mmxext, %1, mc00, 4, 3,4,0
lea r3, [r2*3]
COPY4
ret

INIT_XMM
cglobal %1_h264_qpel8_mc00_10_sse2, 3,4
lea r3, [r2*3]
COPY4
lea r0, [r0+r2*4]
lea r1, [r1+r2*4]
COPY4
RET

cglobal %1_h264_qpel16_mc00_10_sse2, 3,4
mov r3d, 8
.loop:
movu m0, [r1 ]
movu m1, [r1 +16]
OP_MOV [r0 ], m0
OP_MOV [r0 +16], m1
movu m0, [r1+r2 ]
movu m1, [r1+r2+16]
OP_MOV [r0+r2 ], m0
OP_MOV [r0+r2+16], m1
lea r0, [r0+r2*2]
lea r1, [r1+r2*2]
dec r3d
jg .loop
REP_RET
%endmacro

%define OP_MOV mova
MC00 put
%define OP_MOV AVG_MOV
MC00 avg

;-----------------------------------------------------------------------------
; void h264_qpel_mc20(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC_CACHE 1
%define OP_MOV mova
%define PALIGNR PALIGNR_MMX
INIT_MMX
%1 mmxext , put, 4
INIT_XMM
%1 sse2_cache64 , put, 8
%define PALIGNR PALIGNR_SSSE3
%1 ssse3_cache64, put, 8
%1 sse2 , put, 8, 0
%define OP_MOV AVG_MOV
%define PALIGNR PALIGNR_MMX
INIT_MMX
%1 mmxext , avg, 4
INIT_XMM
%1 sse2_cache64 , avg, 8
%define PALIGNR PALIGNR_SSSE3
%1 ssse3_cache64, avg, 8
%1 sse2 , avg, 8, 0
%endmacro
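
; The horizontal filters below exploit the kernel's symmetry: ADDW folds
; x[-2]+x[3], x[-1]+x[2] and x[0]+x[1] first, so FILT_H only applies the
; (1,-5,20) half of the taps. The plain sse2 variants (4 macro args) use
; unaligned loads; the mmxext/*_cache64 variants rebuild the shifted rows
; with PALIGNR instead, since movu is slow on those processors.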
%macro MC20 3-4
cglobal_mc %1, %2, mc20, %3, 3,4,9
mov r3d, %3
mova m1, [pw_pixel_max]
%if num_mmregs > 8
mova m8, [pw_16]
%define p16 m8
%else
%define p16 [pw_16]
%endif
.nextrow:
%if %0 == 4
movu m2, [r1-4]
movu m3, [r1-2]
movu m4, [r1+0]
ADDW m2, [r1+6], m5
ADDW m3, [r1+4], m5
ADDW m4, [r1+2], m5
%else ; movu is slow on these processors
%if mmsize==16
movu m2, [r1-4]
movu m0, [r1+6]
mova m6, m0
psrldq m0, 6
paddw m6, m2
PALIGNR m3, m0, m2, 2, m5
PALIGNR m7, m0, m2, 8, m5
paddw m3, m7
PALIGNR m4, m0, m2, 4, m5
PALIGNR m7, m0, m2, 6, m5
paddw m4, m7
SWAP 2, 6
%else
movu m2, [r1-4]
movu m6, [r1+4]
PALIGNR m3, m6, m2, 2, m5
paddw m3, m6
PALIGNR m4, m6, m2, 4, m5
PALIGNR m7, m6, m2, 6, m5
paddw m4, m7
paddw m2, [r1+6]
%endif
%endif
FILT_H m2, m3, m4, p16
psraw m2, 1
pxor m0, m0
CLIPW m2, m0, m1
OP_MOV [r0], m2
add r0, r2
add r1, r2
dec r3d
jg .nextrow
rep ret
%endmacro

MC_CACHE MC20

;-----------------------------------------------------------------------------
; void h264_qpel_mc30(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC30 3-4
cglobal_mc %1, %2, mc30, %3, 3,5,9
lea r4, [r1+2]
jmp stub_%2_h264_qpel%3_mc10_10_%1.body
%endmacro

MC_CACHE MC30

;-----------------------------------------------------------------------------
; void h264_qpel_mc10(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC10 3-4
cglobal_mc %1, %2, mc10, %3, 3,5,9
mov r4, r1
.body:
mov r3d, %3
mova m1, [pw_pixel_max]
%if num_mmregs > 8
mova m8, [pw_16]
%define p16 m8
%else
%define p16 [pw_16]
%endif
.nextrow:
%if %0 == 4
movu m2, [r1-4]
movu m3, [r1-2]
movu m4, [r1+0]
ADDW m2, [r1+6], m5
ADDW m3, [r1+4], m5
ADDW m4, [r1+2], m5
%else ; movu is slow on these processors
%if mmsize==16
movu m2, [r1-4]
movu m0, [r1+6]
mova m6, m0
psrldq m0, 6
paddw m6, m2
PALIGNR m3, m0, m2, 2, m5
PALIGNR m7, m0, m2, 8, m5
paddw m3, m7
PALIGNR m4, m0, m2, 4, m5
PALIGNR m7, m0, m2, 6, m5
paddw m4, m7
SWAP 2, 6
%else
movu m2, [r1-4]
movu m6, [r1+4]
PALIGNR m3, m6, m2, 2, m5
paddw m3, m6
PALIGNR m4, m6, m2, 4, m5
PALIGNR m7, m6, m2, 6, m5
paddw m4, m7
paddw m2, [r1+6]
%endif
%endif
FILT_H m2, m3, m4, p16
psraw m2, 1
pxor m0, m0
CLIPW m2, m0, m1
movu m3, [r4]
pavgw m2, m3
OP_MOV [r0], m2
add r0, r2
add r1, r2
add r4, r2
dec r3d
jg .nextrow
rep ret
%endmacro

MC_CACHE MC10

;-----------------------------------------------------------------------------
; void h264_qpel_mc02(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro V_FILT 11
v_filt%9_%10_10_%11:
add r4, r2
.no_addr4:
FILT_V m0, m1, m2, m3, m4, m5, m6, m7
add r1, r2
add r0, r2
ret
%endmacro

INIT_MMX
RESET_MM_PERMUTATION
%assign i 0
%rep 4
V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 4, i, mmxext
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep

INIT_XMM
RESET_MM_PERMUTATION
%assign i 0
%rep 6
V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 8, i, sse2
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep
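
; The v_filt helpers are generated six (mmxext: four) times with rotated
; register assignments (SWAP + RESET_MM_PERMUTATION), so the callers below
; keep the sliding six-row window entirely in registers and just call the
; variant matching the current rotation; .no_addr4 skips the r4 bookkeeping
; when no second source pointer is in use.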
%macro MC02 3
cglobal_mc %1, %2, mc02, %3, 3,4,8
PRELOAD_V
sub r0, r2
%assign j 0
%rep %3
%assign i (j % 6)
call v_filt%3_ %+ i %+ _10_%1.no_addr4
OP_MOV [r0], m0
SWAP 0,1,2,3,4,5
%assign j j+1
%endrep
ret
%endmacro

MC MC02

;-----------------------------------------------------------------------------
; void h264_qpel_mc01(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC01 3
cglobal_mc %1, %2, mc01, %3, 3,5,8
mov r4, r1
.body:
PRELOAD_V
sub r4, r2
sub r0, r2
%assign j 0
%rep %3
%assign i (j % 6)
call v_filt%3_ %+ i %+ _10_%1
movu m7, [r4]
pavgw m0, m7
OP_MOV [r0], m0
SWAP 0,1,2,3,4,5
%assign j j+1
%endrep
ret
%endmacro

MC MC01

;-----------------------------------------------------------------------------
; void h264_qpel_mc03(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC03 3
cglobal_mc %1, %2, mc03, %3, 3,5,8
lea r4, [r1+r2]
jmp stub_%2_h264_qpel%3_mc01_10_%1.body
%endmacro

MC MC03

;-----------------------------------------------------------------------------
; void h264_qpel_mc11(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro H_FILT_AVG 3-4
h_filt%2_%3_10_%1:
;FILT_H with fewer registers and averaged with the FILT_V result
;m6,m7 are tmp registers, m0 is the FILT_V result, the rest are used in the next iteration
;unfortunately I need three registers, so m5 will have to be re-read from memory
movu m5, [r4-4]
ADDW m5, [r4+6], m7
movu m6, [r4-2]
ADDW m6, [r4+4], m7
paddw m5, [pw_16]
psubw m5, m6 ; a-b
psraw m5, 2 ; (a-b)/4
psubw m5, m6 ; (a-b)/4-b
movu m6, [r4+0]
ADDW m6, [r4+2], m7
paddw m5, m6 ; (a-b)/4-b+c
psraw m5, 2 ; ((a-b)/4-b+c)/4
paddw m5, m6 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
psraw m5, 1
CLIPW m5, [pb_0], [pw_pixel_max]
;avg FILT_V, FILT_H
pavgw m0, m5
%if %0!=4
movu m5, [r1+r5]
%endif
ret
%endmacro

INIT_MMX
RESET_MM_PERMUTATION
%assign i 0
%rep 3
H_FILT_AVG mmxext, 4, i
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep
H_FILT_AVG mmxext, 4, i, 0

INIT_XMM
RESET_MM_PERMUTATION
%assign i 0
%rep 6
%if i==1
H_FILT_AVG sse2, 8, i, 0
%else
H_FILT_AVG sse2, 8, i
%endif
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep
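
; Most h_filt variants end by re-reading m5 from [r1+r5] (one row back,
; r5 = -stride) because the horizontal pass clobbered it; the variants
; generated with the extra argument skip that load, and MC11 below reloads
; it explicitly where needed.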
%macro MC11 3
; this REALLY needs x86_64
cglobal_mc %1, %2, mc11, %3, 3,6,8
mov r4, r1
.body:
PRELOAD_V
sub r0, r2
sub r4, r2
mov r5, r2
neg r5
%assign j 0
%rep %3
%assign i (j % 6)
call v_filt%3_ %+ i %+ _10_%1
call h_filt%3_ %+ i %+ _10_%1
%if %3==8 && i==1
movu m5, [r1+r5]
%endif
OP_MOV [r0], m0
SWAP 0,1,2,3,4,5
%assign j j+1
%endrep
ret
%endmacro

MC MC11

;-----------------------------------------------------------------------------
; void h264_qpel_mc31(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC31 3
cglobal_mc %1, %2, mc31, %3, 3,6,8
mov r4, r1
add r1, 2
jmp stub_%2_h264_qpel%3_mc11_10_%1.body
%endmacro

MC MC31

;-----------------------------------------------------------------------------
; void h264_qpel_mc13(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC13 3
cglobal_mc %1, %2, mc13, %3, 3,7,12
lea r4, [r1+r2]
jmp stub_%2_h264_qpel%3_mc11_10_%1.body
%endmacro

MC MC13

;-----------------------------------------------------------------------------
; void h264_qpel_mc33(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC33 3
cglobal_mc %1, %2, mc33, %3, 3,6,8
lea r4, [r1+r2]
add r1, 2
jmp stub_%2_h264_qpel%3_mc11_10_%1.body
%endmacro

MC MC33

;-----------------------------------------------------------------------------
; void h264_qpel_mc22(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro FILT_H2 3
psubw %1, %2 ; a-b
psubw %2, %3 ; b-c
psllw %2, 2
psubw %1, %2 ; a-5*b+4*c
psllw %3, 4
paddw %1, %3 ; a-5*b+20*c
%endmacro
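
; Unlike FILT_H, FILT_H2 keeps the raw a-5*b+20*c sum: no rounding and no
; shift. The two-pass mc22/mc12/mc21 paths store this full-precision
; intermediate (biased by pad20 to fit int16) and defer all scaling to the
; final pass.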
%macro FILT_VNRD 8
movu %6, [r1]
paddw %1, %6
mova %7, %2
paddw %7, %5
mova %8, %3
paddw %8, %4
FILT_H2 %1, %7, %8
%endmacro

%macro HV 2
%ifidn %1,sse2
%define PAD 12
%define COUNT 2
%else
%define PAD 4
%define COUNT 3
%endif
put_hv%2_10_%1:
neg r2 ; This actually saves instructions
lea r1, [r1+r2*2-mmsize+PAD]
lea r4, [rsp+PAD+gprsize]
mov r3d, COUNT
.v_loop:
movu m0, [r1]
sub r1, r2
movu m1, [r1]
sub r1, r2
movu m2, [r1]
sub r1, r2
movu m3, [r1]
sub r1, r2
movu m4, [r1]
sub r1, r2
%assign i 0
%rep %2-1
FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
psubw m0, [pad20]
movu [r4+i*mmsize*3], m0
sub r1, r2
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep
FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
psubw m0, [pad20]
movu [r4+i*mmsize*3], m0
add r4, mmsize
lea r1, [r1+r2*8+mmsize]
%if %2==8
lea r1, [r1+r2*4]
%endif
dec r3d
jg .v_loop
neg r2
ret
%endmacro
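
; put_hv applies the vertical filter over mmsize-byte column strips (COUNT
; strips per block), writing pad20-biased int16 intermediates to the
; scratch buffer at a row stride of mmsize*3 bytes, the layout the
; h*_loop_op pass below consumes. Negating r2 up front lets every row step
; be a single sub r1, r2.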
INIT_MMX
HV mmxext, 4
INIT_XMM
HV sse2 , 8

%macro H_LOOP 2
%if num_mmregs > 8
%define s1 m8
%define s2 m9
%define s3 m10
%define d1 m11
%else
%define s1 [tap1]
%define s2 [tap2]
%define s3 [tap3]
%define d1 [depad]
%endif
h%2_loop_op_%1:
movu m1, [r1+mmsize-4]
movu m2, [r1+mmsize-2]
mova m3, [r1+mmsize+0]
movu m4, [r1+mmsize+2]
movu m5, [r1+mmsize+4]
movu m6, [r1+mmsize+6]
%if num_mmregs > 8
pmaddwd m1, s1
pmaddwd m2, s1
pmaddwd m3, s2
pmaddwd m4, s2
pmaddwd m5, s3
pmaddwd m6, s3
paddd m1, d1
paddd m2, d1
%else
mova m0, s1
pmaddwd m1, m0
pmaddwd m2, m0
mova m0, s2
pmaddwd m3, m0
pmaddwd m4, m0
mova m0, s3
pmaddwd m5, m0
pmaddwd m6, m0
mova m0, d1
paddd m1, m0
paddd m2, m0
%endif
paddd m3, m5
paddd m4, m6
paddd m1, m3
paddd m2, m4
psrad m1, 10
psrad m2, 10
pslld m2, 16
pand m1, [pd_0f]
por m1, m2
%if num_mmregs <= 8
pxor m0, m0
%endif
CLIPW m1, m0, m7
add r1, mmsize*3
ret
%endmacro
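
; h*_loop_op is the horizontal pass over the biased intermediates: pmaddwd
; against the paired taps (1,-5), (20,20), (-5,1) at staggered offsets,
; depad to undo the pad20 bias and add the +512 rounding term, then psrad
; 10 for the combined 32*32 filter scale. pslld/pand/por re-interleave the
; two dword vectors into one vector of words (even results from m1, odd
; from m2) before clipping against m7 = pw_pixel_max.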
INIT_MMX
H_LOOP mmxext, 4
INIT_XMM
H_LOOP sse2 , 8

%macro MC22 3
cglobal_mc %1, %2, mc22, %3, 3,7,12
%define PAD mmsize*8*4*2 ; SIZE*16*4*sizeof(pixel)
mov r6, rsp ; backup stack pointer
and rsp, ~(mmsize-1) ; align stack
sub rsp, PAD
call put_hv%3_10_%1
mov r3d, %3
mova m7, [pw_pixel_max]
%if num_mmregs > 8
pxor m0, m0
mova m8, [tap1]
mova m9, [tap2]
mova m10, [tap3]
mova m11, [depad]
%endif
mov r1, rsp
.h_loop:
call h%3_loop_op_%1
OP_MOV [r0], m1
add r0, r2
dec r3d
jg .h_loop
mov rsp, r6 ; restore stack pointer
ret
%endmacro

MC MC22

;-----------------------------------------------------------------------------
; void h264_qpel_mc12(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
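
; mc12 averages the hv half-pel from h%3_loop_op with the vertical half-pel
; recovered from the scratch buffer (depad2, psrlw 5, unpad); mc32 enters
; the same body with r4d = 2 so the recovered column is one pixel to the
; right, and mc21/mc23 point r4 at a separate H buffer instead.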
%macro MC12 3
cglobal_mc %1, %2, mc12, %3, 3,7,12
%define PAD mmsize*8*4*2 ; SIZE*16*4*sizeof(pixel)
mov r6, rsp ; backup stack pointer
and rsp, ~(mmsize-1) ; align stack
sub rsp, PAD
call put_hv%3_10_%1
xor r4d, r4d
.body:
mov r3d, %3
pxor m0, m0
mova m7, [pw_pixel_max]
%if num_mmregs > 8
mova m8, [tap1]
mova m9, [tap2]
mova m10, [tap3]
mova m11, [depad]
%endif
mov r1, rsp
.h_loop:
call h%3_loop_op_%1
movu m3, [r1+r4-2*mmsize] ; movu needed for mc32, etc
paddw m3, [depad2]
psrlw m3, 5
psubw m3, [unpad]
CLIPW m3, m0, m7
pavgw m1, m3
OP_MOV [r0], m1
add r0, r2
dec r3d
jg .h_loop
mov rsp, r6 ; restore stack pointer
ret
%endmacro

MC MC12

;-----------------------------------------------------------------------------
; void h264_qpel_mc32(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC32 3
cglobal_mc %1, %2, mc32, %3, 3,7,12
%define PAD mmsize*8*3*2 ; SIZE*16*3*sizeof(pixel)
mov r6, rsp ; backup stack pointer
and rsp, ~(mmsize-1) ; align stack
sub rsp, PAD
call put_hv%3_10_%1
mov r4d, 2 ; sizeof(pixel)
jmp stub_%2_h264_qpel%3_mc12_10_%1.body
%endmacro

MC MC32

;-----------------------------------------------------------------------------
; void h264_qpel_mc21(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro H_NRD 2
put_h%2_10_%1:
add rsp, gprsize
mov r3d, %2
xor r4d, r4d
mova m6, [pad20]
.nextrow:
movu m2, [r5-4]
movu m3, [r5-2]
movu m4, [r5+0]
ADDW m2, [r5+6], m5
ADDW m3, [r5+4], m5
ADDW m4, [r5+2], m5
FILT_H2 m2, m3, m4
psubw m2, m6
mova [rsp+r4], m2
add r4d, mmsize*3
add r5, r2
dec r3d
jg .nextrow
sub rsp, gprsize
ret
%endmacro
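
; put_h mirrors put_hv for the horizontal direction: FILT_H2 rows, biased
; by pad20, stored at the same mmsize*3 row stride so the shared mc12 body
; can run its depad2/unpad recovery on them. mc21 stacks this H buffer
; above the hv buffer and sets r4d = PAD-mmsize to reach it.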
INIT_MMX
H_NRD mmxext, 4
INIT_XMM
H_NRD sse2 , 8

%macro MC21 3
cglobal_mc %1, %2, mc21, %3, 3,7,12
mov r5, r1
.body:
%define PAD mmsize*8*3*2 ; SIZE*16*3*sizeof(pixel)
mov r6, rsp ; backup stack pointer
and rsp, ~(mmsize-1) ; align stack
sub rsp, PAD
call put_h%3_10_%1
sub rsp, PAD
call put_hv%3_10_%1
mov r4d, PAD-mmsize ; H buffer
jmp stub_%2_h264_qpel%3_mc12_10_%1.body
%endmacro

MC MC21

;-----------------------------------------------------------------------------
; void h264_qpel_mc23(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC23 3
cglobal_mc %1, %2, mc23, %3, 3,7,12
lea r5, [r1+r2]
jmp stub_%2_h264_qpel%3_mc21_10_%1.body
%endmacro

MC MC23