;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 qpel code
;*****************************************************************************
;* Copyright (C) 2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA 32
cextern pw_16
cextern pw_1
cextern pb_0
pw_pixel_max: times 8 dw ((1 << 10)-1)
pad10: times 8 dw 10*1023
pad20: times 8 dw 20*1023
pad30: times 8 dw 30*1023
depad: times 4 dd 32*20*1023 + 512
depad2: times 8 dw 20*1023 + 16*1022 + 16
unpad: times 8 dw 16*1022/32 ; needs to be mod 16
tap1: times 4 dw 1, -5
tap2: times 4 dw 20, 20
tap3: times 4 dw -5, 1
pd_0f: times 4 dd 0xffff
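; A sketch of how the bias constants above appear to be used (the 6-tap kernel
; is (1,-5,20,20,-5,1), whose taps sum to 32): the 2D paths keep their
; un-normalized vertical intermediates within int16 by subtracting
; pad20 = 20*1023; depad = 32*20*1023 + 512 removes that bias (scaled by the
; tap sum) and adds the rounding term before the >>10 of the horizontal pass.
; depad2/unpad recover the vertical-only half-pel value from the same biased
; buffer: add 20*1023 + 16*1022 + 16, shift right by 5 unsigned, then subtract
; unpad = 16*1022/32 = 511.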
SECTION .text
%macro AVG_MOV 2
pavgw %2, %1
mova %1, %2
%endmacro
%macro ADDW 3
%if mmsize == 8
paddw %1, %2
%else
movu %3, %2
paddw %1, %3
%endif
%endmacro
%macro FILT_H 4
paddw %1, %4
psubw %1, %2 ; a-b
psraw %1, 2 ; (a-b)/4
psubw %1, %2 ; (a-b)/4-b
paddw %1, %3 ; (a-b)/4-b+c
psraw %1, 2 ; ((a-b)/4-b+c)/4
paddw %1, %3 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
%endmacro
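; FILT_H evaluates (a - 5*b + 20*c) / 16 with adds and shifts only:
; ((a-b)/4 - b + c)/4 + c = (a - 5*b + 4*c + 16*c)/16 = (a - 5*b + 20*c)/16.
; a, b, c are pairwise sums of two source samples (outer, middle, inner taps),
; %4 folds in the +16 rounding term, and the callers finish with a psraw by 1,
; so the result is effectively (a - 5*b + 20*c + 16) >> 5 while every
; intermediate stays within signed 16-bit range for 10-bit input.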
%macro PRELOAD_V 0
lea r3, [r2*3]
sub r1, r3
movu m0, [r1+r2]
movu m1, [r1+r2*2]
add r1, r3
movu m2, [r1]
movu m3, [r1+r2]
movu m4, [r1+r2*2]
add r1, r3
%endmacro
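; PRELOAD_V loads rows src-2*stride .. src+2*stride into m0..m4 and leaves r1
; at src+3*stride, so each subsequent FILT_V only has to fetch one new row.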
%macro FILT_V 8
movu %6, [r1]
paddw %1, %6
mova %7, %2
paddw %7, %5
mova %8, %3
paddw %8, %4
FILT_H %1, %7, %8, [pw_16]
psraw %1, 1
CLIPW %1, [pb_0], [pw_pixel_max]
%endmacro
%macro MC 1
%define OP_MOV mova
INIT_MMX
%1 mmxext, put, 4
INIT_XMM
%1 sse2 , put, 8
%define OP_MOV AVG_MOV
INIT_MMX
%1 mmxext, avg, 4
INIT_XMM
%1 sse2 , avg, 8
%endmacro
%macro MCAxA 8
%if ARCH_X86_64
%ifnidn %1,mmxext
MCAxA_OP %1,%2,%3,%4,%5,%6,%7,%8
%endif
%else
MCAxA_OP %1,%2,%3,%4,%5,%6,%7,%8
%endif
%endmacro
%macro MCAxA_OP 8
%if ARCH_X86_32
cglobal %2_h264_qpel%5_%3_10_%1, %6,%7,%8
call stub_%2_h264_qpel%4_%3_10_%1
mov r0, r0m
mov r1, r1m
add r0, %4*2
add r1, %4*2
call stub_%2_h264_qpel%4_%3_10_%1
mov r0, r0m
mov r1, r1m
lea r0, [r0+r2*%4]
lea r1, [r1+r2*%4]
call stub_%2_h264_qpel%4_%3_10_%1
mov r0, r0m
mov r1, r1m
lea r0, [r0+r2*%4+%4*2]
lea r1, [r1+r2*%4+%4*2]
call stub_%2_h264_qpel%4_%3_10_%1
RET
%else ; ARCH_X86_64
cglobal %2_h264_qpel%5_%3_10_%1, %6,%7 + 2,%8
mov r%7, r0
%assign p1 %7+1
mov r %+ p1, r1
call stub_%2_h264_qpel%4_%3_10_%1
lea r0, [r%7+%4*2]
lea r1, [r %+ p1+%4*2]
call stub_%2_h264_qpel%4_%3_10_%1
lea r0, [r%7+r2*%4]
lea r1, [r %+ p1+r2*%4]
call stub_%2_h264_qpel%4_%3_10_%1
lea r0, [r%7+r2*%4+%4*2]
lea r1, [r %+ p1+r2*%4+%4*2]
%if UNIX64 == 0 ; fall through to function
call stub_%2_h264_qpel%4_%3_10_%1
RET
%endif
%endif
%endmacro
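; MCAxA builds the 2Nx2N function from four calls to the NxN stub, offsetting
; dst/src by N*2 bytes (N pixels) horizontally, by N rows vertically, and by
; both for the four quadrants. x86-32 reloads r0/r1 from the stack arguments
; between calls; x86-64 keeps copies in two extra registers, and on UNIX64 the
; fourth call is omitted so execution falls straight through into the NxN
; function emitted right after this wrapper.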
;cpu, put/avg, mc, 4/8, ...
%macro cglobal_mc 7
%assign i %4*2
MCAxA %1, %2, %3, %4, i, %5,%6,%7
cglobal %2_h264_qpel%4_%3_10_%1, %5,%6,%7
%if UNIX64 == 0 ; no prologue or epilogue for UNIX64
call stub_%2_h264_qpel%4_%3_10_%1
RET
%endif
stub_%2_h264_qpel%4_%3_10_%1:
%endmacro
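; cglobal_mc emits, in order: the doubled-size wrapper (via MCAxA), the public
; NxN entry point, and the stub_* label holding the actual body. On UNIX64 the
; entry needs no prologue and falls through into the stub; otherwise it calls
; the stub and returns. The stub ends with a plain ret, so it can also be
; reused as a subroutine by the wrapper and by the mc variants that jmp into
; its .body label.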
;-----------------------------------------------------------------------------
; void h264_qpel_mc00(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro COPY4 0
movu m0, [r1 ]
OP_MOV [r0 ], m0
movu m0, [r1+r2 ]
OP_MOV [r0+r2 ], m0
movu m0, [r1+r2*2]
OP_MOV [r0+r2*2], m0
movu m0, [r1+r3 ]
OP_MOV [r0+r3 ], m0
%endmacro
%macro MC00 1
INIT_MMX
cglobal_mc mmxext, %1, mc00, 4, 3,4,0
lea r3, [r2*3]
COPY4
ret
INIT_XMM
cglobal %1_h264_qpel8_mc00_10_sse2, 3,4
lea r3, [r2*3]
COPY4
lea r0, [r0+r2*4]
lea r1, [r1+r2*4]
COPY4
RET
cglobal %1_h264_qpel16_mc00_10_sse2, 3,4
mov r3d, 8
.loop:
movu m0, [r1 ]
movu m1, [r1 +16]
OP_MOV [r0 ], m0
OP_MOV [r0 +16], m1
movu m0, [r1+r2 ]
movu m1, [r1+r2+16]
OP_MOV [r0+r2 ], m0
OP_MOV [r0+r2+16], m1
lea r0, [r0+r2*2]
lea r1, [r1+r2*2]
dec r3d
jg .loop
REP_RET
%endmacro
%define OP_MOV mova
MC00 put
%define OP_MOV AVG_MOV
MC00 avg
;-----------------------------------------------------------------------------
; void h264_qpel_mc20(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC_CACHE 1
%define OP_MOV mova
%define PALIGNR PALIGNR_MMX
INIT_MMX
%1 mmxext , put, 4
INIT_XMM
%1 sse2_cache64 , put, 8
%define PALIGNR PALIGNR_SSSE3
%1 ssse3_cache64, put, 8
%1 sse2 , put, 8, 0
%define OP_MOV AVG_MOV
%define PALIGNR PALIGNR_MMX
INIT_MMX
%1 mmxext , avg, 4
INIT_XMM
%1 sse2_cache64 , avg, 8
%define PALIGNR PALIGNR_SSSE3
%1 ssse3_cache64, avg, 8
%1 sse2 , avg, 8, 0
%endmacro
%macro MC20 3-4
cglobal_mc %1, %2, mc20, %3, 3,4,9
mov r3d, %3
mova m1, [pw_pixel_max]
%if num_mmregs > 8
mova m8, [pw_16]
%define p16 m8
%else
%define p16 [pw_16]
%endif
.nextrow:
%if %0 == 4
movu m2, [r1-4]
movu m3, [r1-2]
movu m4, [r1+0]
ADDW m2, [r1+6], m5
ADDW m3, [r1+4], m5
ADDW m4, [r1+2], m5
%else ; movu is slow on these processors
%if mmsize==16
movu m2, [r1-4]
movu m0, [r1+6]
mova m6, m0
psrldq m0, 6
paddw m6, m2
PALIGNR m3, m0, m2, 2, m5
PALIGNR m7, m0, m2, 8, m5
paddw m3, m7
PALIGNR m4, m0, m2, 4, m5
PALIGNR m7, m0, m2, 6, m5
paddw m4, m7
SWAP 2, 6
%else
movu m2, [r1-4]
movu m6, [r1+4]
PALIGNR m3, m6, m2, 2, m5
paddw m3, m6
PALIGNR m4, m6, m2, 4, m5
PALIGNR m7, m6, m2, 6, m5
paddw m4, m7
paddw m2, [r1+6]
%endif
%endif
FILT_H m2, m3, m4, p16
psraw m2, 1
pxor m0, m0
CLIPW m2, m0, m1
OP_MOV [r0], m2
add r0, r2
add r1, r2
dec r3d
jg .nextrow
rep ret
%endmacro
MC_CACHE MC20
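; MC20 is the horizontal half-pel (dx=2) case. The 4-argument variant (plain
; sse2) loads the six tap inputs with unaligned movu; the mmxext and *_cache64
; variants instead load two registers and build the shifted b/c inputs with
; PALIGNR, which is cheaper on CPUs where misaligned loads (especially across
; a cache line) are slow.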
;-----------------------------------------------------------------------------
; void h264_qpel_mc30(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC30 3-4
cglobal_mc %1, %2, mc30, %3, 3,5,9
lea r4, [r1+2]
jmp stub_%2_h264_qpel%3_mc10_10_%1.body
%endmacro
MC_CACHE MC30
;-----------------------------------------------------------------------------
; void h264_qpel_mc10(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC10 3-4
cglobal_mc %1, %2, mc10, %3, 3,5,9
mov r4, r1
.body:
mov r3d, %3
mova m1, [pw_pixel_max]
%if num_mmregs > 8
mova m8, [pw_16]
%define p16 m8
%else
%define p16 [pw_16]
%endif
.nextrow:
%if %0 == 4
movu m2, [r1-4]
movu m3, [r1-2]
movu m4, [r1+0]
ADDW m2, [r1+6], m5
ADDW m3, [r1+4], m5
ADDW m4, [r1+2], m5
%else ; movu is slow on these processors
%if mmsize==16
movu m2, [r1-4]
movu m0, [r1+6]
mova m6, m0
psrldq m0, 6
paddw m6, m2
PALIGNR m3, m0, m2, 2, m5
PALIGNR m7, m0, m2, 8, m5
paddw m3, m7
PALIGNR m4, m0, m2, 4, m5
PALIGNR m7, m0, m2, 6, m5
paddw m4, m7
SWAP 2, 6
%else
movu m2, [r1-4]
movu m6, [r1+4]
PALIGNR m3, m6, m2, 2, m5
paddw m3, m6
PALIGNR m4, m6, m2, 4, m5
PALIGNR m7, m6, m2, 6, m5
paddw m4, m7
paddw m2, [r1+6]
%endif
%endif
FILT_H m2, m3, m4, p16
psraw m2, 1
pxor m0, m0
CLIPW m2, m0, m1
movu m3, [r4]
pavgw m2, m3
OP_MOV [r0], m2
add r0, r2
add r1, r2
add r4, r2
dec r3d
jg .nextrow
rep ret
%endmacro
MC_CACHE MC10
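; MC10/MC30 are the horizontal quarter-pel cases: the same half-pel filter as
; MC20, then pavgw with the nearest full-pel column. r4 points at that column
; (r1 for mc10, r1+2 = one pixel to the right for mc30), and mc30 simply jumps
; into mc10's .body after setting it.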
;-----------------------------------------------------------------------------
; void h264_qpel_mc02(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro V_FILT 11
v_filt%9_%10_10_%11:
add r4, r2
.no_addr4:
FILT_V m0, m1, m2, m3, m4, m5, m6, m7
add r1, r2
add r0, r2
ret
%endmacro
INIT_MMX
RESET_MM_PERMUTATION
%assign i 0
%rep 4
V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 4, i, mmxext
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep
INIT_XMM
RESET_MM_PERMUTATION
%assign i 0
%rep 6
V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 8, i, sse2
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep
%macro MC02 3
cglobal_mc %1, %2, mc02, %3, 3,4,8
PRELOAD_V
sub r0, r2
%assign j 0
%rep %3
%assign i (j % 6)
call v_filt%3_ %+ i %+ _10_%1.no_addr4
OP_MOV [r0], m0
SWAP 0,1,2,3,4,5
%assign j j+1
%endrep
ret
%endmacro
MC MC02
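; MC02 is the vertical half-pel case. The v_filt* stubs are generated under a
; rotated register permutation (RESET_MM_PERMUTATION + SWAP), so each variant
; leaves the six source rows where the next one expects them; the %rep loop in
; MC02 mirrors that rotation with its own SWAP and indexes the stubs modulo 6.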
;-----------------------------------------------------------------------------
; void h264_qpel_mc01(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC01 3
cglobal_mc %1, %2, mc01, %3, 3,5,8
mov r4, r1
.body:
PRELOAD_V
sub r4, r2
sub r0, r2
%assign j 0
%rep %3
%assign i (j % 6)
call v_filt%3_ %+ i %+ _10_%1
movu m7, [r4]
pavgw m0, m7
OP_MOV [r0], m0
SWAP 0,1,2,3,4,5
%assign j j+1
%endrep
ret
%endmacro
MC MC01
;-----------------------------------------------------------------------------
; void h264_qpel_mc03(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC03 3
cglobal_mc %1, %2, mc03, %3, 3,5,8
lea r4, [r1+r2]
jmp stub_%2_h264_qpel%3_mc01_10_%1.body
%endmacro
MC MC03
;-----------------------------------------------------------------------------
; void h264_qpel_mc11(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro H_FILT_AVG 3-4
h_filt%2_%3_10_%1:
;FILT_H with fewer registers and averaged with the FILT_V result
;m6,m7 are tmp registers, m0 is the FILT_V result, the rest are to be used in the next iteration
;unfortunately I need three registers, so m5 will have to be re-read from memory
movu m5, [r4-4]
ADDW m5, [r4+6], m7
movu m6, [r4-2]
ADDW m6, [r4+4], m7
paddw m5, [pw_16]
psubw m5, m6 ; a-b
psraw m5, 2 ; (a-b)/4
psubw m5, m6 ; (a-b)/4-b
movu m6, [r4+0]
ADDW m6, [r4+2], m7
paddw m5, m6 ; (a-b)/4-b+c
psraw m5, 2 ; ((a-b)/4-b+c)/4
paddw m5, m6 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
psraw m5, 1
CLIPW m5, [pb_0], [pw_pixel_max]
;avg FILT_V, FILT_H
pavgw m0, m5
%if %0!=4
movu m5, [r1+r5]
%endif
ret
%endmacro
INIT_MMX
RESET_MM_PERMUTATION
%assign i 0
%rep 3
H_FILT_AVG mmxext, 4, i
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep
H_FILT_AVG mmxext, 4, i, 0
INIT_XMM
RESET_MM_PERMUTATION
%assign i 0
%rep 6
%if i==1
H_FILT_AVG sse2, 8, i, 0
%else
H_FILT_AVG sse2, 8, i
%endif
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep
%macro MC11 3
; this REALLY needs x86_64
cglobal_mc %1, %2, mc11, %3, 3,6,8
mov r4, r1
.body:
PRELOAD_V
sub r0, r2
sub r4, r2
mov r5, r2
neg r5
%assign j 0
%rep %3
%assign i (j % 6)
call v_filt%3_ %+ i %+ _10_%1
call h_filt%3_ %+ i %+ _10_%1
%if %3==8 && i==1
movu m5, [r1+r5]
%endif
OP_MOV [r0], m0
SWAP 0,1,2,3,4,5
%assign j j+1
%endrep
ret
%endmacro
MC MC11
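; MC11 is the diagonal quarter-pel case: the vertical half-pel value (v_filt,
; reading through r1) averaged with the horizontal half-pel value (h_filt,
; re-running the filter on the row at r4). MC31/MC13/MC33 below reuse its
; .body with r1 and/or r4 offset by one pixel / one row, and r5 holds -stride
; so the m5 row clobbered by h_filt can be re-read from [r1+r5].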
;-----------------------------------------------------------------------------
; void h264_qpel_mc31(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC31 3
cglobal_mc %1, %2, mc31, %3, 3,6,8
mov r4, r1
add r1, 2
jmp stub_%2_h264_qpel%3_mc11_10_%1.body
%endmacro
MC MC31
;-----------------------------------------------------------------------------
; void h264_qpel_mc13(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC13 3
cglobal_mc %1, %2, mc13, %3, 3,7,12
lea r4, [r1+r2]
jmp stub_%2_h264_qpel%3_mc11_10_%1.body
%endmacro
MC MC13
;-----------------------------------------------------------------------------
; void h264_qpel_mc33(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC33 3
cglobal_mc %1, %2, mc33, %3, 3,6,8
lea r4, [r1+r2]
add r1, 2
jmp stub_%2_h264_qpel%3_mc11_10_%1.body
%endmacro
MC MC33
;-----------------------------------------------------------------------------
; void h264_qpel_mc22(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro FILT_H2 3
psubw %1, %2 ; a-b
psubw %2, %3 ; b-c
psllw %2, 2
psubw %1, %2 ; a-5*b+4*c
psllw %3, 4
paddw %1, %3 ; a-5*b+20*c
%endmacro
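; FILT_H2 is the un-normalized variant of FILT_H: it computes a - 5*b + 20*c
; as a - b - 4*(b-c) + 16*c, with no rounding or shift; the 2D paths apply the
; final normalization after their second filtering pass.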
%macro FILT_VNRD 8
movu %6, [r1]
paddw %1, %6
mova %7, %2
paddw %7, %5
mova %8, %3
paddw %8, %4
FILT_H2 %1, %7, %8
%endmacro
%macro HV 2
%ifidn %1,sse2
%define PAD 12
%define COUNT 2
%else
%define PAD 4
%define COUNT 3
%endif
put_hv%2_10_%1:
neg r2 ; This actually saves instructions
lea r1, [r1+r2*2-mmsize+PAD]
lea r4, [rsp+PAD+gprsize]
mov r3d, COUNT
.v_loop:
movu m0, [r1]
sub r1, r2
movu m1, [r1]
sub r1, r2
movu m2, [r1]
sub r1, r2
movu m3, [r1]
sub r1, r2
movu m4, [r1]
sub r1, r2
%assign i 0
%rep %2-1
FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
psubw m0, [pad20]
movu [r4+i*mmsize*3], m0
sub r1, r2
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep
FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
psubw m0, [pad20]
movu [r4+i*mmsize*3], m0
add r4, mmsize
lea r1, [r1+r2*8+mmsize]
%if %2==8
lea r1, [r1+r2*4]
%endif
dec r3d
jg .v_loop
neg r2
ret
%endmacro
INIT_MMX
HV mmxext, 4
INIT_XMM
HV sse2 , 8
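; put_hv* runs the un-normalized vertical pass for the 2D cases into the
; caller's on-stack buffer (r4 = rsp+PAD+gprsize), one mmsize-wide column
; strip at a time; COUNT strips cover the block width plus the extra columns
; the horizontal pass reads on either side. Rows are stored mmsize*3 bytes
; apart, biased by -pad20 so they fit in int16, and r2 is temporarily negated
; so the row stepping can reuse sub r1, r2.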
%macro H_LOOP 2
%if num_mmregs > 8
%define s1 m8
%define s2 m9
%define s3 m10
%define d1 m11
%else
%define s1 [tap1]
%define s2 [tap2]
%define s3 [tap3]
%define d1 [depad]
%endif
h%2_loop_op_%1:
movu m1, [r1+mmsize-4]
movu m2, [r1+mmsize-2]
mova m3, [r1+mmsize+0]
movu m4, [r1+mmsize+2]
movu m5, [r1+mmsize+4]
movu m6, [r1+mmsize+6]
%if num_mmregs > 8
pmaddwd m1, s1
pmaddwd m2, s1
pmaddwd m3, s2
pmaddwd m4, s2
pmaddwd m5, s3
pmaddwd m6, s3
paddd m1, d1
paddd m2, d1
%else
mova m0, s1
pmaddwd m1, m0
pmaddwd m2, m0
mova m0, s2
pmaddwd m3, m0
pmaddwd m4, m0
mova m0, s3
pmaddwd m5, m0
pmaddwd m6, m0
mova m0, d1
paddd m1, m0
paddd m2, m0
%endif
paddd m3, m5
paddd m4, m6
paddd m1, m3
paddd m2, m4
psrad m1, 10
psrad m2, 10
pslld m2, 16
pand m1, [pd_0f]
por m1, m2
%if num_mmregs <= 8
pxor m0, m0
%endif
CLIPW m1, m0, m7
add r1, mmsize*3
ret
%endmacro
INIT_MMX
H_LOOP mmxext, 4
INIT_XMM
H_LOOP sse2 , 8
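; h*_loop_op_* is the horizontal pass over one row of the biased int16 buffer:
; pmaddwd with the paired taps (1,-5), (20,20), (-5,1) accumulates 32-bit sums
; for the even (m1) and odd (m2) output pixels, depad removes the vertical
; bias and adds rounding, psrad by 10 divides by 32*32, and the two dword
; vectors are re-interleaved into words with pslld/pand/por before clipping
; against pw_pixel_max in m7.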
%macro MC22 3
cglobal_mc %1, %2, mc22, %3, 3,7,12
%define PAD mmsize*8*4*2 ; SIZE*16*4*sizeof(pixel)
mov r6, rsp ; backup stack pointer
and rsp, ~(mmsize-1) ; align stack
sub rsp, PAD
call put_hv%3_10_%1
mov r3d, %3
mova m7, [pw_pixel_max]
%if num_mmregs > 8
pxor m0, m0
mova m8, [tap1]
mova m9, [tap2]
mova m10, [tap3]
mova m11, [depad]
%endif
mov r1, rsp
.h_loop:
call h%3_loop_op_%1
OP_MOV [r0], m1
add r0, r2
dec r3d
jg .h_loop
mov rsp, r6 ; restore stack pointer
ret
%endmacro
MC MC22
;-----------------------------------------------------------------------------
; void h264_qpel_mc12(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC12 3
cglobal_mc %1, %2, mc12, %3, 3,7,12
%define PAD mmsize*8*4*2 ; SIZE*16*4*sizeof(pixel)
mov r6, rsp ; backup stack pointer
and rsp, ~(mmsize-1) ; align stack
sub rsp, PAD
call put_hv%3_10_%1
xor r4d, r4d
.body:
mov r3d, %3
pxor m0, m0
mova m7, [pw_pixel_max]
%if num_mmregs > 8
mova m8, [tap1]
mova m9, [tap2]
mova m10, [tap3]
mova m11, [depad]
%endif
mov r1, rsp
.h_loop:
call h%3_loop_op_%1
movu m3, [r1+r4-2*mmsize] ; movu needed for mc32, etc
paddw m3, [depad2]
psrlw m3, 5
psubw m3, [unpad]
CLIPW m3, m0, m7
pavgw m1, m3
OP_MOV [r0], m1
add r0, r2
dec r3d
jg .h_loop
mov rsp, r6 ; restore stack pointer
ret
%endmacro
MC MC12
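; MC12 averages the 2D half-pel result (m1) with the vertical-only half-pel
; value, re-derived from the same biased buffer: paddw depad2, unsigned >>5
; and psubw unpad undo the -pad20 bias and normalize (see the constants at the
; top). r4 selects what is averaged in: 0 for mc12, 2 bytes (one pixel right)
; for mc32, or an offset into the separate H buffer for mc21/mc23.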
;-----------------------------------------------------------------------------
; void h264_qpel_mc32(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC32 3
cglobal_mc %1, %2, mc32, %3, 3,7,12
%define PAD mmsize*8*3*2 ; SIZE*16*4*sizeof(pixel)
mov r6, rsp ; backup stack pointer
and rsp, ~(mmsize-1) ; align stack
sub rsp, PAD
call put_hv%3_10_%1
mov r4d, 2 ; sizeof(pixel)
jmp stub_%2_h264_qpel%3_mc12_10_%1.body
%endmacro
MC MC32
;-----------------------------------------------------------------------------
; void h264_qpel_mc21(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro H_NRD 2
put_h%2_10_%1:
add rsp, gprsize
mov r3d, %2
xor r4d, r4d
mova m6, [pad20]
.nextrow:
movu m2, [r5-4]
movu m3, [r5-2]
movu m4, [r5+0]
ADDW m2, [r5+6], m5
ADDW m3, [r5+4], m5
ADDW m4, [r5+2], m5
FILT_H2 m2, m3, m4
psubw m2, m6
mova [rsp+r4], m2
add r4d, mmsize*3
add r5, r2
dec r3d
jg .nextrow
sub rsp, gprsize
ret
%endmacro
INIT_MMX
H_NRD mmxext, 4
INIT_XMM
H_NRD sse2 , 8
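; put_h* fills a second stack buffer with the un-normalized, pad20-biased
; horizontal filter of the source rows read through r5; the add/sub rsp,
; gprsize pair steps over the return address so [rsp+r4] addresses the
; caller's buffer. mc21/mc23 then reuse the mc12 tail with r4 pointing at this
; buffer, so the final pavgw averages the 2D half-pel with the horizontal-only
; half-pel instead of the vertical one.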
%macro MC21 3
cglobal_mc %1, %2, mc21, %3, 3,7,12
mov r5, r1
.body:
%define PAD mmsize*8*3*2 ; SIZE*16*4*sizeof(pixel)
mov r6, rsp ; backup stack pointer
and rsp, ~(mmsize-1) ; align stack
sub rsp, PAD
call put_h%3_10_%1
sub rsp, PAD
call put_hv%3_10_%1
mov r4d, PAD-mmsize ; H buffer
jmp stub_%2_h264_qpel%3_mc12_10_%1.body
%endmacro
MC MC21
;-----------------------------------------------------------------------------
; void h264_qpel_mc23(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC23 3
cglobal_mc %1, %2, mc23, %3, 3,7,12
lea r5, [r1+r2]
jmp stub_%2_h264_qpel%3_mc21_10_%1.body
%endmacro
MC MC23