You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

885 lines
20KB

  1. ;*****************************************************************************
  2. ;* MMX/SSE2/AVX-optimized 10-bit H.264 qpel code
  3. ;*****************************************************************************
  4. ;* Copyright (C) 2011 x264 project
  5. ;*
  6. ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
  7. ;*
  8. ;* This file is part of Libav.
  9. ;*
  10. ;* Libav is free software; you can redistribute it and/or
  11. ;* modify it under the terms of the GNU Lesser General Public
  12. ;* License as published by the Free Software Foundation; either
  13. ;* version 2.1 of the License, or (at your option) any later version.
  14. ;*
  15. ;* Libav is distributed in the hope that it will be useful,
  16. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  17. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  18. ;* Lesser General Public License for more details.
  19. ;*
  20. ;* You should have received a copy of the GNU Lesser General Public
  21. ;* License along with Libav; if not, write to the Free Software
  22. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23. ;******************************************************************************
  24. %include "libavutil/x86/x86util.asm"
  ; Read-only data. SECTION_RODATA comes from the included helper macros (not
  ; shown); its argument is presumably the section alignment -- TODO confirm.
  25. SECTION_RODATA 32
  ; Constants defined in another object file. pw_16 and pb_0 are used below;
  ; pw_1 is not referenced anywhere in the code shown here.
  26. cextern pw_16
  27. cextern pw_1
  28. cextern pb_0
  ; Eight words of 1023, the largest 10-bit value; used as the upper clip bound.
  29. pw_pixel_max: times 8 dw ((1 << 10)-1)
  ; Word biases. pad20 is subtracted from the unrounded filter sums in HV and
  ; H_NRD before they are stored; pad10 and pad30 are not referenced in the
  ; code shown here.
  30. pad10: times 8 dw 10*1023
  31. pad20: times 8 dw 20*1023
  32. pad30: times 8 dw 30*1023
  ; Dword bias added in H_LOOP before the arithmetic shift right by 10.
  33. depad: times 4 dd 32*20*1023 + 512
  ; Word constants used by MC12: depad2 is added before a logical shift right
  ; by 5, unpad is subtracted afterwards.
  34. depad2: times 8 dw 20*1023 + 16*1022 + 16
  35. unpad: times 8 dw 16*1022/32 ; needs to be mod 16
  ; Coefficient pairs multiplied with adjacent word pairs by pmaddwd in H_LOOP.
  36. tap1: times 4 dw 1, -5
  37. tap2: times 4 dw 20, 20
  38. tap3: times 4 dw -5, 1
  ; Per-dword mask keeping the low 16 bits; used in H_LOOP when repacking
  ; dword results into words.
  39. pd_0f: times 4 dd 0xffff
  40. SECTION .text
  ; AVG_MOV dst, src
  ; Stores the pavgw (unsigned, rounded word average) of dst and src to dst.
  ; src (%2) is overwritten with the averaged value. Selected as OP_MOV for
  ; the "avg" function variants.
  41. %macro AVG_MOV 2
  42. pavgw %2, %1 ; %2 = avg(%2, %1)
  43. mova %1, %2 ; write the average back to the destination
  44. %endmacro
  ; ADDW acc, src, tmp
  ; Word-wise acc += src. With 8-byte registers (mmsize == 8) src is used
  ; directly as the paddw operand; otherwise it is first loaded into tmp with
  ; movu (unaligned load) and tmp is added. tmp is only clobbered in the
  ; second case.
  45. %macro ADDW 3
  46. %if mmsize == 8
  47. paddw %1, %2
  48. %else
  49. movu %3, %2
  50. paddw %1, %3
  51. %endif
  52. %endmacro
  ; FILT_H a, b, c, rnd
  ; In:  %1 = a (sum of the two outer taps), %2 = b, %3 = c (sum of the two
  ;      centre taps), %4 = value added to a first (callers pass pw_16).
  ; Out: %1 ~= (a + rnd - 5*b + 20*c) / 16, evaluated as interleaved
  ;      subtract/shift steps so every intermediate stays in 16-bit words.
  ; %2 and %3 are not modified. Callers apply a further ">> 1" afterwards.
  53. %macro FILT_H 4
  54. paddw %1, %4
  55. psubw %1, %2 ; a-b
  56. psraw %1, 2 ; (a-b)/4
  57. psubw %1, %2 ; (a-b)/4-b
  58. paddw %1, %3 ; (a-b)/4-b+c
  59. psraw %1, 2 ; ((a-b)/4-b+c)/4
  60. paddw %1, %3 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
  61. %endmacro
  ; PRELOAD_V
  ; Loads the five rows r1-2*r2 .. r1+2*r2 into m0..m4 (unaligned loads) ahead
  ; of a vertical filter pass.
  ; In:  r1 = source pointer, r2 = stride in bytes.
  ; Out: m0..m4 = the five rows, r1 = original r1 + 3*r2 (next row to load).
  ; Clobbers r3 (set to 3*r2).
  62. %macro PRELOAD_V 0
  63. lea r3, [r2*3] ; r3 = 3*stride
  64. sub r1, r3 ; r1 = src - 3*stride
  65. movu m0, [r1+r2] ; row -2
  66. movu m1, [r1+r2*2] ; row -1
  67. add r1, r3 ; r1 = src
  68. movu m2, [r1] ; row 0
  69. movu m3, [r1+r2] ; row +1
  70. movu m4, [r1+r2*2] ; row +2
  71. add r1, r3 ; r1 = src + 3*stride
  72. %endmacro
  ; FILT_V r0, r1, r2, r3, r4, r5, t0, t1
  ; One output row of the vertical 6-tap filter.
  ; In:  %1..%5 = five consecutive rows already in registers, r1 (GPR) points
  ;      at the sixth row.
  ; Out: %1 = filtered row: FILT_H with pw_16, then ">> 1", then CLIPW against
  ;      pb_0 and pw_pixel_max. %6 = the newly loaded sixth row.
  ; Clobbers %7 and %8 (tap sums). %2..%5 are preserved.
  ; CLIPW is defined in the included helper macros (not shown); presumably it
  ; clamps words to [min, max] -- TODO confirm.
  73. %macro FILT_V 8
  74. movu %6, [r1]
  75. paddw %1, %6 ; a = row0 + row5 (outer taps)
  76. mova %7, %2
  77. paddw %7, %5 ; b = row1 + row4
  78. mova %8, %3
  79. paddw %8, %4 ; c = row2 + row3 (centre taps)
  80. FILT_H %1, %7, %8, [pw_16]
  81. psraw %1, 1
  82. CLIPW %1, [pb_0], [pw_pixel_max]
  83. %endmacro
  ; MC generator
  ; Instantiates the macro named by %1 four times as "%1 <op>, <size>":
  ;   put,4 (INIT_MMX mmxext)   put,8 (INIT_XMM sse2)
  ;   avg,4 (INIT_MMX mmxext)   avg,8 (INIT_XMM sse2)
  ; OP_MOV is defined as mova for the put variants and as AVG_MOV for the avg
  ; variants before each pair is emitted.
  84. %macro MC 1
  85. %define OP_MOV mova
  86. INIT_MMX mmxext
  87. %1 put, 4
  88. INIT_XMM sse2
  89. %1 put, 8
  90. %define OP_MOV AVG_MOV
  91. INIT_MMX mmxext
  92. %1 avg, 4
  93. INIT_XMM sse2
  94. %1 avg, 8
  95. %endmacro
  ; MCAxA_OP op, mc, size, bigsize, args, regs, xmmregs
  ; Emits the function <op>_h264_qpel<bigsize>_<mc>_10, which handles the
  ; larger block by calling the <size> stub
  ; (stub_<op>_h264_qpel<size>_<mc>_10 + SUFFIX) on its four quadrants:
  ;   (0,0), (+size pixels), (+size rows), (+size rows, +size pixels).
  ; A pixel is 2 bytes, hence the "%3*2" byte offsets; r2 is the stride.
  ; %5..%7 are forwarded to cglobal (on x86-64 two extra GPRs are requested).
  96. %macro MCAxA_OP 7
  97. %if ARCH_X86_32
  ; x86-32: r0/r1 are reloaded from their stack argument slots (r0m/r1m)
  ; before each quadrant.
  98. cglobal %1_h264_qpel%4_%2_10, %5,%6,%7
  99. call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
  100. mov r0, r0m
  101. mov r1, r1m
  102. add r0, %3*2
  103. add r1, %3*2
  104. call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
  105. mov r0, r0m
  106. mov r1, r1m
  107. lea r0, [r0+r2*%3]
  108. lea r1, [r1+r2*%3]
  109. call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
  110. mov r0, r0m
  111. mov r1, r1m
  112. lea r0, [r0+r2*%3+%3*2]
  113. lea r1, [r1+r2*%3+%3*2]
  114. call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
  115. RET
  116. %else ; ARCH_X86_64
  ; x86-64: the original r0/r1 are kept in the two extra registers r<%6> and
  ; r<%6+1> and each quadrant address is formed from them with lea.
  117. cglobal %1_h264_qpel%4_%2_10, %5,%6 + 2,%7
  118. mov r%6, r0
  119. %assign p1 %6+1
  120. mov r %+ p1, r1
  121. call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
  122. lea r0, [r%6+%3*2]
  123. lea r1, [r %+ p1+%3*2]
  124. call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
  125. lea r0, [r%6+r2*%3]
  126. lea r1, [r %+ p1+r2*%3]
  127. call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
  128. lea r0, [r%6+r2*%3+%3*2]
  129. lea r1, [r %+ p1+r2*%3+%3*2]
  ; When UNIX64 is set, no call/RET is emitted for the last quadrant:
  ; execution continues into whatever code follows this macro's expansion
  ; (cglobal_mc places the <size> function there).
  130. %if UNIX64 == 0 ; fall through to function
  131. call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
  132. RET
  133. %endif
  134. %endif
  135. %endmacro
  ; cglobal_mc op, mc, size, args, regs, xmmregs
  ; Declares the entry points for one motion-compensation case and leaves the
  ; assembler positioned at the stub label, so the instructions that follow the
  ; macro invocation form the stub body (which returns with a plain ret).
  ;  - On x86-32, or when the sse2 cpuflag is active, a double-size wrapper is
  ;    emitted first via MCAxA_OP (i = 2*size).
  ;  - Then <op>_h264_qpel<size>_<mc>_10 is declared. Unless UNIX64 is set it
  ;    consists of "call stub; RET"; with UNIX64 nothing is emitted between the
  ;    cglobal line and the stub label.
  136. ;cpu, put/avg, mc, 4/8, ...
  137. %macro cglobal_mc 6
  138. %assign i %3*2
  139. %if ARCH_X86_32 || cpuflag(sse2)
  140. MCAxA_OP %1, %2, %3, i, %4,%5,%6
  141. %endif
  142. cglobal %1_h264_qpel%3_%2_10, %4,%5,%6
  143. %if UNIX64 == 0 ; no prologue or epilogue for UNIX64
  144. call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
  145. RET
  146. %endif
  147. stub_%1_h264_qpel%3_%2_10 %+ SUFFIX:
  148. %endmacro
  149. ;-----------------------------------------------------------------------------
  150. ; void h264_qpel_mc00(uint8_t *dst, uint8_t *src, int stride)
  151. ;-----------------------------------------------------------------------------
  ; COPY4
  ; Transfers four rows of one register width from r1 to r0 through OP_MOV
  ; (plain store for put, average-with-destination for avg).
  ; In:  r0 = dst, r1 = src, r2 = stride, r3 = 3*stride (set by the caller).
  ; Clobbers m0. Source rows are read with unaligned loads.
  152. %macro COPY4 0
  153. movu m0, [r1 ]
  154. OP_MOV [r0 ], m0
  155. movu m0, [r1+r2 ]
  156. OP_MOV [r0+r2 ], m0
  157. movu m0, [r1+r2*2]
  158. OP_MOV [r0+r2*2], m0
  159. movu m0, [r1+r3 ]
  160. OP_MOV [r0+r3 ], m0
  161. %endmacro
  ; MC00 op
  ; Full-pel case (no filtering): emits <op>_h264_qpel{4,8,16}_mc00_10.
  ; %1 is put or avg; OP_MOV must already be defined to match.
  ; r0 = dst, r1 = src, r2 = stride in all three functions.
  162. %macro MC00 1
  ; 4-pixel-wide variant (mmxext), declared through cglobal_mc so the stub ends
  ; with a plain ret: four rows of 8 bytes.
  163. INIT_MMX mmxext
  164. cglobal_mc %1, mc00, 4, 3,4,0
  165. lea r3, [r2*3]
  166. COPY4
  167. ret
  ; 8-pixel-wide variant (sse2): two COPY4 groups, i.e. eight rows of 16 bytes.
  168. INIT_XMM sse2
  169. cglobal %1_h264_qpel8_mc00_10, 3,4
  170. lea r3, [r2*3]
  171. COPY4
  172. lea r0, [r0+r2*4]
  173. lea r1, [r1+r2*4]
  174. COPY4
  175. RET
  ; 16-pixel-wide variant: eight loop iterations, each handling two rows of
  ; 32 bytes (two 16-byte halves per row).
  176. cglobal %1_h264_qpel16_mc00_10, 3,4
  177. mov r3d, 8
  178. .loop:
  179. movu m0, [r1 ]
  180. movu m1, [r1 +16]
  181. OP_MOV [r0 ], m0
  182. OP_MOV [r0 +16], m1
  183. movu m0, [r1+r2 ]
  184. movu m1, [r1+r2+16]
  185. OP_MOV [r0+r2 ], m0
  186. OP_MOV [r0+r2+16], m1
  187. lea r0, [r0+r2*2]
  188. lea r1, [r1+r2*2]
  189. dec r3d
  190. jg .loop
  191. REP_RET
  192. %endmacro
  ; Emit the mc00 functions twice: once storing directly (put) and once
  ; averaging with the existing destination (avg).
  193. %define OP_MOV mova
  194. MC00 put
  195. %define OP_MOV AVG_MOV
  196. MC00 avg
  197. ;-----------------------------------------------------------------------------
  198. ; void h264_qpel_mc20(uint8_t *dst, uint8_t *src, int stride)
  199. ;-----------------------------------------------------------------------------
  ; MC_CACHE generator
  ; Like MC, but the size-8 case is instantiated three times per operation:
  ; with "sse2, cache64", with "ssse3, cache64" and with plain sse2. The size-4
  ; case is instantiated once with mmxext. OP_MOV is mova for put and AVG_MOV
  ; for avg.
  200. %macro MC_CACHE 1
  201. %define OP_MOV mova
  202. INIT_MMX mmxext
  203. %1 put, 4
  204. INIT_XMM sse2, cache64
  205. %1 put, 8
  206. INIT_XMM ssse3, cache64
  207. %1 put, 8
  208. INIT_XMM sse2
  209. %1 put, 8
  210. %define OP_MOV AVG_MOV
  211. INIT_MMX mmxext
  212. %1 avg, 4
  213. INIT_XMM sse2, cache64
  214. %1 avg, 8
  215. INIT_XMM ssse3, cache64
  216. %1 avg, 8
  217. INIT_XMM sse2
  218. %1 avg, 8
  219. %endmacro
  ; MC20 op, size
  ; Horizontal 6-tap filter, one row per loop iteration.
  ; r0 = dst, r1 = src, r2 = stride; r3d counts the remaining rows.
  ; m1 holds pw_pixel_max for the whole loop; p16 is pw_16, kept in m8 when
  ; more than 8 vector registers exist and read from memory otherwise.
  ; Per row: m2 = a (outer tap sum), m3 = b, m4 = c (centre tap sum), then
  ; FILT_H, ">> 1", CLIPW to [0, pixel_max] and OP_MOV to [r0].
  ; NOTE(review): this macro is declared with 2 parameters, so %0 is always 2
  ; and the "%if %0 == 4" branch (plain movu/ADDW loads) is never assembled;
  ; the PALIGNR-based paths are always the ones used.
  220. %macro MC20 2
  221. cglobal_mc %1, mc20, %2, 3,4,9
  222. mov r3d, %2
  223. mova m1, [pw_pixel_max]
  224. %if num_mmregs > 8
  225. mova m8, [pw_16]
  226. %define p16 m8
  227. %else
  228. %define p16 [pw_16]
  229. %endif
  230. .nextrow:
  231. %if %0 == 4
  232. movu m2, [r1-4]
  233. movu m3, [r1-2]
  234. movu m4, [r1+0]
  235. ADDW m2, [r1+6], m5
  236. ADDW m3, [r1+4], m5
  237. ADDW m4, [r1+2], m5
  238. %else ; movu is slow on these processors
  239. %if mmsize==16
  ; 16-byte registers: two unaligned loads ([r1-4] and [r1+6]); the remaining
  ; tap positions are derived in registers with PALIGNR (helper macro, not
  ; shown; m5 is passed as its scratch register).
  240. movu m2, [r1-4]
  241. movu m0, [r1+6]
  242. mova m6, m0
  243. psrldq m0, 6
  244. paddw m6, m2 ; a = [r1-4] + [r1+6]
  245. PALIGNR m3, m0, m2, 2, m5
  246. PALIGNR m7, m0, m2, 8, m5
  247. paddw m3, m7 ; b = (byte offset -2) + (byte offset +4)
  248. PALIGNR m4, m0, m2, 4, m5
  249. PALIGNR m7, m0, m2, 6, m5
  250. paddw m4, m7 ; c = (byte offset 0) + (byte offset +2)
  251. SWAP 2, 6 ; make m2 name the register holding a
  252. %else
  ; 8-byte registers: loads [r1-4] and [r1+4], PALIGNR for the shifted
  ; positions, and [r1+6] added straight from memory.
  253. movu m2, [r1-4]
  254. movu m6, [r1+4]
  255. PALIGNR m3, m6, m2, 2, m5
  256. paddw m3, m6 ; b
  257. PALIGNR m4, m6, m2, 4, m5
  258. PALIGNR m7, m6, m2, 6, m5
  259. paddw m4, m7 ; c
  260. paddw m2, [r1+6] ; a
  261. %endif
  262. %endif
  263. FILT_H m2, m3, m4, p16
  264. psraw m2, 1
  265. pxor m0, m0 ; lower clip bound (zero)
  266. CLIPW m2, m0, m1
  267. OP_MOV [r0], m2
  268. add r0, r2
  269. add r1, r2
  270. dec r3d
  271. jg .nextrow
  272. rep ret
  273. %endmacro
  274. MC_CACHE MC20
  275. ;-----------------------------------------------------------------------------
  276. ; void h264_qpel_mc30(uint8_t *dst, uint8_t *src, int stride)
  277. ;-----------------------------------------------------------------------------
  ; MC30 op, size
  ; Sets r4 = r1 + 2 bytes (one pixel to the right of src) and jumps into the
  ; ".body" label of the matching mc10 stub, which averages the horizontal
  ; filter result with the samples at r4.
  ; The jump target is defined by the MC10 instantiation further down the file.
  278. %macro MC30 2
  279. cglobal_mc %1, mc30, %2, 3,5,9
  280. lea r4, [r1+2]
  281. jmp stub_%1_h264_qpel%2_mc10_10 %+ SUFFIX %+ .body
  282. %endmacro
  283. MC_CACHE MC30
  284. ;-----------------------------------------------------------------------------
  285. ; void h264_qpel_mc10(uint8_t *dst, uint8_t *src, int stride)
  286. ;-----------------------------------------------------------------------------
  ; MC10 op, size
  ; Horizontal 6-tap filter (same computation as MC20) whose clipped result is
  ; averaged (pavgw) with the samples at r4 before OP_MOV stores it.
  ; r0 = dst, r1 = src, r2 = stride, r4 = pointer to the samples to average
  ; with (src here; MC30 enters at .body with r4 = src + 2).
  ; r3d counts rows, m1 = pw_pixel_max, p16 = pw_16 (register or memory).
  ; NOTE(review): as in MC20, the macro takes 2 parameters so the
  ; "%if %0 == 4" branch is never assembled.
  287. %macro MC10 2
  288. cglobal_mc %1, mc10, %2, 3,5,9
  289. mov r4, r1
  290. .body:
  291. mov r3d, %2
  292. mova m1, [pw_pixel_max]
  293. %if num_mmregs > 8
  294. mova m8, [pw_16]
  295. %define p16 m8
  296. %else
  297. %define p16 [pw_16]
  298. %endif
  299. .nextrow:
  300. %if %0 == 4
  301. movu m2, [r1-4]
  302. movu m3, [r1-2]
  303. movu m4, [r1+0]
  304. ADDW m2, [r1+6], m5
  305. ADDW m3, [r1+4], m5
  306. ADDW m4, [r1+2], m5
  307. %else ; movu is slow on these processors
  308. %if mmsize==16
  ; 16-byte registers: two loads, remaining tap positions built with PALIGNR.
  309. movu m2, [r1-4]
  310. movu m0, [r1+6]
  311. mova m6, m0
  312. psrldq m0, 6
  313. paddw m6, m2 ; a = [r1-4] + [r1+6]
  314. PALIGNR m3, m0, m2, 2, m5
  315. PALIGNR m7, m0, m2, 8, m5
  316. paddw m3, m7 ; b
  317. PALIGNR m4, m0, m2, 4, m5
  318. PALIGNR m7, m0, m2, 6, m5
  319. paddw m4, m7 ; c
  320. SWAP 2, 6 ; make m2 name the register holding a
  321. %else
  ; 8-byte registers.
  322. movu m2, [r1-4]
  323. movu m6, [r1+4]
  324. PALIGNR m3, m6, m2, 2, m5
  325. paddw m3, m6 ; b
  326. PALIGNR m4, m6, m2, 4, m5
  327. PALIGNR m7, m6, m2, 6, m5
  328. paddw m4, m7 ; c
  329. paddw m2, [r1+6] ; a
  330. %endif
  331. %endif
  332. FILT_H m2, m3, m4, p16
  333. psraw m2, 1
  334. pxor m0, m0 ; lower clip bound (zero)
  335. CLIPW m2, m0, m1
  336. movu m3, [r4]
  337. pavgw m2, m3 ; average filtered row with the samples at r4
  338. OP_MOV [r0], m2
  339. add r0, r2
  340. add r1, r2
  341. add r4, r2
  342. dec r3d
  343. jg .nextrow
  344. rep ret
  345. %endmacro
  346. MC_CACHE MC10
  347. ;-----------------------------------------------------------------------------
  348. ; void h264_qpel_mc02(uint8_t *dst, uint8_t *src, int stride)
  349. ;-----------------------------------------------------------------------------
  ; V_FILT m0..m7 (8 register names), size, index
  ; Emits the helper function v_filt<size>_<index>_10, which produces one row
  ; of the vertical 6-tap filter with FILT_V.
  ; Only %9 (size) and %10 (index) are referenced; the body names m0..m7
  ; directly, so the register assignment is whatever permutation is active
  ; where the macro is expanded.
  ; Entry points:
  ;   v_filt<size>_<index>_10            advances r4 by the stride (r2) first
  ;   v_filt<size>_<index>_10.no_addr4   skips that r4 update
  ; In:  m0..m4 = five rows, r1 = pointer to the sixth row, r2 = stride.
  ; Out: m0 = filtered and clipped row, m5 = the row loaded from [r1];
  ;      r1 and r0 are both advanced by r2.
  ; Clobbers m6, m7.
  350. %macro V_FILT 10
  ; The label carries an explicit colon, like the other helper labels in this
  ; file, so it is not an "orphan" label for the assembler.
  351. v_filt%9_%10_10:
  352. add r4, r2
  353. .no_addr4:
  354. FILT_V m0, m1, m2, m3, m4, m5, m6, m7
  355. add r1, r2
  356. add r0, r2
  357. ret
  358. %endmacro
  ; Emit the v_filt helpers: indices 0..3 for 8-byte registers (size 4) and
  ; 0..5 for 16-byte registers (size 8). After each one the names m0..m5 are
  ; rotated with SWAP, so helper <i> is assembled for the register naming in
  ; effect after i rotations. The callers (MC02/MC01/MC11) apply the same SWAP
  ; after every row and call index (row % 6).
  359. INIT_MMX mmxext
  360. RESET_MM_PERMUTATION
  361. %assign i 0
  362. %rep 4
  363. V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 4, i
  364. SWAP 0,1,2,3,4,5
  365. %assign i i+1
  366. %endrep
  367. INIT_XMM sse2
  368. RESET_MM_PERMUTATION
  369. %assign i 0
  370. %rep 6
  371. V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 8, i
  372. SWAP 0,1,2,3,4,5
  373. %assign i i+1
  374. %endrep
  ; MC02 op, size
  ; Vertical 6-tap filter. r0 = dst, r1 = src, r2 = stride.
  ; PRELOAD_V loads the first five rows; the row loop is fully unrolled (%2
  ; rows). Each row calls the .no_addr4 entry of v_filt<size>_<row % 6>_10 and
  ; stores m0 with OP_MOV, then rotates the register names with SWAP.
  375. %macro MC02 2
  376. cglobal_mc %1, mc02, %2, 3,4,8
  377. PRELOAD_V
  ; v_filt advances r0 by the stride before returning, so start one row early.
  378. sub r0, r2
  379. %assign j 0
  380. %rep %2
  381. %assign i (j % 6)
  382. call v_filt%2_ %+ i %+ _10.no_addr4
  383. OP_MOV [r0], m0
  384. SWAP 0,1,2,3,4,5
  385. %assign j j+1
  386. %endrep
  387. ret
  388. %endmacro
  389. MC MC02
  390. ;-----------------------------------------------------------------------------
  391. ; void h264_qpel_mc01(uint8_t *dst, uint8_t *src, int stride)
  392. ;-----------------------------------------------------------------------------
  ; MC01 op, size
  ; Vertical 6-tap filter averaged (pavgw) with the samples at r4.
  ; r0 = dst, r1 = src, r2 = stride, r4 = pointer to the samples to average
  ; with (src here; MC03 enters at .body with r4 = src + stride).
  ; The row loop is unrolled %2 times; m7 is used as scratch for the r4 row.
  393. %macro MC01 2
  394. cglobal_mc %1, mc01, %2, 3,5,8
  395. mov r4, r1
  396. .body:
  397. PRELOAD_V
  ; v_filt adds the stride to r4 and r0 on every call, so both start one row
  ; early.
  398. sub r4, r2
  399. sub r0, r2
  400. %assign j 0
  401. %rep %2
  402. %assign i (j % 6)
  403. call v_filt%2_ %+ i %+ _10
  404. movu m7, [r4]
  405. pavgw m0, m7
  406. OP_MOV [r0], m0
  407. SWAP 0,1,2,3,4,5
  408. %assign j j+1
  409. %endrep
  410. ret
  411. %endmacro
  412. MC MC01
  413. ;-----------------------------------------------------------------------------
  414. ; void h264_qpel_mc03(uint8_t *dst, uint8_t *src, int stride)
  415. ;-----------------------------------------------------------------------------
  ; MC03 op, size
  ; Sets r4 = r1 + r2 (the row below src) and jumps into the ".body" label of
  ; the matching mc01 stub, so the vertical filter result is averaged with
  ; that row instead of the src row.
  416. %macro MC03 2
  417. cglobal_mc %1, mc03, %2, 3,5,8
  418. lea r4, [r1+r2]
  419. jmp stub_%1_h264_qpel%2_mc01_10 %+ SUFFIX %+ .body
  420. %endmacro
  421. MC MC03
  422. ;-----------------------------------------------------------------------------
  423. ; void h264_qpel_mc11(uint8_t *dst, uint8_t *src, int stride)
  424. ;-----------------------------------------------------------------------------
  ; H_FILT_AVG size, index [, flag]
  ; Emits the helper function h_filt<size>_<index>_10.
  ; In:  m0 = vertical filter result for the row, r4 = pointer to the row used
  ;      for the horizontal filter, r1/r5 as set up by MC11 (r5 = -stride).
  ; Out: m0 = pavgw(m0, clipped horizontal filter result).
  ; Clobbers m5, m6, m7; m5 is then re-read from [r1+r5].
  ; NOTE(review): the macro accepts 2-3 parameters, so %0 is never 4 and the
  ; "%if %0!=4" reload of m5 is always assembled, whether or not the third
  ; argument is passed.
  425. %macro H_FILT_AVG 2-3
  426. h_filt%1_%2_10:
  427. ;FILT_H with fewer registers and averaged with the FILT_V result
  428. ;m6,m7 are tmp registers, m0 is the FILT_V result, the rest are to be used next in the next iteration
  429. ;unfortunately I need three registers, so m5 will have to be re-read from memory
  430. movu m5, [r4-4]
  431. ADDW m5, [r4+6], m7
  432. movu m6, [r4-2]
  433. ADDW m6, [r4+4], m7
  434. paddw m5, [pw_16]
  435. psubw m5, m6 ; a-b
  436. psraw m5, 2 ; (a-b)/4
  437. psubw m5, m6 ; (a-b)/4-b
  438. movu m6, [r4+0]
  439. ADDW m6, [r4+2], m7
  440. paddw m5, m6 ; (a-b)/4-b+c
  441. psraw m5, 2 ; ((a-b)/4-b+c)/4
  442. paddw m5, m6 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
  443. psraw m5, 1
  444. CLIPW m5, [pb_0], [pw_pixel_max]
  445. ;avg FILT_V, FILT_H
  446. pavgw m0, m5
  447. %if %0!=4
  448. movu m5, [r1+r5]
  449. %endif
  450. ret
  451. %endmacro
  ; Emit the h_filt helpers with the same rotating register naming as the
  ; v_filt helpers: indices 0..3 for 8-byte registers and 0..5 for 16-byte
  ; registers. The last 8-byte instance (index 3) and the 16-byte instance with
  ; index 1 are invoked with a third argument.
  ; NOTE(review): with the macro body as written, the third argument does not
  ; change the generated code -- confirm whether that is intended.
  452. INIT_MMX mmxext
  453. RESET_MM_PERMUTATION
  454. %assign i 0
  455. %rep 3
  456. H_FILT_AVG 4, i
  457. SWAP 0,1,2,3,4,5
  458. %assign i i+1
  459. %endrep
  460. H_FILT_AVG 4, i, 0
  461. INIT_XMM sse2
  462. RESET_MM_PERMUTATION
  463. %assign i 0
  464. %rep 6
  465. %if i==1
  466. H_FILT_AVG 8, i, 0
  467. %else
  468. H_FILT_AVG 8, i
  469. %endif
  470. SWAP 0,1,2,3,4,5
  471. %assign i i+1
  472. %endrep
  ; MC11 op, size
  ; Average of a vertical and a horizontal 6-tap filter result.
  ; r0 = dst, r1 = pointer for the vertical filter, r4 = pointer for the
  ; horizontal filter, r2 = stride, r5 = -stride (used by h_filt to re-read
  ; m5). MC31/MC13/MC33 enter at .body with r1/r4 offset by one pixel and/or
  ; one row.
  ; Each unrolled row calls v_filt<size>_<i>_10 (vertical result in m0, r4
  ; advanced) and then h_filt<size>_<i>_10 (m0 averaged with the horizontal
  ; result), with i = row % 6.
  473. %macro MC11 2
  474. ; this REALLY needs x86_64
  475. cglobal_mc %1, mc11, %2, 3,6,8
  476. mov r4, r1
  477. .body:
  478. PRELOAD_V
  ; v_filt adds the stride to r0 and r4 on every call, so both start one row
  ; early.
  479. sub r0, r2
  480. sub r4, r2
  481. mov r5, r2
  482. neg r5
  483. %assign j 0
  484. %rep %2
  485. %assign i (j % 6)
  486. call v_filt%2_ %+ i %+ _10
  487. call h_filt%2_ %+ i %+ _10
  ; Extra re-read of m5 for the size-8, i == 1 rows, in addition to the one
  ; performed inside h_filt.
  488. %if %2==8 && i==1
  489. movu m5, [r1+r5]
  490. %endif
  491. OP_MOV [r0], m0
  492. SWAP 0,1,2,3,4,5
  493. %assign j j+1
  494. %endrep
  495. ret
  496. %endmacro
  497. MC MC11
  498. ;-----------------------------------------------------------------------------
  499. ; void h264_qpel_mc31(uint8_t *dst, uint8_t *src, int stride)
  500. ;-----------------------------------------------------------------------------
  ; MC31 op, size
  ; Keeps r4 = src for the horizontal filter, moves r1 one pixel (2 bytes) to
  ; the right for the vertical filter, then jumps into the mc11 ".body".
  501. %macro MC31 2
  502. cglobal_mc %1, mc31, %2, 3,6,8
  503. mov r4, r1
  504. add r1, 2
  505. jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
  506. %endmacro
  507. MC MC31
  508. ;-----------------------------------------------------------------------------
  509. ; void h264_qpel_mc13(uint8_t *dst, uint8_t *src, int stride)
  510. ;-----------------------------------------------------------------------------
  ; MC13 op, size
  ; Sets r4 = r1 + r2 (the row below src) for the horizontal filter, leaves r1
  ; at src for the vertical filter, then jumps into the mc11 ".body".
  ; The register counts passed to cglobal_mc match MC11/MC31/MC33 (6 GPRs,
  ; 8 vector registers): this stub and the mc11 body it jumps into only touch
  ; r0-r5 and m0-m7, so nothing beyond that needs to be reserved.
  511. %macro MC13 2
  512. cglobal_mc %1, mc13, %2, 3,6,8
  513. lea r4, [r1+r2]
  514. jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
  515. %endmacro
  516. MC MC13
  517. ;-----------------------------------------------------------------------------
  518. ; void h264_qpel_mc33(uint8_t *dst, uint8_t *src, int stride)
  519. ;-----------------------------------------------------------------------------
  ; MC33 op, size
  ; Sets r4 = r1 + r2 (the row below src) for the horizontal filter and moves
  ; r1 one pixel (2 bytes) to the right for the vertical filter, then jumps
  ; into the mc11 ".body".
  520. %macro MC33 2
  521. cglobal_mc %1, mc33, %2, 3,6,8
  522. lea r4, [r1+r2]
  523. add r1, 2
  524. jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
  525. %endmacro
  526. MC MC33
  527. ;-----------------------------------------------------------------------------
  528. ; void h264_qpel_mc22(uint8_t *dst, uint8_t *src, int stride)
  529. ;-----------------------------------------------------------------------------
  ; FILT_H2 a, b, c
  ; Unrounded, unshifted 6-tap sum: %1 = a - 5*b + 20*c (word arithmetic).
  ; Unlike FILT_H this modifies %2 and %3 as well.
  530. %macro FILT_H2 3
  531. psubw %1, %2 ; a-b
  532. psubw %2, %3 ; b-c
  533. psllw %2, 2
  534. psubw %1, %2 ; a-5*b+4*c
  535. psllw %3, 4
  536. paddw %1, %3 ; a-5*b+20*c
  537. %endmacro
  ; FILT_VNRD r0, r1, r2, r3, r4, r5, t0, t1
  ; Same tap-sum setup as FILT_V, but finishes with FILT_H2: the result in %1
  ; is the raw a - 5*b + 20*c with no rounding, shift or clipping.
  ; In:  %1..%5 = five rows, r1 (GPR) points at the sixth row.
  ; Out: %1 = raw filter sum, %6 = the newly loaded row.
  ; Clobbers %7 and %8.
  538. %macro FILT_VNRD 8
  539. movu %6, [r1]
  540. paddw %1, %6 ; a = row0 + row5
  541. mova %7, %2
  542. paddw %7, %5 ; b = row1 + row4
  543. mova %8, %3
  544. paddw %8, %4 ; c = row2 + row3
  545. FILT_H2 %1, %7, %8
  546. %endmacro
  ; HV size
  ; Emits put_hv<size>_10: the vertical pass of the 2-D filter. It writes raw
  ; vertical filter sums (FILT_VNRD result minus pad20) for COUNT adjacent
  ; register-wide column groups into a buffer on the caller's stack.
  ; In:  r1 = src, r2 = stride, rsp = caller's scratch buffer (below the
  ;      return address pushed by the call).
  ; Buffer layout: row i of column group g is stored at
  ;      caller_rsp + PAD + g*mmsize + i*mmsize*3   (unaligned stores).
  ; Clobbers r1, r3d, r4, m0-m7; r2 is negated during the routine and restored
  ; before returning.
  ; PAD/COUNT here are local to this helper; later macros redefine PAD.
  547. %macro HV 1
  548. %if mmsize==16
  549. %define PAD 12
  550. %define COUNT 2
  551. %else
  552. %define PAD 4
  553. %define COUNT 3
  554. %endif
  555. put_hv%1_10:
  556. neg r2 ; This actually saves instructions
  ; r2 is now -stride: r1 = src - 2*stride - mmsize + PAD bytes.
  557. lea r1, [r1+r2*2-mmsize+PAD]
  ; gprsize skips the return address so r4 is relative to the caller's rsp.
  558. lea r4, [rsp+PAD+gprsize]
  559. mov r3d, COUNT
  560. .v_loop:
  ; Load five consecutive rows ("sub r1, r2" steps down one row because r2 is
  ; negated).
  561. movu m0, [r1]
  562. sub r1, r2
  563. movu m1, [r1]
  564. sub r1, r2
  565. movu m2, [r1]
  566. sub r1, r2
  567. movu m3, [r1]
  568. sub r1, r2
  569. movu m4, [r1]
  570. sub r1, r2
  571. %assign i 0
  ; Rows 0 .. size-2: filter, remove the pad20 bias, store, advance one row.
  572. %rep %1-1
  573. FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
  574. psubw m0, [pad20]
  575. movu [r4+i*mmsize*3], m0
  576. sub r1, r2
  577. SWAP 0,1,2,3,4,5
  578. %assign i i+1
  579. %endrep
  ; Last row: same, but without advancing r1.
  580. FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
  581. psubw m0, [pad20]
  582. movu [r4+i*mmsize*3], m0
  ; Next column group: move right by mmsize in both buffer and source, and
  ; rewind r1 by the size+4 rows advanced above (8 rows, plus 4 more for
  ; size 8).
  583. add r4, mmsize
  584. lea r1, [r1+r2*8+mmsize]
  585. %if %1==8
  586. lea r1, [r1+r2*4]
  587. %endif
  588. dec r3d
  589. jg .v_loop
  590. neg r2
  591. ret
  592. %endmacro
  593. INIT_MMX mmxext
  594. HV 4
  595. INIT_XMM sse2
  596. HV 8
  ; H_LOOP size
  ; Emits h<size>_loop_op: the horizontal pass over one row of the buffer
  ; written by put_hv.
  ; In:  r1 = start of the current buffer row, m7 = upper clip bound.
  ;      With more than 8 vector registers: m0 = lower clip bound and
  ;      m8..m11 = tap1, tap2, tap3, depad (all loaded by the caller).
  ; Out: m1 = one register of clipped output words; r1 advanced by mmsize*3
  ;      (one buffer row).
  ; Clobbers m2-m6, and m0 when only 8 vector registers exist (it is used as
  ; a temporary and zeroed again before the clip).
  ; m1/m3/m5 and m2/m4/m6 are two pmaddwd chains whose loads are offset by one
  ; word from each other; their dword results are repacked into alternating
  ; words of m1.
  597. %macro H_LOOP 1
  598. %if num_mmregs > 8
  599. %define s1 m8
  600. %define s2 m9
  601. %define s3 m10
  602. %define d1 m11
  603. %else
  604. %define s1 [tap1]
  605. %define s2 [tap2]
  606. %define s3 [tap3]
  607. %define d1 [depad]
  608. %endif
  609. h%1_loop_op:
  610. movu m1, [r1+mmsize-4]
  611. movu m2, [r1+mmsize-2]
  612. mova m3, [r1+mmsize+0]
  613. movu m4, [r1+mmsize+2]
  614. movu m5, [r1+mmsize+4]
  615. movu m6, [r1+mmsize+6]
  616. %if num_mmregs > 8
  617. pmaddwd m1, s1
  618. pmaddwd m2, s1
  619. pmaddwd m3, s2
  620. pmaddwd m4, s2
  621. pmaddwd m5, s3
  622. pmaddwd m6, s3
  623. paddd m1, d1
  624. paddd m2, d1
  625. %else
  ; Constants live in memory here; each is staged through m0.
  626. mova m0, s1
  627. pmaddwd m1, m0
  628. pmaddwd m2, m0
  629. mova m0, s2
  630. pmaddwd m3, m0
  631. pmaddwd m4, m0
  632. mova m0, s3
  633. pmaddwd m5, m0
  634. pmaddwd m6, m0
  635. mova m0, d1
  636. paddd m1, m0
  637. paddd m2, m0
  638. %endif
  ; Sum the three partial products of each chain, then scale.
  639. paddd m3, m5
  640. paddd m4, m6
  641. paddd m1, m3
  642. paddd m2, m4
  643. psrad m1, 10
  644. psrad m2, 10
  ; Interleave: m2 results go to the high word, m1 results to the low word of
  ; each dword.
  645. pslld m2, 16
  646. pand m1, [pd_0f]
  647. por m1, m2
  648. %if num_mmregs <= 8
  649. pxor m0, m0
  650. %endif
  651. CLIPW m1, m0, m7
  652. add r1, mmsize*3
  653. ret
  654. %endmacro
  655. INIT_MMX mmxext
  656. H_LOOP 4
  657. INIT_XMM sse2
  658. H_LOOP 8
  ; MC22 op, size
  ; 2-D filtered case: put_hv<size>_10 fills a stack buffer with vertical filter
  ; sums, then h<size>_loop_op is run once per output row and its result (m1)
  ; is stored through OP_MOV.
  ; r0 = dst, r1 = src, r2 = stride. r6 holds the original rsp while the
  ; aligned scratch buffer of PAD bytes is in use; r3d counts rows.
  659. %macro MC22 2
  660. cglobal_mc %1, mc22, %2, 3,7,12
  661. %define PAD mmsize*8*4*2 ; SIZE*16*4*sizeof(pixel)
  662. mov r6, rsp ; backup stack pointer
  663. and rsp, ~(mmsize-1) ; align stack
  664. sub rsp, PAD
  665. call put_hv%2_10
  666. mov r3d, %2
  667. mova m7, [pw_pixel_max]
  ; With extra registers available, preload what h_loop_op expects in
  ; m0 and m8..m11.
  668. %if num_mmregs > 8
  669. pxor m0, m0
  670. mova m8, [tap1]
  671. mova m9, [tap2]
  672. mova m10, [tap3]
  673. mova m11, [depad]
  674. %endif
  675. mov r1, rsp ; r1 = first buffer row
  676. .h_loop:
  677. call h%2_loop_op
  678. OP_MOV [r0], m1
  679. add r0, r2
  680. dec r3d
  681. jg .h_loop
  682. mov rsp, r6 ; restore stack pointer
  683. ret
  684. %endmacro
  685. MC MC22
  686. ;-----------------------------------------------------------------------------
  687. ; void h264_qpel_mc12(uint8_t *dst, uint8_t *src, int stride)
  688. ;-----------------------------------------------------------------------------
  ; MC12 op, size
  ; Average of the 2-D filtered result (as in MC22) and a second value taken
  ; from the stack buffer at byte offset r4 relative to the current row.
  ; r0 = dst, r1 = src, r2 = stride, r6 = saved rsp.
  ; r4 = 0 here; MC32 enters at .body with r4 = 2 and MC21/MC23 with
  ; r4 = PAD-mmsize (their second buffer). Callers that jump to .body must have
  ; set up r6/rsp and filled the buffer(s) themselves.
  ; The second value is rescaled with "+ depad2, >> 5 (logical), - unpad" and
  ; clipped to [0, pixel_max] before the pavgw.
  689. %macro MC12 2
  690. cglobal_mc %1, mc12, %2, 3,7,12
  691. %define PAD mmsize*8*4*2 ; SIZE*16*4*sizeof(pixel)
  692. mov r6, rsp ; backup stack pointer
  693. and rsp, ~(mmsize-1) ; align stack
  694. sub rsp, PAD
  695. call put_hv%2_10
  696. xor r4d, r4d
  697. .body:
  698. mov r3d, %2
  699. pxor m0, m0
  700. mova m7, [pw_pixel_max]
  701. %if num_mmregs > 8
  702. mova m8, [tap1]
  703. mova m9, [tap2]
  704. mova m10, [tap3]
  705. mova m11, [depad]
  706. %endif
  707. mov r1, rsp ; r1 = first buffer row
  708. .h_loop:
  709. call h%2_loop_op
  ; h_loop_op has already advanced r1 by mmsize*3, so r1-2*mmsize is
  ; mmsize bytes into the row that was just processed.
  710. movu m3, [r1+r4-2*mmsize] ; movu needed for mc32, etc
  711. paddw m3, [depad2]
  712. psrlw m3, 5
  713. psubw m3, [unpad]
  714. CLIPW m3, m0, m7
  715. pavgw m1, m3
  716. OP_MOV [r0], m1
  717. add r0, r2
  718. dec r3d
  719. jg .h_loop
  720. mov rsp, r6 ; restore stack pointer
  721. ret
  722. %endmacro
  723. MC MC12
  724. ;-----------------------------------------------------------------------------
  725. ; void h264_qpel_mc32(uint8_t *dst, uint8_t *src, int stride)
  726. ;-----------------------------------------------------------------------------
  ; MC32 op, size
  ; Same stack setup and put_hv call as MC12, then jumps into the mc12 ".body"
  ; with r4 = 2, so the second value is read one pixel (2 bytes) further right
  ; in the buffer row.
  ; NOTE(review): PAD is mmsize*8*3*2 here but mmsize*8*4*2 in MC12/MC22, and
  ; the trailing comment still says "*4" -- confirm which size is intended.
  727. %macro MC32 2
  728. cglobal_mc %1, mc32, %2, 3,7,12
  729. %define PAD mmsize*8*3*2 ; SIZE*16*4*sizeof(pixel)
  730. mov r6, rsp ; backup stack pointer
  731. and rsp, ~(mmsize-1) ; align stack
  732. sub rsp, PAD
  733. call put_hv%2_10
  734. mov r4d, 2 ; sizeof(pixel)
  735. jmp stub_%1_h264_qpel%2_mc12_10 %+ SUFFIX %+ .body
  736. %endmacro
  737. MC MC32
  738. ;-----------------------------------------------------------------------------
  739. ; void h264_qpel_mc21(uint8_t *dst, uint8_t *src, int stride)
  740. ;-----------------------------------------------------------------------------
  ; H_NRD size
  ; Emits put_h<size>_10: writes raw horizontal filter sums (FILT_H2 result
  ; minus pad20) for <size> rows into the caller's stack buffer.
  ; In:  r5 = source row pointer, r2 = stride, rsp = caller's buffer (below the
  ;      return address).
  ; Out: row i stored with an aligned store at caller_rsp + i*mmsize*3.
  ; Clobbers r3d, r4d, m2-m6; r5 is advanced by <size> rows.
  741. %macro H_NRD 1
  742. put_h%1_10:
  ; Temporarily step over the return address so [rsp+r4] addresses the
  ; caller's buffer directly; undone before the ret.
  743. add rsp, gprsize
  744. mov r3d, %1
  745. xor r4d, r4d
  746. mova m6, [pad20]
  747. .nextrow:
  748. movu m2, [r5-4]
  749. movu m3, [r5-2]
  750. movu m4, [r5+0]
  751. ADDW m2, [r5+6], m5 ; a = [r5-4] + [r5+6]
  752. ADDW m3, [r5+4], m5 ; b = [r5-2] + [r5+4]
  753. ADDW m4, [r5+2], m5 ; c = [r5+0] + [r5+2]
  754. FILT_H2 m2, m3, m4
  755. psubw m2, m6
  756. mova [rsp+r4], m2
  757. add r4d, mmsize*3
  758. add r5, r2
  759. dec r3d
  760. jg .nextrow
  761. sub rsp, gprsize
  762. ret
  763. %endmacro
  764. INIT_MMX mmxext
  765. H_NRD 4
  766. INIT_XMM sse2
  767. H_NRD 8
  ; MC21 op, size
  ; Builds two stack buffers and reuses the mc12 ".body" loop.
  ; r0 = dst, r1 = src, r2 = stride, r5 = source row for the horizontal-only
  ; pass (src here; MC23 enters at .body with r5 = src + stride).
  ;  1. Align rsp, reserve PAD bytes, put_h<size>_10 fills them from r5.
  ;  2. Reserve another PAD bytes below, put_hv<size>_10 fills them from r1.
  ;  3. r4 = PAD-mmsize, so the "[r1+r4-2*mmsize]" load in the mc12 body lands
  ;     in the first (put_h) buffer, PAD bytes above the put_hv buffer.
  ; r6 keeps the original rsp; the mc12 body restores it before returning.
  768. %macro MC21 2
  769. cglobal_mc %1, mc21, %2, 3,7,12
  770. mov r5, r1
  771. .body:
  772. %define PAD mmsize*8*3*2 ; SIZE*16*4*sizeof(pixel)
  773. mov r6, rsp ; backup stack pointer
  774. and rsp, ~(mmsize-1) ; align stack
  775. sub rsp, PAD
  776. call put_h%2_10
  777. sub rsp, PAD
  778. call put_hv%2_10
  779. mov r4d, PAD-mmsize ; H buffer
  780. jmp stub_%1_h264_qpel%2_mc12_10 %+ SUFFIX %+ .body
  781. %endmacro
  782. MC MC21
  783. ;-----------------------------------------------------------------------------
  784. ; void h264_qpel_mc23(uint8_t *dst, uint8_t *src, int stride)
  785. ;-----------------------------------------------------------------------------
  ; MC23 op, size
  ; Sets r5 = r1 + r2 (the row below src) as the source for the
  ; horizontal-only pass and jumps into the mc21 ".body".
  786. %macro MC23 2
  787. cglobal_mc %1, mc23, %2, 3,7,12
  788. lea r5, [r1+r2]
  789. jmp stub_%1_h264_qpel%2_mc21_10 %+ SUFFIX %+ .body
  790. %endmacro
  791. MC MC23