  1. ;******************************************************************************
  2. ;* H.264 intra prediction asm optimizations
  3. ;* Copyright (c) 2010 Jason Garrett-Glaser
  4. ;* Copyright (c) 2010 Holger Lubitz
  5. ;* Copyright (c) 2010 Loren Merritt
  6. ;* Copyright (c) 2010 Ronald S. Bultje
  7. ;*
  8. ;* This file is part of Libav.
  9. ;*
  10. ;* Libav is free software; you can redistribute it and/or
  11. ;* modify it under the terms of the GNU Lesser General Public
  12. ;* License as published by the Free Software Foundation; either
  13. ;* version 2.1 of the License, or (at your option) any later version.
  14. ;*
  15. ;* Libav is distributed in the hope that it will be useful,
  16. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  17. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  18. ;* Lesser General Public License for more details.
  19. ;*
  20. ;* You should have received a copy of the GNU Lesser General Public
  21. ;* License along with Libav; if not, write to the Free Software
  22. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23. ;******************************************************************************
  24. %include "x86inc.asm"
  25. %include "x86util.asm"
  26. SECTION_RODATA
  27. tm_shuf: times 8 db 0x03, 0x80
  28. pw_ff00: times 8 dw 0xff00
  29. plane_shuf: db -8, -7, -6, -5, -4, -3, -2, -1
  30. db 1, 2, 3, 4, 5, 6, 7, 8
  31. plane8_shuf: db -4, -3, -2, -1, 0, 0, 0, 0
  32. db 1, 2, 3, 4, 0, 0, 0, 0
  33. pw_0to7: dw 0, 1, 2, 3, 4, 5, 6, 7
  34. pw_1to8: dw 1, 2, 3, 4, 5, 6, 7, 8
  35. pw_m8tom1: dw -8, -7, -6, -5, -4, -3, -2, -1
  36. pw_m4to4: dw -4, -3, -2, -1, 1, 2, 3, 4
  37. SECTION .text
  38. cextern pb_1
  39. cextern pb_3
  40. cextern pw_4
  41. cextern pw_5
  42. cextern pw_8
  43. cextern pw_16
  44. cextern pw_17
  45. cextern pw_32
  46. ;-----------------------------------------------------------------------------
  47. ; void pred16x16_vertical(uint8_t *src, int stride)
  48. ;-----------------------------------------------------------------------------
  49. cglobal pred16x16_vertical_mmx, 2,3
  50. sub r0, r1
  51. mov r2, 8
  52. movq mm0, [r0+0]
  53. movq mm1, [r0+8]
  54. .loop:
  55. movq [r0+r1*1+0], mm0
  56. movq [r0+r1*1+8], mm1
  57. movq [r0+r1*2+0], mm0
  58. movq [r0+r1*2+8], mm1
  59. lea r0, [r0+r1*2]
  60. dec r2
  61. jg .loop
  62. REP_RET
  63. cglobal pred16x16_vertical_sse, 2,3
  64. sub r0, r1
  65. mov r2, 4
  66. movaps xmm0, [r0]
  67. .loop:
  68. movaps [r0+r1*1], xmm0
  69. movaps [r0+r1*2], xmm0
  70. lea r0, [r0+r1*2]
  71. movaps [r0+r1*1], xmm0
  72. movaps [r0+r1*2], xmm0
  73. lea r0, [r0+r1*2]
  74. dec r2
  75. jg .loop
  76. REP_RET
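; Note: both versions above simply replicate the 16 pixels directly above the
; block into all 16 rows. An illustrative C sketch of the same computation
; (hypothetical reference, not part of this file):
;   for (int y = 0; y < 16; y++)
;       memcpy(src + y * stride, src - stride, 16);
; The MMX variant stores two 8-byte halves per row; the SSE variant keeps the
; whole 16-byte row in a single register.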
  77. ;-----------------------------------------------------------------------------
  78. ; void pred16x16_horizontal(uint8_t *src, int stride)
  79. ;-----------------------------------------------------------------------------
  80. %macro PRED16x16_H 0
  81. cglobal pred16x16_horizontal, 2,3
  82. mov r2, 8
  83. %if cpuflag(ssse3)
  84. mova m2, [pb_3]
  85. %endif
  86. .loop:
  87. movd m0, [r0+r1*0-4]
  88. movd m1, [r0+r1*1-4]
  89. %if cpuflag(ssse3)
  90. pshufb m0, m2
  91. pshufb m1, m2
  92. %else
  93. punpcklbw m0, m0
  94. punpcklbw m1, m1
  95. SPLATW m0, m0, 3
  96. SPLATW m1, m1, 3
  97. mova [r0+r1*0+8], m0
  98. mova [r0+r1*1+8], m1
  99. %endif
  100. mova [r0+r1*0], m0
  101. mova [r0+r1*1], m1
  102. lea r0, [r0+r1*2]
  103. dec r2
  104. jg .loop
  105. REP_RET
  106. %endmacro
  107. INIT_MMX mmx
  108. PRED16x16_H
  109. INIT_MMX mmx2
  110. PRED16x16_H
  111. INIT_XMM ssse3
  112. PRED16x16_H
  113. INIT_XMM
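; Note: horizontal prediction broadcasts each row's left neighbour across the
; row, i.e. src[y*stride + x] = src[y*stride - 1] for x = 0..15. The SSSE3
; variant splats the left pixel (byte 3 of the 4-byte load) with a single
; pshufb through pb_3; the plain MMX variants need punpcklbw + SPLATW and an
; extra pair of 8-byte stores for the right half.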
  114. ;-----------------------------------------------------------------------------
  115. ; void pred16x16_dc(uint8_t *src, int stride)
  116. ;-----------------------------------------------------------------------------
  117. %macro PRED16x16_DC 0
  118. cglobal pred16x16_dc, 2,7
  119. mov r4, r0
  120. sub r0, r1
  121. pxor mm0, mm0
  122. pxor mm1, mm1
  123. psadbw mm0, [r0+0]
  124. psadbw mm1, [r0+8]
  125. dec r0
  126. movzx r5d, byte [r0+r1*1]
  127. paddw mm0, mm1
  128. movd r6d, mm0
  129. lea r0, [r0+r1*2]
  130. %rep 7
  131. movzx r2d, byte [r0+r1*0]
  132. movzx r3d, byte [r0+r1*1]
  133. add r5d, r2d
  134. add r6d, r3d
  135. lea r0, [r0+r1*2]
  136. %endrep
  137. movzx r2d, byte [r0+r1*0]
  138. add r5d, r6d
  139. lea r2d, [r2+r5+16]
  140. shr r2d, 5
  141. %if cpuflag(ssse3)
  142. pxor m1, m1
  143. %endif
  144. SPLATB_REG m0, r2, m1
  145. %if mmsize==8
  146. mov r3d, 8
  147. .loop:
  148. mova [r4+r1*0+0], m0
  149. mova [r4+r1*0+8], m0
  150. mova [r4+r1*1+0], m0
  151. mova [r4+r1*1+8], m0
  152. %else
  153. mov r3d, 4
  154. .loop:
  155. mova [r4+r1*0], m0
  156. mova [r4+r1*1], m0
  157. lea r4, [r4+r1*2]
  158. mova [r4+r1*0], m0
  159. mova [r4+r1*1], m0
  160. %endif
  161. lea r4, [r4+r1*2]
  162. dec r3d
  163. jg .loop
  164. REP_RET
  165. %endmacro
  166. INIT_MMX mmx2
  167. PRED16x16_DC
  168. INIT_XMM sse2
  169. PRED16x16_DC
  170. INIT_XMM ssse3
  171. PRED16x16_DC
  172. INIT_XMM
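; Note: DC prediction fills the block with the rounded mean of its 32
; neighbours; in C terms (illustrative sketch):
;   int dc = 16;
;   for (int i = 0; i < 16; i++)
;       dc += src[i - stride] + src[i * stride - 1];
;   dc >>= 5;
; psadbw against zero sums the 16 top bytes in two 8-byte halves, while the
; scalar %rep loop accumulates the 16 left bytes in r5d/r6d.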
  173. ;-----------------------------------------------------------------------------
  174. ; void pred16x16_tm_vp8(uint8_t *src, int stride)
  175. ;-----------------------------------------------------------------------------
  176. %macro PRED16x16_TM_MMX 0
  177. cglobal pred16x16_tm_vp8, 2,5
  178. sub r0, r1
  179. pxor mm7, mm7
  180. movq mm0, [r0+0]
  181. movq mm2, [r0+8]
  182. movq mm1, mm0
  183. movq mm3, mm2
  184. punpcklbw mm0, mm7
  185. punpckhbw mm1, mm7
  186. punpcklbw mm2, mm7
  187. punpckhbw mm3, mm7
  188. movzx r3d, byte [r0-1]
  189. mov r4d, 16
  190. .loop:
  191. movzx r2d, byte [r0+r1-1]
  192. sub r2d, r3d
  193. movd mm4, r2d
  194. SPLATW mm4, mm4, 0
  195. movq mm5, mm4
  196. movq mm6, mm4
  197. movq mm7, mm4
  198. paddw mm4, mm0
  199. paddw mm5, mm1
  200. paddw mm6, mm2
  201. paddw mm7, mm3
  202. packuswb mm4, mm5
  203. packuswb mm6, mm7
  204. movq [r0+r1+0], mm4
  205. movq [r0+r1+8], mm6
  206. add r0, r1
  207. dec r4d
  208. jg .loop
  209. REP_RET
  210. %endmacro
  211. INIT_MMX mmx
  212. PRED16x16_TM_MMX
  213. INIT_MMX mmx2
  214. PRED16x16_TM_MMX
  215. INIT_MMX
  216. cglobal pred16x16_tm_vp8_sse2, 2,6,6
  217. sub r0, r1
  218. pxor xmm2, xmm2
  219. movdqa xmm0, [r0]
  220. movdqa xmm1, xmm0
  221. punpcklbw xmm0, xmm2
  222. punpckhbw xmm1, xmm2
  223. movzx r4d, byte [r0-1]
  224. mov r5d, 8
  225. .loop:
  226. movzx r2d, byte [r0+r1*1-1]
  227. movzx r3d, byte [r0+r1*2-1]
  228. sub r2d, r4d
  229. sub r3d, r4d
  230. movd xmm2, r2d
  231. movd xmm4, r3d
  232. pshuflw xmm2, xmm2, 0
  233. pshuflw xmm4, xmm4, 0
  234. punpcklqdq xmm2, xmm2
  235. punpcklqdq xmm4, xmm4
  236. movdqa xmm3, xmm2
  237. movdqa xmm5, xmm4
  238. paddw xmm2, xmm0
  239. paddw xmm3, xmm1
  240. paddw xmm4, xmm0
  241. paddw xmm5, xmm1
  242. packuswb xmm2, xmm3
  243. packuswb xmm4, xmm5
  244. movdqa [r0+r1*1], xmm2
  245. movdqa [r0+r1*2], xmm4
  246. lea r0, [r0+r1*2]
  247. dec r5d
  248. jg .loop
  249. REP_RET
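; Note: VP8 "TrueMotion" prediction computes, per pixel,
;   pred[x][y] = clip(top[x] + left[y] - topleft, 0, 255)
; Both loops above keep the top row preloaded as words, splat the scalar
; (left[y] - topleft) difference per row, add, and let packuswb do the clip.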
  250. ;-----------------------------------------------------------------------------
  251. ; void pred16x16_plane(uint8_t *src, int stride)
  252. ;-----------------------------------------------------------------------------
  253. %macro H264_PRED16x16_PLANE 1
  254. cglobal pred16x16_plane_%1, 2,9,7
  255. mov r2, r1 ; +stride
  256. neg r1 ; -stride
  257. movh m0, [r0+r1 -1]
  258. %if mmsize == 8
  259. pxor m4, m4
  260. movh m1, [r0+r1 +3 ]
  261. movh m2, [r0+r1 +8 ]
  262. movh m3, [r0+r1 +12]
  263. punpcklbw m0, m4
  264. punpcklbw m1, m4
  265. punpcklbw m2, m4
  266. punpcklbw m3, m4
  267. pmullw m0, [pw_m8tom1 ]
  268. pmullw m1, [pw_m8tom1+8]
  269. pmullw m2, [pw_1to8 ]
  270. pmullw m3, [pw_1to8 +8]
  271. paddw m0, m2
  272. paddw m1, m3
  273. %else ; mmsize == 16
  274. %if cpuflag(ssse3)
  275. movhps m0, [r0+r1 +8]
  276. pmaddubsw m0, [plane_shuf] ; H coefficients
  277. %else ; sse2
  278. pxor m2, m2
  279. movh m1, [r0+r1 +8]
  280. punpcklbw m0, m2
  281. punpcklbw m1, m2
  282. pmullw m0, [pw_m8tom1]
  283. pmullw m1, [pw_1to8]
  284. paddw m0, m1
  285. %endif
  286. movhlps m1, m0
  287. %endif
  288. paddw m0, m1
  289. %if cpuflag(mmx2)
  290. PSHUFLW m1, m0, 0xE
  291. %elif cpuflag(mmx)
  292. mova m1, m0
  293. psrlq m1, 32
  294. %endif
  295. paddw m0, m1
  296. %if cpuflag(mmx2)
  297. PSHUFLW m1, m0, 0x1
  298. %elif cpuflag(mmx)
  299. mova m1, m0
  300. psrlq m1, 16
  301. %endif
  302. paddw m0, m1 ; sum of H coefficients
  303. lea r4, [r0+r2*8-1]
  304. lea r3, [r0+r2*4-1]
  305. add r4, r2
  306. %if ARCH_X86_64
  307. %define e_reg r8
  308. %else
  309. %define e_reg r0
  310. %endif
  311. movzx e_reg, byte [r3+r2*2 ]
  312. movzx r5, byte [r4+r1 ]
  313. sub r5, e_reg
  314. movzx e_reg, byte [r3+r2 ]
  315. movzx r6, byte [r4 ]
  316. sub r6, e_reg
  317. lea r5, [r5+r6*2]
  318. movzx e_reg, byte [r3+r1 ]
  319. movzx r6, byte [r4+r2*2 ]
  320. sub r6, e_reg
  321. lea r5, [r5+r6*4]
  322. movzx e_reg, byte [r3 ]
  323. %if ARCH_X86_64
  324. movzx r7, byte [r4+r2 ]
  325. sub r7, e_reg
  326. %else
  327. movzx r6, byte [r4+r2 ]
  328. sub r6, e_reg
  329. lea r5, [r5+r6*4]
  330. sub r5, r6
  331. %endif
  332. lea e_reg, [r3+r1*4]
  333. lea r3, [r4+r2*4]
  334. movzx r4, byte [e_reg+r2 ]
  335. movzx r6, byte [r3 ]
  336. sub r6, r4
  337. %if ARCH_X86_64
  338. lea r6, [r7+r6*2]
  339. lea r5, [r5+r6*2]
  340. add r5, r6
  341. %else
  342. lea r5, [r5+r6*4]
  343. lea r5, [r5+r6*2]
  344. %endif
  345. movzx r4, byte [e_reg ]
  346. %if ARCH_X86_64
  347. movzx r7, byte [r3 +r2 ]
  348. sub r7, r4
  349. sub r5, r7
  350. %else
  351. movzx r6, byte [r3 +r2 ]
  352. sub r6, r4
  353. lea r5, [r5+r6*8]
  354. sub r5, r6
  355. %endif
  356. movzx r4, byte [e_reg+r1 ]
  357. movzx r6, byte [r3 +r2*2]
  358. sub r6, r4
  359. %if ARCH_X86_64
  360. add r6, r7
  361. %endif
  362. lea r5, [r5+r6*8]
  363. movzx r4, byte [e_reg+r2*2]
  364. movzx r6, byte [r3 +r1 ]
  365. sub r6, r4
  366. lea r5, [r5+r6*4]
  367. add r5, r6 ; sum of V coefficients
  368. %if ARCH_X86_64 == 0
  369. mov r0, r0m
  370. %endif
  371. %ifidn %1, h264
  372. lea r5, [r5*5+32]
  373. sar r5, 6
  374. %elifidn %1, rv40
  375. lea r5, [r5*5]
  376. sar r5, 6
  377. %elifidn %1, svq3
  378. test r5, r5
  379. lea r6, [r5+3]
  380. cmovs r5, r6
  381. sar r5, 2 ; V/4
  382. lea r5, [r5*5] ; 5*(V/4)
  383. test r5, r5
  384. lea r6, [r5+15]
  385. cmovs r5, r6
  386. sar r5, 4 ; (5*(V/4))/16
  387. %endif
  388. movzx r4, byte [r0+r1 +15]
  389. movzx r3, byte [r3+r2*2 ]
  390. lea r3, [r3+r4+1]
  391. shl r3, 4
  392. movd r1d, m0
  393. movsx r1d, r1w
  394. %ifnidn %1, svq3
  395. %ifidn %1, h264
  396. lea r1d, [r1d*5+32]
  397. %else ; rv40
  398. lea r1d, [r1d*5]
  399. %endif
  400. sar r1d, 6
  401. %else ; svq3
  402. test r1d, r1d
  403. lea r4d, [r1d+3]
  404. cmovs r1d, r4d
  405. sar r1d, 2 ; H/4
  406. lea r1d, [r1d*5] ; 5*(H/4)
  407. test r1d, r1d
  408. lea r4d, [r1d+15]
  409. cmovs r1d, r4d
  410. sar r1d, 4 ; (5*(H/4))/16
  411. %endif
  412. movd m0, r1d
  413. add r1d, r5d
  414. add r3d, r1d
  415. shl r1d, 3
  416. sub r3d, r1d ; a
  417. movd m1, r5d
  418. movd m3, r3d
  419. SPLATW m0, m0, 0 ; H
  420. SPLATW m1, m1, 0 ; V
  421. SPLATW m3, m3, 0 ; a
  422. %ifidn %1, svq3
  423. SWAP 0, 1
  424. %endif
  425. mova m2, m0
  426. %if mmsize == 8
  427. mova m5, m0
  428. %endif
  429. pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
  430. %if mmsize == 16
  431. psllw m2, 3
  432. %else
  433. psllw m5, 3
  434. psllw m2, 2
  435. mova m6, m5
  436. paddw m6, m2
  437. %endif
  438. paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
  439. paddw m2, m0 ; a + {8,9,10,11,12,13,14,15}*H
  440. %if mmsize == 8
  441. paddw m5, m0 ; a + {8,9,10,11}*H
  442. paddw m6, m0 ; a + {12,13,14,15}*H
  443. %endif
  444. mov r4, 8
  445. .loop:
  446. mova m3, m0 ; b[0..7]
  447. mova m4, m2 ; b[8..15]
  448. psraw m3, 5
  449. psraw m4, 5
  450. packuswb m3, m4
  451. mova [r0], m3
  452. %if mmsize == 8
  453. mova m3, m5 ; b[8..11]
  454. mova m4, m6 ; b[12..15]
  455. psraw m3, 5
  456. psraw m4, 5
  457. packuswb m3, m4
  458. mova [r0+8], m3
  459. %endif
  460. paddw m0, m1
  461. paddw m2, m1
  462. %if mmsize == 8
  463. paddw m5, m1
  464. paddw m6, m1
  465. %endif
  466. mova m3, m0 ; b[0..7]
  467. mova m4, m2 ; b[8..15]
  468. psraw m3, 5
  469. psraw m4, 5
  470. packuswb m3, m4
  471. mova [r0+r2], m3
  472. %if mmsize == 8
  473. mova m3, m5 ; b[8..11]
  474. mova m4, m6 ; b[12..15]
  475. psraw m3, 5
  476. psraw m4, 5
  477. packuswb m3, m4
  478. mova [r0+r2+8], m3
  479. %endif
  480. paddw m0, m1
  481. paddw m2, m1
  482. %if mmsize == 8
  483. paddw m5, m1
  484. paddw m6, m1
  485. %endif
  486. lea r0, [r0+r2*2]
  487. dec r4
  488. jg .loop
  489. REP_RET
  490. %endmacro
  491. INIT_MMX mmx
  492. H264_PRED16x16_PLANE h264
  493. H264_PRED16x16_PLANE rv40
  494. H264_PRED16x16_PLANE svq3
  495. INIT_MMX mmx2
  496. H264_PRED16x16_PLANE h264
  497. H264_PRED16x16_PLANE rv40
  498. H264_PRED16x16_PLANE svq3
  499. INIT_XMM sse2
  500. H264_PRED16x16_PLANE h264
  501. H264_PRED16x16_PLANE rv40
  502. H264_PRED16x16_PLANE svq3
  503. INIT_XMM ssse3
  504. H264_PRED16x16_PLANE h264
  505. H264_PRED16x16_PLANE rv40
  506. H264_PRED16x16_PLANE svq3
  507. INIT_XMM
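; Note: the three plane variants share the H/V gradient sums and differ only
; in how the slopes are rounded, as the %ifidn branches above show:
;   h264: b = (5*H + 32) >> 6          rv40: b = (5*H) >> 6
;   svq3: b = (5*(H/4)) / 16, with the roles of H and V swapped (SWAP 0, 1)
; Each pixel is then (a + b*x + c*y) >> 5, streamed 8 words at a time from
; the precomputed "a + {0..7}*H" registers with V added once per row.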
  508. ;-----------------------------------------------------------------------------
  509. ; void pred8x8_plane(uint8_t *src, int stride)
  510. ;-----------------------------------------------------------------------------
  511. %macro H264_PRED8x8_PLANE 0
  512. cglobal pred8x8_plane, 2,9,7
  513. mov r2, r1 ; +stride
  514. neg r1 ; -stride
  515. movd m0, [r0+r1 -1]
  516. %if mmsize == 8
  517. pxor m2, m2
  518. movh m1, [r0+r1 +4 ]
  519. punpcklbw m0, m2
  520. punpcklbw m1, m2
  521. pmullw m0, [pw_m4to4]
  522. pmullw m1, [pw_m4to4+8]
  523. %else ; mmsize == 16
  524. %if cpuflag(ssse3)
  525. movhps m0, [r0+r1 +4] ; this reads 4 bytes more than necessary
  526. pmaddubsw m0, [plane8_shuf] ; H coefficients
  527. %else ; sse2
  528. pxor m2, m2
  529. movd m1, [r0+r1 +4]
  530. punpckldq m0, m1
  531. punpcklbw m0, m2
  532. pmullw m0, [pw_m4to4]
  533. %endif
  534. movhlps m1, m0
  535. %endif
  536. paddw m0, m1
  537. %if notcpuflag(ssse3)
  538. %if cpuflag(mmx2)
  539. PSHUFLW m1, m0, 0xE
  540. %elif cpuflag(mmx)
  541. mova m1, m0
  542. psrlq m1, 32
  543. %endif
  544. paddw m0, m1
  545. %endif ; !ssse3
  546. %if cpuflag(mmx2)
  547. PSHUFLW m1, m0, 0x1
  548. %elif cpuflag(mmx)
  549. mova m1, m0
  550. psrlq m1, 16
  551. %endif
  552. paddw m0, m1 ; sum of H coefficients
  553. lea r4, [r0+r2*4-1]
  554. lea r3, [r0 -1]
  555. add r4, r2
  556. %if ARCH_X86_64
  557. %define e_reg r8
  558. %else
  559. %define e_reg r0
  560. %endif
  561. movzx e_reg, byte [r3+r2*2 ]
  562. movzx r5, byte [r4+r1 ]
  563. sub r5, e_reg
  564. movzx e_reg, byte [r3 ]
  565. %if ARCH_X86_64
  566. movzx r7, byte [r4+r2 ]
  567. sub r7, e_reg
  568. sub r5, r7
  569. %else
  570. movzx r6, byte [r4+r2 ]
  571. sub r6, e_reg
  572. lea r5, [r5+r6*4]
  573. sub r5, r6
  574. %endif
  575. movzx e_reg, byte [r3+r1 ]
  576. movzx r6, byte [r4+r2*2 ]
  577. sub r6, e_reg
  578. %if ARCH_X86_64
  579. add r6, r7
  580. %endif
  581. lea r5, [r5+r6*4]
  582. movzx e_reg, byte [r3+r2 ]
  583. movzx r6, byte [r4 ]
  584. sub r6, e_reg
  585. lea r6, [r5+r6*2]
  586. lea r5, [r6*9+16]
  587. lea r5, [r5+r6*8]
  588. sar r5, 5
  589. %if ARCH_X86_64 == 0
  590. mov r0, r0m
  591. %endif
  592. movzx r3, byte [r4+r2*2 ]
  593. movzx r4, byte [r0+r1 +7]
  594. lea r3, [r3+r4+1]
  595. shl r3, 4
  596. movd r1d, m0
  597. movsx r1d, r1w
  598. imul r1d, 17
  599. add r1d, 16
  600. sar r1d, 5
  601. movd m0, r1d
  602. add r1d, r5d
  603. sub r3d, r1d
  604. add r1d, r1d
  605. sub r3d, r1d ; a
  606. movd m1, r5d
  607. movd m3, r3d
  608. SPLATW m0, m0, 0 ; H
  609. SPLATW m1, m1, 0 ; V
  610. SPLATW m3, m3, 0 ; a
  611. %if mmsize == 8
  612. mova m2, m0
  613. %endif
  614. pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
  615. paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
  616. %if mmsize == 8
  617. psllw m2, 2
  618. paddw m2, m0 ; a + {4,5,6,7}*H
  619. %endif
  620. mov r4, 4
  621. ALIGN 16
  622. .loop:
  623. %if mmsize == 16
  624. mova m3, m0 ; b[0..7]
  625. paddw m0, m1
  626. psraw m3, 5
  627. mova m4, m0 ; V+b[0..7]
  628. paddw m0, m1
  629. psraw m4, 5
  630. packuswb m3, m4
  631. movh [r0], m3
  632. movhps [r0+r2], m3
  633. %else ; mmsize == 8
  634. mova m3, m0 ; b[0..3]
  635. mova m4, m2 ; b[4..7]
  636. paddw m0, m1
  637. paddw m2, m1
  638. psraw m3, 5
  639. psraw m4, 5
  640. mova m5, m0 ; V+b[0..3]
  641. mova m6, m2 ; V+b[4..7]
  642. paddw m0, m1
  643. paddw m2, m1
  644. psraw m5, 5
  645. psraw m6, 5
  646. packuswb m3, m4
  647. packuswb m5, m6
  648. mova [r0], m3
  649. mova [r0+r2], m5
  650. %endif
  651. lea r0, [r0+r2*2]
  652. dec r4
  653. jg .loop
  654. REP_RET
  655. %endmacro
  656. INIT_MMX mmx
  657. H264_PRED8x8_PLANE
  658. INIT_MMX mmx2
  659. H264_PRED8x8_PLANE
  660. INIT_XMM sse2
  661. H264_PRED8x8_PLANE
  662. INIT_XMM ssse3
  663. H264_PRED8x8_PLANE
  664. INIT_XMM
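; Note: the 8x8 (chroma) plane version uses the 17/32 rounding visible above:
;   b = (17*H + 16) >> 5,   c = (17*V + 16) >> 5
; with a = 16 * (src[7*stride - 1] + src[7 - stride] + 1); the c term's
; multiply is folded into the lea r5, [r6*9+16] / lea r5, [r5+r6*8] pair
; (9 + 8 = 17).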
  665. ;-----------------------------------------------------------------------------
  666. ; void pred8x8_vertical(uint8_t *src, int stride)
  667. ;-----------------------------------------------------------------------------
  668. cglobal pred8x8_vertical_mmx, 2,2
  669. sub r0, r1
  670. movq mm0, [r0]
  671. %rep 3
  672. movq [r0+r1*1], mm0
  673. movq [r0+r1*2], mm0
  674. lea r0, [r0+r1*2]
  675. %endrep
  676. movq [r0+r1*1], mm0
  677. movq [r0+r1*2], mm0
  678. RET
  679. ;-----------------------------------------------------------------------------
  680. ; void pred8x8_horizontal(uint8_t *src, int stride)
  681. ;-----------------------------------------------------------------------------
  682. %macro PRED8x8_H 0
  683. cglobal pred8x8_horizontal, 2,3
  684. mov r2, 4
  685. %if cpuflag(ssse3)
  686. mova m2, [pb_3]
  687. %endif
  688. .loop:
  689. SPLATB_LOAD m0, r0+r1*0-1, m2
  690. SPLATB_LOAD m1, r0+r1*1-1, m2
  691. mova [r0+r1*0], m0
  692. mova [r0+r1*1], m1
  693. lea r0, [r0+r1*2]
  694. dec r2
  695. jg .loop
  696. REP_RET
  697. %endmacro
  698. INIT_MMX mmx
  699. PRED8x8_H
  700. INIT_MMX mmx2
  701. PRED8x8_H
  702. INIT_MMX ssse3
  703. PRED8x8_H
  704. INIT_MMX
  705. ;-----------------------------------------------------------------------------
  706. ; void pred8x8_top_dc_mmxext(uint8_t *src, int stride)
  707. ;-----------------------------------------------------------------------------
  708. cglobal pred8x8_top_dc_mmxext, 2,5
  709. sub r0, r1
  710. movq mm0, [r0]
  711. pxor mm1, mm1
  712. pxor mm2, mm2
  713. lea r2, [r0+r1*2]
  714. punpckhbw mm1, mm0
  715. punpcklbw mm0, mm2
  716. psadbw mm1, mm2 ; s1
  717. lea r3, [r2+r1*2]
  718. psadbw mm0, mm2 ; s0
  719. psrlw mm1, 1
  720. psrlw mm0, 1
  721. pavgw mm1, mm2
  722. lea r4, [r3+r1*2]
  723. pavgw mm0, mm2
  724. pshufw mm1, mm1, 0
  725. pshufw mm0, mm0, 0 ; dc0 (w)
  726. packuswb mm0, mm1 ; dc0,dc1 (b)
  727. movq [r0+r1*1], mm0
  728. movq [r0+r1*2], mm0
  729. lea r0, [r3+r1*2]
  730. movq [r2+r1*1], mm0
  731. movq [r2+r1*2], mm0
  732. movq [r3+r1*1], mm0
  733. movq [r3+r1*2], mm0
  734. movq [r0+r1*1], mm0
  735. movq [r0+r1*2], mm0
  736. RET
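; Note: each 4-wide half of the top row gets its own DC value,
;   dc = (sum of its 4 top neighbours + 2) >> 2
; punpck{l,h}bw against zero separates the two halves so psadbw can sum them
; independently, and psrlw 1 + pavgw 0 performs the rounded shift without a
; separate bias add.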
  737. ;-----------------------------------------------------------------------------
  738. ; void pred8x8_dc_mmxext(uint8_t *src, int stride)
  739. ;-----------------------------------------------------------------------------
  740. INIT_MMX
  741. cglobal pred8x8_dc_mmxext, 2,5
  742. sub r0, r1
  743. pxor m7, m7
  744. movd m0, [r0+0]
  745. movd m1, [r0+4]
  746. psadbw m0, m7 ; s0
  747. mov r4, r0
  748. psadbw m1, m7 ; s1
  749. movzx r2d, byte [r0+r1*1-1]
  750. movzx r3d, byte [r0+r1*2-1]
  751. lea r0, [r0+r1*2]
  752. add r2d, r3d
  753. movzx r3d, byte [r0+r1*1-1]
  754. add r2d, r3d
  755. movzx r3d, byte [r0+r1*2-1]
  756. add r2d, r3d
  757. lea r0, [r0+r1*2]
  758. movd m2, r2d ; s2
  759. movzx r2d, byte [r0+r1*1-1]
  760. movzx r3d, byte [r0+r1*2-1]
  761. lea r0, [r0+r1*2]
  762. add r2d, r3d
  763. movzx r3d, byte [r0+r1*1-1]
  764. add r2d, r3d
  765. movzx r3d, byte [r0+r1*2-1]
  766. add r2d, r3d
  767. movd m3, r2d ; s3
  768. punpcklwd m0, m1
  769. mov r0, r4
  770. punpcklwd m2, m3
  771. punpckldq m0, m2 ; s0, s1, s2, s3
  772. pshufw m3, m0, 11110110b ; s2, s1, s3, s3
  773. lea r2, [r0+r1*2]
  774. pshufw m0, m0, 01110100b ; s0, s1, s3, s1
  775. paddw m0, m3
  776. lea r3, [r2+r1*2]
  777. psrlw m0, 2
  778. pavgw m0, m7 ; s0+s2, s1, s3, s1+s3
  779. lea r4, [r3+r1*2]
  780. packuswb m0, m0
  781. punpcklbw m0, m0
  782. movq m1, m0
  783. punpcklbw m0, m0
  784. punpckhbw m1, m1
  785. movq [r0+r1*1], m0
  786. movq [r0+r1*2], m0
  787. movq [r2+r1*1], m0
  788. movq [r2+r1*2], m0
  789. movq [r3+r1*1], m1
  790. movq [r3+r1*2], m1
  791. movq [r4+r1*1], m1
  792. movq [r4+r1*2], m1
  793. RET
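; Note: the four 4x4 quadrants follow the usual chroma DC rules. With s0/s1
; the two 4-pixel top sums and s2/s3 the two 4-pixel left sums, the
; pshufw/paddw/psrlw/pavgw sequence yields
;   dc00 = (s0+s2+4)>>3   dc01 = (s1+2)>>2
;   dc10 = (s3+2)>>2      dc11 = (s1+s3+4)>>3
; which the packuswb/punpcklbw chain then broadcasts into the quadrants.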
  794. ;-----------------------------------------------------------------------------
  795. ; void pred8x8_dc_rv40(uint8_t *src, int stride)
  796. ;-----------------------------------------------------------------------------
  797. cglobal pred8x8_dc_rv40_mmxext, 2,7
  798. mov r4, r0
  799. sub r0, r1
  800. pxor mm0, mm0
  801. psadbw mm0, [r0]
  802. dec r0
  803. movzx r5d, byte [r0+r1*1]
  804. movd r6d, mm0
  805. lea r0, [r0+r1*2]
  806. %rep 3
  807. movzx r2d, byte [r0+r1*0]
  808. movzx r3d, byte [r0+r1*1]
  809. add r5d, r2d
  810. add r6d, r3d
  811. lea r0, [r0+r1*2]
  812. %endrep
  813. movzx r2d, byte [r0+r1*0]
  814. add r5d, r6d
  815. lea r2d, [r2+r5+8]
  816. shr r2d, 4
  817. movd mm0, r2d
  818. punpcklbw mm0, mm0
  819. pshufw mm0, mm0, 0
  820. mov r3d, 4
  821. .loop:
  822. movq [r4+r1*0], mm0
  823. movq [r4+r1*1], mm0
  824. lea r4, [r4+r1*2]
  825. dec r3d
  826. jg .loop
  827. REP_RET
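; Note: RV40 uses a single DC over the whole 8x8 block,
;   dc = (sum(top[0..7]) + sum(left[0..7]) + 8) >> 4
; matching the lea r2d, [r2+r5+8] / shr r2d, 4 above.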
  828. ;-----------------------------------------------------------------------------
  829. ; void pred8x8_tm_vp8(uint8_t *src, int stride)
  830. ;-----------------------------------------------------------------------------
  831. %macro PRED8x8_TM_MMX 0
  832. cglobal pred8x8_tm_vp8, 2,6
  833. sub r0, r1
  834. pxor mm7, mm7
  835. movq mm0, [r0]
  836. movq mm1, mm0
  837. punpcklbw mm0, mm7
  838. punpckhbw mm1, mm7
  839. movzx r4d, byte [r0-1]
  840. mov r5d, 4
  841. .loop:
  842. movzx r2d, byte [r0+r1*1-1]
  843. movzx r3d, byte [r0+r1*2-1]
  844. sub r2d, r4d
  845. sub r3d, r4d
  846. movd mm2, r2d
  847. movd mm4, r3d
  848. SPLATW mm2, mm2, 0
  849. SPLATW mm4, mm4, 0
  850. movq mm3, mm2
  851. movq mm5, mm4
  852. paddw mm2, mm0
  853. paddw mm3, mm1
  854. paddw mm4, mm0
  855. paddw mm5, mm1
  856. packuswb mm2, mm3
  857. packuswb mm4, mm5
  858. movq [r0+r1*1], mm2
  859. movq [r0+r1*2], mm4
  860. lea r0, [r0+r1*2]
  861. dec r5d
  862. jg .loop
  863. REP_RET
  864. %endmacro
  865. INIT_MMX mmx
  866. PRED8x8_TM_MMX
  867. INIT_MMX mmx2
  868. PRED8x8_TM_MMX
  869. INIT_MMX
  870. cglobal pred8x8_tm_vp8_sse2, 2,6,4
  871. sub r0, r1
  872. pxor xmm1, xmm1
  873. movq xmm0, [r0]
  874. punpcklbw xmm0, xmm1
  875. movzx r4d, byte [r0-1]
  876. mov r5d, 4
  877. .loop:
  878. movzx r2d, byte [r0+r1*1-1]
  879. movzx r3d, byte [r0+r1*2-1]
  880. sub r2d, r4d
  881. sub r3d, r4d
  882. movd xmm2, r2d
  883. movd xmm3, r3d
  884. pshuflw xmm2, xmm2, 0
  885. pshuflw xmm3, xmm3, 0
  886. punpcklqdq xmm2, xmm2
  887. punpcklqdq xmm3, xmm3
  888. paddw xmm2, xmm0
  889. paddw xmm3, xmm0
  890. packuswb xmm2, xmm3
  891. movq [r0+r1*1], xmm2
  892. movhps [r0+r1*2], xmm2
  893. lea r0, [r0+r1*2]
  894. dec r5d
  895. jg .loop
  896. REP_RET
  897. cglobal pred8x8_tm_vp8_ssse3, 2,3,6
  898. sub r0, r1
  899. movdqa xmm4, [tm_shuf]
  900. pxor xmm1, xmm1
  901. movq xmm0, [r0]
  902. punpcklbw xmm0, xmm1
  903. movd xmm5, [r0-4]
  904. pshufb xmm5, xmm4
  905. mov r2d, 4
  906. .loop:
  907. movd xmm2, [r0+r1*1-4]
  908. movd xmm3, [r0+r1*2-4]
  909. pshufb xmm2, xmm4
  910. pshufb xmm3, xmm4
  911. psubw xmm2, xmm5
  912. psubw xmm3, xmm5
  913. paddw xmm2, xmm0
  914. paddw xmm3, xmm0
  915. packuswb xmm2, xmm3
  916. movq [r0+r1*1], xmm2
  917. movhps [r0+r1*2], xmm2
  918. lea r0, [r0+r1*2]
  919. dec r2d
  920. jg .loop
  921. REP_RET
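; Note: the SSSE3 version avoids the scalar movzx/sub of the loops above:
; tm_shuf (pairs of 0x03, 0x80) makes pshufb broadcast the left pixel into
; every word lane while zeroing the high bytes, so the (left[y] - topleft)
; term is formed entirely in XMM registers with psubw.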
  922. ; dest, left, right, src, tmp
  923. ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
  924. %macro PRED4x4_LOWPASS 5
  925. mova %5, %2
  926. pavgb %2, %3
  927. pxor %3, %5
  928. mova %1, %4
  929. pand %3, [pb_1]
  930. psubusb %2, %3
  931. pavgb %1, %2
  932. %endmacro
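; The macro relies on the classic pavgb identity: with avg(a, b) defined as
; (a + b + 1) >> 1 (what pavgb computes),
;   (l + 2*c + r + 2) >> 2  ==  avg(c, avg(l, r) - ((l ^ r) & 1))
; The pand with pb_1 extracts the bit that made the inner pavgb round up and
; psubusb removes it, so the identity holds exactly for all byte values.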
  933. ;-----------------------------------------------------------------------------
  934. ; void pred8x8l_top_dc(uint8_t *src, int has_topleft, int has_topright, int stride)
  935. ;-----------------------------------------------------------------------------
  936. %macro PRED8x8L_TOP_DC 1
  937. cglobal pred8x8l_top_dc_%1, 4,4
  938. sub r0, r3
  939. pxor mm7, mm7
  940. movq mm0, [r0-8]
  941. movq mm3, [r0]
  942. movq mm1, [r0+8]
  943. movq mm2, mm3
  944. movq mm4, mm3
  945. PALIGNR mm2, mm0, 7, mm0
  946. PALIGNR mm1, mm4, 1, mm4
  947. test r1, r1 ; top_left
  948. jz .fix_lt_2
  949. test r2, r2 ; top_right
  950. jz .fix_tr_1
  951. jmp .body
  952. .fix_lt_2:
  953. movq mm5, mm3
  954. pxor mm5, mm2
  955. psllq mm5, 56
  956. psrlq mm5, 56
  957. pxor mm2, mm5
  958. test r2, r2 ; top_right
  959. jnz .body
  960. .fix_tr_1:
  961. movq mm5, mm3
  962. pxor mm5, mm1
  963. psrlq mm5, 56
  964. psllq mm5, 56
  965. pxor mm1, mm5
  966. .body:
  967. PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
  968. psadbw mm7, mm0
  969. paddw mm7, [pw_4]
  970. psrlw mm7, 3
  971. pshufw mm7, mm7, 0
  972. packuswb mm7, mm7
  973. %rep 3
  974. movq [r0+r3*1], mm7
  975. movq [r0+r3*2], mm7
  976. lea r0, [r0+r3*2]
  977. %endrep
  978. movq [r0+r3*1], mm7
  979. movq [r0+r3*2], mm7
  980. RET
  981. %endmacro
  982. INIT_MMX
  983. %define PALIGNR PALIGNR_MMX
  984. PRED8x8L_TOP_DC mmxext
  985. %define PALIGNR PALIGNR_SSSE3
  986. PRED8x8L_TOP_DC ssse3
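; Note: after lowpass-filtering the top edge (the .fix_lt_2/.fix_tr_1 paths
; patch the corner bytes when has_topleft/has_topright are 0), the DC is
;   dc = (sum of the 8 filtered top pixels + 4) >> 3
; per the paddw with pw_4 and psrlw 3 above, splatted over all 8 rows.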
  987. ;-----------------------------------------------------------------------------
  988. ;void pred8x8l_dc(uint8_t *src, int has_topleft, int has_topright, int stride)
  989. ;-----------------------------------------------------------------------------
  990. %macro PRED8x8L_DC 1
  991. cglobal pred8x8l_dc_%1, 4,5
  992. sub r0, r3
  993. lea r4, [r0+r3*2]
  994. movq mm0, [r0+r3*1-8]
  995. punpckhbw mm0, [r0+r3*0-8]
  996. movq mm1, [r4+r3*1-8]
  997. punpckhbw mm1, [r0+r3*2-8]
  998. mov r4, r0
  999. punpckhwd mm1, mm0
  1000. lea r0, [r0+r3*4]
  1001. movq mm2, [r0+r3*1-8]
  1002. punpckhbw mm2, [r0+r3*0-8]
  1003. lea r0, [r0+r3*2]
  1004. movq mm3, [r0+r3*1-8]
  1005. punpckhbw mm3, [r0+r3*0-8]
  1006. punpckhwd mm3, mm2
  1007. punpckhdq mm3, mm1
  1008. lea r0, [r0+r3*2]
  1009. movq mm0, [r0+r3*0-8]
  1010. movq mm1, [r4]
  1011. mov r0, r4
  1012. movq mm4, mm3
  1013. movq mm2, mm3
  1014. PALIGNR mm4, mm0, 7, mm0
  1015. PALIGNR mm1, mm2, 1, mm2
  1016. test r1, r1
  1017. jnz .do_left
  1018. .fix_lt_1:
  1019. movq mm5, mm3
  1020. pxor mm5, mm4
  1021. psrlq mm5, 56
  1022. psllq mm5, 48
  1023. pxor mm1, mm5
  1024. jmp .do_left
  1025. .fix_lt_2:
  1026. movq mm5, mm3
  1027. pxor mm5, mm2
  1028. psllq mm5, 56
  1029. psrlq mm5, 56
  1030. pxor mm2, mm5
  1031. test r2, r2
  1032. jnz .body
  1033. .fix_tr_1:
  1034. movq mm5, mm3
  1035. pxor mm5, mm1
  1036. psrlq mm5, 56
  1037. psllq mm5, 56
  1038. pxor mm1, mm5
  1039. jmp .body
  1040. .do_left:
  1041. movq mm0, mm4
  1042. PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
  1043. movq mm4, mm0
  1044. movq mm7, mm2
  1045. PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
  1046. psllq mm1, 56
  1047. PALIGNR mm7, mm1, 7, mm3
  1048. movq mm0, [r0-8]
  1049. movq mm3, [r0]
  1050. movq mm1, [r0+8]
  1051. movq mm2, mm3
  1052. movq mm4, mm3
  1053. PALIGNR mm2, mm0, 7, mm0
  1054. PALIGNR mm1, mm4, 1, mm4
  1055. test r1, r1
  1056. jz .fix_lt_2
  1057. test r2, r2
  1058. jz .fix_tr_1
  1059. .body:
  1060. lea r1, [r0+r3*2]
  1061. PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
  1062. pxor mm0, mm0
  1063. pxor mm1, mm1
  1064. lea r2, [r1+r3*2]
  1065. psadbw mm0, mm7
  1066. psadbw mm1, mm6
  1067. paddw mm0, [pw_8]
  1068. paddw mm0, mm1
  1069. lea r4, [r2+r3*2]
  1070. psrlw mm0, 4
  1071. pshufw mm0, mm0, 0
  1072. packuswb mm0, mm0
  1073. movq [r0+r3*1], mm0
  1074. movq [r0+r3*2], mm0
  1075. movq [r1+r3*1], mm0
  1076. movq [r1+r3*2], mm0
  1077. movq [r2+r3*1], mm0
  1078. movq [r2+r3*2], mm0
  1079. movq [r4+r3*1], mm0
  1080. movq [r4+r3*2], mm0
  1081. RET
  1082. %endmacro
  1083. INIT_MMX
  1084. %define PALIGNR PALIGNR_MMX
  1085. PRED8x8L_DC mmxext
  1086. %define PALIGNR PALIGNR_SSSE3
  1087. PRED8x8L_DC ssse3
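; Note: here both filtered edges contribute: mm7 holds the left column and
; mm6 the top row after PRED4x4_LOWPASS, and the two psadbw results are
; combined as dc = (sum(left') + sum(top') + 8) >> 4 via pw_8 and psrlw 4.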
  1088. ;-----------------------------------------------------------------------------
  1089. ; void pred8x8l_horizontal(uint8_t *src, int has_topleft, int has_topright, int stride)
  1090. ;-----------------------------------------------------------------------------
  1091. %macro PRED8x8L_HORIZONTAL 1
  1092. cglobal pred8x8l_horizontal_%1, 4,4
  1093. sub r0, r3
  1094. lea r2, [r0+r3*2]
  1095. movq mm0, [r0+r3*1-8]
  1096. test r1, r1
  1097. lea r1, [r0+r3]
  1098. cmovnz r1, r0
  1099. punpckhbw mm0, [r1+r3*0-8]
  1100. movq mm1, [r2+r3*1-8]
  1101. punpckhbw mm1, [r0+r3*2-8]
  1102. mov r2, r0
  1103. punpckhwd mm1, mm0
  1104. lea r0, [r0+r3*4]
  1105. movq mm2, [r0+r3*1-8]
  1106. punpckhbw mm2, [r0+r3*0-8]
  1107. lea r0, [r0+r3*2]
  1108. movq mm3, [r0+r3*1-8]
  1109. punpckhbw mm3, [r0+r3*0-8]
  1110. punpckhwd mm3, mm2
  1111. punpckhdq mm3, mm1
  1112. lea r0, [r0+r3*2]
  1113. movq mm0, [r0+r3*0-8]
  1114. movq mm1, [r1+r3*0-8]
  1115. mov r0, r2
  1116. movq mm4, mm3
  1117. movq mm2, mm3
  1118. PALIGNR mm4, mm0, 7, mm0
  1119. PALIGNR mm1, mm2, 1, mm2
  1120. movq mm0, mm4
  1121. PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
  1122. movq mm4, mm0
  1123. movq mm7, mm2
  1124. PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
  1125. psllq mm1, 56
  1126. PALIGNR mm7, mm1, 7, mm3
  1127. movq mm3, mm7
  1128. lea r1, [r0+r3*2]
  1129. movq mm7, mm3
  1130. punpckhbw mm3, mm3
  1131. punpcklbw mm7, mm7
  1132. pshufw mm0, mm3, 0xff
  1133. pshufw mm1, mm3, 0xaa
  1134. lea r2, [r1+r3*2]
  1135. pshufw mm2, mm3, 0x55
  1136. pshufw mm3, mm3, 0x00
  1137. pshufw mm4, mm7, 0xff
  1138. pshufw mm5, mm7, 0xaa
  1139. pshufw mm6, mm7, 0x55
  1140. pshufw mm7, mm7, 0x00
  1141. movq [r0+r3*1], mm0
  1142. movq [r0+r3*2], mm1
  1143. movq [r1+r3*1], mm2
  1144. movq [r1+r3*2], mm3
  1145. movq [r2+r3*1], mm4
  1146. movq [r2+r3*2], mm5
  1147. lea r0, [r2+r3*2]
  1148. movq [r0+r3*1], mm6
  1149. movq [r0+r3*2], mm7
  1150. RET
  1151. %endmacro
  1152. INIT_MMX
  1153. %define PALIGNR PALIGNR_MMX
  1154. PRED8x8L_HORIZONTAL mmxext
  1155. %define PALIGNR PALIGNR_SSSE3
  1156. PRED8x8L_HORIZONTAL ssse3
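; Note: the punpckhbw/punpckhwd/punpckhdq ladder that opens these pred8x8l
; functions is effectively an 8x1 transpose: it gathers the eight left-edge
; bytes (one per row) into a single register so the edge can be filtered
; with PRED4x4_LOWPASS like a row; pshufw then splats each filtered pixel
; back out as one row of the block.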
  1157. ;-----------------------------------------------------------------------------
  1158. ; void pred8x8l_vertical(uint8_t *src, int has_topleft, int has_topright, int stride)
  1159. ;-----------------------------------------------------------------------------
  1160. %macro PRED8x8L_VERTICAL 1
  1161. cglobal pred8x8l_vertical_%1, 4,4
  1162. sub r0, r3
  1163. movq mm0, [r0-8]
  1164. movq mm3, [r0]
  1165. movq mm1, [r0+8]
  1166. movq mm2, mm3
  1167. movq mm4, mm3
  1168. PALIGNR mm2, mm0, 7, mm0
  1169. PALIGNR mm1, mm4, 1, mm4
  1170. test r1, r1 ; top_left
  1171. jz .fix_lt_2
  1172. test r2, r2 ; top_right
  1173. jz .fix_tr_1
  1174. jmp .body
  1175. .fix_lt_2:
  1176. movq mm5, mm3
  1177. pxor mm5, mm2
  1178. psllq mm5, 56
  1179. psrlq mm5, 56
  1180. pxor mm2, mm5
  1181. test r2, r2 ; top_right
  1182. jnz .body
  1183. .fix_tr_1:
  1184. movq mm5, mm3
  1185. pxor mm5, mm1
  1186. psrlq mm5, 56
  1187. psllq mm5, 56
  1188. pxor mm1, mm5
  1189. .body:
  1190. PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
  1191. %rep 3
  1192. movq [r0+r3*1], mm0
  1193. movq [r0+r3*2], mm0
  1194. lea r0, [r0+r3*2]
  1195. %endrep
  1196. movq [r0+r3*1], mm0
  1197. movq [r0+r3*2], mm0
  1198. RET
  1199. %endmacro
  1200. INIT_MMX
  1201. %define PALIGNR PALIGNR_MMX
  1202. PRED8x8L_VERTICAL mmxext
  1203. %define PALIGNR PALIGNR_SSSE3
  1204. PRED8x8L_VERTICAL ssse3
  1205. ;-----------------------------------------------------------------------------
  1206. ;void pred8x8l_down_left(uint8_t *src, int has_topleft, int has_topright, int stride)
  1207. ;-----------------------------------------------------------------------------
  1208. INIT_MMX
  1209. %define PALIGNR PALIGNR_MMX
  1210. cglobal pred8x8l_down_left_mmxext, 4,5
  1211. sub r0, r3
  1212. movq mm0, [r0-8]
  1213. movq mm3, [r0]
  1214. movq mm1, [r0+8]
  1215. movq mm2, mm3
  1216. movq mm4, mm3
  1217. PALIGNR mm2, mm0, 7, mm0
  1218. PALIGNR mm1, mm4, 1, mm4
  1219. test r1, r1
  1220. jz .fix_lt_2
  1221. test r2, r2
  1222. jz .fix_tr_1
  1223. jmp .do_top
  1224. .fix_lt_2:
  1225. movq mm5, mm3
  1226. pxor mm5, mm2
  1227. psllq mm5, 56
  1228. psrlq mm5, 56
  1229. pxor mm2, mm5
  1230. test r2, r2
  1231. jnz .do_top
  1232. .fix_tr_1:
  1233. movq mm5, mm3
  1234. pxor mm5, mm1
  1235. psrlq mm5, 56
  1236. psllq mm5, 56
  1237. pxor mm1, mm5
  1238. jmp .do_top
  1239. .fix_tr_2:
  1240. punpckhbw mm3, mm3
  1241. pshufw mm1, mm3, 0xFF
  1242. jmp .do_topright
  1243. .do_top:
  1244. PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
  1245. movq mm7, mm4
  1246. test r2, r2
  1247. jz .fix_tr_2
  1248. movq mm0, [r0+8]
  1249. movq mm5, mm0
  1250. movq mm2, mm0
  1251. movq mm4, mm0
  1252. psrlq mm5, 56
  1253. PALIGNR mm2, mm3, 7, mm3
  1254. PALIGNR mm5, mm4, 1, mm4
  1255. PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
  1256. .do_topright:
  1257. lea r1, [r0+r3*2]
  1258. movq mm6, mm1
  1259. psrlq mm1, 56
  1260. movq mm4, mm1
  1261. lea r2, [r1+r3*2]
  1262. movq mm2, mm6
  1263. PALIGNR mm2, mm7, 1, mm0
  1264. movq mm3, mm6
  1265. PALIGNR mm3, mm7, 7, mm0
  1266. PALIGNR mm4, mm6, 1, mm0
  1267. movq mm5, mm7
  1268. movq mm1, mm7
  1269. movq mm7, mm6
  1270. lea r4, [r2+r3*2]
  1271. psllq mm1, 8
  1272. PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
  1273. PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
  1274. movq [r4+r3*2], mm1
  1275. movq mm2, mm0
  1276. psllq mm1, 8
  1277. psrlq mm2, 56
  1278. psllq mm0, 8
  1279. por mm1, mm2
  1280. movq [r4+r3*1], mm1
  1281. movq mm2, mm0
  1282. psllq mm1, 8
  1283. psrlq mm2, 56
  1284. psllq mm0, 8
  1285. por mm1, mm2
  1286. movq [r2+r3*2], mm1
  1287. movq mm2, mm0
  1288. psllq mm1, 8
  1289. psrlq mm2, 56
  1290. psllq mm0, 8
  1291. por mm1, mm2
  1292. movq [r2+r3*1], mm1
  1293. movq mm2, mm0
  1294. psllq mm1, 8
  1295. psrlq mm2, 56
  1296. psllq mm0, 8
  1297. por mm1, mm2
  1298. movq [r1+r3*2], mm1
  1299. movq mm2, mm0
  1300. psllq mm1, 8
  1301. psrlq mm2, 56
  1302. psllq mm0, 8
  1303. por mm1, mm2
  1304. movq [r1+r3*1], mm1
  1305. movq mm2, mm0
  1306. psllq mm1, 8
  1307. psrlq mm2, 56
  1308. psllq mm0, 8
  1309. por mm1, mm2
  1310. movq [r0+r3*2], mm1
  1311. psllq mm1, 8
  1312. psrlq mm0, 56
  1313. por mm1, mm0
  1314. movq [r0+r3*1], mm1
  1315. RET
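; Note: the filtered top (mm7) and top-right (mm1) pixels form the 16-pixel
; diagonal source. Rows are stored bottom-up: each row is the one below it
; shifted by one byte, with the psllq/psrlq/por triples feeding the next
; diagonal pixel in from mm0.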
  1316. %macro PRED8x8L_DOWN_LEFT 1
  1317. cglobal pred8x8l_down_left_%1, 4,4
  1318. sub r0, r3
  1319. movq mm0, [r0-8]
  1320. movq mm3, [r0]
  1321. movq mm1, [r0+8]
  1322. movq mm2, mm3
  1323. movq mm4, mm3
  1324. PALIGNR mm2, mm0, 7, mm0
  1325. PALIGNR mm1, mm4, 1, mm4
  1326. test r1, r1 ; top_left
  1327. jz .fix_lt_2
  1328. test r2, r2 ; top_right
  1329. jz .fix_tr_1
  1330. jmp .do_top
  1331. .fix_lt_2:
  1332. movq mm5, mm3
  1333. pxor mm5, mm2
  1334. psllq mm5, 56
  1335. psrlq mm5, 56
  1336. pxor mm2, mm5
  1337. test r2, r2 ; top_right
  1338. jnz .do_top
  1339. .fix_tr_1:
  1340. movq mm5, mm3
  1341. pxor mm5, mm1
  1342. psrlq mm5, 56
  1343. psllq mm5, 56
  1344. pxor mm1, mm5
  1345. jmp .do_top
  1346. .fix_tr_2:
  1347. punpckhbw mm3, mm3
  1348. pshufw mm1, mm3, 0xFF
  1349. jmp .do_topright
  1350. .do_top:
  1351. PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
  1352. movq2dq xmm3, mm4
  1353. test r2, r2 ; top_right
  1354. jz .fix_tr_2
  1355. movq mm0, [r0+8]
  1356. movq mm5, mm0
  1357. movq mm2, mm0
  1358. movq mm4, mm0
  1359. psrlq mm5, 56
  1360. PALIGNR mm2, mm3, 7, mm3
  1361. PALIGNR mm5, mm4, 1, mm4
  1362. PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
  1363. .do_topright:
  1364. movq2dq xmm4, mm1
  1365. psrlq mm1, 56
  1366. movq2dq xmm5, mm1
  1367. lea r1, [r0+r3*2]
  1368. pslldq xmm4, 8
  1369. por xmm3, xmm4
  1370. movdqa xmm2, xmm3
  1371. psrldq xmm2, 1
  1372. pslldq xmm5, 15
  1373. por xmm2, xmm5
  1374. lea r2, [r1+r3*2]
  1375. movdqa xmm1, xmm3
  1376. pslldq xmm1, 1
  1377. INIT_XMM
  1378. PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
  1379. psrldq xmm0, 1
  1380. movq [r0+r3*1], xmm0
  1381. psrldq xmm0, 1
  1382. movq [r0+r3*2], xmm0
  1383. psrldq xmm0, 1
  1384. lea r0, [r2+r3*2]
  1385. movq [r1+r3*1], xmm0
  1386. psrldq xmm0, 1
  1387. movq [r1+r3*2], xmm0
  1388. psrldq xmm0, 1
  1389. movq [r2+r3*1], xmm0
  1390. psrldq xmm0, 1
  1391. movq [r2+r3*2], xmm0
  1392. psrldq xmm0, 1
  1393. movq [r0+r3*1], xmm0
  1394. psrldq xmm0, 1
  1395. movq [r0+r3*2], xmm0
  1396. RET
  1397. %endmacro
  1398. INIT_MMX
  1399. %define PALIGNR PALIGNR_MMX
  1400. PRED8x8L_DOWN_LEFT sse2
  1401. INIT_MMX
  1402. %define PALIGNR PALIGNR_SSSE3
  1403. PRED8x8L_DOWN_LEFT ssse3
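; Note: the SSE2/SSSE3 version assembles all 16 filtered top + top-right
; pixels in one XMM register and derives each of the 8 rows with a single
; psrldq, replacing the shift/or chains of the MMX version above.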
  1404. ;-----------------------------------------------------------------------------
  1405. ;void pred8x8l_down_right_mmxext(uint8_t *src, int has_topleft, int has_topright, int stride)
  1406. ;-----------------------------------------------------------------------------
  1407. INIT_MMX
  1408. %define PALIGNR PALIGNR_MMX
  1409. cglobal pred8x8l_down_right_mmxext, 4,5
  1410. sub r0, r3
  1411. lea r4, [r0+r3*2]
  1412. movq mm0, [r0+r3*1-8]
  1413. punpckhbw mm0, [r0+r3*0-8]
  1414. movq mm1, [r4+r3*1-8]
  1415. punpckhbw mm1, [r0+r3*2-8]
  1416. mov r4, r0
  1417. punpckhwd mm1, mm0
  1418. lea r0, [r0+r3*4]
  1419. movq mm2, [r0+r3*1-8]
  1420. punpckhbw mm2, [r0+r3*0-8]
  1421. lea r0, [r0+r3*2]
  1422. movq mm3, [r0+r3*1-8]
  1423. punpckhbw mm3, [r0+r3*0-8]
  1424. punpckhwd mm3, mm2
  1425. punpckhdq mm3, mm1
  1426. lea r0, [r0+r3*2]
  1427. movq mm0, [r0+r3*0-8]
  1428. movq mm1, [r4]
  1429. mov r0, r4
  1430. movq mm4, mm3
  1431. movq mm2, mm3
  1432. PALIGNR mm4, mm0, 7, mm0
  1433. PALIGNR mm1, mm2, 1, mm2
  1434. test r1, r1 ; top_left
  1435. jz .fix_lt_1
  1436. .do_left:
  1437. movq mm0, mm4
  1438. PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
  1439. movq mm4, mm0
  1440. movq mm7, mm2
  1441. movq mm6, mm2
  1442. PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
  1443. psllq mm1, 56
  1444. PALIGNR mm7, mm1, 7, mm3
  1445. movq mm0, [r0-8]
  1446. movq mm3, [r0]
  1447. movq mm1, [r0+8]
  1448. movq mm2, mm3
  1449. movq mm4, mm3
  1450. PALIGNR mm2, mm0, 7, mm0
  1451. PALIGNR mm1, mm4, 1, mm4
  1452. test r1, r1 ; top_left
  1453. jz .fix_lt_2
  1454. test r2, r2 ; top_right
  1455. jz .fix_tr_1
  1456. .do_top:
  1457. PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
  1458. movq mm5, mm4
  1459. jmp .body
  1460. .fix_lt_1:
  1461. movq mm5, mm3
  1462. pxor mm5, mm4
  1463. psrlq mm5, 56
  1464. psllq mm5, 48
  1465. pxor mm1, mm5
  1466. jmp .do_left
  1467. .fix_lt_2:
  1468. movq mm5, mm3
  1469. pxor mm5, mm2
  1470. psllq mm5, 56
  1471. psrlq mm5, 56
  1472. pxor mm2, mm5
  1473. test r2, r2 ; top_right
  1474. jnz .do_top
  1475. .fix_tr_1:
  1476. movq mm5, mm3
  1477. pxor mm5, mm1
  1478. psrlq mm5, 56
  1479. psllq mm5, 56
  1480. pxor mm1, mm5
  1481. jmp .do_top
  1482. .body:
  1483. lea r1, [r0+r3*2]
  1484. movq mm1, mm7
  1485. movq mm7, mm5
  1486. movq mm5, mm6
  1487. movq mm2, mm7
  1488. lea r2, [r1+r3*2]
  1489. PALIGNR mm2, mm6, 1, mm0
  1490. movq mm3, mm7
  1491. PALIGNR mm3, mm6, 7, mm0
  1492. movq mm4, mm7
  1493. lea r4, [r2+r3*2]
  1494. psrlq mm4, 8
  1495. PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
  1496. PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
  1497. movq [r4+r3*2], mm0
  1498. movq mm2, mm1
  1499. psrlq mm0, 8
  1500. psllq mm2, 56
  1501. psrlq mm1, 8
  1502. por mm0, mm2
  1503. movq [r4+r3*1], mm0
  1504. movq mm2, mm1
  1505. psrlq mm0, 8
  1506. psllq mm2, 56
  1507. psrlq mm1, 8
  1508. por mm0, mm2
  1509. movq [r2+r3*2], mm0
  1510. movq mm2, mm1
  1511. psrlq mm0, 8
  1512. psllq mm2, 56
  1513. psrlq mm1, 8
  1514. por mm0, mm2
  1515. movq [r2+r3*1], mm0
  1516. movq mm2, mm1
  1517. psrlq mm0, 8
  1518. psllq mm2, 56
  1519. psrlq mm1, 8
  1520. por mm0, mm2
  1521. movq [r1+r3*2], mm0
  1522. movq mm2, mm1
  1523. psrlq mm0, 8
  1524. psllq mm2, 56
  1525. psrlq mm1, 8
  1526. por mm0, mm2
  1527. movq [r1+r3*1], mm0
  1528. movq mm2, mm1
  1529. psrlq mm0, 8
  1530. psllq mm2, 56
  1531. psrlq mm1, 8
  1532. por mm0, mm2
  1533. movq [r0+r3*2], mm0
  1534. psrlq mm0, 8
  1535. psllq mm1, 56
  1536. por mm0, mm1
  1537. movq [r0+r3*1], mm0
  1538. RET
  1539. %macro PRED8x8L_DOWN_RIGHT 1
  1540. cglobal pred8x8l_down_right_%1, 4,5
  1541. sub r0, r3
  1542. lea r4, [r0+r3*2]
  1543. movq mm0, [r0+r3*1-8]
  1544. punpckhbw mm0, [r0+r3*0-8]
  1545. movq mm1, [r4+r3*1-8]
  1546. punpckhbw mm1, [r0+r3*2-8]
  1547. mov r4, r0
  1548. punpckhwd mm1, mm0
  1549. lea r0, [r0+r3*4]
  1550. movq mm2, [r0+r3*1-8]
  1551. punpckhbw mm2, [r0+r3*0-8]
  1552. lea r0, [r0+r3*2]
  1553. movq mm3, [r0+r3*1-8]
  1554. punpckhbw mm3, [r0+r3*0-8]
  1555. punpckhwd mm3, mm2
  1556. punpckhdq mm3, mm1
  1557. lea r0, [r0+r3*2]
  1558. movq mm0, [r0+r3*0-8]
  1559. movq mm1, [r4]
  1560. mov r0, r4
  1561. movq mm4, mm3
  1562. movq mm2, mm3
  1563. PALIGNR mm4, mm0, 7, mm0
  1564. PALIGNR mm1, mm2, 1, mm2
  1565. test r1, r1
  1566. jz .fix_lt_1
  1567. jmp .do_left
  1568. .fix_lt_1:
  1569. movq mm5, mm3
  1570. pxor mm5, mm4
  1571. psrlq mm5, 56
  1572. psllq mm5, 48
  1573. pxor mm1, mm5
  1574. jmp .do_left
  1575. .fix_lt_2:
  1576. movq mm5, mm3
  1577. pxor mm5, mm2
  1578. psllq mm5, 56
  1579. psrlq mm5, 56
  1580. pxor mm2, mm5
  1581. test r2, r2
  1582. jnz .do_top
  1583. .fix_tr_1:
  1584. movq mm5, mm3
  1585. pxor mm5, mm1
  1586. psrlq mm5, 56
  1587. psllq mm5, 56
  1588. pxor mm1, mm5
  1589. jmp .do_top
  1590. .do_left:
  1591. movq mm0, mm4
  1592. PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
  1593. movq mm4, mm0
  1594. movq mm7, mm2
  1595. movq2dq xmm3, mm2
  1596. PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
  1597. psllq mm1, 56
  1598. PALIGNR mm7, mm1, 7, mm3
  1599. movq2dq xmm1, mm7
  1600. movq mm0, [r0-8]
  1601. movq mm3, [r0]
  1602. movq mm1, [r0+8]
  1603. movq mm2, mm3
  1604. movq mm4, mm3
  1605. PALIGNR mm2, mm0, 7, mm0
  1606. PALIGNR mm1, mm4, 1, mm4
  1607. test r1, r1
  1608. jz .fix_lt_2
  1609. test r2, r2
  1610. jz .fix_tr_1
  1611. .do_top:
  1612. PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
  1613. movq2dq xmm4, mm4
  1614. lea r1, [r0+r3*2]
  1615. movdqa xmm0, xmm3
  1616. pslldq xmm4, 8
  1617. por xmm3, xmm4
  1618. lea r2, [r1+r3*2]
  1619. pslldq xmm4, 1
  1620. por xmm1, xmm4
  1621. psrldq xmm0, 7
  1622. pslldq xmm0, 15
  1623. psrldq xmm0, 7
  1624. por xmm1, xmm0
  1625. lea r0, [r2+r3*2]
  1626. movdqa xmm2, xmm3
  1627. psrldq xmm2, 1
  1628. INIT_XMM
  1629. PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
  1630. movdqa xmm1, xmm0
  1631. psrldq xmm1, 1
  1632. movq [r0+r3*2], xmm0
  1633. movq [r0+r3*1], xmm1
  1634. psrldq xmm0, 2
  1635. psrldq xmm1, 2
  1636. movq [r2+r3*2], xmm0
  1637. movq [r2+r3*1], xmm1
  1638. psrldq xmm0, 2
  1639. psrldq xmm1, 2
  1640. movq [r1+r3*2], xmm0
  1641. movq [r1+r3*1], xmm1
  1642. psrldq xmm0, 2
  1643. psrldq xmm1, 2
  1644. movq [r4+r3*2], xmm0
  1645. movq [r4+r3*1], xmm1
  1646. RET
  1647. %endmacro
  1648. INIT_MMX
  1649. %define PALIGNR PALIGNR_MMX
  1650. PRED8x8L_DOWN_RIGHT sse2
  1651. INIT_MMX
  1652. %define PALIGNR PALIGNR_SSSE3
  1653. PRED8x8L_DOWN_RIGHT ssse3
  1654. ;-----------------------------------------------------------------------------
  1655. ; void pred8x8l_vertical_right(uint8_t *src, int has_topleft, int has_topright, int stride)
  1656. ;-----------------------------------------------------------------------------
  1657. INIT_MMX
  1658. %define PALIGNR PALIGNR_MMX
  1659. cglobal pred8x8l_vertical_right_mmxext, 4,5
  1660. sub r0, r3
  1661. lea r4, [r0+r3*2]
  1662. movq mm0, [r0+r3*1-8]
  1663. punpckhbw mm0, [r0+r3*0-8]
  1664. movq mm1, [r4+r3*1-8]
  1665. punpckhbw mm1, [r0+r3*2-8]
  1666. mov r4, r0
  1667. punpckhwd mm1, mm0
  1668. lea r0, [r0+r3*4]
  1669. movq mm2, [r0+r3*1-8]
  1670. punpckhbw mm2, [r0+r3*0-8]
  1671. lea r0, [r0+r3*2]
  1672. movq mm3, [r0+r3*1-8]
  1673. punpckhbw mm3, [r0+r3*0-8]
  1674. punpckhwd mm3, mm2
  1675. punpckhdq mm3, mm1
  1676. lea r0, [r0+r3*2]
  1677. movq mm0, [r0+r3*0-8]
  1678. movq mm1, [r4]
  1679. mov r0, r4
  1680. movq mm4, mm3
  1681. movq mm2, mm3
  1682. PALIGNR mm4, mm0, 7, mm0
  1683. PALIGNR mm1, mm2, 1, mm2
  1684. test r1, r1
  1685. jz .fix_lt_1
  1686. jmp .do_left
  1687. .fix_lt_1:
  1688. movq mm5, mm3
  1689. pxor mm5, mm4
  1690. psrlq mm5, 56
  1691. psllq mm5, 48
  1692. pxor mm1, mm5
  1693. jmp .do_left
  1694. .fix_lt_2:
  1695. movq mm5, mm3
  1696. pxor mm5, mm2
  1697. psllq mm5, 56
  1698. psrlq mm5, 56
  1699. pxor mm2, mm5
  1700. test r2, r2
  1701. jnz .do_top
  1702. .fix_tr_1:
  1703. movq mm5, mm3
  1704. pxor mm5, mm1
  1705. psrlq mm5, 56
  1706. psllq mm5, 56
  1707. pxor mm1, mm5
  1708. jmp .do_top
  1709. .do_left:
  1710. movq mm0, mm4
  1711. PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
  1712. movq mm7, mm2
  1713. movq mm0, [r0-8]
  1714. movq mm3, [r0]
  1715. movq mm1, [r0+8]
  1716. movq mm2, mm3
  1717. movq mm4, mm3
  1718. PALIGNR mm2, mm0, 7, mm0
  1719. PALIGNR mm1, mm4, 1, mm4
  1720. test r1, r1
  1721. jz .fix_lt_2
  1722. test r2, r2
  1723. jz .fix_tr_1
  1724. .do_top:
  1725. PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
  1726. lea r1, [r0+r3*2]
  1727. movq mm2, mm6
  1728. movq mm3, mm6
  1729. PALIGNR mm3, mm7, 7, mm0
  1730. PALIGNR mm6, mm7, 6, mm1
  1731. movq mm4, mm3
  1732. pavgb mm3, mm2
  1733. lea r2, [r1+r3*2]
  1734. PRED4x4_LOWPASS mm0, mm6, mm2, mm4, mm5
  1735. movq [r0+r3*1], mm3
  1736. movq [r0+r3*2], mm0
  1737. movq mm5, mm0
  1738. movq mm6, mm3
  1739. movq mm1, mm7
  1740. movq mm2, mm1
  1741. psllq mm2, 8
  1742. movq mm3, mm1
  1743. psllq mm3, 16
  1744. lea r4, [r2+r3*2]
  1745. PRED4x4_LOWPASS mm0, mm1, mm3, mm2, mm4
  1746. PALIGNR mm6, mm0, 7, mm2
  1747. movq [r1+r3*1], mm6
  1748. psllq mm0, 8
  1749. PALIGNR mm5, mm0, 7, mm1
  1750. movq [r1+r3*2], mm5
  1751. psllq mm0, 8
  1752. PALIGNR mm6, mm0, 7, mm2
  1753. movq [r2+r3*1], mm6
  1754. psllq mm0, 8
  1755. PALIGNR mm5, mm0, 7, mm1
  1756. movq [r2+r3*2], mm5
  1757. psllq mm0, 8
  1758. PALIGNR mm6, mm0, 7, mm2
  1759. movq [r4+r3*1], mm6
  1760. psllq mm0, 8
  1761. PALIGNR mm5, mm0, 7, mm1
  1762. movq [r4+r3*2], mm5
  1763. RET
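; Note: rows 0 and 1 come from pavgb and PRED4x4_LOWPASS of the top edge
; respectively; each following pair of rows reuses those two registers
; shifted via PALIGNR, with filtered left-column pixels (built from mm7)
; entering at the left edge.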
  1764. %macro PRED8x8L_VERTICAL_RIGHT 1
  1765. cglobal pred8x8l_vertical_right_%1, 4,5,7
  1766. ; manually spill XMM registers for Win64 because
  1767. ; the code here is initialized with INIT_MMX
  1768. WIN64_SPILL_XMM 7
  1769. sub r0, r3
  1770. lea r4, [r0+r3*2]
  1771. movq mm0, [r0+r3*1-8]
  1772. punpckhbw mm0, [r0+r3*0-8]
  1773. movq mm1, [r4+r3*1-8]
  1774. punpckhbw mm1, [r0+r3*2-8]
  1775. mov r4, r0
  1776. punpckhwd mm1, mm0
  1777. lea r0, [r0+r3*4]
  1778. movq mm2, [r0+r3*1-8]
  1779. punpckhbw mm2, [r0+r3*0-8]
  1780. lea r0, [r0+r3*2]
  1781. movq mm3, [r0+r3*1-8]
  1782. punpckhbw mm3, [r0+r3*0-8]
  1783. punpckhwd mm3, mm2
  1784. punpckhdq mm3, mm1
  1785. lea r0, [r0+r3*2]
  1786. movq mm0, [r0+r3*0-8]
  1787. movq mm1, [r4]
  1788. mov r0, r4
  1789. movq mm4, mm3
  1790. movq mm2, mm3
  1791. PALIGNR mm4, mm0, 7, mm0
  1792. PALIGNR mm1, mm2, 1, mm2
  1793. test r1, r1
  1794. jnz .do_left
  1795. .fix_lt_1:
  1796. movq mm5, mm3
  1797. pxor mm5, mm4
  1798. psrlq mm5, 56
  1799. psllq mm5, 48
  1800. pxor mm1, mm5
  1801. jmp .do_left
  1802. .fix_lt_2:
  1803. movq mm5, mm3
  1804. pxor mm5, mm2
  1805. psllq mm5, 56
  1806. psrlq mm5, 56
  1807. pxor mm2, mm5
  1808. test r2, r2
  1809. jnz .do_top
  1810. .fix_tr_1:
  1811. movq mm5, mm3
  1812. pxor mm5, mm1
  1813. psrlq mm5, 56
  1814. psllq mm5, 56
  1815. pxor mm1, mm5
  1816. jmp .do_top
  1817. .do_left:
  1818. movq mm0, mm4
  1819. PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
  1820. movq2dq xmm0, mm2
  1821. movq mm0, [r0-8]
  1822. movq mm3, [r0]
  1823. movq mm1, [r0+8]
  1824. movq mm2, mm3
  1825. movq mm4, mm3
  1826. PALIGNR mm2, mm0, 7, mm0
  1827. PALIGNR mm1, mm4, 1, mm4
  1828. test r1, r1
  1829. jz .fix_lt_2
  1830. test r2, r2
  1831. jz .fix_tr_1
  1832. .do_top:
  1833. PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
  1834. lea r1, [r0+r3*2]
  1835. movq2dq xmm4, mm6
  1836. pslldq xmm4, 8
  1837. por xmm0, xmm4
  1838. movdqa xmm6, [pw_ff00]
  1839. movdqa xmm1, xmm0
  1840. lea r2, [r1+r3*2]
  1841. movdqa xmm2, xmm0
  1842. movdqa xmm3, xmm0
  1843. pslldq xmm0, 1
  1844. pslldq xmm1, 2
  1845. pavgb xmm2, xmm0
  1846. INIT_XMM
  1847. PRED4x4_LOWPASS xmm4, xmm3, xmm1, xmm0, xmm5
  1848. pandn xmm6, xmm4
  1849. movdqa xmm5, xmm4
  1850. psrlw xmm4, 8
  1851. packuswb xmm6, xmm4
  1852. movhlps xmm4, xmm6
  1853. movhps [r0+r3*2], xmm5
  1854. movhps [r0+r3*1], xmm2
  1855. psrldq xmm5, 4
  1856. movss xmm5, xmm6
  1857. psrldq xmm2, 4
  1858. movss xmm2, xmm4
  1859. lea r0, [r2+r3*2]
  1860. psrldq xmm5, 1
  1861. psrldq xmm2, 1
  1862. movq [r0+r3*2], xmm5
  1863. movq [r0+r3*1], xmm2
  1864. psrldq xmm5, 1
  1865. psrldq xmm2, 1
  1866. movq [r2+r3*2], xmm5
  1867. movq [r2+r3*1], xmm2
  1868. psrldq xmm5, 1
  1869. psrldq xmm2, 1
  1870. movq [r1+r3*2], xmm5
  1871. movq [r1+r3*1], xmm2
  1872. RET
  1873. %endmacro
  1874. INIT_MMX
  1875. %define PALIGNR PALIGNR_MMX
  1876. PRED8x8L_VERTICAL_RIGHT sse2
  1877. INIT_MMX
  1878. %define PALIGNR PALIGNR_SSSE3
  1879. PRED8x8L_VERTICAL_RIGHT ssse3
  1880. ;-----------------------------------------------------------------------------
  1881. ;void pred8x8l_vertical_left(uint8_t *src, int has_topleft, int has_topright, int stride)
  1882. ;-----------------------------------------------------------------------------
  1883. %macro PRED8x8L_VERTICAL_LEFT 1
  1884. cglobal pred8x8l_vertical_left_%1, 4,4
  1885. sub r0, r3
  1886. movq mm0, [r0-8]
  1887. movq mm3, [r0]
  1888. movq mm1, [r0+8]
  1889. movq mm2, mm3
  1890. movq mm4, mm3
  1891. PALIGNR mm2, mm0, 7, mm0
  1892. PALIGNR mm1, mm4, 1, mm4
  1893. test r1, r1
  1894. jz .fix_lt_2
  1895. test r2, r2
  1896. jz .fix_tr_1
  1897. jmp .do_top
  1898. .fix_lt_2:
  1899. movq mm5, mm3
  1900. pxor mm5, mm2
  1901. psllq mm5, 56
  1902. psrlq mm5, 56
  1903. pxor mm2, mm5
  1904. test r2, r2
  1905. jnz .do_top
  1906. .fix_tr_1:
  1907. movq mm5, mm3
  1908. pxor mm5, mm1
  1909. psrlq mm5, 56
  1910. psllq mm5, 56
  1911. pxor mm1, mm5
  1912. jmp .do_top
  1913. .fix_tr_2:
  1914. punpckhbw mm3, mm3
  1915. pshufw mm1, mm3, 0xFF
  1916. jmp .do_topright
  1917. .do_top:
  1918. PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
  1919. movq2dq xmm4, mm4
  1920. test r2, r2
  1921. jz .fix_tr_2
  1922. movq mm0, [r0+8]
  1923. movq mm5, mm0
  1924. movq mm2, mm0
  1925. movq mm4, mm0
  1926. psrlq mm5, 56
  1927. PALIGNR mm2, mm3, 7, mm3
  1928. PALIGNR mm5, mm4, 1, mm4
  1929. PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
  1930. .do_topright:
  1931. movq2dq xmm3, mm1
  1932. lea r1, [r0+r3*2]
  1933. pslldq xmm3, 8
  1934. por xmm4, xmm3
  1935. movdqa xmm2, xmm4
  1936. movdqa xmm1, xmm4
  1937. movdqa xmm3, xmm4
  1938. psrldq xmm2, 1
  1939. pslldq xmm1, 1
  1940. pavgb xmm3, xmm2
  1941. lea r2, [r1+r3*2]
  1942. INIT_XMM
  1943. PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm4, xmm5
  1944. psrldq xmm0, 1
  1945. movq [r0+r3*1], xmm3
  1946. movq [r0+r3*2], xmm0
  1947. lea r0, [r2+r3*2]
  1948. psrldq xmm3, 1
  1949. psrldq xmm0, 1
  1950. movq [r1+r3*1], xmm3
  1951. movq [r1+r3*2], xmm0
  1952. psrldq xmm3, 1
  1953. psrldq xmm0, 1
  1954. movq [r2+r3*1], xmm3
  1955. movq [r2+r3*2], xmm0
  1956. psrldq xmm3, 1
  1957. psrldq xmm0, 1
  1958. movq [r0+r3*1], xmm3
  1959. movq [r0+r3*2], xmm0
  1960. RET
  1961. %endmacro
  1962. INIT_MMX
  1963. %define PALIGNR PALIGNR_MMX
  1964. PRED8x8L_VERTICAL_LEFT sse2
  1965. %define PALIGNR PALIGNR_SSSE3
  1966. INIT_MMX
  1967. PRED8x8L_VERTICAL_LEFT ssse3
  1968. ;-----------------------------------------------------------------------------
  1969. ; void pred8x8l_horizontal_up(uint8_t *src, int has_topleft, int has_topright, int stride)
  1970. ;-----------------------------------------------------------------------------
  1971. %macro PRED8x8L_HORIZONTAL_UP 1
  1972. cglobal pred8x8l_horizontal_up_%1, 4,4
  1973. sub r0, r3
  1974. lea r2, [r0+r3*2]
  1975. movq mm0, [r0+r3*1-8]
  1976. test r1, r1
  1977. lea r1, [r0+r3]
  1978. cmovnz r1, r0
  1979. punpckhbw mm0, [r1+r3*0-8]
  1980. movq mm1, [r2+r3*1-8]
  1981. punpckhbw mm1, [r0+r3*2-8]
  1982. mov r2, r0
  1983. punpckhwd mm1, mm0
  1984. lea r0, [r0+r3*4]
  1985. movq mm2, [r0+r3*1-8]
  1986. punpckhbw mm2, [r0+r3*0-8]
  1987. lea r0, [r0+r3*2]
  1988. movq mm3, [r0+r3*1-8]
  1989. punpckhbw mm3, [r0+r3*0-8]
  1990. punpckhwd mm3, mm2
  1991. punpckhdq mm3, mm1
  1992. lea r0, [r0+r3*2]
  1993. movq mm0, [r0+r3*0-8]
  1994. movq mm1, [r1+r3*0-8]
  1995. mov r0, r2
  1996. movq mm4, mm3
  1997. movq mm2, mm3
  1998. PALIGNR mm4, mm0, 7, mm0
  1999. PALIGNR mm1, mm2, 1, mm2
  2000. movq mm0, mm4
  2001. PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
  2002. movq mm4, mm0
  2003. movq mm7, mm2
  2004. PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
  2005. psllq mm1, 56
  2006. PALIGNR mm7, mm1, 7, mm3
  2007. lea r1, [r0+r3*2]
  2008. pshufw mm0, mm7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
  2009. psllq mm7, 56 ; l7 .. .. .. .. .. .. ..
  2010. movq mm2, mm0
  2011. psllw mm0, 8
  2012. psrlw mm2, 8
  2013. por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0
  2014. movq mm3, mm2
  2015. movq mm4, mm2
  2016. movq mm5, mm2
  2017. psrlq mm2, 8
  2018. psrlq mm3, 16
  2019. lea r2, [r1+r3*2]
  2020. por mm2, mm7 ; l7 l7 l6 l5 l4 l3 l2 l1
  2021. punpckhbw mm7, mm7
  2022. por mm3, mm7 ; l7 l7 l7 l6 l5 l4 l3 l2
  2023. pavgb mm4, mm2
  2024. PRED4x4_LOWPASS mm1, mm3, mm5, mm2, mm6
  2025. movq mm5, mm4
  2026. punpcklbw mm4, mm1 ; p4 p3 p2 p1
  2027. punpckhbw mm5, mm1 ; p8 p7 p6 p5
  2028. movq mm6, mm5
  2029. movq mm7, mm5
  2030. movq mm0, mm5
  2031. PALIGNR mm5, mm4, 2, mm1
  2032. pshufw mm1, mm6, 11111001b
  2033. PALIGNR mm6, mm4, 4, mm2
  2034. pshufw mm2, mm7, 11111110b
  2035. PALIGNR mm7, mm4, 6, mm3
  2036. pshufw mm3, mm0, 11111111b
  2037. movq [r0+r3*1], mm4
  2038. movq [r0+r3*2], mm5
  2039. lea r0, [r2+r3*2]
  2040. movq [r1+r3*1], mm6
  2041. movq [r1+r3*2], mm7
  2042. movq [r2+r3*1], mm0
  2043. movq [r2+r3*2], mm1
  2044. movq [r0+r3*1], mm2
  2045. movq [r0+r3*2], mm3
  2046. RET
  2047. %endmacro
  2048. INIT_MMX
  2049. %define PALIGNR PALIGNR_MMX
  2050. PRED8x8L_HORIZONTAL_UP mmxext
  2051. %define PALIGNR PALIGNR_SSSE3
  2052. PRED8x8L_HORIZONTAL_UP ssse3
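; Note: horizontal_up interleaves the pavgb and lowpass results of the left
; column (the punpcklbw/punpckhbw pair produces the p1..p8 zig-zag order
; noted in the comments above); the final rows are padded by replicating the
; bottom-left pixel l7 via the pshufw shuffles.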
  2053. ;-----------------------------------------------------------------------------
  2054. ;void pred8x8l_horizontal_down(uint8_t *src, int has_topleft, int has_topright, int stride)
  2055. ;-----------------------------------------------------------------------------
INIT_MMX
%define PALIGNR PALIGNR_MMX
cglobal pred8x8l_horizontal_down_mmxext, 4,5
sub r0, r3
lea r4, [r0+r3*2]
movq mm0, [r0+r3*1-8]
punpckhbw mm0, [r0+r3*0-8]
movq mm1, [r4+r3*1-8]
punpckhbw mm1, [r0+r3*2-8]
mov r4, r0
punpckhwd mm1, mm0
lea r0, [r0+r3*4]
movq mm2, [r0+r3*1-8]
punpckhbw mm2, [r0+r3*0-8]
lea r0, [r0+r3*2]
movq mm3, [r0+r3*1-8]
punpckhbw mm3, [r0+r3*0-8]
punpckhwd mm3, mm2
punpckhdq mm3, mm1
lea r0, [r0+r3*2]
movq mm0, [r0+r3*0-8]
movq mm1, [r4]
mov r0, r4
movq mm4, mm3
movq mm2, mm3
PALIGNR mm4, mm0, 7, mm0
PALIGNR mm1, mm2, 1, mm2
test r1, r1
jnz .do_left
.fix_lt_1: ; top-left unavailable: patch it in the left-edge filter input
movq mm5, mm3
pxor mm5, mm4
psrlq mm5, 56
psllq mm5, 48
pxor mm1, mm5
jmp .do_left
.fix_lt_2: ; top-left unavailable: patch it in the top-edge filter input
movq mm5, mm3
pxor mm5, mm2
psllq mm5, 56
psrlq mm5, 56
pxor mm2, mm5
test r2, r2
jnz .do_top
.fix_tr_1: ; top-right unavailable: reuse the last top sample instead
movq mm5, mm3
pxor mm5, mm1
psrlq mm5, 56
psllq mm5, 56
pxor mm1, mm5
jmp .do_top
.do_left: ; low-pass filter the eight left samples
movq mm0, mm4
PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
movq mm4, mm0
movq mm7, mm2
movq mm6, mm2
PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
psllq mm1, 56
PALIGNR mm7, mm1, 7, mm3
movq mm0, [r0-8]
movq mm3, [r0]
movq mm1, [r0+8]
movq mm2, mm3
movq mm4, mm3
PALIGNR mm2, mm0, 7, mm0
PALIGNR mm1, mm4, 1, mm4
test r1, r1
jz .fix_lt_2
test r2, r2
jz .fix_tr_1
.do_top: ; low-pass filter the eight top samples
PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
movq mm5, mm4
lea r1, [r0+r3*2]
psllq mm7, 56
movq mm2, mm5
movq mm3, mm6
movq mm4, mm2
PALIGNR mm2, mm6, 7, mm5
PALIGNR mm6, mm7, 7, mm0
lea r2, [r1+r3*2]
PALIGNR mm4, mm3, 1, mm7
movq mm5, mm3
pavgb mm3, mm6
PRED4x4_LOWPASS mm0, mm4, mm6, mm5, mm7
movq mm4, mm2
movq mm1, mm2
lea r4, [r2+r3*2]
psrlq mm4, 16
psrlq mm1, 8
PRED4x4_LOWPASS mm6, mm4, mm2, mm1, mm5
movq mm7, mm3
punpcklbw mm3, mm0
punpckhbw mm7, mm0
movq mm1, mm7
movq mm0, mm7
movq mm4, mm7
movq [r4+r3*2], mm3
PALIGNR mm7, mm3, 2, mm5
movq [r4+r3*1], mm7
PALIGNR mm1, mm3, 4, mm5
movq [r2+r3*2], mm1
PALIGNR mm0, mm3, 6, mm3
movq [r2+r3*1], mm0
movq mm2, mm6
movq mm3, mm6
movq [r1+r3*2], mm4
PALIGNR mm6, mm4, 2, mm5
movq [r1+r3*1], mm6
PALIGNR mm2, mm4, 4, mm5
movq [r0+r3*2], mm2
PALIGNR mm3, mm4, 6, mm4
movq [r0+r3*1], mm3
RET
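; The SSE2/SSSE3 variant below gathers and filters the edges with the same
; MMX code as above, then merges left and top into single XMM registers
; (movq2dq + pslldq/por) so the eight output rows can all be derived from
; one 16-byte vector: after the PALIGNR/punpcklbw step, each successive row
; is simply the previous one shifted right by two bytes (psrldq 2).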
%macro PRED8x8L_HORIZONTAL_DOWN 1
cglobal pred8x8l_horizontal_down_%1, 4,5
sub r0, r3
lea r4, [r0+r3*2]
movq mm0, [r0+r3*1-8]
punpckhbw mm0, [r0+r3*0-8]
movq mm1, [r4+r3*1-8]
punpckhbw mm1, [r0+r3*2-8]
mov r4, r0
punpckhwd mm1, mm0
lea r0, [r0+r3*4]
movq mm2, [r0+r3*1-8]
punpckhbw mm2, [r0+r3*0-8]
lea r0, [r0+r3*2]
movq mm3, [r0+r3*1-8]
punpckhbw mm3, [r0+r3*0-8]
punpckhwd mm3, mm2
punpckhdq mm3, mm1
lea r0, [r0+r3*2]
movq mm0, [r0+r3*0-8]
movq mm1, [r4]
mov r0, r4
movq mm4, mm3
movq mm2, mm3
PALIGNR mm4, mm0, 7, mm0
PALIGNR mm1, mm2, 1, mm2
test r1, r1
jnz .do_left
.fix_lt_1: ; top-left unavailable: patch it in the left-edge filter input
movq mm5, mm3
pxor mm5, mm4
psrlq mm5, 56
psllq mm5, 48
pxor mm1, mm5
jmp .do_left
.fix_lt_2: ; top-left unavailable: patch it in the top-edge filter input
movq mm5, mm3
pxor mm5, mm2
psllq mm5, 56
psrlq mm5, 56
pxor mm2, mm5
test r2, r2
jnz .do_top
.fix_tr_1: ; top-right unavailable: reuse the last top sample instead
movq mm5, mm3
pxor mm5, mm1
psrlq mm5, 56
psllq mm5, 56
pxor mm1, mm5
jmp .do_top
.fix_tr_2: ; top-right unavailable: replicate the last top sample
punpckhbw mm3, mm3
pshufw mm1, mm3, 0xFF
jmp .do_topright
.do_left:
movq mm0, mm4
PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
movq2dq xmm0, mm2
pslldq xmm0, 8
movq mm4, mm0
PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
movq2dq xmm2, mm1
pslldq xmm2, 15
psrldq xmm2, 8
por xmm0, xmm2 ; xmm0 = filtered left edge in the high half
movq mm0, [r0-8]
movq mm3, [r0]
movq mm1, [r0+8]
movq mm2, mm3
movq mm4, mm3
PALIGNR mm2, mm0, 7, mm0
PALIGNR mm1, mm4, 1, mm4
test r1, r1
jz .fix_lt_2
test r2, r2
jz .fix_tr_1
.do_top:
PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
movq2dq xmm1, mm4
test r2, r2
jz .fix_tr_2
movq mm0, [r0+8]
movq mm5, mm0
movq mm2, mm0
movq mm4, mm0
psrlq mm5, 56
PALIGNR mm2, mm3, 7, mm3
PALIGNR mm5, mm4, 1, mm4
PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
movq2dq xmm5, mm1
pslldq xmm5, 8
por xmm1, xmm5 ; xmm1 = filtered top edge, top-right in the high half
INIT_XMM
; from here on all eight rows are derived from the two 16-byte edge vectors
lea r2, [r4+r3*2]
movdqa xmm2, xmm1
movdqa xmm3, xmm1
PALIGNR xmm1, xmm0, 7, xmm4
PALIGNR xmm2, xmm0, 9, xmm5
lea r1, [r2+r3*2]
PALIGNR xmm3, xmm0, 8, xmm0
movdqa xmm4, xmm1
pavgb xmm4, xmm3
lea r0, [r1+r3*2]
PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm5
punpcklbw xmm4, xmm0
movhlps xmm0, xmm4
movq [r0+r3*2], xmm4
movq [r2+r3*2], xmm0
psrldq xmm4, 2
psrldq xmm0, 2
movq [r0+r3*1], xmm4
movq [r2+r3*1], xmm0
psrldq xmm4, 2
psrldq xmm0, 2
movq [r1+r3*2], xmm4
movq [r4+r3*2], xmm0
psrldq xmm4, 2
psrldq xmm0, 2
movq [r1+r3*1], xmm4
movq [r4+r3*1], xmm0
RET
%endmacro
INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_HORIZONTAL_DOWN sse2
INIT_MMX
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_HORIZONTAL_DOWN ssse3
;-----------------------------------------------------------------------------
; void pred4x4_dc_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
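; Equivalent C for reference (a sketch, not taken from the source):
;
;   int dc = t0 + t1 + t2 + t3 + l0 + l1 + l2 + l3; /* 4 top + 4 left */
;   uint32_t v = ((dc + 4) >> 3) * 0x01010101u;     /* splat dc byte  */
;   for (int y = 0; y < 4; y++)
;       *(uint32_t *)(src + y * stride) = v;
;
; psadbw against a zero register sums the four top bytes in one step.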
cglobal pred4x4_dc_mmxext, 3,5
pxor mm7, mm7
mov r4, r0
sub r0, r2
movd mm0, [r0]
psadbw mm0, mm7 ; sum of the four top samples
movzx r1d, byte [r0+r2*1-1]
movd r3d, mm0
add r3d, r1d
movzx r1d, byte [r0+r2*2-1]
lea r0, [r0+r2*2]
add r3d, r1d
movzx r1d, byte [r0+r2*1-1]
add r3d, r1d
movzx r1d, byte [r0+r2*2-1]
add r3d, r1d
add r3d, 4
shr r3d, 3 ; dc = (top + left + 4) >> 3
imul r3d, 0x01010101 ; splat the dc byte across the dword
mov [r4+r2*0], r3d
mov [r0+r2*0], r3d
mov [r0+r2*1], r3d
mov [r0+r2*2], r3d
RET
;-----------------------------------------------------------------------------
; void pred4x4_tm_vp8(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
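; VP8 TrueMotion prediction in C (a sketch; clip() stands in for clamping
; to 0..255, which packuswb performs below):
;
;   for (int y = 0; y < 4; y++)
;       for (int x = 0; x < 4; x++)
;           src[y * stride + x] = clip(left[y] + top[x] - topleft);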
%macro PRED4x4_TM_MMX 0
cglobal pred4x4_tm_vp8, 3,6
sub r0, r2
pxor mm7, mm7
movd mm0, [r0]
punpcklbw mm0, mm7 ; top row widened to words
movzx r4d, byte [r0-1] ; top-left sample
mov r5d, 2
.loop:
movzx r1d, byte [r0+r2*1-1]
movzx r3d, byte [r0+r2*2-1]
sub r1d, r4d
sub r3d, r4d
movd mm2, r1d
movd mm4, r3d
%if cpuflag(mmx2)
pshufw mm2, mm2, 0 ; broadcast (left - topleft) to all four words
pshufw mm4, mm4, 0
%else
punpcklwd mm2, mm2
punpcklwd mm4, mm4
punpckldq mm2, mm2
punpckldq mm4, mm4
%endif
paddw mm2, mm0
paddw mm4, mm0
packuswb mm2, mm2 ; clamp to 0..255
packuswb mm4, mm4
movd [r0+r2*1], mm2
movd [r0+r2*2], mm4
lea r0, [r0+r2*2]
dec r5d
jg .loop
REP_RET
%endmacro
INIT_MMX mmx
PRED4x4_TM_MMX
INIT_MMX mmx2
PRED4x4_TM_MMX
INIT_MMX
cglobal pred4x4_tm_vp8_ssse3, 3,3
sub r0, r2
movq mm6, [tm_shuf]
pxor mm1, mm1
movd mm0, [r0]
punpcklbw mm0, mm1
movd mm7, [r0-4]
pshufb mm7, mm6 ; broadcast the top-left sample to four zero-extended words
lea r1, [r0+r2*2]
movd mm2, [r0+r2*1-4]
movd mm3, [r0+r2*2-4]
movd mm4, [r1+r2*1-4]
movd mm5, [r1+r2*2-4]
pshufb mm2, mm6 ; likewise for each left-column sample
pshufb mm3, mm6
pshufb mm4, mm6
pshufb mm5, mm6
psubw mm2, mm7
psubw mm3, mm7
psubw mm4, mm7
psubw mm5, mm7
paddw mm2, mm0
paddw mm3, mm0
paddw mm4, mm0
paddw mm5, mm0
packuswb mm2, mm2
packuswb mm3, mm3
packuswb mm4, mm4
packuswb mm5, mm5
movd [r0+r2*1], mm2
movd [r0+r2*2], mm3
movd [r1+r2*1], mm4
movd [r1+r2*2], mm5
RET
;-----------------------------------------------------------------------------
; void pred4x4_vertical_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
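; VP8's vertical mode filters the top row before replicating it; with
; t[-1] being the top-left sample (a sketch, not taken from the source):
;
;   for (int x = 0; x < 4; x++)
;       row[x] = (t[x-1] + 2 * t[x] + t[x+1] + 2) >> 2;
;   /* the same filtered row is written to all four lines */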
INIT_MMX
cglobal pred4x4_vertical_vp8_mmxext, 3,3
sub r0, r2
movd m1, [r0-1]
movd m0, [r0]
mova m2, m0 ;t0 t1 t2 t3
punpckldq m0, [r1] ;t0 t1 t2 t3 t4 t5 t6 t7
lea r1, [r0+r2*2]
psrlq m0, 8 ;t1 t2 t3 t4
PRED4x4_LOWPASS m3, m1, m0, m2, m4
movd [r0+r2*1], m3
movd [r0+r2*2], m3
movd [r1+r2*1], m3
movd [r1+r2*2], m3
RET
;-----------------------------------------------------------------------------
; void pred4x4_down_left_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
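; Diagonal-down-left in C (a sketch; t[0..7] are the top and top-right
; samples, and the last filter tap is clamped to t[7]):
;
;   for (int y = 0; y < 4; y++)
;       for (int x = 0; x < 4; x++) {
;           int i = x + y;
;           src[y * stride + x] = (i == 6)
;               ? (t[6] + 3 * t[7] + 2) >> 2
;               : (t[i] + 2 * t[i+1] + t[i+2] + 2) >> 2;
;       }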
INIT_MMX
cglobal pred4x4_down_left_mmxext, 3,3
sub r0, r2
movq m1, [r0]
punpckldq m1, [r1] ; t0 .. t7
movq m2, m1
movq m3, m1
psllq m1, 8
pxor m2, m1 ; xor trick: build t1 .. t7 t7 (last sample duplicated)
psrlq m2, 8
pxor m2, m3
PRED4x4_LOWPASS m0, m1, m2, m3, m4
lea r1, [r0+r2*2]
psrlq m0, 8
movd [r0+r2*1], m0
psrlq m0, 8
movd [r0+r2*2], m0
psrlq m0, 8
movd [r1+r2*1], m0
psrlq m0, 8
movd [r1+r2*2], m0
RET
;-----------------------------------------------------------------------------
; void pred4x4_vertical_left_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
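; Vertical-left in C (a sketch; t[0..6] come from the top row and the
; top-right block). Even rows are rounded averages, odd rows 3-tap filters:
;
;   for (int j = 0; j < 2; j++)
;       for (int x = 0; x < 4; x++) {
;           pred[2*j + 0][x] = (t[x+j] + t[x+j+1] + 1) >> 1;
;           pred[2*j + 1][x] = (t[x+j] + 2*t[x+j+1] + t[x+j+2] + 2) >> 2;
;       }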
INIT_MMX
cglobal pred4x4_vertical_left_mmxext, 3,3
sub r0, r2
movq m1, [r0]
punpckldq m1, [r1] ; t0 .. t7
movq m3, m1
movq m2, m1
psrlq m3, 8
psrlq m2, 16
movq m4, m3
pavgb m4, m1 ; rows 0/2: rounded averages
PRED4x4_LOWPASS m0, m1, m2, m3, m5 ; rows 1/3: 3-tap filter
lea r1, [r0+r2*2]
movh [r0+r2*1], m4
movh [r0+r2*2], m0
psrlq m4, 8
psrlq m0, 8
movh [r1+r2*1], m4
movh [r1+r2*2], m0
RET
;-----------------------------------------------------------------------------
; void pred4x4_horizontal_up_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
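; Same interpolation scheme as the 8x8 horizontal-up sketch earlier, scaled
; down to the four left samples l[0..3]: z = x + 2*y, z > 5 yields l[3],
; z == 5 yields (l[2] + 3*l[3] + 2) >> 2, and smaller z alternate between
; the rounded average and the 3-tap filter of consecutive left samples.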
INIT_MMX
cglobal pred4x4_horizontal_up_mmxext, 3,3
sub r0, r2
lea r1, [r0+r2*2]
movd m0, [r0+r2*1-4]
punpcklbw m0, [r0+r2*2-4]
movd m1, [r1+r2*1-4]
punpcklbw m1, [r1+r2*2-4]
punpckhwd m0, m1 ; l0 l1 l2 l3 in the high dword
movq m1, m0
punpckhbw m1, m1
pshufw m1, m1, 0xFF ; l3 replicated
punpckhdq m0, m1 ; l0 l1 l2 l3 l3 l3 l3 l3
movq m2, m0
movq m3, m0
movq m7, m0
psrlq m2, 16
psrlq m3, 8
pavgb m7, m3
PRED4x4_LOWPASS m4, m0, m2, m3, m5
punpcklbw m7, m4
movd [r0+r2*1], m7
psrlq m7, 16
movd [r0+r2*2], m7
psrlq m7, 16
movd [r1+r2*1], m7
movd [r1+r2*2], m1 ; bottom row is all l3
RET
;-----------------------------------------------------------------------------
; void pred4x4_horizontal_down_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
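; Horizontal-down: each predicted row is the row above shifted right by two
; samples, with a new (average, 3-tap) pair of left-edge values entering on
; the left. The first rows in C (a sketch, not taken from the source):
;
;   src[0 + 0*stride] = (lt + l0 + 1) >> 1;
;   src[1 + 0*stride] = (l0 + 2*lt + t0 + 2) >> 2;
;   src[0 + 1*stride] = (l0 + l1 + 1) >> 1;
;   src[1 + 1*stride] = (lt + 2*l0 + l1 + 2) >> 2;
;   src[2 + 1*stride] = src[0 + 0*stride];
;   src[3 + 1*stride] = src[1 + 0*stride];
;
; hence the punpcklbw interleave of the pavgb and PRED4x4_LOWPASS results
; and the psrlq by 16 between the row stores below.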
INIT_MMX
%define PALIGNR PALIGNR_MMX
cglobal pred4x4_horizontal_down_mmxext, 3,3
sub r0, r2
lea r1, [r0+r2*2]
movh m0, [r0-4] ; lt ..
punpckldq m0, [r0] ; t3 t2 t1 t0 lt .. .. ..
psllq m0, 8 ; t2 t1 t0 lt .. .. .. ..
movd m1, [r1+r2*2-4] ; l3
punpcklbw m1, [r1+r2*1-4] ; l2 l3
movd m2, [r0+r2*2-4] ; l1
punpcklbw m2, [r0+r2*1-4] ; l0 l1
punpckhwd m1, m2 ; l0 l1 l2 l3
punpckhdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
movq m0, m1
movq m2, m1
movq m5, m1
psrlq m0, 16 ; .. .. t2 t1 t0 lt l0 l1
psrlq m2, 8 ; .. t2 t1 t0 lt l0 l1 l2
pavgb m5, m2
PRED4x4_LOWPASS m3, m1, m0, m2, m4
punpcklbw m5, m3
psrlq m3, 32
PALIGNR m3, m5, 6, m4
movh [r1+r2*2], m5
psrlq m5, 16
movh [r1+r2*1], m5
psrlq m5, 16
movh [r0+r2*2], m5
movh [r0+r2*1], m3
RET
;-----------------------------------------------------------------------------
; void pred4x4_vertical_right_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
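; Vertical-right: even rows are rounded averages of the top edge, odd rows
; its 3-tap filter, and each row pair shifts right by one sample as
; filtered left-edge values enter column 0 (a sketch of the first column):
;
;   src[0 + 0*stride] = (lt + t0 + 1) >> 1;        /* also src[1 + 2*stride] */
;   src[0 + 1*stride] = (l0 + 2*lt + t0 + 2) >> 2; /* also src[1 + 3*stride] */
;   src[0 + 2*stride] = (lt + 2*l0 + l1 + 2) >> 2;
;   src[0 + 3*stride] = (l0 + 2*l1 + l2 + 2) >> 2;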
INIT_MMX
%define PALIGNR PALIGNR_MMX
cglobal pred4x4_vertical_right_mmxext, 3,3
sub r0, r2
lea r1, [r0+r2*2]
movh m0, [r0] ; ........t3t2t1t0
movq m5, m0
PALIGNR m0, [r0-8], 7, m1 ; ......t3t2t1t0lt
pavgb m5, m0
PALIGNR m0, [r0+r2*1-8], 7, m1 ; ....t3t2t1t0ltl0
movq m1, m0
PALIGNR m0, [r0+r2*2-8], 7, m2 ; ..t3t2t1t0ltl0l1
movq m2, m0
PALIGNR m0, [r1+r2*1-8], 7, m3 ; t3t2t1t0ltl0l1l2
PRED4x4_LOWPASS m3, m1, m0, m2, m4
movq m1, m3
psrlq m3, 16
psllq m1, 48
movh [r0+r2*1], m5
movh [r0+r2*2], m3
PALIGNR m5, m1, 7, m2
psllq m1, 8
movh [r1+r2*1], m5
PALIGNR m3, m1, 7, m1
movh [r1+r2*2], m3
RET
;-----------------------------------------------------------------------------
; void pred4x4_down_right_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
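; Diagonal-down-right in C (a sketch): with the edge gathered into
; e[9] = { l3, l2, l1, l0, lt, t0, t1, t2, t3 }, every output sample is
; the same 3-tap filter taken along its diagonal:
;
;   for (int y = 0; y < 4; y++)
;       for (int x = 0; x < 4; x++)
;           src[y * stride + x] =
;               (e[x-y+3] + 2 * e[x-y+4] + e[x-y+5] + 2) >> 2;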
INIT_MMX
%define PALIGNR PALIGNR_MMX
cglobal pred4x4_down_right_mmxext, 3,3
sub r0, r2
lea r1, [r0+r2*2]
movq m1, [r1-8]
movq m2, [r0+r2*1-8]
punpckhbw m2, [r0-8]
movh m3, [r0]
punpckhwd m1, m2
PALIGNR m3, m1, 5, m1
movq m1, m3
PALIGNR m3, [r1+r2*1-8], 7, m4
movq m2, m3
PALIGNR m3, [r1+r2*2-8], 7, m4
PRED4x4_LOWPASS m0, m3, m1, m2, m4
movh [r1+r2*2], m0
psrlq m0, 8
movh [r1+r2*1], m0
psrlq m0, 8
movh [r0+r2*2], m0
psrlq m0, 8
movh [r0+r2*1], m0
RET