;******************************************************************************
;* H.264 intra prediction asm optimizations
;* Copyright (c) 2010 Jason Garrett-Glaser
;* Copyright (c) 2010 Holger Lubitz
;* Copyright (c) 2010 Loren Merritt
;* Copyright (c) 2010 Ronald S. Bultje
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

tm_shuf:     times 8 db 0x03, 0x80
pw_ff00:     times 8 dw 0xff00
plane_shuf:  db -8, -7, -6, -5, -4, -3, -2, -1
             db  1,  2,  3,  4,  5,  6,  7,  8
plane8_shuf: db -4, -3, -2, -1,  0,  0,  0,  0
             db  1,  2,  3,  4,  0,  0,  0,  0
pw_0to7:     dw 0, 1, 2, 3, 4, 5, 6, 7
pw_1to8:     dw 1, 2, 3, 4, 5, 6, 7, 8
pw_m8tom1:   dw -8, -7, -6, -5, -4, -3, -2, -1
pw_m4to4:    dw -4, -3, -2, -1,  1,  2,  3,  4

SECTION .text

cextern pb_1
cextern pb_3
cextern pw_4
cextern pw_5
cextern pw_8
cextern pw_16
cextern pw_17
cextern pw_32

;-----------------------------------------------------------------------------
; void pred16x16_vertical(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
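; Vertical mode simply replicates the row of 16 pixels directly above the
; block into all 16 rows: the row is loaded once outside the loop and
; stored twice per iteration.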
cglobal pred16x16_vertical_8_mmx, 2,3
    sub r0, r1
    mov r2, 8
    movq mm0, [r0+0]
    movq mm1, [r0+8]
.loop:
    movq [r0+r1*1+0], mm0
    movq [r0+r1*1+8], mm1
    movq [r0+r1*2+0], mm0
    movq [r0+r1*2+8], mm1
    lea r0, [r0+r1*2]
    dec r2
    jg .loop
    REP_RET

cglobal pred16x16_vertical_8_sse, 2,3
    sub r0, r1
    mov r2, 4
    movaps xmm0, [r0]
.loop:
    movaps [r0+r1*1], xmm0
    movaps [r0+r1*2], xmm0
    lea r0, [r0+r1*2]
    movaps [r0+r1*1], xmm0
    movaps [r0+r1*2], xmm0
    lea r0, [r0+r1*2]
    dec r2
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void pred16x16_horizontal(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
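; Horizontal mode fills each row with the pixel immediately to its left.
; On SSSE3 the byte is broadcast with pshufb against pb_3; older CPUs use
; punpcklbw+SPLATW instead, storing the two 8-byte halves separately.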
%macro PRED16x16_H 0
cglobal pred16x16_horizontal_8, 2,3
    mov r2, 8
%if cpuflag(ssse3)
    mova m2, [pb_3]
%endif
.loop:
    movd m0, [r0+r1*0-4]
    movd m1, [r0+r1*1-4]
%if cpuflag(ssse3)
    pshufb m0, m2
    pshufb m1, m2
%else
    punpcklbw m0, m0
    punpcklbw m1, m1
    SPLATW m0, m0, 3
    SPLATW m1, m1, 3
    mova [r0+r1*0+8], m0
    mova [r0+r1*1+8], m1
%endif
    mova [r0+r1*0], m0
    mova [r0+r1*1], m1
    lea r0, [r0+r1*2]
    dec r2
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmx
PRED16x16_H
INIT_MMX mmx2
PRED16x16_H
INIT_XMM ssse3
PRED16x16_H
INIT_XMM

;-----------------------------------------------------------------------------
; void pred16x16_dc(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
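; DC mode: dc = (sum of the 16 top neighbours + sum of the 16 left
; neighbours + 16) >> 5, broadcast to all 256 pixels. psadbw against a
; zeroed register sums the top row eight bytes at a time; the left column
; is summed with scalar movzx/add since those bytes are not contiguous.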
%macro PRED16x16_DC 0
cglobal pred16x16_dc_8, 2,7
    mov r4, r0
    sub r0, r1
    pxor mm0, mm0
    pxor mm1, mm1
    psadbw mm0, [r0+0]
    psadbw mm1, [r0+8]
    dec r0
    movzx r5d, byte [r0+r1*1]
    paddw mm0, mm1
    movd r6d, mm0
    lea r0, [r0+r1*2]
%rep 7
    movzx r2d, byte [r0+r1*0]
    movzx r3d, byte [r0+r1*1]
    add r5d, r2d
    add r6d, r3d
    lea r0, [r0+r1*2]
%endrep
    movzx r2d, byte [r0+r1*0]
    add r5d, r6d
    lea r2d, [r2+r5+16]
    shr r2d, 5
%if cpuflag(ssse3)
    pxor m1, m1
%endif
    SPLATB_REG m0, r2, m1
%if mmsize == 8
    mov r3d, 8
.loop:
    mova [r4+r1*0+0], m0
    mova [r4+r1*0+8], m0
    mova [r4+r1*1+0], m0
    mova [r4+r1*1+8], m0
%else
    mov r3d, 4
.loop:
    mova [r4+r1*0], m0
    mova [r4+r1*1], m0
    lea r4, [r4+r1*2]
    mova [r4+r1*0], m0
    mova [r4+r1*1], m0
%endif
    lea r4, [r4+r1*2]
    dec r3d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmx2
PRED16x16_DC
INIT_XMM sse2
PRED16x16_DC
INIT_XMM ssse3
PRED16x16_DC
INIT_XMM

;-----------------------------------------------------------------------------
; void pred16x16_tm_vp8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
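; VP8 TrueMotion mode: pred[y][x] = clip(top[x] + left[y] - topleft).
; The top row is widened to words once outside the loop; per row,
; (left[y] - topleft) is broadcast and added, and packuswb provides the
; unsigned saturation to 0..255.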
%macro PRED16x16_TM_MMX 0
cglobal pred16x16_tm_vp8_8, 2,5
    sub r0, r1
    pxor mm7, mm7
    movq mm0, [r0+0]
    movq mm2, [r0+8]
    movq mm1, mm0
    movq mm3, mm2
    punpcklbw mm0, mm7
    punpckhbw mm1, mm7
    punpcklbw mm2, mm7
    punpckhbw mm3, mm7
    movzx r3d, byte [r0-1]
    mov r4d, 16
.loop:
    movzx r2d, byte [r0+r1-1]
    sub r2d, r3d
    movd mm4, r2d
    SPLATW mm4, mm4, 0
    movq mm5, mm4
    movq mm6, mm4
    movq mm7, mm4
    paddw mm4, mm0
    paddw mm5, mm1
    paddw mm6, mm2
    paddw mm7, mm3
    packuswb mm4, mm5
    packuswb mm6, mm7
    movq [r0+r1+0], mm4
    movq [r0+r1+8], mm6
    add r0, r1
    dec r4d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmx
PRED16x16_TM_MMX
INIT_MMX mmx2
PRED16x16_TM_MMX
INIT_MMX

cglobal pred16x16_tm_vp8_8_sse2, 2,6,6
    sub r0, r1
    pxor xmm2, xmm2
    movdqa xmm0, [r0]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm2
    punpckhbw xmm1, xmm2
    movzx r4d, byte [r0-1]
    mov r5d, 8
.loop:
    movzx r2d, byte [r0+r1*1-1]
    movzx r3d, byte [r0+r1*2-1]
    sub r2d, r4d
    sub r3d, r4d
    movd xmm2, r2d
    movd xmm4, r3d
    pshuflw xmm2, xmm2, 0
    pshuflw xmm4, xmm4, 0
    punpcklqdq xmm2, xmm2
    punpcklqdq xmm4, xmm4
    movdqa xmm3, xmm2
    movdqa xmm5, xmm4
    paddw xmm2, xmm0
    paddw xmm3, xmm1
    paddw xmm4, xmm0
    paddw xmm5, xmm1
    packuswb xmm2, xmm3
    packuswb xmm4, xmm5
    movdqa [r0+r1*1], xmm2
    movdqa [r0+r1*2], xmm4
    lea r0, [r0+r1*2]
    dec r5d
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void pred16x16_plane(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
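; Plane mode. With top[]/left[] the neighbouring edge pixels:
;   H = sum(i * (top[7+i]  - top[7-i]),  i = 1..8)
;   V = sum(i * (left[7+i] - left[7-i]), i = 1..8)
;   b = (5*H + 32) >> 6,  c = (5*V + 32) >> 6          (h264 rounding)
;   a = 16 * (top[15] + left[15] + 1)                  (the +1 folds the final rounding)
;   pred[y][x] = clip((a + b*(x-7) + c*(y-7)) >> 5)
; rv40 and svq3 reuse the same gradients and differ only in how b and c are
; rounded (the %ifidn branches below). The multiplies by -8..-1/1..8 come
; from the pw_* tables (pmaddubsw with plane_shuf on SSSE3); the per-row
; b*x term is built once with pw_0to7 and V is added per row, with
; packuswb performing the clip.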
%macro H264_PRED16x16_PLANE 1
cglobal pred16x16_plane_%1_8, 2,9,7
    mov r2, r1 ; +stride
    neg r1 ; -stride
    movh m0, [r0+r1 -1]
%if mmsize == 8
    pxor m4, m4
    movh m1, [r0+r1 +3 ]
    movh m2, [r0+r1 +8 ]
    movh m3, [r0+r1 +12]
    punpcklbw m0, m4
    punpcklbw m1, m4
    punpcklbw m2, m4
    punpcklbw m3, m4
    pmullw m0, [pw_m8tom1 ]
    pmullw m1, [pw_m8tom1+8]
    pmullw m2, [pw_1to8 ]
    pmullw m3, [pw_1to8 +8]
    paddw m0, m2
    paddw m1, m3
%else ; mmsize == 16
%if cpuflag(ssse3)
    movhps m0, [r0+r1 +8]
    pmaddubsw m0, [plane_shuf] ; H coefficients
%else ; sse2
    pxor m2, m2
    movh m1, [r0+r1 +8]
    punpcklbw m0, m2
    punpcklbw m1, m2
    pmullw m0, [pw_m8tom1]
    pmullw m1, [pw_1to8]
    paddw m0, m1
%endif
    movhlps m1, m0
%endif
    paddw m0, m1
%if cpuflag(mmx2)
    PSHUFLW m1, m0, 0xE
%elif cpuflag(mmx)
    mova m1, m0
    psrlq m1, 32
%endif
    paddw m0, m1
%if cpuflag(mmx2)
    PSHUFLW m1, m0, 0x1
%elif cpuflag(mmx)
    mova m1, m0
    psrlq m1, 16
%endif
    paddw m0, m1 ; sum of H coefficients
    lea r4, [r0+r2*8-1]
    lea r3, [r0+r2*4-1]
    add r4, r2
%if ARCH_X86_64
%define e_reg r8
%else
%define e_reg r0
%endif
    movzx e_reg, byte [r3+r2*2 ]
    movzx r5, byte [r4+r1 ]
    sub r5, e_reg
    movzx e_reg, byte [r3+r2 ]
    movzx r6, byte [r4 ]
    sub r6, e_reg
    lea r5, [r5+r6*2]
    movzx e_reg, byte [r3+r1 ]
    movzx r6, byte [r4+r2*2 ]
    sub r6, e_reg
    lea r5, [r5+r6*4]
    movzx e_reg, byte [r3 ]
%if ARCH_X86_64
    movzx r7, byte [r4+r2 ]
    sub r7, e_reg
%else
    movzx r6, byte [r4+r2 ]
    sub r6, e_reg
    lea r5, [r5+r6*4]
    sub r5, r6
%endif
    lea e_reg, [r3+r1*4]
    lea r3, [r4+r2*4]
    movzx r4, byte [e_reg+r2 ]
    movzx r6, byte [r3 ]
    sub r6, r4
%if ARCH_X86_64
    lea r6, [r7+r6*2]
    lea r5, [r5+r6*2]
    add r5, r6
%else
    lea r5, [r5+r6*4]
    lea r5, [r5+r6*2]
%endif
    movzx r4, byte [e_reg ]
%if ARCH_X86_64
    movzx r7, byte [r3 +r2 ]
    sub r7, r4
    sub r5, r7
%else
    movzx r6, byte [r3 +r2 ]
    sub r6, r4
    lea r5, [r5+r6*8]
    sub r5, r6
%endif
    movzx r4, byte [e_reg+r1 ]
    movzx r6, byte [r3 +r2*2]
    sub r6, r4
%if ARCH_X86_64
    add r6, r7
%endif
    lea r5, [r5+r6*8]
    movzx r4, byte [e_reg+r2*2]
    movzx r6, byte [r3 +r1 ]
    sub r6, r4
    lea r5, [r5+r6*4]
    add r5, r6 ; sum of V coefficients
%if ARCH_X86_64 == 0
    mov r0, r0m
%endif
%ifidn %1, h264
    lea r5, [r5*5+32]
    sar r5, 6
%elifidn %1, rv40
    lea r5, [r5*5]
    sar r5, 6
%elifidn %1, svq3
    test r5, r5
    lea r6, [r5+3]
    cmovs r5, r6
    sar r5, 2 ; V/4
    lea r5, [r5*5] ; 5*(V/4)
    test r5, r5
    lea r6, [r5+15]
    cmovs r5, r6
    sar r5, 4 ; (5*(V/4))/16
%endif
    movzx r4, byte [r0+r1 +15]
    movzx r3, byte [r3+r2*2 ]
    lea r3, [r3+r4+1]
    shl r3, 4
    movd r1d, m0
    movsx r1d, r1w
%ifnidn %1, svq3
%ifidn %1, h264
    lea r1d, [r1d*5+32]
%else ; rv40
    lea r1d, [r1d*5]
%endif
    sar r1d, 6
%else ; svq3
    test r1d, r1d
    lea r4d, [r1d+3]
    cmovs r1d, r4d
    sar r1d, 2 ; H/4
    lea r1d, [r1d*5] ; 5*(H/4)
    test r1d, r1d
    lea r4d, [r1d+15]
    cmovs r1d, r4d
    sar r1d, 4 ; (5*(H/4))/16
%endif
    movd m0, r1d
    add r1d, r5d
    add r3d, r1d
    shl r1d, 3
    sub r3d, r1d ; a
    movd m1, r5d
    movd m3, r3d
    SPLATW m0, m0, 0 ; H
    SPLATW m1, m1, 0 ; V
    SPLATW m3, m3, 0 ; a
%ifidn %1, svq3
    SWAP 0, 1
%endif
    mova m2, m0
%if mmsize == 8
    mova m5, m0
%endif
    pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
%if mmsize == 16
    psllw m2, 3
%else
    psllw m5, 3
    psllw m2, 2
    mova m6, m5
    paddw m6, m2
%endif
    paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
    paddw m2, m0 ; a + {8,9,10,11,12,13,14,15}*H
%if mmsize == 8
    paddw m5, m0 ; a + {8,9,10,11}*H
    paddw m6, m0 ; a + {12,13,14,15}*H
%endif
    mov r4, 8
.loop:
    mova m3, m0 ; b[0..7]
    mova m4, m2 ; b[8..15]
    psraw m3, 5
    psraw m4, 5
    packuswb m3, m4
    mova [r0], m3
%if mmsize == 8
    mova m3, m5 ; b[8..11]
    mova m4, m6 ; b[12..15]
    psraw m3, 5
    psraw m4, 5
    packuswb m3, m4
    mova [r0+8], m3
%endif
    paddw m0, m1
    paddw m2, m1
%if mmsize == 8
    paddw m5, m1
    paddw m6, m1
%endif
    mova m3, m0 ; b[0..7]
    mova m4, m2 ; b[8..15]
    psraw m3, 5
    psraw m4, 5
    packuswb m3, m4
    mova [r0+r2], m3
%if mmsize == 8
    mova m3, m5 ; b[8..11]
    mova m4, m6 ; b[12..15]
    psraw m3, 5
    psraw m4, 5
    packuswb m3, m4
    mova [r0+r2+8], m3
%endif
    paddw m0, m1
    paddw m2, m1
%if mmsize == 8
    paddw m5, m1
    paddw m6, m1
%endif
    lea r0, [r0+r2*2]
    dec r4
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmx
H264_PRED16x16_PLANE h264
H264_PRED16x16_PLANE rv40
H264_PRED16x16_PLANE svq3
INIT_MMX mmx2
H264_PRED16x16_PLANE h264
H264_PRED16x16_PLANE rv40
H264_PRED16x16_PLANE svq3
INIT_XMM sse2
H264_PRED16x16_PLANE h264
H264_PRED16x16_PLANE rv40
H264_PRED16x16_PLANE svq3
INIT_XMM ssse3
H264_PRED16x16_PLANE h264
H264_PRED16x16_PLANE rv40
H264_PRED16x16_PLANE svq3
INIT_XMM

;-----------------------------------------------------------------------------
; void pred8x8_plane(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
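; Same scheme as pred16x16_plane with a 4-sample radius:
;   H = sum(i * (top[3+i] - top[3-i]), i = 1..4), V analogously on the left,
;   b = (17*H + 16) >> 5,  c = (17*V + 16) >> 5,
;   a = 16 * (top[7] + left[7] + 1),
;   pred[y][x] = clip((a + b*(x-3) + c*(y-3)) >> 5)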
%macro H264_PRED8x8_PLANE 0
cglobal pred8x8_plane_8, 2,9,7
    mov r2, r1 ; +stride
    neg r1 ; -stride
    movd m0, [r0+r1 -1]
%if mmsize == 8
    pxor m2, m2
    movh m1, [r0+r1 +4 ]
    punpcklbw m0, m2
    punpcklbw m1, m2
    pmullw m0, [pw_m4to4]
    pmullw m1, [pw_m4to4+8]
%else ; mmsize == 16
%if cpuflag(ssse3)
    movhps m0, [r0+r1 +4] ; this reads 4 bytes more than necessary
    pmaddubsw m0, [plane8_shuf] ; H coefficients
%else ; sse2
    pxor m2, m2
    movd m1, [r0+r1 +4]
    punpckldq m0, m1
    punpcklbw m0, m2
    pmullw m0, [pw_m4to4]
%endif
    movhlps m1, m0
%endif
    paddw m0, m1
%if notcpuflag(ssse3)
%if cpuflag(mmx2)
    PSHUFLW m1, m0, 0xE
%elif cpuflag(mmx)
    mova m1, m0
    psrlq m1, 32
%endif
    paddw m0, m1
%endif ; !ssse3
%if cpuflag(mmx2)
    PSHUFLW m1, m0, 0x1
%elif cpuflag(mmx)
    mova m1, m0
    psrlq m1, 16
%endif
    paddw m0, m1 ; sum of H coefficients
    lea r4, [r0+r2*4-1]
    lea r3, [r0 -1]
    add r4, r2
%if ARCH_X86_64
%define e_reg r8
%else
%define e_reg r0
%endif
    movzx e_reg, byte [r3+r2*2 ]
    movzx r5, byte [r4+r1 ]
    sub r5, e_reg
    movzx e_reg, byte [r3 ]
%if ARCH_X86_64
    movzx r7, byte [r4+r2 ]
    sub r7, e_reg
    sub r5, r7
%else
    movzx r6, byte [r4+r2 ]
    sub r6, e_reg
    lea r5, [r5+r6*4]
    sub r5, r6
%endif
    movzx e_reg, byte [r3+r1 ]
    movzx r6, byte [r4+r2*2 ]
    sub r6, e_reg
%if ARCH_X86_64
    add r6, r7
%endif
    lea r5, [r5+r6*4]
    movzx e_reg, byte [r3+r2 ]
    movzx r6, byte [r4 ]
    sub r6, e_reg
    lea r6, [r5+r6*2]
    lea r5, [r6*9+16]
    lea r5, [r5+r6*8]
    sar r5, 5
%if ARCH_X86_64 == 0
    mov r0, r0m
%endif
    movzx r3, byte [r4+r2*2 ]
    movzx r4, byte [r0+r1 +7]
    lea r3, [r3+r4+1]
    shl r3, 4
    movd r1d, m0
    movsx r1d, r1w
    imul r1d, 17
    add r1d, 16
    sar r1d, 5
    movd m0, r1d
    add r1d, r5d
    sub r3d, r1d
    add r1d, r1d
    sub r3d, r1d ; a
    movd m1, r5d
    movd m3, r3d
    SPLATW m0, m0, 0 ; H
    SPLATW m1, m1, 0 ; V
    SPLATW m3, m3, 0 ; a
%if mmsize == 8
    mova m2, m0
%endif
    pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
    paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
%if mmsize == 8
    psllw m2, 2
    paddw m2, m0 ; a + {4,5,6,7}*H
%endif
    mov r4, 4
ALIGN 16
.loop:
%if mmsize == 16
    mova m3, m0 ; b[0..7]
    paddw m0, m1
    psraw m3, 5
    mova m4, m0 ; V+b[0..7]
    paddw m0, m1
    psraw m4, 5
    packuswb m3, m4
    movh [r0], m3
    movhps [r0+r2], m3
%else ; mmsize == 8
    mova m3, m0 ; b[0..3]
    mova m4, m2 ; b[4..7]
    paddw m0, m1
    paddw m2, m1
    psraw m3, 5
    psraw m4, 5
    mova m5, m0 ; V+b[0..3]
    mova m6, m2 ; V+b[4..7]
    paddw m0, m1
    paddw m2, m1
    psraw m5, 5
    psraw m6, 5
    packuswb m3, m4
    packuswb m5, m6
    mova [r0], m3
    mova [r0+r2], m5
%endif
    lea r0, [r0+r2*2]
    dec r4
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmx
H264_PRED8x8_PLANE
INIT_MMX mmx2
H264_PRED8x8_PLANE
INIT_XMM sse2
H264_PRED8x8_PLANE
INIT_XMM ssse3
H264_PRED8x8_PLANE
INIT_XMM

;-----------------------------------------------------------------------------
; void pred8x8_vertical(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
cglobal pred8x8_vertical_8_mmx, 2,2
    sub r0, r1
    movq mm0, [r0]
%rep 3
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    lea r0, [r0+r1*2]
%endrep
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    RET

;-----------------------------------------------------------------------------
; void pred8x8_horizontal(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8_H 0
cglobal pred8x8_horizontal_8, 2,3
    mov r2, 4
%if cpuflag(ssse3)
    mova m2, [pb_3]
%endif
.loop:
    SPLATB_LOAD m0, r0+r1*0-1, m2
    SPLATB_LOAD m1, r0+r1*1-1, m2
    mova [r0+r1*0], m0
    mova [r0+r1*1], m1
    lea r0, [r0+r1*2]
    dec r2
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmx
PRED8x8_H
INIT_MMX mmx2
PRED8x8_H
INIT_MMX ssse3
PRED8x8_H
INIT_MMX

;-----------------------------------------------------------------------------
; void pred8x8_top_dc_mmxext(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
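; Two independent DCs, one per 4-pixel half of the top row:
; dc = (sum + 2) >> 2, computed branch-free as ((sum >> 1) + 1) >> 1
; using psrlw followed by pavgw against a zeroed register, then broadcast
; down all 8 rows.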
cglobal pred8x8_top_dc_8_mmxext, 2,5
    sub r0, r1
    movq mm0, [r0]
    pxor mm1, mm1
    pxor mm2, mm2
    lea r2, [r0+r1*2]
    punpckhbw mm1, mm0
    punpcklbw mm0, mm2
    psadbw mm1, mm2 ; s1
    lea r3, [r2+r1*2]
    psadbw mm0, mm2 ; s0
    psrlw mm1, 1
    psrlw mm0, 1
    pavgw mm1, mm2
    lea r4, [r3+r1*2]
    pavgw mm0, mm2
    pshufw mm1, mm1, 0
    pshufw mm0, mm0, 0 ; dc0 (w)
    packuswb mm0, mm1 ; dc0,dc1 (b)
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    lea r0, [r3+r1*2]
    movq [r2+r1*1], mm0
    movq [r2+r1*2], mm0
    movq [r3+r1*1], mm0
    movq [r3+r1*2], mm0
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    RET

;-----------------------------------------------------------------------------
; void pred8x8_dc_mmxext(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
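; H.264 chroma DC: the four 4x4 quadrants get separate DC values. With
; s0/s1 the sums of the left/right top halves and s2/s3 the sums of the
; upper/lower left halves:
;   dc0 = (s0+s2+4)>>3, dc1 = (s1+2)>>2, dc2 = (s3+2)>>2, dc3 = (s1+s3+4)>>3
; The pshufw+paddw pair below builds {s0+s2, 2*s1, 2*s3, s1+s3} so a single
; psrlw+pavgw pass performs all four (x+4)>>3 roundings at once.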
INIT_MMX
cglobal pred8x8_dc_8_mmxext, 2,5
    sub r0, r1
    pxor m7, m7
    movd m0, [r0+0]
    movd m1, [r0+4]
    psadbw m0, m7 ; s0
    mov r4, r0
    psadbw m1, m7 ; s1
    movzx r2d, byte [r0+r1*1-1]
    movzx r3d, byte [r0+r1*2-1]
    lea r0, [r0+r1*2]
    add r2d, r3d
    movzx r3d, byte [r0+r1*1-1]
    add r2d, r3d
    movzx r3d, byte [r0+r1*2-1]
    add r2d, r3d
    lea r0, [r0+r1*2]
    movd m2, r2d ; s2
    movzx r2d, byte [r0+r1*1-1]
    movzx r3d, byte [r0+r1*2-1]
    lea r0, [r0+r1*2]
    add r2d, r3d
    movzx r3d, byte [r0+r1*1-1]
    add r2d, r3d
    movzx r3d, byte [r0+r1*2-1]
    add r2d, r3d
    movd m3, r2d ; s3
    punpcklwd m0, m1
    mov r0, r4
    punpcklwd m2, m3
    punpckldq m0, m2 ; s0, s1, s2, s3
    pshufw m3, m0, 11110110b ; s2, s1, s3, s3
    lea r2, [r0+r1*2]
    pshufw m0, m0, 01110100b ; s0, s1, s3, s1
    paddw m0, m3
    lea r3, [r2+r1*2]
    psrlw m0, 2
    pavgw m0, m7 ; s0+s2, s1, s3, s1+s3
    lea r4, [r3+r1*2]
    packuswb m0, m0
    punpcklbw m0, m0
    movq m1, m0
    punpcklbw m0, m0
    punpckhbw m1, m1
    movq [r0+r1*1], m0
    movq [r0+r1*2], m0
    movq [r2+r1*1], m0
    movq [r2+r1*2], m0
    movq [r3+r1*1], m1
    movq [r3+r1*2], m1
    movq [r4+r1*1], m1
    movq [r4+r1*2], m1
    RET

;-----------------------------------------------------------------------------
; void pred8x8_dc_rv40(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
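; RV40 uses a single DC for the whole 8x8 block:
; dc = (sum of 8 top + sum of 8 left + 8) >> 4, then broadcast.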
cglobal pred8x8_dc_rv40_8_mmxext, 2,7
    mov r4, r0
    sub r0, r1
    pxor mm0, mm0
    psadbw mm0, [r0]
    dec r0
    movzx r5d, byte [r0+r1*1]
    movd r6d, mm0
    lea r0, [r0+r1*2]
%rep 3
    movzx r2d, byte [r0+r1*0]
    movzx r3d, byte [r0+r1*1]
    add r5d, r2d
    add r6d, r3d
    lea r0, [r0+r1*2]
%endrep
    movzx r2d, byte [r0+r1*0]
    add r5d, r6d
    lea r2d, [r2+r5+8]
    shr r2d, 4
    movd mm0, r2d
    punpcklbw mm0, mm0
    pshufw mm0, mm0, 0
    mov r3d, 4
.loop:
    movq [r4+r1*0], mm0
    movq [r4+r1*1], mm0
    lea r4, [r4+r1*2]
    dec r3d
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void pred8x8_tm_vp8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8_TM_MMX 0
cglobal pred8x8_tm_vp8_8, 2,6
    sub r0, r1
    pxor mm7, mm7
    movq mm0, [r0]
    movq mm1, mm0
    punpcklbw mm0, mm7
    punpckhbw mm1, mm7
    movzx r4d, byte [r0-1]
    mov r5d, 4
.loop:
    movzx r2d, byte [r0+r1*1-1]
    movzx r3d, byte [r0+r1*2-1]
    sub r2d, r4d
    sub r3d, r4d
    movd mm2, r2d
    movd mm4, r3d
    SPLATW mm2, mm2, 0
    SPLATW mm4, mm4, 0
    movq mm3, mm2
    movq mm5, mm4
    paddw mm2, mm0
    paddw mm3, mm1
    paddw mm4, mm0
    paddw mm5, mm1
    packuswb mm2, mm3
    packuswb mm4, mm5
    movq [r0+r1*1], mm2
    movq [r0+r1*2], mm4
    lea r0, [r0+r1*2]
    dec r5d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmx
PRED8x8_TM_MMX
INIT_MMX mmx2
PRED8x8_TM_MMX
INIT_MMX

cglobal pred8x8_tm_vp8_8_sse2, 2,6,4
    sub r0, r1
    pxor xmm1, xmm1
    movq xmm0, [r0]
    punpcklbw xmm0, xmm1
    movzx r4d, byte [r0-1]
    mov r5d, 4
.loop:
    movzx r2d, byte [r0+r1*1-1]
    movzx r3d, byte [r0+r1*2-1]
    sub r2d, r4d
    sub r3d, r4d
    movd xmm2, r2d
    movd xmm3, r3d
    pshuflw xmm2, xmm2, 0
    pshuflw xmm3, xmm3, 0
    punpcklqdq xmm2, xmm2
    punpcklqdq xmm3, xmm3
    paddw xmm2, xmm0
    paddw xmm3, xmm0
    packuswb xmm2, xmm3
    movq [r0+r1*1], xmm2
    movhps [r0+r1*2], xmm2
    lea r0, [r0+r1*2]
    dec r5d
    jg .loop
    REP_RET

cglobal pred8x8_tm_vp8_8_ssse3, 2,3,6
    sub r0, r1
    movdqa xmm4, [tm_shuf]
    pxor xmm1, xmm1
    movq xmm0, [r0]
    punpcklbw xmm0, xmm1
    movd xmm5, [r0-4]
    pshufb xmm5, xmm4
    mov r2d, 4
.loop:
    movd xmm2, [r0+r1*1-4]
    movd xmm3, [r0+r1*2-4]
    pshufb xmm2, xmm4
    pshufb xmm3, xmm4
    psubw xmm2, xmm5
    psubw xmm3, xmm5
    paddw xmm2, xmm0
    paddw xmm3, xmm0
    packuswb xmm2, xmm3
    movq [r0+r1*1], xmm2
    movhps [r0+r1*2], xmm2
    lea r0, [r0+r1*2]
    dec r2d
    jg .loop
    REP_RET

; dest, left, right, src, tmp
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
%macro PRED4x4_LOWPASS 5
    mova %5, %2
    pavgb %2, %3
    pxor %3, %5
    mova %1, %4
    pand %3, [pb_1]
    psubusb %2, %3
    pavgb %1, %2
%endmacro
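; How the macro stays exact using only byte ops: pavgb(l,r) computes
; (l+r+1)>>1, which is one too high whenever l+r is odd. The odd case is
; detected as (l^r)&1 and removed with psubusb, leaving floor((l+r)/2).
; A second pavgb against the centre sample then yields
; ((l+r)>>1 + src + 1) >> 1 == (l + 2*src + r + 2) >> 2 for all inputs.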

;-----------------------------------------------------------------------------
; void pred8x8l_top_dc(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
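; The pred8x8l functions below share a pattern: the top edge is low-pass
; filtered with PRED4x4_LOWPASS before use, and the .fix_lt_*/.fix_tr_*
; blocks splice a byte from the available neighbours into the missing
; position (the pxor/psllq/psrlq sequences isolate and replace a single
; byte) when has_topleft or has_topright says that neighbour is absent.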
%macro PRED8x8L_TOP_DC 1
cglobal pred8x8l_top_dc_8_%1, 4,4
    sub r0, r3
    pxor mm7, mm7
    movq mm0, [r0-8]
    movq mm3, [r0]
    movq mm1, [r0+8]
    movq mm2, mm3
    movq mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test r1, r1 ; top_left
    jz .fix_lt_2
    test r2, r2 ; top_right
    jz .fix_tr_1
    jmp .body
.fix_lt_2:
    movq mm5, mm3
    pxor mm5, mm2
    psllq mm5, 56
    psrlq mm5, 56
    pxor mm2, mm5
    test r2, r2 ; top_right
    jnz .body
.fix_tr_1:
    movq mm5, mm3
    pxor mm5, mm1
    psrlq mm5, 56
    psllq mm5, 56
    pxor mm1, mm5
.body:
    PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
    psadbw mm7, mm0
    paddw mm7, [pw_4]
    psrlw mm7, 3
    pshufw mm7, mm7, 0
    packuswb mm7, mm7
%rep 3
    movq [r0+r3*1], mm7
    movq [r0+r3*2], mm7
    lea r0, [r0+r3*2]
%endrep
    movq [r0+r3*1], mm7
    movq [r0+r3*2], mm7
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_TOP_DC mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_TOP_DC ssse3

;-----------------------------------------------------------------------------
; void pred8x8l_dc(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_DC 1
cglobal pred8x8l_dc_8_%1, 4,5
    sub r0, r3
    lea r4, [r0+r3*2]
    movq mm0, [r0+r3*1-8]
    punpckhbw mm0, [r0+r3*0-8]
    movq mm1, [r4+r3*1-8]
    punpckhbw mm1, [r0+r3*2-8]
    mov r4, r0
    punpckhwd mm1, mm0
    lea r0, [r0+r3*4]
    movq mm2, [r0+r3*1-8]
    punpckhbw mm2, [r0+r3*0-8]
    lea r0, [r0+r3*2]
    movq mm3, [r0+r3*1-8]
    punpckhbw mm3, [r0+r3*0-8]
    punpckhwd mm3, mm2
    punpckhdq mm3, mm1
    lea r0, [r0+r3*2]
    movq mm0, [r0+r3*0-8]
    movq mm1, [r4]
    mov r0, r4
    movq mm4, mm3
    movq mm2, mm3
    PALIGNR mm4, mm0, 7, mm0
    PALIGNR mm1, mm2, 1, mm2
    test r1, r1
    jnz .do_left
.fix_lt_1:
    movq mm5, mm3
    pxor mm5, mm4
    psrlq mm5, 56
    psllq mm5, 48
    pxor mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq mm5, mm3
    pxor mm5, mm2
    psllq mm5, 56
    psrlq mm5, 56
    pxor mm2, mm5
    test r2, r2
    jnz .body
.fix_tr_1:
    movq mm5, mm3
    pxor mm5, mm1
    psrlq mm5, 56
    psllq mm5, 56
    pxor mm1, mm5
    jmp .body
.do_left:
    movq mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq mm4, mm0
    movq mm7, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq mm1, 56
    PALIGNR mm7, mm1, 7, mm3
    movq mm0, [r0-8]
    movq mm3, [r0]
    movq mm1, [r0+8]
    movq mm2, mm3
    movq mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test r1, r1
    jz .fix_lt_2
    test r2, r2
    jz .fix_tr_1
.body:
    lea r1, [r0+r3*2]
    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
    pxor mm0, mm0
    pxor mm1, mm1
    lea r2, [r1+r3*2]
    psadbw mm0, mm7
    psadbw mm1, mm6
    paddw mm0, [pw_8]
    paddw mm0, mm1
    lea r4, [r2+r3*2]
    psrlw mm0, 4
    pshufw mm0, mm0, 0
    packuswb mm0, mm0
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm0
    movq [r1+r3*1], mm0
    movq [r1+r3*2], mm0
    movq [r2+r3*1], mm0
    movq [r2+r3*2], mm0
    movq [r4+r3*1], mm0
    movq [r4+r3*2], mm0
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_DC mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_DC ssse3

;-----------------------------------------------------------------------------
; void pred8x8l_horizontal(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_HORIZONTAL 1
cglobal pred8x8l_horizontal_8_%1, 4,4
    sub r0, r3
    lea r2, [r0+r3*2]
    movq mm0, [r0+r3*1-8]
    test r1, r1
    lea r1, [r0+r3]
    cmovnz r1, r0
    punpckhbw mm0, [r1+r3*0-8]
    movq mm1, [r2+r3*1-8]
    punpckhbw mm1, [r0+r3*2-8]
    mov r2, r0
    punpckhwd mm1, mm0
    lea r0, [r0+r3*4]
    movq mm2, [r0+r3*1-8]
    punpckhbw mm2, [r0+r3*0-8]
    lea r0, [r0+r3*2]
    movq mm3, [r0+r3*1-8]
    punpckhbw mm3, [r0+r3*0-8]
    punpckhwd mm3, mm2
    punpckhdq mm3, mm1
    lea r0, [r0+r3*2]
    movq mm0, [r0+r3*0-8]
    movq mm1, [r1+r3*0-8]
    mov r0, r2
    movq mm4, mm3
    movq mm2, mm3
    PALIGNR mm4, mm0, 7, mm0
    PALIGNR mm1, mm2, 1, mm2
    movq mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq mm4, mm0
    movq mm7, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq mm1, 56
    PALIGNR mm7, mm1, 7, mm3
    movq mm3, mm7
    lea r1, [r0+r3*2]
    movq mm7, mm3
    punpckhbw mm3, mm3
    punpcklbw mm7, mm7
    pshufw mm0, mm3, 0xff
    pshufw mm1, mm3, 0xaa
    lea r2, [r1+r3*2]
    pshufw mm2, mm3, 0x55
    pshufw mm3, mm3, 0x00
    pshufw mm4, mm7, 0xff
    pshufw mm5, mm7, 0xaa
    pshufw mm6, mm7, 0x55
    pshufw mm7, mm7, 0x00
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm1
    movq [r1+r3*1], mm2
    movq [r1+r3*2], mm3
    movq [r2+r3*1], mm4
    movq [r2+r3*2], mm5
    lea r0, [r2+r3*2]
    movq [r0+r3*1], mm6
    movq [r0+r3*2], mm7
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_HORIZONTAL mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_HORIZONTAL ssse3

;-----------------------------------------------------------------------------
; void pred8x8l_vertical(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_VERTICAL 1
cglobal pred8x8l_vertical_8_%1, 4,4
    sub r0, r3
    movq mm0, [r0-8]
    movq mm3, [r0]
    movq mm1, [r0+8]
    movq mm2, mm3
    movq mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test r1, r1 ; top_left
    jz .fix_lt_2
    test r2, r2 ; top_right
    jz .fix_tr_1
    jmp .body
.fix_lt_2:
    movq mm5, mm3
    pxor mm5, mm2
    psllq mm5, 56
    psrlq mm5, 56
    pxor mm2, mm5
    test r2, r2 ; top_right
    jnz .body
.fix_tr_1:
    movq mm5, mm3
    pxor mm5, mm1
    psrlq mm5, 56
    psllq mm5, 56
    pxor mm1, mm5
.body:
    PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
%rep 3
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm0
    lea r0, [r0+r3*2]
%endrep
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm0
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_VERTICAL mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_VERTICAL ssse3

;-----------------------------------------------------------------------------
; void pred8x8l_down_left(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
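; Diagonal down-left: the filtered 8-byte top edge and the filtered
; top-right extension are joined into one 16-byte diagonal, low-pass
; filtered once more, and each successive row then stores that same
; diagonal shifted along by one byte (the shift/por ladder below).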
  1207. INIT_MMX
  1208. %define PALIGNR PALIGNR_MMX
  1209. cglobal pred8x8l_down_left_8_mmxext, 4,5
  1210. sub r0, r3
  1211. movq mm0, [r0-8]
  1212. movq mm3, [r0]
  1213. movq mm1, [r0+8]
  1214. movq mm2, mm3
  1215. movq mm4, mm3
  1216. PALIGNR mm2, mm0, 7, mm0
  1217. PALIGNR mm1, mm4, 1, mm4
  1218. test r1, r1
  1219. jz .fix_lt_2
  1220. test r2, r2
  1221. jz .fix_tr_1
  1222. jmp .do_top
  1223. .fix_lt_2:
  1224. movq mm5, mm3
  1225. pxor mm5, mm2
  1226. psllq mm5, 56
  1227. psrlq mm5, 56
  1228. pxor mm2, mm5
  1229. test r2, r2
  1230. jnz .do_top
  1231. .fix_tr_1:
  1232. movq mm5, mm3
  1233. pxor mm5, mm1
  1234. psrlq mm5, 56
  1235. psllq mm5, 56
  1236. pxor mm1, mm5
  1237. jmp .do_top
  1238. .fix_tr_2:
  1239. punpckhbw mm3, mm3
  1240. pshufw mm1, mm3, 0xFF
  1241. jmp .do_topright
  1242. .do_top:
  1243. PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
  1244. movq mm7, mm4
  1245. test r2, r2
  1246. jz .fix_tr_2
  1247. movq mm0, [r0+8]
  1248. movq mm5, mm0
  1249. movq mm2, mm0
  1250. movq mm4, mm0
  1251. psrlq mm5, 56
  1252. PALIGNR mm2, mm3, 7, mm3
  1253. PALIGNR mm5, mm4, 1, mm4
  1254. PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
  1255. .do_topright:
  1256. lea r1, [r0+r3*2]
  1257. movq mm6, mm1
  1258. psrlq mm1, 56
  1259. movq mm4, mm1
  1260. lea r2, [r1+r3*2]
  1261. movq mm2, mm6
  1262. PALIGNR mm2, mm7, 1, mm0
  1263. movq mm3, mm6
  1264. PALIGNR mm3, mm7, 7, mm0
  1265. PALIGNR mm4, mm6, 1, mm0
  1266. movq mm5, mm7
  1267. movq mm1, mm7
  1268. movq mm7, mm6
  1269. lea r4, [r2+r3*2]
  1270. psllq mm1, 8
  1271. PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
  1272. PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
  1273. movq [r4+r3*2], mm1
  1274. movq mm2, mm0
  1275. psllq mm1, 8
  1276. psrlq mm2, 56
  1277. psllq mm0, 8
  1278. por mm1, mm2
  1279. movq [r4+r3*1], mm1
  1280. movq mm2, mm0
  1281. psllq mm1, 8
  1282. psrlq mm2, 56
  1283. psllq mm0, 8
  1284. por mm1, mm2
  1285. movq [r2+r3*2], mm1
  1286. movq mm2, mm0
  1287. psllq mm1, 8
  1288. psrlq mm2, 56
  1289. psllq mm0, 8
  1290. por mm1, mm2
  1291. movq [r2+r3*1], mm1
  1292. movq mm2, mm0
  1293. psllq mm1, 8
  1294. psrlq mm2, 56
  1295. psllq mm0, 8
  1296. por mm1, mm2
  1297. movq [r1+r3*2], mm1
  1298. movq mm2, mm0
  1299. psllq mm1, 8
  1300. psrlq mm2, 56
  1301. psllq mm0, 8
  1302. por mm1, mm2
  1303. movq [r1+r3*1], mm1
  1304. movq mm2, mm0
  1305. psllq mm1, 8
  1306. psrlq mm2, 56
  1307. psllq mm0, 8
  1308. por mm1, mm2
  1309. movq [r0+r3*2], mm1
  1310. psllq mm1, 8
  1311. psrlq mm0, 56
  1312. por mm1, mm0
  1313. movq [r0+r3*1], mm1
  1314. RET
  1315. %macro PRED8x8L_DOWN_LEFT 1
  1316. cglobal pred8x8l_down_left_8_%1, 4,4
  1317. sub r0, r3
  1318. movq mm0, [r0-8]
  1319. movq mm3, [r0]
  1320. movq mm1, [r0+8]
  1321. movq mm2, mm3
  1322. movq mm4, mm3
  1323. PALIGNR mm2, mm0, 7, mm0
  1324. PALIGNR mm1, mm4, 1, mm4
  1325. test r1, r1 ; top_left
  1326. jz .fix_lt_2
  1327. test r2, r2 ; top_right
  1328. jz .fix_tr_1
  1329. jmp .do_top
  1330. .fix_lt_2:
  1331. movq mm5, mm3
  1332. pxor mm5, mm2
  1333. psllq mm5, 56
  1334. psrlq mm5, 56
  1335. pxor mm2, mm5
  1336. test r2, r2 ; top_right
  1337. jnz .do_top
  1338. .fix_tr_1:
  1339. movq mm5, mm3
  1340. pxor mm5, mm1
  1341. psrlq mm5, 56
  1342. psllq mm5, 56
  1343. pxor mm1, mm5
  1344. jmp .do_top
  1345. .fix_tr_2:
  1346. punpckhbw mm3, mm3
  1347. pshufw mm1, mm3, 0xFF
  1348. jmp .do_topright
  1349. .do_top:
  1350. PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
  1351. movq2dq xmm3, mm4
  1352. test r2, r2 ; top_right
  1353. jz .fix_tr_2
  1354. movq mm0, [r0+8]
  1355. movq mm5, mm0
  1356. movq mm2, mm0
  1357. movq mm4, mm0
  1358. psrlq mm5, 56
  1359. PALIGNR mm2, mm3, 7, mm3
  1360. PALIGNR mm5, mm4, 1, mm4
  1361. PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
  1362. .do_topright:
  1363. movq2dq xmm4, mm1
  1364. psrlq mm1, 56
  1365. movq2dq xmm5, mm1
  1366. lea r1, [r0+r3*2]
  1367. pslldq xmm4, 8
  1368. por xmm3, xmm4
  1369. movdqa xmm2, xmm3
  1370. psrldq xmm2, 1
  1371. pslldq xmm5, 15
  1372. por xmm2, xmm5
  1373. lea r2, [r1+r3*2]
  1374. movdqa xmm1, xmm3
  1375. pslldq xmm1, 1
  1376. INIT_XMM
  1377. PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
  1378. psrldq xmm0, 1
  1379. movq [r0+r3*1], xmm0
  1380. psrldq xmm0, 1
  1381. movq [r0+r3*2], xmm0
  1382. psrldq xmm0, 1
  1383. lea r0, [r2+r3*2]
  1384. movq [r1+r3*1], xmm0
  1385. psrldq xmm0, 1
  1386. movq [r1+r3*2], xmm0
  1387. psrldq xmm0, 1
  1388. movq [r2+r3*1], xmm0
  1389. psrldq xmm0, 1
  1390. movq [r2+r3*2], xmm0
  1391. psrldq xmm0, 1
  1392. movq [r0+r3*1], xmm0
  1393. psrldq xmm0, 1
  1394. movq [r0+r3*2], xmm0
  1395. RET
  1396. %endmacro
  1397. INIT_MMX
  1398. %define PALIGNR PALIGNR_MMX
  1399. PRED8x8L_DOWN_LEFT sse2
  1400. INIT_MMX
  1401. %define PALIGNR PALIGNR_SSSE3
  1402. PRED8x8L_DOWN_LEFT ssse3
  1403. ;-----------------------------------------------------------------------------
  1404. ;void pred8x8l_down_right_mmxext(uint8_t *src, int has_topleft, int has_topright, int stride)
  1405. ;-----------------------------------------------------------------------------
  1406. INIT_MMX
  1407. %define PALIGNR PALIGNR_MMX
  1408. cglobal pred8x8l_down_right_8_mmxext, 4,5
  1409. sub r0, r3
  1410. lea r4, [r0+r3*2]
  1411. movq mm0, [r0+r3*1-8]
  1412. punpckhbw mm0, [r0+r3*0-8]
  1413. movq mm1, [r4+r3*1-8]
  1414. punpckhbw mm1, [r0+r3*2-8]
  1415. mov r4, r0
  1416. punpckhwd mm1, mm0
  1417. lea r0, [r0+r3*4]
  1418. movq mm2, [r0+r3*1-8]
  1419. punpckhbw mm2, [r0+r3*0-8]
  1420. lea r0, [r0+r3*2]
  1421. movq mm3, [r0+r3*1-8]
  1422. punpckhbw mm3, [r0+r3*0-8]
  1423. punpckhwd mm3, mm2
  1424. punpckhdq mm3, mm1
  1425. lea r0, [r0+r3*2]
  1426. movq mm0, [r0+r3*0-8]
  1427. movq mm1, [r4]
  1428. mov r0, r4
  1429. movq mm4, mm3
  1430. movq mm2, mm3
  1431. PALIGNR mm4, mm0, 7, mm0
  1432. PALIGNR mm1, mm2, 1, mm2
  1433. test r1, r1 ; top_left
  1434. jz .fix_lt_1
  1435. .do_left:
  1436. movq mm0, mm4
  1437. PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
  1438. movq mm4, mm0
  1439. movq mm7, mm2
  1440. movq mm6, mm2
  1441. PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
  1442. psllq mm1, 56
  1443. PALIGNR mm7, mm1, 7, mm3
  1444. movq mm0, [r0-8]
  1445. movq mm3, [r0]
  1446. movq mm1, [r0+8]
  1447. movq mm2, mm3
  1448. movq mm4, mm3
  1449. PALIGNR mm2, mm0, 7, mm0
  1450. PALIGNR mm1, mm4, 1, mm4
  1451. test r1, r1 ; top_left
  1452. jz .fix_lt_2
  1453. test r2, r2 ; top_right
  1454. jz .fix_tr_1
  1455. .do_top:
  1456. PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
  1457. movq mm5, mm4
  1458. jmp .body
  1459. .fix_lt_1:
  1460. movq mm5, mm3
  1461. pxor mm5, mm4
  1462. psrlq mm5, 56
  1463. psllq mm5, 48
  1464. pxor mm1, mm5
  1465. jmp .do_left
  1466. .fix_lt_2:
  1467. movq mm5, mm3
  1468. pxor mm5, mm2
  1469. psllq mm5, 56
  1470. psrlq mm5, 56
  1471. pxor mm2, mm5
  1472. test r2, r2 ; top_right
  1473. jnz .do_top
  1474. .fix_tr_1:
  1475. movq mm5, mm3
  1476. pxor mm5, mm1
  1477. psrlq mm5, 56
  1478. psllq mm5, 56
  1479. pxor mm1, mm5
  1480. jmp .do_top
  1481. .body:
  1482. lea r1, [r0+r3*2]
  1483. movq mm1, mm7
  1484. movq mm7, mm5
  1485. movq mm5, mm6
  1486. movq mm2, mm7
  1487. lea r2, [r1+r3*2]
  1488. PALIGNR mm2, mm6, 1, mm0
  1489. movq mm3, mm7
  1490. PALIGNR mm3, mm6, 7, mm0
  1491. movq mm4, mm7
  1492. lea r4, [r2+r3*2]
  1493. psrlq mm4, 8
  1494. PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
  1495. PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
  1496. movq [r4+r3*2], mm0
  1497. movq mm2, mm1
  1498. psrlq mm0, 8
  1499. psllq mm2, 56
  1500. psrlq mm1, 8
  1501. por mm0, mm2
  1502. movq [r4+r3*1], mm0
  1503. movq mm2, mm1
  1504. psrlq mm0, 8
  1505. psllq mm2, 56
  1506. psrlq mm1, 8
  1507. por mm0, mm2
  1508. movq [r2+r3*2], mm0
  1509. movq mm2, mm1
  1510. psrlq mm0, 8
  1511. psllq mm2, 56
  1512. psrlq mm1, 8
  1513. por mm0, mm2
  1514. movq [r2+r3*1], mm0
  1515. movq mm2, mm1
  1516. psrlq mm0, 8
  1517. psllq mm2, 56
  1518. psrlq mm1, 8
  1519. por mm0, mm2
  1520. movq [r1+r3*2], mm0
  1521. movq mm2, mm1
  1522. psrlq mm0, 8
  1523. psllq mm2, 56
  1524. psrlq mm1, 8
  1525. por mm0, mm2
  1526. movq [r1+r3*1], mm0
  1527. movq mm2, mm1
  1528. psrlq mm0, 8
  1529. psllq mm2, 56
  1530. psrlq mm1, 8
  1531. por mm0, mm2
  1532. movq [r0+r3*2], mm0
  1533. psrlq mm0, 8
  1534. psllq mm1, 56
  1535. por mm0, mm1
  1536. movq [r0+r3*1], mm0
  1537. RET
  1538. %macro PRED8x8L_DOWN_RIGHT 1
  1539. cglobal pred8x8l_down_right_8_%1, 4,5
  1540. sub r0, r3
  1541. lea r4, [r0+r3*2]
  1542. movq mm0, [r0+r3*1-8]
  1543. punpckhbw mm0, [r0+r3*0-8]
  1544. movq mm1, [r4+r3*1-8]
  1545. punpckhbw mm1, [r0+r3*2-8]
  1546. mov r4, r0
  1547. punpckhwd mm1, mm0
  1548. lea r0, [r0+r3*4]
  1549. movq mm2, [r0+r3*1-8]
  1550. punpckhbw mm2, [r0+r3*0-8]
  1551. lea r0, [r0+r3*2]
  1552. movq mm3, [r0+r3*1-8]
  1553. punpckhbw mm3, [r0+r3*0-8]
  1554. punpckhwd mm3, mm2
  1555. punpckhdq mm3, mm1
  1556. lea r0, [r0+r3*2]
  1557. movq mm0, [r0+r3*0-8]
  1558. movq mm1, [r4]
  1559. mov r0, r4
  1560. movq mm4, mm3
  1561. movq mm2, mm3
  1562. PALIGNR mm4, mm0, 7, mm0
  1563. PALIGNR mm1, mm2, 1, mm2
  1564. test r1, r1
  1565. jz .fix_lt_1
  1566. jmp .do_left
  1567. .fix_lt_1:
  1568. movq mm5, mm3
  1569. pxor mm5, mm4
  1570. psrlq mm5, 56
  1571. psllq mm5, 48
  1572. pxor mm1, mm5
  1573. jmp .do_left
  1574. .fix_lt_2:
  1575. movq mm5, mm3
  1576. pxor mm5, mm2
  1577. psllq mm5, 56
  1578. psrlq mm5, 56
  1579. pxor mm2, mm5
  1580. test r2, r2
  1581. jnz .do_top
  1582. .fix_tr_1:
  1583. movq mm5, mm3
  1584. pxor mm5, mm1
  1585. psrlq mm5, 56
  1586. psllq mm5, 56
  1587. pxor mm1, mm5
  1588. jmp .do_top
  1589. .do_left:
  1590. movq mm0, mm4
  1591. PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
  1592. movq mm4, mm0
  1593. movq mm7, mm2
  1594. movq2dq xmm3, mm2
  1595. PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
  1596. psllq mm1, 56
  1597. PALIGNR mm7, mm1, 7, mm3
  1598. movq2dq xmm1, mm7
  1599. movq mm0, [r0-8]
  1600. movq mm3, [r0]
  1601. movq mm1, [r0+8]
  1602. movq mm2, mm3
  1603. movq mm4, mm3
  1604. PALIGNR mm2, mm0, 7, mm0
  1605. PALIGNR mm1, mm4, 1, mm4
  1606. test r1, r1
  1607. jz .fix_lt_2
  1608. test r2, r2
  1609. jz .fix_tr_1
  1610. .do_top:
  1611. PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
  1612. movq2dq xmm4, mm4
  1613. lea r1, [r0+r3*2]
  1614. movdqa xmm0, xmm3
  1615. pslldq xmm4, 8
  1616. por xmm3, xmm4
  1617. lea r2, [r1+r3*2]
  1618. pslldq xmm4, 1
  1619. por xmm1, xmm4
  1620. psrldq xmm0, 7
  1621. pslldq xmm0, 15
  1622. psrldq xmm0, 7
  1623. por xmm1, xmm0
  1624. lea r0, [r2+r3*2]
  1625. movdqa xmm2, xmm3
  1626. psrldq xmm2, 1
  1627. INIT_XMM
  1628. PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
  1629. movdqa xmm1, xmm0
  1630. psrldq xmm1, 1
  1631. movq [r0+r3*2], xmm0
  1632. movq [r0+r3*1], xmm1
  1633. psrldq xmm0, 2
  1634. psrldq xmm1, 2
  1635. movq [r2+r3*2], xmm0
  1636. movq [r2+r3*1], xmm1
  1637. psrldq xmm0, 2
  1638. psrldq xmm1, 2
  1639. movq [r1+r3*2], xmm0
  1640. movq [r1+r3*1], xmm1
  1641. psrldq xmm0, 2
  1642. psrldq xmm1, 2
  1643. movq [r4+r3*2], xmm0
  1644. movq [r4+r3*1], xmm1
  1645. RET
  1646. %endmacro
  1647. INIT_MMX
  1648. %define PALIGNR PALIGNR_MMX
  1649. PRED8x8L_DOWN_RIGHT sse2
  1650. INIT_MMX
  1651. %define PALIGNR PALIGNR_SSSE3
  1652. PRED8x8L_DOWN_RIGHT ssse3
  1653. ;-----------------------------------------------------------------------------
  1654. ; void pred8x8l_vertical_right(uint8_t *src, int has_topleft, int has_topright, int stride)
  1655. ;-----------------------------------------------------------------------------
  1656. INIT_MMX
  1657. %define PALIGNR PALIGNR_MMX
  1658. cglobal pred8x8l_vertical_right_8_mmxext, 4,5
  1659. sub r0, r3
  1660. lea r4, [r0+r3*2]
  1661. movq mm0, [r0+r3*1-8]
  1662. punpckhbw mm0, [r0+r3*0-8]
  1663. movq mm1, [r4+r3*1-8]
  1664. punpckhbw mm1, [r0+r3*2-8]
  1665. mov r4, r0
  1666. punpckhwd mm1, mm0
  1667. lea r0, [r0+r3*4]
  1668. movq mm2, [r0+r3*1-8]
  1669. punpckhbw mm2, [r0+r3*0-8]
  1670. lea r0, [r0+r3*2]
  1671. movq mm3, [r0+r3*1-8]
  1672. punpckhbw mm3, [r0+r3*0-8]
  1673. punpckhwd mm3, mm2
  1674. punpckhdq mm3, mm1
  1675. lea r0, [r0+r3*2]
  1676. movq mm0, [r0+r3*0-8]
  1677. movq mm1, [r4]
  1678. mov r0, r4
  1679. movq mm4, mm3
  1680. movq mm2, mm3
  1681. PALIGNR mm4, mm0, 7, mm0
  1682. PALIGNR mm1, mm2, 1, mm2
  1683. test r1, r1
  1684. jz .fix_lt_1
  1685. jmp .do_left
  1686. .fix_lt_1:
  1687. movq mm5, mm3
  1688. pxor mm5, mm4
  1689. psrlq mm5, 56
  1690. psllq mm5, 48
  1691. pxor mm1, mm5
  1692. jmp .do_left
  1693. .fix_lt_2:
  1694. movq mm5, mm3
  1695. pxor mm5, mm2
  1696. psllq mm5, 56
  1697. psrlq mm5, 56
  1698. pxor mm2, mm5
  1699. test r2, r2
  1700. jnz .do_top
  1701. .fix_tr_1:
  1702. movq mm5, mm3
  1703. pxor mm5, mm1
  1704. psrlq mm5, 56
  1705. psllq mm5, 56
  1706. pxor mm1, mm5
  1707. jmp .do_top
  1708. .do_left:
  1709. movq mm0, mm4
  1710. PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
  1711. movq mm7, mm2
  1712. movq mm0, [r0-8]
  1713. movq mm3, [r0]
  1714. movq mm1, [r0+8]
  1715. movq mm2, mm3
  1716. movq mm4, mm3
  1717. PALIGNR mm2, mm0, 7, mm0
  1718. PALIGNR mm1, mm4, 1, mm4
  1719. test r1, r1
  1720. jz .fix_lt_2
  1721. test r2, r2
  1722. jz .fix_tr_1
  1723. .do_top:
  1724. PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
  1725. lea r1, [r0+r3*2]
  1726. movq mm2, mm6
  1727. movq mm3, mm6
  1728. PALIGNR mm3, mm7, 7, mm0
  1729. PALIGNR mm6, mm7, 6, mm1
  1730. movq mm4, mm3
  1731. pavgb mm3, mm2
  1732. lea r2, [r1+r3*2]
  1733. PRED4x4_LOWPASS mm0, mm6, mm2, mm4, mm5
  1734. movq [r0+r3*1], mm3
  1735. movq [r0+r3*2], mm0
  1736. movq mm5, mm0
  1737. movq mm6, mm3
  1738. movq mm1, mm7
  1739. movq mm2, mm1
  1740. psllq mm2, 8
  1741. movq mm3, mm1
  1742. psllq mm3, 16
  1743. lea r4, [r2+r3*2]
  1744. PRED4x4_LOWPASS mm0, mm1, mm3, mm2, mm4
  1745. PALIGNR mm6, mm0, 7, mm2
  1746. movq [r1+r3*1], mm6
  1747. psllq mm0, 8
  1748. PALIGNR mm5, mm0, 7, mm1
  1749. movq [r1+r3*2], mm5
  1750. psllq mm0, 8
  1751. PALIGNR mm6, mm0, 7, mm2
  1752. movq [r2+r3*1], mm6
  1753. psllq mm0, 8
  1754. PALIGNR mm5, mm0, 7, mm1
  1755. movq [r2+r3*2], mm5
  1756. psllq mm0, 8
  1757. PALIGNR mm6, mm0, 7, mm2
  1758. movq [r4+r3*1], mm6
  1759. psllq mm0, 8
  1760. PALIGNR mm5, mm0, 7, mm1
  1761. movq [r4+r3*2], mm5
  1762. RET
  1763. %macro PRED8x8L_VERTICAL_RIGHT 1
  1764. cglobal pred8x8l_vertical_right_8_%1, 4,5,7
  1765. ; manually spill XMM registers for Win64 because
  1766. ; the code here is initialized with INIT_MMX
  1767. WIN64_SPILL_XMM 7
  1768. sub r0, r3
  1769. lea r4, [r0+r3*2]
  1770. movq mm0, [r0+r3*1-8]
  1771. punpckhbw mm0, [r0+r3*0-8]
  1772. movq mm1, [r4+r3*1-8]
  1773. punpckhbw mm1, [r0+r3*2-8]
  1774. mov r4, r0
  1775. punpckhwd mm1, mm0
  1776. lea r0, [r0+r3*4]
  1777. movq mm2, [r0+r3*1-8]
  1778. punpckhbw mm2, [r0+r3*0-8]
  1779. lea r0, [r0+r3*2]
  1780. movq mm3, [r0+r3*1-8]
  1781. punpckhbw mm3, [r0+r3*0-8]
  1782. punpckhwd mm3, mm2
  1783. punpckhdq mm3, mm1
  1784. lea r0, [r0+r3*2]
  1785. movq mm0, [r0+r3*0-8]
  1786. movq mm1, [r4]
  1787. mov r0, r4
  1788. movq mm4, mm3
  1789. movq mm2, mm3
  1790. PALIGNR mm4, mm0, 7, mm0
  1791. PALIGNR mm1, mm2, 1, mm2
  1792. test r1, r1
  1793. jnz .do_left
  1794. .fix_lt_1:
  1795. movq mm5, mm3
  1796. pxor mm5, mm4
  1797. psrlq mm5, 56
  1798. psllq mm5, 48
  1799. pxor mm1, mm5
  1800. jmp .do_left
  1801. .fix_lt_2:
  1802. movq mm5, mm3
  1803. pxor mm5, mm2
  1804. psllq mm5, 56
  1805. psrlq mm5, 56
  1806. pxor mm2, mm5
  1807. test r2, r2
  1808. jnz .do_top
  1809. .fix_tr_1:
  1810. movq mm5, mm3
  1811. pxor mm5, mm1
  1812. psrlq mm5, 56
  1813. psllq mm5, 56
  1814. pxor mm1, mm5
  1815. jmp .do_top
  1816. .do_left:
  1817. movq mm0, mm4
  1818. PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
  1819. movq2dq xmm0, mm2
  1820. movq mm0, [r0-8]
  1821. movq mm3, [r0]
  1822. movq mm1, [r0+8]
  1823. movq mm2, mm3
  1824. movq mm4, mm3
  1825. PALIGNR mm2, mm0, 7, mm0
  1826. PALIGNR mm1, mm4, 1, mm4
  1827. test r1, r1
  1828. jz .fix_lt_2
  1829. test r2, r2
  1830. jz .fix_tr_1
  1831. .do_top:
  1832. PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
  1833. lea r1, [r0+r3*2]
  1834. movq2dq xmm4, mm6
  1835. pslldq xmm4, 8
  1836. por xmm0, xmm4
  1837. movdqa xmm6, [pw_ff00]
  1838. movdqa xmm1, xmm0
  1839. lea r2, [r1+r3*2]
  1840. movdqa xmm2, xmm0
  1841. movdqa xmm3, xmm0
  1842. pslldq xmm0, 1
  1843. pslldq xmm1, 2
  1844. pavgb xmm2, xmm0
  1845. INIT_XMM
  1846. PRED4x4_LOWPASS xmm4, xmm3, xmm1, xmm0, xmm5
  1847. pandn xmm6, xmm4
  1848. movdqa xmm5, xmm4
  1849. psrlw xmm4, 8
  1850. packuswb xmm6, xmm4
  1851. movhlps xmm4, xmm6
  1852. movhps [r0+r3*2], xmm5
  1853. movhps [r0+r3*1], xmm2
  1854. psrldq xmm5, 4
  1855. movss xmm5, xmm6
  1856. psrldq xmm2, 4
  1857. movss xmm2, xmm4
  1858. lea r0, [r2+r3*2]
  1859. psrldq xmm5, 1
  1860. psrldq xmm2, 1
  1861. movq [r0+r3*2], xmm5
  1862. movq [r0+r3*1], xmm2
  1863. psrldq xmm5, 1
  1864. psrldq xmm2, 1
  1865. movq [r2+r3*2], xmm5
  1866. movq [r2+r3*1], xmm2
  1867. psrldq xmm5, 1
  1868. psrldq xmm2, 1
  1869. movq [r1+r3*2], xmm5
  1870. movq [r1+r3*1], xmm2
  1871. RET
  1872. %endmacro
  1873. INIT_MMX
  1874. %define PALIGNR PALIGNR_MMX
  1875. PRED8x8L_VERTICAL_RIGHT sse2
  1876. INIT_MMX
  1877. %define PALIGNR PALIGNR_SSSE3
  1878. PRED8x8L_VERTICAL_RIGHT ssse3
  1879. ;-----------------------------------------------------------------------------
  1880. ;void pred8x8l_vertical_left(uint8_t *src, int has_topleft, int has_topright, int stride)
  1881. ;-----------------------------------------------------------------------------
  1882. %macro PRED8x8L_VERTICAL_LEFT 1
  1883. cglobal pred8x8l_vertical_left_8_%1, 4,4
  1884. sub r0, r3
  1885. movq mm0, [r0-8]
  1886. movq mm3, [r0]
  1887. movq mm1, [r0+8]
  1888. movq mm2, mm3
  1889. movq mm4, mm3
  1890. PALIGNR mm2, mm0, 7, mm0
  1891. PALIGNR mm1, mm4, 1, mm4
  1892. test r1, r1
  1893. jz .fix_lt_2
  1894. test r2, r2
  1895. jz .fix_tr_1
  1896. jmp .do_top
  1897. .fix_lt_2:
  1898. movq mm5, mm3
  1899. pxor mm5, mm2
  1900. psllq mm5, 56
  1901. psrlq mm5, 56
  1902. pxor mm2, mm5
  1903. test r2, r2
  1904. jnz .do_top
  1905. .fix_tr_1:
  1906. movq mm5, mm3
  1907. pxor mm5, mm1
  1908. psrlq mm5, 56
  1909. psllq mm5, 56
  1910. pxor mm1, mm5
  1911. jmp .do_top
  1912. .fix_tr_2:
  1913. punpckhbw mm3, mm3
  1914. pshufw mm1, mm3, 0xFF
  1915. jmp .do_topright
  1916. .do_top:
  1917. PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
  1918. movq2dq xmm4, mm4
  1919. test r2, r2
  1920. jz .fix_tr_2
  1921. movq mm0, [r0+8]
  1922. movq mm5, mm0
  1923. movq mm2, mm0
  1924. movq mm4, mm0
  1925. psrlq mm5, 56
  1926. PALIGNR mm2, mm3, 7, mm3
  1927. PALIGNR mm5, mm4, 1, mm4
  1928. PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
  1929. .do_topright:
  1930. movq2dq xmm3, mm1
  1931. lea r1, [r0+r3*2]
  1932. pslldq xmm3, 8
  1933. por xmm4, xmm3
  1934. movdqa xmm2, xmm4
  1935. movdqa xmm1, xmm4
  1936. movdqa xmm3, xmm4
  1937. psrldq xmm2, 1
  1938. pslldq xmm1, 1
  1939. pavgb xmm3, xmm2
  1940. lea r2, [r1+r3*2]
  1941. INIT_XMM
  1942. PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm4, xmm5
  1943. psrldq xmm0, 1
  1944. movq [r0+r3*1], xmm3
  1945. movq [r0+r3*2], xmm0
  1946. lea r0, [r2+r3*2]
  1947. psrldq xmm3, 1
  1948. psrldq xmm0, 1
  1949. movq [r1+r3*1], xmm3
  1950. movq [r1+r3*2], xmm0
  1951. psrldq xmm3, 1
  1952. psrldq xmm0, 1
  1953. movq [r2+r3*1], xmm3
  1954. movq [r2+r3*2], xmm0
  1955. psrldq xmm3, 1
  1956. psrldq xmm0, 1
  1957. movq [r0+r3*1], xmm3
  1958. movq [r0+r3*2], xmm0
  1959. RET
  1960. %endmacro
  1961. INIT_MMX
  1962. %define PALIGNR PALIGNR_MMX
  1963. PRED8x8L_VERTICAL_LEFT sse2
  1964. %define PALIGNR PALIGNR_SSSE3
  1965. INIT_MMX
  1966. PRED8x8L_VERTICAL_LEFT ssse3
  1967. ;-----------------------------------------------------------------------------
  1968. ; void pred8x8l_horizontal_up(uint8_t *src, int has_topleft, int has_topright, int stride)
  1969. ;-----------------------------------------------------------------------------
  1970. %macro PRED8x8L_HORIZONTAL_UP 1
  1971. cglobal pred8x8l_horizontal_up_8_%1, 4,4
  1972. sub r0, r3
  1973. lea r2, [r0+r3*2]
  1974. movq mm0, [r0+r3*1-8]
  1975. test r1, r1
  1976. lea r1, [r0+r3]
  1977. cmovnz r1, r0
  1978. punpckhbw mm0, [r1+r3*0-8]
  1979. movq mm1, [r2+r3*1-8]
  1980. punpckhbw mm1, [r0+r3*2-8]
  1981. mov r2, r0
  1982. punpckhwd mm1, mm0
  1983. lea r0, [r0+r3*4]
  1984. movq mm2, [r0+r3*1-8]
  1985. punpckhbw mm2, [r0+r3*0-8]
  1986. lea r0, [r0+r3*2]
  1987. movq mm3, [r0+r3*1-8]
  1988. punpckhbw mm3, [r0+r3*0-8]
  1989. punpckhwd mm3, mm2
  1990. punpckhdq mm3, mm1
  1991. lea r0, [r0+r3*2]
  1992. movq mm0, [r0+r3*0-8]
  1993. movq mm1, [r1+r3*0-8]
  1994. mov r0, r2
  1995. movq mm4, mm3
  1996. movq mm2, mm3
  1997. PALIGNR mm4, mm0, 7, mm0
  1998. PALIGNR mm1, mm2, 1, mm2
  1999. movq mm0, mm4
  2000. PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
  2001. movq mm4, mm0
  2002. movq mm7, mm2
  2003. PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
  2004. psllq mm1, 56
  2005. PALIGNR mm7, mm1, 7, mm3
  2006. lea r1, [r0+r3*2]
  2007. pshufw mm0, mm7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
  2008. psllq mm7, 56 ; l7 .. .. .. .. .. .. ..
  2009. movq mm2, mm0
  2010. psllw mm0, 8
  2011. psrlw mm2, 8
  2012. por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0
  2013. movq mm3, mm2
  2014. movq mm4, mm2
  2015. movq mm5, mm2
  2016. psrlq mm2, 8
  2017. psrlq mm3, 16
  2018. lea r2, [r1+r3*2]
  2019. por mm2, mm7 ; l7 l7 l6 l5 l4 l3 l2 l1
  2020. punpckhbw mm7, mm7
  2021. por mm3, mm7 ; l7 l7 l7 l6 l5 l4 l3 l2
  2022. pavgb mm4, mm2
  2023. PRED4x4_LOWPASS mm1, mm3, mm5, mm2, mm6
  2024. movq mm5, mm4
  2025. punpcklbw mm4, mm1 ; p4 p3 p2 p1
  2026. punpckhbw mm5, mm1 ; p8 p7 p6 p5
  2027. movq mm6, mm5
  2028. movq mm7, mm5
  2029. movq mm0, mm5
  2030. PALIGNR mm5, mm4, 2, mm1
  2031. pshufw mm1, mm6, 11111001b
  2032. PALIGNR mm6, mm4, 4, mm2
  2033. pshufw mm2, mm7, 11111110b
  2034. PALIGNR mm7, mm4, 6, mm3
  2035. pshufw mm3, mm0, 11111111b
  2036. movq [r0+r3*1], mm4
  2037. movq [r0+r3*2], mm5
  2038. lea r0, [r2+r3*2]
  2039. movq [r1+r3*1], mm6
  2040. movq [r1+r3*2], mm7
  2041. movq [r2+r3*1], mm0
  2042. movq [r2+r3*2], mm1
  2043. movq [r0+r3*1], mm2
  2044. movq [r0+r3*2], mm3
  2045. RET
  2046. %endmacro
  2047. INIT_MMX
  2048. %define PALIGNR PALIGNR_MMX
  2049. PRED8x8L_HORIZONTAL_UP mmxext
  2050. %define PALIGNR PALIGNR_SSSE3
  2051. PRED8x8L_HORIZONTAL_UP ssse3
  2052. ;-----------------------------------------------------------------------------
  2053. ;void pred8x8l_horizontal_down(uint8_t *src, int has_topleft, int has_topright, int stride)
  2054. ;-----------------------------------------------------------------------------
INIT_MMX
%define PALIGNR PALIGNR_MMX
cglobal pred8x8l_horizontal_down_8_mmxext, 4,5
    sub       r0, r3
    lea       r4, [r0+r3*2]
    movq      mm0, [r0+r3*1-8]
    punpckhbw mm0, [r0+r3*0-8]
    movq      mm1, [r4+r3*1-8]
    punpckhbw mm1, [r0+r3*2-8]
    mov       r4, r0
    punpckhwd mm1, mm0
    lea       r0, [r0+r3*4]
    movq      mm2, [r0+r3*1-8]
    punpckhbw mm2, [r0+r3*0-8]
    lea       r0, [r0+r3*2]
    movq      mm3, [r0+r3*1-8]
    punpckhbw mm3, [r0+r3*0-8]
    punpckhwd mm3, mm2
    punpckhdq mm3, mm1
    lea       r0, [r0+r3*2]
    movq      mm0, [r0+r3*0-8]
    movq      mm1, [r4]
    mov       r0, r4
    movq      mm4, mm3
    movq      mm2, mm3
    PALIGNR   mm4, mm0, 7, mm0
    PALIGNR   mm1, mm2, 1, mm2
    test      r1, r1
    jnz       .do_left
.fix_lt_1:
    movq      mm5, mm3
    pxor      mm5, mm4
    psrlq     mm5, 56
    psllq     mm5, 48
    pxor      mm1, mm5
    jmp       .do_left
.fix_lt_2:
    movq      mm5, mm3
    pxor      mm5, mm2
    psllq     mm5, 56
    psrlq     mm5, 56
    pxor      mm2, mm5
    test      r2, r2
    jnz       .do_top
.fix_tr_1:
    movq      mm5, mm3
    pxor      mm5, mm1
    psrlq     mm5, 56
    psllq     mm5, 56
    pxor      mm1, mm5
    jmp       .do_top
.do_left:
    movq      mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq      mm4, mm0
    movq      mm7, mm2
    movq      mm6, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq     mm1, 56
    PALIGNR   mm7, mm1, 7, mm3
    movq      mm0, [r0-8]
    movq      mm3, [r0]
    movq      mm1, [r0+8]
    movq      mm2, mm3
    movq      mm4, mm3
    PALIGNR   mm2, mm0, 7, mm0
    PALIGNR   mm1, mm4, 1, mm4
    test      r1, r1
    jz        .fix_lt_2
    test      r2, r2
    jz        .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq      mm5, mm4
    lea       r1, [r0+r3*2]
    psllq     mm7, 56
    movq      mm2, mm5
    movq      mm3, mm6
    movq      mm4, mm2
    PALIGNR   mm2, mm6, 7, mm5
    PALIGNR   mm6, mm7, 7, mm0
    lea       r2, [r1+r3*2]
    PALIGNR   mm4, mm3, 1, mm7
    movq      mm5, mm3
    pavgb     mm3, mm6
    PRED4x4_LOWPASS mm0, mm4, mm6, mm5, mm7
    movq      mm4, mm2
    movq      mm1, mm2
    lea       r4, [r2+r3*2]
    psrlq     mm4, 16
    psrlq     mm1, 8
    PRED4x4_LOWPASS mm6, mm4, mm2, mm1, mm5
    movq      mm7, mm3
    punpcklbw mm3, mm0
    punpckhbw mm7, mm0
    movq      mm1, mm7
    movq      mm0, mm7
    movq      mm4, mm7
    movq      [r4+r3*2], mm3
    PALIGNR   mm7, mm3, 2, mm5
    movq      [r4+r3*1], mm7
    PALIGNR   mm1, mm3, 4, mm5
    movq      [r2+r3*2], mm1
    PALIGNR   mm0, mm3, 6, mm3
    movq      [r2+r3*1], mm0
    movq      mm2, mm6
    movq      mm3, mm6
    movq      [r1+r3*2], mm4
    PALIGNR   mm6, mm4, 2, mm5
    movq      [r1+r3*1], mm6
    PALIGNR   mm2, mm4, 4, mm5
    movq      [r0+r3*2], mm2
    PALIGNR   mm3, mm4, 6, mm4
    movq      [r0+r3*1], mm3
    RET
%macro PRED8x8L_HORIZONTAL_DOWN 1
cglobal pred8x8l_horizontal_down_8_%1, 4,5
    sub       r0, r3
    lea       r4, [r0+r3*2]
    movq      mm0, [r0+r3*1-8]
    punpckhbw mm0, [r0+r3*0-8]
    movq      mm1, [r4+r3*1-8]
    punpckhbw mm1, [r0+r3*2-8]
    mov       r4, r0
    punpckhwd mm1, mm0
    lea       r0, [r0+r3*4]
    movq      mm2, [r0+r3*1-8]
    punpckhbw mm2, [r0+r3*0-8]
    lea       r0, [r0+r3*2]
    movq      mm3, [r0+r3*1-8]
    punpckhbw mm3, [r0+r3*0-8]
    punpckhwd mm3, mm2
    punpckhdq mm3, mm1
    lea       r0, [r0+r3*2]
    movq      mm0, [r0+r3*0-8]
    movq      mm1, [r4]
    mov       r0, r4
    movq      mm4, mm3
    movq      mm2, mm3
    PALIGNR   mm4, mm0, 7, mm0
    PALIGNR   mm1, mm2, 1, mm2
    test      r1, r1
    jnz       .do_left
.fix_lt_1:
    movq      mm5, mm3
    pxor      mm5, mm4
    psrlq     mm5, 56
    psllq     mm5, 48
    pxor      mm1, mm5
    jmp       .do_left
.fix_lt_2:
    movq      mm5, mm3
    pxor      mm5, mm2
    psllq     mm5, 56
    psrlq     mm5, 56
    pxor      mm2, mm5
    test      r2, r2
    jnz       .do_top
.fix_tr_1:
    movq      mm5, mm3
    pxor      mm5, mm1
    psrlq     mm5, 56
    psllq     mm5, 56
    pxor      mm1, mm5
    jmp       .do_top
.fix_tr_2:
    punpckhbw mm3, mm3
    pshufw    mm1, mm3, 0xFF
    jmp       .do_topright
.do_left:
    movq      mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq2dq   xmm0, mm2
    pslldq    xmm0, 8
    movq      mm4, mm0
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    movq2dq   xmm2, mm1
    pslldq    xmm2, 15
    psrldq    xmm2, 8
    por       xmm0, xmm2
    movq      mm0, [r0-8]
    movq      mm3, [r0]
    movq      mm1, [r0+8]
    movq      mm2, mm3
    movq      mm4, mm3
    PALIGNR   mm2, mm0, 7, mm0
    PALIGNR   mm1, mm4, 1, mm4
    test      r1, r1
    jz        .fix_lt_2
    test      r2, r2
    jz        .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq   xmm1, mm4
    test      r2, r2
    jz        .fix_tr_2
    movq      mm0, [r0+8]
    movq      mm5, mm0
    movq      mm2, mm0
    movq      mm4, mm0
    psrlq     mm5, 56
    PALIGNR   mm2, mm3, 7, mm3
    PALIGNR   mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
    movq2dq   xmm5, mm1
    pslldq    xmm5, 8
    por       xmm1, xmm5
INIT_XMM
    lea       r2, [r4+r3*2]
    movdqa    xmm2, xmm1
    movdqa    xmm3, xmm1
    PALIGNR   xmm1, xmm0, 7, xmm4
    PALIGNR   xmm2, xmm0, 9, xmm5
    lea       r1, [r2+r3*2]
    PALIGNR   xmm3, xmm0, 8, xmm0
    movdqa    xmm4, xmm1
    pavgb     xmm4, xmm3
    lea       r0, [r1+r3*2]
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm5
    punpcklbw xmm4, xmm0
    movhlps   xmm0, xmm4
    movq      [r0+r3*2], xmm4
    movq      [r2+r3*2], xmm0
    psrldq    xmm4, 2
    psrldq    xmm0, 2
    movq      [r0+r3*1], xmm4
    movq      [r2+r3*1], xmm0
    psrldq    xmm4, 2
    psrldq    xmm0, 2
    movq      [r1+r3*2], xmm4
    movq      [r4+r3*2], xmm0
    psrldq    xmm4, 2
    psrldq    xmm0, 2
    movq      [r1+r3*1], xmm4
    movq      [r4+r3*1], xmm0
    RET
%endmacro
INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_HORIZONTAL_DOWN sse2
INIT_MMX
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_HORIZONTAL_DOWN ssse3
;-----------------------------------------------------------------------------
; void pred4x4_dc_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
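; Reference-only C sketch (illustrative names t[], l[]): H.264 4x4 DC
; prediction fills the block with the rounded mean of the four samples
; above and the four to the left:
;
;   int dc = (t[0]+t[1]+t[2]+t[3] + l[0]+l[1]+l[2]+l[3] + 4) >> 3;
;   for (int y = 0; y < 4; y++)
;       for (int x = 0; x < 4; x++)
;           src[y*stride + x] = dc;
;
; Below, psadbw against zero sums the four top bytes in one instruction,
; and imul by 0x01010101 broadcasts the 8-bit result into a 4-byte
; pattern so each row is written with a single 32-bit store.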
cglobal pred4x4_dc_8_mmxext, 3,5
    pxor      mm7, mm7
    mov       r4, r0
    sub       r0, r2
    movd      mm0, [r0]
    psadbw    mm0, mm7
    movzx     r1d, byte [r0+r2*1-1]
    movd      r3d, mm0
    add       r3d, r1d
    movzx     r1d, byte [r0+r2*2-1]
    lea       r0, [r0+r2*2]
    add       r3d, r1d
    movzx     r1d, byte [r0+r2*1-1]
    add       r3d, r1d
    movzx     r1d, byte [r0+r2*2-1]
    add       r3d, r1d
    add       r3d, 4
    shr       r3d, 3
    imul      r3d, 0x01010101
    mov       [r4+r2*0], r3d
    mov       [r0+r2*0], r3d
    mov       [r0+r2*1], r3d
    mov       [r0+r2*2], r3d
    RET
;-----------------------------------------------------------------------------
; void pred4x4_tm_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
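; Reference-only C sketch (illustrative names t[], l[], tl): VP8's
; TrueMotion mode predicts each sample from the top row, left column and
; top-left corner, clamped to 0..255:
;
;   for (int y = 0; y < 4; y++)
;       for (int x = 0; x < 4; x++)
;           src[y*stride + x] = av_clip_uint8(l[y] + t[x] - tl);
;
; The code keeps t[x] as 16-bit words, broadcasts (l[y] - tl) with
; pshufw (or punpck* on plain MMX), adds, and lets packuswb provide the
; saturation for free.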
%macro PRED4x4_TM_MMX 0
cglobal pred4x4_tm_vp8_8, 3,6
    sub       r0, r2
    pxor      mm7, mm7
    movd      mm0, [r0]
    punpcklbw mm0, mm7
    movzx     r4d, byte [r0-1]
    mov       r5d, 2
.loop:
    movzx     r1d, byte [r0+r2*1-1]
    movzx     r3d, byte [r0+r2*2-1]
    sub       r1d, r4d
    sub       r3d, r4d
    movd      mm2, r1d
    movd      mm4, r3d
%if cpuflag(mmx2)
    pshufw    mm2, mm2, 0
    pshufw    mm4, mm4, 0
%else
    punpcklwd mm2, mm2
    punpcklwd mm4, mm4
    punpckldq mm2, mm2
    punpckldq mm4, mm4
%endif
    paddw     mm2, mm0
    paddw     mm4, mm0
    packuswb  mm2, mm2
    packuswb  mm4, mm4
    movd      [r0+r2*1], mm2
    movd      [r0+r2*2], mm4
    lea       r0, [r0+r2*2]
    dec       r5d
    jg        .loop
    REP_RET
%endmacro
INIT_MMX mmx
PRED4x4_TM_MMX
INIT_MMX mmx2
PRED4x4_TM_MMX
INIT_MMX
cglobal pred4x4_tm_vp8_8_ssse3, 3,3
    sub       r0, r2
    movq      mm6, [tm_shuf]
    pxor      mm1, mm1
    movd      mm0, [r0]
    punpcklbw mm0, mm1
    movd      mm7, [r0-4]
    pshufb    mm7, mm6
    lea       r1, [r0+r2*2]
    movd      mm2, [r0+r2*1-4]
    movd      mm3, [r0+r2*2-4]
    movd      mm4, [r1+r2*1-4]
    movd      mm5, [r1+r2*2-4]
    pshufb    mm2, mm6
    pshufb    mm3, mm6
    pshufb    mm4, mm6
    pshufb    mm5, mm6
    psubw     mm2, mm7
    psubw     mm3, mm7
    psubw     mm4, mm7
    psubw     mm5, mm7
    paddw     mm2, mm0
    paddw     mm3, mm0
    paddw     mm4, mm0
    paddw     mm5, mm0
    packuswb  mm2, mm2
    packuswb  mm3, mm3
    packuswb  mm4, mm4
    packuswb  mm5, mm5
    movd      [r0+r2*1], mm2
    movd      [r0+r2*2], mm3
    movd      [r1+r2*1], mm4
    movd      [r1+r2*2], mm5
    RET
;-----------------------------------------------------------------------------
; void pred4x4_vertical_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
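; Reference-only C sketch (illustrative name t[], with t[-1] the
; top-left sample and t[4] taken from the topright pointer): VP8's 4x4
; vertical mode writes one smoothed copy of the row above into all four
; rows:
;
;   for (int x = 0; x < 4; x++)
;       row[x] = (t[x-1] + 2*t[x] + t[x+1] + 2) >> 2;
;   // all four rows receive row[0..3]
;
; PRED4x4_LOWPASS computes exactly that (x + 2*y + z + 2) >> 2 filter of
; its three source operands, branchlessly via pavgb plus a carry
; correction, with the last source operand as the weight-2 centre tap
; (here m2, the unshifted top row).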
INIT_MMX
cglobal pred4x4_vertical_vp8_8_mmxext, 3,3
    sub       r0, r2
    movd      m1, [r0-1]
    movd      m0, [r0]
    mova      m2, m0   ; t0 t1 t2 t3
    punpckldq m0, [r1] ; t0 t1 t2 t3 t4 t5 t6 t7
    lea       r1, [r0+r2*2]
    psrlq     m0, 8    ; t1 t2 t3 t4
    PRED4x4_LOWPASS m3, m1, m0, m2, m4
    movd      [r0+r2*1], m3
    movd      [r0+r2*2], m3
    movd      [r1+r2*1], m3
    movd      [r1+r2*2], m3
    RET
;-----------------------------------------------------------------------------
; void pred4x4_down_left_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
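; Reference-only C sketch (illustrative name t[], indices 0..7 spanning
; the top row and the topright pointer): diagonal down-left slides the
; 3-tap filter along the anti-diagonal, reusing t[7] past the end:
;
;   for (int y = 0; y < 4; y++)
;       for (int x = 0; x < 4; x++) {
;           int i = x + y;
;           src[y*stride + x] = (i == 6)
;               ? (t[6] + 3*t[7] + 2) >> 2
;               : (t[i] + 2*t[i+1] + t[i+2] + 2) >> 2;
;       }
;
; The pxor/psrlq/pxor sequence below builds the "t[i+2], last element
; clamped to t[7]" vector without any scalar loads.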
INIT_MMX
cglobal pred4x4_down_left_8_mmxext, 3,3
    sub       r0, r2
    movq      m1, [r0]
    punpckldq m1, [r1]
    movq      m2, m1
    movq      m3, m1
    psllq     m1, 8
    pxor      m2, m1
    psrlq     m2, 8
    pxor      m2, m3
    PRED4x4_LOWPASS m0, m1, m2, m3, m4
    lea       r1, [r0+r2*2]
    psrlq     m0, 8
    movd      [r0+r2*1], m0
    psrlq     m0, 8
    movd      [r0+r2*2], m0
    psrlq     m0, 8
    movd      [r1+r2*1], m0
    psrlq     m0, 8
    movd      [r1+r2*2], m0
    RET
;-----------------------------------------------------------------------------
; void pred4x4_vertical_left_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
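; Reference-only C sketch (illustrative name t[], indices 0..7 from the
; top row and topright pointer): vertical-left alternates 2-tap averages
; and 3-tap filters, shifting right by one sample every two rows:
;
;   row0[x] = (t[x]   + t[x+1] + 1) >> 1;              // pavgb
;   row1[x] = (t[x]   + 2*t[x+1] + t[x+2] + 2) >> 2;   // lowpass
;   row2[x] = (t[x+1] + t[x+2] + 1) >> 1;
;   row3[x] = (t[x+1] + 2*t[x+2] + t[x+3] + 2) >> 2;
;
; which is why the code computes one average and one lowpass vector and
; stores each of them twice, shifted one byte the second time.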
INIT_MMX
cglobal pred4x4_vertical_left_8_mmxext, 3,3
    sub       r0, r2
    movq      m1, [r0]
    punpckldq m1, [r1]
    movq      m3, m1
    movq      m2, m1
    psrlq     m3, 8
    psrlq     m2, 16
    movq      m4, m3
    pavgb     m4, m1
    PRED4x4_LOWPASS m0, m1, m2, m3, m5
    lea       r1, [r0+r2*2]
    movh      [r0+r2*1], m4
    movh      [r0+r2*2], m0
    psrlq     m4, 8
    psrlq     m0, 8
    movh      [r1+r2*1], m4
    movh      [r1+r2*2], m0
    RET
;-----------------------------------------------------------------------------
; void pred4x4_horizontal_up_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
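; Reference-only C sketch (illustrative name l[] for the left column,
; padded with l[3] beyond index 3 exactly as the pshufw splat below
; does): horizontal-up walks down the left edge in half-sample steps,
; alternating 2-tap averages with 3-tap filters:
;
;   int z = x + 2*y;                    // zHU in the spec
;   src[y*stride + x] = (z & 1)
;       ? (l[z>>1] + 2*l[(z>>1)+1] + l[(z>>1)+2] + 2) >> 2
;       : (l[z>>1] + l[(z>>1)+1] + 1) >> 1;
;
; With that padding, z >= 6 degenerates to plain l[3], which is why the
; last row is stored straight from the splatted register (m1).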
INIT_MMX
cglobal pred4x4_horizontal_up_8_mmxext, 3,3
    sub       r0, r2
    lea       r1, [r0+r2*2]
    movd      m0, [r0+r2*1-4]
    punpcklbw m0, [r0+r2*2-4]
    movd      m1, [r1+r2*1-4]
    punpcklbw m1, [r1+r2*2-4]
    punpckhwd m0, m1
    movq      m1, m0
    punpckhbw m1, m1
    pshufw    m1, m1, 0xFF
    punpckhdq m0, m1
    movq      m2, m0
    movq      m3, m0
    movq      m7, m0
    psrlq     m2, 16
    psrlq     m3, 8
    pavgb     m7, m3
    PRED4x4_LOWPASS m4, m0, m2, m3, m5
    punpcklbw m7, m4
    movd      [r0+r2*1], m7
    psrlq     m7, 16
    movd      [r0+r2*2], m7
    psrlq     m7, 16
    movd      [r1+r2*1], m7
    movd      [r1+r2*2], m1
    RET
;-----------------------------------------------------------------------------
; void pred4x4_horizontal_down_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
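; Reference-only C sketch: reading the packed edge the comments below
; build from its low byte up, v = (l3, l2, l1, l0, lt, t0, t1, t2)
; (illustrative name). Horizontal-down interleaves 2-tap averages of
; adjacent edge samples with 3-tap filter outputs, stepping one (a,f)
; pair per row:
;
;   a[i] = (v[i] + v[i+1] + 1) >> 1;            // pavgb
;   f[i] = (v[i] + 2*v[i+1] + v[i+2] + 2) >> 2; // lowpass
;   row3 = a0 f0 a1 f1    row2 = a1 f1 a2 f2
;   row1 = a2 f2 a3 f3    row0 = a3 f3 f4 f5
;
; The PALIGNR at the end splices the two extra top-edge filter taps
; (f4, f5) into row 0.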
INIT_MMX
%define PALIGNR PALIGNR_MMX
cglobal pred4x4_horizontal_down_8_mmxext, 3,3
    sub       r0, r2
    lea       r1, [r0+r2*2]
    movh      m0, [r0-4]      ; lt ..
    punpckldq m0, [r0]        ; t3 t2 t1 t0 lt .. .. ..
    psllq     m0, 8           ; t2 t1 t0 lt .. .. .. ..
    movd      m1, [r1+r2*2-4] ; l3
    punpcklbw m1, [r1+r2*1-4] ; l2 l3
    movd      m2, [r0+r2*2-4] ; l1
    punpcklbw m2, [r0+r2*1-4] ; l0 l1
    punpckhwd m1, m2          ; l0 l1 l2 l3
    punpckhdq m1, m0          ; t2 t1 t0 lt l0 l1 l2 l3
    movq      m0, m1
    movq      m2, m1
    movq      m5, m1
    psrlq     m0, 16          ; .. .. t2 t1 t0 lt l0 l1
    psrlq     m2, 8           ; .. t2 t1 t0 lt l0 l1 l2
    pavgb     m5, m2
    PRED4x4_LOWPASS m3, m1, m0, m2, m4
    punpcklbw m5, m3
    psrlq     m3, 32
    PALIGNR   m3, m5, 6, m4
    movh      [r1+r2*2], m5
    psrlq     m5, 16
    movh      [r1+r2*1], m5
    psrlq     m5, 16
    movh      [r0+r2*2], m5
    movh      [r0+r2*1], m3
    RET
;-----------------------------------------------------------------------------
; void pred4x4_vertical_right_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
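; Reference-only C sketch (illustrative names; w[] is the packed edge
; read from its low byte up, w = l2 l1 l0 lt t0 t1 t2 t3):
;
;   a[x] = (t[x-1] + t[x] + 1) >> 1;              // t[-1] = lt, pavgb
;   f[i] = (w[i] + 2*w[i+1] + w[i+2] + 2) >> 2;   // 3-tap along w
;   row0 = a0 a1 a2 a3    row1 = f2 f3 f4 f5
;   row2 = f1 a0 a1 a2    row3 = f0 f2 f3 f4
;
; so the two vectors are computed once and the lower rows reuse them,
; shifted one byte right and topped up with left-edge filter taps via
; the psllq/PALIGNR pairs below.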
INIT_MMX
%define PALIGNR PALIGNR_MMX
cglobal pred4x4_vertical_right_8_mmxext, 3,3
    sub       r0, r2
    lea       r1, [r0+r2*2]
    movh      m0, [r0]               ; ........t3t2t1t0
    movq      m5, m0
    PALIGNR   m0, [r0-8], 7, m1      ; ......t3t2t1t0lt
    pavgb     m5, m0
    PALIGNR   m0, [r0+r2*1-8], 7, m1 ; ....t3t2t1t0ltl0
    movq      m1, m0
    PALIGNR   m0, [r0+r2*2-8], 7, m2 ; ..t3t2t1t0ltl0l1
    movq      m2, m0
    PALIGNR   m0, [r1+r2*1-8], 7, m3 ; t3t2t1t0ltl0l1l2
    PRED4x4_LOWPASS m3, m1, m0, m2, m4
    movq      m1, m3
    psrlq     m3, 16
    psllq     m1, 48
    movh      [r0+r2*1], m5
    movh      [r0+r2*2], m3
    PALIGNR   m5, m1, 7, m2
    psllq     m1, 8
    movh      [r1+r2*1], m5
    PALIGNR   m3, m1, 7, m1
    movh      [r1+r2*2], m3
    RET
;-----------------------------------------------------------------------------
; void pred4x4_down_right_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
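; Reference-only C sketch: pack the edge into the diagonal
; d = (l3 l2 l1 l0 lt t0 t1 t2 t3) (illustrative name); every
; down-right sample is then one 3-tap filter output along d, and each
; row up shifts the window by one:
;
;   f[i] = (d[i] + 2*d[i+1] + d[i+2] + 2) >> 2;
;   src[y*stride + x] = f[3 + x - y];
;
; The asm computes all the f[i] at once with PRED4x4_LOWPASS and peels
; off one row per psrlq 8.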
INIT_MMX
%define PALIGNR PALIGNR_MMX
cglobal pred4x4_down_right_8_mmxext, 3,3
    sub       r0, r2
    lea       r1, [r0+r2*2]
    movq      m1, [r1-8]
    movq      m2, [r0+r2*1-8]
    punpckhbw m2, [r0-8]
    movh      m3, [r0]
    punpckhwd m1, m2
    PALIGNR   m3, m1, 5, m1
    movq      m1, m3
    PALIGNR   m3, [r1+r2*1-8], 7, m4
    movq      m2, m3
    PALIGNR   m3, [r1+r2*2-8], 7, m4
    PRED4x4_LOWPASS m0, m3, m1, m2, m4
    movh      [r1+r2*2], m0
    psrlq     m0, 8
    movh      [r1+r2*1], m0
    psrlq     m0, 8
    movh      [r0+r2*2], m0
    psrlq     m0, 8
    movh      [r0+r2*1], m0
    RET