;******************************************************************************
;* H.264 intra prediction asm optimizations
;* Copyright (c) 2010 Jason Garrett-Glaser
;* Copyright (c) 2010 Holger Lubitz
;* Copyright (c) 2010 Loren Merritt
;* Copyright (c) 2010 Ronald S. Bultje
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; pshufb mask: broadcast byte 3 of each dword, 0x80 zeroes the odd bytes so
; the result is already widened to words (used by the ssse3 TM predictor)
tm_shuf:     times 8 db 0x03, 0x80
pw_ff00:     times 8 dw 0xff00
; pmaddubsw multipliers for the 16x16 plane H coefficient: -8..-1 then 1..8
plane_shuf:  db -8, -7, -6, -5, -4, -3, -2, -1
             db  1,  2,  3,  4,  5,  6,  7,  8
; same idea for the 8x8 plane: -4..-1 / 1..4, upper halves unused (zero)
plane8_shuf: db -4, -3, -2, -1,  0,  0,  0,  0
             db  1,  2,  3,  4,  0,  0,  0,  0
pw_0to7:     dw 0, 1, 2, 3, 4, 5, 6, 7
pw_1to8:     dw 1, 2, 3, 4, 5, 6, 7, 8
pw_m8tom1:   dw -8, -7, -6, -5, -4, -3, -2, -1
pw_m4to4:    dw -4, -3, -2, -1,  1,  2,  3,  4

SECTION .text

; shared constants provided by libavutil/x86
cextern pb_1
cextern pb_3
cextern pw_4
cextern pw_5
cextern pw_8
cextern pw_16
cextern pw_17
cextern pw_32
;-----------------------------------------------------------------------------
; void pred16x16_vertical_8(uint8_t *src, int stride)
; Copy the row above the block into all 16 rows.
;-----------------------------------------------------------------------------
INIT_MMX mmx
cglobal pred16x16_vertical_8, 2,3
    sub          r0, r1             ; r0 = row above the block
    mov          r2, 8              ; 8 iterations x 2 rows = 16 rows
    movq        mm0, [r0+0]         ; left 8 pixels of the top row
    movq        mm1, [r0+8]         ; right 8 pixels
.loop:
    movq [r0+r1*1+0], mm0
    movq [r0+r1*1+8], mm1
    movq [r0+r1*2+0], mm0
    movq [r0+r1*2+8], mm1
    lea          r0, [r0+r1*2]
    dec          r2
    jg .loop
    REP_RET
; Same as above, but one 16-byte store per row (rows are 16-byte aligned).
INIT_XMM sse
cglobal pred16x16_vertical_8, 2,3
    sub          r0, r1             ; r0 = row above the block
    mov          r2, 4              ; 4 iterations x 4 rows = 16 rows
    movaps     xmm0, [r0]           ; whole top row
.loop:
    movaps [r0+r1*1], xmm0
    movaps [r0+r1*2], xmm0
    lea          r0, [r0+r1*2]
    movaps [r0+r1*1], xmm0
    movaps [r0+r1*2], xmm0
    lea          r0, [r0+r1*2]
    dec          r2
    jg .loop
    REP_RET
;-----------------------------------------------------------------------------
; void pred16x16_horizontal_8(uint8_t *src, int stride)
; Fill each row with that row's left neighbour pixel (src[-1]).
;-----------------------------------------------------------------------------
%macro PRED16x16_H 0
cglobal pred16x16_horizontal_8, 2,3
    mov          r2, 8              ; 8 iterations x 2 rows
%if cpuflag(ssse3)
    mova         m2, [pb_3]         ; pshufb mask: broadcast byte 3 (= src[-1])
%endif
.loop:
    movd         m0, [r0+r1*0-4]    ; 4 bytes ending at this row's left pixel
    movd         m1, [r0+r1*1-4]
%if cpuflag(ssse3)
    pshufb       m0, m2             ; splat the left pixel across the register
    pshufb       m1, m2
%else
    punpcklbw    m0, m0             ; duplicate bytes to words ...
    punpcklbw    m1, m1
    SPLATW       m0, m0, 3          ; ... then broadcast word 3 (the left pixel)
    SPLATW       m1, m1, 3
    mova [r0+r1*0+8], m0            ; mmsize==8 here: also fill the right half
    mova [r0+r1*1+8], m1
%endif
    mova  [r0+r1*0], m0
    mova  [r0+r1*1], m1
    lea          r0, [r0+r1*2]
    dec          r2
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmx
PRED16x16_H
INIT_MMX mmxext
PRED16x16_H
INIT_XMM ssse3
PRED16x16_H
;-----------------------------------------------------------------------------
; void pred16x16_dc_8(uint8_t *src, int stride)
; Fill the block with dc = (sum(top row) + sum(left column) + 16) >> 5.
;-----------------------------------------------------------------------------
%macro PRED16x16_DC 0
cglobal pred16x16_dc_8, 2,7
    mov          r4, r0             ; keep the block origin for the store loop
    sub          r0, r1             ; r0 = row above the block
    pxor        mm0, mm0
    pxor        mm1, mm1
    psadbw      mm0, [r0+0]         ; sum of the 16 top pixels, two 8-byte halves
    psadbw      mm1, [r0+8]
    dec          r0                 ; r0 = &src[-stride-1] -> left column below
    movzx       r5d, byte [r0+r1*1] ; first left pixel
    paddw       mm0, mm1
    movd        r6d, mm0            ; r6d = top-row sum
    lea          r0, [r0+r1*2]
%rep 7                              ; remaining 14 left pixels, two per step,
                                    ; accumulated in two registers (r5d/r6d)
    movzx       r2d, byte [r0+r1*0]
    movzx       r3d, byte [r0+r1*1]
    add         r5d, r2d
    add         r6d, r3d
    lea          r0, [r0+r1*2]
%endrep
    movzx       r2d, byte [r0+r1*0] ; 16th left pixel
    add         r5d, r6d
    lea         r2d, [r2+r5+16]     ; total + rounding bias
    shr         r2d, 5              ; dc value
%if cpuflag(ssse3)
    pxor         m1, m1             ; zero register consumed by SPLATB_REG/pshufb
%endif
    SPLATB_REG   m0, r2, m1         ; broadcast dc byte to the whole register
%if mmsize==8
    mov         r3d, 8              ; 8 iterations x 2 rows, two stores per row
.loop:
    mova [r4+r1*0+0], m0
    mova [r4+r1*0+8], m0
    mova [r4+r1*1+0], m0
    mova [r4+r1*1+8], m0
%else
    mov         r3d, 4              ; 4 iterations x 4 rows, one store per row
.loop:
    mova  [r4+r1*0], m0
    mova  [r4+r1*1], m0
    lea          r4, [r4+r1*2]
    mova  [r4+r1*0], m0
    mova  [r4+r1*1], m0
%endif
    lea          r4, [r4+r1*2]
    dec         r3d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_DC
INIT_XMM sse2
PRED16x16_DC
INIT_XMM ssse3
PRED16x16_DC
;-----------------------------------------------------------------------------
; void pred16x16_tm_vp8_8(uint8_t *src, int stride)
; VP8 TrueMotion: pred[y][x] = clip(top[x] + left[y] - topleft).
;-----------------------------------------------------------------------------
%macro PRED16x16_TM 0
cglobal pred16x16_tm_vp8_8, 2,5
    sub          r0, r1             ; r0 = row above the block
    pxor        mm7, mm7
    movq        mm0, [r0+0]         ; top row widened to words:
    movq        mm2, [r0+8]         ; mm0/mm1 = cols 0..7, mm2/mm3 = cols 8..15
    movq        mm1, mm0
    movq        mm3, mm2
    punpcklbw   mm0, mm7
    punpckhbw   mm1, mm7
    punpcklbw   mm2, mm7
    punpckhbw   mm3, mm7
    movzx       r3d, byte [r0-1]    ; topleft pixel
    mov         r4d, 16             ; 16 rows
.loop:
    movzx       r2d, byte [r0+r1-1] ; this row's left pixel
    sub         r2d, r3d            ; delta = left - topleft (may be negative)
    movd        mm4, r2d
    SPLATW      mm4, mm4, 0         ; broadcast delta to all 4 words
    movq        mm5, mm4
    movq        mm6, mm4
    movq        mm7, mm4            ; mm7 reused as scratch; zeros not needed again
    paddw       mm4, mm0            ; top[x] + delta for all 16 columns
    paddw       mm5, mm1
    paddw       mm6, mm2
    paddw       mm7, mm3
    packuswb    mm4, mm5            ; unsigned saturation = clip to 0..255
    packuswb    mm6, mm7
    movq [r0+r1+0], mm4
    movq [r0+r1+8], mm6
    add          r0, r1
    dec         r4d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmx
PRED16x16_TM
INIT_MMX mmxext
PRED16x16_TM
; sse2 version of pred16x16_tm_vp8_8: whole row in one register pair,
; two rows per loop iteration.
INIT_XMM sse2
cglobal pred16x16_tm_vp8_8, 2,6,6
    sub          r0, r1             ; r0 = row above the block
    pxor       xmm2, xmm2
    movdqa     xmm0, [r0]           ; top row widened to words in xmm0/xmm1
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm2
    punpckhbw  xmm1, xmm2
    movzx       r4d, byte [r0-1]    ; topleft pixel
    mov         r5d, 8              ; 8 iterations x 2 rows
.loop:
    movzx       r2d, byte [r0+r1*1-1]
    movzx       r3d, byte [r0+r1*2-1]
    sub         r2d, r4d            ; delta = left - topleft, per row
    sub         r3d, r4d
    movd       xmm2, r2d
    movd       xmm4, r3d
    pshuflw    xmm2, xmm2, 0        ; broadcast delta to all 8 words
    pshuflw    xmm4, xmm4, 0
    punpcklqdq xmm2, xmm2
    punpcklqdq xmm4, xmm4
    movdqa     xmm3, xmm2
    movdqa     xmm5, xmm4
    paddw      xmm2, xmm0           ; top[x] + delta
    paddw      xmm3, xmm1
    paddw      xmm4, xmm0
    paddw      xmm5, xmm1
    packuswb   xmm2, xmm3           ; clip to 0..255
    packuswb   xmm4, xmm5
    movdqa [r0+r1*1], xmm2
    movdqa [r0+r1*2], xmm4
    lea          r0, [r0+r1*2]
    dec         r5d
    jg .loop
    REP_RET
;-----------------------------------------------------------------------------
; void pred16x16_plane_*_8(uint8_t *src, int stride)
;
; Plane prediction: fit a linear gradient a + x*H + y*V through the top row
; and left column, then write the clipped plane.  The macro parameter
; (h264/rv40/svq3) only selects the rounding applied to the H/V slopes.
;-----------------------------------------------------------------------------
%macro H264_PRED16x16_PLANE 1
cglobal pred16x16_plane_%1_8, 2,9,7
    mov          r2, r1             ; +stride
    neg          r1                 ; -stride

    ; H = sum over i=1..8 of i*(top[7+i] - top[7-i]), computed with SIMD
    movh         m0, [r0+r1  -1]
%if mmsize == 8
    pxor         m4, m4
    movh         m1, [r0+r1  +3 ]
    movh         m2, [r0+r1  +8 ]
    movh         m3, [r0+r1  +12]
    punpcklbw    m0, m4
    punpcklbw    m1, m4
    punpcklbw    m2, m4
    punpcklbw    m3, m4
    pmullw       m0, [pw_m8tom1  ]  ; weight top pixels by signed distance
    pmullw       m1, [pw_m8tom1+8]
    pmullw       m2, [pw_1to8    ]
    pmullw       m3, [pw_1to8  +8]
    paddw        m0, m2
    paddw        m1, m3
%else ; mmsize == 16
%if cpuflag(ssse3)
    movhps       m0, [r0+r1  +8]
    pmaddubsw    m0, [plane_shuf]   ; H coefficients
%else ; sse2
    pxor         m2, m2
    movh         m1, [r0+r1  +8]
    punpcklbw    m0, m2
    punpcklbw    m1, m2
    pmullw       m0, [pw_m8tom1]
    pmullw       m1, [pw_1to8]
    paddw        m0, m1
%endif
    movhlps      m1, m0
%endif
    paddw        m0, m1             ; horizontal add of the weighted terms ...
%if cpuflag(mmxext)
    PSHUFLW      m1, m0, 0xE
%elif cpuflag(mmx)
    mova         m1, m0
    psrlq        m1, 32
%endif
    paddw        m0, m1
%if cpuflag(mmxext)
    PSHUFLW      m1, m0, 0x1
%elif cpuflag(mmx)
    mova         m1, m0
    psrlq        m1, 16
%endif
    paddw        m0, m1             ; sum of H coefficients

    ; V = analogous sum over the left column, accumulated in r5 with
    ; scalar loads; r3/r4 point into the left column, r1/r2 = -/+stride
    lea          r4, [r0+r2*8-1]
    lea          r3, [r0+r2*4-1]
    add          r4, r2
%if ARCH_X86_64
%define e_reg r8
%else
%define e_reg r0                    ; x86-32: sacrifice r0, reloaded from r0m below
%endif
    movzx     e_reg, byte [r3+r2*2   ]
    movzx        r5, byte [r4+r1     ]
    sub          r5, e_reg
    movzx     e_reg, byte [r3+r2     ]
    movzx        r6, byte [r4        ]
    sub          r6, e_reg
    lea          r5, [r5+r6*2]
    movzx     e_reg, byte [r3+r1     ]
    movzx        r6, byte [r4+r2*2   ]
    sub          r6, e_reg
    lea          r5, [r5+r6*4]
    movzx     e_reg, byte [r3        ]
%if ARCH_X86_64
    movzx        r7, byte [r4+r2     ]
    sub          r7, e_reg
%else
    movzx        r6, byte [r4+r2     ]
    sub          r6, e_reg
    lea          r5, [r5+r6*4]
    sub          r5, r6
%endif
    lea       e_reg, [r3+r1*4]
    lea          r3, [r4+r2*4]
    movzx        r4, byte [e_reg+r2  ]
    movzx        r6, byte [r3        ]
    sub          r6, r4
%if ARCH_X86_64
    lea          r6, [r7+r6*2]
    lea          r5, [r5+r6*2]
    add          r5, r6
%else
    lea          r5, [r5+r6*4]
    lea          r5, [r5+r6*2]
%endif
    movzx        r4, byte [e_reg     ]
%if ARCH_X86_64
    movzx        r7, byte [r3   +r2  ]
    sub          r7, r4
    sub          r5, r7
%else
    movzx        r6, byte [r3   +r2  ]
    sub          r6, r4
    lea          r5, [r5+r6*8]
    sub          r5, r6
%endif
    movzx        r4, byte [e_reg+r1  ]
    movzx        r6, byte [r3   +r2*2]
    sub          r6, r4
%if ARCH_X86_64
    add          r6, r7
%endif
    lea          r5, [r5+r6*8]
    movzx        r4, byte [e_reg+r2*2]
    movzx        r6, byte [r3   +r1  ]
    sub          r6, r4
    lea          r5, [r5+r6*4]
    add          r5, r6             ; sum of V coefficients
%if ARCH_X86_64 == 0
    mov          r0, r0m            ; restore src pointer clobbered via e_reg
%endif

    ; scale the V slope; rounding differs per codec flavour
%ifidn %1, h264
    lea          r5, [r5*5+32]
    sar          r5, 6
%elifidn %1, rv40
    lea          r5, [r5*5]
    sar          r5, 6
%elifidn %1, svq3
    test         r5, r5             ; svq3 truncates toward zero in two steps,
    lea          r6, [r5+3]         ; hence the cmovs-based bias for negatives
    cmovs        r5, r6
    sar          r5, 2              ; V/4
    lea          r5, [r5*5]         ; 5*(V/4)
    test         r5, r5
    lea          r6, [r5+15]
    cmovs        r5, r6
    sar          r5, 4              ; (5*(V/4))/16
%endif

    ; r3 = 16 * (top[15] + left[15] + 1), the plane's DC anchor
    movzx        r4, byte [r0+r1  +15]
    movzx        r3, byte [r3+r2*2   ]
    lea          r3, [r3+r4+1]
    shl          r3, 4

    ; scale the H slope the same way (scalar; the sum lives in m0's low word)
    movd        r1d, m0
    movsx       r1d, r1w
%ifnidn %1, svq3
%ifidn %1, h264
    lea         r1d, [r1d*5+32]
%else ; rv40
    lea         r1d, [r1d*5]
%endif
    sar         r1d, 6
%else ; svq3
    test        r1d, r1d
    lea         r4d, [r1d+3]
    cmovs       r1d, r4d
    sar         r1d, 2              ; H/4
    lea         r1d, [r1d*5]        ; 5*(H/4)
    test        r1d, r1d
    lea         r4d, [r1d+15]
    cmovs       r1d, r4d
    sar         r1d, 4              ; (5*(H/4))/16
%endif
    movd         m0, r1d

    ; a = anchor - 7*(H+V): value of the top-left plane sample (scaled by 32)
    add         r1d, r5d
    add         r3d, r1d
    shl         r1d, 3
    sub         r3d, r1d            ; a

    movd         m1, r5d
    movd         m3, r3d
    SPLATW       m0, m0, 0          ; H
    SPLATW       m1, m1, 0          ; V
    SPLATW       m3, m3, 0          ; a
%ifidn %1, svq3
    SWAP          0, 1              ; svq3's plane is transposed: swap H and V
%endif
    mova         m2, m0
%if mmsize == 8
    mova         m5, m0
%endif
    pmullw       m0, [pw_0to7]      ; 0*H, 1*H, ..., 7*H (words)
%if mmsize == 16
    psllw        m2, 3              ; 8*H: offset for the right half of the row
%else
    psllw        m5, 3              ; mmx covers the row in four 4-wide pieces:
    psllw        m2, 2              ; offsets 4*H, 8*H and 12*H
    mova         m6, m5
    paddw        m6, m2
%endif
    paddw        m0, m3             ; a + {0,1,2,3,4,5,6,7}*H
    paddw        m2, m0             ; a + {8,9,10,11,12,13,14,15}*H
%if mmsize == 8
    paddw        m5, m0             ; a + {8,9,10,11}*H
    paddw        m6, m0             ; a + {12,13,14,15}*H
%endif

    mov          r4, 8              ; 8 iterations x 2 rows; b += V per row
.loop:
    mova         m3, m0             ; b[0..7]
    mova         m4, m2             ; b[8..15]
    psraw        m3, 5              ; >>5 removes the fixed-point scaling
    psraw        m4, 5
    packuswb     m3, m4             ; clip to 0..255
    mova       [r0], m3
%if mmsize == 8
    mova         m3, m5             ; b[8..11]
    mova         m4, m6             ; b[12..15]
    psraw        m3, 5
    psraw        m4, 5
    packuswb     m3, m4
    mova     [r0+8], m3
%endif
    paddw        m0, m1
    paddw        m2, m1
%if mmsize == 8
    paddw        m5, m1
    paddw        m6, m1
%endif
    mova         m3, m0             ; b[0..7]
    mova         m4, m2             ; b[8..15]
    psraw        m3, 5
    psraw        m4, 5
    packuswb     m3, m4
    mova    [r0+r2], m3
%if mmsize == 8
    mova         m3, m5             ; b[8..11]
    mova         m4, m6             ; b[12..15]
    psraw        m3, 5
    psraw        m4, 5
    packuswb     m3, m4
    mova  [r0+r2+8], m3
%endif
    paddw        m0, m1
    paddw        m2, m1
%if mmsize == 8
    paddw        m5, m1
    paddw        m6, m1
%endif
    lea          r0, [r0+r2*2]
    dec          r4
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmx
H264_PRED16x16_PLANE h264
H264_PRED16x16_PLANE rv40
H264_PRED16x16_PLANE svq3
INIT_MMX mmxext
H264_PRED16x16_PLANE h264
H264_PRED16x16_PLANE rv40
H264_PRED16x16_PLANE svq3
INIT_XMM sse2
H264_PRED16x16_PLANE h264
H264_PRED16x16_PLANE rv40
H264_PRED16x16_PLANE svq3
INIT_XMM ssse3
H264_PRED16x16_PLANE h264
H264_PRED16x16_PLANE rv40
H264_PRED16x16_PLANE svq3
;-----------------------------------------------------------------------------
; void pred8x8_plane_8(uint8_t *src, int stride)
; 8x8 chroma plane prediction: same linear-gradient scheme as the 16x16
; version, with 4-tap H/V sums and H.264's 17/32 scaling.
;-----------------------------------------------------------------------------
%macro H264_PRED8x8_PLANE 0
cglobal pred8x8_plane_8, 2,9,7
    mov          r2, r1             ; +stride
    neg          r1                 ; -stride

    ; H = sum over i=1..4 of i*(top[3+i] - top[3-i])
    movd         m0, [r0+r1  -1]
%if mmsize == 8
    pxor         m2, m2
    movh         m1, [r0+r1  +4 ]
    punpcklbw    m0, m2
    punpcklbw    m1, m2
    pmullw       m0, [pw_m4to4]
    pmullw       m1, [pw_m4to4+8]
%else ; mmsize == 16
%if cpuflag(ssse3)
    movhps       m0, [r0+r1  +4]    ; this reads 4 bytes more than necessary
    pmaddubsw    m0, [plane8_shuf]  ; H coefficients
%else ; sse2
    pxor         m2, m2
    movd         m1, [r0+r1  +4]
    punpckldq    m0, m1
    punpcklbw    m0, m2
    pmullw       m0, [pw_m4to4]
%endif
    movhlps      m1, m0
%endif
    paddw        m0, m1
%if notcpuflag(ssse3)               ; ssse3's pmaddubsw already paired terms
%if cpuflag(mmxext)
    PSHUFLW      m1, m0, 0xE
%elif cpuflag(mmx)
    mova         m1, m0
    psrlq        m1, 32
%endif
    paddw        m0, m1
%endif ; !ssse3
%if cpuflag(mmxext)
    PSHUFLW      m1, m0, 0x1
%elif cpuflag(mmx)
    mova         m1, m0
    psrlq        m1, 16
%endif
    paddw        m0, m1             ; sum of H coefficients

    ; V = scalar sum over the left column; r3 = above row - 1, r4 = row 4 - 1
    lea          r4, [r0+r2*4-1]
    lea          r3, [r0     -1]
    add          r4, r2
%if ARCH_X86_64
%define e_reg r8
%else
%define e_reg r0                    ; x86-32: sacrifice r0, reloaded from r0m below
%endif
    movzx     e_reg, byte [r3+r2*2   ]
    movzx        r5, byte [r4+r1     ]
    sub          r5, e_reg
    movzx     e_reg, byte [r3        ]
%if ARCH_X86_64
    movzx        r7, byte [r4+r2     ]
    sub          r7, e_reg
    sub          r5, r7
%else
    movzx        r6, byte [r4+r2     ]
    sub          r6, e_reg
    lea          r5, [r5+r6*4]
    sub          r5, r6
%endif
    movzx     e_reg, byte [r3+r1     ]
    movzx        r6, byte [r4+r2*2   ]
    sub          r6, e_reg
%if ARCH_X86_64
    add          r6, r7
%endif
    lea          r5, [r5+r6*4]
    movzx     e_reg, byte [r3+r2     ]
    movzx        r6, byte [r4        ]
    sub          r6, e_reg
    lea          r6, [r5+r6*2]
    lea          r5, [r6*9+16]      ; V = (17*sum + 16) >> 5, via 9x + 8x
    lea          r5, [r5+r6*8]
    sar          r5, 5
%if ARCH_X86_64 == 0
    mov          r0, r0m            ; restore src pointer clobbered via e_reg
%endif

    ; r3 = 16 * (top[7] + left[7] + 1), the plane's DC anchor
    movzx        r3, byte [r4+r2*2   ]
    movzx        r4, byte [r0+r1  +7]
    lea          r3, [r3+r4+1]
    shl          r3, 4

    ; H = (17*sum + 16) >> 5, scalar
    movd        r1d, m0
    movsx       r1d, r1w
    imul        r1d, 17
    add         r1d, 16
    sar         r1d, 5
    movd         m0, r1d

    ; a = anchor - 3*(H+V): top-left plane sample (scaled by 32)
    add         r1d, r5d
    sub         r3d, r1d
    add         r1d, r1d
    sub         r3d, r1d            ; a

    movd         m1, r5d
    movd         m3, r3d
    SPLATW       m0, m0, 0          ; H
    SPLATW       m1, m1, 0          ; V
    SPLATW       m3, m3, 0          ; a
%if mmsize == 8
    mova         m2, m0
%endif
    pmullw       m0, [pw_0to7]      ; 0*H, 1*H, ..., 7*H (words)
    paddw        m0, m3             ; a + {0,1,2,3,4,5,6,7}*H
%if mmsize == 8
    psllw        m2, 2
    paddw        m2, m0             ; a + {4,5,6,7}*H
%endif

    mov          r4, 4              ; 4 iterations x 2 rows; b += V per row
ALIGN 16
.loop:
%if mmsize == 16
    mova         m3, m0             ; b[0..7]
    paddw        m0, m1
    psraw        m3, 5
    mova         m4, m0             ; V+b[0..7]
    paddw        m0, m1
    psraw        m4, 5
    packuswb     m3, m4             ; two rows packed into one register
    movh       [r0], m3
    movhps  [r0+r2], m3
%else ; mmsize == 8
    mova         m3, m0             ; b[0..3]
    mova         m4, m2             ; b[4..7]
    paddw        m0, m1
    paddw        m2, m1
    psraw        m3, 5
    psraw        m4, 5
    mova         m5, m0             ; V+b[0..3]
    mova         m6, m2             ; V+b[4..7]
    paddw        m0, m1
    paddw        m2, m1
    psraw        m5, 5
    psraw        m6, 5
    packuswb     m3, m4
    packuswb     m5, m6
    mova       [r0], m3
    mova    [r0+r2], m5
%endif
    lea          r0, [r0+r2*2]
    dec          r4
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmx
H264_PRED8x8_PLANE
INIT_MMX mmxext
H264_PRED8x8_PLANE
INIT_XMM sse2
H264_PRED8x8_PLANE
INIT_XMM ssse3
H264_PRED8x8_PLANE
;-----------------------------------------------------------------------------
; void pred8x8_vertical_8(uint8_t *src, int stride)
; Copy the row above the block into all 8 rows.
;-----------------------------------------------------------------------------
INIT_MMX mmx
cglobal pred8x8_vertical_8, 2,2
    sub          r0, r1             ; r0 = row above the block
    movq        mm0, [r0]           ; top row (8 pixels)
%rep 3
    movq  [r0+r1*1], mm0
    movq  [r0+r1*2], mm0
    lea          r0, [r0+r1*2]
%endrep
    movq  [r0+r1*1], mm0
    movq  [r0+r1*2], mm0
    RET
;-----------------------------------------------------------------------------
; void pred8x8_horizontal_8(uint8_t *src, int stride)
; Fill each row with that row's left neighbour pixel (src[-1]).
;-----------------------------------------------------------------------------
%macro PRED8x8_H 0
cglobal pred8x8_horizontal_8, 2,3
    mov          r2, 4              ; 4 iterations x 2 rows
%if cpuflag(ssse3)
    mova         m2, [pb_3]         ; pshufb mask: broadcast byte 3 (= src[-1])
%endif
.loop:
    SPLATB_LOAD  m0, r0+r1*0-1, m2  ; load and broadcast the left pixel
    SPLATB_LOAD  m1, r0+r1*1-1, m2
    mova  [r0+r1*0], m0
    mova  [r0+r1*1], m1
    lea          r0, [r0+r1*2]
    dec          r2
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmx
PRED8x8_H
INIT_MMX mmxext
PRED8x8_H
INIT_MMX ssse3
PRED8x8_H
;-----------------------------------------------------------------------------
; void pred8x8_top_dc_8_mmxext(uint8_t *src, int stride)
; Chroma DC with only the top row available: left 4x8 half gets
; dc0 = (sum(top[0..3]) + 2) >> 2, right half dc1 likewise from top[4..7].
;-----------------------------------------------------------------------------
INIT_MMX mmxext
cglobal pred8x8_top_dc_8, 2,5
    sub          r0, r1             ; r0 = row above the block
    movq        mm0, [r0]
    pxor        mm1, mm1
    pxor        mm2, mm2
    lea          r2, [r0+r1*2]      ; row-pair base pointers (r2/r3/r4) are
    punpckhbw   mm1, mm0            ; interleaved with the math for scheduling
    punpcklbw   mm0, mm2
    psadbw      mm1, mm2            ; s1 (bytes land in odd positions; the
    lea          r3, [r2+r1*2]      ;     interleaved zeros don't affect the sum)
    psadbw      mm0, mm2            ; s0
    psrlw       mm1, 1
    psrlw       mm0, 1
    pavgw       mm1, mm2            ; (s>>1 + 1) >> 1 == (s + 2) >> 2
    lea          r4, [r3+r1*2]
    pavgw       mm0, mm2
    pshufw      mm1, mm1, 0
    pshufw      mm0, mm0, 0         ; dc0 (w)
    packuswb    mm0, mm1            ; dc0,dc1 (b)
    movq  [r0+r1*1], mm0
    movq  [r0+r1*2], mm0
    lea          r0, [r3+r1*2]
    movq  [r2+r1*1], mm0
    movq  [r2+r1*2], mm0
    movq  [r3+r1*1], mm0
    movq  [r3+r1*2], mm0
    movq  [r0+r1*1], mm0
    movq  [r0+r1*2], mm0
    RET
;-----------------------------------------------------------------------------
; void pred8x8_dc_8_mmxext(uint8_t *src, int stride)
; Full chroma DC: four 4x4 quadrants, each filled from the relevant
; combination of top-half sums (s0, s1) and left-half sums (s2, s3).
;-----------------------------------------------------------------------------
INIT_MMX mmxext
cglobal pred8x8_dc_8, 2,5
    sub          r0, r1             ; r0 = row above the block
    pxor         m7, m7
    movd         m0, [r0+0]
    movd         m1, [r0+4]
    psadbw       m0, m7             ; s0 = sum of top[0..3]
    mov          r4, r0             ; remember origin for the store phase
    psadbw       m1, m7             ; s1 = sum of top[4..7]
    movzx       r2d, byte [r0+r1*1-1]   ; s2 = sum of upper-left pixels
    movzx       r3d, byte [r0+r1*2-1]
    lea          r0, [r0+r1*2]
    add         r2d, r3d
    movzx       r3d, byte [r0+r1*1-1]
    add         r2d, r3d
    movzx       r3d, byte [r0+r1*2-1]
    add         r2d, r3d
    lea          r0, [r0+r1*2]
    movd         m2, r2d            ; s2
    movzx       r2d, byte [r0+r1*1-1]   ; s3 = sum of lower-left pixels
    movzx       r3d, byte [r0+r1*2-1]
    lea          r0, [r0+r1*2]
    add         r2d, r3d
    movzx       r3d, byte [r0+r1*1-1]
    add         r2d, r3d
    movzx       r3d, byte [r0+r1*2-1]
    add         r2d, r3d
    movd         m3, r2d            ; s3
    punpcklwd    m0, m1
    mov          r0, r4
    punpcklwd    m2, m3
    punpckldq    m0, m2             ; s0, s1, s2, s3
    pshufw       m3, m0, 11110110b  ; s2, s1, s3, s3
    lea          r2, [r0+r1*2]
    pshufw       m0, m0, 01110100b  ; s0, s1, s3, s1
    paddw        m0, m3             ; per-quadrant sums
    lea          r3, [r2+r1*2]
    psrlw        m0, 2
    pavgw        m0, m7             ; s0+s2, s1, s3, s1+s3 -> rounded dc values
    lea          r4, [r3+r1*2]
    packuswb     m0, m0
    punpcklbw    m0, m0             ; duplicate each dc byte ...
    movq         m1, m0
    punpcklbw    m0, m0             ; m0 = top-half row pattern (dc0 x4, dc1 x4)
    punpckhbw    m1, m1             ; m1 = bottom-half row pattern (dc2, dc3)
    movq  [r0+r1*1], m0
    movq  [r0+r1*2], m0
    movq  [r2+r1*1], m0
    movq  [r2+r1*2], m0
    movq  [r3+r1*1], m1
    movq  [r3+r1*2], m1
    movq  [r4+r1*1], m1
    movq  [r4+r1*2], m1
    RET
;-----------------------------------------------------------------------------
; void pred8x8_dc_rv40_8(uint8_t *src, int stride)
; RV40-style chroma DC: a single dc = (top sum + left sum + 8) >> 4 fills
; the whole block.
;-----------------------------------------------------------------------------
INIT_MMX mmxext
cglobal pred8x8_dc_rv40_8, 2,7
    mov          r4, r0             ; keep the block origin for the store loop
    sub          r0, r1             ; r0 = row above the block
    pxor        mm0, mm0
    psadbw      mm0, [r0]           ; sum of the 8 top pixels
    dec          r0                 ; r0 = &src[-stride-1]: left column below
    movzx       r5d, byte [r0+r1*1]
    movd        r6d, mm0            ; r6d = top sum
    lea          r0, [r0+r1*2]
%rep 3                              ; remaining 6 left pixels, two per step
    movzx       r2d, byte [r0+r1*0]
    movzx       r3d, byte [r0+r1*1]
    add         r5d, r2d
    add         r6d, r3d
    lea          r0, [r0+r1*2]
%endrep
    movzx       r2d, byte [r0+r1*0] ; 8th left pixel
    add         r5d, r6d
    lea         r2d, [r2+r5+8]      ; total + rounding bias
    shr         r2d, 4              ; dc value
    movd        mm0, r2d
    punpcklbw   mm0, mm0            ; broadcast dc byte to all 8 lanes
    pshufw      mm0, mm0, 0
    mov         r3d, 4              ; 4 iterations x 2 rows
.loop:
    movq  [r4+r1*0], mm0
    movq  [r4+r1*1], mm0
    lea          r4, [r4+r1*2]
    dec         r3d
    jg .loop
    REP_RET
;-----------------------------------------------------------------------------
; void pred8x8_tm_vp8_8(uint8_t *src, int stride)
; VP8 TrueMotion 8x8: pred[y][x] = clip(top[x] + left[y] - topleft).
;-----------------------------------------------------------------------------
%macro PRED8x8_TM 0
cglobal pred8x8_tm_vp8_8, 2,6
    sub          r0, r1             ; r0 = row above the block
    pxor        mm7, mm7
    movq        mm0, [r0]           ; top row widened to words in mm0/mm1
    movq        mm1, mm0
    punpcklbw   mm0, mm7
    punpckhbw   mm1, mm7
    movzx       r4d, byte [r0-1]    ; topleft pixel
    mov         r5d, 4              ; 4 iterations x 2 rows
.loop:
    movzx       r2d, byte [r0+r1*1-1]
    movzx       r3d, byte [r0+r1*2-1]
    sub         r2d, r4d            ; delta = left - topleft, per row
    sub         r3d, r4d
    movd        mm2, r2d
    movd        mm4, r3d
    SPLATW      mm2, mm2, 0         ; broadcast deltas
    SPLATW      mm4, mm4, 0
    movq        mm3, mm2
    movq        mm5, mm4
    paddw       mm2, mm0            ; top[x] + delta
    paddw       mm3, mm1
    paddw       mm4, mm0
    paddw       mm5, mm1
    packuswb    mm2, mm3            ; clip to 0..255
    packuswb    mm4, mm5
    movq  [r0+r1*1], mm2
    movq  [r0+r1*2], mm4
    lea          r0, [r0+r1*2]
    dec         r5d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmx
PRED8x8_TM
INIT_MMX mmxext
PRED8x8_TM
; sse2 version of pred8x8_tm_vp8_8: both rows of an iteration packed into
; one xmm register (low/high halves).
INIT_XMM sse2
cglobal pred8x8_tm_vp8_8, 2,6,4
    sub          r0, r1             ; r0 = row above the block
    pxor       xmm1, xmm1
    movq       xmm0, [r0]           ; top row widened to 8 words
    punpcklbw  xmm0, xmm1
    movzx       r4d, byte [r0-1]    ; topleft pixel
    mov         r5d, 4              ; 4 iterations x 2 rows
.loop:
    movzx       r2d, byte [r0+r1*1-1]
    movzx       r3d, byte [r0+r1*2-1]
    sub         r2d, r4d            ; delta = left - topleft, per row
    sub         r3d, r4d
    movd       xmm2, r2d
    movd       xmm3, r3d
    pshuflw    xmm2, xmm2, 0        ; broadcast deltas to all 8 words
    pshuflw    xmm3, xmm3, 0
    punpcklqdq xmm2, xmm2
    punpcklqdq xmm3, xmm3
    paddw      xmm2, xmm0           ; top[x] + delta
    paddw      xmm3, xmm0
    packuswb   xmm2, xmm3           ; clip; rows end up in low/high 8 bytes
    movq  [r0+r1*1], xmm2
    movhps [r0+r1*2], xmm2
    lea          r0, [r0+r1*2]
    dec         r5d
    jg .loop
    REP_RET
; ssse3 version: tm_shuf broadcasts the left pixel and widens it to words in
; one pshufb, so the topleft subtraction can stay in SIMD.
INIT_XMM ssse3
cglobal pred8x8_tm_vp8_8, 2,3,6
    sub          r0, r1             ; r0 = row above the block
    movdqa     xmm4, [tm_shuf]      ; splat byte 3, zero high byte of each word
    pxor       xmm1, xmm1
    movq       xmm0, [r0]           ; top row widened to 8 words
    punpcklbw  xmm0, xmm1
    movd       xmm5, [r0-4]
    pshufb     xmm5, xmm4           ; xmm5 = topleft broadcast as words
    mov         r2d, 4              ; 4 iterations x 2 rows
.loop:
    movd       xmm2, [r0+r1*1-4]    ; 4 bytes ending at each row's left pixel
    movd       xmm3, [r0+r1*2-4]
    pshufb     xmm2, xmm4           ; broadcast left pixel as words
    pshufb     xmm3, xmm4
    psubw      xmm2, xmm5           ; delta = left - topleft
    psubw      xmm3, xmm5
    paddw      xmm2, xmm0           ; top[x] + delta
    paddw      xmm3, xmm0
    packuswb   xmm2, xmm3           ; clip; rows in low/high 8 bytes
    movq  [r0+r1*1], xmm2
    movhps [r0+r1*2], xmm2
    lea          r0, [r0+r1*2]
    dec         r2d
    jg .loop
    REP_RET
; dest, left, right, src, tmp
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
; 3-tap lowpass built from two pavgb: avg(src, avg(left, right)) with a
; correction term (the xor/pand/psubusb sequence) fixing the rounding of the
; inner average so the result matches the exact formula above.
%macro PRED4x4_LOWPASS 5
    mova    %5, %2
    pavgb   %2, %3                  ; (left + right + 1) >> 1
    pxor    %3, %5                  ; bits where left and right differ
    mova    %1, %4
    pand    %3, [pb_1]              ; 1 where the inner avg rounded up
    psubusb %2, %3                  ; undo that rounding
    pavgb   %1, %2                  ; final: (src + inner + 1) >> 1
%endmacro
;-----------------------------------------------------------------------------
; void pred8x8l_top_dc_8(uint8_t *src, int has_topleft, int has_topright, int stride)
; Luma 8x8 DC from the filtered top row only:
; dc = (sum(lowpass(top)) + 4) >> 3, with the edge pixels patched when the
; topleft/topright neighbours are unavailable.
;-----------------------------------------------------------------------------
%macro PRED8x8L_TOP_DC 0
cglobal pred8x8l_top_dc_8, 4,4
    sub          r0, r3             ; r0 = row above the block
    pxor        mm7, mm7
    movq        mm0, [r0-8]         ; bytes left of the top row (for top[-1])
    movq        mm3, [r0]           ; top row
    movq        mm1, [r0+8]         ; bytes right of the top row (for top[8])
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0    ; mm2 = top shifted right: t[-1..6]
    PALIGNR     mm1, mm4, 1, mm4    ; mm1 = top shifted left:  t[1..8]
    test         r1, r1             ; top_left
    jz .fix_lt_2
    test         r2, r2             ; top_right
    jz .fix_tr_1
    jmp .body
.fix_lt_2:                          ; no topleft: replace t[-1] with t[0]
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2             ; top_right
    jnz .body
.fix_tr_1:                          ; no topright: replace t[8] with t[7]
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
.body:
    PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5 ; mm0 = filtered top row
    psadbw      mm7, mm0            ; sum of the 8 filtered pixels
    paddw       mm7, [pw_4]
    psrlw       mm7, 3              ; dc = (sum + 4) >> 3
    pshufw      mm7, mm7, 0
    packuswb    mm7, mm7            ; broadcast dc byte
%rep 3
    movq  [r0+r3*1], mm7
    movq  [r0+r3*2], mm7
    lea          r0, [r0+r3*2]
%endrep
    movq  [r0+r3*1], mm7
    movq  [r0+r3*2], mm7
    RET
%endmacro

INIT_MMX mmxext
PRED8x8L_TOP_DC
INIT_MMX ssse3
PRED8x8L_TOP_DC
;-----------------------------------------------------------------------------
;void pred8x8l_dc_8(uint8_t *src, int has_topleft, int has_topright, int stride)
; Luma 8x8 DC from BOTH the filtered left column and filtered top row:
; dc = (sum(lowpass(left)) + sum(lowpass(top)) + 8) >> 4.  The left column
; is first gathered into a single mmx register via punpckh chains.
;-----------------------------------------------------------------------------
%macro PRED8x8L_DC 0
cglobal pred8x8l_dc_8, 4,5
    sub          r0, r3             ; r0 = row above the block
    ; gather the 8 left-column pixels (src[y*stride-1]) into mm3, top to bottom
    lea          r4, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1            ; mm3 = left column, 8 bytes
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]    ; pixel below the column (for l[8])
    movq        mm1, [r4]           ; top row (provides l[-1] via shift)
    mov          r0, r4
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0    ; l[-1..6]
    PALIGNR     mm1, mm2, 1, mm2    ; l[1..8]
    test         r1, r1
    jnz .do_left
.fix_lt_1:                          ; no topleft: patch the l[-1] neighbour
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
    jmp .do_left
.fix_lt_2:                          ; no topleft: patch t[-1] (top-row pass)
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2
    jnz .body
.fix_tr_1:                          ; no topright: patch t[8]
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .body
.do_left:
    ; two lowpass passes build the filtered left column in mm7
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq        mm4, mm0
    movq        mm7, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq       mm1, 56
    PALIGNR     mm7, mm1, 7, mm3    ; mm7 = filtered left column
    ; now load and patch the top row, same as pred8x8l_top_dc
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1
    jz .fix_lt_2
    test         r2, r2
    jz .fix_tr_1
.body:
    lea          r1, [r0+r3*2]      ; r1/r2/r4 = row-pair bases for the stores
    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5 ; mm6 = filtered top row
    pxor        mm0, mm0
    pxor        mm1, mm1
    lea          r2, [r1+r3*2]
    psadbw      mm0, mm7            ; sum of filtered left column
    psadbw      mm1, mm6            ; sum of filtered top row
    paddw       mm0, [pw_8]
    paddw       mm0, mm1
    lea          r4, [r2+r3*2]
    psrlw       mm0, 4              ; dc = (sums + 8) >> 4
    pshufw      mm0, mm0, 0
    packuswb    mm0, mm0            ; broadcast dc byte
    movq  [r0+r3*1], mm0
    movq  [r0+r3*2], mm0
    movq  [r1+r3*1], mm0
    movq  [r1+r3*2], mm0
    movq  [r2+r3*1], mm0
    movq  [r2+r3*2], mm0
    movq  [r4+r3*1], mm0
    movq  [r4+r3*2], mm0
    RET
%endmacro

INIT_MMX mmxext
PRED8x8L_DC
INIT_MMX ssse3
PRED8x8L_DC
;-----------------------------------------------------------------------------
; void pred8x8l_horizontal_8(uint8_t *src, int has_topleft, int has_topright, int stride)
; Luma 8x8 horizontal: each row is filled with the lowpass-filtered left
; neighbour for that row.
;-----------------------------------------------------------------------------
%macro PRED8x8L_HORIZONTAL 0
cglobal pred8x8l_horizontal_8, 4,4
    sub          r0, r3             ; r0 = row above the block
    lea          r2, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    test         r1, r1             ; has_topleft?
    lea          r1, [r0+r3]        ; no topleft: duplicate the first left pixel
    cmovnz       r1, r0             ; topleft present: use src[-stride-1]
    punpckhbw   mm0, [r1+r3*0-8]
    ; gather the 8 left-column pixels into mm3, top to bottom
    movq        mm1, [r2+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r2, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1            ; mm3 = left column, 8 bytes
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]    ; pixel below the column
    movq        mm1, [r1+r3*0-8]    ; topleft (or duplicated) pixel
    mov          r0, r2
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0    ; l[-1..6]
    PALIGNR     mm1, mm2, 1, mm2    ; l[1..8]
    ; two lowpass passes produce the filtered column in mm7
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq        mm4, mm0
    movq        mm7, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq       mm1, 56
    PALIGNR     mm7, mm1, 7, mm3
    ; broadcast each filtered byte into its own row register
    movq        mm3, mm7
    lea          r1, [r0+r3*2]
    movq        mm7, mm3
    punpckhbw   mm3, mm3            ; rows 0..3 source bytes
    punpcklbw   mm7, mm7            ; rows 4..7 source bytes
    pshufw      mm0, mm3, 0xff      ; row 0
    pshufw      mm1, mm3, 0xaa      ; row 1
    lea          r2, [r1+r3*2]
    pshufw      mm2, mm3, 0x55      ; row 2
    pshufw      mm3, mm3, 0x00      ; row 3
    pshufw      mm4, mm7, 0xff      ; row 4
    pshufw      mm5, mm7, 0xaa      ; row 5
    pshufw      mm6, mm7, 0x55      ; row 6
    pshufw      mm7, mm7, 0x00      ; row 7
    movq  [r0+r3*1], mm0
    movq  [r0+r3*2], mm1
    movq  [r1+r3*1], mm2
    movq  [r1+r3*2], mm3
    movq  [r2+r3*1], mm4
    movq  [r2+r3*2], mm5
    lea          r0, [r2+r3*2]
    movq  [r0+r3*1], mm6
    movq  [r0+r3*2], mm7
    RET
%endmacro

INIT_MMX mmxext
PRED8x8L_HORIZONTAL
INIT_MMX ssse3
PRED8x8L_HORIZONTAL
;-----------------------------------------------------------------------------
; void pred8x8l_vertical_8(uint8_t *src, int has_topleft, int has_topright, int stride)
; Luma 8x8 vertical: copy the lowpass-filtered top row into all 8 rows,
; patching the t[-1]/t[8] neighbours when topleft/topright are unavailable.
;-----------------------------------------------------------------------------
%macro PRED8x8L_VERTICAL 0
cglobal pred8x8l_vertical_8, 4,4
    sub          r0, r3             ; r0 = row above the block
    movq        mm0, [r0-8]         ; bytes left of the top row
    movq        mm3, [r0]           ; top row
    movq        mm1, [r0+8]         ; bytes right of the top row
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0    ; t[-1..6]
    PALIGNR     mm1, mm4, 1, mm4    ; t[1..8]
    test         r1, r1             ; top_left
    jz .fix_lt_2
    test         r2, r2             ; top_right
    jz .fix_tr_1
    jmp .body
.fix_lt_2:                          ; no topleft: replace t[-1] with t[0]
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2             ; top_right
    jnz .body
.fix_tr_1:                          ; no topright: replace t[8] with t[7]
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
.body:
    PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5 ; mm0 = filtered top row
%rep 3
    movq  [r0+r3*1], mm0
    movq  [r0+r3*2], mm0
    lea          r0, [r0+r3*2]
%endrep
    movq  [r0+r3*1], mm0
    movq  [r0+r3*2], mm0
    RET
%endmacro

INIT_MMX mmxext
PRED8x8L_VERTICAL
INIT_MMX ssse3
PRED8x8L_VERTICAL
  1201. ;-----------------------------------------------------------------------------
  1202. ;void pred8x8l_down_left_8(uint8_t *src, int has_topleft, int has_topright, int stride)
  1203. ;-----------------------------------------------------------------------------
  ; 8x8 luma down-left prediction (MMX version): build a filtered 16-sample
  ; edge from the top and top-right rows (mm7 = low half, mm1 = high half),
  ; then each successive output row is that edge shifted one byte.
  ; In: r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride.
  1204. INIT_MMX mmxext
  1205. cglobal pred8x8l_down_left_8, 4,5
  1206. sub r0, r3 ; r0 -> row above the block
  1207. movq mm0, [r0-8]
  1208. movq mm3, [r0] ; mm3 = t0..t7
  1209. movq mm1, [r0+8] ; top-right bytes (may be invalid if !has_topright)
  1210. movq mm2, mm3
  1211. movq mm4, mm3
  1212. PALIGNR mm2, mm0, 7, mm0 ; shifted right, with top-left byte
  1213. PALIGNR mm1, mm4, 1, mm4 ; shifted left, with first top-right byte
  1214. test r1, r1
  1215. jz .fix_lt_2
  1216. test r2, r2
  1217. jz .fix_tr_1
  1218. jmp .do_top
  ; no top-left: substitute t0 for the missing corner byte in mm2
  1219. .fix_lt_2:
  1220. movq mm5, mm3
  1221. pxor mm5, mm2
  1222. psllq mm5, 56
  1223. psrlq mm5, 56
  1224. pxor mm2, mm5
  1225. test r2, r2
  1226. jnz .do_top
  ; no top-right: substitute t7 for the missing byte in mm1
  1227. .fix_tr_1:
  1228. movq mm5, mm3
  1229. pxor mm5, mm1
  1230. psrlq mm5, 56
  1231. psllq mm5, 56
  1232. pxor mm1, mm5
  1233. jmp .do_top
  ; no top-right row at all: extend by replicating t7 across mm1
  1234. .fix_tr_2:
  1235. punpckhbw mm3, mm3
  1236. pshufw mm1, mm3, 0xFF
  1237. jmp .do_topright
  1238. .do_top:
  1239. PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 ; filtered top row
  1240. movq mm7, mm4
  1241. test r2, r2
  1242. jz .fix_tr_2
  ; filter the real top-right row (t8..t15) the same way into mm1
  1243. movq mm0, [r0+8]
  1244. movq mm5, mm0
  1245. movq mm2, mm0
  1246. movq mm4, mm0
  1247. psrlq mm5, 56
  1248. PALIGNR mm2, mm3, 7, mm3
  1249. PALIGNR mm5, mm4, 1, mm4
  1250. PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
  1251. .do_topright:
  ; second lowpass pass over the 16-byte edge (mm7|mm1) producing the two
  ; final prediction qwords mm0 (low) / mm1 (high); r1,r2,r4 = row pointers.
  1252. lea r1, [r0+r3*2]
  1253. movq mm6, mm1
  1254. psrlq mm1, 56
  1255. movq mm4, mm1
  1256. lea r2, [r1+r3*2]
  1257. movq mm2, mm6
  1258. PALIGNR mm2, mm7, 1, mm0
  1259. movq mm3, mm6
  1260. PALIGNR mm3, mm7, 7, mm0
  1261. PALIGNR mm4, mm6, 1, mm0
  1262. movq mm5, mm7
  1263. movq mm1, mm7
  1264. movq mm7, mm6
  1265. lea r4, [r2+r3*2]
  1266. psllq mm1, 8
  1267. PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
  1268. PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
  ; store rows bottom-to-top; each step shifts the 16-byte edge left one
  ; byte (psllq/psrlq/por carries a byte from mm0 into mm1)
  1269. movq [r4+r3*2], mm1
  1270. movq mm2, mm0
  1271. psllq mm1, 8
  1272. psrlq mm2, 56
  1273. psllq mm0, 8
  1274. por mm1, mm2
  1275. movq [r4+r3*1], mm1
  1276. movq mm2, mm0
  1277. psllq mm1, 8
  1278. psrlq mm2, 56
  1279. psllq mm0, 8
  1280. por mm1, mm2
  1281. movq [r2+r3*2], mm1
  1282. movq mm2, mm0
  1283. psllq mm1, 8
  1284. psrlq mm2, 56
  1285. psllq mm0, 8
  1286. por mm1, mm2
  1287. movq [r2+r3*1], mm1
  1288. movq mm2, mm0
  1289. psllq mm1, 8
  1290. psrlq mm2, 56
  1291. psllq mm0, 8
  1292. por mm1, mm2
  1293. movq [r1+r3*2], mm1
  1294. movq mm2, mm0
  1295. psllq mm1, 8
  1296. psrlq mm2, 56
  1297. psllq mm0, 8
  1298. por mm1, mm2
  1299. movq [r1+r3*1], mm1
  1300. movq mm2, mm0
  1301. psllq mm1, 8
  1302. psrlq mm2, 56
  1303. psllq mm0, 8
  1304. por mm1, mm2
  1305. movq [r0+r3*2], mm1
  1306. psllq mm1, 8
  1307. psrlq mm0, 56
  1308. por mm1, mm0
  1309. movq [r0+r3*1], mm1
  1310. RET
  ; SSE2/SSSE3 down-left prediction: same edge preparation as the MMX
  ; version, but the 16-byte filtered edge is assembled in an XMM register
  ; (movq2dq + pslldq/por) so each output row is a single psrldq + movq.
  ; In: r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride.
  1311. %macro PRED8x8L_DOWN_LEFT 0
  1312. cglobal pred8x8l_down_left_8, 4,4
  1313. sub r0, r3 ; r0 -> row above the block
  1314. movq mm0, [r0-8]
  1315. movq mm3, [r0] ; t0..t7
  1316. movq mm1, [r0+8] ; top-right bytes
  1317. movq mm2, mm3
  1318. movq mm4, mm3
  1319. PALIGNR mm2, mm0, 7, mm0
  1320. PALIGNR mm1, mm4, 1, mm4
  1321. test r1, r1 ; top_left
  1322. jz .fix_lt_2
  1323. test r2, r2 ; top_right
  1324. jz .fix_tr_1
  1325. jmp .do_top
  ; no top-left: substitute t0 for the corner byte of mm2
  1326. .fix_lt_2:
  1327. movq mm5, mm3
  1328. pxor mm5, mm2
  1329. psllq mm5, 56
  1330. psrlq mm5, 56
  1331. pxor mm2, mm5
  1332. test r2, r2 ; top_right
  1333. jnz .do_top
  ; no top-right: substitute t7 for the high byte of mm1
  1334. .fix_tr_1:
  1335. movq mm5, mm3
  1336. pxor mm5, mm1
  1337. psrlq mm5, 56
  1338. psllq mm5, 56
  1339. pxor mm1, mm5
  1340. jmp .do_top
  ; no top-right row: extend by replicating t7
  1341. .fix_tr_2:
  1342. punpckhbw mm3, mm3
  1343. pshufw mm1, mm3, 0xFF
  1344. jmp .do_topright
  1345. .do_top:
  1346. PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
  1347. movq2dq xmm3, mm4 ; low half of the edge into xmm3
  1348. test r2, r2 ; top_right
  1349. jz .fix_tr_2
  ; filter the real top-right row into mm1
  1350. movq mm0, [r0+8]
  1351. movq mm5, mm0
  1352. movq mm2, mm0
  1353. movq mm4, mm0
  1354. psrlq mm5, 56
  1355. PALIGNR mm2, mm3, 7, mm3
  1356. PALIGNR mm5, mm4, 1, mm4
  1357. PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
  1358. .do_topright:
  ; xmm3 = 16-byte edge; xmm1/xmm2 = edge shifted left/right one byte for
  ; the final lowpass; xmm5 carries the edge's last byte for the right end.
  1359. movq2dq xmm4, mm1
  1360. psrlq mm1, 56
  1361. movq2dq xmm5, mm1
  1362. lea r1, [r0+r3*2]
  1363. pslldq xmm4, 8
  1364. por xmm3, xmm4
  1365. movdqa xmm2, xmm3
  1366. psrldq xmm2, 1
  1367. pslldq xmm5, 15
  1368. por xmm2, xmm5
  1369. lea r2, [r1+r3*2]
  1370. movdqa xmm1, xmm3
  1371. pslldq xmm1, 1
  ; switch the macro expansion to XMM so PRED4x4_LOWPASS uses full-width ops
  1372. INIT_XMM cpuname
  1373. PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
  ; row n of the output is bytes n+1..n+8 of the filtered edge
  1374. psrldq xmm0, 1
  1375. movq [r0+r3*1], xmm0
  1376. psrldq xmm0, 1
  1377. movq [r0+r3*2], xmm0
  1378. psrldq xmm0, 1
  1379. lea r0, [r2+r3*2]
  1380. movq [r1+r3*1], xmm0
  1381. psrldq xmm0, 1
  1382. movq [r1+r3*2], xmm0
  1383. psrldq xmm0, 1
  1384. movq [r2+r3*1], xmm0
  1385. psrldq xmm0, 1
  1386. movq [r2+r3*2], xmm0
  1387. psrldq xmm0, 1
  1388. movq [r0+r3*1], xmm0
  1389. psrldq xmm0, 1
  1390. movq [r0+r3*2], xmm0
  1391. RET
  1392. %endmacro
  1393. INIT_MMX sse2
  1394. PRED8x8L_DOWN_LEFT
  1395. INIT_MMX ssse3
  1396. PRED8x8L_DOWN_LEFT
  1397. ;-----------------------------------------------------------------------------
  1398. ;void pred8x8l_down_right_8_mmxext(uint8_t *src, int has_topleft, int has_topright, int stride)
  1399. ;-----------------------------------------------------------------------------
  ; 8x8 luma down-right prediction (MMX version): gathers the 8 left-edge
  ; pixels into one qword (mm3), filters left edge and top row, then emits
  ; rows by shifting the combined edge right one byte per row.
  ; In: r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride.
  1400. INIT_MMX mmxext
  1401. cglobal pred8x8l_down_right_8, 4,5
  1402. sub r0, r3 ; r0 -> row above the block
  ; pack the 8 left-column bytes (rows 0..7, column -1) into mm3 using
  ; successive punpckhbw/punpckhwd/punpckhdq merges
  1403. lea r4, [r0+r3*2]
  1404. movq mm0, [r0+r3*1-8]
  1405. punpckhbw mm0, [r0+r3*0-8]
  1406. movq mm1, [r4+r3*1-8]
  1407. punpckhbw mm1, [r0+r3*2-8]
  1408. mov r4, r0
  1409. punpckhwd mm1, mm0
  1410. lea r0, [r0+r3*4]
  1411. movq mm2, [r0+r3*1-8]
  1412. punpckhbw mm2, [r0+r3*0-8]
  1413. lea r0, [r0+r3*2]
  1414. movq mm3, [r0+r3*1-8]
  1415. punpckhbw mm3, [r0+r3*0-8]
  1416. punpckhwd mm3, mm2
  1417. punpckhdq mm3, mm1 ; mm3 = l7..l0 packed
  1418. lea r0, [r0+r3*2]
  1419. movq mm0, [r0+r3*0-8]
  1420. movq mm1, [r4]
  1421. mov r0, r4 ; restore r0 -> row above the block
  1422. movq mm4, mm3
  1423. movq mm2, mm3
  1424. PALIGNR mm4, mm0, 7, mm0 ; left edge shifted, bottom neighbour in
  1425. PALIGNR mm1, mm2, 1, mm2 ; left edge shifted, top-left byte in
  1426. test r1, r1 ; top_left
  1427. jz .fix_lt_1
  ; lowpass the left edge twice: mm7 = filtered left column
  1428. .do_left:
  1429. movq mm0, mm4
  1430. PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
  1431. movq mm4, mm0
  1432. movq mm7, mm2
  1433. movq mm6, mm2
  1434. PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
  1435. psllq mm1, 56
  1436. PALIGNR mm7, mm1, 7, mm3
  ; now prepare the top row exactly as in the vertical predictor
  1437. movq mm0, [r0-8]
  1438. movq mm3, [r0]
  1439. movq mm1, [r0+8]
  1440. movq mm2, mm3
  1441. movq mm4, mm3
  1442. PALIGNR mm2, mm0, 7, mm0
  1443. PALIGNR mm1, mm4, 1, mm4
  1444. test r1, r1 ; top_left
  1445. jz .fix_lt_2
  1446. test r2, r2 ; top_right
  1447. jz .fix_tr_1
  1448. .do_top:
  1449. PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 ; filtered top row
  1450. movq mm5, mm4
  1451. jmp .body
  ; no top-left: patch the unavailable corner byte in the left-edge vector
  1452. .fix_lt_1:
  1453. movq mm5, mm3
  1454. pxor mm5, mm4
  1455. psrlq mm5, 56
  1456. psllq mm5, 48
  1457. pxor mm1, mm5
  1458. jmp .do_left
  ; no top-left: patch the corner byte of the shifted top row
  1459. .fix_lt_2:
  1460. movq mm5, mm3
  1461. pxor mm5, mm2
  1462. psllq mm5, 56
  1463. psrlq mm5, 56
  1464. pxor mm2, mm5
  1465. test r2, r2 ; top_right
  1466. jnz .do_top
  ; no top-right: replicate t7 into the high byte of mm1
  1467. .fix_tr_1:
  1468. movq mm5, mm3
  1469. pxor mm5, mm1
  1470. psrlq mm5, 56
  1471. psllq mm5, 56
  1472. pxor mm1, mm5
  1473. jmp .do_top
  1474. .body:
  ; combine filtered left edge (mm6/mm7) and filtered top (mm5) with one
  ; more lowpass each, then store rows bottom-to-top, shifting the
  ; (mm1:mm0) pair right one byte per row with a carry byte via psllq 56.
  1475. lea r1, [r0+r3*2]
  1476. movq mm1, mm7
  1477. movq mm7, mm5
  1478. movq mm5, mm6
  1479. movq mm2, mm7
  1480. lea r2, [r1+r3*2]
  1481. PALIGNR mm2, mm6, 1, mm0
  1482. movq mm3, mm7
  1483. PALIGNR mm3, mm6, 7, mm0
  1484. movq mm4, mm7
  1485. lea r4, [r2+r3*2]
  1486. psrlq mm4, 8
  1487. PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
  1488. PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
  1489. movq [r4+r3*2], mm0
  1490. movq mm2, mm1
  1491. psrlq mm0, 8
  1492. psllq mm2, 56
  1493. psrlq mm1, 8
  1494. por mm0, mm2
  1495. movq [r4+r3*1], mm0
  1496. movq mm2, mm1
  1497. psrlq mm0, 8
  1498. psllq mm2, 56
  1499. psrlq mm1, 8
  1500. por mm0, mm2
  1501. movq [r2+r3*2], mm0
  1502. movq mm2, mm1
  1503. psrlq mm0, 8
  1504. psllq mm2, 56
  1505. psrlq mm1, 8
  1506. por mm0, mm2
  1507. movq [r2+r3*1], mm0
  1508. movq mm2, mm1
  1509. psrlq mm0, 8
  1510. psllq mm2, 56
  1511. psrlq mm1, 8
  1512. por mm0, mm2
  1513. movq [r1+r3*2], mm0
  1514. movq mm2, mm1
  1515. psrlq mm0, 8
  1516. psllq mm2, 56
  1517. psrlq mm1, 8
  1518. por mm0, mm2
  1519. movq [r1+r3*1], mm0
  1520. movq mm2, mm1
  1521. psrlq mm0, 8
  1522. psllq mm2, 56
  1523. psrlq mm1, 8
  1524. por mm0, mm2
  1525. movq [r0+r3*2], mm0
  1526. psrlq mm0, 8
  1527. psllq mm1, 56
  1528. por mm0, mm1
  1529. movq [r0+r3*1], mm0
  1530. RET
  ; SSE2/SSSE3 down-right prediction: same left/top gathering as the MMX
  ; version, but the filtered edges are merged into XMM registers so the
  ; final lowpass and the 8 row stores run at 16-byte width.
  ; In: r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride.
  1531. %macro PRED8x8L_DOWN_RIGHT 0
  1532. cglobal pred8x8l_down_right_8, 4,5
  1533. sub r0, r3 ; r0 -> row above the block
  ; pack the 8 left-column bytes into mm3 via punpckh* merges
  1534. lea r4, [r0+r3*2]
  1535. movq mm0, [r0+r3*1-8]
  1536. punpckhbw mm0, [r0+r3*0-8]
  1537. movq mm1, [r4+r3*1-8]
  1538. punpckhbw mm1, [r0+r3*2-8]
  1539. mov r4, r0
  1540. punpckhwd mm1, mm0
  1541. lea r0, [r0+r3*4]
  1542. movq mm2, [r0+r3*1-8]
  1543. punpckhbw mm2, [r0+r3*0-8]
  1544. lea r0, [r0+r3*2]
  1545. movq mm3, [r0+r3*1-8]
  1546. punpckhbw mm3, [r0+r3*0-8]
  1547. punpckhwd mm3, mm2
  1548. punpckhdq mm3, mm1
  1549. lea r0, [r0+r3*2]
  1550. movq mm0, [r0+r3*0-8]
  1551. movq mm1, [r4]
  1552. mov r0, r4
  1553. movq mm4, mm3
  1554. movq mm2, mm3
  1555. PALIGNR mm4, mm0, 7, mm0
  1556. PALIGNR mm1, mm2, 1, mm2
  1557. test r1, r1
  1558. jz .fix_lt_1
  1559. jmp .do_left
  ; no top-left: patch the unavailable corner byte in the left-edge vector
  1560. .fix_lt_1:
  1561. movq mm5, mm3
  1562. pxor mm5, mm4
  1563. psrlq mm5, 56
  1564. psllq mm5, 48
  1565. pxor mm1, mm5
  1566. jmp .do_left
  ; no top-left: patch the corner byte of the shifted top row
  1567. .fix_lt_2:
  1568. movq mm5, mm3
  1569. pxor mm5, mm2
  1570. psllq mm5, 56
  1571. psrlq mm5, 56
  1572. pxor mm2, mm5
  1573. test r2, r2
  1574. jnz .do_top
  ; no top-right: replicate t7 into the high byte of mm1
  1575. .fix_tr_1:
  1576. movq mm5, mm3
  1577. pxor mm5, mm1
  1578. psrlq mm5, 56
  1579. psllq mm5, 56
  1580. pxor mm1, mm5
  1581. jmp .do_top
  ; filter the left edge; stash intermediate results in xmm3/xmm1
  1582. .do_left:
  1583. movq mm0, mm4
  1584. PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
  1585. movq mm4, mm0
  1586. movq mm7, mm2
  1587. movq2dq xmm3, mm2
  1588. PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
  1589. psllq mm1, 56
  1590. PALIGNR mm7, mm1, 7, mm3
  1591. movq2dq xmm1, mm7
  ; prepare and filter the top row
  1592. movq mm0, [r0-8]
  1593. movq mm3, [r0]
  1594. movq mm1, [r0+8]
  1595. movq mm2, mm3
  1596. movq mm4, mm3
  1597. PALIGNR mm2, mm0, 7, mm0
  1598. PALIGNR mm1, mm4, 1, mm4
  1599. test r1, r1
  1600. jz .fix_lt_2
  1601. test r2, r2
  1602. jz .fix_tr_1
  1603. .do_top:
  1604. PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
  ; assemble the 16-byte diagonal edge in xmm3 (left | top) and its
  ; one-byte-shifted neighbours in xmm1/xmm2 for the final lowpass
  1605. movq2dq xmm4, mm4
  1606. lea r1, [r0+r3*2]
  1607. movdqa xmm0, xmm3
  1608. pslldq xmm4, 8
  1609. por xmm3, xmm4
  1610. lea r2, [r1+r3*2]
  1611. pslldq xmm4, 1
  1612. por xmm1, xmm4
  1613. psrldq xmm0, 7
  1614. pslldq xmm0, 15
  1615. psrldq xmm0, 7
  1616. por xmm1, xmm0
  1617. lea r0, [r2+r3*2]
  1618. movdqa xmm2, xmm3
  1619. psrldq xmm2, 1
  ; switch macro expansion to XMM width for the final filter
  1620. INIT_XMM cpuname
  1621. PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
  ; interleaved stores: xmm0/xmm1 hold consecutive diagonals, two rows
  ; per psrldq step, written bottom row first
  1622. movdqa xmm1, xmm0
  1623. psrldq xmm1, 1
  1624. movq [r0+r3*2], xmm0
  1625. movq [r0+r3*1], xmm1
  1626. psrldq xmm0, 2
  1627. psrldq xmm1, 2
  1628. movq [r2+r3*2], xmm0
  1629. movq [r2+r3*1], xmm1
  1630. psrldq xmm0, 2
  1631. psrldq xmm1, 2
  1632. movq [r1+r3*2], xmm0
  1633. movq [r1+r3*1], xmm1
  1634. psrldq xmm0, 2
  1635. psrldq xmm1, 2
  1636. movq [r4+r3*2], xmm0
  1637. movq [r4+r3*1], xmm1
  1638. RET
  1639. %endmacro
  1640. INIT_MMX sse2
  1641. PRED8x8L_DOWN_RIGHT
  1642. INIT_MMX ssse3
  1643. PRED8x8L_DOWN_RIGHT
  1644. ;-----------------------------------------------------------------------------
  1645. ; void pred8x8l_vertical_right_8(uint8_t *src, int has_topleft, int has_topright, int stride)
  1646. ;-----------------------------------------------------------------------------
  ; 8x8 luma vertical-right prediction (MMX version). The top two output
  ; rows come from pavgb / lowpass of the filtered top+corner samples; the
  ; remaining rows are the same two rows with left-edge bytes shifted in
  ; one per row pair (PALIGNR with the filtered left column mm0).
  ; In: r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride.
  1647. INIT_MMX mmxext
  1648. cglobal pred8x8l_vertical_right_8, 4,5
  1649. sub r0, r3 ; r0 -> row above the block
  ; pack the 8 left-column bytes into mm3 via punpckh* merges
  1650. lea r4, [r0+r3*2]
  1651. movq mm0, [r0+r3*1-8]
  1652. punpckhbw mm0, [r0+r3*0-8]
  1653. movq mm1, [r4+r3*1-8]
  1654. punpckhbw mm1, [r0+r3*2-8]
  1655. mov r4, r0
  1656. punpckhwd mm1, mm0
  1657. lea r0, [r0+r3*4]
  1658. movq mm2, [r0+r3*1-8]
  1659. punpckhbw mm2, [r0+r3*0-8]
  1660. lea r0, [r0+r3*2]
  1661. movq mm3, [r0+r3*1-8]
  1662. punpckhbw mm3, [r0+r3*0-8]
  1663. punpckhwd mm3, mm2
  1664. punpckhdq mm3, mm1
  1665. lea r0, [r0+r3*2]
  1666. movq mm0, [r0+r3*0-8]
  1667. movq mm1, [r4]
  1668. mov r0, r4
  1669. movq mm4, mm3
  1670. movq mm2, mm3
  1671. PALIGNR mm4, mm0, 7, mm0
  1672. PALIGNR mm1, mm2, 1, mm2
  1673. test r1, r1
  1674. jz .fix_lt_1
  1675. jmp .do_left
  ; no top-left: patch the unavailable corner byte in the left-edge vector
  1676. .fix_lt_1:
  1677. movq mm5, mm3
  1678. pxor mm5, mm4
  1679. psrlq mm5, 56
  1680. psllq mm5, 48
  1681. pxor mm1, mm5
  1682. jmp .do_left
  ; no top-left: patch the corner byte of the shifted top row
  1683. .fix_lt_2:
  1684. movq mm5, mm3
  1685. pxor mm5, mm2
  1686. psllq mm5, 56
  1687. psrlq mm5, 56
  1688. pxor mm2, mm5
  1689. test r2, r2
  1690. jnz .do_top
  ; no top-right: replicate t7 into the high byte of mm1
  1691. .fix_tr_1:
  1692. movq mm5, mm3
  1693. pxor mm5, mm1
  1694. psrlq mm5, 56
  1695. psllq mm5, 56
  1696. pxor mm1, mm5
  1697. jmp .do_top
  ; mm7 = lowpass-filtered left edge (incl. corner)
  1698. .do_left:
  1699. movq mm0, mm4
  1700. PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
  1701. movq mm7, mm2
  ; prepare and filter the top row
  1702. movq mm0, [r0-8]
  1703. movq mm3, [r0]
  1704. movq mm1, [r0+8]
  1705. movq mm2, mm3
  1706. movq mm4, mm3
  1707. PALIGNR mm2, mm0, 7, mm0
  1708. PALIGNR mm1, mm4, 1, mm4
  1709. test r1, r1
  1710. jz .fix_lt_2
  1711. test r2, r2
  1712. jz .fix_tr_1
  1713. .do_top:
  1714. PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5 ; mm6 = filtered top row
  ; rows 0/1: pavgb and lowpass of (corner|top) against the shifted top
  1715. lea r1, [r0+r3*2]
  1716. movq mm2, mm6
  1717. movq mm3, mm6
  1718. PALIGNR mm3, mm7, 7, mm0
  1719. PALIGNR mm6, mm7, 6, mm1
  1720. movq mm4, mm3
  1721. pavgb mm3, mm2
  1722. lea r2, [r1+r3*2]
  1723. PRED4x4_LOWPASS mm0, mm6, mm2, mm4, mm5
  1724. movq [r0+r3*1], mm3
  1725. movq [r0+r3*2], mm0
  ; build mm0 = filtered left-edge byte stream; the remaining six rows are
  ; rows 0/1 (mm6/mm5) with one left byte shifted in per row pair
  1726. movq mm5, mm0
  1727. movq mm6, mm3
  1728. movq mm1, mm7
  1729. movq mm2, mm1
  1730. psllq mm2, 8
  1731. movq mm3, mm1
  1732. psllq mm3, 16
  1733. lea r4, [r2+r3*2]
  1734. PRED4x4_LOWPASS mm0, mm1, mm3, mm2, mm4
  1735. PALIGNR mm6, mm0, 7, mm2
  1736. movq [r1+r3*1], mm6
  1737. psllq mm0, 8
  1738. PALIGNR mm5, mm0, 7, mm1
  1739. movq [r1+r3*2], mm5
  1740. psllq mm0, 8
  1741. PALIGNR mm6, mm0, 7, mm2
  1742. movq [r2+r3*1], mm6
  1743. psllq mm0, 8
  1744. PALIGNR mm5, mm0, 7, mm1
  1745. movq [r2+r3*2], mm5
  1746. psllq mm0, 8
  1747. PALIGNR mm6, mm0, 7, mm2
  1748. movq [r4+r3*1], mm6
  1749. psllq mm0, 8
  1750. PALIGNR mm5, mm0, 7, mm1
  1751. movq [r4+r3*2], mm5
  1752. RET
  ; SSE2/SSSE3 vertical-right prediction: left and top edges are filtered
  ; with MMX and merged into xmm0, then the two base rows (pavgb + lowpass)
  ; and all shifted rows are produced at XMM width. pw_ff00 is used to
  ; interleave odd/even filtered bytes via pandn/psrlw/packuswb.
  ; In: r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride.
  1753. %macro PRED8x8L_VERTICAL_RIGHT 0
  1754. cglobal pred8x8l_vertical_right_8, 4,5,7
  1755. ; manually spill XMM registers for Win64 because
  1756. ; the code here is initialized with INIT_MMX
  1757. WIN64_SPILL_XMM 7
  1758. sub r0, r3 ; r0 -> row above the block
  ; pack the 8 left-column bytes into mm3 via punpckh* merges
  1759. lea r4, [r0+r3*2]
  1760. movq mm0, [r0+r3*1-8]
  1761. punpckhbw mm0, [r0+r3*0-8]
  1762. movq mm1, [r4+r3*1-8]
  1763. punpckhbw mm1, [r0+r3*2-8]
  1764. mov r4, r0
  1765. punpckhwd mm1, mm0
  1766. lea r0, [r0+r3*4]
  1767. movq mm2, [r0+r3*1-8]
  1768. punpckhbw mm2, [r0+r3*0-8]
  1769. lea r0, [r0+r3*2]
  1770. movq mm3, [r0+r3*1-8]
  1771. punpckhbw mm3, [r0+r3*0-8]
  1772. punpckhwd mm3, mm2
  1773. punpckhdq mm3, mm1
  1774. lea r0, [r0+r3*2]
  1775. movq mm0, [r0+r3*0-8]
  1776. movq mm1, [r4]
  1777. mov r0, r4
  1778. movq mm4, mm3
  1779. movq mm2, mm3
  1780. PALIGNR mm4, mm0, 7, mm0
  1781. PALIGNR mm1, mm2, 1, mm2
  1782. test r1, r1
  1783. jnz .do_left
  ; no top-left: patch the unavailable corner byte in the left-edge vector
  1784. .fix_lt_1:
  1785. movq mm5, mm3
  1786. pxor mm5, mm4
  1787. psrlq mm5, 56
  1788. psllq mm5, 48
  1789. pxor mm1, mm5
  1790. jmp .do_left
  ; no top-left: patch the corner byte of the shifted top row
  1791. .fix_lt_2:
  1792. movq mm5, mm3
  1793. pxor mm5, mm2
  1794. psllq mm5, 56
  1795. psrlq mm5, 56
  1796. pxor mm2, mm5
  1797. test r2, r2
  1798. jnz .do_top
  ; no top-right: replicate t7 into the high byte of mm1
  1799. .fix_tr_1:
  1800. movq mm5, mm3
  1801. pxor mm5, mm1
  1802. psrlq mm5, 56
  1803. psllq mm5, 56
  1804. pxor mm1, mm5
  1805. jmp .do_top
  ; filter the left edge and stage it in the low half of xmm0
  1806. .do_left:
  1807. movq mm0, mm4
  1808. PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
  1809. movq2dq xmm0, mm2
  ; prepare and filter the top row
  1810. movq mm0, [r0-8]
  1811. movq mm3, [r0]
  1812. movq mm1, [r0+8]
  1813. movq mm2, mm3
  1814. movq mm4, mm3
  1815. PALIGNR mm2, mm0, 7, mm0
  1816. PALIGNR mm1, mm4, 1, mm4
  1817. test r1, r1
  1818. jz .fix_lt_2
  1819. test r2, r2
  1820. jz .fix_tr_1
  1821. .do_top:
  1822. PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
  ; xmm0 = (filtered top | filtered left); xmm2 = pavgb row, xmm4 = lowpass
  ; row; pandn/psrlw/packuswb split xmm4 into the interleaved row streams
  1823. lea r1, [r0+r3*2]
  1824. movq2dq xmm4, mm6
  1825. pslldq xmm4, 8
  1826. por xmm0, xmm4
  1827. movdqa xmm6, [pw_ff00]
  1828. movdqa xmm1, xmm0
  1829. lea r2, [r1+r3*2]
  1830. movdqa xmm2, xmm0
  1831. movdqa xmm3, xmm0
  1832. pslldq xmm0, 1
  1833. pslldq xmm1, 2
  1834. pavgb xmm2, xmm0
  ; switch macro expansion to XMM width for the final filter
  1835. INIT_XMM cpuname
  1836. PRED4x4_LOWPASS xmm4, xmm3, xmm1, xmm0, xmm5
  1837. pandn xmm6, xmm4
  1838. movdqa xmm5, xmm4
  1839. psrlw xmm4, 8
  1840. packuswb xmm6, xmm4
  1841. movhlps xmm4, xmm6
  ; rows 0/1 from the high halves, then six more rows by shifting left-edge
  ; bytes (movss merges) into xmm5/xmm2 one per row pair
  1842. movhps [r0+r3*2], xmm5
  1843. movhps [r0+r3*1], xmm2
  1844. psrldq xmm5, 4
  1845. movss xmm5, xmm6
  1846. psrldq xmm2, 4
  1847. movss xmm2, xmm4
  1848. lea r0, [r2+r3*2]
  1849. psrldq xmm5, 1
  1850. psrldq xmm2, 1
  1851. movq [r0+r3*2], xmm5
  1852. movq [r0+r3*1], xmm2
  1853. psrldq xmm5, 1
  1854. psrldq xmm2, 1
  1855. movq [r2+r3*2], xmm5
  1856. movq [r2+r3*1], xmm2
  1857. psrldq xmm5, 1
  1858. psrldq xmm2, 1
  1859. movq [r1+r3*2], xmm5
  1860. movq [r1+r3*1], xmm2
  1861. RET
  1862. %endmacro
  1863. INIT_MMX sse2
  1864. PRED8x8L_VERTICAL_RIGHT
  1865. INIT_MMX ssse3
  1866. PRED8x8L_VERTICAL_RIGHT
  1867. ;-----------------------------------------------------------------------------
  1868. ;void pred8x8l_vertical_left_8(uint8_t *src, int has_topleft, int has_topright, int stride)
  1869. ;-----------------------------------------------------------------------------
  ; 8x8 luma vertical-left prediction (SSE2/SSSE3): build a 16-byte filtered
  ; top edge (top | top-right) in an XMM register; even rows come from
  ; pavgb of adjacent edge bytes, odd rows from the 3-tap lowpass, each
  ; subsequent row pair shifted one byte with psrldq.
  ; In: r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride.
  1870. %macro PRED8x8L_VERTICAL_LEFT 0
  1871. cglobal pred8x8l_vertical_left_8, 4,4
  1872. sub r0, r3 ; r0 -> row above the block
  1873. movq mm0, [r0-8]
  1874. movq mm3, [r0] ; t0..t7
  1875. movq mm1, [r0+8] ; top-right bytes
  1876. movq mm2, mm3
  1877. movq mm4, mm3
  1878. PALIGNR mm2, mm0, 7, mm0
  1879. PALIGNR mm1, mm4, 1, mm4
  1880. test r1, r1
  1881. jz .fix_lt_2
  1882. test r2, r2
  1883. jz .fix_tr_1
  1884. jmp .do_top
  ; no top-left: substitute t0 for the corner byte of mm2
  1885. .fix_lt_2:
  1886. movq mm5, mm3
  1887. pxor mm5, mm2
  1888. psllq mm5, 56
  1889. psrlq mm5, 56
  1890. pxor mm2, mm5
  1891. test r2, r2
  1892. jnz .do_top
  ; no top-right: substitute t7 for the high byte of mm1
  1893. .fix_tr_1:
  1894. movq mm5, mm3
  1895. pxor mm5, mm1
  1896. psrlq mm5, 56
  1897. psllq mm5, 56
  1898. pxor mm1, mm5
  1899. jmp .do_top
  ; no top-right row: extend by replicating t7
  1900. .fix_tr_2:
  1901. punpckhbw mm3, mm3
  1902. pshufw mm1, mm3, 0xFF
  1903. jmp .do_topright
  1904. .do_top:
  1905. PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
  1906. movq2dq xmm4, mm4 ; filtered top half of the edge
  1907. test r2, r2
  1908. jz .fix_tr_2
  ; filter the real top-right row into mm1
  1909. movq mm0, [r0+8]
  1910. movq mm5, mm0
  1911. movq mm2, mm0
  1912. movq mm4, mm0
  1913. psrlq mm5, 56
  1914. PALIGNR mm2, mm3, 7, mm3
  1915. PALIGNR mm5, mm4, 1, mm4
  1916. PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
  1917. .do_topright:
  ; xmm4 = 16-byte edge; xmm3 = pavgb row stream, xmm0 = lowpass row stream
  1918. movq2dq xmm3, mm1
  1919. lea r1, [r0+r3*2]
  1920. pslldq xmm3, 8
  1921. por xmm4, xmm3
  1922. movdqa xmm2, xmm4
  1923. movdqa xmm1, xmm4
  1924. movdqa xmm3, xmm4
  1925. psrldq xmm2, 1
  1926. pslldq xmm1, 1
  1927. pavgb xmm3, xmm2
  1928. lea r2, [r1+r3*2]
  ; switch macro expansion to XMM width for the final filter
  1929. INIT_XMM cpuname
  1930. PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm4, xmm5
  1931. psrldq xmm0, 1
  ; alternate the two streams, shifting each one byte per row pair
  1932. movq [r0+r3*1], xmm3
  1933. movq [r0+r3*2], xmm0
  1934. lea r0, [r2+r3*2]
  1935. psrldq xmm3, 1
  1936. psrldq xmm0, 1
  1937. movq [r1+r3*1], xmm3
  1938. movq [r1+r3*2], xmm0
  1939. psrldq xmm3, 1
  1940. psrldq xmm0, 1
  1941. movq [r2+r3*1], xmm3
  1942. movq [r2+r3*2], xmm0
  1943. psrldq xmm3, 1
  1944. psrldq xmm0, 1
  1945. movq [r0+r3*1], xmm3
  1946. movq [r0+r3*2], xmm0
  1947. RET
  1948. %endmacro
  1949. INIT_MMX sse2
  1950. PRED8x8L_VERTICAL_LEFT
  1951. INIT_MMX ssse3
  1952. PRED8x8L_VERTICAL_LEFT
  1953. ;-----------------------------------------------------------------------------
  1954. ; void pred8x8l_horizontal_up_8(uint8_t *src, int has_topleft, int has_topright, int stride)
  1955. ;-----------------------------------------------------------------------------
  ; 8x8 luma horizontal-up prediction: uses only the (filtered) left column.
  ; Interleaves pavgb and lowpass values of adjacent left pixels into an
  ; 8-pair stream, then each output row is a 2-byte shift of that stream
  ; (later rows saturate to the last left pixel via pshufw replication).
  ; In: r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride.
  1956. %macro PRED8x8L_HORIZONTAL_UP 0
  1957. cglobal pred8x8l_horizontal_up_8, 4,4
  1958. sub r0, r3 ; r0 -> row above the block
  ; pack the 8 left-column bytes into mm3; cmovnz picks the row used for
  ; the topmost sample depending on has_topleft
  1959. lea r2, [r0+r3*2]
  1960. movq mm0, [r0+r3*1-8]
  1961. test r1, r1
  1962. lea r1, [r0+r3]
  1963. cmovnz r1, r0
  1964. punpckhbw mm0, [r1+r3*0-8]
  1965. movq mm1, [r2+r3*1-8]
  1966. punpckhbw mm1, [r0+r3*2-8]
  1967. mov r2, r0
  1968. punpckhwd mm1, mm0
  1969. lea r0, [r0+r3*4]
  1970. movq mm2, [r0+r3*1-8]
  1971. punpckhbw mm2, [r0+r3*0-8]
  1972. lea r0, [r0+r3*2]
  1973. movq mm3, [r0+r3*1-8]
  1974. punpckhbw mm3, [r0+r3*0-8]
  1975. punpckhwd mm3, mm2
  1976. punpckhdq mm3, mm1
  1977. lea r0, [r0+r3*2]
  1978. movq mm0, [r0+r3*0-8]
  1979. movq mm1, [r1+r3*0-8]
  1980. mov r0, r2
  1981. movq mm4, mm3
  1982. movq mm2, mm3
  1983. PALIGNR mm4, mm0, 7, mm0
  1984. PALIGNR mm1, mm2, 1, mm2
  ; two lowpass passes give mm7 = filtered left column (l0..l7)
  1985. movq mm0, mm4
  1986. PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
  1987. movq mm4, mm0
  1988. movq mm7, mm2
  1989. PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
  1990. psllq mm1, 56
  1991. PALIGNR mm7, mm1, 7, mm3
  1992. lea r1, [r0+r3*2]
  ; byte-reverse the filtered column, then build shifted copies padded with
  ; the replicated last pixel l7
  1993. pshufw mm0, mm7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
  1994. psllq mm7, 56 ; l7 .. .. .. .. .. .. ..
  1995. movq mm2, mm0
  1996. psllw mm0, 8
  1997. psrlw mm2, 8
  1998. por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0
  1999. movq mm3, mm2
  2000. movq mm4, mm2
  2001. movq mm5, mm2
  2002. psrlq mm2, 8
  2003. psrlq mm3, 16
  2004. lea r2, [r1+r3*2]
  2005. por mm2, mm7 ; l7 l7 l6 l5 l4 l3 l2 l1
  2006. punpckhbw mm7, mm7
  2007. por mm3, mm7 ; l7 l7 l7 l6 l5 l4 l3 l2
  ; interleave average (p odd) and lowpass (p even) pairs into mm4/mm5
  2008. pavgb mm4, mm2
  2009. PRED4x4_LOWPASS mm1, mm3, mm5, mm2, mm6
  2010. movq mm5, mm4
  2011. punpcklbw mm4, mm1 ; p4 p3 p2 p1
  2012. punpckhbw mm5, mm1 ; p8 p7 p6 p5
  ; each following row shifts two bytes further; the tail rows replicate
  ; the final pixel with pshufw
  2013. movq mm6, mm5
  2014. movq mm7, mm5
  2015. movq mm0, mm5
  2016. PALIGNR mm5, mm4, 2, mm1
  2017. pshufw mm1, mm6, 11111001b
  2018. PALIGNR mm6, mm4, 4, mm2
  2019. pshufw mm2, mm7, 11111110b
  2020. PALIGNR mm7, mm4, 6, mm3
  2021. pshufw mm3, mm0, 11111111b
  2022. movq [r0+r3*1], mm4
  2023. movq [r0+r3*2], mm5
  2024. lea r0, [r2+r3*2]
  2025. movq [r1+r3*1], mm6
  2026. movq [r1+r3*2], mm7
  2027. movq [r2+r3*1], mm0
  2028. movq [r2+r3*2], mm1
  2029. movq [r0+r3*1], mm2
  2030. movq [r0+r3*2], mm3
  2031. RET
  2032. %endmacro
  2033. INIT_MMX mmxext
  2034. PRED8x8L_HORIZONTAL_UP
  2035. INIT_MMX ssse3
  2036. PRED8x8L_HORIZONTAL_UP
  2037. ;-----------------------------------------------------------------------------
  2038. ;void pred8x8l_horizontal_down_8(uint8_t *src, int has_topleft, int has_topright, int stride)
  2039. ;-----------------------------------------------------------------------------
  ; 8x8 luma horizontal-down prediction (MMX version): filters the left
  ; column and top row, interleaves pavgb/lowpass pairs of the left edge
  ; (punpcklbw/punpckhbw), and emits rows as 2-byte PALIGNR shifts between
  ; the interleaved pair stream and the filtered top diagonal.
  ; In: r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride.
  2040. INIT_MMX mmxext
  2041. cglobal pred8x8l_horizontal_down_8, 4,5
  2042. sub r0, r3 ; r0 -> row above the block
  ; pack the 8 left-column bytes into mm3 via punpckh* merges
  2043. lea r4, [r0+r3*2]
  2044. movq mm0, [r0+r3*1-8]
  2045. punpckhbw mm0, [r0+r3*0-8]
  2046. movq mm1, [r4+r3*1-8]
  2047. punpckhbw mm1, [r0+r3*2-8]
  2048. mov r4, r0
  2049. punpckhwd mm1, mm0
  2050. lea r0, [r0+r3*4]
  2051. movq mm2, [r0+r3*1-8]
  2052. punpckhbw mm2, [r0+r3*0-8]
  2053. lea r0, [r0+r3*2]
  2054. movq mm3, [r0+r3*1-8]
  2055. punpckhbw mm3, [r0+r3*0-8]
  2056. punpckhwd mm3, mm2
  2057. punpckhdq mm3, mm1
  2058. lea r0, [r0+r3*2]
  2059. movq mm0, [r0+r3*0-8]
  2060. movq mm1, [r4]
  2061. mov r0, r4
  2062. movq mm4, mm3
  2063. movq mm2, mm3
  2064. PALIGNR mm4, mm0, 7, mm0
  2065. PALIGNR mm1, mm2, 1, mm2
  2066. test r1, r1
  2067. jnz .do_left
  ; no top-left: patch the unavailable corner byte in the left-edge vector
  2068. .fix_lt_1:
  2069. movq mm5, mm3
  2070. pxor mm5, mm4
  2071. psrlq mm5, 56
  2072. psllq mm5, 48
  2073. pxor mm1, mm5
  2074. jmp .do_left
  ; no top-left: patch the corner byte of the shifted top row
  2075. .fix_lt_2:
  2076. movq mm5, mm3
  2077. pxor mm5, mm2
  2078. psllq mm5, 56
  2079. psrlq mm5, 56
  2080. pxor mm2, mm5
  2081. test r2, r2
  2082. jnz .do_top
  ; no top-right: replicate t7 into the high byte of mm1
  2083. .fix_tr_1:
  2084. movq mm5, mm3
  2085. pxor mm5, mm1
  2086. psrlq mm5, 56
  2087. psllq mm5, 56
  2088. pxor mm1, mm5
  2089. jmp .do_top
  ; mm7/mm6 = lowpass-filtered left edge (with and without corner shift)
  2090. .do_left:
  2091. movq mm0, mm4
  2092. PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
  2093. movq mm4, mm0
  2094. movq mm7, mm2
  2095. movq mm6, mm2
  2096. PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
  2097. psllq mm1, 56
  2098. PALIGNR mm7, mm1, 7, mm3
  ; prepare and filter the top row
  2099. movq mm0, [r0-8]
  2100. movq mm3, [r0]
  2101. movq mm1, [r0+8]
  2102. movq mm2, mm3
  2103. movq mm4, mm3
  2104. PALIGNR mm2, mm0, 7, mm0
  2105. PALIGNR mm1, mm4, 1, mm4
  2106. test r1, r1
  2107. jz .fix_lt_2
  2108. test r2, r2
  2109. jz .fix_tr_1
  2110. .do_top:
  2111. PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 ; filtered top row
  ; mm3/mm0 = interleaved pavgb/lowpass of the left edge; mm6 = lowpass of
  ; the shifted top diagonal used for the upper-right part of each row
  2112. movq mm5, mm4
  2113. lea r1, [r0+r3*2]
  2114. psllq mm7, 56
  2115. movq mm2, mm5
  2116. movq mm3, mm6
  2117. movq mm4, mm2
  2118. PALIGNR mm2, mm6, 7, mm5
  2119. PALIGNR mm6, mm7, 7, mm0
  2120. lea r2, [r1+r3*2]
  2121. PALIGNR mm4, mm3, 1, mm7
  2122. movq mm5, mm3
  2123. pavgb mm3, mm6
  2124. PRED4x4_LOWPASS mm0, mm4, mm6, mm5, mm7
  2125. movq mm4, mm2
  2126. movq mm1, mm2
  2127. lea r4, [r2+r3*2]
  2128. psrlq mm4, 16
  2129. psrlq mm1, 8
  2130. PRED4x4_LOWPASS mm6, mm4, mm2, mm1, mm5
  ; mm3 (low pairs) / mm7 (high pairs) = interleaved prediction stream;
  ; each row up is the stream advanced two bytes via PALIGNR
  2131. movq mm7, mm3
  2132. punpcklbw mm3, mm0
  2133. punpckhbw mm7, mm0
  2134. movq mm1, mm7
  2135. movq mm0, mm7
  2136. movq mm4, mm7
  2137. movq [r4+r3*2], mm3
  2138. PALIGNR mm7, mm3, 2, mm5
  2139. movq [r4+r3*1], mm7
  2140. PALIGNR mm1, mm3, 4, mm5
  2141. movq [r2+r3*2], mm1
  2142. PALIGNR mm0, mm3, 6, mm3
  2143. movq [r2+r3*1], mm0
  2144. movq mm2, mm6
  2145. movq mm3, mm6
  2146. movq [r1+r3*2], mm4
  2147. PALIGNR mm6, mm4, 2, mm5
  2148. movq [r1+r3*1], mm6
  2149. PALIGNR mm2, mm4, 4, mm5
  2150. movq [r0+r3*2], mm2
  2151. PALIGNR mm3, mm4, 6, mm4
  2152. movq [r0+r3*1], mm3
  2153. RET
  ; SSE2/SSSE3 horizontal-down prediction: edges are gathered and filtered
  ; with MMX, merged into xmm0/xmm1, then a single XMM-width pavgb+lowpass
  ; pass produces the interleaved prediction stream; rows are psrldq shifts.
  ; In: r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride.
  2154. %macro PRED8x8L_HORIZONTAL_DOWN 0
  2155. cglobal pred8x8l_horizontal_down_8, 4,5
  2156. sub r0, r3 ; r0 -> row above the block
  ; pack the 8 left-column bytes into mm3 via punpckh* merges
  2157. lea r4, [r0+r3*2]
  2158. movq mm0, [r0+r3*1-8]
  2159. punpckhbw mm0, [r0+r3*0-8]
  2160. movq mm1, [r4+r3*1-8]
  2161. punpckhbw mm1, [r0+r3*2-8]
  2162. mov r4, r0
  2163. punpckhwd mm1, mm0
  2164. lea r0, [r0+r3*4]
  2165. movq mm2, [r0+r3*1-8]
  2166. punpckhbw mm2, [r0+r3*0-8]
  2167. lea r0, [r0+r3*2]
  2168. movq mm3, [r0+r3*1-8]
  2169. punpckhbw mm3, [r0+r3*0-8]
  2170. punpckhwd mm3, mm2
  2171. punpckhdq mm3, mm1
  2172. lea r0, [r0+r3*2]
  2173. movq mm0, [r0+r3*0-8]
  2174. movq mm1, [r4]
  2175. mov r0, r4
  2176. movq mm4, mm3
  2177. movq mm2, mm3
  2178. PALIGNR mm4, mm0, 7, mm0
  2179. PALIGNR mm1, mm2, 1, mm2
  2180. test r1, r1
  2181. jnz .do_left
  ; no top-left: patch the unavailable corner byte in the left-edge vector
  2182. .fix_lt_1:
  2183. movq mm5, mm3
  2184. pxor mm5, mm4
  2185. psrlq mm5, 56
  2186. psllq mm5, 48
  2187. pxor mm1, mm5
  2188. jmp .do_left
  ; no top-left: patch the corner byte of the shifted top row
  2189. .fix_lt_2:
  2190. movq mm5, mm3
  2191. pxor mm5, mm2
  2192. psllq mm5, 56
  2193. psrlq mm5, 56
  2194. pxor mm2, mm5
  2195. test r2, r2
  2196. jnz .do_top
  ; no top-right: replicate t7 into the high byte of mm1
  2197. .fix_tr_1:
  2198. movq mm5, mm3
  2199. pxor mm5, mm1
  2200. psrlq mm5, 56
  2201. psllq mm5, 56
  2202. pxor mm1, mm5
  2203. jmp .do_top
  ; no top-right row: extend by replicating t7
  2204. .fix_tr_2:
  2205. punpckhbw mm3, mm3
  2206. pshufw mm1, mm3, 0xFF
  2207. jmp .do_topright
  ; xmm0 = filtered left edge packed into bytes 7..14 of the register
  2208. .do_left:
  2209. movq mm0, mm4
  2210. PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
  2211. movq2dq xmm0, mm2
  2212. pslldq xmm0, 8
  2213. movq mm4, mm0
  2214. PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
  2215. movq2dq xmm2, mm1
  2216. pslldq xmm2, 15
  2217. psrldq xmm2, 8
  2218. por xmm0, xmm2
  ; prepare and filter the top row
  2219. movq mm0, [r0-8]
  2220. movq mm3, [r0]
  2221. movq mm1, [r0+8]
  2222. movq mm2, mm3
  2223. movq mm4, mm3
  2224. PALIGNR mm2, mm0, 7, mm0
  2225. PALIGNR mm1, mm4, 1, mm4
  2226. test r1, r1
  2227. jz .fix_lt_2
  2228. test r2, r2
  2229. jz .fix_tr_1
  2230. .do_top:
  2231. PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
  2232. movq2dq xmm1, mm4
  2233. test r2, r2
  2234. jz .fix_tr_2
  ; filter the real top-right row into mm1
  2235. movq mm0, [r0+8]
  2236. movq mm5, mm0
  2237. movq mm2, mm0
  2238. movq mm4, mm0
  2239. psrlq mm5, 56
  2240. PALIGNR mm2, mm3, 7, mm3
  2241. PALIGNR mm5, mm4, 1, mm4
  2242. PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
  2243. .do_topright:
  ; xmm1 = (top-right | top); combine with xmm0 (left) via PALIGNR to get
  ; the three diagonals feeding pavgb (xmm4) and the lowpass (xmm0)
  2244. movq2dq xmm5, mm1
  2245. pslldq xmm5, 8
  2246. por xmm1, xmm5
  ; switch macro expansion to XMM width
  2247. INIT_XMM cpuname
  2248. lea r2, [r4+r3*2]
  2249. movdqa xmm2, xmm1
  2250. movdqa xmm3, xmm1
  2251. PALIGNR xmm1, xmm0, 7, xmm4
  2252. PALIGNR xmm2, xmm0, 9, xmm5
  2253. lea r1, [r2+r3*2]
  2254. PALIGNR xmm3, xmm0, 8, xmm0
  2255. movdqa xmm4, xmm1
  2256. pavgb xmm4, xmm3
  2257. lea r0, [r1+r3*2]
  2258. PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm5
  ; interleave avg/lowpass into the prediction stream; rows are written
  ; bottom-to-top, advancing two bytes (one pair) per row
  2259. punpcklbw xmm4, xmm0
  2260. movhlps xmm0, xmm4
  2261. movq [r0+r3*2], xmm4
  2262. movq [r2+r3*2], xmm0
  2263. psrldq xmm4, 2
  2264. psrldq xmm0, 2
  2265. movq [r0+r3*1], xmm4
  2266. movq [r2+r3*1], xmm0
  2267. psrldq xmm4, 2
  2268. psrldq xmm0, 2
  2269. movq [r1+r3*2], xmm4
  2270. movq [r4+r3*2], xmm0
  2271. psrldq xmm4, 2
  2272. psrldq xmm0, 2
  2273. movq [r1+r3*1], xmm4
  2274. movq [r4+r3*1], xmm0
  2275. RET
  2276. %endmacro
  2277. INIT_MMX sse2
  2278. PRED8x8L_HORIZONTAL_DOWN
  2279. INIT_MMX ssse3
  2280. PRED8x8L_HORIZONTAL_DOWN
  2281. ;-----------------------------------------------------------------------------
  2282. ; void pred4x4_dc_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
  2283. ;-----------------------------------------------------------------------------
  ; 4x4 DC prediction: dc = (sum of 4 top + 4 left pixels + 4) >> 3,
  ; replicated to every pixel of the block via a 0x01010101 multiply.
  ; In: r0 = src, r1 = topright (unused here), r2 = stride.
  2284. INIT_MMX mmxext
  2285. cglobal pred4x4_dc_8, 3,5
  2286. pxor mm7, mm7
  2287. mov r4, r0 ; keep original src for the first-row store
  2288. sub r0, r2 ; r0 -> row above the block
  2289. movd mm0, [r0]
  2290. psadbw mm0, mm7 ; sum of the 4 top bytes
  ; accumulate the 4 left-column bytes in r3d
  2291. movzx r1d, byte [r0+r2*1-1]
  2292. movd r3d, mm0
  2293. add r3d, r1d
  2294. movzx r1d, byte [r0+r2*2-1]
  2295. lea r0, [r0+r2*2]
  2296. add r3d, r1d
  2297. movzx r1d, byte [r0+r2*1-1]
  2298. add r3d, r1d
  2299. movzx r1d, byte [r0+r2*2-1]
  2300. add r3d, r1d
  2301. add r3d, 4
  2302. shr r3d, 3 ; rounded average of 8 samples
  2303. imul r3d, 0x01010101 ; broadcast dc to all 4 bytes
  2304. mov [r4+r2*0], r3d
  2305. mov [r0+r2*0], r3d
  2306. mov [r0+r2*1], r3d
  2307. mov [r0+r2*2], r3d
  2308. RET
  2309. ;-----------------------------------------------------------------------------
  2310. ; void pred4x4_tm_vp8_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
  2311. ;-----------------------------------------------------------------------------
  ; VP8 TrueMotion 4x4 prediction: pred[x][y] = top[x] + left[y] - topleft,
  ; saturated to 0..255 by packuswb. Processes two rows per loop iteration.
  ; In: r0 = src, r1 = topright (unused), r2 = stride.
  2312. %macro PRED4x4_TM 0
  2313. cglobal pred4x4_tm_vp8_8, 3,6
  2314. sub r0, r2 ; r0 -> row above the block
  2315. pxor mm7, mm7
  2316. movd mm0, [r0]
  2317. punpcklbw mm0, mm7 ; top row widened to words
  2318. movzx r4d, byte [r0-1] ; r4d = topleft pixel
  2319. mov r5d, 2 ; two iterations x two rows = 4 rows
  2320. .loop:
  ; per row: broadcast (left - topleft), add the top row, saturate-pack
  2321. movzx r1d, byte [r0+r2*1-1]
  2322. movzx r3d, byte [r0+r2*2-1]
  2323. sub r1d, r4d
  2324. sub r3d, r4d
  2325. movd mm2, r1d
  2326. movd mm4, r3d
  ; broadcast the 16-bit difference across the register; plain MMX lacks
  ; pshufw, so it falls back to punpck chains
  2327. %if cpuflag(mmxext)
  2328. pshufw mm2, mm2, 0
  2329. pshufw mm4, mm4, 0
  2330. %else
  2331. punpcklwd mm2, mm2
  2332. punpcklwd mm4, mm4
  2333. punpckldq mm2, mm2
  2334. punpckldq mm4, mm4
  2335. %endif
  2336. paddw mm2, mm0
  2337. paddw mm4, mm0
  2338. packuswb mm2, mm2 ; clamp to 0..255
  2339. packuswb mm4, mm4
  2340. movd [r0+r2*1], mm2
  2341. movd [r0+r2*2], mm4
  2342. lea r0, [r0+r2*2]
  2343. dec r5d
  2344. jg .loop
  2345. REP_RET
  2346. %endmacro
  2347. INIT_MMX mmx
  2348. PRED4x4_TM
  2349. INIT_MMX mmxext
  2350. PRED4x4_TM
  ; SSSE3 TrueMotion: tm_shuf (0x03,0x80 pairs) pshufb-broadcasts one byte
  ; of each dword into zero-extended words, letting all four rows be
  ; computed branch-free with no loop.
  ; In: r0 = src, r1 = topright (unused), r2 = stride.
  2351. INIT_XMM ssse3
  2352. cglobal pred4x4_tm_vp8_8, 3,3
  2353. sub r0, r2 ; r0 -> row above the block
  2354. movq mm6, [tm_shuf]
  2355. pxor mm1, mm1
  2356. movd mm0, [r0]
  2357. punpcklbw mm0, mm1 ; top row widened to words
  2358. movd mm7, [r0-4]
  2359. pshufb mm7, mm6 ; broadcast topleft as words
  2360. lea r1, [r0+r2*2]
  ; load the left pixel of each of the 4 rows and broadcast each
  2361. movd mm2, [r0+r2*1-4]
  2362. movd mm3, [r0+r2*2-4]
  2363. movd mm4, [r1+r2*1-4]
  2364. movd mm5, [r1+r2*2-4]
  2365. pshufb mm2, mm6
  2366. pshufb mm3, mm6
  2367. pshufb mm4, mm6
  2368. pshufb mm5, mm6
  ; pred = top + (left - topleft), saturated on pack
  2369. psubw mm2, mm7
  2370. psubw mm3, mm7
  2371. psubw mm4, mm7
  2372. psubw mm5, mm7
  2373. paddw mm2, mm0
  2374. paddw mm3, mm0
  2375. paddw mm4, mm0
  2376. paddw mm5, mm0
  2377. packuswb mm2, mm2
  2378. packuswb mm3, mm3
  2379. packuswb mm4, mm4
  2380. packuswb mm5, mm5
  2381. movd [r0+r2*1], mm2
  2382. movd [r0+r2*2], mm3
  2383. movd [r1+r2*1], mm4
  2384. movd [r1+r2*2], mm5
  2385. RET
  2386. ;-----------------------------------------------------------------------------
  2387. ; void pred4x4_vertical_vp8_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
  2388. ;-----------------------------------------------------------------------------
  2389. INIT_MMX mmxext
  ; VP8 vertical prediction: every row of the 4x4 block gets a filtered
  ; copy of the row above.  PRED4x4_LOWPASS is defined elsewhere in this
  ; file; presumably the usual 3-tap (a + 2*b + c + 2) >> 2 filter.
  ; In: r0 = src, r1 = topright, r2 = stride.
  2390. cglobal pred4x4_vertical_vp8_8, 3,3
  2391. sub r0, r2                         ; r0 -> top row
  2392. movd m1, [r0-1]                    ; load starting at topleft: lt t0 t1 t2
  2393. movd m0, [r0]
  2394. mova m2, m0 ;t0 t1 t2 t3
  2395. punpckldq m0, [r1] ;t0 t1 t2 t3 t4 t5 t6 t7
  2396. lea r1, [r0+r2*2]                  ; r1 now repurposed as row pointer
  2397. psrlq m0, 8 ;t1 t2 t3 t4
  2398. PRED4x4_LOWPASS m3, m1, m0, m2, m4 ; m3 = lowpass-filtered top row
  2399. movd [r0+r2*1], m3                 ; same 4 bytes into all 4 rows
  2400. movd [r0+r2*2], m3
  2401. movd [r1+r2*1], m3
  2402. movd [r1+r2*2], m3
  2403. RET
  2404. ;-----------------------------------------------------------------------------
  2405. ; void pred4x4_down_left_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
  2406. ;-----------------------------------------------------------------------------
  2407. INIT_MMX mmxext
  ; H.264 down-left prediction: diagonals running down and to the left,
  ; derived from the lowpass-filtered top + topright edge; each row is the
  ; previous row shifted one pixel along the edge.
  ; In: r0 = src, r1 = topright, r2 = stride.
  2408. cglobal pred4x4_down_left_8, 3,3
  2409. sub r0, r2                         ; r0 -> top row
  2410. movq m1, [r0]                      ; low dword = t0 t1 t2 t3
  2411. punpckldq m1, [r1]                 ; m1 = t0..t7 (top + topright)
  2412. movq m2, m1
  2413. movq m3, m1
  2414. psllq m1, 8                        ; m1 = .. t0..t6 (left neighbours)
  2415. pxor m2, m1                        ; xor/shift trick: m2 becomes t1..t7,t7
  2416. psrlq m2, 8                        ; i.e. the right neighbours with the
  2417. pxor m2, m3                        ; last topright pixel duplicated
  2418. PRED4x4_LOWPASS m0, m1, m2, m3, m4 ; 3-tap lowpass along the edge
  2419. lea r1, [r0+r2*2]
  2420. psrlq m0, 8                        ; shift one pixel per row
  2421. movd [r0+r2*1], m0
  2422. psrlq m0, 8
  2423. movd [r0+r2*2], m0
  2424. psrlq m0, 8
  2425. movd [r1+r2*1], m0
  2426. psrlq m0, 8
  2427. movd [r1+r2*2], m0
  2428. RET
  2429. ;-----------------------------------------------------------------------------
  2430. ; void pred4x4_vertical_left_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
  2431. ;-----------------------------------------------------------------------------
  2432. INIT_MMX mmxext
  ; H.264 vertical-left prediction: rows 0/2 come from rounded averages of
  ; adjacent top pixels (pavgb), rows 1/3 from the 3-tap lowpass; rows 2/3
  ; are rows 0/1 shifted right by one pixel.
  ; In: r0 = src, r1 = topright, r2 = stride.
  2433. cglobal pred4x4_vertical_left_8, 3,3
  2434. sub r0, r2                         ; r0 -> top row
  2435. movq m1, [r0]
  2436. punpckldq m1, [r1]                 ; m1 = t0..t7 (top + topright)
  2437. movq m3, m1
  2438. movq m2, m1
  2439. psrlq m3, 8                        ; m3 = t1.. (right neighbour)
  2440. psrlq m2, 16                       ; m2 = t2.. (2nd right neighbour)
  2441. movq m4, m3
  2442. pavgb m4, m1                       ; m4 = avg(tN, tN+1), rounded
  2443. PRED4x4_LOWPASS m0, m1, m2, m3, m5 ; m0 = 3-tap lowpass of the edge
  2444. lea r1, [r0+r2*2]
  2445. movh [r0+r2*1], m4                 ; row 0: averages
  2446. movh [r0+r2*2], m0                 ; row 1: lowpass values
  2447. psrlq m4, 8                        ; rows 2/3: same values, one pixel on
  2448. psrlq m0, 8
  2449. movh [r1+r2*1], m4
  2450. movh [r1+r2*2], m0
  2451. RET
  2452. ;-----------------------------------------------------------------------------
  2453. ; void pred4x4_horizontal_up_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
  2454. ;-----------------------------------------------------------------------------
  2455. INIT_MMX mmxext
  ; H.264 horizontal-up prediction.  Gathers the 4 left-edge pixels
  ; l0..l3 into a register, replicates l3 past the end, then interleaves
  ; pavgb pairs with 3-tap lowpass values; each output row is the
  ; previous one shifted by one (avg,lowpass) pair, the last row is all l3.
  ; In: r0 = src, r2 = stride (r1 = topright, unused by this mode).
  2456. cglobal pred4x4_horizontal_up_8, 3,3
  2457. sub r0, r2                         ; r0 -> top row
  2458. lea r1, [r0+r2*2]
  2459. movd m0, [r0+r2*1-4]               ; byte 3 = l0 (left pixel of row 0)
  2460. punpcklbw m0, [r0+r2*2-4]          ; interleave with row 1 -> word 3 = l0 l1
  2461. movd m1, [r1+r2*1-4]               ; l2
  2462. punpcklbw m1, [r1+r2*2-4]          ; word 3 = l2 l3
  2463. punpckhwd m0, m1                   ; high dword of m0 = l0 l1 l2 l3
  2464. movq m1, m0
  2465. punpckhbw m1, m1
  2466. pshufw m1, m1, 0xFF                ; m1 = l3 in every byte
  2467. punpckhdq m0, m1                   ; m0 = l0 l1 l2 l3 l3 l3 l3 l3
  2468. movq m2, m0
  2469. movq m3, m0
  2470. movq m7, m0
  2471. psrlq m2, 16                       ; m2 = lN+2.. (2nd neighbour below)
  2472. psrlq m3, 8                        ; m3 = lN+1.. (neighbour below)
  2473. pavgb m7, m3                       ; m7 = avg(lN, lN+1), rounded
  2474. PRED4x4_LOWPASS m4, m0, m2, m3, m5 ; m4 = 3-tap lowpass down the left edge
  2475. punpcklbw m7, m4                   ; interleave avg/lowpass -> row pattern
  2476. movd [r0+r2*1], m7
  2477. psrlq m7, 16                       ; advance one (avg,lowpass) pair per row
  2478. movd [r0+r2*2], m7
  2479. psrlq m7, 16
  2480. movd [r1+r2*1], m7
  2481. movd [r1+r2*2], m1                 ; bottom row: l3 replicated
  2482. RET
  2483. ;-----------------------------------------------------------------------------
  2484. ; void pred4x4_horizontal_down_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
  2485. ;-----------------------------------------------------------------------------
  2486. INIT_MMX mmxext
  ; H.264 horizontal-down prediction.  Builds the full edge vector
  ; t2 t1 t0 lt l0 l1 l2 l3 (MSB..LSB), then forms rounded averages and
  ; 3-tap lowpass values of it; rows are emitted bottom-up, each offset
  ; by one (avg,lowpass) pair, with the top row patched in via PALIGNR.
  ; In: r0 = src, r2 = stride (r1 = topright, unused by this mode).
  2487. cglobal pred4x4_horizontal_down_8, 3,3
  2488. sub r0, r2                         ; r0 -> top row
  2489. lea r1, [r0+r2*2]
  2490. movh m0, [r0-4] ; lt ..
  2491. punpckldq m0, [r0] ; t3 t2 t1 t0 lt .. .. ..
  2492. psllq m0, 8 ; t2 t1 t0 lt .. .. .. ..
  2493. movd m1, [r1+r2*2-4] ; l3
  2494. punpcklbw m1, [r1+r2*1-4] ; l2 l3
  2495. movd m2, [r0+r2*2-4] ; l1
  2496. punpcklbw m2, [r0+r2*1-4] ; l0 l1
  2497. punpckhwd m1, m2 ; l0 l1 l2 l3
  2498. punpckhdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
  2499. movq m0, m1
  2500. movq m2, m1
  2501. movq m5, m1
  2502. psrlq m0, 16 ; .. .. t2 t1 t0 lt l0 l1
  2503. psrlq m2, 8 ; .. t2 t1 t0 lt l0 l1 l2
  2504. pavgb m5, m2                       ; rounded averages of adjacent edge pixels
  2505. PRED4x4_LOWPASS m3, m1, m0, m2, m4 ; 3-tap lowpass along the edge
  2506. punpcklbw m5, m3                   ; interleave avg/lowpass pairs
  2507. psrlq m3, 32
  2508. PALIGNR m3, m5, 6, m4              ; assemble the top row from both halves
  2509. movh [r1+r2*2], m5                 ; emit bottom-up, 2 bytes offset per row
  2510. psrlq m5, 16
  2511. movh [r1+r2*1], m5
  2512. psrlq m5, 16
  2513. movh [r0+r2*2], m5
  2514. movh [r0+r2*1], m3                 ; top row
  2515. RET
  2516. ;-----------------------------------------------------------------------------
  2517. ; void pred4x4_vertical_right_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
  2518. ;-----------------------------------------------------------------------------
  2519. INIT_MMX mmxext
  ; H.264 vertical-right prediction.  Shifts the left column into the top
  ; edge one pixel at a time (each PALIGNR appends the last byte of a row);
  ; rows 0/2 come from pavgb, rows 1/3 from the 3-tap lowpass, with the
  ; left-column values prepended for the lower two rows.
  ; In: r0 = src, r2 = stride (r1 = topright, unused by this mode).
  2520. cglobal pred4x4_vertical_right_8, 3,3
  2521. sub r0, r2                         ; r0 -> top row
  2522. lea r1, [r0+r2*2]
  2523. movh m0, [r0] ; ........t3t2t1t0
  2524. movq m5, m0
  2525. PALIGNR m0, [r0-8], 7, m1 ; ......t3t2t1t0lt
  2526. pavgb m5, m0                       ; m5 = avg(top, top+topleft): row 0
  2527. PALIGNR m0, [r0+r2*1-8], 7, m1 ; ....t3t2t1t0ltl0
  2528. movq m1, m0
  2529. PALIGNR m0, [r0+r2*2-8], 7, m2 ; ..t3t2t1t0ltl0l1
  2530. movq m2, m0
  2531. PALIGNR m0, [r1+r2*1-8], 7, m3 ; t3t2t1t0ltl0l1l2
  2532. PRED4x4_LOWPASS m3, m1, m0, m2, m4 ; 3-tap lowpass of the whole edge
  2533. movq m1, m3
  2534. psrlq m3, 16                       ; m3 = odd-row (lowpass) values
  2535. psllq m1, 48                       ; m1 keeps the two left-column bytes
  2536. movh [r0+r2*1], m5                 ; row 0: averages
  2537. movh [r0+r2*2], m3                 ; row 1: lowpass
  2538. PALIGNR m5, m1, 7, m2              ; prepend a left value for row 2
  2539. psllq m1, 8
  2540. movh [r1+r2*1], m5
  2541. PALIGNR m3, m1, 7, m1              ; prepend a left value for row 3
  2542. movh [r1+r2*2], m3
  2543. RET
  2544. ;-----------------------------------------------------------------------------
  2545. ; void pred4x4_down_right_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
  2546. ;-----------------------------------------------------------------------------
  2547. INIT_MMX mmxext
  ; H.264 down-right prediction.  Assembles the edge l3 l2 l1 l0 lt t0 t1 t2
  ; (LSB first) in one register, lowpass-filters it, and writes it out
  ; bottom-up, shifting one pixel per row (diagonals running down-right).
  ; In: r0 = src, r2 = stride (r1 = topright arg, repurposed as row pointer).
  2548. cglobal pred4x4_down_right_8, 3,3
  2549. sub r0, r2                         ; r0 -> top row
  2550. lea r1, [r0+r2*2]
  2551. movq m1, [r1-8]                    ; byte 7 = l1 (left pixel of row 1)
  2552. movq m2, [r0+r2*1-8]               ; byte 7 = l0
  2553. punpckhbw m2, [r0-8]               ; high word of m2 = l0 lt
  2554. movh m3, [r0]                      ; t0..t3
  2555. punpckhwd m1, m2                   ; high dword of m1 = .. l1 l0 lt
  2556. PALIGNR m3, m1, 5, m1              ; m3 = l1 l0 lt t0 t1 t2 t3 ..
  2557. movq m1, m3
  2558. PALIGNR m3, [r1+r2*1-8], 7, m4     ; prepend l2
  2559. movq m2, m3
  2560. PALIGNR m3, [r1+r2*2-8], 7, m4     ; prepend l3: l3 l2 l1 l0 lt t0 t1 t2
  2561. PRED4x4_LOWPASS m0, m3, m1, m2, m4 ; 3-tap lowpass along the edge
  2562. movh [r1+r2*2], m0                 ; bottom row
  2563. psrlq m0, 8                        ; each row up shifts one pixel
  2564. movh [r1+r2*1], m0
  2565. psrlq m0, 8
  2566. movh [r0+r2*2], m0
  2567. psrlq m0, 8
  2568. movh [r0+r2*1], m0
  2569. RET