;******************************************************************************
;* H.264 intra prediction asm optimizations
;* Copyright (c) 2010 Jason Garrett-Glaser
;* Copyright (c) 2010 Holger Lubitz
;* Copyright (c) 2010 Loren Merritt
;* Copyright (c) 2010 Ronald S. Bultje
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

tm_shuf:     times 8 db 0x03, 0x80
pw_ff00:     times 8 dw 0xff00
plane_shuf:  db -8, -7, -6, -5, -4, -3, -2, -1
             db  1,  2,  3,  4,  5,  6,  7,  8
plane8_shuf: db -4, -3, -2, -1,  0,  0,  0,  0
             db  1,  2,  3,  4,  0,  0,  0,  0
pw_0to7:     dw 0, 1, 2, 3, 4, 5, 6, 7
pw_1to8:     dw 1, 2, 3, 4, 5, 6, 7, 8
pw_m8tom1:   dw -8, -7, -6, -5, -4, -3, -2, -1
pw_m4to4:    dw -4, -3, -2, -1,  1,  2,  3,  4

SECTION .text

cextern pb_1
cextern pb_3
cextern pw_4
cextern pw_5
cextern pw_8
cextern pw_16
cextern pw_17
cextern pw_32

;-----------------------------------------------------------------------------
; void pred16x16_vertical(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
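; Fills the 16x16 block with the row of pixels directly above it:
; every output row is a copy of src[-stride..-stride+15].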
INIT_MMX mmx
cglobal pred16x16_vertical_8, 2,3
    sub r0, r1
    mov r2, 8
    movq mm0, [r0+0]
    movq mm1, [r0+8]
.loop:
    movq [r0+r1*1+0], mm0
    movq [r0+r1*1+8], mm1
    movq [r0+r1*2+0], mm0
    movq [r0+r1*2+8], mm1
    lea r0, [r0+r1*2]
    dec r2
    jg .loop
    REP_RET

INIT_XMM sse
cglobal pred16x16_vertical_8, 2,3
    sub r0, r1
    mov r2, 4
    movaps xmm0, [r0]
.loop:
    movaps [r0+r1*1], xmm0
    movaps [r0+r1*2], xmm0
    lea r0, [r0+r1*2]
    movaps [r0+r1*1], xmm0
    movaps [r0+r1*2], xmm0
    lea r0, [r0+r1*2]
    dec r2
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void pred16x16_horizontal(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
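; Replicates each row's left neighbour src[y*stride-1] across the whole
; row, two rows per iteration; SSSE3 broadcasts the byte with pshufb.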
%macro PRED16x16_H 0
cglobal pred16x16_horizontal_8, 2,3
    mov r2, 8
%if cpuflag(ssse3)
    mova m2, [pb_3]
%endif
.loop:
    movd m0, [r0+r1*0-4]
    movd m1, [r0+r1*1-4]
%if cpuflag(ssse3)
    pshufb m0, m2
    pshufb m1, m2
%else
    punpcklbw m0, m0
    punpcklbw m1, m1
    SPLATW m0, m0, 3
    SPLATW m1, m1, 3
    mova [r0+r1*0+8], m0
    mova [r0+r1*1+8], m1
%endif
    mova [r0+r1*0], m0
    mova [r0+r1*1], m1
    lea r0, [r0+r1*2]
    dec r2
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmx
PRED16x16_H
INIT_MMX mmx2
PRED16x16_H
INIT_XMM ssse3
PRED16x16_H

;-----------------------------------------------------------------------------
; void pred16x16_dc(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
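; DC mode: dc = (sum of 16 top + 16 left neighbours + 16) >> 5, computed
; with psadbw for the top row and scalar adds for the left column, then
; broadcast to all 256 pixels.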
%macro PRED16x16_DC 0
cglobal pred16x16_dc_8, 2,7
    mov r4, r0
    sub r0, r1
    pxor mm0, mm0
    pxor mm1, mm1
    psadbw mm0, [r0+0]
    psadbw mm1, [r0+8]
    dec r0
    movzx r5d, byte [r0+r1*1]
    paddw mm0, mm1
    movd r6d, mm0
    lea r0, [r0+r1*2]
%rep 7
    movzx r2d, byte [r0+r1*0]
    movzx r3d, byte [r0+r1*1]
    add r5d, r2d
    add r6d, r3d
    lea r0, [r0+r1*2]
%endrep
    movzx r2d, byte [r0+r1*0]
    add r5d, r6d
    lea r2d, [r2+r5+16]
    shr r2d, 5
%if cpuflag(ssse3)
    pxor m1, m1
%endif
    SPLATB_REG m0, r2, m1
%if mmsize==8
    mov r3d, 8
.loop:
    mova [r4+r1*0+0], m0
    mova [r4+r1*0+8], m0
    mova [r4+r1*1+0], m0
    mova [r4+r1*1+8], m0
%else
    mov r3d, 4
.loop:
    mova [r4+r1*0], m0
    mova [r4+r1*1], m0
    lea r4, [r4+r1*2]
    mova [r4+r1*0], m0
    mova [r4+r1*1], m0
%endif
    lea r4, [r4+r1*2]
    dec r3d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmx2
PRED16x16_DC
INIT_XMM sse2
PRED16x16_DC
INIT_XMM ssse3
PRED16x16_DC

;-----------------------------------------------------------------------------
; void pred16x16_tm_vp8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
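; VP8 TrueMotion: pred[y][x] = clip(top[x] + left[y] - topleft). The top
; row is kept as words; each row adds the broadcast (left[y] - topleft)
; and packs with unsigned saturation.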
%macro PRED16x16_TM 0
cglobal pred16x16_tm_vp8_8, 2,5
    sub r0, r1
    pxor mm7, mm7
    movq mm0, [r0+0]
    movq mm2, [r0+8]
    movq mm1, mm0
    movq mm3, mm2
    punpcklbw mm0, mm7
    punpckhbw mm1, mm7
    punpcklbw mm2, mm7
    punpckhbw mm3, mm7
    movzx r3d, byte [r0-1]
    mov r4d, 16
.loop:
    movzx r2d, byte [r0+r1-1]
    sub r2d, r3d
    movd mm4, r2d
    SPLATW mm4, mm4, 0
    movq mm5, mm4
    movq mm6, mm4
    movq mm7, mm4
    paddw mm4, mm0
    paddw mm5, mm1
    paddw mm6, mm2
    paddw mm7, mm3
    packuswb mm4, mm5
    packuswb mm6, mm7
    movq [r0+r1+0], mm4
    movq [r0+r1+8], mm6
    add r0, r1
    dec r4d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmx
PRED16x16_TM
INIT_MMX mmx2
PRED16x16_TM

INIT_XMM sse2
cglobal pred16x16_tm_vp8_8, 2,6,6
    sub r0, r1
    pxor xmm2, xmm2
    movdqa xmm0, [r0]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm2
    punpckhbw xmm1, xmm2
    movzx r4d, byte [r0-1]
    mov r5d, 8
.loop:
    movzx r2d, byte [r0+r1*1-1]
    movzx r3d, byte [r0+r1*2-1]
    sub r2d, r4d
    sub r3d, r4d
    movd xmm2, r2d
    movd xmm4, r3d
    pshuflw xmm2, xmm2, 0
    pshuflw xmm4, xmm4, 0
    punpcklqdq xmm2, xmm2
    punpcklqdq xmm4, xmm4
    movdqa xmm3, xmm2
    movdqa xmm5, xmm4
    paddw xmm2, xmm0
    paddw xmm3, xmm1
    paddw xmm4, xmm0
    paddw xmm5, xmm1
    packuswb xmm2, xmm3
    packuswb xmm4, xmm5
    movdqa [r0+r1*1], xmm2
    movdqa [r0+r1*2], xmm4
    lea r0, [r0+r1*2]
    dec r5d
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void pred16x16_plane(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
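; Plane mode: pred[y][x] = clip((a + b*(x-7) + c*(y-7) + 16) >> 5), with
; b and c derived from the weighted edge sums H and V; the %1 parameter
; selects the h264/rv40/svq3 rounding of b and c.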
%macro H264_PRED16x16_PLANE 1
cglobal pred16x16_plane_%1_8, 2,9,7
    mov r2, r1 ; +stride
    neg r1 ; -stride
    movh m0, [r0+r1 -1]
%if mmsize == 8
    pxor m4, m4
    movh m1, [r0+r1 +3 ]
    movh m2, [r0+r1 +8 ]
    movh m3, [r0+r1 +12]
    punpcklbw m0, m4
    punpcklbw m1, m4
    punpcklbw m2, m4
    punpcklbw m3, m4
    pmullw m0, [pw_m8tom1 ]
    pmullw m1, [pw_m8tom1+8]
    pmullw m2, [pw_1to8 ]
    pmullw m3, [pw_1to8 +8]
    paddw m0, m2
    paddw m1, m3
%else ; mmsize == 16
%if cpuflag(ssse3)
    movhps m0, [r0+r1 +8]
    pmaddubsw m0, [plane_shuf] ; H coefficients
%else ; sse2
    pxor m2, m2
    movh m1, [r0+r1 +8]
    punpcklbw m0, m2
    punpcklbw m1, m2
    pmullw m0, [pw_m8tom1]
    pmullw m1, [pw_1to8]
    paddw m0, m1
%endif
    movhlps m1, m0
%endif
    paddw m0, m1
%if cpuflag(mmx2)
    PSHUFLW m1, m0, 0xE
%elif cpuflag(mmx)
    mova m1, m0
    psrlq m1, 32
%endif
    paddw m0, m1
%if cpuflag(mmx2)
    PSHUFLW m1, m0, 0x1
%elif cpuflag(mmx)
    mova m1, m0
    psrlq m1, 16
%endif
    paddw m0, m1 ; sum of H coefficients
    lea r4, [r0+r2*8-1]
    lea r3, [r0+r2*4-1]
    add r4, r2
%if ARCH_X86_64
%define e_reg r8
%else
%define e_reg r0
%endif
    movzx e_reg, byte [r3+r2*2 ]
    movzx r5, byte [r4+r1 ]
    sub r5, e_reg
    movzx e_reg, byte [r3+r2 ]
    movzx r6, byte [r4 ]
    sub r6, e_reg
    lea r5, [r5+r6*2]
    movzx e_reg, byte [r3+r1 ]
    movzx r6, byte [r4+r2*2 ]
    sub r6, e_reg
    lea r5, [r5+r6*4]
    movzx e_reg, byte [r3 ]
%if ARCH_X86_64
    movzx r7, byte [r4+r2 ]
    sub r7, e_reg
%else
    movzx r6, byte [r4+r2 ]
    sub r6, e_reg
    lea r5, [r5+r6*4]
    sub r5, r6
%endif
    lea e_reg, [r3+r1*4]
    lea r3, [r4+r2*4]
    movzx r4, byte [e_reg+r2 ]
    movzx r6, byte [r3 ]
    sub r6, r4
%if ARCH_X86_64
    lea r6, [r7+r6*2]
    lea r5, [r5+r6*2]
    add r5, r6
%else
    lea r5, [r5+r6*4]
    lea r5, [r5+r6*2]
%endif
    movzx r4, byte [e_reg ]
%if ARCH_X86_64
    movzx r7, byte [r3 +r2 ]
    sub r7, r4
    sub r5, r7
%else
    movzx r6, byte [r3 +r2 ]
    sub r6, r4
    lea r5, [r5+r6*8]
    sub r5, r6
%endif
    movzx r4, byte [e_reg+r1 ]
    movzx r6, byte [r3 +r2*2]
    sub r6, r4
%if ARCH_X86_64
    add r6, r7
%endif
    lea r5, [r5+r6*8]
    movzx r4, byte [e_reg+r2*2]
    movzx r6, byte [r3 +r1 ]
    sub r6, r4
    lea r5, [r5+r6*4]
    add r5, r6 ; sum of V coefficients
%if ARCH_X86_64 == 0
    mov r0, r0m
%endif
%ifidn %1, h264
    lea r5, [r5*5+32]
    sar r5, 6
%elifidn %1, rv40
    lea r5, [r5*5]
    sar r5, 6
%elifidn %1, svq3
    test r5, r5
    lea r6, [r5+3]
    cmovs r5, r6
    sar r5, 2 ; V/4
    lea r5, [r5*5] ; 5*(V/4)
    test r5, r5
    lea r6, [r5+15]
    cmovs r5, r6
    sar r5, 4 ; (5*(V/4))/16
%endif
    movzx r4, byte [r0+r1 +15]
    movzx r3, byte [r3+r2*2 ]
    lea r3, [r3+r4+1]
    shl r3, 4
    movd r1d, m0
    movsx r1d, r1w
%ifnidn %1, svq3
%ifidn %1, h264
    lea r1d, [r1d*5+32]
%else ; rv40
    lea r1d, [r1d*5]
%endif
    sar r1d, 6
%else ; svq3
    test r1d, r1d
    lea r4d, [r1d+3]
    cmovs r1d, r4d
    sar r1d, 2 ; H/4
    lea r1d, [r1d*5] ; 5*(H/4)
    test r1d, r1d
    lea r4d, [r1d+15]
    cmovs r1d, r4d
    sar r1d, 4 ; (5*(H/4))/16
%endif
    movd m0, r1d
    add r1d, r5d
    add r3d, r1d
    shl r1d, 3
    sub r3d, r1d ; a
    movd m1, r5d
    movd m3, r3d
    SPLATW m0, m0, 0 ; H
    SPLATW m1, m1, 0 ; V
    SPLATW m3, m3, 0 ; a
%ifidn %1, svq3
    SWAP 0, 1
%endif
    mova m2, m0
%if mmsize == 8
    mova m5, m0
%endif
    pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
%if mmsize == 16
    psllw m2, 3
%else
    psllw m5, 3
    psllw m2, 2
    mova m6, m5
    paddw m6, m2
%endif
    paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
    paddw m2, m0 ; a + {8,9,10,11,12,13,14,15}*H
%if mmsize == 8
    paddw m5, m0 ; a + {8,9,10,11}*H
    paddw m6, m0 ; a + {12,13,14,15}*H
%endif
    mov r4, 8
.loop:
    mova m3, m0 ; b[0..7]
    mova m4, m2 ; b[8..15]
    psraw m3, 5
    psraw m4, 5
    packuswb m3, m4
    mova [r0], m3
%if mmsize == 8
    mova m3, m5 ; b[8..11]
    mova m4, m6 ; b[12..15]
    psraw m3, 5
    psraw m4, 5
    packuswb m3, m4
    mova [r0+8], m3
%endif
    paddw m0, m1
    paddw m2, m1
%if mmsize == 8
    paddw m5, m1
    paddw m6, m1
%endif
    mova m3, m0 ; b[0..7]
    mova m4, m2 ; b[8..15]
    psraw m3, 5
    psraw m4, 5
    packuswb m3, m4
    mova [r0+r2], m3
%if mmsize == 8
    mova m3, m5 ; b[8..11]
    mova m4, m6 ; b[12..15]
    psraw m3, 5
    psraw m4, 5
    packuswb m3, m4
    mova [r0+r2+8], m3
%endif
    paddw m0, m1
    paddw m2, m1
%if mmsize == 8
    paddw m5, m1
    paddw m6, m1
%endif
    lea r0, [r0+r2*2]
    dec r4
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmx
H264_PRED16x16_PLANE h264
H264_PRED16x16_PLANE rv40
H264_PRED16x16_PLANE svq3
INIT_MMX mmx2
H264_PRED16x16_PLANE h264
H264_PRED16x16_PLANE rv40
H264_PRED16x16_PLANE svq3
INIT_XMM sse2
H264_PRED16x16_PLANE h264
H264_PRED16x16_PLANE rv40
H264_PRED16x16_PLANE svq3
INIT_XMM ssse3
H264_PRED16x16_PLANE h264
H264_PRED16x16_PLANE rv40
H264_PRED16x16_PLANE svq3

;-----------------------------------------------------------------------------
; void pred8x8_plane(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
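; 8x8 chroma analogue of the plane predictor: b = (17*H+16) >> 5,
; c = (17*V+16) >> 5, with the +16 output rounding folded into a
; (the lea ...+1 / shl 4 below).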
%macro H264_PRED8x8_PLANE 0
cglobal pred8x8_plane_8, 2,9,7
    mov r2, r1 ; +stride
    neg r1 ; -stride
    movd m0, [r0+r1 -1]
%if mmsize == 8
    pxor m2, m2
    movh m1, [r0+r1 +4 ]
    punpcklbw m0, m2
    punpcklbw m1, m2
    pmullw m0, [pw_m4to4]
    pmullw m1, [pw_m4to4+8]
%else ; mmsize == 16
%if cpuflag(ssse3)
    movhps m0, [r0+r1 +4] ; this reads 4 bytes more than necessary
    pmaddubsw m0, [plane8_shuf] ; H coefficients
%else ; sse2
    pxor m2, m2
    movd m1, [r0+r1 +4]
    punpckldq m0, m1
    punpcklbw m0, m2
    pmullw m0, [pw_m4to4]
%endif
    movhlps m1, m0
%endif
    paddw m0, m1
%if notcpuflag(ssse3)
%if cpuflag(mmx2)
    PSHUFLW m1, m0, 0xE
%elif cpuflag(mmx)
    mova m1, m0
    psrlq m1, 32
%endif
    paddw m0, m1
%endif ; !ssse3
%if cpuflag(mmx2)
    PSHUFLW m1, m0, 0x1
%elif cpuflag(mmx)
    mova m1, m0
    psrlq m1, 16
%endif
    paddw m0, m1 ; sum of H coefficients
    lea r4, [r0+r2*4-1]
    lea r3, [r0 -1]
    add r4, r2
%if ARCH_X86_64
%define e_reg r8
%else
%define e_reg r0
%endif
    movzx e_reg, byte [r3+r2*2 ]
    movzx r5, byte [r4+r1 ]
    sub r5, e_reg
    movzx e_reg, byte [r3 ]
%if ARCH_X86_64
    movzx r7, byte [r4+r2 ]
    sub r7, e_reg
    sub r5, r7
%else
    movzx r6, byte [r4+r2 ]
    sub r6, e_reg
    lea r5, [r5+r6*4]
    sub r5, r6
%endif
    movzx e_reg, byte [r3+r1 ]
    movzx r6, byte [r4+r2*2 ]
    sub r6, e_reg
%if ARCH_X86_64
    add r6, r7
%endif
    lea r5, [r5+r6*4]
    movzx e_reg, byte [r3+r2 ]
    movzx r6, byte [r4 ]
    sub r6, e_reg
    lea r6, [r5+r6*2]
    lea r5, [r6*9+16]
    lea r5, [r5+r6*8]
    sar r5, 5
%if ARCH_X86_64 == 0
    mov r0, r0m
%endif
    movzx r3, byte [r4+r2*2 ]
    movzx r4, byte [r0+r1 +7]
    lea r3, [r3+r4+1]
    shl r3, 4
    movd r1d, m0
    movsx r1d, r1w
    imul r1d, 17
    add r1d, 16
    sar r1d, 5
    movd m0, r1d
    add r1d, r5d
    sub r3d, r1d
    add r1d, r1d
    sub r3d, r1d ; a
    movd m1, r5d
    movd m3, r3d
    SPLATW m0, m0, 0 ; H
    SPLATW m1, m1, 0 ; V
    SPLATW m3, m3, 0 ; a
%if mmsize == 8
    mova m2, m0
%endif
    pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
    paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
%if mmsize == 8
    psllw m2, 2
    paddw m2, m0 ; a + {4,5,6,7}*H
%endif
    mov r4, 4
    ALIGN 16
.loop:
%if mmsize == 16
    mova m3, m0 ; b[0..7]
    paddw m0, m1
    psraw m3, 5
    mova m4, m0 ; V+b[0..7]
    paddw m0, m1
    psraw m4, 5
    packuswb m3, m4
    movh [r0], m3
    movhps [r0+r2], m3
%else ; mmsize == 8
    mova m3, m0 ; b[0..3]
    mova m4, m2 ; b[4..7]
    paddw m0, m1
    paddw m2, m1
    psraw m3, 5
    psraw m4, 5
    mova m5, m0 ; V+b[0..3]
    mova m6, m2 ; V+b[4..7]
    paddw m0, m1
    paddw m2, m1
    psraw m5, 5
    psraw m6, 5
    packuswb m3, m4
    packuswb m5, m6
    mova [r0], m3
    mova [r0+r2], m5
%endif
    lea r0, [r0+r2*2]
    dec r4
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmx
H264_PRED8x8_PLANE
INIT_MMX mmx2
H264_PRED8x8_PLANE
INIT_XMM sse2
H264_PRED8x8_PLANE
INIT_XMM ssse3
H264_PRED8x8_PLANE

;-----------------------------------------------------------------------------
; void pred8x8_vertical(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
INIT_MMX mmx
cglobal pred8x8_vertical_8, 2,2
    sub r0, r1
    movq mm0, [r0]
%rep 3
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    lea r0, [r0+r1*2]
%endrep
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    RET

;-----------------------------------------------------------------------------
; void pred8x8_horizontal(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8_H 0
cglobal pred8x8_horizontal_8, 2,3
    mov r2, 4
%if cpuflag(ssse3)
    mova m2, [pb_3]
%endif
.loop:
    SPLATB_LOAD m0, r0+r1*0-1, m2
    SPLATB_LOAD m1, r0+r1*1-1, m2
    mova [r0+r1*0], m0
    mova [r0+r1*1], m1
    lea r0, [r0+r1*2]
    dec r2
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmx
PRED8x8_H
INIT_MMX mmx2
PRED8x8_H
INIT_MMX ssse3
PRED8x8_H

;-----------------------------------------------------------------------------
; void pred8x8_top_dc_mmxext(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
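; Chroma top-DC: each 4-wide half of the block is filled with the
; average of its 4 top neighbours, (sum+2) >> 2.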
INIT_MMX mmxext
cglobal pred8x8_top_dc_8, 2,5
    sub r0, r1
    movq mm0, [r0]
    pxor mm1, mm1
    pxor mm2, mm2
    lea r2, [r0+r1*2]
    punpckhbw mm1, mm0
    punpcklbw mm0, mm2
    psadbw mm1, mm2 ; s1
    lea r3, [r2+r1*2]
    psadbw mm0, mm2 ; s0
    psrlw mm1, 1
    psrlw mm0, 1
    pavgw mm1, mm2
    lea r4, [r3+r1*2]
    pavgw mm0, mm2
    pshufw mm1, mm1, 0
    pshufw mm0, mm0, 0 ; dc0 (w)
    packuswb mm0, mm1 ; dc0,dc1 (b)
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    lea r0, [r3+r1*2]
    movq [r2+r1*1], mm0
    movq [r2+r1*2], mm0
    movq [r3+r1*1], mm0
    movq [r3+r1*2], mm0
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    RET

;-----------------------------------------------------------------------------
; void pred8x8_dc_mmxext(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
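; Chroma DC: per-quadrant DC values following the H.264 chroma DC rule;
; s0/s1 are psadbw sums of the top halves, s2/s3 scalar sums of the left
; halves: TL=(s0+s2+4)>>3, TR=(s1+2)>>2, BL=(s3+2)>>2, BR=(s1+s3+4)>>3.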
INIT_MMX mmxext
cglobal pred8x8_dc_8, 2,5
    sub r0, r1
    pxor m7, m7
    movd m0, [r0+0]
    movd m1, [r0+4]
    psadbw m0, m7 ; s0
    mov r4, r0
    psadbw m1, m7 ; s1
    movzx r2d, byte [r0+r1*1-1]
    movzx r3d, byte [r0+r1*2-1]
    lea r0, [r0+r1*2]
    add r2d, r3d
    movzx r3d, byte [r0+r1*1-1]
    add r2d, r3d
    movzx r3d, byte [r0+r1*2-1]
    add r2d, r3d
    lea r0, [r0+r1*2]
    movd m2, r2d ; s2
    movzx r2d, byte [r0+r1*1-1]
    movzx r3d, byte [r0+r1*2-1]
    lea r0, [r0+r1*2]
    add r2d, r3d
    movzx r3d, byte [r0+r1*1-1]
    add r2d, r3d
    movzx r3d, byte [r0+r1*2-1]
    add r2d, r3d
    movd m3, r2d ; s3
    punpcklwd m0, m1
    mov r0, r4
    punpcklwd m2, m3
    punpckldq m0, m2 ; s0, s1, s2, s3
    pshufw m3, m0, 11110110b ; s2, s1, s3, s3
    lea r2, [r0+r1*2]
    pshufw m0, m0, 01110100b ; s0, s1, s3, s1
    paddw m0, m3
    lea r3, [r2+r1*2]
    psrlw m0, 2
    pavgw m0, m7 ; s0+s2, s1, s3, s1+s3
    lea r4, [r3+r1*2]
    packuswb m0, m0
    punpcklbw m0, m0
    movq m1, m0
    punpcklbw m0, m0
    punpckhbw m1, m1
    movq [r0+r1*1], m0
    movq [r0+r1*2], m0
    movq [r2+r1*1], m0
    movq [r2+r1*2], m0
    movq [r3+r1*1], m1
    movq [r3+r1*2], m1
    movq [r4+r1*1], m1
    movq [r4+r1*2], m1
    RET

;-----------------------------------------------------------------------------
; void pred8x8_dc_rv40(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
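; RV40 variant: a single DC over all 8 top + 8 left neighbours,
; dc = (sum+8) >> 4, broadcast to the whole 8x8 block.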
INIT_MMX mmxext
cglobal pred8x8_dc_rv40_8, 2,7
    mov r4, r0
    sub r0, r1
    pxor mm0, mm0
    psadbw mm0, [r0]
    dec r0
    movzx r5d, byte [r0+r1*1]
    movd r6d, mm0
    lea r0, [r0+r1*2]
%rep 3
    movzx r2d, byte [r0+r1*0]
    movzx r3d, byte [r0+r1*1]
    add r5d, r2d
    add r6d, r3d
    lea r0, [r0+r1*2]
%endrep
    movzx r2d, byte [r0+r1*0]
    add r5d, r6d
    lea r2d, [r2+r5+8]
    shr r2d, 4
    movd mm0, r2d
    punpcklbw mm0, mm0
    pshufw mm0, mm0, 0
    mov r3d, 4
.loop:
    movq [r4+r1*0], mm0
    movq [r4+r1*1], mm0
    lea r4, [r4+r1*2]
    dec r3d
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void pred8x8_tm_vp8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
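; 8x8 TrueMotion, same formula as the 16x16 version above; the SSSE3
; variant uses pshufb with tm_shuf to expand left/topleft bytes
; directly to words.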
%macro PRED8x8_TM 0
cglobal pred8x8_tm_vp8_8, 2,6
    sub r0, r1
    pxor mm7, mm7
    movq mm0, [r0]
    movq mm1, mm0
    punpcklbw mm0, mm7
    punpckhbw mm1, mm7
    movzx r4d, byte [r0-1]
    mov r5d, 4
.loop:
    movzx r2d, byte [r0+r1*1-1]
    movzx r3d, byte [r0+r1*2-1]
    sub r2d, r4d
    sub r3d, r4d
    movd mm2, r2d
    movd mm4, r3d
    SPLATW mm2, mm2, 0
    SPLATW mm4, mm4, 0
    movq mm3, mm2
    movq mm5, mm4
    paddw mm2, mm0
    paddw mm3, mm1
    paddw mm4, mm0
    paddw mm5, mm1
    packuswb mm2, mm3
    packuswb mm4, mm5
    movq [r0+r1*1], mm2
    movq [r0+r1*2], mm4
    lea r0, [r0+r1*2]
    dec r5d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmx
PRED8x8_TM
INIT_MMX mmx2
PRED8x8_TM

INIT_XMM sse2
cglobal pred8x8_tm_vp8_8, 2,6,4
    sub r0, r1
    pxor xmm1, xmm1
    movq xmm0, [r0]
    punpcklbw xmm0, xmm1
    movzx r4d, byte [r0-1]
    mov r5d, 4
.loop:
    movzx r2d, byte [r0+r1*1-1]
    movzx r3d, byte [r0+r1*2-1]
    sub r2d, r4d
    sub r3d, r4d
    movd xmm2, r2d
    movd xmm3, r3d
    pshuflw xmm2, xmm2, 0
    pshuflw xmm3, xmm3, 0
    punpcklqdq xmm2, xmm2
    punpcklqdq xmm3, xmm3
    paddw xmm2, xmm0
    paddw xmm3, xmm0
    packuswb xmm2, xmm3
    movq [r0+r1*1], xmm2
    movhps [r0+r1*2], xmm2
    lea r0, [r0+r1*2]
    dec r5d
    jg .loop
    REP_RET

INIT_XMM ssse3
cglobal pred8x8_tm_vp8_8, 2,3,6
    sub r0, r1
    movdqa xmm4, [tm_shuf]
    pxor xmm1, xmm1
    movq xmm0, [r0]
    punpcklbw xmm0, xmm1
    movd xmm5, [r0-4]
    pshufb xmm5, xmm4
    mov r2d, 4
.loop:
    movd xmm2, [r0+r1*1-4]
    movd xmm3, [r0+r1*2-4]
    pshufb xmm2, xmm4
    pshufb xmm3, xmm4
    psubw xmm2, xmm5
    psubw xmm3, xmm5
    paddw xmm2, xmm0
    paddw xmm3, xmm0
    packuswb xmm2, xmm3
    movq [r0+r1*1], xmm2
    movhps [r0+r1*2], xmm2
    lea r0, [r0+r1*2]
    dec r2d
    jg .loop
    REP_RET

; dest, left, right, src, tmp
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
%macro PRED4x4_LOWPASS 5
    mova %5, %2
    pavgb %2, %3
    pxor %3, %5
    mova %1, %4
    pand %3, [pb_1]
    psubusb %2, %3
    pavgb %1, %2
%endmacro
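; Note: pavgb rounds up, so avg(left,right) may be one too high; the
; pxor/pand/psubusb steps subtract that carry again, making the final
; pavgb an exact (left + 2*src + right + 2) >> 2 per byte.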

;-----------------------------------------------------------------------------
; void pred8x8l_top_dc(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
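; 8x8 luma top-DC: the top edge is lowpass-filtered first (with edge
; fixups when topleft/topright are unavailable), then
; dc = (sum of the 8 filtered samples + 4) >> 3.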
%macro PRED8x8L_TOP_DC 0
cglobal pred8x8l_top_dc_8, 4,4
    sub r0, r3
    pxor mm7, mm7
    movq mm0, [r0-8]
    movq mm3, [r0]
    movq mm1, [r0+8]
    movq mm2, mm3
    movq mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test r1, r1 ; top_left
    jz .fix_lt_2
    test r2, r2 ; top_right
    jz .fix_tr_1
    jmp .body
.fix_lt_2:
    movq mm5, mm3
    pxor mm5, mm2
    psllq mm5, 56
    psrlq mm5, 56
    pxor mm2, mm5
    test r2, r2 ; top_right
    jnz .body
.fix_tr_1:
    movq mm5, mm3
    pxor mm5, mm1
    psrlq mm5, 56
    psllq mm5, 56
    pxor mm1, mm5
.body:
    PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
    psadbw mm7, mm0
    paddw mm7, [pw_4]
    psrlw mm7, 3
    pshufw mm7, mm7, 0
    packuswb mm7, mm7
%rep 3
    movq [r0+r3*1], mm7
    movq [r0+r3*2], mm7
    lea r0, [r0+r3*2]
%endrep
    movq [r0+r3*1], mm7
    movq [r0+r3*2], mm7
    RET
%endmacro

INIT_MMX mmxext
%define PALIGNR PALIGNR_MMX
PRED8x8L_TOP_DC
INIT_MMX ssse3
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_TOP_DC

;-----------------------------------------------------------------------------
; void pred8x8l_dc(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
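; Full 8x8 luma DC: both the left column and the top row are
; lowpass-filtered, then dc = (sum_left + sum_top + 8) >> 4.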
%macro PRED8x8L_DC 0
cglobal pred8x8l_dc_8, 4,5
    sub r0, r3
    lea r4, [r0+r3*2]
    movq mm0, [r0+r3*1-8]
    punpckhbw mm0, [r0+r3*0-8]
    movq mm1, [r4+r3*1-8]
    punpckhbw mm1, [r0+r3*2-8]
    mov r4, r0
    punpckhwd mm1, mm0
    lea r0, [r0+r3*4]
    movq mm2, [r0+r3*1-8]
    punpckhbw mm2, [r0+r3*0-8]
    lea r0, [r0+r3*2]
    movq mm3, [r0+r3*1-8]
    punpckhbw mm3, [r0+r3*0-8]
    punpckhwd mm3, mm2
    punpckhdq mm3, mm1
    lea r0, [r0+r3*2]
    movq mm0, [r0+r3*0-8]
    movq mm1, [r4]
    mov r0, r4
    movq mm4, mm3
    movq mm2, mm3
    PALIGNR mm4, mm0, 7, mm0
    PALIGNR mm1, mm2, 1, mm2
    test r1, r1
    jnz .do_left
.fix_lt_1:
    movq mm5, mm3
    pxor mm5, mm4
    psrlq mm5, 56
    psllq mm5, 48
    pxor mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq mm5, mm3
    pxor mm5, mm2
    psllq mm5, 56
    psrlq mm5, 56
    pxor mm2, mm5
    test r2, r2
    jnz .body
.fix_tr_1:
    movq mm5, mm3
    pxor mm5, mm1
    psrlq mm5, 56
    psllq mm5, 56
    pxor mm1, mm5
    jmp .body
.do_left:
    movq mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq mm4, mm0
    movq mm7, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq mm1, 56
    PALIGNR mm7, mm1, 7, mm3
    movq mm0, [r0-8]
    movq mm3, [r0]
    movq mm1, [r0+8]
    movq mm2, mm3
    movq mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test r1, r1
    jz .fix_lt_2
    test r2, r2
    jz .fix_tr_1
.body:
    lea r1, [r0+r3*2]
    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
    pxor mm0, mm0
    pxor mm1, mm1
    lea r2, [r1+r3*2]
    psadbw mm0, mm7
    psadbw mm1, mm6
    paddw mm0, [pw_8]
    paddw mm0, mm1
    lea r4, [r2+r3*2]
    psrlw mm0, 4
    pshufw mm0, mm0, 0
    packuswb mm0, mm0
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm0
    movq [r1+r3*1], mm0
    movq [r1+r3*2], mm0
    movq [r2+r3*1], mm0
    movq [r2+r3*2], mm0
    movq [r4+r3*1], mm0
    movq [r4+r3*2], mm0
    RET
%endmacro

INIT_MMX mmxext
%define PALIGNR PALIGNR_MMX
PRED8x8L_DC
INIT_MMX ssse3
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_DC

;-----------------------------------------------------------------------------
; void pred8x8l_horizontal(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
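; 8x8 luma horizontal: gathers the left column with punpckhbw/punpckhwd
; merges, lowpass-filters it, then broadcasts each filtered left pixel
; across its row with pshufw.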
%macro PRED8x8L_HORIZONTAL 0
cglobal pred8x8l_horizontal_8, 4,4
    sub r0, r3
    lea r2, [r0+r3*2]
    movq mm0, [r0+r3*1-8]
    test r1, r1
    lea r1, [r0+r3]
    cmovnz r1, r0
    punpckhbw mm0, [r1+r3*0-8]
    movq mm1, [r2+r3*1-8]
    punpckhbw mm1, [r0+r3*2-8]
    mov r2, r0
    punpckhwd mm1, mm0
    lea r0, [r0+r3*4]
    movq mm2, [r0+r3*1-8]
    punpckhbw mm2, [r0+r3*0-8]
    lea r0, [r0+r3*2]
    movq mm3, [r0+r3*1-8]
    punpckhbw mm3, [r0+r3*0-8]
    punpckhwd mm3, mm2
    punpckhdq mm3, mm1
    lea r0, [r0+r3*2]
    movq mm0, [r0+r3*0-8]
    movq mm1, [r1+r3*0-8]
    mov r0, r2
    movq mm4, mm3
    movq mm2, mm3
    PALIGNR mm4, mm0, 7, mm0
    PALIGNR mm1, mm2, 1, mm2
    movq mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq mm4, mm0
    movq mm7, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq mm1, 56
    PALIGNR mm7, mm1, 7, mm3
    movq mm3, mm7
    lea r1, [r0+r3*2]
    movq mm7, mm3
    punpckhbw mm3, mm3
    punpcklbw mm7, mm7
    pshufw mm0, mm3, 0xff
    pshufw mm1, mm3, 0xaa
    lea r2, [r1+r3*2]
    pshufw mm2, mm3, 0x55
    pshufw mm3, mm3, 0x00
    pshufw mm4, mm7, 0xff
    pshufw mm5, mm7, 0xaa
    pshufw mm6, mm7, 0x55
    pshufw mm7, mm7, 0x00
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm1
    movq [r1+r3*1], mm2
    movq [r1+r3*2], mm3
    movq [r2+r3*1], mm4
    movq [r2+r3*2], mm5
    lea r0, [r2+r3*2]
    movq [r0+r3*1], mm6
    movq [r0+r3*2], mm7
    RET
%endmacro

INIT_MMX mmxext
%define PALIGNR PALIGNR_MMX
PRED8x8L_HORIZONTAL
INIT_MMX ssse3
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_HORIZONTAL

;-----------------------------------------------------------------------------
; void pred8x8l_vertical(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
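; 8x8 luma vertical: the top row is lowpass-filtered (with the usual
; edge fixups) and copied unchanged to all eight rows.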
%macro PRED8x8L_VERTICAL 0
cglobal pred8x8l_vertical_8, 4,4
    sub r0, r3
    movq mm0, [r0-8]
    movq mm3, [r0]
    movq mm1, [r0+8]
    movq mm2, mm3
    movq mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test r1, r1 ; top_left
    jz .fix_lt_2
    test r2, r2 ; top_right
    jz .fix_tr_1
    jmp .body
.fix_lt_2:
    movq mm5, mm3
    pxor mm5, mm2
    psllq mm5, 56
    psrlq mm5, 56
    pxor mm2, mm5
    test r2, r2 ; top_right
    jnz .body
.fix_tr_1:
    movq mm5, mm3
    pxor mm5, mm1
    psrlq mm5, 56
    psllq mm5, 56
    pxor mm1, mm5
.body:
    PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
%rep 3
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm0
    lea r0, [r0+r3*2]
%endrep
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm0
    RET
%endmacro

INIT_MMX mmxext
%define PALIGNR PALIGNR_MMX
PRED8x8L_VERTICAL
INIT_MMX ssse3
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_VERTICAL

;-----------------------------------------------------------------------------
; void pred8x8l_down_left(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
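; Diagonal down-left: 45-degree prediction from the filtered top and
; top-right edges. The mmxext version shifts the result through two MMX
; registers; the SSE2 path below keeps all 16 samples in one XMM
; register and extracts each row with psrldq.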
INIT_MMX mmxext
%define PALIGNR PALIGNR_MMX
cglobal pred8x8l_down_left_8, 4,5
    sub r0, r3
    movq mm0, [r0-8]
    movq mm3, [r0]
    movq mm1, [r0+8]
    movq mm2, mm3
    movq mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test r1, r1
    jz .fix_lt_2
    test r2, r2
    jz .fix_tr_1
    jmp .do_top
.fix_lt_2:
    movq mm5, mm3
    pxor mm5, mm2
    psllq mm5, 56
    psrlq mm5, 56
    pxor mm2, mm5
    test r2, r2
    jnz .do_top
.fix_tr_1:
    movq mm5, mm3
    pxor mm5, mm1
    psrlq mm5, 56
    psllq mm5, 56
    pxor mm1, mm5
    jmp .do_top
.fix_tr_2:
    punpckhbw mm3, mm3
    pshufw mm1, mm3, 0xFF
    jmp .do_topright
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq mm7, mm4
    test r2, r2
    jz .fix_tr_2
    movq mm0, [r0+8]
    movq mm5, mm0
    movq mm2, mm0
    movq mm4, mm0
    psrlq mm5, 56
    PALIGNR mm2, mm3, 7, mm3
    PALIGNR mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
    lea r1, [r0+r3*2]
    movq mm6, mm1
    psrlq mm1, 56
    movq mm4, mm1
    lea r2, [r1+r3*2]
    movq mm2, mm6
    PALIGNR mm2, mm7, 1, mm0
    movq mm3, mm6
    PALIGNR mm3, mm7, 7, mm0
    PALIGNR mm4, mm6, 1, mm0
    movq mm5, mm7
    movq mm1, mm7
    movq mm7, mm6
    lea r4, [r2+r3*2]
    psllq mm1, 8
    PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
    PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
    movq [r4+r3*2], mm1
    movq mm2, mm0
    psllq mm1, 8
    psrlq mm2, 56
    psllq mm0, 8
    por mm1, mm2
    movq [r4+r3*1], mm1
    movq mm2, mm0
    psllq mm1, 8
    psrlq mm2, 56
    psllq mm0, 8
    por mm1, mm2
    movq [r2+r3*2], mm1
    movq mm2, mm0
    psllq mm1, 8
    psrlq mm2, 56
    psllq mm0, 8
    por mm1, mm2
    movq [r2+r3*1], mm1
    movq mm2, mm0
    psllq mm1, 8
    psrlq mm2, 56
    psllq mm0, 8
    por mm1, mm2
    movq [r1+r3*2], mm1
    movq mm2, mm0
    psllq mm1, 8
    psrlq mm2, 56
    psllq mm0, 8
    por mm1, mm2
    movq [r1+r3*1], mm1
    movq mm2, mm0
    psllq mm1, 8
    psrlq mm2, 56
    psllq mm0, 8
    por mm1, mm2
    movq [r0+r3*2], mm1
    psllq mm1, 8
    psrlq mm0, 56
    por mm1, mm0
    movq [r0+r3*1], mm1
    RET

%macro PRED8x8L_DOWN_LEFT 0
cglobal pred8x8l_down_left_8, 4,4
    sub r0, r3
    movq mm0, [r0-8]
    movq mm3, [r0]
    movq mm1, [r0+8]
    movq mm2, mm3
    movq mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test r1, r1 ; top_left
    jz .fix_lt_2
    test r2, r2 ; top_right
    jz .fix_tr_1
    jmp .do_top
.fix_lt_2:
    movq mm5, mm3
    pxor mm5, mm2
    psllq mm5, 56
    psrlq mm5, 56
    pxor mm2, mm5
    test r2, r2 ; top_right
    jnz .do_top
.fix_tr_1:
    movq mm5, mm3
    pxor mm5, mm1
    psrlq mm5, 56
    psllq mm5, 56
    pxor mm1, mm5
    jmp .do_top
.fix_tr_2:
    punpckhbw mm3, mm3
    pshufw mm1, mm3, 0xFF
    jmp .do_topright
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq xmm3, mm4
    test r2, r2 ; top_right
    jz .fix_tr_2
    movq mm0, [r0+8]
    movq mm5, mm0
    movq mm2, mm0
    movq mm4, mm0
    psrlq mm5, 56
    PALIGNR mm2, mm3, 7, mm3
    PALIGNR mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
    movq2dq xmm4, mm1
    psrlq mm1, 56
    movq2dq xmm5, mm1
    lea r1, [r0+r3*2]
    pslldq xmm4, 8
    por xmm3, xmm4
    movdqa xmm2, xmm3
    psrldq xmm2, 1
    pslldq xmm5, 15
    por xmm2, xmm5
    lea r2, [r1+r3*2]
    movdqa xmm1, xmm3
    pslldq xmm1, 1
INIT_XMM cpuname
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
    psrldq xmm0, 1
    movq [r0+r3*1], xmm0
    psrldq xmm0, 1
    movq [r0+r3*2], xmm0
    psrldq xmm0, 1
    lea r0, [r2+r3*2]
    movq [r1+r3*1], xmm0
    psrldq xmm0, 1
    movq [r1+r3*2], xmm0
    psrldq xmm0, 1
    movq [r2+r3*1], xmm0
    psrldq xmm0, 1
    movq [r2+r3*2], xmm0
    psrldq xmm0, 1
    movq [r0+r3*1], xmm0
    psrldq xmm0, 1
    movq [r0+r3*2], xmm0
    RET
%endmacro

INIT_MMX sse2
%define PALIGNR PALIGNR_MMX
PRED8x8L_DOWN_LEFT
INIT_MMX ssse3
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_DOWN_LEFT

;-----------------------------------------------------------------------------
; void pred8x8l_down_right_mmxext(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
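; Diagonal down-right: 45-degree prediction from the filtered left
; column, topleft corner and top row.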
INIT_MMX mmxext
%define PALIGNR PALIGNR_MMX
cglobal pred8x8l_down_right_8, 4,5
    sub r0, r3
    lea r4, [r0+r3*2]
    movq mm0, [r0+r3*1-8]
    punpckhbw mm0, [r0+r3*0-8]
    movq mm1, [r4+r3*1-8]
    punpckhbw mm1, [r0+r3*2-8]
    mov r4, r0
    punpckhwd mm1, mm0
    lea r0, [r0+r3*4]
    movq mm2, [r0+r3*1-8]
    punpckhbw mm2, [r0+r3*0-8]
    lea r0, [r0+r3*2]
    movq mm3, [r0+r3*1-8]
    punpckhbw mm3, [r0+r3*0-8]
    punpckhwd mm3, mm2
    punpckhdq mm3, mm1
    lea r0, [r0+r3*2]
    movq mm0, [r0+r3*0-8]
    movq mm1, [r4]
    mov r0, r4
    movq mm4, mm3
    movq mm2, mm3
    PALIGNR mm4, mm0, 7, mm0
    PALIGNR mm1, mm2, 1, mm2
    test r1, r1 ; top_left
    jz .fix_lt_1
.do_left:
    movq mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq mm4, mm0
    movq mm7, mm2
    movq mm6, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq mm1, 56
    PALIGNR mm7, mm1, 7, mm3
    movq mm0, [r0-8]
    movq mm3, [r0]
    movq mm1, [r0+8]
    movq mm2, mm3
    movq mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test r1, r1 ; top_left
    jz .fix_lt_2
    test r2, r2 ; top_right
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq mm5, mm4
    jmp .body
.fix_lt_1:
    movq mm5, mm3
    pxor mm5, mm4
    psrlq mm5, 56
    psllq mm5, 48
    pxor mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq mm5, mm3
    pxor mm5, mm2
    psllq mm5, 56
    psrlq mm5, 56
    pxor mm2, mm5
    test r2, r2 ; top_right
    jnz .do_top
.fix_tr_1:
    movq mm5, mm3
    pxor mm5, mm1
    psrlq mm5, 56
    psllq mm5, 56
    pxor mm1, mm5
    jmp .do_top
.body:
    lea r1, [r0+r3*2]
    movq mm1, mm7
    movq mm7, mm5
    movq mm5, mm6
    movq mm2, mm7
    lea r2, [r1+r3*2]
    PALIGNR mm2, mm6, 1, mm0
    movq mm3, mm7
    PALIGNR mm3, mm6, 7, mm0
    movq mm4, mm7
    lea r4, [r2+r3*2]
    psrlq mm4, 8
    PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
    PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
    movq [r4+r3*2], mm0
    movq mm2, mm1
    psrlq mm0, 8
    psllq mm2, 56
    psrlq mm1, 8
    por mm0, mm2
    movq [r4+r3*1], mm0
    movq mm2, mm1
    psrlq mm0, 8
    psllq mm2, 56
    psrlq mm1, 8
    por mm0, mm2
    movq [r2+r3*2], mm0
    movq mm2, mm1
    psrlq mm0, 8
    psllq mm2, 56
    psrlq mm1, 8
    por mm0, mm2
    movq [r2+r3*1], mm0
    movq mm2, mm1
    psrlq mm0, 8
    psllq mm2, 56
    psrlq mm1, 8
    por mm0, mm2
    movq [r1+r3*2], mm0
    movq mm2, mm1
    psrlq mm0, 8
    psllq mm2, 56
    psrlq mm1, 8
    por mm0, mm2
    movq [r1+r3*1], mm0
    movq mm2, mm1
    psrlq mm0, 8
    psllq mm2, 56
    psrlq mm1, 8
    por mm0, mm2
    movq [r0+r3*2], mm0
    psrlq mm0, 8
    psllq mm1, 56
    por mm0, mm1
    movq [r0+r3*1], mm0
    RET

%macro PRED8x8L_DOWN_RIGHT 0
cglobal pred8x8l_down_right_8, 4,5
    sub r0, r3
    lea r4, [r0+r3*2]
    movq mm0, [r0+r3*1-8]
    punpckhbw mm0, [r0+r3*0-8]
    movq mm1, [r4+r3*1-8]
    punpckhbw mm1, [r0+r3*2-8]
    mov r4, r0
    punpckhwd mm1, mm0
    lea r0, [r0+r3*4]
    movq mm2, [r0+r3*1-8]
    punpckhbw mm2, [r0+r3*0-8]
    lea r0, [r0+r3*2]
    movq mm3, [r0+r3*1-8]
    punpckhbw mm3, [r0+r3*0-8]
    punpckhwd mm3, mm2
    punpckhdq mm3, mm1
    lea r0, [r0+r3*2]
    movq mm0, [r0+r3*0-8]
    movq mm1, [r4]
    mov r0, r4
    movq mm4, mm3
    movq mm2, mm3
    PALIGNR mm4, mm0, 7, mm0
    PALIGNR mm1, mm2, 1, mm2
    test r1, r1
    jz .fix_lt_1
    jmp .do_left
.fix_lt_1:
    movq mm5, mm3
    pxor mm5, mm4
    psrlq mm5, 56
    psllq mm5, 48
    pxor mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq mm5, mm3
    pxor mm5, mm2
    psllq mm5, 56
    psrlq mm5, 56
    pxor mm2, mm5
    test r2, r2
    jnz .do_top
.fix_tr_1:
    movq mm5, mm3
    pxor mm5, mm1
    psrlq mm5, 56
    psllq mm5, 56
    pxor mm1, mm5
    jmp .do_top
.do_left:
    movq mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq mm4, mm0
    movq mm7, mm2
    movq2dq xmm3, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq mm1, 56
    PALIGNR mm7, mm1, 7, mm3
    movq2dq xmm1, mm7
    movq mm0, [r0-8]
    movq mm3, [r0]
    movq mm1, [r0+8]
    movq mm2, mm3
    movq mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test r1, r1
    jz .fix_lt_2
    test r2, r2
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq xmm4, mm4
    lea r1, [r0+r3*2]
    movdqa xmm0, xmm3
    pslldq xmm4, 8
    por xmm3, xmm4
    lea r2, [r1+r3*2]
    pslldq xmm4, 1
    por xmm1, xmm4
    psrldq xmm0, 7
    pslldq xmm0, 15
    psrldq xmm0, 7
    por xmm1, xmm0
    lea r0, [r2+r3*2]
    movdqa xmm2, xmm3
    psrldq xmm2, 1
INIT_XMM cpuname
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
    movdqa xmm1, xmm0
    psrldq xmm1, 1
    movq [r0+r3*2], xmm0
    movq [r0+r3*1], xmm1
    psrldq xmm0, 2
    psrldq xmm1, 2
    movq [r2+r3*2], xmm0
    movq [r2+r3*1], xmm1
    psrldq xmm0, 2
    psrldq xmm1, 2
    movq [r1+r3*2], xmm0
    movq [r1+r3*1], xmm1
    psrldq xmm0, 2
    psrldq xmm1, 2
    movq [r4+r3*2], xmm0
    movq [r4+r3*1], xmm1
    RET
%endmacro

INIT_MMX sse2
%define PALIGNR PALIGNR_MMX
PRED8x8L_DOWN_RIGHT
INIT_MMX ssse3
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_DOWN_RIGHT

;-----------------------------------------------------------------------------
; void pred8x8l_vertical_right(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
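; Vertical-right: row pairs alternate between the 2-tap average and the
; 3-tap lowpass of the top edge, shifting in one sample from the
; filtered left column every two rows.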
INIT_MMX mmxext
%define PALIGNR PALIGNR_MMX
cglobal pred8x8l_vertical_right_8, 4,5
    sub r0, r3
    lea r4, [r0+r3*2]
    movq mm0, [r0+r3*1-8]
    punpckhbw mm0, [r0+r3*0-8]
    movq mm1, [r4+r3*1-8]
    punpckhbw mm1, [r0+r3*2-8]
    mov r4, r0
    punpckhwd mm1, mm0
    lea r0, [r0+r3*4]
    movq mm2, [r0+r3*1-8]
    punpckhbw mm2, [r0+r3*0-8]
    lea r0, [r0+r3*2]
    movq mm3, [r0+r3*1-8]
    punpckhbw mm3, [r0+r3*0-8]
    punpckhwd mm3, mm2
    punpckhdq mm3, mm1
    lea r0, [r0+r3*2]
    movq mm0, [r0+r3*0-8]
    movq mm1, [r4]
    mov r0, r4
    movq mm4, mm3
    movq mm2, mm3
    PALIGNR mm4, mm0, 7, mm0
    PALIGNR mm1, mm2, 1, mm2
    test r1, r1
    jz .fix_lt_1
    jmp .do_left
.fix_lt_1:
    movq mm5, mm3
    pxor mm5, mm4
    psrlq mm5, 56
    psllq mm5, 48
    pxor mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq mm5, mm3
    pxor mm5, mm2
    psllq mm5, 56
    psrlq mm5, 56
    pxor mm2, mm5
    test r2, r2
    jnz .do_top
.fix_tr_1:
    movq mm5, mm3
    pxor mm5, mm1
    psrlq mm5, 56
    psllq mm5, 56
    pxor mm1, mm5
    jmp .do_top
.do_left:
    movq mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq mm7, mm2
    movq mm0, [r0-8]
    movq mm3, [r0]
    movq mm1, [r0+8]
    movq mm2, mm3
    movq mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test r1, r1
    jz .fix_lt_2
    test r2, r2
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
    lea r1, [r0+r3*2]
    movq mm2, mm6
    movq mm3, mm6
    PALIGNR mm3, mm7, 7, mm0
    PALIGNR mm6, mm7, 6, mm1
    movq mm4, mm3
    pavgb mm3, mm2
    lea r2, [r1+r3*2]
    PRED4x4_LOWPASS mm0, mm6, mm2, mm4, mm5
    movq [r0+r3*1], mm3
    movq [r0+r3*2], mm0
    movq mm5, mm0
    movq mm6, mm3
    movq mm1, mm7
    movq mm2, mm1
    psllq mm2, 8
    movq mm3, mm1
    psllq mm3, 16
    lea r4, [r2+r3*2]
    PRED4x4_LOWPASS mm0, mm1, mm3, mm2, mm4
    PALIGNR mm6, mm0, 7, mm2
    movq [r1+r3*1], mm6
    psllq mm0, 8
    PALIGNR mm5, mm0, 7, mm1
    movq [r1+r3*2], mm5
    psllq mm0, 8
    PALIGNR mm6, mm0, 7, mm2
    movq [r2+r3*1], mm6
    psllq mm0, 8
    PALIGNR mm5, mm0, 7, mm1
    movq [r2+r3*2], mm5
    psllq mm0, 8
    PALIGNR mm6, mm0, 7, mm2
    movq [r4+r3*1], mm6
    psllq mm0, 8
    PALIGNR mm5, mm0, 7, mm1
    movq [r4+r3*2], mm5
    RET

%macro PRED8x8L_VERTICAL_RIGHT 0
cglobal pred8x8l_vertical_right_8, 4,5,7
    ; manually spill XMM registers for Win64 because
    ; the code here is initialized with INIT_MMX
    WIN64_SPILL_XMM 7
    sub r0, r3
    lea r4, [r0+r3*2]
    movq mm0, [r0+r3*1-8]
    punpckhbw mm0, [r0+r3*0-8]
    movq mm1, [r4+r3*1-8]
    punpckhbw mm1, [r0+r3*2-8]
    mov r4, r0
    punpckhwd mm1, mm0
    lea r0, [r0+r3*4]
    movq mm2, [r0+r3*1-8]
    punpckhbw mm2, [r0+r3*0-8]
    lea r0, [r0+r3*2]
    movq mm3, [r0+r3*1-8]
    punpckhbw mm3, [r0+r3*0-8]
    punpckhwd mm3, mm2
    punpckhdq mm3, mm1
    lea r0, [r0+r3*2]
    movq mm0, [r0+r3*0-8]
    movq mm1, [r4]
    mov r0, r4
    movq mm4, mm3
    movq mm2, mm3
    PALIGNR mm4, mm0, 7, mm0
    PALIGNR mm1, mm2, 1, mm2
    test r1, r1
    jnz .do_left
.fix_lt_1:
    movq mm5, mm3
    pxor mm5, mm4
    psrlq mm5, 56
    psllq mm5, 48
    pxor mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq mm5, mm3
    pxor mm5, mm2
    psllq mm5, 56
    psrlq mm5, 56
    pxor mm2, mm5
    test r2, r2
    jnz .do_top
.fix_tr_1:
    movq mm5, mm3
    pxor mm5, mm1
    psrlq mm5, 56
    psllq mm5, 56
    pxor mm1, mm5
    jmp .do_top
.do_left:
    movq mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq2dq xmm0, mm2
    movq mm0, [r0-8]
    movq mm3, [r0]
    movq mm1, [r0+8]
    movq mm2, mm3
    movq mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test r1, r1
    jz .fix_lt_2
    test r2, r2
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
    lea r1, [r0+r3*2]
    movq2dq xmm4, mm6
    pslldq xmm4, 8
    por xmm0, xmm4
    movdqa xmm6, [pw_ff00]
    movdqa xmm1, xmm0
    lea r2, [r1+r3*2]
    movdqa xmm2, xmm0
    movdqa xmm3, xmm0
    pslldq xmm0, 1
    pslldq xmm1, 2
    pavgb xmm2, xmm0
INIT_XMM cpuname
    PRED4x4_LOWPASS xmm4, xmm3, xmm1, xmm0, xmm5
    pandn xmm6, xmm4
    movdqa xmm5, xmm4
    psrlw xmm4, 8
    packuswb xmm6, xmm4
    movhlps xmm4, xmm6
    movhps [r0+r3*2], xmm5
    movhps [r0+r3*1], xmm2
    psrldq xmm5, 4
    movss xmm5, xmm6
    psrldq xmm2, 4
    movss xmm2, xmm4
    lea r0, [r2+r3*2]
    psrldq xmm5, 1
    psrldq xmm2, 1
    movq [r0+r3*2], xmm5
    movq [r0+r3*1], xmm2
    psrldq xmm5, 1
    psrldq xmm2, 1
    movq [r2+r3*2], xmm5
    movq [r2+r3*1], xmm2
    psrldq xmm5, 1
    psrldq xmm2, 1
    movq [r1+r3*2], xmm5
    movq [r1+r3*1], xmm2
    RET
%endmacro

INIT_MMX sse2
%define PALIGNR PALIGNR_MMX
PRED8x8L_VERTICAL_RIGHT
INIT_MMX ssse3
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_VERTICAL_RIGHT

;-----------------------------------------------------------------------------
; void pred8x8l_vertical_left(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
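; Vertical-left: even rows take the 2-tap average and odd rows the 3-tap
; lowpass of the filtered top/top-right samples, advancing one sample
; every two rows.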
%macro PRED8x8L_VERTICAL_LEFT 0
cglobal pred8x8l_vertical_left_8, 4,4
    sub r0, r3
    movq mm0, [r0-8]
    movq mm3, [r0]
    movq mm1, [r0+8]
    movq mm2, mm3
    movq mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test r1, r1
    jz .fix_lt_2
    test r2, r2
    jz .fix_tr_1
    jmp .do_top
.fix_lt_2:
    movq mm5, mm3
    pxor mm5, mm2
    psllq mm5, 56
    psrlq mm5, 56
    pxor mm2, mm5
    test r2, r2
    jnz .do_top
.fix_tr_1:
    movq mm5, mm3
    pxor mm5, mm1
    psrlq mm5, 56
    psllq mm5, 56
    pxor mm1, mm5
    jmp .do_top
.fix_tr_2:
    punpckhbw mm3, mm3
    pshufw mm1, mm3, 0xFF
    jmp .do_topright
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq xmm4, mm4
    test r2, r2
    jz .fix_tr_2
    movq mm0, [r0+8]
    movq mm5, mm0
    movq mm2, mm0
    movq mm4, mm0
    psrlq mm5, 56
    PALIGNR mm2, mm3, 7, mm3
    PALIGNR mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
    movq2dq xmm3, mm1
    lea r1, [r0+r3*2]
    pslldq xmm3, 8
    por xmm4, xmm3
    movdqa xmm2, xmm4
    movdqa xmm1, xmm4
    movdqa xmm3, xmm4
    psrldq xmm2, 1
    pslldq xmm1, 1
    pavgb xmm3, xmm2
    lea r2, [r1+r3*2]
INIT_XMM cpuname
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm4, xmm5
    psrldq xmm0, 1
    movq [r0+r3*1], xmm3
    movq [r0+r3*2], xmm0
    lea r0, [r2+r3*2]
    psrldq xmm3, 1
    psrldq xmm0, 1
    movq [r1+r3*1], xmm3
    movq [r1+r3*2], xmm0
    psrldq xmm3, 1
    psrldq xmm0, 1
    movq [r2+r3*1], xmm3
    movq [r2+r3*2], xmm0
    psrldq xmm3, 1
    psrldq xmm0, 1
    movq [r0+r3*1], xmm3
    movq [r0+r3*2], xmm0
    RET
%endmacro

INIT_MMX sse2
%define PALIGNR PALIGNR_MMX
PRED8x8L_VERTICAL_LEFT
INIT_MMX ssse3
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_VERTICAL_LEFT

;-----------------------------------------------------------------------------
; void pred8x8l_horizontal_up(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
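; Horizontal-up: built from the left column only; output pairs are the
; 2-tap and 3-tap interpolations walking down the column, with the last
; pixel replicated once the column is exhausted.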
%macro PRED8x8L_HORIZONTAL_UP 0
cglobal pred8x8l_horizontal_up_8, 4,4
    sub r0, r3
    lea r2, [r0+r3*2]
    movq mm0, [r0+r3*1-8]
    test r1, r1
    lea r1, [r0+r3]
    cmovnz r1, r0
    punpckhbw mm0, [r1+r3*0-8]
    movq mm1, [r2+r3*1-8]
    punpckhbw mm1, [r0+r3*2-8]
    mov r2, r0
    punpckhwd mm1, mm0
    lea r0, [r0+r3*4]
    movq mm2, [r0+r3*1-8]
    punpckhbw mm2, [r0+r3*0-8]
    lea r0, [r0+r3*2]
    movq mm3, [r0+r3*1-8]
    punpckhbw mm3, [r0+r3*0-8]
    punpckhwd mm3, mm2
    punpckhdq mm3, mm1
    lea r0, [r0+r3*2]
    movq mm0, [r0+r3*0-8]
    movq mm1, [r1+r3*0-8]
    mov r0, r2
    movq mm4, mm3
    movq mm2, mm3
    PALIGNR mm4, mm0, 7, mm0
    PALIGNR mm1, mm2, 1, mm2
    movq mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq mm4, mm0
    movq mm7, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq mm1, 56
    PALIGNR mm7, mm1, 7, mm3
    lea r1, [r0+r3*2]
    pshufw mm0, mm7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
    psllq mm7, 56 ; l7 .. .. .. .. .. .. ..
    movq mm2, mm0
    psllw mm0, 8
    psrlw mm2, 8
    por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0
    movq mm3, mm2
    movq mm4, mm2
    movq mm5, mm2
    psrlq mm2, 8
    psrlq mm3, 16
    lea r2, [r1+r3*2]
    por mm2, mm7 ; l7 l7 l6 l5 l4 l3 l2 l1
    punpckhbw mm7, mm7
    por mm3, mm7 ; l7 l7 l7 l6 l5 l4 l3 l2
    pavgb mm4, mm2
    PRED4x4_LOWPASS mm1, mm3, mm5, mm2, mm6
    movq mm5, mm4
    punpcklbw mm4, mm1 ; p4 p3 p2 p1
    punpckhbw mm5, mm1 ; p8 p7 p6 p5
    movq mm6, mm5
    movq mm7, mm5
    movq mm0, mm5
    PALIGNR mm5, mm4, 2, mm1
    pshufw mm1, mm6, 11111001b
    PALIGNR mm6, mm4, 4, mm2
    pshufw mm2, mm7, 11111110b
    PALIGNR mm7, mm4, 6, mm3
    pshufw mm3, mm0, 11111111b
    movq [r0+r3*1], mm4
    movq [r0+r3*2], mm5
    lea r0, [r2+r3*2]
    movq [r1+r3*1], mm6
    movq [r1+r3*2], mm7
    movq [r2+r3*1], mm0
    movq [r2+r3*2], mm1
    movq [r0+r3*1], mm2
    movq [r0+r3*2], mm3
    RET
%endmacro

INIT_MMX mmxext
%define PALIGNR PALIGNR_MMX
PRED8x8L_HORIZONTAL_UP
INIT_MMX ssse3
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_HORIZONTAL_UP

;-----------------------------------------------------------------------------
; void pred8x8l_horizontal_down(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
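; Horizontal-down: diagonal prediction from the filtered left column,
; topleft corner and top row, mirroring the down-right mode.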
INIT_MMX mmxext
%define PALIGNR PALIGNR_MMX
cglobal pred8x8l_horizontal_down_8, 4,5
sub r0, r3
lea r4, [r0+r3*2]
movq mm0, [r0+r3*1-8]
punpckhbw mm0, [r0+r3*0-8]
movq mm1, [r4+r3*1-8]
punpckhbw mm1, [r0+r3*2-8]
mov r4, r0
punpckhwd mm1, mm0
lea r0, [r0+r3*4]
movq mm2, [r0+r3*1-8]
punpckhbw mm2, [r0+r3*0-8]
lea r0, [r0+r3*2]
movq mm3, [r0+r3*1-8]
punpckhbw mm3, [r0+r3*0-8]
punpckhwd mm3, mm2
punpckhdq mm3, mm1
lea r0, [r0+r3*2]
movq mm0, [r0+r3*0-8]
movq mm1, [r4]
mov r0, r4
movq mm4, mm3
movq mm2, mm3
PALIGNR mm4, mm0, 7, mm0
PALIGNR mm1, mm2, 1, mm2
test r1, r1
jnz .do_left
.fix_lt_1:
movq mm5, mm3
pxor mm5, mm4
psrlq mm5, 56
psllq mm5, 48
pxor mm1, mm5
jmp .do_left
.fix_lt_2:
movq mm5, mm3
pxor mm5, mm2
psllq mm5, 56
psrlq mm5, 56
pxor mm2, mm5
test r2, r2
jnz .do_top
.fix_tr_1:
movq mm5, mm3
pxor mm5, mm1
psrlq mm5, 56
psllq mm5, 56
pxor mm1, mm5
jmp .do_top
.do_left:
movq mm0, mm4
PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
movq mm4, mm0
movq mm7, mm2
movq mm6, mm2
PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
psllq mm1, 56
PALIGNR mm7, mm1, 7, mm3
movq mm0, [r0-8]
movq mm3, [r0]
movq mm1, [r0+8]
movq mm2, mm3
movq mm4, mm3
PALIGNR mm2, mm0, 7, mm0
PALIGNR mm1, mm4, 1, mm4
test r1, r1
jz .fix_lt_2
test r2, r2
jz .fix_tr_1
.do_top:
PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
movq mm5, mm4
lea r1, [r0+r3*2]
psllq mm7, 56
movq mm2, mm5
movq mm3, mm6
movq mm4, mm2
PALIGNR mm2, mm6, 7, mm5
PALIGNR mm6, mm7, 7, mm0
lea r2, [r1+r3*2]
PALIGNR mm4, mm3, 1, mm7
movq mm5, mm3
pavgb mm3, mm6
PRED4x4_LOWPASS mm0, mm4, mm6, mm5, mm7
movq mm4, mm2
movq mm1, mm2
lea r4, [r2+r3*2]
psrlq mm4, 16
psrlq mm1, 8
PRED4x4_LOWPASS mm6, mm4, mm2, mm1, mm5
movq mm7, mm3
punpcklbw mm3, mm0
punpckhbw mm7, mm0
movq mm1, mm7
movq mm0, mm7
movq mm4, mm7
movq [r4+r3*2], mm3
PALIGNR mm7, mm3, 2, mm5
movq [r4+r3*1], mm7
PALIGNR mm1, mm3, 4, mm5
movq [r2+r3*2], mm1
PALIGNR mm0, mm3, 6, mm3
movq [r2+r3*1], mm0
movq mm2, mm6
movq mm3, mm6
movq [r1+r3*2], mm4
PALIGNR mm6, mm4, 2, mm5
movq [r1+r3*1], mm6
PALIGNR mm2, mm4, 4, mm5
movq [r0+r3*2], mm2
PALIGNR mm3, mm4, 6, mm4
movq [r0+r3*1], mm3
RET
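; The SSE2/SSSE3 variant below assembles the same filtered edges with
; MMX code, then packs them into XMM registers (filtered left edge and
; lt in xmm0, filtered top and top-right edges in xmm1), so each output
; row is just a 2-byte psrldq of one 16-byte interleaved result.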
%macro PRED8x8L_HORIZONTAL_DOWN 0
cglobal pred8x8l_horizontal_down_8, 4,5
sub r0, r3
lea r4, [r0+r3*2]
movq mm0, [r0+r3*1-8]
punpckhbw mm0, [r0+r3*0-8]
movq mm1, [r4+r3*1-8]
punpckhbw mm1, [r0+r3*2-8]
mov r4, r0
punpckhwd mm1, mm0
lea r0, [r0+r3*4]
movq mm2, [r0+r3*1-8]
punpckhbw mm2, [r0+r3*0-8]
lea r0, [r0+r3*2]
movq mm3, [r0+r3*1-8]
punpckhbw mm3, [r0+r3*0-8]
punpckhwd mm3, mm2
punpckhdq mm3, mm1
lea r0, [r0+r3*2]
movq mm0, [r0+r3*0-8]
movq mm1, [r4]
mov r0, r4
movq mm4, mm3
movq mm2, mm3
PALIGNR mm4, mm0, 7, mm0
PALIGNR mm1, mm2, 1, mm2
test r1, r1
jnz .do_left
.fix_lt_1:
movq mm5, mm3
pxor mm5, mm4
psrlq mm5, 56
psllq mm5, 48
pxor mm1, mm5
jmp .do_left
.fix_lt_2:
movq mm5, mm3
pxor mm5, mm2
psllq mm5, 56
psrlq mm5, 56
pxor mm2, mm5
test r2, r2
jnz .do_top
.fix_tr_1:
movq mm5, mm3
pxor mm5, mm1
psrlq mm5, 56
psllq mm5, 56
pxor mm1, mm5
jmp .do_top
.fix_tr_2:
punpckhbw mm3, mm3
pshufw mm1, mm3, 0xFF
jmp .do_topright
.do_left:
movq mm0, mm4
PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
movq2dq xmm0, mm2
pslldq xmm0, 8
movq mm4, mm0
PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
movq2dq xmm2, mm1
pslldq xmm2, 15
psrldq xmm2, 8
por xmm0, xmm2
movq mm0, [r0-8]
movq mm3, [r0]
movq mm1, [r0+8]
movq mm2, mm3
movq mm4, mm3
PALIGNR mm2, mm0, 7, mm0
PALIGNR mm1, mm4, 1, mm4
test r1, r1
jz .fix_lt_2
test r2, r2
jz .fix_tr_1
.do_top:
PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
movq2dq xmm1, mm4
test r2, r2
jz .fix_tr_2
movq mm0, [r0+8]
movq mm5, mm0
movq mm2, mm0
movq mm4, mm0
psrlq mm5, 56
PALIGNR mm2, mm3, 7, mm3
PALIGNR mm5, mm4, 1, mm4
PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
movq2dq xmm5, mm1
pslldq xmm5, 8
por xmm1, xmm5
INIT_XMM cpuname
lea r2, [r4+r3*2]
movdqa xmm2, xmm1
movdqa xmm3, xmm1
PALIGNR xmm1, xmm0, 7, xmm4
PALIGNR xmm2, xmm0, 9, xmm5
lea r1, [r2+r3*2]
PALIGNR xmm3, xmm0, 8, xmm0
movdqa xmm4, xmm1
pavgb xmm4, xmm3
lea r0, [r1+r3*2]
PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm5
punpcklbw xmm4, xmm0
movhlps xmm0, xmm4
movq [r0+r3*2], xmm4
movq [r2+r3*2], xmm0
psrldq xmm4, 2
psrldq xmm0, 2
movq [r0+r3*1], xmm4
movq [r2+r3*1], xmm0
psrldq xmm4, 2
psrldq xmm0, 2
movq [r1+r3*2], xmm4
movq [r4+r3*2], xmm0
psrldq xmm4, 2
psrldq xmm0, 2
movq [r1+r3*1], xmm4
movq [r4+r3*1], xmm0
RET
%endmacro
INIT_MMX sse2
%define PALIGNR PALIGNR_MMX
PRED8x8L_HORIZONTAL_DOWN
INIT_MMX ssse3
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_HORIZONTAL_DOWN
;-----------------------------------------------------------------------------
; void pred4x4_dc_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
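; DC prediction: one psadbw against zero sums the 4 top neighbours, the
; 4 left neighbours are added byte by byte, and the rounded mean fills
; the whole block. A hedged C-style sketch of the computation
; (illustrative only):
;   dc = (t0+t1+t2+t3 + l0+l1+l2+l3 + 4) >> 3;
;   // imul by 0x01010101 splats dc into all 4 bytes of each row store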
INIT_MMX mmxext
cglobal pred4x4_dc_8, 3,5
pxor mm7, mm7
mov r4, r0
sub r0, r2
movd mm0, [r0]
psadbw mm0, mm7
movzx r1d, byte [r0+r2*1-1]
movd r3d, mm0
add r3d, r1d
movzx r1d, byte [r0+r2*2-1]
lea r0, [r0+r2*2]
add r3d, r1d
movzx r1d, byte [r0+r2*1-1]
add r3d, r1d
movzx r1d, byte [r0+r2*2-1]
add r3d, r1d
add r3d, 4
shr r3d, 3
imul r3d, 0x01010101
mov [r4+r2*0], r3d
mov [r0+r2*0], r3d
mov [r0+r2*1], r3d
mov [r0+r2*2], r3d
RET
;-----------------------------------------------------------------------------
; void pred4x4_tm_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
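; VP8 "TrueMotion" prediction: pred[y][x] = top[x] + left[y] - topleft,
; clamped to 0..255. The sums are done on 16-bit words and the clamp
; falls out of packuswb's unsigned saturation. Roughly, in C
; (illustrative sketch only):
;   pred[y][x] = av_clip_uint8(t[x] + l[y] - lt);
; computed here as one (l[y] - lt) word splatted across a row and added
; to the zero-extended top row.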
%macro PRED4x4_TM 0
cglobal pred4x4_tm_vp8_8, 3,6
sub r0, r2
pxor mm7, mm7
movd mm0, [r0]
punpcklbw mm0, mm7
movzx r4d, byte [r0-1]
mov r5d, 2
.loop:
movzx r1d, byte [r0+r2*1-1]
movzx r3d, byte [r0+r2*2-1]
sub r1d, r4d
sub r3d, r4d
movd mm2, r1d
movd mm4, r3d
%if cpuflag(mmx2)
pshufw mm2, mm2, 0
pshufw mm4, mm4, 0
%else
punpcklwd mm2, mm2
punpcklwd mm4, mm4
punpckldq mm2, mm2
punpckldq mm4, mm4
%endif
paddw mm2, mm0
paddw mm4, mm0
packuswb mm2, mm2
packuswb mm4, mm4
movd [r0+r2*1], mm2
movd [r0+r2*2], mm4
lea r0, [r0+r2*2]
dec r5d
jg .loop
REP_RET
%endmacro
INIT_MMX mmx
PRED4x4_TM
INIT_MMX mmx2
PRED4x4_TM
INIT_XMM ssse3
cglobal pred4x4_tm_vp8_8, 3,3
sub r0, r2
movq mm6, [tm_shuf]
pxor mm1, mm1
movd mm0, [r0]
punpcklbw mm0, mm1
movd mm7, [r0-4]
pshufb mm7, mm6
lea r1, [r0+r2*2]
movd mm2, [r0+r2*1-4]
movd mm3, [r0+r2*2-4]
movd mm4, [r1+r2*1-4]
movd mm5, [r1+r2*2-4]
pshufb mm2, mm6
pshufb mm3, mm6
pshufb mm4, mm6
pshufb mm5, mm6
psubw mm2, mm7
psubw mm3, mm7
psubw mm4, mm7
psubw mm5, mm7
paddw mm2, mm0
paddw mm3, mm0
paddw mm4, mm0
paddw mm5, mm0
packuswb mm2, mm2
packuswb mm3, mm3
packuswb mm4, mm4
packuswb mm5, mm5
movd [r0+r2*1], mm2
movd [r0+r2*2], mm3
movd [r1+r2*1], mm4
movd [r1+r2*2], mm5
RET
;-----------------------------------------------------------------------------
; void pred4x4_vertical_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
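; VP8 vertical prediction: unlike plain vertical replication, the top
; row is smoothed first; all four rows are copies of the filtered edge.
; Approximately (illustrative sketch, with t[-1] = topleft):
;   pred[x] = (t[x-1] + 2*t[x] + t[x+1] + 2) >> 2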
INIT_MMX mmxext
cglobal pred4x4_vertical_vp8_8, 3,3
sub r0, r2
movd m1, [r0-1]
movd m0, [r0]
mova m2, m0 ;t0 t1 t2 t3
punpckldq m0, [r1] ;t0 t1 t2 t3 t4 t5 t6 t7
lea r1, [r0+r2*2]
psrlq m0, 8 ;t1 t2 t3 t4
PRED4x4_LOWPASS m3, m1, m0, m2, m4
movd [r0+r2*1], m3
movd [r0+r2*2], m3
movd [r1+r2*1], m3
movd [r1+r2*2], m3
RET
;-----------------------------------------------------------------------------
; void pred4x4_down_left_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
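; Diagonal down-left prediction. Top and top-right neighbours form one
; 8-byte vector; each pixel is a 3-tap filter along the diagonal, and
; each row is the same filtered vector advanced one byte (psrlq 8).
; Roughly (illustrative sketch; the last sample is replicated so the
; final t6/t7 edge case falls out):
;   pred[y][x] = (t[x+y] + 2*t[x+y+1] + t[x+y+2] + 2) >> 2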
INIT_MMX mmxext
cglobal pred4x4_down_left_8, 3,3
sub r0, r2
movq m1, [r0]
punpckldq m1, [r1]
movq m2, m1
movq m3, m1
psllq m1, 8
pxor m2, m1
psrlq m2, 8
pxor m2, m3
PRED4x4_LOWPASS m0, m1, m2, m3, m4
lea r1, [r0+r2*2]
psrlq m0, 8
movd [r0+r2*1], m0
psrlq m0, 8
movd [r0+r2*2], m0
psrlq m0, 8
movd [r1+r2*1], m0
psrlq m0, 8
movd [r1+r2*2], m0
RET
;-----------------------------------------------------------------------------
; void pred4x4_vertical_left_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
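; Vertical-left prediction: even rows are pavgb 2-tap averages of
; adjacent top pixels, odd rows the matching 3-tap filtered values;
; rows 2 and 3 reuse rows 0 and 1 shifted one byte (psrlq 8).
; Roughly (illustrative sketch):
;   y even: pred[y][x] = (t[x+y/2] + t[x+y/2+1] + 1) >> 1
;   y odd:  pred[y][x] = (t[x+y/2] + 2*t[x+y/2+1] + t[x+y/2+2] + 2) >> 2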
INIT_MMX mmxext
cglobal pred4x4_vertical_left_8, 3,3
sub r0, r2
movq m1, [r0]
punpckldq m1, [r1]
movq m3, m1
movq m2, m1
psrlq m3, 8
psrlq m2, 16
movq m4, m3
pavgb m4, m1
PRED4x4_LOWPASS m0, m1, m2, m3, m5
lea r1, [r0+r2*2]
movh [r0+r2*1], m4
movh [r0+r2*2], m0
psrlq m4, 8
psrlq m0, 8
movh [r1+r2*1], m4
movh [r1+r2*2], m0
RET
;-----------------------------------------------------------------------------
; void pred4x4_horizontal_up_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
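; Horizontal-up prediction, the 4x4 analogue of the 8x8 version earlier
; in this file: the left column is transposed into one register and
; padded by replicating l3 (so the zHU == 5 and zHU > 5 edge cases fall
; out of the same filters), then pavgb averages and 3-tap filtered
; values are interleaved and each row is written two samples further
; along; the bottom row is pure l3.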
INIT_MMX mmxext
cglobal pred4x4_horizontal_up_8, 3,3
sub r0, r2
lea r1, [r0+r2*2]
movd m0, [r0+r2*1-4]
punpcklbw m0, [r0+r2*2-4]
movd m1, [r1+r2*1-4]
punpcklbw m1, [r1+r2*2-4]
punpckhwd m0, m1
movq m1, m0
punpckhbw m1, m1
pshufw m1, m1, 0xFF
punpckhdq m0, m1
movq m2, m0
movq m3, m0
movq m7, m0
psrlq m2, 16
psrlq m3, 8
pavgb m7, m3
PRED4x4_LOWPASS m4, m0, m2, m3, m5
punpcklbw m7, m4
movd [r0+r2*1], m7
psrlq m7, 16
movd [r0+r2*2], m7
psrlq m7, 16
movd [r1+r2*1], m7
movd [r1+r2*2], m1
RET
;-----------------------------------------------------------------------------
; void pred4x4_horizontal_down_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
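; Horizontal-down prediction. The left column, lt and the top row are
; packed into one 8-byte vector (the in-line comments below track its
; layout); pavgb averages of adjacent left pixels are interleaved with
; 3-tap filtered values, and each row re-reads that result two bytes
; further along (psrlq 16 between stores), with PALIGNR assembling the
; top row from the filtered high bytes.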
INIT_MMX mmxext
%define PALIGNR PALIGNR_MMX
cglobal pred4x4_horizontal_down_8, 3,3
sub r0, r2
lea r1, [r0+r2*2]
movh m0, [r0-4] ; lt ..
punpckldq m0, [r0] ; t3 t2 t1 t0 lt .. .. ..
psllq m0, 8 ; t2 t1 t0 lt .. .. .. ..
movd m1, [r1+r2*2-4] ; l3
punpcklbw m1, [r1+r2*1-4] ; l2 l3
movd m2, [r0+r2*2-4] ; l1
punpcklbw m2, [r0+r2*1-4] ; l0 l1
punpckhwd m1, m2 ; l0 l1 l2 l3
punpckhdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
movq m0, m1
movq m2, m1
movq m5, m1
psrlq m0, 16 ; .. .. t2 t1 t0 lt l0 l1
psrlq m2, 8 ; .. t2 t1 t0 lt l0 l1 l2
pavgb m5, m2
PRED4x4_LOWPASS m3, m1, m0, m2, m4
punpcklbw m5, m3
psrlq m3, 32
PALIGNR m3, m5, 6, m4
movh [r1+r2*2], m5
psrlq m5, 16
movh [r1+r2*1], m5
psrlq m5, 16
movh [r0+r2*2], m5
movh [r0+r2*1], m3
RET
;-----------------------------------------------------------------------------
; void pred4x4_vertical_right_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
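; Vertical-right prediction: even rows are 2-tap averages of adjacent
; top-row pixels (pavgb, with lt standing in as t[-1]), odd rows the
; matching 3-tap filtered values; rows 2 and 3 repeat rows 0 and 1 with
; a filtered left-column pixel shifted in on the left (psllq + PALIGNR).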
INIT_MMX mmxext
%define PALIGNR PALIGNR_MMX
cglobal pred4x4_vertical_right_8, 3,3
sub r0, r2
lea r1, [r0+r2*2]
movh m0, [r0] ; ........t3t2t1t0
movq m5, m0
PALIGNR m0, [r0-8], 7, m1 ; ......t3t2t1t0lt
pavgb m5, m0
PALIGNR m0, [r0+r2*1-8], 7, m1 ; ....t3t2t1t0ltl0
movq m1, m0
PALIGNR m0, [r0+r2*2-8], 7, m2 ; ..t3t2t1t0ltl0l1
movq m2, m0
PALIGNR m0, [r1+r2*1-8], 7, m3 ; t3t2t1t0ltl0l1l2
PRED4x4_LOWPASS m3, m1, m0, m2, m4
movq m1, m3
psrlq m3, 16
psllq m1, 48
movh [r0+r2*1], m5
movh [r0+r2*2], m3
PALIGNR m5, m1, 7, m2
psllq m1, 8
movh [r1+r2*1], m5
PALIGNR m3, m1, 7, m1
movh [r1+r2*2], m3
RET
;-----------------------------------------------------------------------------
; void pred4x4_down_right_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
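; Diagonal down-right prediction. The left column, lt and the top row
; are packed into one 8-byte edge vector and low-pass filtered once;
; the bottom row is stored first and each higher row is the same
; filtered vector advanced one byte (psrlq 8), so every pixel is a
; 3-tap filter across the down-right diagonal through (x - y).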
INIT_MMX mmxext
%define PALIGNR PALIGNR_MMX
cglobal pred4x4_down_right_8, 3,3
sub r0, r2
lea r1, [r0+r2*2]
movq m1, [r1-8]
movq m2, [r0+r2*1-8]
punpckhbw m2, [r0-8]
movh m3, [r0]
punpckhwd m1, m2
PALIGNR m3, m1, 5, m1
movq m1, m3
PALIGNR m3, [r1+r2*1-8], 7, m4
movq m2, m3
PALIGNR m3, [r1+r2*2-8], 7, m4
PRED4x4_LOWPASS m0, m3, m1, m2, m4
movh [r1+r2*2], m0
psrlq m0, 8
movh [r1+r2*1], m0
psrlq m0, 8
movh [r0+r2*2], m0
psrlq m0, 8
movh [r0+r2*1], m0
RET