;******************************************************************************
;* H.264 intra prediction asm optimizations
;* Copyright (c) 2010 Jason Garrett-Glaser
;* Copyright (c) 2010 Holger Lubitz
;* Copyright (c) 2010 Loren Merritt
;* Copyright (c) 2010 Ronald S. Bultje
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

tm_shuf:     times 8 db 0x03, 0x80
pw_ff00:     times 8 dw 0xff00
plane_shuf:  db -8, -7, -6, -5, -4, -3, -2, -1
             db  1,  2,  3,  4,  5,  6,  7,  8
plane8_shuf: db -4, -3, -2, -1,  0,  0,  0,  0
             db  1,  2,  3,  4,  0,  0,  0,  0
pw_0to7:     dw 0, 1, 2, 3, 4, 5, 6, 7
pw_1to8:     dw 1, 2, 3, 4, 5, 6, 7, 8
pw_m8tom1:   dw -8, -7, -6, -5, -4, -3, -2, -1
pw_m4to4:    dw -4, -3, -2, -1,  1,  2,  3,  4

SECTION .text

cextern pb_1
cextern pb_3
cextern pw_4
cextern pw_5
cextern pw_8
cextern pw_16
cextern pw_17
cextern pw_32

;-----------------------------------------------------------------------------
; void pred16x16_vertical(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
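; Reading aid (not from the original source): a C-level sketch of what this
; mode computes. Each of the 16 rows is a copy of the row of pixels directly
; above the block.
;     for (int y = 0; y < 16; y++)
;         memcpy(src + y*stride, src - stride, 16);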
cglobal pred16x16_vertical_mmx, 2,3
    sub r0, r1
    mov r2, 8
    movq mm0, [r0+0]
    movq mm1, [r0+8]
.loop:
    movq [r0+r1*1+0], mm0
    movq [r0+r1*1+8], mm1
    movq [r0+r1*2+0], mm0
    movq [r0+r1*2+8], mm1
    lea r0, [r0+r1*2]
    dec r2
    jg .loop
    REP_RET

cglobal pred16x16_vertical_sse, 2,3
    sub r0, r1
    mov r2, 4
    movaps xmm0, [r0]
.loop:
    movaps [r0+r1*1], xmm0
    movaps [r0+r1*2], xmm0
    lea r0, [r0+r1*2]
    movaps [r0+r1*1], xmm0
    movaps [r0+r1*2], xmm0
    lea r0, [r0+r1*2]
    dec r2
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void pred16x16_horizontal(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
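; Reading aid (not from the original source): each row is filled with the
; pixel immediately to its left.
;     for (int y = 0; y < 16; y++)
;         memset(src + y*stride, src[y*stride - 1], 16);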
%macro PRED16x16_H 1
cglobal pred16x16_horizontal_%1, 2,3
    mov r2, 8
%ifidn %1, ssse3
    mova m2, [pb_3]
%endif
.loop:
    movd m0, [r0+r1*0-4]
    movd m1, [r0+r1*1-4]
%ifidn %1, ssse3
    pshufb m0, m2
    pshufb m1, m2
%else
    punpcklbw m0, m0
    punpcklbw m1, m1
%ifidn %1, mmxext
    pshufw m0, m0, 0xff
    pshufw m1, m1, 0xff
%else
    punpckhwd m0, m0
    punpckhwd m1, m1
    punpckhdq m0, m0
    punpckhdq m1, m1
%endif
    mova [r0+r1*0+8], m0
    mova [r0+r1*1+8], m1
%endif
    mova [r0+r1*0], m0
    mova [r0+r1*1], m1
    lea r0, [r0+r1*2]
    dec r2
    jg .loop
    REP_RET
%endmacro

INIT_MMX
PRED16x16_H mmx
PRED16x16_H mmxext
INIT_XMM
PRED16x16_H ssse3

;-----------------------------------------------------------------------------
; void pred16x16_dc(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
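; Reading aid (not from the original source): the block is filled with the
; rounded average of the 16 top and 16 left neighbours.
;     int dc = (sum(top[0..15]) + sum(left[0..15]) + 16) >> 5;
;     for (int y = 0; y < 16; y++)
;         memset(src + y*stride, dc, 16);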
%macro PRED16x16_DC 1
cglobal pred16x16_dc_%1, 2,7
    mov r4, r0
    sub r0, r1
    pxor mm0, mm0
    pxor mm1, mm1
    psadbw mm0, [r0+0]
    psadbw mm1, [r0+8]
    dec r0
    movzx r5d, byte [r0+r1*1]
    paddw mm0, mm1
    movd r6d, mm0
    lea r0, [r0+r1*2]
%rep 7
    movzx r2d, byte [r0+r1*0]
    movzx r3d, byte [r0+r1*1]
    add r5d, r2d
    add r6d, r3d
    lea r0, [r0+r1*2]
%endrep
    movzx r2d, byte [r0+r1*0]
    add r5d, r6d
    lea r2d, [r2+r5+16]
    shr r2d, 5
%ifidn %1, mmxext
    movd m0, r2d
    punpcklbw m0, m0
    pshufw m0, m0, 0
%elifidn %1, sse2
    movd m0, r2d
    punpcklbw m0, m0
    pshuflw m0, m0, 0
    punpcklqdq m0, m0
%elifidn %1, ssse3
    pxor m1, m1
    movd m0, r2d
    pshufb m0, m1
%endif
%if mmsize == 8
    mov r3d, 8
.loop:
    mova [r4+r1*0+0], m0
    mova [r4+r1*0+8], m0
    mova [r4+r1*1+0], m0
    mova [r4+r1*1+8], m0
%else
    mov r3d, 4
.loop:
    mova [r4+r1*0], m0
    mova [r4+r1*1], m0
    lea r4, [r4+r1*2]
    mova [r4+r1*0], m0
    mova [r4+r1*1], m0
%endif
    lea r4, [r4+r1*2]
    dec r3d
    jg .loop
    REP_RET
%endmacro

INIT_MMX
PRED16x16_DC mmxext
INIT_XMM
PRED16x16_DC sse2
PRED16x16_DC ssse3

;-----------------------------------------------------------------------------
; void pred16x16_tm_vp8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
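; Reading aid (not from the original source): VP8 TrueMotion prediction,
; i.e. the top row shifted by each row's left delta, clamped to 0..255.
;     for (int y = 0; y < 16; y++)
;         for (int x = 0; x < 16; x++)
;             src[y*stride + x] = clip_uint8(top[x] + left[y] - topleft);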
%macro PRED16x16_TM_MMX 1
cglobal pred16x16_tm_vp8_%1, 2,5
    sub r0, r1
    pxor mm7, mm7
    movq mm0, [r0+0]
    movq mm2, [r0+8]
    movq mm1, mm0
    movq mm3, mm2
    punpcklbw mm0, mm7
    punpckhbw mm1, mm7
    punpcklbw mm2, mm7
    punpckhbw mm3, mm7
    movzx r3d, byte [r0-1]
    mov r4d, 16
.loop:
    movzx r2d, byte [r0+r1-1]
    sub r2d, r3d
    movd mm4, r2d
%ifidn %1, mmx
    punpcklwd mm4, mm4
    punpckldq mm4, mm4
%else
    pshufw mm4, mm4, 0
%endif
    movq mm5, mm4
    movq mm6, mm4
    movq mm7, mm4
    paddw mm4, mm0
    paddw mm5, mm1
    paddw mm6, mm2
    paddw mm7, mm3
    packuswb mm4, mm5
    packuswb mm6, mm7
    movq [r0+r1+0], mm4
    movq [r0+r1+8], mm6
    add r0, r1
    dec r4d
    jg .loop
    REP_RET
%endmacro

PRED16x16_TM_MMX mmx
PRED16x16_TM_MMX mmxext

cglobal pred16x16_tm_vp8_sse2, 2,6,6
    sub r0, r1
    pxor xmm2, xmm2
    movdqa xmm0, [r0]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm2
    punpckhbw xmm1, xmm2
    movzx r4d, byte [r0-1]
    mov r5d, 8
.loop:
    movzx r2d, byte [r0+r1*1-1]
    movzx r3d, byte [r0+r1*2-1]
    sub r2d, r4d
    sub r3d, r4d
    movd xmm2, r2d
    movd xmm4, r3d
    pshuflw xmm2, xmm2, 0
    pshuflw xmm4, xmm4, 0
    punpcklqdq xmm2, xmm2
    punpcklqdq xmm4, xmm4
    movdqa xmm3, xmm2
    movdqa xmm5, xmm4
    paddw xmm2, xmm0
    paddw xmm3, xmm1
    paddw xmm4, xmm0
    paddw xmm5, xmm1
    packuswb xmm2, xmm3
    packuswb xmm4, xmm5
    movdqa [r0+r1*1], xmm2
    movdqa [r0+r1*2], xmm4
    lea r0, [r0+r1*2]
    dec r5d
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void pred16x16_plane(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
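; Reading aid (not from the original source): the H.264 16x16 plane mode this
; code implements; the rv40/svq3 variants only round H and V differently
; (see the %ifidn branches below).
;     H = sum(i = 1..8, i * (top[7+i]  - top[7-i]));    top[-1] = top-left
;     V = sum(i = 1..8, i * (left[7+i] - left[7-i]));
;     b = (5*H + 32) >> 6;  c = (5*V + 32) >> 6;
;     a = 16 * (top[15] + left[15] + 1);
;     pred(x,y) = clip_uint8((a + b*(x-7) + c*(y-7) + 16) >> 5);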
%macro H264_PRED16x16_PLANE 3
cglobal pred16x16_plane_%3_%1, 2, 7, %2
    mov r2, r1           ; +stride
    neg r1               ; -stride
    movh m0, [r0+r1-1]
%if mmsize == 8
    pxor m4, m4
    movh m1, [r0+r1+3]
    movh m2, [r0+r1+8]
    movh m3, [r0+r1+12]
    punpcklbw m0, m4
    punpcklbw m1, m4
    punpcklbw m2, m4
    punpcklbw m3, m4
    pmullw m0, [pw_m8tom1]
    pmullw m1, [pw_m8tom1+8]
    pmullw m2, [pw_1to8]
    pmullw m3, [pw_1to8+8]
    paddw m0, m2
    paddw m1, m3
%else ; mmsize == 16
%ifidn %1, sse2
    pxor m2, m2
    movh m1, [r0+r1+8]
    punpcklbw m0, m2
    punpcklbw m1, m2
    pmullw m0, [pw_m8tom1]
    pmullw m1, [pw_1to8]
    paddw m0, m1
%else ; ssse3
    movhps m0, [r0+r1+8]
    pmaddubsw m0, [plane_shuf] ; H coefficients
%endif
    movhlps m1, m0
%endif
    paddw m0, m1
%ifidn %1, mmx
    mova m1, m0
    psrlq m1, 32
%elifidn %1, mmx2
    pshufw m1, m0, 0xE
%else ; mmsize == 16
    pshuflw m1, m0, 0xE
%endif
    paddw m0, m1
%ifidn %1, mmx
    mova m1, m0
    psrlq m1, 16
%elifidn %1, mmx2
    pshufw m1, m0, 0x1
%else
    pshuflw m1, m0, 0x1
%endif
    paddw m0, m1         ; sum of H coefficients
%ifidn %3, h264
    pmullw m0, [pw_5]
    paddw m0, [pw_32]
    psraw m0, 6
%elifidn %3, rv40
    pmullw m0, [pw_5]
    psraw m0, 6
%elifidn %3, svq3
    movd r3d, m0
    movsx r3, r3w
    test r3, r3
    lea r4, [r3+3]
    cmovs r3, r4
    sar r3, 2            ; H/4
    lea r3, [r3*5]       ; 5*(H/4)
    test r3, r3
    lea r4, [r3+15]
    cmovs r3, r4
    sar r3, 4            ; (5*(H/4))/16
    movd m0, r3d
%endif
    lea r4, [r0+r2*8-1]
    lea r3, [r0+r2*4-1]
    add r4, r2
%ifdef ARCH_X86_64
%define e_reg r11
%else
%define e_reg r0
%endif
    movzx e_reg, byte [r3+r2*2]
    movzx r5, byte [r4+r1]
    sub r5, e_reg
    movzx e_reg, byte [r3+r2]
    movzx r6, byte [r4]
    sub r6, e_reg
    lea r5, [r5+r6*2]
    movzx e_reg, byte [r3+r1]
    movzx r6, byte [r4+r2*2]
    sub r6, e_reg
    lea r5, [r5+r6*4]
    movzx e_reg, byte [r3]
%ifdef ARCH_X86_64
    movzx r10, byte [r4+r2]
    sub r10, e_reg
%else
    movzx r6, byte [r4+r2]
    sub r6, e_reg
    lea r5, [r5+r6*4]
    sub r5, r6
%endif
    lea e_reg, [r3+r1*4]
    lea r3, [r4+r2*4]
    movzx r4, byte [e_reg+r2]
    movzx r6, byte [r3]
    sub r6, r4
%ifdef ARCH_X86_64
    lea r6, [r10+r6*2]
    lea r5, [r5+r6*2]
    add r5, r6
%else
    lea r5, [r5+r6*4]
    lea r5, [r5+r6*2]
%endif
    movzx r4, byte [e_reg]
%ifdef ARCH_X86_64
    movzx r10, byte [r3+r2]
    sub r10, r4
    sub r5, r10
%else
    movzx r6, byte [r3+r2]
    sub r6, r4
    lea r5, [r5+r6*8]
    sub r5, r6
%endif
    movzx r4, byte [e_reg+r1]
    movzx r6, byte [r3+r2*2]
    sub r6, r4
%ifdef ARCH_X86_64
    add r6, r10
%endif
    lea r5, [r5+r6*8]
    movzx r4, byte [e_reg+r2*2]
    movzx r6, byte [r3+r1]
    sub r6, r4
    lea r5, [r5+r6*4]
    add r5, r6           ; sum of V coefficients
%ifndef ARCH_X86_64
    mov r0, r0m
%endif
%ifidn %3, h264
    lea r5, [r5*5+32]
    sar r5, 6
%elifidn %3, rv40
    lea r5, [r5*5]
    sar r5, 6
%elifidn %3, svq3
    test r5, r5
    lea r6, [r5+3]
    cmovs r5, r6
    sar r5, 2            ; V/4
    lea r5, [r5*5]       ; 5*(V/4)
    test r5, r5
    lea r6, [r5+15]
    cmovs r5, r6
    sar r5, 4            ; (5*(V/4))/16
%endif
    movzx r4, byte [r0+r1+15]
    movzx r3, byte [r3+r2*2]
    lea r3, [r3+r4+1]
    shl r3, 4
    movd r1d, m0
    movsx r1d, r1w
    add r1d, r5d
    add r3d, r1d
    shl r1d, 3
    sub r3d, r1d         ; a
    movd m1, r5d
    movd m3, r3d
%ifidn %1, mmx
    punpcklwd m0, m0
    punpcklwd m1, m1
    punpcklwd m3, m3
    punpckldq m0, m0
    punpckldq m1, m1
    punpckldq m3, m3
%elifidn %1, mmx2
    pshufw m0, m0, 0x0
    pshufw m1, m1, 0x0
    pshufw m3, m3, 0x0
%else
    pshuflw m0, m0, 0x0
    pshuflw m1, m1, 0x0
    pshuflw m3, m3, 0x0
    punpcklqdq m0, m0    ; splat H (words)
    punpcklqdq m1, m1    ; splat V (words)
    punpcklqdq m3, m3    ; splat a (words)
%endif
%ifidn %3, svq3
    SWAP 0, 1
%endif
    mova m2, m0
%if mmsize == 8
    mova m5, m0
%endif
    pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
%if mmsize == 16
    psllw m2, 3
%else
    psllw m5, 3
    psllw m2, 2
    mova m6, m5
    paddw m6, m2
%endif
    paddw m0, m3         ; a + {0,1,2,3,4,5,6,7}*H
    paddw m2, m0         ; a + {8,9,10,11,12,13,14,15}*H
%if mmsize == 8
    paddw m5, m0         ; a + {8,9,10,11}*H
    paddw m6, m0         ; a + {12,13,14,15}*H
%endif
    mov r4, 8
.loop:
    mova m3, m0          ; b[0..7]
    mova m4, m2          ; b[8..15]
    psraw m3, 5
    psraw m4, 5
    packuswb m3, m4
    mova [r0], m3
%if mmsize == 8
    mova m3, m5          ; b[8..11]
    mova m4, m6          ; b[12..15]
    psraw m3, 5
    psraw m4, 5
    packuswb m3, m4
    mova [r0+8], m3
%endif
    paddw m0, m1
    paddw m2, m1
%if mmsize == 8
    paddw m5, m1
    paddw m6, m1
%endif
    mova m3, m0          ; b[0..7]
    mova m4, m2          ; b[8..15]
    psraw m3, 5
    psraw m4, 5
    packuswb m3, m4
    mova [r0+r2], m3
%if mmsize == 8
    mova m3, m5          ; b[8..11]
    mova m4, m6          ; b[12..15]
    psraw m3, 5
    psraw m4, 5
    packuswb m3, m4
    mova [r0+r2+8], m3
%endif
    paddw m0, m1
    paddw m2, m1
%if mmsize == 8
    paddw m5, m1
    paddw m6, m1
%endif
    lea r0, [r0+r2*2]
    dec r4
    jg .loop
    REP_RET
%endmacro

INIT_MMX
H264_PRED16x16_PLANE mmx, 0, h264
H264_PRED16x16_PLANE mmx, 0, rv40
H264_PRED16x16_PLANE mmx, 0, svq3
H264_PRED16x16_PLANE mmx2, 0, h264
H264_PRED16x16_PLANE mmx2, 0, rv40
H264_PRED16x16_PLANE mmx2, 0, svq3
INIT_XMM
H264_PRED16x16_PLANE sse2, 8, h264
H264_PRED16x16_PLANE sse2, 8, rv40
H264_PRED16x16_PLANE sse2, 8, svq3
H264_PRED16x16_PLANE ssse3, 8, h264
H264_PRED16x16_PLANE ssse3, 8, rv40
H264_PRED16x16_PLANE ssse3, 8, svq3

;-----------------------------------------------------------------------------
; void pred8x8_plane(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
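; Reading aid (not from the original source): the chroma 8x8 analogue of the
; 16x16 plane mode above, with a 17/32 rounding of the gradients.
;     H = sum(i = 1..4, i * (top[3+i]  - top[3-i]));    top[-1] = top-left
;     V = sum(i = 1..4, i * (left[3+i] - left[3-i]));
;     b = (17*H + 16) >> 5;  c = (17*V + 16) >> 5;
;     a = 16 * (top[7] + left[7] + 1);
;     pred(x,y) = clip_uint8((a + b*(x-3) + c*(y-3) + 16) >> 5);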
%macro H264_PRED8x8_PLANE 2
cglobal pred8x8_plane_%1, 2, 7, %2
    mov r2, r1           ; +stride
    neg r1               ; -stride
    movd m0, [r0+r1-1]
%if mmsize == 8
    pxor m2, m2
    movh m1, [r0+r1+4]
    punpcklbw m0, m2
    punpcklbw m1, m2
    pmullw m0, [pw_m4to4]
    pmullw m1, [pw_m4to4+8]
%else ; mmsize == 16
%ifidn %1, sse2
    pxor m2, m2
    movd m1, [r0+r1+4]
    punpckldq m0, m1
    punpcklbw m0, m2
    pmullw m0, [pw_m4to4]
%else ; ssse3
    movhps m0, [r0+r1+4] ; this reads 4 bytes more than necessary
    pmaddubsw m0, [plane8_shuf] ; H coefficients
%endif
    movhlps m1, m0
%endif
    paddw m0, m1
%ifnidn %1, ssse3
%ifidn %1, mmx
    mova m1, m0
    psrlq m1, 32
%elifidn %1, mmx2
    pshufw m1, m0, 0xE
%else ; mmsize == 16
    pshuflw m1, m0, 0xE
%endif
    paddw m0, m1
%endif ; !ssse3
%ifidn %1, mmx
    mova m1, m0
    psrlq m1, 16
%elifidn %1, mmx2
    pshufw m1, m0, 0x1
%else
    pshuflw m1, m0, 0x1
%endif
    paddw m0, m1         ; sum of H coefficients
    pmullw m0, [pw_17]
    paddw m0, [pw_16]
    psraw m0, 5
    lea r4, [r0+r2*4-1]
    lea r3, [r0-1]
    add r4, r2
%ifdef ARCH_X86_64
%define e_reg r11
%else
%define e_reg r0
%endif
    movzx e_reg, byte [r3+r2*2]
    movzx r5, byte [r4+r1]
    sub r5, e_reg
    movzx e_reg, byte [r3]
%ifdef ARCH_X86_64
    movzx r10, byte [r4+r2]
    sub r10, e_reg
    sub r5, r10
%else
    movzx r6, byte [r4+r2]
    sub r6, e_reg
    lea r5, [r5+r6*4]
    sub r5, r6
%endif
    movzx e_reg, byte [r3+r1]
    movzx r6, byte [r4+r2*2]
    sub r6, e_reg
%ifdef ARCH_X86_64
    add r6, r10
%endif
    lea r5, [r5+r6*4]
    movzx e_reg, byte [r3+r2]
    movzx r6, byte [r4]
    sub r6, e_reg
    lea r6, [r5+r6*2]
    lea r5, [r6*9+16]
    lea r5, [r5+r6*8]
    sar r5, 5
%ifndef ARCH_X86_64
    mov r0, r0m
%endif
    movzx r3, byte [r4+r2*2]
    movzx r4, byte [r0+r1+7]
    lea r3, [r3+r4+1]
    shl r3, 4
    movd r1d, m0
    movsx r1d, r1w
    add r1d, r5d
    sub r3d, r1d
    add r1d, r1d
    sub r3d, r1d         ; a
    movd m1, r5d
    movd m3, r3d
%ifidn %1, mmx
    punpcklwd m0, m0
    punpcklwd m1, m1
    punpcklwd m3, m3
    punpckldq m0, m0
    punpckldq m1, m1
    punpckldq m3, m3
%elifidn %1, mmx2
    pshufw m0, m0, 0x0
    pshufw m1, m1, 0x0
    pshufw m3, m3, 0x0
%else
    pshuflw m0, m0, 0x0
    pshuflw m1, m1, 0x0
    pshuflw m3, m3, 0x0
    punpcklqdq m0, m0    ; splat H (words)
    punpcklqdq m1, m1    ; splat V (words)
    punpcklqdq m3, m3    ; splat a (words)
%endif
%if mmsize == 8
    mova m2, m0
%endif
    pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
    paddw m0, m3         ; a + {0,1,2,3,4,5,6,7}*H
%if mmsize == 8
    psllw m2, 2
    paddw m2, m0         ; a + {4,5,6,7}*H
%endif
    mov r4, 4
ALIGN 16
.loop:
%if mmsize == 16
    mova m3, m0          ; b[0..7]
    paddw m0, m1
    psraw m3, 5
    mova m4, m0          ; V+b[0..7]
    paddw m0, m1
    psraw m4, 5
    packuswb m3, m4
    movh [r0], m3
    movhps [r0+r2], m3
%else ; mmsize == 8
    mova m3, m0          ; b[0..3]
    mova m4, m2          ; b[4..7]
    paddw m0, m1
    paddw m2, m1
    psraw m3, 5
    psraw m4, 5
    mova m5, m0          ; V+b[0..3]
    mova m6, m2          ; V+b[4..7]
    paddw m0, m1
    paddw m2, m1
    psraw m5, 5
    psraw m6, 5
    packuswb m3, m4
    packuswb m5, m6
    mova [r0], m3
    mova [r0+r2], m5
%endif
    lea r0, [r0+r2*2]
    dec r4
    jg .loop
    REP_RET
%endmacro

INIT_MMX
H264_PRED8x8_PLANE mmx, 0
H264_PRED8x8_PLANE mmx2, 0
INIT_XMM
H264_PRED8x8_PLANE sse2, 8
H264_PRED8x8_PLANE ssse3, 8

;-----------------------------------------------------------------------------
; void pred8x8_vertical(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
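; Reading aid (not from the original source): each of the 8 rows is a copy of
; the 8 pixels directly above the block.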
cglobal pred8x8_vertical_mmx, 2,2
    sub r0, r1
    movq mm0, [r0]
%rep 3
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    lea r0, [r0+r1*2]
%endrep
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    RET

;-----------------------------------------------------------------------------
; void pred8x8_horizontal(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
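; Reading aid (not from the original source): each of the 8 rows is filled
; with the pixel immediately to its left, as in the 16x16 case.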
%macro PRED8x8_H 1
cglobal pred8x8_horizontal_%1, 2,3
    mov r2, 4
%ifidn %1, ssse3
    mova m2, [pb_3]
%endif
.loop:
    movd m0, [r0+r1*0-4]
    movd m1, [r0+r1*1-4]
%ifidn %1, ssse3
    pshufb m0, m2
    pshufb m1, m2
%else
    punpcklbw m0, m0
    punpcklbw m1, m1
%ifidn %1, mmxext
    pshufw m0, m0, 0xff
    pshufw m1, m1, 0xff
%else
    punpckhwd m0, m0
    punpckhwd m1, m1
    punpckhdq m0, m0
    punpckhdq m1, m1
%endif
%endif
    mova [r0+r1*0], m0
    mova [r0+r1*1], m1
    lea r0, [r0+r1*2]
    dec r2
    jg .loop
    REP_RET
%endmacro

INIT_MMX
PRED8x8_H mmx
PRED8x8_H mmxext
PRED8x8_H ssse3

;-----------------------------------------------------------------------------
; void pred8x8_top_dc_mmxext(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
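; Reading aid (not from the original source): two DC values are taken from
; the top edge, one per 4-pixel half, and each fills its half of all 8 rows.
;     dc0 = (top[0]+top[1]+top[2]+top[3] + 2) >> 2;   columns 0..3
;     dc1 = (top[4]+top[5]+top[6]+top[7] + 2) >> 2;   columns 4..7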
%ifdef CONFIG_GPL
cglobal pred8x8_top_dc_mmxext, 2,5
    sub r0, r1
    movq mm0, [r0]
    pxor mm1, mm1
    pxor mm2, mm2
    lea r2, [r0+r1*2]
    punpckhbw mm1, mm0
    punpcklbw mm0, mm2
    psadbw mm1, mm2      ; s1
    lea r3, [r2+r1*2]
    psadbw mm0, mm2      ; s0
    psrlw mm1, 1
    psrlw mm0, 1
    pavgw mm1, mm2
    lea r4, [r3+r1*2]
    pavgw mm0, mm2
    pshufw mm1, mm1, 0
    pshufw mm0, mm0, 0   ; dc0 (w)
    packuswb mm0, mm1    ; dc0,dc1 (b)
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    lea r0, [r3+r1*2]
    movq [r2+r1*1], mm0
    movq [r2+r1*2], mm0
    movq [r3+r1*1], mm0
    movq [r3+r1*2], mm0
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    RET

;-----------------------------------------------------------------------------
; void pred8x8_dc_mmxext(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
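; Reading aid (not from the original source): with s0/s1 the sums of the
; left/right top halves and s2/s3 the sums of the upper/lower left halves,
; the four 4x4 quadrants get the usual H.264 chroma DC values:
;     dc00 = (s0 + s2 + 4) >> 3;   dc01 = (s1 + 2) >> 2;
;     dc10 = (s3 + 2) >> 2;        dc11 = (s1 + s3 + 4) >> 3;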
INIT_MMX
cglobal pred8x8_dc_mmxext, 2,5
    sub r0, r1
    pxor m7, m7
    movd m0, [r0+0]
    movd m1, [r0+4]
    psadbw m0, m7        ; s0
    mov r4, r0
    psadbw m1, m7        ; s1
    movzx r2d, byte [r0+r1*1-1]
    movzx r3d, byte [r0+r1*2-1]
    lea r0, [r0+r1*2]
    add r2d, r3d
    movzx r3d, byte [r0+r1*1-1]
    add r2d, r3d
    movzx r3d, byte [r0+r1*2-1]
    add r2d, r3d
    lea r0, [r0+r1*2]
    movd m2, r2d         ; s2
    movzx r2d, byte [r0+r1*1-1]
    movzx r3d, byte [r0+r1*2-1]
    lea r0, [r0+r1*2]
    add r2d, r3d
    movzx r3d, byte [r0+r1*1-1]
    add r2d, r3d
    movzx r3d, byte [r0+r1*2-1]
    add r2d, r3d
    movd m3, r2d         ; s3
    punpcklwd m0, m1
    mov r0, r4
    punpcklwd m2, m3
    punpckldq m0, m2     ; s0, s1, s2, s3
    pshufw m3, m0, 11110110b ; s2, s1, s3, s3
    lea r2, [r0+r1*2]
    pshufw m0, m0, 01110100b ; s0, s1, s3, s1
    paddw m0, m3
    lea r3, [r2+r1*2]
    psrlw m0, 2
    pavgw m0, m7         ; s0+s2, s1, s3, s1+s3
    lea r4, [r3+r1*2]
    packuswb m0, m0
    punpcklbw m0, m0
    movq m1, m0
    punpcklbw m0, m0
    punpckhbw m1, m1
    movq [r0+r1*1], m0
    movq [r0+r1*2], m0
    movq [r2+r1*1], m0
    movq [r2+r1*2], m0
    movq [r3+r1*1], m1
    movq [r3+r1*2], m1
    movq [r4+r1*1], m1
    movq [r4+r1*2], m1
    RET
%endif

;-----------------------------------------------------------------------------
; void pred8x8_dc_rv40(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
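; Reading aid (not from the original source): RV40 uses a single DC over the
; whole border:  dc = (sum(top[0..7]) + sum(left[0..7]) + 8) >> 4.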
cglobal pred8x8_dc_rv40_mmxext, 2,7
    mov r4, r0
    sub r0, r1
    pxor mm0, mm0
    psadbw mm0, [r0]
    dec r0
    movzx r5d, byte [r0+r1*1]
    movd r6d, mm0
    lea r0, [r0+r1*2]
%rep 3
    movzx r2d, byte [r0+r1*0]
    movzx r3d, byte [r0+r1*1]
    add r5d, r2d
    add r6d, r3d
    lea r0, [r0+r1*2]
%endrep
    movzx r2d, byte [r0+r1*0]
    add r5d, r6d
    lea r2d, [r2+r5+8]
    shr r2d, 4
    movd mm0, r2d
    punpcklbw mm0, mm0
    pshufw mm0, mm0, 0
    mov r3d, 4
.loop:
    movq [r4+r1*0], mm0
    movq [r4+r1*1], mm0
    lea r4, [r4+r1*2]
    dec r3d
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void pred8x8_tm_vp8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
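; Reading aid (not from the original source): the 8x8 version of the VP8
; TrueMotion mode above:
;     src[y*stride + x] = clip_uint8(top[x] + left[y] - topleft);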
%macro PRED8x8_TM_MMX 1
cglobal pred8x8_tm_vp8_%1, 2,6
    sub r0, r1
    pxor mm7, mm7
    movq mm0, [r0]
    movq mm1, mm0
    punpcklbw mm0, mm7
    punpckhbw mm1, mm7
    movzx r4d, byte [r0-1]
    mov r5d, 4
.loop:
    movzx r2d, byte [r0+r1*1-1]
    movzx r3d, byte [r0+r1*2-1]
    sub r2d, r4d
    sub r3d, r4d
    movd mm2, r2d
    movd mm4, r3d
%ifidn %1, mmx
    punpcklwd mm2, mm2
    punpcklwd mm4, mm4
    punpckldq mm2, mm2
    punpckldq mm4, mm4
%else
    pshufw mm2, mm2, 0
    pshufw mm4, mm4, 0
%endif
    movq mm3, mm2
    movq mm5, mm4
    paddw mm2, mm0
    paddw mm3, mm1
    paddw mm4, mm0
    paddw mm5, mm1
    packuswb mm2, mm3
    packuswb mm4, mm5
    movq [r0+r1*1], mm2
    movq [r0+r1*2], mm4
    lea r0, [r0+r1*2]
    dec r5d
    jg .loop
    REP_RET
%endmacro

PRED8x8_TM_MMX mmx
PRED8x8_TM_MMX mmxext

cglobal pred8x8_tm_vp8_sse2, 2,6,4
    sub r0, r1
    pxor xmm1, xmm1
    movq xmm0, [r0]
    punpcklbw xmm0, xmm1
    movzx r4d, byte [r0-1]
    mov r5d, 4
.loop:
    movzx r2d, byte [r0+r1*1-1]
    movzx r3d, byte [r0+r1*2-1]
    sub r2d, r4d
    sub r3d, r4d
    movd xmm2, r2d
    movd xmm3, r3d
    pshuflw xmm2, xmm2, 0
    pshuflw xmm3, xmm3, 0
    punpcklqdq xmm2, xmm2
    punpcklqdq xmm3, xmm3
    paddw xmm2, xmm0
    paddw xmm3, xmm0
    packuswb xmm2, xmm3
    movq [r0+r1*1], xmm2
    movhps [r0+r1*2], xmm2
    lea r0, [r0+r1*2]
    dec r5d
    jg .loop
    REP_RET

cglobal pred8x8_tm_vp8_ssse3, 2,3,6
    sub r0, r1
    movdqa xmm4, [tm_shuf]
    pxor xmm1, xmm1
    movq xmm0, [r0]
    punpcklbw xmm0, xmm1
    movd xmm5, [r0-4]
    pshufb xmm5, xmm4
    mov r2d, 4
.loop:
    movd xmm2, [r0+r1*1-4]
    movd xmm3, [r0+r1*2-4]
    pshufb xmm2, xmm4
    pshufb xmm3, xmm4
    psubw xmm2, xmm5
    psubw xmm3, xmm5
    paddw xmm2, xmm0
    paddw xmm3, xmm0
    packuswb xmm2, xmm3
    movq [r0+r1*1], xmm2
    movhps [r0+r1*2], xmm2
    lea r0, [r0+r1*2]
    dec r2d
    jg .loop
    REP_RET

; dest, left, right, src, tmp
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
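; The macro gets by with byte averages only; a sketch of the identity it
; relies on (not from the original source):
;     pavgb(l, r)                 = (l + r + 1) >> 1
;     pavgb(l, r) - ((l ^ r) & 1) = (l + r) >> 1       (rounded down)
;     pavgb(c, (l + r) >> 1)      = (l + 2*c + r + 2) >> 2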
%macro PRED4x4_LOWPASS 5
    mova %5, %2          ; save the left input
    pavgb %2, %3         ; (l + r + 1) >> 1
    pxor %3, %5          ; l ^ r
    mova %1, %4          ; centre sample
    pand %3, [pb_1]      ; rounding carry: (l ^ r) & 1
    psubusb %2, %3       ; (l + r) >> 1, rounded down
    pavgb %1, %2         ; (l + 2*c + r + 2) >> 2
%endmacro

;-----------------------------------------------------------------------------
; void pred8x8l_top_dc(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
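; Reading aid (not from the original source): the top edge is smoothed with
; PRED4x4_LOWPASS (patching the ends when has_topleft/has_topright is 0),
; then the block is filled with its DC:
;     dc = (sum(filtered_top[0..7]) + 4) >> 3;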
%ifdef CONFIG_GPL
%macro PRED8x8L_TOP_DC 1
cglobal pred8x8l_top_dc_%1, 4,4
    sub r0, r3
    pxor mm7, mm7
    movq mm0, [r0-8]
    movq mm3, [r0]
    movq mm1, [r0+8]
    movq mm2, mm3
    movq mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test r1, r1          ; top_left
    jz .fix_lt_2
    test r2, r2          ; top_right
    jz .fix_tr_1
    jmp .body
.fix_lt_2:
    movq mm5, mm3
    pxor mm5, mm2
    psllq mm5, 56
    psrlq mm5, 56
    pxor mm2, mm5
    test r2, r2          ; top_right
    jnz .body
.fix_tr_1:
    movq mm5, mm3
    pxor mm5, mm1
    psrlq mm5, 56
    psllq mm5, 56
    pxor mm1, mm5
.body:
    PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
    psadbw mm7, mm0
    paddw mm7, [pw_4]
    psrlw mm7, 3
    pshufw mm7, mm7, 0
    packuswb mm7, mm7
%rep 3
    movq [r0+r3*1], mm7
    movq [r0+r3*2], mm7
    lea r0, [r0+r3*2]
%endrep
    movq [r0+r3*1], mm7
    movq [r0+r3*2], mm7
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_TOP_DC mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_TOP_DC ssse3

;-----------------------------------------------------------------------------
; void pred8x8l_dc(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
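; Reading aid (not from the original source): both the left and top edges are
; lowpass-filtered, then
;     dc = (sum(filtered_left[0..7]) + sum(filtered_top[0..7]) + 8) >> 4;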
%macro PRED8x8L_DC 1
cglobal pred8x8l_dc_%1, 4,5
    sub r0, r3
    lea r4, [r0+r3*2]
    movq mm0, [r0+r3*1-8]
    punpckhbw mm0, [r0+r3*0-8]
    movq mm1, [r4+r3*1-8]
    punpckhbw mm1, [r0+r3*2-8]
    mov r4, r0
    punpckhwd mm1, mm0
    lea r0, [r0+r3*4]
    movq mm2, [r0+r3*1-8]
    punpckhbw mm2, [r0+r3*0-8]
    lea r0, [r0+r3*2]
    movq mm3, [r0+r3*1-8]
    punpckhbw mm3, [r0+r3*0-8]
    punpckhwd mm3, mm2
    punpckhdq mm3, mm1
    lea r0, [r0+r3*2]
    movq mm0, [r0+r3*0-8]
    movq mm1, [r4]
    mov r0, r4
    movq mm4, mm3
    movq mm2, mm3
    PALIGNR mm4, mm0, 7, mm0
    PALIGNR mm1, mm2, 1, mm2
    test r1, r1
    jnz .do_left
.fix_lt_1:
    movq mm5, mm3
    pxor mm5, mm4
    psrlq mm5, 56
    psllq mm5, 48
    pxor mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq mm5, mm3
    pxor mm5, mm2
    psllq mm5, 56
    psrlq mm5, 56
    pxor mm2, mm5
    test r2, r2
    jnz .body
.fix_tr_1:
    movq mm5, mm3
    pxor mm5, mm1
    psrlq mm5, 56
    psllq mm5, 56
    pxor mm1, mm5
    jmp .body
.do_left:
    movq mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq mm4, mm0
    movq mm7, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq mm1, 56
    PALIGNR mm7, mm1, 7, mm3
    movq mm0, [r0-8]
    movq mm3, [r0]
    movq mm1, [r0+8]
    movq mm2, mm3
    movq mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test r1, r1
    jz .fix_lt_2
    test r2, r2
    jz .fix_tr_1
.body:
    lea r1, [r0+r3*2]
    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
    pxor mm0, mm0
    pxor mm1, mm1
    lea r2, [r1+r3*2]
    psadbw mm0, mm7
    psadbw mm1, mm6
    paddw mm0, [pw_8]
    paddw mm0, mm1
    lea r4, [r2+r3*2]
    psrlw mm0, 4
    pshufw mm0, mm0, 0
    packuswb mm0, mm0
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm0
    movq [r1+r3*1], mm0
    movq [r1+r3*2], mm0
    movq [r2+r3*1], mm0
    movq [r2+r3*2], mm0
    movq [r4+r3*1], mm0
    movq [r4+r3*2], mm0
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_DC mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_DC ssse3

;-----------------------------------------------------------------------------
; void pred8x8l_horizontal(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
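; Reading aid (not from the original source): the left column is
; lowpass-filtered and each filtered sample is then replicated across its
; row, as in plain horizontal prediction.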
%macro PRED8x8L_HORIZONTAL 1
cglobal pred8x8l_horizontal_%1, 4,4
    sub r0, r3
    lea r2, [r0+r3*2]
    movq mm0, [r0+r3*1-8]
    punpckhbw mm0, [r0+r3*0-8]
    movq mm1, [r2+r3*1-8]
    punpckhbw mm1, [r0+r3*2-8]
    mov r2, r0
    punpckhwd mm1, mm0
    lea r0, [r0+r3*4]
    movq mm2, [r0+r3*1-8]
    punpckhbw mm2, [r0+r3*0-8]
    lea r0, [r0+r3*2]
    movq mm3, [r0+r3*1-8]
    punpckhbw mm3, [r0+r3*0-8]
    punpckhwd mm3, mm2
    punpckhdq mm3, mm1
    lea r0, [r0+r3*2]
    movq mm0, [r0+r3*0-8]
    movq mm1, [r2]
    mov r0, r2
    movq mm4, mm3
    movq mm2, mm3
    PALIGNR mm4, mm0, 7, mm0
    PALIGNR mm1, mm2, 1, mm2
    test r1, r1          ; top_left
    jnz .do_left
.fix_lt_1:
    movq mm5, mm3
    pxor mm5, mm4
    psrlq mm5, 56
    psllq mm5, 48
    pxor mm1, mm5
.do_left:
    movq mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq mm4, mm0
    movq mm7, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq mm1, 56
    PALIGNR mm7, mm1, 7, mm3
    movq mm3, mm7
    lea r1, [r0+r3*2]
    movq mm7, mm3
    punpckhbw mm3, mm3
    punpcklbw mm7, mm7
    pshufw mm0, mm3, 0xff
    pshufw mm1, mm3, 0xaa
    lea r2, [r1+r3*2]
    pshufw mm2, mm3, 0x55
    pshufw mm3, mm3, 0x00
    pshufw mm4, mm7, 0xff
    pshufw mm5, mm7, 0xaa
    pshufw mm6, mm7, 0x55
    pshufw mm7, mm7, 0x00
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm1
    movq [r1+r3*1], mm2
    movq [r1+r3*2], mm3
    movq [r2+r3*1], mm4
    movq [r2+r3*2], mm5
    lea r0, [r2+r3*2]
    movq [r0+r3*1], mm6
    movq [r0+r3*2], mm7
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_HORIZONTAL mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_HORIZONTAL ssse3

;-----------------------------------------------------------------------------
; void pred8x8l_vertical(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
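; Reading aid (not from the original source): the top row is lowpass-filtered
; and the filtered 8 pixels are copied into all 8 rows.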
%macro PRED8x8L_VERTICAL 1
cglobal pred8x8l_vertical_%1, 4,4
    sub r0, r3
    movq mm0, [r0-8]
    movq mm3, [r0]
    movq mm1, [r0+8]
    movq mm2, mm3
    movq mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test r1, r1          ; top_left
    jz .fix_lt_2
    test r2, r2          ; top_right
    jz .fix_tr_1
    jmp .body
.fix_lt_2:
    movq mm5, mm3
    pxor mm5, mm2
    psllq mm5, 56
    psrlq mm5, 56
    pxor mm2, mm5
    test r2, r2          ; top_right
    jnz .body
.fix_tr_1:
    movq mm5, mm3
    pxor mm5, mm1
    psrlq mm5, 56
    psllq mm5, 56
    pxor mm1, mm5
.body:
    PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
%rep 3
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm0
    lea r0, [r0+r3*2]
%endrep
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm0
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_VERTICAL mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_VERTICAL ssse3

;-----------------------------------------------------------------------------
; void pred8x8l_down_left(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
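; Reading aid (not from the original source): diagonal down-left over the
; filtered 16-sample top row t[]:
;     pred(x,y) = (t[x+y] + 2*t[x+y+1] + t[x+y+2] + 2) >> 2
; with the bottom-right corner (x = y = 7) using (t[14] + 3*t[15] + 2) >> 2.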
%macro PRED8x8L_DOWN_LEFT 1
cglobal pred8x8l_down_left_%1, 4,4
    sub r0, r3
    movq mm0, [r0-8]
    movq mm3, [r0]
    movq mm1, [r0+8]
    movq mm2, mm3
    movq mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test r1, r1          ; top_left
    jz .fix_lt_2
    test r2, r2          ; top_right
    jz .fix_tr_1
    jmp .do_top
.fix_lt_2:
    movq mm5, mm3
    pxor mm5, mm2
    psllq mm5, 56
    psrlq mm5, 56
    pxor mm2, mm5
    test r2, r2          ; top_right
    jnz .do_top
.fix_tr_1:
    movq mm5, mm3
    pxor mm5, mm1
    psrlq mm5, 56
    psllq mm5, 56
    pxor mm1, mm5
    jmp .do_top
.fix_tr_2:
    punpckhbw mm3, mm3
    pshufw mm1, mm3, 0xFF
    jmp .do_topright
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq xmm3, mm4
    test r2, r2          ; top_right
    jz .fix_tr_2
    movq mm0, [r0+8]
    movq mm5, mm0
    movq mm2, mm0
    movq mm4, mm0
    psrlq mm5, 56
    PALIGNR mm2, mm3, 7, mm3
    PALIGNR mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
    movq2dq xmm4, mm1
    psrlq mm1, 56
    movq2dq xmm5, mm1
    lea r1, [r0+r3*2]
    pslldq xmm4, 8
    por xmm3, xmm4
    movdqa xmm2, xmm3
    psrldq xmm2, 1
    pslldq xmm5, 15
    por xmm2, xmm5
    lea r2, [r1+r3*2]
    movdqa xmm1, xmm3
    pslldq xmm1, 1
INIT_XMM
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
    psrldq xmm0, 1
    movq [r0+r3*1], xmm0
    psrldq xmm0, 1
    movq [r0+r3*2], xmm0
    psrldq xmm0, 1
    lea r0, [r2+r3*2]
    movq [r1+r3*1], xmm0
    psrldq xmm0, 1
    movq [r1+r3*2], xmm0
    psrldq xmm0, 1
    movq [r2+r3*1], xmm0
    psrldq xmm0, 1
    movq [r2+r3*2], xmm0
    psrldq xmm0, 1
    movq [r0+r3*1], xmm0
    psrldq xmm0, 1
    movq [r0+r3*2], xmm0
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_DOWN_LEFT sse2
INIT_MMX
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_DOWN_LEFT ssse3

;-----------------------------------------------------------------------------
; void pred8x8l_down_right_mmxext(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
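; Reading aid (not from the original source): diagonal down-right; the
; filtered left column, top-left corner and top row are laid out as one
; 17-sample line, and each 45-degree diagonal of the block is filled with the
; 1-2-1 lowpass of its three neighbouring samples on that line.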
INIT_MMX
%define PALIGNR PALIGNR_MMX
cglobal pred8x8l_down_right_mmxext, 4,5
    sub r0, r3
    lea r4, [r0+r3*2]
    movq mm0, [r0+r3*1-8]
    punpckhbw mm0, [r0+r3*0-8]
    movq mm1, [r4+r3*1-8]
    punpckhbw mm1, [r0+r3*2-8]
    mov r4, r0
    punpckhwd mm1, mm0
    lea r0, [r0+r3*4]
    movq mm2, [r0+r3*1-8]
    punpckhbw mm2, [r0+r3*0-8]
    lea r0, [r0+r3*2]
    movq mm3, [r0+r3*1-8]
    punpckhbw mm3, [r0+r3*0-8]
    punpckhwd mm3, mm2
    punpckhdq mm3, mm1
    lea r0, [r0+r3*2]
    movq mm0, [r0+r3*0-8]
    movq mm1, [r4]
    mov r0, r4
    movq mm4, mm3
    movq mm2, mm3
    PALIGNR mm4, mm0, 7, mm0
    PALIGNR mm1, mm2, 1, mm2
    test r1, r1          ; top_left
    jz .fix_lt_1
.do_left:
    movq mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq mm4, mm0
    movq mm7, mm2
    movq mm6, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq mm1, 56
    PALIGNR mm7, mm1, 7, mm3
    movq mm0, [r0-8]
    movq mm3, [r0]
    movq mm1, [r0+8]
    movq mm2, mm3
    movq mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test r1, r1          ; top_left
    jz .fix_lt_2
    test r2, r2          ; top_right
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq mm5, mm4
    jmp .body
.fix_lt_1:
    movq mm5, mm3
    pxor mm5, mm4
    psrlq mm5, 56
    psllq mm5, 48
    pxor mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq mm5, mm3
    pxor mm5, mm2
    psllq mm5, 56
    psrlq mm5, 56
    pxor mm2, mm5
    test r2, r2          ; top_right
    jnz .do_top
.fix_tr_1:
    movq mm5, mm3
    pxor mm5, mm1
    psrlq mm5, 56
    psllq mm5, 56
    pxor mm1, mm5
    jmp .do_top
.body:
    lea r1, [r0+r3*2]
    movq mm1, mm7
    movq mm7, mm5
    movq mm5, mm6
    movq mm2, mm7
    lea r2, [r1+r3*2]
    PALIGNR mm2, mm6, 1, mm0
    movq mm3, mm7
    PALIGNR mm3, mm6, 7, mm0
    movq mm4, mm7
    lea r4, [r2+r3*2]
    psrlq mm4, 8
    PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
    PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
    movq [r4+r3*2], mm0
    movq mm2, mm1
    psrlq mm0, 8
    psllq mm2, 56
    psrlq mm1, 8
    por mm0, mm2
    movq [r4+r3*1], mm0
    movq mm2, mm1
    psrlq mm0, 8
    psllq mm2, 56
    psrlq mm1, 8
    por mm0, mm2
    movq [r2+r3*2], mm0
    movq mm2, mm1
    psrlq mm0, 8
    psllq mm2, 56
    psrlq mm1, 8
    por mm0, mm2
    movq [r2+r3*1], mm0
    movq mm2, mm1
    psrlq mm0, 8
    psllq mm2, 56
    psrlq mm1, 8
    por mm0, mm2
    movq [r1+r3*2], mm0
    movq mm2, mm1
    psrlq mm0, 8
    psllq mm2, 56
    psrlq mm1, 8
    por mm0, mm2
    movq [r1+r3*1], mm0
    movq mm2, mm1
    psrlq mm0, 8
    psllq mm2, 56
    psrlq mm1, 8
    por mm0, mm2
    movq [r0+r3*2], mm0
    psrlq mm0, 8
    psllq mm1, 56
    por mm0, mm1
    movq [r0+r3*1], mm0
    RET

%macro PRED8x8L_DOWN_RIGHT 1
cglobal pred8x8l_down_right_%1, 4,5
    sub r0, r3
    lea r4, [r0+r3*2]
    movq mm0, [r0+r3*1-8]
    punpckhbw mm0, [r0+r3*0-8]
    movq mm1, [r4+r3*1-8]
    punpckhbw mm1, [r0+r3*2-8]
    mov r4, r0
    punpckhwd mm1, mm0
    lea r0, [r0+r3*4]
    movq mm2, [r0+r3*1-8]
    punpckhbw mm2, [r0+r3*0-8]
    lea r0, [r0+r3*2]
    movq mm3, [r0+r3*1-8]
    punpckhbw mm3, [r0+r3*0-8]
    punpckhwd mm3, mm2
    punpckhdq mm3, mm1
    lea r0, [r0+r3*2]
    movq mm0, [r0+r3*0-8]
    movq mm1, [r4]
    mov r0, r4
    movq mm4, mm3
    movq mm2, mm3
    PALIGNR mm4, mm0, 7, mm0
    PALIGNR mm1, mm2, 1, mm2
    test r1, r1
    jz .fix_lt_1
    jmp .do_left
.fix_lt_1:
    movq mm5, mm3
    pxor mm5, mm4
    psrlq mm5, 56
    psllq mm5, 48
    pxor mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq mm5, mm3
    pxor mm5, mm2
    psllq mm5, 56
    psrlq mm5, 56
    pxor mm2, mm5
    test r2, r2
    jnz .do_top
.fix_tr_1:
    movq mm5, mm3
    pxor mm5, mm1
    psrlq mm5, 56
    psllq mm5, 56
    pxor mm1, mm5
    jmp .do_top
.do_left:
    movq mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq mm4, mm0
    movq mm7, mm2
    movq2dq xmm3, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq mm1, 56
    PALIGNR mm7, mm1, 7, mm3
    movq2dq xmm1, mm7
    movq mm0, [r0-8]
    movq mm3, [r0]
    movq mm1, [r0+8]
    movq mm2, mm3
    movq mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test r1, r1
    jz .fix_lt_2
    test r2, r2
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq xmm4, mm4
    lea r1, [r0+r3*2]
    movdqa xmm0, xmm3
    pslldq xmm4, 8
    por xmm3, xmm4
    lea r2, [r1+r3*2]
    pslldq xmm4, 1
    por xmm1, xmm4
    psrldq xmm0, 7
    pslldq xmm0, 15
    psrldq xmm0, 7
    por xmm1, xmm0
    lea r0, [r2+r3*2]
    movdqa xmm2, xmm3
    psrldq xmm2, 1
INIT_XMM
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
    movdqa xmm1, xmm0
    psrldq xmm1, 1
    movq [r0+r3*2], xmm0
    movq [r0+r3*1], xmm1
    psrldq xmm0, 2
    psrldq xmm1, 2
    movq [r2+r3*2], xmm0
    movq [r2+r3*1], xmm1
    psrldq xmm0, 2
    psrldq xmm1, 2
    movq [r1+r3*2], xmm0
    movq [r1+r3*1], xmm1
    psrldq xmm0, 2
    psrldq xmm1, 2
    movq [r4+r3*2], xmm0
    movq [r4+r3*1], xmm1
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_DOWN_RIGHT sse2
INIT_MMX
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_DOWN_RIGHT ssse3

;-----------------------------------------------------------------------------
; void pred8x8l_vertical_right(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
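; Reading aid (not from the original source): vertical-right; even rows come
; from pavgb of adjacent filtered top samples, odd rows from their 1-2-1
; lowpass, with rows progressively shifted right as the prediction moves
; down, pulling in samples derived from the left column.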
INIT_MMX
%define PALIGNR PALIGNR_MMX
cglobal pred8x8l_vertical_right_mmxext, 4,5
    sub r0, r3
    lea r4, [r0+r3*2]
    movq mm0, [r0+r3*1-8]
    punpckhbw mm0, [r0+r3*0-8]
    movq mm1, [r4+r3*1-8]
    punpckhbw mm1, [r0+r3*2-8]
    mov r4, r0
    punpckhwd mm1, mm0
    lea r0, [r0+r3*4]
    movq mm2, [r0+r3*1-8]
    punpckhbw mm2, [r0+r3*0-8]
    lea r0, [r0+r3*2]
    movq mm3, [r0+r3*1-8]
    punpckhbw mm3, [r0+r3*0-8]
    punpckhwd mm3, mm2
    punpckhdq mm3, mm1
    lea r0, [r0+r3*2]
    movq mm0, [r0+r3*0-8]
    movq mm1, [r4]
    mov r0, r4
    movq mm4, mm3
    movq mm2, mm3
    PALIGNR mm4, mm0, 7, mm0
    PALIGNR mm1, mm2, 1, mm2
    test r1, r1
    jz .fix_lt_1
    jmp .do_left
.fix_lt_1:
    movq mm5, mm3
    pxor mm5, mm4
    psrlq mm5, 56
    psllq mm5, 48
    pxor mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq mm5, mm3
    pxor mm5, mm2
    psllq mm5, 56
    psrlq mm5, 56
    pxor mm2, mm5
    test r2, r2
    jnz .do_top
.fix_tr_1:
    movq mm5, mm3
    pxor mm5, mm1
    psrlq mm5, 56
    psllq mm5, 56
    pxor mm1, mm5
    jmp .do_top
.do_left:
    movq mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq mm7, mm2
    movq mm0, [r0-8]
    movq mm3, [r0]
    movq mm1, [r0+8]
    movq mm2, mm3
    movq mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test r1, r1
    jz .fix_lt_2
    test r2, r2
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
    lea r1, [r0+r3*2]
    movq mm2, mm6
    movq mm3, mm6
    PALIGNR mm3, mm7, 7, mm0
    PALIGNR mm6, mm7, 6, mm1
    movq mm4, mm3
    pavgb mm3, mm2
    lea r2, [r1+r3*2]
    PRED4x4_LOWPASS mm0, mm6, mm2, mm4, mm5
    movq [r0+r3*1], mm3
    movq [r0+r3*2], mm0
    movq mm5, mm0
    movq mm6, mm3
    movq mm1, mm7
    movq mm2, mm1
    psllq mm2, 8
    movq mm3, mm1
    psllq mm3, 16
    lea r4, [r2+r3*2]
    PRED4x4_LOWPASS mm0, mm1, mm3, mm2, mm4
    PALIGNR mm6, mm0, 7, mm2
    movq [r1+r3*1], mm6
    psllq mm0, 8
    PALIGNR mm5, mm0, 7, mm1
    movq [r1+r3*2], mm5
    psllq mm0, 8
    PALIGNR mm6, mm0, 7, mm2
    movq [r2+r3*1], mm6
    psllq mm0, 8
    PALIGNR mm5, mm0, 7, mm1
    movq [r2+r3*2], mm5
    psllq mm0, 8
    PALIGNR mm6, mm0, 7, mm2
    movq [r4+r3*1], mm6
    psllq mm0, 8
    PALIGNR mm5, mm0, 7, mm1
    movq [r4+r3*2], mm5
    RET

%macro PRED8x8L_VERTICAL_RIGHT 1
cglobal pred8x8l_vertical_right_%1, 4,5,7
    sub r0, r3
    lea r4, [r0+r3*2]
    movq mm0, [r0+r3*1-8]
    punpckhbw mm0, [r0+r3*0-8]
    movq mm1, [r4+r3*1-8]
    punpckhbw mm1, [r0+r3*2-8]
    mov r4, r0
    punpckhwd mm1, mm0
    lea r0, [r0+r3*4]
    movq mm2, [r0+r3*1-8]
    punpckhbw mm2, [r0+r3*0-8]
    lea r0, [r0+r3*2]
    movq mm3, [r0+r3*1-8]
    punpckhbw mm3, [r0+r3*0-8]
    punpckhwd mm3, mm2
    punpckhdq mm3, mm1
    lea r0, [r0+r3*2]
    movq mm0, [r0+r3*0-8]
    movq mm1, [r4]
    mov r0, r4
    movq mm4, mm3
    movq mm2, mm3
    PALIGNR mm4, mm0, 7, mm0
    PALIGNR mm1, mm2, 1, mm2
    test r1, r1
    jnz .do_left
.fix_lt_1:
    movq mm5, mm3
    pxor mm5, mm4
    psrlq mm5, 56
    psllq mm5, 48
    pxor mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq mm5, mm3
    pxor mm5, mm2
    psllq mm5, 56
    psrlq mm5, 56
    pxor mm2, mm5
    test r2, r2
    jnz .do_top
.fix_tr_1:
    movq mm5, mm3
    pxor mm5, mm1
    psrlq mm5, 56
    psllq mm5, 56
    pxor mm1, mm5
    jmp .do_top
.do_left:
    movq mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq2dq xmm0, mm2
    movq mm0, [r0-8]
    movq mm3, [r0]
    movq mm1, [r0+8]
    movq mm2, mm3
    movq mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test r1, r1
    jz .fix_lt_2
    test r2, r2
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
    lea r1, [r0+r3*2]
    movq2dq xmm4, mm6
    pslldq xmm4, 8
    por xmm0, xmm4
    movdqa xmm6, [pw_ff00]
    movdqa xmm1, xmm0
    lea r2, [r1+r3*2]
    movdqa xmm2, xmm0
    movdqa xmm3, xmm0
    pslldq xmm0, 1
    pslldq xmm1, 2
    pavgb xmm2, xmm0
INIT_XMM
    PRED4x4_LOWPASS xmm4, xmm3, xmm1, xmm0, xmm5
    pandn xmm6, xmm4
    movdqa xmm5, xmm4
    psrlw xmm4, 8
    packuswb xmm6, xmm4
    movhlps xmm4, xmm6
    movhps [r0+r3*2], xmm5
    movhps [r0+r3*1], xmm2
    psrldq xmm5, 4
    movss xmm5, xmm6
    psrldq xmm2, 4
    movss xmm2, xmm4
    lea r0, [r2+r3*2]
    psrldq xmm5, 1
    psrldq xmm2, 1
    movq [r0+r3*2], xmm5
    movq [r0+r3*1], xmm2
    psrldq xmm5, 1
    psrldq xmm2, 1
    movq [r2+r3*2], xmm5
    movq [r2+r3*1], xmm2
    psrldq xmm5, 1
    psrldq xmm2, 1
    movq [r1+r3*2], xmm5
    movq [r1+r3*1], xmm2
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_VERTICAL_RIGHT sse2
INIT_MMX
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_VERTICAL_RIGHT ssse3

;-----------------------------------------------------------------------------
; void pred8x8l_vertical_left(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
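; Reading aid (not from the original source): vertical-left; even rows are
; pavgb of adjacent filtered top samples, odd rows their 1-2-1 lowpass, each
; row pair advancing one sample to the right.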
%macro PRED8x8L_VERTICAL_LEFT 1
cglobal pred8x8l_vertical_left_%1, 4,4
    sub r0, r3
    movq mm0, [r0-8]
    movq mm3, [r0]
    movq mm1, [r0+8]
    movq mm2, mm3
    movq mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test r1, r1
    jz .fix_lt_2
    test r2, r2
    jz .fix_tr_1
    jmp .do_top
.fix_lt_2:
    movq mm5, mm3
    pxor mm5, mm2
    psllq mm5, 56
    psrlq mm5, 56
    pxor mm2, mm5
    test r2, r2
    jnz .do_top
.fix_tr_1:
    movq mm5, mm3
    pxor mm5, mm1
    psrlq mm5, 56
    psllq mm5, 56
    pxor mm1, mm5
    jmp .do_top
.fix_tr_2:
    punpckhbw mm3, mm3
    pshufw mm1, mm3, 0xFF
    jmp .do_topright
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq xmm4, mm4
    test r2, r2
    jz .fix_tr_2
    movq mm0, [r0+8]
    movq mm5, mm0
    movq mm2, mm0
    movq mm4, mm0
    psrlq mm5, 56
    PALIGNR mm2, mm3, 7, mm3
    PALIGNR mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
    movq2dq xmm3, mm1
    lea r1, [r0+r3*2]
    pslldq xmm3, 8
    por xmm4, xmm3
    movdqa xmm2, xmm4
    movdqa xmm1, xmm4
    movdqa xmm3, xmm4
    psrldq xmm2, 1
    pslldq xmm1, 1
    pavgb xmm3, xmm2
    lea r2, [r1+r3*2]
INIT_XMM
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm4, xmm5
    psrldq xmm0, 1
    movq [r0+r3*1], xmm3
    movq [r0+r3*2], xmm0
    lea r0, [r2+r3*2]
    psrldq xmm3, 1
    psrldq xmm0, 1
    movq [r1+r3*1], xmm3
    movq [r1+r3*2], xmm0
    psrldq xmm3, 1
    psrldq xmm0, 1
    movq [r2+r3*1], xmm3
    movq [r2+r3*2], xmm0
    psrldq xmm3, 1
    psrldq xmm0, 1
    movq [r0+r3*1], xmm3
    movq [r0+r3*2], xmm0
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_VERTICAL_LEFT sse2
%define PALIGNR PALIGNR_SSSE3
INIT_MMX
PRED8x8L_VERTICAL_LEFT ssse3

;-----------------------------------------------------------------------------
; void pred8x8l_horizontal_up(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
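; Reading aid (not from the original source): horizontal-up; built from
; pairwise averages and 1-2-1 lowpasses of the filtered left column,
; interleaved and stepping down one left sample per row, replicating the
; bottom-left pixel once the column is exhausted.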
%macro PRED8x8L_HORIZONTAL_UP 1
cglobal pred8x8l_horizontal_up_%1, 4,4
    sub r0, r3
    lea r2, [r0+r3*2]
    movq mm0, [r0+r3*1-8]
    punpckhbw mm0, [r0+r3*0-8]
    movq mm1, [r2+r3*1-8]
    punpckhbw mm1, [r0+r3*2-8]
    mov r2, r0
    punpckhwd mm1, mm0
    lea r0, [r0+r3*4]
    movq mm2, [r0+r3*1-8]
    punpckhbw mm2, [r0+r3*0-8]
    lea r0, [r0+r3*2]
    movq mm3, [r0+r3*1-8]
    punpckhbw mm3, [r0+r3*0-8]
    punpckhwd mm3, mm2
    punpckhdq mm3, mm1
    lea r0, [r0+r3*2]
    movq mm0, [r0+r3*0-8]
    movq mm1, [r2]
    mov r0, r2
    movq mm4, mm3
    movq mm2, mm3
    PALIGNR mm4, mm0, 7, mm0
    PALIGNR mm1, mm2, 1, mm2
    test r1, r1
    jnz .do_left
.fix_lt_1:
    movq mm5, mm3
    pxor mm5, mm4
    psrlq mm5, 56
    psllq mm5, 48
    pxor mm1, mm5
.do_left:
    movq mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq mm4, mm0
    movq mm7, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq mm1, 56
    PALIGNR mm7, mm1, 7, mm3
    lea r1, [r0+r3*2]
    pshufw mm0, mm7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
    psllq mm7, 56              ; l7 .. .. .. .. .. .. ..
    movq mm2, mm0
    psllw mm0, 8
    psrlw mm2, 8
    por mm2, mm0               ; l7 l6 l5 l4 l3 l2 l1 l0
    movq mm3, mm2
    movq mm4, mm2
    movq mm5, mm2
    psrlq mm2, 8
    psrlq mm3, 16
    lea r2, [r1+r3*2]
    por mm2, mm7               ; l7 l7 l6 l5 l4 l3 l2 l1
    punpckhbw mm7, mm7
    por mm3, mm7               ; l7 l7 l7 l6 l5 l4 l3 l2
    pavgb mm4, mm2
    PRED4x4_LOWPASS mm1, mm3, mm5, mm2, mm6
    movq mm5, mm4
    punpcklbw mm4, mm1         ; p4 p3 p2 p1
    punpckhbw mm5, mm1         ; p8 p7 p6 p5
    movq mm6, mm5
    movq mm7, mm5
    movq mm0, mm5
    PALIGNR mm5, mm4, 2, mm1
    pshufw mm1, mm6, 11111001b
    PALIGNR mm6, mm4, 4, mm2
    pshufw mm2, mm7, 11111110b
    PALIGNR mm7, mm4, 6, mm3
    pshufw mm3, mm0, 11111111b
    movq [r0+r3*1], mm4
    movq [r0+r3*2], mm5
    lea r0, [r2+r3*2]
    movq [r1+r3*1], mm6
    movq [r1+r3*2], mm7
    movq [r2+r3*1], mm0
    movq [r2+r3*2], mm1
    movq [r0+r3*1], mm2
    movq [r0+r3*2], mm3
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_HORIZONTAL_UP mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_HORIZONTAL_UP ssse3

;-----------------------------------------------------------------------------
; void pred8x8l_horizontal_down(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
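; Reading aid (not from the original source): horizontal-down; like
; down-right but interleaving pairwise averages and 1-2-1 lowpasses of the
; left/top-left/top line, so each row is the row below it shifted by two
; samples.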
INIT_MMX
%define PALIGNR PALIGNR_MMX
cglobal pred8x8l_horizontal_down_mmxext, 4,5
sub r0, r3
lea r4, [r0+r3*2]
movq mm0, [r0+r3*1-8]
punpckhbw mm0, [r0+r3*0-8]
movq mm1, [r4+r3*1-8]
punpckhbw mm1, [r0+r3*2-8]
mov r4, r0
punpckhwd mm1, mm0
lea r0, [r0+r3*4]
movq mm2, [r0+r3*1-8]
punpckhbw mm2, [r0+r3*0-8]
lea r0, [r0+r3*2]
movq mm3, [r0+r3*1-8]
punpckhbw mm3, [r0+r3*0-8]
punpckhwd mm3, mm2
punpckhdq mm3, mm1
lea r0, [r0+r3*2]
movq mm0, [r0+r3*0-8]
movq mm1, [r4]
mov r0, r4
movq mm4, mm3
movq mm2, mm3
PALIGNR mm4, mm0, 7, mm0
PALIGNR mm1, mm2, 1, mm2
test r1, r1
jnz .do_left
.fix_lt_1:
movq mm5, mm3
pxor mm5, mm4
psrlq mm5, 56
psllq mm5, 48
pxor mm1, mm5
jmp .do_left
.fix_lt_2:
movq mm5, mm3
pxor mm5, mm2
psllq mm5, 56
psrlq mm5, 56
pxor mm2, mm5
test r2, r2
jnz .do_top
.fix_tr_1:
movq mm5, mm3
pxor mm5, mm1
psrlq mm5, 56
psllq mm5, 56
pxor mm1, mm5
jmp .do_top
.do_left:
movq mm0, mm4
PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
movq mm4, mm0
movq mm7, mm2
movq mm6, mm2
PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
psllq mm1, 56
PALIGNR mm7, mm1, 7, mm3
movq mm0, [r0-8]
movq mm3, [r0]
movq mm1, [r0+8]
movq mm2, mm3
movq mm4, mm3
PALIGNR mm2, mm0, 7, mm0
PALIGNR mm1, mm4, 1, mm4
test r1, r1
jz .fix_lt_2
test r2, r2
jz .fix_tr_1
.do_top:
PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
movq mm5, mm4
lea r1, [r0+r3*2]
psllq mm7, 56
movq mm2, mm5
movq mm3, mm6
movq mm4, mm2
PALIGNR mm2, mm6, 7, mm5
PALIGNR mm6, mm7, 7, mm0
lea r2, [r1+r3*2]
PALIGNR mm4, mm3, 1, mm7
movq mm5, mm3
pavgb mm3, mm6
PRED4x4_LOWPASS mm0, mm4, mm6, mm5, mm7
movq mm4, mm2
movq mm1, mm2
lea r4, [r2+r3*2]
psrlq mm4, 16
psrlq mm1, 8
PRED4x4_LOWPASS mm6, mm4, mm2, mm1, mm5
movq mm7, mm3
punpcklbw mm3, mm0
punpckhbw mm7, mm0
movq mm1, mm7
movq mm0, mm7
movq mm4, mm7
movq [r4+r3*2], mm3
PALIGNR mm7, mm3, 2, mm5
movq [r4+r3*1], mm7
PALIGNR mm1, mm3, 4, mm5
movq [r2+r3*2], mm1
PALIGNR mm0, mm3, 6, mm3
movq [r2+r3*1], mm0
movq mm2, mm6
movq mm3, mm6
movq [r1+r3*2], mm4
PALIGNR mm6, mm4, 2, mm5
movq [r1+r3*1], mm6
PALIGNR mm2, mm4, 4, mm5
movq [r0+r3*2], mm2
PALIGNR mm3, mm4, 6, mm4
movq [r0+r3*1], mm3
RET
%macro PRED8x8L_HORIZONTAL_DOWN 1
cglobal pred8x8l_horizontal_down_%1, 4,5
sub r0, r3
lea r4, [r0+r3*2]
movq mm0, [r0+r3*1-8]
punpckhbw mm0, [r0+r3*0-8]
movq mm1, [r4+r3*1-8]
punpckhbw mm1, [r0+r3*2-8]
mov r4, r0
punpckhwd mm1, mm0
lea r0, [r0+r3*4]
movq mm2, [r0+r3*1-8]
punpckhbw mm2, [r0+r3*0-8]
lea r0, [r0+r3*2]
movq mm3, [r0+r3*1-8]
punpckhbw mm3, [r0+r3*0-8]
punpckhwd mm3, mm2
punpckhdq mm3, mm1
lea r0, [r0+r3*2]
movq mm0, [r0+r3*0-8]
movq mm1, [r4]
mov r0, r4
movq mm4, mm3
movq mm2, mm3
PALIGNR mm4, mm0, 7, mm0
PALIGNR mm1, mm2, 1, mm2
test r1, r1
jnz .do_left
.fix_lt_1:
movq mm5, mm3
pxor mm5, mm4
psrlq mm5, 56
psllq mm5, 48
pxor mm1, mm5
jmp .do_left
.fix_lt_2:
movq mm5, mm3
pxor mm5, mm2
psllq mm5, 56
psrlq mm5, 56
pxor mm2, mm5
test r2, r2
jnz .do_top
.fix_tr_1:
movq mm5, mm3
pxor mm5, mm1
psrlq mm5, 56
psllq mm5, 56
pxor mm1, mm5
jmp .do_top
.fix_tr_2:
punpckhbw mm3, mm3
pshufw mm1, mm3, 0xFF
jmp .do_topright
.do_left:
movq mm0, mm4
PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
movq2dq xmm0, mm2
pslldq xmm0, 8
movq mm4, mm0
PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
movq2dq xmm2, mm1
pslldq xmm2, 15
psrldq xmm2, 8
por xmm0, xmm2
movq mm0, [r0-8]
movq mm3, [r0]
movq mm1, [r0+8]
movq mm2, mm3
movq mm4, mm3
PALIGNR mm2, mm0, 7, mm0
PALIGNR mm1, mm4, 1, mm4
test r1, r1
jz .fix_lt_2
test r2, r2
jz .fix_tr_1
.do_top:
PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
movq2dq xmm1, mm4
test r2, r2
jz .fix_tr_2
movq mm0, [r0+8]
movq mm5, mm0
movq mm2, mm0
movq mm4, mm0
psrlq mm5, 56
PALIGNR mm2, mm3, 7, mm3
PALIGNR mm5, mm4, 1, mm4
PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
movq2dq xmm5, mm1
pslldq xmm5, 8
por xmm1, xmm5
INIT_XMM
lea r2, [r4+r3*2]
movdqa xmm2, xmm1
movdqa xmm3, xmm1
PALIGNR xmm1, xmm0, 7, xmm4
PALIGNR xmm2, xmm0, 9, xmm5
lea r1, [r2+r3*2]
PALIGNR xmm3, xmm0, 8, xmm0
movdqa xmm4, xmm1
pavgb xmm4, xmm3
lea r0, [r1+r3*2]
PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm5
punpcklbw xmm4, xmm0
movhlps xmm0, xmm4
movq [r0+r3*2], xmm4
movq [r2+r3*2], xmm0
psrldq xmm4, 2
psrldq xmm0, 2
movq [r0+r3*1], xmm4
movq [r2+r3*1], xmm0
psrldq xmm4, 2
psrldq xmm0, 2
movq [r1+r3*2], xmm4
movq [r4+r3*2], xmm0
psrldq xmm4, 2
psrldq xmm0, 2
movq [r1+r3*1], xmm4
movq [r4+r3*1], xmm0
RET
%endmacro
INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_HORIZONTAL_DOWN sse2
INIT_MMX
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_HORIZONTAL_DOWN ssse3
%endif
;-----------------------------------------------------------------------------
; void pred4x4_dc_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
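; Scalar reference for what this computes (illustrative names: top[] is the
; row above src, left[] the column at src-1):
;     int dc = top[0]+top[1]+top[2]+top[3]
;            + left[0]+left[1]+left[2]+left[3];
;     uint8_t v = (dc + 4) >> 3;
;     for (int y = 0; y < 4; y++)
;         memset(src + y*stride, v, 4);
; The top sum is one psadbw against zero; imul by 0x01010101 broadcasts the
; DC byte across a dword so each row is a single 32-bit store.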
cglobal pred4x4_dc_mmxext, 3,5
pxor mm7, mm7
mov r4, r0
sub r0, r2
movd mm0, [r0]
psadbw mm0, mm7
movzx r1d, byte [r0+r2*1-1]
movd r3d, mm0
add r3d, r1d
movzx r1d, byte [r0+r2*2-1]
lea r0, [r0+r2*2]
add r3d, r1d
movzx r1d, byte [r0+r2*1-1]
add r3d, r1d
movzx r1d, byte [r0+r2*2-1]
add r3d, r1d
add r3d, 4
shr r3d, 3
imul r3d, 0x01010101
mov [r4+r2*0], r3d
mov [r0+r2*0], r3d
mov [r0+r2*1], r3d
mov [r0+r2*2], r3d
RET
;-----------------------------------------------------------------------------
; void pred4x4_tm_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
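; VP8 TrueMotion: every pixel is top + left - topleft, clamped to 0..255.
; Scalar sketch (illustrative names; tl is the byte at src-stride-1, and
; clip_uint8() is shorthand for clamping to 0..255):
;     for (int y = 0; y < 4; y++)
;         for (int x = 0; x < 4; x++)
;             src[y*stride+x] = clip_uint8(top[x] + left[y] - tl);
; The asm broadcasts (left[y] - tl) as words, adds the unpacked top row and
; lets packuswb do the clamping.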
%macro PRED4x4_TM_MMX 1
cglobal pred4x4_tm_vp8_%1, 3,6
sub r0, r2
pxor mm7, mm7
movd mm0, [r0]
punpcklbw mm0, mm7
movzx r4d, byte [r0-1]
mov r5d, 2
.loop:
movzx r1d, byte [r0+r2*1-1]
movzx r3d, byte [r0+r2*2-1]
sub r1d, r4d
sub r3d, r4d
movd mm2, r1d
movd mm4, r3d
%ifidn %1, mmx
punpcklwd mm2, mm2
punpcklwd mm4, mm4
punpckldq mm2, mm2
punpckldq mm4, mm4
%else
pshufw mm2, mm2, 0
pshufw mm4, mm4, 0
%endif
paddw mm2, mm0
paddw mm4, mm0
packuswb mm2, mm2
packuswb mm4, mm4
movd [r0+r2*1], mm2
movd [r0+r2*2], mm4
lea r0, [r0+r2*2]
dec r5d
jg .loop
REP_RET
%endmacro
PRED4x4_TM_MMX mmx
PRED4x4_TM_MMX mmxext
cglobal pred4x4_tm_vp8_ssse3, 3,3
sub r0, r2
movq mm6, [tm_shuf]
pxor mm1, mm1
movd mm0, [r0]
punpcklbw mm0, mm1
movd mm7, [r0-4]
pshufb mm7, mm6
lea r1, [r0+r2*2]
movd mm2, [r0+r2*1-4]
movd mm3, [r0+r2*2-4]
movd mm4, [r1+r2*1-4]
movd mm5, [r1+r2*2-4]
pshufb mm2, mm6
pshufb mm3, mm6
pshufb mm4, mm6
pshufb mm5, mm6
psubw mm2, mm7
psubw mm3, mm7
psubw mm4, mm7
psubw mm5, mm7
paddw mm2, mm0
paddw mm3, mm0
paddw mm4, mm0
paddw mm5, mm0
packuswb mm2, mm2
packuswb mm3, mm3
packuswb mm4, mm4
packuswb mm5, mm5
movd [r0+r2*1], mm2
movd [r0+r2*2], mm3
movd [r1+r2*1], mm4
movd [r1+r2*2], mm5
RET
;-----------------------------------------------------------------------------
; void pred4x4_vertical_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
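; Filtered vertical: the top row is smoothed with the 3-tap filter and
; replicated into all four rows. Scalar sketch, with t[-1..4] the pixels
; above src (t[-1] = top-left, t[4] = first top-right pixel):
;     row[x] = (t[x-1] + 2*t[x] + t[x+1] + 2) >> 2;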
INIT_MMX
cglobal pred4x4_vertical_vp8_mmxext, 3,3
sub r0, r2
movd m1, [r0-1]
movd m0, [r0]
mova m2, m0 ;t0 t1 t2 t3
punpckldq m0, [r1] ;t0 t1 t2 t3 t4 t5 t6 t7
lea r1, [r0+r2*2]
psrlq m0, 8 ;t1 t2 t3 t4
PRED4x4_LOWPASS m3, m1, m0, m2, m4
movd [r0+r2*1], m3
movd [r0+r2*2], m3
movd [r1+r2*1], m3
movd [r1+r2*2], m3
RET
;-----------------------------------------------------------------------------
; void pred4x4_down_left_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
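; Down-left (H.264 mode 3): diagonals running to the lower left, taken from
; the eight pixels above (top + topright). Scalar sketch, with t[0..7] that
; row; the min() reproduces the spec's reuse of t[7] on the last diagonal:
;     pred[y][x] = (t[x+y] + 2*t[x+y+1] + t[min(x+y+2, 7)] + 2) >> 2;
; The pxor/psrlq/pxor sequence below builds the t[7]-clamped neighbour
; operand without a branch.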
%ifdef CONFIG_GPL
INIT_MMX
cglobal pred4x4_down_left_mmxext, 3,3
sub r0, r2
movq m1, [r0]
punpckldq m1, [r1]
movq m2, m1
movq m3, m1
movq m4, m1
psllq m1, 8
pxor m2, m1
psrlq m2, 8
pxor m3, m2
PRED4x4_LOWPASS m0, m1, m3, m4, m5
lea r1, [r0+r2*2]
psrlq m0, 8
movd [r0+r2*1], m0
psrlq m0, 8
movd [r0+r2*2], m0
psrlq m0, 8
movd [r1+r2*1], m0
psrlq m0, 8
movd [r1+r2*2], m0
RET
;-----------------------------------------------------------------------------
; void pred4x4_vertical_left_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
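; Vertical-left (H.264 mode 7): even rows are 2-tap averages of the top
; row, odd rows are 3-tap filtered, and rows 2/3 are rows 0/1 advanced by
; one sample. Scalar sketch, with t[0..7] = top + topright:
;     pred[0][x] = (t[x] + t[x+1] + 1) >> 1;
;     pred[1][x] = (t[x] + 2*t[x+1] + t[x+2] + 2) >> 2;
;     pred[2][x] = pred[0][x+1];
;     pred[3][x] = pred[1][x+1];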
INIT_MMX
cglobal pred4x4_vertical_left_mmxext, 3,3
sub r0, r2
movq m1, [r0]
punpckldq m1, [r1]
movq m3, m1
movq m2, m1
psrlq m3, 8
psrlq m2, 16
movq m4, m3
pavgb m4, m1
PRED4x4_LOWPASS m0, m1, m2, m3, m5
lea r1, [r0+r2*2]
movh [r0+r2*1], m4
movh [r0+r2*2], m0
psrlq m4, 8
psrlq m0, 8
movh [r1+r2*1], m4
movh [r1+r2*2], m0
RET
;-----------------------------------------------------------------------------
; void pred4x4_horizontal_up_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
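; Horizontal-up (H.264 mode 8): built from the left column l[0..3], with
; out-of-range left samples clamped to l[3]. Scalar sketch, using the
; spec's zHU = x + 2*y:
;     zHU even: pred[y][x] = (l[y+x/2] + l[y+x/2+1] + 1) >> 1;
;     zHU odd:  pred[y][x] = (l[y+x/2] + 2*l[y+x/2+1] + l[y+x/2+2] + 2) >> 2;
;     zHU > 5:  pred[y][x] = l[3];
; The asm gets the clamping for free by padding the packed left column with
; copies of l[3] (the punpckhbw/pshufw 0xFF pair).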
INIT_MMX
cglobal pred4x4_horizontal_up_mmxext, 3,3
sub r0, r2
lea r1, [r0+r2*2]
movq m0, [r0+r2*1-8]
punpckhbw m0, [r0+r2*2-8]
movq m1, [r1+r2*1-8]
punpckhbw m1, [r1+r2*2-8]
punpckhwd m0, m1
movq m1, m0
punpckhbw m1, m1
pshufw m1, m1, 0xFF
punpckhdq m0, m1
movq m2, m0
movq m3, m0
movq m7, m0
psrlq m2, 16
psrlq m3, 8
pavgb m7, m3
PRED4x4_LOWPASS m4, m0, m2, m3, m5
punpcklbw m7, m4
movd [r0+r2*1], m7
psrlq m7, 16
movd [r0+r2*2], m7
psrlq m7, 16
movd [r1+r2*1], m7
movd [r1+r2*2], m1
RET
%endif