;******************************************************************************
;* H.264 intra prediction asm optimizations
;* Copyright (c) 2010 Jason Garrett-Glaser
;* Copyright (c) 2010 Holger Lubitz
;* Copyright (c) 2010 Loren Merritt
;* Copyright (c) 2010 Ronald S. Bultje
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

tm_shuf:     times 8 db 0x03, 0x80
pw_ff00:     times 8 dw 0xff00
plane_shuf:  db -8, -7, -6, -5, -4, -3, -2, -1
             db  1,  2,  3,  4,  5,  6,  7,  8
plane8_shuf: db -4, -3, -2, -1,  0,  0,  0,  0
             db  1,  2,  3,  4,  0,  0,  0,  0
pw_0to7:     dw 0, 1, 2, 3, 4, 5, 6, 7
pw_1to8:     dw 1, 2, 3, 4, 5, 6, 7, 8
pw_m8tom1:   dw -8, -7, -6, -5, -4, -3, -2, -1
pw_m4to4:    dw -4, -3, -2, -1, 1, 2, 3, 4

SECTION .text

cextern pb_1
cextern pb_3
cextern pw_4
cextern pw_5
cextern pw_8
cextern pw_16
cextern pw_17
cextern pw_32

;-----------------------------------------------------------------------------
; void pred16x16_vertical(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
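; The row above the block is loaded once and stored into all 16 rows.
; Rough C equivalent (illustrative sketch only):
;   const uint8_t *top = src - stride;
;   for (int y = 0; y < 16; y++)
;       memcpy(src + y * stride, top, 16);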
cglobal pred16x16_vertical_mmx, 2,3
    sub r0, r1
    mov r2, 8
    movq mm0, [r0+0]
    movq mm1, [r0+8]
.loop:
    movq [r0+r1*1+0], mm0
    movq [r0+r1*1+8], mm1
    movq [r0+r1*2+0], mm0
    movq [r0+r1*2+8], mm1
    lea r0, [r0+r1*2]
    dec r2
    jg .loop
    REP_RET

cglobal pred16x16_vertical_sse, 2,3
    sub r0, r1
    mov r2, 4
    movaps xmm0, [r0]
.loop:
    movaps [r0+r1*1], xmm0
    movaps [r0+r1*2], xmm0
    lea r0, [r0+r1*2]
    movaps [r0+r1*1], xmm0
    movaps [r0+r1*2], xmm0
    lea r0, [r0+r1*2]
    dec r2
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void pred16x16_horizontal(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
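; Each row is filled with the pixel to its left; the punpck/pshufw/pshufb
; variants below are three ways of splatting that byte. Rough C sketch:
;   for (int y = 0; y < 16; y++)
;       memset(src + y * stride, src[y * stride - 1], 16);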
%macro PRED16x16_H 1
cglobal pred16x16_horizontal_%1, 2,3
    mov r2, 8
%ifidn %1, ssse3
    mova m2, [pb_3]
%endif
.loop:
    movd m0, [r0+r1*0-4]
    movd m1, [r0+r1*1-4]
%ifidn %1, ssse3
    pshufb m0, m2
    pshufb m1, m2
%else
    punpcklbw m0, m0
    punpcklbw m1, m1
%ifidn %1, mmxext
    pshufw m0, m0, 0xff
    pshufw m1, m1, 0xff
%else
    punpckhwd m0, m0
    punpckhwd m1, m1
    punpckhdq m0, m0
    punpckhdq m1, m1
%endif
    mova [r0+r1*0+8], m0
    mova [r0+r1*1+8], m1
%endif
    mova [r0+r1*0], m0
    mova [r0+r1*1], m1
    lea r0, [r0+r1*2]
    dec r2
    jg .loop
    REP_RET
%endmacro

INIT_MMX
PRED16x16_H mmx
PRED16x16_H mmxext
INIT_XMM
PRED16x16_H ssse3

;-----------------------------------------------------------------------------
; void pred16x16_dc(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
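; psadbw sums the top row; the scalar loop sums the left column. Sketch of
; the computation (all neighbours assumed available):
;   dc = 16; // rounding
;   for (int i = 0; i < 16; i++)
;       dc += src[i - stride] + src[i * stride - 1];
;   dc >>= 5; // then splat dc over the 16x16 block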
%macro PRED16x16_DC 1
cglobal pred16x16_dc_%1, 2,7
    mov r4, r0
    sub r0, r1
    pxor mm0, mm0
    pxor mm1, mm1
    psadbw mm0, [r0+0]
    psadbw mm1, [r0+8]
    dec r0
    movzx r5d, byte [r0+r1*1]
    paddw mm0, mm1
    movd r6d, mm0
    lea r0, [r0+r1*2]
%rep 7
    movzx r2d, byte [r0+r1*0]
    movzx r3d, byte [r0+r1*1]
    add r5d, r2d
    add r6d, r3d
    lea r0, [r0+r1*2]
%endrep
    movzx r2d, byte [r0+r1*0]
    add r5d, r6d
    lea r2d, [r2+r5+16]
    shr r2d, 5
%ifidn %1, mmxext
    movd m0, r2d
    punpcklbw m0, m0
    pshufw m0, m0, 0
%elifidn %1, sse2
    movd m0, r2d
    punpcklbw m0, m0
    pshuflw m0, m0, 0
    punpcklqdq m0, m0
%elifidn %1, ssse3
    pxor m1, m1
    movd m0, r2d
    pshufb m0, m1
%endif
%if mmsize == 8
    mov r3d, 8
.loop:
    mova [r4+r1*0+0], m0
    mova [r4+r1*0+8], m0
    mova [r4+r1*1+0], m0
    mova [r4+r1*1+8], m0
%else
    mov r3d, 4
.loop:
    mova [r4+r1*0], m0
    mova [r4+r1*1], m0
    lea r4, [r4+r1*2]
    mova [r4+r1*0], m0
    mova [r4+r1*1], m0
%endif
    lea r4, [r4+r1*2]
    dec r3d
    jg .loop
    REP_RET
%endmacro

INIT_MMX
PRED16x16_DC mmxext
INIT_XMM
PRED16x16_DC sse2
PRED16x16_DC ssse3

;-----------------------------------------------------------------------------
; void pred16x16_tm_vp8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
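; VP8 TrueMotion: roughly dst[y][x] = clip_uint8(top[x] + left[y] - topleft);
; the left-minus-topleft delta is splatted per row and packuswb provides the
; clamp.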
%macro PRED16x16_TM_MMX 1
cglobal pred16x16_tm_vp8_%1, 2,5
    sub r0, r1
    pxor mm7, mm7
    movq mm0, [r0+0]
    movq mm2, [r0+8]
    movq mm1, mm0
    movq mm3, mm2
    punpcklbw mm0, mm7
    punpckhbw mm1, mm7
    punpcklbw mm2, mm7
    punpckhbw mm3, mm7
    movzx r3d, byte [r0-1]
    mov r4d, 16
.loop:
    movzx r2d, byte [r0+r1-1]
    sub r2d, r3d
    movd mm4, r2d
%ifidn %1, mmx
    punpcklwd mm4, mm4
    punpckldq mm4, mm4
%else
    pshufw mm4, mm4, 0
%endif
    movq mm5, mm4
    movq mm6, mm4
    movq mm7, mm4
    paddw mm4, mm0
    paddw mm5, mm1
    paddw mm6, mm2
    paddw mm7, mm3
    packuswb mm4, mm5
    packuswb mm6, mm7
    movq [r0+r1+0], mm4
    movq [r0+r1+8], mm6
    add r0, r1
    dec r4d
    jg .loop
    REP_RET
%endmacro

PRED16x16_TM_MMX mmx
PRED16x16_TM_MMX mmxext

cglobal pred16x16_tm_vp8_sse2, 2,6,6
    sub r0, r1
    pxor xmm2, xmm2
    movdqa xmm0, [r0]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm2
    punpckhbw xmm1, xmm2
    movzx r4d, byte [r0-1]
    mov r5d, 8
.loop:
    movzx r2d, byte [r0+r1*1-1]
    movzx r3d, byte [r0+r1*2-1]
    sub r2d, r4d
    sub r3d, r4d
    movd xmm2, r2d
    movd xmm4, r3d
    pshuflw xmm2, xmm2, 0
    pshuflw xmm4, xmm4, 0
    punpcklqdq xmm2, xmm2
    punpcklqdq xmm4, xmm4
    movdqa xmm3, xmm2
    movdqa xmm5, xmm4
    paddw xmm2, xmm0
    paddw xmm3, xmm1
    paddw xmm4, xmm0
    paddw xmm5, xmm1
    packuswb xmm2, xmm3
    packuswb xmm4, xmm5
    movdqa [r0+r1*1], xmm2
    movdqa [r0+r1*2], xmm4
    lea r0, [r0+r1*2]
    dec r5d
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void pred16x16_plane(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
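; H.264 16x16 plane mode (rv40/svq3 differ only in the scaling of H and V,
; see the %ifidn branches below). Per the spec, roughly:
;   H = sum(i * (top[7+i] - top[7-i]), i = 1..8)
;   V = sum(i * (left[7+i] - left[7-i]), i = 1..8)
;   b = (5*H + 32) >> 6,  c = (5*V + 32) >> 6,  a = 16*(top[15] + left[15])
;   pred[y][x] = clip((a + b*(x-7) + c*(y-7) + 16) >> 5)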
%macro H264_PRED16x16_PLANE 3
cglobal pred16x16_plane_%3_%1, 2, 7, %2
    mov r2, r1 ; +stride
    neg r1 ; -stride
    movh m0, [r0+r1 -1]
%if mmsize == 8
    pxor m4, m4
    movh m1, [r0+r1 +3 ]
    movh m2, [r0+r1 +8 ]
    movh m3, [r0+r1 +12]
    punpcklbw m0, m4
    punpcklbw m1, m4
    punpcklbw m2, m4
    punpcklbw m3, m4
    pmullw m0, [pw_m8tom1 ]
    pmullw m1, [pw_m8tom1+8]
    pmullw m2, [pw_1to8 ]
    pmullw m3, [pw_1to8 +8]
    paddw m0, m2
    paddw m1, m3
%else ; mmsize == 16
%ifidn %1, sse2
    pxor m2, m2
    movh m1, [r0+r1 +8]
    punpcklbw m0, m2
    punpcklbw m1, m2
    pmullw m0, [pw_m8tom1]
    pmullw m1, [pw_1to8]
    paddw m0, m1
%else ; ssse3
    movhps m0, [r0+r1 +8]
    pmaddubsw m0, [plane_shuf] ; H coefficients
%endif
    movhlps m1, m0
%endif
    paddw m0, m1
%ifidn %1, mmx
    mova m1, m0
    psrlq m1, 32
%elifidn %1, mmx2
    pshufw m1, m0, 0xE
%else ; mmsize == 16
    pshuflw m1, m0, 0xE
%endif
    paddw m0, m1
%ifidn %1, mmx
    mova m1, m0
    psrlq m1, 16
%elifidn %1, mmx2
    pshufw m1, m0, 0x1
%else
    pshuflw m1, m0, 0x1
%endif
    paddw m0, m1 ; sum of H coefficients
    lea r4, [r0+r2*8-1]
    lea r3, [r0+r2*4-1]
    add r4, r2
%ifdef ARCH_X86_64
%define e_reg r11
%else
%define e_reg r0
%endif
    movzx e_reg, byte [r3+r2*2 ]
    movzx r5, byte [r4+r1 ]
    sub r5, e_reg
    movzx e_reg, byte [r3+r2 ]
    movzx r6, byte [r4 ]
    sub r6, e_reg
    lea r5, [r5+r6*2]
    movzx e_reg, byte [r3+r1 ]
    movzx r6, byte [r4+r2*2 ]
    sub r6, e_reg
    lea r5, [r5+r6*4]
    movzx e_reg, byte [r3 ]
%ifdef ARCH_X86_64
    movzx r10, byte [r4+r2 ]
    sub r10, e_reg
%else
    movzx r6, byte [r4+r2 ]
    sub r6, e_reg
    lea r5, [r5+r6*4]
    sub r5, r6
%endif
    lea e_reg, [r3+r1*4]
    lea r3, [r4+r2*4]
    movzx r4, byte [e_reg+r2 ]
    movzx r6, byte [r3 ]
    sub r6, r4
%ifdef ARCH_X86_64
    lea r6, [r10+r6*2]
    lea r5, [r5+r6*2]
    add r5, r6
%else
    lea r5, [r5+r6*4]
    lea r5, [r5+r6*2]
%endif
    movzx r4, byte [e_reg ]
%ifdef ARCH_X86_64
    movzx r10, byte [r3 +r2 ]
    sub r10, r4
    sub r5, r10
%else
    movzx r6, byte [r3 +r2 ]
    sub r6, r4
    lea r5, [r5+r6*8]
    sub r5, r6
%endif
    movzx r4, byte [e_reg+r1 ]
    movzx r6, byte [r3 +r2*2]
    sub r6, r4
%ifdef ARCH_X86_64
    add r6, r10
%endif
    lea r5, [r5+r6*8]
    movzx r4, byte [e_reg+r2*2]
    movzx r6, byte [r3 +r1 ]
    sub r6, r4
    lea r5, [r5+r6*4]
    add r5, r6 ; sum of V coefficients
%ifndef ARCH_X86_64
    mov r0, r0m
%endif
%ifidn %3, h264
    lea r5, [r5*5+32]
    sar r5, 6
%elifidn %3, rv40
    lea r5, [r5*5]
    sar r5, 6
%elifidn %3, svq3
    test r5, r5
    lea r6, [r5+3]
    cmovs r5, r6
    sar r5, 2 ; V/4
    lea r5, [r5*5] ; 5*(V/4)
    test r5, r5
    lea r6, [r5+15]
    cmovs r5, r6
    sar r5, 4 ; (5*(V/4))/16
%endif
    movzx r4, byte [r0+r1 +15]
    movzx r3, byte [r3+r2*2 ]
    lea r3, [r3+r4+1]
    shl r3, 4
    movd r1d, m0
    movsx r1d, r1w
%ifnidn %3, svq3
%ifidn %3, h264
    lea r1d, [r1d*5+32]
%else ; rv40
    lea r1d, [r1d*5]
%endif
    sar r1d, 6
%else ; svq3
    test r1d, r1d
    lea r4d, [r1d+3]
    cmovs r1d, r4d
    sar r1d, 2 ; H/4
    lea r1d, [r1d*5] ; 5*(H/4)
    test r1d, r1d
    lea r4d, [r1d+15]
    cmovs r1d, r4d
    sar r1d, 4 ; (5*(H/4))/16
%endif
    movd m0, r1d
    add r1d, r5d
    add r3d, r1d
    shl r1d, 3
    sub r3d, r1d ; a
    movd m1, r5d
    movd m3, r3d
%ifidn %1, mmx
    punpcklwd m0, m0
    punpcklwd m1, m1
    punpcklwd m3, m3
    punpckldq m0, m0
    punpckldq m1, m1
    punpckldq m3, m3
%elifidn %1, mmx2
    pshufw m0, m0, 0x0
    pshufw m1, m1, 0x0
    pshufw m3, m3, 0x0
%else
    pshuflw m0, m0, 0x0
    pshuflw m1, m1, 0x0
    pshuflw m3, m3, 0x0
    punpcklqdq m0, m0 ; splat H (words)
    punpcklqdq m1, m1 ; splat V (words)
    punpcklqdq m3, m3 ; splat a (words)
%endif
%ifidn %3, svq3
    SWAP 0, 1
%endif
    mova m2, m0
%if mmsize == 8
    mova m5, m0
%endif
    pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
%if mmsize == 16
    psllw m2, 3
%else
    psllw m5, 3
    psllw m2, 2
    mova m6, m5
    paddw m6, m2
%endif
    paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
    paddw m2, m0 ; a + {8,9,10,11,12,13,14,15}*H
%if mmsize == 8
    paddw m5, m0 ; a + {8,9,10,11}*H
    paddw m6, m0 ; a + {12,13,14,15}*H
%endif
    mov r4, 8
.loop:
    mova m3, m0 ; b[0..7]
    mova m4, m2 ; b[8..15]
    psraw m3, 5
    psraw m4, 5
    packuswb m3, m4
    mova [r0], m3
%if mmsize == 8
    mova m3, m5 ; b[8..11]
    mova m4, m6 ; b[12..15]
    psraw m3, 5
    psraw m4, 5
    packuswb m3, m4
    mova [r0+8], m3
%endif
    paddw m0, m1
    paddw m2, m1
%if mmsize == 8
    paddw m5, m1
    paddw m6, m1
%endif
    mova m3, m0 ; b[0..7]
    mova m4, m2 ; b[8..15]
    psraw m3, 5
    psraw m4, 5
    packuswb m3, m4
    mova [r0+r2], m3
%if mmsize == 8
    mova m3, m5 ; b[8..11]
    mova m4, m6 ; b[12..15]
    psraw m3, 5
    psraw m4, 5
    packuswb m3, m4
    mova [r0+r2+8], m3
%endif
    paddw m0, m1
    paddw m2, m1
%if mmsize == 8
    paddw m5, m1
    paddw m6, m1
%endif
    lea r0, [r0+r2*2]
    dec r4
    jg .loop
    REP_RET
%endmacro

INIT_MMX
H264_PRED16x16_PLANE mmx, 0, h264
H264_PRED16x16_PLANE mmx, 0, rv40
H264_PRED16x16_PLANE mmx, 0, svq3
H264_PRED16x16_PLANE mmx2, 0, h264
H264_PRED16x16_PLANE mmx2, 0, rv40
H264_PRED16x16_PLANE mmx2, 0, svq3
INIT_XMM
H264_PRED16x16_PLANE sse2, 8, h264
H264_PRED16x16_PLANE sse2, 8, rv40
H264_PRED16x16_PLANE sse2, 8, svq3
H264_PRED16x16_PLANE ssse3, 8, h264
H264_PRED16x16_PLANE ssse3, 8, rv40
H264_PRED16x16_PLANE ssse3, 8, svq3

;-----------------------------------------------------------------------------
; void pred8x8_plane(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
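; Chroma-style 8x8 plane mode: same structure as pred16x16_plane, but with
; 4-tap gradients and 17/32 scaling (the imul 17 / sar 5 below), roughly:
;   H = sum(i * (top[3+i] - top[3-i]), i = 1..4)
;   b = (17*H + 16) >> 5  (and likewise c from the left column)
;   pred[y][x] = clip((16*(top[7] + left[7]) + b*(x-3) + c*(y-3) + 16) >> 5)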
%macro H264_PRED8x8_PLANE 2
cglobal pred8x8_plane_%1, 2, 7, %2
    mov r2, r1 ; +stride
    neg r1 ; -stride
    movd m0, [r0+r1 -1]
%if mmsize == 8
    pxor m2, m2
    movh m1, [r0+r1 +4 ]
    punpcklbw m0, m2
    punpcklbw m1, m2
    pmullw m0, [pw_m4to4]
    pmullw m1, [pw_m4to4+8]
%else ; mmsize == 16
%ifidn %1, sse2
    pxor m2, m2
    movd m1, [r0+r1 +4]
    punpckldq m0, m1
    punpcklbw m0, m2
    pmullw m0, [pw_m4to4]
%else ; ssse3
    movhps m0, [r0+r1 +4] ; this reads 4 bytes more than necessary
    pmaddubsw m0, [plane8_shuf] ; H coefficients
%endif
    movhlps m1, m0
%endif
    paddw m0, m1
%ifnidn %1, ssse3
%ifidn %1, mmx
    mova m1, m0
    psrlq m1, 32
%elifidn %1, mmx2
    pshufw m1, m0, 0xE
%else ; mmsize == 16
    pshuflw m1, m0, 0xE
%endif
    paddw m0, m1
%endif ; !ssse3
%ifidn %1, mmx
    mova m1, m0
    psrlq m1, 16
%elifidn %1, mmx2
    pshufw m1, m0, 0x1
%else
    pshuflw m1, m0, 0x1
%endif
    paddw m0, m1 ; sum of H coefficients
    lea r4, [r0+r2*4-1]
    lea r3, [r0 -1]
    add r4, r2
%ifdef ARCH_X86_64
%define e_reg r11
%else
%define e_reg r0
%endif
    movzx e_reg, byte [r3+r2*2 ]
    movzx r5, byte [r4+r1 ]
    sub r5, e_reg
    movzx e_reg, byte [r3 ]
%ifdef ARCH_X86_64
    movzx r10, byte [r4+r2 ]
    sub r10, e_reg
    sub r5, r10
%else
    movzx r6, byte [r4+r2 ]
    sub r6, e_reg
    lea r5, [r5+r6*4]
    sub r5, r6
%endif
    movzx e_reg, byte [r3+r1 ]
    movzx r6, byte [r4+r2*2 ]
    sub r6, e_reg
%ifdef ARCH_X86_64
    add r6, r10
%endif
    lea r5, [r5+r6*4]
    movzx e_reg, byte [r3+r2 ]
    movzx r6, byte [r4 ]
    sub r6, e_reg
    lea r6, [r5+r6*2]
    lea r5, [r6*9+16]
    lea r5, [r5+r6*8]
    sar r5, 5
%ifndef ARCH_X86_64
    mov r0, r0m
%endif
    movzx r3, byte [r4+r2*2 ]
    movzx r4, byte [r0+r1 +7]
    lea r3, [r3+r4+1]
    shl r3, 4
    movd r1d, m0
    movsx r1d, r1w
    imul r1d, 17
    add r1d, 16
    sar r1d, 5
    movd m0, r1d
    add r1d, r5d
    sub r3d, r1d
    add r1d, r1d
    sub r3d, r1d ; a
    movd m1, r5d
    movd m3, r3d
%ifidn %1, mmx
    punpcklwd m0, m0
    punpcklwd m1, m1
    punpcklwd m3, m3
    punpckldq m0, m0
    punpckldq m1, m1
    punpckldq m3, m3
%elifidn %1, mmx2
    pshufw m0, m0, 0x0
    pshufw m1, m1, 0x0
    pshufw m3, m3, 0x0
%else
    pshuflw m0, m0, 0x0
    pshuflw m1, m1, 0x0
    pshuflw m3, m3, 0x0
    punpcklqdq m0, m0 ; splat H (words)
    punpcklqdq m1, m1 ; splat V (words)
    punpcklqdq m3, m3 ; splat a (words)
%endif
%if mmsize == 8
    mova m2, m0
%endif
    pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
    paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
%if mmsize == 8
    psllw m2, 2
    paddw m2, m0 ; a + {4,5,6,7}*H
%endif
    mov r4, 4
ALIGN 16
.loop:
%if mmsize == 16
    mova m3, m0 ; b[0..7]
    paddw m0, m1
    psraw m3, 5
    mova m4, m0 ; V+b[0..7]
    paddw m0, m1
    psraw m4, 5
    packuswb m3, m4
    movh [r0], m3
    movhps [r0+r2], m3
%else ; mmsize == 8
    mova m3, m0 ; b[0..3]
    mova m4, m2 ; b[4..7]
    paddw m0, m1
    paddw m2, m1
    psraw m3, 5
    psraw m4, 5
    mova m5, m0 ; V+b[0..3]
    mova m6, m2 ; V+b[4..7]
    paddw m0, m1
    paddw m2, m1
    psraw m5, 5
    psraw m6, 5
    packuswb m3, m4
    packuswb m5, m6
    mova [r0], m3
    mova [r0+r2], m5
%endif
    lea r0, [r0+r2*2]
    dec r4
    jg .loop
    REP_RET
%endmacro

INIT_MMX
H264_PRED8x8_PLANE mmx, 0
H264_PRED8x8_PLANE mmx2, 0
INIT_XMM
H264_PRED8x8_PLANE sse2, 8
H264_PRED8x8_PLANE ssse3, 8

;-----------------------------------------------------------------------------
; void pred8x8_vertical(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
cglobal pred8x8_vertical_mmx, 2,2
    sub r0, r1
    movq mm0, [r0]
%rep 3
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    lea r0, [r0+r1*2]
%endrep
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    RET

;-----------------------------------------------------------------------------
; void pred8x8_horizontal(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8_H 1
cglobal pred8x8_horizontal_%1, 2,3
    mov r2, 4
%ifidn %1, ssse3
    mova m2, [pb_3]
%endif
.loop:
    movd m0, [r0+r1*0-4]
    movd m1, [r0+r1*1-4]
%ifidn %1, ssse3
    pshufb m0, m2
    pshufb m1, m2
%else
    punpcklbw m0, m0
    punpcklbw m1, m1
%ifidn %1, mmxext
    pshufw m0, m0, 0xff
    pshufw m1, m1, 0xff
%else
    punpckhwd m0, m0
    punpckhwd m1, m1
    punpckhdq m0, m0
    punpckhdq m1, m1
%endif
%endif
    mova [r0+r1*0], m0
    mova [r0+r1*1], m1
    lea r0, [r0+r1*2]
    dec r2
    jg .loop
    REP_RET
%endmacro

INIT_MMX
PRED8x8_H mmx
PRED8x8_H mmxext
PRED8x8_H ssse3

;-----------------------------------------------------------------------------
; void pred8x8_top_dc_mmxext(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%ifdef CONFIG_GPL
cglobal pred8x8_top_dc_mmxext, 2,5
    sub r0, r1
    movq mm0, [r0]
    pxor mm1, mm1
    pxor mm2, mm2
    lea r2, [r0+r1*2]
    punpckhbw mm1, mm0
    punpcklbw mm0, mm2
    psadbw mm1, mm2 ; s1
    lea r3, [r2+r1*2]
    psadbw mm0, mm2 ; s0
    psrlw mm1, 1
    psrlw mm0, 1
    pavgw mm1, mm2
    lea r4, [r3+r1*2]
    pavgw mm0, mm2
    pshufw mm1, mm1, 0
    pshufw mm0, mm0, 0 ; dc0 (w)
    packuswb mm0, mm1 ; dc0,dc1 (b)
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    lea r0, [r3+r1*2]
    movq [r2+r1*1], mm0
    movq [r2+r1*2], mm0
    movq [r3+r1*1], mm0
    movq [r3+r1*2], mm0
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    RET

;-----------------------------------------------------------------------------
; void pred8x8_dc_mmxext(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
INIT_MMX
cglobal pred8x8_dc_mmxext, 2,5
    sub r0, r1
    pxor m7, m7
    movd m0, [r0+0]
    movd m1, [r0+4]
    psadbw m0, m7 ; s0
    mov r4, r0
    psadbw m1, m7 ; s1
    movzx r2d, byte [r0+r1*1-1]
    movzx r3d, byte [r0+r1*2-1]
    lea r0, [r0+r1*2]
    add r2d, r3d
    movzx r3d, byte [r0+r1*1-1]
    add r2d, r3d
    movzx r3d, byte [r0+r1*2-1]
    add r2d, r3d
    lea r0, [r0+r1*2]
    movd m2, r2d ; s2
    movzx r2d, byte [r0+r1*1-1]
    movzx r3d, byte [r0+r1*2-1]
    lea r0, [r0+r1*2]
    add r2d, r3d
    movzx r3d, byte [r0+r1*1-1]
    add r2d, r3d
    movzx r3d, byte [r0+r1*2-1]
    add r2d, r3d
    movd m3, r2d ; s3
    punpcklwd m0, m1
    mov r0, r4
    punpcklwd m2, m3
    punpckldq m0, m2 ; s0, s1, s2, s3
    pshufw m3, m0, 11110110b ; s2, s1, s3, s3
    lea r2, [r0+r1*2]
    pshufw m0, m0, 01110100b ; s0, s1, s3, s1
    paddw m0, m3
    lea r3, [r2+r1*2]
    psrlw m0, 2
    pavgw m0, m7 ; s0+s2, s1, s3, s1+s3
    lea r4, [r3+r1*2]
    packuswb m0, m0
    punpcklbw m0, m0
    movq m1, m0
    punpcklbw m0, m0
    punpckhbw m1, m1
    movq [r0+r1*1], m0
    movq [r0+r1*2], m0
    movq [r2+r1*1], m0
    movq [r2+r1*2], m0
    movq [r3+r1*1], m1
    movq [r3+r1*2], m1
    movq [r4+r1*1], m1
    movq [r4+r1*2], m1
    RET
%endif

;-----------------------------------------------------------------------------
; void pred8x8_dc_rv40(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
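; RV40 uses a single average over all 16 neighbours, roughly:
;   dc = (sum(top[0..7]) + sum(left[0..7]) + 8) >> 4;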
cglobal pred8x8_dc_rv40_mmxext, 2,7
    mov r4, r0
    sub r0, r1
    pxor mm0, mm0
    psadbw mm0, [r0]
    dec r0
    movzx r5d, byte [r0+r1*1]
    movd r6d, mm0
    lea r0, [r0+r1*2]
%rep 3
    movzx r2d, byte [r0+r1*0]
    movzx r3d, byte [r0+r1*1]
    add r5d, r2d
    add r6d, r3d
    lea r0, [r0+r1*2]
%endrep
    movzx r2d, byte [r0+r1*0]
    add r5d, r6d
    lea r2d, [r2+r5+8]
    shr r2d, 4
    movd mm0, r2d
    punpcklbw mm0, mm0
    pshufw mm0, mm0, 0
    mov r3d, 4
.loop:
    movq [r4+r1*0], mm0
    movq [r4+r1*1], mm0
    lea r4, [r4+r1*2]
    dec r3d
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void pred8x8_tm_vp8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8_TM_MMX 1
cglobal pred8x8_tm_vp8_%1, 2,6
    sub r0, r1
    pxor mm7, mm7
    movq mm0, [r0]
    movq mm1, mm0
    punpcklbw mm0, mm7
    punpckhbw mm1, mm7
    movzx r4d, byte [r0-1]
    mov r5d, 4
.loop:
    movzx r2d, byte [r0+r1*1-1]
    movzx r3d, byte [r0+r1*2-1]
    sub r2d, r4d
    sub r3d, r4d
    movd mm2, r2d
    movd mm4, r3d
%ifidn %1, mmx
    punpcklwd mm2, mm2
    punpcklwd mm4, mm4
    punpckldq mm2, mm2
    punpckldq mm4, mm4
%else
    pshufw mm2, mm2, 0
    pshufw mm4, mm4, 0
%endif
    movq mm3, mm2
    movq mm5, mm4
    paddw mm2, mm0
    paddw mm3, mm1
    paddw mm4, mm0
    paddw mm5, mm1
    packuswb mm2, mm3
    packuswb mm4, mm5
    movq [r0+r1*1], mm2
    movq [r0+r1*2], mm4
    lea r0, [r0+r1*2]
    dec r5d
    jg .loop
    REP_RET
%endmacro

PRED8x8_TM_MMX mmx
PRED8x8_TM_MMX mmxext

cglobal pred8x8_tm_vp8_sse2, 2,6,4
    sub r0, r1
    pxor xmm1, xmm1
    movq xmm0, [r0]
    punpcklbw xmm0, xmm1
    movzx r4d, byte [r0-1]
    mov r5d, 4
.loop:
    movzx r2d, byte [r0+r1*1-1]
    movzx r3d, byte [r0+r1*2-1]
    sub r2d, r4d
    sub r3d, r4d
    movd xmm2, r2d
    movd xmm3, r3d
    pshuflw xmm2, xmm2, 0
    pshuflw xmm3, xmm3, 0
    punpcklqdq xmm2, xmm2
    punpcklqdq xmm3, xmm3
    paddw xmm2, xmm0
    paddw xmm3, xmm0
    packuswb xmm2, xmm3
    movq [r0+r1*1], xmm2
    movhps [r0+r1*2], xmm2
    lea r0, [r0+r1*2]
    dec r5d
    jg .loop
    REP_RET

cglobal pred8x8_tm_vp8_ssse3, 2,3,6
    sub r0, r1
    movdqa xmm4, [tm_shuf]
    pxor xmm1, xmm1
    movq xmm0, [r0]
    punpcklbw xmm0, xmm1
    movd xmm5, [r0-4]
    pshufb xmm5, xmm4
    mov r2d, 4
.loop:
    movd xmm2, [r0+r1*1-4]
    movd xmm3, [r0+r1*2-4]
    pshufb xmm2, xmm4
    pshufb xmm3, xmm4
    psubw xmm2, xmm5
    psubw xmm3, xmm5
    paddw xmm2, xmm0
    paddw xmm3, xmm0
    packuswb xmm2, xmm3
    movq [r0+r1*1], xmm2
    movhps [r0+r1*2], xmm2
    lea r0, [r0+r1*2]
    dec r2d
    jg .loop
    REP_RET

; dest, left, right, src, tmp
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
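; With avg(x,y) = (x + y + 1) >> 1 (pavgb), this relies on the identity
;   (a + 2*b + c + 2) >> 2 == avg(b, avg(a, c) - ((a ^ c) & 1))
; pavgb rounds up, so subtracting the ((a ^ c) & 1) carry bit (the pand/
; psubusb pair) yields the truncated average and keeps the rounding exact.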
%macro PRED4x4_LOWPASS 5
    mova %5, %2          ; save left
    pavgb %2, %3         ; (left + right + 1) >> 1
    pxor %3, %5          ; left ^ right
    mova %1, %4          ; dest = src
    pand %3, [pb_1]      ; (left ^ right) & 1: rounding correction
    psubusb %2, %3       ; (left + right) >> 1
    pavgb %1, %2         ; (src + ((left + right) >> 1) + 1) >> 1
%endmacro

;-----------------------------------------------------------------------------
; void pred8x8l_top_dc(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
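; The pred8x8l_* predictors lowpass-filter their neighbours first (H.264
; 8x8 reference-sample filtering); top_dc then averages the filtered top
; row, roughly:
;   dc = (sum(top'[0..7]) + 4) >> 3;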
%ifdef CONFIG_GPL
%macro PRED8x8L_TOP_DC 1
cglobal pred8x8l_top_dc_%1, 4,4
    sub r0, r3
    pxor mm7, mm7
    movq mm0, [r0-8]
    movq mm3, [r0]
    movq mm1, [r0+8]
    movq mm2, mm3
    movq mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test r1, r1 ; top_left
    jz .fix_lt_2
    test r2, r2 ; top_right
    jz .fix_tr_1
    jmp .body
.fix_lt_2:
    movq mm5, mm3
    pxor mm5, mm2
    psllq mm5, 56
    psrlq mm5, 56
    pxor mm2, mm5
    test r2, r2 ; top_right
    jnz .body
.fix_tr_1:
    movq mm5, mm3
    pxor mm5, mm1
    psrlq mm5, 56
    psllq mm5, 56
    pxor mm1, mm5
.body:
    PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
    psadbw mm7, mm0
    paddw mm7, [pw_4]
    psrlw mm7, 3
    pshufw mm7, mm7, 0
    packuswb mm7, mm7
%rep 3
    movq [r0+r3*1], mm7
    movq [r0+r3*2], mm7
    lea r0, [r0+r3*2]
%endrep
    movq [r0+r3*1], mm7
    movq [r0+r3*2], mm7
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_TOP_DC mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_TOP_DC ssse3

;-----------------------------------------------------------------------------
; void pred8x8l_dc(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
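; As above, but over both filtered edges:
;   dc = (sum(top'[0..7]) + sum(left'[0..7]) + 8) >> 4;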
%macro PRED8x8L_DC 1
cglobal pred8x8l_dc_%1, 4,5
    sub r0, r3
    lea r4, [r0+r3*2]
    movq mm0, [r0+r3*1-8]
    punpckhbw mm0, [r0+r3*0-8]
    movq mm1, [r4+r3*1-8]
    punpckhbw mm1, [r0+r3*2-8]
    mov r4, r0
    punpckhwd mm1, mm0
    lea r0, [r0+r3*4]
    movq mm2, [r0+r3*1-8]
    punpckhbw mm2, [r0+r3*0-8]
    lea r0, [r0+r3*2]
    movq mm3, [r0+r3*1-8]
    punpckhbw mm3, [r0+r3*0-8]
    punpckhwd mm3, mm2
    punpckhdq mm3, mm1
    lea r0, [r0+r3*2]
    movq mm0, [r0+r3*0-8]
    movq mm1, [r4]
    mov r0, r4
    movq mm4, mm3
    movq mm2, mm3
    PALIGNR mm4, mm0, 7, mm0
    PALIGNR mm1, mm2, 1, mm2
    test r1, r1
    jnz .do_left
.fix_lt_1:
    movq mm5, mm3
    pxor mm5, mm4
    psrlq mm5, 56
    psllq mm5, 48
    pxor mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq mm5, mm3
    pxor mm5, mm2
    psllq mm5, 56
    psrlq mm5, 56
    pxor mm2, mm5
    test r2, r2
    jnz .body
.fix_tr_1:
    movq mm5, mm3
    pxor mm5, mm1
    psrlq mm5, 56
    psllq mm5, 56
    pxor mm1, mm5
    jmp .body
.do_left:
    movq mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq mm4, mm0
    movq mm7, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq mm1, 56
    PALIGNR mm7, mm1, 7, mm3
    movq mm0, [r0-8]
    movq mm3, [r0]
    movq mm1, [r0+8]
    movq mm2, mm3
    movq mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test r1, r1
    jz .fix_lt_2
    test r2, r2
    jz .fix_tr_1
.body:
    lea r1, [r0+r3*2]
    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
    pxor mm0, mm0
    pxor mm1, mm1
    lea r2, [r1+r3*2]
    psadbw mm0, mm7
    psadbw mm1, mm6
    paddw mm0, [pw_8]
    paddw mm0, mm1
    lea r4, [r2+r3*2]
    psrlw mm0, 4
    pshufw mm0, mm0, 0
    packuswb mm0, mm0
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm0
    movq [r1+r3*1], mm0
    movq [r1+r3*2], mm0
    movq [r2+r3*1], mm0
    movq [r2+r3*2], mm0
    movq [r4+r3*1], mm0
    movq [r4+r3*2], mm0
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_DC mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_DC ssse3

;-----------------------------------------------------------------------------
; void pred8x8l_horizontal(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
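; Like pred8x8_horizontal, but each row is filled with the lowpass-filtered
; left-column sample rather than the raw pixel; the punpckhbw chain gathers
; the left column into one register first.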
%macro PRED8x8L_HORIZONTAL 1
cglobal pred8x8l_horizontal_%1, 4,4
    sub r0, r3
    lea r2, [r0+r3*2]
    movq mm0, [r0+r3*1-8]
    test r1, r1
    lea r1, [r0+r3]
    cmovnz r1, r0
    punpckhbw mm0, [r1+r3*0-8]
    movq mm1, [r2+r3*1-8]
    punpckhbw mm1, [r0+r3*2-8]
    mov r2, r0
    punpckhwd mm1, mm0
    lea r0, [r0+r3*4]
    movq mm2, [r0+r3*1-8]
    punpckhbw mm2, [r0+r3*0-8]
    lea r0, [r0+r3*2]
    movq mm3, [r0+r3*1-8]
    punpckhbw mm3, [r0+r3*0-8]
    punpckhwd mm3, mm2
    punpckhdq mm3, mm1
    lea r0, [r0+r3*2]
    movq mm0, [r0+r3*0-8]
    movq mm1, [r1+r3*0-8]
    mov r0, r2
    movq mm4, mm3
    movq mm2, mm3
    PALIGNR mm4, mm0, 7, mm0
    PALIGNR mm1, mm2, 1, mm2
    movq mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq mm4, mm0
    movq mm7, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq mm1, 56
    PALIGNR mm7, mm1, 7, mm3
    movq mm3, mm7
    lea r1, [r0+r3*2]
    movq mm7, mm3
    punpckhbw mm3, mm3
    punpcklbw mm7, mm7
    pshufw mm0, mm3, 0xff
    pshufw mm1, mm3, 0xaa
    lea r2, [r1+r3*2]
    pshufw mm2, mm3, 0x55
    pshufw mm3, mm3, 0x00
    pshufw mm4, mm7, 0xff
    pshufw mm5, mm7, 0xaa
    pshufw mm6, mm7, 0x55
    pshufw mm7, mm7, 0x00
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm1
    movq [r1+r3*1], mm2
    movq [r1+r3*2], mm3
    movq [r2+r3*1], mm4
    movq [r2+r3*2], mm5
    lea r0, [r2+r3*2]
    movq [r0+r3*1], mm6
    movq [r0+r3*2], mm7
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_HORIZONTAL mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_HORIZONTAL ssse3

;-----------------------------------------------------------------------------
; void pred8x8l_vertical(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
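; pred8x8_vertical with a filtered top edge: the PRED4x4_LOWPASS result is
; replicated into all eight rows.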
%macro PRED8x8L_VERTICAL 1
cglobal pred8x8l_vertical_%1, 4,4
    sub r0, r3
    movq mm0, [r0-8]
    movq mm3, [r0]
    movq mm1, [r0+8]
    movq mm2, mm3
    movq mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test r1, r1 ; top_left
    jz .fix_lt_2
    test r2, r2 ; top_right
    jz .fix_tr_1
    jmp .body
.fix_lt_2:
    movq mm5, mm3
    pxor mm5, mm2
    psllq mm5, 56
    psrlq mm5, 56
    pxor mm2, mm5
    test r2, r2 ; top_right
    jnz .body
.fix_tr_1:
    movq mm5, mm3
    pxor mm5, mm1
    psrlq mm5, 56
    psllq mm5, 56
    pxor mm1, mm5
.body:
    PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
%rep 3
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm0
    lea r0, [r0+r3*2]
%endrep
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm0
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_VERTICAL mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_VERTICAL ssse3

;-----------------------------------------------------------------------------
; void pred8x8l_down_left(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
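; Diagonal down-left over the filtered top + top-right samples t'[0..15],
; roughly:
;   pred[y][x] = (t'[x+y] + 2*t'[x+y+1] + t'[x+y+2] + 2) >> 2
; so each row is the same filtered vector advanced by one byte (psrldq in
; the SSE2 version, the psllq/psrlq/por chains in the MMX one).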
INIT_MMX
%define PALIGNR PALIGNR_MMX
cglobal pred8x8l_down_left_mmxext, 4,5
    sub r0, r3
    movq mm0, [r0-8]
    movq mm3, [r0]
    movq mm1, [r0+8]
    movq mm2, mm3
    movq mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test r1, r1
    jz .fix_lt_2
    test r2, r2
    jz .fix_tr_1
    jmp .do_top
.fix_lt_2:
    movq mm5, mm3
    pxor mm5, mm2
    psllq mm5, 56
    psrlq mm5, 56
    pxor mm2, mm5
    test r2, r2
    jnz .do_top
.fix_tr_1:
    movq mm5, mm3
    pxor mm5, mm1
    psrlq mm5, 56
    psllq mm5, 56
    pxor mm1, mm5
    jmp .do_top
.fix_tr_2:
    punpckhbw mm3, mm3
    pshufw mm1, mm3, 0xFF
    jmp .do_topright
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq mm7, mm4
    test r2, r2
    jz .fix_tr_2
    movq mm0, [r0+8]
    movq mm5, mm0
    movq mm2, mm0
    movq mm4, mm0
    psrlq mm5, 56
    PALIGNR mm2, mm3, 7, mm3
    PALIGNR mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
    lea r1, [r0+r3*2]
    movq mm6, mm1
    psrlq mm1, 56
    movq mm4, mm1
    lea r2, [r1+r3*2]
    movq mm2, mm6
    PALIGNR mm2, mm7, 1, mm0
    movq mm3, mm6
    PALIGNR mm3, mm7, 7, mm0
    PALIGNR mm4, mm6, 1, mm0
    movq mm5, mm7
    movq mm1, mm7
    movq mm7, mm6
    lea r4, [r2+r3*2]
    psllq mm1, 8
    PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
    PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
    movq [r4+r3*2], mm1
    movq mm2, mm0
    psllq mm1, 8
    psrlq mm2, 56
    psllq mm0, 8
    por mm1, mm2
    movq [r4+r3*1], mm1
    movq mm2, mm0
    psllq mm1, 8
    psrlq mm2, 56
    psllq mm0, 8
    por mm1, mm2
    movq [r2+r3*2], mm1
    movq mm2, mm0
    psllq mm1, 8
    psrlq mm2, 56
    psllq mm0, 8
    por mm1, mm2
    movq [r2+r3*1], mm1
    movq mm2, mm0
    psllq mm1, 8
    psrlq mm2, 56
    psllq mm0, 8
    por mm1, mm2
    movq [r1+r3*2], mm1
    movq mm2, mm0
    psllq mm1, 8
    psrlq mm2, 56
    psllq mm0, 8
    por mm1, mm2
    movq [r1+r3*1], mm1
    movq mm2, mm0
    psllq mm1, 8
    psrlq mm2, 56
    psllq mm0, 8
    por mm1, mm2
    movq [r0+r3*2], mm1
    psllq mm1, 8
    psrlq mm0, 56
    por mm1, mm0
    movq [r0+r3*1], mm1
    RET

%macro PRED8x8L_DOWN_LEFT 1
cglobal pred8x8l_down_left_%1, 4,4
    sub r0, r3
    movq mm0, [r0-8]
    movq mm3, [r0]
    movq mm1, [r0+8]
    movq mm2, mm3
    movq mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test r1, r1 ; top_left
    jz .fix_lt_2
    test r2, r2 ; top_right
    jz .fix_tr_1
    jmp .do_top
.fix_lt_2:
    movq mm5, mm3
    pxor mm5, mm2
    psllq mm5, 56
    psrlq mm5, 56
    pxor mm2, mm5
    test r2, r2 ; top_right
    jnz .do_top
.fix_tr_1:
    movq mm5, mm3
    pxor mm5, mm1
    psrlq mm5, 56
    psllq mm5, 56
    pxor mm1, mm5
    jmp .do_top
.fix_tr_2:
    punpckhbw mm3, mm3
    pshufw mm1, mm3, 0xFF
    jmp .do_topright
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq xmm3, mm4
    test r2, r2 ; top_right
    jz .fix_tr_2
    movq mm0, [r0+8]
    movq mm5, mm0
    movq mm2, mm0
    movq mm4, mm0
    psrlq mm5, 56
    PALIGNR mm2, mm3, 7, mm3
    PALIGNR mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
    movq2dq xmm4, mm1
    psrlq mm1, 56
    movq2dq xmm5, mm1
    lea r1, [r0+r3*2]
    pslldq xmm4, 8
    por xmm3, xmm4
    movdqa xmm2, xmm3
    psrldq xmm2, 1
    pslldq xmm5, 15
    por xmm2, xmm5
    lea r2, [r1+r3*2]
    movdqa xmm1, xmm3
    pslldq xmm1, 1
INIT_XMM
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
    psrldq xmm0, 1
    movq [r0+r3*1], xmm0
    psrldq xmm0, 1
    movq [r0+r3*2], xmm0
    psrldq xmm0, 1
    lea r0, [r2+r3*2]
    movq [r1+r3*1], xmm0
    psrldq xmm0, 1
    movq [r1+r3*2], xmm0
    psrldq xmm0, 1
    movq [r2+r3*1], xmm0
    psrldq xmm0, 1
    movq [r2+r3*2], xmm0
    psrldq xmm0, 1
    movq [r0+r3*1], xmm0
    psrldq xmm0, 1
    movq [r0+r3*2], xmm0
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_DOWN_LEFT sse2
INIT_MMX
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_DOWN_LEFT ssse3

;-----------------------------------------------------------------------------
; void pred8x8l_down_right_mmxext(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
INIT_MMX
%define PALIGNR PALIGNR_MMX
cglobal pred8x8l_down_right_mmxext, 4,5
    sub r0, r3
    lea r4, [r0+r3*2]
    movq mm0, [r0+r3*1-8]
    punpckhbw mm0, [r0+r3*0-8]
    movq mm1, [r4+r3*1-8]
    punpckhbw mm1, [r0+r3*2-8]
    mov r4, r0
    punpckhwd mm1, mm0
    lea r0, [r0+r3*4]
    movq mm2, [r0+r3*1-8]
    punpckhbw mm2, [r0+r3*0-8]
    lea r0, [r0+r3*2]
    movq mm3, [r0+r3*1-8]
    punpckhbw mm3, [r0+r3*0-8]
    punpckhwd mm3, mm2
    punpckhdq mm3, mm1
    lea r0, [r0+r3*2]
    movq mm0, [r0+r3*0-8]
    movq mm1, [r4]
    mov r0, r4
    movq mm4, mm3
    movq mm2, mm3
    PALIGNR mm4, mm0, 7, mm0
    PALIGNR mm1, mm2, 1, mm2
    test r1, r1 ; top_left
    jz .fix_lt_1
.do_left:
    movq mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq mm4, mm0
    movq mm7, mm2
    movq mm6, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq mm1, 56
    PALIGNR mm7, mm1, 7, mm3
    movq mm0, [r0-8]
    movq mm3, [r0]
    movq mm1, [r0+8]
    movq mm2, mm3
    movq mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test r1, r1 ; top_left
    jz .fix_lt_2
    test r2, r2 ; top_right
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq mm5, mm4
    jmp .body
.fix_lt_1:
    movq mm5, mm3
    pxor mm5, mm4
    psrlq mm5, 56
    psllq mm5, 48
    pxor mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq mm5, mm3
    pxor mm5, mm2
    psllq mm5, 56
    psrlq mm5, 56
    pxor mm2, mm5
    test r2, r2 ; top_right
    jnz .do_top
.fix_tr_1:
    movq mm5, mm3
    pxor mm5, mm1
    psrlq mm5, 56
    psllq mm5, 56
    pxor mm1, mm5
    jmp .do_top
.body:
    lea r1, [r0+r3*2]
    movq mm1, mm7
    movq mm7, mm5
    movq mm5, mm6
    movq mm2, mm7
    lea r2, [r1+r3*2]
    PALIGNR mm2, mm6, 1, mm0
    movq mm3, mm7
    PALIGNR mm3, mm6, 7, mm0
    movq mm4, mm7
    lea r4, [r2+r3*2]
    psrlq mm4, 8
    PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
    PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
    movq [r4+r3*2], mm0
    movq mm2, mm1
    psrlq mm0, 8
    psllq mm2, 56
    psrlq mm1, 8
    por mm0, mm2
    movq [r4+r3*1], mm0
    movq mm2, mm1
    psrlq mm0, 8
    psllq mm2, 56
    psrlq mm1, 8
    por mm0, mm2
    movq [r2+r3*2], mm0
    movq mm2, mm1
    psrlq mm0, 8
    psllq mm2, 56
    psrlq mm1, 8
    por mm0, mm2
    movq [r2+r3*1], mm0
    movq mm2, mm1
    psrlq mm0, 8
    psllq mm2, 56
    psrlq mm1, 8
    por mm0, mm2
    movq [r1+r3*2], mm0
    movq mm2, mm1
    psrlq mm0, 8
    psllq mm2, 56
    psrlq mm1, 8
    por mm0, mm2
    movq [r1+r3*1], mm0
    movq mm2, mm1
    psrlq mm0, 8
    psllq mm2, 56
    psrlq mm1, 8
    por mm0, mm2
    movq [r0+r3*2], mm0
    psrlq mm0, 8
    psllq mm1, 56
    por mm0, mm1
    movq [r0+r3*1], mm0
    RET

%macro PRED8x8L_DOWN_RIGHT 1
cglobal pred8x8l_down_right_%1, 4,5
    sub r0, r3
    lea r4, [r0+r3*2]
    movq mm0, [r0+r3*1-8]
    punpckhbw mm0, [r0+r3*0-8]
    movq mm1, [r4+r3*1-8]
    punpckhbw mm1, [r0+r3*2-8]
    mov r4, r0
    punpckhwd mm1, mm0
    lea r0, [r0+r3*4]
    movq mm2, [r0+r3*1-8]
    punpckhbw mm2, [r0+r3*0-8]
    lea r0, [r0+r3*2]
    movq mm3, [r0+r3*1-8]
    punpckhbw mm3, [r0+r3*0-8]
    punpckhwd mm3, mm2
    punpckhdq mm3, mm1
    lea r0, [r0+r3*2]
    movq mm0, [r0+r3*0-8]
    movq mm1, [r4]
    mov r0, r4
    movq mm4, mm3
    movq mm2, mm3
    PALIGNR mm4, mm0, 7, mm0
    PALIGNR mm1, mm2, 1, mm2
    test r1, r1
    jz .fix_lt_1
    jmp .do_left
.fix_lt_1:
    movq mm5, mm3
    pxor mm5, mm4
    psrlq mm5, 56
    psllq mm5, 48
    pxor mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq mm5, mm3
    pxor mm5, mm2
    psllq mm5, 56
    psrlq mm5, 56
    pxor mm2, mm5
    test r2, r2
    jnz .do_top
.fix_tr_1:
    movq mm5, mm3
    pxor mm5, mm1
    psrlq mm5, 56
    psllq mm5, 56
    pxor mm1, mm5
    jmp .do_top
.do_left:
    movq mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq mm4, mm0
    movq mm7, mm2
    movq2dq xmm3, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq mm1, 56
    PALIGNR mm7, mm1, 7, mm3
    movq2dq xmm1, mm7
    movq mm0, [r0-8]
    movq mm3, [r0]
    movq mm1, [r0+8]
    movq mm2, mm3
    movq mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test r1, r1
    jz .fix_lt_2
    test r2, r2
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq xmm4, mm4
    lea r1, [r0+r3*2]
    movdqa xmm0, xmm3
    pslldq xmm4, 8
    por xmm3, xmm4
    lea r2, [r1+r3*2]
    pslldq xmm4, 1
    por xmm1, xmm4
    psrldq xmm0, 7
    pslldq xmm0, 15
    psrldq xmm0, 7
    por xmm1, xmm0
    lea r0, [r2+r3*2]
    movdqa xmm2, xmm3
    psrldq xmm2, 1
INIT_XMM
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
    movdqa xmm1, xmm0
    psrldq xmm1, 1
    movq [r0+r3*2], xmm0
    movq [r0+r3*1], xmm1
    psrldq xmm0, 2
    psrldq xmm1, 2
    movq [r2+r3*2], xmm0
    movq [r2+r3*1], xmm1
    psrldq xmm0, 2
    psrldq xmm1, 2
    movq [r1+r3*2], xmm0
    movq [r1+r3*1], xmm1
    psrldq xmm0, 2
    psrldq xmm1, 2
    movq [r4+r3*2], xmm0
    movq [r4+r3*1], xmm1
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_DOWN_RIGHT sse2
INIT_MMX
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_DOWN_RIGHT ssse3

;-----------------------------------------------------------------------------
; void pred8x8l_vertical_right(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
INIT_MMX
%define PALIGNR PALIGNR_MMX
cglobal pred8x8l_vertical_right_mmxext, 4,5
    sub r0, r3
    lea r4, [r0+r3*2]
    movq mm0, [r0+r3*1-8]
    punpckhbw mm0, [r0+r3*0-8]
    movq mm1, [r4+r3*1-8]
    punpckhbw mm1, [r0+r3*2-8]
    mov r4, r0
    punpckhwd mm1, mm0
    lea r0, [r0+r3*4]
    movq mm2, [r0+r3*1-8]
    punpckhbw mm2, [r0+r3*0-8]
    lea r0, [r0+r3*2]
    movq mm3, [r0+r3*1-8]
    punpckhbw mm3, [r0+r3*0-8]
    punpckhwd mm3, mm2
    punpckhdq mm3, mm1
    lea r0, [r0+r3*2]
    movq mm0, [r0+r3*0-8]
    movq mm1, [r4]
    mov r0, r4
    movq mm4, mm3
    movq mm2, mm3
    PALIGNR mm4, mm0, 7, mm0
    PALIGNR mm1, mm2, 1, mm2
    test r1, r1
    jz .fix_lt_1
    jmp .do_left
.fix_lt_1:
    movq mm5, mm3
    pxor mm5, mm4
    psrlq mm5, 56
    psllq mm5, 48
    pxor mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq mm5, mm3
    pxor mm5, mm2
    psllq mm5, 56
    psrlq mm5, 56
    pxor mm2, mm5
    test r2, r2
    jnz .do_top
.fix_tr_1:
    movq mm5, mm3
    pxor mm5, mm1
    psrlq mm5, 56
    psllq mm5, 56
    pxor mm1, mm5
    jmp .do_top
.do_left:
    movq mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq mm7, mm2
    movq mm0, [r0-8]
    movq mm3, [r0]
    movq mm1, [r0+8]
    movq mm2, mm3
    movq mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test r1, r1
    jz .fix_lt_2
    test r2, r2
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
    lea r1, [r0+r3*2]
    movq mm2, mm6
    movq mm3, mm6
    PALIGNR mm3, mm7, 7, mm0
    PALIGNR mm6, mm7, 6, mm1
    movq mm4, mm3
    pavgb mm3, mm2
    lea r2, [r1+r3*2]
    PRED4x4_LOWPASS mm0, mm6, mm2, mm4, mm5
    movq [r0+r3*1], mm3
    movq [r0+r3*2], mm0
    movq mm5, mm0
    movq mm6, mm3
    movq mm1, mm7
    movq mm2, mm1
    psllq mm2, 8
    movq mm3, mm1
    psllq mm3, 16
    lea r4, [r2+r3*2]
    PRED4x4_LOWPASS mm0, mm1, mm3, mm2, mm4
    PALIGNR mm6, mm0, 7, mm2
    movq [r1+r3*1], mm6
    psllq mm0, 8
    PALIGNR mm5, mm0, 7, mm1
    movq [r1+r3*2], mm5
    psllq mm0, 8
    PALIGNR mm6, mm0, 7, mm2
    movq [r2+r3*1], mm6
    psllq mm0, 8
    PALIGNR mm5, mm0, 7, mm1
    movq [r2+r3*2], mm5
    psllq mm0, 8
    PALIGNR mm6, mm0, 7, mm2
    movq [r4+r3*1], mm6
    psllq mm0, 8
    PALIGNR mm5, mm0, 7, mm1
    movq [r4+r3*2], mm5
    RET

%macro PRED8x8L_VERTICAL_RIGHT 1
cglobal pred8x8l_vertical_right_%1, 4,5,7
    sub r0, r3
    lea r4, [r0+r3*2]
    movq mm0, [r0+r3*1-8]
    punpckhbw mm0, [r0+r3*0-8]
    movq mm1, [r4+r3*1-8]
    punpckhbw mm1, [r0+r3*2-8]
    mov r4, r0
    punpckhwd mm1, mm0
    lea r0, [r0+r3*4]
    movq mm2, [r0+r3*1-8]
    punpckhbw mm2, [r0+r3*0-8]
    lea r0, [r0+r3*2]
    movq mm3, [r0+r3*1-8]
    punpckhbw mm3, [r0+r3*0-8]
    punpckhwd mm3, mm2
    punpckhdq mm3, mm1
    lea r0, [r0+r3*2]
    movq mm0, [r0+r3*0-8]
    movq mm1, [r4]
    mov r0, r4
    movq mm4, mm3
    movq mm2, mm3
    PALIGNR mm4, mm0, 7, mm0
    PALIGNR mm1, mm2, 1, mm2
    test r1, r1
    jnz .do_left
.fix_lt_1:
    movq mm5, mm3
    pxor mm5, mm4
    psrlq mm5, 56
    psllq mm5, 48
    pxor mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq mm5, mm3
    pxor mm5, mm2
    psllq mm5, 56
    psrlq mm5, 56
    pxor mm2, mm5
    test r2, r2
    jnz .do_top
.fix_tr_1:
    movq mm5, mm3
    pxor mm5, mm1
    psrlq mm5, 56
    psllq mm5, 56
    pxor mm1, mm5
    jmp .do_top
.do_left:
    movq mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq2dq xmm0, mm2
    movq mm0, [r0-8]
    movq mm3, [r0]
    movq mm1, [r0+8]
    movq mm2, mm3
    movq mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test r1, r1
    jz .fix_lt_2
    test r2, r2
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
    lea r1, [r0+r3*2]
    movq2dq xmm4, mm6
    pslldq xmm4, 8
    por xmm0, xmm4
    movdqa xmm6, [pw_ff00]
    movdqa xmm1, xmm0
    lea r2, [r1+r3*2]
    movdqa xmm2, xmm0
    movdqa xmm3, xmm0
    pslldq xmm0, 1
    pslldq xmm1, 2
    pavgb xmm2, xmm0
INIT_XMM
    PRED4x4_LOWPASS xmm4, xmm3, xmm1, xmm0, xmm5
    pandn xmm6, xmm4
    movdqa xmm5, xmm4
    psrlw xmm4, 8
    packuswb xmm6, xmm4
    movhlps xmm4, xmm6
    movhps [r0+r3*2], xmm5
    movhps [r0+r3*1], xmm2
    psrldq xmm5, 4
    movss xmm5, xmm6
    psrldq xmm2, 4
    movss xmm2, xmm4
    lea r0, [r2+r3*2]
    psrldq xmm5, 1
    psrldq xmm2, 1
    movq [r0+r3*2], xmm5
    movq [r0+r3*1], xmm2
    psrldq xmm5, 1
    psrldq xmm2, 1
    movq [r2+r3*2], xmm5
    movq [r2+r3*1], xmm2
    psrldq xmm5, 1
    psrldq xmm2, 1
    movq [r1+r3*2], xmm5
    movq [r1+r3*1], xmm2
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_VERTICAL_RIGHT sse2
INIT_MMX
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_VERTICAL_RIGHT ssse3

;-----------------------------------------------------------------------------
; void pred8x8l_vertical_left(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
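; Vertical-left from the filtered top edge t'[], roughly:
;   even rows: pred[y][x] = avg(t'[x + y/2], t'[x + y/2 + 1])
;   odd rows:  pred[y][x] = lowpass over t'[x + y/2 .. x + y/2 + 2]
; i.e. one pavgb row and one PRED4x4_LOWPASS row, shifted one byte per
; row pair.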
%macro PRED8x8L_VERTICAL_LEFT 1
cglobal pred8x8l_vertical_left_%1, 4,4
    sub r0, r3
    movq mm0, [r0-8]
    movq mm3, [r0]
    movq mm1, [r0+8]
    movq mm2, mm3
    movq mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test r1, r1
    jz .fix_lt_2
    test r2, r2
    jz .fix_tr_1
    jmp .do_top
.fix_lt_2:
    movq mm5, mm3
    pxor mm5, mm2
    psllq mm5, 56
    psrlq mm5, 56
    pxor mm2, mm5
    test r2, r2
    jnz .do_top
.fix_tr_1:
    movq mm5, mm3
    pxor mm5, mm1
    psrlq mm5, 56
    psllq mm5, 56
    pxor mm1, mm5
    jmp .do_top
.fix_tr_2:
    punpckhbw mm3, mm3
    pshufw mm1, mm3, 0xFF
    jmp .do_topright
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq xmm4, mm4
    test r2, r2
    jz .fix_tr_2
    movq mm0, [r0+8]
    movq mm5, mm0
    movq mm2, mm0
    movq mm4, mm0
    psrlq mm5, 56
    PALIGNR mm2, mm3, 7, mm3
    PALIGNR mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
    movq2dq xmm3, mm1
    lea r1, [r0+r3*2]
    pslldq xmm3, 8
    por xmm4, xmm3
    movdqa xmm2, xmm4
    movdqa xmm1, xmm4
    movdqa xmm3, xmm4
    psrldq xmm2, 1
    pslldq xmm1, 1
    pavgb xmm3, xmm2
    lea r2, [r1+r3*2]
INIT_XMM
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm4, xmm5
    psrldq xmm0, 1
    movq [r0+r3*1], xmm3
    movq [r0+r3*2], xmm0
    lea r0, [r2+r3*2]
    psrldq xmm3, 1
    psrldq xmm0, 1
    movq [r1+r3*1], xmm3
    movq [r1+r3*2], xmm0
    psrldq xmm3, 1
    psrldq xmm0, 1
    movq [r2+r3*1], xmm3
    movq [r2+r3*2], xmm0
    psrldq xmm3, 1
    psrldq xmm0, 1
    movq [r0+r3*1], xmm3
    movq [r0+r3*2], xmm0
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_VERTICAL_LEFT sse2
%define PALIGNR PALIGNR_SSSE3
INIT_MMX
PRED8x8L_VERTICAL_LEFT ssse3

;-----------------------------------------------------------------------------
; void pred8x8l_horizontal_up(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
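; Horizontal-up from the filtered left column l'[], roughly: with
; z = x + 2*y,
;   z even: pred[y][x] = avg(l'[y + x/2], l'[y + x/2 + 1])
;   z odd:  pred[y][x] = lowpass over l'[y + x/2 .. y + x/2 + 2]
; with samples past the bottom replicated from l'[7]; the pavgb/lowpass
; pair computes all of these at once and the PALIGNR/pshufw block lays
; out the rows.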
%macro PRED8x8L_HORIZONTAL_UP 1
cglobal pred8x8l_horizontal_up_%1, 4,4
    sub       r0, r3
    lea       r2, [r0+r3*2]
    movq      mm0, [r0+r3*1-8]
    test      r1, r1
    lea       r1, [r0+r3]
    cmovnz    r1, r0
    punpckhbw mm0, [r1+r3*0-8]
    movq      mm1, [r2+r3*1-8]
    punpckhbw mm1, [r0+r3*2-8]
    mov       r2, r0
    punpckhwd mm1, mm0
    lea       r0, [r0+r3*4]
    movq      mm2, [r0+r3*1-8]
    punpckhbw mm2, [r0+r3*0-8]
    lea       r0, [r0+r3*2]
    movq      mm3, [r0+r3*1-8]
    punpckhbw mm3, [r0+r3*0-8]
    punpckhwd mm3, mm2
    punpckhdq mm3, mm1
    lea       r0, [r0+r3*2]
    movq      mm0, [r0+r3*0-8]
    movq      mm1, [r1+r3*0-8]
    mov       r0, r2
    movq      mm4, mm3
    movq      mm2, mm3
    PALIGNR   mm4, mm0, 7, mm0
    PALIGNR   mm1, mm2, 1, mm2
    movq      mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq      mm4, mm0
    movq      mm7, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq     mm1, 56
    PALIGNR   mm7, mm1, 7, mm3
    lea       r1, [r0+r3*2]
    pshufw    mm0, mm7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
    psllq     mm7, 56             ; l7 .. .. .. .. .. .. ..
    movq      mm2, mm0
    psllw     mm0, 8
    psrlw     mm2, 8
    por       mm2, mm0            ; l7 l6 l5 l4 l3 l2 l1 l0
    movq      mm3, mm2
    movq      mm4, mm2
    movq      mm5, mm2
    psrlq     mm2, 8
    psrlq     mm3, 16
    lea       r2, [r1+r3*2]
    por       mm2, mm7            ; l7 l7 l6 l5 l4 l3 l2 l1
    punpckhbw mm7, mm7
    por       mm3, mm7            ; l7 l7 l7 l6 l5 l4 l3 l2
    pavgb     mm4, mm2
    PRED4x4_LOWPASS mm1, mm3, mm5, mm2, mm6
    movq      mm5, mm4
    punpcklbw mm4, mm1            ; p4 p3 p2 p1
    punpckhbw mm5, mm1            ; p8 p7 p6 p5
    movq      mm6, mm5
    movq      mm7, mm5
    movq      mm0, mm5
    PALIGNR   mm5, mm4, 2, mm1
    pshufw    mm1, mm6, 11111001b
    PALIGNR   mm6, mm4, 4, mm2
    pshufw    mm2, mm7, 11111110b
    PALIGNR   mm7, mm4, 6, mm3
    pshufw    mm3, mm0, 11111111b
    movq      [r0+r3*1], mm4
    movq      [r0+r3*2], mm5
    lea       r0, [r2+r3*2]
    movq      [r1+r3*1], mm6
    movq      [r1+r3*2], mm7
    movq      [r2+r3*1], mm0
    movq      [r2+r3*2], mm1
    movq      [r0+r3*1], mm2
    movq      [r0+r3*2], mm3
    RET
%endmacro
INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_HORIZONTAL_UP mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_HORIZONTAL_UP ssse3
;-----------------------------------------------------------------------------
; void pred8x8l_horizontal_down(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
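; For reference, a sketch of the horizontal-down rule (illustrative names,
; not from this file), with e[0..16] = { l7 .. l0, lt, t0 .. t7 } denoting
; the filtered left column, corner and top row assembled below:
;   x <= 2*y+1, x even: pred[y][x] = (e[i] + e[i+1] + 1) >> 1,            i = 7 - y + x/2
;   x <= 2*y+1, x odd : pred[y][x] = (e[i] + 2*e[i+1] + e[i+2] + 2) >> 2, i = 7 - y + x/2
;   x >  2*y+1        : pred[y][x] = (e[j] + 2*e[j+1] + e[j+2] + 2) >> 2, j = x - 2*y + 6
; i.e. each row interleaves averaged and 3-tap-filtered left-edge samples,
; and the upper-right triangle falls through to the 3-tap-filtered top edge.
; The .fix_* paths patch the edge when has_topleft/has_topright are unset.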
INIT_MMX
%define PALIGNR PALIGNR_MMX
cglobal pred8x8l_horizontal_down_mmxext, 4,5
    sub       r0, r3
    lea       r4, [r0+r3*2]
    movq      mm0, [r0+r3*1-8]
    punpckhbw mm0, [r0+r3*0-8]
    movq      mm1, [r4+r3*1-8]
    punpckhbw mm1, [r0+r3*2-8]
    mov       r4, r0
    punpckhwd mm1, mm0
    lea       r0, [r0+r3*4]
    movq      mm2, [r0+r3*1-8]
    punpckhbw mm2, [r0+r3*0-8]
    lea       r0, [r0+r3*2]
    movq      mm3, [r0+r3*1-8]
    punpckhbw mm3, [r0+r3*0-8]
    punpckhwd mm3, mm2
    punpckhdq mm3, mm1
    lea       r0, [r0+r3*2]
    movq      mm0, [r0+r3*0-8]
    movq      mm1, [r4]
    mov       r0, r4
    movq      mm4, mm3
    movq      mm2, mm3
    PALIGNR   mm4, mm0, 7, mm0
    PALIGNR   mm1, mm2, 1, mm2
    test      r1, r1
    jnz       .do_left
.fix_lt_1:
    movq      mm5, mm3
    pxor      mm5, mm4
    psrlq     mm5, 56
    psllq     mm5, 48
    pxor      mm1, mm5
    jmp       .do_left
.fix_lt_2:
    movq      mm5, mm3
    pxor      mm5, mm2
    psllq     mm5, 56
    psrlq     mm5, 56
    pxor      mm2, mm5
    test      r2, r2
    jnz       .do_top
.fix_tr_1:
    movq      mm5, mm3
    pxor      mm5, mm1
    psrlq     mm5, 56
    psllq     mm5, 56
    pxor      mm1, mm5
    jmp       .do_top
.do_left:
    movq      mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq      mm4, mm0
    movq      mm7, mm2
    movq      mm6, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq     mm1, 56
    PALIGNR   mm7, mm1, 7, mm3
    movq      mm0, [r0-8]
    movq      mm3, [r0]
    movq      mm1, [r0+8]
    movq      mm2, mm3
    movq      mm4, mm3
    PALIGNR   mm2, mm0, 7, mm0
    PALIGNR   mm1, mm4, 1, mm4
    test      r1, r1
    jz        .fix_lt_2
    test      r2, r2
    jz        .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq      mm5, mm4
    lea       r1, [r0+r3*2]
    psllq     mm7, 56
    movq      mm2, mm5
    movq      mm3, mm6
    movq      mm4, mm2
    PALIGNR   mm2, mm6, 7, mm5
    PALIGNR   mm6, mm7, 7, mm0
    lea       r2, [r1+r3*2]
    PALIGNR   mm4, mm3, 1, mm7
    movq      mm5, mm3
    pavgb     mm3, mm6
    PRED4x4_LOWPASS mm0, mm4, mm6, mm5, mm7
    movq      mm4, mm2
    movq      mm1, mm2
    lea       r4, [r2+r3*2]
    psrlq     mm4, 16
    psrlq     mm1, 8
    PRED4x4_LOWPASS mm6, mm4, mm2, mm1, mm5
    movq      mm7, mm3
    punpcklbw mm3, mm0
    punpckhbw mm7, mm0
    movq      mm1, mm7
    movq      mm0, mm7
    movq      mm4, mm7
    movq      [r4+r3*2], mm3
    PALIGNR   mm7, mm3, 2, mm5
    movq      [r4+r3*1], mm7
    PALIGNR   mm1, mm3, 4, mm5
    movq      [r2+r3*2], mm1
    PALIGNR   mm0, mm3, 6, mm3
    movq      [r2+r3*1], mm0
    movq      mm2, mm6
    movq      mm3, mm6
    movq      [r1+r3*2], mm4
    PALIGNR   mm6, mm4, 2, mm5
    movq      [r1+r3*1], mm6
    PALIGNR   mm2, mm4, 4, mm5
    movq      [r0+r3*2], mm2
    PALIGNR   mm3, mm4, 6, mm4
    movq      [r0+r3*1], mm3
    RET
%macro PRED8x8L_HORIZONTAL_DOWN 1
cglobal pred8x8l_horizontal_down_%1, 4,5
    sub       r0, r3
    lea       r4, [r0+r3*2]
    movq      mm0, [r0+r3*1-8]
    punpckhbw mm0, [r0+r3*0-8]
    movq      mm1, [r4+r3*1-8]
    punpckhbw mm1, [r0+r3*2-8]
    mov       r4, r0
    punpckhwd mm1, mm0
    lea       r0, [r0+r3*4]
    movq      mm2, [r0+r3*1-8]
    punpckhbw mm2, [r0+r3*0-8]
    lea       r0, [r0+r3*2]
    movq      mm3, [r0+r3*1-8]
    punpckhbw mm3, [r0+r3*0-8]
    punpckhwd mm3, mm2
    punpckhdq mm3, mm1
    lea       r0, [r0+r3*2]
    movq      mm0, [r0+r3*0-8]
    movq      mm1, [r4]
    mov       r0, r4
    movq      mm4, mm3
    movq      mm2, mm3
    PALIGNR   mm4, mm0, 7, mm0
    PALIGNR   mm1, mm2, 1, mm2
    test      r1, r1
    jnz       .do_left
.fix_lt_1:
    movq      mm5, mm3
    pxor      mm5, mm4
    psrlq     mm5, 56
    psllq     mm5, 48
    pxor      mm1, mm5
    jmp       .do_left
.fix_lt_2:
    movq      mm5, mm3
    pxor      mm5, mm2
    psllq     mm5, 56
    psrlq     mm5, 56
    pxor      mm2, mm5
    test      r2, r2
    jnz       .do_top
.fix_tr_1:
    movq      mm5, mm3
    pxor      mm5, mm1
    psrlq     mm5, 56
    psllq     mm5, 56
    pxor      mm1, mm5
    jmp       .do_top
.fix_tr_2:
    punpckhbw mm3, mm3
    pshufw    mm1, mm3, 0xFF
    jmp       .do_topright
.do_left:
    movq      mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq2dq   xmm0, mm2
    pslldq    xmm0, 8
    movq      mm4, mm0
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    movq2dq   xmm2, mm1
    pslldq    xmm2, 15
    psrldq    xmm2, 8
    por       xmm0, xmm2
    movq      mm0, [r0-8]
    movq      mm3, [r0]
    movq      mm1, [r0+8]
    movq      mm2, mm3
    movq      mm4, mm3
    PALIGNR   mm2, mm0, 7, mm0
    PALIGNR   mm1, mm4, 1, mm4
    test      r1, r1
    jz        .fix_lt_2
    test      r2, r2
    jz        .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq   xmm1, mm4
    test      r2, r2
    jz        .fix_tr_2
    movq      mm0, [r0+8]
    movq      mm5, mm0
    movq      mm2, mm0
    movq      mm4, mm0
    psrlq     mm5, 56
    PALIGNR   mm2, mm3, 7, mm3
    PALIGNR   mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
    movq2dq   xmm5, mm1
    pslldq    xmm5, 8
    por       xmm1, xmm5
INIT_XMM
    lea       r2, [r4+r3*2]
    movdqa    xmm2, xmm1
    movdqa    xmm3, xmm1
    PALIGNR   xmm1, xmm0, 7, xmm4
    PALIGNR   xmm2, xmm0, 9, xmm5
    lea       r1, [r2+r3*2]
    PALIGNR   xmm3, xmm0, 8, xmm0
    movdqa    xmm4, xmm1
    pavgb     xmm4, xmm3
    lea       r0, [r1+r3*2]
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm5
    punpcklbw xmm4, xmm0
    movhlps   xmm0, xmm4
    movq      [r0+r3*2], xmm4
    movq      [r2+r3*2], xmm0
    psrldq    xmm4, 2
    psrldq    xmm0, 2
    movq      [r0+r3*1], xmm4
    movq      [r2+r3*1], xmm0
    psrldq    xmm4, 2
    psrldq    xmm0, 2
    movq      [r1+r3*2], xmm4
    movq      [r4+r3*2], xmm0
    psrldq    xmm4, 2
    psrldq    xmm0, 2
    movq      [r1+r3*1], xmm4
    movq      [r4+r3*1], xmm0
    RET
%endmacro
INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_HORIZONTAL_DOWN sse2
INIT_MMX
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_HORIZONTAL_DOWN ssse3
%endif
;-----------------------------------------------------------------------------
; void pred4x4_dc_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
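; Equivalent C-style computation (names illustrative, not from this file):
;   dc = (t0+t1+t2+t3 + l0+l1+l2+l3 + 4) >> 3;    // 4 top + 4 left samples
;   for (y = 0; y < 4; y++)
;       AV_WN32A(src + y*stride, dc * 0x01010101); // one byte splat per row
; psadbw against zero sums the top row in a single step; the left column is
; accumulated with scalar loads.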
cglobal pred4x4_dc_mmxext, 3,5
    pxor      mm7, mm7
    mov       r4, r0
    sub       r0, r2
    movd      mm0, [r0]
    psadbw    mm0, mm7
    movzx     r1d, byte [r0+r2*1-1]
    movd      r3d, mm0
    add       r3d, r1d
    movzx     r1d, byte [r0+r2*2-1]
    lea       r0, [r0+r2*2]
    add       r3d, r1d
    movzx     r1d, byte [r0+r2*1-1]
    add       r3d, r1d
    movzx     r1d, byte [r0+r2*2-1]
    add       r3d, r1d
    add       r3d, 4
    shr       r3d, 3
    imul      r3d, 0x01010101
    mov       [r4+r2*0], r3d
    mov       [r0+r2*0], r3d
    mov       [r0+r2*1], r3d
    mov       [r0+r2*2], r3d
    RET
;-----------------------------------------------------------------------------
; void pred4x4_tm_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
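; VP8's TrueMotion mode predicts pred[y][x] = clip_uint8(l[y] + t[x] - tl),
; i.e. the top row shifted by each row's left-minus-topleft delta. A rough
; per-row sketch (illustrative names, not from this file):
;   d = l[y] - tl;
;   for (x = 0; x < 4; x++)
;       pred[y][x] = av_clip_uint8(t[x] + d);
; The code keeps t[] as 16-bit words and lets packuswb do the clamping.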
%macro PRED4x4_TM_MMX 1
cglobal pred4x4_tm_vp8_%1, 3,6
    sub       r0, r2
    pxor      mm7, mm7
    movd      mm0, [r0]
    punpcklbw mm0, mm7
    movzx     r4d, byte [r0-1]
    mov       r5d, 2
.loop:
    movzx     r1d, byte [r0+r2*1-1]
    movzx     r3d, byte [r0+r2*2-1]
    sub       r1d, r4d
    sub       r3d, r4d
    movd      mm2, r1d
    movd      mm4, r3d
%ifidn %1, mmx
    punpcklwd mm2, mm2
    punpcklwd mm4, mm4
    punpckldq mm2, mm2
    punpckldq mm4, mm4
%else
    pshufw    mm2, mm2, 0
    pshufw    mm4, mm4, 0
%endif
    paddw     mm2, mm0
    paddw     mm4, mm0
    packuswb  mm2, mm2
    packuswb  mm4, mm4
    movd      [r0+r2*1], mm2
    movd      [r0+r2*2], mm4
    lea       r0, [r0+r2*2]
    dec       r5d
    jg        .loop
    REP_RET
%endmacro
PRED4x4_TM_MMX mmx
PRED4x4_TM_MMX mmxext
cglobal pred4x4_tm_vp8_ssse3, 3,3
    sub       r0, r2
    movq      mm6, [tm_shuf]
    pxor      mm1, mm1
    movd      mm0, [r0]
    punpcklbw mm0, mm1
    movd      mm7, [r0-4]
    pshufb    mm7, mm6
    lea       r1, [r0+r2*2]
    movd      mm2, [r0+r2*1-4]
    movd      mm3, [r0+r2*2-4]
    movd      mm4, [r1+r2*1-4]
    movd      mm5, [r1+r2*2-4]
    pshufb    mm2, mm6
    pshufb    mm3, mm6
    pshufb    mm4, mm6
    pshufb    mm5, mm6
    psubw     mm2, mm7
    psubw     mm3, mm7
    psubw     mm4, mm7
    psubw     mm5, mm7
    paddw     mm2, mm0
    paddw     mm3, mm0
    paddw     mm4, mm0
    paddw     mm5, mm0
    packuswb  mm2, mm2
    packuswb  mm3, mm3
    packuswb  mm4, mm4
    packuswb  mm5, mm5
    movd      [r0+r2*1], mm2
    movd      [r0+r2*2], mm3
    movd      [r1+r2*1], mm4
    movd      [r1+r2*2], mm5
    RET
;-----------------------------------------------------------------------------
; void pred4x4_vertical_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
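; Unlike H.264's plain vertical mode, VP8 smooths the above row before
; replicating it into all four rows; roughly (t[-1] being the top-left):
;   pred[y][x] = (t[x-1] + 2*t[x] + t[x+1] + 2) >> 2
; which is a single PRED4x4_LOWPASS over the three shifted copies of the
; top row built below.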
INIT_MMX
cglobal pred4x4_vertical_vp8_mmxext, 3,3
    sub       r0, r2
    movd      m1, [r0-1]
    movd      m0, [r0]
    mova      m2, m0              ; t0 t1 t2 t3
    punpckldq m0, [r1]            ; t0 t1 t2 t3 t4 t5 t6 t7
    lea       r1, [r0+r2*2]
    psrlq     m0, 8               ; t1 t2 t3 t4
    PRED4x4_LOWPASS m3, m1, m0, m2, m4
    movd      [r0+r2*1], m3
    movd      [r0+r2*2], m3
    movd      [r1+r2*1], m3
    movd      [r1+r2*2], m3
    RET
;-----------------------------------------------------------------------------
; void pred4x4_down_left_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
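; H.264 diagonal down-left; with t[0..7] the top plus top-right samples and
; reads past t[7] clamped to t[7], the whole block is (illustrative sketch,
; names not from this file):
;   pred[y][x] = (t[x+y] + 2*t[x+y+1] + t[x+y+2] + 2) >> 2
; The pxor/shift sequence below builds the +1/-1 shifted copies, with the
; last sample duplicated, entirely in registers.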
%ifdef CONFIG_GPL
INIT_MMX
cglobal pred4x4_down_left_mmxext, 3,3
    sub       r0, r2
    movq      m1, [r0]
    punpckldq m1, [r1]
    movq      m2, m1
    movq      m3, m1
    movq      m4, m1
    psllq     m1, 8
    pxor      m2, m1
    psrlq     m2, 8
    pxor      m3, m2
    PRED4x4_LOWPASS m0, m1, m3, m4, m5
    lea       r1, [r0+r2*2]
    psrlq     m0, 8
    movd      [r0+r2*1], m0
    psrlq     m0, 8
    movd      [r0+r2*2], m0
    psrlq     m0, 8
    movd      [r1+r2*1], m0
    psrlq     m0, 8
    movd      [r1+r2*2], m0
    RET
;-----------------------------------------------------------------------------
; void pred4x4_vertical_left_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
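; H.264 vertical-left: even rows are half-sample averages of the top row,
; odd rows the 3-tap blend, each shifted right as y grows (illustrative):
;   pred[y][x] = (y & 1) ? (t[x+y/2] + 2*t[x+y/2+1] + t[x+y/2+2] + 2) >> 2
;                        : (t[x+y/2] + t[x+y/2+1] + 1) >> 1;
; m4 (pavgb) holds the even rows below, m0 (lowpass) the odd ones.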
INIT_MMX
cglobal pred4x4_vertical_left_mmxext, 3,3
    sub       r0, r2
    movq      m1, [r0]
    punpckldq m1, [r1]
    movq      m3, m1
    movq      m2, m1
    psrlq     m3, 8
    psrlq     m2, 16
    movq      m4, m3
    pavgb     m4, m1
    PRED4x4_LOWPASS m0, m1, m2, m3, m5
    lea       r1, [r0+r2*2]
    movh      [r0+r2*1], m4
    movh      [r0+r2*2], m0
    psrlq     m4, 8
    psrlq     m0, 8
    movh      [r1+r2*1], m4
    movh      [r1+r2*2], m0
    RET
;-----------------------------------------------------------------------------
; void pred4x4_horizontal_up_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
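; H.264 horizontal-up from the left column l[0..3], with L(i) = l[min(i, 3)]
; (illustrative sketch, names not from this file):
;   z = x + 2*y;
;   pred[y][x] = (z & 1) ? (L(z/2) + 2*L(z/2+1) + L(z/2+2) + 2) >> 2
;                        : (L(z/2) + L(z/2+1) + 1) >> 1;
; The pshufw/punpckhdq prologue below materialises the clamped column.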
INIT_MMX
cglobal pred4x4_horizontal_up_mmxext, 3,3
    sub       r0, r2
    lea       r1, [r0+r2*2]
    movd      m0, [r0+r2*1-4]
    punpcklbw m0, [r0+r2*2-4]
    movd      m1, [r1+r2*1-4]
    punpcklbw m1, [r1+r2*2-4]
    punpckhwd m0, m1
    movq      m1, m0
    punpckhbw m1, m1
    pshufw    m1, m1, 0xFF
    punpckhdq m0, m1
    movq      m2, m0
    movq      m3, m0
    movq      m7, m0
    psrlq     m2, 16
    psrlq     m3, 8
    pavgb     m7, m3
    PRED4x4_LOWPASS m4, m0, m2, m3, m5
    punpcklbw m7, m4
    movd      [r0+r2*1], m7
    psrlq     m7, 16
    movd      [r0+r2*2], m7
    psrlq     m7, 16
    movd      [r1+r2*1], m7
    movd      [r1+r2*2], m1
    RET
;-----------------------------------------------------------------------------
; void pred4x4_horizontal_down_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
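; Horizontal-down over the edge e[0..7] = { l3, l2, l1, l0, lt, t0, t1, t2 }
; (the register comments below track exactly this layout); illustratively:
;   x <= 2*y+1, x even: pred[y][x] = (e[i] + e[i+1] + 1) >> 1,            i = 3 - y + x/2
;   x <= 2*y+1, x odd : pred[y][x] = (e[i] + 2*e[i+1] + e[i+2] + 2) >> 2, i = 3 - y + x/2
;   x >  2*y+1        : pred[y][x] = (e[j] + 2*e[j+1] + e[j+2] + 2) >> 2, j = x - 2*y + 2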
INIT_MMX
%define PALIGNR PALIGNR_MMX
cglobal pred4x4_horizontal_down_mmxext, 3,3
    sub       r0, r2
    lea       r1, [r0+r2*2]
    movh      m0, [r0-4]          ; lt ..
    punpckldq m0, [r0]            ; t3 t2 t1 t0 lt .. .. ..
    psllq     m0, 8               ; t2 t1 t0 lt .. .. .. ..
    movd      m1, [r1+r2*2-4]     ; l3
    punpcklbw m1, [r1+r2*1-4]     ; l2 l3
    movd      m2, [r0+r2*2-4]     ; l1
    punpcklbw m2, [r0+r2*1-4]     ; l0 l1
    punpckhwd m1, m2              ; l0 l1 l2 l3
    punpckhdq m1, m0              ; t2 t1 t0 lt l0 l1 l2 l3
    movq      m0, m1
    movq      m2, m1
    movq      m5, m1
    psrlq     m0, 16              ; .. .. t2 t1 t0 lt l0 l1
    psrlq     m2, 8               ; .. t2 t1 t0 lt l0 l1 l2
    pavgb     m5, m2
    PRED4x4_LOWPASS m3, m1, m0, m2, m4
    punpcklbw m5, m3
    psrlq     m3, 32
    PALIGNR   m3, m5, 6, m4
    movh      [r1+r2*2], m5
    psrlq     m5, 16
    movh      [r1+r2*1], m5
    psrlq     m5, 16
    movh      [r0+r2*2], m5
    movh      [r0+r2*1], m3
    RET
;-----------------------------------------------------------------------------
; void pred4x4_vertical_right_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
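; Vertical-right is horizontal-down mirrored across the diagonal; with
; e[0..7] = { l2, l1, l0, lt, t0, t1, t2, t3 } (illustrative names):
;   y <= 2*x+1, y even: pred[y][x] = (e[i] + e[i+1] + 1) >> 1,            i = 3 + x - y/2
;   y <= 2*x+1, y odd : pred[y][x] = (e[i-1] + 2*e[i] + e[i+1] + 2) >> 2, i = 3 + x - y/2
;   y >  2*x+1        : pred[y][x] = (e[j] + 2*e[j+1] + e[j+2] + 2) >> 2, j = 2*x - y + 3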
INIT_MMX
%define PALIGNR PALIGNR_MMX
cglobal pred4x4_vertical_right_mmxext, 3,3
    sub       r0, r2
    lea       r1, [r0+r2*2]
    movh      m0, [r0]                  ; ........t3t2t1t0
    movq      m5, m0
    PALIGNR   m0, [r0-8], 7, m1         ; ......t3t2t1t0lt
    pavgb     m5, m0
    PALIGNR   m0, [r0+r2*1-8], 7, m1    ; ....t3t2t1t0ltl0
    movq      m1, m0
    PALIGNR   m0, [r0+r2*2-8], 7, m2    ; ..t3t2t1t0ltl0l1
    movq      m2, m0
    PALIGNR   m0, [r1+r2*1-8], 7, m3    ; t3t2t1t0ltl0l1l2
    PRED4x4_LOWPASS m3, m1, m0, m2, m4
    movq      m1, m3
    psrlq     m3, 16
    psllq     m1, 48
    movh      [r0+r2*1], m5
    movh      [r0+r2*2], m3
    PALIGNR   m5, m1, 7, m2
    psllq     m1, 8
    movh      [r1+r2*1], m5
    PALIGNR   m3, m1, 7, m1
    movh      [r1+r2*2], m3
    RET
;-----------------------------------------------------------------------------
; void pred4x4_down_right_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
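; Diagonal down-right: every sample is the 3-tap blend of three consecutive
; edge samples, the whole row shifting one step per y; illustratively, with
; e[0..7] = { l3, l2, l1, l0, lt, t0, t1, t2 }:
;   pred[y][x] = (e[3+x-y] + 2*e[4+x-y] + e[5+x-y] + 2) >> 2
; which is why the four stores below are just one psrlq apart.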
INIT_MMX
%define PALIGNR PALIGNR_MMX
cglobal pred4x4_down_right_mmxext, 3,3
    sub       r0, r2
    lea       r1, [r0+r2*2]
    movq      m1, [r1-8]
    movq      m2, [r0+r2*1-8]
    punpckhbw m2, [r0-8]
    movh      m3, [r0]
    punpckhwd m1, m2
    PALIGNR   m3, m1, 5, m1
    movq      m1, m3
    PALIGNR   m3, [r1+r2*1-8], 7, m4
    movq      m2, m3
    PALIGNR   m3, [r1+r2*2-8], 7, m4
    PRED4x4_LOWPASS m0, m3, m1, m2, m4
    movh      [r1+r2*2], m0
    psrlq     m0, 8
    movh      [r1+r2*1], m0
    psrlq     m0, 8
    movh      [r0+r2*2], m0
    psrlq     m0, 8
    movh      [r0+r2*1], m0
    RET
%endif