You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

2803 lines
71KB

  1. ;******************************************************************************
  2. ;* H.264 intra prediction asm optimizations
  3. ;* Copyright (c) 2010 Jason Garrett-Glaser
  4. ;* Copyright (c) 2010 Holger Lubitz
  5. ;* Copyright (c) 2010 Loren Merritt
  6. ;* Copyright (c) 2010 Ronald S. Bultje
  7. ;*
  8. ;* This file is part of FFmpeg.
  9. ;*
  10. ;* FFmpeg is free software; you can redistribute it and/or
  11. ;* modify it under the terms of the GNU Lesser General Public
  12. ;* License as published by the Free Software Foundation; either
  13. ;* version 2.1 of the License, or (at your option) any later version.
  14. ;*
  15. ;* FFmpeg is distributed in the hope that it will be useful,
  16. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  17. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  18. ;* Lesser General Public License for more details.
  19. ;*
  20. ;* You should have received a copy of the GNU Lesser General Public
  21. ;* License along with FFmpeg; if not, write to the Free Software
  22. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23. ;******************************************************************************
  24. %include "x86inc.asm"
  25. %include "x86util.asm"
  26. SECTION_RODATA
  27. tm_shuf: times 8 db 0x03, 0x80
  28. pw_ff00: times 8 dw 0xff00
  29. plane_shuf: db -8, -7, -6, -5, -4, -3, -2, -1
  30. db 1, 2, 3, 4, 5, 6, 7, 8
  31. plane8_shuf: db -4, -3, -2, -1, 0, 0, 0, 0
  32. db 1, 2, 3, 4, 0, 0, 0, 0
  33. pw_0to7: dw 0, 1, 2, 3, 4, 5, 6, 7
  34. pw_1to8: dw 1, 2, 3, 4, 5, 6, 7, 8
  35. pw_m8tom1: dw -8, -7, -6, -5, -4, -3, -2, -1
  36. pw_m4to4: dw -4, -3, -2, -1, 1, 2, 3, 4
  37. SECTION .text
  38. cextern pb_1
  39. cextern pb_3
  40. cextern pw_4
  41. cextern pw_5
  42. cextern pw_8
  43. cextern pw_16
  44. cextern pw_17
  45. cextern pw_32
  46. ;-----------------------------------------------------------------------------
  47. ; void pred16x16_vertical(uint8_t *src, int stride)
  48. ;-----------------------------------------------------------------------------
  49. cglobal pred16x16_vertical_mmx, 2,3
  50. sub r0, r1
  51. mov r2, 8
  52. movq mm0, [r0+0]
  53. movq mm1, [r0+8]
  54. .loop:
  55. movq [r0+r1*1+0], mm0
  56. movq [r0+r1*1+8], mm1
  57. movq [r0+r1*2+0], mm0
  58. movq [r0+r1*2+8], mm1
  59. lea r0, [r0+r1*2]
  60. dec r2
  61. jg .loop
  62. REP_RET
  63. cglobal pred16x16_vertical_sse, 2,3
  64. sub r0, r1
  65. mov r2, 4
  66. movaps xmm0, [r0]
  67. .loop:
  68. movaps [r0+r1*1], xmm0
  69. movaps [r0+r1*2], xmm0
  70. lea r0, [r0+r1*2]
  71. movaps [r0+r1*1], xmm0
  72. movaps [r0+r1*2], xmm0
  73. lea r0, [r0+r1*2]
  74. dec r2
  75. jg .loop
  76. REP_RET
  77. ;-----------------------------------------------------------------------------
  78. ; void pred16x16_horizontal(uint8_t *src, int stride)
  79. ;-----------------------------------------------------------------------------
  80. %macro PRED16x16_H 1
  81. cglobal pred16x16_horizontal_%1, 2,3
  82. mov r2, 8
  83. %ifidn %1, ssse3
  84. mova m2, [pb_3]
  85. %endif
  86. .loop:
  87. movd m0, [r0+r1*0-4]
  88. movd m1, [r0+r1*1-4]
  89. %ifidn %1, ssse3
  90. pshufb m0, m2
  91. pshufb m1, m2
  92. %else
  93. punpcklbw m0, m0
  94. punpcklbw m1, m1
  95. %ifidn %1, mmxext
  96. pshufw m0, m0, 0xff
  97. pshufw m1, m1, 0xff
  98. %else
  99. punpckhwd m0, m0
  100. punpckhwd m1, m1
  101. punpckhdq m0, m0
  102. punpckhdq m1, m1
  103. %endif
  104. mova [r0+r1*0+8], m0
  105. mova [r0+r1*1+8], m1
  106. %endif
  107. mova [r0+r1*0], m0
  108. mova [r0+r1*1], m1
  109. lea r0, [r0+r1*2]
  110. dec r2
  111. jg .loop
  112. REP_RET
  113. %endmacro
  114. INIT_MMX
  115. PRED16x16_H mmx
  116. PRED16x16_H mmxext
  117. INIT_XMM
  118. PRED16x16_H ssse3
  119. ;-----------------------------------------------------------------------------
  120. ; void pred16x16_dc(uint8_t *src, int stride)
  121. ;-----------------------------------------------------------------------------
  122. %macro PRED16x16_DC 1
  123. cglobal pred16x16_dc_%1, 2,7
  124. mov r4, r0
  125. sub r0, r1
  126. pxor mm0, mm0
  127. pxor mm1, mm1
  128. psadbw mm0, [r0+0]
  129. psadbw mm1, [r0+8]
  130. dec r0
  131. movzx r5d, byte [r0+r1*1]
  132. paddw mm0, mm1
  133. movd r6d, mm0
  134. lea r0, [r0+r1*2]
  135. %rep 7
  136. movzx r2d, byte [r0+r1*0]
  137. movzx r3d, byte [r0+r1*1]
  138. add r5d, r2d
  139. add r6d, r3d
  140. lea r0, [r0+r1*2]
  141. %endrep
  142. movzx r2d, byte [r0+r1*0]
  143. add r5d, r6d
  144. lea r2d, [r2+r5+16]
  145. shr r2d, 5
  146. %ifidn %1, mmxext
  147. movd m0, r2d
  148. punpcklbw m0, m0
  149. pshufw m0, m0, 0
  150. %elifidn %1, sse2
  151. movd m0, r2d
  152. punpcklbw m0, m0
  153. pshuflw m0, m0, 0
  154. punpcklqdq m0, m0
  155. %elifidn %1, ssse3
  156. pxor m1, m1
  157. movd m0, r2d
  158. pshufb m0, m1
  159. %endif
  160. %if mmsize==8
  161. mov r3d, 8
  162. .loop:
  163. mova [r4+r1*0+0], m0
  164. mova [r4+r1*0+8], m0
  165. mova [r4+r1*1+0], m0
  166. mova [r4+r1*1+8], m0
  167. %else
  168. mov r3d, 4
  169. .loop:
  170. mova [r4+r1*0], m0
  171. mova [r4+r1*1], m0
  172. lea r4, [r4+r1*2]
  173. mova [r4+r1*0], m0
  174. mova [r4+r1*1], m0
  175. %endif
  176. lea r4, [r4+r1*2]
  177. dec r3d
  178. jg .loop
  179. REP_RET
  180. %endmacro
  181. INIT_MMX
  182. PRED16x16_DC mmxext
  183. INIT_XMM
  184. PRED16x16_DC sse2
  185. PRED16x16_DC ssse3
  186. ;-----------------------------------------------------------------------------
  187. ; void pred16x16_tm_vp8(uint8_t *src, int stride)
  188. ;-----------------------------------------------------------------------------
  189. %macro PRED16x16_TM_MMX 1
  190. cglobal pred16x16_tm_vp8_%1, 2,5
  191. sub r0, r1
  192. pxor mm7, mm7
  193. movq mm0, [r0+0]
  194. movq mm2, [r0+8]
  195. movq mm1, mm0
  196. movq mm3, mm2
  197. punpcklbw mm0, mm7
  198. punpckhbw mm1, mm7
  199. punpcklbw mm2, mm7
  200. punpckhbw mm3, mm7
  201. movzx r3d, byte [r0-1]
  202. mov r4d, 16
  203. .loop:
  204. movzx r2d, byte [r0+r1-1]
  205. sub r2d, r3d
  206. movd mm4, r2d
  207. %ifidn %1, mmx
  208. punpcklwd mm4, mm4
  209. punpckldq mm4, mm4
  210. %else
  211. pshufw mm4, mm4, 0
  212. %endif
  213. movq mm5, mm4
  214. movq mm6, mm4
  215. movq mm7, mm4
  216. paddw mm4, mm0
  217. paddw mm5, mm1
  218. paddw mm6, mm2
  219. paddw mm7, mm3
  220. packuswb mm4, mm5
  221. packuswb mm6, mm7
  222. movq [r0+r1+0], mm4
  223. movq [r0+r1+8], mm6
  224. add r0, r1
  225. dec r4d
  226. jg .loop
  227. REP_RET
  228. %endmacro
  229. PRED16x16_TM_MMX mmx
  230. PRED16x16_TM_MMX mmxext
  231. cglobal pred16x16_tm_vp8_sse2, 2,6,6
  232. sub r0, r1
  233. pxor xmm2, xmm2
  234. movdqa xmm0, [r0]
  235. movdqa xmm1, xmm0
  236. punpcklbw xmm0, xmm2
  237. punpckhbw xmm1, xmm2
  238. movzx r4d, byte [r0-1]
  239. mov r5d, 8
  240. .loop:
  241. movzx r2d, byte [r0+r1*1-1]
  242. movzx r3d, byte [r0+r1*2-1]
  243. sub r2d, r4d
  244. sub r3d, r4d
  245. movd xmm2, r2d
  246. movd xmm4, r3d
  247. pshuflw xmm2, xmm2, 0
  248. pshuflw xmm4, xmm4, 0
  249. punpcklqdq xmm2, xmm2
  250. punpcklqdq xmm4, xmm4
  251. movdqa xmm3, xmm2
  252. movdqa xmm5, xmm4
  253. paddw xmm2, xmm0
  254. paddw xmm3, xmm1
  255. paddw xmm4, xmm0
  256. paddw xmm5, xmm1
  257. packuswb xmm2, xmm3
  258. packuswb xmm4, xmm5
  259. movdqa [r0+r1*1], xmm2
  260. movdqa [r0+r1*2], xmm4
  261. lea r0, [r0+r1*2]
  262. dec r5d
  263. jg .loop
  264. REP_RET
  265. ;-----------------------------------------------------------------------------
  266. ; void pred16x16_plane(uint8_t *src, int stride)
  267. ;-----------------------------------------------------------------------------
  268. %macro H264_PRED16x16_PLANE 3
  269. cglobal pred16x16_plane_%3_%1, 2, 7, %2
  270. mov r2, r1 ; +stride
  271. neg r1 ; -stride
  272. movh m0, [r0+r1 -1]
  273. %if mmsize == 8
  274. pxor m4, m4
  275. movh m1, [r0+r1 +3 ]
  276. movh m2, [r0+r1 +8 ]
  277. movh m3, [r0+r1 +12]
  278. punpcklbw m0, m4
  279. punpcklbw m1, m4
  280. punpcklbw m2, m4
  281. punpcklbw m3, m4
  282. pmullw m0, [pw_m8tom1 ]
  283. pmullw m1, [pw_m8tom1+8]
  284. pmullw m2, [pw_1to8 ]
  285. pmullw m3, [pw_1to8 +8]
  286. paddw m0, m2
  287. paddw m1, m3
  288. %else ; mmsize == 16
  289. %ifidn %1, sse2
  290. pxor m2, m2
  291. movh m1, [r0+r1 +8]
  292. punpcklbw m0, m2
  293. punpcklbw m1, m2
  294. pmullw m0, [pw_m8tom1]
  295. pmullw m1, [pw_1to8]
  296. paddw m0, m1
  297. %else ; ssse3
  298. movhps m0, [r0+r1 +8]
  299. pmaddubsw m0, [plane_shuf] ; H coefficients
  300. %endif
  301. movhlps m1, m0
  302. %endif
  303. paddw m0, m1
  304. %ifidn %1, mmx
  305. mova m1, m0
  306. psrlq m1, 32
  307. %elifidn %1, mmx2
  308. pshufw m1, m0, 0xE
  309. %else ; mmsize == 16
  310. pshuflw m1, m0, 0xE
  311. %endif
  312. paddw m0, m1
  313. %ifidn %1, mmx
  314. mova m1, m0
  315. psrlq m1, 16
  316. %elifidn %1, mmx2
  317. pshufw m1, m0, 0x1
  318. %else
  319. pshuflw m1, m0, 0x1
  320. %endif
  321. paddw m0, m1 ; sum of H coefficients
  322. %ifidn %3, h264
  323. pmullw m0, [pw_5]
  324. paddw m0, [pw_32]
  325. psraw m0, 6
  326. %elifidn %3, rv40
  327. pmullw m0, [pw_5]
  328. psraw m0, 6
  329. %elifidn %3, svq3
  330. movd r3d, m0
  331. movsx r3, r3w
  332. test r3, r3
  333. lea r4, [r3+3]
  334. cmovs r3, r4
  335. sar r3, 2 ; H/4
  336. lea r3, [r3*5] ; 5*(H/4)
  337. test r3, r3
  338. lea r4, [r3+15]
  339. cmovs r3, r4
  340. sar r3, 4 ; (5*(H/4))/16
  341. movd m0, r3d
  342. %endif
  343. lea r4, [r0+r2*8-1]
  344. lea r3, [r0+r2*4-1]
  345. add r4, r2
  346. %ifdef ARCH_X86_64
  347. %define e_reg r11
  348. %else
  349. %define e_reg r0
  350. %endif
  351. movzx e_reg, byte [r3+r2*2 ]
  352. movzx r5, byte [r4+r1 ]
  353. sub r5, e_reg
  354. movzx e_reg, byte [r3+r2 ]
  355. movzx r6, byte [r4 ]
  356. sub r6, e_reg
  357. lea r5, [r5+r6*2]
  358. movzx e_reg, byte [r3+r1 ]
  359. movzx r6, byte [r4+r2*2 ]
  360. sub r6, e_reg
  361. lea r5, [r5+r6*4]
  362. movzx e_reg, byte [r3 ]
  363. %ifdef ARCH_X86_64
  364. movzx r10, byte [r4+r2 ]
  365. sub r10, e_reg
  366. %else
  367. movzx r6, byte [r4+r2 ]
  368. sub r6, e_reg
  369. lea r5, [r5+r6*4]
  370. sub r5, r6
  371. %endif
  372. lea e_reg, [r3+r1*4]
  373. lea r3, [r4+r2*4]
  374. movzx r4, byte [e_reg+r2 ]
  375. movzx r6, byte [r3 ]
  376. sub r6, r4
  377. %ifdef ARCH_X86_64
  378. lea r6, [r10+r6*2]
  379. lea r5, [r5+r6*2]
  380. add r5, r6
  381. %else
  382. lea r5, [r5+r6*4]
  383. lea r5, [r5+r6*2]
  384. %endif
  385. movzx r4, byte [e_reg ]
  386. %ifdef ARCH_X86_64
  387. movzx r10, byte [r3 +r2 ]
  388. sub r10, r4
  389. sub r5, r10
  390. %else
  391. movzx r6, byte [r3 +r2 ]
  392. sub r6, r4
  393. lea r5, [r5+r6*8]
  394. sub r5, r6
  395. %endif
  396. movzx r4, byte [e_reg+r1 ]
  397. movzx r6, byte [r3 +r2*2]
  398. sub r6, r4
  399. %ifdef ARCH_X86_64
  400. add r6, r10
  401. %endif
  402. lea r5, [r5+r6*8]
  403. movzx r4, byte [e_reg+r2*2]
  404. movzx r6, byte [r3 +r1 ]
  405. sub r6, r4
  406. lea r5, [r5+r6*4]
  407. add r5, r6 ; sum of V coefficients
  408. %ifndef ARCH_X86_64
  409. mov r0, r0m
  410. %endif
  411. %ifidn %3, h264
  412. lea r5, [r5*5+32]
  413. sar r5, 6
  414. %elifidn %3, rv40
  415. lea r5, [r5*5]
  416. sar r5, 6
  417. %elifidn %3, svq3
  418. test r5, r5
  419. lea r6, [r5+3]
  420. cmovs r5, r6
  421. sar r5, 2 ; V/4
  422. lea r5, [r5*5] ; 5*(V/4)
  423. test r5, r5
  424. lea r6, [r5+15]
  425. cmovs r5, r6
  426. sar r5, 4 ; (5*(V/4))/16
  427. %endif
  428. movzx r4, byte [r0+r1 +15]
  429. movzx r3, byte [r3+r2*2 ]
  430. lea r3, [r3+r4+1]
  431. shl r3, 4
  432. movd r1d, m0
  433. movsx r1d, r1w
  434. add r1d, r5d
  435. add r3d, r1d
  436. shl r1d, 3
  437. sub r3d, r1d ; a
  438. movd m1, r5d
  439. movd m3, r3d
  440. %ifidn %1, mmx
  441. punpcklwd m0, m0
  442. punpcklwd m1, m1
  443. punpcklwd m3, m3
  444. punpckldq m0, m0
  445. punpckldq m1, m1
  446. punpckldq m3, m3
  447. %elifidn %1, mmx2
  448. pshufw m0, m0, 0x0
  449. pshufw m1, m1, 0x0
  450. pshufw m3, m3, 0x0
  451. %else
  452. pshuflw m0, m0, 0x0
  453. pshuflw m1, m1, 0x0
  454. pshuflw m3, m3, 0x0
  455. punpcklqdq m0, m0 ; splat H (words)
  456. punpcklqdq m1, m1 ; splat V (words)
  457. punpcklqdq m3, m3 ; splat a (words)
  458. %endif
  459. %ifidn %3, svq3
  460. SWAP 0, 1
  461. %endif
  462. mova m2, m0
  463. %if mmsize == 8
  464. mova m5, m0
  465. %endif
  466. pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
  467. %if mmsize == 16
  468. psllw m2, 3
  469. %else
  470. psllw m5, 3
  471. psllw m2, 2
  472. mova m6, m5
  473. paddw m6, m2
  474. %endif
  475. paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
  476. paddw m2, m0 ; a + {8,9,10,11,12,13,14,15}*H
  477. %if mmsize == 8
  478. paddw m5, m0 ; a + {8,9,10,11}*H
  479. paddw m6, m0 ; a + {12,13,14,15}*H
  480. %endif
  481. mov r4, 8
  482. .loop
  483. mova m3, m0 ; b[0..7]
  484. mova m4, m2 ; b[8..15]
  485. psraw m3, 5
  486. psraw m4, 5
  487. packuswb m3, m4
  488. mova [r0], m3
  489. %if mmsize == 8
  490. mova m3, m5 ; b[8..11]
  491. mova m4, m6 ; b[12..15]
  492. psraw m3, 5
  493. psraw m4, 5
  494. packuswb m3, m4
  495. mova [r0+8], m3
  496. %endif
  497. paddw m0, m1
  498. paddw m2, m1
  499. %if mmsize == 8
  500. paddw m5, m1
  501. paddw m6, m1
  502. %endif
  503. mova m3, m0 ; b[0..7]
  504. mova m4, m2 ; b[8..15]
  505. psraw m3, 5
  506. psraw m4, 5
  507. packuswb m3, m4
  508. mova [r0+r2], m3
  509. %if mmsize == 8
  510. mova m3, m5 ; b[8..11]
  511. mova m4, m6 ; b[12..15]
  512. psraw m3, 5
  513. psraw m4, 5
  514. packuswb m3, m4
  515. mova [r0+r2+8], m3
  516. %endif
  517. paddw m0, m1
  518. paddw m2, m1
  519. %if mmsize == 8
  520. paddw m5, m1
  521. paddw m6, m1
  522. %endif
  523. lea r0, [r0+r2*2]
  524. dec r4
  525. jg .loop
  526. REP_RET
  527. %endmacro
  528. INIT_MMX
  529. H264_PRED16x16_PLANE mmx, 0, h264
  530. H264_PRED16x16_PLANE mmx, 0, rv40
  531. H264_PRED16x16_PLANE mmx, 0, svq3
  532. H264_PRED16x16_PLANE mmx2, 0, h264
  533. H264_PRED16x16_PLANE mmx2, 0, rv40
  534. H264_PRED16x16_PLANE mmx2, 0, svq3
  535. INIT_XMM
  536. H264_PRED16x16_PLANE sse2, 8, h264
  537. H264_PRED16x16_PLANE sse2, 8, rv40
  538. H264_PRED16x16_PLANE sse2, 8, svq3
  539. H264_PRED16x16_PLANE ssse3, 8, h264
  540. H264_PRED16x16_PLANE ssse3, 8, rv40
  541. H264_PRED16x16_PLANE ssse3, 8, svq3
  542. ;-----------------------------------------------------------------------------
  543. ; void pred8x8_plane(uint8_t *src, int stride)
  544. ;-----------------------------------------------------------------------------
  545. %macro H264_PRED8x8_PLANE 2
  546. cglobal pred8x8_plane_%1, 2, 7, %2
  547. mov r2, r1 ; +stride
  548. neg r1 ; -stride
  549. movd m0, [r0+r1 -1]
  550. %if mmsize == 8
  551. pxor m2, m2
  552. movh m1, [r0+r1 +4 ]
  553. punpcklbw m0, m2
  554. punpcklbw m1, m2
  555. pmullw m0, [pw_m4to4]
  556. pmullw m1, [pw_m4to4+8]
  557. %else ; mmsize == 16
  558. %ifidn %1, sse2
  559. pxor m2, m2
  560. movd m1, [r0+r1 +4]
  561. punpckldq m0, m1
  562. punpcklbw m0, m2
  563. pmullw m0, [pw_m4to4]
  564. %else ; ssse3
  565. movhps m0, [r0+r1 +4] ; this reads 4 bytes more than necessary
  566. pmaddubsw m0, [plane8_shuf] ; H coefficients
  567. %endif
  568. movhlps m1, m0
  569. %endif
  570. paddw m0, m1
  571. %ifnidn %1, ssse3
  572. %ifidn %1, mmx
  573. mova m1, m0
  574. psrlq m1, 32
  575. %elifidn %1, mmx2
  576. pshufw m1, m0, 0xE
  577. %else ; mmsize == 16
  578. pshuflw m1, m0, 0xE
  579. %endif
  580. paddw m0, m1
  581. %endif ; !ssse3
  582. %ifidn %1, mmx
  583. mova m1, m0
  584. psrlq m1, 16
  585. %elifidn %1, mmx2
  586. pshufw m1, m0, 0x1
  587. %else
  588. pshuflw m1, m0, 0x1
  589. %endif
  590. paddw m0, m1 ; sum of H coefficients
  591. pmullw m0, [pw_17]
  592. paddw m0, [pw_16]
  593. psraw m0, 5
  594. lea r4, [r0+r2*4-1]
  595. lea r3, [r0 -1]
  596. add r4, r2
  597. %ifdef ARCH_X86_64
  598. %define e_reg r11
  599. %else
  600. %define e_reg r0
  601. %endif
  602. movzx e_reg, byte [r3+r2*2 ]
  603. movzx r5, byte [r4+r1 ]
  604. sub r5, e_reg
  605. movzx e_reg, byte [r3 ]
  606. %ifdef ARCH_X86_64
  607. movzx r10, byte [r4+r2 ]
  608. sub r10, e_reg
  609. sub r5, r10
  610. %else
  611. movzx r6, byte [r4+r2 ]
  612. sub r6, e_reg
  613. lea r5, [r5+r6*4]
  614. sub r5, r6
  615. %endif
  616. movzx e_reg, byte [r3+r1 ]
  617. movzx r6, byte [r4+r2*2 ]
  618. sub r6, e_reg
  619. %ifdef ARCH_X86_64
  620. add r6, r10
  621. %endif
  622. lea r5, [r5+r6*4]
  623. movzx e_reg, byte [r3+r2 ]
  624. movzx r6, byte [r4 ]
  625. sub r6, e_reg
  626. lea r6, [r5+r6*2]
  627. lea r5, [r6*9+16]
  628. lea r5, [r5+r6*8]
  629. sar r5, 5
  630. %ifndef ARCH_X86_64
  631. mov r0, r0m
  632. %endif
  633. movzx r3, byte [r4+r2*2 ]
  634. movzx r4, byte [r0+r1 +7]
  635. lea r3, [r3+r4+1]
  636. shl r3, 4
  637. movd r1d, m0
  638. movsx r1d, r1w
  639. add r1d, r5d
  640. sub r3d, r1d
  641. add r1d, r1d
  642. sub r3d, r1d ; a
  643. movd m1, r5d
  644. movd m3, r3d
  645. %ifidn %1, mmx
  646. punpcklwd m0, m0
  647. punpcklwd m1, m1
  648. punpcklwd m3, m3
  649. punpckldq m0, m0
  650. punpckldq m1, m1
  651. punpckldq m3, m3
  652. %elifidn %1, mmx2
  653. pshufw m0, m0, 0x0
  654. pshufw m1, m1, 0x0
  655. pshufw m3, m3, 0x0
  656. %else
  657. pshuflw m0, m0, 0x0
  658. pshuflw m1, m1, 0x0
  659. pshuflw m3, m3, 0x0
  660. punpcklqdq m0, m0 ; splat H (words)
  661. punpcklqdq m1, m1 ; splat V (words)
  662. punpcklqdq m3, m3 ; splat a (words)
  663. %endif
  664. %if mmsize == 8
  665. mova m2, m0
  666. %endif
  667. pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
  668. paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
  669. %if mmsize == 8
  670. psllw m2, 2
  671. paddw m2, m0 ; a + {4,5,6,7}*H
  672. %endif
  673. mov r4, 4
  674. ALIGN 16
  675. .loop
  676. %if mmsize == 16
  677. mova m3, m0 ; b[0..7]
  678. paddw m0, m1
  679. psraw m3, 5
  680. mova m4, m0 ; V+b[0..7]
  681. paddw m0, m1
  682. psraw m4, 5
  683. packuswb m3, m4
  684. movh [r0], m3
  685. movhps [r0+r2], m3
  686. %else ; mmsize == 8
  687. mova m3, m0 ; b[0..3]
  688. mova m4, m2 ; b[4..7]
  689. paddw m0, m1
  690. paddw m2, m1
  691. psraw m3, 5
  692. psraw m4, 5
  693. mova m5, m0 ; V+b[0..3]
  694. mova m6, m2 ; V+b[4..7]
  695. paddw m0, m1
  696. paddw m2, m1
  697. psraw m5, 5
  698. psraw m6, 5
  699. packuswb m3, m4
  700. packuswb m5, m6
  701. mova [r0], m3
  702. mova [r0+r2], m5
  703. %endif
  704. lea r0, [r0+r2*2]
  705. dec r4
  706. jg .loop
  707. REP_RET
  708. %endmacro
  709. INIT_MMX
  710. H264_PRED8x8_PLANE mmx, 0
  711. H264_PRED8x8_PLANE mmx2, 0
  712. INIT_XMM
  713. H264_PRED8x8_PLANE sse2, 8
  714. H264_PRED8x8_PLANE ssse3, 8
  715. ;-----------------------------------------------------------------------------
  716. ; void pred8x8_vertical(uint8_t *src, int stride)
  717. ;-----------------------------------------------------------------------------
  718. cglobal pred8x8_vertical_mmx, 2,2
  719. sub r0, r1
  720. movq mm0, [r0]
  721. %rep 3
  722. movq [r0+r1*1], mm0
  723. movq [r0+r1*2], mm0
  724. lea r0, [r0+r1*2]
  725. %endrep
  726. movq [r0+r1*1], mm0
  727. movq [r0+r1*2], mm0
  728. RET
  729. ;-----------------------------------------------------------------------------
  730. ; void pred8x8_horizontal(uint8_t *src, int stride)
  731. ;-----------------------------------------------------------------------------
  732. %macro PRED8x8_H 1
  733. cglobal pred8x8_horizontal_%1, 2,3
  734. mov r2, 4
  735. %ifidn %1, ssse3
  736. mova m2, [pb_3]
  737. %endif
  738. .loop:
  739. movd m0, [r0+r1*0-4]
  740. movd m1, [r0+r1*1-4]
  741. %ifidn %1, ssse3
  742. pshufb m0, m2
  743. pshufb m1, m2
  744. %else
  745. punpcklbw m0, m0
  746. punpcklbw m1, m1
  747. %ifidn %1, mmxext
  748. pshufw m0, m0, 0xff
  749. pshufw m1, m1, 0xff
  750. %else
  751. punpckhwd m0, m0
  752. punpckhwd m1, m1
  753. punpckhdq m0, m0
  754. punpckhdq m1, m1
  755. %endif
  756. %endif
  757. mova [r0+r1*0], m0
  758. mova [r0+r1*1], m1
  759. lea r0, [r0+r1*2]
  760. dec r2
  761. jg .loop
  762. REP_RET
  763. %endmacro
  764. INIT_MMX
  765. PRED8x8_H mmx
  766. PRED8x8_H mmxext
  767. PRED8x8_H ssse3
  768. ;-----------------------------------------------------------------------------
  769. ; void pred8x8_top_dc_mmxext(uint8_t *src, int stride)
  770. ;-----------------------------------------------------------------------------
  771. %ifdef CONFIG_GPL
  772. cglobal pred8x8_top_dc_mmxext, 2,5
  773. sub r0, r1
  774. movq mm0, [r0]
  775. pxor mm1, mm1
  776. pxor mm2, mm2
  777. lea r2, [r0+r1*2]
  778. punpckhbw mm1, mm0
  779. punpcklbw mm0, mm2
  780. psadbw mm1, mm2 ; s1
  781. lea r3, [r2+r1*2]
  782. psadbw mm0, mm2 ; s0
  783. psrlw mm1, 1
  784. psrlw mm0, 1
  785. pavgw mm1, mm2
  786. lea r4, [r3+r1*2]
  787. pavgw mm0, mm2
  788. pshufw mm1, mm1, 0
  789. pshufw mm0, mm0, 0 ; dc0 (w)
  790. packuswb mm0, mm1 ; dc0,dc1 (b)
  791. movq [r0+r1*1], mm0
  792. movq [r0+r1*2], mm0
  793. lea r0, [r3+r1*2]
  794. movq [r2+r1*1], mm0
  795. movq [r2+r1*2], mm0
  796. movq [r3+r1*1], mm0
  797. movq [r3+r1*2], mm0
  798. movq [r0+r1*1], mm0
  799. movq [r0+r1*2], mm0
  800. RET
  801. ;-----------------------------------------------------------------------------
  802. ; void pred8x8_dc_mmxext(uint8_t *src, int stride)
  803. ;-----------------------------------------------------------------------------
  804. INIT_MMX
  805. cglobal pred8x8_dc_mmxext, 2,5
  806. sub r0, r1
  807. pxor m7, m7
  808. movd m0, [r0+0]
  809. movd m1, [r0+4]
  810. psadbw m0, m7 ; s0
  811. mov r4, r0
  812. psadbw m1, m7 ; s1
  813. movzx r2d, byte [r0+r1*1-1]
  814. movzx r3d, byte [r0+r1*2-1]
  815. lea r0, [r0+r1*2]
  816. add r2d, r3d
  817. movzx r3d, byte [r0+r1*1-1]
  818. add r2d, r3d
  819. movzx r3d, byte [r0+r1*2-1]
  820. add r2d, r3d
  821. lea r0, [r0+r1*2]
  822. movd m2, r2d ; s2
  823. movzx r2d, byte [r0+r1*1-1]
  824. movzx r3d, byte [r0+r1*2-1]
  825. lea r0, [r0+r1*2]
  826. add r2d, r3d
  827. movzx r3d, byte [r0+r1*1-1]
  828. add r2d, r3d
  829. movzx r3d, byte [r0+r1*2-1]
  830. add r2d, r3d
  831. movd m3, r2d ; s3
  832. punpcklwd m0, m1
  833. mov r0, r4
  834. punpcklwd m2, m3
  835. punpckldq m0, m2 ; s0, s1, s2, s3
  836. pshufw m3, m0, 11110110b ; s2, s1, s3, s3
  837. lea r2, [r0+r1*2]
  838. pshufw m0, m0, 01110100b ; s0, s1, s3, s1
  839. paddw m0, m3
  840. lea r3, [r2+r1*2]
  841. psrlw m0, 2
  842. pavgw m0, m7 ; s0+s2, s1, s3, s1+s3
  843. lea r4, [r3+r1*2]
  844. packuswb m0, m0
  845. punpcklbw m0, m0
  846. movq m1, m0
  847. punpcklbw m0, m0
  848. punpckhbw m1, m1
  849. movq [r0+r1*1], m0
  850. movq [r0+r1*2], m0
  851. movq [r2+r1*1], m0
  852. movq [r2+r1*2], m0
  853. movq [r3+r1*1], m1
  854. movq [r3+r1*2], m1
  855. movq [r4+r1*1], m1
  856. movq [r4+r1*2], m1
  857. RET
  858. %endif
  859. ;-----------------------------------------------------------------------------
  860. ; void pred8x8_dc_rv40(uint8_t *src, int stride)
  861. ;-----------------------------------------------------------------------------
  862. cglobal pred8x8_dc_rv40_mmxext, 2,7
  863. mov r4, r0
  864. sub r0, r1
  865. pxor mm0, mm0
  866. psadbw mm0, [r0]
  867. dec r0
  868. movzx r5d, byte [r0+r1*1]
  869. movd r6d, mm0
  870. lea r0, [r0+r1*2]
  871. %rep 3
  872. movzx r2d, byte [r0+r1*0]
  873. movzx r3d, byte [r0+r1*1]
  874. add r5d, r2d
  875. add r6d, r3d
  876. lea r0, [r0+r1*2]
  877. %endrep
  878. movzx r2d, byte [r0+r1*0]
  879. add r5d, r6d
  880. lea r2d, [r2+r5+8]
  881. shr r2d, 4
  882. movd mm0, r2d
  883. punpcklbw mm0, mm0
  884. pshufw mm0, mm0, 0
  885. mov r3d, 4
  886. .loop:
  887. movq [r4+r1*0], mm0
  888. movq [r4+r1*1], mm0
  889. lea r4, [r4+r1*2]
  890. dec r3d
  891. jg .loop
  892. REP_RET
  893. ;-----------------------------------------------------------------------------
  894. ; void pred8x8_tm_vp8(uint8_t *src, int stride)
  895. ;-----------------------------------------------------------------------------
  896. %macro PRED8x8_TM_MMX 1
  897. cglobal pred8x8_tm_vp8_%1, 2,6
  898. sub r0, r1
  899. pxor mm7, mm7
  900. movq mm0, [r0]
  901. movq mm1, mm0
  902. punpcklbw mm0, mm7
  903. punpckhbw mm1, mm7
  904. movzx r4d, byte [r0-1]
  905. mov r5d, 4
  906. .loop:
  907. movzx r2d, byte [r0+r1*1-1]
  908. movzx r3d, byte [r0+r1*2-1]
  909. sub r2d, r4d
  910. sub r3d, r4d
  911. movd mm2, r2d
  912. movd mm4, r3d
  913. %ifidn %1, mmx
  914. punpcklwd mm2, mm2
  915. punpcklwd mm4, mm4
  916. punpckldq mm2, mm2
  917. punpckldq mm4, mm4
  918. %else
  919. pshufw mm2, mm2, 0
  920. pshufw mm4, mm4, 0
  921. %endif
  922. movq mm3, mm2
  923. movq mm5, mm4
  924. paddw mm2, mm0
  925. paddw mm3, mm1
  926. paddw mm4, mm0
  927. paddw mm5, mm1
  928. packuswb mm2, mm3
  929. packuswb mm4, mm5
  930. movq [r0+r1*1], mm2
  931. movq [r0+r1*2], mm4
  932. lea r0, [r0+r1*2]
  933. dec r5d
  934. jg .loop
  935. REP_RET
  936. %endmacro
  937. PRED8x8_TM_MMX mmx
  938. PRED8x8_TM_MMX mmxext
  939. cglobal pred8x8_tm_vp8_sse2, 2,6,4
  940. sub r0, r1
  941. pxor xmm1, xmm1
  942. movq xmm0, [r0]
  943. punpcklbw xmm0, xmm1
  944. movzx r4d, byte [r0-1]
  945. mov r5d, 4
  946. .loop:
  947. movzx r2d, byte [r0+r1*1-1]
  948. movzx r3d, byte [r0+r1*2-1]
  949. sub r2d, r4d
  950. sub r3d, r4d
  951. movd xmm2, r2d
  952. movd xmm3, r3d
  953. pshuflw xmm2, xmm2, 0
  954. pshuflw xmm3, xmm3, 0
  955. punpcklqdq xmm2, xmm2
  956. punpcklqdq xmm3, xmm3
  957. paddw xmm2, xmm0
  958. paddw xmm3, xmm0
  959. packuswb xmm2, xmm3
  960. movq [r0+r1*1], xmm2
  961. movhps [r0+r1*2], xmm2
  962. lea r0, [r0+r1*2]
  963. dec r5d
  964. jg .loop
  965. REP_RET
  966. cglobal pred8x8_tm_vp8_ssse3, 2,3,6
  967. sub r0, r1
  968. movdqa xmm4, [tm_shuf]
  969. pxor xmm1, xmm1
  970. movq xmm0, [r0]
  971. punpcklbw xmm0, xmm1
  972. movd xmm5, [r0-4]
  973. pshufb xmm5, xmm4
  974. mov r2d, 4
  975. .loop:
  976. movd xmm2, [r0+r1*1-4]
  977. movd xmm3, [r0+r1*2-4]
  978. pshufb xmm2, xmm4
  979. pshufb xmm3, xmm4
  980. psubw xmm2, xmm5
  981. psubw xmm3, xmm5
  982. paddw xmm2, xmm0
  983. paddw xmm3, xmm0
  984. packuswb xmm2, xmm3
  985. movq [r0+r1*1], xmm2
  986. movhps [r0+r1*2], xmm2
  987. lea r0, [r0+r1*2]
  988. dec r2d
  989. jg .loop
  990. REP_RET
  991. ; dest, left, right, src, tmp
  992. ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
  993. %macro PRED4x4_LOWPASS 5
  994. mova %5, %2
  995. pavgb %2, %3
  996. pxor %3, %5
  997. mova %1, %4
  998. pand %3, [pb_1]
  999. psubusb %2, %3
  1000. pavgb %1, %2
  1001. %endmacro
  1002. ;-----------------------------------------------------------------------------
  1003. ; void pred8x8l_top_dc(uint8_t *src, int has_topleft, int has_topright, int stride)
  1004. ;-----------------------------------------------------------------------------
  1005. %ifdef CONFIG_GPL
  1006. %macro PRED8x8L_TOP_DC 1
  1007. cglobal pred8x8l_top_dc_%1, 4,4
  1008. sub r0, r3
  1009. pxor mm7, mm7
  1010. movq mm0, [r0-8]
  1011. movq mm3, [r0]
  1012. movq mm1, [r0+8]
  1013. movq mm2, mm3
  1014. movq mm4, mm3
  1015. PALIGNR mm2, mm0, 7, mm0
  1016. PALIGNR mm1, mm4, 1, mm4
  1017. test r1, r1 ; top_left
  1018. jz .fix_lt_2
  1019. test r2, r2 ; top_right
  1020. jz .fix_tr_1
  1021. jmp .body
  1022. .fix_lt_2:
  1023. movq mm5, mm3
  1024. pxor mm5, mm2
  1025. psllq mm5, 56
  1026. psrlq mm5, 56
  1027. pxor mm2, mm5
  1028. test r2, r2 ; top_right
  1029. jnz .body
  1030. .fix_tr_1:
  1031. movq mm5, mm3
  1032. pxor mm5, mm1
  1033. psrlq mm5, 56
  1034. psllq mm5, 56
  1035. pxor mm1, mm5
  1036. .body
  1037. PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
  1038. psadbw mm7, mm0
  1039. paddw mm7, [pw_4]
  1040. psrlw mm7, 3
  1041. pshufw mm7, mm7, 0
  1042. packuswb mm7, mm7
  1043. %rep 3
  1044. movq [r0+r3*1], mm7
  1045. movq [r0+r3*2], mm7
  1046. lea r0, [r0+r3*2]
  1047. %endrep
  1048. movq [r0+r3*1], mm7
  1049. movq [r0+r3*2], mm7
  1050. RET
  1051. %endmacro
  1052. INIT_MMX
  1053. %define PALIGNR PALIGNR_MMX
  1054. PRED8x8L_TOP_DC mmxext
  1055. %define PALIGNR PALIGNR_SSSE3
  1056. PRED8x8L_TOP_DC ssse3
  1057. ;-----------------------------------------------------------------------------
  1058. ;void pred8x8l_dc(uint8_t *src, int has_topleft, int has_topright, int stride)
  1059. ;-----------------------------------------------------------------------------
  1060. %macro PRED8x8L_DC 1
  1061. cglobal pred8x8l_dc_%1, 4,5
  1062. sub r0, r3
  1063. lea r4, [r0+r3*2]
  1064. movq mm0, [r0+r3*1-8]
  1065. punpckhbw mm0, [r0+r3*0-8]
  1066. movq mm1, [r4+r3*1-8]
  1067. punpckhbw mm1, [r0+r3*2-8]
  1068. mov r4, r0
  1069. punpckhwd mm1, mm0
  1070. lea r0, [r0+r3*4]
  1071. movq mm2, [r0+r3*1-8]
  1072. punpckhbw mm2, [r0+r3*0-8]
  1073. lea r0, [r0+r3*2]
  1074. movq mm3, [r0+r3*1-8]
  1075. punpckhbw mm3, [r0+r3*0-8]
  1076. punpckhwd mm3, mm2
  1077. punpckhdq mm3, mm1
  1078. lea r0, [r0+r3*2]
  1079. movq mm0, [r0+r3*0-8]
  1080. movq mm1, [r4]
  1081. mov r0, r4
  1082. movq mm4, mm3
  1083. movq mm2, mm3
  1084. PALIGNR mm4, mm0, 7, mm0
  1085. PALIGNR mm1, mm2, 1, mm2
  1086. test r1, r1
  1087. jnz .do_left
  1088. .fix_lt_1:
  1089. movq mm5, mm3
  1090. pxor mm5, mm4
  1091. psrlq mm5, 56
  1092. psllq mm5, 48
  1093. pxor mm1, mm5
  1094. jmp .do_left
  1095. .fix_lt_2:
  1096. movq mm5, mm3
  1097. pxor mm5, mm2
  1098. psllq mm5, 56
  1099. psrlq mm5, 56
  1100. pxor mm2, mm5
  1101. test r2, r2
  1102. jnz .body
  1103. .fix_tr_1:
  1104. movq mm5, mm3
  1105. pxor mm5, mm1
  1106. psrlq mm5, 56
  1107. psllq mm5, 56
  1108. pxor mm1, mm5
  1109. jmp .body
  1110. .do_left:
  1111. movq mm0, mm4
  1112. PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
  1113. movq mm4, mm0
  1114. movq mm7, mm2
  1115. PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
  1116. psllq mm1, 56
  1117. PALIGNR mm7, mm1, 7, mm3
  1118. movq mm0, [r0-8]
  1119. movq mm3, [r0]
  1120. movq mm1, [r0+8]
  1121. movq mm2, mm3
  1122. movq mm4, mm3
  1123. PALIGNR mm2, mm0, 7, mm0
  1124. PALIGNR mm1, mm4, 1, mm4
  1125. test r1, r1
  1126. jz .fix_lt_2
  1127. test r2, r2
  1128. jz .fix_tr_1
  1129. .body
  1130. lea r1, [r0+r3*2]
  1131. PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
  1132. pxor mm0, mm0
  1133. pxor mm1, mm1
  1134. lea r2, [r1+r3*2]
  1135. psadbw mm0, mm7
  1136. psadbw mm1, mm6
  1137. paddw mm0, [pw_8]
  1138. paddw mm0, mm1
  1139. lea r4, [r2+r3*2]
  1140. psrlw mm0, 4
  1141. pshufw mm0, mm0, 0
  1142. packuswb mm0, mm0
  1143. movq [r0+r3*1], mm0
  1144. movq [r0+r3*2], mm0
  1145. movq [r1+r3*1], mm0
  1146. movq [r1+r3*2], mm0
  1147. movq [r2+r3*1], mm0
  1148. movq [r2+r3*2], mm0
  1149. movq [r4+r3*1], mm0
  1150. movq [r4+r3*2], mm0
  1151. RET
  1152. %endmacro
  1153. INIT_MMX
  1154. %define PALIGNR PALIGNR_MMX
  1155. PRED8x8L_DC mmxext
  1156. %define PALIGNR PALIGNR_SSSE3
  1157. PRED8x8L_DC ssse3
  1158. ;-----------------------------------------------------------------------------
  1159. ; void pred8x8l_horizontal(uint8_t *src, int has_topleft, int has_topright, int stride)
  1160. ;-----------------------------------------------------------------------------
%macro PRED8x8L_HORIZONTAL 1
; Predict an 8x8 luma block from its left neighbours: each output row is the
; corresponding lowpass-filtered left-column pixel broadcast across the row.
; Args (x86inc order): r0 = src, r1 = has_topleft, r2 = has_topright,
; r3 = stride.  %1 = cpu suffix (mmxext/ssse3).
cglobal pred8x8l_horizontal_%1, 4,4
sub r0, r3 ; r0 -> row above the block
lea r2, [r0+r3*2]
; pack the 8 left-neighbour pixels (read 8 bytes left of each row, high
; byte of each qword) into mm3, one byte per row
movq mm0, [r0+r3*1-8]
punpckhbw mm0, [r0+r3*0-8]
movq mm1, [r2+r3*1-8]
punpckhbw mm1, [r0+r3*2-8]
mov r2, r0
punpckhwd mm1, mm0
lea r0, [r0+r3*4]
movq mm2, [r0+r3*1-8]
punpckhbw mm2, [r0+r3*0-8]
lea r0, [r0+r3*2]
movq mm3, [r0+r3*1-8]
punpckhbw mm3, [r0+r3*0-8]
punpckhwd mm3, mm2
punpckhdq mm3, mm1 ; mm3 = packed left column
lea r0, [r0+r3*2]
movq mm0, [r0+r3*0-8]
movq mm1, [r2] ; row above the block
mov r0, r2 ; restore r0 = src - stride
movq mm4, mm3
movq mm2, mm3
PALIGNR mm4, mm0, 7, mm0 ; column shifted by one (lowpass neighbour)
PALIGNR mm1, mm2, 1, mm2 ; column shifted the other way
test r1, r1 ; top_left
jnz .do_left
.fix_lt_1: ; top-left unavailable: patch its slot in the shifted vector
movq mm5, mm3
pxor mm5, mm4
psrlq mm5, 56
psllq mm5, 48
pxor mm1, mm5
.do_left:
; lowpass filter the left column; mm7 ends up with the filtered pixels
movq mm0, mm4
PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
movq mm4, mm0
movq mm7, mm2
PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
psllq mm1, 56
PALIGNR mm7, mm1, 7, mm3
movq mm3, mm7
lea r1, [r0+r3*2]
movq mm7, mm3
; duplicate each byte, then pshufw broadcasts one filtered left pixel
; per destination row
punpckhbw mm3, mm3
punpcklbw mm7, mm7
pshufw mm0, mm3, 0xff
pshufw mm1, mm3, 0xaa
lea r2, [r1+r3*2]
pshufw mm2, mm3, 0x55
pshufw mm3, mm3, 0x00
pshufw mm4, mm7, 0xff
pshufw mm5, mm7, 0xaa
pshufw mm6, mm7, 0x55
pshufw mm7, mm7, 0x00
movq [r0+r3*1], mm0
movq [r0+r3*2], mm1
movq [r1+r3*1], mm2
movq [r1+r3*2], mm3
movq [r2+r3*1], mm4
movq [r2+r3*2], mm5
lea r0, [r2+r3*2]
movq [r0+r3*1], mm6
movq [r0+r3*2], mm7
RET
%endmacro
INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_HORIZONTAL mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_HORIZONTAL ssse3
  1233. ;-----------------------------------------------------------------------------
  1234. ; void pred8x8l_vertical(uint8_t *src, int has_topleft, int has_topright, int stride)
  1235. ;-----------------------------------------------------------------------------
  1236. %macro PRED8x8L_VERTICAL 1
  1237. cglobal pred8x8l_vertical_%1, 4,4
  1238. sub r0, r3
  1239. movq mm0, [r0-8]
  1240. movq mm3, [r0]
  1241. movq mm1, [r0+8]
  1242. movq mm2, mm3
  1243. movq mm4, mm3
  1244. PALIGNR mm2, mm0, 7, mm0
  1245. PALIGNR mm1, mm4, 1, mm4
  1246. test r1, r1 ; top_left
  1247. jz .fix_lt_2
  1248. test r2, r2 ; top_right
  1249. jz .fix_tr_1
  1250. jmp .body
  1251. .fix_lt_2:
  1252. movq mm5, mm3
  1253. pxor mm5, mm2
  1254. psllq mm5, 56
  1255. psrlq mm5, 56
  1256. pxor mm2, mm5
  1257. test r2, r2 ; top_right
  1258. jnz .body
  1259. .fix_tr_1:
  1260. movq mm5, mm3
  1261. pxor mm5, mm1
  1262. psrlq mm5, 56
  1263. psllq mm5, 56
  1264. pxor mm1, mm5
  1265. .body
  1266. PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
  1267. %rep 3
  1268. movq [r0+r3*1], mm0
  1269. movq [r0+r3*2], mm0
  1270. lea r0, [r0+r3*2]
  1271. %endrep
  1272. movq [r0+r3*1], mm0
  1273. movq [r0+r3*2], mm0
  1274. RET
  1275. %endmacro
  1276. INIT_MMX
  1277. %define PALIGNR PALIGNR_MMX
  1278. PRED8x8L_VERTICAL mmxext
  1279. %define PALIGNR PALIGNR_SSSE3
  1280. PRED8x8L_VERTICAL ssse3
  1281. ;-----------------------------------------------------------------------------
  1282. ;void pred8x8l_down_left(uint8_t *src, int has_topleft, int has_topright, int stride)
  1283. ;-----------------------------------------------------------------------------
INIT_MMX
%define PALIGNR PALIGNR_MMX
; Diagonal down-left 8x8 luma prediction, MMXEXT version.
; Args: r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride.
cglobal pred8x8l_down_left_mmxext, 4,5
sub r0, r3 ; r0 -> row above the block
movq mm0, [r0-8]
movq mm3, [r0] ; top row
movq mm1, [r0+8] ; top-right neighbours
movq mm2, mm3
movq mm4, mm3
PALIGNR mm2, mm0, 7, mm0 ; top row shifted right one byte
PALIGNR mm1, mm4, 1, mm4 ; top row shifted left one byte
test r1, r1
jz .fix_lt_2
test r2, r2
jz .fix_tr_1
jmp .do_top
.fix_lt_2: ; top-left unavailable: patch with t0
movq mm5, mm3
pxor mm5, mm2
psllq mm5, 56
psrlq mm5, 56
pxor mm2, mm5
test r2, r2
jnz .do_top
.fix_tr_1: ; top-right unavailable: patch with t7
movq mm5, mm3
pxor mm5, mm1
psrlq mm5, 56
psllq mm5, 56
pxor mm1, mm5
jmp .do_top
.fix_tr_2: ; whole top-right block missing: replicate last top pixel
punpckhbw mm3, mm3
pshufw mm1, mm3, 0xFF
jmp .do_topright
.do_top:
PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 ; filtered top row
movq mm7, mm4
test r2, r2
jz .fix_tr_2
; filter the top-right 8 pixels as well
movq mm0, [r0+8]
movq mm5, mm0
movq mm2, mm0
movq mm4, mm0
psrlq mm5, 56
PALIGNR mm2, mm3, 7, mm3
PALIGNR mm5, mm4, 1, mm4
PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
lea r1, [r0+r3*2]
movq mm6, mm1
psrlq mm1, 56
movq mm4, mm1
lea r2, [r1+r3*2]
movq mm2, mm6
PALIGNR mm2, mm7, 1, mm0
movq mm3, mm6
PALIGNR mm3, mm7, 7, mm0
PALIGNR mm4, mm6, 1, mm0
movq mm5, mm7
movq mm1, mm7
movq mm7, mm6
lea r4, [r2+r3*2]
psllq mm1, 8
PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
; mm1:mm0 hold the diagonal; emit rows bottom-up, shifting the pair one
; byte left per row so each row starts one diagonal step earlier
movq [r4+r3*2], mm1
movq mm2, mm0
psllq mm1, 8
psrlq mm2, 56
psllq mm0, 8
por mm1, mm2
movq [r4+r3*1], mm1
movq mm2, mm0
psllq mm1, 8
psrlq mm2, 56
psllq mm0, 8
por mm1, mm2
movq [r2+r3*2], mm1
movq mm2, mm0
psllq mm1, 8
psrlq mm2, 56
psllq mm0, 8
por mm1, mm2
movq [r2+r3*1], mm1
movq mm2, mm0
psllq mm1, 8
psrlq mm2, 56
psllq mm0, 8
por mm1, mm2
movq [r1+r3*2], mm1
movq mm2, mm0
psllq mm1, 8
psrlq mm2, 56
psllq mm0, 8
por mm1, mm2
movq [r1+r3*1], mm1
movq mm2, mm0
psllq mm1, 8
psrlq mm2, 56
psllq mm0, 8
por mm1, mm2
movq [r0+r3*2], mm1
psllq mm1, 8
psrlq mm0, 56
por mm1, mm0
movq [r0+r3*1], mm1
RET
%macro PRED8x8L_DOWN_LEFT 1
; Diagonal down-left 8x8 luma prediction, SSE2/SSSE3 version: the MMX
; edge-filtering front end feeds a single 16-byte XMM diagonal.
; Args: r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride.
cglobal pred8x8l_down_left_%1, 4,4
sub r0, r3 ; r0 -> row above the block
movq mm0, [r0-8]
movq mm3, [r0] ; top row
movq mm1, [r0+8] ; top-right neighbours
movq mm2, mm3
movq mm4, mm3
PALIGNR mm2, mm0, 7, mm0
PALIGNR mm1, mm4, 1, mm4
test r1, r1 ; top_left
jz .fix_lt_2
test r2, r2 ; top_right
jz .fix_tr_1
jmp .do_top
.fix_lt_2: ; top-left unavailable: patch with t0
movq mm5, mm3
pxor mm5, mm2
psllq mm5, 56
psrlq mm5, 56
pxor mm2, mm5
test r2, r2 ; top_right
jnz .do_top
.fix_tr_1: ; top-right unavailable: patch with t7
movq mm5, mm3
pxor mm5, mm1
psrlq mm5, 56
psllq mm5, 56
pxor mm1, mm5
jmp .do_top
.fix_tr_2: ; whole top-right block missing: replicate last top pixel
punpckhbw mm3, mm3
pshufw mm1, mm3, 0xFF
jmp .do_topright
.do_top:
PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 ; filtered top row
movq2dq xmm3, mm4 ; low half of the 16-byte edge
test r2, r2 ; top_right
jz .fix_tr_2
movq mm0, [r0+8]
movq mm5, mm0
movq mm2, mm0
movq mm4, mm0
psrlq mm5, 56
PALIGNR mm2, mm3, 7, mm3
PALIGNR mm5, mm4, 1, mm4
PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4 ; filtered top-right pixels
.do_topright:
; build the 16-byte top edge in xmm3 and its shifted neighbours, then
; lowpass once in XMM; each output row is the result shifted one byte
movq2dq xmm4, mm1
psrlq mm1, 56
movq2dq xmm5, mm1
lea r1, [r0+r3*2]
pslldq xmm4, 8
por xmm3, xmm4
movdqa xmm2, xmm3
psrldq xmm2, 1
pslldq xmm5, 15
por xmm2, xmm5
lea r2, [r1+r3*2]
movdqa xmm1, xmm3
pslldq xmm1, 1
INIT_XMM
PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
psrldq xmm0, 1
movq [r0+r3*1], xmm0
psrldq xmm0, 1
movq [r0+r3*2], xmm0
psrldq xmm0, 1
lea r0, [r2+r3*2]
movq [r1+r3*1], xmm0
psrldq xmm0, 1
movq [r1+r3*2], xmm0
psrldq xmm0, 1
movq [r2+r3*1], xmm0
psrldq xmm0, 1
movq [r2+r3*2], xmm0
psrldq xmm0, 1
movq [r0+r3*1], xmm0
psrldq xmm0, 1
movq [r0+r3*2], xmm0
RET
%endmacro
INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_DOWN_LEFT sse2
INIT_MMX
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_DOWN_LEFT ssse3
  1480. ;-----------------------------------------------------------------------------
  1481. ;void pred8x8l_down_right_mmxext(uint8_t *src, int has_topleft, int has_topright, int stride)
  1482. ;-----------------------------------------------------------------------------
  1483. INIT_MMX
  1484. %define PALIGNR PALIGNR_MMX
  1485. cglobal pred8x8l_down_right_mmxext, 4,5
  1486. sub r0, r3
  1487. lea r4, [r0+r3*2]
  1488. movq mm0, [r0+r3*1-8]
  1489. punpckhbw mm0, [r0+r3*0-8]
  1490. movq mm1, [r4+r3*1-8]
  1491. punpckhbw mm1, [r0+r3*2-8]
  1492. mov r4, r0
  1493. punpckhwd mm1, mm0
  1494. lea r0, [r0+r3*4]
  1495. movq mm2, [r0+r3*1-8]
  1496. punpckhbw mm2, [r0+r3*0-8]
  1497. lea r0, [r0+r3*2]
  1498. movq mm3, [r0+r3*1-8]
  1499. punpckhbw mm3, [r0+r3*0-8]
  1500. punpckhwd mm3, mm2
  1501. punpckhdq mm3, mm1
  1502. lea r0, [r0+r3*2]
  1503. movq mm0, [r0+r3*0-8]
  1504. movq mm1, [r4]
  1505. mov r0, r4
  1506. movq mm4, mm3
  1507. movq mm2, mm3
  1508. PALIGNR mm4, mm0, 7, mm0
  1509. PALIGNR mm1, mm2, 1, mm2
  1510. test r1, r1 ; top_left
  1511. jz .fix_lt_1
  1512. .do_left:
  1513. movq mm0, mm4
  1514. PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
  1515. movq mm4, mm0
  1516. movq mm7, mm2
  1517. movq mm6, mm2
  1518. PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
  1519. psllq mm1, 56
  1520. PALIGNR mm7, mm1, 7, mm3
  1521. movq mm0, [r0-8]
  1522. movq mm3, [r0]
  1523. movq mm1, [r0+8]
  1524. movq mm2, mm3
  1525. movq mm4, mm3
  1526. PALIGNR mm2, mm0, 7, mm0
  1527. PALIGNR mm1, mm4, 1, mm4
  1528. test r1, r1 ; top_left
  1529. jz .fix_lt_2
  1530. test r2, r2 ; top_right
  1531. jz .fix_tr_1
  1532. .do_top:
  1533. PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
  1534. movq mm5, mm4
  1535. jmp .body
  1536. .fix_lt_1:
  1537. movq mm5, mm3
  1538. pxor mm5, mm4
  1539. psrlq mm5, 56
  1540. psllq mm5, 48
  1541. pxor mm1, mm5
  1542. jmp .do_left
  1543. .fix_lt_2:
  1544. movq mm5, mm3
  1545. pxor mm5, mm2
  1546. psllq mm5, 56
  1547. psrlq mm5, 56
  1548. pxor mm2, mm5
  1549. test r2, r2 ; top_right
  1550. jnz .do_top
  1551. .fix_tr_1:
  1552. movq mm5, mm3
  1553. pxor mm5, mm1
  1554. psrlq mm5, 56
  1555. psllq mm5, 56
  1556. pxor mm1, mm5
  1557. jmp .do_top
  1558. .body
  1559. lea r1, [r0+r3*2]
  1560. movq mm1, mm7
  1561. movq mm7, mm5
  1562. movq mm5, mm6
  1563. movq mm2, mm7
  1564. lea r2, [r1+r3*2]
  1565. PALIGNR mm2, mm6, 1, mm0
  1566. movq mm3, mm7
  1567. PALIGNR mm3, mm6, 7, mm0
  1568. movq mm4, mm7
  1569. lea r4, [r2+r3*2]
  1570. psrlq mm4, 8
  1571. PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
  1572. PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
  1573. movq [r4+r3*2], mm0
  1574. movq mm2, mm1
  1575. psrlq mm0, 8
  1576. psllq mm2, 56
  1577. psrlq mm1, 8
  1578. por mm0, mm2
  1579. movq [r4+r3*1], mm0
  1580. movq mm2, mm1
  1581. psrlq mm0, 8
  1582. psllq mm2, 56
  1583. psrlq mm1, 8
  1584. por mm0, mm2
  1585. movq [r2+r3*2], mm0
  1586. movq mm2, mm1
  1587. psrlq mm0, 8
  1588. psllq mm2, 56
  1589. psrlq mm1, 8
  1590. por mm0, mm2
  1591. movq [r2+r3*1], mm0
  1592. movq mm2, mm1
  1593. psrlq mm0, 8
  1594. psllq mm2, 56
  1595. psrlq mm1, 8
  1596. por mm0, mm2
  1597. movq [r1+r3*2], mm0
  1598. movq mm2, mm1
  1599. psrlq mm0, 8
  1600. psllq mm2, 56
  1601. psrlq mm1, 8
  1602. por mm0, mm2
  1603. movq [r1+r3*1], mm0
  1604. movq mm2, mm1
  1605. psrlq mm0, 8
  1606. psllq mm2, 56
  1607. psrlq mm1, 8
  1608. por mm0, mm2
  1609. movq [r0+r3*2], mm0
  1610. psrlq mm0, 8
  1611. psllq mm1, 56
  1612. por mm0, mm1
  1613. movq [r0+r3*1], mm0
  1614. RET
%macro PRED8x8L_DOWN_RIGHT 1
; Diagonal down-right 8x8 luma prediction, SSE2/SSSE3 version: MMX edge
; filtering, then one 16-byte XMM lowpass over the combined left+top edge.
; Args: r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride.
cglobal pred8x8l_down_right_%1, 4,5
sub r0, r3 ; r0 -> row above the block
lea r4, [r0+r3*2]
; pack the 8 left-neighbour pixels into mm3, one byte per row
movq mm0, [r0+r3*1-8]
punpckhbw mm0, [r0+r3*0-8]
movq mm1, [r4+r3*1-8]
punpckhbw mm1, [r0+r3*2-8]
mov r4, r0
punpckhwd mm1, mm0
lea r0, [r0+r3*4]
movq mm2, [r0+r3*1-8]
punpckhbw mm2, [r0+r3*0-8]
lea r0, [r0+r3*2]
movq mm3, [r0+r3*1-8]
punpckhbw mm3, [r0+r3*0-8]
punpckhwd mm3, mm2
punpckhdq mm3, mm1
lea r0, [r0+r3*2]
movq mm0, [r0+r3*0-8]
movq mm1, [r4]
mov r0, r4 ; restore r0 = src - stride
movq mm4, mm3
movq mm2, mm3
PALIGNR mm4, mm0, 7, mm0
PALIGNR mm1, mm2, 1, mm2
test r1, r1
jz .fix_lt_1
jmp .do_left
.fix_lt_1: ; top-left unavailable: patch its slot in the column vector
movq mm5, mm3
pxor mm5, mm4
psrlq mm5, 56
psllq mm5, 48
pxor mm1, mm5
jmp .do_left
.fix_lt_2: ; top-left unavailable: patch its slot in the top-row vector
movq mm5, mm3
pxor mm5, mm2
psllq mm5, 56
psrlq mm5, 56
pxor mm2, mm5
test r2, r2
jnz .do_top
.fix_tr_1: ; top-right unavailable: replicate t7 into its slot
movq mm5, mm3
pxor mm5, mm1
psrlq mm5, 56
psllq mm5, 56
pxor mm1, mm5
jmp .do_top
.do_left:
; filter the left column; stash intermediate results in xmm3/xmm1
movq mm0, mm4
PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
movq mm4, mm0
movq mm7, mm2
movq2dq xmm3, mm2
PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
psllq mm1, 56
PALIGNR mm7, mm1, 7, mm3
movq2dq xmm1, mm7
; load and pre-shift the top row
movq mm0, [r0-8]
movq mm3, [r0]
movq mm1, [r0+8]
movq mm2, mm3
movq mm4, mm3
PALIGNR mm2, mm0, 7, mm0
PALIGNR mm1, mm4, 1, mm4
test r1, r1
jz .fix_lt_2
test r2, r2
jz .fix_tr_1
.do_top:
PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 ; filtered top row
; assemble the 16-byte left+top edge in xmm3 and its shifted copies,
; lowpass once, then emit rows by shifting two bytes per row pair
movq2dq xmm4, mm4
lea r1, [r0+r3*2]
movdqa xmm0, xmm3
pslldq xmm4, 8
por xmm3, xmm4
lea r2, [r1+r3*2]
pslldq xmm4, 1
por xmm1, xmm4
psrldq xmm0, 7
pslldq xmm0, 15
psrldq xmm0, 7
por xmm1, xmm0
lea r0, [r2+r3*2]
movdqa xmm2, xmm3
psrldq xmm2, 1
INIT_XMM
PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
movdqa xmm1, xmm0
psrldq xmm1, 1
movq [r0+r3*2], xmm0
movq [r0+r3*1], xmm1
psrldq xmm0, 2
psrldq xmm1, 2
movq [r2+r3*2], xmm0
movq [r2+r3*1], xmm1
psrldq xmm0, 2
psrldq xmm1, 2
movq [r1+r3*2], xmm0
movq [r1+r3*1], xmm1
psrldq xmm0, 2
psrldq xmm1, 2
movq [r4+r3*2], xmm0
movq [r4+r3*1], xmm1
RET
%endmacro
INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_DOWN_RIGHT sse2
INIT_MMX
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_DOWN_RIGHT ssse3
  1730. ;-----------------------------------------------------------------------------
  1731. ; void pred8x8l_vertical_right(uint8_t *src, int has_topleft, int has_topright, int stride)
  1732. ;-----------------------------------------------------------------------------
  1733. INIT_MMX
  1734. %define PALIGNR PALIGNR_MMX
  1735. cglobal pred8x8l_vertical_right_mmxext, 4,5
  1736. sub r0, r3
  1737. lea r4, [r0+r3*2]
  1738. movq mm0, [r0+r3*1-8]
  1739. punpckhbw mm0, [r0+r3*0-8]
  1740. movq mm1, [r4+r3*1-8]
  1741. punpckhbw mm1, [r0+r3*2-8]
  1742. mov r4, r0
  1743. punpckhwd mm1, mm0
  1744. lea r0, [r0+r3*4]
  1745. movq mm2, [r0+r3*1-8]
  1746. punpckhbw mm2, [r0+r3*0-8]
  1747. lea r0, [r0+r3*2]
  1748. movq mm3, [r0+r3*1-8]
  1749. punpckhbw mm3, [r0+r3*0-8]
  1750. punpckhwd mm3, mm2
  1751. punpckhdq mm3, mm1
  1752. lea r0, [r0+r3*2]
  1753. movq mm0, [r0+r3*0-8]
  1754. movq mm1, [r4]
  1755. mov r0, r4
  1756. movq mm4, mm3
  1757. movq mm2, mm3
  1758. PALIGNR mm4, mm0, 7, mm0
  1759. PALIGNR mm1, mm2, 1, mm2
  1760. test r1, r1
  1761. jz .fix_lt_1
  1762. jmp .do_left
  1763. .fix_lt_1:
  1764. movq mm5, mm3
  1765. pxor mm5, mm4
  1766. psrlq mm5, 56
  1767. psllq mm5, 48
  1768. pxor mm1, mm5
  1769. jmp .do_left
  1770. .fix_lt_2:
  1771. movq mm5, mm3
  1772. pxor mm5, mm2
  1773. psllq mm5, 56
  1774. psrlq mm5, 56
  1775. pxor mm2, mm5
  1776. test r2, r2
  1777. jnz .do_top
  1778. .fix_tr_1:
  1779. movq mm5, mm3
  1780. pxor mm5, mm1
  1781. psrlq mm5, 56
  1782. psllq mm5, 56
  1783. pxor mm1, mm5
  1784. jmp .do_top
  1785. .do_left:
  1786. movq mm0, mm4
  1787. PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
  1788. movq mm7, mm2
  1789. movq mm0, [r0-8]
  1790. movq mm3, [r0]
  1791. movq mm1, [r0+8]
  1792. movq mm2, mm3
  1793. movq mm4, mm3
  1794. PALIGNR mm2, mm0, 7, mm0
  1795. PALIGNR mm1, mm4, 1, mm4
  1796. test r1, r1
  1797. jz .fix_lt_2
  1798. test r2, r2
  1799. jz .fix_tr_1
  1800. .do_top
  1801. PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
  1802. lea r1, [r0+r3*2]
  1803. movq mm2, mm6
  1804. movq mm3, mm6
  1805. PALIGNR mm3, mm7, 7, mm0
  1806. PALIGNR mm6, mm7, 6, mm1
  1807. movq mm4, mm3
  1808. pavgb mm3, mm2
  1809. lea r2, [r1+r3*2]
  1810. PRED4x4_LOWPASS mm0, mm6, mm2, mm4, mm5
  1811. movq [r0+r3*1], mm3
  1812. movq [r0+r3*2], mm0
  1813. movq mm5, mm0
  1814. movq mm6, mm3
  1815. movq mm1, mm7
  1816. movq mm2, mm1
  1817. psllq mm2, 8
  1818. movq mm3, mm1
  1819. psllq mm3, 16
  1820. lea r4, [r2+r3*2]
  1821. PRED4x4_LOWPASS mm0, mm1, mm3, mm2, mm4
  1822. PALIGNR mm6, mm0, 7, mm2
  1823. movq [r1+r3*1], mm6
  1824. psllq mm0, 8
  1825. PALIGNR mm5, mm0, 7, mm1
  1826. movq [r1+r3*2], mm5
  1827. psllq mm0, 8
  1828. PALIGNR mm6, mm0, 7, mm2
  1829. movq [r2+r3*1], mm6
  1830. psllq mm0, 8
  1831. PALIGNR mm5, mm0, 7, mm1
  1832. movq [r2+r3*2], mm5
  1833. psllq mm0, 8
  1834. PALIGNR mm6, mm0, 7, mm2
  1835. movq [r4+r3*1], mm6
  1836. psllq mm0, 8
  1837. PALIGNR mm5, mm0, 7, mm1
  1838. movq [r4+r3*2], mm5
  1839. RET
  1840. %macro PRED8x8L_VERTICAL_RIGHT 1
  1841. cglobal pred8x8l_vertical_right_%1, 4,5,7
  1842. sub r0, r3
  1843. lea r4, [r0+r3*2]
  1844. movq mm0, [r0+r3*1-8]
  1845. punpckhbw mm0, [r0+r3*0-8]
  1846. movq mm1, [r4+r3*1-8]
  1847. punpckhbw mm1, [r0+r3*2-8]
  1848. mov r4, r0
  1849. punpckhwd mm1, mm0
  1850. lea r0, [r0+r3*4]
  1851. movq mm2, [r0+r3*1-8]
  1852. punpckhbw mm2, [r0+r3*0-8]
  1853. lea r0, [r0+r3*2]
  1854. movq mm3, [r0+r3*1-8]
  1855. punpckhbw mm3, [r0+r3*0-8]
  1856. punpckhwd mm3, mm2
  1857. punpckhdq mm3, mm1
  1858. lea r0, [r0+r3*2]
  1859. movq mm0, [r0+r3*0-8]
  1860. movq mm1, [r4]
  1861. mov r0, r4
  1862. movq mm4, mm3
  1863. movq mm2, mm3
  1864. PALIGNR mm4, mm0, 7, mm0
  1865. PALIGNR mm1, mm2, 1, mm2
  1866. test r1, r1
  1867. jnz .do_left
  1868. .fix_lt_1:
  1869. movq mm5, mm3
  1870. pxor mm5, mm4
  1871. psrlq mm5, 56
  1872. psllq mm5, 48
  1873. pxor mm1, mm5
  1874. jmp .do_left
  1875. .fix_lt_2:
  1876. movq mm5, mm3
  1877. pxor mm5, mm2
  1878. psllq mm5, 56
  1879. psrlq mm5, 56
  1880. pxor mm2, mm5
  1881. test r2, r2
  1882. jnz .do_top
  1883. .fix_tr_1:
  1884. movq mm5, mm3
  1885. pxor mm5, mm1
  1886. psrlq mm5, 56
  1887. psllq mm5, 56
  1888. pxor mm1, mm5
  1889. jmp .do_top
  1890. .do_left:
  1891. movq mm0, mm4
  1892. PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
  1893. movq2dq xmm0, mm2
  1894. movq mm0, [r0-8]
  1895. movq mm3, [r0]
  1896. movq mm1, [r0+8]
  1897. movq mm2, mm3
  1898. movq mm4, mm3
  1899. PALIGNR mm2, mm0, 7, mm0
  1900. PALIGNR mm1, mm4, 1, mm4
  1901. test r1, r1
  1902. jz .fix_lt_2
  1903. test r2, r2
  1904. jz .fix_tr_1
  1905. .do_top
  1906. PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
  1907. lea r1, [r0+r3*2]
  1908. movq2dq xmm4, mm6
  1909. pslldq xmm4, 8
  1910. por xmm0, xmm4
  1911. movdqa xmm6, [pw_ff00]
  1912. movdqa xmm1, xmm0
  1913. lea r2, [r1+r3*2]
  1914. movdqa xmm2, xmm0
  1915. movdqa xmm3, xmm0
  1916. pslldq xmm0, 1
  1917. pslldq xmm1, 2
  1918. pavgb xmm2, xmm0
  1919. INIT_XMM
  1920. PRED4x4_LOWPASS xmm4, xmm3, xmm1, xmm0, xmm5
  1921. pandn xmm6, xmm4
  1922. movdqa xmm5, xmm4
  1923. psrlw xmm4, 8
  1924. packuswb xmm6, xmm4
  1925. movhlps xmm4, xmm6
  1926. movhps [r0+r3*2], xmm5
  1927. movhps [r0+r3*1], xmm2
  1928. psrldq xmm5, 4
  1929. movss xmm5, xmm6
  1930. psrldq xmm2, 4
  1931. movss xmm2, xmm4
  1932. lea r0, [r2+r3*2]
  1933. psrldq xmm5, 1
  1934. psrldq xmm2, 1
  1935. movq [r0+r3*2], xmm5
  1936. movq [r0+r3*1], xmm2
  1937. psrldq xmm5, 1
  1938. psrldq xmm2, 1
  1939. movq [r2+r3*2], xmm5
  1940. movq [r2+r3*1], xmm2
  1941. psrldq xmm5, 1
  1942. psrldq xmm2, 1
  1943. movq [r1+r3*2], xmm5
  1944. movq [r1+r3*1], xmm2
  1945. RET
  1946. %endmacro
  1947. INIT_MMX
  1948. %define PALIGNR PALIGNR_MMX
  1949. PRED8x8L_VERTICAL_RIGHT sse2
  1950. INIT_MMX
  1951. %define PALIGNR PALIGNR_SSSE3
  1952. PRED8x8L_VERTICAL_RIGHT ssse3
  1953. ;-----------------------------------------------------------------------------
  1954. ;void pred8x8l_vertical_left(uint8_t *src, int has_topleft, int has_topright, int stride)
  1955. ;-----------------------------------------------------------------------------
%macro PRED8x8L_VERTICAL_LEFT 1
; Vertical-left 8x8 luma prediction, SSE2/SSSE3 version.
; Args: r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride.
cglobal pred8x8l_vertical_left_%1, 4,4
sub r0, r3 ; r0 -> row above the block
movq mm0, [r0-8]
movq mm3, [r0] ; top row
movq mm1, [r0+8] ; top-right neighbours
movq mm2, mm3
movq mm4, mm3
PALIGNR mm2, mm0, 7, mm0
PALIGNR mm1, mm4, 1, mm4
test r1, r1
jz .fix_lt_2
test r2, r2
jz .fix_tr_1
jmp .do_top
.fix_lt_2: ; top-left unavailable: patch with t0
movq mm5, mm3
pxor mm5, mm2
psllq mm5, 56
psrlq mm5, 56
pxor mm2, mm5
test r2, r2
jnz .do_top
.fix_tr_1: ; top-right unavailable: patch with t7
movq mm5, mm3
pxor mm5, mm1
psrlq mm5, 56
psllq mm5, 56
pxor mm1, mm5
jmp .do_top
.fix_tr_2: ; whole top-right block missing: replicate last top pixel
punpckhbw mm3, mm3
pshufw mm1, mm3, 0xFF
jmp .do_topright
.do_top:
PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 ; filtered top row
movq2dq xmm4, mm4
test r2, r2
jz .fix_tr_2
; filter the top-right 8 pixels as well
movq mm0, [r0+8]
movq mm5, mm0
movq mm2, mm0
movq mm4, mm0
psrlq mm5, 56
PALIGNR mm2, mm3, 7, mm3
PALIGNR mm5, mm4, 1, mm4
PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
; 16-byte top edge in xmm4; even rows use pavgb, odd rows the lowpass,
; both shifted one byte per row pair
movq2dq xmm3, mm1
lea r1, [r0+r3*2]
pslldq xmm3, 8
por xmm4, xmm3
movdqa xmm2, xmm4
movdqa xmm1, xmm4
movdqa xmm3, xmm4
psrldq xmm2, 1
pslldq xmm1, 1
pavgb xmm3, xmm2
lea r2, [r1+r3*2]
INIT_XMM
PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm4, xmm5
psrldq xmm0, 1
movq [r0+r3*1], xmm3
movq [r0+r3*2], xmm0
lea r0, [r2+r3*2]
psrldq xmm3, 1
psrldq xmm0, 1
movq [r1+r3*1], xmm3
movq [r1+r3*2], xmm0
psrldq xmm3, 1
psrldq xmm0, 1
movq [r2+r3*1], xmm3
movq [r2+r3*2], xmm0
psrldq xmm3, 1
psrldq xmm0, 1
movq [r0+r3*1], xmm3
movq [r0+r3*2], xmm0
RET
%endmacro
INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_VERTICAL_LEFT sse2
%define PALIGNR PALIGNR_SSSE3
INIT_MMX
PRED8x8L_VERTICAL_LEFT ssse3
  2041. ;-----------------------------------------------------------------------------
  2042. ; void pred8x8l_horizontal_up(uint8_t *src, int has_topleft, int has_topright, int stride)
  2043. ;-----------------------------------------------------------------------------
%macro PRED8x8L_HORIZONTAL_UP 1
; Horizontal-up 8x8 luma prediction.
; Args: r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride.
cglobal pred8x8l_horizontal_up_%1, 4,4
sub r0, r3 ; r0 -> row above the block
lea r2, [r0+r3*2]
; pack the 8 left-neighbour pixels into mm3, one byte per row
movq mm0, [r0+r3*1-8]
punpckhbw mm0, [r0+r3*0-8]
movq mm1, [r2+r3*1-8]
punpckhbw mm1, [r0+r3*2-8]
mov r2, r0
punpckhwd mm1, mm0
lea r0, [r0+r3*4]
movq mm2, [r0+r3*1-8]
punpckhbw mm2, [r0+r3*0-8]
lea r0, [r0+r3*2]
movq mm3, [r0+r3*1-8]
punpckhbw mm3, [r0+r3*0-8]
punpckhwd mm3, mm2
punpckhdq mm3, mm1
lea r0, [r0+r3*2]
movq mm0, [r0+r3*0-8]
movq mm1, [r2]
mov r0, r2 ; restore r0 = src - stride
movq mm4, mm3
movq mm2, mm3
PALIGNR mm4, mm0, 7, mm0
PALIGNR mm1, mm2, 1, mm2
test r1, r1
jnz .do_left
.fix_lt_1: ; top-left unavailable: patch its slot in the column vector
movq mm5, mm3
pxor mm5, mm4
psrlq mm5, 56
psllq mm5, 48
pxor mm1, mm5
.do_left:
; lowpass-filter the left column into mm7
movq mm0, mm4
PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
movq mm4, mm0
movq mm7, mm2
PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
psllq mm1, 56
PALIGNR mm7, mm1, 7, mm3
lea r1, [r0+r3*2]
; reverse the byte order of the filtered column, then build the two
; shifted copies needed by the interpolation
pshufw mm0, mm7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
psllq mm7, 56 ; l7 .. .. .. .. .. .. ..
movq mm2, mm0
psllw mm0, 8
psrlw mm2, 8
por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0
movq mm3, mm2
movq mm4, mm2
movq mm5, mm2
psrlq mm2, 8
psrlq mm3, 16
lea r2, [r1+r3*2]
por mm2, mm7 ; l7 l7 l6 l5 l4 l3 l2 l1
punpckhbw mm7, mm7
por mm3, mm7 ; l7 l7 l7 l6 l5 l4 l3 l2
pavgb mm4, mm2 ; averages between adjacent left pixels
PRED4x4_LOWPASS mm1, mm3, mm5, mm2, mm6 ; 3-tap interpolated values
; interleave averages and lowpass values -> the prediction sequence
movq mm5, mm4
punpcklbw mm4, mm1 ; p4 p3 p2 p1
punpckhbw mm5, mm1 ; p8 p7 p6 p5
; each subsequent row starts one pixel pair further into the sequence;
; the tail rows saturate at the replicated last pixel
movq mm6, mm5
movq mm7, mm5
movq mm0, mm5
PALIGNR mm5, mm4, 2, mm1
pshufw mm1, mm6, 11111001b
PALIGNR mm6, mm4, 4, mm2
pshufw mm2, mm7, 11111110b
PALIGNR mm7, mm4, 6, mm3
pshufw mm3, mm0, 11111111b
movq [r0+r3*1], mm4
movq [r0+r3*2], mm5
lea r0, [r2+r3*2]
movq [r1+r3*1], mm6
movq [r1+r3*2], mm7
movq [r2+r3*1], mm0
movq [r2+r3*2], mm1
movq [r0+r3*1], mm2
movq [r0+r3*2], mm3
RET
%endmacro
INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_HORIZONTAL_UP mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_HORIZONTAL_UP ssse3
  2132. ;-----------------------------------------------------------------------------
  2133. ;void pred8x8l_horizontal_down(uint8_t *src, int has_topleft, int has_topright, int stride)
  2134. ;-----------------------------------------------------------------------------
INIT_MMX
%define PALIGNR PALIGNR_MMX
; Horizontal-down 8x8 luma prediction, MMXEXT version.
; Args: r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride.
cglobal pred8x8l_horizontal_down_mmxext, 4,5
sub r0, r3 ; r0 -> row above the block
lea r4, [r0+r3*2]
; pack the 8 left-neighbour pixels into mm3, one byte per row
movq mm0, [r0+r3*1-8]
punpckhbw mm0, [r0+r3*0-8]
movq mm1, [r4+r3*1-8]
punpckhbw mm1, [r0+r3*2-8]
mov r4, r0
punpckhwd mm1, mm0
lea r0, [r0+r3*4]
movq mm2, [r0+r3*1-8]
punpckhbw mm2, [r0+r3*0-8]
lea r0, [r0+r3*2]
movq mm3, [r0+r3*1-8]
punpckhbw mm3, [r0+r3*0-8]
punpckhwd mm3, mm2
punpckhdq mm3, mm1
lea r0, [r0+r3*2]
movq mm0, [r0+r3*0-8]
movq mm1, [r4]
mov r0, r4 ; restore r0 = src - stride
movq mm4, mm3
movq mm2, mm3
PALIGNR mm4, mm0, 7, mm0
PALIGNR mm1, mm2, 1, mm2
test r1, r1
jnz .do_left
.fix_lt_1: ; top-left unavailable: patch its slot in the column vector
movq mm5, mm3
pxor mm5, mm4
psrlq mm5, 56
psllq mm5, 48
pxor mm1, mm5
jmp .do_left
.fix_lt_2: ; top-left unavailable: patch its slot in the top-row vector
movq mm5, mm3
pxor mm5, mm2
psllq mm5, 56
psrlq mm5, 56
pxor mm2, mm5
test r2, r2
jnz .do_top
.fix_tr_1: ; top-right unavailable: replicate t7 into its slot
movq mm5, mm3
pxor mm5, mm1
psrlq mm5, 56
psllq mm5, 56
pxor mm1, mm5
jmp .do_top
.do_left:
; lowpass-filter the left column; keep it in mm7/mm6
movq mm0, mm4
PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
movq mm4, mm0
movq mm7, mm2
movq mm6, mm2
PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
psllq mm1, 56
PALIGNR mm7, mm1, 7, mm3
; load and pre-shift the top row
movq mm0, [r0-8]
movq mm3, [r0]
movq mm1, [r0+8]
movq mm2, mm3
movq mm4, mm3
PALIGNR mm2, mm0, 7, mm0
PALIGNR mm1, mm4, 1, mm4
test r1, r1
jz .fix_lt_2
test r2, r2
jz .fix_tr_1
.do_top:
PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 ; filtered top row
movq mm5, mm4
lea r1, [r0+r3*2]
psllq mm7, 56
; combine left-column and top-row edges, compute the averaged and
; lowpassed value streams
movq mm2, mm5
movq mm3, mm6
movq mm4, mm2
PALIGNR mm2, mm6, 7, mm5
PALIGNR mm6, mm7, 7, mm0
lea r2, [r1+r3*2]
PALIGNR mm4, mm3, 1, mm7
movq mm5, mm3
pavgb mm3, mm6
PRED4x4_LOWPASS mm0, mm4, mm6, mm5, mm7
movq mm4, mm2
movq mm1, mm2
lea r4, [r2+r3*2]
psrlq mm4, 16
psrlq mm1, 8
PRED4x4_LOWPASS mm6, mm4, mm2, mm1, mm5
; interleave the two streams and emit rows bottom-up, stepping two
; bytes per row
movq mm7, mm3
punpcklbw mm3, mm0
punpckhbw mm7, mm0
movq mm1, mm7
movq mm0, mm7
movq mm4, mm7
movq [r4+r3*2], mm3
PALIGNR mm7, mm3, 2, mm5
movq [r4+r3*1], mm7
PALIGNR mm1, mm3, 4, mm5
movq [r2+r3*2], mm1
PALIGNR mm0, mm3, 6, mm3
movq [r2+r3*1], mm0
movq mm2, mm6
movq mm3, mm6
movq [r1+r3*2], mm4
PALIGNR mm6, mm4, 2, mm5
movq [r1+r3*1], mm6
PALIGNR mm2, mm4, 4, mm5
movq [r0+r3*2], mm2
PALIGNR mm3, mm4, 6, mm4
movq [r0+r3*1], mm3
RET
%macro PRED8x8L_HORIZONTAL_DOWN 1
; Horizontal-down 8x8 luma prediction, SSE2/SSSE3 version.
; Args: r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride.
cglobal pred8x8l_horizontal_down_%1, 4,5
sub r0, r3 ; r0 -> row above the block
lea r4, [r0+r3*2]
; pack the 8 left-neighbour pixels into mm3, one byte per row
movq mm0, [r0+r3*1-8]
punpckhbw mm0, [r0+r3*0-8]
movq mm1, [r4+r3*1-8]
punpckhbw mm1, [r0+r3*2-8]
mov r4, r0
punpckhwd mm1, mm0
lea r0, [r0+r3*4]
movq mm2, [r0+r3*1-8]
punpckhbw mm2, [r0+r3*0-8]
lea r0, [r0+r3*2]
movq mm3, [r0+r3*1-8]
punpckhbw mm3, [r0+r3*0-8]
punpckhwd mm3, mm2
punpckhdq mm3, mm1
lea r0, [r0+r3*2]
movq mm0, [r0+r3*0-8]
movq mm1, [r4]
mov r0, r4 ; restore r0 = src - stride
movq mm4, mm3
movq mm2, mm3
PALIGNR mm4, mm0, 7, mm0
PALIGNR mm1, mm2, 1, mm2
test r1, r1
jnz .do_left
.fix_lt_1: ; top-left unavailable: patch its slot in the column vector
movq mm5, mm3
pxor mm5, mm4
psrlq mm5, 56
psllq mm5, 48
pxor mm1, mm5
jmp .do_left
.fix_lt_2: ; top-left unavailable: patch its slot in the top-row vector
movq mm5, mm3
pxor mm5, mm2
psllq mm5, 56
psrlq mm5, 56
pxor mm2, mm5
test r2, r2
jnz .do_top
.fix_tr_1: ; top-right unavailable: replicate t7 into its slot
movq mm5, mm3
pxor mm5, mm1
psrlq mm5, 56
psllq mm5, 56
pxor mm1, mm5
jmp .do_top
.fix_tr_2: ; whole top-right block missing: replicate last top pixel
punpckhbw mm3, mm3
pshufw mm1, mm3, 0xFF
jmp .do_topright
.do_left:
; lowpass-filter the left column and assemble it into xmm0
movq mm0, mm4
PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
movq2dq xmm0, mm2
pslldq xmm0, 8
movq mm4, mm0
PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
movq2dq xmm2, mm1
pslldq xmm2, 15
psrldq xmm2, 8
por xmm0, xmm2
; load and pre-shift the top row
movq mm0, [r0-8]
movq mm3, [r0]
movq mm1, [r0+8]
movq mm2, mm3
movq mm4, mm3
PALIGNR mm2, mm0, 7, mm0
PALIGNR mm1, mm4, 1, mm4
test r1, r1
jz .fix_lt_2
test r2, r2
jz .fix_tr_1
.do_top:
PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 ; filtered top row
movq2dq xmm1, mm4
test r2, r2
jz .fix_tr_2
; filter the top-right 8 pixels as well
movq mm0, [r0+8]
movq mm5, mm0
movq mm2, mm0
movq mm4, mm0
psrlq mm5, 56
PALIGNR mm2, mm3, 7, mm3
PALIGNR mm5, mm4, 1, mm4
PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
; full 16-byte edge in xmm1:xmm0; one XMM lowpass + pavgb, interleave,
; then emit rows bottom-up stepping two bytes per row
movq2dq xmm5, mm1
pslldq xmm5, 8
por xmm1, xmm5
INIT_XMM
lea r2, [r4+r3*2]
movdqa xmm2, xmm1
movdqa xmm3, xmm1
PALIGNR xmm1, xmm0, 7, xmm4
PALIGNR xmm2, xmm0, 9, xmm5
lea r1, [r2+r3*2]
PALIGNR xmm3, xmm0, 8, xmm0
movdqa xmm4, xmm1
pavgb xmm4, xmm3
lea r0, [r1+r3*2]
PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm5
punpcklbw xmm4, xmm0
movhlps xmm0, xmm4
movq [r0+r3*2], xmm4
movq [r2+r3*2], xmm0
psrldq xmm4, 2
psrldq xmm0, 2
movq [r0+r3*1], xmm4
movq [r2+r3*1], xmm0
psrldq xmm4, 2
psrldq xmm0, 2
movq [r1+r3*2], xmm4
movq [r4+r3*2], xmm0
psrldq xmm4, 2
psrldq xmm0, 2
movq [r1+r3*1], xmm4
movq [r4+r3*1], xmm0
RET
%endmacro
INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_HORIZONTAL_DOWN sse2
INIT_MMX
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_HORIZONTAL_DOWN ssse3
  2379. %endif
  2380. ;-----------------------------------------------------------------------------
  2381. ; void pred4x4_dc_mmxext(uint8_t *src, const uint8_t *topright, int stride)
  2382. ;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
; 4x4 DC prediction: fill the block with the rounded mean of the 4 pixels
; above and the 4 pixels to the left.
; In:  r0 = src, r1 = topright (clobbered as byte-load scratch), r2 = stride
; r4 preserves the original src pointer for the first-row store; r3d
; accumulates the 8-neighbour sum.
;-----------------------------------------------------------------------------
  2383. cglobal pred4x4_dc_mmxext, 3,5
; mm7 = 0, the psadbw reference operand
  2384. pxor mm7, mm7
  2385. mov r4, r0
; point r0 one row above the block
  2386. sub r0, r2
  2387. movd mm0, [r0]
; SAD against zero = sum of the 4 top bytes
  2388. psadbw mm0, mm7
; add the 4 left-column bytes (offset -1 of each row) one by one
  2389. movzx r1d, byte [r0+r2*1-1]
  2390. movd r3d, mm0
  2391. add r3d, r1d
  2392. movzx r1d, byte [r0+r2*2-1]
  2393. lea r0, [r0+r2*2]
  2394. add r3d, r1d
  2395. movzx r1d, byte [r0+r2*1-1]
  2396. add r3d, r1d
  2397. movzx r1d, byte [r0+r2*2-1]
  2398. add r3d, r1d
; (sum + 4) >> 3 = rounded average of the 8 neighbours
  2399. add r3d, 4
  2400. shr r3d, 3
; splat the DC byte into all 4 bytes of a dword
  2401. imul r3d, 0x01010101
; store one dword per row: r4 = row 0, r0 now = src + stride (row 1)
  2402. mov [r4+r2*0], r3d
  2403. mov [r0+r2*0], r3d
  2404. mov [r0+r2*1], r3d
  2405. mov [r0+r2*2], r3d
  2406. RET
  2407. ;-----------------------------------------------------------------------------
  2408. ; void pred4x4_tm_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
  2409. ;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
; VP8 "TrueMotion" 4x4 prediction: pred[y][x] = clip(top[x] + left[y] - tl),
; with the clip provided by packuswb's unsigned saturation.
; %1 = cpu suffix: "mmx" broadcasts via punpck pairs, anything else (mmxext)
; uses the shorter pshufw. In: r0 = src, r2 = stride; r1/r3 are scratch,
; so the topright argument is clobbered. Two rows are emitted per iteration.
;-----------------------------------------------------------------------------
  2410. %macro PRED4x4_TM_MMX 1
  2411. cglobal pred4x4_tm_vp8_%1, 3,6
  2412. sub r0, r2
  2413. pxor mm7, mm7
; mm0 = the 4 top pixels widened to 16-bit words
  2414. movd mm0, [r0]
  2415. punpcklbw mm0, mm7
; r4d = top-left pixel, constant across the whole block
  2416. movzx r4d, byte [r0-1]
; r5d = loop counter: 2 iterations x 2 rows = 4 rows
  2417. mov r5d, 2
  2418. .loop:
; left[y] - topleft for the two rows of this iteration
  2419. movzx r1d, byte [r0+r2*1-1]
  2420. movzx r3d, byte [r0+r2*2-1]
  2421. sub r1d, r4d
  2422. sub r3d, r4d
  2423. movd mm2, r1d
  2424. movd mm4, r3d
; broadcast the (left - topleft) word to all 4 lanes
  2425. %ifidn %1, mmx
  2426. punpcklwd mm2, mm2
  2427. punpcklwd mm4, mm4
  2428. punpckldq mm2, mm2
  2429. punpckldq mm4, mm4
  2430. %else
  2431. pshufw mm2, mm2, 0
  2432. pshufw mm4, mm4, 0
  2433. %endif
; add the top row; packuswb saturates to [0,255]
  2434. paddw mm2, mm0
  2435. paddw mm4, mm0
  2436. packuswb mm2, mm2
  2437. packuswb mm4, mm4
  2438. movd [r0+r2*1], mm2
  2439. movd [r0+r2*2], mm4
  2440. lea r0, [r0+r2*2]
  2441. dec r5d
  2442. jg .loop
  2443. REP_RET
  2444. %endmacro
  2445. PRED4x4_TM_MMX mmx
  2446. PRED4x4_TM_MMX mmxext
;-----------------------------------------------------------------------------
; SSSE3 TrueMotion 4x4: same contract as PRED4x4_TM_MMX above, but fully
; unrolled — all four rows are computed in parallel with no loop.
; tm_shuf (see rodata: times 8 db 0x03, 0x80) makes pshufb broadcast byte 3
; of the source dword into the low byte of every word while zeroing the high
; byte, i.e. it widens-and-splats the pixel at offset -1 in one instruction.
; In: r0 = src, r2 = stride; r1 (topright) is reused as a row pointer.
;-----------------------------------------------------------------------------
  2447. cglobal pred4x4_tm_vp8_ssse3, 3,3
  2448. sub r0, r2
  2449. movq mm6, [tm_shuf]
  2450. pxor mm1, mm1
; mm0 = top row widened to words
  2451. movd mm0, [r0]
  2452. punpcklbw mm0, mm1
; mm7 = topleft pixel (byte 3 of [r0-4]) splatted as words
  2453. movd mm7, [r0-4]
  2454. pshufb mm7, mm6
  2455. lea r1, [r0+r2*2]
; mm2..mm5 = left pixel of rows 0..3, each splatted via tm_shuf
  2456. movd mm2, [r0+r2*1-4]
  2457. movd mm3, [r0+r2*2-4]
  2458. movd mm4, [r1+r2*1-4]
  2459. movd mm5, [r1+r2*2-4]
  2460. pshufb mm2, mm6
  2461. pshufb mm3, mm6
  2462. pshufb mm4, mm6
  2463. pshufb mm5, mm6
; left - topleft ...
  2464. psubw mm2, mm7
  2465. psubw mm3, mm7
  2466. psubw mm4, mm7
  2467. psubw mm5, mm7
; ... + top, then saturate to bytes
  2468. paddw mm2, mm0
  2469. paddw mm3, mm0
  2470. paddw mm4, mm0
  2471. paddw mm5, mm0
  2472. packuswb mm2, mm2
  2473. packuswb mm3, mm3
  2474. packuswb mm4, mm4
  2475. packuswb mm5, mm5
  2476. movd [r0+r2*1], mm2
  2477. movd [r0+r2*2], mm3
  2478. movd [r1+r2*1], mm4
  2479. movd [r1+r2*2], mm5
  2480. RET
  2481. ;-----------------------------------------------------------------------------
  2482. ; void pred4x4_vertical_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
  2483. ;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
; VP8 vertical 4x4 prediction: smooth the top edge with the 3-tap
; PRED4x4_LOWPASS filter (macro defined earlier in this file) over
; (topleft, top, top-shifted) and replicate the result into all 4 rows.
; In: r0 = src, r1 = topright, r2 = stride.
;-----------------------------------------------------------------------------
  2484. INIT_MMX
  2485. cglobal pred4x4_vertical_vp8_mmxext, 3,3
  2486. sub r0, r2
; m1 starts at src-stride-1, so its low byte is the topleft pixel
  2487. movd m1, [r0-1]
  2488. movd m0, [r0]
  2489. mova m2, m0 ;t0 t1 t2 t3
  2490. punpckldq m0, [r1] ;t0 t1 t2 t3 t4 t5 t6 t7
  2491. lea r1, [r0+r2*2]
  2492. psrlq m0, 8 ;t1 t2 t3 t4
; m3 = lowpass(m1, m2, m0) — one filtered top row
  2493. PRED4x4_LOWPASS m3, m1, m0, m2, m4
; same dword stored to every row
  2494. movd [r0+r2*1], m3
  2495. movd [r0+r2*2], m3
  2496. movd [r1+r2*1], m3
  2497. movd [r1+r2*2], m3
  2498. RET
  2499. ;-----------------------------------------------------------------------------
  2500. ; void pred4x4_down_left_mmxext(uint8_t *src, const uint8_t *topright, int stride)
  2501. ;-----------------------------------------------------------------------------
  2502. %ifdef CONFIG_GPL
;-----------------------------------------------------------------------------
; H.264 4x4 diagonal-down-left prediction (built under CONFIG_GPL, see the
; %ifdef just above). Loads the 8 top/topright pixels and lowpass-filters
; them; each successive output row is the filtered vector shifted one byte.
; In: r0 = src, r1 = topright, r2 = stride.
;-----------------------------------------------------------------------------
  2503. INIT_MMX
  2504. cglobal pred4x4_down_left_mmxext, 3,3
  2505. sub r0, r2
; m1 = t0..t3 from above, t4..t7 from topright
  2506. movq m1, [r0]
  2507. punpckldq m1, [r1]
  2508. movq m2, m1
  2509. movq m3, m1
  2510. movq m4, m1
; xor/shift trick: m1 becomes [0,t0..t6] (left-neighbour terms) and m3
; becomes [t1..t7,t7] — a right shift with the last topright pixel
; replicated — without any extra load or shuffle.
  2511. psllq m1, 8
  2512. pxor m2, m1
  2513. psrlq m2, 8
  2514. pxor m3, m2
; m0 = lowpass(m1, m4, m3); lane 0 is junk (m1 lane 0 is 0) and is
; discarded by the first psrlq below
  2515. PRED4x4_LOWPASS m0, m1, m3, m4, m5
  2516. lea r1, [r0+r2*2]
  2517. psrlq m0, 8
  2518. movd [r0+r2*1], m0
  2519. psrlq m0, 8
  2520. movd [r0+r2*2], m0
  2521. psrlq m0, 8
  2522. movd [r1+r2*1], m0
  2523. psrlq m0, 8
  2524. movd [r1+r2*2], m0
  2525. RET
  2526. ;-----------------------------------------------------------------------------
  2527. ; void pred4x4_vertical_left_mmxext(uint8_t *src, const uint8_t *topright, int stride)
  2528. ;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
; H.264 4x4 vertical-left prediction. From the 8 top/topright pixels it
; derives two interleaved row pairs: m4 = pavgb(t, t>>8) for even rows and
; m0 = lowpass(t, t>>8, t>>16) for odd rows, each pair shifted one byte for
; its second use. In: r0 = src, r1 = topright, r2 = stride.
;-----------------------------------------------------------------------------
  2529. INIT_MMX
  2530. cglobal pred4x4_vertical_left_mmxext, 3,3
  2531. sub r0, r2
; m1 = t0..t7 (top ++ topright)
  2532. movq m1, [r0]
  2533. punpckldq m1, [r1]
  2534. movq m3, m1
  2535. movq m2, m1
; m3 = t shifted by 1 pixel, m2 = t shifted by 2 pixels
  2536. psrlq m3, 8
  2537. psrlq m2, 16
; m4 = rounded average of adjacent top pixels -> rows 0 and 2
  2538. movq m4, m3
  2539. pavgb m4, m1
; m0 = 3-tap lowpass -> rows 1 and 3
  2540. PRED4x4_LOWPASS m0, m1, m2, m3, m5
  2541. lea r1, [r0+r2*2]
  2542. movh [r0+r2*1], m4
  2543. movh [r0+r2*2], m0
  2544. psrlq m4, 8
  2545. psrlq m0, 8
  2546. movh [r1+r2*1], m4
  2547. movh [r1+r2*2], m0
  2548. RET
  2549. ;-----------------------------------------------------------------------------
  2550. ; void pred4x4_horizontal_up_mmxext(uint8_t *src, const uint8_t *topright, int stride)
  2551. ;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
; H.264 4x4 horizontal-up prediction. Gathers the 4 left-column pixels
; l0..l3 into a register, pads the upper half by replicating l3 (horizontal-up
; extends past the last left pixel), then interleaves pavgb and lowpass
; results to form the output rows. In: r0 = src, r1 = topright (reused as a
; row pointer), r2 = stride.
;-----------------------------------------------------------------------------
  2552. INIT_MMX
  2553. cglobal pred4x4_horizontal_up_mmxext, 3,3
  2554. sub r0, r2
  2555. lea r1, [r0+r2*2]
; interleave the four left-column bytes so punpckhwd leaves l0 l1 l2 l3
; in the high half of m0
  2556. movd m0, [r0+r2*1-4]
  2557. punpcklbw m0, [r0+r2*2-4]
  2558. movd m1, [r1+r2*1-4]
  2559. punpcklbw m1, [r1+r2*2-4]
  2560. punpckhwd m0, m1
; m1 = l3 broadcast to all 8 bytes; also reused verbatim as the last
; output row (all-l3) at the end
  2561. movq m1, m0
  2562. punpckhbw m1, m1
  2563. pshufw m1, m1, 0xFF
; m0 = l0 l1 l2 l3 l3 l3 l3 l3
  2564. punpckhdq m0, m1
  2565. movq m2, m0
  2566. movq m3, m0
  2567. movq m7, m0
; m7 = pavgb pairs, m4 = lowpass triples over the padded left column
  2568. psrlq m2, 16
  2569. psrlq m3, 8
  2570. pavgb m7, m3
  2571. PRED4x4_LOWPASS m4, m0, m2, m3, m5
; interleave avg/lowpass bytes; each row is the previous shifted by 2
  2572. punpcklbw m7, m4
  2573. movd [r0+r2*1], m7
  2574. psrlq m7, 16
  2575. movd [r0+r2*2], m7
  2576. psrlq m7, 16
  2577. movd [r1+r2*1], m7
  2578. movd [r1+r2*2], m1
  2579. RET
  2580. ;-----------------------------------------------------------------------------
  2581. ; void pred4x4_horizontal_down_mmxext(uint8_t *src, const uint8_t *topright, int stride)
  2582. ;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
; H.264 4x4 horizontal-down prediction. Packs the edge into one register as
; t2 t1 t0 lt l0 l1 l2 l3 (comments on each line show register contents,
; most-significant byte first), then combines pavgb and lowpass results;
; rows are emitted bottom-up, each shifted 2 bytes from the previous.
; In: r0 = src, r1 = topright (reused as a row pointer), r2 = stride.
; Uses the MMX PALIGNR emulation (%define above).
;-----------------------------------------------------------------------------
  2583. INIT_MMX
  2584. %define PALIGNR PALIGNR_MMX
  2585. cglobal pred4x4_horizontal_down_mmxext, 3,3
  2586. sub r0, r2
  2587. lea r1, [r0+r2*2]
  2588. movh m0, [r0-4] ; lt ..
  2589. punpckldq m0, [r0] ; t3 t2 t1 t0 lt .. .. ..
  2590. psllq m0, 8 ; t2 t1 t0 lt .. .. .. ..
  2591. movd m1, [r1+r2*2-4] ; l3
  2592. punpcklbw m1, [r1+r2*1-4] ; l2 l3
  2593. movd m2, [r0+r2*2-4] ; l1
  2594. punpcklbw m2, [r0+r2*1-4] ; l0 l1
  2595. punpckhwd m1, m2 ; l0 l1 l2 l3
  2596. punpckhdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
  2597. movq m0, m1
  2598. movq m2, m1
  2599. movq m5, m1
  2600. psrlq m0, 16 ; .. .. t2 t1 t0 lt l0 l1
  2601. psrlq m2, 8 ; .. t2 t1 t0 lt l0 l1 l2
; m5 = averages of adjacent edge pixels, m3 = lowpass of triples
  2602. pavgb m5, m2
  2603. PRED4x4_LOWPASS m3, m1, m0, m2, m4
; interleave avg/lowpass bytes; m3's top-edge part is spliced in for row 0
  2604. punpcklbw m5, m3
  2605. psrlq m3, 32
  2606. PALIGNR m3, m5, 6, m4
; store bottom row first, then shift 2 bytes per row upwards
  2607. movh [r1+r2*2], m5
  2608. psrlq m5, 16
  2609. movh [r1+r2*1], m5
  2610. psrlq m5, 16
  2611. movh [r0+r2*2], m5
  2612. movh [r0+r2*1], m3
  2613. RET
  2614. ;-----------------------------------------------------------------------------
  2615. ; void pred4x4_vertical_right_mmxext(uint8_t *src, const uint8_t *topright, int stride)
  2616. ;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
; H.264 4x4 vertical-right prediction. PALIGNR progressively slides the
; left-column pixels in under the top row (register contents shown in the
; per-line comments, MSB first). Even rows come from pavgb of the top edge,
; odd rows from the lowpass result; lower rows are the upper rows shifted
; right with filtered left pixels inserted at the front via psllq+PALIGNR.
; In: r0 = src, r1 = topright (reused as a row pointer), r2 = stride.
;-----------------------------------------------------------------------------
  2617. INIT_MMX
  2618. %define PALIGNR PALIGNR_MMX
  2619. cglobal pred4x4_vertical_right_mmxext, 3,3
  2620. sub r0, r2
  2621. lea r1, [r0+r2*2]
  2622. movh m0, [r0] ; ........t3t2t1t0
  2623. movq m5, m0
; m5 = pavgb(top, top-with-lt) -> rows 0 and 2 (row 2 shifted later)
  2624. PALIGNR m0, [r0-8], 7, m1 ; ......t3t2t1t0lt
  2625. pavgb m5, m0
  2626. PALIGNR m0, [r0+r2*1-8], 7, m1 ; ....t3t2t1t0ltl0
  2627. movq m1, m0
  2628. PALIGNR m0, [r0+r2*2-8], 7, m2 ; ..t3t2t1t0ltl0l1
  2629. movq m2, m0
  2630. PALIGNR m0, [r1+r2*1-8], 7, m3 ; t3t2t1t0ltl0l1l2
; m3 = lowpass over the full edge -> rows 1 and 3
  2631. PRED4x4_LOWPASS m3, m1, m0, m2, m4
  2632. movq m1, m3
; m3 >>= 16 for the row-1 store; m1 keeps the filtered left pixels in its
; top bytes so PALIGNR can prepend them to the lower rows
  2633. psrlq m3, 16
  2634. psllq m1, 48
  2635. movh [r0+r2*1], m5
  2636. movh [r0+r2*2], m3
  2637. PALIGNR m5, m1, 7, m2
  2638. psllq m1, 8
  2639. movh [r1+r2*1], m5
  2640. PALIGNR m3, m1, 7, m1
  2641. movh [r1+r2*2], m3
  2642. RET
  2643. ;-----------------------------------------------------------------------------
  2644. ; void pred4x4_down_right_mmxext(uint8_t *src, const uint8_t *topright, int stride)
  2645. ;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
; H.264 4x4 diagonal-down-right prediction. Assembles the edge
; (left column, topleft, top row) into one register via punpck+PALIGNR,
; lowpass-filters it, and emits rows bottom-up, each row being the previous
; one shifted by a byte. In: r0 = src, r1 = topright (reused as a row
; pointer), r2 = stride. Uses the MMX PALIGNR emulation (%define above).
;-----------------------------------------------------------------------------
  2646. INIT_MMX
  2647. %define PALIGNR PALIGNR_MMX
  2648. cglobal pred4x4_down_right_mmxext, 3,3
  2649. sub r0, r2
  2650. lea r1, [r0+r2*2]
; gather l1, l0, lt into the high bytes of m1 ...
  2651. movq m1, [r1-8]
  2652. movq m2, [r0+r2*1-8]
  2653. punpckhbw m2, [r0-8]
  2654. movh m3, [r0]
  2655. punpckhwd m1, m2
; ... then append the top row: m3 = top ++ (lt l0 l1)
  2656. PALIGNR m3, m1, 5, m1
  2657. movq m1, m3
; slide in l2 and l3 to form the shifted neighbour vectors m2 and m3
  2658. PALIGNR m3, [r1+r2*1-8], 7, m4
  2659. movq m2, m3
  2660. PALIGNR m3, [r1+r2*2-8], 7, m4
; m0 = lowpass(m3, m2, m1) over the whole edge
  2661. PRED4x4_LOWPASS m0, m3, m1, m2, m4
; bottom row first; shift one byte per row going up
  2662. movh [r1+r2*2], m0
  2663. psrlq m0, 8
  2664. movh [r1+r2*1], m0
  2665. psrlq m0, 8
  2666. movh [r0+r2*2], m0
  2667. psrlq m0, 8
  2668. movh [r0+r2*1], m0
  2669. RET
  2670. %endif