;******************************************************************************
;* H.264 intra prediction asm optimizations
;* Copyright (c) 2010 Jason Garrett-Glaser
;* Copyright (c) 2010 Holger Lubitz
;* Copyright (c) 2010 Loren Merritt
;* Copyright (c) 2010 Ronald S. Bultje
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

tm_shuf: times 8 db 0x03, 0x80
pw_ff00: times 8 dw 0xff00
plane_shuf: db -8, -7, -6, -5, -4, -3, -2, -1
            db  1,  2,  3,  4,  5,  6,  7,  8
plane8_shuf: db -4, -3, -2, -1, 0, 0, 0, 0
             db  1,  2,  3,  4, 0, 0, 0, 0
pw_0to7: dw 0, 1, 2, 3, 4, 5, 6, 7
pw_1to8: dw 1, 2, 3, 4, 5, 6, 7, 8
pw_m8tom1: dw -8, -7, -6, -5, -4, -3, -2, -1
pw_m4to4: dw -4, -3, -2, -1, 1, 2, 3, 4

SECTION .text

cextern pb_1
cextern pb_3
cextern pw_4
cextern pw_5
cextern pw_8
cextern pw_16
cextern pw_17
cextern pw_32

;-----------------------------------------------------------------------------
; void pred16x16_vertical(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
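; Copies the row of 16 pixels above the block into all 16 rows.
; Roughly, in C terms (illustrative sketch only):
;   for (y = 0; y < 16; y++)
;       memcpy(src + y*stride, src - stride, 16);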
cglobal pred16x16_vertical_mmx, 2,3
    sub r0, r1
    mov r2, 8
    movq mm0, [r0+0]
    movq mm1, [r0+8]
.loop:
    movq [r0+r1*1+0], mm0
    movq [r0+r1*1+8], mm1
    movq [r0+r1*2+0], mm0
    movq [r0+r1*2+8], mm1
    lea r0, [r0+r1*2]
    dec r2
    jg .loop
    REP_RET

cglobal pred16x16_vertical_sse, 2,3
    sub r0, r1
    mov r2, 4
    movaps xmm0, [r0]
.loop:
    movaps [r0+r1*1], xmm0
    movaps [r0+r1*2], xmm0
    lea r0, [r0+r1*2]
    movaps [r0+r1*1], xmm0
    movaps [r0+r1*2], xmm0
    lea r0, [r0+r1*2]
    dec r2
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void pred16x16_horizontal(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
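; Fills each row with the pixel immediately to its left; the left byte is
; splatted across the row with pshufb (ssse3), pshufw (mmxext) or a
; punpck cascade (mmx). Roughly:
;   for (y = 0; y < 16; y++)
;       memset(src + y*stride, src[y*stride - 1], 16);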
%macro PRED16x16_H 1
cglobal pred16x16_horizontal_%1, 2,3
    mov r2, 8
%ifidn %1, ssse3
    mova m2, [pb_3]
%endif
.loop:
    movd m0, [r0+r1*0-4]
    movd m1, [r0+r1*1-4]
%ifidn %1, ssse3
    pshufb m0, m2
    pshufb m1, m2
%else
    punpcklbw m0, m0
    punpcklbw m1, m1
%ifidn %1, mmxext
    pshufw m0, m0, 0xff
    pshufw m1, m1, 0xff
%else
    punpckhwd m0, m0
    punpckhwd m1, m1
    punpckhdq m0, m0
    punpckhdq m1, m1
%endif
    mova [r0+r1*0+8], m0
    mova [r0+r1*1+8], m1
%endif
    mova [r0+r1*0], m0
    mova [r0+r1*1], m1
    lea r0, [r0+r1*2]
    dec r2
    jg .loop
    REP_RET
%endmacro

INIT_MMX
PRED16x16_H mmx
PRED16x16_H mmxext
INIT_XMM
PRED16x16_H ssse3

;-----------------------------------------------------------------------------
; void pred16x16_dc(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
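; DC prediction: one rounded average over the 16 top and 16 left
; neighbours. psadbw against zero sums the top row; the left column is
; accumulated with scalar movzx/add. The value splatted over the block is
; roughly:
;   dc = (sum(top[0..15]) + sum(left[0..15]) + 16) >> 5;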
%macro PRED16x16_DC 1
cglobal pred16x16_dc_%1, 2,7
    mov r4, r0
    sub r0, r1
    pxor mm0, mm0
    pxor mm1, mm1
    psadbw mm0, [r0+0]
    psadbw mm1, [r0+8]
    dec r0
    movzx r5d, byte [r0+r1*1]
    paddw mm0, mm1
    movd r6d, mm0
    lea r0, [r0+r1*2]
%rep 7
    movzx r2d, byte [r0+r1*0]
    movzx r3d, byte [r0+r1*1]
    add r5d, r2d
    add r6d, r3d
    lea r0, [r0+r1*2]
%endrep
    movzx r2d, byte [r0+r1*0]
    add r5d, r6d
    lea r2d, [r2+r5+16]
    shr r2d, 5
%ifidn %1, mmxext
    movd m0, r2d
    punpcklbw m0, m0
    pshufw m0, m0, 0
%elifidn %1, sse2
    movd m0, r2d
    punpcklbw m0, m0
    pshuflw m0, m0, 0
    punpcklqdq m0, m0
%elifidn %1, ssse3
    pxor m1, m1
    movd m0, r2d
    pshufb m0, m1
%endif
%if mmsize == 8
    mov r3d, 8
.loop:
    mova [r4+r1*0+0], m0
    mova [r4+r1*0+8], m0
    mova [r4+r1*1+0], m0
    mova [r4+r1*1+8], m0
%else
    mov r3d, 4
.loop:
    mova [r4+r1*0], m0
    mova [r4+r1*1], m0
    lea r4, [r4+r1*2]
    mova [r4+r1*0], m0
    mova [r4+r1*1], m0
%endif
    lea r4, [r4+r1*2]
    dec r3d
    jg .loop
    REP_RET
%endmacro

INIT_MMX
PRED16x16_DC mmxext
INIT_XMM
PRED16x16_DC sse2
PRED16x16_DC ssse3

;-----------------------------------------------------------------------------
; void pred16x16_tm_vp8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
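; VP8 "TrueMotion" prediction. Per pixel, roughly:
;   dst[y][x] = clip_uint8(top[x] + left[y] - topleft);
; The top row is kept unpacked to words in registers and the per-row
; scalar delta (left[y] - topleft) is splatted and added, which is
; algebraically the same thing.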
%macro PRED16x16_TM_MMX 1
cglobal pred16x16_tm_vp8_%1, 2,5
    sub r0, r1
    pxor mm7, mm7
    movq mm0, [r0+0]
    movq mm2, [r0+8]
    movq mm1, mm0
    movq mm3, mm2
    punpcklbw mm0, mm7
    punpckhbw mm1, mm7
    punpcklbw mm2, mm7
    punpckhbw mm3, mm7
    movzx r3d, byte [r0-1]
    mov r4d, 16
.loop:
    movzx r2d, byte [r0+r1-1]
    sub r2d, r3d
    movd mm4, r2d
%ifidn %1, mmx
    punpcklwd mm4, mm4
    punpckldq mm4, mm4
%else
    pshufw mm4, mm4, 0
%endif
    movq mm5, mm4
    movq mm6, mm4
    movq mm7, mm4
    paddw mm4, mm0
    paddw mm5, mm1
    paddw mm6, mm2
    paddw mm7, mm3
    packuswb mm4, mm5
    packuswb mm6, mm7
    movq [r0+r1+0], mm4
    movq [r0+r1+8], mm6
    add r0, r1
    dec r4d
    jg .loop
    REP_RET
%endmacro

PRED16x16_TM_MMX mmx
PRED16x16_TM_MMX mmxext

cglobal pred16x16_tm_vp8_sse2, 2,6,6
    sub r0, r1
    pxor xmm2, xmm2
    movdqa xmm0, [r0]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm2
    punpckhbw xmm1, xmm2
    movzx r4d, byte [r0-1]
    mov r5d, 8
.loop:
    movzx r2d, byte [r0+r1*1-1]
    movzx r3d, byte [r0+r1*2-1]
    sub r2d, r4d
    sub r3d, r4d
    movd xmm2, r2d
    movd xmm4, r3d
    pshuflw xmm2, xmm2, 0
    pshuflw xmm4, xmm4, 0
    punpcklqdq xmm2, xmm2
    punpcklqdq xmm4, xmm4
    movdqa xmm3, xmm2
    movdqa xmm5, xmm4
    paddw xmm2, xmm0
    paddw xmm3, xmm1
    paddw xmm4, xmm0
    paddw xmm5, xmm1
    packuswb xmm2, xmm3
    packuswb xmm4, xmm5
    movdqa [r0+r1*1], xmm2
    movdqa [r0+r1*2], xmm4
    lea r0, [r0+r1*2]
    dec r5d
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void pred16x16_plane(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
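; Plane (linear gradient) prediction: H and V are weighted sums of
; differences across the top row and left column, from which per-pixel
; values are built, roughly as in the spec:
;   b = (5*H + 32) >> 6;  c = (5*V + 32) >> 6;     (H.264 rounding)
;   dst[y][x] = clip_uint8((a + b*(x-7) + c*(y-7) + 16) >> 5);
; RV40 and SVQ3 use the same scheme with their own rounding of H and V.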
%macro H264_PRED16x16_PLANE 3
cglobal pred16x16_plane_%3_%1, 2, 7, %2
    mov r2, r1 ; +stride
    neg r1 ; -stride
    movh m0, [r0+r1 -1]
%if mmsize == 8
    pxor m4, m4
    movh m1, [r0+r1 +3 ]
    movh m2, [r0+r1 +8 ]
    movh m3, [r0+r1 +12]
    punpcklbw m0, m4
    punpcklbw m1, m4
    punpcklbw m2, m4
    punpcklbw m3, m4
    pmullw m0, [pw_m8tom1 ]
    pmullw m1, [pw_m8tom1+8]
    pmullw m2, [pw_1to8 ]
    pmullw m3, [pw_1to8 +8]
    paddw m0, m2
    paddw m1, m3
%else ; mmsize == 16
%ifidn %1, sse2
    pxor m2, m2
    movh m1, [r0+r1 +8]
    punpcklbw m0, m2
    punpcklbw m1, m2
    pmullw m0, [pw_m8tom1]
    pmullw m1, [pw_1to8]
    paddw m0, m1
%else ; ssse3
    movhps m0, [r0+r1 +8]
    pmaddubsw m0, [plane_shuf] ; H coefficients
%endif
    movhlps m1, m0
%endif
    paddw m0, m1
%ifidn %1, mmx
    mova m1, m0
    psrlq m1, 32
%elifidn %1, mmx2
    pshufw m1, m0, 0xE
%else ; mmsize == 16
    pshuflw m1, m0, 0xE
%endif
    paddw m0, m1
%ifidn %1, mmx
    mova m1, m0
    psrlq m1, 16
%elifidn %1, mmx2
    pshufw m1, m0, 0x1
%else
    pshuflw m1, m0, 0x1
%endif
    paddw m0, m1 ; sum of H coefficients
%ifidn %3, h264
    pmullw m0, [pw_5]
    paddw m0, [pw_32]
    psraw m0, 6
%elifidn %3, rv40
    pmullw m0, [pw_5]
    psraw m0, 6
%elifidn %3, svq3
    movd r3d, m0
    movsx r3, r3w
    test r3, r3
    lea r4, [r3+3]
    cmovs r3, r4
    sar r3, 2 ; H/4
    lea r3, [r3*5] ; 5*(H/4)
    test r3, r3
    lea r4, [r3+15]
    cmovs r3, r4
    sar r3, 4 ; (5*(H/4))/16
    movd m0, r3d
%endif
    lea r4, [r0+r2*8-1]
    lea r3, [r0+r2*4-1]
    add r4, r2
%ifdef ARCH_X86_64
%define e_reg r11
%else
%define e_reg r0
%endif
    movzx e_reg, byte [r3+r2*2 ]
    movzx r5, byte [r4+r1 ]
    sub r5, e_reg
    movzx e_reg, byte [r3+r2 ]
    movzx r6, byte [r4 ]
    sub r6, e_reg
    lea r5, [r5+r6*2]
    movzx e_reg, byte [r3+r1 ]
    movzx r6, byte [r4+r2*2 ]
    sub r6, e_reg
    lea r5, [r5+r6*4]
    movzx e_reg, byte [r3 ]
%ifdef ARCH_X86_64
    movzx r10, byte [r4+r2 ]
    sub r10, e_reg
%else
    movzx r6, byte [r4+r2 ]
    sub r6, e_reg
    lea r5, [r5+r6*4]
    sub r5, r6
%endif
    lea e_reg, [r3+r1*4]
    lea r3, [r4+r2*4]
    movzx r4, byte [e_reg+r2 ]
    movzx r6, byte [r3 ]
    sub r6, r4
%ifdef ARCH_X86_64
    lea r6, [r10+r6*2]
    lea r5, [r5+r6*2]
    add r5, r6
%else
    lea r5, [r5+r6*4]
    lea r5, [r5+r6*2]
%endif
    movzx r4, byte [e_reg ]
%ifdef ARCH_X86_64
    movzx r10, byte [r3 +r2 ]
    sub r10, r4
    sub r5, r10
%else
    movzx r6, byte [r3 +r2 ]
    sub r6, r4
    lea r5, [r5+r6*8]
    sub r5, r6
%endif
    movzx r4, byte [e_reg+r1 ]
    movzx r6, byte [r3 +r2*2]
    sub r6, r4
%ifdef ARCH_X86_64
    add r6, r10
%endif
    lea r5, [r5+r6*8]
    movzx r4, byte [e_reg+r2*2]
    movzx r6, byte [r3 +r1 ]
    sub r6, r4
    lea r5, [r5+r6*4]
    add r5, r6 ; sum of V coefficients
%ifndef ARCH_X86_64
    mov r0, r0m
%endif
%ifidn %3, h264
    lea r5, [r5*5+32]
    sar r5, 6
%elifidn %3, rv40
    lea r5, [r5*5]
    sar r5, 6
%elifidn %3, svq3
    test r5, r5
    lea r6, [r5+3]
    cmovs r5, r6
    sar r5, 2 ; V/4
    lea r5, [r5*5] ; 5*(V/4)
    test r5, r5
    lea r6, [r5+15]
    cmovs r5, r6
    sar r5, 4 ; (5*(V/4))/16
%endif
    movzx r4, byte [r0+r1 +15]
    movzx r3, byte [r3+r2*2 ]
    lea r3, [r3+r4+1]
    shl r3, 4
    movd r1d, m0
    movsx r1d, r1w
    add r1d, r5d
    add r3d, r1d
    shl r1d, 3
    sub r3d, r1d ; a
    movd m1, r5d
    movd m3, r3d
%ifidn %1, mmx
    punpcklwd m0, m0
    punpcklwd m1, m1
    punpcklwd m3, m3
    punpckldq m0, m0
    punpckldq m1, m1
    punpckldq m3, m3
%elifidn %1, mmx2
    pshufw m0, m0, 0x0
    pshufw m1, m1, 0x0
    pshufw m3, m3, 0x0
%else
    pshuflw m0, m0, 0x0
    pshuflw m1, m1, 0x0
    pshuflw m3, m3, 0x0
    punpcklqdq m0, m0 ; splat H (words)
    punpcklqdq m1, m1 ; splat V (words)
    punpcklqdq m3, m3 ; splat a (words)
%endif
%ifidn %3, svq3
    SWAP 0, 1
%endif
    mova m2, m0
%if mmsize == 8
    mova m5, m0
%endif
    pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
%if mmsize == 16
    psllw m2, 3
%else
    psllw m5, 3
    psllw m2, 2
    mova m6, m5
    paddw m6, m2
%endif
    paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
    paddw m2, m0 ; a + {8,9,10,11,12,13,14,15}*H
%if mmsize == 8
    paddw m5, m0 ; a + {8,9,10,11}*H
    paddw m6, m0 ; a + {12,13,14,15}*H
%endif
    mov r4, 8
.loop:
    mova m3, m0 ; b[0..7]
    mova m4, m2 ; b[8..15]
    psraw m3, 5
    psraw m4, 5
    packuswb m3, m4
    mova [r0], m3
%if mmsize == 8
    mova m3, m5 ; b[8..11]
    mova m4, m6 ; b[12..15]
    psraw m3, 5
    psraw m4, 5
    packuswb m3, m4
    mova [r0+8], m3
%endif
    paddw m0, m1
    paddw m2, m1
%if mmsize == 8
    paddw m5, m1
    paddw m6, m1
%endif
    mova m3, m0 ; b[0..7]
    mova m4, m2 ; b[8..15]
    psraw m3, 5
    psraw m4, 5
    packuswb m3, m4
    mova [r0+r2], m3
%if mmsize == 8
    mova m3, m5 ; b[8..11]
    mova m4, m6 ; b[12..15]
    psraw m3, 5
    psraw m4, 5
    packuswb m3, m4
    mova [r0+r2+8], m3
%endif
    paddw m0, m1
    paddw m2, m1
%if mmsize == 8
    paddw m5, m1
    paddw m6, m1
%endif
    lea r0, [r0+r2*2]
    dec r4
    jg .loop
    REP_RET
%endmacro

INIT_MMX
H264_PRED16x16_PLANE mmx, 0, h264
H264_PRED16x16_PLANE mmx, 0, rv40
H264_PRED16x16_PLANE mmx, 0, svq3
H264_PRED16x16_PLANE mmx2, 0, h264
H264_PRED16x16_PLANE mmx2, 0, rv40
H264_PRED16x16_PLANE mmx2, 0, svq3
INIT_XMM
H264_PRED16x16_PLANE sse2, 8, h264
H264_PRED16x16_PLANE sse2, 8, rv40
H264_PRED16x16_PLANE sse2, 8, svq3
H264_PRED16x16_PLANE ssse3, 8, h264
H264_PRED16x16_PLANE ssse3, 8, rv40
H264_PRED16x16_PLANE ssse3, 8, svq3

;-----------------------------------------------------------------------------
; void pred8x8_plane(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
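; 8x8 chroma plane prediction; same idea as pred16x16_plane above, but
; with 4-tap edge sums and the rounding b = (17*H + 16) >> 5,
; c = (17*V + 16) >> 5 (hence the pw_17/pw_16 constants).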
%macro H264_PRED8x8_PLANE 2
cglobal pred8x8_plane_%1, 2, 7, %2
    mov r2, r1 ; +stride
    neg r1 ; -stride
    movd m0, [r0+r1 -1]
%if mmsize == 8
    pxor m2, m2
    movh m1, [r0+r1 +4 ]
    punpcklbw m0, m2
    punpcklbw m1, m2
    pmullw m0, [pw_m4to4]
    pmullw m1, [pw_m4to4+8]
%else ; mmsize == 16
%ifidn %1, sse2
    pxor m2, m2
    movd m1, [r0+r1 +4]
    punpckldq m0, m1
    punpcklbw m0, m2
    pmullw m0, [pw_m4to4]
%else ; ssse3
    movhps m0, [r0+r1 +4] ; this reads 4 bytes more than necessary
    pmaddubsw m0, [plane8_shuf] ; H coefficients
%endif
    movhlps m1, m0
%endif
    paddw m0, m1
%ifnidn %1, ssse3
%ifidn %1, mmx
    mova m1, m0
    psrlq m1, 32
%elifidn %1, mmx2
    pshufw m1, m0, 0xE
%else ; mmsize == 16
    pshuflw m1, m0, 0xE
%endif
    paddw m0, m1
%endif ; !ssse3
%ifidn %1, mmx
    mova m1, m0
    psrlq m1, 16
%elifidn %1, mmx2
    pshufw m1, m0, 0x1
%else
    pshuflw m1, m0, 0x1
%endif
    paddw m0, m1 ; sum of H coefficients
    pmullw m0, [pw_17]
    paddw m0, [pw_16]
    psraw m0, 5
    lea r4, [r0+r2*4-1]
    lea r3, [r0 -1]
    add r4, r2
%ifdef ARCH_X86_64
%define e_reg r11
%else
%define e_reg r0
%endif
    movzx e_reg, byte [r3+r2*2 ]
    movzx r5, byte [r4+r1 ]
    sub r5, e_reg
    movzx e_reg, byte [r3 ]
%ifdef ARCH_X86_64
    movzx r10, byte [r4+r2 ]
    sub r10, e_reg
    sub r5, r10
%else
    movzx r6, byte [r4+r2 ]
    sub r6, e_reg
    lea r5, [r5+r6*4]
    sub r5, r6
%endif
    movzx e_reg, byte [r3+r1 ]
    movzx r6, byte [r4+r2*2 ]
    sub r6, e_reg
%ifdef ARCH_X86_64
    add r6, r10
%endif
    lea r5, [r5+r6*4]
    movzx e_reg, byte [r3+r2 ]
    movzx r6, byte [r4 ]
    sub r6, e_reg
    lea r6, [r5+r6*2]
    lea r5, [r6*9+16]
    lea r5, [r5+r6*8]
    sar r5, 5
%ifndef ARCH_X86_64
    mov r0, r0m
%endif
    movzx r3, byte [r4+r2*2 ]
    movzx r4, byte [r0+r1 +7]
    lea r3, [r3+r4+1]
    shl r3, 4
    movd r1d, m0
    movsx r1d, r1w
    add r1d, r5d
    sub r3d, r1d
    add r1d, r1d
    sub r3d, r1d ; a
    movd m1, r5d
    movd m3, r3d
%ifidn %1, mmx
    punpcklwd m0, m0
    punpcklwd m1, m1
    punpcklwd m3, m3
    punpckldq m0, m0
    punpckldq m1, m1
    punpckldq m3, m3
%elifidn %1, mmx2
    pshufw m0, m0, 0x0
    pshufw m1, m1, 0x0
    pshufw m3, m3, 0x0
%else
    pshuflw m0, m0, 0x0
    pshuflw m1, m1, 0x0
    pshuflw m3, m3, 0x0
    punpcklqdq m0, m0 ; splat H (words)
    punpcklqdq m1, m1 ; splat V (words)
    punpcklqdq m3, m3 ; splat a (words)
%endif
%if mmsize == 8
    mova m2, m0
%endif
    pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
    paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
%if mmsize == 8
    psllw m2, 2
    paddw m2, m0 ; a + {4,5,6,7}*H
%endif
    mov r4, 4
ALIGN 16
.loop:
%if mmsize == 16
    mova m3, m0 ; b[0..7]
    paddw m0, m1
    psraw m3, 5
    mova m4, m0 ; V+b[0..7]
    paddw m0, m1
    psraw m4, 5
    packuswb m3, m4
    movh [r0], m3
    movhps [r0+r2], m3
%else ; mmsize == 8
    mova m3, m0 ; b[0..3]
    mova m4, m2 ; b[4..7]
    paddw m0, m1
    paddw m2, m1
    psraw m3, 5
    psraw m4, 5
    mova m5, m0 ; V+b[0..3]
    mova m6, m2 ; V+b[4..7]
    paddw m0, m1
    paddw m2, m1
    psraw m5, 5
    psraw m6, 5
    packuswb m3, m4
    packuswb m5, m6
    mova [r0], m3
    mova [r0+r2], m5
%endif
    lea r0, [r0+r2*2]
    dec r4
    jg .loop
    REP_RET
%endmacro

INIT_MMX
H264_PRED8x8_PLANE mmx, 0
H264_PRED8x8_PLANE mmx2, 0
INIT_XMM
H264_PRED8x8_PLANE sse2, 8
H264_PRED8x8_PLANE ssse3, 8

;-----------------------------------------------------------------------------
; void pred8x8_vertical(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
cglobal pred8x8_vertical_mmx, 2,2
    sub r0, r1
    movq mm0, [r0]
%rep 3
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    lea r0, [r0+r1*2]
%endrep
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    RET

;-----------------------------------------------------------------------------
; void pred8x8_horizontal(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8_H 1
cglobal pred8x8_horizontal_%1, 2,3
    mov r2, 4
%ifidn %1, ssse3
    mova m2, [pb_3]
%endif
.loop:
    movd m0, [r0+r1*0-4]
    movd m1, [r0+r1*1-4]
%ifidn %1, ssse3
    pshufb m0, m2
    pshufb m1, m2
%else
    punpcklbw m0, m0
    punpcklbw m1, m1
%ifidn %1, mmxext
    pshufw m0, m0, 0xff
    pshufw m1, m1, 0xff
%else
    punpckhwd m0, m0
    punpckhwd m1, m1
    punpckhdq m0, m0
    punpckhdq m1, m1
%endif
%endif
    mova [r0+r1*0], m0
    mova [r0+r1*1], m1
    lea r0, [r0+r1*2]
    dec r2
    jg .loop
    REP_RET
%endmacro

INIT_MMX
PRED8x8_H mmx
PRED8x8_H mmxext
PRED8x8_H ssse3

;-----------------------------------------------------------------------------
; void pred8x8_top_dc_mmxext(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%ifdef CONFIG_GPL
cglobal pred8x8_top_dc_mmxext, 2,5
    sub r0, r1
    movq mm0, [r0]
    pxor mm1, mm1
    pxor mm2, mm2
    lea r2, [r0+r1*2]
    punpckhbw mm1, mm0
    punpcklbw mm0, mm2
    psadbw mm1, mm2 ; s1
    lea r3, [r2+r1*2]
    psadbw mm0, mm2 ; s0
    psrlw mm1, 1
    psrlw mm0, 1
    pavgw mm1, mm2
    lea r4, [r3+r1*2]
    pavgw mm0, mm2
    pshufw mm1, mm1, 0
    pshufw mm0, mm0, 0 ; dc0 (w)
    packuswb mm0, mm1 ; dc0,dc1 (b)
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    lea r0, [r3+r1*2]
    movq [r2+r1*1], mm0
    movq [r2+r1*2], mm0
    movq [r3+r1*1], mm0
    movq [r3+r1*2], mm0
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    RET
%endif

;-----------------------------------------------------------------------------
; void pred8x8_dc_mmxext(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%ifdef CONFIG_GPL
INIT_MMX
cglobal pred8x8_dc_mmxext, 2,5
    sub r0, r1
    pxor m7, m7
    movd m0, [r0+0]
    movd m1, [r0+4]
    psadbw m0, m7 ; s0
    mov r4, r0
    psadbw m1, m7 ; s1
    movzx r2d, byte [r0+r1*1-1]
    movzx r3d, byte [r0+r1*2-1]
    lea r0, [r0+r1*2]
    add r2d, r3d
    movzx r3d, byte [r0+r1*1-1]
    add r2d, r3d
    movzx r3d, byte [r0+r1*2-1]
    add r2d, r3d
    lea r0, [r0+r1*2]
    movd m2, r2d ; s2
    movzx r2d, byte [r0+r1*1-1]
    movzx r3d, byte [r0+r1*2-1]
    lea r0, [r0+r1*2]
    add r2d, r3d
    movzx r3d, byte [r0+r1*1-1]
    add r2d, r3d
    movzx r3d, byte [r0+r1*2-1]
    add r2d, r3d
    movd m3, r2d ; s3
    punpcklwd m0, m1
    mov r0, r4
    punpcklwd m2, m3
    punpckldq m0, m2 ; s0, s1, s2, s3
    pshufw m3, m0, 11110110b ; s2, s1, s3, s3
    lea r2, [r0+r1*2]
    pshufw m0, m0, 01110100b ; s0, s1, s3, s1
    paddw m0, m3
    lea r3, [r2+r1*2]
    psrlw m0, 2
    pavgw m0, m7 ; s0+s2, s1, s3, s1+s3
    lea r4, [r3+r1*2]
    packuswb m0, m0
    punpcklbw m0, m0
    movq m1, m0
    punpcklbw m0, m0
    punpckhbw m1, m1
    movq [r0+r1*1], m0
    movq [r0+r1*2], m0
    movq [r2+r1*1], m0
    movq [r2+r1*2], m0
    movq [r3+r1*1], m1
    movq [r3+r1*2], m1
    movq [r4+r1*1], m1
    movq [r4+r1*2], m1
    RET
%endif

;-----------------------------------------------------------------------------
; void pred8x8_dc_rv40(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
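; RV40 chroma DC: unlike H.264, a single average over all 16 neighbours,
; roughly dc = (sum(top[0..7]) + sum(left[0..7]) + 8) >> 4, splatted
; over the 8x8 block.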
cglobal pred8x8_dc_rv40_mmxext, 2,7
    mov r4, r0
    sub r0, r1
    pxor mm0, mm0
    psadbw mm0, [r0]
    dec r0
    movzx r5d, byte [r0+r1*1]
    movd r6d, mm0
    lea r0, [r0+r1*2]
%rep 3
    movzx r2d, byte [r0+r1*0]
    movzx r3d, byte [r0+r1*1]
    add r5d, r2d
    add r6d, r3d
    lea r0, [r0+r1*2]
%endrep
    movzx r2d, byte [r0+r1*0]
    add r5d, r6d
    lea r2d, [r2+r5+8]
    shr r2d, 4
    movd mm0, r2d
    punpcklbw mm0, mm0
    pshufw mm0, mm0, 0
    mov r3d, 4
.loop:
    movq [r4+r1*0], mm0
    movq [r4+r1*1], mm0
    lea r4, [r4+r1*2]
    dec r3d
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void pred8x8_tm_vp8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
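; 8x8 TrueMotion, same per-pixel rule as pred16x16_tm_vp8 above:
;   dst[y][x] = clip_uint8(top[x] + left[y] - topleft);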
%macro PRED8x8_TM_MMX 1
cglobal pred8x8_tm_vp8_%1, 2,6
    sub r0, r1
    pxor mm7, mm7
    movq mm0, [r0]
    movq mm1, mm0
    punpcklbw mm0, mm7
    punpckhbw mm1, mm7
    movzx r4d, byte [r0-1]
    mov r5d, 4
.loop:
    movzx r2d, byte [r0+r1*1-1]
    movzx r3d, byte [r0+r1*2-1]
    sub r2d, r4d
    sub r3d, r4d
    movd mm2, r2d
    movd mm4, r3d
%ifidn %1, mmx
    punpcklwd mm2, mm2
    punpcklwd mm4, mm4
    punpckldq mm2, mm2
    punpckldq mm4, mm4
%else
    pshufw mm2, mm2, 0
    pshufw mm4, mm4, 0
%endif
    movq mm3, mm2
    movq mm5, mm4
    paddw mm2, mm0
    paddw mm3, mm1
    paddw mm4, mm0
    paddw mm5, mm1
    packuswb mm2, mm3
    packuswb mm4, mm5
    movq [r0+r1*1], mm2
    movq [r0+r1*2], mm4
    lea r0, [r0+r1*2]
    dec r5d
    jg .loop
    REP_RET
%endmacro

PRED8x8_TM_MMX mmx
PRED8x8_TM_MMX mmxext

cglobal pred8x8_tm_vp8_sse2, 2,6,4
    sub r0, r1
    pxor xmm1, xmm1
    movq xmm0, [r0]
    punpcklbw xmm0, xmm1
    movzx r4d, byte [r0-1]
    mov r5d, 4
.loop:
    movzx r2d, byte [r0+r1*1-1]
    movzx r3d, byte [r0+r1*2-1]
    sub r2d, r4d
    sub r3d, r4d
    movd xmm2, r2d
    movd xmm3, r3d
    pshuflw xmm2, xmm2, 0
    pshuflw xmm3, xmm3, 0
    punpcklqdq xmm2, xmm2
    punpcklqdq xmm3, xmm3
    paddw xmm2, xmm0
    paddw xmm3, xmm0
    packuswb xmm2, xmm3
    movq [r0+r1*1], xmm2
    movhps [r0+r1*2], xmm2
    lea r0, [r0+r1*2]
    dec r5d
    jg .loop
    REP_RET

cglobal pred8x8_tm_vp8_ssse3, 2,3,6
    sub r0, r1
    movdqa xmm4, [tm_shuf]
    pxor xmm1, xmm1
    movq xmm0, [r0]
    punpcklbw xmm0, xmm1
    movd xmm5, [r0-4]
    pshufb xmm5, xmm4
    mov r2d, 4
.loop:
    movd xmm2, [r0+r1*1-4]
    movd xmm3, [r0+r1*2-4]
    pshufb xmm2, xmm4
    pshufb xmm3, xmm4
    psubw xmm2, xmm5
    psubw xmm3, xmm5
    paddw xmm2, xmm0
    paddw xmm3, xmm0
    packuswb xmm2, xmm3
    movq [r0+r1*1], xmm2
    movhps [r0+r1*2], xmm2
    lea r0, [r0+r1*2]
    dec r2d
    jg .loop
    REP_RET

; dest, left, right, src, tmp
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
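; Implemented with byte averages only: pavgb(l,r) rounds up, so the bit
; it may have over-rounded is removed again ((l^r)&1, via psubusb)
; before the second pavgb folds in the centre tap. Roughly:
;   avg = (l + r + 1) >> 1;      // pavgb
;   avg -= (l ^ r) & 1;          // correct the double rounding
;   out = (avg + t + 1) >> 1;    // == (l + 2*t + r + 2) >> 2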
%macro PRED4x4_LOWPASS 5
    mova %5, %2
    pavgb %2, %3
    pxor %3, %5
    mova %1, %4
    pand %3, [pb_1]
    psubusb %2, %3
    pavgb %1, %2
%endmacro

;-----------------------------------------------------------------------------
; void pred8x8l_top_dc(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
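; DC from the (lowpass-filtered) top edge only: the missing top-left /
; top-right samples are patched in when unavailable, then roughly
; dc = (sum(top'[0..7]) + 4) >> 3 fills the whole 8x8 block.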
%ifdef CONFIG_GPL
%macro PRED8x8L_TOP_DC 1
cglobal pred8x8l_top_dc_%1, 4,4
    sub r0, r3
    pxor mm7, mm7
    movq mm0, [r0-8]
    movq mm3, [r0]
    movq mm1, [r0+8]
    movq mm2, mm3
    movq mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test r1, r1 ; top_left
    jz .fix_lt_2
    test r2, r2 ; top_right
    jz .fix_tr_1
    jmp .body
.fix_lt_2:
    movq mm5, mm3
    pxor mm5, mm2
    psllq mm5, 56
    psrlq mm5, 56
    pxor mm2, mm5
    test r2, r2 ; top_right
    jnz .body
.fix_tr_1:
    movq mm5, mm3
    pxor mm5, mm1
    psrlq mm5, 56
    psllq mm5, 56
    pxor mm1, mm5
.body:
    PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
    psadbw mm7, mm0
    paddw mm7, [pw_4]
    psrlw mm7, 3
    pshufw mm7, mm7, 0
    packuswb mm7, mm7
%rep 3
    movq [r0+r3*1], mm7
    movq [r0+r3*2], mm7
    lea r0, [r0+r3*2]
%endrep
    movq [r0+r3*1], mm7
    movq [r0+r3*2], mm7
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_TOP_DC mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_TOP_DC ssse3
%endif

;-----------------------------------------------------------------------------
; void pred8x8l_dc(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
%ifdef CONFIG_GPL
%macro PRED8x8L_DC 1
cglobal pred8x8l_dc_%1, 4,5
    sub r0, r3
    lea r4, [r0+r3*2]
    movq mm0, [r0+r3*1-8]
    punpckhbw mm0, [r0+r3*0-8]
    movq mm1, [r4+r3*1-8]
    punpckhbw mm1, [r0+r3*2-8]
    mov r4, r0
    punpckhwd mm1, mm0
    lea r0, [r0+r3*4]
    movq mm2, [r0+r3*1-8]
    punpckhbw mm2, [r0+r3*0-8]
    lea r0, [r0+r3*2]
    movq mm3, [r0+r3*1-8]
    punpckhbw mm3, [r0+r3*0-8]
    punpckhwd mm3, mm2
    punpckhdq mm3, mm1
    lea r0, [r0+r3*2]
    movq mm0, [r0+r3*0-8]
    movq mm1, [r4]
    mov r0, r4
    movq mm4, mm3
    movq mm2, mm3
    PALIGNR mm4, mm0, 7, mm0
    PALIGNR mm1, mm2, 1, mm2
    test r1, r1
    jnz .do_left
.fix_lt_1:
    movq mm5, mm3
    pxor mm5, mm4
    psrlq mm5, 56
    psllq mm5, 48
    pxor mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq mm5, mm3
    pxor mm5, mm2
    psllq mm5, 56
    psrlq mm5, 56
    pxor mm2, mm5
    test r2, r2
    jnz .body
.fix_tr_1:
    movq mm5, mm3
    pxor mm5, mm1
    psrlq mm5, 56
    psllq mm5, 56
    pxor mm1, mm5
    jmp .body
.do_left:
    movq mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq mm4, mm0
    movq mm7, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq mm1, 56
    PALIGNR mm7, mm1, 7, mm3
    movq mm0, [r0-8]
    movq mm3, [r0]
    movq mm1, [r0+8]
    movq mm2, mm3
    movq mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test r1, r1
    jz .fix_lt_2
    test r2, r2
    jz .fix_tr_1
.body:
    lea r1, [r0+r3*2]
    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
    pxor mm0, mm0
    pxor mm1, mm1
    lea r2, [r1+r3*2]
    psadbw mm0, mm7
    psadbw mm1, mm6
    paddw mm0, [pw_8]
    paddw mm0, mm1
    lea r4, [r2+r3*2]
    psrlw mm0, 4
    pshufw mm0, mm0, 0
    packuswb mm0, mm0
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm0
    movq [r1+r3*1], mm0
    movq [r1+r3*2], mm0
    movq [r2+r3*1], mm0
    movq [r2+r3*2], mm0
    movq [r4+r3*1], mm0
    movq [r4+r3*2], mm0
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_DC mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_DC ssse3
%endif

;-----------------------------------------------------------------------------
; void pred8x8l_horizontal(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
%ifdef CONFIG_GPL
%macro PRED8x8L_HORIZONTAL 1
cglobal pred8x8l_horizontal_%1, 4,4
    sub r0, r3
    lea r2, [r0+r3*2]
    movq mm0, [r0+r3*1-8]
    punpckhbw mm0, [r0+r3*0-8]
    movq mm1, [r2+r3*1-8]
    punpckhbw mm1, [r0+r3*2-8]
    mov r2, r0
    punpckhwd mm1, mm0
    lea r0, [r0+r3*4]
    movq mm2, [r0+r3*1-8]
    punpckhbw mm2, [r0+r3*0-8]
    lea r0, [r0+r3*2]
    movq mm3, [r0+r3*1-8]
    punpckhbw mm3, [r0+r3*0-8]
    punpckhwd mm3, mm2
    punpckhdq mm3, mm1
    lea r0, [r0+r3*2]
    movq mm0, [r0+r3*0-8]
    movq mm1, [r2]
    mov r0, r2
    movq mm4, mm3
    movq mm2, mm3
    PALIGNR mm4, mm0, 7, mm0
    PALIGNR mm1, mm2, 1, mm2
    test r1, r1 ; top_left
    jnz .do_left
.fix_lt_1:
    movq mm5, mm3
    pxor mm5, mm4
    psrlq mm5, 56
    psllq mm5, 48
    pxor mm1, mm5
.do_left:
    movq mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq mm4, mm0
    movq mm7, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq mm1, 56
    PALIGNR mm7, mm1, 7, mm3
    movq mm3, mm7
    lea r1, [r0+r3*2]
    movq mm7, mm3
    punpckhbw mm3, mm3
    punpcklbw mm7, mm7
    pshufw mm0, mm3, 0xff
    pshufw mm1, mm3, 0xaa
    lea r2, [r1+r3*2]
    pshufw mm2, mm3, 0x55
    pshufw mm3, mm3, 0x00
    pshufw mm4, mm7, 0xff
    pshufw mm5, mm7, 0xaa
    pshufw mm6, mm7, 0x55
    pshufw mm7, mm7, 0x00
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm1
    movq [r1+r3*1], mm2
    movq [r1+r3*2], mm3
    movq [r2+r3*1], mm4
    movq [r2+r3*2], mm5
    lea r0, [r2+r3*2]
    movq [r0+r3*1], mm6
    movq [r0+r3*2], mm7
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_HORIZONTAL mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_HORIZONTAL ssse3
%endif

;-----------------------------------------------------------------------------
; void pred8x8l_vertical(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
%ifdef CONFIG_GPL
%macro PRED8x8L_VERTICAL 1
cglobal pred8x8l_vertical_%1, 4,4
    sub r0, r3
    movq mm0, [r0-8]
    movq mm3, [r0]
    movq mm1, [r0+8]
    movq mm2, mm3
    movq mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test r1, r1 ; top_left
    jz .fix_lt_2
    test r2, r2 ; top_right
    jz .fix_tr_1
    jmp .body
.fix_lt_2:
    movq mm5, mm3
    pxor mm5, mm2
    psllq mm5, 56
    psrlq mm5, 56
    pxor mm2, mm5
    test r2, r2 ; top_right
    jnz .body
.fix_tr_1:
    movq mm5, mm3
    pxor mm5, mm1
    psrlq mm5, 56
    psllq mm5, 56
    pxor mm1, mm5
.body:
    PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
%rep 3
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm0
    lea r0, [r0+r3*2]
%endrep
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm0
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_VERTICAL mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_VERTICAL ssse3
%endif

;-----------------------------------------------------------------------------
; void pred8x8l_down_left(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
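; Diagonal down-left: the filtered top and top-right edges are packed
; into one 16-byte xmm vector, lowpass-filtered against its one-byte
; shifted copies, and each successive output row is the result shifted
; right by one more byte (psrldq).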
%ifdef CONFIG_GPL
%macro PRED8x8L_DOWN_LEFT 1
cglobal pred8x8l_down_left_%1, 4,4
    sub r0, r3
    movq mm0, [r0-8]
    movq mm3, [r0]
    movq mm1, [r0+8]
    movq mm2, mm3
    movq mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test r1, r1 ; top_left
    jz .fix_lt_2
    test r2, r2 ; top_right
    jz .fix_tr_1
    jmp .do_top
.fix_lt_2:
    movq mm5, mm3
    pxor mm5, mm2
    psllq mm5, 56
    psrlq mm5, 56
    pxor mm2, mm5
    test r2, r2 ; top_right
    jnz .do_top
.fix_tr_1:
    movq mm5, mm3
    pxor mm5, mm1
    psrlq mm5, 56
    psllq mm5, 56
    pxor mm1, mm5
    jmp .do_top
.fix_tr_2:
    punpckhbw mm3, mm3
    pshufw mm1, mm3, 0xFF
    jmp .do_topright
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq xmm3, mm4
    test r2, r2 ; top_right
    jz .fix_tr_2
    movq mm0, [r0+8]
    movq mm5, mm0
    movq mm2, mm0
    movq mm4, mm0
    psrlq mm5, 56
    PALIGNR mm2, mm3, 7, mm3
    PALIGNR mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
    movq2dq xmm4, mm1
    psrlq mm1, 56
    movq2dq xmm5, mm1
    lea r1, [r0+r3*2]
    pslldq xmm4, 8
    por xmm3, xmm4
    movdqa xmm2, xmm3
    psrldq xmm2, 1
    pslldq xmm5, 15
    por xmm2, xmm5
    lea r2, [r1+r3*2]
    movdqa xmm1, xmm3
    pslldq xmm1, 1
INIT_XMM
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
    psrldq xmm0, 1
    movq [r0+r3*1], xmm0
    psrldq xmm0, 1
    movq [r0+r3*2], xmm0
    psrldq xmm0, 1
    lea r0, [r2+r3*2]
    movq [r1+r3*1], xmm0
    psrldq xmm0, 1
    movq [r1+r3*2], xmm0
    psrldq xmm0, 1
    movq [r2+r3*1], xmm0
    psrldq xmm0, 1
    movq [r2+r3*2], xmm0
    psrldq xmm0, 1
    movq [r0+r3*1], xmm0
    psrldq xmm0, 1
    movq [r0+r3*2], xmm0
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_DOWN_LEFT sse2
INIT_MMX
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_DOWN_LEFT ssse3
%endif

;-----------------------------------------------------------------------------
; void pred8x8l_down_right_mmxext(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
%ifdef CONFIG_GPL
INIT_MMX
%define PALIGNR PALIGNR_MMX
cglobal pred8x8l_down_right_mmxext, 4,5
    sub r0, r3
    lea r4, [r0+r3*2]
    movq mm0, [r0+r3*1-8]
    punpckhbw mm0, [r0+r3*0-8]
    movq mm1, [r4+r3*1-8]
    punpckhbw mm1, [r0+r3*2-8]
    mov r4, r0
    punpckhwd mm1, mm0
    lea r0, [r0+r3*4]
    movq mm2, [r0+r3*1-8]
    punpckhbw mm2, [r0+r3*0-8]
    lea r0, [r0+r3*2]
    movq mm3, [r0+r3*1-8]
    punpckhbw mm3, [r0+r3*0-8]
    punpckhwd mm3, mm2
    punpckhdq mm3, mm1
    lea r0, [r0+r3*2]
    movq mm0, [r0+r3*0-8]
    movq mm1, [r4]
    mov r0, r4
    movq mm4, mm3
    movq mm2, mm3
    PALIGNR mm4, mm0, 7, mm0
    PALIGNR mm1, mm2, 1, mm2
    test r1, r1 ; top_left
    jz .fix_lt_1
.do_left:
    movq mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq mm4, mm0
    movq mm7, mm2
    movq mm6, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq mm1, 56
    PALIGNR mm7, mm1, 7, mm3
    movq mm0, [r0-8]
    movq mm3, [r0]
    movq mm1, [r0+8]
    movq mm2, mm3
    movq mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test r1, r1 ; top_left
    jz .fix_lt_2
    test r2, r2 ; top_right
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq mm5, mm4
    jmp .body
.fix_lt_1:
    movq mm5, mm3
    pxor mm5, mm4
    psrlq mm5, 56
    psllq mm5, 48
    pxor mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq mm5, mm3
    pxor mm5, mm2
    psllq mm5, 56
    psrlq mm5, 56
    pxor mm2, mm5
    test r2, r2 ; top_right
    jnz .do_top
.fix_tr_1:
    movq mm5, mm3
    pxor mm5, mm1
    psrlq mm5, 56
    psllq mm5, 56
    pxor mm1, mm5
    jmp .do_top
.body:
    lea r1, [r0+r3*2]
    movq mm1, mm7
    movq mm7, mm5
    movq mm5, mm6
    movq mm2, mm7
    lea r2, [r1+r3*2]
    PALIGNR mm2, mm6, 1, mm0
    movq mm3, mm7
    PALIGNR mm3, mm6, 7, mm0
    movq mm4, mm7
    lea r4, [r2+r3*2]
    psrlq mm4, 8
    PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
    PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
    movq [r4+r3*2], mm0
    movq mm2, mm1
    psrlq mm0, 8
    psllq mm2, 56
    psrlq mm1, 8
    por mm0, mm2
    movq [r4+r3*1], mm0
    movq mm2, mm1
    psrlq mm0, 8
    psllq mm2, 56
    psrlq mm1, 8
    por mm0, mm2
    movq [r2+r3*2], mm0
    movq mm2, mm1
    psrlq mm0, 8
    psllq mm2, 56
    psrlq mm1, 8
    por mm0, mm2
    movq [r2+r3*1], mm0
    movq mm2, mm1
    psrlq mm0, 8
    psllq mm2, 56
    psrlq mm1, 8
    por mm0, mm2
    movq [r1+r3*2], mm0
    movq mm2, mm1
    psrlq mm0, 8
    psllq mm2, 56
    psrlq mm1, 8
    por mm0, mm2
    movq [r1+r3*1], mm0
    movq mm2, mm1
    psrlq mm0, 8
    psllq mm2, 56
    psrlq mm1, 8
    por mm0, mm2
    movq [r0+r3*2], mm0
    psrlq mm0, 8
    psllq mm1, 56
    por mm0, mm1
    movq [r0+r3*1], mm0
    RET

%macro PRED8x8L_DOWN_RIGHT 1
cglobal pred8x8l_down_right_%1, 4,5
    sub r0, r3
    lea r4, [r0+r3*2]
    movq mm0, [r0+r3*1-8]
    punpckhbw mm0, [r0+r3*0-8]
    movq mm1, [r4+r3*1-8]
    punpckhbw mm1, [r0+r3*2-8]
    mov r4, r0
    punpckhwd mm1, mm0
    lea r0, [r0+r3*4]
    movq mm2, [r0+r3*1-8]
    punpckhbw mm2, [r0+r3*0-8]
    lea r0, [r0+r3*2]
    movq mm3, [r0+r3*1-8]
    punpckhbw mm3, [r0+r3*0-8]
    punpckhwd mm3, mm2
    punpckhdq mm3, mm1
    lea r0, [r0+r3*2]
    movq mm0, [r0+r3*0-8]
    movq mm1, [r4]
    mov r0, r4
    movq mm4, mm3
    movq mm2, mm3
    PALIGNR mm4, mm0, 7, mm0
    PALIGNR mm1, mm2, 1, mm2
    test r1, r1
    jz .fix_lt_1
    jmp .do_left
.fix_lt_1:
    movq mm5, mm3
    pxor mm5, mm4
    psrlq mm5, 56
    psllq mm5, 48
    pxor mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq mm5, mm3
    pxor mm5, mm2
    psllq mm5, 56
    psrlq mm5, 56
    pxor mm2, mm5
    test r2, r2
    jnz .do_top
.fix_tr_1:
    movq mm5, mm3
    pxor mm5, mm1
    psrlq mm5, 56
    psllq mm5, 56
    pxor mm1, mm5
    jmp .do_top
.do_left:
    movq mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq mm4, mm0
    movq mm7, mm2
    movq2dq xmm3, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq mm1, 56
    PALIGNR mm7, mm1, 7, mm3
    movq2dq xmm1, mm7
    movq mm0, [r0-8]
    movq mm3, [r0]
    movq mm1, [r0+8]
    movq mm2, mm3
    movq mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test r1, r1
    jz .fix_lt_2
    test r2, r2
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq xmm4, mm4
    lea r1, [r0+r3*2]
    movdqa xmm0, xmm3
    pslldq xmm4, 8
    por xmm3, xmm4
    lea r2, [r1+r3*2]
    pslldq xmm4, 1
    por xmm1, xmm4
    psrldq xmm0, 7
    pslldq xmm0, 15
    psrldq xmm0, 7
    por xmm1, xmm0
    lea r0, [r2+r3*2]
    movdqa xmm2, xmm3
    psrldq xmm2, 1
INIT_XMM
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
    movdqa xmm1, xmm0
    psrldq xmm1, 1
    movq [r0+r3*2], xmm0
    movq [r0+r3*1], xmm1
    psrldq xmm0, 2
    psrldq xmm1, 2
    movq [r2+r3*2], xmm0
    movq [r2+r3*1], xmm1
    psrldq xmm0, 2
    psrldq xmm1, 2
    movq [r1+r3*2], xmm0
    movq [r1+r3*1], xmm1
    psrldq xmm0, 2
    psrldq xmm1, 2
    movq [r4+r3*2], xmm0
    movq [r4+r3*1], xmm1
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_DOWN_RIGHT sse2
INIT_MMX
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_DOWN_RIGHT ssse3
%endif

;-----------------------------------------------------------------------------
; void pred8x8l_vertical_right(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
%ifdef CONFIG_GPL
INIT_MMX
%define PALIGNR PALIGNR_MMX
cglobal pred8x8l_vertical_right_mmxext, 4,5
    sub r0, r3
    lea r4, [r0+r3*2]
    movq mm0, [r0+r3*1-8]
    punpckhbw mm0, [r0+r3*0-8]
    movq mm1, [r4+r3*1-8]
    punpckhbw mm1, [r0+r3*2-8]
    mov r4, r0
    punpckhwd mm1, mm0
    lea r0, [r0+r3*4]
    movq mm2, [r0+r3*1-8]
    punpckhbw mm2, [r0+r3*0-8]
    lea r0, [r0+r3*2]
    movq mm3, [r0+r3*1-8]
    punpckhbw mm3, [r0+r3*0-8]
    punpckhwd mm3, mm2
    punpckhdq mm3, mm1
    lea r0, [r0+r3*2]
    movq mm0, [r0+r3*0-8]
    movq mm1, [r4]
    mov r0, r4
    movq mm4, mm3
    movq mm2, mm3
    PALIGNR mm4, mm0, 7, mm0
    PALIGNR mm1, mm2, 1, mm2
    test r1, r1
    jz .fix_lt_1
    jmp .do_left
.fix_lt_1:
    movq mm5, mm3
    pxor mm5, mm4
    psrlq mm5, 56
    psllq mm5, 48
    pxor mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq mm5, mm3
    pxor mm5, mm2
    psllq mm5, 56
    psrlq mm5, 56
    pxor mm2, mm5
    test r2, r2
    jnz .do_top
.fix_tr_1:
    movq mm5, mm3
    pxor mm5, mm1
    psrlq mm5, 56
    psllq mm5, 56
    pxor mm1, mm5
    jmp .do_top
.do_left:
    movq mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq mm7, mm2
    movq mm0, [r0-8]
    movq mm3, [r0]
    movq mm1, [r0+8]
    movq mm2, mm3
    movq mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test r1, r1
    jz .fix_lt_2
    test r2, r2
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
    lea r1, [r0+r3*2]
    movq mm2, mm6
    movq mm3, mm6
    PALIGNR mm3, mm7, 7, mm0
    PALIGNR mm6, mm7, 6, mm1
    movq mm4, mm3
    pavgb mm3, mm2
    lea r2, [r1+r3*2]
    PRED4x4_LOWPASS mm0, mm6, mm2, mm4, mm5
    movq [r0+r3*1], mm3
    movq [r0+r3*2], mm0
    movq mm5, mm0
    movq mm6, mm3
    movq mm1, mm7
    movq mm2, mm1
    psllq mm2, 8
    movq mm3, mm1
    psllq mm3, 16
    lea r4, [r2+r3*2]
    PRED4x4_LOWPASS mm0, mm1, mm3, mm2, mm4
    PALIGNR mm6, mm0, 7, mm2
    movq [r1+r3*1], mm6
    psllq mm0, 8
    PALIGNR mm5, mm0, 7, mm1
    movq [r1+r3*2], mm5
    psllq mm0, 8
    PALIGNR mm6, mm0, 7, mm2
    movq [r2+r3*1], mm6
    psllq mm0, 8
    PALIGNR mm5, mm0, 7, mm1
    movq [r2+r3*2], mm5
    psllq mm0, 8
    PALIGNR mm6, mm0, 7, mm2
    movq [r4+r3*1], mm6
    psllq mm0, 8
    PALIGNR mm5, mm0, 7, mm1
    movq [r4+r3*2], mm5
    RET

%macro PRED8x8L_VERTICAL_RIGHT 1
cglobal pred8x8l_vertical_right_%1, 4,5,7
    sub r0, r3
    lea r4, [r0+r3*2]
    movq mm0, [r0+r3*1-8]
    punpckhbw mm0, [r0+r3*0-8]
    movq mm1, [r4+r3*1-8]
    punpckhbw mm1, [r0+r3*2-8]
    mov r4, r0
    punpckhwd mm1, mm0
    lea r0, [r0+r3*4]
    movq mm2, [r0+r3*1-8]
    punpckhbw mm2, [r0+r3*0-8]
    lea r0, [r0+r3*2]
    movq mm3, [r0+r3*1-8]
    punpckhbw mm3, [r0+r3*0-8]
    punpckhwd mm3, mm2
    punpckhdq mm3, mm1
    lea r0, [r0+r3*2]
    movq mm0, [r0+r3*0-8]
    movq mm1, [r4]
    mov r0, r4
    movq mm4, mm3
    movq mm2, mm3
    PALIGNR mm4, mm0, 7, mm0
    PALIGNR mm1, mm2, 1, mm2
    test r1, r1
    jnz .do_left
.fix_lt_1:
    movq mm5, mm3
    pxor mm5, mm4
    psrlq mm5, 56
    psllq mm5, 48
    pxor mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq mm5, mm3
    pxor mm5, mm2
    psllq mm5, 56
    psrlq mm5, 56
    pxor mm2, mm5
    test r2, r2
    jnz .do_top
.fix_tr_1:
    movq mm5, mm3
    pxor mm5, mm1
    psrlq mm5, 56
    psllq mm5, 56
    pxor mm1, mm5
    jmp .do_top
.do_left:
    movq mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq2dq xmm0, mm2
    movq mm0, [r0-8]
    movq mm3, [r0]
    movq mm1, [r0+8]
    movq mm2, mm3
    movq mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test r1, r1
    jz .fix_lt_2
    test r2, r2
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
    lea r1, [r0+r3*2]
    movq2dq xmm4, mm6
    pslldq xmm4, 8
    por xmm0, xmm4
    movdqa xmm6, [pw_ff00]
    movdqa xmm1, xmm0
    lea r2, [r1+r3*2]
    movdqa xmm2, xmm0
    movdqa xmm3, xmm0
    pslldq xmm0, 1
    pslldq xmm1, 2
    pavgb xmm2, xmm0
INIT_XMM
    PRED4x4_LOWPASS xmm4, xmm3, xmm1, xmm0, xmm5
    pandn xmm6, xmm4
    movdqa xmm5, xmm4
    psrlw xmm4, 8
    packuswb xmm6, xmm4
    movhlps xmm4, xmm6
    movhps [r0+r3*2], xmm5
    movhps [r0+r3*1], xmm2
    psrldq xmm5, 4
    movss xmm5, xmm6
    psrldq xmm2, 4
    movss xmm2, xmm4
    lea r0, [r2+r3*2]
    psrldq xmm5, 1
    psrldq xmm2, 1
    movq [r0+r3*2], xmm5
    movq [r0+r3*1], xmm2
    psrldq xmm5, 1
    psrldq xmm2, 1
    movq [r2+r3*2], xmm5
    movq [r2+r3*1], xmm2
    psrldq xmm5, 1
    psrldq xmm2, 1
    movq [r1+r3*2], xmm5
    movq [r1+r3*1], xmm2
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_VERTICAL_RIGHT sse2
INIT_MMX
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_VERTICAL_RIGHT ssse3
%endif

;-----------------------------------------------------------------------------
; void pred8x8l_vertical_left(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
%ifdef CONFIG_GPL
%macro PRED8x8L_VERTICAL_LEFT 1
cglobal pred8x8l_vertical_left_%1, 4,4
    sub r0, r3
    movq mm0, [r0-8]
    movq mm3, [r0]
    movq mm1, [r0+8]
    movq mm2, mm3
    movq mm4, mm3
    PALIGNR mm2, mm0, 7, mm0
    PALIGNR mm1, mm4, 1, mm4
    test r1, r1
    jz .fix_lt_2
    test r2, r2
    jz .fix_tr_1
    jmp .do_top
.fix_lt_2:
    movq mm5, mm3
    pxor mm5, mm2
    psllq mm5, 56
    psrlq mm5, 56
    pxor mm2, mm5
    test r2, r2
    jnz .do_top
.fix_tr_1:
    movq mm5, mm3
    pxor mm5, mm1
    psrlq mm5, 56
    psllq mm5, 56
    pxor mm1, mm5
    jmp .do_top
.fix_tr_2:
    punpckhbw mm3, mm3
    pshufw mm1, mm3, 0xFF
    jmp .do_topright
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq xmm4, mm4
    test r2, r2
    jz .fix_tr_2
    movq mm0, [r0+8]
    movq mm5, mm0
    movq mm2, mm0
    movq mm4, mm0
    psrlq mm5, 56
    PALIGNR mm2, mm3, 7, mm3
    PALIGNR mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
    movq2dq xmm3, mm1
    lea r1, [r0+r3*2]
    pslldq xmm3, 8
    por xmm4, xmm3
    movdqa xmm2, xmm4
    movdqa xmm1, xmm4
    movdqa xmm3, xmm4
    psrldq xmm2, 1
    pslldq xmm1, 1
    pavgb xmm3, xmm2
    lea r2, [r1+r3*2]
INIT_XMM
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm4, xmm5
    psrldq xmm0, 1
    movq [r0+r3*1], xmm3
    movq [r0+r3*2], xmm0
    lea r0, [r2+r3*2]
    psrldq xmm3, 1
    psrldq xmm0, 1
    movq [r1+r3*1], xmm3
    movq [r1+r3*2], xmm0
    psrldq xmm3, 1
    psrldq xmm0, 1
    movq [r2+r3*1], xmm3
    movq [r2+r3*2], xmm0
    psrldq xmm3, 1
    psrldq xmm0, 1
    movq [r0+r3*1], xmm3
    movq [r0+r3*2], xmm0
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_VERTICAL_LEFT sse2
%define PALIGNR PALIGNR_SSSE3
INIT_MMX
PRED8x8L_VERTICAL_LEFT ssse3
%endif

;-----------------------------------------------------------------------------
; void pred8x8l_horizontal_up(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
%ifdef CONFIG_GPL
%macro PRED8x8L_HORIZONTAL_UP 1
cglobal pred8x8l_horizontal_up_%1, 4,4
    sub r0, r3
    lea r2, [r0+r3*2]
    movq mm0, [r0+r3*1-8]
    punpckhbw mm0, [r0+r3*0-8]
    movq mm1, [r2+r3*1-8]
    punpckhbw mm1, [r0+r3*2-8]
    mov r2, r0
    punpckhwd mm1, mm0
    lea r0, [r0+r3*4]
    movq mm2, [r0+r3*1-8]
    punpckhbw mm2, [r0+r3*0-8]
    lea r0, [r0+r3*2]
    movq mm3, [r0+r3*1-8]
    punpckhbw mm3, [r0+r3*0-8]
    punpckhwd mm3, mm2
    punpckhdq mm3, mm1
    lea r0, [r0+r3*2]
    movq mm0, [r0+r3*0-8]
    movq mm1, [r2]
    mov r0, r2
    movq mm4, mm3
    movq mm2, mm3
    PALIGNR mm4, mm0, 7, mm0
    PALIGNR mm1, mm2, 1, mm2
    test r1, r1
    jnz .do_left
.fix_lt_1:
    movq mm5, mm3
    pxor mm5, mm4
    psrlq mm5, 56
    psllq mm5, 48
    pxor mm1, mm5
.do_left:
    movq mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq mm4, mm0
    movq mm7, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq mm1, 56
    PALIGNR mm7, mm1, 7, mm3
    lea r1, [r0+r3*2]
    pshufw mm0, mm7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
    psllq mm7, 56 ; l7 .. .. .. .. .. .. ..
    movq mm2, mm0
    psllw mm0, 8
    psrlw mm2, 8
    por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0
    movq mm3, mm2
    movq mm4, mm2
    movq mm5, mm2
    psrlq mm2, 8
    psrlq mm3, 16
    lea r2, [r1+r3*2]
    por mm2, mm7 ; l7 l7 l6 l5 l4 l3 l2 l1
    punpckhbw mm7, mm7
    por mm3, mm7 ; l7 l7 l7 l6 l5 l4 l3 l2
    pavgb mm4, mm2
    PRED4x4_LOWPASS mm1, mm3, mm5, mm2, mm6
    movq mm5, mm4
    punpcklbw mm4, mm1 ; p4 p3 p2 p1
    punpckhbw mm5, mm1 ; p8 p7 p6 p5
    movq mm6, mm5
    movq mm7, mm5
    movq mm0, mm5
    PALIGNR mm5, mm4, 2, mm1
    pshufw mm1, mm6, 11111001b
    PALIGNR mm6, mm4, 4, mm2
    pshufw mm2, mm7, 11111110b
    PALIGNR mm7, mm4, 6, mm3
    pshufw mm3, mm0, 11111111b
    movq [r0+r3*1], mm4
    movq [r0+r3*2], mm5
    lea r0, [r2+r3*2]
    movq [r1+r3*1], mm6
    movq [r1+r3*2], mm7
    movq [r2+r3*1], mm0
    movq [r2+r3*2], mm1
    movq [r0+r3*1], mm2
    movq [r0+r3*2], mm3
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_HORIZONTAL_UP mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_HORIZONTAL_UP ssse3
%endif

;-----------------------------------------------------------------------------
; void pred4x4_dc_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
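; 4x4 DC: roughly dc = (sum(top[0..3]) + sum(left[0..3]) + 4) >> 3; the
; imul by 0x01010101 broadcasts the byte so each row is one 32-bit store.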
cglobal pred4x4_dc_mmxext, 3,5
    pxor mm7, mm7
    mov r4, r0
    sub r0, r2
    movd mm0, [r0]
    psadbw mm0, mm7
    movzx r1d, byte [r0+r2*1-1]
    movd r3d, mm0
    add r3d, r1d
    movzx r1d, byte [r0+r2*2-1]
    lea r0, [r0+r2*2]
    add r3d, r1d
    movzx r1d, byte [r0+r2*1-1]
    add r3d, r1d
    movzx r1d, byte [r0+r2*2-1]
    add r3d, r1d
    add r3d, 4
    shr r3d, 3
    imul r3d, 0x01010101
    mov [r4+r2*0], r3d
    mov [r0+r2*0], r3d
    mov [r0+r2*1], r3d
    mov [r0+r2*2], r3d
    RET

;-----------------------------------------------------------------------------
; void pred4x4_tm_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED4x4_TM_MMX 1
cglobal pred4x4_tm_vp8_%1, 3,6
    sub r0, r2
    pxor mm7, mm7
    movd mm0, [r0]
    punpcklbw mm0, mm7
    movzx r4d, byte [r0-1]
    mov r5d, 2
.loop:
    movzx r1d, byte [r0+r2*1-1]
    movzx r3d, byte [r0+r2*2-1]
    sub r1d, r4d
    sub r3d, r4d
    movd mm2, r1d
    movd mm4, r3d
%ifidn %1, mmx
    punpcklwd mm2, mm2
    punpcklwd mm4, mm4
    punpckldq mm2, mm2
    punpckldq mm4, mm4
%else
    pshufw mm2, mm2, 0
    pshufw mm4, mm4, 0
%endif
    paddw mm2, mm0
    paddw mm4, mm0
    packuswb mm2, mm2
    packuswb mm4, mm4
    movd [r0+r2*1], mm2
    movd [r0+r2*2], mm4
    lea r0, [r0+r2*2]
    dec r5d
    jg .loop
    REP_RET
%endmacro

PRED4x4_TM_MMX mmx
PRED4x4_TM_MMX mmxext

cglobal pred4x4_tm_vp8_ssse3, 3,3
    sub r0, r2
    movq mm6, [tm_shuf]
    pxor mm1, mm1
    movd mm0, [r0]
    punpcklbw mm0, mm1
    movd mm7, [r0-4]
    pshufb mm7, mm6
    lea r1, [r0+r2*2]
    movd mm2, [r0+r2*1-4]
    movd mm3, [r0+r2*2-4]
    movd mm4, [r1+r2*1-4]
    movd mm5, [r1+r2*2-4]
    pshufb mm2, mm6
    pshufb mm3, mm6
    pshufb mm4, mm6
    pshufb mm5, mm6
    psubw mm2, mm7
    psubw mm3, mm7
    psubw mm4, mm7
    psubw mm5, mm7
    paddw mm2, mm0
    paddw mm3, mm0
    paddw mm4, mm0
    paddw mm5, mm0
    packuswb mm2, mm2
    packuswb mm3, mm3
    packuswb mm4, mm4
    packuswb mm5, mm5
    movd [r0+r2*1], mm2
    movd [r0+r2*2], mm3
    movd [r1+r2*1], mm4
    movd [r1+r2*2], mm5
    RET

;-----------------------------------------------------------------------------
; void pred4x4_vertical_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
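; VP8 vertical 4x4: the row above is smoothed with PRED4x4_LOWPASS using
; its left and right neighbours and replicated to all four rows, roughly:
;   dst[y][x] = (t[x-1] + 2*t[x] + t[x+1] + 2) >> 2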
INIT_MMX
cglobal pred4x4_vertical_vp8_mmxext, 3,3
    sub r0, r2
    movd m1, [r0-1]
    movd m0, [r0]
    mova m2, m0 ; t0 t1 t2 t3
    punpckldq m0, [r1] ; t0 t1 t2 t3 t4 t5 t6 t7
    lea r1, [r0+r2*2]
    psrlq m0, 8 ; t1 t2 t3 t4
    PRED4x4_LOWPASS m3, m1, m0, m2, m4
    movd [r0+r2*1], m3
    movd [r0+r2*2], m3
    movd [r1+r2*1], m3
    movd [r1+r2*2], m3
    RET

;-----------------------------------------------------------------------------
; void pred4x4_down_left_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
%ifdef CONFIG_GPL
INIT_MMX
cglobal pred4x4_down_left_mmxext, 3,3
    sub r0, r2
    movq m1, [r0]
    punpckldq m1, [r1]
    movq m2, m1
    movq m3, m1
    movq m4, m1
    psllq m1, 8
    pxor m2, m1
    psrlq m2, 8
    pxor m3, m2
    PRED4x4_LOWPASS m0, m1, m3, m4, m5
    lea r1, [r0+r2*2]
    psrlq m0, 8
    movd [r0+r2*1], m0
    psrlq m0, 8
    movd [r0+r2*2], m0
    psrlq m0, 8
    movd [r1+r2*1], m0
    psrlq m0, 8
    movd [r1+r2*2], m0
    RET
%endif