You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1209 lines
32KB

  1. ;*****************************************************************************
  2. ;* MMX/SSE2/AVX-optimized 10-bit H.264 intra prediction code
  3. ;*****************************************************************************
  4. ;* Copyright (C) 2005-2011 x264 project
  5. ;*
  6. ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
  7. ;*
  8. ;* This file is part of Libav.
  9. ;*
  10. ;* Libav is free software; you can redistribute it and/or
  11. ;* modify it under the terms of the GNU Lesser General Public
  12. ;* License as published by the Free Software Foundation; either
  13. ;* version 2.1 of the License, or (at your option) any later version.
  14. ;*
  15. ;* Libav is distributed in the hope that it will be useful,
  16. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  17. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  18. ;* Lesser General Public License for more details.
  19. ;*
  20. ;* You should have received a copy of the GNU Lesser General Public
  21. ;* License along with Libav; if not, write to the Free Software
  22. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23. ;******************************************************************************
  24. %include "x86inc.asm"
  25. %include "x86util.asm"
  ; Read-only constants. The cextern'd pw_* symbols are not defined in this file
  ; (presumably packed-word constants of the named value provided elsewhere -- confirm).
  26. SECTION_RODATA
  27. cextern pw_16
  28. cextern pw_8
  29. cextern pw_4
  30. cextern pw_2
  31. cextern pw_1
  32. pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4 ; per-column weights used by pred8x8_plane (pmaddwd and pmullw)
  33. pw_m3: times 8 dw -3 ; multiplied by c in pred8x8_plane to form the first row's vertical term
  34. pw_pixel_max: times 8 dw ((1 << 10)-1) ; 1023: upper clip bound used by pred8x8_plane
  35. pw_512: times 8 dw 512 ; 1 << (BIT_DEPTH-1): fill value of the *_128_dc predictors
  36. pd_17: times 4 dd 17 ; pmaddwd multiplier applied to H and V in pred8x8_plane
  37. pd_16: times 4 dd 16 ; rounding term added before the >> 5 in pred8x8_plane
  38. SECTION .text
  39. ; dest, left, right, src
  40. ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
  ; 3-tap smoothing filter on packed 16-bit lanes.
  ;   %1 = destination
  ;   %2 = one neighbour vector (overwritten with (%2 + %3) >> 1)
  ;   %3 = the other neighbour vector (read only)
  ;   %4 = centre samples (read only; may be the same register as %1)
  ; The three-operand pavgw form presumably relies on the x86inc instruction wrappers for non-AVX builds.
  41. %macro PRED4x4_LOWPASS 4
  42. paddw %2, %3 ; %2 = left + right
  43. psrlw %2, 1 ; %2 = (left + right) >> 1
  44. pavgw %1, %4, %2 ; %1 = (centre + %2 + 1) >> 1
  45. %endmacro
  46. ;-----------------------------------------------------------------------------
  47. ; void pred4x4_down_right(pixel *src, const pixel *topright, int stride)
  48. ;-----------------------------------------------------------------------------
  ; Emits pred4x4_down_right_10_<%1>; %1 is the instruction-set suffix of the symbol name.
  ; Registers (argument order per the prototype above; mapping to r0.. assumed from x86inc):
  ;   r0 = src, r1 = topright (not read here, reused as a row pointer), r2 = stride in bytes.
  ; The left column, the top-left pixel and the top row are gathered into one edge vector,
  ; low-pass filtered once, and each row is the same vector advanced by one pixel.
  49. %macro PRED4x4_DR 1
  50. cglobal pred4x4_down_right_10_%1, 3,3
  51. sub r0, r2 ; r0 -> row above the block
  52. lea r1, [r0+r2*2] ; r1 -> row 1 of the block
  53. movhps m1, [r1-8] ; 8 bytes left of row 1 -> high half
  54. movhps m2, [r0+r2*1-8] ; 8 bytes left of row 0
  55. movhps m4, [r0-8] ; 8 bytes left of the top row (ends with the top-left pixel)
  56. punpckhwd m2, m4
  57. movq m3, [r0] ; the four top pixels
  58. punpckhdq m1, m2
  59. PALIGNR m3, m1, 10, m1
  60. movhps m4, [r1+r2*1-8] ; 8 bytes left of row 2
  61. PALIGNR m0, m3, m4, 14, m4
  62. movhps m4, [r1+r2*2-8] ; 8 bytes left of row 3
  63. PALIGNR m2, m0, m4, 14, m4
  64. PRED4x4_LOWPASS m0, m2, m3, m0 ; m0 = filtered edge (m2 and m3 are its two neighbour-shifted copies)
  65. movq [r1+r2*2], m0 ; row 3
  66. psrldq m0, 2 ; advance one 16-bit pixel
  67. movq [r1+r2*1], m0 ; row 2
  68. psrldq m0, 2
  69. movq [r0+r2*2], m0 ; row 1
  70. psrldq m0, 2
  71. movq [r0+r2*1], m0 ; row 0
  72. RET
  73. %endmacro
  ; Instantiations: sse2 is built with PALIGNR_MMX, ssse3 with PALIGNR_SSSE3.
  ; The avx build (only when HAVE_AVX) inherits the last PALIGNR definition.
  74. INIT_XMM
  75. %define PALIGNR PALIGNR_MMX
  76. PRED4x4_DR sse2
  77. %define PALIGNR PALIGNR_SSSE3
  78. PRED4x4_DR ssse3
  79. %if HAVE_AVX
  80. INIT_AVX
  81. PRED4x4_DR avx
  82. %endif
  83. ;-----------------------------------------------------------------------------
  84. ; void pred4x4_vertical_right(pixel *src, const pixel *topright, int stride)
  85. ;-----------------------------------------------------------------------------
  ; Emits pred4x4_vertical_right_10_<%1>; %1 is the instruction-set suffix of the symbol name.
  ; Registers (argument order per the prototype above; mapping to r0.. assumed from x86inc):
  ;   r0 = src, r1 = topright (not read here, reused as a row pointer), r2 = stride in bytes.
  ; Row 0 is a 2-tap average of the top edge, row 1 is the 3-tap filtered edge;
  ; rows 2 and 3 are rows 0 and 1 moved one pixel to the right with a filtered edge value entering on the left.
  ; The lane diagrams in the comments list the highest lane first.
  86. %macro PRED4x4_VR 1
  87. cglobal pred4x4_vertical_right_10_%1, 3,3,6
  88. sub r0, r2 ; r0 -> row above the block
  89. lea r1, [r0+r2*2] ; r1 -> row 1 of the block
  90. movq m5, [r0] ; ........t3t2t1t0
  91. movhps m1, [r0-8] ; high half ends with the top-left pixel
  92. PALIGNR m0, m5, m1, 14, m1 ; ......t3t2t1t0lt
  93. pavgw m5, m0 ; m5 = rounded average of the top row and the same row offset by one pixel
  94. movhps m1, [r0+r2*1-8]
  95. PALIGNR m0, m1, 14, m1 ; ....t3t2t1t0ltl0
  96. movhps m2, [r0+r2*2-8]
  97. PALIGNR m1, m0, m2, 14, m2 ; ..t3t2t1t0ltl0l1
  98. movhps m3, [r1+r2*1-8]
  99. PALIGNR m2, m1, m3, 14, m3 ; t3t2t1t0ltl0l1l2
  100. PRED4x4_LOWPASS m1, m0, m2, m1 ; m1 = 3-tap filtered edge
  101. pslldq m0, m1, 12 ; keep the two lowest filtered values in the top lanes of m0
  102. psrldq m1, 4 ; drop the two lowest filtered values from m1
  103. movq [r0+r2*1], m5 ; row 0
  104. movq [r0+r2*2], m1 ; row 1
  105. PALIGNR m5, m0, 14, m2 ; shift row 0 up one lane, top lane of m0 enters at lane 0
  106. pslldq m0, 2 ; expose the next filtered value in the top lane
  107. movq [r1+r2*1], m5 ; row 2
  108. PALIGNR m1, m0, 14, m0 ; same for row 1
  109. movq [r1+r2*2], m1 ; row 3
  110. RET
  111. %endmacro
  ; Instantiations: sse2 is built with PALIGNR_MMX, ssse3 with PALIGNR_SSSE3.
  ; The avx build (only when HAVE_AVX) inherits the last PALIGNR definition.
  112. INIT_XMM
  113. %define PALIGNR PALIGNR_MMX
  114. PRED4x4_VR sse2
  115. %define PALIGNR PALIGNR_SSSE3
  116. PRED4x4_VR ssse3
  117. %if HAVE_AVX
  118. INIT_AVX
  119. PRED4x4_VR avx
  120. %endif
  121. ;-----------------------------------------------------------------------------
  122. ; void pred4x4_horizontal_down(pixel *src, const pixel *topright, int stride)
  123. ;-----------------------------------------------------------------------------
  ; Emits pred4x4_horizontal_down_10_<%1>; %1 is the instruction-set suffix of the symbol name.
  ; Registers (argument order per the prototype above; mapping to r0.. assumed from x86inc):
  ;   r0 = src, r1 = topright (not read here, reused as a row pointer), r2 = stride in bytes.
  ; The edge vector "t2 t1 t0 lt l0 l1 l2 l3" is built once; 2-tap averages (m5) and
  ; 3-tap filtered values (m3) are interleaved to form the output rows.
  ; The lane diagrams in the comments list the highest lane first.
  124. %macro PRED4x4_HD 1
  125. cglobal pred4x4_horizontal_down_10_%1, 3,3
  126. sub r0, r2 ; r0 -> row above the block
  127. lea r1, [r0+r2*2] ; r1 -> row 1 of the block
  128. movq m0, [r0-8] ; lt ..
  129. movhps m0, [r0] ; top row into the high half
  130. pslldq m0, 2 ; t2 t1 t0 lt .. .. .. ..
  131. movq m1, [r1+r2*2-8] ; l3
  132. movq m3, [r1+r2*1-8]
  133. punpcklwd m1, m3 ; l2 l3
  134. movq m2, [r0+r2*2-8] ; l1
  135. movq m3, [r0+r2*1-8]
  136. punpcklwd m2, m3 ; l0 l1
  137. punpckhdq m1, m2 ; l0 l1 l2 l3
  138. punpckhqdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
  139. psrldq m0, m1, 4 ; .. .. t2 t1 t0 lt l0 l1
  140. psrldq m3, m1, 2 ; .. t2 t1 t0 lt l0 l1 l2
  141. pavgw m5, m1, m3 ; 2-tap rounded averages of adjacent edge pixels
  142. PRED4x4_LOWPASS m3, m1, m0, m3 ; 3-tap filtered edge (m1 is clobbered)
  143. punpcklwd m5, m3 ; interleave averages with filtered values (low four lanes of each)
  144. psrldq m3, 8 ; keep the upper filtered values
  145. PALIGNR m3, m5, 12, m4 ; prepend the top dword of m5 to them
  146. movq [r1+r2*2], m5 ; row 3 = low half
  147. movhps [r0+r2*2], m5 ; row 1 = high half
  148. psrldq m5, 4 ; advance by two lanes
  149. movq [r1+r2*1], m5 ; row 2
  150. movq [r0+r2*1], m3 ; row 0
  151. RET
  152. %endmacro
  ; Instantiations: sse2 is built with PALIGNR_MMX, ssse3 with PALIGNR_SSSE3.
  ; The avx build (only when HAVE_AVX) inherits the last PALIGNR definition.
  153. INIT_XMM
  154. %define PALIGNR PALIGNR_MMX
  155. PRED4x4_HD sse2
  156. %define PALIGNR PALIGNR_SSSE3
  157. PRED4x4_HD ssse3
  158. %if HAVE_AVX
  159. INIT_AVX
  160. PRED4x4_HD avx
  161. %endif
  162. ;-----------------------------------------------------------------------------
  163. ; void pred4x4_dc(pixel *src, const pixel *topright, int stride)
  164. ;-----------------------------------------------------------------------------
  ; Horizontal dword add: on exit the low dword of %1 holds the sum of all dwords of %1.
  ; %2 is a scratch register and is overwritten; the other lanes of %1 are left with unspecified values.
  165. %macro HADDD 2 ; sum junk
  166. %if mmsize == 16
  167. movhlps %2, %1 ; low qword of %2 = high qword of %1
  168. paddd %1, %2 ; dwords 0,1 = d0+d2, d1+d3
  169. pshuflw %2, %1, 0xE ; move dword 1 down to dword 0
  170. paddd %1, %2 ; dword 0 = d0+d1+d2+d3
  171. %else
  172. pshufw %2, %1, 0xE ; move the high dword down to dword 0
  173. paddd %1, %2 ; dword 0 = d0+d1
  174. %endif
  175. %endmacro
  ; Horizontal word add: multiplies the words of %1 by pw_1 and adds adjacent pairs into dwords
  ; (a plain pairwise sum, assuming pw_1 holds all-ones words -- it is defined outside this file),
  ; then reduces the dwords with HADDD. Result is in the low dword of %1; %2 is scratch.
  176. %macro HADDW 2
  177. pmaddwd %1, [pw_1]
  178. HADDD %1, %2
  179. %endmacro
  ; 4x4 DC prediction: every pixel = (sum of 4 top pixels + sum of 4 left pixels + 4) >> 3
  ; (the +4 assumes pw_4 holds the value 4 in its low word -- defined outside this file).
  ; Registers (argument order per the prototype above; mapping to r0.. assumed from x86inc):
  ;   r0 = src, r1 = topright (not read here, reused as a row pointer), r2 = stride in bytes.
  180. INIT_MMX
  181. cglobal pred4x4_dc_10_mmxext, 3,3
  182. sub r0, r2 ; r0 -> row above the block
  183. lea r1, [r0+r2*2] ; r1 -> row 1 of the block
  184. movq m2, [r0+r2*1-8] ; 4 pixels left of row 0 (top lane = pixel adjacent to the block)
  185. paddw m2, [r0+r2*2-8] ; + row 1
  186. paddw m2, [r1+r2*1-8] ; + row 2
  187. paddw m2, [r1+r2*2-8] ; + row 3
  188. psrlq m2, 48 ; keep only the top lane: sum of the four adjacent left pixels
  189. movq m0, [r0] ; four top pixels
  190. HADDW m0, m1 ; low dword = sum of the top pixels
  191. paddw m0, [pw_4] ; rounding term
  192. paddw m0, m2 ; + left sum
  193. psrlw m0, 3
  194. SPLATW m0, m0, 0 ; broadcast lane 0 to all lanes
  195. movq [r0+r2*1], m0 ; rows 0..3
  196. movq [r0+r2*2], m0
  197. movq [r1+r2*1], m0
  198. movq [r1+r2*2], m0
  199. RET
  200. ;-----------------------------------------------------------------------------
  201. ; void pred4x4_down_left(pixel *src, const pixel *topright, int stride)
  202. ;-----------------------------------------------------------------------------
  ; Emits pred4x4_down_left_10_<%1>; %1 is the instruction-set suffix of the symbol name.
  ; Registers (argument order per the prototype above; mapping to r0.. assumed from x86inc):
  ;   r0 = src, r1 = topright (read once, then reused as a row pointer), r2 = stride in bytes.
  ; The 8-pixel vector top+topright is low-pass filtered once; each row is that vector
  ; advanced by one pixel.
  203. %macro PRED4x4_DL 1
  204. cglobal pred4x4_down_left_10_%1, 3,3
  205. sub r0, r2 ; r0 -> row above the block
  206. movq m0, [r0] ; four top pixels -> low half
  207. movhps m0, [r1] ; four pixels at topright -> high half
  208. psrldq m2, m0, 2 ; neighbours one pixel to the right
  209. pslldq m3, m0, 2 ; neighbours one pixel to the left
  210. pshufhw m2, m2, 10100100b ; refill the vacated top lane with a copy of the last pixel
  211. PRED4x4_LOWPASS m0, m3, m2, m0
  212. lea r1, [r0+r2*2] ; r1 -> row 1 of the block (topright pointer no longer needed)
  213. movhps [r1+r2*2], m0 ; row 3 = high half of the filtered vector
  214. psrldq m0, 2 ; advance one pixel
  215. movq [r0+r2*1], m0 ; row 0
  216. psrldq m0, 2
  217. movq [r0+r2*2], m0 ; row 1
  218. psrldq m0, 2
  219. movq [r1+r2*1], m0 ; row 2
  220. RET
  221. %endmacro
  ; Instantiations: sse2 always, avx only when HAVE_AVX is set.
  222. INIT_XMM
  223. PRED4x4_DL sse2
  224. %if HAVE_AVX
  225. INIT_AVX
  226. PRED4x4_DL avx
  227. %endif
  228. ;-----------------------------------------------------------------------------
  229. ; void pred4x4_vertical_left(pixel *src, const pixel *topright, int stride)
  230. ;-----------------------------------------------------------------------------
  ; Emits pred4x4_vertical_left_10_<%1>; %1 is the instruction-set suffix of the symbol name.
  ; Registers (argument order per the prototype above; mapping to r0.. assumed from x86inc):
  ;   r0 = src, r1 = topright (read once, then reused as a row pointer), r2 = stride in bytes.
  ; Rows 0/2 use 2-tap averages of the top+topright edge, rows 1/3 use the 3-tap filtered edge;
  ; rows 2 and 3 are advanced by one pixel relative to rows 0 and 1.
  231. %macro PRED4x4_VL 1
  232. cglobal pred4x4_vertical_left_10_%1, 3,3
  233. sub r0, r2 ; r0 -> row above the block
  234. movu m1, [r0] ; unaligned 16-byte load starting at the top row
  235. movhps m1, [r1] ; high half replaced by the four pixels at topright
  236. psrldq m0, m1, 2 ; edge advanced by one pixel
  237. psrldq m2, m1, 4 ; edge advanced by two pixels
  238. pavgw m4, m0, m1 ; 2-tap rounded averages
  239. PRED4x4_LOWPASS m0, m1, m2, m0 ; 3-tap filtered values (m1 is clobbered)
  240. lea r1, [r0+r2*2] ; r1 -> row 1 of the block
  241. movq [r0+r2*1], m4 ; row 0
  242. movq [r0+r2*2], m0 ; row 1
  243. psrldq m4, 2 ; advance one pixel
  244. psrldq m0, 2
  245. movq [r1+r2*1], m4 ; row 2
  246. movq [r1+r2*2], m0 ; row 3
  247. RET
  248. %endmacro
  ; Instantiations: sse2 always, avx only when HAVE_AVX is set.
  249. INIT_XMM
  250. PRED4x4_VL sse2
  251. %if HAVE_AVX
  252. INIT_AVX
  253. PRED4x4_VL avx
  254. %endif
  255. ;-----------------------------------------------------------------------------
  256. ; void pred4x4_horizontal_up(pixel *src, const pixel *topright, int stride)
  257. ;-----------------------------------------------------------------------------
  ; 4x4 horizontal-up prediction from the four left neighbours l0..l3 (MMX registers).
  ; Registers (argument order per the prototype above; mapping to r0.. assumed from x86inc):
  ;   r0 = src, r1 = topright (not read here, reused as a row pointer), r2 = stride in bytes.
  ; Lane lists in the comments below are written lowest lane first.
  258. INIT_MMX
  259. cglobal pred4x4_horizontal_up_10_mmxext, 3,3
  260. sub r0, r2 ; r0 -> row above the block
  261. lea r1, [r0+r2*2] ; r1 -> row 1 of the block
  262. movq m0, [r0+r2*1-8] ; 4 pixels left of row 0 (top lane = l0)
  263. punpckhwd m0, [r0+r2*2-8] ; top two lanes = l0 l1
  264. movq m1, [r1+r2*1-8]
  265. punpckhwd m1, [r1+r2*2-8] ; top two lanes = l2 l3
  266. punpckhdq m0, m1 ; m0 = l0 l1 l2 l3
  267. pshufw m1, m1, 0xFF ; m1 = l3 in every lane
  268. movq [r1+r2*2], m1 ; row 3 is all l3
  269. movd [r1+r2*1+4], m1 ; right half of row 2 is l3 as well
  270. pshufw m2, m0, 11111001b ; l1 l2 l3 l3
  271. movq m1, m2
  272. pavgw m2, m0 ; 2-tap rounded averages of adjacent left pixels
  273. pshufw m5, m0, 11111110b ; l2 l3 l3 l3
  274. PRED4x4_LOWPASS m1, m0, m5, m1 ; 3-tap filtered values (m0 is clobbered)
  275. movq m6, m2
  276. punpcklwd m6, m1 ; interleave averages and filtered values
  277. movq [r0+r2*1], m6 ; row 0
  278. psrlq m2, 16 ; drop the first average / filtered value
  279. psrlq m1, 16
  280. punpcklwd m2, m1
  281. movq [r0+r2*2], m2 ; row 1
  282. psrlq m2, 32 ; keep the upper two lanes
  283. movd [r1+r2*1], m2 ; left half of row 2
  284. RET
  285. ;-----------------------------------------------------------------------------
  286. ; void pred8x8_vertical(pixel *src, int stride)
  287. ;-----------------------------------------------------------------------------
  ; 8x8 vertical prediction: copies the 16 bytes (8 pixels) of the row above into all 8 rows.
  ; r0 = src, r1 = stride in bytes (per the prototype above; mapping to r0.. assumed from x86inc).
  ; All accesses use mova, so the rows are presumably 16-byte aligned -- confirm against callers.
  288. INIT_XMM
  289. cglobal pred8x8_vertical_10_sse2, 2,2
  290. sub r0, r1 ; r0 -> row above the block
  291. mova m0, [r0] ; top row
  292. %rep 3
  293. mova [r0+r1*1], m0
  294. mova [r0+r1*2], m0
  295. lea r0, [r0+r1*2] ; advance two rows
  296. %endrep
  297. mova [r0+r1*1], m0 ; last two rows; no pointer update needed
  298. mova [r0+r1*2], m0
  299. RET
  300. ;-----------------------------------------------------------------------------
  301. ; void pred8x8_horizontal(pixel *src, int stride)
  302. ;-----------------------------------------------------------------------------
  ; 8x8 horizontal prediction: each row is filled with the pixel immediately to its left.
  ; r0 = src, r1 = stride in bytes (per the prototype above; mapping to r0.. assumed from x86inc);
  ; r2d = loop counter. Two rows are written per iteration, four iterations in total.
  303. INIT_XMM
  304. cglobal pred8x8_horizontal_10_sse2, 2,3
  305. mov r2d, 4
  306. .loop:
  307. movq m0, [r0+r1*0-8] ; 4 pixels left of this row; lane 3 is the adjacent one
  308. movq m1, [r0+r1*1-8] ; same for the next row
  309. pshuflw m0, m0, 0xff ; broadcast lane 3 across the low four lanes
  310. pshuflw m1, m1, 0xff
  311. punpcklqdq m0, m0 ; duplicate the low half into the high half
  312. punpcklqdq m1, m1
  313. mova [r0+r1*0], m0
  314. mova [r0+r1*1], m1
  315. lea r0, [r0+r1*2] ; advance two rows
  316. dec r2d
  317. jg .loop
  318. REP_RET
  319. ;-----------------------------------------------------------------------------
  320. ; void pred8x8_dc(pixel *src, int stride)
  321. ;-----------------------------------------------------------------------------
  ; Stores one 16-byte row (8 pixels) at address expression %1.
  ;   mmsize == 8 : two 8-byte stores, %2 to [%1+0] and %3 to [%1+8]
  ;   otherwise   : a single movdqa of %2 (%3 is accepted but ignored)
  322. %macro MOV8 2-3
  323. ; sort of a hack, but it works
  324. %if mmsize==8
  325. movq [%1+0], %2
  326. movq [%1+8], %3
  327. %else
  328. movdqa [%1], %2
  329. %endif
  330. %endmacro
  ; Emits pred8x8_dc_10_<%1>.
  ;   %1 = instruction-set suffix of the symbol name
  ;   %2 = word-shuffle instruction to use (pshufw for MMX, pshuflw for XMM; see the instantiations)
  ; r0 = src, r1 = stride in bytes (per the prototype above; mapping to r0.. assumed from x86inc);
  ; r2/r3 = scalar accumulators, r4 = pointer to row 3, r5 = 3*stride.
  ; s0/s1 = sums of the left/right four top pixels, s2/s3 = sums of the upper/lower four left pixels.
  ; Four DC values are derived (see the lane comments) and written to the four 4x4 quadrants.
  331. %macro PRED8x8_DC 2
  332. cglobal pred8x8_dc_10_%1, 2,6
  333. sub r0, r1 ; r0 -> row above the block
  334. pxor m4, m4 ; zero, used by the rounding pavgw below
  335. movq m0, [r0+0] ; top pixels 0..3
  336. movq m1, [r0+8] ; top pixels 4..7
  337. %if mmsize==16
  338. punpcklwd m0, m1 ; interleave the two halves
  339. movhlps m1, m0
  340. paddw m0, m1 ; first level of pairwise sums
  341. %else
  342. pshufw m2, m0, 00001110b
  343. pshufw m3, m1, 00001110b
  344. paddw m0, m2
  345. paddw m1, m3
  346. punpcklwd m0, m1
  347. %endif
  348. %2 m2, m0, 00001110b
  349. paddw m0, m2 ; lanes 0,1 = s0, s1
  350. lea r5, [r1*3]
  351. lea r4, [r0+r1*4] ; r4 -> row 3 of the block
  352. movzx r2d, word [r0+r1*1-2] ; left neighbour of row 0
  353. movzx r3d, word [r0+r1*2-2] ; row 1
  354. add r2d, r3d
  355. movzx r3d, word [r0+r5*1-2] ; row 2
  356. add r2d, r3d
  357. movzx r3d, word [r4-2] ; row 3
  358. add r2d, r3d
  359. movd m2, r2d ; s2
  360. movzx r2d, word [r4+r1*1-2] ; left neighbour of row 4
  361. movzx r3d, word [r4+r1*2-2] ; row 5
  362. add r2d, r3d
  363. movzx r3d, word [r4+r5*1-2] ; row 6
  364. add r2d, r3d
  365. movzx r3d, word [r4+r1*4-2] ; row 7
  366. add r2d, r3d
  367. movd m3, r2d ; s3
  368. punpcklwd m2, m3
  369. punpckldq m0, m2 ; s0, s1, s2, s3
  370. %2 m3, m0, 11110110b ; s2, s1, s3, s3
  371. %2 m0, m0, 01110100b ; s0, s1, s3, s1
  372. paddw m0, m3
  373. psrlw m0, 2
  374. pavgw m0, m4 ; s0+s2, s1, s3, s1+s3
  375. %if mmsize==16
  376. punpcklwd m0, m0 ; duplicate each DC value into a dword
  377. pshufd m3, m0, 11111010b ; lower-half row: dc2 x4, dc3 x4
  378. punpckldq m0, m0 ; upper-half row: dc0 x4, dc1 x4
  379. SWAP 0,1 ; upper-half row is referred to as m1 from here on
  380. %else
  381. pshufw m1, m0, 0x00 ; MMX: one register per quadrant DC
  382. pshufw m2, m0, 0x55
  383. pshufw m3, m0, 0xaa
  384. pshufw m4, m0, 0xff
  385. %endif
  386. MOV8 r0+r1*1, m1, m2 ; rows 0..3
  387. MOV8 r0+r1*2, m1, m2
  388. MOV8 r0+r5*1, m1, m2
  389. MOV8 r0+r1*4, m1, m2
  390. MOV8 r4+r1*1, m3, m4 ; rows 4..7
  391. MOV8 r4+r1*2, m3, m4
  392. MOV8 r4+r5*1, m3, m4
  393. MOV8 r4+r1*4, m3, m4
  394. RET
  395. %endmacro
  ; Instantiations: MMX registers with pshufw, XMM registers with pshuflw.
  396. INIT_MMX
  397. PRED8x8_DC mmxext, pshufw
  398. INIT_XMM
  399. PRED8x8_DC sse2 , pshuflw
  400. ;-----------------------------------------------------------------------------
  401. ; void pred8x8_top_dc(pixel *src, int stride)
  402. ;-----------------------------------------------------------------------------
  ; 8x8 DC prediction from the top row only: each 4-pixel half of a row is filled with
  ; (sum of the four top pixels above that half + 2) >> 2
  ; (the +2 assumes pw_2 holds words of value 2 -- defined outside this file).
  ; r0 = src, r1 = stride in bytes (per the prototype above; mapping to r0.. assumed from x86inc);
  ; r2 = 3*stride, r3 = pointer to row 3.
  403. INIT_XMM
  404. cglobal pred8x8_top_dc_10_sse2, 2,4
  405. sub r0, r1 ; r0 -> row above the block
  406. mova m0, [r0] ; eight top pixels
  407. pshuflw m1, m0, 0x4e ; swap the two dwords inside the low half
  408. pshufhw m1, m1, 0x4e ; and inside the high half
  409. paddw m0, m1 ; pairwise sums within each half
  410. pshuflw m1, m0, 0xb1 ; swap adjacent words
  411. pshufhw m1, m1, 0xb1
  412. paddw m0, m1 ; every lane now holds the 4-pixel sum of its half
  413. lea r2, [r1*3]
  414. lea r3, [r0+r1*4]
  415. paddw m0, [pw_2] ; rounding term
  416. psrlw m0, 2
  417. mova [r0+r1*1], m0 ; rows 0..3
  418. mova [r0+r1*2], m0
  419. mova [r0+r2*1], m0
  420. mova [r0+r1*4], m0
  421. mova [r3+r1*1], m0 ; rows 4..7
  422. mova [r3+r1*2], m0
  423. mova [r3+r2*1], m0
  424. mova [r3+r1*4], m0
  425. RET
  426. ;-----------------------------------------------------------------------------
  427. ; void pred8x8_plane(pixel *src, int stride)
  428. ;-----------------------------------------------------------------------------
  ; 8x8 plane prediction. H (horizontal gradient) is computed with SIMD from the top row,
  ; V (vertical gradient) with scalar code from the left column; both are scaled to b and c
  ; as (17*x + 16) >> 5, and each output row is ((a + b*(x-3) + c*(y-3) + 16) >> 5) clipped to
  ; [0, pw_pixel_max], where a is the "16*(...)" term noted below.
  ; r0 = src, r1 = stride in bytes (per the prototype above; mapping to r0.. assumed from x86inc);
  ; r2 = 3*stride then row counter, r3 = pointer to row 3, r4-r6 = scalar scratch.
  429. INIT_XMM
  430. cglobal pred8x8_plane_10_sse2, 2,7,7
  431. sub r0, r1 ; r0 -> row above the block
  432. lea r2, [r1*3]
  433. lea r3, [r0+r1*4]
  434. mova m2, [r0] ; eight top pixels
  435. pmaddwd m2, [pw_m32101234] ; weight them by -3..4
  436. HADDD m2, m1 ; low dword = weighted sum
  437. movd m0, [r0-4] ; high word of this dword is src[-stride-1]
  438. psrld m0, 14 ; = 4*src[-stride-1], provided the low word's top two bits are clear (true for 10-bit samples)
  439. psubw m2, m0 ; H
  440. movd m0, [r3+r1*4-4] ; dword ending with src[7*stride-1]
  441. movd m1, [r0+12] ; dword ending with src[-stride+7]
  442. paddw m0, m1
  443. psllw m0, 4 ; 16*(src[7*stride-1] + src[-stride+7])
  444. movzx r4d, word [r3+r1*1-2] ; src[4*stride-1]
  445. movzx r5d, word [r0+r2*1-2] ; src[2*stride-1]
  446. sub r4d, r5d
  447. movzx r6d, word [r3+r1*2-2] ; src[5*stride-1]
  448. movzx r5d, word [r0+r1*2-2] ; src[1*stride-1]
  449. sub r6d, r5d
  450. lea r4d, [r4+r6*2] ; weight 2
  451. movzx r5d, word [r3+r2*1-2] ; src[6*stride-1]
  452. movzx r6d, word [r0+r1*1-2] ; src[0*stride-1]
  453. sub r5d, r6d
  454. lea r5d, [r5*3] ; weight 3
  455. add r4d, r5d
  456. movzx r6d, word [r3+r1*4-2] ; src[7*stride-1]
  457. movzx r5d, word [r0+r1*0-2] ; src[ -stride-1]
  458. sub r6d, r5d
  459. lea r4d, [r4+r6*4] ; weight 4
  460. movd m3, r4d ; V
  461. punpckldq m2, m3 ; dword 0 = H, dword 1 = V
  462. pmaddwd m2, [pd_17]
  463. paddd m2, [pd_16]
  464. psrad m2, 5 ; b, c
  465. mova m3, [pw_pixel_max] ; upper clip bound
  466. pxor m1, m1 ; lower clip bound (0)
  467. SPLATW m0, m0, 1 ; broadcast lane 1 (the 16*(...) term) to all lanes
  468. SPLATW m4, m2, 2 ; broadcast c
  469. SPLATW m2, m2, 0 ; broadcast b
  470. pmullw m2, [pw_m32101234] ; b
  471. pmullw m5, m4, [pw_m3] ; c
  472. paddw m5, [pw_16] ; fold the rounding term into the vertical part
  473. mov r2d, 8 ; eight rows
  474. add r0, r1 ; r0 -> row 0 of the block
  475. .loop:
  476. paddsw m6, m2, m5 ; horizontal + vertical terms (saturating)
  477. paddsw m6, m0 ; + a
  478. psraw m6, 5
  479. CLIPW m6, m1, m3 ; clamp to [0, pixel_max]
  480. mova [r0], m6
  481. paddw m5, m4 ; next row: add c
  482. add r0, r1
  483. dec r2d
  484. jg .loop
  485. REP_RET
  486. ;-----------------------------------------------------------------------------
  487. ; void pred8x8l_128_dc(pixel *src, int has_topleft, int has_topright, int stride)
  488. ;-----------------------------------------------------------------------------
  ; Emits pred8x8l_128_dc_10_<%1>: fills all 8 rows of the block with pw_512.
  ; r0 = src, r3 = stride in bytes (per the prototype above; mapping to r0.. assumed from x86inc).
  ; r1 and r2 (has_topleft / has_topright) are not read; they are overwritten as scratch.
  489. %macro PRED8x8L_128_DC 1
  490. cglobal pred8x8l_128_dc_10_%1, 4,4
  491. mova m0, [pw_512] ; (1<<(BIT_DEPTH-1))
  492. lea r1, [r3*3] ; r1 = 3*stride
  493. lea r2, [r0+r3*4] ; r2 -> row 4
  494. MOV8 r0+r3*0, m0, m0 ; rows 0..3
  495. MOV8 r0+r3*1, m0, m0
  496. MOV8 r0+r3*2, m0, m0
  497. MOV8 r0+r1*1, m0, m0
  498. MOV8 r2+r3*0, m0, m0 ; rows 4..7
  499. MOV8 r2+r3*1, m0, m0
  500. MOV8 r2+r3*2, m0, m0
  501. MOV8 r2+r1*1, m0, m0
  502. RET
  503. %endmacro
  ; Instantiations: MMX (two 8-byte stores per row via MOV8) and SSE2 (one 16-byte store per row).
  504. INIT_MMX
  505. PRED8x8L_128_DC mmxext
  506. INIT_XMM
  507. PRED8x8L_128_DC sse2
  508. ;-----------------------------------------------------------------------------
  509. ; void pred8x8l_top_dc(pixel *src, int has_topleft, int has_topright, int stride)
  510. ;-----------------------------------------------------------------------------
  ; Emits pred8x8l_top_dc_10_<%1>: low-pass filters the top row, then fills the block with
  ; (sum of the 8 filtered top pixels + 4) >> 3
  ; (the +4 assumes pw_4 holds the value 4 in its low word -- defined outside this file).
  ; Registers (argument order per the prototype above; mapping to r0.. assumed from x86inc):
  ;   r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride in bytes.
  ; NOTE(review): the shifts below turn the flags into byte offsets; this only works if a set
  ; has_topleft becomes 2 after >>14 and a set has_topright becomes 2 after >>13 -- confirm against callers.
  511. %macro PRED8x8L_TOP_DC 1
  512. cglobal pred8x8l_top_dc_10_%1, 4,4,6
  513. sub r0, r3 ; r0 -> row above the block
  514. mova m0, [r0] ; eight top pixels
  515. shr r1d, 14
  516. shr r2d, 13
  517. neg r1 ; r1 = -(has_topleft >> 14): offset back to the top-left pixel when present, else 0
  518. pslldq m1, m0, 2 ; left neighbours (lane 0 vacated)
  519. psrldq m2, m0, 2 ; right neighbours (lane 7 vacated)
  520. pinsrw m1, [r0+r1], 0 ; lane 0 = word at r0+r1 (the first top pixel itself when r1 is 0)
  521. pinsrw m2, [r0+r2+14], 7 ; lane 7 = word at r0+14+r2 (the last top pixel itself when r2 is 0)
  522. lea r1, [r3*3] ; flags consumed; r1 = 3*stride
  523. lea r2, [r0+r3*4] ; r2 -> row 3
  524. PRED4x4_LOWPASS m0, m2, m1, m0 ; filtered top row
  525. HADDW m0, m1 ; low dword = sum of the filtered pixels
  526. paddw m0, [pw_4]
  527. psrlw m0, 3
  528. SPLATW m0, m0, 0 ; broadcast lane 0
  529. mova [r0+r3*1], m0 ; rows 0..3
  530. mova [r0+r3*2], m0
  531. mova [r0+r1*1], m0
  532. mova [r0+r3*4], m0
  533. mova [r2+r3*1], m0 ; rows 4..7
  534. mova [r2+r3*2], m0
  535. mova [r2+r1*1], m0
  536. mova [r2+r3*4], m0
  537. RET
  538. %endmacro
  ; Instantiations: sse2 always, avx only when HAVE_AVX is set.
  539. INIT_XMM
  540. PRED8x8L_TOP_DC sse2
  541. %if HAVE_AVX
  542. INIT_AVX
  543. PRED8x8L_TOP_DC avx
  544. %endif
  545. ;-----------------------------------------------------------------------------
  546. ;void pred8x8l_dc(pixel *src, int has_topleft, int has_topright, int stride)
  547. ;-----------------------------------------------------------------------------
  548. ;TODO: see if scalar is faster
  ; Emits pred8x8l_dc_10_<%1>: low-pass filters both the left column and the top row, then fills
  ; the block with (sum of the 16 filtered edge pixels + 8) >> 4
  ; (the +8 assumes pw_8 holds the value 8 in its low word -- defined outside this file).
  ; Registers (argument order per the prototype above; mapping to r0.. assumed from x86inc):
  ;   r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride in bytes,
  ;   r4 = pointer to row 3, r5 = 3*stride.
  ; NOTE(review): the flag-to-offset arithmetic assumes a set has_topleft becomes 2 after >>14,
  ; a set has_topright becomes 2 after >>13, and an even stride -- confirm against callers.
  549. %macro PRED8x8L_DC 1
  550. cglobal pred8x8l_dc_10_%1, 4,6,6
  551. sub r0, r3 ; r0 -> row above the block
  552. lea r4, [r0+r3*4]
  553. lea r5, [r3*3]
  ; Gather the left column: the top lane of each 16-byte load at [row-16] is that row's left neighbour.
  554. mova m0, [r0+r3*2-16]
  555. punpckhwd m0, [r0+r3*1-16]
  556. mova m1, [r4+r3*0-16]
  557. punpckhwd m1, [r0+r5*1-16]
  558. punpckhdq m1, m0
  559. mova m2, [r4+r3*2-16]
  560. punpckhwd m2, [r4+r3*1-16]
  561. mova m3, [r4+r3*4-16]
  562. punpckhwd m3, [r4+r5*1-16]
  563. punpckhdq m3, m2
  564. punpckhqdq m3, m1 ; m3 = the eight left neighbours, bottom row in lane 0
  565. mova m0, [r0] ; eight top pixels
  566. shr r1d, 14
  567. shr r2d, 13
  568. neg r1 ; r1 = -(has_topleft >> 14)
  569. pslldq m1, m0, 2 ; top row, left neighbours
  570. psrldq m2, m0, 2 ; top row, right neighbours
  571. pinsrw m1, [r0+r1], 0 ; lane 0 = word at r0+r1 (the first top pixel itself when r1 is 0)
  572. pinsrw m2, [r0+r2+14], 7 ; lane 7 = word at r0+14+r2 (the last top pixel itself when r2 is 0)
  573. not r1
  574. and r1, r3 ; r1 = stride when has_topleft was 0, else stride & 1
  575. pslldq m4, m3, 2
  576. psrldq m5, m3, 2
  577. pshuflw m4, m4, 11100101b ; copy lane 1 into the vacated lane 0 (repeats the bottom-most left pixel)
  578. pinsrw m5, [r0+r1-2], 7 ; lane 7 = word at r0+r1-2 (top-left, or row 0's left neighbour when r1 is stride)
  579. PRED4x4_LOWPASS m3, m4, m5, m3 ; filtered left column
  580. PRED4x4_LOWPASS m0, m2, m1, m0 ; filtered top row
  581. paddw m0, m3
  582. HADDW m0, m1 ; low dword = sum of all 16 filtered pixels
  583. paddw m0, [pw_8]
  584. psrlw m0, 4
  585. SPLATW m0, m0
  586. mova [r0+r3*1], m0 ; rows 0..3
  587. mova [r0+r3*2], m0
  588. mova [r0+r5*1], m0
  589. mova [r0+r3*4], m0
  590. mova [r4+r3*1], m0 ; rows 4..7
  591. mova [r4+r3*2], m0
  592. mova [r4+r5*1], m0
  593. mova [r4+r3*4], m0
  594. RET
  595. %endmacro
  ; Instantiations: sse2 always, avx only when HAVE_AVX is set.
  596. INIT_XMM
  597. PRED8x8L_DC sse2
  598. %if HAVE_AVX
  599. INIT_AVX
  600. PRED8x8L_DC avx
  601. %endif
  602. ;-----------------------------------------------------------------------------
  603. ; void pred8x8l_vertical(pixel *src, int has_topleft, int has_topright, int stride)
  604. ;-----------------------------------------------------------------------------
  ; Emits pred8x8l_vertical_10_<%1>: low-pass filters the top row and copies it into all 8 rows.
  ; Registers (argument order per the prototype above; mapping to r0.. assumed from x86inc):
  ;   r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride in bytes.
  ; NOTE(review): the flag-to-offset shifts assume a set has_topleft becomes 2 after >>14 and a set
  ; has_topright becomes 2 after >>13 -- confirm against callers.
  605. %macro PRED8x8L_VERTICAL 1
  606. cglobal pred8x8l_vertical_10_%1, 4,4,6
  607. sub r0, r3 ; r0 -> row above the block
  608. mova m0, [r0] ; eight top pixels
  609. shr r1d, 14
  610. shr r2d, 13
  611. neg r1 ; r1 = -(has_topleft >> 14)
  612. pslldq m1, m0, 2 ; left neighbours (lane 0 vacated)
  613. psrldq m2, m0, 2 ; right neighbours (lane 7 vacated)
  614. pinsrw m1, [r0+r1], 0 ; lane 0 = word at r0+r1 (the first top pixel itself when r1 is 0)
  615. pinsrw m2, [r0+r2+14], 7 ; lane 7 = word at r0+14+r2 (the last top pixel itself when r2 is 0)
  616. lea r1, [r3*3] ; flags consumed; r1 = 3*stride
  617. lea r2, [r0+r3*4] ; r2 -> row 3
  618. PRED4x4_LOWPASS m0, m2, m1, m0 ; filtered top row
  619. mova [r0+r3*1], m0 ; rows 0..3
  620. mova [r0+r3*2], m0
  621. mova [r0+r1*1], m0
  622. mova [r0+r3*4], m0
  623. mova [r2+r3*1], m0 ; rows 4..7
  624. mova [r2+r3*2], m0
  625. mova [r2+r1*1], m0
  626. mova [r2+r3*4], m0
  627. RET
  628. %endmacro
  ; Instantiations: sse2 always, avx only when HAVE_AVX is set.
  629. INIT_XMM
  630. PRED8x8L_VERTICAL sse2
  631. %if HAVE_AVX
  632. INIT_AVX
  633. PRED8x8L_VERTICAL avx
  634. %endif
  635. ;-----------------------------------------------------------------------------
  636. ; void pred8x8l_horizontal(pixel *src, int has_topleft, int has_topright, int stride)
  637. ;-----------------------------------------------------------------------------
  ; Emits pred8x8l_horizontal_10_<%1>: gathers the left column, low-pass filters it, and fills
  ; each row with its filtered left neighbour.
  ; Registers (argument order per the prototype above; mapping to r0.. assumed from x86inc):
  ;   r0 = src (not rewound in this function), r1 = has_topleft, r2 = has_topright (not read; reused as a pointer),
  ;   r3 = stride in bytes.
  ; NOTE(review): the offset arithmetic on r1 assumes a set has_topleft becomes 2 after >>14 and an
  ; even stride -- confirm against callers.
  638. %macro PRED8x8L_HORIZONTAL 1
  639. cglobal pred8x8l_horizontal_10_%1, 4,4,5
  640. mova m0, [r0-16] ; 16 bytes left of row 0; top lane is the adjacent pixel
  641. shr r1d, 14
  642. dec r1
  643. and r1, r3
  644. sub r1, r3 ; r1 = 0 when has_topleft was 0, else (stride & 1) - stride
  645. punpckhwd m0, [r0+r1-16] ; pair row 0's left pixel with the one at offset r1 (row above, or row 0 again)
  646. mova m1, [r0+r3*2-16]
  647. punpckhwd m1, [r0+r3*1-16]
  648. lea r2, [r0+r3*4] ; r2 -> row 4
  649. lea r1, [r3*3] ; r1 = 3*stride
  650. punpckhdq m1, m0
  651. mova m2, [r2+r3*0-16]
  652. punpckhwd m2, [r0+r1-16]
  653. mova m3, [r2+r3*2-16]
  654. punpckhwd m3, [r2+r3*1-16]
  655. punpckhdq m3, m2
  656. punpckhqdq m3, m1 ; m3 = gathered left-edge vector
  657. PALIGNR m4, m3, [r2+r1-16], 14, m0 ; shift in row 7's left pixel from memory
  658. pslldq m0, m4, 2
  659. pshuflw m0, m0, 11100101b ; copy lane 1 into the vacated lane 0
  660. PRED4x4_LOWPASS m4, m3, m0, m4 ; filtered left column (m3 is clobbered)
  661. punpckhwd m3, m4, m4 ; double every value so each dword holds one pixel twice
  662. punpcklwd m4, m4
  663. pshufd m0, m3, 0xff ; broadcast one dword (one left pixel) per row
  664. pshufd m1, m3, 0xaa
  665. pshufd m2, m3, 0x55
  666. pshufd m3, m3, 0x00
  667. mova [r0+r3*0], m0 ; rows 0..3
  668. mova [r0+r3*1], m1
  669. mova [r0+r3*2], m2
  670. mova [r0+r1*1], m3
  671. pshufd m0, m4, 0xff
  672. pshufd m1, m4, 0xaa
  673. pshufd m2, m4, 0x55
  674. pshufd m3, m4, 0x00
  675. mova [r2+r3*0], m0 ; rows 4..7
  676. mova [r2+r3*1], m1
  677. mova [r2+r3*2], m2
  678. mova [r2+r1*1], m3
  679. RET
  680. %endmacro
  ; Instantiations: sse2 is built with PALIGNR_MMX, ssse3 with PALIGNR_SSSE3.
  ; The avx build (only when HAVE_AVX) inherits the last PALIGNR definition.
  681. INIT_XMM
  682. %define PALIGNR PALIGNR_MMX
  683. PRED8x8L_HORIZONTAL sse2
  684. %define PALIGNR PALIGNR_SSSE3
  685. PRED8x8L_HORIZONTAL ssse3
  686. %if HAVE_AVX
  687. INIT_AVX
  688. PRED8x8L_HORIZONTAL avx
  689. %endif
  690. ;-----------------------------------------------------------------------------
  691. ;void pred8x8l_down_left(pixel *src, int has_topleft, int has_topright, int stride)
  692. ;-----------------------------------------------------------------------------
  ; Emits pred8x8l_down_left_10_<%1>.
  ; Registers (argument order per the prototype above; mapping to r0.. assumed from x86inc):
  ;   r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride in bytes.
  ; The top row (m6) and the top-right row (m1) are low-pass filtered as edge samples, filtered a
  ; second time to form the prediction, and the resulting 16-lane sequence is written out with a
  ; one-pixel offset per row, starting from row 7 and working upwards.
  ; NOTE(review): the flag-to-offset shifts assume a set has_topleft becomes 2 after >>14 and a set
  ; has_topright becomes 2 after >>13 -- confirm against callers.
  693. %macro PRED8x8L_DOWN_LEFT 1
  694. cglobal pred8x8l_down_left_10_%1, 4,4,7
  695. sub r0, r3 ; r0 -> row above the block
  696. mova m3, [r0] ; eight top pixels
  697. shr r1d, 14
  698. neg r1 ; r1 = -(has_topleft >> 14)
  699. shr r2d, 13 ; sets ZF when no top-right is available; ZF is consumed by the jz below
  700. pslldq m1, m3, 2
  701. psrldq m2, m3, 2
  702. pinsrw m1, [r0+r1], 0 ; lane 0 = word at r0+r1 (the first top pixel itself when r1 is 0)
  703. pinsrw m2, [r0+r2+14], 7 ; lane 7 = word at r0+14+r2 (the last top pixel itself when r2 is 0)
  704. PRED4x4_LOWPASS m6, m2, m1, m3 ; m6 = filtered top row (only SIMD instructions since the shr, so ZF is intact)
  705. jz .fix_tr ; flags from shr r2d
  706. mova m1, [r0+16] ; eight top-right pixels
  707. psrldq m5, m1, 2
  708. PALIGNR m2, m1, m3, 14, m3 ; left neighbours, with the last top pixel entering lane 0
  709. pshufhw m5, m5, 10100100b ; refill the vacated top lane with a copy of the last pixel
  710. PRED4x4_LOWPASS m1, m2, m5, m1 ; m1 = filtered top-right row
  711. .do_topright:
  712. lea r1, [r3*3] ; r1 = 3*stride
  713. psrldq m5, m1, 14 ; lane 0 = last top-right value
  714. lea r2, [r0+r3*4] ; r2 -> row 3
  715. PALIGNR m2, m1, m6, 2, m0
  716. PALIGNR m3, m1, m6, 14, m0
  717. PALIGNR m5, m1, 2, m0
  718. pslldq m4, m6, 2
  719. PRED4x4_LOWPASS m6, m4, m2, m6 ; second filter pass, lower eight lanes
  720. PRED4x4_LOWPASS m1, m3, m5, m1 ; second filter pass, upper eight lanes
  721. mova [r2+r3*4], m1 ; row 7
  722. PALIGNR m1, m6, 14, m2 ; step back one pixel: top lane of m6 enters lane 0
  723. pslldq m6, 2
  724. mova [r2+r1*1], m1 ; row 6
  725. PALIGNR m1, m6, 14, m2
  726. pslldq m6, 2
  727. mova [r2+r3*2], m1 ; row 5
  728. PALIGNR m1, m6, 14, m2
  729. pslldq m6, 2
  730. mova [r2+r3*1], m1 ; row 4
  731. PALIGNR m1, m6, 14, m2
  732. pslldq m6, 2
  733. mova [r0+r3*4], m1 ; row 3
  734. PALIGNR m1, m6, 14, m2
  735. pslldq m6, 2
  736. mova [r0+r1*1], m1 ; row 2
  737. PALIGNR m1, m6, 14, m2
  738. pslldq m6, 2
  739. mova [r0+r3*2], m1 ; row 1
  740. PALIGNR m1, m6, 14, m6
  741. mova [r0+r3*1], m1 ; row 0
  742. RET
  ; No top-right available: use the last (unfiltered) top pixel in every lane of m1 instead.
  743. .fix_tr:
  744. punpckhwd m3, m3
  745. pshufd m1, m3, 0xFF
  746. jmp .do_topright
  747. %endmacro
  ; Instantiations: sse2 is built with PALIGNR_MMX, ssse3 with PALIGNR_SSSE3.
  ; The avx build (only when HAVE_AVX) inherits the last PALIGNR definition.
  748. INIT_XMM
  749. %define PALIGNR PALIGNR_MMX
  750. PRED8x8L_DOWN_LEFT sse2
  751. %define PALIGNR PALIGNR_SSSE3
  752. PRED8x8L_DOWN_LEFT ssse3
  753. %if HAVE_AVX
  754. INIT_AVX
  755. PRED8x8L_DOWN_LEFT avx
  756. %endif
  757. ;-----------------------------------------------------------------------------
  758. ;void pred8x8l_down_right(pixel *src, int has_topleft, int has_topright, int stride)
  759. ;-----------------------------------------------------------------------------
  ; Emits pred8x8l_down_right_10_<%1>.
  ; Registers (argument order per the prototype above; mapping to r0.. assumed from x86inc):
  ;   r0 = src, r1 = has_topleft (never read; reused for 3*stride), r2 = has_topright,
  ;   r3 = stride in bytes, r4 = pointer to row 3.
  ; The left column and the top row are gathered and low-pass filtered as edge samples, filtered a
  ; second time, and the resulting sequence is written with a one-pixel offset per row.
  ; NOTE(review): the has_topright shift assumes a set flag becomes 2 after >>13 -- confirm against callers.
  760. %macro PRED8x8L_DOWN_RIGHT 1
  761. ; standard forbids this when has_topleft is false
  762. ; no need to check
  763. cglobal pred8x8l_down_right_10_%1, 4,5,8
  764. sub r0, r3 ; r0 -> row above the block
  765. lea r4, [r0+r3*4]
  766. lea r1, [r3*3]
  ; Gather left neighbours: the top lane of each load at [row-16] is the pixel adjacent to that row.
  767. mova m0, [r0+r3*1-16]
  768. punpckhwd m0, [r0+r3*0-16] ; includes the top-left pixel (row above)
  769. mova m1, [r0+r1*1-16]
  770. punpckhwd m1, [r0+r3*2-16]
  771. punpckhdq m1, m0
  772. mova m2, [r4+r3*1-16]
  773. punpckhwd m2, [r4+r3*0-16]
  774. mova m3, [r4+r1*1-16]
  775. punpckhwd m3, [r4+r3*2-16]
  776. punpckhdq m3, m2
  777. punpckhqdq m3, m1 ; m3 = gathered left-edge vector (top-left in the top lane)
  778. mova m0, [r4+r3*4-16] ; row 7's left neighbour is in the top lane
  779. mova m1, [r0] ; eight top pixels
  780. PALIGNR m4, m3, m0, 14, m0
  781. PALIGNR m1, m3, 2, m2
  782. pslldq m0, m4, 2
  783. pshuflw m0, m0, 11100101b ; copy lane 1 into the vacated lane 0
  784. PRED4x4_LOWPASS m6, m1, m4, m3
  785. PRED4x4_LOWPASS m4, m3, m0, m4
  786. mova m3, [r0] ; reload the top row for its own edge filter
  787. shr r2d, 13
  788. pslldq m1, m3, 2
  789. psrldq m2, m3, 2
  790. pinsrw m1, [r0-2], 0 ; lane 0 = top-left pixel (read unconditionally, see the note at the top)
  791. pinsrw m2, [r0+r2+14], 7 ; lane 7 = word at r0+14+r2 (the last top pixel itself when r2 is 0)
  792. PRED4x4_LOWPASS m3, m2, m1, m3 ; m3 = filtered top row
  793. PALIGNR m2, m3, m6, 2, m0
  794. PALIGNR m5, m3, m6, 14, m0
  795. psrldq m7, m3, 2
  796. PRED4x4_LOWPASS m6, m4, m2, m6 ; second filter pass, lower eight lanes
  797. PRED4x4_LOWPASS m3, m5, m7, m3 ; second filter pass, upper eight lanes
  798. mova [r4+r3*4], m6 ; row 7
  799. PALIGNR m3, m6, 14, m2 ; top lane of m6 enters lane 0 of m3
  800. pslldq m6, 2
  801. mova [r0+r3*1], m3 ; row 0
  802. PALIGNR m3, m6, 14, m2
  803. pslldq m6, 2
  804. mova [r0+r3*2], m3 ; row 1
  805. PALIGNR m3, m6, 14, m2
  806. pslldq m6, 2
  807. mova [r0+r1*1], m3 ; row 2
  808. PALIGNR m3, m6, 14, m2
  809. pslldq m6, 2
  810. mova [r0+r3*4], m3 ; row 3
  811. PALIGNR m3, m6, 14, m2
  812. pslldq m6, 2
  813. mova [r4+r3*1], m3 ; row 4
  814. PALIGNR m3, m6, 14, m2
  815. pslldq m6, 2
  816. mova [r4+r3*2], m3 ; row 5
  817. PALIGNR m3, m6, 14, m6
  818. mova [r4+r1*1], m3 ; row 6
  819. RET
  820. %endmacro
  ; Instantiations: sse2 is built with PALIGNR_MMX, ssse3 with PALIGNR_SSSE3.
  ; The avx build (only when HAVE_AVX) inherits the last PALIGNR definition.
  821. INIT_XMM
  822. %define PALIGNR PALIGNR_MMX
  823. PRED8x8L_DOWN_RIGHT sse2
  824. %define PALIGNR PALIGNR_SSSE3
  825. PRED8x8L_DOWN_RIGHT ssse3
  826. %if HAVE_AVX
  827. INIT_AVX
  828. PRED8x8L_DOWN_RIGHT avx
  829. %endif
  830. ;-----------------------------------------------------------------------------
  831. ; void pred8x8l_vertical_right(pixel *src, int has_topleft, int has_topright, int stride)
  832. ;-----------------------------------------------------------------------------
  ; Emits pred8x8l_vertical_right_10_<%1>.
  ; Registers (argument order per the prototype above; mapping to r0.. assumed from x86inc):
  ;   r0 = src, r1 = has_topleft (never read; reused for 3*stride), r2 = has_topright,
  ;   r3 = stride in bytes, r4 = pointer to row 3.
  ; Row 0 is a 2-tap average and row 1 a 3-tap filter of the filtered top edge; every later row is
  ; the row two above it moved one pixel, with a value derived from the filtered left column
  ; entering at lane 0.
  ; NOTE(review): the has_topright shift assumes a set flag becomes 2 after >>13 -- confirm against callers.
  833. %macro PRED8x8L_VERTICAL_RIGHT 1
  834. ; likewise with 8x8l_down_right
  835. cglobal pred8x8l_vertical_right_10_%1, 4,5,7
  836. sub r0, r3 ; r0 -> row above the block
  837. lea r4, [r0+r3*4]
  838. lea r1, [r3*3]
  ; Gather left neighbours: the top lane of each load at [row-16] is the pixel adjacent to that row.
  839. mova m0, [r0+r3*1-16]
  840. punpckhwd m0, [r0+r3*0-16] ; includes the top-left pixel (row above)
  841. mova m1, [r0+r1*1-16]
  842. punpckhwd m1, [r0+r3*2-16]
  843. punpckhdq m1, m0
  844. mova m2, [r4+r3*1-16]
  845. punpckhwd m2, [r4+r3*0-16]
  846. mova m3, [r4+r1*1-16]
  847. punpckhwd m3, [r4+r3*2-16]
  848. punpckhdq m3, m2
  849. punpckhqdq m3, m1 ; m3 = gathered left-edge vector (top-left in the top lane)
  850. mova m0, [r4+r3*4-16] ; row 7's left neighbour is in the top lane
  851. mova m1, [r0] ; eight top pixels
  852. PALIGNR m4, m3, m0, 14, m0
  853. PALIGNR m1, m3, 2, m2
  854. PRED4x4_LOWPASS m3, m1, m4, m3 ; m3 = filtered left edge
  855. mova m2, [r0] ; reload the top row for its own edge filter
  856. shr r2d, 13
  857. pslldq m1, m2, 2
  858. psrldq m5, m2, 2
  859. pinsrw m1, [r0-2], 0 ; lane 0 = top-left pixel (read unconditionally)
  860. pinsrw m5, [r0+r2+14], 7 ; lane 7 = word at r0+14+r2 (the last top pixel itself when r2 is 0)
  861. PRED4x4_LOWPASS m2, m5, m1, m2 ; m2 = filtered top row
  862. PALIGNR m6, m2, m3, 12, m1
  863. PALIGNR m5, m2, m3, 14, m0
  864. PRED4x4_LOWPASS m0, m6, m2, m5 ; 3-tap values for row 1
  865. pavgw m2, m5 ; 2-tap averages for row 0
  866. mova [r0+r3*2], m0 ; row 1
  867. mova [r0+r3*1], m2 ; row 0
  868. pslldq m6, m3, 4
  869. pslldq m1, m3, 2
  870. PRED4x4_LOWPASS m1, m3, m6, m1 ; values fed into lane 0 of the remaining rows
  871. PALIGNR m2, m1, 14, m4 ; top lane of m1 enters lane 0
  872. mova [r0+r1*1], m2 ; row 2
  873. pslldq m1, 2 ; expose the next value in the top lane
  874. PALIGNR m0, m1, 14, m3
  875. mova [r0+r3*4], m0 ; row 3
  876. pslldq m1, 2
  877. PALIGNR m2, m1, 14, m4
  878. mova [r4+r3*1], m2 ; row 4
  879. pslldq m1, 2
  880. PALIGNR m0, m1, 14, m3
  881. mova [r4+r3*2], m0 ; row 5
  882. pslldq m1, 2
  883. PALIGNR m2, m1, 14, m4
  884. mova [r4+r1*1], m2 ; row 6
  885. pslldq m1, 2
  886. PALIGNR m0, m1, 14, m1
  887. mova [r4+r3*4], m0 ; row 7
  888. RET
  889. %endmacro
  ; Instantiations: sse2 is built with PALIGNR_MMX, ssse3 with PALIGNR_SSSE3.
  ; The avx build (only when HAVE_AVX) inherits the last PALIGNR definition.
  890. INIT_XMM
  891. %define PALIGNR PALIGNR_MMX
  892. PRED8x8L_VERTICAL_RIGHT sse2
  893. %define PALIGNR PALIGNR_SSSE3
  894. PRED8x8L_VERTICAL_RIGHT ssse3
  895. %if HAVE_AVX
  896. INIT_AVX
  897. PRED8x8L_VERTICAL_RIGHT avx
  898. %endif
  899. ;-----------------------------------------------------------------------------
  900. ; void pred8x8l_horizontal_up(pixel *src, int has_topleft, int has_topright, int stride)
  901. ;-----------------------------------------------------------------------------
  ; Emits pred8x8l_horizontal_up_10_<%1>.
  ; Registers (argument order per the prototype above; mapping to r0.. assumed from x86inc):
  ;   r0 = src (not rewound in this function), r1 = has_topleft, r2 = has_topright (not read; reused as a pointer),
  ;   r3 = stride in bytes.
  ; The left column is gathered and low-pass filtered; 2-tap averages and 3-tap filtered values of
  ; the result are interleaved, and each row is that interleaved sequence advanced by two lanes.
  ; NOTE(review): the offset arithmetic on r1 assumes a set has_topleft becomes 2 after >>14 and an
  ; even stride -- confirm against callers.
  902. %macro PRED8x8L_HORIZONTAL_UP 1
  903. cglobal pred8x8l_horizontal_up_10_%1, 4,4,6
  904. mova m0, [r0+r3*0-16] ; 16 bytes left of row 0; top lane is the adjacent pixel
  905. punpckhwd m0, [r0+r3*1-16]
  906. shr r1d, 14
  907. dec r1
  908. and r1, r3
  909. sub r1, r3 ; r1 = 0 when has_topleft was 0, else (stride & 1) - stride
  910. mova m4, [r0+r1*1-16] ; left data of the row selected by r1 (row above, or row 0 again)
  911. lea r1, [r3*3] ; r1 = 3*stride
  912. lea r2, [r0+r3*4] ; r2 -> row 4
  913. mova m1, [r0+r3*2-16]
  914. punpckhwd m1, [r0+r1*1-16]
  915. punpckhdq m0, m1
  916. mova m2, [r2+r3*0-16]
  917. punpckhwd m2, [r2+r3*1-16]
  918. mova m3, [r2+r3*2-16]
  919. punpckhwd m3, [r2+r1*1-16]
  920. punpckhdq m2, m3
  921. punpckhqdq m0, m2 ; m0 = left neighbours of rows 0..7, row 0 in lane 0
  922. PALIGNR m1, m0, m4, 14, m4 ; neighbours one row up (top lane of m4 enters lane 0)
  923. psrldq m2, m0, 2 ; neighbours one row down
  924. pshufhw m2, m2, 10100100b ; refill the vacated top lane with a copy of the last pixel
  925. PRED4x4_LOWPASS m0, m1, m2, m0 ; m0 = filtered left column
  926. psrldq m1, m0, 2
  927. psrldq m2, m0, 4
  928. pshufhw m1, m1, 10100100b ; repeat the last filtered value in the vacated lane
  929. pshufhw m2, m2, 01010100b ; repeat it in both vacated lanes
  930. pavgw m4, m0, m1 ; 2-tap rounded averages
  931. PRED4x4_LOWPASS m1, m2, m0, m1 ; 3-tap filtered values
  932. punpckhwd m5, m4, m1 ; interleaved sequence, upper half
  933. punpcklwd m4, m1 ; interleaved sequence, lower half
  934. mova [r2+r3*0], m5 ; row 4
  935. mova [r0+r3*0], m4 ; row 0
  936. pshufd m0, m5, 11111001b ; advance by one dword, repeating the last dword
  937. pshufd m1, m5, 11111110b ; advance by two dwords
  938. pshufd m2, m5, 11111111b ; last dword in every position
  939. mova [r2+r3*1], m0 ; row 5
  940. mova [r2+r3*2], m1 ; row 6
  941. mova [r2+r1*1], m2 ; row 7
  942. PALIGNR m2, m5, m4, 4, m0 ; m5:m4 advanced by 4, 8 and 12 bytes
  943. PALIGNR m3, m5, m4, 8, m1
  944. PALIGNR m5, m5, m4, 12, m4
  945. mova [r0+r3*1], m2 ; row 1
  946. mova [r0+r3*2], m3 ; row 2
  947. mova [r0+r1*1], m5 ; row 3
  948. RET
  949. %endmacro
  ; Instantiations: sse2 is built with PALIGNR_MMX, ssse3 with PALIGNR_SSSE3.
  ; The avx build (only when HAVE_AVX) inherits the last PALIGNR definition.
  950. INIT_XMM
  951. %define PALIGNR PALIGNR_MMX
  952. PRED8x8L_HORIZONTAL_UP sse2
  953. %define PALIGNR PALIGNR_SSSE3
  954. PRED8x8L_HORIZONTAL_UP ssse3
  955. %if HAVE_AVX
  956. INIT_AVX
  957. PRED8x8L_HORIZONTAL_UP avx
  958. %endif
  959. ;-----------------------------------------------------------------------------
  960. ; void pred16x16_vertical(pixel *src, int stride)
  961. ;-----------------------------------------------------------------------------
  ; Stores one 32-byte row (16 pixels of 2 bytes) at address expression %1.
  ;   %2, %3 go to [%1+0] and [%1+mmsize]
  ;   mmsize == 8 only: %4, %5 go to [%1+16] and [%1+24]; they are ignored otherwise
  962. %macro MOV16 3-5
  963. mova [%1+ 0], %2
  964. mova [%1+mmsize], %3
  965. %if mmsize==8
  966. mova [%1+ 16], %4
  967. mova [%1+ 24], %5
  968. %endif
  969. %endmacro
  ; Emits pred16x16_vertical_10_<%1>: copies the 32 bytes of the row above into all 16 rows.
  ; r0 = src, r1 = stride in bytes (per the prototype above; mapping to r0.. assumed from x86inc);
  ; r2d = loop counter (8 iterations, two rows each).
  970. %macro PRED16x16_VERTICAL 1
  971. cglobal pred16x16_vertical_10_%1, 2,3
  972. sub r0, r1 ; r0 -> row above the block
  973. mov r2d, 8
  974. mova m0, [r0+ 0] ; top row, first register
  975. mova m1, [r0+mmsize] ; top row, second register
  976. %if mmsize==8
  977. mova m2, [r0+16] ; MMX needs four 8-byte registers for 32 bytes
  978. mova m3, [r0+24]
  979. %endif
  980. .loop:
  981. MOV16 r0+r1*1, m0, m1, m2, m3
  982. MOV16 r0+r1*2, m0, m1, m2, m3
  983. lea r0, [r0+r1*2] ; advance two rows
  984. dec r2d
  985. jg .loop
  986. REP_RET
  987. %endmacro
  ; Instantiations: MMX and SSE2 register widths.
  988. INIT_MMX
  989. PRED16x16_VERTICAL mmxext
  990. INIT_XMM
  991. PRED16x16_VERTICAL sse2
  992. ;-----------------------------------------------------------------------------
  993. ; void pred16x16_horizontal(pixel *src, int stride)
  994. ;-----------------------------------------------------------------------------
  ; Emits pred16x16_horizontal_10_<%1>: fills each of the 16 rows with the pixel immediately to its left.
  ; r0 = src, r1 = stride in bytes (per the prototype above; mapping to r0.. assumed from x86inc);
  ; r2d = loop counter (8 iterations, two rows each).
  995. %macro PRED16x16_HORIZONTAL 1
  996. cglobal pred16x16_horizontal_10_%1, 2,3
  997. mov r2d, 8
  998. .vloop:
  999. movd m0, [r0+r1*0-4] ; two pixels left of this row; lane 1 is the adjacent one
  1000. movd m1, [r0+r1*1-4] ; same for the next row
  1001. SPLATW m0, m0, 1 ; broadcast lane 1
  1002. SPLATW m1, m1, 1
  1003. MOV16 r0+r1*0, m0, m0, m0, m0
  1004. MOV16 r0+r1*1, m1, m1, m1, m1
  1005. lea r0, [r0+r1*2] ; advance two rows
  1006. dec r2d
  1007. jg .vloop
  1008. REP_RET
  1009. %endmacro
  ; Instantiations: MMX and SSE2 register widths.
  1010. INIT_MMX
  1011. PRED16x16_HORIZONTAL mmxext
  1012. INIT_XMM
  1013. PRED16x16_HORIZONTAL sse2
  1014. ;-----------------------------------------------------------------------------
  1015. ; void pred16x16_dc(pixel *src, int stride)
  1016. ;-----------------------------------------------------------------------------
  ; Emits pred16x16_dc_10_<%1>: fills the block with
  ; (sum of 16 top pixels + sum of 16 left pixels + 16) >> 5.
  ; r0 = src, r1 = stride in bytes (per the prototype above; mapping to r0.. assumed from x86inc);
  ; r5 = saved copy of src for the store loop, r2-r4 = scalar scratch.
  ; The top row is summed with SIMD, the left column with scalar loads.
  1017. %macro PRED16x16_DC 1
  1018. cglobal pred16x16_dc_10_%1, 2,6
  1019. mov r5, r0 ; keep the block start for the store loop
  1020. sub r0, r1 ; r0 -> row above the block
  1021. mova m0, [r0+0]
  1022. paddw m0, [r0+mmsize]
  1023. %if mmsize==8
  1024. paddw m0, [r0+16]
  1025. paddw m0, [r0+24]
  1026. %endif
  1027. HADDW m0, m2 ; low dword = sum of the 16 top pixels
  1028. lea r0, [r0+r1-2] ; r0 -> left neighbour of row 0
  1029. movzx r3d, word [r0] ; r3d accumulates even rows
  1030. movzx r4d, word [r0+r1] ; r4d accumulates odd rows
  1031. %rep 7
  1032. lea r0, [r0+r1*2]
  1033. movzx r2d, word [r0]
  1034. add r3d, r2d
  1035. movzx r2d, word [r0+r1]
  1036. add r4d, r2d
  1037. %endrep
  1038. lea r3d, [r3+r4+16] ; left sum + rounding term
  1039. movd m1, r3d
  1040. paddw m0, m1 ; + top sum
  1041. psrlw m0, 5
  1042. SPLATW m0, m0
  1043. mov r3d, 8 ; 8 iterations, two rows each
  1044. .loop:
  1045. MOV16 r5+r1*0, m0, m0, m0, m0
  1046. MOV16 r5+r1*1, m0, m0, m0, m0
  1047. lea r5, [r5+r1*2]
  1048. dec r3d
  1049. jg .loop
  1050. REP_RET
  1051. %endmacro
  ; Instantiations: MMX and SSE2 register widths.
  1052. INIT_MMX
  1053. PRED16x16_DC mmxext
  1054. INIT_XMM
  1055. PRED16x16_DC sse2
  1056. ;-----------------------------------------------------------------------------
  1057. ; void pred16x16_top_dc(pixel *src, int stride)
  1058. ;-----------------------------------------------------------------------------
  ; Emits pred16x16_top_dc_10_<%1>: fills the block with (sum of the 16 top pixels + 8) >> 4
  ; (the +8 assumes pw_8 holds words of value 8 -- defined outside this file).
  ; r0 = src, r1 = stride in bytes (per the prototype above; mapping to r0.. assumed from x86inc);
  ; r2d = loop counter (8 iterations, two rows each).
  1059. %macro PRED16x16_TOP_DC 1
  1060. cglobal pred16x16_top_dc_10_%1, 2,3
  1061. sub r0, r1 ; r0 -> row above the block
  1062. mova m0, [r0+0]
  1063. paddw m0, [r0+mmsize]
  1064. %if mmsize==8
  1065. paddw m0, [r0+16]
  1066. paddw m0, [r0+24]
  1067. %endif
  1068. HADDW m0, m2 ; low dword = sum of the 16 top pixels
  1069. SPLATW m0, m0 ; broadcast the sum before rounding
  1070. paddw m0, [pw_8]
  1071. psrlw m0, 4
  1072. mov r2d, 8
  1073. .loop:
  1074. MOV16 r0+r1*1, m0, m0, m0, m0
  1075. MOV16 r0+r1*2, m0, m0, m0, m0
  1076. lea r0, [r0+r1*2] ; advance two rows
  1077. dec r2d
  1078. jg .loop
  1079. REP_RET
  1080. %endmacro
  ; Instantiations: MMX and SSE2 register widths.
  1081. INIT_MMX
  1082. PRED16x16_TOP_DC mmxext
  1083. INIT_XMM
  1084. PRED16x16_TOP_DC sse2
  1085. ;-----------------------------------------------------------------------------
  1086. ; void pred16x16_left_dc(pixel *src, int stride)
  1087. ;-----------------------------------------------------------------------------
  ; Emits pred16x16_left_dc_10_<%1>: fills the block with (sum of the 16 left pixels + 8) >> 4.
  ; r0 = src, r1 = stride in bytes (per the prototype above; mapping to r0.. assumed from x86inc);
  ; r5 = saved copy of src for the store loop, r2-r4 = scalar scratch.
  1088. %macro PRED16x16_LEFT_DC 1
  1089. cglobal pred16x16_left_dc_10_%1, 2,6
  1090. mov r5, r0 ; keep the block start for the store loop
  1091. sub r0, 2 ; r0 -> left neighbour of row 0
  1092. movzx r3d, word [r0] ; r3d accumulates even rows
  1093. movzx r4d, word [r0+r1] ; r4d accumulates odd rows
  1094. %rep 7
  1095. lea r0, [r0+r1*2]
  1096. movzx r2d, word [r0]
  1097. add r3d, r2d
  1098. movzx r2d, word [r0+r1]
  1099. add r4d, r2d
  1100. %endrep
  1101. lea r3d, [r3+r4+8] ; total + rounding term
  1102. shr r3d, 4
  1103. movd m0, r3d
  1104. SPLATW m0, m0
  1105. mov r3d, 8 ; 8 iterations, two rows each
  1106. .loop:
  1107. MOV16 r5+r1*0, m0, m0, m0, m0
  1108. MOV16 r5+r1*1, m0, m0, m0, m0
  1109. lea r5, [r5+r1*2]
  1110. dec r3d
  1111. jg .loop
  1112. REP_RET
  1113. %endmacro
  ; Instantiations: MMX and SSE2 register widths.
  1114. INIT_MMX
  1115. PRED16x16_LEFT_DC mmxext
  1116. INIT_XMM
  1117. PRED16x16_LEFT_DC sse2
  1118. ;-----------------------------------------------------------------------------
  1119. ; void pred16x16_128_dc(pixel *src, int stride)
  1120. ;-----------------------------------------------------------------------------
  ; Emits pred16x16_128_dc_10_<%1>: fills all 16 rows of the block with pw_512.
  ; r0 = src, r1 = stride in bytes (per the prototype above; mapping to r0.. assumed from x86inc);
  ; r2d = loop counter (8 iterations, two rows each).
  1121. %macro PRED16x16_128_DC 1
  1122. cglobal pred16x16_128_dc_10_%1, 2,3
  1123. mova m0, [pw_512] ; fill value, 512 in every lane
  1124. mov r2d, 8
  1125. .loop:
  1126. MOV16 r0+r1*0, m0, m0, m0, m0
  1127. MOV16 r0+r1*1, m0, m0, m0, m0
  1128. lea r0, [r0+r1*2] ; advance two rows
  1129. dec r2d
  1130. jg .loop
  1131. REP_RET
  1132. %endmacro
  ; Instantiations: MMX and SSE2 register widths.
  1133. INIT_MMX
  1134. PRED16x16_128_DC mmxext
  1135. INIT_XMM
  1136. PRED16x16_128_DC sse2