You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

1182 lines
31KB

  1. ;*****************************************************************************
  2. ;* MMX/SSE2/AVX-optimized 10-bit H.264 intra prediction code
  3. ;*****************************************************************************
  4. ;* Copyright (C) 2005-2011 x264 project
  5. ;*
  6. ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
  7. ;*
  8. ;* This file is part of Libav.
  9. ;*
  10. ;* Libav is free software; you can redistribute it and/or
  11. ;* modify it under the terms of the GNU Lesser General Public
  12. ;* License as published by the Free Software Foundation; either
  13. ;* version 2.1 of the License, or (at your option) any later version.
  14. ;*
  15. ;* Libav is distributed in the hope that it will be useful,
  16. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  17. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  18. ;* Lesser General Public License for more details.
  19. ;*
  20. ;* You should have received a copy of the GNU Lesser General Public
  21. ;* License along with Libav; if not, write to the Free Software
  22. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23. ;******************************************************************************
  24. %include "libavutil/x86/x86util.asm"
  ; Read-only constants. The cextern symbols are defined in another object file.
  25. SECTION_RODATA
  26. cextern pw_16
  27. cextern pw_8
  28. cextern pw_4
  29. cextern pw_2
  30. cextern pw_1
  31. pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4 ; per-column weights used by pred8x8_plane
  32. pw_m3: times 8 dw -3 ; starting row weight for the plane "c" term
  33. pw_pixel_max: times 8 dw ((1 << 10)-1) ; largest 10-bit sample (1023), upper clip bound
  34. pw_512: times 8 dw 512 ; fill value of the 128_dc predictors, 1 << (10-1)
  35. pd_17: times 4 dd 17 ; multiplier in the plane gradient scaling
  36. pd_16: times 4 dd 16 ; rounding term added before the >> 5 in the plane gradient scaling
  37. SECTION .text
  38. ; dest, left, right, src
  39. ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
  ; Done in two steps: halve (left + right), then take the rounding average with src.
  ; Clobbers the "left" operand. The paddw must not wrap, so the inputs need
  ; headroom in 16 bits (true for 10-bit samples, see pw_pixel_max).
  40. %macro PRED4x4_LOWPASS 4
  41. paddw %2, %3 ; left += right
  42. psrlw %2, 1 ; left = (left + right) >> 1
  43. pavgw %1, %4, %2 ; dest = (src + left + 1) >> 1; 3-operand form, presumably emulated by x86inc without AVX
  44. %endmacro
  45. ;-----------------------------------------------------------------------------
  46. ; void ff_pred4x4_down_right(pixel *src, const pixel *topright, int stride)
  47. ;-----------------------------------------------------------------------------
  ; Down-right 4x4 predictor, instantiated below for sse2, ssse3 and avx.
  ; Gathers the left column, the top-left pixel and the top row into one register,
  ; low-pass filters it, then stores four rows, each shifted by one pixel.
  ; NOTE(review): the prototype says `int stride`, yet r2 is used at full register
  ; width in addressing - confirm callers pass a sign-extended value on 64-bit targets.
  48. %macro PRED4x4_DR 0
  49. cglobal pred4x4_down_right_10, 3, 3
  50. sub r0, r2 ; r0 = src - stride (row above the block)
  51. lea r1, [r0+r2*2] ; r1 = src + stride (block row 1)
  52. movhps m1, [r1-8] ; 4 pixels left of row 1 -> high half (l1 on top)
  53. movhps m2, [r0+r2*1-8] ; 4 pixels left of row 0 (l0 on top)
  54. movhps m4, [r0-8] ; 4 pixels left of the top row (lt on top)
  55. punpckhwd m2, m4 ; top two words: l0, lt
  56. movq m3, [r0] ; t0..t3 in the low half
  57. punpckhdq m1, m2 ; top three words: l1, l0, lt
  58. PALIGNR m3, m1, 10, m1 ; .. t3 t2 t1 t0 lt l0 l1 (high..low)
  59. movhps m4, [r1+r2*1-8] ; 4 pixels left of row 2 (l2 on top)
  60. PALIGNR m0, m3, m4, 14, m4 ; t3 t2 t1 t0 lt l0 l1 l2
  61. movhps m4, [r1+r2*2-8] ; 4 pixels left of row 3 (l3 on top)
  62. PALIGNR m2, m0, m4, 14, m4 ; t2 t1 t0 lt l0 l1 l2 l3
  63. PRED4x4_LOWPASS m0, m2, m3, m0 ; filter with m0 as centre tap; m2 is clobbered
  64. movq [r1+r2*2], m0 ; row 3
  65. psrldq m0, 2 ; advance one pixel along the diagonal
  66. movq [r1+r2*1], m0 ; row 2
  67. psrldq m0, 2
  68. movq [r0+r2*2], m0 ; row 1
  69. psrldq m0, 2
  70. movq [r0+r2*1], m0 ; row 0
  71. RET
  72. %endmacro
  73. INIT_XMM sse2
  74. PRED4x4_DR
  75. INIT_XMM ssse3
  76. PRED4x4_DR
  77. INIT_XMM avx
  78. PRED4x4_DR
  79. ;------------------------------------------------------------------------------
  80. ; void ff_pred4x4_vertical_right(pixel *src, const pixel *topright, int stride)
  81. ;------------------------------------------------------------------------------
  ; Vertical-right 4x4 predictor, instantiated below for sse2, ssse3 and avx.
  ; Row 0 is the two-tap rounding average of the top row and the top row shifted by
  ; one pixel; row 1 is the three-tap low-pass. Rows 2 and 3 reuse rows 0 and 1 with
  ; one more low-pass output shifted in at the left.
  82. %macro PRED4x4_VR 0
  83. cglobal pred4x4_vertical_right_10, 3, 3, 6
  84. sub r0, r2 ; r0 = src - stride (row above the block)
  85. lea r1, [r0+r2*2] ; r1 = src + stride (block row 1)
  86. movq m5, [r0] ; ........t3t2t1t0
  87. movhps m1, [r0-8]
  88. PALIGNR m0, m5, m1, 14, m1 ; ......t3t2t1t0lt
  89. pavgw m5, m0 ; two-tap averages used for rows 0 and 2
  90. movhps m1, [r0+r2*1-8]
  91. PALIGNR m0, m1, 14, m1 ; ....t3t2t1t0ltl0
  92. movhps m2, [r0+r2*2-8]
  93. PALIGNR m1, m0, m2, 14, m2 ; ..t3t2t1t0ltl0l1
  94. movhps m3, [r1+r2*1-8]
  95. PALIGNR m2, m1, m3, 14, m3 ; t3t2t1t0ltl0l1l2
  96. PRED4x4_LOWPASS m1, m0, m2, m1 ; three-tap filter; m0 is clobbered
  97. pslldq m0, m1, 12 ; keep the two lowest filter outputs in the top words
  98. psrldq m1, 4 ; drop those two outputs from the row-1 vector
  99. movq [r0+r2*1], m5 ; row 0
  100. movq [r0+r2*2], m1 ; row 1
  101. PALIGNR m5, m0, 14, m2 ; shift one filtered left pixel into row 0's data
  102. pslldq m0, 2
  103. movq [r1+r2*1], m5 ; row 2
  104. PALIGNR m1, m0, 14, m0 ; same for row 1's data
  105. movq [r1+r2*2], m1 ; row 3
  106. RET
  107. %endmacro
  108. INIT_XMM sse2
  109. PRED4x4_VR
  110. INIT_XMM ssse3
  111. PRED4x4_VR
  112. INIT_XMM avx
  113. PRED4x4_VR
  114. ;-------------------------------------------------------------------------------
  115. ; void ff_pred4x4_horizontal_down(pixel *src, const pixel *topright, int stride)
  116. ;-------------------------------------------------------------------------------
  ; Horizontal-down 4x4 predictor, instantiated below for sse2, ssse3 and avx.
  ; Builds "t2 t1 t0 lt l0 l1 l2 l3", computes both the two-tap average and the
  ; three-tap low-pass of it, interleaves the two and stores four rows from that.
  117. %macro PRED4x4_HD 0
  118. cglobal pred4x4_horizontal_down_10, 3, 3
  119. sub r0, r2 ; r0 = src - stride (row above the block)
  120. lea r1, [r0+r2*2] ; r1 = src + stride (block row 1)
  121. movq m0, [r0-8] ; lt ..
  122. movhps m0, [r0]
  123. pslldq m0, 2 ; t2 t1 t0 lt .. .. .. ..
  124. movq m1, [r1+r2*2-8] ; l3
  125. movq m3, [r1+r2*1-8]
  126. punpcklwd m1, m3 ; l2 l3
  127. movq m2, [r0+r2*2-8] ; l1
  128. movq m3, [r0+r2*1-8]
  129. punpcklwd m2, m3 ; l0 l1
  130. punpckhdq m1, m2 ; l0 l1 l2 l3
  131. punpckhqdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
  132. psrldq m0, m1, 4 ; .. .. t2 t1 t0 lt l0 l1
  133. psrldq m3, m1, 2 ; .. t2 t1 t0 lt l0 l1 l2
  134. pavgw m5, m1, m3 ; two-tap averages
  135. PRED4x4_LOWPASS m3, m1, m0, m3 ; three-tap filter; m1 is clobbered
  136. punpcklwd m5, m3 ; interleave averages with filter outputs
  137. psrldq m3, 8 ; bring the upper filter outputs down
  138. PALIGNR m3, m5, 12, m4
  139. movq [r1+r2*2], m5 ; row 3
  140. movhps [r0+r2*2], m5 ; row 1
  141. psrldq m5, 4
  142. movq [r1+r2*1], m5 ; row 2
  143. movq [r0+r2*1], m3 ; row 0
  144. RET
  145. %endmacro
  146. INIT_XMM sse2
  147. PRED4x4_HD
  148. INIT_XMM ssse3
  149. PRED4x4_HD
  150. INIT_XMM avx
  151. PRED4x4_HD
  152. ;-----------------------------------------------------------------------------
  153. ; void ff_pred4x4_dc(pixel *src, const pixel *topright, int stride)
  154. ;-----------------------------------------------------------------------------
  ; Horizontal add of the dwords in the first operand. The total ends up in the lowest
  ; dword of that operand; its other lanes hold partial sums. The second operand is
  ; scratch and is overwritten.
  155. %macro HADDD 2 ; sum junk
  156. %if mmsize == 16
  157. movhlps %2, %1 ; scratch.low64 = sum.high64
  158. paddd %1, %2 ; fold the upper two dwords onto the lower two
  159. pshuflw %2, %1, 0xE ; scratch.dword0 = sum.dword1
  160. paddd %1, %2 ; dword0 now holds the full total
  161. %else
  162. pshufw %2, %1, 0xE ; scratch.dword0 = sum.dword1
  163. paddd %1, %2
  164. %endif
  165. %endmacro
  ; Horizontal add of the words in the first operand; the total lands in its lowest
  ; dword. The second operand is scratch. pmaddwd multiplies as signed words, so the
  ; inputs are treated as signed 16-bit values.
  166. %macro HADDW 2
  167. pmaddwd %1, [pw_1] ; pairwise word sums as dwords (pw_1 is external, presumably all words = 1)
  168. HADDD %1, %2
  169. %endmacro
  ; 4x4 DC predictor (MMXEXT): fills the block with (sum of 4 top + 4 left pixels + 4) >> 3.
  ; The topright argument (r1) is not read; r1 is reused as a row pointer.
  170. INIT_MMX mmxext
  171. cglobal pred4x4_dc_10, 3, 3
  172. sub r0, r2 ; r0 = src - stride (row above the block)
  173. lea r1, [r0+r2*2] ; r1 = src + stride (block row 1)
  174. movq m2, [r0+r2*1-8] ; 4 pixels left of row 0; only the top word (the left neighbour) is wanted
  175. paddw m2, [r0+r2*2-8] ; + row 1
  176. paddw m2, [r1+r2*1-8] ; + row 2
  177. paddw m2, [r1+r2*2-8] ; + row 3
  178. psrlq m2, 48 ; keep only the top word: sum of the four left neighbours
  179. movq m0, [r0] ; t0..t3
  180. HADDW m0, m1 ; low dword = t0+t1+t2+t3
  181. paddw m0, [pw_4] ; rounding term (pw_4 is external, presumably all words = 4)
  182. paddw m0, m2 ; + left sum
  183. psrlw m0, 3 ; divide by 8
  184. SPLATW m0, m0, 0 ; broadcast word 0 to all lanes
  185. movq [r0+r2*1], m0 ; row 0
  186. movq [r0+r2*2], m0 ; row 1
  187. movq [r1+r2*1], m0 ; row 2
  188. movq [r1+r2*2], m0 ; row 3
  189. RET
  190. ;-----------------------------------------------------------------------------
  191. ; void ff_pred4x4_down_left(pixel *src, const pixel *topright, int stride)
  192. ;-----------------------------------------------------------------------------
  ; Down-left 4x4 predictor, instantiated below for sse2 and avx.
  ; Low-pass filters the 8 pixels "top row + topright" and stores four rows, row y
  ; starting at filter output y+1.
  193. %macro PRED4x4_DL 0
  194. cglobal pred4x4_down_left_10, 3, 3
  195. sub r0, r2 ; r0 = src - stride (row above the block)
  196. movq m0, [r0] ; t0..t3 in the low half
  197. movhps m0, [r1] ; 4 pixels from the topright pointer in the high half
  198. psrldq m2, m0, 2 ; neighbours one pixel to the right
  199. pslldq m3, m0, 2 ; neighbours one pixel to the left
  200. pshufhw m2, m2, 10100100b ; duplicate the last pixel into the word vacated by the shift
  201. PRED4x4_LOWPASS m0, m3, m2, m0 ; three-tap filter; m3 is clobbered
  202. lea r1, [r0+r2*2] ; topright pointer no longer needed; r1 = src + stride
  203. movhps [r1+r2*2], m0 ; row 3 = outputs 4..7
  204. psrldq m0, 2
  205. movq [r0+r2*1], m0 ; row 0 = outputs 1..4
  206. psrldq m0, 2
  207. movq [r0+r2*2], m0 ; row 1 = outputs 2..5
  208. psrldq m0, 2
  209. movq [r1+r2*1], m0 ; row 2 = outputs 3..6
  210. RET
  211. %endmacro
  212. INIT_XMM sse2
  213. PRED4x4_DL
  214. INIT_XMM avx
  215. PRED4x4_DL
  216. ;-----------------------------------------------------------------------------
  217. ; void ff_pred4x4_vertical_left(pixel *src, const pixel *topright, int stride)
  218. ;-----------------------------------------------------------------------------
  ; Vertical-left 4x4 predictor, instantiated below for sse2 and avx.
  ; Rows 0 and 2 use the two-tap average of adjacent top pixels, rows 1 and 3 the
  ; three-tap low-pass; rows 2 and 3 are rows 0 and 1 shifted by one pixel.
  219. %macro PRED4x4_VL 0
  220. cglobal pred4x4_vertical_left_10, 3, 3
  221. sub r0, r2 ; r0 = src - stride (row above the block)
  222. movu m1, [r0] ; unaligned 16-byte load; the high half is replaced on the next line
  223. movhps m1, [r1] ; high half = 4 pixels from the topright pointer
  224. psrldq m0, m1, 2 ; pixels shifted by one
  225. psrldq m2, m1, 4 ; pixels shifted by two
  226. pavgw m4, m0, m1 ; two-tap averages
  227. PRED4x4_LOWPASS m0, m1, m2, m0 ; three-tap filter; m1 is clobbered
  228. lea r1, [r0+r2*2] ; r1 = src + stride
  229. movq [r0+r2*1], m4 ; row 0
  230. movq [r0+r2*2], m0 ; row 1
  231. psrldq m4, 2
  232. psrldq m0, 2
  233. movq [r1+r2*1], m4 ; row 2
  234. movq [r1+r2*2], m0 ; row 3
  235. RET
  236. %endmacro
  237. INIT_XMM sse2
  238. PRED4x4_VL
  239. INIT_XMM avx
  240. PRED4x4_VL
  241. ;-----------------------------------------------------------------------------
  242. ; void ff_pred4x4_horizontal_up(pixel *src, const pixel *topright, int stride)
  243. ;-----------------------------------------------------------------------------
  ; Horizontal-up 4x4 predictor (MMXEXT). Uses only the four left neighbours l0..l3;
  ; the topright argument (r1) is not read and r1 is reused as a row pointer.
  244. INIT_MMX mmxext
  245. cglobal pred4x4_horizontal_up_10, 3, 3
  246. sub r0, r2 ; r0 = src - stride (row above the block)
  247. lea r1, [r0+r2*2] ; r1 = src + stride (block row 1)
  248. movq m0, [r0+r2*1-8] ; 4 pixels left of row 0 (l0 in the top word)
  249. punpckhwd m0, [r0+r2*2-8] ; top two words: l0, l1
  250. movq m1, [r1+r2*1-8] ; 4 pixels left of row 2 (l2 in the top word)
  251. punpckhwd m1, [r1+r2*2-8] ; top two words: l2, l3
  252. punpckhdq m0, m1 ; m0 = l0 l1 l2 l3 (low..high)
  253. pshufw m1, m1, 0xFF ; broadcast l3
  254. movq [r1+r2*2], m1 ; row 3 is all l3
  255. movd [r1+r2*1+4], m1 ; right half of row 2 is l3 as well
  256. pshufw m2, m0, 11111001b ; l1 l2 l3 l3
  257. movq m1, m2
  258. pavgw m2, m0 ; two-tap averages of adjacent left pixels
  259. pshufw m5, m0, 11111110b ; l2 l3 l3 l3
  260. PRED4x4_LOWPASS m1, m0, m5, m1 ; three-tap filter; m0 is clobbered
  261. movq m6, m2
  262. punpcklwd m6, m1 ; interleave averages and filter outputs
  263. movq [r0+r2*1], m6 ; row 0
  264. psrlq m2, 16
  265. psrlq m1, 16
  266. punpcklwd m2, m1
  267. movq [r0+r2*2], m2 ; row 1
  268. psrlq m2, 32
  269. movd [r1+r2*1], m2 ; left half of row 2
  270. RET
  271. ;-----------------------------------------------------------------------------
  272. ; void ff_pred8x8_vertical(pixel *src, int stride)
  273. ;-----------------------------------------------------------------------------
  ; 8x8 vertical predictor (SSE2): copies the 16-byte row above the block into all
  ; eight rows. Uses mova, so the rows are presumably 16-byte aligned - confirm against callers.
  274. INIT_XMM sse2
  275. cglobal pred8x8_vertical_10, 2, 2
  276. sub r0, r1 ; r0 = src - stride (row above the block)
  277. mova m0, [r0] ; the 8 top pixels
  278. %rep 3
  279. mova [r0+r1*1], m0
  280. mova [r0+r1*2], m0
  281. lea r0, [r0+r1*2] ; advance two rows
  282. %endrep
  283. mova [r0+r1*1], m0 ; rows 6 and 7
  284. mova [r0+r1*2], m0
  285. RET
  286. ;-----------------------------------------------------------------------------
  287. ; void ff_pred8x8_horizontal(pixel *src, int stride)
  288. ;-----------------------------------------------------------------------------
  ; 8x8 horizontal predictor (SSE2): each row is filled with the pixel immediately to
  ; its left. Two rows per iteration, four iterations.
  289. INIT_XMM sse2
  290. cglobal pred8x8_horizontal_10, 2, 3
  291. mov r2d, 4 ; iteration counter (2 rows each)
  292. .loop:
  293. movq m0, [r0+r1*0-8] ; 4 pixels left of this row
  294. movq m1, [r0+r1*1-8] ; 4 pixels left of the next row
  295. pshuflw m0, m0, 0xff ; broadcast word 3 (the left neighbour) over the low half
  296. pshuflw m1, m1, 0xff
  297. punpcklqdq m0, m0 ; copy the low half into the high half
  298. punpcklqdq m1, m1
  299. mova [r0+r1*0], m0
  300. mova [r0+r1*1], m1
  301. lea r0, [r0+r1*2] ; advance two rows
  302. dec r2d
  303. jg .loop
  304. REP_RET
  305. ;-----------------------------------------------------------------------------
  306. ; void ff_pred8x8_dc(pixel *src, int stride)
  307. ;-----------------------------------------------------------------------------
  ; Stores one 8-pixel (16-byte) row at address expression %1.
  ; With 8-byte registers it writes %2 to the left half and %3 to the right half;
  ; with 16-byte registers it writes %2 only and %3 is ignored.
  308. %macro MOV8 2-3
  309. ; sort of a hack, but it works
  310. %if mmsize==8
  311. movq [%1+0], %2
  312. movq [%1+8], %3
  313. %else
  314. movdqa [%1], %2
  315. %endif
  316. %endmacro
  ; 8x8 DC predictor, instantiated below for mmxext and sse2.
  ; %1 is the word-shuffle instruction for the current register size (pshufw / pshuflw).
  ; Partial sums: s0 = top pixels 0..3, s1 = top pixels 4..7, s2 = left pixels of
  ; rows 0..3, s3 = left pixels of rows 4..7. The four 4x4 quadrants get
  ; (s0+s2+4)>>3, (s1+2)>>2, (s3+2)>>2 and (s1+s3+4)>>3 respectively.
  317. %macro PRED8x8_DC 1
  318. cglobal pred8x8_dc_10, 2, 6
  319. sub r0, r1 ; r0 = src - stride (row above the block)
  320. pxor m4, m4 ; zero, used for the rounding pavgw below
  321. movq m0, [r0+0] ; top pixels 0..3
  322. movq m1, [r0+8] ; top pixels 4..7
  323. %if mmsize==16
  324. punpcklwd m0, m1
  325. movhlps m1, m0
  326. paddw m0, m1
  327. %else
  328. pshufw m2, m0, 00001110b
  329. pshufw m3, m1, 00001110b
  330. paddw m0, m2
  331. paddw m1, m3
  332. punpcklwd m0, m1
  333. %endif
  334. %1 m2, m0, 00001110b
  335. paddw m0, m2 ; word 0 = s0, word 1 = s1
  336. lea r5, [r1*3] ; r5 = 3*stride
  337. lea r4, [r0+r1*4] ; r4 = src + 3*stride (block row 3)
  338. movzx r2d, word [r0+r1*1-2] ; left neighbour of row 0
  339. movzx r3d, word [r0+r1*2-2] ; row 1
  340. add r2d, r3d
  341. movzx r3d, word [r0+r5*1-2] ; row 2
  342. add r2d, r3d
  343. movzx r3d, word [r4-2] ; row 3
  344. add r2d, r3d
  345. movd m2, r2d ; s2
  346. movzx r2d, word [r4+r1*1-2] ; left neighbour of row 4
  347. movzx r3d, word [r4+r1*2-2] ; row 5
  348. add r2d, r3d
  349. movzx r3d, word [r4+r5*1-2] ; row 6
  350. add r2d, r3d
  351. movzx r3d, word [r4+r1*4-2] ; row 7
  352. add r2d, r3d
  353. movd m3, r2d ; s3
  354. punpcklwd m2, m3
  355. punpckldq m0, m2 ; s0, s1, s2, s3
  356. %1 m3, m0, 11110110b ; s2, s1, s3, s3
  357. %1 m0, m0, 01110100b ; s0, s1, s3, s1
  358. paddw m0, m3
  359. psrlw m0, 2
  360. pavgw m0, m4 ; s0+s2, s1, s3, s1+s3
  361. %if mmsize==16
  362. punpcklwd m0, m0
  363. pshufd m3, m0, 11111010b ; bottom row pattern: quadrant 2 value x4, quadrant 3 value x4
  364. punpckldq m0, m0 ; top row pattern: quadrant 0 value x4, quadrant 1 value x4
  365. SWAP 0,1 ; rename so the top pattern is m1, as MOV8 below expects
  366. %else
  367. pshufw m1, m0, 0x00
  368. pshufw m2, m0, 0x55
  369. pshufw m3, m0, 0xaa
  370. pshufw m4, m0, 0xff
  371. %endif
  372. MOV8 r0+r1*1, m1, m2 ; rows 0..3
  373. MOV8 r0+r1*2, m1, m2
  374. MOV8 r0+r5*1, m1, m2
  375. MOV8 r0+r1*4, m1, m2
  376. MOV8 r4+r1*1, m3, m4 ; rows 4..7
  377. MOV8 r4+r1*2, m3, m4
  378. MOV8 r4+r5*1, m3, m4
  379. MOV8 r4+r1*4, m3, m4
  380. RET
  381. %endmacro
  382. INIT_MMX mmxext
  383. PRED8x8_DC pshufw
  384. INIT_XMM sse2
  385. PRED8x8_DC pshuflw
  386. ;-----------------------------------------------------------------------------
  387. ; void ff_pred8x8_top_dc(pixel *src, int stride)
  388. ;-----------------------------------------------------------------------------
  ; 8x8 top-DC predictor (SSE2): the left four columns get the rounded mean of top
  ; pixels 0..3, the right four columns the rounded mean of top pixels 4..7.
  389. INIT_XMM sse2
  390. cglobal pred8x8_top_dc_10, 2, 4
  391. sub r0, r1 ; r0 = src - stride (row above the block)
  392. mova m0, [r0] ; the 8 top pixels
  393. pshuflw m1, m0, 0x4e ; swap the two dwords inside each 64-bit half
  394. pshufhw m1, m1, 0x4e
  395. paddw m0, m1
  396. pshuflw m1, m0, 0xb1 ; swap adjacent words inside each half
  397. pshufhw m1, m1, 0xb1
  398. paddw m0, m1 ; every word now holds the sum of its own half
  399. lea r2, [r1*3] ; r2 = 3*stride
  400. lea r3, [r0+r1*4] ; r3 = src + 3*stride (block row 3)
  401. paddw m0, [pw_2] ; rounding term (pw_2 is external, presumably all words = 2)
  402. psrlw m0, 2 ; divide by 4
  403. mova [r0+r1*1], m0 ; rows 0..3
  404. mova [r0+r1*2], m0
  405. mova [r0+r2*1], m0
  406. mova [r0+r1*4], m0
  407. mova [r3+r1*1], m0 ; rows 4..7
  408. mova [r3+r1*2], m0
  409. mova [r3+r2*1], m0
  410. mova [r3+r1*4], m0
  411. RET
  412. ;-----------------------------------------------------------------------------
  413. ; void ff_pred8x8_plane(pixel *src, int stride)
  414. ;-----------------------------------------------------------------------------
  ; 8x8 plane predictor (SSE2).
  ; Computes a horizontal gradient H from the top row and a vertical gradient V from
  ; the left column, scales both with (17*x + 16) >> 5 to get b and c, then writes
  ; eight rows of clip((a + b*(x-3) + c*(y-3) + 16) >> 5, 0, pw_pixel_max), where
  ; a = 16*(src[7*stride-1] + src[-stride+7]).
  415. INIT_XMM sse2
  416. cglobal pred8x8_plane_10, 2, 7, 7
  417. sub r0, r1 ; r0 = src - stride (row above the block)
  418. lea r2, [r1*3] ; r2 = 3*stride
  419. lea r3, [r0+r1*4] ; r3 = src + 3*stride (block row 3)
  420. mova m2, [r0] ; the 8 top pixels
  421. pmaddwd m2, [pw_m32101234] ; weight them by -3..4
  422. HADDD m2, m1 ; low dword = weighted sum
  423. movd m0, [r0-4] ; dword whose upper word is the top-left pixel
  424. psrld m0, 14 ; = 4 * top-left, provided the lower word is below 1<<14
  425. psubw m2, m0 ; H
  426. movd m0, [r3+r1*4-4] ; dword ending at src[7*stride-1]
  427. movd m1, [r0+12] ; dword ending at src[-stride+7]
  428. paddw m0, m1
  429. psllw m0, 4 ; 16*(src[7*stride-1] + src[-stride+7])
  430. movzx r4d, word [r3+r1*1-2] ; src[4*stride-1]
  431. movzx r5d, word [r0+r2*1-2] ; src[2*stride-1]
  432. sub r4d, r5d
  433. movzx r6d, word [r3+r1*2-2] ; src[5*stride-1]
  434. movzx r5d, word [r0+r1*2-2] ; src[1*stride-1]
  435. sub r6d, r5d
  436. lea r4d, [r4+r6*2]
  437. movzx r5d, word [r3+r2*1-2] ; src[6*stride-1]
  438. movzx r6d, word [r0+r1*1-2] ; src[0*stride-1]
  439. sub r5d, r6d
  440. lea r5d, [r5*3]
  441. add r4d, r5d
  442. movzx r6d, word [r3+r1*4-2] ; src[7*stride-1]
  443. movzx r5d, word [r0+r1*0-2] ; src[ -stride-1]
  444. sub r6d, r5d
  445. lea r4d, [r4+r6*4]
  446. movd m3, r4d ; V
  447. punpckldq m2, m3 ; dword 0 = H, dword 1 = V
  448. pmaddwd m2, [pd_17]
  449. paddd m2, [pd_16]
  450. psrad m2, 5 ; b, c
  451. mova m3, [pw_pixel_max] ; upper clip bound
  452. pxor m1, m1 ; lower clip bound (zero)
  453. SPLATW m0, m0, 1 ; broadcast word 1, the 16*(...) term computed above
  454. SPLATW m4, m2, 2 ; broadcast c
  455. SPLATW m2, m2, 0 ; broadcast b
  456. pmullw m2, [pw_m32101234] ; b
  457. pmullw m5, m4, [pw_m3] ; c
  458. paddw m5, [pw_16] ; fold in the rounding term (pw_16 is external, presumably all words = 16)
  459. mov r2d, 8 ; row counter; r2 no longer needed as 3*stride
  460. add r0, r1 ; r0 = block row 0
  461. .loop:
  462. paddsw m6, m2, m5 ; per-column term + per-row term (saturating)
  463. paddsw m6, m0 ; + a
  464. psraw m6, 5
  465. CLIPW m6, m1, m3 ; clamp to [0, pw_pixel_max]
  466. mova [r0], m6
  467. paddw m5, m4 ; next row: add c
  468. add r0, r1
  469. dec r2d
  470. jg .loop
  471. REP_RET
  472. ;-----------------------------------------------------------------------------
  473. ; void ff_pred8x8l_128_dc(pixel *src, int has_topleft, int has_topright,
  474. ; int stride)
  475. ;-----------------------------------------------------------------------------
  ; 8x8l "128 DC" predictor, instantiated below for mmxext and sse2.
  ; Fills all eight rows with the constant pw_512. The has_topleft / has_topright
  ; arguments (r1, r2) are not read; both registers are reused as address scratch.
  476. %macro PRED8x8L_128_DC 0
  477. cglobal pred8x8l_128_dc_10, 4, 4
  478. mova m0, [pw_512] ; (1<<(BIT_DEPTH-1))
  479. lea r1, [r3*3] ; r1 = 3*stride
  480. lea r2, [r0+r3*4] ; r2 = block row 4
  481. MOV8 r0+r3*0, m0, m0 ; rows 0..3
  482. MOV8 r0+r3*1, m0, m0
  483. MOV8 r0+r3*2, m0, m0
  484. MOV8 r0+r1*1, m0, m0
  485. MOV8 r2+r3*0, m0, m0 ; rows 4..7
  486. MOV8 r2+r3*1, m0, m0
  487. MOV8 r2+r3*2, m0, m0
  488. MOV8 r2+r1*1, m0, m0
  489. RET
  490. %endmacro
  491. INIT_MMX mmxext
  492. PRED8x8L_128_DC
  493. INIT_XMM sse2
  494. PRED8x8L_128_DC
  495. ;-----------------------------------------------------------------------------
  496. ; void ff_pred8x8l_top_dc(pixel *src, int has_topleft, int has_topright,
  497. ; int stride)
  498. ;-----------------------------------------------------------------------------
  ; 8x8l top-DC predictor, instantiated below for sse2 and avx.
  ; Low-pass filters the top row (with edge substitution driven by the two flags),
  ; then fills the block with (sum of the 8 filtered pixels + 4) >> 3.
  ; NOTE(review): the shifts by 14 and 13 presumably turn the flags into byte offsets
  ; of 0 or 2 (flag values 0x8000 / 0x4000) - confirm against the caller.
  499. %macro PRED8x8L_TOP_DC 0
  500. cglobal pred8x8l_top_dc_10, 4, 4, 6
  501. sub r0, r3 ; r0 = src - stride (row above the block)
  502. mova m0, [r0] ; the 8 top pixels
  503. shr r1d, 14 ; has_topleft -> byte offset
  504. shr r2d, 13 ; has_topright -> byte offset
  505. neg r1 ; the top-left pixel lies before the row, so negate
  506. pslldq m1, m0, 2 ; left neighbours; word 0 is filled in below
  507. psrldq m2, m0, 2 ; right neighbours; word 7 is filled in below
  508. pinsrw m1, [r0+r1], 0 ; top-left pixel, or top[0] when the offset is 0
  509. pinsrw m2, [r0+r2+14], 7 ; first topright pixel, or top[7] when the offset is 0
  510. lea r1, [r3*3] ; r1 = 3*stride
  511. lea r2, [r0+r3*4] ; r2 = src + 3*stride (block row 3)
  512. PRED4x4_LOWPASS m0, m2, m1, m0 ; filtered top row; m2 is clobbered
  513. HADDW m0, m1 ; low dword = sum of the filtered pixels
  514. paddw m0, [pw_4] ; rounding term
  515. psrlw m0, 3 ; divide by 8
  516. SPLATW m0, m0, 0 ; broadcast word 0
  517. mova [r0+r3*1], m0 ; rows 0..3
  518. mova [r0+r3*2], m0
  519. mova [r0+r1*1], m0
  520. mova [r0+r3*4], m0
  521. mova [r2+r3*1], m0 ; rows 4..7
  522. mova [r2+r3*2], m0
  523. mova [r2+r1*1], m0
  524. mova [r2+r3*4], m0
  525. RET
  526. %endmacro
  527. INIT_XMM sse2
  528. PRED8x8L_TOP_DC
  529. INIT_XMM avx
  530. PRED8x8L_TOP_DC
  531. ;-------------------------------------------------------------------------------
  532. ; void ff_pred8x8l_dc(pixel *src, int has_topleft, int has_topright, int stride)
  533. ;-------------------------------------------------------------------------------
  534. ;TODO: see if scalar is faster
  ; 8x8l DC predictor, instantiated below for sse2 and avx.
  ; Gathers the 8 left neighbours into one register, low-pass filters both the left
  ; column and the top row, then fills the block with (sum of the 16 filtered
  ; pixels + 8) >> 4.
  ; NOTE(review): the flag shifts presumably produce byte offsets of 0 or 2, and the
  ; "not / and r3" trick presumably relies on an even stride - confirm against callers.
  535. %macro PRED8x8L_DC 0
  536. cglobal pred8x8l_dc_10, 4, 6, 6
  537. sub r0, r3 ; r0 = src - stride (row above the block)
  538. lea r4, [r0+r3*4] ; r4 = src + 3*stride (block row 3)
  539. lea r5, [r3*3] ; r5 = 3*stride
  540. mova m0, [r0+r3*2-16] ; 16 bytes left of row 1; top word is the left neighbour
  541. punpckhwd m0, [r0+r3*1-16] ; pair with row 0
  542. mova m1, [r4+r3*0-16] ; row 3
  543. punpckhwd m1, [r0+r5*1-16] ; pair with row 2
  544. punpckhdq m1, m0
  545. mova m2, [r4+r3*2-16] ; row 5
  546. punpckhwd m2, [r4+r3*1-16] ; pair with row 4
  547. mova m3, [r4+r3*4-16] ; row 7
  548. punpckhwd m3, [r4+r5*1-16] ; pair with row 6
  549. punpckhdq m3, m2
  550. punpckhqdq m3, m1 ; m3 = the 8 left neighbours, row 7 in word 0
  551. mova m0, [r0] ; the 8 top pixels
  552. shr r1d, 14 ; has_topleft -> byte offset
  553. shr r2d, 13 ; has_topright -> byte offset
  554. neg r1
  555. pslldq m1, m0, 2
  556. psrldq m2, m0, 2
  557. pinsrw m1, [r0+r1], 0 ; top-left pixel, or top[0] when the offset is 0
  558. pinsrw m2, [r0+r2+14], 7 ; first topright pixel, or top[7] when the offset is 0
  559. not r1
  560. and r1, r3 ; r1 = stride when the offset was 0, else (stride & 1)
  561. pslldq m4, m3, 2
  562. psrldq m5, m3, 2
  563. pshuflw m4, m4, 11100101b ; duplicate word 1 into word 0 (edge replication)
  564. pinsrw m5, [r0+r1-2], 7 ; top-left pixel, or the row-0 left neighbour as substitute
  565. PRED4x4_LOWPASS m3, m4, m5, m3 ; filtered left column; m4 is clobbered
  566. PRED4x4_LOWPASS m0, m2, m1, m0 ; filtered top row; m2 is clobbered
  567. paddw m0, m3
  568. HADDW m0, m1 ; low dword = sum of all 16 filtered pixels
  569. paddw m0, [pw_8] ; rounding term
  570. psrlw m0, 4 ; divide by 16
  571. SPLATW m0, m0
  572. mova [r0+r3*1], m0 ; rows 0..3
  573. mova [r0+r3*2], m0
  574. mova [r0+r5*1], m0
  575. mova [r0+r3*4], m0
  576. mova [r4+r3*1], m0 ; rows 4..7
  577. mova [r4+r3*2], m0
  578. mova [r4+r5*1], m0
  579. mova [r4+r3*4], m0
  580. RET
  581. %endmacro
  582. INIT_XMM sse2
  583. PRED8x8L_DC
  584. INIT_XMM avx
  585. PRED8x8L_DC
  586. ;-----------------------------------------------------------------------------
  587. ; void ff_pred8x8l_vertical(pixel *src, int has_topleft, int has_topright,
  588. ; int stride)
  589. ;-----------------------------------------------------------------------------
  ; 8x8l vertical predictor, instantiated below for sse2 and avx.
  ; Low-pass filters the top row (with edge substitution driven by the two flags)
  ; and copies the filtered row into all eight rows of the block.
  590. %macro PRED8x8L_VERTICAL 0
  591. cglobal pred8x8l_vertical_10, 4, 4, 6
  592. sub r0, r3 ; r0 = src - stride (row above the block)
  593. mova m0, [r0] ; the 8 top pixels
  594. shr r1d, 14 ; has_topleft -> byte offset (presumably 0 or 2)
  595. shr r2d, 13 ; has_topright -> byte offset (presumably 0 or 2)
  596. neg r1
  597. pslldq m1, m0, 2 ; left neighbours; word 0 is filled in below
  598. psrldq m2, m0, 2 ; right neighbours; word 7 is filled in below
  599. pinsrw m1, [r0+r1], 0 ; top-left pixel, or top[0] when the offset is 0
  600. pinsrw m2, [r0+r2+14], 7 ; first topright pixel, or top[7] when the offset is 0
  601. lea r1, [r3*3] ; r1 = 3*stride
  602. lea r2, [r0+r3*4] ; r2 = src + 3*stride (block row 3)
  603. PRED4x4_LOWPASS m0, m2, m1, m0 ; filtered top row; m2 is clobbered
  604. mova [r0+r3*1], m0 ; rows 0..3
  605. mova [r0+r3*2], m0
  606. mova [r0+r1*1], m0
  607. mova [r0+r3*4], m0
  608. mova [r2+r3*1], m0 ; rows 4..7
  609. mova [r2+r3*2], m0
  610. mova [r2+r1*1], m0
  611. mova [r2+r3*4], m0
  612. RET
  613. %endmacro
  614. INIT_XMM sse2
  615. PRED8x8L_VERTICAL
  616. INIT_XMM avx
  617. PRED8x8L_VERTICAL
  618. ;-----------------------------------------------------------------------------
  619. ; void ff_pred8x8l_horizontal(pixel *src, int has_topleft, int has_topright,
  620. ; int stride)
  621. ;-----------------------------------------------------------------------------
  ; 8x8l horizontal predictor, instantiated below for sse2, ssse3 and avx.
  ; Gathers the left neighbours (plus the top-left pixel or a substitute), low-pass
  ; filters that column, then fills each row with its own filtered left pixel.
  ; Unlike most routines in this file, r0 stays at block row 0 here.
  ; NOTE(review): the dec/and/sub sequence presumably yields -stride when has_topleft
  ; is set and 0 otherwise, assuming an even stride - confirm against callers.
  622. %macro PRED8x8L_HORIZONTAL 0
  623. cglobal pred8x8l_horizontal_10, 4, 4, 5
  624. mova m0, [r0-16] ; 16 bytes left of row 0; top word is the left neighbour
  625. shr r1d, 14
  626. dec r1
  627. and r1, r3
  628. sub r1, r3 ; r1 = row offset of the pixel used as "above l0"
  629. punpckhwd m0, [r0+r1-16] ; pair l0 with the top-left pixel (or with itself)
  630. mova m1, [r0+r3*2-16] ; row 2
  631. punpckhwd m1, [r0+r3*1-16] ; pair with row 1
  632. lea r2, [r0+r3*4] ; r2 = block row 4
  633. lea r1, [r3*3] ; r1 = 3*stride
  634. punpckhdq m1, m0
  635. mova m2, [r2+r3*0-16] ; row 4
  636. punpckhwd m2, [r0+r1-16] ; pair with row 3
  637. mova m3, [r2+r3*2-16] ; row 6
  638. punpckhwd m3, [r2+r3*1-16] ; pair with row 5
  639. punpckhdq m3, m2
  640. punpckhqdq m3, m1
  641. PALIGNR m4, m3, [r2+r1-16], 14, m0 ; shift in the row-7 left neighbour
  642. pslldq m0, m4, 2
  643. pshuflw m0, m0, 11100101b ; duplicate word 1 into word 0 (edge replication)
  644. PRED4x4_LOWPASS m4, m3, m0, m4 ; filtered left column; m3 is clobbered
  645. punpckhwd m3, m4, m4 ; duplicate each filtered pixel into a dword
  646. punpcklwd m4, m4
  647. pshufd m0, m3, 0xff ; broadcast one dword per row
  648. pshufd m1, m3, 0xaa
  649. pshufd m2, m3, 0x55
  650. pshufd m3, m3, 0x00
  651. mova [r0+r3*0], m0 ; rows 0..3
  652. mova [r0+r3*1], m1
  653. mova [r0+r3*2], m2
  654. mova [r0+r1*1], m3
  655. pshufd m0, m4, 0xff
  656. pshufd m1, m4, 0xaa
  657. pshufd m2, m4, 0x55
  658. pshufd m3, m4, 0x00
  659. mova [r2+r3*0], m0 ; rows 4..7
  660. mova [r2+r3*1], m1
  661. mova [r2+r3*2], m2
  662. mova [r2+r1*1], m3
  663. RET
  664. %endmacro
  665. INIT_XMM sse2
  666. PRED8x8L_HORIZONTAL
  667. INIT_XMM ssse3
  668. PRED8x8L_HORIZONTAL
  669. INIT_XMM avx
  670. PRED8x8L_HORIZONTAL
  671. ;-----------------------------------------------------------------------------
  672. ; void ff_pred8x8l_down_left(pixel *src, int has_topleft, int has_topright,
  673. ; int stride)
  674. ;-----------------------------------------------------------------------------
  ; 8x8l down-left predictor, instantiated below for sse2, ssse3 and avx.
  ; Low-pass filters the top row and the topright row (or a replicated top[7] when
  ; topright is unavailable), filters the 16-pixel result again, then stores the
  ; rows bottom-up, shifting one pixel per row.
  ; The jz below consumes the zero flag set by "shr r2d, 13"; no instruction between
  ; the two may write flags, so their relative order must be preserved.
  675. %macro PRED8x8L_DOWN_LEFT 0
  676. cglobal pred8x8l_down_left_10, 4, 4, 7
  677. sub r0, r3 ; r0 = src - stride (row above the block)
  678. mova m3, [r0] ; the 8 top pixels
  679. shr r1d, 14 ; has_topleft -> byte offset (presumably 0 or 2)
  680. neg r1
  681. shr r2d, 13 ; has_topright -> byte offset; also sets ZF for the jz below
  682. pslldq m1, m3, 2
  683. psrldq m2, m3, 2
  684. pinsrw m1, [r0+r1], 0 ; top-left pixel, or top[0] when the offset is 0
  685. pinsrw m2, [r0+r2+14], 7 ; first topright pixel, or top[7] when the offset is 0
  686. PRED4x4_LOWPASS m6, m2, m1, m3 ; m6 = filtered top row; m3 itself is left intact
  687. jz .fix_tr ; flags from shr r2d
  688. mova m1, [r0+16] ; the 8 topright pixels
  689. psrldq m5, m1, 2
  690. PALIGNR m2, m1, m3, 14, m3 ; topright shifted by one, with top[7] shifted in
  691. pshufhw m5, m5, 10100100b ; duplicate the last pixel into the vacated word
  692. PRED4x4_LOWPASS m1, m2, m5, m1 ; m1 = filtered topright row
  693. .do_topright:
  694. lea r1, [r3*3] ; r1 = 3*stride
  695. psrldq m5, m1, 14 ; last pixel of m1 moved to word 0
  696. lea r2, [r0+r3*4] ; r2 = src + 3*stride (block row 3)
  697. PALIGNR m2, m1, m6, 2, m0
  698. PALIGNR m3, m1, m6, 14, m0
  699. PALIGNR m5, m1, 2, m0
  700. pslldq m4, m6, 2
  701. PRED4x4_LOWPASS m6, m4, m2, m6 ; second filter pass, lower 8 outputs
  702. PRED4x4_LOWPASS m1, m3, m5, m1 ; second filter pass, upper 8 outputs
  703. mova [r2+r3*4], m1 ; row 7
  704. PALIGNR m1, m6, 14, m2 ; step one pixel back along the diagonal
  705. pslldq m6, 2
  706. mova [r2+r1*1], m1 ; row 6
  707. PALIGNR m1, m6, 14, m2
  708. pslldq m6, 2
  709. mova [r2+r3*2], m1 ; row 5
  710. PALIGNR m1, m6, 14, m2
  711. pslldq m6, 2
  712. mova [r2+r3*1], m1 ; row 4
  713. PALIGNR m1, m6, 14, m2
  714. pslldq m6, 2
  715. mova [r0+r3*4], m1 ; row 3
  716. PALIGNR m1, m6, 14, m2
  717. pslldq m6, 2
  718. mova [r0+r1*1], m1 ; row 2
  719. PALIGNR m1, m6, 14, m2
  720. pslldq m6, 2
  721. mova [r0+r3*2], m1 ; row 1
  722. PALIGNR m1, m6, 14, m6
  723. mova [r0+r3*1], m1 ; row 0
  724. RET
  725. .fix_tr:
  726. punpckhwd m3, m3 ; no topright: replicate top[7] ...
  727. pshufd m1, m3, 0xFF ; ... across the whole register and use that instead
  728. jmp .do_topright
  729. %endmacro
  730. INIT_XMM sse2
  731. PRED8x8L_DOWN_LEFT
  732. INIT_XMM ssse3
  733. PRED8x8L_DOWN_LEFT
  734. INIT_XMM avx
  735. PRED8x8L_DOWN_LEFT
  736. ;-----------------------------------------------------------------------------
  737. ; void ff_pred8x8l_down_right(pixel *src, int has_topleft, int has_topright,
  738. ; int stride)
  739. ;-----------------------------------------------------------------------------
  ; 8x8l down-right predictor, instantiated below for sse2, ssse3 and avx.
  ; Gathers and low-pass filters the left column and the top row, filters the joined
  ; 16-pixel edge a second time, then stores row 7 first and rows 0..6 afterwards,
  ; shifting one pixel per row. has_topleft (r1) is never read; the top-left pixel
  ; is always loaded from [r0-2].
  740. %macro PRED8x8L_DOWN_RIGHT 0
  741. ; standard forbids this when has_topleft is false
  742. ; no need to check
  743. cglobal pred8x8l_down_right_10, 4, 5, 8
  744. sub r0, r3 ; r0 = src - stride (row above the block)
  745. lea r4, [r0+r3*4] ; r4 = src + 3*stride (block row 3)
  746. lea r1, [r3*3] ; r1 = 3*stride (the has_topleft value is discarded)
  747. mova m0, [r0+r3*1-16] ; 16 bytes left of row 0; top word is the left neighbour
  748. punpckhwd m0, [r0+r3*0-16] ; pair with the top-left pixel
  749. mova m1, [r0+r1*1-16] ; row 2
  750. punpckhwd m1, [r0+r3*2-16] ; pair with row 1
  751. punpckhdq m1, m0
  752. mova m2, [r4+r3*1-16] ; row 4
  753. punpckhwd m2, [r4+r3*0-16] ; pair with row 3
  754. mova m3, [r4+r1*1-16] ; row 6
  755. punpckhwd m3, [r4+r3*2-16] ; pair with row 5
  756. punpckhdq m3, m2
  757. punpckhqdq m3, m1 ; m3 = left neighbours of rows 6..0 plus top-left
  758. mova m0, [r4+r3*4-16] ; row 7
  759. mova m1, [r0] ; the 8 top pixels
  760. PALIGNR m4, m3, m0, 14, m0 ; shift in the row-7 left neighbour
  761. PALIGNR m1, m3, 2, m2
  762. pslldq m0, m4, 2
  763. pshuflw m0, m0, 11100101b ; duplicate word 1 into word 0 (edge replication)
  764. PRED4x4_LOWPASS m6, m1, m4, m3 ; m1 is clobbered
  765. PRED4x4_LOWPASS m4, m3, m0, m4 ; m3 is clobbered
  766. mova m3, [r0] ; reload the top row
  767. shr r2d, 13 ; has_topright -> byte offset (presumably 0 or 2)
  768. pslldq m1, m3, 2
  769. psrldq m2, m3, 2
  770. pinsrw m1, [r0-2], 0 ; top-left pixel, unconditionally
  771. pinsrw m2, [r0+r2+14], 7 ; first topright pixel, or top[7] when the offset is 0
  772. PRED4x4_LOWPASS m3, m2, m1, m3 ; filtered top row; m2 is clobbered
  773. PALIGNR m2, m3, m6, 2, m0
  774. PALIGNR m5, m3, m6, 14, m0
  775. psrldq m7, m3, 2
  776. PRED4x4_LOWPASS m6, m4, m2, m6 ; second filter pass, lower 8 outputs
  777. PRED4x4_LOWPASS m3, m5, m7, m3 ; second filter pass, upper 8 outputs
  778. mova [r4+r3*4], m6 ; row 7
  779. PALIGNR m3, m6, 14, m2 ; step one pixel along the diagonal
  780. pslldq m6, 2
  781. mova [r0+r3*1], m3 ; row 0
  782. PALIGNR m3, m6, 14, m2
  783. pslldq m6, 2
  784. mova [r0+r3*2], m3 ; row 1
  785. PALIGNR m3, m6, 14, m2
  786. pslldq m6, 2
  787. mova [r0+r1*1], m3 ; row 2
  788. PALIGNR m3, m6, 14, m2
  789. pslldq m6, 2
  790. mova [r0+r3*4], m3 ; row 3
  791. PALIGNR m3, m6, 14, m2
  792. pslldq m6, 2
  793. mova [r4+r3*1], m3 ; row 4
  794. PALIGNR m3, m6, 14, m2
  795. pslldq m6, 2
  796. mova [r4+r3*2], m3 ; row 5
  797. PALIGNR m3, m6, 14, m6
  798. mova [r4+r1*1], m3 ; row 6
  799. RET
  800. %endmacro
  801. INIT_XMM sse2
  802. PRED8x8L_DOWN_RIGHT
  803. INIT_XMM ssse3
  804. PRED8x8L_DOWN_RIGHT
  805. INIT_XMM avx
  806. PRED8x8L_DOWN_RIGHT
  807. ;-----------------------------------------------------------------------------
  808. ; void ff_pred8x8l_vertical_right(pixel *src, int has_topleft,
  809. ; int has_topright, int stride)
  810. ;-----------------------------------------------------------------------------
  ; 8x8l vertical-right predictor, instantiated below for sse2, ssse3 and avx.
  ; Filters the left column and the top row, derives a two-tap average row (row 0)
  ; and a three-tap row (row 1), then produces rows 2..7 by alternately shifting one
  ; more filtered left-column pixel into those two rows.
  ; has_topleft (r1) is never read; the top-left pixel is always loaded from [r0-2].
  811. %macro PRED8x8L_VERTICAL_RIGHT 0
  812. ; likewise with 8x8l_down_right
  813. cglobal pred8x8l_vertical_right_10, 4, 5, 7
  814. sub r0, r3 ; r0 = src - stride (row above the block)
  815. lea r4, [r0+r3*4] ; r4 = src + 3*stride (block row 3)
  816. lea r1, [r3*3] ; r1 = 3*stride (the has_topleft value is discarded)
  817. mova m0, [r0+r3*1-16] ; 16 bytes left of row 0; top word is the left neighbour
  818. punpckhwd m0, [r0+r3*0-16] ; pair with the top-left pixel
  819. mova m1, [r0+r1*1-16] ; row 2
  820. punpckhwd m1, [r0+r3*2-16] ; pair with row 1
  821. punpckhdq m1, m0
  822. mova m2, [r4+r3*1-16] ; row 4
  823. punpckhwd m2, [r4+r3*0-16] ; pair with row 3
  824. mova m3, [r4+r1*1-16] ; row 6
  825. punpckhwd m3, [r4+r3*2-16] ; pair with row 5
  826. punpckhdq m3, m2
  827. punpckhqdq m3, m1 ; m3 = left neighbours of rows 6..0 plus top-left
  828. mova m0, [r4+r3*4-16] ; row 7
  829. mova m1, [r0] ; the 8 top pixels
  830. PALIGNR m4, m3, m0, 14, m0 ; shift in the row-7 left neighbour
  831. PALIGNR m1, m3, 2, m2
  832. PRED4x4_LOWPASS m3, m1, m4, m3 ; filtered left column; m1 is clobbered
  833. mova m2, [r0] ; reload the top row
  834. shr r2d, 13 ; has_topright -> byte offset (presumably 0 or 2)
  835. pslldq m1, m2, 2
  836. psrldq m5, m2, 2
  837. pinsrw m1, [r0-2], 0 ; top-left pixel, unconditionally
  838. pinsrw m5, [r0+r2+14], 7 ; first topright pixel, or top[7] when the offset is 0
  839. PRED4x4_LOWPASS m2, m5, m1, m2 ; filtered top row; m5 is clobbered
  840. PALIGNR m6, m2, m3, 12, m1
  841. PALIGNR m5, m2, m3, 14, m0
  842. PRED4x4_LOWPASS m0, m6, m2, m5 ; three-tap row; m6 is clobbered
  843. pavgw m2, m5 ; two-tap average row
  844. mova [r0+r3*2], m0 ; row 1
  845. mova [r0+r3*1], m2 ; row 0
  846. pslldq m6, m3, 4
  847. pslldq m1, m3, 2
  848. PRED4x4_LOWPASS m1, m3, m6, m1 ; second pass over the filtered left column
  849. PALIGNR m2, m1, 14, m4
  850. mova [r0+r1*1], m2 ; row 2
  851. pslldq m1, 2
  852. PALIGNR m0, m1, 14, m3
  853. mova [r0+r3*4], m0 ; row 3
  854. pslldq m1, 2
  855. PALIGNR m2, m1, 14, m4
  856. mova [r4+r3*1], m2 ; row 4
  857. pslldq m1, 2
  858. PALIGNR m0, m1, 14, m3
  859. mova [r4+r3*2], m0 ; row 5
  860. pslldq m1, 2
  861. PALIGNR m2, m1, 14, m4
  862. mova [r4+r1*1], m2 ; row 6
  863. pslldq m1, 2
  864. PALIGNR m0, m1, 14, m1
  865. mova [r4+r3*4], m0 ; row 7
  866. RET
  867. %endmacro
  868. INIT_XMM sse2
  869. PRED8x8L_VERTICAL_RIGHT
  870. INIT_XMM ssse3
  871. PRED8x8L_VERTICAL_RIGHT
  872. INIT_XMM avx
  873. PRED8x8L_VERTICAL_RIGHT
  874. ;-----------------------------------------------------------------------------
  875. ; void ff_pred8x8l_horizontal_up(pixel *src, int has_topleft,
  876. ; int has_topright, int stride)
  877. ;-----------------------------------------------------------------------------
  ; 8x8l horizontal-up predictor, instantiated below for sse2, ssse3 and avx.
  ; Gathers and low-pass filters the left column, then interleaves two-tap averages
  ; with three-tap outputs of the filtered column; each row starts two words further
  ; into that interleaved sequence. r0 stays at block row 0 here.
  ; NOTE(review): the dec/and/sub sequence presumably yields -stride when has_topleft
  ; is set and 0 otherwise, assuming an even stride - confirm against callers.
  878. %macro PRED8x8L_HORIZONTAL_UP 0
  879. cglobal pred8x8l_horizontal_up_10, 4, 4, 6
  880. mova m0, [r0+r3*0-16] ; 16 bytes left of row 0; top word is the left neighbour
  881. punpckhwd m0, [r0+r3*1-16] ; pair with row 1
  882. shr r1d, 14
  883. dec r1
  884. and r1, r3
  885. sub r1, r3 ; r1 = row offset of the pixel used as "above l0"
  886. mova m4, [r0+r1*1-16] ; top-left pixel, or l0 itself as substitute
  887. lea r1, [r3*3] ; r1 = 3*stride
  888. lea r2, [r0+r3*4] ; r2 = block row 4
  889. mova m1, [r0+r3*2-16] ; row 2
  890. punpckhwd m1, [r0+r1*1-16] ; pair with row 3
  891. punpckhdq m0, m1
  892. mova m2, [r2+r3*0-16] ; row 4
  893. punpckhwd m2, [r2+r3*1-16] ; pair with row 5
  894. mova m3, [r2+r3*2-16] ; row 6
  895. punpckhwd m3, [r2+r1*1-16] ; pair with row 7
  896. punpckhdq m2, m3
  897. punpckhqdq m0, m2 ; m0 = l0..l7 (low..high)
  898. PALIGNR m1, m0, m4, 14, m4 ; column shifted by one, with the edge pixel shifted in
  899. psrldq m2, m0, 2
  900. pshufhw m2, m2, 10100100b ; duplicate l7 into the vacated word
  901. PRED4x4_LOWPASS m0, m1, m2, m0 ; filtered left column; m1 is clobbered
  902. psrldq m1, m0, 2
  903. psrldq m2, m0, 4
  904. pshufhw m1, m1, 10100100b ; replicate the last pixel into the vacated word
  905. pshufhw m2, m2, 01010100b ; replicate the last pixel into both vacated words
  906. pavgw m4, m0, m1 ; two-tap averages
  907. PRED4x4_LOWPASS m1, m2, m0, m1 ; three-tap outputs; m2 is clobbered
  908. punpckhwd m5, m4, m1 ; interleaved sequence, upper half
  909. punpcklwd m4, m1 ; interleaved sequence, lower half
  910. mova [r2+r3*0], m5 ; row 4
  911. mova [r0+r3*0], m4 ; row 0
  912. pshufd m0, m5, 11111001b ; shift by one dword, replicating the last dword
  913. pshufd m1, m5, 11111110b
  914. pshufd m2, m5, 11111111b
  915. mova [r2+r3*1], m0 ; row 5
  916. mova [r2+r3*2], m1 ; row 6
  917. mova [r2+r1*1], m2 ; row 7
  918. PALIGNR m2, m5, m4, 4, m0
  919. PALIGNR m3, m5, m4, 8, m1
  920. PALIGNR m5, m5, m4, 12, m4
  921. mova [r0+r3*1], m2 ; row 1
  922. mova [r0+r3*2], m3 ; row 2
  923. mova [r0+r1*1], m5 ; row 3
  924. RET
  925. %endmacro
  926. INIT_XMM sse2
  927. PRED8x8L_HORIZONTAL_UP
  928. INIT_XMM ssse3
  929. PRED8x8L_HORIZONTAL_UP
  930. INIT_XMM avx
  931. PRED8x8L_HORIZONTAL_UP
  932. ;-----------------------------------------------------------------------------
  933. ; void ff_pred16x16_vertical(pixel *src, int stride)
  934. ;-----------------------------------------------------------------------------
  ; Stores one 16-pixel (32-byte) row at address expression %1.
  ; With 16-byte registers it writes %2 and %3; with 8-byte registers it writes
  ; %2..%5 as four consecutive 8-byte pieces.
  935. %macro MOV16 3-5
  936. mova [%1+ 0], %2
  937. mova [%1+mmsize], %3
  938. %if mmsize==8
  939. mova [%1+ 16], %4
  940. mova [%1+ 24], %5
  941. %endif
  942. %endmacro
  ; 16x16 vertical predictor, instantiated below for mmxext and sse2.
  ; Loads the 32-byte row above the block once and copies it into all 16 rows,
  ; two rows per loop iteration.
  943. %macro PRED16x16_VERTICAL 0
  944. cglobal pred16x16_vertical_10, 2, 3
  945. sub r0, r1 ; r0 = src - stride (row above the block)
  946. mov r2d, 8 ; iteration counter (2 rows each)
  947. mova m0, [r0+ 0]
  948. mova m1, [r0+mmsize]
  949. %if mmsize==8
  950. mova m2, [r0+16]
  951. mova m3, [r0+24]
  952. %endif
  953. .loop:
  954. MOV16 r0+r1*1, m0, m1, m2, m3
  955. MOV16 r0+r1*2, m0, m1, m2, m3
  956. lea r0, [r0+r1*2] ; advance two rows
  957. dec r2d
  958. jg .loop
  959. REP_RET
  960. %endmacro
  961. INIT_MMX mmxext
  962. PRED16x16_VERTICAL
  963. INIT_XMM sse2
  964. PRED16x16_VERTICAL
  965. ;-----------------------------------------------------------------------------
  966. ; void ff_pred16x16_horizontal(pixel *src, int stride)
  967. ;-----------------------------------------------------------------------------
  ; 16x16 horizontal predictor, instantiated below for mmxext and sse2.
  ; Fills each row with the pixel immediately to its left, two rows per iteration.
  968. %macro PRED16x16_HORIZONTAL 0
  969. cglobal pred16x16_horizontal_10, 2, 3
  970. mov r2d, 8 ; iteration counter (2 rows each)
  971. .vloop:
  972. movd m0, [r0+r1*0-4] ; two pixels left of this row; word 1 is the neighbour
  973. movd m1, [r0+r1*1-4] ; same for the next row
  974. SPLATW m0, m0, 1 ; broadcast word 1
  975. SPLATW m1, m1, 1
  976. MOV16 r0+r1*0, m0, m0, m0, m0
  977. MOV16 r0+r1*1, m1, m1, m1, m1
  978. lea r0, [r0+r1*2] ; advance two rows
  979. dec r2d
  980. jg .vloop
  981. REP_RET
  982. %endmacro
  983. INIT_MMX mmxext
  984. PRED16x16_HORIZONTAL
  985. INIT_XMM sse2
  986. PRED16x16_HORIZONTAL
  987. ;-----------------------------------------------------------------------------
  988. ; void ff_pred16x16_dc(pixel *src, int stride)
  989. ;-----------------------------------------------------------------------------
  ; 16x16 DC predictor, instantiated below for mmxext and sse2.
  ; Fills the block with (sum of 16 top pixels + sum of 16 left pixels + 16) >> 5.
  ; The top sum is computed with SIMD, the left sum with scalar loads.
  990. %macro PRED16x16_DC 0
  991. cglobal pred16x16_dc_10, 2, 6
  992. mov r5, r0 ; keep src for the store loop
  993. sub r0, r1 ; r0 = src - stride (row above the block)
  994. mova m0, [r0+0]
  995. paddw m0, [r0+mmsize]
  996. %if mmsize==8
  997. paddw m0, [r0+16]
  998. paddw m0, [r0+24]
  999. %endif
  1000. HADDW m0, m2 ; low dword = sum of the 16 top pixels
  1001. lea r0, [r0+r1-2] ; r0 -> left neighbour of row 0
  1002. movzx r3d, word [r0] ; even-row accumulator
  1003. movzx r4d, word [r0+r1] ; odd-row accumulator
  1004. %rep 7
  1005. lea r0, [r0+r1*2]
  1006. movzx r2d, word [r0]
  1007. add r3d, r2d
  1008. movzx r2d, word [r0+r1]
  1009. add r4d, r2d
  1010. %endrep
  1011. lea r3d, [r3+r4+16] ; left sum + rounding term
  1012. movd m1, r3d
  1013. paddw m0, m1
  1014. psrlw m0, 5 ; divide by 32
  1015. SPLATW m0, m0
  1016. mov r3d, 8 ; iteration counter (2 rows each)
  1017. .loop:
  1018. MOV16 r5+r1*0, m0, m0, m0, m0
  1019. MOV16 r5+r1*1, m0, m0, m0, m0
  1020. lea r5, [r5+r1*2] ; advance two rows
  1021. dec r3d
  1022. jg .loop
  1023. REP_RET
  1024. %endmacro
  1025. INIT_MMX mmxext
  1026. PRED16x16_DC
  1027. INIT_XMM sse2
  1028. PRED16x16_DC
  1029. ;-----------------------------------------------------------------------------
  1030. ; void ff_pred16x16_top_dc(pixel *src, int stride)
  1031. ;-----------------------------------------------------------------------------
  ; 16x16 top-DC predictor, instantiated below for mmxext and sse2.
  ; Fills the block with (sum of the 16 top pixels + 8) >> 4.
  1032. %macro PRED16x16_TOP_DC 0
  1033. cglobal pred16x16_top_dc_10, 2, 3
  1034. sub r0, r1 ; r0 = src - stride (row above the block)
  1035. mova m0, [r0+0]
  1036. paddw m0, [r0+mmsize]
  1037. %if mmsize==8
  1038. paddw m0, [r0+16]
  1039. paddw m0, [r0+24]
  1040. %endif
  1041. HADDW m0, m2 ; low dword = sum of the 16 top pixels
  1042. SPLATW m0, m0 ; broadcast the sum before rounding
  1043. paddw m0, [pw_8] ; rounding term
  1044. psrlw m0, 4 ; divide by 16
  1045. mov r2d, 8 ; iteration counter (2 rows each)
  1046. .loop:
  1047. MOV16 r0+r1*1, m0, m0, m0, m0
  1048. MOV16 r0+r1*2, m0, m0, m0, m0
  1049. lea r0, [r0+r1*2] ; advance two rows
  1050. dec r2d
  1051. jg .loop
  1052. REP_RET
  1053. %endmacro
  1054. INIT_MMX mmxext
  1055. PRED16x16_TOP_DC
  1056. INIT_XMM sse2
  1057. PRED16x16_TOP_DC
  1058. ;-----------------------------------------------------------------------------
  1059. ; void ff_pred16x16_left_dc(pixel *src, int stride)
  1060. ;-----------------------------------------------------------------------------
  ; 16x16 left-DC predictor, instantiated below for mmxext and sse2.
  ; Fills the block with (sum of the 16 left neighbours + 8) >> 4; the sum and the
  ; division are done in scalar registers.
  1061. %macro PRED16x16_LEFT_DC 0
  1062. cglobal pred16x16_left_dc_10, 2, 6
  1063. mov r5, r0 ; keep src for the store loop
  1064. sub r0, 2 ; r0 -> left neighbour of row 0
  1065. movzx r3d, word [r0] ; even-row accumulator
  1066. movzx r4d, word [r0+r1] ; odd-row accumulator
  1067. %rep 7
  1068. lea r0, [r0+r1*2]
  1069. movzx r2d, word [r0]
  1070. add r3d, r2d
  1071. movzx r2d, word [r0+r1]
  1072. add r4d, r2d
  1073. %endrep
  1074. lea r3d, [r3+r4+8] ; total + rounding term
  1075. shr r3d, 4 ; divide by 16
  1076. movd m0, r3d
  1077. SPLATW m0, m0
  1078. mov r3d, 8 ; iteration counter (2 rows each)
  1079. .loop:
  1080. MOV16 r5+r1*0, m0, m0, m0, m0
  1081. MOV16 r5+r1*1, m0, m0, m0, m0
  1082. lea r5, [r5+r1*2] ; advance two rows
  1083. dec r3d
  1084. jg .loop
  1085. REP_RET
  1086. %endmacro
  1087. INIT_MMX mmxext
  1088. PRED16x16_LEFT_DC
  1089. INIT_XMM sse2
  1090. PRED16x16_LEFT_DC
  1091. ;-----------------------------------------------------------------------------
  1092. ; void ff_pred16x16_128_dc(pixel *src, int stride)
  1093. ;-----------------------------------------------------------------------------
  ; 16x16 "128 DC" predictor, instantiated below for mmxext and sse2.
  ; Fills all 16 rows with the constant pw_512, two rows per iteration; no
  ; neighbouring pixels are read.
  1094. %macro PRED16x16_128_DC 0
  1095. cglobal pred16x16_128_dc_10, 2,3
  1096. mova m0, [pw_512] ; same fill constant as pred8x8l_128_dc
  1097. mov r2d, 8 ; iteration counter (2 rows each)
  1098. .loop:
  1099. MOV16 r0+r1*0, m0, m0, m0, m0
  1100. MOV16 r0+r1*1, m0, m0, m0, m0
  1101. lea r0, [r0+r1*2] ; advance two rows
  1102. dec r2d
  1103. jg .loop
  1104. REP_RET
  1105. %endmacro
  1106. INIT_MMX mmxext
  1107. PRED16x16_128_DC
  1108. INIT_XMM sse2
  1109. PRED16x16_128_DC