;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 intra prediction code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

cextern pw_16
cextern pw_8
cextern pw_4
cextern pw_2
cextern pw_1

pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4
pw_m3:        times 8 dw -3
pw_pixel_max: times 8 dw ((1 << 10)-1)
pw_512:       times 8 dw 512
pd_17:        times 4 dd 17
pd_16:        times 4 dd 16

SECTION .text
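
; NOTE: 10-bit pixels are stored as 16-bit words, so every byte offset and
; byte-granular shift below is twice the pixel count: a psrldq by 2 advances
; one pixel, [r0-8] addresses the four pixels left of r0, and the stride
; arguments are byte strides.
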
; dest, left, right, src
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
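; The two averaging steps below are exact:
;   pavgw(%4, (%2+%3)>>1) == (%2 + 2*%4 + %3 + 2) >> 2
; The truncating psrlw is safe because the final pavgw rounds an even sum,
; so the dropped low bit of %2+%3 can never change the result; paddw cannot
; overflow since 10-bit values leave ample headroom per 16-bit word.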
%macro PRED4x4_LOWPASS 4
    paddw      %2, %3
    psrlw      %2, 1
    pavgw      %1, %4, %2
%endmacro
;-----------------------------------------------------------------------------
; void pred4x4_down_right(pixel *src, const pixel *topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED4x4_DR 0
cglobal pred4x4_down_right_10, 3, 3
    sub        r0, r2
    lea        r1, [r0+r2*2]
    movhps     m1, [r1-8]
    movhps     m2, [r0+r2*1-8]
    movhps     m4, [r0-8]
    punpckhwd  m2, m4
    movq       m3, [r0]
    punpckhdq  m1, m2
    PALIGNR    m3, m1, 10, m1
    movhps     m4, [r1+r2*1-8]
    PALIGNR    m0, m3, m4, 14, m4
    movhps     m4, [r1+r2*2-8]
    PALIGNR    m2, m0, m4, 14, m4
    PRED4x4_LOWPASS m0, m2, m3, m0
    movq       [r1+r2*2], m0
    psrldq     m0, 2
    movq       [r1+r2*1], m0
    psrldq     m0, 2
    movq       [r0+r2*2], m0
    psrldq     m0, 2
    movq       [r0+r2*1], m0
    RET
%endmacro

INIT_XMM sse2
PRED4x4_DR
INIT_XMM ssse3
PRED4x4_DR
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_DR
%endif

;-----------------------------------------------------------------------------
; void pred4x4_vertical_right(pixel *src, const pixel *topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED4x4_VR 0
cglobal pred4x4_vertical_right_10, 3, 3, 6
    sub        r0, r2
    lea        r1, [r0+r2*2]
    movq       m5, [r0]           ; ........t3t2t1t0
    movhps     m1, [r0-8]
    PALIGNR    m0, m5, m1, 14, m1 ; ......t3t2t1t0lt
    pavgw      m5, m0
    movhps     m1, [r0+r2*1-8]
    PALIGNR    m0, m1, 14, m1     ; ....t3t2t1t0ltl0
    movhps     m2, [r0+r2*2-8]
    PALIGNR    m1, m0, m2, 14, m2 ; ..t3t2t1t0ltl0l1
    movhps     m3, [r1+r2*1-8]
    PALIGNR    m2, m1, m3, 14, m3 ; t3t2t1t0ltl0l1l2
    PRED4x4_LOWPASS m1, m0, m2, m1
    pslldq     m0, m1, 12
    psrldq     m1, 4
    movq       [r0+r2*1], m5
    movq       [r0+r2*2], m1
    PALIGNR    m5, m0, 14, m2
    pslldq     m0, 2
    movq       [r1+r2*1], m5
    PALIGNR    m1, m0, 14, m0
    movq       [r1+r2*2], m1
    RET
%endmacro

INIT_XMM sse2
PRED4x4_VR
INIT_XMM ssse3
PRED4x4_VR
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_VR
%endif

;-----------------------------------------------------------------------------
; void pred4x4_horizontal_down(pixel *src, const pixel *topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED4x4_HD 0
cglobal pred4x4_horizontal_down_10, 3, 3
    sub        r0, r2
    lea        r1, [r0+r2*2]
    movq       m0, [r0-8]      ; lt ..
    movhps     m0, [r0]
    pslldq     m0, 2           ; t2 t1 t0 lt .. .. .. ..
    movq       m1, [r1+r2*2-8] ; l3
    movq       m3, [r1+r2*1-8]
    punpcklwd  m1, m3          ; l2 l3
    movq       m2, [r0+r2*2-8] ; l1
    movq       m3, [r0+r2*1-8]
    punpcklwd  m2, m3          ; l0 l1
    punpckhdq  m1, m2          ; l0 l1 l2 l3
    punpckhqdq m1, m0          ; t2 t1 t0 lt l0 l1 l2 l3
    psrldq     m0, m1, 4       ; .. .. t2 t1 t0 lt l0 l1
    psrldq     m3, m1, 2       ; .. t2 t1 t0 lt l0 l1 l2
    pavgw      m5, m1, m3
    PRED4x4_LOWPASS m3, m1, m0, m3
    punpcklwd  m5, m3
    psrldq     m3, 8
    PALIGNR    m3, m5, 12, m4
    movq       [r1+r2*2], m5
    movhps     [r0+r2*2], m5
    psrldq     m5, 4
    movq       [r1+r2*1], m5
    movq       [r0+r2*1], m3
    RET
%endmacro

INIT_XMM sse2
PRED4x4_HD
INIT_XMM ssse3
PRED4x4_HD
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_HD
%endif

;-----------------------------------------------------------------------------
; void pred4x4_dc(pixel *src, const pixel *topright, int stride)
;-----------------------------------------------------------------------------
%macro HADDD 2 ; sum junk
%if mmsize == 16
    movhlps    %2, %1
    paddd      %1, %2
    pshuflw    %2, %1, 0xE
    paddd      %1, %2
%else
    pshufw     %2, %1, 0xE
    paddd      %1, %2
%endif
%endmacro

%macro HADDW 2
    pmaddwd    %1, [pw_1]
    HADDD      %1, %2
%endmacro
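
; HADDW reduces a register of words to one sum: pmaddwd against pw_1 folds
; adjacent words into dword partial sums, then HADDD halves the vector
; (movhlps/pshuflw on xmm, pshufw on mmx) until the total sits in the low
; dword. 10-bit inputs keep all of these sums far from overflow.
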
INIT_MMX mmxext
cglobal pred4x4_dc_10, 3, 3
    sub        r0, r2
    lea        r1, [r0+r2*2]
    movq       m2, [r0+r2*1-8]
    paddw      m2, [r0+r2*2-8]
    paddw      m2, [r1+r2*1-8]
    paddw      m2, [r1+r2*2-8]
    psrlq      m2, 48
    movq       m0, [r0]
    HADDW      m0, m1
    paddw      m0, [pw_4]
    paddw      m0, m2
    psrlw      m0, 3
    SPLATW     m0, m0, 0
    movq       [r0+r2*1], m0
    movq       [r0+r2*2], m0
    movq       [r1+r2*1], m0
    movq       [r1+r2*2], m0
    RET

;-----------------------------------------------------------------------------
; void pred4x4_down_left(pixel *src, const pixel *topright, int stride)
;-----------------------------------------------------------------------------
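; Unlike the predictors above, this one really uses its second argument:
; r1 points at the four above-right pixels, appended via movhps so the
; lowpass filter can run over all eight top samples at once.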
%macro PRED4x4_DL 0
cglobal pred4x4_down_left_10, 3, 3
    sub        r0, r2
    movq       m0, [r0]
    movhps     m0, [r1]
    psrldq     m2, m0, 2
    pslldq     m3, m0, 2
    pshufhw    m2, m2, 10100100b
    PRED4x4_LOWPASS m0, m3, m2, m0
    lea        r1, [r0+r2*2]
    movhps     [r1+r2*2], m0
    psrldq     m0, 2
    movq       [r0+r2*1], m0
    psrldq     m0, 2
    movq       [r0+r2*2], m0
    psrldq     m0, 2
    movq       [r1+r2*1], m0
    RET
%endmacro

INIT_XMM sse2
PRED4x4_DL
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_DL
%endif

;-----------------------------------------------------------------------------
; void pred4x4_vertical_left(pixel *src, const pixel *topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED4x4_VL 0
cglobal pred4x4_vertical_left_10, 3, 3
    sub        r0, r2
    movu       m1, [r0]
    movhps     m1, [r1]
    psrldq     m0, m1, 2
    psrldq     m2, m1, 4
    pavgw      m4, m0, m1
    PRED4x4_LOWPASS m0, m1, m2, m0
    lea        r1, [r0+r2*2]
    movq       [r0+r2*1], m4
    movq       [r0+r2*2], m0
    psrldq     m4, 2
    psrldq     m0, 2
    movq       [r1+r2*1], m4
    movq       [r1+r2*2], m0
    RET
%endmacro

INIT_XMM sse2
PRED4x4_VL
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_VL
%endif

;-----------------------------------------------------------------------------
; void pred4x4_horizontal_up(pixel *src, const pixel *topright, int stride)
;-----------------------------------------------------------------------------
INIT_MMX mmxext
cglobal pred4x4_horizontal_up_10, 3, 3
    sub        r0, r2
    lea        r1, [r0+r2*2]
    movq       m0, [r0+r2*1-8]
    punpckhwd  m0, [r0+r2*2-8]
    movq       m1, [r1+r2*1-8]
    punpckhwd  m1, [r1+r2*2-8]
    punpckhdq  m0, m1
    pshufw     m1, m1, 0xFF
    movq       [r1+r2*2], m1
    movd       [r1+r2*1+4], m1
    pshufw     m2, m0, 11111001b
    movq       m1, m2
    pavgw      m2, m0
    pshufw     m5, m0, 11111110b
    PRED4x4_LOWPASS m1, m0, m5, m1
    movq       m6, m2
    punpcklwd  m6, m1
    movq       [r0+r2*1], m6
    psrlq      m2, 16
    psrlq      m1, 16
    punpcklwd  m2, m1
    movq       [r0+r2*2], m2
    psrlq      m2, 32
    movd       [r1+r2*1], m2
    RET

;-----------------------------------------------------------------------------
; void pred8x8_vertical(pixel *src, int stride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal pred8x8_vertical_10, 2, 2
    sub        r0, r1
    mova       m0, [r0]
%rep 3
    mova       [r0+r1*1], m0
    mova       [r0+r1*2], m0
    lea        r0, [r0+r1*2]
%endrep
    mova       [r0+r1*1], m0
    mova       [r0+r1*2], m0
    RET

;-----------------------------------------------------------------------------
; void pred8x8_horizontal(pixel *src, int stride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal pred8x8_horizontal_10, 2, 3
    mov        r2d, 4
.loop:
    movq       m0, [r0+r1*0-8]
    movq       m1, [r0+r1*1-8]
    pshuflw    m0, m0, 0xff
    pshuflw    m1, m1, 0xff
    punpcklqdq m0, m0
    punpcklqdq m1, m1
    mova       [r0+r1*0], m0
    mova       [r0+r1*1], m1
    lea        r0, [r0+r1*2]
    dec        r2d
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void pred8x8_dc(pixel *src, int stride)
;-----------------------------------------------------------------------------
%macro MOV8 2-3
; sort of a hack, but it works
%if mmsize==8
    movq       [%1+0], %2
    movq       [%1+8], %3
%else
    movdqa     [%1], %2
%endif
%endmacro
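
; The 8x8 (chroma) DC mode computes one DC per 4x4 quadrant:
;   top-left     (s0+s2+4)>>3   from top 0..3 and left 0..3
;   top-right    (s1+2)>>2      from top 4..7 only
;   bottom-left  (s3+2)>>2      from left 4..7 only
;   bottom-right (s1+s3+4)>>3   from top 4..7 and left 4..7
; Below, s0/s1 are the two 4-pixel top sums and s2/s3 the two 4-pixel left
; sums; the psrlw+pavgw pair applies the rounded shifts to all four at once.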
%macro PRED8x8_DC 1
cglobal pred8x8_dc_10, 2, 6
    sub        r0, r1
    pxor       m4, m4
    movq       m0, [r0+0]
    movq       m1, [r0+8]
%if mmsize==16
    punpcklwd  m0, m1
    movhlps    m1, m0
    paddw      m0, m1
%else
    pshufw     m2, m0, 00001110b
    pshufw     m3, m1, 00001110b
    paddw      m0, m2
    paddw      m1, m3
    punpcklwd  m0, m1
%endif
    %1         m2, m0, 00001110b
    paddw      m0, m2
    lea        r5, [r1*3]
    lea        r4, [r0+r1*4]
    movzx      r2d, word [r0+r1*1-2]
    movzx      r3d, word [r0+r1*2-2]
    add        r2d, r3d
    movzx      r3d, word [r0+r5*1-2]
    add        r2d, r3d
    movzx      r3d, word [r4-2]
    add        r2d, r3d
    movd       m2, r2d            ; s2
    movzx      r2d, word [r4+r1*1-2]
    movzx      r3d, word [r4+r1*2-2]
    add        r2d, r3d
    movzx      r3d, word [r4+r5*1-2]
    add        r2d, r3d
    movzx      r3d, word [r4+r1*4-2]
    add        r2d, r3d
    movd       m3, r2d            ; s3
    punpcklwd  m2, m3
    punpckldq  m0, m2             ; s0, s1, s2, s3
    %1         m3, m0, 11110110b  ; s2, s1, s3, s3
    %1         m0, m0, 01110100b  ; s0, s1, s3, s1
    paddw      m0, m3
    psrlw      m0, 2
    pavgw      m0, m4             ; s0+s2, s1, s3, s1+s3
%if mmsize==16
    punpcklwd  m0, m0
    pshufd     m3, m0, 11111010b
    punpckldq  m0, m0
    SWAP        0,1
%else
    pshufw     m1, m0, 0x00
    pshufw     m2, m0, 0x55
    pshufw     m3, m0, 0xaa
    pshufw     m4, m0, 0xff
%endif
    MOV8       r0+r1*1, m1, m2
    MOV8       r0+r1*2, m1, m2
    MOV8       r0+r5*1, m1, m2
    MOV8       r0+r1*4, m1, m2
    MOV8       r4+r1*1, m3, m4
    MOV8       r4+r1*2, m3, m4
    MOV8       r4+r5*1, m3, m4
    MOV8       r4+r1*4, m3, m4
    RET
%endmacro

INIT_MMX mmxext
PRED8x8_DC pshufw
INIT_XMM sse2
PRED8x8_DC pshuflw

;-----------------------------------------------------------------------------
; void pred8x8_top_dc(pixel *src, int stride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal pred8x8_top_dc_10, 2, 4
    sub        r0, r1
    mova       m0, [r0]
    pshuflw    m1, m0, 0x4e
    pshufhw    m1, m1, 0x4e
    paddw      m0, m1
    pshuflw    m1, m0, 0xb1
    pshufhw    m1, m1, 0xb1
    paddw      m0, m1
    lea        r2, [r1*3]
    lea        r3, [r0+r1*4]
    paddw      m0, [pw_2]
    psrlw      m0, 2
    mova       [r0+r1*1], m0
    mova       [r0+r1*2], m0
    mova       [r0+r2*1], m0
    mova       [r0+r1*4], m0
    mova       [r3+r1*1], m0
    mova       [r3+r1*2], m0
    mova       [r3+r2*1], m0
    mova       [r3+r1*4], m0
    RET

;-----------------------------------------------------------------------------
; void pred8x8_plane(pixel *src, int stride)
;-----------------------------------------------------------------------------
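; H.264 chroma plane prediction: H is a weighted sum over the top row
; (weights -3..4 from pw_m32101234, with the top-left pixel folded in) and
; V the matching sum down the left column; then
;   b = (17*H + 16) >> 5,  c = (17*V + 16) >> 5
;   a = 16 * (src[7*stride-1] + src[-stride+7])
; and each output row is clip((a + b*(x-3) + c*(y-3) + 16) >> 5) against
; pw_pixel_max. The loop keeps the b*(x-3) terms in a vector and adds c
; once per row.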
INIT_XMM sse2
cglobal pred8x8_plane_10, 2, 7, 7
    sub        r0, r1
    lea        r2, [r1*3]
    lea        r3, [r0+r1*4]
    mova       m2, [r0]
    pmaddwd    m2, [pw_m32101234]
    HADDD      m2, m1
    movd       m0, [r0-4]
    psrld      m0, 14
    psubw      m2, m0               ; H
    movd       m0, [r3+r1*4-4]
    movd       m1, [r0+12]
    paddw      m0, m1
    psllw      m0, 4                ; 16*(src[7*stride-1] + src[-stride+7])
    movzx      r4d, word [r3+r1*1-2] ; src[4*stride-1]
    movzx      r5d, word [r0+r2*1-2] ; src[2*stride-1]
    sub        r4d, r5d
    movzx      r6d, word [r3+r1*2-2] ; src[5*stride-1]
    movzx      r5d, word [r0+r1*2-2] ; src[1*stride-1]
    sub        r6d, r5d
    lea        r4d, [r4+r6*2]
    movzx      r5d, word [r3+r2*1-2] ; src[6*stride-1]
    movzx      r6d, word [r0+r1*1-2] ; src[0*stride-1]
    sub        r5d, r6d
    lea        r5d, [r5*3]
    add        r4d, r5d
    movzx      r6d, word [r3+r1*4-2] ; src[7*stride-1]
    movzx      r5d, word [r0+r1*0-2] ; src[ -stride-1]
    sub        r6d, r5d
    lea        r4d, [r4+r6*4]
    movd       m3, r4d              ; V
    punpckldq  m2, m3
    pmaddwd    m2, [pd_17]
    paddd      m2, [pd_16]
    psrad      m2, 5                ; b, c
    mova       m3, [pw_pixel_max]
    pxor       m1, m1
    SPLATW     m0, m0, 1
    SPLATW     m4, m2, 2
    SPLATW     m2, m2, 0
    pmullw     m2, [pw_m32101234]   ; b
    pmullw     m5, m4, [pw_m3]      ; c
    paddw      m5, [pw_16]
    mov        r2d, 8
    add        r0, r1
.loop:
    paddsw     m6, m2, m5
    paddsw     m6, m0
    psraw      m6, 5
    CLIPW      m6, m1, m3
    mova       [r0], m6
    paddw      m5, m4
    add        r0, r1
    dec        r2d
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void pred8x8l_128_dc(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_128_DC 0
cglobal pred8x8l_128_dc_10, 4, 4
    mova       m0, [pw_512] ; (1<<(BIT_DEPTH-1))
    lea        r1, [r3*3]
    lea        r2, [r0+r3*4]
    MOV8       r0+r3*0, m0, m0
    MOV8       r0+r3*1, m0, m0
    MOV8       r0+r3*2, m0, m0
    MOV8       r0+r1*1, m0, m0
    MOV8       r2+r3*0, m0, m0
    MOV8       r2+r3*1, m0, m0
    MOV8       r2+r3*2, m0, m0
    MOV8       r2+r1*1, m0, m0
    RET
%endmacro

INIT_MMX mmxext
PRED8x8L_128_DC
INIT_XMM sse2
PRED8x8L_128_DC

;-----------------------------------------------------------------------------
; void pred8x8l_top_dc(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
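; In the 8x8l functions, has_topleft arrives as 0x8000 (or 0) and
; has_topright as 0x4000 (or 0), slices of the decoder's availability
; bitmasks. shr r1d, 14 therefore yields 2 when the top-left pixel exists;
; neg turns it into a -2 byte offset, so pinsrw loads the real top-left
; pixel or re-reads t0 when it is missing. The same shr turns has_topright
; into +2, picking either t8 or a copy of t7.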
%macro PRED8x8L_TOP_DC 0
cglobal pred8x8l_top_dc_10, 4, 4, 6
    sub        r0, r3
    mova       m0, [r0]
    shr        r1d, 14
    shr        r2d, 13
    neg        r1
    pslldq     m1, m0, 2
    psrldq     m2, m0, 2
    pinsrw     m1, [r0+r1], 0
    pinsrw     m2, [r0+r2+14], 7
    lea        r1, [r3*3]
    lea        r2, [r0+r3*4]
    PRED4x4_LOWPASS m0, m2, m1, m0
    HADDW      m0, m1
    paddw      m0, [pw_4]
    psrlw      m0, 3
    SPLATW     m0, m0, 0
    mova       [r0+r3*1], m0
    mova       [r0+r3*2], m0
    mova       [r0+r1*1], m0
    mova       [r0+r3*4], m0
    mova       [r2+r3*1], m0
    mova       [r2+r3*2], m0
    mova       [r2+r1*1], m0
    mova       [r2+r3*4], m0
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_TOP_DC
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_TOP_DC
%endif

;-----------------------------------------------------------------------------
; void pred8x8l_dc(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
; TODO: see if scalar is faster
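; The eight left neighbours sit in the last word of each 16-byte [row-16]
; load, so the punpckhwd/punpckhdq/punpckhqdq chain below acts as a partial
; transpose, gathering the whole left column into one register before the
; usual lowpass filter and horizontal add.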
%macro PRED8x8L_DC 0
cglobal pred8x8l_dc_10, 4, 6, 6
    sub        r0, r3
    lea        r4, [r0+r3*4]
    lea        r5, [r3*3]
    mova       m0, [r0+r3*2-16]
    punpckhwd  m0, [r0+r3*1-16]
    mova       m1, [r4+r3*0-16]
    punpckhwd  m1, [r0+r5*1-16]
    punpckhdq  m1, m0
    mova       m2, [r4+r3*2-16]
    punpckhwd  m2, [r4+r3*1-16]
    mova       m3, [r4+r3*4-16]
    punpckhwd  m3, [r4+r5*1-16]
    punpckhdq  m3, m2
    punpckhqdq m3, m1
    mova       m0, [r0]
    shr        r1d, 14
    shr        r2d, 13
    neg        r1
    pslldq     m1, m0, 2
    psrldq     m2, m0, 2
    pinsrw     m1, [r0+r1], 0
    pinsrw     m2, [r0+r2+14], 7
    not        r1
    and        r1, r3
    pslldq     m4, m3, 2
    psrldq     m5, m3, 2
    pshuflw    m4, m4, 11100101b
    pinsrw     m5, [r0+r1-2], 7
    PRED4x4_LOWPASS m3, m4, m5, m3
    PRED4x4_LOWPASS m0, m2, m1, m0
    paddw      m0, m3
    HADDW      m0, m1
    paddw      m0, [pw_8]
    psrlw      m0, 4
    SPLATW     m0, m0
    mova       [r0+r3*1], m0
    mova       [r0+r3*2], m0
    mova       [r0+r5*1], m0
    mova       [r0+r3*4], m0
    mova       [r4+r3*1], m0
    mova       [r4+r3*2], m0
    mova       [r4+r5*1], m0
    mova       [r4+r3*4], m0
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_DC
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_DC
%endif

;-----------------------------------------------------------------------------
; void pred8x8l_vertical(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_VERTICAL 0
cglobal pred8x8l_vertical_10, 4, 4, 6
    sub        r0, r3
    mova       m0, [r0]
    shr        r1d, 14
    shr        r2d, 13
    neg        r1
    pslldq     m1, m0, 2
    psrldq     m2, m0, 2
    pinsrw     m1, [r0+r1], 0
    pinsrw     m2, [r0+r2+14], 7
    lea        r1, [r3*3]
    lea        r2, [r0+r3*4]
    PRED4x4_LOWPASS m0, m2, m1, m0
    mova       [r0+r3*1], m0
    mova       [r0+r3*2], m0
    mova       [r0+r1*1], m0
    mova       [r0+r3*4], m0
    mova       [r2+r3*1], m0
    mova       [r2+r3*2], m0
    mova       [r2+r1*1], m0
    mova       [r2+r3*4], m0
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_VERTICAL
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_VERTICAL
%endif

;-----------------------------------------------------------------------------
; void pred8x8l_horizontal(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_HORIZONTAL 0
cglobal pred8x8l_horizontal_10, 4, 4, 5
    mova       m0, [r0-16]
    shr        r1d, 14
    dec        r1
    and        r1, r3
    sub        r1, r3
    punpckhwd  m0, [r0+r1-16]
    mova       m1, [r0+r3*2-16]
    punpckhwd  m1, [r0+r3*1-16]
    lea        r2, [r0+r3*4]
    lea        r1, [r3*3]
    punpckhdq  m1, m0
    mova       m2, [r2+r3*0-16]
    punpckhwd  m2, [r0+r1-16]
    mova       m3, [r2+r3*2-16]
    punpckhwd  m3, [r2+r3*1-16]
    punpckhdq  m3, m2
    punpckhqdq m3, m1
    PALIGNR    m4, m3, [r2+r1-16], 14, m0
    pslldq     m0, m4, 2
    pshuflw    m0, m0, 11100101b
    PRED4x4_LOWPASS m4, m3, m0, m4
    punpckhwd  m3, m4, m4
    punpcklwd  m4, m4
    pshufd     m0, m3, 0xff
    pshufd     m1, m3, 0xaa
    pshufd     m2, m3, 0x55
    pshufd     m3, m3, 0x00
    mova       [r0+r3*0], m0
    mova       [r0+r3*1], m1
    mova       [r0+r3*2], m2
    mova       [r0+r1*1], m3
    pshufd     m0, m4, 0xff
    pshufd     m1, m4, 0xaa
    pshufd     m2, m4, 0x55
    pshufd     m3, m4, 0x00
    mova       [r2+r3*0], m0
    mova       [r2+r3*1], m1
    mova       [r2+r3*2], m2
    mova       [r2+r1*1], m3
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_HORIZONTAL
INIT_XMM ssse3
PRED8x8L_HORIZONTAL
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_HORIZONTAL
%endif

;-----------------------------------------------------------------------------
; void pred8x8l_down_left(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
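; If has_topright is zero (the shr below leaves ZF set, and none of the SSE
; instructions between it and the jz touch the flags), .fix_tr substitutes
; t7 broadcast to all eight lanes for the filtered above-right block, i.e.
; the spec's edge extension, then rejoins at .do_topright.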
%macro PRED8x8L_DOWN_LEFT 0
cglobal pred8x8l_down_left_10, 4, 4, 7
    sub        r0, r3
    mova       m3, [r0]
    shr        r1d, 14
    neg        r1
    shr        r2d, 13
    pslldq     m1, m3, 2
    psrldq     m2, m3, 2
    pinsrw     m1, [r0+r1], 0
    pinsrw     m2, [r0+r2+14], 7
    PRED4x4_LOWPASS m6, m2, m1, m3
    jz .fix_tr ; flags from shr r2d
    mova       m1, [r0+16]
    psrldq     m5, m1, 2
    PALIGNR    m2, m1, m3, 14, m3
    pshufhw    m5, m5, 10100100b
    PRED4x4_LOWPASS m1, m2, m5, m1
.do_topright:
    lea        r1, [r3*3]
    psrldq     m5, m1, 14
    lea        r2, [r0+r3*4]
    PALIGNR    m2, m1, m6, 2, m0
    PALIGNR    m3, m1, m6, 14, m0
    PALIGNR    m5, m1, 2, m0
    pslldq     m4, m6, 2
    PRED4x4_LOWPASS m6, m4, m2, m6
    PRED4x4_LOWPASS m1, m3, m5, m1
    mova       [r2+r3*4], m1
    PALIGNR    m1, m6, 14, m2
    pslldq     m6, 2
    mova       [r2+r1*1], m1
    PALIGNR    m1, m6, 14, m2
    pslldq     m6, 2
    mova       [r2+r3*2], m1
    PALIGNR    m1, m6, 14, m2
    pslldq     m6, 2
    mova       [r2+r3*1], m1
    PALIGNR    m1, m6, 14, m2
    pslldq     m6, 2
    mova       [r0+r3*4], m1
    PALIGNR    m1, m6, 14, m2
    pslldq     m6, 2
    mova       [r0+r1*1], m1
    PALIGNR    m1, m6, 14, m2
    pslldq     m6, 2
    mova       [r0+r3*2], m1
    PALIGNR    m1, m6, 14, m6
    mova       [r0+r3*1], m1
    RET
.fix_tr:
    punpckhwd  m3, m3
    pshufd     m1, m3, 0xFF
    jmp .do_topright
%endmacro

INIT_XMM sse2
PRED8x8L_DOWN_LEFT
INIT_XMM ssse3
PRED8x8L_DOWN_LEFT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_DOWN_LEFT
%endif

;-----------------------------------------------------------------------------
; void pred8x8l_down_right(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_DOWN_RIGHT 0
; standard forbids this when has_topleft is false
; no need to check
cglobal pred8x8l_down_right_10, 4, 5, 8
    sub        r0, r3
    lea        r4, [r0+r3*4]
    lea        r1, [r3*3]
    mova       m0, [r0+r3*1-16]
    punpckhwd  m0, [r0+r3*0-16]
    mova       m1, [r0+r1*1-16]
    punpckhwd  m1, [r0+r3*2-16]
    punpckhdq  m1, m0
    mova       m2, [r4+r3*1-16]
    punpckhwd  m2, [r4+r3*0-16]
    mova       m3, [r4+r1*1-16]
    punpckhwd  m3, [r4+r3*2-16]
    punpckhdq  m3, m2
    punpckhqdq m3, m1
    mova       m0, [r4+r3*4-16]
    mova       m1, [r0]
    PALIGNR    m4, m3, m0, 14, m0
    PALIGNR    m1, m3, 2, m2
    pslldq     m0, m4, 2
    pshuflw    m0, m0, 11100101b
    PRED4x4_LOWPASS m6, m1, m4, m3
    PRED4x4_LOWPASS m4, m3, m0, m4
    mova       m3, [r0]
    shr        r2d, 13
    pslldq     m1, m3, 2
    psrldq     m2, m3, 2
    pinsrw     m1, [r0-2], 0
    pinsrw     m2, [r0+r2+14], 7
    PRED4x4_LOWPASS m3, m2, m1, m3
    PALIGNR    m2, m3, m6, 2, m0
    PALIGNR    m5, m3, m6, 14, m0
    psrldq     m7, m3, 2
    PRED4x4_LOWPASS m6, m4, m2, m6
    PRED4x4_LOWPASS m3, m5, m7, m3
    mova       [r4+r3*4], m6
    PALIGNR    m3, m6, 14, m2
    pslldq     m6, 2
    mova       [r0+r3*1], m3
    PALIGNR    m3, m6, 14, m2
    pslldq     m6, 2
    mova       [r0+r3*2], m3
    PALIGNR    m3, m6, 14, m2
    pslldq     m6, 2
    mova       [r0+r1*1], m3
    PALIGNR    m3, m6, 14, m2
    pslldq     m6, 2
    mova       [r0+r3*4], m3
    PALIGNR    m3, m6, 14, m2
    pslldq     m6, 2
    mova       [r4+r3*1], m3
    PALIGNR    m3, m6, 14, m2
    pslldq     m6, 2
    mova       [r4+r3*2], m3
    PALIGNR    m3, m6, 14, m6
    mova       [r4+r1*1], m3
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_DOWN_RIGHT
INIT_XMM ssse3
PRED8x8L_DOWN_RIGHT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_DOWN_RIGHT
%endif

;-----------------------------------------------------------------------------
; void pred8x8l_vertical_right(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_VERTICAL_RIGHT 0
; as in pred8x8l_down_right, the standard guarantees has_topleft here,
; so there is no need to check it
cglobal pred8x8l_vertical_right_10, 4, 5, 7
    sub        r0, r3
    lea        r4, [r0+r3*4]
    lea        r1, [r3*3]
    mova       m0, [r0+r3*1-16]
    punpckhwd  m0, [r0+r3*0-16]
    mova       m1, [r0+r1*1-16]
    punpckhwd  m1, [r0+r3*2-16]
    punpckhdq  m1, m0
    mova       m2, [r4+r3*1-16]
    punpckhwd  m2, [r4+r3*0-16]
    mova       m3, [r4+r1*1-16]
    punpckhwd  m3, [r4+r3*2-16]
    punpckhdq  m3, m2
    punpckhqdq m3, m1
    mova       m0, [r4+r3*4-16]
    mova       m1, [r0]
    PALIGNR    m4, m3, m0, 14, m0
    PALIGNR    m1, m3, 2, m2
    PRED4x4_LOWPASS m3, m1, m4, m3
    mova       m2, [r0]
    shr        r2d, 13
    pslldq     m1, m2, 2
    psrldq     m5, m2, 2
    pinsrw     m1, [r0-2], 0
    pinsrw     m5, [r0+r2+14], 7
    PRED4x4_LOWPASS m2, m5, m1, m2
    PALIGNR    m6, m2, m3, 12, m1
    PALIGNR    m5, m2, m3, 14, m0
    PRED4x4_LOWPASS m0, m6, m2, m5
    pavgw      m2, m5
    mova       [r0+r3*2], m0
    mova       [r0+r3*1], m2
    pslldq     m6, m3, 4
    pslldq     m1, m3, 2
    PRED4x4_LOWPASS m1, m3, m6, m1
    PALIGNR    m2, m1, 14, m4
    mova       [r0+r1*1], m2
    pslldq     m1, 2
    PALIGNR    m0, m1, 14, m3
    mova       [r0+r3*4], m0
    pslldq     m1, 2
    PALIGNR    m2, m1, 14, m4
    mova       [r4+r3*1], m2
    pslldq     m1, 2
    PALIGNR    m0, m1, 14, m3
    mova       [r4+r3*2], m0
    pslldq     m1, 2
    PALIGNR    m2, m1, 14, m4
    mova       [r4+r1*1], m2
    pslldq     m1, 2
    PALIGNR    m0, m1, 14, m1
    mova       [r4+r3*4], m0
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_VERTICAL_RIGHT
INIT_XMM ssse3
PRED8x8L_VERTICAL_RIGHT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_VERTICAL_RIGHT
%endif

;-----------------------------------------------------------------------------
; void pred8x8l_horizontal_up(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_HORIZONTAL_UP 0
cglobal pred8x8l_horizontal_up_10, 4, 4, 6
    mova       m0, [r0+r3*0-16]
    punpckhwd  m0, [r0+r3*1-16]
    shr        r1d, 14
    dec        r1
    and        r1, r3
    sub        r1, r3
    mova       m4, [r0+r1*1-16]
    lea        r1, [r3*3]
    lea        r2, [r0+r3*4]
    mova       m1, [r0+r3*2-16]
    punpckhwd  m1, [r0+r1*1-16]
    punpckhdq  m0, m1
    mova       m2, [r2+r3*0-16]
    punpckhwd  m2, [r2+r3*1-16]
    mova       m3, [r2+r3*2-16]
    punpckhwd  m3, [r2+r1*1-16]
    punpckhdq  m2, m3
    punpckhqdq m0, m2
    PALIGNR    m1, m0, m4, 14, m4
    psrldq     m2, m0, 2
    pshufhw    m2, m2, 10100100b
    PRED4x4_LOWPASS m0, m1, m2, m0
    psrldq     m1, m0, 2
    psrldq     m2, m0, 4
    pshufhw    m1, m1, 10100100b
    pshufhw    m2, m2, 01010100b
    pavgw      m4, m0, m1
    PRED4x4_LOWPASS m1, m2, m0, m1
    punpckhwd  m5, m4, m1
    punpcklwd  m4, m1
    mova       [r2+r3*0], m5
    mova       [r0+r3*0], m4
    pshufd     m0, m5, 11111001b
    pshufd     m1, m5, 11111110b
    pshufd     m2, m5, 11111111b
    mova       [r2+r3*1], m0
    mova       [r2+r3*2], m1
    mova       [r2+r1*1], m2
    PALIGNR    m2, m5, m4, 4, m0
    PALIGNR    m3, m5, m4, 8, m1
    PALIGNR    m5, m5, m4, 12, m4
    mova       [r0+r3*1], m2
    mova       [r0+r3*2], m3
    mova       [r0+r1*1], m5
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_HORIZONTAL_UP
INIT_XMM ssse3
PRED8x8L_HORIZONTAL_UP
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_HORIZONTAL_UP
%endif

;-----------------------------------------------------------------------------
; void pred16x16_vertical(pixel *src, int stride)
;-----------------------------------------------------------------------------
%macro MOV16 3-5
    mova       [%1+ 0], %2
    mova       [%1+mmsize], %3
%if mmsize==8
    mova       [%1+16], %4
    mova       [%1+24], %5
%endif
%endmacro
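
; One row of 16 10-bit pixels is 32 bytes: two stores with xmm registers,
; four with mmx. MOV16 hides that difference so each 16x16 predictor below
; can be instantiated once per register size.
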
%macro PRED16x16_VERTICAL 0
cglobal pred16x16_vertical_10, 2, 3
    sub        r0, r1
    mov        r2d, 8
    mova       m0, [r0+ 0]
    mova       m1, [r0+mmsize]
%if mmsize==8
    mova       m2, [r0+16]
    mova       m3, [r0+24]
%endif
.loop:
    MOV16      r0+r1*1, m0, m1, m2, m3
    MOV16      r0+r1*2, m0, m1, m2, m3
    lea        r0, [r0+r1*2]
    dec        r2d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_VERTICAL
INIT_XMM sse2
PRED16x16_VERTICAL

;-----------------------------------------------------------------------------
; void pred16x16_horizontal(pixel *src, int stride)
;-----------------------------------------------------------------------------
%macro PRED16x16_HORIZONTAL 0
cglobal pred16x16_horizontal_10, 2, 3
    mov        r2d, 8
.vloop:
    movd       m0, [r0+r1*0-4]
    movd       m1, [r0+r1*1-4]
    SPLATW     m0, m0, 1
    SPLATW     m1, m1, 1
    MOV16      r0+r1*0, m0, m0, m0, m0
    MOV16      r0+r1*1, m1, m1, m1, m1
    lea        r0, [r0+r1*2]
    dec        r2d
    jg .vloop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_HORIZONTAL
INIT_XMM sse2
PRED16x16_HORIZONTAL

;-----------------------------------------------------------------------------
; void pred16x16_dc(pixel *src, int stride)
;-----------------------------------------------------------------------------
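; dc = (sum of the 16 top pixels + sum of the 16 left pixels + 16) >> 5,
; broadcast to the whole block. The top sum is reduced with HADDW; the left
; column is accumulated with scalar movzx loads since those pixels are not
; contiguous in memory.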
%macro PRED16x16_DC 0
cglobal pred16x16_dc_10, 2, 6
    mov        r5, r0
    sub        r0, r1
    mova       m0, [r0+0]
    paddw      m0, [r0+mmsize]
%if mmsize==8
    paddw      m0, [r0+16]
    paddw      m0, [r0+24]
%endif
    HADDW      m0, m2
    lea        r0, [r0+r1-2]
    movzx      r3d, word [r0]
    movzx      r4d, word [r0+r1]
%rep 7
    lea        r0, [r0+r1*2]
    movzx      r2d, word [r0]
    add        r3d, r2d
    movzx      r2d, word [r0+r1]
    add        r4d, r2d
%endrep
    lea        r3d, [r3+r4+16]
    movd       m1, r3d
    paddw      m0, m1
    psrlw      m0, 5
    SPLATW     m0, m0
    mov        r3d, 8
.loop:
    MOV16      r5+r1*0, m0, m0, m0, m0
    MOV16      r5+r1*1, m0, m0, m0, m0
    lea        r5, [r5+r1*2]
    dec        r3d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_DC
INIT_XMM sse2
PRED16x16_DC

;-----------------------------------------------------------------------------
; void pred16x16_top_dc(pixel *src, int stride)
;-----------------------------------------------------------------------------
%macro PRED16x16_TOP_DC 0
cglobal pred16x16_top_dc_10, 2, 3
    sub        r0, r1
    mova       m0, [r0+0]
    paddw      m0, [r0+mmsize]
%if mmsize==8
    paddw      m0, [r0+16]
    paddw      m0, [r0+24]
%endif
    HADDW      m0, m2
    SPLATW     m0, m0
    paddw      m0, [pw_8]
    psrlw      m0, 4
    mov        r2d, 8
.loop:
    MOV16      r0+r1*1, m0, m0, m0, m0
    MOV16      r0+r1*2, m0, m0, m0, m0
    lea        r0, [r0+r1*2]
    dec        r2d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_TOP_DC
INIT_XMM sse2
PRED16x16_TOP_DC

;-----------------------------------------------------------------------------
; void pred16x16_left_dc(pixel *src, int stride)
;-----------------------------------------------------------------------------
%macro PRED16x16_LEFT_DC 0
cglobal pred16x16_left_dc_10, 2, 6
    mov        r5, r0
    sub        r0, 2
    movzx      r3d, word [r0]
    movzx      r4d, word [r0+r1]
%rep 7
    lea        r0, [r0+r1*2]
    movzx      r2d, word [r0]
    add        r3d, r2d
    movzx      r2d, word [r0+r1]
    add        r4d, r2d
%endrep
    lea        r3d, [r3+r4+8]
    shr        r3d, 4
    movd       m0, r3d
    SPLATW     m0, m0
    mov        r3d, 8
.loop:
    MOV16      r5+r1*0, m0, m0, m0, m0
    MOV16      r5+r1*1, m0, m0, m0, m0
    lea        r5, [r5+r1*2]
    dec        r3d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_LEFT_DC
INIT_XMM sse2
PRED16x16_LEFT_DC

;-----------------------------------------------------------------------------
; void pred16x16_128_dc(pixel *src, int stride)
;-----------------------------------------------------------------------------
%macro PRED16x16_128_DC 0
cglobal pred16x16_128_dc_10, 2, 3
    mova       m0, [pw_512]
    mov        r2d, 8
.loop:
    MOV16      r0+r1*0, m0, m0, m0, m0
    MOV16      r0+r1*1, m0, m0, m0, m0
    lea        r0, [r0+r1*2]
    dec        r2d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_128_DC
INIT_XMM sse2
PRED16x16_128_DC