;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 intra prediction code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
cextern pw_16
cextern pw_8
cextern pw_4
cextern pw_2
cextern pw_1
pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4
pw_m3: times 8 dw -3
pw_pixel_max: times 8 dw ((1 << 10)-1)
pw_512: times 8 dw 512
pd_17: times 4 dd 17
pd_16: times 4 dd 16
SECTION .text
; dest, left, right, src
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
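; derivation (added note): pavgw computes (a+b+1)>>1, so the sequence below
; yields (%4 + ((%2+%3)>>1) + 1) >> 1, which works out to exactly
; (%2 + 2*%4 + %3 + 2) >> 2 for 10-bit inputs, where the 16-bit word
; additions cannot overflow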
%macro PRED4x4_LOWPASS 4
paddw %2, %3
psrlw %2, 1
pavgw %1, %4, %2
%endmacro
;-----------------------------------------------------------------------------
; void pred4x4_down_right(pixel *src, const pixel *topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED4x4_DR 0
cglobal pred4x4_down_right_10, 3, 3
sub r0, r2
lea r1, [r0+r2*2]
movhps m1, [r1-8]
movhps m2, [r0+r2*1-8]
movhps m4, [r0-8]
punpckhwd m2, m4
movq m3, [r0]
punpckhdq m1, m2
PALIGNR m3, m1, 10, m1
movhps m4, [r1+r2*1-8]
PALIGNR m0, m3, m4, 14, m4
movhps m4, [r1+r2*2-8]
PALIGNR m2, m0, m4, 14, m4
PRED4x4_LOWPASS m0, m2, m3, m0
movq [r1+r2*2], m0
psrldq m0, 2
movq [r1+r2*1], m0
psrldq m0, 2
movq [r0+r2*2], m0
psrldq m0, 2
movq [r0+r2*1], m0
RET
%endmacro
INIT_XMM sse2
%define PALIGNR PALIGNR_MMX
PRED4x4_DR
INIT_XMM ssse3
%define PALIGNR PALIGNR_SSSE3
PRED4x4_DR
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_DR
%endif
;-----------------------------------------------------------------------------
; void pred4x4_vertical_right(pixel *src, const pixel *topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED4x4_VR 0
cglobal pred4x4_vertical_right_10, 3, 3, 6
sub r0, r2
lea r1, [r0+r2*2]
movq m5, [r0] ; ........t3t2t1t0
movhps m1, [r0-8]
PALIGNR m0, m5, m1, 14, m1 ; ......t3t2t1t0lt
pavgw m5, m0
movhps m1, [r0+r2*1-8]
PALIGNR m0, m1, 14, m1 ; ....t3t2t1t0ltl0
movhps m2, [r0+r2*2-8]
PALIGNR m1, m0, m2, 14, m2 ; ..t3t2t1t0ltl0l1
movhps m3, [r1+r2*1-8]
PALIGNR m2, m1, m3, 14, m3 ; t3t2t1t0ltl0l1l2
PRED4x4_LOWPASS m1, m0, m2, m1
pslldq m0, m1, 12
psrldq m1, 4
movq [r0+r2*1], m5
movq [r0+r2*2], m1
PALIGNR m5, m0, 14, m2
pslldq m0, 2
movq [r1+r2*1], m5
PALIGNR m1, m0, 14, m0
movq [r1+r2*2], m1
RET
%endmacro
INIT_XMM sse2
%define PALIGNR PALIGNR_MMX
PRED4x4_VR
INIT_XMM ssse3
%define PALIGNR PALIGNR_SSSE3
PRED4x4_VR
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_VR
%endif
;-----------------------------------------------------------------------------
; void pred4x4_horizontal_down(pixel *src, const pixel *topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED4x4_HD 0
cglobal pred4x4_horizontal_down_10, 3, 3
sub r0, r2
lea r1, [r0+r2*2]
movq m0, [r0-8] ; lt ..
movhps m0, [r0]
pslldq m0, 2 ; t2 t1 t0 lt .. .. .. ..
movq m1, [r1+r2*2-8] ; l3
movq m3, [r1+r2*1-8]
punpcklwd m1, m3 ; l2 l3
movq m2, [r0+r2*2-8] ; l1
movq m3, [r0+r2*1-8]
punpcklwd m2, m3 ; l0 l1
punpckhdq m1, m2 ; l0 l1 l2 l3
punpckhqdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
psrldq m0, m1, 4 ; .. .. t2 t1 t0 lt l0 l1
psrldq m3, m1, 2 ; .. t2 t1 t0 lt l0 l1 l2
pavgw m5, m1, m3
PRED4x4_LOWPASS m3, m1, m0, m3
punpcklwd m5, m3
psrldq m3, 8
PALIGNR m3, m5, 12, m4
movq [r1+r2*2], m5
movhps [r0+r2*2], m5
psrldq m5, 4
movq [r1+r2*1], m5
movq [r0+r2*1], m3
RET
%endmacro
INIT_XMM sse2
%define PALIGNR PALIGNR_MMX
PRED4x4_HD
INIT_XMM ssse3
%define PALIGNR PALIGNR_SSSE3
PRED4x4_HD
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_HD
%endif
;-----------------------------------------------------------------------------
; void pred4x4_dc(pixel *src, const pixel *topright, int stride)
;-----------------------------------------------------------------------------
%macro HADDD 2 ; %1 = sum (output), %2 = scratch
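; horizontally adds the dword lanes of %1; the total ends up in the low dword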
%if mmsize == 16
movhlps %2, %1
paddd %1, %2
pshuflw %2, %1, 0xE
paddd %1, %2
%else
pshufw %2, %1, 0xE
paddd %1, %2
%endif
%endmacro
%macro HADDW 2
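; pmaddwd against pw_1 widens and pairwise-adds the words, then HADDD finishes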
pmaddwd %1, [pw_1]
HADDD %1, %2
%endmacro
INIT_MMX mmx2
cglobal pred4x4_dc_10, 3, 3
sub r0, r2
lea r1, [r0+r2*2]
movq m2, [r0+r2*1-8]
paddw m2, [r0+r2*2-8]
paddw m2, [r1+r2*1-8]
paddw m2, [r1+r2*2-8]
psrlq m2, 48
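; the low word of m2 now holds the sum of the four left-column pixels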
movq m0, [r0]
HADDW m0, m1
paddw m0, [pw_4]
paddw m0, m2
psrlw m0, 3
SPLATW m0, m0, 0
movq [r0+r2*1], m0
movq [r0+r2*2], m0
movq [r1+r2*1], m0
movq [r1+r2*2], m0
RET
;-----------------------------------------------------------------------------
; void pred4x4_down_left(pixel *src, const pixel *topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED4x4_DL 0
cglobal pred4x4_down_left_10, 3, 3
sub r0, r2
movq m0, [r0]
movhps m0, [r1]
psrldq m2, m0, 2
pslldq m3, m0, 2
pshufhw m2, m2, 10100100b
PRED4x4_LOWPASS m0, m3, m2, m0
lea r1, [r0+r2*2]
movhps [r1+r2*2], m0
psrldq m0, 2
movq [r0+r2*1], m0
psrldq m0, 2
movq [r0+r2*2], m0
psrldq m0, 2
movq [r1+r2*1], m0
RET
%endmacro
INIT_XMM sse2
PRED4x4_DL
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_DL
%endif
;-----------------------------------------------------------------------------
; void pred4x4_vertical_left(pixel *src, const pixel *topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED4x4_VL 0
cglobal pred4x4_vertical_left_10, 3, 3
sub r0, r2
movu m1, [r0]
movhps m1, [r1]
psrldq m0, m1, 2
psrldq m2, m1, 4
pavgw m4, m0, m1
PRED4x4_LOWPASS m0, m1, m2, m0
lea r1, [r0+r2*2]
movq [r0+r2*1], m4
movq [r0+r2*2], m0
psrldq m4, 2
psrldq m0, 2
movq [r1+r2*1], m4
movq [r1+r2*2], m0
RET
%endmacro
INIT_XMM sse2
PRED4x4_VL
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_VL
%endif
;-----------------------------------------------------------------------------
; void pred4x4_horizontal_up(pixel *src, const pixel *topright, int stride)
;-----------------------------------------------------------------------------
INIT_MMX mmx2
cglobal pred4x4_horizontal_up_10, 3, 3
sub r0, r2
lea r1, [r0+r2*2]
movq m0, [r0+r2*1-8]
punpckhwd m0, [r0+r2*2-8]
movq m1, [r1+r2*1-8]
punpckhwd m1, [r1+r2*2-8]
punpckhdq m0, m1
pshufw m1, m1, 0xFF
movq [r1+r2*2], m1
movd [r1+r2*1+4], m1
pshufw m2, m0, 11111001b
movq m1, m2
pavgw m2, m0
pshufw m5, m0, 11111110b
PRED4x4_LOWPASS m1, m0, m5, m1
movq m6, m2
punpcklwd m6, m1
movq [r0+r2*1], m6
psrlq m2, 16
psrlq m1, 16
punpcklwd m2, m1
movq [r0+r2*2], m2
psrlq m2, 32
movd [r1+r2*1], m2
RET
;-----------------------------------------------------------------------------
; void pred8x8_vertical(pixel *src, int stride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal pred8x8_vertical_10, 2, 2
sub r0, r1
mova m0, [r0]
%rep 3
mova [r0+r1*1], m0
mova [r0+r1*2], m0
lea r0, [r0+r1*2]
%endrep
mova [r0+r1*1], m0
mova [r0+r1*2], m0
RET
;-----------------------------------------------------------------------------
; void pred8x8_horizontal(pixel *src, int stride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal pred8x8_horizontal_10, 2, 3
mov r2d, 4
.loop:
movq m0, [r0+r1*0-8]
movq m1, [r0+r1*1-8]
pshuflw m0, m0, 0xff
pshuflw m1, m1, 0xff
punpcklqdq m0, m0
punpcklqdq m1, m1
mova [r0+r1*0], m0
mova [r0+r1*1], m1
lea r0, [r0+r1*2]
dec r2d
jg .loop
REP_RET
;-----------------------------------------------------------------------------
; void pred8x8_dc(pixel *src, int stride)
;-----------------------------------------------------------------------------
%macro MOV8 2-3
; sort of a hack, but it works
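; writes one row of 8 pixels: two 8-byte stores with MMX, a single 16-byte store with SSE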
%if mmsize==8
movq [%1+0], %2
movq [%1+8], %3
%else
movdqa [%1], %2
%endif
%endmacro
%macro PRED8x8_DC 1
cglobal pred8x8_dc_10, 2, 6
sub r0, r1
pxor m4, m4
movq m0, [r0+0]
movq m1, [r0+8]
%if mmsize==16
punpcklwd m0, m1
movhlps m1, m0
paddw m0, m1
%else
pshufw m2, m0, 00001110b
pshufw m3, m1, 00001110b
paddw m0, m2
paddw m1, m3
punpcklwd m0, m1
%endif
%1 m2, m0, 00001110b
paddw m0, m2
lea r5, [r1*3]
lea r4, [r0+r1*4]
movzx r2d, word [r0+r1*1-2]
movzx r3d, word [r0+r1*2-2]
add r2d, r3d
movzx r3d, word [r0+r5*1-2]
add r2d, r3d
movzx r3d, word [r4-2]
add r2d, r3d
movd m2, r2d ; s2
movzx r2d, word [r4+r1*1-2]
movzx r3d, word [r4+r1*2-2]
add r2d, r3d
movzx r3d, word [r4+r5*1-2]
add r2d, r3d
movzx r3d, word [r4+r1*4-2]
add r2d, r3d
movd m3, r2d ; s3
punpcklwd m2, m3
punpckldq m0, m2 ; s0, s1, s2, s3
%1 m3, m0, 11110110b ; s2, s1, s3, s3
%1 m0, m0, 01110100b ; s0, s1, s3, s1
paddw m0, m3
psrlw m0, 2
pavgw m0, m4 ; s0+s2, s1, s3, s1+s3
%if mmsize==16
punpcklwd m0, m0
pshufd m3, m0, 11111010b
punpckldq m0, m0
SWAP 0,1
%else
pshufw m1, m0, 0x00
pshufw m2, m0, 0x55
pshufw m3, m0, 0xaa
pshufw m4, m0, 0xff
%endif
MOV8 r0+r1*1, m1, m2
MOV8 r0+r1*2, m1, m2
MOV8 r0+r5*1, m1, m2
MOV8 r0+r1*4, m1, m2
MOV8 r4+r1*1, m3, m4
MOV8 r4+r1*2, m3, m4
MOV8 r4+r5*1, m3, m4
MOV8 r4+r1*4, m3, m4
RET
%endmacro
INIT_MMX mmx2
PRED8x8_DC pshufw
INIT_XMM sse2
PRED8x8_DC pshuflw
;-----------------------------------------------------------------------------
; void pred8x8_top_dc(pixel *src, int stride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal pred8x8_top_dc_10, 2, 4
sub r0, r1
mova m0, [r0]
pshuflw m1, m0, 0x4e
pshufhw m1, m1, 0x4e
paddw m0, m1
pshuflw m1, m0, 0xb1
pshufhw m1, m1, 0xb1
paddw m0, m1
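; each word of m0 now holds the sum of its own 4-sample half of the top row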
lea r2, [r1*3]
lea r3, [r0+r1*4]
paddw m0, [pw_2]
psrlw m0, 2
mova [r0+r1*1], m0
mova [r0+r1*2], m0
mova [r0+r2*1], m0
mova [r0+r1*4], m0
mova [r3+r1*1], m0
mova [r3+r1*2], m0
mova [r3+r2*1], m0
mova [r3+r1*4], m0
RET
;-----------------------------------------------------------------------------
; void pred8x8_plane(pixel *src, int stride)
;-----------------------------------------------------------------------------
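; reference note (added; this appears to follow the H.264 chroma plane rule):
;   pred[y][x] = clip((a + b*(x-3) + c*(y-3) + 16) >> 5)
; with a = 16*(src[7*stride-1] + src[-stride+7]),
;      b = (17*H + 16) >> 5  and  c = (17*V + 16) >> 5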
INIT_XMM sse2
cglobal pred8x8_plane_10, 2, 7, 7
sub r0, r1
lea r2, [r1*3]
lea r3, [r0+r1*4]
mova m2, [r0]
pmaddwd m2, [pw_m32101234]
HADDD m2, m1
movd m0, [r0-4]
psrld m0, 14
psubw m2, m0 ; H
movd m0, [r3+r1*4-4]
movd m1, [r0+12]
paddw m0, m1
psllw m0, 4 ; 16*(src[7*stride-1] + src[-stride+7])
movzx r4d, word [r3+r1*1-2] ; src[4*stride-1]
movzx r5d, word [r0+r2*1-2] ; src[2*stride-1]
sub r4d, r5d
movzx r6d, word [r3+r1*2-2] ; src[5*stride-1]
movzx r5d, word [r0+r1*2-2] ; src[1*stride-1]
sub r6d, r5d
lea r4d, [r4+r6*2]
movzx r5d, word [r3+r2*1-2] ; src[6*stride-1]
movzx r6d, word [r0+r1*1-2] ; src[0*stride-1]
sub r5d, r6d
lea r5d, [r5*3]
add r4d, r5d
movzx r6d, word [r3+r1*4-2] ; src[7*stride-1]
movzx r5d, word [r0+r1*0-2] ; src[ -stride-1]
sub r6d, r5d
lea r4d, [r4+r6*4]
movd m3, r4d ; V
punpckldq m2, m3
pmaddwd m2, [pd_17]
paddd m2, [pd_16]
psrad m2, 5 ; b, c
mova m3, [pw_pixel_max]
pxor m1, m1
SPLATW m0, m0, 1
SPLATW m4, m2, 2
SPLATW m2, m2, 0
pmullw m2, [pw_m32101234] ; b
pmullw m5, m4, [pw_m3] ; c
paddw m5, [pw_16]
mov r2d, 8
add r0, r1
.loop:
paddsw m6, m2, m5
paddsw m6, m0
psraw m6, 5
CLIPW m6, m1, m3
mova [r0], m6
paddw m5, m4
add r0, r1
dec r2d
jg .loop
REP_RET
;-----------------------------------------------------------------------------
; void pred8x8l_128_dc(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_128_DC 0
cglobal pred8x8l_128_dc_10, 4, 4
mova m0, [pw_512] ; (1<<(BIT_DEPTH-1))
lea r1, [r3*3]
lea r2, [r0+r3*4]
MOV8 r0+r3*0, m0, m0
MOV8 r0+r3*1, m0, m0
MOV8 r0+r3*2, m0, m0
MOV8 r0+r1*1, m0, m0
MOV8 r2+r3*0, m0, m0
MOV8 r2+r3*1, m0, m0
MOV8 r2+r3*2, m0, m0
MOV8 r2+r1*1, m0, m0
RET
%endmacro
INIT_MMX mmx2
PRED8x8L_128_DC
INIT_XMM sse2
PRED8x8L_128_DC
;-----------------------------------------------------------------------------
; void pred8x8l_top_dc(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_TOP_DC 0
cglobal pred8x8l_top_dc_10, 4, 4, 6
sub r0, r3
mova m0, [r0]
shr r1d, 14
shr r2d, 13
neg r1
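; flag decoding (added note, assuming the C caller passes has_topleft as
; 0x8000 and has_topright as 0x4000, or 0): r1 becomes -2, so [r0+r1] reads
; the top-left sample (else top[0] is reused), and [r0+r2+14] reads top[8]
; (else top[7] is reused)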
pslldq m1, m0, 2
psrldq m2, m0, 2
pinsrw m1, [r0+r1], 0
pinsrw m2, [r0+r2+14], 7
lea r1, [r3*3]
lea r2, [r0+r3*4]
PRED4x4_LOWPASS m0, m2, m1, m0
HADDW m0, m1
paddw m0, [pw_4]
psrlw m0, 3
SPLATW m0, m0, 0
mova [r0+r3*1], m0
mova [r0+r3*2], m0
mova [r0+r1*1], m0
mova [r0+r3*4], m0
mova [r2+r3*1], m0
mova [r2+r3*2], m0
mova [r2+r1*1], m0
mova [r2+r3*4], m0
RET
%endmacro
INIT_XMM sse2
PRED8x8L_TOP_DC
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_TOP_DC
%endif
;-----------------------------------------------------------------------------
; void pred8x8l_dc(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
; TODO: see if scalar is faster
%macro PRED8x8L_DC 0
cglobal pred8x8l_dc_10, 4, 6, 6
sub r0, r3
lea r4, [r0+r3*4]
lea r5, [r3*3]
mova m0, [r0+r3*2-16]
punpckhwd m0, [r0+r3*1-16]
mova m1, [r4+r3*0-16]
punpckhwd m1, [r0+r5*1-16]
punpckhdq m1, m0
mova m2, [r4+r3*2-16]
punpckhwd m2, [r4+r3*1-16]
mova m3, [r4+r3*4-16]
punpckhwd m3, [r4+r5*1-16]
punpckhdq m3, m2
punpckhqdq m3, m1
mova m0, [r0]
shr r1d, 14
shr r2d, 13
neg r1
pslldq m1, m0, 2
psrldq m2, m0, 2
pinsrw m1, [r0+r1], 0
pinsrw m2, [r0+r2+14], 7
not r1
and r1, r3
pslldq m4, m3, 2
psrldq m5, m3, 2
pshuflw m4, m4, 11100101b
pinsrw m5, [r0+r1-2], 7
PRED4x4_LOWPASS m3, m4, m5, m3
PRED4x4_LOWPASS m0, m2, m1, m0
paddw m0, m3
HADDW m0, m1
paddw m0, [pw_8]
psrlw m0, 4
SPLATW m0, m0
mova [r0+r3*1], m0
mova [r0+r3*2], m0
mova [r0+r5*1], m0
mova [r0+r3*4], m0
mova [r4+r3*1], m0
mova [r4+r3*2], m0
mova [r4+r5*1], m0
mova [r4+r3*4], m0
RET
%endmacro
INIT_XMM sse2
PRED8x8L_DC
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_DC
%endif
;-----------------------------------------------------------------------------
; void pred8x8l_vertical(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_VERTICAL 0
cglobal pred8x8l_vertical_10, 4, 4, 6
sub r0, r3
mova m0, [r0]
shr r1d, 14
shr r2d, 13
neg r1
pslldq m1, m0, 2
psrldq m2, m0, 2
pinsrw m1, [r0+r1], 0
pinsrw m2, [r0+r2+14], 7
lea r1, [r3*3]
lea r2, [r0+r3*4]
PRED4x4_LOWPASS m0, m2, m1, m0
mova [r0+r3*1], m0
mova [r0+r3*2], m0
mova [r0+r1*1], m0
mova [r0+r3*4], m0
mova [r2+r3*1], m0
mova [r2+r3*2], m0
mova [r2+r1*1], m0
mova [r2+r3*4], m0
RET
%endmacro
INIT_XMM sse2
PRED8x8L_VERTICAL
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_VERTICAL
%endif
;-----------------------------------------------------------------------------
; void pred8x8l_horizontal(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_HORIZONTAL 0
cglobal pred8x8l_horizontal_10, 4, 4, 5
mova m0, [r0-16]
shr r1d, 14
dec r1
and r1, r3
sub r1, r3
punpckhwd m0, [r0+r1-16]
mova m1, [r0+r3*2-16]
punpckhwd m1, [r0+r3*1-16]
lea r2, [r0+r3*4]
lea r1, [r3*3]
punpckhdq m1, m0
mova m2, [r2+r3*0-16]
punpckhwd m2, [r0+r1-16]
mova m3, [r2+r3*2-16]
punpckhwd m3, [r2+r3*1-16]
punpckhdq m3, m2
punpckhqdq m3, m1
PALIGNR m4, m3, [r2+r1-16], 14, m0
pslldq m0, m4, 2
pshuflw m0, m0, 11100101b
PRED4x4_LOWPASS m4, m3, m0, m4
punpckhwd m3, m4, m4
punpcklwd m4, m4
pshufd m0, m3, 0xff
pshufd m1, m3, 0xaa
pshufd m2, m3, 0x55
pshufd m3, m3, 0x00
mova [r0+r3*0], m0
mova [r0+r3*1], m1
mova [r0+r3*2], m2
mova [r0+r1*1], m3
pshufd m0, m4, 0xff
pshufd m1, m4, 0xaa
pshufd m2, m4, 0x55
pshufd m3, m4, 0x00
mova [r2+r3*0], m0
mova [r2+r3*1], m1
mova [r2+r3*2], m2
mova [r2+r1*1], m3
RET
%endmacro
INIT_XMM sse2
%define PALIGNR PALIGNR_MMX
PRED8x8L_HORIZONTAL
INIT_XMM ssse3
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_HORIZONTAL
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_HORIZONTAL
%endif
;-----------------------------------------------------------------------------
; void pred8x8l_down_left(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_DOWN_LEFT 0
cglobal pred8x8l_down_left_10, 4, 4, 7
sub r0, r3
mova m3, [r0]
shr r1d, 14
neg r1
shr r2d, 13
pslldq m1, m3, 2
psrldq m2, m3, 2
pinsrw m1, [r0+r1], 0
pinsrw m2, [r0+r2+14], 7
PRED4x4_LOWPASS m6, m2, m1, m3
jz .fix_tr ; flags from shr r2d
mova m1, [r0+16]
psrldq m5, m1, 2
PALIGNR m2, m1, m3, 14, m3
pshufhw m5, m5, 10100100b
PRED4x4_LOWPASS m1, m2, m5, m1
.do_topright:
lea r1, [r3*3]
psrldq m5, m1, 14
lea r2, [r0+r3*4]
PALIGNR m2, m1, m6, 2, m0
PALIGNR m3, m1, m6, 14, m0
PALIGNR m5, m1, 2, m0
pslldq m4, m6, 2
PRED4x4_LOWPASS m6, m4, m2, m6
PRED4x4_LOWPASS m1, m3, m5, m1
mova [r2+r3*4], m1
PALIGNR m1, m6, 14, m2
pslldq m6, 2
mova [r2+r1*1], m1
PALIGNR m1, m6, 14, m2
pslldq m6, 2
mova [r2+r3*2], m1
PALIGNR m1, m6, 14, m2
pslldq m6, 2
mova [r2+r3*1], m1
PALIGNR m1, m6, 14, m2
pslldq m6, 2
mova [r0+r3*4], m1
PALIGNR m1, m6, 14, m2
pslldq m6, 2
mova [r0+r1*1], m1
PALIGNR m1, m6, 14, m2
pslldq m6, 2
mova [r0+r3*2], m1
PALIGNR m1, m6, 14, m6
mova [r0+r3*1], m1
RET
.fix_tr:
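; no top-right block available: broadcast the last top sample instead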
punpckhwd m3, m3
pshufd m1, m3, 0xFF
jmp .do_topright
%endmacro
INIT_XMM sse2
%define PALIGNR PALIGNR_MMX
PRED8x8L_DOWN_LEFT
INIT_XMM ssse3
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_DOWN_LEFT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_DOWN_LEFT
%endif
;-----------------------------------------------------------------------------
; void pred8x8l_down_right(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_DOWN_RIGHT 0
; the standard forbids this mode when has_topleft is false,
; so the flag never needs to be checked here
cglobal pred8x8l_down_right_10, 4, 5, 8
sub r0, r3
lea r4, [r0+r3*4]
lea r1, [r3*3]
mova m0, [r0+r3*1-16]
punpckhwd m0, [r0+r3*0-16]
mova m1, [r0+r1*1-16]
punpckhwd m1, [r0+r3*2-16]
punpckhdq m1, m0
mova m2, [r4+r3*1-16]
punpckhwd m2, [r4+r3*0-16]
mova m3, [r4+r1*1-16]
punpckhwd m3, [r4+r3*2-16]
punpckhdq m3, m2
punpckhqdq m3, m1
mova m0, [r4+r3*4-16]
mova m1, [r0]
PALIGNR m4, m3, m0, 14, m0
PALIGNR m1, m3, 2, m2
pslldq m0, m4, 2
pshuflw m0, m0, 11100101b
PRED4x4_LOWPASS m6, m1, m4, m3
PRED4x4_LOWPASS m4, m3, m0, m4
mova m3, [r0]
shr r2d, 13
pslldq m1, m3, 2
psrldq m2, m3, 2
pinsrw m1, [r0-2], 0
pinsrw m2, [r0+r2+14], 7
PRED4x4_LOWPASS m3, m2, m1, m3
PALIGNR m2, m3, m6, 2, m0
PALIGNR m5, m3, m6, 14, m0
psrldq m7, m3, 2
PRED4x4_LOWPASS m6, m4, m2, m6
PRED4x4_LOWPASS m3, m5, m7, m3
mova [r4+r3*4], m6
PALIGNR m3, m6, 14, m2
pslldq m6, 2
mova [r0+r3*1], m3
PALIGNR m3, m6, 14, m2
pslldq m6, 2
mova [r0+r3*2], m3
PALIGNR m3, m6, 14, m2
pslldq m6, 2
mova [r0+r1*1], m3
PALIGNR m3, m6, 14, m2
pslldq m6, 2
mova [r0+r3*4], m3
PALIGNR m3, m6, 14, m2
pslldq m6, 2
mova [r4+r3*1], m3
PALIGNR m3, m6, 14, m2
pslldq m6, 2
mova [r4+r3*2], m3
PALIGNR m3, m6, 14, m6
mova [r4+r1*1], m3
RET
%endmacro
INIT_XMM sse2
%define PALIGNR PALIGNR_MMX
PRED8x8L_DOWN_RIGHT
INIT_XMM ssse3
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_DOWN_RIGHT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_DOWN_RIGHT
%endif
;-----------------------------------------------------------------------------
; void pred8x8l_vertical_right(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_VERTICAL_RIGHT 0
; has_topleft is likewise guaranteed, as in pred8x8l_down_right
cglobal pred8x8l_vertical_right_10, 4, 5, 7
sub r0, r3
lea r4, [r0+r3*4]
lea r1, [r3*3]
mova m0, [r0+r3*1-16]
punpckhwd m0, [r0+r3*0-16]
mova m1, [r0+r1*1-16]
punpckhwd m1, [r0+r3*2-16]
punpckhdq m1, m0
mova m2, [r4+r3*1-16]
punpckhwd m2, [r4+r3*0-16]
mova m3, [r4+r1*1-16]
punpckhwd m3, [r4+r3*2-16]
punpckhdq m3, m2
punpckhqdq m3, m1
mova m0, [r4+r3*4-16]
mova m1, [r0]
PALIGNR m4, m3, m0, 14, m0
PALIGNR m1, m3, 2, m2
PRED4x4_LOWPASS m3, m1, m4, m3
mova m2, [r0]
shr r2d, 13
pslldq m1, m2, 2
psrldq m5, m2, 2
pinsrw m1, [r0-2], 0
pinsrw m5, [r0+r2+14], 7
PRED4x4_LOWPASS m2, m5, m1, m2
PALIGNR m6, m2, m3, 12, m1
PALIGNR m5, m2, m3, 14, m0
PRED4x4_LOWPASS m0, m6, m2, m5
pavgw m2, m5
mova [r0+r3*2], m0
mova [r0+r3*1], m2
pslldq m6, m3, 4
pslldq m1, m3, 2
PRED4x4_LOWPASS m1, m3, m6, m1
PALIGNR m2, m1, 14, m4
mova [r0+r1*1], m2
pslldq m1, 2
PALIGNR m0, m1, 14, m3
mova [r0+r3*4], m0
pslldq m1, 2
PALIGNR m2, m1, 14, m4
mova [r4+r3*1], m2
pslldq m1, 2
PALIGNR m0, m1, 14, m3
mova [r4+r3*2], m0
pslldq m1, 2
PALIGNR m2, m1, 14, m4
mova [r4+r1*1], m2
pslldq m1, 2
PALIGNR m0, m1, 14, m1
mova [r4+r3*4], m0
RET
%endmacro
INIT_XMM sse2
%define PALIGNR PALIGNR_MMX
PRED8x8L_VERTICAL_RIGHT
INIT_XMM ssse3
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_VERTICAL_RIGHT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_VERTICAL_RIGHT
%endif
;-----------------------------------------------------------------------------
; void pred8x8l_horizontal_up(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_HORIZONTAL_UP 0
cglobal pred8x8l_horizontal_up_10, 4, 4, 6
mova m0, [r0+r3*0-16]
punpckhwd m0, [r0+r3*1-16]
shr r1d, 14
dec r1
and r1, r3
sub r1, r3
mova m4, [r0+r1*1-16]
lea r1, [r3*3]
lea r2, [r0+r3*4]
mova m1, [r0+r3*2-16]
punpckhwd m1, [r0+r1*1-16]
punpckhdq m0, m1
mova m2, [r2+r3*0-16]
punpckhwd m2, [r2+r3*1-16]
mova m3, [r2+r3*2-16]
punpckhwd m3, [r2+r1*1-16]
punpckhdq m2, m3
punpckhqdq m0, m2
PALIGNR m1, m0, m4, 14, m4
psrldq m2, m0, 2
pshufhw m2, m2, 10100100b
PRED4x4_LOWPASS m0, m1, m2, m0
psrldq m1, m0, 2
psrldq m2, m0, 4
pshufhw m1, m1, 10100100b
pshufhw m2, m2, 01010100b
pavgw m4, m0, m1
PRED4x4_LOWPASS m1, m2, m0, m1
punpckhwd m5, m4, m1
punpcklwd m4, m1
mova [r2+r3*0], m5
mova [r0+r3*0], m4
pshufd m0, m5, 11111001b
pshufd m1, m5, 11111110b
pshufd m2, m5, 11111111b
mova [r2+r3*1], m0
mova [r2+r3*2], m1
mova [r2+r1*1], m2
PALIGNR m2, m5, m4, 4, m0
PALIGNR m3, m5, m4, 8, m1
PALIGNR m5, m5, m4, 12, m4
mova [r0+r3*1], m2
mova [r0+r3*2], m3
mova [r0+r1*1], m5
RET
%endmacro
INIT_XMM sse2
%define PALIGNR PALIGNR_MMX
PRED8x8L_HORIZONTAL_UP
INIT_XMM ssse3
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_HORIZONTAL_UP
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_HORIZONTAL_UP
%endif
;-----------------------------------------------------------------------------
; void pred16x16_vertical(pixel *src, int stride)
;-----------------------------------------------------------------------------
%macro MOV16 3-5
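; writes one row of 16 pixels (32 bytes), using however many stores the vector width needs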
mova [%1+ 0], %2
mova [%1+mmsize], %3
%if mmsize==8
mova [%1+ 16], %4
mova [%1+ 24], %5
%endif
%endmacro
%macro PRED16x16_VERTICAL 0
cglobal pred16x16_vertical_10, 2, 3
sub r0, r1
mov r2d, 8
mova m0, [r0+ 0]
mova m1, [r0+mmsize]
%if mmsize==8
mova m2, [r0+16]
mova m3, [r0+24]
%endif
.loop:
MOV16 r0+r1*1, m0, m1, m2, m3
MOV16 r0+r1*2, m0, m1, m2, m3
lea r0, [r0+r1*2]
dec r2d
jg .loop
REP_RET
%endmacro
INIT_MMX mmx2
PRED16x16_VERTICAL
INIT_XMM sse2
PRED16x16_VERTICAL
;-----------------------------------------------------------------------------
; void pred16x16_horizontal(pixel *src, int stride)
;-----------------------------------------------------------------------------
%macro PRED16x16_HORIZONTAL 0
cglobal pred16x16_horizontal_10, 2, 3
mov r2d, 8
.vloop:
movd m0, [r0+r1*0-4]
movd m1, [r0+r1*1-4]
SPLATW m0, m0, 1
SPLATW m1, m1, 1
MOV16 r0+r1*0, m0, m0, m0, m0
MOV16 r0+r1*1, m1, m1, m1, m1
lea r0, [r0+r1*2]
dec r2d
jg .vloop
REP_RET
%endmacro
INIT_MMX mmx2
PRED16x16_HORIZONTAL
INIT_XMM sse2
PRED16x16_HORIZONTAL
;-----------------------------------------------------------------------------
; void pred16x16_dc(pixel *src, int stride)
;-----------------------------------------------------------------------------
%macro PRED16x16_DC 0
cglobal pred16x16_dc_10, 2, 6
mov r5, r0
sub r0, r1
mova m0, [r0+0]
paddw m0, [r0+mmsize]
%if mmsize==8
paddw m0, [r0+16]
paddw m0, [r0+24]
%endif
HADDW m0, m2
lea r0, [r0+r1-2]
movzx r3d, word [r0]
movzx r4d, word [r0+r1]
%rep 7
lea r0, [r0+r1*2]
movzx r2d, word [r0]
add r3d, r2d
movzx r2d, word [r0+r1]
add r4d, r2d
%endrep
lea r3d, [r3+r4+16]
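; dc = (top sum + left sum + 16) >> 5, then broadcast to every pixel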
movd m1, r3d
paddw m0, m1
psrlw m0, 5
SPLATW m0, m0
mov r3d, 8
.loop:
MOV16 r5+r1*0, m0, m0, m0, m0
MOV16 r5+r1*1, m0, m0, m0, m0
lea r5, [r5+r1*2]
dec r3d
jg .loop
REP_RET
%endmacro
INIT_MMX mmx2
PRED16x16_DC
INIT_XMM sse2
PRED16x16_DC
;-----------------------------------------------------------------------------
; void pred16x16_top_dc(pixel *src, int stride)
;-----------------------------------------------------------------------------
%macro PRED16x16_TOP_DC 0
cglobal pred16x16_top_dc_10, 2, 3
sub r0, r1
mova m0, [r0+0]
paddw m0, [r0+mmsize]
%if mmsize==8
paddw m0, [r0+16]
paddw m0, [r0+24]
%endif
HADDW m0, m2
SPLATW m0, m0
paddw m0, [pw_8]
psrlw m0, 4
mov r2d, 8
.loop:
MOV16 r0+r1*1, m0, m0, m0, m0
MOV16 r0+r1*2, m0, m0, m0, m0
lea r0, [r0+r1*2]
dec r2d
jg .loop
REP_RET
%endmacro
INIT_MMX mmx2
PRED16x16_TOP_DC
INIT_XMM sse2
PRED16x16_TOP_DC
;-----------------------------------------------------------------------------
; void pred16x16_left_dc(pixel *src, int stride)
;-----------------------------------------------------------------------------
%macro PRED16x16_LEFT_DC 0
cglobal pred16x16_left_dc_10, 2, 6
mov r5, r0
sub r0, 2
movzx r3d, word [r0]
movzx r4d, word [r0+r1]
%rep 7
lea r0, [r0+r1*2]
movzx r2d, word [r0]
add r3d, r2d
movzx r2d, word [r0+r1]
add r4d, r2d
%endrep
lea r3d, [r3+r4+8]
shr r3d, 4
movd m0, r3d
SPLATW m0, m0
mov r3d, 8
.loop:
MOV16 r5+r1*0, m0, m0, m0, m0
MOV16 r5+r1*1, m0, m0, m0, m0
lea r5, [r5+r1*2]
dec r3d
jg .loop
REP_RET
%endmacro
INIT_MMX mmx2
PRED16x16_LEFT_DC
INIT_XMM sse2
PRED16x16_LEFT_DC
;-----------------------------------------------------------------------------
; void pred16x16_128_dc(pixel *src, int stride)
;-----------------------------------------------------------------------------
%macro PRED16x16_128_DC 0
cglobal pred16x16_128_dc_10, 2, 3
mova m0, [pw_512]
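; 512 = 1 << (BIT_DEPTH-1), the mid-grey level for 10-bit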
mov r2d, 8
.loop:
MOV16 r0+r1*0, m0, m0, m0, m0
MOV16 r0+r1*1, m0, m0, m0, m0
lea r0, [r0+r1*2]
dec r2d
jg .loop
REP_RET
%endmacro
INIT_MMX mmx2
PRED16x16_128_DC
INIT_XMM sse2
PRED16x16_128_DC