;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 deblocking code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Oskar Arvidsson <oskar@irock.se>
;*          Loren Merritt <lorenm@u.washington.edu>
;*          Jason Garrett-Glaser <darkshikari@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pw_pixel_max: times 8 dw ((1 << 10)-1)

SECTION .text

cextern pw_2
cextern pw_3
cextern pw_4

; out: %4 = |%1-%2|-%3
; clobbers: %5
%macro ABS_SUB 5
    psubusw %5, %2, %1
    psubusw %4, %1, %2
    por %4, %5
    psubw %4, %3
%endmacro

; out: %4 = |%1-%2|<%3
%macro DIFF_LT 5
    psubusw %4, %2, %1
    psubusw %5, %1, %2
    por %5, %4 ; |%1-%2|
    pxor %4, %4
    psubw %5, %3 ; |%1-%2|-%3
    pcmpgtw %4, %5 ; 0 > |%1-%2|-%3
%endmacro

%macro LOAD_AB 4
    movd %1, %3
    movd %2, %4
    SPLATW %1, %1
    SPLATW %2, %2
%endmacro

; in: %2=tc reg
; out: %1=splatted tc
%macro LOAD_TC 2
    movd %1, [%2]
    punpcklbw %1, %1
%if mmsize == 8
    pshufw %1, %1, 0
%else
    pshuflw %1, %1, 01010000b
    pshufd %1, %1, 01010000b
%endif
    psraw %1, 6
%endmacro
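
; Splatting each tc0 byte with punpcklbw against itself gives the word
; (b<<8)|b, so the arithmetic shift right by 6 yields 4*b for the small
; non-negative table values while keeping -1 bytes at -1, i.e. it scales
; tc0 to the 10-bit range and preserves the "skip" marker that the
; pcmpgtw/pandn handling of negative tc below relies on.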

; in: %1=p1, %2=p0, %3=q0, %4=q1
;     %5=alpha, %6=beta, %7-%9=tmp
; out: %7=mask
%macro LOAD_MASK 9
    ABS_SUB %2, %3, %5, %8, %7 ; |p0-q0| - alpha
    ABS_SUB %1, %2, %6, %9, %7 ; |p1-p0| - beta
    pand %8, %9
    ABS_SUB %3, %4, %6, %9, %7 ; |q1-q0| - beta
    pxor %7, %7
    pand %8, %9
    pcmpgtw %7, %8
%endmacro
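
; Roughly, in C-like terms, the mask produced above is
;     mask = (|p0-q0| < alpha && |p1-p0| < beta && |q1-q0| < beta) ? -1 : 0
; ANDing the signed (|x-y| - threshold) words keeps the sign bit set only
; where every condition holds, so a single pcmpgtw against zero yields the
; combined per-pixel filter mask.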

; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
; out: %1=p0', %2=q0'
%macro DEBLOCK_P0_Q0 7
    psubw %3, %4
    pxor %7, %7
    paddw %3, [pw_4]
    psubw %7, %5
    psubw %6, %2, %1
    psllw %6, 2
    paddw %3, %6
    psraw %3, 3
    mova %6, [pw_pixel_max]
    CLIPW %3, %7, %5
    pxor %7, %7
    paddw %1, %3
    psubw %2, %3
    CLIPW %1, %7, %6
    CLIPW %2, %7, %6
%endmacro
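
; Sketch of the update computed above, using the names from the comment:
;     delta = clip3(-tc, tc, ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3)
;     p0'   = clip(p0 + delta, 0, pixel_max)
;     q0'   = clip(q0 - delta, 0, pixel_max)
; %5 holds tc already ANDed with the filter mask, so masked-out positions
; end up with delta clipped to zero.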

; in: %1=x2, %2=x1, %3=p0, %4=q0 %5=mask&tc, %6=tmp
%macro LUMA_Q1 6
    pavgw %6, %3, %4 ; (p0+q0+1)>>1
    paddw %1, %6
    pxor %6, %6
    psraw %1, 1
    psubw %6, %5
    psubw %1, %2
    CLIPW %1, %6, %5
    paddw %1, %2
%endmacro
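
; Sketch: with x2 = p2 (or q2) and x1 = p1 (or q1), this computes
;     x1' = x1 + clip3(-tc, tc, ((x2 + ((p0 + q0 + 1) >> 1)) >> 1) - x1)
; where %5 again carries tc pre-masked by the caller.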

%macro LUMA_DEBLOCK_ONE 3
    DIFF_LT m5, %1, bm, m4, m6
    pxor m6, m6
    mova %3, m4
    pcmpgtw m6, tcm
    pand m4, tcm
    pandn m6, m7
    pand m4, m6
    LUMA_Q1 m5, %2, m1, m2, m4, m6
%endmacro

%macro LUMA_H_STORE 2
%if mmsize == 8
    movq [r0-4], m0
    movq [r0+r1-4], m1
    movq [r0+r1*2-4], m2
    movq [r0+%2-4], m3
%else
    movq [r0-4], m0
    movhps [r0+r1-4], m0
    movq [r0+r1*2-4], m1
    movhps [%1-4], m1
    movq [%1+r1-4], m2
    movhps [%1+r1*2-4], m2
    movq [%1+%2-4], m3
    movhps [%1+r1*4-4], m3
%endif
%endmacro
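
; For the horizontal (vertical-edge) functions the block is filtered in
; transposed form; LUMA_H_STORE writes back only the four filtered pixels
; p1 p0 q0 q1 of each row, centred on the edge (two pixels either side of
; the line pointer), leaving the outer pixels untouched.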

%macro DEBLOCK_LUMA 0
;-----------------------------------------------------------------------------
; void deblock_v_luma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_v_luma_10, 5,5,8*(mmsize/16)
    %assign pad 5*mmsize+12-(stack_offset&15)
    %define tcm [rsp]
    %define ms1 [rsp+mmsize]
    %define ms2 [rsp+mmsize*2]
    %define am  [rsp+mmsize*3]
    %define bm  [rsp+mmsize*4]
    SUB rsp, pad
    shl r2d, 2
    shl r3d, 2
    LOAD_AB m4, m5, r2d, r3d
    mov r3, 32/mmsize
    mov r2, r0
    sub r0, r1
    mova am, m4
    sub r0, r1
    mova bm, m5
    sub r0, r1
.loop:
    mova m0, [r0+r1]
    mova m1, [r0+r1*2]
    mova m2, [r2]
    mova m3, [r2+r1]
    LOAD_MASK m0, m1, m2, m3, am, bm, m7, m4, m6
    LOAD_TC m6, r4
    mova tcm, m6
    mova m5, [r0]
    LUMA_DEBLOCK_ONE m1, m0, ms1
    mova [r0+r1], m5
    mova m5, [r2+r1*2]
    LUMA_DEBLOCK_ONE m2, m3, ms2
    mova [r2+r1], m5
    pxor m5, m5
    mova m6, tcm
    pcmpgtw m5, tcm
    psubw m6, ms1
    pandn m5, m7
    psubw m6, ms2
    pand m5, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
    mova [r0+r1*2], m1
    mova [r2], m2
    add r0, mmsize
    add r2, mmsize
    add r4, mmsize/8
    dec r3
    jg .loop
    ADD rsp, pad
    RET

cglobal deblock_h_luma_10, 5,6,8*(mmsize/16)
    %assign pad 7*mmsize+12-(stack_offset&15)
    %define tcm [rsp]
    %define ms1 [rsp+mmsize]
    %define ms2 [rsp+mmsize*2]
    %define p1m [rsp+mmsize*3]
    %define p2m [rsp+mmsize*4]
    %define am  [rsp+mmsize*5]
    %define bm  [rsp+mmsize*6]
    SUB rsp, pad
    shl r2d, 2
    shl r3d, 2
    LOAD_AB m4, m5, r2d, r3d
    mov r3, r1
    mova am, m4
    add r3, r1
    mov r5, 32/mmsize
    mova bm, m5
    add r3, r1
%if mmsize == 16
    mov r2, r0
    add r2, r3
%endif
.loop:
%if mmsize == 8
    movq m2, [r0-8] ; y q2 q1 q0
    movq m7, [r0+0]
    movq m5, [r0+r1-8]
    movq m3, [r0+r1+0]
    movq m0, [r0+r1*2-8]
    movq m6, [r0+r1*2+0]
    movq m1, [r0+r3-8]
    TRANSPOSE4x4W 2, 5, 0, 1, 4
    SWAP 2, 7
    movq m7, [r0+r3]
    TRANSPOSE4x4W 2, 3, 6, 7, 4
%else
    movu m5, [r0-8] ; y q2 q1 q0 p0 p1 p2 x
    movu m0, [r0+r1-8]
    movu m2, [r0+r1*2-8]
    movu m3, [r2-8]
    TRANSPOSE4x4W 5, 0, 2, 3, 6
    mova tcm, m3
    movu m4, [r2+r1-8]
    movu m1, [r2+r1*2-8]
    movu m3, [r2+r3-8]
    movu m7, [r2+r1*4-8]
    TRANSPOSE4x4W 4, 1, 3, 7, 6
    mova m6, tcm
    punpcklqdq m6, m7
    punpckhqdq m5, m4
    SBUTTERFLY qdq, 0, 1, 7
    SBUTTERFLY qdq, 2, 3, 7
%endif
    mova p2m, m6
    LOAD_MASK m0, m1, m2, m3, am, bm, m7, m4, m6
    LOAD_TC m6, r4
    mova tcm, m6
    LUMA_DEBLOCK_ONE m1, m0, ms1
    mova p1m, m5
    mova m5, p2m
    LUMA_DEBLOCK_ONE m2, m3, ms2
    mova p2m, m5
    pxor m5, m5
    mova m6, tcm
    pcmpgtw m5, tcm
    psubw m6, ms1
    pandn m5, m7
    psubw m6, ms2
    pand m5, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
    mova m0, p1m
    mova m3, p2m
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    LUMA_H_STORE r2, r3
    add r4, mmsize/8
    lea r0, [r0+r1*(mmsize/2)]
    lea r2, [r2+r1*(mmsize/2)]
    dec r5
    jg .loop
    ADD rsp, pad
    RET
%endmacro
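
; DEBLOCK_LUMA expands to the deblock_v_luma_10/deblock_h_luma_10 pair for
; the current instruction set. Each loop iteration handles mmsize bytes
; (mmsize/2 pixels) of the 16-pixel edge, and r4 advances by mmsize/8 so
; that one tc0 byte covers a four-pixel segment of the edge.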

%if ARCH_X86_64
; in: m0=p1, m1=p0, m2=q0, m3=q1, m8=p2, m9=q2
;     m12=alpha, m13=beta
; out: m0=p1', m3=q1', m1=p0', m2=q0'
; clobbers: m4, m5, m6, m7, m10, m11, m14
%macro DEBLOCK_LUMA_INTER_SSE2 0
    LOAD_MASK m0, m1, m2, m3, m12, m13, m7, m4, m6
    LOAD_TC m6, r4
    DIFF_LT m8, m1, m13, m10, m4
    DIFF_LT m9, m2, m13, m11, m4
    pand m6, m7
    mova m14, m6
    pxor m4, m4
    pcmpgtw m6, m4
    pand m6, m14
    mova m5, m10
    pand m5, m6
    LUMA_Q1 m8, m0, m1, m2, m5, m4
    mova m5, m11
    pand m5, m6
    LUMA_Q1 m9, m3, m1, m2, m5, m4
    pxor m4, m4
    psubw m6, m10
    pcmpgtw m4, m14
    pandn m4, m7
    psubw m6, m11
    pand m4, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m4, m5, m6
    SWAP 0, 8
    SWAP 3, 9
%endmacro

%macro DEBLOCK_LUMA_64 0
cglobal deblock_v_luma_10, 5,5,15
    %define p2 m8
    %define p1 m0
    %define p0 m1
    %define q0 m2
    %define q1 m3
    %define q2 m9
    %define mask0 m7
    %define mask1 m10
    %define mask2 m11
    shl r2d, 2
    shl r3d, 2
    LOAD_AB m12, m13, r2d, r3d
    mov r2, r0
    sub r0, r1
    sub r0, r1
    sub r0, r1
    mov r3, 2
.loop:
    mova p2, [r0]
    mova p1, [r0+r1]
    mova p0, [r0+r1*2]
    mova q0, [r2]
    mova q1, [r2+r1]
    mova q2, [r2+r1*2]
    DEBLOCK_LUMA_INTER_SSE2
    mova [r0+r1], p1
    mova [r0+r1*2], p0
    mova [r2], q0
    mova [r2+r1], q1
    add r0, mmsize
    add r2, mmsize
    add r4, 2
    dec r3
    jg .loop
    REP_RET

cglobal deblock_h_luma_10, 5,7,15
    shl r2d, 2
    shl r3d, 2
    LOAD_AB m12, m13, r2d, r3d
    mov r2, r1
    add r2, r1
    add r2, r1
    mov r5, r0
    add r5, r2
    mov r6, 2
.loop:
    movu m8, [r0-8] ; y q2 q1 q0 p0 p1 p2 x
    movu m0, [r0+r1-8]
    movu m2, [r0+r1*2-8]
    movu m9, [r5-8]
    movu m5, [r5+r1-8]
    movu m1, [r5+r1*2-8]
    movu m3, [r5+r2-8]
    movu m7, [r5+r1*4-8]
    TRANSPOSE4x4W 8, 0, 2, 9, 10
    TRANSPOSE4x4W 5, 1, 3, 7, 10
    punpckhqdq m8, m5
    SBUTTERFLY qdq, 0, 1, 10
    SBUTTERFLY qdq, 2, 3, 10
    punpcklqdq m9, m7
    DEBLOCK_LUMA_INTER_SSE2
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    LUMA_H_STORE r5, r2
    add r4, 2
    lea r0, [r0+r1*8]
    lea r5, [r5+r1*8]
    dec r6
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
DEBLOCK_LUMA_64
INIT_XMM avx
DEBLOCK_LUMA_64
%endif

%macro SWAPMOVA 2
%ifid %1
    SWAP %1, %2
%else
    mova %1, %2
%endif
%endmacro

; in: t0-t2: tmp registers
;     %1=p0 %2=p1 %3=p2 %4=p3 %5=q0 %6=q1 %7=mask0
;     %8=mask1p %9=2 %10=p0' %11=p1' %12=p2'
%macro LUMA_INTRA_P012 12 ; p0..p3 in memory
%if ARCH_X86_64
    paddw t0, %3, %2
    mova t2, %4
    paddw t2, %3
%else
    mova t0, %3
    mova t2, %4
    paddw t0, %2
    paddw t2, %3
%endif
    paddw t0, %1
    paddw t2, t2
    paddw t0, %5
    paddw t2, %9
    paddw t0, %9 ; (p2 + p1 + p0 + q0 + 2)
    paddw t2, t0 ; (2*p3 + 3*p2 + p1 + p0 + q0 + 4)
    psrlw t2, 3
    psrlw t1, t0, 2
    psubw t2, %3
    psubw t1, %2
    pand t2, %8
    pand t1, %8
    paddw t2, %3
    paddw t1, %2
    SWAPMOVA %11, t1
    psubw t1, t0, %3
    paddw t0, t0
    psubw t1, %5
    psubw t0, %3
    paddw t1, %6
    paddw t1, %2
    paddw t0, %6
    psrlw t1, 2 ; (2*p1 + p0 + q1 + 2)/4
    psrlw t0, 3 ; (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3
    pxor t0, t1
    pxor t1, %1
    pand t0, %8
    pand t1, %7
    pxor t0, t1
    pxor t0, %1
    SWAPMOVA %10, t0
    SWAPMOVA %12, t2
%endmacro
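
; Sketch of the strong (intra) filtering done above for the p side (the q
; side is obtained by calling the macro with the roles mirrored):
;     where mask1p: p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
;                   p1' = (p2 + p1 + p0 + q0 + 2) >> 2
;                   p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
;     else, where mask0 only: p0' = (2*p1 + p0 + q1 + 2) >> 2
; The pxor/pand/pxor sequence at the end selects between the two p0
; candidates (or the unfiltered value) without branching.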

%macro LUMA_INTRA_INIT 1
    %xdefine pad %1*mmsize+((gprsize*3) % mmsize)-(stack_offset&15)
    %define t0 m4
    %define t1 m5
    %define t2 m6
    %define t3 m7
    %assign i 4
%rep %1
    CAT_XDEFINE t, i, [rsp+mmsize*(i-4)]
    %assign i i+1
%endrep
    SUB rsp, pad
%endmacro

; in: %1-%3=tmp, %4=p2, %5=q2
%macro LUMA_INTRA_INTER 5
    LOAD_AB t0, t1, r2d, r3d
    mova %1, t0
    LOAD_MASK m0, m1, m2, m3, %1, t1, t0, t2, t3
%if ARCH_X86_64
    mova %2, t0 ; mask0
    psrlw t3, %1, 2
%else
    mova t3, %1
    mova %2, t0 ; mask0
    psrlw t3, 2
%endif
    paddw t3, [pw_2] ; alpha/4+2
    DIFF_LT m1, m2, t3, t2, t0 ; t2 = |p0-q0| < alpha/4+2
    pand t2, %2
    mova t3, %5 ; q2
    mova %1, t2 ; mask1
    DIFF_LT t3, m2, t1, t2, t0 ; t2 = |q2-q0| < beta
    pand t2, %1
    mova t3, %4 ; p2
    mova %3, t2 ; mask1q
    DIFF_LT t3, m1, t1, t2, t0 ; t2 = |p2-p0| < beta
    pand t2, %1
    mova %1, t2 ; mask1p
%endmacro
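
; After this macro, %2 (mask0) holds the basic filter condition from
; LOAD_MASK, while %1 (mask1p) and %3 (mask1q) additionally require
; |p0-q0| < alpha/4+2 together with |p2-p0| < beta resp. |q2-q0| < beta,
; i.e. the conditions under which LUMA_INTRA_P012 may apply the strong
; filter.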

%macro LUMA_H_INTRA_LOAD 0
%if mmsize == 8
    movu t0, [r0-8]
    movu t1, [r0+r1-8]
    movu m0, [r0+r1*2-8]
    movu m1, [r0+r4-8]
    TRANSPOSE4x4W 4, 5, 0, 1, 2
    mova t4, t0 ; p3
    mova t5, t1 ; p2
    movu m2, [r0]
    movu m3, [r0+r1]
    movu t0, [r0+r1*2]
    movu t1, [r0+r4]
    TRANSPOSE4x4W 2, 3, 4, 5, 6
    mova t6, t0 ; q2
    mova t7, t1 ; q3
%else
    movu t0, [r0-8]
    movu t1, [r0+r1-8]
    movu m0, [r0+r1*2-8]
    movu m1, [r0+r5-8]
    movu m2, [r4-8]
    movu m3, [r4+r1-8]
    movu t2, [r4+r1*2-8]
    movu t3, [r4+r5-8]
    TRANSPOSE8x8W 4, 5, 0, 1, 2, 3, 6, 7, t4, t5
    mova t4, t0 ; p3
    mova t5, t1 ; p2
    mova t6, t2 ; q2
    mova t7, t3 ; q3
%endif
%endmacro

; in: %1=q3 %2=q2' %3=q1' %4=q0' %5=p0' %6=p1' %7=p2' %8=p3 %9=tmp
%macro LUMA_H_INTRA_STORE 9
%if mmsize == 8
    TRANSPOSE4x4W %1, %2, %3, %4, %9
    movq [r0-8], m%1
    movq [r0+r1-8], m%2
    movq [r0+r1*2-8], m%3
    movq [r0+r4-8], m%4
    movq m%1, %8
    TRANSPOSE4x4W %5, %6, %7, %1, %9
    movq [r0], m%5
    movq [r0+r1], m%6
    movq [r0+r1*2], m%7
    movq [r0+r4], m%1
%else
    TRANSPOSE2x4x4W %1, %2, %3, %4, %9
    movq [r0-8], m%1
    movq [r0+r1-8], m%2
    movq [r0+r1*2-8], m%3
    movq [r0+r5-8], m%4
    movhps [r4-8], m%1
    movhps [r4+r1-8], m%2
    movhps [r4+r1*2-8], m%3
    movhps [r4+r5-8], m%4
%ifnum %8
    SWAP %1, %8
%else
    mova m%1, %8
%endif
    TRANSPOSE2x4x4W %5, %6, %7, %1, %9
    movq [r0], m%5
    movq [r0+r1], m%6
    movq [r0+r1*2], m%7
    movq [r0+r5], m%1
    movhps [r4], m%5
    movhps [r4+r1], m%6
    movhps [r4+r1*2], m%7
    movhps [r4+r5], m%1
%endif
%endmacro

%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
%macro DEBLOCK_LUMA_INTRA_64 0
cglobal deblock_v_luma_intra_10, 4,7,16
    %define t0 m1
    %define t1 m2
    %define t2 m4
    %define p2 m8
    %define p1 m9
    %define p0 m10
    %define q0 m11
    %define q1 m12
    %define q2 m13
    %define aa m5
    %define bb m14
    lea r4, [r1*4]
    lea r5, [r1*3] ; 3*stride
    neg r4
    add r4, r0     ; pix-4*stride
    mov r6, 2
    mova m0, [pw_2]
    shl r2d, 2
    shl r3d, 2
    LOAD_AB aa, bb, r2d, r3d
.loop:
    mova p2, [r4+r1]
    mova p1, [r4+2*r1]
    mova p0, [r4+r5]
    mova q0, [r0]
    mova q1, [r0+r1]
    mova q2, [r0+2*r1]
    LOAD_MASK p1, p0, q0, q1, aa, bb, m3, t0, t1
    mova t2, aa
    psrlw t2, 2
    paddw t2, m0 ; alpha/4+2
    DIFF_LT p0, q0, t2, m6, t0 ; m6 = |p0-q0| < alpha/4+2
    DIFF_LT p2, p0, bb, t1, t0 ; t1 = |p2-p0| < beta
    DIFF_LT q2, q0, bb, m7, t0 ; m7 = |q2-q0| < beta
    pand m6, m3
    pand m7, m6
    pand m6, t1
    LUMA_INTRA_P012 p0, p1, p2, [r4], q0, q1, m3, m6, m0, [r4+r5], [r4+2*r1], [r4+r1]
    LUMA_INTRA_P012 q0, q1, q2, [r0+r5], p0, p1, m3, m7, m0, [r0], [r0+r1], [r0+2*r1]
    add r0, mmsize
    add r4, mmsize
    dec r6
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_10, 4,7,16
    %define t0 m15
    %define t1 m14
    %define t2 m2
    %define q3 m5
    %define q2 m8
    %define q1 m9
    %define q0 m10
    %define p0 m11
    %define p1 m12
    %define p2 m13
    %define p3 m4
    %define spill [rsp]
    %assign pad 24-(stack_offset&15)
    SUB rsp, pad
    lea r4, [r1*4]
    lea r5, [r1*3] ; 3*stride
    add r4, r0     ; pix+4*stride
    mov r6, 2
    mova m0, [pw_2]
    shl r2d, 2
    shl r3d, 2
.loop:
    movu q3, [r0-8]
    movu q2, [r0+r1-8]
    movu q1, [r0+r1*2-8]
    movu q0, [r0+r5-8]
    movu p0, [r4-8]
    movu p1, [r4+r1-8]
    movu p2, [r4+r1*2-8]
    movu p3, [r4+r5-8]
    TRANSPOSE8x8W 5, 8, 9, 10, 11, 12, 13, 4, 1
    LOAD_AB m1, m2, r2d, r3d
    LOAD_MASK q1, q0, p0, p1, m1, m2, m3, t0, t1
    psrlw m1, 2
    paddw m1, m0 ; alpha/4+2
    DIFF_LT p0, q0, m1, m6, t0 ; m6 = |p0-q0| < alpha/4+2
    DIFF_LT q2, q0, m2, t1, t0 ; t1 = |q2-q0| < beta
    DIFF_LT p0, p2, m2, m7, t0 ; m7 = |p2-p0| < beta
    pand m6, m3
    pand m7, m6
    pand m6, t1
    mova spill, q3
    LUMA_INTRA_P012 q0, q1, q2, q3, p0, p1, m3, m6, m0, m5, m1, q2
    LUMA_INTRA_P012 p0, p1, p2, p3, q0, q1, m3, m7, m0, p0, m6, p2
    mova m7, spill
    LUMA_H_INTRA_STORE 7, 8, 1, 5, 11, 6, 13, 4, 14
    lea r0, [r0+r1*8]
    lea r4, [r4+r1*8]
    dec r6
    jg .loop
    ADD rsp, pad
    RET
%endmacro

INIT_XMM sse2
DEBLOCK_LUMA_INTRA_64
INIT_XMM avx
DEBLOCK_LUMA_INTRA_64
%endif

%macro DEBLOCK_LUMA_INTRA 0
;-----------------------------------------------------------------------------
; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_v_luma_intra_10, 4,7,8*(mmsize/16)
    LUMA_INTRA_INIT 3
    lea r4, [r1*4]
    lea r5, [r1*3]
    neg r4
    add r4, r0
    mov r6, 32/mmsize
    shl r2d, 2
    shl r3d, 2
.loop:
    mova m0, [r4+r1*2] ; p1
    mova m1, [r4+r5]   ; p0
    mova m2, [r0]      ; q0
    mova m3, [r0+r1]   ; q1
    LUMA_INTRA_INTER t4, t5, t6, [r4+r1], [r0+r1*2]
    LUMA_INTRA_P012 m1, m0, t3, [r4], m2, m3, t5, t4, [pw_2], [r4+r5], [r4+2*r1], [r4+r1]
    mova t3, [r0+r1*2] ; q2
    LUMA_INTRA_P012 m2, m3, t3, [r0+r5], m1, m0, t5, t6, [pw_2], [r0], [r0+r1], [r0+2*r1]
    add r0, mmsize
    add r4, mmsize
    dec r6
    jg .loop
    ADD rsp, pad
    RET

;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_10, 4,7,8*(mmsize/16)
    LUMA_INTRA_INIT 8
%if mmsize == 8
    lea r4, [r1*3]
    mov r5, 32/mmsize
%else
    lea r4, [r1*4]
    lea r5, [r1*3] ; 3*stride
    add r4, r0     ; pix+4*stride
    mov r6, 32/mmsize
%endif
    shl r2d, 2
    shl r3d, 2
.loop:
    LUMA_H_INTRA_LOAD
    LUMA_INTRA_INTER t8, t9, t10, t5, t6
    LUMA_INTRA_P012 m1, m0, t3, t4, m2, m3, t9, t8, [pw_2], t8, t5, t11
    mova t3, t6 ; q2
    LUMA_INTRA_P012 m2, m3, t3, t7, m1, m0, t9, t10, [pw_2], m4, t6, m5
    mova m2, t4
    mova m0, t11
    mova m1, t5
    mova m3, t8
    mova m6, t6
    LUMA_H_INTRA_STORE 2, 0, 1, 3, 4, 6, 5, t7, 7
    lea r0, [r0+r1*(mmsize/2)]
%if mmsize == 8
    dec r5
%else
    lea r4, [r4+r1*(mmsize/2)]
    dec r6
%endif
    jg .loop
    ADD rsp, pad
    RET
%endmacro

%if ARCH_X86_64 == 0
INIT_MMX mmxext
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
INIT_XMM sse2
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
INIT_XMM avx
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
%endif

; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
; out: %1=p0', %2=q0'
%macro CHROMA_DEBLOCK_P0_Q0_INTRA 7
    mova %6, [pw_2]
    paddw %6, %3
    paddw %6, %4
    paddw %7, %6, %2
    paddw %6, %1
    paddw %6, %3
    paddw %7, %4
    psraw %6, 2
    psraw %7, 2
    psubw %6, %1
    psubw %7, %2
    pand %6, %5
    pand %7, %5
    paddw %1, %6
    paddw %2, %7
%endmacro
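
; Sketch of the chroma intra filter above:
;     p0' = p0 + ((((2*p1 + p0 + q1 + 2) >> 2) - p0) & mask)
;     q0' = q0 + ((((2*q1 + q0 + p1 + 2) >> 2) - q0) & mask)
; i.e. the usual (2*p1 + p0 + q1 + 2) >> 2 average, applied only where the
; filter mask is set.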

%macro CHROMA_V_LOAD 1
    mova m0, [r0]    ; p1
    mova m1, [r0+r1] ; p0
    mova m2, [%1]    ; q0
    mova m3, [%1+r1] ; q1
%endmacro

%macro CHROMA_V_STORE 0
    mova [r0+1*r1], m1
    mova [r0+2*r1], m2
%endmacro

%macro CHROMA_V_LOAD_TC 2
    movd %1, [%2]
    punpcklbw %1, %1
    punpcklwd %1, %1
    psraw %1, 6
%endmacro

%macro DEBLOCK_CHROMA 0
;-----------------------------------------------------------------------------
; void deblock_v_chroma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_10, 5,7-(mmsize/16),8*(mmsize/16)
    mov r5, r0
    sub r0, r1
    sub r0, r1
    shl r2d, 2
    shl r3d, 2
%if mmsize < 16
    mov r6, 16/mmsize
.loop:
%endif
    CHROMA_V_LOAD r5
    LOAD_AB m4, m5, r2d, r3d
    LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
    pxor m4, m4
    CHROMA_V_LOAD_TC m6, r4
    psubw m6, [pw_3]
    pmaxsw m6, m4
    pand m7, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
    CHROMA_V_STORE
%if mmsize < 16
    add r0, mmsize
    add r5, mmsize
    add r4, mmsize/4
    dec r6
    jg .loop
    REP_RET
%else
    RET
%endif

;-----------------------------------------------------------------------------
; void deblock_v_chroma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_intra_10, 4,6-(mmsize/16),8*(mmsize/16)
    mov r4, r0
    sub r0, r1
    sub r0, r1
    shl r2d, 2
    shl r3d, 2
%if mmsize < 16
    mov r5, 16/mmsize
.loop:
%endif
    CHROMA_V_LOAD r4
    LOAD_AB m4, m5, r2d, r3d
    LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
    CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6
    CHROMA_V_STORE
%if mmsize < 16
    add r0, mmsize
    add r4, mmsize
    dec r5
    jg .loop
    REP_RET
%else
    RET
%endif
%endmacro

%if ARCH_X86_64 == 0
INIT_MMX mmxext
DEBLOCK_CHROMA
%endif
INIT_XMM sse2
DEBLOCK_CHROMA
INIT_XMM avx
DEBLOCK_CHROMA