;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 deblocking code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Oskar Arvidsson <oskar@irock.se>
;*          Loren Merritt <lorenm@u.washington.edu>
;*          Jason Garrett-Glaser <darkshikari@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA
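; clip limit for 10-bit samples: 2^10 - 1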
pw_pixel_max: times 8 dw ((1 << 10)-1)

SECTION .text

cextern pw_2
cextern pw_3
cextern pw_4

; out: %4 = |%1-%2|-%3
; clobbers: %5
%macro ABS_SUB 5
    psubusw %5, %2, %1
    psubusw %4, %1, %2
    por %4, %5
    psubw %4, %3
%endmacro

; out: %4 = |%1-%2|<%3
%macro DIFF_LT 5
    psubusw %4, %2, %1
    psubusw %5, %1, %2
    por %5, %4 ; |%1-%2|
    pxor %4, %4
    psubw %5, %3 ; |%1-%2|-%3
    pcmpgtw %4, %5 ; 0 > |%1-%2|-%3
%endmacro
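; in: %3=alpha, %4=beta (GPR or memory)
; out: %1=alpha splatted to all words, %2=beta splatted to all words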
%macro LOAD_AB 4
    movd %1, %3
    movd %2, %4
    SPLATW %1, %1
    SPLATW %2, %2
%endmacro

; in: %2=tc reg
; out: %1=splatted tc
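; (duplicating each tc0 byte into both halves of a word and shifting right by 6
;  scales non-negative tc0 by 4 for 10-bit while keeping tc0 = -1 negative)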
%macro LOAD_TC 2
    movd %1, [%2]
    punpcklbw %1, %1
%if mmsize == 8
    pshufw %1, %1, 0
%else
    pshuflw %1, %1, 01010000b
    pshufd %1, %1, 01010000b
%endif
    psraw %1, 6
%endmacro

; in: %1=p1, %2=p0, %3=q0, %4=q1
;     %5=alpha, %6=beta, %7-%9=tmp
; out: %7=mask
%macro LOAD_MASK 9
    ABS_SUB %2, %3, %5, %8, %7 ; |p0-q0| - alpha
    ABS_SUB %1, %2, %6, %9, %7 ; |p1-p0| - beta
    pand %8, %9
    ABS_SUB %3, %4, %6, %9, %7 ; |q1-q0| - beta
    pxor %7, %7
    pand %8, %9
    pcmpgtw %7, %8
%endmacro

; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
; out: %1=p0', %2=q0'
%macro DEBLOCK_P0_Q0 7
    psubw %3, %4
    pxor %7, %7
    paddw %3, [pw_4]
    psubw %7, %5
    psubw %6, %2, %1
    psllw %6, 2
    paddw %3, %6
    psraw %3, 3
    mova %6, [pw_pixel_max]
    CLIPW %3, %7, %5
    pxor %7, %7
    paddw %1, %3
    psubw %2, %3
    CLIPW %1, %7, %6
    CLIPW %2, %7, %6
%endmacro
; in: %1=x2, %2=x1, %3=p0, %4=q0, %5=mask&tc, %6=tmp
%macro LUMA_Q1 6
    pavgw %6, %3, %4 ; (p0+q0+1)>>1
    paddw %1, %6
    pxor %6, %6
    psraw %1, 1
    psubw %6, %5
    psubw %1, %2
    CLIPW %1, %6, %5
    paddw %1, %2
%endmacro
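; in: %1=p0 (or q0), %2=p1 (or q1), m5=p2 (or q2), m7=filter mask, tcm=tc, bm=beta
; out: m5 = filtered p1' (or q1'), %3 = (|p2-p0| < beta) mask
; clobbers: m4, m6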
%macro LUMA_DEBLOCK_ONE 3
    DIFF_LT m5, %1, bm, m4, m6
    pxor m6, m6
    mova %3, m4
    pcmpgtw m6, tcm
    pand m4, tcm
    pandn m6, m7
    pand m4, m6
    LUMA_Q1 m5, %2, m1, m2, m4, m6
%endmacro
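; in: m0-m3 = transposed p1 p0 q0 q1 rows; %1 = pix+3*stride (xmm path), %2 = 3*stride
; writes the filtered pixels back across the vertical edge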
%macro LUMA_H_STORE 2
%if mmsize == 8
    movq [r0-4], m0
    movq [r0+r1-4], m1
    movq [r0+r1*2-4], m2
    movq [r0+%2-4], m3
%else
    movq [r0-4], m0
    movhps [r0+r1-4], m0
    movq [r0+r1*2-4], m1
    movhps [%1-4], m1
    movq [%1+r1-4], m2
    movhps [%1+r1*2-4], m2
    movq [%1+%2-4], m3
    movhps [%1+r1*4-4], m3
%endif
%endmacro

%macro DEBLOCK_LUMA 1
;-----------------------------------------------------------------------------
; void deblock_v_luma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_v_luma_10_%1, 5,5,8*(mmsize/16)
%assign pad 5*mmsize+12-(stack_offset&15)
%define tcm [rsp]
%define ms1 [rsp+mmsize]
%define ms2 [rsp+mmsize*2]
%define am [rsp+mmsize*3]
%define bm [rsp+mmsize*4]
    SUB rsp, pad
    shl r2d, 2
    shl r3d, 2
    LOAD_AB m4, m5, r2, r3
    mov r3, 32/mmsize
    mov r2, r0
    sub r0, r1
    mova am, m4
    sub r0, r1
    mova bm, m5
    sub r0, r1
.loop:
    mova m0, [r0+r1]
    mova m1, [r0+r1*2]
    mova m2, [r2]
    mova m3, [r2+r1]
    LOAD_MASK m0, m1, m2, m3, am, bm, m7, m4, m6
    LOAD_TC m6, r4
    mova tcm, m6
    mova m5, [r0]
    LUMA_DEBLOCK_ONE m1, m0, ms1
    mova [r0+r1], m5
    mova m5, [r2+r1*2]
    LUMA_DEBLOCK_ONE m2, m3, ms2
    mova [r2+r1], m5
    pxor m5, m5
    mova m6, tcm
    pcmpgtw m5, tcm
    psubw m6, ms1
    pandn m5, m7
    psubw m6, ms2
    pand m5, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
    mova [r0+r1*2], m1
    mova [r2], m2
    add r0, mmsize
    add r2, mmsize
    add r4, mmsize/8
    dec r3
    jg .loop
    ADD rsp, pad
    RET
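;-----------------------------------------------------------------------------
; void deblock_h_luma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------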
cglobal deblock_h_luma_10_%1, 5,6,8*(mmsize/16)
%assign pad 7*mmsize+12-(stack_offset&15)
%define tcm [rsp]
%define ms1 [rsp+mmsize]
%define ms2 [rsp+mmsize*2]
%define p1m [rsp+mmsize*3]
%define p2m [rsp+mmsize*4]
%define am [rsp+mmsize*5]
%define bm [rsp+mmsize*6]
    SUB rsp, pad
    shl r2d, 2
    shl r3d, 2
    LOAD_AB m4, m5, r2, r3
    mov r3, r1
    mova am, m4
    add r3, r1
    mov r5, 32/mmsize
    mova bm, m5
    add r3, r1
%if mmsize == 16
    mov r2, r0
    add r2, r3
%endif
.loop:
%if mmsize == 8
    movq m2, [r0-8] ; y q2 q1 q0
    movq m7, [r0+0]
    movq m5, [r0+r1-8]
    movq m3, [r0+r1+0]
    movq m0, [r0+r1*2-8]
    movq m6, [r0+r1*2+0]
    movq m1, [r0+r3-8]
    TRANSPOSE4x4W 2, 5, 0, 1, 4
    SWAP 2, 7
    movq m7, [r0+r3]
    TRANSPOSE4x4W 2, 3, 6, 7, 4
%else
    movu m5, [r0-8] ; y q2 q1 q0 p0 p1 p2 x
    movu m0, [r0+r1-8]
    movu m2, [r0+r1*2-8]
    movu m3, [r2-8]
    TRANSPOSE4x4W 5, 0, 2, 3, 6
    mova tcm, m3
    movu m4, [r2+r1-8]
    movu m1, [r2+r1*2-8]
    movu m3, [r2+r3-8]
    movu m7, [r2+r1*4-8]
    TRANSPOSE4x4W 4, 1, 3, 7, 6
    mova m6, tcm
    punpcklqdq m6, m7
    punpckhqdq m5, m4
    SBUTTERFLY qdq, 0, 1, 7
    SBUTTERFLY qdq, 2, 3, 7
%endif
    mova p2m, m6
    LOAD_MASK m0, m1, m2, m3, am, bm, m7, m4, m6
    LOAD_TC m6, r4
    mova tcm, m6
    LUMA_DEBLOCK_ONE m1, m0, ms1
    mova p1m, m5
    mova m5, p2m
    LUMA_DEBLOCK_ONE m2, m3, ms2
    mova p2m, m5
    pxor m5, m5
    mova m6, tcm
    pcmpgtw m5, tcm
    psubw m6, ms1
    pandn m5, m7
    psubw m6, ms2
    pand m5, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
    mova m0, p1m
    mova m3, p2m
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    LUMA_H_STORE r2, r3
    add r4, mmsize/8
    lea r0, [r0+r1*(mmsize/2)]
    lea r2, [r2+r1*(mmsize/2)]
    dec r5
    jg .loop
    ADD rsp, pad
    RET
%endmacro

INIT_XMM

%if ARCH_X86_64
; in: m0=p1, m1=p0, m2=q0, m3=q1, m8=p2, m9=q2
;     m12=alpha, m13=beta
; out: m0=p1', m3=q1', m1=p0', m2=q0'
; clobbers: m4, m5, m6, m7, m10, m11, m14
%macro DEBLOCK_LUMA_INTER_SSE2 0
    LOAD_MASK m0, m1, m2, m3, m12, m13, m7, m4, m6
    LOAD_TC m6, r4
    DIFF_LT m8, m1, m13, m10, m4
    DIFF_LT m9, m2, m13, m11, m4
    pand m6, m7
    mova m14, m6
    pxor m4, m4
    pcmpgtw m6, m4
    pand m6, m14
    mova m5, m10
    pand m5, m6
    LUMA_Q1 m8, m0, m1, m2, m5, m4
    mova m5, m11
    pand m5, m6
    LUMA_Q1 m9, m3, m1, m2, m5, m4
    pxor m4, m4
    psubw m6, m10
    pcmpgtw m4, m14
    pandn m4, m7
    psubw m6, m11
    pand m4, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m4, m5, m6
    SWAP 0, 8
    SWAP 3, 9
%endmacro

%macro DEBLOCK_LUMA_64 1
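;-----------------------------------------------------------------------------
; void deblock_v_luma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------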
cglobal deblock_v_luma_10_%1, 5,5,15
%define p2 m8
%define p1 m0
%define p0 m1
%define q0 m2
%define q1 m3
%define q2 m9
%define mask0 m7
%define mask1 m10
%define mask2 m11
    shl r2d, 2
    shl r3d, 2
    LOAD_AB m12, m13, r2, r3
    mov r2, r0
    sub r0, r1
    sub r0, r1
    sub r0, r1
    mov r3, 2
.loop:
    mova p2, [r0]
    mova p1, [r0+r1]
    mova p0, [r0+r1*2]
    mova q0, [r2]
    mova q1, [r2+r1]
    mova q2, [r2+r1*2]
    DEBLOCK_LUMA_INTER_SSE2
    mova [r0+r1], p1
    mova [r0+r1*2], p0
    mova [r2], q0
    mova [r2+r1], q1
    add r0, mmsize
    add r2, mmsize
    add r4, 2
    dec r3
    jg .loop
    REP_RET
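;-----------------------------------------------------------------------------
; void deblock_h_luma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------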
cglobal deblock_h_luma_10_%1, 5,7,15
    shl r2d, 2
    shl r3d, 2
    LOAD_AB m12, m13, r2, r3
    mov r2, r1
    add r2, r1
    add r2, r1
    mov r5, r0
    add r5, r2
    mov r6, 2
.loop:
    movu m8, [r0-8] ; y q2 q1 q0 p0 p1 p2 x
    movu m0, [r0+r1-8]
    movu m2, [r0+r1*2-8]
    movu m9, [r5-8]
    movu m5, [r5+r1-8]
    movu m1, [r5+r1*2-8]
    movu m3, [r5+r2-8]
    movu m7, [r5+r1*4-8]
    TRANSPOSE4x4W 8, 0, 2, 9, 10
    TRANSPOSE4x4W 5, 1, 3, 7, 10
    punpckhqdq m8, m5
    SBUTTERFLY qdq, 0, 1, 10
    SBUTTERFLY qdq, 2, 3, 10
    punpcklqdq m9, m7
    DEBLOCK_LUMA_INTER_SSE2
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    LUMA_H_STORE r5, r2
    add r4, 2
    lea r0, [r0+r1*8]
    lea r5, [r5+r1*8]
    dec r6
    jg .loop
    REP_RET
%endmacro

INIT_XMM
DEBLOCK_LUMA_64 sse2
INIT_AVX
DEBLOCK_LUMA_64 avx
%endif
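; store %2 to %1, using SWAP when %1 is a register and mova when it is memory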
%macro SWAPMOVA 2
%ifid %1
    SWAP %1, %2
%else
    mova %1, %2
%endif
%endmacro

; in: t0-t2: tmp registers
;     %1=p0 %2=p1 %3=p2 %4=p3 %5=q0 %6=q1 %7=mask0
;     %8=mask1p %9=2 %10=p0' %11=p1' %12=p2'
%macro LUMA_INTRA_P012 12 ; p0..p3 in memory
%if ARCH_X86_64
    paddw t0, %3, %2
    mova t2, %4
    paddw t2, %3
%else
    mova t0, %3
    mova t2, %4
    paddw t0, %2
    paddw t2, %3
%endif
    paddw t0, %1
    paddw t2, t2
    paddw t0, %5
    paddw t2, %9
    paddw t0, %9 ; (p2 + p1 + p0 + q0 + 2)
    paddw t2, t0 ; (2*p3 + 3*p2 + p1 + p0 + q0 + 4)
    psrlw t2, 3
    psrlw t1, t0, 2
    psubw t2, %3
    psubw t1, %2
    pand t2, %8
    pand t1, %8
    paddw t2, %3
    paddw t1, %2
    SWAPMOVA %11, t1
    psubw t1, t0, %3
    paddw t0, t0
    psubw t1, %5
    psubw t0, %3
    paddw t1, %6
    paddw t1, %2
    paddw t0, %6
    psrlw t1, 2 ; (2*p1 + p0 + q1 + 2)/4
    psrlw t0, 3 ; (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3
    pxor t0, t1
    pxor t1, %1
    pand t0, %8
    pand t1, %7
    pxor t0, t1
    pxor t0, %1
    SWAPMOVA %10, t0
    SWAPMOVA %12, t2
%endmacro
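; allocate %1 spill slots on the stack: t0-t3 alias m4-m7,
; t4..t(%1+3) are [rsp]-based memory slots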
%macro LUMA_INTRA_INIT 1
%xdefine pad %1*mmsize+((gprsize*3) % mmsize)-(stack_offset&15)
%define t0 m4
%define t1 m5
%define t2 m6
%define t3 m7
%assign i 4
%rep %1
    CAT_XDEFINE t, i, [rsp+mmsize*(i-4)]
%assign i i+1
%endrep
    SUB rsp, pad
%endmacro

; in: %1-%3=tmp, %4=p2, %5=q2
%macro LUMA_INTRA_INTER 5
    LOAD_AB t0, t1, r2d, r3d
    mova %1, t0
    LOAD_MASK m0, m1, m2, m3, %1, t1, t0, t2, t3
%if ARCH_X86_64
    mova %2, t0 ; mask0
    psrlw t3, %1, 2
%else
    mova t3, %1
    mova %2, t0 ; mask0
    psrlw t3, 2
%endif
    paddw t3, [pw_2] ; alpha/4+2
    DIFF_LT m1, m2, t3, t2, t0 ; t2 = |p0-q0| < alpha/4+2
    pand t2, %2
    mova t3, %5 ; q2
    mova %1, t2 ; mask1
    DIFF_LT t3, m2, t1, t2, t0 ; t2 = |q2-q0| < beta
    pand t2, %1
    mova t3, %4 ; p2
    mova %3, t2 ; mask1q
    DIFF_LT t3, m1, t1, t2, t0 ; t2 = |p2-p0| < beta
    pand t2, %1
    mova %1, t2 ; mask1p
%endmacro
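; load the rows around a vertical edge and transpose them:
; out: m0=p1, m1=p0, m2=q0, m3=q1; p3/p2/q2/q3 spilled to t4-t7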
%macro LUMA_H_INTRA_LOAD 0
%if mmsize == 8
    movu t0, [r0-8]
    movu t1, [r0+r1-8]
    movu m0, [r0+r1*2-8]
    movu m1, [r0+r4-8]
    TRANSPOSE4x4W 4, 5, 0, 1, 2
    mova t4, t0 ; p3
    mova t5, t1 ; p2
    movu m2, [r0]
    movu m3, [r0+r1]
    movu t0, [r0+r1*2]
    movu t1, [r0+r4]
    TRANSPOSE4x4W 2, 3, 4, 5, 6
    mova t6, t0 ; q2
    mova t7, t1 ; q3
%else
    movu t0, [r0-8]
    movu t1, [r0+r1-8]
    movu m0, [r0+r1*2-8]
    movu m1, [r0+r5-8]
    movu m2, [r4-8]
    movu m3, [r4+r1-8]
    movu t2, [r4+r1*2-8]
    movu t3, [r4+r5-8]
    TRANSPOSE8x8W 4, 5, 0, 1, 2, 3, 6, 7, t4, t5
    mova t4, t0 ; p3
    mova t5, t1 ; p2
    mova t6, t2 ; q2
    mova t7, t3 ; q3
%endif
%endmacro

; in: %1=q3 %2=q2' %3=q1' %4=q0' %5=p0' %6=p1' %7=p2' %8=p3 %9=tmp
%macro LUMA_H_INTRA_STORE 9
%if mmsize == 8
    TRANSPOSE4x4W %1, %2, %3, %4, %9
    movq [r0-8], m%1
    movq [r0+r1-8], m%2
    movq [r0+r1*2-8], m%3
    movq [r0+r4-8], m%4
    movq m%1, %8
    TRANSPOSE4x4W %5, %6, %7, %1, %9
    movq [r0], m%5
    movq [r0+r1], m%6
    movq [r0+r1*2], m%7
    movq [r0+r4], m%1
%else
    TRANSPOSE2x4x4W %1, %2, %3, %4, %9
    movq [r0-8], m%1
    movq [r0+r1-8], m%2
    movq [r0+r1*2-8], m%3
    movq [r0+r5-8], m%4
    movhps [r4-8], m%1
    movhps [r4+r1-8], m%2
    movhps [r4+r1*2-8], m%3
    movhps [r4+r5-8], m%4
%ifnum %8
    SWAP %1, %8
%else
    mova m%1, %8
%endif
    TRANSPOSE2x4x4W %5, %6, %7, %1, %9
    movq [r0], m%5
    movq [r0+r1], m%6
    movq [r0+r1*2], m%7
    movq [r0+r5], m%1
    movhps [r4], m%5
    movhps [r4+r1], m%6
    movhps [r4+r1*2], m%7
    movhps [r4+r5], m%1
%endif
%endmacro

%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
%macro DEBLOCK_LUMA_INTRA_64 1
cglobal deblock_v_luma_intra_10_%1, 4,7,16
%define t0 m1
%define t1 m2
%define t2 m4
%define p2 m8
%define p1 m9
%define p0 m10
%define q0 m11
%define q1 m12
%define q2 m13
%define aa m5
%define bb m14
    lea r4, [r1*4]
    lea r5, [r1*3] ; 3*stride
    neg r4
    add r4, r0 ; pix-4*stride
    mov r6, 2
    mova m0, [pw_2]
    shl r2d, 2
    shl r3d, 2
    LOAD_AB aa, bb, r2d, r3d
.loop:
    mova p2, [r4+r1]
    mova p1, [r4+2*r1]
    mova p0, [r4+r5]
    mova q0, [r0]
    mova q1, [r0+r1]
    mova q2, [r0+2*r1]
    LOAD_MASK p1, p0, q0, q1, aa, bb, m3, t0, t1
    mova t2, aa
    psrlw t2, 2
    paddw t2, m0 ; alpha/4+2
    DIFF_LT p0, q0, t2, m6, t0 ; m6 = |p0-q0| < alpha/4+2
    DIFF_LT p2, p0, bb, t1, t0 ; t1 = |p2-p0| < beta
    DIFF_LT q2, q0, bb, m7, t0 ; m7 = |q2-q0| < beta
    pand m6, m3
    pand m7, m6
    pand m6, t1
    LUMA_INTRA_P012 p0, p1, p2, [r4], q0, q1, m3, m6, m0, [r4+r5], [r4+2*r1], [r4+r1]
    LUMA_INTRA_P012 q0, q1, q2, [r0+r5], p0, p1, m3, m7, m0, [r0], [r0+r1], [r0+2*r1]
    add r0, mmsize
    add r4, mmsize
    dec r6
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_10_%1, 4,7,16
%define t0 m15
%define t1 m14
%define t2 m2
%define q3 m5
%define q2 m8
%define q1 m9
%define q0 m10
%define p0 m11
%define p1 m12
%define p2 m13
%define p3 m4
%define spill [rsp]
%assign pad 24-(stack_offset&15)
    SUB rsp, pad
    lea r4, [r1*4]
    lea r5, [r1*3] ; 3*stride
    add r4, r0 ; pix+4*stride
    mov r6, 2
    mova m0, [pw_2]
    shl r2d, 2
    shl r3d, 2
.loop:
    movu q3, [r0-8]
    movu q2, [r0+r1-8]
    movu q1, [r0+r1*2-8]
    movu q0, [r0+r5-8]
    movu p0, [r4-8]
    movu p1, [r4+r1-8]
    movu p2, [r4+r1*2-8]
    movu p3, [r4+r5-8]
    TRANSPOSE8x8W 5, 8, 9, 10, 11, 12, 13, 4, 1
    LOAD_AB m1, m2, r2d, r3d
    LOAD_MASK q1, q0, p0, p1, m1, m2, m3, t0, t1
    psrlw m1, 2
    paddw m1, m0 ; alpha/4+2
    DIFF_LT p0, q0, m1, m6, t0 ; m6 = |p0-q0| < alpha/4+2
    DIFF_LT q2, q0, m2, t1, t0 ; t1 = |q2-q0| < beta
    DIFF_LT p0, p2, m2, m7, t0 ; m7 = |p2-p0| < beta
    pand m6, m3
    pand m7, m6
    pand m6, t1
    mova spill, q3
    LUMA_INTRA_P012 q0, q1, q2, q3, p0, p1, m3, m6, m0, m5, m1, q2
    LUMA_INTRA_P012 p0, p1, p2, p3, q0, q1, m3, m7, m0, p0, m6, p2
    mova m7, spill
    LUMA_H_INTRA_STORE 7, 8, 1, 5, 11, 6, 13, 4, 14
    lea r0, [r0+r1*8]
    lea r4, [r4+r1*8]
    dec r6
    jg .loop
    ADD rsp, pad
    RET
%endmacro

INIT_XMM
DEBLOCK_LUMA_INTRA_64 sse2
INIT_AVX
DEBLOCK_LUMA_INTRA_64 avx
%endif

%macro DEBLOCK_LUMA_INTRA 1
;-----------------------------------------------------------------------------
; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_v_luma_intra_10_%1, 4,7,8*(mmsize/16)
    LUMA_INTRA_INIT 3
    lea r4, [r1*4]
    lea r5, [r1*3]
    neg r4
    add r4, r0
    mov r6, 32/mmsize
    shl r2d, 2
    shl r3d, 2
.loop:
    mova m0, [r4+r1*2] ; p1
    mova m1, [r4+r5] ; p0
    mova m2, [r0] ; q0
    mova m3, [r0+r1] ; q1
    LUMA_INTRA_INTER t4, t5, t6, [r4+r1], [r0+r1*2]
    LUMA_INTRA_P012 m1, m0, t3, [r4], m2, m3, t5, t4, [pw_2], [r4+r5], [r4+2*r1], [r4+r1]
    mova t3, [r0+r1*2] ; q2
    LUMA_INTRA_P012 m2, m3, t3, [r0+r5], m1, m0, t5, t6, [pw_2], [r0], [r0+r1], [r0+2*r1]
    add r0, mmsize
    add r4, mmsize
    dec r6
    jg .loop
    ADD rsp, pad
    RET

;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_10_%1, 4,7,8*(mmsize/16)
    LUMA_INTRA_INIT 8
%if mmsize == 8
    lea r4, [r1*3]
    mov r5, 32/mmsize
%else
    lea r4, [r1*4]
    lea r5, [r1*3] ; 3*stride
    add r4, r0 ; pix+4*stride
    mov r6, 32/mmsize
%endif
    shl r2d, 2
    shl r3d, 2
.loop:
    LUMA_H_INTRA_LOAD
    LUMA_INTRA_INTER t8, t9, t10, t5, t6
    LUMA_INTRA_P012 m1, m0, t3, t4, m2, m3, t9, t8, [pw_2], t8, t5, t11
    mova t3, t6 ; q2
    LUMA_INTRA_P012 m2, m3, t3, t7, m1, m0, t9, t10, [pw_2], m4, t6, m5
    mova m2, t4
    mova m0, t11
    mova m1, t5
    mova m3, t8
    mova m6, t6
    LUMA_H_INTRA_STORE 2, 0, 1, 3, 4, 6, 5, t7, 7
    lea r0, [r0+r1*(mmsize/2)]
%if mmsize == 8
    dec r5
%else
    lea r4, [r4+r1*(mmsize/2)]
    dec r6
%endif
    jg .loop
    ADD rsp, pad
    RET
%endmacro

%if ARCH_X86_64 == 0
INIT_MMX
DEBLOCK_LUMA mmxext
DEBLOCK_LUMA_INTRA mmxext
INIT_XMM
DEBLOCK_LUMA sse2
DEBLOCK_LUMA_INTRA sse2
INIT_AVX
DEBLOCK_LUMA avx
DEBLOCK_LUMA_INTRA avx
%endif

; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
; out: %1=p0', %2=q0'
%macro CHROMA_DEBLOCK_P0_Q0_INTRA 7
    mova %6, [pw_2]
    paddw %6, %3
    paddw %6, %4
    paddw %7, %6, %2
    paddw %6, %1
    paddw %6, %3
    paddw %7, %4
    psraw %6, 2
    psraw %7, 2
    psubw %6, %1
    psubw %7, %2
    pand %6, %5
    pand %7, %5
    paddw %1, %6
    paddw %2, %7
%endmacro
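; in: %1 = pointer to the q0 row (r0 = pix-2*stride)
; out: m0=p1, m1=p0, m2=q0, m3=q1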
%macro CHROMA_V_LOAD 1
    mova m0, [r0] ; p1
    mova m1, [r0+r1] ; p0
    mova m2, [%1] ; q0
    mova m3, [%1+r1] ; q1
%endmacro
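; write the filtered p0 (m1) and q0 (m2) rows back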
%macro CHROMA_V_STORE 0
    mova [r0+1*r1], m1
    mova [r0+2*r1], m2
%endmacro
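; in: %2 = tc0 pointer
; out: %1 = each tc0 value splatted to a pair of words and scaled by 4 for 10-bit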
%macro CHROMA_V_LOAD_TC 2
    movd %1, [%2]
    punpcklbw %1, %1
    punpcklwd %1, %1
    psraw %1, 6
%endmacro

%macro DEBLOCK_CHROMA 1
;-----------------------------------------------------------------------------
; void deblock_v_chroma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_10_%1, 5,7-(mmsize/16),8*(mmsize/16)
    mov r5, r0
    sub r0, r1
    sub r0, r1
    shl r2d, 2
    shl r3d, 2
%if mmsize < 16
    mov r6, 16/mmsize
.loop:
%endif
    CHROMA_V_LOAD r5
    LOAD_AB m4, m5, r2, r3
    LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
    pxor m4, m4
    CHROMA_V_LOAD_TC m6, r4
    psubw m6, [pw_3]
    pmaxsw m6, m4
    pand m7, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
    CHROMA_V_STORE
%if mmsize < 16
    add r0, mmsize
    add r5, mmsize
    add r4, mmsize/4
    dec r6
    jg .loop
    REP_RET
%else
    RET
%endif

;-----------------------------------------------------------------------------
; void deblock_v_chroma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_intra_10_%1, 4,6-(mmsize/16),8*(mmsize/16)
    mov r4, r0
    sub r0, r1
    sub r0, r1
    shl r2d, 2
    shl r3d, 2
%if mmsize < 16
    mov r5, 16/mmsize
.loop:
%endif
    CHROMA_V_LOAD r4
    LOAD_AB m4, m5, r2, r3
    LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
    CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6
    CHROMA_V_STORE
%if mmsize < 16
    add r0, mmsize
    add r4, mmsize
    dec r5
    jg .loop
    REP_RET
%else
    RET
%endif
%endmacro

%if ARCH_X86_64 == 0
INIT_MMX
DEBLOCK_CHROMA mmxext
%endif
INIT_XMM
DEBLOCK_CHROMA sse2
INIT_AVX
DEBLOCK_CHROMA avx