You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

844 lines
27KB

  1. ;*****************************************************************************
  2. ;* SSE2-optimized HEVC deblocking code
  3. ;*****************************************************************************
  4. ;* Copyright (C) 2013 VTT
  5. ;*
  6. ;* Authors: Seppo Tomperi <seppo.tomperi@vtt.fi>
  7. ;*
  8. ;* This file is part of FFmpeg.
  9. ;*
  10. ;* FFmpeg is free software; you can redistribute it and/or
  11. ;* modify it under the terms of the GNU Lesser General Public
  12. ;* License as published by the Free Software Foundation; either
  13. ;* version 2.1 of the License, or (at your option) any later version.
  14. ;*
  15. ;* FFmpeg is distributed in the hope that it will be useful,
  16. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  17. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  18. ;* Lesser General Public License for more details.
  19. ;*
  20. ;* You should have received a copy of the GNU Lesser General Public
  21. ;* License along with FFmpeg; if not, write to the Free Software
  22. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23. ;******************************************************************************
  24. %include "libavutil/x86/x86util.asm"
  ; ---------------------------------------------------------------------------
  ; Read-only word/dword constants used by the filters below.
  ; ---------------------------------------------------------------------------
  SECTION_RODATA

  pw_pixel_max: times 8 dw ((1 << 10)-1)   ; upper clip bound passed to CLIPW (10-bit maximum)
  pw_m1:        times 8 dw -1              ; multiplier used to negate word vectors
  pw_m2:        times 8 dw -2              ; multiplier used to form -2*tc
  pd_1:         times 4 dd 1               ; compared against 0/1 dwords to build lane masks

  ; shared constants defined outside this file
  cextern pw_4
  cextern pw_8

  SECTION .text

  INIT_XMM sse2

  ; PASS8ROWS(base, base3, stride, stride3)
  ; Expands to eight memory operands: [base], ..., [base+7*stride].
  ; Row 3 is addressed as [base3] and row 6 as [base3+stride3], so the caller
  ; is expected to pass base3 = base + 3*stride and stride3 = 3*stride.
  %define PASS8ROWS(base, base3, stride, stride3) \
      [base],         [base+stride],    [base+stride*2], [base3], \
      [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
  ; TRANSPOSE4x8B_LOAD row0, ..., row7
  ; in:  8 rows of 4 bytes in %1..%8 (memory operands)
  ; out: 4 rows of 8 words in m0..m3 (bytes zero-extended to words)
  ; clobbers: m4, m5, m6
  %macro TRANSPOSE4x8B_LOAD 8
      ; rows 0-3: interleave bytes, then words -> one dword per column
      movd             m0, %1
      movd             m2, %2
      punpcklbw        m0, m2
      movd             m1, %3
      movd             m3, %4
      punpcklbw        m1, m3
      punpcklwd        m0, m1

      ; rows 4-7: same pattern
      movd             m4, %5
      movd             m6, %6
      punpcklbw        m4, m6
      movd             m5, %7
      movd             m3, %8
      punpcklbw        m5, m3
      punpcklwd        m4, m5

      ; join both halves: m0 = columns 0,1 ; m2 = columns 2,3 (still bytes)
      punpckhdq        m2, m0, m4
      punpckldq        m0, m4

      ; widen every column from bytes to words
      pxor             m5, m5
      punpckhbw        m1, m0, m5
      punpcklbw        m0, m5
      punpckhbw        m3, m2, m5
      punpcklbw        m2, m5
  %endmacro
  ; TRANSPOSE8x4B_STORE row0, ..., row7
  ; in:  4 rows of 8 words in m0..m3
  ; out: 8 rows of 4 bytes in %1..%8 (memory operands)
  ; Words are packed to bytes with unsigned saturation (packuswb).
  ; clobbers: m0..m3, m6
  %macro TRANSPOSE8x4B_STORE 8
      ; pack each pair to bytes and interleave it
      packuswb         m0, m0
      packuswb         m1, m1
      packuswb         m2, m2
      packuswb         m3, m3
      punpcklbw        m0, m1
      punpcklbw        m2, m3

      ; m0 = output rows 0-3, m6 = output rows 4-7, one dword per row
      punpckhwd        m6, m0, m2
      punpcklwd        m0, m2

      ; store rows 0-3; pshufd 0x39 rotates the next dword into lane 0
      movd             %1, m0
      pshufd           m0, m0, 0x39
      movd             %2, m0
      pshufd           m0, m0, 0x39
      movd             %3, m0
      pshufd           m0, m0, 0x39
      movd             %4, m0

      ; store rows 4-7 the same way
      movd             %5, m6
      pshufd           m6, m6, 0x39
      movd             %6, m6
      pshufd           m6, m6, 0x39
      movd             %7, m6
      pshufd           m6, m6, 0x39
      movd             %8, m6
  %endmacro
  ; TRANSPOSE4x8W_LOAD row0, ..., row7
  ; in:  8 rows of 4 words in %1..%8 (memory operands)
  ; out: 4 rows of 8 words in m0..m3
  ; clobbers: m4, m5, m6
  %macro TRANSPOSE4x8W_LOAD 8
      ; rows 0-3 -> m0 = columns 0,1 ; m2 = columns 2,3 (4 words each)
      movq             m0, %1
      movq             m2, %2
      punpcklwd        m0, m2
      movq             m1, %3
      movq             m3, %4
      punpcklwd        m1, m3
      punpckhdq        m2, m0, m1
      punpckldq        m0, m1

      ; rows 4-7 -> m4 = columns 0,1 ; m6 = columns 2,3
      movq             m4, %5
      movq             m6, %6
      punpcklwd        m4, m6
      movq             m5, %7
      movq             m3, %8
      punpcklwd        m5, m3
      punpckhdq        m6, m4, m5
      punpckldq        m4, m5

      ; glue the two halves of every column together
      punpckhqdq       m1, m0, m4
      punpcklqdq       m0, m4
      punpckhqdq       m3, m2, m6
      punpcklqdq       m2, m6
  %endmacro
  ; TRANSPOSE8x4W_STORE row0, ..., row7
  ; in:  4 rows of 8 words in m0..m3
  ; out: 8 rows of 4 words in %1..%8 (memory operands)
  ; Every word is clipped to [0, pw_pixel_max] before being stored.
  ; clobbers: m0..m6
  %macro TRANSPOSE8x4W_STORE 8
      pxor             m5, m5                    ; lower clip bound (zero)
      CLIPW            m0, m5, [pw_pixel_max]
      CLIPW            m1, m5, [pw_pixel_max]
      CLIPW            m2, m5, [pw_pixel_max]
      CLIPW            m3, m5, [pw_pixel_max]

      ; word interleave of (m0,m1) and (m2,m3)
      punpckhwd        m4, m0, m1
      punpcklwd        m0, m1
      punpckhwd        m5, m2, m3
      punpcklwd        m2, m3

      ; output rows 0-3
      punpckhdq        m6, m0, m2
      punpckldq        m0, m2
      movq             %1, m0
      movhps           %2, m0
      movq             %3, m6
      movhps           %4, m6

      ; output rows 4-7
      punpckhdq        m6, m4, m5
      punpckldq        m4, m5
      movq             %5, m4
      movhps           %6, m4
      movq             %7, m6
      movhps           %8, m6
  %endmacro
  ; TRANSPOSE8x8B_LOAD row0, ..., row7
  ; in:  8 rows of 8 bytes in %1..%8 (memory operands)
  ; out: 8 rows of 8 words in m0..m7 (bytes zero-extended to words)
  ; clobbers: m9, m13, m15
  %macro TRANSPOSE8x8B_LOAD 8
      ; rows 0-3: m3 = low word interleave, m7 = high word interleave
      movq             m7, %1
      movq             m2, %2
      punpcklbw        m7, m2
      movq             m1, %3
      movq             m3, %4
      punpcklbw        m1, m3
      punpcklwd        m3, m7, m1
      punpckhwd        m7, m1

      ; rows 4-7: m9 = low word interleave, m4 = high word interleave
      movq             m4, %5
      movq             m6, %6
      punpcklbw        m4, m6
      movq             m5, %7
      movq             m15, %8
      punpcklbw        m5, m15
      punpcklwd        m9, m4, m5
      punpckhwd        m4, m5

      ; each register now holds two transposed rows of 8 bytes
      punpckldq        m1, m3, m9                ; 0, 1
      punpckhdq        m3, m9                    ; 2, 3
      punpckldq        m5, m7, m4                ; 4, 5
      punpckhdq        m7, m4                    ; 6, 7

      ; split the pairs and widen bytes to words
      pxor             m13, m13
      punpcklbw        m0, m1, m13               ; 0 in 16 bit
      punpckhbw        m1, m13                   ; 1 in 16 bit
      punpcklbw        m2, m3, m13               ; 2
      punpckhbw        m3, m13                   ; 3
      punpcklbw        m4, m5, m13               ; 4
      punpckhbw        m5, m13                   ; 5
      punpcklbw        m6, m7, m13               ; 6
      punpckhbw        m7, m13                   ; 7
  %endmacro
  ; TRANSPOSE8x8B_STORE row0, ..., row7
  ; in:  8 rows of 8 words in m0..m7
  ; out: 8 rows of 8 bytes in %1..%8 (memory operands)
  ; Words are packed to bytes with unsigned saturation (packuswb).
  ; clobbers: m0..m11
  %macro TRANSPOSE8x8B_STORE 8
      ; input rows 0-3 -> m0 (low words) / m8 (high words)
      packuswb         m0, m0
      packuswb         m1, m1
      packuswb         m2, m2
      packuswb         m3, m3
      packuswb         m4, m4
      packuswb         m5, m5
      packuswb         m6, m6
      packuswb         m7, m7
      punpcklbw        m0, m1
      punpcklbw        m2, m3
      punpckhwd        m8, m0, m2
      punpcklwd        m0, m2

      ; input rows 4-7 -> m4 (low words) / m9 (high words)
      punpcklbw        m4, m5
      punpcklbw        m6, m7
      punpckhwd        m9, m4, m6
      punpcklwd        m4, m6

      ; two complete output rows per register
      punpckhdq        m10, m0, m4               ; 2, 3
      punpckldq        m0, m4                    ; 0, 1
      punpckldq        m11, m8, m9               ; 4, 5
      punpckhdq        m8, m9                    ; 6, 7

      movq             %1, m0
      movhps           %2, m0
      movq             %3, m10
      movhps           %4, m10
      movq             %5, m11
      movhps           %6, m11
      movq             %7, m8
      movhps           %8, m8
  %endmacro
  ; TRANSPOSE8x8W_LOAD row0, ..., row7
  ; in:  8 rows of 8 words in %1..%8 (memory operands, unaligned loads)
  ; out: 8 rows of 8 words in m0..m7
  ; m8 is handed to TRANSPOSE8x8W as its extra register argument.
  %macro TRANSPOSE8x8W_LOAD 8
      movdqu           m0, %1
      movdqu           m1, %2
      movdqu           m2, %3
      movdqu           m3, %4
      movdqu           m4, %5
      movdqu           m5, %6
      movdqu           m6, %7
      movdqu           m7, %8

      TRANSPOSE8x8W    0, 1, 2, 3, 4, 5, 6, 7, 8
  %endmacro
  ; TRANSPOSE8x8W_STORE row0, ..., row7
  ; in:  8 rows of 8 words in m0..m7
  ; out: 8 rows of 8 words in %1..%8 (memory operands, unaligned stores)
  ; Every word is clipped to [0, pw_pixel_max] before being stored.
  ; clobbers: m0..m8
  %macro TRANSPOSE8x8W_STORE 8
      TRANSPOSE8x8W    0, 1, 2, 3, 4, 5, 6, 7, 8

      pxor             m8, m8                    ; lower clip bound (zero)
      CLIPW            m0, m8, [pw_pixel_max]
      CLIPW            m1, m8, [pw_pixel_max]
      CLIPW            m2, m8, [pw_pixel_max]
      CLIPW            m3, m8, [pw_pixel_max]
      CLIPW            m4, m8, [pw_pixel_max]
      CLIPW            m5, m8, [pw_pixel_max]
      CLIPW            m6, m8, [pw_pixel_max]
      CLIPW            m7, m8, [pw_pixel_max]

      movdqu           %1, m0
      movdqu           %2, m1
      movdqu           %3, m2
      movdqu           %4, m3
      movdqu           %5, m4
      movdqu           %6, m5
      movdqu           %7, m6
      movdqu           %8, m7
  %endmacro
  ; MASKED_COPY dst, src
  ; dst = (src & m11) | (dst & ~m11), i.e. lanes selected by the mask in m11
  ; take the new value and all other lanes keep the old one.
  ; in:  %2 = new value (clobbered), m11 = mask
  ; out: %1
  ; clobbers: m10
  %macro MASKED_COPY 2
      pand             %2, m11                   ; new value where the mask is set
      pandn            m10, m11, %1              ; old value where the mask is clear
      por              %2, m10
      mova             %1, %2
  %endmacro
  ; MASKED_COPY2 dst, src, mask
  ; Same blend as MASKED_COPY, but the mask is passed explicitly:
  ; dst = (src & mask) | (dst & ~mask).
  ; in:  %2 = new value (clobbered), %3 = mask (clobbered)
  ; out: %1
  %macro MASKED_COPY2 3
      pand             %2, %3                    ; new value where the mask is set
      pandn            %3, %1                    ; old value where the mask is clear
      por              %2, %3
      mova             %1, %2
  %endmacro
  ALIGN 16
  ; CHROMA_DEBLOCK_BODY bit_depth
  ; in:  m0..m3 = p1, p0, q0, q1 as words; tcq points at two 32-bit tc values
  ; out: m1 = p0 + delta, m2 = q0 - delta (not clipped to the pixel range here)
  ;      delta = clip((((q0 - p0) << 2) + p1 - q1 + 4) >> 3, -tc, tc)
  ; tc[0] limits the low four words, tc[1] the high four words.
  ; clobbers: m4, m5, m6
  %macro CHROMA_DEBLOCK_BODY 1
      ; unclipped delta in m5
      psubw            m4, m2, m1                ; q0 - p0
      psllw            m4, 2                     ; << 2
      psubw            m5, m0, m3                ; p1 - q1
      paddw            m5, m4
      paddw            m5, [pw_4]                ; +4
      psraw            m5, 3                     ; >> 3

      ; clip bounds: m6 = tc0 | tc1, m4 = -tc0 | -tc1
      movd             m6, [tcq]                 ; tc0
      punpcklwd        m6, m6
      movd             m4, [tcq+4]               ; tc1
      punpcklwd        m4, m4
      shufps           m6, m4, 0                 ; tc0, tc1
      pmullw           m4, m6, [pw_m1]           ; -tc0, -tc1
  %if %1 > 8
      psllw            m4, %1-8                  ; << (BIT_DEPTH - 8)
      psllw            m6, %1-8                  ; << (BIT_DEPTH - 8)
  %endif

      pmaxsw           m5, m4
      pminsw           m5, m6
      paddw            m1, m5                    ; p0 + delta0
      psubw            m2, m5                    ; q0 - delta0
  %endmacro
  ; LUMA_DEBLOCK_BODY bit_depth, direction
  ; in:  m0..m7 = p3, p2, p1, p0, q0, q1, q2, q3 as words (8 pixels per row),
  ;      betaq -> two 32-bit beta values, tcq -> two 32-bit tc values.
  ;      Index 0 of beta/tc applies to the low four words, index 1 to the high
  ;      four words.
  ; out: m1..m6 filtered (not clipped to the pixel range here)
  ; %1 = bit depth (beta and tc are scaled by << (%1 - 8)); %2 is not used.
  ; clobbers: r2, r3 (betaq/tcq), r7..r14, m8..m15
  ; The expanding function must provide the labels .store and .bypassluma;
  ; execution falls off the end of the macro when filtering was applied.
  %macro LUMA_DEBLOCK_BODY 2
      ; dp = abs(p2 - 2*p1 + p0)
      psllw            m9, m2, 1; *2
      psubw           m10, m1, m9
      paddw           m10, m3
      ABS1            m10, m11 ; 0dp0, 0dp3 , 1dp0, 1dp3

      ; dq = abs(q2 - 2*q1 + q0)
      psllw            m9, m5, 1; *2
      psubw           m11, m6, m9
      paddw           m11, m4
      ABS1            m11, m13 ; 0dq0, 0dq3 , 1dq0, 1dq3

      ;beta calculations
      ; beta values are 32-bit ints: load exactly 32 bits each so nothing past
      ; beta[1] is read and the upper halves of r11/r12 are zero.
      mov            r11d, [betaq]; beta0
  %if %1 > 8
      shl             r11, %1 - 8
  %endif
      movd            m13, r11d; beta0
      punpcklwd       m13, m13
      mov            r12d, [betaq + 4]; beta1
  %if %1 > 8
      shl             r12, %1 - 8
  %endif
      movd            m14, r12d; beta1
      punpcklwd       m14, m14
      ; shufps keeps beta0 in the low half and puts beta1 in the high half
      ; (pshufd here would broadcast beta1 everywhere and drop beta0)
      shufps          m13, m14, 0; beta0, beta1
      ;end beta calculations

      paddw            m9, m10, m11; 0d0, 0d3 , 1d0, 1d3
      pshufhw         m14, m9, 0x0f ;0b00001111; 0d3 0d3 0d0 0d0 in high
      pshuflw         m14, m14, 0x0f ;0b00001111; 1d3 1d3 1d0 1d0 in low
      pshufhw          m9, m9, 0xf0 ;0b11110000; 0d0 0d0 0d3 0d3
      pshuflw          m9, m9, 0xf0 ;0b11110000; 1d0 1d0 1d3 1d3
      paddw           m14, m9; 0d0+0d3, 1d0+1d3

      ;compare
      pcmpgtw         m15, m13, m14
      movmskps        r13, m15 ;filtering mask 0d0 + 0d3 < beta0 (bit 2 or 3) , 1d0 + 1d3 < beta1 (bit 0 or 1)
      test            r13, r13
      je              .bypassluma; neither half needs filtering

      ;weak / strong decision compare to beta_2
      psraw           m15, m13, 2; beta >> 2
      psllw            m8, m9, 1;
      pcmpgtw         m15, m8; (d0 << 1) < beta_2, (d3 << 1) < beta_2
      movmskps        r14, m15;
      ;end weak / strong decision

      ; weak filter nd_p/q calculation
      ; per half: sum of the two dp (resp. dq) samples, kept in r7..r10
      pshufd           m8, m10, 0x31
      psrld            m8, 16
      paddw            m8, m10
      movd            r7d, m8
      and              r7, 0xffff; 1dp0 + 1dp3
      pshufd           m8, m8, 0x4E
      movd            r8d, m8
      and              r8, 0xffff; 0dp0 + 0dp3

      pshufd           m8, m11, 0x31
      psrld            m8, 16
      paddw            m8, m11
      movd            r9d, m8
      and              r9, 0xffff; 1dq0 + 1dq3
      pshufd           m8, m8, 0x4E
      movd           r10d, m8
      and             r10, 0xffff; 0dq0 + 0dq3
      ; end calc for weak filter

      ; filtering mask
      ; r13 bit 0 -> low half, r13 bit 3 -> high half; betaq (r2) is dead here
      mov              r2, r13
      shr              r2, 3
      movd            m15, r2d
      and             r13, 1
      movd            m11, r13d
      shufps          m11, m15, 0
      shl              r2, 1
      or              r13, r2; filtering mask, bits 1 and 0
      pcmpeqd         m11, [pd_1]; filtering mask

      ;decide between strong and weak filtering
      ;tc25 calculations
      mov             r2d, [tcq];
  %if %1 > 8
      shl              r2, %1 - 8
  %endif
      movd             m8, r2d; tc0
      add             tcq, 4;
      mov             r3d, [tcq]; overwrites tcq, which is not needed afterwards
  %if %1 > 8
      shl              r3, %1 - 8
  %endif
      movd             m9, r3d; tc1
      add             r2d, r3d; tc0 + tc1
      jz        .bypassluma; both tc are zero
      punpcklwd        m8, m8
      punpcklwd        m9, m9
      shufps           m8, m9, 0; tc0, tc1
      mova             m9, m8
      psllw            m8, 2; tc << 2
      pavgw            m8, m9; tc25 = ((tc * 5 + 1) >> 1)
      ;end tc25 calculations

      ;----beta_3 comparison-----
      psubw           m12, m0, m3;      p3 - p0
      ABS1            m12, m14; abs(p3 - p0)

      psubw           m15, m7, m4;      q3 - q0
      ABS1            m15, m14; abs(q3 - q0)

      paddw           m12, m15; abs(p3 - p0) + abs(q3 - q0)

      pshufhw         m12, m12, 0xf0 ;0b11110000;
      pshuflw         m12, m12, 0xf0 ;0b11110000;

      psraw           m13, 3; beta >> 3
      pcmpgtw         m13, m12;
      movmskps         r2, m13;
      and             r14, r2; strong mask , beta_2 and beta_3 comparisons
      ;----beta_3 comparison end-----

      ;----tc25 comparison---
      psubw           m12, m3, m4;      p0 - q0
      ABS1            m12, m14; abs(p0 - q0)

      pshufhw         m12, m12, 0xf0 ;0b11110000;
      pshuflw         m12, m12, 0xf0 ;0b11110000;

      pcmpgtw          m8, m12; tc25 comparisons
      movmskps         r2, m8;
      and             r14, r2; strong mask, beta_2, beta_3 and tc25 comparisons
      ;----tc25 comparison end---

      ; both samples of a half must pass: AND each mask bit with its neighbour
      mov              r2, r14;
      shr              r2, 1;
      and             r14, r2; strong mask, bits 2 and 0

      pmullw          m14, m9, [pw_m2]; -tc * 2
      paddw            m9, m9; tc * 2

      and             r14, 5; 0b101
      mov              r2, r14; strong mask
      shr             r14, 2;
      movd            m12, r14d; store to xmm for mask generation
      shl             r14, 1
      and              r2, 1
      movd            m10, r2d; store to xmm for mask generation
      or              r14, r2; final strong mask, bits 1 and 0
      jz      .weakfilter; no half uses the strong filter

      shufps          m10, m12, 0
      pcmpeqd         m10, [pd_1]; strong mask

      mova            m13, [pw_4]; 4 in every cell
      pand            m11, m10; combine filtering mask and strong mask
      paddw           m12, m2, m3;          p1 +   p0
      paddw           m12, m4;          p1 +   p0 +   q0
      mova            m10, m12; copy
      paddw           m12, m12;       2*p1 + 2*p0 + 2*q0
      paddw           m12, m1;   p2 + 2*p1 + 2*p0 + 2*q0
      paddw           m12, m5;   p2 + 2*p1 + 2*p0 + 2*q0 + q1
      paddw           m12, m13;  p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4
      psraw           m12, 3;  ((p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3)
      psubw           m12, m3; ((p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3) - p0
      pmaxsw          m12, m14
      pminsw          m12, m9; av_clip( , -2 * tc, 2 * tc)
      paddw           m12, m3; p0'

      paddw           m15, m1, m10; p2 + p1 + p0 + q0
      psrlw           m13, 1; 2 in every cell
      paddw           m15, m13; p2 + p1 + p0 + q0 + 2
      psraw           m15, 2;  (p2 + p1 + p0 + q0 + 2) >> 2
      psubw           m15, m2;((p2 + p1 + p0 + q0 + 2) >> 2) - p1
      pmaxsw          m15, m14
      pminsw          m15, m9; av_clip( , -2 * tc, 2 * tc)
      paddw           m15, m2; p1'

      paddw            m8, m1, m0;     p3 +   p2
      paddw            m8, m8;   2*p3 + 2*p2
      paddw            m8, m1;   2*p3 + 3*p2
      paddw            m8, m10;  2*p3 + 3*p2 + p1 + p0 + q0
      paddw           m13, m13; 4 in every cell
      paddw            m8, m13;  2*p3 + 3*p2 + p1 + p0 + q0 + 4
      psraw            m8, 3;   (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
      psubw            m8, m1; ((2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3) - p2
      pmaxsw           m8, m14
      pminsw           m8, m9; av_clip( , -2 * tc, 2 * tc)
      paddw            m8, m1; p2'
      MASKED_COPY      m1, m8

      paddw            m8, m3, m4;         p0 +   q0
      paddw            m8, m5;         p0 +   q0 +   q1
      paddw            m8, m8;       2*p0 + 2*q0 + 2*q1
      paddw            m8, m2;  p1 + 2*p0 + 2*q0 + 2*q1
      paddw            m8, m6;  p1 + 2*p0 + 2*q0 + 2*q1 + q2
      paddw            m8, m13; p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4
      psraw            m8, 3;  (p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4) >>3
      psubw            m8, m4;
      pmaxsw           m8, m14
      pminsw           m8, m9; av_clip( , -2 * tc, 2 * tc)
      paddw            m8, m4; q0'
      MASKED_COPY      m2, m15

      paddw           m15, m3, m4;   p0 + q0
      paddw           m15, m5;   p0 + q0 + q1
      mova            m10, m15;
      paddw           m15, m6;   p0 + q0 + q1 + q2
      psrlw           m13, 1; 2 in every cell
      paddw           m15, m13;  p0 + q0 + q1 + q2 + 2
      psraw           m15, 2;   (p0 + q0 + q1 + q2 + 2) >> 2
      psubw           m15, m5; ((p0 + q0 + q1 + q2 + 2) >> 2) - q1
      pmaxsw          m15, m14
      pminsw          m15, m9; av_clip( , -2 * tc, 2 * tc)
      paddw           m15, m5; q1'

      paddw           m13, m7;      q3 + 2
      paddw           m13, m6;      q3 +  q2 + 2
      paddw           m13, m13;   2*q3 + 2*q2 + 4
      paddw           m13, m6;    2*q3 + 3*q2 + 4
      paddw           m13, m10;   2*q3 + 3*q2 + q1 + q0 + p0 + 4
      psraw           m13, 3;    (2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3
      psubw           m13, m6;  ((2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3) - q2
      pmaxsw          m13, m14
      pminsw          m13, m9; av_clip( , -2 * tc, 2 * tc)
      paddw           m13, m6; q2'

      ; p0/q0/q1/q2 are written last because the sums above still read them
      MASKED_COPY      m6, m13
      MASKED_COPY      m5, m15
      MASKED_COPY      m4, m8
      MASKED_COPY      m3, m12

  .weakfilter:
      not             r14; strong mask -> weak mask
      and             r14, r13; final weak filtering mask, bits 0 and 1
      jz             .store; nothing left for the weak filter

      ; weak filtering mask
      mov              r2, r14
      shr              r2, 1
      movd            m12, r2d
      and             r14, 1
      movd            m11, r14d
      shufps          m11, m12, 0
      pcmpeqd         m11, [pd_1]; filtering mask

      mov             r13, r11; beta0
      shr             r13, 1;
      add             r11, r13
      shr             r11, 3; ((beta0+(beta0>>1))>>3))

      mov             r13, r12; beta1
      shr             r13, 1;
      add             r12, r13
      shr             r12, 3; ((beta1+(beta1>>1))>>3))

      mova            m13, [pw_8]
      psubw           m12, m4, m3 ; q0 - p0
      psllw           m10, m12, 3; 8 * (q0 - p0)
      paddw           m12, m10 ; 9 * (q0 - p0)

      psubw           m10, m5, m2 ; q1 - p1
      psllw            m8, m10, 1; 2 * ( q1 - p1 )
      paddw           m10, m8; 3 * ( q1 - p1 )
      psubw           m12, m10; 9 * (q0 - p0) - 3 * ( q1 - p1 )
      paddw           m12, m13; + 8
      psraw           m12, 4; >> 4 , delta0
      PABSW           m13, m12; abs(delta0)

      ; m9 holds 2 * tc here, so (m9 << 2) + m9 = 10 * tc
      psllw           m10, m9, 2; 8 * tc
      paddw           m10, m9; 10 * tc
      pcmpgtw         m10, m13
      pand            m11, m10; only filter where abs(delta0) < 10 * tc

      psraw            m9, 1;   tc * 2 -> tc
      psraw           m14, 1; -tc * 2 -> -tc

      pmaxsw          m12, m14
      pminsw          m12, m9;  av_clip(delta0, -tc, tc)

      psraw            m9, 1;   tc -> tc / 2
      pmullw          m14, m9, [pw_m1]; -tc / 2

      pavgw           m15, m1, m3;   (p2 + p0 + 1) >> 1
      psubw           m15, m2;  ((p2 + p0 + 1) >> 1) - p1
      paddw           m15, m12; ((p2 + p0 + 1) >> 1) - p1 + delta0
      psraw           m15, 1;   (((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1
      pmaxsw          m15, m14
      pminsw          m15, m9; av_clip(deltap1, -tc/2, tc/2)
      paddw           m15, m2; p1'

      ;beta calculations
      movd            m10, r11d; beta0
      punpcklwd       m10, m10
      movd            m13, r12d; beta1
      punpcklwd       m13, m13
      shufps          m10, m13, 0; betax0, betax1

      movd            m13, r7d; 1dp0 + 1dp3
      movd             m8, r8d; 0dp0 + 0dp3
      punpcklwd        m8, m8
      punpcklwd       m13, m13
      shufps          m13, m8, 0;
      pcmpgtw          m8, m10, m13
      pand             m8, m11
      ;end beta calculations
      MASKED_COPY2     m2, m15, m8; write p1'

      pavgw            m8, m6, m4;   (q2 + q0 + 1) >> 1
      psubw            m8, m5;  ((q2 + q0 + 1) >> 1) - q1
      psubw            m8, m12; ((q2 + q0 + 1) >> 1) - q1 - delta0)
      psraw            m8, 1;   ((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1
      pmaxsw           m8, m14
      pminsw           m8, m9; av_clip(deltaq1, -tc/2, tc/2)
      paddw            m8, m5; q1'

      movd            m13, r9d;
      movd            m15, r10d;
      punpcklwd       m15, m15
      punpcklwd       m13, m13
      shufps          m13, m15, 0; dq0 + dq3

      pcmpgtw         m10, m13; compare to ((beta+(beta>>1))>>3)
      pand            m10, m11
      MASKED_COPY2     m5, m8, m10; write q1'

      paddw           m15, m3, m12 ; p0 + delta0
      MASKED_COPY      m3, m15

      psubw            m8, m4, m12 ; q0 - delta0
      MASKED_COPY      m4, m8
  %endmacro
  INIT_XMM sse2
  ;-----------------------------------------------------------------------------
  ; void ff_hevc_v_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q)
  ; 8-bit variant for a vertical edge: loads 8 rows of 4 bytes starting
  ; 2 bytes left of pix, filters them and writes the 4 bytes per row back.
  ;-----------------------------------------------------------------------------
  cglobal hevc_v_loop_filter_chroma_8, 3, 5, 7, pix, stride, tc, pix0, r3stride
      lea           pix0q, [pixq - 2]                ; row 0, two pixels left of the edge
      lea       r3strideq, [strideq + 2*strideq]     ; 3 * stride
      lea            pixq, [pix0q + r3strideq]       ; row 3
      TRANSPOSE4x8B_LOAD  PASS8ROWS(pix0q, pixq, strideq, r3strideq)
      CHROMA_DEBLOCK_BODY 8
      TRANSPOSE8x4B_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq)
      RET
  ; 10-bit variant of the vertical-edge chroma filter: same flow as the 8-bit
  ; one, but rows are 4 words and start 4 bytes left of pix.
  cglobal hevc_v_loop_filter_chroma_10, 3, 5, 7, pix, stride, tc, pix0, r3stride
      lea           pix0q, [pixq - 4]                ; row 0, two 16-bit pixels left of the edge
      lea       r3strideq, [strideq + 2*strideq]     ; 3 * stride
      lea            pixq, [pix0q + r3strideq]       ; row 3
      TRANSPOSE4x8W_LOAD  PASS8ROWS(pix0q, pixq, strideq, r3strideq)
      CHROMA_DEBLOCK_BODY 10
      TRANSPOSE8x4W_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq)
      RET
  ;-----------------------------------------------------------------------------
  ; void ff_hevc_h_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q
  ; 8-bit variant for a horizontal edge: rows p1, p0 sit above pix and
  ; q0, q1 start at pix; only p0 and q0 are written back.
  ;-----------------------------------------------------------------------------
  cglobal hevc_h_loop_filter_chroma_8, 3, 4, 7, pix, stride, tc, pix0
      ; pix0 = pix - 2 * stride
      lea           pix0q, [strideq + strideq]
      neg           pix0q
      add           pix0q, pixq

      ; load 8 pixels of each row and widen them to words
      pxor             m5, m5; zeros reg
      movq             m0, [pix0q];         p1
      punpcklbw        m0, m5
      movq             m1, [pix0q+strideq]; p0
      punpcklbw        m1, m5
      movq             m2, [pixq];          q0
      punpcklbw        m2, m5
      movq             m3, [pixq+strideq];  q1
      punpcklbw        m3, m5

      CHROMA_DEBLOCK_BODY 8

      ; pack p0' into the low and q0' into the high 8 bytes of m1
      packuswb         m1, m2
      movh [pix0q+strideq], m1
      movhps       [pixq], m1
      RET
  ; 10-bit variant of the horizontal-edge chroma filter: rows are already
  ; words, and p0'/q0' are clipped to [0, pw_pixel_max] before being stored.
  cglobal hevc_h_loop_filter_chroma_10, 3, 4, 7, pix, stride, tc, pix0
      ; pix0 = pix - 2 * stride
      lea           pix0q, [strideq + strideq]
      neg           pix0q
      add           pix0q, pixq

      movu             m0, [pix0q];         p1
      movu             m1, [pix0q+strideq]; p0
      movu             m2, [pixq];          q0
      movu             m3, [pixq+strideq];  q1

      CHROMA_DEBLOCK_BODY 10

      pxor             m5, m5; zeros reg
      CLIPW            m1, m5, [pw_pixel_max]
      CLIPW            m2, m5, [pw_pixel_max]
      movu [pix0q+strideq], m1
      movu         [pixq], m2
      RET
  %if ARCH_X86_64
  ; LOOP_FILTER_LUMA
  ; Emits the four luma deblocking functions (vertical/horizontal edge,
  ; 8/10 bit) for the instruction set selected by the preceding INIT_XMM.
  ; Guarded by ARCH_X86_64 because the code uses m8..m15 and r7..r14.
  ; In every function LUMA_DEBLOCK_BODY jumps to .bypassluma when nothing has
  ; to be filtered, and to .store (or falls through) when rows were modified.
  %macro LOOP_FILTER_LUMA 0
  ;-----------------------------------------------------------------------------
  ; void ff_hevc_v_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int *_beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
  ;-----------------------------------------------------------------------------
  cglobal hevc_v_loop_filter_luma_8, 4, 15, 16, pix, stride, beta, tc
      sub            pixq, 4                     ; four pixels left of the edge
      lea              r5, [3 * strideq]         ; r5 = 3 * stride
      mov              r6, pixq                  ; r6 = row 0
      add            pixq, r5                    ; pixq = row 3
      TRANSPOSE8x8B_LOAD  PASS8ROWS(r6, pixq, strideq, r5)
      LUMA_DEBLOCK_BODY 8, v
  .store:
      TRANSPOSE8x8B_STORE PASS8ROWS(r6, pixq, strideq, r5)
  .bypassluma:
      RET

  cglobal hevc_v_loop_filter_luma_10, 4, 15, 16, pix, stride, beta, tc
      sub            pixq, 8                     ; four 16-bit pixels left of the edge
      lea              r5, [3 * strideq]         ; r5 = 3 * stride
      mov              r6, pixq                  ; r6 = row 0
      add            pixq, r5                    ; pixq = row 3
      TRANSPOSE8x8W_LOAD  PASS8ROWS(r6, pixq, strideq, r5)
      LUMA_DEBLOCK_BODY 10, v
  .store:
      TRANSPOSE8x8W_STORE PASS8ROWS(r6, pixq, strideq, r5)
  .bypassluma:
      RET

  ;-----------------------------------------------------------------------------
  ; void ff_hevc_h_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int *_beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
  ;-----------------------------------------------------------------------------
  cglobal hevc_h_loop_filter_luma_8, 4, 15, 16, pix, stride, beta, tc, count, pix0, src3stride
      lea     src3strideq, [3 * strideq]
      mov           pix0q, pixq
      sub           pix0q, src3strideq
      sub           pix0q, strideq               ; pix0 = pix - 4 * stride
      ; Only 8 pixels (8 bytes) per row are filtered, so load exactly 8 bytes;
      ; a 16-byte load would read past the end of each row.
      movq             m0, [pix0q];               p3
      movq             m1, [pix0q + strideq];     p2
      movq             m2, [pix0q + 2 * strideq]; p1
      movq             m3, [pix0q + src3strideq]; p0
      movq             m4, [pixq];                q0
      movq             m5, [pixq + strideq];      q1
      movq             m6, [pixq + 2 * strideq];  q2
      movq             m7, [pixq + src3strideq];  q3
      pxor             m8, m8
      punpcklbw        m0, m8
      punpcklbw        m1, m8
      punpcklbw        m2, m8
      punpcklbw        m3, m8
      punpcklbw        m4, m8
      punpcklbw        m5, m8
      punpcklbw        m6, m8
      punpcklbw        m7, m8
      LUMA_DEBLOCK_BODY 8, h
  .store:
      ; two rows per register: low 8 bytes / high 8 bytes
      packuswb         m1, m2
      packuswb         m3, m4
      packuswb         m5, m6
      movh   [pix0q +     strideq], m1;     p2
      movhps [pix0q + 2 * strideq], m1;     p1
      movh   [pix0q + src3strideq], m3;     p0
      movhps [pixq               ], m3;     q0
      movh   [pixq  +     strideq], m5;     q1
      movhps [pixq  + 2 * strideq], m5;     q2
  .bypassluma:
      RET

  cglobal hevc_h_loop_filter_luma_10, 4, 15, 16, pix, stride, beta, tc, count, pix0, src3stride
      lea     src3strideq, [3 * strideq]
      mov           pix0q, pixq
      sub           pix0q, src3strideq
      sub           pix0q, strideq               ; pix0 = pix - 4 * stride
      movdqu           m0, [pix0q];               p3
      movdqu           m1, [pix0q + strideq];     p2
      movdqu           m2, [pix0q + 2 * strideq]; p1
      movdqu           m3, [pix0q + src3strideq]; p0
      movdqu           m4, [pixq];                q0
      movdqu           m5, [pixq + strideq];      q1
      movdqu           m6, [pixq + 2 * strideq];  q2
      movdqu           m7, [pixq + src3strideq];  q3
      LUMA_DEBLOCK_BODY 10, h
  .store:
      pxor             m8, m8; zeros reg
      CLIPW            m1, m8, [pw_pixel_max]
      CLIPW            m2, m8, [pw_pixel_max]
      CLIPW            m3, m8, [pw_pixel_max]
      CLIPW            m4, m8, [pw_pixel_max]
      CLIPW            m5, m8, [pw_pixel_max]
      CLIPW            m6, m8, [pw_pixel_max]
      movdqu [pix0q +     strideq], m1;     p2
      movdqu [pix0q + 2 * strideq], m2;     p1
      movdqu [pix0q + src3strideq], m3;     p0
      movdqu [pixq               ], m4;     q0
      movdqu [pixq  +     strideq], m5;     q1
      movdqu [pixq  + 2 * strideq], m6;     q2
  .bypassluma:
      RET
  %endmacro

  INIT_XMM sse2
  LOOP_FILTER_LUMA
  INIT_XMM ssse3
  LOOP_FILTER_LUMA
  %endif