  1. ;******************************************************************************
  2. ;* VP9 loop filter SIMD optimizations
  3. ;*
  4. ;* Copyright (C) 2013-2014 Clément Bœsch <u pkh me>
  5. ;* Copyright (C) 2014 Ronald S. Bultje <rsbultje@gmail.com>
  6. ;*
  7. ;* This file is part of Libav.
  8. ;*
  9. ;* Libav is free software; you can redistribute it and/or
  10. ;* modify it under the terms of the GNU Lesser General Public
  11. ;* License as published by the Free Software Foundation; either
  12. ;* version 2.1 of the License, or (at your option) any later version.
  13. ;*
  14. ;* Libav is distributed in the hope that it will be useful,
  15. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  17. ;* Lesser General Public License for more details.
  18. ;*
  19. ;* You should have received a copy of the GNU Lesser General Public
  20. ;* License along with Libav; if not, write to the Free Software
  21. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22. ;******************************************************************************
  23. %include "libavutil/x86/x86util.asm"
  24. SECTION_RODATA
  25. cextern pb_3
  26. cextern pb_80
  27. pb_4: times 16 db 0x04
  28. pb_10: times 16 db 0x10
  29. pb_40: times 16 db 0x40
  30. pb_81: times 16 db 0x81
  31. pb_f8: times 16 db 0xf8
  32. pb_fe: times 16 db 0xfe
  33. pb_ff: times 16 db 0xff
  34. pw_4: times 8 dw 4
  35. pw_8: times 8 dw 8
  36. ; with mix functions, two 8-bit thresholds are packed into one 16-bit value;
  37. ; the following mask is used to splat both into the same register
  38. mask_mix: times 8 db 0
  39. times 8 db 1
  40. mask_mix84: times 8 db 0xff
  41. times 8 db 0x00
  42. mask_mix48: times 8 db 0x00
  43. times 8 db 0xff
  44. SECTION .text
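; SCRATCH/UNSCRATCH spill and restore a register: on x86-64, where 16 XMM
; registers are available, they simply SWAP with a free register; on x86-32,
; which only has 8, they go through the stack slot passed as %3.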
  45. %macro SCRATCH 3
  46. %if ARCH_X86_64
  47. SWAP %1, %2
  48. %else
  49. mova [%3], m%1
  50. %endif
  51. %endmacro
  52. %macro UNSCRATCH 3
  53. %if ARCH_X86_64
  54. SWAP %1, %2
  55. %else
  56. mova m%1, [%3]
  57. %endif
  58. %endmacro
  59. ; %1 = abs(%2-%3)
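; computed as (src2 -us src1) | (src1 -us src2): each unsigned saturating
; subtraction is zero in the lanes where its operands are in the "wrong"
; order, so OR-ing the two gives the absolute difference without a signed
; comparison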
  60. %macro ABSSUB 4 ; dst, src1 (RO), src2 (RO), tmp
  61. %if ARCH_X86_64
  62. psubusb %1, %3, %2
  63. psubusb %4, %2, %3
  64. %else
  65. mova %1, %3
  66. mova %4, %2
  67. psubusb %1, %2
  68. psubusb %4, %3
  69. %endif
  70. por %1, %4
  71. %endmacro
  72. ; %1 = %1>%2
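; the optional pb_80 operand flips the sign bit of %1 first, so the signed
; pcmpgtb acts as an unsigned byte compare (the %2 threshold is expected to
; have been offset by 0x80 in the same way by the caller)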
  73. %macro CMP_GT 2-3 ; src/dst, cmp, pb_80
  74. %if %0 == 3
  75. pxor %1, %3
  76. %endif
  77. pcmpgtb %1, %2
  78. %endmacro
  79. ; %1 = abs(%2-%3) > %4
  80. %macro ABSSUB_GT 5-6 [pb_80] ; dst, src1, src2, cmp, tmp, [pb_80]
  81. ABSSUB %1, %2, %3, %5 ; dst = abs(src1-src2)
  82. CMP_GT %1, %4, %6 ; dst > cmp
  83. %endmacro
  84. %macro MASK_APPLY 4 ; %1=new_data/dst %2=old_data %3=mask %4=tmp
  85. pand %1, %3 ; new &= mask
  86. pandn %4, %3, %2 ; tmp = ~mask & old
  87. por %1, %4 ; new&mask | old&~mask
  88. %endmacro
  89. %macro UNPACK 4
  90. %if ARCH_X86_64
  91. punpck%1bw %2, %3, %4
  92. %else
  93. mova %2, %3
  94. punpck%1bw %2, %4
  95. %endif
  96. %endmacro
  97. %macro FILTER_SUBx2_ADDx2 11 ; %1=dst %2=h/l %3=cache %4=stack_off %5=sub1 %6=sub2 %7=add1
  98. ; %8=add2 %9=rshift, [unpack], [unpack_is_mem_on_x86_32]
  99. psubw %3, [rsp+%4+%5*32]
  100. psubw %3, [rsp+%4+%6*32]
  101. paddw %3, [rsp+%4+%7*32]
  102. %ifnidn %10, ""
  103. %if %11 == 0
  104. punpck%2bw %1, %10, m0
  105. %else
  106. UNPACK %2, %1, %10, m0
  107. %endif
  108. mova [rsp+%4+%8*32], %1
  109. paddw %3, %1
  110. %else
  111. paddw %3, [rsp+%4+%8*32]
  112. %endif
  113. psraw %1, %3, %9
  114. %endmacro
  115. ; FIXME interleave l/h better (for instruction pairing)
  116. %macro FILTER_INIT 9 ; tmp1, tmp2, cacheL, cacheH, dstp, stack_off, filterid, mask, source
  117. FILTER%7_INIT %1, l, %3, %6 + 0
  118. FILTER%7_INIT %2, h, %4, %6 + 16
  119. packuswb %1, %2
  120. MASK_APPLY %1, %9, %8, %2
  121. mova %5, %1
  122. %endmacro
  123. %macro FILTER_UPDATE 12-16 "", "", "", 0 ; tmp1, tmp2, cacheL, cacheH, dstp, stack_off, -, -, +, +, rshift,
  124. ; mask, [source], [unpack + src], [unpack_is_mem_on_x86_32]
  125. ; FIXME interleave this properly with the subx2/addx2
  126. %ifnidn %15, ""
  127. %if %16 == 0 || ARCH_X86_64
  128. mova %14, %15
  129. %endif
  130. %endif
  131. FILTER_SUBx2_ADDx2 %1, l, %3, %6 + 0, %7, %8, %9, %10, %11, %14, %16
  132. FILTER_SUBx2_ADDx2 %2, h, %4, %6 + 16, %7, %8, %9, %10, %11, %14, %16
  133. packuswb %1, %2
  134. %ifnidn %13, ""
  135. MASK_APPLY %1, %13, %12, %2
  136. %else
  137. MASK_APPLY %1, %5, %12, %2
  138. %endif
  139. mova %5, %1
  140. %endmacro
  141. %macro SRSHIFT3B_2X 4 ; reg1, reg2, [pb_10], tmp
  142. mova %4, [pb_f8]
  143. pand %1, %4
  144. pand %2, %4
  145. psrlq %1, 3
  146. psrlq %2, 3
  147. pxor %1, %3
  148. pxor %2, %3
  149. psubb %1, %3
  150. psubb %2, %3
  151. %endmacro
  152. %macro EXTRACT_POS_NEG 3 ; i8, neg, pos
  153. pxor %3, %3
  154. pxor %2, %2
  155. pcmpgtb %3, %1 ; i8 < 0 mask
  156. psubb %2, %1 ; neg values (only the originally negative ones are kept)
  157. pand %2, %3 ; negative values of i8 (but stored as +)
  158. pandn %3, %1 ; positive values of i8
  159. %endmacro
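; there is no instruction that adds a signed byte to an unsigned byte with
; clipping, so SIGN_ADD/SIGN_SUB split the signed delta into its positive and
; negative magnitudes (EXTRACT_POS_NEG above) and apply them with
; paddusb/psubusb, which saturate at 255 and 0 respectively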
  160. ; clip_u8(u8 + i8)
  161. %macro SIGN_ADD 4 ; dst, u8, i8, tmp1
  162. EXTRACT_POS_NEG %3, %4, %1
  163. paddusb %1, %2 ; add the positives
  164. psubusb %1, %4 ; sub the negatives
  165. %endmacro
  166. ; clip_u8(u8 - i8)
  167. %macro SIGN_SUB 4 ; dst, u8, i8, tmp1
  168. EXTRACT_POS_NEG %3, %1, %4
  169. paddusb %1, %2 ; add the negatives
  170. psubusb %1, %4 ; sub the positives
  171. %endmacro
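; FILTER6_INIT/FILTER14_INIT below build the first smoothed output as a sum of
; unpacked words (e.g. p3*3 + p2*2 + p1 + p0 + q0 + 4 for the 7-tap case),
; caching each unpacked source in a 32-byte stack slot; FILTER_UPDATE then
; slides that window one output at a time by subtracting two cached words and
; adding two new ones (the numeric -,-,+,+ arguments index those slots)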
  172. %macro FILTER6_INIT 4 ; %1=dst %2=h/l %3=cache, %4=stack_off
  173. UNPACK %2, %1, rp3, m0 ; p3: B->W
  174. mova [rsp+%4+0*32], %1
  175. paddw %3, %1, %1 ; p3*2
  176. paddw %3, %1 ; p3*3
  177. punpck%2bw %1, m1, m0 ; p2: B->W
  178. mova [rsp+%4+1*32], %1
  179. paddw %3, %1 ; p3*3 + p2
  180. paddw %3, %1 ; p3*3 + p2*2
  181. UNPACK %2, %1, rp1, m0 ; p1: B->W
  182. mova [rsp+%4+2*32], %1
  183. paddw %3, %1 ; p3*3 + p2*2 + p1
  184. UNPACK %2, %1, rp0, m0 ; p0: B->W
  185. mova [rsp+%4+3*32], %1
  186. paddw %3, %1 ; p3*3 + p2*2 + p1 + p0
  187. UNPACK %2, %1, rq0, m0 ; q0: B->W
  188. mova [rsp+%4+4*32], %1
  189. paddw %3, %1 ; p3*3 + p2*2 + p1 + p0 + q0
  190. paddw %3, [pw_4] ; p3*3 + p2*2 + p1 + p0 + q0 + 4
  191. psraw %1, %3, 3 ; (p3*3 + p2*2 + p1 + p0 + q0 + 4) >> 3
  192. %endmacro
  193. %macro FILTER14_INIT 4 ; %1=dst %2=h/l %3=cache, %4=stack_off
  194. punpck%2bw %1, m2, m0 ; p7: B->W
  195. mova [rsp+%4+ 8*32], %1
  196. psllw %3, %1, 3 ; p7*8
  197. psubw %3, %1 ; p7*7
  198. punpck%2bw %1, m3, m0 ; p6: B->W
  199. mova [rsp+%4+ 9*32], %1
  200. paddw %3, %1 ; p7*7 + p6
  201. paddw %3, %1 ; p7*7 + p6*2
  202. UNPACK %2, %1, rp5, m0 ; p5: B->W
  203. mova [rsp+%4+10*32], %1
  204. paddw %3, %1 ; p7*7 + p6*2 + p5
  205. UNPACK %2, %1, rp4, m0 ; p4: B->W
  206. mova [rsp+%4+11*32], %1
  207. paddw %3, %1 ; p7*7 + p6*2 + p5 + p4
  208. paddw %3, [rsp+%4+ 0*32] ; p7*7 + p6*2 + p5 + p4 + p3
  209. paddw %3, [rsp+%4+ 1*32] ; p7*7 + p6*2 + p5 + .. + p2
  210. paddw %3, [rsp+%4+ 2*32] ; p7*7 + p6*2 + p5 + .. + p1
  211. paddw %3, [rsp+%4+ 3*32] ; p7*7 + p6*2 + p5 + .. + p0
  212. paddw %3, [rsp+%4+ 4*32] ; p7*7 + p6*2 + p5 + .. + p0 + q0
  213. paddw %3, [pw_8] ; p7*7 + p6*2 + p5 + .. + p0 + q0 + 8
  214. psraw %1, %3, 4 ; (p7*7 + p6*2 + p5 + .. + p0 + q0 + 8) >> 4
  215. %endmacro
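; 16x16 byte transpose; %17 is a memory scratch slot used to spill whichever
; register currently serves as the SBUTTERFLY temporary, since the transpose
; needs one more register than the 16 XMMs available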
  216. %macro TRANSPOSE16x16B 17
  217. mova %17, m%16
  218. SBUTTERFLY bw, %1, %2, %16
  219. SBUTTERFLY bw, %3, %4, %16
  220. SBUTTERFLY bw, %5, %6, %16
  221. SBUTTERFLY bw, %7, %8, %16
  222. SBUTTERFLY bw, %9, %10, %16
  223. SBUTTERFLY bw, %11, %12, %16
  224. SBUTTERFLY bw, %13, %14, %16
  225. mova m%16, %17
  226. mova %17, m%14
  227. SBUTTERFLY bw, %15, %16, %14
  228. SBUTTERFLY wd, %1, %3, %14
  229. SBUTTERFLY wd, %2, %4, %14
  230. SBUTTERFLY wd, %5, %7, %14
  231. SBUTTERFLY wd, %6, %8, %14
  232. SBUTTERFLY wd, %9, %11, %14
  233. SBUTTERFLY wd, %10, %12, %14
  234. SBUTTERFLY wd, %13, %15, %14
  235. mova m%14, %17
  236. mova %17, m%12
  237. SBUTTERFLY wd, %14, %16, %12
  238. SBUTTERFLY dq, %1, %5, %12
  239. SBUTTERFLY dq, %2, %6, %12
  240. SBUTTERFLY dq, %3, %7, %12
  241. SBUTTERFLY dq, %4, %8, %12
  242. SBUTTERFLY dq, %9, %13, %12
  243. SBUTTERFLY dq, %10, %14, %12
  244. SBUTTERFLY dq, %11, %15, %12
  245. mova m%12, %17
  246. mova %17, m%8
  247. SBUTTERFLY dq, %12, %16, %8
  248. SBUTTERFLY qdq, %1, %9, %8
  249. SBUTTERFLY qdq, %2, %10, %8
  250. SBUTTERFLY qdq, %3, %11, %8
  251. SBUTTERFLY qdq, %4, %12, %8
  252. SBUTTERFLY qdq, %5, %13, %8
  253. SBUTTERFLY qdq, %6, %14, %8
  254. SBUTTERFLY qdq, %7, %15, %8
  255. mova m%8, %17
  256. mova %17, m%1
  257. SBUTTERFLY qdq, %8, %16, %1
  258. mova m%1, %17
  259. SWAP %2, %9
  260. SWAP %3, %5
  261. SWAP %4, %13
  262. SWAP %6, %11
  263. SWAP %8, %15
  264. SWAP %12, %14
  265. %endmacro
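; 8x8 byte transpose for the x86-32 paths: %9/%10 load one input directly from
; memory (aligned or unaligned), %11 is a spill slot, and %12/%13 receive the
; two halves of one finished row early, so that 8 XMM registers suffice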
  266. %macro TRANSPOSE8x8B 13
  267. SBUTTERFLY bw, %1, %2, %7
  268. movdq%10 m%7, %9
  269. movdqa %11, m%2
  270. SBUTTERFLY bw, %3, %4, %2
  271. SBUTTERFLY bw, %5, %6, %2
  272. SBUTTERFLY bw, %7, %8, %2
  273. SBUTTERFLY wd, %1, %3, %2
  274. movdqa m%2, %11
  275. movdqa %11, m%3
  276. SBUTTERFLY wd, %2, %4, %3
  277. SBUTTERFLY wd, %5, %7, %3
  278. SBUTTERFLY wd, %6, %8, %3
  279. SBUTTERFLY dq, %1, %5, %3
  280. SBUTTERFLY dq, %2, %6, %3
  281. movdqa m%3, %11
  282. movh %12, m%2
  283. movhps %13, m%2
  284. SBUTTERFLY dq, %3, %7, %2
  285. SBUTTERFLY dq, %4, %8, %2
  286. SWAP %2, %5
  287. SWAP %4, %7
  288. %endmacro
  289. %macro DEFINE_REAL_P7_TO_Q7 0-1 0
  290. %define P7 dstq + 4*mstrideq + %1
  291. %define P6 dstq + mstride3q + %1
  292. %define P5 dstq + 2*mstrideq + %1
  293. %define P4 dstq + mstrideq + %1
  294. %define P3 dstq + %1
  295. %define P2 dstq + strideq + %1
  296. %define P1 dstq + 2* strideq + %1
  297. %define P0 dstq + stride3q + %1
  298. %define Q0 dstq + 4* strideq + %1
  299. %define Q1 dst2q + mstride3q + %1
  300. %define Q2 dst2q + 2*mstrideq + %1
  301. %define Q3 dst2q + mstrideq + %1
  302. %define Q4 dst2q + %1
  303. %define Q5 dst2q + strideq + %1
  304. %define Q6 dst2q + 2* strideq + %1
  305. %define Q7 dst2q + stride3q + %1
  306. %endmacro
  307. %macro DEFINE_TRANSPOSED_P7_TO_Q7 0-1 0
  308. %define P3 rsp + 0 + %1
  309. %define P2 rsp + 16 + %1
  310. %define P1 rsp + 32 + %1
  311. %define P0 rsp + 48 + %1
  312. %define Q0 rsp + 64 + %1
  313. %define Q1 rsp + 80 + %1
  314. %define Q2 rsp + 96 + %1
  315. %define Q3 rsp + 112 + %1
  316. %define P7 rsp + 128 + %1
  317. %define P6 rsp + 144 + %1
  318. %define P5 rsp + 160 + %1
  319. %define P4 rsp + 176 + %1
  320. %define Q4 rsp + 192 + %1
  321. %define Q5 rsp + 208 + %1
  322. %define Q6 rsp + 224 + %1
  323. %define Q7 rsp + 240 + %1
  324. %endmacro
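; stack layout for the transposed (horizontal) case: P3..Q3 occupy the first
; 128 bytes so the filters common to all sizes touch one contiguous block,
; while the outer pixels P7..P4/Q4..Q7, needed only by the 16-wide filter,
; live in the following 128 bytes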
  325. ; ..............AB -> AAAAAAAABBBBBBBB
  326. %macro SPLATB_MIX 1-2 [mask_mix]
  327. %if cpuflag(ssse3)
  328. pshufb %1, %2
  329. %else
  330. punpcklbw %1, %1
  331. punpcklwd %1, %1
  332. punpckldq %1, %1
  333. %endif
  334. %endmacro
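; %2 below selects the filter layout: 16 runs the wide filter over the whole
; 16-pixel edge, while 44/48/84/88 are the mixed cases in which each 8-pixel
; half uses a 4- or 8-wide filter (the digits give the per-half width) with
; its own E/I/H thresholds, packed two per word and splatted via SPLATB_MIX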
  335. %macro LOOPFILTER 5 ; %1=v/h %2=size1 %3+%4=stack, %5=32bit stack only
  336. %if UNIX64
  337. cglobal vp9_loop_filter_%1_%2_16, 5, 9, 16, %3 + %4, dst, stride, E, I, H, mstride, dst2, stride3, mstride3
  338. %else
  339. %if WIN64
  340. cglobal vp9_loop_filter_%1_%2_16, 4, 8, 16, %3 + %4, dst, stride, E, I, mstride, dst2, stride3, mstride3
  341. %else
  342. cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride, dst2, stride3, mstride3
  343. %define Ed dword r2m
  344. %define Id dword r3m
  345. %endif
  346. %define Hd dword r4m
  347. %endif
  348. mov mstrideq, strideq
  349. neg mstrideq
  350. lea stride3q, [strideq*3]
  351. lea mstride3q, [mstrideq*3]
  352. %ifidn %1, h
  353. %if %2 > 16
  354. %define movx movh
  355. lea dstq, [dstq + 4*strideq - 4]
  356. %else
  357. %define movx movu
  358. lea dstq, [dstq + 4*strideq - 8] ; go from top center (h pos) to center left (v pos)
  359. %endif
  360. lea dst2q, [dstq + 8*strideq]
  361. %else
  362. lea dstq, [dstq + 4*mstrideq]
  363. lea dst2q, [dstq + 8*strideq]
  364. %endif
  365. DEFINE_REAL_P7_TO_Q7
  366. %ifidn %1, h
  367. movx m0, [P7]
  368. movx m1, [P6]
  369. movx m2, [P5]
  370. movx m3, [P4]
  371. movx m4, [P3]
  372. movx m5, [P2]
  373. %if ARCH_X86_64 || %2 != 16
  374. movx m6, [P1]
  375. %endif
  376. movx m7, [P0]
  377. %if ARCH_X86_64
  378. movx m8, [Q0]
  379. movx m9, [Q1]
  380. movx m10, [Q2]
  381. movx m11, [Q3]
  382. movx m12, [Q4]
  383. movx m13, [Q5]
  384. movx m14, [Q6]
  385. movx m15, [Q7]
  386. DEFINE_TRANSPOSED_P7_TO_Q7
  387. %if %2 == 16
  388. TRANSPOSE16x16B 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, [rsp]
  389. mova [P7], m0
  390. mova [P6], m1
  391. mova [P5], m2
  392. mova [P4], m3
  393. %else ; %2 == 44/48/84/88
  394. ; 8x16 transpose
  395. punpcklbw m0, m1
  396. punpcklbw m2, m3
  397. punpcklbw m4, m5
  398. punpcklbw m6, m7
  399. punpcklbw m8, m9
  400. punpcklbw m10, m11
  401. punpcklbw m12, m13
  402. punpcklbw m14, m15
  403. TRANSPOSE8x8W 0, 2, 4, 6, 8, 10, 12, 14, 15
  404. SWAP 0, 4
  405. SWAP 2, 5
  406. SWAP 0, 6
  407. SWAP 0, 7
  408. SWAP 10, 9
  409. SWAP 12, 10
  410. SWAP 14, 11
  411. %endif ; %2
  412. mova [P3], m4
  413. mova [P2], m5
  414. mova [P1], m6
  415. mova [P0], m7
  416. mova [Q0], m8
  417. mova [Q1], m9
  418. mova [Q2], m10
  419. mova [Q3], m11
  420. %if %2 == 16
  421. mova [Q4], m12
  422. mova [Q5], m13
  423. mova [Q6], m14
  424. mova [Q7], m15
  425. %endif ; %2
  426. %else ; x86-32
  427. %if %2 == 16
  428. TRANSPOSE8x8B 0, 1, 2, 3, 4, 5, 6, 7, [P1], u, [rsp+%3+%4], [rsp+64], [rsp+80]
  429. DEFINE_TRANSPOSED_P7_TO_Q7
  430. movh [P7], m0
  431. movh [P5], m1
  432. movh [P3], m2
  433. movh [P1], m3
  434. movh [Q2], m5
  435. movh [Q4], m6
  436. movh [Q6], m7
  437. movhps [P6], m0
  438. movhps [P4], m1
  439. movhps [P2], m2
  440. movhps [P0], m3
  441. movhps [Q3], m5
  442. movhps [Q5], m6
  443. movhps [Q7], m7
  444. DEFINE_REAL_P7_TO_Q7
  445. movx m0, [Q0]
  446. movx m1, [Q1]
  447. movx m2, [Q2]
  448. movx m3, [Q3]
  449. movx m4, [Q4]
  450. movx m5, [Q5]
  451. movx m7, [Q7]
  452. TRANSPOSE8x8B 0, 1, 2, 3, 4, 5, 6, 7, [Q6], u, [rsp+%3+%4], [rsp+72], [rsp+88]
  453. DEFINE_TRANSPOSED_P7_TO_Q7 8
  454. movh [P7], m0
  455. movh [P5], m1
  456. movh [P3], m2
  457. movh [P1], m3
  458. movh [Q2], m5
  459. movh [Q4], m6
  460. movh [Q6], m7
  461. movhps [P6], m0
  462. movhps [P4], m1
  463. movhps [P2], m2
  464. movhps [P0], m3
  465. movhps [Q3], m5
  466. movhps [Q5], m6
  467. movhps [Q7], m7
  468. DEFINE_TRANSPOSED_P7_TO_Q7
  469. %else ; %2 == 44/48/84/88
  470. punpcklbw m0, m1
  471. punpcklbw m2, m3
  472. punpcklbw m4, m5
  473. punpcklbw m6, m7
  474. movx m1, [Q0]
  475. movx m3, [Q1]
  476. movx m5, [Q2]
  477. movx m7, [Q3]
  478. punpcklbw m1, m3
  479. punpcklbw m5, m7
  480. movx m3, [Q4]
  481. movx m7, [Q5]
  482. punpcklbw m3, m7
  483. mova [rsp], m3
  484. movx m3, [Q6]
  485. movx m7, [Q7]
  486. punpcklbw m3, m7
  487. DEFINE_TRANSPOSED_P7_TO_Q7
  488. TRANSPOSE8x8W 0, 2, 4, 6, 1, 5, 7, 3, [rsp], [Q0], 1
  489. mova [P3], m0
  490. mova [P2], m2
  491. mova [P1], m4
  492. mova [P0], m6
  493. mova [Q1], m5
  494. mova [Q2], m7
  495. mova [Q3], m3
  496. %endif ; %2
  497. %endif ; x86-32/64
  498. %endif ; %1 == h
  499. ; calc fm mask
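; fm is the per-pixel "filter this edge at all" mask: it requires
; abs(p3-p2), abs(p2-p1), abs(p1-p0), abs(q1-q0), abs(q2-q1), abs(q3-q2) <= I
; and abs(p0-q0)*2 + abs(p1-q1)/2 <= E; the code below computes the inverted
; (>) conditions and flips the result at the end with pxor pb_ff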
  500. %if %2 == 16
  501. %if cpuflag(ssse3)
  502. pxor m0, m0
  503. %endif
  504. SPLATB_REG m2, I, m0 ; I I I I ...
  505. SPLATB_REG m3, E, m0 ; E E E E ...
  506. %else
  507. %if cpuflag(ssse3)
  508. mova m0, [mask_mix]
  509. %endif
  510. movd m2, Id
  511. movd m3, Ed
  512. SPLATB_MIX m2, m0
  513. SPLATB_MIX m3, m0
  514. %endif
  515. mova m0, [pb_80]
  516. pxor m2, m0
  517. pxor m3, m0
  518. %if ARCH_X86_64
  519. %ifidn %1, v
  520. mova m8, [P3]
  521. mova m9, [P2]
  522. mova m10, [P1]
  523. mova m11, [P0]
  524. mova m12, [Q0]
  525. mova m13, [Q1]
  526. mova m14, [Q2]
  527. mova m15, [Q3]
  528. %else
  529. ; In the horizontal case, P3..Q3 are already present in some registers due
  530. ; to the previous transpose, so we just swap registers.
  531. SWAP 8, 4, 12
  532. SWAP 9, 5, 13
  533. SWAP 10, 6, 14
  534. SWAP 11, 7, 15
  535. %endif
  536. %define rp3 m8
  537. %define rp2 m9
  538. %define rp1 m10
  539. %define rp0 m11
  540. %define rq0 m12
  541. %define rq1 m13
  542. %define rq2 m14
  543. %define rq3 m15
  544. %else
  545. %define rp3 [P3]
  546. %define rp2 [P2]
  547. %define rp1 [P1]
  548. %define rp0 [P0]
  549. %define rq0 [Q0]
  550. %define rq1 [Q1]
  551. %define rq2 [Q2]
  552. %define rq3 [Q3]
  553. %endif
  554. ABSSUB_GT m5, rp3, rp2, m2, m7, m0 ; m5 = abs(p3-p2) <= I
  555. ABSSUB_GT m1, rp2, rp1, m2, m7, m0 ; m1 = abs(p2-p1) <= I
  556. por m5, m1
  557. ABSSUB_GT m1, rp1, rp0, m2, m7, m0 ; m1 = abs(p1-p0) <= I
  558. por m5, m1
  559. ABSSUB_GT m1, rq0, rq1, m2, m7, m0 ; m1 = abs(q1-q0) <= I
  560. por m5, m1
  561. ABSSUB_GT m1, rq1, rq2, m2, m7, m0 ; m1 = abs(q2-q1) <= I
  562. por m5, m1
  563. ABSSUB_GT m1, rq2, rq3, m2, m7, m0 ; m1 = abs(q3-q2) <= I
  564. por m5, m1
  565. ABSSUB m1, rp0, rq0, m7 ; abs(p0-q0)
  566. paddusb m1, m1 ; abs(p0-q0) * 2
  567. ABSSUB m2, rp1, rq1, m7 ; abs(p1-q1)
  568. pand m2, [pb_fe] ; drop lsb so shift can work
  569. psrlq m2, 1 ; abs(p1-q1)/2
  570. paddusb m1, m2 ; abs(p0-q0)*2 + abs(p1-q1)/2
  571. pxor m1, m0
  572. pcmpgtb m1, m3
  573. por m1, m5 ; fm final value
  574. SWAP 1, 3
  575. pxor m3, [pb_ff]
  576. ; (m3: fm, m8..15: p3 p2 p1 p0 q0 q1 q2 q3)
  577. ; calc flat8in (if not 44_16) and hev masks
  578. %if %2 != 44
  579. mova m6, [pb_81] ; [1 1 1 1 ...] ^ 0x80
  580. ABSSUB_GT m2, rp3, rp0, m6, m5 ; abs(p3 - p0) <= 1
  581. %if ARCH_X86_64
  582. mova m8, [pb_80]
  583. %define rb80 m8
  584. %else
  585. %define rb80 [pb_80]
  586. %endif
  587. ABSSUB_GT m1, rp2, rp0, m6, m5, rb80 ; abs(p2 - p0) <= 1
  588. por m2, m1
  589. ABSSUB m4, rp1, rp0, m5 ; abs(p1 - p0)
  590. %if %2 == 16
  591. %if cpuflag(ssse3)
  592. pxor m0, m0
  593. %endif
  594. SPLATB_REG m7, H, m0 ; H H H H ...
  595. %else
  596. movd m7, Hd
  597. SPLATB_MIX m7
  598. %endif
  599. pxor m7, rb80
  600. pxor m4, rb80
  601. pcmpgtb m0, m4, m7 ; abs(p1 - p0) > H (1/2 hev condition)
  602. CMP_GT m4, m6 ; abs(p1 - p0) <= 1
  603. por m2, m4 ; (flat8in)
  604. ABSSUB m4, rq1, rq0, m1 ; abs(q1 - q0)
  605. pxor m4, rb80
  606. pcmpgtb m5, m4, m7 ; abs(q1 - q0) > H (2/2 hev condition)
  607. por m0, m5 ; hev final value
  608. CMP_GT m4, m6 ; abs(q1 - q0) <= 1
  609. por m2, m4 ; (flat8in)
  610. ABSSUB_GT m1, rq2, rq0, m6, m5, rb80 ; abs(q2 - q0) <= 1
  611. por m2, m1
  612. ABSSUB_GT m1, rq3, rq0, m6, m5, rb80 ; abs(q3 - q0) <= 1
  613. por m2, m1 ; flat8in final value
  614. pxor m2, [pb_ff]
  615. %if %2 == 84 || %2 == 48
  616. pand m2, [mask_mix%2]
  617. %endif
  618. %else
  619. mova m6, [pb_80]
  620. movd m7, Hd
  621. SPLATB_MIX m7
  622. pxor m7, m6
  623. ABSSUB m4, rp1, rp0, m1 ; abs(p1 - p0)
  624. pxor m4, m6
  625. pcmpgtb m0, m4, m7 ; abs(p1 - p0) > H (1/2 hev condition)
  626. ABSSUB m4, rq1, rq0, m1 ; abs(q1 - q0)
  627. pxor m4, m6
  628. pcmpgtb m5, m4, m7 ; abs(q1 - q0) > H (2/2 hev condition)
  629. por m0, m5 ; hev final value
  630. %endif
  631. %if %2 == 16
  632. ; (m0: hev, m2: flat8in, m3: fm, m6: pb_81, m9..15: p2 p1 p0 q0 q1 q2 q3)
  633. ; calc flat8out mask
  634. %if ARCH_X86_64
  635. mova m8, [P7]
  636. mova m9, [P6]
  637. %define rp7 m8
  638. %define rp6 m9
  639. %else
  640. %define rp7 [P7]
  641. %define rp6 [P6]
  642. %endif
  643. ABSSUB_GT m1, rp7, rp0, m6, m5 ; abs(p7 - p0) <= 1
  644. ABSSUB_GT m7, rp6, rp0, m6, m5 ; abs(p6 - p0) <= 1
  645. por m1, m7
  646. %if ARCH_X86_64
  647. mova m8, [P5]
  648. mova m9, [P4]
  649. %define rp5 m8
  650. %define rp4 m9
  651. %else
  652. %define rp5 [P5]
  653. %define rp4 [P4]
  654. %endif
  655. ABSSUB_GT m7, rp5, rp0, m6, m5 ; abs(p5 - p0) <= 1
  656. por m1, m7
  657. ABSSUB_GT m7, rp4, rp0, m6, m5 ; abs(p4 - p0) <= 1
  658. por m1, m7
  659. %if ARCH_X86_64
  660. mova m14, [Q4]
  661. mova m15, [Q5]
  662. %define rq4 m14
  663. %define rq5 m15
  664. %else
  665. %define rq4 [Q4]
  666. %define rq5 [Q5]
  667. %endif
  668. ABSSUB_GT m7, rq4, rq0, m6, m5 ; abs(q4 - q0) <= 1
  669. por m1, m7
  670. ABSSUB_GT m7, rq5, rq0, m6, m5 ; abs(q5 - q0) <= 1
  671. por m1, m7
  672. %if ARCH_X86_64
  673. mova m14, [Q6]
  674. mova m15, [Q7]
  675. %define rq6 m14
  676. %define rq7 m15
  677. %else
  678. %define rq6 [Q6]
  679. %define rq7 [Q7]
  680. %endif
  681. ABSSUB_GT m7, rq6, rq0, m6, m5 ; abs(q6 - q0) <= 1
  682. por m1, m7
  683. ABSSUB_GT m7, rq7, rq0, m6, m5 ; abs(q7 - q0) <= 1
  684. por m1, m7 ; flat8out final value
  685. pxor m1, [pb_ff]
  686. %endif
  687. ; if (fm) {
  688. ; if (out && in) filter_14()
  689. ; else if (in) filter_6()
  690. ; else if (hev) filter_2()
  691. ; else filter_4()
  692. ; }
  693. ;
  694. ; f14: fm & out & in
  695. ; f6: fm & ~f14 & in => fm & ~(out & in) & in => fm & ~out & in
  696. ; f2: fm & ~f14 & ~f6 & hev => fm & ~(out & in) & ~(~out & in) & hev => fm & ~in & hev
  697. ; f4: fm & ~f14 & ~f6 & ~f2 => fm & ~(out & in) & ~(~out & in) & ~(~in & hev) => fm & ~in & ~hev
  698. ; (m0: hev, [m1: flat8out], [m2: flat8in], m3: fm, m8..15: p5 p4 p1 p0 q0 q1 q6 q7)
  699. ; filter2()
  700. %if %2 != 44
  701. mova m6, [pb_80] ; already in m6 if 44_16
  702. SCRATCH 2, 15, rsp+%3+%4
  703. %if %2 == 16
  704. SCRATCH 1, 8, rsp+%3+%4+16
  705. %endif
  706. %endif
  707. pxor m2, m6, rq0 ; q0 ^ 0x80
  708. pxor m4, m6, rp0 ; p0 ^ 0x80
  709. psubsb m2, m4 ; (signed) q0 - p0
  710. pxor m4, m6, rp1 ; p1 ^ 0x80
  711. pxor m5, m6, rq1 ; q1 ^ 0x80
  712. psubsb m4, m5 ; (signed) p1 - q1
  713. paddsb m4, m2 ; (q0 - p0) + (p1 - q1)
  714. paddsb m4, m2 ; 2*(q0 - p0) + (p1 - q1)
  715. paddsb m4, m2 ; 3*(q0 - p0) + (p1 - q1)
  716. paddsb m6, m4, [pb_4] ; m6: f1 = clip(f + 4, 127)
  717. paddsb m4, [pb_3] ; m4: f2 = clip(f + 3, 127)
  718. %if ARCH_X86_64
  719. mova m14, [pb_10] ; will be reused in filter4()
  720. %define rb10 m14
  721. %else
  722. %define rb10 [pb_10]
  723. %endif
  724. SRSHIFT3B_2X m6, m4, rb10, m7 ; f1 and f2 sign byte shift by 3
  725. SIGN_SUB m7, rq0, m6, m5 ; m7 = q0 - f1
  726. SIGN_ADD m1, rp0, m4, m5 ; m1 = p0 + f2
  727. %if %2 != 44
  728. %if ARCH_X86_64
  729. pandn m6, m15, m3 ; ~mask(in) & mask(fm)
  730. %else
  731. mova m6, [rsp+%3+%4]
  732. pandn m6, m3
  733. %endif
  734. pand m6, m0 ; (~mask(in) & mask(fm)) & mask(hev)
  735. %else
  736. pand m6, m3, m0
  737. %endif
  738. MASK_APPLY m7, rq0, m6, m5 ; m7 = filter2(q0) & mask / we write it in filter4()
  739. MASK_APPLY m1, rp0, m6, m5 ; m1 = filter2(p0) & mask / we write it in filter4()
  740. ; (m0: hev, m1: p0', m2: q0-p0, m3: fm, m7: q0', [m8: flat8out], m10..13: p1 p0 q0 q1, m14: pb_10, [m15: flat8in], )
  741. ; filter4()
  742. mova m4, m2
  743. paddsb m2, m4 ; 2 * (q0 - p0)
  744. paddsb m2, m4 ; 3 * (q0 - p0)
  745. paddsb m6, m2, [pb_4] ; m6: f1 = clip(f + 4, 127)
  746. paddsb m2, [pb_3] ; m2: f2 = clip(f + 3, 127)
  747. SRSHIFT3B_2X m6, m2, rb10, m4 ; f1 and f2 sign byte shift by 3
  748. %if %2 != 44
  749. %if ARCH_X86_64
  750. pandn m5, m15, m3 ; ~mask(in) & mask(fm)
  751. %else
  752. mova m5, [rsp+%3+%4]
  753. pandn m5, m3
  754. %endif
  755. pandn m0, m5 ; ~mask(hev) & (~mask(in) & mask(fm))
  756. %else
  757. pandn m0, m3
  758. %endif
  759. SIGN_SUB m5, rq0, m6, m4 ; q0 - f1
  760. MASK_APPLY m5, m7, m0, m4 ; filter4(q0) & mask
  761. mova [Q0], m5
  762. SIGN_ADD m7, rp0, m2, m4 ; p0 + f2
  763. MASK_APPLY m7, m1, m0, m4 ; filter4(p0) & mask
  764. mova [P0], m7
  765. paddb m6, [pb_80] ;
  766. pxor m1, m1 ; f=(f1+1)>>1
  767. pavgb m6, m1 ;
  768. psubb m6, [pb_40] ;
  769. SIGN_ADD m1, rp1, m6, m2 ; p1 + f
  770. SIGN_SUB m4, rq1, m6, m2 ; q1 - f
  771. MASK_APPLY m1, rp1, m0, m2 ; m1 = filter4(p1)
  772. MASK_APPLY m4, rq1, m0, m2 ; m4 = filter4(q1)
  773. mova [P1], m1
  774. mova [Q1], m4
  775. %if %2 != 44
  776. UNSCRATCH 2, 15, rsp+%3+%4
  777. %endif
  778. ; ([m1: flat8out], m2: flat8in, m3: fm, m10..13: p1 p0 q0 q1)
  779. ; filter6()
  780. %if %2 != 44
  781. pxor m0, m0
  782. %if %2 > 16
  783. pand m3, m2
  784. %else
  785. pand m2, m3 ; mask(fm) & mask(in)
  786. %if ARCH_X86_64
  787. pandn m3, m8, m2 ; ~mask(out) & (mask(fm) & mask(in))
  788. %else
  789. mova m3, [rsp+%3+%4+16]
  790. pandn m3, m2
  791. %endif
  792. %endif
  793. %if ARCH_X86_64
  794. mova m14, [P3]
  795. mova m9, [Q3]
  796. %define rp3 m14
  797. %define rq3 m9
  798. %else
  799. %define rp3 [P3]
  800. %define rq3 [Q3]
  801. %endif
  802. mova m1, [P2]
  803. FILTER_INIT m4, m5, m6, m7, [P2], %4, 6, m3, m1 ; [p2]
  804. mova m1, [Q2]
  805. FILTER_UPDATE m4, m5, m6, m7, [P1], %4, 0, 1, 2, 5, 3, m3, "", rq1, "", 1 ; [p1] -p3 -p2 +p1 +q1
  806. FILTER_UPDATE m4, m5, m6, m7, [P0], %4, 0, 2, 3, 6, 3, m3, "", m1 ; [p0] -p3 -p1 +p0 +q2
  807. FILTER_UPDATE m4, m5, m6, m7, [Q0], %4, 0, 3, 4, 7, 3, m3, "", rq3, "", 1 ; [q0] -p3 -p0 +q0 +q3
  808. FILTER_UPDATE m4, m5, m6, m7, [Q1], %4, 1, 4, 5, 7, 3, m3, "" ; [q1] -p2 -q0 +q1 +q3
  809. FILTER_UPDATE m4, m5, m6, m7, [Q2], %4, 2, 5, 6, 7, 3, m3, m1 ; [q2] -p1 -q1 +q2 +q3
  810. %endif
  811. %if %2 == 16
  812. UNSCRATCH 1, 8, rsp+%3+%4+16
  813. %endif
  814. ; (m0: 0, [m1: flat8out], m2: fm & flat8in, m8..15: q2 q3 p1 p0 q0 q1 p3 p2)
  815. ; filter14()
  816. ;
  817. ; m2 m3 m8 m9 m14 m15 m10 m11 m12 m13
  818. ;
  819. ; q2 q3 p3 p2 p1 p0 q0 q1
  820. ; p6 -7 p7 p6 p5 p4 . . . . .
  821. ; p5 -6 -p7 -p6 +p5 +q1 . . . .
  822. ; p4 -5 -p7 -p5 +p4 +q2 . . . q2
  823. ; p3 -4 -p7 -p4 +p3 +q3 . . . q3
  824. ; p2 -3 -p7 -p3 +p2 +q4 . . . q4
  825. ; p1 -2 -p7 -p2 +p1 +q5 . . . q5
  826. ; p0 -1 -p7 -p1 +p0 +q6 . . . q6
  827. ; q0 +0 -p7 -p0 +q0 +q7 . . . q7
  828. ; q1 +1 -p6 -q0 +q1 +q7 q1 . . .
  829. ; q2 +2 -p5 -q1 +q2 +q7 . q2 . .
  830. ; q3 +3 -p4 -q2 +q3 +q7 . q3 . .
  831. ; q4 +4 -p3 -q3 +q4 +q7 . q4 . .
  832. ; q5 +5 -p2 -q4 +q5 +q7 . q5 . .
  833. ; q6 +6 -p1 -q5 +q6 +q7 . q6 . .
  834. %if %2 == 16
  835. pand m1, m2 ; mask(out) & (mask(fm) & mask(in))
  836. mova m2, [P7]
  837. mova m3, [P6]
  838. %if ARCH_X86_64
  839. mova m8, [P5]
  840. mova m9, [P4]
  841. %define rp5 m8
  842. %define rp4 m9
  843. %define rp5s m8
  844. %define rp4s m9
  845. %define rp3s m14
  846. %define rq4 m8
  847. %define rq5 m9
  848. %define rq6 m14
  849. %define rq7 m15
  850. %define rq4s m8
  851. %define rq5s m9
  852. %define rq6s m14
  853. %else
  854. %define rp5 [P5]
  855. %define rp4 [P4]
  856. %define rp5s ""
  857. %define rp4s ""
  858. %define rp3s ""
  859. %define rq4 [Q4]
  860. %define rq5 [Q5]
  861. %define rq6 [Q6]
  862. %define rq7 [Q7]
  863. %define rq4s ""
  864. %define rq5s ""
  865. %define rq6s ""
  866. %endif
  867. FILTER_INIT m4, m5, m6, m7, [P6], %4, 14, m1, m3 ; [p6]
  868. FILTER_UPDATE m4, m5, m6, m7, [P5], %4, 8, 9, 10, 5, 4, m1, rp5s ; [p5] -p7 -p6 +p5 +q1
  869. FILTER_UPDATE m4, m5, m6, m7, [P4], %4, 8, 10, 11, 6, 4, m1, rp4s ; [p4] -p7 -p5 +p4 +q2
  870. FILTER_UPDATE m4, m5, m6, m7, [P3], %4, 8, 11, 0, 7, 4, m1, rp3s ; [p3] -p7 -p4 +p3 +q3
  871. FILTER_UPDATE m4, m5, m6, m7, [P2], %4, 8, 0, 1, 12, 4, m1, "", rq4, [Q4], 1 ; [p2] -p7 -p3 +p2 +q4
  872. FILTER_UPDATE m4, m5, m6, m7, [P1], %4, 8, 1, 2, 13, 4, m1, "", rq5, [Q5], 1 ; [p1] -p7 -p2 +p1 +q5
  873. FILTER_UPDATE m4, m5, m6, m7, [P0], %4, 8, 2, 3, 14, 4, m1, "", rq6, [Q6], 1 ; [p0] -p7 -p1 +p0 +q6
  874. FILTER_UPDATE m4, m5, m6, m7, [Q0], %4, 8, 3, 4, 15, 4, m1, "", rq7, [Q7], 1 ; [q0] -p7 -p0 +q0 +q7
  875. FILTER_UPDATE m4, m5, m6, m7, [Q1], %4, 9, 4, 5, 15, 4, m1, "" ; [q1] -p6 -q0 +q1 +q7
  876. FILTER_UPDATE m4, m5, m6, m7, [Q2], %4, 10, 5, 6, 15, 4, m1, "" ; [q2] -p5 -q1 +q2 +q7
  877. FILTER_UPDATE m4, m5, m6, m7, [Q3], %4, 11, 6, 7, 15, 4, m1, "" ; [q3] -p4 -q2 +q3 +q7
  878. FILTER_UPDATE m4, m5, m6, m7, [Q4], %4, 0, 7, 12, 15, 4, m1, rq4s ; [q4] -p3 -q3 +q4 +q7
  879. FILTER_UPDATE m4, m5, m6, m7, [Q5], %4, 1, 12, 13, 15, 4, m1, rq5s ; [q5] -p2 -q4 +q5 +q7
  880. FILTER_UPDATE m4, m5, m6, m7, [Q6], %4, 2, 13, 14, 15, 4, m1, rq6s ; [q6] -p1 -q5 +q6 +q7
  881. %endif
  882. %ifidn %1, h
  883. %if %2 == 16
  884. mova m0, [P7]
  885. mova m1, [P6]
  886. mova m2, [P5]
  887. mova m3, [P4]
  888. mova m4, [P3]
  889. mova m5, [P2]
  890. %if ARCH_X86_64
  891. mova m6, [P1]
  892. %endif
  893. mova m7, [P0]
  894. %if ARCH_X86_64
  895. mova m8, [Q0]
  896. mova m9, [Q1]
  897. mova m10, [Q2]
  898. mova m11, [Q3]
  899. mova m12, [Q4]
  900. mova m13, [Q5]
  901. mova m14, [Q6]
  902. mova m15, [Q7]
  903. TRANSPOSE16x16B 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, [rsp]
  904. DEFINE_REAL_P7_TO_Q7
  905. movu [P7], m0
  906. movu [P6], m1
  907. movu [P5], m2
  908. movu [P4], m3
  909. movu [P3], m4
  910. movu [P2], m5
  911. movu [P1], m6
  912. movu [P0], m7
  913. movu [Q0], m8
  914. movu [Q1], m9
  915. movu [Q2], m10
  916. movu [Q3], m11
  917. movu [Q4], m12
  918. movu [Q5], m13
  919. movu [Q6], m14
  920. movu [Q7], m15
  921. %else
  922. DEFINE_REAL_P7_TO_Q7
  923. TRANSPOSE8x8B 0, 1, 2, 3, 4, 5, 6, 7, [rsp+32], a, [rsp+%3+%4], [Q0], [Q1]
  924. movh [P7], m0
  925. movh [P5], m1
  926. movh [P3], m2
  927. movh [P1], m3
  928. movh [Q2], m5
  929. movh [Q4], m6
  930. movh [Q6], m7
  931. movhps [P6], m0
  932. movhps [P4], m1
  933. movhps [P2], m2
  934. movhps [P0], m3
  935. movhps [Q3], m5
  936. movhps [Q5], m6
  937. movhps [Q7], m7
  938. DEFINE_TRANSPOSED_P7_TO_Q7
  939. mova m0, [Q0]
  940. mova m1, [Q1]
  941. mova m2, [Q2]
  942. mova m3, [Q3]
  943. mova m4, [Q4]
  944. mova m5, [Q5]
  945. mova m7, [Q7]
  946. DEFINE_REAL_P7_TO_Q7 8
  947. TRANSPOSE8x8B 0, 1, 2, 3, 4, 5, 6, 7, [rsp+224], a, [rsp+%3+%4], [Q0], [Q1]
  948. movh [P7], m0
  949. movh [P5], m1
  950. movh [P3], m2
  951. movh [P1], m3
  952. movh [Q2], m5
  953. movh [Q4], m6
  954. movh [Q6], m7
  955. movhps [P6], m0
  956. movhps [P4], m1
  957. movhps [P2], m2
  958. movhps [P0], m3
  959. movhps [Q3], m5
  960. movhps [Q5], m6
  961. movhps [Q7], m7
  962. %endif
  963. %elif %2 == 44
  964. SWAP 0, 1 ; m0 = p1
  965. SWAP 1, 7 ; m1 = p0
  966. SWAP 2, 5 ; m2 = q0
  967. SWAP 3, 4 ; m3 = q1
  968. DEFINE_REAL_P7_TO_Q7 2
  969. SBUTTERFLY bw, 0, 1, 4
  970. SBUTTERFLY bw, 2, 3, 4
  971. SBUTTERFLY wd, 0, 2, 4
  972. SBUTTERFLY wd, 1, 3, 4
  973. movd [P7], m0
  974. movd [P3], m2
  975. movd [Q0], m1
  976. movd [Q4], m3
  977. psrldq m0, 4
  978. psrldq m1, 4
  979. psrldq m2, 4
  980. psrldq m3, 4
  981. movd [P6], m0
  982. movd [P2], m2
  983. movd [Q1], m1
  984. movd [Q5], m3
  985. psrldq m0, 4
  986. psrldq m1, 4
  987. psrldq m2, 4
  988. psrldq m3, 4
  989. movd [P5], m0
  990. movd [P1], m2
  991. movd [Q2], m1
  992. movd [Q6], m3
  993. psrldq m0, 4
  994. psrldq m1, 4
  995. psrldq m2, 4
  996. psrldq m3, 4
  997. movd [P4], m0
  998. movd [P0], m2
  999. movd [Q3], m1
  1000. movd [Q7], m3
  1001. %else
  1002. ; the following code does a transpose of 8 full lines to 16 half
  1003. ; lines (high part). It is inlined to avoid the need for a staging area
  1004. mova m0, [P3]
  1005. mova m1, [P2]
  1006. mova m2, [P1]
  1007. mova m3, [P0]
  1008. mova m4, [Q0]
  1009. mova m5, [Q1]
  1010. %if ARCH_X86_64
  1011. mova m6, [Q2]
  1012. %endif
  1013. mova m7, [Q3]
  1014. DEFINE_REAL_P7_TO_Q7
  1015. %if ARCH_X86_64
  1016. SBUTTERFLY bw, 0, 1, 8
  1017. SBUTTERFLY bw, 2, 3, 8
  1018. SBUTTERFLY bw, 4, 5, 8
  1019. SBUTTERFLY bw, 6, 7, 8
  1020. SBUTTERFLY wd, 0, 2, 8
  1021. SBUTTERFLY wd, 1, 3, 8
  1022. SBUTTERFLY wd, 4, 6, 8
  1023. SBUTTERFLY wd, 5, 7, 8
  1024. SBUTTERFLY dq, 0, 4, 8
  1025. SBUTTERFLY dq, 1, 5, 8
  1026. SBUTTERFLY dq, 2, 6, 8
  1027. SBUTTERFLY dq, 3, 7, 8
  1028. %else
  1029. SBUTTERFLY bw, 0, 1, 6
  1030. mova [rsp+64], m1
  1031. mova m6, [rsp+96]
  1032. SBUTTERFLY bw, 2, 3, 1
  1033. SBUTTERFLY bw, 4, 5, 1
  1034. SBUTTERFLY bw, 6, 7, 1
  1035. SBUTTERFLY wd, 0, 2, 1
  1036. mova [rsp+96], m2
  1037. mova m1, [rsp+64]
  1038. SBUTTERFLY wd, 1, 3, 2
  1039. SBUTTERFLY wd, 4, 6, 2
  1040. SBUTTERFLY wd, 5, 7, 2
  1041. SBUTTERFLY dq, 0, 4, 2
  1042. SBUTTERFLY dq, 1, 5, 2
  1043. movh [Q0], m1
  1044. movhps [Q1], m1
  1045. mova m2, [rsp+96]
  1046. SBUTTERFLY dq, 2, 6, 1
  1047. SBUTTERFLY dq, 3, 7, 1
  1048. %endif
  1049. SWAP 3, 6
  1050. SWAP 1, 4
  1051. movh [P7], m0
  1052. movhps [P6], m0
  1053. movh [P5], m1
  1054. movhps [P4], m1
  1055. movh [P3], m2
  1056. movhps [P2], m2
  1057. movh [P1], m3
  1058. movhps [P0], m3
  1059. %if ARCH_X86_64
  1060. movh [Q0], m4
  1061. movhps [Q1], m4
  1062. %endif
  1063. movh [Q2], m5
  1064. movhps [Q3], m5
  1065. movh [Q4], m6
  1066. movhps [Q5], m6
  1067. movh [Q6], m7
  1068. movhps [Q7], m7
  1069. %endif
  1070. %endif
  1071. RET
  1072. %endmacro
  1073. %macro LPF_16_VH 5
  1074. INIT_XMM %5
  1075. LOOPFILTER v, %1, %2, 0, %4
  1076. LOOPFILTER h, %1, %2, %3, %4
  1077. %endmacro
  1078. %macro LPF_16_VH_ALL_OPTS 4
  1079. LPF_16_VH %1, %2, %3, %4, sse2
  1080. LPF_16_VH %1, %2, %3, %4, ssse3
  1081. LPF_16_VH %1, %2, %3, %4, avx
  1082. %endmacro
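; LPF_16_VH_ALL_OPTS arguments: filter size, stack space for the word caches
; used by the 8/16-wide filters, additional transpose buffer needed only by
; the h variant, and extra spill space used only on x86-32; each combination
; is instantiated for sse2, ssse3 and avx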
  1083. LPF_16_VH_ALL_OPTS 16, 512, 256, 32
  1084. LPF_16_VH_ALL_OPTS 44, 0, 128, 0
  1085. LPF_16_VH_ALL_OPTS 48, 256, 128, 16
  1086. LPF_16_VH_ALL_OPTS 84, 256, 128, 16
  1087. LPF_16_VH_ALL_OPTS 88, 256, 128, 16