;******************************************************************************
;* VP9 loop filter SIMD optimizations
;*
;* Copyright (C) 2015 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pw_511:   times 16 dw   511
pw_2047:  times 16 dw  2047
pw_16384: times 16 dw 16384
pw_m512:  times 16 dw  -512
pw_m2048: times 16 dw -2048

cextern pw_1
cextern pw_3
cextern pw_4
cextern pw_8
cextern pw_16
cextern pw_256
cextern pw_1023
cextern pw_4095
cextern pw_m1

SECTION .text
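
; Register-pressure helpers: on x86-64 there are 16 xmm registers, so values
; can simply be SWAPped into the high registers (m8-m15); on x86-32 only m0-m7
; exist, so SCRATCH spills to a stack slot (and UNSCRATCH reloads), while the
; optional last argument names the value via reg_<name> so later code can
; reference it uniformly on both arches. PRELOAD similarly binds a value from
; memory either to a register (x86-64) or to its memory operand (x86-32).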
%macro SCRATCH 3-4
%if ARCH_X86_64
    SWAP %1, %2
%if %0 == 4
%define reg_%4 m%2
%endif
%else
    mova [%3], m%1
%if %0 == 4
%define reg_%4 [%3]
%endif
%endif
%endmacro

%macro UNSCRATCH 3-4
%if ARCH_X86_64
    SWAP %1, %2
%else
    mova m%1, [%3]
%endif
%if %0 == 4
%undef reg_%4
%endif
%endmacro

%macro PRELOAD 2-3
%if ARCH_X86_64
    mova m%1, [%2]
%if %0 == 3
%define reg_%3 m%1
%endif
%elif %0 == 3
%define reg_%3 [%2]
%endif
%endmacro

; calculate p or q portion of flat8out
%macro FLAT8OUT_HALF 0
    psubw m4, m0 ; q4-q0
    psubw m5, m0 ; q5-q0
    psubw m6, m0 ; q6-q0
    psubw m7, m0 ; q7-q0
    ABS2 m4, m5, m2, m3 ; abs(q4-q0) | abs(q5-q0)
    ABS2 m6, m7, m2, m3 ; abs(q6-q0) | abs(q7-q0)
    pcmpgtw m4, reg_F ; abs(q4-q0) > F
    pcmpgtw m5, reg_F ; abs(q5-q0) > F
    pcmpgtw m6, reg_F ; abs(q6-q0) > F
    pcmpgtw m7, reg_F ; abs(q7-q0) > F
    por m5, m4
    por m7, m6
    por m7, m5 ; !flat8out, q portion
%endmacro
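
; In scalar terms (a rough sketch, not the exact reference code), the macro
; above computes, for the q side:
;   !flat8out_q = (|q4-q0| > F) | (|q5-q0| > F) | (|q6-q0| > F) | (|q7-q0| > F)
; with reg_F holding 4 (10 bpp) or 16 (12 bpp), i.e. presumably 1 << (bpp - 8).
; The same macro is reused for the p side by loading p4-p7 into m4-m7 instead.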

; calculate p or q portion of flat8in/hev/fm (excluding mb_edge condition)
%macro FLAT8IN_HALF 1
%if %1 > 4
    psubw m4, m3, m0 ; q3-q0
    psubw m5, m2, m0 ; q2-q0
    ABS2 m4, m5, m6, m7 ; abs(q3-q0) | abs(q2-q0)
    pcmpgtw m4, reg_F ; abs(q3-q0) > F
    pcmpgtw m5, reg_F ; abs(q2-q0) > F
%endif
    psubw m3, m2 ; q3-q2
    psubw m2, m1 ; q2-q1
    ABS2 m3, m2, m6, m7 ; abs(q3-q2) | abs(q2-q1)
    pcmpgtw m3, reg_I ; abs(q3-q2) > I
    pcmpgtw m2, reg_I ; abs(q2-q1) > I
%if %1 > 4
    por m4, m5
%endif
    por m2, m3
    psubw m3, m1, m0 ; q1-q0
    ABS1 m3, m5 ; abs(q1-q0)
%if %1 > 4
    pcmpgtw m6, m3, reg_F ; abs(q1-q0) > F
%endif
    pcmpgtw m7, m3, reg_H ; abs(q1-q0) > H
    pcmpgtw m3, reg_I ; abs(q1-q0) > I
%if %1 > 4
    por m4, m6
%endif
    por m2, m3
%endmacro
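
; Again roughly, for one side (q shown; p by symmetry), this leaves three
; partial masks in registers (matching the SCRATCH names used by the caller):
;   m4: !flat8in part = (|q3-q0| > F) | (|q2-q0| > F) | (|q1-q0| > F)   (wd > 4 only)
;   m2: !fm part      = (|q3-q2| > I) | (|q2-q1| > I) | (|q1-q0| > I)
;   m7: hev part      =  |q1-q0| > H
; The mb_edge term of fm (|q0-p0| and |q1-p1| against E) is added separately
; once both halves are available.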

; one step in filter_14/filter_6
;
; take sum $reg, downshift, apply mask and write into dst
;
; if sub2/add1-2 are present, add/sub as appropriate to prepare for the next
; step's sum $reg. This is omitted for the last row in each filter.
;
; if dont_store is set, don't write the result into memory, instead keep the
; values in register so we can write it out later
%macro FILTER_STEP 6-10 "", "", "", 0 ; tmp, reg, mask, shift, dst, \
                                      ; src/sub1, sub2, add1, add2, dont_store
    psrlw %1, %2, %4
    psubw %1, %6 ; abs->delta
%ifnidn %7, ""
    psubw %2, %6
    psubw %2, %7
    paddw %2, %8
    paddw %2, %9
%endif
    pand %1, reg_%3 ; apply mask
%if %10 == 1
    paddw %6, %1 ; delta->abs
%else
    paddw %1, %6 ; delta->abs
    mova [%5], %1
%endif
%endmacro
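
; As a scalar sketch of one FILTER_STEP invocation (not a literal translation):
;   out = src + (((sum >> shift) - src) & mask)   ; filtered value where the mask
;                                                 ; is set, src elsewhere
;   sum = sum - sub1 - sub2 + add1 + add2         ; slide the running sum to the
;                                                 ; next filter tap
; where sub1 is the same operand as src, and "out" either goes to [dst] or, with
; dont_store, stays accumulated in the src register for a later store.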

; FIXME avx2 versions for 16_16 and mix2_{4,8}{4,8}
%macro LOOP_FILTER 3 ; dir[h/v], wd[4/8/16], bpp[10/12]
%if ARCH_X86_64
%if %2 == 16
%assign %%num_xmm_regs 16
%elif %2 == 8
%assign %%num_xmm_regs 15
%else ; %2 == 4
%assign %%num_xmm_regs 14
%endif ; %2
%assign %%bak_mem 0
%else ; ARCH_X86_32
%assign %%num_xmm_regs 8
%if %2 == 16
%assign %%bak_mem 7
%elif %2 == 8
%assign %%bak_mem 6
%else ; %2 == 4
%assign %%bak_mem 5
%endif ; %2
%endif ; ARCH_X86_64/32
%if %2 == 16
%ifidn %1, v
%assign %%num_gpr_regs 6
%else ; %1 == h
%assign %%num_gpr_regs 5
%endif ; %1
%assign %%wd_mem 6
%else ; %2 == 8/4
%assign %%num_gpr_regs 5
%if ARCH_X86_32 && %2 == 8
%assign %%wd_mem 2
%else ; ARCH_X86_64 || %2 == 4
%assign %%wd_mem 0
%endif ; ARCH_X86_64/32 etc.
%endif ; %2
%ifidn %1, v
%assign %%tsp_mem 0
%elif %2 == 16 ; && %1 == h
%assign %%tsp_mem 16
%else ; %1 == h && %2 == 8/4
%assign %%tsp_mem 8
%endif ; %1/%2
%assign %%off %%wd_mem
%assign %%tspoff %%bak_mem+%%wd_mem
%assign %%stack_mem ((%%bak_mem+%%wd_mem+%%tsp_mem)*mmsize)
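
; Stack layout used below (roughly): rsp+0.. holds pre-filter pixel copies that
; later FILTER_STEP calls subtract back out, rsp+(%%off+n)*mmsize holds the
; masks/thresholds spilled by SCRATCH (only actually stored on x86-32), and the
; h variants keep their transposed pixel rows at rsp+(%%tspoff+n)*mmsize.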

%if %3 == 10
%define %%maxsgn 511
%define %%minsgn m512
%define %%maxusgn 1023
%define %%maxf 4
%else ; %3 == 12
%define %%maxsgn 2047
%define %%minsgn m2048
%define %%maxusgn 4095
%define %%maxf 16
%endif ; %3

cglobal vp9_loop_filter_%1_%2_%3, 5, %%num_gpr_regs, %%num_xmm_regs, %%stack_mem, dst, stride, E, I, H
; prepare E, I and H masks
    shl Ed, %3-8
    shl Id, %3-8
    shl Hd, %3-8
%if cpuflag(ssse3)
    mova m0, [pw_256]
%endif
    movd m1, Ed
    movd m2, Id
    movd m3, Hd
%if cpuflag(ssse3)
    pshufb m1, m0 ; E << (bit_depth - 8)
    pshufb m2, m0 ; I << (bit_depth - 8)
    pshufb m3, m0 ; H << (bit_depth - 8)
%else
    punpcklwd m1, m1
    punpcklwd m2, m2
    punpcklwd m3, m3
    pshufd m1, m1, q0000
    pshufd m2, m2, q0000
    pshufd m3, m3, q0000
%endif
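
; (With SSSE3, pshufb against pw_256, whose words are each 0x0100, copies bytes
; 0-1 of the threshold into every word lane, i.e. broadcasts the low word; the
; SSE2 path gets the same broadcast from punpcklwd + pshufd.)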
    SCRATCH 1, 8, rsp+(%%off+0)*mmsize, E
    SCRATCH 2, 9, rsp+(%%off+1)*mmsize, I
    SCRATCH 3, 10, rsp+(%%off+2)*mmsize, H
%if %2 > 4
    PRELOAD 11, pw_ %+ %%maxf, F
%endif

; set up variables to load data
%ifidn %1, v
    DEFINE_ARGS dst8, stride, stride3, dst0, dst4, dst12
    lea stride3q, [strideq*3]
    neg strideq
%if %2 == 16
    lea dst0q, [dst8q+strideq*8]
%else
    lea dst4q, [dst8q+strideq*4]
%endif
    neg strideq
%if %2 == 16
    lea dst12q, [dst8q+strideq*4]
    lea dst4q, [dst0q+strideq*4]
%endif
%if %2 == 16
%define %%p7 dst0q
%define %%p6 dst0q+strideq
%define %%p5 dst0q+strideq*2
%define %%p4 dst0q+stride3q
%endif
%define %%p3 dst4q
%define %%p2 dst4q+strideq
%define %%p1 dst4q+strideq*2
%define %%p0 dst4q+stride3q
%define %%q0 dst8q
%define %%q1 dst8q+strideq
%define %%q2 dst8q+strideq*2
%define %%q3 dst8q+stride3q
%if %2 == 16
%define %%q4 dst12q
%define %%q5 dst12q+strideq
%define %%q6 dst12q+strideq*2
%define %%q7 dst12q+stride3q
%endif
%else ; %1 == h
    DEFINE_ARGS dst0, stride, stride3, dst4
    lea stride3q, [strideq*3]
    lea dst4q, [dst0q+strideq*4]
%define %%p3 rsp+(%%tspoff+0)*mmsize
%define %%p2 rsp+(%%tspoff+1)*mmsize
%define %%p1 rsp+(%%tspoff+2)*mmsize
%define %%p0 rsp+(%%tspoff+3)*mmsize
%define %%q0 rsp+(%%tspoff+4)*mmsize
%define %%q1 rsp+(%%tspoff+5)*mmsize
%define %%q2 rsp+(%%tspoff+6)*mmsize
%define %%q3 rsp+(%%tspoff+7)*mmsize
%if %2 < 16
    movu m0, [dst0q+strideq*0-8]
    movu m1, [dst0q+strideq*1-8]
    movu m2, [dst0q+strideq*2-8]
    movu m3, [dst0q+stride3q -8]
    movu m4, [dst4q+strideq*0-8]
    movu m5, [dst4q+strideq*1-8]
    movu m6, [dst4q+strideq*2-8]
    movu m7, [dst4q+stride3q -8]
%if ARCH_X86_64
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 12
%else
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%%p0], [%%q0]
%endif
    mova [%%p3], m0
    mova [%%p2], m1
    mova [%%p1], m2
    mova [%%p0], m3
%if ARCH_X86_64
    mova [%%q0], m4
%endif
    mova [%%q1], m5
    mova [%%q2], m6
    mova [%%q3], m7
; FIXME investigate if we can _not_ load q0-3 below if h, and adjust register
; order here accordingly
%else ; %2 == 16
%define %%p7 rsp+(%%tspoff+ 8)*mmsize
%define %%p6 rsp+(%%tspoff+ 9)*mmsize
%define %%p5 rsp+(%%tspoff+10)*mmsize
%define %%p4 rsp+(%%tspoff+11)*mmsize
%define %%q4 rsp+(%%tspoff+12)*mmsize
%define %%q5 rsp+(%%tspoff+13)*mmsize
%define %%q6 rsp+(%%tspoff+14)*mmsize
%define %%q7 rsp+(%%tspoff+15)*mmsize
    mova m0, [dst0q+strideq*0-16]
    mova m1, [dst0q+strideq*1-16]
    mova m2, [dst0q+strideq*2-16]
    mova m3, [dst0q+stride3q -16]
    mova m4, [dst4q+strideq*0-16]
    mova m5, [dst4q+strideq*1-16]
%if ARCH_X86_64
    mova m6, [dst4q+strideq*2-16]
%endif
    mova m7, [dst4q+stride3q -16]
%if ARCH_X86_64
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 12
%else
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [dst4q+strideq*2-16], [%%p3], 1
%endif
    mova [%%p7], m0
    mova [%%p6], m1
    mova [%%p5], m2
    mova [%%p4], m3
%if ARCH_X86_64
    mova [%%p3], m4
%endif
    mova [%%p2], m5
    mova [%%p1], m6
    mova [%%p0], m7
    mova m0, [dst0q+strideq*0]
    mova m1, [dst0q+strideq*1]
    mova m2, [dst0q+strideq*2]
    mova m3, [dst0q+stride3q ]
    mova m4, [dst4q+strideq*0]
    mova m5, [dst4q+strideq*1]
%if ARCH_X86_64
    mova m6, [dst4q+strideq*2]
%endif
    mova m7, [dst4q+stride3q ]
%if ARCH_X86_64
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 12
%else
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [dst4q+strideq*2], [%%q4], 1
%endif
    mova [%%q0], m0
    mova [%%q1], m1
    mova [%%q2], m2
    mova [%%q3], m3
%if ARCH_X86_64
    mova [%%q4], m4
%endif
    mova [%%q5], m5
    mova [%%q6], m6
    mova [%%q7], m7
; FIXME investigate if we can _not_ load q0|q4-7 below if h, and adjust register
; order here accordingly
%endif ; %2
%endif ; %1

; load q0|q4-7 data
    mova m0, [%%q0]
%if %2 == 16
    mova m4, [%%q4]
    mova m5, [%%q5]
    mova m6, [%%q6]
    mova m7, [%%q7]

; flat8out q portion
    FLAT8OUT_HALF
    SCRATCH 7, 15, rsp+(%%off+6)*mmsize, F8O
%endif

; load q1-3 data
    mova m1, [%%q1]
    mova m2, [%%q2]
    mova m3, [%%q3]

; r6-8|pw_4[m8-11]=reg_E/I/H/F
; r9[m15]=!flatout[q]
; m12-14=free
; m0-3=q0-q3
; m4-7=free

; flat8in|fm|hev q portion
    FLAT8IN_HALF %2
    SCRATCH 7, 13, rsp+(%%off+4)*mmsize, HEV
%if %2 > 4
    SCRATCH 4, 14, rsp+(%%off+5)*mmsize, F8I
%endif

; r6-8|pw_4[m8-11]=reg_E/I/H/F
; r9[m15]=!flat8out[q]
; r10[m13]=hev[q]
; r11[m14]=!flat8in[q]
; m2=!fm[q]
; m0,1=q0-q1
; m2-7=free
; m12=free

; load p0-1
    mova m3, [%%p0]
    mova m4, [%%p1]

; fm mb_edge portion
    psubw m5, m3, m0 ; q0-p0
    psubw m6, m4, m1 ; q1-p1
%if ARCH_X86_64
    ABS2 m5, m6, m7, m12 ; abs(q0-p0) | abs(q1-p1)
%else
    ABS1 m5, m7 ; abs(q0-p0)
    ABS1 m6, m7 ; abs(q1-p1)
%endif
    paddw m5, m5
    psraw m6, 1
    paddw m6, m5 ; abs(q0-p0)*2+(abs(q1-p1)>>1)
    pcmpgtw m6, reg_E
    por m2, m6
    SCRATCH 2, 12, rsp+(%%off+3)*mmsize, FM

; r6-8|pw_4[m8-11]=reg_E/I/H/F
; r9[m15]=!flat8out[q]
; r10[m13]=hev[q]
; r11[m14]=!flat8in[q]
; r12[m12]=!fm[q]
; m3-4=q0-1
; m0-2/5-7=free

; load p4-7 data
    SWAP 3, 0 ; p0
    SWAP 4, 1 ; p1
%if %2 == 16
    mova m7, [%%p7]
    mova m6, [%%p6]
    mova m5, [%%p5]
    mova m4, [%%p4]

; flat8out p portion
    FLAT8OUT_HALF
    por m7, reg_F8O
    SCRATCH 7, 15, rsp+(%%off+6)*mmsize, F8O
%endif

; r6-8|pw_4[m8-11]=reg_E/I/H/F
; r9[m15]=!flat8out
; r10[m13]=hev[q]
; r11[m14]=!flat8in[q]
; r12[m12]=!fm[q]
; m0=p0
; m1-7=free

; load p2-3 data
    mova m2, [%%p2]
    mova m3, [%%p3]

; flat8in|fm|hev p portion
    FLAT8IN_HALF %2
    por m7, reg_HEV
%if %2 > 4
    por m4, reg_F8I
%endif
    por m2, reg_FM
%if %2 > 4
    por m4, m2 ; !flat8|!fm
%if %2 == 16
    por m5, m4, reg_F8O ; !flat16|!fm
    pandn m2, m4 ; filter4_mask
    pandn m4, m5 ; filter8_mask
    pxor m5, [pw_m1] ; filter16_mask
    SCRATCH 5, 15, rsp+(%%off+6)*mmsize, F16M
%else
    pandn m2, m4 ; filter4_mask
    pxor m4, [pw_m1] ; filter8_mask
%endif
    SCRATCH 4, 14, rsp+(%%off+5)*mmsize, F8M
%else
    pxor m2, [pw_m1] ; filter4_mask
%endif
    SCRATCH 7, 13, rsp+(%%off+4)*mmsize, HEV
    SCRATCH 2, 12, rsp+(%%off+3)*mmsize, F4M

; r9[m15]=filter16_mask
; r10[m13]=hev
; r11[m14]=filter8_mask
; r12[m12]=filter4_mask
; m0,1=p0-p1
; m2-7=free
; m8-11=free
%if %2 > 4
%if %2 == 16
; filter_14
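; (The running sum below is arranged so that, as the inline comments note, the
; first stored value is p6' = (p7*7 + p6*2 + p5+p4+p3+p2+p1+p0+q0 + 8) >> 4, and
; each following FILTER_STEP slides that window one tap towards q7.)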
    mova m2, [%%p7]
    mova m3, [%%p6]
    mova m6, [%%p5]
    mova m7, [%%p4]
    PRELOAD 8, %%p3, P3
    PRELOAD 9, %%p2, P2
%endif
    PRELOAD 10, %%q0, Q0
    PRELOAD 11, %%q1, Q1
%if %2 == 16
    psllw m4, m2, 3
    paddw m5, m3, m3
    paddw m4, m6
    paddw m5, m7
    paddw m4, reg_P3
    paddw m5, reg_P2
    paddw m4, m1
    paddw m5, m0
    paddw m4, reg_Q0 ; q0+p1+p3+p5+p7*8
    psubw m5, m2 ; p0+p2+p4+p6*2-p7
    paddw m4, [pw_8]
    paddw m5, m4 ; q0+p0+p1+p2+p3+p4+p5+p6*2+p7*7+8

; below, we use r0-5 for storing pre-filter pixels for subsequent subtraction
; at the end of the filter
    mova [rsp+0*mmsize], m3
    FILTER_STEP m4, m5, F16M, 4, %%p6, m3, m2, m6, reg_Q1
%endif
    mova m3, [%%q2]
%if %2 == 16
    mova [rsp+1*mmsize], m6
    FILTER_STEP m4, m5, F16M, 4, %%p5, m6, m2, m7, m3
%endif
    mova m6, [%%q3]
%if %2 == 16
    mova [rsp+2*mmsize], m7
    FILTER_STEP m4, m5, F16M, 4, %%p4, m7, m2, reg_P3, m6
    mova m7, [%%q4]
%if ARCH_X86_64
    mova [rsp+3*mmsize], reg_P3
%else
    mova m4, reg_P3
    mova [rsp+3*mmsize], m4
%endif
    FILTER_STEP m4, m5, F16M, 4, %%p3, reg_P3, m2, reg_P2, m7
    PRELOAD 8, %%q5, Q5
%if ARCH_X86_64
    mova [rsp+4*mmsize], reg_P2
%else
    mova m4, reg_P2
    mova [rsp+4*mmsize], m4
%endif
    FILTER_STEP m4, m5, F16M, 4, %%p2, reg_P2, m2, m1, reg_Q5
    PRELOAD 9, %%q6, Q6
    mova [rsp+5*mmsize], m1
    FILTER_STEP m4, m5, F16M, 4, %%p1, m1, m2, m0, reg_Q6
    mova m1, [%%q7]
    FILTER_STEP m4, m5, F16M, 4, %%p0, m0, m2, reg_Q0, m1, 1
    FILTER_STEP m4, m5, F16M, 4, %%q0, reg_Q0, [rsp+0*mmsize], reg_Q1, m1, ARCH_X86_64
    FILTER_STEP m4, m5, F16M, 4, %%q1, reg_Q1, [rsp+1*mmsize], m3, m1, ARCH_X86_64
    FILTER_STEP m4, m5, F16M, 4, %%q2, m3, [rsp+2*mmsize], m6, m1, 1
    FILTER_STEP m4, m5, F16M, 4, %%q3, m6, [rsp+3*mmsize], m7, m1
    FILTER_STEP m4, m5, F16M, 4, %%q4, m7, [rsp+4*mmsize], reg_Q5, m1
    FILTER_STEP m4, m5, F16M, 4, %%q5, reg_Q5, [rsp+5*mmsize], reg_Q6, m1
    FILTER_STEP m4, m5, F16M, 4, %%q6, reg_Q6
    mova m7, [%%p1]
%else
    SWAP 1, 7
%endif
    mova m2, [%%p3]
    mova m1, [%%p2]

; reg_Q0-1 (m10-m11)
; m0=p0
; m1=p2
; m2=p3
; m3=q2
; m4-5=free
; m6=q3
; m7=p1
; m8-9 unused

; filter_6
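; (Analogously to filter_14 above, the sum below is set up so that the first
; stored value works out to p2' = (p3*3 + p2*2 + p1 + p0 + q0 + 4) >> 3, with
; each FILTER_STEP then sliding the window; a reading of the code, not spec
; text.)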
    psllw m4, m2, 2
    paddw m5, m1, m1
    paddw m4, m7
    psubw m5, m2
    paddw m4, m0
    paddw m5, reg_Q0
    paddw m4, [pw_4]
    paddw m5, m4
%if ARCH_X86_64
    mova m8, m1
    mova m9, m7
%else
    mova [rsp+0*mmsize], m1
    mova [rsp+1*mmsize], m7
%endif
%ifidn %1, v
    FILTER_STEP m4, m5, F8M, 3, %%p2, m1, m2, m7, reg_Q1
%else
    FILTER_STEP m4, m5, F8M, 3, %%p2, m1, m2, m7, reg_Q1, 1
%endif
    FILTER_STEP m4, m5, F8M, 3, %%p1, m7, m2, m0, m3, 1
    FILTER_STEP m4, m5, F8M, 3, %%p0, m0, m2, reg_Q0, m6, 1
%if ARCH_X86_64
    FILTER_STEP m4, m5, F8M, 3, %%q0, reg_Q0, m8, reg_Q1, m6, ARCH_X86_64
    FILTER_STEP m4, m5, F8M, 3, %%q1, reg_Q1, m9, m3, m6, ARCH_X86_64
%else
    FILTER_STEP m4, m5, F8M, 3, %%q0, reg_Q0, [rsp+0*mmsize], reg_Q1, m6, ARCH_X86_64
    FILTER_STEP m4, m5, F8M, 3, %%q1, reg_Q1, [rsp+1*mmsize], m3, m6, ARCH_X86_64
%endif
    FILTER_STEP m4, m5, F8M, 3, %%q2, m3
    UNSCRATCH 2, 10, %%q0
    UNSCRATCH 6, 11, %%q1
%else
    SWAP 1, 7
    mova m2, [%%q0]
    mova m6, [%%q1]
%endif
    UNSCRATCH 3, 13, rsp+(%%off+4)*mmsize, HEV

; m0=p0
; m1=p2
; m2=q0
; m3=hev_mask
; m4-5=free
; m6=q1
; m7=p1

; filter_4
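; Scalar sketch of the filter_4 step implemented below (per lane, mirroring the
; inline comments rather than quoting the reference code):
;   f  = clip(p1 - q1)              only where hev is set
;   f  = clip(f + 3 * (q0 - p0))    only where filter4_mask is set
;   f1 = min(f + 4, %%maxsgn) >> 3
;   f2 = min(f + 3, %%maxsgn) >> 3
;   q0' = clamp(q0 - f1),  p0' = clamp(p0 + f2)
;   where hev is NOT set: p1' = clamp(p1 + (f1+1)>>1), q1' = clamp(q1 - (f1+1)>>1)
; with clip() clamping to [-%%maxsgn-1, %%maxsgn] and clamp() to [0, %%maxusgn].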
    psubw m4, m7, m6 ; p1-q1
    psubw m5, m2, m0 ; q0-p0
    pand m4, m3
    pminsw m4, [pw_ %+ %%maxsgn]
    pmaxsw m4, [pw_ %+ %%minsgn] ; clip_intp2(p1-q1, 9) -> f
    paddw m4, m5
    paddw m5, m5
    paddw m4, m5 ; 3*(q0-p0)+f
    pminsw m4, [pw_ %+ %%maxsgn]
    pmaxsw m4, [pw_ %+ %%minsgn] ; clip_intp2(3*(q0-p0)+f, 9) -> f
    pand m4, reg_F4M
    paddw m5, m4, [pw_4]
    paddw m4, [pw_3]
    pminsw m5, [pw_ %+ %%maxsgn]
    pminsw m4, [pw_ %+ %%maxsgn]
    psraw m5, 3 ; min_intp2(f+4, 9)>>3 -> f1
    psraw m4, 3 ; min_intp2(f+3, 9)>>3 -> f2
    psubw m2, m5 ; q0-f1
    paddw m0, m4 ; p0+f2
    pandn m3, m5 ; f1 & !hev (for p1/q1 adj)
    pxor m4, m4
    mova m5, [pw_ %+ %%maxusgn]
    pmaxsw m2, m4
    pmaxsw m0, m4
    pminsw m2, m5
    pminsw m0, m5
%if cpuflag(ssse3)
    pmulhrsw m3, [pw_16384] ; (f1+1)>>1
%else
    paddw m3, [pw_1]
    psraw m3, 1
%endif
    paddw m7, m3 ; p1+f
    psubw m6, m3 ; q1-f
    pmaxsw m7, m4
    pmaxsw m6, m4
    pminsw m7, m5
    pminsw m6, m5

; store
%ifidn %1, v
    mova [%%p1], m7
    mova [%%p0], m0
    mova [%%q0], m2
    mova [%%q1], m6
%else ; %1 == h
%if %2 == 4
    TRANSPOSE4x4W 7, 0, 2, 6, 1
    movh [dst0q+strideq*0-4], m7
    movhps [dst0q+strideq*1-4], m7
    movh [dst0q+strideq*2-4], m0
    movhps [dst0q+stride3q -4], m0
    movh [dst4q+strideq*0-4], m2
    movhps [dst4q+strideq*1-4], m2
    movh [dst4q+strideq*2-4], m6
    movhps [dst4q+stride3q -4], m6
%elif %2 == 8
    mova m3, [%%p3]
    mova m4, [%%q2]
    mova m5, [%%q3]
%if ARCH_X86_64
    TRANSPOSE8x8W 3, 1, 7, 0, 2, 6, 4, 5, 8
%else
    TRANSPOSE8x8W 3, 1, 7, 0, 2, 6, 4, 5, [%%q2], [%%q0], 1
    mova m2, [%%q0]
%endif
    movu [dst0q+strideq*0-8], m3
    movu [dst0q+strideq*1-8], m1
    movu [dst0q+strideq*2-8], m7
    movu [dst0q+stride3q -8], m0
    movu [dst4q+strideq*0-8], m2
    movu [dst4q+strideq*1-8], m6
    movu [dst4q+strideq*2-8], m4
    movu [dst4q+stride3q -8], m5
%else ; %2 == 16
    SCRATCH 2, 8, %%q0
    SCRATCH 6, 9, %%q1
    mova m2, [%%p7]
    mova m3, [%%p6]
    mova m4, [%%p5]
    mova m5, [%%p4]
    mova m6, [%%p3]
%if ARCH_X86_64
    TRANSPOSE8x8W 2, 3, 4, 5, 6, 1, 7, 0, 10
%else
    mova [%%p1], m7
    TRANSPOSE8x8W 2, 3, 4, 5, 6, 1, 7, 0, [%%p1], [dst4q+strideq*0-16], 1
%endif
    mova [dst0q+strideq*0-16], m2
    mova [dst0q+strideq*1-16], m3
    mova [dst0q+strideq*2-16], m4
    mova [dst0q+stride3q -16], m5
%if ARCH_X86_64
    mova [dst4q+strideq*0-16], m6
%endif
    mova [dst4q+strideq*1-16], m1
    mova [dst4q+strideq*2-16], m7
    mova [dst4q+stride3q -16], m0
    UNSCRATCH 2, 8, %%q0
    UNSCRATCH 6, 9, %%q1
    mova m0, [%%q2]
    mova m1, [%%q3]
    mova m3, [%%q4]
    mova m4, [%%q5]
%if ARCH_X86_64
    mova m5, [%%q6]
%endif
    mova m7, [%%q7]
%if ARCH_X86_64
    TRANSPOSE8x8W 2, 6, 0, 1, 3, 4, 5, 7, 8
%else
    TRANSPOSE8x8W 2, 6, 0, 1, 3, 4, 5, 7, [%%q6], [dst4q+strideq*0], 1
%endif
    mova [dst0q+strideq*0], m2
    mova [dst0q+strideq*1], m6
    mova [dst0q+strideq*2], m0
    mova [dst0q+stride3q ], m1
%if ARCH_X86_64
    mova [dst4q+strideq*0], m3
%endif
    mova [dst4q+strideq*1], m4
    mova [dst4q+strideq*2], m5
    mova [dst4q+stride3q ], m7
%endif ; %2
%endif ; %1
    RET
%endmacro

%macro LOOP_FILTER_CPUSETS 3
INIT_XMM sse2
LOOP_FILTER %1, %2, %3
INIT_XMM ssse3
LOOP_FILTER %1, %2, %3
INIT_XMM avx
LOOP_FILTER %1, %2, %3
%endmacro

%macro LOOP_FILTER_WDSETS 2
LOOP_FILTER_CPUSETS %1, 4, %2
LOOP_FILTER_CPUSETS %1, 8, %2
LOOP_FILTER_CPUSETS %1, 16, %2
%endmacro
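
; Each LOOP_FILTER_WDSETS invocation below expands SSE2, SSSE3 and AVX variants
; of the 4-, 8- and 16-pixel-wide filter for one direction and bit depth, so the
; four lines instantiate 2 dirs x 3 widths x 3 instruction sets x 2 bit depths =
; 36 loop-filter functions in total.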
LOOP_FILTER_WDSETS h, 10
LOOP_FILTER_WDSETS v, 10
LOOP_FILTER_WDSETS h, 12
LOOP_FILTER_WDSETS v, 12