/*
 * Copyright (c) 2016 Google Inc.
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"
#include "neon.S"

@ Do an 8x8 transpose, using q registers for the subtransposes that don't
@ need to address the individual d registers.
@ r0,r1 == rq0, r2,r3 == rq1, etc
.macro transpose_q_8x8 rq0, rq1, rq2, rq3, r0, r1, r2, r3, r4, r5, r6, r7
        vtrn.32         \rq0, \rq2
        vtrn.32         \rq1, \rq3
        vtrn.16         \rq0, \rq1
        vtrn.16         \rq2, \rq3
        vtrn.8          \r0, \r1
        vtrn.8          \r2, \r3
        vtrn.8          \r4, \r5
        vtrn.8          \r6, \r7
.endm
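
@ As a rough sketch, the combined vtrn.32/vtrn.16/vtrn.8 passes above behave
@ like a plain 8x8 byte transpose of a block b:
@     for (i = 0; i < 8; i++)
@         for (j = 0; j < i; j++)
@             swap(b[i][j], b[j][i]);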

@ Do a 4x4 transpose, using q registers for the subtransposes that don't
@ need to address the individual d registers.
@ r0,r1 == rq0, r2,r3 == rq1
.macro transpose_q_4x4 rq0, rq1, r0, r1, r2, r3
        vtrn.16         \rq0, \rq1
        vtrn.8          \r0, \r1
        vtrn.8          \r2, \r3
.endm

@ The input to and output from this macro is in the registers q8-q15,
@ and q0-q7 are used as scratch registers.
@ p3 = q8, p0 = q11, q0 = q12, q3 = q15
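@ Roughly, the filter-enable mask (fm) computed at the start of the macro
@ corresponds, per pixel, to:
@     fm = max(|p3-p2|, |p2-p1|, |p1-p0|, |q0-q1|, |q1-q2|, |q2-q3|) <= I &&
@          |p0-q0| * 2 + (|p1-q1| >> 1) <= E
@ with the E and I thresholds packed one byte per 8-pixel half.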
.macro loop_filter_q
        vdup.u8         d0, r2 @ E
        lsr             r2, r2, #8
        vdup.u8         d2, r3 @ I
        lsr             r3, r3, #8
        vdup.u8         d1, r2 @ E
        vdup.u8         d3, r3 @ I
        vabd.u8         q2, q8, q9 @ abs(p3 - p2)
        vabd.u8         q3, q9, q10 @ abs(p2 - p1)
        vabd.u8         q4, q10, q11 @ abs(p1 - p0)
        vabd.u8         q5, q12, q13 @ abs(q0 - q1)
        vabd.u8         q6, q13, q14 @ abs(q1 - q2)
        vabd.u8         q7, q14, q15 @ abs(q2 - q3)
        vmax.u8         q2, q2, q3
        vmax.u8         q3, q4, q5
        vmax.u8         q4, q6, q7
        vabd.u8         q5, q11, q12 @ abs(p0 - q0)
        vmax.u8         q2, q2, q3
        vqadd.u8        q5, q5, q5 @ abs(p0 - q0) * 2
        vabd.u8         q7, q10, q13 @ abs(p1 - q1)
        vmax.u8         q2, q2, q4 @ max(abs(p3 - p2), ..., abs(q2 - q3))
        vshr.u8         q7, q7, #1
        vcle.u8         q2, q2, q1 @ max(abs()) <= I
        vqadd.u8        q5, q5, q7 @ abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
        vcle.u8         q5, q5, q0
        vand            q2, q2, q5 @ fm
        vshrn.u16       d10, q2, #4
        vmov            r2, r3, d10
        orrs            r2, r2, r3
        @ If no pixels need filtering, just exit as soon as possible
        beq             9f
        @ Calculate the normal inner loop filter for 2 or 4 pixels
        ldr             r3, [sp, #64]
        vabd.u8         q3, q10, q11 @ abs(p1 - p0)
        vabd.u8         q4, q13, q12 @ abs(q1 - q0)
        vsubl.u8        q5, d20, d26 @ p1 - q1
        vsubl.u8        q6, d21, d27 @ p1 - q1
        vmax.u8         q3, q3, q4 @ max(abs(p1 - p0), abs(q1 - q0))
        vqmovn.s16      d10, q5 @ av_clip_int8p(p1 - q1)
        vqmovn.s16      d11, q6 @ av_clip_int8p(p1 - q1)
        vdup.u8         d8, r3 @ H
        lsr             r3, r3, #8
        vdup.u8         d9, r3 @ H
        vsubl.u8        q6, d24, d22 @ q0 - p0
        vsubl.u8        q7, d25, d23 @ q0 - p0
        vcle.u8         q3, q3, q4 @ hev
        vmov.s16        q0, #3
        vand            q3, q3, q2 @ !hev && fm && !flat8in
        vmul.s16        q6, q6, q0 @ 3 * (q0 - p0)
        vmul.s16        q7, q7, q0 @ 3 * (q0 - p0)
        vbic            q5, q5, q3 @ if (!hev) av_clip_int8 = 0
        vaddw.s8        q6, q6, d10 @ 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
        vaddw.s8        q7, q7, d11 @ 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
        vmov.s8         q5, #4
        vqmovn.s16      d12, q6
        vqmovn.s16      d13, q7 @ av_clip_int8(3 * (q0 - p0) [+ av_clip_int8(p1 - q1)], BIT_DEPTH - 1) = f
        vmov.s8         q0, #3
        vqadd.s8        q5, q6, q5 @ FFMIN(f + 4, 127)
        vqadd.s8        q0, q6, q0 @ FFMIN(f + 3, 127)
        vmovl.u8        q6, d22 @ p0
        vmovl.u8        q7, d23 @ p0
        vshr.s8         q5, q5, #3 @ f1
        vshr.s8         q0, q0, #3 @ f2
        vaddw.s8        q6, q6, d0 @ p0 + f2
        vaddw.s8        q7, q7, d1 @ p0 + f2
        vqmovun.s16     d0, q6 @ out p0
        vmovl.u8        q6, d24 @ q0
        vqmovun.s16     d1, q7 @ out p0
        vmovl.u8        q7, d25 @ q0
        vsubw.s8        q6, q6, d10 @ q0 - f1
        vsubw.s8        q7, q7, d11 @ q0 - f1
        vqmovun.s16     d12, q6 @ out q0
        vqmovun.s16     d13, q7 @ out q0
        vrshr.s8        q5, q5, #1 @ f = (f1 + 1) >> 1
        vbit            q11, q0, q2 @ if (fm && !flat8in)
        vbit            q12, q6, q2
        vmovl.u8        q0, d20 @ p1
        vmovl.u8        q2, d21 @ p1
        vmovl.u8        q6, d26 @ q1
        vmovl.u8        q7, d27 @ q1
        vaddw.s8        q0, q0, d10 @ p1 + f
        vaddw.s8        q2, q2, d11 @ p1 + f
        vsubw.s8        q6, q6, d10 @ q1 - f
        vsubw.s8        q7, q7, d11 @ q1 - f
        vqmovun.s16     d0, q0 @ out p1
        vqmovun.s16     d1, q2 @ out p1
        vqmovun.s16     d12, q6 @ out q1
        vqmovun.s16     d13, q7 @ out q1
        vbit            q10, q0, q3 @ if (!hev && fm && !flat8in)
        vbit            q13, q6, q3
.endm

@ The input to and output from this macro is in the registers d16-d31,
@ and d0-d7 are used as scratch registers.
@ p7 = d16 .. p3 = d20, p0 = d23, q0 = d24, q3 = d27, q7 = d31
@ Depending on the width of the loop filter, we either use d16-d19
@ and d28-d31 as temp registers, or d8-d15.
@ tmp1,tmp2 = tmpq1, tmp3,tmp4 = tmpq2, tmp5,tmp6 = tmpq3, tmp7,tmp8 = tmpq4
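@ As a loose C-style sketch, the inner filter applied in this macro is roughly:
@     f  = av_clip_int8(3 * (q0 - p0) + (hev ? av_clip_int8(p1 - q1) : 0));
@     f1 = FFMIN(f + 4, 127) >> 3;
@     f2 = FFMIN(f + 3, 127) >> 3;
@     p0 = av_clip_uint8(p0 + f2);
@     q0 = av_clip_uint8(q0 - f1);
@     if (!hev) {
@         f  = (f1 + 1) >> 1;
@         p1 = av_clip_uint8(p1 + f);
@         q1 = av_clip_uint8(q1 - f);
@     }
@ with all stores additionally masked by fm (and by !flat8in where applicable).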
.macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmpq1, tmpq2, tmpq3, tmpq4
        vdup.u8         d0, r2 @ E
        vdup.u8         d2, r3 @ I
        ldr             r3, [sp]
        vabd.u8         d4, d20, d21 @ abs(p3 - p2)
        vabd.u8         d5, d21, d22 @ abs(p2 - p1)
        vabd.u8         d6, d22, d23 @ abs(p1 - p0)
        vabd.u8         d7, d24, d25 @ abs(q0 - q1)
        vabd.u8         \tmp1, d25, d26 @ abs(q1 - q2)
        vabd.u8         \tmp2, d26, d27 @ abs(q2 - q3)
        vmax.u8         d4, d4, d5
        vmax.u8         d5, d6, d7
        vmax.u8         \tmp1, \tmp1, \tmp2
        vabd.u8         d6, d23, d24 @ abs(p0 - q0)
        vmax.u8         d4, d4, d5
        vqadd.u8        d6, d6, d6 @ abs(p0 - q0) * 2
        vabd.u8         d5, d22, d25 @ abs(p1 - q1)
        vmax.u8         d4, d4, \tmp1 @ max(abs(p3 - p2), ..., abs(q2 - q3))
        vshr.u8         d5, d5, #1
        vcle.u8         d4, d4, d2 @ max(abs()) <= I
        vqadd.u8        d6, d6, d5 @ abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
        vcle.u8         d5, d6, d0
        vand            d4, d4, d5 @ fm
        vdup.u8         d3, r3 @ H
        vmov            r2, r3, d4
        orrs            r2, r2, r3
        @ If no pixels need filtering, just exit as soon as possible
        beq             9f
.if \wd >= 8
        vmov.u8         d0, #1
        vabd.u8         d6, d20, d23 @ abs(p3 - p0)
        vabd.u8         d2, d21, d23 @ abs(p2 - p0)
        vabd.u8         d1, d22, d23 @ abs(p1 - p0)
        vabd.u8         \tmp1, d25, d24 @ abs(q1 - q0)
        vabd.u8         \tmp2, d26, d24 @ abs(q2 - q0)
        vabd.u8         \tmp3, d27, d24 @ abs(q3 - q0)
        vmax.u8         d6, d6, d2
        vmax.u8         d1, d1, \tmp1
        vmax.u8         \tmp2, \tmp2, \tmp3
.if \wd == 16
        vabd.u8         d7, d16, d23 @ abs(p7 - p0)
        vmax.u8         d6, d6, d1
        vabd.u8         d2, d17, d23 @ abs(p6 - p0)
        vmax.u8         d6, d6, \tmp2
        vabd.u8         d1, d18, d23 @ abs(p5 - p0)
        vcle.u8         d6, d6, d0 @ flat8in
        vabd.u8         d8, d19, d23 @ abs(p4 - p0)
        vand            d6, d6, d4 @ flat8in && fm
        vabd.u8         d9, d28, d24 @ abs(q4 - q0)
        vbic            d4, d4, d6 @ fm && !flat8in
        vabd.u8         d10, d29, d24 @ abs(q5 - q0)
        vabd.u8         d11, d30, d24 @ abs(q6 - q0)
        vabd.u8         d12, d31, d24 @ abs(q7 - q0)
        vmax.u8         d7, d7, d2
        vmax.u8         d1, d1, d8
        vmax.u8         d9, d9, d10
        vmax.u8         d11, d11, d12
        @ The rest of the calculation of flat8out is interleaved below
.else
        @ The rest of the calculation of flat8in is interleaved below
.endif
.endif
        @ Calculate the normal inner loop filter for 2 or 4 pixels
        vabd.u8         d5, d22, d23 @ abs(p1 - p0)
.if \wd == 16
        vmax.u8         d7, d7, d1
        vmax.u8         d9, d9, d11
.elseif \wd == 8
        vmax.u8         d6, d6, d1
.endif
        vabd.u8         d1, d25, d24 @ abs(q1 - q0)
.if \wd == 16
        vmax.u8         d7, d7, d9
.elseif \wd == 8
        vmax.u8         d6, d6, \tmp2
.endif
        vsubl.u8        \tmpq1, d22, d25 @ p1 - q1
        vmax.u8         d5, d5, d1 @ max(abs(p1 - p0), abs(q1 - q0))
        vsubl.u8        \tmpq2, d24, d23 @ q0 - p0
        vmov.s16        \tmpq3, #3
.if \wd == 8
        vcle.u8         d6, d6, d0 @ flat8in
.endif
        vcle.u8         d5, d5, d3 @ !hev
.if \wd == 8
        vand            d6, d6, d4 @ flat8in && fm
.endif
        vqmovn.s16      \tmp1, \tmpq1 @ av_clip_int8(p1 - q1)
.if \wd == 16
        vcle.u8         d7, d7, d0 @ flat8out
.elseif \wd == 8
        vbic            d4, d4, d6 @ fm && !flat8in
.endif
        vand            d5, d5, d4 @ !hev && fm && !flat8in
.if \wd == 16
        vand            d7, d7, d6 @ flat8out && flat8in && fm
.endif
        vmul.s16        \tmpq2, \tmpq2, \tmpq3 @ 3 * (q0 - p0)
        vbic            \tmp1, \tmp1, d5 @ if (!hev) av_clip_int8 = 0
        vmov.s8         d2, #4
        vaddw.s8        \tmpq2, \tmpq2, \tmp1 @ 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
        vmov.s8         d3, #3
        vqmovn.s16      \tmp1, \tmpq2 @ f
.if \wd == 16
        vbic            d6, d6, d7 @ fm && flat8in && !flat8out
.endif
        vqadd.s8        \tmp3, \tmp1, d2 @ FFMIN(f + 4, 127)
        vqadd.s8        \tmp4, \tmp1, d3 @ FFMIN(f + 3, 127)
        vmovl.u8        q0, d23 @ p0
        vshr.s8         \tmp3, \tmp3, #3 @ f1
        vshr.s8         \tmp4, \tmp4, #3 @ f2
        vmovl.u8        q1, d24 @ q0
        vaddw.s8        q0, q0, \tmp4 @ p0 + f2
        vsubw.s8        q1, q1, \tmp3 @ q0 - f1
        vqmovun.s16     d0, q0 @ out p0
        vqmovun.s16     d1, q1 @ out q0
        vrshr.s8        \tmp3, \tmp3, #1 @ f = (f1 + 1) >> 1
        vbit            d23, d0, d4 @ if (fm && !flat8in)
        vbit            d24, d1, d4
        vmovl.u8        q0, d22 @ p1
        vmovl.u8        q1, d25 @ q1
.if \wd >= 8
        vmov            r2, r3, d6
.endif
        vaddw.s8        q0, q0, \tmp3 @ p1 + f
        vsubw.s8        q1, q1, \tmp3 @ q1 - f
.if \wd >= 8
        orrs            r2, r2, r3
.endif
        vqmovun.s16     d0, q0 @ out p1
        vqmovun.s16     d2, q1 @ out q1
        vbit            d22, d0, d5 @ if (!hev && fm && !flat8in)
        vbit            d25, d2, d5
.if \wd >= 8
        @ If no pixels need flat8in, jump to flat8out
        @ (or to a writeout of the inner 4 pixels, for wd=8)
        beq             6f
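        @ A rough sketch of the flat8in smoothing below: each output is a
        @ rounded 3-bit-shift average over a sliding window, starting from
        @     out p2 = (3*p3 + 2*p2 + p1 + p0 + q0 + 4) >> 3
        @ and then, for each following output down to out q2, subtracting the
        @ oldest taps and adding the next ones.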
        @ flat8in
        vaddl.u8        \tmpq1, d20, d21
        vaddl.u8        \tmpq2, d22, d25
        vaddl.u8        \tmpq3, d20, d22
        vaddl.u8        \tmpq4, d23, d26
        vadd.u16        q0, \tmpq1, \tmpq1
        vaddw.u8        q0, q0, d23
        vaddw.u8        q0, q0, d24
        vadd.u16        q0, q0, \tmpq3
        vsub.s16        \tmpq2, \tmpq2, \tmpq1
        vsub.s16        \tmpq4, \tmpq4, \tmpq3
        vrshrn.u16      d2, q0, #3 @ out p2
        vadd.u16        q0, q0, \tmpq2
        vaddl.u8        \tmpq1, d20, d23
        vaddl.u8        \tmpq2, d24, d27
        vrshrn.u16      d3, q0, #3 @ out p1
        vadd.u16        q0, q0, \tmpq4
        vsub.s16        \tmpq2, \tmpq2, \tmpq1
        vaddl.u8        \tmpq3, d21, d24
        vaddl.u8        \tmpq4, d25, d27
        vrshrn.u16      d4, q0, #3 @ out p0
        vadd.u16        q0, q0, \tmpq2
        vsub.s16        \tmpq4, \tmpq4, \tmpq3
        vaddl.u8        \tmpq1, d22, d25
        vaddl.u8        \tmpq2, d26, d27
        vrshrn.u16      d5, q0, #3 @ out q0
        vadd.u16        q0, q0, \tmpq4
        vsub.s16        \tmpq2, \tmpq2, \tmpq1
        vrshrn.u16      \tmp5, q0, #3 @ out q1
        vadd.u16        q0, q0, \tmpq2
        @ The output here is written back into the input registers. This doesn't
        @ matter for the flat8out part below, since we only update those pixels
        @ which won't be touched below.
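        @ (Here d21-d26 end up holding out p2, p1, p0, q0, q1 and q2 respectively.)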
        vbit            d21, d2, d6
        vbit            d22, d3, d6
        vbit            d23, d4, d6
        vrshrn.u16      \tmp6, q0, #3 @ out q2
        vbit            d24, d5, d6
        vbit            d25, \tmp5, d6
        vbit            d26, \tmp6, d6
.endif
.if \wd == 16
6:
        vorr            d2, d6, d7
        vmov            r2, r3, d2
        orrs            r2, r2, r3
        @ If no pixels needed flat8in nor flat8out, jump to a
        @ writeout of the inner 4 pixels
        beq             7f
        vmov            r2, r3, d7
        orrs            r2, r2, r3
        @ If no pixels need flat8out, jump to a writeout of the inner 6 pixels
        beq             8f
        @ flat8out
        @ This writes all outputs into d2-d17 (skipping d7 and d16).
        @ If this part is skipped, the output is read from d21-d26 (which is the input
        @ to this section).
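        @ A rough sketch of the flat8out smoothing below: a rounded 4-bit-shift
        @ average over a sliding window, starting from
        @     out p6 = (7*p7 + 2*p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4
        @ and sliding the window one pixel at a time down to out q6.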
        vshll.u8        q0, d16, #3 @ 8 * d16
        vsubw.u8        q0, q0, d16 @ 7 * d16
        vaddw.u8        q0, q0, d17
        vaddl.u8        q4, d17, d18
        vaddl.u8        q5, d19, d20
        vadd.s16        q0, q0, q4
        vaddl.u8        q4, d16, d17
        vaddl.u8        q6, d21, d22
        vadd.s16        q0, q0, q5
        vaddl.u8        q5, d18, d25
        vaddl.u8        q7, d23, d24
        vsub.s16        q5, q5, q4
        vadd.s16        q0, q0, q6
        vadd.s16        q0, q0, q7
        vaddl.u8        q6, d16, d18
        vaddl.u8        q7, d19, d26
        vrshrn.u16      d2, q0, #4
        vadd.s16        q0, q0, q5
        vaddl.u8        q4, d16, d19
        vaddl.u8        q5, d20, d27
        vsub.s16        q7, q7, q6
        vbif            d2, d17, d7
        vrshrn.u16      d3, q0, #4
        vadd.s16        q0, q0, q7
        vaddl.u8        q6, d16, d20
        vaddl.u8        q7, d21, d28
        vsub.s16        q5, q5, q4
        vbif            d3, d18, d7
        vrshrn.u16      d4, q0, #4
        vadd.s16        q0, q0, q5
        vaddl.u8        q4, d16, d21
        vaddl.u8        q5, d22, d29
        vsub.s16        q7, q7, q6
        vbif            d4, d19, d7
        vrshrn.u16      d5, q0, #4
        vadd.s16        q0, q0, q7
        vaddl.u8        q6, d16, d22
        vaddl.u8        q7, d23, d30
        vsub.s16        q5, q5, q4
        vbif            d5, d20, d7
        vrshrn.u16      d6, q0, #4
        vadd.s16        q0, q0, q5
        vaddl.u8        q5, d16, d23
        vsub.s16        q7, q7, q6
        vaddl.u8        q6, d24, d31
        vbif            d6, d21, d7
        vrshrn.u16      d8, q0, #4
        vadd.s16        q0, q0, q7
        vsub.s16        q5, q6, q5
        vaddl.u8        q6, d17, d24
        vaddl.u8        q7, d25, d31
        vbif            d8, d22, d7
        vrshrn.u16      d9, q0, #4
        vadd.s16        q0, q0, q5
        vsub.s16        q7, q7, q6
        vaddl.u8        q6, d26, d31
        vbif            d9, d23, d7
        vrshrn.u16      d10, q0, #4
        vadd.s16        q0, q0, q7
        vaddl.u8        q7, d18, d25
        vaddl.u8        q9, d19, d26
        vsub.s16        q6, q6, q7
        vaddl.u8        q7, d27, d31
        vbif            d10, d24, d7
        vrshrn.u16      d11, q0, #4
        vadd.s16        q0, q0, q6
        vaddl.u8        q6, d20, d27
        vsub.s16        q7, q7, q9
        vaddl.u8        q9, d28, d31
        vbif            d11, d25, d7
        vsub.s16        q9, q9, q6
        vrshrn.u16      d12, q0, #4
        vadd.s16        q0, q0, q7
        vaddl.u8        q7, d21, d28
        vaddl.u8        q10, d29, d31
        vbif            d12, d26, d7
        vrshrn.u16      d13, q0, #4
        vadd.s16        q0, q0, q9
        vsub.s16        q10, q10, q7
        vaddl.u8        q9, d22, d29
        vaddl.u8        q11, d30, d31
        vbif            d13, d27, d7
        vrshrn.u16      d14, q0, #4
        vadd.s16        q0, q0, q10
        vsub.s16        q11, q11, q9
        vbif            d14, d28, d7
        vrshrn.u16      d15, q0, #4
        vadd.s16        q0, q0, q11
        vbif            d15, d29, d7
        vrshrn.u16      d17, q0, #4
        vbif            d17, d30, d7
.endif
.endm

@ For wd <= 8, we use d16-d19 and d28-d31 for temp registers,
@ while we need those for inputs/outputs in wd=16 and use d8-d15
@ for temp registers there instead.
.macro loop_filter_4
        loop_filter     4, d16, d17, d18, d19, d28, d29, d30, d31, q8, q9, q14, q15
.endm

.macro loop_filter_8
        loop_filter     8, d16, d17, d18, d19, d28, d29, d30, d31, q8, q9, q14, q15
.endm

.macro loop_filter_16
        loop_filter     16, d8, d9, d10, d11, d12, d13, d14, d15, q4, q5, q6, q7
.endm

@ The public functions in this file have got the following signature:
@ void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr);
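@ Within the filter macros, mb_lim is used as E, lim as I and hev_thr as H.
@ For the functions built on loop_filter_q (the *_44_16 variants), each of
@ these arguments packs the thresholds for the two 8-pixel halves into its
@ low and high byte.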

function ff_vp9_loop_filter_v_4_8_neon, export=1
        sub             r12, r0, r1, lsl #2
        vld1.8          {d20}, [r12,:64], r1 @ p3
        vld1.8          {d24}, [r0, :64], r1 @ q0
        vld1.8          {d21}, [r12,:64], r1 @ p2
        vld1.8          {d25}, [r0, :64], r1 @ q1
        vld1.8          {d22}, [r12,:64], r1 @ p1
        vld1.8          {d26}, [r0, :64], r1 @ q2
        vld1.8          {d23}, [r12,:64], r1 @ p0
        vld1.8          {d27}, [r0, :64], r1 @ q3
        sub             r0, r0, r1, lsl #2
        sub             r12, r12, r1, lsl #1

        loop_filter_4

        vst1.8          {d22}, [r12,:64], r1
        vst1.8          {d24}, [r0, :64], r1
        vst1.8          {d23}, [r12,:64], r1
        vst1.8          {d25}, [r0, :64], r1
9:
        bx              lr
endfunc

function ff_vp9_loop_filter_h_4_8_neon, export=1
        sub             r12, r0, #4
        add             r0, r12, r1, lsl #2
        vld1.8          {d20}, [r12], r1
        vld1.8          {d24}, [r0], r1
        vld1.8          {d21}, [r12], r1
        vld1.8          {d25}, [r0], r1
        vld1.8          {d22}, [r12], r1
        vld1.8          {d26}, [r0], r1
        vld1.8          {d23}, [r12], r1
        vld1.8          {d27}, [r0], r1

        sub             r12, r12, r1, lsl #2
        sub             r0, r0, r1, lsl #2
        @ Move r0/r12 forward by 2 pixels; we don't need to rewrite the
        @ outermost 2 pixels since they aren't changed.
        add             r12, r12, #2
        add             r0, r0, #2

        @ Transpose the 8x8 pixels, taking advantage of q registers, to get
        @ one register per column.
        transpose_q_8x8 q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27

        loop_filter_4

        @ We will only write the mid 4 pixels back; after the loop filter,
        @ these are in d22, d23, d24, d25 (q11, q12), ordered as rows
        @ (8x4 pixels). We need to transpose them to columns, done with a
        @ 4x4 transpose (which in practice is two 4x4 transposes of the two
        @ 4x4 halves of the 8x4 pixels, giving 4x8 pixels).
        transpose_q_4x4 q11, q12, d22, d23, d24, d25

        vst1.32         {d22[0]}, [r12], r1
        vst1.32         {d22[1]}, [r0], r1
        vst1.32         {d23[0]}, [r12], r1
        vst1.32         {d23[1]}, [r0], r1
        vst1.32         {d24[0]}, [r12], r1
        vst1.32         {d24[1]}, [r0], r1
        vst1.32         {d25[0]}, [r12], r1
        vst1.32         {d25[1]}, [r0], r1
9:
        bx              lr
endfunc

function ff_vp9_loop_filter_v_44_16_neon, export=1
        vpush           {q4-q7}
        sub             r12, r0, r1, lsl #2
        vld1.8          {q8},  [r12,:128], r1 @ p3
        vld1.8          {q12}, [r0, :128], r1 @ q0
        vld1.8          {q9},  [r12,:128], r1 @ p2
        vld1.8          {q13}, [r0, :128], r1 @ q1
        vld1.8          {q10}, [r12,:128], r1 @ p1
        vld1.8          {q14}, [r0, :128], r1 @ q2
        vld1.8          {q11}, [r12,:128], r1 @ p0
        vld1.8          {q15}, [r0, :128], r1 @ q3
        sub             r0, r0, r1, lsl #2
        sub             r12, r12, r1, lsl #1

        loop_filter_q

        vst1.8          {q10}, [r12,:128], r1
        vst1.8          {q12}, [r0, :128], r1
        vst1.8          {q11}, [r12,:128], r1
        vst1.8          {q13}, [r0, :128], r1
9:
        vpop            {q4-q7}
        bx              lr
endfunc

function ff_vp9_loop_filter_h_44_16_neon, export=1
        vpush           {q4-q7}
        sub             r12, r0, #4
        add             r0, r12, r1, lsl #2
        vld1.8          {d16}, [r12], r1
        vld1.8          {d24}, [r0], r1
        vld1.8          {d18}, [r12], r1
        vld1.8          {d26}, [r0], r1
        vld1.8          {d20}, [r12], r1
        vld1.8          {d28}, [r0], r1
        vld1.8          {d22}, [r12], r1
        vld1.8          {d30}, [r0], r1
        mov             r12, r0
        add             r0, r0, r1, lsl #2
        vld1.8          {d17}, [r12], r1
        vld1.8          {d25}, [r0], r1
        vld1.8          {d19}, [r12], r1
        vld1.8          {d27}, [r0], r1
        vld1.8          {d21}, [r12], r1
        vld1.8          {d29}, [r0], r1
        vld1.8          {d23}, [r12], r1
        vld1.8          {d31}, [r0], r1

        @ Transpose the 16x8 pixels, as two 8x8 parts
        transpose_8x8   q8, q9, q10, q11, q12, q13, q14, q15

        loop_filter_q

        sub             r12, r0, r1, lsl #4
        add             r0, r12, r1, lsl #3
        @ Move r0/r12 forward by 2 pixels; we don't need to rewrite the
        @ outermost 2 pixels since they aren't changed.
        add             r12, r12, #2
        add             r0, r0, #2

        @ We will only write the mid 4 pixels back; after the loop filter,
        @ these are in q10, q11, q12, q13, ordered as rows (16x4 pixels).
        @ We need to transpose them to columns, done with a 4x4 transpose
        @ (which in practice is four 4x4 transposes of the 4x4 blocks of
        @ the 16x4 pixels, giving 4x16 pixels).
        transpose_4x4   q10, q11, q12, q13

        vst1.32         {d20[0]}, [r12], r1
        vst1.32         {d21[0]}, [r0], r1
        vst1.32         {d22[0]}, [r12], r1
        vst1.32         {d23[0]}, [r0], r1
        vst1.32         {d24[0]}, [r12], r1
        vst1.32         {d25[0]}, [r0], r1
        vst1.32         {d26[0]}, [r12], r1
        vst1.32         {d27[0]}, [r0], r1
        vst1.32         {d20[1]}, [r12], r1
        vst1.32         {d21[1]}, [r0], r1
        vst1.32         {d22[1]}, [r12], r1
        vst1.32         {d23[1]}, [r0], r1
        vst1.32         {d24[1]}, [r12], r1
        vst1.32         {d25[1]}, [r0], r1
        vst1.32         {d26[1]}, [r12], r1
        vst1.32         {d27[1]}, [r0], r1
9:
        vpop            {q4-q7}
        bx              lr
endfunc

function ff_vp9_loop_filter_v_8_8_neon, export=1
        sub             r12, r0, r1, lsl #2
        vld1.8          {d20}, [r12,:64], r1 @ p3
        vld1.8          {d24}, [r0, :64], r1 @ q0
        vld1.8          {d21}, [r12,:64], r1 @ p2
        vld1.8          {d25}, [r0, :64], r1 @ q1
        vld1.8          {d22}, [r12,:64], r1 @ p1
        vld1.8          {d26}, [r0, :64], r1 @ q2
        vld1.8          {d23}, [r12,:64], r1 @ p0
        vld1.8          {d27}, [r0, :64], r1 @ q3
        sub             r12, r12, r1, lsl #2
        sub             r0, r0, r1, lsl #2
        add             r12, r12, r1

        loop_filter_8

        vst1.8          {d21}, [r12,:64], r1
        vst1.8          {d24}, [r0, :64], r1
        vst1.8          {d22}, [r12,:64], r1
        vst1.8          {d25}, [r0, :64], r1
        vst1.8          {d23}, [r12,:64], r1
        vst1.8          {d26}, [r0, :64], r1
9:
        bx              lr
6:
        sub             r12, r0, r1, lsl #1
        vst1.8          {d22}, [r12,:64], r1
        vst1.8          {d24}, [r0, :64], r1
        vst1.8          {d23}, [r12,:64], r1
        vst1.8          {d25}, [r0, :64], r1
        bx              lr
endfunc

function ff_vp9_loop_filter_h_8_8_neon, export=1
        sub             r12, r0, #4
        add             r0, r12, r1, lsl #2
        vld1.8          {d20}, [r12], r1
        vld1.8          {d24}, [r0], r1
        vld1.8          {d21}, [r12], r1
        vld1.8          {d25}, [r0], r1
        vld1.8          {d22}, [r12], r1
        vld1.8          {d26}, [r0], r1
        vld1.8          {d23}, [r12], r1
        vld1.8          {d27}, [r0], r1

        sub             r12, r12, r1, lsl #2
        sub             r0, r0, r1, lsl #2

        transpose_q_8x8 q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27

        loop_filter_8

        @ Even though only 6 pixels per row have been changed, we write the
        @ full 8 pixel registers.
        transpose_q_8x8 q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27

        vst1.8          {d20}, [r12], r1
        vst1.8          {d24}, [r0], r1
        vst1.8          {d21}, [r12], r1
        vst1.8          {d25}, [r0], r1
        vst1.8          {d22}, [r12], r1
        vst1.8          {d26}, [r0], r1
        vst1.8          {d23}, [r12], r1
        vst1.8          {d27}, [r0], r1
9:
        bx              lr
6:
        @ If we didn't need to do the flat8in part, we use the same writeback
        @ as in loop_filter_h_4_8.
        add             r12, r12, #2
        add             r0, r0, #2
        transpose_q_4x4 q11, q12, d22, d23, d24, d25
        vst1.32         {d22[0]}, [r12], r1
        vst1.32         {d22[1]}, [r0], r1
        vst1.32         {d23[0]}, [r12], r1
        vst1.32         {d23[1]}, [r0], r1
        vst1.32         {d24[0]}, [r12], r1
        vst1.32         {d24[1]}, [r0], r1
        vst1.32         {d25[0]}, [r12], r1
        vst1.32         {d25[1]}, [r0], r1
        bx              lr
endfunc

function vp9_loop_filter_v_16_neon
        sub             r12, r0, r1, lsl #3
        @ Read p7-p0 using r12 and q0-q7 using r0
        vld1.8          {d16}, [r12,:64], r1 @ p7
        vld1.8          {d24}, [r0, :64], r1 @ q0
        vld1.8          {d17}, [r12,:64], r1 @ p6
        vld1.8          {d25}, [r0, :64], r1 @ q1
        vld1.8          {d18}, [r12,:64], r1 @ p5
        vld1.8          {d26}, [r0, :64], r1 @ q2
        vld1.8          {d19}, [r12,:64], r1 @ p4
        vld1.8          {d27}, [r0, :64], r1 @ q3
        vld1.8          {d20}, [r12,:64], r1 @ p3
        vld1.8          {d28}, [r0, :64], r1 @ q4
        vld1.8          {d21}, [r12,:64], r1 @ p2
        vld1.8          {d29}, [r0, :64], r1 @ q5
        vld1.8          {d22}, [r12,:64], r1 @ p1
        vld1.8          {d30}, [r0, :64], r1 @ q6
        vld1.8          {d23}, [r12,:64], r1 @ p0
        vld1.8          {d31}, [r0, :64], r1 @ q7
        sub             r12, r12, r1, lsl #3
        sub             r0, r0, r1, lsl #3
        add             r12, r12, r1

        loop_filter_16

        @ If we did the flat8out part, we get the output in
        @ d2-d17 (skipping d7 and d16). r12 points to r0 - 7 * stride,
        @ store d2-d9 there, and d10-d17 into r0.
        vst1.8          {d2},  [r12,:64], r1
        vst1.8          {d10}, [r0, :64], r1
        vst1.8          {d3},  [r12,:64], r1
        vst1.8          {d11}, [r0, :64], r1
        vst1.8          {d4},  [r12,:64], r1
        vst1.8          {d12}, [r0, :64], r1
        vst1.8          {d5},  [r12,:64], r1
        vst1.8          {d13}, [r0, :64], r1
        vst1.8          {d6},  [r12,:64], r1
        vst1.8          {d14}, [r0, :64], r1
        vst1.8          {d8},  [r12,:64], r1
        vst1.8          {d15}, [r0, :64], r1
        vst1.8          {d9},  [r12,:64], r1
        vst1.8          {d17}, [r0, :64], r1
        sub             r0, r0, r1, lsl #3
        add             r0, r0, r1
9:
        bx              lr
8:
        add             r12, r12, r1, lsl #2
        @ If we didn't do the flat8out part, the output is left in the
        @ input registers.
        vst1.8          {d21}, [r12,:64], r1
        vst1.8          {d24}, [r0, :64], r1
        vst1.8          {d22}, [r12,:64], r1
        vst1.8          {d25}, [r0, :64], r1
        vst1.8          {d23}, [r12,:64], r1
        vst1.8          {d26}, [r0, :64], r1
        sub             r0, r0, r1, lsl #1
        sub             r0, r0, r1
        bx              lr
7:
        sub             r12, r0, r1, lsl #1
        vst1.8          {d22}, [r12,:64], r1
        vst1.8          {d24}, [r0, :64], r1
        vst1.8          {d23}, [r12,:64], r1
        vst1.8          {d25}, [r0, :64], r1
        sub             r0, r0, r1, lsl #1
        bx              lr
endfunc

function ff_vp9_loop_filter_v_16_8_neon, export=1
        ldr             r12, [sp]
        push            {lr}
        vpush           {q4-q7}
        push            {r12}
        bl              vp9_loop_filter_v_16_neon
        add             sp, sp, #4
        vpop            {q4-q7}
        pop             {pc}
endfunc

function ff_vp9_loop_filter_v_16_16_neon, export=1
        ldr             r12, [sp]
        @ The filter clobbers r2 and r3, but we need to keep them for the second round
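        @ After the three pushes below, the saved r2/r3 sit above the 4-byte
        @ {r12} slot and the 64 bytes of q4-q7, which is why they are reloaded
        @ from [sp, #68] and [sp, #72] before the second pass. The same layout
        @ applies in ff_vp9_loop_filter_h_16_16_neon below.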
        push            {r2, r3, lr}
        vpush           {q4-q7}
        push            {r12}
        bl              vp9_loop_filter_v_16_neon
        add             r0, #8
        ldr             r2, [sp, #68]
        ldr             r3, [sp, #72]
        bl              vp9_loop_filter_v_16_neon
        add             sp, sp, #4
        vpop            {q4-q7}
        pop             {r2, r3, pc}
endfunc

function vp9_loop_filter_h_16_neon
        sub             r12, r0, #8
        vld1.8          {d16}, [r12,:64], r1
        vld1.8          {d24}, [r0, :64], r1
        vld1.8          {d17}, [r12,:64], r1
        vld1.8          {d25}, [r0, :64], r1
        vld1.8          {d18}, [r12,:64], r1
        vld1.8          {d26}, [r0, :64], r1
        vld1.8          {d19}, [r12,:64], r1
        vld1.8          {d27}, [r0, :64], r1
        vld1.8          {d20}, [r12,:64], r1
        vld1.8          {d28}, [r0, :64], r1
        vld1.8          {d21}, [r12,:64], r1
        vld1.8          {d29}, [r0, :64], r1
        vld1.8          {d22}, [r12,:64], r1
        vld1.8          {d30}, [r0, :64], r1
        vld1.8          {d23}, [r12,:64], r1
        vld1.8          {d31}, [r0, :64], r1
        sub             r0, r0, r1, lsl #3
        sub             r12, r12, r1, lsl #3

        @ The 16x8 pixels read above are in two 8x8 blocks; the left
        @ half in d16-d23, and the right half in d24-d31. Do two 8x8 transposes
        @ of this, to get one column per register. This could be done with two
        @ transpose_8x8 as below, but this takes advantage of the q registers.
        transpose16_4x4 q8, q9, q10, q11, q12, q13, q14, q15
        vtrn.8          d16, d17
        vtrn.8          d18, d19
        vtrn.8          d20, d21
        vtrn.8          d22, d23
        vtrn.8          d24, d25
        vtrn.8          d26, d27
        vtrn.8          d28, d29
        vtrn.8          d30, d31

        loop_filter_16

        @ Transpose back; this is the same transpose as above, but
        @ we can't take advantage of q registers for the transpose, since
        @ the d registers in the transpose aren't all consecutive.
        transpose_8x8   d16, d2, d3, d4, d5, d6, d8, d9
        transpose_8x8   d10, d11, d12, d13, d14, d15, d17, d31

        vst1.8          {d16}, [r12,:64], r1
        vst1.8          {d10}, [r0, :64], r1
        vst1.8          {d2},  [r12,:64], r1
        vst1.8          {d11}, [r0, :64], r1
        vst1.8          {d3},  [r12,:64], r1
        vst1.8          {d12}, [r0, :64], r1
        vst1.8          {d4},  [r12,:64], r1
        vst1.8          {d13}, [r0, :64], r1
        vst1.8          {d5},  [r12,:64], r1
        vst1.8          {d14}, [r0, :64], r1
        vst1.8          {d6},  [r12,:64], r1
        vst1.8          {d15}, [r0, :64], r1
        vst1.8          {d8},  [r12,:64], r1
        vst1.8          {d17}, [r0, :64], r1
        vst1.8          {d9},  [r12,:64], r1
        vst1.8          {d31}, [r0, :64], r1
        sub             r0, r0, r1, lsl #3
9:
        bx              lr
8:
        @ The same writeback as in loop_filter_h_8_8
        sub             r12, r0, #4
        add             r0, r12, r1, lsl #2
        transpose_q_8x8 q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27

        vst1.8          {d20}, [r12], r1
        vst1.8          {d24}, [r0], r1
        vst1.8          {d21}, [r12], r1
        vst1.8          {d25}, [r0], r1
        vst1.8          {d22}, [r12], r1
        vst1.8          {d26}, [r0], r1
        vst1.8          {d23}, [r12], r1
        vst1.8          {d27}, [r0], r1
        sub             r0, r0, r1, lsl #3
        add             r0, r0, #4
        bx              lr
7:
        @ The same writeback as in loop_filter_h_4_8
        sub             r12, r0, #2
        add             r0, r12, r1, lsl #2
        transpose_q_4x4 q11, q12, d22, d23, d24, d25
        vst1.32         {d22[0]}, [r12], r1
        vst1.32         {d22[1]}, [r0], r1
        vst1.32         {d23[0]}, [r12], r1
        vst1.32         {d23[1]}, [r0], r1
        vst1.32         {d24[0]}, [r12], r1
        vst1.32         {d24[1]}, [r0], r1
        vst1.32         {d25[0]}, [r12], r1
        vst1.32         {d25[1]}, [r0], r1
        sub             r0, r0, r1, lsl #3
        add             r0, r0, #2
        bx              lr
endfunc

function ff_vp9_loop_filter_h_16_8_neon, export=1
        ldr             r12, [sp]
        push            {lr}
        vpush           {q4-q7}
        push            {r12}
        bl              vp9_loop_filter_h_16_neon
        add             sp, sp, #4
        vpop            {q4-q7}
        pop             {pc}
endfunc

function ff_vp9_loop_filter_h_16_16_neon, export=1
        ldr             r12, [sp]
        @ The filter clobbers r2 and r3, but we need to keep them for the second round
        push            {r2, r3, lr}
        vpush           {q4-q7}
        push            {r12}
        bl              vp9_loop_filter_h_16_neon
        add             r0, r0, r1, lsl #3
        ldr             r2, [sp, #68]
        ldr             r3, [sp, #72]
        bl              vp9_loop_filter_h_16_neon
        add             sp, sp, #4
        vpop            {q4-q7}
        pop             {r2, r3, pc}
endfunc