/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "neon.S"
/* Common early-out check for every loop-filter entry point below.
 * In:  w2 = alpha, w3 = beta, x4 -> four packed int8 tc0 values.
 * Out: v24.S[0] = the four tc0 bytes (for the filter macros), w6 clobbered.
 * Returns from the *caller* (plain ret, nothing on the stack yet) when
 * alpha == 0, beta == 0, or all four tc0 bytes are negative. */
.macro  h264_loop_filter_start
        cmp             w2,  #0
        ldr             w6,  [x4]
        ccmp            w3,  #0,  #0,  ne       // Z set if alpha==0 or beta==0
        mov             v24.S[0], w6            // stash tc0[0..3] for later
        and             w6,  w6,  w6,  lsl #16  // AND sign bits pairwise...
        b.eq            1f                      // alpha == 0 || beta == 0
        ands            w6,  w6,  w6,  lsl #8   // ...top bit set iff all tc0 < 0
        b.ge            2f                      // at least one tc0 >= 0: filter
1:
        ret                                     // nothing to do for this edge
2:
.endm
/* Filter one luma edge of 16 pixels (normal, non-intra filter).
 * In:  v16 = p0, v18 = p1, v20 = p2 rows; v0 = q0, v2 = q1, v4 = q2 rows;
 *      v24.S[0] = four tc0 bytes (one per group of 4 pixels);
 *      w2 = alpha, w3 = beta.
 * Out: v17 = new p1, v16 = new p0, v0 = new q0, v19 = new q1.
 * Clobbers v20-v24, v28, v30 and others listed above. */
.macro  h264_loop_filter_luma
        dup             v22.16B, w2               // alpha
        uxtl            v24.8H,  v24.8B
        uabd            v21.16B, v16.16B, v0.16B  // abs(p0 - q0)
        uxtl            v24.4S,  v24.4H
        uabd            v28.16B, v18.16B, v16.16B // abs(p1 - p0)
        sli             v24.8H,  v24.8H,  #8
        uabd            v30.16B, v2.16B,  v0.16B  // abs(q1 - q0)
        sli             v24.4S,  v24.4S,  #16     // each tc0 byte replicated x4
        cmhi            v21.16B, v22.16B, v21.16B // < alpha
        dup             v22.16B, w3               // beta
        cmlt            v23.16B, v24.16B, #0      // lanes with tc0 < 0: no filter
        cmhi            v28.16B, v22.16B, v28.16B // < beta
        cmhi            v30.16B, v22.16B, v30.16B // < beta
        bic             v21.16B, v21.16B, v23.16B // drop tc0<0 lanes from mask
        uabd            v17.16B, v20.16B, v16.16B // abs(p2 - p0)
        and             v21.16B, v21.16B, v28.16B
        uabd            v19.16B, v4.16B,  v0.16B  // abs(q2 - q0)
        cmhi            v17.16B, v22.16B, v17.16B // < beta
        and             v21.16B, v21.16B, v30.16B // v21 = overall filter mask
        cmhi            v19.16B, v22.16B, v19.16B // < beta
        and             v17.16B, v17.16B, v21.16B // mask: also filter p1
        and             v19.16B, v19.16B, v21.16B // mask: also filter q1
        and             v24.16B, v24.16B, v21.16B // tc0 zeroed outside mask
        urhadd          v28.16B, v16.16B, v0.16B  // (p0 + q0 + 1) >> 1
        sub             v21.16B, v24.16B, v17.16B // tc = tc0 (+1 where p1 filtered;
        uqadd           v23.16B, v18.16B, v24.16B //  mask bytes are -1)
        uhadd           v20.16B, v20.16B, v28.16B
        sub             v21.16B, v21.16B, v19.16B // (+1 where q1 filtered)
        uhadd           v28.16B, v4.16B,  v28.16B
        umin            v23.16B, v23.16B, v20.16B // clamp p1 candidate to p1±tc0
        uqsub           v22.16B, v18.16B, v24.16B
        uqadd           v4.16B,  v2.16B,  v24.16B
        umax            v23.16B, v23.16B, v22.16B // v23 = new p1 candidate
        uqsub           v22.16B, v2.16B,  v24.16B
        umin            v28.16B, v4.16B,  v28.16B // clamp q1 candidate to q1±tc0
        uxtl            v4.8H,   v0.8B
        umax            v28.16B, v28.16B, v22.16B // v28 = new q1 candidate
        uxtl2           v20.8H,  v0.16B
        usubw           v4.8H,   v4.8H,   v16.8B  // delta = ((q0-p0)<<2
        usubw2          v20.8H,  v20.8H,  v16.16B
        shl             v4.8H,   v4.8H,   #2
        shl             v20.8H,  v20.8H,  #2
        uaddw           v4.8H,   v4.8H,   v18.8B  //          + p1
        uaddw2          v20.8H,  v20.8H,  v18.16B
        usubw           v4.8H,   v4.8H,   v2.8B   //          - q1
        usubw2          v20.8H,  v20.8H,  v2.16B
        rshrn           v4.8B,   v4.8H,   #3      //          + 4) >> 3
        rshrn2          v4.16B,  v20.8H,  #3
        bsl             v17.16B, v23.16B, v18.16B // select new/old p1 per mask
        bsl             v19.16B, v28.16B, v2.16B  // select new/old q1 per mask
        neg             v23.16B, v21.16B          // -tc
        uxtl            v28.8H,  v16.8B
        smin            v4.16B,  v4.16B,  v21.16B // clamp delta to [-tc, tc]
        uxtl2           v21.8H,  v16.16B
        smax            v4.16B,  v4.16B,  v23.16B
        uxtl            v22.8H,  v0.8B
        uxtl2           v24.8H,  v0.16B
        saddw           v28.8H,  v28.8H,  v4.8B   // p0 + delta
        saddw2          v21.8H,  v21.8H,  v4.16B
        ssubw           v22.8H,  v22.8H,  v4.8B   // q0 - delta
        ssubw2          v24.8H,  v24.8H,  v4.16B
        sqxtun          v16.8B,  v28.8H           // saturate back to u8
        sqxtun2         v16.16B, v21.8H
        sqxtun          v0.8B,   v22.8H
        sqxtun2         v0.16B,  v24.8H
.endm
/* void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride,
 *                                      int alpha, int beta, int8_t *tc0)
 * Filters a horizontal (vertical-edge-crossing) luma edge of 16 pixels.
 * x0 points at the q0 row on entry. */
function ff_h264_v_loop_filter_luma_neon, export=1
        h264_loop_filter_start                  // may ret early

        sxtw            x1,  w1                 // sign-extend stride
        ld1             {v0.16B},  [x0], x1     // q0
        ld1             {v2.16B},  [x0], x1     // q1
        ld1             {v4.16B},  [x0], x1     // q2
        sub             x0,  x0,  x1,  lsl #2
        sub             x0,  x0,  x1,  lsl #1   // back up 6 rows, to p2
        ld1             {v20.16B}, [x0], x1     // p2
        ld1             {v18.16B}, [x0], x1     // p1
        ld1             {v16.16B}, [x0], x1     // p0

        h264_loop_filter_luma

        sub             x0,  x0,  x1,  lsl #1   // back to p1 row
        st1             {v17.16B}, [x0], x1     // new p1
        st1             {v16.16B}, [x0], x1     // new p0
        st1             {v0.16B},  [x0], x1     // new q0
        st1             {v19.16B}, [x0]         // new q1
        ret
endfunc
/* void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride,
 *                                      int alpha, int beta, int8_t *tc0)
 * Filters a vertical (horizontal-edge-crossing) luma edge of 16 rows:
 * loads 8 bytes per row (p3..q3), transposes to get pixel columns into
 * registers, filters, transposes the 4 modified columns back and stores
 * 4 bytes per row at offset -2 from the edge (p1,p0,q0,q1). */
function ff_h264_h_loop_filter_luma_neon, export=1
        h264_loop_filter_start                  // may ret early

        sub             x0,  x0,  #4            // start 4 pixels left of edge
        ld1             {v6.8B},  [x0], x1      // rows 0-7, low halves
        ld1             {v20.8B}, [x0], x1
        ld1             {v18.8B}, [x0], x1
        ld1             {v16.8B}, [x0], x1
        ld1             {v0.8B},  [x0], x1
        ld1             {v2.8B},  [x0], x1
        ld1             {v4.8B},  [x0], x1
        ld1             {v26.8B}, [x0], x1
        ld1             {v6.D}[1],  [x0], x1    // rows 8-15, high halves
        ld1             {v20.D}[1], [x0], x1
        ld1             {v18.D}[1], [x0], x1
        ld1             {v16.D}[1], [x0], x1
        ld1             {v0.D}[1],  [x0], x1
        ld1             {v2.D}[1],  [x0], x1
        ld1             {v4.D}[1],  [x0], x1
        ld1             {v26.D}[1], [x0], x1

        // columns -> rows: v20=p2 v18=p1 v16=p0 v0=q0 v2=q1 v4=q2
        transpose_8x16B v6, v20, v18, v16, v0, v2, v4, v26, v21, v23

        h264_loop_filter_luma

        // interleave the 4 output registers back into per-row 32-bit groups
        transpose_4x16B v17, v16, v0, v19, v21, v23, v25, v27

        sub             x0,  x0,  x1,  lsl #4   // rewind 16 rows
        add             x0,  x0,  #2            // -4 + 2 = p1 position
        st1             {v17.S}[0], [x0], x1    // 4 bytes (p1 p0 q0 q1) per row
        st1             {v16.S}[0], [x0], x1
        st1             {v0.S}[0],  [x0], x1
        st1             {v19.S}[0], [x0], x1
        st1             {v17.S}[1], [x0], x1
        st1             {v16.S}[1], [x0], x1
        st1             {v0.S}[1],  [x0], x1
        st1             {v19.S}[1], [x0], x1
        st1             {v17.S}[2], [x0], x1
        st1             {v16.S}[2], [x0], x1
        st1             {v0.S}[2],  [x0], x1
        st1             {v19.S}[2], [x0], x1
        st1             {v17.S}[3], [x0], x1
        st1             {v16.S}[3], [x0], x1
        st1             {v0.S}[3],  [x0], x1
        st1             {v19.S}[3], [x0], x1
        ret
endfunc
/* Filter one chroma edge of 8 pixels.
 * In:  v16 = p0, v18 = p1, v0 = q0, v2 = q1 rows;
 *      v24.S[0] = four tc0 bytes; w2 = alpha, w3 = beta.
 * Out: v16 = new p0, v0 = new q0 (chroma only updates p0/q0).
 * Clobbers v4, v22, v24-v26, v28, v30. */
.macro  h264_loop_filter_chroma
        dup             v22.8B, w2              // alpha
        uxtl            v24.8H, v24.8B
        uabd            v26.8B, v16.8B, v0.8B   // abs(p0 - q0)
        uxtl            v4.8H,  v0.8B
        uabd            v28.8B, v18.8B, v16.8B  // abs(p1 - p0)
        usubw           v4.8H,  v4.8H,  v16.8B  // delta = ((q0-p0)<<2
        sli             v24.8H, v24.8H, #8      // tc0 byte replicated x2
        shl             v4.8H,  v4.8H,  #2
        uabd            v30.8B, v2.8B,  v0.8B   // abs(q1 - q0)
        uaddw           v4.8H,  v4.8H,  v18.8B  //          + p1
        cmhi            v26.8B, v22.8B, v26.8B  // < alpha
        usubw           v4.8H,  v4.8H,  v2.8B   //          - q1
        dup             v22.8B, w3              // beta
        rshrn           v4.8B,  v4.8H,  #3      //          + 4) >> 3
        cmhi            v28.8B, v22.8B, v28.8B  // < beta
        cmhi            v30.8B, v22.8B, v30.8B  // < beta
        smin            v4.8B,  v4.8B,  v24.8B  // clamp delta to [-tc0, tc0]
        neg             v25.8B, v24.8B
        and             v26.8B, v26.8B, v28.8B
        smax            v4.8B,  v4.8B,  v25.8B
        and             v26.8B, v26.8B, v30.8B  // v26 = filter mask
        uxtl            v22.8H, v0.8B
        and             v4.8B,  v4.8B,  v26.8B  // zero delta outside mask
        uxtl            v28.8H, v16.8B
        saddw           v28.8H, v28.8H, v4.8B   // p0 + delta
        ssubw           v22.8H, v22.8H, v4.8B   // q0 - delta
        sqxtun          v16.8B, v28.8H          // saturate back to u8
        sqxtun          v0.8B,  v22.8H
.endm
/* void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride,
 *                                        int alpha, int beta, int8_t *tc0)
 * Filters a horizontal chroma edge of 8 pixels; x0 points at q0 on entry. */
function ff_h264_v_loop_filter_chroma_neon, export=1
        h264_loop_filter_start                  // may ret early

        sub             x0,  x0,  x1,  lsl #1   // back up 2 rows, to p1
        ld1             {v18.8B}, [x0], x1      // p1
        ld1             {v16.8B}, [x0], x1      // p0
        ld1             {v0.8B},  [x0], x1      // q0
        ld1             {v2.8B},  [x0]          // q1

        h264_loop_filter_chroma

        sub             x0,  x0,  x1,  lsl #1   // back to p0 row
        st1             {v16.8B}, [x0], x1      // new p0
        st1             {v0.8B},  [x0], x1      // new q0
        ret
endfunc
/* void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride,
 *                                        int alpha, int beta, int8_t *tc0)
 * Filters a vertical chroma edge of 8 rows: loads 4 bytes per row
 * (p1 p0 q0 q1), transposes, filters, transposes back and stores. */
function ff_h264_h_loop_filter_chroma_neon, export=1
        h264_loop_filter_start                  // may ret early

        sub             x0,  x0,  #2            // start 2 pixels left of edge
        ld1             {v18.S}[0], [x0], x1    // rows 0-3 in lanes [0]
        ld1             {v16.S}[0], [x0], x1
        ld1             {v0.S}[0],  [x0], x1
        ld1             {v2.S}[0],  [x0], x1
        ld1             {v18.S}[1], [x0], x1    // rows 4-7 in lanes [1]
        ld1             {v16.S}[1], [x0], x1
        ld1             {v0.S}[1],  [x0], x1
        ld1             {v2.S}[1],  [x0], x1

        // columns -> rows: v18=p1 v16=p0 v0=q0 v2=q1
        transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31

        h264_loop_filter_chroma

        // rows -> columns for the stores below
        transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31

        sub             x0,  x0,  x1,  lsl #3   // rewind 8 rows
        st1             {v18.S}[0], [x0], x1
        st1             {v16.S}[0], [x0], x1
        st1             {v0.S}[0],  [x0], x1
        st1             {v2.S}[0],  [x0], x1
        st1             {v18.S}[1], [x0], x1
        st1             {v16.S}[1], [x0], x1
        st1             {v0.S}[1],  [x0], x1
        st1             {v2.S}[1],  [x0], x1
        ret
endfunc
/* Biweight loop body, 16-pixel-wide blocks, two rows per iteration.
 * \macs/\macd are umlal or umlsl, chosen by the caller per the signs of
 * the two weights (w5 = |dst weight|, w6 = |src weight|).
 * x0 = src, x1 = src2, x2 = stride, w3 = height, x7 = dst,
 * v16.8H = rounding bias, v18.8H = negated shift for sshl. */
.macro  biweight_16 macs, macd
        dup             v0.16B,  w5             // dst weight
        dup             v1.16B,  w6             // src weight
        mov             v4.16B,  v16.16B        // init accumulators with bias
        mov             v6.16B,  v16.16B
1:      subs            w3,  w3,  #2            // two rows per pass
        ld1             {v20.16B}, [x0], x2
        \macd           v4.8H,   v0.8B,  v20.8B // acc +/-= w0 * src0
        \macd\()2       v6.8H,   v0.16B, v20.16B
        ld1             {v22.16B}, [x1], x2
        \macs           v4.8H,   v1.8B,  v22.8B // acc +/-= w1 * src1
        \macs\()2       v6.8H,   v1.16B, v22.16B
        mov             v24.16B, v16.16B        // second row accumulators
        ld1             {v28.16B}, [x0], x2
        mov             v26.16B, v16.16B
        \macd           v24.8H,  v0.8B,  v28.8B
        \macd\()2       v26.8H,  v0.16B, v28.16B
        ld1             {v30.16B}, [x1], x2
        \macs           v24.8H,  v1.8B,  v30.8B
        \macs\()2       v26.8H,  v1.16B, v30.16B
        sshl            v4.8H,   v4.8H,  v18.8H // >> (log2_denom + 1)
        sshl            v6.8H,   v6.8H,  v18.8H
        sqxtun          v4.8B,   v4.8H          // saturate to u8
        sqxtun2         v4.16B,  v6.8H
        sshl            v24.8H,  v24.8H, v18.8H
        sshl            v26.8H,  v26.8H, v18.8H
        sqxtun          v24.8B,  v24.8H
        sqxtun2         v24.16B, v26.8H
        mov             v6.16B,  v16.16B        // re-init for next iteration
        st1             {v4.16B},  [x7], x2
        mov             v4.16B,  v16.16B
        st1             {v24.16B}, [x7], x2
        b.ne            1b
        ret
.endm
/* Biweight loop body, 8-pixel-wide blocks, two rows per iteration.
 * Same register contract as biweight_16. */
.macro  biweight_8 macs, macd
        dup             v0.8B,  w5              // dst weight
        dup             v1.8B,  w6              // src weight
        mov             v2.16B, v16.16B         // init accumulators with bias
        mov             v20.16B, v16.16B
1:      subs            w3,  w3,  #2            // two rows per pass
        ld1             {v4.8B}, [x0], x2
        \macd           v2.8H,  v0.8B,  v4.8B   // acc +/-= w0 * src0
        ld1             {v5.8B}, [x1], x2
        \macs           v2.8H,  v1.8B,  v5.8B   // acc +/-= w1 * src1
        ld1             {v6.8B}, [x0], x2
        \macd           v20.8H, v0.8B,  v6.8B
        ld1             {v7.8B}, [x1], x2
        \macs           v20.8H, v1.8B,  v7.8B
        sshl            v2.8H,  v2.8H,  v18.8H  // >> (log2_denom + 1)
        sqxtun          v2.8B,  v2.8H           // saturate to u8
        sshl            v20.8H, v20.8H, v18.8H
        sqxtun          v4.8B,  v20.8H
        mov             v20.16B, v16.16B        // re-init for next iteration
        st1             {v2.8B}, [x7], x2
        mov             v2.16B, v16.16B
        st1             {v4.8B}, [x7], x2
        b.ne            1b
        ret
.endm
/* Biweight loop body, 4-pixel-wide blocks, four rows per iteration
 * (two rows packed per 8-byte register); handles a trailing pair of
 * rows at 2: when height is not a multiple of 4.
 * Same register contract as biweight_16. */
.macro  biweight_4 macs, macd
        dup             v0.8B,  w5              // dst weight
        dup             v1.8B,  w6              // src weight
        mov             v2.16B, v16.16B         // init accumulators with bias
        mov             v20.16B, v16.16B
1:      subs            w3,  w3,  #4            // four rows per pass
        ld1             {v4.S}[0], [x0], x2     // two src0 rows in one reg
        ld1             {v4.S}[1], [x0], x2
        \macd           v2.8H,  v0.8B,  v4.8B
        ld1             {v5.S}[0], [x1], x2     // two src1 rows
        ld1             {v5.S}[1], [x1], x2
        \macs           v2.8H,  v1.8B,  v5.8B
        b.lt            2f                      // only 2 rows were left
        ld1             {v6.S}[0], [x0], x2
        ld1             {v6.S}[1], [x0], x2
        \macd           v20.8H, v0.8B,  v6.8B
        ld1             {v7.S}[0], [x1], x2
        ld1             {v7.S}[1], [x1], x2
        \macs           v20.8H, v1.8B,  v7.8B
        sshl            v2.8H,  v2.8H,  v18.8H  // >> (log2_denom + 1)
        sqxtun          v2.8B,  v2.8H           // saturate to u8
        sshl            v20.8H, v20.8H, v18.8H
        sqxtun          v4.8B,  v20.8H
        mov             v20.16B, v16.16B        // re-init for next iteration
        st1             {v2.S}[0], [x7], x2
        st1             {v2.S}[1], [x7], x2
        mov             v2.16B, v16.16B
        st1             {v4.S}[0], [x7], x2
        st1             {v4.S}[1], [x7], x2
        b.ne            1b
        ret
2:      sshl            v2.8H,  v2.8H,  v18.8H  // tail: final 2 rows only
        sqxtun          v2.8B,  v2.8H
        st1             {v2.S}[0], [x7], x2
        st1             {v2.S}[1], [x7], x2
        ret
.endm
/* Emits ff_biweight_h264_pixels_\w\()_neon:
 * (dst=x0, src=x1, stride=w2, height=w3, log2_denom=w4,
 *  weightd=w5, weights=w6, offset=w7 — NOTE(review): arg names inferred
 *  from usage; confirm against the C prototype).
 * Dispatches on the sign bits of the two weights to one of four
 * umlal/umlsl combinations; each biweight_\w body ends in ret, so the
 * numeric labels 10/20/30/40 never fall through into each other. */
.macro  biweight_func w
function ff_biweight_h264_pixels_\w\()_neon, export=1
        sxtw            x2,  w2                 // sign-extend stride
        lsr             w8,  w5,  #31           // sign bit of weightd
        add             w7,  w7,  #1
        eor             w8,  w8,  w6,  lsr #30  // combine with sign of weights
        orr             w7,  w7,  #1            // force rounding bias odd
        dup             v18.8H,   w4
        lsl             w7,  w7,  w4            // bias = ((offset+1)|1) << log2
        not             v18.16B,  v18.16B       // -(log2_denom+1) for sshl
        dup             v16.8H,   w7
        mov             x7,  x0                 // x7 = store pointer
        cbz             w8,  10f                // 0: both weights >= 0
        subs            w8,  w8,  #1
        b.eq            20f                     // 1: weightd < 0
        subs            w8,  w8,  #1
        b.eq            30f                     // 2: both < 0
        b               40f                     // 3: weights < 0
10:     biweight_\w     umlal, umlal
20:     neg             w5,  w5                 // use |weightd| with umlsl
        biweight_\w     umlal, umlsl
30:     neg             w5,  w5
        neg             w6,  w6                 // use both magnitudes
        biweight_\w     umlsl, umlsl
40:     neg             w6,  w6                 // use |weights| with umlsl
        biweight_\w     umlsl, umlal
endfunc
.endm
/* Instantiate the 16-, 8- and 4-pixel-wide biweight functions. */
        biweight_func   16
        biweight_func   8
        biweight_func   4
/* Weight loop body, 16-pixel-wide blocks, two rows per iteration.
 * \add is shadd/shsub (log2_denom >= 1 path) or add/sub, per the sign
 * of the weight.  x0 = src, x1 = stride, w2 = height, w4 = |weight|,
 * x5 = dst, v16.8H = offset term, v18.8H = shift for srshl. */
.macro  weight_16 add
        dup             v0.16B,  w4             // weight
1:      subs            w2,  w2,  #2            // two rows per pass
        ld1             {v20.16B}, [x0], x1
        umull           v4.8H,   v0.8B,  v20.8B // weight * src
        umull2          v6.8H,   v0.16B, v20.16B
        ld1             {v28.16B}, [x0], x1
        umull           v24.8H,  v0.8B,  v28.8B
        umull2          v26.8H,  v0.16B, v28.16B
        \add            v4.8H,   v16.8H, v4.8H  // apply offset term
        srshl           v4.8H,   v4.8H,  v18.8H // rounding shift
        \add            v6.8H,   v16.8H, v6.8H
        srshl           v6.8H,   v6.8H,  v18.8H
        sqxtun          v4.8B,   v4.8H          // saturate to u8
        sqxtun2         v4.16B,  v6.8H
        \add            v24.8H,  v16.8H, v24.8H
        srshl           v24.8H,  v24.8H, v18.8H
        \add            v26.8H,  v16.8H, v26.8H
        srshl           v26.8H,  v26.8H, v18.8H
        sqxtun          v24.8B,  v24.8H
        sqxtun2         v24.16B, v26.8H
        st1             {v4.16B},  [x5], x1
        st1             {v24.16B}, [x5], x1
        b.ne            1b
        ret
.endm
/* Weight loop body, 8-pixel-wide blocks, two rows per iteration.
 * Same register contract as weight_16. */
.macro  weight_8 add
        dup             v0.8B,  w4              // weight
1:      subs            w2,  w2,  #2            // two rows per pass
        ld1             {v4.8B}, [x0], x1
        umull           v2.8H,  v0.8B,  v4.8B   // weight * src
        ld1             {v6.8B}, [x0], x1
        umull           v20.8H, v0.8B,  v6.8B
        \add            v2.8H,  v16.8H, v2.8H   // apply offset term
        srshl           v2.8H,  v2.8H,  v18.8H  // rounding shift
        sqxtun          v2.8B,  v2.8H           // saturate to u8
        \add            v20.8H, v16.8H, v20.8H
        srshl           v20.8H, v20.8H, v18.8H
        sqxtun          v4.8B,  v20.8H
        st1             {v2.8B}, [x5], x1
        st1             {v4.8B}, [x5], x1
        b.ne            1b
        ret
.endm
/* Weight loop body, 4-pixel-wide blocks, four rows per iteration
 * (two rows packed per 8-byte register); handles a trailing pair of
 * rows at 2: when height is not a multiple of 4.
 * Same register contract as weight_16. */
.macro  weight_4 add
        dup             v0.8B,  w4              // weight
1:      subs            w2,  w2,  #4            // four rows per pass
        ld1             {v4.S}[0], [x0], x1     // two rows in one reg
        ld1             {v4.S}[1], [x0], x1
        umull           v2.8H,  v0.8B,  v4.8B   // weight * src
        b.lt            2f                      // only 2 rows were left
        ld1             {v6.S}[0], [x0], x1
        ld1             {v6.S}[1], [x0], x1
        umull           v20.8H, v0.8B,  v6.8B
        \add            v2.8H,  v16.8H, v2.8H   // apply offset term
        srshl           v2.8H,  v2.8H,  v18.8H  // rounding shift
        sqxtun          v2.8B,  v2.8H           // saturate to u8
        \add            v20.8H, v16.8H, v20.8H
        srshl           v20.8H, v20.8h, v18.8H
        sqxtun          v4.8B,  v20.8H
        st1             {v2.S}[0], [x5], x1
        st1             {v2.S}[1], [x5], x1
        st1             {v4.S}[0], [x5], x1
        st1             {v4.S}[1], [x5], x1
        b.ne            1b
        ret
2:      \add            v2.8H,  v16.8H, v2.8H   // tail: final 2 rows only
        srshl           v2.8H,  v2.8H,  v18.8H
        sqxtun          v2.8B,  v2.8H
        st1             {v2.S}[0], [x5], x1
        st1             {v2.S}[1], [x5], x1
        ret
.endm
/* Emits ff_weight_h264_pixels_\w\()_neon:
 * (block=x0, stride=w1, height=w2, log2_denom=w3, weight=w4, offset=w5
 *  — NOTE(review): arg names inferred from usage; confirm against the
 *  C prototype).
 * Two dispatch levels: log2_denom > 1 uses shadd/shsub (halving add
 * absorbs one shift), otherwise plain add/sub; within each, the sign
 * of the weight selects add vs sub with the negated magnitude.
 * The numeric label 10 is deliberately reused; each weight_\w body
 * ends in ret, so labels never fall through into each other. */
.macro  weight_func w
function ff_weight_h264_pixels_\w\()_neon, export=1
        sxtw            x1,  w1                 // sign-extend stride
        cmp             w3,  #1
        mov             w6,  #1
        lsl             w5,  w5,  w3            // offset << log2_denom
        dup             v16.8H,   w5
        mov             x5,  x0                 // x5 = store pointer
        b.le            20f                     // log2_denom <= 1 path
        sub             w6,  w6,  w3            // shift = 1 - log2_denom
        dup             v18.8H,   w6            // negative -> right srshl
        cmp             w4,  #0
        b.lt            10f
        weight_\w       shadd                   // weight >= 0
10:     neg             w4,  w4                 // use |weight|
        weight_\w       shsub
20:     neg             w6,  w3                 // shift = -log2_denom
        dup             v18.8H,   w6
        cmp             w4,  #0
        b.lt            10f
        weight_\w       add                     // weight >= 0
10:     neg             w4,  w4                 // use |weight|
        weight_\w       sub
endfunc
.endm
/* Instantiate the 16-, 8- and 4-pixel-wide weight functions. */
        weight_func     16
        weight_func     8
        weight_func     4