You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

362 lines
13KB

  1. /*
  2. * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  3. *
  4. * This file is part of Libav.
  5. *
  6. * Libav is free software; you can redistribute it and/or
  7. * modify it under the terms of the GNU Lesser General Public
  8. * License as published by the Free Software Foundation; either
  9. * version 2.1 of the License, or (at your option) any later version.
  10. *
  11. * Libav is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. * Lesser General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Lesser General Public
  17. * License along with Libav; if not, write to the Free Software
  18. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19. */
  20. #include "libavutil/aarch64/asm.S"
  /*
   * ldcol.8 rd, rs, rt, n=8, hi=0
   *
   * Gather a column of bytes into consecutive byte lanes of vector rd.
   * Every load reads one byte from [rs] and then advances rs by rt, so on
   * exit rs has moved by (number of lanes loaded) * rt.
   *
   * Lanes written, by argument:
   *   n >= 8          lanes 0-7
   *   n == 16         lanes 0-15
   *   n <  8, hi=0    lanes 0-3 only
   *   n <  8, hi=1    lanes 4-7 only
   * Lanes that are not written keep whatever rd held before.
   */
  .macro ldcol.8 rd, rs, rt, n=8, hi=0
  .if \n >= 8 || \hi == 0
  .irp lane, 0, 1, 2, 3
          ld1             {\rd\().b}[\lane], [\rs], \rt
  .endr
  .endif
  .if \n >= 8 || \hi == 1
  .irp lane, 4, 5, 6, 7
          ld1             {\rd\().b}[\lane], [\rs], \rt
  .endr
  .endif
  .if \n == 16
  .irp lane, 8, 9, 10, 11, 12, 13, 14, 15
          ld1             {\rd\().b}[\lane], [\rs], \rt
  .endr
  .endif
  .endm
  // Fill a 16x16 block with the constant value 128.
  // In:  x0 = address of the first row, x1 = byte distance between rows;
  //      both are consumed by the store loop at .L_pred16x16_dc_end.
  function ff_pred16x16_128_dc_neon, export=1
          movi            v0.16b, #128                // every lane = 128
          b               .L_pred16x16_dc_end         // shared 16-row store loop
  endfunc
  // 16x16 DC prediction from the row above only.
  // In:  x0 = address of the first row, x1 = byte distance between rows
  // Fill value = (sum of the 16 bytes at x0 - x1, plus 8) >> 4.
  // Writes x2 and v0, then continues in the shared store loop.
  function ff_pred16x16_top_dc_neon, export=1
          sub             x2,  x0,  x1                // x2 -> row above the block
          ld1             {v0.16b}, [x2]              // v0 = 16 top neighbours
          uaddlv          h0,  v0.16b                 // h0 = sum of the 16 bytes
          rshrn           v0.8b,  v0.8h,  #4          // (sum + 8) >> 4
          dup             v0.16b, v0.b[0]             // broadcast to all 16 lanes
          b               .L_pred16x16_dc_end         // shared 16-row store loop
  endfunc
  // 16x16 DC prediction from the left column only.
  // In:  x0 = address of the first row, x1 = byte distance between rows
  // Fill value = (sum of the bytes at x0 - 1 + j*x1 for j = 0..15, plus 8) >> 4.
  // Writes x2 and v0, then continues in the shared store loop.
  function ff_pred16x16_left_dc_neon, export=1
          sub             x2,  x0,  #1                // x2 -> byte left of row 0
          ldcol.8         v0,  x2,  x1, 16            // v0 = 16 left neighbours
          uaddlv          h0,  v0.16b                 // h0 = sum of the 16 bytes
          rshrn           v0.8b,  v0.8h,  #4          // (sum + 8) >> 4
          dup             v0.16b, v0.b[0]             // broadcast to all 16 lanes
          b               .L_pred16x16_dc_end         // shared 16-row store loop
  endfunc
  // 16x16 DC prediction from the row above and the left column.
  // In:  x0 = address of the first row, x1 = byte distance between rows
  // Fill value = (sum of 16 top bytes + sum of 16 left bytes + 16) >> 5.
  // Writes x2, x3, w3, v0, v1; x0 is advanced by 16 rows.
  //
  // .L_pred16x16_dc_end is also the branch target of the other 16x16 DC
  // variants in this file: on entry v0 holds the 16-byte row to replicate.
  function ff_pred16x16_dc_neon, export=1
          sub             x3,  x0,  #1                // x3 -> byte left of row 0
          sub             x2,  x0,  x1                // x2 -> row above the block
          ldcol.8         v1,  x3,  x1, 16            // v1 = 16 left neighbours
          ld1             {v0.16b}, [x2]              // v0 = 16 top neighbours
          uaddlv          h1,  v1.16b                 // h1 = sum of left bytes
          uaddlv          h0,  v0.16b                 // h0 = sum of top bytes
          add             v0.4h,  v0.4h,  v1.4h       // h0 = sum of all 32 bytes
          rshrn           v0.8b,  v0.8h,  #5          // (sum + 16) >> 5
          dup             v0.16b, v0.b[0]             // broadcast to all 16 lanes
  .L_pred16x16_dc_end:
          mov             w3,  #8                     // 8 iterations x 2 rows
  6:      st1             {v0.16b}, [x0], x1
          st1             {v0.16b}, [x0], x1
          subs            w3,  w3,  #1
          b.ne            6b
          ret
  endfunc
  // 16x16 horizontal prediction: each of the 16 rows is filled with the
  // byte immediately to its left.
  // In:  x0 = address of the first row, x1 = byte distance between rows
  // Writes x2, w3, v0; x0 is advanced by 16 rows.
  function ff_pred16x16_hor_neon, export=1
          mov             w3,  #16                    // one iteration per row
          sub             x2,  x0,  #1                // x2 -> byte left of row 0
  1:      ld1r            {v0.16b}, [x2], x1          // replicate the left byte
          st1             {v0.16b}, [x0], x1          // write the row
          subs            w3,  w3,  #1
          b.ne            1b
          ret
  endfunc
  // 16x16 vertical prediction: the 16 bytes of the row above are copied
  // into every row of the block.
  // In:  x0 = address of the first row, x1 = byte distance between rows
  // Writes x1 (doubled), x2, w3, v0; x0 is advanced by 16 rows.
  function ff_pred16x16_vert_neon, export=1
          sub             x2,  x0,  x1                // x2 -> row above the block
          lsl             x1,  x1,  #1                // step two rows at a time
          ld1             {v0.16b}, [x2], x1          // load top row; x2 -> row 1
          mov             w3,  #8                     // 8 iterations x 2 rows
  1:      st1             {v0.16b}, [x0], x1          // even rows via x0
          st1             {v0.16b}, [x2], x1          // odd rows via x2
          subs            w3,  w3,  #1
          b.ne            1b
          ret
  endfunc
  // 16x16 plane prediction.
  // In:  x0 = address of the first row, x1 = byte distance between rows
  // Notation: T[i] = byte at x0 - x1 + i, L[j] = byte at x0 + j*x1 - 1
  //           (T[-1] and L[-1] are the same byte).
  //   H = sum over k = 1..8 of k * (T[7+k] - T[7-k])
  //   V = sum over k = 1..8 of k * (L[7+k] - L[7-k])
  //   b = (5*H + 32) >> 6,  c = (5*V + 32) >> 6
  //   out[y][x] = clip_u8((16*(T[15] + L[15] + 1) + b*(x - 7) + c*(y - 7)) >> 5)
  // Writes x2-x4, w3, v0-v7; x0 is advanced by 16 rows.
  function ff_pred16x16_plane_neon, export=1
          sub             x3,  x0,  x1                // x3 -> T[0]
          movrel          x4,  p16weight
          add             x2,  x3,  #8                // x2 -> T[8]
          sub             x3,  x3,  #1                // x3 -> T[-1]
          ld1             {v0.8b},  [x3]              // v0 = T[-1..6]
          ld1             {v2.8b},  [x2], x1          // v2 = T[8..15]
          ldcol.8         v1,  x3,  x1                // v1 = L[-1..6]; x3 ends on row 7
          add             x3,  x3,  x1                // skip row 7
          ldcol.8         v3,  x3,  x1                // v3 = L[8..15]
          rev64           v0.8b,  v0.8b               // v0 = T[6], T[5], ..., T[-1]
          rev64           v1.8b,  v1.8b               // v1 = L[6], L[5], ..., L[-1]
          uaddl           v7.8h,  v2.8b,  v3.8b       // v7.h[i] = T[8+i] + L[8+i]
          usubl           v2.8h,  v2.8b,  v0.8b       // T[8+i] - T[6-i]
          usubl           v3.8h,  v3.8b,  v1.8b       // L[8+i] - L[6-i]
          ld1             {v0.8h},  [x4]              // v0 = weights 1..8
          mul             v2.8h,  v2.8h,  v0.8h
          mul             v3.8h,  v3.8h,  v0.8h
          addp            v2.8h,  v2.8h,  v3.8h       // low half: H terms, high half: V terms
          addp            v2.8h,  v2.8h,  v2.8h
          addp            v2.4h,  v2.4h,  v2.4h       // h[0] = H, h[1] = V
          sshll           v3.4s,  v2.4h,  #2          // 4 * {H, V}
          saddw           v2.4s,  v3.4s,  v2.4h       // 5 * {H, V}
          rshrn           v4.4h,  v2.4s,  #6          // v4.h[0] = b, v4.h[1] = c
          trn2            v5.4h,  v4.4h,  v4.4h       // v5.h[0] = c
          add             v2.4h,  v4.4h,  v5.4h       // h[0] = b + c
          shl             v3.4h,  v2.4h,  #3          // 8 * (b + c)
          ext             v7.16b, v7.16b, v7.16b, #14 // v7.h[0] = T[15] + L[15]
          sub             v3.4h,  v3.4h,  v2.4h       // 7 * (b + c)
          add             v7.4h,  v7.4h,  v0.4h       // h[0] += 1 (weight lane 0)
          shl             v2.4h,  v7.4h,  #4          // 16 * (T[15] + L[15] + 1)
          sub             v2.4h,  v2.4h,  v3.4h       // h[0] = pre-shift value at x = 0, y = 0
          shl             v3.4h,  v4.4h,  #4          // h[0] = 16 * b
          ext             v0.16b, v0.16b, v0.16b, #14 // v0 = 8, 1, 2, ..., 7
          sub             v6.4h,  v5.4h,  v3.4h       // h[0] = c - 16 * b
          mov             v0.h[0],  wzr               // v0 = 0, 1, 2, ..., 7
          mul             v0.8h,  v0.8h,  v4.h[0]     // v0.h[x] = b * x
          dup             v1.8h,  v2.h[0]
          dup             v2.8h,  v4.h[0]
          dup             v3.8h,  v6.h[0]
          shl             v2.8h,  v2.8h,  #3          // v2 = 8 * b: columns 0-7 -> 8-15
          add             v1.8h,  v1.8h,  v0.8h       // v1 = row 0, columns 0-7
          add             v3.8h,  v3.8h,  v2.8h       // v3 = c - 8 * b: next row, columns 0-7
          mov             w3,  #16                    // one iteration per row
  1:
          sqshrun         v0.8b,  v1.8h,  #5          // columns 0-7, clipped to u8
          add             v1.8h,  v1.8h,  v2.8h
          sqshrun2        v0.16b, v1.8h,  #5          // columns 8-15, clipped to u8
          add             v1.8h,  v1.8h,  v3.8h
          st1             {v0.16b}, [x0], x1
          subs            w3,  w3,  #1
          b.ne            1b
          ret
  endfunc
  // Halfword ramp 1..8. ff_pred16x16_plane_neon uses it as the gradient
  // weights; both plane functions also rotate it and zero lane 0 to obtain
  // the column indices 0..7.
  const p16weight, align=4
          .short          1, 2, 3, 4
          .short          5, 6, 7, 8
  endconst
  // Gradient weights for ff_pred8x8_plane_neon: 1..4 for the four row
  // differences in lanes 0-3, then 1..4 again for the four column
  // differences in lanes 4-7.
  const p8weight, align=4
          .short          1, 2, 3, 4
          .short          1, 2, 3, 4
  endconst
  // 8x8 horizontal prediction: each of the 8 rows is filled with the byte
  // immediately to its left.
  // In:  x0 = address of the first row, x1 = byte distance between rows
  // Writes x2, w3, v0; x0 is advanced by 8 rows.
  function ff_pred8x8_hor_neon, export=1
          mov             w3,  #8                     // one iteration per row
          sub             x2,  x0,  #1                // x2 -> byte left of row 0
  1:      ld1r            {v0.8b},  [x2], x1          // replicate the left byte
          st1             {v0.8b},  [x0], x1          // write the row
          subs            w3,  w3,  #1
          b.ne            1b
          ret
  endfunc
  // 8x8 vertical prediction: the 8 bytes of the row above are copied into
  // every row of the block.
  // In:  x0 = address of the first row, x1 = byte distance between rows
  // Writes x1 (doubled), x2, w3, v0; x0 is advanced by 8 rows.
  function ff_pred8x8_vert_neon, export=1
          sub             x2,  x0,  x1                // x2 -> row above the block
          mov             w3,  #4                     // 4 iterations x 2 rows
          lsl             x1,  x1,  #1                // step two rows at a time
          ld1             {v0.8b},  [x2], x1          // load top row; x2 -> row 1
  1:      st1             {v0.8b},  [x0], x1          // even rows via x0
          st1             {v0.8b},  [x2], x1          // odd rows via x2
          subs            w3,  w3,  #1
          b.ne            1b
          ret
  endfunc
  // 8x8 plane prediction.
  // In:  x0 = address of the first row, x1 = byte distance between rows
  // Notation: T[i] = byte at x0 - x1 + i, L[j] = byte at x0 + j*x1 - 1
  //           (T[-1] and L[-1] are the same byte).
  //   H = sum over k = 1..4 of k * (T[3+k] - T[3-k])
  //   V = sum over k = 1..4 of k * (L[3+k] - L[3-k])
  //   b = (17*H + 16) >> 5,  c = (17*V + 16) >> 5
  //   out[y][x] = clip_u8((16*(T[7] + L[7] + 1) + b*(x - 3) + c*(y - 3)) >> 5)
  // Writes x2-x5, w3, v0-v3, v5-v7; x0 is advanced by 8 rows.
  function ff_pred8x8_plane_neon, export=1
          sub             x3,  x0,  x1                // x3 -> T[0]
          movrel          x4,  p8weight
          movrel          x5,  p16weight
          add             x2,  x3,  #4                // x2 -> T[4]
          sub             x3,  x3,  #1                // x3 -> T[-1]
          ld1             {v0.s}[0],  [x3]            // v0 bytes 0-3 = T[-1..2]
          ld1             {v2.s}[0],  [x2], x1        // v2 bytes 0-3 = T[4..7]
          ldcol.8         v0,  x3,  x1,  4,  hi=1     // v0 bytes 4-7 = L[-1..2]; x3 ends on row 3
          add             x3,  x3,  x1                // skip row 3
          ldcol.8         v3,  x3,  x1,  4            // v3 bytes 0-3 = L[4..7]
          uaddl           v7.8h,  v2.8b,  v3.8b       // v7.h[i] = T[4+i] + L[4+i], i = 0..3
          rev32           v0.8b,  v0.8b               // v0 = T[2..-1], L[2..-1]
          trn1            v2.2s,  v2.2s,  v3.2s       // v2 = T[4..7], L[4..7]
          usubl           v2.8h,  v2.8b,  v0.8b       // T[4+i] - T[2-i], L[4+i] - L[2-i]
          ld1             {v6.8h},  [x4]              // v6 = 1,2,3,4,1,2,3,4
          mul             v2.8h,  v2.8h,  v6.8h
          ld1             {v0.8h},  [x5]              // v0 = 1..8
          saddlp          v2.4s,  v2.8h
          addp            v2.4s,  v2.4s,  v2.4s       // s[0] = H, s[1] = V
          shl             v3.4s,  v2.4s,  #4          // 16 * {H, V}
          add             v2.4s,  v3.4s,  v2.4s       // 17 * {H, V}
          rshrn           v5.4h,  v2.4s,  #5          // v5.h[0] = b, v5.h[1] = c
          addp            v2.4h,  v5.4h,  v5.4h       // h[0] = b + c
          shl             v3.4h,  v2.4h,  #1          // 2 * (b + c)
          add             v3.4h,  v3.4h,  v2.4h       // 3 * (b + c)
          rev64           v7.4h,  v7.4h               // v7.h[0] = T[7] + L[7]
          add             v7.4h,  v7.4h,  v0.4h       // h[0] += 1 (weight lane 0)
          shl             v2.4h,  v7.4h,  #4          // 16 * (T[7] + L[7] + 1)
          sub             v2.4h,  v2.4h,  v3.4h       // h[0] = pre-shift value at x = 0, y = 0
          ext             v0.16b, v0.16b, v0.16b, #14 // v0 = 8, 1, 2, ..., 7
          mov             v0.h[0],  wzr               // v0 = 0, 1, 2, ..., 7
          mul             v0.8h,  v0.8h,  v5.h[0]     // v0.h[x] = b * x
          dup             v1.8h,  v2.h[0]
          dup             v2.8h,  v5.h[1]             // v2 = c: step to the next row
          add             v1.8h,  v1.8h,  v0.8h       // v1 = row 0, columns 0-7
          mov             w3,  #8                     // one iteration per row
  1:
          sqshrun         v0.8b,  v1.8h,  #5          // clip to u8
          add             v1.8h,  v1.8h,  v2.8h
          st1             {v0.8b},  [x0], x1
          subs            w3,  w3,  #1
          b.ne            1b
          ret
  endfunc
  // Fill an 8x8 block with the constant value 128.
  // In:  x0 = address of the first row, x1 = byte distance between rows;
  //      both are consumed by the store loop at .L_pred8x8_dc_end, which
  //      writes v0 to rows 0-3 and v1 to rows 4-7.
  function ff_pred8x8_128_dc_neon, export=1
          movi            v0.8b,  #128                // rows 0-3
          movi            v1.8b,  #128                // rows 4-7
          b               .L_pred8x8_dc_end
  endfunc
  // 8x8 DC prediction from the row above only, computed per 4-column half.
  // In:  x0 = address of the first row, x1 = byte distance between rows
  // With T[i] = byte at x0 - x1 + i:
  //   d0 = (T[0] + .. + T[3] + 2) >> 2  fills columns 0-3 of every row
  //   d1 = (T[4] + .. + T[7] + 2) >> 2  fills columns 4-7 of every row
  // Writes x2, v0-v2, then continues in the shared store loop.
  function ff_pred8x8_top_dc_neon, export=1
          sub             x2,  x0,  x1                // x2 -> row above the block
          ld1             {v0.8b},  [x2]              // v0 = T[0..7]
          uaddlp          v0.4h,  v0.8b               // pairwise sums
          addp            v0.4h,  v0.4h,  v0.4h       // h[0] = sum T[0..3], h[1] = sum T[4..7]
          zip1            v0.8h,  v0.8h,  v0.8h       // each sum duplicated
          rshrn           v2.8b,  v0.8h,  #2          // bytes d0, d0, d1, d1, ...
          zip1            v0.8b,  v2.8b,  v2.8b       // rows 0-3: d0 x4, d1 x4
          zip1            v1.8b,  v2.8b,  v2.8b       // rows 4-7: same pattern
          b               .L_pred8x8_dc_end
  endfunc
  // 8x8 DC prediction from the left column only, computed per 4-row half.
  // In:  x0 = address of the first row, x1 = byte distance between rows
  // With L[j] = byte at x0 + j*x1 - 1:
  //   rows 0-3 are filled with (L[0] + .. + L[3] + 2) >> 2
  //   rows 4-7 are filled with (L[4] + .. + L[7] + 2) >> 2
  // Writes x2, v0-v2, then continues in the shared store loop.
  function ff_pred8x8_left_dc_neon, export=1
          sub             x2,  x0,  #1                // x2 -> byte left of row 0
          ldcol.8         v0,  x2,  x1                // v0 = L[0..7]
          uaddlp          v0.4h,  v0.8b               // pairwise sums
          addp            v0.4h,  v0.4h,  v0.4h       // h[0] = sum L[0..3], h[1] = sum L[4..7]
          rshrn           v2.8b,  v0.8h,  #2          // b[0], b[1] = the two DC values
          dup             v1.8b,  v2.b[1]             // rows 4-7
          dup             v0.8b,  v2.b[0]             // rows 0-3
          b               .L_pred8x8_dc_end
  endfunc
  // 8x8 DC prediction from the row above and the left column, computed
  // separately for each 4x4 quadrant.
  // In:  x0 = address of the first row, x1 = byte distance between rows
  // With T0/T1 = sums of top bytes 0-3 / 4-7 and L0/L1 = sums of left bytes
  // of rows 0-3 / 4-7:
  //   rows 0-3: columns 0-3 = (T0 + L0 + 4) >> 3, columns 4-7 = (T1 + 2) >> 2
  //   rows 4-7: columns 0-3 = (L1 + 2) >> 2,      columns 4-7 = (T1 + L1 + 4) >> 3
  // Writes x2, x3, w3, v0-v7; x0 is advanced by 4 rows.
  //
  // .L_pred8x8_dc_end is also the branch target of the other 8x8 DC variants
  // in this file: on entry v0 is the 8-byte pattern for rows 0-3 and v1 the
  // pattern for rows 4-7.
  function ff_pred8x8_dc_neon, export=1
          sub             x2,  x0,  x1                // x2 -> row above the block
          sub             x3,  x0,  #1                // x3 -> byte left of row 0
          ld1             {v0.8b}, [x2]               // v0 = 8 top neighbours
          ldcol.8         v1,  x3,  x1                // v1 = 8 left neighbours
          uaddlp          v0.4h,  v0.8b               // pairwise sums, top
          uaddlp          v1.4h,  v1.8b               // pairwise sums, left
          trn1            v2.2s,  v0.2s,  v1.2s       // pairs of top 0-3, left 0-3
          trn2            v3.2s,  v0.2s,  v1.2s       // pairs of top 4-7, left 4-7
          addp            v4.4h,  v2.4h,  v3.4h       // v4 = T0, L0, T1, L1
          addp            v5.4h,  v4.4h,  v4.4h       // v5 = T0 + L0, T1 + L1, ...
          rshrn           v6.8b,  v5.8h,  #3          // two-edge DCs: (sum + 4) >> 3
          rshrn           v7.8b,  v4.8h,  #2          // one-edge DCs: (sum + 2) >> 2
          dup             v0.8b,  v6.b[0]             // (T0 + L0) DC
          dup             v2.8b,  v7.b[2]             // T1 DC
          dup             v1.8b,  v7.b[3]             // L1 DC
          dup             v3.8b,  v6.b[1]             // (T1 + L1) DC
          zip1            v0.2s,  v0.2s,  v2.2s       // rows 0-3 pattern
          zip1            v1.2s,  v1.2s,  v3.2s       // rows 4-7 pattern
  .L_pred8x8_dc_end:
          mov             w3,  #4                     // 4 iterations x 2 rows
          add             x2,  x0,  x1,  lsl #2       // x2 -> row 4
  6:      st1             {v0.8b},  [x0], x1          // rows 0-3
          st1             {v1.8b},  [x2], x1          // rows 4-7
          subs            w3,  w3,  #1
          b.ne            6b
          ret
  endfunc
  // 8x8 DC prediction from the full row above and only the upper four bytes
  // of the left column.
  // In:  x0 = address of the first row, x1 = byte distance between rows
  // With T0/T1 = sums of top bytes 0-3 / 4-7 and L0 = sum of the left bytes
  // of rows 0-3:
  //   rows 0-3: columns 0-3 = (T0 + L0 + 4) >> 3, columns 4-7 = (T1 + 2) >> 2
  //   rows 4-7: columns 0-3 = (T0 + 2) >> 2,      columns 4-7 = (T1 + 2) >> 2
  // Bytes 4-7 of v1 are never loaded; every result derived from them lands
  // in a lane that is not read afterwards.
  // Writes x2, x3, v0-v6, then continues in the shared store loop.
  function ff_pred8x8_l0t_dc_neon, export=1
          sub             x2,  x0,  x1                // x2 -> row above the block
          sub             x3,  x0,  #1                // x3 -> byte left of row 0
          ld1             {v0.8b},  [x2]              // v0 = 8 top neighbours
          ldcol.8         v1,  x3,  x1,  4            // v1 bytes 0-3 = left of rows 0-3
          zip1            v0.4s,  v0.4s,  v1.4s       // top 0-3, left 0-3, top 4-7, (unused)
          uaddlp          v0.8h,  v0.16b              // pairwise sums
          addp            v0.8h,  v0.8h,  v0.8h       // h[0..2] = T0, L0, T1
          addp            v1.4h,  v0.4h,  v0.4h       // h[0] = T0 + L0
          rshrn           v2.8b,  v0.8h,  #2          // one-edge DCs: (sum + 2) >> 2
          rshrn           v3.8b,  v1.8h,  #3          // two-edge DC: (sum + 4) >> 3
          dup             v4.8b,  v3.b[0]             // (T0 + L0) DC
          dup             v6.8b,  v2.b[2]             // T1 DC
          dup             v5.8b,  v2.b[0]             // T0 DC
          zip1            v0.2s,  v4.2s,  v6.2s       // rows 0-3 pattern
          zip1            v1.2s,  v5.2s,  v6.2s       // rows 4-7 pattern
          b               .L_pred8x8_dc_end
  endfunc
  // 8x8 DC prediction from only the upper four bytes of the left column.
  // In:  x0 = address of the first row, x1 = byte distance between rows
  //   rows 0-3 are filled with (sum of the left bytes of rows 0-3, plus 2) >> 2
  //   rows 4-7 are filled with 128
  // Bytes 4-7 of v0 are never loaded; sums that include them end up in
  // lanes that are not read afterwards.
  // Writes x2, v0, v1, then continues in the shared store loop.
  function ff_pred8x8_l00_dc_neon, export=1
          sub             x2,  x0,  #1                // x2 -> byte left of row 0
          ldcol.8         v0,  x2,  x1,  4            // v0 bytes 0-3 = left of rows 0-3
          uaddlp          v0.4h,  v0.8b               // pairwise sums
          addp            v0.4h,  v0.4h,  v0.4h       // h[0] = sum of the four bytes
          rshrn           v0.8b,  v0.8h,  #2          // b[0] = (sum + 2) >> 2
          movi            v1.8b,  #128                // rows 4-7
          dup             v0.8b,  v0.b[0]             // rows 0-3
          b               .L_pred8x8_dc_end
  endfunc
  // 8x8 DC prediction from the full row above and only the lower four bytes
  // of the left column.
  // In:  x0 = address of the first row, x1 = byte distance between rows
  // With T0/T1 = sums of top bytes 0-3 / 4-7 and L1 = sum of the left bytes
  // of rows 4-7:
  //   rows 0-3: columns 0-3 = (T0 + 2) >> 2, columns 4-7 = (T1 + 2) >> 2
  //   rows 4-7: columns 0-3 = (L1 + 2) >> 2, columns 4-7 = (T1 + L1 + 4) >> 3
  // Bytes 0-3 of v1 are never loaded; every result derived from them lands
  // in a lane that is not read afterwards.
  // Writes x2, x3, v0-v7, then continues in the shared store loop.
  function ff_pred8x8_0lt_dc_neon, export=1
          add             x3,  x0,  x1,  lsl #2       // x3 -> row 4
          sub             x2,  x0,  x1                // x2 -> row above the block
          sub             x3,  x3,  #1                // x3 -> byte left of row 4
          ld1             {v0.8b},  [x2]              // v0 = 8 top neighbours
          ldcol.8         v1,  x3,  x1,  4,  hi=1     // v1 bytes 4-7 = left of rows 4-7
          zip1            v0.4s,  v0.4s,  v1.4s       // top 0-3, (unused), top 4-7, left 4-7
          uaddlp          v0.8h,  v0.16b              // pairwise sums
          addp            v0.8h,  v0.8h,  v0.8h       // h[0] = T0, h[2] = T1, h[3] = L1
          addp            v1.4h,  v0.4h,  v0.4h       // h[1] = T1 + L1
          rshrn           v2.8b,  v0.8h,  #2          // one-edge DCs: (sum + 2) >> 2
          rshrn           v3.8b,  v1.8h,  #3          // two-edge DC: (sum + 4) >> 3
          dup             v4.8b,  v2.b[0]             // T0 DC
          dup             v5.8b,  v2.b[3]             // L1 DC
          dup             v6.8b,  v2.b[2]             // T1 DC
          dup             v7.8b,  v3.b[1]             // (T1 + L1) DC
          zip1            v0.2s,  v4.2s,  v6.2s       // rows 0-3 pattern
          zip1            v1.2s,  v5.2s,  v7.2s       // rows 4-7 pattern
          b               .L_pred8x8_dc_end
  endfunc
  // 8x8 DC prediction from only the lower four bytes of the left column.
  // In:  x0 = address of the first row, x1 = byte distance between rows
  //   rows 0-3 are filled with 128
  //   rows 4-7 are filled with (sum of the left bytes of rows 4-7, plus 2) >> 2
  // Bytes 4-7 of v1 are never loaded; sums that include them end up in
  // lanes that are not read afterwards.
  // Writes x2, v0-v2, then continues in the shared store loop.
  function ff_pred8x8_0l0_dc_neon, export=1
          add             x2,  x0,  x1,  lsl #2       // x2 -> row 4
          sub             x2,  x2,  #1                // x2 -> byte left of row 4
          ldcol.8         v1,  x2,  x1,  4            // v1 bytes 0-3 = left of rows 4-7
          uaddlp          v2.4h,  v1.8b               // pairwise sums
          addp            v2.4h,  v2.4h,  v2.4h       // h[0] = sum of the four bytes
          rshrn           v1.8b,  v2.8h,  #2          // b[0] = (sum + 2) >> 2
          movi            v0.8b,  #128                // rows 0-3
          dup             v1.8b,  v1.b[0]             // rows 4-7
          b               .L_pred8x8_dc_end
  endfunc