You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

396 lines
11KB

  1. /*
  2. * ARM NEON optimised MC functions for HEVC decoding
  3. *
  4. * Copyright (c) 2017 Alexandra Hájková
  5. *
  6. * This file is part of Libav.
  7. *
  8. * Libav is free software; you can redistribute it and/or
  9. * modify it under the terms of the GNU Lesser General Public
  10. * License as published by the Free Software Foundation; either
  11. * version 2.1 of the License, or (at your option) any later version.
  12. *
  13. * Libav is distributed in the hope that it will be useful,
  14. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16. * Lesser General Public License for more details.
  17. *
  18. * You should have received a copy of the GNU Lesser General Public
  19. * License along with Libav; if not, write to the Free Software
  20. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21. */
  22. #include "libavutil/arm/asm.S"
  @ get_pixels4 bitdepth
  @
  @ Emits ff_hevc_get_pixels_4_<bitdepth>_neon: copies a 4 pixel wide block
  @ into a buffer of 16-bit samples, shifting every sample left on the way
  @ (by 6 for bitdepth 8, by 4 for any other bitdepth value).
  @ NOTE(review): 8+6 = 10+4 = 14, so this presumably scales to a 14-bit
  @ intermediate; confirm against the C reference.
  @
  @ In:    r0   = dst          r1 = dst stride in bytes
  @        r2   = src          r3 = src stride in bytes
  @        [sp] = height
  @ Clobb: r0, r2, r12, flags, d0-d3, q8-q11
  @
  @ Rows are handled four at a time, so a height that is not a multiple of
  @ four is effectively rounded up. Every store asserts 64-bit alignment
  @ of the destination address.
  .macro get_pixels4 bitdepth
  function ff_hevc_get_pixels_4_\bitdepth\()_neon, export=1
          ldr             r12, [sp]                   @ r12 = height
          cmp             r12, #0
          it              eq
          bxeq            lr                          @ height == 0: nothing to do
  1:
  .if \bitdepth == 8
          @ four rows of 4 bytes each, one 32-bit lane per row
          vld1.32         {d0[0]}, [r2], r3
          vld1.32         {d1[0]}, [r2], r3
          vld1.32         {d2[0]}, [r2], r3
          vld1.32         {d3[0]}, [r2], r3
          @ widen u8 to 16 bit and shift; only the low half of each q is stored
          vshll.u8        q8,  d0, #6
          vshll.u8        q9,  d1, #6
          vshll.u8        q10, d2, #6
          vshll.u8        q11, d3, #6
  .else
          @ four rows of 4 halfwords each
          vld1.16         {d0}, [r2], r3
          vld1.16         {d1}, [r2], r3
          vld1.16         {d2}, [r2], r3
          vld1.16         {d3}, [r2], r3
          vshl.i16        d16, d0, #4
          vshl.i16        d18, d1, #4
          vshl.i16        d20, d2, #4
          vshl.i16        d22, d3, #4
  .endif
          @ 4 x 16 bit = 8 bytes per output row
          vst1.16         {d16}, [r0, :64], r1
          vst1.16         {d18}, [r0, :64], r1
          vst1.16         {d20}, [r0, :64], r1
          vst1.16         {d22}, [r0, :64], r1
          subs            r12, r12, #4                @ four rows done
          bgt             1b
          bx              lr
  endfunc
  .endm
  @ get_pixels8 bitdepth
  @
  @ Emits ff_hevc_get_pixels_8_<bitdepth>_neon: copies an 8 pixel wide block
  @ into a buffer of 16-bit samples, shifting every sample left on the way
  @ (by 6 for bitdepth 8, by 4 for any other bitdepth value).
  @
  @ In:    r0   = dst          r1 = dst stride in bytes
  @        r2   = src          r3 = src stride in bytes
  @        [sp] = height
  @ Clobb: r0, r2, r12, flags, d0-d3 (8-bit path only), q8-q11
  @
  @ Four rows per loop iteration; a height that is not a multiple of four
  @ is effectively rounded up. Stores assert 64-bit destination alignment.
  .macro get_pixels8 bitdepth
  function ff_hevc_get_pixels_8_\bitdepth\()_neon, export=1
          ldr             r12, [sp]                   @ r12 = height
          cmp             r12, #0
          it              eq
          bxeq            lr                          @ height == 0: nothing to do
  1:
  .if \bitdepth == 8
          @ four rows of 8 bytes
          vld1.8          {d0}, [r2], r3
          vld1.8          {d1}, [r2], r3
          vld1.8          {d2}, [r2], r3
          vld1.8          {d3}, [r2], r3
          @ widen u8 to 16 bit while shifting
          vshll.u8        q8,  d0, #6
          vshll.u8        q9,  d1, #6
          vshll.u8        q10, d2, #6
          vshll.u8        q11, d3, #6
  .else
          @ four rows of 8 halfwords, loaded straight into the output registers
          vld1.16         {d16-d17}, [r2], r3
          vld1.16         {d18-d19}, [r2], r3
          vld1.16         {d20-d21}, [r2], r3
          vld1.16         {d22-d23}, [r2], r3
          vshl.i16        q8,  q8,  #4
          vshl.i16        q9,  q9,  #4
          vshl.i16        q10, q10, #4
          vshl.i16        q11, q11, #4
  .endif
          @ 8 x 16 bit = 16 bytes per output row
          vst1.16         {d16-d17}, [r0, :64], r1
          vst1.16         {d18-d19}, [r0, :64], r1
          vst1.16         {d20-d21}, [r0, :64], r1
          vst1.16         {d22-d23}, [r0, :64], r1
          subs            r12, r12, #4                @ four rows done
          bgt             1b
          bx              lr
  endfunc
  .endm
  @ get_pixels12 bitdepth
  @
  @ Emits ff_hevc_get_pixels_12_<bitdepth>_neon: copies a 12 pixel wide
  @ block into a buffer of 16-bit samples, shifting every sample left on
  @ the way (by 6 for bitdepth 8, by 4 for any other bitdepth value).
  @ Each row is split into a left part of 8 pixels and a right part of
  @ 4 pixels, which are walked with separate pointer pairs.
  @
  @ In:    r0   = dst          r1 = dst stride in bytes
  @        r2   = src          r3 = src stride in bytes
  @        [sp] = height
  @ Clobb: r0, r2, r12, flags, d0-d7 (8-bit path only), q8-q15
  @        r4 and r5 are used but saved and restored.
  @
  @ Four rows per loop iteration; a height that is not a multiple of four
  @ is effectively rounded up. Stores assert 64-bit destination alignment.
  .macro get_pixels12 bitdepth
  function ff_hevc_get_pixels_12_\bitdepth\()_neon, export=1
          ldr             r12, [sp]                   @ r12 = height, read before the push moves sp
          cmp             r12, #0
          it              eq
          bxeq            lr                          @ height == 0: nothing to do
          push            {r4-r5, lr}
          add             r4, r0, #16                 @ r4 = dst of the right 4 pixels (8 samples * 2 bytes in)
          @ r5 = src of the right 4 pixels. It is set up once here: r2 and r5
          @ are both post-incremented by r3 four times per iteration, so r5
          @ keeps its offset from r2 without being recomputed in the loop.
  .if \bitdepth == 8
          add             r5, r2, #8
  .else
          add             r5, r2, #16
  .endif
  1:
  .if \bitdepth == 8
          vld1.8          {d0},    [r2], r3           @ row 0, left 8 pixels
          vld1.32         {d4[0]}, [r5], r3           @ row 0, right 4 pixels
          vld1.8          {d1},    [r2], r3
          vld1.32         {d5[0]}, [r5], r3
          vld1.8          {d2},    [r2], r3
          vld1.32         {d6[0]}, [r5], r3
          vld1.8          {d3},    [r2], r3
          vld1.32         {d7[0]}, [r5], r3
          @ widen u8 to 16 bit while shifting; for the right part only the
          @ low half of q12-q15 (d24, d26, d28, d30) is stored below
          vshll.u8        q8,  d0, #6
          vshll.u8        q12, d4, #6
          vshll.u8        q9,  d1, #6
          vshll.u8        q13, d5, #6
          vshll.u8        q10, d2, #6
          vshll.u8        q14, d6, #6
          vshll.u8        q11, d3, #6
          vshll.u8        q15, d7, #6
  .else
          vld1.16         {d16-d17}, [r2], r3         @ row 0, left 8 pixels
          vld1.16         {d24},     [r5], r3         @ row 0, right 4 pixels
          vld1.16         {d18-d19}, [r2], r3
          vld1.16         {d26},     [r5], r3
          vld1.16         {d20-d21}, [r2], r3
          vld1.16         {d28},     [r5], r3
          vld1.16         {d22-d23}, [r2], r3
          vld1.16         {d30},     [r5], r3
          vshl.i16        q8,  q8,  #4
          vshl.i16        d24, d24, #4
          vshl.i16        q9,  q9,  #4
          vshl.i16        d26, d26, #4
          vshl.i16        q10, q10, #4
          vshl.i16        d28, d28, #4
          vshl.i16        q11, q11, #4
          vshl.i16        d30, d30, #4
  .endif
          vst1.16         {d16-d17}, [r0, :64], r1
          vst1.16         {d24},     [r4, :64], r1
          vst1.16         {d18-d19}, [r0, :64], r1
          vst1.16         {d26},     [r4, :64], r1
          vst1.16         {d20-d21}, [r0, :64], r1
          vst1.16         {d28},     [r4, :64], r1
          vst1.16         {d22-d23}, [r0, :64], r1
          vst1.16         {d30},     [r4, :64], r1
          subs            r12, r12, #4                @ four rows done
          bgt             1b
          pop             {r4-r5, pc}
  endfunc
  .endm
  @ process_8 load  (8 bitdepth case)
  @
  @ Loads four rows of 16 bytes through the pointer register given as
  @ "load" (post-incremented by r3 after each row), widens every byte to
  @ 16 bit and shifts it left by 6.
  @
  @ Out:   row 0 in q8-q9, row 1 in q10-q11, row 2 in q12-q13,
  @        row 3 in q14-q15 (same layout store_4x16 writes out)
  @ Clobb: q0-q3, the pointer register
  .macro process_8 load
          vld1.8          {d0-d1}, [\load], r3
          vld1.8          {d2-d3}, [\load], r3
          vld1.8          {d4-d5}, [\load], r3
          vld1.8          {d6-d7}, [\load], r3
          vshll.u8        q8,  d0, #6
          vshll.u8        q9,  d1, #6
          vshll.u8        q10, d2, #6
          vshll.u8        q11, d3, #6
          vshll.u8        q12, d4, #6
          vshll.u8        q13, d5, #6
          vshll.u8        q14, d6, #6
          vshll.u8        q15, d7, #6
  .endm
  @ process_10 load  (10 bitdepth case)
  @
  @ Loads four rows of 16 halfwords (32 bytes) through the pointer register
  @ given as "load" (post-incremented by r3 after each row) and shifts
  @ every 16-bit lane left by 4, in place.
  @
  @ Out:   row 0 in q8-q9, row 1 in q10-q11, row 2 in q12-q13,
  @        row 3 in q14-q15 (same layout store_4x16 writes out)
  @ Clobb: the pointer register
  .macro process_10 load
          vld1.16         {d16-d19}, [\load], r3
          vld1.16         {d20-d23}, [\load], r3
          vld1.16         {d24-d27}, [\load], r3
          vld1.16         {d28-d31}, [\load], r3
          vshl.i16        q8,  q8,  #4
          vshl.i16        q9,  q9,  #4
          vshl.i16        q10, q10, #4
          vshl.i16        q11, q11, #4
          vshl.i16        q12, q12, #4
          vshl.i16        q13, q13, #4
          vshl.i16        q14, q14, #4
          vshl.i16        q15, q15, #4
  .endm
  @ store_4x16 store
  @
  @ Writes q8-q15 as four rows of 16 halfwords (32 bytes each) through the
  @ pointer register given as "store", post-incrementing it by r1 after
  @ each row. Every store asserts 128-bit alignment of the address.
  .macro store_4x16 store
          vst1.16         {d16-d19}, [\store, :128], r1   @ row 0
          vst1.16         {d20-d23}, [\store, :128], r1   @ row 1
          vst1.16         {d24-d27}, [\store, :128], r1   @ row 2
          vst1.16         {d28-d31}, [\store, :128], r1   @ row 3
  .endm
  @ get_pixels16 bitdepth
  @
  @ Emits ff_hevc_get_pixels_16_<bitdepth>_neon: copies a 16 pixel wide
  @ block into a buffer of 16-bit samples using process_8 (bitdepth 8) or
  @ process_10 (any other bitdepth value) followed by store_4x16.
  @
  @ In:    r0   = dst          r1 = dst stride in bytes
  @        r2   = src          r3 = src stride in bytes
  @        [sp] = height
  @ Clobb: r0, r2, r12, flags, q8-q15, plus q0-q3 on the 8-bit path
  @
  @ Four rows per loop iteration; a height that is not a multiple of four
  @ is effectively rounded up.
  .macro get_pixels16 bitdepth
  function ff_hevc_get_pixels_16_\bitdepth\()_neon, export=1
          ldr             r12, [sp]                   @ r12 = height
          cmp             r12, #0
          it              eq
          bxeq            lr                          @ height == 0: nothing to do
  1:
  .if \bitdepth == 8
          process_8       r2
  .else
          process_10      r2
  .endif
          store_4x16      r0
          subs            r12, r12, #4                @ four rows done
          bgt             1b
          bx              lr
  endfunc
  .endm
  @ get_pixels24 bitdepth
  @
  @ Emits ff_hevc_get_pixels_24_<bitdepth>_neon. The block is handled as an
  @ 8 pixel wide column (a real call to the 8-wide function) followed by
  @ a 16 pixel wide column (a tail call to the 16-wide function).
  @
  @ In:    r0   = dst          r1 = dst stride in bytes
  @        r2   = src          r3 = src stride in bytes
  @        [sp] = height
  .macro get_pixels24 bitdepth
  function ff_hevc_get_pixels_24_\bitdepth\()_neon, export=1
          ldr             r12, [sp]                   @ r12 = height
          cmp             r12, #0
          it              eq
          bxeq            lr                          @ height == 0: nothing to do
          @ Save the arguments and lr across the call, then push the height
          @ on its own so that it ends up at [sp], which is where the callee
          @ reads it from. Seven words are pushed in total.
          push            {r0-r4, lr}
          push            {r12}
          bl              X(ff_hevc_get_pixels_8_\bitdepth\()_neon)
          pop             {r12}
          pop             {r0-r4, lr}
          @ step past the 8 pixels that are already done
  .if \bitdepth == 8
          add             r2, r2, #8                  @ 8 source bytes
  .else
          add             r2, r2, #16                 @ 8 source halfwords
  .endif
          add             r0, r0, #16                 @ 8 destination halfwords
          @ sp is back at its entry value, so the tail-called function
          @ finds the height at [sp] again
          b               X(ff_hevc_get_pixels_16_\bitdepth\()_neon)
  endfunc
  .endm
  @ get_pixels32 bitdepth
  @
  @ Emits ff_hevc_get_pixels_32_<bitdepth>_neon: copies a 32 pixel wide
  @ block into a buffer of 16-bit samples as two columns of 16 pixels,
  @ each with its own source and destination pointer.
  @
  @ In:    r0   = dst          r1 = dst stride in bytes
  @        r2   = src          r3 = src stride in bytes
  @        [sp] = height
  @ Clobb: r0, r2, r12, flags, q8-q15, plus q0-q3 on the 8-bit path
  @        r4 and r5 are used but saved and restored.
  @
  @ Four rows per loop iteration; a height that is not a multiple of four
  @ is effectively rounded up.
  .macro get_pixels32 bitdepth
  function ff_hevc_get_pixels_32_\bitdepth\()_neon, export=1
          ldr             r12, [sp]                   @ r12 = height, read before the push moves sp
          cmp             r12, #0
          it              eq
          bxeq            lr                          @ height == 0: nothing to do
          push            {r4-r5, lr}
          @ r4 = src of the second column, 16 pixels further on
  .if \bitdepth == 8
          add             r4, r2, #16
  .else
          add             r4, r2, #32
  .endif
          add             r5, r0, #32                 @ r5 = dst of the second column (16 halfwords on)
  1:
          @ first column
  .if \bitdepth == 8
          process_8       r2
  .else
          process_10      r2
  .endif
          store_4x16      r0
          @ second column
  .if \bitdepth == 8
          process_8       r4
  .else
          process_10      r4
  .endif
          store_4x16      r5
          subs            r12, r12, #4                @ four rows done
          bgt             1b
          pop             {r4-r5, pc}
  endfunc
  .endm
  @ get_pixels48 bitdepth
  @
  @ Emits ff_hevc_get_pixels_48_<bitdepth>_neon. The block is handled as a
  @ 16 pixel wide column (a real call to the 16-wide function) followed by
  @ a 32 pixel wide column (a tail call to the 32-wide function).
  @
  @ In:    r0   = dst          r1 = dst stride in bytes
  @        r2   = src          r3 = src stride in bytes
  @        [sp] = height
  .macro get_pixels48 bitdepth
  function ff_hevc_get_pixels_48_\bitdepth\()_neon, export=1
          ldr             r12, [sp]                   @ r12 = height
          cmp             r12, #0
          it              eq
          bxeq            lr                          @ height == 0: nothing to do
          @ Save the arguments and lr across the call, then push the height
          @ on its own so that it ends up at [sp], which is where the callee
          @ reads it from. Seven words are pushed in total.
          push            {r0-r4, lr}
          push            {r12}
          bl              X(ff_hevc_get_pixels_16_\bitdepth\()_neon)
          pop             {r12}
          pop             {r0-r4, lr}
          @ step past the 16 pixels that are already done
  .if \bitdepth == 8
          add             r2, r2, #16                 @ 16 source bytes
  .else
          add             r2, r2, #32                 @ 16 source halfwords
  .endif
          add             r0, r0, #32                 @ 16 destination halfwords
          @ sp is back at its entry value, so the tail-called function
          @ finds the height at [sp] again
          b               X(ff_hevc_get_pixels_32_\bitdepth\()_neon)
  endfunc
  .endm
  @ get_pixels64 bitdepth
  @
  @ Emits ff_hevc_get_pixels_64_<bitdepth>_neon: copies a 64 pixel wide
  @ block into a buffer of 16-bit samples as four columns of 16 pixels,
  @ each with its own source and destination pointer:
  @
  @   column   src   dst
  @     0      r2    r0
  @     1      r4    r5
  @     2      r6    r7
  @     3      r8    r9
  @
  @ In:    r0   = dst          r1 = dst stride in bytes
  @        r2   = src          r3 = src stride in bytes
  @        [sp] = height
  @ Clobb: r0, r2, r12, flags, q8-q15, plus q0-q3 on the 8-bit path
  @        r4-r9 are used but saved and restored.
  @
  @ Four rows per loop iteration; a height that is not a multiple of four
  @ is effectively rounded up.
  .macro get_pixels64 bitdepth
  function ff_hevc_get_pixels_64_\bitdepth\()_neon, export=1
          ldr             r12, [sp]                   @ r12 = height, read before the push moves sp
          cmp             r12, #0
          it              eq
          bxeq            lr                          @ height == 0: nothing to do
          push            {r4-r9, lr}
          @ source pointers, 16 pixels apart
  .if \bitdepth == 8
          add             r4, r2, #16
          add             r6, r2, #32
          add             r8, r2, #48
  .else
          add             r4, r2, #32
          add             r6, r2, #64
          add             r8, r2, #96
  .endif
          @ destination pointers, 16 halfwords (32 bytes) apart
          add             r5, r0, #32
          add             r7, r0, #64
          add             r9, r0, #96
  1:
          @ column 0
  .if \bitdepth == 8
          process_8       r2
  .else
          process_10      r2
  .endif
          store_4x16      r0
          @ column 1
  .if \bitdepth == 8
          process_8       r4
  .else
          process_10      r4
  .endif
          store_4x16      r5
          @ column 2
  .if \bitdepth == 8
          process_8       r6
  .else
          process_10      r6
  .endif
          store_4x16      r7
          @ column 3
  .if \bitdepth == 8
          process_8       r8
  .else
          process_10      r8
  .endif
          store_4x16      r9
          subs            r12, r12, #4                @ four rows done
          bgt             1b
          pop             {r4-r9, pc}
  endfunc
  .endm
  @ Instantiate every block width for bitdepth 8 and bitdepth 10.
  @ The 24 and 48 wide variants branch to the 8/16 and 16/32 wide
  @ functions emitted here.
  get_pixels4     8
  get_pixels4     10

  get_pixels8     8
  get_pixels8     10

  get_pixels12    8
  get_pixels12    10

  get_pixels16    8
  get_pixels16    10

  get_pixels24    8
  get_pixels24    10

  get_pixels32    8
  get_pixels32    10

  get_pixels48    8
  get_pixels48    10

  get_pixels64    8
  get_pixels64    10