You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

403 lines
14KB

  1. /*
  2. * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  3. * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
  4. *
  5. * This file is part of Libav.
  6. *
  7. * Libav is free software; you can redistribute it and/or
  8. * modify it under the terms of the GNU Lesser General Public
  9. * License as published by the Free Software Foundation; either
  10. * version 2.1 of the License, or (at your option) any later version.
  11. *
  12. * Libav is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. * Lesser General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU Lesser General Public
  18. * License along with Libav; if not, write to the Free Software
  19. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. */
  21. #include "libavutil/aarch64/asm.S"
  22. /* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
/* 8-pixel-wide chroma MC.  Each output pixel is the bilinear blend
 *     A*s[0] + B*s[1] + C*s[stride] + D*s[stride+1]
 * scaled down by 6 bits, with weights derived from the fractional
 * position (x = w4, y = w5):
 *     A = (8-x)(8-y), B = x(8-y), C = (8-x)y, D = xy
 * \type  = put (store result) or avg (rounding-average with dst)
 * \codec = h264 (round via rshrn #6), rv40 (add table bias from v22,
 *          truncate), or vc1 (add constant 28 bias, truncate).
 */
.macro  h264_chroma_mc8 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
        sxtw            x2,  w2                 // stride: int -> 64-bit
  .ifc \type,avg
        mov             x8,  x0                 // keep dst pointer for avg loads
  .endif
        prfm            pldl1strm, [x1]
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,rv40
        // bias table lookup at byte offset (y>>1)*8 + (x>>1)*2,
        // broadcast the 16-bit entry to every lane of v22
        movrel          x6,  rv40bias
        lsr             w9,  w5,  #1
        lsr             w10, w4,  #1
        lsl             w9,  w9,  #3
        lsl             w10, w10, #1
        add             w9,  w9,  w10
        add             x6,  x6,  w9, UXTW
        ld1r            {v22.8H}, [x6]
  .endif
  .ifc \codec,vc1
        movi            v22.8H, #28             // vc1: fixed bias
  .endif
        // derive the four bilinear weights from x (w4) and y (w5)
        mul             w7,  w4,  w5            // w7  = D = x*y
        lsl             w14, w5,  #3            // w14 = 8*y
        lsl             w13, w4,  #3            // w13 = 8*x
        cmp             w7,  #0
        sub             w6,  w14, w7            // w6  = C = (8-x)*y
        sub             w12, w13, w7            // w12 = B = x*(8-y)
        sub             w4,  w7,  w13
        sub             w4,  w4,  w14
        add             w4,  w4,  #64           // w4  = A = (8-x)*(8-y)
        b.eq            2f                      // D == 0: 1-D / copy fast paths

        // full 2-D interpolation, two output rows per iteration;
        // v5/v7 are the source rows shifted left one byte (s[1] terms)
        dup             v0.8B,  w4
        dup             v1.8B,  w12
        ld1             {v4.8B, v5.8B}, [x1], x2
        dup             v2.8B,  w6
        dup             v3.8B,  w7
        ext             v5.8B,  v4.8B,  v5.8B,  #1
1:      ld1             {v6.8B, v7.8B}, [x1], x2
        umull           v16.8H, v4.8B,  v0.8B   // row n:   A*s[0]
        umlal           v16.8H, v5.8B,  v1.8B   //        + B*s[1]
        ext             v7.8B,  v6.8B,  v7.8B,  #1
        ld1             {v4.8B, v5.8B}, [x1], x2
        umlal           v16.8H, v6.8B,  v2.8B   //        + C*s[stride]
        prfm            pldl1strm, [x1]
        ext             v5.8B,  v4.8B,  v5.8B,  #1
        umlal           v16.8H, v7.8B,  v3.8B   //        + D*s[stride+1]
        umull           v17.8H, v6.8B,  v0.8B   // row n+1 reuses the middle row
        subs            w3,  w3,  #2            // h -= 2
        umlal           v17.8H, v7.8B,  v1.8B
        umlal           v17.8H, v4.8B,  v2.8B
        umlal           v17.8H, v5.8B,  v3.8B
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,h264
        rshrn           v16.8B, v16.8H, #6      // h264: round to nearest
        rshrn           v17.8B, v17.8H, #6
  .else
        add             v16.8H, v16.8H, v22.8H  // rv40/vc1: bias then truncate
        add             v17.8H, v17.8H, v22.8H
        shrn            v16.8B, v16.8H, #6
        shrn            v17.8B, v17.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.8B}, [x8], x2
        ld1             {v21.8B}, [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B  // rounding average with dst
        urhadd          v17.8B, v17.8B, v21.8B
  .endif
        st1             {v16.8B}, [x0], x2
        st1             {v17.8B}, [x0], x2
        b.gt            1b
        ret

        // D == 0: either x == 0 or y == 0, so the filter is 1-D.
2:      tst             w6,  w6                 // C == 0 ⇒ y == 0 ⇒ horizontal
        add             w12, w12, w6            // fold B+C into one 1-D weight
        dup             v0.8B,  w4
        dup             v1.8B,  w12
        b.eq            4f

        // vertical-only (x == 0): blend s[0] and s[stride]
        ld1             {v4.8B}, [x1], x2
3:      ld1             {v6.8B}, [x1], x2
        umull           v16.8H, v4.8B,  v0.8B
        umlal           v16.8H, v6.8B,  v1.8B
        ld1             {v4.8B}, [x1], x2
        umull           v17.8H, v6.8B,  v0.8B
        umlal           v17.8H, v4.8B,  v1.8B
        prfm            pldl1strm, [x1]
  .ifc \codec,h264
        rshrn           v16.8B, v16.8H, #6
        rshrn           v17.8B, v17.8H, #6
  .else
        add             v16.8H, v16.8H, v22.8H
        add             v17.8H, v17.8H, v22.8H
        shrn            v16.8B, v16.8H, #6
        shrn            v17.8B, v17.8H, #6
  .endif
        prfm            pldl1strm, [x1, x2]
  .ifc \type,avg
        ld1             {v20.8B}, [x8], x2
        ld1             {v21.8B}, [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
        urhadd          v17.8B, v17.8B, v21.8B
  .endif
        subs            w3,  w3,  #2
        st1             {v16.8B}, [x0], x2
        st1             {v17.8B}, [x0], x2
        b.gt            3b
        ret

        // horizontal-only (y == 0): blend s[0] and s[1]
        // (also handles x == y == 0, where v1 is all zero → plain copy)
4:      ld1             {v4.8B, v5.8B}, [x1], x2
        ld1             {v6.8B, v7.8B}, [x1], x2
        ext             v5.8B,  v4.8B,  v5.8B,  #1
        ext             v7.8B,  v6.8B,  v7.8B,  #1
        prfm            pldl1strm, [x1]
        subs            w3,  w3,  #2
        umull           v16.8H, v4.8B,  v0.8B
        umlal           v16.8H, v5.8B,  v1.8B
        umull           v17.8H, v6.8B,  v0.8B
        umlal           v17.8H, v7.8B,  v1.8B
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,h264
        rshrn           v16.8B, v16.8H, #6
        rshrn           v17.8B, v17.8H, #6
  .else
        add             v16.8H, v16.8H, v22.8H
        add             v17.8H, v17.8H, v22.8H
        shrn            v16.8B, v16.8H, #6
        shrn            v17.8B, v17.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.8B}, [x8], x2
        ld1             {v21.8B}, [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
        urhadd          v17.8B, v17.8B, v21.8B
  .endif
        st1             {v16.8B}, [x0], x2
        st1             {v17.8B}, [x0], x2
        b.gt            4b
        ret
endfunc
.endm
  160. /* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
/* 4-pixel-wide chroma MC.  Same bilinear filter as mc8, but two
 * 4-pixel rows are packed side by side into a single 8-byte vector
 * (via trn1 .2S), so one umull/umlal pair covers both rows; the
 * trn1/trn2 .2D + add sequence then folds the two partial products
 * of each row together before narrowing.
 * \type / \codec behave exactly as in h264_chroma_mc8.
 */
.macro  h264_chroma_mc4 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
        sxtw            x2,  w2                 // stride: int -> 64-bit
  .ifc \type,avg
        mov             x8,  x0                 // keep dst pointer for avg loads
  .endif
        prfm            pldl1strm, [x1]
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,rv40
        // bias table lookup at byte offset (y>>1)*8 + (x>>1)*2
        movrel          x6,  rv40bias
        lsr             w9,  w5,  #1
        lsr             w10, w4,  #1
        lsl             w9,  w9,  #3
        lsl             w10, w10, #1
        add             w9,  w9,  w10
        add             x6,  x6,  w9, UXTW
        ld1r            {v22.8H}, [x6]
  .endif
  .ifc \codec,vc1
        movi            v22.8H, #28             // vc1: fixed bias
  .endif
        // bilinear weights, as in mc8: A=w4, B=w12, C=w6, D=w7
        mul             w7,  w4,  w5            // D = x*y
        lsl             w14, w5,  #3
        lsl             w13, w4,  #3
        cmp             w7,  #0
        sub             w6,  w14, w7            // C = (8-x)*y
        sub             w12, w13, w7            // B = x*(8-y)
        sub             w4,  w7,  w13
        sub             w4,  w4,  w14
        add             w4,  w4,  #64           // A = (8-x)*(8-y)
        b.eq            2f                      // D == 0: 1-D / copy paths

        // full 2-D path: pack {A,B} and {C,D} weights into the low/high
        // 32-bit halves of v0/v2; pack src row and its 1-byte shift the
        // same way, so lanes line up as A*s[i] | B*s[i+1]
        dup             v24.8B, w4
        dup             v25.8B, w12
        ld1             {v4.8B}, [x1], x2
        dup             v26.8B, w6
        dup             v27.8B, w7
        ext             v5.8B,  v4.8B,  v5.8B,  #1
        trn1            v0.2S,  v24.2S, v25.2S  // v0 = {A..., B...}
        trn1            v2.2S,  v26.2S, v27.2S  // v2 = {C..., D...}
        trn1            v4.2S,  v4.2S,  v5.2S   // v4 = {s[0..3], s[1..4]}
1:      ld1             {v6.8B}, [x1], x2
        ext             v7.8B,  v6.8B,  v7.8B,  #1
        trn1            v6.2S,  v6.2S,  v7.2S
        umull           v18.8H, v4.8B,  v0.8B   // row n: A/B partial products
        umlal           v18.8H, v6.8B,  v2.8B   //      + C/D from next row
        ld1             {v4.8B}, [x1], x2
        ext             v5.8B,  v4.8B,  v5.8B,  #1
        trn1            v4.2S,  v4.2S,  v5.2S
        prfm            pldl1strm, [x1]
        umull           v19.8H, v6.8B,  v0.8B   // row n+1, reusing middle row
        umlal           v19.8H, v4.8B,  v2.8B
        trn1            v30.2D, v18.2D, v19.2D  // gather A/C halves of both rows
        trn2            v31.2D, v18.2D, v19.2D  // gather B/D halves
        add             v18.8H, v30.8H, v31.8H  // full sums, both rows in one reg
  .ifc \codec,h264
        rshrn           v16.8B, v18.8H, #6      // h264: round to nearest
  .else
        add             v18.8H, v18.8H, v22.8H  // rv40/vc1: bias then truncate
        shrn            v16.8B, v18.8H, #6
  .endif
        subs            w3,  w3,  #2            // h -= 2
        prfm            pldl1strm, [x1, x2]
  .ifc \type,avg
        ld1             {v20.S}[0], [x8], x2
        ld1             {v20.S}[1], [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B  // rounding average with dst
  .endif
        st1             {v16.S}[0], [x0], x2
        st1             {v16.S}[1], [x0], x2
        b.gt            1b
        ret

        // D == 0: 1-D filtering
2:      tst             w6,  w6                 // C == 0 ⇒ y == 0 ⇒ horizontal
        add             w12, w12, w6            // fold B+C into one weight
        dup             v30.8B, w4
        dup             v31.8B, w12
        trn1            v0.2S,  v30.2S, v31.2S  // v0 = {A..., (B+C)...}
        trn2            v1.2S,  v30.2S, v31.2S
        b.eq            4f

        // vertical-only (x == 0); v1 = v0 rotated so the weight pairing
        // matches the row pairing of the two interleaved loads below
        ext             v1.8B,  v0.8B,  v1.8B,  #4
        ld1             {v4.S}[0], [x1], x2
3:      ld1             {v4.S}[1], [x1], x2     // v4 = {row n, row n+1}
        umull           v18.8H, v4.8B,  v0.8B
        ld1             {v4.S}[0], [x1], x2     // v4 = {row n+2, row n+1}
        umull           v19.8H, v4.8B,  v1.8B
        trn1            v30.2D, v18.2D, v19.2D
        trn2            v31.2D, v18.2D, v19.2D
        add             v18.8H, v30.8H, v31.8H
        prfm            pldl1strm, [x1]
  .ifc \codec,h264
        rshrn           v16.8B, v18.8H, #6
  .else
        add             v18.8H, v18.8H, v22.8H
        shrn            v16.8B, v18.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.S}[0], [x8], x2
        ld1             {v20.S}[1], [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
  .endif
        subs            w3,  w3,  #2
        prfm            pldl1strm, [x1, x2]
        st1             {v16.S}[0], [x0], x2
        st1             {v16.S}[1], [x0], x2
        b.gt            3b
        ret

        // horizontal-only (y == 0); also the x == y == 0 copy case,
        // where the second weight half is zero
4:      ld1             {v4.8B}, [x1], x2
        ld1             {v6.8B}, [x1], x2
        ext             v5.8B,  v4.8B,  v5.8B,  #1
        ext             v7.8B,  v6.8B,  v7.8B,  #1
        trn1            v4.2S,  v4.2S,  v5.2S   // {s[0..3], s[1..4]}
        trn1            v6.2S,  v6.2S,  v7.2S
        umull           v18.8H, v4.8B,  v0.8B
        umull           v19.8H, v6.8B,  v0.8B
        subs            w3,  w3,  #2
        trn1            v30.2D, v18.2D, v19.2D
        trn2            v31.2D, v18.2D, v19.2D
        add             v18.8H, v30.8H, v31.8H
        prfm            pldl1strm, [x1]
  .ifc \codec,h264
        rshrn           v16.8B, v18.8H, #6
  .else
        add             v18.8H, v18.8H, v22.8H
        shrn            v16.8B, v18.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.S}[0], [x8], x2
        ld1             {v20.S}[1], [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
  .endif
        prfm            pldl1strm, [x1]
        st1             {v16.S}[0], [x0], x2
        st1             {v16.S}[1], [x0], x2
        b.gt            4b
        ret
endfunc
.endm
/* 2-pixel-wide chroma MC, H.264 only.  Two output rows are computed
 * per iteration: the weights are interleaved as byte lanes
 * {A,A,B,B,A,A,B,B} (v0) and {C,C,D,D,C,C,D,D} (v1), source pixel
 * pairs are interleaved to match with trn1 .4H, and the rev64/add
 * pair folds the A/B and C/D partial sums into the final values.
 * When x == y == 0 the filter degenerates to a copy (label 2).
 */
.macro  h264_chroma_mc2 type
function ff_\type\()_h264_chroma_mc2_neon, export=1
        sxtw            x2,  w2                 // stride: int -> 64-bit
        prfm            pldl1strm, [x1]
        prfm            pldl1strm, [x1, x2]
        orr             w7,  w4,  w5
        cbz             w7,  2f                 // x == y == 0: plain copy

        // bilinear weights, as in mc8: A=w4, B=w12, C=w6, D=w7
        mul             w7,  w4,  w5            // D = x*y
        lsl             w14, w5,  #3
        lsl             w13, w4,  #3
        sub             w6,  w14, w7            // C = (8-x)*y
        sub             w12, w13, w7            // B = x*(8-y)
        sub             w4,  w7,  w13
        sub             w4,  w4,  w14
        add             w4,  w4,  #64           // A = (8-x)*(8-y)
        dup             v0.8B,  w4
        dup             v2.8B,  w12
        dup             v1.8B,  w6
        dup             v3.8B,  w7
        trn1            v0.4H,  v0.4H,  v2.4H   // v0 = {A,A,B,B,A,A,B,B}
        trn1            v1.4H,  v1.4H,  v3.4H   // v1 = {C,C,D,D,C,C,D,D}
1:
        ld1             {v4.S}[0], [x1], x2     // v4 = {row n, row n+1}
        ld1             {v4.S}[1], [x1], x2
        rev64           v5.2S,  v4.2S
        ld1             {v5.S}[1], [x1]         // v5 = {row n+1, row n+2}
        ext             v6.8B,  v4.8B,  v5.8B,  #1      // 1-byte-shifted rows
        ext             v7.8B,  v5.8B,  v4.8B,  #1
        trn1            v4.4H,  v4.4H,  v6.4H   // interleave s[i]/s[i+1] pairs
        trn1            v5.4H,  v5.4H,  v7.4H
        umull           v16.8H, v4.8B,  v0.8B   // A/B partial products
        umlal           v16.8H, v5.8B,  v1.8B   // + C/D partial products
  .ifc \type,avg
        ld1             {v18.H}[0], [x0], x2    // preload dst rows for averaging
        ld1             {v18.H}[2], [x0]
        sub             x0,  x0,  x2            // rewind dst after the peek
  .endif
        rev64           v17.4S, v16.4S          // swap partial-sum halves ...
        add             v16.8H, v16.8H, v17.8H  // ... and fold them together
        rshrn           v16.8B, v16.8H, #6      // round to nearest
  .ifc \type,avg
        urhadd          v16.8B, v16.8B, v18.8B  // rounding average with dst
  .endif
        st1             {v16.H}[0], [x0], x2
        st1             {v16.H}[2], [x0], x2
        subs            w3,  w3,  #2            // h -= 2
        b.gt            1b
        ret

2:      // copy path: no interpolation needed
        ld1             {v16.H}[0], [x1], x2
        ld1             {v16.H}[1], [x1], x2
  .ifc \type,avg
        ld1             {v18.H}[0], [x0], x2
        ld1             {v18.H}[1], [x0]
        sub             x0,  x0,  x2
        urhadd          v16.8B, v16.8B, v18.8B
  .endif
        st1             {v16.H}[0], [x0], x2
        st1             {v16.H}[1], [x0], x2
        subs            w3,  w3,  #2
        b.gt            2b
        ret
endfunc
.endm
/* H.264 variants (default codec parameter). */
h264_chroma_mc8 put
h264_chroma_mc8 avg
h264_chroma_mc4 put
h264_chroma_mc4 avg
h264_chroma_mc2 put
h264_chroma_mc2 avg

#if CONFIG_RV40_DECODER
/* RV40 rounding-bias table, indexed by (y>>1, x>>1) in the mc macros
 * (one 16-bit entry is broadcast to v22 and added before a truncating
 * shrn, replacing h264's rshrn rounding). */
const   rv40bias
        .short           0, 16, 32, 16
        .short          32, 28, 32, 28
        .short           0, 32, 16, 32
        .short          32, 28, 32, 28
endconst

h264_chroma_mc8 put, rv40
h264_chroma_mc8 avg, rv40
h264_chroma_mc4 put, rv40
h264_chroma_mc4 avg, rv40
#endif

#if CONFIG_VC1_DECODER
h264_chroma_mc8 put, vc1
h264_chroma_mc8 avg, vc1
h264_chroma_mc4 put, vc1
h264_chroma_mc4 avg, vc1
#endif