/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
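/* Bilinear chroma interpolation from the four neighbouring source
 * pixels A, B, C, D with fractional offsets x, y in [0, 7]:
 *   dst = ((8-x)*(8-y)*A + x*(8-y)*B + (8-x)*y*C + x*y*D + 32) >> 6
 * H.264 rounds with +32 (rshrn #6); RV40 and VC-1 instead add a
 * codec-specific bias (v22) before a truncating shift (shrn #6). */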
.macro  h264_chroma_mc8 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
        sxtw            x2,  w2
.ifc \type,avg
        mov             x8,  x0
.endif
        prfm            pldl1strm, [x1]
        prfm            pldl1strm, [x1, x2]
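        /* RV40 rounding bias depends on (x >> 1, y >> 1): compute the
         * byte offset (y >> 1) * 8 + (x >> 1) * 2 into the rv40bias
         * table of 16-bit values and broadcast the entry to v22. */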
.ifc \codec,rv40
        movrel          x6,  rv40bias
        lsr             w9,  w5,  #1
        lsr             w10, w4,  #1
        lsl             w9,  w9,  #3
        lsl             w10, w10, #1
        add             w9,  w9,  w10
        add             x6,  x6,  w9, UXTW
        ld1r            {v22.8H}, [x6]
.endif
.ifc \codec,vc1
        movi            v22.8H, #28
.endif
        mul             w7,  w4,  w5
        lsl             w14, w5,  #3
        lsl             w13, w4,  #3
        cmp             w7,  #0
        sub             w6,  w14, w7
        sub             w12, w13, w7
        sub             w4,  w7,  w13
        sub             w4,  w4,  w14
        add             w4,  w4,  #64
        b.eq            2f
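        /* Full 2-D filter: two output rows per iteration; the second
         * source row of each pair is reused as the first row of the
         * next pair (kept in v4/v5 across iterations). */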
        dup             v0.8B,  w4
        dup             v1.8B,  w12
        ld1             {v4.8B, v5.8B}, [x1], x2
        dup             v2.8B,  w6
        dup             v3.8B,  w7
        ext             v5.8B,  v4.8B,  v5.8B,  #1
1:      ld1             {v6.8B, v7.8B}, [x1], x2
        umull           v16.8H, v4.8B,  v0.8B
        umlal           v16.8H, v5.8B,  v1.8B
        ext             v7.8B,  v6.8B,  v7.8B,  #1
        ld1             {v4.8B, v5.8B}, [x1], x2
        umlal           v16.8H, v6.8B,  v2.8B
        prfm            pldl1strm, [x1]
        ext             v5.8B,  v4.8B,  v5.8B,  #1
        umlal           v16.8H, v7.8B,  v3.8B
        umull           v17.8H, v6.8B,  v0.8B
        subs            w3,  w3,  #2
        umlal           v17.8H, v7.8B,  v1.8B
        umlal           v17.8H, v4.8B,  v2.8B
        umlal           v17.8H, v5.8B,  v3.8B
        prfm            pldl1strm, [x1, x2]
.ifc \codec,h264
        rshrn           v16.8B, v16.8H, #6
        rshrn           v17.8B, v17.8H, #6
.else
        add             v16.8H, v16.8H, v22.8H
        add             v17.8H, v17.8H, v22.8H
        shrn            v16.8B, v16.8H, #6
        shrn            v17.8B, v17.8H, #6
.endif
.ifc \type,avg
        ld1             {v20.8B}, [x8], x2
        ld1             {v21.8B}, [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
        urhadd          v17.8B, v17.8B, v21.8B
.endif
        st1             {v16.8B}, [x0], x2
        st1             {v17.8B}, [x0], x2
        b.gt            1b
        ret
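        /* x*y == 0: w12 + w6 == 8*x + 8*y, so a zero sum means
         * x == y == 0 (plain copy, 5:); otherwise w6 selects between
         * vertical-only (3:) and horizontal-only (4:) filtering. */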
2:      adds            w12, w12, w6
        dup             v0.8B,  w4
        b.eq            5f
        tst             w6,  w6
        dup             v1.8B,  w12
        b.eq            4f
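        /* Vertical-only filter (x == 0): w4 = 8*(8-y), w12 = 8*y. */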
        ld1             {v4.8B}, [x1], x2
3:      ld1             {v6.8B}, [x1], x2
        umull           v16.8H, v4.8B,  v0.8B
        umlal           v16.8H, v6.8B,  v1.8B
        ld1             {v4.8B}, [x1], x2
        umull           v17.8H, v6.8B,  v0.8B
        umlal           v17.8H, v4.8B,  v1.8B
        prfm            pldl1strm, [x1]
.ifc \codec,h264
        rshrn           v16.8B, v16.8H, #6
        rshrn           v17.8B, v17.8H, #6
.else
        add             v16.8H, v16.8H, v22.8H
        add             v17.8H, v17.8H, v22.8H
        shrn            v16.8B, v16.8H, #6
        shrn            v17.8B, v17.8H, #6
.endif
        prfm            pldl1strm, [x1, x2]
.ifc \type,avg
        ld1             {v20.8B}, [x8], x2
        ld1             {v21.8B}, [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
        urhadd          v17.8B, v17.8B, v21.8B
.endif
        subs            w3,  w3,  #2
        st1             {v16.8B}, [x0], x2
        st1             {v17.8B}, [x0], x2
        b.gt            3b
        ret
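        /* Horizontal-only filter (y == 0): w4 = 8*(8-x), w12 = 8*x. */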
4:      ld1             {v4.8B, v5.8B}, [x1], x2
        ld1             {v6.8B, v7.8B}, [x1], x2
        ext             v5.8B,  v4.8B,  v5.8B,  #1
        ext             v7.8B,  v6.8B,  v7.8B,  #1
        prfm            pldl1strm, [x1]
        subs            w3,  w3,  #2
        umull           v16.8H, v4.8B,  v0.8B
        umlal           v16.8H, v5.8B,  v1.8B
        umull           v17.8H, v6.8B,  v0.8B
        umlal           v17.8H, v7.8B,  v1.8B
        prfm            pldl1strm, [x1, x2]
.ifc \codec,h264
        rshrn           v16.8B, v16.8H, #6
        rshrn           v17.8B, v17.8H, #6
.else
        add             v16.8H, v16.8H, v22.8H
        add             v17.8H, v17.8H, v22.8H
        shrn            v16.8B, v16.8H, #6
        shrn            v17.8B, v17.8H, #6
.endif
.ifc \type,avg
        ld1             {v20.8B}, [x8], x2
        ld1             {v21.8B}, [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
        urhadd          v17.8B, v17.8B, v21.8B
.endif
        st1             {v16.8B}, [x0], x2
        st1             {v17.8B}, [x0], x2
        b.gt            4b
        ret
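        /* No filtering (x == y == 0): weight is 64, so multiplying and
         * shifting back by 6 is an effective copy through the same
         * rounding/bias path. */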
5:      ld1             {v4.8B}, [x1], x2
        ld1             {v5.8B}, [x1], x2
        prfm            pldl1strm, [x1]
        subs            w3,  w3,  #2
        umull           v16.8H, v4.8B,  v0.8B
        umull           v17.8H, v5.8B,  v0.8B
        prfm            pldl1strm, [x1, x2]
.ifc \codec,h264
        rshrn           v16.8B, v16.8H, #6
        rshrn           v17.8B, v17.8H, #6
.else
        add             v16.8H, v16.8H, v22.8H
        add             v17.8H, v17.8H, v22.8H
        shrn            v16.8B, v16.8H, #6
        shrn            v17.8B, v17.8H, #6
.endif
.ifc \type,avg
        ld1             {v20.8B}, [x8], x2
        ld1             {v21.8B}, [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
        urhadd          v17.8B, v17.8B, v21.8B
.endif
        st1             {v16.8B}, [x0], x2
        st1             {v17.8B}, [x0], x2
        b.gt            5b
        ret
endfunc
.endm

/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
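/* 4-pixel-wide variant: each source row and its one-pixel-shifted copy
 * are packed into one 8-byte vector (trn1), the weights likewise, so a
 * single umull/umlal pair covers all four taps; the partial sums of two
 * rows are then folded together with trn1/trn2 on 64-bit lanes, so each
 * iteration still produces two output rows. */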
.macro  h264_chroma_mc4 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
        sxtw            x2,  w2
.ifc \type,avg
        mov             x8,  x0
.endif
        prfm            pldl1strm, [x1]
        prfm            pldl1strm, [x1, x2]
.ifc \codec,rv40
        movrel          x6,  rv40bias
        lsr             w9,  w5,  #1
        lsr             w10, w4,  #1
        lsl             w9,  w9,  #3
        lsl             w10, w10, #1
        add             w9,  w9,  w10
        add             x6,  x6,  w9, UXTW
        ld1r            {v22.8H}, [x6]
.endif
.ifc \codec,vc1
        movi            v22.8H, #28
.endif
        mul             w7,  w4,  w5
        lsl             w14, w5,  #3
        lsl             w13, w4,  #3
        cmp             w7,  #0
        sub             w6,  w14, w7
        sub             w12, w13, w7
        sub             w4,  w7,  w13
        sub             w4,  w4,  w14
        add             w4,  w4,  #64
        b.eq            2f
        dup             v24.8B, w4
        dup             v25.8B, w12
        ld1             {v4.8B}, [x1], x2
        dup             v26.8B, w6
        dup             v27.8B, w7
        ext             v5.8B,  v4.8B,  v5.8B,  #1
        trn1            v0.2S,  v24.2S, v25.2S
        trn1            v2.2S,  v26.2S, v27.2S
        trn1            v4.2S,  v4.2S,  v5.2S
1:      ld1             {v6.8B}, [x1], x2
        ext             v7.8B,  v6.8B,  v7.8B,  #1
        trn1            v6.2S,  v6.2S,  v7.2S
        umull           v18.8H, v4.8B,  v0.8B
        umlal           v18.8H, v6.8B,  v2.8B
        ld1             {v4.8B}, [x1], x2
        ext             v5.8B,  v4.8B,  v5.8B,  #1
        trn1            v4.2S,  v4.2S,  v5.2S
        prfm            pldl1strm, [x1]
        umull           v19.8H, v6.8B,  v0.8B
        umlal           v19.8H, v4.8B,  v2.8B
        trn1            v30.2D, v18.2D, v19.2D
        trn2            v31.2D, v18.2D, v19.2D
        add             v18.8H, v30.8H, v31.8H
.ifc \codec,h264
        rshrn           v16.8B, v18.8H, #6
.else
        add             v18.8H, v18.8H, v22.8H
        shrn            v16.8B, v18.8H, #6
.endif
        subs            w3,  w3,  #2
        prfm            pldl1strm, [x1, x2]
.ifc \type,avg
        ld1             {v20.S}[0], [x8], x2
        ld1             {v20.S}[1], [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
.endif
        st1             {v16.S}[0], [x0], x2
        st1             {v16.S}[1], [x0], x2
        b.gt            1b
        ret
2:      adds            w12, w12, w6
        dup             v30.8B, w4
        b.eq            5f
        tst             w6,  w6
        dup             v31.8B, w12
        trn1            v0.2S,  v30.2S, v31.2S
        trn2            v1.2S,  v30.2S, v31.2S
        b.eq            4f
        ext             v1.8B,  v0.8B,  v1.8B,  #4
        ld1             {v4.S}[0], [x1], x2
3:      ld1             {v4.S}[1], [x1], x2
        umull           v18.8H, v4.8B,  v0.8B
        ld1             {v4.S}[0], [x1], x2
        umull           v19.8H, v4.8B,  v1.8B
        trn1            v30.2D, v18.2D, v19.2D
        trn2            v31.2D, v18.2D, v19.2D
        add             v18.8H, v30.8H, v31.8H
        prfm            pldl1strm, [x1]
.ifc \codec,h264
        rshrn           v16.8B, v18.8H, #6
.else
        add             v18.8H, v18.8H, v22.8H
        shrn            v16.8B, v18.8H, #6
.endif
.ifc \type,avg
        ld1             {v20.S}[0], [x8], x2
        ld1             {v20.S}[1], [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
.endif
        subs            w3,  w3,  #2
        prfm            pldl1strm, [x1, x2]
        st1             {v16.S}[0], [x0], x2
        st1             {v16.S}[1], [x0], x2
        b.gt            3b
        ret
4:      ld1             {v4.8B}, [x1], x2
        ld1             {v6.8B}, [x1], x2
        ext             v5.8B,  v4.8B,  v5.8B,  #1
        ext             v7.8B,  v6.8B,  v7.8B,  #1
        trn1            v4.2S,  v4.2S,  v5.2S
        trn1            v6.2S,  v6.2S,  v7.2S
        umull           v18.8H, v4.8B,  v0.8B
        umull           v19.8H, v6.8B,  v0.8B
        subs            w3,  w3,  #2
        trn1            v30.2D, v18.2D, v19.2D
        trn2            v31.2D, v18.2D, v19.2D
        add             v18.8H, v30.8H, v31.8H
        prfm            pldl1strm, [x1]
.ifc \codec,h264
        rshrn           v16.8B, v18.8H, #6
.else
        add             v18.8H, v18.8H, v22.8H
        shrn            v16.8B, v18.8H, #6
.endif
.ifc \type,avg
        ld1             {v20.S}[0], [x8], x2
        ld1             {v20.S}[1], [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
.endif
        prfm            pldl1strm, [x1]
        st1             {v16.S}[0], [x0], x2
        st1             {v16.S}[1], [x0], x2
        b.gt            4b
        ret
5:      ld1             {v4.S}[0], [x1], x2
        ld1             {v4.S}[1], [x1], x2
        umull           v18.8H, v4.8B,  v30.8B
        subs            w3,  w3,  #2
        prfm            pldl1strm, [x1]
.ifc \codec,h264
        rshrn           v16.8B, v18.8H, #6
.else
        add             v18.8H, v18.8H, v22.8H
        shrn            v16.8B, v18.8H, #6
.endif
.ifc \type,avg
        ld1             {v20.S}[0], [x8], x2
        ld1             {v20.S}[1], [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
.endif
        prfm            pldl1strm, [x1]
        st1             {v16.S}[0], [x0], x2
        st1             {v16.S}[1], [x0], x2
        b.gt            5b
        ret
endfunc
.endm
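
/* 2-pixel-wide variant (H.264 only): both rows and both filter taps are
 * interleaved with trn1 so one umull/umlal pair computes all products;
 * rev64 + add then folds the cross-lane partial sums. */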
.macro  h264_chroma_mc2 type
function ff_\type\()_h264_chroma_mc2_neon, export=1
        sxtw            x2,  w2
        prfm            pldl1strm, [x1]
        prfm            pldl1strm, [x1, x2]
        orr             w7,  w4,  w5
        cbz             w7,  2f
        mul             w7,  w4,  w5
        lsl             w14, w5,  #3
        lsl             w13, w4,  #3
        sub             w6,  w14, w7
        sub             w12, w13, w7
        sub             w4,  w7,  w13
        sub             w4,  w4,  w14
        add             w4,  w4,  #64
        dup             v0.8B,  w4
        dup             v2.8B,  w12
        dup             v1.8B,  w6
        dup             v3.8B,  w7
        trn1            v0.4H,  v0.4H,  v2.4H
        trn1            v1.4H,  v1.4H,  v3.4H
1:
        ld1             {v4.S}[0], [x1], x2
        ld1             {v4.S}[1], [x1], x2
        rev64           v5.2S,  v4.2S
        ld1             {v5.S}[1], [x1]
        ext             v6.8B,  v4.8B,  v5.8B,  #1
        ext             v7.8B,  v5.8B,  v4.8B,  #1
        trn1            v4.4H,  v4.4H,  v6.4H
        trn1            v5.4H,  v5.4H,  v7.4H
        umull           v16.8H, v4.8B,  v0.8B
        umlal           v16.8H, v5.8B,  v1.8B
.ifc \type,avg
        ld1             {v18.H}[0], [x0], x2
        ld1             {v18.H}[2], [x0]
        sub             x0,  x0,  x2
.endif
        rev64           v17.4S, v16.4S
        add             v16.8H, v16.8H, v17.8H
        rshrn           v16.8B, v16.8H, #6
.ifc \type,avg
        urhadd          v16.8B, v16.8B, v18.8B
.endif
        st1             {v16.H}[0], [x0], x2
        st1             {v16.H}[2], [x0], x2
        subs            w3,  w3,  #2
        b.gt            1b
        ret
2:
        ld1             {v16.H}[0], [x1], x2
        ld1             {v16.H}[1], [x1], x2
.ifc \type,avg
        ld1             {v18.H}[0], [x0], x2
        ld1             {v18.H}[1], [x0]
        sub             x0,  x0,  x2
        urhadd          v16.8B, v16.8B, v18.8B
.endif
        st1             {v16.H}[0], [x0], x2
        st1             {v16.H}[1], [x0], x2
        subs            w3,  w3,  #2
        b.gt            2b
        ret
endfunc
.endm

        h264_chroma_mc8 put
        h264_chroma_mc8 avg
        h264_chroma_mc4 put
        h264_chroma_mc4 avg
        h264_chroma_mc2 put
        h264_chroma_mc2 avg

#if CONFIG_RV40_DECODER
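/* Rounding bias for RV40 chroma MC, 16-bit entries indexed as
 * (y >> 1) * 4 + (x >> 1); see the table lookup in the macros above. */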
const   rv40bias
        .short          0, 16, 32, 16
        .short          32, 28, 32, 28
        .short          0, 32, 16, 32
        .short          32, 28, 32, 28
endconst

        h264_chroma_mc8 put, rv40
        h264_chroma_mc8 avg, rv40
        h264_chroma_mc4 put, rv40
        h264_chroma_mc4 avg, rv40
#endif

#if CONFIG_VC1_DECODER
        h264_chroma_mc8 put, vc1
        h264_chroma_mc8 avg, vc1
        h264_chroma_mc4 put, vc1
        h264_chroma_mc4 avg, vc1
#endif