/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
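/*
 * Bilinear chroma interpolation with fractional offsets x, y in [0, 7]:
 * each output pixel is (A*a + B*b + C*c + D*d + 32) >> 6, where a..d are
 * the four neighbouring source pixels and the weights computed below are
 * A = (8-x)(8-y), B = x(8-y), C = (8-x)y, D = xy.
 * For RV40 the +32 rounding term is replaced by a bias loaded from the
 * rv40bias table and the shift truncates instead of rounding.
 * Separate paths below handle the cases where x and/or y is zero.
 */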
.macro  h264_chroma_mc8 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4,  r5,  [sp, #20]
.ifc \type,avg
        mov             lr,  r0
.endif
        pld             [r1]
        pld             [r1, r2]

.ifc \codec,rv40
        movrel          r6,  rv40bias
        lsr             r7,  r5,  #1
        add             r6,  r6,  r7,  lsl #3
        lsr             r7,  r4,  #1
        add             r6,  r6,  r7,  lsl #1
        vld1.16         {d22[],d23[]}, [r6,:16]
.endif

A       muls            r7,  r4,  r5
T       mul             r7,  r4,  r5
T       cmp             r7,  #0
        rsb             r6,  r7,  r5,  lsl #3
        rsb             r12, r7,  r4,  lsl #3
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64

        beq             2f

        vdup.8          d0,  r4
        vdup.8          d1,  r12
        vld1.8          {d4, d5}, [r1], r2
        vdup.8          d2,  r6
        vdup.8          d3,  r7
        vext.8          d5,  d4,  d5,  #1

1:      vld1.8          {d6, d7}, [r1], r2
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vext.8          d7,  d6,  d7,  #1
        vld1.8          {d4, d5}, [r1], r2
        vmlal.u8        q8,  d6,  d2
        pld             [r1]
        vext.8          d5,  d4,  d5,  #1
        vmlal.u8        q8,  d7,  d3
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vmlal.u8        q9,  d7,  d1
        vmlal.u8        q9,  d4,  d2
        vmlal.u8        q9,  d5,  d3
        pld             [r1, r2]
.ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
.else
        vadd.u16        q8,  q8,  q11
        vadd.u16        q9,  q9,  q11
        vshrn.u16       d16, q8,  #6
        vshrn.u16       d17, q9,  #6
.endif
.ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
.endif
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             1b

        pop             {r4-r7, pc}

2:      tst             r6,  r6
        add             r12, r12, r6
        vdup.8          d0,  r4
        vdup.8          d1,  r12
        beq             4f

        vld1.8          {d4}, [r1], r2

3:      vld1.8          {d6}, [r1], r2
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d1
        vld1.8          {d4}, [r1], r2
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d1
        pld             [r1]
.ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
.else
        vadd.u16        q8,  q8,  q11
        vadd.u16        q9,  q9,  q11
        vshrn.u16       d16, q8,  #6
        vshrn.u16       d17, q9,  #6
.endif
        pld             [r1, r2]
.ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
.endif
        subs            r3,  r3,  #2
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             3b

        pop             {r4-r7, pc}

4:      vld1.8          {d4, d5}, [r1], r2
        vld1.8          {d6, d7}, [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        pld             [r1]
        subs            r3,  r3,  #2
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d7,  d1
        pld             [r1, r2]
.ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
.else
        vadd.u16        q8,  q8,  q11
        vadd.u16        q9,  q9,  q11
        vshrn.u16       d16, q8,  #6
        vshrn.u16       d17, q9,  #6
.endif
.ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
.endif
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             4b

        pop             {r4-r7, pc}
endfunc
.endm

/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
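/*
 * The 4-pixel-wide variant packs a row and its one-pixel-shifted copy into
 * the two halves of one d register (vtrn.32), and pairs the A/B and C/D
 * weights the same way, so a single vmull/vmlal covers both horizontal
 * taps; the two 32-bit halves are then folded together with vadd.i16.
 */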
.macro  h264_chroma_mc4 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4,  r5,  [sp, #20]
.ifc \type,avg
        mov             lr,  r0
.endif
        pld             [r1]
        pld             [r1, r2]

.ifc \codec,rv40
        movrel          r6,  rv40bias
        lsr             r7,  r5,  #1
        add             r6,  r6,  r7,  lsl #3
        lsr             r7,  r4,  #1
        add             r6,  r6,  r7,  lsl #1
        vld1.16         {d22[],d23[]}, [r6,:16]
.endif

A       muls            r7,  r4,  r5
T       mul             r7,  r4,  r5
T       cmp             r7,  #0
        rsb             r6,  r7,  r5,  lsl #3
        rsb             r12, r7,  r4,  lsl #3
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64

        beq             2f

        vdup.8          d0,  r4
        vdup.8          d1,  r12
        vld1.8          {d4}, [r1], r2
        vdup.8          d2,  r6
        vdup.8          d3,  r7
        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5
        vtrn.32         d0,  d1
        vtrn.32         d2,  d3

1:      vld1.8          {d6}, [r1], r2
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d6,  d7
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d2
        vld1.8          {d4}, [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5
        pld             [r1]
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d2
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
.ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
.else
        vadd.u16        q8,  q8,  q11
        vshrn.u16       d16, q8,  #6
.endif
        subs            r3,  r3,  #2
        pld             [r1, r2]
.ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
.endif
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             1b

        pop             {r4-r7, pc}

2:      tst             r6,  r6
        add             r12, r12, r6
        vdup.8          d0,  r4
        vdup.8          d1,  r12
        vtrn.32         d0,  d1
        beq             4f

        vext.32         d1,  d0,  d1,  #1
        vld1.32         {d4[0]}, [r1], r2

3:      vld1.32         {d4[1]}, [r1], r2
        vmull.u8        q8,  d4,  d0
        vld1.32         {d4[0]}, [r1], r2
        vmull.u8        q9,  d4,  d1
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        pld             [r1]
.ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
.else
        vadd.u16        q8,  q8,  q11
        vshrn.u16       d16, q8,  #6
.endif
.ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
.endif
        subs            r3,  r3,  #2
        pld             [r1, r2]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             3b

        pop             {r4-r7, pc}

4:      vld1.8          {d4}, [r1], r2
        vld1.8          {d6}, [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d4,  d5
        vtrn.32         d6,  d7
        vmull.u8        q8,  d4,  d0
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        pld             [r1]
.ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
.else
        vadd.u16        q8,  q8,  q11
        vshrn.u16       d16, q8,  #6
.endif
.ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
.endif
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             4b

        pop             {r4-r7, pc}
endfunc
.endm
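
/* chroma_mc2(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
/* H.264 only: the RV40 instantiations below cover only mc8 and mc4. */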
.macro  h264_chroma_mc2 type
function ff_\type\()_h264_chroma_mc2_neon, export=1
        push            {r4-r6, lr}
        ldr             r4,  [sp, #16]
        ldr             lr,  [sp, #20]
        pld             [r1]
        pld             [r1, r2]
        orrs            r5,  r4,  lr
        beq             2f

        mul             r5,  r4,  lr
        rsb             r6,  r5,  lr,  lsl #3
        rsb             r12, r5,  r4,  lsl #3
        sub             r4,  r5,  r4,  lsl #3
        sub             r4,  r4,  lr,  lsl #3
        add             r4,  r4,  #64
        vdup.8          d0,  r4
        vdup.8          d2,  r12
        vdup.8          d1,  r6
        vdup.8          d3,  r5
        vtrn.16         q0,  q1

1:
        vld1.32         {d4[0]}, [r1], r2
        vld1.32         {d4[1]}, [r1], r2
        vrev64.32       d5,  d4
        vld1.32         {d5[1]}, [r1]
        vext.8          q3,  q2,  q2,  #1
        vtrn.16         q2,  q3
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
.ifc \type,avg
        vld1.16         {d18[0]}, [r0,:16], r2
        vld1.16         {d18[1]}, [r0,:16]
        sub             r0,  r0,  r2
.endif
        vtrn.32         d16, d17
        vadd.i16        d16, d16, d17
        vrshrn.u16      d16, q8,  #6
.ifc \type,avg
        vrhadd.u8       d16, d16, d18
.endif
        vst1.16         {d16[0]}, [r0,:16], r2
        vst1.16         {d16[1]}, [r0,:16], r2
        subs            r3,  r3,  #2
        bgt             1b

        pop             {r4-r6, pc}

2:
.ifc \type,put
        ldrh_post       r5,  r1,  r2
        strh_post       r5,  r0,  r2
        ldrh_post       r6,  r1,  r2
        strh_post       r6,  r0,  r2
.else
        vld1.16         {d16[0]}, [r1], r2
        vld1.16         {d16[1]}, [r1], r2
        vld1.16         {d18[0]}, [r0,:16], r2
        vld1.16         {d18[1]}, [r0,:16]
        sub             r0,  r0,  r2
        vrhadd.u8       d16, d16, d18
        vst1.16         {d16[0]}, [r0,:16], r2
        vst1.16         {d16[1]}, [r0,:16], r2
.endif
        subs            r3,  r3,  #2
        bgt             2b

        pop             {r4-r6, pc}
endfunc
.endm

        h264_chroma_mc8 put
        h264_chroma_mc8 avg
        h264_chroma_mc4 put
        h264_chroma_mc4 avg
        h264_chroma_mc2 put
        h264_chroma_mc2 avg

#if CONFIG_RV40_DECODER
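/*
 * Rounding bias for the RV40 variants, indexed as the addressing in the
 * macros above implies: one 16-bit entry per (y >> 1, x >> 1) pair, i.e.
 * row y/2, column x/2 of a 4x4 table.
 */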
const   rv40bias
        .short           0, 16, 32, 16
        .short          32, 28, 32, 28
        .short           0, 32, 16, 32
        .short          32, 28, 32, 28
endconst

        h264_chroma_mc8 put, rv40
        h264_chroma_mc8 avg, rv40
        h264_chroma_mc4 put, rv40
        h264_chroma_mc4 avg, rv40
#endif