/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
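/* As a rough orientation only: the put variant below computes the usual
 * bilinear chroma interpolation.  A scalar C sketch (not part of the build,
 * assuming the standard H.264 weighting and rounding) would be:
 *
 *     A = (8-x)*(8-y);  B = x*(8-y);  C = (8-x)*y;  D = x*y;
 *     for (i = 0; i < h; i++, dst += stride, src += stride)
 *         for (j = 0; j < 8; j++)
 *             dst[j] = (A*src[j]        + B*src[j+1] +
 *                       C*src[j+stride] + D*src[j+stride+1] + 32) >> 6;
 *
 * The avg variant additionally averages the result with the existing dst;
 * the rv40/vc1 variants replace the +32 rounding term with a codec-specific
 * bias followed by a truncating shift.
 */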
.macro  h264_chroma_mc8 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4,  r5,  [sp, #20]
  .ifc \type,avg
        mov             lr,  r0
  .endif
        pld             [r1]
        pld             [r1, r2]

  .ifc \codec,rv40
        movrel          r6,  rv40bias
        lsr             r7,  r5,  #1
        add             r6,  r6,  r7,  lsl #3
        lsr             r7,  r4,  #1
        add             r6,  r6,  r7,  lsl #1
        vld1.16         {d22[],d23[]}, [r6,:16]
  .endif
  .ifc \codec,vc1
        vmov.u16        q11, #28
  .endif
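
/* Derive the bilinear weights from the fractional offsets x (r4) and y (r5),
 * broadcast to d0-d3 below:
 *   r7  = D = x*y
 *   r6  = C = 8*y - x*y            = (8-x)*y
 *   r12 = B = 8*x - x*y            = x*(8-y)
 *   r4  = A = 64 - 8*x - 8*y + x*y = (8-x)*(8-y)
 * If x*y == 0 only a one-dimensional filter is needed, handled at 2: below.
 */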
A       muls            r7,  r4,  r5
T       mul             r7,  r4,  r5
T       cmp             r7,  #0
        rsb             r6,  r7,  r5,  lsl #3
        rsb             r12, r7,  r4,  lsl #3
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64

        beq             2f

        vdup.8          d0,  r4
        vdup.8          d1,  r12
        vld1.8          {d4, d5}, [r1], r2
        vdup.8          d2,  r6
        vdup.8          d3,  r7
        vext.8          d5,  d4,  d5,  #1

1:      vld1.8          {d6, d7}, [r1], r2
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vext.8          d7,  d6,  d7,  #1
        vld1.8          {d4, d5}, [r1], r2
        vmlal.u8        q8,  d6,  d2
        pld             [r1]
        vext.8          d5,  d4,  d5,  #1
        vmlal.u8        q8,  d7,  d3
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vmlal.u8        q9,  d7,  d1
        vmlal.u8        q9,  d4,  d2
        vmlal.u8        q9,  d5,  d3
        pld             [r1, r2]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vadd.u16        q9,  q9,  q11
        vshrn.u16       d16, q8,  #6
        vshrn.u16       d17, q9,  #6
  .endif
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
  .endif
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             1b

        pop             {r4-r7, pc}
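
/* x*y == 0: only one of the offsets is nonzero, so a two-tap filter is
 * enough.  d0/d1 hold the two taps; fall through to 3: for the vertical
 * case (x == 0, y != 0) or branch to 4: for the horizontal case (y == 0). */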
2:      tst             r6,  r6
        add             r12, r12, r6
        vdup.8          d0,  r4
        vdup.8          d1,  r12

        beq             4f

        vld1.8          {d4}, [r1], r2

3:      vld1.8          {d6}, [r1], r2
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d1
        vld1.8          {d4}, [r1], r2
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d1
        pld             [r1]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vadd.u16        q9,  q9,  q11
        vshrn.u16       d16, q8,  #6
        vshrn.u16       d17, q9,  #6
  .endif
        pld             [r1, r2]
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
  .endif
        subs            r3,  r3,  #2
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             3b

        pop             {r4-r7, pc}
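
/* y == 0: horizontal two-tap filter (degenerates to a plain copy when
 * x == 0 as well, since the taps become 64 and 0). */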
4:      vld1.8          {d4, d5}, [r1], r2
        vld1.8          {d6, d7}, [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        pld             [r1]
        subs            r3,  r3,  #2
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d7,  d1
        pld             [r1, r2]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vadd.u16        q9,  q9,  q11
        vshrn.u16       d16, q8,  #6
        vshrn.u16       d17, q9,  #6
  .endif
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
  .endif
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             4b

        pop             {r4-r7, pc}
endfunc
.endm

/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
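/* Same filter as the mc8 version, operating on 4-pixel rows.  The vtrn.32
 * instructions pack a row together with its one-pixel-shifted copy (and the
 * A/B resp. C/D weights) into single d registers, so the low and high halves
 * of each q accumulator hold the A/C and B/D partial sums; vadd.i16 then
 * folds them into one 4-pixel output row, and two rows are written per
 * iteration. */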
.macro  h264_chroma_mc4 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4,  r5,  [sp, #20]
  .ifc \type,avg
        mov             lr,  r0
  .endif
        pld             [r1]
        pld             [r1, r2]

  .ifc \codec,rv40
        movrel          r6,  rv40bias
        lsr             r7,  r5,  #1
        add             r6,  r6,  r7,  lsl #3
        lsr             r7,  r4,  #1
        add             r6,  r6,  r7,  lsl #1
        vld1.16         {d22[],d23[]}, [r6,:16]
  .endif
  .ifc \codec,vc1
        vmov.u16        q11, #28
  .endif

A       muls            r7,  r4,  r5
T       mul             r7,  r4,  r5
T       cmp             r7,  #0
        rsb             r6,  r7,  r5,  lsl #3
        rsb             r12, r7,  r4,  lsl #3
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64

        beq             2f

        vdup.8          d0,  r4
        vdup.8          d1,  r12
        vld1.8          {d4}, [r1], r2
        vdup.8          d2,  r6
        vdup.8          d3,  r7
        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5

        vtrn.32         d0,  d1
        vtrn.32         d2,  d3

1:      vld1.8          {d6}, [r1], r2
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d6,  d7
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d2
        vld1.8          {d4}, [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5
        pld             [r1]
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d2
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vshrn.u16       d16, q8,  #6
  .endif
        subs            r3,  r3,  #2
        pld             [r1, r2]
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             1b

        pop             {r4-r7, pc}

2:      tst             r6,  r6
        add             r12, r12, r6
        vdup.8          d0,  r4
        vdup.8          d1,  r12
        vtrn.32         d0,  d1

        beq             4f

        vext.32         d1,  d0,  d1,  #1
        vld1.32         {d4[0]}, [r1], r2

3:      vld1.32         {d4[1]}, [r1], r2
        vmull.u8        q8,  d4,  d0
        vld1.32         {d4[0]}, [r1], r2
        vmull.u8        q9,  d4,  d1
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        pld             [r1]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vshrn.u16       d16, q8,  #6
  .endif
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        subs            r3,  r3,  #2
        pld             [r1, r2]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             3b

        pop             {r4-r7, pc}

4:      vld1.8          {d4}, [r1], r2
        vld1.8          {d6}, [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d4,  d5
        vtrn.32         d6,  d7
        vmull.u8        q8,  d4,  d0
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        pld             [r1]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vshrn.u16       d16, q8,  #6
  .endif
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             4b

        pop             {r4-r7, pc}
endfunc
.endm
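
/* chroma_mc2(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y)
 * (assumed to take the same arguments as the mc8/mc4 versions above).
 * When x and y are both zero this reduces to a copy (put) or an average
 * with dst (avg); otherwise two 2-pixel rows are filtered per iteration. */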
.macro  h264_chroma_mc2 type
function ff_\type\()_h264_chroma_mc2_neon, export=1
        push            {r4-r6, lr}
        ldr             r4,  [sp, #16]
        ldr             lr,  [sp, #20]
        pld             [r1]
        pld             [r1, r2]
        orrs            r5,  r4,  lr
        beq             2f

        mul             r5,  r4,  lr
        rsb             r6,  r5,  lr,  lsl #3
        rsb             r12, r5,  r4,  lsl #3
        sub             r4,  r5,  r4,  lsl #3
        sub             r4,  r4,  lr,  lsl #3
        add             r4,  r4,  #64
        vdup.8          d0,  r4
        vdup.8          d2,  r12
        vdup.8          d1,  r6
        vdup.8          d3,  r5
        vtrn.16         q0,  q1
1:
        vld1.32         {d4[0]}, [r1], r2
        vld1.32         {d4[1]}, [r1], r2
        vrev64.32       d5,  d4
        vld1.32         {d5[1]}, [r1]
        vext.8          q3,  q2,  q2,  #1
        vtrn.16         q2,  q3
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
  .ifc \type,avg
        vld1.16         {d18[0]}, [r0,:16], r2
        vld1.16         {d18[1]}, [r0,:16]
        sub             r0,  r0,  r2
  .endif
        vtrn.32         d16, d17
        vadd.i16        d16, d16, d17
        vrshrn.u16      d16, q8,  #6
  .ifc \type,avg
        vrhadd.u8       d16, d16, d18
  .endif
        vst1.16         {d16[0]}, [r0,:16], r2
        vst1.16         {d16[1]}, [r0,:16], r2
        subs            r3,  r3,  #2
        bgt             1b

        pop             {r4-r6, pc}
2:
  .ifc \type,put
        ldrh_post       r5,  r1,  r2
        strh_post       r5,  r0,  r2
        ldrh_post       r6,  r1,  r2
        strh_post       r6,  r0,  r2
  .else
        vld1.16         {d16[0]}, [r1], r2
        vld1.16         {d16[1]}, [r1], r2
        vld1.16         {d18[0]}, [r0,:16], r2
        vld1.16         {d18[1]}, [r0,:16]
        sub             r0,  r0,  r2
        vrhadd.u8       d16, d16, d18
        vst1.16         {d16[0]}, [r0,:16], r2
        vst1.16         {d16[1]}, [r0,:16], r2
  .endif
        subs            r3,  r3,  #2
        bgt             2b

        pop             {r4-r6, pc}
endfunc
.endm

        h264_chroma_mc8 put
        h264_chroma_mc8 avg
        h264_chroma_mc4 put
        h264_chroma_mc4 avg
        h264_chroma_mc2 put
        h264_chroma_mc2 avg

#if CONFIG_RV40_DECODER
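/* RV40 rounding bias, a 4x4 table of 16-bit values indexed by [y>>1][x>>1]
 * (see the movrel/add sequence at the top of the mc8/mc4 macros). */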
const   rv40bias
        .short           0, 16, 32, 16
        .short          32, 28, 32, 28
        .short           0, 32, 16, 32
        .short          32, 28, 32, 28
endconst

        h264_chroma_mc8 put, rv40
        h264_chroma_mc8 avg, rv40
        h264_chroma_mc4 put, rv40
        h264_chroma_mc4 avg, rv40
#endif

#if CONFIG_VC1_DECODER
        h264_chroma_mc8 put, vc1
        h264_chroma_mc8 avg, vc1
        h264_chroma_mc4 put, vc1
        h264_chroma_mc4 avg, vc1
#endif