/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "asm.S"

/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
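/* x and y are the fractional (1/8-pel) parts of the chroma motion vector,
 * i.e. values in the range 0..7.  Each loop iteration below writes two
 * output rows, so h is assumed to be even. */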
.macro  h264_chroma_mc8 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4, [sp, #20]
  .ifc \type,avg
        mov             lr, r0
  .endif
        pld             [r1]
        pld             [r1, r2]
  .ifc \codec,rv40
        movrel          r6, rv40bias
        lsr             r7, r5, #1
        add             r6, r6, r7, lsl #3
        lsr             r7, r4, #1
        add             r6, r6, r7, lsl #1
        vld1.16         {d22[],d23[]}, [r6,:16]
  .endif
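        /* Bilinear weights: r4 = (8-x)*(8-y), r12 = x*(8-y),
         * r6 = (8-x)*y, r7 = x*y.  When x*y == 0 only one axis
         * needs filtering, handled from label 2 onwards. */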
A       muls            r7, r4, r5
T       mul             r7, r4, r5
T       cmp             r7, #0
        rsb             r6, r7, r5, lsl #3
        rsb             r12, r7, r4, lsl #3
        sub             r4, r7, r4, lsl #3
        sub             r4, r4, r5, lsl #3
        add             r4, r4, #64

        beq             2f

        add             r5, r1, r2

        vdup.8          d0, r4
        lsl             r4, r2, #1
        vdup.8          d1, r12
        vld1.8          {d4, d5}, [r1], r4
        vdup.8          d2, r6
        vld1.8          {d6, d7}, [r5], r4
        vdup.8          d3, r7

        vext.8          d5, d4, d5, #1
        vext.8          d7, d6, d7, #1

1:      pld             [r5]
        vmull.u8        q8, d4, d0
        vmlal.u8        q8, d5, d1
        vld1.8          {d4, d5}, [r1], r4
        vmlal.u8        q8, d6, d2
        vext.8          d5, d4, d5, #1
        vmlal.u8        q8, d7, d3
        vmull.u8        q9, d6, d0
        subs            r3, r3, #2
        vmlal.u8        q9, d7, d1
        vmlal.u8        q9, d4, d2
        vmlal.u8        q9, d5, d3
        vld1.8          {d6, d7}, [r5], r4
        pld             [r1]
  .ifc \codec,h264
        vrshrn.u16      d16, q8, #6
        vrshrn.u16      d17, q9, #6
  .else
        vadd.u16        q8, q8, q11
        vadd.u16        q9, q9, q11
        vshrn.u16       d16, q8, #6
        vshrn.u16       d17, q9, #6
  .endif
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8, q8, q10
  .endif
        vext.8          d7, d6, d7, #1
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             1b

        pop             {r4-r7, pc}
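        /* x*y == 0: filter along a single axis.  r6 == 0 means
         * y == 0, which is handled at label 4 (horizontal only). */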
2:      tst             r6, r6
        add             r12, r12, r6
        vdup.8          d0, r4
        vdup.8          d1, r12

        beq             4f

        add             r5, r1, r2
        lsl             r4, r2, #1
        vld1.8          {d4}, [r1], r4
        vld1.8          {d6}, [r5], r4

3:      pld             [r5]
        vmull.u8        q8, d4, d0
        vmlal.u8        q8, d6, d1
        vld1.8          {d4}, [r1], r4
        vmull.u8        q9, d6, d0
        vmlal.u8        q9, d4, d1
        vld1.8          {d6}, [r5], r4
  .ifc \codec,h264
        vrshrn.u16      d16, q8, #6
        vrshrn.u16      d17, q9, #6
  .else
        vadd.u16        q8, q8, q11
        vadd.u16        q9, q9, q11
        vshrn.u16       d16, q8, #6
        vshrn.u16       d17, q9, #6
  .endif
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8, q8, q10
  .endif
        subs            r3, r3, #2
        pld             [r1]
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             3b

        pop             {r4-r7, pc}
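        /* y == 0: horizontal filtering only; when x == 0 as well,
         * d1 is zero and this reduces to a plain copy. */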
4:      vld1.8          {d4, d5}, [r1], r2
        vld1.8          {d6, d7}, [r1], r2
        vext.8          d5, d4, d5, #1
        vext.8          d7, d6, d7, #1

5:      pld             [r1]
        subs            r3, r3, #2
        vmull.u8        q8, d4, d0
        vmlal.u8        q8, d5, d1
        vld1.8          {d4, d5}, [r1], r2
        vmull.u8        q9, d6, d0
        vmlal.u8        q9, d7, d1
        pld             [r1]
        vext.8          d5, d4, d5, #1
  .ifc \codec,h264
        vrshrn.u16      d16, q8, #6
        vrshrn.u16      d17, q9, #6
  .else
        vadd.u16        q8, q8, q11
        vadd.u16        q9, q9, q11
        vshrn.u16       d16, q8, #6
        vshrn.u16       d17, q9, #6
  .endif
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8, q8, q10
  .endif
        vld1.8          {d6, d7}, [r1], r2
        vext.8          d7, d6, d7, #1
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             5b

        pop             {r4-r7, pc}
endfunc
.endm

/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
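/* Same layout as the mc8 macro above, but rows are only four pixels wide:
 * the two horizontal taps of a row share one d register (via vtrn.32) and
 * two output rows are narrowed and stored 32 bits at a time. */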
.macro  h264_chroma_mc4 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4, [sp, #20]
  .ifc \type,avg
        mov             lr, r0
  .endif
        pld             [r1]
        pld             [r1, r2]
  .ifc \codec,rv40
        movrel          r6, rv40bias
        lsr             r7, r5, #1
        add             r6, r6, r7, lsl #3
        lsr             r7, r4, #1
        add             r6, r6, r7, lsl #1
        vld1.16         {d22[],d23[]}, [r6,:16]
  .endif
A       muls            r7, r4, r5
T       mul             r7, r4, r5
T       cmp             r7, #0
        rsb             r6, r7, r5, lsl #3
        rsb             r12, r7, r4, lsl #3
        sub             r4, r7, r4, lsl #3
        sub             r4, r4, r5, lsl #3
        add             r4, r4, #64

        beq             2f

        add             r5, r1, r2

        vdup.8          d0, r4
        lsl             r4, r2, #1
        vdup.8          d1, r12
        vld1.8          {d4}, [r1], r4
        vdup.8          d2, r6
        vld1.8          {d6}, [r5], r4
        vdup.8          d3, r7

        vext.8          d5, d4, d5, #1
        vext.8          d7, d6, d7, #1
        vtrn.32         d4, d5
        vtrn.32         d6, d7

        vtrn.32         d0, d1
        vtrn.32         d2, d3

1:      pld             [r5]
        vmull.u8        q8, d4, d0
        vmlal.u8        q8, d6, d2
        vld1.8          {d4}, [r1], r4
        vext.8          d5, d4, d5, #1
        vtrn.32         d4, d5
        vmull.u8        q9, d6, d0
        vmlal.u8        q9, d4, d2
        vld1.8          {d6}, [r5], r4
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
  .ifc \codec,h264
        vrshrn.u16      d16, q8, #6
  .else
        vadd.u16        q8, q8, q11
        vshrn.u16       d16, q8, #6
  .endif
        subs            r3, r3, #2
        pld             [r1]
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        vext.8          d7, d6, d7, #1
        vtrn.32         d6, d7
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             1b

        pop             {r4-r7, pc}

2:      tst             r6, r6
        add             r12, r12, r6
        vdup.8          d0, r4
        vdup.8          d1, r12
        vtrn.32         d0, d1

        beq             4f

        vext.32         d1, d0, d1, #1
        add             r5, r1, r2
        lsl             r4, r2, #1
        vld1.32         {d4[0]}, [r1], r4
        vld1.32         {d4[1]}, [r5], r4

3:      pld             [r5]
        vmull.u8        q8, d4, d0
        vld1.32         {d4[0]}, [r1], r4
        vmull.u8        q9, d4, d1
        vld1.32         {d4[1]}, [r5], r4
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
  .ifc \codec,h264
        vrshrn.u16      d16, q8, #6
  .else
        vadd.u16        q8, q8, q11
        vshrn.u16       d16, q8, #6
  .endif
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        subs            r3, r3, #2
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             3b

        pop             {r4-r7, pc}

4:      vld1.8          {d4}, [r1], r2
        vld1.8          {d6}, [r1], r2
        vext.8          d5, d4, d5, #1
        vext.8          d7, d6, d7, #1
        vtrn.32         d4, d5
        vtrn.32         d6, d7

5:      vmull.u8        q8, d4, d0
        vmull.u8        q9, d6, d0
        subs            r3, r3, #2
        vld1.8          {d4}, [r1], r2
        vext.8          d5, d4, d5, #1
        vtrn.32         d4, d5
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        pld             [r1]
  .ifc \codec,h264
        vrshrn.u16      d16, q8, #6
  .else
        vadd.u16        q8, q8, q11
        vshrn.u16       d16, q8, #6
  .endif
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        vld1.8          {d6}, [r1], r2
        vext.8          d7, d6, d7, #1
        vtrn.32         d6, d7
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             5b

        pop             {r4-r7, pc}
endfunc
.endm
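/* chroma_mc2(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y)
 * 2-pixel-wide blocks, H.264 only: two 2-pixel output rows are computed
 * per iteration and stored as 16-bit halves of d16. */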
.macro  h264_chroma_mc2 type
function ff_\type\()_h264_chroma_mc2_neon, export=1
        push            {r4-r6, lr}
        ldr             r4, [sp, #16]
        ldr             lr, [sp, #20]
        pld             [r1]
        pld             [r1, r2]
        orrs            r5, r4, lr
        beq             2f

        mul             r5, r4, lr
        rsb             r6, r5, lr, lsl #3
        rsb             r12, r5, r4, lsl #3
        sub             r4, r5, r4, lsl #3
        sub             r4, r4, lr, lsl #3
        add             r4, r4, #64
        vdup.8          d0, r4
        vdup.8          d2, r12
        vdup.8          d1, r6
        vdup.8          d3, r5

        vtrn.16         q0, q1
1:
        vld1.32         {d4[0]}, [r1], r2
        vld1.32         {d4[1]}, [r1], r2
        vrev64.32       d5, d4
        vld1.32         {d5[1]}, [r1]
        vext.8          q3, q2, q2, #1
        vtrn.16         q2, q3
        vmull.u8        q8, d4, d0
        vmlal.u8        q8, d5, d1
  .ifc \type,avg
        vld1.16         {d18[0]}, [r0,:16], r2
        vld1.16         {d18[1]}, [r0,:16]
        sub             r0, r0, r2
  .endif
        vtrn.32         d16, d17
        vadd.i16        d16, d16, d17
        vrshrn.u16      d16, q8, #6
  .ifc \type,avg
        vrhadd.u8       d16, d16, d18
  .endif
        vst1.16         {d16[0]}, [r0,:16], r2
        vst1.16         {d16[1]}, [r0,:16], r2
        subs            r3, r3, #2
        bgt             1b

        pop             {r4-r6, pc}
2:
  .ifc \type,put
        ldrh_post       r5, r1, r2
        strh_post       r5, r0, r2
        ldrh_post       r6, r1, r2
        strh_post       r6, r0, r2
  .else
        vld1.16         {d16[0]}, [r1], r2
        vld1.16         {d16[1]}, [r1], r2
        vld1.16         {d18[0]}, [r0,:16], r2
        vld1.16         {d18[1]}, [r0,:16]
        sub             r0, r0, r2
        vrhadd.u8       d16, d16, d18
        vst1.16         {d16[0]}, [r0,:16], r2
        vst1.16         {d16[1]}, [r0,:16], r2
  .endif
        subs            r3, r3, #2
        bgt             2b

        pop             {r4-r6, pc}
endfunc
.endm

#if CONFIG_H264_DECODER
h264_chroma_mc8  put
h264_chroma_mc8  avg
h264_chroma_mc4  put
h264_chroma_mc4  avg
h264_chroma_mc2  put
h264_chroma_mc2  avg
#endif

#if CONFIG_RV40_DECODER
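/* RV40 rounding bias, indexed by (y >> 1, x >> 1); the rv40 variants add
 * this bias and truncate instead of using the fixed +32 rounding of the
 * H.264 versions. */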
const   rv40bias
        .short           0, 16, 32, 16
        .short          32, 28, 32, 28
        .short           0, 32, 16, 32
        .short          32, 28, 32, 28
endconst

h264_chroma_mc8  put, rv40
h264_chroma_mc8  avg, rv40
h264_chroma_mc4  put, rv40
h264_chroma_mc4  avg, rv40
#endif