/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "libavutil/arm/asm.S"

/* chroma_mc8(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
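/* Bilinear chroma interpolation. With the fractional offsets x and y (0-7),
 * the weights computed below are
 *     A = (8-x)*(8-y)   B = x*(8-y)   C = (8-x)*y   D = x*y
 * and each output pixel is (A*a + B*b + C*c + D*d + 32) >> 6 for H.264
 * (rounding narrowing shift); the RV40/VC-1 variants instead add a bias
 * (rv40bias table or the constant 28) held in q11 and use a truncating
 * shift.  In the mc8 code below A-D are duplicated into d0-d3. */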
.macro  h264_chroma_mc8 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4,  r5,  [sp, #20]
  .ifc \type,avg
        mov             lr,  r0
  .endif
        pld             [r1]
        pld             [r1, r2]

  .ifc \codec,rv40
        movrel          r6,  rv40bias
        lsr             r7,  r5,  #1
        add             r6,  r6,  r7,  lsl #3
        lsr             r7,  r4,  #1
        add             r6,  r6,  r7,  lsl #1
        vld1.16         {d22[],d23[]}, [r6,:16]
  .endif
  .ifc \codec,vc1
        vmov.u16        q11, #28
  .endif

A       muls            r7,  r4,  r5
T       mul             r7,  r4,  r5
T       cmp             r7,  #0
        rsb             r6,  r7,  r5,  lsl #3
        rsb             r12, r7,  r4,  lsl #3
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64

        beq             2f

        vdup.8          d0,  r4
        vdup.8          d1,  r12
        vld1.8          {d4, d5}, [r1], r2
        vdup.8          d2,  r6
        vdup.8          d3,  r7
        vext.8          d5,  d4,  d5,  #1

1:      vld1.8          {d6, d7}, [r1], r2
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vext.8          d7,  d6,  d7,  #1
        vld1.8          {d4, d5}, [r1], r2
        vmlal.u8        q8,  d6,  d2
        pld             [r1]
        vext.8          d5,  d4,  d5,  #1
        vmlal.u8        q8,  d7,  d3
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vmlal.u8        q9,  d7,  d1
        vmlal.u8        q9,  d4,  d2
        vmlal.u8        q9,  d5,  d3
        pld             [r1, r2]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vadd.u16        q9,  q9,  q11
        vshrn.u16       d16, q8,  #6
        vshrn.u16       d17, q9,  #6
  .endif
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
  .endif
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             1b

        pop             {r4-r7, pc}
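/* At least one of x and y is zero; take the cheaper paths below:
 * 3: vertical-only filter (x == 0), 4: horizontal-only filter (y == 0),
 * 5: weight-64 copy (x == y == 0). */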
2:      adds            r12, r12, r6
        vdup.8          d0,  r4
        beq             5f
        tst             r6,  r6
        vdup.8          d1,  r12
        beq             4f

        vld1.8          {d4}, [r1], r2
3:      vld1.8          {d6}, [r1], r2
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d1
        vld1.8          {d4}, [r1], r2
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d1
        pld             [r1]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vadd.u16        q9,  q9,  q11
        vshrn.u16       d16, q8,  #6
        vshrn.u16       d17, q9,  #6
  .endif
        pld             [r1, r2]
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
  .endif
        subs            r3,  r3,  #2
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             3b

        pop             {r4-r7, pc}

4:      vld1.8          {d4, d5}, [r1], r2
        vld1.8          {d6, d7}, [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        pld             [r1]
        subs            r3,  r3,  #2
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d7,  d1
        pld             [r1, r2]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vadd.u16        q9,  q9,  q11
        vshrn.u16       d16, q8,  #6
        vshrn.u16       d17, q9,  #6
  .endif
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
  .endif
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             4b

        pop             {r4-r7, pc}

5:      vld1.8          {d4}, [r1], r2
        vld1.8          {d5}, [r1], r2
        pld             [r1]
        subs            r3,  r3,  #2
        vmull.u8        q8,  d4,  d0
        vmull.u8        q9,  d5,  d0
        pld             [r1, r2]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vadd.u16        q9,  q9,  q11
        vshrn.u16       d16, q8,  #6
        vshrn.u16       d17, q9,  #6
  .endif
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
  .endif
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             5b

        pop             {r4-r7, pc}
endfunc
.endm
/* chroma_mc4(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
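/* Same scheme as mc8, but 4 pixels wide: a row and its 1-pixel shift are
 * packed into one d register with vtrn.32, so each vmull/vmlal applies both
 * horizontal taps at once and two output rows are produced per iteration. */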
.macro  h264_chroma_mc4 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4,  r5,  [sp, #20]
  .ifc \type,avg
        mov             lr,  r0
  .endif
        pld             [r1]
        pld             [r1, r2]

  .ifc \codec,rv40
        movrel          r6,  rv40bias
        lsr             r7,  r5,  #1
        add             r6,  r6,  r7,  lsl #3
        lsr             r7,  r4,  #1
        add             r6,  r6,  r7,  lsl #1
        vld1.16         {d22[],d23[]}, [r6,:16]
  .endif
  .ifc \codec,vc1
        vmov.u16        q11, #28
  .endif

A       muls            r7,  r4,  r5
T       mul             r7,  r4,  r5
T       cmp             r7,  #0
        rsb             r6,  r7,  r5,  lsl #3
        rsb             r12, r7,  r4,  lsl #3
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64

        beq             2f

        vdup.8          d0,  r4
        vdup.8          d1,  r12
        vld1.8          {d4}, [r1], r2
        vdup.8          d2,  r6
        vdup.8          d3,  r7
        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5
        vtrn.32         d0,  d1
        vtrn.32         d2,  d3

1:      vld1.8          {d6}, [r1], r2
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d6,  d7
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d2
        vld1.8          {d4}, [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5
        pld             [r1]
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d2
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vshrn.u16       d16, q8,  #6
  .endif
        subs            r3,  r3,  #2
        pld             [r1, r2]
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             1b

        pop             {r4-r7, pc}
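/* x*y == 0: same reduced paths as in the mc8 macro
 * (3: vertical only, 4: horizontal only, 5: copy). */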
2:      adds            r12, r12, r6
        vdup.8          d0,  r4
        beq             5f
        tst             r6,  r6
        vdup.8          d1,  r12
        vtrn.32         d0,  d1
        beq             4f

        vext.32         d1,  d0,  d1,  #1
        vld1.32         {d4[0]}, [r1], r2
3:      vld1.32         {d4[1]}, [r1], r2
        vmull.u8        q8,  d4,  d0
        vld1.32         {d4[0]}, [r1], r2
        vmull.u8        q9,  d4,  d1
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        pld             [r1]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vshrn.u16       d16, q8,  #6
  .endif
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        subs            r3,  r3,  #2
        pld             [r1, r2]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             3b

        pop             {r4-r7, pc}

4:      vld1.8          {d4}, [r1], r2
        vld1.8          {d6}, [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d4,  d5
        vtrn.32         d6,  d7
        vmull.u8        q8,  d4,  d0
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        pld             [r1]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vshrn.u16       d16, q8,  #6
  .endif
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             4b

        pop             {r4-r7, pc}

5:      vld1.32         {d4[0]}, [r1], r2
        vld1.32         {d4[1]}, [r1], r2
        vmull.u8        q8,  d4,  d0
        subs            r3,  r3,  #2
        pld             [r1]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vshrn.u16       d16, q8,  #6
  .endif
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             5b

        pop             {r4-r7, pc}
endfunc
.endm
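/* chroma_mc2(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y)
 * 2-pixel-wide variant; x and y are read from the stack at #16/#20 since
 * only r4-r6 and lr are saved here. */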
.macro  h264_chroma_mc2 type
function ff_\type\()_h264_chroma_mc2_neon, export=1
        push            {r4-r6, lr}
        ldr             r4,  [sp, #16]
        ldr             lr,  [sp, #20]
        pld             [r1]
        pld             [r1, r2]
        orrs            r5,  r4,  lr
        beq             2f

        mul             r5,  r4,  lr
        rsb             r6,  r5,  lr,  lsl #3
        rsb             r12, r5,  r4,  lsl #3
        sub             r4,  r5,  r4,  lsl #3
        sub             r4,  r4,  lr,  lsl #3
        add             r4,  r4,  #64
        vdup.8          d0,  r4
        vdup.8          d2,  r12
        vdup.8          d1,  r6
        vdup.8          d3,  r5
        vtrn.16         q0,  q1
1:
        vld1.32         {d4[0]}, [r1], r2
        vld1.32         {d4[1]}, [r1], r2
        vrev64.32       d5,  d4
        vld1.32         {d5[1]}, [r1]
        vext.8          q3,  q2,  q2,  #1
        vtrn.16         q2,  q3
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
  .ifc \type,avg
        vld1.16         {d18[0]}, [r0,:16], r2
        vld1.16         {d18[1]}, [r0,:16]
        sub             r0,  r0,  r2
  .endif
        vtrn.32         d16, d17
        vadd.i16        d16, d16, d17
        vrshrn.u16      d16, q8,  #6
  .ifc \type,avg
        vrhadd.u8       d16, d16, d18
  .endif
        vst1.16         {d16[0]}, [r0,:16], r2
        vst1.16         {d16[1]}, [r0,:16], r2
        subs            r3,  r3,  #2
        bgt             1b

        pop             {r4-r6, pc}
2:
  .ifc \type,put
        ldrh_post       r5,  r1,  r2
        strh_post       r5,  r0,  r2
        ldrh_post       r6,  r1,  r2
        strh_post       r6,  r0,  r2
  .else
        vld1.16         {d16[0]}, [r1], r2
        vld1.16         {d16[1]}, [r1], r2
        vld1.16         {d18[0]}, [r0,:16], r2
        vld1.16         {d18[1]}, [r0,:16]
        sub             r0,  r0,  r2
        vrhadd.u8       d16, d16, d18
        vst1.16         {d16[0]}, [r0,:16], r2
        vst1.16         {d16[1]}, [r0,:16], r2
  .endif
        subs            r3,  r3,  #2
        bgt             2b

        pop             {r4-r6, pc}
endfunc
.endm
        h264_chroma_mc8 put
        h264_chroma_mc8 avg
        h264_chroma_mc4 put
        h264_chroma_mc4 avg
        h264_chroma_mc2 put
        h264_chroma_mc2 avg
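/* The RV40 and VC-1 functions reuse the macros above; the \codec argument
 * selects the bias addition and the truncating vshrn in place of vrshrn. */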
#if CONFIG_RV40_DECODER
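/* Rounding bias for RV40, indexed as rv40bias[y >> 1][x >> 1]
 * (4x4 table of 16-bit values, replicated into q11 by the macros above). */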
const   rv40bias
        .short           0, 16, 32, 16
        .short          32, 28, 32, 28
        .short           0, 32, 16, 32
        .short          32, 28, 32, 28
endconst

        h264_chroma_mc8 put, rv40
        h264_chroma_mc8 avg, rv40
        h264_chroma_mc4 put, rv40
        h264_chroma_mc4 avg, rv40
#endif
#if CONFIG_VC1_DECODER
        h264_chroma_mc8 put, vc1
        h264_chroma_mc8 avg, vc1
        h264_chroma_mc4 put, vc1
        h264_chroma_mc4 avg, vc1
#endif