/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

        .fpu neon
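
/* transpose_8x8: in-register transpose of an 8x8 byte matrix held in
 * eight d (or four q) registers, built from vtrn steps at 32-, 16- and
 * 8-bit granularity.  swap4 and transpose16_4x4 are the corresponding
 * steps for 4x4 blocks of 16-bit elements, used by the 2-D lowpass
 * filter further down. */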
        .macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7
        vtrn.32 \r0, \r4
        vtrn.32 \r1, \r5
        vtrn.32 \r2, \r6
        vtrn.32 \r3, \r7
        vtrn.16 \r0, \r2
        vtrn.16 \r1, \r3
        vtrn.16 \r4, \r6
        vtrn.16 \r5, \r7
        vtrn.8  \r0, \r1
        vtrn.8  \r2, \r3
        vtrn.8  \r4, \r5
        vtrn.8  \r6, \r7
        .endm

        .macro swap4 r0 r1 r2 r3 r4 r5 r6 r7
        vswp \r0, \r4
        vswp \r1, \r5
        vswp \r2, \r6
        vswp \r3, \r7
        .endm

        .macro transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7
        vtrn.32 \r0, \r2
        vtrn.32 \r1, \r3
        vtrn.32 \r4, \r6
        vtrn.32 \r5, \r7
        vtrn.16 \r0, \r1
        vtrn.16 \r2, \r3
        vtrn.16 \r4, \r5
        vtrn.16 \r6, \r7
        .endm

/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
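/* Bilinear interpolation as specified for H.264 chroma MC:
 *   dst = ((8-x)*(8-y)*A + x*(8-y)*B + (8-x)*y*C + x*y*D + 32) >> 6
 * where A..D are the four pixels around the fractional position.
 * Below, the weights end up as r4 = (8-x)*(8-y), ip = x*(8-y),
 * r6 = (8-x)*y and r7 = x*y; the flags set by muls steer the branches
 * to the cheaper 1-D cases when x == 0 or y == 0. */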
        .macro h264_chroma_mc8 avg=0
        push {r4-r7, lr}
        ldrd r4, [sp, #20]
        .if \avg
        mov lr, r0
        .endif
        pld [r1]
        pld [r1, r2]
        muls r7, r4, r5
        rsb r6, r7, r5, lsl #3
        rsb ip, r7, r4, lsl #3
        sub r4, r7, r4, lsl #3
        sub r4, r4, r5, lsl #3
        add r4, r4, #64
        beq 2f
        add r5, r1, r2
        vdup.8 d0, r4
        lsl r4, r2, #1
        vdup.8 d1, ip
        vld1.64 {d4, d5}, [r1], r4
        vdup.8 d2, r6
        vld1.64 {d6, d7}, [r5], r4
        vdup.8 d3, r7
        vext.8 d5, d4, d5, #1
        vext.8 d7, d6, d7, #1
1:      pld [r5]
        vmull.u8 q8, d4, d0
        vmlal.u8 q8, d5, d1
        vld1.64 {d4, d5}, [r1], r4
        vmlal.u8 q8, d6, d2
        vext.8 d5, d4, d5, #1
        vmlal.u8 q8, d7, d3
        vmull.u8 q9, d6, d0
        subs r3, r3, #2
        vmlal.u8 q9, d7, d1
        vmlal.u8 q9, d4, d2
        vmlal.u8 q9, d5, d3
        vrshrn.u16 d16, q8, #6
        vld1.64 {d6, d7}, [r5], r4
        pld [r1]
        vrshrn.u16 d17, q9, #6
        .if \avg
        vld1.64 {d20}, [lr,:64], r2
        vld1.64 {d21}, [lr,:64], r2
        vrhadd.u8 q8, q8, q10
        .endif
        vext.8 d7, d6, d7, #1
        vst1.64 {d16}, [r0,:64], r2
        vst1.64 {d17}, [r0,:64], r2
        bgt 1b
        pop {r4-r7, pc}
2:      tst r6, r6
        add ip, ip, r6
        vdup.8 d0, r4
        vdup.8 d1, ip
        beq 4f
        add r5, r1, r2
        lsl r4, r2, #1
        vld1.64 {d4}, [r1], r4
        vld1.64 {d6}, [r5], r4
3:      pld [r5]
        vmull.u8 q8, d4, d0
        vmlal.u8 q8, d6, d1
        vld1.64 {d4}, [r1], r4
        vmull.u8 q9, d6, d0
        vmlal.u8 q9, d4, d1
        vld1.64 {d6}, [r5], r4
        vrshrn.u16 d16, q8, #6
        vrshrn.u16 d17, q9, #6
        .if \avg
        vld1.64 {d20}, [lr,:64], r2
        vld1.64 {d21}, [lr,:64], r2
        vrhadd.u8 q8, q8, q10
        .endif
        subs r3, r3, #2
        pld [r1]
        vst1.64 {d16}, [r0,:64], r2
        vst1.64 {d17}, [r0,:64], r2
        bgt 3b
        pop {r4-r7, pc}
4:      vld1.64 {d4, d5}, [r1], r2
        vld1.64 {d6, d7}, [r1], r2
        vext.8 d5, d4, d5, #1
        vext.8 d7, d6, d7, #1
5:      pld [r1]
        subs r3, r3, #2
        vmull.u8 q8, d4, d0
        vmlal.u8 q8, d5, d1
        vld1.64 {d4, d5}, [r1], r2
        vmull.u8 q9, d6, d0
        vmlal.u8 q9, d7, d1
        pld [r1]
        vext.8 d5, d4, d5, #1
        vrshrn.u16 d16, q8, #6
        vrshrn.u16 d17, q9, #6
        .if \avg
        vld1.64 {d20}, [lr,:64], r2
        vld1.64 {d21}, [lr,:64], r2
        vrhadd.u8 q8, q8, q10
        .endif
        vld1.64 {d6, d7}, [r1], r2
        vext.8 d7, d6, d7, #1
        vst1.64 {d16}, [r0,:64], r2
        vst1.64 {d17}, [r0,:64], r2
        bgt 5b
        pop {r4-r7, pc}
        .endm

/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
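/* 4-pixel-wide variant of the same bilinear filter.  Each row and its
 * 1-pixel-shifted copy are interleaved into a single d register with
 * vtrn.32 (weights likewise), so one vmull/vmlal covers both
 * horizontal taps and the vadd.i16 of the two halves completes the
 * 2-D sum. */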
        .macro h264_chroma_mc4 avg=0
        push {r4-r7, lr}
        ldrd r4, [sp, #20]
        .if \avg
        mov lr, r0
        .endif
        pld [r1]
        pld [r1, r2]
        muls r7, r4, r5
        rsb r6, r7, r5, lsl #3
        rsb ip, r7, r4, lsl #3
        sub r4, r7, r4, lsl #3
        sub r4, r4, r5, lsl #3
        add r4, r4, #64
        beq 2f
        add r5, r1, r2
        vdup.8 d0, r4
        lsl r4, r2, #1
        vdup.8 d1, ip
        vld1.64 {d4}, [r1], r4
        vdup.8 d2, r6
        vld1.64 {d6}, [r5], r4
        vdup.8 d3, r7
        vext.8 d5, d4, d5, #1
        vext.8 d7, d6, d7, #1
        vtrn.32 d4, d5
        vtrn.32 d6, d7
        vtrn.32 d0, d1
        vtrn.32 d2, d3
1:      pld [r5]
        vmull.u8 q8, d4, d0
        vmlal.u8 q8, d6, d2
        vld1.64 {d4}, [r1], r4
        vext.8 d5, d4, d5, #1
        vtrn.32 d4, d5
        vmull.u8 q9, d6, d0
        vmlal.u8 q9, d4, d2
        vld1.64 {d6}, [r5], r4
        vadd.i16 d16, d16, d17
        vadd.i16 d17, d18, d19
        vrshrn.u16 d16, q8, #6
        subs r3, r3, #2
        pld [r1]
        .if \avg
        vld1.32 {d20[0]}, [lr,:32], r2
        vld1.32 {d20[1]}, [lr,:32], r2
        vrhadd.u8 d16, d16, d20
        .endif
        vext.8 d7, d6, d7, #1
        vtrn.32 d6, d7
        vst1.32 {d16[0]}, [r0,:32], r2
        vst1.32 {d16[1]}, [r0,:32], r2
        bgt 1b
        pop {r4-r7, pc}
2:      tst r6, r6
        add ip, ip, r6
        vdup.8 d0, r4
        vdup.8 d1, ip
        vtrn.32 d0, d1
        beq 4f
        vext.32 d1, d0, d1, #1
        add r5, r1, r2
        lsl r4, r2, #1
        vld1.32 {d4[0]}, [r1], r4
        vld1.32 {d4[1]}, [r5], r4
3:      pld [r5]
        vmull.u8 q8, d4, d0
        vld1.32 {d4[0]}, [r1], r4
        vmull.u8 q9, d4, d1
        vld1.32 {d4[1]}, [r5], r4
        vadd.i16 d16, d16, d17
        vadd.i16 d17, d18, d19
        vrshrn.u16 d16, q8, #6
        .if \avg
        vld1.32 {d20[0]}, [lr,:32], r2
        vld1.32 {d20[1]}, [lr,:32], r2
        vrhadd.u8 d16, d16, d20
        .endif
        subs r3, r3, #2
        pld [r1]
        vst1.32 {d16[0]}, [r0,:32], r2
        vst1.32 {d16[1]}, [r0,:32], r2
        bgt 3b
        pop {r4-r7, pc}
4:      vld1.64 {d4}, [r1], r2
        vld1.64 {d6}, [r1], r2
        vext.8 d5, d4, d5, #1
        vext.8 d7, d6, d7, #1
        vtrn.32 d4, d5
        vtrn.32 d6, d7
5:      vmull.u8 q8, d4, d0
        vmull.u8 q9, d6, d0
        subs r3, r3, #2
        vld1.64 {d4}, [r1], r2
        vext.8 d5, d4, d5, #1
        vtrn.32 d4, d5
        vadd.i16 d16, d16, d17
        vadd.i16 d17, d18, d19
        pld [r1]
        vrshrn.u16 d16, q8, #6
        .if \avg
        vld1.32 {d20[0]}, [lr,:32], r2
        vld1.32 {d20[1]}, [lr,:32], r2
        vrhadd.u8 d16, d16, d20
        .endif
        vld1.64 {d6}, [r1], r2
        vext.8 d7, d6, d7, #1
        vtrn.32 d6, d7
        pld [r1]
        vst1.32 {d16[0]}, [r0,:32], r2
        vst1.32 {d16[1]}, [r0,:32], r2
        bgt 5b
        pop {r4-r7, pc}
        .endm

        .text
        .align

function ff_put_h264_chroma_mc8_neon, export=1
        h264_chroma_mc8
        .endfunc

function ff_avg_h264_chroma_mc8_neon, export=1
        h264_chroma_mc8 avg=1
        .endfunc

function ff_put_h264_chroma_mc4_neon, export=1
        h264_chroma_mc4
        .endfunc

function ff_avg_h264_chroma_mc4_neon, export=1
        h264_chroma_mc4 avg=1
        .endfunc

/* H.264 loop filter */
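/* h264_loop_filter_start: common entry code.  The tc0 pointer is the
 * fifth argument, passed on the stack; its four bytes are loaded into
 * d24[0].  The filter returns immediately when alpha or beta is zero,
 * or when all four tc0 values are negative (nothing to filter). */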
        .macro h264_loop_filter_start
        ldr ip, [sp]
        tst r2, r2
        ldr ip, [ip]
        tstne r3, r3
        vmov.32 d24[0], ip
        and ip, ip, ip, lsl #16
        bxeq lr
        ands ip, ip, ip, lsl #8
        bxlt lr
        .endm

        .macro align_push_regs
        and ip, sp, #15
        add ip, ip, #32
        sub sp, sp, ip
        vst1.64 {d12-d15}, [sp,:128]
        sub sp, sp, #32
        vst1.64 {d8-d11}, [sp,:128]
        .endm

        .macro align_pop_regs
        vld1.64 {d8-d11}, [sp,:128]!
        vld1.64 {d12-d15}, [sp,:128], ip
        .endm
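
/* Luma deblocking.  A pixel pair is filtered when
 *   |p0-q0| < alpha && |p1-p0| < beta && |q1-q0| < beta
 * and p1/q1 are additionally adjusted when |p2-p0| < beta resp.
 * |q2-q0| < beta, each such condition also widening the clip range tc
 * by 1.  The p0/q0 update uses
 *   delta = clip((((q0-p0)<<2) + (p1-q1) + 4) >> 3, -tc, tc)
 * with the tc0 bytes expanded to one per pixel in q12. */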
        .macro h264_loop_filter_luma
        vdup.8 q11, r2          @ alpha
        vmovl.u8 q12, d24
        vabd.u8 q6, q8, q0      @ abs(p0 - q0)
        vmovl.u16 q12, d24
        vabd.u8 q14, q9, q8     @ abs(p1 - p0)
        vsli.16 q12, q12, #8
        vabd.u8 q15, q1, q0     @ abs(q1 - q0)
        vsli.32 q12, q12, #16
        vclt.u8 q6, q6, q11     @ < alpha
        vdup.8 q11, r3          @ beta
        vclt.s8 q7, q12, #0
        vclt.u8 q14, q14, q11   @ < beta
        vclt.u8 q15, q15, q11   @ < beta
        vbic q6, q6, q7
        vabd.u8 q4, q10, q8     @ abs(p2 - p0)
        vand q6, q6, q14
        vabd.u8 q5, q2, q0      @ abs(q2 - q0)
        vclt.u8 q4, q4, q11     @ < beta
        vand q6, q6, q15
        vclt.u8 q5, q5, q11     @ < beta
        vand q4, q4, q6
        vand q5, q5, q6
        vand q12, q12, q6
        vrhadd.u8 q14, q8, q0
        vsub.i8 q6, q12, q4
        vqadd.u8 q7, q9, q12
        vhadd.u8 q10, q10, q14
        vsub.i8 q6, q6, q5
        vhadd.u8 q14, q2, q14
        vmin.u8 q7, q7, q10
        vqsub.u8 q11, q9, q12
        vqadd.u8 q2, q1, q12
        vmax.u8 q7, q7, q11
        vqsub.u8 q11, q1, q12
        vmin.u8 q14, q2, q14
        vmovl.u8 q2, d0
        vmax.u8 q14, q14, q11
        vmovl.u8 q10, d1
        vsubw.u8 q2, q2, d16
        vsubw.u8 q10, q10, d17
        vshl.i16 q2, q2, #2
        vshl.i16 q10, q10, #2
        vaddw.u8 q2, q2, d18
        vaddw.u8 q10, q10, d19
        vsubw.u8 q2, q2, d2
        vsubw.u8 q10, q10, d3
        vrshrn.i16 d4, q2, #3
        vrshrn.i16 d5, q10, #3
        vbsl q4, q7, q9
        vbsl q5, q14, q1
        vneg.s8 q7, q6
        vmovl.u8 q14, d16
        vmin.s8 q2, q2, q6
        vmovl.u8 q6, d17
        vmax.s8 q2, q2, q7
        vmovl.u8 q11, d0
        vmovl.u8 q12, d1
        vaddw.s8 q14, q14, d4
        vaddw.s8 q6, q6, d5
        vsubw.s8 q11, q11, d4
        vsubw.s8 q12, q12, d5
        vqmovun.s16 d16, q14
        vqmovun.s16 d17, q6
        vqmovun.s16 d0, q11
        vqmovun.s16 d1, q12
        .endm

function ff_h264_v_loop_filter_luma_neon, export=1
        h264_loop_filter_start
        vld1.64 {d0, d1}, [r0,:128], r1
        vld1.64 {d2, d3}, [r0,:128], r1
        vld1.64 {d4, d5}, [r0,:128], r1
        sub r0, r0, r1, lsl #2
        sub r0, r0, r1, lsl #1
        vld1.64 {d20,d21}, [r0,:128], r1
        vld1.64 {d18,d19}, [r0,:128], r1
        vld1.64 {d16,d17}, [r0,:128], r1
        align_push_regs
        h264_loop_filter_luma
        sub r0, r0, r1, lsl #1
        vst1.64 {d8, d9}, [r0,:128], r1
        vst1.64 {d16,d17}, [r0,:128], r1
        vst1.64 {d0, d1}, [r0,:128], r1
        vst1.64 {d10,d11}, [r0,:128]
        align_pop_regs
        bx lr
        .endfunc

function ff_h264_h_loop_filter_luma_neon, export=1
        h264_loop_filter_start
        sub r0, r0, #4
        vld1.64 {d6}, [r0], r1
        vld1.64 {d20}, [r0], r1
        vld1.64 {d18}, [r0], r1
        vld1.64 {d16}, [r0], r1
        vld1.64 {d0}, [r0], r1
        vld1.64 {d2}, [r0], r1
        vld1.64 {d4}, [r0], r1
        vld1.64 {d26}, [r0], r1
        vld1.64 {d7}, [r0], r1
        vld1.64 {d21}, [r0], r1
        vld1.64 {d19}, [r0], r1
        vld1.64 {d17}, [r0], r1
        vld1.64 {d1}, [r0], r1
        vld1.64 {d3}, [r0], r1
        vld1.64 {d5}, [r0], r1
        vld1.64 {d27}, [r0], r1
        transpose_8x8 q3, q10, q9, q8, q0, q1, q2, q13
        align_push_regs
        sub sp, sp, #16
        vst1.64 {d4, d5}, [sp,:128]
        sub sp, sp, #16
        vst1.64 {d20,d21}, [sp,:128]
        h264_loop_filter_luma
        vld1.64 {d20,d21}, [sp,:128]!
        vld1.64 {d4, d5}, [sp,:128]!
        transpose_8x8 q3, q10, q4, q8, q0, q5, q2, q13
        sub r0, r0, r1, lsl #4
        vst1.64 {d6}, [r0], r1
        vst1.64 {d20}, [r0], r1
        vst1.64 {d8}, [r0], r1
        vst1.64 {d16}, [r0], r1
        vst1.64 {d0}, [r0], r1
        vst1.64 {d10}, [r0], r1
        vst1.64 {d4}, [r0], r1
        vst1.64 {d26}, [r0], r1
        vst1.64 {d7}, [r0], r1
        vst1.64 {d21}, [r0], r1
        vst1.64 {d9}, [r0], r1
        vst1.64 {d17}, [r0], r1
        vst1.64 {d1}, [r0], r1
        vst1.64 {d11}, [r0], r1
        vst1.64 {d5}, [r0], r1
        vst1.64 {d27}, [r0], r1
        align_pop_regs
        bx lr
        .endfunc
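
/* Chroma deblocking: the same alpha/beta tests on p1..q1, with
 *   delta = clip((((q0-p0)<<2) + (p1-q1) + 4) >> 3, -tc, tc)
 * applied as p0 += delta, q0 -= delta; p1 and q1 are never modified.
 * tc is taken from the per-edge tc0 bytes, duplicated across pixel
 * pairs by the vsli.16. */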
        .macro h264_loop_filter_chroma
        vdup.8 d22, r2          @ alpha
        vmovl.u8 q12, d24
        vabd.u8 d26, d16, d0    @ abs(p0 - q0)
        vmovl.u8 q2, d0
        vabd.u8 d28, d18, d16   @ abs(p1 - p0)
        vsubw.u8 q2, q2, d16
        vsli.16 d24, d24, #8
        vshl.i16 q2, q2, #2
        vabd.u8 d30, d2, d0     @ abs(q1 - q0)
        vaddw.u8 q2, q2, d18
        vclt.u8 d26, d26, d22   @ < alpha
        vsubw.u8 q2, q2, d2
        vdup.8 d22, r3          @ beta
        vclt.s8 d25, d24, #0
        vrshrn.i16 d4, q2, #3
        vclt.u8 d28, d28, d22   @ < beta
        vbic d26, d26, d25
        vclt.u8 d30, d30, d22   @ < beta
        vand d26, d26, d28
        vneg.s8 d25, d24
        vand d26, d26, d30
        vmin.s8 d4, d4, d24
        vmovl.u8 q14, d16
        vand d4, d4, d26
        vmax.s8 d4, d4, d25
        vmovl.u8 q11, d0
        vaddw.s8 q14, q14, d4
        vsubw.s8 q11, q11, d4
        vqmovun.s16 d16, q14
        vqmovun.s16 d0, q11
        .endm

function ff_h264_v_loop_filter_chroma_neon, export=1
        h264_loop_filter_start
        sub r0, r0, r1, lsl #1
        vld1.64 {d18}, [r0,:64], r1
        vld1.64 {d16}, [r0,:64], r1
        vld1.64 {d0}, [r0,:64], r1
        vld1.64 {d2}, [r0,:64]
        h264_loop_filter_chroma
        sub r0, r0, r1, lsl #1
        vst1.64 {d16}, [r0,:64], r1
        vst1.64 {d0}, [r0,:64], r1
        bx lr
        .endfunc

function ff_h264_h_loop_filter_chroma_neon, export=1
        h264_loop_filter_start
        sub r0, r0, #2
        vld1.32 {d18[0]}, [r0], r1
        vld1.32 {d16[0]}, [r0], r1
        vld1.32 {d0[0]}, [r0], r1
        vld1.32 {d2[0]}, [r0], r1
        vld1.32 {d18[1]}, [r0], r1
        vld1.32 {d16[1]}, [r0], r1
        vld1.32 {d0[1]}, [r0], r1
        vld1.32 {d2[1]}, [r0], r1
        vtrn.16 d18, d0
        vtrn.16 d16, d2
        vtrn.8 d18, d16
        vtrn.8 d0, d2
        h264_loop_filter_chroma
        vtrn.16 d18, d0
        vtrn.16 d16, d2
        vtrn.8 d18, d16
        vtrn.8 d0, d2
        sub r0, r0, r1, lsl #3
        vst1.32 {d18[0]}, [r0], r1
        vst1.32 {d16[0]}, [r0], r1
        vst1.32 {d0[0]}, [r0], r1
        vst1.32 {d2[0]}, [r0], r1
        vst1.32 {d18[1]}, [r0], r1
        vst1.32 {d16[1]}, [r0], r1
        vst1.32 {d0[1]}, [r0], r1
        vst1.32 {d2[1]}, [r0], r1
        bx lr
        .endfunc

/* H.264 qpel MC */
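/* All half-sample positions are produced with the 6-tap filter from
 * the H.264 spec:
 *   out = clip((x[-2] - 5*x[-1] + 20*x[0] + 20*x[1] - 5*x[2] + x[3] + 16) >> 5)
 * lowpass_const packs the two constants into d6 (movw/movt builds
 * 0x00140005, i.e. d6[0] = 5 and d6[1] = 20 as 16-bit lanes) so that
 * vmla.i16/vmls.i16 can use them as scalar operands. */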
        .macro lowpass_const r
        movw \r, #5
        movt \r, #20
        vmov.32 d6[0], \r
        .endm

        .macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
        .if \narrow
t0      .req q0
t1      .req q8
        .else
t0      .req \d0
t1      .req \d1
        .endif
        vext.8 d2, \r0, \r1, #2
        vext.8 d3, \r0, \r1, #3
        vaddl.u8 q1, d2, d3
        vext.8 d4, \r0, \r1, #1
        vext.8 d5, \r0, \r1, #4
        vaddl.u8 q2, d4, d5
        vext.8 d30, \r0, \r1, #5
        vaddl.u8 t0, \r0, d30
        vext.8 d18, \r2, \r3, #2
        vmla.i16 t0, q1, d6[1]
        vext.8 d19, \r2, \r3, #3
        vaddl.u8 q9, d18, d19
        vext.8 d20, \r2, \r3, #1
        vmls.i16 t0, q2, d6[0]
        vext.8 d21, \r2, \r3, #4
        vaddl.u8 q10, d20, d21
        vext.8 d31, \r2, \r3, #5
        vaddl.u8 t1, \r2, d31
        vmla.i16 t1, q9, d6[1]
        vmls.i16 t1, q10, d6[0]
        .if \narrow
        vqrshrun.s16 \d0, t0, #5
        vqrshrun.s16 \d1, t1, #5
        .endif
        .unreq t0
        .unreq t1
        .endm

        .macro lowpass_8_1 r0, r1, d0, narrow=1
        .if \narrow
t0      .req q0
        .else
t0      .req \d0
        .endif
        vext.8 d2, \r0, \r1, #2
        vext.8 d3, \r0, \r1, #3
        vaddl.u8 q1, d2, d3
        vext.8 d4, \r0, \r1, #1
        vext.8 d5, \r0, \r1, #4
        vaddl.u8 q2, d4, d5
        vext.8 d30, \r0, \r1, #5
        vaddl.u8 t0, \r0, d30
        vmla.i16 t0, q1, d6[1]
        vmls.i16 t0, q2, d6[0]
        .if \narrow
        vqrshrun.s16 \d0, t0, #5
        .endif
        .unreq t0
        .endm
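
/* lowpass_8.16: the same 6-tap kernel applied to the 16-bit output of
 * a first (horizontal) pass.  Sums are widened to 32 bits, 20*x is
 * formed as (x<<4) + (x<<2) and 5*x as x + (x<<2), and the combined
 * rounding shift is 10 (5 bits per pass). */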
        .macro lowpass_8.16 r0, r1, l0, h0, l1, h1, d
        vext.16 q1, \r0, \r1, #2
        vext.16 q0, \r0, \r1, #3
        vaddl.s16 q9, d2, d0
        vext.16 q2, \r0, \r1, #1
        vaddl.s16 q1, d3, d1
        vext.16 q3, \r0, \r1, #4
        vaddl.s16 q10, d4, d6
        vext.16 \r1, \r0, \r1, #5
        vaddl.s16 q2, d5, d7
        vaddl.s16 q0, \h0, \h1
        vaddl.s16 q8, \l0, \l1
        vshl.i32 q3, q9, #4
        vshl.i32 q9, q9, #2
        vshl.i32 q15, q10, #2
        vadd.i32 q9, q9, q3
        vadd.i32 q10, q10, q15
        vshl.i32 q3, q1, #4
        vshl.i32 q1, q1, #2
        vshl.i32 q15, q2, #2
        vadd.i32 q1, q1, q3
        vadd.i32 q2, q2, q15
        vadd.i32 q9, q9, q8
        vsub.i32 q9, q9, q10
        vadd.i32 q1, q1, q0
        vsub.i32 q1, q1, q2
        vrshrn.s32 d18, q9, #10
        vrshrn.s32 d19, q1, #10
        vqmovun.s16 \d, q9
        .endm

function put_h264_qpel16_h_lowpass_neon_packed
        mov r4, lr
        mov ip, #16
        mov r3, #8
        bl put_h264_qpel8_h_lowpass_neon
        sub r1, r1, r2, lsl #4
        add r1, r1, #8
        mov ip, #16
        mov lr, r4
        b put_h264_qpel8_h_lowpass_neon
        .endfunc

function put_h264_qpel16_h_lowpass_neon
        push {lr}
        mov ip, #16
        bl put_h264_qpel8_h_lowpass_neon
        sub r0, r0, r3, lsl #4
        sub r1, r1, r2, lsl #4
        add r0, r0, #8
        add r1, r1, #8
        mov ip, #16
        pop {lr}
        .endfunc

function put_h264_qpel8_h_lowpass_neon
1:      vld1.64 {d0, d1}, [r1], r2
        vld1.64 {d16,d17}, [r1], r2
        subs ip, ip, #2
        lowpass_8 d0, d1, d16, d17, d0, d16
        vst1.64 {d0}, [r0,:64], r3
        vst1.64 {d16}, [r0,:64], r3
        bne 1b
        bx lr
        .endfunc
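
/* The _l2 variants round-average the lowpass output with a second
 * reference (vrhadd.u8); this is how the quarter-sample positions
 * between a half-sample result and a neighbouring sample are formed. */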
function put_h264_qpel16_h_lowpass_l2_neon
        push {lr}
        mov ip, #16
        bl put_h264_qpel8_h_lowpass_l2_neon
        sub r0, r0, r2, lsl #4
        sub r1, r1, r2, lsl #4
        sub r3, r3, r2, lsl #4
        add r0, r0, #8
        add r1, r1, #8
        add r3, r3, #8
        mov ip, #16
        pop {lr}
        .endfunc

function put_h264_qpel8_h_lowpass_l2_neon
1:      vld1.64 {d0, d1}, [r1], r2
        vld1.64 {d16,d17}, [r1], r2
        vld1.64 {d28}, [r3], r2
        vld1.64 {d29}, [r3], r2
        subs ip, ip, #2
        lowpass_8 d0, d1, d16, d17, d0, d1
        vrhadd.u8 q0, q0, q14
        vst1.64 {d0}, [r0,:64], r2
        vst1.64 {d1}, [r0,:64], r2
        bne 1b
        bx lr
        .endfunc

function put_h264_qpel16_v_lowpass_neon_packed
        mov r4, lr
        mov r2, #8
        bl put_h264_qpel8_v_lowpass_neon
        sub r1, r1, r3, lsl #2
        bl put_h264_qpel8_v_lowpass_neon
        sub r1, r1, r3, lsl #4
        sub r1, r1, r3, lsl #2
        add r1, r1, #8
        bl put_h264_qpel8_v_lowpass_neon
        sub r1, r1, r3, lsl #2
        mov lr, r4
        b put_h264_qpel8_v_lowpass_neon
        .endfunc

function put_h264_qpel16_v_lowpass_neon
        mov r4, lr
        bl put_h264_qpel8_v_lowpass_neon
        sub r1, r1, r3, lsl #2
        bl put_h264_qpel8_v_lowpass_neon
        sub r0, r0, r2, lsl #4
        add r0, r0, #8
        sub r1, r1, r3, lsl #4
        sub r1, r1, r3, lsl #2
        add r1, r1, #8
        bl put_h264_qpel8_v_lowpass_neon
        sub r1, r1, r3, lsl #2
        mov lr, r4
        .endfunc

function put_h264_qpel8_v_lowpass_neon
        vld1.64 {d8}, [r1], r3
        vld1.64 {d10}, [r1], r3
        vld1.64 {d12}, [r1], r3
        vld1.64 {d14}, [r1], r3
        vld1.64 {d22}, [r1], r3
        vld1.64 {d24}, [r1], r3
        vld1.64 {d26}, [r1], r3
        vld1.64 {d28}, [r1], r3
        vld1.64 {d9}, [r1], r3
        vld1.64 {d11}, [r1], r3
        vld1.64 {d13}, [r1], r3
        vld1.64 {d15}, [r1], r3
        vld1.64 {d23}, [r1]
        transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
        lowpass_8 d8, d9, d10, d11, d8, d10
        lowpass_8 d12, d13, d14, d15, d12, d14
        lowpass_8 d22, d23, d24, d25, d22, d24
        lowpass_8 d26, d27, d28, d29, d26, d28
        transpose_8x8 d8, d10, d12, d14, d22, d24, d26, d28
        vst1.64 {d8}, [r0,:64], r2
        vst1.64 {d10}, [r0,:64], r2
        vst1.64 {d12}, [r0,:64], r2
        vst1.64 {d14}, [r0,:64], r2
        vst1.64 {d22}, [r0,:64], r2
        vst1.64 {d24}, [r0,:64], r2
        vst1.64 {d26}, [r0,:64], r2
        vst1.64 {d28}, [r0,:64], r2
        bx lr
        .endfunc

function put_h264_qpel16_v_lowpass_l2_neon
        mov r4, lr
        bl put_h264_qpel8_v_lowpass_l2_neon
        sub r1, r1, r3, lsl #2
        bl put_h264_qpel8_v_lowpass_l2_neon
        sub r0, r0, r3, lsl #4
        sub ip, ip, r2, lsl #4
        add r0, r0, #8
        add ip, ip, #8
        sub r1, r1, r3, lsl #4
        sub r1, r1, r3, lsl #2
        add r1, r1, #8
        bl put_h264_qpel8_v_lowpass_l2_neon
        sub r1, r1, r3, lsl #2
        mov lr, r4
        .endfunc

function put_h264_qpel8_v_lowpass_l2_neon
        vld1.64 {d8}, [r1], r3
        vld1.64 {d10}, [r1], r3
        vld1.64 {d12}, [r1], r3
        vld1.64 {d14}, [r1], r3
        vld1.64 {d22}, [r1], r3
        vld1.64 {d24}, [r1], r3
        vld1.64 {d26}, [r1], r3
        vld1.64 {d28}, [r1], r3
        vld1.64 {d9}, [r1], r3
        vld1.64 {d11}, [r1], r3
        vld1.64 {d13}, [r1], r3
        vld1.64 {d15}, [r1], r3
        vld1.64 {d23}, [r1]
        transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
        lowpass_8 d8, d9, d10, d11, d8, d9
        lowpass_8 d12, d13, d14, d15, d12, d13
        lowpass_8 d22, d23, d24, d25, d22, d23
        lowpass_8 d26, d27, d28, d29, d26, d27
        transpose_8x8 d8, d9, d12, d13, d22, d23, d26, d27
        vld1.64 {d0}, [ip], r2
        vld1.64 {d1}, [ip], r2
        vld1.64 {d2}, [ip], r2
        vld1.64 {d3}, [ip], r2
        vld1.64 {d4}, [ip], r2
        vrhadd.u8 q0, q0, q4
        vld1.64 {d5}, [ip], r2
        vrhadd.u8 q1, q1, q6
        vld1.64 {d10}, [ip], r2
        vrhadd.u8 q2, q2, q11
        vld1.64 {d11}, [ip], r2
        vst1.64 {d0}, [r0,:64], r3
        vst1.64 {d1}, [r0,:64], r3
        vrhadd.u8 q5, q5, q13
        vst1.64 {d2}, [r0,:64], r3
        vst1.64 {d3}, [r0,:64], r3
        vst1.64 {d4}, [r0,:64], r3
        vst1.64 {d5}, [r0,:64], r3
        vst1.64 {d10}, [r0,:64], r3
        vst1.64 {d11}, [r0,:64], r3
        bx lr
        .endfunc
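
/* 2-D (hv) filtering runs the horizontal pass first, keeping 16-bit
 * intermediates in the scratch buffer at r4; the buffer is then
 * transposed in registers (swap4/transpose16_4x4) and lowpass_8.16
 * performs the vertical pass. */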
function put_h264_qpel8_hv_lowpass_neon_top
        lowpass_const ip
        mov ip, #12
1:      vld1.64 {d0, d1}, [r1], r3
        vld1.64 {d16,d17}, [r1], r3
        subs ip, ip, #2
        lowpass_8 d0, d1, d16, d17, q11, q12, narrow=0
        vst1.64 {d22-d25}, [r4,:128]!
        bne 1b
        vld1.64 {d0, d1}, [r1]
        lowpass_8_1 d0, d1, q12, narrow=0
        mov ip, #-16
        add r4, r4, ip
        vld1.64 {d30,d31}, [r4,:128], ip
        vld1.64 {d20,d21}, [r4,:128], ip
        vld1.64 {d18,d19}, [r4,:128], ip
        vld1.64 {d16,d17}, [r4,:128], ip
        vld1.64 {d14,d15}, [r4,:128], ip
        vld1.64 {d12,d13}, [r4,:128], ip
        vld1.64 {d10,d11}, [r4,:128], ip
        vld1.64 {d8, d9}, [r4,:128], ip
        vld1.64 {d6, d7}, [r4,:128], ip
        vld1.64 {d4, d5}, [r4,:128], ip
        vld1.64 {d2, d3}, [r4,:128], ip
        vld1.64 {d0, d1}, [r4,:128]
        swap4 d1, d3, d5, d7, d8, d10, d12, d14
        transpose16_4x4 q0, q1, q2, q3, q4, q5, q6, q7
        swap4 d17, d19, d21, d31, d24, d26, d28, d22
        transpose16_4x4 q8, q9, q10, q15, q12, q13, q14, q11
        vst1.64 {d30,d31}, [r4,:128]!
        vst1.64 {d6, d7}, [r4,:128]!
        vst1.64 {d20,d21}, [r4,:128]!
        vst1.64 {d4, d5}, [r4,:128]!
        vst1.64 {d18,d19}, [r4,:128]!
        vst1.64 {d2, d3}, [r4,:128]!
        vst1.64 {d16,d17}, [r4,:128]!
        vst1.64 {d0, d1}, [r4,:128]
        lowpass_8.16 q4, q12, d8, d9, d24, d25, d8
        lowpass_8.16 q5, q13, d10, d11, d26, d27, d9
        lowpass_8.16 q6, q14, d12, d13, d28, d29, d10
        lowpass_8.16 q7, q11, d14, d15, d22, d23, d11
        vld1.64 {d16,d17}, [r4,:128], ip
        vld1.64 {d30,d31}, [r4,:128], ip
        lowpass_8.16 q8, q15, d16, d17, d30, d31, d12
        vld1.64 {d16,d17}, [r4,:128], ip
        vld1.64 {d30,d31}, [r4,:128], ip
        lowpass_8.16 q8, q15, d16, d17, d30, d31, d13
        vld1.64 {d16,d17}, [r4,:128], ip
        vld1.64 {d30,d31}, [r4,:128], ip
        lowpass_8.16 q8, q15, d16, d17, d30, d31, d14
        vld1.64 {d16,d17}, [r4,:128], ip
        vld1.64 {d30,d31}, [r4,:128]
        lowpass_8.16 q8, q15, d16, d17, d30, d31, d15
        transpose_8x8 d12, d13, d14, d15, d8, d9, d10, d11
        bx lr
        .endfunc

function put_h264_qpel8_hv_lowpass_neon
        mov r10, lr
        bl put_h264_qpel8_hv_lowpass_neon_top
        vst1.64 {d12}, [r0,:64], r2
        vst1.64 {d13}, [r0,:64], r2
        vst1.64 {d14}, [r0,:64], r2
        vst1.64 {d15}, [r0,:64], r2
        vst1.64 {d8}, [r0,:64], r2
        vst1.64 {d9}, [r0,:64], r2
        vst1.64 {d10}, [r0,:64], r2
        vst1.64 {d11}, [r0,:64], r2
        mov lr, r10
        bx lr
        .endfunc

function put_h264_qpel8_hv_lowpass_l2_neon
        mov r10, lr
        bl put_h264_qpel8_hv_lowpass_neon_top
        vld1.64 {d0, d1}, [r2,:128]!
        vld1.64 {d2, d3}, [r2,:128]!
        vrhadd.u8 q0, q0, q6
        vld1.64 {d4, d5}, [r2,:128]!
        vrhadd.u8 q1, q1, q7
        vld1.64 {d6, d7}, [r2,:128]!
        vrhadd.u8 q2, q2, q4
        vst1.64 {d0}, [r0,:64], r3
        vrhadd.u8 q3, q3, q5
        vst1.64 {d1}, [r0,:64], r3
        vst1.64 {d2}, [r0,:64], r3
        vst1.64 {d3}, [r0,:64], r3
        vst1.64 {d4}, [r0,:64], r3
        vst1.64 {d5}, [r0,:64], r3
        vst1.64 {d6}, [r0,:64], r3
        vst1.64 {d7}, [r0,:64], r3
        mov lr, r10
        bx lr
        .endfunc

function put_h264_qpel16_hv_lowpass_neon
        mov r9, lr
        bl put_h264_qpel8_hv_lowpass_neon
        sub r1, r1, r3, lsl #2
        bl put_h264_qpel8_hv_lowpass_neon
        sub r1, r1, r3, lsl #4
        sub r1, r1, r3, lsl #2
        add r1, r1, #8
        sub r0, r0, r2, lsl #4
        add r0, r0, #8
        bl put_h264_qpel8_hv_lowpass_neon
        sub r1, r1, r3, lsl #2
        mov lr, r9
        b put_h264_qpel8_hv_lowpass_neon
        .endfunc

function put_h264_qpel16_hv_lowpass_l2_neon
        mov r9, lr
        sub r2, r4, #256
        bl put_h264_qpel8_hv_lowpass_l2_neon
        sub r1, r1, r3, lsl #2
        bl put_h264_qpel8_hv_lowpass_l2_neon
        sub r1, r1, r3, lsl #4
        sub r1, r1, r3, lsl #2
        add r1, r1, #8
        sub r0, r0, r3, lsl #4
        add r0, r0, #8
        bl put_h264_qpel8_hv_lowpass_l2_neon
        sub r1, r1, r3, lsl #2
        mov lr, r9
        b put_h264_qpel8_hv_lowpass_l2_neon
        .endfunc
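
/* Quarter-sample MC entry points.  In mcXY, X is the horizontal and Y
 * the vertical quarter-sample offset: mc20 is the pure horizontal
 * half-sample case, mc10/mc30 average it with the nearest integer
 * column, mc22 is the 2-D half-sample case, and so on. */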
function ff_put_h264_qpel8_mc10_neon, export=1
        lowpass_const r3
        mov r3, r1
        sub r1, r1, #2
        mov ip, #8
        b put_h264_qpel8_h_lowpass_l2_neon
        .endfunc

function ff_put_h264_qpel8_mc20_neon, export=1
        lowpass_const r3
        sub r1, r1, #2
        mov r3, r2
        mov ip, #8
        b put_h264_qpel8_h_lowpass_neon
        .endfunc

function ff_put_h264_qpel8_mc30_neon, export=1
        lowpass_const r3
        add r3, r1, #1
        sub r1, r1, #2
        mov ip, #8
        b put_h264_qpel8_h_lowpass_l2_neon
        .endfunc

function ff_put_h264_qpel8_mc01_neon, export=1
        push {lr}
        mov ip, r1
put_h264_qpel8_mc01:
        lowpass_const r3
        mov r3, r2
        sub r1, r1, r2, lsl #1
        vpush {d8-d15}
        bl put_h264_qpel8_v_lowpass_l2_neon
        vpop {d8-d15}
        pop {pc}
        .endfunc

function ff_put_h264_qpel8_mc11_neon, export=1
        push {r0, r1, r2, lr}
put_h264_qpel8_mc11:
        lowpass_const r3
        sub sp, sp, #64
        mov r0, sp
        sub r1, r1, #2
        mov r3, #8
        mov ip, #8
        vpush {d8-d15}
        bl put_h264_qpel8_h_lowpass_neon
        ldrd r0, [sp, #128]
        mov r3, r2
        add ip, sp, #64
        sub r1, r1, r2, lsl #1
        mov r2, #8
        bl put_h264_qpel8_v_lowpass_l2_neon
        vpop {d8-d15}
        add sp, sp, #76
        pop {pc}
        .endfunc

function ff_put_h264_qpel8_mc21_neon, export=1
        push {r0, r1, r4, r10, r11, lr}
put_h264_qpel8_mc21:
        lowpass_const r3
        mov r11, sp
        bic sp, sp, #15
        sub sp, sp, #(8*8+16*12)
        sub r1, r1, #2
        mov r3, #8
        mov r0, sp
        mov ip, #8
        vpush {d8-d15}
        bl put_h264_qpel8_h_lowpass_neon
        mov r4, r0
        ldrd r0, [r11]
        sub r1, r1, r2, lsl #1
        sub r1, r1, #2
        mov r3, r2
        sub r2, r4, #64
        bl put_h264_qpel8_hv_lowpass_l2_neon
        vpop {d8-d15}
        add sp, r11, #8
        pop {r4, r10, r11, pc}
        .endfunc

function ff_put_h264_qpel8_mc31_neon, export=1
        add r1, r1, #1
        push {r0, r1, r2, lr}
        sub r1, r1, #1
        b put_h264_qpel8_mc11
        .endfunc

function ff_put_h264_qpel8_mc02_neon, export=1
        push {lr}
        lowpass_const r3
        sub r1, r1, r2, lsl #1
        mov r3, r2
        vpush {d8-d15}
        bl put_h264_qpel8_v_lowpass_neon
        vpop {d8-d15}
        pop {pc}
        .endfunc

function ff_put_h264_qpel8_mc12_neon, export=1
        push {r0, r1, r4, r10, r11, lr}
put_h264_qpel8_mc12:
        lowpass_const r3
        mov r11, sp
        bic sp, sp, #15
        sub sp, sp, #(8*8+16*12)
        sub r1, r1, r2, lsl #1
        mov r3, r2
        mov r2, #8
        mov r0, sp
        vpush {d8-d15}
        bl put_h264_qpel8_v_lowpass_neon
        mov r4, r0
        ldrd r0, [r11]
        sub r1, r1, r3, lsl #1
        sub r1, r1, #2
        sub r2, r4, #64
        bl put_h264_qpel8_hv_lowpass_l2_neon
        vpop {d8-d15}
        add sp, r11, #8
        pop {r4, r10, r11, pc}
        .endfunc

function ff_put_h264_qpel8_mc22_neon, export=1
        push {r4, r10, r11, lr}
        mov r11, sp
        bic sp, sp, #15
        sub r1, r1, r2, lsl #1
        sub r1, r1, #2
        mov r3, r2
        sub sp, sp, #(16*12)
        mov r4, sp
        vpush {d8-d15}
        bl put_h264_qpel8_hv_lowpass_neon
        vpop {d8-d15}
        mov sp, r11
        pop {r4, r10, r11, pc}
        .endfunc

function ff_put_h264_qpel8_mc32_neon, export=1
        push {r0, r1, r4, r10, r11, lr}
        add r1, r1, #1
        b put_h264_qpel8_mc12
        .endfunc

function ff_put_h264_qpel8_mc03_neon, export=1
        push {lr}
        add ip, r1, r2
        b put_h264_qpel8_mc01
        .endfunc

function ff_put_h264_qpel8_mc13_neon, export=1
        push {r0, r1, r2, lr}
        add r1, r1, r2
        b put_h264_qpel8_mc11
        .endfunc

function ff_put_h264_qpel8_mc23_neon, export=1
        push {r0, r1, r4, r10, r11, lr}
        add r1, r1, r2
        b put_h264_qpel8_mc21
        .endfunc

function ff_put_h264_qpel8_mc33_neon, export=1
        add r1, r1, #1
        push {r0, r1, r2, lr}
        add r1, r1, r2
        sub r1, r1, #1
        b put_h264_qpel8_mc11
        .endfunc

function ff_put_h264_qpel16_mc10_neon, export=1
        lowpass_const r3
        mov r3, r1
        sub r1, r1, #2
        b put_h264_qpel16_h_lowpass_l2_neon
        .endfunc

function ff_put_h264_qpel16_mc20_neon, export=1
        lowpass_const r3
        sub r1, r1, #2
        mov r3, r2
        b put_h264_qpel16_h_lowpass_neon
        .endfunc

function ff_put_h264_qpel16_mc30_neon, export=1
        lowpass_const r3
        add r3, r1, #1
        sub r1, r1, #2
        b put_h264_qpel16_h_lowpass_l2_neon
        .endfunc

function ff_put_h264_qpel16_mc01_neon, export=1
        push {r4, lr}
        mov ip, r1
put_h264_qpel16_mc01:
        lowpass_const r3
        mov r3, r2
        sub r1, r1, r2, lsl #1
        vpush {d8-d15}
        bl put_h264_qpel16_v_lowpass_l2_neon
        vpop {d8-d15}
        pop {r4, pc}
        .endfunc

function ff_put_h264_qpel16_mc11_neon, export=1
        push {r0, r1, r4, lr}
put_h264_qpel16_mc11:
        lowpass_const r3
        sub sp, sp, #256
        mov r0, sp
        sub r1, r1, #2
        mov r3, #16
        vpush {d8-d15}
        bl put_h264_qpel16_h_lowpass_neon
        add r0, sp, #256
        ldrd r0, [r0, #64]
        mov r3, r2
        add ip, sp, #64
        sub r1, r1, r2, lsl #1
        mov r2, #16
        bl put_h264_qpel16_v_lowpass_l2_neon
        vpop {d8-d15}
        add sp, sp, #(256+8)
        pop {r4, pc}
        .endfunc

function ff_put_h264_qpel16_mc21_neon, export=1
        push {r0, r1, r4-r5, r9-r11, lr}
put_h264_qpel16_mc21:
        lowpass_const r3
        mov r11, sp
        bic sp, sp, #15
        sub sp, sp, #(16*16+16*12)
        sub r1, r1, #2
        mov r0, sp
        vpush {d8-d15}
        bl put_h264_qpel16_h_lowpass_neon_packed
        mov r4, r0
        ldrd r0, [r11]
        sub r1, r1, r2, lsl #1
        sub r1, r1, #2
        mov r3, r2
        bl put_h264_qpel16_hv_lowpass_l2_neon
        vpop {d8-d15}
        add sp, r11, #8
        pop {r4-r5, r9-r11, pc}
        .endfunc

function ff_put_h264_qpel16_mc31_neon, export=1
        add r1, r1, #1
        push {r0, r1, r4, lr}
        sub r1, r1, #1
        b put_h264_qpel16_mc11
        .endfunc

function ff_put_h264_qpel16_mc02_neon, export=1
        push {r4, lr}
        lowpass_const r3
        sub r1, r1, r2, lsl #1
        mov r3, r2
        vpush {d8-d15}
        bl put_h264_qpel16_v_lowpass_neon
        vpop {d8-d15}
        pop {r4, pc}
        .endfunc

function ff_put_h264_qpel16_mc12_neon, export=1
        push {r0, r1, r4-r5, r9-r11, lr}
put_h264_qpel16_mc12:
        lowpass_const r3
        mov r11, sp
        bic sp, sp, #15
        sub sp, sp, #(16*16+16*12)
        sub r1, r1, r2, lsl #1
        mov r0, sp
        mov r3, r2
        vpush {d8-d15}
        bl put_h264_qpel16_v_lowpass_neon_packed
        mov r4, r0
        ldrd r0, [r11]
        sub r1, r1, r3, lsl #1
        sub r1, r1, #2
        mov r2, r3
        bl put_h264_qpel16_hv_lowpass_l2_neon
        vpop {d8-d15}
        add sp, r11, #8
        pop {r4-r5, r9-r11, pc}
        .endfunc

function ff_put_h264_qpel16_mc22_neon, export=1
        push {r4, r9-r11, lr}
        lowpass_const r3
        mov r11, sp
        bic sp, sp, #15
        sub r1, r1, r2, lsl #1
        sub r1, r1, #2
        mov r3, r2
        sub sp, sp, #(16*12)
        mov r4, sp
        vpush {d8-d15}
        bl put_h264_qpel16_hv_lowpass_neon
        vpop {d8-d15}
        mov sp, r11
        pop {r4, r9-r11, pc}
        .endfunc

function ff_put_h264_qpel16_mc32_neon, export=1
        push {r0, r1, r4-r5, r9-r11, lr}
        add r1, r1, #1
        b put_h264_qpel16_mc12
        .endfunc

function ff_put_h264_qpel16_mc03_neon, export=1
        push {r4, lr}
        add ip, r1, r2
        b put_h264_qpel16_mc01
        .endfunc

function ff_put_h264_qpel16_mc13_neon, export=1
        push {r0, r1, r4, lr}
        add r1, r1, r2
        b put_h264_qpel16_mc11
        .endfunc

function ff_put_h264_qpel16_mc23_neon, export=1
        push {r0, r1, r4-r5, r9-r11, lr}
        add r1, r1, r2
        b put_h264_qpel16_mc21
        .endfunc

function ff_put_h264_qpel16_mc33_neon, export=1
        add r1, r1, #1
        push {r0, r1, r4, lr}
        add r1, r1, r2
        sub r1, r1, #1
        b put_h264_qpel16_mc11
        .endfunc