/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

        .fpu neon
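
@ transpose_8x8: transpose an 8x8 matrix of bytes held in eight d
@ registers, using three rounds of vtrn at 32-, 16- and 8-bit width.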
        .macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7
        vtrn.32         \r0, \r4
        vtrn.32         \r1, \r5
        vtrn.32         \r2, \r6
        vtrn.32         \r3, \r7
        vtrn.16         \r0, \r2
        vtrn.16         \r1, \r3
        vtrn.16         \r4, \r6
        vtrn.16         \r5, \r7
        vtrn.8          \r0, \r1
        vtrn.8          \r2, \r3
        vtrn.8          \r4, \r5
        vtrn.8          \r6, \r7
        .endm
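
@ swap4 + transpose16_4x4: used together to transpose a block of 16-bit
@ elements spread across eight q registers (vswp exchanges d-register
@ halves, vtrn finishes the 4x4 sub-blocks).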
        .macro swap4 r0 r1 r2 r3 r4 r5 r6 r7
        vswp            \r0, \r4
        vswp            \r1, \r5
        vswp            \r2, \r6
        vswp            \r3, \r7
        .endm

        .macro transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7
        vtrn.32         \r0, \r2
        vtrn.32         \r1, \r3
        vtrn.32         \r4, \r6
        vtrn.32         \r5, \r7
        vtrn.16         \r0, \r1
        vtrn.16         \r2, \r3
        vtrn.16         \r4, \r5
        vtrn.16         \r6, \r7
        .endm

/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
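@ Bilinear chroma interpolation with the standard H.264 weights
@   A = (8-x)(8-y), B = x(8-y), C = (8-x)y, D = xy,
@ rounded as (A*a + B*b + C*c + D*d + 32) >> 6.  Below, r4 = A, ip = B,
@ r6 = C, r7 = D; if x or y is zero (D == 0) the code branches to the
@ cheaper one-dimensional loops via label 2:.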
        .macro h264_chroma_mc8 type
function ff_\type\()_h264_chroma_mc8_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4, [sp, #20]
        .ifc \type,avg
        mov             lr, r0
        .endif
        pld             [r1]
        pld             [r1, r2]
        muls            r7, r4, r5
        rsb             r6, r7, r5, lsl #3
        rsb             ip, r7, r4, lsl #3
        sub             r4, r7, r4, lsl #3
        sub             r4, r4, r5, lsl #3
        add             r4, r4, #64
        beq             2f
        add             r5, r1, r2
        vdup.8          d0, r4
        lsl             r4, r2, #1
        vdup.8          d1, ip
        vld1.64         {d4, d5}, [r1], r4
        vdup.8          d2, r6
        vld1.64         {d6, d7}, [r5], r4
        vdup.8          d3, r7
        vext.8          d5, d4, d5, #1
        vext.8          d7, d6, d7, #1
1:      pld             [r5]
        vmull.u8        q8, d4, d0
        vmlal.u8        q8, d5, d1
        vld1.64         {d4, d5}, [r1], r4
        vmlal.u8        q8, d6, d2
        vext.8          d5, d4, d5, #1
        vmlal.u8        q8, d7, d3
        vmull.u8        q9, d6, d0
        subs            r3, r3, #2
        vmlal.u8        q9, d7, d1
        vmlal.u8        q9, d4, d2
        vmlal.u8        q9, d5, d3
        vrshrn.u16      d16, q8, #6
        vld1.64         {d6, d7}, [r5], r4
        pld             [r1]
        vrshrn.u16      d17, q9, #6
        .ifc \type,avg
        vld1.64         {d20}, [lr,:64], r2
        vld1.64         {d21}, [lr,:64], r2
        vrhadd.u8       q8, q8, q10
        .endif
        vext.8          d7, d6, d7, #1
        vst1.64         {d16}, [r0,:64], r2
        vst1.64         {d17}, [r0,:64], r2
        bgt             1b
        pop             {r4-r7, pc}
2:      tst             r6, r6
        add             ip, ip, r6
        vdup.8          d0, r4
        vdup.8          d1, ip
        beq             4f
        add             r5, r1, r2
        lsl             r4, r2, #1
        vld1.64         {d4}, [r1], r4
        vld1.64         {d6}, [r5], r4
3:      pld             [r5]
        vmull.u8        q8, d4, d0
        vmlal.u8        q8, d6, d1
        vld1.64         {d4}, [r1], r4
        vmull.u8        q9, d6, d0
        vmlal.u8        q9, d4, d1
        vld1.64         {d6}, [r5], r4
        vrshrn.u16      d16, q8, #6
        vrshrn.u16      d17, q9, #6
        .ifc \type,avg
        vld1.64         {d20}, [lr,:64], r2
        vld1.64         {d21}, [lr,:64], r2
        vrhadd.u8       q8, q8, q10
        .endif
        subs            r3, r3, #2
        pld             [r1]
        vst1.64         {d16}, [r0,:64], r2
        vst1.64         {d17}, [r0,:64], r2
        bgt             3b
        pop             {r4-r7, pc}
4:      vld1.64         {d4, d5}, [r1], r2
        vld1.64         {d6, d7}, [r1], r2
        vext.8          d5, d4, d5, #1
        vext.8          d7, d6, d7, #1
5:      pld             [r1]
        subs            r3, r3, #2
        vmull.u8        q8, d4, d0
        vmlal.u8        q8, d5, d1
        vld1.64         {d4, d5}, [r1], r2
        vmull.u8        q9, d6, d0
        vmlal.u8        q9, d7, d1
        pld             [r1]
        vext.8          d5, d4, d5, #1
        vrshrn.u16      d16, q8, #6
        vrshrn.u16      d17, q9, #6
        .ifc \type,avg
        vld1.64         {d20}, [lr,:64], r2
        vld1.64         {d21}, [lr,:64], r2
        vrhadd.u8       q8, q8, q10
        .endif
        vld1.64         {d6, d7}, [r1], r2
        vext.8          d7, d6, d7, #1
        vst1.64         {d16}, [r0,:64], r2
        vst1.64         {d17}, [r0,:64], r2
        bgt             5b
        pop             {r4-r7, pc}
.endfunc
        .endm

/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
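@ Same bilinear filter for 4-pixel-wide blocks: vtrn.32 packs each row
@ next to its one-pixel-shifted copy, and the weight pairs into d0-d3,
@ so one vmull.u8 evaluates two taps; vadd.i16 then folds the halves.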
        .macro h264_chroma_mc4 type
function ff_\type\()_h264_chroma_mc4_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4, [sp, #20]
        .ifc \type,avg
        mov             lr, r0
        .endif
        pld             [r1]
        pld             [r1, r2]
        muls            r7, r4, r5
        rsb             r6, r7, r5, lsl #3
        rsb             ip, r7, r4, lsl #3
        sub             r4, r7, r4, lsl #3
        sub             r4, r4, r5, lsl #3
        add             r4, r4, #64
        beq             2f
        add             r5, r1, r2
        vdup.8          d0, r4
        lsl             r4, r2, #1
        vdup.8          d1, ip
        vld1.64         {d4}, [r1], r4
        vdup.8          d2, r6
        vld1.64         {d6}, [r5], r4
        vdup.8          d3, r7
        vext.8          d5, d4, d5, #1
        vext.8          d7, d6, d7, #1
        vtrn.32         d4, d5
        vtrn.32         d6, d7
        vtrn.32         d0, d1
        vtrn.32         d2, d3
1:      pld             [r5]
        vmull.u8        q8, d4, d0
        vmlal.u8        q8, d6, d2
        vld1.64         {d4}, [r1], r4
        vext.8          d5, d4, d5, #1
        vtrn.32         d4, d5
        vmull.u8        q9, d6, d0
        vmlal.u8        q9, d4, d2
        vld1.64         {d6}, [r5], r4
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        vrshrn.u16      d16, q8, #6
        subs            r3, r3, #2
        pld             [r1]
        .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
        .endif
        vext.8          d7, d6, d7, #1
        vtrn.32         d6, d7
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             1b
        pop             {r4-r7, pc}
2:      tst             r6, r6
        add             ip, ip, r6
        vdup.8          d0, r4
        vdup.8          d1, ip
        vtrn.32         d0, d1
        beq             4f
        vext.32         d1, d0, d1, #1
        add             r5, r1, r2
        lsl             r4, r2, #1
        vld1.32         {d4[0]}, [r1], r4
        vld1.32         {d4[1]}, [r5], r4
3:      pld             [r5]
        vmull.u8        q8, d4, d0
        vld1.32         {d4[0]}, [r1], r4
        vmull.u8        q9, d4, d1
        vld1.32         {d4[1]}, [r5], r4
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        vrshrn.u16      d16, q8, #6
        .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
        .endif
        subs            r3, r3, #2
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             3b
        pop             {r4-r7, pc}
4:      vld1.64         {d4}, [r1], r2
        vld1.64         {d6}, [r1], r2
        vext.8          d5, d4, d5, #1
        vext.8          d7, d6, d7, #1
        vtrn.32         d4, d5
        vtrn.32         d6, d7
5:      vmull.u8        q8, d4, d0
        vmull.u8        q9, d6, d0
        subs            r3, r3, #2
        vld1.64         {d4}, [r1], r2
        vext.8          d5, d4, d5, #1
        vtrn.32         d4, d5
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        pld             [r1]
        vrshrn.u16      d16, q8, #6
        .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
        .endif
        vld1.64         {d6}, [r1], r2
        vext.8          d7, d6, d7, #1
        vtrn.32         d6, d7
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             5b
        pop             {r4-r7, pc}
.endfunc
        .endm

        .text
        .align

        h264_chroma_mc8 put
        h264_chroma_mc8 avg
        h264_chroma_mc4 put
        h264_chroma_mc4 avg

/* H.264 loop filter */
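
@ h264_loop_filter_start: common entry code.  r2 = alpha, r3 = beta,
@ [sp] = pointer to the four tc0 values.  Returns to the caller early
@ if alpha or beta is zero, or if all tc0 entries are negative;
@ otherwise the packed tc0 bytes are left in d24[0].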
        .macro h264_loop_filter_start
        ldr             ip, [sp]
        tst             r2, r2
        ldr             ip, [ip]
        tstne           r3, r3
        vmov.32         d24[0], ip
        and             ip, ip, ip, lsl #16
        bxeq            lr
        ands            ip, ip, ip, lsl #8
        bxlt            lr
        .endm
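
@ align_push_regs / align_pop_regs: spill the callee-saved NEON registers
@ d8-d15 (per the AAPCS) to a 16-byte-aligned area below sp so that the
@ aligned vst1/vld1 forms can be used; ip carries the realignment offset
@ between the push and the pop.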
        .macro align_push_regs
        and             ip, sp, #15
        add             ip, ip, #32
        sub             sp, sp, ip
        vst1.64         {d12-d15}, [sp,:128]
        sub             sp, sp, #32
        vst1.64         {d8-d11}, [sp,:128]
        .endm

        .macro align_pop_regs
        vld1.64         {d8-d11}, [sp,:128]!
        vld1.64         {d12-d15}, [sp,:128], ip
        .endm
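
@ h264_loop_filter_luma: normal-strength filter for one luma edge of 16
@ pixels.  On entry q8 = p0, q9 = p1, q10 = p2, q0 = q0, q1 = q1,
@ q2 = q2, r2 = alpha, r3 = beta, d24 = tc0.  Computes
@   delta = clip((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -tc, tc)
@ and leaves the filtered p1/p0/q0/q1 in q4/q8/q0/q5.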
        .macro h264_loop_filter_luma
        vdup.8          q11, r2         @ alpha
        vmovl.u8        q12, d24
        vabd.u8         q6, q8, q0      @ abs(p0 - q0)
        vmovl.u16       q12, d24
        vabd.u8         q14, q9, q8     @ abs(p1 - p0)
        vsli.16         q12, q12, #8
        vabd.u8         q15, q1, q0     @ abs(q1 - q0)
        vsli.32         q12, q12, #16
        vclt.u8         q6, q6, q11     @ < alpha
        vdup.8          q11, r3         @ beta
        vclt.s8         q7, q12, #0
        vclt.u8         q14, q14, q11   @ < beta
        vclt.u8         q15, q15, q11   @ < beta
        vbic            q6, q6, q7
        vabd.u8         q4, q10, q8     @ abs(p2 - p0)
        vand            q6, q6, q14
        vabd.u8         q5, q2, q0      @ abs(q2 - q0)
        vclt.u8         q4, q4, q11     @ < beta
        vand            q6, q6, q15
        vclt.u8         q5, q5, q11     @ < beta
        vand            q4, q4, q6
        vand            q5, q5, q6
        vand            q12, q12, q6
        vrhadd.u8       q14, q8, q0
        vsub.i8         q6, q12, q4
        vqadd.u8        q7, q9, q12
        vhadd.u8        q10, q10, q14
        vsub.i8         q6, q6, q5
        vhadd.u8        q14, q2, q14
        vmin.u8         q7, q7, q10
        vqsub.u8        q11, q9, q12
        vqadd.u8        q2, q1, q12
        vmax.u8         q7, q7, q11
        vqsub.u8        q11, q1, q12
        vmin.u8         q14, q2, q14
        vmovl.u8        q2, d0
        vmax.u8         q14, q14, q11
        vmovl.u8        q10, d1
        vsubw.u8        q2, q2, d16
        vsubw.u8        q10, q10, d17
        vshl.i16        q2, q2, #2
        vshl.i16        q10, q10, #2
        vaddw.u8        q2, q2, d18
        vaddw.u8        q10, q10, d19
        vsubw.u8        q2, q2, d2
        vsubw.u8        q10, q10, d3
        vrshrn.i16      d4, q2, #3
        vrshrn.i16      d5, q10, #3
        vbsl            q4, q7, q9
        vbsl            q5, q14, q1
        vneg.s8         q7, q6
        vmovl.u8        q14, d16
        vmin.s8         q2, q2, q6
        vmovl.u8        q6, d17
        vmax.s8         q2, q2, q7
        vmovl.u8        q11, d0
        vmovl.u8        q12, d1
        vaddw.s8        q14, q14, d4
        vaddw.s8        q6, q6, d5
        vsubw.s8        q11, q11, d4
        vsubw.s8        q12, q12, d5
        vqmovun.s16     d16, q14
        vqmovun.s16     d17, q6
        vqmovun.s16     d0, q11
        vqmovun.s16     d1, q12
        .endm

function ff_h264_v_loop_filter_luma_neon, export=1
        h264_loop_filter_start
        vld1.64         {d0, d1}, [r0,:128], r1
        vld1.64         {d2, d3}, [r0,:128], r1
        vld1.64         {d4, d5}, [r0,:128], r1
        sub             r0, r0, r1, lsl #2
        sub             r0, r0, r1, lsl #1
        vld1.64         {d20,d21}, [r0,:128], r1
        vld1.64         {d18,d19}, [r0,:128], r1
        vld1.64         {d16,d17}, [r0,:128], r1
        align_push_regs
        h264_loop_filter_luma
        sub             r0, r0, r1, lsl #1
        vst1.64         {d8, d9}, [r0,:128], r1
        vst1.64         {d16,d17}, [r0,:128], r1
        vst1.64         {d0, d1}, [r0,:128], r1
        vst1.64         {d10,d11}, [r0,:128]
        align_pop_regs
        bx              lr
.endfunc

function ff_h264_h_loop_filter_luma_neon, export=1
        h264_loop_filter_start
        sub             r0, r0, #4
        vld1.64         {d6}, [r0], r1
        vld1.64         {d20}, [r0], r1
        vld1.64         {d18}, [r0], r1
        vld1.64         {d16}, [r0], r1
        vld1.64         {d0}, [r0], r1
        vld1.64         {d2}, [r0], r1
        vld1.64         {d4}, [r0], r1
        vld1.64         {d26}, [r0], r1
        vld1.64         {d7}, [r0], r1
        vld1.64         {d21}, [r0], r1
        vld1.64         {d19}, [r0], r1
        vld1.64         {d17}, [r0], r1
        vld1.64         {d1}, [r0], r1
        vld1.64         {d3}, [r0], r1
        vld1.64         {d5}, [r0], r1
        vld1.64         {d27}, [r0], r1
        transpose_8x8   q3, q10, q9, q8, q0, q1, q2, q13
        align_push_regs
        sub             sp, sp, #16
        vst1.64         {d4, d5}, [sp,:128]
        sub             sp, sp, #16
        vst1.64         {d20,d21}, [sp,:128]
        h264_loop_filter_luma
        vld1.64         {d20,d21}, [sp,:128]!
        vld1.64         {d4, d5}, [sp,:128]!
        transpose_8x8   q3, q10, q4, q8, q0, q5, q2, q13
        sub             r0, r0, r1, lsl #4
        vst1.64         {d6}, [r0], r1
        vst1.64         {d20}, [r0], r1
        vst1.64         {d8}, [r0], r1
        vst1.64         {d16}, [r0], r1
        vst1.64         {d0}, [r0], r1
        vst1.64         {d10}, [r0], r1
        vst1.64         {d4}, [r0], r1
        vst1.64         {d26}, [r0], r1
        vst1.64         {d7}, [r0], r1
        vst1.64         {d21}, [r0], r1
        vst1.64         {d9}, [r0], r1
        vst1.64         {d17}, [r0], r1
        vst1.64         {d1}, [r0], r1
        vst1.64         {d11}, [r0], r1
        vst1.64         {d5}, [r0], r1
        vst1.64         {d27}, [r0], r1
        align_pop_regs
        bx              lr
.endfunc
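
@ h264_loop_filter_chroma: chroma edge filter on 8 pixels.  On entry
@ d18 = p1, d16 = p0, d0 = q0, d2 = q1, r2 = alpha, r3 = beta,
@ d24 = tc0.  The same delta as for luma is computed, clamped to the
@ tc range and applied to p0/q0 where the alpha/beta tests pass.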
        .macro h264_loop_filter_chroma
        vdup.8          d22, r2         @ alpha
        vmovl.u8        q12, d24
        vabd.u8         d26, d16, d0    @ abs(p0 - q0)
        vmovl.u8        q2, d0
        vabd.u8         d28, d18, d16   @ abs(p1 - p0)
        vsubw.u8        q2, q2, d16
        vsli.16         d24, d24, #8
        vshl.i16        q2, q2, #2
        vabd.u8         d30, d2, d0     @ abs(q1 - q0)
        vaddw.u8        q2, q2, d18
        vclt.u8         d26, d26, d22   @ < alpha
        vsubw.u8        q2, q2, d2
        vdup.8          d22, r3         @ beta
        vclt.s8         d25, d24, #0
        vrshrn.i16      d4, q2, #3
        vclt.u8         d28, d28, d22   @ < beta
        vbic            d26, d26, d25
        vclt.u8         d30, d30, d22   @ < beta
        vand            d26, d26, d28
        vneg.s8         d25, d24
        vand            d26, d26, d30
        vmin.s8         d4, d4, d24
        vmovl.u8        q14, d16
        vand            d4, d4, d26
        vmax.s8         d4, d4, d25
        vmovl.u8        q11, d0
        vaddw.s8        q14, q14, d4
        vsubw.s8        q11, q11, d4
        vqmovun.s16     d16, q14
        vqmovun.s16     d0, q11
        .endm

function ff_h264_v_loop_filter_chroma_neon, export=1
        h264_loop_filter_start
        sub             r0, r0, r1, lsl #1
        vld1.64         {d18}, [r0,:64], r1
        vld1.64         {d16}, [r0,:64], r1
        vld1.64         {d0}, [r0,:64], r1
        vld1.64         {d2}, [r0,:64]
        h264_loop_filter_chroma
        sub             r0, r0, r1, lsl #1
        vst1.64         {d16}, [r0,:64], r1
        vst1.64         {d0}, [r0,:64], r1
        bx              lr
.endfunc

function ff_h264_h_loop_filter_chroma_neon, export=1
        h264_loop_filter_start
        sub             r0, r0, #2
        vld1.32         {d18[0]}, [r0], r1
        vld1.32         {d16[0]}, [r0], r1
        vld1.32         {d0[0]}, [r0], r1
        vld1.32         {d2[0]}, [r0], r1
        vld1.32         {d18[1]}, [r0], r1
        vld1.32         {d16[1]}, [r0], r1
        vld1.32         {d0[1]}, [r0], r1
        vld1.32         {d2[1]}, [r0], r1
        vtrn.16         d18, d0
        vtrn.16         d16, d2
        vtrn.8          d18, d16
        vtrn.8          d0, d2
        h264_loop_filter_chroma
        vtrn.16         d18, d0
        vtrn.16         d16, d2
        vtrn.8          d18, d16
        vtrn.8          d0, d2
        sub             r0, r0, r1, lsl #3
        vst1.32         {d18[0]}, [r0], r1
        vst1.32         {d16[0]}, [r0], r1
        vst1.32         {d0[0]}, [r0], r1
        vst1.32         {d2[0]}, [r0], r1
        vst1.32         {d18[1]}, [r0], r1
        vst1.32         {d16[1]}, [r0], r1
        vst1.32         {d0[1]}, [r0], r1
        vst1.32         {d2[1]}, [r0], r1
        bx              lr
.endfunc

/* H.264 qpel MC */
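
@ All qpel functions are built on the H.264 6-tap half-pel filter
@ (1, -5, 20, 20, -5, 1).  lowpass_const packs the constants into
@ d6 (d6[0] = 5, d6[1] = 20 as 16-bit lanes) for the vmla/vmls
@ by-scalar forms used below.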
        .macro lowpass_const r
        movw            \r, #5
        movt            \r, #20
        vmov.32         d6[0], \r
        .endm

        .macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
        .if \narrow
t0      .req  q0
t1      .req  q8
        .else
t0      .req  \d0
t1      .req  \d1
        .endif
        vext.8          d2, \r0, \r1, #2
        vext.8          d3, \r0, \r1, #3
        vaddl.u8        q1, d2, d3
        vext.8          d4, \r0, \r1, #1
        vext.8          d5, \r0, \r1, #4
        vaddl.u8        q2, d4, d5
        vext.8          d30, \r0, \r1, #5
        vaddl.u8        t0, \r0, d30
        vext.8          d18, \r2, \r3, #2
        vmla.i16        t0, q1, d6[1]
        vext.8          d19, \r2, \r3, #3
        vaddl.u8        q9, d18, d19
        vext.8          d20, \r2, \r3, #1
        vmls.i16        t0, q2, d6[0]
        vext.8          d21, \r2, \r3, #4
        vaddl.u8        q10, d20, d21
        vext.8          d31, \r2, \r3, #5
        vaddl.u8        t1, \r2, d31
        vmla.i16        t1, q9, d6[1]
        vmls.i16        t1, q10, d6[0]
        .if \narrow
        vqrshrun.s16    \d0, t0, #5
        vqrshrun.s16    \d1, t1, #5
        .endif
        .unreq t0
        .unreq t1
        .endm

        .macro lowpass_8_1 r0, r1, d0, narrow=1
        .if \narrow
t0      .req  q0
        .else
t0      .req  \d0
        .endif
        vext.8          d2, \r0, \r1, #2
        vext.8          d3, \r0, \r1, #3
        vaddl.u8        q1, d2, d3
        vext.8          d4, \r0, \r1, #1
        vext.8          d5, \r0, \r1, #4
        vaddl.u8        q2, d4, d5
        vext.8          d30, \r0, \r1, #5
        vaddl.u8        t0, \r0, d30
        vmla.i16        t0, q1, d6[1]
        vmls.i16        t0, q2, d6[0]
        .if \narrow
        vqrshrun.s16    \d0, t0, #5
        .endif
        .unreq t0
        .endm
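
@ lowpass_8.16: the same 6-tap filter applied to 16-bit intermediates
@ (the second pass of the 2-D 'hv' case).  Widens to 32 bits, forms
@ 20*x as (x << 4) + (x << 2) and 5*x as (x << 2) + x, and rounds with
@ vrshrn #10 before the final saturating narrow to bytes.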
        .macro lowpass_8.16 r0, r1, l0, h0, l1, h1, d
        vext.16         q1, \r0, \r1, #2
        vext.16         q0, \r0, \r1, #3
        vaddl.s16       q9, d2, d0
        vext.16         q2, \r0, \r1, #1
        vaddl.s16       q1, d3, d1
        vext.16         q3, \r0, \r1, #4
        vaddl.s16       q10, d4, d6
        vext.16         \r1, \r0, \r1, #5
        vaddl.s16       q2, d5, d7
        vaddl.s16       q0, \h0, \h1
        vaddl.s16       q8, \l0, \l1
        vshl.i32        q3, q9, #4
        vshl.i32        q9, q9, #2
        vshl.i32        q15, q10, #2
        vadd.i32        q9, q9, q3
        vadd.i32        q10, q10, q15
        vshl.i32        q3, q1, #4
        vshl.i32        q1, q1, #2
        vshl.i32        q15, q2, #2
        vadd.i32        q1, q1, q3
        vadd.i32        q2, q2, q15
        vadd.i32        q9, q9, q8
        vsub.i32        q9, q9, q10
        vadd.i32        q1, q1, q0
        vsub.i32        q1, q1, q2
        vrshrn.s32      d18, q9, #10
        vrshrn.s32      d19, q1, #10
        vqmovun.s16     \d, q9
        .endm

function put_h264_qpel16_h_lowpass_neon_packed
        mov             r4, lr
        mov             ip, #16
        mov             r3, #8
        bl              put_h264_qpel8_h_lowpass_neon
        sub             r1, r1, r2, lsl #4
        add             r1, r1, #8
        mov             ip, #16
        mov             lr, r4
        b               put_h264_qpel8_h_lowpass_neon
.endfunc

function put_h264_qpel16_h_lowpass_neon
        push            {lr}
        mov             ip, #16
        bl              put_h264_qpel8_h_lowpass_neon
        sub             r0, r0, r3, lsl #4
        sub             r1, r1, r2, lsl #4
        add             r0, r0, #8
        add             r1, r1, #8
        mov             ip, #16
        pop             {lr}
.endfunc
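
@ put_h264_qpel16_h_lowpass_neon falls through to process the second
@ 8-pixel-wide half in the function below.
@ put_h264_qpel8_h_lowpass_neon: r0 = dst (stride r3), r1 = src
@ (stride r2), ip = row count.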
function put_h264_qpel8_h_lowpass_neon
1:      vld1.64         {d0, d1}, [r1], r2
        vld1.64         {d16,d17}, [r1], r2
        subs            ip, ip, #2
        lowpass_8       d0, d1, d16, d17, d0, d16
        vst1.64         {d0}, [r0,:64], r3
        vst1.64         {d16}, [r0,:64], r3
        bne             1b
        bx              lr
.endfunc

function put_h264_qpel16_h_lowpass_l2_neon
        push            {lr}
        mov             ip, #16
        bl              put_h264_qpel8_h_lowpass_l2_neon
        sub             r0, r0, r2, lsl #4
        sub             r1, r1, r2, lsl #4
        sub             r3, r3, r2, lsl #4
        add             r0, r0, #8
        add             r1, r1, #8
        add             r3, r3, #8
        mov             ip, #16
        pop             {lr}
.endfunc

function put_h264_qpel8_h_lowpass_l2_neon
1:      vld1.64         {d0, d1}, [r1], r2
        vld1.64         {d16,d17}, [r1], r2
        vld1.64         {d28}, [r3], r2
        vld1.64         {d29}, [r3], r2
        subs            ip, ip, #2
        lowpass_8       d0, d1, d16, d17, d0, d1
        vrhadd.u8       q0, q0, q14
        vst1.64         {d0}, [r0,:64], r2
        vst1.64         {d1}, [r0,:64], r2
        bne             1b
        bx              lr
.endfunc

function put_h264_qpel16_v_lowpass_neon_packed
        mov             r4, lr
        mov             r2, #8
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1, r1, r3, lsl #2
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1, r1, r3, lsl #4
        sub             r1, r1, r3, lsl #2
        add             r1, r1, #8
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1, r1, r3, lsl #2
        mov             lr, r4
        b               put_h264_qpel8_v_lowpass_neon
.endfunc

function put_h264_qpel16_v_lowpass_neon
        mov             r4, lr
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1, r1, r3, lsl #2
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r0, r0, r2, lsl #4
        add             r0, r0, #8
        sub             r1, r1, r3, lsl #4
        sub             r1, r1, r3, lsl #2
        add             r1, r1, #8
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1, r1, r3, lsl #2
        mov             lr, r4
.endfunc
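
@ put_h264_qpel16_v_lowpass_neon falls through for its last 8x8 block.
@ put_h264_qpel8_v_lowpass_neon loads 13 source rows, transposes so the
@ vertical filter can run as a horizontal one, and transposes back
@ before storing (r0 = dst, stride r2; r1 = src, stride r3).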
function put_h264_qpel8_v_lowpass_neon
        vld1.64         {d8}, [r1], r3
        vld1.64         {d10}, [r1], r3
        vld1.64         {d12}, [r1], r3
        vld1.64         {d14}, [r1], r3
        vld1.64         {d22}, [r1], r3
        vld1.64         {d24}, [r1], r3
        vld1.64         {d26}, [r1], r3
        vld1.64         {d28}, [r1], r3
        vld1.64         {d9}, [r1], r3
        vld1.64         {d11}, [r1], r3
        vld1.64         {d13}, [r1], r3
        vld1.64         {d15}, [r1], r3
        vld1.64         {d23}, [r1]
        transpose_8x8   q4, q5, q6, q7, q11, q12, q13, q14
        lowpass_8       d8, d9, d10, d11, d8, d10
        lowpass_8       d12, d13, d14, d15, d12, d14
        lowpass_8       d22, d23, d24, d25, d22, d24
        lowpass_8       d26, d27, d28, d29, d26, d28
        transpose_8x8   d8, d10, d12, d14, d22, d24, d26, d28
        vst1.64         {d8}, [r0,:64], r2
        vst1.64         {d10}, [r0,:64], r2
        vst1.64         {d12}, [r0,:64], r2
        vst1.64         {d14}, [r0,:64], r2
        vst1.64         {d22}, [r0,:64], r2
        vst1.64         {d24}, [r0,:64], r2
        vst1.64         {d26}, [r0,:64], r2
        vst1.64         {d28}, [r0,:64], r2
        bx              lr
.endfunc

function put_h264_qpel16_v_lowpass_l2_neon
        mov             r4, lr
        bl              put_h264_qpel8_v_lowpass_l2_neon
        sub             r1, r1, r3, lsl #2
        bl              put_h264_qpel8_v_lowpass_l2_neon
        sub             r0, r0, r3, lsl #4
        sub             ip, ip, r2, lsl #4
        add             r0, r0, #8
        add             ip, ip, #8
        sub             r1, r1, r3, lsl #4
        sub             r1, r1, r3, lsl #2
        add             r1, r1, #8
        bl              put_h264_qpel8_v_lowpass_l2_neon
        sub             r1, r1, r3, lsl #2
        mov             lr, r4
.endfunc

function put_h264_qpel8_v_lowpass_l2_neon
        vld1.64         {d8}, [r1], r3
        vld1.64         {d10}, [r1], r3
        vld1.64         {d12}, [r1], r3
        vld1.64         {d14}, [r1], r3
        vld1.64         {d22}, [r1], r3
        vld1.64         {d24}, [r1], r3
        vld1.64         {d26}, [r1], r3
        vld1.64         {d28}, [r1], r3
        vld1.64         {d9}, [r1], r3
        vld1.64         {d11}, [r1], r3
        vld1.64         {d13}, [r1], r3
        vld1.64         {d15}, [r1], r3
        vld1.64         {d23}, [r1]
        transpose_8x8   q4, q5, q6, q7, q11, q12, q13, q14
        lowpass_8       d8, d9, d10, d11, d8, d9
        lowpass_8       d12, d13, d14, d15, d12, d13
        lowpass_8       d22, d23, d24, d25, d22, d23
        lowpass_8       d26, d27, d28, d29, d26, d27
        transpose_8x8   d8, d9, d12, d13, d22, d23, d26, d27
        vld1.64         {d0}, [ip], r2
        vld1.64         {d1}, [ip], r2
        vld1.64         {d2}, [ip], r2
        vld1.64         {d3}, [ip], r2
        vld1.64         {d4}, [ip], r2
        vrhadd.u8       q0, q0, q4
        vld1.64         {d5}, [ip], r2
        vrhadd.u8       q1, q1, q6
        vld1.64         {d10}, [ip], r2
        vrhadd.u8       q2, q2, q11
        vld1.64         {d11}, [ip], r2
        vst1.64         {d0}, [r0,:64], r3
        vst1.64         {d1}, [r0,:64], r3
        vrhadd.u8       q5, q5, q13
        vst1.64         {d2}, [r0,:64], r3
        vst1.64         {d3}, [r0,:64], r3
        vst1.64         {d4}, [r0,:64], r3
        vst1.64         {d5}, [r0,:64], r3
        vst1.64         {d10}, [r0,:64], r3
        vst1.64         {d11}, [r0,:64], r3
        bx              lr
.endfunc

function put_h264_qpel8_hv_lowpass_neon_top
        lowpass_const   ip
        mov             ip, #12
1:      vld1.64         {d0, d1}, [r1], r3
        vld1.64         {d16,d17}, [r1], r3
        subs            ip, ip, #2
        lowpass_8       d0, d1, d16, d17, q11, q12, narrow=0
        vst1.64         {d22-d25}, [r4,:128]!
        bne             1b
        vld1.64         {d0, d1}, [r1]
        lowpass_8_1     d0, d1, q12, narrow=0
        mov             ip, #-16
        add             r4, r4, ip
        vld1.64         {d30,d31}, [r4,:128], ip
        vld1.64         {d20,d21}, [r4,:128], ip
        vld1.64         {d18,d19}, [r4,:128], ip
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d14,d15}, [r4,:128], ip
        vld1.64         {d12,d13}, [r4,:128], ip
        vld1.64         {d10,d11}, [r4,:128], ip
        vld1.64         {d8, d9}, [r4,:128], ip
        vld1.64         {d6, d7}, [r4,:128], ip
        vld1.64         {d4, d5}, [r4,:128], ip
        vld1.64         {d2, d3}, [r4,:128], ip
        vld1.64         {d0, d1}, [r4,:128]
        swap4           d1, d3, d5, d7, d8, d10, d12, d14
        transpose16_4x4 q0, q1, q2, q3, q4, q5, q6, q7
        swap4           d17, d19, d21, d31, d24, d26, d28, d22
        transpose16_4x4 q8, q9, q10, q15, q12, q13, q14, q11
        vst1.64         {d30,d31}, [r4,:128]!
        vst1.64         {d6, d7}, [r4,:128]!
        vst1.64         {d20,d21}, [r4,:128]!
        vst1.64         {d4, d5}, [r4,:128]!
        vst1.64         {d18,d19}, [r4,:128]!
        vst1.64         {d2, d3}, [r4,:128]!
        vst1.64         {d16,d17}, [r4,:128]!
        vst1.64         {d0, d1}, [r4,:128]
        lowpass_8.16    q4, q12, d8, d9, d24, d25, d8
        lowpass_8.16    q5, q13, d10, d11, d26, d27, d9
        lowpass_8.16    q6, q14, d12, d13, d28, d29, d10
        lowpass_8.16    q7, q11, d14, d15, d22, d23, d11
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128], ip
        lowpass_8.16    q8, q15, d16, d17, d30, d31, d12
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128], ip
        lowpass_8.16    q8, q15, d16, d17, d30, d31, d13
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128], ip
        lowpass_8.16    q8, q15, d16, d17, d30, d31, d14
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128]
        lowpass_8.16    q8, q15, d16, d17, d30, d31, d15
        transpose_8x8   d12, d13, d14, d15, d8, d9, d10, d11
        bx              lr
.endfunc

function put_h264_qpel8_hv_lowpass_neon
        mov             r10, lr
        bl              put_h264_qpel8_hv_lowpass_neon_top
        vst1.64         {d12}, [r0,:64], r2
        vst1.64         {d13}, [r0,:64], r2
        vst1.64         {d14}, [r0,:64], r2
        vst1.64         {d15}, [r0,:64], r2
        vst1.64         {d8}, [r0,:64], r2
        vst1.64         {d9}, [r0,:64], r2
        vst1.64         {d10}, [r0,:64], r2
        vst1.64         {d11}, [r0,:64], r2
        mov             lr, r10
        bx              lr
.endfunc

function put_h264_qpel8_hv_lowpass_l2_neon
        mov             r10, lr
        bl              put_h264_qpel8_hv_lowpass_neon_top
        vld1.64         {d0, d1}, [r2,:128]!
        vld1.64         {d2, d3}, [r2,:128]!
        vrhadd.u8       q0, q0, q6
        vld1.64         {d4, d5}, [r2,:128]!
        vrhadd.u8       q1, q1, q7
        vld1.64         {d6, d7}, [r2,:128]!
        vrhadd.u8       q2, q2, q4
        vst1.64         {d0}, [r0,:64], r3
        vrhadd.u8       q3, q3, q5
        vst1.64         {d1}, [r0,:64], r3
        vst1.64         {d2}, [r0,:64], r3
        vst1.64         {d3}, [r0,:64], r3
        vst1.64         {d4}, [r0,:64], r3
        vst1.64         {d5}, [r0,:64], r3
        vst1.64         {d6}, [r0,:64], r3
        vst1.64         {d7}, [r0,:64], r3
        mov             lr, r10
        bx              lr
.endfunc

function put_h264_qpel16_hv_lowpass_neon
        mov             r9, lr
        bl              put_h264_qpel8_hv_lowpass_neon
        sub             r1, r1, r3, lsl #2
        bl              put_h264_qpel8_hv_lowpass_neon
        sub             r1, r1, r3, lsl #4
        sub             r1, r1, r3, lsl #2
        add             r1, r1, #8
        sub             r0, r0, r2, lsl #4
        add             r0, r0, #8
        bl              put_h264_qpel8_hv_lowpass_neon
        sub             r1, r1, r3, lsl #2
        mov             lr, r9
        b               put_h264_qpel8_hv_lowpass_neon
.endfunc

function put_h264_qpel16_hv_lowpass_l2_neon
        mov             r9, lr
        sub             r2, r4, #256
        bl              put_h264_qpel8_hv_lowpass_l2_neon
        sub             r1, r1, r3, lsl #2
        bl              put_h264_qpel8_hv_lowpass_l2_neon
        sub             r1, r1, r3, lsl #4
        sub             r1, r1, r3, lsl #2
        add             r1, r1, #8
        sub             r0, r0, r3, lsl #4
        add             r0, r0, #8
        bl              put_h264_qpel8_hv_lowpass_l2_neon
        sub             r1, r1, r3, lsl #2
        mov             lr, r9
        b               put_h264_qpel8_hv_lowpass_l2_neon
.endfunc

function ff_put_h264_qpel8_mc10_neon, export=1
        lowpass_const   r3
        mov             r3, r1
        sub             r1, r1, #2
        mov             ip, #8
        b               put_h264_qpel8_h_lowpass_l2_neon
.endfunc

function ff_put_h264_qpel8_mc20_neon, export=1
        lowpass_const   r3
        sub             r1, r1, #2
        mov             r3, r2
        mov             ip, #8
        b               put_h264_qpel8_h_lowpass_neon
.endfunc

function ff_put_h264_qpel8_mc30_neon, export=1
        lowpass_const   r3
        add             r3, r1, #1
        sub             r1, r1, #2
        mov             ip, #8
        b               put_h264_qpel8_h_lowpass_l2_neon
.endfunc

function ff_put_h264_qpel8_mc01_neon, export=1
        push            {lr}
        mov             ip, r1
put_h264_qpel8_mc01:
        lowpass_const   r3
        mov             r3, r2
        sub             r1, r1, r2, lsl #1
        vpush           {d8-d15}
        bl              put_h264_qpel8_v_lowpass_l2_neon
        vpop            {d8-d15}
        pop             {pc}
.endfunc

function ff_put_h264_qpel8_mc11_neon, export=1
        push            {r0, r1, r2, lr}
put_h264_qpel8_mc11:
        lowpass_const   r3
        sub             sp, sp, #64
        mov             r0, sp
        sub             r1, r1, #2
        mov             r3, #8
        mov             ip, #8
        vpush           {d8-d15}
        bl              put_h264_qpel8_h_lowpass_neon
        ldrd            r0, [sp, #128]
        mov             r3, r2
        add             ip, sp, #64
        sub             r1, r1, r2, lsl #1
        mov             r2, #8
        bl              put_h264_qpel8_v_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp, sp, #76
        pop             {pc}
.endfunc

function ff_put_h264_qpel8_mc21_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
put_h264_qpel8_mc21:
        lowpass_const   r3
        mov             r11, sp
        bic             sp, sp, #15
        sub             sp, sp, #(8*8+16*12)
        sub             r1, r1, #2
        mov             r3, #8
        mov             r0, sp
        mov             ip, #8
        vpush           {d8-d15}
        bl              put_h264_qpel8_h_lowpass_neon
        mov             r4, r0
        ldrd            r0, [r11]
        sub             r1, r1, r2, lsl #1
        sub             r1, r1, #2
        mov             r3, r2
        sub             r2, r4, #64
        bl              put_h264_qpel8_hv_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp, r11, #8
        pop             {r4, r10, r11, pc}
.endfunc

function ff_put_h264_qpel8_mc31_neon, export=1
        add             r1, r1, #1
        push            {r0, r1, r2, lr}
        sub             r1, r1, #1
        b               put_h264_qpel8_mc11
.endfunc

function ff_put_h264_qpel8_mc02_neon, export=1
        push            {lr}
        lowpass_const   r3
        sub             r1, r1, r2, lsl #1
        mov             r3, r2
        vpush           {d8-d15}
        bl              put_h264_qpel8_v_lowpass_neon
        vpop            {d8-d15}
        pop             {pc}
.endfunc

function ff_put_h264_qpel8_mc12_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
put_h264_qpel8_mc12:
        lowpass_const   r3
        mov             r11, sp
        bic             sp, sp, #15
        sub             sp, sp, #(8*8+16*12)
        sub             r1, r1, r2, lsl #1
        mov             r3, r2
        mov             r2, #8
        mov             r0, sp
        vpush           {d8-d15}
        bl              put_h264_qpel8_v_lowpass_neon
        mov             r4, r0
        ldrd            r0, [r11]
        sub             r1, r1, r3, lsl #1
        sub             r1, r1, #2
        sub             r2, r4, #64
        bl              put_h264_qpel8_hv_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp, r11, #8
        pop             {r4, r10, r11, pc}
.endfunc

function ff_put_h264_qpel8_mc22_neon, export=1
        push            {r4, r10, r11, lr}
        mov             r11, sp
        bic             sp, sp, #15
        sub             r1, r1, r2, lsl #1
        sub             r1, r1, #2
        mov             r3, r2
        sub             sp, sp, #(16*12)
        mov             r4, sp
        vpush           {d8-d15}
        bl              put_h264_qpel8_hv_lowpass_neon
        vpop            {d8-d15}
        mov             sp, r11
        pop             {r4, r10, r11, pc}
.endfunc

function ff_put_h264_qpel8_mc32_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
        add             r1, r1, #1
        b               put_h264_qpel8_mc12
.endfunc

function ff_put_h264_qpel8_mc03_neon, export=1
        push            {lr}
        add             ip, r1, r2
        b               put_h264_qpel8_mc01
.endfunc

function ff_put_h264_qpel8_mc13_neon, export=1
        push            {r0, r1, r2, lr}
        add             r1, r1, r2
        b               put_h264_qpel8_mc11
.endfunc

function ff_put_h264_qpel8_mc23_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
        add             r1, r1, r2
        b               put_h264_qpel8_mc21
.endfunc

function ff_put_h264_qpel8_mc33_neon, export=1
        add             r1, r1, #1
        push            {r0, r1, r2, lr}
        add             r1, r1, r2
        sub             r1, r1, #1
        b               put_h264_qpel8_mc11
.endfunc

function ff_put_h264_qpel16_mc10_neon, export=1
        lowpass_const   r3
        mov             r3, r1
        sub             r1, r1, #2
        b               put_h264_qpel16_h_lowpass_l2_neon
.endfunc

function ff_put_h264_qpel16_mc20_neon, export=1
        lowpass_const   r3
        sub             r1, r1, #2
        mov             r3, r2
        b               put_h264_qpel16_h_lowpass_neon
.endfunc

function ff_put_h264_qpel16_mc30_neon, export=1
        lowpass_const   r3
        add             r3, r1, #1
        sub             r1, r1, #2
        b               put_h264_qpel16_h_lowpass_l2_neon
.endfunc

function ff_put_h264_qpel16_mc01_neon, export=1
        push            {r4, lr}
        mov             ip, r1
put_h264_qpel16_mc01:
        lowpass_const   r3
        mov             r3, r2
        sub             r1, r1, r2, lsl #1
        vpush           {d8-d15}
        bl              put_h264_qpel16_v_lowpass_l2_neon
        vpop            {d8-d15}
        pop             {r4, pc}
.endfunc

function ff_put_h264_qpel16_mc11_neon, export=1
        push            {r0, r1, r4, lr}
put_h264_qpel16_mc11:
        lowpass_const   r3
        sub             sp, sp, #256
        mov             r0, sp
        sub             r1, r1, #2
        mov             r3, #16
        vpush           {d8-d15}
        bl              put_h264_qpel16_h_lowpass_neon
        add             r0, sp, #256
        ldrd            r0, [r0, #64]
        mov             r3, r2
        add             ip, sp, #64
        sub             r1, r1, r2, lsl #1
        mov             r2, #16
        bl              put_h264_qpel16_v_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp, sp, #(256+8)
        pop             {r4, pc}
.endfunc

function ff_put_h264_qpel16_mc21_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
put_h264_qpel16_mc21:
        lowpass_const   r3
        mov             r11, sp
        bic             sp, sp, #15
        sub             sp, sp, #(16*16+16*12)
        sub             r1, r1, #2
        mov             r0, sp
        vpush           {d8-d15}
        bl              put_h264_qpel16_h_lowpass_neon_packed
        mov             r4, r0
        ldrd            r0, [r11]
        sub             r1, r1, r2, lsl #1
        sub             r1, r1, #2
        mov             r3, r2
        bl              put_h264_qpel16_hv_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp, r11, #8
        pop             {r4-r5, r9-r11, pc}
.endfunc

function ff_put_h264_qpel16_mc31_neon, export=1
        add             r1, r1, #1
        push            {r0, r1, r4, lr}
        sub             r1, r1, #1
        b               put_h264_qpel16_mc11
.endfunc

function ff_put_h264_qpel16_mc02_neon, export=1
        push            {r4, lr}
        lowpass_const   r3
        sub             r1, r1, r2, lsl #1
        mov             r3, r2
        vpush           {d8-d15}
        bl              put_h264_qpel16_v_lowpass_neon
        vpop            {d8-d15}
        pop             {r4, pc}
.endfunc

function ff_put_h264_qpel16_mc12_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
put_h264_qpel16_mc12:
        lowpass_const   r3
        mov             r11, sp
        bic             sp, sp, #15
        sub             sp, sp, #(16*16+16*12)
        sub             r1, r1, r2, lsl #1
        mov             r0, sp
        mov             r3, r2
        vpush           {d8-d15}
        bl              put_h264_qpel16_v_lowpass_neon_packed
        mov             r4, r0
        ldrd            r0, [r11]
        sub             r1, r1, r3, lsl #1
        sub             r1, r1, #2
        mov             r2, r3
        bl              put_h264_qpel16_hv_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp, r11, #8
        pop             {r4-r5, r9-r11, pc}
.endfunc

function ff_put_h264_qpel16_mc22_neon, export=1
        push            {r4, r9-r11, lr}
        lowpass_const   r3
        mov             r11, sp
        bic             sp, sp, #15
        sub             r1, r1, r2, lsl #1
        sub             r1, r1, #2
        mov             r3, r2
        sub             sp, sp, #(16*12)
        mov             r4, sp
        vpush           {d8-d15}
        bl              put_h264_qpel16_hv_lowpass_neon
        vpop            {d8-d15}
        mov             sp, r11
        pop             {r4, r9-r11, pc}
.endfunc

function ff_put_h264_qpel16_mc32_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
        add             r1, r1, #1
        b               put_h264_qpel16_mc12
.endfunc

function ff_put_h264_qpel16_mc03_neon, export=1
        push            {r4, lr}
        add             ip, r1, r2
        b               put_h264_qpel16_mc01
.endfunc

function ff_put_h264_qpel16_mc13_neon, export=1
        push            {r0, r1, r4, lr}
        add             r1, r1, r2
        b               put_h264_qpel16_mc11
.endfunc

function ff_put_h264_qpel16_mc23_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
        add             r1, r1, r2
        b               put_h264_qpel16_mc21
.endfunc

function ff_put_h264_qpel16_mc33_neon, export=1
        add             r1, r1, #1
        push            {r0, r1, r4, lr}
        add             r1, r1, r2
        sub             r1, r1, #1
        b               put_h264_qpel16_mc11
.endfunc

@ Biweighted prediction
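
@ biweight_h264_pixels: r0 = dst, r1 = src, r2 = stride,
@ r3 = log2_denom; weightd, weights and offset follow on the stack;
@ ip = height.  Computes
@   dst = clip((dst*weightd + src*weights
@               + (((offset + 1) | 1) << log2_denom)) >> (log2_denom + 1))
@ The 10:/20:/30:/40: branches select vmlal/vmlsl combinations for the
@ four weight-sign cases so the widening 8-bit multiplies stay unsigned.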
        .macro biweight_16 macs, macd
        vdup.8          d0, r4
        vdup.8          d1, r5
        vmov            q2, q8
        vmov            q3, q8
1:      subs            ip, ip, #2
        vld1.8          {d20-d21}, [r0,:128], r2
        \macd           q2, d0, d20
        pld             [r0]
        \macd           q3, d0, d21
        vld1.8          {d22-d23}, [r1,:128], r2
        \macs           q2, d1, d22
        pld             [r1]
        \macs           q3, d1, d23
        vmov            q12, q8
        vld1.8          {d28-d29}, [r0,:128], r2
        vmov            q13, q8
        \macd           q12, d0, d28
        pld             [r0]
        \macd           q13, d0, d29
        vld1.8          {d30-d31}, [r1,:128], r2
        \macs           q12, d1, d30
        pld             [r1]
        \macs           q13, d1, d31
        vshl.s16        q2, q2, q9
        vshl.s16        q3, q3, q9
        vqmovun.s16     d4, q2
        vqmovun.s16     d5, q3
        vshl.s16        q12, q12, q9
        vshl.s16        q13, q13, q9
        vqmovun.s16     d24, q12
        vqmovun.s16     d25, q13
        vmov            q3, q8
        vst1.8          {d4-d5}, [r6,:128], r2
        vmov            q2, q8
        vst1.8          {d24-d25}, [r6,:128], r2
        bne             1b
        pop             {r4-r6, pc}
        .endm

        .macro biweight_8 macs, macd
        vdup.8          d0, r4
        vdup.8          d1, r5
        vmov            q1, q8
        vmov            q10, q8
1:      subs            ip, ip, #2
        vld1.8          {d4}, [r0,:64], r2
        \macd           q1, d0, d4
        pld             [r0]
        vld1.8          {d5}, [r1,:64], r2
        \macs           q1, d1, d5
        pld             [r1]
        vld1.8          {d6}, [r0,:64], r2
        \macd           q10, d0, d6
        pld             [r0]
        vld1.8          {d7}, [r1,:64], r2
        \macs           q10, d1, d7
        pld             [r1]
        vshl.s16        q1, q1, q9
        vqmovun.s16     d2, q1
        vshl.s16        q10, q10, q9
        vqmovun.s16     d4, q10
        vmov            q10, q8
        vst1.8          {d2}, [r6,:64], r2
        vmov            q1, q8
        vst1.8          {d4}, [r6,:64], r2
        bne             1b
        pop             {r4-r6, pc}
        .endm

        .macro biweight_4 macs, macd
        vdup.8          d0, r4
        vdup.8          d1, r5
        vmov            q1, q8
        vmov            q10, q8
1:      subs            ip, ip, #4
        vld1.32         {d4[0]}, [r0,:32], r2
        vld1.32         {d4[1]}, [r0,:32], r2
        \macd           q1, d0, d4
        pld             [r0]
        vld1.32         {d5[0]}, [r1,:32], r2
        vld1.32         {d5[1]}, [r1,:32], r2
        \macs           q1, d1, d5
        pld             [r1]
        blt             2f
        vld1.32         {d6[0]}, [r0,:32], r2
        vld1.32         {d6[1]}, [r0,:32], r2
        \macd           q10, d0, d6
        pld             [r0]
        vld1.32         {d7[0]}, [r1,:32], r2
        vld1.32         {d7[1]}, [r1,:32], r2
        \macs           q10, d1, d7
        pld             [r1]
        vshl.s16        q1, q1, q9
        vqmovun.s16     d2, q1
        vshl.s16        q10, q10, q9
        vqmovun.s16     d4, q10
        vmov            q10, q8
        vst1.32         {d2[0]}, [r6,:32], r2
        vst1.32         {d2[1]}, [r6,:32], r2
        vmov            q1, q8
        vst1.32         {d4[0]}, [r6,:32], r2
        vst1.32         {d4[1]}, [r6,:32], r2
        bne             1b
        pop             {r4-r6, pc}
2:      vshl.s16        q1, q1, q9
        vqmovun.s16     d2, q1
        vst1.32         {d2[0]}, [r6,:32], r2
        vst1.32         {d2[1]}, [r6,:32], r2
        pop             {r4-r6, pc}
        .endm

        .macro biweight_func w
function biweight_h264_pixels_\w\()_neon
        push            {r4-r6, lr}
        add             r4, sp, #16
        ldm             r4, {r4-r6}
        lsr             lr, r4, #31
        add             r6, r6, #1
        eors            lr, lr, r5, lsr #30
        orr             r6, r6, #1
        vdup.16         q9, r3
        lsl             r6, r6, r3
        vmvn            q9, q9
        vdup.16         q8, r6
        mov             r6, r0
        beq             10f
        subs            lr, lr, #1
        beq             20f
        subs            lr, lr, #1
        beq             30f
        b               40f
10:     biweight_\w     vmlal.u8, vmlal.u8
20:     rsb             r4, r4, #0
        biweight_\w     vmlal.u8, vmlsl.u8
30:     rsb             r4, r4, #0
        rsb             r5, r5, #0
        biweight_\w     vmlsl.u8, vmlsl.u8
40:     rsb             r5, r5, #0
        biweight_\w     vmlsl.u8, vmlal.u8
.endfunc
        .endm

        .macro biweight_entry w, h, b=1
function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
        mov             ip, #\h
        .if \b
        b               biweight_h264_pixels_\w\()_neon
        .endif
.endfunc
        .endm

        biweight_entry  16, 8
        biweight_entry  16, 16, b=0
        biweight_func   16
        biweight_entry  8, 16
        biweight_entry  8, 4
        biweight_entry  8, 8, b=0
        biweight_func   8
        biweight_entry  4, 8
        biweight_entry  4, 2
        biweight_entry  4, 4, b=0
        biweight_func   4

@ Weighted prediction
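
@ weight_h264_pixels: r0 = block, r1 = stride, r2 = log2_denom,
@ r3 = weight, offset on the stack; ip = height.  Implements
@   o = clip(((block*weight + (1 << (log2_denom - 1))) >> log2_denom)
@            + offset)
@ folding the offset in before the shift (vhadd/vhsub when
@ log2_denom > 1); negative weights take the subtracting paths with
@ the weight negated.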
        .macro weight_16 add
        vdup.8          d0, r3
1:      subs            ip, ip, #2
        vld1.8          {d20-d21}, [r0,:128], r1
        vmull.u8        q2, d0, d20
        pld             [r0]
        vmull.u8        q3, d0, d21
        vld1.8          {d28-d29}, [r0,:128], r1
        vmull.u8        q12, d0, d28
        pld             [r0]
        vmull.u8        q13, d0, d29
        \add            q2, q8, q2
        vrshl.s16       q2, q2, q9
        \add            q3, q8, q3
        vrshl.s16       q3, q3, q9
        vqmovun.s16     d4, q2
        vqmovun.s16     d5, q3
        \add            q12, q8, q12
        vrshl.s16       q12, q12, q9
        \add            q13, q8, q13
        vrshl.s16       q13, q13, q9
        vqmovun.s16     d24, q12
        vqmovun.s16     d25, q13
        vst1.8          {d4-d5}, [r4,:128], r1
        vst1.8          {d24-d25}, [r4,:128], r1
        bne             1b
        pop             {r4, pc}
        .endm

        .macro weight_8 add
        vdup.8          d0, r3
1:      subs            ip, ip, #2
        vld1.8          {d4}, [r0,:64], r1
        vmull.u8        q1, d0, d4
        pld             [r0]
        vld1.8          {d6}, [r0,:64], r1
        vmull.u8        q10, d0, d6
        \add            q1, q8, q1
        pld             [r0]
        vrshl.s16       q1, q1, q9
        vqmovun.s16     d2, q1
        \add            q10, q8, q10
        vrshl.s16       q10, q10, q9
        vqmovun.s16     d4, q10
        vst1.8          {d2}, [r4,:64], r1
        vst1.8          {d4}, [r4,:64], r1
        bne             1b
        pop             {r4, pc}
        .endm

        .macro weight_4 add
        vdup.8          d0, r3
        vmov            q1, q8
        vmov            q10, q8
1:      subs            ip, ip, #4
        vld1.32         {d4[0]}, [r0,:32], r1
        vld1.32         {d4[1]}, [r0,:32], r1
        vmull.u8        q1, d0, d4
        pld             [r0]
        blt             2f
        vld1.32         {d6[0]}, [r0,:32], r1
        vld1.32         {d6[1]}, [r0,:32], r1
        vmull.u8        q10, d0, d6
        pld             [r0]
        \add            q1, q8, q1
        vrshl.s16       q1, q1, q9
        vqmovun.s16     d2, q1
        \add            q10, q8, q10
        vrshl.s16       q10, q10, q9
        vqmovun.s16     d4, q10
        vmov            q10, q8
        vst1.32         {d2[0]}, [r4,:32], r1
        vst1.32         {d2[1]}, [r4,:32], r1
        vmov            q1, q8
        vst1.32         {d4[0]}, [r4,:32], r1
        vst1.32         {d4[1]}, [r4,:32], r1
        bne             1b
        pop             {r4, pc}
2:      \add            q1, q8, q1
        vrshl.s16       q1, q1, q9
        vqmovun.s16     d2, q1
        vst1.32         {d2[0]}, [r4,:32], r1
        vst1.32         {d2[1]}, [r4,:32], r1
        pop             {r4, pc}
        .endm

        .macro weight_func w
function weight_h264_pixels_\w\()_neon
        push            {r4, lr}
        ldr             r4, [sp, #8]
        cmp             r2, #1
        lsl             r4, r4, r2
        vdup.16         q8, r4
        mov             r4, r0
        ble             20f
        rsb             lr, r2, #1
        vdup.16         q9, lr
        cmp             r3, #0
        blt             10f
        weight_\w       vhadd.s16
10:     rsb             r3, r3, #0
        weight_\w       vhsub.s16
20:     rsb             lr, r2, #0
        vdup.16         q9, lr
        cmp             r3, #0
        blt             10f
        weight_\w       vadd.s16
10:     rsb             r3, r3, #0
        weight_\w       vsub.s16
.endfunc
        .endm

        .macro weight_entry w, h, b=1
function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1
        mov             ip, #\h
        .if \b
        b               weight_h264_pixels_\w\()_neon
        .endif
.endfunc
        .endm

        weight_entry    16, 8
        weight_entry    16, 16, b=0
        weight_func     16
        weight_entry    8, 16
        weight_entry    8, 4
        weight_entry    8, 8, b=0
        weight_func     8
        weight_entry    4, 8
        weight_entry    4, 2
        weight_entry    4, 4, b=0
        weight_func     4