/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"
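
@ In-register transpose helpers.  vtrn.SZ exchanges the odd-indexed SZ-bit
@ lanes of its first operand with the even-indexed lanes of its second, so
@ chaining vtrn.32, vtrn.16 and vtrn.8 across eight d registers transposes
@ an 8x8 block of bytes held one row per register; transpose_4x4 does the
@ same for a 4x4 block, and swap4/transpose16_4x4 handle 16-bit elements
@ spread across q registers.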
        .macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7
        vtrn.32 \r0, \r4
        vtrn.32 \r1, \r5
        vtrn.32 \r2, \r6
        vtrn.32 \r3, \r7
        vtrn.16 \r0, \r2
        vtrn.16 \r1, \r3
        vtrn.16 \r4, \r6
        vtrn.16 \r5, \r7
        vtrn.8  \r0, \r1
        vtrn.8  \r2, \r3
        vtrn.8  \r4, \r5
        vtrn.8  \r6, \r7
        .endm

        .macro transpose_4x4 r0 r1 r2 r3
        vtrn.16 \r0, \r2
        vtrn.16 \r1, \r3
        vtrn.8  \r0, \r1
        vtrn.8  \r2, \r3
        .endm

        .macro swap4 r0 r1 r2 r3 r4 r5 r6 r7
        vswp \r0, \r4
        vswp \r1, \r5
        vswp \r2, \r6
        vswp \r3, \r7
        .endm

        .macro transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7
        vtrn.32 \r0, \r2
        vtrn.32 \r1, \r3
        vtrn.32 \r4, \r6
        vtrn.32 \r5, \r7
        vtrn.16 \r0, \r1
        vtrn.16 \r2, \r3
        vtrn.16 \r4, \r5
        vtrn.16 \r6, \r7
        .endm

/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
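@ Bilinear 1/8-pel interpolation per the H.264 spec:
@   dst = ((8-x)*(8-y)*A + x*(8-y)*B + (8-x)*y*C + x*y*D + 32) >> 6
@ where A..D are the four neighbouring source pixels.  The four weights are
@ computed below in r4, ip, r6 and r7 (vrshrn #6 supplies the +32 rounding);
@ the x == 0 and y == 0 special cases branch to cheaper 1-D loops.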
        .macro h264_chroma_mc8 type
function ff_\type\()_h264_chroma_mc8_neon, export=1
        push {r4-r7, lr}
        ldrd r4, [sp, #20]
        .ifc \type,avg
        mov lr, r0
        .endif
        pld [r1]
        pld [r1, r2]
        muls r7, r4, r5
        rsb r6, r7, r5, lsl #3
        rsb ip, r7, r4, lsl #3
        sub r4, r7, r4, lsl #3
        sub r4, r4, r5, lsl #3
        add r4, r4, #64
        beq 2f
        add r5, r1, r2
        vdup.8 d0, r4
        lsl r4, r2, #1
        vdup.8 d1, ip
        vld1.64 {d4, d5}, [r1], r4
        vdup.8 d2, r6
        vld1.64 {d6, d7}, [r5], r4
        vdup.8 d3, r7
        vext.8 d5, d4, d5, #1
        vext.8 d7, d6, d7, #1
1:      pld [r5]
        vmull.u8 q8, d4, d0
        vmlal.u8 q8, d5, d1
        vld1.64 {d4, d5}, [r1], r4
        vmlal.u8 q8, d6, d2
        vext.8 d5, d4, d5, #1
        vmlal.u8 q8, d7, d3
        vmull.u8 q9, d6, d0
        subs r3, r3, #2
        vmlal.u8 q9, d7, d1
        vmlal.u8 q9, d4, d2
        vmlal.u8 q9, d5, d3
        vrshrn.u16 d16, q8, #6
        vld1.64 {d6, d7}, [r5], r4
        pld [r1]
        vrshrn.u16 d17, q9, #6
        .ifc \type,avg
        vld1.64 {d20}, [lr,:64], r2
        vld1.64 {d21}, [lr,:64], r2
        vrhadd.u8 q8, q8, q10
        .endif
        vext.8 d7, d6, d7, #1
        vst1.64 {d16}, [r0,:64], r2
        vst1.64 {d17}, [r0,:64], r2
        bgt 1b
        pop {r4-r7, pc}

2:      tst r6, r6
        add ip, ip, r6
        vdup.8 d0, r4
        vdup.8 d1, ip
        beq 4f
        add r5, r1, r2
        lsl r4, r2, #1
        vld1.64 {d4}, [r1], r4
        vld1.64 {d6}, [r5], r4
3:      pld [r5]
        vmull.u8 q8, d4, d0
        vmlal.u8 q8, d6, d1
        vld1.64 {d4}, [r1], r4
        vmull.u8 q9, d6, d0
        vmlal.u8 q9, d4, d1
        vld1.64 {d6}, [r5], r4
        vrshrn.u16 d16, q8, #6
        vrshrn.u16 d17, q9, #6
        .ifc \type,avg
        vld1.64 {d20}, [lr,:64], r2
        vld1.64 {d21}, [lr,:64], r2
        vrhadd.u8 q8, q8, q10
        .endif
        subs r3, r3, #2
        pld [r1]
        vst1.64 {d16}, [r0,:64], r2
        vst1.64 {d17}, [r0,:64], r2
        bgt 3b
        pop {r4-r7, pc}

4:      vld1.64 {d4, d5}, [r1], r2
        vld1.64 {d6, d7}, [r1], r2
        vext.8 d5, d4, d5, #1
        vext.8 d7, d6, d7, #1
5:      pld [r1]
        subs r3, r3, #2
        vmull.u8 q8, d4, d0
        vmlal.u8 q8, d5, d1
        vld1.64 {d4, d5}, [r1], r2
        vmull.u8 q9, d6, d0
        vmlal.u8 q9, d7, d1
        pld [r1]
        vext.8 d5, d4, d5, #1
        vrshrn.u16 d16, q8, #6
        vrshrn.u16 d17, q9, #6
        .ifc \type,avg
        vld1.64 {d20}, [lr,:64], r2
        vld1.64 {d21}, [lr,:64], r2
        vrhadd.u8 q8, q8, q10
        .endif
        vld1.64 {d6, d7}, [r1], r2
        vext.8 d7, d6, d7, #1
        vst1.64 {d16}, [r0,:64], r2
        vst1.64 {d17}, [r0,:64], r2
        bgt 5b
        pop {r4-r7, pc}
        .endfunc
        .endm

/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
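@ As mc8 above, narrowed to 4-pixel rows: each row and its one-pixel-shifted
@ copy are packed into a single d register with vtrn.32, so one vmull applies
@ two of the weights at once and vadd.i16 then folds the two halves of each
@ accumulator into one 4-pixel row.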
        .macro h264_chroma_mc4 type
function ff_\type\()_h264_chroma_mc4_neon, export=1
        push {r4-r7, lr}
        ldrd r4, [sp, #20]
        .ifc \type,avg
        mov lr, r0
        .endif
        pld [r1]
        pld [r1, r2]
        muls r7, r4, r5
        rsb r6, r7, r5, lsl #3
        rsb ip, r7, r4, lsl #3
        sub r4, r7, r4, lsl #3
        sub r4, r4, r5, lsl #3
        add r4, r4, #64
        beq 2f
        add r5, r1, r2
        vdup.8 d0, r4
        lsl r4, r2, #1
        vdup.8 d1, ip
        vld1.64 {d4}, [r1], r4
        vdup.8 d2, r6
        vld1.64 {d6}, [r5], r4
        vdup.8 d3, r7
        vext.8 d5, d4, d5, #1
        vext.8 d7, d6, d7, #1
        vtrn.32 d4, d5
        vtrn.32 d6, d7
        vtrn.32 d0, d1
        vtrn.32 d2, d3
1:      pld [r5]
        vmull.u8 q8, d4, d0
        vmlal.u8 q8, d6, d2
        vld1.64 {d4}, [r1], r4
        vext.8 d5, d4, d5, #1
        vtrn.32 d4, d5
        vmull.u8 q9, d6, d0
        vmlal.u8 q9, d4, d2
        vld1.64 {d6}, [r5], r4
        vadd.i16 d16, d16, d17
        vadd.i16 d17, d18, d19
        vrshrn.u16 d16, q8, #6
        subs r3, r3, #2
        pld [r1]
        .ifc \type,avg
        vld1.32 {d20[0]}, [lr,:32], r2
        vld1.32 {d20[1]}, [lr,:32], r2
        vrhadd.u8 d16, d16, d20
        .endif
        vext.8 d7, d6, d7, #1
        vtrn.32 d6, d7
        vst1.32 {d16[0]}, [r0,:32], r2
        vst1.32 {d16[1]}, [r0,:32], r2
        bgt 1b
        pop {r4-r7, pc}

2:      tst r6, r6
        add ip, ip, r6
        vdup.8 d0, r4
        vdup.8 d1, ip
        vtrn.32 d0, d1
        beq 4f
        vext.32 d1, d0, d1, #1
        add r5, r1, r2
        lsl r4, r2, #1
        vld1.32 {d4[0]}, [r1], r4
        vld1.32 {d4[1]}, [r5], r4
3:      pld [r5]
        vmull.u8 q8, d4, d0
        vld1.32 {d4[0]}, [r1], r4
        vmull.u8 q9, d4, d1
        vld1.32 {d4[1]}, [r5], r4
        vadd.i16 d16, d16, d17
        vadd.i16 d17, d18, d19
        vrshrn.u16 d16, q8, #6
        .ifc \type,avg
        vld1.32 {d20[0]}, [lr,:32], r2
        vld1.32 {d20[1]}, [lr,:32], r2
        vrhadd.u8 d16, d16, d20
        .endif
        subs r3, r3, #2
        pld [r1]
        vst1.32 {d16[0]}, [r0,:32], r2
        vst1.32 {d16[1]}, [r0,:32], r2
        bgt 3b
        pop {r4-r7, pc}

4:      vld1.64 {d4}, [r1], r2
        vld1.64 {d6}, [r1], r2
        vext.8 d5, d4, d5, #1
        vext.8 d7, d6, d7, #1
        vtrn.32 d4, d5
        vtrn.32 d6, d7
5:      vmull.u8 q8, d4, d0
        vmull.u8 q9, d6, d0
        subs r3, r3, #2
        vld1.64 {d4}, [r1], r2
        vext.8 d5, d4, d5, #1
        vtrn.32 d4, d5
        vadd.i16 d16, d16, d17
        vadd.i16 d17, d18, d19
        pld [r1]
        vrshrn.u16 d16, q8, #6
        .ifc \type,avg
        vld1.32 {d20[0]}, [lr,:32], r2
        vld1.32 {d20[1]}, [lr,:32], r2
        vrhadd.u8 d16, d16, d20
        .endif
        vld1.64 {d6}, [r1], r2
        vext.8 d7, d6, d7, #1
        vtrn.32 d6, d7
        pld [r1]
        vst1.32 {d16[0]}, [r0,:32], r2
        vst1.32 {d16[1]}, [r0,:32], r2
        bgt 5b
        pop {r4-r7, pc}
        .endfunc
        .endm

        .text
        .align

        h264_chroma_mc8 put
        h264_chroma_mc8 avg
        h264_chroma_mc4 put
        h264_chroma_mc4 avg

/* H.264 loop filter */
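@ All loop-filter entry points share this prologue: it loads the four packed
@ tc0 bytes into d24 and takes the standard early exits, returning if alpha
@ or beta is zero or if all four tc0 values are negative (the shift-and-AND
@ sequence below reduces the four sign bits to one testable flag).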
        .macro h264_loop_filter_start
        ldr ip, [sp]
        tst r2, r2
        ldr ip, [ip]
        tstne r3, r3
        vmov.32 d24[0], ip
        and ip, ip, ip, lsl #16
        bxeq lr
        ands ip, ip, ip, lsl #8
        bxlt lr
        .endm

        .macro align_push_regs
        and ip, sp, #15
        add ip, ip, #32
        sub sp, sp, ip
        vst1.64 {d12-d15}, [sp,:128]
        sub sp, sp, #32
        vst1.64 {d8-d11}, [sp,:128]
        .endm

        .macro align_pop_regs
        vld1.64 {d8-d11}, [sp,:128]!
        vld1.64 {d12-d15}, [sp,:128], ip
        .endm
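
@ Normal (bS < 4) luma filter, eight pixels per q register, following the
@ spec formulation:
@   delta = clip(((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc)
@   p0' = clip(p0 + delta), q0' = clip(q0 - delta)
@ with p1/q1 conditionally replaced by clipped averages when the
@ abs(p2 - p0) < beta / abs(q2 - q0) < beta tests pass, each passing test
@ also widening tc by one.  Rows are held as q10 = p2, q9 = p1, q8 = p0,
@ q0 = q0, q1 = q1, q2 = q2.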
        .macro h264_loop_filter_luma
        vdup.8 q11, r2 @ alpha
        vmovl.u8 q12, d24
        vabd.u8 q6, q8, q0 @ abs(p0 - q0)
        vmovl.u16 q12, d24
        vabd.u8 q14, q9, q8 @ abs(p1 - p0)
        vsli.16 q12, q12, #8
        vabd.u8 q15, q1, q0 @ abs(q1 - q0)
        vsli.32 q12, q12, #16
        vclt.u8 q6, q6, q11 @ < alpha
        vdup.8 q11, r3 @ beta
        vclt.s8 q7, q12, #0
        vclt.u8 q14, q14, q11 @ < beta
        vclt.u8 q15, q15, q11 @ < beta
        vbic q6, q6, q7
        vabd.u8 q4, q10, q8 @ abs(p2 - p0)
        vand q6, q6, q14
        vabd.u8 q5, q2, q0 @ abs(q2 - q0)
        vclt.u8 q4, q4, q11 @ < beta
        vand q6, q6, q15
        vclt.u8 q5, q5, q11 @ < beta
        vand q4, q4, q6
        vand q5, q5, q6
        vand q12, q12, q6
        vrhadd.u8 q14, q8, q0
        vsub.i8 q6, q12, q4
        vqadd.u8 q7, q9, q12
        vhadd.u8 q10, q10, q14
        vsub.i8 q6, q6, q5
        vhadd.u8 q14, q2, q14
        vmin.u8 q7, q7, q10
        vqsub.u8 q11, q9, q12
        vqadd.u8 q2, q1, q12
        vmax.u8 q7, q7, q11
        vqsub.u8 q11, q1, q12
        vmin.u8 q14, q2, q14
        vmovl.u8 q2, d0
        vmax.u8 q14, q14, q11
        vmovl.u8 q10, d1
        vsubw.u8 q2, q2, d16
        vsubw.u8 q10, q10, d17
        vshl.i16 q2, q2, #2
        vshl.i16 q10, q10, #2
        vaddw.u8 q2, q2, d18
        vaddw.u8 q10, q10, d19
        vsubw.u8 q2, q2, d2
        vsubw.u8 q10, q10, d3
        vrshrn.i16 d4, q2, #3
        vrshrn.i16 d5, q10, #3
        vbsl q4, q7, q9
        vbsl q5, q14, q1
        vneg.s8 q7, q6
        vmovl.u8 q14, d16
        vmin.s8 q2, q2, q6
        vmovl.u8 q6, d17
        vmax.s8 q2, q2, q7
        vmovl.u8 q11, d0
        vmovl.u8 q12, d1
        vaddw.s8 q14, q14, d4
        vaddw.s8 q6, q6, d5
        vsubw.s8 q11, q11, d4
        vsubw.s8 q12, q12, d5
        vqmovun.s16 d16, q14
        vqmovun.s16 d17, q6
        vqmovun.s16 d0, q11
        vqmovun.s16 d1, q12
        .endm

function ff_h264_v_loop_filter_luma_neon, export=1
        h264_loop_filter_start
        vld1.64 {d0, d1}, [r0,:128], r1
        vld1.64 {d2, d3}, [r0,:128], r1
        vld1.64 {d4, d5}, [r0,:128], r1
        sub r0, r0, r1, lsl #2
        sub r0, r0, r1, lsl #1
        vld1.64 {d20,d21}, [r0,:128], r1
        vld1.64 {d18,d19}, [r0,:128], r1
        vld1.64 {d16,d17}, [r0,:128], r1
        align_push_regs
        h264_loop_filter_luma
        sub r0, r0, r1, lsl #1
        vst1.64 {d8, d9}, [r0,:128], r1
        vst1.64 {d16,d17}, [r0,:128], r1
        vst1.64 {d0, d1}, [r0,:128], r1
        vst1.64 {d10,d11}, [r0,:128]
        align_pop_regs
        bx lr
        .endfunc

function ff_h264_h_loop_filter_luma_neon, export=1
        h264_loop_filter_start
        sub r0, r0, #4
        vld1.64 {d6}, [r0], r1
        vld1.64 {d20}, [r0], r1
        vld1.64 {d18}, [r0], r1
        vld1.64 {d16}, [r0], r1
        vld1.64 {d0}, [r0], r1
        vld1.64 {d2}, [r0], r1
        vld1.64 {d4}, [r0], r1
        vld1.64 {d26}, [r0], r1
        vld1.64 {d7}, [r0], r1
        vld1.64 {d21}, [r0], r1
        vld1.64 {d19}, [r0], r1
        vld1.64 {d17}, [r0], r1
        vld1.64 {d1}, [r0], r1
        vld1.64 {d3}, [r0], r1
        vld1.64 {d5}, [r0], r1
        vld1.64 {d27}, [r0], r1
        transpose_8x8 q3, q10, q9, q8, q0, q1, q2, q13
        align_push_regs
        h264_loop_filter_luma
        transpose_4x4 q4, q8, q0, q5
        sub r0, r0, r1, lsl #4
        add r0, r0, #2
        vst1.32 {d8[0]}, [r0], r1
        vst1.32 {d16[0]}, [r0], r1
        vst1.32 {d0[0]}, [r0], r1
        vst1.32 {d10[0]}, [r0], r1
        vst1.32 {d8[1]}, [r0], r1
        vst1.32 {d16[1]}, [r0], r1
        vst1.32 {d0[1]}, [r0], r1
        vst1.32 {d10[1]}, [r0], r1
        vst1.32 {d9[0]}, [r0], r1
        vst1.32 {d17[0]}, [r0], r1
        vst1.32 {d1[0]}, [r0], r1
        vst1.32 {d11[0]}, [r0], r1
        vst1.32 {d9[1]}, [r0], r1
        vst1.32 {d17[1]}, [r0], r1
        vst1.32 {d1[1]}, [r0], r1
        vst1.32 {d11[1]}, [r0], r1
        align_pop_regs
        bx lr
        .endfunc
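
@ Chroma variant of the bS < 4 filter: only p0 and q0 are modified, using
@ the same delta = clip(((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc),
@ with d18 = p1, d16 = p0, d0 = q0, d2 = q1.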
        .macro h264_loop_filter_chroma
        vdup.8 d22, r2 @ alpha
        vmovl.u8 q12, d24
        vabd.u8 d26, d16, d0 @ abs(p0 - q0)
        vmovl.u8 q2, d0
        vabd.u8 d28, d18, d16 @ abs(p1 - p0)
        vsubw.u8 q2, q2, d16
        vsli.16 d24, d24, #8
        vshl.i16 q2, q2, #2
        vabd.u8 d30, d2, d0 @ abs(q1 - q0)
        vaddw.u8 q2, q2, d18
        vclt.u8 d26, d26, d22 @ < alpha
        vsubw.u8 q2, q2, d2
        vdup.8 d22, r3 @ beta
        vclt.s8 d25, d24, #0
        vrshrn.i16 d4, q2, #3
        vclt.u8 d28, d28, d22 @ < beta
        vbic d26, d26, d25
        vclt.u8 d30, d30, d22 @ < beta
        vand d26, d26, d28
        vneg.s8 d25, d24
        vand d26, d26, d30
        vmin.s8 d4, d4, d24
        vmovl.u8 q14, d16
        vand d4, d4, d26
        vmax.s8 d4, d4, d25
        vmovl.u8 q11, d0
        vaddw.s8 q14, q14, d4
        vsubw.s8 q11, q11, d4
        vqmovun.s16 d16, q14
        vqmovun.s16 d0, q11
        .endm

function ff_h264_v_loop_filter_chroma_neon, export=1
        h264_loop_filter_start
        sub r0, r0, r1, lsl #1
        vld1.64 {d18}, [r0,:64], r1
        vld1.64 {d16}, [r0,:64], r1
        vld1.64 {d0}, [r0,:64], r1
        vld1.64 {d2}, [r0,:64]
        h264_loop_filter_chroma
        sub r0, r0, r1, lsl #1
        vst1.64 {d16}, [r0,:64], r1
        vst1.64 {d0}, [r0,:64], r1
        bx lr
        .endfunc

function ff_h264_h_loop_filter_chroma_neon, export=1
        h264_loop_filter_start
        sub r0, r0, #2
        vld1.32 {d18[0]}, [r0], r1
        vld1.32 {d16[0]}, [r0], r1
        vld1.32 {d0[0]}, [r0], r1
        vld1.32 {d2[0]}, [r0], r1
        vld1.32 {d18[1]}, [r0], r1
        vld1.32 {d16[1]}, [r0], r1
        vld1.32 {d0[1]}, [r0], r1
        vld1.32 {d2[1]}, [r0], r1
        vtrn.16 d18, d0
        vtrn.16 d16, d2
        vtrn.8 d18, d16
        vtrn.8 d0, d2
        h264_loop_filter_chroma
        vtrn.16 d18, d0
        vtrn.16 d16, d2
        vtrn.8 d18, d16
        vtrn.8 d0, d2
        sub r0, r0, r1, lsl #3
        vst1.32 {d18[0]}, [r0], r1
        vst1.32 {d16[0]}, [r0], r1
        vst1.32 {d0[0]}, [r0], r1
        vst1.32 {d2[0]}, [r0], r1
        vst1.32 {d18[1]}, [r0], r1
        vst1.32 {d16[1]}, [r0], r1
        vst1.32 {d0[1]}, [r0], r1
        vst1.32 {d2[1]}, [r0], r1
        bx lr
        .endfunc

/* H.264 qpel MC */
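@ Luma half-pel samples use the 6-tap filter (1, -5, 20, 20, -5, 1):
@   b = (A - 5*B + 20*C + 20*D - 5*E + F + 16) >> 5
@ quarter-pel samples are the rounded average of a half-pel result and a
@ neighbouring sample (the _l2 helpers).  lowpass_const packs the constants
@ 5 and 20 into the first two 16-bit lanes of d6 so that vmla/vmls can use
@ them as scalar multipliers.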
        .macro lowpass_const r
        movw \r, #5
        movt \r, #20
        vmov.32 d6[0], \r
        .endm

        .macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
        .if \narrow
t0      .req q0
t1      .req q8
        .else
t0      .req \d0
t1      .req \d1
        .endif
        vext.8 d2, \r0, \r1, #2
        vext.8 d3, \r0, \r1, #3
        vaddl.u8 q1, d2, d3
        vext.8 d4, \r0, \r1, #1
        vext.8 d5, \r0, \r1, #4
        vaddl.u8 q2, d4, d5
        vext.8 d30, \r0, \r1, #5
        vaddl.u8 t0, \r0, d30
        vext.8 d18, \r2, \r3, #2
        vmla.i16 t0, q1, d6[1]
        vext.8 d19, \r2, \r3, #3
        vaddl.u8 q9, d18, d19
        vext.8 d20, \r2, \r3, #1
        vmls.i16 t0, q2, d6[0]
        vext.8 d21, \r2, \r3, #4
        vaddl.u8 q10, d20, d21
        vext.8 d31, \r2, \r3, #5
        vaddl.u8 t1, \r2, d31
        vmla.i16 t1, q9, d6[1]
        vmls.i16 t1, q10, d6[0]
        .if \narrow
        vqrshrun.s16 \d0, t0, #5
        vqrshrun.s16 \d1, t1, #5
        .endif
        .unreq t0
        .unreq t1
        .endm

        .macro lowpass_8_1 r0, r1, d0, narrow=1
        .if \narrow
t0      .req q0
        .else
t0      .req \d0
        .endif
        vext.8 d2, \r0, \r1, #2
        vext.8 d3, \r0, \r1, #3
        vaddl.u8 q1, d2, d3
        vext.8 d4, \r0, \r1, #1
        vext.8 d5, \r0, \r1, #4
        vaddl.u8 q2, d4, d5
        vext.8 d30, \r0, \r1, #5
        vaddl.u8 t0, \r0, d30
        vmla.i16 t0, q1, d6[1]
        vmls.i16 t0, q2, d6[0]
        .if \narrow
        vqrshrun.s16 \d0, t0, #5
        .endif
        .unreq t0
        .endm

        .macro lowpass_8.16 r0, r1, l0, h0, l1, h1, d
        vext.16 q1, \r0, \r1, #2
        vext.16 q0, \r0, \r1, #3
        vaddl.s16 q9, d2, d0
        vext.16 q2, \r0, \r1, #1
        vaddl.s16 q1, d3, d1
        vext.16 q3, \r0, \r1, #4
        vaddl.s16 q10, d4, d6
        vext.16 \r1, \r0, \r1, #5
        vaddl.s16 q2, d5, d7
        vaddl.s16 q0, \h0, \h1
        vaddl.s16 q8, \l0, \l1
        vshl.i32 q3, q9, #4
        vshl.i32 q9, q9, #2
        vshl.i32 q15, q10, #2
        vadd.i32 q9, q9, q3
        vadd.i32 q10, q10, q15
        vshl.i32 q3, q1, #4
        vshl.i32 q1, q1, #2
        vshl.i32 q15, q2, #2
        vadd.i32 q1, q1, q3
        vadd.i32 q2, q2, q15
        vadd.i32 q9, q9, q8
        vsub.i32 q9, q9, q10
        vadd.i32 q1, q1, q0
        vsub.i32 q1, q1, q2
        vrshrn.s32 d18, q9, #10
        vrshrn.s32 d19, q1, #10
        vqmovun.s16 \d, q9
        .endm

function put_h264_qpel16_h_lowpass_neon_packed
        mov r4, lr
        mov ip, #16
        mov r3, #8
        bl put_h264_qpel8_h_lowpass_neon
        sub r1, r1, r2, lsl #4
        add r1, r1, #8
        mov ip, #16
        mov lr, r4
        b put_h264_qpel8_h_lowpass_neon
        .endfunc

function put_h264_qpel16_h_lowpass_neon
        push {lr}
        mov ip, #16
        bl put_h264_qpel8_h_lowpass_neon
        sub r0, r0, r3, lsl #4
        sub r1, r1, r2, lsl #4
        add r0, r0, #8
        add r1, r1, #8
        mov ip, #16
        pop {lr}
        .endfunc

function put_h264_qpel8_h_lowpass_neon
1:      vld1.64 {d0, d1}, [r1], r2
        vld1.64 {d16,d17}, [r1], r2
        subs ip, ip, #2
        lowpass_8 d0, d1, d16, d17, d0, d16
        vst1.64 {d0}, [r0,:64], r3
        vst1.64 {d16}, [r0,:64], r3
        bne 1b
        bx lr
        .endfunc

function put_h264_qpel16_h_lowpass_l2_neon
        push {lr}
        mov ip, #16
        bl put_h264_qpel8_h_lowpass_l2_neon
        sub r0, r0, r2, lsl #4
        sub r1, r1, r2, lsl #4
        sub r3, r3, r2, lsl #4
        add r0, r0, #8
        add r1, r1, #8
        add r3, r3, #8
        mov ip, #16
        pop {lr}
        .endfunc

function put_h264_qpel8_h_lowpass_l2_neon
1:      vld1.64 {d0, d1}, [r1], r2
        vld1.64 {d16,d17}, [r1], r2
        vld1.64 {d28}, [r3], r2
        vld1.64 {d29}, [r3], r2
        subs ip, ip, #2
        lowpass_8 d0, d1, d16, d17, d0, d1
        vrhadd.u8 q0, q0, q14
        vst1.64 {d0}, [r0,:64], r2
        vst1.64 {d1}, [r0,:64], r2
        bne 1b
        bx lr
        .endfunc

function put_h264_qpel16_v_lowpass_neon_packed
        mov r4, lr
        mov r2, #8
        bl put_h264_qpel8_v_lowpass_neon
        sub r1, r1, r3, lsl #2
        bl put_h264_qpel8_v_lowpass_neon
        sub r1, r1, r3, lsl #4
        sub r1, r1, r3, lsl #2
        add r1, r1, #8
        bl put_h264_qpel8_v_lowpass_neon
        sub r1, r1, r3, lsl #2
        mov lr, r4
        b put_h264_qpel8_v_lowpass_neon
        .endfunc

function put_h264_qpel16_v_lowpass_neon
        mov r4, lr
        bl put_h264_qpel8_v_lowpass_neon
        sub r1, r1, r3, lsl #2
        bl put_h264_qpel8_v_lowpass_neon
        sub r0, r0, r2, lsl #4
        add r0, r0, #8
        sub r1, r1, r3, lsl #4
        sub r1, r1, r3, lsl #2
        add r1, r1, #8
        bl put_h264_qpel8_v_lowpass_neon
        sub r1, r1, r3, lsl #2
        mov lr, r4
        .endfunc

function put_h264_qpel8_v_lowpass_neon
        vld1.64 {d8}, [r1], r3
        vld1.64 {d10}, [r1], r3
        vld1.64 {d12}, [r1], r3
        vld1.64 {d14}, [r1], r3
        vld1.64 {d22}, [r1], r3
        vld1.64 {d24}, [r1], r3
        vld1.64 {d26}, [r1], r3
        vld1.64 {d28}, [r1], r3
        vld1.64 {d9}, [r1], r3
        vld1.64 {d11}, [r1], r3
        vld1.64 {d13}, [r1], r3
        vld1.64 {d15}, [r1], r3
        vld1.64 {d23}, [r1]
        transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
        lowpass_8 d8, d9, d10, d11, d8, d10
        lowpass_8 d12, d13, d14, d15, d12, d14
        lowpass_8 d22, d23, d24, d25, d22, d24
        lowpass_8 d26, d27, d28, d29, d26, d28
        transpose_8x8 d8, d10, d12, d14, d22, d24, d26, d28
        vst1.64 {d8}, [r0,:64], r2
        vst1.64 {d10}, [r0,:64], r2
        vst1.64 {d12}, [r0,:64], r2
        vst1.64 {d14}, [r0,:64], r2
        vst1.64 {d22}, [r0,:64], r2
        vst1.64 {d24}, [r0,:64], r2
        vst1.64 {d26}, [r0,:64], r2
        vst1.64 {d28}, [r0,:64], r2
        bx lr
        .endfunc

function put_h264_qpel16_v_lowpass_l2_neon
        mov r4, lr
        bl put_h264_qpel8_v_lowpass_l2_neon
        sub r1, r1, r3, lsl #2
        bl put_h264_qpel8_v_lowpass_l2_neon
        sub r0, r0, r3, lsl #4
        sub ip, ip, r2, lsl #4
        add r0, r0, #8
        add ip, ip, #8
        sub r1, r1, r3, lsl #4
        sub r1, r1, r3, lsl #2
        add r1, r1, #8
        bl put_h264_qpel8_v_lowpass_l2_neon
        sub r1, r1, r3, lsl #2
        mov lr, r4
        .endfunc

function put_h264_qpel8_v_lowpass_l2_neon
        vld1.64 {d8}, [r1], r3
        vld1.64 {d10}, [r1], r3
        vld1.64 {d12}, [r1], r3
        vld1.64 {d14}, [r1], r3
        vld1.64 {d22}, [r1], r3
        vld1.64 {d24}, [r1], r3
        vld1.64 {d26}, [r1], r3
        vld1.64 {d28}, [r1], r3
        vld1.64 {d9}, [r1], r3
        vld1.64 {d11}, [r1], r3
        vld1.64 {d13}, [r1], r3
        vld1.64 {d15}, [r1], r3
        vld1.64 {d23}, [r1]
        transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
        lowpass_8 d8, d9, d10, d11, d8, d9
        lowpass_8 d12, d13, d14, d15, d12, d13
        lowpass_8 d22, d23, d24, d25, d22, d23
        lowpass_8 d26, d27, d28, d29, d26, d27
        transpose_8x8 d8, d9, d12, d13, d22, d23, d26, d27
        vld1.64 {d0}, [ip], r2
        vld1.64 {d1}, [ip], r2
        vld1.64 {d2}, [ip], r2
        vld1.64 {d3}, [ip], r2
        vld1.64 {d4}, [ip], r2
        vrhadd.u8 q0, q0, q4
        vld1.64 {d5}, [ip], r2
        vrhadd.u8 q1, q1, q6
        vld1.64 {d10}, [ip], r2
        vrhadd.u8 q2, q2, q11
        vld1.64 {d11}, [ip], r2
        vst1.64 {d0}, [r0,:64], r3
        vst1.64 {d1}, [r0,:64], r3
        vrhadd.u8 q5, q5, q13
        vst1.64 {d2}, [r0,:64], r3
        vst1.64 {d3}, [r0,:64], r3
        vst1.64 {d4}, [r0,:64], r3
        vst1.64 {d5}, [r0,:64], r3
        vst1.64 {d10}, [r0,:64], r3
        vst1.64 {d11}, [r0,:64], r3
        bx lr
        .endfunc
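
@ Centre (hv) positions filter in both directions: a first pass filters
@ horizontally and keeps unrounded 16-bit intermediates in the scratch
@ buffer at r4, which is then transposed and filtered vertically at 32-bit
@ precision (lowpass_8.16), the final vrshrn #10 applying the
@ (x + 512) >> 10 rounding back to 8 bits.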
function put_h264_qpel8_hv_lowpass_neon_top
        lowpass_const ip
        mov ip, #12
1:      vld1.64 {d0, d1}, [r1], r3
        vld1.64 {d16,d17}, [r1], r3
        subs ip, ip, #2
        lowpass_8 d0, d1, d16, d17, q11, q12, narrow=0
        vst1.64 {d22-d25}, [r4,:128]!
        bne 1b
        vld1.64 {d0, d1}, [r1]
        lowpass_8_1 d0, d1, q12, narrow=0
        mov ip, #-16
        add r4, r4, ip
        vld1.64 {d30,d31}, [r4,:128], ip
        vld1.64 {d20,d21}, [r4,:128], ip
        vld1.64 {d18,d19}, [r4,:128], ip
        vld1.64 {d16,d17}, [r4,:128], ip
        vld1.64 {d14,d15}, [r4,:128], ip
        vld1.64 {d12,d13}, [r4,:128], ip
        vld1.64 {d10,d11}, [r4,:128], ip
        vld1.64 {d8, d9}, [r4,:128], ip
        vld1.64 {d6, d7}, [r4,:128], ip
        vld1.64 {d4, d5}, [r4,:128], ip
        vld1.64 {d2, d3}, [r4,:128], ip
        vld1.64 {d0, d1}, [r4,:128]
        swap4 d1, d3, d5, d7, d8, d10, d12, d14
        transpose16_4x4 q0, q1, q2, q3, q4, q5, q6, q7
        swap4 d17, d19, d21, d31, d24, d26, d28, d22
        transpose16_4x4 q8, q9, q10, q15, q12, q13, q14, q11
        vst1.64 {d30,d31}, [r4,:128]!
        vst1.64 {d6, d7}, [r4,:128]!
        vst1.64 {d20,d21}, [r4,:128]!
        vst1.64 {d4, d5}, [r4,:128]!
        vst1.64 {d18,d19}, [r4,:128]!
        vst1.64 {d2, d3}, [r4,:128]!
        vst1.64 {d16,d17}, [r4,:128]!
        vst1.64 {d0, d1}, [r4,:128]
        lowpass_8.16 q4, q12, d8, d9, d24, d25, d8
        lowpass_8.16 q5, q13, d10, d11, d26, d27, d9
        lowpass_8.16 q6, q14, d12, d13, d28, d29, d10
        lowpass_8.16 q7, q11, d14, d15, d22, d23, d11
        vld1.64 {d16,d17}, [r4,:128], ip
        vld1.64 {d30,d31}, [r4,:128], ip
        lowpass_8.16 q8, q15, d16, d17, d30, d31, d12
        vld1.64 {d16,d17}, [r4,:128], ip
        vld1.64 {d30,d31}, [r4,:128], ip
        lowpass_8.16 q8, q15, d16, d17, d30, d31, d13
        vld1.64 {d16,d17}, [r4,:128], ip
        vld1.64 {d30,d31}, [r4,:128], ip
        lowpass_8.16 q8, q15, d16, d17, d30, d31, d14
        vld1.64 {d16,d17}, [r4,:128], ip
        vld1.64 {d30,d31}, [r4,:128]
        lowpass_8.16 q8, q15, d16, d17, d30, d31, d15
        transpose_8x8 d12, d13, d14, d15, d8, d9, d10, d11
        bx lr
        .endfunc

function put_h264_qpel8_hv_lowpass_neon
        mov r10, lr
        bl put_h264_qpel8_hv_lowpass_neon_top
        vst1.64 {d12}, [r0,:64], r2
        vst1.64 {d13}, [r0,:64], r2
        vst1.64 {d14}, [r0,:64], r2
        vst1.64 {d15}, [r0,:64], r2
        vst1.64 {d8}, [r0,:64], r2
        vst1.64 {d9}, [r0,:64], r2
        vst1.64 {d10}, [r0,:64], r2
        vst1.64 {d11}, [r0,:64], r2
        mov lr, r10
        bx lr
        .endfunc

function put_h264_qpel8_hv_lowpass_l2_neon
        mov r10, lr
        bl put_h264_qpel8_hv_lowpass_neon_top
        vld1.64 {d0, d1}, [r2,:128]!
        vld1.64 {d2, d3}, [r2,:128]!
        vrhadd.u8 q0, q0, q6
        vld1.64 {d4, d5}, [r2,:128]!
        vrhadd.u8 q1, q1, q7
        vld1.64 {d6, d7}, [r2,:128]!
        vrhadd.u8 q2, q2, q4
        vst1.64 {d0}, [r0,:64], r3
        vrhadd.u8 q3, q3, q5
        vst1.64 {d1}, [r0,:64], r3
        vst1.64 {d2}, [r0,:64], r3
        vst1.64 {d3}, [r0,:64], r3
        vst1.64 {d4}, [r0,:64], r3
        vst1.64 {d5}, [r0,:64], r3
        vst1.64 {d6}, [r0,:64], r3
        vst1.64 {d7}, [r0,:64], r3
        mov lr, r10
        bx lr
        .endfunc

function put_h264_qpel16_hv_lowpass_neon
        mov r9, lr
        bl put_h264_qpel8_hv_lowpass_neon
        sub r1, r1, r3, lsl #2
        bl put_h264_qpel8_hv_lowpass_neon
        sub r1, r1, r3, lsl #4
        sub r1, r1, r3, lsl #2
        add r1, r1, #8
        sub r0, r0, r2, lsl #4
        add r0, r0, #8
        bl put_h264_qpel8_hv_lowpass_neon
        sub r1, r1, r3, lsl #2
        mov lr, r9
        b put_h264_qpel8_hv_lowpass_neon
        .endfunc

function put_h264_qpel16_hv_lowpass_l2_neon
        mov r9, lr
        sub r2, r4, #256
        bl put_h264_qpel8_hv_lowpass_l2_neon
        sub r1, r1, r3, lsl #2
        bl put_h264_qpel8_hv_lowpass_l2_neon
        sub r1, r1, r3, lsl #4
        sub r1, r1, r3, lsl #2
        add r1, r1, #8
        sub r0, r0, r3, lsl #4
        add r0, r0, #8
        bl put_h264_qpel8_hv_lowpass_l2_neon
        sub r1, r1, r3, lsl #2
        mov lr, r9
        b put_h264_qpel8_hv_lowpass_l2_neon
        .endfunc
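
@ Exported mcXY entry points: X is the horizontal and Y the vertical
@ quarter-pel phase.  Half-pel positions call a single lowpass; quarter-pel
@ positions go through the _l2 variants, which average the lowpass output
@ with the appropriate integer or half-pel reference; the mixed inner
@ positions combine the h/v passes with the hv lowpass above.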
function ff_put_h264_qpel8_mc10_neon, export=1
        lowpass_const r3
        mov r3, r1
        sub r1, r1, #2
        mov ip, #8
        b put_h264_qpel8_h_lowpass_l2_neon
        .endfunc

function ff_put_h264_qpel8_mc20_neon, export=1
        lowpass_const r3
        sub r1, r1, #2
        mov r3, r2
        mov ip, #8
        b put_h264_qpel8_h_lowpass_neon
        .endfunc

function ff_put_h264_qpel8_mc30_neon, export=1
        lowpass_const r3
        add r3, r1, #1
        sub r1, r1, #2
        mov ip, #8
        b put_h264_qpel8_h_lowpass_l2_neon
        .endfunc

function ff_put_h264_qpel8_mc01_neon, export=1
        push {lr}
        mov ip, r1
put_h264_qpel8_mc01:
        lowpass_const r3
        mov r3, r2
        sub r1, r1, r2, lsl #1
        vpush {d8-d15}
        bl put_h264_qpel8_v_lowpass_l2_neon
        vpop {d8-d15}
        pop {pc}
        .endfunc

function ff_put_h264_qpel8_mc11_neon, export=1
        push {r0, r1, r11, lr}
put_h264_qpel8_mc11:
        lowpass_const r3
        mov r11, sp
        bic sp, sp, #15
        sub sp, sp, #64
        mov r0, sp
        sub r1, r1, #2
        mov r3, #8
        mov ip, #8
        vpush {d8-d15}
        bl put_h264_qpel8_h_lowpass_neon
        ldrd r0, [r11]
        mov r3, r2
        add ip, sp, #64
        sub r1, r1, r2, lsl #1
        mov r2, #8
        bl put_h264_qpel8_v_lowpass_l2_neon
        vpop {d8-d15}
        add sp, r11, #8
        pop {r11, pc}
        .endfunc

function ff_put_h264_qpel8_mc21_neon, export=1
        push {r0, r1, r4, r10, r11, lr}
put_h264_qpel8_mc21:
        lowpass_const r3
        mov r11, sp
        bic sp, sp, #15
        sub sp, sp, #(8*8+16*12)
        sub r1, r1, #2
        mov r3, #8
        mov r0, sp
        mov ip, #8
        vpush {d8-d15}
        bl put_h264_qpel8_h_lowpass_neon
        mov r4, r0
        ldrd r0, [r11]
        sub r1, r1, r2, lsl #1
        sub r1, r1, #2
        mov r3, r2
        sub r2, r4, #64
        bl put_h264_qpel8_hv_lowpass_l2_neon
        vpop {d8-d15}
        add sp, r11, #8
        pop {r4, r10, r11, pc}
        .endfunc

function ff_put_h264_qpel8_mc31_neon, export=1
        add r1, r1, #1
        push {r0, r1, r11, lr}
        sub r1, r1, #1
        b put_h264_qpel8_mc11
        .endfunc

function ff_put_h264_qpel8_mc02_neon, export=1
        push {lr}
        lowpass_const r3
        sub r1, r1, r2, lsl #1
        mov r3, r2
        vpush {d8-d15}
        bl put_h264_qpel8_v_lowpass_neon
        vpop {d8-d15}
        pop {pc}
        .endfunc

function ff_put_h264_qpel8_mc12_neon, export=1
        push {r0, r1, r4, r10, r11, lr}
put_h264_qpel8_mc12:
        lowpass_const r3
        mov r11, sp
        bic sp, sp, #15
        sub sp, sp, #(8*8+16*12)
        sub r1, r1, r2, lsl #1
        mov r3, r2
        mov r2, #8
        mov r0, sp
        vpush {d8-d15}
        bl put_h264_qpel8_v_lowpass_neon
        mov r4, r0
        ldrd r0, [r11]
        sub r1, r1, r3, lsl #1
        sub r1, r1, #2
        sub r2, r4, #64
        bl put_h264_qpel8_hv_lowpass_l2_neon
        vpop {d8-d15}
        add sp, r11, #8
        pop {r4, r10, r11, pc}
        .endfunc

function ff_put_h264_qpel8_mc22_neon, export=1
        push {r4, r10, r11, lr}
        mov r11, sp
        bic sp, sp, #15
        sub r1, r1, r2, lsl #1
        sub r1, r1, #2
        mov r3, r2
        sub sp, sp, #(16*12)
        mov r4, sp
        vpush {d8-d15}
        bl put_h264_qpel8_hv_lowpass_neon
        vpop {d8-d15}
        mov sp, r11
        pop {r4, r10, r11, pc}
        .endfunc

function ff_put_h264_qpel8_mc32_neon, export=1
        push {r0, r1, r4, r10, r11, lr}
        add r1, r1, #1
        b put_h264_qpel8_mc12
        .endfunc

function ff_put_h264_qpel8_mc03_neon, export=1
        push {lr}
        add ip, r1, r2
        b put_h264_qpel8_mc01
        .endfunc

function ff_put_h264_qpel8_mc13_neon, export=1
        push {r0, r1, r11, lr}
        add r1, r1, r2
        b put_h264_qpel8_mc11
        .endfunc

function ff_put_h264_qpel8_mc23_neon, export=1
        push {r0, r1, r4, r10, r11, lr}
        add r1, r1, r2
        b put_h264_qpel8_mc21
        .endfunc

function ff_put_h264_qpel8_mc33_neon, export=1
        add r1, r1, #1
        push {r0, r1, r11, lr}
        add r1, r1, r2
        sub r1, r1, #1
        b put_h264_qpel8_mc11
        .endfunc

function ff_put_h264_qpel16_mc10_neon, export=1
        lowpass_const r3
        mov r3, r1
        sub r1, r1, #2
        b put_h264_qpel16_h_lowpass_l2_neon
        .endfunc

function ff_put_h264_qpel16_mc20_neon, export=1
        lowpass_const r3
        sub r1, r1, #2
        mov r3, r2
        b put_h264_qpel16_h_lowpass_neon
        .endfunc

function ff_put_h264_qpel16_mc30_neon, export=1
        lowpass_const r3
        add r3, r1, #1
        sub r1, r1, #2
        b put_h264_qpel16_h_lowpass_l2_neon
        .endfunc

function ff_put_h264_qpel16_mc01_neon, export=1
        push {r4, lr}
        mov ip, r1
put_h264_qpel16_mc01:
        lowpass_const r3
        mov r3, r2
        sub r1, r1, r2, lsl #1
        vpush {d8-d15}
        bl put_h264_qpel16_v_lowpass_l2_neon
        vpop {d8-d15}
        pop {r4, pc}
        .endfunc

function ff_put_h264_qpel16_mc11_neon, export=1
        push {r0, r1, r4, r11, lr}
put_h264_qpel16_mc11:
        lowpass_const r3
        mov r11, sp
        bic sp, sp, #15
        sub sp, sp, #256
        mov r0, sp
        sub r1, r1, #2
        mov r3, #16
        vpush {d8-d15}
        bl put_h264_qpel16_h_lowpass_neon
        ldrd r0, [r11]
        mov r3, r2
        add ip, sp, #64
        sub r1, r1, r2, lsl #1
        mov r2, #16
        bl put_h264_qpel16_v_lowpass_l2_neon
        vpop {d8-d15}
        add sp, r11, #8
        pop {r4, r11, pc}
        .endfunc

function ff_put_h264_qpel16_mc21_neon, export=1
        push {r0, r1, r4-r5, r9-r11, lr}
put_h264_qpel16_mc21:
        lowpass_const r3
        mov r11, sp
        bic sp, sp, #15
        sub sp, sp, #(16*16+16*12)
        sub r1, r1, #2
        mov r0, sp
        vpush {d8-d15}
        bl put_h264_qpel16_h_lowpass_neon_packed
        mov r4, r0
        ldrd r0, [r11]
        sub r1, r1, r2, lsl #1
        sub r1, r1, #2
        mov r3, r2
        bl put_h264_qpel16_hv_lowpass_l2_neon
        vpop {d8-d15}
        add sp, r11, #8
        pop {r4-r5, r9-r11, pc}
        .endfunc

function ff_put_h264_qpel16_mc31_neon, export=1
        add r1, r1, #1
        push {r0, r1, r4, r11, lr}
        sub r1, r1, #1
        b put_h264_qpel16_mc11
        .endfunc

function ff_put_h264_qpel16_mc02_neon, export=1
        push {r4, lr}
        lowpass_const r3
        sub r1, r1, r2, lsl #1
        mov r3, r2
        vpush {d8-d15}
        bl put_h264_qpel16_v_lowpass_neon
        vpop {d8-d15}
        pop {r4, pc}
        .endfunc

function ff_put_h264_qpel16_mc12_neon, export=1
        push {r0, r1, r4-r5, r9-r11, lr}
put_h264_qpel16_mc12:
        lowpass_const r3
        mov r11, sp
        bic sp, sp, #15
        sub sp, sp, #(16*16+16*12)
        sub r1, r1, r2, lsl #1
        mov r0, sp
        mov r3, r2
        vpush {d8-d15}
        bl put_h264_qpel16_v_lowpass_neon_packed
        mov r4, r0
        ldrd r0, [r11]
        sub r1, r1, r3, lsl #1
        sub r1, r1, #2
        mov r2, r3
        bl put_h264_qpel16_hv_lowpass_l2_neon
        vpop {d8-d15}
        add sp, r11, #8
        pop {r4-r5, r9-r11, pc}
        .endfunc

function ff_put_h264_qpel16_mc22_neon, export=1
        push {r4, r9-r11, lr}
        lowpass_const r3
        mov r11, sp
        bic sp, sp, #15
        sub r1, r1, r2, lsl #1
        sub r1, r1, #2
        mov r3, r2
        sub sp, sp, #(16*12)
        mov r4, sp
        vpush {d8-d15}
        bl put_h264_qpel16_hv_lowpass_neon
        vpop {d8-d15}
        mov sp, r11
        pop {r4, r9-r11, pc}
        .endfunc

function ff_put_h264_qpel16_mc32_neon, export=1
        push {r0, r1, r4-r5, r9-r11, lr}
        add r1, r1, #1
        b put_h264_qpel16_mc12
        .endfunc

function ff_put_h264_qpel16_mc03_neon, export=1
        push {r4, lr}
        add ip, r1, r2
        b put_h264_qpel16_mc01
        .endfunc

function ff_put_h264_qpel16_mc13_neon, export=1
        push {r0, r1, r4, r11, lr}
        add r1, r1, r2
        b put_h264_qpel16_mc11
        .endfunc

function ff_put_h264_qpel16_mc23_neon, export=1
        push {r0, r1, r4-r5, r9-r11, lr}
        add r1, r1, r2
        b put_h264_qpel16_mc21
        .endfunc

function ff_put_h264_qpel16_mc33_neon, export=1
        add r1, r1, #1
        push {r0, r1, r4, r11, lr}
        add r1, r1, r2
        sub r1, r1, #1
        b put_h264_qpel16_mc11
        .endfunc

@ Biweighted prediction
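@ Explicit bi-prediction weighting:
@   dst = clip((src1*w1 + src2*w2 + (((offset + 1) | 1) << logWD)) >> (logWD + 1))
@ q8 is pre-loaded with the combined offset/rounding term and q9 with the
@ negated shift count (vmvn r3), so vshl.s16 performs the right shift; the
@ 10/20/30/40 branches pick vmlal/vmlsl combinations according to the signs
@ of the two weights.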
        .macro biweight_16 macs, macd
        vdup.8 d0, r4
        vdup.8 d1, r5
        vmov q2, q8
        vmov q3, q8
1:      subs ip, ip, #2
        vld1.8 {d20-d21}, [r0,:128], r2
        \macd q2, d0, d20
        pld [r0]
        \macd q3, d0, d21
        vld1.8 {d22-d23}, [r1,:128], r2
        \macs q2, d1, d22
        pld [r1]
        \macs q3, d1, d23
        vmov q12, q8
        vld1.8 {d28-d29}, [r0,:128], r2
        vmov q13, q8
        \macd q12, d0, d28
        pld [r0]
        \macd q13, d0, d29
        vld1.8 {d30-d31}, [r1,:128], r2
        \macs q12, d1, d30
        pld [r1]
        \macs q13, d1, d31
        vshl.s16 q2, q2, q9
        vshl.s16 q3, q3, q9
        vqmovun.s16 d4, q2
        vqmovun.s16 d5, q3
        vshl.s16 q12, q12, q9
        vshl.s16 q13, q13, q9
        vqmovun.s16 d24, q12
        vqmovun.s16 d25, q13
        vmov q3, q8
        vst1.8 {d4-d5}, [r6,:128], r2
        vmov q2, q8
        vst1.8 {d24-d25}, [r6,:128], r2
        bne 1b
        pop {r4-r6, pc}
        .endm

        .macro biweight_8 macs, macd
        vdup.8 d0, r4
        vdup.8 d1, r5
        vmov q1, q8
        vmov q10, q8
1:      subs ip, ip, #2
        vld1.8 {d4}, [r0,:64], r2
        \macd q1, d0, d4
        pld [r0]
        vld1.8 {d5}, [r1,:64], r2
        \macs q1, d1, d5
        pld [r1]
        vld1.8 {d6}, [r0,:64], r2
        \macd q10, d0, d6
        pld [r0]
        vld1.8 {d7}, [r1,:64], r2
        \macs q10, d1, d7
        pld [r1]
        vshl.s16 q1, q1, q9
        vqmovun.s16 d2, q1
        vshl.s16 q10, q10, q9
        vqmovun.s16 d4, q10
        vmov q10, q8
        vst1.8 {d2}, [r6,:64], r2
        vmov q1, q8
        vst1.8 {d4}, [r6,:64], r2
        bne 1b
        pop {r4-r6, pc}
        .endm

        .macro biweight_4 macs, macd
        vdup.8 d0, r4
        vdup.8 d1, r5
        vmov q1, q8
        vmov q10, q8
1:      subs ip, ip, #4
        vld1.32 {d4[0]}, [r0,:32], r2
        vld1.32 {d4[1]}, [r0,:32], r2
        \macd q1, d0, d4
        pld [r0]
        vld1.32 {d5[0]}, [r1,:32], r2
        vld1.32 {d5[1]}, [r1,:32], r2
        \macs q1, d1, d5
        pld [r1]
        blt 2f
        vld1.32 {d6[0]}, [r0,:32], r2
        vld1.32 {d6[1]}, [r0,:32], r2
        \macd q10, d0, d6
        pld [r0]
        vld1.32 {d7[0]}, [r1,:32], r2
        vld1.32 {d7[1]}, [r1,:32], r2
        \macs q10, d1, d7
        pld [r1]
        vshl.s16 q1, q1, q9
        vqmovun.s16 d2, q1
        vshl.s16 q10, q10, q9
        vqmovun.s16 d4, q10
        vmov q10, q8
        vst1.32 {d2[0]}, [r6,:32], r2
        vst1.32 {d2[1]}, [r6,:32], r2
        vmov q1, q8
        vst1.32 {d4[0]}, [r6,:32], r2
        vst1.32 {d4[1]}, [r6,:32], r2
        bne 1b
        pop {r4-r6, pc}
2:      vshl.s16 q1, q1, q9
        vqmovun.s16 d2, q1
        vst1.32 {d2[0]}, [r6,:32], r2
        vst1.32 {d2[1]}, [r6,:32], r2
        pop {r4-r6, pc}
        .endm

        .macro biweight_func w
function biweight_h264_pixels_\w\()_neon
        push {r4-r6, lr}
        add r4, sp, #16
        ldm r4, {r4-r6}
        lsr lr, r4, #31
        add r6, r6, #1
        eors lr, lr, r5, lsr #30
        orr r6, r6, #1
        vdup.16 q9, r3
        lsl r6, r6, r3
        vmvn q9, q9
        vdup.16 q8, r6
        mov r6, r0
        beq 10f
        subs lr, lr, #1
        beq 20f
        subs lr, lr, #1
        beq 30f
        b 40f
10:     biweight_\w vmlal.u8, vmlal.u8
20:     rsb r4, r4, #0
        biweight_\w vmlal.u8, vmlsl.u8
30:     rsb r4, r4, #0
        rsb r5, r5, #0
        biweight_\w vmlsl.u8, vmlsl.u8
40:     rsb r5, r5, #0
        biweight_\w vmlsl.u8, vmlal.u8
        .endfunc
        .endm

        .macro biweight_entry w, h, b=1
function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
        mov ip, #\h
        .if \b
        b biweight_h264_pixels_\w\()_neon
        .endif
        .endfunc
        .endm

        biweight_entry 16, 8
        biweight_entry 16, 16, b=0
        biweight_func 16
        biweight_entry 8, 16
        biweight_entry 8, 4
        biweight_entry 8, 8, b=0
        biweight_func 8
        biweight_entry 4, 8
        biweight_entry 4, 2
        biweight_entry 4, 4, b=0
        biweight_func 4

@ Weighted prediction
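@ Explicit uni-directional weighting:
@   dst = clip(((src*w + 2^(logWD-1)) >> logWD) + offset)
@ q8 holds offset << logWD so the offset can be folded in before the shift;
@ the logWD > 0 and logWD == 0 cases take separate vhadd/vadd paths, and a
@ negative weight selects the subtracting variant.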
        .macro weight_16 add
        vdup.8 d0, r3
1:      subs ip, ip, #2
        vld1.8 {d20-d21}, [r0,:128], r1
        vmull.u8 q2, d0, d20
        pld [r0]
        vmull.u8 q3, d0, d21
        vld1.8 {d28-d29}, [r0,:128], r1
        vmull.u8 q12, d0, d28
        pld [r0]
        vmull.u8 q13, d0, d29
        \add q2, q8, q2
        vrshl.s16 q2, q2, q9
        \add q3, q8, q3
        vrshl.s16 q3, q3, q9
        vqmovun.s16 d4, q2
        vqmovun.s16 d5, q3
        \add q12, q8, q12
        vrshl.s16 q12, q12, q9
        \add q13, q8, q13
        vrshl.s16 q13, q13, q9
        vqmovun.s16 d24, q12
        vqmovun.s16 d25, q13
        vst1.8 {d4-d5}, [r4,:128], r1
        vst1.8 {d24-d25}, [r4,:128], r1
        bne 1b
        pop {r4, pc}
        .endm

        .macro weight_8 add
        vdup.8 d0, r3
1:      subs ip, ip, #2
        vld1.8 {d4}, [r0,:64], r1
        vmull.u8 q1, d0, d4
        pld [r0]
        vld1.8 {d6}, [r0,:64], r1
        vmull.u8 q10, d0, d6
        \add q1, q8, q1
        pld [r0]
        vrshl.s16 q1, q1, q9
        vqmovun.s16 d2, q1
        \add q10, q8, q10
        vrshl.s16 q10, q10, q9
        vqmovun.s16 d4, q10
        vst1.8 {d2}, [r4,:64], r1
        vst1.8 {d4}, [r4,:64], r1
        bne 1b
        pop {r4, pc}
        .endm

        .macro weight_4 add
        vdup.8 d0, r3
        vmov q1, q8
        vmov q10, q8
1:      subs ip, ip, #4
        vld1.32 {d4[0]}, [r0,:32], r1
        vld1.32 {d4[1]}, [r0,:32], r1
        vmull.u8 q1, d0, d4
        pld [r0]
        blt 2f
        vld1.32 {d6[0]}, [r0,:32], r1
        vld1.32 {d6[1]}, [r0,:32], r1
        vmull.u8 q10, d0, d6
        pld [r0]
        \add q1, q8, q1
        vrshl.s16 q1, q1, q9
        vqmovun.s16 d2, q1
        \add q10, q8, q10
        vrshl.s16 q10, q10, q9
        vqmovun.s16 d4, q10
        vmov q10, q8
        vst1.32 {d2[0]}, [r4,:32], r1
        vst1.32 {d2[1]}, [r4,:32], r1
        vmov q1, q8
        vst1.32 {d4[0]}, [r4,:32], r1
        vst1.32 {d4[1]}, [r4,:32], r1
        bne 1b
        pop {r4, pc}
2:      \add q1, q8, q1
        vrshl.s16 q1, q1, q9
        vqmovun.s16 d2, q1
        vst1.32 {d2[0]}, [r4,:32], r1
        vst1.32 {d2[1]}, [r4,:32], r1
        pop {r4, pc}
        .endm

        .macro weight_func w
function weight_h264_pixels_\w\()_neon
        push {r4, lr}
        ldr r4, [sp, #8]
        cmp r2, #1
        lsl r4, r4, r2
        vdup.16 q8, r4
        mov r4, r0
        ble 20f
        rsb lr, r2, #1
        vdup.16 q9, lr
        cmp r3, #0
        blt 10f
        weight_\w vhadd.s16
10:     rsb r3, r3, #0
        weight_\w vhsub.s16
20:     rsb lr, r2, #0
        vdup.16 q9, lr
        cmp r3, #0
        blt 10f
        weight_\w vadd.s16
10:     rsb r3, r3, #0
        weight_\w vsub.s16
        .endfunc
        .endm

        .macro weight_entry w, h, b=1
function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1
        mov ip, #\h
        .if \b
        b weight_h264_pixels_\w\()_neon
        .endif
        .endfunc
        .endm

        weight_entry 16, 8
        weight_entry 16, 16, b=0
        weight_func 16
        weight_entry 8, 16
        weight_entry 8, 4
        weight_entry 8, 8, b=0
        weight_func 8
        weight_entry 4, 8
        weight_entry 4, 2
        weight_entry 4, 4, b=0
        weight_func 4