/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

        .fpu neon
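
@ In-register matrix transposes built from vtrn: each vtrn.SZ swaps the
@ odd/even SZ-bit elements of a register pair, so cascading .32/.16/.8
@ steps transposes an 8x8 block of bytes held in eight d registers.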
.macro  transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7
        vtrn.32         \r0, \r4
        vtrn.32         \r1, \r5
        vtrn.32         \r2, \r6
        vtrn.32         \r3, \r7
        vtrn.16         \r0, \r2
        vtrn.16         \r1, \r3
        vtrn.16         \r4, \r6
        vtrn.16         \r5, \r7
        vtrn.8          \r0, \r1
        vtrn.8          \r2, \r3
        vtrn.8          \r4, \r5
        vtrn.8          \r6, \r7
.endm

.macro  transpose_4x4 r0 r1 r2 r3
        vtrn.16         \r0, \r2
        vtrn.16         \r1, \r3
        vtrn.8          \r0, \r1
        vtrn.8          \r2, \r3
.endm

.macro  swap4 r0 r1 r2 r3 r4 r5 r6 r7
        vswp            \r0, \r4
        vswp            \r1, \r5
        vswp            \r2, \r6
        vswp            \r3, \r7
.endm

.macro  transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7
        vtrn.32         \r0, \r2
        vtrn.32         \r1, \r3
        vtrn.32         \r4, \r6
        vtrn.32         \r5, \r7
        vtrn.16         \r0, \r1
        vtrn.16         \r2, \r3
        vtrn.16         \r4, \r5
        vtrn.16         \r6, \r7
.endm
        /* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
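@ H.264 chroma MC is a bilinear interpolation with weights derived from
@ the fractional position (x, y):
@   A = (8-x)*(8-y)   B = x*(8-y)   C = (8-x)*y   D = x*y
@   dst[i] = (A*src[i] + B*src[i+1] + C*src[i+stride] + D*src[i+stride+1] + 32) >> 6
@ Below, d0-d3 hold A, B, C, D and vrshrn #6 supplies the +32 rounding;
@ the pure-horizontal and pure-vertical cases branch to simpler 1-D loops.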
.macro  h264_chroma_mc8 type
function ff_\type\()_h264_chroma_mc8_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4,  [sp, #20]
.ifc \type,avg
        mov             lr,  r0
.endif
        pld             [r1]
        pld             [r1, r2]

        muls            r7,  r4,  r5
        rsb             r6,  r7,  r5,  lsl #3
        rsb             ip,  r7,  r4,  lsl #3
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64

        beq             2f

        add             r5,  r1,  r2

        vdup.8          d0,  r4
        lsl             r4,  r2,  #1
        vdup.8          d1,  ip
        vld1.64         {d4, d5}, [r1], r4
        vdup.8          d2,  r6
        vld1.64         {d6, d7}, [r5], r4
        vdup.8          d3,  r7

        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1

1:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vld1.64         {d4, d5}, [r1], r4
        vmlal.u8        q8,  d6,  d2
        vext.8          d5,  d4,  d5,  #1
        vmlal.u8        q8,  d7,  d3
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vmlal.u8        q9,  d7,  d1
        vmlal.u8        q9,  d4,  d2
        vmlal.u8        q9,  d5,  d3
        vrshrn.u16      d16, q8,  #6
        vld1.64         {d6, d7}, [r5], r4
        pld             [r1]
        vrshrn.u16      d17, q9,  #6
.ifc \type,avg
        vld1.64         {d20}, [lr,:64], r2
        vld1.64         {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
.endif
        vext.8          d7,  d6,  d7,  #1
        vst1.64         {d16}, [r0,:64], r2
        vst1.64         {d17}, [r0,:64], r2
        bgt             1b

        pop             {r4-r7, pc}

2:      tst             r6,  r6
        add             ip,  ip,  r6
        vdup.8          d0,  r4
        vdup.8          d1,  ip

        beq             4f

        add             r5,  r1,  r2
        lsl             r4,  r2,  #1
        vld1.64         {d4}, [r1], r4
        vld1.64         {d6}, [r5], r4

3:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d1
        vld1.64         {d4}, [r1], r4
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d1
        vld1.64         {d6}, [r5], r4
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
.ifc \type,avg
        vld1.64         {d20}, [lr,:64], r2
        vld1.64         {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
.endif
        subs            r3,  r3,  #2
        pld             [r1]
        vst1.64         {d16}, [r0,:64], r2
        vst1.64         {d17}, [r0,:64], r2
        bgt             3b

        pop             {r4-r7, pc}

4:      vld1.64         {d4, d5}, [r1], r2
        vld1.64         {d6, d7}, [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1

5:      pld             [r1]
        subs            r3,  r3,  #2
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vld1.64         {d4, d5}, [r1], r2
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d7,  d1
        pld             [r1]
        vext.8          d5,  d4,  d5,  #1
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
.ifc \type,avg
        vld1.64         {d20}, [lr,:64], r2
        vld1.64         {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
.endif
        vld1.64         {d6, d7}, [r1], r2
        vext.8          d7,  d6,  d7,  #1
        vst1.64         {d16}, [r0,:64], r2
        vst1.64         {d17}, [r0,:64], r2
        bgt             5b

        pop             {r4-r7, pc}
.endfunc
.endm
        /* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
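@ Same bilinear filter as mc8, applied to 4-pixel rows: the A/B (and C/D)
@ weights share one d register via vtrn.32, so each vmull/vmlal computes
@ both horizontal taps of a row at once; vadd.i16 then folds the two
@ halves of the accumulator before the rounding shift.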
.macro  h264_chroma_mc4 type
function ff_\type\()_h264_chroma_mc4_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4,  [sp, #20]
.ifc \type,avg
        mov             lr,  r0
.endif
        pld             [r1]
        pld             [r1, r2]

        muls            r7,  r4,  r5
        rsb             r6,  r7,  r5,  lsl #3
        rsb             ip,  r7,  r4,  lsl #3
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64

        beq             2f

        add             r5,  r1,  r2

        vdup.8          d0,  r4
        lsl             r4,  r2,  #1
        vdup.8          d1,  ip
        vld1.64         {d4}, [r1], r4
        vdup.8          d2,  r6
        vld1.64         {d6}, [r5], r4
        vdup.8          d3,  r7

        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d4,  d5
        vtrn.32         d6,  d7

        vtrn.32         d0,  d1
        vtrn.32         d2,  d3

1:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d2
        vld1.64         {d4}, [r1], r4
        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d2
        vld1.64         {d6}, [r5], r4
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        vrshrn.u16      d16, q8,  #6
        subs            r3,  r3,  #2
        pld             [r1]
.ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
.endif
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d6,  d7
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             1b

        pop             {r4-r7, pc}

2:      tst             r6,  r6
        add             ip,  ip,  r6
        vdup.8          d0,  r4
        vdup.8          d1,  ip
        vtrn.32         d0,  d1

        beq             4f

        vext.32         d1,  d0,  d1,  #1
        add             r5,  r1,  r2
        lsl             r4,  r2,  #1
        vld1.32         {d4[0]}, [r1], r4
        vld1.32         {d4[1]}, [r5], r4

3:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vld1.32         {d4[0]}, [r1], r4
        vmull.u8        q9,  d4,  d1
        vld1.32         {d4[1]}, [r5], r4
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        vrshrn.u16      d16, q8,  #6
.ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
.endif
        subs            r3,  r3,  #2
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             3b

        pop             {r4-r7, pc}

4:      vld1.64         {d4}, [r1], r2
        vld1.64         {d6}, [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d4,  d5
        vtrn.32         d6,  d7

5:      vmull.u8        q8,  d4,  d0
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vld1.64         {d4}, [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        pld             [r1]
        vrshrn.u16      d16, q8,  #6
.ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
.endif
        vld1.64         {d6}, [r1], r2
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d6,  d7
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             5b

        pop             {r4-r7, pc}
.endfunc
.endm
        .text
        .align

        h264_chroma_mc8 put
        h264_chroma_mc8 avg
        h264_chroma_mc4 put
        h264_chroma_mc4 avg

        /* H.264 loop filter */
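@ Common prologue for the loop filter entry points: it loads the four
@ tc0 values into d24[0] and returns to the caller early when alpha or
@ beta is zero, or when all four tc0 bytes are negative (nothing to do).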
.macro  h264_loop_filter_start
        ldr             ip,  [sp]
        tst             r2,  r2
        ldr             ip,  [ip]
        tstne           r3,  r3
        vmov.32         d24[0], ip
        and             ip,  ip,  ip, lsl #16
        bxeq            lr
        ands            ip,  ip,  ip, lsl #8
        bxlt            lr
.endm

.macro  align_push_regs
        and             ip,  sp,  #15
        add             ip,  ip,  #32
        sub             sp,  sp,  ip
        vst1.64         {d12-d15}, [sp,:128]
        sub             sp,  sp,  #32
        vst1.64         {d8-d11},  [sp,:128]
.endm

.macro  align_pop_regs
        vld1.64         {d8-d11},  [sp,:128]!
        vld1.64         {d12-d15}, [sp,:128], ip
.endm
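
@ Normal (bS < 4) luma filtering, 16 pixels at a time: p0/p1/p2 sit in
@ q8/q9/q10, q0/q1/q2 in q0/q1/q2, and q12 is tc0 expanded to one byte
@ per pixel.  The core delta is the standard
@   delta = clip((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -tc, tc)
@ with p1/q1 also updated where abs(p2-p0) resp. abs(q2-q0) < beta.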
.macro  h264_loop_filter_luma
        vdup.8          q11, r2         @ alpha
        vmovl.u8        q12, d24
        vabd.u8         q6,  q8,  q0    @ abs(p0 - q0)
        vmovl.u16       q12, d24
        vabd.u8         q14, q9,  q8    @ abs(p1 - p0)
        vsli.16         q12, q12, #8
        vabd.u8         q15, q1,  q0    @ abs(q1 - q0)
        vsli.32         q12, q12, #16
        vclt.u8         q6,  q6,  q11   @ < alpha
        vdup.8          q11, r3         @ beta
        vclt.s8         q7,  q12, #0
        vclt.u8         q14, q14, q11   @ < beta
        vclt.u8         q15, q15, q11   @ < beta
        vbic            q6,  q6,  q7
        vabd.u8         q4,  q10, q8    @ abs(p2 - p0)
        vand            q6,  q6,  q14
        vabd.u8         q5,  q2,  q0    @ abs(q2 - q0)
        vclt.u8         q4,  q4,  q11   @ < beta
        vand            q6,  q6,  q15
        vclt.u8         q5,  q5,  q11   @ < beta
        vand            q4,  q4,  q6
        vand            q5,  q5,  q6
        vand            q12, q12, q6
        vrhadd.u8       q14, q8,  q0
        vsub.i8         q6,  q12, q4
        vqadd.u8        q7,  q9,  q12
        vhadd.u8        q10, q10, q14
        vsub.i8         q6,  q6,  q5
        vhadd.u8        q14, q2,  q14
        vmin.u8         q7,  q7,  q10
        vqsub.u8        q11, q9,  q12
        vqadd.u8        q2,  q1,  q12
        vmax.u8         q7,  q7,  q11
        vqsub.u8        q11, q1,  q12
        vmin.u8         q14, q2,  q14
        vmovl.u8        q2,  d0
        vmax.u8         q14, q14, q11
        vmovl.u8        q10, d1
        vsubw.u8        q2,  q2,  d16
        vsubw.u8        q10, q10, d17
        vshl.i16        q2,  q2,  #2
        vshl.i16        q10, q10, #2
        vaddw.u8        q2,  q2,  d18
        vaddw.u8        q10, q10, d19
        vsubw.u8        q2,  q2,  d2
        vsubw.u8        q10, q10, d3
        vrshrn.i16      d4,  q2,  #3
        vrshrn.i16      d5,  q10, #3
        vbsl            q4,  q7,  q9
        vbsl            q5,  q14, q1
        vneg.s8         q7,  q6
        vmovl.u8        q14, d16
        vmin.s8         q2,  q2,  q6
        vmovl.u8        q6,  d17
        vmax.s8         q2,  q2,  q7
        vmovl.u8        q11, d0
        vmovl.u8        q12, d1
        vaddw.s8        q14, q14, d4
        vaddw.s8        q6,  q6,  d5
        vsubw.s8        q11, q11, d4
        vsubw.s8        q12, q12, d5
        vqmovun.s16     d16, q14
        vqmovun.s16     d17, q6
        vqmovun.s16     d0,  q11
        vqmovun.s16     d1,  q12
.endm

function ff_h264_v_loop_filter_luma_neon, export=1
        h264_loop_filter_start

        vld1.64         {d0, d1},  [r0,:128], r1
        vld1.64         {d2, d3},  [r0,:128], r1
        vld1.64         {d4, d5},  [r0,:128], r1
        sub             r0,  r0,  r1, lsl #2
        sub             r0,  r0,  r1, lsl #1
        vld1.64         {d20,d21}, [r0,:128], r1
        vld1.64         {d18,d19}, [r0,:128], r1
        vld1.64         {d16,d17}, [r0,:128], r1

        align_push_regs

        h264_loop_filter_luma

        sub             r0,  r0,  r1, lsl #1
        vst1.64         {d8, d9},  [r0,:128], r1
        vst1.64         {d16,d17}, [r0,:128], r1
        vst1.64         {d0, d1},  [r0,:128], r1
        vst1.64         {d10,d11}, [r0,:128]

        align_pop_regs
        bx              lr
.endfunc

function ff_h264_h_loop_filter_luma_neon, export=1
        h264_loop_filter_start

        sub             r0,  r0,  #4
        vld1.64         {d6},  [r0], r1
        vld1.64         {d20}, [r0], r1
        vld1.64         {d18}, [r0], r1
        vld1.64         {d16}, [r0], r1
        vld1.64         {d0},  [r0], r1
        vld1.64         {d2},  [r0], r1
        vld1.64         {d4},  [r0], r1
        vld1.64         {d26}, [r0], r1
        vld1.64         {d7},  [r0], r1
        vld1.64         {d21}, [r0], r1
        vld1.64         {d19}, [r0], r1
        vld1.64         {d17}, [r0], r1
        vld1.64         {d1},  [r0], r1
        vld1.64         {d3},  [r0], r1
        vld1.64         {d5},  [r0], r1
        vld1.64         {d27}, [r0], r1

        transpose_8x8   q3, q10, q9, q8, q0, q1, q2, q13

        align_push_regs

        h264_loop_filter_luma

        transpose_4x4   q4, q8, q0, q5

        sub             r0,  r0,  r1, lsl #4
        add             r0,  r0,  #2
        vst1.32         {d8[0]},  [r0], r1
        vst1.32         {d16[0]}, [r0], r1
        vst1.32         {d0[0]},  [r0], r1
        vst1.32         {d10[0]}, [r0], r1
        vst1.32         {d8[1]},  [r0], r1
        vst1.32         {d16[1]}, [r0], r1
        vst1.32         {d0[1]},  [r0], r1
        vst1.32         {d10[1]}, [r0], r1
        vst1.32         {d9[0]},  [r0], r1
        vst1.32         {d17[0]}, [r0], r1
        vst1.32         {d1[0]},  [r0], r1
        vst1.32         {d11[0]}, [r0], r1
        vst1.32         {d9[1]},  [r0], r1
        vst1.32         {d17[1]}, [r0], r1
        vst1.32         {d1[1]},  [r0], r1
        vst1.32         {d11[1]}, [r0], r1

        align_pop_regs
        bx              lr
.endfunc
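
@ Chroma filtering modifies only p0/q0, using the same clipped delta as
@ the luma filter:
@   delta = clip((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -tc, tc)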
.macro  h264_loop_filter_chroma
        vdup.8          d22, r2         @ alpha
        vmovl.u8        q12, d24
        vabd.u8         d26, d16, d0    @ abs(p0 - q0)
        vmovl.u8        q2,  d0
        vabd.u8         d28, d18, d16   @ abs(p1 - p0)
        vsubw.u8        q2,  q2,  d16
        vsli.16         d24, d24, #8
        vshl.i16        q2,  q2,  #2
        vabd.u8         d30, d2,  d0    @ abs(q1 - q0)
        vaddw.u8        q2,  q2,  d18
        vclt.u8         d26, d26, d22   @ < alpha
        vsubw.u8        q2,  q2,  d2
        vdup.8          d22, r3         @ beta
        vclt.s8         d25, d24, #0
        vrshrn.i16      d4,  q2,  #3
        vclt.u8         d28, d28, d22   @ < beta
        vbic            d26, d26, d25
        vclt.u8         d30, d30, d22   @ < beta
        vand            d26, d26, d28
        vneg.s8         d25, d24
        vand            d26, d26, d30
        vmin.s8         d4,  d4,  d24
        vmovl.u8        q14, d16
        vand            d4,  d4,  d26
        vmax.s8         d4,  d4,  d25
        vmovl.u8        q11, d0
        vaddw.s8        q14, q14, d4
        vsubw.s8        q11, q11, d4
        vqmovun.s16     d16, q14
        vqmovun.s16     d0,  q11
.endm

function ff_h264_v_loop_filter_chroma_neon, export=1
        h264_loop_filter_start

        sub             r0,  r0,  r1, lsl #1
        vld1.64         {d18}, [r0,:64], r1
        vld1.64         {d16}, [r0,:64], r1
        vld1.64         {d0},  [r0,:64], r1
        vld1.64         {d2},  [r0,:64]

        h264_loop_filter_chroma

        sub             r0,  r0,  r1, lsl #1
        vst1.64         {d16}, [r0,:64], r1
        vst1.64         {d0},  [r0,:64], r1

        bx              lr
.endfunc

function ff_h264_h_loop_filter_chroma_neon, export=1
        h264_loop_filter_start

        sub             r0,  r0,  #2
        vld1.32         {d18[0]}, [r0], r1
        vld1.32         {d16[0]}, [r0], r1
        vld1.32         {d0[0]},  [r0], r1
        vld1.32         {d2[0]},  [r0], r1
        vld1.32         {d18[1]}, [r0], r1
        vld1.32         {d16[1]}, [r0], r1
        vld1.32         {d0[1]},  [r0], r1
        vld1.32         {d2[1]},  [r0], r1

        vtrn.16         d18, d0
        vtrn.16         d16, d2
        vtrn.8          d18, d16
        vtrn.8          d0,  d2

        h264_loop_filter_chroma

        vtrn.16         d18, d0
        vtrn.16         d16, d2
        vtrn.8          d18, d16
        vtrn.8          d0,  d2

        sub             r0,  r0,  r1, lsl #3
        vst1.32         {d18[0]}, [r0], r1
        vst1.32         {d16[0]}, [r0], r1
        vst1.32         {d0[0]},  [r0], r1
        vst1.32         {d2[0]},  [r0], r1
        vst1.32         {d18[1]}, [r0], r1
        vst1.32         {d16[1]}, [r0], r1
        vst1.32         {d0[1]},  [r0], r1
        vst1.32         {d2[1]},  [r0], r1

        bx              lr
.endfunc

        /* H.264 qpel MC */
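@ The qpel functions use the H.264 6-tap half-pel filter (1, -5, 20, 20,
@ -5, 1).  lowpass_const packs the constants 5 and 20 into the 16-bit
@ scalars d6[0]/d6[1] so the core can use vmla/vmls with a scalar:
@   out = clip((x[-2] - 5*x[-1] + 20*x[0] + 20*x[1] - 5*x[2] + x[3] + 16) >> 5)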
.macro  lowpass_const r
        movw            \r,  #5
        movt            \r,  #20
        vmov.32         d6[0], \r
.endm

.macro  lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
.if \narrow
        t0 .req q0
        t1 .req q8
.else
        t0 .req \d0
        t1 .req \d1
.endif
        vext.8          d2,  \r0, \r1, #2
        vext.8          d3,  \r0, \r1, #3
        vaddl.u8        q1,  d2,  d3
        vext.8          d4,  \r0, \r1, #1
        vext.8          d5,  \r0, \r1, #4
        vaddl.u8        q2,  d4,  d5
        vext.8          d30, \r0, \r1, #5
        vaddl.u8        t0,  \r0, d30
        vext.8          d18, \r2, \r3, #2
        vmla.i16        t0,  q1,  d6[1]
        vext.8          d19, \r2, \r3, #3
        vaddl.u8        q9,  d18, d19
        vext.8          d20, \r2, \r3, #1
        vmls.i16        t0,  q2,  d6[0]
        vext.8          d21, \r2, \r3, #4
        vaddl.u8        q10, d20, d21
        vext.8          d31, \r2, \r3, #5
        vaddl.u8        t1,  \r2, d31
        vmla.i16        t1,  q9,  d6[1]
        vmls.i16        t1,  q10, d6[0]
.if \narrow
        vqrshrun.s16    \d0, t0,  #5
        vqrshrun.s16    \d1, t1,  #5
.endif
        .unreq t0
        .unreq t1
.endm

.macro  lowpass_8_1 r0, r1, d0, narrow=1
.if \narrow
        t0 .req q0
.else
        t0 .req \d0
.endif
        vext.8          d2,  \r0, \r1, #2
        vext.8          d3,  \r0, \r1, #3
        vaddl.u8        q1,  d2,  d3
        vext.8          d4,  \r0, \r1, #1
        vext.8          d5,  \r0, \r1, #4
        vaddl.u8        q2,  d4,  d5
        vext.8          d30, \r0, \r1, #5
        vaddl.u8        t0,  \r0, d30
        vmla.i16        t0,  q1,  d6[1]
        vmls.i16        t0,  q2,  d6[0]
.if \narrow
        vqrshrun.s16    \d0, t0,  #5
.endif
        .unreq t0
.endm
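
@ lowpass_8.16 is the same 6-tap filter applied to 16-bit intermediates
@ (the output of a first pass run with narrow=0), widening to 32 bits.
@ The 20x and 5x taps are built from shifts (16x + 4x, and 4x + x) and
@ the result is narrowed with a rounding shift: (x + 512) >> 10.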
.macro  lowpass_8.16 r0, r1, l0, h0, l1, h1, d
        vext.16         q1,  \r0, \r1, #2
        vext.16         q0,  \r0, \r1, #3
        vaddl.s16       q9,  d2,  d0
        vext.16         q2,  \r0, \r1, #1
        vaddl.s16       q1,  d3,  d1
        vext.16         q3,  \r0, \r1, #4
        vaddl.s16       q10, d4,  d6
        vext.16         \r1, \r0, \r1, #5
        vaddl.s16       q2,  d5,  d7
        vaddl.s16       q0,  \h0, \h1
        vaddl.s16       q8,  \l0, \l1
        vshl.i32        q3,  q9,  #4
        vshl.i32        q9,  q9,  #2
        vshl.i32        q15, q10, #2
        vadd.i32        q9,  q9,  q3
        vadd.i32        q10, q10, q15
        vshl.i32        q3,  q1,  #4
        vshl.i32        q1,  q1,  #2
        vshl.i32        q15, q2,  #2
        vadd.i32        q1,  q1,  q3
        vadd.i32        q2,  q2,  q15
        vadd.i32        q9,  q9,  q8
        vsub.i32        q9,  q9,  q10
        vadd.i32        q1,  q1,  q0
        vsub.i32        q1,  q1,  q2
        vrshrn.s32      d18, q9,  #10
        vrshrn.s32      d19, q1,  #10
        vqmovun.s16     \d,  q9
.endm
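
@ The 16-wide/16-tall helpers below run the 8-wide core several times,
@ stepping the source and destination pointers between calls; the *_l2
@ variants additionally average the lowpass result with a second
@ reference using vrhadd.u8 (the rounding average used for the
@ quarter-pel positions).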
function put_h264_qpel16_h_lowpass_neon_packed
        mov             r4,  lr
        mov             ip,  #16
        mov             r3,  #8
        bl              put_h264_qpel8_h_lowpass_neon
        sub             r1,  r1,  r2, lsl #4
        add             r1,  r1,  #8
        mov             ip,  #16
        mov             lr,  r4
        b               put_h264_qpel8_h_lowpass_neon
.endfunc

function put_h264_qpel16_h_lowpass_neon
        push            {lr}
        mov             ip,  #16
        bl              put_h264_qpel8_h_lowpass_neon
        sub             r0,  r0,  r3, lsl #4
        sub             r1,  r1,  r2, lsl #4
        add             r0,  r0,  #8
        add             r1,  r1,  #8
        mov             ip,  #16
        pop             {lr}
.endfunc

function put_h264_qpel8_h_lowpass_neon
1:      vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d16,d17}, [r1], r2
        subs            ip,  ip,  #2
        lowpass_8       d0,  d1,  d16, d17, d0,  d16
        vst1.64         {d0},  [r0,:64], r3
        vst1.64         {d16}, [r0,:64], r3
        bne             1b
        bx              lr
.endfunc

function put_h264_qpel16_h_lowpass_l2_neon
        push            {lr}
        mov             ip,  #16
        bl              put_h264_qpel8_h_lowpass_l2_neon
        sub             r0,  r0,  r2, lsl #4
        sub             r1,  r1,  r2, lsl #4
        sub             r3,  r3,  r2, lsl #4
        add             r0,  r0,  #8
        add             r1,  r1,  #8
        add             r3,  r3,  #8
        mov             ip,  #16
        pop             {lr}
.endfunc

function put_h264_qpel8_h_lowpass_l2_neon
1:      vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d16,d17}, [r1], r2
        vld1.64         {d28},     [r3], r2
        vld1.64         {d29},     [r3], r2
        subs            ip,  ip,  #2
        lowpass_8       d0,  d1,  d16, d17, d0,  d1
        vrhadd.u8       q0,  q0,  q14
        vst1.64         {d0},  [r0,:64], r2
        vst1.64         {d1},  [r0,:64], r2
        bne             1b
        bx              lr
.endfunc

function put_h264_qpel16_v_lowpass_neon_packed
        mov             r4,  lr
        mov             r2,  #8
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4
        b               put_h264_qpel8_v_lowpass_neon
.endfunc

function put_h264_qpel16_v_lowpass_neon
        mov             r4,  lr
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r0,  r0,  r2, lsl #4
        add             r0,  r0,  #8
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4
.endfunc

function put_h264_qpel8_v_lowpass_neon
        vld1.64         {d8},  [r1], r3
        vld1.64         {d10}, [r1], r3
        vld1.64         {d12}, [r1], r3
        vld1.64         {d14}, [r1], r3
        vld1.64         {d22}, [r1], r3
        vld1.64         {d24}, [r1], r3
        vld1.64         {d26}, [r1], r3
        vld1.64         {d28}, [r1], r3
        vld1.64         {d9},  [r1], r3
        vld1.64         {d11}, [r1], r3
        vld1.64         {d13}, [r1], r3
        vld1.64         {d15}, [r1], r3
        vld1.64         {d23}, [r1]

        transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14

        lowpass_8       d8,  d9,  d10, d11, d8,  d10
        lowpass_8       d12, d13, d14, d15, d12, d14
        lowpass_8       d22, d23, d24, d25, d22, d24
        lowpass_8       d26, d27, d28, d29, d26, d28

        transpose_8x8   d8,  d10, d12, d14, d22, d24, d26, d28

        vst1.64         {d8},  [r0,:64], r2
        vst1.64         {d10}, [r0,:64], r2
        vst1.64         {d12}, [r0,:64], r2
        vst1.64         {d14}, [r0,:64], r2
        vst1.64         {d22}, [r0,:64], r2
        vst1.64         {d24}, [r0,:64], r2
        vst1.64         {d26}, [r0,:64], r2
        vst1.64         {d28}, [r0,:64], r2

        bx              lr
.endfunc

function put_h264_qpel16_v_lowpass_l2_neon
        mov             r4,  lr
        bl              put_h264_qpel8_v_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        bl              put_h264_qpel8_v_lowpass_l2_neon
        sub             r0,  r0,  r3, lsl #4
        sub             ip,  ip,  r2, lsl #4
        add             r0,  r0,  #8
        add             ip,  ip,  #8
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        bl              put_h264_qpel8_v_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4
.endfunc

function put_h264_qpel8_v_lowpass_l2_neon
        vld1.64         {d8},  [r1], r3
        vld1.64         {d10}, [r1], r3
        vld1.64         {d12}, [r1], r3
        vld1.64         {d14}, [r1], r3
        vld1.64         {d22}, [r1], r3
        vld1.64         {d24}, [r1], r3
        vld1.64         {d26}, [r1], r3
        vld1.64         {d28}, [r1], r3
        vld1.64         {d9},  [r1], r3
        vld1.64         {d11}, [r1], r3
        vld1.64         {d13}, [r1], r3
        vld1.64         {d15}, [r1], r3
        vld1.64         {d23}, [r1]

        transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14

        lowpass_8       d8,  d9,  d10, d11, d8,  d9
        lowpass_8       d12, d13, d14, d15, d12, d13
        lowpass_8       d22, d23, d24, d25, d22, d23
        lowpass_8       d26, d27, d28, d29, d26, d27

        transpose_8x8   d8,  d9,  d12, d13, d22, d23, d26, d27

        vld1.64         {d0},  [ip], r2
        vld1.64         {d1},  [ip], r2
        vld1.64         {d2},  [ip], r2
        vld1.64         {d3},  [ip], r2
        vld1.64         {d4},  [ip], r2
        vrhadd.u8       q0,  q0,  q4
        vld1.64         {d5},  [ip], r2
        vrhadd.u8       q1,  q1,  q6
        vld1.64         {d10}, [ip], r2
        vrhadd.u8       q2,  q2,  q11
        vld1.64         {d11}, [ip], r2

        vst1.64         {d0},  [r0,:64], r3
        vst1.64         {d1},  [r0,:64], r3
        vrhadd.u8       q5,  q5,  q13
        vst1.64         {d2},  [r0,:64], r3
        vst1.64         {d3},  [r0,:64], r3
        vst1.64         {d4},  [r0,:64], r3
        vst1.64         {d5},  [r0,:64], r3
        vst1.64         {d10}, [r0,:64], r3
        vst1.64         {d11}, [r0,:64], r3

        bx              lr
.endfunc
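
@ The hv (center) positions first run the horizontal lowpass without
@ narrowing, keeping 16-bit intermediates in a scratch buffer at r4,
@ then transpose and run lowpass_8.16 vertically, so the only rounding
@ is the final (x + 512) >> 10.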
function put_h264_qpel8_hv_lowpass_neon_top
        lowpass_const   ip
        mov             ip,  #12
1:      vld1.64         {d0, d1},  [r1], r3
        vld1.64         {d16,d17}, [r1], r3
        subs            ip,  ip,  #2
        lowpass_8       d0,  d1,  d16, d17, q11, q12, narrow=0
        vst1.64         {d22-d25}, [r4,:128]!
        bne             1b

        vld1.64         {d0, d1},  [r1]
        lowpass_8_1     d0,  d1,  q12, narrow=0

        mov             ip,  #-16
        add             r4,  r4,  ip
        vld1.64         {d30,d31}, [r4,:128], ip
        vld1.64         {d20,d21}, [r4,:128], ip
        vld1.64         {d18,d19}, [r4,:128], ip
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d14,d15}, [r4,:128], ip
        vld1.64         {d12,d13}, [r4,:128], ip
        vld1.64         {d10,d11}, [r4,:128], ip
        vld1.64         {d8, d9},  [r4,:128], ip
        vld1.64         {d6, d7},  [r4,:128], ip
        vld1.64         {d4, d5},  [r4,:128], ip
        vld1.64         {d2, d3},  [r4,:128], ip
        vld1.64         {d0, d1},  [r4,:128]

        swap4           d1,  d3,  d5,  d7,  d8,  d10, d12, d14
        transpose16_4x4 q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7

        swap4           d17, d19, d21, d31, d24, d26, d28, d22
        transpose16_4x4 q8,  q9,  q10, q15, q12, q13, q14, q11

        vst1.64         {d30,d31}, [r4,:128]!
        vst1.64         {d6, d7},  [r4,:128]!
        vst1.64         {d20,d21}, [r4,:128]!
        vst1.64         {d4, d5},  [r4,:128]!
        vst1.64         {d18,d19}, [r4,:128]!
        vst1.64         {d2, d3},  [r4,:128]!
        vst1.64         {d16,d17}, [r4,:128]!
        vst1.64         {d0, d1},  [r4,:128]

        lowpass_8.16    q4,  q12, d8,  d9,  d24, d25, d8
        lowpass_8.16    q5,  q13, d10, d11, d26, d27, d9
        lowpass_8.16    q6,  q14, d12, d13, d28, d29, d10
        lowpass_8.16    q7,  q11, d14, d15, d22, d23, d11

        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128], ip
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d12

        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128], ip
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d13

        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128], ip
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d14

        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128]
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d15

        transpose_8x8   d12, d13, d14, d15, d8,  d9,  d10, d11

        bx              lr
.endfunc

function put_h264_qpel8_hv_lowpass_neon
        mov             r10, lr
        bl              put_h264_qpel8_hv_lowpass_neon_top
        vst1.64         {d12}, [r0,:64], r2
        vst1.64         {d13}, [r0,:64], r2
        vst1.64         {d14}, [r0,:64], r2
        vst1.64         {d15}, [r0,:64], r2
        vst1.64         {d8},  [r0,:64], r2
        vst1.64         {d9},  [r0,:64], r2
        vst1.64         {d10}, [r0,:64], r2
        vst1.64         {d11}, [r0,:64], r2

        mov             lr,  r10
        bx              lr
.endfunc

function put_h264_qpel8_hv_lowpass_l2_neon
        mov             r10, lr
        bl              put_h264_qpel8_hv_lowpass_neon_top

        vld1.64         {d0, d1},  [r2,:128]!
        vld1.64         {d2, d3},  [r2,:128]!
        vrhadd.u8       q0,  q0,  q6
        vld1.64         {d4, d5},  [r2,:128]!
        vrhadd.u8       q1,  q1,  q7
        vld1.64         {d6, d7},  [r2,:128]!
        vrhadd.u8       q2,  q2,  q4

        vst1.64         {d0},  [r0,:64], r3
        vrhadd.u8       q3,  q3,  q5
        vst1.64         {d1},  [r0,:64], r3
        vst1.64         {d2},  [r0,:64], r3
        vst1.64         {d3},  [r0,:64], r3
        vst1.64         {d4},  [r0,:64], r3
        vst1.64         {d5},  [r0,:64], r3
        vst1.64         {d6},  [r0,:64], r3
        vst1.64         {d7},  [r0,:64], r3

        mov             lr,  r10
        bx              lr
.endfunc

function put_h264_qpel16_hv_lowpass_neon
        mov             r9,  lr
        bl              put_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        bl              put_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        sub             r0,  r0,  r2, lsl #4
        add             r0,  r0,  #8
        bl              put_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r9
        b               put_h264_qpel8_hv_lowpass_neon
.endfunc

function put_h264_qpel16_hv_lowpass_l2_neon
        mov             r9,  lr
        sub             r2,  r4,  #256
        bl              put_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        bl              put_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        sub             r0,  r0,  r3, lsl #4
        add             r0,  r0,  #8
        bl              put_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r9
        b               put_h264_qpel8_hv_lowpass_l2_neon
.endfunc

function ff_put_h264_qpel8_mc10_neon, export=1
        lowpass_const   r3
        mov             r3,  r1
        sub             r1,  r1,  #2
        mov             ip,  #8
        b               put_h264_qpel8_h_lowpass_l2_neon
.endfunc

function ff_put_h264_qpel8_mc20_neon, export=1
        lowpass_const   r3
        sub             r1,  r1,  #2
        mov             r3,  r2
        mov             ip,  #8
        b               put_h264_qpel8_h_lowpass_neon
.endfunc

function ff_put_h264_qpel8_mc30_neon, export=1
        lowpass_const   r3
        add             r3,  r1,  #1
        sub             r1,  r1,  #2
        mov             ip,  #8
        b               put_h264_qpel8_h_lowpass_l2_neon
.endfunc

function ff_put_h264_qpel8_mc01_neon, export=1
        push            {lr}
        mov             ip,  r1
put_h264_qpel8_mc01:
        lowpass_const   r3
        mov             r3,  r2
        sub             r1,  r1,  r2, lsl #1
        vpush           {d8-d15}
        bl              put_h264_qpel8_v_lowpass_l2_neon
        vpop            {d8-d15}
        pop             {pc}
.endfunc

function ff_put_h264_qpel8_mc11_neon, export=1
        push            {r0, r1, r2, lr}
put_h264_qpel8_mc11:
        lowpass_const   r3
        sub             sp,  sp,  #64
        mov             r0,  sp
        sub             r1,  r1,  #2
        mov             r3,  #8
        mov             ip,  #8
        vpush           {d8-d15}
        bl              put_h264_qpel8_h_lowpass_neon
        ldrd            r0,  [sp, #128]
        mov             r3,  r2
        add             ip,  sp,  #64
        sub             r1,  r1,  r2, lsl #1
        mov             r2,  #8
        bl              put_h264_qpel8_v_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  sp,  #76
        pop             {pc}
.endfunc

function ff_put_h264_qpel8_mc21_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
put_h264_qpel8_mc21:
        lowpass_const   r3
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             sp,  sp,  #(8*8+16*12)
        sub             r1,  r1,  #2
        mov             r3,  #8
        mov             r0,  sp
        mov             ip,  #8
        vpush           {d8-d15}
        bl              put_h264_qpel8_h_lowpass_neon
        mov             r4,  r0
        ldrd            r0,  [r11]
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        sub             r2,  r4,  #64
        bl              put_h264_qpel8_hv_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  r11, #8
        pop             {r4, r10, r11, pc}
.endfunc

function ff_put_h264_qpel8_mc31_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r2, lr}
        sub             r1,  r1,  #1
        b               put_h264_qpel8_mc11
.endfunc

function ff_put_h264_qpel8_mc02_neon, export=1
        push            {lr}
        lowpass_const   r3
        sub             r1,  r1,  r2, lsl #1
        mov             r3,  r2
        vpush           {d8-d15}
        bl              put_h264_qpel8_v_lowpass_neon
        vpop            {d8-d15}
        pop             {pc}
.endfunc

function ff_put_h264_qpel8_mc12_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
put_h264_qpel8_mc12:
        lowpass_const   r3
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             sp,  sp,  #(8*8+16*12)
        sub             r1,  r1,  r2, lsl #1
        mov             r3,  r2
        mov             r2,  #8
        mov             r0,  sp
        vpush           {d8-d15}
        bl              put_h264_qpel8_v_lowpass_neon
        mov             r4,  r0
        ldrd            r0,  [r11]
        sub             r1,  r1,  r3, lsl #1
        sub             r1,  r1,  #2
        sub             r2,  r4,  #64
        bl              put_h264_qpel8_hv_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  r11, #8
        pop             {r4, r10, r11, pc}
.endfunc

function ff_put_h264_qpel8_mc22_neon, export=1
        push            {r4, r10, r11, lr}
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        sub             sp,  sp,  #(16*12)
        mov             r4,  sp
        vpush           {d8-d15}
        bl              put_h264_qpel8_hv_lowpass_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4, r10, r11, pc}
.endfunc

function ff_put_h264_qpel8_mc32_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
        add             r1,  r1,  #1
        b               put_h264_qpel8_mc12
.endfunc

function ff_put_h264_qpel8_mc03_neon, export=1
        push            {lr}
        add             ip,  r1,  r2
        b               put_h264_qpel8_mc01
.endfunc

function ff_put_h264_qpel8_mc13_neon, export=1
        push            {r0, r1, r2, lr}
        add             r1,  r1,  r2
        b               put_h264_qpel8_mc11
.endfunc

function ff_put_h264_qpel8_mc23_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
        add             r1,  r1,  r2
        b               put_h264_qpel8_mc21
.endfunc

function ff_put_h264_qpel8_mc33_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r2, lr}
        add             r1,  r1,  r2
        sub             r1,  r1,  #1
        b               put_h264_qpel8_mc11
.endfunc

function ff_put_h264_qpel16_mc10_neon, export=1
        lowpass_const   r3
        mov             r3,  r1
        sub             r1,  r1,  #2
        b               put_h264_qpel16_h_lowpass_l2_neon
.endfunc

function ff_put_h264_qpel16_mc20_neon, export=1
        lowpass_const   r3
        sub             r1,  r1,  #2
        mov             r3,  r2
        b               put_h264_qpel16_h_lowpass_neon
.endfunc

function ff_put_h264_qpel16_mc30_neon, export=1
        lowpass_const   r3
        add             r3,  r1,  #1
        sub             r1,  r1,  #2
        b               put_h264_qpel16_h_lowpass_l2_neon
.endfunc

function ff_put_h264_qpel16_mc01_neon, export=1
        push            {r4, lr}
        mov             ip,  r1
put_h264_qpel16_mc01:
        lowpass_const   r3
        mov             r3,  r2
        sub             r1,  r1,  r2, lsl #1
        vpush           {d8-d15}
        bl              put_h264_qpel16_v_lowpass_l2_neon
        vpop            {d8-d15}
        pop             {r4, pc}
.endfunc

function ff_put_h264_qpel16_mc11_neon, export=1
        push            {r0, r1, r4, lr}
put_h264_qpel16_mc11:
        lowpass_const   r3
        sub             sp,  sp,  #256
        mov             r0,  sp
        sub             r1,  r1,  #2
        mov             r3,  #16
        vpush           {d8-d15}
        bl              put_h264_qpel16_h_lowpass_neon
        add             r0,  sp,  #256
        ldrd            r0,  [r0, #64]
        mov             r3,  r2
        add             ip,  sp,  #64
        sub             r1,  r1,  r2, lsl #1
        mov             r2,  #16
        bl              put_h264_qpel16_v_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  sp,  #(256+8)
        pop             {r4, pc}
.endfunc

function ff_put_h264_qpel16_mc21_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
put_h264_qpel16_mc21:
        lowpass_const   r3
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             sp,  sp,  #(16*16+16*12)
        sub             r1,  r1,  #2
        mov             r0,  sp
        vpush           {d8-d15}
        bl              put_h264_qpel16_h_lowpass_neon_packed
        mov             r4,  r0
        ldrd            r0,  [r11]
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        bl              put_h264_qpel16_hv_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  r11, #8
        pop             {r4-r5, r9-r11, pc}
.endfunc

function ff_put_h264_qpel16_mc31_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r4, lr}
        sub             r1,  r1,  #1
        b               put_h264_qpel16_mc11
.endfunc

function ff_put_h264_qpel16_mc02_neon, export=1
        push            {r4, lr}
        lowpass_const   r3
        sub             r1,  r1,  r2, lsl #1
        mov             r3,  r2
        vpush           {d8-d15}
        bl              put_h264_qpel16_v_lowpass_neon
        vpop            {d8-d15}
        pop             {r4, pc}
.endfunc

function ff_put_h264_qpel16_mc12_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
put_h264_qpel16_mc12:
        lowpass_const   r3
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             sp,  sp,  #(16*16+16*12)
        sub             r1,  r1,  r2, lsl #1
        mov             r0,  sp
        mov             r3,  r2
        vpush           {d8-d15}
        bl              put_h264_qpel16_v_lowpass_neon_packed
        mov             r4,  r0
        ldrd            r0,  [r11]
        sub             r1,  r1,  r3, lsl #1
        sub             r1,  r1,  #2
        mov             r2,  r3
        bl              put_h264_qpel16_hv_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  r11, #8
        pop             {r4-r5, r9-r11, pc}
.endfunc

function ff_put_h264_qpel16_mc22_neon, export=1
        push            {r4, r9-r11, lr}
        lowpass_const   r3
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        sub             sp,  sp,  #(16*12)
        mov             r4,  sp
        vpush           {d8-d15}
        bl              put_h264_qpel16_hv_lowpass_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4, r9-r11, pc}
.endfunc

function ff_put_h264_qpel16_mc32_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
        add             r1,  r1,  #1
        b               put_h264_qpel16_mc12
.endfunc

function ff_put_h264_qpel16_mc03_neon, export=1
        push            {r4, lr}
        add             ip,  r1,  r2
        b               put_h264_qpel16_mc01
.endfunc

function ff_put_h264_qpel16_mc13_neon, export=1
        push            {r0, r1, r4, lr}
        add             r1,  r1,  r2
        b               put_h264_qpel16_mc11
.endfunc

function ff_put_h264_qpel16_mc23_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
        add             r1,  r1,  r2
        b               put_h264_qpel16_mc21
.endfunc

function ff_put_h264_qpel16_mc33_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r4, lr}
        add             r1,  r1,  r2
        sub             r1,  r1,  #1
        b               put_h264_qpel16_mc11
.endfunc
@ Biweighted prediction
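@
@ Explicit bi-prediction: dst = clip((src1*w0 + src2*w1 + rnd) >> (log2_denom + 1))
@ where the rounding/offset term rnd = ((offset + 1) | 1) << log2_denom is
@ preloaded into q8, and q9 holds ~log2_denom so vshl.s16 by q9 performs
@ the arithmetic right shift by log2_denom + 1.  Negative weights cannot
@ be fed to the unsigned multiplies, so the prologue branches on the
@ weights' sign bits to one of four vmlal/vmlsl combinations (labels
@ 10-40), negating the weights as needed.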
.macro  biweight_16 macs, macd
        vdup.8          d0,  r4
        vdup.8          d1,  r5
        vmov            q2,  q8
        vmov            q3,  q8
1:      subs            ip,  ip,  #2
        vld1.8          {d20-d21},[r0,:128], r2
        \macd           q2,  d0,  d20
        pld             [r0]
        \macd           q3,  d0,  d21
        vld1.8          {d22-d23},[r1,:128], r2
        \macs           q2,  d1,  d22
        pld             [r1]
        \macs           q3,  d1,  d23
        vmov            q12, q8
        vld1.8          {d28-d29},[r0,:128], r2
        vmov            q13, q8
        \macd           q12, d0,  d28
        pld             [r0]
        \macd           q13, d0,  d29
        vld1.8          {d30-d31},[r1,:128], r2
        \macs           q12, d1,  d30
        pld             [r1]
        \macs           q13, d1,  d31
        vshl.s16        q2,  q2,  q9
        vshl.s16        q3,  q3,  q9
        vqmovun.s16     d4,  q2
        vqmovun.s16     d5,  q3
        vshl.s16        q12, q12, q9
        vshl.s16        q13, q13, q9
        vqmovun.s16     d24, q12
        vqmovun.s16     d25, q13
        vmov            q3,  q8
        vst1.8          {d4- d5}, [r6,:128], r2
        vmov            q2,  q8
        vst1.8          {d24-d25},[r6,:128], r2
        bne             1b
        pop             {r4-r6, pc}
.endm

.macro  biweight_8 macs, macd
        vdup.8          d0,  r4
        vdup.8          d1,  r5
        vmov            q1,  q8
        vmov            q10, q8
1:      subs            ip,  ip,  #2
        vld1.8          {d4},[r0,:64], r2
        \macd           q1,  d0,  d4
        pld             [r0]
        vld1.8          {d5},[r1,:64], r2
        \macs           q1,  d1,  d5
        pld             [r1]
        vld1.8          {d6},[r0,:64], r2
        \macd           q10, d0,  d6
        pld             [r0]
        vld1.8          {d7},[r1,:64], r2
        \macs           q10, d1,  d7
        pld             [r1]
        vshl.s16        q1,  q1,  q9
        vqmovun.s16     d2,  q1
        vshl.s16        q10, q10, q9
        vqmovun.s16     d4,  q10
        vmov            q10, q8
        vst1.8          {d2},[r6,:64], r2
        vmov            q1,  q8
        vst1.8          {d4},[r6,:64], r2
        bne             1b
        pop             {r4-r6, pc}
.endm

.macro  biweight_4 macs, macd
        vdup.8          d0,  r4
        vdup.8          d1,  r5
        vmov            q1,  q8
        vmov            q10, q8
1:      subs            ip,  ip,  #4
        vld1.32         {d4[0]},[r0,:32], r2
        vld1.32         {d4[1]},[r0,:32], r2
        \macd           q1,  d0,  d4
        pld             [r0]
        vld1.32         {d5[0]},[r1,:32], r2
        vld1.32         {d5[1]},[r1,:32], r2
        \macs           q1,  d1,  d5
        pld             [r1]
        blt             2f
        vld1.32         {d6[0]},[r0,:32], r2
        vld1.32         {d6[1]},[r0,:32], r2
        \macd           q10, d0,  d6
        pld             [r0]
        vld1.32         {d7[0]},[r1,:32], r2
        vld1.32         {d7[1]},[r1,:32], r2
        \macs           q10, d1,  d7
        pld             [r1]
        vshl.s16        q1,  q1,  q9
        vqmovun.s16     d2,  q1
        vshl.s16        q10, q10, q9
        vqmovun.s16     d4,  q10
        vmov            q10, q8
        vst1.32         {d2[0]},[r6,:32], r2
        vst1.32         {d2[1]},[r6,:32], r2
        vmov            q1,  q8
        vst1.32         {d4[0]},[r6,:32], r2
        vst1.32         {d4[1]},[r6,:32], r2
        bne             1b
        pop             {r4-r6, pc}
2:      vshl.s16        q1,  q1,  q9
        vqmovun.s16     d2,  q1
        vst1.32         {d2[0]},[r6,:32], r2
        vst1.32         {d2[1]},[r6,:32], r2
        pop             {r4-r6, pc}
.endm

.macro  biweight_func w
function biweight_h264_pixels_\w\()_neon
        push            {r4-r6, lr}
        add             r4,  sp,  #16
        ldm             r4,  {r4-r6}
        lsr             lr,  r4,  #31
        add             r6,  r6,  #1
        eors            lr,  lr,  r5,  lsr #30
        orr             r6,  r6,  #1
        vdup.16         q9,  r3
        lsl             r6,  r6,  r3
        vmvn            q9,  q9
        vdup.16         q8,  r6
        mov             r6,  r0
        beq             10f
        subs            lr,  lr,  #1
        beq             20f
        subs            lr,  lr,  #1
        beq             30f
        b               40f

10:     biweight_\w     vmlal.u8, vmlal.u8
20:     rsb             r4,  r4,  #0
        biweight_\w     vmlal.u8, vmlsl.u8
30:     rsb             r4,  r4,  #0
        rsb             r5,  r5,  #0
        biweight_\w     vmlsl.u8, vmlsl.u8
40:     rsb             r5,  r5,  #0
        biweight_\w     vmlsl.u8, vmlal.u8
.endfunc
.endm

.macro  biweight_entry w, h, b=1
function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
        mov             ip,  #\h
.if \b
        b               biweight_h264_pixels_\w\()_neon
.endif
.endfunc
.endm

        biweight_entry  16, 8
        biweight_entry  16, 16, b=0
        biweight_func   16

        biweight_entry  8, 16
        biweight_entry  8, 4
        biweight_entry  8, 8, b=0
        biweight_func   8

        biweight_entry  4, 8
        biweight_entry  4, 2
        biweight_entry  4, 4, b=0
        biweight_func   4
@ Weighted prediction
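@
@ Unidirectional weighted prediction, roughly
@   dst = clip(((src*w + 2^(log2_denom-1)) >> log2_denom) + offset)
@ q8 holds offset << log2_denom so the offset is added before the shift;
@ for log2_denom > 1 a halving add/sub (vhadd/vhsub) absorbs one bit of
@ the shift and vrshl by q9 rounds off the rest, while negative weights
@ take the subtracting paths.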
.macro  weight_16 add
        vdup.8          d0,  r3
1:      subs            ip,  ip,  #2
        vld1.8          {d20-d21},[r0,:128], r1
        vmull.u8        q2,  d0,  d20
        pld             [r0]
        vmull.u8        q3,  d0,  d21
        vld1.8          {d28-d29},[r0,:128], r1
        vmull.u8        q12, d0,  d28
        pld             [r0]
        vmull.u8        q13, d0,  d29
        \add            q2,  q8,  q2
        vrshl.s16       q2,  q2,  q9
        \add            q3,  q8,  q3
        vrshl.s16       q3,  q3,  q9
        vqmovun.s16     d4,  q2
        vqmovun.s16     d5,  q3
        \add            q12, q8,  q12
        vrshl.s16       q12, q12, q9
        \add            q13, q8,  q13
        vrshl.s16       q13, q13, q9
        vqmovun.s16     d24, q12
        vqmovun.s16     d25, q13
        vst1.8          {d4- d5}, [r4,:128], r1
        vst1.8          {d24-d25},[r4,:128], r1
        bne             1b
        pop             {r4, pc}
.endm

.macro  weight_8 add
        vdup.8          d0,  r3
1:      subs            ip,  ip,  #2
        vld1.8          {d4},[r0,:64], r1
        vmull.u8        q1,  d0,  d4
        pld             [r0]
        vld1.8          {d6},[r0,:64], r1
        vmull.u8        q10, d0,  d6
        \add            q1,  q8,  q1
        pld             [r0]
        vrshl.s16       q1,  q1,  q9
        vqmovun.s16     d2,  q1
        \add            q10, q8,  q10
        vrshl.s16       q10, q10, q9
        vqmovun.s16     d4,  q10
        vst1.8          {d2},[r4,:64], r1
        vst1.8          {d4},[r4,:64], r1
        bne             1b
        pop             {r4, pc}
.endm

.macro  weight_4 add
        vdup.8          d0,  r3
        vmov            q1,  q8
        vmov            q10, q8
1:      subs            ip,  ip,  #4
        vld1.32         {d4[0]},[r0,:32], r1
        vld1.32         {d4[1]},[r0,:32], r1
        vmull.u8        q1,  d0,  d4
        pld             [r0]
        blt             2f
        vld1.32         {d6[0]},[r0,:32], r1
        vld1.32         {d6[1]},[r0,:32], r1
        vmull.u8        q10, d0,  d6
        pld             [r0]
        \add            q1,  q8,  q1
        vrshl.s16       q1,  q1,  q9
        vqmovun.s16     d2,  q1
        \add            q10, q8,  q10
        vrshl.s16       q10, q10, q9
        vqmovun.s16     d4,  q10
        vmov            q10, q8
        vst1.32         {d2[0]},[r4,:32], r1
        vst1.32         {d2[1]},[r4,:32], r1
        vmov            q1,  q8
        vst1.32         {d4[0]},[r4,:32], r1
        vst1.32         {d4[1]},[r4,:32], r1
        bne             1b
        pop             {r4, pc}
2:      \add            q1,  q8,  q1
        vrshl.s16       q1,  q1,  q9
        vqmovun.s16     d2,  q1
        vst1.32         {d2[0]},[r4,:32], r1
        vst1.32         {d2[1]},[r4,:32], r1
        pop             {r4, pc}
.endm

.macro  weight_func w
function weight_h264_pixels_\w\()_neon
        push            {r4, lr}
        ldr             r4,  [sp, #8]
        cmp             r2,  #1
        lsl             r4,  r4,  r2
        vdup.16         q8,  r4
        mov             r4,  r0
        ble             20f
        rsb             lr,  r2,  #1
        vdup.16         q9,  lr
        cmp             r3,  #0
        blt             10f
        weight_\w       vhadd.s16
10:     rsb             r3,  r3,  #0
        weight_\w       vhsub.s16
20:     rsb             lr,  r2,  #0
        vdup.16         q9,  lr
        cmp             r3,  #0
        blt             10f
        weight_\w       vadd.s16
10:     rsb             r3,  r3,  #0
        weight_\w       vsub.s16
.endfunc
.endm

.macro  weight_entry w, h, b=1
function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1
        mov             ip,  #\h
.if \b
        b               weight_h264_pixels_\w\()_neon
.endif
.endfunc
.endm

        weight_entry    16, 8
        weight_entry    16, 16, b=0
        weight_func     16

        weight_entry    8, 16
        weight_entry    8, 4
        weight_entry    8, 8, b=0
        weight_func     8

        weight_entry    4, 8
        weight_entry    4, 2
        weight_entry    4, 4, b=0
        weight_func     4