/*
 * Copyright (c) 2016 Google Inc.
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

@ All public functions in this file have the following signature:
@ typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
@                             const uint8_t *ref, ptrdiff_t ref_stride,
@                             int h, int mx, int my);
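
@ For the plain copy/avg functions below, mx and my are unused and only h is
@ read from the stack; the 8-tap functions further down use mx (horizontal)
@ or my (vertical) to select the subpel filter phase.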
function ff_vp9_copy64_neon, export=1
        ldr             r12, [sp]
        sub             r1, r1, #32
        sub             r3, r3, #32
1:
        vld1.8          {q0, q1}, [r2]!
        vst1.8          {q0, q1}, [r0, :128]!
        vld1.8          {q2, q3}, [r2], r3
        subs            r12, r12, #1
        vst1.8          {q2, q3}, [r0, :128], r1
        bne             1b
        bx              lr
endfunc
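
@ The avg variants blend the motion compensated result with the pixels
@ already present in the destination using vrhadd.u8, i.e. a rounding
@ average (a + b + 1) >> 1.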
function ff_vp9_avg64_neon, export=1
        push            {lr}
        ldr             r12, [sp, #4]
        sub             r1, r1, #32
        sub             r3, r3, #32
        mov             lr, r0
1:
        vld1.8          {q8, q9}, [r2]!
        vld1.8          {q0, q1}, [r0, :128]!
        vld1.8          {q10, q11}, [r2], r3
        vrhadd.u8       q0, q0, q8
        vld1.8          {q2, q3}, [r0, :128], r1
        vrhadd.u8       q1, q1, q9
        vrhadd.u8       q2, q2, q10
        vst1.8          {q0, q1}, [lr, :128]!
        vrhadd.u8       q3, q3, q11
        vst1.8          {q2, q3}, [lr, :128], r1
        subs            r12, r12, #1
        bne             1b
        pop             {pc}
endfunc

function ff_vp9_copy32_neon, export=1
        ldr             r12, [sp]
1:
        vld1.8          {q0, q1}, [r2], r3
        subs            r12, r12, #1
        vst1.8          {q0, q1}, [r0, :128], r1
        bne             1b
        bx              lr
endfunc

function ff_vp9_avg32_neon, export=1
        ldr             r12, [sp]
1:
        vld1.8          {q2, q3}, [r2], r3
        vld1.8          {q0, q1}, [r0, :128]
        vrhadd.u8       q0, q0, q2
        vrhadd.u8       q1, q1, q3
        subs            r12, r12, #1
        vst1.8          {q0, q1}, [r0, :128], r1
        bne             1b
        bx              lr
endfunc

function ff_vp9_copy16_neon, export=1
        push            {r4,lr}
        ldr             r12, [sp, #8]
        add             r4, r0, r1
        add             lr, r2, r3
        add             r1, r1, r1
        add             r3, r3, r3
1:
        vld1.8          {q0}, [r2], r3
        vld1.8          {q1}, [lr], r3
        subs            r12, r12, #2
        vst1.8          {q0}, [r0, :128], r1
        vst1.8          {q1}, [r4, :128], r1
        bne             1b
        pop             {r4,pc}
endfunc

function ff_vp9_avg16_neon, export=1
        push            {lr}
        ldr             r12, [sp, #4]
        mov             lr, r0
1:
        vld1.8          {q2}, [r2], r3
        vld1.8          {q0}, [r0, :128], r1
        vld1.8          {q3}, [r2], r3
        vrhadd.u8       q0, q0, q2
        vld1.8          {q1}, [r0, :128], r1
        vrhadd.u8       q1, q1, q3
        subs            r12, r12, #2
        vst1.8          {q0}, [lr, :128], r1
        vst1.8          {q1}, [lr, :128], r1
        bne             1b
        pop             {pc}
endfunc

function ff_vp9_copy8_neon, export=1
        ldr             r12, [sp]
1:
        vld1.8          {d0}, [r2], r3
        vld1.8          {d1}, [r2], r3
        subs            r12, r12, #2
        vst1.8          {d0}, [r0, :64], r1
        vst1.8          {d1}, [r0, :64], r1
        bne             1b
        bx              lr
endfunc

function ff_vp9_avg8_neon, export=1
        ldr             r12, [sp]
1:
        vld1.8          {d2}, [r2], r3
        vld1.8          {d0}, [r0, :64], r1
        vld1.8          {d3}, [r2], r3
        vrhadd.u8       d0, d0, d2
        vld1.8          {d1}, [r0, :64]
        sub             r0, r0, r1
        vrhadd.u8       d1, d1, d3
        subs            r12, r12, #2
        vst1.8          {d0}, [r0, :64], r1
        vst1.8          {d1}, [r0, :64], r1
        bne             1b
        bx              lr
endfunc

function ff_vp9_copy4_neon, export=1
        ldr             r12, [sp]
1:
        vld1.32         {d0[]}, [r2], r3
        vld1.32         {d1[]}, [r2], r3
        vst1.32         {d0[0]}, [r0, :32], r1
        vld1.32         {d2[]}, [r2], r3
        vst1.32         {d1[0]}, [r0, :32], r1
        vld1.32         {d3[]}, [r2], r3
        subs            r12, r12, #4
        vst1.32         {d2[0]}, [r0, :32], r1
        vst1.32         {d3[0]}, [r0, :32], r1
        bne             1b
        bx              lr
endfunc

function ff_vp9_avg4_neon, export=1
        push            {lr}
        ldr             r12, [sp, #4]
        mov             lr, r0
1:
        vld1.32         {d4[]}, [r2], r3
        vld1.32         {d0[]}, [r0, :32], r1
        vld1.32         {d5[]}, [r2], r3
        vrhadd.u8       d0, d0, d4
        vld1.32         {d1[]}, [r0, :32], r1
        vld1.32         {d6[]}, [r2], r3
        vrhadd.u8       d1, d1, d5
        vld1.32         {d2[]}, [r0, :32], r1
        vld1.32         {d7[]}, [r2], r3
        vrhadd.u8       d2, d2, d6
        vld1.32         {d3[]}, [r0, :32], r1
        subs            r12, r12, #4
        vst1.32         {d0[0]}, [lr, :32], r1
        vrhadd.u8       d3, d3, d7
        vst1.32         {d1[0]}, [lr, :32], r1
        vst1.32         {d2[0]}, [lr, :32], r1
        vst1.32         {d3[0]}, [lr, :32], r1
        bne             1b
        pop             {pc}
endfunc

@ Helper macros for vmul/vmla with a constant from either d0 or d1 depending on index
.macro vmul_lane dst, src, idx
.if \idx < 4
        vmul.s16        \dst, \src, d0[\idx]
.else
        vmul.s16        \dst, \src, d1[\idx - 4]
.endif
.endm

.macro vmla_lane dst, src, idx
.if \idx < 4
        vmla.s16        \dst, \src, d0[\idx]
.else
        vmla.s16        \dst, \src, d1[\idx - 4]
.endif
.endm
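
@ For example, "vmla_lane q1, q14, 5" expands to "vmla.s16 q1, q14, d1[1]",
@ since the eight 16 bit taps live in d0[0-3] and d1[0-3] after the
@ vld1.8/vmovl.s8 of the filter row.
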
@ Extract a vector from src1-src2 and src4-src5 (src1-src3 and src4-src6
@ for size >= 16), and multiply-accumulate into dst1 and dst3 (or
@ dst1-dst2 and dst3-dst4 for size >= 16)
.macro extmla dst1, dst2, dst3, dst4, dst1d, dst3d, src1, src2, src3, src4, src5, src6, offset, size
        vext.8          q14, \src1, \src2, #(2*\offset)
        vext.8          q15, \src4, \src5, #(2*\offset)
.if \size >= 16
        vmla_lane       \dst1, q14, \offset
        vext.8          q5, \src2, \src3, #(2*\offset)
        vmla_lane       \dst3, q15, \offset
        vext.8          q6, \src5, \src6, #(2*\offset)
        vmla_lane       \dst2, q5, \offset
        vmla_lane       \dst4, q6, \offset
.elseif \size == 8
        vmla_lane       \dst1, q14, \offset
        vmla_lane       \dst3, q15, \offset
.else
        vmla_lane       \dst1d, d28, \offset
        vmla_lane       \dst3d, d30, \offset
.endif
.endm
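
@ For example, with size == 4,
@     extmla q1, q2, q3, q4, d2, d6, q8, q9, q10, q11, q12, q13, 2, 4
@ expands to:
@     vext.8          q14, q8, q9, #4
@     vext.8          q15, q11, q12, #4
@     vmla.s16        d2, d28, d0[2]
@     vmla.s16        d6, d30, d0[2]
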
@ The same as above, but don't accumulate straight into the
@ destination, but use a temp register and accumulate with saturation.
.macro extmulqadd dst1, dst2, dst3, dst4, dst1d, dst3d, src1, src2, src3, src4, src5, src6, offset, size
        vext.8          q14, \src1, \src2, #(2*\offset)
        vext.8          q15, \src4, \src5, #(2*\offset)
.if \size >= 16
        vmul_lane       q14, q14, \offset
        vext.8          q5, \src2, \src3, #(2*\offset)
        vmul_lane       q15, q15, \offset
        vext.8          q6, \src5, \src6, #(2*\offset)
        vmul_lane       q5, q5, \offset
        vmul_lane       q6, q6, \offset
.elseif \size == 8
        vmul_lane       q14, q14, \offset
        vmul_lane       q15, q15, \offset
.else
        vmul_lane       d28, d28, \offset
        vmul_lane       d30, d30, \offset
.endif
.if \size == 4
        vqadd.s16       \dst1d, \dst1d, d28
        vqadd.s16       \dst3d, \dst3d, d30
.else
        vqadd.s16       \dst1, \dst1, q14
        vqadd.s16       \dst3, \dst3, q15
.if \size >= 16
        vqadd.s16       \dst2, \dst2, q5
        vqadd.s16       \dst4, \dst4, q6
.endif
.endif
.endm

@ Instantiate a horizontal filter function for the given size.
@ This can work on 4, 8 or 16 pixels in parallel; for larger
@ widths it will do 16 pixels at a time and loop horizontally.
@ The actual width is passed in r5, the height in r4 and
@ the filter coefficients in r12. idx2 is the index of the largest
@ filter coefficient (3 or 4) and idx1 is the other one of them.
.macro do_8tap_h type, size, idx1, idx2
function \type\()_8tap_\size\()h_\idx1\idx2
        sub             r2, r2, #3
        add             r6, r0, r1
        add             r7, r2, r3
        add             r1, r1, r1
        add             r3, r3, r3
        @ Only size >= 16 loops horizontally and needs
        @ reduced dst stride
.if \size >= 16
        sub             r1, r1, r5
.endif
        @ size >= 16 loads two qwords and increments r2,
        @ for size 4/8 it's enough with one qword and no
        @ postincrement
.if \size >= 16
        sub             r3, r3, r5
        sub             r3, r3, #8
.endif
        @ Load the filter vector
        vld1.8          {d0}, [r12,:64]
        vmovl.s8        q0, d0
1:
.if \size >= 16
        mov             r12, r5
.endif
        @ Load src
.if \size >= 16
        vld1.8          {d18, d19, d20}, [r2]!
        vld1.8          {d24, d25, d26}, [r7]!
.else
        vld1.8          {q9}, [r2]
        vld1.8          {q12}, [r7]
.endif
        vmovl.u8        q8, d18
        vmovl.u8        q9, d19
        vmovl.u8        q11, d24
        vmovl.u8        q12, d25
.if \size >= 16
        vmovl.u8        q10, d20
        vmovl.u8        q13, d26
.endif
2:
        @ Accumulate, adding idx2 last with a separate
        @ saturating add. The positive filter coefficients
        @ for all indices except idx2 must add up to less
        @ than 127 for this not to overflow.
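        @ (With pixel values of at most 255, a positive tap sum of 127 gives
        @ at most 255 * 127 = 32385, which still fits in a signed 16 bit
        @ lane; the largest tap, idx2, is then added with a saturating vqadd
        @ in extmulqadd below.)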
        vmul.s16        q1, q8, d0[0]
        vmul.s16        q3, q11, d0[0]
.if \size >= 16
        vmul.s16        q2, q9, d0[0]
        vmul.s16        q4, q12, d0[0]
.endif
        extmla          q1, q2, q3, q4, d2, d6, q8, q9, q10, q11, q12, q13, 1, \size
        extmla          q1, q2, q3, q4, d2, d6, q8, q9, q10, q11, q12, q13, 2, \size
        extmla          q1, q2, q3, q4, d2, d6, q8, q9, q10, q11, q12, q13, \idx1, \size
        extmla          q1, q2, q3, q4, d2, d6, q8, q9, q10, q11, q12, q13, 5, \size
        extmla          q1, q2, q3, q4, d2, d6, q8, q9, q10, q11, q12, q13, 6, \size
        extmla          q1, q2, q3, q4, d2, d6, q8, q9, q10, q11, q12, q13, 7, \size
        extmulqadd      q1, q2, q3, q4, d2, d6, q8, q9, q10, q11, q12, q13, \idx2, \size
        @ Round, shift and saturate
        vqrshrun.s16    d2, q1, #7
        vqrshrun.s16    d6, q3, #7
.if \size >= 16
        vqrshrun.s16    d3, q2, #7
        vqrshrun.s16    d7, q4, #7
.endif
        @ Average
.ifc \type,avg
.if \size >= 16
        vld1.8          {q14}, [r0,:128]
        vld1.8          {q15}, [r6,:128]
        vrhadd.u8       q1, q1, q14
        vrhadd.u8       q3, q3, q15
.elseif \size == 8
        vld1.8          {d28}, [r0,:64]
        vld1.8          {d30}, [r6,:64]
        vrhadd.u8       d2, d2, d28
        vrhadd.u8       d6, d6, d30
.else
        @ We only need d28[0], but [] is faster on some cores
        vld1.32         {d28[]}, [r0,:32]
        vld1.32         {d30[]}, [r6,:32]
        vrhadd.u8       d2, d2, d28
        vrhadd.u8       d6, d6, d30
.endif
.endif
        @ Store and loop horizontally (for size >= 16)
.if \size >= 16
        subs            r12, r12, #16
        vst1.8          {q1}, [r0,:128]!
        vst1.8          {q3}, [r6,:128]!
        beq             3f
        vmov            q8, q10
        vmov            q11, q13
        vld1.8          {q10}, [r2]!
        vld1.8          {q13}, [r7]!
        vmovl.u8        q9, d20
        vmovl.u8        q10, d21
        vmovl.u8        q12, d26
        vmovl.u8        q13, d27
        b               2b
.elseif \size == 8
        vst1.8          {d2}, [r0,:64]
        vst1.8          {d6}, [r6,:64]
.else @ \size == 4
        vst1.32         {d2[0]}, [r0,:32]
        vst1.32         {d6[0]}, [r6,:32]
.endif
3:
        @ Loop vertically
        add             r0, r0, r1
        add             r6, r6, r1
        add             r2, r2, r3
        add             r7, r7, r3
        subs            r4, r4, #2
        bne             1b
.if \size >= 16
        vpop            {q4-q6}
.endif
        pop             {r4-r7}
        bx              lr
endfunc
.endm

.macro do_8tap_h_size size
do_8tap_h put, \size, 3, 4
do_8tap_h avg, \size, 3, 4
do_8tap_h put, \size, 4, 3
do_8tap_h avg, \size, 4, 3
.endm

do_8tap_h_size 4
do_8tap_h_size 8
do_8tap_h_size 16

.macro do_8tap_h_func type, filter, offset, size
function ff_vp9_\type\()_\filter\()\size\()_h_neon, export=1
        push            {r4-r7}
.if \size >= 16
        vpush           {q4-q6}
        ldr             r4, [sp, #64]
        ldr             r5, [sp, #68]
.else
        ldr             r4, [sp, #16]
        ldr             r5, [sp, #20]
.endif
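        @ r4 = h, r5 = mx. The offsets above account for the 16 bytes of
        @ pushed GPRs plus, for size >= 16, the 48 bytes of saved q4-q6.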
        movrelx         r12, X(ff_vp9_subpel_filters), r6
        add             r12, r12, 120*\offset - 8
        cmp             r5, #8
        add             r12, r12, r5, lsl #3
        mov             r5, #\size
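        @ r12 now points at the 8 int8 taps for phase mx: each filter set
        @ is laid out as 15 rows of 8 bytes (120 bytes), with phase mx at
        @ offset 8*(mx - 1), hence 120*\offset - 8 + (mx << 3) above.
        @ The cmp above makes the branches below pick the _34 variants
        @ (largest tap at index 4) for mx >= 8, and the _43 variants
        @ (largest tap at index 3) otherwise.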
.if \size >= 16
        bge             \type\()_8tap_16h_34
        b               \type\()_8tap_16h_43
.else
        bge             \type\()_8tap_\size\()h_34
        b               \type\()_8tap_\size\()h_43
.endif
endfunc
.endm

.macro do_8tap_h_filters size
do_8tap_h_func put, regular, 1, \size
do_8tap_h_func avg, regular, 1, \size
do_8tap_h_func put, sharp, 2, \size
do_8tap_h_func avg, sharp, 2, \size
do_8tap_h_func put, smooth, 0, \size
do_8tap_h_func avg, smooth, 0, \size
.endm

do_8tap_h_filters 64
do_8tap_h_filters 32
do_8tap_h_filters 16
do_8tap_h_filters 8
do_8tap_h_filters 4

.ltorg

@ Vertical filters

@ Round, shift and saturate and store qreg1-2 over 4 lines
.macro do_store4 qreg1, dreg1, qreg2, dreg2, tmp1, tmp2, type
        vqrshrun.s16    \dreg1, \qreg1, #7
        vqrshrun.s16    \dreg2, \qreg2, #7
.ifc \type,avg
        vld1.32         {\tmp1[]}, [r0,:32], r1
        vld1.32         {\tmp2[]}, [r0,:32], r1
        vld1.32         {\tmp1[1]}, [r0,:32], r1
        vld1.32         {\tmp2[1]}, [r0,:32], r1
        vrhadd.u8       \dreg1, \dreg1, \tmp1
        vrhadd.u8       \dreg2, \dreg2, \tmp2
        sub             r0, r0, r1, lsl #2
.endif
        vst1.32         {\dreg1[0]}, [r0,:32], r1
        vst1.32         {\dreg2[0]}, [r0,:32], r1
        vst1.32         {\dreg1[1]}, [r0,:32], r1
        vst1.32         {\dreg2[1]}, [r0,:32], r1
.endm
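
@ Note: the interleaved [0]/[1] lane stores in do_store4 match the row packing
@ used by the 4 pixel wide vertical filter further down; dreg1 carries output
@ rows 1 and 3 and dreg2 carries rows 2 and 4.
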
@ Round, shift and saturate and store qreg1-4
.macro do_store qreg1, dreg1, qreg2, dreg2, qreg3, dreg3, qreg4, dreg4, tmp1, tmp2, tmp3, tmp4, type
        vqrshrun.s16    \dreg1, \qreg1, #7
        vqrshrun.s16    \dreg2, \qreg2, #7
        vqrshrun.s16    \dreg3, \qreg3, #7
        vqrshrun.s16    \dreg4, \qreg4, #7
.ifc \type,avg
        vld1.8          {\tmp1}, [r0,:64], r1
        vld1.8          {\tmp2}, [r0,:64], r1
        vld1.8          {\tmp3}, [r0,:64], r1
        vld1.8          {\tmp4}, [r0,:64], r1
        vrhadd.u8       \dreg1, \dreg1, \tmp1
        vrhadd.u8       \dreg2, \dreg2, \tmp2
        vrhadd.u8       \dreg3, \dreg3, \tmp3
        vrhadd.u8       \dreg4, \dreg4, \tmp4
        sub             r0, r0, r1, lsl #2
.endif
        vst1.8          {\dreg1}, [r0,:64], r1
        vst1.8          {\dreg2}, [r0,:64], r1
        vst1.8          {\dreg3}, [r0,:64], r1
        vst1.8          {\dreg4}, [r0,:64], r1
.endm

@ Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2
@ (src1-src8 into dst1, src2-src9 into dst2), adding idx2 separately
@ at the end with saturation. Indices 0 and 7 always have negative or zero
@ coefficients, so they can be accumulated into tmp1-tmp2 together with the
@ largest coefficient.
.macro convolve dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, idx1, idx2, tmp1, tmp2
        vmul.s16        \dst1, \src2, d0[1]
        vmul.s16        \dst2, \src3, d0[1]
        vmul.s16        \tmp1, \src1, d0[0]
        vmul.s16        \tmp2, \src2, d0[0]
        vmla.s16        \dst1, \src3, d0[2]
        vmla.s16        \dst2, \src4, d0[2]
.if \idx1 == 3
        vmla.s16        \dst1, \src4, d0[3]
        vmla.s16        \dst2, \src5, d0[3]
.else
        vmla.s16        \dst1, \src5, d1[0]
        vmla.s16        \dst2, \src6, d1[0]
.endif
        vmla.s16        \dst1, \src6, d1[1]
        vmla.s16        \dst2, \src7, d1[1]
        vmla.s16        \tmp1, \src8, d1[3]
        vmla.s16        \tmp2, \src9, d1[3]
        vmla.s16        \dst1, \src7, d1[2]
        vmla.s16        \dst2, \src8, d1[2]
.if \idx2 == 3
        vmla.s16        \tmp1, \src4, d0[3]
        vmla.s16        \tmp2, \src5, d0[3]
.else
        vmla.s16        \tmp1, \src5, d1[0]
        vmla.s16        \tmp2, \src6, d1[0]
.endif
        vqadd.s16       \dst1, \dst1, \tmp1
        vqadd.s16       \dst2, \dst2, \tmp2
.endm
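
@ In the macro above, dst1/dst2 accumulate taps 1, 2, idx1, 5 and 6, while
@ tmp1/tmp2 gather taps 0, 7 and idx2; the final vqadd merges the two
@ partial sums with saturation.
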
@ Load pixels and extend them to 16 bit
.macro loadl dst1, dst2, dst3, dst4
        vld1.8          {d2}, [r2], r3
        vld1.8          {d3}, [r2], r3
        vld1.8          {d4}, [r2], r3
.ifnb \dst4
        vld1.8          {d5}, [r2], r3
.endif
        vmovl.u8        \dst1, d2
        vmovl.u8        \dst2, d3
        vmovl.u8        \dst3, d4
.ifnb \dst4
        vmovl.u8        \dst4, d5
.endif
.endm

@ Instantiate a vertical filter function for filtering 8 pixels at a time.
@ The height is passed in r4, the width in r5 and the filter coefficients
@ in r12. idx2 is the index of the largest filter coefficient (3 or 4)
@ and idx1 is the other one of them.
.macro do_8tap_8v type, idx1, idx2
function \type\()_8tap_8v_\idx1\idx2
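        @ Back the source pointer up by 3 rows (r2 -= 3 * src_stride) so the
        @ first output row has the three rows above it that the 8 tap filter
        @ needs.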
        sub             r2, r2, r3, lsl #1
        sub             r2, r2, r3
        vld1.8          {d0}, [r12, :64]
        vmovl.s8        q0, d0
1:
        mov             r12, r4
        loadl           q5, q6, q7
        loadl           q8, q9, q10, q11
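        @ The loop below is unrolled three times: each step loads four new
        @ rows and rotates the register banks so the seven already loaded
        @ rows can be reused for the next four output rows.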
2:
        loadl           q12, q13, q14, q15
        convolve        q1, q2, q5, q6, q7, q8, q9, q10, q11, q12, q13, \idx1, \idx2, q4, q5
        convolve        q3, q4, q7, q8, q9, q10, q11, q12, q13, q14, q15, \idx1, \idx2, q5, q6
        do_store        q1, d2, q2, d4, q3, d6, q4, d8, d3, d5, d7, d9, \type
        subs            r12, r12, #4
        beq             8f
        loadl           q4, q5, q6, q7
        convolve        q1, q2, q9, q10, q11, q12, q13, q14, q15, q4, q5, \idx1, \idx2, q8, q9
        convolve        q3, q8, q11, q12, q13, q14, q15, q4, q5, q6, q7, \idx1, \idx2, q9, q10
        do_store        q1, d2, q2, d4, q3, d6, q8, d16, d3, d5, d7, d17, \type
        subs            r12, r12, #4
        beq             8f
        loadl           q8, q9, q10, q11
        convolve        q1, q2, q13, q14, q15, q4, q5, q6, q7, q8, q9, \idx1, \idx2, q12, q13
        convolve        q3, q12, q15, q4, q5, q6, q7, q8, q9, q10, q11, \idx1, \idx2, q13, q14
        do_store        q1, d2, q2, d4, q3, d6, q12, d24, d3, d5, d7, d25, \type
        subs            r12, r12, #4
        bne             2b
8:
        subs            r5, r5, #8
        beq             9f
        @ r0 -= h * dst_stride
        mls             r0, r1, r4, r0
        @ r2 -= h * src_stride
        mls             r2, r3, r4, r2
        @ r2 -= 8 * src_stride
        sub             r2, r2, r3, lsl #3
        @ r2 += 1 * src_stride
        add             r2, r2, r3
        add             r2, r2, #8
        add             r0, r0, #8
        b               1b
9:
        vpop            {q4-q7}
        pop             {r4-r5}
        bx              lr
endfunc
.endm

do_8tap_8v put, 3, 4
do_8tap_8v put, 4, 3
do_8tap_8v avg, 3, 4
do_8tap_8v avg, 4, 3

@ Instantiate a vertical filter function for filtering a 4 pixels wide
@ slice. The first half of the registers contain one row, while the second
@ half of a register contains the second-next row (also stored in the first
@ half of the register two steps ahead). The convolution does two outputs
@ at a time; the output of q5-q12 into one, and q4-q13 into another one.
@ The first half of first output is the first output row, the first half
@ of the other output is the second output row. The second halves of the
@ registers are rows 3 and 4.
@ This only is designed to work for 4 or 8 output lines.
.macro do_8tap_4v type, idx1, idx2
function \type\()_8tap_4v_\idx1\idx2
        sub             r2, r2, r3, lsl #1
        sub             r2, r2, r3
        vld1.8          {d0}, [r12, :64]
        vmovl.s8        q0, d0
        vld1.32         {d2[]}, [r2], r3
        vld1.32         {d3[]}, [r2], r3
        vld1.32         {d4[]}, [r2], r3
        vld1.32         {d5[]}, [r2], r3
        vld1.32         {d6[]}, [r2], r3
        vld1.32         {d7[]}, [r2], r3
        vext.8          d2, d2, d4, #4
        vld1.32         {d8[]}, [r2], r3
        vext.8          d3, d3, d5, #4
        vld1.32         {d9[]}, [r2], r3
        vmovl.u8        q5, d2
        vext.8          d4, d4, d6, #4
        vld1.32         {d28[]}, [r2], r3
        vmovl.u8        q6, d3
        vext.8          d5, d5, d7, #4
        vld1.32         {d29[]}, [r2], r3
        vmovl.u8        q7, d4
        vext.8          d6, d6, d8, #4
        vld1.32         {d30[]}, [r2], r3
        vmovl.u8        q8, d5
        vext.8          d7, d7, d9, #4
        vmovl.u8        q9, d6
        vext.8          d8, d8, d28, #4
        vmovl.u8        q10, d7
        vext.8          d9, d9, d29, #4
        vmovl.u8        q11, d8
        vext.8          d28, d28, d30, #4
        vmovl.u8        q12, d9
        vmovl.u8        q13, d28
        convolve        q1, q2, q5, q6, q7, q8, q9, q10, q11, q12, q13, \idx1, \idx2, q4, q3
        do_store4       q1, d2, q2, d4, d3, d5, \type
        subs            r4, r4, #4
        beq             9f
        vld1.32         {d2[]}, [r2], r3
        vld1.32         {d3[]}, [r2], r3
        vext.8          d29, d29, d2, #4
        vext.8          d30, d30, d3, #4
        vld1.32         {d2[1]}, [r2], r3
        vmovl.u8        q14, d29
        vld1.32         {d3[1]}, [r2], r3
        vmovl.u8        q15, d30
        vmovl.u8        q5, d2
        vmovl.u8        q6, d3
        convolve        q1, q2, q9, q10, q11, q12, q13, q14, q15, q5, q6, \idx1, \idx2, q4, q3
        do_store4       q1, d2, q2, d4, d3, d5, \type
9:
        vpop            {q4-q7}
        pop             {r4-r5}
        bx              lr
endfunc
.endm

do_8tap_4v put, 3, 4
do_8tap_4v put, 4, 3
do_8tap_4v avg, 3, 4
do_8tap_4v avg, 4, 3

.macro do_8tap_v_func type, filter, offset, size
function ff_vp9_\type\()_\filter\()\size\()_v_neon, export=1
        push            {r4-r5}
        vpush           {q4-q7}
        ldr             r4, [sp, #72]
        movrelx         r12, X(ff_vp9_subpel_filters), r5
        ldr             r5, [sp, #80]
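        @ r4 = h (at sp+72, past 8 bytes of pushed GPRs and 64 bytes of
        @ saved q4-q7), r5 = my (sp+80); the vertical filters are selected
        @ by my rather than mx.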
        add             r12, r12, 120*\offset - 8
        add             r12, r12, r5, lsl #3
        cmp             r5, #8
        mov             r5, #\size
.if \size >= 8
        bge             \type\()_8tap_8v_34
        b               \type\()_8tap_8v_43
.else
        bge             \type\()_8tap_4v_34
        b               \type\()_8tap_4v_43
.endif
endfunc
.endm

.macro do_8tap_v_filters size
do_8tap_v_func put, regular, 1, \size
do_8tap_v_func avg, regular, 1, \size
do_8tap_v_func put, sharp, 2, \size
do_8tap_v_func avg, sharp, 2, \size
do_8tap_v_func put, smooth, 0, \size
do_8tap_v_func avg, smooth, 0, \size
.endm

do_8tap_v_filters 64
do_8tap_v_filters 32
do_8tap_v_filters 16
do_8tap_v_filters 8
do_8tap_v_filters 4