/**
 * VP8 NEON optimisations
 *
 * Copyright (c) 2010 Rob Clark <rob@ti.com>
 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"
#include "neon.S"
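
@ Inverse 4x4 Walsh-Hadamard transform of the luma DC coefficients at
@ r1.  Each result lands in the DC slot of one of the 16 4x4 subblocks
@ at r0; consecutive subblocks are 16 int16_t apart, hence the 32-byte
@ store stride in r3.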
function ff_vp8_luma_dc_wht_neon, export=1
        vld1.16         {q0-q1}, [r1,:128]
        vmov.i16        q15, #0

        vadd.i16        d4, d0, d3
        vadd.i16        d6, d1, d2
        vst1.16         {q15}, [r1,:128]!
        vsub.i16        d7, d1, d2
        vsub.i16        d5, d0, d3
        vst1.16         {q15}, [r1,:128]
        vadd.i16        q0, q2, q3
        vsub.i16        q1, q2, q3

        vmov.i16        q8, #3
        vtrn.32         d0, d2
        vtrn.32         d1, d3
        vtrn.16         d0, d1
        vtrn.16         d2, d3
        vadd.i16        d0, d0, d16

        vadd.i16        d4, d0, d3
        vadd.i16        d6, d1, d2
        vsub.i16        d7, d1, d2
        vsub.i16        d5, d0, d3
        vadd.i16        q0, q2, q3
        vsub.i16        q1, q2, q3
        vshr.s16        q0, q0, #3
        vshr.s16        q1, q1, #3

        mov             r3, #32
        vst1.16         {d0[0]}, [r0,:16], r3
        vst1.16         {d1[0]}, [r0,:16], r3
        vst1.16         {d2[0]}, [r0,:16], r3
        vst1.16         {d3[0]}, [r0,:16], r3
        vst1.16         {d0[1]}, [r0,:16], r3
        vst1.16         {d1[1]}, [r0,:16], r3
        vst1.16         {d2[1]}, [r0,:16], r3
        vst1.16         {d3[1]}, [r0,:16], r3
        vst1.16         {d0[2]}, [r0,:16], r3
        vst1.16         {d1[2]}, [r0,:16], r3
        vst1.16         {d2[2]}, [r0,:16], r3
        vst1.16         {d3[2]}, [r0,:16], r3
        vst1.16         {d0[3]}, [r0,:16], r3
        vst1.16         {d1[3]}, [r0,:16], r3
        vst1.16         {d2[3]}, [r0,:16], r3
        vst1.16         {d3[3]}, [r0,:16], r3

        bx              lr
endfunc
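
@ DC-only special case of the above: every output is (dc[0] + 3) >> 3.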
function ff_vp8_luma_dc_wht_dc_neon, export=1
        ldrsh           r2, [r1]
        mov             r3, #0
        add             r2, r2, #3
        strh            r3, [r1]
        asr             r2, r2, #3
    .rept 16
        strh            r2, [r0], #32
    .endr
        bx              lr
endfunc
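
@ 4x4 IDCT.  The multipliers are the usual VP8 fixed-point constants,
@ 20091/65536 ~= sqrt(2)*cos(pi/8) - 1 and 35468/65536 ~= sqrt(2)*sin(pi/8);
@ the latter is halved here because vqdmulh doubles its product.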
function ff_vp8_idct_add_neon, export=1
        vld1.16         {q0-q1}, [r1,:128]
        movw            r3, #20091
        movt            r3, #35468/2
        vdup.32         d4, r3

        vmull.s16       q12, d1, d4[0]
        vmull.s16       q13, d3, d4[0]
        vqdmulh.s16     d20, d1, d4[1]
        vqdmulh.s16     d23, d3, d4[1]
        vshrn.s32       d21, q12, #16
        vshrn.s32       d22, q13, #16
        vadd.s16        d21, d21, d1
        vadd.s16        d22, d22, d3

        vadd.s16        d16, d0, d2
        vsub.s16        d17, d0, d2
        vadd.s16        d18, d21, d23
        vsub.s16        d19, d20, d22
        vadd.s16        q0, q8, q9
        vsub.s16        q1, q8, q9

        vtrn.32         d0, d3
        vtrn.32         d1, d2
        vtrn.16         d0, d1
        vtrn.16         d3, d2

        vmov.i16        q15, #0
        vmull.s16       q12, d1, d4[0]
        vst1.16         {q15}, [r1,:128]!
        vmull.s16       q13, d2, d4[0]
        vst1.16         {q15}, [r1,:128]
        vqdmulh.s16     d21, d1, d4[1]
        vqdmulh.s16     d23, d2, d4[1]
        vshrn.s32       d20, q12, #16
        vshrn.s32       d22, q13, #16
        vadd.i16        d20, d20, d1
        vadd.i16        d22, d22, d2

        vadd.i16        d16, d0, d3
        vsub.i16        d17, d0, d3
        vadd.i16        d18, d20, d23
        vld1.32         {d20[]}, [r0,:32], r2
        vsub.i16        d19, d21, d22
        vld1.32         {d22[]}, [r0,:32], r2
        vadd.s16        q0, q8, q9
        vld1.32         {d23[]}, [r0,:32], r2
        vsub.s16        q1, q8, q9
        vld1.32         {d21[]}, [r0,:32], r2
        vrshr.s16       q0, q0, #3
        vtrn.32         q10, q11
        vrshr.s16       q1, q1, #3

        sub             r0, r0, r2, lsl #2

        vtrn.32         d0, d3
        vtrn.32         d1, d2
        vtrn.16         d0, d1
        vtrn.16         d3, d2

        vaddw.u8        q0, q0, d20
        vaddw.u8        q1, q1, d21
        vqmovun.s16     d0, q0
        vqmovun.s16     d1, q1

        vst1.32         {d0[0]}, [r0,:32], r2
        vst1.32         {d0[1]}, [r0,:32], r2
        vst1.32         {d1[1]}, [r0,:32], r2
        vst1.32         {d1[0]}, [r0,:32], r2

        bx              lr
endfunc
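
@ DC-only IDCT: add (dc + 4) >> 3 to every pixel of the 4x4 block.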
function ff_vp8_idct_dc_add_neon, export=1
        mov             r3, #0
        ldrsh           r12, [r1]
        strh            r3, [r1]
        vdup.16         q1, r12
        vrshr.s16       q1, q1, #3
        vld1.32         {d0[]}, [r0,:32], r2
        vld1.32         {d1[]}, [r0,:32], r2
        vld1.32         {d0[1]}, [r0,:32], r2
        vld1.32         {d1[1]}, [r0,:32], r2
        vaddw.u8        q2, q1, d0
        vaddw.u8        q3, q1, d1
        sub             r0, r0, r2, lsl #2
        vqmovun.s16     d0, q2
        vqmovun.s16     d1, q3
        vst1.32         {d0[0]}, [r0,:32], r2
        vst1.32         {d1[0]}, [r0,:32], r2
        vst1.32         {d0[1]}, [r0,:32], r2
        vst1.32         {d1[1]}, [r0,:32], r2
        bx              lr
endfunc
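
@ DC-only add for the four 4x4 blocks covering an 8x8 chroma area; the
@ four DC values sit 32 bytes apart in the coefficient buffer.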
function ff_vp8_idct_dc_add4uv_neon, export=1
        vmov.i16        d0, #0
        mov             r3, #32
        vld1.16         {d16[]}, [r1,:16]
        vst1.16         {d0[0]}, [r1,:16], r3
        vld1.16         {d17[]}, [r1,:16]
        vst1.16         {d0[0]}, [r1,:16], r3
        vld1.16         {d18[]}, [r1,:16]
        vst1.16         {d0[0]}, [r1,:16], r3
        vld1.16         {d19[]}, [r1,:16]
        vst1.16         {d0[0]}, [r1,:16], r3
        mov             r3, r0
        vrshr.s16       q8, q8, #3              @ dc >>= 3
        vld1.8          {d0}, [r0,:64], r2
        vrshr.s16       q9, q9, #3
        vld1.8          {d1}, [r0,:64], r2
        vaddw.u8        q10, q8, d0
        vld1.8          {d2}, [r0,:64], r2
        vaddw.u8        q0, q8, d1
        vld1.8          {d3}, [r0,:64], r2
        vaddw.u8        q11, q8, d2
        vld1.8          {d4}, [r0,:64], r2
        vaddw.u8        q1, q8, d3
        vld1.8          {d5}, [r0,:64], r2
        vaddw.u8        q12, q9, d4
        vld1.8          {d6}, [r0,:64], r2
        vaddw.u8        q2, q9, d5
        vld1.8          {d7}, [r0,:64], r2
        vaddw.u8        q13, q9, d6
        vqmovun.s16     d20, q10
        vaddw.u8        q3, q9, d7
        vqmovun.s16     d21, q0
        vqmovun.s16     d22, q11
        vst1.8          {d20}, [r3,:64], r2
        vqmovun.s16     d23, q1
        vst1.8          {d21}, [r3,:64], r2
        vqmovun.s16     d24, q12
        vst1.8          {d22}, [r3,:64], r2
        vqmovun.s16     d25, q2
        vst1.8          {d23}, [r3,:64], r2
        vqmovun.s16     d26, q13
        vst1.8          {d24}, [r3,:64], r2
        vqmovun.s16     d27, q3
        vst1.8          {d25}, [r3,:64], r2
        vst1.8          {d26}, [r3,:64], r2
        vst1.8          {d27}, [r3,:64], r2
        bx              lr
endfunc
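
@ As above, for four 4x4 luma blocks laid out side by side (16x4 pixels).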
function ff_vp8_idct_dc_add4y_neon, export=1
        vmov.i16        d0, #0
        mov             r3, #32
        vld1.16         {d16[]}, [r1,:16]
        vst1.16         {d0[0]}, [r1,:16], r3
        vld1.16         {d17[]}, [r1,:16]
        vst1.16         {d0[0]}, [r1,:16], r3
        vld1.16         {d18[]}, [r1,:16]
        vst1.16         {d0[0]}, [r1,:16], r3
        vld1.16         {d19[]}, [r1,:16]
        vst1.16         {d0[0]}, [r1,:16], r3
        vrshr.s16       q8, q8, #3              @ dc >>= 3
        vld1.8          {q0}, [r0,:128], r2
        vrshr.s16       q9, q9, #3
        vld1.8          {q1}, [r0,:128], r2
        vaddw.u8        q10, q8, d0
        vld1.8          {q2}, [r0,:128], r2
        vaddw.u8        q0, q9, d1
        vld1.8          {q3}, [r0,:128], r2
        vaddw.u8        q11, q8, d2
        vaddw.u8        q1, q9, d3
        vaddw.u8        q12, q8, d4
        vaddw.u8        q2, q9, d5
        vaddw.u8        q13, q8, d6
        vaddw.u8        q3, q9, d7
        sub             r0, r0, r2, lsl #2
        vqmovun.s16     d20, q10
        vqmovun.s16     d21, q0
        vqmovun.s16     d22, q11
        vqmovun.s16     d23, q1
        vqmovun.s16     d24, q12
        vst1.8          {q10}, [r0,:128], r2
        vqmovun.s16     d25, q2
        vst1.8          {q11}, [r0,:128], r2
        vqmovun.s16     d26, q13
        vst1.8          {q12}, [r0,:128], r2
        vqmovun.s16     d27, q3
        vst1.8          {q13}, [r0,:128], r2
        bx              lr
endfunc
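
@ VP8 loop filter, 16 pixels per invocation.  simple=1 modifies only
@ P0/Q0, inner=1 applies the hev-gated 4-tap filter to P1..Q1, and the
@ default (macroblock edge) variant also filters P2/Q2 via filter_mbedge.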
@ Register layout:
@   P3..Q3 -> q0..q7
@   flim_E -> q14
@   flim_I -> q15
@   hev_thresh -> r12
@
.macro  vp8_loop_filter, inner=0, simple=0
    .if \simple
        vabd.u8         q9, q3, q4              @ abs(P0-Q0)
        vabd.u8         q15, q2, q5             @ abs(P1-Q1)
        vqadd.u8        q9, q9, q9              @ abs(P0-Q0) * 2
        vshr.u8         q10, q15, #1            @ abs(P1-Q1) / 2
        vqadd.u8        q11, q9, q10            @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        vmov.i8         q13, #0x80
        vcle.u8         q8, q11, q14            @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim
    .else
        @ calculate hev and normal_limit:
        vabd.u8         q12, q2, q3             @ abs(P1-P0)
        vabd.u8         q13, q5, q4             @ abs(Q1-Q0)
        vabd.u8         q10, q0, q1             @ abs(P3-P2)
        vabd.u8         q11, q1, q2             @ abs(P2-P1)
        vcle.u8         q8, q12, q15            @ abs(P1-P0) <= flim_I
        vcle.u8         q9, q13, q15            @ abs(Q1-Q0) <= flim_I
        vcle.u8         q10, q10, q15           @ abs(P3-P2) <= flim_I
        vcle.u8         q11, q11, q15           @ abs(P2-P1) <= flim_I
        vand            q8, q8, q9
        vabd.u8         q9, q7, q6              @ abs(Q3-Q2)
        vand            q8, q8, q11
        vabd.u8         q11, q6, q5             @ abs(Q2-Q1)
        vand            q8, q8, q10
        vcle.u8         q10, q9, q15            @ abs(Q3-Q2) <= flim_I
        vcle.u8         q11, q11, q15           @ abs(Q2-Q1) <= flim_I
        vabd.u8         q9, q3, q4              @ abs(P0-Q0)
        vabd.u8         q15, q2, q5             @ abs(P1-Q1)
        vand            q8, q8, q10
        vqadd.u8        q9, q9, q9              @ abs(P0-Q0) * 2
        vand            q8, q8, q11
        vshr.u8         q10, q15, #1            @ abs(P1-Q1) / 2
        vdup.8          q15, r12                @ hev_thresh
        vqadd.u8        q11, q9, q10            @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        vcgt.u8         q12, q12, q15           @ abs(P1-P0) > hev_thresh
        vcle.u8         q11, q11, q14           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E
        vcgt.u8         q14, q13, q15           @ abs(Q1-Q0) > hev_thresh
        vand            q8, q8, q11
        vmov.i8         q13, #0x80
        vorr            q9, q12, q14
    .endif

        @ at this point:
        @   q8: normal_limit
        @   q9: hev

        @ convert to signed value:
        veor            q3, q3, q13             @ PS0 = P0 ^ 0x80
        veor            q4, q4, q13             @ QS0 = Q0 ^ 0x80

        vmov.i16        q12, #3
        vsubl.s8        q10, d8, d6             @ QS0 - PS0
        vsubl.s8        q11, d9, d7             @   (widened to 16bit)
        veor            q2, q2, q13             @ PS1 = P1 ^ 0x80
        veor            q5, q5, q13             @ QS1 = Q1 ^ 0x80
        vmul.i16        q10, q10, q12           @ w = 3 * (QS0 - PS0)
        vmul.i16        q11, q11, q12

        vqsub.s8        q12, q2, q5             @ clamp(PS1-QS1)
        vmov.i8         q14, #4
        vmov.i8         q15, #3
    .if \inner
        vand            q12, q12, q9            @ if(hev) w += clamp(PS1-QS1)
    .endif
        vaddw.s8        q10, q10, d24           @ w += clamp(PS1-QS1)
        vaddw.s8        q11, q11, d25
        vqmovn.s16      d20, q10                @ narrow result back into q10
        vqmovn.s16      d21, q11
    .if !\inner && !\simple
        veor            q1, q1, q13             @ PS2 = P2 ^ 0x80
        veor            q6, q6, q13             @ QS2 = Q2 ^ 0x80
    .endif
        vand            q10, q10, q8            @ w &= normal_limit

        @ registers used at this point..
        @   q0 -> P3  (don't corrupt)
        @   q1-q6 -> PS2-QS2
        @   q7 -> Q3  (don't corrupt)
        @   q9 -> hev
        @   q10 -> w
        @   q13 -> #0x80
        @   q14 -> #4
        @   q15 -> #3
        @   q8, q11, q12 -> unused

        @ filter_common:   is4tap==1
        @   c1 = clamp(w + 4) >> 3;
        @   c2 = clamp(w + 3) >> 3;
        @   Q0 = s2u(QS0 - c1);
        @   P0 = s2u(PS0 + c2);
    .if \simple
        vqadd.s8        q11, q10, q14           @ c1 = clamp((w&hev)+4)
        vqadd.s8        q12, q10, q15           @ c2 = clamp((w&hev)+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vqsub.s8        q4, q4, q11             @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3, q3, q12             @ PS0 = clamp(PS0+c2)
        veor            q4, q4, q13             @ Q0 = QS0 ^ 0x80
        veor            q3, q3, q13             @ P0 = PS0 ^ 0x80
        veor            q5, q5, q13             @ Q1 = QS1 ^ 0x80
        veor            q2, q2, q13             @ P1 = PS1 ^ 0x80
    .elseif \inner
        @ the !is4tap case of filter_common, only used for inner blocks
        @   c3 = ((c1&~hev) + 1) >> 1;
        @   Q1 = s2u(QS1 - c3);
        @   P1 = s2u(PS1 + c3);
        vqadd.s8        q11, q10, q14           @ c1 = clamp((w&hev)+4)
        vqadd.s8        q12, q10, q15           @ c2 = clamp((w&hev)+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vqsub.s8        q4, q4, q11             @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3, q3, q12             @ PS0 = clamp(PS0+c2)
        vbic            q11, q11, q9            @ c1 & ~hev
        veor            q4, q4, q13             @ Q0 = QS0 ^ 0x80
        vrshr.s8        q11, q11, #1            @ c3 >>= 1
        veor            q3, q3, q13             @ P0 = PS0 ^ 0x80
        vqsub.s8        q5, q5, q11             @ QS1 = clamp(QS1-c3)
        vqadd.s8        q2, q2, q11             @ PS1 = clamp(PS1+c3)
        veor            q5, q5, q13             @ Q1 = QS1 ^ 0x80
        veor            q2, q2, q13             @ P1 = PS1 ^ 0x80
    .else
        vand            q12, q10, q9            @ w & hev
        vqadd.s8        q11, q12, q14           @ c1 = clamp((w&hev)+4)
        vqadd.s8        q12, q12, q15           @ c2 = clamp((w&hev)+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vbic            q10, q10, q9            @ w &= ~hev
        vqsub.s8        q4, q4, q11             @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3, q3, q12             @ PS0 = clamp(PS0+c2)

        @ filter_mbedge:
        @   a = clamp((27*w + 63) >> 7);
        @   Q0 = s2u(QS0 - a);
        @   P0 = s2u(PS0 + a);
        @   a = clamp((18*w + 63) >> 7);
        @   Q1 = s2u(QS1 - a);
        @   P1 = s2u(PS1 + a);
        @   a = clamp((9*w + 63) >> 7);
        @   Q2 = s2u(QS2 - a);
        @   P2 = s2u(PS2 + a);
        vmov.i16        q9, #63
        vshll.s8        q14, d20, #3
        vshll.s8        q15, d21, #3
        vaddw.s8        q14, q14, d20
        vaddw.s8        q15, q15, d21
        vadd.s16        q8, q9, q14
        vadd.s16        q9, q9, q15             @  9*w + 63
        vadd.s16        q11, q8, q14
        vadd.s16        q12, q9, q15            @ 18*w + 63
        vadd.s16        q14, q11, q14
        vadd.s16        q15, q12, q15           @ 27*w + 63
        vqshrn.s16      d16, q8, #7
        vqshrn.s16      d17, q9, #7             @ clamp(( 9*w + 63)>>7)
        vqshrn.s16      d22, q11, #7
        vqshrn.s16      d23, q12, #7            @ clamp((18*w + 63)>>7)
        vqshrn.s16      d28, q14, #7
        vqshrn.s16      d29, q15, #7            @ clamp((27*w + 63)>>7)
        vqadd.s8        q1, q1, q8              @ PS2 = clamp(PS2+a)
        vqsub.s8        q6, q6, q8              @ QS2 = clamp(QS2-a)
        vqadd.s8        q2, q2, q11             @ PS1 = clamp(PS1+a)
        vqsub.s8        q5, q5, q11             @ QS1 = clamp(QS1-a)
        vqadd.s8        q3, q3, q14             @ PS0 = clamp(PS0+a)
        vqsub.s8        q4, q4, q14             @ QS0 = clamp(QS0-a)
        veor            q3, q3, q13             @ P0 = PS0 ^ 0x80
        veor            q4, q4, q13             @ Q0 = QS0 ^ 0x80
        veor            q2, q2, q13             @ P1 = PS1 ^ 0x80
        veor            q5, q5, q13             @ Q1 = QS1 ^ 0x80
        veor            q1, q1, q13             @ P2 = PS2 ^ 0x80
        veor            q6, q6, q13             @ Q2 = QS2 ^ 0x80
    .endif
.endm
.macro  vp8_v_loop_filter16 name, inner=0, simple=0
function ff_vp8_v_loop_filter16\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0, r0, r1, lsl #1+!\simple

        @ Load pixels:
    .if !\simple
        ldr             r12, [sp, #64]          @ hev_thresh
        vld1.8          {q0}, [r0,:128], r1     @ P3
        vld1.8          {q1}, [r0,:128], r1     @ P2
    .endif
        vld1.8          {q2}, [r0,:128], r1     @ P1
        vld1.8          {q3}, [r0,:128], r1     @ P0
        vld1.8          {q4}, [r0,:128], r1     @ Q0
        vld1.8          {q5}, [r0,:128], r1     @ Q1
    .if !\simple
        vld1.8          {q6}, [r0,:128], r1     @ Q2
        vld1.8          {q7}, [r0,:128]         @ Q3
        vdup.8          q15, r3                 @ flim_I
    .endif
        vdup.8          q14, r2                 @ flim_E

        vp8_loop_filter inner=\inner, simple=\simple

        @ back up to P2:  dst -= stride * 6
        sub             r0, r0, r1, lsl #2
    .if !\simple
        sub             r0, r0, r1, lsl #1

        @ Store pixels:
        vst1.8          {q1}, [r0,:128], r1     @ P2
    .endif
        vst1.8          {q2}, [r0,:128], r1     @ P1
        vst1.8          {q3}, [r0,:128], r1     @ P0
        vst1.8          {q4}, [r0,:128], r1     @ Q0
        vst1.8          {q5}, [r0,:128], r1     @ Q1
    .if !\simple
        vst1.8          {q6}, [r0,:128]         @ Q2
    .endif

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_v_loop_filter16
vp8_v_loop_filter16 _inner,  inner=1
vp8_v_loop_filter16 _simple, simple=1

.macro  vp8_v_loop_filter8uv name, inner=0
function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0, r0, r2, lsl #2
        sub             r1, r1, r2, lsl #2
        ldr             r12, [sp, #64]          @ flim_I

        @ Load pixels:
        vld1.8          {d0}, [r0,:64], r2      @ P3
        vld1.8          {d1}, [r1,:64], r2      @ P3
        vld1.8          {d2}, [r0,:64], r2      @ P2
        vld1.8          {d3}, [r1,:64], r2      @ P2
        vld1.8          {d4}, [r0,:64], r2      @ P1
        vld1.8          {d5}, [r1,:64], r2      @ P1
        vld1.8          {d6}, [r0,:64], r2      @ P0
        vld1.8          {d7}, [r1,:64], r2      @ P0
        vld1.8          {d8}, [r0,:64], r2      @ Q0
        vld1.8          {d9}, [r1,:64], r2      @ Q0
        vld1.8          {d10}, [r0,:64], r2     @ Q1
        vld1.8          {d11}, [r1,:64], r2     @ Q1
        vld1.8          {d12}, [r0,:64], r2     @ Q2
        vld1.8          {d13}, [r1,:64], r2     @ Q2
        vld1.8          {d14}, [r0,:64]         @ Q3
        vld1.8          {d15}, [r1,:64]         @ Q3

        vdup.8          q14, r3                 @ flim_E
        vdup.8          q15, r12                @ flim_I
        ldr             r12, [sp, #68]          @ hev_thresh

        vp8_loop_filter inner=\inner

        @ back up to P2:  u,v -= stride * 6
        sub             r0, r0, r2, lsl #2
        sub             r1, r1, r2, lsl #2
        sub             r0, r0, r2, lsl #1
        sub             r1, r1, r2, lsl #1

        @ Store pixels:
        vst1.8          {d2}, [r0,:64], r2      @ P2
        vst1.8          {d3}, [r1,:64], r2      @ P2
        vst1.8          {d4}, [r0,:64], r2      @ P1
        vst1.8          {d5}, [r1,:64], r2      @ P1
        vst1.8          {d6}, [r0,:64], r2      @ P0
        vst1.8          {d7}, [r1,:64], r2      @ P0
        vst1.8          {d8}, [r0,:64], r2      @ Q0
        vst1.8          {d9}, [r1,:64], r2      @ Q0
        vst1.8          {d10}, [r0,:64], r2     @ Q1
        vst1.8          {d11}, [r1,:64], r2     @ Q1
        vst1.8          {d12}, [r0,:64]         @ Q2
        vst1.8          {d13}, [r1,:64]         @ Q2

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_v_loop_filter8uv
vp8_v_loop_filter8uv _inner, inner=1

.macro  vp8_h_loop_filter16 name, inner=0, simple=0
function ff_vp8_h_loop_filter16\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0, r0, #4
    .if !\simple
        ldr             r12, [sp, #64]          @ hev_thresh
    .endif

        @ Load pixels:
        vld1.8          {d0}, [r0], r1          @ load first 8-line src data
        vld1.8          {d2}, [r0], r1
        vld1.8          {d4}, [r0], r1
        vld1.8          {d6}, [r0], r1
        vld1.8          {d8}, [r0], r1
        vld1.8          {d10}, [r0], r1
        vld1.8          {d12}, [r0], r1
        vld1.8          {d14}, [r0], r1
        vld1.8          {d1}, [r0], r1          @ load second 8-line src data
        vld1.8          {d3}, [r0], r1
        vld1.8          {d5}, [r0], r1
        vld1.8          {d7}, [r0], r1
        vld1.8          {d9}, [r0], r1
        vld1.8          {d11}, [r0], r1
        vld1.8          {d13}, [r0], r1
        vld1.8          {d15}, [r0], r1

        transpose_8x8   q0, q1, q2, q3, q4, q5, q6, q7

        vdup.8          q14, r2                 @ flim_E
    .if !\simple
        vdup.8          q15, r3                 @ flim_I
    .endif

        vp8_loop_filter inner=\inner, simple=\simple

        sub             r0, r0, r1, lsl #4      @ backup 16 rows

        transpose_8x8   q0, q1, q2, q3, q4, q5, q6, q7

        @ Store pixels:
        vst1.8          {d0}, [r0], r1
        vst1.8          {d2}, [r0], r1
        vst1.8          {d4}, [r0], r1
        vst1.8          {d6}, [r0], r1
        vst1.8          {d8}, [r0], r1
        vst1.8          {d10}, [r0], r1
        vst1.8          {d12}, [r0], r1
        vst1.8          {d14}, [r0], r1
        vst1.8          {d1}, [r0], r1
        vst1.8          {d3}, [r0], r1
        vst1.8          {d5}, [r0], r1
        vst1.8          {d7}, [r0], r1
        vst1.8          {d9}, [r0], r1
        vst1.8          {d11}, [r0], r1
        vst1.8          {d13}, [r0], r1
        vst1.8          {d15}, [r0]

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_h_loop_filter16
vp8_h_loop_filter16 _inner,  inner=1
vp8_h_loop_filter16 _simple, simple=1

.macro  vp8_h_loop_filter8uv name, inner=0
function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0, r0, #4
        sub             r1, r1, #4
        ldr             r12, [sp, #64]          @ flim_I

        @ Load pixels:
        vld1.8          {d0}, [r0], r2          @ load u
        vld1.8          {d1}, [r1], r2          @ load v
        vld1.8          {d2}, [r0], r2
        vld1.8          {d3}, [r1], r2
        vld1.8          {d4}, [r0], r2
        vld1.8          {d5}, [r1], r2
        vld1.8          {d6}, [r0], r2
        vld1.8          {d7}, [r1], r2
        vld1.8          {d8}, [r0], r2
        vld1.8          {d9}, [r1], r2
        vld1.8          {d10}, [r0], r2
        vld1.8          {d11}, [r1], r2
        vld1.8          {d12}, [r0], r2
        vld1.8          {d13}, [r1], r2
        vld1.8          {d14}, [r0], r2
        vld1.8          {d15}, [r1], r2

        transpose_8x8   q0, q1, q2, q3, q4, q5, q6, q7

        vdup.8          q14, r3                 @ flim_E
        vdup.8          q15, r12                @ flim_I
        ldr             r12, [sp, #68]          @ hev_thresh

        vp8_loop_filter inner=\inner

        sub             r0, r0, r2, lsl #3      @ backup u 8 rows
        sub             r1, r1, r2, lsl #3      @ backup v 8 rows

        transpose_8x8   q0, q1, q2, q3, q4, q5, q6, q7

        @ Store pixels:
        vst1.8          {d0}, [r0], r2
        vst1.8          {d1}, [r1], r2
        vst1.8          {d2}, [r0], r2
        vst1.8          {d3}, [r1], r2
        vst1.8          {d4}, [r0], r2
        vst1.8          {d5}, [r1], r2
        vst1.8          {d6}, [r0], r2
        vst1.8          {d7}, [r1], r2
        vst1.8          {d8}, [r0], r2
        vst1.8          {d9}, [r1], r2
        vst1.8          {d10}, [r0], r2
        vst1.8          {d11}, [r1], r2
        vst1.8          {d12}, [r0], r2
        vst1.8          {d13}, [r1], r2
        vst1.8          {d14}, [r0]
        vst1.8          {d15}, [r1]

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_h_loop_filter8uv
vp8_h_loop_filter8uv _inner, inner=1

function ff_put_vp8_pixels16_neon, export=1
        ldr             r12, [sp, #0]           @ h
1:
        subs            r12, r12, #4
        vld1.8          {q0}, [r2], r3
        vld1.8          {q1}, [r2], r3
        vld1.8          {q2}, [r2], r3
        vld1.8          {q3}, [r2], r3
        vst1.8          {q0}, [r0,:128], r1
        vst1.8          {q1}, [r0,:128], r1
        vst1.8          {q2}, [r0,:128], r1
        vst1.8          {q3}, [r0,:128], r1
        bgt             1b
        bx              lr
endfunc

function ff_put_vp8_pixels8_neon, export=1
        ldr             r12, [sp, #0]           @ h
1:
        subs            r12, r12, #4
        vld1.8          {d0}, [r2], r3
        vld1.8          {d1}, [r2], r3
        vld1.8          {d2}, [r2], r3
        vld1.8          {d3}, [r2], r3
        vst1.8          {d0}, [r0,:64], r1
        vst1.8          {d1}, [r0,:64], r1
        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        bgt             1b
        bx              lr
endfunc

function ff_put_vp8_pixels4_neon, export=1
        ldr             r12, [sp, #0]           @ h
        push            {r4-r6,lr}
1:
        subs            r12, r12, #4
        ldr_post        r4, r2, r3
        ldr_post        r5, r2, r3
        ldr_post        r6, r2, r3
        ldr_post        lr, r2, r3
        str_post        r4, r0, r1
        str_post        r5, r0, r1
        str_post        r6, r0, r1
        str_post        lr, r0, r1
        bgt             1b
        pop             {r4-r6,pc}
endfunc

/* 4/6-tap 8th-pel MC */
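@ The taps are applied as vmla (positive coefficients) and vmls
@ (negative coefficients) accumulations.  Each filter row sums to 128,
@ so the final vqrshrun #7 rounds and renormalises in one step.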
.macro  vp8_epel8_h6 d, a, b
        vext.8          d27, \a, \b, #1
        vmovl.u8        q8, \a
        vext.8          d28, \a, \b, #2
        vmovl.u8        q9, d27
        vext.8          d29, \a, \b, #3
        vmovl.u8        q10, d28
        vext.8          d30, \a, \b, #4
        vmovl.u8        q11, d29
        vext.8          d31, \a, \b, #5
        vmovl.u8        q12, d30
        vmul.u16        q10, q10, d0[2]
        vmovl.u8        q13, d31
        vmul.u16        q11, q11, d0[3]
        vmls.u16        q10, q9, d0[1]
        vmls.u16        q11, q12, d1[0]
        vmla.u16        q10, q8, d0[0]
        vmla.u16        q11, q13, d1[1]
        vqadd.s16       q11, q10, q11
        vqrshrun.s16    \d, q11, #7
.endm

.macro  vp8_epel16_h6 d0, d1, s0, s1, s2, q0, q1
        vext.8          q14, \q0, \q1, #3
        vext.8          q15, \q0, \q1, #4
        vmovl.u8        q11, d28
        vmovl.u8        q14, d29
        vext.8          q3, \q0, \q1, #2
        vmovl.u8        q12, d30
        vmovl.u8        q15, d31
        vext.8          q8, \q0, \q1, #1
        vmovl.u8        q10, d6
        vmovl.u8        q3, d7
        vext.8          q2, \q0, \q1, #5
        vmovl.u8        q13, d4
        vmovl.u8        q2, d5
        vmovl.u8        q9, d16
        vmovl.u8        q8, d17
        vmul.u16        q11, q11, d0[3]
        vmul.u16        q10, q10, d0[2]
        vmul.u16        q3, q3, d0[2]
        vmul.u16        q14, q14, d0[3]
        vmls.u16        q11, q12, d1[0]
        vmovl.u8        q12, \s0
        vmovl.u8        q1, \s1
        vmls.u16        q10, q9, d0[1]
        vmls.u16        q3, q8, d0[1]
        vmls.u16        q14, q15, d1[0]
        vmla.u16        q10, q12, d0[0]
        vmla.u16        q11, q13, d1[1]
        vmla.u16        q3, q1, d0[0]
        vmla.u16        q14, q2, d1[1]
        vqadd.s16       q11, q10, q11
        vqadd.s16       q14, q3, q14
        vqrshrun.s16    \d0, q11, #7
        vqrshrun.s16    \d1, q14, #7
.endm

.macro  vp8_epel8_v6 d0, s0, s1, s2, s3, s4, s5
        vmovl.u8        q10, \s2
        vmovl.u8        q11, \s3
        vmovl.u8        q9, \s1
        vmovl.u8        q12, \s4
        vmovl.u8        q8, \s0
        vmovl.u8        q13, \s5
        vmul.u16        q10, q10, d0[2]
        vmul.u16        q11, q11, d0[3]
        vmls.u16        q10, q9, d0[1]
        vmls.u16        q11, q12, d1[0]
        vmla.u16        q10, q8, d0[0]
        vmla.u16        q11, q13, d1[1]
        vqadd.s16       q11, q10, q11
        vqrshrun.s16    \d0, q11, #7
.endm

.macro  vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6
        vmovl.u8        q10, \s0
        vmovl.u8        q11, \s3
        vmovl.u8        q14, \s6
        vmovl.u8        q9, \s1
        vmovl.u8        q12, \s4
        vmovl.u8        q8, \s2
        vmovl.u8        q13, \s5
        vmul.u16        q10, q10, d0[0]
        vmul.u16        q15, q11, d0[3]
        vmul.u16        q11, q11, d0[2]
        vmul.u16        q14, q14, d1[1]
        vmls.u16        q10, q9, d0[1]
        vmls.u16        q15, q12, d1[0]
        vmls.u16        q11, q8, d0[1]
        vmls.u16        q14, q13, d1[0]
        vmla.u16        q10, q8, d0[2]
        vmla.u16        q15, q13, d1[1]
        vmla.u16        q11, q9, d0[0]
        vmla.u16        q14, q12, d0[3]
        vqadd.s16       q15, q10, q15
        vqadd.s16       q14, q11, q14
        vqrshrun.s16    \d0, q15, #7
        vqrshrun.s16    \d1, q14, #7
.endm

.macro  vp8_epel8_h4 d, a, b
        vext.8          d28, \a, \b, #1
        vmovl.u8        q9, \a
        vext.8          d29, \a, \b, #2
        vmovl.u8        q10, d28
        vext.8          d30, \a, \b, #3
        vmovl.u8        q11, d29
        vmovl.u8        q12, d30
        vmul.u16        q10, q10, d0[2]
        vmul.u16        q11, q11, d0[3]
        vmls.u16        q10, q9, d0[1]
        vmls.u16        q11, q12, d1[0]
        vqadd.s16       q11, q10, q11
        vqrshrun.s16    \d, q11, #7
.endm

.macro  vp8_epel8_v4_y2 d0, d1, s0, s1, s2, s3, s4
        vmovl.u8        q9, \s0
        vmovl.u8        q10, \s1
        vmovl.u8        q11, \s2
        vmovl.u8        q12, \s3
        vmovl.u8        q13, \s4
        vmul.u16        q8, q10, d0[2]
        vmul.u16        q14, q11, d0[3]
        vmul.u16        q11, q11, d0[2]
        vmul.u16        q15, q12, d0[3]
        vmls.u16        q8, q9, d0[1]
        vmls.u16        q14, q12, d1[0]
        vmls.u16        q11, q10, d0[1]
        vmls.u16        q15, q13, d1[0]
        vqadd.s16       q8, q8, q14
        vqadd.s16       q11, q11, q15
        vqrshrun.s16    \d0, q8, #7
        vqrshrun.s16    \d1, q11, #7
.endm
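
@ subpel_filters (defined near the end of this file) has one 16-byte
@ row per fractional position 1..7; position 0 is a plain copy handled
@ by the pixels functions above, hence the subpel_filters-16 base for
@ the mx/my << 4 index.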
function ff_put_vp8_epel16_v6_neon, export=1
        sub             r2, r2, r3, lsl #1
        push            {r4,lr}
        vpush           {d8-d15}

        ldr             r4, [sp, #80]           @ my
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #72]          @ h
        add             r4, lr, r4, lsl #4
        vld1.16         {q0}, [r4,:128]
1:
        vld1.8          {d2-d3}, [r2], r3
        vld1.8          {d4-d5}, [r2], r3
        vld1.8          {d6-d7}, [r2], r3
        vld1.8          {d8-d9}, [r2], r3
        vld1.8          {d10-d11}, [r2], r3
        vld1.8          {d12-d13}, [r2], r3
        vld1.8          {d14-d15}, [r2]
        sub             r2, r2, r3, lsl #2

        vp8_epel8_v6_y2 d2, d4, d2, d4, d6, d8, d10, d12, d14
        vp8_epel8_v6_y2 d3, d5, d3, d5, d7, d9, d11, d13, d15

        vst1.8          {d2-d3}, [r0,:128], r1
        vst1.8          {d4-d5}, [r0,:128], r1
        subs            r12, r12, #2
        bne             1b

        vpop            {d8-d15}
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel16_h6_neon, export=1
        sub             r2, r2, #2
        push            {r4,lr}

        ldr             r4, [sp, #12]           @ mx
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4, lr, r4, lsl #4
        vld1.16         {q0}, [r4,:128]
1:
        vld1.8          {d2-d4}, [r2], r3

        vp8_epel16_h6   d2, d3, d2, d3, d4, q1, q2

        vst1.8          {d2-d3}, [r0,:128], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc
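
@ The two-pass (hv) variants below filter horizontally into a
@ 16-byte-aligned scratch buffer on the stack (h+5 rows when the
@ vertical pass is 6-tap, h+3 when it is 4-tap), then filter that
@ buffer vertically into the destination.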
function ff_put_vp8_epel16_h6v6_neon, export=1
        sub             r2, r2, r3, lsl #1
        sub             r2, r2, #2
        push            {r4,lr}
        vpush           {d8-d9}

        @ first pass (horizontal):
        ldr             r4, [sp, #28]           @ mx
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #24]          @ h
        add             r4, lr, r4, lsl #4
        sub             sp, sp, #336+16
        vld1.16         {q0}, [r4,:128]
        add             lr, sp, #15
        add             r12, r12, #5
        bic             lr, lr, #15
1:
        vld1.8          {d2,d3,d4}, [r2], r3

        vp8_epel16_h6   d2, d3, d2, d3, d4, q1, q2

        vst1.8          {d2-d3}, [lr,:128]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4, [sp, #336+16+32]    @ my
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #336+16+24]   @ h
        add             r4, lr, r4, lsl #4
        add             lr, sp, #15
        vld1.16         {q0}, [r4,:128]
        bic             lr, lr, #15
2:
        vld1.8          {d2-d5}, [lr,:128]!
        vld1.8          {d6-d9}, [lr,:128]!
        vld1.8          {d28-d31}, [lr,:128]
        sub             lr, lr, #48

        vp8_epel8_v6    d2, d2, d4, d6, d8, d28, d30
        vp8_epel8_v6    d3, d3, d5, d7, d9, d29, d31

        vst1.8          {d2-d3}, [r0,:128], r1
        subs            r12, r12, #1
        bne             2b

        add             sp, sp, #336+16
        vpop            {d8-d9}
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_v6_neon, export=1
        sub             r2, r2, r3, lsl #1
        push            {r4,lr}

        ldr             r4, [sp, #16]           @ my
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4, lr, r4, lsl #4
        vld1.16         {q0}, [r4,:128]
1:
        vld1.8          {d2}, [r2], r3
        vld1.8          {d3}, [r2], r3
        vld1.8          {d4}, [r2], r3
        vld1.8          {d5}, [r2], r3
        vld1.8          {d6}, [r2], r3
        vld1.8          {d7}, [r2], r3
        vld1.8          {d28}, [r2]
        sub             r2, r2, r3, lsl #2

        vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d28

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_h6_neon, export=1
        sub             r2, r2, #2
        push            {r4,lr}

        ldr             r4, [sp, #12]           @ mx
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4, lr, r4, lsl #4
        vld1.16         {q0}, [r4,:128]
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h6    d2, d2, d3

        vst1.8          {d2}, [r0,:64], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_h6v6_neon, export=1
        sub             r2, r2, r3, lsl #1
        sub             r2, r2, #2
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4, [sp, #12]           @ mx
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4, lr, r4, lsl #4
        sub             sp, sp, #168+16
        vld1.16         {q0}, [r4,:128]
        add             lr, sp, #15
        add             r12, r12, #5
        bic             lr, lr, #15
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h6    d2, d2, d3

        vst1.8          {d2}, [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4, [sp, #168+16+16]    @ my
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4, lr, r4, lsl #4
        add             lr, sp, #15
        vld1.16         {q0}, [r4,:128]
        bic             lr, lr, #15
2:
        vld1.8          {d2-d5}, [lr,:128]!
        vld1.8          {d6-d7}, [lr,:128]!
        vld1.8          {d30}, [lr,:64]
        sub             lr, lr, #32

        vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d30

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp, sp, #168+16
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_v4_neon, export=1
        sub             r2, r2, r3
        push            {r4,lr}

        ldr             r4, [sp, #16]           @ my
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4, lr, r4, lsl #4
        vld1.16         {q0}, [r4,:128]
1:
        vld1.8          {d2}, [r2], r3
        vld1.8          {d3}, [r2], r3
        vld1.8          {d4}, [r2], r3
        vld1.8          {d5}, [r2], r3
        vld1.8          {d6}, [r2]
        sub             r2, r2, r3, lsl #1

        vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_h4_neon, export=1
        sub             r2, r2, #1
        push            {r4,lr}

        ldr             r4, [sp, #12]           @ mx
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4, lr, r4, lsl #4
        vld1.16         {q0}, [r4,:128]
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h4    d2, d2, d3

        vst1.8          {d2}, [r0,:64], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_h4v4_neon, export=1
        sub             r2, r2, r3
        sub             r2, r2, #1
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4, [sp, #12]           @ mx
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4, lr, r4, lsl #4
        sub             sp, sp, #168+16
        vld1.16         {q0}, [r4,:128]
        add             lr, sp, #15
        add             r12, r12, #3
        bic             lr, lr, #15
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h4    d2, d2, d3

        vst1.8          {d2}, [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4, [sp, #168+16+16]    @ my
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4, lr, r4, lsl #4
        add             lr, sp, #15
        vld1.16         {q0}, [r4,:128]
        bic             lr, lr, #15
2:
        vld1.8          {d2-d5}, [lr,:128]!
        vld1.8          {d6}, [lr,:64]
        sub             lr, lr, #16

        vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp, sp, #168+16
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_h6v4_neon, export=1
        sub             r2, r2, r3
        sub             r2, r2, #2
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4, [sp, #12]           @ mx
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4, lr, r4, lsl #4
        sub             sp, sp, #168+16
        vld1.16         {q0}, [r4,:128]
        add             lr, sp, #15
        add             r12, r12, #3
        bic             lr, lr, #15
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h6    d2, d2, d3

        vst1.8          {d2}, [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4, [sp, #168+16+16]    @ my
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4, lr, r4, lsl #4
        add             lr, sp, #15
        vld1.16         {q0}, [r4,:128]
        bic             lr, lr, #15
2:
        vld1.8          {d2-d5}, [lr,:128]!
        vld1.8          {d6}, [lr,:64]
        sub             lr, lr, #16

        vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp, sp, #168+16
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_h4v6_neon, export=1
        sub             r2, r2, r3, lsl #1
        sub             r2, r2, #1
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4, [sp, #12]           @ mx
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4, lr, r4, lsl #4
        sub             sp, sp, #168+16
        vld1.16         {q0}, [r4,:128]
        add             lr, sp, #15
        add             r12, r12, #5
        bic             lr, lr, #15
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h4    d2, d2, d3

        vst1.8          {d2}, [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4, [sp, #168+16+16]    @ my
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4, lr, r4, lsl #4
        add             lr, sp, #15
        vld1.16         {q0}, [r4,:128]
        bic             lr, lr, #15
2:
        vld1.8          {d2-d5}, [lr,:128]!
        vld1.8          {d6-d7}, [lr,:128]!
        vld1.8          {d30}, [lr,:64]
        sub             lr, lr, #32

        vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d30

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp, sp, #168+16
        pop             {r4,pc}
endfunc

.ltorg

function ff_put_vp8_epel4_v6_neon, export=1
        sub             r2, r2, r3, lsl #1
        push            {r4,lr}

        ldr             r4, [sp, #16]           @ my
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4, lr, r4, lsl #4
        vld1.16         {q0}, [r4,:128]
1:
        vld1.32         {d2[]}, [r2], r3
        vld1.32         {d3[]}, [r2], r3
        vld1.32         {d4[]}, [r2], r3
        vld1.32         {d5[]}, [r2], r3
        vld1.32         {d6[]}, [r2], r3
        vld1.32         {d7[]}, [r2], r3
        vld1.32         {d28[]}, [r2]
        sub             r2, r2, r3, lsl #2
        vld1.32         {d2[1]}, [r2], r3
        vld1.32         {d3[1]}, [r2], r3
        vld1.32         {d4[1]}, [r2], r3
        vld1.32         {d5[1]}, [r2], r3
        vld1.32         {d6[1]}, [r2], r3
        vld1.32         {d7[1]}, [r2], r3
        vld1.32         {d28[1]}, [r2]
        sub             r2, r2, r3, lsl #2

        vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d28

        vst1.32         {d2[0]}, [r0,:32], r1
        vst1.32         {d3[0]}, [r0,:32], r1
        vst1.32         {d2[1]}, [r0,:32], r1
        vst1.32         {d3[1]}, [r0,:32], r1
        subs            r12, r12, #4
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel4_h6_neon, export=1
        sub             r2, r2, #2
        push            {r4,lr}

        ldr             r4, [sp, #12]           @ mx
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4, lr, r4, lsl #4
        vld1.16         {q0}, [r4,:128]
1:
        vld1.8          {q1}, [r2], r3
        vp8_epel8_h6    d2, d2, d3
        vst1.32         {d2[0]}, [r0,:32], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel4_h6v6_neon, export=1
        sub             r2, r2, r3, lsl #1
        sub             r2, r2, #2
        push            {r4,lr}

        ldr             r4, [sp, #12]           @ mx
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4, lr, r4, lsl #4
        sub             sp, sp, #52+16
        vld1.16         {q0}, [r4,:128]
        add             lr, sp, #15
        add             r12, r12, #5
        bic             lr, lr, #15
1:
        vld1.8          {q1}, [r2], r3
        vp8_epel8_h6    d2, d2, d3
        vst1.32         {d2[0]}, [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        ldr             r4, [sp, #52+16+16]     @ my
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #52+16+8]     @ h
        add             r4, lr, r4, lsl #4
        add             lr, sp, #15
        vld1.16         {q0}, [r4,:128]
        bic             lr, lr, #15
2:
        vld1.8          {d2-d3}, [lr,:128]!
        vld1.8          {d6}, [lr,:64]!
        vld1.32         {d28[]}, [lr,:32]
        sub             lr, lr, #16
        vld1.8          {d4-d5}, [lr]!
        vld1.8          {d7}, [lr,:64]!
        vld1.32         {d28[1]}, [lr,:32]
        sub             lr, lr, #16
        vtrn.32         q1, q2
        vtrn.32         d6, d7

        vp8_epel8_v6_y2 d2, d3, d2, d4, d3, d5, d6, d7, d28

        vst1.32         {d2[0]}, [r0,:32], r1
        vst1.32         {d3[0]}, [r0,:32], r1
        vst1.32         {d2[1]}, [r0,:32], r1
        vst1.32         {d3[1]}, [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp, sp, #52+16
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel4_h4v6_neon, export=1
        sub             r2, r2, r3, lsl #1
        sub             r2, r2, #1
        push            {r4,lr}

        ldr             r4, [sp, #12]           @ mx
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4, lr, r4, lsl #4
        sub             sp, sp, #52+16
        vld1.16         {q0}, [r4,:128]
        add             lr, sp, #15
        add             r12, r12, #5
        bic             lr, lr, #15
1:
        vld1.8          {d2}, [r2], r3
        vp8_epel8_h4    d2, d2, d2
        vst1.32         {d2[0]}, [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        ldr             r4, [sp, #52+16+16]     @ my
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #52+16+8]     @ h
        add             r4, lr, r4, lsl #4
        add             lr, sp, #15
        vld1.16         {q0}, [r4,:128]
        bic             lr, lr, #15
2:
        vld1.8          {d2-d3}, [lr,:128]!
        vld1.8          {d6}, [lr,:64]!
        vld1.32         {d28[]}, [lr,:32]
        sub             lr, lr, #16
        vld1.8          {d4-d5}, [lr]!
        vld1.8          {d7}, [lr,:64]!
        vld1.32         {d28[1]}, [lr,:32]
        sub             lr, lr, #16
        vtrn.32         q1, q2
        vtrn.32         d6, d7

        vp8_epel8_v6_y2 d2, d3, d2, d4, d3, d5, d6, d7, d28

        vst1.32         {d2[0]}, [r0,:32], r1
        vst1.32         {d3[0]}, [r0,:32], r1
        vst1.32         {d2[1]}, [r0,:32], r1
        vst1.32         {d3[1]}, [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp, sp, #52+16
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel4_h6v4_neon, export=1
        sub             r2, r2, r3
        sub             r2, r2, #2
        push            {r4,lr}

        ldr             r4, [sp, #12]           @ mx
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4, lr, r4, lsl #4
        sub             sp, sp, #44+16
        vld1.16         {q0}, [r4,:128]
        add             lr, sp, #15
        add             r12, r12, #3
        bic             lr, lr, #15
1:
        vld1.8          {q1}, [r2], r3
        vp8_epel8_h6    d2, d2, d3
        vst1.32         {d2[0]}, [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        ldr             r4, [sp, #44+16+16]     @ my
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #44+16+8]     @ h
        add             r4, lr, r4, lsl #4
        add             lr, sp, #15
        vld1.16         {q0}, [r4,:128]
        bic             lr, lr, #15
2:
        vld1.8          {d2-d3}, [lr,:128]!
        vld1.32         {d6[]}, [lr,:32]
        sub             lr, lr, #8
        vld1.8          {d4-d5}, [lr]!
        vld1.32         {d6[1]}, [lr,:32]
        sub             lr, lr, #8
        vtrn.32         q1, q2

        vp8_epel8_v4_y2 d2, d3, d2, d4, d3, d5, d6

        vst1.32         {d2[0]}, [r0,:32], r1
        vst1.32         {d3[0]}, [r0,:32], r1
        vst1.32         {d2[1]}, [r0,:32], r1
        vst1.32         {d3[1]}, [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp, sp, #44+16
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel4_h4_neon, export=1
        sub             r2, r2, #1
        push            {r4,lr}

        ldr             r4, [sp, #12]           @ mx
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4, lr, r4, lsl #4
        vld1.16         {q0}, [r4,:128]
1:
        vld1.8          {d2}, [r2], r3
        vp8_epel8_h4    d2, d2, d2
        vst1.32         {d2[0]}, [r0,:32], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel4_v4_neon, export=1
        sub             r2, r2, r3
        push            {r4,lr}

        ldr             r4, [sp, #16]           @ my
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4, lr, r4, lsl #4
        vld1.16         {q0}, [r4,:128]
1:
        vld1.32         {d2[]}, [r2], r3
        vld1.32         {d3[]}, [r2], r3
        vld1.32         {d4[]}, [r2], r3
        vld1.32         {d5[]}, [r2], r3
        vld1.32         {d6[]}, [r2]
        sub             r2, r2, r3, lsl #1
        vld1.32         {d2[1]}, [r2], r3
        vld1.32         {d3[1]}, [r2], r3
        vld1.32         {d4[1]}, [r2], r3
        vld1.32         {d5[1]}, [r2], r3
        vld1.32         {d6[1]}, [r2]
        sub             r2, r2, r3, lsl #1

        vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6

        vst1.32         {d2[0]}, [r0,:32], r1
        vst1.32         {d3[0]}, [r0,:32], r1
        vst1.32         {d2[1]}, [r0,:32], r1
        vst1.32         {d3[1]}, [r0,:32], r1
        subs            r12, r12, #4
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel4_h4v4_neon, export=1
        sub             r2, r2, r3
        sub             r2, r2, #1
        push            {r4,lr}

        ldr             r4, [sp, #12]           @ mx
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4, lr, r4, lsl #4
        sub             sp, sp, #44+16
        vld1.16         {q0}, [r4,:128]
        add             lr, sp, #15
        add             r12, r12, #3
        bic             lr, lr, #15
1:
        vld1.8          {d2}, [r2], r3
        vp8_epel8_h4    d2, d2, d3
        vst1.32         {d2[0]}, [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        ldr             r4, [sp, #44+16+16]     @ my
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #44+16+8]     @ h
        add             r4, lr, r4, lsl #4
        add             lr, sp, #15
        vld1.16         {q0}, [r4,:128]
        bic             lr, lr, #15
2:
        vld1.8          {d2-d3}, [lr,:128]!
        vld1.32         {d6[]}, [lr,:32]
        sub             lr, lr, #8
        vld1.8          {d4-d5}, [lr]!
        vld1.32         {d6[1]}, [lr,:32]
        sub             lr, lr, #8
        vtrn.32         q1, q2

        vp8_epel8_v4_y2 d2, d3, d2, d4, d3, d5, d6

        vst1.32         {d2[0]}, [r0,:32], r1
        vst1.32         {d3[0]}, [r0,:32], r1
        vst1.32         {d2[1]}, [r0,:32], r1
        vst1.32         {d3[1]}, [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp, sp, #44+16
        pop             {r4,pc}
endfunc
@ note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit
@ arithmetic can be used to apply filters
const   subpel_filters, align=4
        .short     0,   6, 123,  12,   1,   0,   0,   0
        .short     2,  11, 108,  36,   8,   1,   0,   0
        .short     0,   9,  93,  50,   6,   0,   0,   0
        .short     3,  16,  77,  77,  16,   3,   0,   0
        .short     0,   6,  50,  93,   9,   0,   0,   0
        .short     1,   8,  36, 108,  11,   2,   0,   0
        .short     0,   1,  12, 123,   6,   0,   0,   0
endconst
/* Bilinear MC */
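@ Each bilinear stage computes (a * (8 - frac) + b * frac + 4) >> 3,
@ with frac = mx horizontally and my vertically, both in 0..7.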
function ff_put_vp8_bilin16_h_neon, export=1
        ldr             r3, [sp, #4]            @ mx
        rsb             r12, r3, #8
        vdup.8          d0, r3
        vdup.8          d1, r12
        ldr             r12, [sp]               @ h
1:
        subs            r12, r12, #2
        vld1.8          {d2-d4}, [r2], r1
        vext.8          q2, q1, q2, #1
        vmull.u8        q8, d2, d1
        vmlal.u8        q8, d4, d0
        vld1.8          {d18-d20}, [r2], r1
        vmull.u8        q3, d3, d1
        vmlal.u8        q3, d5, d0
        vext.8          q10, q9, q10, #1
        vmull.u8        q11, d18, d1
        vmlal.u8        q11, d20, d0
        vmull.u8        q12, d19, d1
        vmlal.u8        q12, d21, d0
        vrshrn.u16      d4, q8, #3
        vrshrn.u16      d5, q3, #3
        vrshrn.u16      d6, q11, #3
        vrshrn.u16      d7, q12, #3
        vst1.8          {q2}, [r0,:128], r1
        vst1.8          {q3}, [r0,:128], r1
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin16_v_neon, export=1
        ldr             r3, [sp, #8]            @ my
        rsb             r12, r3, #8
        vdup.8          d0, r3
        vdup.8          d1, r12
        ldr             r12, [sp]               @ h

        vld1.8          {q1}, [r2], r1
1:
        subs            r12, r12, #2
        vld1.8          {q2}, [r2], r1
        vmull.u8        q3, d2, d1
        vmlal.u8        q3, d4, d0
        vmull.u8        q8, d3, d1
        vmlal.u8        q8, d5, d0
        vld1.8          {q1}, [r2], r1
        vmull.u8        q9, d4, d1
        vmlal.u8        q9, d2, d0
        vmull.u8        q10, d5, d1
        vmlal.u8        q10, d3, d0
        vrshrn.u16      d4, q3, #3
        vrshrn.u16      d5, q8, #3
        vrshrn.u16      d6, q9, #3
        vrshrn.u16      d7, q10, #3
        vst1.8          {q2}, [r0,:128], r1
        vst1.8          {q3}, [r0,:128], r1
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin16_hv_neon, export=1
        ldr             r3, [sp, #4]            @ mx
        rsb             r12, r3, #8
        vdup.8          d0, r3
        vdup.8          d1, r12
        ldr             r3, [sp, #8]            @ my
        rsb             r12, r3, #8
        vdup.8          d2, r3
        vdup.8          d3, r12
        ldr             r12, [sp]               @ h

        vld1.8          {d4-d6}, [r2], r1
        vext.8          q3, q2, q3, #1
        vmull.u8        q8, d4, d1
        vmlal.u8        q8, d6, d0
        vmull.u8        q9, d5, d1
        vmlal.u8        q9, d7, d0
        vrshrn.u16      d4, q8, #3
        vrshrn.u16      d5, q9, #3
1:
        subs            r12, r12, #2
        vld1.8          {d18-d20}, [r2], r1
        vext.8          q10, q9, q10, #1
        vmull.u8        q11, d18, d1
        vmlal.u8        q11, d20, d0
        vld1.8          {d26-d28}, [r2], r1
        vmull.u8        q12, d19, d1
        vmlal.u8        q12, d21, d0
        vext.8          q14, q13, q14, #1
        vmull.u8        q8, d26, d1
        vmlal.u8        q8, d28, d0
        vmull.u8        q9, d27, d1
        vmlal.u8        q9, d29, d0
        vrshrn.u16      d6, q11, #3
        vrshrn.u16      d7, q12, #3
        vmull.u8        q12, d4, d3
        vmlal.u8        q12, d6, d2
        vmull.u8        q15, d5, d3
        vmlal.u8        q15, d7, d2
        vrshrn.u16      d4, q8, #3
        vrshrn.u16      d5, q9, #3
        vmull.u8        q10, d6, d3
        vmlal.u8        q10, d4, d2
        vmull.u8        q11, d7, d3
        vmlal.u8        q11, d5, d2
        vrshrn.u16      d24, q12, #3
        vrshrn.u16      d25, q15, #3
        vst1.8          {q12}, [r0,:128], r1
        vrshrn.u16      d20, q10, #3
        vrshrn.u16      d21, q11, #3
        vst1.8          {q10}, [r0,:128], r1
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin8_h_neon, export=1
        ldr             r3, [sp, #4]            @ mx
        rsb             r12, r3, #8
        vdup.8          d0, r3
        vdup.8          d1, r12
        ldr             r12, [sp]               @ h
1:
        subs            r12, r12, #2
        vld1.8          {q1}, [r2], r1
        vext.8          d3, d2, d3, #1
        vmull.u8        q2, d2, d1
        vmlal.u8        q2, d3, d0
        vld1.8          {q3}, [r2], r1
        vext.8          d7, d6, d7, #1
        vmull.u8        q8, d6, d1
        vmlal.u8        q8, d7, d0
        vrshrn.u16      d4, q2, #3
        vrshrn.u16      d16, q8, #3
        vst1.8          {d4}, [r0,:64], r1
        vst1.8          {d16}, [r0,:64], r1
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin8_v_neon, export=1
        ldr             r3, [sp, #8]            @ my
        rsb             r12, r3, #8
        vdup.8          d0, r3
        vdup.8          d1, r12
        ldr             r12, [sp]               @ h

        vld1.8          {d2}, [r2], r1
1:
        subs            r12, r12, #2
        vld1.8          {d3}, [r2], r1
        vmull.u8        q2, d2, d1
        vmlal.u8        q2, d3, d0
        vld1.8          {d2}, [r2], r1
        vmull.u8        q3, d3, d1
        vmlal.u8        q3, d2, d0
        vrshrn.u16      d4, q2, #3
        vrshrn.u16      d6, q3, #3
        vst1.8          {d4}, [r0,:64], r1
        vst1.8          {d6}, [r0,:64], r1
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin8_hv_neon, export=1
        ldr             r3, [sp, #4]            @ mx
        rsb             r12, r3, #8
        vdup.8          d0, r3
        vdup.8          d1, r12
        ldr             r3, [sp, #8]            @ my
        rsb             r12, r3, #8
        vdup.8          d2, r3
        vdup.8          d3, r12
        ldr             r12, [sp]               @ h

        vld1.8          {q2}, [r2], r1
        vext.8          d5, d4, d5, #1
        vmull.u8        q9, d4, d1
        vmlal.u8        q9, d5, d0
        vrshrn.u16      d22, q9, #3
1:
        subs            r12, r12, #2
        vld1.8          {q3}, [r2], r1
        vext.8          d7, d6, d7, #1
        vmull.u8        q8, d6, d1
        vmlal.u8        q8, d7, d0
        vld1.8          {q2}, [r2], r1
        vext.8          d5, d4, d5, #1
        vmull.u8        q9, d4, d1
        vmlal.u8        q9, d5, d0
        vrshrn.u16      d16, q8, #3
        vmull.u8        q10, d22, d3
        vmlal.u8        q10, d16, d2
        vrshrn.u16      d22, q9, #3
        vmull.u8        q12, d16, d3
        vmlal.u8        q12, d22, d2
        vrshrn.u16      d20, q10, #3
        vst1.8          {d20}, [r0,:64], r1
        vrshrn.u16      d23, q12, #3
        vst1.8          {d23}, [r0,:64], r1
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin4_h_neon, export=1
        ldr             r3, [sp, #4]            @ mx
        rsb             r12, r3, #8
        vdup.8          d0, r3
        vdup.8          d1, r12
        ldr             r12, [sp]               @ h
1:
        subs            r12, r12, #2
        vld1.8          {d2}, [r2], r1
        vext.8          d3, d2, d3, #1
        vld1.8          {d6}, [r2], r1
        vext.8          d7, d6, d7, #1
        vtrn.32         q1, q3
        vmull.u8        q2, d2, d1
        vmlal.u8        q2, d3, d0
        vrshrn.u16      d4, q2, #3
        vst1.32         {d4[0]}, [r0,:32], r1
        vst1.32         {d4[1]}, [r0,:32], r1
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin4_v_neon, export=1
        ldr             r3, [sp, #8]            @ my
        rsb             r12, r3, #8
        vdup.8          d0, r3
        vdup.8          d1, r12
        ldr             r12, [sp]               @ h

        vld1.32         {d2[]}, [r2], r1
1:
        vld1.32         {d3[]}, [r2]
        vld1.32         {d2[1]}, [r2], r1
        vld1.32         {d3[1]}, [r2], r1
        vmull.u8        q2, d2, d1
        vmlal.u8        q2, d3, d0
        vtrn.32         d3, d2
        vrshrn.u16      d4, q2, #3
        vst1.32         {d4[0]}, [r0,:32], r1
        vst1.32         {d4[1]}, [r0,:32], r1
        subs            r12, r12, #2
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin4_hv_neon, export=1
        ldr             r3, [sp, #4]            @ mx
        rsb             r12, r3, #8
        vdup.8          d0, r3
        vdup.8          d1, r12
        ldr             r3, [sp, #8]            @ my
        rsb             r12, r3, #8
        vdup.8          d2, r3
        vdup.8          d3, r12
        ldr             r12, [sp]               @ h

        vld1.8          {d4}, [r2], r1
        vext.8          d5, d4, d4, #1
        vmull.u8        q9, d4, d1
        vmlal.u8        q9, d5, d0
        vrshrn.u16      d22, q9, #3
1:
        subs            r12, r12, #2
        vld1.8          {d6}, [r2], r1
        vext.8          d7, d6, d6, #1
        vld1.8          {d4}, [r2], r1
        vext.8          d5, d4, d4, #1
        vtrn.32         q3, q2
        vmull.u8        q8, d6, d1
        vmlal.u8        q8, d7, d0
        vrshrn.u16      d16, q8, #3
        vmull.u8        q10, d16, d2
        vtrn.32         d22, d16
        vmlal.u8        q10, d22, d3
        vrev64.32       d22, d16
        vrshrn.u16      d20, q10, #3
        vst1.32         {d20[0]}, [r0,:32], r1
        vst1.32         {d20[1]}, [r0,:32], r1
        bgt             1b

        bx              lr
endfunc