/**
 * VP8 NEON optimisations
 *
 * Copyright (c) 2010 Rob Clark <rob@ti.com>
 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"
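
@ Inverse Walsh-Hadamard transform of the luma DC coefficients in r1.
@ The 16 results are scattered back to the DC position of each 4x4
@ block in the coefficient buffer at r0, hence the stride of 32 bytes
@ (16 int16_t coefficients) between the stores below.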

function ff_vp8_luma_dc_wht_neon, export=1
        vld1.16 {q0-q1}, [r1,:128]
        vmov.i16 q15, #0
        vadd.i16 d4, d0, d3
        vadd.i16 d6, d1, d2
        vst1.16 {q15}, [r1,:128]!
        vsub.i16 d7, d1, d2
        vsub.i16 d5, d0, d3
        vst1.16 {q15}, [r1,:128]
        vadd.i16 q0, q2, q3
        vsub.i16 q1, q2, q3
        vmov.i16 q8, #3
        vtrn.32 d0, d2
        vtrn.32 d1, d3
        vtrn.16 d0, d1
        vtrn.16 d2, d3
        vadd.i16 d0, d0, d16
        vadd.i16 d4, d0, d3
        vadd.i16 d6, d1, d2
        vsub.i16 d7, d1, d2
        vsub.i16 d5, d0, d3
        vadd.i16 q0, q2, q3
        vsub.i16 q1, q2, q3
        vshr.s16 q0, q0, #3
        vshr.s16 q1, q1, #3
        mov r3, #32
        vst1.16 {d0[0]}, [r0,:16], r3
        vst1.16 {d1[0]}, [r0,:16], r3
        vst1.16 {d2[0]}, [r0,:16], r3
        vst1.16 {d3[0]}, [r0,:16], r3
        vst1.16 {d0[1]}, [r0,:16], r3
        vst1.16 {d1[1]}, [r0,:16], r3
        vst1.16 {d2[1]}, [r0,:16], r3
        vst1.16 {d3[1]}, [r0,:16], r3
        vst1.16 {d0[2]}, [r0,:16], r3
        vst1.16 {d1[2]}, [r0,:16], r3
        vst1.16 {d2[2]}, [r0,:16], r3
        vst1.16 {d3[2]}, [r0,:16], r3
        vst1.16 {d0[3]}, [r0,:16], r3
        vst1.16 {d1[3]}, [r0,:16], r3
        vst1.16 {d2[3]}, [r0,:16], r3
        vst1.16 {d3[3]}, [r0,:16], r3
        bx lr
endfunc
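
@ The two multipliers below are the VP8 transform constants (20091 and
@ 35468 in Q16, i.e. cospi8sqrt2minus1 and sinpi8sqrt2 in the reference
@ decoder). 35468 is halved because vqdmulh doubles the product; the
@ 20091 term gets its >>16 from vshrn #16 followed by an add of the
@ original value.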

function ff_vp8_idct_add_neon, export=1
        vld1.16 {q0-q1}, [r1,:128]
        movw r3, #20091
        movt r3, #35468/2
        vdup.32 d4, r3

        vmull.s16 q12, d1, d4[0]
        vmull.s16 q13, d3, d4[0]
        vqdmulh.s16 d20, d1, d4[1]
        vqdmulh.s16 d23, d3, d4[1]
        vshrn.s32 d21, q12, #16
        vshrn.s32 d22, q13, #16
        vadd.s16 d21, d21, d1
        vadd.s16 d22, d22, d3

        vadd.s16 d16, d0, d2
        vsub.s16 d17, d0, d2
        vadd.s16 d18, d21, d23
        vsub.s16 d19, d20, d22
        vadd.s16 q0, q8, q9
        vsub.s16 q1, q8, q9

        vtrn.32 d0, d3
        vtrn.32 d1, d2
        vtrn.16 d0, d1
        vtrn.16 d3, d2

        vmov.i16 q15, #0
        vmull.s16 q12, d1, d4[0]
        vst1.16 {q15}, [r1,:128]!
        vmull.s16 q13, d2, d4[0]
        vst1.16 {q15}, [r1,:128]
        vqdmulh.s16 d21, d1, d4[1]
        vqdmulh.s16 d23, d2, d4[1]
        vshrn.s32 d20, q12, #16
        vshrn.s32 d22, q13, #16
        vadd.i16 d20, d20, d1
        vadd.i16 d22, d22, d2

        vadd.i16 d16, d0, d3
        vsub.i16 d17, d0, d3
        vadd.i16 d18, d20, d23
        vld1.32 {d20[]}, [r0,:32], r2
        vsub.i16 d19, d21, d22
        vld1.32 {d22[]}, [r0,:32], r2
        vadd.s16 q0, q8, q9
        vld1.32 {d23[]}, [r0,:32], r2
        vsub.s16 q1, q8, q9
        vld1.32 {d21[]}, [r0,:32], r2
        vrshr.s16 q0, q0, #3
        vtrn.32 q10, q11
        vrshr.s16 q1, q1, #3

        sub r0, r0, r2, lsl #2

        vtrn.32 d0, d3
        vtrn.32 d1, d2
        vtrn.16 d0, d1
        vtrn.16 d3, d2

        vaddw.u8 q0, q0, d20
        vaddw.u8 q1, q1, d21
        vqmovun.s16 d0, q0
        vqmovun.s16 d1, q1

        vst1.32 {d0[0]}, [r0,:32], r2
        vst1.32 {d0[1]}, [r0,:32], r2
        vst1.32 {d1[1]}, [r0,:32], r2
        vst1.32 {d1[0]}, [r0,:32], r2

        bx lr
endfunc

function ff_vp8_idct_dc_add_neon, export=1
        mov r3, #0
        ldrsh r12, [r1]
        strh r3, [r1]
        vdup.16 q1, r12
        vrshr.s16 q1, q1, #3
        vld1.32 {d0[]}, [r0,:32], r2
        vld1.32 {d1[]}, [r0,:32], r2
        vld1.32 {d0[1]}, [r0,:32], r2
        vld1.32 {d1[1]}, [r0,:32], r2
        vaddw.u8 q2, q1, d0
        vaddw.u8 q3, q1, d1
        sub r0, r0, r2, lsl #2
        vqmovun.s16 d0, q2
        vqmovun.s16 d1, q3
        vst1.32 {d0[0]}, [r0,:32], r2
        vst1.32 {d1[0]}, [r0,:32], r2
        vst1.32 {d0[1]}, [r0,:32], r2
        vst1.32 {d1[1]}, [r0,:32], r2
        bx lr
endfunc

function ff_vp8_idct_dc_add4uv_neon, export=1
        vmov.i16 d0, #0
        mov r3, #32
        vld1.16 {d16[]}, [r1,:16]
        vst1.16 {d0[0]}, [r1,:16], r3
        vld1.16 {d17[]}, [r1,:16]
        vst1.16 {d0[0]}, [r1,:16], r3
        vld1.16 {d18[]}, [r1,:16]
        vst1.16 {d0[0]}, [r1,:16], r3
        vld1.16 {d19[]}, [r1,:16]
        vst1.16 {d0[0]}, [r1,:16], r3
        mov r3, r0
        vrshr.s16 q8, q8, #3            @ dc >>= 3
        vld1.8 {d0}, [r0,:64], r2
        vrshr.s16 q9, q9, #3
        vld1.8 {d1}, [r0,:64], r2
        vaddw.u8 q10, q8, d0
        vld1.8 {d2}, [r0,:64], r2
        vaddw.u8 q0, q8, d1
        vld1.8 {d3}, [r0,:64], r2
        vaddw.u8 q11, q8, d2
        vld1.8 {d4}, [r0,:64], r2
        vaddw.u8 q1, q8, d3
        vld1.8 {d5}, [r0,:64], r2
        vaddw.u8 q12, q9, d4
        vld1.8 {d6}, [r0,:64], r2
        vaddw.u8 q2, q9, d5
        vld1.8 {d7}, [r0,:64], r2
        vaddw.u8 q13, q9, d6
        vqmovun.s16 d20, q10
        vaddw.u8 q3, q9, d7
        vqmovun.s16 d21, q0
        vqmovun.s16 d22, q11
        vst1.8 {d20}, [r3,:64], r2
        vqmovun.s16 d23, q1
        vst1.8 {d21}, [r3,:64], r2
        vqmovun.s16 d24, q12
        vst1.8 {d22}, [r3,:64], r2
        vqmovun.s16 d25, q2
        vst1.8 {d23}, [r3,:64], r2
        vqmovun.s16 d26, q13
        vst1.8 {d24}, [r3,:64], r2
        vqmovun.s16 d27, q3
        vst1.8 {d25}, [r3,:64], r2
        vst1.8 {d26}, [r3,:64], r2
        vst1.8 {d27}, [r3,:64], r2
        bx lr
endfunc

function ff_vp8_idct_dc_add4y_neon, export=1
        vmov.i16 d0, #0
        mov r3, #32
        vld1.16 {d16[]}, [r1,:16]
        vst1.16 {d0[0]}, [r1,:16], r3
        vld1.16 {d17[]}, [r1,:16]
        vst1.16 {d0[0]}, [r1,:16], r3
        vld1.16 {d18[]}, [r1,:16]
        vst1.16 {d0[0]}, [r1,:16], r3
        vld1.16 {d19[]}, [r1,:16]
        vst1.16 {d0[0]}, [r1,:16], r3
        vrshr.s16 q8, q8, #3            @ dc >>= 3
        vld1.8 {q0}, [r0,:128], r2
        vrshr.s16 q9, q9, #3
        vld1.8 {q1}, [r0,:128], r2
        vaddw.u8 q10, q8, d0
        vld1.8 {q2}, [r0,:128], r2
        vaddw.u8 q0, q9, d1
        vld1.8 {q3}, [r0,:128], r2
        vaddw.u8 q11, q8, d2
        vaddw.u8 q1, q9, d3
        vaddw.u8 q12, q8, d4
        vaddw.u8 q2, q9, d5
        vaddw.u8 q13, q8, d6
        vaddw.u8 q3, q9, d7
        sub r0, r0, r2, lsl #2
        vqmovun.s16 d20, q10
        vqmovun.s16 d21, q0
        vqmovun.s16 d22, q11
        vqmovun.s16 d23, q1
        vqmovun.s16 d24, q12
        vst1.8 {q10}, [r0,:128], r2
        vqmovun.s16 d25, q2
        vst1.8 {q11}, [r0,:128], r2
        vqmovun.s16 d26, q13
        vst1.8 {q12}, [r0,:128], r2
        vqmovun.s16 d27, q3
        vst1.8 {q13}, [r0,:128], r2
        bx lr
endfunc

@ Register layout:
@   P3..Q3 -> q0..q7
@   flim_E -> q14
@   flim_I -> q15
@   hev_thresh -> r12
@
.macro vp8_loop_filter, inner=0, simple=0
.if \simple
        vabd.u8 q9, q3, q4              @ abs(P0-Q0)
        vabd.u8 q15, q2, q5             @ abs(P1-Q1)
        vqadd.u8 q9, q9, q9             @ abs(P0-Q0) * 2
        vshr.u8 q10, q15, #1            @ abs(P1-Q1) / 2
        vqadd.u8 q11, q9, q10           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        vmov.i8 q13, #0x80
        vcle.u8 q8, q11, q14            @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim
.else
        @ calculate hev and normal_limit:
        vabd.u8 q12, q2, q3             @ abs(P1-P0)
        vabd.u8 q13, q5, q4             @ abs(Q1-Q0)
        vabd.u8 q10, q0, q1             @ abs(P3-P2)
        vabd.u8 q11, q1, q2             @ abs(P2-P1)
        vcle.u8 q8, q12, q15            @ abs(P1-P0) <= flim_I
        vcle.u8 q9, q13, q15            @ abs(Q1-Q0) <= flim_I
        vcle.u8 q10, q10, q15           @ abs(P3-P2) <= flim_I
        vcle.u8 q11, q11, q15           @ abs(P2-P1) <= flim_I
        vand q8, q8, q9
        vabd.u8 q9, q7, q6              @ abs(Q3-Q2)
        vand q8, q8, q11
        vabd.u8 q11, q6, q5             @ abs(Q2-Q1)
        vand q8, q8, q10
        vcle.u8 q10, q9, q15            @ abs(Q3-Q2) <= flim_I
        vcle.u8 q11, q11, q15           @ abs(Q2-Q1) <= flim_I
        vabd.u8 q9, q3, q4              @ abs(P0-Q0)
        vabd.u8 q15, q2, q5             @ abs(P1-Q1)
        vand q8, q8, q10
        vqadd.u8 q9, q9, q9             @ abs(P0-Q0) * 2
        vand q8, q8, q11
        vshr.u8 q10, q15, #1            @ abs(P1-Q1) / 2
        vdup.8 q15, r12                 @ hev_thresh
        vqadd.u8 q11, q9, q10           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        vcgt.u8 q12, q12, q15           @ abs(P1-P0) > hev_thresh
        vcle.u8 q11, q11, q14           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E
        vcgt.u8 q14, q13, q15           @ abs(Q1-Q0) > hev_thresh
        vand q8, q8, q11
        vmov.i8 q13, #0x80
        vorr q9, q12, q14
.endif

        @ at this point:
        @   q8: normal_limit
        @   q9: hev

        @ convert to signed value:
        veor q3, q3, q13                @ PS0 = P0 ^ 0x80
        veor q4, q4, q13                @ QS0 = Q0 ^ 0x80

        vmov.i16 q12, #3
        vsubl.s8 q10, d8, d6            @ QS0 - PS0
        vsubl.s8 q11, d9, d7            @   (widened to 16bit)
        veor q2, q2, q13                @ PS1 = P1 ^ 0x80
        veor q5, q5, q13                @ QS1 = Q1 ^ 0x80
        vmul.i16 q10, q10, q12          @ w = 3 * (QS0 - PS0)
        vmul.i16 q11, q11, q12

        vqsub.s8 q12, q2, q5            @ clamp(PS1-QS1)
        vmov.i8 q14, #4
        vmov.i8 q15, #3
.if \inner
        vand q12, q12, q9               @ if(hev) w += clamp(PS1-QS1)
.endif
        vaddw.s8 q10, q10, d24          @ w += clamp(PS1-QS1)
        vaddw.s8 q11, q11, d25
        vqmovn.s16 d20, q10             @ narrow result back into q10
        vqmovn.s16 d21, q11
.if !\inner && !\simple
        veor q1, q1, q13                @ PS2 = P2 ^ 0x80
        veor q6, q6, q13                @ QS2 = Q2 ^ 0x80
.endif
        vand q10, q10, q8               @ w &= normal_limit

        @ registers used at this point..
        @   q0 -> P3  (don't corrupt)
        @   q1-q6 -> PS2-QS2
        @   q7 -> Q3  (don't corrupt)
        @   q9 -> hev
        @   q10 -> w
        @   q13 -> #0x80
        @   q14 -> #4
        @   q15 -> #3
        @   q8, q11, q12 -> unused

        @ filter_common: is4tap==1
        @   c1 = clamp(w + 4) >> 3;
        @   c2 = clamp(w + 3) >> 3;
        @   Q0 = s2u(QS0 - c1);
        @   P0 = s2u(PS0 + c2);
.if \simple
        vqadd.s8 q11, q10, q14          @ c1 = clamp((w&hev)+4)
        vqadd.s8 q12, q10, q15          @ c2 = clamp((w&hev)+3)
        vshr.s8 q11, q11, #3            @ c1 >>= 3
        vshr.s8 q12, q12, #3            @ c2 >>= 3
        vqsub.s8 q4, q4, q11            @ QS0 = clamp(QS0-c1)
        vqadd.s8 q3, q3, q12            @ PS0 = clamp(PS0+c2)
        veor q4, q4, q13                @ Q0 = QS0 ^ 0x80
        veor q3, q3, q13                @ P0 = PS0 ^ 0x80
        veor q5, q5, q13                @ Q1 = QS1 ^ 0x80
        veor q2, q2, q13                @ P1 = PS1 ^ 0x80
.elseif \inner
        @ the !is4tap case of filter_common, only used for inner blocks
        @   c3 = ((c1&~hev) + 1) >> 1;
        @   Q1 = s2u(QS1 - c3);
        @   P1 = s2u(PS1 + c3);
        vqadd.s8 q11, q10, q14          @ c1 = clamp((w&hev)+4)
        vqadd.s8 q12, q10, q15          @ c2 = clamp((w&hev)+3)
        vshr.s8 q11, q11, #3            @ c1 >>= 3
        vshr.s8 q12, q12, #3            @ c2 >>= 3
        vqsub.s8 q4, q4, q11            @ QS0 = clamp(QS0-c1)
        vqadd.s8 q3, q3, q12            @ PS0 = clamp(PS0+c2)
        vbic q11, q11, q9               @ c1 & ~hev
        veor q4, q4, q13                @ Q0 = QS0 ^ 0x80
        vrshr.s8 q11, q11, #1           @ c3 >>= 1
        veor q3, q3, q13                @ P0 = PS0 ^ 0x80
        vqsub.s8 q5, q5, q11            @ QS1 = clamp(QS1-c3)
        vqadd.s8 q2, q2, q11            @ PS1 = clamp(PS1+c3)
        veor q5, q5, q13                @ Q1 = QS1 ^ 0x80
        veor q2, q2, q13                @ P1 = PS1 ^ 0x80
.else
        vand q12, q10, q9               @ w & hev
        vqadd.s8 q11, q12, q14          @ c1 = clamp((w&hev)+4)
        vqadd.s8 q12, q12, q15          @ c2 = clamp((w&hev)+3)
        vshr.s8 q11, q11, #3            @ c1 >>= 3
        vshr.s8 q12, q12, #3            @ c2 >>= 3
        vbic q10, q10, q9               @ w &= ~hev
        vqsub.s8 q4, q4, q11            @ QS0 = clamp(QS0-c1)
        vqadd.s8 q3, q3, q12            @ PS0 = clamp(PS0+c2)

        @ filter_mbedge:
        @   a = clamp((27*w + 63) >> 7);
        @   Q0 = s2u(QS0 - a);
        @   P0 = s2u(PS0 + a);
        @   a = clamp((18*w + 63) >> 7);
        @   Q1 = s2u(QS1 - a);
        @   P1 = s2u(PS1 + a);
        @   a = clamp((9*w + 63) >> 7);
        @   Q2 = s2u(QS2 - a);
        @   P2 = s2u(PS2 + a);
        vmov.i16 q9, #63
        vshll.s8 q14, d20, #3
        vshll.s8 q15, d21, #3
        vaddw.s8 q14, q14, d20
        vaddw.s8 q15, q15, d21
        vadd.s16 q8, q9, q14
        vadd.s16 q9, q9, q15            @  9*w + 63
        vadd.s16 q11, q8, q14
        vadd.s16 q12, q9, q15           @ 18*w + 63
        vadd.s16 q14, q11, q14
        vadd.s16 q15, q12, q15          @ 27*w + 63
        vqshrn.s16 d16, q8, #7
        vqshrn.s16 d17, q9, #7          @ clamp(( 9*w + 63)>>7)
        vqshrn.s16 d22, q11, #7
        vqshrn.s16 d23, q12, #7         @ clamp((18*w + 63)>>7)
        vqshrn.s16 d28, q14, #7
        vqshrn.s16 d29, q15, #7         @ clamp((27*w + 63)>>7)
        vqadd.s8 q1, q1, q8             @ PS2 = clamp(PS2+a)
        vqsub.s8 q6, q6, q8             @ QS2 = clamp(QS2-a)
        vqadd.s8 q2, q2, q11            @ PS1 = clamp(PS1+a)
        vqsub.s8 q5, q5, q11            @ QS1 = clamp(QS1-a)
        vqadd.s8 q3, q3, q14            @ PS0 = clamp(PS0+a)
        vqsub.s8 q4, q4, q14            @ QS0 = clamp(QS0-a)
        veor q3, q3, q13                @ P0 = PS0 ^ 0x80
        veor q4, q4, q13                @ Q0 = QS0 ^ 0x80
        veor q2, q2, q13                @ P1 = PS1 ^ 0x80
        veor q5, q5, q13                @ Q1 = QS1 ^ 0x80
        veor q1, q1, q13                @ P2 = PS2 ^ 0x80
        veor q6, q6, q13                @ Q2 = QS2 ^ 0x80
.endif
.endm
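
@ Transpose the 8x16 tile held in q0-q7 (loaded as 16 rows of 8 bytes)
@ so that each q register ends up holding one pixel column (P3..Q3)
@ across all 16 rows, using 32-, 16- and 8-bit vtrn steps; applying the
@ macro a second time restores the original row layout.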

.macro transpose8x16matrix
        vtrn.32 q0, q4
        vtrn.32 q1, q5
        vtrn.32 q2, q6
        vtrn.32 q3, q7

        vtrn.16 q0, q2
        vtrn.16 q1, q3
        vtrn.16 q4, q6
        vtrn.16 q5, q7

        vtrn.8 q0, q1
        vtrn.8 q2, q3
        vtrn.8 q4, q5
        vtrn.8 q6, q7
.endm
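
@ Arguments, as used below: r0 = dst, r1 = stride, r2 = flim_E,
@ r3 = flim_I, with hev_thresh on the stack (read from sp+64 once
@ q4-q7 have been pushed); the simple filter only needs flim_E.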

.macro vp8_v_loop_filter16 name, inner=0, simple=0
function ff_vp8_v_loop_filter16\name\()_neon, export=1
        vpush {q4-q7}
        sub r0, r0, r1, lsl #1+!\simple

        @ Load pixels:
.if !\simple
        ldr r12, [sp, #64]              @ hev_thresh
        vld1.8 {q0}, [r0,:128], r1      @ P3
        vld1.8 {q1}, [r0,:128], r1      @ P2
.endif
        vld1.8 {q2}, [r0,:128], r1      @ P1
        vld1.8 {q3}, [r0,:128], r1      @ P0
        vld1.8 {q4}, [r0,:128], r1      @ Q0
        vld1.8 {q5}, [r0,:128], r1      @ Q1
.if !\simple
        vld1.8 {q6}, [r0,:128], r1      @ Q2
        vld1.8 {q7}, [r0,:128]          @ Q3
        vdup.8 q15, r3                  @ flim_I
.endif
        vdup.8 q14, r2                  @ flim_E

        vp8_loop_filter inner=\inner, simple=\simple

        @ back up to P2: dst -= stride * 6
        sub r0, r0, r1, lsl #2
.if !\simple
        sub r0, r0, r1, lsl #1

        @ Store pixels:
        vst1.8 {q1}, [r0,:128], r1      @ P2
.endif
        vst1.8 {q2}, [r0,:128], r1      @ P1
        vst1.8 {q3}, [r0,:128], r1      @ P0
        vst1.8 {q4}, [r0,:128], r1      @ Q0
        vst1.8 {q5}, [r0,:128], r1      @ Q1
.if !\simple
        vst1.8 {q6}, [r0,:128]          @ Q2
.endif
        vpop {q4-q7}
        bx lr
endfunc
.endm

vp8_v_loop_filter16
vp8_v_loop_filter16 _inner,  inner=1
vp8_v_loop_filter16 _simple, simple=1

.macro vp8_v_loop_filter8uv name, inner=0
function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
        vpush {q4-q7}
        sub r0, r0, r2, lsl #2
        sub r1, r1, r2, lsl #2
        ldr r12, [sp, #64]              @ flim_I

        @ Load pixels:
        vld1.8 {d0}, [r0,:64], r2       @ P3
        vld1.8 {d1}, [r1,:64], r2       @ P3
        vld1.8 {d2}, [r0,:64], r2       @ P2
        vld1.8 {d3}, [r1,:64], r2       @ P2
        vld1.8 {d4}, [r0,:64], r2       @ P1
        vld1.8 {d5}, [r1,:64], r2       @ P1
        vld1.8 {d6}, [r0,:64], r2       @ P0
        vld1.8 {d7}, [r1,:64], r2       @ P0
        vld1.8 {d8}, [r0,:64], r2       @ Q0
        vld1.8 {d9}, [r1,:64], r2       @ Q0
        vld1.8 {d10}, [r0,:64], r2      @ Q1
        vld1.8 {d11}, [r1,:64], r2      @ Q1
        vld1.8 {d12}, [r0,:64], r2      @ Q2
        vld1.8 {d13}, [r1,:64], r2      @ Q2
        vld1.8 {d14}, [r0,:64]          @ Q3
        vld1.8 {d15}, [r1,:64]          @ Q3

        vdup.8 q14, r3                  @ flim_E
        vdup.8 q15, r12                 @ flim_I
        ldr r12, [sp, #68]              @ hev_thresh

        vp8_loop_filter inner=\inner

        @ back up to P2: u,v -= stride * 6
        sub r0, r0, r2, lsl #2
        sub r1, r1, r2, lsl #2
        sub r0, r0, r2, lsl #1
        sub r1, r1, r2, lsl #1

        @ Store pixels:
        vst1.8 {d2}, [r0,:64], r2       @ P2
        vst1.8 {d3}, [r1,:64], r2       @ P2
        vst1.8 {d4}, [r0,:64], r2       @ P1
        vst1.8 {d5}, [r1,:64], r2       @ P1
        vst1.8 {d6}, [r0,:64], r2       @ P0
        vst1.8 {d7}, [r1,:64], r2       @ P0
        vst1.8 {d8}, [r0,:64], r2       @ Q0
        vst1.8 {d9}, [r1,:64], r2       @ Q0
        vst1.8 {d10}, [r0,:64], r2      @ Q1
        vst1.8 {d11}, [r1,:64], r2      @ Q1
        vst1.8 {d12}, [r0,:64]          @ Q2
        vst1.8 {d13}, [r1,:64]          @ Q2

        vpop {q4-q7}
        bx lr
endfunc
.endm

vp8_v_loop_filter8uv
vp8_v_loop_filter8uv _inner, inner=1

.macro vp8_h_loop_filter16 name, inner=0, simple=0
function ff_vp8_h_loop_filter16\name\()_neon, export=1
        vpush {q4-q7}
        sub r0, r0, #4
.if !\simple
        ldr r12, [sp, #64]              @ hev_thresh
.endif

        @ Load pixels:
        vld1.8 {d0}, [r0], r1           @ load first 8-line src data
        vld1.8 {d2}, [r0], r1
        vld1.8 {d4}, [r0], r1
        vld1.8 {d6}, [r0], r1
        vld1.8 {d8}, [r0], r1
        vld1.8 {d10}, [r0], r1
        vld1.8 {d12}, [r0], r1
        vld1.8 {d14}, [r0], r1
        vld1.8 {d1}, [r0], r1           @ load second 8-line src data
        vld1.8 {d3}, [r0], r1
        vld1.8 {d5}, [r0], r1
        vld1.8 {d7}, [r0], r1
        vld1.8 {d9}, [r0], r1
        vld1.8 {d11}, [r0], r1
        vld1.8 {d13}, [r0], r1
        vld1.8 {d15}, [r0], r1

        transpose8x16matrix

        vdup.8 q14, r2                  @ flim_E
.if !\simple
        vdup.8 q15, r3                  @ flim_I
.endif

        vp8_loop_filter inner=\inner, simple=\simple

        sub r0, r0, r1, lsl #4          @ backup 16 rows

        transpose8x16matrix

        @ Store pixels:
        vst1.8 {d0}, [r0], r1
        vst1.8 {d2}, [r0], r1
        vst1.8 {d4}, [r0], r1
        vst1.8 {d6}, [r0], r1
        vst1.8 {d8}, [r0], r1
        vst1.8 {d10}, [r0], r1
        vst1.8 {d12}, [r0], r1
        vst1.8 {d14}, [r0], r1
        vst1.8 {d1}, [r0], r1
        vst1.8 {d3}, [r0], r1
        vst1.8 {d5}, [r0], r1
        vst1.8 {d7}, [r0], r1
        vst1.8 {d9}, [r0], r1
        vst1.8 {d11}, [r0], r1
        vst1.8 {d13}, [r0], r1
        vst1.8 {d15}, [r0]

        vpop {q4-q7}
        bx lr
endfunc
.endm

vp8_h_loop_filter16
vp8_h_loop_filter16 _inner,  inner=1
vp8_h_loop_filter16 _simple, simple=1

.macro vp8_h_loop_filter8uv name, inner=0
function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
        vpush {q4-q7}
        sub r0, r0, #4
        sub r1, r1, #4
        ldr r12, [sp, #64]              @ flim_I

        @ Load pixels:
        vld1.8 {d0}, [r0], r2           @ load u
        vld1.8 {d1}, [r1], r2           @ load v
        vld1.8 {d2}, [r0], r2
        vld1.8 {d3}, [r1], r2
        vld1.8 {d4}, [r0], r2
        vld1.8 {d5}, [r1], r2
        vld1.8 {d6}, [r0], r2
        vld1.8 {d7}, [r1], r2
        vld1.8 {d8}, [r0], r2
        vld1.8 {d9}, [r1], r2
        vld1.8 {d10}, [r0], r2
        vld1.8 {d11}, [r1], r2
        vld1.8 {d12}, [r0], r2
        vld1.8 {d13}, [r1], r2
        vld1.8 {d14}, [r0], r2
        vld1.8 {d15}, [r1], r2

        transpose8x16matrix

        vdup.8 q14, r3                  @ flim_E
        vdup.8 q15, r12                 @ flim_I
        ldr r12, [sp, #68]              @ hev_thresh

        vp8_loop_filter inner=\inner

        sub r0, r0, r2, lsl #3          @ backup u 8 rows
        sub r1, r1, r2, lsl #3          @ backup v 8 rows

        transpose8x16matrix

        @ Store pixels:
        vst1.8 {d0}, [r0], r2
        vst1.8 {d1}, [r1], r2
        vst1.8 {d2}, [r0], r2
        vst1.8 {d3}, [r1], r2
        vst1.8 {d4}, [r0], r2
        vst1.8 {d5}, [r1], r2
        vst1.8 {d6}, [r0], r2
        vst1.8 {d7}, [r1], r2
        vst1.8 {d8}, [r0], r2
        vst1.8 {d9}, [r1], r2
        vst1.8 {d10}, [r0], r2
        vst1.8 {d11}, [r1], r2
        vst1.8 {d12}, [r0], r2
        vst1.8 {d13}, [r1], r2
        vst1.8 {d14}, [r0]
        vst1.8 {d15}, [r1]

        vpop {q4-q7}
        bx lr
endfunc
.endm

vp8_h_loop_filter8uv
vp8_h_loop_filter8uv _inner, inner=1

function ff_put_vp8_pixels16_neon, export=1
        ldr r12, [sp, #0]               @ h
1:
        subs r12, r12, #4
        vld1.8 {q0}, [r2], r3
        vld1.8 {q1}, [r2], r3
        vld1.8 {q2}, [r2], r3
        vld1.8 {q3}, [r2], r3
        vst1.8 {q0}, [r0,:128], r1
        vst1.8 {q1}, [r0,:128], r1
        vst1.8 {q2}, [r0,:128], r1
        vst1.8 {q3}, [r0,:128], r1
        bgt 1b
        bx lr
endfunc

function ff_put_vp8_pixels8_neon, export=1
        ldr r12, [sp, #0]               @ h
1:
        subs r12, r12, #4
        vld1.8 {d0}, [r2], r3
        vld1.8 {d1}, [r2], r3
        vld1.8 {d2}, [r2], r3
        vld1.8 {d3}, [r2], r3
        vst1.8 {d0}, [r0,:64], r1
        vst1.8 {d1}, [r0,:64], r1
        vst1.8 {d2}, [r0,:64], r1
        vst1.8 {d3}, [r0,:64], r1
        bgt 1b
        bx lr
endfunc

/* 4/6-tap 8th-pel MC */
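
@ The taps in subpel_filters below are stored as magnitudes; the VP8
@ sign pattern (+, -, +, +, -, +) is applied in the macros: taps 2 and
@ 3 start the accumulators with vmul, taps 1 and 4 are subtracted with
@ vmls, and taps 0 and 5 are added with vmla.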

.macro vp8_epel8_h6 d, a, b
        vext.8 d27, \a, \b, #1
        vmovl.u8 q8, \a
        vext.8 d28, \a, \b, #2
        vmovl.u8 q9, d27
        vext.8 d29, \a, \b, #3
        vmovl.u8 q10, d28
        vext.8 d30, \a, \b, #4
        vmovl.u8 q11, d29
        vext.8 d31, \a, \b, #5
        vmovl.u8 q12, d30
        vmul.u16 q10, q10, d0[2]
        vmovl.u8 q13, d31
        vmul.u16 q11, q11, d0[3]
        vmls.u16 q10, q9, d0[1]
        vmls.u16 q11, q12, d1[0]
        vmla.u16 q10, q8, d0[0]
        vmla.u16 q11, q13, d1[1]
        vqadd.s16 q11, q10, q11
        vqrshrun.s16 \d, q11, #7
.endm

.macro vp8_epel16_h6 d0, d1, s0, s1, s2, q0, q1
        vext.8 q14, \q0, \q1, #3
        vext.8 q15, \q0, \q1, #4
        vmovl.u8 q11, d28
        vmovl.u8 q14, d29
        vext.8 q3, \q0, \q1, #2
        vmovl.u8 q12, d30
        vmovl.u8 q15, d31
        vext.8 q8, \q0, \q1, #1
        vmovl.u8 q10, d6
        vmovl.u8 q3, d7
        vext.8 q2, \q0, \q1, #5
        vmovl.u8 q13, d4
        vmovl.u8 q2, d5
        vmovl.u8 q9, d16
        vmovl.u8 q8, d17
        vmul.u16 q11, q11, d0[3]
        vmul.u16 q10, q10, d0[2]
        vmul.u16 q3, q3, d0[2]
        vmul.u16 q14, q14, d0[3]
        vmls.u16 q11, q12, d1[0]
        vmovl.u8 q12, \s0
        vmovl.u8 q1, \s1
        vmls.u16 q10, q9, d0[1]
        vmls.u16 q3, q8, d0[1]
        vmls.u16 q14, q15, d1[0]
        vmla.u16 q10, q12, d0[0]
        vmla.u16 q11, q13, d1[1]
        vmla.u16 q3, q1, d0[0]
        vmla.u16 q14, q2, d1[1]
        vqadd.s16 q11, q10, q11
        vqadd.s16 q14, q3, q14
        vqrshrun.s16 \d0, q11, #7
        vqrshrun.s16 \d1, q14, #7
.endm

.macro vp8_epel8_v6 d0, s0, s1, s2, s3, s4, s5
        vmovl.u8 q10, \s2
        vmovl.u8 q11, \s3
        vmovl.u8 q9, \s1
        vmovl.u8 q12, \s4
        vmovl.u8 q8, \s0
        vmovl.u8 q13, \s5
        vmul.u16 q10, q10, d0[2]
        vmul.u16 q11, q11, d0[3]
        vmls.u16 q10, q9, d0[1]
        vmls.u16 q11, q12, d1[0]
        vmla.u16 q10, q8, d0[0]
        vmla.u16 q11, q13, d1[1]
        vqadd.s16 q11, q10, q11
        vqrshrun.s16 \d0, q11, #7
.endm

.macro vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6
        vmovl.u8 q10, \s0
        vmovl.u8 q11, \s3
        vmovl.u8 q14, \s6
        vmovl.u8 q9, \s1
        vmovl.u8 q12, \s4
        vmovl.u8 q8, \s2
        vmovl.u8 q13, \s5
        vmul.u16 q10, q10, d0[0]
        vmul.u16 q15, q11, d0[3]
        vmul.u16 q11, q11, d0[2]
        vmul.u16 q14, q14, d1[1]
        vmls.u16 q10, q9, d0[1]
        vmls.u16 q15, q12, d1[0]
        vmls.u16 q11, q8, d0[1]
        vmls.u16 q14, q13, d1[0]
        vmla.u16 q10, q8, d0[2]
        vmla.u16 q15, q13, d1[1]
        vmla.u16 q11, q9, d0[0]
        vmla.u16 q14, q12, d0[3]
        vqadd.s16 q15, q10, q15
        vqadd.s16 q14, q11, q14
        vqrshrun.s16 \d0, q15, #7
        vqrshrun.s16 \d1, q14, #7
.endm

.macro vp8_epel8_h4 d, a, b
        vext.8 d28, \a, \b, #1
        vmovl.u8 q9, \a
        vext.8 d29, \a, \b, #2
        vmovl.u8 q10, d28
        vext.8 d30, \a, \b, #3
        vmovl.u8 q11, d29
        vmovl.u8 q12, d30
        vmul.u16 q10, q10, d0[2]
        vmul.u16 q11, q11, d0[3]
        vmls.u16 q10, q9, d0[1]
        vmls.u16 q11, q12, d1[0]
        vqadd.s16 q11, q10, q11
        vqrshrun.s16 \d, q11, #7
.endm

.macro vp8_epel8_v4_y2 d0, d1, s0, s1, s2, s3, s4
        vmovl.u8 q9, \s0
        vmovl.u8 q10, \s1
        vmovl.u8 q11, \s2
        vmovl.u8 q12, \s3
        vmovl.u8 q13, \s4
        vmul.u16 q8, q10, d0[2]
        vmul.u16 q14, q11, d0[3]
        vmul.u16 q11, q11, d0[2]
        vmul.u16 q15, q12, d0[3]
        vmls.u16 q8, q9, d0[1]
        vmls.u16 q14, q12, d1[0]
        vmls.u16 q11, q10, d0[1]
        vmls.u16 q15, q13, d1[0]
        vqadd.s16 q8, q8, q14
        vqadd.s16 q11, q11, q15
        vqrshrun.s16 \d0, q8, #7
        vqrshrun.s16 \d1, q11, #7
.endm
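
@ Each subpel_filters row is 8 .short values (16 bytes), so
@ "movrel lr, subpel_filters-16" plus "add r4, lr, r4, lsl #4" selects
@ the row for mx/my = 1..7; mx/my = 0 (no filtering) is expected to be
@ handled by the plain copy functions and never reach these.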

function ff_put_vp8_epel16_v6_neon, export=1
        sub r2, r2, r3, lsl #1
        push {r4,lr}
        vpush {d8-d15}

        ldr r4, [sp, #80]               @ my
        movrel lr, subpel_filters-16
        ldr r12, [sp, #72]              @ h
        add r4, lr, r4, lsl #4
        vld1.16 {q0}, [r4,:128]
1:
        vld1.8 {d2-d3}, [r2], r3
        vld1.8 {d4-d5}, [r2], r3
        vld1.8 {d6-d7}, [r2], r3
        vld1.8 {d8-d9}, [r2], r3
        vld1.8 {d10-d11}, [r2], r3
        vld1.8 {d12-d13}, [r2], r3
        vld1.8 {d14-d15}, [r2]
        sub r2, r2, r3, lsl #2

        vp8_epel8_v6_y2 d2, d4, d2, d4, d6, d8, d10, d12, d14
        vp8_epel8_v6_y2 d3, d5, d3, d5, d7, d9, d11, d13, d15

        vst1.8 {d2-d3}, [r0,:128], r1
        vst1.8 {d4-d5}, [r0,:128], r1
        subs r12, r12, #2
        bne 1b

        vpop {d8-d15}
        pop {r4,pc}
endfunc

function ff_put_vp8_epel16_h6_neon, export=1
        sub r2, r2, #2
        push {r4,lr}

        ldr r4, [sp, #12]               @ mx
        movrel lr, subpel_filters-16
        ldr r12, [sp, #8]               @ h
        add r4, lr, r4, lsl #4
        vld1.16 {q0}, [r4,:128]
1:
        vld1.8 {d2-d4}, [r2], r3

        vp8_epel16_h6 d2, d3, d2, d3, d4, q1, q2

        vst1.8 {d2-d3}, [r0,:128], r1
        subs r12, r12, #1
        bne 1b
        pop {r4,pc}
endfunc
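
@ Two-pass filters: the first pass filters h+5 rows (h+3 when the
@ vertical pass is 4-tap) horizontally into a 16-byte-aligned scratch
@ buffer on the stack, the second pass filters that buffer vertically
@ into the destination.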

function ff_put_vp8_epel16_h6v6_neon, export=1
        sub r2, r2, r3, lsl #1
        sub r2, r2, #2
        push {r4,lr}
        vpush {d8-d9}

        @ first pass (horizontal):
        ldr r4, [sp, #28]               @ mx
        movrel lr, subpel_filters-16
        ldr r12, [sp, #24]              @ h
        add r4, lr, r4, lsl #4
        sub sp, sp, #336+16
        vld1.16 {q0}, [r4,:128]
        add lr, sp, #15
        add r12, r12, #5
        bic lr, lr, #15
1:
        vld1.8 {d2,d3,d4}, [r2], r3
        vp8_epel16_h6 d2, d3, d2, d3, d4, q1, q2
        vst1.8 {d2-d3}, [lr,:128]!
        subs r12, r12, #1
        bne 1b

        @ second pass (vertical):
        ldr r4, [sp, #336+16+32]        @ my
        movrel lr, subpel_filters-16
        ldr r12, [sp, #336+16+24]       @ h
        add r4, lr, r4, lsl #4
        add lr, sp, #15
        vld1.16 {q0}, [r4,:128]
        bic lr, lr, #15
2:
        vld1.8 {d2-d5}, [lr,:128]!
        vld1.8 {d6-d9}, [lr,:128]!
        vld1.8 {d28-d31}, [lr,:128]
        sub lr, lr, #48

        vp8_epel8_v6 d2, d2, d4, d6, d8, d28, d30
        vp8_epel8_v6 d3, d3, d5, d7, d9, d29, d31

        vst1.8 {d2-d3}, [r0,:128], r1
        subs r12, r12, #1
        bne 2b

        add sp, sp, #336+16
        vpop {d8-d9}
        pop {r4,pc}
endfunc

function ff_put_vp8_epel8_v6_neon, export=1
        sub r2, r2, r3, lsl #1
        push {r4,lr}

        ldr r4, [sp, #16]               @ my
        movrel lr, subpel_filters-16
        ldr r12, [sp, #8]               @ h
        add r4, lr, r4, lsl #4
        vld1.16 {q0}, [r4,:128]
1:
        vld1.8 {d2}, [r2], r3
        vld1.8 {d3}, [r2], r3
        vld1.8 {d4}, [r2], r3
        vld1.8 {d5}, [r2], r3
        vld1.8 {d6}, [r2], r3
        vld1.8 {d7}, [r2], r3
        vld1.8 {d28}, [r2]
        sub r2, r2, r3, lsl #2

        vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d28

        vst1.8 {d2}, [r0,:64], r1
        vst1.8 {d3}, [r0,:64], r1
        subs r12, r12, #2
        bne 1b
        pop {r4,pc}
endfunc

function ff_put_vp8_epel8_h6_neon, export=1
        sub r2, r2, #2
        push {r4,lr}

        ldr r4, [sp, #12]               @ mx
        movrel lr, subpel_filters-16
        ldr r12, [sp, #8]               @ h
        add r4, lr, r4, lsl #4
        vld1.16 {q0}, [r4,:128]
1:
        vld1.8 {d2,d3}, [r2], r3

        vp8_epel8_h6 d2, d2, d3

        vst1.8 {d2}, [r0,:64], r1
        subs r12, r12, #1
        bne 1b
        pop {r4,pc}
endfunc

function ff_put_vp8_epel8_h6v6_neon, export=1
        sub r2, r2, r3, lsl #1
        sub r2, r2, #2
        push {r4,lr}

        @ first pass (horizontal):
        ldr r4, [sp, #12]               @ mx
        movrel lr, subpel_filters-16
        ldr r12, [sp, #8]               @ h
        add r4, lr, r4, lsl #4
        sub sp, sp, #168+16
        vld1.16 {q0}, [r4,:128]
        add lr, sp, #15
        add r12, r12, #5
        bic lr, lr, #15
1:
        vld1.8 {d2,d3}, [r2], r3
        vp8_epel8_h6 d2, d2, d3
        vst1.8 {d2}, [lr,:64]!
        subs r12, r12, #1
        bne 1b

        @ second pass (vertical):
        ldr r4, [sp, #168+16+16]        @ my
        movrel lr, subpel_filters-16
        ldr r12, [sp, #168+16+8]        @ h
        add r4, lr, r4, lsl #4
        add lr, sp, #15
        vld1.16 {q0}, [r4,:128]
        bic lr, lr, #15
2:
        vld1.8 {d2-d5}, [lr,:128]!
        vld1.8 {d6-d7}, [lr,:128]!
        vld1.8 {d30}, [lr,:64]
        sub lr, lr, #32

        vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d30

        vst1.8 {d2}, [r0,:64], r1
        vst1.8 {d3}, [r0,:64], r1
        subs r12, r12, #2
        bne 2b

        add sp, sp, #168+16
        pop {r4,pc}
endfunc

function ff_put_vp8_epel8_v4_neon, export=1
        sub r2, r2, r3
        push {r4,lr}

        ldr r4, [sp, #16]               @ my
        movrel lr, subpel_filters-16
        ldr r12, [sp, #8]               @ h
        add r4, lr, r4, lsl #4
        vld1.16 {q0}, [r4,:128]
1:
        vld1.8 {d2}, [r2], r3
        vld1.8 {d3}, [r2], r3
        vld1.8 {d4}, [r2], r3
        vld1.8 {d5}, [r2], r3
        vld1.8 {d6}, [r2]
        sub r2, r2, r3, lsl #1

        vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6

        vst1.8 {d2}, [r0,:64], r1
        vst1.8 {d3}, [r0,:64], r1
        subs r12, r12, #2
        bne 1b
        pop {r4,pc}
endfunc

function ff_put_vp8_epel8_h4_neon, export=1
        sub r2, r2, #1
        push {r4,lr}

        ldr r4, [sp, #12]               @ mx
        movrel lr, subpel_filters-16
        ldr r12, [sp, #8]               @ h
        add r4, lr, r4, lsl #4
        vld1.16 {q0}, [r4,:128]
1:
        vld1.8 {d2,d3}, [r2], r3

        vp8_epel8_h4 d2, d2, d3

        vst1.8 {d2}, [r0,:64], r1
        subs r12, r12, #1
        bne 1b
        pop {r4,pc}
endfunc

function ff_put_vp8_epel8_h4v4_neon, export=1
        sub r2, r2, r3
        sub r2, r2, #1
        push {r4,lr}

        @ first pass (horizontal):
        ldr r4, [sp, #12]               @ mx
        movrel lr, subpel_filters-16
        ldr r12, [sp, #8]               @ h
        add r4, lr, r4, lsl #4
        sub sp, sp, #168+16
        vld1.16 {q0}, [r4,:128]
        add lr, sp, #15
        add r12, r12, #3
        bic lr, lr, #15
1:
        vld1.8 {d2,d3}, [r2], r3
        vp8_epel8_h4 d2, d2, d3
        vst1.8 {d2}, [lr,:64]!
        subs r12, r12, #1
        bne 1b

        @ second pass (vertical):
        ldr r4, [sp, #168+16+16]        @ my
        movrel lr, subpel_filters-16
        ldr r12, [sp, #168+16+8]        @ h
        add r4, lr, r4, lsl #4
        add lr, sp, #15
        vld1.16 {q0}, [r4,:128]
        bic lr, lr, #15
2:
        vld1.8 {d2-d5}, [lr,:128]!
        vld1.8 {d6}, [lr,:64]
        sub lr, lr, #16

        vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6

        vst1.8 {d2}, [r0,:64], r1
        vst1.8 {d3}, [r0,:64], r1
        subs r12, r12, #2
        bne 2b

        add sp, sp, #168+16
        pop {r4,pc}
endfunc

function ff_put_vp8_epel8_h6v4_neon, export=1
        sub r2, r2, r3
        sub r2, r2, #2
        push {r4,lr}

        @ first pass (horizontal):
        ldr r4, [sp, #12]               @ mx
        movrel lr, subpel_filters-16
        ldr r12, [sp, #8]               @ h
        add r4, lr, r4, lsl #4
        sub sp, sp, #168+16
        vld1.16 {q0}, [r4,:128]
        add lr, sp, #15
        add r12, r12, #3
        bic lr, lr, #15
1:
        vld1.8 {d2,d3}, [r2], r3
        vp8_epel8_h6 d2, d2, d3
        vst1.8 {d2}, [lr,:64]!
        subs r12, r12, #1
        bne 1b

        @ second pass (vertical):
        ldr r4, [sp, #168+16+16]        @ my
        movrel lr, subpel_filters-16
        ldr r12, [sp, #168+16+8]        @ h
        add r4, lr, r4, lsl #4
        add lr, sp, #15
        vld1.16 {q0}, [r4,:128]
        bic lr, lr, #15
2:
        vld1.8 {d2-d5}, [lr,:128]!
        vld1.8 {d6}, [lr,:64]
        sub lr, lr, #16

        vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6

        vst1.8 {d2}, [r0,:64], r1
        vst1.8 {d3}, [r0,:64], r1
        subs r12, r12, #2
        bne 2b

        add sp, sp, #168+16
        pop {r4,pc}
endfunc

function ff_put_vp8_epel8_h4v6_neon, export=1
        sub r2, r2, r3, lsl #1
        sub r2, r2, #1
        push {r4,lr}

        @ first pass (horizontal):
        ldr r4, [sp, #12]               @ mx
        movrel lr, subpel_filters-16
        ldr r12, [sp, #8]               @ h
        add r4, lr, r4, lsl #4
        sub sp, sp, #168+16
        vld1.16 {q0}, [r4,:128]
        add lr, sp, #15
        add r12, r12, #5
        bic lr, lr, #15
1:
        vld1.8 {d2,d3}, [r2], r3
        vp8_epel8_h4 d2, d2, d3
        vst1.8 {d2}, [lr,:64]!
        subs r12, r12, #1
        bne 1b

        @ second pass (vertical):
        ldr r4, [sp, #168+16+16]        @ my
        movrel lr, subpel_filters-16
        ldr r12, [sp, #168+16+8]        @ h
        add r4, lr, r4, lsl #4
        add lr, sp, #15
        vld1.16 {q0}, [r4,:128]
        bic lr, lr, #15
2:
        vld1.8 {d2-d5}, [lr,:128]!
        vld1.8 {d6-d7}, [lr,:128]!
        vld1.8 {d30}, [lr,:64]
        sub lr, lr, #32

        vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d30

        vst1.8 {d2}, [r0,:64], r1
        vst1.8 {d3}, [r0,:64], r1
        subs r12, r12, #2
        bne 2b

        add sp, sp, #168+16
        pop {r4,pc}
endfunc

.ltorg
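
@ The 4-pixel-wide vertical and two-pass variants pack two rows into
@ each d register (one row per 32-bit lane), so the 8-wide filter
@ macros above produce four output rows per loop iteration.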

function ff_put_vp8_epel4_v6_neon, export=1
        sub r2, r2, r3, lsl #1
        push {r4,lr}

        ldr r4, [sp, #16]               @ my
        movrel lr, subpel_filters-16
        ldr r12, [sp, #8]               @ h
        add r4, lr, r4, lsl #4
        vld1.16 {q0}, [r4,:128]
1:
        vld1.32 {d2[]}, [r2], r3
        vld1.32 {d3[]}, [r2], r3
        vld1.32 {d4[]}, [r2], r3
        vld1.32 {d5[]}, [r2], r3
        vld1.32 {d6[]}, [r2], r3
        vld1.32 {d7[]}, [r2], r3
        vld1.32 {d28[]}, [r2]
        sub r2, r2, r3, lsl #2
        vld1.32 {d2[1]}, [r2], r3
        vld1.32 {d3[1]}, [r2], r3
        vld1.32 {d4[1]}, [r2], r3
        vld1.32 {d5[1]}, [r2], r3
        vld1.32 {d6[1]}, [r2], r3
        vld1.32 {d7[1]}, [r2], r3
        vld1.32 {d28[1]}, [r2]
        sub r2, r2, r3, lsl #2

        vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d28

        vst1.32 {d2[0]}, [r0,:32], r1
        vst1.32 {d3[0]}, [r0,:32], r1
        vst1.32 {d2[1]}, [r0,:32], r1
        vst1.32 {d3[1]}, [r0,:32], r1
        subs r12, r12, #4
        bne 1b
        pop {r4,pc}
endfunc

function ff_put_vp8_epel4_h6_neon, export=1
        sub r2, r2, #2
        push {r4,lr}

        ldr r4, [sp, #12]               @ mx
        movrel lr, subpel_filters-16
        ldr r12, [sp, #8]               @ h
        add r4, lr, r4, lsl #4
        vld1.16 {q0}, [r4,:128]
1:
        vld1.8 {q1}, [r2], r3
        vp8_epel8_h6 d2, d2, d3
        vst1.32 {d2[0]}, [r0,:32], r1
        subs r12, r12, #1
        bne 1b
        pop {r4,pc}
endfunc

function ff_put_vp8_epel4_h6v6_neon, export=1
        sub r2, r2, r3, lsl #1
        sub r2, r2, #2
        push {r4,lr}

        ldr r4, [sp, #12]               @ mx
        movrel lr, subpel_filters-16
        ldr r12, [sp, #8]               @ h
        add r4, lr, r4, lsl #4
        sub sp, sp, #52+16
        vld1.16 {q0}, [r4,:128]
        add lr, sp, #15
        add r12, r12, #5
        bic lr, lr, #15
1:
        vld1.8 {q1}, [r2], r3
        vp8_epel8_h6 d2, d2, d3
        vst1.32 {d2[0]}, [lr,:32]!
        subs r12, r12, #1
        bne 1b

        ldr r4, [sp, #52+16+16]         @ my
        movrel lr, subpel_filters-16
        ldr r12, [sp, #52+16+8]         @ h
        add r4, lr, r4, lsl #4
        add lr, sp, #15
        vld1.16 {q0}, [r4,:128]
        bic lr, lr, #15
2:
        vld1.8 {d2-d3}, [lr,:128]!
        vld1.8 {d6}, [lr,:64]!
        vld1.32 {d28[]}, [lr,:32]
        sub lr, lr, #16
        vld1.8 {d4-d5}, [lr]!
        vld1.8 {d7}, [lr,:64]!
        vld1.32 {d28[1]}, [lr,:32]
        sub lr, lr, #16
        vtrn.32 q1, q2
        vtrn.32 d6, d7

        vp8_epel8_v6_y2 d2, d3, d2, d4, d3, d5, d6, d7, d28

        vst1.32 {d2[0]}, [r0,:32], r1
        vst1.32 {d3[0]}, [r0,:32], r1
        vst1.32 {d2[1]}, [r0,:32], r1
        vst1.32 {d3[1]}, [r0,:32], r1
        subs r12, r12, #4
        bne 2b

        add sp, sp, #52+16
        pop {r4,pc}
endfunc

function ff_put_vp8_epel4_h4v6_neon, export=1
        sub r2, r2, r3, lsl #1
        sub r2, r2, #1
        push {r4,lr}

        ldr r4, [sp, #12]               @ mx
        movrel lr, subpel_filters-16
        ldr r12, [sp, #8]               @ h
        add r4, lr, r4, lsl #4
        sub sp, sp, #52+16
        vld1.16 {q0}, [r4,:128]
        add lr, sp, #15
        add r12, r12, #5
        bic lr, lr, #15
1:
        vld1.8 {d2}, [r2], r3
        vp8_epel8_h4 d2, d2, d2
        vst1.32 {d2[0]}, [lr,:32]!
        subs r12, r12, #1
        bne 1b

        ldr r4, [sp, #52+16+16]         @ my
        movrel lr, subpel_filters-16
        ldr r12, [sp, #52+16+8]         @ h
        add r4, lr, r4, lsl #4
        add lr, sp, #15
        vld1.16 {q0}, [r4,:128]
        bic lr, lr, #15
2:
        vld1.8 {d2-d3}, [lr,:128]!
        vld1.8 {d6}, [lr,:64]!
        vld1.32 {d28[]}, [lr,:32]
        sub lr, lr, #16
        vld1.8 {d4-d5}, [lr]!
        vld1.8 {d7}, [lr,:64]!
        vld1.32 {d28[1]}, [lr,:32]
        sub lr, lr, #16
        vtrn.32 q1, q2
        vtrn.32 d6, d7

        vp8_epel8_v6_y2 d2, d3, d2, d4, d3, d5, d6, d7, d28

        vst1.32 {d2[0]}, [r0,:32], r1
        vst1.32 {d3[0]}, [r0,:32], r1
        vst1.32 {d2[1]}, [r0,:32], r1
        vst1.32 {d3[1]}, [r0,:32], r1
        subs r12, r12, #4
        bne 2b

        add sp, sp, #52+16
        pop {r4,pc}
endfunc

function ff_put_vp8_epel4_h6v4_neon, export=1
        sub r2, r2, r3
        sub r2, r2, #2
        push {r4,lr}

        ldr r4, [sp, #12]               @ mx
        movrel lr, subpel_filters-16
        ldr r12, [sp, #8]               @ h
        add r4, lr, r4, lsl #4
        sub sp, sp, #44+16
        vld1.16 {q0}, [r4,:128]
        add lr, sp, #15
        add r12, r12, #3
        bic lr, lr, #15
1:
        vld1.8 {q1}, [r2], r3
        vp8_epel8_h6 d2, d2, d3
        vst1.32 {d2[0]}, [lr,:32]!
        subs r12, r12, #1
        bne 1b

        ldr r4, [sp, #44+16+16]         @ my
        movrel lr, subpel_filters-16
        ldr r12, [sp, #44+16+8]         @ h
        add r4, lr, r4, lsl #4
        add lr, sp, #15
        vld1.16 {q0}, [r4,:128]
        bic lr, lr, #15
2:
        vld1.8 {d2-d3}, [lr,:128]!
        vld1.32 {d6[]}, [lr,:32]
        sub lr, lr, #8
        vld1.8 {d4-d5}, [lr]!
        vld1.32 {d6[1]}, [lr,:32]
        sub lr, lr, #8
        vtrn.32 q1, q2

        vp8_epel8_v4_y2 d2, d3, d2, d4, d3, d5, d6

        vst1.32 {d2[0]}, [r0,:32], r1
        vst1.32 {d3[0]}, [r0,:32], r1
        vst1.32 {d2[1]}, [r0,:32], r1
        vst1.32 {d3[1]}, [r0,:32], r1
        subs r12, r12, #4
        bne 2b

        add sp, sp, #44+16
        pop {r4,pc}
endfunc

function ff_put_vp8_epel4_h4_neon, export=1
        sub r2, r2, #1
        push {r4,lr}

        ldr r4, [sp, #12]               @ mx
        movrel lr, subpel_filters-16
        ldr r12, [sp, #8]               @ h
        add r4, lr, r4, lsl #4
        vld1.16 {q0}, [r4,:128]
1:
        vld1.8 {d2}, [r2], r3
        vp8_epel8_h4 d2, d2, d2
        vst1.32 {d2[0]}, [r0,:32], r1
        subs r12, r12, #1
        bne 1b
        pop {r4,pc}
endfunc

function ff_put_vp8_epel4_v4_neon, export=1
        sub r2, r2, r3
        push {r4,lr}

        ldr r4, [sp, #16]               @ my
        movrel lr, subpel_filters-16
        ldr r12, [sp, #8]               @ h
        add r4, lr, r4, lsl #4
        vld1.16 {q0}, [r4,:128]
1:
        vld1.32 {d2[]}, [r2], r3
        vld1.32 {d3[]}, [r2], r3
        vld1.32 {d4[]}, [r2], r3
        vld1.32 {d5[]}, [r2], r3
        vld1.32 {d6[]}, [r2]
        sub r2, r2, r3, lsl #1
        vld1.32 {d2[1]}, [r2], r3
        vld1.32 {d3[1]}, [r2], r3
        vld1.32 {d4[1]}, [r2], r3
        vld1.32 {d5[1]}, [r2], r3
        vld1.32 {d6[1]}, [r2]
        sub r2, r2, r3, lsl #1

        vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6

        vst1.32 {d2[0]}, [r0,:32], r1
        vst1.32 {d3[0]}, [r0,:32], r1
        vst1.32 {d2[1]}, [r0,:32], r1
        vst1.32 {d3[1]}, [r0,:32], r1
        subs r12, r12, #4
        bne 1b
        pop {r4,pc}
endfunc

function ff_put_vp8_epel4_h4v4_neon, export=1
        sub r2, r2, r3
        sub r2, r2, #1
        push {r4,lr}

        ldr r4, [sp, #12]               @ mx
        movrel lr, subpel_filters-16
        ldr r12, [sp, #8]               @ h
        add r4, lr, r4, lsl #4
        sub sp, sp, #44+16
        vld1.16 {q0}, [r4,:128]
        add lr, sp, #15
        add r12, r12, #3
        bic lr, lr, #15
1:
        vld1.8 {d2}, [r2], r3
        vp8_epel8_h4 d2, d2, d3
        vst1.32 {d2[0]}, [lr,:32]!
        subs r12, r12, #1
        bne 1b

        ldr r4, [sp, #44+16+16]         @ my
        movrel lr, subpel_filters-16
        ldr r12, [sp, #44+16+8]         @ h
        add r4, lr, r4, lsl #4
        add lr, sp, #15
        vld1.16 {q0}, [r4,:128]
        bic lr, lr, #15
2:
        vld1.8 {d2-d3}, [lr,:128]!
        vld1.32 {d6[]}, [lr,:32]
        sub lr, lr, #8
        vld1.8 {d4-d5}, [lr]!
        vld1.32 {d6[1]}, [lr,:32]
        sub lr, lr, #8
        vtrn.32 q1, q2

        vp8_epel8_v4_y2 d2, d3, d2, d4, d3, d5, d6

        vst1.32 {d2[0]}, [r0,:32], r1
        vst1.32 {d3[0]}, [r0,:32], r1
        vst1.32 {d2[1]}, [r0,:32], r1
        vst1.32 {d3[1]}, [r0,:32], r1
        subs r12, r12, #4
        bne 2b

        add sp, sp, #44+16
        pop {r4,pc}
endfunc

@ note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit
@ arithmetic can be used to apply filters
const subpel_filters, align=4
        .short 0, 6, 123, 12, 1, 0, 0, 0
        .short 2, 11, 108, 36, 8, 1, 0, 0
        .short 0, 9, 93, 50, 6, 0, 0, 0
        .short 3, 16, 77, 77, 16, 3, 0, 0
        .short 0, 6, 50, 93, 9, 0, 0, 0
        .short 1, 8, 36, 108, 11, 2, 0, 0
        .short 0, 1, 12, 123, 6, 0, 0, 0
endconst

/* Bilinear MC */
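
@ The bilinear filters weight two neighbouring pixels with (8-mx, mx)
@ horizontally and (8-my, my) vertically:
@   dst = (a*(8-f) + b*f + 4) >> 3
@ implemented as vmull/vmlal followed by a rounding vrshrn #3.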

function ff_put_vp8_bilin16_h_neon, export=1
        ldr r3, [sp, #4]                @ mx
        rsb r12, r3, #8
        vdup.8 d0, r3
        vdup.8 d1, r12
        ldr r12, [sp]                   @ h
1:
        subs r12, r12, #2
        vld1.8 {d2-d4}, [r2], r1
        vext.8 q2, q1, q2, #1
        vmull.u8 q8, d2, d1
        vmlal.u8 q8, d4, d0
        vld1.8 {d18-d20}, [r2], r1
        vmull.u8 q3, d3, d1
        vmlal.u8 q3, d5, d0
        vext.8 q10, q9, q10, #1
        vmull.u8 q11, d18, d1
        vmlal.u8 q11, d20, d0
        vmull.u8 q12, d19, d1
        vmlal.u8 q12, d21, d0
        vrshrn.u16 d4, q8, #3
        vrshrn.u16 d5, q3, #3
        vrshrn.u16 d6, q11, #3
        vrshrn.u16 d7, q12, #3
        vst1.8 {q2}, [r0,:128], r1
        vst1.8 {q3}, [r0,:128], r1
        bgt 1b
        bx lr
endfunc

function ff_put_vp8_bilin16_v_neon, export=1
        ldr r3, [sp, #8]                @ my
        rsb r12, r3, #8
        vdup.8 d0, r3
        vdup.8 d1, r12
        ldr r12, [sp]                   @ h
        vld1.8 {q1}, [r2], r1
1:
        subs r12, r12, #2
        vld1.8 {q2}, [r2], r1
        vmull.u8 q3, d2, d1
        vmlal.u8 q3, d4, d0
        vmull.u8 q8, d3, d1
        vmlal.u8 q8, d5, d0
        vld1.8 {q1}, [r2], r1
        vmull.u8 q9, d4, d1
        vmlal.u8 q9, d2, d0
        vmull.u8 q10, d5, d1
        vmlal.u8 q10, d3, d0
        vrshrn.u16 d4, q3, #3
        vrshrn.u16 d5, q8, #3
        vrshrn.u16 d6, q9, #3
        vrshrn.u16 d7, q10, #3
        vst1.8 {q2}, [r0,:128], r1
        vst1.8 {q3}, [r0,:128], r1
        bgt 1b
        bx lr
endfunc
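
@ The hv variants filter each row horizontally as it is loaded and keep
@ the previous filtered row in registers (q2 here, d22 in the 8- and
@ 4-pixel versions), so every source row is only filtered once before
@ the vertical pass.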

function ff_put_vp8_bilin16_hv_neon, export=1
        ldr r3, [sp, #4]                @ mx
        rsb r12, r3, #8
        vdup.8 d0, r3
        vdup.8 d1, r12
        ldr r3, [sp, #8]                @ my
        rsb r12, r3, #8
        vdup.8 d2, r3
        vdup.8 d3, r12
        ldr r12, [sp]                   @ h

        vld1.8 {d4-d6}, [r2], r1
        vext.8 q3, q2, q3, #1
        vmull.u8 q8, d4, d1
        vmlal.u8 q8, d6, d0
        vmull.u8 q9, d5, d1
        vmlal.u8 q9, d7, d0
        vrshrn.u16 d4, q8, #3
        vrshrn.u16 d5, q9, #3
1:
        subs r12, r12, #2
        vld1.8 {d18-d20}, [r2], r1
        vext.8 q10, q9, q10, #1
        vmull.u8 q11, d18, d1
        vmlal.u8 q11, d20, d0
        vld1.8 {d26-d28}, [r2], r1
        vmull.u8 q12, d19, d1
        vmlal.u8 q12, d21, d0
        vext.8 q14, q13, q14, #1
        vmull.u8 q8, d26, d1
        vmlal.u8 q8, d28, d0
        vmull.u8 q9, d27, d1
        vmlal.u8 q9, d29, d0
        vrshrn.u16 d6, q11, #3
        vrshrn.u16 d7, q12, #3
        vmull.u8 q12, d4, d3
        vmlal.u8 q12, d6, d2
        vmull.u8 q15, d5, d3
        vmlal.u8 q15, d7, d2
        vrshrn.u16 d4, q8, #3
        vrshrn.u16 d5, q9, #3
        vmull.u8 q10, d6, d3
        vmlal.u8 q10, d4, d2
        vmull.u8 q11, d7, d3
        vmlal.u8 q11, d5, d2
        vrshrn.u16 d24, q12, #3
        vrshrn.u16 d25, q15, #3
        vst1.8 {q12}, [r0,:128], r1
        vrshrn.u16 d20, q10, #3
        vrshrn.u16 d21, q11, #3
        vst1.8 {q10}, [r0,:128], r1
        bgt 1b
        bx lr
endfunc

function ff_put_vp8_bilin8_h_neon, export=1
        ldr r3, [sp, #4]                @ mx
        rsb r12, r3, #8
        vdup.8 d0, r3
        vdup.8 d1, r12
        ldr r12, [sp]                   @ h
1:
        subs r12, r12, #2
        vld1.8 {q1}, [r2], r1
        vext.8 d3, d2, d3, #1
        vmull.u8 q2, d2, d1
        vmlal.u8 q2, d3, d0
        vld1.8 {q3}, [r2], r1
        vext.8 d7, d6, d7, #1
        vmull.u8 q8, d6, d1
        vmlal.u8 q8, d7, d0
        vrshrn.u16 d4, q2, #3
        vrshrn.u16 d16, q8, #3
        vst1.8 {d4}, [r0,:64], r1
        vst1.8 {d16}, [r0,:64], r1
        bgt 1b
        bx lr
endfunc

function ff_put_vp8_bilin8_v_neon, export=1
        ldr r3, [sp, #8]                @ my
        rsb r12, r3, #8
        vdup.8 d0, r3
        vdup.8 d1, r12
        ldr r12, [sp]                   @ h
        vld1.8 {d2}, [r2], r1
1:
        subs r12, r12, #2
        vld1.8 {d3}, [r2], r1
        vmull.u8 q2, d2, d1
        vmlal.u8 q2, d3, d0
        vld1.8 {d2}, [r2], r1
        vmull.u8 q3, d3, d1
        vmlal.u8 q3, d2, d0
        vrshrn.u16 d4, q2, #3
        vrshrn.u16 d6, q3, #3
        vst1.8 {d4}, [r0,:64], r1
        vst1.8 {d6}, [r0,:64], r1
        bgt 1b
        bx lr
endfunc

function ff_put_vp8_bilin8_hv_neon, export=1
        ldr r3, [sp, #4]                @ mx
        rsb r12, r3, #8
        vdup.8 d0, r3
        vdup.8 d1, r12
        ldr r3, [sp, #8]                @ my
        rsb r12, r3, #8
        vdup.8 d2, r3
        vdup.8 d3, r12
        ldr r12, [sp]                   @ h

        vld1.8 {q2}, [r2], r1
        vext.8 d5, d4, d5, #1
        vmull.u8 q9, d4, d1
        vmlal.u8 q9, d5, d0
        vrshrn.u16 d22, q9, #3
1:
        subs r12, r12, #2
        vld1.8 {q3}, [r2], r1
        vext.8 d7, d6, d7, #1
        vmull.u8 q8, d6, d1
        vmlal.u8 q8, d7, d0
        vld1.8 {q2}, [r2], r1
        vext.8 d5, d4, d5, #1
        vmull.u8 q9, d4, d1
        vmlal.u8 q9, d5, d0
        vrshrn.u16 d16, q8, #3
        vmull.u8 q10, d22, d3
        vmlal.u8 q10, d16, d2
        vrshrn.u16 d22, q9, #3
        vmull.u8 q12, d16, d3
        vmlal.u8 q12, d22, d2
        vrshrn.u16 d20, q10, #3
        vst1.8 {d20}, [r0,:64], r1
        vrshrn.u16 d23, q12, #3
        vst1.8 {d23}, [r0,:64], r1
        bgt 1b
        bx lr
endfunc

function ff_put_vp8_bilin4_h_neon, export=1
        ldr r3, [sp, #4]                @ mx
        rsb r12, r3, #8
        vdup.8 d0, r3
        vdup.8 d1, r12
        ldr r12, [sp]                   @ h
1:
        subs r12, r12, #2
        vld1.8 {d2}, [r2], r1
        vext.8 d3, d2, d3, #1
        vld1.8 {d6}, [r2], r1
        vext.8 d7, d6, d7, #1
        vtrn.32 q1, q3
        vmull.u8 q2, d2, d1
        vmlal.u8 q2, d3, d0
        vrshrn.u16 d4, q2, #3
        vst1.32 {d4[0]}, [r0,:32], r1
        vst1.32 {d4[1]}, [r0,:32], r1
        bgt 1b
        bx lr
endfunc

function ff_put_vp8_bilin4_v_neon, export=1
        ldr r3, [sp, #8]                @ my
        rsb r12, r3, #8
        vdup.8 d0, r3
        vdup.8 d1, r12
        ldr r12, [sp]                   @ h
        vld1.32 {d2[]}, [r2], r1
1:
        vld1.32 {d3[]}, [r2]
        vld1.32 {d2[1]}, [r2], r1
        vld1.32 {d3[1]}, [r2], r1
        vmull.u8 q2, d2, d1
        vmlal.u8 q2, d3, d0
        vtrn.32 d3, d2
        vrshrn.u16 d4, q2, #3
        vst1.32 {d4[0]}, [r0,:32], r1
        vst1.32 {d4[1]}, [r0,:32], r1
        subs r12, r12, #2
        bgt 1b
        bx lr
endfunc

function ff_put_vp8_bilin4_hv_neon, export=1
        ldr r3, [sp, #4]                @ mx
        rsb r12, r3, #8
        vdup.8 d0, r3
        vdup.8 d1, r12
        ldr r3, [sp, #8]                @ my
        rsb r12, r3, #8
        vdup.8 d2, r3
        vdup.8 d3, r12
        ldr r12, [sp]                   @ h

        vld1.8 {d4}, [r2], r1
        vext.8 d5, d4, d4, #1
        vmull.u8 q9, d4, d1
        vmlal.u8 q9, d5, d0
        vrshrn.u16 d22, q9, #3
1:
        subs r12, r12, #2
        vld1.8 {d6}, [r2], r1
        vext.8 d7, d6, d6, #1
        vld1.8 {d4}, [r2], r1
        vext.8 d5, d4, d4, #1
        vtrn.32 q3, q2
        vmull.u8 q8, d6, d1
        vmlal.u8 q8, d7, d0
        vrshrn.u16 d16, q8, #3
        vmull.u8 q10, d16, d2
        vtrn.32 d22, d16
        vmlal.u8 q10, d22, d3
        vrev64.32 d22, d16
        vrshrn.u16 d20, q10, #3
        vst1.32 {d20[0]}, [r0,:32], r1
        vst1.32 {d20[1]}, [r0,:32], r1
        bgt 1b
        bx lr
endfunc