/**
 * VP8 NEON optimisations
 *
 * Copyright (c) 2010 Rob Clark <rob@ti.com>
 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"
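
@ Inverse Walsh-Hadamard transform of the luma DC coefficients.  Per the
@ vp8dsp interface this takes (int16_t block[4][4][4], int16_t dc[16]):
@ r0 = block, r1 = dc.  Two 4x4 Hadamard passes are applied (the +3 added
@ to d0 between passes biases every output for the final >>3 rounding),
@ the dc buffer is zeroed on the way through, and each result lands in the
@ DC slot of one of the 16 luma sub-blocks, 32 bytes (16 coefficients) apart.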
function ff_vp8_luma_dc_wht_neon, export=1
        vld1.16 {q0-q1}, [r1,:128]
        vmov.i16 q15, #0
        vadd.i16 d4, d0, d3
        vadd.i16 d6, d1, d2
        vst1.16 {q15}, [r1,:128]!
        vsub.i16 d7, d1, d2
        vsub.i16 d5, d0, d3
        vst1.16 {q15}, [r1,:128]
        vadd.i16 q0, q2, q3
        vsub.i16 q1, q2, q3
        vmov.i16 q8, #3
        vtrn.32 d0, d2
        vtrn.32 d1, d3
        vtrn.16 d0, d1
        vtrn.16 d2, d3
        vadd.i16 d0, d0, d16
        vadd.i16 d4, d0, d3
        vadd.i16 d6, d1, d2
        vsub.i16 d7, d1, d2
        vsub.i16 d5, d0, d3
        vadd.i16 q0, q2, q3
        vsub.i16 q1, q2, q3
        vshr.s16 q0, q0, #3
        vshr.s16 q1, q1, #3
        mov r3, #32
        vst1.16 {d0[0]}, [r0,:16], r3
        vst1.16 {d1[0]}, [r0,:16], r3
        vst1.16 {d2[0]}, [r0,:16], r3
        vst1.16 {d3[0]}, [r0,:16], r3
        vst1.16 {d0[1]}, [r0,:16], r3
        vst1.16 {d1[1]}, [r0,:16], r3
        vst1.16 {d2[1]}, [r0,:16], r3
        vst1.16 {d3[1]}, [r0,:16], r3
        vst1.16 {d0[2]}, [r0,:16], r3
        vst1.16 {d1[2]}, [r0,:16], r3
        vst1.16 {d2[2]}, [r0,:16], r3
        vst1.16 {d3[2]}, [r0,:16], r3
        vst1.16 {d0[3]}, [r0,:16], r3
        vst1.16 {d1[3]}, [r0,:16], r3
        vst1.16 {d2[3]}, [r0,:16], r3
        vst1.16 {d3[3]}, [r0,:16], r3
        bx lr
endfunc

function ff_vp8_luma_dc_wht_dc_neon, export=1
        ldrsh r2, [r1]
        mov r3, #0
        add r2, r2, #3
        strh r3, [r1]
        asr r2, r2, #3
.rept 16
        strh r2, [r0], #32
.endr
        bx lr
endfunc
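
@ Full 4x4 IDCT plus add to dst (r0), with the coefficient block (r1,
@ cleared as it is consumed) and stride in r2.  movw/movt pack the two
@ standard VP8 multipliers into one register: 20091 = (sqrt(2)*cos(pi/8)
@ - 1) * 2^16 and 35468 = sqrt(2)*sin(pi/8) * 2^16, the latter stored
@ halved because vqdmulh doubles while taking the high 16 bits.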
function ff_vp8_idct_add_neon, export=1
        vld1.16 {q0-q1}, [r1,:128]
        movw r3, #20091
        movt r3, #35468/2
        vdup.32 d4, r3
        vmull.s16 q12, d1, d4[0]
        vmull.s16 q13, d3, d4[0]
        vqdmulh.s16 d20, d1, d4[1]
        vqdmulh.s16 d23, d3, d4[1]
        vshrn.s32 d21, q12, #16
        vshrn.s32 d22, q13, #16
        vadd.s16 d21, d21, d1
        vadd.s16 d22, d22, d3
        vadd.s16 d16, d0, d2
        vsub.s16 d17, d0, d2
        vadd.s16 d18, d21, d23
        vsub.s16 d19, d20, d22
        vadd.s16 q0, q8, q9
        vsub.s16 q1, q8, q9
        vtrn.32 d0, d3
        vtrn.32 d1, d2
        vtrn.16 d0, d1
        vtrn.16 d3, d2
        vmov.i16 q15, #0
        vmull.s16 q12, d1, d4[0]
        vst1.16 {q15}, [r1,:128]!
        vmull.s16 q13, d2, d4[0]
        vst1.16 {q15}, [r1,:128]
        vqdmulh.s16 d21, d1, d4[1]
        vqdmulh.s16 d23, d2, d4[1]
        vshrn.s32 d20, q12, #16
        vshrn.s32 d22, q13, #16
        vadd.i16 d20, d20, d1
        vadd.i16 d22, d22, d2
        vadd.i16 d16, d0, d3
        vsub.i16 d17, d0, d3
        vadd.i16 d18, d20, d23
        vld1.32 {d20[]}, [r0,:32], r2
        vsub.i16 d19, d21, d22
        vld1.32 {d22[]}, [r0,:32], r2
        vadd.s16 q0, q8, q9
        vld1.32 {d23[]}, [r0,:32], r2
        vsub.s16 q1, q8, q9
        vld1.32 {d21[]}, [r0,:32], r2
        vrshr.s16 q0, q0, #3
        vtrn.32 q10, q11
        vrshr.s16 q1, q1, #3
        sub r0, r0, r2, lsl #2
        vtrn.32 d0, d3
        vtrn.32 d1, d2
        vtrn.16 d0, d1
        vtrn.16 d3, d2
        vaddw.u8 q0, q0, d20
        vaddw.u8 q1, q1, d21
        vqmovun.s16 d0, q0
        vqmovun.s16 d1, q1
        vst1.32 {d0[0]}, [r0,:32], r2
        vst1.32 {d0[1]}, [r0,:32], r2
        vst1.32 {d1[1]}, [r0,:32], r2
        vst1.32 {d1[0]}, [r0,:32], r2
        bx lr
endfunc
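
@ DC-only case: the single coefficient is read (and cleared in memory),
@ rounded as (dc + 4) >> 3 by vrshr, broadcast, and added with saturation
@ to the 4x4 block; rows 0/2 travel in d0 and rows 1/3 in d1 so two
@ widening adds cover all four rows.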
function ff_vp8_idct_dc_add_neon, export=1
        mov r3, #0
        ldrsh r12, [r1]
        strh r3, [r1]
        vdup.16 q1, r12
        vrshr.s16 q1, q1, #3
        vld1.32 {d0[]}, [r0,:32], r2
        vld1.32 {d1[]}, [r0,:32], r2
        vld1.32 {d0[1]}, [r0,:32], r2
        vld1.32 {d1[1]}, [r0,:32], r2
        vaddw.u8 q2, q1, d0
        vaddw.u8 q3, q1, d1
        sub r0, r0, r2, lsl #2
        vqmovun.s16 d0, q2
        vqmovun.s16 d1, q3
        vst1.32 {d0[0]}, [r0,:32], r2
        vst1.32 {d1[0]}, [r0,:32], r2
        vst1.32 {d0[1]}, [r0,:32], r2
        vst1.32 {d1[1]}, [r0,:32], r2
        bx lr
endfunc
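
@ DC-only IDCT of the four 4x4 blocks covering an 8x8 chroma area (2x2
@ block layout).  The DCs sit 32 bytes apart in the coefficient buffer;
@ q8 = {dc0,dc1} serves the top four 8-pixel rows and q9 = {dc2,dc3} the
@ bottom four.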
function ff_vp8_idct_dc_add4uv_neon, export=1
        vmov.i16 d0, #0
        mov r3, #32
        vld1.16 {d16[]}, [r1,:16]
        vst1.16 {d0[0]}, [r1,:16], r3
        vld1.16 {d17[]}, [r1,:16]
        vst1.16 {d0[0]}, [r1,:16], r3
        vld1.16 {d18[]}, [r1,:16]
        vst1.16 {d0[0]}, [r1,:16], r3
        vld1.16 {d19[]}, [r1,:16]
        vst1.16 {d0[0]}, [r1,:16], r3
        mov r3, r0
        vrshr.s16 q8, q8, #3 @ dc >>= 3
        vld1.8 {d0}, [r0,:64], r2
        vrshr.s16 q9, q9, #3
        vld1.8 {d1}, [r0,:64], r2
        vaddw.u8 q10, q8, d0
        vld1.8 {d2}, [r0,:64], r2
        vaddw.u8 q0, q8, d1
        vld1.8 {d3}, [r0,:64], r2
        vaddw.u8 q11, q8, d2
        vld1.8 {d4}, [r0,:64], r2
        vaddw.u8 q1, q8, d3
        vld1.8 {d5}, [r0,:64], r2
        vaddw.u8 q12, q9, d4
        vld1.8 {d6}, [r0,:64], r2
        vaddw.u8 q2, q9, d5
        vld1.8 {d7}, [r0,:64], r2
        vaddw.u8 q13, q9, d6
        vqmovun.s16 d20, q10
        vaddw.u8 q3, q9, d7
        vqmovun.s16 d21, q0
        vqmovun.s16 d22, q11
        vst1.8 {d20}, [r3,:64], r2
        vqmovun.s16 d23, q1
        vst1.8 {d21}, [r3,:64], r2
        vqmovun.s16 d24, q12
        vst1.8 {d22}, [r3,:64], r2
        vqmovun.s16 d25, q2
        vst1.8 {d23}, [r3,:64], r2
        vqmovun.s16 d26, q13
        vst1.8 {d24}, [r3,:64], r2
        vqmovun.s16 d27, q3
        vst1.8 {d25}, [r3,:64], r2
        vst1.8 {d26}, [r3,:64], r2
        vst1.8 {d27}, [r3,:64], r2
        bx lr
endfunc
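
@ Luma version: four 4x4 blocks side by side in a 16x4 strip.  Each
@ 16-pixel row is handled in halves, q8 = {dc0,dc1} for the left eight
@ pixels and q9 = {dc2,dc3} for the right eight.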
function ff_vp8_idct_dc_add4y_neon, export=1
        vmov.i16 d0, #0
        mov r3, #32
        vld1.16 {d16[]}, [r1,:16]
        vst1.16 {d0[0]}, [r1,:16], r3
        vld1.16 {d17[]}, [r1,:16]
        vst1.16 {d0[0]}, [r1,:16], r3
        vld1.16 {d18[]}, [r1,:16]
        vst1.16 {d0[0]}, [r1,:16], r3
        vld1.16 {d19[]}, [r1,:16]
        vst1.16 {d0[0]}, [r1,:16], r3
        vrshr.s16 q8, q8, #3 @ dc >>= 3
        vld1.8 {q0}, [r0,:128], r2
        vrshr.s16 q9, q9, #3
        vld1.8 {q1}, [r0,:128], r2
        vaddw.u8 q10, q8, d0
        vld1.8 {q2}, [r0,:128], r2
        vaddw.u8 q0, q9, d1
        vld1.8 {q3}, [r0,:128], r2
        vaddw.u8 q11, q8, d2
        vaddw.u8 q1, q9, d3
        vaddw.u8 q12, q8, d4
        vaddw.u8 q2, q9, d5
        vaddw.u8 q13, q8, d6
        vaddw.u8 q3, q9, d7
        sub r0, r0, r2, lsl #2
        vqmovun.s16 d20, q10
        vqmovun.s16 d21, q0
        vqmovun.s16 d22, q11
        vqmovun.s16 d23, q1
        vqmovun.s16 d24, q12
        vst1.8 {q10}, [r0,:128], r2
        vqmovun.s16 d25, q2
        vst1.8 {q11}, [r0,:128], r2
        vqmovun.s16 d26, q13
        vst1.8 {q12}, [r0,:128], r2
        vqmovun.s16 d27, q3
        vst1.8 {q13}, [r0,:128], r2
        bx lr
endfunc

@ Register layout:
@   P3..Q3 -> q0..q7
@   flim_E -> q14
@   flim_I -> q15
@   hev_thresh -> r12
@
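@ Throughout the filter the pixels are rebiased into the signed range: the
@ "s2u" steps (veor with 0x80) map [0,255] to [-128,127] and back, so the
@ deltas can be applied with signed saturating ops (vqadd.s8/vqsub.s8).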
.macro vp8_loop_filter, inner=0, simple=0
.if \simple
        vabd.u8 q9, q3, q4 @ abs(P0-Q0)
        vabd.u8 q15, q2, q5 @ abs(P1-Q1)
        vqadd.u8 q9, q9, q9 @ abs(P0-Q0) * 2
        vshr.u8 q10, q15, #1 @ abs(P1-Q1) / 2
        vqadd.u8 q11, q9, q10 @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        vmov.i8 q13, #0x80
        vcle.u8 q8, q11, q14 @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim
.else
        @ calculate hev and normal_limit:
        vabd.u8 q12, q2, q3 @ abs(P1-P0)
        vabd.u8 q13, q5, q4 @ abs(Q1-Q0)
        vabd.u8 q10, q0, q1 @ abs(P3-P2)
        vabd.u8 q11, q1, q2 @ abs(P2-P1)
        vcle.u8 q8, q12, q15 @ abs(P1-P0) <= flim_I
        vcle.u8 q9, q13, q15 @ abs(Q1-Q0) <= flim_I
        vcle.u8 q10, q10, q15 @ abs(P3-P2) <= flim_I
        vcle.u8 q11, q11, q15 @ abs(P2-P1) <= flim_I
        vand q8, q8, q9
        vabd.u8 q9, q7, q6 @ abs(Q3-Q2)
        vand q8, q8, q11
        vabd.u8 q11, q6, q5 @ abs(Q2-Q1)
        vand q8, q8, q10
        vcle.u8 q10, q9, q15 @ abs(Q3-Q2) <= flim_I
        vcle.u8 q11, q11, q15 @ abs(Q2-Q1) <= flim_I
        vabd.u8 q9, q3, q4 @ abs(P0-Q0)
        vabd.u8 q15, q2, q5 @ abs(P1-Q1)
        vand q8, q8, q10
        vqadd.u8 q9, q9, q9 @ abs(P0-Q0) * 2
        vand q8, q8, q11
        vshr.u8 q10, q15, #1 @ abs(P1-Q1) / 2
        vdup.8 q15, r12 @ hev_thresh
        vqadd.u8 q11, q9, q10 @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        vcgt.u8 q12, q12, q15 @ abs(P1-P0) > hev_thresh
        vcle.u8 q11, q11, q14 @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E
        vcgt.u8 q14, q13, q15 @ abs(Q1-Q0) > hev_thresh
        vand q8, q8, q11
        vmov.i8 q13, #0x80
        vorr q9, q12, q14
.endif

        @ at this point:
        @   q8: normal_limit
        @   q9: hev

        @ convert to signed value:
        veor q3, q3, q13 @ PS0 = P0 ^ 0x80
        veor q4, q4, q13 @ QS0 = Q0 ^ 0x80

        vmov.i16 q12, #3
        vsubl.s8 q10, d8, d6 @ QS0 - PS0
        vsubl.s8 q11, d9, d7 @ (widened to 16bit)
        veor q2, q2, q13 @ PS1 = P1 ^ 0x80
        veor q5, q5, q13 @ QS1 = Q1 ^ 0x80
        vmul.i16 q10, q10, q12 @ w = 3 * (QS0 - PS0)
        vmul.i16 q11, q11, q12

        vqsub.s8 q12, q2, q5 @ clamp(PS1-QS1)
        vmov.i8 q14, #4
        vmov.i8 q15, #3
.if \inner
        vand q12, q12, q9 @ if(hev) w += clamp(PS1-QS1)
.endif
        vaddw.s8 q10, q10, d24 @ w += clamp(PS1-QS1)
        vaddw.s8 q11, q11, d25
        vqmovn.s16 d20, q10 @ narrow result back into q10
        vqmovn.s16 d21, q11
.if !\inner && !\simple
        veor q1, q1, q13 @ PS2 = P2 ^ 0x80
        veor q6, q6, q13 @ QS2 = Q2 ^ 0x80
.endif
        vand q10, q10, q8 @ w &= normal_limit

        @ registers used at this point..
        @   q0 -> P3 (don't corrupt)
        @   q1-q6 -> PS2-QS2
        @   q7 -> Q3 (don't corrupt)
        @   q9 -> hev
        @   q10 -> w
        @   q13 -> #0x80
        @   q14 -> #4
        @   q15 -> #3
        @   q8, q11, q12 -> unused

        @ filter_common: is4tap==1
        @   c1 = clamp(w + 4) >> 3;
        @   c2 = clamp(w + 3) >> 3;
        @   Q0 = s2u(QS0 - c1);
        @   P0 = s2u(PS0 + c2);
.if \simple
        vqadd.s8 q11, q10, q14 @ c1 = clamp((w&hev)+4)
        vqadd.s8 q12, q10, q15 @ c2 = clamp((w&hev)+3)
        vshr.s8 q11, q11, #3 @ c1 >>= 3
        vshr.s8 q12, q12, #3 @ c2 >>= 3
        vqsub.s8 q4, q4, q11 @ QS0 = clamp(QS0-c1)
        vqadd.s8 q3, q3, q12 @ PS0 = clamp(PS0+c2)
        veor q4, q4, q13 @ Q0 = QS0 ^ 0x80
        veor q3, q3, q13 @ P0 = PS0 ^ 0x80
        veor q5, q5, q13 @ Q1 = QS1 ^ 0x80
        veor q2, q2, q13 @ P1 = PS1 ^ 0x80
.elseif \inner
        @ the !is4tap case of filter_common, only used for inner blocks
        @   c3 = ((c1&~hev) + 1) >> 1;
        @   Q1 = s2u(QS1 - c3);
        @   P1 = s2u(PS1 + c3);
        vqadd.s8 q11, q10, q14 @ c1 = clamp((w&hev)+4)
        vqadd.s8 q12, q10, q15 @ c2 = clamp((w&hev)+3)
        vshr.s8 q11, q11, #3 @ c1 >>= 3
        vshr.s8 q12, q12, #3 @ c2 >>= 3
        vqsub.s8 q4, q4, q11 @ QS0 = clamp(QS0-c1)
        vqadd.s8 q3, q3, q12 @ PS0 = clamp(PS0+c2)
        vbic q11, q11, q9 @ c1 & ~hev
        veor q4, q4, q13 @ Q0 = QS0 ^ 0x80
        vrshr.s8 q11, q11, #1 @ c3 >>= 1
        veor q3, q3, q13 @ P0 = PS0 ^ 0x80
        vqsub.s8 q5, q5, q11 @ QS1 = clamp(QS1-c3)
        vqadd.s8 q2, q2, q11 @ PS1 = clamp(PS1+c3)
        veor q5, q5, q13 @ Q1 = QS1 ^ 0x80
        veor q2, q2, q13 @ P1 = PS1 ^ 0x80
.else
        vand q12, q10, q9 @ w & hev
        vqadd.s8 q11, q12, q14 @ c1 = clamp((w&hev)+4)
        vqadd.s8 q12, q12, q15 @ c2 = clamp((w&hev)+3)
        vshr.s8 q11, q11, #3 @ c1 >>= 3
        vshr.s8 q12, q12, #3 @ c2 >>= 3
        vbic q10, q10, q9 @ w &= ~hev
        vqsub.s8 q4, q4, q11 @ QS0 = clamp(QS0-c1)
        vqadd.s8 q3, q3, q12 @ PS0 = clamp(PS0+c2)

        @ filter_mbedge:
        @   a = clamp((27*w + 63) >> 7);
        @   Q0 = s2u(QS0 - a);
        @   P0 = s2u(PS0 + a);
        @   a = clamp((18*w + 63) >> 7);
        @   Q1 = s2u(QS1 - a);
        @   P1 = s2u(PS1 + a);
        @   a = clamp((9*w + 63) >> 7);
        @   Q2 = s2u(QS2 - a);
        @   P2 = s2u(PS2 + a);
        vmov.i16 q9, #63
        vshll.s8 q14, d20, #3
        vshll.s8 q15, d21, #3
        vaddw.s8 q14, q14, d20
        vaddw.s8 q15, q15, d21
        vadd.s16 q8, q9, q14
        vadd.s16 q9, q9, q15 @ 9*w + 63
        vadd.s16 q11, q8, q14
        vadd.s16 q12, q9, q15 @ 18*w + 63
        vadd.s16 q14, q11, q14
        vadd.s16 q15, q12, q15 @ 27*w + 63
        vqshrn.s16 d16, q8, #7
        vqshrn.s16 d17, q9, #7 @ clamp(( 9*w + 63)>>7)
        vqshrn.s16 d22, q11, #7
        vqshrn.s16 d23, q12, #7 @ clamp((18*w + 63)>>7)
        vqshrn.s16 d28, q14, #7
        vqshrn.s16 d29, q15, #7 @ clamp((27*w + 63)>>7)
        vqadd.s8 q1, q1, q8 @ PS2 = clamp(PS2+a)
        vqsub.s8 q6, q6, q8 @ QS2 = clamp(QS2-a)
        vqadd.s8 q2, q2, q11 @ PS1 = clamp(PS1+a)
        vqsub.s8 q5, q5, q11 @ QS1 = clamp(QS1-a)
        vqadd.s8 q3, q3, q14 @ PS0 = clamp(PS0+a)
        vqsub.s8 q4, q4, q14 @ QS0 = clamp(QS0-a)
        veor q3, q3, q13 @ P0 = PS0 ^ 0x80
        veor q4, q4, q13 @ Q0 = QS0 ^ 0x80
        veor q2, q2, q13 @ P1 = PS1 ^ 0x80
        veor q5, q5, q13 @ Q1 = QS1 ^ 0x80
        veor q1, q1, q13 @ P2 = PS2 ^ 0x80
        veor q6, q6, q13 @ Q2 = QS2 ^ 0x80
.endif
.endm
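
@ Transpose the 8-wide by 16-tall pixel block held in q0-q7 (loaded as 16
@ rows of 8 pixels in d0-d15) by vtrn-ing at 32-, 16- and 8-bit
@ granularity; afterwards each q register holds one of the eight pixel
@ columns across all 16 rows.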
.macro transpose8x16matrix
        vtrn.32 q0, q4
        vtrn.32 q1, q5
        vtrn.32 q2, q6
        vtrn.32 q3, q7
        vtrn.16 q0, q2
        vtrn.16 q1, q3
        vtrn.16 q4, q6
        vtrn.16 q5, q7
        vtrn.8 q0, q1
        vtrn.8 q2, q3
        vtrn.8 q4, q5
        vtrn.8 q6, q7
.endm
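
@ Vertical (horizontal-edge) luma filter, 16 pixels wide.  Judging by the
@ loads below: r0 = dst, r1 = stride, r2 = flim_E, r3 = flim_I, with
@ hev_thresh on the stack ([sp, #64] once q4-q7 have been pushed).  The
@ _simple variant loads only P1..Q1 and skips the inner-edge and hev tests.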
.macro vp8_v_loop_filter16 name, inner=0, simple=0
function ff_vp8_v_loop_filter16\name\()_neon, export=1
        vpush {q4-q7}
        sub r0, r0, r1, lsl #1+!\simple

        @ Load pixels:
.if !\simple
        ldr r12, [sp, #64] @ hev_thresh
        vld1.8 {q0}, [r0,:128], r1 @ P3
        vld1.8 {q1}, [r0,:128], r1 @ P2
.endif
        vld1.8 {q2}, [r0,:128], r1 @ P1
        vld1.8 {q3}, [r0,:128], r1 @ P0
        vld1.8 {q4}, [r0,:128], r1 @ Q0
        vld1.8 {q5}, [r0,:128], r1 @ Q1
.if !\simple
        vld1.8 {q6}, [r0,:128], r1 @ Q2
        vld1.8 {q7}, [r0,:128] @ Q3
        vdup.8 q15, r3 @ flim_I
.endif
        vdup.8 q14, r2 @ flim_E

        vp8_loop_filter inner=\inner, simple=\simple

        @ back up to P2: dst -= stride * 6
        sub r0, r0, r1, lsl #2
.if !\simple
        sub r0, r0, r1, lsl #1

        @ Store pixels:
        vst1.8 {q1}, [r0,:128], r1 @ P2
.endif
        vst1.8 {q2}, [r0,:128], r1 @ P1
        vst1.8 {q3}, [r0,:128], r1 @ P0
        vst1.8 {q4}, [r0,:128], r1 @ Q0
        vst1.8 {q5}, [r0,:128], r1 @ Q1
.if !\simple
        vst1.8 {q6}, [r0,:128] @ Q2
.endif
        vpop {q4-q7}
        bx lr
endfunc
.endm

vp8_v_loop_filter16
vp8_v_loop_filter16 _inner, inner=1
vp8_v_loop_filter16 _simple, simple=1
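
@ Chroma variant: u (r0) and v (r1) rows are 8 pixels each, so the two
@ planes are packed side by side into the same q registers and share one
@ 16-wide filter pass.  r2 = stride, r3 = flim_E; flim_I and hev_thresh
@ come from the stack.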
.macro vp8_v_loop_filter8uv name, inner=0
function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
        vpush {q4-q7}
        sub r0, r0, r2, lsl #2
        sub r1, r1, r2, lsl #2
        ldr r12, [sp, #64] @ flim_I

        @ Load pixels:
        vld1.8 {d0}, [r0,:64], r2 @ P3
        vld1.8 {d1}, [r1,:64], r2 @ P3
        vld1.8 {d2}, [r0,:64], r2 @ P2
        vld1.8 {d3}, [r1,:64], r2 @ P2
        vld1.8 {d4}, [r0,:64], r2 @ P1
        vld1.8 {d5}, [r1,:64], r2 @ P1
        vld1.8 {d6}, [r0,:64], r2 @ P0
        vld1.8 {d7}, [r1,:64], r2 @ P0
        vld1.8 {d8}, [r0,:64], r2 @ Q0
        vld1.8 {d9}, [r1,:64], r2 @ Q0
        vld1.8 {d10}, [r0,:64], r2 @ Q1
        vld1.8 {d11}, [r1,:64], r2 @ Q1
        vld1.8 {d12}, [r0,:64], r2 @ Q2
        vld1.8 {d13}, [r1,:64], r2 @ Q2
        vld1.8 {d14}, [r0,:64] @ Q3
        vld1.8 {d15}, [r1,:64] @ Q3

        vdup.8 q14, r3 @ flim_E
        vdup.8 q15, r12 @ flim_I
        ldr r12, [sp, #68] @ hev_thresh

        vp8_loop_filter inner=\inner

        @ back up to P2: u,v -= stride * 6
        sub r0, r0, r2, lsl #2
        sub r1, r1, r2, lsl #2
        sub r0, r0, r2, lsl #1
        sub r1, r1, r2, lsl #1

        @ Store pixels:
        vst1.8 {d2}, [r0,:64], r2 @ P2
        vst1.8 {d3}, [r1,:64], r2 @ P2
        vst1.8 {d4}, [r0,:64], r2 @ P1
        vst1.8 {d5}, [r1,:64], r2 @ P1
        vst1.8 {d6}, [r0,:64], r2 @ P0
        vst1.8 {d7}, [r1,:64], r2 @ P0
        vst1.8 {d8}, [r0,:64], r2 @ Q0
        vst1.8 {d9}, [r1,:64], r2 @ Q0
        vst1.8 {d10}, [r0,:64], r2 @ Q1
        vst1.8 {d11}, [r1,:64], r2 @ Q1
        vst1.8 {d12}, [r0,:64] @ Q2
        vst1.8 {d13}, [r1,:64] @ Q2

        vpop {q4-q7}
        bx lr
endfunc
.endm

vp8_v_loop_filter8uv
vp8_v_loop_filter8uv _inner, inner=1
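
@ Horizontal (vertical-edge) filters: 16 rows of 8 pixels straddling the
@ edge are loaded, transposed so each q register holds one pixel column
@ (P3..Q3), filtered as above, then transposed back and stored.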
.macro vp8_h_loop_filter16 name, inner=0, simple=0
function ff_vp8_h_loop_filter16\name\()_neon, export=1
        vpush {q4-q7}
        sub r0, r0, #4
.if !\simple
        ldr r12, [sp, #64] @ hev_thresh
.endif

        @ Load pixels:
        vld1.8 {d0}, [r0], r1 @ load first 8-line src data
        vld1.8 {d2}, [r0], r1
        vld1.8 {d4}, [r0], r1
        vld1.8 {d6}, [r0], r1
        vld1.8 {d8}, [r0], r1
        vld1.8 {d10}, [r0], r1
        vld1.8 {d12}, [r0], r1
        vld1.8 {d14}, [r0], r1
        vld1.8 {d1}, [r0], r1 @ load second 8-line src data
        vld1.8 {d3}, [r0], r1
        vld1.8 {d5}, [r0], r1
        vld1.8 {d7}, [r0], r1
        vld1.8 {d9}, [r0], r1
        vld1.8 {d11}, [r0], r1
        vld1.8 {d13}, [r0], r1
        vld1.8 {d15}, [r0], r1

        transpose8x16matrix

        vdup.8 q14, r2 @ flim_E
.if !\simple
        vdup.8 q15, r3 @ flim_I
.endif

        vp8_loop_filter inner=\inner, simple=\simple

        sub r0, r0, r1, lsl #4 @ backup 16 rows

        transpose8x16matrix

        @ Store pixels:
        vst1.8 {d0}, [r0], r1
        vst1.8 {d2}, [r0], r1
        vst1.8 {d4}, [r0], r1
        vst1.8 {d6}, [r0], r1
        vst1.8 {d8}, [r0], r1
        vst1.8 {d10}, [r0], r1
        vst1.8 {d12}, [r0], r1
        vst1.8 {d14}, [r0], r1
        vst1.8 {d1}, [r0], r1
        vst1.8 {d3}, [r0], r1
        vst1.8 {d5}, [r0], r1
        vst1.8 {d7}, [r0], r1
        vst1.8 {d9}, [r0], r1
        vst1.8 {d11}, [r0], r1
        vst1.8 {d13}, [r0], r1
        vst1.8 {d15}, [r0]

        vpop {q4-q7}
        bx lr
endfunc
.endm

vp8_h_loop_filter16
vp8_h_loop_filter16 _inner, inner=1
vp8_h_loop_filter16 _simple, simple=1

.macro vp8_h_loop_filter8uv name, inner=0
function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
        vpush {q4-q7}
        sub r0, r0, #4
        sub r1, r1, #4
        ldr r12, [sp, #64] @ flim_I

        @ Load pixels:
        vld1.8 {d0}, [r0], r2 @ load u
        vld1.8 {d1}, [r1], r2 @ load v
        vld1.8 {d2}, [r0], r2
        vld1.8 {d3}, [r1], r2
        vld1.8 {d4}, [r0], r2
        vld1.8 {d5}, [r1], r2
        vld1.8 {d6}, [r0], r2
        vld1.8 {d7}, [r1], r2
        vld1.8 {d8}, [r0], r2
        vld1.8 {d9}, [r1], r2
        vld1.8 {d10}, [r0], r2
        vld1.8 {d11}, [r1], r2
        vld1.8 {d12}, [r0], r2
        vld1.8 {d13}, [r1], r2
        vld1.8 {d14}, [r0], r2
        vld1.8 {d15}, [r1], r2

        transpose8x16matrix

        vdup.8 q14, r3 @ flim_E
        vdup.8 q15, r12 @ flim_I
        ldr r12, [sp, #68] @ hev_thresh

        vp8_loop_filter inner=\inner

        sub r0, r0, r2, lsl #3 @ backup u 8 rows
        sub r1, r1, r2, lsl #3 @ backup v 8 rows

        transpose8x16matrix

        @ Store pixels:
        vst1.8 {d0}, [r0], r2
        vst1.8 {d1}, [r1], r2
        vst1.8 {d2}, [r0], r2
        vst1.8 {d3}, [r1], r2
        vst1.8 {d4}, [r0], r2
        vst1.8 {d5}, [r1], r2
        vst1.8 {d6}, [r0], r2
        vst1.8 {d7}, [r1], r2
        vst1.8 {d8}, [r0], r2
        vst1.8 {d9}, [r1], r2
        vst1.8 {d10}, [r0], r2
        vst1.8 {d11}, [r1], r2
        vst1.8 {d12}, [r0], r2
        vst1.8 {d13}, [r1], r2
        vst1.8 {d14}, [r0]
        vst1.8 {d15}, [r1]

        vpop {q4-q7}
        bx lr
endfunc
.endm

vp8_h_loop_filter8uv
vp8_h_loop_filter8uv _inner, inner=1
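
@ Whole-pel copies: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
@ with h on the stack; four rows are copied per loop iteration.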
function ff_put_vp8_pixels16_neon, export=1
        ldr r12, [sp, #0] @ h
1:
        subs r12, r12, #4
        vld1.8 {q0}, [r2], r3
        vld1.8 {q1}, [r2], r3
        vld1.8 {q2}, [r2], r3
        vld1.8 {q3}, [r2], r3
        vst1.8 {q0}, [r0,:128], r1
        vst1.8 {q1}, [r0,:128], r1
        vst1.8 {q2}, [r0,:128], r1
        vst1.8 {q3}, [r0,:128], r1
        bgt 1b
        bx lr
endfunc

function ff_put_vp8_pixels8_neon, export=1
        ldr r12, [sp, #0] @ h
1:
        subs r12, r12, #4
        vld1.8 {d0}, [r2], r3
        vld1.8 {d1}, [r2], r3
        vld1.8 {d2}, [r2], r3
        vld1.8 {d3}, [r2], r3
        vst1.8 {d0}, [r0,:64], r1
        vst1.8 {d1}, [r0,:64], r1
        vst1.8 {d2}, [r0,:64], r1
        vst1.8 {d3}, [r0,:64], r1
        bgt 1b
        bx lr
endfunc

function ff_put_vp8_pixels4_neon, export=1
        ldr r12, [sp, #0] @ h
        push {r4-r6,lr}
1:
        subs r12, r12, #4
        ldr r4, [r2], r3
        ldr r5, [r2], r3
        ldr r6, [r2], r3
        ldr lr, [r2], r3
        str r4, [r0], r1
        str r5, [r0], r1
        str r6, [r0], r1
        str lr, [r0], r1
        bgt 1b
        pop {r4-r6,pc}
endfunc

/* 4/6-tap 8th-pel MC */
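@ Naming: h<N>, v<N>, h<N>v<M> = N-tap horizontal and/or M-tap vertical
@ filter.  mx/my (from the stack) index subpel_filters below, 16 bytes per
@ entry; the -16 bias on the movrel base works because these functions
@ appear to be used only for mx/my >= 1 (0 means whole-pel, handled by the
@ pixels functions above).  Taps 1 and 4 enter negatively (vmls), and the
@ accumulated sum is narrowed with a rounding >>7 (vqrshrun) since each
@ filter row sums to 128.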
.macro vp8_epel8_h6 d, a, b
        vext.8 d27, \a, \b, #1
        vmovl.u8 q8, \a
        vext.8 d28, \a, \b, #2
        vmovl.u8 q9, d27
        vext.8 d29, \a, \b, #3
        vmovl.u8 q10, d28
        vext.8 d30, \a, \b, #4
        vmovl.u8 q11, d29
        vext.8 d31, \a, \b, #5
        vmovl.u8 q12, d30
        vmul.u16 q10, q10, d0[2]
        vmovl.u8 q13, d31
        vmul.u16 q11, q11, d0[3]
        vmls.u16 q10, q9, d0[1]
        vmls.u16 q11, q12, d1[0]
        vmla.u16 q10, q8, d0[0]
        vmla.u16 q11, q13, d1[1]
        vqadd.s16 q11, q10, q11
        vqrshrun.s16 \d, q11, #7
.endm

.macro vp8_epel16_h6 d0, d1, s0, s1, s2, q0, q1
        vext.8 q14, \q0, \q1, #3
        vext.8 q15, \q0, \q1, #4
        vmovl.u8 q11, d28
        vmovl.u8 q14, d29
        vext.8 q3, \q0, \q1, #2
        vmovl.u8 q12, d30
        vmovl.u8 q15, d31
        vext.8 q8, \q0, \q1, #1
        vmovl.u8 q10, d6
        vmovl.u8 q3, d7
        vext.8 q2, \q0, \q1, #5
        vmovl.u8 q13, d4
        vmovl.u8 q2, d5
        vmovl.u8 q9, d16
        vmovl.u8 q8, d17
        vmul.u16 q11, q11, d0[3]
        vmul.u16 q10, q10, d0[2]
        vmul.u16 q3, q3, d0[2]
        vmul.u16 q14, q14, d0[3]
        vmls.u16 q11, q12, d1[0]
        vmovl.u8 q12, \s0
        vmovl.u8 q1, \s1
        vmls.u16 q10, q9, d0[1]
        vmls.u16 q3, q8, d0[1]
        vmls.u16 q14, q15, d1[0]
        vmla.u16 q10, q12, d0[0]
        vmla.u16 q11, q13, d1[1]
        vmla.u16 q3, q1, d0[0]
        vmla.u16 q14, q2, d1[1]
        vqadd.s16 q11, q10, q11
        vqadd.s16 q14, q3, q14
        vqrshrun.s16 \d0, q11, #7
        vqrshrun.s16 \d1, q14, #7
.endm

.macro vp8_epel8_v6 d0, s0, s1, s2, s3, s4, s5
        vmovl.u8 q10, \s2
        vmovl.u8 q11, \s3
        vmovl.u8 q9, \s1
        vmovl.u8 q12, \s4
        vmovl.u8 q8, \s0
        vmovl.u8 q13, \s5
        vmul.u16 q10, q10, d0[2]
        vmul.u16 q11, q11, d0[3]
        vmls.u16 q10, q9, d0[1]
        vmls.u16 q11, q12, d1[0]
        vmla.u16 q10, q8, d0[0]
        vmla.u16 q11, q13, d1[1]
        vqadd.s16 q11, q10, q11
        vqrshrun.s16 \d0, q11, #7
.endm

.macro vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6
        vmovl.u8 q10, \s0
        vmovl.u8 q11, \s3
        vmovl.u8 q14, \s6
        vmovl.u8 q9, \s1
        vmovl.u8 q12, \s4
        vmovl.u8 q8, \s2
        vmovl.u8 q13, \s5
        vmul.u16 q10, q10, d0[0]
        vmul.u16 q15, q11, d0[3]
        vmul.u16 q11, q11, d0[2]
        vmul.u16 q14, q14, d1[1]
        vmls.u16 q10, q9, d0[1]
        vmls.u16 q15, q12, d1[0]
        vmls.u16 q11, q8, d0[1]
        vmls.u16 q14, q13, d1[0]
        vmla.u16 q10, q8, d0[2]
        vmla.u16 q15, q13, d1[1]
        vmla.u16 q11, q9, d0[0]
        vmla.u16 q14, q12, d0[3]
        vqadd.s16 q15, q10, q15
        vqadd.s16 q14, q11, q14
        vqrshrun.s16 \d0, q15, #7
        vqrshrun.s16 \d1, q14, #7
.endm

.macro vp8_epel8_h4 d, a, b
        vext.8 d28, \a, \b, #1
        vmovl.u8 q9, \a
        vext.8 d29, \a, \b, #2
        vmovl.u8 q10, d28
        vext.8 d30, \a, \b, #3
        vmovl.u8 q11, d29
        vmovl.u8 q12, d30
        vmul.u16 q10, q10, d0[2]
        vmul.u16 q11, q11, d0[3]
        vmls.u16 q10, q9, d0[1]
        vmls.u16 q11, q12, d1[0]
        vqadd.s16 q11, q10, q11
        vqrshrun.s16 \d, q11, #7
.endm

.macro vp8_epel8_v4_y2 d0, d1, s0, s1, s2, s3, s4
        vmovl.u8 q9, \s0
        vmovl.u8 q10, \s1
        vmovl.u8 q11, \s2
        vmovl.u8 q12, \s3
        vmovl.u8 q13, \s4
        vmul.u16 q8, q10, d0[2]
        vmul.u16 q14, q11, d0[3]
        vmul.u16 q11, q11, d0[2]
        vmul.u16 q15, q12, d0[3]
        vmls.u16 q8, q9, d0[1]
        vmls.u16 q14, q12, d1[0]
        vmls.u16 q11, q10, d0[1]
        vmls.u16 q15, q13, d1[0]
        vqadd.s16 q8, q8, q14
        vqadd.s16 q11, q11, q15
        vqrshrun.s16 \d0, q8, #7
        vqrshrun.s16 \d1, q11, #7
.endm

function ff_put_vp8_epel16_v6_neon, export=1
        sub r2, r2, r3, lsl #1
        push {r4,lr}
        vpush {d8-d15}

        ldr r4, [sp, #80] @ my
        movrel lr, subpel_filters-16
        ldr r12, [sp, #72] @ h
        add r4, lr, r4, lsl #4
        vld1.16 {q0}, [r4,:128]
1:
        vld1.8 {d2-d3}, [r2], r3
        vld1.8 {d4-d5}, [r2], r3
        vld1.8 {d6-d7}, [r2], r3
        vld1.8 {d8-d9}, [r2], r3
        vld1.8 {d10-d11}, [r2], r3
        vld1.8 {d12-d13}, [r2], r3
        vld1.8 {d14-d15}, [r2]
        sub r2, r2, r3, lsl #2

        vp8_epel8_v6_y2 d2, d4, d2, d4, d6, d8, d10, d12, d14
        vp8_epel8_v6_y2 d3, d5, d3, d5, d7, d9, d11, d13, d15

        vst1.8 {d2-d3}, [r0,:128], r1
        vst1.8 {d4-d5}, [r0,:128], r1
        subs r12, r12, #2
        bne 1b

        vpop {d8-d15}
        pop {r4,pc}
endfunc

function ff_put_vp8_epel16_h6_neon, export=1
        sub r2, r2, #2
        push {r4,lr}

        ldr r4, [sp, #12] @ mx
        movrel lr, subpel_filters-16
        ldr r12, [sp, #8] @ h
        add r4, lr, r4, lsl #4
        vld1.16 {q0}, [r4,:128]
1:
        vld1.8 {d2-d4}, [r2], r3

        vp8_epel16_h6 d2, d3, d2, d3, d4, q1, q2

        vst1.8 {d2-d3}, [r0,:128], r1
        subs r12, r12, #1
        bne 1b

        pop {r4,pc}
endfunc
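
@ The two-pass (hNvM) functions filter horizontally first into an aligned
@ scratch buffer on the stack (h+5 rows for a 6-tap vertical pass, h+3 for
@ a 4-tap one; 16 bytes per row gives the #336 = 16*21 figure below), then
@ run the vertical filter from that buffer into dst.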
function ff_put_vp8_epel16_h6v6_neon, export=1
        sub r2, r2, r3, lsl #1
        sub r2, r2, #2
        push {r4,lr}
        vpush {d8-d9}

        @ first pass (horizontal):
        ldr r4, [sp, #28] @ mx
        movrel lr, subpel_filters-16
        ldr r12, [sp, #24] @ h
        add r4, lr, r4, lsl #4
        sub sp, sp, #336+16
        vld1.16 {q0}, [r4,:128]
        add lr, sp, #15
        add r12, r12, #5
        bic lr, lr, #15
1:
        vld1.8 {d2,d3,d4}, [r2], r3

        vp8_epel16_h6 d2, d3, d2, d3, d4, q1, q2

        vst1.8 {d2-d3}, [lr,:128]!
        subs r12, r12, #1
        bne 1b

        @ second pass (vertical):
        ldr r4, [sp, #336+16+32] @ my
        movrel lr, subpel_filters-16
        ldr r12, [sp, #336+16+24] @ h
        add r4, lr, r4, lsl #4
        add lr, sp, #15
        vld1.16 {q0}, [r4,:128]
        bic lr, lr, #15
2:
        vld1.8 {d2-d5}, [lr,:128]!
        vld1.8 {d6-d9}, [lr,:128]!
        vld1.8 {d28-d31}, [lr,:128]
        sub lr, lr, #48

        vp8_epel8_v6 d2, d2, d4, d6, d8, d28, d30
        vp8_epel8_v6 d3, d3, d5, d7, d9, d29, d31

        vst1.8 {d2-d3}, [r0,:128], r1
        subs r12, r12, #1
        bne 2b

        add sp, sp, #336+16
        vpop {d8-d9}
        pop {r4,pc}
endfunc

function ff_put_vp8_epel8_v6_neon, export=1
        sub r2, r2, r3, lsl #1
        push {r4,lr}

        ldr r4, [sp, #16] @ my
        movrel lr, subpel_filters-16
        ldr r12, [sp, #8] @ h
        add r4, lr, r4, lsl #4
        vld1.16 {q0}, [r4,:128]
1:
        vld1.8 {d2}, [r2], r3
        vld1.8 {d3}, [r2], r3
        vld1.8 {d4}, [r2], r3
        vld1.8 {d5}, [r2], r3
        vld1.8 {d6}, [r2], r3
        vld1.8 {d7}, [r2], r3
        vld1.8 {d28}, [r2]
        sub r2, r2, r3, lsl #2

        vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d28

        vst1.8 {d2}, [r0,:64], r1
        vst1.8 {d3}, [r0,:64], r1
        subs r12, r12, #2
        bne 1b

        pop {r4,pc}
endfunc

function ff_put_vp8_epel8_h6_neon, export=1
        sub r2, r2, #2
        push {r4,lr}

        ldr r4, [sp, #12] @ mx
        movrel lr, subpel_filters-16
        ldr r12, [sp, #8] @ h
        add r4, lr, r4, lsl #4
        vld1.16 {q0}, [r4,:128]
1:
        vld1.8 {d2,d3}, [r2], r3

        vp8_epel8_h6 d2, d2, d3

        vst1.8 {d2}, [r0,:64], r1
        subs r12, r12, #1
        bne 1b

        pop {r4,pc}
endfunc

function ff_put_vp8_epel8_h6v6_neon, export=1
        sub r2, r2, r3, lsl #1
        sub r2, r2, #2
        push {r4,lr}

        @ first pass (horizontal):
        ldr r4, [sp, #12] @ mx
        movrel lr, subpel_filters-16
        ldr r12, [sp, #8] @ h
        add r4, lr, r4, lsl #4
        sub sp, sp, #168+16
        vld1.16 {q0}, [r4,:128]
        add lr, sp, #15
        add r12, r12, #5
        bic lr, lr, #15
1:
        vld1.8 {d2,d3}, [r2], r3

        vp8_epel8_h6 d2, d2, d3

        vst1.8 {d2}, [lr,:64]!
        subs r12, r12, #1
        bne 1b

        @ second pass (vertical):
        ldr r4, [sp, #168+16+16] @ my
        movrel lr, subpel_filters-16
        ldr r12, [sp, #168+16+8] @ h
        add r4, lr, r4, lsl #4
        add lr, sp, #15
        vld1.16 {q0}, [r4,:128]
        bic lr, lr, #15
2:
        vld1.8 {d2-d5}, [lr,:128]!
        vld1.8 {d6-d7}, [lr,:128]!
        vld1.8 {d30}, [lr,:64]
        sub lr, lr, #32

        vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d30

        vst1.8 {d2}, [r0,:64], r1
        vst1.8 {d3}, [r0,:64], r1
        subs r12, r12, #2
        bne 2b

        add sp, sp, #168+16
        pop {r4,pc}
endfunc

function ff_put_vp8_epel8_v4_neon, export=1
        sub r2, r2, r3
        push {r4,lr}

        ldr r4, [sp, #16] @ my
        movrel lr, subpel_filters-16
        ldr r12, [sp, #8] @ h
        add r4, lr, r4, lsl #4
        vld1.16 {q0}, [r4,:128]
1:
        vld1.8 {d2}, [r2], r3
        vld1.8 {d3}, [r2], r3
        vld1.8 {d4}, [r2], r3
        vld1.8 {d5}, [r2], r3
        vld1.8 {d6}, [r2]
        sub r2, r2, r3, lsl #1

        vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6

        vst1.8 {d2}, [r0,:64], r1
        vst1.8 {d3}, [r0,:64], r1
        subs r12, r12, #2
        bne 1b

        pop {r4,pc}
endfunc

function ff_put_vp8_epel8_h4_neon, export=1
        sub r2, r2, #1
        push {r4,lr}

        ldr r4, [sp, #12] @ mx
        movrel lr, subpel_filters-16
        ldr r12, [sp, #8] @ h
        add r4, lr, r4, lsl #4
        vld1.16 {q0}, [r4,:128]
1:
        vld1.8 {d2,d3}, [r2], r3

        vp8_epel8_h4 d2, d2, d3

        vst1.8 {d2}, [r0,:64], r1
        subs r12, r12, #1
        bne 1b

        pop {r4,pc}
endfunc

function ff_put_vp8_epel8_h4v4_neon, export=1
        sub r2, r2, r3
        sub r2, r2, #1
        push {r4,lr}

        @ first pass (horizontal):
        ldr r4, [sp, #12] @ mx
        movrel lr, subpel_filters-16
        ldr r12, [sp, #8] @ h
        add r4, lr, r4, lsl #4
        sub sp, sp, #168+16
        vld1.16 {q0}, [r4,:128]
        add lr, sp, #15
        add r12, r12, #3
        bic lr, lr, #15
1:
        vld1.8 {d2,d3}, [r2], r3

        vp8_epel8_h4 d2, d2, d3

        vst1.8 {d2}, [lr,:64]!
        subs r12, r12, #1
        bne 1b

        @ second pass (vertical):
        ldr r4, [sp, #168+16+16] @ my
        movrel lr, subpel_filters-16
        ldr r12, [sp, #168+16+8] @ h
        add r4, lr, r4, lsl #4
        add lr, sp, #15
        vld1.16 {q0}, [r4,:128]
        bic lr, lr, #15
2:
        vld1.8 {d2-d5}, [lr,:128]!
        vld1.8 {d6}, [lr,:64]
        sub lr, lr, #16

        vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6

        vst1.8 {d2}, [r0,:64], r1
        vst1.8 {d3}, [r0,:64], r1
        subs r12, r12, #2
        bne 2b

        add sp, sp, #168+16
        pop {r4,pc}
endfunc

function ff_put_vp8_epel8_h6v4_neon, export=1
        sub r2, r2, r3
        sub r2, r2, #2
        push {r4,lr}

        @ first pass (horizontal):
        ldr r4, [sp, #12] @ mx
        movrel lr, subpel_filters-16
        ldr r12, [sp, #8] @ h
        add r4, lr, r4, lsl #4
        sub sp, sp, #168+16
        vld1.16 {q0}, [r4,:128]
        add lr, sp, #15
        add r12, r12, #3
        bic lr, lr, #15
1:
        vld1.8 {d2,d3}, [r2], r3

        vp8_epel8_h6 d2, d2, d3

        vst1.8 {d2}, [lr,:64]!
        subs r12, r12, #1
        bne 1b

        @ second pass (vertical):
        ldr r4, [sp, #168+16+16] @ my
        movrel lr, subpel_filters-16
        ldr r12, [sp, #168+16+8] @ h
        add r4, lr, r4, lsl #4
        add lr, sp, #15
        vld1.16 {q0}, [r4,:128]
        bic lr, lr, #15
2:
        vld1.8 {d2-d5}, [lr,:128]!
        vld1.8 {d6}, [lr,:64]
        sub lr, lr, #16

        vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6

        vst1.8 {d2}, [r0,:64], r1
        vst1.8 {d3}, [r0,:64], r1
        subs r12, r12, #2
        bne 2b

        add sp, sp, #168+16
        pop {r4,pc}
endfunc

function ff_put_vp8_epel8_h4v6_neon, export=1
        sub r2, r2, r3, lsl #1
        sub r2, r2, #1
        push {r4,lr}

        @ first pass (horizontal):
        ldr r4, [sp, #12] @ mx
        movrel lr, subpel_filters-16
        ldr r12, [sp, #8] @ h
        add r4, lr, r4, lsl #4
        sub sp, sp, #168+16
        vld1.16 {q0}, [r4,:128]
        add lr, sp, #15
        add r12, r12, #5
        bic lr, lr, #15
1:
        vld1.8 {d2,d3}, [r2], r3

        vp8_epel8_h4 d2, d2, d3

        vst1.8 {d2}, [lr,:64]!
        subs r12, r12, #1
        bne 1b

        @ second pass (vertical):
        ldr r4, [sp, #168+16+16] @ my
        movrel lr, subpel_filters-16
        ldr r12, [sp, #168+16+8] @ h
        add r4, lr, r4, lsl #4
        add lr, sp, #15
        vld1.16 {q0}, [r4,:128]
        bic lr, lr, #15
2:
        vld1.8 {d2-d5}, [lr,:128]!
        vld1.8 {d6-d7}, [lr,:128]!
        vld1.8 {d30}, [lr,:64]
        sub lr, lr, #32

        vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d30

        vst1.8 {d2}, [r0,:64], r1
        vst1.8 {d3}, [r0,:64], r1
        subs r12, r12, #2
        bne 2b

        add sp, sp, #168+16
        pop {r4,pc}
endfunc

.ltorg
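
@ In the 4-pixel-wide vertical and two-pass variants below, two 4-pixel
@ rows are packed into the two lanes of each d register (note the d<n>[] /
@ d<n>[1] loads and the vtrn.32 shuffles), so the 8-wide filter macros
@ produce four output rows per pass.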
function ff_put_vp8_epel4_v6_neon, export=1
        sub r2, r2, r3, lsl #1
        push {r4,lr}

        ldr r4, [sp, #16] @ my
        movrel lr, subpel_filters-16
        ldr r12, [sp, #8] @ h
        add r4, lr, r4, lsl #4
        vld1.16 {q0}, [r4,:128]
1:
        vld1.32 {d2[]}, [r2], r3
        vld1.32 {d3[]}, [r2], r3
        vld1.32 {d4[]}, [r2], r3
        vld1.32 {d5[]}, [r2], r3
        vld1.32 {d6[]}, [r2], r3
        vld1.32 {d7[]}, [r2], r3
        vld1.32 {d28[]}, [r2]
        sub r2, r2, r3, lsl #2
        vld1.32 {d2[1]}, [r2], r3
        vld1.32 {d3[1]}, [r2], r3
        vld1.32 {d4[1]}, [r2], r3
        vld1.32 {d5[1]}, [r2], r3
        vld1.32 {d6[1]}, [r2], r3
        vld1.32 {d7[1]}, [r2], r3
        vld1.32 {d28[1]}, [r2]
        sub r2, r2, r3, lsl #2

        vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d28

        vst1.32 {d2[0]}, [r0,:32], r1
        vst1.32 {d3[0]}, [r0,:32], r1
        vst1.32 {d2[1]}, [r0,:32], r1
        vst1.32 {d3[1]}, [r0,:32], r1
        subs r12, r12, #4
        bne 1b

        pop {r4,pc}
endfunc

function ff_put_vp8_epel4_h6_neon, export=1
        sub r2, r2, #2
        push {r4,lr}

        ldr r4, [sp, #12] @ mx
        movrel lr, subpel_filters-16
        ldr r12, [sp, #8] @ h
        add r4, lr, r4, lsl #4
        vld1.16 {q0}, [r4,:128]
1:
        vld1.8 {q1}, [r2], r3
        vp8_epel8_h6 d2, d2, d3
        vst1.32 {d2[0]}, [r0,:32], r1
        subs r12, r12, #1
        bne 1b

        pop {r4,pc}
endfunc

function ff_put_vp8_epel4_h6v6_neon, export=1
        sub r2, r2, r3, lsl #1
        sub r2, r2, #2
        push {r4,lr}

        ldr r4, [sp, #12] @ mx
        movrel lr, subpel_filters-16
        ldr r12, [sp, #8] @ h
        add r4, lr, r4, lsl #4
        sub sp, sp, #52+16
        vld1.16 {q0}, [r4,:128]
        add lr, sp, #15
        add r12, r12, #5
        bic lr, lr, #15
1:
        vld1.8 {q1}, [r2], r3
        vp8_epel8_h6 d2, d2, d3
        vst1.32 {d2[0]}, [lr,:32]!
        subs r12, r12, #1
        bne 1b

        ldr r4, [sp, #52+16+16] @ my
        movrel lr, subpel_filters-16
        ldr r12, [sp, #52+16+8] @ h
        add r4, lr, r4, lsl #4
        add lr, sp, #15
        vld1.16 {q0}, [r4,:128]
        bic lr, lr, #15
2:
        vld1.8 {d2-d3}, [lr,:128]!
        vld1.8 {d6}, [lr,:64]!
        vld1.32 {d28[]}, [lr,:32]
        sub lr, lr, #16
        vld1.8 {d4-d5}, [lr]!
        vld1.8 {d7}, [lr,:64]!
        vld1.32 {d28[1]}, [lr,:32]
        sub lr, lr, #16
        vtrn.32 q1, q2
        vtrn.32 d6, d7
        vp8_epel8_v6_y2 d2, d3, d2, d4, d3, d5, d6, d7, d28
        vst1.32 {d2[0]}, [r0,:32], r1
        vst1.32 {d3[0]}, [r0,:32], r1
        vst1.32 {d2[1]}, [r0,:32], r1
        vst1.32 {d3[1]}, [r0,:32], r1
        subs r12, r12, #4
        bne 2b

        add sp, sp, #52+16
        pop {r4,pc}
endfunc

function ff_put_vp8_epel4_h4v6_neon, export=1
        sub r2, r2, r3, lsl #1
        sub r2, r2, #1
        push {r4,lr}

        ldr r4, [sp, #12] @ mx
        movrel lr, subpel_filters-16
        ldr r12, [sp, #8] @ h
        add r4, lr, r4, lsl #4
        sub sp, sp, #52+16
        vld1.16 {q0}, [r4,:128]
        add lr, sp, #15
        add r12, r12, #5
        bic lr, lr, #15
1:
        vld1.8 {d2}, [r2], r3
        vp8_epel8_h4 d2, d2, d2
        vst1.32 {d2[0]}, [lr,:32]!
        subs r12, r12, #1
        bne 1b

        ldr r4, [sp, #52+16+16] @ my
        movrel lr, subpel_filters-16
        ldr r12, [sp, #52+16+8] @ h
        add r4, lr, r4, lsl #4
        add lr, sp, #15
        vld1.16 {q0}, [r4,:128]
        bic lr, lr, #15
2:
        vld1.8 {d2-d3}, [lr,:128]!
        vld1.8 {d6}, [lr,:64]!
        vld1.32 {d28[]}, [lr,:32]
        sub lr, lr, #16
        vld1.8 {d4-d5}, [lr]!
        vld1.8 {d7}, [lr,:64]!
        vld1.32 {d28[1]}, [lr,:32]
        sub lr, lr, #16
        vtrn.32 q1, q2
        vtrn.32 d6, d7
        vp8_epel8_v6_y2 d2, d3, d2, d4, d3, d5, d6, d7, d28
        vst1.32 {d2[0]}, [r0,:32], r1
        vst1.32 {d3[0]}, [r0,:32], r1
        vst1.32 {d2[1]}, [r0,:32], r1
        vst1.32 {d3[1]}, [r0,:32], r1
        subs r12, r12, #4
        bne 2b

        add sp, sp, #52+16
        pop {r4,pc}
endfunc

function ff_put_vp8_epel4_h6v4_neon, export=1
        sub r2, r2, r3
        sub r2, r2, #2
        push {r4,lr}

        ldr r4, [sp, #12] @ mx
        movrel lr, subpel_filters-16
        ldr r12, [sp, #8] @ h
        add r4, lr, r4, lsl #4
        sub sp, sp, #44+16
        vld1.16 {q0}, [r4,:128]
        add lr, sp, #15
        add r12, r12, #3
        bic lr, lr, #15
1:
        vld1.8 {q1}, [r2], r3
        vp8_epel8_h6 d2, d2, d3
        vst1.32 {d2[0]}, [lr,:32]!
        subs r12, r12, #1
        bne 1b

        ldr r4, [sp, #44+16+16] @ my
        movrel lr, subpel_filters-16
        ldr r12, [sp, #44+16+8] @ h
        add r4, lr, r4, lsl #4
        add lr, sp, #15
        vld1.16 {q0}, [r4,:128]
        bic lr, lr, #15
2:
        vld1.8 {d2-d3}, [lr,:128]!
        vld1.32 {d6[]}, [lr,:32]
        sub lr, lr, #8
        vld1.8 {d4-d5}, [lr]!
        vld1.32 {d6[1]}, [lr,:32]
        sub lr, lr, #8
        vtrn.32 q1, q2
        vp8_epel8_v4_y2 d2, d3, d2, d4, d3, d5, d6
        vst1.32 {d2[0]}, [r0,:32], r1
        vst1.32 {d3[0]}, [r0,:32], r1
        vst1.32 {d2[1]}, [r0,:32], r1
        vst1.32 {d3[1]}, [r0,:32], r1
        subs r12, r12, #4
        bne 2b

        add sp, sp, #44+16
        pop {r4,pc}
endfunc

function ff_put_vp8_epel4_h4_neon, export=1
        sub r2, r2, #1
        push {r4,lr}

        ldr r4, [sp, #12] @ mx
        movrel lr, subpel_filters-16
        ldr r12, [sp, #8] @ h
        add r4, lr, r4, lsl #4
        vld1.16 {q0}, [r4,:128]
1:
        vld1.8 {d2}, [r2], r3
        vp8_epel8_h4 d2, d2, d2
        vst1.32 {d2[0]}, [r0,:32], r1
        subs r12, r12, #1
        bne 1b

        pop {r4,pc}
endfunc

function ff_put_vp8_epel4_v4_neon, export=1
        sub r2, r2, r3
        push {r4,lr}

        ldr r4, [sp, #16] @ my
        movrel lr, subpel_filters-16
        ldr r12, [sp, #8] @ h
        add r4, lr, r4, lsl #4
        vld1.16 {q0}, [r4,:128]
1:
        vld1.32 {d2[]}, [r2], r3
        vld1.32 {d3[]}, [r2], r3
        vld1.32 {d4[]}, [r2], r3
        vld1.32 {d5[]}, [r2], r3
        vld1.32 {d6[]}, [r2]
        sub r2, r2, r3, lsl #1
        vld1.32 {d2[1]}, [r2], r3
        vld1.32 {d3[1]}, [r2], r3
        vld1.32 {d4[1]}, [r2], r3
        vld1.32 {d5[1]}, [r2], r3
        vld1.32 {d6[1]}, [r2]
        sub r2, r2, r3, lsl #1

        vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6

        vst1.32 {d2[0]}, [r0,:32], r1
        vst1.32 {d3[0]}, [r0,:32], r1
        vst1.32 {d2[1]}, [r0,:32], r1
        vst1.32 {d3[1]}, [r0,:32], r1
        subs r12, r12, #4
        bne 1b

        pop {r4,pc}
endfunc

function ff_put_vp8_epel4_h4v4_neon, export=1
        sub r2, r2, r3
        sub r2, r2, #1
        push {r4,lr}

        ldr r4, [sp, #12] @ mx
        movrel lr, subpel_filters-16
        ldr r12, [sp, #8] @ h
        add r4, lr, r4, lsl #4
        sub sp, sp, #44+16
        vld1.16 {q0}, [r4,:128]
        add lr, sp, #15
        add r12, r12, #3
        bic lr, lr, #15
1:
        vld1.8 {d2}, [r2], r3
        vp8_epel8_h4 d2, d2, d3
        vst1.32 {d2[0]}, [lr,:32]!
        subs r12, r12, #1
        bne 1b

        ldr r4, [sp, #44+16+16] @ my
        movrel lr, subpel_filters-16
        ldr r12, [sp, #44+16+8] @ h
        add r4, lr, r4, lsl #4
        add lr, sp, #15
        vld1.16 {q0}, [r4,:128]
        bic lr, lr, #15
2:
        vld1.8 {d2-d3}, [lr,:128]!
        vld1.32 {d6[]}, [lr,:32]
        sub lr, lr, #8
        vld1.8 {d4-d5}, [lr]!
        vld1.32 {d6[1]}, [lr,:32]
        sub lr, lr, #8
        vtrn.32 q1, q2
        vp8_epel8_v4_y2 d2, d3, d2, d4, d3, d5, d6
        vst1.32 {d2[0]}, [r0,:32], r1
        vst1.32 {d3[0]}, [r0,:32], r1
        vst1.32 {d2[1]}, [r0,:32], r1
        vst1.32 {d3[1]}, [r0,:32], r1
        subs r12, r12, #4
        bne 2b

        add sp, sp, #44+16
        pop {r4,pc}
endfunc

@ note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit
@ arithmetic can be used to apply filters
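@ (with the vmls taps negated each filter row sums to 128, and
@ 128 * 255 = 0x7f80 = 32640, just inside the int16_t range)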
const subpel_filters, align=4
        .short 0, 6, 123, 12, 1, 0, 0, 0
        .short 2, 11, 108, 36, 8, 1, 0, 0
        .short 0, 9, 93, 50, 6, 0, 0, 0
        .short 3, 16, 77, 77, 16, 3, 0, 0
        .short 0, 6, 50, 93, 9, 0, 0, 0
        .short 1, 8, 36, 108, 11, 2, 0, 0
        .short 0, 1, 12, 123, 6, 0, 0, 0
endconst

/* Bilinear MC */
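@ With mx,my in 0..7 each output pixel is ((8-frac)*a + frac*b + 4) >> 3
@ (vmull/vmlal then vrshrn #3); the h, v and hv variants differ only in
@ which neighbours feed the blend, hv doing the two blends in sequence.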
function ff_put_vp8_bilin16_h_neon, export=1
        ldr r3, [sp, #4] @ mx
        rsb r12, r3, #8
        vdup.8 d0, r3
        vdup.8 d1, r12
        ldr r12, [sp] @ h
1:
        subs r12, r12, #2
        vld1.8 {d2-d4}, [r2], r1
        vext.8 q2, q1, q2, #1
        vmull.u8 q8, d2, d1
        vmlal.u8 q8, d4, d0
        vld1.8 {d18-d20}, [r2], r1
        vmull.u8 q3, d3, d1
        vmlal.u8 q3, d5, d0
        vext.8 q10, q9, q10, #1
        vmull.u8 q11, d18, d1
        vmlal.u8 q11, d20, d0
        vmull.u8 q12, d19, d1
        vmlal.u8 q12, d21, d0
        vrshrn.u16 d4, q8, #3
        vrshrn.u16 d5, q3, #3
        vrshrn.u16 d6, q11, #3
        vrshrn.u16 d7, q12, #3
        vst1.8 {q2}, [r0,:128], r1
        vst1.8 {q3}, [r0,:128], r1
        bgt 1b

        bx lr
endfunc

function ff_put_vp8_bilin16_v_neon, export=1
        ldr r3, [sp, #8] @ my
        rsb r12, r3, #8
        vdup.8 d0, r3
        vdup.8 d1, r12
        ldr r12, [sp] @ h

        vld1.8 {q1}, [r2], r1
1:
        subs r12, r12, #2
        vld1.8 {q2}, [r2], r1
        vmull.u8 q3, d2, d1
        vmlal.u8 q3, d4, d0
        vmull.u8 q8, d3, d1
        vmlal.u8 q8, d5, d0
        vld1.8 {q1}, [r2], r1
        vmull.u8 q9, d4, d1
        vmlal.u8 q9, d2, d0
        vmull.u8 q10, d5, d1
        vmlal.u8 q10, d3, d0
        vrshrn.u16 d4, q3, #3
        vrshrn.u16 d5, q8, #3
        vrshrn.u16 d6, q9, #3
        vrshrn.u16 d7, q10, #3
        vst1.8 {q2}, [r0,:128], r1
        vst1.8 {q3}, [r0,:128], r1
        bgt 1b

        bx lr
endfunc

function ff_put_vp8_bilin16_hv_neon, export=1
        ldr r3, [sp, #4] @ mx
        rsb r12, r3, #8
        vdup.8 d0, r3
        vdup.8 d1, r12
        ldr r3, [sp, #8] @ my
        rsb r12, r3, #8
        vdup.8 d2, r3
        vdup.8 d3, r12
        ldr r12, [sp] @ h

        vld1.8 {d4-d6}, [r2], r1
        vext.8 q3, q2, q3, #1
        vmull.u8 q8, d4, d1
        vmlal.u8 q8, d6, d0
        vmull.u8 q9, d5, d1
        vmlal.u8 q9, d7, d0
        vrshrn.u16 d4, q8, #3
        vrshrn.u16 d5, q9, #3
1:
        subs r12, r12, #2
        vld1.8 {d18-d20}, [r2], r1
        vext.8 q10, q9, q10, #1
        vmull.u8 q11, d18, d1
        vmlal.u8 q11, d20, d0
        vld1.8 {d26-d28}, [r2], r1
        vmull.u8 q12, d19, d1
        vmlal.u8 q12, d21, d0
        vext.8 q14, q13, q14, #1
        vmull.u8 q8, d26, d1
        vmlal.u8 q8, d28, d0
        vmull.u8 q9, d27, d1
        vmlal.u8 q9, d29, d0
        vrshrn.u16 d6, q11, #3
        vrshrn.u16 d7, q12, #3
        vmull.u8 q12, d4, d3
        vmlal.u8 q12, d6, d2
        vmull.u8 q15, d5, d3
        vmlal.u8 q15, d7, d2
        vrshrn.u16 d4, q8, #3
        vrshrn.u16 d5, q9, #3
        vmull.u8 q10, d6, d3
        vmlal.u8 q10, d4, d2
        vmull.u8 q11, d7, d3
        vmlal.u8 q11, d5, d2
        vrshrn.u16 d24, q12, #3
        vrshrn.u16 d25, q15, #3
        vst1.8 {q12}, [r0,:128], r1
        vrshrn.u16 d20, q10, #3
        vrshrn.u16 d21, q11, #3
        vst1.8 {q10}, [r0,:128], r1
        bgt 1b

        bx lr
endfunc

function ff_put_vp8_bilin8_h_neon, export=1
        ldr r3, [sp, #4] @ mx
        rsb r12, r3, #8
        vdup.8 d0, r3
        vdup.8 d1, r12
        ldr r12, [sp] @ h
1:
        subs r12, r12, #2
        vld1.8 {q1}, [r2], r1
        vext.8 d3, d2, d3, #1
        vmull.u8 q2, d2, d1
        vmlal.u8 q2, d3, d0
        vld1.8 {q3}, [r2], r1
        vext.8 d7, d6, d7, #1
        vmull.u8 q8, d6, d1
        vmlal.u8 q8, d7, d0
        vrshrn.u16 d4, q2, #3
        vrshrn.u16 d16, q8, #3
        vst1.8 {d4}, [r0,:64], r1
        vst1.8 {d16}, [r0,:64], r1
        bgt 1b

        bx lr
endfunc

function ff_put_vp8_bilin8_v_neon, export=1
        ldr r3, [sp, #8] @ my
        rsb r12, r3, #8
        vdup.8 d0, r3
        vdup.8 d1, r12
        ldr r12, [sp] @ h

        vld1.8 {d2}, [r2], r1
1:
        subs r12, r12, #2
        vld1.8 {d3}, [r2], r1
        vmull.u8 q2, d2, d1
        vmlal.u8 q2, d3, d0
        vld1.8 {d2}, [r2], r1
        vmull.u8 q3, d3, d1
        vmlal.u8 q3, d2, d0
        vrshrn.u16 d4, q2, #3
        vrshrn.u16 d6, q3, #3
        vst1.8 {d4}, [r0,:64], r1
        vst1.8 {d6}, [r0,:64], r1
        bgt 1b

        bx lr
endfunc

function ff_put_vp8_bilin8_hv_neon, export=1
        ldr r3, [sp, #4] @ mx
        rsb r12, r3, #8
        vdup.8 d0, r3
        vdup.8 d1, r12
        ldr r3, [sp, #8] @ my
        rsb r12, r3, #8
        vdup.8 d2, r3
        vdup.8 d3, r12
        ldr r12, [sp] @ h

        vld1.8 {q2}, [r2], r1
        vext.8 d5, d4, d5, #1
        vmull.u8 q9, d4, d1
        vmlal.u8 q9, d5, d0
        vrshrn.u16 d22, q9, #3
1:
        subs r12, r12, #2
        vld1.8 {q3}, [r2], r1
        vext.8 d7, d6, d7, #1
        vmull.u8 q8, d6, d1
        vmlal.u8 q8, d7, d0
        vld1.8 {q2}, [r2], r1
        vext.8 d5, d4, d5, #1
        vmull.u8 q9, d4, d1
        vmlal.u8 q9, d5, d0
        vrshrn.u16 d16, q8, #3
        vmull.u8 q10, d22, d3
        vmlal.u8 q10, d16, d2
        vrshrn.u16 d22, q9, #3
        vmull.u8 q12, d16, d3
        vmlal.u8 q12, d22, d2
        vrshrn.u16 d20, q10, #3
        vst1.8 {d20}, [r0,:64], r1
        vrshrn.u16 d23, q12, #3
        vst1.8 {d23}, [r0,:64], r1
        bgt 1b

        bx lr
endfunc

function ff_put_vp8_bilin4_h_neon, export=1
        ldr r3, [sp, #4] @ mx
        rsb r12, r3, #8
        vdup.8 d0, r3
        vdup.8 d1, r12
        ldr r12, [sp] @ h
1:
        subs r12, r12, #2
        vld1.8 {d2}, [r2], r1
        vext.8 d3, d2, d3, #1
        vld1.8 {d6}, [r2], r1
        vext.8 d7, d6, d7, #1
        vtrn.32 q1, q3
        vmull.u8 q2, d2, d1
        vmlal.u8 q2, d3, d0
        vrshrn.u16 d4, q2, #3
        vst1.32 {d4[0]}, [r0,:32], r1
        vst1.32 {d4[1]}, [r0,:32], r1
        bgt 1b

        bx lr
endfunc

function ff_put_vp8_bilin4_v_neon, export=1
        ldr r3, [sp, #8] @ my
        rsb r12, r3, #8
        vdup.8 d0, r3
        vdup.8 d1, r12
        ldr r12, [sp] @ h

        vld1.32 {d2[]}, [r2], r1
1:
        vld1.32 {d3[]}, [r2]
        vld1.32 {d2[1]}, [r2], r1
        vld1.32 {d3[1]}, [r2], r1
        vmull.u8 q2, d2, d1
        vmlal.u8 q2, d3, d0
        vtrn.32 d3, d2
        vrshrn.u16 d4, q2, #3
        vst1.32 {d4[0]}, [r0,:32], r1
        vst1.32 {d4[1]}, [r0,:32], r1
        subs r12, r12, #2
        bgt 1b

        bx lr
endfunc

function ff_put_vp8_bilin4_hv_neon, export=1
        ldr r3, [sp, #4] @ mx
        rsb r12, r3, #8
        vdup.8 d0, r3
        vdup.8 d1, r12
        ldr r3, [sp, #8] @ my
        rsb r12, r3, #8
        vdup.8 d2, r3
        vdup.8 d3, r12
        ldr r12, [sp] @ h

        vld1.8 {d4}, [r2], r1
        vext.8 d5, d4, d4, #1
        vmull.u8 q9, d4, d1
        vmlal.u8 q9, d5, d0
        vrshrn.u16 d22, q9, #3
1:
        subs r12, r12, #2
        vld1.8 {d6}, [r2], r1
        vext.8 d7, d6, d6, #1
        vld1.8 {d4}, [r2], r1
        vext.8 d5, d4, d4, #1
        vtrn.32 q3, q2
        vmull.u8 q8, d6, d1
        vmlal.u8 q8, d7, d0
        vrshrn.u16 d16, q8, #3
        vmull.u8 q10, d16, d2
        vtrn.32 d22, d16
        vmlal.u8 q10, d22, d3
        vrev64.32 d22, d16
        vrshrn.u16 d20, q10, #3
        vst1.32 {d20[0]}, [r0,:32], r1
        vst1.32 {d20[1]}, [r0,:32], r1
        bgt 1b

        bx lr
endfunc