/*
 * VP8 NEON optimisations
 *
 * Copyright (c) 2010 Rob Clark <rob@ti.com>
 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"
#include "neon.S"
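
@ Inverse Walsh-Hadamard transform of the luma DC coefficients.
@ Register assignments below are inferred from the code and match the
@ vp8dsp.h prototype: r0 = int16_t block[4][4][16] (output), r1 =
@ int16_t dc[16] (input, zeroed on return).  Each of the 16 results is
@ stored to the DC slot of one 32-byte sub-block, hence the stride-32
@ element stores.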
function ff_vp8_luma_dc_wht_neon, export=1
        vld1.16 {q0-q1}, [r1,:128]
        vmov.i16 q15, #0
        vadd.i16 d4, d0, d3
        vadd.i16 d6, d1, d2
        vst1.16 {q15}, [r1,:128]!
        vsub.i16 d7, d1, d2
        vsub.i16 d5, d0, d3
        vst1.16 {q15}, [r1,:128]
        vadd.i16 q0, q2, q3
        vsub.i16 q1, q2, q3
        vmov.i16 q8, #3
        vtrn.32 d0, d2
        vtrn.32 d1, d3
        vtrn.16 d0, d1
        vtrn.16 d2, d3
        vadd.i16 d0, d0, d16
        vadd.i16 d4, d0, d3
        vadd.i16 d6, d1, d2
        vsub.i16 d7, d1, d2
        vsub.i16 d5, d0, d3
        vadd.i16 q0, q2, q3
        vsub.i16 q1, q2, q3
        vshr.s16 q0, q0, #3
        vshr.s16 q1, q1, #3
        mov r3, #32
        vst1.16 {d0[0]}, [r0,:16], r3
        vst1.16 {d1[0]}, [r0,:16], r3
        vst1.16 {d2[0]}, [r0,:16], r3
        vst1.16 {d3[0]}, [r0,:16], r3
        vst1.16 {d0[1]}, [r0,:16], r3
        vst1.16 {d1[1]}, [r0,:16], r3
        vst1.16 {d2[1]}, [r0,:16], r3
        vst1.16 {d3[1]}, [r0,:16], r3
        vst1.16 {d0[2]}, [r0,:16], r3
        vst1.16 {d1[2]}, [r0,:16], r3
        vst1.16 {d2[2]}, [r0,:16], r3
        vst1.16 {d3[2]}, [r0,:16], r3
        vst1.16 {d0[3]}, [r0,:16], r3
        vst1.16 {d1[3]}, [r0,:16], r3
        vst1.16 {d2[3]}, [r0,:16], r3
        vst1.16 {d3[3]}, [r0,:16], r3
        bx lr
endfunc
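
@ Full 4x4 IDCT with add to the destination: r0 = uint8_t *dst,
@ r1 = int16_t block[16] (zeroed on return), r2 = stride (inferred from
@ the code, matching the vp8dsp.h prototype).  20091 and 35468 are the
@ VP8 fixed-point transform constants, sqrt(2)*cos(pi/8)-1 and
@ sqrt(2)*sin(pi/8) in Q16; 35468 is stored halved because vqdmulh
@ doubles the product before taking the high half.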
function ff_vp8_idct_add_neon, export=1
        vld1.16 {q0-q1}, [r1,:128]
        movw r3, #20091
        movt r3, #35468/2
        vdup.32 d4, r3
        vmull.s16 q12, d1, d4[0]
        vmull.s16 q13, d3, d4[0]
        vqdmulh.s16 d20, d1, d4[1]
        vqdmulh.s16 d23, d3, d4[1]
        vshrn.s32 d21, q12, #16
        vshrn.s32 d22, q13, #16
        vadd.s16 d21, d21, d1
        vadd.s16 d22, d22, d3
        vadd.s16 d16, d0, d2
        vsub.s16 d17, d0, d2
        vadd.s16 d18, d21, d23
        vsub.s16 d19, d20, d22
        vadd.s16 q0, q8, q9
        vsub.s16 q1, q8, q9
        vtrn.32 d0, d3
        vtrn.32 d1, d2
        vtrn.16 d0, d1
        vtrn.16 d3, d2
        vmov.i16 q15, #0
        vmull.s16 q12, d1, d4[0]
        vst1.16 {q15}, [r1,:128]!
        vmull.s16 q13, d2, d4[0]
        vst1.16 {q15}, [r1,:128]
        vqdmulh.s16 d21, d1, d4[1]
        vqdmulh.s16 d23, d2, d4[1]
        vshrn.s32 d20, q12, #16
        vshrn.s32 d22, q13, #16
        vadd.i16 d20, d20, d1
        vadd.i16 d22, d22, d2
        vadd.i16 d16, d0, d3
        vsub.i16 d17, d0, d3
        vadd.i16 d18, d20, d23
        vld1.32 {d20[]}, [r0,:32], r2
        vsub.i16 d19, d21, d22
        vld1.32 {d22[]}, [r0,:32], r2
        vadd.s16 q0, q8, q9
        vld1.32 {d23[]}, [r0,:32], r2
        vsub.s16 q1, q8, q9
        vld1.32 {d21[]}, [r0,:32], r2
        vrshr.s16 q0, q0, #3
        vtrn.32 q10, q11
        vrshr.s16 q1, q1, #3
        sub r0, r0, r2, lsl #2
        vtrn.32 d0, d3
        vtrn.32 d1, d2
        vtrn.16 d0, d1
        vtrn.16 d3, d2
        vaddw.u8 q0, q0, d20
        vaddw.u8 q1, q1, d21
        vqmovun.s16 d0, q0
        vqmovun.s16 d1, q1
        vst1.32 {d0[0]}, [r0,:32], r2
        vst1.32 {d0[1]}, [r0,:32], r2
        vst1.32 {d1[1]}, [r0,:32], r2
        vst1.32 {d1[0]}, [r0,:32], r2
        bx lr
endfunc
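
@ DC-only IDCT with add: the single DC coefficient, rounded with >>3,
@ is added to every pixel of a 4x4 block.  r0 = dst, r1 = block (the DC
@ value is cleared on return), r2 = stride.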
function ff_vp8_idct_dc_add_neon, export=1
        mov r3, #0
        ldrsh r12, [r1]
        strh r3, [r1]
        vdup.16 q1, r12
        vrshr.s16 q1, q1, #3
        vld1.32 {d0[]}, [r0,:32], r2
        vld1.32 {d1[]}, [r0,:32], r2
        vld1.32 {d0[1]}, [r0,:32], r2
        vld1.32 {d1[1]}, [r0,:32], r2
        vaddw.u8 q2, q1, d0
        vaddw.u8 q3, q1, d1
        sub r0, r0, r2, lsl #2
        vqmovun.s16 d0, q2
        vqmovun.s16 d1, q3
        vst1.32 {d0[0]}, [r0,:32], r2
        vst1.32 {d1[0]}, [r0,:32], r2
        vst1.32 {d0[1]}, [r0,:32], r2
        vst1.32 {d1[1]}, [r0,:32], r2
        bx lr
endfunc
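
@ DC-only IDCT for four 4x4 chroma blocks arranged 2x2 (an 8x8 pixel
@ area): r0 = dst, r1 = four coefficient blocks 32 bytes apart,
@ r2 = stride.  q8 ends up holding the DC pair for the top two blocks
@ and q9 the pair for the bottom two, so each vaddw below corrects one
@ 8-pixel row with the right per-block DC.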
function ff_vp8_idct_dc_add4uv_neon, export=1
        vmov.i16 d0, #0
        mov r3, #32
        vld1.16 {d16[]}, [r1,:16]
        vst1.16 {d0[0]}, [r1,:16], r3
        vld1.16 {d17[]}, [r1,:16]
        vst1.16 {d0[0]}, [r1,:16], r3
        vld1.16 {d18[]}, [r1,:16]
        vst1.16 {d0[0]}, [r1,:16], r3
        vld1.16 {d19[]}, [r1,:16]
        vst1.16 {d0[0]}, [r1,:16], r3
        mov r3, r0
        vrshr.s16 q8, q8, #3 @ dc >>= 3
        vld1.8 {d0}, [r0,:64], r2
        vrshr.s16 q9, q9, #3
        vld1.8 {d1}, [r0,:64], r2
        vaddw.u8 q10, q8, d0
        vld1.8 {d2}, [r0,:64], r2
        vaddw.u8 q0, q8, d1
        vld1.8 {d3}, [r0,:64], r2
        vaddw.u8 q11, q8, d2
        vld1.8 {d4}, [r0,:64], r2
        vaddw.u8 q1, q8, d3
        vld1.8 {d5}, [r0,:64], r2
        vaddw.u8 q12, q9, d4
        vld1.8 {d6}, [r0,:64], r2
        vaddw.u8 q2, q9, d5
        vld1.8 {d7}, [r0,:64], r2
        vaddw.u8 q13, q9, d6
        vqmovun.s16 d20, q10
        vaddw.u8 q3, q9, d7
        vqmovun.s16 d21, q0
        vqmovun.s16 d22, q11
        vst1.8 {d20}, [r3,:64], r2
        vqmovun.s16 d23, q1
        vst1.8 {d21}, [r3,:64], r2
        vqmovun.s16 d24, q12
        vst1.8 {d22}, [r3,:64], r2
        vqmovun.s16 d25, q2
        vst1.8 {d23}, [r3,:64], r2
        vqmovun.s16 d26, q13
        vst1.8 {d24}, [r3,:64], r2
        vqmovun.s16 d27, q3
        vst1.8 {d25}, [r3,:64], r2
        vst1.8 {d26}, [r3,:64], r2
        vst1.8 {d27}, [r3,:64], r2
        bx lr
endfunc
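
@ Same DC-only operation for four 4x4 luma blocks side by side (a 16x4
@ pixel strip): r0 = dst, r1 = four coefficient blocks 32 bytes apart,
@ r2 = stride.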
function ff_vp8_idct_dc_add4y_neon, export=1
        vmov.i16 d0, #0
        mov r3, #32
        vld1.16 {d16[]}, [r1,:16]
        vst1.16 {d0[0]}, [r1,:16], r3
        vld1.16 {d17[]}, [r1,:16]
        vst1.16 {d0[0]}, [r1,:16], r3
        vld1.16 {d18[]}, [r1,:16]
        vst1.16 {d0[0]}, [r1,:16], r3
        vld1.16 {d19[]}, [r1,:16]
        vst1.16 {d0[0]}, [r1,:16], r3
        vrshr.s16 q8, q8, #3 @ dc >>= 3
        vld1.8 {q0}, [r0,:128], r2
        vrshr.s16 q9, q9, #3
        vld1.8 {q1}, [r0,:128], r2
        vaddw.u8 q10, q8, d0
        vld1.8 {q2}, [r0,:128], r2
        vaddw.u8 q0, q9, d1
        vld1.8 {q3}, [r0,:128], r2
        vaddw.u8 q11, q8, d2
        vaddw.u8 q1, q9, d3
        vaddw.u8 q12, q8, d4
        vaddw.u8 q2, q9, d5
        vaddw.u8 q13, q8, d6
        vaddw.u8 q3, q9, d7
        sub r0, r0, r2, lsl #2
        vqmovun.s16 d20, q10
        vqmovun.s16 d21, q0
        vqmovun.s16 d22, q11
        vqmovun.s16 d23, q1
        vqmovun.s16 d24, q12
        vst1.8 {q10}, [r0,:128], r2
        vqmovun.s16 d25, q2
        vst1.8 {q11}, [r0,:128], r2
        vqmovun.s16 d26, q13
        vst1.8 {q12}, [r0,:128], r2
        vqmovun.s16 d27, q3
        vst1.8 {q13}, [r0,:128], r2
        bx lr
endfunc

@ Register layout:
@   P3..Q3     -> q0..q7
@   flim_E     -> q14
@   flim_I     -> q15
@   hev_thresh -> r12
@
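@ This macro emits the body of all the VP8 loop filters.  With simple=1
@ only the P0/Q0 pair is adjusted, gated by the single edge limit in
@ q14.  Otherwise the full normal-filter mask is built from flim_E,
@ flim_I and hev_thresh; inner=1 selects the inner-edge variant (the
@ 4-tap common filter plus the P1/Q1 adjustment), and the default is
@ the macroblock-edge filter with the extra 27/18/9-weighted taps on
@ P2..Q2.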
.macro vp8_loop_filter, inner=0, simple=0
.if \simple
        vabd.u8 q9, q3, q4 @ abs(P0-Q0)
        vabd.u8 q15, q2, q5 @ abs(P1-Q1)
        vqadd.u8 q9, q9, q9 @ abs(P0-Q0) * 2
        vshr.u8 q10, q15, #1 @ abs(P1-Q1) / 2
        vqadd.u8 q11, q9, q10 @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        vmov.i8 q13, #0x80
        vcle.u8 q8, q11, q14 @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim
.else
        @ calculate hev and normal_limit:
        vabd.u8 q12, q2, q3 @ abs(P1-P0)
        vabd.u8 q13, q5, q4 @ abs(Q1-Q0)
        vabd.u8 q10, q0, q1 @ abs(P3-P2)
        vabd.u8 q11, q1, q2 @ abs(P2-P1)
        vcle.u8 q8, q12, q15 @ abs(P1-P0) <= flim_I
        vcle.u8 q9, q13, q15 @ abs(Q1-Q0) <= flim_I
        vcle.u8 q10, q10, q15 @ abs(P3-P2) <= flim_I
        vcle.u8 q11, q11, q15 @ abs(P2-P1) <= flim_I
        vand q8, q8, q9
        vabd.u8 q9, q7, q6 @ abs(Q3-Q2)
        vand q8, q8, q11
        vabd.u8 q11, q6, q5 @ abs(Q2-Q1)
        vand q8, q8, q10
        vcle.u8 q10, q9, q15 @ abs(Q3-Q2) <= flim_I
        vcle.u8 q11, q11, q15 @ abs(Q2-Q1) <= flim_I
        vabd.u8 q9, q3, q4 @ abs(P0-Q0)
        vabd.u8 q15, q2, q5 @ abs(P1-Q1)
        vand q8, q8, q10
        vqadd.u8 q9, q9, q9 @ abs(P0-Q0) * 2
        vand q8, q8, q11
        vshr.u8 q10, q15, #1 @ abs(P1-Q1) / 2
        vdup.8 q15, r12 @ hev_thresh
        vqadd.u8 q11, q9, q10 @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        vcgt.u8 q12, q12, q15 @ abs(P1-P0) > hev_thresh
        vcle.u8 q11, q11, q14 @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E
        vcgt.u8 q14, q13, q15 @ abs(Q1-Q0) > hev_thresh
        vand q8, q8, q11
        vmov.i8 q13, #0x80
        vorr q9, q12, q14
.endif
        @ at this point:
        @   q8: normal_limit
        @   q9: hev
        @ convert to signed value:
        veor q3, q3, q13 @ PS0 = P0 ^ 0x80
        veor q4, q4, q13 @ QS0 = Q0 ^ 0x80
        vmov.i16 q12, #3
        vsubl.s8 q10, d8, d6 @ QS0 - PS0
        vsubl.s8 q11, d9, d7 @ (widened to 16 bits)
        veor q2, q2, q13 @ PS1 = P1 ^ 0x80
        veor q5, q5, q13 @ QS1 = Q1 ^ 0x80
        vmul.i16 q10, q10, q12 @ w = 3 * (QS0 - PS0)
        vmul.i16 q11, q11, q12
        vqsub.s8 q12, q2, q5 @ clamp(PS1-QS1)
        vmov.i8 q14, #4
        vmov.i8 q15, #3
.if \inner
        vand q12, q12, q9 @ if(hev) w += clamp(PS1-QS1)
.endif
        vaddw.s8 q10, q10, d24 @ w += clamp(PS1-QS1)
        vaddw.s8 q11, q11, d25
        vqmovn.s16 d20, q10 @ narrow result back into q10
        vqmovn.s16 d21, q11
.if !\inner && !\simple
        veor q1, q1, q13 @ PS2 = P2 ^ 0x80
        veor q6, q6, q13 @ QS2 = Q2 ^ 0x80
.endif
        vand q10, q10, q8 @ w &= normal_limit
        @ registers used at this point..
        @   q0      -> P3 (don't corrupt)
        @   q1-q6   -> PS2-QS2
        @   q7      -> Q3 (don't corrupt)
        @   q9      -> hev
        @   q10     -> w
        @   q13     -> #0x80
        @   q14     -> #4
        @   q15     -> #3
        @   q8, q11, q12 -> unused
        @ filter_common: is4tap==1
        @   c1 = clamp(w + 4) >> 3;
        @   c2 = clamp(w + 3) >> 3;
        @   Q0 = s2u(QS0 - c1);
        @   P0 = s2u(PS0 + c2);
.if \simple
        vqadd.s8 q11, q10, q14 @ c1 = clamp((w&hev)+4)
        vqadd.s8 q12, q10, q15 @ c2 = clamp((w&hev)+3)
        vshr.s8 q11, q11, #3 @ c1 >>= 3
        vshr.s8 q12, q12, #3 @ c2 >>= 3
        vqsub.s8 q4, q4, q11 @ QS0 = clamp(QS0-c1)
        vqadd.s8 q3, q3, q12 @ PS0 = clamp(PS0+c2)
        veor q4, q4, q13 @ Q0 = QS0 ^ 0x80
        veor q3, q3, q13 @ P0 = PS0 ^ 0x80
        veor q5, q5, q13 @ Q1 = QS1 ^ 0x80
        veor q2, q2, q13 @ P1 = PS1 ^ 0x80
.elseif \inner
        @ the !is4tap case of filter_common, only used for inner blocks
        @   c3 = ((c1&~hev) + 1) >> 1;
        @   Q1 = s2u(QS1 - c3);
        @   P1 = s2u(PS1 + c3);
        vqadd.s8 q11, q10, q14 @ c1 = clamp((w&hev)+4)
        vqadd.s8 q12, q10, q15 @ c2 = clamp((w&hev)+3)
        vshr.s8 q11, q11, #3 @ c1 >>= 3
        vshr.s8 q12, q12, #3 @ c2 >>= 3
        vqsub.s8 q4, q4, q11 @ QS0 = clamp(QS0-c1)
        vqadd.s8 q3, q3, q12 @ PS0 = clamp(PS0+c2)
        vbic q11, q11, q9 @ c1 & ~hev
        veor q4, q4, q13 @ Q0 = QS0 ^ 0x80
        vrshr.s8 q11, q11, #1 @ c3 >>= 1
        veor q3, q3, q13 @ P0 = PS0 ^ 0x80
        vqsub.s8 q5, q5, q11 @ QS1 = clamp(QS1-c3)
        vqadd.s8 q2, q2, q11 @ PS1 = clamp(PS1+c3)
        veor q5, q5, q13 @ Q1 = QS1 ^ 0x80
        veor q2, q2, q13 @ P1 = PS1 ^ 0x80
.else
        vand q12, q10, q9 @ w & hev
        vqadd.s8 q11, q12, q14 @ c1 = clamp((w&hev)+4)
        vqadd.s8 q12, q12, q15 @ c2 = clamp((w&hev)+3)
        vshr.s8 q11, q11, #3 @ c1 >>= 3
        vshr.s8 q12, q12, #3 @ c2 >>= 3
        vbic q10, q10, q9 @ w &= ~hev
        vqsub.s8 q4, q4, q11 @ QS0 = clamp(QS0-c1)
        vqadd.s8 q3, q3, q12 @ PS0 = clamp(PS0+c2)
        @ filter_mbedge:
        @   a = clamp((27*w + 63) >> 7);
        @   Q0 = s2u(QS0 - a);
        @   P0 = s2u(PS0 + a);
        @   a = clamp((18*w + 63) >> 7);
        @   Q1 = s2u(QS1 - a);
        @   P1 = s2u(PS1 + a);
        @   a = clamp((9*w + 63) >> 7);
        @   Q2 = s2u(QS2 - a);
        @   P2 = s2u(PS2 + a);
        vmov.i16 q9, #63
        vshll.s8 q14, d20, #3
        vshll.s8 q15, d21, #3
        vaddw.s8 q14, q14, d20
        vaddw.s8 q15, q15, d21
        vadd.s16 q8, q9, q14
        vadd.s16 q9, q9, q15 @  9*w + 63
        vadd.s16 q11, q8, q14
        vadd.s16 q12, q9, q15 @ 18*w + 63
        vadd.s16 q14, q11, q14
        vadd.s16 q15, q12, q15 @ 27*w + 63
        vqshrn.s16 d16, q8, #7
        vqshrn.s16 d17, q9, #7 @ clamp(( 9*w + 63)>>7)
        vqshrn.s16 d22, q11, #7
        vqshrn.s16 d23, q12, #7 @ clamp((18*w + 63)>>7)
        vqshrn.s16 d28, q14, #7
        vqshrn.s16 d29, q15, #7 @ clamp((27*w + 63)>>7)
        vqadd.s8 q1, q1, q8 @ PS2 = clamp(PS2+a)
        vqsub.s8 q6, q6, q8 @ QS2 = clamp(QS2-a)
        vqadd.s8 q2, q2, q11 @ PS1 = clamp(PS1+a)
        vqsub.s8 q5, q5, q11 @ QS1 = clamp(QS1-a)
        vqadd.s8 q3, q3, q14 @ PS0 = clamp(PS0+a)
        vqsub.s8 q4, q4, q14 @ QS0 = clamp(QS0-a)
        veor q3, q3, q13 @ P0 = PS0 ^ 0x80
        veor q4, q4, q13 @ Q0 = QS0 ^ 0x80
        veor q2, q2, q13 @ P1 = PS1 ^ 0x80
        veor q5, q5, q13 @ Q1 = QS1 ^ 0x80
        veor q1, q1, q13 @ P2 = PS2 ^ 0x80
        veor q6, q6, q13 @ Q2 = QS2 ^ 0x80
.endif
.endm
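
@ Vertical (horizontal-edge) luma filter on a 16-pixel-wide edge.
@ r0 = dst, r1 = stride, r2 = flim_E, r3 = flim_I (normal variants
@ only); hev_thresh is the first stack argument, found at sp+64 once
@ q4-q7 have been pushed.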
.macro vp8_v_loop_filter16 name, inner=0, simple=0
function ff_vp8_v_loop_filter16\name\()_neon, export=1
        vpush {q4-q7}
        sub r0, r0, r1, lsl #1+!\simple
        @ Load pixels:
.if !\simple
        ldr r12, [sp, #64] @ hev_thresh
        vld1.8 {q0}, [r0,:128], r1 @ P3
        vld1.8 {q1}, [r0,:128], r1 @ P2
.endif
        vld1.8 {q2}, [r0,:128], r1 @ P1
        vld1.8 {q3}, [r0,:128], r1 @ P0
        vld1.8 {q4}, [r0,:128], r1 @ Q0
        vld1.8 {q5}, [r0,:128], r1 @ Q1
.if !\simple
        vld1.8 {q6}, [r0,:128], r1 @ Q2
        vld1.8 {q7}, [r0,:128] @ Q3
        vdup.8 q15, r3 @ flim_I
.endif
        vdup.8 q14, r2 @ flim_E
        vp8_loop_filter inner=\inner, simple=\simple
        @ back up to P2: dst -= stride * 6
        sub r0, r0, r1, lsl #2
.if !\simple
        sub r0, r0, r1, lsl #1
        @ Store pixels:
        vst1.8 {q1}, [r0,:128], r1 @ P2
.endif
        vst1.8 {q2}, [r0,:128], r1 @ P1
        vst1.8 {q3}, [r0,:128], r1 @ P0
        vst1.8 {q4}, [r0,:128], r1 @ Q0
        vst1.8 {q5}, [r0,:128], r1 @ Q1
.if !\simple
        vst1.8 {q6}, [r0,:128] @ Q2
.endif
        vpop {q4-q7}
        bx lr
endfunc
.endm

vp8_v_loop_filter16
vp8_v_loop_filter16 _inner, inner=1
vp8_v_loop_filter16 _simple, simple=1
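
@ Vertical chroma filter: u and v (8 pixels wide each) are filtered
@ together, u rows in the even d registers and v rows in the odd ones.
@ r0 = u, r1 = v, r2 = stride, r3 = flim_E; flim_I and hev_thresh are
@ the stack arguments (sp+64 and sp+68 after the vpush).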
.macro vp8_v_loop_filter8uv name, inner=0
function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
        vpush {q4-q7}
        sub r0, r0, r2, lsl #2
        sub r1, r1, r2, lsl #2
        ldr r12, [sp, #64] @ flim_I
        @ Load pixels:
        vld1.8 {d0}, [r0,:64], r2 @ P3
        vld1.8 {d1}, [r1,:64], r2 @ P3
        vld1.8 {d2}, [r0,:64], r2 @ P2
        vld1.8 {d3}, [r1,:64], r2 @ P2
        vld1.8 {d4}, [r0,:64], r2 @ P1
        vld1.8 {d5}, [r1,:64], r2 @ P1
        vld1.8 {d6}, [r0,:64], r2 @ P0
        vld1.8 {d7}, [r1,:64], r2 @ P0
        vld1.8 {d8}, [r0,:64], r2 @ Q0
        vld1.8 {d9}, [r1,:64], r2 @ Q0
        vld1.8 {d10}, [r0,:64], r2 @ Q1
        vld1.8 {d11}, [r1,:64], r2 @ Q1
        vld1.8 {d12}, [r0,:64], r2 @ Q2
        vld1.8 {d13}, [r1,:64], r2 @ Q2
        vld1.8 {d14}, [r0,:64] @ Q3
        vld1.8 {d15}, [r1,:64] @ Q3
        vdup.8 q14, r3 @ flim_E
        vdup.8 q15, r12 @ flim_I
        ldr r12, [sp, #68] @ hev_thresh
        vp8_loop_filter inner=\inner
        @ back up to P2: u,v -= stride * 6
        sub r0, r0, r2, lsl #2
        sub r1, r1, r2, lsl #2
        sub r0, r0, r2, lsl #1
        sub r1, r1, r2, lsl #1
        @ Store pixels:
        vst1.8 {d2}, [r0,:64], r2 @ P2
        vst1.8 {d3}, [r1,:64], r2 @ P2
        vst1.8 {d4}, [r0,:64], r2 @ P1
        vst1.8 {d5}, [r1,:64], r2 @ P1
        vst1.8 {d6}, [r0,:64], r2 @ P0
        vst1.8 {d7}, [r1,:64], r2 @ P0
        vst1.8 {d8}, [r0,:64], r2 @ Q0
        vst1.8 {d9}, [r1,:64], r2 @ Q0
        vst1.8 {d10}, [r0,:64], r2 @ Q1
        vst1.8 {d11}, [r1,:64], r2 @ Q1
        vst1.8 {d12}, [r0,:64] @ Q2
        vst1.8 {d13}, [r1,:64] @ Q2
        vpop {q4-q7}
        bx lr
endfunc
.endm

vp8_v_loop_filter8uv
vp8_v_loop_filter8uv _inner, inner=1
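
@ Horizontal (vertical-edge) filters: 16 rows of 8 pixels straddling
@ the edge (dst-4 .. dst+3) are loaded and run through transpose_8x8
@ (from neon.S) so the filter macro sees them as the usual P3..Q3
@ registers, then transposed back and stored.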
.macro vp8_h_loop_filter16 name, inner=0, simple=0
function ff_vp8_h_loop_filter16\name\()_neon, export=1
        vpush {q4-q7}
        sub r0, r0, #4
.if !\simple
        ldr r12, [sp, #64] @ hev_thresh
.endif
        @ Load pixels:
        vld1.8 {d0}, [r0], r1 @ load first 8-line src data
        vld1.8 {d2}, [r0], r1
        vld1.8 {d4}, [r0], r1
        vld1.8 {d6}, [r0], r1
        vld1.8 {d8}, [r0], r1
        vld1.8 {d10}, [r0], r1
        vld1.8 {d12}, [r0], r1
        vld1.8 {d14}, [r0], r1
        vld1.8 {d1}, [r0], r1 @ load second 8-line src data
        vld1.8 {d3}, [r0], r1
        vld1.8 {d5}, [r0], r1
        vld1.8 {d7}, [r0], r1
        vld1.8 {d9}, [r0], r1
        vld1.8 {d11}, [r0], r1
        vld1.8 {d13}, [r0], r1
        vld1.8 {d15}, [r0], r1
        transpose_8x8 q0, q1, q2, q3, q4, q5, q6, q7
        vdup.8 q14, r2 @ flim_E
.if !\simple
        vdup.8 q15, r3 @ flim_I
.endif
        vp8_loop_filter inner=\inner, simple=\simple
        sub r0, r0, r1, lsl #4 @ backup 16 rows
        transpose_8x8 q0, q1, q2, q3, q4, q5, q6, q7
        @ Store pixels:
        vst1.8 {d0}, [r0], r1
        vst1.8 {d2}, [r0], r1
        vst1.8 {d4}, [r0], r1
        vst1.8 {d6}, [r0], r1
        vst1.8 {d8}, [r0], r1
        vst1.8 {d10}, [r0], r1
        vst1.8 {d12}, [r0], r1
        vst1.8 {d14}, [r0], r1
        vst1.8 {d1}, [r0], r1
        vst1.8 {d3}, [r0], r1
        vst1.8 {d5}, [r0], r1
        vst1.8 {d7}, [r0], r1
        vst1.8 {d9}, [r0], r1
        vst1.8 {d11}, [r0], r1
        vst1.8 {d13}, [r0], r1
        vst1.8 {d15}, [r0]
        vpop {q4-q7}
        bx lr
endfunc
.endm

vp8_h_loop_filter16
vp8_h_loop_filter16 _inner, inner=1
vp8_h_loop_filter16 _simple, simple=1

.macro vp8_h_loop_filter8uv name, inner=0
function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
        vpush {q4-q7}
        sub r0, r0, #4
        sub r1, r1, #4
        ldr r12, [sp, #64] @ flim_I
        @ Load pixels:
        vld1.8 {d0}, [r0], r2 @ load u
        vld1.8 {d1}, [r1], r2 @ load v
        vld1.8 {d2}, [r0], r2
        vld1.8 {d3}, [r1], r2
        vld1.8 {d4}, [r0], r2
        vld1.8 {d5}, [r1], r2
        vld1.8 {d6}, [r0], r2
        vld1.8 {d7}, [r1], r2
        vld1.8 {d8}, [r0], r2
        vld1.8 {d9}, [r1], r2
        vld1.8 {d10}, [r0], r2
        vld1.8 {d11}, [r1], r2
        vld1.8 {d12}, [r0], r2
        vld1.8 {d13}, [r1], r2
        vld1.8 {d14}, [r0], r2
        vld1.8 {d15}, [r1], r2
        transpose_8x8 q0, q1, q2, q3, q4, q5, q6, q7
        vdup.8 q14, r3 @ flim_E
        vdup.8 q15, r12 @ flim_I
        ldr r12, [sp, #68] @ hev_thresh
        vp8_loop_filter inner=\inner
        sub r0, r0, r2, lsl #3 @ backup u 8 rows
        sub r1, r1, r2, lsl #3 @ backup v 8 rows
        transpose_8x8 q0, q1, q2, q3, q4, q5, q6, q7
        @ Store pixels:
        vst1.8 {d0}, [r0], r2
        vst1.8 {d1}, [r1], r2
        vst1.8 {d2}, [r0], r2
        vst1.8 {d3}, [r1], r2
        vst1.8 {d4}, [r0], r2
        vst1.8 {d5}, [r1], r2
        vst1.8 {d6}, [r0], r2
        vst1.8 {d7}, [r1], r2
        vst1.8 {d8}, [r0], r2
        vst1.8 {d9}, [r1], r2
        vst1.8 {d10}, [r0], r2
        vst1.8 {d11}, [r1], r2
        vst1.8 {d12}, [r0], r2
        vst1.8 {d13}, [r1], r2
        vst1.8 {d14}, [r0]
        vst1.8 {d15}, [r1]
        vpop {q4-q7}
        bx lr
endfunc
.endm

vp8_h_loop_filter8uv
vp8_h_loop_filter8uv _inner, inner=1
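
@ Plain block copy, the mx == my == 0 case of the MC functions:
@ r0 = dst, r1 = dststride, r2 = src, r3 = srcstride, with h as the
@ first stack argument (mx and my, also passed per the common MC
@ prototype, are unused here).  Four rows are copied per iteration,
@ so h is assumed to be a multiple of 4.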
function ff_put_vp8_pixels16_neon, export=1
        ldr r12, [sp, #0] @ h
1:
        subs r12, r12, #4
        vld1.8 {q0}, [r2], r3
        vld1.8 {q1}, [r2], r3
        vld1.8 {q2}, [r2], r3
        vld1.8 {q3}, [r2], r3
        vst1.8 {q0}, [r0,:128], r1
        vst1.8 {q1}, [r0,:128], r1
        vst1.8 {q2}, [r0,:128], r1
        vst1.8 {q3}, [r0,:128], r1
        bgt 1b
        bx lr
endfunc

function ff_put_vp8_pixels8_neon, export=1
        ldr r12, [sp, #0] @ h
1:
        subs r12, r12, #4
        vld1.8 {d0}, [r2], r3
        vld1.8 {d1}, [r2], r3
        vld1.8 {d2}, [r2], r3
        vld1.8 {d3}, [r2], r3
        vst1.8 {d0}, [r0,:64], r1
        vst1.8 {d1}, [r0,:64], r1
        vst1.8 {d2}, [r0,:64], r1
        vst1.8 {d3}, [r0,:64], r1
        bgt 1b
        bx lr
endfunc

/* 4/6-tap 8th-pel MC */
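
@ Each epel macro below widens the source pixels to 16 bits, applies
@ the per-position coefficients from subpel_filters (pre-loaded into q0
@ by the callers), subtracting the two negative taps with vmls, and
@ narrows back with a rounding, saturating right shift by 7 (vqrshrun,
@ i.e. division by 128).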
.macro vp8_epel8_h6 d, a, b
        vext.8 d27, \a, \b, #1
        vmovl.u8 q8, \a
        vext.8 d28, \a, \b, #2
        vmovl.u8 q9, d27
        vext.8 d29, \a, \b, #3
        vmovl.u8 q10, d28
        vext.8 d30, \a, \b, #4
        vmovl.u8 q11, d29
        vext.8 d31, \a, \b, #5
        vmovl.u8 q12, d30
        vmul.u16 q10, q10, d0[2]
        vmovl.u8 q13, d31
        vmul.u16 q11, q11, d0[3]
        vmls.u16 q10, q9, d0[1]
        vmls.u16 q11, q12, d1[0]
        vmla.u16 q10, q8, d0[0]
        vmla.u16 q11, q13, d1[1]
        vqadd.s16 q11, q10, q11
        vqrshrun.s16 \d, q11, #7
.endm

.macro vp8_epel16_h6 d0, d1, s0, s1, s2, q0, q1
        vext.8 q14, \q0, \q1, #3
        vext.8 q15, \q0, \q1, #4
        vmovl.u8 q11, d28
        vmovl.u8 q14, d29
        vext.8 q3, \q0, \q1, #2
        vmovl.u8 q12, d30
        vmovl.u8 q15, d31
        vext.8 q8, \q0, \q1, #1
        vmovl.u8 q10, d6
        vmovl.u8 q3, d7
        vext.8 q2, \q0, \q1, #5
        vmovl.u8 q13, d4
        vmovl.u8 q2, d5
        vmovl.u8 q9, d16
        vmovl.u8 q8, d17
        vmul.u16 q11, q11, d0[3]
        vmul.u16 q10, q10, d0[2]
        vmul.u16 q3, q3, d0[2]
        vmul.u16 q14, q14, d0[3]
        vmls.u16 q11, q12, d1[0]
        vmovl.u8 q12, \s0
        vmovl.u8 q1, \s1
        vmls.u16 q10, q9, d0[1]
        vmls.u16 q3, q8, d0[1]
        vmls.u16 q14, q15, d1[0]
        vmla.u16 q10, q12, d0[0]
        vmla.u16 q11, q13, d1[1]
        vmla.u16 q3, q1, d0[0]
        vmla.u16 q14, q2, d1[1]
        vqadd.s16 q11, q10, q11
        vqadd.s16 q14, q3, q14
        vqrshrun.s16 \d0, q11, #7
        vqrshrun.s16 \d1, q14, #7
.endm

.macro vp8_epel8_v6 d0, s0, s1, s2, s3, s4, s5
        vmovl.u8 q10, \s2
        vmovl.u8 q11, \s3
        vmovl.u8 q9, \s1
        vmovl.u8 q12, \s4
        vmovl.u8 q8, \s0
        vmovl.u8 q13, \s5
        vmul.u16 q10, q10, d0[2]
        vmul.u16 q11, q11, d0[3]
        vmls.u16 q10, q9, d0[1]
        vmls.u16 q11, q12, d1[0]
        vmla.u16 q10, q8, d0[0]
        vmla.u16 q11, q13, d1[1]
        vqadd.s16 q11, q10, q11
        vqrshrun.s16 \d0, q11, #7
.endm

.macro vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6
        vmovl.u8 q10, \s0
        vmovl.u8 q11, \s3
        vmovl.u8 q14, \s6
        vmovl.u8 q9, \s1
        vmovl.u8 q12, \s4
        vmovl.u8 q8, \s2
        vmovl.u8 q13, \s5
        vmul.u16 q10, q10, d0[0]
        vmul.u16 q15, q11, d0[3]
        vmul.u16 q11, q11, d0[2]
        vmul.u16 q14, q14, d1[1]
        vmls.u16 q10, q9, d0[1]
        vmls.u16 q15, q12, d1[0]
        vmls.u16 q11, q8, d0[1]
        vmls.u16 q14, q13, d1[0]
        vmla.u16 q10, q8, d0[2]
        vmla.u16 q15, q13, d1[1]
        vmla.u16 q11, q9, d0[0]
        vmla.u16 q14, q12, d0[3]
        vqadd.s16 q15, q10, q15
        vqadd.s16 q14, q11, q14
        vqrshrun.s16 \d0, q15, #7
        vqrshrun.s16 \d1, q14, #7
.endm

.macro vp8_epel8_h4 d, a, b
        vext.8 d28, \a, \b, #1
        vmovl.u8 q9, \a
        vext.8 d29, \a, \b, #2
        vmovl.u8 q10, d28
        vext.8 d30, \a, \b, #3
        vmovl.u8 q11, d29
        vmovl.u8 q12, d30
        vmul.u16 q10, q10, d0[2]
        vmul.u16 q11, q11, d0[3]
        vmls.u16 q10, q9, d0[1]
        vmls.u16 q11, q12, d1[0]
        vqadd.s16 q11, q10, q11
        vqrshrun.s16 \d, q11, #7
.endm

.macro vp8_epel8_v4_y2 d0, d1, s0, s1, s2, s3, s4
        vmovl.u8 q9, \s0
        vmovl.u8 q10, \s1
        vmovl.u8 q11, \s2
        vmovl.u8 q12, \s3
        vmovl.u8 q13, \s4
        vmul.u16 q8, q10, d0[2]
        vmul.u16 q14, q11, d0[3]
        vmul.u16 q11, q11, d0[2]
        vmul.u16 q15, q12, d0[3]
        vmls.u16 q8, q9, d0[1]
        vmls.u16 q14, q12, d1[0]
        vmls.u16 q11, q10, d0[1]
        vmls.u16 q15, q13, d1[0]
        vqadd.s16 q8, q8, q14
        vqadd.s16 q11, q11, q15
        vqrshrun.s16 \d0, q8, #7
        vqrshrun.s16 \d1, q11, #7
.endm
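
@ r0 = dst, r1 = dststride, r2 = src, r3 = srcstride; h, mx and my are
@ stack arguments.  The combined h*v* functions below run the
@ horizontal filter first into an aligned temporary buffer on the
@ stack, over h+5 rows (h+3 when the vertical filter is 4-tap) so the
@ vertical pass has the context it needs, then filter that buffer
@ vertically into dst.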
function ff_put_vp8_epel16_v6_neon, export=1
        sub r2, r2, r3, lsl #1
        push {r4,lr}
        vpush {d8-d15}
        ldr r4, [sp, #80] @ my
        movrel lr, subpel_filters-16
        ldr r12, [sp, #72] @ h
        add r4, lr, r4, lsl #4
        vld1.16 {q0}, [r4,:128]
1:
        vld1.8 {d2-d3}, [r2], r3
        vld1.8 {d4-d5}, [r2], r3
        vld1.8 {d6-d7}, [r2], r3
        vld1.8 {d8-d9}, [r2], r3
        vld1.8 {d10-d11}, [r2], r3
        vld1.8 {d12-d13}, [r2], r3
        vld1.8 {d14-d15}, [r2]
        sub r2, r2, r3, lsl #2
        vp8_epel8_v6_y2 d2, d4, d2, d4, d6, d8, d10, d12, d14
        vp8_epel8_v6_y2 d3, d5, d3, d5, d7, d9, d11, d13, d15
        vst1.8 {d2-d3}, [r0,:128], r1
        vst1.8 {d4-d5}, [r0,:128], r1
        subs r12, r12, #2
        bne 1b
        vpop {d8-d15}
        pop {r4,pc}
endfunc

function ff_put_vp8_epel16_h6_neon, export=1
        sub r2, r2, #2
        push {r4,lr}
        ldr r4, [sp, #12] @ mx
        movrel lr, subpel_filters-16
        ldr r12, [sp, #8] @ h
        add r4, lr, r4, lsl #4
        vld1.16 {q0}, [r4,:128]
1:
        vld1.8 {d2-d4}, [r2], r3
        vp8_epel16_h6 d2, d3, d2, d3, d4, q1, q2
        vst1.8 {d2-d3}, [r0,:128], r1
        subs r12, r12, #1
        bne 1b
        pop {r4,pc}
endfunc

function ff_put_vp8_epel16_h6v6_neon, export=1
        sub r2, r2, r3, lsl #1
        sub r2, r2, #2
        push {r4,lr}
        vpush {d8-d9}
        @ first pass (horizontal):
        ldr r4, [sp, #28] @ mx
        movrel lr, subpel_filters-16
        ldr r12, [sp, #24] @ h
        add r4, lr, r4, lsl #4
        sub sp, sp, #336+16
        vld1.16 {q0}, [r4,:128]
        add lr, sp, #15
        add r12, r12, #5
        bic lr, lr, #15
1:
        vld1.8 {d2,d3,d4}, [r2], r3
        vp8_epel16_h6 d2, d3, d2, d3, d4, q1, q2
        vst1.8 {d2-d3}, [lr,:128]!
        subs r12, r12, #1
        bne 1b
        @ second pass (vertical):
        ldr r4, [sp, #336+16+32] @ my
        movrel lr, subpel_filters-16
        ldr r12, [sp, #336+16+24] @ h
        add r4, lr, r4, lsl #4
        add lr, sp, #15
        vld1.16 {q0}, [r4,:128]
        bic lr, lr, #15
2:
        vld1.8 {d2-d5}, [lr,:128]!
        vld1.8 {d6-d9}, [lr,:128]!
        vld1.8 {d28-d31}, [lr,:128]
        sub lr, lr, #48
        vp8_epel8_v6 d2, d2, d4, d6, d8, d28, d30
        vp8_epel8_v6 d3, d3, d5, d7, d9, d29, d31
        vst1.8 {d2-d3}, [r0,:128], r1
        subs r12, r12, #1
        bne 2b
        add sp, sp, #336+16
        vpop {d8-d9}
        pop {r4,pc}
endfunc

function ff_put_vp8_epel8_v6_neon, export=1
        sub r2, r2, r3, lsl #1
        push {r4,lr}
        ldr r4, [sp, #16] @ my
        movrel lr, subpel_filters-16
        ldr r12, [sp, #8] @ h
        add r4, lr, r4, lsl #4
        vld1.16 {q0}, [r4,:128]
1:
        vld1.8 {d2}, [r2], r3
        vld1.8 {d3}, [r2], r3
        vld1.8 {d4}, [r2], r3
        vld1.8 {d5}, [r2], r3
        vld1.8 {d6}, [r2], r3
        vld1.8 {d7}, [r2], r3
        vld1.8 {d28}, [r2]
        sub r2, r2, r3, lsl #2
        vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d28
        vst1.8 {d2}, [r0,:64], r1
        vst1.8 {d3}, [r0,:64], r1
        subs r12, r12, #2
        bne 1b
        pop {r4,pc}
endfunc

function ff_put_vp8_epel8_h6_neon, export=1
        sub r2, r2, #2
        push {r4,lr}
        ldr r4, [sp, #12] @ mx
        movrel lr, subpel_filters-16
        ldr r12, [sp, #8] @ h
        add r4, lr, r4, lsl #4
        vld1.16 {q0}, [r4,:128]
1:
        vld1.8 {d2,d3}, [r2], r3
        vp8_epel8_h6 d2, d2, d3
        vst1.8 {d2}, [r0,:64], r1
        subs r12, r12, #1
        bne 1b
        pop {r4,pc}
endfunc

function ff_put_vp8_epel8_h6v6_neon, export=1
        sub r2, r2, r3, lsl #1
        sub r2, r2, #2
        push {r4,lr}
        @ first pass (horizontal):
        ldr r4, [sp, #12] @ mx
        movrel lr, subpel_filters-16
        ldr r12, [sp, #8] @ h
        add r4, lr, r4, lsl #4
        sub sp, sp, #168+16
        vld1.16 {q0}, [r4,:128]
        add lr, sp, #15
        add r12, r12, #5
        bic lr, lr, #15
1:
        vld1.8 {d2,d3}, [r2], r3
        vp8_epel8_h6 d2, d2, d3
        vst1.8 {d2}, [lr,:64]!
        subs r12, r12, #1
        bne 1b
        @ second pass (vertical):
        ldr r4, [sp, #168+16+16] @ my
        movrel lr, subpel_filters-16
        ldr r12, [sp, #168+16+8] @ h
        add r4, lr, r4, lsl #4
        add lr, sp, #15
        vld1.16 {q0}, [r4,:128]
        bic lr, lr, #15
2:
        vld1.8 {d2-d5}, [lr,:128]!
        vld1.8 {d6-d7}, [lr,:128]!
        vld1.8 {d30}, [lr,:64]
        sub lr, lr, #32
        vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d30
        vst1.8 {d2}, [r0,:64], r1
        vst1.8 {d3}, [r0,:64], r1
        subs r12, r12, #2
        bne 2b
        add sp, sp, #168+16
        pop {r4,pc}
endfunc

function ff_put_vp8_epel8_v4_neon, export=1
        sub r2, r2, r3
        push {r4,lr}
        ldr r4, [sp, #16] @ my
        movrel lr, subpel_filters-16
        ldr r12, [sp, #8] @ h
        add r4, lr, r4, lsl #4
        vld1.16 {q0}, [r4,:128]
1:
        vld1.8 {d2}, [r2], r3
        vld1.8 {d3}, [r2], r3
        vld1.8 {d4}, [r2], r3
        vld1.8 {d5}, [r2], r3
        vld1.8 {d6}, [r2]
        sub r2, r2, r3, lsl #1
        vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6
        vst1.8 {d2}, [r0,:64], r1
        vst1.8 {d3}, [r0,:64], r1
        subs r12, r12, #2
        bne 1b
        pop {r4,pc}
endfunc

function ff_put_vp8_epel8_h4_neon, export=1
        sub r2, r2, #1
        push {r4,lr}
        ldr r4, [sp, #12] @ mx
        movrel lr, subpel_filters-16
        ldr r12, [sp, #8] @ h
        add r4, lr, r4, lsl #4
        vld1.16 {q0}, [r4,:128]
1:
        vld1.8 {d2,d3}, [r2], r3
        vp8_epel8_h4 d2, d2, d3
        vst1.8 {d2}, [r0,:64], r1
        subs r12, r12, #1
        bne 1b
        pop {r4,pc}
endfunc

function ff_put_vp8_epel8_h4v4_neon, export=1
        sub r2, r2, r3
        sub r2, r2, #1
        push {r4,lr}
        @ first pass (horizontal):
        ldr r4, [sp, #12] @ mx
        movrel lr, subpel_filters-16
        ldr r12, [sp, #8] @ h
        add r4, lr, r4, lsl #4
        sub sp, sp, #168+16
        vld1.16 {q0}, [r4,:128]
        add lr, sp, #15
        add r12, r12, #3
        bic lr, lr, #15
1:
        vld1.8 {d2,d3}, [r2], r3
        vp8_epel8_h4 d2, d2, d3
        vst1.8 {d2}, [lr,:64]!
        subs r12, r12, #1
        bne 1b
        @ second pass (vertical):
        ldr r4, [sp, #168+16+16] @ my
        movrel lr, subpel_filters-16
        ldr r12, [sp, #168+16+8] @ h
        add r4, lr, r4, lsl #4
        add lr, sp, #15
        vld1.16 {q0}, [r4,:128]
        bic lr, lr, #15
2:
        vld1.8 {d2-d5}, [lr,:128]!
        vld1.8 {d6}, [lr,:64]
        sub lr, lr, #16
        vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6
        vst1.8 {d2}, [r0,:64], r1
        vst1.8 {d3}, [r0,:64], r1
        subs r12, r12, #2
        bne 2b
        add sp, sp, #168+16
        pop {r4,pc}
endfunc

function ff_put_vp8_epel8_h6v4_neon, export=1
        sub r2, r2, r3
        sub r2, r2, #2
        push {r4,lr}
        @ first pass (horizontal):
        ldr r4, [sp, #12] @ mx
        movrel lr, subpel_filters-16
        ldr r12, [sp, #8] @ h
        add r4, lr, r4, lsl #4
        sub sp, sp, #168+16
        vld1.16 {q0}, [r4,:128]
        add lr, sp, #15
        add r12, r12, #3
        bic lr, lr, #15
1:
        vld1.8 {d2,d3}, [r2], r3
        vp8_epel8_h6 d2, d2, d3
        vst1.8 {d2}, [lr,:64]!
        subs r12, r12, #1
        bne 1b
        @ second pass (vertical):
        ldr r4, [sp, #168+16+16] @ my
        movrel lr, subpel_filters-16
        ldr r12, [sp, #168+16+8] @ h
        add r4, lr, r4, lsl #4
        add lr, sp, #15
        vld1.16 {q0}, [r4,:128]
        bic lr, lr, #15
2:
        vld1.8 {d2-d5}, [lr,:128]!
        vld1.8 {d6}, [lr,:64]
        sub lr, lr, #16
        vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6
        vst1.8 {d2}, [r0,:64], r1
        vst1.8 {d3}, [r0,:64], r1
        subs r12, r12, #2
        bne 2b
        add sp, sp, #168+16
        pop {r4,pc}
endfunc

function ff_put_vp8_epel8_h4v6_neon, export=1
        sub r2, r2, r3, lsl #1
        sub r2, r2, #1
        push {r4,lr}
        @ first pass (horizontal):
        ldr r4, [sp, #12] @ mx
        movrel lr, subpel_filters-16
        ldr r12, [sp, #8] @ h
        add r4, lr, r4, lsl #4
        sub sp, sp, #168+16
        vld1.16 {q0}, [r4,:128]
        add lr, sp, #15
        add r12, r12, #5
        bic lr, lr, #15
1:
        vld1.8 {d2,d3}, [r2], r3
        vp8_epel8_h4 d2, d2, d3
        vst1.8 {d2}, [lr,:64]!
        subs r12, r12, #1
        bne 1b
        @ second pass (vertical):
        ldr r4, [sp, #168+16+16] @ my
        movrel lr, subpel_filters-16
        ldr r12, [sp, #168+16+8] @ h
        add r4, lr, r4, lsl #4
        add lr, sp, #15
        vld1.16 {q0}, [r4,:128]
        bic lr, lr, #15
2:
        vld1.8 {d2-d5}, [lr,:128]!
        vld1.8 {d6-d7}, [lr,:128]!
        vld1.8 {d30}, [lr,:64]
        sub lr, lr, #32
        vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d30
        vst1.8 {d2}, [r0,:64], r1
        vst1.8 {d3}, [r0,:64], r1
        subs r12, r12, #2
        bne 2b
        add sp, sp, #168+16
        pop {r4,pc}
endfunc

.ltorg

function ff_put_vp8_epel4_v6_neon, export=1
        sub r2, r2, r3, lsl #1
        push {r4,lr}
        ldr r4, [sp, #16] @ my
        movrel lr, subpel_filters-16
        ldr r12, [sp, #8] @ h
        add r4, lr, r4, lsl #4
        vld1.16 {q0}, [r4,:128]
1:
        vld1.32 {d2[]}, [r2], r3
        vld1.32 {d3[]}, [r2], r3
        vld1.32 {d4[]}, [r2], r3
        vld1.32 {d5[]}, [r2], r3
        vld1.32 {d6[]}, [r2], r3
        vld1.32 {d7[]}, [r2], r3
        vld1.32 {d28[]}, [r2]
        sub r2, r2, r3, lsl #2
        vld1.32 {d2[1]}, [r2], r3
        vld1.32 {d3[1]}, [r2], r3
        vld1.32 {d4[1]}, [r2], r3
        vld1.32 {d5[1]}, [r2], r3
        vld1.32 {d6[1]}, [r2], r3
        vld1.32 {d7[1]}, [r2], r3
        vld1.32 {d28[1]}, [r2]
        sub r2, r2, r3, lsl #2
        vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d28
        vst1.32 {d2[0]}, [r0,:32], r1
        vst1.32 {d3[0]}, [r0,:32], r1
        vst1.32 {d2[1]}, [r0,:32], r1
        vst1.32 {d3[1]}, [r0,:32], r1
        subs r12, r12, #4
        bne 1b
        pop {r4,pc}
endfunc

function ff_put_vp8_epel4_h6_neon, export=1
        sub r2, r2, #2
        push {r4,lr}
        ldr r4, [sp, #12] @ mx
        movrel lr, subpel_filters-16
        ldr r12, [sp, #8] @ h
        add r4, lr, r4, lsl #4
        vld1.16 {q0}, [r4,:128]
1:
        vld1.8 {q1}, [r2], r3
        vp8_epel8_h6 d2, d2, d3
        vst1.32 {d2[0]}, [r0,:32], r1
        subs r12, r12, #1
        bne 1b
        pop {r4,pc}
endfunc

function ff_put_vp8_epel4_h6v6_neon, export=1
        sub r2, r2, r3, lsl #1
        sub r2, r2, #2
        push {r4,lr}
        ldr r4, [sp, #12] @ mx
        movrel lr, subpel_filters-16
        ldr r12, [sp, #8] @ h
        add r4, lr, r4, lsl #4
        sub sp, sp, #52+16
        vld1.16 {q0}, [r4,:128]
        add lr, sp, #15
        add r12, r12, #5
        bic lr, lr, #15
1:
        vld1.8 {q1}, [r2], r3
        vp8_epel8_h6 d2, d2, d3
        vst1.32 {d2[0]}, [lr,:32]!
        subs r12, r12, #1
        bne 1b
        ldr r4, [sp, #52+16+16] @ my
        movrel lr, subpel_filters-16
        ldr r12, [sp, #52+16+8] @ h
        add r4, lr, r4, lsl #4
        add lr, sp, #15
        vld1.16 {q0}, [r4,:128]
        bic lr, lr, #15
2:
        vld1.8 {d2-d3}, [lr,:128]!
        vld1.8 {d6}, [lr,:64]!
        vld1.32 {d28[]}, [lr,:32]
        sub lr, lr, #16
        vld1.8 {d4-d5}, [lr]!
        vld1.8 {d7}, [lr,:64]!
        vld1.32 {d28[1]}, [lr,:32]
        sub lr, lr, #16
        vtrn.32 q1, q2
        vtrn.32 d6, d7
        vp8_epel8_v6_y2 d2, d3, d2, d4, d3, d5, d6, d7, d28
        vst1.32 {d2[0]}, [r0,:32], r1
        vst1.32 {d3[0]}, [r0,:32], r1
        vst1.32 {d2[1]}, [r0,:32], r1
        vst1.32 {d3[1]}, [r0,:32], r1
        subs r12, r12, #4
        bne 2b
        add sp, sp, #52+16
        pop {r4,pc}
endfunc

function ff_put_vp8_epel4_h4v6_neon, export=1
        sub r2, r2, r3, lsl #1
        sub r2, r2, #1
        push {r4,lr}
        ldr r4, [sp, #12] @ mx
        movrel lr, subpel_filters-16
        ldr r12, [sp, #8] @ h
        add r4, lr, r4, lsl #4
        sub sp, sp, #52+16
        vld1.16 {q0}, [r4,:128]
        add lr, sp, #15
        add r12, r12, #5
        bic lr, lr, #15
1:
        vld1.8 {d2}, [r2], r3
        vp8_epel8_h4 d2, d2, d2
        vst1.32 {d2[0]}, [lr,:32]!
        subs r12, r12, #1
        bne 1b
        ldr r4, [sp, #52+16+16] @ my
        movrel lr, subpel_filters-16
        ldr r12, [sp, #52+16+8] @ h
        add r4, lr, r4, lsl #4
        add lr, sp, #15
        vld1.16 {q0}, [r4,:128]
        bic lr, lr, #15
2:
        vld1.8 {d2-d3}, [lr,:128]!
        vld1.8 {d6}, [lr,:64]!
        vld1.32 {d28[]}, [lr,:32]
        sub lr, lr, #16
        vld1.8 {d4-d5}, [lr]!
        vld1.8 {d7}, [lr,:64]!
        vld1.32 {d28[1]}, [lr,:32]
        sub lr, lr, #16
        vtrn.32 q1, q2
        vtrn.32 d6, d7
        vp8_epel8_v6_y2 d2, d3, d2, d4, d3, d5, d6, d7, d28
        vst1.32 {d2[0]}, [r0,:32], r1
        vst1.32 {d3[0]}, [r0,:32], r1
        vst1.32 {d2[1]}, [r0,:32], r1
        vst1.32 {d3[1]}, [r0,:32], r1
        subs r12, r12, #4
        bne 2b
        add sp, sp, #52+16
        pop {r4,pc}
endfunc

function ff_put_vp8_epel4_h6v4_neon, export=1
        sub r2, r2, r3
        sub r2, r2, #2
        push {r4,lr}
        ldr r4, [sp, #12] @ mx
        movrel lr, subpel_filters-16
        ldr r12, [sp, #8] @ h
        add r4, lr, r4, lsl #4
        sub sp, sp, #44+16
        vld1.16 {q0}, [r4,:128]
        add lr, sp, #15
        add r12, r12, #3
        bic lr, lr, #15
1:
        vld1.8 {q1}, [r2], r3
        vp8_epel8_h6 d2, d2, d3
        vst1.32 {d2[0]}, [lr,:32]!
        subs r12, r12, #1
        bne 1b
        ldr r4, [sp, #44+16+16] @ my
        movrel lr, subpel_filters-16
        ldr r12, [sp, #44+16+8] @ h
        add r4, lr, r4, lsl #4
        add lr, sp, #15
        vld1.16 {q0}, [r4,:128]
        bic lr, lr, #15
2:
        vld1.8 {d2-d3}, [lr,:128]!
        vld1.32 {d6[]}, [lr,:32]
        sub lr, lr, #8
        vld1.8 {d4-d5}, [lr]!
        vld1.32 {d6[1]}, [lr,:32]
        sub lr, lr, #8
        vtrn.32 q1, q2
        vp8_epel8_v4_y2 d2, d3, d2, d4, d3, d5, d6
        vst1.32 {d2[0]}, [r0,:32], r1
        vst1.32 {d3[0]}, [r0,:32], r1
        vst1.32 {d2[1]}, [r0,:32], r1
        vst1.32 {d3[1]}, [r0,:32], r1
        subs r12, r12, #4
        bne 2b
        add sp, sp, #44+16
        pop {r4,pc}
endfunc

function ff_put_vp8_epel4_h4_neon, export=1
        sub r2, r2, #1
        push {r4,lr}
        ldr r4, [sp, #12] @ mx
        movrel lr, subpel_filters-16
        ldr r12, [sp, #8] @ h
        add r4, lr, r4, lsl #4
        vld1.16 {q0}, [r4,:128]
1:
        vld1.8 {d2}, [r2], r3
        vp8_epel8_h4 d2, d2, d2
        vst1.32 {d2[0]}, [r0,:32], r1
        subs r12, r12, #1
        bne 1b
        pop {r4,pc}
endfunc

function ff_put_vp8_epel4_v4_neon, export=1
        sub r2, r2, r3
        push {r4,lr}
        ldr r4, [sp, #16] @ my
        movrel lr, subpel_filters-16
        ldr r12, [sp, #8] @ h
        add r4, lr, r4, lsl #4
        vld1.16 {q0}, [r4,:128]
1:
        vld1.32 {d2[]}, [r2], r3
        vld1.32 {d3[]}, [r2], r3
        vld1.32 {d4[]}, [r2], r3
        vld1.32 {d5[]}, [r2], r3
        vld1.32 {d6[]}, [r2]
        sub r2, r2, r3, lsl #1
        vld1.32 {d2[1]}, [r2], r3
        vld1.32 {d3[1]}, [r2], r3
        vld1.32 {d4[1]}, [r2], r3
        vld1.32 {d5[1]}, [r2], r3
        vld1.32 {d6[1]}, [r2]
        sub r2, r2, r3, lsl #1
        vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6
        vst1.32 {d2[0]}, [r0,:32], r1
        vst1.32 {d3[0]}, [r0,:32], r1
        vst1.32 {d2[1]}, [r0,:32], r1
        vst1.32 {d3[1]}, [r0,:32], r1
        subs r12, r12, #4
        bne 1b
        pop {r4,pc}
endfunc

function ff_put_vp8_epel4_h4v4_neon, export=1
        sub r2, r2, r3
        sub r2, r2, #1
        push {r4,lr}
        ldr r4, [sp, #12] @ mx
        movrel lr, subpel_filters-16
        ldr r12, [sp, #8] @ h
        add r4, lr, r4, lsl #4
        sub sp, sp, #44+16
        vld1.16 {q0}, [r4,:128]
        add lr, sp, #15
        add r12, r12, #3
        bic lr, lr, #15
1:
        vld1.8 {d2}, [r2], r3
        vp8_epel8_h4 d2, d2, d3
        vst1.32 {d2[0]}, [lr,:32]!
        subs r12, r12, #1
        bne 1b
        ldr r4, [sp, #44+16+16] @ my
        movrel lr, subpel_filters-16
        ldr r12, [sp, #44+16+8] @ h
        add r4, lr, r4, lsl #4
        add lr, sp, #15
        vld1.16 {q0}, [r4,:128]
        bic lr, lr, #15
2:
        vld1.8 {d2-d3}, [lr,:128]!
        vld1.32 {d6[]}, [lr,:32]
        sub lr, lr, #8
        vld1.8 {d4-d5}, [lr]!
        vld1.32 {d6[1]}, [lr,:32]
        sub lr, lr, #8
        vtrn.32 q1, q2
        vp8_epel8_v4_y2 d2, d3, d2, d4, d3, d5, d6
        vst1.32 {d2[0]}, [r0,:32], r1
        vst1.32 {d3[0]}, [r0,:32], r1
        vst1.32 {d2[1]}, [r0,:32], r1
        vst1.32 {d3[1]}, [r0,:32], r1
        subs r12, r12, #4
        bne 2b
        add sp, sp, #44+16
        pop {r4,pc}
endfunc
@ note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit
@ arithmetic can be used to apply filters
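@ The rows are the seven eighth-pel positions mx/my = 1..7; position 0
@ (no filtering) never reaches these functions, so a row is addressed
@ as subpel_filters - 16 + mx*16.  Taps 1 and 4 hold the magnitudes of
@ the negative coefficients, which the macros above subtract with vmls.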
const subpel_filters, align=4
        .short 0, 6, 123, 12, 1, 0, 0, 0
        .short 2, 11, 108, 36, 8, 1, 0, 0
        .short 0, 9, 93, 50, 6, 0, 0, 0
        .short 3, 16, 77, 77, 16, 3, 0, 0
        .short 0, 6, 50, 93, 9, 0, 0, 0
        .short 1, 8, 36, 108, 11, 2, 0, 0
        .short 0, 1, 12, 123, 6, 0, 0, 0
endconst

/* Bilinear MC */
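@ dst = (a*src[x] + b*src[x+1] + 4) >> 3, with b = mx (or my) and
@ a = 8 - b; the _v variants blend two rows the same way, and _hv
@ applies the horizontal blend and then the vertical one.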
function ff_put_vp8_bilin16_h_neon, export=1
        ldr r12, [sp, #4] @ mx
        vdup.8 d0, r12
        rsb r12, r12, #8
        vdup.8 d1, r12
        ldr r12, [sp] @ h
1:
        subs r12, r12, #2
        vld1.8 {d2-d4}, [r2], r3
        vext.8 q2, q1, q2, #1
        vmull.u8 q8, d2, d1
        vmlal.u8 q8, d4, d0
        vld1.8 {d18-d20}, [r2], r3
        vmull.u8 q3, d3, d1
        vmlal.u8 q3, d5, d0
        vext.8 q10, q9, q10, #1
        vmull.u8 q11, d18, d1
        vmlal.u8 q11, d20, d0
        vmull.u8 q12, d19, d1
        vmlal.u8 q12, d21, d0
        vrshrn.u16 d4, q8, #3
        vrshrn.u16 d5, q3, #3
        vrshrn.u16 d6, q11, #3
        vrshrn.u16 d7, q12, #3
        vst1.8 {q2}, [r0,:128], r1
        vst1.8 {q3}, [r0,:128], r1
        bgt 1b
        bx lr
endfunc

function ff_put_vp8_bilin16_v_neon, export=1
        ldr r12, [sp, #8] @ my
        vdup.8 d0, r12
        rsb r12, r12, #8
        vdup.8 d1, r12
        ldr r12, [sp] @ h
        vld1.8 {q1}, [r2], r3
1:
        subs r12, r12, #2
        vld1.8 {q2}, [r2], r3
        vmull.u8 q3, d2, d1
        vmlal.u8 q3, d4, d0
        vmull.u8 q8, d3, d1
        vmlal.u8 q8, d5, d0
        vld1.8 {q1}, [r2], r3
        vmull.u8 q9, d4, d1
        vmlal.u8 q9, d2, d0
        vmull.u8 q10, d5, d1
        vmlal.u8 q10, d3, d0
        vrshrn.u16 d4, q3, #3
        vrshrn.u16 d5, q8, #3
        vrshrn.u16 d6, q9, #3
        vrshrn.u16 d7, q10, #3
        vst1.8 {q2}, [r0,:128], r1
        vst1.8 {q3}, [r0,:128], r1
        bgt 1b
        bx lr
endfunc

function ff_put_vp8_bilin16_hv_neon, export=1
        ldr r12, [sp, #4] @ mx
        vdup.8 d0, r12
        rsb r12, r12, #8
        vdup.8 d1, r12
        ldr r12, [sp, #8] @ my
        vdup.8 d2, r12
        rsb r12, r12, #8
        vdup.8 d3, r12
        ldr r12, [sp] @ h
        vld1.8 {d4-d6}, [r2], r3
        vext.8 q3, q2, q3, #1
        vmull.u8 q8, d4, d1
        vmlal.u8 q8, d6, d0
        vmull.u8 q9, d5, d1
        vmlal.u8 q9, d7, d0
        vrshrn.u16 d4, q8, #3
        vrshrn.u16 d5, q9, #3
1:
        subs r12, r12, #2
        vld1.8 {d18-d20}, [r2], r3
        vext.8 q10, q9, q10, #1
        vmull.u8 q11, d18, d1
        vmlal.u8 q11, d20, d0
        vld1.8 {d26-d28}, [r2], r3
        vmull.u8 q12, d19, d1
        vmlal.u8 q12, d21, d0
        vext.8 q14, q13, q14, #1
        vmull.u8 q8, d26, d1
        vmlal.u8 q8, d28, d0
        vmull.u8 q9, d27, d1
        vmlal.u8 q9, d29, d0
        vrshrn.u16 d6, q11, #3
        vrshrn.u16 d7, q12, #3
        vmull.u8 q12, d4, d3
        vmlal.u8 q12, d6, d2
        vmull.u8 q15, d5, d3
        vmlal.u8 q15, d7, d2
        vrshrn.u16 d4, q8, #3
        vrshrn.u16 d5, q9, #3
        vmull.u8 q10, d6, d3
        vmlal.u8 q10, d4, d2
        vmull.u8 q11, d7, d3
        vmlal.u8 q11, d5, d2
        vrshrn.u16 d24, q12, #3
        vrshrn.u16 d25, q15, #3
        vst1.8 {q12}, [r0,:128], r1
        vrshrn.u16 d20, q10, #3
        vrshrn.u16 d21, q11, #3
        vst1.8 {q10}, [r0,:128], r1
        bgt 1b
        bx lr
endfunc

function ff_put_vp8_bilin8_h_neon, export=1
        ldr r12, [sp, #4] @ mx
        vdup.8 d0, r12
        rsb r12, r12, #8
        vdup.8 d1, r12
        ldr r12, [sp] @ h
1:
        subs r12, r12, #2
        vld1.8 {q1}, [r2], r3
        vext.8 d3, d2, d3, #1
        vmull.u8 q2, d2, d1
        vmlal.u8 q2, d3, d0
        vld1.8 {q3}, [r2], r3
        vext.8 d7, d6, d7, #1
        vmull.u8 q8, d6, d1
        vmlal.u8 q8, d7, d0
        vrshrn.u16 d4, q2, #3
        vrshrn.u16 d16, q8, #3
        vst1.8 {d4}, [r0,:64], r1
        vst1.8 {d16}, [r0,:64], r1
        bgt 1b
        bx lr
endfunc

function ff_put_vp8_bilin8_v_neon, export=1
        ldr r12, [sp, #8] @ my
        vdup.8 d0, r12
        rsb r12, r12, #8
        vdup.8 d1, r12
        ldr r12, [sp] @ h
        vld1.8 {d2}, [r2], r3
1:
        subs r12, r12, #2
        vld1.8 {d3}, [r2], r3
        vmull.u8 q2, d2, d1
        vmlal.u8 q2, d3, d0
        vld1.8 {d2}, [r2], r3
        vmull.u8 q3, d3, d1
        vmlal.u8 q3, d2, d0
        vrshrn.u16 d4, q2, #3
        vrshrn.u16 d6, q3, #3
        vst1.8 {d4}, [r0,:64], r1
        vst1.8 {d6}, [r0,:64], r1
        bgt 1b
        bx lr
endfunc

function ff_put_vp8_bilin8_hv_neon, export=1
        ldr r12, [sp, #4] @ mx
        vdup.8 d0, r12
        rsb r12, r12, #8
        vdup.8 d1, r12
        ldr r12, [sp, #8] @ my
        vdup.8 d2, r12
        rsb r12, r12, #8
        vdup.8 d3, r12
        ldr r12, [sp] @ h
        vld1.8 {q2}, [r2], r3
        vext.8 d5, d4, d5, #1
        vmull.u8 q9, d4, d1
        vmlal.u8 q9, d5, d0
        vrshrn.u16 d22, q9, #3
1:
        subs r12, r12, #2
        vld1.8 {q3}, [r2], r3
        vext.8 d7, d6, d7, #1
        vmull.u8 q8, d6, d1
        vmlal.u8 q8, d7, d0
        vld1.8 {q2}, [r2], r3
        vext.8 d5, d4, d5, #1
        vmull.u8 q9, d4, d1
        vmlal.u8 q9, d5, d0
        vrshrn.u16 d16, q8, #3
        vmull.u8 q10, d22, d3
        vmlal.u8 q10, d16, d2
        vrshrn.u16 d22, q9, #3
        vmull.u8 q12, d16, d3
        vmlal.u8 q12, d22, d2
        vrshrn.u16 d20, q10, #3
        vst1.8 {d20}, [r0,:64], r1
        vrshrn.u16 d23, q12, #3
        vst1.8 {d23}, [r0,:64], r1
        bgt 1b
        bx lr
endfunc

function ff_put_vp8_bilin4_h_neon, export=1
        ldr r12, [sp, #4] @ mx
        vdup.8 d0, r12
        rsb r12, r12, #8
        vdup.8 d1, r12
        ldr r12, [sp] @ h
1:
        subs r12, r12, #2
        vld1.8 {d2}, [r2], r3
        vext.8 d3, d2, d3, #1
        vld1.8 {d6}, [r2], r3
        vext.8 d7, d6, d7, #1
        vtrn.32 q1, q3
        vmull.u8 q2, d2, d1
        vmlal.u8 q2, d3, d0
        vrshrn.u16 d4, q2, #3
        vst1.32 {d4[0]}, [r0,:32], r1
        vst1.32 {d4[1]}, [r0,:32], r1
        bgt 1b
        bx lr
endfunc

function ff_put_vp8_bilin4_v_neon, export=1
        ldr r12, [sp, #8] @ my
        vdup.8 d0, r12
        rsb r12, r12, #8
        vdup.8 d1, r12
        ldr r12, [sp] @ h
        vld1.32 {d2[]}, [r2], r3
1:
        vld1.32 {d3[]}, [r2]
        vld1.32 {d2[1]}, [r2], r3
        vld1.32 {d3[1]}, [r2], r3
        vmull.u8 q2, d2, d1
        vmlal.u8 q2, d3, d0
        vtrn.32 d3, d2
        vrshrn.u16 d4, q2, #3
        vst1.32 {d4[0]}, [r0,:32], r1
        vst1.32 {d4[1]}, [r0,:32], r1
        subs r12, r12, #2
        bgt 1b
        bx lr
endfunc

function ff_put_vp8_bilin4_hv_neon, export=1
        ldr r12, [sp, #4] @ mx
        vdup.8 d0, r12
        rsb r12, r12, #8
        vdup.8 d1, r12
        ldr r12, [sp, #8] @ my
        vdup.8 d2, r12
        rsb r12, r12, #8
        vdup.8 d3, r12
        ldr r12, [sp] @ h
        vld1.8 {d4}, [r2], r3
        vext.8 d5, d4, d4, #1
        vmull.u8 q9, d4, d1
        vmlal.u8 q9, d5, d0
        vrshrn.u16 d22, q9, #3
1:
        subs r12, r12, #2
        vld1.8 {d6}, [r2], r3
        vext.8 d7, d6, d6, #1
        vld1.8 {d4}, [r2], r3
        vext.8 d5, d4, d4, #1
        vtrn.32 q3, q2
        vmull.u8 q8, d6, d1
        vmlal.u8 q8, d7, d0
        vrshrn.u16 d16, q8, #3
        vmull.u8 q10, d16, d2
        vtrn.32 d22, d16
        vmlal.u8 q10, d22, d3
        vrev64.32 d22, d16
        vrshrn.u16 d20, q10, #3
        vst1.32 {d20[0]}, [r0,:32], r1
        vst1.32 {d20[1]}, [r0,:32], r1
        bgt 1b
        bx lr
endfunc