You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

956 lines
32KB

/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
  20. #include "libavutil/arm/asm.S"
  21. #include "neon.S"
  22. /* H.264 qpel MC */
  /*
   * lowpass_const r
   *
   * Load the two filter multipliers into d6 through the scratch core
   * register \r.  \r becomes (20 << 16) | 5, so after the 32-bit move the
   * 16-bit lanes read d6[0] = 5 and d6[1] = 20, which is how the lowpass
   * macros below index them.
   *
   * Clobbers: \r, low 32 bits of d6.
   */
.macro  lowpass_const   r
        movw            \r,  #5
        movt            \r,  #20
        vmov.32         d6[0], \r
.endm
  /*
   * lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
   *
   * Apply the 6-tap filter (1, -5, 20, 20, -5, 1) to two independent rows
   * of unsigned bytes, 8 results per row.  \r0:\r1 are the d registers
   * holding the first row (bytes 0-12 are read), \r2:\r3 the second row.
   * The two rows are interleaved instruction by instruction.
   *
   * narrow=1: sums are accumulated in q0 / q8, then rounded, shifted right
   *           by 5 and saturated to u8 into the d registers \d0 / \d1.
   * narrow=0: \d0 / \d1 name q registers that receive the raw 16-bit sums;
   *           no shift or saturation is applied.
   *
   * Requires d6[0] = 5 and d6[1] = 20 (see lowpass_const).
   * Clobbers: q1, q2, q9, q10, d30, d31, plus q0 and q8 when narrow=1.
   */
.macro  lowpass_8       r0,  r1,  r2,  r3,  d0,  d1,  narrow=1
  .if \narrow
        t0 .req q0
        t1 .req q8
  .else
        t0 .req \d0
        t1 .req \d1
  .endif
        vext.8          d2,  \r0, \r1, #2
        vext.8          d3,  \r0, \r1, #3
        vaddl.u8        q1,  d2,  d3            /* row 0: x[2] + x[3] */
        vext.8          d4,  \r0, \r1, #1
        vext.8          d5,  \r0, \r1, #4
        vaddl.u8        q2,  d4,  d5            /* row 0: x[1] + x[4] */
        vext.8          d30, \r0, \r1, #5
        vaddl.u8        t0,  \r0, d30           /* row 0: x[0] + x[5] */
        vext.8          d18, \r2, \r3, #2
        vmla.i16        t0,  q1,  d6[1]         /* += 20 * (x[2] + x[3]) */
        vext.8          d19, \r2, \r3, #3
        vaddl.u8        q9,  d18, d19           /* row 1: x[2] + x[3] */
        vext.8          d20, \r2, \r3, #1
        vmls.i16        t0,  q2,  d6[0]         /* -= 5 * (x[1] + x[4]) */
        vext.8          d21, \r2, \r3, #4
        vaddl.u8        q10, d20, d21           /* row 1: x[1] + x[4] */
        vext.8          d31, \r2, \r3, #5
        vaddl.u8        t1,  \r2, d31           /* row 1: x[0] + x[5] */
        vmla.i16        t1,  q9,  d6[1]
        vmls.i16        t1,  q10, d6[0]
  .if \narrow
        vqrshrun.s16    \d0, t0,  #5
        vqrshrun.s16    \d1, t1,  #5
  .endif
        .unreq  t0
        .unreq  t1
.endm
  /*
   * lowpass_8_1 r0, r1, d0, narrow=1
   *
   * Single-row form of lowpass_8: apply the 6-tap filter
   * (1, -5, 20, 20, -5, 1) to the bytes in \r0:\r1 (bytes 0-12 are read)
   * and produce 8 results.
   *
   * narrow=1: accumulate in q0, then round, shift right by 5 and saturate
   *           to u8 into the d register \d0.
   * narrow=0: \d0 names a q register that receives the raw 16-bit sums.
   *
   * Requires d6[0] = 5 and d6[1] = 20 (see lowpass_const).
   * Clobbers: q1, q2, d30, plus q0 when narrow=1.
   */
.macro  lowpass_8_1     r0,  r1,  d0,  narrow=1
  .if \narrow
        t0 .req q0
  .else
        t0 .req \d0
  .endif
        vext.8          d2,  \r0, \r1, #2
        vext.8          d3,  \r0, \r1, #3
        vaddl.u8        q1,  d2,  d3            /* x[2] + x[3] */
        vext.8          d4,  \r0, \r1, #1
        vext.8          d5,  \r0, \r1, #4
        vaddl.u8        q2,  d4,  d5            /* x[1] + x[4] */
        vext.8          d30, \r0, \r1, #5
        vaddl.u8        t0,  \r0, d30           /* x[0] + x[5] */
        vmla.i16        t0,  q1,  d6[1]         /* += 20 * (x[2] + x[3]) */
        vmls.i16        t0,  q2,  d6[0]         /* -= 5 * (x[1] + x[4]) */
  .if \narrow
        vqrshrun.s16    \d0, t0,  #5
  .endif
        .unreq  t0
.endm
  /*
   * lowpass_8.16 r0, r1, l0, h0, l1, h1, d
   *
   * Second filter pass over signed 16-bit intermediates.  \r0:\r1 are q
   * registers holding consecutive 16-bit samples (samples 0-12 are read).
   * Callers in this file pass the low/high d halves of \r0 as \l0/\h0 and
   * those of \r1 as \l1/\h1.
   *
   * The taps are evaluated in 32-bit precision using shifts instead of
   * multiplies: 20*a = (a << 4) + (a << 2) and 5*b = b + (b << 2).  The sum
   * is rounded and shifted right by 10, then saturated to u8 and written
   * to the d register \d (8 results).
   *
   * \r1 is overwritten: it is reused to hold the #5 extract.
   * Clobbers: q0-q3, q8-q10, q15.  q3 contains d6, so the constants set up
   * by lowpass_const do not survive this macro.
   */
.macro  lowpass_8.16    r0,  r1,  l0,  h0,  l1,  h1,  d
        vext.16         q1,  \r0, \r1, #2
        vext.16         q0,  \r0, \r1, #3
        vaddl.s16       q9,  d2,  d0            /* lo: x[2] + x[3] */
        vext.16         q2,  \r0, \r1, #1
        vaddl.s16       q1,  d3,  d1            /* hi: x[2] + x[3] */
        vext.16         q3,  \r0, \r1, #4
        vaddl.s16       q10, d4,  d6            /* lo: x[1] + x[4] */
        vext.16         \r1, \r0, \r1, #5
        vaddl.s16       q2,  d5,  d7            /* hi: x[1] + x[4] */
        vaddl.s16       q0,  \h0, \h1           /* hi: x[0] + x[5] */
        vaddl.s16       q8,  \l0, \l1           /* lo: x[0] + x[5] */

        /* low half: q9 = 20 * (x[2] + x[3]), q10 = 5 * (x[1] + x[4]) */
        vshl.i32        q3,  q9,  #4
        vshl.i32        q9,  q9,  #2
        vshl.i32        q15, q10, #2
        vadd.i32        q9,  q9,  q3
        vadd.i32        q10, q10, q15

        /* high half: q1 = 20 * (x[2] + x[3]), q2 = 5 * (x[1] + x[4]) */
        vshl.i32        q3,  q1,  #4
        vshl.i32        q1,  q1,  #2
        vshl.i32        q15, q2,  #2
        vadd.i32        q1,  q1,  q3
        vadd.i32        q2,  q2,  q15

        vadd.i32        q9,  q9,  q8
        vsub.i32        q9,  q9,  q10

        vadd.i32        q1,  q1,  q0
        vsub.i32        q1,  q1,  q2

        vrshrn.s32      d18, q9,  #10
        vrshrn.s32      d19, q1,  #10

        vqmovun.s16     \d,  q9
.endm
  /*
   * put_h264_qpel16_h_lowpass_neon_packed
   *
   * Horizontal filter of a 16-column, 16-row area into a packed buffer
   * with an 8-byte row pitch, done as two passes of the 8-wide routine.
   * r0 is not rewound between the passes, so the right half is written
   * directly after the 16 rows of the left half.
   *
   * In:  r0 = destination buffer, r1 = source, r2 = source stride.
   *      d6 must hold the filter constants (see lowpass_const).
   * Clobbers: r3 (set to the 8-byte pitch), r4 (keeps the return address
   *      across the first call), r12 (row counter), plus whatever
   *      put_h264_qpel8_h_lowpass_neon clobbers.
   */
function put_h264_qpel16_h_lowpass_neon_packed
        mov             r4,  lr
        mov             r12, #16
        mov             r3,  #8
        bl              put_h264_qpel8_h_lowpass_neon
        sub             r1,  r1,  r2, lsl #4    /* back up 16 source rows */
        add             r1,  r1,  #8            /* move to the right half */
        mov             r12, #16
        mov             lr,  r4
        b               put_h264_qpel8_h_lowpass_neon   /* tail call */
endfunc
  /*
   * h264_qpel_h_lowpass type
   *
   * Emits <type>_h264_qpel16_h_lowpass_neon and
   * <type>_h264_qpel8_h_lowpass_neon for type = put or avg.
   *
   * Register use of the 8-wide routine:
   *   r0  = destination (accessed with :64 alignment), advanced by r3 per row
   *   r1  = source, advanced by r2 per row
   *   r12 = number of rows to process; two rows are handled per iteration
   *   d6  = filter constants (see lowpass_const)
   * For type = avg the filtered rows are combined with the bytes already
   * at the destination using a rounding halving add before being stored.
   *
   * The 16-wide routine runs the 8-wide one on the left half (16 rows),
   * rewinds both pointers, steps 8 bytes to the right and then falls
   * through into the 8-wide routine, which returns to the original caller.
   */
.macro  h264_qpel_h_lowpass type
function \type\()_h264_qpel16_h_lowpass_neon
        push            {lr}
        mov             r12, #16
        bl              \type\()_h264_qpel8_h_lowpass_neon
        sub             r0,  r0,  r3, lsl #4
        sub             r1,  r1,  r2, lsl #4
        add             r0,  r0,  #8
        add             r1,  r1,  #8
        mov             r12, #16
        pop             {lr}
        /* no branch here: execution continues in the 8-wide routine */
endfunc

function \type\()_h264_qpel8_h_lowpass_neon
1:      vld1.8          {d0, d1},  [r1], r2
        vld1.8          {d16,d17}, [r1], r2
        subs            r12, r12, #2
        lowpass_8       d0,  d1,  d16, d17, d0,  d16
  .ifc \type,avg
        vld1.8          {d2},     [r0,:64], r3
        vrhadd.u8       d0,  d0,  d2
        vld1.8          {d3},     [r0,:64]
        vrhadd.u8       d16, d16, d3
        sub             r0,  r0,  r3            /* undo the read advance */
  .endif
        vst1.8          {d0},     [r0,:64], r3
        vst1.8          {d16},    [r0,:64], r3
        bne             1b
        bx              lr
endfunc
.endm

        h264_qpel_h_lowpass put
        h264_qpel_h_lowpass avg
  /*
   * h264_qpel_h_lowpass_l2 type
   *
   * Emits <type>_h264_qpel16_h_lowpass_l2_neon and
   * <type>_h264_qpel8_h_lowpass_l2_neon for type = put or avg.
   *
   * Same horizontal filter as h264_qpel_h_lowpass, but each filtered row
   * is then combined (rounding halving add) with a row read from a second
   * source before being stored.
   *
   * Register use of the 8-wide routine:
   *   r0  = destination (accessed with :64 alignment)
   *   r1  = filter source
   *   r3  = second source that is averaged in
   *   r2  = row stride shared by r0, r1 and r3
   *   r12 = number of rows to process; two rows are handled per iteration
   *   d6  = filter constants (see lowpass_const)
   * For type = avg the result is additionally combined with the bytes
   * already at the destination.
   *
   * The 16-wide routine processes the left half, rewinds all three
   * pointers by 16 rows, steps 8 bytes right and falls through into the
   * 8-wide routine, which returns to the original caller.
   */
.macro  h264_qpel_h_lowpass_l2 type
function \type\()_h264_qpel16_h_lowpass_l2_neon
        push            {lr}
        mov             r12, #16
        bl              \type\()_h264_qpel8_h_lowpass_l2_neon
        sub             r0,  r0,  r2, lsl #4
        sub             r1,  r1,  r2, lsl #4
        sub             r3,  r3,  r2, lsl #4
        add             r0,  r0,  #8
        add             r1,  r1,  #8
        add             r3,  r3,  #8
        mov             r12, #16
        pop             {lr}
        /* no branch here: execution continues in the 8-wide routine */
endfunc

function \type\()_h264_qpel8_h_lowpass_l2_neon
1:      vld1.8          {d0, d1},  [r1], r2
        vld1.8          {d16,d17}, [r1], r2
        vld1.8          {d28},     [r3], r2
        vld1.8          {d29},     [r3], r2
        subs            r12, r12, #2
        lowpass_8       d0,  d1,  d16, d17, d0,  d1
        vrhadd.u8       q0,  q0,  q14           /* average with 2nd source */
  .ifc \type,avg
        vld1.8          {d2},      [r0,:64], r2
        vrhadd.u8       d0,  d0,  d2
        vld1.8          {d3},      [r0,:64]
        vrhadd.u8       d1,  d1,  d3
        sub             r0,  r0,  r2            /* undo the read advance */
  .endif
        vst1.8          {d0},      [r0,:64], r2
        vst1.8          {d1},      [r0,:64], r2
        bne             1b
        bx              lr
endfunc
.endm

        h264_qpel_h_lowpass_l2 put
        h264_qpel_h_lowpass_l2 avg
  /*
   * put_h264_qpel16_v_lowpass_neon_packed
   *
   * Vertical filter of a 16x16 area into a packed buffer with an 8-byte
   * row pitch, done as four 8x8 tiles in the order top-left, bottom-left,
   * top-right, bottom-right.  r0 is never rewound, so the tiles are laid
   * out back to back in the destination.
   *
   * Each 8x8 call leaves r1 twelve rows below where it started; stepping
   * back four rows puts it at the start of the tile below.
   *
   * In:  r0 = destination buffer, r1 = source, r3 = source stride.
   *      d6 must hold the filter constants (see lowpass_const).
   * Clobbers: r2 (set to the 8-byte pitch), r4 (keeps the return
   *      address), plus whatever put_h264_qpel8_v_lowpass_neon clobbers.
   */
function put_h264_qpel16_v_lowpass_neon_packed
        mov             r4,  lr
        mov             r2,  #8
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #4    /* back to the first row: */
        sub             r1,  r1,  r3, lsl #2    /*   16 + 4 = 20 rows up  */
        add             r1,  r1,  #8            /* move to the right half */
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4
        b               put_h264_qpel8_v_lowpass_neon   /* tail call */
endfunc
  /*
   * h264_qpel_v_lowpass type
   *
   * Emits <type>_h264_qpel16_v_lowpass_neon and
   * <type>_h264_qpel8_v_lowpass_neon for type = put or avg.
   *
   * Register use of the 8x8 routine:
   *   r0 = destination (accessed with :64 alignment), advanced by r2 per row
   *   r1 = source, advanced by r3 per row; 13 rows of 8 bytes are read and
   *        r1 is left 12 rows below its starting value
   *   d6 = filter constants (see lowpass_const)
   * It uses d8-d15 without saving them; callers in this file wrap it in
   * vpush/vpop {d8-d15}.
   *
   * The loaded rows are passed through transpose_8x8 (defined outside this
   * view, presumably an 8x8 byte transpose) so that the row-oriented
   * lowpass_8 macro can be applied, then transposed back before storing.
   * For type = avg the result is combined with the bytes already at the
   * destination using a rounding halving add.
   *
   * The 16x16 routine handles four 8x8 tiles (top-left, bottom-left,
   * top-right, bottom-right), keeps the return address in r4 and falls
   * through into the 8x8 routine for the last tile.
   */
.macro  h264_qpel_v_lowpass type
function \type\()_h264_qpel16_v_lowpass_neon
        mov             r4,  lr
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             r0,  r0,  r2, lsl #4
        add             r0,  r0,  #8
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4
        /* no branch here: execution continues in the 8x8 routine */
endfunc

function \type\()_h264_qpel8_v_lowpass_neon
        vld1.8          {d8},  [r1], r3
        vld1.8          {d10}, [r1], r3
        vld1.8          {d12}, [r1], r3
        vld1.8          {d14}, [r1], r3
        vld1.8          {d22}, [r1], r3
        vld1.8          {d24}, [r1], r3
        vld1.8          {d26}, [r1], r3
        vld1.8          {d28}, [r1], r3
        vld1.8          {d9},  [r1], r3
        vld1.8          {d11}, [r1], r3
        vld1.8          {d13}, [r1], r3
        vld1.8          {d15}, [r1], r3
        vld1.8          {d23}, [r1]

        transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
        lowpass_8       d8,  d9,  d10, d11, d8,  d10
        lowpass_8       d12, d13, d14, d15, d12, d14
        lowpass_8       d22, d23, d24, d25, d22, d24
        lowpass_8       d26, d27, d28, d29, d26, d28
        transpose_8x8   d8,  d10, d12, d14, d22, d24, d26, d28

  .ifc \type,avg
        vld1.8          {d9},  [r0,:64], r2
        vrhadd.u8       d8,  d8,  d9
        vld1.8          {d11}, [r0,:64], r2
        vrhadd.u8       d10, d10, d11
        vld1.8          {d13}, [r0,:64], r2
        vrhadd.u8       d12, d12, d13
        vld1.8          {d15}, [r0,:64], r2
        vrhadd.u8       d14, d14, d15
        vld1.8          {d23}, [r0,:64], r2
        vrhadd.u8       d22, d22, d23
        vld1.8          {d25}, [r0,:64], r2
        vrhadd.u8       d24, d24, d25
        vld1.8          {d27}, [r0,:64], r2
        vrhadd.u8       d26, d26, d27
        vld1.8          {d29}, [r0,:64], r2
        vrhadd.u8       d28, d28, d29
        sub             r0,  r0,  r2, lsl #3    /* back to the first row */
  .endif

        vst1.8          {d8},  [r0,:64], r2
        vst1.8          {d10}, [r0,:64], r2
        vst1.8          {d12}, [r0,:64], r2
        vst1.8          {d14}, [r0,:64], r2
        vst1.8          {d22}, [r0,:64], r2
        vst1.8          {d24}, [r0,:64], r2
        vst1.8          {d26}, [r0,:64], r2
        vst1.8          {d28}, [r0,:64], r2

        bx              lr
endfunc
.endm

        h264_qpel_v_lowpass put
        h264_qpel_v_lowpass avg
  /*
   * h264_qpel_v_lowpass_l2 type
   *
   * Emits <type>_h264_qpel16_v_lowpass_l2_neon and
   * <type>_h264_qpel8_v_lowpass_l2_neon for type = put or avg.
   *
   * Same vertical filter as h264_qpel_v_lowpass, but each filtered row is
   * then combined (rounding halving add) with a row read from a second
   * source before being stored.
   *
   * Register use of the 8x8 routine:
   *   r0  = destination (accessed with :64 alignment), advanced by r3 per row
   *   r1  = filter source, advanced by r3 per row; 13 rows are read and r1
   *         is left 12 rows below its starting value
   *   r12 = second source that is averaged in, advanced by r2 per row
   *   d6  = filter constants (see lowpass_const)
   * It uses d8-d15 without saving them; callers in this file wrap it in
   * vpush/vpop {d8-d15}.
   * For type = avg the result is additionally combined with the bytes
   * already at the destination.
   *
   * The 16x16 routine handles four 8x8 tiles (top-left, bottom-left,
   * top-right, bottom-right), keeps the return address in r4 and falls
   * through into the 8x8 routine for the last tile.
   */
.macro  h264_qpel_v_lowpass_l2 type
function \type\()_h264_qpel16_v_lowpass_l2_neon
        mov             r4,  lr
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             r0,  r0,  r3, lsl #4
        sub             r12, r12, r2, lsl #4
        add             r0,  r0,  #8
        add             r12, r12, #8
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4
        /* no branch here: execution continues in the 8x8 routine */
endfunc

function \type\()_h264_qpel8_v_lowpass_l2_neon
        vld1.8          {d8},  [r1], r3
        vld1.8          {d10}, [r1], r3
        vld1.8          {d12}, [r1], r3
        vld1.8          {d14}, [r1], r3
        vld1.8          {d22}, [r1], r3
        vld1.8          {d24}, [r1], r3
        vld1.8          {d26}, [r1], r3
        vld1.8          {d28}, [r1], r3
        vld1.8          {d9},  [r1], r3
        vld1.8          {d11}, [r1], r3
        vld1.8          {d13}, [r1], r3
        vld1.8          {d15}, [r1], r3
        vld1.8          {d23}, [r1]

        transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
        lowpass_8       d8,  d9,  d10, d11, d8,  d9
        lowpass_8       d12, d13, d14, d15, d12, d13
        lowpass_8       d22, d23, d24, d25, d22, d23
        lowpass_8       d26, d27, d28, d29, d26, d27
        transpose_8x8   d8,  d9,  d12, d13, d22, d23, d26, d27

        /* second-source loads are interleaved with the averaging */
        vld1.8          {d0},  [r12], r2
        vld1.8          {d1},  [r12], r2
        vld1.8          {d2},  [r12], r2
        vld1.8          {d3},  [r12], r2
        vld1.8          {d4},  [r12], r2
        vrhadd.u8       q0,  q0,  q4
        vld1.8          {d5},  [r12], r2
        vrhadd.u8       q1,  q1,  q6
        vld1.8          {d10}, [r12], r2
        vrhadd.u8       q2,  q2,  q11
        vld1.8          {d11}, [r12], r2
        vrhadd.u8       q5,  q5,  q13

  .ifc \type,avg
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d0,  d0,  d16
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d1,  d1,  d17
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d2,  d2,  d16
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d3,  d3,  d17
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d4,  d4,  d16
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d5,  d5,  d17
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d10, d10, d16
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d11, d11, d17
        sub             r0,  r0,  r3, lsl #3    /* back to the first row */
  .endif

        vst1.8          {d0},  [r0,:64], r3
        vst1.8          {d1},  [r0,:64], r3
        vst1.8          {d2},  [r0,:64], r3
        vst1.8          {d3},  [r0,:64], r3
        vst1.8          {d4},  [r0,:64], r3
        vst1.8          {d5},  [r0,:64], r3
        vst1.8          {d10}, [r0,:64], r3
        vst1.8          {d11}, [r0,:64], r3

        bx              lr
endfunc
.endm

        h264_qpel_v_lowpass_l2 put
        h264_qpel_v_lowpass_l2 avg
  /*
   * put_h264_qpel8_hv_lowpass_neon_top
   *
   * Shared core of the 8x8 two-dimensional filter.  It leaves the eight
   * output rows in registers instead of storing them; the callers in this
   * file store them in the order d12, d13, d14, d15, d8, d9, d10, d11.
   *
   * In:  r1 = source, advanced by r3 per row; 13 rows are read and r1 is
   *           left 12 rows below its starting value
   *      r3 = source stride
   *      r4 = scratch buffer, accessed with :128 alignment; 16*12 bytes
   *           are written.  r4 is back at its entry value on return.
   * Clobbers: r12, q0-q15 (d8-d15 are not saved here; callers in this
   *      file wrap the call chain in vpush/vpop {d8-d15}).
   *
   * Steps:
   *   1. Horizontal filter on 13 rows, kept as un-narrowed 16-bit sums.
   *      The first 12 rows go to the scratch buffer, the 13th stays in q12.
   *   2. Reload the 12 rows back to front and rearrange them with swap4 and
   *      transpose16_4x4 (both defined outside this view; presumably a
   *      transpose of the 16-bit data, done in 4x4 pieces) so that the
   *      row-oriented lowpass_8.16 can filter along the other axis.  Part
   *      of the rearranged data is spilled back to the scratch buffer
   *      because it does not all fit in registers during the second pass.
   *   3. Run lowpass_8.16 eight times, then transpose_8x8 the byte result.
   *
   * lowpass_const is invoked here, so callers need not set up d6.
   */
function put_h264_qpel8_hv_lowpass_neon_top
        lowpass_const   r12
        mov             r12, #12
1:      vld1.8          {d0, d1},  [r1], r3
        vld1.8          {d16,d17}, [r1], r3
        subs            r12, r12, #2
        lowpass_8       d0,  d1,  d16, d17, q11, q12, narrow=0
        vst1.8          {d22-d25}, [r4,:128]!
        bne             1b

        vld1.8          {d0, d1},  [r1]
        lowpass_8_1     d0,  d1,  q12, narrow=0

        /* walk the scratch buffer backwards, 16 bytes at a time */
        mov             r12, #-16
        add             r4,  r4,  r12
        vld1.8          {d30,d31}, [r4,:128], r12
        vld1.8          {d20,d21}, [r4,:128], r12
        vld1.8          {d18,d19}, [r4,:128], r12
        vld1.8          {d16,d17}, [r4,:128], r12
        vld1.8          {d14,d15}, [r4,:128], r12
        vld1.8          {d12,d13}, [r4,:128], r12
        vld1.8          {d10,d11}, [r4,:128], r12
        vld1.8          {d8, d9},  [r4,:128], r12
        vld1.8          {d6, d7},  [r4,:128], r12
        vld1.8          {d4, d5},  [r4,:128], r12
        vld1.8          {d2, d3},  [r4,:128], r12
        vld1.8          {d0, d1},  [r4,:128]

        swap4           d1,  d3,  d5,  d7,  d8,  d10, d12, d14
        transpose16_4x4 q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7

        swap4           d17, d19, d21, d31, d24, d26, d28, d22
        transpose16_4x4 q8,  q9,  q10, q15, q12, q13, q14, q11

        /* spill the half that the second pass consumes last */
        vst1.8          {d30,d31}, [r4,:128]!
        vst1.8          {d6, d7},  [r4,:128]!
        vst1.8          {d20,d21}, [r4,:128]!
        vst1.8          {d4, d5},  [r4,:128]!
        vst1.8          {d18,d19}, [r4,:128]!
        vst1.8          {d2, d3},  [r4,:128]!
        vst1.8          {d16,d17}, [r4,:128]!
        vst1.8          {d0, d1},  [r4,:128]

        lowpass_8.16    q4,  q12, d8,  d9,  d24, d25, d8
        lowpass_8.16    q5,  q13, d10, d11, d26, d27, d9
        lowpass_8.16    q6,  q14, d12, d13, d28, d29, d10
        lowpass_8.16    q7,  q11, d14, d15, d22, d23, d11

        /* r12 is still -16: read the spilled pairs back in reverse */
        vld1.8          {d16,d17}, [r4,:128], r12
        vld1.8          {d30,d31}, [r4,:128], r12
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d12
        vld1.8          {d16,d17}, [r4,:128], r12
        vld1.8          {d30,d31}, [r4,:128], r12
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d13
        vld1.8          {d16,d17}, [r4,:128], r12
        vld1.8          {d30,d31}, [r4,:128], r12
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d14
        vld1.8          {d16,d17}, [r4,:128], r12
        vld1.8          {d30,d31}, [r4,:128]
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d15

        transpose_8x8   d12, d13, d14, d15, d8,  d9,  d10, d11

        bx              lr
endfunc
  /*
   * h264_qpel8_hv_lowpass type
   *
   * Emits <type>_h264_qpel8_hv_lowpass_neon for type = put or avg: runs
   * put_h264_qpel8_hv_lowpass_neon_top and stores its eight result rows.
   *
   * In:  r0 = destination (accessed with :64 alignment), advanced by r2
   *           per row
   *      r1, r3, r4 = source, source stride and scratch buffer, as
   *           required by put_h264_qpel8_hv_lowpass_neon_top
   * Clobbers: r10 (keeps the return address across the call), plus
   *      everything the _top routine clobbers.
   * For type = avg the result is combined with the bytes already at the
   * destination using a rounding halving add.
   */
.macro  h264_qpel8_hv_lowpass type
function \type\()_h264_qpel8_hv_lowpass_neon
        mov             r10, lr
        bl              put_h264_qpel8_hv_lowpass_neon_top
  .ifc \type,avg
        vld1.8          {d0},      [r0,:64], r2
        vrhadd.u8       d12, d12, d0
        vld1.8          {d1},      [r0,:64], r2
        vrhadd.u8       d13, d13, d1
        vld1.8          {d2},      [r0,:64], r2
        vrhadd.u8       d14, d14, d2
        vld1.8          {d3},      [r0,:64], r2
        vrhadd.u8       d15, d15, d3
        vld1.8          {d4},      [r0,:64], r2
        vrhadd.u8       d8,  d8,  d4
        vld1.8          {d5},      [r0,:64], r2
        vrhadd.u8       d9,  d9,  d5
        vld1.8          {d6},      [r0,:64], r2
        vrhadd.u8       d10, d10, d6
        vld1.8          {d7},      [r0,:64], r2
        vrhadd.u8       d11, d11, d7
        sub             r0,  r0,  r2, lsl #3    /* back to the first row */
  .endif

        vst1.8          {d12},     [r0,:64], r2
        vst1.8          {d13},     [r0,:64], r2
        vst1.8          {d14},     [r0,:64], r2
        vst1.8          {d15},     [r0,:64], r2
        vst1.8          {d8},      [r0,:64], r2
        vst1.8          {d9},      [r0,:64], r2
        vst1.8          {d10},     [r0,:64], r2
        vst1.8          {d11},     [r0,:64], r2

        mov             lr,  r10
        bx              lr
endfunc
.endm

        h264_qpel8_hv_lowpass put
        h264_qpel8_hv_lowpass avg
  /*
   * h264_qpel8_hv_lowpass_l2 type
   *
   * Emits <type>_h264_qpel8_hv_lowpass_l2_neon for type = put or avg: runs
   * put_h264_qpel8_hv_lowpass_neon_top, combines its eight result rows
   * (rounding halving add) with 64 bytes read from a packed second source
   * and stores them.
   *
   * In:  r0 = destination (accessed with :64 alignment), advanced by r3
   *           per row
   *      r2 = packed second source, 8 bytes per row, read with :128
   *           alignment; left 64 bytes past its entry value
   *      r1, r3, r4 = source, stride and scratch buffer, as required by
   *           put_h264_qpel8_hv_lowpass_neon_top (r3 is also the
   *           destination stride here)
   * Clobbers: r10 (keeps the return address across the call), plus
   *      everything the _top routine clobbers.
   * For type = avg the result is additionally combined with the bytes
   * already at the destination.
   */
.macro  h264_qpel8_hv_lowpass_l2 type
function \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             r10, lr
        bl              put_h264_qpel8_hv_lowpass_neon_top

        /* _top leaves rows 0-3 in q6, q7 and rows 4-7 in q4, q5 */
        vld1.8          {d0, d1},  [r2,:128]!
        vld1.8          {d2, d3},  [r2,:128]!
        vrhadd.u8       q0,  q0,  q6
        vld1.8          {d4, d5},  [r2,:128]!
        vrhadd.u8       q1,  q1,  q7
        vld1.8          {d6, d7},  [r2,:128]!
        vrhadd.u8       q2,  q2,  q4
        vrhadd.u8       q3,  q3,  q5
  .ifc \type,avg
        vld1.8          {d16},     [r0,:64], r3
        vrhadd.u8       d0,  d0,  d16
        vld1.8          {d17},     [r0,:64], r3
        vrhadd.u8       d1,  d1,  d17
        vld1.8          {d18},     [r0,:64], r3
        vrhadd.u8       d2,  d2,  d18
        vld1.8          {d19},     [r0,:64], r3
        vrhadd.u8       d3,  d3,  d19
        vld1.8          {d20},     [r0,:64], r3
        vrhadd.u8       d4,  d4,  d20
        vld1.8          {d21},     [r0,:64], r3
        vrhadd.u8       d5,  d5,  d21
        vld1.8          {d22},     [r0,:64], r3
        vrhadd.u8       d6,  d6,  d22
        vld1.8          {d23},     [r0,:64], r3
        vrhadd.u8       d7,  d7,  d23
        sub             r0,  r0,  r3, lsl #3    /* back to the first row */
  .endif
        vst1.8          {d0},      [r0,:64], r3
        vst1.8          {d1},      [r0,:64], r3
        vst1.8          {d2},      [r0,:64], r3
        vst1.8          {d3},      [r0,:64], r3
        vst1.8          {d4},      [r0,:64], r3
        vst1.8          {d5},      [r0,:64], r3
        vst1.8          {d6},      [r0,:64], r3
        vst1.8          {d7},      [r0,:64], r3

        mov             lr,  r10
        bx              lr
endfunc
.endm

        h264_qpel8_hv_lowpass_l2 put
        h264_qpel8_hv_lowpass_l2 avg
  /*
   * h264_qpel16_hv type
   *
   * Emits <type>_h264_qpel16_hv_lowpass_neon and
   * <type>_h264_qpel16_hv_lowpass_l2_neon for type = put or avg.
   *
   * Both build a 16x16 result from four 8x8 calls in the order top-left,
   * bottom-left, top-right, bottom-right.  Each 8x8 call leaves r1 twelve
   * rows below where it started, so stepping back four rows reaches the
   * tile below; the destination pointer simply keeps advancing within a
   * column of tiles.  The return address is parked in r9 because the 8x8
   * routines use r10 for their own; the last tile is a tail call.
   *
   * _hv_lowpass:    r0 = destination with stride r2, r1 = source with
   *                 stride r3, r4 = scratch buffer for the 8x8 routine.
   * _hv_lowpass_l2: r0 = destination, r1 = source, both with stride r3;
   *                 r4 = scratch buffer.  The packed second source is
   *                 taken to start 256 bytes below r4 and is consumed
   *                 sequentially, 64 bytes per tile, via r2.
   */
.macro  h264_qpel16_hv  type
function \type\()_h264_qpel16_hv_lowpass_neon
        mov             r9,  lr
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3, lsl #4    /* back to the first row: */
        sub             r1,  r1,  r3, lsl #2    /*   16 + 4 = 20 rows up  */
        add             r1,  r1,  #8
        sub             r0,  r0,  r2, lsl #4
        add             r0,  r0,  #8
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r9
        b               \type\()_h264_qpel8_hv_lowpass_neon
endfunc

function \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov             r9,  lr
        sub             r2,  r4,  #256
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #4    /* back to the first row: */
        sub             r1,  r1,  r3, lsl #2    /*   16 + 4 = 20 rows up  */
        add             r1,  r1,  #8
        sub             r0,  r0,  r3, lsl #4
        add             r0,  r0,  #8
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r9
        b               \type\()_h264_qpel8_hv_lowpass_l2_neon
endfunc
.endm

        h264_qpel16_hv  put
        h264_qpel16_hv  avg
  /*
   * h264_qpel8 type
   *
   * Emits the exported 8x8 entry points ff_<type>_h264_qpel8_mcXY_neon for
   * type = put or avg.
   *
   * On entry every function expects r0 = destination, r1 = source and
   * r2 = stride; this follows from how the registers are handed to the
   * helpers.  The C prototype is not visible in this file.
   *
   * What each entry point computes, as read from the code below:
   *   mc20  horizontal filter              mc02  vertical filter
   *   mc10  avg(horizontal, src)           mc01  avg(vertical, src)
   *   mc30  avg(horizontal, src + 1)       mc03  avg(vertical, src + stride)
   *   mc22  two-dimensional (hv) filter
   *   mc11 / mc31 / mc13 / mc33
   *         avg(horizontal, vertical); the variants differ in whether the
   *         horizontal pass starts one row down and whether the vertical
   *         pass starts one column to the right
   *   mc21 / mc23  avg(hv, horizontal), horizontal one row down for mc23
   *   mc12 / mc32  avg(hv, vertical), vertical one column right for mc32
   *
   * Several functions only adjust pointers and then branch to a local
   * label inside another one (mc01, mc11, mc21, mc12).  Such a branch is
   * only valid if the register list pushed before it matches the list
   * pushed by the function owning the label, since that function does the
   * final pop.
   *
   * Functions needing an intermediate buffer keep the entry sp in r11,
   * align sp down to 16 bytes and reserve the buffer below it.  The A/T
   * prefixes select the ARM or Thumb form of the alignment sequence.
   * d8-d15 are saved with vpush/vpop around any helper that uses them.
   */
.macro  h264_qpel8      type
function ff_\type\()_h264_qpel8_mc10_neon, export=1
        lowpass_const   r3
        mov             r3,  r1                 /* second source = src */
        sub             r1,  r1,  #2            /* filter reads from x - 2 */
        mov             r12, #8
        b               \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

function ff_\type\()_h264_qpel8_mc20_neon, export=1
        lowpass_const   r3
        sub             r1,  r1,  #2
        mov             r3,  r2                 /* dst stride = src stride */
        mov             r12, #8
        b               \type\()_h264_qpel8_h_lowpass_neon
endfunc

function ff_\type\()_h264_qpel8_mc30_neon, export=1
        lowpass_const   r3
        add             r3,  r1,  #1            /* second source = src + 1 */
        sub             r1,  r1,  #2
        mov             r12, #8
        b               \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

function ff_\type\()_h264_qpel8_mc01_neon, export=1
        push            {lr}
        mov             r12, r1                 /* second source = src */
\type\()_h264_qpel8_mc01:
        lowpass_const   r3
        mov             r3,  r2
        sub             r1,  r1,  r2, lsl #1    /* filter reads from y - 2 */
        vpush           {d8-d15}
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        vpop            {d8-d15}
        pop             {pc}
endfunc

function ff_\type\()_h264_qpel8_mc11_neon, export=1
        push            {r0, r1, r11, lr}
\type\()_h264_qpel8_mc11:
        lowpass_const   r3
        mov             r11, sp                 /* r11 -> saved r0, r1 */
A       bic             sp,  sp,  #15
T       bic             r0,  r11, #15
T       mov             sp,  r0
        sub             sp,  sp,  #64           /* 8x8 temporary */
        mov             r0,  sp
        sub             r1,  r1,  #2
        mov             r3,  #8                 /* temporary row pitch */
        mov             r12, #8
        vpush           {d8-d15}
        bl              put_h264_qpel8_h_lowpass_neon
        ldrd            r0,  r1,  [r11], #8     /* reload saved dst, src */
        mov             r3,  r2
        add             r12, sp,  #64           /* temporary, above vpush */
        sub             r1,  r1,  r2, lsl #1
        mov             r2,  #8
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r11, pc}
endfunc

function ff_\type\()_h264_qpel8_mc21_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
\type\()_h264_qpel8_mc21:
        lowpass_const   r3
        mov             r11, sp                 /* r11 -> saved r0, r1 */
A       bic             sp,  sp,  #15
T       bic             r0,  r11, #15
T       mov             sp,  r0
        sub             sp,  sp,  #(8*8+16*12)  /* 8x8 temp + hv scratch */
        sub             r1,  r1,  #2
        mov             r3,  #8
        mov             r0,  sp
        mov             r12, #8
        vpush           {d8-d15}
        bl              put_h264_qpel8_h_lowpass_neon
        mov             r4,  r0                 /* hv scratch follows temp */
        ldrd            r0,  r1,  [r11], #8     /* reload saved dst, src */
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        sub             r2,  r4,  #64           /* start of the 8x8 temp */
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4, r10, r11, pc}
endfunc

function ff_\type\()_h264_qpel8_mc31_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r11, lr}       /* saved src is src + 1 */
        sub             r1,  r1,  #1
        b               \type\()_h264_qpel8_mc11
endfunc

function ff_\type\()_h264_qpel8_mc02_neon, export=1
        push            {lr}
        lowpass_const   r3
        sub             r1,  r1,  r2, lsl #1
        mov             r3,  r2
        vpush           {d8-d15}
        bl              \type\()_h264_qpel8_v_lowpass_neon
        vpop            {d8-d15}
        pop             {pc}
endfunc

function ff_\type\()_h264_qpel8_mc12_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
\type\()_h264_qpel8_mc12:
        lowpass_const   r3
        mov             r11, sp                 /* r11 -> saved r0, r1 */
A       bic             sp,  sp,  #15
T       bic             r0,  r11, #15
T       mov             sp,  r0
        sub             sp,  sp,  #(8*8+16*12)  /* 8x8 temp + hv scratch */
        sub             r1,  r1,  r2, lsl #1
        mov             r3,  r2
        mov             r2,  #8                 /* temporary row pitch */
        mov             r0,  sp
        vpush           {d8-d15}
        bl              put_h264_qpel8_v_lowpass_neon
        mov             r4,  r0                 /* hv scratch follows temp */
        ldrd            r0,  r1,  [r11], #8     /* reload saved dst, src */
        sub             r1,  r1,  r3, lsl #1
        sub             r1,  r1,  #2
        sub             r2,  r4,  #64           /* start of the 8x8 temp */
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4, r10, r11, pc}
endfunc

function ff_\type\()_h264_qpel8_mc22_neon, export=1
        push            {r4, r10, r11, lr}
        mov             r11, sp
A       bic             sp,  sp,  #15
T       bic             r4,  r11, #15
T       mov             sp,  r4
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        sub             sp,  sp,  #(16*12)      /* hv scratch */
        mov             r4,  sp
        vpush           {d8-d15}
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4, r10, r11, pc}
endfunc

function ff_\type\()_h264_qpel8_mc32_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
        add             r1,  r1,  #1            /* vertical pass on x + 1 */
        b               \type\()_h264_qpel8_mc12
endfunc

function ff_\type\()_h264_qpel8_mc03_neon, export=1
        push            {lr}
        add             r12, r1,  r2            /* second source = next row */
        b               \type\()_h264_qpel8_mc01
endfunc

function ff_\type\()_h264_qpel8_mc13_neon, export=1
        push            {r0, r1, r11, lr}
        add             r1,  r1,  r2            /* horizontal pass on y + 1 */
        b               \type\()_h264_qpel8_mc11
endfunc

function ff_\type\()_h264_qpel8_mc23_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
        add             r1,  r1,  r2            /* horizontal pass on y + 1 */
        b               \type\()_h264_qpel8_mc21
endfunc

function ff_\type\()_h264_qpel8_mc33_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r11, lr}       /* saved src is src + 1 */
        add             r1,  r1,  r2
        sub             r1,  r1,  #1            /* horizontal: src + stride */
        b               \type\()_h264_qpel8_mc11
endfunc
.endm

        h264_qpel8      put
        h264_qpel8      avg
  /*
   * h264_qpel16 type
   *
   * Emits the exported 16x16 entry points ff_<type>_h264_qpel16_mcXY_neon
   * for type = put or avg.  They mirror the 8x8 set emitted by h264_qpel8,
   * with these differences visible in the code:
   *   - the 16-wide helpers set their own row counter, so r12 is not
   *     loaded with a row count here;
   *   - r4 is part of every saved register list, because the 16-wide
   *     vertical and packed helpers keep their return address in r4;
   *   - functions that reach the hv helpers also save r9 and r10, which
   *     those helpers use for return addresses.  r5 is in the saved list
   *     too but is not used here; presumably it pads the push to an even
   *     register count.
   *   - intermediate buffers are 256 bytes (16x16) and, for mc21/mc12/
   *     mc32/mc23, are written in the packed 8-byte-pitch layout that
   *     <type>_h264_qpel16_hv_lowpass_l2_neon reads back from r4 - 256.
   *
   * On entry every function expects r0 = destination, r1 = source and
   * r2 = stride; this follows from how the registers are handed to the
   * helpers.  The C prototype is not visible in this file.
   *
   * A branch to a local label inside another function (mc01, mc11, mc21,
   * mc12) is only valid if the register list pushed before it matches the
   * list pushed by the function owning the label.
   */
.macro  h264_qpel16     type
function ff_\type\()_h264_qpel16_mc10_neon, export=1
        lowpass_const   r3
        mov             r3,  r1                 /* second source = src */
        sub             r1,  r1,  #2            /* filter reads from x - 2 */
        b               \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc

function ff_\type\()_h264_qpel16_mc20_neon, export=1
        lowpass_const   r3
        sub             r1,  r1,  #2
        mov             r3,  r2                 /* dst stride = src stride */
        b               \type\()_h264_qpel16_h_lowpass_neon
endfunc

function ff_\type\()_h264_qpel16_mc30_neon, export=1
        lowpass_const   r3
        add             r3,  r1,  #1            /* second source = src + 1 */
        sub             r1,  r1,  #2
        b               \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc

function ff_\type\()_h264_qpel16_mc01_neon, export=1
        push            {r4, lr}
        mov             r12, r1                 /* second source = src */
\type\()_h264_qpel16_mc01:
        lowpass_const   r3
        mov             r3,  r2
        sub             r1,  r1,  r2, lsl #1    /* filter reads from y - 2 */
        vpush           {d8-d15}
        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

function ff_\type\()_h264_qpel16_mc11_neon, export=1
        push            {r0, r1, r4, r11, lr}
\type\()_h264_qpel16_mc11:
        lowpass_const   r3
        mov             r11, sp                 /* r11 -> saved r0, r1 */
A       bic             sp,  sp,  #15
T       bic             r0,  r11, #15
T       mov             sp,  r0
        sub             sp,  sp,  #256          /* 16x16 temporary */
        mov             r0,  sp
        sub             r1,  r1,  #2
        mov             r3,  #16                /* temporary row pitch */
        vpush           {d8-d15}
        bl              put_h264_qpel16_h_lowpass_neon
        ldrd            r0,  r1,  [r11], #8     /* reload saved dst, src */
        mov             r3,  r2
        add             r12, sp,  #64           /* temporary, above vpush */
        sub             r1,  r1,  r2, lsl #1
        mov             r2,  #16
        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4, r11, pc}
endfunc

function ff_\type\()_h264_qpel16_mc21_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
\type\()_h264_qpel16_mc21:
        lowpass_const   r3
        mov             r11, sp                 /* r11 -> saved r0, r1 */
A       bic             sp,  sp,  #15
T       bic             r0,  r11, #15
T       mov             sp,  r0
        sub             sp,  sp,  #(16*16+16*12) /* packed temp + hv scratch */
        sub             r1,  r1,  #2
        mov             r0,  sp
        vpush           {d8-d15}
        bl              put_h264_qpel16_h_lowpass_neon_packed
        mov             r4,  r0                 /* hv scratch follows temp */
        ldrd            r0,  r1,  [r11], #8     /* reload saved dst, src */
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4-r5, r9-r11, pc}
endfunc

function ff_\type\()_h264_qpel16_mc31_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r4, r11, lr}   /* saved src is src + 1 */
        sub             r1,  r1,  #1
        b               \type\()_h264_qpel16_mc11
endfunc

function ff_\type\()_h264_qpel16_mc02_neon, export=1
        push            {r4, lr}
        lowpass_const   r3
        sub             r1,  r1,  r2, lsl #1
        mov             r3,  r2
        vpush           {d8-d15}
        bl              \type\()_h264_qpel16_v_lowpass_neon
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

function ff_\type\()_h264_qpel16_mc12_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
\type\()_h264_qpel16_mc12:
        lowpass_const   r3
        mov             r11, sp                 /* r11 -> saved r0, r1 */
A       bic             sp,  sp,  #15
T       bic             r0,  r11, #15
T       mov             sp,  r0
        sub             sp,  sp,  #(16*16+16*12) /* packed temp + hv scratch */
        sub             r1,  r1,  r2, lsl #1
        mov             r0,  sp
        mov             r3,  r2
        vpush           {d8-d15}
        bl              put_h264_qpel16_v_lowpass_neon_packed
        mov             r4,  r0                 /* hv scratch follows temp */
        ldrd            r0,  r1,  [r11], #8     /* reload saved dst, src */
        sub             r1,  r1,  r3, lsl #1
        sub             r1,  r1,  #2
        mov             r2,  r3
        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4-r5, r9-r11, pc}
endfunc

function ff_\type\()_h264_qpel16_mc22_neon, export=1
        push            {r4, r9-r11, lr}
        lowpass_const   r3
        mov             r11, sp
A       bic             sp,  sp,  #15
T       bic             r4,  r11, #15
T       mov             sp,  r4
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        sub             sp,  sp,  #(16*12)      /* hv scratch */
        mov             r4,  sp
        vpush           {d8-d15}
        bl              \type\()_h264_qpel16_hv_lowpass_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4, r9-r11, pc}
endfunc

function ff_\type\()_h264_qpel16_mc32_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
        add             r1,  r1,  #1            /* vertical pass on x + 1 */
        b               \type\()_h264_qpel16_mc12
endfunc

function ff_\type\()_h264_qpel16_mc03_neon, export=1
        push            {r4, lr}
        add             r12, r1,  r2            /* second source = next row */
        b               \type\()_h264_qpel16_mc01
endfunc

function ff_\type\()_h264_qpel16_mc13_neon, export=1
        push            {r0, r1, r4, r11, lr}
        add             r1,  r1,  r2            /* horizontal pass on y + 1 */
        b               \type\()_h264_qpel16_mc11
endfunc

function ff_\type\()_h264_qpel16_mc23_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
        add             r1,  r1,  r2            /* horizontal pass on y + 1 */
        b               \type\()_h264_qpel16_mc21
endfunc

function ff_\type\()_h264_qpel16_mc33_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r4, r11, lr}   /* saved src is src + 1 */
        add             r1,  r1,  r2
        sub             r1,  r1,  #1            /* horizontal: src + stride */
        b               \type\()_h264_qpel16_mc11
endfunc
.endm

        h264_qpel16     put
        h264_qpel16     avg