/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"
#include "neon.S"

/* H.264 loop filter */
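/* Note: per the H.264 spec, an edge is only deblocked when |p0-q0| < alpha,
 * |p1-p0| < beta and |q1-q0| < beta.  The callers below pass alpha in r2,
 * beta in r3 and a pointer to the four per-edge tc0 values on the stack;
 * h264_loop_filter_start appears to bail out early when alpha or beta is
 * zero, or when all four tc0 values are negative (nothing to filter). */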
.macro h264_loop_filter_start
        ldr r12, [sp]
        tst r2, r2
        ldr r12, [r12]
        it ne
        tstne r3, r3
        vmov.32 d24[0], r12
        and r12, r12, r12, lsl #16
        it eq
        bxeq lr
        ands r12, r12, r12, lsl #8
        it lt
        bxlt lr
.endm
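/* h264_loop_filter_luma expects the rows in q10=p2, q9=p1, q8=p0, q0=q0,
 * q1=q1, q2=q2 (as loaded by the callers below) with the tc0 bytes in d24.
 * It leaves the filtered p0/q0 in q8/q0 and the filtered p1/q1 in q4/q5. */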
.macro h264_loop_filter_luma
        vdup.8 q11, r2 @ alpha
        vmovl.u8 q12, d24
        vabd.u8 q6, q8, q0 @ abs(p0 - q0)
        vmovl.u16 q12, d24
        vabd.u8 q14, q9, q8 @ abs(p1 - p0)
        vsli.16 q12, q12, #8
        vabd.u8 q15, q1, q0 @ abs(q1 - q0)
        vsli.32 q12, q12, #16
        vclt.u8 q6, q6, q11 @ < alpha
        vdup.8 q11, r3 @ beta
        vclt.s8 q7, q12, #0
        vclt.u8 q14, q14, q11 @ < beta
        vclt.u8 q15, q15, q11 @ < beta
        vbic q6, q6, q7
        vabd.u8 q4, q10, q8 @ abs(p2 - p0)
        vand q6, q6, q14
        vabd.u8 q5, q2, q0 @ abs(q2 - q0)
        vclt.u8 q4, q4, q11 @ < beta
        vand q6, q6, q15
        vclt.u8 q5, q5, q11 @ < beta
        vand q4, q4, q6
        vand q5, q5, q6
        vand q12, q12, q6
        vrhadd.u8 q14, q8, q0
        vsub.i8 q6, q12, q4
        vqadd.u8 q7, q9, q12
        vhadd.u8 q10, q10, q14
        vsub.i8 q6, q6, q5
        vhadd.u8 q14, q2, q14
        vmin.u8 q7, q7, q10
        vqsub.u8 q11, q9, q12
        vqadd.u8 q2, q1, q12
        vmax.u8 q7, q7, q11
        vqsub.u8 q11, q1, q12
        vmin.u8 q14, q2, q14
        vmovl.u8 q2, d0
        vmax.u8 q14, q14, q11
        vmovl.u8 q10, d1
        vsubw.u8 q2, q2, d16
        vsubw.u8 q10, q10, d17
        vshl.i16 q2, q2, #2
        vshl.i16 q10, q10, #2
        vaddw.u8 q2, q2, d18
        vaddw.u8 q10, q10, d19
        vsubw.u8 q2, q2, d2
        vsubw.u8 q10, q10, d3
        vrshrn.i16 d4, q2, #3
        vrshrn.i16 d5, q10, #3
        vbsl q4, q7, q9
        vbsl q5, q14, q1
        vneg.s8 q7, q6
        vmovl.u8 q14, d16
        vmin.s8 q2, q2, q6
        vmovl.u8 q6, d17
        vmax.s8 q2, q2, q7
        vmovl.u8 q11, d0
        vmovl.u8 q12, d1
        vaddw.s8 q14, q14, d4
        vaddw.s8 q6, q6, d5
        vsubw.s8 q11, q11, d4
        vsubw.s8 q12, q12, d5
        vqmovun.s16 d16, q14
        vqmovun.s16 d17, q6
        vqmovun.s16 d0, q11
        vqmovun.s16 d1, q12
.endm

function ff_h264_v_loop_filter_luma_neon, export=1
        h264_loop_filter_start
        vld1.8 {d0, d1}, [r0,:128], r1
        vld1.8 {d2, d3}, [r0,:128], r1
        vld1.8 {d4, d5}, [r0,:128], r1
        sub r0, r0, r1, lsl #2
        sub r0, r0, r1, lsl #1
        vld1.8 {d20,d21}, [r0,:128], r1
        vld1.8 {d18,d19}, [r0,:128], r1
        vld1.8 {d16,d17}, [r0,:128], r1
        vpush {d8-d15}
        h264_loop_filter_luma
        sub r0, r0, r1, lsl #1
        vst1.8 {d8, d9}, [r0,:128], r1
        vst1.8 {d16,d17}, [r0,:128], r1
        vst1.8 {d0, d1}, [r0,:128], r1
        vst1.8 {d10,d11}, [r0,:128]
        vpop {d8-d15}
        bx lr
endfunc

function ff_h264_h_loop_filter_luma_neon, export=1
        h264_loop_filter_start
        sub r0, r0, #4
        vld1.8 {d6}, [r0], r1
        vld1.8 {d20}, [r0], r1
        vld1.8 {d18}, [r0], r1
        vld1.8 {d16}, [r0], r1
        vld1.8 {d0}, [r0], r1
        vld1.8 {d2}, [r0], r1
        vld1.8 {d4}, [r0], r1
        vld1.8 {d26}, [r0], r1
        vld1.8 {d7}, [r0], r1
        vld1.8 {d21}, [r0], r1
        vld1.8 {d19}, [r0], r1
        vld1.8 {d17}, [r0], r1
        vld1.8 {d1}, [r0], r1
        vld1.8 {d3}, [r0], r1
        vld1.8 {d5}, [r0], r1
        vld1.8 {d27}, [r0], r1
        transpose_8x8 q3, q10, q9, q8, q0, q1, q2, q13
        vpush {d8-d15}
        h264_loop_filter_luma
        transpose_4x4 q4, q8, q0, q5
        sub r0, r0, r1, lsl #4
        add r0, r0, #2
        vst1.32 {d8[0]}, [r0], r1
        vst1.32 {d16[0]}, [r0], r1
        vst1.32 {d0[0]}, [r0], r1
        vst1.32 {d10[0]}, [r0], r1
        vst1.32 {d8[1]}, [r0], r1
        vst1.32 {d16[1]}, [r0], r1
        vst1.32 {d0[1]}, [r0], r1
        vst1.32 {d10[1]}, [r0], r1
        vst1.32 {d9[0]}, [r0], r1
        vst1.32 {d17[0]}, [r0], r1
        vst1.32 {d1[0]}, [r0], r1
        vst1.32 {d11[0]}, [r0], r1
        vst1.32 {d9[1]}, [r0], r1
        vst1.32 {d17[1]}, [r0], r1
        vst1.32 {d1[1]}, [r0], r1
        vst1.32 {d11[1]}, [r0], r1
        vpop {d8-d15}
        bx lr
endfunc
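/* h264_loop_filter_chroma expects d18=p1, d16=p0, d0=q0, d2=q1 (see the
 * loads in the callers below) with the tc0 bytes in d24, and returns the
 * filtered p0/q0 in d16/d0. */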
.macro h264_loop_filter_chroma
        vdup.8 d22, r2 @ alpha
        vmovl.u8 q12, d24
        vabd.u8 d26, d16, d0 @ abs(p0 - q0)
        vmovl.u8 q2, d0
        vabd.u8 d28, d18, d16 @ abs(p1 - p0)
        vsubw.u8 q2, q2, d16
        vsli.16 d24, d24, #8
        vshl.i16 q2, q2, #2
        vabd.u8 d30, d2, d0 @ abs(q1 - q0)
        vaddw.u8 q2, q2, d18
        vclt.u8 d26, d26, d22 @ < alpha
        vsubw.u8 q2, q2, d2
        vdup.8 d22, r3 @ beta
        vrshrn.i16 d4, q2, #3
        vclt.u8 d28, d28, d22 @ < beta
        vclt.u8 d30, d30, d22 @ < beta
        vmin.s8 d4, d4, d24
        vneg.s8 d25, d24
        vand d26, d26, d28
        vmax.s8 d4, d4, d25
        vand d26, d26, d30
        vmovl.u8 q11, d0
        vand d4, d4, d26
        vmovl.u8 q14, d16
        vaddw.s8 q14, q14, d4
        vsubw.s8 q11, q11, d4
        vqmovun.s16 d16, q14
        vqmovun.s16 d0, q11
.endm

function ff_h264_v_loop_filter_chroma_neon, export=1
        h264_loop_filter_start
        sub r0, r0, r1, lsl #1
        vld1.8 {d18}, [r0,:64], r1
        vld1.8 {d16}, [r0,:64], r1
        vld1.8 {d0}, [r0,:64], r1
        vld1.8 {d2}, [r0,:64]
        h264_loop_filter_chroma
        sub r0, r0, r1, lsl #1
        vst1.8 {d16}, [r0,:64], r1
        vst1.8 {d0}, [r0,:64], r1
        bx lr
endfunc

function ff_h264_h_loop_filter_chroma_neon, export=1
        h264_loop_filter_start
        sub r0, r0, #2
        vld1.32 {d18[0]}, [r0], r1
        vld1.32 {d16[0]}, [r0], r1
        vld1.32 {d0[0]}, [r0], r1
        vld1.32 {d2[0]}, [r0], r1
        vld1.32 {d18[1]}, [r0], r1
        vld1.32 {d16[1]}, [r0], r1
        vld1.32 {d0[1]}, [r0], r1
        vld1.32 {d2[1]}, [r0], r1
        vtrn.16 d18, d0
        vtrn.16 d16, d2
        vtrn.8 d18, d16
        vtrn.8 d0, d2
        h264_loop_filter_chroma
        vtrn.16 d18, d0
        vtrn.16 d16, d2
        vtrn.8 d18, d16
        vtrn.8 d0, d2
        sub r0, r0, r1, lsl #3
        vst1.32 {d18[0]}, [r0], r1
        vst1.32 {d16[0]}, [r0], r1
        vst1.32 {d0[0]}, [r0], r1
        vst1.32 {d2[0]}, [r0], r1
        vst1.32 {d18[1]}, [r0], r1
        vst1.32 {d16[1]}, [r0], r1
        vst1.32 {d0[1]}, [r0], r1
        vst1.32 {d2[1]}, [r0], r1
        bx lr
endfunc

/* H.264 qpel MC */
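/* Note: half-pel samples are produced with the standard H.264 6-tap filter
 * (1, -5, 20, 20, -5, 1), rounded and shifted down by 5 (or by 10 for the
 * combined h+v case).  lowpass_const loads the constants 5 and 20 into the
 * first two 16-bit lanes of d6 for the vmls/vmla steps in the lowpass
 * macros. */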
.macro lowpass_const r
        movw \r, #5
        movt \r, #20
        vmov.32 d6[0], \r
.endm

.macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
  .if \narrow
        t0 .req q0
        t1 .req q8
  .else
        t0 .req \d0
        t1 .req \d1
  .endif
        vext.8 d2, \r0, \r1, #2
        vext.8 d3, \r0, \r1, #3
        vaddl.u8 q1, d2, d3
        vext.8 d4, \r0, \r1, #1
        vext.8 d5, \r0, \r1, #4
        vaddl.u8 q2, d4, d5
        vext.8 d30, \r0, \r1, #5
        vaddl.u8 t0, \r0, d30
        vext.8 d18, \r2, \r3, #2
        vmla.i16 t0, q1, d6[1]
        vext.8 d19, \r2, \r3, #3
        vaddl.u8 q9, d18, d19
        vext.8 d20, \r2, \r3, #1
        vmls.i16 t0, q2, d6[0]
        vext.8 d21, \r2, \r3, #4
        vaddl.u8 q10, d20, d21
        vext.8 d31, \r2, \r3, #5
        vaddl.u8 t1, \r2, d31
        vmla.i16 t1, q9, d6[1]
        vmls.i16 t1, q10, d6[0]
  .if \narrow
        vqrshrun.s16 \d0, t0, #5
        vqrshrun.s16 \d1, t1, #5
  .endif
        .unreq t0
        .unreq t1
.endm

.macro lowpass_8_1 r0, r1, d0, narrow=1
  .if \narrow
        t0 .req q0
  .else
        t0 .req \d0
  .endif
        vext.8 d2, \r0, \r1, #2
        vext.8 d3, \r0, \r1, #3
        vaddl.u8 q1, d2, d3
        vext.8 d4, \r0, \r1, #1
        vext.8 d5, \r0, \r1, #4
        vaddl.u8 q2, d4, d5
        vext.8 d30, \r0, \r1, #5
        vaddl.u8 t0, \r0, d30
        vmla.i16 t0, q1, d6[1]
        vmls.i16 t0, q2, d6[0]
  .if \narrow
        vqrshrun.s16 \d0, t0, #5
  .endif
        .unreq t0
.endm
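/* lowpass_8.16 appears to apply the same 6-tap filter to 16-bit intermediate
 * samples (the output of an un-narrowed first pass), using 32-bit
 * accumulators and a final rounding shift by 10, i.e. both passes'
 * >>5 normalisations combined. */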
.macro lowpass_8.16 r0, r1, l0, h0, l1, h1, d
        vext.16 q1, \r0, \r1, #2
        vext.16 q0, \r0, \r1, #3
        vaddl.s16 q9, d2, d0
        vext.16 q2, \r0, \r1, #1
        vaddl.s16 q1, d3, d1
        vext.16 q3, \r0, \r1, #4
        vaddl.s16 q10, d4, d6
        vext.16 \r1, \r0, \r1, #5
        vaddl.s16 q2, d5, d7
        vaddl.s16 q0, \h0, \h1
        vaddl.s16 q8, \l0, \l1
        vshl.i32 q3, q9, #4
        vshl.i32 q9, q9, #2
        vshl.i32 q15, q10, #2
        vadd.i32 q9, q9, q3
        vadd.i32 q10, q10, q15
        vshl.i32 q3, q1, #4
        vshl.i32 q1, q1, #2
        vshl.i32 q15, q2, #2
        vadd.i32 q1, q1, q3
        vadd.i32 q2, q2, q15
        vadd.i32 q9, q9, q8
        vsub.i32 q9, q9, q10
        vadd.i32 q1, q1, q0
        vsub.i32 q1, q1, q2
        vrshrn.s32 d18, q9, #10
        vrshrn.s32 d19, q1, #10
        vqmovun.s16 \d, q9
.endm

function put_h264_qpel16_h_lowpass_neon_packed
        mov r4, lr
        mov r12, #16
        mov r3, #8
        bl put_h264_qpel8_h_lowpass_neon
        sub r1, r1, r2, lsl #4
        add r1, r1, #8
        mov r12, #16
        mov lr, r4
        b put_h264_qpel8_h_lowpass_neon
endfunc

.macro h264_qpel_h_lowpass type

function \type\()_h264_qpel16_h_lowpass_neon
        push {lr}
        mov r12, #16
        bl \type\()_h264_qpel8_h_lowpass_neon
        sub r0, r0, r3, lsl #4
        sub r1, r1, r2, lsl #4
        add r0, r0, #8
        add r1, r1, #8
        mov r12, #16
        pop {lr}
endfunc

function \type\()_h264_qpel8_h_lowpass_neon
1:      vld1.8 {d0, d1}, [r1], r2
        vld1.8 {d16,d17}, [r1], r2
        subs r12, r12, #2
        lowpass_8 d0, d1, d16, d17, d0, d16
  .ifc \type,avg
        vld1.8 {d2}, [r0,:64], r3
        vrhadd.u8 d0, d0, d2
        vld1.8 {d3}, [r0,:64]
        vrhadd.u8 d16, d16, d3
        sub r0, r0, r3
  .endif
        vst1.8 {d0}, [r0,:64], r3
        vst1.8 {d16}, [r0,:64], r3
        bne 1b
        bx lr
endfunc
.endm

        h264_qpel_h_lowpass put
        h264_qpel_h_lowpass avg

.macro h264_qpel_h_lowpass_l2 type

function \type\()_h264_qpel16_h_lowpass_l2_neon
        push {lr}
        mov r12, #16
        bl \type\()_h264_qpel8_h_lowpass_l2_neon
        sub r0, r0, r2, lsl #4
        sub r1, r1, r2, lsl #4
        sub r3, r3, r2, lsl #4
        add r0, r0, #8
        add r1, r1, #8
        add r3, r3, #8
        mov r12, #16
        pop {lr}
endfunc

function \type\()_h264_qpel8_h_lowpass_l2_neon
1:      vld1.8 {d0, d1}, [r1], r2
        vld1.8 {d16,d17}, [r1], r2
        vld1.8 {d28}, [r3], r2
        vld1.8 {d29}, [r3], r2
        subs r12, r12, #2
        lowpass_8 d0, d1, d16, d17, d0, d1
        vrhadd.u8 q0, q0, q14
  .ifc \type,avg
        vld1.8 {d2}, [r0,:64], r2
        vrhadd.u8 d0, d0, d2
        vld1.8 {d3}, [r0,:64]
        vrhadd.u8 d1, d1, d3
        sub r0, r0, r2
  .endif
        vst1.8 {d0}, [r0,:64], r2
        vst1.8 {d1}, [r0,:64], r2
        bne 1b
        bx lr
endfunc
.endm

        h264_qpel_h_lowpass_l2 put
        h264_qpel_h_lowpass_l2 avg

function put_h264_qpel16_v_lowpass_neon_packed
        mov r4, lr
        mov r2, #8
        bl put_h264_qpel8_v_lowpass_neon
        sub r1, r1, r3, lsl #2
        bl put_h264_qpel8_v_lowpass_neon
        sub r1, r1, r3, lsl #4
        sub r1, r1, r3, lsl #2
        add r1, r1, #8
        bl put_h264_qpel8_v_lowpass_neon
        sub r1, r1, r3, lsl #2
        mov lr, r4
        b put_h264_qpel8_v_lowpass_neon
endfunc

.macro h264_qpel_v_lowpass type

function \type\()_h264_qpel16_v_lowpass_neon
        mov r4, lr
        bl \type\()_h264_qpel8_v_lowpass_neon
        sub r1, r1, r3, lsl #2
        bl \type\()_h264_qpel8_v_lowpass_neon
        sub r0, r0, r2, lsl #4
        add r0, r0, #8
        sub r1, r1, r3, lsl #4
        sub r1, r1, r3, lsl #2
        add r1, r1, #8
        bl \type\()_h264_qpel8_v_lowpass_neon
        sub r1, r1, r3, lsl #2
        mov lr, r4
endfunc

function \type\()_h264_qpel8_v_lowpass_neon
        vld1.8 {d8}, [r1], r3
        vld1.8 {d10}, [r1], r3
        vld1.8 {d12}, [r1], r3
        vld1.8 {d14}, [r1], r3
        vld1.8 {d22}, [r1], r3
        vld1.8 {d24}, [r1], r3
        vld1.8 {d26}, [r1], r3
        vld1.8 {d28}, [r1], r3
        vld1.8 {d9}, [r1], r3
        vld1.8 {d11}, [r1], r3
        vld1.8 {d13}, [r1], r3
        vld1.8 {d15}, [r1], r3
        vld1.8 {d23}, [r1]
        transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
        lowpass_8 d8, d9, d10, d11, d8, d10
        lowpass_8 d12, d13, d14, d15, d12, d14
        lowpass_8 d22, d23, d24, d25, d22, d24
        lowpass_8 d26, d27, d28, d29, d26, d28
        transpose_8x8 d8, d10, d12, d14, d22, d24, d26, d28
  .ifc \type,avg
        vld1.8 {d9}, [r0,:64], r2
        vrhadd.u8 d8, d8, d9
        vld1.8 {d11}, [r0,:64], r2
        vrhadd.u8 d10, d10, d11
        vld1.8 {d13}, [r0,:64], r2
        vrhadd.u8 d12, d12, d13
        vld1.8 {d15}, [r0,:64], r2
        vrhadd.u8 d14, d14, d15
        vld1.8 {d23}, [r0,:64], r2
        vrhadd.u8 d22, d22, d23
        vld1.8 {d25}, [r0,:64], r2
        vrhadd.u8 d24, d24, d25
        vld1.8 {d27}, [r0,:64], r2
        vrhadd.u8 d26, d26, d27
        vld1.8 {d29}, [r0,:64], r2
        vrhadd.u8 d28, d28, d29
        sub r0, r0, r2, lsl #3
  .endif
        vst1.8 {d8}, [r0,:64], r2
        vst1.8 {d10}, [r0,:64], r2
        vst1.8 {d12}, [r0,:64], r2
        vst1.8 {d14}, [r0,:64], r2
        vst1.8 {d22}, [r0,:64], r2
        vst1.8 {d24}, [r0,:64], r2
        vst1.8 {d26}, [r0,:64], r2
        vst1.8 {d28}, [r0,:64], r2
        bx lr
endfunc
.endm

        h264_qpel_v_lowpass put
        h264_qpel_v_lowpass avg

.macro h264_qpel_v_lowpass_l2 type

function \type\()_h264_qpel16_v_lowpass_l2_neon
        mov r4, lr
        bl \type\()_h264_qpel8_v_lowpass_l2_neon
        sub r1, r1, r3, lsl #2
        bl \type\()_h264_qpel8_v_lowpass_l2_neon
        sub r0, r0, r3, lsl #4
        sub r12, r12, r2, lsl #4
        add r0, r0, #8
        add r12, r12, #8
        sub r1, r1, r3, lsl #4
        sub r1, r1, r3, lsl #2
        add r1, r1, #8
        bl \type\()_h264_qpel8_v_lowpass_l2_neon
        sub r1, r1, r3, lsl #2
        mov lr, r4
endfunc

function \type\()_h264_qpel8_v_lowpass_l2_neon
        vld1.8 {d8}, [r1], r3
        vld1.8 {d10}, [r1], r3
        vld1.8 {d12}, [r1], r3
        vld1.8 {d14}, [r1], r3
        vld1.8 {d22}, [r1], r3
        vld1.8 {d24}, [r1], r3
        vld1.8 {d26}, [r1], r3
        vld1.8 {d28}, [r1], r3
        vld1.8 {d9}, [r1], r3
        vld1.8 {d11}, [r1], r3
        vld1.8 {d13}, [r1], r3
        vld1.8 {d15}, [r1], r3
        vld1.8 {d23}, [r1]
        transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
        lowpass_8 d8, d9, d10, d11, d8, d9
        lowpass_8 d12, d13, d14, d15, d12, d13
        lowpass_8 d22, d23, d24, d25, d22, d23
        lowpass_8 d26, d27, d28, d29, d26, d27
        transpose_8x8 d8, d9, d12, d13, d22, d23, d26, d27
        vld1.8 {d0}, [r12], r2
        vld1.8 {d1}, [r12], r2
        vld1.8 {d2}, [r12], r2
        vld1.8 {d3}, [r12], r2
        vld1.8 {d4}, [r12], r2
        vrhadd.u8 q0, q0, q4
        vld1.8 {d5}, [r12], r2
        vrhadd.u8 q1, q1, q6
        vld1.8 {d10}, [r12], r2
        vrhadd.u8 q2, q2, q11
        vld1.8 {d11}, [r12], r2
        vrhadd.u8 q5, q5, q13
  .ifc \type,avg
        vld1.8 {d16}, [r0,:64], r3
        vrhadd.u8 d0, d0, d16
        vld1.8 {d17}, [r0,:64], r3
        vrhadd.u8 d1, d1, d17
        vld1.8 {d16}, [r0,:64], r3
        vrhadd.u8 d2, d2, d16
        vld1.8 {d17}, [r0,:64], r3
        vrhadd.u8 d3, d3, d17
        vld1.8 {d16}, [r0,:64], r3
        vrhadd.u8 d4, d4, d16
        vld1.8 {d17}, [r0,:64], r3
        vrhadd.u8 d5, d5, d17
        vld1.8 {d16}, [r0,:64], r3
        vrhadd.u8 d10, d10, d16
        vld1.8 {d17}, [r0,:64], r3
        vrhadd.u8 d11, d11, d17
        sub r0, r0, r3, lsl #3
  .endif
        vst1.8 {d0}, [r0,:64], r3
        vst1.8 {d1}, [r0,:64], r3
        vst1.8 {d2}, [r0,:64], r3
        vst1.8 {d3}, [r0,:64], r3
        vst1.8 {d4}, [r0,:64], r3
        vst1.8 {d5}, [r0,:64], r3
        vst1.8 {d10}, [r0,:64], r3
        vst1.8 {d11}, [r0,:64], r3
        bx lr
endfunc
.endm

        h264_qpel_v_lowpass_l2 put
        h264_qpel_v_lowpass_l2 avg

function put_h264_qpel8_hv_lowpass_neon_top
        lowpass_const r12
        mov r12, #12
1:      vld1.8 {d0, d1}, [r1], r3
        vld1.8 {d16,d17}, [r1], r3
        subs r12, r12, #2
        lowpass_8 d0, d1, d16, d17, q11, q12, narrow=0
        vst1.8 {d22-d25}, [r4,:128]!
        bne 1b
        vld1.8 {d0, d1}, [r1]
        lowpass_8_1 d0, d1, q12, narrow=0
        mov r12, #-16
        add r4, r4, r12
        vld1.8 {d30,d31}, [r4,:128], r12
        vld1.8 {d20,d21}, [r4,:128], r12
        vld1.8 {d18,d19}, [r4,:128], r12
        vld1.8 {d16,d17}, [r4,:128], r12
        vld1.8 {d14,d15}, [r4,:128], r12
        vld1.8 {d12,d13}, [r4,:128], r12
        vld1.8 {d10,d11}, [r4,:128], r12
        vld1.8 {d8, d9}, [r4,:128], r12
        vld1.8 {d6, d7}, [r4,:128], r12
        vld1.8 {d4, d5}, [r4,:128], r12
        vld1.8 {d2, d3}, [r4,:128], r12
        vld1.8 {d0, d1}, [r4,:128]
        swap4 d1, d3, d5, d7, d8, d10, d12, d14
        transpose16_4x4 q0, q1, q2, q3, q4, q5, q6, q7
        swap4 d17, d19, d21, d31, d24, d26, d28, d22
        transpose16_4x4 q8, q9, q10, q15, q12, q13, q14, q11
        vst1.8 {d30,d31}, [r4,:128]!
        vst1.8 {d6, d7}, [r4,:128]!
        vst1.8 {d20,d21}, [r4,:128]!
        vst1.8 {d4, d5}, [r4,:128]!
        vst1.8 {d18,d19}, [r4,:128]!
        vst1.8 {d2, d3}, [r4,:128]!
        vst1.8 {d16,d17}, [r4,:128]!
        vst1.8 {d0, d1}, [r4,:128]
        lowpass_8.16 q4, q12, d8, d9, d24, d25, d8
        lowpass_8.16 q5, q13, d10, d11, d26, d27, d9
        lowpass_8.16 q6, q14, d12, d13, d28, d29, d10
        lowpass_8.16 q7, q11, d14, d15, d22, d23, d11
        vld1.8 {d16,d17}, [r4,:128], r12
        vld1.8 {d30,d31}, [r4,:128], r12
        lowpass_8.16 q8, q15, d16, d17, d30, d31, d12
        vld1.8 {d16,d17}, [r4,:128], r12
        vld1.8 {d30,d31}, [r4,:128], r12
        lowpass_8.16 q8, q15, d16, d17, d30, d31, d13
        vld1.8 {d16,d17}, [r4,:128], r12
        vld1.8 {d30,d31}, [r4,:128], r12
        lowpass_8.16 q8, q15, d16, d17, d30, d31, d14
        vld1.8 {d16,d17}, [r4,:128], r12
        vld1.8 {d30,d31}, [r4,:128]
        lowpass_8.16 q8, q15, d16, d17, d30, d31, d15
        transpose_8x8 d12, d13, d14, d15, d8, d9, d10, d11
        bx lr
endfunc

.macro h264_qpel8_hv_lowpass type

function \type\()_h264_qpel8_hv_lowpass_neon
        mov r10, lr
        bl put_h264_qpel8_hv_lowpass_neon_top
  .ifc \type,avg
        vld1.8 {d0}, [r0,:64], r2
        vrhadd.u8 d12, d12, d0
        vld1.8 {d1}, [r0,:64], r2
        vrhadd.u8 d13, d13, d1
        vld1.8 {d2}, [r0,:64], r2
        vrhadd.u8 d14, d14, d2
        vld1.8 {d3}, [r0,:64], r2
        vrhadd.u8 d15, d15, d3
        vld1.8 {d4}, [r0,:64], r2
        vrhadd.u8 d8, d8, d4
        vld1.8 {d5}, [r0,:64], r2
        vrhadd.u8 d9, d9, d5
        vld1.8 {d6}, [r0,:64], r2
        vrhadd.u8 d10, d10, d6
        vld1.8 {d7}, [r0,:64], r2
        vrhadd.u8 d11, d11, d7
        sub r0, r0, r2, lsl #3
  .endif
        vst1.8 {d12}, [r0,:64], r2
        vst1.8 {d13}, [r0,:64], r2
        vst1.8 {d14}, [r0,:64], r2
        vst1.8 {d15}, [r0,:64], r2
        vst1.8 {d8}, [r0,:64], r2
        vst1.8 {d9}, [r0,:64], r2
        vst1.8 {d10}, [r0,:64], r2
        vst1.8 {d11}, [r0,:64], r2
        mov lr, r10
        bx lr
endfunc
.endm

        h264_qpel8_hv_lowpass put
        h264_qpel8_hv_lowpass avg

.macro h264_qpel8_hv_lowpass_l2 type

function \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov r10, lr
        bl put_h264_qpel8_hv_lowpass_neon_top
        vld1.8 {d0, d1}, [r2,:128]!
        vld1.8 {d2, d3}, [r2,:128]!
        vrhadd.u8 q0, q0, q6
        vld1.8 {d4, d5}, [r2,:128]!
        vrhadd.u8 q1, q1, q7
        vld1.8 {d6, d7}, [r2,:128]!
        vrhadd.u8 q2, q2, q4
        vrhadd.u8 q3, q3, q5
  .ifc \type,avg
        vld1.8 {d16}, [r0,:64], r3
        vrhadd.u8 d0, d0, d16
        vld1.8 {d17}, [r0,:64], r3
        vrhadd.u8 d1, d1, d17
        vld1.8 {d18}, [r0,:64], r3
        vrhadd.u8 d2, d2, d18
        vld1.8 {d19}, [r0,:64], r3
        vrhadd.u8 d3, d3, d19
        vld1.8 {d20}, [r0,:64], r3
        vrhadd.u8 d4, d4, d20
        vld1.8 {d21}, [r0,:64], r3
        vrhadd.u8 d5, d5, d21
        vld1.8 {d22}, [r0,:64], r3
        vrhadd.u8 d6, d6, d22
        vld1.8 {d23}, [r0,:64], r3
        vrhadd.u8 d7, d7, d23
        sub r0, r0, r3, lsl #3
  .endif
        vst1.8 {d0}, [r0,:64], r3
        vst1.8 {d1}, [r0,:64], r3
        vst1.8 {d2}, [r0,:64], r3
        vst1.8 {d3}, [r0,:64], r3
        vst1.8 {d4}, [r0,:64], r3
        vst1.8 {d5}, [r0,:64], r3
        vst1.8 {d6}, [r0,:64], r3
        vst1.8 {d7}, [r0,:64], r3
        mov lr, r10
        bx lr
endfunc
.endm

        h264_qpel8_hv_lowpass_l2 put
        h264_qpel8_hv_lowpass_l2 avg

.macro h264_qpel16_hv type

function \type\()_h264_qpel16_hv_lowpass_neon
        mov r9, lr
        bl \type\()_h264_qpel8_hv_lowpass_neon
        sub r1, r1, r3, lsl #2
        bl \type\()_h264_qpel8_hv_lowpass_neon
        sub r1, r1, r3, lsl #4
        sub r1, r1, r3, lsl #2
        add r1, r1, #8
        sub r0, r0, r2, lsl #4
        add r0, r0, #8
        bl \type\()_h264_qpel8_hv_lowpass_neon
        sub r1, r1, r3, lsl #2
        mov lr, r9
        b \type\()_h264_qpel8_hv_lowpass_neon
endfunc

function \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov r9, lr
        sub r2, r4, #256
        bl \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub r1, r1, r3, lsl #2
        bl \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub r1, r1, r3, lsl #4
        sub r1, r1, r3, lsl #2
        add r1, r1, #8
        sub r0, r0, r3, lsl #4
        add r0, r0, #8
        bl \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub r1, r1, r3, lsl #2
        mov lr, r9
        b \type\()_h264_qpel8_hv_lowpass_l2_neon
endfunc
.endm

        h264_qpel16_hv put
        h264_qpel16_hv avg
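/* Note: in the ff_*_h264_qpel*_mcXY functions below, X and Y are the
 * horizontal and vertical quarter-pel offsets of the motion vector
 * (0..3); the "_l2" lowpass variants average the 6-tap result with a
 * second input (either the plain source or another half-pel plane) to
 * form the quarter-pel positions. */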
.macro h264_qpel8 type

function ff_\type\()_h264_qpel8_mc10_neon, export=1
        lowpass_const r3
        mov r3, r1
        sub r1, r1, #2
        mov r12, #8
        b \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

function ff_\type\()_h264_qpel8_mc20_neon, export=1
        lowpass_const r3
        sub r1, r1, #2
        mov r3, r2
        mov r12, #8
        b \type\()_h264_qpel8_h_lowpass_neon
endfunc

function ff_\type\()_h264_qpel8_mc30_neon, export=1
        lowpass_const r3
        add r3, r1, #1
        sub r1, r1, #2
        mov r12, #8
        b \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

function ff_\type\()_h264_qpel8_mc01_neon, export=1
        push {lr}
        mov r12, r1
\type\()_h264_qpel8_mc01:
        lowpass_const r3
        mov r3, r2
        sub r1, r1, r2, lsl #1
        vpush {d8-d15}
        bl \type\()_h264_qpel8_v_lowpass_l2_neon
        vpop {d8-d15}
        pop {pc}
endfunc

function ff_\type\()_h264_qpel8_mc11_neon, export=1
        push {r0, r1, r11, lr}
\type\()_h264_qpel8_mc11:
        lowpass_const r3
        mov r11, sp
A       bic sp, sp, #15
T       bic r0, r11, #15
T       mov sp, r0
        sub sp, sp, #64
        mov r0, sp
        sub r1, r1, #2
        mov r3, #8
        mov r12, #8
        vpush {d8-d15}
        bl put_h264_qpel8_h_lowpass_neon
        ldrd r0, [r11], #8
        mov r3, r2
        add r12, sp, #64
        sub r1, r1, r2, lsl #1
        mov r2, #8
        bl \type\()_h264_qpel8_v_lowpass_l2_neon
        vpop {d8-d15}
        mov sp, r11
        pop {r11, pc}
endfunc

function ff_\type\()_h264_qpel8_mc21_neon, export=1
        push {r0, r1, r4, r10, r11, lr}
\type\()_h264_qpel8_mc21:
        lowpass_const r3
        mov r11, sp
A       bic sp, sp, #15
T       bic r0, r11, #15
T       mov sp, r0
        sub sp, sp, #(8*8+16*12)
        sub r1, r1, #2
        mov r3, #8
        mov r0, sp
        mov r12, #8
        vpush {d8-d15}
        bl put_h264_qpel8_h_lowpass_neon
        mov r4, r0
        ldrd r0, [r11], #8
        sub r1, r1, r2, lsl #1
        sub r1, r1, #2
        mov r3, r2
        sub r2, r4, #64
        bl \type\()_h264_qpel8_hv_lowpass_l2_neon
        vpop {d8-d15}
        mov sp, r11
        pop {r4, r10, r11, pc}
endfunc

function ff_\type\()_h264_qpel8_mc31_neon, export=1
        add r1, r1, #1
        push {r0, r1, r11, lr}
        sub r1, r1, #1
        b \type\()_h264_qpel8_mc11
endfunc

function ff_\type\()_h264_qpel8_mc02_neon, export=1
        push {lr}
        lowpass_const r3
        sub r1, r1, r2, lsl #1
        mov r3, r2
        vpush {d8-d15}
        bl \type\()_h264_qpel8_v_lowpass_neon
        vpop {d8-d15}
        pop {pc}
endfunc

function ff_\type\()_h264_qpel8_mc12_neon, export=1
        push {r0, r1, r4, r10, r11, lr}
\type\()_h264_qpel8_mc12:
        lowpass_const r3
        mov r11, sp
A       bic sp, sp, #15
T       bic r0, r11, #15
T       mov sp, r0
        sub sp, sp, #(8*8+16*12)
        sub r1, r1, r2, lsl #1
        mov r3, r2
        mov r2, #8
        mov r0, sp
        vpush {d8-d15}
        bl put_h264_qpel8_v_lowpass_neon
        mov r4, r0
        ldrd r0, [r11], #8
        sub r1, r1, r3, lsl #1
        sub r1, r1, #2
        sub r2, r4, #64
        bl \type\()_h264_qpel8_hv_lowpass_l2_neon
        vpop {d8-d15}
        mov sp, r11
        pop {r4, r10, r11, pc}
endfunc

function ff_\type\()_h264_qpel8_mc22_neon, export=1
        push {r4, r10, r11, lr}
        mov r11, sp
A       bic sp, sp, #15
T       bic r4, r11, #15
T       mov sp, r4
        sub r1, r1, r2, lsl #1
        sub r1, r1, #2
        mov r3, r2
        sub sp, sp, #(16*12)
        mov r4, sp
        vpush {d8-d15}
        bl \type\()_h264_qpel8_hv_lowpass_neon
        vpop {d8-d15}
        mov sp, r11
        pop {r4, r10, r11, pc}
endfunc

function ff_\type\()_h264_qpel8_mc32_neon, export=1
        push {r0, r1, r4, r10, r11, lr}
        add r1, r1, #1
        b \type\()_h264_qpel8_mc12
endfunc

function ff_\type\()_h264_qpel8_mc03_neon, export=1
        push {lr}
        add r12, r1, r2
        b \type\()_h264_qpel8_mc01
endfunc

function ff_\type\()_h264_qpel8_mc13_neon, export=1
        push {r0, r1, r11, lr}
        add r1, r1, r2
        b \type\()_h264_qpel8_mc11
endfunc

function ff_\type\()_h264_qpel8_mc23_neon, export=1
        push {r0, r1, r4, r10, r11, lr}
        add r1, r1, r2
        b \type\()_h264_qpel8_mc21
endfunc

function ff_\type\()_h264_qpel8_mc33_neon, export=1
        add r1, r1, #1
        push {r0, r1, r11, lr}
        add r1, r1, r2
        sub r1, r1, #1
        b \type\()_h264_qpel8_mc11
endfunc
.endm

        h264_qpel8 put
        h264_qpel8 avg

.macro h264_qpel16 type

function ff_\type\()_h264_qpel16_mc10_neon, export=1
        lowpass_const r3
        mov r3, r1
        sub r1, r1, #2
        b \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc

function ff_\type\()_h264_qpel16_mc20_neon, export=1
        lowpass_const r3
        sub r1, r1, #2
        mov r3, r2
        b \type\()_h264_qpel16_h_lowpass_neon
endfunc

function ff_\type\()_h264_qpel16_mc30_neon, export=1
        lowpass_const r3
        add r3, r1, #1
        sub r1, r1, #2
        b \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc

function ff_\type\()_h264_qpel16_mc01_neon, export=1
        push {r4, lr}
        mov r12, r1
\type\()_h264_qpel16_mc01:
        lowpass_const r3
        mov r3, r2
        sub r1, r1, r2, lsl #1
        vpush {d8-d15}
        bl \type\()_h264_qpel16_v_lowpass_l2_neon
        vpop {d8-d15}
        pop {r4, pc}
endfunc

function ff_\type\()_h264_qpel16_mc11_neon, export=1
        push {r0, r1, r4, r11, lr}
\type\()_h264_qpel16_mc11:
        lowpass_const r3
        mov r11, sp
A       bic sp, sp, #15
T       bic r0, r11, #15
T       mov sp, r0
        sub sp, sp, #256
        mov r0, sp
        sub r1, r1, #2
        mov r3, #16
        vpush {d8-d15}
        bl put_h264_qpel16_h_lowpass_neon
        ldrd r0, [r11], #8
        mov r3, r2
        add r12, sp, #64
        sub r1, r1, r2, lsl #1
        mov r2, #16
        bl \type\()_h264_qpel16_v_lowpass_l2_neon
        vpop {d8-d15}
        mov sp, r11
        pop {r4, r11, pc}
endfunc

function ff_\type\()_h264_qpel16_mc21_neon, export=1
        push {r0, r1, r4-r5, r9-r11, lr}
\type\()_h264_qpel16_mc21:
        lowpass_const r3
        mov r11, sp
A       bic sp, sp, #15
T       bic r0, r11, #15
T       mov sp, r0
        sub sp, sp, #(16*16+16*12)
        sub r1, r1, #2
        mov r0, sp
        vpush {d8-d15}
        bl put_h264_qpel16_h_lowpass_neon_packed
        mov r4, r0
        ldrd r0, [r11], #8
        sub r1, r1, r2, lsl #1
        sub r1, r1, #2
        mov r3, r2
        bl \type\()_h264_qpel16_hv_lowpass_l2_neon
        vpop {d8-d15}
        mov sp, r11
        pop {r4-r5, r9-r11, pc}
endfunc

function ff_\type\()_h264_qpel16_mc31_neon, export=1
        add r1, r1, #1
        push {r0, r1, r4, r11, lr}
        sub r1, r1, #1
        b \type\()_h264_qpel16_mc11
endfunc

function ff_\type\()_h264_qpel16_mc02_neon, export=1
        push {r4, lr}
        lowpass_const r3
        sub r1, r1, r2, lsl #1
        mov r3, r2
        vpush {d8-d15}
        bl \type\()_h264_qpel16_v_lowpass_neon
        vpop {d8-d15}
        pop {r4, pc}
endfunc

function ff_\type\()_h264_qpel16_mc12_neon, export=1
        push {r0, r1, r4-r5, r9-r11, lr}
\type\()_h264_qpel16_mc12:
        lowpass_const r3
        mov r11, sp
A       bic sp, sp, #15
T       bic r0, r11, #15
T       mov sp, r0
        sub sp, sp, #(16*16+16*12)
        sub r1, r1, r2, lsl #1
        mov r0, sp
        mov r3, r2
        vpush {d8-d15}
        bl put_h264_qpel16_v_lowpass_neon_packed
        mov r4, r0
        ldrd r0, [r11], #8
        sub r1, r1, r3, lsl #1
        sub r1, r1, #2
        mov r2, r3
        bl \type\()_h264_qpel16_hv_lowpass_l2_neon
        vpop {d8-d15}
        mov sp, r11
        pop {r4-r5, r9-r11, pc}
endfunc

function ff_\type\()_h264_qpel16_mc22_neon, export=1
        push {r4, r9-r11, lr}
        lowpass_const r3
        mov r11, sp
A       bic sp, sp, #15
T       bic r4, r11, #15
T       mov sp, r4
        sub r1, r1, r2, lsl #1
        sub r1, r1, #2
        mov r3, r2
        sub sp, sp, #(16*12)
        mov r4, sp
        vpush {d8-d15}
        bl \type\()_h264_qpel16_hv_lowpass_neon
        vpop {d8-d15}
        mov sp, r11
        pop {r4, r9-r11, pc}
endfunc

function ff_\type\()_h264_qpel16_mc32_neon, export=1
        push {r0, r1, r4-r5, r9-r11, lr}
        add r1, r1, #1
        b \type\()_h264_qpel16_mc12
endfunc

function ff_\type\()_h264_qpel16_mc03_neon, export=1
        push {r4, lr}
        add r12, r1, r2
        b \type\()_h264_qpel16_mc01
endfunc

function ff_\type\()_h264_qpel16_mc13_neon, export=1
        push {r0, r1, r4, r11, lr}
        add r1, r1, r2
        b \type\()_h264_qpel16_mc11
endfunc

function ff_\type\()_h264_qpel16_mc23_neon, export=1
        push {r0, r1, r4-r5, r9-r11, lr}
        add r1, r1, r2
        b \type\()_h264_qpel16_mc21
endfunc

function ff_\type\()_h264_qpel16_mc33_neon, export=1
        add r1, r1, #1
        push {r0, r1, r4, r11, lr}
        add r1, r1, r2
        sub r1, r1, #1
        b \type\()_h264_qpel16_mc11
endfunc
.endm

        h264_qpel16 put
        h264_qpel16 avg

@ Biweighted prediction
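@ Note: weighted bi-prediction as in H.264 (8.4.2.3.2), roughly
@   dst = clip((src0*w0 + src1*w1 + rounding/offset term) >> (log2_denom + 1))
@ where the shift is applied via a negated vector shift count in q9 and the
@ offset term is pre-loaded into q8; the sign of each weight selects one of
@ the vmlal/vmlsl combinations dispatched in biweight_func below.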
.macro biweight_16 macs, macd
        vdup.8 d0, r4
        vdup.8 d1, r5
        vmov q2, q8
        vmov q3, q8
1:      subs r3, r3, #2
        vld1.8 {d20-d21},[r0,:128], r2
        \macd q2, d0, d20
        pld [r0]
        \macd q3, d0, d21
        vld1.8 {d22-d23},[r1,:128], r2
        \macs q2, d1, d22
        pld [r1]
        \macs q3, d1, d23
        vmov q12, q8
        vld1.8 {d28-d29},[r0,:128], r2
        vmov q13, q8
        \macd q12, d0, d28
        pld [r0]
        \macd q13, d0, d29
        vld1.8 {d30-d31},[r1,:128], r2
        \macs q12, d1, d30
        pld [r1]
        \macs q13, d1, d31
        vshl.s16 q2, q2, q9
        vshl.s16 q3, q3, q9
        vqmovun.s16 d4, q2
        vqmovun.s16 d5, q3
        vshl.s16 q12, q12, q9
        vshl.s16 q13, q13, q9
        vqmovun.s16 d24, q12
        vqmovun.s16 d25, q13
        vmov q3, q8
        vst1.8 {d4- d5}, [r6,:128], r2
        vmov q2, q8
        vst1.8 {d24-d25},[r6,:128], r2
        bne 1b
        pop {r4-r6, pc}
.endm

.macro biweight_8 macs, macd
        vdup.8 d0, r4
        vdup.8 d1, r5
        vmov q1, q8
        vmov q10, q8
1:      subs r3, r3, #2
        vld1.8 {d4},[r0,:64], r2
        \macd q1, d0, d4
        pld [r0]
        vld1.8 {d5},[r1,:64], r2
        \macs q1, d1, d5
        pld [r1]
        vld1.8 {d6},[r0,:64], r2
        \macd q10, d0, d6
        pld [r0]
        vld1.8 {d7},[r1,:64], r2
        \macs q10, d1, d7
        pld [r1]
        vshl.s16 q1, q1, q9
        vqmovun.s16 d2, q1
        vshl.s16 q10, q10, q9
        vqmovun.s16 d4, q10
        vmov q10, q8
        vst1.8 {d2},[r6,:64], r2
        vmov q1, q8
        vst1.8 {d4},[r6,:64], r2
        bne 1b
        pop {r4-r6, pc}
.endm

.macro biweight_4 macs, macd
        vdup.8 d0, r4
        vdup.8 d1, r5
        vmov q1, q8
        vmov q10, q8
1:      subs r3, r3, #4
        vld1.32 {d4[0]},[r0,:32], r2
        vld1.32 {d4[1]},[r0,:32], r2
        \macd q1, d0, d4
        pld [r0]
        vld1.32 {d5[0]},[r1,:32], r2
        vld1.32 {d5[1]},[r1,:32], r2
        \macs q1, d1, d5
        pld [r1]
        blt 2f
        vld1.32 {d6[0]},[r0,:32], r2
        vld1.32 {d6[1]},[r0,:32], r2
        \macd q10, d0, d6
        pld [r0]
        vld1.32 {d7[0]},[r1,:32], r2
        vld1.32 {d7[1]},[r1,:32], r2
        \macs q10, d1, d7
        pld [r1]
        vshl.s16 q1, q1, q9
        vqmovun.s16 d2, q1
        vshl.s16 q10, q10, q9
        vqmovun.s16 d4, q10
        vmov q10, q8
        vst1.32 {d2[0]},[r6,:32], r2
        vst1.32 {d2[1]},[r6,:32], r2
        vmov q1, q8
        vst1.32 {d4[0]},[r6,:32], r2
        vst1.32 {d4[1]},[r6,:32], r2
        bne 1b
        pop {r4-r6, pc}
2:      vshl.s16 q1, q1, q9
        vqmovun.s16 d2, q1
        vst1.32 {d2[0]},[r6,:32], r2
        vst1.32 {d2[1]},[r6,:32], r2
        pop {r4-r6, pc}
.endm

.macro biweight_func w

function ff_biweight_h264_pixels_\w\()_neon, export=1
        push {r4-r6, lr}
        ldr r12, [sp, #16]
        add r4, sp, #20
        ldm r4, {r4-r6}
        lsr lr, r4, #31
        add r6, r6, #1
        eors lr, lr, r5, lsr #30
        orr r6, r6, #1
        vdup.16 q9, r12
        lsl r6, r6, r12
        vmvn q9, q9
        vdup.16 q8, r6
        mov r6, r0
        beq 10f
        subs lr, lr, #1
        beq 20f
        subs lr, lr, #1
        beq 30f
        b 40f
10:     biweight_\w vmlal.u8, vmlal.u8
20:     rsb r4, r4, #0
        biweight_\w vmlal.u8, vmlsl.u8
30:     rsb r4, r4, #0
        rsb r5, r5, #0
        biweight_\w vmlsl.u8, vmlsl.u8
40:     rsb r5, r5, #0
        biweight_\w vmlsl.u8, vmlal.u8
endfunc
.endm

        biweight_func 16
        biweight_func 8
        biweight_func 4

@ Weighted prediction
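@ Note: unidirectional weighted prediction, roughly
@   dst = clip((src*w + (offset << log2_denom)) >> log2_denom)
@ with rounding; the offset term is pre-shifted into q8 and the shift count
@ into q9.  The vadd/vhadd and vsub/vhsub variants dispatched in weight_func
@ below cover the sign of the weight and the small/large log2_denom cases.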
.macro weight_16 add
        vdup.8 d0, r12
1:      subs r2, r2, #2
        vld1.8 {d20-d21},[r0,:128], r1
        vmull.u8 q2, d0, d20
        pld [r0]
        vmull.u8 q3, d0, d21
        vld1.8 {d28-d29},[r0,:128], r1
        vmull.u8 q12, d0, d28
        pld [r0]
        vmull.u8 q13, d0, d29
        \add q2, q8, q2
        vrshl.s16 q2, q2, q9
        \add q3, q8, q3
        vrshl.s16 q3, q3, q9
        vqmovun.s16 d4, q2
        vqmovun.s16 d5, q3
        \add q12, q8, q12
        vrshl.s16 q12, q12, q9
        \add q13, q8, q13
        vrshl.s16 q13, q13, q9
        vqmovun.s16 d24, q12
        vqmovun.s16 d25, q13
        vst1.8 {d4- d5}, [r4,:128], r1
        vst1.8 {d24-d25},[r4,:128], r1
        bne 1b
        pop {r4, pc}
.endm

.macro weight_8 add
        vdup.8 d0, r12
1:      subs r2, r2, #2
        vld1.8 {d4},[r0,:64], r1
        vmull.u8 q1, d0, d4
        pld [r0]
        vld1.8 {d6},[r0,:64], r1
        vmull.u8 q10, d0, d6
        \add q1, q8, q1
        pld [r0]
        vrshl.s16 q1, q1, q9
        vqmovun.s16 d2, q1
        \add q10, q8, q10
        vrshl.s16 q10, q10, q9
        vqmovun.s16 d4, q10
        vst1.8 {d2},[r4,:64], r1
        vst1.8 {d4},[r4,:64], r1
        bne 1b
        pop {r4, pc}
.endm

.macro weight_4 add
        vdup.8 d0, r12
        vmov q1, q8
        vmov q10, q8
1:      subs r2, r2, #4
        vld1.32 {d4[0]},[r0,:32], r1
        vld1.32 {d4[1]},[r0,:32], r1
        vmull.u8 q1, d0, d4
        pld [r0]
        blt 2f
        vld1.32 {d6[0]},[r0,:32], r1
        vld1.32 {d6[1]},[r0,:32], r1
        vmull.u8 q10, d0, d6
        pld [r0]
        \add q1, q8, q1
        vrshl.s16 q1, q1, q9
        vqmovun.s16 d2, q1
        \add q10, q8, q10
        vrshl.s16 q10, q10, q9
        vqmovun.s16 d4, q10
        vmov q10, q8
        vst1.32 {d2[0]},[r4,:32], r1
        vst1.32 {d2[1]},[r4,:32], r1
        vmov q1, q8
        vst1.32 {d4[0]},[r4,:32], r1
        vst1.32 {d4[1]},[r4,:32], r1
        bne 1b
        pop {r4, pc}
2:      \add q1, q8, q1
        vrshl.s16 q1, q1, q9
        vqmovun.s16 d2, q1
        vst1.32 {d2[0]},[r4,:32], r1
        vst1.32 {d2[1]},[r4,:32], r1
        pop {r4, pc}
.endm

.macro weight_func w

function ff_weight_h264_pixels_\w\()_neon, export=1
        push {r4, lr}
        ldr r12, [sp, #8]
        ldr r4, [sp, #12]
        cmp r3, #1
        lsl r4, r4, r3
        vdup.16 q8, r4
        mov r4, r0
        ble 20f
        rsb lr, r3, #1
        vdup.16 q9, lr
        cmp r12, #0
        blt 10f
        weight_\w vhadd.s16
10:     rsb r12, r12, #0
        weight_\w vhsub.s16
20:     rsb lr, r3, #0
        vdup.16 q9, lr
        cmp r12, #0
        blt 10f
        weight_\w vadd.s16
10:     rsb r12, r12, #0
        weight_\w vsub.s16
endfunc
.endm

        weight_func 16
        weight_func 8
        weight_func 4