  1. /*
  2. * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  3. *
  4. * This file is part of FFmpeg.
  5. *
  6. * FFmpeg is free software; you can redistribute it and/or
  7. * modify it under the terms of the GNU Lesser General Public
  8. * License as published by the Free Software Foundation; either
  9. * version 2.1 of the License, or (at your option) any later version.
  10. *
  11. * FFmpeg is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. * Lesser General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Lesser General Public
  17. * License along with FFmpeg; if not, write to the Free Software
  18. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19. */
  20. #include "asm.S"
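/* In-register transpose helpers.  transpose_8x8 treats its eight operands
 * as the rows of an 8x8 byte matrix and transposes it in place with the
 * usual vtrn.32/vtrn.16/vtrn.8 butterfly; on q registers the same sequence
 * transposes two such 8x8 blocks in parallel.  transpose_4x4 is the 4x4
 * byte variant, swap4 exchanges two groups of four registers, and
 * transpose16_4x4 does the same for 4x4 blocks of 16-bit elements. */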
  21. .macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7
  22. vtrn.32 \r0, \r4
  23. vtrn.32 \r1, \r5
  24. vtrn.32 \r2, \r6
  25. vtrn.32 \r3, \r7
  26. vtrn.16 \r0, \r2
  27. vtrn.16 \r1, \r3
  28. vtrn.16 \r4, \r6
  29. vtrn.16 \r5, \r7
  30. vtrn.8 \r0, \r1
  31. vtrn.8 \r2, \r3
  32. vtrn.8 \r4, \r5
  33. vtrn.8 \r6, \r7
  34. .endm
  35. .macro transpose_4x4 r0 r1 r2 r3
  36. vtrn.16 \r0, \r2
  37. vtrn.16 \r1, \r3
  38. vtrn.8 \r0, \r1
  39. vtrn.8 \r2, \r3
  40. .endm
  41. .macro swap4 r0 r1 r2 r3 r4 r5 r6 r7
  42. vswp \r0, \r4
  43. vswp \r1, \r5
  44. vswp \r2, \r6
  45. vswp \r3, \r7
  46. .endm
  47. .macro transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7
  48. vtrn.32 \r0, \r2
  49. vtrn.32 \r1, \r3
  50. vtrn.32 \r4, \r6
  51. vtrn.32 \r5, \r7
  52. vtrn.16 \r0, \r1
  53. vtrn.16 \r2, \r3
  54. vtrn.16 \r4, \r5
  55. vtrn.16 \r6, \r7
  56. .endm
  57. /* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
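/* x and y are the fractional (eighth-pel) chroma MV components, 0..7.
 * The code below builds the four bilinear weights
 *   A = (8-x)*(8-y)  (r4)    B = x*(8-y)  (ip)
 *   C = (8-x)*y      (r6)    D = x*y      (r7)
 * and computes, two rows per iteration,
 *   dst[i] = (A*src[i] + B*src[i+1] + C*src[i+stride] + D*src[i+stride+1] + 32) >> 6
 * The branches to 2f/4f handle the degenerate cases where x and/or y is
 * zero with a one-dimensional filter, or a plain copy (average for the
 * avg variant) when both are zero. */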
  58. .macro h264_chroma_mc8 type
  59. function ff_\type\()_h264_chroma_mc8_neon, export=1
  60. push {r4-r7, lr}
  61. ldrd r4, [sp, #20]
  62. .ifc \type,avg
  63. mov lr, r0
  64. .endif
  65. pld [r1]
  66. pld [r1, r2]
  67. muls r7, r4, r5
  68. rsb r6, r7, r5, lsl #3
  69. rsb ip, r7, r4, lsl #3
  70. sub r4, r7, r4, lsl #3
  71. sub r4, r4, r5, lsl #3
  72. add r4, r4, #64
  73. beq 2f
  74. add r5, r1, r2
  75. vdup.8 d0, r4
  76. lsl r4, r2, #1
  77. vdup.8 d1, ip
  78. vld1.64 {d4, d5}, [r1], r4
  79. vdup.8 d2, r6
  80. vld1.64 {d6, d7}, [r5], r4
  81. vdup.8 d3, r7
  82. vext.8 d5, d4, d5, #1
  83. vext.8 d7, d6, d7, #1
  84. 1: pld [r5]
  85. vmull.u8 q8, d4, d0
  86. vmlal.u8 q8, d5, d1
  87. vld1.64 {d4, d5}, [r1], r4
  88. vmlal.u8 q8, d6, d2
  89. vext.8 d5, d4, d5, #1
  90. vmlal.u8 q8, d7, d3
  91. vmull.u8 q9, d6, d0
  92. subs r3, r3, #2
  93. vmlal.u8 q9, d7, d1
  94. vmlal.u8 q9, d4, d2
  95. vmlal.u8 q9, d5, d3
  96. vrshrn.u16 d16, q8, #6
  97. vld1.64 {d6, d7}, [r5], r4
  98. pld [r1]
  99. vrshrn.u16 d17, q9, #6
  100. .ifc \type,avg
  101. vld1.64 {d20}, [lr,:64], r2
  102. vld1.64 {d21}, [lr,:64], r2
  103. vrhadd.u8 q8, q8, q10
  104. .endif
  105. vext.8 d7, d6, d7, #1
  106. vst1.64 {d16}, [r0,:64], r2
  107. vst1.64 {d17}, [r0,:64], r2
  108. bgt 1b
  109. pop {r4-r7, pc}
  110. 2: tst r6, r6
  111. add ip, ip, r6
  112. vdup.8 d0, r4
  113. vdup.8 d1, ip
  114. beq 4f
  115. add r5, r1, r2
  116. lsl r4, r2, #1
  117. vld1.64 {d4}, [r1], r4
  118. vld1.64 {d6}, [r5], r4
  119. 3: pld [r5]
  120. vmull.u8 q8, d4, d0
  121. vmlal.u8 q8, d6, d1
  122. vld1.64 {d4}, [r1], r4
  123. vmull.u8 q9, d6, d0
  124. vmlal.u8 q9, d4, d1
  125. vld1.64 {d6}, [r5], r4
  126. vrshrn.u16 d16, q8, #6
  127. vrshrn.u16 d17, q9, #6
  128. .ifc \type,avg
  129. vld1.64 {d20}, [lr,:64], r2
  130. vld1.64 {d21}, [lr,:64], r2
  131. vrhadd.u8 q8, q8, q10
  132. .endif
  133. subs r3, r3, #2
  134. pld [r1]
  135. vst1.64 {d16}, [r0,:64], r2
  136. vst1.64 {d17}, [r0,:64], r2
  137. bgt 3b
  138. pop {r4-r7, pc}
  139. 4: vld1.64 {d4, d5}, [r1], r2
  140. vld1.64 {d6, d7}, [r1], r2
  141. vext.8 d5, d4, d5, #1
  142. vext.8 d7, d6, d7, #1
  143. 5: pld [r1]
  144. subs r3, r3, #2
  145. vmull.u8 q8, d4, d0
  146. vmlal.u8 q8, d5, d1
  147. vld1.64 {d4, d5}, [r1], r2
  148. vmull.u8 q9, d6, d0
  149. vmlal.u8 q9, d7, d1
  150. pld [r1]
  151. vext.8 d5, d4, d5, #1
  152. vrshrn.u16 d16, q8, #6
  153. vrshrn.u16 d17, q9, #6
  154. .ifc \type,avg
  155. vld1.64 {d20}, [lr,:64], r2
  156. vld1.64 {d21}, [lr,:64], r2
  157. vrhadd.u8 q8, q8, q10
  158. .endif
  159. vld1.64 {d6, d7}, [r1], r2
  160. vext.8 d7, d6, d7, #1
  161. vst1.64 {d16}, [r0,:64], r2
  162. vst1.64 {d17}, [r0,:64], r2
  163. bgt 5b
  164. pop {r4-r7, pc}
  165. endfunc
  166. .endm
  167. /* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
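/* Same bilinear filter as chroma_mc8, but only 4 pixels wide.  The four
 * weights are interleaved into d0/d2 and the two 4-pixel taps of each row
 * into a single d register (vtrn.32), so one vmull per row covers both
 * horizontal taps; the two halfword halves are then folded together with
 * vadd.i16 before the rounding shift. */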
  168. .macro h264_chroma_mc4 type
  169. function ff_\type\()_h264_chroma_mc4_neon, export=1
  170. push {r4-r7, lr}
  171. ldrd r4, [sp, #20]
  172. .ifc \type,avg
  173. mov lr, r0
  174. .endif
  175. pld [r1]
  176. pld [r1, r2]
  177. muls r7, r4, r5
  178. rsb r6, r7, r5, lsl #3
  179. rsb ip, r7, r4, lsl #3
  180. sub r4, r7, r4, lsl #3
  181. sub r4, r4, r5, lsl #3
  182. add r4, r4, #64
  183. beq 2f
  184. add r5, r1, r2
  185. vdup.8 d0, r4
  186. lsl r4, r2, #1
  187. vdup.8 d1, ip
  188. vld1.64 {d4}, [r1], r4
  189. vdup.8 d2, r6
  190. vld1.64 {d6}, [r5], r4
  191. vdup.8 d3, r7
  192. vext.8 d5, d4, d5, #1
  193. vext.8 d7, d6, d7, #1
  194. vtrn.32 d4, d5
  195. vtrn.32 d6, d7
  196. vtrn.32 d0, d1
  197. vtrn.32 d2, d3
  198. 1: pld [r5]
  199. vmull.u8 q8, d4, d0
  200. vmlal.u8 q8, d6, d2
  201. vld1.64 {d4}, [r1], r4
  202. vext.8 d5, d4, d5, #1
  203. vtrn.32 d4, d5
  204. vmull.u8 q9, d6, d0
  205. vmlal.u8 q9, d4, d2
  206. vld1.64 {d6}, [r5], r4
  207. vadd.i16 d16, d16, d17
  208. vadd.i16 d17, d18, d19
  209. vrshrn.u16 d16, q8, #6
  210. subs r3, r3, #2
  211. pld [r1]
  212. .ifc \type,avg
  213. vld1.32 {d20[0]}, [lr,:32], r2
  214. vld1.32 {d20[1]}, [lr,:32], r2
  215. vrhadd.u8 d16, d16, d20
  216. .endif
  217. vext.8 d7, d6, d7, #1
  218. vtrn.32 d6, d7
  219. vst1.32 {d16[0]}, [r0,:32], r2
  220. vst1.32 {d16[1]}, [r0,:32], r2
  221. bgt 1b
  222. pop {r4-r7, pc}
  223. 2: tst r6, r6
  224. add ip, ip, r6
  225. vdup.8 d0, r4
  226. vdup.8 d1, ip
  227. vtrn.32 d0, d1
  228. beq 4f
  229. vext.32 d1, d0, d1, #1
  230. add r5, r1, r2
  231. lsl r4, r2, #1
  232. vld1.32 {d4[0]}, [r1], r4
  233. vld1.32 {d4[1]}, [r5], r4
  234. 3: pld [r5]
  235. vmull.u8 q8, d4, d0
  236. vld1.32 {d4[0]}, [r1], r4
  237. vmull.u8 q9, d4, d1
  238. vld1.32 {d4[1]}, [r5], r4
  239. vadd.i16 d16, d16, d17
  240. vadd.i16 d17, d18, d19
  241. vrshrn.u16 d16, q8, #6
  242. .ifc \type,avg
  243. vld1.32 {d20[0]}, [lr,:32], r2
  244. vld1.32 {d20[1]}, [lr,:32], r2
  245. vrhadd.u8 d16, d16, d20
  246. .endif
  247. subs r3, r3, #2
  248. pld [r1]
  249. vst1.32 {d16[0]}, [r0,:32], r2
  250. vst1.32 {d16[1]}, [r0,:32], r2
  251. bgt 3b
  252. pop {r4-r7, pc}
  253. 4: vld1.64 {d4}, [r1], r2
  254. vld1.64 {d6}, [r1], r2
  255. vext.8 d5, d4, d5, #1
  256. vext.8 d7, d6, d7, #1
  257. vtrn.32 d4, d5
  258. vtrn.32 d6, d7
  259. 5: vmull.u8 q8, d4, d0
  260. vmull.u8 q9, d6, d0
  261. subs r3, r3, #2
  262. vld1.64 {d4}, [r1], r2
  263. vext.8 d5, d4, d5, #1
  264. vtrn.32 d4, d5
  265. vadd.i16 d16, d16, d17
  266. vadd.i16 d17, d18, d19
  267. pld [r1]
  268. vrshrn.u16 d16, q8, #6
  269. .ifc \type,avg
  270. vld1.32 {d20[0]}, [lr,:32], r2
  271. vld1.32 {d20[1]}, [lr,:32], r2
  272. vrhadd.u8 d16, d16, d20
  273. .endif
  274. vld1.64 {d6}, [r1], r2
  275. vext.8 d7, d6, d7, #1
  276. vtrn.32 d6, d7
  277. pld [r1]
  278. vst1.32 {d16[0]}, [r0,:32], r2
  279. vst1.32 {d16[1]}, [r0,:32], r2
  280. bgt 5b
  281. pop {r4-r7, pc}
  282. endfunc
  283. .endm
  284. .macro h264_chroma_mc2 type
  285. function ff_\type\()_h264_chroma_mc2_neon, export=1
  286. push {r4-r6, lr}
  287. ldr r4, [sp, #16]
  288. ldr lr, [sp, #20]
  289. pld [r1]
  290. pld [r1, r2]
  291. orrs r5, r4, lr
  292. beq 2f
  293. mul r5, r4, lr
  294. rsb r6, r5, lr, lsl #3
  295. rsb r12, r5, r4, lsl #3
  296. sub r4, r5, r4, lsl #3
  297. sub r4, r4, lr, lsl #3
  298. add r4, r4, #64
  299. vdup.8 d0, r4
  300. vdup.8 d2, r12
  301. vdup.8 d1, r6
  302. vdup.8 d3, r5
  303. vtrn.16 q0, q1
  304. 1:
  305. vld1.32 {d4[0]}, [r1], r2
  306. vld1.32 {d4[1]}, [r1], r2
  307. vrev64.32 d5, d4
  308. vld1.32 {d5[1]}, [r1]
  309. vext.8 q3, q2, q2, #1
  310. vtrn.16 q2, q3
  311. vmull.u8 q8, d4, d0
  312. vmlal.u8 q8, d5, d1
  313. .ifc \type,avg
  314. vld1.16 {d18[0]}, [r0,:16], r2
  315. vld1.16 {d18[1]}, [r0,:16]
  316. sub r0, r0, r2
  317. .endif
  318. vtrn.32 d16, d17
  319. vadd.i16 d16, d16, d17
  320. vrshrn.u16 d16, q8, #6
  321. .ifc \type,avg
  322. vrhadd.u8 d16, d16, d18
  323. .endif
  324. vst1.16 {d16[0]}, [r0,:16], r2
  325. vst1.16 {d16[1]}, [r0,:16], r2
  326. subs r3, r3, #2
  327. bgt 1b
  328. pop {r4-r6, pc}
  329. 2:
  330. .ifc \type,put
  331. ldrh r5, [r1], r2
  332. strh r5, [r0], r2
  333. ldrh r6, [r1], r2
  334. strh r6, [r0], r2
  335. .else
  336. vld1.16 {d16[0]}, [r1], r2
  337. vld1.16 {d16[1]}, [r1], r2
  338. vld1.16 {d18[0]}, [r0,:16], r2
  339. vld1.16 {d18[1]}, [r0,:16]
  340. sub r0, r0, r2
  341. vrhadd.u8 d16, d16, d18
  342. vst1.16 {d16[0]}, [r0,:16], r2
  343. vst1.16 {d16[1]}, [r0,:16], r2
  344. .endif
  345. subs r3, r3, #2
  346. bgt 2b
  347. pop {r4-r6, pc}
  348. endfunc
  349. .endm
  350. .text
  351. .align
  352. h264_chroma_mc8 put
  353. h264_chroma_mc8 avg
  354. h264_chroma_mc4 put
  355. h264_chroma_mc4 avg
  356. h264_chroma_mc2 put
  357. h264_chroma_mc2 avg
  358. /* H.264 loop filter */
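/* Loop filter argument layout, as used below: r0 points at the edge being
 * filtered, r1 is the line stride, r2 = alpha, r3 = beta, and the stack
 * argument is a pointer to the four tc0 values for this edge.
 * h264_loop_filter_start loads the packed tc0 bytes into d24[0] and
 * returns early when alpha or beta is zero, or when all four tc0 values
 * are negative, i.e. when nothing would be filtered. */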
  359. .macro h264_loop_filter_start
  360. ldr ip, [sp]
  361. tst r2, r2
  362. ldr ip, [ip]
  363. tstne r3, r3
  364. vmov.32 d24[0], ip
  365. and ip, ip, ip, lsl #16
  366. bxeq lr
  367. ands ip, ip, ip, lsl #8
  368. bxlt lr
  369. .endm
  370. .macro align_push_regs
  371. and ip, sp, #15
  372. add ip, ip, #32
  373. sub sp, sp, ip
  374. vst1.64 {d12-d15}, [sp,:128]
  375. sub sp, sp, #32
  376. vst1.64 {d8-d11}, [sp,:128]
  377. .endm
  378. .macro align_pop_regs
  379. vld1.64 {d8-d11}, [sp,:128]!
  380. vld1.64 {d12-d15}, [sp,:128], ip
  381. .endm
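/* Normal (bS < 4) luma filter for 16 pixels of one edge.  On entry q10/q9/q8
 * hold the p2/p1/p0 rows, NEON q0/q1/q2 hold the q0/q1/q2 rows, and d24[0]
 * holds the four tc0 bytes (each tc0 value covers four pixels).  In essence,
 * for the pixels passing the |p0-q0| < alpha, |p1-p0| < beta, |q1-q0| < beta
 * tests it applies
 *   delta = clip(((q0-p0)*4 + (p1-q1) + 4) >> 3, -tc, tc)
 * with tc = tc0 + (|p2-p0| < beta) + (|q2-q0| < beta), leaving the new
 * p0/q0 in q8/q0 and the conditionally filtered p1/q1 in q4/q5. */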
  382. .macro h264_loop_filter_luma
  383. vdup.8 q11, r2 @ alpha
  384. vmovl.u8 q12, d24
  385. vabd.u8 q6, q8, q0 @ abs(p0 - q0)
  386. vmovl.u16 q12, d24
  387. vabd.u8 q14, q9, q8 @ abs(p1 - p0)
  388. vsli.16 q12, q12, #8
  389. vabd.u8 q15, q1, q0 @ abs(q1 - q0)
  390. vsli.32 q12, q12, #16
  391. vclt.u8 q6, q6, q11 @ < alpha
  392. vdup.8 q11, r3 @ beta
  393. vclt.s8 q7, q12, #0
  394. vclt.u8 q14, q14, q11 @ < beta
  395. vclt.u8 q15, q15, q11 @ < beta
  396. vbic q6, q6, q7
  397. vabd.u8 q4, q10, q8 @ abs(p2 - p0)
  398. vand q6, q6, q14
  399. vabd.u8 q5, q2, q0 @ abs(q2 - q0)
  400. vclt.u8 q4, q4, q11 @ < beta
  401. vand q6, q6, q15
  402. vclt.u8 q5, q5, q11 @ < beta
  403. vand q4, q4, q6
  404. vand q5, q5, q6
  405. vand q12, q12, q6
  406. vrhadd.u8 q14, q8, q0
  407. vsub.i8 q6, q12, q4
  408. vqadd.u8 q7, q9, q12
  409. vhadd.u8 q10, q10, q14
  410. vsub.i8 q6, q6, q5
  411. vhadd.u8 q14, q2, q14
  412. vmin.u8 q7, q7, q10
  413. vqsub.u8 q11, q9, q12
  414. vqadd.u8 q2, q1, q12
  415. vmax.u8 q7, q7, q11
  416. vqsub.u8 q11, q1, q12
  417. vmin.u8 q14, q2, q14
  418. vmovl.u8 q2, d0
  419. vmax.u8 q14, q14, q11
  420. vmovl.u8 q10, d1
  421. vsubw.u8 q2, q2, d16
  422. vsubw.u8 q10, q10, d17
  423. vshl.i16 q2, q2, #2
  424. vshl.i16 q10, q10, #2
  425. vaddw.u8 q2, q2, d18
  426. vaddw.u8 q10, q10, d19
  427. vsubw.u8 q2, q2, d2
  428. vsubw.u8 q10, q10, d3
  429. vrshrn.i16 d4, q2, #3
  430. vrshrn.i16 d5, q10, #3
  431. vbsl q4, q7, q9
  432. vbsl q5, q14, q1
  433. vneg.s8 q7, q6
  434. vmovl.u8 q14, d16
  435. vmin.s8 q2, q2, q6
  436. vmovl.u8 q6, d17
  437. vmax.s8 q2, q2, q7
  438. vmovl.u8 q11, d0
  439. vmovl.u8 q12, d1
  440. vaddw.s8 q14, q14, d4
  441. vaddw.s8 q6, q6, d5
  442. vsubw.s8 q11, q11, d4
  443. vsubw.s8 q12, q12, d5
  444. vqmovun.s16 d16, q14
  445. vqmovun.s16 d17, q6
  446. vqmovun.s16 d0, q11
  447. vqmovun.s16 d1, q12
  448. .endm
  449. function ff_h264_v_loop_filter_luma_neon, export=1
  450. h264_loop_filter_start
  451. vld1.64 {d0, d1}, [r0,:128], r1
  452. vld1.64 {d2, d3}, [r0,:128], r1
  453. vld1.64 {d4, d5}, [r0,:128], r1
  454. sub r0, r0, r1, lsl #2
  455. sub r0, r0, r1, lsl #1
  456. vld1.64 {d20,d21}, [r0,:128], r1
  457. vld1.64 {d18,d19}, [r0,:128], r1
  458. vld1.64 {d16,d17}, [r0,:128], r1
  459. align_push_regs
  460. h264_loop_filter_luma
  461. sub r0, r0, r1, lsl #1
  462. vst1.64 {d8, d9}, [r0,:128], r1
  463. vst1.64 {d16,d17}, [r0,:128], r1
  464. vst1.64 {d0, d1}, [r0,:128], r1
  465. vst1.64 {d10,d11}, [r0,:128]
  466. align_pop_regs
  467. bx lr
  468. endfunc
  469. function ff_h264_h_loop_filter_luma_neon, export=1
  470. h264_loop_filter_start
  471. sub r0, r0, #4
  472. vld1.64 {d6}, [r0], r1
  473. vld1.64 {d20}, [r0], r1
  474. vld1.64 {d18}, [r0], r1
  475. vld1.64 {d16}, [r0], r1
  476. vld1.64 {d0}, [r0], r1
  477. vld1.64 {d2}, [r0], r1
  478. vld1.64 {d4}, [r0], r1
  479. vld1.64 {d26}, [r0], r1
  480. vld1.64 {d7}, [r0], r1
  481. vld1.64 {d21}, [r0], r1
  482. vld1.64 {d19}, [r0], r1
  483. vld1.64 {d17}, [r0], r1
  484. vld1.64 {d1}, [r0], r1
  485. vld1.64 {d3}, [r0], r1
  486. vld1.64 {d5}, [r0], r1
  487. vld1.64 {d27}, [r0], r1
  488. transpose_8x8 q3, q10, q9, q8, q0, q1, q2, q13
  489. align_push_regs
  490. h264_loop_filter_luma
  491. transpose_4x4 q4, q8, q0, q5
  492. sub r0, r0, r1, lsl #4
  493. add r0, r0, #2
  494. vst1.32 {d8[0]}, [r0], r1
  495. vst1.32 {d16[0]}, [r0], r1
  496. vst1.32 {d0[0]}, [r0], r1
  497. vst1.32 {d10[0]}, [r0], r1
  498. vst1.32 {d8[1]}, [r0], r1
  499. vst1.32 {d16[1]}, [r0], r1
  500. vst1.32 {d0[1]}, [r0], r1
  501. vst1.32 {d10[1]}, [r0], r1
  502. vst1.32 {d9[0]}, [r0], r1
  503. vst1.32 {d17[0]}, [r0], r1
  504. vst1.32 {d1[0]}, [r0], r1
  505. vst1.32 {d11[0]}, [r0], r1
  506. vst1.32 {d9[1]}, [r0], r1
  507. vst1.32 {d17[1]}, [r0], r1
  508. vst1.32 {d1[1]}, [r0], r1
  509. vst1.32 {d11[1]}, [r0], r1
  510. align_pop_regs
  511. bx lr
  512. endfunc
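/* Chroma filter for 8 pixels of one edge: the same
 *   delta = clip(((q0-p0)*4 + (p1-q1) + 4) >> 3, ...)
 * update as the luma filter, clipped to the per-pixel tc range built from
 * the tc0 bytes in d24 (each tc0 value covers two chroma pixels) and
 * applied to p0/q0 only.  Rows on entry: d18 = p1, d16 = p0, d0 = q0,
 * d2 = q1; the filtered p0/q0 are left in d16/d0. */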
  513. .macro h264_loop_filter_chroma
  514. vdup.8 d22, r2 @ alpha
  515. vmovl.u8 q12, d24
  516. vabd.u8 d26, d16, d0 @ abs(p0 - q0)
  517. vmovl.u8 q2, d0
  518. vabd.u8 d28, d18, d16 @ abs(p1 - p0)
  519. vsubw.u8 q2, q2, d16
  520. vsli.16 d24, d24, #8
  521. vshl.i16 q2, q2, #2
  522. vabd.u8 d30, d2, d0 @ abs(q1 - q0)
  523. vaddw.u8 q2, q2, d18
  524. vclt.u8 d26, d26, d22 @ < alpha
  525. vsubw.u8 q2, q2, d2
  526. vdup.8 d22, r3 @ beta
  527. vrshrn.i16 d4, q2, #3
  528. vclt.u8 d28, d28, d22 @ < beta
  529. vclt.u8 d30, d30, d22 @ < beta
  530. vmin.s8 d4, d4, d24
  531. vneg.s8 d25, d24
  532. vand d26, d26, d28
  533. vmax.s8 d4, d4, d25
  534. vand d26, d26, d30
  535. vmovl.u8 q11, d0
  536. vand d4, d4, d26
  537. vmovl.u8 q14, d16
  538. vaddw.s8 q14, q14, d4
  539. vsubw.s8 q11, q11, d4
  540. vqmovun.s16 d16, q14
  541. vqmovun.s16 d0, q11
  542. .endm
  543. function ff_h264_v_loop_filter_chroma_neon, export=1
  544. h264_loop_filter_start
  545. sub r0, r0, r1, lsl #1
  546. vld1.64 {d18}, [r0,:64], r1
  547. vld1.64 {d16}, [r0,:64], r1
  548. vld1.64 {d0}, [r0,:64], r1
  549. vld1.64 {d2}, [r0,:64]
  550. h264_loop_filter_chroma
  551. sub r0, r0, r1, lsl #1
  552. vst1.64 {d16}, [r0,:64], r1
  553. vst1.64 {d0}, [r0,:64], r1
  554. bx lr
  555. endfunc
  556. function ff_h264_h_loop_filter_chroma_neon, export=1
  557. h264_loop_filter_start
  558. sub r0, r0, #2
  559. vld1.32 {d18[0]}, [r0], r1
  560. vld1.32 {d16[0]}, [r0], r1
  561. vld1.32 {d0[0]}, [r0], r1
  562. vld1.32 {d2[0]}, [r0], r1
  563. vld1.32 {d18[1]}, [r0], r1
  564. vld1.32 {d16[1]}, [r0], r1
  565. vld1.32 {d0[1]}, [r0], r1
  566. vld1.32 {d2[1]}, [r0], r1
  567. vtrn.16 d18, d0
  568. vtrn.16 d16, d2
  569. vtrn.8 d18, d16
  570. vtrn.8 d0, d2
  571. h264_loop_filter_chroma
  572. vtrn.16 d18, d0
  573. vtrn.16 d16, d2
  574. vtrn.8 d18, d16
  575. vtrn.8 d0, d2
  576. sub r0, r0, r1, lsl #3
  577. vst1.32 {d18[0]}, [r0], r1
  578. vst1.32 {d16[0]}, [r0], r1
  579. vst1.32 {d0[0]}, [r0], r1
  580. vst1.32 {d2[0]}, [r0], r1
  581. vst1.32 {d18[1]}, [r0], r1
  582. vst1.32 {d16[1]}, [r0], r1
  583. vst1.32 {d0[1]}, [r0], r1
  584. vst1.32 {d2[1]}, [r0], r1
  585. bx lr
  586. endfunc
  587. /* H.264 qpel MC */
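/* Luma half-pel samples use the 6-tap filter (1, -5, 20, 20, -5, 1).
 * lowpass_const loads the constants 5 and 20 into d6[0]/d6[1]; lowpass_8
 * filters two 8-pixel rows at a time, computing per output pixel
 *   (src[-2] - 5*src[-1] + 20*src[0] + 20*src[1] - 5*src[2] + src[3] + 16) >> 5
 * saturated to 8 bits, or leaving the raw 16-bit sums when narrow=0.  The
 * vertical variants transpose the block so the same horizontal code can be
 * reused, and lowpass_8.16 applies the second, 16-bit pass of the 2-D (hv)
 * case with a final (... + 512) >> 10 rounding. */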
  588. .macro lowpass_const r
  589. movw \r, #5
  590. movt \r, #20
  591. vmov.32 d6[0], \r
  592. .endm
  593. .macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
  594. .if \narrow
  595. t0 .req q0
  596. t1 .req q8
  597. .else
  598. t0 .req \d0
  599. t1 .req \d1
  600. .endif
  601. vext.8 d2, \r0, \r1, #2
  602. vext.8 d3, \r0, \r1, #3
  603. vaddl.u8 q1, d2, d3
  604. vext.8 d4, \r0, \r1, #1
  605. vext.8 d5, \r0, \r1, #4
  606. vaddl.u8 q2, d4, d5
  607. vext.8 d30, \r0, \r1, #5
  608. vaddl.u8 t0, \r0, d30
  609. vext.8 d18, \r2, \r3, #2
  610. vmla.i16 t0, q1, d6[1]
  611. vext.8 d19, \r2, \r3, #3
  612. vaddl.u8 q9, d18, d19
  613. vext.8 d20, \r2, \r3, #1
  614. vmls.i16 t0, q2, d6[0]
  615. vext.8 d21, \r2, \r3, #4
  616. vaddl.u8 q10, d20, d21
  617. vext.8 d31, \r2, \r3, #5
  618. vaddl.u8 t1, \r2, d31
  619. vmla.i16 t1, q9, d6[1]
  620. vmls.i16 t1, q10, d6[0]
  621. .if \narrow
  622. vqrshrun.s16 \d0, t0, #5
  623. vqrshrun.s16 \d1, t1, #5
  624. .endif
  625. .unreq t0
  626. .unreq t1
  627. .endm
  628. .macro lowpass_8_1 r0, r1, d0, narrow=1
  629. .if \narrow
  630. t0 .req q0
  631. .else
  632. t0 .req \d0
  633. .endif
  634. vext.8 d2, \r0, \r1, #2
  635. vext.8 d3, \r0, \r1, #3
  636. vaddl.u8 q1, d2, d3
  637. vext.8 d4, \r0, \r1, #1
  638. vext.8 d5, \r0, \r1, #4
  639. vaddl.u8 q2, d4, d5
  640. vext.8 d30, \r0, \r1, #5
  641. vaddl.u8 t0, \r0, d30
  642. vmla.i16 t0, q1, d6[1]
  643. vmls.i16 t0, q2, d6[0]
  644. .if \narrow
  645. vqrshrun.s16 \d0, t0, #5
  646. .endif
  647. .unreq t0
  648. .endm
  649. .macro lowpass_8.16 r0, r1, l0, h0, l1, h1, d
  650. vext.16 q1, \r0, \r1, #2
  651. vext.16 q0, \r0, \r1, #3
  652. vaddl.s16 q9, d2, d0
  653. vext.16 q2, \r0, \r1, #1
  654. vaddl.s16 q1, d3, d1
  655. vext.16 q3, \r0, \r1, #4
  656. vaddl.s16 q10, d4, d6
  657. vext.16 \r1, \r0, \r1, #5
  658. vaddl.s16 q2, d5, d7
  659. vaddl.s16 q0, \h0, \h1
  660. vaddl.s16 q8, \l0, \l1
  661. vshl.i32 q3, q9, #4
  662. vshl.i32 q9, q9, #2
  663. vshl.i32 q15, q10, #2
  664. vadd.i32 q9, q9, q3
  665. vadd.i32 q10, q10, q15
  666. vshl.i32 q3, q1, #4
  667. vshl.i32 q1, q1, #2
  668. vshl.i32 q15, q2, #2
  669. vadd.i32 q1, q1, q3
  670. vadd.i32 q2, q2, q15
  671. vadd.i32 q9, q9, q8
  672. vsub.i32 q9, q9, q10
  673. vadd.i32 q1, q1, q0
  674. vsub.i32 q1, q1, q2
  675. vrshrn.s32 d18, q9, #10
  676. vrshrn.s32 d19, q1, #10
  677. vqmovun.s16 \d, q9
  678. .endm
  679. function put_h264_qpel16_h_lowpass_neon_packed
  680. mov r4, lr
  681. mov ip, #16
  682. mov r3, #8
  683. bl put_h264_qpel8_h_lowpass_neon
  684. sub r1, r1, r2, lsl #4
  685. add r1, r1, #8
  686. mov ip, #16
  687. mov lr, r4
  688. b put_h264_qpel8_h_lowpass_neon
  689. endfunc
  690. .macro h264_qpel_h_lowpass type
  691. function \type\()_h264_qpel16_h_lowpass_neon
  692. push {lr}
  693. mov ip, #16
  694. bl \type\()_h264_qpel8_h_lowpass_neon
  695. sub r0, r0, r3, lsl #4
  696. sub r1, r1, r2, lsl #4
  697. add r0, r0, #8
  698. add r1, r1, #8
  699. mov ip, #16
  700. pop {lr}
  701. endfunc
  702. function \type\()_h264_qpel8_h_lowpass_neon
  703. 1: vld1.64 {d0, d1}, [r1], r2
  704. vld1.64 {d16,d17}, [r1], r2
  705. subs ip, ip, #2
  706. lowpass_8 d0, d1, d16, d17, d0, d16
  707. .ifc \type,avg
  708. vld1.8 {d2}, [r0,:64], r3
  709. vrhadd.u8 d0, d0, d2
  710. vld1.8 {d3}, [r0,:64]
  711. vrhadd.u8 d16, d16, d3
  712. sub r0, r0, r3
  713. .endif
  714. vst1.64 {d0}, [r0,:64], r3
  715. vst1.64 {d16}, [r0,:64], r3
  716. bne 1b
  717. bx lr
  718. endfunc
  719. .endm
  720. h264_qpel_h_lowpass put
  721. h264_qpel_h_lowpass avg
  722. .macro h264_qpel_h_lowpass_l2 type
  723. function \type\()_h264_qpel16_h_lowpass_l2_neon
  724. push {lr}
  725. mov ip, #16
  726. bl \type\()_h264_qpel8_h_lowpass_l2_neon
  727. sub r0, r0, r2, lsl #4
  728. sub r1, r1, r2, lsl #4
  729. sub r3, r3, r2, lsl #4
  730. add r0, r0, #8
  731. add r1, r1, #8
  732. add r3, r3, #8
  733. mov ip, #16
  734. pop {lr}
  735. endfunc
  736. function \type\()_h264_qpel8_h_lowpass_l2_neon
  737. 1: vld1.64 {d0, d1}, [r1], r2
  738. vld1.64 {d16,d17}, [r1], r2
  739. vld1.64 {d28}, [r3], r2
  740. vld1.64 {d29}, [r3], r2
  741. subs ip, ip, #2
  742. lowpass_8 d0, d1, d16, d17, d0, d1
  743. vrhadd.u8 q0, q0, q14
  744. .ifc \type,avg
  745. vld1.8 {d2}, [r0,:64], r2
  746. vrhadd.u8 d0, d0, d2
  747. vld1.8 {d3}, [r0,:64]
  748. vrhadd.u8 d1, d1, d3
  749. sub r0, r0, r2
  750. .endif
  751. vst1.64 {d0}, [r0,:64], r2
  752. vst1.64 {d1}, [r0,:64], r2
  753. bne 1b
  754. bx lr
  755. endfunc
  756. .endm
  757. h264_qpel_h_lowpass_l2 put
  758. h264_qpel_h_lowpass_l2 avg
  759. function put_h264_qpel16_v_lowpass_neon_packed
  760. mov r4, lr
  761. mov r2, #8
  762. bl put_h264_qpel8_v_lowpass_neon
  763. sub r1, r1, r3, lsl #2
  764. bl put_h264_qpel8_v_lowpass_neon
  765. sub r1, r1, r3, lsl #4
  766. sub r1, r1, r3, lsl #2
  767. add r1, r1, #8
  768. bl put_h264_qpel8_v_lowpass_neon
  769. sub r1, r1, r3, lsl #2
  770. mov lr, r4
  771. b put_h264_qpel8_v_lowpass_neon
  772. endfunc
  773. .macro h264_qpel_v_lowpass type
  774. function \type\()_h264_qpel16_v_lowpass_neon
  775. mov r4, lr
  776. bl \type\()_h264_qpel8_v_lowpass_neon
  777. sub r1, r1, r3, lsl #2
  778. bl \type\()_h264_qpel8_v_lowpass_neon
  779. sub r0, r0, r2, lsl #4
  780. add r0, r0, #8
  781. sub r1, r1, r3, lsl #4
  782. sub r1, r1, r3, lsl #2
  783. add r1, r1, #8
  784. bl \type\()_h264_qpel8_v_lowpass_neon
  785. sub r1, r1, r3, lsl #2
  786. mov lr, r4
  787. endfunc
  788. function \type\()_h264_qpel8_v_lowpass_neon
  789. vld1.64 {d8}, [r1], r3
  790. vld1.64 {d10}, [r1], r3
  791. vld1.64 {d12}, [r1], r3
  792. vld1.64 {d14}, [r1], r3
  793. vld1.64 {d22}, [r1], r3
  794. vld1.64 {d24}, [r1], r3
  795. vld1.64 {d26}, [r1], r3
  796. vld1.64 {d28}, [r1], r3
  797. vld1.64 {d9}, [r1], r3
  798. vld1.64 {d11}, [r1], r3
  799. vld1.64 {d13}, [r1], r3
  800. vld1.64 {d15}, [r1], r3
  801. vld1.64 {d23}, [r1]
  802. transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
  803. lowpass_8 d8, d9, d10, d11, d8, d10
  804. lowpass_8 d12, d13, d14, d15, d12, d14
  805. lowpass_8 d22, d23, d24, d25, d22, d24
  806. lowpass_8 d26, d27, d28, d29, d26, d28
  807. transpose_8x8 d8, d10, d12, d14, d22, d24, d26, d28
  808. .ifc \type,avg
  809. vld1.8 {d9}, [r0,:64], r2
  810. vrhadd.u8 d8, d8, d9
  811. vld1.8 {d11}, [r0,:64], r2
  812. vrhadd.u8 d10, d10, d11
  813. vld1.8 {d13}, [r0,:64], r2
  814. vrhadd.u8 d12, d12, d13
  815. vld1.8 {d15}, [r0,:64], r2
  816. vrhadd.u8 d14, d14, d15
  817. vld1.8 {d23}, [r0,:64], r2
  818. vrhadd.u8 d22, d22, d23
  819. vld1.8 {d25}, [r0,:64], r2
  820. vrhadd.u8 d24, d24, d25
  821. vld1.8 {d27}, [r0,:64], r2
  822. vrhadd.u8 d26, d26, d27
  823. vld1.8 {d29}, [r0,:64], r2
  824. vrhadd.u8 d28, d28, d29
  825. sub r0, r0, r2, lsl #3
  826. .endif
  827. vst1.64 {d8}, [r0,:64], r2
  828. vst1.64 {d10}, [r0,:64], r2
  829. vst1.64 {d12}, [r0,:64], r2
  830. vst1.64 {d14}, [r0,:64], r2
  831. vst1.64 {d22}, [r0,:64], r2
  832. vst1.64 {d24}, [r0,:64], r2
  833. vst1.64 {d26}, [r0,:64], r2
  834. vst1.64 {d28}, [r0,:64], r2
  835. bx lr
  836. endfunc
  837. .endm
  838. h264_qpel_v_lowpass put
  839. h264_qpel_v_lowpass avg
  840. .macro h264_qpel_v_lowpass_l2 type
  841. function \type\()_h264_qpel16_v_lowpass_l2_neon
  842. mov r4, lr
  843. bl \type\()_h264_qpel8_v_lowpass_l2_neon
  844. sub r1, r1, r3, lsl #2
  845. bl \type\()_h264_qpel8_v_lowpass_l2_neon
  846. sub r0, r0, r3, lsl #4
  847. sub ip, ip, r2, lsl #4
  848. add r0, r0, #8
  849. add ip, ip, #8
  850. sub r1, r1, r3, lsl #4
  851. sub r1, r1, r3, lsl #2
  852. add r1, r1, #8
  853. bl \type\()_h264_qpel8_v_lowpass_l2_neon
  854. sub r1, r1, r3, lsl #2
  855. mov lr, r4
  856. endfunc
  857. function \type\()_h264_qpel8_v_lowpass_l2_neon
  858. vld1.64 {d8}, [r1], r3
  859. vld1.64 {d10}, [r1], r3
  860. vld1.64 {d12}, [r1], r3
  861. vld1.64 {d14}, [r1], r3
  862. vld1.64 {d22}, [r1], r3
  863. vld1.64 {d24}, [r1], r3
  864. vld1.64 {d26}, [r1], r3
  865. vld1.64 {d28}, [r1], r3
  866. vld1.64 {d9}, [r1], r3
  867. vld1.64 {d11}, [r1], r3
  868. vld1.64 {d13}, [r1], r3
  869. vld1.64 {d15}, [r1], r3
  870. vld1.64 {d23}, [r1]
  871. transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
  872. lowpass_8 d8, d9, d10, d11, d8, d9
  873. lowpass_8 d12, d13, d14, d15, d12, d13
  874. lowpass_8 d22, d23, d24, d25, d22, d23
  875. lowpass_8 d26, d27, d28, d29, d26, d27
  876. transpose_8x8 d8, d9, d12, d13, d22, d23, d26, d27
  877. vld1.64 {d0}, [ip], r2
  878. vld1.64 {d1}, [ip], r2
  879. vld1.64 {d2}, [ip], r2
  880. vld1.64 {d3}, [ip], r2
  881. vld1.64 {d4}, [ip], r2
  882. vrhadd.u8 q0, q0, q4
  883. vld1.64 {d5}, [ip], r2
  884. vrhadd.u8 q1, q1, q6
  885. vld1.64 {d10}, [ip], r2
  886. vrhadd.u8 q2, q2, q11
  887. vld1.64 {d11}, [ip], r2
  888. vrhadd.u8 q5, q5, q13
  889. .ifc \type,avg
  890. vld1.8 {d16}, [r0,:64], r3
  891. vrhadd.u8 d0, d0, d16
  892. vld1.8 {d17}, [r0,:64], r3
  893. vrhadd.u8 d1, d1, d17
  894. vld1.8 {d16}, [r0,:64], r3
  895. vrhadd.u8 d2, d2, d16
  896. vld1.8 {d17}, [r0,:64], r3
  897. vrhadd.u8 d3, d3, d17
  898. vld1.8 {d16}, [r0,:64], r3
  899. vrhadd.u8 d4, d4, d16
  900. vld1.8 {d17}, [r0,:64], r3
  901. vrhadd.u8 d5, d5, d17
  902. vld1.8 {d16}, [r0,:64], r3
  903. vrhadd.u8 d10, d10, d16
  904. vld1.8 {d17}, [r0,:64], r3
  905. vrhadd.u8 d11, d11, d17
  906. sub r0, r0, r3, lsl #3
  907. .endif
  908. vst1.64 {d0}, [r0,:64], r3
  909. vst1.64 {d1}, [r0,:64], r3
  910. vst1.64 {d2}, [r0,:64], r3
  911. vst1.64 {d3}, [r0,:64], r3
  912. vst1.64 {d4}, [r0,:64], r3
  913. vst1.64 {d5}, [r0,:64], r3
  914. vst1.64 {d10}, [r0,:64], r3
  915. vst1.64 {d11}, [r0,:64], r3
  916. bx lr
  917. endfunc
  918. .endm
  919. h264_qpel_v_lowpass_l2 put
  920. h264_qpel_v_lowpass_l2 avg
  921. function put_h264_qpel8_hv_lowpass_neon_top
  922. lowpass_const ip
  923. mov ip, #12
  924. 1: vld1.64 {d0, d1}, [r1], r3
  925. vld1.64 {d16,d17}, [r1], r3
  926. subs ip, ip, #2
  927. lowpass_8 d0, d1, d16, d17, q11, q12, narrow=0
  928. vst1.64 {d22-d25}, [r4,:128]!
  929. bne 1b
  930. vld1.64 {d0, d1}, [r1]
  931. lowpass_8_1 d0, d1, q12, narrow=0
  932. mov ip, #-16
  933. add r4, r4, ip
  934. vld1.64 {d30,d31}, [r4,:128], ip
  935. vld1.64 {d20,d21}, [r4,:128], ip
  936. vld1.64 {d18,d19}, [r4,:128], ip
  937. vld1.64 {d16,d17}, [r4,:128], ip
  938. vld1.64 {d14,d15}, [r4,:128], ip
  939. vld1.64 {d12,d13}, [r4,:128], ip
  940. vld1.64 {d10,d11}, [r4,:128], ip
  941. vld1.64 {d8, d9}, [r4,:128], ip
  942. vld1.64 {d6, d7}, [r4,:128], ip
  943. vld1.64 {d4, d5}, [r4,:128], ip
  944. vld1.64 {d2, d3}, [r4,:128], ip
  945. vld1.64 {d0, d1}, [r4,:128]
  946. swap4 d1, d3, d5, d7, d8, d10, d12, d14
  947. transpose16_4x4 q0, q1, q2, q3, q4, q5, q6, q7
  948. swap4 d17, d19, d21, d31, d24, d26, d28, d22
  949. transpose16_4x4 q8, q9, q10, q15, q12, q13, q14, q11
  950. vst1.64 {d30,d31}, [r4,:128]!
  951. vst1.64 {d6, d7}, [r4,:128]!
  952. vst1.64 {d20,d21}, [r4,:128]!
  953. vst1.64 {d4, d5}, [r4,:128]!
  954. vst1.64 {d18,d19}, [r4,:128]!
  955. vst1.64 {d2, d3}, [r4,:128]!
  956. vst1.64 {d16,d17}, [r4,:128]!
  957. vst1.64 {d0, d1}, [r4,:128]
  958. lowpass_8.16 q4, q12, d8, d9, d24, d25, d8
  959. lowpass_8.16 q5, q13, d10, d11, d26, d27, d9
  960. lowpass_8.16 q6, q14, d12, d13, d28, d29, d10
  961. lowpass_8.16 q7, q11, d14, d15, d22, d23, d11
  962. vld1.64 {d16,d17}, [r4,:128], ip
  963. vld1.64 {d30,d31}, [r4,:128], ip
  964. lowpass_8.16 q8, q15, d16, d17, d30, d31, d12
  965. vld1.64 {d16,d17}, [r4,:128], ip
  966. vld1.64 {d30,d31}, [r4,:128], ip
  967. lowpass_8.16 q8, q15, d16, d17, d30, d31, d13
  968. vld1.64 {d16,d17}, [r4,:128], ip
  969. vld1.64 {d30,d31}, [r4,:128], ip
  970. lowpass_8.16 q8, q15, d16, d17, d30, d31, d14
  971. vld1.64 {d16,d17}, [r4,:128], ip
  972. vld1.64 {d30,d31}, [r4,:128]
  973. lowpass_8.16 q8, q15, d16, d17, d30, d31, d15
  974. transpose_8x8 d12, d13, d14, d15, d8, d9, d10, d11
  975. bx lr
  976. endfunc
  977. .macro h264_qpel8_hv_lowpass type
  978. function \type\()_h264_qpel8_hv_lowpass_neon
  979. mov r10, lr
  980. bl put_h264_qpel8_hv_lowpass_neon_top
  981. .ifc \type,avg
  982. vld1.8 {d0}, [r0,:64], r2
  983. vrhadd.u8 d12, d12, d0
  984. vld1.8 {d1}, [r0,:64], r2
  985. vrhadd.u8 d13, d13, d1
  986. vld1.8 {d2}, [r0,:64], r2
  987. vrhadd.u8 d14, d14, d2
  988. vld1.8 {d3}, [r0,:64], r2
  989. vrhadd.u8 d15, d15, d3
  990. vld1.8 {d4}, [r0,:64], r2
  991. vrhadd.u8 d8, d8, d4
  992. vld1.8 {d5}, [r0,:64], r2
  993. vrhadd.u8 d9, d9, d5
  994. vld1.8 {d6}, [r0,:64], r2
  995. vrhadd.u8 d10, d10, d6
  996. vld1.8 {d7}, [r0,:64], r2
  997. vrhadd.u8 d11, d11, d7
  998. sub r0, r0, r2, lsl #3
  999. .endif
  1000. vst1.64 {d12}, [r0,:64], r2
  1001. vst1.64 {d13}, [r0,:64], r2
  1002. vst1.64 {d14}, [r0,:64], r2
  1003. vst1.64 {d15}, [r0,:64], r2
  1004. vst1.64 {d8}, [r0,:64], r2
  1005. vst1.64 {d9}, [r0,:64], r2
  1006. vst1.64 {d10}, [r0,:64], r2
  1007. vst1.64 {d11}, [r0,:64], r2
  1008. mov lr, r10
  1009. bx lr
  1010. endfunc
  1011. .endm
  1012. h264_qpel8_hv_lowpass put
  1013. h264_qpel8_hv_lowpass avg
  1014. .macro h264_qpel8_hv_lowpass_l2 type
  1015. function \type\()_h264_qpel8_hv_lowpass_l2_neon
  1016. mov r10, lr
  1017. bl put_h264_qpel8_hv_lowpass_neon_top
  1018. vld1.64 {d0, d1}, [r2,:128]!
  1019. vld1.64 {d2, d3}, [r2,:128]!
  1020. vrhadd.u8 q0, q0, q6
  1021. vld1.64 {d4, d5}, [r2,:128]!
  1022. vrhadd.u8 q1, q1, q7
  1023. vld1.64 {d6, d7}, [r2,:128]!
  1024. vrhadd.u8 q2, q2, q4
  1025. vrhadd.u8 q3, q3, q5
  1026. .ifc \type,avg
  1027. vld1.8 {d16}, [r0,:64], r3
  1028. vrhadd.u8 d0, d0, d16
  1029. vld1.8 {d17}, [r0,:64], r3
  1030. vrhadd.u8 d1, d1, d17
  1031. vld1.8 {d18}, [r0,:64], r3
  1032. vrhadd.u8 d2, d2, d18
  1033. vld1.8 {d19}, [r0,:64], r3
  1034. vrhadd.u8 d3, d3, d19
  1035. vld1.8 {d20}, [r0,:64], r3
  1036. vrhadd.u8 d4, d4, d20
  1037. vld1.8 {d21}, [r0,:64], r3
  1038. vrhadd.u8 d5, d5, d21
  1039. vld1.8 {d22}, [r0,:64], r3
  1040. vrhadd.u8 d6, d6, d22
  1041. vld1.8 {d23}, [r0,:64], r3
  1042. vrhadd.u8 d7, d7, d23
  1043. sub r0, r0, r3, lsl #3
  1044. .endif
  1045. vst1.64 {d0}, [r0,:64], r3
  1046. vst1.64 {d1}, [r0,:64], r3
  1047. vst1.64 {d2}, [r0,:64], r3
  1048. vst1.64 {d3}, [r0,:64], r3
  1049. vst1.64 {d4}, [r0,:64], r3
  1050. vst1.64 {d5}, [r0,:64], r3
  1051. vst1.64 {d6}, [r0,:64], r3
  1052. vst1.64 {d7}, [r0,:64], r3
  1053. mov lr, r10
  1054. bx lr
  1055. endfunc
  1056. .endm
  1057. h264_qpel8_hv_lowpass_l2 put
  1058. h264_qpel8_hv_lowpass_l2 avg
  1059. .macro h264_qpel16_hv type
  1060. function \type\()_h264_qpel16_hv_lowpass_neon
  1061. mov r9, lr
  1062. bl \type\()_h264_qpel8_hv_lowpass_neon
  1063. sub r1, r1, r3, lsl #2
  1064. bl \type\()_h264_qpel8_hv_lowpass_neon
  1065. sub r1, r1, r3, lsl #4
  1066. sub r1, r1, r3, lsl #2
  1067. add r1, r1, #8
  1068. sub r0, r0, r2, lsl #4
  1069. add r0, r0, #8
  1070. bl \type\()_h264_qpel8_hv_lowpass_neon
  1071. sub r1, r1, r3, lsl #2
  1072. mov lr, r9
  1073. b \type\()_h264_qpel8_hv_lowpass_neon
  1074. endfunc
  1075. function \type\()_h264_qpel16_hv_lowpass_l2_neon
  1076. mov r9, lr
  1077. sub r2, r4, #256
  1078. bl \type\()_h264_qpel8_hv_lowpass_l2_neon
  1079. sub r1, r1, r3, lsl #2
  1080. bl \type\()_h264_qpel8_hv_lowpass_l2_neon
  1081. sub r1, r1, r3, lsl #4
  1082. sub r1, r1, r3, lsl #2
  1083. add r1, r1, #8
  1084. sub r0, r0, r3, lsl #4
  1085. add r0, r0, #8
  1086. bl \type\()_h264_qpel8_hv_lowpass_l2_neon
  1087. sub r1, r1, r3, lsl #2
  1088. mov lr, r9
  1089. b \type\()_h264_qpel8_hv_lowpass_l2_neon
  1090. endfunc
  1091. .endm
  1092. h264_qpel16_hv put
  1093. h264_qpel16_hv avg
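/* Quarter-pel entry points.  ff_{put,avg}_h264_qpel{8,16}_mcXY_neon produces
 * the prediction for quarter-pel offset (X,Y), where X and Y are the
 * horizontal and vertical offsets in quarter pels: mc20/mc02 are the pure
 * horizontal/vertical half-pel filters, mc22 the 2-D one, and the remaining
 * positions are formed by averaging two of those intermediates via the *_l2
 * helpers, e.g. mc10 averages the horizontal half-pel result with the
 * adjacent full-pel pixels. */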
  1094. .macro h264_qpel8 type
  1095. function ff_\type\()_h264_qpel8_mc10_neon, export=1
  1096. lowpass_const r3
  1097. mov r3, r1
  1098. sub r1, r1, #2
  1099. mov ip, #8
  1100. b \type\()_h264_qpel8_h_lowpass_l2_neon
  1101. endfunc
  1102. function ff_\type\()_h264_qpel8_mc20_neon, export=1
  1103. lowpass_const r3
  1104. sub r1, r1, #2
  1105. mov r3, r2
  1106. mov ip, #8
  1107. b \type\()_h264_qpel8_h_lowpass_neon
  1108. endfunc
  1109. function ff_\type\()_h264_qpel8_mc30_neon, export=1
  1110. lowpass_const r3
  1111. add r3, r1, #1
  1112. sub r1, r1, #2
  1113. mov ip, #8
  1114. b \type\()_h264_qpel8_h_lowpass_l2_neon
  1115. endfunc
  1116. function ff_\type\()_h264_qpel8_mc01_neon, export=1
  1117. push {lr}
  1118. mov ip, r1
  1119. \type\()_h264_qpel8_mc01:
  1120. lowpass_const r3
  1121. mov r3, r2
  1122. sub r1, r1, r2, lsl #1
  1123. vpush {d8-d15}
  1124. bl \type\()_h264_qpel8_v_lowpass_l2_neon
  1125. vpop {d8-d15}
  1126. pop {pc}
  1127. endfunc
  1128. function ff_\type\()_h264_qpel8_mc11_neon, export=1
  1129. push {r0, r1, r11, lr}
  1130. \type\()_h264_qpel8_mc11:
  1131. lowpass_const r3
  1132. mov r11, sp
  1133. bic sp, sp, #15
  1134. sub sp, sp, #64
  1135. mov r0, sp
  1136. sub r1, r1, #2
  1137. mov r3, #8
  1138. mov ip, #8
  1139. vpush {d8-d15}
  1140. bl put_h264_qpel8_h_lowpass_neon
  1141. ldrd r0, [r11]
  1142. mov r3, r2
  1143. add ip, sp, #64
  1144. sub r1, r1, r2, lsl #1
  1145. mov r2, #8
  1146. bl \type\()_h264_qpel8_v_lowpass_l2_neon
  1147. vpop {d8-d15}
  1148. add sp, r11, #8
  1149. pop {r11, pc}
  1150. endfunc
  1151. function ff_\type\()_h264_qpel8_mc21_neon, export=1
  1152. push {r0, r1, r4, r10, r11, lr}
  1153. \type\()_h264_qpel8_mc21:
  1154. lowpass_const r3
  1155. mov r11, sp
  1156. bic sp, sp, #15
  1157. sub sp, sp, #(8*8+16*12)
  1158. sub r1, r1, #2
  1159. mov r3, #8
  1160. mov r0, sp
  1161. mov ip, #8
  1162. vpush {d8-d15}
  1163. bl put_h264_qpel8_h_lowpass_neon
  1164. mov r4, r0
  1165. ldrd r0, [r11]
  1166. sub r1, r1, r2, lsl #1
  1167. sub r1, r1, #2
  1168. mov r3, r2
  1169. sub r2, r4, #64
  1170. bl \type\()_h264_qpel8_hv_lowpass_l2_neon
  1171. vpop {d8-d15}
  1172. add sp, r11, #8
  1173. pop {r4, r10, r11, pc}
  1174. endfunc
  1175. function ff_\type\()_h264_qpel8_mc31_neon, export=1
  1176. add r1, r1, #1
  1177. push {r0, r1, r11, lr}
  1178. sub r1, r1, #1
  1179. b \type\()_h264_qpel8_mc11
  1180. endfunc
  1181. function ff_\type\()_h264_qpel8_mc02_neon, export=1
  1182. push {lr}
  1183. lowpass_const r3
  1184. sub r1, r1, r2, lsl #1
  1185. mov r3, r2
  1186. vpush {d8-d15}
  1187. bl \type\()_h264_qpel8_v_lowpass_neon
  1188. vpop {d8-d15}
  1189. pop {pc}
  1190. endfunc
  1191. function ff_\type\()_h264_qpel8_mc12_neon, export=1
  1192. push {r0, r1, r4, r10, r11, lr}
  1193. \type\()_h264_qpel8_mc12:
  1194. lowpass_const r3
  1195. mov r11, sp
  1196. bic sp, sp, #15
  1197. sub sp, sp, #(8*8+16*12)
  1198. sub r1, r1, r2, lsl #1
  1199. mov r3, r2
  1200. mov r2, #8
  1201. mov r0, sp
  1202. vpush {d8-d15}
  1203. bl put_h264_qpel8_v_lowpass_neon
  1204. mov r4, r0
  1205. ldrd r0, [r11]
  1206. sub r1, r1, r3, lsl #1
  1207. sub r1, r1, #2
  1208. sub r2, r4, #64
  1209. bl \type\()_h264_qpel8_hv_lowpass_l2_neon
  1210. vpop {d8-d15}
  1211. add sp, r11, #8
  1212. pop {r4, r10, r11, pc}
  1213. endfunc
  1214. function ff_\type\()_h264_qpel8_mc22_neon, export=1
  1215. push {r4, r10, r11, lr}
  1216. mov r11, sp
  1217. bic sp, sp, #15
  1218. sub r1, r1, r2, lsl #1
  1219. sub r1, r1, #2
  1220. mov r3, r2
  1221. sub sp, sp, #(16*12)
  1222. mov r4, sp
  1223. vpush {d8-d15}
  1224. bl \type\()_h264_qpel8_hv_lowpass_neon
  1225. vpop {d8-d15}
  1226. mov sp, r11
  1227. pop {r4, r10, r11, pc}
  1228. endfunc
  1229. function ff_\type\()_h264_qpel8_mc32_neon, export=1
  1230. push {r0, r1, r4, r10, r11, lr}
  1231. add r1, r1, #1
  1232. b \type\()_h264_qpel8_mc12
  1233. endfunc
  1234. function ff_\type\()_h264_qpel8_mc03_neon, export=1
  1235. push {lr}
  1236. add ip, r1, r2
  1237. b \type\()_h264_qpel8_mc01
  1238. endfunc
  1239. function ff_\type\()_h264_qpel8_mc13_neon, export=1
  1240. push {r0, r1, r11, lr}
  1241. add r1, r1, r2
  1242. b \type\()_h264_qpel8_mc11
  1243. endfunc
  1244. function ff_\type\()_h264_qpel8_mc23_neon, export=1
  1245. push {r0, r1, r4, r10, r11, lr}
  1246. add r1, r1, r2
  1247. b \type\()_h264_qpel8_mc21
  1248. endfunc
  1249. function ff_\type\()_h264_qpel8_mc33_neon, export=1
  1250. add r1, r1, #1
  1251. push {r0, r1, r11, lr}
  1252. add r1, r1, r2
  1253. sub r1, r1, #1
  1254. b \type\()_h264_qpel8_mc11
  1255. endfunc
  1256. .endm
  1257. h264_qpel8 put
  1258. h264_qpel8 avg
  1259. .macro h264_qpel16 type
  1260. function ff_\type\()_h264_qpel16_mc10_neon, export=1
  1261. lowpass_const r3
  1262. mov r3, r1
  1263. sub r1, r1, #2
  1264. b \type\()_h264_qpel16_h_lowpass_l2_neon
  1265. endfunc
  1266. function ff_\type\()_h264_qpel16_mc20_neon, export=1
  1267. lowpass_const r3
  1268. sub r1, r1, #2
  1269. mov r3, r2
  1270. b \type\()_h264_qpel16_h_lowpass_neon
  1271. endfunc
  1272. function ff_\type\()_h264_qpel16_mc30_neon, export=1
  1273. lowpass_const r3
  1274. add r3, r1, #1
  1275. sub r1, r1, #2
  1276. b \type\()_h264_qpel16_h_lowpass_l2_neon
  1277. endfunc
  1278. function ff_\type\()_h264_qpel16_mc01_neon, export=1
  1279. push {r4, lr}
  1280. mov ip, r1
  1281. \type\()_h264_qpel16_mc01:
  1282. lowpass_const r3
  1283. mov r3, r2
  1284. sub r1, r1, r2, lsl #1
  1285. vpush {d8-d15}
  1286. bl \type\()_h264_qpel16_v_lowpass_l2_neon
  1287. vpop {d8-d15}
  1288. pop {r4, pc}
  1289. endfunc
  1290. function ff_\type\()_h264_qpel16_mc11_neon, export=1
  1291. push {r0, r1, r4, r11, lr}
  1292. \type\()_h264_qpel16_mc11:
  1293. lowpass_const r3
  1294. mov r11, sp
  1295. bic sp, sp, #15
  1296. sub sp, sp, #256
  1297. mov r0, sp
  1298. sub r1, r1, #2
  1299. mov r3, #16
  1300. vpush {d8-d15}
  1301. bl put_h264_qpel16_h_lowpass_neon
  1302. ldrd r0, [r11]
  1303. mov r3, r2
  1304. add ip, sp, #64
  1305. sub r1, r1, r2, lsl #1
  1306. mov r2, #16
  1307. bl \type\()_h264_qpel16_v_lowpass_l2_neon
  1308. vpop {d8-d15}
  1309. add sp, r11, #8
  1310. pop {r4, r11, pc}
  1311. endfunc
  1312. function ff_\type\()_h264_qpel16_mc21_neon, export=1
  1313. push {r0, r1, r4-r5, r9-r11, lr}
  1314. \type\()_h264_qpel16_mc21:
  1315. lowpass_const r3
  1316. mov r11, sp
  1317. bic sp, sp, #15
  1318. sub sp, sp, #(16*16+16*12)
  1319. sub r1, r1, #2
  1320. mov r0, sp
  1321. vpush {d8-d15}
  1322. bl put_h264_qpel16_h_lowpass_neon_packed
  1323. mov r4, r0
  1324. ldrd r0, [r11]
  1325. sub r1, r1, r2, lsl #1
  1326. sub r1, r1, #2
  1327. mov r3, r2
  1328. bl \type\()_h264_qpel16_hv_lowpass_l2_neon
  1329. vpop {d8-d15}
  1330. add sp, r11, #8
  1331. pop {r4-r5, r9-r11, pc}
  1332. endfunc
  1333. function ff_\type\()_h264_qpel16_mc31_neon, export=1
  1334. add r1, r1, #1
  1335. push {r0, r1, r4, r11, lr}
  1336. sub r1, r1, #1
  1337. b \type\()_h264_qpel16_mc11
  1338. endfunc
  1339. function ff_\type\()_h264_qpel16_mc02_neon, export=1
  1340. push {r4, lr}
  1341. lowpass_const r3
  1342. sub r1, r1, r2, lsl #1
  1343. mov r3, r2
  1344. vpush {d8-d15}
  1345. bl \type\()_h264_qpel16_v_lowpass_neon
  1346. vpop {d8-d15}
  1347. pop {r4, pc}
  1348. endfunc
  1349. function ff_\type\()_h264_qpel16_mc12_neon, export=1
  1350. push {r0, r1, r4-r5, r9-r11, lr}
  1351. \type\()_h264_qpel16_mc12:
  1352. lowpass_const r3
  1353. mov r11, sp
  1354. bic sp, sp, #15
  1355. sub sp, sp, #(16*16+16*12)
  1356. sub r1, r1, r2, lsl #1
  1357. mov r0, sp
  1358. mov r3, r2
  1359. vpush {d8-d15}
  1360. bl put_h264_qpel16_v_lowpass_neon_packed
  1361. mov r4, r0
  1362. ldrd r0, [r11]
  1363. sub r1, r1, r3, lsl #1
  1364. sub r1, r1, #2
  1365. mov r2, r3
  1366. bl \type\()_h264_qpel16_hv_lowpass_l2_neon
  1367. vpop {d8-d15}
  1368. add sp, r11, #8
  1369. pop {r4-r5, r9-r11, pc}
  1370. endfunc
  1371. function ff_\type\()_h264_qpel16_mc22_neon, export=1
  1372. push {r4, r9-r11, lr}
  1373. lowpass_const r3
  1374. mov r11, sp
  1375. bic sp, sp, #15
  1376. sub r1, r1, r2, lsl #1
  1377. sub r1, r1, #2
  1378. mov r3, r2
  1379. sub sp, sp, #(16*12)
  1380. mov r4, sp
  1381. vpush {d8-d15}
  1382. bl \type\()_h264_qpel16_hv_lowpass_neon
  1383. vpop {d8-d15}
  1384. mov sp, r11
  1385. pop {r4, r9-r11, pc}
  1386. endfunc
  1387. function ff_\type\()_h264_qpel16_mc32_neon, export=1
  1388. push {r0, r1, r4-r5, r9-r11, lr}
  1389. add r1, r1, #1
  1390. b \type\()_h264_qpel16_mc12
  1391. endfunc
  1392. function ff_\type\()_h264_qpel16_mc03_neon, export=1
  1393. push {r4, lr}
  1394. add ip, r1, r2
  1395. b \type\()_h264_qpel16_mc01
  1396. endfunc
  1397. function ff_\type\()_h264_qpel16_mc13_neon, export=1
  1398. push {r0, r1, r4, r11, lr}
  1399. add r1, r1, r2
  1400. b \type\()_h264_qpel16_mc11
  1401. endfunc
  1402. function ff_\type\()_h264_qpel16_mc23_neon, export=1
  1403. push {r0, r1, r4-r5, r9-r11, lr}
  1404. add r1, r1, r2
  1405. b \type\()_h264_qpel16_mc21
  1406. endfunc
  1407. function ff_\type\()_h264_qpel16_mc33_neon, export=1
  1408. add r1, r1, #1
  1409. push {r0, r1, r4, r11, lr}
  1410. add r1, r1, r2
  1411. sub r1, r1, #1
  1412. b \type\()_h264_qpel16_mc11
  1413. endfunc
  1414. .endm
  1415. h264_qpel16 put
  1416. h264_qpel16 avg
  1417. @ Biweighted prediction
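@ Weighted bi-prediction (explicit weighting with two references).  Judging
@ by the register usage below: r0 = dst, r1 = src, r2 = stride,
@ r3 = log2_denom, with weightd, weights and offset on the stack.  Each
@ pixel is, in effect,
@   dst[x] = clip_uint8((dst[x]*weightd + src[x]*weights
@                        + (((offset+1)|1) << log2_denom)) >> (log2_denom+1))
@ The branches at 10f/20f/30f/40f pick vmlal/vmlsl combinations for the four
@ possible sign combinations of the two weights, since the u8 multiplies
@ need non-negative factors.  The ff_biweight_h264_pixels_WxH entry points
@ below only set the row count in ip and share these per-width workers.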
  1418. .macro biweight_16 macs, macd
  1419. vdup.8 d0, r4
  1420. vdup.8 d1, r5
  1421. vmov q2, q8
  1422. vmov q3, q8
  1423. 1: subs ip, ip, #2
  1424. vld1.8 {d20-d21},[r0,:128], r2
  1425. \macd q2, d0, d20
  1426. pld [r0]
  1427. \macd q3, d0, d21
  1428. vld1.8 {d22-d23},[r1,:128], r2
  1429. \macs q2, d1, d22
  1430. pld [r1]
  1431. \macs q3, d1, d23
  1432. vmov q12, q8
  1433. vld1.8 {d28-d29},[r0,:128], r2
  1434. vmov q13, q8
  1435. \macd q12, d0, d28
  1436. pld [r0]
  1437. \macd q13, d0, d29
  1438. vld1.8 {d30-d31},[r1,:128], r2
  1439. \macs q12, d1, d30
  1440. pld [r1]
  1441. \macs q13, d1, d31
  1442. vshl.s16 q2, q2, q9
  1443. vshl.s16 q3, q3, q9
  1444. vqmovun.s16 d4, q2
  1445. vqmovun.s16 d5, q3
  1446. vshl.s16 q12, q12, q9
  1447. vshl.s16 q13, q13, q9
  1448. vqmovun.s16 d24, q12
  1449. vqmovun.s16 d25, q13
  1450. vmov q3, q8
  1451. vst1.8 {d4- d5}, [r6,:128], r2
  1452. vmov q2, q8
  1453. vst1.8 {d24-d25},[r6,:128], r2
  1454. bne 1b
  1455. pop {r4-r6, pc}
  1456. .endm
  1457. .macro biweight_8 macs, macd
  1458. vdup.8 d0, r4
  1459. vdup.8 d1, r5
  1460. vmov q1, q8
  1461. vmov q10, q8
  1462. 1: subs ip, ip, #2
  1463. vld1.8 {d4},[r0,:64], r2
  1464. \macd q1, d0, d4
  1465. pld [r0]
  1466. vld1.8 {d5},[r1,:64], r2
  1467. \macs q1, d1, d5
  1468. pld [r1]
  1469. vld1.8 {d6},[r0,:64], r2
  1470. \macd q10, d0, d6
  1471. pld [r0]
  1472. vld1.8 {d7},[r1,:64], r2
  1473. \macs q10, d1, d7
  1474. pld [r1]
  1475. vshl.s16 q1, q1, q9
  1476. vqmovun.s16 d2, q1
  1477. vshl.s16 q10, q10, q9
  1478. vqmovun.s16 d4, q10
  1479. vmov q10, q8
  1480. vst1.8 {d2},[r6,:64], r2
  1481. vmov q1, q8
  1482. vst1.8 {d4},[r6,:64], r2
  1483. bne 1b
  1484. pop {r4-r6, pc}
  1485. .endm
  1486. .macro biweight_4 macs, macd
  1487. vdup.8 d0, r4
  1488. vdup.8 d1, r5
  1489. vmov q1, q8
  1490. vmov q10, q8
  1491. 1: subs ip, ip, #4
  1492. vld1.32 {d4[0]},[r0,:32], r2
  1493. vld1.32 {d4[1]},[r0,:32], r2
  1494. \macd q1, d0, d4
  1495. pld [r0]
  1496. vld1.32 {d5[0]},[r1,:32], r2
  1497. vld1.32 {d5[1]},[r1,:32], r2
  1498. \macs q1, d1, d5
  1499. pld [r1]
  1500. blt 2f
  1501. vld1.32 {d6[0]},[r0,:32], r2
  1502. vld1.32 {d6[1]},[r0,:32], r2
  1503. \macd q10, d0, d6
  1504. pld [r0]
  1505. vld1.32 {d7[0]},[r1,:32], r2
  1506. vld1.32 {d7[1]},[r1,:32], r2
  1507. \macs q10, d1, d7
  1508. pld [r1]
  1509. vshl.s16 q1, q1, q9
  1510. vqmovun.s16 d2, q1
  1511. vshl.s16 q10, q10, q9
  1512. vqmovun.s16 d4, q10
  1513. vmov q10, q8
  1514. vst1.32 {d2[0]},[r6,:32], r2
  1515. vst1.32 {d2[1]},[r6,:32], r2
  1516. vmov q1, q8
  1517. vst1.32 {d4[0]},[r6,:32], r2
  1518. vst1.32 {d4[1]},[r6,:32], r2
  1519. bne 1b
  1520. pop {r4-r6, pc}
  1521. 2: vshl.s16 q1, q1, q9
  1522. vqmovun.s16 d2, q1
  1523. vst1.32 {d2[0]},[r6,:32], r2
  1524. vst1.32 {d2[1]},[r6,:32], r2
  1525. pop {r4-r6, pc}
  1526. .endm
  1527. .macro biweight_func w
  1528. function biweight_h264_pixels_\w\()_neon
  1529. push {r4-r6, lr}
  1530. add r4, sp, #16
  1531. ldm r4, {r4-r6}
  1532. lsr lr, r4, #31
  1533. add r6, r6, #1
  1534. eors lr, lr, r5, lsr #30
  1535. orr r6, r6, #1
  1536. vdup.16 q9, r3
  1537. lsl r6, r6, r3
  1538. vmvn q9, q9
  1539. vdup.16 q8, r6
  1540. mov r6, r0
  1541. beq 10f
  1542. subs lr, lr, #1
  1543. beq 20f
  1544. subs lr, lr, #1
  1545. beq 30f
  1546. b 40f
  1547. 10: biweight_\w vmlal.u8, vmlal.u8
  1548. 20: rsb r4, r4, #0
  1549. biweight_\w vmlal.u8, vmlsl.u8
  1550. 30: rsb r4, r4, #0
  1551. rsb r5, r5, #0
  1552. biweight_\w vmlsl.u8, vmlsl.u8
  1553. 40: rsb r5, r5, #0
  1554. biweight_\w vmlsl.u8, vmlal.u8
  1555. endfunc
  1556. .endm
  1557. .macro biweight_entry w, h, b=1
  1558. function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
  1559. mov ip, #\h
  1560. .if \b
  1561. b biweight_h264_pixels_\w\()_neon
  1562. .endif
  1563. endfunc
  1564. .endm
  1565. biweight_entry 16, 8
  1566. biweight_entry 16, 16, b=0
  1567. biweight_func 16
  1568. biweight_entry 8, 16
  1569. biweight_entry 8, 4
  1570. biweight_entry 8, 8, b=0
  1571. biweight_func 8
  1572. biweight_entry 4, 8
  1573. biweight_entry 4, 2
  1574. biweight_entry 4, 4, b=0
  1575. biweight_func 4
  1576. @ Weighted prediction
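@ Weighted prediction with a single reference, applied in place.  Judging by
@ the register usage below: r0 = block, r1 = stride, r2 = log2_denom,
@ r3 = weight, offset on the stack, row count in ip (set by the
@ ff_weight_h264_pixels_WxH wrappers).  The result is, in effect,
@   block[x] = clip_uint8((block[x]*weight + (offset << log2_denom)
@                          + (1 << (log2_denom-1))) >> log2_denom)
@ with a plain offset and no rounding when log2_denom is 0.  The vhadd/vrshl
@ split in the log2_denom > 1 path apparently keeps the 16-bit intermediates
@ from overflowing, and the sign of the weight selects the add or subtract
@ variants of the macros.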
  1577. .macro weight_16 add
  1578. vdup.8 d0, r3
  1579. 1: subs ip, ip, #2
  1580. vld1.8 {d20-d21},[r0,:128], r1
  1581. vmull.u8 q2, d0, d20
  1582. pld [r0]
  1583. vmull.u8 q3, d0, d21
  1584. vld1.8 {d28-d29},[r0,:128], r1
  1585. vmull.u8 q12, d0, d28
  1586. pld [r0]
  1587. vmull.u8 q13, d0, d29
  1588. \add q2, q8, q2
  1589. vrshl.s16 q2, q2, q9
  1590. \add q3, q8, q3
  1591. vrshl.s16 q3, q3, q9
  1592. vqmovun.s16 d4, q2
  1593. vqmovun.s16 d5, q3
  1594. \add q12, q8, q12
  1595. vrshl.s16 q12, q12, q9
  1596. \add q13, q8, q13
  1597. vrshl.s16 q13, q13, q9
  1598. vqmovun.s16 d24, q12
  1599. vqmovun.s16 d25, q13
  1600. vst1.8 {d4- d5}, [r4,:128], r1
  1601. vst1.8 {d24-d25},[r4,:128], r1
  1602. bne 1b
  1603. pop {r4, pc}
  1604. .endm
  1605. .macro weight_8 add
  1606. vdup.8 d0, r3
  1607. 1: subs ip, ip, #2
  1608. vld1.8 {d4},[r0,:64], r1
  1609. vmull.u8 q1, d0, d4
  1610. pld [r0]
  1611. vld1.8 {d6},[r0,:64], r1
  1612. vmull.u8 q10, d0, d6
  1613. \add q1, q8, q1
  1614. pld [r0]
  1615. vrshl.s16 q1, q1, q9
  1616. vqmovun.s16 d2, q1
  1617. \add q10, q8, q10
  1618. vrshl.s16 q10, q10, q9
  1619. vqmovun.s16 d4, q10
  1620. vst1.8 {d2},[r4,:64], r1
  1621. vst1.8 {d4},[r4,:64], r1
  1622. bne 1b
  1623. pop {r4, pc}
  1624. .endm
  1625. .macro weight_4 add
  1626. vdup.8 d0, r3
  1627. vmov q1, q8
  1628. vmov q10, q8
  1629. 1: subs ip, ip, #4
  1630. vld1.32 {d4[0]},[r0,:32], r1
  1631. vld1.32 {d4[1]},[r0,:32], r1
  1632. vmull.u8 q1, d0, d4
  1633. pld [r0]
  1634. blt 2f
  1635. vld1.32 {d6[0]},[r0,:32], r1
  1636. vld1.32 {d6[1]},[r0,:32], r1
  1637. vmull.u8 q10, d0, d6
  1638. pld [r0]
  1639. \add q1, q8, q1
  1640. vrshl.s16 q1, q1, q9
  1641. vqmovun.s16 d2, q1
  1642. \add q10, q8, q10
  1643. vrshl.s16 q10, q10, q9
  1644. vqmovun.s16 d4, q10
  1645. vmov q10, q8
  1646. vst1.32 {d2[0]},[r4,:32], r1
  1647. vst1.32 {d2[1]},[r4,:32], r1
  1648. vmov q1, q8
  1649. vst1.32 {d4[0]},[r4,:32], r1
  1650. vst1.32 {d4[1]},[r4,:32], r1
  1651. bne 1b
  1652. pop {r4, pc}
  1653. 2: \add q1, q8, q1
  1654. vrshl.s16 q1, q1, q9
  1655. vqmovun.s16 d2, q1
  1656. vst1.32 {d2[0]},[r4,:32], r1
  1657. vst1.32 {d2[1]},[r4,:32], r1
  1658. pop {r4, pc}
  1659. .endm
  1660. .macro weight_func w
  1661. function weight_h264_pixels_\w\()_neon
  1662. push {r4, lr}
  1663. ldr r4, [sp, #8]
  1664. cmp r2, #1
  1665. lsl r4, r4, r2
  1666. vdup.16 q8, r4
  1667. mov r4, r0
  1668. ble 20f
  1669. rsb lr, r2, #1
  1670. vdup.16 q9, lr
  1671. cmp r3, #0
  1672. blt 10f
  1673. weight_\w vhadd.s16
  1674. 10: rsb r3, r3, #0
  1675. weight_\w vhsub.s16
  1676. 20: rsb lr, r2, #0
  1677. vdup.16 q9, lr
  1678. cmp r3, #0
  1679. blt 10f
  1680. weight_\w vadd.s16
  1681. 10: rsb r3, r3, #0
  1682. weight_\w vsub.s16
  1683. endfunc
  1684. .endm
  1685. .macro weight_entry w, h, b=1
  1686. function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1
  1687. mov ip, #\h
  1688. .if \b
  1689. b weight_h264_pixels_\w\()_neon
  1690. .endif
  1691. endfunc
  1692. .endm
  1693. weight_entry 16, 8
  1694. weight_entry 16, 16, b=0
  1695. weight_func 16
  1696. weight_entry 8, 16
  1697. weight_entry 8, 4
  1698. weight_entry 8, 8, b=0
  1699. weight_func 8
  1700. weight_entry 4, 8
  1701. weight_entry 4, 2
  1702. weight_entry 4, 4, b=0
  1703. weight_func 4