/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"
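
/* A butterfly of vtrn.32/.16/.8 steps is equivalent to a full byte-matrix
 * transpose across the named registers; the macros below use this to turn
 * row-major loads into column vectors (and back) so that vertical filters
 * can run on horizontally loaded data. */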
.macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7
        vtrn.32 \r0, \r4
        vtrn.32 \r1, \r5
        vtrn.32 \r2, \r6
        vtrn.32 \r3, \r7
        vtrn.16 \r0, \r2
        vtrn.16 \r1, \r3
        vtrn.16 \r4, \r6
        vtrn.16 \r5, \r7
        vtrn.8 \r0, \r1
        vtrn.8 \r2, \r3
        vtrn.8 \r4, \r5
        vtrn.8 \r6, \r7
.endm

.macro transpose_4x4 r0 r1 r2 r3
        vtrn.16 \r0, \r2
        vtrn.16 \r1, \r3
        vtrn.8 \r0, \r1
        vtrn.8 \r2, \r3
.endm

.macro swap4 r0 r1 r2 r3 r4 r5 r6 r7
        vswp \r0, \r4
        vswp \r1, \r5
        vswp \r2, \r6
        vswp \r3, \r7
.endm

.macro transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7
        vtrn.32 \r0, \r2
        vtrn.32 \r1, \r3
        vtrn.32 \r4, \r6
        vtrn.32 \r5, \r7
        vtrn.16 \r0, \r1
        vtrn.16 \r2, \r3
        vtrn.16 \r4, \r5
        vtrn.16 \r6, \r7
.endm

/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
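/* Chroma MC is the bilinear blend
 *     dst = ((8-x)*(8-y)*A + x*(8-y)*B + (8-x)*y*C + x*y*D + 32) >> 6
 * of the four neighbouring samples A..D.  The prologue below derives the
 * weights from x (r4) and y (r5): r7 = x*y, r6 = (8-x)*y, ip = x*(8-y)
 * and r4 = (8-x)*(8-y); vrshrn.u16 #6 supplies the rounding shift.  When
 * x or y is zero the code branches to cheaper two-tap (or copy) loops. */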
.macro h264_chroma_mc8 type
function ff_\type\()_h264_chroma_mc8_neon, export=1
        push {r4-r7, lr}
        ldrd r4, [sp, #20]
.ifc \type,avg
        mov lr, r0
.endif
        pld [r1]
        pld [r1, r2]
        muls r7, r4, r5
        rsb r6, r7, r5, lsl #3
        rsb ip, r7, r4, lsl #3
        sub r4, r7, r4, lsl #3
        sub r4, r4, r5, lsl #3
        add r4, r4, #64
        beq 2f
        add r5, r1, r2
        vdup.8 d0, r4
        lsl r4, r2, #1
        vdup.8 d1, ip
        vld1.64 {d4, d5}, [r1], r4
        vdup.8 d2, r6
        vld1.64 {d6, d7}, [r5], r4
        vdup.8 d3, r7
        vext.8 d5, d4, d5, #1
        vext.8 d7, d6, d7, #1
1:      pld [r5]
        vmull.u8 q8, d4, d0
        vmlal.u8 q8, d5, d1
        vld1.64 {d4, d5}, [r1], r4
        vmlal.u8 q8, d6, d2
        vext.8 d5, d4, d5, #1
        vmlal.u8 q8, d7, d3
        vmull.u8 q9, d6, d0
        subs r3, r3, #2
        vmlal.u8 q9, d7, d1
        vmlal.u8 q9, d4, d2
        vmlal.u8 q9, d5, d3
        vrshrn.u16 d16, q8, #6
        vld1.64 {d6, d7}, [r5], r4
        pld [r1]
        vrshrn.u16 d17, q9, #6
.ifc \type,avg
        vld1.64 {d20}, [lr,:64], r2
        vld1.64 {d21}, [lr,:64], r2
        vrhadd.u8 q8, q8, q10
.endif
        vext.8 d7, d6, d7, #1
        vst1.64 {d16}, [r0,:64], r2
        vst1.64 {d17}, [r0,:64], r2
        bgt 1b
        pop {r4-r7, pc}

2:      tst r6, r6
        add ip, ip, r6
        vdup.8 d0, r4
        vdup.8 d1, ip
        beq 4f
        add r5, r1, r2
        lsl r4, r2, #1
        vld1.64 {d4}, [r1], r4
        vld1.64 {d6}, [r5], r4
3:      pld [r5]
        vmull.u8 q8, d4, d0
        vmlal.u8 q8, d6, d1
        vld1.64 {d4}, [r1], r4
        vmull.u8 q9, d6, d0
        vmlal.u8 q9, d4, d1
        vld1.64 {d6}, [r5], r4
        vrshrn.u16 d16, q8, #6
        vrshrn.u16 d17, q9, #6
.ifc \type,avg
        vld1.64 {d20}, [lr,:64], r2
        vld1.64 {d21}, [lr,:64], r2
        vrhadd.u8 q8, q8, q10
.endif
        subs r3, r3, #2
        pld [r1]
        vst1.64 {d16}, [r0,:64], r2
        vst1.64 {d17}, [r0,:64], r2
        bgt 3b
        pop {r4-r7, pc}

4:      vld1.64 {d4, d5}, [r1], r2
        vld1.64 {d6, d7}, [r1], r2
        vext.8 d5, d4, d5, #1
        vext.8 d7, d6, d7, #1
5:      pld [r1]
        subs r3, r3, #2
        vmull.u8 q8, d4, d0
        vmlal.u8 q8, d5, d1
        vld1.64 {d4, d5}, [r1], r2
        vmull.u8 q9, d6, d0
        vmlal.u8 q9, d7, d1
        pld [r1]
        vext.8 d5, d4, d5, #1
        vrshrn.u16 d16, q8, #6
        vrshrn.u16 d17, q9, #6
.ifc \type,avg
        vld1.64 {d20}, [lr,:64], r2
        vld1.64 {d21}, [lr,:64], r2
        vrhadd.u8 q8, q8, q10
.endif
        vld1.64 {d6, d7}, [r1], r2
        vext.8 d7, d6, d7, #1
        vst1.64 {d16}, [r0,:64], r2
        vst1.64 {d17}, [r0,:64], r2
        bgt 5b
        pop {r4-r7, pc}
.endfunc
.endm

/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
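/* Same blend as chroma_mc8 for 4-pixel-wide blocks.  vtrn.32 packs a row
 * and its one-pixel-shifted copy into a single d register, and likewise
 * packs the weight pairs, so each vmull/vmlal applies two taps at once;
 * vadd.i16 then folds the two halves before the final rounding shift. */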
.macro h264_chroma_mc4 type
function ff_\type\()_h264_chroma_mc4_neon, export=1
        push {r4-r7, lr}
        ldrd r4, [sp, #20]
.ifc \type,avg
        mov lr, r0
.endif
        pld [r1]
        pld [r1, r2]
        muls r7, r4, r5
        rsb r6, r7, r5, lsl #3
        rsb ip, r7, r4, lsl #3
        sub r4, r7, r4, lsl #3
        sub r4, r4, r5, lsl #3
        add r4, r4, #64
        beq 2f
        add r5, r1, r2
        vdup.8 d0, r4
        lsl r4, r2, #1
        vdup.8 d1, ip
        vld1.64 {d4}, [r1], r4
        vdup.8 d2, r6
        vld1.64 {d6}, [r5], r4
        vdup.8 d3, r7
        vext.8 d5, d4, d5, #1
        vext.8 d7, d6, d7, #1
        vtrn.32 d4, d5
        vtrn.32 d6, d7
        vtrn.32 d0, d1
        vtrn.32 d2, d3
1:      pld [r5]
        vmull.u8 q8, d4, d0
        vmlal.u8 q8, d6, d2
        vld1.64 {d4}, [r1], r4
        vext.8 d5, d4, d5, #1
        vtrn.32 d4, d5
        vmull.u8 q9, d6, d0
        vmlal.u8 q9, d4, d2
        vld1.64 {d6}, [r5], r4
        vadd.i16 d16, d16, d17
        vadd.i16 d17, d18, d19
        vrshrn.u16 d16, q8, #6
        subs r3, r3, #2
        pld [r1]
.ifc \type,avg
        vld1.32 {d20[0]}, [lr,:32], r2
        vld1.32 {d20[1]}, [lr,:32], r2
        vrhadd.u8 d16, d16, d20
.endif
        vext.8 d7, d6, d7, #1
        vtrn.32 d6, d7
        vst1.32 {d16[0]}, [r0,:32], r2
        vst1.32 {d16[1]}, [r0,:32], r2
        bgt 1b
        pop {r4-r7, pc}

2:      tst r6, r6
        add ip, ip, r6
        vdup.8 d0, r4
        vdup.8 d1, ip
        vtrn.32 d0, d1
        beq 4f
        vext.32 d1, d0, d1, #1
        add r5, r1, r2
        lsl r4, r2, #1
        vld1.32 {d4[0]}, [r1], r4
        vld1.32 {d4[1]}, [r5], r4
3:      pld [r5]
        vmull.u8 q8, d4, d0
        vld1.32 {d4[0]}, [r1], r4
        vmull.u8 q9, d4, d1
        vld1.32 {d4[1]}, [r5], r4
        vadd.i16 d16, d16, d17
        vadd.i16 d17, d18, d19
        vrshrn.u16 d16, q8, #6
.ifc \type,avg
        vld1.32 {d20[0]}, [lr,:32], r2
        vld1.32 {d20[1]}, [lr,:32], r2
        vrhadd.u8 d16, d16, d20
.endif
        subs r3, r3, #2
        pld [r1]
        vst1.32 {d16[0]}, [r0,:32], r2
        vst1.32 {d16[1]}, [r0,:32], r2
        bgt 3b
        pop {r4-r7, pc}

4:      vld1.64 {d4}, [r1], r2
        vld1.64 {d6}, [r1], r2
        vext.8 d5, d4, d5, #1
        vext.8 d7, d6, d7, #1
        vtrn.32 d4, d5
        vtrn.32 d6, d7
5:      vmull.u8 q8, d4, d0
        vmull.u8 q9, d6, d0
        subs r3, r3, #2
        vld1.64 {d4}, [r1], r2
        vext.8 d5, d4, d5, #1
        vtrn.32 d4, d5
        vadd.i16 d16, d16, d17
        vadd.i16 d17, d18, d19
        pld [r1]
        vrshrn.u16 d16, q8, #6
.ifc \type,avg
        vld1.32 {d20[0]}, [lr,:32], r2
        vld1.32 {d20[1]}, [lr,:32], r2
        vrhadd.u8 d16, d16, d20
.endif
        vld1.64 {d6}, [r1], r2
        vext.8 d7, d6, d7, #1
        vtrn.32 d6, d7
        pld [r1]
        vst1.32 {d16[0]}, [r0,:32], r2
        vst1.32 {d16[1]}, [r0,:32], r2
        bgt 5b
        pop {r4-r7, pc}
.endfunc
.endm
.macro h264_chroma_mc2 type
function ff_\type\()_h264_chroma_mc2_neon, export=1
        push {r4-r6, lr}
        ldr r4, [sp, #16]
        ldr lr, [sp, #20]
        pld [r1]
        pld [r1, r2]
        orrs r5, r4, lr
        beq 2f
        mul r5, r4, lr
        rsb r6, r5, lr, lsl #3
        rsb r12, r5, r4, lsl #3
        sub r4, r5, r4, lsl #3
        sub r4, r4, lr, lsl #3
        add r4, r4, #64
        vdup.8 d0, r4
        vdup.8 d2, r12
        vdup.8 d1, r6
        vdup.8 d3, r5
        vtrn.16 q0, q1
1:
        vld1.32 {d4[0]}, [r1], r2
        vld1.32 {d4[1]}, [r1], r2
        vrev64.32 d5, d4
        vld1.32 {d5[1]}, [r1]
        vext.8 q3, q2, q2, #1
        vtrn.16 q2, q3
        vmull.u8 q8, d4, d0
        vmlal.u8 q8, d5, d1
.ifc \type,avg
        vld1.16 {d18[0]}, [r0,:16], r2
        vld1.16 {d18[1]}, [r0,:16]
        sub r0, r0, r2
.endif
        vtrn.32 d16, d17
        vadd.i16 d16, d16, d17
        vrshrn.u16 d16, q8, #6
.ifc \type,avg
        vrhadd.u8 d16, d16, d18
.endif
        vst1.16 {d16[0]}, [r0,:16], r2
        vst1.16 {d16[1]}, [r0,:16], r2
        subs r3, r3, #2
        bgt 1b
        pop {r4-r6, pc}
2:
.ifc \type,put
        ldrh r5, [r1], r2
        strh r5, [r0], r2
        ldrh r6, [r1], r2
        strh r6, [r0], r2
.else
        vld1.16 {d16[0]}, [r1], r2
        vld1.16 {d16[1]}, [r1], r2
        vld1.16 {d18[0]}, [r0,:16], r2
        vld1.16 {d18[1]}, [r0,:16]
        sub r0, r0, r2
        vrhadd.u8 d16, d16, d18
        vst1.16 {d16[0]}, [r0,:16], r2
        vst1.16 {d16[1]}, [r0,:16], r2
.endif
        subs r3, r3, #2
        bgt 2b
        pop {r4-r6, pc}
.endfunc
.endm

        .text
        .align

h264_chroma_mc8 put
h264_chroma_mc8 avg
h264_chroma_mc4 put
h264_chroma_mc4 avg
h264_chroma_mc2 put
h264_chroma_mc2 avg

/* H.264 loop filter */
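/* Common to all entries: h264_loop_filter_start loads the four tc0 values
 * and returns early when alpha == 0, beta == 0, or all tc0 are negative
 * (nothing to filter).  The per-pixel filter condition is the standard
 *     |p0 - q0| < alpha  &&  |p1 - p0| < beta  &&  |q1 - q0| < beta,
 * evaluated below with vabd/vclt masks. */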
.macro h264_loop_filter_start
        ldr ip, [sp]
        tst r2, r2
        ldr ip, [ip]
        tstne r3, r3
        vmov.32 d24[0], ip
        and ip, ip, ip, lsl #16
        bxeq lr
        ands ip, ip, ip, lsl #8
        bxlt lr
.endm

.macro align_push_regs
        and ip, sp, #15
        add ip, ip, #32
        sub sp, sp, ip
        vst1.64 {d12-d15}, [sp,:128]
        sub sp, sp, #32
        vst1.64 {d8-d11}, [sp,:128]
.endm

.macro align_pop_regs
        vld1.64 {d8-d11}, [sp,:128]!
        vld1.64 {d12-d15}, [sp,:128], ip
.endm

.macro h264_loop_filter_luma
        vdup.8 q11, r2          @ alpha
        vmovl.u8 q12, d24
        vabd.u8 q6, q8, q0      @ abs(p0 - q0)
        vmovl.u16 q12, d24
        vabd.u8 q14, q9, q8     @ abs(p1 - p0)
        vsli.16 q12, q12, #8
        vabd.u8 q15, q1, q0     @ abs(q1 - q0)
        vsli.32 q12, q12, #16
        vclt.u8 q6, q6, q11     @ < alpha
        vdup.8 q11, r3          @ beta
        vclt.s8 q7, q12, #0
        vclt.u8 q14, q14, q11   @ < beta
        vclt.u8 q15, q15, q11   @ < beta
        vbic q6, q6, q7
        vabd.u8 q4, q10, q8     @ abs(p2 - p0)
        vand q6, q6, q14
        vabd.u8 q5, q2, q0      @ abs(q2 - q0)
        vclt.u8 q4, q4, q11     @ < beta
        vand q6, q6, q15
        vclt.u8 q5, q5, q11     @ < beta
        vand q4, q4, q6
        vand q5, q5, q6
        vand q12, q12, q6
        vrhadd.u8 q14, q8, q0
        vsub.i8 q6, q12, q4
        vqadd.u8 q7, q9, q12
        vhadd.u8 q10, q10, q14
        vsub.i8 q6, q6, q5
        vhadd.u8 q14, q2, q14
        vmin.u8 q7, q7, q10
        vqsub.u8 q11, q9, q12
        vqadd.u8 q2, q1, q12
        vmax.u8 q7, q7, q11
        vqsub.u8 q11, q1, q12
        vmin.u8 q14, q2, q14
        vmovl.u8 q2, d0
        vmax.u8 q14, q14, q11
        vmovl.u8 q10, d1
        vsubw.u8 q2, q2, d16
        vsubw.u8 q10, q10, d17
        vshl.i16 q2, q2, #2
        vshl.i16 q10, q10, #2
        vaddw.u8 q2, q2, d18
        vaddw.u8 q10, q10, d19
        vsubw.u8 q2, q2, d2
        vsubw.u8 q10, q10, d3
        vrshrn.i16 d4, q2, #3
        vrshrn.i16 d5, q10, #3
        vbsl q4, q7, q9
        vbsl q5, q14, q1
        vneg.s8 q7, q6
        vmovl.u8 q14, d16
        vmin.s8 q2, q2, q6
        vmovl.u8 q6, d17
        vmax.s8 q2, q2, q7
        vmovl.u8 q11, d0
        vmovl.u8 q12, d1
        vaddw.s8 q14, q14, d4
        vaddw.s8 q6, q6, d5
        vsubw.s8 q11, q11, d4
        vsubw.s8 q12, q12, d5
        vqmovun.s16 d16, q14
        vqmovun.s16 d17, q6
        vqmovun.s16 d0, q11
        vqmovun.s16 d1, q12
.endm

function ff_h264_v_loop_filter_luma_neon, export=1
        h264_loop_filter_start
        vld1.64 {d0, d1}, [r0,:128], r1
        vld1.64 {d2, d3}, [r0,:128], r1
        vld1.64 {d4, d5}, [r0,:128], r1
        sub r0, r0, r1, lsl #2
        sub r0, r0, r1, lsl #1
        vld1.64 {d20,d21}, [r0,:128], r1
        vld1.64 {d18,d19}, [r0,:128], r1
        vld1.64 {d16,d17}, [r0,:128], r1
        align_push_regs
        h264_loop_filter_luma
        sub r0, r0, r1, lsl #1
        vst1.64 {d8, d9}, [r0,:128], r1
        vst1.64 {d16,d17}, [r0,:128], r1
        vst1.64 {d0, d1}, [r0,:128], r1
        vst1.64 {d10,d11}, [r0,:128]
        align_pop_regs
        bx lr
.endfunc

function ff_h264_h_loop_filter_luma_neon, export=1
        h264_loop_filter_start
        sub r0, r0, #4
        vld1.64 {d6}, [r0], r1
        vld1.64 {d20}, [r0], r1
        vld1.64 {d18}, [r0], r1
        vld1.64 {d16}, [r0], r1
        vld1.64 {d0}, [r0], r1
        vld1.64 {d2}, [r0], r1
        vld1.64 {d4}, [r0], r1
        vld1.64 {d26}, [r0], r1
        vld1.64 {d7}, [r0], r1
        vld1.64 {d21}, [r0], r1
        vld1.64 {d19}, [r0], r1
        vld1.64 {d17}, [r0], r1
        vld1.64 {d1}, [r0], r1
        vld1.64 {d3}, [r0], r1
        vld1.64 {d5}, [r0], r1
        vld1.64 {d27}, [r0], r1
        transpose_8x8 q3, q10, q9, q8, q0, q1, q2, q13
        align_push_regs
        h264_loop_filter_luma
        transpose_4x4 q4, q8, q0, q5
        sub r0, r0, r1, lsl #4
        add r0, r0, #2
        vst1.32 {d8[0]}, [r0], r1
        vst1.32 {d16[0]}, [r0], r1
        vst1.32 {d0[0]}, [r0], r1
        vst1.32 {d10[0]}, [r0], r1
        vst1.32 {d8[1]}, [r0], r1
        vst1.32 {d16[1]}, [r0], r1
        vst1.32 {d0[1]}, [r0], r1
        vst1.32 {d10[1]}, [r0], r1
        vst1.32 {d9[0]}, [r0], r1
        vst1.32 {d17[0]}, [r0], r1
        vst1.32 {d1[0]}, [r0], r1
        vst1.32 {d11[0]}, [r0], r1
        vst1.32 {d9[1]}, [r0], r1
        vst1.32 {d17[1]}, [r0], r1
        vst1.32 {d1[1]}, [r0], r1
        vst1.32 {d11[1]}, [r0], r1
        align_pop_regs
        bx lr
.endfunc
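
/* Chroma deblocking touches only p0 and q0.  The delta computed below is
 * the standard H.264 term
 *     delta = clip(((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc)
 * (vrshrn #3 gives the rounding shift, vmin/vmax the clip); it is applied
 * with saturating widening adds and narrowed back to bytes with vqmovun. */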
.macro h264_loop_filter_chroma
        vdup.8 d22, r2          @ alpha
        vmovl.u8 q12, d24
        vabd.u8 d26, d16, d0    @ abs(p0 - q0)
        vmovl.u8 q2, d0
        vabd.u8 d28, d18, d16   @ abs(p1 - p0)
        vsubw.u8 q2, q2, d16
        vsli.16 d24, d24, #8
        vshl.i16 q2, q2, #2
        vabd.u8 d30, d2, d0     @ abs(q1 - q0)
        vaddw.u8 q2, q2, d18
        vclt.u8 d26, d26, d22   @ < alpha
        vsubw.u8 q2, q2, d2
        vdup.8 d22, r3          @ beta
        vclt.s8 d25, d24, #0
        vrshrn.i16 d4, q2, #3
        vclt.u8 d28, d28, d22   @ < beta
        vbic d26, d26, d25
        vclt.u8 d30, d30, d22   @ < beta
        vand d26, d26, d28
        vneg.s8 d25, d24
        vand d26, d26, d30
        vmin.s8 d4, d4, d24
        vmovl.u8 q14, d16
        vand d4, d4, d26
        vmax.s8 d4, d4, d25
        vmovl.u8 q11, d0
        vaddw.s8 q14, q14, d4
        vsubw.s8 q11, q11, d4
        vqmovun.s16 d16, q14
        vqmovun.s16 d0, q11
.endm

function ff_h264_v_loop_filter_chroma_neon, export=1
        h264_loop_filter_start
        sub r0, r0, r1, lsl #1
        vld1.64 {d18}, [r0,:64], r1
        vld1.64 {d16}, [r0,:64], r1
        vld1.64 {d0}, [r0,:64], r1
        vld1.64 {d2}, [r0,:64]
        h264_loop_filter_chroma
        sub r0, r0, r1, lsl #1
        vst1.64 {d16}, [r0,:64], r1
        vst1.64 {d0}, [r0,:64], r1
        bx lr
.endfunc

function ff_h264_h_loop_filter_chroma_neon, export=1
        h264_loop_filter_start
        sub r0, r0, #2
        vld1.32 {d18[0]}, [r0], r1
        vld1.32 {d16[0]}, [r0], r1
        vld1.32 {d0[0]}, [r0], r1
        vld1.32 {d2[0]}, [r0], r1
        vld1.32 {d18[1]}, [r0], r1
        vld1.32 {d16[1]}, [r0], r1
        vld1.32 {d0[1]}, [r0], r1
        vld1.32 {d2[1]}, [r0], r1
        vtrn.16 d18, d0
        vtrn.16 d16, d2
        vtrn.8 d18, d16
        vtrn.8 d0, d2
        h264_loop_filter_chroma
        vtrn.16 d18, d0
        vtrn.16 d16, d2
        vtrn.8 d18, d16
        vtrn.8 d0, d2
        sub r0, r0, r1, lsl #3
        vst1.32 {d18[0]}, [r0], r1
        vst1.32 {d16[0]}, [r0], r1
        vst1.32 {d0[0]}, [r0], r1
        vst1.32 {d2[0]}, [r0], r1
        vst1.32 {d18[1]}, [r0], r1
        vst1.32 {d16[1]}, [r0], r1
        vst1.32 {d0[1]}, [r0], r1
        vst1.32 {d2[1]}, [r0], r1
        bx lr
.endfunc

/* H.264 qpel MC */
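/* Half-pel samples use the 6-tap filter
 *     h = (s[-2] - 5*s[-1] + 20*s[0] + 20*s[1] - 5*s[2] + s[3] + 16) >> 5
 * lowpass_const packs the constants 5 and 20 into 16-bit lanes of d6 so
 * the vmla/vmls below can multiply by scalar; vqrshrun.s16 #5 performs the
 * rounding shift with unsigned saturation.  Quarter-pel positions average
 * a half-pel result with a neighbouring sample (the _l2 paths). */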
.macro lowpass_const r
        movw \r, #5
        movt \r, #20
        vmov.32 d6[0], \r
.endm

.macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
.if \narrow
t0 .req q0
t1 .req q8
.else
t0 .req \d0
t1 .req \d1
.endif
        vext.8 d2, \r0, \r1, #2
        vext.8 d3, \r0, \r1, #3
        vaddl.u8 q1, d2, d3
        vext.8 d4, \r0, \r1, #1
        vext.8 d5, \r0, \r1, #4
        vaddl.u8 q2, d4, d5
        vext.8 d30, \r0, \r1, #5
        vaddl.u8 t0, \r0, d30
        vext.8 d18, \r2, \r3, #2
        vmla.i16 t0, q1, d6[1]
        vext.8 d19, \r2, \r3, #3
        vaddl.u8 q9, d18, d19
        vext.8 d20, \r2, \r3, #1
        vmls.i16 t0, q2, d6[0]
        vext.8 d21, \r2, \r3, #4
        vaddl.u8 q10, d20, d21
        vext.8 d31, \r2, \r3, #5
        vaddl.u8 t1, \r2, d31
        vmla.i16 t1, q9, d6[1]
        vmls.i16 t1, q10, d6[0]
.if \narrow
        vqrshrun.s16 \d0, t0, #5
        vqrshrun.s16 \d1, t1, #5
.endif
.unreq t0
.unreq t1
.endm

.macro lowpass_8_1 r0, r1, d0, narrow=1
.if \narrow
t0 .req q0
.else
t0 .req \d0
.endif
        vext.8 d2, \r0, \r1, #2
        vext.8 d3, \r0, \r1, #3
        vaddl.u8 q1, d2, d3
        vext.8 d4, \r0, \r1, #1
        vext.8 d5, \r0, \r1, #4
        vaddl.u8 q2, d4, d5
        vext.8 d30, \r0, \r1, #5
        vaddl.u8 t0, \r0, d30
        vmla.i16 t0, q1, d6[1]
        vmls.i16 t0, q2, d6[0]
.if \narrow
        vqrshrun.s16 \d0, t0, #5
.endif
.unreq t0
.endm

.macro lowpass_8.16 r0, r1, l0, h0, l1, h1, d
        vext.16 q1, \r0, \r1, #2
        vext.16 q0, \r0, \r1, #3
        vaddl.s16 q9, d2, d0
        vext.16 q2, \r0, \r1, #1
        vaddl.s16 q1, d3, d1
        vext.16 q3, \r0, \r1, #4
        vaddl.s16 q10, d4, d6
        vext.16 \r1, \r0, \r1, #5
        vaddl.s16 q2, d5, d7
        vaddl.s16 q0, \h0, \h1
        vaddl.s16 q8, \l0, \l1
        vshl.i32 q3, q9, #4
        vshl.i32 q9, q9, #2
        vshl.i32 q15, q10, #2
        vadd.i32 q9, q9, q3
        vadd.i32 q10, q10, q15
        vshl.i32 q3, q1, #4
        vshl.i32 q1, q1, #2
        vshl.i32 q15, q2, #2
        vadd.i32 q1, q1, q3
        vadd.i32 q2, q2, q15
        vadd.i32 q9, q9, q8
        vsub.i32 q9, q9, q10
        vadd.i32 q1, q1, q0
        vsub.i32 q1, q1, q2
        vrshrn.s32 d18, q9, #10
        vrshrn.s32 d19, q1, #10
        vqmovun.s16 \d, q9
.endm
function put_h264_qpel16_h_lowpass_neon_packed
        mov r4, lr
        mov ip, #16
        mov r3, #8
        bl put_h264_qpel8_h_lowpass_neon
        sub r1, r1, r2, lsl #4
        add r1, r1, #8
        mov ip, #16
        mov lr, r4
        b put_h264_qpel8_h_lowpass_neon
.endfunc

.macro h264_qpel_h_lowpass type
function \type\()_h264_qpel16_h_lowpass_neon
        push {lr}
        mov ip, #16
        bl \type\()_h264_qpel8_h_lowpass_neon
        sub r0, r0, r3, lsl #4
        sub r1, r1, r2, lsl #4
        add r0, r0, #8
        add r1, r1, #8
        mov ip, #16
        pop {lr}
.endfunc

function \type\()_h264_qpel8_h_lowpass_neon
1:      vld1.64 {d0, d1}, [r1], r2
        vld1.64 {d16,d17}, [r1], r2
        subs ip, ip, #2
        lowpass_8 d0, d1, d16, d17, d0, d16
.ifc \type,avg
        vld1.8 {d2}, [r0,:64], r3
        vrhadd.u8 d0, d0, d2
        vld1.8 {d3}, [r0,:64]
        vrhadd.u8 d16, d16, d3
        sub r0, r0, r3
.endif
        vst1.64 {d0}, [r0,:64], r3
        vst1.64 {d16}, [r0,:64], r3
        bne 1b
        bx lr
.endfunc
.endm

h264_qpel_h_lowpass put
h264_qpel_h_lowpass avg

.macro h264_qpel_h_lowpass_l2 type
function \type\()_h264_qpel16_h_lowpass_l2_neon
        push {lr}
        mov ip, #16
        bl \type\()_h264_qpel8_h_lowpass_l2_neon
        sub r0, r0, r2, lsl #4
        sub r1, r1, r2, lsl #4
        sub r3, r3, r2, lsl #4
        add r0, r0, #8
        add r1, r1, #8
        add r3, r3, #8
        mov ip, #16
        pop {lr}
.endfunc

function \type\()_h264_qpel8_h_lowpass_l2_neon
1:      vld1.64 {d0, d1}, [r1], r2
        vld1.64 {d16,d17}, [r1], r2
        vld1.64 {d28}, [r3], r2
        vld1.64 {d29}, [r3], r2
        subs ip, ip, #2
        lowpass_8 d0, d1, d16, d17, d0, d1
        vrhadd.u8 q0, q0, q14
.ifc \type,avg
        vld1.8 {d2}, [r0,:64], r2
        vrhadd.u8 d0, d0, d2
        vld1.8 {d3}, [r0,:64]
        vrhadd.u8 d1, d1, d3
        sub r0, r0, r2
.endif
        vst1.64 {d0}, [r0,:64], r2
        vst1.64 {d1}, [r0,:64], r2
        bne 1b
        bx lr
.endfunc
.endm

h264_qpel_h_lowpass_l2 put
h264_qpel_h_lowpass_l2 avg

function put_h264_qpel16_v_lowpass_neon_packed
        mov r4, lr
        mov r2, #8
        bl put_h264_qpel8_v_lowpass_neon
        sub r1, r1, r3, lsl #2
        bl put_h264_qpel8_v_lowpass_neon
        sub r1, r1, r3, lsl #4
        sub r1, r1, r3, lsl #2
        add r1, r1, #8
        bl put_h264_qpel8_v_lowpass_neon
        sub r1, r1, r3, lsl #2
        mov lr, r4
        b put_h264_qpel8_v_lowpass_neon
.endfunc
.macro h264_qpel_v_lowpass type
function \type\()_h264_qpel16_v_lowpass_neon
        mov r4, lr
        bl \type\()_h264_qpel8_v_lowpass_neon
        sub r1, r1, r3, lsl #2
        bl \type\()_h264_qpel8_v_lowpass_neon
        sub r0, r0, r2, lsl #4
        add r0, r0, #8
        sub r1, r1, r3, lsl #4
        sub r1, r1, r3, lsl #2
        add r1, r1, #8
        bl \type\()_h264_qpel8_v_lowpass_neon
        sub r1, r1, r3, lsl #2
        mov lr, r4
.endfunc

function \type\()_h264_qpel8_v_lowpass_neon
        vld1.64 {d8}, [r1], r3
        vld1.64 {d10}, [r1], r3
        vld1.64 {d12}, [r1], r3
        vld1.64 {d14}, [r1], r3
        vld1.64 {d22}, [r1], r3
        vld1.64 {d24}, [r1], r3
        vld1.64 {d26}, [r1], r3
        vld1.64 {d28}, [r1], r3
        vld1.64 {d9}, [r1], r3
        vld1.64 {d11}, [r1], r3
        vld1.64 {d13}, [r1], r3
        vld1.64 {d15}, [r1], r3
        vld1.64 {d23}, [r1]
        transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
        lowpass_8 d8, d9, d10, d11, d8, d10
        lowpass_8 d12, d13, d14, d15, d12, d14
        lowpass_8 d22, d23, d24, d25, d22, d24
        lowpass_8 d26, d27, d28, d29, d26, d28
        transpose_8x8 d8, d10, d12, d14, d22, d24, d26, d28
.ifc \type,avg
        vld1.8 {d9}, [r0,:64], r2
        vrhadd.u8 d8, d8, d9
        vld1.8 {d11}, [r0,:64], r2
        vrhadd.u8 d10, d10, d11
        vld1.8 {d13}, [r0,:64], r2
        vrhadd.u8 d12, d12, d13
        vld1.8 {d15}, [r0,:64], r2
        vrhadd.u8 d14, d14, d15
        vld1.8 {d23}, [r0,:64], r2
        vrhadd.u8 d22, d22, d23
        vld1.8 {d25}, [r0,:64], r2
        vrhadd.u8 d24, d24, d25
        vld1.8 {d27}, [r0,:64], r2
        vrhadd.u8 d26, d26, d27
        vld1.8 {d29}, [r0,:64], r2
        vrhadd.u8 d28, d28, d29
        sub r0, r0, r2, lsl #3
.endif
        vst1.64 {d8}, [r0,:64], r2
        vst1.64 {d10}, [r0,:64], r2
        vst1.64 {d12}, [r0,:64], r2
        vst1.64 {d14}, [r0,:64], r2
        vst1.64 {d22}, [r0,:64], r2
        vst1.64 {d24}, [r0,:64], r2
        vst1.64 {d26}, [r0,:64], r2
        vst1.64 {d28}, [r0,:64], r2
        bx lr
.endfunc
.endm

h264_qpel_v_lowpass put
h264_qpel_v_lowpass avg

.macro h264_qpel_v_lowpass_l2 type
function \type\()_h264_qpel16_v_lowpass_l2_neon
        mov r4, lr
        bl \type\()_h264_qpel8_v_lowpass_l2_neon
        sub r1, r1, r3, lsl #2
        bl \type\()_h264_qpel8_v_lowpass_l2_neon
        sub r0, r0, r3, lsl #4
        sub ip, ip, r2, lsl #4
        add r0, r0, #8
        add ip, ip, #8
        sub r1, r1, r3, lsl #4
        sub r1, r1, r3, lsl #2
        add r1, r1, #8
        bl \type\()_h264_qpel8_v_lowpass_l2_neon
        sub r1, r1, r3, lsl #2
        mov lr, r4
.endfunc

function \type\()_h264_qpel8_v_lowpass_l2_neon
        vld1.64 {d8}, [r1], r3
        vld1.64 {d10}, [r1], r3
        vld1.64 {d12}, [r1], r3
        vld1.64 {d14}, [r1], r3
        vld1.64 {d22}, [r1], r3
        vld1.64 {d24}, [r1], r3
        vld1.64 {d26}, [r1], r3
        vld1.64 {d28}, [r1], r3
        vld1.64 {d9}, [r1], r3
        vld1.64 {d11}, [r1], r3
        vld1.64 {d13}, [r1], r3
        vld1.64 {d15}, [r1], r3
        vld1.64 {d23}, [r1]
        transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
        lowpass_8 d8, d9, d10, d11, d8, d9
        lowpass_8 d12, d13, d14, d15, d12, d13
        lowpass_8 d22, d23, d24, d25, d22, d23
        lowpass_8 d26, d27, d28, d29, d26, d27
        transpose_8x8 d8, d9, d12, d13, d22, d23, d26, d27
        vld1.64 {d0}, [ip], r2
        vld1.64 {d1}, [ip], r2
        vld1.64 {d2}, [ip], r2
        vld1.64 {d3}, [ip], r2
        vld1.64 {d4}, [ip], r2
        vrhadd.u8 q0, q0, q4
        vld1.64 {d5}, [ip], r2
        vrhadd.u8 q1, q1, q6
        vld1.64 {d10}, [ip], r2
        vrhadd.u8 q2, q2, q11
        vld1.64 {d11}, [ip], r2
        vrhadd.u8 q5, q5, q13
.ifc \type,avg
        vld1.8 {d16}, [r0,:64], r3
        vrhadd.u8 d0, d0, d16
        vld1.8 {d17}, [r0,:64], r3
        vrhadd.u8 d1, d1, d17
        vld1.8 {d16}, [r0,:64], r3
        vrhadd.u8 d2, d2, d16
        vld1.8 {d17}, [r0,:64], r3
        vrhadd.u8 d3, d3, d17
        vld1.8 {d16}, [r0,:64], r3
        vrhadd.u8 d4, d4, d16
        vld1.8 {d17}, [r0,:64], r3
        vrhadd.u8 d5, d5, d17
        vld1.8 {d16}, [r0,:64], r3
        vrhadd.u8 d10, d10, d16
        vld1.8 {d17}, [r0,:64], r3
        vrhadd.u8 d11, d11, d17
        sub r0, r0, r3, lsl #3
.endif
        vst1.64 {d0}, [r0,:64], r3
        vst1.64 {d1}, [r0,:64], r3
        vst1.64 {d2}, [r0,:64], r3
        vst1.64 {d3}, [r0,:64], r3
        vst1.64 {d4}, [r0,:64], r3
        vst1.64 {d5}, [r0,:64], r3
        vst1.64 {d10}, [r0,:64], r3
        vst1.64 {d11}, [r0,:64], r3
        bx lr
.endfunc
.endm

h264_qpel_v_lowpass_l2 put
h264_qpel_v_lowpass_l2 avg
function put_h264_qpel8_hv_lowpass_neon_top
        lowpass_const ip
        mov ip, #12
1:      vld1.64 {d0, d1}, [r1], r3
        vld1.64 {d16,d17}, [r1], r3
        subs ip, ip, #2
        lowpass_8 d0, d1, d16, d17, q11, q12, narrow=0
        vst1.64 {d22-d25}, [r4,:128]!
        bne 1b
        vld1.64 {d0, d1}, [r1]
        lowpass_8_1 d0, d1, q12, narrow=0
        mov ip, #-16
        add r4, r4, ip
        vld1.64 {d30,d31}, [r4,:128], ip
        vld1.64 {d20,d21}, [r4,:128], ip
        vld1.64 {d18,d19}, [r4,:128], ip
        vld1.64 {d16,d17}, [r4,:128], ip
        vld1.64 {d14,d15}, [r4,:128], ip
        vld1.64 {d12,d13}, [r4,:128], ip
        vld1.64 {d10,d11}, [r4,:128], ip
        vld1.64 {d8, d9}, [r4,:128], ip
        vld1.64 {d6, d7}, [r4,:128], ip
        vld1.64 {d4, d5}, [r4,:128], ip
        vld1.64 {d2, d3}, [r4,:128], ip
        vld1.64 {d0, d1}, [r4,:128]
        swap4 d1, d3, d5, d7, d8, d10, d12, d14
        transpose16_4x4 q0, q1, q2, q3, q4, q5, q6, q7
        swap4 d17, d19, d21, d31, d24, d26, d28, d22
        transpose16_4x4 q8, q9, q10, q15, q12, q13, q14, q11
        vst1.64 {d30,d31}, [r4,:128]!
        vst1.64 {d6, d7}, [r4,:128]!
        vst1.64 {d20,d21}, [r4,:128]!
        vst1.64 {d4, d5}, [r4,:128]!
        vst1.64 {d18,d19}, [r4,:128]!
        vst1.64 {d2, d3}, [r4,:128]!
        vst1.64 {d16,d17}, [r4,:128]!
        vst1.64 {d0, d1}, [r4,:128]
        lowpass_8.16 q4, q12, d8, d9, d24, d25, d8
        lowpass_8.16 q5, q13, d10, d11, d26, d27, d9
        lowpass_8.16 q6, q14, d12, d13, d28, d29, d10
        lowpass_8.16 q7, q11, d14, d15, d22, d23, d11
        vld1.64 {d16,d17}, [r4,:128], ip
        vld1.64 {d30,d31}, [r4,:128], ip
        lowpass_8.16 q8, q15, d16, d17, d30, d31, d12
        vld1.64 {d16,d17}, [r4,:128], ip
        vld1.64 {d30,d31}, [r4,:128], ip
        lowpass_8.16 q8, q15, d16, d17, d30, d31, d13
        vld1.64 {d16,d17}, [r4,:128], ip
        vld1.64 {d30,d31}, [r4,:128], ip
        lowpass_8.16 q8, q15, d16, d17, d30, d31, d14
        vld1.64 {d16,d17}, [r4,:128], ip
        vld1.64 {d30,d31}, [r4,:128]
        lowpass_8.16 q8, q15, d16, d17, d30, d31, d15
        transpose_8x8 d12, d13, d14, d15, d8, d9, d10, d11
        bx lr
.endfunc

.macro h264_qpel8_hv_lowpass type
function \type\()_h264_qpel8_hv_lowpass_neon
        mov r10, lr
        bl put_h264_qpel8_hv_lowpass_neon_top
.ifc \type,avg
        vld1.8 {d0}, [r0,:64], r2
        vrhadd.u8 d12, d12, d0
        vld1.8 {d1}, [r0,:64], r2
        vrhadd.u8 d13, d13, d1
        vld1.8 {d2}, [r0,:64], r2
        vrhadd.u8 d14, d14, d2
        vld1.8 {d3}, [r0,:64], r2
        vrhadd.u8 d15, d15, d3
        vld1.8 {d4}, [r0,:64], r2
        vrhadd.u8 d8, d8, d4
        vld1.8 {d5}, [r0,:64], r2
        vrhadd.u8 d9, d9, d5
        vld1.8 {d6}, [r0,:64], r2
        vrhadd.u8 d10, d10, d6
        vld1.8 {d7}, [r0,:64], r2
        vrhadd.u8 d11, d11, d7
        sub r0, r0, r2, lsl #3
.endif
        vst1.64 {d12}, [r0,:64], r2
        vst1.64 {d13}, [r0,:64], r2
        vst1.64 {d14}, [r0,:64], r2
        vst1.64 {d15}, [r0,:64], r2
        vst1.64 {d8}, [r0,:64], r2
        vst1.64 {d9}, [r0,:64], r2
        vst1.64 {d10}, [r0,:64], r2
        vst1.64 {d11}, [r0,:64], r2
        mov lr, r10
        bx lr
.endfunc
.endm

h264_qpel8_hv_lowpass put
h264_qpel8_hv_lowpass avg
.macro h264_qpel8_hv_lowpass_l2 type
function \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov r10, lr
        bl put_h264_qpel8_hv_lowpass_neon_top
        vld1.64 {d0, d1}, [r2,:128]!
        vld1.64 {d2, d3}, [r2,:128]!
        vrhadd.u8 q0, q0, q6
        vld1.64 {d4, d5}, [r2,:128]!
        vrhadd.u8 q1, q1, q7
        vld1.64 {d6, d7}, [r2,:128]!
        vrhadd.u8 q2, q2, q4
        vrhadd.u8 q3, q3, q5
.ifc \type,avg
        vld1.8 {d16}, [r0,:64], r3
        vrhadd.u8 d0, d0, d16
        vld1.8 {d17}, [r0,:64], r3
        vrhadd.u8 d1, d1, d17
        vld1.8 {d18}, [r0,:64], r3
        vrhadd.u8 d2, d2, d18
        vld1.8 {d19}, [r0,:64], r3
        vrhadd.u8 d3, d3, d19
        vld1.8 {d20}, [r0,:64], r3
        vrhadd.u8 d4, d4, d20
        vld1.8 {d21}, [r0,:64], r3
        vrhadd.u8 d5, d5, d21
        vld1.8 {d22}, [r0,:64], r3
        vrhadd.u8 d6, d6, d22
        vld1.8 {d23}, [r0,:64], r3
        vrhadd.u8 d7, d7, d23
        sub r0, r0, r3, lsl #3
.endif
        vst1.64 {d0}, [r0,:64], r3
        vst1.64 {d1}, [r0,:64], r3
        vst1.64 {d2}, [r0,:64], r3
        vst1.64 {d3}, [r0,:64], r3
        vst1.64 {d4}, [r0,:64], r3
        vst1.64 {d5}, [r0,:64], r3
        vst1.64 {d6}, [r0,:64], r3
        vst1.64 {d7}, [r0,:64], r3
        mov lr, r10
        bx lr
.endfunc
.endm

h264_qpel8_hv_lowpass_l2 put
h264_qpel8_hv_lowpass_l2 avg

.macro h264_qpel16_hv type
function \type\()_h264_qpel16_hv_lowpass_neon
        mov r9, lr
        bl \type\()_h264_qpel8_hv_lowpass_neon
        sub r1, r1, r3, lsl #2
        bl \type\()_h264_qpel8_hv_lowpass_neon
        sub r1, r1, r3, lsl #4
        sub r1, r1, r3, lsl #2
        add r1, r1, #8
        sub r0, r0, r2, lsl #4
        add r0, r0, #8
        bl \type\()_h264_qpel8_hv_lowpass_neon
        sub r1, r1, r3, lsl #2
        mov lr, r9
        b \type\()_h264_qpel8_hv_lowpass_neon
.endfunc

function \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov r9, lr
        sub r2, r4, #256
        bl \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub r1, r1, r3, lsl #2
        bl \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub r1, r1, r3, lsl #4
        sub r1, r1, r3, lsl #2
        add r1, r1, #8
        sub r0, r0, r3, lsl #4
        add r0, r0, #8
        bl \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub r1, r1, r3, lsl #2
        mov lr, r9
        b \type\()_h264_qpel8_hv_lowpass_l2_neon
.endfunc
.endm

h264_qpel16_hv put
h264_qpel16_hv avg
.macro h264_qpel8 type
function ff_\type\()_h264_qpel8_mc10_neon, export=1
        lowpass_const r3
        mov r3, r1
        sub r1, r1, #2
        mov ip, #8
        b \type\()_h264_qpel8_h_lowpass_l2_neon
.endfunc

function ff_\type\()_h264_qpel8_mc20_neon, export=1
        lowpass_const r3
        sub r1, r1, #2
        mov r3, r2
        mov ip, #8
        b \type\()_h264_qpel8_h_lowpass_neon
.endfunc

function ff_\type\()_h264_qpel8_mc30_neon, export=1
        lowpass_const r3
        add r3, r1, #1
        sub r1, r1, #2
        mov ip, #8
        b \type\()_h264_qpel8_h_lowpass_l2_neon
.endfunc

function ff_\type\()_h264_qpel8_mc01_neon, export=1
        push {lr}
        mov ip, r1
\type\()_h264_qpel8_mc01:
        lowpass_const r3
        mov r3, r2
        sub r1, r1, r2, lsl #1
        vpush {d8-d15}
        bl \type\()_h264_qpel8_v_lowpass_l2_neon
        vpop {d8-d15}
        pop {pc}
.endfunc

function ff_\type\()_h264_qpel8_mc11_neon, export=1
        push {r0, r1, r11, lr}
\type\()_h264_qpel8_mc11:
        lowpass_const r3
        mov r11, sp
        bic sp, sp, #15
        sub sp, sp, #64
        mov r0, sp
        sub r1, r1, #2
        mov r3, #8
        mov ip, #8
        vpush {d8-d15}
        bl put_h264_qpel8_h_lowpass_neon
        ldrd r0, [r11]
        mov r3, r2
        add ip, sp, #64
        sub r1, r1, r2, lsl #1
        mov r2, #8
        bl \type\()_h264_qpel8_v_lowpass_l2_neon
        vpop {d8-d15}
        add sp, r11, #8
        pop {r11, pc}
.endfunc

function ff_\type\()_h264_qpel8_mc21_neon, export=1
        push {r0, r1, r4, r10, r11, lr}
\type\()_h264_qpel8_mc21:
        lowpass_const r3
        mov r11, sp
        bic sp, sp, #15
        sub sp, sp, #(8*8+16*12)
        sub r1, r1, #2
        mov r3, #8
        mov r0, sp
        mov ip, #8
        vpush {d8-d15}
        bl put_h264_qpel8_h_lowpass_neon
        mov r4, r0
        ldrd r0, [r11]
        sub r1, r1, r2, lsl #1
        sub r1, r1, #2
        mov r3, r2
        sub r2, r4, #64
        bl \type\()_h264_qpel8_hv_lowpass_l2_neon
        vpop {d8-d15}
        add sp, r11, #8
        pop {r4, r10, r11, pc}
.endfunc

function ff_\type\()_h264_qpel8_mc31_neon, export=1
        add r1, r1, #1
        push {r0, r1, r11, lr}
        sub r1, r1, #1
        b \type\()_h264_qpel8_mc11
.endfunc

function ff_\type\()_h264_qpel8_mc02_neon, export=1
        push {lr}
        lowpass_const r3
        sub r1, r1, r2, lsl #1
        mov r3, r2
        vpush {d8-d15}
        bl \type\()_h264_qpel8_v_lowpass_neon
        vpop {d8-d15}
        pop {pc}
.endfunc

function ff_\type\()_h264_qpel8_mc12_neon, export=1
        push {r0, r1, r4, r10, r11, lr}
\type\()_h264_qpel8_mc12:
        lowpass_const r3
        mov r11, sp
        bic sp, sp, #15
        sub sp, sp, #(8*8+16*12)
        sub r1, r1, r2, lsl #1
        mov r3, r2
        mov r2, #8
        mov r0, sp
        vpush {d8-d15}
        bl put_h264_qpel8_v_lowpass_neon
        mov r4, r0
        ldrd r0, [r11]
        sub r1, r1, r3, lsl #1
        sub r1, r1, #2
        sub r2, r4, #64
        bl \type\()_h264_qpel8_hv_lowpass_l2_neon
        vpop {d8-d15}
        add sp, r11, #8
        pop {r4, r10, r11, pc}
.endfunc

function ff_\type\()_h264_qpel8_mc22_neon, export=1
        push {r4, r10, r11, lr}
        mov r11, sp
        bic sp, sp, #15
        sub r1, r1, r2, lsl #1
        sub r1, r1, #2
        mov r3, r2
        sub sp, sp, #(16*12)
        mov r4, sp
        vpush {d8-d15}
        bl \type\()_h264_qpel8_hv_lowpass_neon
        vpop {d8-d15}
        mov sp, r11
        pop {r4, r10, r11, pc}
.endfunc

function ff_\type\()_h264_qpel8_mc32_neon, export=1
        push {r0, r1, r4, r10, r11, lr}
        add r1, r1, #1
        b \type\()_h264_qpel8_mc12
.endfunc

function ff_\type\()_h264_qpel8_mc03_neon, export=1
        push {lr}
        add ip, r1, r2
        b \type\()_h264_qpel8_mc01
.endfunc

function ff_\type\()_h264_qpel8_mc13_neon, export=1
        push {r0, r1, r11, lr}
        add r1, r1, r2
        b \type\()_h264_qpel8_mc11
.endfunc

function ff_\type\()_h264_qpel8_mc23_neon, export=1
        push {r0, r1, r4, r10, r11, lr}
        add r1, r1, r2
        b \type\()_h264_qpel8_mc21
.endfunc

function ff_\type\()_h264_qpel8_mc33_neon, export=1
        add r1, r1, #1
        push {r0, r1, r11, lr}
        add r1, r1, r2
        sub r1, r1, #1
        b \type\()_h264_qpel8_mc11
.endfunc
.endm

h264_qpel8 put
h264_qpel8 avg
.macro h264_qpel16 type
function ff_\type\()_h264_qpel16_mc10_neon, export=1
        lowpass_const r3
        mov r3, r1
        sub r1, r1, #2
        b \type\()_h264_qpel16_h_lowpass_l2_neon
.endfunc

function ff_\type\()_h264_qpel16_mc20_neon, export=1
        lowpass_const r3
        sub r1, r1, #2
        mov r3, r2
        b \type\()_h264_qpel16_h_lowpass_neon
.endfunc

function ff_\type\()_h264_qpel16_mc30_neon, export=1
        lowpass_const r3
        add r3, r1, #1
        sub r1, r1, #2
        b \type\()_h264_qpel16_h_lowpass_l2_neon
.endfunc

function ff_\type\()_h264_qpel16_mc01_neon, export=1
        push {r4, lr}
        mov ip, r1
\type\()_h264_qpel16_mc01:
        lowpass_const r3
        mov r3, r2
        sub r1, r1, r2, lsl #1
        vpush {d8-d15}
        bl \type\()_h264_qpel16_v_lowpass_l2_neon
        vpop {d8-d15}
        pop {r4, pc}
.endfunc

function ff_\type\()_h264_qpel16_mc11_neon, export=1
        push {r0, r1, r4, r11, lr}
\type\()_h264_qpel16_mc11:
        lowpass_const r3
        mov r11, sp
        bic sp, sp, #15
        sub sp, sp, #256
        mov r0, sp
        sub r1, r1, #2
        mov r3, #16
        vpush {d8-d15}
        bl put_h264_qpel16_h_lowpass_neon
        ldrd r0, [r11]
        mov r3, r2
        add ip, sp, #64
        sub r1, r1, r2, lsl #1
        mov r2, #16
        bl \type\()_h264_qpel16_v_lowpass_l2_neon
        vpop {d8-d15}
        add sp, r11, #8
        pop {r4, r11, pc}
.endfunc

function ff_\type\()_h264_qpel16_mc21_neon, export=1
        push {r0, r1, r4-r5, r9-r11, lr}
\type\()_h264_qpel16_mc21:
        lowpass_const r3
        mov r11, sp
        bic sp, sp, #15
        sub sp, sp, #(16*16+16*12)
        sub r1, r1, #2
        mov r0, sp
        vpush {d8-d15}
        bl put_h264_qpel16_h_lowpass_neon_packed
        mov r4, r0
        ldrd r0, [r11]
        sub r1, r1, r2, lsl #1
        sub r1, r1, #2
        mov r3, r2
        bl \type\()_h264_qpel16_hv_lowpass_l2_neon
        vpop {d8-d15}
        add sp, r11, #8
        pop {r4-r5, r9-r11, pc}
.endfunc

function ff_\type\()_h264_qpel16_mc31_neon, export=1
        add r1, r1, #1
        push {r0, r1, r4, r11, lr}
        sub r1, r1, #1
        b \type\()_h264_qpel16_mc11
.endfunc

function ff_\type\()_h264_qpel16_mc02_neon, export=1
        push {r4, lr}
        lowpass_const r3
        sub r1, r1, r2, lsl #1
        mov r3, r2
        vpush {d8-d15}
        bl \type\()_h264_qpel16_v_lowpass_neon
        vpop {d8-d15}
        pop {r4, pc}
.endfunc

function ff_\type\()_h264_qpel16_mc12_neon, export=1
        push {r0, r1, r4-r5, r9-r11, lr}
\type\()_h264_qpel16_mc12:
        lowpass_const r3
        mov r11, sp
        bic sp, sp, #15
        sub sp, sp, #(16*16+16*12)
        sub r1, r1, r2, lsl #1
        mov r0, sp
        mov r3, r2
        vpush {d8-d15}
        bl put_h264_qpel16_v_lowpass_neon_packed
        mov r4, r0
        ldrd r0, [r11]
        sub r1, r1, r3, lsl #1
        sub r1, r1, #2
        mov r2, r3
        bl \type\()_h264_qpel16_hv_lowpass_l2_neon
        vpop {d8-d15}
        add sp, r11, #8
        pop {r4-r5, r9-r11, pc}
.endfunc

function ff_\type\()_h264_qpel16_mc22_neon, export=1
        push {r4, r9-r11, lr}
        lowpass_const r3
        mov r11, sp
        bic sp, sp, #15
        sub r1, r1, r2, lsl #1
        sub r1, r1, #2
        mov r3, r2
        sub sp, sp, #(16*12)
        mov r4, sp
        vpush {d8-d15}
        bl \type\()_h264_qpel16_hv_lowpass_neon
        vpop {d8-d15}
        mov sp, r11
        pop {r4, r9-r11, pc}
.endfunc

function ff_\type\()_h264_qpel16_mc32_neon, export=1
        push {r0, r1, r4-r5, r9-r11, lr}
        add r1, r1, #1
        b \type\()_h264_qpel16_mc12
.endfunc

function ff_\type\()_h264_qpel16_mc03_neon, export=1
        push {r4, lr}
        add ip, r1, r2
        b \type\()_h264_qpel16_mc01
.endfunc

function ff_\type\()_h264_qpel16_mc13_neon, export=1
        push {r0, r1, r4, r11, lr}
        add r1, r1, r2
        b \type\()_h264_qpel16_mc11
.endfunc

function ff_\type\()_h264_qpel16_mc23_neon, export=1
        push {r0, r1, r4-r5, r9-r11, lr}
        add r1, r1, r2
        b \type\()_h264_qpel16_mc21
.endfunc

function ff_\type\()_h264_qpel16_mc33_neon, export=1
        add r1, r1, #1
        push {r0, r1, r4, r11, lr}
        add r1, r1, r2
        sub r1, r1, #1
        b \type\()_h264_qpel16_mc11
.endfunc
.endm

h264_qpel16 put
h264_qpel16 avg

@ Biweighted prediction
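@ Implements the H.264 bidirectional weighted sample prediction
@     dst = clip(((src0*w0 + src1*w1 + 2^log2_denom) >> (log2_denom + 1)) + o)
@ biweight_func below folds the rounding constant and the combined offset
@ into the accumulator seed q8 = ((offset + 1) | 1) << log2_denom, keeps
@ the shift as a negative count in q9 (~log2_denom) for vshl.s16, and
@ dispatches on the signs of the two weights to one of four vmlal/vmlsl
@ pairings, negating the weights as needed.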
.macro biweight_16 macs, macd
        vdup.8 d0, r4
        vdup.8 d1, r5
        vmov q2, q8
        vmov q3, q8
1:      subs ip, ip, #2
        vld1.8 {d20-d21}, [r0,:128], r2
        \macd q2, d0, d20
        pld [r0]
        \macd q3, d0, d21
        vld1.8 {d22-d23}, [r1,:128], r2
        \macs q2, d1, d22
        pld [r1]
        \macs q3, d1, d23
        vmov q12, q8
        vld1.8 {d28-d29}, [r0,:128], r2
        vmov q13, q8
        \macd q12, d0, d28
        pld [r0]
        \macd q13, d0, d29
        vld1.8 {d30-d31}, [r1,:128], r2
        \macs q12, d1, d30
        pld [r1]
        \macs q13, d1, d31
        vshl.s16 q2, q2, q9
        vshl.s16 q3, q3, q9
        vqmovun.s16 d4, q2
        vqmovun.s16 d5, q3
        vshl.s16 q12, q12, q9
        vshl.s16 q13, q13, q9
        vqmovun.s16 d24, q12
        vqmovun.s16 d25, q13
        vmov q3, q8
        vst1.8 {d4-d5}, [r6,:128], r2
        vmov q2, q8
        vst1.8 {d24-d25}, [r6,:128], r2
        bne 1b
        pop {r4-r6, pc}
.endm

.macro biweight_8 macs, macd
        vdup.8 d0, r4
        vdup.8 d1, r5
        vmov q1, q8
        vmov q10, q8
1:      subs ip, ip, #2
        vld1.8 {d4}, [r0,:64], r2
        \macd q1, d0, d4
        pld [r0]
        vld1.8 {d5}, [r1,:64], r2
        \macs q1, d1, d5
        pld [r1]
        vld1.8 {d6}, [r0,:64], r2
        \macd q10, d0, d6
        pld [r0]
        vld1.8 {d7}, [r1,:64], r2
        \macs q10, d1, d7
        pld [r1]
        vshl.s16 q1, q1, q9
        vqmovun.s16 d2, q1
        vshl.s16 q10, q10, q9
        vqmovun.s16 d4, q10
        vmov q10, q8
        vst1.8 {d2}, [r6,:64], r2
        vmov q1, q8
        vst1.8 {d4}, [r6,:64], r2
        bne 1b
        pop {r4-r6, pc}
.endm

.macro biweight_4 macs, macd
        vdup.8 d0, r4
        vdup.8 d1, r5
        vmov q1, q8
        vmov q10, q8
1:      subs ip, ip, #4
        vld1.32 {d4[0]}, [r0,:32], r2
        vld1.32 {d4[1]}, [r0,:32], r2
        \macd q1, d0, d4
        pld [r0]
        vld1.32 {d5[0]}, [r1,:32], r2
        vld1.32 {d5[1]}, [r1,:32], r2
        \macs q1, d1, d5
        pld [r1]
        blt 2f
        vld1.32 {d6[0]}, [r0,:32], r2
        vld1.32 {d6[1]}, [r0,:32], r2
        \macd q10, d0, d6
        pld [r0]
        vld1.32 {d7[0]}, [r1,:32], r2
        vld1.32 {d7[1]}, [r1,:32], r2
        \macs q10, d1, d7
        pld [r1]
        vshl.s16 q1, q1, q9
        vqmovun.s16 d2, q1
        vshl.s16 q10, q10, q9
        vqmovun.s16 d4, q10
        vmov q10, q8
        vst1.32 {d2[0]}, [r6,:32], r2
        vst1.32 {d2[1]}, [r6,:32], r2
        vmov q1, q8
        vst1.32 {d4[0]}, [r6,:32], r2
        vst1.32 {d4[1]}, [r6,:32], r2
        bne 1b
        pop {r4-r6, pc}
2:      vshl.s16 q1, q1, q9
        vqmovun.s16 d2, q1
        vst1.32 {d2[0]}, [r6,:32], r2
        vst1.32 {d2[1]}, [r6,:32], r2
        pop {r4-r6, pc}
.endm

.macro biweight_func w
function biweight_h264_pixels_\w\()_neon
        push {r4-r6, lr}
        add r4, sp, #16
        ldm r4, {r4-r6}
        lsr lr, r4, #31
        add r6, r6, #1
        eors lr, lr, r5, lsr #30
        orr r6, r6, #1
        vdup.16 q9, r3
        lsl r6, r6, r3
        vmvn q9, q9
        vdup.16 q8, r6
        mov r6, r0
        beq 10f
        subs lr, lr, #1
        beq 20f
        subs lr, lr, #1
        beq 30f
        b 40f
10:     biweight_\w vmlal.u8, vmlal.u8
20:     rsb r4, r4, #0
        biweight_\w vmlal.u8, vmlsl.u8
30:     rsb r4, r4, #0
        rsb r5, r5, #0
        biweight_\w vmlsl.u8, vmlsl.u8
40:     rsb r5, r5, #0
        biweight_\w vmlsl.u8, vmlal.u8
.endfunc
.endm

.macro biweight_entry w, h, b=1
function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
        mov ip, #\h
.if \b
        b biweight_h264_pixels_\w\()_neon
.endif
.endfunc
.endm

biweight_entry 16, 8
biweight_entry 16, 16, b=0
biweight_func 16
biweight_entry 8, 16
biweight_entry 8, 4
biweight_entry 8, 8, b=0
biweight_func 8
biweight_entry 4, 8
biweight_entry 4, 2
biweight_entry 4, 4, b=0
biweight_func 4

@ Weighted prediction
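@ Implements the H.264 unidirectional weighted prediction
@     dst = clip(((src * w + 2^(log2_denom - 1)) >> log2_denom) + offset)
@ weight_func seeds q8 with offset << log2_denom so a single rounding
@ shift (vrshl.s16 by the negative count in q9) yields both terms.  For
@ log2_denom > 1 the halving vhadd/vhsub path keeps the 16-bit
@ accumulator from overflowing; negative weights negate w and switch the
@ adding variant for the subtracting one.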
.macro weight_16 add
        vdup.8 d0, r3
1:      subs ip, ip, #2
        vld1.8 {d20-d21}, [r0,:128], r1
        vmull.u8 q2, d0, d20
        pld [r0]
        vmull.u8 q3, d0, d21
        vld1.8 {d28-d29}, [r0,:128], r1
        vmull.u8 q12, d0, d28
        pld [r0]
        vmull.u8 q13, d0, d29
        \add q2, q8, q2
        vrshl.s16 q2, q2, q9
        \add q3, q8, q3
        vrshl.s16 q3, q3, q9
        vqmovun.s16 d4, q2
        vqmovun.s16 d5, q3
        \add q12, q8, q12
        vrshl.s16 q12, q12, q9
        \add q13, q8, q13
        vrshl.s16 q13, q13, q9
        vqmovun.s16 d24, q12
        vqmovun.s16 d25, q13
        vst1.8 {d4-d5}, [r4,:128], r1
        vst1.8 {d24-d25}, [r4,:128], r1
        bne 1b
        pop {r4, pc}
.endm

.macro weight_8 add
        vdup.8 d0, r3
1:      subs ip, ip, #2
        vld1.8 {d4}, [r0,:64], r1
        vmull.u8 q1, d0, d4
        pld [r0]
        vld1.8 {d6}, [r0,:64], r1
        vmull.u8 q10, d0, d6
        \add q1, q8, q1
        pld [r0]
        vrshl.s16 q1, q1, q9
        vqmovun.s16 d2, q1
        \add q10, q8, q10
        vrshl.s16 q10, q10, q9
        vqmovun.s16 d4, q10
        vst1.8 {d2}, [r4,:64], r1
        vst1.8 {d4}, [r4,:64], r1
        bne 1b
        pop {r4, pc}
.endm

.macro weight_4 add
        vdup.8 d0, r3
        vmov q1, q8
        vmov q10, q8
1:      subs ip, ip, #4
        vld1.32 {d4[0]}, [r0,:32], r1
        vld1.32 {d4[1]}, [r0,:32], r1
        vmull.u8 q1, d0, d4
        pld [r0]
        blt 2f
        vld1.32 {d6[0]}, [r0,:32], r1
        vld1.32 {d6[1]}, [r0,:32], r1
        vmull.u8 q10, d0, d6
        pld [r0]
        \add q1, q8, q1
        vrshl.s16 q1, q1, q9
        vqmovun.s16 d2, q1
        \add q10, q8, q10
        vrshl.s16 q10, q10, q9
        vqmovun.s16 d4, q10
        vmov q10, q8
        vst1.32 {d2[0]}, [r4,:32], r1
        vst1.32 {d2[1]}, [r4,:32], r1
        vmov q1, q8
        vst1.32 {d4[0]}, [r4,:32], r1
        vst1.32 {d4[1]}, [r4,:32], r1
        bne 1b
        pop {r4, pc}
2:      \add q1, q8, q1
        vrshl.s16 q1, q1, q9
        vqmovun.s16 d2, q1
        vst1.32 {d2[0]}, [r4,:32], r1
        vst1.32 {d2[1]}, [r4,:32], r1
        pop {r4, pc}
.endm

.macro weight_func w
function weight_h264_pixels_\w\()_neon
        push {r4, lr}
        ldr r4, [sp, #8]
        cmp r2, #1
        lsl r4, r4, r2
        vdup.16 q8, r4
        mov r4, r0
        ble 20f
        rsb lr, r2, #1
        vdup.16 q9, lr
        cmp r3, #0
        blt 10f
        weight_\w vhadd.s16
10:     rsb r3, r3, #0
        weight_\w vhsub.s16
20:     rsb lr, r2, #0
        vdup.16 q9, lr
        cmp r3, #0
        blt 10f
        weight_\w vadd.s16
10:     rsb r3, r3, #0
        weight_\w vsub.s16
.endfunc
.endm

.macro weight_entry w, h, b=1
function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1
        mov ip, #\h
.if \b
        b weight_h264_pixels_\w\()_neon
.endif
.endfunc
.endm

weight_entry 16, 8
weight_entry 16, 16, b=0
weight_func 16
weight_entry 8, 16
weight_entry 8, 4
weight_entry 8, 8, b=0
weight_func 8
weight_entry 4, 8
weight_entry 4, 2
weight_entry 4, 4, b=0
weight_func 4