/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"
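
@ Register-transpose helpers: transpose_8x8 turns eight registers of
@ rows into columns with three rounds of vtrn at decreasing element
@ size (.32, .16, .8); transpose_4x4 and transpose16_4x4 are the
@ smaller byte and halfword variants, and swap4 exchanges register
@ pairs ahead of a 16-bit transpose.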
.macro  transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7
        vtrn.32 \r0, \r4
        vtrn.32 \r1, \r5
        vtrn.32 \r2, \r6
        vtrn.32 \r3, \r7
        vtrn.16 \r0, \r2
        vtrn.16 \r1, \r3
        vtrn.16 \r4, \r6
        vtrn.16 \r5, \r7
        vtrn.8  \r0, \r1
        vtrn.8  \r2, \r3
        vtrn.8  \r4, \r5
        vtrn.8  \r6, \r7
.endm

.macro  transpose_4x4 r0 r1 r2 r3
        vtrn.16 \r0, \r2
        vtrn.16 \r1, \r3
        vtrn.8  \r0, \r1
        vtrn.8  \r2, \r3
.endm

.macro  swap4 r0 r1 r2 r3 r4 r5 r6 r7
        vswp    \r0, \r4
        vswp    \r1, \r5
        vswp    \r2, \r6
        vswp    \r3, \r7
.endm

.macro  transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7
        vtrn.32 \r0, \r2
        vtrn.32 \r1, \r3
        vtrn.32 \r4, \r6
        vtrn.32 \r5, \r7
        vtrn.16 \r0, \r1
        vtrn.16 \r2, \r3
        vtrn.16 \r4, \r5
        vtrn.16 \r6, \r7
.endm

/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
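
@ H.264 chroma MC is bilinear: with weights A = (8-x)(8-y), B = x(8-y),
@ C = (8-x)y and D = xy, each output pixel is
@   dst = (A*src[0] + B*src[1] + C*src[stride] + D*src[stride+1] + 32) >> 6
@ Below, r7 = D, r6 = C, ip = B and r4 = A (x*y - 8*x - 8*y + 64); the
@ weights are broadcast to d0-d3 and the rounding shift is done with
@ vrshrn #6.  When x or y is zero the code falls through to the cheaper
@ one-dimensional loops at 2:/3:/4:.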
.macro  h264_chroma_mc8 type
function ff_\type\()_h264_chroma_mc8_neon, export=1
        push    {r4-r7, lr}
        ldrd    r4, [sp, #20]
.ifc \type,avg
        mov     lr, r0
.endif
        pld     [r1]
        pld     [r1, r2]

A       muls    r7, r4, r5
T       mul     r7, r4, r5
T       cmp     r7, #0
        rsb     r6, r7, r5, lsl #3
        rsb     ip, r7, r4, lsl #3
        sub     r4, r7, r4, lsl #3
        sub     r4, r4, r5, lsl #3
        add     r4, r4, #64

        beq     2f

        add     r5, r1, r2

        vdup.8  d0, r4
        lsl     r4, r2, #1
        vdup.8  d1, ip
        vld1.64 {d4, d5}, [r1], r4
        vdup.8  d2, r6
        vld1.64 {d6, d7}, [r5], r4
        vdup.8  d3, r7

        vext.8  d5, d4, d5, #1
        vext.8  d7, d6, d7, #1

1:      pld     [r5]
        vmull.u8 q8, d4, d0
        vmlal.u8 q8, d5, d1
        vld1.64 {d4, d5}, [r1], r4
        vmlal.u8 q8, d6, d2
        vext.8  d5, d4, d5, #1
        vmlal.u8 q8, d7, d3
        vmull.u8 q9, d6, d0
        subs    r3, r3, #2
        vmlal.u8 q9, d7, d1
        vmlal.u8 q9, d4, d2
        vmlal.u8 q9, d5, d3
        vrshrn.u16 d16, q8, #6
        vld1.64 {d6, d7}, [r5], r4
        pld     [r1]
        vrshrn.u16 d17, q9, #6
.ifc \type,avg
        vld1.64 {d20}, [lr,:64], r2
        vld1.64 {d21}, [lr,:64], r2
        vrhadd.u8 q8, q8, q10
.endif
        vext.8  d7, d6, d7, #1
        vst1.64 {d16}, [r0,:64], r2
        vst1.64 {d17}, [r0,:64], r2
        bgt     1b

        pop     {r4-r7, pc}

2:      tst     r6, r6
        add     ip, ip, r6
        vdup.8  d0, r4
        vdup.8  d1, ip

        beq     4f

        add     r5, r1, r2
        lsl     r4, r2, #1
        vld1.64 {d4}, [r1], r4
        vld1.64 {d6}, [r5], r4

3:      pld     [r5]
        vmull.u8 q8, d4, d0
        vmlal.u8 q8, d6, d1
        vld1.64 {d4}, [r1], r4
        vmull.u8 q9, d6, d0
        vmlal.u8 q9, d4, d1
        vld1.64 {d6}, [r5], r4
        vrshrn.u16 d16, q8, #6
        vrshrn.u16 d17, q9, #6
.ifc \type,avg
        vld1.64 {d20}, [lr,:64], r2
        vld1.64 {d21}, [lr,:64], r2
        vrhadd.u8 q8, q8, q10
.endif
        subs    r3, r3, #2
        pld     [r1]
        vst1.64 {d16}, [r0,:64], r2
        vst1.64 {d17}, [r0,:64], r2
        bgt     3b

        pop     {r4-r7, pc}

4:      vld1.64 {d4, d5}, [r1], r2
        vld1.64 {d6, d7}, [r1], r2
        vext.8  d5, d4, d5, #1
        vext.8  d7, d6, d7, #1

5:      pld     [r1]
        subs    r3, r3, #2
        vmull.u8 q8, d4, d0
        vmlal.u8 q8, d5, d1
        vld1.64 {d4, d5}, [r1], r2
        vmull.u8 q9, d6, d0
        vmlal.u8 q9, d7, d1
        pld     [r1]
        vext.8  d5, d4, d5, #1
        vrshrn.u16 d16, q8, #6
        vrshrn.u16 d17, q9, #6
.ifc \type,avg
        vld1.64 {d20}, [lr,:64], r2
        vld1.64 {d21}, [lr,:64], r2
        vrhadd.u8 q8, q8, q10
.endif
        vld1.64 {d6, d7}, [r1], r2
        vext.8  d7, d6, d7, #1
        vst1.64 {d16}, [r0,:64], r2
        vst1.64 {d17}, [r0,:64], r2
        bgt     5b

        pop     {r4-r7, pc}
endfunc
.endm

/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
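
@ The 4-wide variant uses the same weights, but interleaves the two
@ horizontal taps of a row into one d register with vtrn.32 so a single
@ vmull/vmlal pair covers all four products; the two partial sums are
@ then folded with vadd.i16 before the rounding shift.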
.macro  h264_chroma_mc4 type
function ff_\type\()_h264_chroma_mc4_neon, export=1
        push    {r4-r7, lr}
        ldrd    r4, [sp, #20]
.ifc \type,avg
        mov     lr, r0
.endif
        pld     [r1]
        pld     [r1, r2]

A       muls    r7, r4, r5
T       mul     r7, r4, r5
T       cmp     r7, #0
        rsb     r6, r7, r5, lsl #3
        rsb     ip, r7, r4, lsl #3
        sub     r4, r7, r4, lsl #3
        sub     r4, r4, r5, lsl #3
        add     r4, r4, #64

        beq     2f

        add     r5, r1, r2

        vdup.8  d0, r4
        lsl     r4, r2, #1
        vdup.8  d1, ip
        vld1.64 {d4}, [r1], r4
        vdup.8  d2, r6
        vld1.64 {d6}, [r5], r4
        vdup.8  d3, r7

        vext.8  d5, d4, d5, #1
        vext.8  d7, d6, d7, #1
        vtrn.32 d4, d5
        vtrn.32 d6, d7

        vtrn.32 d0, d1
        vtrn.32 d2, d3

1:      pld     [r5]
        vmull.u8 q8, d4, d0
        vmlal.u8 q8, d6, d2
        vld1.64 {d4}, [r1], r4
        vext.8  d5, d4, d5, #1
        vtrn.32 d4, d5
        vmull.u8 q9, d6, d0
        vmlal.u8 q9, d4, d2
        vld1.64 {d6}, [r5], r4
        vadd.i16 d16, d16, d17
        vadd.i16 d17, d18, d19
        vrshrn.u16 d16, q8, #6
        subs    r3, r3, #2
        pld     [r1]
.ifc \type,avg
        vld1.32 {d20[0]}, [lr,:32], r2
        vld1.32 {d20[1]}, [lr,:32], r2
        vrhadd.u8 d16, d16, d20
.endif
        vext.8  d7, d6, d7, #1
        vtrn.32 d6, d7
        vst1.32 {d16[0]}, [r0,:32], r2
        vst1.32 {d16[1]}, [r0,:32], r2
        bgt     1b

        pop     {r4-r7, pc}

2:      tst     r6, r6
        add     ip, ip, r6
        vdup.8  d0, r4
        vdup.8  d1, ip
        vtrn.32 d0, d1

        beq     4f

        vext.32 d1, d0, d1, #1
        add     r5, r1, r2
        lsl     r4, r2, #1
        vld1.32 {d4[0]}, [r1], r4
        vld1.32 {d4[1]}, [r5], r4

3:      pld     [r5]
        vmull.u8 q8, d4, d0
        vld1.32 {d4[0]}, [r1], r4
        vmull.u8 q9, d4, d1
        vld1.32 {d4[1]}, [r5], r4
        vadd.i16 d16, d16, d17
        vadd.i16 d17, d18, d19
        vrshrn.u16 d16, q8, #6
.ifc \type,avg
        vld1.32 {d20[0]}, [lr,:32], r2
        vld1.32 {d20[1]}, [lr,:32], r2
        vrhadd.u8 d16, d16, d20
.endif
        subs    r3, r3, #2
        pld     [r1]
        vst1.32 {d16[0]}, [r0,:32], r2
        vst1.32 {d16[1]}, [r0,:32], r2
        bgt     3b

        pop     {r4-r7, pc}

4:      vld1.64 {d4}, [r1], r2
        vld1.64 {d6}, [r1], r2
        vext.8  d5, d4, d5, #1
        vext.8  d7, d6, d7, #1
        vtrn.32 d4, d5
        vtrn.32 d6, d7

5:      vmull.u8 q8, d4, d0
        vmull.u8 q9, d6, d0
        subs    r3, r3, #2
        vld1.64 {d4}, [r1], r2
        vext.8  d5, d4, d5, #1
        vtrn.32 d4, d5
        vadd.i16 d16, d16, d17
        vadd.i16 d17, d18, d19
        pld     [r1]
        vrshrn.u16 d16, q8, #6
.ifc \type,avg
        vld1.32 {d20[0]}, [lr,:32], r2
        vld1.32 {d20[1]}, [lr,:32], r2
        vrhadd.u8 d16, d16, d20
.endif
        vld1.64 {d6}, [r1], r2
        vext.8  d7, d6, d7, #1
        vtrn.32 d6, d7
        pld     [r1]
        vst1.32 {d16[0]}, [r0,:32], r2
        vst1.32 {d16[1]}, [r0,:32], r2
        bgt     5b

        pop     {r4-r7, pc}
endfunc
.endm
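
@ The 2-wide variant processes two 2-pixel rows per iteration inside a
@ single q register; when x and y are both zero it reduces to a plain
@ 2x2 copy (put) or average (avg) at label 2:.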
.macro  h264_chroma_mc2 type
function ff_\type\()_h264_chroma_mc2_neon, export=1
        push    {r4-r6, lr}
        ldr     r4, [sp, #16]
        ldr     lr, [sp, #20]
        pld     [r1]
        pld     [r1, r2]
        orrs    r5, r4, lr
        beq     2f

        mul     r5, r4, lr
        rsb     r6, r5, lr, lsl #3
        rsb     r12, r5, r4, lsl #3
        sub     r4, r5, r4, lsl #3
        sub     r4, r4, lr, lsl #3
        add     r4, r4, #64
        vdup.8  d0, r4
        vdup.8  d2, r12
        vdup.8  d1, r6
        vdup.8  d3, r5
        vtrn.16 q0, q1
1:
        vld1.32 {d4[0]}, [r1], r2
        vld1.32 {d4[1]}, [r1], r2
        vrev64.32 d5, d4
        vld1.32 {d5[1]}, [r1]
        vext.8  q3, q2, q2, #1
        vtrn.16 q2, q3
        vmull.u8 q8, d4, d0
        vmlal.u8 q8, d5, d1
.ifc \type,avg
        vld1.16 {d18[0]}, [r0,:16], r2
        vld1.16 {d18[1]}, [r0,:16]
        sub     r0, r0, r2
.endif
        vtrn.32 d16, d17
        vadd.i16 d16, d16, d17
        vrshrn.u16 d16, q8, #6
.ifc \type,avg
        vrhadd.u8 d16, d16, d18
.endif
        vst1.16 {d16[0]}, [r0,:16], r2
        vst1.16 {d16[1]}, [r0,:16], r2
        subs    r3, r3, #2
        bgt     1b
        pop     {r4-r6, pc}
2:
.ifc \type,put
        ldrh_post r5, r1, r2
        strh_post r5, r0, r2
        ldrh_post r6, r1, r2
        strh_post r6, r0, r2
.else
        vld1.16 {d16[0]}, [r1], r2
        vld1.16 {d16[1]}, [r1], r2
        vld1.16 {d18[0]}, [r0,:16], r2
        vld1.16 {d18[1]}, [r0,:16]
        sub     r0, r0, r2
        vrhadd.u8 d16, d16, d18
        vst1.16 {d16[0]}, [r0,:16], r2
        vst1.16 {d16[1]}, [r0,:16], r2
.endif
        subs    r3, r3, #2
        bgt     2b
        pop     {r4-r6, pc}
endfunc
.endm

        h264_chroma_mc8 put
        h264_chroma_mc8 avg
        h264_chroma_mc4 put
        h264_chroma_mc4 avg
        h264_chroma_mc2 put
        h264_chroma_mc2 avg

/* H.264 loop filter */
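
@ An edge is filtered only where
@   |p0 - q0| < alpha, |p1 - p0| < beta and |q1 - q0| < beta.
@ The per-edge tc0 values arrive as four packed bytes; a negative tc0
@ disables filtering for its 4-pixel group.  h264_loop_filter_start
@ loads that packed word into d24/ip and returns early when alpha or
@ beta is zero, or when all four tc0 bytes are negative.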
.macro  h264_loop_filter_start
        ldr     ip, [sp]
        tst     r2, r2
        ldr     ip, [ip]
        it      ne
        tstne   r3, r3
        vmov.32 d24[0], ip
        and     ip, ip, ip, lsl #16
        it      eq
        bxeq    lr
        ands    ip, ip, ip, lsl #8
        it      lt
        bxlt    lr
.endm
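
@ Luma filtering computes the standard delta
@   delta = clip((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -tc, tc)
@ and adds it to p0 / subtracts it from q0.  p1 and q1 are updated only
@ where the extra |p2 - p0| < beta and |q2 - q0| < beta tests pass, and
@ each passing test widens the tc clip range by one (tc = tc0 + ap + aq).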
.macro  h264_loop_filter_luma
        vdup.8  q11, r2         @ alpha
        vmovl.u8 q12, d24
        vabd.u8 q6,  q8,  q0    @ abs(p0 - q0)
        vmovl.u16 q12, d24
        vabd.u8 q14, q9,  q8    @ abs(p1 - p0)
        vsli.16 q12, q12, #8
        vabd.u8 q15, q1,  q0    @ abs(q1 - q0)
        vsli.32 q12, q12, #16
        vclt.u8 q6,  q6,  q11   @ < alpha
        vdup.8  q11, r3         @ beta
        vclt.s8 q7,  q12, #0
        vclt.u8 q14, q14, q11   @ < beta
        vclt.u8 q15, q15, q11   @ < beta
        vbic    q6,  q6,  q7
        vabd.u8 q4,  q10, q8    @ abs(p2 - p0)
        vand    q6,  q6,  q14
        vabd.u8 q5,  q2,  q0    @ abs(q2 - q0)
        vclt.u8 q4,  q4,  q11   @ < beta
        vand    q6,  q6,  q15
        vclt.u8 q5,  q5,  q11   @ < beta
        vand    q4,  q4,  q6
        vand    q5,  q5,  q6
        vand    q12, q12, q6
        vrhadd.u8 q14, q8,  q0
        vsub.i8 q6,  q12, q4
        vqadd.u8 q7,  q9,  q12
        vhadd.u8 q10, q10, q14
        vsub.i8 q6,  q6,  q5
        vhadd.u8 q14, q2,  q14
        vmin.u8 q7,  q7,  q10
        vqsub.u8 q11, q9,  q12
        vqadd.u8 q2,  q1,  q12
        vmax.u8 q7,  q7,  q11
        vqsub.u8 q11, q1,  q12
        vmin.u8 q14, q2,  q14
        vmovl.u8 q2,  d0
        vmax.u8 q14, q14, q11
        vmovl.u8 q10, d1
        vsubw.u8 q2,  q2,  d16
        vsubw.u8 q10, q10, d17
        vshl.i16 q2,  q2,  #2
        vshl.i16 q10, q10, #2
        vaddw.u8 q2,  q2,  d18
        vaddw.u8 q10, q10, d19
        vsubw.u8 q2,  q2,  d2
        vsubw.u8 q10, q10, d3
        vrshrn.i16 d4,  q2,  #3
        vrshrn.i16 d5,  q10, #3
        vbsl    q4,  q7,  q9
        vbsl    q5,  q14, q1
        vneg.s8 q7,  q6
        vmovl.u8 q14, d16
        vmin.s8 q2,  q2,  q6
        vmovl.u8 q6,  d17
        vmax.s8 q2,  q2,  q7
        vmovl.u8 q11, d0
        vmovl.u8 q12, d1
        vaddw.s8 q14, q14, d4
        vaddw.s8 q6,  q6,  d5
        vsubw.s8 q11, q11, d4
        vsubw.s8 q12, q12, d5
        vqmovun.s16 d16, q14
        vqmovun.s16 d17, q6
        vqmovun.s16 d0,  q11
        vqmovun.s16 d1,  q12
.endm

function ff_h264_v_loop_filter_luma_neon, export=1
        h264_loop_filter_start

        vld1.64 {d0, d1},  [r0,:128], r1
        vld1.64 {d2, d3},  [r0,:128], r1
        vld1.64 {d4, d5},  [r0,:128], r1
        sub     r0, r0, r1, lsl #2
        sub     r0, r0, r1, lsl #1
        vld1.64 {d20,d21}, [r0,:128], r1
        vld1.64 {d18,d19}, [r0,:128], r1
        vld1.64 {d16,d17}, [r0,:128], r1

        vpush   {d8-d15}

        h264_loop_filter_luma

        sub     r0, r0, r1, lsl #1
        vst1.64 {d8, d9},  [r0,:128], r1
        vst1.64 {d16,d17}, [r0,:128], r1
        vst1.64 {d0, d1},  [r0,:128], r1
        vst1.64 {d10,d11}, [r0,:128]

        vpop    {d8-d15}
        bx      lr
endfunc

function ff_h264_h_loop_filter_luma_neon, export=1
        h264_loop_filter_start

        sub     r0, r0, #4
        vld1.64 {d6},  [r0], r1
        vld1.64 {d20}, [r0], r1
        vld1.64 {d18}, [r0], r1
        vld1.64 {d16}, [r0], r1
        vld1.64 {d0},  [r0], r1
        vld1.64 {d2},  [r0], r1
        vld1.64 {d4},  [r0], r1
        vld1.64 {d26}, [r0], r1
        vld1.64 {d7},  [r0], r1
        vld1.64 {d21}, [r0], r1
        vld1.64 {d19}, [r0], r1
        vld1.64 {d17}, [r0], r1
        vld1.64 {d1},  [r0], r1
        vld1.64 {d3},  [r0], r1
        vld1.64 {d5},  [r0], r1
        vld1.64 {d27}, [r0], r1

        transpose_8x8 q3, q10, q9, q8, q0, q1, q2, q13

        vpush   {d8-d15}

        h264_loop_filter_luma

        transpose_4x4 q4, q8, q0, q5

        sub     r0, r0, r1, lsl #4
        add     r0, r0, #2
        vst1.32 {d8[0]},  [r0], r1
        vst1.32 {d16[0]}, [r0], r1
        vst1.32 {d0[0]},  [r0], r1
        vst1.32 {d10[0]}, [r0], r1
        vst1.32 {d8[1]},  [r0], r1
        vst1.32 {d16[1]}, [r0], r1
        vst1.32 {d0[1]},  [r0], r1
        vst1.32 {d10[1]}, [r0], r1
        vst1.32 {d9[0]},  [r0], r1
        vst1.32 {d17[0]}, [r0], r1
        vst1.32 {d1[0]},  [r0], r1
        vst1.32 {d11[0]}, [r0], r1
        vst1.32 {d9[1]},  [r0], r1
        vst1.32 {d17[1]}, [r0], r1
        vst1.32 {d1[1]},  [r0], r1
        vst1.32 {d11[1]}, [r0], r1

        vpop    {d8-d15}
        bx      lr
endfunc
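
@ Chroma uses the same delta as luma, clipped with the tc values packed
@ in d24, but only p0 and q0 are written back; there is no p1/q1 update
@ and no p2/q2 test.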
.macro  h264_loop_filter_chroma
        vdup.8  d22, r2         @ alpha
        vmovl.u8 q12, d24
        vabd.u8 d26, d16, d0    @ abs(p0 - q0)
        vmovl.u8 q2,  d0
        vabd.u8 d28, d18, d16   @ abs(p1 - p0)
        vsubw.u8 q2,  q2,  d16
        vsli.16 d24, d24, #8
        vshl.i16 q2,  q2,  #2
        vabd.u8 d30, d2,  d0    @ abs(q1 - q0)
        vaddw.u8 q2,  q2,  d18
        vclt.u8 d26, d26, d22   @ < alpha
        vsubw.u8 q2,  q2,  d2
        vdup.8  d22, r3         @ beta
        vrshrn.i16 d4, q2, #3
        vclt.u8 d28, d28, d22   @ < beta
        vclt.u8 d30, d30, d22   @ < beta
        vmin.s8 d4,  d4,  d24
        vneg.s8 d25, d24
        vand    d26, d26, d28
        vmax.s8 d4,  d4,  d25
        vand    d26, d26, d30
        vmovl.u8 q11, d0
        vand    d4,  d4,  d26
        vmovl.u8 q14, d16
        vaddw.s8 q14, q14, d4
        vsubw.s8 q11, q11, d4
        vqmovun.s16 d16, q14
        vqmovun.s16 d0,  q11
.endm

function ff_h264_v_loop_filter_chroma_neon, export=1
        h264_loop_filter_start

        sub     r0, r0, r1, lsl #1
        vld1.64 {d18}, [r0,:64], r1
        vld1.64 {d16}, [r0,:64], r1
        vld1.64 {d0},  [r0,:64], r1
        vld1.64 {d2},  [r0,:64]

        h264_loop_filter_chroma

        sub     r0, r0, r1, lsl #1
        vst1.64 {d16}, [r0,:64], r1
        vst1.64 {d0},  [r0,:64], r1

        bx      lr
endfunc

function ff_h264_h_loop_filter_chroma_neon, export=1
        h264_loop_filter_start

        sub     r0, r0, #2
        vld1.32 {d18[0]}, [r0], r1
        vld1.32 {d16[0]}, [r0], r1
        vld1.32 {d0[0]},  [r0], r1
        vld1.32 {d2[0]},  [r0], r1
        vld1.32 {d18[1]}, [r0], r1
        vld1.32 {d16[1]}, [r0], r1
        vld1.32 {d0[1]},  [r0], r1
        vld1.32 {d2[1]},  [r0], r1

        vtrn.16 d18, d0
        vtrn.16 d16, d2
        vtrn.8  d18, d16
        vtrn.8  d0,  d2

        h264_loop_filter_chroma

        vtrn.16 d18, d0
        vtrn.16 d16, d2
        vtrn.8  d18, d16
        vtrn.8  d0,  d2

        sub     r0, r0, r1, lsl #3
        vst1.32 {d18[0]}, [r0], r1
        vst1.32 {d16[0]}, [r0], r1
        vst1.32 {d0[0]},  [r0], r1
        vst1.32 {d2[0]},  [r0], r1
        vst1.32 {d18[1]}, [r0], r1
        vst1.32 {d16[1]}, [r0], r1
        vst1.32 {d0[1]},  [r0], r1
        vst1.32 {d2[1]},  [r0], r1

        bx      lr
endfunc

/* H.264 qpel MC */
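
@ Luma quarter-pel interpolation uses the 6-tap filter
@   (1, -5, 20, 20, -5, 1), rounded as (sum + 16) >> 5.
@ lowpass_const packs the constants 5 and 20 into d6[0]/d6[1] as 16-bit
@ lanes, so lowpass_8 can form (a+f) + 20*(c+d) - 5*(b+e) with one
@ vaddl plus a vmla/vmls pair per 8-pixel row.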
.macro  lowpass_const r
        movw    \r, #5
        movt    \r, #20
        vmov.32 d6[0], \r
.endm

.macro  lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
.if \narrow
t0      .req q0
t1      .req q8
.else
t0      .req \d0
t1      .req \d1
.endif
        vext.8  d2, \r0, \r1, #2
        vext.8  d3, \r0, \r1, #3
        vaddl.u8 q1, d2, d3
        vext.8  d4, \r0, \r1, #1
        vext.8  d5, \r0, \r1, #4
        vaddl.u8 q2, d4, d5
        vext.8  d30, \r0, \r1, #5
        vaddl.u8 t0, \r0, d30
        vext.8  d18, \r2, \r3, #2
        vmla.i16 t0, q1, d6[1]
        vext.8  d19, \r2, \r3, #3
        vaddl.u8 q9, d18, d19
        vext.8  d20, \r2, \r3, #1
        vmls.i16 t0, q2, d6[0]
        vext.8  d21, \r2, \r3, #4
        vaddl.u8 q10, d20, d21
        vext.8  d31, \r2, \r3, #5
        vaddl.u8 t1, \r2, d31
        vmla.i16 t1, q9, d6[1]
        vmls.i16 t1, q10, d6[0]
.if \narrow
        vqrshrun.s16 \d0, t0, #5
        vqrshrun.s16 \d1, t1, #5
.endif
        .unreq t0
        .unreq t1
.endm

.macro  lowpass_8_1 r0, r1, d0, narrow=1
.if \narrow
t0      .req q0
.else
t0      .req \d0
.endif
        vext.8  d2, \r0, \r1, #2
        vext.8  d3, \r0, \r1, #3
        vaddl.u8 q1, d2, d3
        vext.8  d4, \r0, \r1, #1
        vext.8  d5, \r0, \r1, #4
        vaddl.u8 q2, d4, d5
        vext.8  d30, \r0, \r1, #5
        vaddl.u8 t0, \r0, d30
        vmla.i16 t0, q1, d6[1]
        vmls.i16 t0, q2, d6[0]
.if \narrow
        vqrshrun.s16 \d0, t0, #5
.endif
        .unreq t0
.endm
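
@ lowpass_8.16 is the second (vertical) pass of the 2-D filter: it
@ takes 16-bit intermediates from the horizontal pass, applies the same
@ 6-tap kernel in 32-bit arithmetic (20x built as 16x + 4x, 5x as
@ x + 4x, using shifts and adds) and narrows with a combined
@ (sum + 512) >> 10 rounding.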
.macro  lowpass_8.16 r0, r1, l0, h0, l1, h1, d
        vext.16 q1, \r0, \r1, #2
        vext.16 q0, \r0, \r1, #3
        vaddl.s16 q9, d2, d0
        vext.16 q2, \r0, \r1, #1
        vaddl.s16 q1, d3, d1
        vext.16 q3, \r0, \r1, #4
        vaddl.s16 q10, d4, d6
        vext.16 \r1, \r0, \r1, #5
        vaddl.s16 q2, d5, d7
        vaddl.s16 q0, \h0, \h1
        vaddl.s16 q8, \l0, \l1

        vshl.i32 q3,  q9,  #4
        vshl.i32 q9,  q9,  #2
        vshl.i32 q15, q10, #2
        vadd.i32 q9,  q9,  q3
        vadd.i32 q10, q10, q15

        vshl.i32 q3,  q1,  #4
        vshl.i32 q1,  q1,  #2
        vshl.i32 q15, q2,  #2
        vadd.i32 q1,  q1,  q3
        vadd.i32 q2,  q2,  q15

        vadd.i32 q9,  q9,  q8
        vsub.i32 q9,  q9,  q10

        vadd.i32 q1,  q1,  q0
        vsub.i32 q1,  q1,  q2

        vrshrn.s32 d18, q9, #10
        vrshrn.s32 d19, q1, #10

        vqmovun.s16 \d, q9
.endm

function put_h264_qpel16_h_lowpass_neon_packed
        mov     r4, lr
        mov     ip, #16
        mov     r3, #8
        bl      put_h264_qpel8_h_lowpass_neon
        sub     r1, r1, r2, lsl #4
        add     r1, r1, #8
        mov     ip, #16
        mov     lr, r4
        b       put_h264_qpel8_h_lowpass_neon
endfunc
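
@ The 16-wide helpers run the 8-wide routine twice: once for the left
@ half, then rewind the pointers by 16 rows, step 8 bytes right and
@ fall through (or branch) for the right half.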
.macro  h264_qpel_h_lowpass type
function \type\()_h264_qpel16_h_lowpass_neon
        push    {lr}
        mov     ip, #16
        bl      \type\()_h264_qpel8_h_lowpass_neon
        sub     r0, r0, r3, lsl #4
        sub     r1, r1, r2, lsl #4
        add     r0, r0, #8
        add     r1, r1, #8
        mov     ip, #16
        pop     {lr}
endfunc

function \type\()_h264_qpel8_h_lowpass_neon
1:      vld1.64 {d0, d1},  [r1], r2
        vld1.64 {d16,d17}, [r1], r2
        subs    ip, ip, #2
        lowpass_8 d0, d1, d16, d17, d0, d16
.ifc \type,avg
        vld1.8  {d2}, [r0,:64], r3
        vrhadd.u8 d0, d0, d2
        vld1.8  {d3}, [r0,:64]
        vrhadd.u8 d16, d16, d3
        sub     r0, r0, r3
.endif
        vst1.64 {d0},  [r0,:64], r3
        vst1.64 {d16}, [r0,:64], r3
        bne     1b
        bx      lr
endfunc
.endm

        h264_qpel_h_lowpass put
        h264_qpel_h_lowpass avg

.macro  h264_qpel_h_lowpass_l2 type
function \type\()_h264_qpel16_h_lowpass_l2_neon
        push    {lr}
        mov     ip, #16
        bl      \type\()_h264_qpel8_h_lowpass_l2_neon
        sub     r0, r0, r2, lsl #4
        sub     r1, r1, r2, lsl #4
        sub     r3, r3, r2, lsl #4
        add     r0, r0, #8
        add     r1, r1, #8
        add     r3, r3, #8
        mov     ip, #16
        pop     {lr}
endfunc

function \type\()_h264_qpel8_h_lowpass_l2_neon
1:      vld1.64 {d0, d1},  [r1], r2
        vld1.64 {d16,d17}, [r1], r2
        vld1.64 {d28},     [r3], r2
        vld1.64 {d29},     [r3], r2
        subs    ip, ip, #2
        lowpass_8 d0, d1, d16, d17, d0, d1
        vrhadd.u8 q0, q0, q14
.ifc \type,avg
        vld1.8  {d2}, [r0,:64], r2
        vrhadd.u8 d0, d0, d2
        vld1.8  {d3}, [r0,:64]
        vrhadd.u8 d1, d1, d3
        sub     r0, r0, r2
.endif
        vst1.64 {d0}, [r0,:64], r2
        vst1.64 {d1}, [r0,:64], r2
        bne     1b
        bx      lr
endfunc
.endm

        h264_qpel_h_lowpass_l2 put
        h264_qpel_h_lowpass_l2 avg

function put_h264_qpel16_v_lowpass_neon_packed
        mov     r4, lr
        mov     r2, #8
        bl      put_h264_qpel8_v_lowpass_neon
        sub     r1, r1, r3, lsl #2
        bl      put_h264_qpel8_v_lowpass_neon
        sub     r1, r1, r3, lsl #4
        sub     r1, r1, r3, lsl #2
        add     r1, r1, #8
        bl      put_h264_qpel8_v_lowpass_neon
        sub     r1, r1, r3, lsl #2
        mov     lr, r4
        b       put_h264_qpel8_v_lowpass_neon
endfunc

.macro  h264_qpel_v_lowpass type
function \type\()_h264_qpel16_v_lowpass_neon
        mov     r4, lr
        bl      \type\()_h264_qpel8_v_lowpass_neon
        sub     r1, r1, r3, lsl #2
        bl      \type\()_h264_qpel8_v_lowpass_neon
        sub     r0, r0, r2, lsl #4
        add     r0, r0, #8
        sub     r1, r1, r3, lsl #4
        sub     r1, r1, r3, lsl #2
        add     r1, r1, #8
        bl      \type\()_h264_qpel8_v_lowpass_neon
        sub     r1, r1, r3, lsl #2
        mov     lr, r4
endfunc

function \type\()_h264_qpel8_v_lowpass_neon
        vld1.64 {d8},  [r1], r3
        vld1.64 {d10}, [r1], r3
        vld1.64 {d12}, [r1], r3
        vld1.64 {d14}, [r1], r3
        vld1.64 {d22}, [r1], r3
        vld1.64 {d24}, [r1], r3
        vld1.64 {d26}, [r1], r3
        vld1.64 {d28}, [r1], r3
        vld1.64 {d9},  [r1], r3
        vld1.64 {d11}, [r1], r3
        vld1.64 {d13}, [r1], r3
        vld1.64 {d15}, [r1], r3
        vld1.64 {d23}, [r1]

        transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
        lowpass_8 d8,  d9,  d10, d11, d8,  d10
        lowpass_8 d12, d13, d14, d15, d12, d14
        lowpass_8 d22, d23, d24, d25, d22, d24
        lowpass_8 d26, d27, d28, d29, d26, d28
        transpose_8x8 d8, d10, d12, d14, d22, d24, d26, d28

.ifc \type,avg
        vld1.8  {d9},  [r0,:64], r2
        vrhadd.u8 d8, d8, d9
        vld1.8  {d11}, [r0,:64], r2
        vrhadd.u8 d10, d10, d11
        vld1.8  {d13}, [r0,:64], r2
        vrhadd.u8 d12, d12, d13
        vld1.8  {d15}, [r0,:64], r2
        vrhadd.u8 d14, d14, d15
        vld1.8  {d23}, [r0,:64], r2
        vrhadd.u8 d22, d22, d23
        vld1.8  {d25}, [r0,:64], r2
        vrhadd.u8 d24, d24, d25
        vld1.8  {d27}, [r0,:64], r2
        vrhadd.u8 d26, d26, d27
        vld1.8  {d29}, [r0,:64], r2
        vrhadd.u8 d28, d28, d29
        sub     r0, r0, r2, lsl #3
.endif

        vst1.64 {d8},  [r0,:64], r2
        vst1.64 {d10}, [r0,:64], r2
        vst1.64 {d12}, [r0,:64], r2
        vst1.64 {d14}, [r0,:64], r2
        vst1.64 {d22}, [r0,:64], r2
        vst1.64 {d24}, [r0,:64], r2
        vst1.64 {d26}, [r0,:64], r2
        vst1.64 {d28}, [r0,:64], r2

        bx      lr
endfunc
.endm

        h264_qpel_v_lowpass put
        h264_qpel_v_lowpass avg

.macro  h264_qpel_v_lowpass_l2 type
function \type\()_h264_qpel16_v_lowpass_l2_neon
        mov     r4, lr
        bl      \type\()_h264_qpel8_v_lowpass_l2_neon
        sub     r1, r1, r3, lsl #2
        bl      \type\()_h264_qpel8_v_lowpass_l2_neon
        sub     r0, r0, r3, lsl #4
        sub     ip, ip, r2, lsl #4
        add     r0, r0, #8
        add     ip, ip, #8
        sub     r1, r1, r3, lsl #4
        sub     r1, r1, r3, lsl #2
        add     r1, r1, #8
        bl      \type\()_h264_qpel8_v_lowpass_l2_neon
        sub     r1, r1, r3, lsl #2
        mov     lr, r4
endfunc

function \type\()_h264_qpel8_v_lowpass_l2_neon
        vld1.64 {d8},  [r1], r3
        vld1.64 {d10}, [r1], r3
        vld1.64 {d12}, [r1], r3
        vld1.64 {d14}, [r1], r3
        vld1.64 {d22}, [r1], r3
        vld1.64 {d24}, [r1], r3
        vld1.64 {d26}, [r1], r3
        vld1.64 {d28}, [r1], r3
        vld1.64 {d9},  [r1], r3
        vld1.64 {d11}, [r1], r3
        vld1.64 {d13}, [r1], r3
        vld1.64 {d15}, [r1], r3
        vld1.64 {d23}, [r1]

        transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
        lowpass_8 d8,  d9,  d10, d11, d8,  d9
        lowpass_8 d12, d13, d14, d15, d12, d13
        lowpass_8 d22, d23, d24, d25, d22, d23
        lowpass_8 d26, d27, d28, d29, d26, d27
        transpose_8x8 d8, d9, d12, d13, d22, d23, d26, d27

        vld1.64 {d0},  [ip], r2
        vld1.64 {d1},  [ip], r2
        vld1.64 {d2},  [ip], r2
        vld1.64 {d3},  [ip], r2
        vld1.64 {d4},  [ip], r2
        vrhadd.u8 q0, q0, q4
        vld1.64 {d5},  [ip], r2
        vrhadd.u8 q1, q1, q6
        vld1.64 {d10}, [ip], r2
        vrhadd.u8 q2, q2, q11
        vld1.64 {d11}, [ip], r2
        vrhadd.u8 q5, q5, q13

.ifc \type,avg
        vld1.8  {d16}, [r0,:64], r3
        vrhadd.u8 d0, d0, d16
        vld1.8  {d17}, [r0,:64], r3
        vrhadd.u8 d1, d1, d17
        vld1.8  {d16}, [r0,:64], r3
        vrhadd.u8 d2, d2, d16
        vld1.8  {d17}, [r0,:64], r3
        vrhadd.u8 d3, d3, d17
        vld1.8  {d16}, [r0,:64], r3
        vrhadd.u8 d4, d4, d16
        vld1.8  {d17}, [r0,:64], r3
        vrhadd.u8 d5, d5, d17
        vld1.8  {d16}, [r0,:64], r3
        vrhadd.u8 d10, d10, d16
        vld1.8  {d17}, [r0,:64], r3
        vrhadd.u8 d11, d11, d17
        sub     r0, r0, r3, lsl #3
.endif

        vst1.64 {d0},  [r0,:64], r3
        vst1.64 {d1},  [r0,:64], r3
        vst1.64 {d2},  [r0,:64], r3
        vst1.64 {d3},  [r0,:64], r3
        vst1.64 {d4},  [r0,:64], r3
        vst1.64 {d5},  [r0,:64], r3
        vst1.64 {d10}, [r0,:64], r3
        vst1.64 {d11}, [r0,:64], r3

        bx      lr
endfunc
.endm

        h264_qpel_v_lowpass_l2 put
        h264_qpel_v_lowpass_l2 avg
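
@ The 2-D (hv) case first writes horizontally filtered 16-bit rows into
@ a scratch buffer at r4, then reloads, transposes and runs the 32-bit
@ vertical pass (lowpass_8.16) over it; the _l2 variants additionally
@ vrhadd the result with a second prediction before storing.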
function put_h264_qpel8_hv_lowpass_neon_top
        lowpass_const ip
        mov     ip, #12
1:      vld1.64 {d0, d1},  [r1], r3
        vld1.64 {d16,d17}, [r1], r3
        subs    ip, ip, #2
        lowpass_8 d0, d1, d16, d17, q11, q12, narrow=0
        vst1.64 {d22-d25}, [r4,:128]!
        bne     1b

        vld1.64 {d0, d1},  [r1]
        lowpass_8_1 d0, d1, q12, narrow=0

        mov     ip, #-16
        add     r4, r4, ip
        vld1.64 {d30,d31}, [r4,:128], ip
        vld1.64 {d20,d21}, [r4,:128], ip
        vld1.64 {d18,d19}, [r4,:128], ip
        vld1.64 {d16,d17}, [r4,:128], ip
        vld1.64 {d14,d15}, [r4,:128], ip
        vld1.64 {d12,d13}, [r4,:128], ip
        vld1.64 {d10,d11}, [r4,:128], ip
        vld1.64 {d8, d9},  [r4,:128], ip
        vld1.64 {d6, d7},  [r4,:128], ip
        vld1.64 {d4, d5},  [r4,:128], ip
        vld1.64 {d2, d3},  [r4,:128], ip
        vld1.64 {d0, d1},  [r4,:128]

        swap4   d1, d3, d5, d7, d8, d10, d12, d14
        transpose16_4x4 q0, q1, q2, q3, q4, q5, q6, q7

        swap4   d17, d19, d21, d31, d24, d26, d28, d22
        transpose16_4x4 q8, q9, q10, q15, q12, q13, q14, q11

        vst1.64 {d30,d31}, [r4,:128]!
        vst1.64 {d6, d7},  [r4,:128]!
        vst1.64 {d20,d21}, [r4,:128]!
        vst1.64 {d4, d5},  [r4,:128]!
        vst1.64 {d18,d19}, [r4,:128]!
        vst1.64 {d2, d3},  [r4,:128]!
        vst1.64 {d16,d17}, [r4,:128]!
        vst1.64 {d0, d1},  [r4,:128]

        lowpass_8.16 q4, q12, d8,  d9,  d24, d25, d8
        lowpass_8.16 q5, q13, d10, d11, d26, d27, d9
        lowpass_8.16 q6, q14, d12, d13, d28, d29, d10
        lowpass_8.16 q7, q11, d14, d15, d22, d23, d11

        vld1.64 {d16,d17}, [r4,:128], ip
        vld1.64 {d30,d31}, [r4,:128], ip
        lowpass_8.16 q8, q15, d16, d17, d30, d31, d12

        vld1.64 {d16,d17}, [r4,:128], ip
        vld1.64 {d30,d31}, [r4,:128], ip
        lowpass_8.16 q8, q15, d16, d17, d30, d31, d13

        vld1.64 {d16,d17}, [r4,:128], ip
        vld1.64 {d30,d31}, [r4,:128], ip
        lowpass_8.16 q8, q15, d16, d17, d30, d31, d14

        vld1.64 {d16,d17}, [r4,:128], ip
        vld1.64 {d30,d31}, [r4,:128]
        lowpass_8.16 q8, q15, d16, d17, d30, d31, d15

        transpose_8x8 d12, d13, d14, d15, d8, d9, d10, d11

        bx      lr
endfunc

.macro  h264_qpel8_hv_lowpass type
function \type\()_h264_qpel8_hv_lowpass_neon
        mov     r10, lr
        bl      put_h264_qpel8_hv_lowpass_neon_top
.ifc \type,avg
        vld1.8  {d0}, [r0,:64], r2
        vrhadd.u8 d12, d12, d0
        vld1.8  {d1}, [r0,:64], r2
        vrhadd.u8 d13, d13, d1
        vld1.8  {d2}, [r0,:64], r2
        vrhadd.u8 d14, d14, d2
        vld1.8  {d3}, [r0,:64], r2
        vrhadd.u8 d15, d15, d3
        vld1.8  {d4}, [r0,:64], r2
        vrhadd.u8 d8, d8, d4
        vld1.8  {d5}, [r0,:64], r2
        vrhadd.u8 d9, d9, d5
        vld1.8  {d6}, [r0,:64], r2
        vrhadd.u8 d10, d10, d6
        vld1.8  {d7}, [r0,:64], r2
        vrhadd.u8 d11, d11, d7
        sub     r0, r0, r2, lsl #3
.endif

        vst1.64 {d12}, [r0,:64], r2
        vst1.64 {d13}, [r0,:64], r2
        vst1.64 {d14}, [r0,:64], r2
        vst1.64 {d15}, [r0,:64], r2
        vst1.64 {d8},  [r0,:64], r2
        vst1.64 {d9},  [r0,:64], r2
        vst1.64 {d10}, [r0,:64], r2
        vst1.64 {d11}, [r0,:64], r2

        mov     lr, r10
        bx      lr
endfunc
.endm

        h264_qpel8_hv_lowpass put
        h264_qpel8_hv_lowpass avg

.macro  h264_qpel8_hv_lowpass_l2 type
function \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov     r10, lr
        bl      put_h264_qpel8_hv_lowpass_neon_top

        vld1.64 {d0, d1},  [r2,:128]!
        vld1.64 {d2, d3},  [r2,:128]!
        vrhadd.u8 q0, q0, q6
        vld1.64 {d4, d5},  [r2,:128]!
        vrhadd.u8 q1, q1, q7
        vld1.64 {d6, d7},  [r2,:128]!
        vrhadd.u8 q2, q2, q4
        vrhadd.u8 q3, q3, q5
.ifc \type,avg
        vld1.8  {d16}, [r0,:64], r3
        vrhadd.u8 d0, d0, d16
        vld1.8  {d17}, [r0,:64], r3
        vrhadd.u8 d1, d1, d17
        vld1.8  {d18}, [r0,:64], r3
        vrhadd.u8 d2, d2, d18
        vld1.8  {d19}, [r0,:64], r3
        vrhadd.u8 d3, d3, d19
        vld1.8  {d20}, [r0,:64], r3
        vrhadd.u8 d4, d4, d20
        vld1.8  {d21}, [r0,:64], r3
        vrhadd.u8 d5, d5, d21
        vld1.8  {d22}, [r0,:64], r3
        vrhadd.u8 d6, d6, d22
        vld1.8  {d23}, [r0,:64], r3
        vrhadd.u8 d7, d7, d23
        sub     r0, r0, r3, lsl #3
.endif
        vst1.64 {d0}, [r0,:64], r3
        vst1.64 {d1}, [r0,:64], r3
        vst1.64 {d2}, [r0,:64], r3
        vst1.64 {d3}, [r0,:64], r3
        vst1.64 {d4}, [r0,:64], r3
        vst1.64 {d5}, [r0,:64], r3
        vst1.64 {d6}, [r0,:64], r3
        vst1.64 {d7}, [r0,:64], r3

        mov     lr, r10
        bx      lr
endfunc
.endm

        h264_qpel8_hv_lowpass_l2 put
        h264_qpel8_hv_lowpass_l2 avg

.macro  h264_qpel16_hv type
function \type\()_h264_qpel16_hv_lowpass_neon
        mov     r9, lr
        bl      \type\()_h264_qpel8_hv_lowpass_neon
        sub     r1, r1, r3, lsl #2
        bl      \type\()_h264_qpel8_hv_lowpass_neon
        sub     r1, r1, r3, lsl #4
        sub     r1, r1, r3, lsl #2
        add     r1, r1, #8
        sub     r0, r0, r2, lsl #4
        add     r0, r0, #8
        bl      \type\()_h264_qpel8_hv_lowpass_neon
        sub     r1, r1, r3, lsl #2
        mov     lr, r9
        b       \type\()_h264_qpel8_hv_lowpass_neon
endfunc

function \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov     r9, lr
        sub     r2, r4, #256
        bl      \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub     r1, r1, r3, lsl #2
        bl      \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub     r1, r1, r3, lsl #4
        sub     r1, r1, r3, lsl #2
        add     r1, r1, #8
        sub     r0, r0, r3, lsl #4
        add     r0, r0, #8
        bl      \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub     r1, r1, r3, lsl #2
        mov     lr, r9
        b       \type\()_h264_qpel8_hv_lowpass_l2_neon
endfunc
.endm

        h264_qpel16_hv put
        h264_qpel16_hv avg
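
@ The public mcXY entry points map the 16 quarter-pel positions onto
@ the helpers above: mc20/mc02 are pure h/v lowpass; mc10/mc30 and
@ mc01/mc03 average the lowpass output with the nearest integer-pel
@ column or row (the _l2 routines); mc22 is the pure hv pass; the
@ remaining mixed cases run a put h or v pass into a stack buffer and
@ then average it in via an _l2 routine.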
.macro  h264_qpel8 type
function ff_\type\()_h264_qpel8_mc10_neon, export=1
        lowpass_const r3
        mov     r3, r1
        sub     r1, r1, #2
        mov     ip, #8
        b       \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

function ff_\type\()_h264_qpel8_mc20_neon, export=1
        lowpass_const r3
        sub     r1, r1, #2
        mov     r3, r2
        mov     ip, #8
        b       \type\()_h264_qpel8_h_lowpass_neon
endfunc

function ff_\type\()_h264_qpel8_mc30_neon, export=1
        lowpass_const r3
        add     r3, r1, #1
        sub     r1, r1, #2
        mov     ip, #8
        b       \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

function ff_\type\()_h264_qpel8_mc01_neon, export=1
        push    {lr}
        mov     ip, r1
\type\()_h264_qpel8_mc01:
        lowpass_const r3
        mov     r3, r2
        sub     r1, r1, r2, lsl #1
        vpush   {d8-d15}
        bl      \type\()_h264_qpel8_v_lowpass_l2_neon
        vpop    {d8-d15}
        pop     {pc}
endfunc

function ff_\type\()_h264_qpel8_mc11_neon, export=1
        push    {r0, r1, r11, lr}
\type\()_h264_qpel8_mc11:
        lowpass_const r3
        mov     r11, sp
A       bic     sp,  sp,  #15
T       bic     r0,  r11, #15
T       mov     sp,  r0
        sub     sp,  sp,  #64
        mov     r0,  sp
        sub     r1,  r1,  #2
        mov     r3,  #8
        mov     ip,  #8
        vpush   {d8-d15}
        bl      put_h264_qpel8_h_lowpass_neon
        ldrd    r0,  [r11], #8
        mov     r3,  r2
        add     ip,  sp,  #64
        sub     r1,  r1,  r2,  lsl #1
        mov     r2,  #8
        bl      \type\()_h264_qpel8_v_lowpass_l2_neon
        vpop    {d8-d15}
        mov     sp,  r11
        pop     {r11, pc}
endfunc

function ff_\type\()_h264_qpel8_mc21_neon, export=1
        push    {r0, r1, r4, r10, r11, lr}
\type\()_h264_qpel8_mc21:
        lowpass_const r3
        mov     r11, sp
A       bic     sp,  sp,  #15
T       bic     r0,  r11, #15
T       mov     sp,  r0
        sub     sp,  sp,  #(8*8+16*12)
        sub     r1,  r1,  #2
        mov     r3,  #8
        mov     r0,  sp
        mov     ip,  #8
        vpush   {d8-d15}
        bl      put_h264_qpel8_h_lowpass_neon
        mov     r4,  r0
        ldrd    r0,  [r11], #8
        sub     r1,  r1,  r2,  lsl #1
        sub     r1,  r1,  #2
        mov     r3,  r2
        sub     r2,  r4,  #64
        bl      \type\()_h264_qpel8_hv_lowpass_l2_neon
        vpop    {d8-d15}
        mov     sp,  r11
        pop     {r4, r10, r11, pc}
endfunc

function ff_\type\()_h264_qpel8_mc31_neon, export=1
        add     r1, r1, #1
        push    {r0, r1, r11, lr}
        sub     r1, r1, #1
        b       \type\()_h264_qpel8_mc11
endfunc

function ff_\type\()_h264_qpel8_mc02_neon, export=1
        push    {lr}
        lowpass_const r3
        sub     r1, r1, r2, lsl #1
        mov     r3, r2
        vpush   {d8-d15}
        bl      \type\()_h264_qpel8_v_lowpass_neon
        vpop    {d8-d15}
        pop     {pc}
endfunc

function ff_\type\()_h264_qpel8_mc12_neon, export=1
        push    {r0, r1, r4, r10, r11, lr}
\type\()_h264_qpel8_mc12:
        lowpass_const r3
        mov     r11, sp
A       bic     sp,  sp,  #15
T       bic     r0,  r11, #15
T       mov     sp,  r0
        sub     sp,  sp,  #(8*8+16*12)
        sub     r1,  r1,  r2,  lsl #1
        mov     r3,  r2
        mov     r2,  #8
        mov     r0,  sp
        vpush   {d8-d15}
        bl      put_h264_qpel8_v_lowpass_neon
        mov     r4,  r0
        ldrd    r0,  [r11], #8
        sub     r1,  r1,  r3,  lsl #1
        sub     r1,  r1,  #2
        sub     r2,  r4,  #64
        bl      \type\()_h264_qpel8_hv_lowpass_l2_neon
        vpop    {d8-d15}
        mov     sp,  r11
        pop     {r4, r10, r11, pc}
endfunc

function ff_\type\()_h264_qpel8_mc22_neon, export=1
        push    {r4, r10, r11, lr}
        mov     r11, sp
A       bic     sp,  sp,  #15
T       bic     r4,  r11, #15
T       mov     sp,  r4
        sub     r1,  r1,  r2,  lsl #1
        sub     r1,  r1,  #2
        mov     r3,  r2
        sub     sp,  sp,  #(16*12)
        mov     r4,  sp
        vpush   {d8-d15}
        bl      \type\()_h264_qpel8_hv_lowpass_neon
        vpop    {d8-d15}
        mov     sp,  r11
        pop     {r4, r10, r11, pc}
endfunc

function ff_\type\()_h264_qpel8_mc32_neon, export=1
        push    {r0, r1, r4, r10, r11, lr}
        add     r1, r1, #1
        b       \type\()_h264_qpel8_mc12
endfunc

function ff_\type\()_h264_qpel8_mc03_neon, export=1
        push    {lr}
        add     ip, r1, r2
        b       \type\()_h264_qpel8_mc01
endfunc

function ff_\type\()_h264_qpel8_mc13_neon, export=1
        push    {r0, r1, r11, lr}
        add     r1, r1, r2
        b       \type\()_h264_qpel8_mc11
endfunc

function ff_\type\()_h264_qpel8_mc23_neon, export=1
        push    {r0, r1, r4, r10, r11, lr}
        add     r1, r1, r2
        b       \type\()_h264_qpel8_mc21
endfunc

function ff_\type\()_h264_qpel8_mc33_neon, export=1
        add     r1, r1, #1
        push    {r0, r1, r11, lr}
        add     r1, r1, r2
        sub     r1, r1, #1
        b       \type\()_h264_qpel8_mc11
endfunc
.endm

        h264_qpel8 put
        h264_qpel8 avg

.macro  h264_qpel16 type
function ff_\type\()_h264_qpel16_mc10_neon, export=1
        lowpass_const r3
        mov     r3, r1
        sub     r1, r1, #2
        b       \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc

function ff_\type\()_h264_qpel16_mc20_neon, export=1
        lowpass_const r3
        sub     r1, r1, #2
        mov     r3, r2
        b       \type\()_h264_qpel16_h_lowpass_neon
endfunc

function ff_\type\()_h264_qpel16_mc30_neon, export=1
        lowpass_const r3
        add     r3, r1, #1
        sub     r1, r1, #2
        b       \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc

function ff_\type\()_h264_qpel16_mc01_neon, export=1
        push    {r4, lr}
        mov     ip, r1
\type\()_h264_qpel16_mc01:
        lowpass_const r3
        mov     r3, r2
        sub     r1, r1, r2, lsl #1
        vpush   {d8-d15}
        bl      \type\()_h264_qpel16_v_lowpass_l2_neon
        vpop    {d8-d15}
        pop     {r4, pc}
endfunc

function ff_\type\()_h264_qpel16_mc11_neon, export=1
        push    {r0, r1, r4, r11, lr}
\type\()_h264_qpel16_mc11:
        lowpass_const r3
        mov     r11, sp
A       bic     sp,  sp,  #15
T       bic     r0,  r11, #15
T       mov     sp,  r0
        sub     sp,  sp,  #256
        mov     r0,  sp
        sub     r1,  r1,  #2
        mov     r3,  #16
        vpush   {d8-d15}
        bl      put_h264_qpel16_h_lowpass_neon
        ldrd    r0,  [r11], #8
        mov     r3,  r2
        add     ip,  sp,  #64
        sub     r1,  r1,  r2,  lsl #1
        mov     r2,  #16
        bl      \type\()_h264_qpel16_v_lowpass_l2_neon
        vpop    {d8-d15}
        mov     sp,  r11
        pop     {r4, r11, pc}
endfunc

function ff_\type\()_h264_qpel16_mc21_neon, export=1
        push    {r0, r1, r4-r5, r9-r11, lr}
\type\()_h264_qpel16_mc21:
        lowpass_const r3
        mov     r11, sp
A       bic     sp,  sp,  #15
T       bic     r0,  r11, #15
T       mov     sp,  r0
        sub     sp,  sp,  #(16*16+16*12)
        sub     r1,  r1,  #2
        mov     r0,  sp
        vpush   {d8-d15}
        bl      put_h264_qpel16_h_lowpass_neon_packed
        mov     r4,  r0
        ldrd    r0,  [r11], #8
        sub     r1,  r1,  r2,  lsl #1
        sub     r1,  r1,  #2
        mov     r3,  r2
        bl      \type\()_h264_qpel16_hv_lowpass_l2_neon
        vpop    {d8-d15}
        mov     sp,  r11
        pop     {r4-r5, r9-r11, pc}
endfunc

function ff_\type\()_h264_qpel16_mc31_neon, export=1
        add     r1, r1, #1
        push    {r0, r1, r4, r11, lr}
        sub     r1, r1, #1
        b       \type\()_h264_qpel16_mc11
endfunc

function ff_\type\()_h264_qpel16_mc02_neon, export=1
        push    {r4, lr}
        lowpass_const r3
        sub     r1, r1, r2, lsl #1
        mov     r3, r2
        vpush   {d8-d15}
        bl      \type\()_h264_qpel16_v_lowpass_neon
        vpop    {d8-d15}
        pop     {r4, pc}
endfunc

function ff_\type\()_h264_qpel16_mc12_neon, export=1
        push    {r0, r1, r4-r5, r9-r11, lr}
\type\()_h264_qpel16_mc12:
        lowpass_const r3
        mov     r11, sp
A       bic     sp,  sp,  #15
T       bic     r0,  r11, #15
T       mov     sp,  r0
        sub     sp,  sp,  #(16*16+16*12)
        sub     r1,  r1,  r2,  lsl #1
        mov     r0,  sp
        mov     r3,  r2
        vpush   {d8-d15}
        bl      put_h264_qpel16_v_lowpass_neon_packed
        mov     r4,  r0
        ldrd    r0,  [r11], #8
        sub     r1,  r1,  r3,  lsl #1
        sub     r1,  r1,  #2
        mov     r2,  r3
        bl      \type\()_h264_qpel16_hv_lowpass_l2_neon
        vpop    {d8-d15}
        mov     sp,  r11
        pop     {r4-r5, r9-r11, pc}
endfunc

function ff_\type\()_h264_qpel16_mc22_neon, export=1
        push    {r4, r9-r11, lr}
        lowpass_const r3
        mov     r11, sp
A       bic     sp,  sp,  #15
T       bic     r4,  r11, #15
T       mov     sp,  r4
        sub     r1,  r1,  r2,  lsl #1
        sub     r1,  r1,  #2
        mov     r3,  r2
        sub     sp,  sp,  #(16*12)
        mov     r4,  sp
        vpush   {d8-d15}
        bl      \type\()_h264_qpel16_hv_lowpass_neon
        vpop    {d8-d15}
        mov     sp,  r11
        pop     {r4, r9-r11, pc}
endfunc

function ff_\type\()_h264_qpel16_mc32_neon, export=1
        push    {r0, r1, r4-r5, r9-r11, lr}
        add     r1, r1, #1
        b       \type\()_h264_qpel16_mc12
endfunc

function ff_\type\()_h264_qpel16_mc03_neon, export=1
        push    {r4, lr}
        add     ip, r1, r2
        b       \type\()_h264_qpel16_mc01
endfunc

function ff_\type\()_h264_qpel16_mc13_neon, export=1
        push    {r0, r1, r4, r11, lr}
        add     r1, r1, r2
        b       \type\()_h264_qpel16_mc11
endfunc

function ff_\type\()_h264_qpel16_mc23_neon, export=1
        push    {r0, r1, r4-r5, r9-r11, lr}
        add     r1, r1, r2
        b       \type\()_h264_qpel16_mc21
endfunc

function ff_\type\()_h264_qpel16_mc33_neon, export=1
        add     r1, r1, #1
        push    {r0, r1, r4, r11, lr}
        add     r1, r1, r2
        sub     r1, r1, #1
        b       \type\()_h264_qpel16_mc11
endfunc
.endm

        h264_qpel16 put
        h264_qpel16 avg

@ Biweighted prediction
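@
@ Bidirectional weighting computes, per the spec,
@   dst = clip((dst*w0 + src*w1 + (((offset + 1) | 1) << logWD)) >> (logWD + 1))
@ where logWD is the weight denominator.  biweight_func folds the
@ rounding and offset into q8, keeps -(logWD + 1) in q9 for the vshl
@ right shift, and branches to a variant that swaps vmlal for vmlsl on
@ whichever operand has a negative weight.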
.macro  biweight_16 macs, macd
        vdup.8  d0, r4
        vdup.8  d1, r5
        vmov    q2, q8
        vmov    q3, q8
1:      subs    r3, r3, #2
        vld1.8  {d20-d21},[r0,:128], r2
        \macd   q2, d0, d20
        pld     [r0]
        \macd   q3, d0, d21
        vld1.8  {d22-d23},[r1,:128], r2
        \macs   q2, d1, d22
        pld     [r1]
        \macs   q3, d1, d23
        vmov    q12, q8
        vld1.8  {d28-d29},[r0,:128], r2
        vmov    q13, q8
        \macd   q12, d0, d28
        pld     [r0]
        \macd   q13, d0, d29
        vld1.8  {d30-d31},[r1,:128], r2
        \macs   q12, d1, d30
        pld     [r1]
        \macs   q13, d1, d31
        vshl.s16 q2, q2, q9
        vshl.s16 q3, q3, q9
        vqmovun.s16 d4, q2
        vqmovun.s16 d5, q3
        vshl.s16 q12, q12, q9
        vshl.s16 q13, q13, q9
        vqmovun.s16 d24, q12
        vqmovun.s16 d25, q13
        vmov    q3, q8
        vst1.8  {d4- d5}, [r6,:128], r2
        vmov    q2, q8
        vst1.8  {d24-d25},[r6,:128], r2
        bne     1b
        pop     {r4-r6, pc}
.endm

.macro  biweight_8 macs, macd
        vdup.8  d0, r4
        vdup.8  d1, r5
        vmov    q1, q8
        vmov    q10, q8
1:      subs    r3, r3, #2
        vld1.8  {d4},[r0,:64], r2
        \macd   q1, d0, d4
        pld     [r0]
        vld1.8  {d5},[r1,:64], r2
        \macs   q1, d1, d5
        pld     [r1]
        vld1.8  {d6},[r0,:64], r2
        \macd   q10, d0, d6
        pld     [r0]
        vld1.8  {d7},[r1,:64], r2
        \macs   q10, d1, d7
        pld     [r1]
        vshl.s16 q1, q1, q9
        vqmovun.s16 d2, q1
        vshl.s16 q10, q10, q9
        vqmovun.s16 d4, q10
        vmov    q10, q8
        vst1.8  {d2},[r6,:64], r2
        vmov    q1, q8
        vst1.8  {d4},[r6,:64], r2
        bne     1b
        pop     {r4-r6, pc}
.endm

.macro  biweight_4 macs, macd
        vdup.8  d0, r4
        vdup.8  d1, r5
        vmov    q1, q8
        vmov    q10, q8
1:      subs    r3, r3, #4
        vld1.32 {d4[0]},[r0,:32], r2
        vld1.32 {d4[1]},[r0,:32], r2
        \macd   q1, d0, d4
        pld     [r0]
        vld1.32 {d5[0]},[r1,:32], r2
        vld1.32 {d5[1]},[r1,:32], r2
        \macs   q1, d1, d5
        pld     [r1]
        blt     2f
        vld1.32 {d6[0]},[r0,:32], r2
        vld1.32 {d6[1]},[r0,:32], r2
        \macd   q10, d0, d6
        pld     [r0]
        vld1.32 {d7[0]},[r1,:32], r2
        vld1.32 {d7[1]},[r1,:32], r2
        \macs   q10, d1, d7
        pld     [r1]
        vshl.s16 q1, q1, q9
        vqmovun.s16 d2, q1
        vshl.s16 q10, q10, q9
        vqmovun.s16 d4, q10
        vmov    q10, q8
        vst1.32 {d2[0]},[r6,:32], r2
        vst1.32 {d2[1]},[r6,:32], r2
        vmov    q1, q8
        vst1.32 {d4[0]},[r6,:32], r2
        vst1.32 {d4[1]},[r6,:32], r2
        bne     1b
        pop     {r4-r6, pc}
2:      vshl.s16 q1, q1, q9
        vqmovun.s16 d2, q1
        vst1.32 {d2[0]},[r6,:32], r2
        vst1.32 {d2[1]},[r6,:32], r2
        pop     {r4-r6, pc}
.endm

.macro  biweight_func w
function ff_biweight_h264_pixels_\w\()_neon, export=1
        push    {r4-r6, lr}
        ldr     r12, [sp, #16]
        add     r4, sp, #20
        ldm     r4, {r4-r6}
        lsr     lr, r4, #31
        add     r6, r6, #1
        eors    lr, lr, r5, lsr #30
        orr     r6, r6, #1
        vdup.16 q9, r12
        lsl     r6, r6, r12
        vmvn    q9, q9
        vdup.16 q8, r6
        mov     r6, r0
        beq     10f
        subs    lr, lr, #1
        beq     20f
        subs    lr, lr, #1
        beq     30f
        b       40f
10:     biweight_\w vmlal.u8, vmlal.u8
20:     rsb     r4, r4, #0
        biweight_\w vmlal.u8, vmlsl.u8
30:     rsb     r4, r4, #0
        rsb     r5, r5, #0
        biweight_\w vmlsl.u8, vmlsl.u8
40:     rsb     r5, r5, #0
        biweight_\w vmlsl.u8, vmlal.u8
endfunc
.endm

        biweight_func 16
        biweight_func 8
        biweight_func 4

@ Weighted prediction
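@
@ Unidirectional weighting:
@   dst = clip(((src*w + 2^(logWD-1)) >> logWD) + offset)
@ implemented here by folding offset << logWD into the sum before a
@ single rounding right shift; for logWD > 1 a halving add/sub
@ (vhadd/vhsub) plus a vrshl by (1 - logWD) keeps the 16-bit sums in
@ range, while negative weights pick the subtracting variants.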
.macro  weight_16 add
        vdup.8  d0, r12
1:      subs    r2, r2, #2
        vld1.8  {d20-d21},[r0,:128], r1
        vmull.u8 q2, d0, d20
        pld     [r0]
        vmull.u8 q3, d0, d21
        vld1.8  {d28-d29},[r0,:128], r1
        vmull.u8 q12, d0, d28
        pld     [r0]
        vmull.u8 q13, d0, d29
        \add    q2, q8, q2
        vrshl.s16 q2, q2, q9
        \add    q3, q8, q3
        vrshl.s16 q3, q3, q9
        vqmovun.s16 d4, q2
        vqmovun.s16 d5, q3
        \add    q12, q8, q12
        vrshl.s16 q12, q12, q9
        \add    q13, q8, q13
        vrshl.s16 q13, q13, q9
        vqmovun.s16 d24, q12
        vqmovun.s16 d25, q13
        vst1.8  {d4- d5}, [r4,:128], r1
        vst1.8  {d24-d25},[r4,:128], r1
        bne     1b
        pop     {r4, pc}
.endm

.macro  weight_8 add
        vdup.8  d0, r12
1:      subs    r2, r2, #2
        vld1.8  {d4},[r0,:64], r1
        vmull.u8 q1, d0, d4
        pld     [r0]
        vld1.8  {d6},[r0,:64], r1
        vmull.u8 q10, d0, d6
        \add    q1, q8, q1
        pld     [r0]
        vrshl.s16 q1, q1, q9
        vqmovun.s16 d2, q1
        \add    q10, q8, q10
        vrshl.s16 q10, q10, q9
        vqmovun.s16 d4, q10
        vst1.8  {d2},[r4,:64], r1
        vst1.8  {d4},[r4,:64], r1
        bne     1b
        pop     {r4, pc}
.endm

.macro  weight_4 add
        vdup.8  d0, r12
        vmov    q1, q8
        vmov    q10, q8
1:      subs    r2, r2, #4
        vld1.32 {d4[0]},[r0,:32], r1
        vld1.32 {d4[1]},[r0,:32], r1
        vmull.u8 q1, d0, d4
        pld     [r0]
        blt     2f
        vld1.32 {d6[0]},[r0,:32], r1
        vld1.32 {d6[1]},[r0,:32], r1
        vmull.u8 q10, d0, d6
        pld     [r0]
        \add    q1, q8, q1
        vrshl.s16 q1, q1, q9
        vqmovun.s16 d2, q1
        \add    q10, q8, q10
        vrshl.s16 q10, q10, q9
        vqmovun.s16 d4, q10
        vmov    q10, q8
        vst1.32 {d2[0]},[r4,:32], r1
        vst1.32 {d2[1]},[r4,:32], r1
        vmov    q1, q8
        vst1.32 {d4[0]},[r4,:32], r1
        vst1.32 {d4[1]},[r4,:32], r1
        bne     1b
        pop     {r4, pc}
2:      \add    q1, q8, q1
        vrshl.s16 q1, q1, q9
        vqmovun.s16 d2, q1
        vst1.32 {d2[0]},[r4,:32], r1
        vst1.32 {d2[1]},[r4,:32], r1
        pop     {r4, pc}
.endm

.macro  weight_func w
function ff_weight_h264_pixels_\w\()_neon, export=1
        push    {r4, lr}
        ldr     r12, [sp, #8]
        ldr     r4, [sp, #12]
        cmp     r3, #1
        lsl     r4, r4, r3
        vdup.16 q8, r4
        mov     r4, r0
        ble     20f
        rsb     lr, r3, #1
        vdup.16 q9, lr
        cmp     r12, #0
        blt     10f
        weight_\w vhadd.s16
10:     rsb     r12, r12, #0
        weight_\w vhsub.s16
20:     rsb     lr, r3, #0
        vdup.16 q9, lr
        cmp     r12, #0
        blt     10f
        weight_\w vadd.s16
10:     rsb     r12, r12, #0
        weight_\w vsub.s16
endfunc
.endm

        weight_func 16
        weight_func 8
        weight_func 4