/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

        .macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7
        vtrn.32 \r0, \r4
        vtrn.32 \r1, \r5
        vtrn.32 \r2, \r6
        vtrn.32 \r3, \r7
        vtrn.16 \r0, \r2
        vtrn.16 \r1, \r3
        vtrn.16 \r4, \r6
        vtrn.16 \r5, \r7
        vtrn.8 \r0, \r1
        vtrn.8 \r2, \r3
        vtrn.8 \r4, \r5
        vtrn.8 \r6, \r7
        .endm

        .macro transpose_4x4 r0 r1 r2 r3
        vtrn.16 \r0, \r2
        vtrn.16 \r1, \r3
        vtrn.8 \r0, \r1
        vtrn.8 \r2, \r3
        .endm

        .macro swap4 r0 r1 r2 r3 r4 r5 r6 r7
        vswp \r0, \r4
        vswp \r1, \r5
        vswp \r2, \r6
        vswp \r3, \r7
        .endm

        .macro transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7
        vtrn.32 \r0, \r2
        vtrn.32 \r1, \r3
        vtrn.32 \r4, \r6
        vtrn.32 \r5, \r7
        vtrn.16 \r0, \r1
        vtrn.16 \r2, \r3
        vtrn.16 \r4, \r5
        vtrn.16 \r6, \r7
        .endm
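
@ Note: the transposes above are built purely from vtrn steps.
@ vtrn.32 exchanges 32-bit halves between a register pair, vtrn.16
@ exchanges 16-bit lanes and vtrn.8 exchanges bytes, so three passes
@ of pairwise interleaves yield a full 8x8 byte transpose without
@ touching memory.
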
/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
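@ Standard H.264 bilinear chroma interpolation: with fractional
@ offsets x, y in 0..7,
@   dst[i] = (A*src[i] + B*src[i+1] + C*src[i+s] + D*src[i+s+1] + 32) >> 6
@ where A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y, D = x*y.
@ The integer prologue below derives D (r7), C (r6), B (ip) and
@ A (r4) from x (r4) and y (r5), and branches to cheaper one-tap
@ loops when x or y is zero.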
        .macro h264_chroma_mc8 type
function ff_\type\()_h264_chroma_mc8_neon, export=1
        push {r4-r7, lr}
        ldrd r4, [sp, #20]
  .ifc \type,avg
        mov lr, r0
  .endif
        pld [r1]
        pld [r1, r2]
A       muls r7, r4, r5
T       mul r7, r4, r5
T       cmp r7, #0
        rsb r6, r7, r5, lsl #3
        rsb ip, r7, r4, lsl #3
        sub r4, r7, r4, lsl #3
        sub r4, r4, r5, lsl #3
        add r4, r4, #64
        beq 2f
        add r5, r1, r2
        vdup.8 d0, r4
        lsl r4, r2, #1
        vdup.8 d1, ip
        vld1.64 {d4, d5}, [r1], r4
        vdup.8 d2, r6
        vld1.64 {d6, d7}, [r5], r4
        vdup.8 d3, r7
        vext.8 d5, d4, d5, #1
        vext.8 d7, d6, d7, #1
1:      pld [r5]
        vmull.u8 q8, d4, d0
        vmlal.u8 q8, d5, d1
        vld1.64 {d4, d5}, [r1], r4
        vmlal.u8 q8, d6, d2
        vext.8 d5, d4, d5, #1
        vmlal.u8 q8, d7, d3
        vmull.u8 q9, d6, d0
        subs r3, r3, #2
        vmlal.u8 q9, d7, d1
        vmlal.u8 q9, d4, d2
        vmlal.u8 q9, d5, d3
        vrshrn.u16 d16, q8, #6
        vld1.64 {d6, d7}, [r5], r4
        pld [r1]
        vrshrn.u16 d17, q9, #6
  .ifc \type,avg
        vld1.64 {d20}, [lr,:64], r2
        vld1.64 {d21}, [lr,:64], r2
        vrhadd.u8 q8, q8, q10
  .endif
        vext.8 d7, d6, d7, #1
        vst1.64 {d16}, [r0,:64], r2
        vst1.64 {d17}, [r0,:64], r2
        bgt 1b
        pop {r4-r7, pc}
2:      tst r6, r6
        add ip, ip, r6
        vdup.8 d0, r4
        vdup.8 d1, ip
        beq 4f
        add r5, r1, r2
        lsl r4, r2, #1
        vld1.64 {d4}, [r1], r4
        vld1.64 {d6}, [r5], r4
3:      pld [r5]
        vmull.u8 q8, d4, d0
        vmlal.u8 q8, d6, d1
        vld1.64 {d4}, [r1], r4
        vmull.u8 q9, d6, d0
        vmlal.u8 q9, d4, d1
        vld1.64 {d6}, [r5], r4
        vrshrn.u16 d16, q8, #6
        vrshrn.u16 d17, q9, #6
  .ifc \type,avg
        vld1.64 {d20}, [lr,:64], r2
        vld1.64 {d21}, [lr,:64], r2
        vrhadd.u8 q8, q8, q10
  .endif
        subs r3, r3, #2
        pld [r1]
        vst1.64 {d16}, [r0,:64], r2
        vst1.64 {d17}, [r0,:64], r2
        bgt 3b
        pop {r4-r7, pc}
4:      vld1.64 {d4, d5}, [r1], r2
        vld1.64 {d6, d7}, [r1], r2
        vext.8 d5, d4, d5, #1
        vext.8 d7, d6, d7, #1
5:      pld [r1]
        subs r3, r3, #2
        vmull.u8 q8, d4, d0
        vmlal.u8 q8, d5, d1
        vld1.64 {d4, d5}, [r1], r2
        vmull.u8 q9, d6, d0
        vmlal.u8 q9, d7, d1
        pld [r1]
        vext.8 d5, d4, d5, #1
        vrshrn.u16 d16, q8, #6
        vrshrn.u16 d17, q9, #6
  .ifc \type,avg
        vld1.64 {d20}, [lr,:64], r2
        vld1.64 {d21}, [lr,:64], r2
        vrhadd.u8 q8, q8, q10
  .endif
        vld1.64 {d6, d7}, [r1], r2
        vext.8 d7, d6, d7, #1
        vst1.64 {d16}, [r0,:64], r2
        vst1.64 {d17}, [r0,:64], r2
        bgt 5b
        pop {r4-r7, pc}
endfunc
        .endm

/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
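@ 4-pixel-wide variant of the same bilinear filter.  The vext/vtrn.32
@ pairs pack a row and its one-pixel-shifted copy into a single d
@ register, with the weights interleaved the same way, so a single
@ vmull.u8 covers two taps at once; the vadd.i16 of the two halves in
@ the loop then folds them back into one 4-pixel row.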
        .macro h264_chroma_mc4 type
function ff_\type\()_h264_chroma_mc4_neon, export=1
        push {r4-r7, lr}
        ldrd r4, [sp, #20]
  .ifc \type,avg
        mov lr, r0
  .endif
        pld [r1]
        pld [r1, r2]
A       muls r7, r4, r5
T       mul r7, r4, r5
T       cmp r7, #0
        rsb r6, r7, r5, lsl #3
        rsb ip, r7, r4, lsl #3
        sub r4, r7, r4, lsl #3
        sub r4, r4, r5, lsl #3
        add r4, r4, #64
        beq 2f
        add r5, r1, r2
        vdup.8 d0, r4
        lsl r4, r2, #1
        vdup.8 d1, ip
        vld1.64 {d4}, [r1], r4
        vdup.8 d2, r6
        vld1.64 {d6}, [r5], r4
        vdup.8 d3, r7
        vext.8 d5, d4, d5, #1
        vext.8 d7, d6, d7, #1
        vtrn.32 d4, d5
        vtrn.32 d6, d7
        vtrn.32 d0, d1
        vtrn.32 d2, d3
1:      pld [r5]
        vmull.u8 q8, d4, d0
        vmlal.u8 q8, d6, d2
        vld1.64 {d4}, [r1], r4
        vext.8 d5, d4, d5, #1
        vtrn.32 d4, d5
        vmull.u8 q9, d6, d0
        vmlal.u8 q9, d4, d2
        vld1.64 {d6}, [r5], r4
        vadd.i16 d16, d16, d17
        vadd.i16 d17, d18, d19
        vrshrn.u16 d16, q8, #6
        subs r3, r3, #2
        pld [r1]
  .ifc \type,avg
        vld1.32 {d20[0]}, [lr,:32], r2
        vld1.32 {d20[1]}, [lr,:32], r2
        vrhadd.u8 d16, d16, d20
  .endif
        vext.8 d7, d6, d7, #1
        vtrn.32 d6, d7
        vst1.32 {d16[0]}, [r0,:32], r2
        vst1.32 {d16[1]}, [r0,:32], r2
        bgt 1b
        pop {r4-r7, pc}
2:      tst r6, r6
        add ip, ip, r6
        vdup.8 d0, r4
        vdup.8 d1, ip
        vtrn.32 d0, d1
        beq 4f
        vext.32 d1, d0, d1, #1
        add r5, r1, r2
        lsl r4, r2, #1
        vld1.32 {d4[0]}, [r1], r4
        vld1.32 {d4[1]}, [r5], r4
3:      pld [r5]
        vmull.u8 q8, d4, d0
        vld1.32 {d4[0]}, [r1], r4
        vmull.u8 q9, d4, d1
        vld1.32 {d4[1]}, [r5], r4
        vadd.i16 d16, d16, d17
        vadd.i16 d17, d18, d19
        vrshrn.u16 d16, q8, #6
  .ifc \type,avg
        vld1.32 {d20[0]}, [lr,:32], r2
        vld1.32 {d20[1]}, [lr,:32], r2
        vrhadd.u8 d16, d16, d20
  .endif
        subs r3, r3, #2
        pld [r1]
        vst1.32 {d16[0]}, [r0,:32], r2
        vst1.32 {d16[1]}, [r0,:32], r2
        bgt 3b
        pop {r4-r7, pc}
4:      vld1.64 {d4}, [r1], r2
        vld1.64 {d6}, [r1], r2
        vext.8 d5, d4, d5, #1
        vext.8 d7, d6, d7, #1
        vtrn.32 d4, d5
        vtrn.32 d6, d7
5:      vmull.u8 q8, d4, d0
        vmull.u8 q9, d6, d0
        subs r3, r3, #2
        vld1.64 {d4}, [r1], r2
        vext.8 d5, d4, d5, #1
        vtrn.32 d4, d5
        vadd.i16 d16, d16, d17
        vadd.i16 d17, d18, d19
        pld [r1]
        vrshrn.u16 d16, q8, #6
  .ifc \type,avg
        vld1.32 {d20[0]}, [lr,:32], r2
        vld1.32 {d20[1]}, [lr,:32], r2
        vrhadd.u8 d16, d16, d20
  .endif
        vld1.64 {d6}, [r1], r2
        vext.8 d7, d6, d7, #1
        vtrn.32 d6, d7
        pld [r1]
        vst1.32 {d16[0]}, [r0,:32], r2
        vst1.32 {d16[1]}, [r0,:32], r2
        bgt 5b
        pop {r4-r7, pc}
endfunc
        .endm

        .macro h264_chroma_mc2 type
function ff_\type\()_h264_chroma_mc2_neon, export=1
        push {r4-r6, lr}
        ldr r4, [sp, #16]
        ldr lr, [sp, #20]
        pld [r1]
        pld [r1, r2]
        orrs r5, r4, lr
        beq 2f
        mul r5, r4, lr
        rsb r6, r5, lr, lsl #3
        rsb r12, r5, r4, lsl #3
        sub r4, r5, r4, lsl #3
        sub r4, r4, lr, lsl #3
        add r4, r4, #64
        vdup.8 d0, r4
        vdup.8 d2, r12
        vdup.8 d1, r6
        vdup.8 d3, r5
        vtrn.16 q0, q1
1:
        vld1.32 {d4[0]}, [r1], r2
        vld1.32 {d4[1]}, [r1], r2
        vrev64.32 d5, d4
        vld1.32 {d5[1]}, [r1]
        vext.8 q3, q2, q2, #1
        vtrn.16 q2, q3
        vmull.u8 q8, d4, d0
        vmlal.u8 q8, d5, d1
  .ifc \type,avg
        vld1.16 {d18[0]}, [r0,:16], r2
        vld1.16 {d18[1]}, [r0,:16]
        sub r0, r0, r2
  .endif
        vtrn.32 d16, d17
        vadd.i16 d16, d16, d17
        vrshrn.u16 d16, q8, #6
  .ifc \type,avg
        vrhadd.u8 d16, d16, d18
  .endif
        vst1.16 {d16[0]}, [r0,:16], r2
        vst1.16 {d16[1]}, [r0,:16], r2
        subs r3, r3, #2
        bgt 1b
        pop {r4-r6, pc}
2:
  .ifc \type,put
        ldrh_post r5, r1, r2
        strh_post r5, r0, r2
        ldrh_post r6, r1, r2
        strh_post r6, r0, r2
  .else
        vld1.16 {d16[0]}, [r1], r2
        vld1.16 {d16[1]}, [r1], r2
        vld1.16 {d18[0]}, [r0,:16], r2
        vld1.16 {d18[1]}, [r0,:16]
        sub r0, r0, r2
        vrhadd.u8 d16, d16, d18
        vst1.16 {d16[0]}, [r0,:16], r2
        vst1.16 {d16[1]}, [r0,:16], r2
  .endif
        subs r3, r3, #2
        bgt 2b
        pop {r4-r6, pc}
endfunc
        .endm

        .text
        .align

        h264_chroma_mc8 put
        h264_chroma_mc8 avg
        h264_chroma_mc4 put
        h264_chroma_mc4 avg
        h264_chroma_mc2 put
        h264_chroma_mc2 avg

/* H.264 loop filter */
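@ The loop filter entry points follow the usual ff_h264 convention:
@ r0 = pix, r1 = stride, r2 = alpha, r3 = beta, [sp] = pointer to the
@ four per-edge tc0 bytes.  h264_loop_filter_start loads tc0 into
@ d24[0] and returns early when alpha or beta is zero, or when all
@ four tc0 values are negative, i.e. nothing on this edge is filtered.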
        .macro h264_loop_filter_start
        ldr ip, [sp]
        tst r2, r2
        ldr ip, [ip]
        it ne
        tstne r3, r3
        vmov.32 d24[0], ip
        and ip, ip, ip, lsl #16
        it eq
        bxeq lr
        ands ip, ip, ip, lsl #8
        it lt
        bxlt lr
        .endm

        .macro h264_loop_filter_luma
        vdup.8 q11, r2  @ alpha
        vmovl.u8 q12, d24
        vabd.u8 q6, q8, q0  @ abs(p0 - q0)
        vmovl.u16 q12, d24
        vabd.u8 q14, q9, q8  @ abs(p1 - p0)
        vsli.16 q12, q12, #8
        vabd.u8 q15, q1, q0  @ abs(q1 - q0)
        vsli.32 q12, q12, #16
        vclt.u8 q6, q6, q11  @ < alpha
        vdup.8 q11, r3  @ beta
        vclt.s8 q7, q12, #0
        vclt.u8 q14, q14, q11  @ < beta
        vclt.u8 q15, q15, q11  @ < beta
        vbic q6, q6, q7
        vabd.u8 q4, q10, q8  @ abs(p2 - p0)
        vand q6, q6, q14
        vabd.u8 q5, q2, q0  @ abs(q2 - q0)
        vclt.u8 q4, q4, q11  @ < beta
        vand q6, q6, q15
        vclt.u8 q5, q5, q11  @ < beta
        vand q4, q4, q6
        vand q5, q5, q6
        vand q12, q12, q6
        vrhadd.u8 q14, q8, q0
        vsub.i8 q6, q12, q4
        vqadd.u8 q7, q9, q12
        vhadd.u8 q10, q10, q14
        vsub.i8 q6, q6, q5
        vhadd.u8 q14, q2, q14
        vmin.u8 q7, q7, q10
        vqsub.u8 q11, q9, q12
        vqadd.u8 q2, q1, q12
        vmax.u8 q7, q7, q11
        vqsub.u8 q11, q1, q12
        vmin.u8 q14, q2, q14
        vmovl.u8 q2, d0
        vmax.u8 q14, q14, q11
        vmovl.u8 q10, d1
        vsubw.u8 q2, q2, d16
        vsubw.u8 q10, q10, d17
        vshl.i16 q2, q2, #2
        vshl.i16 q10, q10, #2
        vaddw.u8 q2, q2, d18
        vaddw.u8 q10, q10, d19
        vsubw.u8 q2, q2, d2
        vsubw.u8 q10, q10, d3
        vrshrn.i16 d4, q2, #3
        vrshrn.i16 d5, q10, #3
        vbsl q4, q7, q9
        vbsl q5, q14, q1
        vneg.s8 q7, q6
        vmovl.u8 q14, d16
        vmin.s8 q2, q2, q6
        vmovl.u8 q6, d17
        vmax.s8 q2, q2, q7
        vmovl.u8 q11, d0
        vmovl.u8 q12, d1
        vaddw.s8 q14, q14, d4
        vaddw.s8 q6, q6, d5
        vsubw.s8 q11, q11, d4
        vsubw.s8 q12, q12, d5
        vqmovun.s16 d16, q14
        vqmovun.s16 d17, q6
        vqmovun.s16 d0, q11
        vqmovun.s16 d1, q12
        .endm
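
@ The arithmetic above follows the normal (bS < 4) luma filter: the
@ rounded term ((q0 - p0)*4 + (p1 - q1) + 4) >> 3 (the vrshrn #3) is
@ clipped to +/-tc and added to p0 / subtracted from q0 with
@ saturation, while the q4/q5 masks (abs(p2-p0) < beta and
@ abs(q2-q0) < beta) both enable the p1/q1 updates via vbsl and
@ extend the clipping range held in q6.
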
function ff_h264_v_loop_filter_luma_neon, export=1
        h264_loop_filter_start
        vld1.64 {d0, d1}, [r0,:128], r1
        vld1.64 {d2, d3}, [r0,:128], r1
        vld1.64 {d4, d5}, [r0,:128], r1
        sub r0, r0, r1, lsl #2
        sub r0, r0, r1, lsl #1
        vld1.64 {d20,d21}, [r0,:128], r1
        vld1.64 {d18,d19}, [r0,:128], r1
        vld1.64 {d16,d17}, [r0,:128], r1
        vpush {d8-d15}
        h264_loop_filter_luma
        sub r0, r0, r1, lsl #1
        vst1.64 {d8, d9}, [r0,:128], r1
        vst1.64 {d16,d17}, [r0,:128], r1
        vst1.64 {d0, d1}, [r0,:128], r1
        vst1.64 {d10,d11}, [r0,:128]
        vpop {d8-d15}
        bx lr
endfunc

function ff_h264_h_loop_filter_luma_neon, export=1
        h264_loop_filter_start
        sub r0, r0, #4
        vld1.64 {d6}, [r0], r1
        vld1.64 {d20}, [r0], r1
        vld1.64 {d18}, [r0], r1
        vld1.64 {d16}, [r0], r1
        vld1.64 {d0}, [r0], r1
        vld1.64 {d2}, [r0], r1
        vld1.64 {d4}, [r0], r1
        vld1.64 {d26}, [r0], r1
        vld1.64 {d7}, [r0], r1
        vld1.64 {d21}, [r0], r1
        vld1.64 {d19}, [r0], r1
        vld1.64 {d17}, [r0], r1
        vld1.64 {d1}, [r0], r1
        vld1.64 {d3}, [r0], r1
        vld1.64 {d5}, [r0], r1
        vld1.64 {d27}, [r0], r1
        transpose_8x8 q3, q10, q9, q8, q0, q1, q2, q13
        vpush {d8-d15}
        h264_loop_filter_luma
        transpose_4x4 q4, q8, q0, q5
        sub r0, r0, r1, lsl #4
        add r0, r0, #2
        vst1.32 {d8[0]}, [r0], r1
        vst1.32 {d16[0]}, [r0], r1
        vst1.32 {d0[0]}, [r0], r1
        vst1.32 {d10[0]}, [r0], r1
        vst1.32 {d8[1]}, [r0], r1
        vst1.32 {d16[1]}, [r0], r1
        vst1.32 {d0[1]}, [r0], r1
        vst1.32 {d10[1]}, [r0], r1
        vst1.32 {d9[0]}, [r0], r1
        vst1.32 {d17[0]}, [r0], r1
        vst1.32 {d1[0]}, [r0], r1
        vst1.32 {d11[0]}, [r0], r1
        vst1.32 {d9[1]}, [r0], r1
        vst1.32 {d17[1]}, [r0], r1
        vst1.32 {d1[1]}, [r0], r1
        vst1.32 {d11[1]}, [r0], r1
        vpop {d8-d15}
        bx lr
endfunc

        .macro h264_loop_filter_chroma
        vdup.8 d22, r2  @ alpha
        vmovl.u8 q12, d24
        vabd.u8 d26, d16, d0  @ abs(p0 - q0)
        vmovl.u8 q2, d0
        vabd.u8 d28, d18, d16  @ abs(p1 - p0)
        vsubw.u8 q2, q2, d16
        vsli.16 d24, d24, #8
        vshl.i16 q2, q2, #2
        vabd.u8 d30, d2, d0  @ abs(q1 - q0)
        vaddw.u8 q2, q2, d18
        vclt.u8 d26, d26, d22  @ < alpha
        vsubw.u8 q2, q2, d2
        vdup.8 d22, r3  @ beta
        vrshrn.i16 d4, q2, #3
        vclt.u8 d28, d28, d22  @ < beta
        vclt.u8 d30, d30, d22  @ < beta
        vmin.s8 d4, d4, d24
        vneg.s8 d25, d24
        vand d26, d26, d28
        vmax.s8 d4, d4, d25
        vand d26, d26, d30
        vmovl.u8 q11, d0
        vand d4, d4, d26
        vmovl.u8 q14, d16
        vaddw.s8 q14, q14, d4
        vsubw.s8 q11, q11, d4
        vqmovun.s16 d16, q14
        vqmovun.s16 d0, q11
        .endm
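
@ Chroma edges only modify p0 and q0: the same rounded delta
@ ((q0 - p0)*4 + (p1 - q1) + 4) >> 3 is clipped to the tc range held
@ in d24/d25, masked by the alpha/beta comparisons, then applied with
@ saturating narrowing.
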
function ff_h264_v_loop_filter_chroma_neon, export=1
        h264_loop_filter_start
        sub r0, r0, r1, lsl #1
        vld1.64 {d18}, [r0,:64], r1
        vld1.64 {d16}, [r0,:64], r1
        vld1.64 {d0}, [r0,:64], r1
        vld1.64 {d2}, [r0,:64]
        h264_loop_filter_chroma
        sub r0, r0, r1, lsl #1
        vst1.64 {d16}, [r0,:64], r1
        vst1.64 {d0}, [r0,:64], r1
        bx lr
endfunc

function ff_h264_h_loop_filter_chroma_neon, export=1
        h264_loop_filter_start
        sub r0, r0, #2
        vld1.32 {d18[0]}, [r0], r1
        vld1.32 {d16[0]}, [r0], r1
        vld1.32 {d0[0]}, [r0], r1
        vld1.32 {d2[0]}, [r0], r1
        vld1.32 {d18[1]}, [r0], r1
        vld1.32 {d16[1]}, [r0], r1
        vld1.32 {d0[1]}, [r0], r1
        vld1.32 {d2[1]}, [r0], r1
        vtrn.16 d18, d0
        vtrn.16 d16, d2
        vtrn.8 d18, d16
        vtrn.8 d0, d2
        h264_loop_filter_chroma
        vtrn.16 d18, d0
        vtrn.16 d16, d2
        vtrn.8 d18, d16
        vtrn.8 d0, d2
        sub r0, r0, r1, lsl #3
        vst1.32 {d18[0]}, [r0], r1
        vst1.32 {d16[0]}, [r0], r1
        vst1.32 {d0[0]}, [r0], r1
        vst1.32 {d2[0]}, [r0], r1
        vst1.32 {d18[1]}, [r0], r1
        vst1.32 {d16[1]}, [r0], r1
        vst1.32 {d0[1]}, [r0], r1
        vst1.32 {d2[1]}, [r0], r1
        bx lr
endfunc

/* H.264 qpel MC */
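@ Half-pel luma interpolation uses the spec's 6-tap filter
@ (1, -5, 20, 20, -5, 1); vqrshrun #5 applies the (x + 16) >> 5
@ rounding with a clip to 8 bits.  lowpass_const packs the two tap
@ magnitudes into d6 (5 in lane 0, 20 in lane 1) so the filters can
@ multiply by scalar lanes with vmla.i16/vmls.i16.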
        .macro lowpass_const r
        movw \r, #5
        movt \r, #20
        vmov.32 d6[0], \r
        .endm

        .macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
  .if \narrow
t0 .req q0
t1 .req q8
  .else
t0 .req \d0
t1 .req \d1
  .endif
        vext.8 d2, \r0, \r1, #2
        vext.8 d3, \r0, \r1, #3
        vaddl.u8 q1, d2, d3
        vext.8 d4, \r0, \r1, #1
        vext.8 d5, \r0, \r1, #4
        vaddl.u8 q2, d4, d5
        vext.8 d30, \r0, \r1, #5
        vaddl.u8 t0, \r0, d30
        vext.8 d18, \r2, \r3, #2
        vmla.i16 t0, q1, d6[1]
        vext.8 d19, \r2, \r3, #3
        vaddl.u8 q9, d18, d19
        vext.8 d20, \r2, \r3, #1
        vmls.i16 t0, q2, d6[0]
        vext.8 d21, \r2, \r3, #4
        vaddl.u8 q10, d20, d21
        vext.8 d31, \r2, \r3, #5
        vaddl.u8 t1, \r2, d31
        vmla.i16 t1, q9, d6[1]
        vmls.i16 t1, q10, d6[0]
  .if \narrow
        vqrshrun.s16 \d0, t0, #5
        vqrshrun.s16 \d1, t1, #5
  .endif
        .unreq t0
        .unreq t1
        .endm

        .macro lowpass_8_1 r0, r1, d0, narrow=1
  .if \narrow
t0 .req q0
  .else
t0 .req \d0
  .endif
        vext.8 d2, \r0, \r1, #2
        vext.8 d3, \r0, \r1, #3
        vaddl.u8 q1, d2, d3
        vext.8 d4, \r0, \r1, #1
        vext.8 d5, \r0, \r1, #4
        vaddl.u8 q2, d4, d5
        vext.8 d30, \r0, \r1, #5
        vaddl.u8 t0, \r0, d30
        vmla.i16 t0, q1, d6[1]
        vmls.i16 t0, q2, d6[0]
  .if \narrow
        vqrshrun.s16 \d0, t0, #5
  .endif
        .unreq t0
        .endm
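
@ 16-bit variant for the centre (half-pel in both axes) positions: it
@ applies the same 6-tap filter to the unscaled 16-bit output of a
@ first pass, widening to 32 bits.  The *20 factor is built from
@ shifts (20x = 16x + 4x, likewise 5x = 4x + x), and vrshrn #10
@ performs the combined two-stage (x + 512) >> 10 normalization.
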
        .macro lowpass_8.16 r0, r1, l0, h0, l1, h1, d
        vext.16 q1, \r0, \r1, #2
        vext.16 q0, \r0, \r1, #3
        vaddl.s16 q9, d2, d0
        vext.16 q2, \r0, \r1, #1
        vaddl.s16 q1, d3, d1
        vext.16 q3, \r0, \r1, #4
        vaddl.s16 q10, d4, d6
        vext.16 \r1, \r0, \r1, #5
        vaddl.s16 q2, d5, d7
        vaddl.s16 q0, \h0, \h1
        vaddl.s16 q8, \l0, \l1
        vshl.i32 q3, q9, #4
        vshl.i32 q9, q9, #2
        vshl.i32 q15, q10, #2
        vadd.i32 q9, q9, q3
        vadd.i32 q10, q10, q15
        vshl.i32 q3, q1, #4
        vshl.i32 q1, q1, #2
        vshl.i32 q15, q2, #2
        vadd.i32 q1, q1, q3
        vadd.i32 q2, q2, q15
        vadd.i32 q9, q9, q8
        vsub.i32 q9, q9, q10
        vadd.i32 q1, q1, q0
        vsub.i32 q1, q1, q2
        vrshrn.s32 d18, q9, #10
        vrshrn.s32 d19, q1, #10
        vqmovun.s16 \d, q9
        .endm
function put_h264_qpel16_h_lowpass_neon_packed
        mov r4, lr
        mov ip, #16
        mov r3, #8
        bl put_h264_qpel8_h_lowpass_neon
        sub r1, r1, r2, lsl #4
        add r1, r1, #8
        mov ip, #16
        mov lr, r4
        b put_h264_qpel8_h_lowpass_neon
endfunc

        .macro h264_qpel_h_lowpass type
function \type\()_h264_qpel16_h_lowpass_neon
        push {lr}
        mov ip, #16
        bl \type\()_h264_qpel8_h_lowpass_neon
        sub r0, r0, r3, lsl #4
        sub r1, r1, r2, lsl #4
        add r0, r0, #8
        add r1, r1, #8
        mov ip, #16
        pop {lr}
endfunc

function \type\()_h264_qpel8_h_lowpass_neon
1:      vld1.64 {d0, d1}, [r1], r2
        vld1.64 {d16,d17}, [r1], r2
        subs ip, ip, #2
        lowpass_8 d0, d1, d16, d17, d0, d16
  .ifc \type,avg
        vld1.8 {d2}, [r0,:64], r3
        vrhadd.u8 d0, d0, d2
        vld1.8 {d3}, [r0,:64]
        vrhadd.u8 d16, d16, d3
        sub r0, r0, r3
  .endif
        vst1.64 {d0}, [r0,:64], r3
        vst1.64 {d16}, [r0,:64], r3
        bne 1b
        bx lr
endfunc
        .endm

        h264_qpel_h_lowpass put
        h264_qpel_h_lowpass avg
        .macro h264_qpel_h_lowpass_l2 type
function \type\()_h264_qpel16_h_lowpass_l2_neon
        push {lr}
        mov ip, #16
        bl \type\()_h264_qpel8_h_lowpass_l2_neon
        sub r0, r0, r2, lsl #4
        sub r1, r1, r2, lsl #4
        sub r3, r3, r2, lsl #4
        add r0, r0, #8
        add r1, r1, #8
        add r3, r3, #8
        mov ip, #16
        pop {lr}
endfunc

function \type\()_h264_qpel8_h_lowpass_l2_neon
1:      vld1.64 {d0, d1}, [r1], r2
        vld1.64 {d16,d17}, [r1], r2
        vld1.64 {d28}, [r3], r2
        vld1.64 {d29}, [r3], r2
        subs ip, ip, #2
        lowpass_8 d0, d1, d16, d17, d0, d1
        vrhadd.u8 q0, q0, q14
  .ifc \type,avg
        vld1.8 {d2}, [r0,:64], r2
        vrhadd.u8 d0, d0, d2
        vld1.8 {d3}, [r0,:64]
        vrhadd.u8 d1, d1, d3
        sub r0, r0, r2
  .endif
        vst1.64 {d0}, [r0,:64], r2
        vst1.64 {d1}, [r0,:64], r2
        bne 1b
        bx lr
endfunc
        .endm

        h264_qpel_h_lowpass_l2 put
        h264_qpel_h_lowpass_l2 avg
function put_h264_qpel16_v_lowpass_neon_packed
        mov r4, lr
        mov r2, #8
        bl put_h264_qpel8_v_lowpass_neon
        sub r1, r1, r3, lsl #2
        bl put_h264_qpel8_v_lowpass_neon
        sub r1, r1, r3, lsl #4
        sub r1, r1, r3, lsl #2
        add r1, r1, #8
        bl put_h264_qpel8_v_lowpass_neon
        sub r1, r1, r3, lsl #2
        mov lr, r4
        b put_h264_qpel8_v_lowpass_neon
endfunc

        .macro h264_qpel_v_lowpass type
function \type\()_h264_qpel16_v_lowpass_neon
        mov r4, lr
        bl \type\()_h264_qpel8_v_lowpass_neon
        sub r1, r1, r3, lsl #2
        bl \type\()_h264_qpel8_v_lowpass_neon
        sub r0, r0, r2, lsl #4
        add r0, r0, #8
        sub r1, r1, r3, lsl #4
        sub r1, r1, r3, lsl #2
        add r1, r1, #8
        bl \type\()_h264_qpel8_v_lowpass_neon
        sub r1, r1, r3, lsl #2
        mov lr, r4
endfunc

function \type\()_h264_qpel8_v_lowpass_neon
        vld1.64 {d8}, [r1], r3
        vld1.64 {d10}, [r1], r3
        vld1.64 {d12}, [r1], r3
        vld1.64 {d14}, [r1], r3
        vld1.64 {d22}, [r1], r3
        vld1.64 {d24}, [r1], r3
        vld1.64 {d26}, [r1], r3
        vld1.64 {d28}, [r1], r3
        vld1.64 {d9}, [r1], r3
        vld1.64 {d11}, [r1], r3
        vld1.64 {d13}, [r1], r3
        vld1.64 {d15}, [r1], r3
        vld1.64 {d23}, [r1]
        transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
        lowpass_8 d8, d9, d10, d11, d8, d10
        lowpass_8 d12, d13, d14, d15, d12, d14
        lowpass_8 d22, d23, d24, d25, d22, d24
        lowpass_8 d26, d27, d28, d29, d26, d28
        transpose_8x8 d8, d10, d12, d14, d22, d24, d26, d28
  .ifc \type,avg
        vld1.8 {d9}, [r0,:64], r2
        vrhadd.u8 d8, d8, d9
        vld1.8 {d11}, [r0,:64], r2
        vrhadd.u8 d10, d10, d11
        vld1.8 {d13}, [r0,:64], r2
        vrhadd.u8 d12, d12, d13
        vld1.8 {d15}, [r0,:64], r2
        vrhadd.u8 d14, d14, d15
        vld1.8 {d23}, [r0,:64], r2
        vrhadd.u8 d22, d22, d23
        vld1.8 {d25}, [r0,:64], r2
        vrhadd.u8 d24, d24, d25
        vld1.8 {d27}, [r0,:64], r2
        vrhadd.u8 d26, d26, d27
        vld1.8 {d29}, [r0,:64], r2
        vrhadd.u8 d28, d28, d29
        sub r0, r0, r2, lsl #3
  .endif
        vst1.64 {d8}, [r0,:64], r2
        vst1.64 {d10}, [r0,:64], r2
        vst1.64 {d12}, [r0,:64], r2
        vst1.64 {d14}, [r0,:64], r2
        vst1.64 {d22}, [r0,:64], r2
        vst1.64 {d24}, [r0,:64], r2
        vst1.64 {d26}, [r0,:64], r2
        vst1.64 {d28}, [r0,:64], r2
        bx lr
endfunc
        .endm

        h264_qpel_v_lowpass put
        h264_qpel_v_lowpass avg
        .macro h264_qpel_v_lowpass_l2 type
function \type\()_h264_qpel16_v_lowpass_l2_neon
        mov r4, lr
        bl \type\()_h264_qpel8_v_lowpass_l2_neon
        sub r1, r1, r3, lsl #2
        bl \type\()_h264_qpel8_v_lowpass_l2_neon
        sub r0, r0, r3, lsl #4
        sub ip, ip, r2, lsl #4
        add r0, r0, #8
        add ip, ip, #8
        sub r1, r1, r3, lsl #4
        sub r1, r1, r3, lsl #2
        add r1, r1, #8
        bl \type\()_h264_qpel8_v_lowpass_l2_neon
        sub r1, r1, r3, lsl #2
        mov lr, r4
endfunc

function \type\()_h264_qpel8_v_lowpass_l2_neon
        vld1.64 {d8}, [r1], r3
        vld1.64 {d10}, [r1], r3
        vld1.64 {d12}, [r1], r3
        vld1.64 {d14}, [r1], r3
        vld1.64 {d22}, [r1], r3
        vld1.64 {d24}, [r1], r3
        vld1.64 {d26}, [r1], r3
        vld1.64 {d28}, [r1], r3
        vld1.64 {d9}, [r1], r3
        vld1.64 {d11}, [r1], r3
        vld1.64 {d13}, [r1], r3
        vld1.64 {d15}, [r1], r3
        vld1.64 {d23}, [r1]
        transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
        lowpass_8 d8, d9, d10, d11, d8, d9
        lowpass_8 d12, d13, d14, d15, d12, d13
        lowpass_8 d22, d23, d24, d25, d22, d23
        lowpass_8 d26, d27, d28, d29, d26, d27
        transpose_8x8 d8, d9, d12, d13, d22, d23, d26, d27
        vld1.64 {d0}, [ip], r2
        vld1.64 {d1}, [ip], r2
        vld1.64 {d2}, [ip], r2
        vld1.64 {d3}, [ip], r2
        vld1.64 {d4}, [ip], r2
        vrhadd.u8 q0, q0, q4
        vld1.64 {d5}, [ip], r2
        vrhadd.u8 q1, q1, q6
        vld1.64 {d10}, [ip], r2
        vrhadd.u8 q2, q2, q11
        vld1.64 {d11}, [ip], r2
        vrhadd.u8 q5, q5, q13
  .ifc \type,avg
        vld1.8 {d16}, [r0,:64], r3
        vrhadd.u8 d0, d0, d16
        vld1.8 {d17}, [r0,:64], r3
        vrhadd.u8 d1, d1, d17
        vld1.8 {d16}, [r0,:64], r3
        vrhadd.u8 d2, d2, d16
        vld1.8 {d17}, [r0,:64], r3
        vrhadd.u8 d3, d3, d17
        vld1.8 {d16}, [r0,:64], r3
        vrhadd.u8 d4, d4, d16
        vld1.8 {d17}, [r0,:64], r3
        vrhadd.u8 d5, d5, d17
        vld1.8 {d16}, [r0,:64], r3
        vrhadd.u8 d10, d10, d16
        vld1.8 {d17}, [r0,:64], r3
        vrhadd.u8 d11, d11, d17
        sub r0, r0, r3, lsl #3
  .endif
        vst1.64 {d0}, [r0,:64], r3
        vst1.64 {d1}, [r0,:64], r3
        vst1.64 {d2}, [r0,:64], r3
        vst1.64 {d3}, [r0,:64], r3
        vst1.64 {d4}, [r0,:64], r3
        vst1.64 {d5}, [r0,:64], r3
        vst1.64 {d10}, [r0,:64], r3
        vst1.64 {d11}, [r0,:64], r3
        bx lr
endfunc
        .endm

        h264_qpel_v_lowpass_l2 put
        h264_qpel_v_lowpass_l2 avg
function put_h264_qpel8_hv_lowpass_neon_top
        lowpass_const ip
        mov ip, #12
1:      vld1.64 {d0, d1}, [r1], r3
        vld1.64 {d16,d17}, [r1], r3
        subs ip, ip, #2
        lowpass_8 d0, d1, d16, d17, q11, q12, narrow=0
        vst1.64 {d22-d25}, [r4,:128]!
        bne 1b
        vld1.64 {d0, d1}, [r1]
        lowpass_8_1 d0, d1, q12, narrow=0
        mov ip, #-16
        add r4, r4, ip
        vld1.64 {d30,d31}, [r4,:128], ip
        vld1.64 {d20,d21}, [r4,:128], ip
        vld1.64 {d18,d19}, [r4,:128], ip
        vld1.64 {d16,d17}, [r4,:128], ip
        vld1.64 {d14,d15}, [r4,:128], ip
        vld1.64 {d12,d13}, [r4,:128], ip
        vld1.64 {d10,d11}, [r4,:128], ip
        vld1.64 {d8, d9}, [r4,:128], ip
        vld1.64 {d6, d7}, [r4,:128], ip
        vld1.64 {d4, d5}, [r4,:128], ip
        vld1.64 {d2, d3}, [r4,:128], ip
        vld1.64 {d0, d1}, [r4,:128]
        swap4 d1, d3, d5, d7, d8, d10, d12, d14
        transpose16_4x4 q0, q1, q2, q3, q4, q5, q6, q7
        swap4 d17, d19, d21, d31, d24, d26, d28, d22
        transpose16_4x4 q8, q9, q10, q15, q12, q13, q14, q11
        vst1.64 {d30,d31}, [r4,:128]!
        vst1.64 {d6, d7}, [r4,:128]!
        vst1.64 {d20,d21}, [r4,:128]!
        vst1.64 {d4, d5}, [r4,:128]!
        vst1.64 {d18,d19}, [r4,:128]!
        vst1.64 {d2, d3}, [r4,:128]!
        vst1.64 {d16,d17}, [r4,:128]!
        vst1.64 {d0, d1}, [r4,:128]
        lowpass_8.16 q4, q12, d8, d9, d24, d25, d8
        lowpass_8.16 q5, q13, d10, d11, d26, d27, d9
        lowpass_8.16 q6, q14, d12, d13, d28, d29, d10
        lowpass_8.16 q7, q11, d14, d15, d22, d23, d11
        vld1.64 {d16,d17}, [r4,:128], ip
        vld1.64 {d30,d31}, [r4,:128], ip
        lowpass_8.16 q8, q15, d16, d17, d30, d31, d12
        vld1.64 {d16,d17}, [r4,:128], ip
        vld1.64 {d30,d31}, [r4,:128], ip
        lowpass_8.16 q8, q15, d16, d17, d30, d31, d13
        vld1.64 {d16,d17}, [r4,:128], ip
        vld1.64 {d30,d31}, [r4,:128], ip
        lowpass_8.16 q8, q15, d16, d17, d30, d31, d14
        vld1.64 {d16,d17}, [r4,:128], ip
        vld1.64 {d30,d31}, [r4,:128]
        lowpass_8.16 q8, q15, d16, d17, d30, d31, d15
        transpose_8x8 d12, d13, d14, d15, d8, d9, d10, d11
        bx lr
endfunc
        .macro h264_qpel8_hv_lowpass type
function \type\()_h264_qpel8_hv_lowpass_neon
        mov r10, lr
        bl put_h264_qpel8_hv_lowpass_neon_top
  .ifc \type,avg
        vld1.8 {d0}, [r0,:64], r2
        vrhadd.u8 d12, d12, d0
        vld1.8 {d1}, [r0,:64], r2
        vrhadd.u8 d13, d13, d1
        vld1.8 {d2}, [r0,:64], r2
        vrhadd.u8 d14, d14, d2
        vld1.8 {d3}, [r0,:64], r2
        vrhadd.u8 d15, d15, d3
        vld1.8 {d4}, [r0,:64], r2
        vrhadd.u8 d8, d8, d4
        vld1.8 {d5}, [r0,:64], r2
        vrhadd.u8 d9, d9, d5
        vld1.8 {d6}, [r0,:64], r2
        vrhadd.u8 d10, d10, d6
        vld1.8 {d7}, [r0,:64], r2
        vrhadd.u8 d11, d11, d7
        sub r0, r0, r2, lsl #3
  .endif
        vst1.64 {d12}, [r0,:64], r2
        vst1.64 {d13}, [r0,:64], r2
        vst1.64 {d14}, [r0,:64], r2
        vst1.64 {d15}, [r0,:64], r2
        vst1.64 {d8}, [r0,:64], r2
        vst1.64 {d9}, [r0,:64], r2
        vst1.64 {d10}, [r0,:64], r2
        vst1.64 {d11}, [r0,:64], r2
        mov lr, r10
        bx lr
endfunc
        .endm

        h264_qpel8_hv_lowpass put
        h264_qpel8_hv_lowpass avg
        .macro h264_qpel8_hv_lowpass_l2 type
function \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov r10, lr
        bl put_h264_qpel8_hv_lowpass_neon_top
        vld1.64 {d0, d1}, [r2,:128]!
        vld1.64 {d2, d3}, [r2,:128]!
        vrhadd.u8 q0, q0, q6
        vld1.64 {d4, d5}, [r2,:128]!
        vrhadd.u8 q1, q1, q7
        vld1.64 {d6, d7}, [r2,:128]!
        vrhadd.u8 q2, q2, q4
        vrhadd.u8 q3, q3, q5
  .ifc \type,avg
        vld1.8 {d16}, [r0,:64], r3
        vrhadd.u8 d0, d0, d16
        vld1.8 {d17}, [r0,:64], r3
        vrhadd.u8 d1, d1, d17
        vld1.8 {d18}, [r0,:64], r3
        vrhadd.u8 d2, d2, d18
        vld1.8 {d19}, [r0,:64], r3
        vrhadd.u8 d3, d3, d19
        vld1.8 {d20}, [r0,:64], r3
        vrhadd.u8 d4, d4, d20
        vld1.8 {d21}, [r0,:64], r3
        vrhadd.u8 d5, d5, d21
        vld1.8 {d22}, [r0,:64], r3
        vrhadd.u8 d6, d6, d22
        vld1.8 {d23}, [r0,:64], r3
        vrhadd.u8 d7, d7, d23
        sub r0, r0, r3, lsl #3
  .endif
        vst1.64 {d0}, [r0,:64], r3
        vst1.64 {d1}, [r0,:64], r3
        vst1.64 {d2}, [r0,:64], r3
        vst1.64 {d3}, [r0,:64], r3
        vst1.64 {d4}, [r0,:64], r3
        vst1.64 {d5}, [r0,:64], r3
        vst1.64 {d6}, [r0,:64], r3
        vst1.64 {d7}, [r0,:64], r3
        mov lr, r10
        bx lr
endfunc
        .endm

        h264_qpel8_hv_lowpass_l2 put
        h264_qpel8_hv_lowpass_l2 avg
        .macro h264_qpel16_hv type
function \type\()_h264_qpel16_hv_lowpass_neon
        mov r9, lr
        bl \type\()_h264_qpel8_hv_lowpass_neon
        sub r1, r1, r3, lsl #2
        bl \type\()_h264_qpel8_hv_lowpass_neon
        sub r1, r1, r3, lsl #4
        sub r1, r1, r3, lsl #2
        add r1, r1, #8
        sub r0, r0, r2, lsl #4
        add r0, r0, #8
        bl \type\()_h264_qpel8_hv_lowpass_neon
        sub r1, r1, r3, lsl #2
        mov lr, r9
        b \type\()_h264_qpel8_hv_lowpass_neon
endfunc

function \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov r9, lr
        sub r2, r4, #256
        bl \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub r1, r1, r3, lsl #2
        bl \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub r1, r1, r3, lsl #4
        sub r1, r1, r3, lsl #2
        add r1, r1, #8
        sub r0, r0, r3, lsl #4
        add r0, r0, #8
        bl \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub r1, r1, r3, lsl #2
        mov lr, r9
        b \type\()_h264_qpel8_hv_lowpass_l2_neon
endfunc
        .endm

        h264_qpel16_hv put
        h264_qpel16_hv avg
        .macro h264_qpel8 type
function ff_\type\()_h264_qpel8_mc10_neon, export=1
        lowpass_const r3
        mov r3, r1
        sub r1, r1, #2
        mov ip, #8
        b \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

function ff_\type\()_h264_qpel8_mc20_neon, export=1
        lowpass_const r3
        sub r1, r1, #2
        mov r3, r2
        mov ip, #8
        b \type\()_h264_qpel8_h_lowpass_neon
endfunc

function ff_\type\()_h264_qpel8_mc30_neon, export=1
        lowpass_const r3
        add r3, r1, #1
        sub r1, r1, #2
        mov ip, #8
        b \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

function ff_\type\()_h264_qpel8_mc01_neon, export=1
        push {lr}
        mov ip, r1
\type\()_h264_qpel8_mc01:
        lowpass_const r3
        mov r3, r2
        sub r1, r1, r2, lsl #1
        vpush {d8-d15}
        bl \type\()_h264_qpel8_v_lowpass_l2_neon
        vpop {d8-d15}
        pop {pc}
endfunc

function ff_\type\()_h264_qpel8_mc11_neon, export=1
        push {r0, r1, r11, lr}
\type\()_h264_qpel8_mc11:
        lowpass_const r3
        mov r11, sp
A       bic sp, sp, #15
T       bic r0, r11, #15
T       mov sp, r0
        sub sp, sp, #64
        mov r0, sp
        sub r1, r1, #2
        mov r3, #8
        mov ip, #8
        vpush {d8-d15}
        bl put_h264_qpel8_h_lowpass_neon
        ldrd r0, [r11], #8
        mov r3, r2
        add ip, sp, #64
        sub r1, r1, r2, lsl #1
        mov r2, #8
        bl \type\()_h264_qpel8_v_lowpass_l2_neon
        vpop {d8-d15}
        mov sp, r11
        pop {r11, pc}
endfunc

function ff_\type\()_h264_qpel8_mc21_neon, export=1
        push {r0, r1, r4, r10, r11, lr}
\type\()_h264_qpel8_mc21:
        lowpass_const r3
        mov r11, sp
A       bic sp, sp, #15
T       bic r0, r11, #15
T       mov sp, r0
        sub sp, sp, #(8*8+16*12)
        sub r1, r1, #2
        mov r3, #8
        mov r0, sp
        mov ip, #8
        vpush {d8-d15}
        bl put_h264_qpel8_h_lowpass_neon
        mov r4, r0
        ldrd r0, [r11], #8
        sub r1, r1, r2, lsl #1
        sub r1, r1, #2
        mov r3, r2
        sub r2, r4, #64
        bl \type\()_h264_qpel8_hv_lowpass_l2_neon
        vpop {d8-d15}
        mov sp, r11
        pop {r4, r10, r11, pc}
endfunc

function ff_\type\()_h264_qpel8_mc31_neon, export=1
        add r1, r1, #1
        push {r0, r1, r11, lr}
        sub r1, r1, #1
        b \type\()_h264_qpel8_mc11
endfunc

function ff_\type\()_h264_qpel8_mc02_neon, export=1
        push {lr}
        lowpass_const r3
        sub r1, r1, r2, lsl #1
        mov r3, r2
        vpush {d8-d15}
        bl \type\()_h264_qpel8_v_lowpass_neon
        vpop {d8-d15}
        pop {pc}
endfunc

function ff_\type\()_h264_qpel8_mc12_neon, export=1
        push {r0, r1, r4, r10, r11, lr}
\type\()_h264_qpel8_mc12:
        lowpass_const r3
        mov r11, sp
A       bic sp, sp, #15
T       bic r0, r11, #15
T       mov sp, r0
        sub sp, sp, #(8*8+16*12)
        sub r1, r1, r2, lsl #1
        mov r3, r2
        mov r2, #8
        mov r0, sp
        vpush {d8-d15}
        bl put_h264_qpel8_v_lowpass_neon
        mov r4, r0
        ldrd r0, [r11], #8
        sub r1, r1, r3, lsl #1
        sub r1, r1, #2
        sub r2, r4, #64
        bl \type\()_h264_qpel8_hv_lowpass_l2_neon
        vpop {d8-d15}
        mov sp, r11
        pop {r4, r10, r11, pc}
endfunc

function ff_\type\()_h264_qpel8_mc22_neon, export=1
        push {r4, r10, r11, lr}
        mov r11, sp
A       bic sp, sp, #15
T       bic r4, r11, #15
T       mov sp, r4
        sub r1, r1, r2, lsl #1
        sub r1, r1, #2
        mov r3, r2
        sub sp, sp, #(16*12)
        mov r4, sp
        vpush {d8-d15}
        bl \type\()_h264_qpel8_hv_lowpass_neon
        vpop {d8-d15}
        mov sp, r11
        pop {r4, r10, r11, pc}
endfunc

function ff_\type\()_h264_qpel8_mc32_neon, export=1
        push {r0, r1, r4, r10, r11, lr}
        add r1, r1, #1
        b \type\()_h264_qpel8_mc12
endfunc

function ff_\type\()_h264_qpel8_mc03_neon, export=1
        push {lr}
        add ip, r1, r2
        b \type\()_h264_qpel8_mc01
endfunc

function ff_\type\()_h264_qpel8_mc13_neon, export=1
        push {r0, r1, r11, lr}
        add r1, r1, r2
        b \type\()_h264_qpel8_mc11
endfunc

function ff_\type\()_h264_qpel8_mc23_neon, export=1
        push {r0, r1, r4, r10, r11, lr}
        add r1, r1, r2
        b \type\()_h264_qpel8_mc21
endfunc

function ff_\type\()_h264_qpel8_mc33_neon, export=1
        add r1, r1, #1
        push {r0, r1, r11, lr}
        add r1, r1, r2
        sub r1, r1, #1
        b \type\()_h264_qpel8_mc11
endfunc
        .endm

        h264_qpel8 put
        h264_qpel8 avg
        .macro h264_qpel16 type
function ff_\type\()_h264_qpel16_mc10_neon, export=1
        lowpass_const r3
        mov r3, r1
        sub r1, r1, #2
        b \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc

function ff_\type\()_h264_qpel16_mc20_neon, export=1
        lowpass_const r3
        sub r1, r1, #2
        mov r3, r2
        b \type\()_h264_qpel16_h_lowpass_neon
endfunc

function ff_\type\()_h264_qpel16_mc30_neon, export=1
        lowpass_const r3
        add r3, r1, #1
        sub r1, r1, #2
        b \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc

function ff_\type\()_h264_qpel16_mc01_neon, export=1
        push {r4, lr}
        mov ip, r1
\type\()_h264_qpel16_mc01:
        lowpass_const r3
        mov r3, r2
        sub r1, r1, r2, lsl #1
        vpush {d8-d15}
        bl \type\()_h264_qpel16_v_lowpass_l2_neon
        vpop {d8-d15}
        pop {r4, pc}
endfunc

function ff_\type\()_h264_qpel16_mc11_neon, export=1
        push {r0, r1, r4, r11, lr}
\type\()_h264_qpel16_mc11:
        lowpass_const r3
        mov r11, sp
A       bic sp, sp, #15
T       bic r0, r11, #15
T       mov sp, r0
        sub sp, sp, #256
        mov r0, sp
        sub r1, r1, #2
        mov r3, #16
        vpush {d8-d15}
        bl put_h264_qpel16_h_lowpass_neon
        ldrd r0, [r11], #8
        mov r3, r2
        add ip, sp, #64
        sub r1, r1, r2, lsl #1
        mov r2, #16
        bl \type\()_h264_qpel16_v_lowpass_l2_neon
        vpop {d8-d15}
        mov sp, r11
        pop {r4, r11, pc}
endfunc

function ff_\type\()_h264_qpel16_mc21_neon, export=1
        push {r0, r1, r4-r5, r9-r11, lr}
\type\()_h264_qpel16_mc21:
        lowpass_const r3
        mov r11, sp
A       bic sp, sp, #15
T       bic r0, r11, #15
T       mov sp, r0
        sub sp, sp, #(16*16+16*12)
        sub r1, r1, #2
        mov r0, sp
        vpush {d8-d15}
        bl put_h264_qpel16_h_lowpass_neon_packed
        mov r4, r0
        ldrd r0, [r11], #8
        sub r1, r1, r2, lsl #1
        sub r1, r1, #2
        mov r3, r2
        bl \type\()_h264_qpel16_hv_lowpass_l2_neon
        vpop {d8-d15}
        mov sp, r11
        pop {r4-r5, r9-r11, pc}
endfunc

function ff_\type\()_h264_qpel16_mc31_neon, export=1
        add r1, r1, #1
        push {r0, r1, r4, r11, lr}
        sub r1, r1, #1
        b \type\()_h264_qpel16_mc11
endfunc

function ff_\type\()_h264_qpel16_mc02_neon, export=1
        push {r4, lr}
        lowpass_const r3
        sub r1, r1, r2, lsl #1
        mov r3, r2
        vpush {d8-d15}
        bl \type\()_h264_qpel16_v_lowpass_neon
        vpop {d8-d15}
        pop {r4, pc}
endfunc

function ff_\type\()_h264_qpel16_mc12_neon, export=1
        push {r0, r1, r4-r5, r9-r11, lr}
\type\()_h264_qpel16_mc12:
        lowpass_const r3
        mov r11, sp
A       bic sp, sp, #15
T       bic r0, r11, #15
T       mov sp, r0
        sub sp, sp, #(16*16+16*12)
        sub r1, r1, r2, lsl #1
        mov r0, sp
        mov r3, r2
        vpush {d8-d15}
        bl put_h264_qpel16_v_lowpass_neon_packed
        mov r4, r0
        ldrd r0, [r11], #8
        sub r1, r1, r3, lsl #1
        sub r1, r1, #2
        mov r2, r3
        bl \type\()_h264_qpel16_hv_lowpass_l2_neon
        vpop {d8-d15}
        mov sp, r11
        pop {r4-r5, r9-r11, pc}
endfunc

function ff_\type\()_h264_qpel16_mc22_neon, export=1
        push {r4, r9-r11, lr}
        lowpass_const r3
        mov r11, sp
A       bic sp, sp, #15
T       bic r4, r11, #15
T       mov sp, r4
        sub r1, r1, r2, lsl #1
        sub r1, r1, #2
        mov r3, r2
        sub sp, sp, #(16*12)
        mov r4, sp
        vpush {d8-d15}
        bl \type\()_h264_qpel16_hv_lowpass_neon
        vpop {d8-d15}
        mov sp, r11
        pop {r4, r9-r11, pc}
endfunc

function ff_\type\()_h264_qpel16_mc32_neon, export=1
        push {r0, r1, r4-r5, r9-r11, lr}
        add r1, r1, #1
        b \type\()_h264_qpel16_mc12
endfunc

function ff_\type\()_h264_qpel16_mc03_neon, export=1
        push {r4, lr}
        add ip, r1, r2
        b \type\()_h264_qpel16_mc01
endfunc

function ff_\type\()_h264_qpel16_mc13_neon, export=1
        push {r0, r1, r4, r11, lr}
        add r1, r1, r2
        b \type\()_h264_qpel16_mc11
endfunc

function ff_\type\()_h264_qpel16_mc23_neon, export=1
        push {r0, r1, r4-r5, r9-r11, lr}
        add r1, r1, r2
        b \type\()_h264_qpel16_mc21
endfunc

function ff_\type\()_h264_qpel16_mc33_neon, export=1
        add r1, r1, #1
        push {r0, r1, r4, r11, lr}
        add r1, r1, r2
        sub r1, r1, #1
        b \type\()_h264_qpel16_mc11
endfunc
        .endm

        h264_qpel16 put
        h264_qpel16 avg

@ Biweighted prediction
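@ Explicit biprediction weighting per the spec,
@   dst = clip(((dst*wd + src*ws + rounding) >> (logWD + 1)) + offset)
@ with the rounding term and offset pre-folded into the q8
@ initializer and q9 = ~logWD = -(logWD + 1) driving the arithmetic
@ vshl.s16.  Since the NEON multiplies are unsigned, the entry code
@ inspects the weights' sign bits and dispatches to a vmlal/vmlsl
@ variant (labels 10-40) that negates whichever weights are negative.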
        .macro biweight_16 macs, macd
        vdup.8 d0, r4
        vdup.8 d1, r5
        vmov q2, q8
        vmov q3, q8
1:      subs r3, r3, #2
        vld1.8 {d20-d21},[r0,:128], r2
        \macd q2, d0, d20
        pld [r0]
        \macd q3, d0, d21
        vld1.8 {d22-d23},[r1,:128], r2
        \macs q2, d1, d22
        pld [r1]
        \macs q3, d1, d23
        vmov q12, q8
        vld1.8 {d28-d29},[r0,:128], r2
        vmov q13, q8
        \macd q12, d0, d28
        pld [r0]
        \macd q13, d0, d29
        vld1.8 {d30-d31},[r1,:128], r2
        \macs q12, d1, d30
        pld [r1]
        \macs q13, d1, d31
        vshl.s16 q2, q2, q9
        vshl.s16 q3, q3, q9
        vqmovun.s16 d4, q2
        vqmovun.s16 d5, q3
        vshl.s16 q12, q12, q9
        vshl.s16 q13, q13, q9
        vqmovun.s16 d24, q12
        vqmovun.s16 d25, q13
        vmov q3, q8
        vst1.8 {d4- d5}, [r6,:128], r2
        vmov q2, q8
        vst1.8 {d24-d25},[r6,:128], r2
        bne 1b
        pop {r4-r6, pc}
        .endm

        .macro biweight_8 macs, macd
        vdup.8 d0, r4
        vdup.8 d1, r5
        vmov q1, q8
        vmov q10, q8
1:      subs r3, r3, #2
        vld1.8 {d4},[r0,:64], r2
        \macd q1, d0, d4
        pld [r0]
        vld1.8 {d5},[r1,:64], r2
        \macs q1, d1, d5
        pld [r1]
        vld1.8 {d6},[r0,:64], r2
        \macd q10, d0, d6
        pld [r0]
        vld1.8 {d7},[r1,:64], r2
        \macs q10, d1, d7
        pld [r1]
        vshl.s16 q1, q1, q9
        vqmovun.s16 d2, q1
        vshl.s16 q10, q10, q9
        vqmovun.s16 d4, q10
        vmov q10, q8
        vst1.8 {d2},[r6,:64], r2
        vmov q1, q8
        vst1.8 {d4},[r6,:64], r2
        bne 1b
        pop {r4-r6, pc}
        .endm

        .macro biweight_4 macs, macd
        vdup.8 d0, r4
        vdup.8 d1, r5
        vmov q1, q8
        vmov q10, q8
1:      subs r3, r3, #4
        vld1.32 {d4[0]},[r0,:32], r2
        vld1.32 {d4[1]},[r0,:32], r2
        \macd q1, d0, d4
        pld [r0]
        vld1.32 {d5[0]},[r1,:32], r2
        vld1.32 {d5[1]},[r1,:32], r2
        \macs q1, d1, d5
        pld [r1]
        blt 2f
        vld1.32 {d6[0]},[r0,:32], r2
        vld1.32 {d6[1]},[r0,:32], r2
        \macd q10, d0, d6
        pld [r0]
        vld1.32 {d7[0]},[r1,:32], r2
        vld1.32 {d7[1]},[r1,:32], r2
        \macs q10, d1, d7
        pld [r1]
        vshl.s16 q1, q1, q9
        vqmovun.s16 d2, q1
        vshl.s16 q10, q10, q9
        vqmovun.s16 d4, q10
        vmov q10, q8
        vst1.32 {d2[0]},[r6,:32], r2
        vst1.32 {d2[1]},[r6,:32], r2
        vmov q1, q8
        vst1.32 {d4[0]},[r6,:32], r2
        vst1.32 {d4[1]},[r6,:32], r2
        bne 1b
        pop {r4-r6, pc}
2:      vshl.s16 q1, q1, q9
        vqmovun.s16 d2, q1
        vst1.32 {d2[0]},[r6,:32], r2
        vst1.32 {d2[1]},[r6,:32], r2
        pop {r4-r6, pc}
        .endm

        .macro biweight_func w
function ff_biweight_h264_pixels_\w\()_neon, export=1
        push {r4-r6, lr}
        ldr r12, [sp, #16]
        add r4, sp, #20
        ldm r4, {r4-r6}
        lsr lr, r4, #31
        add r6, r6, #1
        eors lr, lr, r5, lsr #30
        orr r6, r6, #1
        vdup.16 q9, r12
        lsl r6, r6, r12
        vmvn q9, q9
        vdup.16 q8, r6
        mov r6, r0
        beq 10f
        subs lr, lr, #1
        beq 20f
        subs lr, lr, #1
        beq 30f
        b 40f
10:     biweight_\w vmlal.u8, vmlal.u8
20:     rsb r4, r4, #0
        biweight_\w vmlal.u8, vmlsl.u8
30:     rsb r4, r4, #0
        rsb r5, r5, #0
        biweight_\w vmlsl.u8, vmlsl.u8
40:     rsb r5, r5, #0
        biweight_\w vmlsl.u8, vmlal.u8
endfunc
        .endm

        biweight_func 16
        biweight_func 8
        biweight_func 4

@ Weighted prediction
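@ Unidirectional weighting,
@   dst = clip(((src*w + 2^(logWD-1)) >> logWD) + offset)
@ with q8 holding offset << logWD and q9 the (negative) shift count.
@ The dispatcher picks a halving path (vhadd/vhsub) for larger
@ denominators to keep the 16-bit accumulators from overflowing, or
@ plain vadd/vsub otherwise, and the branch on the weight's sign
@ selects add versus subtract, again because the multiplies
@ themselves are unsigned.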
        .macro weight_16 add
        vdup.8 d0, r12
1:      subs r2, r2, #2
        vld1.8 {d20-d21},[r0,:128], r1
        vmull.u8 q2, d0, d20
        pld [r0]
        vmull.u8 q3, d0, d21
        vld1.8 {d28-d29},[r0,:128], r1
        vmull.u8 q12, d0, d28
        pld [r0]
        vmull.u8 q13, d0, d29
        \add q2, q8, q2
        vrshl.s16 q2, q2, q9
        \add q3, q8, q3
        vrshl.s16 q3, q3, q9
        vqmovun.s16 d4, q2
        vqmovun.s16 d5, q3
        \add q12, q8, q12
        vrshl.s16 q12, q12, q9
        \add q13, q8, q13
        vrshl.s16 q13, q13, q9
        vqmovun.s16 d24, q12
        vqmovun.s16 d25, q13
        vst1.8 {d4- d5}, [r4,:128], r1
        vst1.8 {d24-d25},[r4,:128], r1
        bne 1b
        pop {r4, pc}
        .endm

        .macro weight_8 add
        vdup.8 d0, r12
1:      subs r2, r2, #2
        vld1.8 {d4},[r0,:64], r1
        vmull.u8 q1, d0, d4
        pld [r0]
        vld1.8 {d6},[r0,:64], r1
        vmull.u8 q10, d0, d6
        \add q1, q8, q1
        pld [r0]
        vrshl.s16 q1, q1, q9
        vqmovun.s16 d2, q1
        \add q10, q8, q10
        vrshl.s16 q10, q10, q9
        vqmovun.s16 d4, q10
        vst1.8 {d2},[r4,:64], r1
        vst1.8 {d4},[r4,:64], r1
        bne 1b
        pop {r4, pc}
        .endm

        .macro weight_4 add
        vdup.8 d0, r12
        vmov q1, q8
        vmov q10, q8
1:      subs r2, r2, #4
        vld1.32 {d4[0]},[r0,:32], r1
        vld1.32 {d4[1]},[r0,:32], r1
        vmull.u8 q1, d0, d4
        pld [r0]
        blt 2f
        vld1.32 {d6[0]},[r0,:32], r1
        vld1.32 {d6[1]},[r0,:32], r1
        vmull.u8 q10, d0, d6
        pld [r0]
        \add q1, q8, q1
        vrshl.s16 q1, q1, q9
        vqmovun.s16 d2, q1
        \add q10, q8, q10
        vrshl.s16 q10, q10, q9
        vqmovun.s16 d4, q10
        vmov q10, q8
        vst1.32 {d2[0]},[r4,:32], r1
        vst1.32 {d2[1]},[r4,:32], r1
        vmov q1, q8
        vst1.32 {d4[0]},[r4,:32], r1
        vst1.32 {d4[1]},[r4,:32], r1
        bne 1b
        pop {r4, pc}
2:      \add q1, q8, q1
        vrshl.s16 q1, q1, q9
        vqmovun.s16 d2, q1
        vst1.32 {d2[0]},[r4,:32], r1
        vst1.32 {d2[1]},[r4,:32], r1
        pop {r4, pc}
        .endm

        .macro weight_func w
function ff_weight_h264_pixels_\w\()_neon, export=1
        push {r4, lr}
        ldr r12, [sp, #8]
        ldr r4, [sp, #12]
        cmp r3, #1
        lsl r4, r4, r3
        vdup.16 q8, r4
        mov r4, r0
        ble 20f
        rsb lr, r3, #1
        vdup.16 q9, lr
        cmp r12, #0
        blt 10f
        weight_\w vhadd.s16
10:     rsb r12, r12, #0
        weight_\w vhsub.s16
20:     rsb lr, r3, #0
        vdup.16 q9, lr
        cmp r12, #0
        blt 10f
        weight_\w vadd.s16
10:     rsb r12, r12, #0
        weight_\w vsub.s16
endfunc
        .endm

        weight_func 16
        weight_func 8
        weight_func 4