/* NOTE(review): removed web-viewer page chrome that was scraped along with
 * this file (repository topic hint and the "784 lines / 28KB" size banner);
 * it was never part of the source. */
  1. /*
  2. * ARM NEON optimised DSP functions
  3. * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  4. *
  5. * This file is part of FFmpeg.
  6. *
  7. * FFmpeg is free software; you can redistribute it and/or
  8. * modify it under the terms of the GNU Lesser General Public
  9. * License as published by the Free Software Foundation; either
  10. * version 2.1 of the License, or (at your option) any later version.
  11. *
  12. * FFmpeg is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. * Lesser General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU Lesser General Public
  18. * License along with FFmpeg; if not, write to the Free Software
  19. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. */
  21. #include "asm.S"
  22. preserve8
  23. .fpu neon
  24. .text
  25. .macro pixels16 avg=0
  26. .if \avg
  27. mov ip, r0
  28. .endif
  29. 1: vld1.64 {d0, d1}, [r1], r2
  30. vld1.64 {d2, d3}, [r1], r2
  31. vld1.64 {d4, d5}, [r1], r2
  32. pld [r1, r2, lsl #2]
  33. vld1.64 {d6, d7}, [r1], r2
  34. pld [r1]
  35. pld [r1, r2]
  36. pld [r1, r2, lsl #1]
  37. .if \avg
  38. vld1.64 {d16,d17}, [ip,:128], r2
  39. vrhadd.u8 q0, q0, q8
  40. vld1.64 {d18,d19}, [ip,:128], r2
  41. vrhadd.u8 q1, q1, q9
  42. vld1.64 {d20,d21}, [ip,:128], r2
  43. vrhadd.u8 q2, q2, q10
  44. vld1.64 {d22,d23}, [ip,:128], r2
  45. vrhadd.u8 q3, q3, q11
  46. .endif
  47. subs r3, r3, #4
  48. vst1.64 {d0, d1}, [r0,:128], r2
  49. vst1.64 {d2, d3}, [r0,:128], r2
  50. vst1.64 {d4, d5}, [r0,:128], r2
  51. vst1.64 {d6, d7}, [r0,:128], r2
  52. bne 1b
  53. bx lr
  54. .endm
  55. .macro pixels16_x2 vhadd=vrhadd.u8
  56. 1: vld1.64 {d0-d2}, [r1], r2
  57. vld1.64 {d4-d6}, [r1], r2
  58. pld [r1]
  59. pld [r1, r2]
  60. subs r3, r3, #2
  61. vext.8 q1, q0, q1, #1
  62. \vhadd q0, q0, q1
  63. vext.8 q3, q2, q3, #1
  64. \vhadd q2, q2, q3
  65. vst1.64 {d0, d1}, [r0,:128], r2
  66. vst1.64 {d4, d5}, [r0,:128], r2
  67. bne 1b
  68. bx lr
  69. .endm
  70. .macro pixels16_y2 vhadd=vrhadd.u8
  71. push {lr}
  72. add ip, r1, r2
  73. lsl lr, r2, #1
  74. vld1.64 {d0, d1}, [r1], lr
  75. vld1.64 {d2, d3}, [ip], lr
  76. 1: subs r3, r3, #2
  77. \vhadd q2, q0, q1
  78. vld1.64 {d0, d1}, [r1], lr
  79. \vhadd q3, q0, q1
  80. vld1.64 {d2, d3}, [ip], lr
  81. pld [r1]
  82. pld [ip]
  83. vst1.64 {d4, d5}, [r0,:128], r2
  84. vst1.64 {d6, d7}, [r0,:128], r2
  85. bne 1b
  86. pop {pc}
  87. .endm
  88. .macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0
  89. push {lr}
  90. lsl lr, r2, #1
  91. add ip, r1, r2
  92. vld1.64 {d0-d2}, [r1], lr
  93. vld1.64 {d4-d6}, [ip], lr
  94. .if \no_rnd
  95. vmov.i16 q13, #1
  96. .endif
  97. pld [r1]
  98. pld [ip]
  99. vext.8 q1, q0, q1, #1
  100. vext.8 q3, q2, q3, #1
  101. vaddl.u8 q8, d0, d2
  102. vaddl.u8 q10, d1, d3
  103. vaddl.u8 q9, d4, d6
  104. vaddl.u8 q11, d5, d7
  105. 1: subs r3, r3, #2
  106. vld1.64 {d0-d2}, [r1], lr
  107. vadd.u16 q12, q8, q9
  108. pld [r1]
  109. .if \no_rnd
  110. vadd.u16 q12, q12, q13
  111. .endif
  112. vext.8 q15, q0, q1, #1
  113. vadd.u16 q1 , q10, q11
  114. \vshrn d28, q12, #2
  115. .if \no_rnd
  116. vadd.u16 q1, q1, q13
  117. .endif
  118. \vshrn d29, q1, #2
  119. vaddl.u8 q8, d0, d30
  120. vld1.64 {d2-d4}, [ip], lr
  121. vaddl.u8 q10, d1, d31
  122. vst1.64 {d28,d29}, [r0,:128], r2
  123. vadd.u16 q12, q8, q9
  124. pld [ip]
  125. .if \no_rnd
  126. vadd.u16 q12, q12, q13
  127. .endif
  128. vext.8 q2, q1, q2, #1
  129. vadd.u16 q0, q10, q11
  130. \vshrn d30, q12, #2
  131. .if \no_rnd
  132. vadd.u16 q0, q0, q13
  133. .endif
  134. \vshrn d31, q0, #2
  135. vaddl.u8 q9, d2, d4
  136. vaddl.u8 q11, d3, d5
  137. vst1.64 {d30,d31}, [r0,:128], r2
  138. bgt 1b
  139. pop {pc}
  140. .endm
  141. .macro pixels8
  142. 1: vld1.64 {d0}, [r1], r2
  143. vld1.64 {d1}, [r1], r2
  144. vld1.64 {d2}, [r1], r2
  145. pld [r1, r2, lsl #2]
  146. vld1.64 {d3}, [r1], r2
  147. pld [r1]
  148. pld [r1, r2]
  149. pld [r1, r2, lsl #1]
  150. subs r3, r3, #4
  151. vst1.64 {d0}, [r0,:64], r2
  152. vst1.64 {d1}, [r0,:64], r2
  153. vst1.64 {d2}, [r0,:64], r2
  154. vst1.64 {d3}, [r0,:64], r2
  155. bne 1b
  156. bx lr
  157. .endm
  158. .macro pixels8_x2 vhadd=vrhadd.u8
  159. 1: vld1.64 {d0, d1}, [r1], r2
  160. vext.8 d1, d0, d1, #1
  161. vld1.64 {d2, d3}, [r1], r2
  162. vext.8 d3, d2, d3, #1
  163. pld [r1]
  164. pld [r1, r2]
  165. subs r3, r3, #2
  166. vswp d1, d2
  167. \vhadd q0, q0, q1
  168. vst1.64 {d0}, [r0,:64], r2
  169. vst1.64 {d1}, [r0,:64], r2
  170. bne 1b
  171. bx lr
  172. .endm
  173. .macro pixels8_y2 vhadd=vrhadd.u8
  174. push {lr}
  175. add ip, r1, r2
  176. lsl lr, r2, #1
  177. vld1.64 {d0}, [r1], lr
  178. vld1.64 {d1}, [ip], lr
  179. 1: subs r3, r3, #2
  180. \vhadd d4, d0, d1
  181. vld1.64 {d0}, [r1], lr
  182. \vhadd d5, d0, d1
  183. vld1.64 {d1}, [ip], lr
  184. pld [r1]
  185. pld [ip]
  186. vst1.64 {d4}, [r0,:64], r2
  187. vst1.64 {d5}, [r0,:64], r2
  188. bne 1b
  189. pop {pc}
  190. .endm
  191. .macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0
  192. push {lr}
  193. lsl lr, r2, #1
  194. add ip, r1, r2
  195. vld1.64 {d0, d1}, [r1], lr
  196. vld1.64 {d2, d3}, [ip], lr
  197. .if \no_rnd
  198. vmov.i16 q11, #1
  199. .endif
  200. pld [r1]
  201. pld [ip]
  202. vext.8 d4, d0, d1, #1
  203. vext.8 d6, d2, d3, #1
  204. vaddl.u8 q8, d0, d4
  205. vaddl.u8 q9, d2, d6
  206. 1: subs r3, r3, #2
  207. vld1.64 {d0, d1}, [r1], lr
  208. pld [r1]
  209. vadd.u16 q10, q8, q9
  210. vext.8 d4, d0, d1, #1
  211. .if \no_rnd
  212. vadd.u16 q10, q10, q11
  213. .endif
  214. vaddl.u8 q8, d0, d4
  215. \vshrn d5, q10, #2
  216. vld1.64 {d2, d3}, [ip], lr
  217. vadd.u16 q10, q8, q9
  218. pld [ip]
  219. .if \no_rnd
  220. vadd.u16 q10, q10, q11
  221. .endif
  222. vst1.64 {d5}, [r0,:64], r2
  223. \vshrn d7, q10, #2
  224. vext.8 d6, d2, d3, #1
  225. vaddl.u8 q9, d2, d6
  226. vst1.64 {d7}, [r0,:64], r2
  227. bgt 1b
  228. pop {pc}
  229. .endm
  230. .macro pixfunc pfx name suf rnd_op args:vararg
  231. function ff_\pfx\name\suf\()_neon, export=1
  232. \name \rnd_op \args
  233. .endfunc
  234. .endm
  235. .macro pixfunc2 pfx name args:vararg
  236. pixfunc \pfx \name
  237. pixfunc \pfx \name \args
  238. .endm
  239. function ff_put_h264_qpel16_mc00_neon, export=1
  240. mov r3, #16
  241. .endfunc
  242. pixfunc put_ pixels16
  243. pixfunc2 put_ pixels16_x2, _no_rnd, vhadd.u8
  244. pixfunc2 put_ pixels16_y2, _no_rnd, vhadd.u8
  245. pixfunc2 put_ pixels16_xy2, _no_rnd, vshrn.u16, 1
  246. function ff_avg_h264_qpel16_mc00_neon, export=1
  247. mov r3, #16
  248. .endfunc
  249. pixfunc avg_ pixels16,, 1
  250. function ff_put_h264_qpel8_mc00_neon, export=1
  251. mov r3, #8
  252. .endfunc
  253. pixfunc put_ pixels8
  254. pixfunc2 put_ pixels8_x2, _no_rnd, vhadd.u8
  255. pixfunc2 put_ pixels8_y2, _no_rnd, vhadd.u8
  256. pixfunc2 put_ pixels8_xy2, _no_rnd, vshrn.u16, 1
  257. function ff_put_signed_pixels_clamped_neon, export=1
  258. vmov.u8 d31, #128
  259. vld1.64 {d16-d17}, [r0,:128]!
  260. vqmovn.s16 d0, q8
  261. vld1.64 {d18-d19}, [r0,:128]!
  262. vqmovn.s16 d1, q9
  263. vld1.64 {d16-d17}, [r0,:128]!
  264. vqmovn.s16 d2, q8
  265. vld1.64 {d18-d19}, [r0,:128]!
  266. vadd.u8 d0, d0, d31
  267. vld1.64 {d20-d21}, [r0,:128]!
  268. vadd.u8 d1, d1, d31
  269. vld1.64 {d22-d23}, [r0,:128]!
  270. vadd.u8 d2, d2, d31
  271. vst1.64 {d0}, [r1,:64], r2
  272. vqmovn.s16 d3, q9
  273. vst1.64 {d1}, [r1,:64], r2
  274. vqmovn.s16 d4, q10
  275. vst1.64 {d2}, [r1,:64], r2
  276. vqmovn.s16 d5, q11
  277. vld1.64 {d24-d25}, [r0,:128]!
  278. vadd.u8 d3, d3, d31
  279. vld1.64 {d26-d27}, [r0,:128]!
  280. vadd.u8 d4, d4, d31
  281. vadd.u8 d5, d5, d31
  282. vst1.64 {d3}, [r1,:64], r2
  283. vqmovn.s16 d6, q12
  284. vst1.64 {d4}, [r1,:64], r2
  285. vqmovn.s16 d7, q13
  286. vst1.64 {d5}, [r1,:64], r2
  287. vadd.u8 d6, d6, d31
  288. vadd.u8 d7, d7, d31
  289. vst1.64 {d6}, [r1,:64], r2
  290. vst1.64 {d7}, [r1,:64], r2
  291. bx lr
  292. .endfunc
  293. function ff_add_pixels_clamped_neon, export=1
  294. mov r3, r1
  295. vld1.64 {d16}, [r1,:64], r2
  296. vld1.64 {d0-d1}, [r0,:128]!
  297. vaddw.u8 q0, q0, d16
  298. vld1.64 {d17}, [r1,:64], r2
  299. vld1.64 {d2-d3}, [r0,:128]!
  300. vqmovun.s16 d0, q0
  301. vld1.64 {d18}, [r1,:64], r2
  302. vaddw.u8 q1, q1, d17
  303. vld1.64 {d4-d5}, [r0,:128]!
  304. vaddw.u8 q2, q2, d18
  305. vst1.64 {d0}, [r3,:64], r2
  306. vqmovun.s16 d2, q1
  307. vld1.64 {d19}, [r1,:64], r2
  308. vld1.64 {d6-d7}, [r0,:128]!
  309. vaddw.u8 q3, q3, d19
  310. vqmovun.s16 d4, q2
  311. vst1.64 {d2}, [r3,:64], r2
  312. vld1.64 {d16}, [r1,:64], r2
  313. vqmovun.s16 d6, q3
  314. vld1.64 {d0-d1}, [r0,:128]!
  315. vaddw.u8 q0, q0, d16
  316. vst1.64 {d4}, [r3,:64], r2
  317. vld1.64 {d17}, [r1,:64], r2
  318. vld1.64 {d2-d3}, [r0,:128]!
  319. vaddw.u8 q1, q1, d17
  320. vst1.64 {d6}, [r3,:64], r2
  321. vqmovun.s16 d0, q0
  322. vld1.64 {d18}, [r1,:64], r2
  323. vld1.64 {d4-d5}, [r0,:128]!
  324. vaddw.u8 q2, q2, d18
  325. vst1.64 {d0}, [r3,:64], r2
  326. vqmovun.s16 d2, q1
  327. vld1.64 {d19}, [r1,:64], r2
  328. vqmovun.s16 d4, q2
  329. vld1.64 {d6-d7}, [r0,:128]!
  330. vaddw.u8 q3, q3, d19
  331. vst1.64 {d2}, [r3,:64], r2
  332. vqmovun.s16 d6, q3
  333. vst1.64 {d4}, [r3,:64], r2
  334. vst1.64 {d6}, [r3,:64], r2
  335. bx lr
  336. .endfunc
  337. function ff_float_to_int16_neon, export=1
  338. subs r2, r2, #8
  339. vld1.64 {d0-d1}, [r1,:128]!
  340. vcvt.s32.f32 q8, q0, #16
  341. vld1.64 {d2-d3}, [r1,:128]!
  342. vcvt.s32.f32 q9, q1, #16
  343. beq 3f
  344. bics ip, r2, #15
  345. beq 2f
  346. 1: subs ip, ip, #16
  347. vshrn.s32 d4, q8, #16
  348. vld1.64 {d0-d1}, [r1,:128]!
  349. vcvt.s32.f32 q0, q0, #16
  350. vshrn.s32 d5, q9, #16
  351. vld1.64 {d2-d3}, [r1,:128]!
  352. vcvt.s32.f32 q1, q1, #16
  353. vshrn.s32 d6, q0, #16
  354. vst1.64 {d4-d5}, [r0,:128]!
  355. vshrn.s32 d7, q1, #16
  356. vld1.64 {d16-d17},[r1,:128]!
  357. vcvt.s32.f32 q8, q8, #16
  358. vld1.64 {d18-d19},[r1,:128]!
  359. vcvt.s32.f32 q9, q9, #16
  360. vst1.64 {d6-d7}, [r0,:128]!
  361. bne 1b
  362. ands r2, r2, #15
  363. beq 3f
  364. 2: vld1.64 {d0-d1}, [r1,:128]!
  365. vshrn.s32 d4, q8, #16
  366. vcvt.s32.f32 q0, q0, #16
  367. vld1.64 {d2-d3}, [r1,:128]!
  368. vshrn.s32 d5, q9, #16
  369. vcvt.s32.f32 q1, q1, #16
  370. vshrn.s32 d6, q0, #16
  371. vst1.64 {d4-d5}, [r0,:128]!
  372. vshrn.s32 d7, q1, #16
  373. vst1.64 {d6-d7}, [r0,:128]!
  374. bx lr
  375. 3: vshrn.s32 d4, q8, #16
  376. vshrn.s32 d5, q9, #16
  377. vst1.64 {d4-d5}, [r0,:128]!
  378. bx lr
  379. .endfunc
  380. function ff_float_to_int16_interleave_neon, export=1
  381. cmp r3, #2
  382. ldrlt r1, [r1]
  383. blt ff_float_to_int16_neon
  384. bne 4f
  385. ldr r3, [r1]
  386. ldr r1, [r1, #4]
  387. subs r2, r2, #8
  388. vld1.64 {d0-d1}, [r3,:128]!
  389. vcvt.s32.f32 q8, q0, #16
  390. vld1.64 {d2-d3}, [r3,:128]!
  391. vcvt.s32.f32 q9, q1, #16
  392. vld1.64 {d20-d21},[r1,:128]!
  393. vcvt.s32.f32 q10, q10, #16
  394. vld1.64 {d22-d23},[r1,:128]!
  395. vcvt.s32.f32 q11, q11, #16
  396. beq 3f
  397. bics ip, r2, #15
  398. beq 2f
  399. 1: subs ip, ip, #16
  400. vld1.64 {d0-d1}, [r3,:128]!
  401. vcvt.s32.f32 q0, q0, #16
  402. vsri.32 q10, q8, #16
  403. vld1.64 {d2-d3}, [r3,:128]!
  404. vcvt.s32.f32 q1, q1, #16
  405. vld1.64 {d24-d25},[r1,:128]!
  406. vcvt.s32.f32 q12, q12, #16
  407. vld1.64 {d26-d27},[r1,:128]!
  408. vsri.32 q11, q9, #16
  409. vst1.64 {d20-d21},[r0,:128]!
  410. vcvt.s32.f32 q13, q13, #16
  411. vst1.64 {d22-d23},[r0,:128]!
  412. vsri.32 q12, q0, #16
  413. vld1.64 {d16-d17},[r3,:128]!
  414. vsri.32 q13, q1, #16
  415. vst1.64 {d24-d25},[r0,:128]!
  416. vcvt.s32.f32 q8, q8, #16
  417. vld1.64 {d18-d19},[r3,:128]!
  418. vcvt.s32.f32 q9, q9, #16
  419. vld1.64 {d20-d21},[r1,:128]!
  420. vcvt.s32.f32 q10, q10, #16
  421. vld1.64 {d22-d23},[r1,:128]!
  422. vcvt.s32.f32 q11, q11, #16
  423. vst1.64 {d26-d27},[r0,:128]!
  424. bne 1b
  425. ands r2, r2, #15
  426. beq 3f
  427. 2: vsri.32 q10, q8, #16
  428. vld1.64 {d0-d1}, [r3,:128]!
  429. vcvt.s32.f32 q0, q0, #16
  430. vld1.64 {d2-d3}, [r3,:128]!
  431. vcvt.s32.f32 q1, q1, #16
  432. vld1.64 {d24-d25},[r1,:128]!
  433. vcvt.s32.f32 q12, q12, #16
  434. vsri.32 q11, q9, #16
  435. vld1.64 {d26-d27},[r1,:128]!
  436. vcvt.s32.f32 q13, q13, #16
  437. vst1.64 {d20-d21},[r0,:128]!
  438. vsri.32 q12, q0, #16
  439. vst1.64 {d22-d23},[r0,:128]!
  440. vsri.32 q13, q1, #16
  441. vst1.64 {d24-d27},[r0,:128]!
  442. bx lr
  443. 3: vsri.32 q10, q8, #16
  444. vsri.32 q11, q9, #16
  445. vst1.64 {d20-d23},[r0,:128]!
  446. bx lr
  447. 4: push {r4-r8,lr}
  448. cmp r3, #4
  449. lsl ip, r3, #1
  450. blt 4f
  451. @ 4 channels
  452. 5: ldmia r1!, {r4-r7}
  453. mov lr, r2
  454. mov r8, r0
  455. vld1.64 {d16-d17},[r4,:128]!
  456. vcvt.s32.f32 q8, q8, #16
  457. vld1.64 {d18-d19},[r5,:128]!
  458. vcvt.s32.f32 q9, q9, #16
  459. vld1.64 {d20-d21},[r6,:128]!
  460. vcvt.s32.f32 q10, q10, #16
  461. vld1.64 {d22-d23},[r7,:128]!
  462. vcvt.s32.f32 q11, q11, #16
  463. 6: subs lr, lr, #8
  464. vld1.64 {d0-d1}, [r4,:128]!
  465. vcvt.s32.f32 q0, q0, #16
  466. vsri.32 q9, q8, #16
  467. vld1.64 {d2-d3}, [r5,:128]!
  468. vcvt.s32.f32 q1, q1, #16
  469. vsri.32 q11, q10, #16
  470. vld1.64 {d4-d5}, [r6,:128]!
  471. vcvt.s32.f32 q2, q2, #16
  472. vzip.32 d18, d22
  473. vld1.64 {d6-d7}, [r7,:128]!
  474. vcvt.s32.f32 q3, q3, #16
  475. vzip.32 d19, d23
  476. vst1.64 {d18}, [r8], ip
  477. vsri.32 q1, q0, #16
  478. vst1.64 {d22}, [r8], ip
  479. vsri.32 q3, q2, #16
  480. vst1.64 {d19}, [r8], ip
  481. vzip.32 d2, d6
  482. vst1.64 {d23}, [r8], ip
  483. vzip.32 d3, d7
  484. beq 7f
  485. vld1.64 {d16-d17},[r4,:128]!
  486. vcvt.s32.f32 q8, q8, #16
  487. vst1.64 {d2}, [r8], ip
  488. vld1.64 {d18-d19},[r5,:128]!
  489. vcvt.s32.f32 q9, q9, #16
  490. vst1.64 {d6}, [r8], ip
  491. vld1.64 {d20-d21},[r6,:128]!
  492. vcvt.s32.f32 q10, q10, #16
  493. vst1.64 {d3}, [r8], ip
  494. vld1.64 {d22-d23},[r7,:128]!
  495. vcvt.s32.f32 q11, q11, #16
  496. vst1.64 {d7}, [r8], ip
  497. b 6b
  498. 7: vst1.64 {d2}, [r8], ip
  499. vst1.64 {d6}, [r8], ip
  500. vst1.64 {d3}, [r8], ip
  501. vst1.64 {d7}, [r8], ip
  502. subs r3, r3, #4
  503. popeq {r4-r8,pc}
  504. cmp r3, #4
  505. add r0, r0, #8
  506. bge 5b
  507. @ 2 channels
  508. 4: cmp r3, #2
  509. blt 4f
  510. ldmia r1!, {r4-r5}
  511. mov lr, r2
  512. mov r8, r0
  513. tst lr, #8
  514. vld1.64 {d16-d17},[r4,:128]!
  515. vcvt.s32.f32 q8, q8, #16
  516. vld1.64 {d18-d19},[r5,:128]!
  517. vcvt.s32.f32 q9, q9, #16
  518. vld1.64 {d20-d21},[r4,:128]!
  519. vcvt.s32.f32 q10, q10, #16
  520. vld1.64 {d22-d23},[r5,:128]!
  521. vcvt.s32.f32 q11, q11, #16
  522. beq 6f
  523. subs lr, lr, #8
  524. beq 7f
  525. vsri.32 d18, d16, #16
  526. vsri.32 d19, d17, #16
  527. vld1.64 {d16-d17},[r4,:128]!
  528. vcvt.s32.f32 q8, q8, #16
  529. vst1.32 {d18[0]}, [r8], ip
  530. vsri.32 d22, d20, #16
  531. vst1.32 {d18[1]}, [r8], ip
  532. vsri.32 d23, d21, #16
  533. vst1.32 {d19[0]}, [r8], ip
  534. vst1.32 {d19[1]}, [r8], ip
  535. vld1.64 {d18-d19},[r5,:128]!
  536. vcvt.s32.f32 q9, q9, #16
  537. vst1.32 {d22[0]}, [r8], ip
  538. vst1.32 {d22[1]}, [r8], ip
  539. vld1.64 {d20-d21},[r4,:128]!
  540. vcvt.s32.f32 q10, q10, #16
  541. vst1.32 {d23[0]}, [r8], ip
  542. vst1.32 {d23[1]}, [r8], ip
  543. vld1.64 {d22-d23},[r5,:128]!
  544. vcvt.s32.f32 q11, q11, #16
  545. 6: subs lr, lr, #16
  546. vld1.64 {d0-d1}, [r4,:128]!
  547. vcvt.s32.f32 q0, q0, #16
  548. vsri.32 d18, d16, #16
  549. vld1.64 {d2-d3}, [r5,:128]!
  550. vcvt.s32.f32 q1, q1, #16
  551. vsri.32 d19, d17, #16
  552. vld1.64 {d4-d5}, [r4,:128]!
  553. vcvt.s32.f32 q2, q2, #16
  554. vld1.64 {d6-d7}, [r5,:128]!
  555. vcvt.s32.f32 q3, q3, #16
  556. vst1.32 {d18[0]}, [r8], ip
  557. vsri.32 d22, d20, #16
  558. vst1.32 {d18[1]}, [r8], ip
  559. vsri.32 d23, d21, #16
  560. vst1.32 {d19[0]}, [r8], ip
  561. vsri.32 d2, d0, #16
  562. vst1.32 {d19[1]}, [r8], ip
  563. vsri.32 d3, d1, #16
  564. vst1.32 {d22[0]}, [r8], ip
  565. vsri.32 d6, d4, #16
  566. vst1.32 {d22[1]}, [r8], ip
  567. vsri.32 d7, d5, #16
  568. vst1.32 {d23[0]}, [r8], ip
  569. vst1.32 {d23[1]}, [r8], ip
  570. beq 6f
  571. vld1.64 {d16-d17},[r4,:128]!
  572. vcvt.s32.f32 q8, q8, #16
  573. vst1.32 {d2[0]}, [r8], ip
  574. vst1.32 {d2[1]}, [r8], ip
  575. vld1.64 {d18-d19},[r5,:128]!
  576. vcvt.s32.f32 q9, q9, #16
  577. vst1.32 {d3[0]}, [r8], ip
  578. vst1.32 {d3[1]}, [r8], ip
  579. vld1.64 {d20-d21},[r4,:128]!
  580. vcvt.s32.f32 q10, q10, #16
  581. vst1.32 {d6[0]}, [r8], ip
  582. vst1.32 {d6[1]}, [r8], ip
  583. vld1.64 {d22-d23},[r5,:128]!
  584. vcvt.s32.f32 q11, q11, #16
  585. vst1.32 {d7[0]}, [r8], ip
  586. vst1.32 {d7[1]}, [r8], ip
  587. bgt 6b
  588. 6: vst1.32 {d2[0]}, [r8], ip
  589. vst1.32 {d2[1]}, [r8], ip
  590. vst1.32 {d3[0]}, [r8], ip
  591. vst1.32 {d3[1]}, [r8], ip
  592. vst1.32 {d6[0]}, [r8], ip
  593. vst1.32 {d6[1]}, [r8], ip
  594. vst1.32 {d7[0]}, [r8], ip
  595. vst1.32 {d7[1]}, [r8], ip
  596. b 8f
  597. 7: vsri.32 d18, d16, #16
  598. vsri.32 d19, d17, #16
  599. vst1.32 {d18[0]}, [r8], ip
  600. vsri.32 d22, d20, #16
  601. vst1.32 {d18[1]}, [r8], ip
  602. vsri.32 d23, d21, #16
  603. vst1.32 {d19[0]}, [r8], ip
  604. vst1.32 {d19[1]}, [r8], ip
  605. vst1.32 {d22[0]}, [r8], ip
  606. vst1.32 {d22[1]}, [r8], ip
  607. vst1.32 {d23[0]}, [r8], ip
  608. vst1.32 {d23[1]}, [r8], ip
  609. 8: subs r3, r3, #2
  610. add r0, r0, #4
  611. popeq {r4-r8,pc}
  612. @ 1 channel
  613. 4: ldr r4, [r1],#4
  614. tst r2, #8
  615. mov lr, r2
  616. mov r5, r0
  617. vld1.64 {d0-d1}, [r4,:128]!
  618. vcvt.s32.f32 q0, q0, #16
  619. vld1.64 {d2-d3}, [r4,:128]!
  620. vcvt.s32.f32 q1, q1, #16
  621. bne 8f
  622. 6: subs lr, lr, #16
  623. vld1.64 {d4-d5}, [r4,:128]!
  624. vcvt.s32.f32 q2, q2, #16
  625. vld1.64 {d6-d7}, [r4,:128]!
  626. vcvt.s32.f32 q3, q3, #16
  627. vst1.16 {d0[1]}, [r5,:16], ip
  628. vst1.16 {d0[3]}, [r5,:16], ip
  629. vst1.16 {d1[1]}, [r5,:16], ip
  630. vst1.16 {d1[3]}, [r5,:16], ip
  631. vst1.16 {d2[1]}, [r5,:16], ip
  632. vst1.16 {d2[3]}, [r5,:16], ip
  633. vst1.16 {d3[1]}, [r5,:16], ip
  634. vst1.16 {d3[3]}, [r5,:16], ip
  635. beq 7f
  636. vld1.64 {d0-d1}, [r4,:128]!
  637. vcvt.s32.f32 q0, q0, #16
  638. vld1.64 {d2-d3}, [r4,:128]!
  639. vcvt.s32.f32 q1, q1, #16
  640. 7: vst1.16 {d4[1]}, [r5,:16], ip
  641. vst1.16 {d4[3]}, [r5,:16], ip
  642. vst1.16 {d5[1]}, [r5,:16], ip
  643. vst1.16 {d5[3]}, [r5,:16], ip
  644. vst1.16 {d6[1]}, [r5,:16], ip
  645. vst1.16 {d6[3]}, [r5,:16], ip
  646. vst1.16 {d7[1]}, [r5,:16], ip
  647. vst1.16 {d7[3]}, [r5,:16], ip
  648. bgt 6b
  649. pop {r4-r8,pc}
  650. 8: subs lr, lr, #8
  651. vst1.16 {d0[1]}, [r5,:16], ip
  652. vst1.16 {d0[3]}, [r5,:16], ip
  653. vst1.16 {d1[1]}, [r5,:16], ip
  654. vst1.16 {d1[3]}, [r5,:16], ip
  655. vst1.16 {d2[1]}, [r5,:16], ip
  656. vst1.16 {d2[3]}, [r5,:16], ip
  657. vst1.16 {d3[1]}, [r5,:16], ip
  658. vst1.16 {d3[3]}, [r5,:16], ip
  659. popeq {r4-r8,pc}
  660. vld1.64 {d0-d1}, [r4,:128]!
  661. vcvt.s32.f32 q0, q0, #16
  662. vld1.64 {d2-d3}, [r4,:128]!
  663. vcvt.s32.f32 q1, q1, #16
  664. b 6b
  665. .endfunc
  666. function ff_vector_fmul_neon, export=1
  667. mov r3, r0
  668. subs r2, r2, #8
  669. vld1.64 {d0-d3}, [r0,:128]!
  670. vld1.64 {d4-d7}, [r1,:128]!
  671. vmul.f32 q8, q0, q2
  672. vmul.f32 q9, q1, q3
  673. beq 3f
  674. bics ip, r2, #15
  675. beq 2f
  676. 1: subs ip, ip, #16
  677. vld1.64 {d0-d1}, [r0,:128]!
  678. vld1.64 {d4-d5}, [r1,:128]!
  679. vmul.f32 q10, q0, q2
  680. vld1.64 {d2-d3}, [r0,:128]!
  681. vld1.64 {d6-d7}, [r1,:128]!
  682. vmul.f32 q11, q1, q3
  683. vst1.64 {d16-d19},[r3,:128]!
  684. vld1.64 {d0-d1}, [r0,:128]!
  685. vld1.64 {d4-d5}, [r1,:128]!
  686. vmul.f32 q8, q0, q2
  687. vld1.64 {d2-d3}, [r0,:128]!
  688. vld1.64 {d6-d7}, [r1,:128]!
  689. vmul.f32 q9, q1, q3
  690. vst1.64 {d20-d23},[r3,:128]!
  691. bne 1b
  692. ands r2, r2, #15
  693. beq 3f
  694. 2: vld1.64 {d0-d1}, [r0,:128]!
  695. vld1.64 {d4-d5}, [r1,:128]!
  696. vst1.64 {d16-d17},[r3,:128]!
  697. vmul.f32 q8, q0, q2
  698. vld1.64 {d2-d3}, [r0,:128]!
  699. vld1.64 {d6-d7}, [r1,:128]!
  700. vst1.64 {d18-d19},[r3,:128]!
  701. vmul.f32 q9, q1, q3
  702. 3: vst1.64 {d16-d19},[r3,:128]!
  703. bx lr
  704. .endfunc
  705. function ff_vector_fmul_window_neon, export=1
  706. vld1.32 {d16[],d17[]}, [sp,:32]
  707. push {r4,r5,lr}
  708. ldr lr, [sp, #16]
  709. sub r2, r2, #8
  710. sub r5, lr, #2
  711. add r2, r2, r5, lsl #2
  712. add r4, r3, r5, lsl #3
  713. add ip, r0, r5, lsl #3
  714. mov r5, #-16
  715. vld1.64 {d0,d1}, [r1,:128]!
  716. vld1.64 {d2,d3}, [r2,:128], r5
  717. vld1.64 {d4,d5}, [r3,:128]!
  718. vld1.64 {d6,d7}, [r4,:128], r5
  719. 1: subs lr, lr, #4
  720. vmov q11, q8
  721. vmla.f32 d22, d0, d4
  722. vmov q10, q8
  723. vmla.f32 d23, d1, d5
  724. vrev64.32 q3, q3
  725. vmla.f32 d20, d0, d7
  726. vrev64.32 q1, q1
  727. vmla.f32 d21, d1, d6
  728. beq 2f
  729. vmla.f32 d22, d3, d7
  730. vld1.64 {d0,d1}, [r1,:128]!
  731. vmla.f32 d23, d2, d6
  732. vld1.64 {d18,d19},[r2,:128], r5
  733. vmls.f32 d20, d3, d4
  734. vld1.64 {d24,d25},[r3,:128]!
  735. vmls.f32 d21, d2, d5
  736. vld1.64 {d6,d7}, [r4,:128], r5
  737. vmov q1, q9
  738. vrev64.32 q11, q11
  739. vmov q2, q12
  740. vswp d22, d23
  741. vst1.64 {d20,d21},[r0,:128]!
  742. vst1.64 {d22,d23},[ip,:128], r5
  743. b 1b
  744. 2: vmla.f32 d22, d3, d7
  745. vmla.f32 d23, d2, d6
  746. vmls.f32 d20, d3, d4
  747. vmls.f32 d21, d2, d5
  748. vrev64.32 q11, q11
  749. vswp d22, d23
  750. vst1.64 {d20,d21},[r0,:128]!
  751. vst1.64 {d22,d23},[ip,:128], r5
  752. pop {r4,r5,pc}
  753. .endfunc