/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "asm.S"

        preserve8

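@ void ff_clear_block_neon(int16_t *block)
@ Zero one 8x8 block of 16-bit coefficients (eight 16-byte stores).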
function ff_clear_block_neon, export=1
        vmov.i16        q0,  #0
.rept   8
        vst1.16         {q0}, [r0,:128]!
.endr
        bx              lr
endfunc

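@ void ff_clear_blocks_neon(int16_t *blocks)
@ Zero six consecutive 8x8 blocks.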
function ff_clear_blocks_neon, export=1
        vmov.i16        q0,  #0
.rept   8*6
        vst1.16         {q0}, [r0,:128]!
.endr
        bx              lr
endfunc

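@ Block copy/average macros.  All of them take dst in r0, src in r1,
@ the line stride in r2 and the row count in r3, as in the C
@ put_pixels/avg_pixels functions.  The _x2/_y2/_xy2 variants
@ interpolate halfway between horizontally, vertically and diagonally
@ adjacent pixels; substituting vhadd.u8/vshrn.u16 for the default
@ rounding vrhadd.u8/vrshrn.u16 gives the no-rounding versions.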
.macro  pixels16        avg=0
.if \avg
        mov             ip,  r0
.endif
1:      vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d2, d3},  [r1], r2
        vld1.64         {d4, d5},  [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.64         {d6, d7},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
.if \avg
        vld1.64         {d16,d17}, [ip,:128], r2
        vrhadd.u8       q0,  q0,  q8
        vld1.64         {d18,d19}, [ip,:128], r2
        vrhadd.u8       q1,  q1,  q9
        vld1.64         {d20,d21}, [ip,:128], r2
        vrhadd.u8       q2,  q2,  q10
        vld1.64         {d22,d23}, [ip,:128], r2
        vrhadd.u8       q3,  q3,  q11
.endif
        subs            r3,  r3,  #4
        vst1.64         {d0, d1},  [r0,:128], r2
        vst1.64         {d2, d3},  [r0,:128], r2
        vst1.64         {d4, d5},  [r0,:128], r2
        vst1.64         {d6, d7},  [r0,:128], r2
        bne             1b
        bx              lr
.endm

.macro  pixels16_x2     vhadd=vrhadd.u8
1:      vld1.64         {d0-d2},   [r1], r2
        vld1.64         {d4-d6},   [r1], r2
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vext.8          q1,  q0,  q1,  #1
        \vhadd          q0,  q0,  q1
        vext.8          q3,  q2,  q3,  #1
        \vhadd          q2,  q2,  q3
        vst1.64         {d0, d1},  [r0,:128], r2
        vst1.64         {d4, d5},  [r0,:128], r2
        bne             1b
        bx              lr
.endm

.macro  pixels16_y2     vhadd=vrhadd.u8
        vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d2, d3},  [r1], r2
1:      subs            r3,  r3,  #2
        \vhadd          q2,  q0,  q1
        vld1.64         {d0, d1},  [r1], r2
        \vhadd          q3,  q0,  q1
        vld1.64         {d2, d3},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
        vst1.64         {d4, d5},  [r0,:128], r2
        vst1.64         {d6, d7},  [r0,:128], r2
        bne             1b
        bx              lr
.endm

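@ The xy2 variants average each 2x2 pixel square: horizontally
@ adjacent pixels are summed with vaddl.u8, two adjacent row sums are
@ added, and the 16-bit total is narrowed back with a shift by 2.
@ With no_rnd=1 a bias of 1 plus a truncating shift computes
@ (a+b+c+d+1)>>2 instead of the rounding (a+b+c+d+2)>>2.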
.macro  pixels16_xy2    vshrn=vrshrn.u16 no_rnd=0
        vld1.64         {d0-d2},   [r1], r2
        vld1.64         {d4-d6},   [r1], r2
.if \no_rnd
        vmov.i16        q13, #1
.endif
        pld             [r1]
        pld             [r1, r2]
        vext.8          q1,  q0,  q1,  #1
        vext.8          q3,  q2,  q3,  #1
        vaddl.u8        q8,  d0,  d2
        vaddl.u8        q10, d1,  d3
        vaddl.u8        q9,  d4,  d6
        vaddl.u8        q11, d5,  d7
1:      subs            r3,  r3,  #2
        vld1.64         {d0-d2},   [r1], r2
        vadd.u16        q12, q8,  q9
        pld             [r1]
.if \no_rnd
        vadd.u16        q12, q12, q13
.endif
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1,  q10, q11
        \vshrn          d28, q12, #2
.if \no_rnd
        vadd.u16        q1,  q1,  q13
.endif
        \vshrn          d29, q1,  #2
        vaddl.u8        q8,  d0,  d30
        vld1.64         {d2-d4},   [r1], r2
        vaddl.u8        q10, d1,  d31
        vst1.64         {d28,d29}, [r0,:128], r2
        vadd.u16        q12, q8,  q9
        pld             [r1, r2]
.if \no_rnd
        vadd.u16        q12, q12, q13
.endif
        vext.8          q2,  q1,  q2,  #1
        vadd.u16        q0,  q10, q11
        \vshrn          d30, q12, #2
.if \no_rnd
        vadd.u16        q0,  q0,  q13
.endif
        \vshrn          d31, q0,  #2
        vaddl.u8        q9,  d2,  d4
        vaddl.u8        q11, d3,  d5
        vst1.64         {d30,d31}, [r0,:128], r2
        bgt             1b
        bx              lr
.endm

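@ 8-pixel-wide versions of the macros above, working on d registers
@ instead of q registers.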
.macro  pixels8         avg=0
1:      vld1.64         {d0}, [r1], r2
        vld1.64         {d1}, [r1], r2
        vld1.64         {d2}, [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.64         {d3}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
.if \avg
        vld1.64         {d4}, [r0,:64], r2
        vrhadd.u8       d0,  d0,  d4
        vld1.64         {d5}, [r0,:64], r2
        vrhadd.u8       d1,  d1,  d5
        vld1.64         {d6}, [r0,:64], r2
        vrhadd.u8       d2,  d2,  d6
        vld1.64         {d7}, [r0,:64], r2
        vrhadd.u8       d3,  d3,  d7
        sub             r0,  r0,  r2,  lsl #2
.endif
        subs            r3,  r3,  #4
        vst1.64         {d0}, [r0,:64], r2
        vst1.64         {d1}, [r0,:64], r2
        vst1.64         {d2}, [r0,:64], r2
        vst1.64         {d3}, [r0,:64], r2
        bne             1b
        bx              lr
.endm

.macro  pixels8_x2      vhadd=vrhadd.u8
1:      vld1.64         {d0, d1},  [r1], r2
        vext.8          d1,  d0,  d1,  #1
        vld1.64         {d2, d3},  [r1], r2
        vext.8          d3,  d2,  d3,  #1
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vswp            d1,  d2
        \vhadd          q0,  q0,  q1
        vst1.64         {d0}, [r0,:64], r2
        vst1.64         {d1}, [r0,:64], r2
        bne             1b
        bx              lr
.endm

.macro  pixels8_y2      vhadd=vrhadd.u8
        vld1.64         {d0}, [r1], r2
        vld1.64         {d1}, [r1], r2
1:      subs            r3,  r3,  #2
        \vhadd          d4,  d0,  d1
        vld1.64         {d0}, [r1], r2
        \vhadd          d5,  d0,  d1
        vld1.64         {d1}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
        vst1.64         {d4}, [r0,:64], r2
        vst1.64         {d5}, [r0,:64], r2
        bne             1b
        bx              lr
.endm

.macro  pixels8_xy2     vshrn=vrshrn.u16 no_rnd=0
        vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d2, d3},  [r1], r2
.if \no_rnd
        vmov.i16        q11, #1
.endif
        pld             [r1]
        pld             [r1, r2]
        vext.8          d4,  d0,  d1,  #1
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q8,  d0,  d4
        vaddl.u8        q9,  d2,  d6
1:      subs            r3,  r3,  #2
        vld1.64         {d0, d1},  [r1], r2
        pld             [r1]
        vadd.u16        q10, q8,  q9
        vext.8          d4,  d0,  d1,  #1
.if \no_rnd
        vadd.u16        q10, q10, q11
.endif
        vaddl.u8        q8,  d0,  d4
        \vshrn          d5,  q10, #2
        vld1.64         {d2, d3},  [r1], r2
        vadd.u16        q10, q8,  q9
        pld             [r1, r2]
.if \no_rnd
        vadd.u16        q10, q10, q11
.endif
        vst1.64         {d5}, [r0,:64], r2
        \vshrn          d7,  q10, #2
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q9,  d2,  d6
        vst1.64         {d7}, [r0,:64], r2
        bgt             1b
        bx              lr
.endm

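@ pixfunc wraps one of the macros above in an exported function;
@ pixfunc2 emits both the rounding variant and, with the extra
@ arguments, the _no_rnd variant.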
.macro  pixfunc         pfx name suf rnd_op args:vararg
function ff_\pfx\name\suf\()_neon, export=1
        \name           \rnd_op \args
endfunc
.endm

.macro  pixfunc2        pfx name args:vararg
        pixfunc         \pfx \name
        pixfunc         \pfx \name \args
.endm

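@ The h264 qpel mc00 cases are plain copies: each stub below only
@ loads the block height into r3 and falls through into the
@ pixfunc-generated function that follows it.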
function ff_put_h264_qpel16_mc00_neon, export=1
        mov             r3,  #16
endfunc

        pixfunc         put_ pixels16
        pixfunc2        put_ pixels16_x2,  _no_rnd, vhadd.u8
        pixfunc2        put_ pixels16_y2,  _no_rnd, vhadd.u8
        pixfunc2        put_ pixels16_xy2, _no_rnd, vshrn.u16, 1

function ff_avg_h264_qpel16_mc00_neon, export=1
        mov             r3,  #16
endfunc

        pixfunc         avg_ pixels16,, 1

function ff_put_h264_qpel8_mc00_neon, export=1
        mov             r3,  #8
endfunc

        pixfunc         put_ pixels8
        pixfunc2        put_ pixels8_x2,   _no_rnd, vhadd.u8
        pixfunc2        put_ pixels8_y2,   _no_rnd, vhadd.u8
        pixfunc2        put_ pixels8_xy2,  _no_rnd, vshrn.u16, 1

function ff_avg_h264_qpel8_mc00_neon, export=1
        mov             r3,  #8
endfunc

        pixfunc         avg_ pixels8,, 1

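@ void ff_put_pixels_clamped_neon(const int16_t *block, uint8_t *pixels, int line_size)
@ Narrow an 8x8 block of 16-bit coefficients to unsigned bytes with
@ saturation (vqmovun.s16) and store them one 8-byte row at a time.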
function ff_put_pixels_clamped_neon, export=1
        vld1.64         {d16-d19}, [r0,:128]!
        vqmovun.s16     d0,  q8
        vld1.64         {d20-d23}, [r0,:128]!
        vqmovun.s16     d1,  q9
        vld1.64         {d24-d27}, [r0,:128]!
        vqmovun.s16     d2,  q10
        vld1.64         {d28-d31}, [r0,:128]!
        vqmovun.s16     d3,  q11
        vst1.64         {d0}, [r1,:64], r2
        vqmovun.s16     d4,  q12
        vst1.64         {d1}, [r1,:64], r2
        vqmovun.s16     d5,  q13
        vst1.64         {d2}, [r1,:64], r2
        vqmovun.s16     d6,  q14
        vst1.64         {d3}, [r1,:64], r2
        vqmovun.s16     d7,  q15
        vst1.64         {d4}, [r1,:64], r2
        vst1.64         {d5}, [r1,:64], r2
        vst1.64         {d6}, [r1,:64], r2
        vst1.64         {d7}, [r1,:64], r2
        bx              lr
endfunc

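@ Same idea with signed saturation (vqmovn.s16) plus a bias of 128,
@ mapping signed results into the 0..255 pixel range.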
function ff_put_signed_pixels_clamped_neon, export=1
        vmov.u8         d31, #128
        vld1.64         {d16-d17}, [r0,:128]!
        vqmovn.s16      d0,  q8
        vld1.64         {d18-d19}, [r0,:128]!
        vqmovn.s16      d1,  q9
        vld1.64         {d16-d17}, [r0,:128]!
        vqmovn.s16      d2,  q8
        vld1.64         {d18-d19}, [r0,:128]!
        vadd.u8         d0,  d0,  d31
        vld1.64         {d20-d21}, [r0,:128]!
        vadd.u8         d1,  d1,  d31
        vld1.64         {d22-d23}, [r0,:128]!
        vadd.u8         d2,  d2,  d31
        vst1.64         {d0}, [r1,:64], r2
        vqmovn.s16      d3,  q9
        vst1.64         {d1}, [r1,:64], r2
        vqmovn.s16      d4,  q10
        vst1.64         {d2}, [r1,:64], r2
        vqmovn.s16      d5,  q11
        vld1.64         {d24-d25}, [r0,:128]!
        vadd.u8         d3,  d3,  d31
        vld1.64         {d26-d27}, [r0,:128]!
        vadd.u8         d4,  d4,  d31
        vadd.u8         d5,  d5,  d31
        vst1.64         {d3}, [r1,:64], r2
        vqmovn.s16      d6,  q12
        vst1.64         {d4}, [r1,:64], r2
        vqmovn.s16      d7,  q13
        vst1.64         {d5}, [r1,:64], r2
        vadd.u8         d6,  d6,  d31
        vadd.u8         d7,  d7,  d31
        vst1.64         {d6}, [r1,:64], r2
        vst1.64         {d7}, [r1,:64], r2
        bx              lr
endfunc

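@ void ff_add_pixels_clamped_neon(const int16_t *block, uint8_t *pixels, int line_size)
@ Add an 8x8 coefficient block to the existing pixels: vaddw.u8
@ widens each row of pixels to 16 bits and the saturated 8-bit sums
@ are written back through a second pointer (r3).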
function ff_add_pixels_clamped_neon, export=1
        mov             r3,  r1
        vld1.64         {d16},  [r1,:64], r2
        vld1.64         {d0-d1},[r0,:128]!
        vaddw.u8        q0,  q0,  d16
        vld1.64         {d17},  [r1,:64], r2
        vld1.64         {d2-d3},[r0,:128]!
        vqmovun.s16     d0,  q0
        vld1.64         {d18},  [r1,:64], r2
        vaddw.u8        q1,  q1,  d17
        vld1.64         {d4-d5},[r0,:128]!
        vaddw.u8        q2,  q2,  d18
        vst1.64         {d0},   [r3,:64], r2
        vqmovun.s16     d2,  q1
        vld1.64         {d19},  [r1,:64], r2
        vld1.64         {d6-d7},[r0,:128]!
        vaddw.u8        q3,  q3,  d19
        vqmovun.s16     d4,  q2
        vst1.64         {d2},   [r3,:64], r2
        vld1.64         {d16},  [r1,:64], r2
        vqmovun.s16     d6,  q3
        vld1.64         {d0-d1},[r0,:128]!
        vaddw.u8        q0,  q0,  d16
        vst1.64         {d4},   [r3,:64], r2
        vld1.64         {d17},  [r1,:64], r2
        vld1.64         {d2-d3},[r0,:128]!
        vaddw.u8        q1,  q1,  d17
        vst1.64         {d6},   [r3,:64], r2
        vqmovun.s16     d0,  q0
        vld1.64         {d18},  [r1,:64], r2
        vld1.64         {d4-d5},[r0,:128]!
        vaddw.u8        q2,  q2,  d18
        vst1.64         {d0},   [r3,:64], r2
        vqmovun.s16     d2,  q1
        vld1.64         {d19},  [r1,:64], r2
        vqmovun.s16     d4,  q2
        vld1.64         {d6-d7},[r0,:128]!
        vaddw.u8        q3,  q3,  d19
        vst1.64         {d2},   [r3,:64], r2
        vqmovun.s16     d6,  q3
        vst1.64         {d4},   [r3,:64], r2
        vst1.64         {d6},   [r3,:64], r2
        bx              lr
endfunc

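@ Element-wise single-precision multiply: dst (r0) = src0 (r1) *
@ src1 (r2), length in r3 (a multiple of 8).  The main loop handles
@ 16 floats per iteration, with a shorter path for a remainder of 8.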
function ff_vector_fmul_neon, export=1
        subs            r3,  r3,  #8
        vld1.64         {d0-d3},  [r1,:128]!
        vld1.64         {d4-d7},  [r2,:128]!
        vmul.f32        q8,  q0,  q2
        vmul.f32        q9,  q1,  q3
        beq             3f
        bics            ip,  r3,  #15
        beq             2f
1:      subs            ip,  ip,  #16
        vld1.64         {d0-d1},  [r1,:128]!
        vld1.64         {d4-d5},  [r2,:128]!
        vmul.f32        q10, q0,  q2
        vld1.64         {d2-d3},  [r1,:128]!
        vld1.64         {d6-d7},  [r2,:128]!
        vmul.f32        q11, q1,  q3
        vst1.64         {d16-d19},[r0,:128]!
        vld1.64         {d0-d1},  [r1,:128]!
        vld1.64         {d4-d5},  [r2,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.64         {d2-d3},  [r1,:128]!
        vld1.64         {d6-d7},  [r2,:128]!
        vmul.f32        q9,  q1,  q3
        vst1.64         {d20-d23},[r0,:128]!
        bne             1b
        ands            r3,  r3,  #15
        beq             3f
2:      vld1.64         {d0-d1},  [r1,:128]!
        vld1.64         {d4-d5},  [r2,:128]!
        vst1.64         {d16-d17},[r0,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.64         {d2-d3},  [r1,:128]!
        vld1.64         {d6-d7},  [r2,:128]!
        vst1.64         {d18-d19},[r0,:128]!
        vmul.f32        q9,  q1,  q3
3:      vst1.64         {d16-d19},[r0,:128]!
        bx              lr
endfunc

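@ void ff_vector_fmul_window_neon(float *dst, const float *src0,
@                                 const float *src1, const float *win, int len)
@ Windowing as used for MDCT overlap-add: src0 and the first half of
@ the window are read forwards, src1 and the second half of the
@ window backwards (negative stride in r5); the two halves of dst are
@ written simultaneously, forwards through r0 and backwards through ip.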
function ff_vector_fmul_window_neon, export=1
        push            {r4,r5,lr}
        ldr             lr,  [sp, #12]
        sub             r2,  r2,  #8
        sub             r5,  lr,  #2
        add             r2,  r2,  r5,  lsl #2
        add             r4,  r3,  r5,  lsl #3
        add             ip,  r0,  r5,  lsl #3
        mov             r5,  #-16
        vld1.64         {d0,d1},  [r1,:128]!
        vld1.64         {d2,d3},  [r2,:128], r5
        vld1.64         {d4,d5},  [r3,:128]!
        vld1.64         {d6,d7},  [r4,:128], r5
1:      subs            lr,  lr,  #4
        vmul.f32        d22, d0,  d4
        vrev64.32       q3,  q3
        vmul.f32        d23, d1,  d5
        vrev64.32       q1,  q1
        vmul.f32        d20, d0,  d7
        vmul.f32        d21, d1,  d6
        beq             2f
        vmla.f32        d22, d3,  d7
        vld1.64         {d0,d1},  [r1,:128]!
        vmla.f32        d23, d2,  d6
        vld1.64         {d18,d19},[r2,:128], r5
        vmls.f32        d20, d3,  d4
        vld1.64         {d24,d25},[r3,:128]!
        vmls.f32        d21, d2,  d5
        vld1.64         {d6,d7},  [r4,:128], r5
        vmov            q1,  q9
        vrev64.32       q11, q11
        vmov            q2,  q12
        vswp            d22, d23
        vst1.64         {d20,d21},[r0,:128]!
        vst1.64         {d22,d23},[ip,:128], r5
        b               1b
2:      vmla.f32        d22, d3,  d7
        vmla.f32        d23, d2,  d6
        vmls.f32        d20, d3,  d4
        vmls.f32        d21, d2,  d5
        vrev64.32       q11, q11
        vswp            d22, d23
        vst1.64         {d20,d21},[r0,:128]!
        vst1.64         {d22,d23},[ip,:128], r5
        pop             {r4,r5,pc}
endfunc

#if CONFIG_VORBIS_DECODER
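@ ff_vorbis_inverse_coupling_neon(mag=r0, ang=r1, blocksize=r2)
@ Vorbis channel decoupling: depending on the signs of the values,
@ the angle is added to or subtracted from the magnitude.  The sign
@ logic is done branch-free with compare masks (vcle/vand/vbic/veor).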
function ff_vorbis_inverse_coupling_neon, export=1
        vmov.i32        q10, #1<<31
        subs            r2,  r2,  #4
        mov             r3,  r0
        mov             r12, r1
        beq             3f
        vld1.32         {d24-d25},[r1,:128]!
        vld1.32         {d22-d23},[r0,:128]!
        vcle.s32        q8,  q12, #0
        vand            q9,  q11, q10
        veor            q12, q12, q9
        vand            q2,  q12, q8
        vbic            q3,  q12, q8
        vadd.f32        q12, q11, q2
        vsub.f32        q11, q11, q3
1:      vld1.32         {d2-d3},  [r1,:128]!
        vld1.32         {d0-d1},  [r0,:128]!
        vcle.s32        q8,  q1,  #0
        vand            q9,  q0,  q10
        veor            q1,  q1,  q9
        vst1.32         {d24-d25},[r3, :128]!
        vst1.32         {d22-d23},[r12,:128]!
        vand            q2,  q1,  q8
        vbic            q3,  q1,  q8
        vadd.f32        q1,  q0,  q2
        vsub.f32        q0,  q0,  q3
        subs            r2,  r2,  #8
        ble             2f
        vld1.32         {d24-d25},[r1,:128]!
        vld1.32         {d22-d23},[r0,:128]!
        vcle.s32        q8,  q12, #0
        vand            q9,  q11, q10
        veor            q12, q12, q9
        vst1.32         {d2-d3},  [r3, :128]!
        vst1.32         {d0-d1},  [r12,:128]!
        vand            q2,  q12, q8
        vbic            q3,  q12, q8
        vadd.f32        q12, q11, q2
        vsub.f32        q11, q11, q3
        b               1b
2:      vst1.32         {d2-d3},  [r3, :128]!
        vst1.32         {d0-d1},  [r12,:128]!
        it              lt
        bxlt            lr
3:      vld1.32         {d2-d3},  [r1,:128]
        vld1.32         {d0-d1},  [r0,:128]
        vcle.s32        q8,  q1,  #0
        vand            q9,  q0,  q10
        veor            q1,  q1,  q9
        vand            q2,  q1,  q8
        vbic            q3,  q1,  q8
        vadd.f32        q1,  q0,  q2
        vsub.f32        q0,  q0,  q3
        vst1.32         {d2-d3},  [r0,:128]!
        vst1.32         {d0-d1},  [r1,:128]!
        bx              lr
endfunc
#endif

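@ void ff_vector_fmul_scalar_neon(float *dst, const float *src, float mul, int len)
@ dst[i] = src[i] * mul.  The VFP/NOVFP prefixes assemble a line only
@ for the hard-float or soft-float ABI respectively: with hard-float
@ the scalar arrives in s0 and len in r2, with soft-float the scalar
@ is in r2 and len in r3.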
function ff_vector_fmul_scalar_neon, export=1
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32         q8,  d0[0]
NOVFP   vdup.32         q8,  r2
        bics            r12, len, #15
        beq             3f
        vld1.32         {q0},[r1,:128]!
        vld1.32         {q1},[r1,:128]!
1:      vmul.f32        q0,  q0,  q8
        vld1.32         {q2},[r1,:128]!
        vmul.f32        q1,  q1,  q8
        vld1.32         {q3},[r1,:128]!
        vmul.f32        q2,  q2,  q8
        vst1.32         {q0},[r0,:128]!
        vmul.f32        q3,  q3,  q8
        vst1.32         {q1},[r0,:128]!
        subs            r12, r12, #16
        beq             2f
        vld1.32         {q0},[r1,:128]!
        vst1.32         {q2},[r0,:128]!
        vld1.32         {q1},[r1,:128]!
        vst1.32         {q3},[r0,:128]!
        b               1b
2:      vst1.32         {q2},[r0,:128]!
        vst1.32         {q3},[r0,:128]!
        ands            len, len, #15
        it              eq
        bxeq            lr
3:      vld1.32         {q0},[r1,:128]!
        vmul.f32        q0,  q0,  q8
        vst1.32         {q0},[r0,:128]!
        subs            len, len, #4
        bgt             3b
        bx              lr
        .unreq          len
endfunc

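@ void ff_vector_fmac_scalar_neon(float *dst, const float *src, float mul, int len)
@ dst[i] += src[i] * mul, with the same ABI handling as above; a
@ second pointer (acc) reads dst ahead of the stores.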
function ff_vector_fmac_scalar_neon, export=1
VFP     len .req r2
VFP     acc .req r3
NOVFP   len .req r3
NOVFP   acc .req r2
VFP     vdup.32         q15, d0[0]
NOVFP   vdup.32         q15, r2
        bics            r12, len, #15
        mov             acc, r0
        beq             3f
        vld1.32         {q0},  [r1,:128]!
        vld1.32         {q8},  [acc,:128]!
        vld1.32         {q1},  [r1,:128]!
        vld1.32         {q9},  [acc,:128]!
1:      vmla.f32        q8,  q0,  q15
        vld1.32         {q2},  [r1,:128]!
        vld1.32         {q10}, [acc,:128]!
        vmla.f32        q9,  q1,  q15
        vld1.32         {q3},  [r1,:128]!
        vld1.32         {q11}, [acc,:128]!
        vmla.f32        q10, q2,  q15
        vst1.32         {q8},  [r0,:128]!
        vmla.f32        q11, q3,  q15
        vst1.32         {q9},  [r0,:128]!
        subs            r12, r12, #16
        beq             2f
        vld1.32         {q0},  [r1,:128]!
        vld1.32         {q8},  [acc,:128]!
        vst1.32         {q10}, [r0,:128]!
        vld1.32         {q1},  [r1,:128]!
        vld1.32         {q9},  [acc,:128]!
        vst1.32         {q11}, [r0,:128]!
        b               1b
2:      vst1.32         {q10}, [r0,:128]!
        vst1.32         {q11}, [r0,:128]!
        ands            len, len, #15
        it              eq
        bxeq            lr
3:      vld1.32         {q0},  [r1,:128]!
        vld1.32         {q8},  [acc,:128]!
        vmla.f32        q8,  q0,  q15
        vst1.32         {q8},  [r0,:128]!
        subs            len, len, #4
        bgt             3b
        bx              lr
        .unreq          len
endfunc

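@ void ff_butterflies_float_neon(float *v1, float *v2, int len)
@ In-place butterfly, four floats per iteration:
@ v1[i] += v2[i] while v2[i] becomes the old v1[i] - v2[i].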
function ff_butterflies_float_neon, export=1
1:      vld1.32         {q0},[r0,:128]
        vld1.32         {q1},[r1,:128]
        vsub.f32        q2,  q0,  q1
        vadd.f32        q1,  q0,  q1
        vst1.32         {q2},[r1,:128]!
        vst1.32         {q1},[r0,:128]!
        subs            r2,  r2,  #4
        bgt             1b
        bx              lr
endfunc

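@ float ff_scalarproduct_float_neon(const float *v1, const float *v2, int len)
@ Dot product: four partial sums accumulate in q2 and are reduced
@ with vadd.f32/vpadd.f32; with a soft-float ABI the scalar result
@ is moved to r0.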
function ff_scalarproduct_float_neon, export=1
        vmov.f32        q2,  #0.0
1:      vld1.32         {q0},[r0,:128]!
        vld1.32         {q1},[r1,:128]!
        vmla.f32        q2,  q0,  q1
        subs            r2,  r2,  #4
        bgt             1b
        vadd.f32        d0,  d4,  d5
        vpadd.f32       d0,  d0,  d0
NOVFP   vmov.32         r0,  d0[0]
        bx              lr
endfunc

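@ void ff_vector_fmul_reverse_neon(float *dst, const float *src0,
@                                  const float *src1, int len)
@ dst[i] = src0[i] * src1[len-1-i]: src1 is read backwards 32 bytes
@ at a time and reversed with vrev64.32 plus crossed d-register
@ operands.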
function ff_vector_fmul_reverse_neon, export=1
        add             r2,  r2,  r3,  lsl #2
        sub             r2,  r2,  #32
        mov             r12, #-32
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q2-q3},  [r2,:128], r12
1:      pld             [r1, #32]
        vrev64.32       q3,  q3
        vmul.f32        d16, d0,  d7
        vmul.f32        d17, d1,  d6
        pld             [r2, #-32]
        vrev64.32       q2,  q2
        vmul.f32        d18, d2,  d5
        vmul.f32        d19, d3,  d4
        subs            r3,  r3,  #8
        beq             2f
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q2-q3},  [r2,:128], r12
        vst1.32         {q8-q9},  [r0,:128]!
        b               1b
2:      vst1.32         {q8-q9},  [r0,:128]!
        bx              lr
endfunc

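@ void ff_vector_fmul_add_neon(float *dst, const float *src0,
@                              const float *src1, const float *src2, int len)
@ dst[i] = src0[i] * src1[i] + src2[i]; len is passed on the stack.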
function ff_vector_fmul_add_neon, export=1
        ldr             r12, [sp]
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q8-q9},  [r2,:128]!
        vld1.32         {q2-q3},  [r3,:128]!
        vmul.f32        q10, q0,  q8
        vmul.f32        q11, q1,  q9
1:      vadd.f32        q12, q2,  q10
        vadd.f32        q13, q3,  q11
        pld             [r1, #16]
        pld             [r2, #16]
        pld             [r3, #16]
        subs            r12, r12, #8
        beq             2f
        vld1.32         {q0}, [r1,:128]!
        vld1.32         {q8}, [r2,:128]!
        vmul.f32        q10, q0,  q8
        vld1.32         {q1}, [r1,:128]!
        vld1.32         {q9}, [r2,:128]!
        vmul.f32        q11, q1,  q9
        vld1.32         {q2-q3},  [r3,:128]!
        vst1.32         {q12-q13},[r0,:128]!
        b               1b
2:      vst1.32         {q12-q13},[r0,:128]!
        bx              lr
endfunc

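@ void ff_vector_clipf_neon(float *dst, const float *src, float min, float max, int len)
@ Clamp each float into [min, max] with vmin/vmax; the VFP/NOVFP
@ lines again pick the argument registers for each float ABI.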
function ff_vector_clipf_neon, export=1
VFP     vdup.32         q1,  d0[1]
VFP     vdup.32         q0,  d0[0]
NOVFP   vdup.32         q0,  r2
NOVFP   vdup.32         q1,  r3
NOVFP   ldr             r2,  [sp]
        vld1.f32        {q2},[r1,:128]!
        vmin.f32        q10, q2,  q1
        vld1.f32        {q3},[r1,:128]!
        vmin.f32        q11, q3,  q1
1:      vmax.f32        q8,  q10, q0
        vmax.f32        q9,  q11, q0
        subs            r2,  r2,  #8
        beq             2f
        vld1.f32        {q2},[r1,:128]!
        vmin.f32        q10, q2,  q1
        vld1.f32        {q3},[r1,:128]!
        vmin.f32        q11, q3,  q1
        vst1.f32        {q8},[r0,:128]!
        vst1.f32        {q9},[r0,:128]!
        b               1b
2:      vst1.f32        {q8},[r0,:128]!
        vst1.f32        {q9},[r0,:128]!
        bx              lr
endfunc

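@ void ff_apply_window_int16_neon(int16_t *output, const int16_t *input,
@                                 const int16_t *window, unsigned n)
@ Apply a symmetric 16-bit window with saturating rounding doubling
@ multiplies (vqrdmulh.s16); r4/lr walk backwards from the end to
@ handle the mirrored second half in the same pass.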
function ff_apply_window_int16_neon, export=1
        push            {r4,lr}
        add             r4,  r1,  r3,  lsl #1
        add             lr,  r0,  r3,  lsl #1
        sub             r4,  r4,  #16
        sub             lr,  lr,  #16
        mov             r12, #-16
1:
        vld1.16         {q0}, [r1,:128]!
        vld1.16         {q2}, [r2,:128]!
        vld1.16         {q1}, [r4,:128], r12
        vrev64.16       q3,  q2
        vqrdmulh.s16    q0,  q0,  q2
        vqrdmulh.s16    d2,  d2,  d7
        vqrdmulh.s16    d3,  d3,  d6
        vst1.16         {q0}, [r0,:128]!
        vst1.16         {q1}, [lr,:128], r12
        subs            r3,  r3,  #16
        bgt             1b
        pop             {r4,pc}
endfunc

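@ void ff_vector_clip_int32_neon(int32_t *dst, const int32_t *src,
@                                int32_t min, int32_t max, unsigned len)
@ Clamp 32-bit integers into [min, max], eight per iteration; len is
@ passed on the stack.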
function ff_vector_clip_int32_neon, export=1
        vdup.32         q0,  r2
        vdup.32         q1,  r3
        ldr             r2,  [sp]
1:
        vld1.32         {q2-q3},  [r1,:128]!
        vmin.s32        q2,  q2,  q1
        vmin.s32        q3,  q3,  q1
        vmax.s32        q2,  q2,  q0
        vmax.s32        q3,  q3,  q0
        vst1.32         {q2-q3},  [r0,:128]!
        subs            r2,  r2,  #8
        bgt             1b
        bx              lr
endfunc