/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "asm.S"

        preserve8
        .text
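
@ Clear a single 128-byte block (an 8x8 array of 16-bit coefficients).
@ r0 = block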
function ff_clear_block_neon, export=1
        vmov.i16        q0,  #0
.rept 8
        vst1.16         {q0}, [r0,:128]!
.endr
        bx              lr
endfunc
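
@ Clear six consecutive 128-byte coefficient blocks.
@ r0 = blocks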
function ff_clear_blocks_neon, export=1
        vmov.i16        q0,  #0
.rept 8*6
        vst1.16         {q0}, [r0,:128]!
.endr
        bx              lr
endfunc
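
@ Copy a 16-pixel-wide block, four rows per iteration; with avg=1,
@ average the source into the existing destination instead.
@ r0 = dst (16-byte aligned), r1 = src, r2 = line size, r3 = height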
.macro  pixels16 avg=0
.if \avg
        mov             ip,  r0
.endif
1:      vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d2, d3},  [r1], r2
        vld1.64         {d4, d5},  [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.64         {d6, d7},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
.if \avg
        vld1.64         {d16,d17}, [ip,:128], r2
        vrhadd.u8       q0,  q0,  q8
        vld1.64         {d18,d19}, [ip,:128], r2
        vrhadd.u8       q1,  q1,  q9
        vld1.64         {d20,d21}, [ip,:128], r2
        vrhadd.u8       q2,  q2,  q10
        vld1.64         {d22,d23}, [ip,:128], r2
        vrhadd.u8       q3,  q3,  q11
.endif
        subs            r3,  r3,  #4
        vst1.64         {d0, d1},  [r0,:128], r2
        vst1.64         {d2, d3},  [r0,:128], r2
        vst1.64         {d4, d5},  [r0,:128], r2
        vst1.64         {d6, d7},  [r0,:128], r2
        bne             1b
        bx              lr
.endm
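
@ Horizontal half-pel interpolation, 16 pixels wide: average each pixel
@ with its right-hand neighbour.  \vhadd selects rounding (vrhadd.u8) or
@ truncating (vhadd.u8) averaging for the _no_rnd variants.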
.macro  pixels16_x2 vhadd=vrhadd.u8
1:      vld1.64         {d0-d2},   [r1], r2
        vld1.64         {d4-d6},   [r1], r2
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vext.8          q1,  q0,  q1,  #1
        \vhadd          q0,  q0,  q1
        vext.8          q3,  q2,  q3,  #1
        \vhadd          q2,  q2,  q3
        vst1.64         {d0, d1},  [r0,:128], r2
        vst1.64         {d4, d5},  [r0,:128], r2
        bne             1b
        bx              lr
.endm
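
@ Vertical half-pel interpolation, 16 pixels wide: average each row with
@ the row below it.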
.macro  pixels16_y2 vhadd=vrhadd.u8
        vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d2, d3},  [r1], r2
1:      subs            r3,  r3,  #2
        \vhadd          q2,  q0,  q1
        vld1.64         {d0, d1},  [r1], r2
        \vhadd          q3,  q0,  q1
        vld1.64         {d2, d3},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
        vst1.64         {d4, d5},  [r0,:128], r2
        vst1.64         {d6, d7},  [r0,:128], r2
        bne             1b
        bx              lr
.endm
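
@ Diagonal half-pel interpolation, 16 pixels wide: each output pixel is
@ (a + b + c + d + 2) >> 2 over a 2x2 neighbourhood.  The no_rnd variant
@ adds a bias of 1 instead and narrows with a truncating shift, giving
@ (a + b + c + d + 1) >> 2.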
.macro  pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0
        vld1.64         {d0-d2},   [r1], r2
        vld1.64         {d4-d6},   [r1], r2
.if \no_rnd
        vmov.i16        q13, #1
.endif
        pld             [r1]
        pld             [r1, r2]
        vext.8          q1,  q0,  q1,  #1
        vext.8          q3,  q2,  q3,  #1
        vaddl.u8        q8,  d0,  d2
        vaddl.u8        q10, d1,  d3
        vaddl.u8        q9,  d4,  d6
        vaddl.u8        q11, d5,  d7
1:      subs            r3,  r3,  #2
        vld1.64         {d0-d2},   [r1], r2
        vadd.u16        q12, q8,  q9
        pld             [r1]
.if \no_rnd
        vadd.u16        q12, q12, q13
.endif
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1,  q10, q11
        \vshrn          d28, q12, #2
.if \no_rnd
        vadd.u16        q1,  q1,  q13
.endif
        \vshrn          d29, q1,  #2
        vaddl.u8        q8,  d0,  d30
        vld1.64         {d2-d4},   [r1], r2
        vaddl.u8        q10, d1,  d31
        vst1.64         {d28,d29}, [r0,:128], r2
        vadd.u16        q12, q8,  q9
        pld             [r1, r2]
.if \no_rnd
        vadd.u16        q12, q12, q13
.endif
        vext.8          q2,  q1,  q2,  #1
        vadd.u16        q0,  q10, q11
        \vshrn          d30, q12, #2
.if \no_rnd
        vadd.u16        q0,  q0,  q13
.endif
        \vshrn          d31, q0,  #2
        vaddl.u8        q9,  d2,  d4
        vaddl.u8        q11, d3,  d5
        vst1.64         {d30,d31}, [r0,:128], r2
        bgt             1b
        bx              lr
.endm
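
@ 8-pixel-wide version of pixels16: copy, or average into dst with avg=1.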
.macro  pixels8 avg=0
1:      vld1.64         {d0}, [r1], r2
        vld1.64         {d1}, [r1], r2
        vld1.64         {d2}, [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.64         {d3}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
.if \avg
        vld1.64         {d4}, [r0,:64], r2
        vrhadd.u8       d0,  d0,  d4
        vld1.64         {d5}, [r0,:64], r2
        vrhadd.u8       d1,  d1,  d5
        vld1.64         {d6}, [r0,:64], r2
        vrhadd.u8       d2,  d2,  d6
        vld1.64         {d7}, [r0,:64], r2
        vrhadd.u8       d3,  d3,  d7
        sub             r0,  r0,  r2,  lsl #2
.endif
        subs            r3,  r3,  #4
        vst1.64         {d0}, [r0,:64], r2
        vst1.64         {d1}, [r0,:64], r2
        vst1.64         {d2}, [r0,:64], r2
        vst1.64         {d3}, [r0,:64], r2
        bne             1b
        bx              lr
.endm
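
@ Horizontal half-pel interpolation, 8 pixels wide.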
.macro  pixels8_x2 vhadd=vrhadd.u8
1:      vld1.64         {d0, d1},  [r1], r2
        vext.8          d1,  d0,  d1,  #1
        vld1.64         {d2, d3},  [r1], r2
        vext.8          d3,  d2,  d3,  #1
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vswp            d1,  d2
        \vhadd          q0,  q0,  q1
        vst1.64         {d0}, [r0,:64], r2
        vst1.64         {d1}, [r0,:64], r2
        bne             1b
        bx              lr
.endm
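
@ Vertical half-pel interpolation, 8 pixels wide.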
.macro  pixels8_y2 vhadd=vrhadd.u8
        vld1.64         {d0}, [r1], r2
        vld1.64         {d1}, [r1], r2
1:      subs            r3,  r3,  #2
        \vhadd          d4,  d0,  d1
        vld1.64         {d0}, [r1], r2
        \vhadd          d5,  d0,  d1
        vld1.64         {d1}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
        vst1.64         {d4}, [r0,:64], r2
        vst1.64         {d5}, [r0,:64], r2
        bne             1b
        bx              lr
.endm
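
@ Diagonal half-pel interpolation, 8 pixels wide; rounding is handled as
@ in pixels16_xy2.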
.macro  pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0
        vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d2, d3},  [r1], r2
.if \no_rnd
        vmov.i16        q11, #1
.endif
        pld             [r1]
        pld             [r1, r2]
        vext.8          d4,  d0,  d1,  #1
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q8,  d0,  d4
        vaddl.u8        q9,  d2,  d6
1:      subs            r3,  r3,  #2
        vld1.64         {d0, d1},  [r1], r2
        pld             [r1]
        vadd.u16        q10, q8,  q9
        vext.8          d4,  d0,  d1,  #1
.if \no_rnd
        vadd.u16        q10, q10, q11
.endif
        vaddl.u8        q8,  d0,  d4
        \vshrn          d5,  q10, #2
        vld1.64         {d2, d3},  [r1], r2
        vadd.u16        q10, q8,  q9
        pld             [r1, r2]
.if \no_rnd
        vadd.u16        q10, q10, q11
.endif
        vst1.64         {d5}, [r0,:64], r2
        \vshrn          d7,  q10, #2
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q9,  d2,  d6
        vst1.64         {d7}, [r0,:64], r2
        bgt             1b
        bx              lr
.endm
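
@ Expand one of the macros above into an exported function named
@ ff_<pfx><name><suf>_neon; pixfunc2 emits both the rounding variant and
@ the _no_rnd variant.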
.macro  pixfunc pfx name suf rnd_op args:vararg
function ff_\pfx\name\suf\()_neon, export=1
        \name \rnd_op \args
endfunc
.endm

.macro  pixfunc2 pfx name args:vararg
        pixfunc \pfx \name
        pixfunc \pfx \name \args
.endm
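
@ The h264 qpel mc00 cases are plain block copies: each stub below sets
@ the height and falls through into the put/avg function generated
@ immediately after it.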
function ff_put_h264_qpel16_mc00_neon, export=1
        mov             r3,  #16
endfunc

        pixfunc         put_ pixels16
        pixfunc2        put_ pixels16_x2,  _no_rnd, vhadd.u8
        pixfunc2        put_ pixels16_y2,  _no_rnd, vhadd.u8
        pixfunc2        put_ pixels16_xy2, _no_rnd, vshrn.u16, 1

function ff_avg_h264_qpel16_mc00_neon, export=1
        mov             r3,  #16
endfunc

        pixfunc         avg_ pixels16,, 1

function ff_put_h264_qpel8_mc00_neon, export=1
        mov             r3,  #8
endfunc

        pixfunc         put_ pixels8
        pixfunc2        put_ pixels8_x2,  _no_rnd, vhadd.u8
        pixfunc2        put_ pixels8_y2,  _no_rnd, vhadd.u8
        pixfunc2        put_ pixels8_xy2, _no_rnd, vshrn.u16, 1

function ff_avg_h264_qpel8_mc00_neon, export=1
        mov             r3,  #8
endfunc

        pixfunc         avg_ pixels8,, 1
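
@ Saturate an 8x8 block of signed 16-bit coefficients to unsigned 8-bit
@ pixels and store them row by row.
@ r0 = block, r1 = pixels, r2 = line size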
function ff_put_pixels_clamped_neon, export=1
        vld1.64         {d16-d19}, [r0,:128]!
        vqmovun.s16     d0,  q8
        vld1.64         {d20-d23}, [r0,:128]!
        vqmovun.s16     d1,  q9
        vld1.64         {d24-d27}, [r0,:128]!
        vqmovun.s16     d2,  q10
        vld1.64         {d28-d31}, [r0,:128]!
        vqmovun.s16     d3,  q11
        vst1.64         {d0}, [r1,:64], r2
        vqmovun.s16     d4,  q12
        vst1.64         {d1}, [r1,:64], r2
        vqmovun.s16     d5,  q13
        vst1.64         {d2}, [r1,:64], r2
        vqmovun.s16     d6,  q14
        vst1.64         {d3}, [r1,:64], r2
        vqmovun.s16     d7,  q15
        vst1.64         {d4}, [r1,:64], r2
        vst1.64         {d5}, [r1,:64], r2
        vst1.64         {d6}, [r1,:64], r2
        vst1.64         {d7}, [r1,:64], r2
        bx              lr
endfunc
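
@ As ff_put_pixels_clamped_neon, but saturate to signed 8-bit and add a
@ bias of 128.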
function ff_put_signed_pixels_clamped_neon, export=1
        vmov.u8         d31, #128
        vld1.64         {d16-d17}, [r0,:128]!
        vqmovn.s16      d0,  q8
        vld1.64         {d18-d19}, [r0,:128]!
        vqmovn.s16      d1,  q9
        vld1.64         {d16-d17}, [r0,:128]!
        vqmovn.s16      d2,  q8
        vld1.64         {d18-d19}, [r0,:128]!
        vadd.u8         d0,  d0,  d31
        vld1.64         {d20-d21}, [r0,:128]!
        vadd.u8         d1,  d1,  d31
        vld1.64         {d22-d23}, [r0,:128]!
        vadd.u8         d2,  d2,  d31
        vst1.64         {d0}, [r1,:64], r2
        vqmovn.s16      d3,  q9
        vst1.64         {d1}, [r1,:64], r2
        vqmovn.s16      d4,  q10
        vst1.64         {d2}, [r1,:64], r2
        vqmovn.s16      d5,  q11
        vld1.64         {d24-d25}, [r0,:128]!
        vadd.u8         d3,  d3,  d31
        vld1.64         {d26-d27}, [r0,:128]!
        vadd.u8         d4,  d4,  d31
        vadd.u8         d5,  d5,  d31
        vst1.64         {d3}, [r1,:64], r2
        vqmovn.s16      d6,  q12
        vst1.64         {d4}, [r1,:64], r2
        vqmovn.s16      d7,  q13
        vst1.64         {d5}, [r1,:64], r2
        vadd.u8         d6,  d6,  d31
        vadd.u8         d7,  d7,  d31
        vst1.64         {d6}, [r1,:64], r2
        vst1.64         {d7}, [r1,:64], r2
        bx              lr
endfunc
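
@ Add an 8x8 block of 16-bit coefficients to the destination pixels,
@ saturating the sums to unsigned 8-bit.
@ r0 = block, r1 = pixels, r2 = line size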
function ff_add_pixels_clamped_neon, export=1
        mov             r3,  r1
        vld1.64         {d16},   [r1,:64], r2
        vld1.64         {d0-d1}, [r0,:128]!
        vaddw.u8        q0,  q0,  d16
        vld1.64         {d17},   [r1,:64], r2
        vld1.64         {d2-d3}, [r0,:128]!
        vqmovun.s16     d0,  q0
        vld1.64         {d18},   [r1,:64], r2
        vaddw.u8        q1,  q1,  d17
        vld1.64         {d4-d5}, [r0,:128]!
        vaddw.u8        q2,  q2,  d18
        vst1.64         {d0},    [r3,:64], r2
        vqmovun.s16     d2,  q1
        vld1.64         {d19},   [r1,:64], r2
        vld1.64         {d6-d7}, [r0,:128]!
        vaddw.u8        q3,  q3,  d19
        vqmovun.s16     d4,  q2
        vst1.64         {d2},    [r3,:64], r2
        vld1.64         {d16},   [r1,:64], r2
        vqmovun.s16     d6,  q3
        vld1.64         {d0-d1}, [r0,:128]!
        vaddw.u8        q0,  q0,  d16
        vst1.64         {d4},    [r3,:64], r2
        vld1.64         {d17},   [r1,:64], r2
        vld1.64         {d2-d3}, [r0,:128]!
        vaddw.u8        q1,  q1,  d17
        vst1.64         {d6},    [r3,:64], r2
        vqmovun.s16     d0,  q0
        vld1.64         {d18},   [r1,:64], r2
        vld1.64         {d4-d5}, [r0,:128]!
        vaddw.u8        q2,  q2,  d18
        vst1.64         {d0},    [r3,:64], r2
        vqmovun.s16     d2,  q1
        vld1.64         {d19},   [r1,:64], r2
        vqmovun.s16     d4,  q2
        vld1.64         {d6-d7}, [r0,:128]!
        vaddw.u8        q3,  q3,  d19
        vst1.64         {d2},    [r3,:64], r2
        vqmovun.s16     d6,  q3
        vst1.64         {d4},    [r3,:64], r2
        vst1.64         {d6},    [r3,:64], r2
        bx              lr
endfunc
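
@ Elementwise single-precision multiply: dst[i] = src0[i] * src1[i].
@ r0 = dst, r1 = src0, r2 = src1, r3 = len (a multiple of 8)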
function ff_vector_fmul_neon, export=1
        subs            r3,  r3,  #8
        vld1.64         {d0-d3},  [r1,:128]!
        vld1.64         {d4-d7},  [r2,:128]!
        vmul.f32        q8,  q0,  q2
        vmul.f32        q9,  q1,  q3
        beq             3f
        bics            ip,  r3,  #15
        beq             2f
1:      subs            ip,  ip,  #16
        vld1.64         {d0-d1},  [r1,:128]!
        vld1.64         {d4-d5},  [r2,:128]!
        vmul.f32        q10, q0,  q2
        vld1.64         {d2-d3},  [r1,:128]!
        vld1.64         {d6-d7},  [r2,:128]!
        vmul.f32        q11, q1,  q3
        vst1.64         {d16-d19},[r0,:128]!
        vld1.64         {d0-d1},  [r1,:128]!
        vld1.64         {d4-d5},  [r2,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.64         {d2-d3},  [r1,:128]!
        vld1.64         {d6-d7},  [r2,:128]!
        vmul.f32        q9,  q1,  q3
        vst1.64         {d20-d23},[r0,:128]!
        bne             1b
        ands            r3,  r3,  #15
        beq             3f
2:      vld1.64         {d0-d1},  [r1,:128]!
        vld1.64         {d4-d5},  [r2,:128]!
        vst1.64         {d16-d17},[r0,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.64         {d2-d3},  [r1,:128]!
        vld1.64         {d6-d7},  [r2,:128]!
        vst1.64         {d18-d19},[r0,:128]!
        vmul.f32        q9,  q1,  q3
3:      vst1.64         {d16-d19},[r0,:128]!
        bx              lr
endfunc
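
@ Windowed overlap-add as used for MDCT windowing: multiply src0 and
@ src1 by the window and write the two mirrored halves of the output.
@ r0 = dst, r1 = src0, r2 = src1, r3 = win; len is passed on the stack.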
function ff_vector_fmul_window_neon, export=1
        push            {r4,r5,lr}
        ldr             lr,  [sp, #12]
        sub             r2,  r2,  #8
        sub             r5,  lr,  #2
        add             r2,  r2,  r5,  lsl #2
        add             r4,  r3,  r5,  lsl #3
        add             ip,  r0,  r5,  lsl #3
        mov             r5,  #-16
        vld1.64         {d0,d1},  [r1,:128]!
        vld1.64         {d2,d3},  [r2,:128], r5
        vld1.64         {d4,d5},  [r3,:128]!
        vld1.64         {d6,d7},  [r4,:128], r5
1:      subs            lr,  lr,  #4
        vmul.f32        d22, d0,  d4
        vrev64.32       q3,  q3
        vmul.f32        d23, d1,  d5
        vrev64.32       q1,  q1
        vmul.f32        d20, d0,  d7
        vmul.f32        d21, d1,  d6
        beq             2f
        vmla.f32        d22, d3,  d7
        vld1.64         {d0,d1},  [r1,:128]!
        vmla.f32        d23, d2,  d6
        vld1.64         {d18,d19},[r2,:128], r5
        vmls.f32        d20, d3,  d4
        vld1.64         {d24,d25},[r3,:128]!
        vmls.f32        d21, d2,  d5
        vld1.64         {d6,d7},  [r4,:128], r5
        vmov            q1,  q9
        vrev64.32       q11, q11
        vmov            q2,  q12
        vswp            d22, d23
        vst1.64         {d20,d21},[r0,:128]!
        vst1.64         {d22,d23},[ip,:128], r5
        b               1b
2:      vmla.f32        d22, d3,  d7
        vmla.f32        d23, d2,  d6
        vmls.f32        d20, d3,  d4
        vmls.f32        d21, d2,  d5
        vrev64.32       q11, q11
        vswp            d22, d23
        vst1.64         {d20,d21},[r0,:128]!
        vst1.64         {d22,d23},[ip,:128], r5
        pop             {r4,r5,pc}
endfunc

#if CONFIG_VORBIS_DECODER
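
@ Vorbis inverse channel coupling: rebuild the two channel vectors from
@ magnitude/angle pairs in place.
@ r0 = mag, r1 = ang, r2 = blocksize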
function ff_vorbis_inverse_coupling_neon, export=1
        vmov.i32        q10, #1<<31
        subs            r2,  r2,  #4
        mov             r3,  r0
        mov             r12, r1
        beq             3f
        vld1.32         {d24-d25},[r1,:128]!
        vld1.32         {d22-d23},[r0,:128]!
        vcle.s32        q8,  q12, #0
        vand            q9,  q11, q10
        veor            q12, q12, q9
        vand            q2,  q12, q8
        vbic            q3,  q12, q8
        vadd.f32        q12, q11, q2
        vsub.f32        q11, q11, q3
1:      vld1.32         {d2-d3},  [r1,:128]!
        vld1.32         {d0-d1},  [r0,:128]!
        vcle.s32        q8,  q1,  #0
        vand            q9,  q0,  q10
        veor            q1,  q1,  q9
        vst1.32         {d24-d25},[r3, :128]!
        vst1.32         {d22-d23},[r12,:128]!
        vand            q2,  q1,  q8
        vbic            q3,  q1,  q8
        vadd.f32        q1,  q0,  q2
        vsub.f32        q0,  q0,  q3
        subs            r2,  r2,  #8
        ble             2f
        vld1.32         {d24-d25},[r1,:128]!
        vld1.32         {d22-d23},[r0,:128]!
        vcle.s32        q8,  q12, #0
        vand            q9,  q11, q10
        veor            q12, q12, q9
        vst1.32         {d2-d3},  [r3, :128]!
        vst1.32         {d0-d1},  [r12,:128]!
        vand            q2,  q12, q8
        vbic            q3,  q12, q8
        vadd.f32        q12, q11, q2
        vsub.f32        q11, q11, q3
        b               1b
2:      vst1.32         {d2-d3},  [r3, :128]!
        vst1.32         {d0-d1},  [r12,:128]!
        bxlt            lr
3:      vld1.32         {d2-d3},  [r1,:128]
        vld1.32         {d0-d1},  [r0,:128]
        vcle.s32        q8,  q1,  #0
        vand            q9,  q0,  q10
        veor            q1,  q1,  q9
        vand            q2,  q1,  q8
        vbic            q3,  q1,  q8
        vadd.f32        q1,  q0,  q2
        vsub.f32        q0,  q0,  q3
        vst1.32         {d2-d3},  [r0,:128]!
        vst1.32         {d0-d1},  [r1,:128]!
        bx              lr
endfunc
#endif
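
@ Multiply a vector by a scalar: dst[i] = src[i] * mul.  The VFP/NOVFP
@ macros cover both ABIs: with hardware FP arguments the scalar arrives
@ in s0 and len in r2, otherwise in r2 and r3 respectively.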
function ff_vector_fmul_scalar_neon, export=1
VFP     len     .req    r2
NOVFP   len     .req    r3
VFP     vdup.32         q8,  d0[0]
NOVFP   vdup.32         q8,  r2
        bics            r12, len, #15
        beq             3f
        vld1.32         {q0},[r1,:128]!
        vld1.32         {q1},[r1,:128]!
1:      vmul.f32        q0,  q0,  q8
        vld1.32         {q2},[r1,:128]!
        vmul.f32        q1,  q1,  q8
        vld1.32         {q3},[r1,:128]!
        vmul.f32        q2,  q2,  q8
        vst1.32         {q0},[r0,:128]!
        vmul.f32        q3,  q3,  q8
        vst1.32         {q1},[r0,:128]!
        subs            r12, r12, #16
        beq             2f
        vld1.32         {q0},[r1,:128]!
        vst1.32         {q2},[r0,:128]!
        vld1.32         {q1},[r1,:128]!
        vst1.32         {q3},[r0,:128]!
        b               1b
2:      vst1.32         {q2},[r0,:128]!
        vst1.32         {q3},[r0,:128]!
        ands            len, len, #15
        bxeq            lr
3:      vld1.32         {q0},[r1,:128]!
        vmul.f32        q0,  q0,  q8
        vst1.32         {q0},[r0,:128]!
        subs            len, len, #4
        bgt             3b
        bx              lr
        .unreq          len
endfunc
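
@ Multiply src both by a scalar and by two-element vectors gathered
@ through an array of pointers (one pointer per two output floats).
@ r0 = dst, r1 = src, r2 = pointer array; scalar and len per ABI.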
function ff_vector_fmul_sv_scalar_2_neon, export=1
VFP     vdup.32         d16, d0[0]
NOVFP   vdup.32         d16, r3
NOVFP   ldr             r3,  [sp]
        vld1.32         {d0},[r1,:64]!
        vld1.32         {d1},[r1,:64]!
1:      subs            r3,  r3,  #4
        vmul.f32        d4,  d0,  d16
        vmul.f32        d5,  d1,  d16
        ldr             r12, [r2], #4
        vld1.32         {d2},[r12,:64]
        ldr             r12, [r2], #4
        vld1.32         {d3},[r12,:64]
        vmul.f32        d4,  d4,  d2
        vmul.f32        d5,  d5,  d3
        beq             2f
        vld1.32         {d0},[r1,:64]!
        vld1.32         {d1},[r1,:64]!
        vst1.32         {d4},[r0,:64]!
        vst1.32         {d5},[r0,:64]!
        b               1b
2:      vst1.32         {d4},[r0,:64]!
        vst1.32         {d5},[r0,:64]!
        bx              lr
endfunc
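
@ As above, with four-element vectors (one pointer per four floats).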
function ff_vector_fmul_sv_scalar_4_neon, export=1
VFP     vdup.32         q10, d0[0]
NOVFP   vdup.32         q10, r3
NOVFP   ldr             r3,  [sp]
        push            {lr}
        bics            lr,  r3,  #7
        beq             3f
        vld1.32         {q0},[r1,:128]!
        vld1.32         {q2},[r1,:128]!
1:      ldr             r12, [r2], #4
        vld1.32         {q1},[r12,:128]
        ldr             r12, [r2], #4
        vld1.32         {q3},[r12,:128]
        vmul.f32        q8,  q0,  q10
        vmul.f32        q8,  q8,  q1
        vmul.f32        q9,  q2,  q10
        vmul.f32        q9,  q9,  q3
        subs            lr,  lr,  #8
        beq             2f
        vld1.32         {q0},[r1,:128]!
        vld1.32         {q2},[r1,:128]!
        vst1.32         {q8},[r0,:128]!
        vst1.32         {q9},[r0,:128]!
        b               1b
2:      vst1.32         {q8},[r0,:128]!
        vst1.32         {q9},[r0,:128]!
        ands            r3,  r3,  #7
        popeq           {pc}
3:      vld1.32         {q0},[r1,:128]!
        ldr             r12, [r2], #4
        vld1.32         {q1},[r12,:128]
        vmul.f32        q0,  q0,  q10
        vmul.f32        q0,  q0,  q1
        vst1.32         {q0},[r0,:128]!
        subs            r3,  r3,  #4
        bgt             3b
        pop             {pc}
endfunc
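
@ Scale two-element vectors gathered through an array of pointers by a
@ scalar; unlike the sv_scalar functions above, there is no src vector.
@ r0 = dst, r1 = pointer array; scalar and len per ABI.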
function ff_sv_fmul_scalar_2_neon, export=1
VFP     len     .req    r2
NOVFP   len     .req    r3
VFP     vdup.32         q8,  d0[0]
NOVFP   vdup.32         q8,  r2
        ldr             r12, [r1], #4
        vld1.32         {d0},[r12,:64]
        ldr             r12, [r1], #4
        vld1.32         {d1},[r12,:64]
1:      vmul.f32        q1,  q0,  q8
        subs            len, len, #4
        beq             2f
        ldr             r12, [r1], #4
        vld1.32         {d0},[r12,:64]
        ldr             r12, [r1], #4
        vld1.32         {d1},[r12,:64]
        vst1.32         {q1},[r0,:128]!
        b               1b
2:      vst1.32         {q1},[r0,:128]!
        bx              lr
        .unreq          len
endfunc
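
@ As above, with four-element gathered vectors.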
function ff_sv_fmul_scalar_4_neon, export=1
VFP     len     .req    r2
NOVFP   len     .req    r3
VFP     vdup.32         q8,  d0[0]
NOVFP   vdup.32         q8,  r2
1:      ldr             r12, [r1], #4
        vld1.32         {q0},[r12,:128]
        vmul.f32        q0,  q0,  q8
        vst1.32         {q0},[r0,:128]!
        subs            len, len, #4
        bgt             1b
        bx              lr
        .unreq          len
endfunc
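
@ In-place butterflies: v1[i] += v2[i] while v2[i] = old v1[i] - v2[i].
@ r0 = v1, r1 = v2, r2 = len (a multiple of 4)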
function ff_butterflies_float_neon, export=1
1:      vld1.32         {q0},[r0,:128]
        vld1.32         {q1},[r1,:128]
        vsub.f32        q2,  q0,  q1
        vadd.f32        q1,  q0,  q1
        vst1.32         {q2},[r1,:128]!
        vst1.32         {q1},[r0,:128]!
        subs            r2,  r2,  #4
        bgt             1b
        bx              lr
endfunc
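
@ Dot product of two float vectors; the sum is reduced into d0 and, for
@ the soft-float ABI, moved to r0 for return.
@ r0 = v1, r1 = v2, r2 = len (a multiple of 4)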
function ff_scalarproduct_float_neon, export=1
        vmov.f32        q2,  #0.0
1:      vld1.32         {q0},[r0,:128]!
        vld1.32         {q1},[r1,:128]!
        vmla.f32        q2,  q0,  q1
        subs            r2,  r2,  #4
        bgt             1b
        vadd.f32        d0,  d4,  d5
        vpadd.f32       d0,  d0,  d0
NOVFP   vmov.32         r0,  d0[0]
        bx              lr
endfunc
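
@ Multiply one vector by another read backwards:
@ dst[i] = src0[i] * src1[len-1-i].
@ r0 = dst, r1 = src0, r2 = src1, r3 = len (a multiple of 8)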
function ff_vector_fmul_reverse_neon, export=1
        add             r2,  r2,  r3,  lsl #2
        sub             r2,  r2,  #32
        mov             r12, #-32
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q2-q3},  [r2,:128], r12
1:      pld             [r1, #32]
        vrev64.32       q3,  q3
        vmul.f32        d16, d0,  d7
        vmul.f32        d17, d1,  d6
        pld             [r2, #-32]
        vrev64.32       q2,  q2
        vmul.f32        d18, d2,  d5
        vmul.f32        d19, d3,  d4
        subs            r3,  r3,  #8
        beq             2f
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q2-q3},  [r2,:128], r12
        vst1.32         {q8-q9},  [r0,:128]!
        b               1b
2:      vst1.32         {q8-q9},  [r0,:128]!
        bx              lr
endfunc
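
@ Multiply-add: dst[i] = src0[i] * src1[i] + src2[i].
@ r0 = dst, r1 = src0, r2 = src1, r3 = src2; len is passed on the stack.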
function ff_vector_fmul_add_neon, export=1
        ldr             r12, [sp]
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q8-q9},  [r2,:128]!
        vld1.32         {q2-q3},  [r3,:128]!
        vmul.f32        q10, q0,  q8
        vmul.f32        q11, q1,  q9
1:      vadd.f32        q12, q2,  q10
        vadd.f32        q13, q3,  q11
        pld             [r1, #16]
        pld             [r2, #16]
        pld             [r3, #16]
        subs            r12, r12, #8
        beq             2f
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [r2,:128]!
        vmul.f32        q10, q0,  q8
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q9},     [r2,:128]!
        vmul.f32        q11, q1,  q9
        vld1.32         {q2-q3},  [r3,:128]!
        vst1.32         {q12-q13},[r0,:128]!
        b               1b
2:      vst1.32         {q12-q13},[r0,:128]!
        bx              lr
endfunc
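
@ Clip each element of src to [min, max].  With hardware FP arguments
@ the bounds arrive in s0/s1 and len in r2; otherwise min/max are in
@ r2/r3 and len on the stack.
@ r0 = dst, r1 = src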
function ff_vector_clipf_neon, export=1
VFP     vdup.32         q1,  d0[1]
VFP     vdup.32         q0,  d0[0]
NOVFP   vdup.32         q0,  r2
NOVFP   vdup.32         q1,  r3
NOVFP   ldr             r2,  [sp]
        vld1.f32        {q2},[r1,:128]!
        vmin.f32        q10, q2,  q1
        vld1.f32        {q3},[r1,:128]!
        vmin.f32        q11, q3,  q1
1:      vmax.f32        q8,  q10, q0
        vmax.f32        q9,  q11, q0
        subs            r2,  r2,  #8
        beq             2f
        vld1.f32        {q2},[r1,:128]!
        vmin.f32        q10, q2,  q1
        vld1.f32        {q3},[r1,:128]!
        vmin.f32        q11, q3,  q1
        vst1.f32        {q8},[r0,:128]!
        vst1.f32        {q9},[r0,:128]!
        b               1b
2:      vst1.f32        {q8},[r0,:128]!
        vst1.f32        {q9},[r0,:128]!
        bx              lr
endfunc