/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "asm.S"

        preserve8

function ff_clear_block_neon, export=1
        vmov.i16 q0, #0
.rept 8
        vst1.16 {q0}, [r0,:128]!
.endr
        bx lr
endfunc

function ff_clear_blocks_neon, export=1
        vmov.i16 q0, #0
.rept 8*6
        vst1.16 {q0}, [r0,:128]!
.endr
        bx lr
endfunc
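
@ Motion-compensation copy/average primitives.  Each pixelsNN macro
@ handles an NN-pixel-wide block; the _x2, _y2 and _xy2 variants also
@ average with the pixel one step to the right, below, or both
@ (half-pel interpolation), with or without rounding depending on rnd.
@ The avg argument selects averaging with the data already at the
@ destination.  Register use follows the usual dsputil convention
@ (assumed here): r0 = dst, r1 = src, r2 = line stride, r3 = height.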
.macro pixels16 rnd=1, avg=0
.if \avg
        mov r12, r0
.endif
1:      vld1.64 {q0}, [r1], r2
        vld1.64 {q1}, [r1], r2
        vld1.64 {q2}, [r1], r2
        pld [r1, r2, lsl #2]
        vld1.64 {q3}, [r1], r2
        pld [r1]
        pld [r1, r2]
        pld [r1, r2, lsl #1]
.if \avg
        vld1.64 {q8}, [r12,:128], r2
        vrhadd.u8 q0, q0, q8
        vld1.64 {q9}, [r12,:128], r2
        vrhadd.u8 q1, q1, q9
        vld1.64 {q10}, [r12,:128], r2
        vrhadd.u8 q2, q2, q10
        vld1.64 {q11}, [r12,:128], r2
        vrhadd.u8 q3, q3, q11
.endif
        subs r3, r3, #4
        vst1.64 {q0}, [r0,:128], r2
        vst1.64 {q1}, [r0,:128], r2
        vst1.64 {q2}, [r0,:128], r2
        vst1.64 {q3}, [r0,:128], r2
        bne 1b
        bx lr
.endm

.macro pixels16_x2 rnd=1, avg=0
1:      vld1.64 {d0-d2}, [r1], r2
        vld1.64 {d4-d6}, [r1], r2
        pld [r1]
        pld [r1, r2]
        subs r3, r3, #2
        vext.8 q1, q0, q1, #1
        avg q0, q0, q1
        vext.8 q3, q2, q3, #1
        avg q2, q2, q3
.if \avg
        vld1.8 {q1}, [r0,:128], r2
        vld1.8 {q3}, [r0,:128]
        vrhadd.u8 q0, q0, q1
        vrhadd.u8 q2, q2, q3
        sub r0, r0, r2
.endif
        vst1.64 {q0}, [r0,:128], r2
        vst1.64 {q2}, [r0,:128], r2
        bne 1b
        bx lr
.endm

.macro pixels16_y2 rnd=1, avg=0
        vld1.64 {q0}, [r1], r2
        vld1.64 {q1}, [r1], r2
1:      subs r3, r3, #2
        avg q2, q0, q1
        vld1.64 {q0}, [r1], r2
        avg q3, q0, q1
        vld1.64 {q1}, [r1], r2
        pld [r1]
        pld [r1, r2]
.if \avg
        vld1.8 {q8}, [r0,:128], r2
        vld1.8 {q9}, [r0,:128]
        vrhadd.u8 q2, q2, q8
        vrhadd.u8 q3, q3, q9
        sub r0, r0, r2
.endif
        vst1.64 {q2}, [r0,:128], r2
        vst1.64 {q3}, [r0,:128], r2
        bne 1b
        bx lr
.endm

.macro pixels16_xy2 rnd=1, avg=0
        vld1.64 {d0-d2}, [r1], r2
        vld1.64 {d4-d6}, [r1], r2
.ifeq \rnd
        vmov.i16 q13, #1
.endif
        pld [r1]
        pld [r1, r2]
        vext.8 q1, q0, q1, #1
        vext.8 q3, q2, q3, #1
        vaddl.u8 q8, d0, d2
        vaddl.u8 q10, d1, d3
        vaddl.u8 q9, d4, d6
        vaddl.u8 q11, d5, d7
1:      subs r3, r3, #2
        vld1.64 {d0-d2}, [r1], r2
        vadd.u16 q12, q8, q9
        pld [r1]
.ifeq \rnd
        vadd.u16 q12, q12, q13
.endif
        vext.8 q15, q0, q1, #1
        vadd.u16 q1, q10, q11
        shrn d28, q12, #2
.ifeq \rnd
        vadd.u16 q1, q1, q13
.endif
        shrn d29, q1, #2
.if \avg
        vld1.8 {q8}, [r0,:128]
        vrhadd.u8 q14, q14, q8
.endif
        vaddl.u8 q8, d0, d30
        vld1.64 {d2-d4}, [r1], r2
        vaddl.u8 q10, d1, d31
        vst1.64 {q14}, [r0,:128], r2
        vadd.u16 q12, q8, q9
        pld [r1, r2]
.ifeq \rnd
        vadd.u16 q12, q12, q13
.endif
        vext.8 q2, q1, q2, #1
        vadd.u16 q0, q10, q11
        shrn d30, q12, #2
.ifeq \rnd
        vadd.u16 q0, q0, q13
.endif
        shrn d31, q0, #2
.if \avg
        vld1.8 {q9}, [r0,:128]
        vrhadd.u8 q15, q15, q9
.endif
        vaddl.u8 q9, d2, d4
        vaddl.u8 q11, d3, d5
        vst1.64 {q15}, [r0,:128], r2
        bgt 1b
        bx lr
.endm

.macro pixels8 rnd=1, avg=0
1:      vld1.64 {d0}, [r1], r2
        vld1.64 {d1}, [r1], r2
        vld1.64 {d2}, [r1], r2
        pld [r1, r2, lsl #2]
        vld1.64 {d3}, [r1], r2
        pld [r1]
        pld [r1, r2]
        pld [r1, r2, lsl #1]
.if \avg
        vld1.64 {d4}, [r0,:64], r2
        vrhadd.u8 d0, d0, d4
        vld1.64 {d5}, [r0,:64], r2
        vrhadd.u8 d1, d1, d5
        vld1.64 {d6}, [r0,:64], r2
        vrhadd.u8 d2, d2, d6
        vld1.64 {d7}, [r0,:64], r2
        vrhadd.u8 d3, d3, d7
        sub r0, r0, r2, lsl #2
.endif
        subs r3, r3, #4
        vst1.64 {d0}, [r0,:64], r2
        vst1.64 {d1}, [r0,:64], r2
        vst1.64 {d2}, [r0,:64], r2
        vst1.64 {d3}, [r0,:64], r2
        bne 1b
        bx lr
.endm

.macro pixels8_x2 rnd=1, avg=0
1:      vld1.64 {q0}, [r1], r2
        vext.8 d1, d0, d1, #1
        vld1.64 {q1}, [r1], r2
        vext.8 d3, d2, d3, #1
        pld [r1]
        pld [r1, r2]
        subs r3, r3, #2
        vswp d1, d2
        avg q0, q0, q1
.if \avg
        vld1.8 {d4}, [r0,:64], r2
        vld1.8 {d5}, [r0,:64]
        vrhadd.u8 q0, q0, q2
        sub r0, r0, r2
.endif
        vst1.64 {d0}, [r0,:64], r2
        vst1.64 {d1}, [r0,:64], r2
        bne 1b
        bx lr
.endm

.macro pixels8_y2 rnd=1, avg=0
        vld1.64 {d0}, [r1], r2
        vld1.64 {d1}, [r1], r2
1:      subs r3, r3, #2
        avg d4, d0, d1
        vld1.64 {d0}, [r1], r2
        avg d5, d0, d1
        vld1.64 {d1}, [r1], r2
        pld [r1]
        pld [r1, r2]
.if \avg
        vld1.8 {d2}, [r0,:64], r2
        vld1.8 {d3}, [r0,:64]
        vrhadd.u8 q2, q2, q1
        sub r0, r0, r2
.endif
        vst1.64 {d4}, [r0,:64], r2
        vst1.64 {d5}, [r0,:64], r2
        bne 1b
        bx lr
.endm

.macro pixels8_xy2 rnd=1, avg=0
        vld1.64 {q0}, [r1], r2
        vld1.64 {q1}, [r1], r2
.ifeq \rnd
        vmov.i16 q11, #1
.endif
        pld [r1]
        pld [r1, r2]
        vext.8 d4, d0, d1, #1
        vext.8 d6, d2, d3, #1
        vaddl.u8 q8, d0, d4
        vaddl.u8 q9, d2, d6
1:      subs r3, r3, #2
        vld1.64 {q0}, [r1], r2
        pld [r1]
        vadd.u16 q10, q8, q9
        vext.8 d4, d0, d1, #1
.ifeq \rnd
        vadd.u16 q10, q10, q11
.endif
        vaddl.u8 q8, d0, d4
        shrn d5, q10, #2
        vld1.64 {q1}, [r1], r2
        vadd.u16 q10, q8, q9
        pld [r1, r2]
.if \avg
        vld1.8 {d7}, [r0,:64]
        vrhadd.u8 d5, d5, d7
.endif
.ifeq \rnd
        vadd.u16 q10, q10, q11
.endif
        vst1.64 {d5}, [r0,:64], r2
        shrn d7, q10, #2
.if \avg
        vld1.8 {d5}, [r0,:64]
        vrhadd.u8 d7, d7, d5
.endif
        vext.8 d6, d2, d3, #1
        vaddl.u8 q9, d2, d6
        vst1.64 {d7}, [r0,:64], r2
        bgt 1b
        bx lr
.endm
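
@ pixfunc expands one of the pixelsNN macros above into an exported
@ function, binding the local avg/shrn helpers to their rounding
@ (vrhadd/vrshrn) or truncating (vhadd/vshrn) forms.  pixfunc2 emits
@ both the rounded and the _no_rnd variant of a function.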
.macro pixfunc pfx, name, suf, rnd=1, avg=0
.if \rnd
.macro avg rd, rn, rm
        vrhadd.u8 \rd, \rn, \rm
.endm
.macro shrn rd, rn, rm
        vrshrn.u16 \rd, \rn, \rm
.endm
.else
.macro avg rd, rn, rm
        vhadd.u8 \rd, \rn, \rm
.endm
.macro shrn rd, rn, rm
        vshrn.u16 \rd, \rn, \rm
.endm
.endif
function ff_\pfx\name\suf\()_neon, export=1
        \name \rnd, \avg
endfunc
.purgem avg
.purgem shrn
.endm

.macro pixfunc2 pfx, name, avg=0
        pixfunc \pfx, \name, rnd=1, avg=\avg
        pixfunc \pfx, \name, _no_rnd, rnd=0, avg=\avg
.endm
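
@ The qpel*_mc00 entry points below only load the block height into r3
@ and then fall through (there is no return before endfunc) into the
@ corresponding pixels16/pixels8 function expanded immediately after.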
function ff_put_h264_qpel16_mc00_neon, export=1
        mov r3, #16
endfunc
        pixfunc  put_, pixels16,     avg=0
        pixfunc2 put_, pixels16_x2,  avg=0
        pixfunc2 put_, pixels16_y2,  avg=0
        pixfunc2 put_, pixels16_xy2, avg=0

function ff_avg_h264_qpel16_mc00_neon, export=1
        mov r3, #16
endfunc
        pixfunc  avg_, pixels16,     avg=1
        pixfunc2 avg_, pixels16_x2,  avg=1
        pixfunc2 avg_, pixels16_y2,  avg=1
        pixfunc2 avg_, pixels16_xy2, avg=1

function ff_put_h264_qpel8_mc00_neon, export=1
        mov r3, #8
endfunc
        pixfunc  put_, pixels8,     avg=0
        pixfunc2 put_, pixels8_x2,  avg=0
        pixfunc2 put_, pixels8_y2,  avg=0
        pixfunc2 put_, pixels8_xy2, avg=0

function ff_avg_h264_qpel8_mc00_neon, export=1
        mov r3, #8
endfunc
        pixfunc  avg_, pixels8,     avg=1
        pixfunc2 avg_, pixels8_x2,  avg=1
        pixfunc2 avg_, pixels8_y2,  avg=1
        pixfunc2 avg_, pixels8_xy2, avg=1
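
@ Convert an 8x8 block of 16-bit coefficients at r0 to unsigned 8-bit
@ pixels with saturation (vqmovun) and store them to r1 with stride r2
@ (argument layout assumed from the loads and stores below).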
function ff_put_pixels_clamped_neon, export=1
        vld1.64 {d16-d19}, [r0,:128]!
        vqmovun.s16 d0, q8
        vld1.64 {d20-d23}, [r0,:128]!
        vqmovun.s16 d1, q9
        vld1.64 {d24-d27}, [r0,:128]!
        vqmovun.s16 d2, q10
        vld1.64 {d28-d31}, [r0,:128]!
        vqmovun.s16 d3, q11
        vst1.64 {d0}, [r1,:64], r2
        vqmovun.s16 d4, q12
        vst1.64 {d1}, [r1,:64], r2
        vqmovun.s16 d5, q13
        vst1.64 {d2}, [r1,:64], r2
        vqmovun.s16 d6, q14
        vst1.64 {d3}, [r1,:64], r2
        vqmovun.s16 d7, q15
        vst1.64 {d4}, [r1,:64], r2
        vst1.64 {d5}, [r1,:64], r2
        vst1.64 {d6}, [r1,:64], r2
        vst1.64 {d7}, [r1,:64], r2
        bx lr
endfunc
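
@ As above, but the coefficients are narrowed with signed saturation
@ and then offset by 128 (d31) to bring them into unsigned pixel range.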
function ff_put_signed_pixels_clamped_neon, export=1
        vmov.u8 d31, #128
        vld1.64 {d16-d17}, [r0,:128]!
        vqmovn.s16 d0, q8
        vld1.64 {d18-d19}, [r0,:128]!
        vqmovn.s16 d1, q9
        vld1.64 {d16-d17}, [r0,:128]!
        vqmovn.s16 d2, q8
        vld1.64 {d18-d19}, [r0,:128]!
        vadd.u8 d0, d0, d31
        vld1.64 {d20-d21}, [r0,:128]!
        vadd.u8 d1, d1, d31
        vld1.64 {d22-d23}, [r0,:128]!
        vadd.u8 d2, d2, d31
        vst1.64 {d0}, [r1,:64], r2
        vqmovn.s16 d3, q9
        vst1.64 {d1}, [r1,:64], r2
        vqmovn.s16 d4, q10
        vst1.64 {d2}, [r1,:64], r2
        vqmovn.s16 d5, q11
        vld1.64 {d24-d25}, [r0,:128]!
        vadd.u8 d3, d3, d31
        vld1.64 {d26-d27}, [r0,:128]!
        vadd.u8 d4, d4, d31
        vadd.u8 d5, d5, d31
        vst1.64 {d3}, [r1,:64], r2
        vqmovn.s16 d6, q12
        vst1.64 {d4}, [r1,:64], r2
        vqmovn.s16 d7, q13
        vst1.64 {d5}, [r1,:64], r2
        vadd.u8 d6, d6, d31
        vadd.u8 d7, d7, d31
        vst1.64 {d6}, [r1,:64], r2
        vst1.64 {d7}, [r1,:64], r2
        bx lr
endfunc
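
@ Add an 8x8 block of 16-bit coefficients at r0 to the 8-bit pixels at
@ r1 (stride r2), widening, saturating and writing the result back.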
function ff_add_pixels_clamped_neon, export=1
        mov r3, r1
        vld1.64 {d16}, [r1,:64], r2
        vld1.64 {d0-d1}, [r0,:128]!
        vaddw.u8 q0, q0, d16
        vld1.64 {d17}, [r1,:64], r2
        vld1.64 {d2-d3}, [r0,:128]!
        vqmovun.s16 d0, q0
        vld1.64 {d18}, [r1,:64], r2
        vaddw.u8 q1, q1, d17
        vld1.64 {d4-d5}, [r0,:128]!
        vaddw.u8 q2, q2, d18
        vst1.64 {d0}, [r3,:64], r2
        vqmovun.s16 d2, q1
        vld1.64 {d19}, [r1,:64], r2
        vld1.64 {d6-d7}, [r0,:128]!
        vaddw.u8 q3, q3, d19
        vqmovun.s16 d4, q2
        vst1.64 {d2}, [r3,:64], r2
        vld1.64 {d16}, [r1,:64], r2
        vqmovun.s16 d6, q3
        vld1.64 {d0-d1}, [r0,:128]!
        vaddw.u8 q0, q0, d16
        vst1.64 {d4}, [r3,:64], r2
        vld1.64 {d17}, [r1,:64], r2
        vld1.64 {d2-d3}, [r0,:128]!
        vaddw.u8 q1, q1, d17
        vst1.64 {d6}, [r3,:64], r2
        vqmovun.s16 d0, q0
        vld1.64 {d18}, [r1,:64], r2
        vld1.64 {d4-d5}, [r0,:128]!
        vaddw.u8 q2, q2, d18
        vst1.64 {d0}, [r3,:64], r2
        vqmovun.s16 d2, q1
        vld1.64 {d19}, [r1,:64], r2
        vqmovun.s16 d4, q2
        vld1.64 {d6-d7}, [r0,:128]!
        vaddw.u8 q3, q3, d19
        vst1.64 {d2}, [r3,:64], r2
        vqmovun.s16 d6, q3
        vst1.64 {d4}, [r3,:64], r2
        vst1.64 {d6}, [r3,:64], r2
        bx lr
endfunc
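
@ Element-wise single-precision multiply: dst[i] = src0[i] * src1[i].
@ Assumed arguments: r0 = dst, r1 = src0, r2 = src1, r3 = number of
@ elements; the loop structure expects a multiple of 8.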
function ff_vector_fmul_neon, export=1
        subs r3, r3, #8
        vld1.64 {d0-d3}, [r1,:128]!
        vld1.64 {d4-d7}, [r2,:128]!
        vmul.f32 q8, q0, q2
        vmul.f32 q9, q1, q3
        beq 3f
        bics ip, r3, #15
        beq 2f
1:      subs ip, ip, #16
        vld1.64 {d0-d1}, [r1,:128]!
        vld1.64 {d4-d5}, [r2,:128]!
        vmul.f32 q10, q0, q2
        vld1.64 {d2-d3}, [r1,:128]!
        vld1.64 {d6-d7}, [r2,:128]!
        vmul.f32 q11, q1, q3
        vst1.64 {d16-d19}, [r0,:128]!
        vld1.64 {d0-d1}, [r1,:128]!
        vld1.64 {d4-d5}, [r2,:128]!
        vmul.f32 q8, q0, q2
        vld1.64 {d2-d3}, [r1,:128]!
        vld1.64 {d6-d7}, [r2,:128]!
        vmul.f32 q9, q1, q3
        vst1.64 {d20-d23}, [r0,:128]!
        bne 1b
        ands r3, r3, #15
        beq 3f
2:      vld1.64 {d0-d1}, [r1,:128]!
        vld1.64 {d4-d5}, [r2,:128]!
        vst1.64 {d16-d17}, [r0,:128]!
        vmul.f32 q8, q0, q2
        vld1.64 {d2-d3}, [r1,:128]!
        vld1.64 {d6-d7}, [r2,:128]!
        vst1.64 {d18-d19}, [r0,:128]!
        vmul.f32 q9, q1, q3
3:      vst1.64 {d16-d19}, [r0,:128]!
        bx lr
endfunc
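
@ Overlap windowing: the two inputs are multiplied by a symmetric
@ window read forwards and backwards, and the results are written from
@ both ends of dst towards the middle.  Assumed arguments: r0 = dst,
@ r1 = src0, r2 = src1, r3 = win, len on the stack.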
function ff_vector_fmul_window_neon, export=1
        push {r4,r5,lr}
        ldr lr, [sp, #12]
        sub r2, r2, #8
        sub r5, lr, #2
        add r2, r2, r5, lsl #2
        add r4, r3, r5, lsl #3
        add ip, r0, r5, lsl #3
        mov r5, #-16
        vld1.64 {d0,d1}, [r1,:128]!
        vld1.64 {d2,d3}, [r2,:128], r5
        vld1.64 {d4,d5}, [r3,:128]!
        vld1.64 {d6,d7}, [r4,:128], r5
1:      subs lr, lr, #4
        vmul.f32 d22, d0, d4
        vrev64.32 q3, q3
        vmul.f32 d23, d1, d5
        vrev64.32 q1, q1
        vmul.f32 d20, d0, d7
        vmul.f32 d21, d1, d6
        beq 2f
        vmla.f32 d22, d3, d7
        vld1.64 {d0,d1}, [r1,:128]!
        vmla.f32 d23, d2, d6
        vld1.64 {d18,d19}, [r2,:128], r5
        vmls.f32 d20, d3, d4
        vld1.64 {d24,d25}, [r3,:128]!
        vmls.f32 d21, d2, d5
        vld1.64 {d6,d7}, [r4,:128], r5
        vmov q1, q9
        vrev64.32 q11, q11
        vmov q2, q12
        vswp d22, d23
        vst1.64 {d20,d21}, [r0,:128]!
        vst1.64 {d22,d23}, [ip,:128], r5
        b 1b
2:      vmla.f32 d22, d3, d7
        vmla.f32 d23, d2, d6
        vmls.f32 d20, d3, d4
        vmls.f32 d21, d2, d5
        vrev64.32 q11, q11
        vswp d22, d23
        vst1.64 {d20,d21}, [r0,:128]!
        vst1.64 {d22,d23}, [ip,:128], r5
        pop {r4,r5,pc}
endfunc
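
@ Vorbis inverse channel coupling: rebuilds the angle channel from the
@ magnitude channel using only integer sign-bit manipulation (q10 holds
@ the sign-bit mask).  Assumed arguments: r0 = mag, r1 = ang,
@ r2 = blocksize.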
#if CONFIG_VORBIS_DECODER
function ff_vorbis_inverse_coupling_neon, export=1
        vmov.i32 q10, #1<<31
        subs r2, r2, #4
        mov r3, r0
        mov r12, r1
        beq 3f
        vld1.32 {d24-d25}, [r1,:128]!
        vld1.32 {d22-d23}, [r0,:128]!
        vcle.s32 q8, q12, #0
        vand q9, q11, q10
        veor q12, q12, q9
        vand q2, q12, q8
        vbic q3, q12, q8
        vadd.f32 q12, q11, q2
        vsub.f32 q11, q11, q3
1:      vld1.32 {d2-d3}, [r1,:128]!
        vld1.32 {d0-d1}, [r0,:128]!
        vcle.s32 q8, q1, #0
        vand q9, q0, q10
        veor q1, q1, q9
        vst1.32 {d24-d25}, [r3,:128]!
        vst1.32 {d22-d23}, [r12,:128]!
        vand q2, q1, q8
        vbic q3, q1, q8
        vadd.f32 q1, q0, q2
        vsub.f32 q0, q0, q3
        subs r2, r2, #8
        ble 2f
        vld1.32 {d24-d25}, [r1,:128]!
        vld1.32 {d22-d23}, [r0,:128]!
        vcle.s32 q8, q12, #0
        vand q9, q11, q10
        veor q12, q12, q9
        vst1.32 {d2-d3}, [r3,:128]!
        vst1.32 {d0-d1}, [r12,:128]!
        vand q2, q12, q8
        vbic q3, q12, q8
        vadd.f32 q12, q11, q2
        vsub.f32 q11, q11, q3
        b 1b
2:      vst1.32 {d2-d3}, [r3,:128]!
        vst1.32 {d0-d1}, [r12,:128]!
        it lt
        bxlt lr
3:      vld1.32 {d2-d3}, [r1,:128]
        vld1.32 {d0-d1}, [r0,:128]
        vcle.s32 q8, q1, #0
        vand q9, q0, q10
        veor q1, q1, q9
        vand q2, q1, q8
        vbic q3, q1, q8
        vadd.f32 q1, q0, q2
        vsub.f32 q0, q0, q3
        vst1.32 {d2-d3}, [r0,:128]!
        vst1.32 {d0-d1}, [r1,:128]!
        bx lr
endfunc
#endif
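
@ dst[i] = src[i] * scalar.  With a hard-float ABI (VFP) the scalar is
@ passed in s0 and the length in r2; with a soft-float ABI (NOVFP) the
@ scalar is in r2 and the length in r3.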
function ff_vector_fmul_scalar_neon, export=1
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32 q8, d0[0]
NOVFP   vdup.32 q8, r2
        bics r12, len, #15
        beq 3f
        vld1.32 {q0}, [r1,:128]!
        vld1.32 {q1}, [r1,:128]!
1:      vmul.f32 q0, q0, q8
        vld1.32 {q2}, [r1,:128]!
        vmul.f32 q1, q1, q8
        vld1.32 {q3}, [r1,:128]!
        vmul.f32 q2, q2, q8
        vst1.32 {q0}, [r0,:128]!
        vmul.f32 q3, q3, q8
        vst1.32 {q1}, [r0,:128]!
        subs r12, r12, #16
        beq 2f
        vld1.32 {q0}, [r1,:128]!
        vst1.32 {q2}, [r0,:128]!
        vld1.32 {q1}, [r1,:128]!
        vst1.32 {q3}, [r0,:128]!
        b 1b
2:      vst1.32 {q2}, [r0,:128]!
        vst1.32 {q3}, [r0,:128]!
        ands len, len, #15
        it eq
        bxeq lr
3:      vld1.32 {q0}, [r1,:128]!
        vmul.f32 q0, q0, q8
        vst1.32 {q0}, [r0,:128]!
        subs len, len, #4
        bgt 3b
        bx lr
.unreq len
endfunc
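
@ dst[i] += src[i] * scalar, with the same VFP/NOVFP argument split as
@ ff_vector_fmul_scalar_neon above.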
function ff_vector_fmac_scalar_neon, export=1
VFP     len .req r2
VFP     acc .req r3
NOVFP   len .req r3
NOVFP   acc .req r2
VFP     vdup.32 q15, d0[0]
NOVFP   vdup.32 q15, r2
        bics r12, len, #15
        mov acc, r0
        beq 3f
        vld1.32 {q0}, [r1,:128]!
        vld1.32 {q8}, [acc,:128]!
        vld1.32 {q1}, [r1,:128]!
        vld1.32 {q9}, [acc,:128]!
1:      vmla.f32 q8, q0, q15
        vld1.32 {q2}, [r1,:128]!
        vld1.32 {q10}, [acc,:128]!
        vmla.f32 q9, q1, q15
        vld1.32 {q3}, [r1,:128]!
        vld1.32 {q11}, [acc,:128]!
        vmla.f32 q10, q2, q15
        vst1.32 {q8}, [r0,:128]!
        vmla.f32 q11, q3, q15
        vst1.32 {q9}, [r0,:128]!
        subs r12, r12, #16
        beq 2f
        vld1.32 {q0}, [r1,:128]!
        vld1.32 {q8}, [acc,:128]!
        vst1.32 {q10}, [r0,:128]!
        vld1.32 {q1}, [r1,:128]!
        vld1.32 {q9}, [acc,:128]!
        vst1.32 {q11}, [r0,:128]!
        b 1b
2:      vst1.32 {q10}, [r0,:128]!
        vst1.32 {q11}, [r0,:128]!
        ands len, len, #15
        it eq
        bxeq lr
3:      vld1.32 {q0}, [r1,:128]!
        vld1.32 {q8}, [acc,:128]!
        vmla.f32 q8, q0, q15
        vst1.32 {q8}, [r0,:128]!
        subs len, len, #4
        bgt 3b
        bx lr
.unreq len
endfunc
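
@ In-place butterflies: v0[i], v1[i] = v0[i] + v1[i], v0[i] - v1[i].
@ Assumed arguments: r0 = v0, r1 = v1, r2 = length (multiple of 4).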
function ff_butterflies_float_neon, export=1
1:      vld1.32 {q0}, [r0,:128]
        vld1.32 {q1}, [r1,:128]
        vsub.f32 q2, q0, q1
        vadd.f32 q1, q0, q1
        vst1.32 {q2}, [r1,:128]!
        vst1.32 {q1}, [r0,:128]!
        subs r2, r2, #4
        bgt 1b
        bx lr
endfunc
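
@ Dot product of two float vectors; the horizontal sum ends up in s0
@ for hard-float callers, and NOVFP copies it to r0 for soft-float.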
function ff_scalarproduct_float_neon, export=1
        vmov.f32 q2, #0.0
1:      vld1.32 {q0}, [r0,:128]!
        vld1.32 {q1}, [r1,:128]!
        vmla.f32 q2, q0, q1
        subs r2, r2, #4
        bgt 1b
        vadd.f32 d0, d4, d5
        vpadd.f32 d0, d0, d0
NOVFP   vmov.32 r0, d0[0]
        bx lr
endfunc
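
@ dst[i] = src0[i] * src1[len-1-i]: src1 is walked backwards with a
@ negative stride and reversed within each vector by vrev64.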
function ff_vector_fmul_reverse_neon, export=1
        add r2, r2, r3, lsl #2
        sub r2, r2, #32
        mov r12, #-32
        vld1.32 {q0-q1}, [r1,:128]!
        vld1.32 {q2-q3}, [r2,:128], r12
1:      pld [r1, #32]
        vrev64.32 q3, q3
        vmul.f32 d16, d0, d7
        vmul.f32 d17, d1, d6
        pld [r2, #-32]
        vrev64.32 q2, q2
        vmul.f32 d18, d2, d5
        vmul.f32 d19, d3, d4
        subs r3, r3, #8
        beq 2f
        vld1.32 {q0-q1}, [r1,:128]!
        vld1.32 {q2-q3}, [r2,:128], r12
        vst1.32 {q8-q9}, [r0,:128]!
        b 1b
2:      vst1.32 {q8-q9}, [r0,:128]!
        bx lr
endfunc
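
@ dst[i] = src0[i] * src1[i] + src2[i].  Assumed arguments: r0 = dst,
@ r1 = src0, r2 = src1, r3 = src2, len on the stack.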
function ff_vector_fmul_add_neon, export=1
        ldr r12, [sp]
        vld1.32 {q0-q1}, [r1,:128]!
        vld1.32 {q8-q9}, [r2,:128]!
        vld1.32 {q2-q3}, [r3,:128]!
        vmul.f32 q10, q0, q8
        vmul.f32 q11, q1, q9
1:      vadd.f32 q12, q2, q10
        vadd.f32 q13, q3, q11
        pld [r1, #16]
        pld [r2, #16]
        pld [r3, #16]
        subs r12, r12, #8
        beq 2f
        vld1.32 {q0}, [r1,:128]!
        vld1.32 {q8}, [r2,:128]!
        vmul.f32 q10, q0, q8
        vld1.32 {q1}, [r1,:128]!
        vld1.32 {q9}, [r2,:128]!
        vmul.f32 q11, q1, q9
        vld1.32 {q2-q3}, [r3,:128]!
        vst1.32 {q12-q13}, [r0,:128]!
        b 1b
2:      vst1.32 {q12-q13}, [r0,:128]!
        bx lr
endfunc
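
@ Clamp each float in src to [min, max].  VFP builds take min/max in
@ s0/s1; NOVFP builds take them in r2/r3 with len on the stack.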
function ff_vector_clipf_neon, export=1
VFP     vdup.32 q1, d0[1]
VFP     vdup.32 q0, d0[0]
NOVFP   vdup.32 q0, r2
NOVFP   vdup.32 q1, r3
NOVFP   ldr r2, [sp]
        vld1.f32 {q2}, [r1,:128]!
        vmin.f32 q10, q2, q1
        vld1.f32 {q3}, [r1,:128]!
        vmin.f32 q11, q3, q1
1:      vmax.f32 q8, q10, q0
        vmax.f32 q9, q11, q0
        subs r2, r2, #8
        beq 2f
        vld1.f32 {q2}, [r1,:128]!
        vmin.f32 q10, q2, q1
        vld1.f32 {q3}, [r1,:128]!
        vmin.f32 q11, q3, q1
        vst1.f32 {q8}, [r0,:128]!
        vst1.f32 {q9}, [r0,:128]!
        b 1b
2:      vst1.f32 {q8}, [r0,:128]!
        vst1.f32 {q9}, [r0,:128]!
        bx lr
endfunc
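
@ Multiply 16-bit samples by a symmetric 16-bit window using Q15
@ rounding multiplies (vqrdmulh), walking input and output from both
@ ends towards the middle.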
function ff_apply_window_int16_neon, export=1
        push {r4,lr}
        add r4, r1, r3, lsl #1
        add lr, r0, r3, lsl #1
        sub r4, r4, #16
        sub lr, lr, #16
        mov r12, #-16
1:
        vld1.16 {q0}, [r1,:128]!
        vld1.16 {q2}, [r2,:128]!
        vld1.16 {q1}, [r4,:128], r12
        vrev64.16 q3, q2
        vqrdmulh.s16 q0, q0, q2
        vqrdmulh.s16 d2, d2, d7
        vqrdmulh.s16 d3, d3, d6
        vst1.16 {q0}, [r0,:128]!
        vst1.16 {q1}, [lr,:128], r12
        subs r3, r3, #16
        bgt 1b
        pop {r4,pc}
endfunc
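
@ Clamp each 32-bit integer in src to [min, max].  Assumed arguments:
@ r0 = dst, r1 = src, r2 = min, r3 = max, len on the stack.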
function ff_vector_clip_int32_neon, export=1
        vdup.32 q0, r2
        vdup.32 q1, r3
        ldr r2, [sp]
1:
        vld1.32 {q2-q3}, [r1,:128]!
        vmin.s32 q2, q2, q1
        vmin.s32 q3, q3, q1
        vmax.s32 q2, q2, q0
        vmax.s32 q3, q3, q0
        vst1.32 {q2-q3}, [r0,:128]!
        subs r2, r2, #8
        bgt 1b
        bx lr
endfunc