/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "config.h"
#include "libavutil/arm/asm.S"
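
@ ff_clear_block_neon: zero one 8x8 block of 16-bit coefficients (128 bytes)
@ at r0; ff_clear_blocks_neon below zeroes six consecutive blocks.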
function ff_clear_block_neon, export=1
        vmov.i16        q0, #0
.rept 8
        vst1.16         {q0}, [r0,:128]!
.endr
        bx              lr
endfunc
function ff_clear_blocks_neon, export=1
        vmov.i16        q0, #0
.rept 8*6
        vst1.16         {q0}, [r0,:128]!
.endr
        bx              lr
endfunc
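
@ pixels16: copy a 16-pixel-wide block (r1, stride r2) to r0 for r3 rows,
@ four rows per iteration; with avg=1 the result is averaged with the
@ existing destination instead of overwriting it.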
.macro pixels16 rnd=1, avg=0
.if \avg
        mov             r12, r0
.endif
1:      vld1.8          {q0}, [r1], r2
        vld1.8          {q1}, [r1], r2
        vld1.8          {q2}, [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.8          {q3}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
.if \avg
        vld1.8          {q8}, [r12,:128], r2
        vrhadd.u8       q0, q0, q8
        vld1.8          {q9}, [r12,:128], r2
        vrhadd.u8       q1, q1, q9
        vld1.8          {q10}, [r12,:128], r2
        vrhadd.u8       q2, q2, q10
        vld1.8          {q11}, [r12,:128], r2
        vrhadd.u8       q3, q3, q11
.endif
        subs            r3, r3, #4
        vst1.64         {q0}, [r0,:128], r2
        vst1.64         {q1}, [r0,:128], r2
        vst1.64         {q2}, [r0,:128], r2
        vst1.64         {q3}, [r0,:128], r2
        bne             1b
        bx              lr
.endm
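
@ pixels16_x2: 16-wide horizontal half-pel interpolation: each output pixel
@ is the average of a source pixel and its right neighbour (vext by one byte).
@ The avg helper, defined per rounding mode in pixfunc below, selects a
@ rounding or truncating average.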
.macro pixels16_x2 rnd=1, avg=0
1:      vld1.8          {d0-d2}, [r1], r2
        vld1.8          {d4-d6}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
        subs            r3, r3, #2
        vext.8          q1, q0, q1, #1
        avg             q0, q0, q1
        vext.8          q3, q2, q3, #1
        avg             q2, q2, q3
.if \avg
        vld1.8          {q1}, [r0,:128], r2
        vld1.8          {q3}, [r0,:128]
        vrhadd.u8       q0, q0, q1
        vrhadd.u8       q2, q2, q3
        sub             r0, r0, r2
.endif
        vst1.8          {q0}, [r0,:128], r2
        vst1.8          {q2}, [r0,:128], r2
        bne             1b
        bx              lr
.endm
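
@ pixels16_y2: 16-wide vertical half-pel interpolation: each output row is
@ the average of two consecutive source rows.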
.macro pixels16_y2 rnd=1, avg=0
        sub             r3, r3, #2
        vld1.8          {q0}, [r1], r2
        vld1.8          {q1}, [r1], r2
1:      subs            r3, r3, #2
        avg             q2, q0, q1
        vld1.8          {q0}, [r1], r2
        avg             q3, q0, q1
        vld1.8          {q1}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
.if \avg
        vld1.8          {q8}, [r0,:128], r2
        vld1.8          {q9}, [r0,:128]
        vrhadd.u8       q2, q2, q8
        vrhadd.u8       q3, q3, q9
        sub             r0, r0, r2
.endif
        vst1.8          {q2}, [r0,:128], r2
        vst1.8          {q3}, [r0,:128], r2
        bne             1b
        avg             q2, q0, q1
        vld1.8          {q0}, [r1], r2
        avg             q3, q0, q1
.if \avg
        vld1.8          {q8}, [r0,:128], r2
        vld1.8          {q9}, [r0,:128]
        vrhadd.u8       q2, q2, q8
        vrhadd.u8       q3, q3, q9
        sub             r0, r0, r2
.endif
        vst1.8          {q2}, [r0,:128], r2
        vst1.8          {q3}, [r0,:128], r2
        bx              lr
.endm
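
@ pixels16_xy2: 16-wide diagonal half-pel interpolation over a 2x2 source
@ neighbourhood: (a+b+c+d+2)>>2 with rounding (vrshrn), or (a+b+c+d+1)>>2 in
@ the _no_rnd variant (the NRND-guarded add of q13).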
.macro pixels16_xy2 rnd=1, avg=0
        sub             r3, r3, #2
        vld1.8          {d0-d2}, [r1], r2
        vld1.8          {d4-d6}, [r1], r2
NRND    vmov.i16        q13, #1
        pld             [r1]
        pld             [r1, r2]
        vext.8          q1, q0, q1, #1
        vext.8          q3, q2, q3, #1
        vaddl.u8        q8, d0, d2
        vaddl.u8        q10, d1, d3
        vaddl.u8        q9, d4, d6
        vaddl.u8        q11, d5, d7
1:      subs            r3, r3, #2
        vld1.8          {d0-d2}, [r1], r2
        vadd.u16        q12, q8, q9
        pld             [r1]
NRND    vadd.u16        q12, q12, q13
        vext.8          q15, q0, q1, #1
        vadd.u16        q1, q10, q11
        shrn            d28, q12, #2
NRND    vadd.u16        q1, q1, q13
        shrn            d29, q1, #2
.if \avg
        vld1.8          {q8}, [r0,:128]
        vrhadd.u8       q14, q14, q8
.endif
        vaddl.u8        q8, d0, d30
        vld1.8          {d2-d4}, [r1], r2
        vaddl.u8        q10, d1, d31
        vst1.8          {q14}, [r0,:128], r2
        vadd.u16        q12, q8, q9
        pld             [r1, r2]
NRND    vadd.u16        q12, q12, q13
        vext.8          q2, q1, q2, #1
        vadd.u16        q0, q10, q11
        shrn            d30, q12, #2
NRND    vadd.u16        q0, q0, q13
        shrn            d31, q0, #2
.if \avg
        vld1.8          {q9}, [r0,:128]
        vrhadd.u8       q15, q15, q9
.endif
        vaddl.u8        q9, d2, d4
        vaddl.u8        q11, d3, d5
        vst1.8          {q15}, [r0,:128], r2
        bgt             1b
        vld1.8          {d0-d2}, [r1], r2
        vadd.u16        q12, q8, q9
NRND    vadd.u16        q12, q12, q13
        vext.8          q15, q0, q1, #1
        vadd.u16        q1, q10, q11
        shrn            d28, q12, #2
NRND    vadd.u16        q1, q1, q13
        shrn            d29, q1, #2
.if \avg
        vld1.8          {q8}, [r0,:128]
        vrhadd.u8       q14, q14, q8
.endif
        vaddl.u8        q8, d0, d30
        vaddl.u8        q10, d1, d31
        vst1.8          {q14}, [r0,:128], r2
        vadd.u16        q12, q8, q9
NRND    vadd.u16        q12, q12, q13
        vadd.u16        q0, q10, q11
        shrn            d30, q12, #2
NRND    vadd.u16        q0, q0, q13
        shrn            d31, q0, #2
.if \avg
        vld1.8          {q9}, [r0,:128]
        vrhadd.u8       q15, q15, q9
.endif
        vst1.8          {q15}, [r0,:128], r2
        bx              lr
.endm
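
@ pixels8: 8-pixel-wide variant of pixels16 (copy or average, four rows per
@ iteration).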
.macro pixels8 rnd=1, avg=0
1:      vld1.8          {d0}, [r1], r2
        vld1.8          {d1}, [r1], r2
        vld1.8          {d2}, [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.8          {d3}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
.if \avg
        vld1.8          {d4}, [r0,:64], r2
        vrhadd.u8       d0, d0, d4
        vld1.8          {d5}, [r0,:64], r2
        vrhadd.u8       d1, d1, d5
        vld1.8          {d6}, [r0,:64], r2
        vrhadd.u8       d2, d2, d6
        vld1.8          {d7}, [r0,:64], r2
        vrhadd.u8       d3, d3, d7
        sub             r0, r0, r2, lsl #2
.endif
        subs            r3, r3, #4
        vst1.8          {d0}, [r0,:64], r2
        vst1.8          {d1}, [r0,:64], r2
        vst1.8          {d2}, [r0,:64], r2
        vst1.8          {d3}, [r0,:64], r2
        bne             1b
        bx              lr
.endm
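
@ pixels8_x2: 8-wide horizontal half-pel interpolation.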
.macro pixels8_x2 rnd=1, avg=0
1:      vld1.8          {q0}, [r1], r2
        vext.8          d1, d0, d1, #1
        vld1.8          {q1}, [r1], r2
        vext.8          d3, d2, d3, #1
        pld             [r1]
        pld             [r1, r2]
        subs            r3, r3, #2
        vswp            d1, d2
        avg             q0, q0, q1
.if \avg
        vld1.8          {d4}, [r0,:64], r2
        vld1.8          {d5}, [r0,:64]
        vrhadd.u8       q0, q0, q2
        sub             r0, r0, r2
.endif
        vst1.8          {d0}, [r0,:64], r2
        vst1.8          {d1}, [r0,:64], r2
        bne             1b
        bx              lr
.endm
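
@ pixels8_y2: 8-wide vertical half-pel interpolation.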
.macro pixels8_y2 rnd=1, avg=0
        sub             r3, r3, #2
        vld1.8          {d0}, [r1], r2
        vld1.8          {d1}, [r1], r2
1:      subs            r3, r3, #2
        avg             d4, d0, d1
        vld1.8          {d0}, [r1], r2
        avg             d5, d0, d1
        vld1.8          {d1}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
.if \avg
        vld1.8          {d2}, [r0,:64], r2
        vld1.8          {d3}, [r0,:64]
        vrhadd.u8       q2, q2, q1
        sub             r0, r0, r2
.endif
        vst1.8          {d4}, [r0,:64], r2
        vst1.8          {d5}, [r0,:64], r2
        bne             1b
        avg             d4, d0, d1
        vld1.8          {d0}, [r1], r2
        avg             d5, d0, d1
.if \avg
        vld1.8          {d2}, [r0,:64], r2
        vld1.8          {d3}, [r0,:64]
        vrhadd.u8       q2, q2, q1
        sub             r0, r0, r2
.endif
        vst1.8          {d4}, [r0,:64], r2
        vst1.8          {d5}, [r0,:64], r2
        bx              lr
.endm
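
@ pixels8_xy2: 8-wide diagonal half-pel interpolation.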
.macro pixels8_xy2 rnd=1, avg=0
        sub             r3, r3, #2
        vld1.8          {q0}, [r1], r2
        vld1.8          {q1}, [r1], r2
NRND    vmov.i16        q11, #1
        pld             [r1]
        pld             [r1, r2]
        vext.8          d4, d0, d1, #1
        vext.8          d6, d2, d3, #1
        vaddl.u8        q8, d0, d4
        vaddl.u8        q9, d2, d6
1:      subs            r3, r3, #2
        vld1.8          {q0}, [r1], r2
        pld             [r1]
        vadd.u16        q10, q8, q9
        vext.8          d4, d0, d1, #1
NRND    vadd.u16        q10, q10, q11
        vaddl.u8        q8, d0, d4
        shrn            d5, q10, #2
        vld1.8          {q1}, [r1], r2
        vadd.u16        q10, q8, q9
        pld             [r1, r2]
.if \avg
        vld1.8          {d7}, [r0,:64]
        vrhadd.u8       d5, d5, d7
.endif
NRND    vadd.u16        q10, q10, q11
        vst1.8          {d5}, [r0,:64], r2
        shrn            d7, q10, #2
.if \avg
        vld1.8          {d5}, [r0,:64]
        vrhadd.u8       d7, d7, d5
.endif
        vext.8          d6, d2, d3, #1
        vaddl.u8        q9, d2, d6
        vst1.8          {d7}, [r0,:64], r2
        bgt             1b
        vld1.8          {q0}, [r1], r2
        vadd.u16        q10, q8, q9
        vext.8          d4, d0, d1, #1
NRND    vadd.u16        q10, q10, q11
        vaddl.u8        q8, d0, d4
        shrn            d5, q10, #2
        vadd.u16        q10, q8, q9
.if \avg
        vld1.8          {d7}, [r0,:64]
        vrhadd.u8       d5, d5, d7
.endif
NRND    vadd.u16        q10, q10, q11
        vst1.8          {d5}, [r0,:64], r2
        shrn            d7, q10, #2
.if \avg
        vld1.8          {d5}, [r0,:64]
        vrhadd.u8       d7, d7, d5
.endif
        vst1.8          {d7}, [r0,:64], r2
        bx              lr
.endm
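
@ pixfunc: emit ff_\pfx\name\suf\()_neon from one of the macros above.  With
@ rnd=1 the avg/shrn helpers expand to the rounding forms (vrhadd/vrshrn) and
@ NRND drops its argument; with rnd=0 they expand to the truncating forms
@ (vhadd/vshrn) and NRND emits its argument.  pixfunc2 instantiates both the
@ rounding and the _no_rnd variant.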
.macro pixfunc pfx, name, suf, rnd=1, avg=0
.if \rnd
    .macro avg  rd, rn, rm
        vrhadd.u8       \rd, \rn, \rm
    .endm
    .macro shrn rd, rn, rm
        vrshrn.u16      \rd, \rn, \rm
    .endm
    .macro NRND insn:vararg
    .endm
.else
    .macro avg  rd, rn, rm
        vhadd.u8        \rd, \rn, \rm
    .endm
    .macro shrn rd, rn, rm
        vshrn.u16       \rd, \rn, \rm
    .endm
    .macro NRND insn:vararg
        \insn
    .endm
.endif
function ff_\pfx\name\suf\()_neon, export=1
        \name           \rnd, \avg
endfunc
.purgem avg
.purgem shrn
.purgem NRND
.endm
.macro pixfunc2 pfx, name, avg=0
        pixfunc         \pfx, \name, rnd=1, avg=\avg
        pixfunc         \pfx, \name, _no_rnd, rnd=0, avg=\avg
.endm
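
@ The h264 qpel mc00 (full-pel) cases are plain 16x16 or 8x8 copies/averages:
@ each stub below just loads the block height into r3 and falls through into
@ the ff_{put,avg}_pixels{16,8}_neon function emitted immediately after it.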
function ff_put_h264_qpel16_mc00_neon, export=1
        mov             r3, #16
endfunc

        pixfunc         put_, pixels16,     avg=0
        pixfunc2        put_, pixels16_x2,  avg=0
        pixfunc2        put_, pixels16_y2,  avg=0
        pixfunc2        put_, pixels16_xy2, avg=0

function ff_avg_h264_qpel16_mc00_neon, export=1
        mov             r3, #16
endfunc

        pixfunc         avg_, pixels16,     avg=1
        pixfunc2        avg_, pixels16_x2,  avg=1
        pixfunc2        avg_, pixels16_y2,  avg=1
        pixfunc2        avg_, pixels16_xy2, avg=1

function ff_put_h264_qpel8_mc00_neon, export=1
        mov             r3, #8
endfunc

        pixfunc         put_, pixels8,     avg=0
        pixfunc2        put_, pixels8_x2,  avg=0
        pixfunc2        put_, pixels8_y2,  avg=0
        pixfunc2        put_, pixels8_xy2, avg=0

function ff_avg_h264_qpel8_mc00_neon, export=1
        mov             r3, #8
endfunc

        pixfunc         avg_, pixels8,     avg=1
        pixfunc2        avg_, pixels8_x2,  avg=1
        pixfunc2        avg_, pixels8_y2,  avg=1
        pixfunc2        avg_, pixels8_xy2, avg=1
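
@ ff_put_pixels_clamped_neon: convert an 8x8 block of 16-bit coefficients at
@ r0 to unsigned 8-bit pixels (saturating with vqmovun) and store them to r1
@ with line stride r2.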
function ff_put_pixels_clamped_neon, export=1
        vld1.16         {d16-d19}, [r0,:128]!
        vqmovun.s16     d0, q8
        vld1.16         {d20-d23}, [r0,:128]!
        vqmovun.s16     d1, q9
        vld1.16         {d24-d27}, [r0,:128]!
        vqmovun.s16     d2, q10
        vld1.16         {d28-d31}, [r0,:128]!
        vqmovun.s16     d3, q11
        vst1.8          {d0}, [r1,:64], r2
        vqmovun.s16     d4, q12
        vst1.8          {d1}, [r1,:64], r2
        vqmovun.s16     d5, q13
        vst1.8          {d2}, [r1,:64], r2
        vqmovun.s16     d6, q14
        vst1.8          {d3}, [r1,:64], r2
        vqmovun.s16     d7, q15
        vst1.8          {d4}, [r1,:64], r2
        vst1.8          {d5}, [r1,:64], r2
        vst1.8          {d6}, [r1,:64], r2
        vst1.8          {d7}, [r1,:64], r2
        bx              lr
endfunc
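
@ ff_put_signed_pixels_clamped_neon: as above, but the coefficients are
@ saturated to signed 8-bit and biased by +128 (d31).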
function ff_put_signed_pixels_clamped_neon, export=1
        vmov.u8         d31, #128
        vld1.16         {d16-d17}, [r0,:128]!
        vqmovn.s16      d0, q8
        vld1.16         {d18-d19}, [r0,:128]!
        vqmovn.s16      d1, q9
        vld1.16         {d16-d17}, [r0,:128]!
        vqmovn.s16      d2, q8
        vld1.16         {d18-d19}, [r0,:128]!
        vadd.u8         d0, d0, d31
        vld1.16         {d20-d21}, [r0,:128]!
        vadd.u8         d1, d1, d31
        vld1.16         {d22-d23}, [r0,:128]!
        vadd.u8         d2, d2, d31
        vst1.8          {d0}, [r1,:64], r2
        vqmovn.s16      d3, q9
        vst1.8          {d1}, [r1,:64], r2
        vqmovn.s16      d4, q10
        vst1.8          {d2}, [r1,:64], r2
        vqmovn.s16      d5, q11
        vld1.16         {d24-d25}, [r0,:128]!
        vadd.u8         d3, d3, d31
        vld1.16         {d26-d27}, [r0,:128]!
        vadd.u8         d4, d4, d31
        vadd.u8         d5, d5, d31
        vst1.8          {d3}, [r1,:64], r2
        vqmovn.s16      d6, q12
        vst1.8          {d4}, [r1,:64], r2
        vqmovn.s16      d7, q13
        vst1.8          {d5}, [r1,:64], r2
        vadd.u8         d6, d6, d31
        vadd.u8         d7, d7, d31
        vst1.8          {d6}, [r1,:64], r2
        vst1.8          {d7}, [r1,:64], r2
        bx              lr
endfunc
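
@ ff_add_pixels_clamped_neon: add an 8x8 block of 16-bit coefficients at r0
@ to the 8-bit pixels at r1 (stride r2), saturate to 0..255 and store the
@ result back over the pixels.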
function ff_add_pixels_clamped_neon, export=1
        mov             r3, r1
        vld1.8          {d16}, [r1,:64], r2
        vld1.16         {d0-d1}, [r0,:128]!
        vaddw.u8        q0, q0, d16
        vld1.8          {d17}, [r1,:64], r2
        vld1.16         {d2-d3}, [r0,:128]!
        vqmovun.s16     d0, q0
        vld1.8          {d18}, [r1,:64], r2
        vaddw.u8        q1, q1, d17
        vld1.16         {d4-d5}, [r0,:128]!
        vaddw.u8        q2, q2, d18
        vst1.8          {d0}, [r3,:64], r2
        vqmovun.s16     d2, q1
        vld1.8          {d19}, [r1,:64], r2
        vld1.16         {d6-d7}, [r0,:128]!
        vaddw.u8        q3, q3, d19
        vqmovun.s16     d4, q2
        vst1.8          {d2}, [r3,:64], r2
        vld1.8          {d16}, [r1,:64], r2
        vqmovun.s16     d6, q3
        vld1.16         {d0-d1}, [r0,:128]!
        vaddw.u8        q0, q0, d16
        vst1.8          {d4}, [r3,:64], r2
        vld1.8          {d17}, [r1,:64], r2
        vld1.16         {d2-d3}, [r0,:128]!
        vaddw.u8        q1, q1, d17
        vst1.8          {d6}, [r3,:64], r2
        vqmovun.s16     d0, q0
        vld1.8          {d18}, [r1,:64], r2
        vld1.16         {d4-d5}, [r0,:128]!
        vaddw.u8        q2, q2, d18
        vst1.8          {d0}, [r3,:64], r2
        vqmovun.s16     d2, q1
        vld1.8          {d19}, [r1,:64], r2
        vqmovun.s16     d4, q2
        vld1.16         {d6-d7}, [r0,:128]!
        vaddw.u8        q3, q3, d19
        vst1.8          {d2}, [r3,:64], r2
        vqmovun.s16     d6, q3
        vst1.8          {d4}, [r3,:64], r2
        vst1.8          {d6}, [r3,:64], r2
        bx              lr
endfunc
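
@ ff_vector_fmul_window_neon(dst=r0, src0=r1, src1=r2, win=r3, len on stack):
@ overlap-add windowing.  src0 and the first half of the window are read
@ forwards, src1 and the second half of the window backwards, and the
@ products are written to the two halves of dst (forwards from r0, backwards
@ through ip).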
function ff_vector_fmul_window_neon, export=1
        push            {r4,r5,lr}
        ldr             lr, [sp, #12]
        sub             r2, r2, #8
        sub             r5, lr, #2
        add             r2, r2, r5, lsl #2
        add             r4, r3, r5, lsl #3
        add             ip, r0, r5, lsl #3
        mov             r5, #-16
        vld1.32         {d0,d1}, [r1,:128]!
        vld1.32         {d2,d3}, [r2,:128], r5
        vld1.32         {d4,d5}, [r3,:128]!
        vld1.32         {d6,d7}, [r4,:128], r5
1:      subs            lr, lr, #4
        vmul.f32        d22, d0, d4
        vrev64.32       q3, q3
        vmul.f32        d23, d1, d5
        vrev64.32       q1, q1
        vmul.f32        d20, d0, d7
        vmul.f32        d21, d1, d6
        beq             2f
        vmla.f32        d22, d3, d7
        vld1.32         {d0,d1}, [r1,:128]!
        vmla.f32        d23, d2, d6
        vld1.32         {d18,d19}, [r2,:128], r5
        vmls.f32        d20, d3, d4
        vld1.32         {d24,d25}, [r3,:128]!
        vmls.f32        d21, d2, d5
        vld1.32         {d6,d7}, [r4,:128], r5
        vmov            q1, q9
        vrev64.32       q11, q11
        vmov            q2, q12
        vswp            d22, d23
        vst1.32         {d20,d21}, [r0,:128]!
        vst1.32         {d22,d23}, [ip,:128], r5
        b               1b
2:      vmla.f32        d22, d3, d7
        vmla.f32        d23, d2, d6
        vmls.f32        d20, d3, d4
        vmls.f32        d21, d2, d5
        vrev64.32       q11, q11
        vswp            d22, d23
        vst1.32         {d20,d21}, [r0,:128]!
        vst1.32         {d22,d23}, [ip,:128], r5
        pop             {r4,r5,pc}
endfunc
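
@ Vorbis only: ff_vorbis_inverse_coupling_neon performs the inverse channel
@ coupling of the two vectors at r0 and r1 in place (r2 = length), using
@ sign/select masks instead of per-element branches.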
#if CONFIG_VORBIS_DECODER
function ff_vorbis_inverse_coupling_neon, export=1
        vmov.i32        q10, #1<<31
        subs            r2, r2, #4
        mov             r3, r0
        mov             r12, r1
        beq             3f
        vld1.32         {d24-d25}, [r1,:128]!
        vld1.32         {d22-d23}, [r0,:128]!
        vcle.s32        q8, q12, #0
        vand            q9, q11, q10
        veor            q12, q12, q9
        vand            q2, q12, q8
        vbic            q3, q12, q8
        vadd.f32        q12, q11, q2
        vsub.f32        q11, q11, q3
1:      vld1.32         {d2-d3}, [r1,:128]!
        vld1.32         {d0-d1}, [r0,:128]!
        vcle.s32        q8, q1, #0
        vand            q9, q0, q10
        veor            q1, q1, q9
        vst1.32         {d24-d25}, [r3,:128]!
        vst1.32         {d22-d23}, [r12,:128]!
        vand            q2, q1, q8
        vbic            q3, q1, q8
        vadd.f32        q1, q0, q2
        vsub.f32        q0, q0, q3
        subs            r2, r2, #8
        ble             2f
        vld1.32         {d24-d25}, [r1,:128]!
        vld1.32         {d22-d23}, [r0,:128]!
        vcle.s32        q8, q12, #0
        vand            q9, q11, q10
        veor            q12, q12, q9
        vst1.32         {d2-d3}, [r3,:128]!
        vst1.32         {d0-d1}, [r12,:128]!
        vand            q2, q12, q8
        vbic            q3, q12, q8
        vadd.f32        q12, q11, q2
        vsub.f32        q11, q11, q3
        b               1b
2:      vst1.32         {d2-d3}, [r3,:128]!
        vst1.32         {d0-d1}, [r12,:128]!
        it              lt
        bxlt            lr
3:      vld1.32         {d2-d3}, [r1,:128]
        vld1.32         {d0-d1}, [r0,:128]
        vcle.s32        q8, q1, #0
        vand            q9, q0, q10
        veor            q1, q1, q9
        vand            q2, q1, q8
        vbic            q3, q1, q8
        vadd.f32        q1, q0, q2
        vsub.f32        q0, q0, q3
        vst1.32         {d2-d3}, [r0,:128]!
        vst1.32         {d0-d1}, [r1,:128]!
        bx              lr
endfunc
#endif
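
@ ff_vector_fmul_scalar_neon: multiply a float vector by a scalar.  With a
@ hard-float (VFP) calling convention the scalar arrives in s0 and the length
@ in r2; with soft-float the scalar is in r2 and the length in r3.  The
@ VFP/NOVFP prefix macros select the matching lines.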
function ff_vector_fmul_scalar_neon, export=1
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32         q8, d0[0]
NOVFP   vdup.32         q8, r2
        bics            r12, len, #15
        beq             3f
        vld1.32         {q0}, [r1,:128]!
        vld1.32         {q1}, [r1,:128]!
1:      vmul.f32        q0, q0, q8
        vld1.32         {q2}, [r1,:128]!
        vmul.f32        q1, q1, q8
        vld1.32         {q3}, [r1,:128]!
        vmul.f32        q2, q2, q8
        vst1.32         {q0}, [r0,:128]!
        vmul.f32        q3, q3, q8
        vst1.32         {q1}, [r0,:128]!
        subs            r12, r12, #16
        beq             2f
        vld1.32         {q0}, [r1,:128]!
        vst1.32         {q2}, [r0,:128]!
        vld1.32         {q1}, [r1,:128]!
        vst1.32         {q3}, [r0,:128]!
        b               1b
2:      vst1.32         {q2}, [r0,:128]!
        vst1.32         {q3}, [r0,:128]!
        ands            len, len, #15
        it              eq
        bxeq            lr
3:      vld1.32         {q0}, [r1,:128]!
        vmul.f32        q0, q0, q8
        vst1.32         {q0}, [r0,:128]!
        subs            len, len, #4
        bgt             3b
        bx              lr
        .unreq          len
endfunc
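
@ ff_butterflies_float_neon(v1=r0, v2=r1, len=r2): in-place butterfly,
@ v1[i] += v2[i] and v2[i] = old v1[i] - v2[i].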
function ff_butterflies_float_neon, export=1
1:      vld1.32         {q0}, [r0,:128]
        vld1.32         {q1}, [r1,:128]
        vsub.f32        q2, q0, q1
        vadd.f32        q1, q0, q1
        vst1.32         {q2}, [r1,:128]!
        vst1.32         {q1}, [r0,:128]!
        subs            r2, r2, #4
        bgt             1b
        bx              lr
endfunc
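
@ ff_scalarproduct_float_neon(v1=r0, v2=r1, len=r2): dot product of two float
@ vectors; the sum is returned in s0 (and copied to r0 for soft-float ABIs).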
function ff_scalarproduct_float_neon, export=1
        vmov.f32        q2, #0.0
1:      vld1.32         {q0}, [r0,:128]!
        vld1.32         {q1}, [r1,:128]!
        vmla.f32        q2, q0, q1
        subs            r2, r2, #4
        bgt             1b
        vadd.f32        d0, d4, d5
        vpadd.f32       d0, d0, d0
NOVFP   vmov.32         r0, d0[0]
        bx              lr
endfunc
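
@ ff_vector_fmul_reverse_neon(dst=r0, src0=r1, src1=r2, len=r3):
@ dst[i] = src0[i] * src1[len-1-i]; src1 is read backwards and reversed with
@ vrev64.32.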
function ff_vector_fmul_reverse_neon, export=1
        add             r2, r2, r3, lsl #2
        sub             r2, r2, #32
        mov             r12, #-32
        vld1.32         {q0-q1}, [r1,:128]!
        vld1.32         {q2-q3}, [r2,:128], r12
1:      pld             [r1, #32]
        vrev64.32       q3, q3
        vmul.f32        d16, d0, d7
        vmul.f32        d17, d1, d6
        pld             [r2, #-32]
        vrev64.32       q2, q2
        vmul.f32        d18, d2, d5
        vmul.f32        d19, d3, d4
        subs            r3, r3, #8
        beq             2f
        vld1.32         {q0-q1}, [r1,:128]!
        vld1.32         {q2-q3}, [r2,:128], r12
        vst1.32         {q8-q9}, [r0,:128]!
        b               1b
2:      vst1.32         {q8-q9}, [r0,:128]!
        bx              lr
endfunc
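
@ ff_vector_fmul_add_neon(dst=r0, src0=r1, src1=r2, src2=r3, len on stack):
@ dst[i] = src0[i] * src1[i] + src2[i], eight floats per iteration.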
function ff_vector_fmul_add_neon, export=1
        ldr             r12, [sp]
        vld1.32         {q0-q1}, [r1,:128]!
        vld1.32         {q8-q9}, [r2,:128]!
        vld1.32         {q2-q3}, [r3,:128]!
        vmul.f32        q10, q0, q8
        vmul.f32        q11, q1, q9
1:      vadd.f32        q12, q2, q10
        vadd.f32        q13, q3, q11
        pld             [r1, #16]
        pld             [r2, #16]
        pld             [r3, #16]
        subs            r12, r12, #8
        beq             2f
        vld1.32         {q0}, [r1,:128]!
        vld1.32         {q8}, [r2,:128]!
        vmul.f32        q10, q0, q8
        vld1.32         {q1}, [r1,:128]!
        vld1.32         {q9}, [r2,:128]!
        vmul.f32        q11, q1, q9
        vld1.32         {q2-q3}, [r3,:128]!
        vst1.32         {q12-q13}, [r0,:128]!
        b               1b
2:      vst1.32         {q12-q13}, [r0,:128]!
        bx              lr
endfunc
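
@ ff_vector_clipf_neon: clamp a float vector to [min, max].  Hard-float:
@ dst=r0, src=r1, min/max in s0/s1, len=r2; soft-float: min=r2, max=r3, len
@ on the stack.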
function ff_vector_clipf_neon, export=1
VFP     vdup.32         q1, d0[1]
VFP     vdup.32         q0, d0[0]
NOVFP   vdup.32         q0, r2
NOVFP   vdup.32         q1, r3
NOVFP   ldr             r2, [sp]
        vld1.f32        {q2}, [r1,:128]!
        vmin.f32        q10, q2, q1
        vld1.f32        {q3}, [r1,:128]!
        vmin.f32        q11, q3, q1
1:      vmax.f32        q8, q10, q0
        vmax.f32        q9, q11, q0
        subs            r2, r2, #8
        beq             2f
        vld1.f32        {q2}, [r1,:128]!
        vmin.f32        q10, q2, q1
        vld1.f32        {q3}, [r1,:128]!
        vmin.f32        q11, q3, q1
        vst1.f32        {q8}, [r0,:128]!
        vst1.f32        {q9}, [r0,:128]!
        b               1b
2:      vst1.f32        {q8}, [r0,:128]!
        vst1.f32        {q9}, [r0,:128]!
        bx              lr
endfunc
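
@ ff_apply_window_int16_neon(dst=r0, src=r1, window=r2, len=r3): apply a
@ symmetric Q15 window with vqrdmulh, working inwards from both ends of the
@ sample vector so each window coefficient scales sample i and sample
@ len-1-i.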
function ff_apply_window_int16_neon, export=1
        push            {r4,lr}
        add             r4, r1, r3, lsl #1
        add             lr, r0, r3, lsl #1
        sub             r4, r4, #16
        sub             lr, lr, #16
        mov             r12, #-16
1:      vld1.16         {q0}, [r1,:128]!
        vld1.16         {q2}, [r2,:128]!
        vld1.16         {q1}, [r4,:128], r12
        vrev64.16       q3, q2
        vqrdmulh.s16    q0, q0, q2
        vqrdmulh.s16    d2, d2, d7
        vqrdmulh.s16    d3, d3, d6
        vst1.16         {q0}, [r0,:128]!
        vst1.16         {q1}, [lr,:128], r12
        subs            r3, r3, #16
        bgt             1b
        pop             {r4,pc}
endfunc
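
@ ff_vector_clip_int32_neon(dst=r0, src=r1, min=r2, max=r3, len on stack):
@ clamp 32-bit integers to [min, max], eight values per iteration.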
function ff_vector_clip_int32_neon, export=1
        vdup.32         q0, r2
        vdup.32         q1, r3
        ldr             r2, [sp]
1:      vld1.32         {q2-q3}, [r1,:128]!
        vmin.s32        q2, q2, q1
        vmin.s32        q3, q3, q1
        vmax.s32        q2, q2, q0
        vmax.s32        q3, q3, q0
        vst1.32         {q2-q3}, [r0,:128]!
        subs            r2, r2, #8
        bgt             1b
        bx              lr
endfunc