/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "config.h"
#include "asm.S"

        preserve8
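@ ff_clear_block_neon: zero one 8x8 block of 16-bit coefficients (128 bytes)
@ starting at r0; ff_clear_blocks_neon zeroes six such blocks back to back.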
function ff_clear_block_neon, export=1
        vmov.i16        q0, #0
.rept 8
        vst1.16         {q0}, [r0,:128]!
.endr
        bx              lr
endfunc

function ff_clear_blocks_neon, export=1
        vmov.i16        q0, #0
.rept 8*6
        vst1.16         {q0}, [r0,:128]!
.endr
        bx              lr
endfunc
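@ The pixelsNN[_x2|_y2|_xy2] macros below generate the put/avg pixel copy and
@ half-pel interpolation loops.  Register usage in the macro bodies:
@ r0 = destination, r1 = source, r2 = line stride in bytes, r3 = row count.
@ With avg=1 the computed row is additionally averaged with the pixels
@ already present at the destination (read-modify-write).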
.macro  pixels16        rnd=1, avg=0
  .if \avg
        mov             r12, r0
  .endif
1:      vld1.8          {q0}, [r1], r2
        vld1.8          {q1}, [r1], r2
        vld1.8          {q2}, [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.8          {q3}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
  .if \avg
        vld1.8          {q8}, [r12,:128], r2
        vrhadd.u8       q0, q0, q8
        vld1.8          {q9}, [r12,:128], r2
        vrhadd.u8       q1, q1, q9
        vld1.8          {q10}, [r12,:128], r2
        vrhadd.u8       q2, q2, q10
        vld1.8          {q11}, [r12,:128], r2
        vrhadd.u8       q3, q3, q11
  .endif
        subs            r3, r3, #4
        vst1.64         {q0}, [r0,:128], r2
        vst1.64         {q1}, [r0,:128], r2
        vst1.64         {q2}, [r0,:128], r2
        vst1.64         {q3}, [r0,:128], r2
        bne             1b
        bx              lr
.endm
.macro  pixels16_x2     rnd=1, avg=0
1:      vld1.8          {d0-d2}, [r1], r2
        vld1.8          {d4-d6}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
        subs            r3, r3, #2
        vext.8          q1, q0, q1, #1
        avg             q0, q0, q1
        vext.8          q3, q2, q3, #1
        avg             q2, q2, q3
  .if \avg
        vld1.8          {q1}, [r0,:128], r2
        vld1.8          {q3}, [r0,:128]
        vrhadd.u8       q0, q0, q1
        vrhadd.u8       q2, q2, q3
        sub             r0, r0, r2
  .endif
        vst1.8          {q0}, [r0,:128], r2
        vst1.8          {q2}, [r0,:128], r2
        bne             1b
        bx              lr
.endm
.macro  pixels16_y2     rnd=1, avg=0
        sub             r3, r3, #2
        vld1.8          {q0}, [r1], r2
        vld1.8          {q1}, [r1], r2
1:      subs            r3, r3, #2
        avg             q2, q0, q1
        vld1.8          {q0}, [r1], r2
        avg             q3, q0, q1
        vld1.8          {q1}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
  .if \avg
        vld1.8          {q8}, [r0,:128], r2
        vld1.8          {q9}, [r0,:128]
        vrhadd.u8       q2, q2, q8
        vrhadd.u8       q3, q3, q9
        sub             r0, r0, r2
  .endif
        vst1.8          {q2}, [r0,:128], r2
        vst1.8          {q3}, [r0,:128], r2
        bne             1b
        avg             q2, q0, q1
        vld1.8          {q0}, [r1], r2
        avg             q3, q0, q1
  .if \avg
        vld1.8          {q8}, [r0,:128], r2
        vld1.8          {q9}, [r0,:128]
        vrhadd.u8       q2, q2, q8
        vrhadd.u8       q3, q3, q9
        sub             r0, r0, r2
  .endif
        vst1.8          {q2}, [r0,:128], r2
        vst1.8          {q3}, [r0,:128], r2
        bx              lr
.endm
.macro  pixels16_xy2    rnd=1, avg=0
        sub             r3, r3, #2
        vld1.8          {d0-d2}, [r1], r2
        vld1.8          {d4-d6}, [r1], r2
NRND    vmov.i16        q13, #1
        pld             [r1]
        pld             [r1, r2]
        vext.8          q1, q0, q1, #1
        vext.8          q3, q2, q3, #1
        vaddl.u8        q8, d0, d2
        vaddl.u8        q10, d1, d3
        vaddl.u8        q9, d4, d6
        vaddl.u8        q11, d5, d7
1:      subs            r3, r3, #2
        vld1.8          {d0-d2}, [r1], r2
        vadd.u16        q12, q8, q9
        pld             [r1]
NRND    vadd.u16        q12, q12, q13
        vext.8          q15, q0, q1, #1
        vadd.u16        q1, q10, q11
        shrn            d28, q12, #2
NRND    vadd.u16        q1, q1, q13
        shrn            d29, q1, #2
  .if \avg
        vld1.8          {q8}, [r0,:128]
        vrhadd.u8       q14, q14, q8
  .endif
        vaddl.u8        q8, d0, d30
        vld1.8          {d2-d4}, [r1], r2
        vaddl.u8        q10, d1, d31
        vst1.8          {q14}, [r0,:128], r2
        vadd.u16        q12, q8, q9
        pld             [r1, r2]
NRND    vadd.u16        q12, q12, q13
        vext.8          q2, q1, q2, #1
        vadd.u16        q0, q10, q11
        shrn            d30, q12, #2
NRND    vadd.u16        q0, q0, q13
        shrn            d31, q0, #2
  .if \avg
        vld1.8          {q9}, [r0,:128]
        vrhadd.u8       q15, q15, q9
  .endif
        vaddl.u8        q9, d2, d4
        vaddl.u8        q11, d3, d5
        vst1.8          {q15}, [r0,:128], r2
        bgt             1b
        vld1.8          {d0-d2}, [r1], r2
        vadd.u16        q12, q8, q9
NRND    vadd.u16        q12, q12, q13
        vext.8          q15, q0, q1, #1
        vadd.u16        q1, q10, q11
        shrn            d28, q12, #2
NRND    vadd.u16        q1, q1, q13
        shrn            d29, q1, #2
  .if \avg
        vld1.8          {q8}, [r0,:128]
        vrhadd.u8       q14, q14, q8
  .endif
        vaddl.u8        q8, d0, d30
        vaddl.u8        q10, d1, d31
        vst1.8          {q14}, [r0,:128], r2
        vadd.u16        q12, q8, q9
NRND    vadd.u16        q12, q12, q13
        vadd.u16        q0, q10, q11
        shrn            d30, q12, #2
NRND    vadd.u16        q0, q0, q13
        shrn            d31, q0, #2
  .if \avg
        vld1.8          {q9}, [r0,:128]
        vrhadd.u8       q15, q15, q9
  .endif
        vst1.8          {q15}, [r0,:128], r2
        bx              lr
.endm
.macro  pixels8         rnd=1, avg=0
1:      vld1.8          {d0}, [r1], r2
        vld1.8          {d1}, [r1], r2
        vld1.8          {d2}, [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.8          {d3}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
  .if \avg
        vld1.8          {d4}, [r0,:64], r2
        vrhadd.u8       d0, d0, d4
        vld1.8          {d5}, [r0,:64], r2
        vrhadd.u8       d1, d1, d5
        vld1.8          {d6}, [r0,:64], r2
        vrhadd.u8       d2, d2, d6
        vld1.8          {d7}, [r0,:64], r2
        vrhadd.u8       d3, d3, d7
        sub             r0, r0, r2, lsl #2
  .endif
        subs            r3, r3, #4
        vst1.8          {d0}, [r0,:64], r2
        vst1.8          {d1}, [r0,:64], r2
        vst1.8          {d2}, [r0,:64], r2
        vst1.8          {d3}, [r0,:64], r2
        bne             1b
        bx              lr
.endm
.macro  pixels8_x2      rnd=1, avg=0
1:      vld1.8          {q0}, [r1], r2
        vext.8          d1, d0, d1, #1
        vld1.8          {q1}, [r1], r2
        vext.8          d3, d2, d3, #1
        pld             [r1]
        pld             [r1, r2]
        subs            r3, r3, #2
        vswp            d1, d2
        avg             q0, q0, q1
  .if \avg
        vld1.8          {d4}, [r0,:64], r2
        vld1.8          {d5}, [r0,:64]
        vrhadd.u8       q0, q0, q2
        sub             r0, r0, r2
  .endif
        vst1.8          {d0}, [r0,:64], r2
        vst1.8          {d1}, [r0,:64], r2
        bne             1b
        bx              lr
.endm
.macro  pixels8_y2      rnd=1, avg=0
        sub             r3, r3, #2
        vld1.8          {d0}, [r1], r2
        vld1.8          {d1}, [r1], r2
1:      subs            r3, r3, #2
        avg             d4, d0, d1
        vld1.8          {d0}, [r1], r2
        avg             d5, d0, d1
        vld1.8          {d1}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
  .if \avg
        vld1.8          {d2}, [r0,:64], r2
        vld1.8          {d3}, [r0,:64]
        vrhadd.u8       q2, q2, q1
        sub             r0, r0, r2
  .endif
        vst1.8          {d4}, [r0,:64], r2
        vst1.8          {d5}, [r0,:64], r2
        bne             1b
        avg             d4, d0, d1
        vld1.8          {d0}, [r1], r2
        avg             d5, d0, d1
  .if \avg
        vld1.8          {d2}, [r0,:64], r2
        vld1.8          {d3}, [r0,:64]
        vrhadd.u8       q2, q2, q1
        sub             r0, r0, r2
  .endif
        vst1.8          {d4}, [r0,:64], r2
        vst1.8          {d5}, [r0,:64], r2
        bx              lr
.endm
.macro  pixels8_xy2     rnd=1, avg=0
        sub             r3, r3, #2
        vld1.8          {q0}, [r1], r2
        vld1.8          {q1}, [r1], r2
NRND    vmov.i16        q11, #1
        pld             [r1]
        pld             [r1, r2]
        vext.8          d4, d0, d1, #1
        vext.8          d6, d2, d3, #1
        vaddl.u8        q8, d0, d4
        vaddl.u8        q9, d2, d6
1:      subs            r3, r3, #2
        vld1.8          {q0}, [r1], r2
        pld             [r1]
        vadd.u16        q10, q8, q9
        vext.8          d4, d0, d1, #1
NRND    vadd.u16        q10, q10, q11
        vaddl.u8        q8, d0, d4
        shrn            d5, q10, #2
        vld1.8          {q1}, [r1], r2
        vadd.u16        q10, q8, q9
        pld             [r1, r2]
  .if \avg
        vld1.8          {d7}, [r0,:64]
        vrhadd.u8       d5, d5, d7
  .endif
NRND    vadd.u16        q10, q10, q11
        vst1.8          {d5}, [r0,:64], r2
        shrn            d7, q10, #2
  .if \avg
        vld1.8          {d5}, [r0,:64]
        vrhadd.u8       d7, d7, d5
  .endif
        vext.8          d6, d2, d3, #1
        vaddl.u8        q9, d2, d6
        vst1.8          {d7}, [r0,:64], r2
        bgt             1b
        vld1.8          {q0}, [r1], r2
        vadd.u16        q10, q8, q9
        vext.8          d4, d0, d1, #1
NRND    vadd.u16        q10, q10, q11
        vaddl.u8        q8, d0, d4
        shrn            d5, q10, #2
        vadd.u16        q10, q8, q9
  .if \avg
        vld1.8          {d7}, [r0,:64]
        vrhadd.u8       d5, d5, d7
  .endif
NRND    vadd.u16        q10, q10, q11
        vst1.8          {d5}, [r0,:64], r2
        shrn            d7, q10, #2
  .if \avg
        vld1.8          {d5}, [r0,:64]
        vrhadd.u8       d7, d7, d5
  .endif
        vst1.8          {d7}, [r0,:64], r2
        bx              lr
.endm
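@ pixfunc instantiates one of the macros above as an exported function.
@ With rnd=1 the helper macros expand to the rounding forms (vrhadd/vrshrn)
@ and NRND discards its argument; with rnd=0 they expand to the truncating
@ forms (vhadd/vshrn) and NRND emits the extra "+1" bias adds used by the
@ _no_rnd variants of the xy2 filters.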
.macro  pixfunc         pfx, name, suf, rnd=1, avg=0
  .if \rnd
    .macro avg  rd, rn, rm
        vrhadd.u8       \rd, \rn, \rm
    .endm
    .macro shrn rd, rn, rm
        vrshrn.u16      \rd, \rn, \rm
    .endm
    .macro NRND insn:vararg
    .endm
  .else
    .macro avg  rd, rn, rm
        vhadd.u8        \rd, \rn, \rm
    .endm
    .macro shrn rd, rn, rm
        vshrn.u16       \rd, \rn, \rm
    .endm
    .macro NRND insn:vararg
        \insn
    .endm
  .endif
function ff_\pfx\name\suf\()_neon, export=1
        \name           \rnd, \avg
endfunc
        .purgem         avg
        .purgem         shrn
        .purgem         NRND
.endm
.macro  pixfunc2        pfx, name, avg=0
        pixfunc         \pfx, \name,          rnd=1, avg=\avg
        pixfunc         \pfx, \name, _no_rnd, rnd=0, avg=\avg
.endm
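@ The h264 qpel mc00 cases are plain 16x16/8x8 copies (or averages): each
@ wrapper below only loads the block height into r3 and then falls through
@ into the pixels16/pixels8 function generated immediately after it.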
function ff_put_h264_qpel16_mc00_neon, export=1
        mov             r3, #16
endfunc

        pixfunc         put_, pixels16,     avg=0
        pixfunc2        put_, pixels16_x2,  avg=0
        pixfunc2        put_, pixels16_y2,  avg=0
        pixfunc2        put_, pixels16_xy2, avg=0

function ff_avg_h264_qpel16_mc00_neon, export=1
        mov             r3, #16
endfunc

        pixfunc         avg_, pixels16,     avg=1
        pixfunc2        avg_, pixels16_x2,  avg=1
        pixfunc2        avg_, pixels16_y2,  avg=1
        pixfunc2        avg_, pixels16_xy2, avg=1

function ff_put_h264_qpel8_mc00_neon, export=1
        mov             r3, #8
endfunc

        pixfunc         put_, pixels8,     avg=0
        pixfunc2        put_, pixels8_x2,  avg=0
        pixfunc2        put_, pixels8_y2,  avg=0
        pixfunc2        put_, pixels8_xy2, avg=0

function ff_avg_h264_qpel8_mc00_neon, export=1
        mov             r3, #8
endfunc

        pixfunc         avg_, pixels8,     avg=1
        pixfunc2        avg_, pixels8_x2,  avg=1
        pixfunc2        avg_, pixels8_y2,  avg=1
        pixfunc2        avg_, pixels8_xy2, avg=1
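@ ff_put_pixels_clamped_neon: r0 = 8x8 block of 16-bit coefficients,
@ r1 = destination pixels, r2 = destination stride.  Each coefficient is
@ saturated to unsigned 8 bits (vqmovun) before being stored.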
function ff_put_pixels_clamped_neon, export=1
        vld1.16         {d16-d19}, [r0,:128]!
        vqmovun.s16     d0, q8
        vld1.16         {d20-d23}, [r0,:128]!
        vqmovun.s16     d1, q9
        vld1.16         {d24-d27}, [r0,:128]!
        vqmovun.s16     d2, q10
        vld1.16         {d28-d31}, [r0,:128]!
        vqmovun.s16     d3, q11
        vst1.8          {d0}, [r1,:64], r2
        vqmovun.s16     d4, q12
        vst1.8          {d1}, [r1,:64], r2
        vqmovun.s16     d5, q13
        vst1.8          {d2}, [r1,:64], r2
        vqmovun.s16     d6, q14
        vst1.8          {d3}, [r1,:64], r2
        vqmovun.s16     d7, q15
        vst1.8          {d4}, [r1,:64], r2
        vst1.8          {d5}, [r1,:64], r2
        vst1.8          {d6}, [r1,:64], r2
        vst1.8          {d7}, [r1,:64], r2
        bx              lr
endfunc
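@ ff_put_signed_pixels_clamped_neon: same arguments as above, but the
@ coefficients are clamped to signed 8 bits (vqmovn) and offset by 128
@ on the way out.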
function ff_put_signed_pixels_clamped_neon, export=1
        vmov.u8         d31, #128
        vld1.16         {d16-d17}, [r0,:128]!
        vqmovn.s16      d0, q8
        vld1.16         {d18-d19}, [r0,:128]!
        vqmovn.s16      d1, q9
        vld1.16         {d16-d17}, [r0,:128]!
        vqmovn.s16      d2, q8
        vld1.16         {d18-d19}, [r0,:128]!
        vadd.u8         d0, d0, d31
        vld1.16         {d20-d21}, [r0,:128]!
        vadd.u8         d1, d1, d31
        vld1.16         {d22-d23}, [r0,:128]!
        vadd.u8         d2, d2, d31
        vst1.8          {d0}, [r1,:64], r2
        vqmovn.s16      d3, q9
        vst1.8          {d1}, [r1,:64], r2
        vqmovn.s16      d4, q10
        vst1.8          {d2}, [r1,:64], r2
        vqmovn.s16      d5, q11
        vld1.16         {d24-d25}, [r0,:128]!
        vadd.u8         d3, d3, d31
        vld1.16         {d26-d27}, [r0,:128]!
        vadd.u8         d4, d4, d31
        vadd.u8         d5, d5, d31
        vst1.8          {d3}, [r1,:64], r2
        vqmovn.s16      d6, q12
        vst1.8          {d4}, [r1,:64], r2
        vqmovn.s16      d7, q13
        vst1.8          {d5}, [r1,:64], r2
        vadd.u8         d6, d6, d31
        vadd.u8         d7, d7, d31
        vst1.8          {d6}, [r1,:64], r2
        vst1.8          {d7}, [r1,:64], r2
        bx              lr
endfunc
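@ ff_add_pixels_clamped_neon: r0 = 8x8 block of 16-bit coefficients,
@ r1 = pixels to add to (also the destination, via r3), r2 = stride.
@ The block is added to the existing pixels and the sums are clamped
@ to the 0..255 range.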
function ff_add_pixels_clamped_neon, export=1
        mov             r3, r1
        vld1.8          {d16}, [r1,:64], r2
        vld1.16         {d0-d1}, [r0,:128]!
        vaddw.u8        q0, q0, d16
        vld1.8          {d17}, [r1,:64], r2
        vld1.16         {d2-d3}, [r0,:128]!
        vqmovun.s16     d0, q0
        vld1.8          {d18}, [r1,:64], r2
        vaddw.u8        q1, q1, d17
        vld1.16         {d4-d5}, [r0,:128]!
        vaddw.u8        q2, q2, d18
        vst1.8          {d0}, [r3,:64], r2
        vqmovun.s16     d2, q1
        vld1.8          {d19}, [r1,:64], r2
        vld1.16         {d6-d7}, [r0,:128]!
        vaddw.u8        q3, q3, d19
        vqmovun.s16     d4, q2
        vst1.8          {d2}, [r3,:64], r2
        vld1.8          {d16}, [r1,:64], r2
        vqmovun.s16     d6, q3
        vld1.16         {d0-d1}, [r0,:128]!
        vaddw.u8        q0, q0, d16
        vst1.8          {d4}, [r3,:64], r2
        vld1.8          {d17}, [r1,:64], r2
        vld1.16         {d2-d3}, [r0,:128]!
        vaddw.u8        q1, q1, d17
        vst1.8          {d6}, [r3,:64], r2
        vqmovun.s16     d0, q0
        vld1.8          {d18}, [r1,:64], r2
        vld1.16         {d4-d5}, [r0,:128]!
        vaddw.u8        q2, q2, d18
        vst1.8          {d0}, [r3,:64], r2
        vqmovun.s16     d2, q1
        vld1.8          {d19}, [r1,:64], r2
        vqmovun.s16     d4, q2
        vld1.16         {d6-d7}, [r0,:128]!
        vaddw.u8        q3, q3, d19
        vst1.8          {d2}, [r3,:64], r2
        vqmovun.s16     d6, q3
        vst1.8          {d4}, [r3,:64], r2
        vst1.8          {d6}, [r3,:64], r2
        bx              lr
endfunc
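@ ff_vector_fmul_neon: element-wise multiply of two float vectors.
@ r0 = dst, r1 = src0, r2 = src1, r3 = number of elements; the loop
@ structure assumes the length is a multiple of 8.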
function ff_vector_fmul_neon, export=1
        subs            r3, r3, #8
        vld1.32         {d0-d3}, [r1,:128]!
        vld1.32         {d4-d7}, [r2,:128]!
        vmul.f32        q8, q0, q2
        vmul.f32        q9, q1, q3
        beq             3f
        bics            ip, r3, #15
        beq             2f
1:      subs            ip, ip, #16
        vld1.32         {d0-d1}, [r1,:128]!
        vld1.32         {d4-d5}, [r2,:128]!
        vmul.f32        q10, q0, q2
        vld1.32         {d2-d3}, [r1,:128]!
        vld1.32         {d6-d7}, [r2,:128]!
        vmul.f32        q11, q1, q3
        vst1.32         {d16-d19},[r0,:128]!
        vld1.32         {d0-d1}, [r1,:128]!
        vld1.32         {d4-d5}, [r2,:128]!
        vmul.f32        q8, q0, q2
        vld1.32         {d2-d3}, [r1,:128]!
        vld1.32         {d6-d7}, [r2,:128]!
        vmul.f32        q9, q1, q3
        vst1.32         {d20-d23},[r0,:128]!
        bne             1b
        ands            r3, r3, #15
        beq             3f
2:      vld1.32         {d0-d1}, [r1,:128]!
        vld1.32         {d4-d5}, [r2,:128]!
        vst1.32         {d16-d17},[r0,:128]!
        vmul.f32        q8, q0, q2
        vld1.32         {d2-d3}, [r1,:128]!
        vld1.32         {d6-d7}, [r2,:128]!
        vst1.32         {d18-d19},[r0,:128]!
        vmul.f32        q9, q1, q3
3:      vst1.32         {d16-d19},[r0,:128]!
        bx              lr
endfunc
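@ ff_vector_fmul_window_neon: overlap-add windowing.  r0 = dst, r1 = src0,
@ r2 = src1, r3 = window; the length is passed on the stack and loaded
@ into lr.  src1, the second half of the window and the second half of the
@ destination are all traversed backwards with a negative stride.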
function ff_vector_fmul_window_neon, export=1
        push            {r4,r5,lr}
        ldr             lr, [sp, #12]
        sub             r2, r2, #8
        sub             r5, lr, #2
        add             r2, r2, r5, lsl #2
        add             r4, r3, r5, lsl #3
        add             ip, r0, r5, lsl #3
        mov             r5, #-16
        vld1.32         {d0,d1}, [r1,:128]!
        vld1.32         {d2,d3}, [r2,:128], r5
        vld1.32         {d4,d5}, [r3,:128]!
        vld1.32         {d6,d7}, [r4,:128], r5
1:      subs            lr, lr, #4
        vmul.f32        d22, d0, d4
        vrev64.32       q3, q3
        vmul.f32        d23, d1, d5
        vrev64.32       q1, q1
        vmul.f32        d20, d0, d7
        vmul.f32        d21, d1, d6
        beq             2f
        vmla.f32        d22, d3, d7
        vld1.32         {d0,d1}, [r1,:128]!
        vmla.f32        d23, d2, d6
        vld1.32         {d18,d19},[r2,:128], r5
        vmls.f32        d20, d3, d4
        vld1.32         {d24,d25},[r3,:128]!
        vmls.f32        d21, d2, d5
        vld1.32         {d6,d7}, [r4,:128], r5
        vmov            q1, q9
        vrev64.32       q11, q11
        vmov            q2, q12
        vswp            d22, d23
        vst1.32         {d20,d21},[r0,:128]!
        vst1.32         {d22,d23},[ip,:128], r5
        b               1b
2:      vmla.f32        d22, d3, d7
        vmla.f32        d23, d2, d6
        vmls.f32        d20, d3, d4
        vmls.f32        d21, d2, d5
        vrev64.32       q11, q11
        vswp            d22, d23
        vst1.32         {d20,d21},[r0,:128]!
        vst1.32         {d22,d23},[ip,:128], r5
        pop             {r4,r5,pc}
endfunc
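@ ff_vorbis_inverse_coupling_neon: Vorbis inverse channel coupling.
@ r0 = magnitude vector, r1 = angle vector, r2 = length.  The sign of each
@ angle value selects how magnitude and angle are recombined; the selection
@ is done branch-free with mask and sign-bit operations on the float data.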
#if CONFIG_VORBIS_DECODER
function ff_vorbis_inverse_coupling_neon, export=1
        vmov.i32        q10, #1<<31
        subs            r2, r2, #4
        mov             r3, r0
        mov             r12, r1
        beq             3f

        vld1.32         {d24-d25},[r1,:128]!
        vld1.32         {d22-d23},[r0,:128]!
        vcle.s32        q8, q12, #0
        vand            q9, q11, q10
        veor            q12, q12, q9
        vand            q2, q12, q8
        vbic            q3, q12, q8
        vadd.f32        q12, q11, q2
        vsub.f32        q11, q11, q3
1:      vld1.32         {d2-d3}, [r1,:128]!
        vld1.32         {d0-d1}, [r0,:128]!
        vcle.s32        q8, q1, #0
        vand            q9, q0, q10
        veor            q1, q1, q9
        vst1.32         {d24-d25},[r3, :128]!
        vst1.32         {d22-d23},[r12,:128]!
        vand            q2, q1, q8
        vbic            q3, q1, q8
        vadd.f32        q1, q0, q2
        vsub.f32        q0, q0, q3
        subs            r2, r2, #8
        ble             2f
        vld1.32         {d24-d25},[r1,:128]!
        vld1.32         {d22-d23},[r0,:128]!
        vcle.s32        q8, q12, #0
        vand            q9, q11, q10
        veor            q12, q12, q9
        vst1.32         {d2-d3}, [r3, :128]!
        vst1.32         {d0-d1}, [r12,:128]!
        vand            q2, q12, q8
        vbic            q3, q12, q8
        vadd.f32        q12, q11, q2
        vsub.f32        q11, q11, q3
        b               1b
2:      vst1.32         {d2-d3}, [r3, :128]!
        vst1.32         {d0-d1}, [r12,:128]!
        it              lt
        bxlt            lr
3:      vld1.32         {d2-d3}, [r1,:128]
        vld1.32         {d0-d1}, [r0,:128]
        vcle.s32        q8, q1, #0
        vand            q9, q0, q10
        veor            q1, q1, q9
        vand            q2, q1, q8
        vbic            q3, q1, q8
        vadd.f32        q1, q0, q2
        vsub.f32        q0, q0, q3
        vst1.32         {d2-d3}, [r0,:128]!
        vst1.32         {d0-d1}, [r1,:128]!
        bx              lr
endfunc
#endif
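@ ff_vector_fmul_scalar_neon: dst = src * scalar.  The VFP lines apply when
@ float arguments are passed in VFP registers (scalar in s0, length in r2);
@ the NOVFP lines apply for soft-float argument passing (scalar bits in r2,
@ length in r3).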
function ff_vector_fmul_scalar_neon, export=1
VFP     len     .req    r2
NOVFP   len     .req    r3
VFP     vdup.32         q8, d0[0]
NOVFP   vdup.32         q8, r2
        bics            r12, len, #15
        beq             3f
        vld1.32         {q0},[r1,:128]!
        vld1.32         {q1},[r1,:128]!
1:      vmul.f32        q0, q0, q8
        vld1.32         {q2},[r1,:128]!
        vmul.f32        q1, q1, q8
        vld1.32         {q3},[r1,:128]!
        vmul.f32        q2, q2, q8
        vst1.32         {q0},[r0,:128]!
        vmul.f32        q3, q3, q8
        vst1.32         {q1},[r0,:128]!
        subs            r12, r12, #16
        beq             2f
        vld1.32         {q0},[r1,:128]!
        vst1.32         {q2},[r0,:128]!
        vld1.32         {q1},[r1,:128]!
        vst1.32         {q3},[r0,:128]!
        b               1b
2:      vst1.32         {q2},[r0,:128]!
        vst1.32         {q3},[r0,:128]!
        ands            len, len, #15
        it              eq
        bxeq            lr
3:      vld1.32         {q0},[r1,:128]!
        vmul.f32        q0, q0, q8
        vst1.32         {q0},[r0,:128]!
        subs            len, len, #4
        bgt             3b
        bx              lr
        .unreq          len
endfunc
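@ ff_vector_fmac_scalar_neon: dst += src * scalar, with the same VFP/NOVFP
@ argument split as above.  "acc" aliases a scratch register that is pointed
@ at dst and used as a second read pointer for the accumulator values.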
function ff_vector_fmac_scalar_neon, export=1
VFP     len     .req    r2
VFP     acc     .req    r3
NOVFP   len     .req    r3
NOVFP   acc     .req    r2
VFP     vdup.32         q15, d0[0]
NOVFP   vdup.32         q15, r2
        bics            r12, len, #15
        mov             acc, r0
        beq             3f
        vld1.32         {q0},  [r1,:128]!
        vld1.32         {q8},  [acc,:128]!
        vld1.32         {q1},  [r1,:128]!
        vld1.32         {q9},  [acc,:128]!
1:      vmla.f32        q8, q0, q15
        vld1.32         {q2},  [r1,:128]!
        vld1.32         {q10}, [acc,:128]!
        vmla.f32        q9, q1, q15
        vld1.32         {q3},  [r1,:128]!
        vld1.32         {q11}, [acc,:128]!
        vmla.f32        q10, q2, q15
        vst1.32         {q8},  [r0,:128]!
        vmla.f32        q11, q3, q15
        vst1.32         {q9},  [r0,:128]!
        subs            r12, r12, #16
        beq             2f
        vld1.32         {q0},  [r1,:128]!
        vld1.32         {q8},  [acc,:128]!
        vst1.32         {q10}, [r0,:128]!
        vld1.32         {q1},  [r1,:128]!
        vld1.32         {q9},  [acc,:128]!
        vst1.32         {q11}, [r0,:128]!
        b               1b
2:      vst1.32         {q10}, [r0,:128]!
        vst1.32         {q11}, [r0,:128]!
        ands            len, len, #15
        it              eq
        bxeq            lr
3:      vld1.32         {q0},  [r1,:128]!
        vld1.32         {q8},  [acc,:128]!
        vmla.f32        q8, q0, q15
        vst1.32         {q8},  [r0,:128]!
        subs            len, len, #4
        bgt             3b
        bx              lr
        .unreq          len
endfunc
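@ ff_butterflies_float_neon: in-place butterfly of two float vectors.
@ r0 and r1 point to the vectors, r2 = length; per element the code stores
@ v0 + v1 back to the first vector and v0 - v1 to the second, four floats
@ per iteration.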
function ff_butterflies_float_neon, export=1
1:      vld1.32         {q0},[r0,:128]
        vld1.32         {q1},[r1,:128]
        vsub.f32        q2, q0, q1
        vadd.f32        q1, q0, q1
        vst1.32         {q2},[r1,:128]!
        vst1.32         {q1},[r0,:128]!
        subs            r2, r2, #4
        bgt             1b
        bx              lr
endfunc
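@ ff_scalarproduct_float_neon: dot product of two float vectors.
@ r0 and r1 point to the vectors, r2 = length.  The partial sums are reduced
@ into d0; the NOVFP line copies the result to r0 for soft-float callers.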
function ff_scalarproduct_float_neon, export=1
        vmov.f32        q2, #0.0
1:      vld1.32         {q0},[r0,:128]!
        vld1.32         {q1},[r1,:128]!
        vmla.f32        q2, q0, q1
        subs            r2, r2, #4
        bgt             1b
        vadd.f32        d0, d4, d5
        vpadd.f32       d0, d0, d0
NOVFP   vmov.32         r0, d0[0]
        bx              lr
endfunc
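@ ff_vector_fmul_reverse_neon: dst[i] = src0[i] * src1[len-1-i].
@ r0 = dst, r1 = src0, r2 = src1, r3 = length; src1 is walked backwards with
@ a negative stride and each quad is reversed with vrev64 and a swapped
@ d-register pairing.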
function ff_vector_fmul_reverse_neon, export=1
        add             r2, r2, r3, lsl #2
        sub             r2, r2, #32
        mov             r12, #-32
        vld1.32         {q0-q1}, [r1,:128]!
        vld1.32         {q2-q3}, [r2,:128], r12
1:      pld             [r1, #32]
        vrev64.32       q3, q3
        vmul.f32        d16, d0, d7
        vmul.f32        d17, d1, d6
        pld             [r2, #-32]
        vrev64.32       q2, q2
        vmul.f32        d18, d2, d5
        vmul.f32        d19, d3, d4
        subs            r3, r3, #8
        beq             2f
        vld1.32         {q0-q1}, [r1,:128]!
        vld1.32         {q2-q3}, [r2,:128], r12
        vst1.32         {q8-q9}, [r0,:128]!
        b               1b
2:      vst1.32         {q8-q9}, [r0,:128]!
        bx              lr
endfunc
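@ ff_vector_fmul_add_neon: dst = src0 * src1 + src2.
@ r0 = dst, r1 = src0, r2 = src1, r3 = src2, length on the stack (r12);
@ the loop processes eight floats per iteration.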
function ff_vector_fmul_add_neon, export=1
        ldr             r12, [sp]
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q8-q9},  [r2,:128]!
        vld1.32         {q2-q3},  [r3,:128]!
        vmul.f32        q10, q0, q8
        vmul.f32        q11, q1, q9
1:      vadd.f32        q12, q2, q10
        vadd.f32        q13, q3, q11
        pld             [r1, #16]
        pld             [r2, #16]
        pld             [r3, #16]
        subs            r12, r12, #8
        beq             2f
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [r2,:128]!
        vmul.f32        q10, q0, q8
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q9},     [r2,:128]!
        vmul.f32        q11, q1, q9
        vld1.32         {q2-q3},  [r3,:128]!
        vst1.32         {q12-q13},[r0,:128]!
        b               1b
2:      vst1.32         {q12-q13},[r0,:128]!
        bx              lr
endfunc
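@ ff_vector_clipf_neon: clamp each float in src to [min, max] and store to
@ dst.  With VFP argument passing min/max arrive in s0/s1 and the length in
@ r2; with NOVFP passing min is in r2, max in r3 and the length on the stack.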
function ff_vector_clipf_neon, export=1
VFP     vdup.32         q1, d0[1]
VFP     vdup.32         q0, d0[0]
NOVFP   vdup.32         q0, r2
NOVFP   vdup.32         q1, r3
NOVFP   ldr             r2, [sp]
        vld1.f32        {q2},[r1,:128]!
        vmin.f32        q10, q2, q1
        vld1.f32        {q3},[r1,:128]!
        vmin.f32        q11, q3, q1
1:      vmax.f32        q8, q10, q0
        vmax.f32        q9, q11, q0
        subs            r2, r2, #8
        beq             2f
        vld1.f32        {q2},[r1,:128]!
        vmin.f32        q10, q2, q1
        vld1.f32        {q3},[r1,:128]!
        vmin.f32        q11, q3, q1
        vst1.f32        {q8},[r0,:128]!
        vst1.f32        {q9},[r0,:128]!
        b               1b
2:      vst1.f32        {q8},[r0,:128]!
        vst1.f32        {q9},[r0,:128]!
        bx              lr
endfunc
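@ ff_apply_window_int16_neon: r0 = dst, r1 = src, r2 = window, r3 = length.
@ Each iteration windows eight samples from the front and eight from the
@ back of the buffer; the back half reuses the same window values in
@ reverse, so only the first half of a symmetric window is read.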
function ff_apply_window_int16_neon, export=1
        push            {r4,lr}
        add             r4, r1, r3, lsl #1
        add             lr, r0, r3, lsl #1
        sub             r4, r4, #16
        sub             lr, lr, #16
        mov             r12, #-16
1:
        vld1.16         {q0}, [r1,:128]!
        vld1.16         {q2}, [r2,:128]!
        vld1.16         {q1}, [r4,:128], r12
        vrev64.16       q3, q2
        vqrdmulh.s16    q0, q0, q2
        vqrdmulh.s16    d2, d2, d7
        vqrdmulh.s16    d3, d3, d6
        vst1.16         {q0}, [r0,:128]!
        vst1.16         {q1}, [lr,:128], r12
        subs            r3, r3, #16
        bgt             1b
        pop             {r4,pc}
endfunc
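@ ff_vector_clip_int32_neon: clamp 32-bit integers to [min, max].
@ r0 = dst, r1 = src, r2 = min, r3 = max, length on the stack; the loop
@ handles eight values per iteration.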
function ff_vector_clip_int32_neon, export=1
        vdup.32         q0, r2
        vdup.32         q1, r3
        ldr             r2, [sp]
1:
        vld1.32         {q2-q3}, [r1,:128]!
        vmin.s32        q2, q2, q1
        vmin.s32        q3, q3, q1
        vmax.s32        q2, q2, q0
        vmax.s32        q3, q3, q0
        vst1.32         {q2-q3}, [r0,:128]!
        subs            r2, r2, #8
        bgt             1b
        bx              lr
endfunc