You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

494 lines
16KB

  1. /*
  2. * Copyright (c) 2013 RISC OS Open Ltd
  3. * Author: Ben Avison <bavison@riscosopen.org>
  4. *
  5. * This file is part of Libav.
  6. *
  7. * Libav is free software; you can redistribute it and/or
  8. * modify it under the terms of the GNU Lesser General Public
  9. * License as published by the Free Software Foundation; either
  10. * version 2.1 of the License, or (at your option) any later version.
  11. *
  12. * Libav is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. * Lesser General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU Lesser General Public
  18. * License along with Libav; if not, write to the Free Software
  19. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. */
  21. #include "libavutil/arm/asm.S"
  22. POUT .req a1
  23. PIN .req a2
  24. PCOEF .req a3
  25. DECIFACTOR .req a4
  26. OLDFPSCR .req a4
  27. COUNTER .req ip
  28. SCALE32 .req s28 @ use vector of 4 in place of 9th scalar when decifactor=32 / JMAX=8
  29. SCALE64 .req s0 @ spare register in scalar bank when decifactor=64 / JMAX=4
  30. IN0 .req s4
  31. IN1 .req s5
  32. IN2 .req s6
  33. IN3 .req s7
  34. IN4 .req s0
  35. IN5 .req s1
  36. IN6 .req s2
  37. IN7 .req s3
  38. COEF0 .req s8 @ coefficient elements
  39. COEF1 .req s9
  40. COEF2 .req s10
  41. COEF3 .req s11
  42. COEF4 .req s12
  43. COEF5 .req s13
  44. COEF6 .req s14
  45. COEF7 .req s15
  46. ACCUM0 .req s16 @ double-buffered multiply-accumulate results
  47. ACCUM4 .req s20
  48. POST0 .req s24 @ do long-latency post-multiply in this vector in parallel
  49. POST1 .req s25
  50. POST2 .req s26
  51. POST3 .req s27
  52. .macro inner_loop decifactor, dir, tail, head
  53. .ifc "\dir","up"
  54. .set X, 0
  55. .set Y, 4
  56. .else
  57. .set X, 4*JMAX*4 - 4
  58. .set Y, -4
  59. .endif
  60. .ifnc "\head",""
  61. vldr COEF0, [PCOEF, #X + (0*JMAX + 0) * Y]
  62. vldr COEF1, [PCOEF, #X + (1*JMAX + 0) * Y]
  63. vldr COEF2, [PCOEF, #X + (2*JMAX + 0) * Y]
  64. vldr COEF3, [PCOEF, #X + (3*JMAX + 0) * Y]
  65. .endif
  66. .ifnc "\tail",""
  67. vadd.f POST0, ACCUM0, ACCUM4 @ vector operation
  68. .endif
  69. .ifnc "\head",""
  70. vmul.f ACCUM0, COEF0, IN0 @ vector = vector * scalar
  71. vldr COEF4, [PCOEF, #X + (0*JMAX + 1) * Y]
  72. vldr COEF5, [PCOEF, #X + (1*JMAX + 1) * Y]
  73. vldr COEF6, [PCOEF, #X + (2*JMAX + 1) * Y]
  74. .endif
  75. .ifnc "\tail",""
  76. vmul.f POST0, POST0, SCALE\decifactor @ vector operation (SCALE may be scalar)
  77. .endif
  78. .ifnc "\head",""
  79. vldr COEF7, [PCOEF, #X + (3*JMAX + 1) * Y]
  80. .ifc "\tail",""
  81. vmul.f ACCUM4, COEF4, IN1 @ vector operation
  82. .endif
  83. vldr COEF0, [PCOEF, #X + (0*JMAX + 2) * Y]
  84. vldr COEF1, [PCOEF, #X + (1*JMAX + 2) * Y]
  85. .ifnc "\tail",""
  86. vmul.f ACCUM4, COEF4, IN1 @ vector operation
  87. .endif
  88. vldr COEF2, [PCOEF, #X + (2*JMAX + 2) * Y]
  89. vldr COEF3, [PCOEF, #X + (3*JMAX + 2) * Y]
  90. .endif
  91. .ifnc "\tail",""
  92. vstmia POUT!, {POST0-POST3}
  93. .endif
  94. .ifnc "\head",""
  95. vmla.f ACCUM0, COEF0, IN2 @ vector = vector * scalar
  96. vldr COEF4, [PCOEF, #X + (0*JMAX + 3) * Y]
  97. vldr COEF5, [PCOEF, #X + (1*JMAX + 3) * Y]
  98. vldr COEF6, [PCOEF, #X + (2*JMAX + 3) * Y]
  99. vldr COEF7, [PCOEF, #X + (3*JMAX + 3) * Y]
  100. vmla.f ACCUM4, COEF4, IN3 @ vector = vector * scalar
  101. .if \decifactor == 32
  102. vldr COEF0, [PCOEF, #X + (0*JMAX + 4) * Y]
  103. vldr COEF1, [PCOEF, #X + (1*JMAX + 4) * Y]
  104. vldr COEF2, [PCOEF, #X + (2*JMAX + 4) * Y]
  105. vldr COEF3, [PCOEF, #X + (3*JMAX + 4) * Y]
  106. vmla.f ACCUM0, COEF0, IN4 @ vector = vector * scalar
  107. vldr COEF4, [PCOEF, #X + (0*JMAX + 5) * Y]
  108. vldr COEF5, [PCOEF, #X + (1*JMAX + 5) * Y]
  109. vldr COEF6, [PCOEF, #X + (2*JMAX + 5) * Y]
  110. vldr COEF7, [PCOEF, #X + (3*JMAX + 5) * Y]
  111. vmla.f ACCUM4, COEF4, IN5 @ vector = vector * scalar
  112. vldr COEF0, [PCOEF, #X + (0*JMAX + 6) * Y]
  113. vldr COEF1, [PCOEF, #X + (1*JMAX + 6) * Y]
  114. vldr COEF2, [PCOEF, #X + (2*JMAX + 6) * Y]
  115. vldr COEF3, [PCOEF, #X + (3*JMAX + 6) * Y]
  116. vmla.f ACCUM0, COEF0, IN6 @ vector = vector * scalar
  117. vldr COEF4, [PCOEF, #X + (0*JMAX + 7) * Y]
  118. vldr COEF5, [PCOEF, #X + (1*JMAX + 7) * Y]
  119. vldr COEF6, [PCOEF, #X + (2*JMAX + 7) * Y]
  120. vldr COEF7, [PCOEF, #X + (3*JMAX + 7) * Y]
  121. vmla.f ACCUM4, COEF4, IN7 @ vector = vector * scalar
  122. .endif
  123. .endif
  124. .endm
  125. .macro dca_lfe_fir decifactor
  126. .if \decifactor == 32
  127. .set JMAX, 8
  128. vpush {s16-s31}
  129. vmov SCALE32, s0 @ duplicate scalar across vector
  130. vldr IN4, [PIN, #-4*4]
  131. vldr IN5, [PIN, #-5*4]
  132. vldr IN6, [PIN, #-6*4]
  133. vldr IN7, [PIN, #-7*4]
  134. .else
  135. .set JMAX, 4
  136. vpush {s16-s27}
  137. .endif
  138. mov COUNTER, #\decifactor/4 - 1
  139. inner_loop \decifactor, up,, head
  140. 1: add PCOEF, PCOEF, #4*JMAX*4
  141. subs COUNTER, COUNTER, #1
  142. inner_loop \decifactor, up, tail, head
  143. bne 1b
  144. inner_loop \decifactor, up, tail
  145. mov COUNTER, #\decifactor/4 - 1
  146. inner_loop \decifactor, down,, head
  147. 1: sub PCOEF, PCOEF, #4*JMAX*4
  148. subs COUNTER, COUNTER, #1
  149. inner_loop \decifactor, down, tail, head
  150. bne 1b
  151. inner_loop \decifactor, down, tail
  152. .if \decifactor == 32
  153. vpop {s16-s31}
  154. .else
  155. vpop {s16-s27}
  156. .endif
  157. fmxr FPSCR, OLDFPSCR
  158. bx lr
  159. .endm
  160. /* void ff_dca_lfe_fir_vfp(float *out, const float *in, const float *coefs,
  161. * int decifactor, float scale)
  162. */
  163. function ff_dca_lfe_fir_vfp, export=1
  164. teq DECIFACTOR, #32
  165. fmrx OLDFPSCR, FPSCR
  166. ldr ip, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
  167. fmxr FPSCR, ip
  168. NOVFP vldr s0, [sp]
  169. vldr IN0, [PIN, #-0*4]
  170. vldr IN1, [PIN, #-1*4]
  171. vldr IN2, [PIN, #-2*4]
  172. vldr IN3, [PIN, #-3*4]
  173. beq 32f
  174. 64: dca_lfe_fir 64
  175. .ltorg
  176. 32: dca_lfe_fir 32
  177. endfunc
  178. .unreq POUT
  179. .unreq PIN
  180. .unreq PCOEF
  181. .unreq DECIFACTOR
  182. .unreq OLDFPSCR
  183. .unreq COUNTER
  184. .unreq SCALE32
  185. .unreq SCALE64
  186. .unreq IN0
  187. .unreq IN1
  188. .unreq IN2
  189. .unreq IN3
  190. .unreq IN4
  191. .unreq IN5
  192. .unreq IN6
  193. .unreq IN7
  194. .unreq COEF0
  195. .unreq COEF1
  196. .unreq COEF2
  197. .unreq COEF3
  198. .unreq COEF4
  199. .unreq COEF5
  200. .unreq COEF6
  201. .unreq COEF7
  202. .unreq ACCUM0
  203. .unreq ACCUM4
  204. .unreq POST0
  205. .unreq POST1
  206. .unreq POST2
  207. .unreq POST3
  208. IN .req a1
  209. SBACT .req a2
  210. OLDFPSCR .req a3
  211. IMDCT .req a4
  212. WINDOW .req v1
  213. OUT .req v2
  214. BUF .req v3
  215. SCALEINT .req v4 @ only used in softfp case
  216. COUNT .req v5
  217. SCALE .req s0
  218. /* Stack layout differs in softfp and hardfp cases:
  219. *
  220. * hardfp
  221. * fp -> 6 arg words saved by caller
  222. * a3,a4,v1-v3,v5,fp,lr on entry (a3 just to pad to 8 bytes)
  223. * s16-s23 on entry
  224. * align 16
  225. * buf -> 8*32*4 bytes buffer
  226. * s0 on entry
  227. * sp -> 3 arg words for callee
  228. *
  229. * softfp
  230. * fp -> 7 arg words saved by caller
  231. * a4,v1-v5,fp,lr on entry
  232. * s16-s23 on entry
  233. * align 16
  234. * buf -> 8*32*4 bytes buffer
  235. * sp -> 4 arg words for callee
  236. */
  237. /* void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
  238. * SynthFilterContext *synth, FFTContext *imdct,
  239. * float (*synth_buf_ptr)[512],
  240. * int *synth_buf_offset, float (*synth_buf2)[32],
  241. * const float (*window)[512], float *samples_out,
  242. * float (*raXin)[32], float scale);
  243. */
  244. function ff_dca_qmf_32_subbands_vfp, export=1
  245. VFP push {a3-a4,v1-v3,v5,fp,lr}
  246. NOVFP push {a4,v1-v5,fp,lr}
  247. add fp, sp, #8*4
  248. vpush {s16-s23}
  249. @ The buffer pointed at by raXin isn't big enough for us to do a
  250. @ complete matrix transposition as we want to, so allocate an
  251. @ alternative buffer from the stack. Align to 4 words for speed.
  252. sub BUF, sp, #8*32*4
  253. bic BUF, BUF, #15
  254. mov sp, BUF
  255. ldr lr, =0x03330000 @ RunFast mode, short vectors of length 4, stride 2
  256. fmrx OLDFPSCR, FPSCR
  257. fmxr FPSCR, lr
  258. @ COUNT is used to count down 2 things at once:
  259. @ bits 0-4 are the number of word pairs remaining in the output row
  260. @ bits 5-31 are the number of words to copy (with possible negation)
  261. @ from the source matrix before we start zeroing the remainder
  262. mov COUNT, #(-4 << 5) + 16
  263. adds COUNT, COUNT, SBACT, lsl #5
  264. bmi 2f
  265. 1:
  266. vldr s8, [IN, #(0*8+0)*4]
  267. vldr s10, [IN, #(0*8+1)*4]
  268. vldr s12, [IN, #(0*8+2)*4]
  269. vldr s14, [IN, #(0*8+3)*4]
  270. vldr s16, [IN, #(0*8+4)*4]
  271. vldr s18, [IN, #(0*8+5)*4]
  272. vldr s20, [IN, #(0*8+6)*4]
  273. vldr s22, [IN, #(0*8+7)*4]
  274. vneg.f s8, s8
  275. vldr s9, [IN, #(1*8+0)*4]
  276. vldr s11, [IN, #(1*8+1)*4]
  277. vldr s13, [IN, #(1*8+2)*4]
  278. vldr s15, [IN, #(1*8+3)*4]
  279. vneg.f s16, s16
  280. vldr s17, [IN, #(1*8+4)*4]
  281. vldr s19, [IN, #(1*8+5)*4]
  282. vldr s21, [IN, #(1*8+6)*4]
  283. vldr s23, [IN, #(1*8+7)*4]
  284. vstr d4, [BUF, #(0*32+0)*4]
  285. vstr d5, [BUF, #(1*32+0)*4]
  286. vstr d6, [BUF, #(2*32+0)*4]
  287. vstr d7, [BUF, #(3*32+0)*4]
  288. vstr d8, [BUF, #(4*32+0)*4]
  289. vstr d9, [BUF, #(5*32+0)*4]
  290. vstr d10, [BUF, #(6*32+0)*4]
  291. vstr d11, [BUF, #(7*32+0)*4]
  292. vldr s9, [IN, #(3*8+0)*4]
  293. vldr s11, [IN, #(3*8+1)*4]
  294. vldr s13, [IN, #(3*8+2)*4]
  295. vldr s15, [IN, #(3*8+3)*4]
  296. vldr s17, [IN, #(3*8+4)*4]
  297. vldr s19, [IN, #(3*8+5)*4]
  298. vldr s21, [IN, #(3*8+6)*4]
  299. vldr s23, [IN, #(3*8+7)*4]
  300. vneg.f s9, s9
  301. vldr s8, [IN, #(2*8+0)*4]
  302. vldr s10, [IN, #(2*8+1)*4]
  303. vldr s12, [IN, #(2*8+2)*4]
  304. vldr s14, [IN, #(2*8+3)*4]
  305. vneg.f s17, s17
  306. vldr s16, [IN, #(2*8+4)*4]
  307. vldr s18, [IN, #(2*8+5)*4]
  308. vldr s20, [IN, #(2*8+6)*4]
  309. vldr s22, [IN, #(2*8+7)*4]
  310. vstr d4, [BUF, #(0*32+2)*4]
  311. vstr d5, [BUF, #(1*32+2)*4]
  312. vstr d6, [BUF, #(2*32+2)*4]
  313. vstr d7, [BUF, #(3*32+2)*4]
  314. vstr d8, [BUF, #(4*32+2)*4]
  315. vstr d9, [BUF, #(5*32+2)*4]
  316. vstr d10, [BUF, #(6*32+2)*4]
  317. vstr d11, [BUF, #(7*32+2)*4]
  318. add IN, IN, #4*8*4
  319. add BUF, BUF, #4*4
  320. subs COUNT, COUNT, #(4 << 5) + 2
  321. bpl 1b
  322. 2: @ Now deal with trailing < 4 samples
  323. adds COUNT, COUNT, #3 << 5
  324. bmi 4f @ sb_act was a multiple of 4
  325. bics lr, COUNT, #0x1F
  326. bne 3f
  327. @ sb_act was n*4+1
  328. vldr s8, [IN, #(0*8+0)*4]
  329. vldr s10, [IN, #(0*8+1)*4]
  330. vldr s12, [IN, #(0*8+2)*4]
  331. vldr s14, [IN, #(0*8+3)*4]
  332. vldr s16, [IN, #(0*8+4)*4]
  333. vldr s18, [IN, #(0*8+5)*4]
  334. vldr s20, [IN, #(0*8+6)*4]
  335. vldr s22, [IN, #(0*8+7)*4]
  336. vneg.f s8, s8
  337. vldr s9, zero
  338. vldr s11, zero
  339. vldr s13, zero
  340. vldr s15, zero
  341. vneg.f s16, s16
  342. vldr s17, zero
  343. vldr s19, zero
  344. vldr s21, zero
  345. vldr s23, zero
  346. vstr d4, [BUF, #(0*32+0)*4]
  347. vstr d5, [BUF, #(1*32+0)*4]
  348. vstr d6, [BUF, #(2*32+0)*4]
  349. vstr d7, [BUF, #(3*32+0)*4]
  350. vstr d8, [BUF, #(4*32+0)*4]
  351. vstr d9, [BUF, #(5*32+0)*4]
  352. vstr d10, [BUF, #(6*32+0)*4]
  353. vstr d11, [BUF, #(7*32+0)*4]
  354. add BUF, BUF, #2*4
  355. sub COUNT, COUNT, #1
  356. b 4f
  357. 3: @ sb_act was n*4+2 or n*4+3, so do the first 2
  358. vldr s8, [IN, #(0*8+0)*4]
  359. vldr s10, [IN, #(0*8+1)*4]
  360. vldr s12, [IN, #(0*8+2)*4]
  361. vldr s14, [IN, #(0*8+3)*4]
  362. vldr s16, [IN, #(0*8+4)*4]
  363. vldr s18, [IN, #(0*8+5)*4]
  364. vldr s20, [IN, #(0*8+6)*4]
  365. vldr s22, [IN, #(0*8+7)*4]
  366. vneg.f s8, s8
  367. vldr s9, [IN, #(1*8+0)*4]
  368. vldr s11, [IN, #(1*8+1)*4]
  369. vldr s13, [IN, #(1*8+2)*4]
  370. vldr s15, [IN, #(1*8+3)*4]
  371. vneg.f s16, s16
  372. vldr s17, [IN, #(1*8+4)*4]
  373. vldr s19, [IN, #(1*8+5)*4]
  374. vldr s21, [IN, #(1*8+6)*4]
  375. vldr s23, [IN, #(1*8+7)*4]
  376. vstr d4, [BUF, #(0*32+0)*4]
  377. vstr d5, [BUF, #(1*32+0)*4]
  378. vstr d6, [BUF, #(2*32+0)*4]
  379. vstr d7, [BUF, #(3*32+0)*4]
  380. vstr d8, [BUF, #(4*32+0)*4]
  381. vstr d9, [BUF, #(5*32+0)*4]
  382. vstr d10, [BUF, #(6*32+0)*4]
  383. vstr d11, [BUF, #(7*32+0)*4]
  384. add BUF, BUF, #2*4
  385. sub COUNT, COUNT, #(2 << 5) + 1
  386. bics lr, COUNT, #0x1F
  387. bne 4f
  388. @ sb_act was n*4+3
  389. vldr s8, [IN, #(2*8+0)*4]
  390. vldr s10, [IN, #(2*8+1)*4]
  391. vldr s12, [IN, #(2*8+2)*4]
  392. vldr s14, [IN, #(2*8+3)*4]
  393. vldr s16, [IN, #(2*8+4)*4]
  394. vldr s18, [IN, #(2*8+5)*4]
  395. vldr s20, [IN, #(2*8+6)*4]
  396. vldr s22, [IN, #(2*8+7)*4]
  397. vldr s9, zero
  398. vldr s11, zero
  399. vldr s13, zero
  400. vldr s15, zero
  401. vldr s17, zero
  402. vldr s19, zero
  403. vldr s21, zero
  404. vldr s23, zero
  405. vstr d4, [BUF, #(0*32+0)*4]
  406. vstr d5, [BUF, #(1*32+0)*4]
  407. vstr d6, [BUF, #(2*32+0)*4]
  408. vstr d7, [BUF, #(3*32+0)*4]
  409. vstr d8, [BUF, #(4*32+0)*4]
  410. vstr d9, [BUF, #(5*32+0)*4]
  411. vstr d10, [BUF, #(6*32+0)*4]
  412. vstr d11, [BUF, #(7*32+0)*4]
  413. add BUF, BUF, #2*4
  414. sub COUNT, COUNT, #1
  415. 4: @ Now fill the remainder with 0
  416. vldr s8, zero
  417. vldr s9, zero
  418. ands COUNT, COUNT, #0x1F
  419. beq 6f
  420. 5: vstr d4, [BUF, #(0*32+0)*4]
  421. vstr d4, [BUF, #(1*32+0)*4]
  422. vstr d4, [BUF, #(2*32+0)*4]
  423. vstr d4, [BUF, #(3*32+0)*4]
  424. vstr d4, [BUF, #(4*32+0)*4]
  425. vstr d4, [BUF, #(5*32+0)*4]
  426. vstr d4, [BUF, #(6*32+0)*4]
  427. vstr d4, [BUF, #(7*32+0)*4]
  428. add BUF, BUF, #2*4
  429. subs COUNT, COUNT, #1
  430. bne 5b
  431. 6:
  432. fmxr FPSCR, OLDFPSCR
  433. ldr WINDOW, [fp, #3*4]
  434. ldr OUT, [fp, #4*4]
  435. sub BUF, BUF, #32*4
  436. NOVFP ldr SCALEINT, [fp, #6*4]
  437. mov COUNT, #8
  438. VFP vpush {SCALE}
  439. VFP sub sp, sp, #3*4
  440. NOVFP sub sp, sp, #4*4
  441. 7:
  442. VFP ldr a1, [fp, #-7*4] @ imdct
  443. NOVFP ldr a1, [fp, #-8*4]
  444. ldmia fp, {a2-a4}
  445. VFP stmia sp, {WINDOW, OUT, BUF}
  446. NOVFP stmia sp, {WINDOW, OUT, BUF, SCALEINT}
  447. VFP vldr SCALE, [sp, #3*4]
  448. bl X(ff_synth_filter_float_vfp)
  449. add OUT, OUT, #32*4
  450. add BUF, BUF, #32*4
  451. subs COUNT, COUNT, #1
  452. bne 7b
  453. A sub sp, fp, #(8+8)*4
  454. T sub fp, fp, #(8+8)*4
  455. T mov sp, fp
  456. vpop {s16-s23}
  457. VFP pop {a3-a4,v1-v3,v5,fp,pc}
  458. NOVFP pop {a4,v1-v5,fp,pc}
  459. endfunc
  460. .unreq IN
  461. .unreq SBACT
  462. .unreq OLDFPSCR
  463. .unreq IMDCT
  464. .unreq WINDOW
  465. .unreq OUT
  466. .unreq BUF
  467. .unreq SCALEINT
  468. .unreq COUNT
  469. .unreq SCALE
  470. .align 2
  471. zero: .word 0