/*
 * Copyright (c) 2013 RISC OS Open Ltd
 * Author: Ben Avison <bavison@riscosopen.org>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
  21. #include "libavutil/arm/asm.S"
  22. POUT .req a1
  23. PIN .req a2
  24. PCOEF .req a3
  25. OLDFPSCR .req a4
  26. COUNTER .req ip
  27. SCALE32 .req s28 @ use vector of 4 in place of 9th scalar when decifactor=32 / JMAX=8
  28. SCALE64 .req s0 @ spare register in scalar bank when decifactor=64 / JMAX=4
  29. IN0 .req s4
  30. IN1 .req s5
  31. IN2 .req s6
  32. IN3 .req s7
  33. IN4 .req s0
  34. IN5 .req s1
  35. IN6 .req s2
  36. IN7 .req s3
  37. COEF0 .req s8 @ coefficient elements
  38. COEF1 .req s9
  39. COEF2 .req s10
  40. COEF3 .req s11
  41. COEF4 .req s12
  42. COEF5 .req s13
  43. COEF6 .req s14
  44. COEF7 .req s15
  45. ACCUM0 .req s16 @ double-buffered multiply-accumulate results
  46. ACCUM4 .req s20
  47. POST0 .req s24 @ do long-latency post-multiply in this vector in parallel
  48. POST1 .req s25
  49. POST2 .req s26
  50. POST3 .req s27
  51. .macro inner_loop decifactor, dir, tail, head
  52. .ifc "\dir","up"
  53. .set X, 0
  54. .set Y, 4
  55. .else
  56. .set X, 4*JMAX*4 - 4
  57. .set Y, -4
  58. .endif
  59. .ifnc "\head",""
  60. vldr COEF0, [PCOEF, #X + (0*JMAX + 0) * Y]
  61. vldr COEF1, [PCOEF, #X + (1*JMAX + 0) * Y]
  62. vldr COEF2, [PCOEF, #X + (2*JMAX + 0) * Y]
  63. vldr COEF3, [PCOEF, #X + (3*JMAX + 0) * Y]
  64. .endif
  65. .ifnc "\tail",""
  66. vadd.f POST0, ACCUM0, ACCUM4 @ vector operation
  67. .endif
  68. .ifnc "\head",""
  69. vmul.f ACCUM0, COEF0, IN0 @ vector = vector * scalar
  70. vldr COEF4, [PCOEF, #X + (0*JMAX + 1) * Y]
  71. vldr COEF5, [PCOEF, #X + (1*JMAX + 1) * Y]
  72. vldr COEF6, [PCOEF, #X + (2*JMAX + 1) * Y]
  73. .endif
  74. .ifnc "\tail",""
  75. vmul.f POST0, POST0, SCALE\decifactor @ vector operation (SCALE may be scalar)
  76. .endif
  77. .ifnc "\head",""
  78. vldr COEF7, [PCOEF, #X + (3*JMAX + 1) * Y]
  79. .ifc "\tail",""
  80. vmul.f ACCUM4, COEF4, IN1 @ vector operation
  81. .endif
  82. vldr COEF0, [PCOEF, #X + (0*JMAX + 2) * Y]
  83. vldr COEF1, [PCOEF, #X + (1*JMAX + 2) * Y]
  84. .ifnc "\tail",""
  85. vmul.f ACCUM4, COEF4, IN1 @ vector operation
  86. .endif
  87. vldr COEF2, [PCOEF, #X + (2*JMAX + 2) * Y]
  88. vldr COEF3, [PCOEF, #X + (3*JMAX + 2) * Y]
  89. .endif
  90. .ifnc "\tail",""
  91. vstmia POUT!, {POST0-POST3}
  92. .endif
  93. .ifnc "\head",""
  94. vmla.f ACCUM0, COEF0, IN2 @ vector = vector * scalar
  95. vldr COEF4, [PCOEF, #X + (0*JMAX + 3) * Y]
  96. vldr COEF5, [PCOEF, #X + (1*JMAX + 3) * Y]
  97. vldr COEF6, [PCOEF, #X + (2*JMAX + 3) * Y]
  98. vldr COEF7, [PCOEF, #X + (3*JMAX + 3) * Y]
  99. vmla.f ACCUM4, COEF4, IN3 @ vector = vector * scalar
  100. .if \decifactor == 32
  101. vldr COEF0, [PCOEF, #X + (0*JMAX + 4) * Y]
  102. vldr COEF1, [PCOEF, #X + (1*JMAX + 4) * Y]
  103. vldr COEF2, [PCOEF, #X + (2*JMAX + 4) * Y]
  104. vldr COEF3, [PCOEF, #X + (3*JMAX + 4) * Y]
  105. vmla.f ACCUM0, COEF0, IN4 @ vector = vector * scalar
  106. vldr COEF4, [PCOEF, #X + (0*JMAX + 5) * Y]
  107. vldr COEF5, [PCOEF, #X + (1*JMAX + 5) * Y]
  108. vldr COEF6, [PCOEF, #X + (2*JMAX + 5) * Y]
  109. vldr COEF7, [PCOEF, #X + (3*JMAX + 5) * Y]
  110. vmla.f ACCUM4, COEF4, IN5 @ vector = vector * scalar
  111. vldr COEF0, [PCOEF, #X + (0*JMAX + 6) * Y]
  112. vldr COEF1, [PCOEF, #X + (1*JMAX + 6) * Y]
  113. vldr COEF2, [PCOEF, #X + (2*JMAX + 6) * Y]
  114. vldr COEF3, [PCOEF, #X + (3*JMAX + 6) * Y]
  115. vmla.f ACCUM0, COEF0, IN6 @ vector = vector * scalar
  116. vldr COEF4, [PCOEF, #X + (0*JMAX + 7) * Y]
  117. vldr COEF5, [PCOEF, #X + (1*JMAX + 7) * Y]
  118. vldr COEF6, [PCOEF, #X + (2*JMAX + 7) * Y]
  119. vldr COEF7, [PCOEF, #X + (3*JMAX + 7) * Y]
  120. vmla.f ACCUM4, COEF4, IN7 @ vector = vector * scalar
  121. .endif
  122. .endif
  123. .endm
  124. .macro dca_lfe_fir decifactor
  125. function ff_dca_lfe_fir\decifactor\()_vfp, export=1
  126. NOVFP vmov s0, r3
  127. fmrx OLDFPSCR, FPSCR
  128. ldr ip, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
  129. fmxr FPSCR, ip
  130. vldr IN0, [PIN, #-0*4]
  131. vldr IN1, [PIN, #-1*4]
  132. vldr IN2, [PIN, #-2*4]
  133. vldr IN3, [PIN, #-3*4]
  134. .if \decifactor == 32
  135. .set JMAX, 8
  136. vpush {s16-s31}
  137. vmov SCALE32, s0 @ duplicate scalar across vector
  138. vldr IN4, [PIN, #-4*4]
  139. vldr IN5, [PIN, #-5*4]
  140. vldr IN6, [PIN, #-6*4]
  141. vldr IN7, [PIN, #-7*4]
  142. .else
  143. .set JMAX, 4
  144. vpush {s16-s27}
  145. .endif
  146. mov COUNTER, #\decifactor/4 - 1
  147. inner_loop \decifactor, up,, head
  148. 1: add PCOEF, PCOEF, #4*JMAX*4
  149. subs COUNTER, COUNTER, #1
  150. inner_loop \decifactor, up, tail, head
  151. bne 1b
  152. inner_loop \decifactor, up, tail
  153. mov COUNTER, #\decifactor/4 - 1
  154. inner_loop \decifactor, down,, head
  155. 1: sub PCOEF, PCOEF, #4*JMAX*4
  156. subs COUNTER, COUNTER, #1
  157. inner_loop \decifactor, down, tail, head
  158. bne 1b
  159. inner_loop \decifactor, down, tail
  160. .if \decifactor == 32
  161. vpop {s16-s31}
  162. .else
  163. vpop {s16-s27}
  164. .endif
  165. fmxr FPSCR, OLDFPSCR
  166. bx lr
  167. endfunc
  168. .endm
  169. dca_lfe_fir 64
  170. .ltorg
  171. dca_lfe_fir 32
  172. .unreq POUT
  173. .unreq PIN
  174. .unreq PCOEF
  175. .unreq OLDFPSCR
  176. .unreq COUNTER
  177. .unreq SCALE32
  178. .unreq SCALE64
  179. .unreq IN0
  180. .unreq IN1
  181. .unreq IN2
  182. .unreq IN3
  183. .unreq IN4
  184. .unreq IN5
  185. .unreq IN6
  186. .unreq IN7
  187. .unreq COEF0
  188. .unreq COEF1
  189. .unreq COEF2
  190. .unreq COEF3
  191. .unreq COEF4
  192. .unreq COEF5
  193. .unreq COEF6
  194. .unreq COEF7
  195. .unreq ACCUM0
  196. .unreq ACCUM4
  197. .unreq POST0
  198. .unreq POST1
  199. .unreq POST2
  200. .unreq POST3
  201. IN .req a1
  202. SBACT .req a2
  203. OLDFPSCR .req a3
  204. IMDCT .req a4
  205. WINDOW .req v1
  206. OUT .req v2
  207. BUF .req v3
  208. SCALEINT .req v4 @ only used in softfp case
  209. COUNT .req v5
  210. SCALE .req s0
/* Stack layout differs in softfp and hardfp cases:
 *
 * hardfp
 *      fp -> 6 arg words saved by caller
 *            a3,a4,v1-v3,v5,fp,lr on entry (a3 just to pad to 8 bytes)
 *            s16-s23 on entry
 *            align 16
 *     buf -> 8*32*4 bytes buffer
 *            s0 on entry
 *      sp -> 3 arg words for callee
 *
 * softfp
 *      fp -> 7 arg words saved by caller
 *            a4,v1-v5,fp,lr on entry
 *            s16-s23 on entry
 *            align 16
 *     buf -> 8*32*4 bytes buffer
 *      sp -> 4 arg words for callee
 */

/* void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
 *                                 SynthFilterContext *synth, FFTContext *imdct,
 *                                 float (*synth_buf_ptr)[512],
 *                                 int *synth_buf_offset, float (*synth_buf2)[32],
 *                                 const float (*window)[512], float *samples_out,
 *                                 float (*raXin)[32], float scale);
 */
  237. function ff_dca_qmf_32_subbands_vfp, export=1
  238. VFP push {a3-a4,v1-v3,v5,fp,lr}
  239. NOVFP push {a4,v1-v5,fp,lr}
  240. add fp, sp, #8*4
  241. vpush {s16-s23}
  242. @ The buffer pointed at by raXin isn't big enough for us to do a
  243. @ complete matrix transposition as we want to, so allocate an
  244. @ alternative buffer from the stack. Align to 4 words for speed.
  245. sub BUF, sp, #8*32*4
  246. bic BUF, BUF, #15
  247. mov sp, BUF
  248. ldr lr, =0x03330000 @ RunFast mode, short vectors of length 4, stride 2
  249. fmrx OLDFPSCR, FPSCR
  250. fmxr FPSCR, lr
  251. @ COUNT is used to count down 2 things at once:
  252. @ bits 0-4 are the number of word pairs remaining in the output row
  253. @ bits 5-31 are the number of words to copy (with possible negation)
  254. @ from the source matrix before we start zeroing the remainder
  255. mov COUNT, #(-4 << 5) + 16
  256. adds COUNT, COUNT, SBACT, lsl #5
  257. bmi 2f
  258. 1:
  259. vldr s8, [IN, #(0*8+0)*4]
  260. vldr s10, [IN, #(0*8+1)*4]
  261. vldr s12, [IN, #(0*8+2)*4]
  262. vldr s14, [IN, #(0*8+3)*4]
  263. vldr s16, [IN, #(0*8+4)*4]
  264. vldr s18, [IN, #(0*8+5)*4]
  265. vldr s20, [IN, #(0*8+6)*4]
  266. vldr s22, [IN, #(0*8+7)*4]
  267. vneg.f s8, s8
  268. vldr s9, [IN, #(1*8+0)*4]
  269. vldr s11, [IN, #(1*8+1)*4]
  270. vldr s13, [IN, #(1*8+2)*4]
  271. vldr s15, [IN, #(1*8+3)*4]
  272. vneg.f s16, s16
  273. vldr s17, [IN, #(1*8+4)*4]
  274. vldr s19, [IN, #(1*8+5)*4]
  275. vldr s21, [IN, #(1*8+6)*4]
  276. vldr s23, [IN, #(1*8+7)*4]
  277. vstr d4, [BUF, #(0*32+0)*4]
  278. vstr d5, [BUF, #(1*32+0)*4]
  279. vstr d6, [BUF, #(2*32+0)*4]
  280. vstr d7, [BUF, #(3*32+0)*4]
  281. vstr d8, [BUF, #(4*32+0)*4]
  282. vstr d9, [BUF, #(5*32+0)*4]
  283. vstr d10, [BUF, #(6*32+0)*4]
  284. vstr d11, [BUF, #(7*32+0)*4]
  285. vldr s9, [IN, #(3*8+0)*4]
  286. vldr s11, [IN, #(3*8+1)*4]
  287. vldr s13, [IN, #(3*8+2)*4]
  288. vldr s15, [IN, #(3*8+3)*4]
  289. vldr s17, [IN, #(3*8+4)*4]
  290. vldr s19, [IN, #(3*8+5)*4]
  291. vldr s21, [IN, #(3*8+6)*4]
  292. vldr s23, [IN, #(3*8+7)*4]
  293. vneg.f s9, s9
  294. vldr s8, [IN, #(2*8+0)*4]
  295. vldr s10, [IN, #(2*8+1)*4]
  296. vldr s12, [IN, #(2*8+2)*4]
  297. vldr s14, [IN, #(2*8+3)*4]
  298. vneg.f s17, s17
  299. vldr s16, [IN, #(2*8+4)*4]
  300. vldr s18, [IN, #(2*8+5)*4]
  301. vldr s20, [IN, #(2*8+6)*4]
  302. vldr s22, [IN, #(2*8+7)*4]
  303. vstr d4, [BUF, #(0*32+2)*4]
  304. vstr d5, [BUF, #(1*32+2)*4]
  305. vstr d6, [BUF, #(2*32+2)*4]
  306. vstr d7, [BUF, #(3*32+2)*4]
  307. vstr d8, [BUF, #(4*32+2)*4]
  308. vstr d9, [BUF, #(5*32+2)*4]
  309. vstr d10, [BUF, #(6*32+2)*4]
  310. vstr d11, [BUF, #(7*32+2)*4]
  311. add IN, IN, #4*8*4
  312. add BUF, BUF, #4*4
  313. subs COUNT, COUNT, #(4 << 5) + 2
  314. bpl 1b
  315. 2: @ Now deal with trailing < 4 samples
  316. adds COUNT, COUNT, #3 << 5
  317. bmi 4f @ sb_act was a multiple of 4
  318. bics lr, COUNT, #0x1F
  319. bne 3f
  320. @ sb_act was n*4+1
  321. vldr s8, [IN, #(0*8+0)*4]
  322. vldr s10, [IN, #(0*8+1)*4]
  323. vldr s12, [IN, #(0*8+2)*4]
  324. vldr s14, [IN, #(0*8+3)*4]
  325. vldr s16, [IN, #(0*8+4)*4]
  326. vldr s18, [IN, #(0*8+5)*4]
  327. vldr s20, [IN, #(0*8+6)*4]
  328. vldr s22, [IN, #(0*8+7)*4]
  329. vneg.f s8, s8
  330. vldr s9, zero
  331. vldr s11, zero
  332. vldr s13, zero
  333. vldr s15, zero
  334. vneg.f s16, s16
  335. vldr s17, zero
  336. vldr s19, zero
  337. vldr s21, zero
  338. vldr s23, zero
  339. vstr d4, [BUF, #(0*32+0)*4]
  340. vstr d5, [BUF, #(1*32+0)*4]
  341. vstr d6, [BUF, #(2*32+0)*4]
  342. vstr d7, [BUF, #(3*32+0)*4]
  343. vstr d8, [BUF, #(4*32+0)*4]
  344. vstr d9, [BUF, #(5*32+0)*4]
  345. vstr d10, [BUF, #(6*32+0)*4]
  346. vstr d11, [BUF, #(7*32+0)*4]
  347. add BUF, BUF, #2*4
  348. sub COUNT, COUNT, #1
  349. b 4f
  350. 3: @ sb_act was n*4+2 or n*4+3, so do the first 2
  351. vldr s8, [IN, #(0*8+0)*4]
  352. vldr s10, [IN, #(0*8+1)*4]
  353. vldr s12, [IN, #(0*8+2)*4]
  354. vldr s14, [IN, #(0*8+3)*4]
  355. vldr s16, [IN, #(0*8+4)*4]
  356. vldr s18, [IN, #(0*8+5)*4]
  357. vldr s20, [IN, #(0*8+6)*4]
  358. vldr s22, [IN, #(0*8+7)*4]
  359. vneg.f s8, s8
  360. vldr s9, [IN, #(1*8+0)*4]
  361. vldr s11, [IN, #(1*8+1)*4]
  362. vldr s13, [IN, #(1*8+2)*4]
  363. vldr s15, [IN, #(1*8+3)*4]
  364. vneg.f s16, s16
  365. vldr s17, [IN, #(1*8+4)*4]
  366. vldr s19, [IN, #(1*8+5)*4]
  367. vldr s21, [IN, #(1*8+6)*4]
  368. vldr s23, [IN, #(1*8+7)*4]
  369. vstr d4, [BUF, #(0*32+0)*4]
  370. vstr d5, [BUF, #(1*32+0)*4]
  371. vstr d6, [BUF, #(2*32+0)*4]
  372. vstr d7, [BUF, #(3*32+0)*4]
  373. vstr d8, [BUF, #(4*32+0)*4]
  374. vstr d9, [BUF, #(5*32+0)*4]
  375. vstr d10, [BUF, #(6*32+0)*4]
  376. vstr d11, [BUF, #(7*32+0)*4]
  377. add BUF, BUF, #2*4
  378. sub COUNT, COUNT, #(2 << 5) + 1
  379. bics lr, COUNT, #0x1F
  380. bne 4f
  381. @ sb_act was n*4+3
  382. vldr s8, [IN, #(2*8+0)*4]
  383. vldr s10, [IN, #(2*8+1)*4]
  384. vldr s12, [IN, #(2*8+2)*4]
  385. vldr s14, [IN, #(2*8+3)*4]
  386. vldr s16, [IN, #(2*8+4)*4]
  387. vldr s18, [IN, #(2*8+5)*4]
  388. vldr s20, [IN, #(2*8+6)*4]
  389. vldr s22, [IN, #(2*8+7)*4]
  390. vldr s9, zero
  391. vldr s11, zero
  392. vldr s13, zero
  393. vldr s15, zero
  394. vldr s17, zero
  395. vldr s19, zero
  396. vldr s21, zero
  397. vldr s23, zero
  398. vstr d4, [BUF, #(0*32+0)*4]
  399. vstr d5, [BUF, #(1*32+0)*4]
  400. vstr d6, [BUF, #(2*32+0)*4]
  401. vstr d7, [BUF, #(3*32+0)*4]
  402. vstr d8, [BUF, #(4*32+0)*4]
  403. vstr d9, [BUF, #(5*32+0)*4]
  404. vstr d10, [BUF, #(6*32+0)*4]
  405. vstr d11, [BUF, #(7*32+0)*4]
  406. add BUF, BUF, #2*4
  407. sub COUNT, COUNT, #1
  408. 4: @ Now fill the remainder with 0
  409. vldr s8, zero
  410. vldr s9, zero
  411. ands COUNT, COUNT, #0x1F
  412. beq 6f
  413. 5: vstr d4, [BUF, #(0*32+0)*4]
  414. vstr d4, [BUF, #(1*32+0)*4]
  415. vstr d4, [BUF, #(2*32+0)*4]
  416. vstr d4, [BUF, #(3*32+0)*4]
  417. vstr d4, [BUF, #(4*32+0)*4]
  418. vstr d4, [BUF, #(5*32+0)*4]
  419. vstr d4, [BUF, #(6*32+0)*4]
  420. vstr d4, [BUF, #(7*32+0)*4]
  421. add BUF, BUF, #2*4
  422. subs COUNT, COUNT, #1
  423. bne 5b
  424. 6:
  425. fmxr FPSCR, OLDFPSCR
  426. ldr WINDOW, [fp, #3*4]
  427. ldr OUT, [fp, #4*4]
  428. sub BUF, BUF, #32*4
  429. NOVFP ldr SCALEINT, [fp, #6*4]
  430. mov COUNT, #8
  431. VFP vpush {SCALE}
  432. VFP sub sp, sp, #3*4
  433. NOVFP sub sp, sp, #4*4
  434. 7:
  435. VFP ldr a1, [fp, #-7*4] @ imdct
  436. NOVFP ldr a1, [fp, #-8*4]
  437. ldmia fp, {a2-a4}
  438. VFP stmia sp, {WINDOW, OUT, BUF}
  439. NOVFP stmia sp, {WINDOW, OUT, BUF, SCALEINT}
  440. VFP vldr SCALE, [sp, #3*4]
  441. bl X(ff_synth_filter_float_vfp)
  442. add OUT, OUT, #32*4
  443. add BUF, BUF, #32*4
  444. subs COUNT, COUNT, #1
  445. bne 7b
  446. A sub sp, fp, #(8+8)*4
  447. T sub fp, fp, #(8+8)*4
  448. T mov sp, fp
  449. vpop {s16-s23}
  450. VFP pop {a3-a4,v1-v3,v5,fp,pc}
  451. NOVFP pop {a4,v1-v5,fp,pc}
  452. endfunc
  453. .unreq IN
  454. .unreq SBACT
  455. .unreq OLDFPSCR
  456. .unreq IMDCT
  457. .unreq WINDOW
  458. .unreq OUT
  459. .unreq BUF
  460. .unreq SCALEINT
  461. .unreq COUNT
  462. .unreq SCALE
  463. .align 2
  464. zero: .word 0