/*
 * Copyright (c) 2013 RISC OS Open Ltd
 * Author: Ben Avison <bavison@riscosopen.org>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
  21. #include "libavutil/arm/asm.S"
  22. POUT .req a1
  23. PIN .req a2
  24. PCOEF .req a3
  25. OLDFPSCR .req a4
  26. COUNTER .req ip
  27. IN0 .req s4
  28. IN1 .req s5
  29. IN2 .req s6
  30. IN3 .req s7
  31. IN4 .req s0
  32. IN5 .req s1
  33. IN6 .req s2
  34. IN7 .req s3
  35. COEF0 .req s8 @ coefficient elements
  36. COEF1 .req s9
  37. COEF2 .req s10
  38. COEF3 .req s11
  39. COEF4 .req s12
  40. COEF5 .req s13
  41. COEF6 .req s14
  42. COEF7 .req s15
  43. ACCUM0 .req s16 @ double-buffered multiply-accumulate results
  44. ACCUM4 .req s20
  45. POST0 .req s24 @ do long-latency post-multiply in this vector in parallel
  46. POST1 .req s25
  47. POST2 .req s26
  48. POST3 .req s27
  49. .macro inner_loop decifactor, dir, tail, head
  50. .ifc "\dir","up"
  51. .set X, 0
  52. .set Y, 4
  53. .else
  54. .set X, 4*JMAX*4 - 4
  55. .set Y, -4
  56. .endif
  57. .ifnc "\head",""
  58. vldr COEF0, [PCOEF, #X + (0*JMAX + 0) * Y]
  59. vldr COEF1, [PCOEF, #X + (1*JMAX + 0) * Y]
  60. vldr COEF2, [PCOEF, #X + (2*JMAX + 0) * Y]
  61. vldr COEF3, [PCOEF, #X + (3*JMAX + 0) * Y]
  62. .endif
  63. .ifnc "\tail",""
  64. vadd.f POST0, ACCUM0, ACCUM4 @ vector operation
  65. .endif
  66. .ifnc "\head",""
  67. vmul.f ACCUM0, COEF0, IN0 @ vector = vector * scalar
  68. vldr COEF4, [PCOEF, #X + (0*JMAX + 1) * Y]
  69. vldr COEF5, [PCOEF, #X + (1*JMAX + 1) * Y]
  70. vldr COEF6, [PCOEF, #X + (2*JMAX + 1) * Y]
  71. .endif
  72. .ifnc "\head",""
  73. vldr COEF7, [PCOEF, #X + (3*JMAX + 1) * Y]
  74. .ifc "\tail",""
  75. vmul.f ACCUM4, COEF4, IN1 @ vector operation
  76. .endif
  77. vldr COEF0, [PCOEF, #X + (0*JMAX + 2) * Y]
  78. vldr COEF1, [PCOEF, #X + (1*JMAX + 2) * Y]
  79. .ifnc "\tail",""
  80. vmul.f ACCUM4, COEF4, IN1 @ vector operation
  81. .endif
  82. vldr COEF2, [PCOEF, #X + (2*JMAX + 2) * Y]
  83. vldr COEF3, [PCOEF, #X + (3*JMAX + 2) * Y]
  84. .endif
  85. .ifnc "\tail",""
  86. vstmia POUT!, {POST0-POST3}
  87. .endif
  88. .ifnc "\head",""
  89. vmla.f ACCUM0, COEF0, IN2 @ vector = vector * scalar
  90. vldr COEF4, [PCOEF, #X + (0*JMAX + 3) * Y]
  91. vldr COEF5, [PCOEF, #X + (1*JMAX + 3) * Y]
  92. vldr COEF6, [PCOEF, #X + (2*JMAX + 3) * Y]
  93. vldr COEF7, [PCOEF, #X + (3*JMAX + 3) * Y]
  94. vmla.f ACCUM4, COEF4, IN3 @ vector = vector * scalar
  95. .if \decifactor == 32
  96. vldr COEF0, [PCOEF, #X + (0*JMAX + 4) * Y]
  97. vldr COEF1, [PCOEF, #X + (1*JMAX + 4) * Y]
  98. vldr COEF2, [PCOEF, #X + (2*JMAX + 4) * Y]
  99. vldr COEF3, [PCOEF, #X + (3*JMAX + 4) * Y]
  100. vmla.f ACCUM0, COEF0, IN4 @ vector = vector * scalar
  101. vldr COEF4, [PCOEF, #X + (0*JMAX + 5) * Y]
  102. vldr COEF5, [PCOEF, #X + (1*JMAX + 5) * Y]
  103. vldr COEF6, [PCOEF, #X + (2*JMAX + 5) * Y]
  104. vldr COEF7, [PCOEF, #X + (3*JMAX + 5) * Y]
  105. vmla.f ACCUM4, COEF4, IN5 @ vector = vector * scalar
  106. vldr COEF0, [PCOEF, #X + (0*JMAX + 6) * Y]
  107. vldr COEF1, [PCOEF, #X + (1*JMAX + 6) * Y]
  108. vldr COEF2, [PCOEF, #X + (2*JMAX + 6) * Y]
  109. vldr COEF3, [PCOEF, #X + (3*JMAX + 6) * Y]
  110. vmla.f ACCUM0, COEF0, IN6 @ vector = vector * scalar
  111. vldr COEF4, [PCOEF, #X + (0*JMAX + 7) * Y]
  112. vldr COEF5, [PCOEF, #X + (1*JMAX + 7) * Y]
  113. vldr COEF6, [PCOEF, #X + (2*JMAX + 7) * Y]
  114. vldr COEF7, [PCOEF, #X + (3*JMAX + 7) * Y]
  115. vmla.f ACCUM4, COEF4, IN7 @ vector = vector * scalar
  116. .endif
  117. .endif
  118. .endm
  119. .macro dca_lfe_fir decifactor
  120. function ff_dca_lfe_fir\decifactor\()_vfp, export=1
  121. fmrx OLDFPSCR, FPSCR
  122. ldr ip, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
  123. fmxr FPSCR, ip
  124. vldr IN0, [PIN, #-0*4]
  125. vldr IN1, [PIN, #-1*4]
  126. vldr IN2, [PIN, #-2*4]
  127. vldr IN3, [PIN, #-3*4]
  128. .if \decifactor == 32
  129. .set JMAX, 8
  130. vpush {s16-s31}
  131. vldr IN4, [PIN, #-4*4]
  132. vldr IN5, [PIN, #-5*4]
  133. vldr IN6, [PIN, #-6*4]
  134. vldr IN7, [PIN, #-7*4]
  135. .else
  136. .set JMAX, 4
  137. vpush {s16-s27}
  138. .endif
  139. mov COUNTER, #\decifactor/4 - 1
  140. inner_loop \decifactor, up,, head
  141. 1: add PCOEF, PCOEF, #4*JMAX*4
  142. subs COUNTER, COUNTER, #1
  143. inner_loop \decifactor, up, tail, head
  144. bne 1b
  145. inner_loop \decifactor, up, tail
  146. mov COUNTER, #\decifactor/4 - 1
  147. inner_loop \decifactor, down,, head
  148. 1: sub PCOEF, PCOEF, #4*JMAX*4
  149. subs COUNTER, COUNTER, #1
  150. inner_loop \decifactor, down, tail, head
  151. bne 1b
  152. inner_loop \decifactor, down, tail
  153. .if \decifactor == 32
  154. vpop {s16-s31}
  155. .else
  156. vpop {s16-s27}
  157. .endif
  158. fmxr FPSCR, OLDFPSCR
  159. bx lr
  160. endfunc
  161. .endm
  162. dca_lfe_fir 64
  163. .ltorg
  164. dca_lfe_fir 32
  165. .unreq POUT
  166. .unreq PIN
  167. .unreq PCOEF
  168. .unreq OLDFPSCR
  169. .unreq COUNTER
  170. .unreq IN0
  171. .unreq IN1
  172. .unreq IN2
  173. .unreq IN3
  174. .unreq IN4
  175. .unreq IN5
  176. .unreq IN6
  177. .unreq IN7
  178. .unreq COEF0
  179. .unreq COEF1
  180. .unreq COEF2
  181. .unreq COEF3
  182. .unreq COEF4
  183. .unreq COEF5
  184. .unreq COEF6
  185. .unreq COEF7
  186. .unreq ACCUM0
  187. .unreq ACCUM4
  188. .unreq POST0
  189. .unreq POST1
  190. .unreq POST2
  191. .unreq POST3
  192. IN .req a1
  193. SBACT .req a2
  194. OLDFPSCR .req a3
  195. IMDCT .req a4
  196. WINDOW .req v1
  197. OUT .req v2
  198. BUF .req v3
  199. SCALEINT .req v4 @ only used in softfp case
  200. COUNT .req v5
  201. SCALE .req s0
  202. /* Stack layout differs in softfp and hardfp cases:
  203. *
  204. * hardfp
  205. * fp -> 6 arg words saved by caller
  206. * a3,a4,v1-v3,v5,fp,lr on entry (a3 just to pad to 8 bytes)
  207. * s16-s23 on entry
  208. * align 16
  209. * buf -> 8*32*4 bytes buffer
  210. * s0 on entry
  211. * sp -> 3 arg words for callee
  212. *
  213. * softfp
  214. * fp -> 7 arg words saved by caller
  215. * a4,v1-v5,fp,lr on entry
  216. * s16-s23 on entry
  217. * align 16
  218. * buf -> 8*32*4 bytes buffer
  219. * sp -> 4 arg words for callee
  220. */
  221. /* void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
  222. * SynthFilterContext *synth, FFTContext *imdct,
  223. * float (*synth_buf_ptr)[512],
  224. * int *synth_buf_offset, float (*synth_buf2)[32],
  225. * const float (*window)[512], float *samples_out,
  226. * float (*raXin)[32], float scale);
  227. */
  228. function ff_dca_qmf_32_subbands_vfp, export=1
  229. VFP push {a3-a4,v1-v3,v5,fp,lr}
  230. NOVFP push {a4,v1-v5,fp,lr}
  231. add fp, sp, #8*4
  232. vpush {s16-s23}
  233. @ The buffer pointed at by raXin isn't big enough for us to do a
  234. @ complete matrix transposition as we want to, so allocate an
  235. @ alternative buffer from the stack. Align to 4 words for speed.
  236. sub BUF, sp, #8*32*4
  237. bic BUF, BUF, #15
  238. mov sp, BUF
  239. ldr lr, =0x03330000 @ RunFast mode, short vectors of length 4, stride 2
  240. fmrx OLDFPSCR, FPSCR
  241. fmxr FPSCR, lr
  242. @ COUNT is used to count down 2 things at once:
  243. @ bits 0-4 are the number of word pairs remaining in the output row
  244. @ bits 5-31 are the number of words to copy (with possible negation)
  245. @ from the source matrix before we start zeroing the remainder
  246. mov COUNT, #(-4 << 5) + 16
  247. adds COUNT, COUNT, SBACT, lsl #5
  248. bmi 2f
  249. 1:
  250. vldr s8, [IN, #(0*8+0)*4]
  251. vldr s10, [IN, #(0*8+1)*4]
  252. vldr s12, [IN, #(0*8+2)*4]
  253. vldr s14, [IN, #(0*8+3)*4]
  254. vldr s16, [IN, #(0*8+4)*4]
  255. vldr s18, [IN, #(0*8+5)*4]
  256. vldr s20, [IN, #(0*8+6)*4]
  257. vldr s22, [IN, #(0*8+7)*4]
  258. vneg.f s8, s8
  259. vldr s9, [IN, #(1*8+0)*4]
  260. vldr s11, [IN, #(1*8+1)*4]
  261. vldr s13, [IN, #(1*8+2)*4]
  262. vldr s15, [IN, #(1*8+3)*4]
  263. vneg.f s16, s16
  264. vldr s17, [IN, #(1*8+4)*4]
  265. vldr s19, [IN, #(1*8+5)*4]
  266. vldr s21, [IN, #(1*8+6)*4]
  267. vldr s23, [IN, #(1*8+7)*4]
  268. vstr d4, [BUF, #(0*32+0)*4]
  269. vstr d5, [BUF, #(1*32+0)*4]
  270. vstr d6, [BUF, #(2*32+0)*4]
  271. vstr d7, [BUF, #(3*32+0)*4]
  272. vstr d8, [BUF, #(4*32+0)*4]
  273. vstr d9, [BUF, #(5*32+0)*4]
  274. vstr d10, [BUF, #(6*32+0)*4]
  275. vstr d11, [BUF, #(7*32+0)*4]
  276. vldr s9, [IN, #(3*8+0)*4]
  277. vldr s11, [IN, #(3*8+1)*4]
  278. vldr s13, [IN, #(3*8+2)*4]
  279. vldr s15, [IN, #(3*8+3)*4]
  280. vldr s17, [IN, #(3*8+4)*4]
  281. vldr s19, [IN, #(3*8+5)*4]
  282. vldr s21, [IN, #(3*8+6)*4]
  283. vldr s23, [IN, #(3*8+7)*4]
  284. vneg.f s9, s9
  285. vldr s8, [IN, #(2*8+0)*4]
  286. vldr s10, [IN, #(2*8+1)*4]
  287. vldr s12, [IN, #(2*8+2)*4]
  288. vldr s14, [IN, #(2*8+3)*4]
  289. vneg.f s17, s17
  290. vldr s16, [IN, #(2*8+4)*4]
  291. vldr s18, [IN, #(2*8+5)*4]
  292. vldr s20, [IN, #(2*8+6)*4]
  293. vldr s22, [IN, #(2*8+7)*4]
  294. vstr d4, [BUF, #(0*32+2)*4]
  295. vstr d5, [BUF, #(1*32+2)*4]
  296. vstr d6, [BUF, #(2*32+2)*4]
  297. vstr d7, [BUF, #(3*32+2)*4]
  298. vstr d8, [BUF, #(4*32+2)*4]
  299. vstr d9, [BUF, #(5*32+2)*4]
  300. vstr d10, [BUF, #(6*32+2)*4]
  301. vstr d11, [BUF, #(7*32+2)*4]
  302. add IN, IN, #4*8*4
  303. add BUF, BUF, #4*4
  304. subs COUNT, COUNT, #(4 << 5) + 2
  305. bpl 1b
  306. 2: @ Now deal with trailing < 4 samples
  307. adds COUNT, COUNT, #3 << 5
  308. bmi 4f @ sb_act was a multiple of 4
  309. bics lr, COUNT, #0x1F
  310. bne 3f
  311. @ sb_act was n*4+1
  312. vldr s8, [IN, #(0*8+0)*4]
  313. vldr s10, [IN, #(0*8+1)*4]
  314. vldr s12, [IN, #(0*8+2)*4]
  315. vldr s14, [IN, #(0*8+3)*4]
  316. vldr s16, [IN, #(0*8+4)*4]
  317. vldr s18, [IN, #(0*8+5)*4]
  318. vldr s20, [IN, #(0*8+6)*4]
  319. vldr s22, [IN, #(0*8+7)*4]
  320. vneg.f s8, s8
  321. vldr s9, zero
  322. vldr s11, zero
  323. vldr s13, zero
  324. vldr s15, zero
  325. vneg.f s16, s16
  326. vldr s17, zero
  327. vldr s19, zero
  328. vldr s21, zero
  329. vldr s23, zero
  330. vstr d4, [BUF, #(0*32+0)*4]
  331. vstr d5, [BUF, #(1*32+0)*4]
  332. vstr d6, [BUF, #(2*32+0)*4]
  333. vstr d7, [BUF, #(3*32+0)*4]
  334. vstr d8, [BUF, #(4*32+0)*4]
  335. vstr d9, [BUF, #(5*32+0)*4]
  336. vstr d10, [BUF, #(6*32+0)*4]
  337. vstr d11, [BUF, #(7*32+0)*4]
  338. add BUF, BUF, #2*4
  339. sub COUNT, COUNT, #1
  340. b 4f
  341. 3: @ sb_act was n*4+2 or n*4+3, so do the first 2
  342. vldr s8, [IN, #(0*8+0)*4]
  343. vldr s10, [IN, #(0*8+1)*4]
  344. vldr s12, [IN, #(0*8+2)*4]
  345. vldr s14, [IN, #(0*8+3)*4]
  346. vldr s16, [IN, #(0*8+4)*4]
  347. vldr s18, [IN, #(0*8+5)*4]
  348. vldr s20, [IN, #(0*8+6)*4]
  349. vldr s22, [IN, #(0*8+7)*4]
  350. vneg.f s8, s8
  351. vldr s9, [IN, #(1*8+0)*4]
  352. vldr s11, [IN, #(1*8+1)*4]
  353. vldr s13, [IN, #(1*8+2)*4]
  354. vldr s15, [IN, #(1*8+3)*4]
  355. vneg.f s16, s16
  356. vldr s17, [IN, #(1*8+4)*4]
  357. vldr s19, [IN, #(1*8+5)*4]
  358. vldr s21, [IN, #(1*8+6)*4]
  359. vldr s23, [IN, #(1*8+7)*4]
  360. vstr d4, [BUF, #(0*32+0)*4]
  361. vstr d5, [BUF, #(1*32+0)*4]
  362. vstr d6, [BUF, #(2*32+0)*4]
  363. vstr d7, [BUF, #(3*32+0)*4]
  364. vstr d8, [BUF, #(4*32+0)*4]
  365. vstr d9, [BUF, #(5*32+0)*4]
  366. vstr d10, [BUF, #(6*32+0)*4]
  367. vstr d11, [BUF, #(7*32+0)*4]
  368. add BUF, BUF, #2*4
  369. sub COUNT, COUNT, #(2 << 5) + 1
  370. bics lr, COUNT, #0x1F
  371. bne 4f
  372. @ sb_act was n*4+3
  373. vldr s8, [IN, #(2*8+0)*4]
  374. vldr s10, [IN, #(2*8+1)*4]
  375. vldr s12, [IN, #(2*8+2)*4]
  376. vldr s14, [IN, #(2*8+3)*4]
  377. vldr s16, [IN, #(2*8+4)*4]
  378. vldr s18, [IN, #(2*8+5)*4]
  379. vldr s20, [IN, #(2*8+6)*4]
  380. vldr s22, [IN, #(2*8+7)*4]
  381. vldr s9, zero
  382. vldr s11, zero
  383. vldr s13, zero
  384. vldr s15, zero
  385. vldr s17, zero
  386. vldr s19, zero
  387. vldr s21, zero
  388. vldr s23, zero
  389. vstr d4, [BUF, #(0*32+0)*4]
  390. vstr d5, [BUF, #(1*32+0)*4]
  391. vstr d6, [BUF, #(2*32+0)*4]
  392. vstr d7, [BUF, #(3*32+0)*4]
  393. vstr d8, [BUF, #(4*32+0)*4]
  394. vstr d9, [BUF, #(5*32+0)*4]
  395. vstr d10, [BUF, #(6*32+0)*4]
  396. vstr d11, [BUF, #(7*32+0)*4]
  397. add BUF, BUF, #2*4
  398. sub COUNT, COUNT, #1
  399. 4: @ Now fill the remainder with 0
  400. vldr s8, zero
  401. vldr s9, zero
  402. ands COUNT, COUNT, #0x1F
  403. beq 6f
  404. 5: vstr d4, [BUF, #(0*32+0)*4]
  405. vstr d4, [BUF, #(1*32+0)*4]
  406. vstr d4, [BUF, #(2*32+0)*4]
  407. vstr d4, [BUF, #(3*32+0)*4]
  408. vstr d4, [BUF, #(4*32+0)*4]
  409. vstr d4, [BUF, #(5*32+0)*4]
  410. vstr d4, [BUF, #(6*32+0)*4]
  411. vstr d4, [BUF, #(7*32+0)*4]
  412. add BUF, BUF, #2*4
  413. subs COUNT, COUNT, #1
  414. bne 5b
  415. 6:
  416. fmxr FPSCR, OLDFPSCR
  417. ldr WINDOW, [fp, #3*4]
  418. ldr OUT, [fp, #4*4]
  419. sub BUF, BUF, #32*4
  420. NOVFP ldr SCALEINT, [fp, #6*4]
  421. mov COUNT, #8
  422. VFP vpush {SCALE}
  423. VFP sub sp, sp, #3*4
  424. NOVFP sub sp, sp, #4*4
  425. 7:
  426. VFP ldr a1, [fp, #-7*4] @ imdct
  427. NOVFP ldr a1, [fp, #-8*4]
  428. ldmia fp, {a2-a4}
  429. VFP stmia sp, {WINDOW, OUT, BUF}
  430. NOVFP stmia sp, {WINDOW, OUT, BUF, SCALEINT}
  431. VFP vldr SCALE, [sp, #3*4]
  432. bl X(ff_synth_filter_float_vfp)
  433. add OUT, OUT, #32*4
  434. add BUF, BUF, #32*4
  435. subs COUNT, COUNT, #1
  436. bne 7b
  437. A sub sp, fp, #(8+8)*4
  438. T sub fp, fp, #(8+8)*4
  439. T mov sp, fp
  440. vpop {s16-s23}
  441. VFP pop {a3-a4,v1-v3,v5,fp,pc}
  442. NOVFP pop {a4,v1-v5,fp,pc}
  443. endfunc
  444. .unreq IN
  445. .unreq SBACT
  446. .unreq OLDFPSCR
  447. .unreq IMDCT
  448. .unreq WINDOW
  449. .unreq OUT
  450. .unreq BUF
  451. .unreq SCALEINT
  452. .unreq COUNT
  453. .unreq SCALE
  454. .align 2
  455. zero: .word 0