You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

222 lines
7.3KB

  1. /*
  2. * Copyright (c) 2013 RISC OS Open Ltd <bavison@riscosopen.org>
  3. *
  4. * This file is part of Libav.
  5. *
  6. * Libav is free software; you can redistribute it and/or
  7. * modify it under the terms of the GNU Lesser General Public
  8. * License as published by the Free Software Foundation; either
  9. * version 2.1 of the License, or (at your option) any later version.
  10. *
  11. * Libav is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. * Lesser General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Lesser General Public
  17. * License along with Libav; if not, write to the Free Software
  18. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19. */
  20. #include "config.h"
  21. #include "libavutil/arm/asm.S"
  22. /**
  23. * ARM VFP optimised int32 to float conversion.
  24. * Assume len is a multiple of 8, destination buffer is at least 4 bytes aligned
  25. * (16 bytes alignment is best for BCM2835), little-endian.
  26. */
  27. @ void ff_int32_to_float_fmul_array8_vfp(FmtConvertContext *c, float *dst, const int32_t *src, const float *mul, int len)
  28. function ff_int32_to_float_fmul_array8_vfp, export=1
  29. push {lr}
  30. ldr a1, [sp, #4]
  31. subs lr, a1, #3*8
  32. bcc 50f @ too short to pipeline
  33. @ Now need to find (len / 8) % 3. The approximation
  34. @ x / 24 = (x * 0xAB) >> 12
  35. @ is good for x < 4096, which is true for both AC3 and DCA.
  36. mov a1, #0xAB
  37. ldr ip, =0x03070000 @ RunFast mode, short vectors of length 8, stride 1
  38. mul a1, lr, a1
  39. vpush {s16-s31}
  40. mov a1, a1, lsr #12
  41. add a1, a1, a1, lsl #1
  42. rsb a1, a1, lr, lsr #3
  43. cmp a1, #1
  44. fmrx a1, FPSCR
  45. fmxr FPSCR, ip
  46. beq 11f
  47. blo 10f
  48. @ Array is (2 + multiple of 3) x 8 floats long
  49. @ drop through...
  50. vldmia a3!, {s16-s23}
  51. vldmia a4!, {s2,s3}
  52. vldmia a3!, {s24-s31}
  53. vcvt.f32.s32 s16, s16
  54. vcvt.f32.s32 s17, s17
  55. vcvt.f32.s32 s18, s18
  56. vcvt.f32.s32 s19, s19
  57. vcvt.f32.s32 s20, s20
  58. vcvt.f32.s32 s21, s21
  59. vcvt.f32.s32 s22, s22
  60. vcvt.f32.s32 s23, s23
  61. vmul.f32 s16, s16, s2
  62. @ drop through...
  63. 3:
  64. vldmia a3!, {s8-s15}
  65. vldmia a4!, {s1}
  66. vcvt.f32.s32 s24, s24
  67. vcvt.f32.s32 s25, s25
  68. vcvt.f32.s32 s26, s26
  69. vcvt.f32.s32 s27, s27
  70. vcvt.f32.s32 s28, s28
  71. vcvt.f32.s32 s29, s29
  72. vcvt.f32.s32 s30, s30
  73. vcvt.f32.s32 s31, s31
  74. vmul.f32 s24, s24, s3
  75. vstmia a2!, {s16-s19}
  76. vstmia a2!, {s20-s23}
  77. 2:
  78. vldmia a3!, {s16-s23}
  79. vldmia a4!, {s2}
  80. vcvt.f32.s32 s8, s8
  81. vcvt.f32.s32 s9, s9
  82. vcvt.f32.s32 s10, s10
  83. vcvt.f32.s32 s11, s11
  84. vcvt.f32.s32 s12, s12
  85. vcvt.f32.s32 s13, s13
  86. vcvt.f32.s32 s14, s14
  87. vcvt.f32.s32 s15, s15
  88. vmul.f32 s8, s8, s1
  89. vstmia a2!, {s24-s27}
  90. vstmia a2!, {s28-s31}
  91. 1:
  92. vldmia a3!, {s24-s31}
  93. vldmia a4!, {s3}
  94. vcvt.f32.s32 s16, s16
  95. vcvt.f32.s32 s17, s17
  96. vcvt.f32.s32 s18, s18
  97. vcvt.f32.s32 s19, s19
  98. vcvt.f32.s32 s20, s20
  99. vcvt.f32.s32 s21, s21
  100. vcvt.f32.s32 s22, s22
  101. vcvt.f32.s32 s23, s23
  102. vmul.f32 s16, s16, s2
  103. vstmia a2!, {s8-s11}
  104. vstmia a2!, {s12-s15}
  105. subs lr, lr, #8*3
  106. bpl 3b
  107. vcvt.f32.s32 s24, s24
  108. vcvt.f32.s32 s25, s25
  109. vcvt.f32.s32 s26, s26
  110. vcvt.f32.s32 s27, s27
  111. vcvt.f32.s32 s28, s28
  112. vcvt.f32.s32 s29, s29
  113. vcvt.f32.s32 s30, s30
  114. vcvt.f32.s32 s31, s31
  115. vmul.f32 s24, s24, s3
  116. vstmia a2!, {s16-s19}
  117. vstmia a2!, {s20-s23}
  118. vstmia a2!, {s24-s27}
  119. vstmia a2!, {s28-s31}
  120. fmxr FPSCR, a1
  121. vpop {s16-s31}
  122. pop {pc}
  123. 10: @ Array is (multiple of 3) x 8 floats long
  124. vldmia a3!, {s8-s15}
  125. vldmia a4!, {s1,s2}
  126. vldmia a3!, {s16-s23}
  127. vcvt.f32.s32 s8, s8
  128. vcvt.f32.s32 s9, s9
  129. vcvt.f32.s32 s10, s10
  130. vcvt.f32.s32 s11, s11
  131. vcvt.f32.s32 s12, s12
  132. vcvt.f32.s32 s13, s13
  133. vcvt.f32.s32 s14, s14
  134. vcvt.f32.s32 s15, s15
  135. vmul.f32 s8, s8, s1
  136. b 1b
  137. 11: @ Array is (1 + multiple of 3) x 8 floats long
  138. vldmia a3!, {s24-s31}
  139. vldmia a4!, {s3}
  140. vldmia a3!, {s8-s15}
  141. vldmia a4!, {s1}
  142. vcvt.f32.s32 s24, s24
  143. vcvt.f32.s32 s25, s25
  144. vcvt.f32.s32 s26, s26
  145. vcvt.f32.s32 s27, s27
  146. vcvt.f32.s32 s28, s28
  147. vcvt.f32.s32 s29, s29
  148. vcvt.f32.s32 s30, s30
  149. vcvt.f32.s32 s31, s31
  150. vmul.f32 s24, s24, s3
  151. b 2b
  152. 50:
  153. ldr lr, =0x03070000 @ RunFast mode, short vectors of length 8, stride 1
  154. fmrx ip, FPSCR
  155. fmxr FPSCR, lr
  156. 51:
  157. vldmia a3!, {s8-s15}
  158. vldmia a4!, {s0}
  159. vcvt.f32.s32 s8, s8
  160. vcvt.f32.s32 s9, s9
  161. vcvt.f32.s32 s10, s10
  162. vcvt.f32.s32 s11, s11
  163. vcvt.f32.s32 s12, s12
  164. vcvt.f32.s32 s13, s13
  165. vcvt.f32.s32 s14, s14
  166. vcvt.f32.s32 s15, s15
  167. vmul.f32 s8, s8, s0
  168. subs a1, a1, #8
  169. vstmia a2!, {s8-s11}
  170. vstmia a2!, {s12-s15}
  171. bne 51b
  172. fmxr FPSCR, ip
  173. pop {pc}
  174. endfunc
  175. /**
  176. * ARM VFP optimised int32 to float conversion.
  177. * Assume len is a multiple of 8, destination buffer is at least 4 bytes aligned
  178. * (16 bytes alignment is best for BCM2835), little-endian.
  179. * TODO: could be further optimised by unrolling and interleaving, as above
  180. */
  181. @ void ff_int32_to_float_fmul_scalar_vfp(float *dst, const int32_t *src, float mul, int len)
  182. function ff_int32_to_float_fmul_scalar_vfp, export=1
  183. VFP tmp .req a4
  184. VFP len .req a3
  185. NOVFP tmp .req a3
  186. NOVFP len .req a4
  187. NOVFP vmov s0, a3
  188. ldr tmp, =0x03070000 @ RunFast mode, short vectors of length 8, stride 1
  189. fmrx ip, FPSCR
  190. fmxr FPSCR, tmp
  191. 1:
  192. vldmia a2!, {s8-s15}
  193. vcvt.f32.s32 s8, s8
  194. vcvt.f32.s32 s9, s9
  195. vcvt.f32.s32 s10, s10
  196. vcvt.f32.s32 s11, s11
  197. vcvt.f32.s32 s12, s12
  198. vcvt.f32.s32 s13, s13
  199. vcvt.f32.s32 s14, s14
  200. vcvt.f32.s32 s15, s15
  201. vmul.f32 s8, s8, s0
  202. subs len, len, #8
  203. vstmia a1!, {s8-s11}
  204. vstmia a1!, {s12-s15}
  205. bne 1b
  206. fmxr FPSCR, ip
  207. bx lr
  208. endfunc
  209. .unreq tmp
  210. .unreq len