You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

395 lines
15KB

  1. /*
  2. * ARM NEON optimised Format Conversion Utils
  3. * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  4. *
  5. * This file is part of Libav.
  6. *
  7. * Libav is free software; you can redistribute it and/or
  8. * modify it under the terms of the GNU Lesser General Public
  9. * License as published by the Free Software Foundation; either
  10. * version 2.1 of the License, or (at your option) any later version.
  11. *
  12. * Libav is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. * Lesser General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU Lesser General Public
  18. * License along with Libav; if not, write to the Free Software
  19. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. */
  21. #include "config.h"
  22. #include "libavutil/arm/asm.S"
  23. preserve8
  /*
   * ff_float_to_int16_neon: convert a run of 32-bit floats to 16-bit integers.
   *
   * Register use as seen in the code below:
   *   r0  output pointer; written with post-incrementing stores that carry a
   *       :128 alignment qualifier
   *   r1  input pointer; read with post-incrementing loads that carry a
   *       :128 alignment qualifier
   *   r2  element count
   *   ip  scratch counter for the main loop
   *   q0-q3, q8, q9 are used as NEON temporaries
   *
   * Each float is converted with vcvt.s32.f32 ..., #16 (signed 32-bit fixed
   * point with 16 fraction bits) and then narrowed with vshrn.s32 ..., #16,
   * which keeps the upper halfword of every 32-bit lane.
   *
   * The routine is software pipelined: the first 8 inputs are loaded and
   * converted before the count is examined, the main loop produces 16 outputs
   * per pass, and the two tails write 16 or 8 outputs.
   *
   * NOTE(review): there is no path for a count below 8 or for a count that is
   * not a multiple of 8 - presumably the caller guarantees this; confirm
   * against the C caller.
   */
  24. function ff_float_to_int16_neon, export=1
  /* r2 = count - 8; the flags set here are tested by the first beq below. */
  25. subs r2, r2, #8
  /* Prologue: load the first 8 floats and convert them into q8/q9. */
  26. vld1.64 {d0-d1}, [r1,:128]!
  27. vcvt.s32.f32 q8, q0, #16
  28. vld1.64 {d2-d3}, [r1,:128]!
  29. vcvt.s32.f32 q9, q1, #16
  /* Count was exactly 8: only the values already in q8/q9 remain to be stored. */
  30. beq 3f
  /* ip = (count - 8) with its low four bits cleared: the part handled 16 at a time. */
  31. bics ip, r2, #15
  /* Nothing for the main loop: go straight to the 16-output tail. */
  32. beq 2f
  /*
   * Main loop, 16 outputs per pass. On entry q8/q9 hold 8 converted values.
   * The pass narrows them into d4/d5, loads and converts 8 more through q0/q1
   * and narrows those into d6/d7, stores all 16, and refills q8/q9 with the
   * next 8 converted inputs. The flags from the subs are consumed by the bne
   * at the bottom of the loop.
   */
  33. 1: subs ip, ip, #16
  34. vshrn.s32 d4, q8, #16
  35. vld1.64 {d0-d1}, [r1,:128]!
  36. vcvt.s32.f32 q0, q0, #16
  37. vshrn.s32 d5, q9, #16
  38. vld1.64 {d2-d3}, [r1,:128]!
  39. vcvt.s32.f32 q1, q1, #16
  40. vshrn.s32 d6, q0, #16
  41. vst1.64 {d4-d5}, [r0,:128]!
  42. vshrn.s32 d7, q1, #16
  43. vld1.64 {d16-d17},[r1,:128]!
  44. vcvt.s32.f32 q8, q8, #16
  45. vld1.64 {d18-d19},[r1,:128]!
  46. vcvt.s32.f32 q9, q9, #16
  47. vst1.64 {d6-d7}, [r0,:128]!
  48. bne 1b
  /* r2 = what is left after the main loop; zero means q8/q9 hold the final 8 values. */
  49. ands r2, r2, #15
  50. beq 3f
  /* Tail for 16 outputs: q8/q9 are pending, and 8 more inputs are loaded and converted. */
  51. 2: vld1.64 {d0-d1}, [r1,:128]!
  52. vshrn.s32 d4, q8, #16
  53. vcvt.s32.f32 q0, q0, #16
  54. vld1.64 {d2-d3}, [r1,:128]!
  55. vshrn.s32 d5, q9, #16
  56. vcvt.s32.f32 q1, q1, #16
  57. vshrn.s32 d6, q0, #16
  58. vst1.64 {d4-d5}, [r0,:128]!
  59. vshrn.s32 d7, q1, #16
  60. vst1.64 {d6-d7}, [r0,:128]!
  61. bx lr
  /* Tail for 8 outputs: narrow and store the values pending in q8/q9. */
  62. 3: vshrn.s32 d4, q8, #16
  63. vshrn.s32 d5, q9, #16
  64. vst1.64 {d4-d5}, [r0,:128]!
  65. bx lr
  66. endfunc
  /*
   * ff_float_to_int16_interleave_neon: convert several float channels to
   * 16-bit integers and write them interleaved into one output buffer.
   *
   * Register use as seen in the code below:
   *   r0  output pointer
   *   r1  pointer to an array of per-channel input pointers (read with
   *       ldr / ldmia)
   *   r2  number of samples per channel
   *   r3  number of channels
   *
   * Dispatch on r3:
   *   r3 <  2  load the first channel pointer into r1 and branch (without
   *            link) to ff_float_to_int16_neon
   *   r3 == 2  dedicated two-channel path (labels 1-3), no stack use
   *   r3 >  2  generic path: saves r4-r8 and lr, sets ip = 2 * r3 (the byte
   *            distance between consecutive output frames) and consumes the
   *            channels in groups of 4, then 2, then 1
   *
   * Every float is converted with vcvt.s32.f32 ..., #16 (signed 32-bit fixed
   * point with 16 fraction bits); the upper halfword of each 32-bit lane is
   * the value that ends up in the output. vsri.32 dst, src, #16 merges two
   * channels: it moves the upper halfword of each src lane into the lower
   * halfword of the matching dst lane and keeps dst's upper halfword.
   *
   * NOTE(review): no path handles a sample count that is not a multiple of 8,
   * and all vector loads carry a :128 alignment qualifier - presumably both
   * are guaranteed by the caller; confirm against the C caller.
   */
  67. function ff_float_to_int16_interleave_neon, export=1
  68. cmp r3, #2
  /* Fewer than two channels: hand the single channel to the plain converter. */
  69. itt lt
  70. ldrlt r1, [r1]
  71. blt ff_float_to_int16_neon
  /* Flags still come from the cmp above: anything other than 2 takes the generic path. */
  72. bne 4f
  /*
   * Two-channel path. r3 = first channel pointer, r1 = second channel pointer.
   * Prologue converts 8 samples of the first channel into q8/q9 and 8 samples
   * of the second channel into q10/q11.
   */
  73. ldr r3, [r1]
  74. ldr r1, [r1, #4]
  /* r2 = count - 8; tested by the beq after the prologue loads. */
  75. subs r2, r2, #8
  76. vld1.64 {d0-d1}, [r3,:128]!
  77. vcvt.s32.f32 q8, q0, #16
  78. vld1.64 {d2-d3}, [r3,:128]!
  79. vcvt.s32.f32 q9, q1, #16
  80. vld1.64 {d20-d21},[r1,:128]!
  81. vcvt.s32.f32 q10, q10, #16
  82. vld1.64 {d22-d23},[r1,:128]!
  83. vcvt.s32.f32 q11, q11, #16
  /* Exactly 8 frames: merge and store what is already converted. */
  84. beq 3f
  /* ip = (count - 8) with its low four bits cleared; zero means only the 16-frame tail runs. */
  85. bics ip, r2, #15
  86. beq 2f
  /*
   * Main two-channel loop, 16 frames per pass. After each vsri.32 the merged
   * register holds, per 32-bit lane, the first channel in the low halfword
   * and the second channel in the high halfword. q10/q11 carry the 8 pending
   * frames, q12/q13 (merged with q0/q1) carry the 8 frames loaded in this
   * pass, and q8-q11 are refilled for the next pass.
   */
  87. 1: subs ip, ip, #16
  88. vld1.64 {d0-d1}, [r3,:128]!
  89. vcvt.s32.f32 q0, q0, #16
  90. vsri.32 q10, q8, #16
  91. vld1.64 {d2-d3}, [r3,:128]!
  92. vcvt.s32.f32 q1, q1, #16
  93. vld1.64 {d24-d25},[r1,:128]!
  94. vcvt.s32.f32 q12, q12, #16
  95. vld1.64 {d26-d27},[r1,:128]!
  96. vsri.32 q11, q9, #16
  97. vst1.64 {d20-d21},[r0,:128]!
  98. vcvt.s32.f32 q13, q13, #16
  99. vst1.64 {d22-d23},[r0,:128]!
  100. vsri.32 q12, q0, #16
  101. vld1.64 {d16-d17},[r3,:128]!
  102. vsri.32 q13, q1, #16
  103. vst1.64 {d24-d25},[r0,:128]!
  104. vcvt.s32.f32 q8, q8, #16
  105. vld1.64 {d18-d19},[r3,:128]!
  106. vcvt.s32.f32 q9, q9, #16
  107. vld1.64 {d20-d21},[r1,:128]!
  108. vcvt.s32.f32 q10, q10, #16
  109. vld1.64 {d22-d23},[r1,:128]!
  110. vcvt.s32.f32 q11, q11, #16
  111. vst1.64 {d26-d27},[r0,:128]!
  112. bne 1b
  /* r2 = what is left after the main loop; zero means the pending 8 frames are the last. */
  113. ands r2, r2, #15
  114. beq 3f
  /* Two-channel tail for 16 frames: 8 pending in q8-q11 plus 8 freshly loaded. */
  115. 2: vsri.32 q10, q8, #16
  116. vld1.64 {d0-d1}, [r3,:128]!
  117. vcvt.s32.f32 q0, q0, #16
  118. vld1.64 {d2-d3}, [r3,:128]!
  119. vcvt.s32.f32 q1, q1, #16
  120. vld1.64 {d24-d25},[r1,:128]!
  121. vcvt.s32.f32 q12, q12, #16
  122. vsri.32 q11, q9, #16
  123. vld1.64 {d26-d27},[r1,:128]!
  124. vcvt.s32.f32 q13, q13, #16
  125. vst1.64 {d20-d21},[r0,:128]!
  126. vsri.32 q12, q0, #16
  127. vst1.64 {d22-d23},[r0,:128]!
  128. vsri.32 q13, q1, #16
  129. vst1.64 {d24-d27},[r0,:128]!
  130. bx lr
  /* Two-channel tail for 8 frames: merge and store the pending q8-q11. */
  131. 3: vsri.32 q10, q8, #16
  132. vsri.32 q11, q9, #16
  133. vst1.64 {d20-d23},[r0,:128]!
  134. bx lr
  /*
   * Generic path for more than two channels. r4-r8 and lr are used as
   * scratch from here on, so they are saved and every exit below pops them
   * (loading pc to return). ip = r3 * 2 is the post-increment applied to the
   * output pointer after every per-frame store.
   */
  135. 4: push {r4-r8,lr}
  136. cmp r3, #4
  137. lsl ip, r3, #1
  /* Fewer than 4 channels (flags from the cmp; lsl does not set them): skip to the pair stage. */
  138. blt 4f
  /*
   * Four channels at a time. r4-r7 = the next four channel pointers (r1 is
   * advanced past them), lr = sample count, r8 = output cursor for this
   * group. Prologue converts 4 samples of each channel into q8-q11.
   */
  139. @ 4 channels
  140. 5: ldmia r1!, {r4-r7}
  141. mov lr, r2
  142. mov r8, r0
  143. vld1.64 {d16-d17},[r4,:128]!
  144. vcvt.s32.f32 q8, q8, #16
  145. vld1.64 {d18-d19},[r5,:128]!
  146. vcvt.s32.f32 q9, q9, #16
  147. vld1.64 {d20-d21},[r6,:128]!
  148. vcvt.s32.f32 q10, q10, #16
  149. vld1.64 {d22-d23},[r7,:128]!
  150. vcvt.s32.f32 q11, q11, #16
  /*
   * Loop body, 8 frames per pass. vsri.32 pairs channel 0 with 1 (in q9) and
   * channel 2 with 3 (in q11); vzip.32 then brings the two pairs of the same
   * frame together, so each of d18, d22, d19, d23 holds one frame's four
   * samples and is stored with a single 64-bit store followed by the frame
   * stride. The second half of the pass does the same with q0-q3.
   * The flags from the subs are consumed by the beq 7f further down.
   */
  151. 6: subs lr, lr, #8
  152. vld1.64 {d0-d1}, [r4,:128]!
  153. vcvt.s32.f32 q0, q0, #16
  154. vsri.32 q9, q8, #16
  155. vld1.64 {d2-d3}, [r5,:128]!
  156. vcvt.s32.f32 q1, q1, #16
  157. vsri.32 q11, q10, #16
  158. vld1.64 {d4-d5}, [r6,:128]!
  159. vcvt.s32.f32 q2, q2, #16
  160. vzip.32 d18, d22
  161. vld1.64 {d6-d7}, [r7,:128]!
  162. vcvt.s32.f32 q3, q3, #16
  163. vzip.32 d19, d23
  164. vst1.64 {d18}, [r8], ip
  165. vsri.32 q1, q0, #16
  166. vst1.64 {d22}, [r8], ip
  167. vsri.32 q3, q2, #16
  168. vst1.64 {d19}, [r8], ip
  169. vzip.32 d2, d6
  170. vst1.64 {d23}, [r8], ip
  171. vzip.32 d3, d7
  /* Last pass: skip the refill and just flush the four frames held in d2/d6/d3/d7. */
  172. beq 7f
  /* Not done: refill q8-q11 for the next pass while storing the remaining four frames. */
  173. vld1.64 {d16-d17},[r4,:128]!
  174. vcvt.s32.f32 q8, q8, #16
  175. vst1.64 {d2}, [r8], ip
  176. vld1.64 {d18-d19},[r5,:128]!
  177. vcvt.s32.f32 q9, q9, #16
  178. vst1.64 {d6}, [r8], ip
  179. vld1.64 {d20-d21},[r6,:128]!
  180. vcvt.s32.f32 q10, q10, #16
  181. vst1.64 {d3}, [r8], ip
  182. vld1.64 {d22-d23},[r7,:128]!
  183. vcvt.s32.f32 q11, q11, #16
  184. vst1.64 {d7}, [r8], ip
  185. b 6b
  186. 7: vst1.64 {d2}, [r8], ip
  187. vst1.64 {d6}, [r8], ip
  188. vst1.64 {d3}, [r8], ip
  189. vst1.64 {d7}, [r8], ip
  /* Four channels consumed; return if none are left. */
  190. subs r3, r3, #4
  191. it eq
  192. popeq {r4-r8,pc}
  /*
   * Advance the output base by the 8 bytes this group occupies in each frame
   * and repeat while at least four channels remain (add does not disturb the
   * flags from the cmp).
   */
  193. cmp r3, #4
  194. add r0, r0, #8
  195. bge 5b
  /*
   * Two channels of the generic path. r4/r5 = the next two channel pointers,
   * lr = sample count, r8 = output cursor. Prologue converts 8 samples of each
   * channel: q8/q10 from r4, q9/q11 from r5. Each merged 32-bit lane is one
   * frame's pair and is written with its own vst1.32 lane store plus stride.
   */
  196. @ 2 channels
  197. 4: cmp r3, #2
  /* Only one channel left: go to the single-channel stage. */
  198. blt 4f
  199. ldmia r1!, {r4-r5}
  200. mov lr, r2
  201. mov r8, r0
  /* Bit 3 of the count decides whether an 8-frame step is needed before the 16-frame loop. */
  202. tst lr, #8
  203. vld1.64 {d16-d17},[r4,:128]!
  204. vcvt.s32.f32 q8, q8, #16
  205. vld1.64 {d18-d19},[r5,:128]!
  206. vcvt.s32.f32 q9, q9, #16
  207. vld1.64 {d20-d21},[r4,:128]!
  208. vcvt.s32.f32 q10, q10, #16
  209. vld1.64 {d22-d23},[r5,:128]!
  210. vcvt.s32.f32 q11, q11, #16
  /* Bit 3 clear: enter the 16-frame loop directly (flags from the tst above). */
  211. beq 6f
  /* Bit 3 set: if these 8 frames are all there is, flush them at 7 and finish. */
  212. subs lr, lr, #8
  213. beq 7f
  /* Otherwise store the 8 pending frames and refill q8-q11 before entering the loop. */
  214. vsri.32 d18, d16, #16
  215. vsri.32 d19, d17, #16
  216. vld1.64 {d16-d17},[r4,:128]!
  217. vcvt.s32.f32 q8, q8, #16
  218. vst1.32 {d18[0]}, [r8], ip
  219. vsri.32 d22, d20, #16
  220. vst1.32 {d18[1]}, [r8], ip
  221. vsri.32 d23, d21, #16
  222. vst1.32 {d19[0]}, [r8], ip
  223. vst1.32 {d19[1]}, [r8], ip
  224. vld1.64 {d18-d19},[r5,:128]!
  225. vcvt.s32.f32 q9, q9, #16
  226. vst1.32 {d22[0]}, [r8], ip
  227. vst1.32 {d22[1]}, [r8], ip
  228. vld1.64 {d20-d21},[r4,:128]!
  229. vcvt.s32.f32 q10, q10, #16
  230. vst1.32 {d23[0]}, [r8], ip
  231. vst1.32 {d23[1]}, [r8], ip
  232. vld1.64 {d22-d23},[r5,:128]!
  233. vcvt.s32.f32 q11, q11, #16
  /*
   * 16-frame loop for the channel pair. The first half stores the 8 pending
   * frames (d18, d19, d22, d23) while 8 more are converted into q0-q3 and
   * merged into d2, d3, d6, d7. The flags from the subs are used both by the
   * beq 6f (forward, to the flush block) and by the bgt 6b at the bottom.
   */
  234. 6: subs lr, lr, #16
  235. vld1.64 {d0-d1}, [r4,:128]!
  236. vcvt.s32.f32 q0, q0, #16
  237. vsri.32 d18, d16, #16
  238. vld1.64 {d2-d3}, [r5,:128]!
  239. vcvt.s32.f32 q1, q1, #16
  240. vsri.32 d19, d17, #16
  241. vld1.64 {d4-d5}, [r4,:128]!
  242. vcvt.s32.f32 q2, q2, #16
  243. vld1.64 {d6-d7}, [r5,:128]!
  244. vcvt.s32.f32 q3, q3, #16
  245. vst1.32 {d18[0]}, [r8], ip
  246. vsri.32 d22, d20, #16
  247. vst1.32 {d18[1]}, [r8], ip
  248. vsri.32 d23, d21, #16
  249. vst1.32 {d19[0]}, [r8], ip
  250. vsri.32 d2, d0, #16
  251. vst1.32 {d19[1]}, [r8], ip
  252. vsri.32 d3, d1, #16
  253. vst1.32 {d22[0]}, [r8], ip
  254. vsri.32 d6, d4, #16
  255. vst1.32 {d22[1]}, [r8], ip
  256. vsri.32 d7, d5, #16
  257. vst1.32 {d23[0]}, [r8], ip
  258. vst1.32 {d23[1]}, [r8], ip
  /* Count reached zero: flush the last 8 frames without loading any further input. */
  259. beq 6f
  /* More to do: refill q8-q11 for the next pass while storing the second 8 frames. */
  260. vld1.64 {d16-d17},[r4,:128]!
  261. vcvt.s32.f32 q8, q8, #16
  262. vst1.32 {d2[0]}, [r8], ip
  263. vst1.32 {d2[1]}, [r8], ip
  264. vld1.64 {d18-d19},[r5,:128]!
  265. vcvt.s32.f32 q9, q9, #16
  266. vst1.32 {d3[0]}, [r8], ip
  267. vst1.32 {d3[1]}, [r8], ip
  268. vld1.64 {d20-d21},[r4,:128]!
  269. vcvt.s32.f32 q10, q10, #16
  270. vst1.32 {d6[0]}, [r8], ip
  271. vst1.32 {d6[1]}, [r8], ip
  272. vld1.64 {d22-d23},[r5,:128]!
  273. vcvt.s32.f32 q11, q11, #16
  274. vst1.32 {d7[0]}, [r8], ip
  275. vst1.32 {d7[1]}, [r8], ip
  276. bgt 6b
  /* Flush block reached from the beq 6f above: store the final 8 merged frames. */
  277. 6: vst1.32 {d2[0]}, [r8], ip
  278. vst1.32 {d2[1]}, [r8], ip
  279. vst1.32 {d3[0]}, [r8], ip
  280. vst1.32 {d3[1]}, [r8], ip
  281. vst1.32 {d6[0]}, [r8], ip
  282. vst1.32 {d6[1]}, [r8], ip
  283. vst1.32 {d7[0]}, [r8], ip
  284. vst1.32 {d7[1]}, [r8], ip
  285. b 8f
  /* Count was exactly 8: merge and store the 8 frames converted in the prologue. */
  286. 7: vsri.32 d18, d16, #16
  287. vsri.32 d19, d17, #16
  288. vst1.32 {d18[0]}, [r8], ip
  289. vsri.32 d22, d20, #16
  290. vst1.32 {d18[1]}, [r8], ip
  291. vsri.32 d23, d21, #16
  292. vst1.32 {d19[0]}, [r8], ip
  293. vst1.32 {d19[1]}, [r8], ip
  294. vst1.32 {d22[0]}, [r8], ip
  295. vst1.32 {d22[1]}, [r8], ip
  296. vst1.32 {d23[0]}, [r8], ip
  297. vst1.32 {d23[1]}, [r8], ip
  /*
   * Two channels consumed: advance the output base by the 4 bytes they occupy
   * in each frame and return if no channel is left (the add leaves the flags
   * from the subs intact). Otherwise fall through to the last channel.
   */
  298. 8: subs r3, r3, #2
  299. add r0, r0, #4
  300. it eq
  301. popeq {r4-r8,pc}
  /*
   * Single remaining channel. r4 = its pointer, lr = sample count, r5 = output
   * cursor. Values are written one at a time with vst1.16 from the odd 16-bit
   * lanes (the upper halfword of each converted 32-bit lane), each store
   * followed by the frame stride in ip. Prologue converts 8 samples into q0/q1.
   */
  302. @ 1 channel
  303. 4: ldr r4, [r1],#4
  /* Bit 3 of the count set: do one 8-sample step at label 8 before the 16-sample loop. */
  304. tst r2, #8
  305. mov lr, r2
  306. mov r5, r0
  307. vld1.64 {d0-d1}, [r4,:128]!
  308. vcvt.s32.f32 q0, q0, #16
  309. vld1.64 {d2-d3}, [r4,:128]!
  310. vcvt.s32.f32 q1, q1, #16
  311. bne 8f
  /*
   * 16 samples per pass: store the 8 pending in q0/q1 while converting 8 more
   * into q2/q3, then store those. The flags from the subs decide both the
   * beq 7f (skip the refill on the last pass) and the bgt 6b.
   */
  312. 6: subs lr, lr, #16
  313. vld1.64 {d4-d5}, [r4,:128]!
  314. vcvt.s32.f32 q2, q2, #16
  315. vld1.64 {d6-d7}, [r4,:128]!
  316. vcvt.s32.f32 q3, q3, #16
  317. vst1.16 {d0[1]}, [r5,:16], ip
  318. vst1.16 {d0[3]}, [r5,:16], ip
  319. vst1.16 {d1[1]}, [r5,:16], ip
  320. vst1.16 {d1[3]}, [r5,:16], ip
  321. vst1.16 {d2[1]}, [r5,:16], ip
  322. vst1.16 {d2[3]}, [r5,:16], ip
  323. vst1.16 {d3[1]}, [r5,:16], ip
  324. vst1.16 {d3[3]}, [r5,:16], ip
  325. beq 7f
  326. vld1.64 {d0-d1}, [r4,:128]!
  327. vcvt.s32.f32 q0, q0, #16
  328. vld1.64 {d2-d3}, [r4,:128]!
  329. vcvt.s32.f32 q1, q1, #16
  330. 7: vst1.16 {d4[1]}, [r5,:16], ip
  331. vst1.16 {d4[3]}, [r5,:16], ip
  332. vst1.16 {d5[1]}, [r5,:16], ip
  333. vst1.16 {d5[3]}, [r5,:16], ip
  334. vst1.16 {d6[1]}, [r5,:16], ip
  335. vst1.16 {d6[3]}, [r5,:16], ip
  336. vst1.16 {d7[1]}, [r5,:16], ip
  337. vst1.16 {d7[3]}, [r5,:16], ip
  338. bgt 6b
  339. pop {r4-r8,pc}
  /*
   * 8-sample step taken first when bit 3 of the count is set: store the 8
   * samples from the prologue, return if that was everything, otherwise
   * refill q0/q1 and join the 16-sample loop.
   */
  340. 8: subs lr, lr, #8
  341. vst1.16 {d0[1]}, [r5,:16], ip
  342. vst1.16 {d0[3]}, [r5,:16], ip
  343. vst1.16 {d1[1]}, [r5,:16], ip
  344. vst1.16 {d1[3]}, [r5,:16], ip
  345. vst1.16 {d2[1]}, [r5,:16], ip
  346. vst1.16 {d2[3]}, [r5,:16], ip
  347. vst1.16 {d3[1]}, [r5,:16], ip
  348. vst1.16 {d3[3]}, [r5,:16], ip
  349. it eq
  350. popeq {r4-r8,pc}
  351. vld1.64 {d0-d1}, [r4,:128]!
  352. vcvt.s32.f32 q0, q0, #16
  353. vld1.64 {d2-d3}, [r4,:128]!
  354. vcvt.s32.f32 q1, q1, #16
  355. b 6b
  356. endfunc
  /*
   * ff_int32_to_float_fmul_scalar_neon: convert signed 32-bit integers to
   * float and multiply each result by one scalar.
   *
   * Register use as seen in the code below:
   *   r0  output pointer (stores carry a :128 alignment qualifier)
   *   r1  input pointer (loads carry a :128 alignment qualifier)
   *   scalar multiplier and element count: taken from d0[0] and r2 on the
   *   VFP-prefixed lines, or from r2 and r3 on the NOVFP-prefixed lines.
   *   VFP and NOVFP are not defined in this file; presumably they come from
   *   libavutil/arm/asm.S and select a line by floating-point calling
   *   convention - confirm there.
   *
   * Eight elements are handled per pass, with the next eight converted while
   * the current products are being stored. The loop is left only when the
   * count reaches exactly zero (beq), so the count is presumably required to
   * be a positive multiple of 8 - confirm against the C caller.
   */
  357. function ff_int32_to_float_fmul_scalar_neon, export=1
  /* Broadcast the scalar to all four lanes of q0 and give the count register the name len. */
  358. VFP vdup.32 q0, d0[0]
  359. VFP len .req r2
  360. NOVFP vdup.32 q0, r2
  361. NOVFP len .req r3
  /* Prologue: load 8 integers and convert them to float in q3/q8. */
  362. vld1.32 {q1},[r1,:128]!
  363. vcvt.f32.s32 q3, q1
  364. vld1.32 {q2},[r1,:128]!
  365. vcvt.f32.s32 q8, q2
  /* The flags from this subs are tested by the beq after the two multiplies. */
  366. 1: subs len, len, #8
  /* Prefetch hint for input 16 bytes past the current read position. */
  367. pld [r1, #16]
  368. vmul.f32 q9, q3, q0
  369. vmul.f32 q10, q8, q0
  /* Last 8 elements: store the products at label 2 without reading further input. */
  370. beq 2f
  /* Convert the next 8 inputs before storing the current products. */
  371. vld1.32 {q1},[r1,:128]!
  372. vcvt.f32.s32 q3, q1
  373. vld1.32 {q2},[r1,:128]!
  374. vcvt.f32.s32 q8, q2
  375. vst1.32 {q9}, [r0,:128]!
  376. vst1.32 {q10},[r0,:128]!
  377. b 1b
  378. 2: vst1.32 {q9}, [r0,:128]!
  379. vst1.32 {q10},[r0,:128]!
  380. bx lr
  /* Release the len register alias defined at the top of the function. */
  381. .unreq len
  382. endfunc