You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

396 lines
15KB

  1. /*
  2. * ARM NEON optimised Format Conversion Utils
  3. * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  4. *
  5. * This file is part of FFmpeg.
  6. *
  7. * FFmpeg is free software; you can redistribute it and/or
  8. * modify it under the terms of the GNU Lesser General Public
  9. * License as published by the Free Software Foundation; either
  10. * version 2.1 of the License, or (at your option) any later version.
  11. *
  12. * FFmpeg is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. * Lesser General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU Lesser General Public
  18. * License along with FFmpeg; if not, write to the Free Software
  19. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. */
  21. #include "config.h"
  22. #include "asm.S"
  @ preserve8 is not defined in this file; presumably a macro supplied by
  @ asm.S that marks the object as keeping 8-byte stack alignment.
  @ TODO confirm against asm.S.
  23. preserve8
  @ Everything that follows is assembled into the code section.
  24. .text
  @ ---------------------------------------------------------------------
  @ ff_float_to_int16_neon
  @
  @ Converts a run of single-precision floats to signed 16-bit integers.
  @
  @ Registers as used by the code below:
  @   r0  destination; 16 bytes written per store with a 128-bit
  @       alignment hint, post-incremented
  @   r1  source; 16 bytes read per load with a 128-bit alignment hint,
  @       post-incremented
  @   r2  number of samples
  @   ip  main-loop counter (scratch)
  @   q0, q1, q8, q9   converted samples in flight
  @   d4-d7            narrowed 16-bit results waiting to be stored
  @
  @ Conversion: vcvt.s32.f32 ..., #16 yields a signed 32-bit fixed-point
  @ value with 16 fraction bits; vshrn.s32 ..., #16 then keeps only the
  @ upper 16 bits of every lane, i.e. the integer part, as an int16.
  @
  @ The count is only ever compared against 0 after subtracting 8 or 16,
  @ and any non-zero remainder modulo 16 is handled as exactly 8, so no
  @ count other than a positive multiple of 8 is handled here.
  @ NOTE(review): presumably called as f(int16_t *dst, const float *src,
  @ long len) -- confirm against the C declaration.
  @ ---------------------------------------------------------------------
  25. function ff_float_to_int16_neon, export=1
  @ Prologue: account for, load and convert the first 8 samples (q8/q9).
  @ The flags set by this subs are consumed by "beq 3f" below; the NEON
  @ instructions in between leave them untouched.
  26. subs r2, r2, #8
  27. vld1.64 {d0-d1}, [r1,:128]!
  28. vcvt.s32.f32 q8, q0, #16
  29. vld1.64 {d2-d3}, [r1,:128]!
  30. vcvt.s32.f32 q9, q1, #16
  @ Exactly 8 samples in total: only the pending q8/q9 must be written.
  31. beq 3f
  @ ip = remaining count rounded down to a multiple of 16; if that is 0
  @ there is one more group of 8 to do, handled by the tail at 2.
  32. bics ip, r2, #15
  33. beq 2f
  @ Main loop, 16 samples per pass: narrow and store the 8 pending in
  @ q8/q9, load/convert/store 8 more through q0/q1, then load and convert
  @ the next pending 8 into q8/q9. The subs flags survive until "bne 1b".
  34. 1: subs ip, ip, #16
  35. vshrn.s32 d4, q8, #16
  36. vld1.64 {d0-d1}, [r1,:128]!
  37. vcvt.s32.f32 q0, q0, #16
  38. vshrn.s32 d5, q9, #16
  39. vld1.64 {d2-d3}, [r1,:128]!
  40. vcvt.s32.f32 q1, q1, #16
  41. vshrn.s32 d6, q0, #16
  42. vst1.64 {d4-d5}, [r0,:128]!
  43. vshrn.s32 d7, q1, #16
  44. vld1.64 {d16-d17},[r1,:128]!
  45. vcvt.s32.f32 q8, q8, #16
  46. vld1.64 {d18-d19},[r1,:128]!
  47. vcvt.s32.f32 q9, q9, #16
  48. vst1.64 {d6-d7}, [r0,:128]!
  49. bne 1b
  @ Remainder modulo 16: zero means only the pending 8 are left (3),
  @ otherwise fall through to the 8-pending-plus-8-more tail (2).
  50. ands r2, r2, #15
  51. beq 3f
  @ Tail A: store the pending 8 (q8/q9) and load, convert and store a
  @ final group of 8.
  52. 2: vld1.64 {d0-d1}, [r1,:128]!
  53. vshrn.s32 d4, q8, #16
  54. vcvt.s32.f32 q0, q0, #16
  55. vld1.64 {d2-d3}, [r1,:128]!
  56. vshrn.s32 d5, q9, #16
  57. vcvt.s32.f32 q1, q1, #16
  58. vshrn.s32 d6, q0, #16
  59. vst1.64 {d4-d5}, [r0,:128]!
  60. vshrn.s32 d7, q1, #16
  61. vst1.64 {d6-d7}, [r0,:128]!
  62. bx lr
  @ Tail B: nothing left to load; narrow and store the pending 8.
  63. 3: vshrn.s32 d4, q8, #16
  64. vshrn.s32 d5, q9, #16
  65. vst1.64 {d4-d5}, [r0,:128]!
  66. bx lr
  67. endfunc
  @ ---------------------------------------------------------------------
  @ ff_float_to_int16_interleave_neon
  @
  @ Converts several per-channel float buffers to signed 16-bit integers
  @ and writes them interleaved (one sample of every channel per output
  @ frame) to a single destination.
  @
  @ Registers on entry, as read by the code below:
  @   r0  destination
  @   r1  address of a table of per-channel source pointers
  @   r2  samples per channel
  @   r3  number of channels
  @ NOTE(review): presumably f(int16_t *dst, const float **src, long len,
  @ int channels) -- confirm against the C declaration.
  @
  @ Paths:
  @   r3 <  2  tail-branch to ff_float_to_int16_neon with the first pointer
  @   r3 == 2  dedicated loop with contiguous 128-bit aligned stores
  @   r3 >  2  generic code: channels are consumed in groups of four, then
  @            a pair, then a single one, each group written with a
  @            stride of ip = 2 * r3 bytes between output frames
  @
  @ Conversion is the same in every path: vcvt.s32.f32 ..., #16 gives a
  @ 32-bit fixed-point value with 16 fraction bits and only the upper 16
  @ bits of each lane are kept. "vsri.32 B, A, #16" performs the pairing:
  @ every 32-bit lane keeps the upper half of B and receives the upper
  @ half of A in its lower half, so one lane holds two adjacent channels.
  @
  @ All source loads carry a 128-bit alignment hint. Every loop counter is
  @ only tested after subtracting 8 or 16, so no path handles a sample
  @ count that is not a positive multiple of 8.
  @
  @ Numeric local labels are reused: "4f"/"6f" always mean the next
  @ definition of that number further down, "6b" the closest one above.
  @ ---------------------------------------------------------------------
  68. function ff_float_to_int16_interleave_neon, export=1
  @ Fewer than two channels: fetch the only source pointer and continue in
  @ ff_float_to_int16_neon (r0 and r2 are passed through unchanged). The
  @ "bne" still uses the flags of the cmp: anything other than 2 goes to
  @ the generic code at the first "4:" below.
  69. cmp r3, #2
  70. itt lt
  71. ldrlt r1, [r1]
  72. blt ff_float_to_int16_neon
  73. bne 4f
  @ ---- exactly two channels ---------------------------------------------
  @ r3 = first channel pointer, r1 = second channel pointer.
  74. ldr r3, [r1]
  75. ldr r1, [r1, #4]
  @ Prologue: first 8 samples of each channel; q8/q9 = first channel,
  @ q10/q11 = second channel. The subs flags are consumed by "beq 3f".
  76. subs r2, r2, #8
  77. vld1.64 {d0-d1}, [r3,:128]!
  78. vcvt.s32.f32 q8, q0, #16
  79. vld1.64 {d2-d3}, [r3,:128]!
  80. vcvt.s32.f32 q9, q1, #16
  81. vld1.64 {d20-d21},[r1,:128]!
  82. vcvt.s32.f32 q10, q10, #16
  83. vld1.64 {d22-d23},[r1,:128]!
  84. vcvt.s32.f32 q11, q11, #16
  85. beq 3f
  @ ip = remaining count rounded down to a multiple of 16.
  86. bics ip, r2, #15
  87. beq 2f
  @ Main loop, 16 frames per pass: pair and store the pending 8 frames
  @ (q10/q11), convert, pair and store 8 more through q0/q1 and q12/q13,
  @ then preload the next pending 8 into q8-q11.
  88. 1: subs ip, ip, #16
  89. vld1.64 {d0-d1}, [r3,:128]!
  90. vcvt.s32.f32 q0, q0, #16
  91. vsri.32 q10, q8, #16
  92. vld1.64 {d2-d3}, [r3,:128]!
  93. vcvt.s32.f32 q1, q1, #16
  94. vld1.64 {d24-d25},[r1,:128]!
  95. vcvt.s32.f32 q12, q12, #16
  96. vld1.64 {d26-d27},[r1,:128]!
  97. vsri.32 q11, q9, #16
  98. vst1.64 {d20-d21},[r0,:128]!
  99. vcvt.s32.f32 q13, q13, #16
  100. vst1.64 {d22-d23},[r0,:128]!
  101. vsri.32 q12, q0, #16
  102. vld1.64 {d16-d17},[r3,:128]!
  103. vsri.32 q13, q1, #16
  104. vst1.64 {d24-d25},[r0,:128]!
  105. vcvt.s32.f32 q8, q8, #16
  106. vld1.64 {d18-d19},[r3,:128]!
  107. vcvt.s32.f32 q9, q9, #16
  108. vld1.64 {d20-d21},[r1,:128]!
  109. vcvt.s32.f32 q10, q10, #16
  110. vld1.64 {d22-d23},[r1,:128]!
  111. vcvt.s32.f32 q11, q11, #16
  112. vst1.64 {d26-d27},[r0,:128]!
  113. bne 1b
  @ Remainder modulo 16: zero -> only the pending 8 frames remain (3).
  114. ands r2, r2, #15
  115. beq 3f
  @ Tail A: store the pending 8 frames and convert/store a final 8.
  116. 2: vsri.32 q10, q8, #16
  117. vld1.64 {d0-d1}, [r3,:128]!
  118. vcvt.s32.f32 q0, q0, #16
  119. vld1.64 {d2-d3}, [r3,:128]!
  120. vcvt.s32.f32 q1, q1, #16
  121. vld1.64 {d24-d25},[r1,:128]!
  122. vcvt.s32.f32 q12, q12, #16
  123. vsri.32 q11, q9, #16
  124. vld1.64 {d26-d27},[r1,:128]!
  125. vcvt.s32.f32 q13, q13, #16
  126. vst1.64 {d20-d21},[r0,:128]!
  127. vsri.32 q12, q0, #16
  128. vst1.64 {d22-d23},[r0,:128]!
  129. vsri.32 q13, q1, #16
  130. vst1.64 {d24-d27},[r0,:128]!
  131. bx lr
  @ Tail B: nothing left to load; pair and store the pending 8 frames.
  132. 3: vsri.32 q10, q8, #16
  133. vsri.32 q11, q9, #16
  134. vst1.64 {d20-d23},[r0,:128]!
  135. bx lr
  @ ---- more than two channels -------------------------------------------
  @ r4-r8 and lr are used as scratch from here on, so they are saved and
  @ every exit below is a pop into pc. ip = r3 * 2 is the distance in
  @ bytes between two consecutive output frames. The lsl does not touch
  @ the flags, so "blt 4f" still tests r3 against 4.
  136. 4: push {r4-r8,lr}
  137. cmp r3, #4
  138. lsl ip, r3, #1
  139. blt 4f
  140. @ 4 channels
  @ r4-r7 = the next four source pointers (r1 moves past them),
  @ lr = sample counter, r8 = output cursor for this channel group.
  @ Prologue: 4 samples of each channel into q8-q11.
  141. 5: ldmia r1!, {r4-r7}
  142. mov lr, r2
  143. mov r8, r0
  144. vld1.64 {d16-d17},[r4,:128]!
  145. vcvt.s32.f32 q8, q8, #16
  146. vld1.64 {d18-d19},[r5,:128]!
  147. vcvt.s32.f32 q9, q9, #16
  148. vld1.64 {d20-d21},[r6,:128]!
  149. vcvt.s32.f32 q10, q10, #16
  150. vld1.64 {d22-d23},[r7,:128]!
  151. vcvt.s32.f32 q11, q11, #16
  @ Loop, 8 frames per pass. vsri pairs channels (q9 <- q8, q11 <- q10),
  @ vzip.32 then brings both pairs of the same frame into one d register:
  @ d18, d22, d19, d23 become frames 0..3, each 8 bytes, stored one frame
  @ stride (ip) apart. Meanwhile the next 4 samples per channel are
  @ loaded into q0-q3 and treated the same way (d2, d6, d3, d7).
  152. 6: subs lr, lr, #8
  153. vld1.64 {d0-d1}, [r4,:128]!
  154. vcvt.s32.f32 q0, q0, #16
  155. vsri.32 q9, q8, #16
  156. vld1.64 {d2-d3}, [r5,:128]!
  157. vcvt.s32.f32 q1, q1, #16
  158. vsri.32 q11, q10, #16
  159. vld1.64 {d4-d5}, [r6,:128]!
  160. vcvt.s32.f32 q2, q2, #16
  161. vzip.32 d18, d22
  162. vld1.64 {d6-d7}, [r7,:128]!
  163. vcvt.s32.f32 q3, q3, #16
  164. vzip.32 d19, d23
  165. vst1.64 {d18}, [r8], ip
  166. vsri.32 q1, q0, #16
  167. vst1.64 {d22}, [r8], ip
  168. vsri.32 q3, q2, #16
  169. vst1.64 {d19}, [r8], ip
  170. vzip.32 d2, d6
  171. vst1.64 {d23}, [r8], ip
  172. vzip.32 d3, d7
  @ Counter reached zero: only the four frames in d2/d6/d3/d7 remain.
  173. beq 7f
  @ Otherwise store those four frames while preloading the next 4 samples
  @ of each channel into q8-q11.
  174. vld1.64 {d16-d17},[r4,:128]!
  175. vcvt.s32.f32 q8, q8, #16
  176. vst1.64 {d2}, [r8], ip
  177. vld1.64 {d18-d19},[r5,:128]!
  178. vcvt.s32.f32 q9, q9, #16
  179. vst1.64 {d6}, [r8], ip
  180. vld1.64 {d20-d21},[r6,:128]!
  181. vcvt.s32.f32 q10, q10, #16
  182. vst1.64 {d3}, [r8], ip
  183. vld1.64 {d22-d23},[r7,:128]!
  184. vcvt.s32.f32 q11, q11, #16
  185. vst1.64 {d7}, [r8], ip
  186. b 6b
  @ Group finished: flush the last four frames, drop 4 from the channel
  @ count and return if none are left. Otherwise advance r0 by the 8 bytes
  @ this group occupies in each frame and repeat while at least four
  @ channels remain (the add does not disturb the cmp flags).
  187. 7: vst1.64 {d2}, [r8], ip
  188. vst1.64 {d6}, [r8], ip
  189. vst1.64 {d3}, [r8], ip
  190. vst1.64 {d7}, [r8], ip
  191. subs r3, r3, #4
  192. it eq
  193. popeq {r4-r8,pc}
  194. cmp r3, #4
  195. add r0, r0, #8
  196. bge 5b
  197. @ 2 channels
  @ Fewer than two channels left -> single-channel code at the next "4:".
  @ r4/r5 = the two source pointers, lr = sample counter, r8 = output
  @ cursor. The tst flags (bit 3 of the count) stay valid until "beq 6f":
  @ when the bit is clear the count is a multiple of 16 and the 16-frame
  @ loop is entered directly.
  @ Prologue: q8/q10 = samples 0-3/4-7 of the first channel, q9/q11 the
  @ same for the second channel.
  198. 4: cmp r3, #2
  199. blt 4f
  200. ldmia r1!, {r4-r5}
  201. mov lr, r2
  202. mov r8, r0
  203. tst lr, #8
  204. vld1.64 {d16-d17},[r4,:128]!
  205. vcvt.s32.f32 q8, q8, #16
  206. vld1.64 {d18-d19},[r5,:128]!
  207. vcvt.s32.f32 q9, q9, #16
  208. vld1.64 {d20-d21},[r4,:128]!
  209. vcvt.s32.f32 q10, q10, #16
  210. vld1.64 {d22-d23},[r5,:128]!
  211. vcvt.s32.f32 q11, q11, #16
  212. beq 6f
  @ Odd group of 8: if it is the only data, finish at 7. Otherwise emit
  @ these 8 frames now (one 4-byte lane store per frame, stride ip) and
  @ preload the next 8 samples per channel for the loop.
  213. subs lr, lr, #8
  214. beq 7f
  215. vsri.32 d18, d16, #16
  216. vsri.32 d19, d17, #16
  217. vld1.64 {d16-d17},[r4,:128]!
  218. vcvt.s32.f32 q8, q8, #16
  219. vst1.32 {d18[0]}, [r8], ip
  220. vsri.32 d22, d20, #16
  221. vst1.32 {d18[1]}, [r8], ip
  222. vsri.32 d23, d21, #16
  223. vst1.32 {d19[0]}, [r8], ip
  224. vst1.32 {d19[1]}, [r8], ip
  225. vld1.64 {d18-d19},[r5,:128]!
  226. vcvt.s32.f32 q9, q9, #16
  227. vst1.32 {d22[0]}, [r8], ip
  228. vst1.32 {d22[1]}, [r8], ip
  229. vld1.64 {d20-d21},[r4,:128]!
  230. vcvt.s32.f32 q10, q10, #16
  231. vst1.32 {d23[0]}, [r8], ip
  232. vst1.32 {d23[1]}, [r8], ip
  233. vld1.64 {d22-d23},[r5,:128]!
  234. vcvt.s32.f32 q11, q11, #16
  @ Loop, 16 frames per pass: store the 8 pending frames (q9/q11 after
  @ pairing) while loading, converting and pairing 8 more in q0-q3.
  235. 6: subs lr, lr, #16
  236. vld1.64 {d0-d1}, [r4,:128]!
  237. vcvt.s32.f32 q0, q0, #16
  238. vsri.32 d18, d16, #16
  239. vld1.64 {d2-d3}, [r5,:128]!
  240. vcvt.s32.f32 q1, q1, #16
  241. vsri.32 d19, d17, #16
  242. vld1.64 {d4-d5}, [r4,:128]!
  243. vcvt.s32.f32 q2, q2, #16
  244. vld1.64 {d6-d7}, [r5,:128]!
  245. vcvt.s32.f32 q3, q3, #16
  246. vst1.32 {d18[0]}, [r8], ip
  247. vsri.32 d22, d20, #16
  248. vst1.32 {d18[1]}, [r8], ip
  249. vsri.32 d23, d21, #16
  250. vst1.32 {d19[0]}, [r8], ip
  251. vsri.32 d2, d0, #16
  252. vst1.32 {d19[1]}, [r8], ip
  253. vsri.32 d3, d1, #16
  254. vst1.32 {d22[0]}, [r8], ip
  255. vsri.32 d6, d4, #16
  256. vst1.32 {d22[1]}, [r8], ip
  257. vsri.32 d7, d5, #16
  258. vst1.32 {d23[0]}, [r8], ip
  259. vst1.32 {d23[1]}, [r8], ip
  @ Counter reached zero: flush d2/d3/d6/d7 at the second "6:" below
  @ without loading anything further.
  260. beq 6f
  @ Otherwise store the second 8 frames interleaved with the preload of
  @ the next 8 samples per channel, and loop on the flags of the subs.
  261. vld1.64 {d16-d17},[r4,:128]!
  262. vcvt.s32.f32 q8, q8, #16
  263. vst1.32 {d2[0]}, [r8], ip
  264. vst1.32 {d2[1]}, [r8], ip
  265. vld1.64 {d18-d19},[r5,:128]!
  266. vcvt.s32.f32 q9, q9, #16
  267. vst1.32 {d3[0]}, [r8], ip
  268. vst1.32 {d3[1]}, [r8], ip
  269. vld1.64 {d20-d21},[r4,:128]!
  270. vcvt.s32.f32 q10, q10, #16
  271. vst1.32 {d6[0]}, [r8], ip
  272. vst1.32 {d6[1]}, [r8], ip
  273. vld1.64 {d22-d23},[r5,:128]!
  274. vcvt.s32.f32 q11, q11, #16
  275. vst1.32 {d7[0]}, [r8], ip
  276. vst1.32 {d7[1]}, [r8], ip
  277. bgt 6b
  @ Final flush of the last 8 frames of the pair, then go to the common
  @ bookkeeping at 8.
  278. 6: vst1.32 {d2[0]}, [r8], ip
  279. vst1.32 {d2[1]}, [r8], ip
  280. vst1.32 {d3[0]}, [r8], ip
  281. vst1.32 {d3[1]}, [r8], ip
  282. vst1.32 {d6[0]}, [r8], ip
  283. vst1.32 {d6[1]}, [r8], ip
  284. vst1.32 {d7[0]}, [r8], ip
  285. vst1.32 {d7[1]}, [r8], ip
  286. b 8f
  @ The count was exactly 8: pair and store the preloaded frames only.
  287. 7: vsri.32 d18, d16, #16
  288. vsri.32 d19, d17, #16
  289. vst1.32 {d18[0]}, [r8], ip
  290. vsri.32 d22, d20, #16
  291. vst1.32 {d18[1]}, [r8], ip
  292. vsri.32 d23, d21, #16
  293. vst1.32 {d19[0]}, [r8], ip
  294. vst1.32 {d19[1]}, [r8], ip
  295. vst1.32 {d22[0]}, [r8], ip
  296. vst1.32 {d22[1]}, [r8], ip
  297. vst1.32 {d23[0]}, [r8], ip
  298. vst1.32 {d23[1]}, [r8], ip
  @ Pair done: two channels fewer, r0 moves past their 4 bytes per frame.
  @ Return if no channel is left (the add keeps the subs flags intact),
  @ otherwise fall through to the last single channel.
  299. 8: subs r3, r3, #2
  300. add r0, r0, #4
  301. it eq
  302. popeq {r4-r8,pc}
  303. @ 1 channel
  @ r4 = source pointer, lr = sample counter, r5 = output cursor.
  @ Prologue: 8 samples into q0/q1. The tst flags reach "bne 8f": with
  @ bit 3 of the count set, one group of 8 is handled first at 8.
  304. 4: ldr r4, [r1],#4
  305. tst r2, #8
  306. mov lr, r2
  307. mov r5, r0
  308. vld1.64 {d0-d1}, [r4,:128]!
  309. vcvt.s32.f32 q0, q0, #16
  310. vld1.64 {d2-d3}, [r4,:128]!
  311. vcvt.s32.f32 q1, q1, #16
  312. bne 8f
  @ Loop, 16 samples per pass. Each store writes one 16-bit lane: lanes 1
  @ and 3 of a d register are the upper halves of its two 32-bit values,
  @ i.e. the integer parts. Stores are ip bytes apart with a 16-bit
  @ alignment hint.
  313. 6: subs lr, lr, #16
  314. vld1.64 {d4-d5}, [r4,:128]!
  315. vcvt.s32.f32 q2, q2, #16
  316. vld1.64 {d6-d7}, [r4,:128]!
  317. vcvt.s32.f32 q3, q3, #16
  318. vst1.16 {d0[1]}, [r5,:16], ip
  319. vst1.16 {d0[3]}, [r5,:16], ip
  320. vst1.16 {d1[1]}, [r5,:16], ip
  321. vst1.16 {d1[3]}, [r5,:16], ip
  322. vst1.16 {d2[1]}, [r5,:16], ip
  323. vst1.16 {d2[3]}, [r5,:16], ip
  324. vst1.16 {d3[1]}, [r5,:16], ip
  325. vst1.16 {d3[3]}, [r5,:16], ip
  @ Skip the preload when the counter has reached zero; the stores at 7
  @ run either way and "bgt 6b" then decides on the same subs flags.
  326. beq 7f
  327. vld1.64 {d0-d1}, [r4,:128]!
  328. vcvt.s32.f32 q0, q0, #16
  329. vld1.64 {d2-d3}, [r4,:128]!
  330. vcvt.s32.f32 q1, q1, #16
  331. 7: vst1.16 {d4[1]}, [r5,:16], ip
  332. vst1.16 {d4[3]}, [r5,:16], ip
  333. vst1.16 {d5[1]}, [r5,:16], ip
  334. vst1.16 {d5[3]}, [r5,:16], ip
  335. vst1.16 {d6[1]}, [r5,:16], ip
  336. vst1.16 {d6[3]}, [r5,:16], ip
  337. vst1.16 {d7[1]}, [r5,:16], ip
  338. vst1.16 {d7[3]}, [r5,:16], ip
  339. bgt 6b
  340. pop {r4-r8,pc}
  @ Leading group of 8 for counts with bit 3 set: store it, return if it
  @ was all there is, otherwise preload 8 more and enter the loop above.
  341. 8: subs lr, lr, #8
  342. vst1.16 {d0[1]}, [r5,:16], ip
  343. vst1.16 {d0[3]}, [r5,:16], ip
  344. vst1.16 {d1[1]}, [r5,:16], ip
  345. vst1.16 {d1[3]}, [r5,:16], ip
  346. vst1.16 {d2[1]}, [r5,:16], ip
  347. vst1.16 {d2[3]}, [r5,:16], ip
  348. vst1.16 {d3[1]}, [r5,:16], ip
  349. vst1.16 {d3[3]}, [r5,:16], ip
  350. it eq
  351. popeq {r4-r8,pc}
  352. vld1.64 {d0-d1}, [r4,:128]!
  353. vcvt.s32.f32 q0, q0, #16
  354. vld1.64 {d2-d3}, [r4,:128]!
  355. vcvt.s32.f32 q1, q1, #16
  356. b 6b
  357. endfunc
  @ ---------------------------------------------------------------------
  @ ff_int32_to_float_fmul_scalar_neon
  @
  @ Converts signed 32-bit integers to single-precision floats and
  @ multiplies every result by one scalar, eight elements per iteration.
  @
  @ Registers:
  @   r0  destination, 128-bit alignment hint, post-incremented
  @   r1  source, 128-bit alignment hint, post-incremented
  @   scalar / element count:
  @       lines tagged VFP   take them from d0[0] / r2
  @       lines tagged NOVFP take them from r2    / r3
  @       (VFP and NOVFP are macros from asm.S, presumably selecting
  @       hard-float versus soft-float argument passing -- TODO confirm)
  @   q0        scalar replicated into all four lanes
  @   q1, q2    raw integers
  @   q3, q8    converted floats
  @   q9, q10   scaled results waiting to be stored
  @
  @ The count is only tested for reaching exactly zero after subtracting
  @ 8, so it has to be a positive multiple of 8.
  @ NOTE(review): presumably f(float *dst, const int *src, float mul,
  @ int len) -- confirm against the C declaration.
  @ ---------------------------------------------------------------------
  function ff_int32_to_float_fmul_scalar_neon, export=1
  VFP     vdup.32         q0,  d0[0]
  VFP     remaining       .req    r2
  NOVFP   vdup.32         q0,  r2
  NOVFP   remaining       .req    r3

          @ Prologue: fetch and convert the first eight integers.
          vld1.32         {d2-d3},   [r1,:128]!
          vcvt.f32.s32    q3,  q1
          vld1.32         {d4-d5},   [r1,:128]!
          vcvt.f32.s32    q8,  q2

          @ Scale the eight converted values. The flags of the subs are
          @ still valid at the bne: pld and vmul do not modify them.
  1:      subs            remaining, remaining, #8
          pld             [r1, #16]
          vmul.f32        q9,  q3,  q0
          vmul.f32        q10, q8,  q0
          bne             2f

          @ Nothing left to fetch: write the final eight results.
          vst1.32         {d18-d19}, [r0,:128]!
          vst1.32         {d20-d21}, [r0,:128]!
          bx              lr

          @ More input pending: fetch and convert the next eight integers
          @ before storing the eight results just computed.
  2:      vld1.32         {d2-d3},   [r1,:128]!
          vcvt.f32.s32    q3,  q1
          vld1.32         {d4-d5},   [r1,:128]!
          vcvt.f32.s32    q8,  q2
          vst1.32         {d18-d19}, [r0,:128]!
          vst1.32         {d20-d21}, [r0,:128]!
          b               1b

          .unreq          remaining
  endfunc