You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

412 lines
14KB

  1. /*
  2. * Copyright (c) 2012 Mans Rullgard
  3. *
  4. * This file is part of FFmpeg.
  5. *
  6. * FFmpeg is free software; you can redistribute it and/or
  7. * modify it under the terms of the GNU Lesser General Public
  8. * License as published by the Free Software Foundation; either
  9. * version 2.1 of the License, or (at your option) any later version.
  10. *
  11. * FFmpeg is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. * Lesser General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Lesser General Public
  17. * License along with FFmpeg; if not, write to the Free Software
  18. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19. */
  20. #include "libavutil/arm/asm.S"
  /*
   * ff_sbr_sum64x5_neon
   *
   * In:  r0 = pointer to floats; the :128 hints require 16-byte alignment.
   *
   * For each of the first 64 floats at r0, adds the floats found 64, 128,
   * 192 and 256 elements further on and writes the total back in place,
   * four floats per pass (16 passes).
   *
   * Clobbers r0-r3, r12, q0-q3, q8 and the flags.  lr serves as a fifth
   * pointer, so it is pushed on entry and popped straight into pc.
   */
  function ff_sbr_sum64x5_neon, export=1
          push            {lr}
          mov             r12, #64                /* floats still to produce */
          add             r1,  r0,  #64*4         /* the four source rows that */
          add             r2,  r0,  #128*4        /* get folded into row 0     */
          add             r3,  r0,  #192*4
          add             lr,  r0,  #256*4
  1:
          vld1.32         {q0},     [r0,:128]     /* no writeback: stored here below */
          vld1.32         {q1},     [r1,:128]!
          vld1.32         {q2},     [r2,:128]!
          vld1.32         {q3},     [r3,:128]!
          vld1.32         {q8},     [lr,:128]!
          vadd.f32        q0,  q0,  q1            /* summation order is fixed:   */
          vadd.f32        q0,  q0,  q2            /* (((row0+row1)+row2)+row3)   */
          vadd.f32        q0,  q0,  q3            /*                     +row4   */
          vadd.f32        q0,  q0,  q8
          vst1.32         {q0},     [r0,:128]!
          subs            r12, r12, #4
          bgt             1b
          pop             {pc}
  endfunc
  /*
   * ff_sbr_sum_square_neon
   *
   * In:  r0 = pointer to floats (16-byte aligned per the :128 hint)
   *      r1 = count; every pass consumes four floats and lowers the count
   *           by 2 (presumably a count of complex pairs -- confirm against
   *           the C prototype)
   * Out: the sum of the squares of every float read, in d0[0].  The
   *      NOVFP-prefixed line additionally copies it to r0 (NOVFP is a
   *      macro from asm.S; presumably active only for soft-float returns).
   *
   * The loop is bottom-tested, so four floats are read even if r1 <= 0.
   */
  function ff_sbr_sum_square_neon, export=1
          vmov.f32        q0,  #0.0               /* four partial sums */
  1:
          vld1.32         {q1},     [r0,:128]!
          subs            r1,  r1,  #2            /* the NEON op below leaves flags alone */
          vmla.f32        q0,  q1,  q1
          bgt             1b
          vadd.f32        d0,  d0,  d1            /* fold four lanes into two */
          vpadd.f32       d0,  d0,  d0            /* ... and two into one     */
  NOVFP   vmov.32         r0,  d0[0]
          bx              lr
  endfunc
  /*
   * ff_sbr_neg_odd_64_neon
   *
   * In:  r0 = pointer to 64 floats, 16-byte aligned (:128 hints).
   *
   * Flips the sign bit of every odd-indexed float in place.  vld2 splits
   * eight floats into even-index lanes (q0/q2) and odd-index lanes (q1/q3);
   * the odd lanes are XORed with 0x80000000 and vst2 re-interleaves them.
   * Two register pairs are kept in flight so that loads (through r0) run
   * ahead of stores (through r1): 2 + 3*2 = 8 loads cover the 64 floats.
   */
  function ff_sbr_neg_odd_64_neon, export=1
          vmov.i32        q8,  #1<<31             /* sign-bit mask */
          mov             r1,  r0                 /* r1 = store cursor */
          vld2.32         {q0,q1},  [r0,:128]!
          veor            q1,  q1,  q8
          vld2.32         {q2,q3},  [r0,:128]!
  .rept 3
          veor            q3,  q3,  q8
          vst2.32         {q0,q1},  [r1,:128]!
          vld2.32         {q0,q1},  [r0,:128]!
          veor            q1,  q1,  q8
          vst2.32         {q2,q3},  [r1,:128]!
          vld2.32         {q2,q3},  [r0,:128]!
  .endr
          veor            q3,  q3,  q8            /* drain the pipeline */
          vst2.32         {q0,q1},  [r1,:128]!
          vst2.32         {q2,q3},  [r1,:128]!
          bx              lr
  endfunc
  /*
   * ff_sbr_qmf_pre_shuffle_neon
   *
   * In:  r0 = z, pointer to floats (call it z[0..127]).
   *
   * Writes z[64..127] from z[0..63]:
   *   - z[64], z[65] are plain copies of z[0], z[1];
   *   - after that the output alternates a sign-flipped float taken while
   *     walking downward from z[63] with an unmodified float taken while
   *     walking upward from z[2].
   * r1 is the downward cursor (16-byte steps of r3 = -16), r0 the upward
   * cursor and r2 the store cursor.  The store cursor starts at z+66, which
   * is why the vst2 instructions only claim 64-bit alignment.
   *
   * The loop runs three times (r12 = 24, 8 per pass) emitting 16 floats per
   * pass; the tail emits the last 8 + 4 + 2 floats.
   */
  function ff_sbr_qmf_pre_shuffle_neon, export=1
          mov             r3,  #-16               /* byte step of the downward cursor */
          mov             r12, #24
          vmov.i32        q8,  #1<<31             /* sign-bit mask */
          add             r1,  r0,  #60*4
          add             r2,  r0,  #64*4
          vld1.32         {d0},     [r0,:64]!     /* z[0], z[1] ...           */
          vst1.32         {d0},     [r2,:64]!     /* ... copied to z[64], z[65] */
          vld1.32         {q0},     [r1,:128], r3 /* prime the downward stream */
          vld1.32         {d2},     [r0,:64]!     /* prime the upward stream   */
  1:
          vld1.32         {d3,d4},  [r0,:128]!
          vrev64.32       q0,  q0                 /* vrev64 + vswp together     */
          vld1.32         {q9},     [r1,:128], r3
          veor            q0,  q0,  q8            /* negate                     */
          vld1.32         {d5,d6},  [r0,:128]!
          vswp            d0,  d1                 /* reverse all four lanes     */
          vrev64.32       q9,  q9
          vst2.32         {q0,q1},  [r2,:64]!     /* interleave down/up streams */
          vmov            q10, q2
          veor            q9,  q9,  q8
          vmov            d2,  d6                 /* carry two upward floats over */
          vswp            d18, d19
          vld1.32         {q0},     [r1,:128], r3 /* preload for the next pass  */
          vst2.32         {q9,q10}, [r2,:64]!
          subs            r12, r12, #8
          bgt             1b
          /* Tail: same pattern, but only 14 floats are left to write. */
          vld1.32         {d3,d4},  [r0,:128]!
          vrev64.32       q0,  q0
          vld1.32         {q9},     [r1,:128], r3
          veor            q0,  q0,  q8
          vld1.32         {d5},     [r0,:64]!
          vswp            d0,  d1
          vrev64.32       q9,  q9
          vst2.32         {q0,q1},  [r2,:64]!
          vswp            d4,  d5
          veor            q1,  q9,  q8
          vst2.32         {d3,d5},  [r2,:64]!
          vst2.32         {d2[0],d4[0]}, [r2,:64]!
          bx              lr
  endfunc
  /*
   * ff_sbr_qmf_post_shuffle_neon
   *
   * In:  r0 = destination, 16-byte aligned; receives 64 floats
   *      r1 = source z, 16-byte aligned; z[0..63] are read
   *
   * Output pair k (k = 0..31) is { -z[63-k], z[k] }.  r2 walks downward
   * from z+60 in 16-byte steps, r1 walks upward.  Each pass handles two
   * groups of four pairs and preloads the registers for the following
   * pass, so the last pass also performs one extra load per stream.
   */
  function ff_sbr_qmf_post_shuffle_neon, export=1
          vmov.i32        q8,  #1<<31             /* sign-bit mask */
          mov             r12, #32                /* output pairs remaining */
          mov             r3,  #-16
          add             r2,  r1,  #60*4
          vld1.32         {q0},     [r2,:128], r3
          vld1.32         {q1},     [r1,:128]!
  1:
          pld             [r2, #-32]
          vswp            d2,  d3                 /* vst2 below emits the high half first */
          vrev64.32       q0,  q0
          veor            q0,  q0,  q8
          vld1.32         {q2},     [r2,:128], r3
          vld1.32         {q3},     [r1,:128]!
          vst2.32         {d1,d3},  [r0,:128]!
          vst2.32         {d0,d2},  [r0,:128]!
          pld             [r2, #-32]
          vswp            d6,  d7
          vrev64.32       q2,  q2
          veor            q2,  q2,  q8
          vld1.32         {q0},     [r2,:128], r3 /* preload for the next pass */
          vld1.32         {q1},     [r1,:128]!
          vst2.32         {d5,d7},  [r0,:128]!
          vst2.32         {d4,d6},  [r0,:128]!
          subs            r12, r12, #8
          bgt             1b
          bx              lr
  endfunc
  /*
   * ff_sbr_qmf_deint_neg_neon
   *
   * In:  r0 = destination v, 64 floats written
   *      r1 = source, 64 floats read, 16-byte aligned
   *
   * Reads the source four floats at a time from the top (src+60) downward.
   * vld2 separates even-index and odd-index floats:
   *   - the odd-index pair is lane-swapped and appended at the front of v
   *     (r0, ascending);
   *   - the even-index pair is sign-flipped and written at the back of v
   *     (r2, descending from v+62).
   * 16 passes (r12 = 32, 2 per pass).
   */
  function ff_sbr_qmf_deint_neg_neon, export=1
          vmov.i32        d2,  #1<<31             /* sign-bit mask */
          mov             r12, #32
          mov             r3,  #-16               /* source step, bytes */
          add             r2,  r0,  #62*4
          add             r1,  r1,  #60*4
  1:
          vld2.32         {d0,d1},  [r1,:128], r3
          vrev64.32       d1,  d1
          veor            d0,  d0,  d2
          vst1.32         {d1},     [r0,:64]!
          vst1.32         {d0},     [r2,:64]
          subs            r12, r12, #2
          sub             r2,  r2,  #8            /* plain sub: flags stay for bgt */
          bgt             1b
          bx              lr
  endfunc
  /*
   * ff_sbr_qmf_deint_bfly_neon
   *
   * In:  r0 = destination v, 128 floats written, 16-byte aligned
   *      r1 = first source (read ascending), 64 floats
   *      r2 = second source (read descending from +60), 64 floats
   *
   * For i = 0..63:  v[i]       = src0[i] - src1[63 - i]
   *                 v[127 - i] = src0[i] + src1[63 - i]
   * Four values of i are handled per pass; the vrev64 copies plus the
   * crossed d-register operands supply the element reversal.
   * lr holds the -16 byte step, hence the push/pop.
   */
  function ff_sbr_qmf_deint_bfly_neon, export=1
          push            {lr}
          mov             lr,  #-16
          mov             r12, #64
          add             r3,  r0,  #124*4        /* descending store cursor */
          add             r2,  r2,  #60*4
  1:
          vld1.32         {q0},     [r1,:128]!
          vld1.32         {q1},     [r2,:128], lr
          vrev64.32       q3,  q1
          vrev64.32       q2,  q0
          vsub.f32        d0,  d0,  d7            /* differences -> front half */
          vsub.f32        d1,  d1,  d6
          vadd.f32        d3,  d4,  d3            /* sums -> back half */
          vadd.f32        d2,  d5,  d2
          vst1.32         {q1},     [r3,:128], lr
          vst1.32         {q0},     [r0,:128]!
          subs            r12, r12, #4
          bgt             1b
          pop             {pc}
  endfunc
  /*
   * ff_sbr_hf_g_filt_neon
   *
   * In:  r0   = destination, one float pair per element
   *      r1   = source base; consecutive elements are 40*2*4 bytes apart
   *      r2   = per-element gains (one float each)
   *      r3   = element count
   *      [sp] = index selecting the pair inside each source row (scaled by 8)
   *
   * Each output pair is the selected source pair multiplied by that
   * element's gain.  Two elements are produced per pass; if the count is
   * odd one more pair is produced after the loop.
   *
   * NOTE(review): operands for the next pass are loaded before the count
   * is tested, so up to two gains and one source row beyond the processed
   * range are read (never used).  Confirm callers' buffers allow this.
   */
  function ff_sbr_hf_g_filt_neon, export=1
          sub             r3,  r3,  #1            /* bias so the loop leaves 0 on an odd count */
          ldr             r12, [sp]
          add             r1,  r1,  r12, lsl #3   /* step to the selected pair */
          mov             r12, #40*2*4            /* row stride in bytes */
          vld2.32         {d2[],d3[]},[r2,:64]!   /* d2 = gain 0 x2, d3 = gain 1 x2 */
          vld1.32         {d0},     [r1,:64], r12
  1:
          vld1.32         {d1},     [r1,:64], r12
          vmul.f32        q3,  q0,  q1
          vld2.32         {d2[],d3[]},[r2,:64]!
          vld1.32         {d0},     [r1,:64], r12
          vst1.32         {q3},     [r0,:64]!
          subs            r3,  r3,  #2
          bgt             1b
          it              lt                      /* IT prefix for Thumb-2 builds */
          bxlt            lr                      /* even count: finished */
          vmul.f32        d0,  d0,  d2            /* odd count: last element */
          vst1.32         {d0},     [r0,:64]!
          bx              lr
  endfunc
  /*
   * ff_sbr_hf_gen_neon
   *
   * In:  r0 = destination (float pairs), r1 = source (float pairs)
   *      r2, r3 = pointers to one float pair each (coefficients)
   *      one float scale factor: taken from d0[0] on the VFP line, or
   *        from the first stack word on the NOVFP line
   *      two ints on the stack after it, loaded into r2/r3 by the ldrd;
   *        the first is a start index, the second an end index (the loop
   *        count is their difference)
   *
   * Set-up: d1 = scale (both lanes), d0 = scale^2; after the q multiply
   * d0 = scale^2 * pair[r3] and d1 = scale * pair[r2].  The vtrn/vneg
   * sequence then rearranges these into the lane patterns the multiply-
   * accumulate chain needs.
   *
   * Loop: two output pairs per pass.  Each output starts from the current
   * source pair and accumulates products with the two preceding source
   * pairs (the source cursor is rewound by two pairs before the loop).
   * The accumulation order is significant for rounding; do not reorder.
   */
  function ff_sbr_hf_gen_neon, export=1
  NOVFP   vld1.32         {d1[]},   [sp,:32]
  VFP     vdup.32         d1,  d0[0]
          vmul.f32        d0,  d1,  d1
          vld1.32         {d3},     [r2,:64]
          vld1.32         {d2},     [r3,:64]
          vmul.f32        q0,  q0,  q1
          ldrd            r2,  r3,  [sp, #4*!HAVE_VFP_ARGS] /* skip the float slot if it is on the stack */
          vtrn.32         d0,  d1
          vneg.f32        d18, d1
          vtrn.32         d18, d1
          add             r0,  r0,  r2,  lsl #3   /* both cursors to the start index */
          add             r1,  r1,  r2,  lsl #3
          sub             r1,  r1,  #2*8          /* rewind source by two pairs */
          sub             r3,  r3,  r2            /* pairs to generate */
          vld1.32         {q1},     [r1,:128]!
  1:
          vld1.32         {q3},     [r1,:128]!
          vrev64.32       q2,  q1
          vmov            q8,  q3                 /* keep the unmodified input for the next pass */
          vrev64.32       d20, d3
          vrev64.32       d21, d6
          vmla.f32        q3,  q1,  d0[0]
          vmla.f32        d6,  d4,  d18
          vmla.f32        d7,  d20, d18
          vmla.f32        d6,  d3,  d0[1]
          vmla.f32        d7,  d16, d0[1]
          vmla.f32        d6,  d5,  d1
          vmla.f32        d7,  d21, d1
          vmov            q1,  q8
          vst1.32         {q3},     [r0,:128]!
          subs            r3,  r3,  #2
          bgt             1b
          bx              lr
  endfunc
  /*
   * ff_sbr_autocorrelate_neon
   *
   * In:  r0 = input floats, 16-byte aligned; 80 floats are read in total
   *           (one quad before the loop, 18 in the loop, one after)
   *      r1 = output, 16-byte aligned
   *
   * Accumulates products between each quad and its neighbours (q1, q3)
   * and a running sum of squares (q10), then combines them after the loop.
   * Results are written at these byte offsets from the original r1:
   *      0..15   q3
   *      16..19  d20[1]
   *      24..31  d0
   *      40..43  d20[0]
   * Bytes 20..23, 32..39 and 44.. are not written here.
   *
   * The multiply-accumulate order determines rounding; the sequence is
   * kept exactly as is.
   */
  function ff_sbr_autocorrelate_neon, export=1
          vld1.32         {q0},     [r0,:128]!
          vmov.f32        q1,  #0.0
          vmov.f32        q3,  #0.0
          vmov.f32        d20, #0.0
          vmul.f32        d21, d1,  d1
          vmov            q8,  q0                 /* first quad, needed again after the loop */
          vmov            q11, q0
          mov             r12, #36
  1:
          vld1.32         {q2},     [r0,:128]!
          vrev64.32       q12, q2                 /* lane-swapped copy for the cross terms */
          vmla.f32        q10, q2,  q2
          vmla.f32        d2,  d1,  d4
          vmla.f32        d3,  d1,  d24
          vmla.f32        d6,  d0,  d4
          vmla.f32        d7,  d0,  d24
          vmla.f32        d2,  d4,  d5
          vmla.f32        d3,  d4,  d25
          vmla.f32        d6,  d1,  d5
          vmla.f32        d7,  d1,  d25
          vmov            q0,  q2                 /* current quad becomes the previous one */
          subs            r12, r12, #2
          bgt             1b
          /* Final quad, then fold the accumulators. */
          vld1.32         {q2},     [r0,:128]!
          vrev64.32       q12, q2
          vmla.f32        d2,  d1,  d4
          vmla.f32        d3,  d1,  d24
          vmla.f32        d6,  d0,  d4
          vmla.f32        d7,  d0,  d24
          vadd.f32        d20, d20, d21
          vrev64.32       d18, d17
          vmla.f32        d6,  d1,  d5
          vmla.f32        d7,  d1,  d25
          vmov            q0,  q1
          vmla.f32        d0,  d16, d17
          vmla.f32        d1,  d16, d18
          vmla.f32        d2,  d4,  d5
          vmla.f32        d3,  d4,  d25
          vneg.f32        s15, s15                /* s15 is the upper lane of d7 */
          vmov            d21, d20
          vpadd.f32       d0,  d0,  d2
          vpadd.f32       d7,  d6,  d7
          vtrn.32         d1,  d3
          vsub.f32        d6,  d1,  d3
          vmla.f32        d20, d22, d22
          vmla.f32        d21, d4,  d4
          vtrn.32         d0,  d6
          vpadd.f32       d20, d20, d21
          vst1.32         {q3},     [r1,:128]!
          vst1.32         {d20[1]}, [r1,:32]
          add             r1,  r1,  #2*4
          vst1.32         {d0},     [r1,:64]
          add             r1,  r1,  #4*4
          vst1.32         {d20[0]}, [r1,:32]
          bx              lr
  endfunc
  /*
   * ff_sbr_hf_apply_noise_0_neon
   *
   * In:  r0     = float pairs, updated in place (8-byte aligned)
   *      r1     = one float per element; where non-zero it is added
   *               (after the sign mask) to the FIRST float of the pair
   *      r2     = one float per element; gain applied to the noise table
   *               entry when the r1 value is zero
   *      r3     = noise index; advanced by one (mod 512) before each use
   *      [sp+4] = element count (read as [sp,#12] after the 8-byte push)
   *
   * Per element:  if (r1 value == 0)  pair += gain * noise_table[index]
   *               else                pair[0] += r1 value ^ sign mask
   *
   * d3 holds the sign mask in both lanes and must stay intact until the
   * function returns.  .Lhf_apply_noise_0 is a secondary entry for code
   * that has already set d3 to a different mask.
   *
   * The main loop handles two elements per pass; an odd count is finished
   * by the scalar tail.
   *
   * Fix: the tail used to load the gain into d3, destroying the sign mask
   * just before it was applied, so the non-zero branch added the bitwise
   * XOR of the r1 value and the gain.  The gain now goes to d18, matching
   * the main loop.
   *
   * NOTE(review): the main loop reads two consecutive table entries; with
   * an index of 511 that includes entry 512.  Confirm the table is padded
   * or that the index cannot be odd here.
   */
  function ff_sbr_hf_apply_noise_0_neon, export=1
          vmov.i32        d3,  #0                 /* mask 0: add the r1 value unchanged */
  .Lhf_apply_noise_0:
          push            {r4,lr}
          movrelx         r4,  X(ff_sbr_noise_table)
          ldr             r12, [sp, #12]          /* element count */
          add             r3,  r3,  #1
          bfc             r3,  #9,  #23           /* keep the low 9 bits (mod 512) */
          sub             r12, r12, #1            /* bias: loop exits with 0 on an odd count */
  1:
          add             lr,  r4,  r3,  lsl #3   /* 8 bytes per table entry */
          vld2.32         {q0},     [r0,:64]      /* d0 = first floats, d1 = second floats */
          vld2.32         {q3},     [lr,:64]      /* same split for two noise entries */
          vld1.32         {d2},     [r1,:64]!
          vld1.32         {d18},    [r2,:64]!
          vceq.f32        d16, d2,  #0            /* lanes where the noise branch applies */
          veor            d2,  d2,  d3
          vmov            q2,  q0
          vmla.f32        d0,  d6,  d18           /* noise branch */
          vmla.f32        d1,  d7,  d18
          vadd.f32        d4,  d4,  d2            /* non-zero branch (first floats only) */
          add             r3,  r3,  #2
          bfc             r3,  #9,  #23
          vbif            d0,  d4,  d16           /* take q2 where the r1 value was non-zero */
          vbif            d1,  d5,  d16
          vst2.32         {q0},     [r0,:64]!
          subs            r12, r12, #2
          bgt             1b
          blt             2f                      /* even count: nothing left */
          /* Odd count: one last element. */
          add             lr,  r4,  r3,  lsl #3
          vld1.32         {d0},     [r0,:64]      /* d0 = { first, second } */
          vld1.32         {d6},     [lr,:64]
          vld1.32         {d2[]},   [r1,:32]!
          vld1.32         {d18[]},  [r2,:32]!     /* gain into d18 so d3 keeps the sign mask */
          vceq.f32        d4,  d2,  #0
          veor            d2,  d2,  d3
          vmov            d1,  d0
          vmla.f32        d0,  d6,  d18
          vadd.f32        s2,  s2,  s4            /* d1[0] += d2[0]: first float only */
          vbif            d0,  d1,  d4
          vst1.32         {d0},     [r0,:64]!
  2:
          pop             {r4,pc}
  endfunc
  /*
   * ff_sbr_hf_apply_noise_1_neon
   *
   * In:  r0     = float pairs, updated in place (8-byte aligned)
   *      r1     = one float per element; where non-zero it is added
   *               (after the sign mask) to the SECOND float of the pair
   *      r2     = one float per element; gain applied to the noise table
   *               entry when the r1 value is zero
   *      r3     = noise index; advanced by one (mod 512) before each use
   *      [sp]   = integer whose bit 0 selects the sign pattern
   *      [sp+4] = element count (read as [sp,#12] after the 8-byte push)
   *
   * Per element:  if (r1 value == 0)  pair += gain * noise_table[index]
   *               else                pair[1] += r1 value ^ sign mask
   *
   * d3 = { bit0 << 31, (bit0 << 31) ^ 0x80000000 }: lane 0 is the sign
   * mask for even-numbered elements, lane 1 the opposite mask for odd
   * ones.  .Lhf_apply_noise_1 is a secondary entry for code that has
   * already pushed {r4,lr} and set d3.
   *
   * Fix: the scalar tail processes the last element of an odd count, i.e.
   * an even-numbered element, so it must use the lane-0 mask.  It used to
   * add s5 (lane 1 of the masked value), which has the opposite sign; it
   * now adds s4.
   *
   * NOTE(review): the main loop reads two consecutive table entries; with
   * an index of 511 that includes entry 512.  Confirm the table is padded
   * or that the index cannot be odd here.
   */
  function ff_sbr_hf_apply_noise_1_neon, export=1
          ldr             r12, [sp]               /* read before the push moves sp */
          push            {r4,lr}
          lsl             r12, r12, #31           /* bit 0 -> sign bit */
          eor             lr,  r12, #1<<31
          vmov            d3,  r12, lr
  .Lhf_apply_noise_1:
          movrelx         r4,  X(ff_sbr_noise_table)
          ldr             r12, [sp, #12]          /* element count */
          add             r3,  r3,  #1
          bfc             r3,  #9,  #23           /* keep the low 9 bits (mod 512) */
          sub             r12, r12, #1            /* bias: loop exits with 0 on an odd count */
  1:
          add             lr,  r4,  r3,  lsl #3   /* 8 bytes per table entry */
          vld2.32         {q0},     [r0,:64]      /* d0 = first floats, d1 = second floats */
          vld2.32         {q3},     [lr,:64]
          vld1.32         {d2},     [r1,:64]!
          vld1.32         {d18},    [r2,:64]!
          vceq.f32        d16, d2,  #0            /* lanes where the noise branch applies */
          veor            d2,  d2,  d3            /* alternate the sign between the two lanes */
          vmov            q2,  q0
          vmla.f32        d0,  d6,  d18           /* noise branch */
          vmla.f32        d1,  d7,  d18
          vadd.f32        d5,  d5,  d2            /* non-zero branch (second floats only) */
          add             r3,  r3,  #2
          bfc             r3,  #9,  #23
          vbif            d0,  d4,  d16           /* take q2 where the r1 value was non-zero */
          vbif            d1,  d5,  d16
          vst2.32         {q0},     [r0,:64]!
          subs            r12, r12, #2
          bgt             1b
          blt             2f                      /* even count: nothing left */
          /* Odd count: one last (even-numbered) element. */
          add             lr,  r4,  r3,  lsl #3
          vld1.32         {d0},     [r0,:64]      /* d0 = { first, second } */
          vld1.32         {d6},     [lr,:64]
          vld1.32         {d2[]},   [r1,:32]!
          vld1.32         {d18[]},  [r2,:32]!
          vceq.f32        d4,  d2,  #0
          veor            d2,  d2,  d3            /* s4 = even-element sign, s5 = odd */
          vmov            d1,  d0
          vmla.f32        d0,  d6,  d18
          vadd.f32        s3,  s3,  s4            /* d1[1] += d2[0]: second float, even sign */
          vbif            d0,  d1,  d4
          vst1.32         {d0},     [r0,:64]!
  2:
          pop             {r4,pc}
  endfunc
  /*
   * ff_sbr_hf_apply_noise_2_neon
   *
   * Sets d3 to 0x80000000 in both lanes and continues at
   * .Lhf_apply_noise_0, the shared body inside
   * ff_sbr_hf_apply_noise_0_neon.  That body XORs d3 into the per-element
   * value before adding it, so this variant adds the negated value.
   * Arguments are passed through untouched.
   */
  function ff_sbr_hf_apply_noise_2_neon, export=1
          vmov.i32        d3,  #1<<31             /* sign-bit mask, both lanes */
          b               .Lhf_apply_noise_0
  endfunc
  /*
   * ff_sbr_hf_apply_noise_3_neon
   *
   * Builds the sign mask d3 from bit 0 of the first stack argument and
   * continues at .Lhf_apply_noise_1, the shared body inside
   * ff_sbr_hf_apply_noise_1_neon (which expects {r4,lr} to be pushed
   * already).  Compared with that function's own prologue the two lanes
   * of d3 are exchanged: lane 0 gets the inverted bit, lane 1 the bit
   * itself.
   */
  function ff_sbr_hf_apply_noise_3_neon, export=1
          ldr             r12, [sp]               /* must precede the push: sp-relative */
          push            {r4,lr}
          lsl             r12, r12, #31           /* bit 0 -> sign bit */
          eor             lr,  r12, #1<<31        /* the opposite sign */
          vmov            d3,  lr,  r12
          b               .Lhf_apply_noise_1
  endfunc