You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

210 lines
7.3KB

  1. /*
  2. * ARM NEON optimised DSP functions
  3. * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  4. *
  5. * This file is part of Libav.
  6. *
  7. * Libav is free software; you can redistribute it and/or
  8. * modify it under the terms of the GNU Lesser General Public
  9. * License as published by the Free Software Foundation; either
  10. * version 2.1 of the License, or (at your option) any later version.
  11. *
  12. * Libav is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. * Lesser General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU Lesser General Public
  18. * License along with Libav; if not, write to the Free Software
  19. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. */
  21. #include "libavutil/arm/asm.S"
  /*
   * ff_clear_block_neon
   *
   * Writes 128 zero bytes starting at r0 (8 stores of 16 bytes each).
   *
   * In:    r0 = destination, 16-byte aligned (:128 qualifier on every store)
   * Out:   nothing; r0 is left advanced by 128
   * Clobb: r0, q0
   *
   * NOTE(review): 128 bytes is presumably one 8x8 block of 16-bit
   * coefficients; confirm against the C prototype.
   */
  function ff_clear_block_neon, export=1
          vmov.i16        q0,  #0                 @ d0 = d1 = 0
  .rept 8                                         @ 8 x 16 bytes = 128 bytes
          vst1.16         {d0-d1}, [r0,:128]!
  .endr
          bx              lr
  endfunc
  /*
   * ff_clear_blocks_neon
   *
   * Writes 768 zero bytes starting at r0: 6 * 8 stores of 16 bytes each,
   * i.e. six times the amount cleared by ff_clear_block_neon.
   *
   * In:    r0 = destination, 16-byte aligned (:128 qualifier on every store)
   * Out:   nothing; r0 is left advanced by 768
   * Clobb: r0, q0
   *
   * NOTE(review): presumably six consecutive 8x8 blocks of 16-bit
   * coefficients; confirm against the C prototype.
   */
  function ff_clear_blocks_neon, export=1
          vmov.i16        q0,  #0                 @ d0 = d1 = 0
  .rept 6 * 8                                     @ 6 groups of 8 x 16 bytes
          vst1.16         {d0-d1}, [r0,:128]!
  .endr
          bx              lr
  endfunc
  /*
   * ff_put_pixels_clamped_neon
   *
   * Reads 64 signed 16-bit values from r0, saturates each one to the
   * unsigned 8-bit range 0..255 (vqmovun) and writes the result as 8 rows
   * of 8 bytes, stepping the destination by r2 after every row.
   *
   * In:    r0 = source int16 values, 16-byte aligned, 128 bytes consumed
   *        r1 = destination bytes, every row 8-byte aligned (:64)
   *        r2 = destination stride in bytes
   * Clobb: r0, r1, q0-q3, q8-q15
   *
   * Loads, narrowing and stores are interleaved; the instruction order is
   * kept exactly as in the original.
   * NOTE(review): the interleaving is presumably there for pipeline
   * scheduling - confirm before reordering.
   */
  function ff_put_pixels_clamped_neon, export=1
          vld1.16         {q8-q9},   [r0,:128]!   @ rows 0-1
          vqmovun.s16     d0,  q8                 @ row 0 -> u8
          vld1.16         {q10-q11}, [r0,:128]!   @ rows 2-3
          vqmovun.s16     d1,  q9                 @ row 1 -> u8
          vld1.16         {q12-q13}, [r0,:128]!   @ rows 4-5
          vqmovun.s16     d2,  q10                @ row 2 -> u8
          vld1.16         {q14-q15}, [r0,:128]!   @ rows 6-7
          vqmovun.s16     d3,  q11                @ row 3 -> u8
          vst1.8          {d0}, [r1,:64], r2      @ store row 0
          vqmovun.s16     d4,  q12                @ row 4 -> u8
          vst1.8          {d1}, [r1,:64], r2      @ store row 1
          vqmovun.s16     d5,  q13                @ row 5 -> u8
          vst1.8          {d2}, [r1,:64], r2      @ store row 2
          vqmovun.s16     d6,  q14                @ row 6 -> u8
          vst1.8          {d3}, [r1,:64], r2      @ store row 3
          vqmovun.s16     d7,  q15                @ row 7 -> u8
          vst1.8          {d4}, [r1,:64], r2      @ store row 4
          vst1.8          {d5}, [r1,:64], r2      @ store row 5
          vst1.8          {d6}, [r1,:64], r2      @ store row 6
          vst1.8          {d7}, [r1,:64], r2      @ store row 7
          bx              lr
  endfunc
  /*
   * ff_put_signed_pixels_clamped_neon
   *
   * Reads 64 signed 16-bit values from r0, saturates each one to the
   * signed 8-bit range -128..127 (vqmovn), adds 128 modulo 256 so the
   * result lands in 0..255, and writes 8 rows of 8 bytes, stepping the
   * destination by r2 after every row.
   *
   * In:    r0 = source int16 values, 16-byte aligned, 128 bytes consumed
   *        r1 = destination bytes, every row 8-byte aligned (:64)
   *        r2 = destination stride in bytes
   * Clobb: r0, r1, q0-q3, q8-q13, d31
   *
   * q8 and q9 are reused for rows 2 and 3 once rows 0 and 1 have been
   * narrowed, so the instruction order is kept exactly as in the original.
   */
  function ff_put_signed_pixels_clamped_neon, export=1
          vmov.u8         d31, #128               @ bias added to every output byte
          vld1.16         {q8},  [r0,:128]!       @ row 0
          vqmovn.s16      d0,  q8
          vld1.16         {q9},  [r0,:128]!       @ row 1
          vqmovn.s16      d1,  q9
          vld1.16         {q8},  [r0,:128]!       @ row 2 (q8 reused)
          vqmovn.s16      d2,  q8
          vld1.16         {q9},  [r0,:128]!       @ row 3 (q9 reused)
          vadd.i8         d0,  d0,  d31
          vld1.16         {q10}, [r0,:128]!       @ row 4
          vadd.i8         d1,  d1,  d31
          vld1.16         {q11}, [r0,:128]!       @ row 5
          vadd.i8         d2,  d2,  d31
          vst1.8          {d0}, [r1,:64], r2      @ store row 0
          vqmovn.s16      d3,  q9
          vst1.8          {d1}, [r1,:64], r2      @ store row 1
          vqmovn.s16      d4,  q10
          vst1.8          {d2}, [r1,:64], r2      @ store row 2
          vqmovn.s16      d5,  q11
          vld1.16         {q12}, [r0,:128]!       @ row 6
          vadd.i8         d3,  d3,  d31
          vld1.16         {q13}, [r0,:128]!       @ row 7
          vadd.i8         d4,  d4,  d31
          vadd.i8         d5,  d5,  d31
          vst1.8          {d3}, [r1,:64], r2      @ store row 3
          vqmovn.s16      d6,  q12
          vst1.8          {d4}, [r1,:64], r2      @ store row 4
          vqmovn.s16      d7,  q13
          vst1.8          {d5}, [r1,:64], r2      @ store row 5
          vadd.i8         d6,  d6,  d31
          vadd.i8         d7,  d7,  d31
          vst1.8          {d6}, [r1,:64], r2      @ store row 6
          vst1.8          {d7}, [r1,:64], r2      @ store row 7
          bx              lr
  endfunc
  /*
   * ff_add_pixels_clamped_neon
   *
   * For each of 8 rows: loads 8 existing bytes from the pixel buffer,
   * zero-extends them and adds them to 8 signed 16-bit values read from
   * r0 (vaddw.u8), saturates the sums to 0..255 (vqmovun) and writes the
   * 8 bytes back to the same row.
   *
   * In:    r0 = int16 values to add, 16-byte aligned, 128 bytes consumed
   *        r1 = pixel buffer, read and written in place, rows 8-byte aligned
   *        r2 = pixel buffer stride in bytes
   * Clobb: r0, r1, r3, q0-q3, q8-q9
   *
   * r1 walks the rows for reading while r3 (a copy of the original r1)
   * trails behind for writing. q0-q3 and d16-d19 are recycled for rows 4-7
   * after rows 0-3 have been narrowed and stored, so the instruction order
   * is kept exactly as in the original.
   */
  function ff_add_pixels_clamped_neon, export=1
          mov             r3,  r1                 @ r3 = write pointer, r1 = read pointer
          vld1.8          {d16}, [r1,:64], r2     @ pixels row 0
          vld1.16         {q0},  [r0,:128]!       @ values row 0
          vaddw.u8        q0,  q0,  d16
          vld1.8          {d17}, [r1,:64], r2     @ pixels row 1
          vld1.16         {q1},  [r0,:128]!       @ values row 1
          vqmovun.s16     d0,  q0
          vld1.8          {d18}, [r1,:64], r2     @ pixels row 2
          vaddw.u8        q1,  q1,  d17
          vld1.16         {q2},  [r0,:128]!       @ values row 2
          vaddw.u8        q2,  q2,  d18
          vst1.8          {d0},  [r3,:64], r2     @ store row 0
          vqmovun.s16     d2,  q1
          vld1.8          {d19}, [r1,:64], r2     @ pixels row 3
          vld1.16         {q3},  [r0,:128]!       @ values row 3
          vaddw.u8        q3,  q3,  d19
          vqmovun.s16     d4,  q2
          vst1.8          {d2},  [r3,:64], r2     @ store row 1
          vld1.8          {d16}, [r1,:64], r2     @ pixels row 4
          vqmovun.s16     d6,  q3
          vld1.16         {q0},  [r0,:128]!       @ values row 4
          vaddw.u8        q0,  q0,  d16
          vst1.8          {d4},  [r3,:64], r2     @ store row 2
          vld1.8          {d17}, [r1,:64], r2     @ pixels row 5
          vld1.16         {q1},  [r0,:128]!       @ values row 5
          vaddw.u8        q1,  q1,  d17
          vst1.8          {d6},  [r3,:64], r2     @ store row 3
          vqmovun.s16     d0,  q0
          vld1.8          {d18}, [r1,:64], r2     @ pixels row 6
          vld1.16         {q2},  [r0,:128]!       @ values row 6
          vaddw.u8        q2,  q2,  d18
          vst1.8          {d0},  [r3,:64], r2     @ store row 4
          vqmovun.s16     d2,  q1
          vld1.8          {d19}, [r1,:64], r2     @ pixels row 7
          vqmovun.s16     d4,  q2
          vld1.16         {q3},  [r0,:128]!       @ values row 7
          vaddw.u8        q3,  q3,  d19
          vst1.8          {d2},  [r3,:64], r2     @ store row 5
          vqmovun.s16     d6,  q3
          vst1.8          {d4},  [r3,:64], r2     @ store row 6
          vst1.8          {d6},  [r3,:64], r2     @ store row 7
          bx              lr
  endfunc
  /*
   * ff_vector_clipf_neon
   *
   * Clamps 32-bit floats to a range, eight elements per loop iteration:
   *     dst[i] = max(min(src[i], upper), lower)
   *
   * In (both variants): r0 = dst, r1 = src, both 16-byte aligned (:128)
   * VFP lines:   lower = d0[0], upper = d0[1], r2 = element count
   * NOVFP lines: lower = r2,    upper = r3,    element count read from [sp]
   * NOTE(review): VFP/NOVFP are macros from libavutil/arm/asm.S and
   * presumably select hard-float vs soft-float argument passing - confirm
   * there.
   *
   * Clobb: r0, r1, r2, q0-q3, q8-q11, flags
   *
   * The loop is software pipelined: the load and min stage of the next
   * group of 8 is issued before the previous group is stored, and the
   * first group is loaded before the count is examined. The count is
   * expected to be a positive multiple of 8.
   *
   * The exit test is ble rather than beq. For a positive multiple of 8 the
   * two behave identically. For any other count (including 0) beq would
   * never see the counter hit exactly zero and the loop would keep running
   * through memory until the counter wrapped; ble stops once the count is
   * exhausted, after processing at least one group and rounding up to a
   * whole group of 8. This matches the bgt loop exits used by
   * ff_apply_window_int16_neon and ff_vector_clip_int32_neon.
   */
  function ff_vector_clipf_neon, export=1
  @ The upper bound is broadcast first on the VFP path: d0 holds both bounds
  @ and is the low half of q0, so writing q0 first would destroy d0[1].
  VFP     vdup.32         q1,  d0[1]
  VFP     vdup.32         q0,  d0[0]
  NOVFP   vdup.32         q0,  r2
  NOVFP   vdup.32         q1,  r3
  NOVFP   ldr             r2,  [sp]
          vld1.f32        {q2}, [r1,:128]!        @ prologue: first 8 elements
          vmin.f32        q10, q2,  q1
          vld1.f32        {q3}, [r1,:128]!
          vmin.f32        q11, q3,  q1
  1:      vmax.f32        q8,  q10, q0            @ finish the clamp of the current group
          vmax.f32        q9,  q11, q0
          subs            r2,  r2,  #8            @ r2 = elements left after this group
          ble             2f                      @ nothing left (or bad count): drain and return
          vld1.f32        {q2}, [r1,:128]!        @ start the next group ...
          vmin.f32        q10, q2,  q1
          vld1.f32        {q3}, [r1,:128]!
          vmin.f32        q11, q3,  q1
          vst1.f32        {q8}, [r0,:128]!        @ ... then store the finished one
          vst1.f32        {q9}, [r0,:128]!
          b               1b
  2:      vst1.f32        {q8}, [r0,:128]!        @ epilogue: store the last group
          vst1.f32        {q9}, [r0,:128]!
          bx              lr
  endfunc
  /*
   * ff_apply_window_int16_neon
   *
   * Multiplies a buffer of signed 16-bit samples by a window using
   * vqrdmulh.s16 (saturating, rounding, doubling multiply that keeps the
   * high half). The buffer is processed from both ends at once: each
   * iteration takes 8 samples from the front and 8 from the back, and the
   * back 8 are multiplied by the same 8 window coefficients in reversed
   * order. Only the first half of the window is therefore read.
   *
   * In:    r0 = output, r1 = input, r2 = window, all 16-byte aligned (:128)
   *        r3 = length in 16-bit elements; 16 are consumed per iteration
   * Clobb: r0-r3, r12, q0-q3, flags (r4 and lr are saved and restored)
   *
   * NOTE(review): the length is presumably required to be a positive
   * multiple of 16; the loop always runs at least once - confirm with the
   * callers.
   */
  function ff_apply_window_int16_neon, export=1
          push            {r4, lr}
          mov             r12, #-16               @ step for the back pointers, in bytes
          add             r4,  r1,  r3,  lsl #1   @ r4 = input  + len * 2
          add             lr,  r0,  r3,  lsl #1   @ lr = output + len * 2
          add             r4,  r4,  r12           @ r4 -> last 8 input samples
          add             lr,  lr,  r12           @ lr -> last 8 output samples
  1:
          vld1.16         {d0-d1}, [r1,:128]!     @ q0 = 8 samples from the front
          vld1.16         {d4-d5}, [r2,:128]!     @ q2 = 8 window coefficients
          vld1.16         {d2-d3}, [r4,:128], r12 @ q1 = 8 samples from the back
          vrev64.16       q3,  q2                 @ d6 = reversed d4, d7 = reversed d5
          vqrdmulh.s16    q0,  q0,  q2            @ front half: window in natural order
          vqrdmulh.s16    d2,  d2,  d7            @ back half: window mirrored, so the
          vqrdmulh.s16    d3,  d3,  d6            @ d-halves are swapped as well
          vst1.16         {d0-d1}, [r0,:128]!
          vst1.16         {d2-d3}, [lr,:128], r12
          subs            r3,  r3,  #16           @ 8 front + 8 back elements done
          bgt             1b
          pop             {r4, pc}
  endfunc
  /*
   * ff_vector_clip_int32_neon
   *
   * Clamps signed 32-bit integers to a range, eight elements per loop
   * iteration:
   *     dst[i] = max(min(src[i], upper), lower)
   *
   * In:    r0 = dst, r1 = src, both 16-byte aligned (:128)
   *        r2 = lower bound, r3 = upper bound
   *        [sp] = element count (first stack argument)
   * Clobb: r0, r1, r2, q0-q3, flags
   *
   * The loop body runs before the count is tested, so at least 8 elements
   * are always processed.
   * NOTE(review): the count is presumably a positive multiple of 8 -
   * confirm with the callers.
   */
  function ff_vector_clip_int32_neon, export=1
          vdup.32         q0,  r2                 @ q0 = lower bound in every lane
          vdup.32         q1,  r3                 @ q1 = upper bound in every lane
          ldr             r2,  [sp]               @ r2 = element count
  1:
          vld1.32         {d4-d7}, [r1,:128]!     @ q2, q3 = next 8 elements
          vmin.s32        q2,  q2,  q1
          vmax.s32        q2,  q2,  q0
          vmin.s32        q3,  q3,  q1
          vmax.s32        q3,  q3,  q0
          vst1.32         {d4-d7}, [r0,:128]!
          subs            r2,  r2,  #8
          bgt             1b
          bx              lr
  endfunc