@ NOTE(review): removed repository-browser scrape residue here (topic-picker
@ help text and the "275 lines / 8.7KB" banner) — it was never part of this
@ assembly source.
/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "asm.S"                @ project helpers; provides the `function`/
                                @ `.endfunc` wrappers used below — TODO confirm

        preserve8               @ RVCT-style 8-byte stack-alignment annotation;
                                @ presumably defined (or stubbed) by asm.S — verify
        .fpu neon               @ enable NEON instruction encoding
        .text
@ pixels16: copy — or, with avg=1, average into — a 16-pixel-wide block.
@ In:  r0 = dst (16-byte aligned, see :128 hints), r1 = src, r2 = line stride,
@      r3 = row count (multiple of 4; 4 rows are processed per iteration).
@ avg=1 additionally reads the existing dst rows (via ip) and combines them
@ with the source using vrhadd.u8 (rounding halving add).
@ Clobbers: q0-q3 (and q8-q11, ip when avg=1), flags.
.macro pixels16 avg=0
.if \avg
        mov             ip, r0                  @ ip = read pointer over dst
.endif
1:      vld1.64         {d0, d1}, [r1], r2      @ q0..q3 = 4 source rows
        vld1.64         {d2, d3}, [r1], r2
        vld1.64         {d4, d5}, [r1], r2
        pld             [r1, r2, lsl #2]        @ prefetch upcoming source rows
        vld1.64         {d6, d7}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
.if \avg
        vld1.64         {d16,d17}, [ip], r2     @ load current dst rows and
        vrhadd.u8       q0, q0, q8              @ round-average them with src
        vld1.64         {d18,d19}, [ip], r2
        vrhadd.u8       q1, q1, q9
        vld1.64         {d20,d21}, [ip], r2
        vrhadd.u8       q2, q2, q10
        vld1.64         {d22,d23}, [ip], r2
        vrhadd.u8       q3, q3, q11
.endif
        subs            r3, r3, #4              @ 4 rows done
        vst1.64         {d0, d1}, [r0,:128], r2
        vst1.64         {d2, d3}, [r0,:128], r2
        vst1.64         {d4, d5}, [r0,:128], r2
        vst1.64         {d6, d7}, [r0,:128], r2
        bne             1b
        bx              lr
.endm
@ pixels16_x2: 16-wide put with a horizontal (1,1)/2 filter:
@ dst[x] = hadd(src[x], src[x+1]).  \vhadd selects rounding (default,
@ vrhadd.u8) or truncating (vhadd.u8, used by the _no_rnd variants below).
@ In: r0 = dst (16-byte aligned), r1 = src, r2 = stride, r3 = rows (even).
@ Clobbers: q0-q3, flags.
.macro pixels16_x2 vhadd=vrhadd.u8
1:      vld1.64         {d0-d2}, [r1], r2       @ 24 bytes: src[x+1] needed at x=15
        vld1.64         {d4-d6}, [r1], r2       @ second row, same layout
        pld             [r1]
        pld             [r1, r2]
        subs            r3, r3, #2              @ two rows per iteration
        vext.8          q1, q0, q1, #1          @ q1 = row shifted one byte (src+1)
        \vhadd          q0, q0, q1              @ halving add of src / src+1
        vext.8          q3, q2, q3, #1
        \vhadd          q2, q2, q3
        vst1.64         {d0, d1}, [r0,:128], r2
        vst1.64         {d4, d5}, [r0,:128], r2
        bne             1b
        bx              lr
.endm
@ pixels16_y2: 16-wide put with a vertical (1,1)/2 filter:
@ dst row n = hadd(src row n, src row n+1).  r1 walks the even source rows
@ and ip the odd ones, each stepping by 2*stride (lr), so q0/q1 always hold
@ two adjacent rows; two output rows are produced per iteration.
@ In: r0 = dst (16-byte aligned), r1 = src, r2 = stride, r3 = rows (even).
@ Clobbers: q0-q3, ip, lr, flags.
.macro pixels16_y2 vhadd=vrhadd.u8
        push            {lr}                    @ lr reused as doubled stride
        add             ip, r1, r2              @ ip = src + stride (odd rows)
        lsl             lr, r2, #1              @ lr = 2 * stride
        vld1.64         {d0, d1}, [r1], lr      @ q0 = row n
        vld1.64         {d2, d3}, [ip], lr      @ q1 = row n+1
1:      subs            r3, r3, #2
        \vhadd          q2, q0, q1              @ out row n   = hadd(n, n+1)
        vld1.64         {d0, d1}, [r1], lr      @ q0 = row n+2
        \vhadd          q3, q0, q1              @ out row n+1 = hadd(n+1, n+2)
        vld1.64         {d2, d3}, [ip], lr      @ q1 = row n+3
        pld             [r1]
        pld             [ip]
        vst1.64         {d4, d5}, [r0,:128], r2
        vst1.64         {d6, d7}, [r0,:128], r2
        bne             1b
        pop             {pc}                    @ return (lr was pushed)
.endm
@ pixels16_xy2: 16-wide put with a 2x2 box filter:
@ dst[x] = (a + b + c + d + r) >> 2 with a,b adjacent pixels of one row and
@ c,d of the row below.  Default \vshrn = vrshrn.u16 rounds the shift (r=2);
@ the no_rnd=1 variant adds an explicit +1 bias (q13) and is invoked with the
@ truncating vshrn.u16 (r=1) — see the pixfunc2 expansions at file end.
@ Pipeline: q8/q10 (resp. q9/q11) hold the widened horizontal pair sums
@ a+b of the current even (resp. odd) source row; each loop iteration adds
@ the two row sums, narrows with >>2, and refreshes one row's pair sums.
@ In: r0 = dst (16-byte aligned), r1 = src, r2 = stride, r3 = rows (even).
@ Clobbers: q0-q3, q8-q15 (q13 when no_rnd), ip, lr, flags.
.macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0
        push            {lr}
        lsl             lr, r2, #1              @ lr = 2 * stride
        add             ip, r1, r2              @ r1 = even rows, ip = odd rows
        vld1.64         {d0-d2}, [r1], lr       @ row 0: 24 bytes (need src[x+1])
        vld1.64         {d4-d6}, [ip], lr       @ row 1
.if \no_rnd
        vmov.i16        q13, #1                 @ +1 bias for truncating shift
.endif
        pld             [r1]
        pld             [ip]
        vext.8          q1, q0, q1, #1          @ q1 = row 0 shifted one pixel
        vext.8          q3, q2, q3, #1          @ q3 = row 1 shifted one pixel
        vaddl.u8        q8, d0, d2              @ q8/q10 = row 0 pair sums (u16)
        vaddl.u8        q10, d1, d3
        vaddl.u8        q9, d4, d6              @ q9/q11 = row 1 pair sums
        vaddl.u8        q11, d5, d7
1:      subs            r3, r3, #2              @ two output rows per iteration
        vld1.64         {d0-d2}, [r1], lr       @ next even row
        vadd.u16        q12, q8, q9             @ 4-pixel sums, left half
        pld             [r1]
.if \no_rnd
        vadd.u16        q12, q12, q13
.endif
        vext.8          q15, q0, q1, #1         @ shifted copy of the new row
        vadd.u16        q1 , q10, q11           @ 4-pixel sums, right half
        \vshrn          d28, q12, #2            @ narrow to u8: (sum [+1]) >> 2
.if \no_rnd
        vadd.u16        q1, q1, q13
.endif
        \vshrn          d29, q1, #2
        vaddl.u8        q8, d0, d30             @ refresh even pair sums (q15=d30/d31)
        vld1.64         {d2-d4}, [ip], lr       @ next odd row
        vaddl.u8        q10, d1, d31
        vst1.64         {d28,d29}, [r0,:128], r2 @ first output row
        vadd.u16        q12, q8, q9             @ sums for the second output row
        pld             [ip]
.if \no_rnd
        vadd.u16        q12, q12, q13
.endif
        vext.8          q2, q1, q2, #1          @ shifted copy of the odd row
        vadd.u16        q0, q10, q11
        \vshrn          d30, q12, #2
.if \no_rnd
        vadd.u16        q0, q0, q13
.endif
        \vshrn          d31, q0, #2
        vaddl.u8        q9, d2, d4              @ refresh odd pair sums
        vaddl.u8        q11, d3, d5
        vst1.64         {d30,d31}, [r0,:128], r2 @ second output row
        bgt             1b
        pop             {pc}
.endm
@ pixels8: plain copy of an 8-pixel-wide block.
@ In: r0 = dst (8-byte aligned, see :64 hints), r1 = src, r2 = stride,
@     r3 = row count (multiple of 4; 4 rows per iteration).
@ Clobbers: d0-d3, flags.
.macro pixels8
1:      vld1.64         {d0}, [r1], r2          @ d0..d3 = 4 source rows
        vld1.64         {d1}, [r1], r2
        vld1.64         {d2}, [r1], r2
        pld             [r1, r2, lsl #2]        @ prefetch upcoming source rows
        vld1.64         {d3}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
        subs            r3, r3, #4              @ 4 rows done
        vst1.64         {d0}, [r0,:64], r2
        vst1.64         {d1}, [r0,:64], r2
        vst1.64         {d2}, [r0,:64], r2
        vst1.64         {d3}, [r0,:64], r2
        bne             1b
        bx              lr
.endm
@ pixels8_x2: 8-wide put with a horizontal (1,1)/2 filter:
@ dst[x] = hadd(src[x], src[x+1]); \vhadd = rounding (default) or truncating.
@ In: r0 = dst (8-byte aligned), r1 = src, r2 = stride, r3 = rows (even).
@ Clobbers: q0-q1, flags.
.macro pixels8_x2 vhadd=vrhadd.u8
1:      vld1.64         {d0, d1}, [r1], r2      @ 16 bytes: src[x+1] needed at x=7
        vext.8          d1, d0, d1, #1          @ d1 = row 0 shifted one byte
        vld1.64         {d2, d3}, [r1], r2
        vext.8          d3, d2, d3, #1          @ d3 = row 1 shifted one byte
        pld             [r1]
        pld             [r1, r2]
        subs            r3, r3, #2              @ two rows per iteration
        vswp            d1, d2                  @ q0={row0,row1}, q1={shifted rows}
        \vhadd          q0, q0, q1              @ average both rows in one op
        vst1.64         {d0}, [r0,:64], r2
        vst1.64         {d1}, [r0,:64], r2
        bne             1b
        bx              lr
.endm
@ pixels8_y2: 8-wide put with a vertical (1,1)/2 filter:
@ dst row n = hadd(src row n, src row n+1).  Same two-pointer scheme as
@ pixels16_y2: r1 = even rows, ip = odd rows, both stepping 2*stride (lr).
@ In: r0 = dst (8-byte aligned), r1 = src, r2 = stride, r3 = rows (even).
@ Clobbers: d0-d1, d4-d5, ip, lr, flags.
.macro pixels8_y2 vhadd=vrhadd.u8
        push            {lr}                    @ lr reused as doubled stride
        add             ip, r1, r2              @ ip = src + stride (odd rows)
        lsl             lr, r2, #1              @ lr = 2 * stride
        vld1.64         {d0}, [r1], lr          @ d0 = row n
        vld1.64         {d1}, [ip], lr          @ d1 = row n+1
1:      subs            r3, r3, #2
        \vhadd          d4, d0, d1              @ out row n   = hadd(n, n+1)
        vld1.64         {d0}, [r1], lr          @ d0 = row n+2
        \vhadd          d5, d0, d1              @ out row n+1 = hadd(n+1, n+2)
        vld1.64         {d1}, [ip], lr          @ d1 = row n+3
        pld             [r1]
        pld             [ip]
        vst1.64         {d4}, [r0,:64], r2
        vst1.64         {d5}, [r0,:64], r2
        bne             1b
        pop             {pc}                    @ return (lr was pushed)
.endm
@ pixels8_xy2: 8-wide put with a 2x2 box filter:
@ dst[x] = (a + b + c + d + r) >> 2 (a,b adjacent in one row; c,d below).
@ Default \vshrn = vrshrn.u16 rounds (r=2); no_rnd=1 adds a +1 bias (q11)
@ and is invoked with truncating vshrn.u16 (r=1) — see pixfunc2 at file end.
@ q8 holds the widened a+b pair sums of the current even row, q9 of the
@ odd row; each iteration emits two output rows and refreshes each in turn.
@ In: r0 = dst (8-byte aligned), r1 = src, r2 = stride, r3 = rows (even).
@ Clobbers: q0-q3, q8-q11, ip, lr, flags.
.macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0
        push            {lr}
        lsl             lr, r2, #1              @ lr = 2 * stride
        add             ip, r1, r2              @ r1 = even rows, ip = odd rows
        vld1.64         {d0, d1}, [r1], lr      @ row 0: 16 bytes (need src[x+1])
        vld1.64         {d2, d3}, [ip], lr      @ row 1
.if \no_rnd
        vmov.i16        q11, #1                 @ +1 bias for truncating shift
.endif
        pld             [r1]
        pld             [ip]
        vext.8          d4, d0, d1, #1          @ d4 = row 0 shifted one pixel
        vext.8          d6, d2, d3, #1          @ d6 = row 1 shifted one pixel
        vaddl.u8        q8, d0, d4              @ q8 = row 0 pair sums (u16)
        vaddl.u8        q9, d2, d6              @ q9 = row 1 pair sums
1:      subs            r3, r3, #2              @ two output rows per iteration
        vld1.64         {d0, d1}, [r1], lr      @ next even row
        pld             [r1]
        vadd.u16        q10, q8, q9             @ 4-pixel sums for output row
        vext.8          d4, d0, d1, #1
.if \no_rnd
        vadd.u16        q10, q10, q11
.endif
        vaddl.u8        q8, d0, d4              @ refresh even pair sums
        \vshrn          d5, q10, #2             @ narrow to u8: (sum [+1]) >> 2
        vld1.64         {d2, d3}, [ip], lr      @ next odd row
        vadd.u16        q10, q8, q9             @ sums for second output row
        pld             [ip]
.if \no_rnd
        vadd.u16        q10, q10, q11
.endif
        vst1.64         {d5}, [r0,:64], r2      @ first output row
        \vshrn          d7, q10, #2
        vext.8          d6, d2, d3, #1
        vaddl.u8        q9, d2, d6              @ refresh odd pair sums
        vst1.64         {d7}, [r0,:64], r2      @ second output row
        bgt             1b
        pop             {pc}
.endm
@ pixfunc: declare one exported entry point ff_<pfx><name><suf>_neon whose
@ body is a single expansion of the .macro called <name>, forwarding
@ <rnd_op> <args> (e.g. the vhadd/vshrn variant and the no_rnd flag).
@ The expanded macro supplies its own return (bx lr / pop {pc}).
.macro pixfunc pfx name suf rnd_op args:vararg
function ff_\pfx\name\suf\()_neon, export=1
        \name           \rnd_op \args
.endfunc
.endm
@ pixfunc2: emit both variants of a pixel function — the default one
@ (rounding, no suffix) and a second built from <args>, which below is
@ always "_no_rnd, <truncating op> [, 1]".
.macro pixfunc2 pfx name args:vararg
        pixfunc         \pfx \name
        pixfunc         \pfx \name \args
.endm
@ ---- exported entry points ------------------------------------------------
@ The h264 qpel*_mc00 cases are plain copies/averages: each wrapper only
@ loads the block height into r3 and then FALLS THROUGH (no return before
@ .endfunc) into the ff_{put,avg}_pixels* body generated immediately below,
@ so the ordering of these lines is load-bearing.
function ff_put_h264_qpel16_mc00_neon, export=1
        mov             r3, #16                 @ height, then fall through
.endfunc
        pixfunc         put_ pixels16
        pixfunc2        put_ pixels16_x2,  _no_rnd, vhadd.u8
        pixfunc2        put_ pixels16_y2,  _no_rnd, vhadd.u8
        pixfunc2        put_ pixels16_xy2, _no_rnd, vshrn.u16, 1
function ff_avg_h264_qpel16_mc00_neon, export=1
        mov             r3, #16                 @ height, then fall through
.endfunc
        pixfunc         avg_ pixels16,, 1       @ empty suffix; expands pixels16 avg=1
function ff_put_h264_qpel8_mc00_neon, export=1
        mov             r3, #8                  @ height, then fall through
.endfunc
        pixfunc         put_ pixels8
        pixfunc2        put_ pixels8_x2,  _no_rnd, vhadd.u8
        pixfunc2        put_ pixels8_y2,  _no_rnd, vhadd.u8
        pixfunc2        put_ pixels8_xy2, _no_rnd, vshrn.u16, 1