/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"
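
@ Register usage below: r0 = destination block, r1 = source pixels,
@ r2 = line stride in bytes, r3 = block height in rows.

@ pixels16: copy a 16-byte-wide block, four rows per iteration; with
@ avg=1 the result is averaged into the existing destination contents.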
.macro  pixels16        rnd=1, avg=0
  .if \avg
        mov             r12, r0
  .endif
1:      vld1.8          {q0},  [r1], r2
        vld1.8          {q1},  [r1], r2
        vld1.8          {q2},  [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.8          {q3},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
  .if \avg
        vld1.8          {q8},  [r12,:128], r2
        vrhadd.u8       q0,  q0,  q8
        vld1.8          {q9},  [r12,:128], r2
        vrhadd.u8       q1,  q1,  q9
        vld1.8          {q10}, [r12,:128], r2
        vrhadd.u8       q2,  q2,  q10
        vld1.8          {q11}, [r12,:128], r2
        vrhadd.u8       q3,  q3,  q11
  .endif
        subs            r3,  r3,  #4
        vst1.64         {q0},  [r0,:128], r2
        vst1.64         {q1},  [r0,:128], r2
        vst1.64         {q2},  [r0,:128], r2
        vst1.64         {q3},  [r0,:128], r2
        bne             1b
        bx              lr
.endm
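
@ pixels16_x2: horizontal half-pel interpolation; each output pixel is
@ the average (avg) of a source pixel and its right-hand neighbour,
@ extracted with vext.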
.macro  pixels16_x2     rnd=1, avg=0
1:      vld1.8          {d0-d2}, [r1], r2
        vld1.8          {d4-d6}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vext.8          q1,  q0,  q1,  #1
        avg             q0,  q0,  q1
        vext.8          q3,  q2,  q3,  #1
        avg             q2,  q2,  q3
  .if \avg
        vld1.8          {q1},  [r0,:128], r2
        vld1.8          {q3},  [r0,:128]
        vrhadd.u8       q0,  q0,  q1
        vrhadd.u8       q2,  q2,  q3
        sub             r0,  r0,  r2
  .endif
        vst1.8          {q0},  [r0,:128], r2
        vst1.8          {q2},  [r0,:128], r2
        bne             1b
        bx              lr
.endm
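
@ pixels16_y2: vertical half-pel interpolation; each output row is the
@ average of two consecutive source rows.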
.macro  pixels16_y2     rnd=1, avg=0
        sub             r3,  r3,  #2
        vld1.8          {q0},  [r1], r2
        vld1.8          {q1},  [r1], r2
1:      subs            r3,  r3,  #2
        avg             q2,  q0,  q1
        vld1.8          {q0},  [r1], r2
        avg             q3,  q0,  q1
        vld1.8          {q1},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
  .if \avg
        vld1.8          {q8},  [r0,:128], r2
        vld1.8          {q9},  [r0,:128]
        vrhadd.u8       q2,  q2,  q8
        vrhadd.u8       q3,  q3,  q9
        sub             r0,  r0,  r2
  .endif
        vst1.8          {q2},  [r0,:128], r2
        vst1.8          {q3},  [r0,:128], r2
        bne             1b

        avg             q2,  q0,  q1
        vld1.8          {q0},  [r1], r2
        avg             q3,  q0,  q1
  .if \avg
        vld1.8          {q8},  [r0,:128], r2
        vld1.8          {q9},  [r0,:128]
        vrhadd.u8       q2,  q2,  q8
        vrhadd.u8       q3,  q3,  q9
        sub             r0,  r0,  r2
  .endif
        vst1.8          {q2},  [r0,:128], r2
        vst1.8          {q3},  [r0,:128], r2

        bx              lr
.endm
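
@ pixels16_xy2: diagonal half-pel interpolation; each output pixel is
@ the average of a 2x2 source neighbourhood, accumulated as 16-bit sums
@ (vaddl) and narrowed with a shift by 2 (shrn). The NRND-prefixed
@ instructions add the +1 bias used only by the no-rounding variant.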
.macro  pixels16_xy2    rnd=1, avg=0
        sub             r3,  r3,  #2
        vld1.8          {d0-d2}, [r1], r2
        vld1.8          {d4-d6}, [r1], r2
  NRND  vmov.i16        q13, #1
        pld             [r1]
        pld             [r1, r2]
        vext.8          q1,  q0,  q1,  #1
        vext.8          q3,  q2,  q3,  #1
        vaddl.u8        q8,  d0,  d2
        vaddl.u8        q10, d1,  d3
        vaddl.u8        q9,  d4,  d6
        vaddl.u8        q11, d5,  d7
1:      subs            r3,  r3,  #2
        vld1.8          {d0-d2}, [r1], r2
        vadd.u16        q12, q8,  q9
        pld             [r1]
  NRND  vadd.u16        q12, q12, q13
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1,  q10, q11
        shrn            d28, q12, #2
  NRND  vadd.u16        q1,  q1,  q13
        shrn            d29, q1,  #2
  .if \avg
        vld1.8          {q8},  [r0,:128]
        vrhadd.u8       q14, q14, q8
  .endif
        vaddl.u8        q8,  d0,  d30
        vld1.8          {d2-d4}, [r1], r2
        vaddl.u8        q10, d1,  d31
        vst1.8          {q14}, [r0,:128], r2
        vadd.u16        q12, q8,  q9
        pld             [r1, r2]
  NRND  vadd.u16        q12, q12, q13
        vext.8          q2,  q1,  q2,  #1
        vadd.u16        q0,  q10, q11
        shrn            d30, q12, #2
  NRND  vadd.u16        q0,  q0,  q13
        shrn            d31, q0,  #2
  .if \avg
        vld1.8          {q9},  [r0,:128]
        vrhadd.u8       q15, q15, q9
  .endif
        vaddl.u8        q9,  d2,  d4
        vaddl.u8        q11, d3,  d5
        vst1.8          {q15}, [r0,:128], r2
        bgt             1b

        vld1.8          {d0-d2}, [r1], r2
        vadd.u16        q12, q8,  q9
  NRND  vadd.u16        q12, q12, q13
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1,  q10, q11
        shrn            d28, q12, #2
  NRND  vadd.u16        q1,  q1,  q13
        shrn            d29, q1,  #2
  .if \avg
        vld1.8          {q8},  [r0,:128]
        vrhadd.u8       q14, q14, q8
  .endif
        vaddl.u8        q8,  d0,  d30
        vaddl.u8        q10, d1,  d31
        vst1.8          {q14}, [r0,:128], r2
        vadd.u16        q12, q8,  q9
  NRND  vadd.u16        q12, q12, q13
        vadd.u16        q0,  q10, q11
        shrn            d30, q12, #2
  NRND  vadd.u16        q0,  q0,  q13
        shrn            d31, q0,  #2
  .if \avg
        vld1.8          {q9},  [r0,:128]
        vrhadd.u8       q15, q15, q9
  .endif
        vst1.8          {q15}, [r0,:128], r2

        bx              lr
.endm
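
@ pixels8: 8-byte-wide version of pixels16.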
.macro  pixels8         rnd=1, avg=0
1:      vld1.8          {d0},  [r1], r2
        vld1.8          {d1},  [r1], r2
        vld1.8          {d2},  [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.8          {d3},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
  .if \avg
        vld1.8          {d4},  [r0,:64], r2
        vrhadd.u8       d0,  d0,  d4
        vld1.8          {d5},  [r0,:64], r2
        vrhadd.u8       d1,  d1,  d5
        vld1.8          {d6},  [r0,:64], r2
        vrhadd.u8       d2,  d2,  d6
        vld1.8          {d7},  [r0,:64], r2
        vrhadd.u8       d3,  d3,  d7
        sub             r0,  r0,  r2,  lsl #2
  .endif
        subs            r3,  r3,  #4
        vst1.8          {d0},  [r0,:64], r2
        vst1.8          {d1},  [r0,:64], r2
        vst1.8          {d2},  [r0,:64], r2
        vst1.8          {d3},  [r0,:64], r2
        bne             1b
        bx              lr
.endm
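
@ pixels8_x2: 8-byte-wide horizontal half-pel interpolation.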
.macro  pixels8_x2      rnd=1, avg=0
1:      vld1.8          {q0},  [r1], r2
        vext.8          d1,  d0,  d1,  #1
        vld1.8          {q1},  [r1], r2
        vext.8          d3,  d2,  d3,  #1
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vswp            d1,  d2
        avg             q0,  q0,  q1
  .if \avg
        vld1.8          {d4},  [r0,:64], r2
        vld1.8          {d5},  [r0,:64]
        vrhadd.u8       q0,  q0,  q2
        sub             r0,  r0,  r2
  .endif
        vst1.8          {d0},  [r0,:64], r2
        vst1.8          {d1},  [r0,:64], r2
        bne             1b
        bx              lr
.endm
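
@ pixels8_y2: 8-byte-wide vertical half-pel interpolation.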
.macro  pixels8_y2      rnd=1, avg=0
        sub             r3,  r3,  #2
        vld1.8          {d0},  [r1], r2
        vld1.8          {d1},  [r1], r2
1:      subs            r3,  r3,  #2
        avg             d4,  d0,  d1
        vld1.8          {d0},  [r1], r2
        avg             d5,  d0,  d1
        vld1.8          {d1},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
  .if \avg
        vld1.8          {d2},  [r0,:64], r2
        vld1.8          {d3},  [r0,:64]
        vrhadd.u8       q2,  q2,  q1
        sub             r0,  r0,  r2
  .endif
        vst1.8          {d4},  [r0,:64], r2
        vst1.8          {d5},  [r0,:64], r2
        bne             1b

        avg             d4,  d0,  d1
        vld1.8          {d0},  [r1], r2
        avg             d5,  d0,  d1
  .if \avg
        vld1.8          {d2},  [r0,:64], r2
        vld1.8          {d3},  [r0,:64]
        vrhadd.u8       q2,  q2,  q1
        sub             r0,  r0,  r2
  .endif
        vst1.8          {d4},  [r0,:64], r2
        vst1.8          {d5},  [r0,:64], r2

        bx              lr
.endm
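
@ pixels8_xy2: 8-byte-wide diagonal half-pel interpolation.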
.macro  pixels8_xy2     rnd=1, avg=0
        sub             r3,  r3,  #2
        vld1.8          {q0},  [r1], r2
        vld1.8          {q1},  [r1], r2
  NRND  vmov.i16        q11, #1
        pld             [r1]
        pld             [r1, r2]
        vext.8          d4,  d0,  d1,  #1
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q8,  d0,  d4
        vaddl.u8        q9,  d2,  d6
1:      subs            r3,  r3,  #2
        vld1.8          {q0},  [r1], r2
        pld             [r1]
        vadd.u16        q10, q8,  q9
        vext.8          d4,  d0,  d1,  #1
  NRND  vadd.u16        q10, q10, q11
        vaddl.u8        q8,  d0,  d4
        shrn            d5,  q10, #2
        vld1.8          {q1},  [r1], r2
        vadd.u16        q10, q8,  q9
        pld             [r1, r2]
  .if \avg
        vld1.8          {d7},  [r0,:64]
        vrhadd.u8       d5,  d5,  d7
  .endif
  NRND  vadd.u16        q10, q10, q11
        vst1.8          {d5},  [r0,:64], r2
        shrn            d7,  q10, #2
  .if \avg
        vld1.8          {d5},  [r0,:64]
        vrhadd.u8       d7,  d7,  d5
  .endif
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q9,  d2,  d6
        vst1.8          {d7},  [r0,:64], r2
        bgt             1b

        vld1.8          {q0},  [r1], r2
        vadd.u16        q10, q8,  q9
        vext.8          d4,  d0,  d1,  #1
  NRND  vadd.u16        q10, q10, q11
        vaddl.u8        q8,  d0,  d4
        shrn            d5,  q10, #2
        vadd.u16        q10, q8,  q9
  .if \avg
        vld1.8          {d7},  [r0,:64]
        vrhadd.u8       d5,  d5,  d7
  .endif
  NRND  vadd.u16        q10, q10, q11
        vst1.8          {d5},  [r0,:64], r2
        shrn            d7,  q10, #2
  .if \avg
        vld1.8          {d5},  [r0,:64]
        vrhadd.u8       d7,  d7,  d5
  .endif
        vst1.8          {d7},  [r0,:64], r2

        bx              lr
.endm
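
@ pixfunc: instantiate ff_<pfx><name><suf>_neon from one of the macros
@ above. rnd selects rounding (vrhadd.u8/vrshrn.u16) or truncating
@ (vhadd.u8/vshrn.u16) definitions of the avg/shrn helpers, and NRND
@ expands its argument only in the no-rounding case.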
.macro  pixfunc pfx, name, suf, rnd=1, avg=0
  .if \rnd
    .macro avg  rd, rn, rm
        vrhadd.u8       \rd, \rn, \rm
    .endm
    .macro shrn rd, rn, rm
        vrshrn.u16      \rd, \rn, \rm
    .endm
    .macro NRND insn:vararg
    .endm
  .else
    .macro avg  rd, rn, rm
        vhadd.u8        \rd, \rn, \rm
    .endm
    .macro shrn rd, rn, rm
        vshrn.u16       \rd, \rn, \rm
    .endm
    .macro NRND insn:vararg
        \insn
    .endm
  .endif
function ff_\pfx\name\suf\()_neon, export=1
        \name           \rnd, \avg
endfunc
.purgem avg
.purgem shrn
.purgem NRND
.endm
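
@ pixfunc2: emit both the rounding and the _no_rnd variant of a function.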
.macro  pixfunc2 pfx, name, avg=0
        pixfunc         \pfx, \name,          rnd=1, avg=\avg
        pixfunc         \pfx, \name, _no_rnd, rnd=0, avg=\avg
.endm
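
@ The h264 qpel mc00 (full-pel) entry points only set the height in r3
@ and then fall through into the put/avg pixels function emitted
@ directly after them.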
function ff_put_h264_qpel16_mc00_neon, export=1
        mov             r3, #16
endfunc
        pixfunc         put_, pixels16,     avg=0
        pixfunc2        put_, pixels16_x2,  avg=0
        pixfunc2        put_, pixels16_y2,  avg=0
        pixfunc2        put_, pixels16_xy2, avg=0

function ff_avg_h264_qpel16_mc00_neon, export=1
        mov             r3, #16
endfunc
        pixfunc         avg_, pixels16,     avg=1
        pixfunc2        avg_, pixels16_x2,  avg=1
        pixfunc2        avg_, pixels16_y2,  avg=1
        pixfunc2        avg_, pixels16_xy2, avg=1

function ff_put_h264_qpel8_mc00_neon, export=1
        mov             r3, #8
endfunc
        pixfunc         put_, pixels8,     avg=0
        pixfunc2        put_, pixels8_x2,  avg=0
        pixfunc2        put_, pixels8_y2,  avg=0
        pixfunc2        put_, pixels8_xy2, avg=0

function ff_avg_h264_qpel8_mc00_neon, export=1
        mov             r3, #8
endfunc
        pixfunc         avg_, pixels8,     avg=1
        pixfunc         avg_, pixels8_x2,  avg=1
        pixfunc         avg_, pixels8_y2,  avg=1
        pixfunc         avg_, pixels8_xy2, avg=1