@ (removed: web code-viewer banner and line/byte counts — not part of the source)
/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "asm.S"

        preserve8
        .fpu neon
        .text
  25. .macro pixels16 avg=0
  26. .if \avg
  27. mov ip, r0
  28. .endif
  29. 1: vld1.64 {d0, d1}, [r1], r2
  30. vld1.64 {d2, d3}, [r1], r2
  31. vld1.64 {d4, d5}, [r1], r2
  32. pld [r1, r2, lsl #2]
  33. vld1.64 {d6, d7}, [r1], r2
  34. pld [r1]
  35. pld [r1, r2]
  36. pld [r1, r2, lsl #1]
  37. .if \avg
  38. vld1.64 {d16,d17}, [ip], r2
  39. vrhadd.u8 q0, q0, q8
  40. vld1.64 {d18,d19}, [ip], r2
  41. vrhadd.u8 q1, q1, q9
  42. vld1.64 {d20,d21}, [ip], r2
  43. vrhadd.u8 q2, q2, q10
  44. vld1.64 {d22,d23}, [ip], r2
  45. vrhadd.u8 q3, q3, q11
  46. .endif
  47. subs r3, r3, #4
  48. vst1.64 {d0, d1}, [r0,:128], r2
  49. vst1.64 {d2, d3}, [r0,:128], r2
  50. vst1.64 {d4, d5}, [r0,:128], r2
  51. vst1.64 {d6, d7}, [r0,:128], r2
  52. bne 1b
  53. bx lr
  54. .endm
  55. .macro pixels16_x2 vhadd=vrhadd.u8
  56. 1: vld1.64 {d0-d2}, [r1], r2
  57. vld1.64 {d4-d6}, [r1], r2
  58. pld [r1]
  59. pld [r1, r2]
  60. subs r3, r3, #2
  61. vext.8 q1, q0, q1, #1
  62. \vhadd q0, q0, q1
  63. vext.8 q3, q2, q3, #1
  64. \vhadd q2, q2, q3
  65. vst1.64 {d0, d1}, [r0,:128], r2
  66. vst1.64 {d4, d5}, [r0,:128], r2
  67. bne 1b
  68. bx lr
  69. .endm
  70. .macro pixels16_y2 vhadd=vrhadd.u8
  71. push {lr}
  72. add ip, r1, r2
  73. lsl lr, r2, #1
  74. vld1.64 {d0, d1}, [r1], lr
  75. vld1.64 {d2, d3}, [ip], lr
  76. 1: subs r3, r3, #2
  77. \vhadd q2, q0, q1
  78. vld1.64 {d0, d1}, [r1], lr
  79. \vhadd q3, q0, q1
  80. vld1.64 {d2, d3}, [ip], lr
  81. pld [r1]
  82. pld [ip]
  83. vst1.64 {d4, d5}, [r0,:128], r2
  84. vst1.64 {d6, d7}, [r0,:128], r2
  85. bne 1b
  86. pop {pc}
  87. .endm
  88. .macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0
  89. push {lr}
  90. lsl lr, r2, #1
  91. add ip, r1, r2
  92. vld1.64 {d0-d2}, [r1], lr
  93. vld1.64 {d4-d6}, [ip], lr
  94. .if \no_rnd
  95. vmov.i16 q13, #1
  96. .endif
  97. pld [r1]
  98. pld [ip]
  99. vext.8 q1, q0, q1, #1
  100. vext.8 q3, q2, q3, #1
  101. vaddl.u8 q8, d0, d2
  102. vaddl.u8 q10, d1, d3
  103. vaddl.u8 q9, d4, d6
  104. vaddl.u8 q11, d5, d7
  105. 1: subs r3, r3, #2
  106. vld1.64 {d0-d2}, [r1], lr
  107. vadd.u16 q12, q8, q9
  108. pld [r1]
  109. .if \no_rnd
  110. vadd.u16 q12, q12, q13
  111. .endif
  112. vext.8 q15, q0, q1, #1
  113. vadd.u16 q1 , q10, q11
  114. \vshrn d28, q12, #2
  115. .if \no_rnd
  116. vadd.u16 q1, q1, q13
  117. .endif
  118. \vshrn d29, q1, #2
  119. vaddl.u8 q8, d0, d30
  120. vld1.64 {d2-d4}, [ip], lr
  121. vaddl.u8 q10, d1, d31
  122. vst1.64 {d28,d29}, [r0,:128], r2
  123. vadd.u16 q12, q8, q9
  124. pld [ip]
  125. .if \no_rnd
  126. vadd.u16 q12, q12, q13
  127. .endif
  128. vext.8 q2, q1, q2, #1
  129. vadd.u16 q0, q10, q11
  130. \vshrn d30, q12, #2
  131. .if \no_rnd
  132. vadd.u16 q0, q0, q13
  133. .endif
  134. \vshrn d31, q0, #2
  135. vaddl.u8 q9, d2, d4
  136. vaddl.u8 q11, d3, d5
  137. vst1.64 {d30,d31}, [r0,:128], r2
  138. bgt 1b
  139. pop {pc}
  140. .endm
  141. .macro pixels8
  142. 1: vld1.64 {d0}, [r1], r2
  143. vld1.64 {d1}, [r1], r2
  144. vld1.64 {d2}, [r1], r2
  145. pld [r1, r2, lsl #2]
  146. vld1.64 {d3}, [r1], r2
  147. pld [r1]
  148. pld [r1, r2]
  149. pld [r1, r2, lsl #1]
  150. subs r3, r3, #4
  151. vst1.64 {d0}, [r0,:64], r2
  152. vst1.64 {d1}, [r0,:64], r2
  153. vst1.64 {d2}, [r0,:64], r2
  154. vst1.64 {d3}, [r0,:64], r2
  155. bne 1b
  156. bx lr
  157. .endm
  158. .macro pixels8_x2 vhadd=vrhadd.u8
  159. 1: vld1.64 {d0, d1}, [r1], r2
  160. vext.8 d1, d0, d1, #1
  161. vld1.64 {d2, d3}, [r1], r2
  162. vext.8 d3, d2, d3, #1
  163. pld [r1]
  164. pld [r1, r2]
  165. subs r3, r3, #2
  166. vswp d1, d2
  167. \vhadd q0, q0, q1
  168. vst1.64 {d0}, [r0,:64], r2
  169. vst1.64 {d1}, [r0,:64], r2
  170. bne 1b
  171. bx lr
  172. .endm
  173. .macro pixels8_y2 vhadd=vrhadd.u8
  174. push {lr}
  175. add ip, r1, r2
  176. lsl lr, r2, #1
  177. vld1.64 {d0}, [r1], lr
  178. vld1.64 {d1}, [ip], lr
  179. 1: subs r3, r3, #2
  180. \vhadd d4, d0, d1
  181. vld1.64 {d0}, [r1], lr
  182. \vhadd d5, d0, d1
  183. vld1.64 {d1}, [ip], lr
  184. pld [r1]
  185. pld [ip]
  186. vst1.64 {d4}, [r0,:64], r2
  187. vst1.64 {d5}, [r0,:64], r2
  188. bne 1b
  189. pop {pc}
  190. .endm
  191. .macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0
  192. push {lr}
  193. lsl lr, r2, #1
  194. add ip, r1, r2
  195. vld1.64 {d0, d1}, [r1], lr
  196. vld1.64 {d2, d3}, [ip], lr
  197. .if \no_rnd
  198. vmov.i16 q11, #1
  199. .endif
  200. pld [r1]
  201. pld [ip]
  202. vext.8 d4, d0, d1, #1
  203. vext.8 d6, d2, d3, #1
  204. vaddl.u8 q8, d0, d4
  205. vaddl.u8 q9, d2, d6
  206. 1: subs r3, r3, #2
  207. vld1.64 {d0, d1}, [r1], lr
  208. pld [r1]
  209. vadd.u16 q10, q8, q9
  210. vext.8 d4, d0, d1, #1
  211. .if \no_rnd
  212. vadd.u16 q10, q10, q11
  213. .endif
  214. vaddl.u8 q8, d0, d4
  215. \vshrn d5, q10, #2
  216. vld1.64 {d2, d3}, [ip], lr
  217. vadd.u16 q10, q8, q9
  218. pld [ip]
  219. .if \no_rnd
  220. vadd.u16 q10, q10, q11
  221. .endif
  222. vst1.64 {d5}, [r0,:64], r2
  223. \vshrn d7, q10, #2
  224. vext.8 d6, d2, d3, #1
  225. vaddl.u8 q9, d2, d6
  226. vst1.64 {d7}, [r0,:64], r2
  227. bgt 1b
  228. pop {pc}
  229. .endm
  230. .macro pixfunc pfx name suf rnd_op args:vararg
  231. function ff_\pfx\name\suf\()_neon, export=1
  232. \name \rnd_op \args
  233. .endfunc
  234. .endm
  235. .macro pixfunc2 pfx name args:vararg
  236. pixfunc \pfx \name
  237. pixfunc \pfx \name \args
  238. .endm
  239. function ff_put_h264_qpel16_mc00_neon, export=1
  240. mov r3, #16
  241. .endfunc
  242. pixfunc put_ pixels16
  243. pixfunc2 put_ pixels16_x2, _no_rnd, vhadd.u8
  244. pixfunc2 put_ pixels16_y2, _no_rnd, vhadd.u8
  245. pixfunc2 put_ pixels16_xy2, _no_rnd, vshrn.u16, 1
  246. function ff_avg_h264_qpel16_mc00_neon, export=1
  247. mov r3, #16
  248. .endfunc
  249. pixfunc avg_ pixels16,, 1
  250. function ff_put_h264_qpel8_mc00_neon, export=1
  251. mov r3, #8
  252. .endfunc
  253. pixfunc put_ pixels8
  254. pixfunc2 put_ pixels8_x2, _no_rnd, vhadd.u8
  255. pixfunc2 put_ pixels8_y2, _no_rnd, vhadd.u8
  256. pixfunc2 put_ pixels8_xy2, _no_rnd, vshrn.u16, 1
  257. function ff_float_to_int16_neon, export=1
  258. subs r2, r2, #8
  259. vld1.64 {d0-d1}, [r1,:128]!
  260. vcvt.s32.f32 q8, q0, #16
  261. vld1.64 {d2-d3}, [r1,:128]!
  262. vcvt.s32.f32 q9, q1, #16
  263. beq 3f
  264. bics ip, r2, #15
  265. beq 2f
  266. 1: subs ip, ip, #16
  267. vshrn.s32 d4, q8, #16
  268. vld1.64 {d0-d1}, [r1,:128]!
  269. vcvt.s32.f32 q0, q0, #16
  270. vshrn.s32 d5, q9, #16
  271. vld1.64 {d2-d3}, [r1,:128]!
  272. vcvt.s32.f32 q1, q1, #16
  273. vshrn.s32 d6, q0, #16
  274. vst1.64 {d4-d5}, [r0,:128]!
  275. vshrn.s32 d7, q1, #16
  276. vld1.64 {d16-d17},[r1,:128]!
  277. vcvt.s32.f32 q8, q8, #16
  278. vld1.64 {d18-d19},[r1,:128]!
  279. vcvt.s32.f32 q9, q9, #16
  280. vst1.64 {d6-d7}, [r0,:128]!
  281. bne 1b
  282. ands r2, r2, #15
  283. beq 3f
  284. 2: vld1.64 {d0-d1}, [r1,:128]!
  285. vshrn.s32 d4, q8, #16
  286. vcvt.s32.f32 q0, q0, #16
  287. vld1.64 {d2-d3}, [r1,:128]!
  288. vshrn.s32 d5, q9, #16
  289. vcvt.s32.f32 q1, q1, #16
  290. vshrn.s32 d6, q0, #16
  291. vst1.64 {d4-d5}, [r0,:128]!
  292. vshrn.s32 d7, q1, #16
  293. vst1.64 {d6-d7}, [r0,:128]!
  294. bx lr
  295. 3: vshrn.s32 d4, q8, #16
  296. vshrn.s32 d5, q9, #16
  297. vst1.64 {d4-d5}, [r0,:128]!
  298. bx lr
  299. .endfunc
  300. function ff_float_to_int16_interleave_neon, export=1
  301. cmp r3, #2
  302. ldrlt r1, [r1]
  303. blt ff_float_to_int16_neon
  304. bne 4f
  305. ldr r3, [r1]
  306. ldr r1, [r1, #4]
  307. subs r2, r2, #8
  308. vld1.64 {d0-d1}, [r3,:128]!
  309. vcvt.s32.f32 q8, q0, #16
  310. vld1.64 {d2-d3}, [r3,:128]!
  311. vcvt.s32.f32 q9, q1, #16
  312. vld1.64 {d20-d21},[r1,:128]!
  313. vcvt.s32.f32 q10, q10, #16
  314. vld1.64 {d22-d23},[r1,:128]!
  315. vcvt.s32.f32 q11, q11, #16
  316. beq 3f
  317. bics ip, r2, #15
  318. beq 2f
  319. 1: subs ip, ip, #16
  320. vld1.64 {d0-d1}, [r3,:128]!
  321. vcvt.s32.f32 q0, q0, #16
  322. vsri.32 q10, q8, #16
  323. vld1.64 {d2-d3}, [r3,:128]!
  324. vcvt.s32.f32 q1, q1, #16
  325. vld1.64 {d24-d25},[r1,:128]!
  326. vcvt.s32.f32 q12, q12, #16
  327. vld1.64 {d26-d27},[r1,:128]!
  328. vsri.32 q11, q9, #16
  329. vst1.64 {d20-d21},[r0,:128]!
  330. vcvt.s32.f32 q13, q13, #16
  331. vst1.64 {d22-d23},[r0,:128]!
  332. vsri.32 q12, q0, #16
  333. vld1.64 {d16-d17},[r3,:128]!
  334. vsri.32 q13, q1, #16
  335. vst1.64 {d24-d25},[r0,:128]!
  336. vcvt.s32.f32 q8, q8, #16
  337. vld1.64 {d18-d19},[r3,:128]!
  338. vcvt.s32.f32 q9, q9, #16
  339. vld1.64 {d20-d21},[r1,:128]!
  340. vcvt.s32.f32 q10, q10, #16
  341. vld1.64 {d22-d23},[r1,:128]!
  342. vcvt.s32.f32 q11, q11, #16
  343. vst1.64 {d26-d27},[r0,:128]!
  344. bne 1b
  345. ands r2, r2, #15
  346. beq 3f
  347. 2: vsri.32 q10, q8, #16
  348. vld1.64 {d0-d1}, [r3,:128]!
  349. vcvt.s32.f32 q0, q0, #16
  350. vld1.64 {d2-d3}, [r3,:128]!
  351. vcvt.s32.f32 q1, q1, #16
  352. vld1.64 {d24-d25},[r1,:128]!
  353. vcvt.s32.f32 q12, q12, #16
  354. vsri.32 q11, q9, #16
  355. vld1.64 {d26-d27},[r1,:128]!
  356. vcvt.s32.f32 q13, q13, #16
  357. vst1.64 {d20-d21},[r0,:128]!
  358. vsri.32 q12, q0, #16
  359. vst1.64 {d22-d23},[r0,:128]!
  360. vsri.32 q13, q1, #16
  361. vst1.64 {d24-d27},[r0,:128]!
  362. bx lr
  363. 3: vsri.32 q10, q8, #16
  364. vsri.32 q11, q9, #16
  365. vst1.64 {d20-d23},[r0,:128]!
  366. bx lr
  367. 4: push {r4-r8,lr}
  368. cmp r3, #4
  369. lsl ip, r3, #1
  370. blt 4f
  371. @ 4 channels
  372. 5: ldmia r1!, {r4-r7}
  373. mov lr, r2
  374. mov r8, r0
  375. vld1.64 {d16-d17},[r4,:128]!
  376. vcvt.s32.f32 q8, q8, #16
  377. vld1.64 {d18-d19},[r5,:128]!
  378. vcvt.s32.f32 q9, q9, #16
  379. vld1.64 {d20-d21},[r6,:128]!
  380. vcvt.s32.f32 q10, q10, #16
  381. vld1.64 {d22-d23},[r7,:128]!
  382. vcvt.s32.f32 q11, q11, #16
  383. 6: subs lr, lr, #8
  384. vld1.64 {d0-d1}, [r4,:128]!
  385. vcvt.s32.f32 q0, q0, #16
  386. vsri.32 q9, q8, #16
  387. vld1.64 {d2-d3}, [r5,:128]!
  388. vcvt.s32.f32 q1, q1, #16
  389. vsri.32 q11, q10, #16
  390. vld1.64 {d4-d5}, [r6,:128]!
  391. vcvt.s32.f32 q2, q2, #16
  392. vzip.32 d18, d22
  393. vld1.64 {d6-d7}, [r7,:128]!
  394. vcvt.s32.f32 q3, q3, #16
  395. vzip.32 d19, d23
  396. vst1.64 {d18}, [r8], ip
  397. vsri.32 q1, q0, #16
  398. vst1.64 {d22}, [r8], ip
  399. vsri.32 q3, q2, #16
  400. vst1.64 {d19}, [r8], ip
  401. vzip.32 d2, d6
  402. vst1.64 {d23}, [r8], ip
  403. vzip.32 d3, d7
  404. beq 7f
  405. vld1.64 {d16-d17},[r4,:128]!
  406. vcvt.s32.f32 q8, q8, #16
  407. vst1.64 {d2}, [r8], ip
  408. vld1.64 {d18-d19},[r5,:128]!
  409. vcvt.s32.f32 q9, q9, #16
  410. vst1.64 {d6}, [r8], ip
  411. vld1.64 {d20-d21},[r6,:128]!
  412. vcvt.s32.f32 q10, q10, #16
  413. vst1.64 {d3}, [r8], ip
  414. vld1.64 {d22-d23},[r7,:128]!
  415. vcvt.s32.f32 q11, q11, #16
  416. vst1.64 {d7}, [r8], ip
  417. b 6b
  418. 7: vst1.64 {d2}, [r8], ip
  419. vst1.64 {d6}, [r8], ip
  420. vst1.64 {d3}, [r8], ip
  421. vst1.64 {d7}, [r8], ip
  422. subs r3, r3, #4
  423. popeq {r4-r8,pc}
  424. cmp r3, #4
  425. add r0, r0, #8
  426. bge 5b
  427. @ 2 channels
  428. 4: cmp r3, #2
  429. blt 4f
  430. ldmia r1!, {r4-r5}
  431. mov lr, r2
  432. mov r8, r0
  433. tst lr, #8
  434. vld1.64 {d16-d17},[r4,:128]!
  435. vcvt.s32.f32 q8, q8, #16
  436. vld1.64 {d18-d19},[r5,:128]!
  437. vcvt.s32.f32 q9, q9, #16
  438. vld1.64 {d20-d21},[r4,:128]!
  439. vcvt.s32.f32 q10, q10, #16
  440. vld1.64 {d22-d23},[r5,:128]!
  441. vcvt.s32.f32 q11, q11, #16
  442. beq 6f
  443. subs lr, lr, #8
  444. beq 7f
  445. vsri.32 d18, d16, #16
  446. vsri.32 d19, d17, #16
  447. vld1.64 {d16-d17},[r4,:128]!
  448. vcvt.s32.f32 q8, q8, #16
  449. vst1.32 {d18[0]}, [r8], ip
  450. vsri.32 d22, d20, #16
  451. vst1.32 {d18[1]}, [r8], ip
  452. vsri.32 d23, d21, #16
  453. vst1.32 {d19[0]}, [r8], ip
  454. vst1.32 {d19[1]}, [r8], ip
  455. vld1.64 {d18-d19},[r5,:128]!
  456. vcvt.s32.f32 q9, q9, #16
  457. vst1.32 {d22[0]}, [r8], ip
  458. vst1.32 {d22[1]}, [r8], ip
  459. vld1.64 {d20-d21},[r4,:128]!
  460. vcvt.s32.f32 q10, q10, #16
  461. vst1.32 {d23[0]}, [r8], ip
  462. vst1.32 {d23[1]}, [r8], ip
  463. vld1.64 {d22-d23},[r5,:128]!
  464. vcvt.s32.f32 q11, q11, #16
  465. 6: subs lr, lr, #16
  466. vld1.64 {d0-d1}, [r4,:128]!
  467. vcvt.s32.f32 q0, q0, #16
  468. vsri.32 d18, d16, #16
  469. vld1.64 {d2-d3}, [r5,:128]!
  470. vcvt.s32.f32 q1, q1, #16
  471. vsri.32 d19, d17, #16
  472. vld1.64 {d4-d5}, [r4,:128]!
  473. vcvt.s32.f32 q2, q2, #16
  474. vld1.64 {d6-d7}, [r5,:128]!
  475. vcvt.s32.f32 q3, q3, #16
  476. vst1.32 {d18[0]}, [r8], ip
  477. vsri.32 d22, d20, #16
  478. vst1.32 {d18[1]}, [r8], ip
  479. vsri.32 d23, d21, #16
  480. vst1.32 {d19[0]}, [r8], ip
  481. vsri.32 d2, d0, #16
  482. vst1.32 {d19[1]}, [r8], ip
  483. vsri.32 d3, d1, #16
  484. vst1.32 {d22[0]}, [r8], ip
  485. vsri.32 d6, d4, #16
  486. vst1.32 {d22[1]}, [r8], ip
  487. vsri.32 d7, d5, #16
  488. vst1.32 {d23[0]}, [r8], ip
  489. vst1.32 {d23[1]}, [r8], ip
  490. beq 6f
  491. vld1.64 {d16-d17},[r4,:128]!
  492. vcvt.s32.f32 q8, q8, #16
  493. vst1.32 {d2[0]}, [r8], ip
  494. vst1.32 {d2[1]}, [r8], ip
  495. vld1.64 {d18-d19},[r5,:128]!
  496. vcvt.s32.f32 q9, q9, #16
  497. vst1.32 {d3[0]}, [r8], ip
  498. vst1.32 {d3[1]}, [r8], ip
  499. vld1.64 {d20-d21},[r4,:128]!
  500. vcvt.s32.f32 q10, q10, #16
  501. vst1.32 {d6[0]}, [r8], ip
  502. vst1.32 {d6[1]}, [r8], ip
  503. vld1.64 {d22-d23},[r5,:128]!
  504. vcvt.s32.f32 q11, q11, #16
  505. vst1.32 {d7[0]}, [r8], ip
  506. vst1.32 {d7[1]}, [r8], ip
  507. bgt 6b
  508. 6: vst1.32 {d2[0]}, [r8], ip
  509. vst1.32 {d2[1]}, [r8], ip
  510. vst1.32 {d3[0]}, [r8], ip
  511. vst1.32 {d3[1]}, [r8], ip
  512. vst1.32 {d6[0]}, [r8], ip
  513. vst1.32 {d6[1]}, [r8], ip
  514. vst1.32 {d7[0]}, [r8], ip
  515. vst1.32 {d7[1]}, [r8], ip
  516. b 8f
  517. 7: vsri.32 d18, d16, #16
  518. vsri.32 d19, d17, #16
  519. vst1.32 {d18[0]}, [r8], ip
  520. vsri.32 d22, d20, #16
  521. vst1.32 {d18[1]}, [r8], ip
  522. vsri.32 d23, d21, #16
  523. vst1.32 {d19[0]}, [r8], ip
  524. vst1.32 {d19[1]}, [r8], ip
  525. vst1.32 {d22[0]}, [r8], ip
  526. vst1.32 {d22[1]}, [r8], ip
  527. vst1.32 {d23[0]}, [r8], ip
  528. vst1.32 {d23[1]}, [r8], ip
  529. 8: subs r3, r3, #2
  530. add r0, r0, #4
  531. popeq {r4-r8,pc}
  532. @ 1 channel
  533. 4: ldr r4, [r1],#4
  534. tst r2, #8
  535. mov lr, r2
  536. mov r5, r0
  537. vld1.64 {d0-d1}, [r4,:128]!
  538. vcvt.s32.f32 q0, q0, #16
  539. vld1.64 {d2-d3}, [r4,:128]!
  540. vcvt.s32.f32 q1, q1, #16
  541. bne 8f
  542. 6: subs lr, lr, #16
  543. vld1.64 {d4-d5}, [r4,:128]!
  544. vcvt.s32.f32 q2, q2, #16
  545. vld1.64 {d6-d7}, [r4,:128]!
  546. vcvt.s32.f32 q3, q3, #16
  547. vst1.16 {d0[1]}, [r5,:16], ip
  548. vst1.16 {d0[3]}, [r5,:16], ip
  549. vst1.16 {d1[1]}, [r5,:16], ip
  550. vst1.16 {d1[3]}, [r5,:16], ip
  551. vst1.16 {d2[1]}, [r5,:16], ip
  552. vst1.16 {d2[3]}, [r5,:16], ip
  553. vst1.16 {d3[1]}, [r5,:16], ip
  554. vst1.16 {d3[3]}, [r5,:16], ip
  555. beq 7f
  556. vld1.64 {d0-d1}, [r4,:128]!
  557. vcvt.s32.f32 q0, q0, #16
  558. vld1.64 {d2-d3}, [r4,:128]!
  559. vcvt.s32.f32 q1, q1, #16
  560. 7: vst1.16 {d4[1]}, [r5,:16], ip
  561. vst1.16 {d4[3]}, [r5,:16], ip
  562. vst1.16 {d5[1]}, [r5,:16], ip
  563. vst1.16 {d5[3]}, [r5,:16], ip
  564. vst1.16 {d6[1]}, [r5,:16], ip
  565. vst1.16 {d6[3]}, [r5,:16], ip
  566. vst1.16 {d7[1]}, [r5,:16], ip
  567. vst1.16 {d7[3]}, [r5,:16], ip
  568. bgt 6b
  569. pop {r4-r8,pc}
  570. 8: subs lr, lr, #8
  571. vst1.16 {d0[1]}, [r5,:16], ip
  572. vst1.16 {d0[3]}, [r5,:16], ip
  573. vst1.16 {d1[1]}, [r5,:16], ip
  574. vst1.16 {d1[3]}, [r5,:16], ip
  575. vst1.16 {d2[1]}, [r5,:16], ip
  576. vst1.16 {d2[3]}, [r5,:16], ip
  577. vst1.16 {d3[1]}, [r5,:16], ip
  578. vst1.16 {d3[3]}, [r5,:16], ip
  579. popeq {r4-r8,pc}
  580. vld1.64 {d0-d1}, [r4,:128]!
  581. vcvt.s32.f32 q0, q0, #16
  582. vld1.64 {d2-d3}, [r4,:128]!
  583. vcvt.s32.f32 q1, q1, #16
  584. b 6b
  585. .endfunc
  586. function ff_vector_fmul_neon, export=1
  587. mov r3, r0
  588. subs r2, r2, #8
  589. vld1.64 {d0-d3}, [r0,:128]!
  590. vld1.64 {d4-d7}, [r1,:128]!
  591. vmul.f32 q8, q0, q2
  592. vmul.f32 q9, q1, q3
  593. beq 3f
  594. bics ip, r2, #15
  595. beq 2f
  596. 1: subs ip, ip, #16
  597. vld1.64 {d0-d1}, [r0,:128]!
  598. vld1.64 {d4-d5}, [r1,:128]!
  599. vmul.f32 q10, q0, q2
  600. vld1.64 {d2-d3}, [r0,:128]!
  601. vld1.64 {d6-d7}, [r1,:128]!
  602. vmul.f32 q11, q1, q3
  603. vst1.64 {d16-d19},[r3,:128]!
  604. vld1.64 {d0-d1}, [r0,:128]!
  605. vld1.64 {d4-d5}, [r1,:128]!
  606. vmul.f32 q8, q0, q2
  607. vld1.64 {d2-d3}, [r0,:128]!
  608. vld1.64 {d6-d7}, [r1,:128]!
  609. vmul.f32 q9, q1, q3
  610. vst1.64 {d20-d23},[r3,:128]!
  611. bne 1b
  612. ands r2, r2, #15
  613. beq 3f
  614. 2: vld1.64 {d0-d1}, [r0,:128]!
  615. vld1.64 {d4-d5}, [r1,:128]!
  616. vst1.64 {d16-d17},[r3,:128]!
  617. vmul.f32 q8, q0, q2
  618. vld1.64 {d2-d3}, [r0,:128]!
  619. vld1.64 {d6-d7}, [r1,:128]!
  620. vst1.64 {d18-d19},[r3,:128]!
  621. vmul.f32 q9, q1, q3
  622. 3: vst1.64 {d16-d19},[r3,:128]!
  623. bx lr
  624. .endfunc
  625. function ff_vector_fmul_window_neon, export=1
  626. vld1.32 {d16[],d17[]}, [sp,:32]
  627. push {r4,r5,lr}
  628. ldr lr, [sp, #16]
  629. sub r2, r2, #8
  630. sub r5, lr, #2
  631. add r2, r2, r5, lsl #2
  632. add r4, r3, r5, lsl #3
  633. add ip, r0, r5, lsl #3
  634. mov r5, #-16
  635. vld1.64 {d0,d1}, [r1,:128]!
  636. vld1.64 {d2,d3}, [r2,:128], r5
  637. vld1.64 {d4,d5}, [r3,:128]!
  638. vld1.64 {d6,d7}, [r4,:128], r5
  639. 1: subs lr, lr, #4
  640. vmov q11, q8
  641. vmla.f32 d22, d0, d4
  642. vmov q10, q8
  643. vmla.f32 d23, d1, d5
  644. vrev64.32 q3, q3
  645. vmla.f32 d20, d0, d7
  646. vrev64.32 q1, q1
  647. vmla.f32 d21, d1, d6
  648. beq 2f
  649. vmla.f32 d22, d3, d7
  650. vld1.64 {d0,d1}, [r1,:128]!
  651. vmla.f32 d23, d2, d6
  652. vld1.64 {d18,d19},[r2,:128], r5
  653. vmls.f32 d20, d3, d4
  654. vld1.64 {d24,d25},[r3,:128]!
  655. vmls.f32 d21, d2, d5
  656. vld1.64 {d6,d7}, [r4,:128], r5
  657. vmov q1, q9
  658. vrev64.32 q11, q11
  659. vmov q2, q12
  660. vswp d22, d23
  661. vst1.64 {d20,d21},[r0,:128]!
  662. vst1.64 {d22,d23},[ip,:128], r5
  663. b 1b
  664. 2: vmla.f32 d22, d3, d7
  665. vmla.f32 d23, d2, d6
  666. vmls.f32 d20, d3, d4
  667. vmls.f32 d21, d2, d5
  668. vrev64.32 q11, q11
  669. vswp d22, d23
  670. vst1.64 {d20,d21},[r0,:128]!
  671. vst1.64 {d22,d23},[ip,:128], r5
  672. pop {r4,r5,pc}
  673. .endfunc