You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

612 lines
21KB

/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
  21. #include "asm.S"
  /*
   * File-wide assembler setup.  preserve8 is not defined in this file
   * (presumably a macro supplied by asm.S -- confirm there).
   */
        preserve8
        .fpu            neon            @ accept NEON mnemonics below
        .text                           @ everything that follows is code
  /*
   * pixels16 avg=0
   *
   * Function body that moves 16-byte-wide rows, four rows per loop pass.
   *   r0 = destination; every store carries the :128 alignment qualifier
   *   r1 = source; loaded without an alignment qualifier
   *   r2 = byte stride, post-added to both pointers after each row
   *   r3 = row count; it drops by 4 per pass and the loop ends only when
   *        it reaches exactly zero, so a positive multiple of 4 is expected
   * When avg is nonzero the four rows already at the destination are read
   * through ip and combined with the source rows by vrhadd.u8 (rounding
   * halving add) before the result is stored.
   * Touches q0-q3 (plus ip and q8-q11 when avg is set); returns via bx lr.
   */
        .macro  pixels16 avg=0
.if \avg
        mov             ip, r0                  @ ip walks the existing destination rows
.endif
1:      vld1.64         {d0, d1}, [r1], r2      @ source rows 0..3 -> q0..q3
        vld1.64         {d2, d3}, [r1], r2
        vld1.64         {d4, d5}, [r1], r2
        pld             [r1, r2, lsl #2]        @ preload hints for upcoming rows
        vld1.64         {d6, d7}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
.if \avg
        vld1.64         {d16,d17}, [ip], r2     @ old destination row, then average
        vrhadd.u8       q0, q0, q8
        vld1.64         {d18,d19}, [ip], r2
        vrhadd.u8       q1, q1, q9
        vld1.64         {d20,d21}, [ip], r2
        vrhadd.u8       q2, q2, q10
        vld1.64         {d22,d23}, [ip], r2
        vrhadd.u8       q3, q3, q11
.endif
        subs            r3, r3, #4              @ flags consumed by bne below
        vst1.64         {d0, d1}, [r0,:128], r2
        vst1.64         {d2, d3}, [r0,:128], r2
        vst1.64         {d4, d5}, [r0,:128], r2
        vst1.64         {d6, d7}, [r0,:128], r2
        bne             1b
        bx              lr
        .endm
  /*
   * pixels16_x2 vhadd=vrhadd.u8
   *
   * Horizontal half-sample body for 16-byte-wide rows: every output byte is
   * the halving add of a source byte and the byte to its right.
   *   r0 = destination (:128 qualifier on stores)
   *   r1 = source
   *   r2 = byte stride for both pointers
   *   r3 = row count; two rows per pass, loop ends only on exactly zero
   * Each row is fetched as 24 bytes so vext.8 #1 can form the copy shifted
   * by one byte; bytes 0..16 are the ones that reach the result.
   * The vhadd argument is the averaging instruction to emit: the default
   * vrhadd.u8 rounds, vhadd.u8 truncates.
   * Touches q0-q3; returns via bx lr.
   */
        .macro  pixels16_x2 vhadd=vrhadd.u8
1:      vld1.64         {d0-d2}, [r1], r2       @ row A, 24 bytes
        vld1.64         {d4-d6}, [r1], r2       @ row B, 24 bytes
        pld             [r1]
        pld             [r1, r2]
        subs            r3, r3, #2
        vext.8          q1, q0, q1, #1          @ q1 = row A shifted by one byte
        \vhadd          q0, q0, q1
        vext.8          q3, q2, q3, #1          @ q3 = row B shifted by one byte
        \vhadd          q2, q2, q3
        vst1.64         {d0, d1}, [r0,:128], r2
        vst1.64         {d4, d5}, [r0,:128], r2
        bne             1b
        bx              lr
        .endm
  /*
   * pixels16_y2 vhadd=vrhadd.u8
   *
   * Vertical half-sample body for 16-byte-wide rows: every output row is the
   * halving add of two consecutive source rows.
   *   r0 = destination (:128 qualifier on stores)
   *   r1 = source; ip is set to r1 + r2 so the two pointers alternate rows
   *   r2 = byte stride; lr holds 2 * r2 and steps both source pointers
   *   r3 = row count; two rows per pass, loop ends only on exactly zero
   * lr is used as scratch, so it is pushed on entry and the return is
   * pop {pc}.  Touches q0-q3, ip, lr.
   * The vhadd argument is the averaging instruction to emit: the default
   * vrhadd.u8 rounds, vhadd.u8 truncates.
   *
   * NOTE(review): two rows are loaded before the loop and two more in every
   * pass, so h + 2 rows are read although only h + 1 feed the output.
   * Confirm callers guarantee the extra row is readable.
   */
        .macro  pixels16_y2 vhadd=vrhadd.u8
        push            {lr}
        add             ip, r1, r2              @ ip -> row 1
        lsl             lr, r2, #1              @ lr = two-row step
        vld1.64         {d0, d1}, [r1], lr      @ q0 = row 0
        vld1.64         {d2, d3}, [ip], lr      @ q1 = row 1
1:      subs            r3, r3, #2
        \vhadd          q2, q0, q1              @ output row n   = rows n, n+1
        vld1.64         {d0, d1}, [r1], lr      @ q0 = row n+2
        \vhadd          q3, q0, q1              @ output row n+1 = rows n+2, n+1
        vld1.64         {d2, d3}, [ip], lr      @ q1 = row n+3
        pld             [r1]
        pld             [ip]
        vst1.64         {d4, d5}, [r0,:128], r2
        vst1.64         {d6, d7}, [r0,:128], r2
        bne             1b
        pop             {pc}
        .endm
  /*
   * pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0
   *
   * Diagonal half-sample body for 16-byte-wide rows: every output byte is
   * built from the 2x2 neighbourhood (x, x+1) on two consecutive rows.
   * Per row, the widened sums byte[x] + byte[x+1] are kept as 16-bit lanes;
   * two such row sums are added and narrowed with a shift by 2.
   *   r0 = destination (:128 qualifier on stores)
   *   r1 = source; ip = r1 + r2 addresses the alternate rows
   *   r2 = byte stride; lr holds 2 * r2 and steps both source pointers
   *   r3 = row count; two rows per pass, loop continues while r3 > 0 (bgt)
   * vshrn names the narrowing shift to emit (default vrshrn.u16 rounds).
   * When no_rnd is nonzero, q13 holds 1 in every lane and is added to each
   * sum before the shift.
   * Row-sum registers: q8/q10 = low/high half of the even row,
   *                    q9/q11 = low/high half of the odd row.
   * lr is pushed on entry; return is pop {pc}.
   * Touches q0-q3, q8-q15, ip, lr.
   *
   * NOTE(review): two rows are loaded before the loop and two more in every
   * pass, so h + 2 rows are read although only h + 1 feed the output.
   * Confirm callers guarantee the extra row is readable.
   */
        .macro  pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0
        push            {lr}
        lsl             lr, r2, #1              @ lr = two-row step
        add             ip, r1, r2              @ ip -> row 1
        vld1.64         {d0-d2}, [r1], lr       @ row 0, 24 bytes
        vld1.64         {d4-d6}, [ip], lr       @ row 1, 24 bytes
.if \no_rnd
        vmov.i16        q13, #1                 @ bias added before the plain shift
.endif
        pld             [r1]
        pld             [ip]
        vext.8          q1, q0, q1, #1          @ row 0 shifted by one byte
        vext.8          q3, q2, q3, #1          @ row 1 shifted by one byte
        vaddl.u8        q8, d0, d2              @ row 0 pair sums, low half
        vaddl.u8        q10, d1, d3             @ row 0 pair sums, high half
        vaddl.u8        q9, d4, d6              @ row 1 pair sums, low half
        vaddl.u8        q11, d5, d7             @ row 1 pair sums, high half
1:      subs            r3, r3, #2
        vld1.64         {d0-d2}, [r1], lr       @ next even row
        vadd.u16        q12, q8, q9             @ low half of first output row
        pld             [r1]
.if \no_rnd
        vadd.u16        q12, q12, q13
.endif
        vext.8          q15, q0, q1, #1         @ shifted even row (before q1 is reused)
        vadd.u16        q1 , q10, q11           @ high half of first output row
        \vshrn          d28, q12, #2
.if \no_rnd
        vadd.u16        q1, q1, q13
.endif
        \vshrn          d29, q1, #2
        vaddl.u8        q8, d0, d30             @ refresh even-row sums
        vld1.64         {d2-d4}, [ip], lr       @ next odd row into d2-d4
        vaddl.u8        q10, d1, d31
        vst1.64         {d28,d29}, [r0,:128], r2
        vadd.u16        q12, q8, q9             @ low half of second output row
        pld             [ip]
.if \no_rnd
        vadd.u16        q12, q12, q13
.endif
        vext.8          q2, q1, q2, #1          @ shifted odd row
        vadd.u16        q0, q10, q11            @ high half of second output row
        \vshrn          d30, q12, #2
.if \no_rnd
        vadd.u16        q0, q0, q13
.endif
        \vshrn          d31, q0, #2
        vaddl.u8        q9, d2, d4              @ refresh odd-row sums
        vaddl.u8        q11, d3, d5
        vst1.64         {d30,d31}, [r0,:128], r2
        bgt             1b
        pop             {pc}
        .endm
  /*
   * pixels8
   *
   * Function body that copies 8-byte-wide rows, four rows per loop pass.
   *   r0 = destination; every store carries the :64 alignment qualifier
   *   r1 = source; loaded without an alignment qualifier
   *   r2 = byte stride, post-added to both pointers after each row
   *   r3 = row count; it drops by 4 per pass and the loop ends only when
   *        it reaches exactly zero, so a positive multiple of 4 is expected
   * Touches d0-d3; returns via bx lr.
   */
        .macro  pixels8
1:      vld1.64         {d0}, [r1], r2          @ four source rows -> d0..d3
        vld1.64         {d1}, [r1], r2
        vld1.64         {d2}, [r1], r2
        pld             [r1, r2, lsl #2]        @ preload hints for upcoming rows
        vld1.64         {d3}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
        subs            r3, r3, #4              @ flags consumed by bne below
        vst1.64         {d0}, [r0,:64], r2
        vst1.64         {d1}, [r0,:64], r2
        vst1.64         {d2}, [r0,:64], r2
        vst1.64         {d3}, [r0,:64], r2
        bne             1b
        bx              lr
        .endm
  /*
   * pixels8_x2 vhadd=vrhadd.u8
   *
   * Horizontal half-sample body for 8-byte-wide rows: every output byte is
   * the halving add of a source byte and the byte to its right.
   *   r0 = destination (:64 qualifier on stores)
   *   r1 = source; 16 bytes are fetched per row
   *   r2 = byte stride for both pointers
   *   r3 = row count; two rows per pass, loop ends only on exactly zero
   * After the two vext.8 and the vswp, q0 = {row A, row B} and
   * q1 = {row A shifted, row B shifted}, so one q-wide averaging
   * instruction handles both rows.
   * The vhadd argument is the averaging instruction to emit: the default
   * vrhadd.u8 rounds, vhadd.u8 truncates.
   * Touches q0-q1; returns via bx lr.
   */
        .macro  pixels8_x2 vhadd=vrhadd.u8
1:      vld1.64         {d0, d1}, [r1], r2      @ row A
        vext.8          d1, d0, d1, #1          @ d1 = row A shifted by one byte
        vld1.64         {d2, d3}, [r1], r2      @ row B
        vext.8          d3, d2, d3, #1          @ d3 = row B shifted by one byte
        pld             [r1]
        pld             [r1, r2]
        subs            r3, r3, #2
        vswp            d1, d2                  @ group plain rows in q0, shifted in q1
        \vhadd          q0, q0, q1
        vst1.64         {d0}, [r0,:64], r2
        vst1.64         {d1}, [r0,:64], r2
        bne             1b
        bx              lr
        .endm
  /*
   * pixels8_y2 vhadd=vrhadd.u8
   *
   * Vertical half-sample body for 8-byte-wide rows: every output row is the
   * halving add of two consecutive source rows.
   *   r0 = destination (:64 qualifier on stores)
   *   r1 = source; ip is set to r1 + r2 so the two pointers alternate rows
   *   r2 = byte stride; lr holds 2 * r2 and steps both source pointers
   *   r3 = row count; two rows per pass, loop ends only on exactly zero
   * lr is used as scratch, so it is pushed on entry and the return is
   * pop {pc}.  Touches d0, d1, d4, d5, ip, lr.
   * The vhadd argument is the averaging instruction to emit: the default
   * vrhadd.u8 rounds, vhadd.u8 truncates.
   *
   * NOTE(review): two rows are loaded before the loop and two more in every
   * pass, so h + 2 rows are read although only h + 1 feed the output.
   * Confirm callers guarantee the extra row is readable.
   */
        .macro  pixels8_y2 vhadd=vrhadd.u8
        push            {lr}
        add             ip, r1, r2              @ ip -> row 1
        lsl             lr, r2, #1              @ lr = two-row step
        vld1.64         {d0}, [r1], lr          @ d0 = row 0
        vld1.64         {d1}, [ip], lr          @ d1 = row 1
1:      subs            r3, r3, #2
        \vhadd          d4, d0, d1              @ output row n   = rows n, n+1
        vld1.64         {d0}, [r1], lr          @ d0 = row n+2
        \vhadd          d5, d0, d1              @ output row n+1 = rows n+2, n+1
        vld1.64         {d1}, [ip], lr          @ d1 = row n+3
        pld             [r1]
        pld             [ip]
        vst1.64         {d4}, [r0,:64], r2
        vst1.64         {d5}, [r0,:64], r2
        bne             1b
        pop             {pc}
        .endm
  /*
   * pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0
   *
   * Diagonal half-sample body for 8-byte-wide rows: every output byte is
   * built from the 2x2 neighbourhood (x, x+1) on two consecutive rows.
   * Per row, the widened sums byte[x] + byte[x+1] are kept as 16-bit lanes
   * (q8 for the even row, q9 for the odd row); two row sums are added and
   * narrowed with a shift by 2.
   *   r0 = destination (:64 qualifier on stores)
   *   r1 = source, 16 bytes fetched per row; ip = r1 + r2
   *   r2 = byte stride; lr holds 2 * r2 and steps both source pointers
   *   r3 = row count; two rows per pass, loop continues while r3 > 0 (bgt)
   * vshrn names the narrowing shift to emit (default vrshrn.u16 rounds).
   * When no_rnd is nonzero, q11 holds 1 in every lane and is added to each
   * sum before the shift.
   * lr is pushed on entry; return is pop {pc}.
   * Touches q0-q3, q8-q10 (q11 with no_rnd), ip, lr.
   *
   * NOTE(review): two rows are loaded before the loop and two more in every
   * pass, so h + 2 rows are read although only h + 1 feed the output.
   * Confirm callers guarantee the extra row is readable.
   */
        .macro  pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0
        push            {lr}
        lsl             lr, r2, #1              @ lr = two-row step
        add             ip, r1, r2              @ ip -> row 1
        vld1.64         {d0, d1}, [r1], lr      @ row 0
        vld1.64         {d2, d3}, [ip], lr      @ row 1
.if \no_rnd
        vmov.i16        q11, #1                 @ bias added before the plain shift
.endif
        pld             [r1]
        pld             [ip]
        vext.8          d4, d0, d1, #1          @ row 0 shifted by one byte
        vext.8          d6, d2, d3, #1          @ row 1 shifted by one byte
        vaddl.u8        q8, d0, d4              @ row 0 pair sums
        vaddl.u8        q9, d2, d6              @ row 1 pair sums
1:      subs            r3, r3, #2
        vld1.64         {d0, d1}, [r1], lr      @ next even row
        pld             [r1]
        vadd.u16        q10, q8, q9             @ first output row, 16-bit
        vext.8          d4, d0, d1, #1
.if \no_rnd
        vadd.u16        q10, q10, q11
.endif
        vaddl.u8        q8, d0, d4              @ refresh even-row sums
        \vshrn          d5, q10, #2
        vld1.64         {d2, d3}, [ip], lr      @ next odd row
        vadd.u16        q10, q8, q9             @ second output row, 16-bit
        pld             [ip]
.if \no_rnd
        vadd.u16        q10, q10, q11
.endif
        vst1.64         {d5}, [r0,:64], r2
        \vshrn          d7, q10, #2
        vext.8          d6, d2, d3, #1
        vaddl.u8        q9, d2, d6              @ refresh odd-row sums
        vst1.64         {d7}, [r0,:64], r2
        bgt             1b
        pop             {pc}
        .endm
  /*
   * pixfunc pfx name suf rnd_op args...
   *
   * Emits one exported function called ff_<pfx><name><suf>_neon whose body
   * is the macro <name>, invoked with rnd_op followed by any further args.
   * suf, rnd_op and args may be left empty, in which case the body macro
   * runs with its own defaults.
   * function and .endfunc are not defined in this file (presumably from
   * asm.S -- confirm there).
   */
        .macro  pixfunc pfx name suf rnd_op args:vararg
function ff_\pfx\name\suf\()_neon, export=1
        \name           \rnd_op \args
        .endfunc
        .endm
  /*
   * pixfunc2 pfx name args...
   *
   * Emits a pair of functions through pixfunc for the same body macro:
   * first the plain one (no suffix, default body arguments), then a second
   * one that receives args verbatim, so the first element of args becomes
   * the name suffix and the rest become the body macro arguments.
   */
        .macro  pixfunc2 pfx name args:vararg
        pixfunc         \pfx \name
        pixfunc         \pfx \name \args
        .endm
  /*
   * 16-byte-wide put functions.
   *
   * ff_put_h264_qpel16_mc00_neon only sets r3 (the row count read by the
   * pixel macros) to 16 and contains no return.
   * NOTE(review): it presumably runs straight on into the function emitted
   * next (ff_put_pixels16_neon); that depends on function/.endfunc from
   * asm.S placing nothing in between -- confirm, and keep the two adjacent.
   *
   * The pixfunc/pixfunc2 lines then emit:
   *   ff_put_pixels16_neon
   *   ff_put_pixels16_x2_neon,  ff_put_pixels16_x2_no_rnd_neon
   *   ff_put_pixels16_y2_neon,  ff_put_pixels16_y2_no_rnd_neon
   *   ff_put_pixels16_xy2_neon, ff_put_pixels16_xy2_no_rnd_neon
   * The _no_rnd variants pass vhadd.u8 (x2, y2) or vshrn.u16 with
   * no_rnd=1 (xy2) to the body macro.
   */
function ff_put_h264_qpel16_mc00_neon, export=1
        mov             r3, #16
        .endfunc
        pixfunc         put_ pixels16
        pixfunc2        put_ pixels16_x2, _no_rnd, vhadd.u8
        pixfunc2        put_ pixels16_y2, _no_rnd, vhadd.u8
        pixfunc2        put_ pixels16_xy2, _no_rnd, vshrn.u16, 1
  /*
   * 16-byte-wide averaging function.
   *
   * ff_avg_h264_qpel16_mc00_neon only sets r3 (the row count read by the
   * pixel macro) to 16 and contains no return.
   * NOTE(review): it presumably runs straight on into ff_avg_pixels16_neon,
   * emitted by the pixfunc line below; that depends on function/.endfunc
   * from asm.S placing nothing in between -- confirm, and keep the two
   * adjacent.
   *
   * The empty third argument leaves the name suffix blank; the trailing 1
   * is handed to pixels16 as avg=1.
   */
function ff_avg_h264_qpel16_mc00_neon, export=1
        mov             r3, #16
        .endfunc
        pixfunc         avg_ pixels16,, 1
  /*
   * 8-byte-wide put functions.
   *
   * ff_put_h264_qpel8_mc00_neon only sets r3 (the row count read by the
   * pixel macros) to 8 and contains no return.
   * NOTE(review): it presumably runs straight on into the function emitted
   * next (ff_put_pixels8_neon); that depends on function/.endfunc from
   * asm.S placing nothing in between -- confirm, and keep the two adjacent.
   *
   * The pixfunc/pixfunc2 lines then emit:
   *   ff_put_pixels8_neon
   *   ff_put_pixels8_x2_neon,  ff_put_pixels8_x2_no_rnd_neon
   *   ff_put_pixels8_y2_neon,  ff_put_pixels8_y2_no_rnd_neon
   *   ff_put_pixels8_xy2_neon, ff_put_pixels8_xy2_no_rnd_neon
   * The _no_rnd variants pass vhadd.u8 (x2, y2) or vshrn.u16 with
   * no_rnd=1 (xy2) to the body macro.
   */
function ff_put_h264_qpel8_mc00_neon, export=1
        mov             r3, #8
        .endfunc
        pixfunc         put_ pixels8
        pixfunc2        put_ pixels8_x2, _no_rnd, vhadd.u8
        pixfunc2        put_ pixels8_y2, _no_rnd, vhadd.u8
        pixfunc2        put_ pixels8_xy2, _no_rnd, vshrn.u16, 1
  /*
   * ff_float_to_int16_neon
   *
   * Converts a run of 32-bit floats to 16-bit integers.
   *   r0 = destination, written 16 bytes at a time with the :128 qualifier
   *   r1 = source floats, read 16 bytes at a time with the :128 qualifier
   *   r2 = element count
   * Each float goes through vcvt.s32.f32 with 16 fraction bits (signed
   * 32-bit fixed point); vshrn.s32 by 16 then keeps bits 31..16 of every
   * lane as the stored 16-bit value.
   * The first 8 elements are always loaded; the main loop handles 16 per
   * pass with the next 8 already converted, and labels 2 and 3 drain the
   * last 16 or 8 elements.
   * NOTE(review): the control flow only accounts for counts that are a
   * nonzero multiple of 8 -- confirm callers guarantee that.
   * Touches q0-q3, q8, q9, ip; returns via bx lr.
   */
function ff_float_to_int16_neon, export=1
        subs            r2, r2, #8              @ flags are tested by beq below
        vld1.64         {d0-d1}, [r1,:128]!
        vcvt.s32.f32    q8, q0, #16
        vld1.64         {d2-d3}, [r1,:128]!
        vcvt.s32.f32    q9, q1, #16
        beq             3f                      @ exactly 8 elements in total
        bics            ip, r2, #15             @ ip = remaining count rounded down to 16
        beq             2f                      @ fewer than 16 remain
1:      subs            ip, ip, #16
        vshrn.s32       d4, q8, #16             @ narrow the 8 pending values
        vld1.64         {d0-d1}, [r1,:128]!
        vcvt.s32.f32    q0, q0, #16
        vshrn.s32       d5, q9, #16
        vld1.64         {d2-d3}, [r1,:128]!
        vcvt.s32.f32    q1, q1, #16
        vshrn.s32       d6, q0, #16
        vst1.64         {d4-d5}, [r0,:128]!
        vshrn.s32       d7, q1, #16
        vld1.64         {d16-d17},[r1,:128]!    @ preload the next pending 8
        vcvt.s32.f32    q8, q8, #16
        vld1.64         {d18-d19},[r1,:128]!
        vcvt.s32.f32    q9, q9, #16
        vst1.64         {d6-d7}, [r0,:128]!
        bne             1b
        ands            r2, r2, #15             @ anything beyond the pending 8?
        beq             3f
2:      vld1.64         {d0-d1}, [r1,:128]!     @ last 16: pending 8 plus 8 new
        vshrn.s32       d4, q8, #16
        vcvt.s32.f32    q0, q0, #16
        vld1.64         {d2-d3}, [r1,:128]!
        vshrn.s32       d5, q9, #16
        vcvt.s32.f32    q1, q1, #16
        vshrn.s32       d6, q0, #16
        vst1.64         {d4-d5}, [r0,:128]!
        vshrn.s32       d7, q1, #16
        vst1.64         {d6-d7}, [r0,:128]!
        bx              lr
3:      vshrn.s32       d4, q8, #16             @ last 8: only the pending values
        vshrn.s32       d5, q9, #16
        vst1.64         {d4-d5}, [r0,:128]!
        bx              lr
        .endfunc
  /*
   * ff_float_to_int16_interleave_neon
   *
   * Converts several planar float channels to 16-bit integers and writes
   * them interleaved.
   *   r0 = destination
   *   r1 = table of per-channel source pointers (one 32-bit pointer each)
   *   r2 = elements per channel
   *   r3 = channel count
   * Conversion is the same as in ff_float_to_int16_neon: vcvt.s32.f32 with
   * 16 fraction bits, and bits 31..16 of each lane are the 16-bit result.
   * vsri.32 by 16 packs two channels into one 32-bit lane: the first
   * channel lands in the low halfword, the second stays in the high one.
   *
   * Paths:
   *   r3 < 2   tail-branch to ff_float_to_int16_neon with the first pointer
   *   r3 == 2  dedicated stereo loop with contiguous, :128-qualified stores
   *   r3 > 2   generic code: groups of 4 channels, then 2, then 1, each
   *            written with an output step of ip = 2 * r3 bytes
   * All source loads carry the :128 qualifier.
   * NOTE(review): the loop exits only account for element counts that are
   * a nonzero multiple of 8 -- confirm callers guarantee that.
   * The generic path saves r4-r8 and lr and returns through pop {..., pc}.
   */
function ff_float_to_int16_interleave_neon, export=1
        cmp             r3, #2
        ldrlt           r1, [r1]                @ fewer than 2: take first channel pointer
        blt             ff_float_to_int16_neon
        bne             4f                      @ more than 2: generic path

        @ exactly 2 channels: r3 = first channel, r1 = second channel
        ldr             r3, [r1]
        ldr             r1, [r1, #4]

        subs            r2, r2, #8              @ flags are tested by beq below
        vld1.64         {d0-d1}, [r3,:128]!
        vcvt.s32.f32    q8, q0, #16
        vld1.64         {d2-d3}, [r3,:128]!
        vcvt.s32.f32    q9, q1, #16
        vld1.64         {d20-d21},[r1,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r1,:128]!
        vcvt.s32.f32    q11, q11, #16
        beq             3f                      @ exactly 8 elements per channel
        bics            ip, r2, #15             @ ip = remaining count rounded down to 16
        beq             2f
1:      subs            ip, ip, #16
        vld1.64         {d0-d1}, [r3,:128]!
        vcvt.s32.f32    q0, q0, #16
        vsri.32         q10, q8, #16            @ pack pending pair of channels
        vld1.64         {d2-d3}, [r3,:128]!
        vcvt.s32.f32    q1, q1, #16
        vld1.64         {d24-d25},[r1,:128]!
        vcvt.s32.f32    q12, q12, #16
        vld1.64         {d26-d27},[r1,:128]!
        vsri.32         q11, q9, #16
        vst1.64         {d20-d21},[r0,:128]!
        vcvt.s32.f32    q13, q13, #16
        vst1.64         {d22-d23},[r0,:128]!
        vsri.32         q12, q0, #16
        vld1.64         {d16-d17},[r3,:128]!    @ preload the next pending 8
        vsri.32         q13, q1, #16
        vst1.64         {d24-d25},[r0,:128]!
        vcvt.s32.f32    q8, q8, #16
        vld1.64         {d18-d19},[r3,:128]!
        vcvt.s32.f32    q9, q9, #16
        vld1.64         {d20-d21},[r1,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r1,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.64         {d26-d27},[r0,:128]!
        bne             1b
        ands            r2, r2, #15             @ anything beyond the pending 8?
        beq             3f
2:      vsri.32         q10, q8, #16            @ last 16: pending 8 plus 8 new
        vld1.64         {d0-d1}, [r3,:128]!
        vcvt.s32.f32    q0, q0, #16
        vld1.64         {d2-d3}, [r3,:128]!
        vcvt.s32.f32    q1, q1, #16
        vld1.64         {d24-d25},[r1,:128]!
        vcvt.s32.f32    q12, q12, #16
        vsri.32         q11, q9, #16
        vld1.64         {d26-d27},[r1,:128]!
        vcvt.s32.f32    q13, q13, #16
        vst1.64         {d20-d21},[r0,:128]!
        vsri.32         q12, q0, #16
        vst1.64         {d22-d23},[r0,:128]!
        vsri.32         q13, q1, #16
        vst1.64         {d24-d27},[r0,:128]!
        bx              lr
3:      vsri.32         q10, q8, #16            @ last 8: only the pending values
        vsri.32         q11, q9, #16
        vst1.64         {d20-d23},[r0,:128]!
        bx              lr

        @ generic path, more than 2 channels
4:      push            {r4-r8,lr}
        cmp             r3, #4
        lsl             ip, r3, #1              @ ip = output step in bytes (flags untouched)
        blt             4f

        @ 4 channels
5:      ldmia           r1!, {r4-r7}            @ next four channel pointers
        mov             lr, r2                  @ lr = elements left for this group
        mov             r8, r0                  @ r8 = write cursor for this group
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8, q8, #16
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9, q9, #16
        vld1.64         {d20-d21},[r6,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r7,:128]!
        vcvt.s32.f32    q11, q11, #16
6:      subs            lr, lr, #8              @ flags are tested by beq 7f below
        vld1.64         {d0-d1}, [r4,:128]!
        vcvt.s32.f32    q0, q0, #16
        vsri.32         q9, q8, #16             @ pack channels 0 and 1
        vld1.64         {d2-d3}, [r5,:128]!
        vcvt.s32.f32    q1, q1, #16
        vsri.32         q11, q10, #16           @ pack channels 2 and 3
        vld1.64         {d4-d5}, [r6,:128]!
        vcvt.s32.f32    q2, q2, #16
        vzip.32         d18, d22                @ gather all four channels per position
        vld1.64         {d6-d7}, [r7,:128]!
        vcvt.s32.f32    q3, q3, #16
        vzip.32         d19, d23
        vst1.64         {d18}, [r8], ip
        vsri.32         q1, q0, #16
        vst1.64         {d22}, [r8], ip
        vsri.32         q3, q2, #16
        vst1.64         {d19}, [r8], ip
        vzip.32         d2, d6
        vst1.64         {d23}, [r8], ip
        vzip.32         d3, d7
        beq             7f                      @ no more input for this group
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8, q8, #16
        vst1.64         {d2}, [r8], ip
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9, q9, #16
        vst1.64         {d6}, [r8], ip
        vld1.64         {d20-d21},[r6,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.64         {d3}, [r8], ip
        vld1.64         {d22-d23},[r7,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.64         {d7}, [r8], ip
        b               6b
7:      vst1.64         {d2}, [r8], ip
        vst1.64         {d6}, [r8], ip
        vst1.64         {d3}, [r8], ip
        vst1.64         {d7}, [r8], ip
        subs            r3, r3, #4              @ four channels done
        popeq           {r4-r8,pc}
        cmp             r3, #4
        add             r0, r0, #8              @ skip the 4 x 2 bytes just covered
        bge             5b

        @ 2 channels
4:      cmp             r3, #2
        blt             4f
        ldmia           r1!, {r4-r5}            @ next two channel pointers
        mov             lr, r2
        mov             r8, r0
        tst             lr, #8                  @ Z set when bit 3 of the count is clear
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8, q8, #16
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9, q9, #16
        vld1.64         {d20-d21},[r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r5,:128]!
        vcvt.s32.f32    q11, q11, #16
        beq             6f                      @ bit 3 clear: straight to the 16-wide loop
        subs            lr, lr, #8
        beq             7f                      @ only these 8 positions exist
        vsri.32         d18, d16, #16           @ odd group of 8 handled up front
        vsri.32         d19, d17, #16
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8, q8, #16
        vst1.32         {d18[0]}, [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]}, [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]}, [r8], ip
        vst1.32         {d19[1]}, [r8], ip
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9, q9, #16
        vst1.32         {d22[0]}, [r8], ip
        vst1.32         {d22[1]}, [r8], ip
        vld1.64         {d20-d21},[r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.32         {d23[0]}, [r8], ip
        vst1.32         {d23[1]}, [r8], ip
        vld1.64         {d22-d23},[r5,:128]!
        vcvt.s32.f32    q11, q11, #16
6:      subs            lr, lr, #16             @ flags are tested by beq 6f and bgt 6b
        vld1.64         {d0-d1}, [r4,:128]!
        vcvt.s32.f32    q0, q0, #16
        vsri.32         d18, d16, #16
        vld1.64         {d2-d3}, [r5,:128]!
        vcvt.s32.f32    q1, q1, #16
        vsri.32         d19, d17, #16
        vld1.64         {d4-d5}, [r4,:128]!
        vcvt.s32.f32    q2, q2, #16
        vld1.64         {d6-d7}, [r5,:128]!
        vcvt.s32.f32    q3, q3, #16
        vst1.32         {d18[0]}, [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]}, [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]}, [r8], ip
        vsri.32         d2, d0, #16
        vst1.32         {d19[1]}, [r8], ip
        vsri.32         d3, d1, #16
        vst1.32         {d22[0]}, [r8], ip
        vsri.32         d6, d4, #16
        vst1.32         {d22[1]}, [r8], ip
        vsri.32         d7, d5, #16
        vst1.32         {d23[0]}, [r8], ip
        vst1.32         {d23[1]}, [r8], ip
        beq             6f                      @ input exhausted: flush the last 8 below
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8, q8, #16
        vst1.32         {d2[0]}, [r8], ip
        vst1.32         {d2[1]}, [r8], ip
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9, q9, #16
        vst1.32         {d3[0]}, [r8], ip
        vst1.32         {d3[1]}, [r8], ip
        vld1.64         {d20-d21},[r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.32         {d6[0]}, [r8], ip
        vst1.32         {d6[1]}, [r8], ip
        vld1.64         {d22-d23},[r5,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.32         {d7[0]}, [r8], ip
        vst1.32         {d7[1]}, [r8], ip
        bgt             6b
6:      vst1.32         {d2[0]}, [r8], ip
        vst1.32         {d2[1]}, [r8], ip
        vst1.32         {d3[0]}, [r8], ip
        vst1.32         {d3[1]}, [r8], ip
        vst1.32         {d6[0]}, [r8], ip
        vst1.32         {d6[1]}, [r8], ip
        vst1.32         {d7[0]}, [r8], ip
        vst1.32         {d7[1]}, [r8], ip
        b               8f
7:      vsri.32         d18, d16, #16           @ count was 8: pack and store, nothing more
        vsri.32         d19, d17, #16
        vst1.32         {d18[0]}, [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]}, [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]}, [r8], ip
        vst1.32         {d19[1]}, [r8], ip
        vst1.32         {d22[0]}, [r8], ip
        vst1.32         {d22[1]}, [r8], ip
        vst1.32         {d23[0]}, [r8], ip
        vst1.32         {d23[1]}, [r8], ip
8:      subs            r3, r3, #2              @ two channels done
        add             r0, r0, #4              @ skip the 2 x 2 bytes just covered
        popeq           {r4-r8,pc}

        @ 1 channel
4:      ldr             r4, [r1],#4             @ last channel pointer
        tst             r2, #8                  @ Z clear when bit 3 of the count is set
        mov             lr, r2
        mov             r5, r0                  @ r5 = write cursor
        vld1.64         {d0-d1}, [r4,:128]!
        vcvt.s32.f32    q0, q0, #16
        vld1.64         {d2-d3}, [r4,:128]!
        vcvt.s32.f32    q1, q1, #16
        bne             8f                      @ bit 3 set: emit one group of 8 first
6:      subs            lr, lr, #16             @ flags are tested by beq 7f and bgt 6b
        vld1.64         {d4-d5}, [r4,:128]!
        vcvt.s32.f32    q2, q2, #16
        vld1.64         {d6-d7}, [r4,:128]!
        vcvt.s32.f32    q3, q3, #16
        vst1.16         {d0[1]}, [r5,:16], ip   @ odd halfword = bits 31..16 of the lane
        vst1.16         {d0[3]}, [r5,:16], ip
        vst1.16         {d1[1]}, [r5,:16], ip
        vst1.16         {d1[3]}, [r5,:16], ip
        vst1.16         {d2[1]}, [r5,:16], ip
        vst1.16         {d2[3]}, [r5,:16], ip
        vst1.16         {d3[1]}, [r5,:16], ip
        vst1.16         {d3[3]}, [r5,:16], ip
        beq             7f                      @ input exhausted: skip the preload
        vld1.64         {d0-d1}, [r4,:128]!
        vcvt.s32.f32    q0, q0, #16
        vld1.64         {d2-d3}, [r4,:128]!
        vcvt.s32.f32    q1, q1, #16
7:      vst1.16         {d4[1]}, [r5,:16], ip
        vst1.16         {d4[3]}, [r5,:16], ip
        vst1.16         {d5[1]}, [r5,:16], ip
        vst1.16         {d5[3]}, [r5,:16], ip
        vst1.16         {d6[1]}, [r5,:16], ip
        vst1.16         {d6[3]}, [r5,:16], ip
        vst1.16         {d7[1]}, [r5,:16], ip
        vst1.16         {d7[3]}, [r5,:16], ip
        bgt             6b
        pop             {r4-r8,pc}
8:      subs            lr, lr, #8
        vst1.16         {d0[1]}, [r5,:16], ip
        vst1.16         {d0[3]}, [r5,:16], ip
        vst1.16         {d1[1]}, [r5,:16], ip
        vst1.16         {d1[3]}, [r5,:16], ip
        vst1.16         {d2[1]}, [r5,:16], ip
        vst1.16         {d2[3]}, [r5,:16], ip
        vst1.16         {d3[1]}, [r5,:16], ip
        vst1.16         {d3[3]}, [r5,:16], ip
        popeq           {r4-r8,pc}              @ count was exactly 8
        vld1.64         {d0-d1}, [r4,:128]!
        vcvt.s32.f32    q0, q0, #16
        vld1.64         {d2-d3}, [r4,:128]!
        vcvt.s32.f32    q1, q1, #16
        b               6b
        .endfunc