;******************************************************************************
;* FFT transform with SSE/3DNow optimizations
;* Copyright (c) 2008 Loren Merritt
;*
;* This algorithm (though not any of the implementation details) is
;* based on libdjbfft by D. J. Bernstein.
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; These functions are not individually interchangeable with the C versions.
; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
; in blocks as convenient to the vector size.
; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)
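; With SSE, 16 floats of intermediate data are therefore laid out as:
;   {r0,r1,r2,r3} {i0,i1,i2,i3} {r4,r5,r6,r7} {i4,i5,i6,i7}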
%include "x86inc.asm"

%ifdef ARCH_X86_64
%define pointer resq
%else
%define pointer resd
%endif

struc FFTContext
.nbits:    resd 1
.reverse:  resd 1
.revtab:   pointer 1
.tmpbuf:   pointer 1
.mdctsize: resd 1
.mdctbits: resd 1
.tcos:     pointer 1
.tsin:     pointer 1
endstruc
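; The field offsets above must stay in sync with the C FFTContext;
; only the fields actually referenced below matter.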
SECTION_RODATA

%define M_SQRT1_2 0.70710678118654752440
ps_root2:     times 4 dd M_SQRT1_2
ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
ps_m1p1:      dd 1<<31, 0
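; ps_m1p1 is a sign mask: pxor with it flips the sign bit of the low
; float of an mmx register; together with pswapd this implements the
; multiply-by-(+/-j) rotation in T4_3DN.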
%assign i 16
%rep 13
cextern cos_ %+ i
%assign i i<<1
%endrep

%ifdef ARCH_X86_64
%define pointer dq
%else
%define pointer dd
%endif
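; 'pointer' switches from a reservation directive (resq/resd) to a data
; directive (dq/dd): the struct above only reserves space, while the
; dispatch tables near the end of the file emit actual pointers.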
%macro IF0 1+
%endmacro
%macro IF1 1+
    %1
%endmacro
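; IF0 swallows its arguments and IF1 emits them; PASS_SMALL and PASS_BIG
; use IF%1 to make their loads/stores conditional on a macro parameter.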
section .text align=16

%macro T2_3DN 4 ; z0, z1, mem0, mem1
    mova  %1, %3
    mova  %2, %1
    pfadd %1, %4
    pfsub %2, %4
%endmacro

%macro T4_3DN 6 ; z0, z1, z2, z3, tmp0, tmp1
    mova   %5, %3
    pfsub  %3, %4
    pfadd  %5, %4 ; {t6,t5}
    pxor   %3, [ps_m1p1] ; {t8,t7}
    mova   %6, %1
    pswapd %3, %3
    pfadd  %1, %5 ; {r0,i0}
    pfsub  %6, %5 ; {r2,i2}
    mova   %4, %2
    pfadd  %2, %3 ; {r1,i1}
    pfsub  %4, %3 ; {r3,i3}
    SWAP   %3, %6
%endmacro

; in: %1={r0,i0,r1,i1} %2={r2,i2,r3,i3}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
%macro T4_SSE 3
    mova   %3, %1
    shufps %1, %2, 0x64 ; {r0,i0,r3,i2}
    shufps %3, %2, 0xce ; {r1,i1,r2,i3}
    mova   %2, %1
    addps  %1, %3 ; {t1,t2,t6,t5}
    subps  %2, %3 ; {t3,t4,t8,t7}
    mova   %3, %1
    shufps %1, %2, 0x44 ; {t1,t2,t3,t4}
    shufps %3, %2, 0xbe ; {t6,t5,t7,t8}
    mova   %2, %1
    addps  %1, %3 ; {r0,i0,r1,i1}
    subps  %2, %3 ; {r2,i2,r3,i3}
    mova   %3, %1
    shufps %1, %2, 0x88 ; {r0,r1,r2,r3}
    shufps %3, %2, 0xdd ; {i0,i1,i2,i3}
    SWAP   %2, %3
%endmacro
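; T4_SSE is a single 4-point FFT: the first addps/subps pair forms the
; z0+/-z2 and z1+/-z3 butterflies (the shuffles pre-arrange lanes so the
; odd difference comes out rotated by j), the second pair combines them,
; and the final shuffles split re/im into separate registers. The tN
; labels in the comments are just lane names.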
%macro T8_SSE 6 ; r0,i0,r1,i1,t0,t1
    mova   %5, %3
    shufps %3, %4, 0x44 ; {r4,i4,r6,i6}
    shufps %5, %4, 0xee ; {r5,i5,r7,i7}
    mova   %6, %3
    subps  %3, %5 ; {r5,i5,r7,i7}
    addps  %6, %5 ; {t1,t2,t3,t4}
    mova   %5, %3
    shufps %5, %5, 0xb1 ; {i5,r5,i7,r7}
    mulps  %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
    mulps  %5, [ps_root2]
    addps  %3, %5 ; {t8,t7,ta,t9}
    mova   %5, %6
    shufps %6, %3, 0x36 ; {t3,t2,t9,t8}
    shufps %5, %3, 0x9c ; {t1,t4,t7,ta}
    mova   %3, %6
    addps  %6, %5 ; {t1,t2,t9,ta}
    subps  %3, %5 ; {t6,t5,tc,tb}
    mova   %5, %6
    shufps %6, %3, 0xd8 ; {t1,t9,t5,tb}
    shufps %5, %3, 0x8d ; {t2,ta,t6,tc}
    mova   %3, %1
    mova   %4, %2
    addps  %1, %6 ; {r0,r1,r2,r3}
    addps  %2, %5 ; {i0,i1,i2,i3}
    subps  %3, %6 ; {r4,r5,r6,r7}
    subps  %4, %5 ; {i4,i5,i6,i7}
%endmacro
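; In T8_SSE the odd-index values are multiplied by twiddles of the form
; sqrt(1/2)*(+/-1 +/- j): ps_root2mppm carries the per-lane signs and
; ps_root2 the magnitude, so one shuffled copy, two mulps and an addps
; perform both complex multiplications at once.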
; scheduled for cpu-bound sizes
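; Each pass is one radix-2 combine step on split re/im blocks: z2 is
; rotated by conj(w) (r2*wre+i2*wim, i2*wre-r2*wim), z3 by w itself, and
; the rotated values are butterflied against z0/z1.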
%macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
IF%1 mova m4, Z(4)
IF%1 mova m5, Z(5)
    mova  m0, %2 ; wre
    mova  m2, m4
    mova  m1, %3 ; wim
    mova  m3, m5
    mulps m2, m0 ; r2*wre
IF%1 mova m6, Z(6)
    mulps m3, m1 ; i2*wim
IF%1 mova m7, Z(7)
    mulps m4, m1 ; r2*wim
    mulps m5, m0 ; i2*wre
    addps m2, m3 ; r2*wre + i2*wim
    mova  m3, m1
    mulps m1, m6 ; r3*wim
    subps m5, m4 ; i2*wre - r2*wim
    mova  m4, m0
    mulps m3, m7 ; i3*wim
    mulps m4, m6 ; r3*wre
    mulps m0, m7 ; i3*wre
    subps m4, m3 ; r3*wre - i3*wim
    mova  m3, Z(0)
    addps m0, m1 ; i3*wre + r3*wim
    mova  m1, m4
    addps m4, m2 ; t5
    subps m1, m2 ; t3
    subps m3, m4 ; r2
    addps m4, Z(0) ; r0
    mova  m6, Z(2)
    mova  Z(4), m3
    mova  Z(0), m4
    mova  m3, m5
    subps m5, m0 ; t4
    mova  m4, m6
    subps m6, m5 ; r3
    addps m5, m4 ; r1
    mova  Z(6), m6
    mova  Z(2), m5
    mova  m2, Z(3)
    addps m3, m0 ; t6
    subps m2, m1 ; i3
    mova  m7, Z(1)
    addps m1, Z(3) ; i1
    mova  Z(7), m2
    mova  Z(3), m1
    mova  m4, m7
    subps m7, m3 ; i2
    addps m3, m4 ; i0
    mova  Z(5), m7
    mova  Z(1), m3
%endmacro
; scheduled to avoid store->load aliasing
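; PASS_BIG takes the twiddles from [wq]/[wq+o1q] instead of macro args.
; With %1==1 it stores split re/im blocks like PASS_SMALL; with %1==0
; (the final pass) it re-interleaves the result back into FFTComplex
; order with unpcklps/unpckhps before storing.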
%macro PASS_BIG 1 ; (!interleave)
    mova  m4, Z(4) ; r2
    mova  m5, Z(5) ; i2
    mova  m2, m4
    mova  m0, [wq] ; wre
    mova  m3, m5
    mova  m1, [wq+o1q] ; wim
    mulps m2, m0 ; r2*wre
    mova  m6, Z(6) ; r3
    mulps m3, m1 ; i2*wim
    mova  m7, Z(7) ; i3
    mulps m4, m1 ; r2*wim
    mulps m5, m0 ; i2*wre
    addps m2, m3 ; r2*wre + i2*wim
    mova  m3, m1
    mulps m1, m6 ; r3*wim
    subps m5, m4 ; i2*wre - r2*wim
    mova  m4, m0
    mulps m3, m7 ; i3*wim
    mulps m4, m6 ; r3*wre
    mulps m0, m7 ; i3*wre
    subps m4, m3 ; r3*wre - i3*wim
    mova  m3, Z(0)
    addps m0, m1 ; i3*wre + r3*wim
    mova  m1, m4
    addps m4, m2 ; t5
    subps m1, m2 ; t3
    subps m3, m4 ; r2
    addps m4, Z(0) ; r0
    mova  m6, Z(2)
    mova  Z(4), m3
    mova  Z(0), m4
    mova  m3, m5
    subps m5, m0 ; t4
    mova  m4, m6
    subps m6, m5 ; r3
    addps m5, m4 ; r1
IF%1 mova Z(6), m6
IF%1 mova Z(2), m5
    mova  m2, Z(3)
    addps m3, m0 ; t6
    subps m2, m1 ; i3
    mova  m7, Z(1)
    addps m1, Z(3) ; i1
IF%1 mova Z(7), m2
IF%1 mova Z(3), m1
    mova  m4, m7
    subps m7, m3 ; i2
    addps m3, m4 ; i0
IF%1 mova Z(5), m7
IF%1 mova Z(1), m3
%if %1==0
    mova     m4, m5 ; r1
    mova     m0, m6 ; r3
    unpcklps m5, m1
    unpckhps m4, m1
    unpcklps m6, m2
    unpckhps m0, m2
    mova     m1, Z(0)
    mova     m2, Z(4)
    mova     Z(2), m5
    mova     Z(3), m4
    mova     Z(6), m6
    mova     Z(7), m0
    mova     m5, m1 ; r0
    mova     m4, m2 ; r2
    unpcklps m1, m3
    unpckhps m5, m3
    unpcklps m2, m7
    unpckhps m4, m7
    mova     Z(0), m1
    mova     Z(1), m5
    mova     Z(4), m2
    mova     Z(5), m4
%endif
%endmacro
%macro PUNPCK 3
    mova      %3, %1
    punpckldq %1, %2
    punpckhdq %3, %2
%endmacro
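; PUNPCK interleaves the dwords of %1 and %2: low halves land in %1 and
; high halves in %3, turning split re/im data back into {r,i} pairs.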
INIT_XMM
%define mova movaps
%define Z(x) [r0+mmsize*x]

align 16
fft4_sse:
    mova m0, Z(0)
    mova m1, Z(1)
    T4_SSE m0, m1, m2
    mova Z(0), m0
    mova Z(1), m1
    ret

align 16
fft8_sse:
    mova m0, Z(0)
    mova m1, Z(1)
    T4_SSE m0, m1, m2
    mova m2, Z(2)
    mova m3, Z(3)
    T8_SSE m0, m1, m2, m3, m4, m5
    mova Z(0), m0
    mova Z(1), m1
    mova Z(2), m2
    mova Z(3), m3
    ret

align 16
fft16_sse:
    mova m0, Z(0)
    mova m1, Z(1)
    T4_SSE m0, m1, m2
    mova m2, Z(2)
    mova m3, Z(3)
    T8_SSE m0, m1, m2, m3, m4, m5
    mova m4, Z(4)
    mova m5, Z(5)
    mova Z(0), m0
    mova Z(1), m1
    mova Z(2), m2
    mova Z(3), m3
    T4_SSE m4, m5, m6
    mova m6, Z(6)
    mova m7, Z(7)
    T4_SSE m6, m7, m0
    PASS_SMALL 0, [cos_16], [cos_16+16]
    ret

INIT_MMX

%macro FFT48_3DN 1
align 16
fft4%1:
    T2_3DN m0, m1, Z(0), Z(1)
    mova m2, Z(2)
    mova m3, Z(3)
    T4_3DN m0, m1, m2, m3, m4, m5
    PUNPCK m0, m1, m4
    PUNPCK m2, m3, m5
    mova Z(0), m0
    mova Z(1), m4
    mova Z(2), m2
    mova Z(3), m5
    ret

align 16
fft8%1:
    T2_3DN m0, m1, Z(0), Z(1)
    mova m2, Z(2)
    mova m3, Z(3)
    T4_3DN m0, m1, m2, m3, m4, m5
    mova Z(0), m0
    mova Z(2), m2
    T2_3DN m4, m5, Z(4), Z(5)
    T2_3DN m6, m7, Z(6), Z(7)
    pswapd m0, m5
    pswapd m2, m7
    pxor m0, [ps_m1p1]
    pxor m2, [ps_m1p1]
    pfsub m5, m0
    pfadd m7, m2
    pfmul m5, [ps_root2]
    pfmul m7, [ps_root2]
    T4_3DN m1, m3, m5, m7, m0, m2
    mova Z(5), m5
    mova Z(7), m7
    mova m0, Z(0)
    mova m2, Z(2)
    T4_3DN m0, m2, m4, m6, m5, m7
    PUNPCK m0, m1, m5
    PUNPCK m2, m3, m7
    mova Z(0), m0
    mova Z(1), m5
    mova Z(2), m2
    mova Z(3), m7
    PUNPCK m4, Z(5), m5
    PUNPCK m6, Z(7), m7
    mova Z(4), m4
    mova Z(5), m5
    mova Z(6), m6
    mova Z(7), m7
    ret
%endmacro

FFT48_3DN _3dn2

%macro pswapd 2
%ifidn %1, %2
    movd [r0+12], %1
    punpckhdq %1, [r0+8]
%else
    movq %1, %2
    psrlq %1, 32
    punpckldq %1, %2
%endif
%endmacro
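; pswapd (swap the two floats of an mmx register) is a real instruction
; only on extended 3DNow (Athlon and later): the _3dn2 functions above
; were assembled with the native opcode, while this macro emulates it
; for plain 3DNow, bouncing through scratch memory at [r0+8] when source
; and destination are the same register.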
FFT48_3DN _3dn

%define Z(x) [zq + o1q*(x&6)*((x/6)^1) + o3q*(x/6) + mmsize*(x&1)]
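; Z(x) addressing for the passes: o1q = 8*n and o3q = 48*n = 6*o1q. For
; x in 0..5 the x/6 terms vanish and the offset is o1q*(x&6) (index
; scales 2 and 4 are encodable); for x in 6..7 the offset is o3q, since
; a scale of 6 cannot be encoded in x86 addressing.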
%macro DECL_PASS 2+ ; name, payload
align 16
%1:
DEFINE_ARGS z, w, n, o1, o3
    lea o3q, [nq*3]
    lea o1q, [nq*8]
    shl o3q, 4
.loop:
    %2
    add zq, mmsize*2
    add wq, mmsize
    sub nd, mmsize/8
    jg .loop
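; 'rep ret': a one-byte ret that is also a branch target stalls the
; branch predictor on AMD K8-class cpus; the redundant rep prefix
; sidesteps that at no cost elsewhere.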
    rep ret
%endmacro
INIT_XMM
%define mova movaps
DECL_PASS pass_sse, PASS_BIG 1
DECL_PASS pass_interleave_sse, PASS_BIG 0

INIT_MMX
%define mulps pfmul
%define addps pfadd
%define subps pfsub
%define unpcklps punpckldq
%define unpckhps punpckhdq
DECL_PASS pass_3dn, PASS_SMALL 1, [wq], [wq+o1q]
DECL_PASS pass_interleave_3dn, PASS_BIG 0
%define pass_3dn2 pass_3dn
%define pass_interleave_3dn2 pass_interleave_3dn

%ifdef PIC
%define SECTION_REL - $$
%else
%define SECTION_REL
%endif
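; In PIC builds the dispatch tables store section-relative entries
; (SECTION_REL subtracts the section start $$), so they contain no
; absolute addresses; FFT_DISPATCH adds the base back at runtime.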
%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
    lea r2, [dispatch_tab%1]
    mov r2, [r2 + (%2q-2)*gprsize]
%ifdef PIC
    lea r3, [$$]
    add r2, r3
%endif
    call r2
%endmacro ; FFT_DISPATCH

%macro DECL_FFT 2-3 ; nbits, cpu, suffix
%xdefine list_of_fft fft4%2 SECTION_REL, fft8%2 SECTION_REL
%if %1==5
%xdefine list_of_fft list_of_fft, fft16%2 SECTION_REL
%endif

%assign n 1<<%1
%rep 17-%1
%assign n2 n/2
%assign n4 n/4
%xdefine list_of_fft list_of_fft, fft %+ n %+ %3%2 SECTION_REL
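; fftN bodies, one per size: classic split-radix recursion -- one
; fft(N/2) on the first half, two fft(N/4) on the quarters, then one
; combining pass. The (n&(-2<<%1)) terms in the pointer arithmetic
; subtract the advance a generated callee's pass loop has already made;
; the leaf ffts leave r0 untouched.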
align 16
fft %+ n %+ %3%2:
    call fft %+ n2 %+ %2
    add r0, n*4 - (n&(-2<<%1))
    call fft %+ n4 %+ %2
    add r0, n*2 - (n2&(-2<<%1))
    call fft %+ n4 %+ %2
    sub r0, n*6 + (n2&(-2<<%1))
    lea r1, [cos_ %+ n]
    mov r2d, n4/2
    jmp pass%3%2

%assign n n*2
%endrep
%undef n

align 8
dispatch_tab%3%2: pointer list_of_fft

section .text

; On x86_32, this function does the register saving and restoring for all of fft.
; The others pass args in registers and don't spill anything.
cglobal fft_dispatch%3%2, 2,5,8, z, nbits
    FFT_DISPATCH %3%2, nbits
    RET
%endmacro ; DECL_FFT

DECL_FFT 5, _sse
DECL_FFT 5, _sse, _interleave
DECL_FFT 4, _3dn
DECL_FFT 4, _3dn, _interleave
DECL_FFT 4, _3dn2
DECL_FFT 4, _3dn2, _interleave

INIT_XMM
%undef mulps
%undef addps
%undef subps
%undef unpcklps
%undef unpckhps
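; IMDCT pre-rotation: loads four samples from each end of the input,
; deinterleaves them, and multiplies by the {tcos,tsin} twiddles; the
; rotated complex values are left in xmm0/xmm1 for the revtab scatter
; in the .pre loop below.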
%macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8
    movaps   xmm0, [%3+%2*4]
    movaps   xmm1, [%3+%1*4-0x10]
    movaps   xmm2, xmm0
    shufps   xmm0, xmm1, 0x88
    shufps   xmm1, xmm2, 0x77
    movlps   xmm4, [%4+%2*2]
    movlps   xmm5, [%5+%2*2+0x0]
    movhps   xmm4, [%4+%1*2-0x8]
    movhps   xmm5, [%5+%1*2-0x8]
    movaps   xmm2, xmm0
    movaps   xmm3, xmm1
    mulps    xmm0, xmm5
    mulps    xmm1, xmm4
    mulps    xmm2, xmm4
    mulps    xmm3, xmm5
    subps    xmm1, xmm0
    addps    xmm2, xmm3
    movaps   xmm0, xmm1
    unpcklps xmm1, xmm2
    unpckhps xmm0, xmm2
%endmacro
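; CMUL loads one block of 4 re (xmm6) and 4 im (%2) from %4 and returns
; %2 = im*tsin - re*tcos, %3 = re*tsin + im*tcos: one complex multiply
; per lane of the IMDCT post-rotation.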
%macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
    movaps xmm6, [%4+%1*2]
    movaps %2, [%4+%1*2+0x10]
    movaps %3, xmm6
    movaps xmm7, %2
    mulps  xmm6, [%5+%1]
    mulps  %2, [%6+%1]
    mulps  %3, [%6+%1]
    mulps  xmm7, [%5+%1]
    subps  %2, xmm6
    addps  %3, xmm7
%endmacro
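; POSROTATESHUF post-rotates two blocks per iteration, one index
; counting up (j) and one counting down (k), then shuffles and stores
; them mirror-wise so the output lands in interleaved FFTComplex order.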
%macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8
.post:
    CMUL %1, xmm0, xmm1, %3, %4, %5
    CMUL %2, xmm4, xmm5, %3, %4, %5
    shufps   xmm1, xmm1, 0x1b
    shufps   xmm5, xmm5, 0x1b
    movaps   xmm6, xmm4
    unpckhps xmm4, xmm1
    unpcklps xmm6, xmm1
    movaps   xmm2, xmm0
    unpcklps xmm0, xmm5
    unpckhps xmm2, xmm5
    movaps   [%3+%2*2], xmm6
    movaps   [%3+%2*2+0x10], xmm4
    movaps   [%3+%1*2], xmm0
    movaps   [%3+%1*2+0x10], xmm2
    sub      %2, 0x10
    add      %1, 0x10
    jl .post
%endmacro
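; imdct_half: pre-rotate the input while scattering through revtab, run
; an in-place FFT via FFT_DISPATCH, then post-rotate in place. Only half
; of the IMDCT output is produced; the caller obtains the other half by
; symmetry.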
cglobal imdct_half_sse, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample *input
%ifdef ARCH_X86_64
%define rrevtab r10
%define rtcos   r11
%define rtsin   r12
    push r12
    push r13
    push r14
%else
%define rrevtab r6
%define rtsin   r6
%define rtcos   r5
%endif
    mov r3d, [r0+FFTContext.mdctsize]
    add r2, r3
    shr r3, 1
    mov rtcos, [r0+FFTContext.tcos]
    mov rtsin, [r0+FFTContext.tsin]
    add rtcos, r3
    add rtsin, r3
%ifndef ARCH_X86_64
    push rtcos
    push rtsin
%endif
    shr r3, 1
    mov rrevtab, [r0+FFTContext.revtab]
    add rrevtab, r3
%ifndef ARCH_X86_64
    push rrevtab
%endif
    sub r3, 4
%ifdef ARCH_X86_64
    xor r4, r4
    sub r4, r3
%endif
.pre:
%ifndef ARCH_X86_64
    ; unspill
    xor r4, r4
    sub r4, r3
    mov rtsin, [esp+4]
    mov rtcos, [esp+8]
%endif
    PREROTATER r4, r3, r2, rtcos, rtsin
%ifdef ARCH_X86_64
    movzx r5,  word [rrevtab+r4-4]
    movzx r6,  word [rrevtab+r4-2]
    movzx r13, word [rrevtab+r3]
    movzx r14, word [rrevtab+r3+2]
    movlps [r1+r5 *8], xmm0
    movhps [r1+r6 *8], xmm0
    movlps [r1+r13*8], xmm1
    movhps [r1+r14*8], xmm1
    add r4, 4
%else
    mov r6, [esp]
    movzx r5, word [r6+r4-4]
    movzx r4, word [r6+r4-2]
    movlps [r1+r5*8], xmm0
    movhps [r1+r4*8], xmm0
    movzx r5, word [r6+r3]
    movzx r4, word [r6+r3+2]
    movlps [r1+r5*8], xmm1
    movhps [r1+r4*8], xmm1
%endif
    sub r3, 4
    jns .pre

    mov r5, r0
    mov r6, r1
    mov r0, r1
    mov r1d, [r5+FFTContext.nbits]

    FFT_DISPATCH _sse, r1

    mov r0d, [r5+FFTContext.mdctsize]
    add r6, r0
    shr r0, 1
%ifndef ARCH_X86_64
%define rtcos r2
%define rtsin r3
    mov rtcos, [esp+8]
    mov rtsin, [esp+4]
%endif
    neg r0
    mov r1, -16
    sub r1, r0
    POSROTATESHUF r0, r1, r6, rtcos, rtsin
%ifdef ARCH_X86_64
    pop r14
    pop r13
    pop r12
%else
    add esp, 12
%endif
    RET