;******************************************************************************
;* FFT transform with SSE/3DNow optimizations
;* Copyright (c) 2008 Loren Merritt
;*
;* This algorithm (though not any of the implementation details) is
;* based on libdjbfft by D. J. Bernstein.
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
; These functions are not individually interchangeable with the C versions.
; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
; in blocks as convenient to the vector size.
; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)

%include "x86inc.asm"

; "pointer" reserves space for one native-sized pointer in the struc below.
%ifdef ARCH_X86_64
%define pointer resq
%else
%define pointer resd
%endif

; NOTE(review): field order/sizes assumed to mirror the C FFTContext struct
; (fft.h) — verify when that header changes. Only fields used here are listed.
struc FFTContext
.nbits:    resd 1
.reverse:  resd 1
.revtab:   pointer 1
.tmpbuf:   pointer 1
.mdctsize: resd 1
.mdctbits: resd 1
.tcos:     pointer 1
.tsin:     pointer 1
endstruc
SECTION_RODATA

%define M_SQRT1_2 0.70710678118654752440
ps_root2:     times 4 dd M_SQRT1_2
ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
ps_m1p1:      dd 1<<31, 0      ; sign mask: pxor with this flips element 0's sign

; Declare the externally defined twiddle tables cos_16 .. cos_65536.
%assign i 16
%rep 13
cextern cos_ %+ i
%assign i i<<1
%endrep

; Re-purpose "pointer" for initialized data (the dispatch tables below).
%ifdef ARCH_X86_64
%define pointer dq
%else
%define pointer dd
%endif

; IF0/IF1 conditionally emit their argument; the shared pass macros use
; IF%1 to toggle loads/stores between variants.
%macro IF0 1+
%endmacro
%macro IF1 1+
%1
%endmacro
section .text align=16

; T2_3DN: radix-2 butterfly on 2-float (MMX/3DNow) vectors.
; Out: %1 = %3 + %4, %2 = %3 - %4 (element-wise).
%macro T2_3DN 4 ; z0, z1, mem0, mem1
    mova  %1, %3
    mova  %2, %1
    pfadd %1, %4
    pfsub %2, %4
%endmacro
; T4_3DN: radix-4 butterfly on 2-float vectors, in place in %1-%4.
; %5/%6 are clobbered as temporaries; SWAP fixes up the register naming.
%macro T4_3DN 6 ; z0, z1, z2, z3, tmp0, tmp1
    mova   %5, %3
    pfsub  %3, %4
    pfadd  %5, %4            ; {t6,t5}
    pxor   %3, [ps_m1p1]     ; {t8,t7} (negate one element)
    mova   %6, %1
    pswapd %3, %3
    pfadd  %1, %5            ; {r0,i0}
    pfsub  %6, %5            ; {r2,i2}
    mova   %4, %2
    pfadd  %2, %3            ; {r1,i1}
    pfsub  %4, %3            ; {r3,i3}
    SWAP   %3, %6
%endmacro
; T4_SSE: 4-point FFT on packed {re,im} pairs, also deinterleaving the
; result into separate real/imaginary registers. %3 is a temporary.
; in:  %1={r0,i0,r1,i1} %2={r2,i2,r3,i3}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
%macro T4_SSE 3
    mova   %3, %1
    shufps %1, %2, 0x64      ; {r0,i0,r3,i2}
    shufps %3, %2, 0xce      ; {r1,i1,r2,i3}
    mova   %2, %1
    addps  %1, %3            ; {t1,t2,t6,t5}
    subps  %2, %3            ; {t3,t4,t8,t7}
    mova   %3, %1
    shufps %1, %2, 0x44      ; {t1,t2,t3,t4}
    shufps %3, %2, 0xbe      ; {t6,t5,t7,t8}
    mova   %2, %1
    addps  %1, %3            ; {r0,i0,r1,i1}
    subps  %2, %3            ; {r2,i2,r3,i3}
    mova   %3, %1
    shufps %1, %2, 0x88      ; {r0,r1,r2,r3}
    shufps %3, %2, 0xdd      ; {i0,i1,i2,i3}
    SWAP   %2, %3
%endmacro
; T8_SSE: combine the 4-point result in %1/%2 with the 4 packed inputs in
; %3/%4 into an 8-point FFT. The odd twiddles (1±i)/sqrt(2) come from the
; ps_root2 constants. %5/%6 are temporaries.
%macro T8_SSE 6 ; r0,i0,r1,i1,t0,t1
    mova   %5, %3
    shufps %3, %4, 0x44            ; {r4,i4,r6,i6}
    shufps %5, %4, 0xee            ; {r5,i5,r7,i7}
    mova   %6, %3
    subps  %3, %5                  ; {r5,i5,r7,i7}
    addps  %6, %5                  ; {t1,t2,t3,t4}
    mova   %5, %3
    shufps %5, %5, 0xb1            ; {i5,r5,i7,r7}
    mulps  %3, [ps_root2mppm]      ; {-r5,i5,r7,-i7}
    mulps  %5, [ps_root2]
    addps  %3, %5                  ; {t8,t7,ta,t9}
    mova   %5, %6
    shufps %6, %3, 0x36            ; {t3,t2,t9,t8}
    shufps %5, %3, 0x9c            ; {t1,t4,t7,ta}
    mova   %3, %6
    addps  %6, %5                  ; {t1,t2,t9,ta}
    subps  %3, %5                  ; {t6,t5,tc,tb}
    mova   %5, %6
    shufps %6, %3, 0xd8            ; {t1,t9,t5,tb}
    shufps %5, %3, 0x8d            ; {t2,ta,t6,tc}
    mova   %3, %1
    mova   %4, %2
    addps  %1, %6                  ; {r0,r1,r2,r3}
    addps  %2, %5                  ; {i0,i1,i2,i3}
    subps  %3, %6                  ; {r4,r5,r6,r7}
    subps  %4, %5                  ; {i4,i5,i6,i7}
%endmacro
; PASS_SMALL: one combining pass (complex multiply by the twiddles %2/%3,
; then a radix-4 butterfly across Z(0..7)), instruction order tuned for
; sizes small enough to be cpu-bound. %1 selects (via IF0/IF1) whether
; m4-m7 must be loaded here or are already live.
; scheduled for cpu-bound sizes
%macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
IF%1 mova m4, Z(4)
IF%1 mova m5, Z(5)
    mova  m0, %2         ; wre
    mova  m2, m4
    mova  m1, %3         ; wim
    mova  m3, m5
    mulps m2, m0         ; r2*wre
IF%1 mova m6, Z2(6)
    mulps m3, m1         ; i2*wim
IF%1 mova m7, Z2(7)
    mulps m4, m1         ; r2*wim
    mulps m5, m0         ; i2*wre
    addps m2, m3         ; r2*wre + i2*wim
    mova  m3, m1
    mulps m1, m6         ; r3*wim
    subps m5, m4         ; i2*wre - r2*wim
    mova  m4, m0
    mulps m3, m7         ; i3*wim
    mulps m4, m6         ; r3*wre
    mulps m0, m7         ; i3*wre
    subps m4, m3         ; r3*wre - i3*wim
    mova  m3, Z(0)
    addps m0, m1         ; i3*wre + r3*wim
    mova  m1, m4
    addps m4, m2         ; t5
    subps m1, m2         ; t3
    subps m3, m4         ; r2
    addps m4, Z(0)       ; r0
    mova  m6, Z(2)
    mova  Z(4), m3
    mova  Z(0), m4
    mova  m3, m5
    subps m5, m0         ; t4
    mova  m4, m6
    subps m6, m5         ; r3
    addps m5, m4         ; r1
    mova  Z2(6), m6
    mova  Z(2), m5
    mova  m2, Z(3)
    addps m3, m0         ; t6
    subps m2, m1         ; i3
    mova  m7, Z(1)
    addps m1, Z(3)       ; i1
    mova  Z2(7), m2
    mova  Z(3), m1
    mova  m4, m7
    subps m7, m3         ; i2
    addps m3, m4         ; i0
    mova  Z(5), m7
    mova  Z(1), m3
%endmacro
; PASS_BIG: same butterfly as PASS_SMALL but with twiddles loaded from
; [wq]/[wq+o1q], scheduled for memory-bound (large) sizes. With %1==0 the
; final stores are replaced by an unpck stage that re-interleaves the
; split re/im blocks back into {re,im} pairs (the _interleave variants).
; scheduled to avoid store->load aliasing
%macro PASS_BIG 1 ; (!interleave)
    mova  m4, Z(4)       ; r2
    mova  m5, Z(5)       ; i2
    mova  m2, m4
    mova  m0, [wq]       ; wre
    mova  m3, m5
    mova  m1, [wq+o1q]   ; wim
    mulps m2, m0         ; r2*wre
    mova  m6, Z2(6)      ; r3
    mulps m3, m1         ; i2*wim
    mova  m7, Z2(7)      ; i3
    mulps m4, m1         ; r2*wim
    mulps m5, m0         ; i2*wre
    addps m2, m3         ; r2*wre + i2*wim
    mova  m3, m1
    mulps m1, m6         ; r3*wim
    subps m5, m4         ; i2*wre - r2*wim
    mova  m4, m0
    mulps m3, m7         ; i3*wim
    mulps m4, m6         ; r3*wre
    mulps m0, m7         ; i3*wre
    subps m4, m3         ; r3*wre - i3*wim
    mova  m3, Z(0)
    addps m0, m1         ; i3*wre + r3*wim
    mova  m1, m4
    addps m4, m2         ; t5
    subps m1, m2         ; t3
    subps m3, m4         ; r2
    addps m4, Z(0)       ; r0
    mova  m6, Z(2)
    mova  Z(4), m3
    mova  Z(0), m4
    mova  m3, m5
    subps m5, m0         ; t4
    mova  m4, m6
    subps m6, m5         ; r3
    addps m5, m4         ; r1
IF%1 mova Z2(6), m6
IF%1 mova Z(2), m5
    mova  m2, Z(3)
    addps m3, m0         ; t6
    subps m2, m1         ; i3
    mova  m7, Z(1)
    addps m1, Z(3)       ; i1
IF%1 mova Z2(7), m2
IF%1 mova Z(3), m1
    mova  m4, m7
    subps m7, m3         ; i2
    addps m3, m4         ; i0
IF%1 mova Z(5), m7
IF%1 mova Z(1), m3
%if %1==0
    ; interleave split re/im back into {re,im} pairs before storing
    mova  m4, m5         ; r1
    mova  m0, m6         ; r3
    unpcklps m5, m1
    unpckhps m4, m1
    unpcklps m6, m2
    unpckhps m0, m2
    mova  m1, Z(0)
    mova  m2, Z(4)
    mova  Z(2), m5
    mova  Z(3), m4
    mova  Z2(6), m6
    mova  Z2(7), m0
    mova  m5, m1         ; r0
    mova  m4, m2         ; r2
    unpcklps m1, m3
    unpckhps m5, m3
    unpcklps m2, m7
    unpckhps m4, m7
    mova  Z(0), m1
    mova  Z(1), m5
    mova  Z(4), m2
    mova  Z(5), m4
%endif
%endmacro
; PUNPCK: interleave the dwords of %1 and %2; low halves end up in %1,
; high halves in %3 (%2 is preserved).
%macro PUNPCK 3
    mova      %3, %1
    punpckldq %1, %2
    punpckhdq %3, %2
%endmacro
INIT_XMM
%define mova movaps
; Z/Z2 address the in-place work buffer in units of one SIMD register.
%define Z(x) [r0+mmsize*x]
%define Z2(x) [r0+mmsize*x]

; fft4_sse: in-place 4-point FFT. In: r0 = data. Clobbers m0-m2.
align 16
fft4_sse:
    mova   m0, Z(0)
    mova   m1, Z(1)
    T4_SSE m0, m1, m2
    mova   Z(0), m0
    mova   Z(1), m1
    ret
; fft8_sse: in-place 8-point FFT. In: r0 = data. Clobbers m0-m5.
align 16
fft8_sse:
    mova   m0, Z(0)
    mova   m1, Z(1)
    T4_SSE m0, m1, m2
    mova   m2, Z(2)
    mova   m3, Z(3)
    T8_SSE m0, m1, m2, m3, m4, m5
    mova   Z(0), m0
    mova   Z(1), m1
    mova   Z(2), m2
    mova   Z(3), m3
    ret
; fft16_sse: in-place 16-point FFT — an 8-point on the first half, two
; 4-point transforms on the second half, then one PASS_SMALL combining
; pass with the cos_16 twiddles. In: r0 = data. Clobbers m0-m7.
align 16
fft16_sse:
    mova   m0, Z(0)
    mova   m1, Z(1)
    T4_SSE m0, m1, m2
    mova   m2, Z(2)
    mova   m3, Z(3)
    T8_SSE m0, m1, m2, m3, m4, m5
    mova   m4, Z(4)
    mova   m5, Z(5)
    mova   Z(0), m0
    mova   Z(1), m1
    mova   Z(2), m2
    mova   Z(3), m3
    T4_SSE m4, m5, m6
    mova   m6, Z2(6)
    mova   m7, Z2(7)
    T4_SSE m6, m7, m0
    PASS_SMALL 0, [cos_16], [cos_16+16]
    ret
INIT_MMX

; FFT48_3DN: instantiate fft4%1 and fft8%1 for one 3DNow flavor
; (%1 = _3dn or _3dn2). Both operate in place on r0; PUNPCK leaves the
; output in the interleaved block layout described at the top of the file.
%macro FFT48_3DN 1
align 16
fft4%1:
    T2_3DN m0, m1, Z(0), Z(1)
    mova   m2, Z(2)
    mova   m3, Z(3)
    T4_3DN m0, m1, m2, m3, m4, m5
    PUNPCK m0, m1, m4
    PUNPCK m2, m3, m5
    mova   Z(0), m0
    mova   Z(1), m4
    mova   Z(2), m2
    mova   Z(3), m5
    ret

align 16
fft8%1:
    T2_3DN m0, m1, Z(0), Z(1)
    mova   m2, Z(2)
    mova   m3, Z(3)
    T4_3DN m0, m1, m2, m3, m4, m5
    mova   Z(0), m0
    mova   Z(2), m2
    T2_3DN m4, m5, Z(4), Z(5)
    T2_3DN m6, m7, Z2(6), Z2(7)
    pswapd m0, m5
    pswapd m2, m7
    pxor   m0, [ps_m1p1]
    pxor   m2, [ps_m1p1]
    pfsub  m5, m0
    pfadd  m7, m2
    pfmul  m5, [ps_root2]    ; odd twiddle: scale by 1/sqrt(2)
    pfmul  m7, [ps_root2]
    T4_3DN m1, m3, m5, m7, m0, m2
    mova   Z(5), m5
    mova   Z2(7), m7
    mova   m0, Z(0)
    mova   m2, Z(2)
    T4_3DN m0, m2, m4, m6, m5, m7
    PUNPCK m0, m1, m5
    PUNPCK m2, m3, m7
    mova   Z(0), m0
    mova   Z(1), m5
    mova   Z(2), m2
    mova   Z(3), m7
    PUNPCK m4, Z(5), m5
    PUNPCK m6, Z2(7), m7
    mova   Z(4), m4
    mova   Z(5), m5
    mova   Z2(6), m6
    mova   Z2(7), m7
    ret
%endmacro

; Extended 3DNow (Athlon XP+) has a native pswapd instruction.
FFT48_3DN _3dn2
; Emulate pswapd (swap the two dwords of an MMX register) for plain 3DNow,
; then instantiate the 3DNow versions of fft4/fft8 using it.
; NOTE(review): the reg==reg form spills through [r0+8..15]; it assumes that
; part of the buffer is scratch at every call site — verify before reuse.
%macro pswapd 2
%ifidn %1, %2
    movd      [r0+12], %1
    punpckhdq %1, [r0+8]
%else
    movq      %1, %2
    psrlq     %1, 32
    punpckldq %1, %2
%endif
%endmacro

FFT48_3DN _3dn
; Z/Z2 addressing for the big passes: zq = data base, o1q = n*8,
; o3q = n*48 (set up in DECL_PASS below).
%define Z(x) [zq + o1q*(x&6) + mmsize*(x&1)]
%define Z2(x) [zq + o3q + mmsize*(x&1)]

; DECL_PASS name, payload: emit one pass function that loops the payload
; macro over the data, advancing the data/twiddle pointers and counting
; nd down to zero.
%macro DECL_PASS 2+ ; name, payload
align 16
%1:
DEFINE_ARGS z, w, n, o1, o3
    lea o3q, [nq*3]
    lea o1q, [nq*8]
    shl o3q, 4          ; o3 = n*48
.loop:
    %2
    add zq, mmsize*2
    add wq, mmsize
    sub nd, mmsize/8
    jg  .loop
    rep ret             ; 2-byte ret (branch-predictor-friendly return)
%endmacro
INIT_XMM
%define mova movaps
DECL_PASS pass_sse, PASS_BIG 1
DECL_PASS pass_interleave_sse, PASS_BIG 0

INIT_MMX
; Alias the SSE mnemonics to their 3DNow equivalents so PASS_SMALL and
; PASS_BIG can be reused unchanged on MMX registers.
%define mulps pfmul
%define addps pfadd
%define subps pfsub
%define unpcklps punpckldq
%define unpckhps punpckhdq
DECL_PASS pass_3dn, PASS_SMALL 1, [wq], [wq+o1q]
DECL_PASS pass_interleave_3dn, PASS_BIG 0
; The 3DNow2 builds share the pass code with plain 3DNow.
%define pass_3dn2 pass_3dn
%define pass_interleave_3dn2 pass_interleave_3dn
; Dispatch-table entries are section-relative when building PIC,
; absolute otherwise.
%ifdef PIC
%define SECTION_REL - $$
%else
%define SECTION_REL
%endif

; FFT_DISPATCH suffix, nbits: call the fft of size 1<<nbits via the
; dispatch table (entries start at nbits==2).
%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
    lea  r2, [dispatch_tab%1]
    mov  r2, [r2 + (%2q-2)*gprsize]
%ifdef PIC
    lea  r3, [$$]
    add  r2, r3          ; rebase the section-relative entry
%endif
    call r2
%endmacro ; FFT_DISPATCH
; DECL_FFT nbits, cpu, [suffix]: emit fft(1<<nbits)..fft(65536) for one CPU
; flavor, build its dispatch table, and define the public entry point.
; Each generated fftN performs one half-size and two quarter-size sub-FFTs,
; then tail-jumps into the combining pass.
%macro DECL_FFT 2-3 ; nbits, cpu, suffix
%xdefine list_of_fft fft4%2 SECTION_REL, fft8%2 SECTION_REL
%if %1==5
%xdefine list_of_fft list_of_fft, fft16%2 SECTION_REL
%endif

%assign n 1<<%1
%rep 17-%1
%assign n2 n/2
%assign n4 n/4
%xdefine list_of_fft list_of_fft, fft %+ n %+ %3%2 SECTION_REL

align 16
fft %+ n %+ %3%2:
    call fft %+ n2 %+ %2
    add  r0, n*4 - (n&(-2<<%1))     ; advance r0 to the sub-blocks
    call fft %+ n4 %+ %2            ; (offsets depend on the block layout)
    add  r0, n*2 - (n2&(-2<<%1))
    call fft %+ n4 %+ %2
    sub  r0, n*6 + (n2&(-2<<%1))    ; rewind to the start of the block
    lea  r1, [cos_ %+ n]
    mov  r2d, n4/2
    jmp  pass%3%2                   ; tail-call the combining pass

%assign n n*2
%endrep
%undef n

align 8
dispatch_tab%3%2: pointer list_of_fft

section .text
; On x86_32, this function does the register saving and restoring for all of fft.
; The others pass args in registers and don't spill anything.
cglobal fft_dispatch%3%2, 2,5,8, z, nbits
    FFT_DISPATCH %3%2, nbits
    RET
%endmacro ; DECL_FFT

DECL_FFT 5, _sse
DECL_FFT 5, _sse, _interleave
DECL_FFT 4, _3dn
DECL_FFT 4, _3dn, _interleave
DECL_FFT 4, _3dn2
DECL_FFT 4, _3dn2, _interleave
INIT_XMM
; Drop the 3DNow aliases; the iMDCT code below is SSE-only.
%undef mulps
%undef addps
%undef subps
%undef unpcklps
%undef unpckhps

; PREROTATER: iMDCT pre-rotation. Loads 4 complex values from both ends of
; the input, complex-multiplies them by the cos/sin twiddles, and leaves the
; rotated pairs in xmm0/xmm1 for the bit-reversed scatter in the caller.
%macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8
    movaps   xmm0, [%3+%2*4]
    movaps   xmm1, [%3+%1*4-0x10]
    movaps   xmm2, xmm0
    shufps   xmm0, xmm1, 0x88
    shufps   xmm1, xmm2, 0x77
    movlps   xmm4, [%4+%2*2]
    movlps   xmm5, [%5+%2*2+0x0]
    movhps   xmm4, [%4+%1*2-0x8]
    movhps   xmm5, [%5+%1*2-0x8]
    movaps   xmm2, xmm0
    movaps   xmm3, xmm1
    mulps    xmm0, xmm5
    mulps    xmm1, xmm4
    mulps    xmm2, xmm4
    mulps    xmm3, xmm5
    subps    xmm1, xmm0      ; cross terms of the complex rotation
    addps    xmm2, xmm3
    movaps   xmm0, xmm1
    unpcklps xmm1, xmm2      ; re-interleave into {re,im} pairs
    unpckhps xmm0, xmm2
%endmacro
; CMUL: complex-multiply the 4 interleaved complex values at [%4+%1*2] by
; the tcos/tsin twiddles at offset %1, leaving the two result component
; vectors in %2 and %3. Clobbers xmm6/xmm7.
%macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
    movaps xmm6, [%4+%1*2]
    movaps %2, [%4+%1*2+0x10]
    movaps %3, xmm6
    movaps xmm7, %2
    mulps  xmm6, [%5+%1]
    mulps  %2, [%6+%1]
    mulps  %3, [%6+%1]
    mulps  xmm7, [%5+%1]
    subps  %2, xmm6
    addps  %3, xmm7
%endmacro
; POSROTATESHUF: iMDCT post-rotation and reordering loop. %1 walks upward
; and %2 downward until they cross; each iteration rotates two 4-sample
; groups (via CMUL) and stores them back mirrored and re-interleaved.
%macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8
.post:
    CMUL     %1, xmm0, xmm1, %3, %4, %5
    CMUL     %2, xmm4, xmm5, %3, %4, %5
    shufps   xmm1, xmm1, 0x1b      ; reverse the 4 floats
    shufps   xmm5, xmm5, 0x1b
    movaps   xmm6, xmm4
    unpckhps xmm4, xmm1
    unpcklps xmm6, xmm1
    movaps   xmm2, xmm0
    unpcklps xmm0, xmm5
    unpckhps xmm2, xmm5
    movaps   [%3+%2*2], xmm6
    movaps   [%3+%2*2+0x10], xmm4
    movaps   [%3+%1*2], xmm0
    movaps   [%3+%1*2+0x10], xmm2
    sub      %2, 0x10
    add      %1, 0x10
    jl       .post
%endmacro
; imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input)
; Half inverse MDCT: pre-rotate the input into bit-reversed order in
; output, run the in-place FFT, then post-rotate/reorder the result.
cglobal imdct_half_sse, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample *input
%ifdef ARCH_X86_64
%define rrevtab r10
%define rtcos r11
%define rtsin r12
    push r12             ; r12-r14 are callee-saved
    push r13
    push r14
%else
; x86_32 has too few registers: the three pointers share r5/r6 and are
; spilled to the stack (pushes below) between uses.
%define rrevtab r6
%define rtsin r6
%define rtcos r5
%endif
    mov  r3d, [r0+FFTContext.mdctsize]
    add  r2, r3          ; input += mdctsize bytes
    shr  r3, 1
    mov  rtcos, [r0+FFTContext.tcos]
    mov  rtsin, [r0+FFTContext.tsin]
    add  rtcos, r3       ; point twiddles at the middle of the tables
    add  rtsin, r3
%ifndef ARCH_X86_64
    push rtcos           ; -> [esp+8] after all three pushes
    push rtsin           ; -> [esp+4]
%endif
    shr  r3, 1
    mov  rrevtab, [r0+FFTContext.revtab]
    add  rrevtab, r3
%ifndef ARCH_X86_64
    push rrevtab         ; -> [esp]
%endif
    sub  r3, 4
%ifdef ARCH_X86_64
    xor  r4, r4
    sub  r4, r3          ; r4 = -r3: counts up while r3 counts down
%endif
.pre:
%ifndef ARCH_X86_64
;unspill
    xor  r4, r4
    sub  r4, r3
    mov  rtsin, [esp+4]
    mov  rtcos, [esp+8]
%endif
    PREROTATER r4, r3, r2, rtcos, rtsin
%ifdef ARCH_X86_64
    ; scatter the 4 rotated complex values through the bit-reversal table
    movzx r5,  word [rrevtab+r4-4]
    movzx r6,  word [rrevtab+r4-2]
    movzx r13, word [rrevtab+r3]
    movzx r14, word [rrevtab+r3+2]
    movlps [r1+r5 *8], xmm0
    movhps [r1+r6 *8], xmm0
    movlps [r1+r13*8], xmm1
    movhps [r1+r14*8], xmm1
    add  r4, 4
%else
    mov  r6, [esp]       ; reload rrevtab
    movzx r5, word [r6+r4-4]
    movzx r4, word [r6+r4-2]
    movlps [r1+r5*8], xmm0
    movhps [r1+r4*8], xmm0
    movzx r5, word [r6+r3]
    movzx r4, word [r6+r3+2]
    movlps [r1+r5*8], xmm1
    movhps [r1+r4*8], xmm1
%endif
    sub  r3, 4
    jns  .pre

    ; in-place FFT on the pre-rotated data in output
    mov  r5, r0
    mov  r6, r1
    mov  r0, r1
    mov  r1d, [r5+FFTContext.nbits]
    FFT_DISPATCH _sse, r1

    mov  r0d, [r5+FFTContext.mdctsize]
    add  r6, r0          ; r6 = output + mdctsize bytes
    shr  r0, 1
%ifndef ARCH_X86_64
%define rtcos r2
%define rtsin r3
    mov  rtcos, [esp+8]
    mov  rtsin, [esp+4]
%endif
    neg  r0
    mov  r1, -16
    sub  r1, r0
    POSROTATESHUF r0, r1, r6, rtcos, rtsin
%ifdef ARCH_X86_64
    pop  r14
    pop  r13
    pop  r12
%else
    add  esp, 12         ; drop the three spilled pointers
%endif
    RET