;******************************************************************************
;* FFT transform with SSE/3DNow optimizations
;* Copyright (c) 2008 Loren Merritt
;*
;* This algorithm (though not any of the implementation details) is
;* based on libdjbfft by D. J. Bernstein.
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
; These functions are not individually interchangeable with the C versions.
; While the C versions take arrays of FFTComplex, SSE/3DNow leave intermediate
; results in blocks as convenient to the vector size,
; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)
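; For example, with SSE an 8-point block is stored as
;   {r0,r1,r2,r3, i0,i1,i2,i3, r4,r5,r6,r7, i4,i5,i6,i7}
; rather than the C layout {r0,i0, r1,i1, ...}.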
%include "x86inc.asm"

%ifdef ARCH_X86_64
%define pointer resq
%else
%define pointer resd
%endif

struc FFTContext
    .nbits:    resd 1
    .reverse:  resd 1
    .revtab:   pointer 1
    .tmpbuf:   pointer 1
    .mdctsize: resd 1
    .mdctbits: resd 1
    .tcos:     pointer 1
    .tsin:     pointer 1
endstruc

SECTION_RODATA

%define M_SQRT1_2 0.70710678118654752440
ps_root2:     times 4 dd M_SQRT1_2
ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
ps_p1p1m1p1:  dd 0, 0, 1<<31, 0
ps_m1p1:      dd 1<<31, 0

%assign i 16
%rep 13
cextern cos_ %+ i
%assign i i<<1
%endrep

%ifdef ARCH_X86_64
%define pointer dq
%else
%define pointer dd
%endif

%macro IF0 1+
%endmacro
%macro IF1 1+
%1
%endmacro

section .text align=16
%macro T2_3DN 4 ; z0, z1, mem0, mem1
    mova  %1, %3
    mova  %2, %1
    pfadd %1, %4
    pfsub %2, %4
%endmacro

%macro T4_3DN 6 ; z0, z1, z2, z3, tmp0, tmp1
    mova   %5, %3
    pfsub  %3, %4
    pfadd  %5, %4         ; {t6,t5}
    pxor   %3, [ps_m1p1]  ; {t8,t7}
    mova   %6, %1
    pswapd %3, %3
    pfadd  %1, %5         ; {r0,i0}
    pfsub  %6, %5         ; {r2,i2}
    mova   %4, %2
    pfadd  %2, %3         ; {r1,i1}
    pfsub  %4, %3         ; {r3,i3}
    SWAP   %3, %6
%endmacro

; in:  %1={r0,i0,r2,i2} %2={r1,i1,r3,i3}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
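; A 4-point FFT done entirely in registers: one add/sub stage, a sign flip
; for the +-i twiddle (ps_p1p1m1p1), a second add/sub stage, then shuffles
; into planar {reals}{imags} order.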
%macro T4_SSE 3
    mova   %3, %1
    addps  %1, %2        ; {t1,t2,t6,t5}
    subps  %3, %2        ; {t3,t4,-t8,t7}
    xorps  %3, [ps_p1p1m1p1]
    mova   %2, %1
    shufps %1, %3, 0x44  ; {t1,t2,t3,t4}
    shufps %2, %3, 0xbe  ; {t6,t5,t7,t8}
    mova   %3, %1
    addps  %1, %2        ; {r0,i0,r1,i1}
    subps  %3, %2        ; {r2,i2,r3,i3}
    mova   %2, %1
    shufps %1, %3, 0x88  ; {r0,r1,r2,r3}
    shufps %2, %3, 0xdd  ; {i0,i1,i2,i3}
%endmacro
; in:  %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,r5,r6,r7} %4={i4,i5,i6,i7}
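; Extends two 4-point halves to an 8-point FFT: the odd half (%3/%4) is
; butterflied and its cross terms rotated by sqrt(1/2)*(+-1 +-i) via the
; ps_root2/ps_root2mppm constants, then added/subtracted against the
; already-transformed even half in %1/%2.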
%macro T8_SSE 6
    mova   %6, %3
    subps  %3, %4              ; {r5,i5,r7,i7}
    addps  %6, %4              ; {t1,t2,t3,t4}
    mova   %4, %3
    shufps %4, %4, 0xb1        ; {i5,r5,i7,r7}
    mulps  %3, [ps_root2mppm]  ; {-r5,i5,r7,-i7}
    mulps  %4, [ps_root2]
    addps  %3, %4              ; {t8,t7,ta,t9}
    mova   %4, %6
    shufps %6, %3, 0x36        ; {t3,t2,t9,t8}
    shufps %4, %3, 0x9c        ; {t1,t4,t7,ta}
    mova   %3, %6
    addps  %6, %4              ; {t1,t2,t9,ta}
    subps  %3, %4              ; {t6,t5,tc,tb}
    mova   %4, %6
    shufps %6, %3, 0xd8        ; {t1,t9,t5,tb}
    shufps %4, %3, 0x8d        ; {t2,ta,t6,tc}
    mova   %3, %1
    mova   %5, %2
    addps  %1, %6              ; {r0,r1,r2,r3}
    addps  %2, %4              ; {i0,i1,i2,i3}
    subps  %3, %6              ; {r4,r5,r6,r7}
    subps  %5, %4              ; {i4,i5,i6,i7}
    SWAP   %4, %5
%endmacro

; scheduled for cpu-bound sizes
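; One split-radix combine pass: the two odd-quarter sub-transforms (Z(4..7))
; are multiplied by the twiddle factors wre/wim and butterflied against the
; even half (Z(0..3)). The IF%1 loads let the caller run the macro with
; m4-m7 already loaded.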
%macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
IF%1 mova m4, Z(4)
IF%1 mova m5, Z(5)
    mova  m0, %2     ; wre
    mova  m2, m4
    mova  m1, %3     ; wim
    mova  m3, m5
    mulps m2, m0     ; r2*wre
IF%1 mova m6, Z2(6)
    mulps m3, m1     ; i2*wim
IF%1 mova m7, Z2(7)
    mulps m4, m1     ; r2*wim
    mulps m5, m0     ; i2*wre
    addps m2, m3     ; r2*wre + i2*wim
    mova  m3, m1
    mulps m1, m6     ; r3*wim
    subps m5, m4     ; i2*wre - r2*wim
    mova  m4, m0
    mulps m3, m7     ; i3*wim
    mulps m4, m6     ; r3*wre
    mulps m0, m7     ; i3*wre
    subps m4, m3     ; r3*wre - i3*wim
    mova  m3, Z(0)
    addps m0, m1     ; i3*wre + r3*wim
    mova  m1, m4
    addps m4, m2     ; t5
    subps m1, m2     ; t3
    subps m3, m4     ; r2
    addps m4, Z(0)   ; r0
    mova  m6, Z(2)
    mova  Z(4), m3
    mova  Z(0), m4
    mova  m3, m5
    subps m5, m0     ; t4
    mova  m4, m6
    subps m6, m5     ; r3
    addps m5, m4     ; r1
    mova  Z2(6), m6
    mova  Z(2), m5
    mova  m2, Z(3)
    addps m3, m0     ; t6
    subps m2, m1     ; i3
    mova  m7, Z(1)
    addps m1, Z(3)   ; i1
    mova  Z2(7), m2
    mova  Z(3), m1
    mova  m4, m7
    subps m7, m3     ; i2
    addps m3, m4     ; i0
    mova  Z(5), m7
    mova  Z(1), m3
%endmacro

; scheduled to avoid store->load aliasing
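; Same dataflow as PASS_SMALL, with twiddles loaded from [wq]/[wq+o1q] and
; the stores ordered so that no value is reloaded from an address it was
; just written to. With %1==0 the tail also interleaves the planar
; {reals}{imags} blocks back into {re,im} pairs for the C layout.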
%macro PASS_BIG 1 ; (!interleave)
    mova  m4, Z(4)       ; r2
    mova  m5, Z(5)       ; i2
    mova  m2, m4
    mova  m0, [wq]       ; wre
    mova  m3, m5
    mova  m1, [wq+o1q]   ; wim
    mulps m2, m0         ; r2*wre
    mova  m6, Z2(6)      ; r3
    mulps m3, m1         ; i2*wim
    mova  m7, Z2(7)      ; i3
    mulps m4, m1         ; r2*wim
    mulps m5, m0         ; i2*wre
    addps m2, m3         ; r2*wre + i2*wim
    mova  m3, m1
    mulps m1, m6         ; r3*wim
    subps m5, m4         ; i2*wre - r2*wim
    mova  m4, m0
    mulps m3, m7         ; i3*wim
    mulps m4, m6         ; r3*wre
    mulps m0, m7         ; i3*wre
    subps m4, m3         ; r3*wre - i3*wim
    mova  m3, Z(0)
    addps m0, m1         ; i3*wre + r3*wim
    mova  m1, m4
    addps m4, m2         ; t5
    subps m1, m2         ; t3
    subps m3, m4         ; r2
    addps m4, Z(0)       ; r0
    mova  m6, Z(2)
    mova  Z(4), m3
    mova  Z(0), m4
    mova  m3, m5
    subps m5, m0         ; t4
    mova  m4, m6
    subps m6, m5         ; r3
    addps m5, m4         ; r1
IF%1 mova Z2(6), m6
IF%1 mova Z(2), m5
    mova  m2, Z(3)
    addps m3, m0         ; t6
    subps m2, m1         ; i3
    mova  m7, Z(1)
    addps m1, Z(3)       ; i1
IF%1 mova Z2(7), m2
IF%1 mova Z(3), m1
    mova  m4, m7
    subps m7, m3         ; i2
    addps m3, m4         ; i0
IF%1 mova Z(5), m7
IF%1 mova Z(1), m3
%if %1==0
    mova     m4, m5      ; r1
    mova     m0, m6      ; r3
    unpcklps m5, m1
    unpckhps m4, m1
    unpcklps m6, m2
    unpckhps m0, m2
    mova     m1, Z(0)
    mova     m2, Z(4)
    mova     Z(2), m5
    mova     Z(3), m4
    mova     Z2(6), m6
    mova     Z2(7), m0
    mova     m5, m1      ; r0
    mova     m4, m2      ; r2
    unpcklps m1, m3
    unpckhps m5, m3
    unpcklps m2, m7
    unpckhps m4, m7
    mova     Z(0), m1
    mova     Z(1), m5
    mova     Z(4), m2
    mova     Z(5), m4
%endif
%endmacro
%macro PUNPCK 3
    mova      %3, %1
    punpckldq %1, %2
    punpckhdq %3, %2
%endmacro

INIT_XMM
%define mova movaps
%define Z(x)  [r0+mmsize*x]
%define Z2(x) [r0+mmsize*x]

align 16
fft4_sse:
    mova m0, Z(0)
    mova m1, Z(1)
    T4_SSE m0, m1, m2
    mova Z(0), m0
    mova Z(1), m1
    ret

align 16
fft8_sse:
    mova m0, Z(0)
    mova m1, Z(1)
    T4_SSE m0, m1, m2
    mova m2, Z(2)
    mova m3, Z(3)
    T8_SSE m0, m1, m2, m3, m4, m5
    mova Z(0), m0
    mova Z(1), m1
    mova Z(2), m2
    mova Z(3), m3
    ret

align 16
fft16_sse:
    mova m0, Z(0)
    mova m1, Z(1)
    T4_SSE m0, m1, m2
    mova m2, Z(2)
    mova m3, Z(3)
    T8_SSE m0, m1, m2, m3, m4, m5
    mova m4, Z(4)
    mova m5, Z(5)
    mova Z(0), m0
    mova Z(1), m1
    mova Z(2), m2
    mova Z(3), m3
    T4_SSE m4, m5, m6
    mova m6, Z2(6)
    mova m7, Z2(7)
    T4_SSE m6, m7, m0
    PASS_SMALL 0, [cos_16], [cos_16+16]
    ret

INIT_MMX

%macro FFT48_3DN 1
align 16
fft4%1:
    T2_3DN m0, m1, Z(0), Z(1)
    mova m2, Z(2)
    mova m3, Z(3)
    T4_3DN m0, m1, m2, m3, m4, m5
    PUNPCK m0, m1, m4
    PUNPCK m2, m3, m5
    mova Z(0), m0
    mova Z(1), m4
    mova Z(2), m2
    mova Z(3), m5
    ret

align 16
fft8%1:
    T2_3DN m0, m1, Z(0), Z(1)
    mova m2, Z(2)
    mova m3, Z(3)
    T4_3DN m0, m1, m2, m3, m4, m5
    mova Z(0), m0
    mova Z(2), m2
    T2_3DN m4, m5, Z(4), Z(5)
    T2_3DN m6, m7, Z2(6), Z2(7)
    pswapd m0, m5
    pswapd m2, m7
    pxor m0, [ps_m1p1]
    pxor m2, [ps_m1p1]
    pfsub m5, m0
    pfadd m7, m2
    pfmul m5, [ps_root2]
    pfmul m7, [ps_root2]
    T4_3DN m1, m3, m5, m7, m0, m2
    mova Z(5), m5
    mova Z2(7), m7
    mova m0, Z(0)
    mova m2, Z(2)
    T4_3DN m0, m2, m4, m6, m5, m7
    PUNPCK m0, m1, m5
    PUNPCK m2, m3, m7
    mova Z(0), m0
    mova Z(1), m5
    mova Z(2), m2
    mova Z(3), m7
    PUNPCK m4, Z(5), m5
    PUNPCK m6, Z2(7), m7
    mova Z(4), m4
    mova Z(5), m5
    mova Z2(6), m6
    mova Z2(7), m7
    ret
%endmacro

FFT48_3DN _3dn2
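; Plain 3DNow (e.g. K6-2) lacks the 3DNow!-ext pswapd instruction, so it is
; emulated here (swap the two dwords of an MMX register) before instantiating
; the _3dn versions. The dst==src case borrows [r0+8..15] as scratch memory.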
%macro pswapd 2
%ifidn %1, %2
    movd [r0+12], %1
    punpckhdq %1, [r0+8]
%else
    movq %1, %2
    psrlq %1, 32
    punpckldq %1, %2
%endif
%endmacro
FFT48_3DN _3dn

%define Z(x)  [zq + o1q*(x&6) + mmsize*(x&1)]
%define Z2(x) [zq + o3q + mmsize*(x&1)]
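; In the pass loops, Z(0..5)/Z2(6..7) address the {reals}/{imags} vector pair
; of each quarter of the current sub-transform: x&6 selects the quarter
; (2*o1q bytes apart) and mmsize*(x&1) selects the real or imaginary vector.
; The last quarter needs its own base register o3q because 6 is not an
; encodable index scale factor.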
%macro DECL_PASS 2+ ; name, payload
align 16
%1:
DEFINE_ARGS z, w, n, o1, o3
    lea o3q, [nq*3]
    lea o1q, [nq*8]
    shl o3q, 4
.loop:
    %2
    add zq, mmsize*2
    add wq, mmsize
    sub nd, mmsize/8
    jg .loop
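    ; rep ret: a bare ret as a branch target is mispredicted on AMD K8/K10;
    ; the redundant rep prefix avoids the penalty and is ignored elsewhere.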
    rep ret
%endmacro

INIT_XMM
%define mova movaps
DECL_PASS pass_sse, PASS_BIG 1
DECL_PASS pass_interleave_sse, PASS_BIG 0

INIT_MMX
%define mulps pfmul
%define addps pfadd
%define subps pfsub
%define unpcklps punpckldq
%define unpckhps punpckhdq
DECL_PASS pass_3dn, PASS_SMALL 1, [wq], [wq+o1q]
DECL_PASS pass_interleave_3dn, PASS_BIG 0
%define pass_3dn2 pass_3dn
%define pass_interleave_3dn2 pass_interleave_3dn

%ifdef PIC
%define SECTION_REL - $$
%else
%define SECTION_REL
%endif
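; Calls the fft routine for size 2^nbits: dispatch_tab is indexed with
; nbits-2 (its smallest entry is fft4 = 2^2). In PIC builds the table holds
; section-relative entries (SECTION_REL), rebased here against $$.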
%macro FFT_DISPATCH 2 ; clobbers 5 GPRs, 8 XMMs
    lea r2, [dispatch_tab%1]
    mov r2, [r2 + (%2q-2)*gprsize]
%ifdef PIC
    lea r3, [$$]
    add r2, r3
%endif
    call r2
%endmacro ; FFT_DISPATCH
%macro DECL_FFT 2-3 ; nbits, cpu, suffix
%xdefine list_of_fft fft4%2 SECTION_REL, fft8%2 SECTION_REL
%if %1==5
%xdefine list_of_fft list_of_fft, fft16%2 SECTION_REL
%endif

%assign n 1<<%1
%rep 17-%1
%assign n2 n/2
%assign n4 n/4
%xdefine list_of_fft list_of_fft, fft %+ n %+ %3%2 SECTION_REL

align 16
fft %+ n %+ %3%2:
    call fft %+ n2 %+ %2
    add r0, n*4 - (n&(-2<<%1))
    call fft %+ n4 %+ %2
    add r0, n*2 - (n2&(-2<<%1))
    call fft %+ n4 %+ %2
    sub r0, n*6 + (n2&(-2<<%1))
    lea r1, [cos_ %+ n]
    mov r2d, n4/2
    jmp pass%3%2

%assign n n*2
%endrep
%undef n

align 8
dispatch_tab%3%2: pointer list_of_fft

section .text

; On x86_32, this function does the register saving and restoring for all of fft.
; The others pass args in registers and don't spill anything.
cglobal fft_dispatch%3%2, 2,5,8, z, nbits
    FFT_DISPATCH %3%2, nbits
    RET
%endmacro ; DECL_FFT

DECL_FFT 5, _sse
DECL_FFT 5, _sse, _interleave
DECL_FFT 4, _3dn
DECL_FFT 4, _3dn, _interleave
DECL_FFT 4, _3dn2
DECL_FFT 4, _3dn2, _interleave

INIT_XMM
%undef mulps
%undef addps
%undef subps
%undef unpcklps
%undef unpckhps
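; ---- IMDCT ----
; PREROTATER: the pre-rotation step of the inverse MDCT. Two vectors are
; read from opposite ends of the input, de-interleaved, and complex-
; multiplied by the (tcos, tsin) twiddles; the rotated values are left in
; xmm0/xmm1 for the bit-reversed stores in the .pre loop below.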
%macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8
    movaps   xmm0, [%3+%2*4]
    movaps   xmm1, [%3+%1*4-0x10]
    movaps   xmm2, xmm0
    shufps   xmm0, xmm1, 0x88
    shufps   xmm1, xmm2, 0x77
    movlps   xmm4, [%4+%2*2]
    movlps   xmm5, [%5+%2*2+0x0]
    movhps   xmm4, [%4+%1*2-0x8]
    movhps   xmm5, [%5+%1*2-0x8]
    movaps   xmm2, xmm0
    movaps   xmm3, xmm1
    mulps    xmm0, xmm5
    mulps    xmm1, xmm4
    mulps    xmm2, xmm4
    mulps    xmm3, xmm5
    subps    xmm1, xmm0
    addps    xmm2, xmm3
    movaps   xmm0, xmm1
    unpcklps xmm1, xmm2
    unpckhps xmm0, xmm2
%endmacro
%macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
    movaps xmm6, [%4+%1*2]
    movaps %2,   [%4+%1*2+0x10]
    movaps %3,   xmm6
    movaps xmm7, %2
    mulps  xmm6, [%5+%1]
    mulps  %2,   [%6+%1]
    mulps  %3,   [%6+%1]
    mulps  xmm7, [%5+%1]
    subps  %2,   xmm6
    addps  %3,   xmm7
%endmacro
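; POSROTATESHUF: the post-rotation step. CMUL complex-multiplies a block of
; FFT output by the twiddles; blocks taken from the two ends of the buffer
; are then shuffled (imm8 0x1b reverses a vector), interleaved and stored,
; with %1 walking up and %2 walking down until they cross.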
%macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8
.post:
    CMUL %1, xmm0, xmm1, %3, %4, %5
    CMUL %2, xmm4, xmm5, %3, %4, %5
    shufps   xmm1, xmm1, 0x1b
    shufps   xmm5, xmm5, 0x1b
    movaps   xmm6, xmm4
    unpckhps xmm4, xmm1
    unpcklps xmm6, xmm1
    movaps   xmm2, xmm0
    unpcklps xmm0, xmm5
    unpckhps xmm2, xmm5
    movaps   [%3+%2*2],      xmm6
    movaps   [%3+%2*2+0x10], xmm4
    movaps   [%3+%1*2],      xmm0
    movaps   [%3+%1*2+0x10], xmm2
    sub %2, 0x10
    add %1, 0x10
    jl .post
%endmacro
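; imdct_half_sse runs in three phases, mirroring the C version:
;   1. .pre  - pre-rotate the input and scatter it into the output buffer
;              in bit-reversed order (via revtab);
;   2.         in-place complex FFT of the output buffer (FFT_DISPATCH);
;   3. .post - post-rotate and reorder the FFT result (POSROTATESHUF).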
cglobal imdct_half_sse, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample *input
%ifdef ARCH_X86_64
%define rrevtab r10
%define rtcos   r11
%define rtsin   r12
    push r12
    push r13
    push r14
%else
%define rrevtab r6
%define rtsin   r6
%define rtcos   r5
%endif
    mov r3d, [r0+FFTContext.mdctsize]
    add r2, r3
    shr r3, 1
    mov rtcos, [r0+FFTContext.tcos]
    mov rtsin, [r0+FFTContext.tsin]
    add rtcos, r3
    add rtsin, r3
%ifndef ARCH_X86_64
    push rtcos
    push rtsin
%endif
    shr r3, 1
    mov rrevtab, [r0+FFTContext.revtab]
    add rrevtab, r3
%ifndef ARCH_X86_64
    push rrevtab
%endif
    sub r3, 4
%ifdef ARCH_X86_64
    xor r4, r4
    sub r4, r3
%endif
.pre:
%ifndef ARCH_X86_64
    ;unspill
    xor r4, r4
    sub r4, r3
    mov rtsin, [esp+4]
    mov rtcos, [esp+8]
%endif
    PREROTATER r4, r3, r2, rtcos, rtsin
%ifdef ARCH_X86_64
    movzx r5,  word [rrevtab+r4-4]
    movzx r6,  word [rrevtab+r4-2]
    movzx r13, word [rrevtab+r3]
    movzx r14, word [rrevtab+r3+2]
    movlps [r1+r5 *8], xmm0
    movhps [r1+r6 *8], xmm0
    movlps [r1+r13*8], xmm1
    movhps [r1+r14*8], xmm1
    add r4, 4
%else
    mov r6, [esp]
    movzx r5, word [r6+r4-4]
    movzx r4, word [r6+r4-2]
    movlps [r1+r5*8], xmm0
    movhps [r1+r4*8], xmm0
    movzx r5, word [r6+r3]
    movzx r4, word [r6+r3+2]
    movlps [r1+r5*8], xmm1
    movhps [r1+r4*8], xmm1
%endif
    sub r3, 4
    jns .pre

    mov r5, r0
    mov r6, r1
    mov r0, r1
    mov r1d, [r5+FFTContext.nbits]

    FFT_DISPATCH _sse, r1

    mov r0d, [r5+FFTContext.mdctsize]
    add r6, r0
    shr r0, 1
%ifndef ARCH_X86_64
%define rtcos r2
%define rtsin r3
    mov rtcos, [esp+8]
    mov rtsin, [esp+4]
%endif
    neg r0
    mov r1, -16
    sub r1, r0
    POSROTATESHUF r0, r1, r6, rtcos, rtsin
%ifdef ARCH_X86_64
    pop r14
    pop r13
    pop r12
%else
    add esp, 12
%endif
    RET