You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

626 lines
15KB

  1. ;*****************************************************************************
  2. ;* x86inc.asm
  3. ;*****************************************************************************
  4. ;* Copyright (C) 2005-2008 Loren Merritt <lorenm@u.washington.edu>
  5. ;*
  6. ;* This file is part of FFmpeg.
  7. ;*
  8. ;* FFmpeg is free software; you can redistribute it and/or
  9. ;* modify it under the terms of the GNU Lesser General Public
  10. ;* License as published by the Free Software Foundation; either
  11. ;* version 2.1 of the License, or (at your option) any later version.
  12. ;*
  13. ;* FFmpeg is distributed in the hope that it will be useful,
  14. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16. ;* Lesser General Public License for more details.
  17. ;*
  18. ;* You should have received a copy of the GNU Lesser General Public
  19. ;* License along with FFmpeg; if not, write to the Free Software
  20. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21. ;*****************************************************************************
  ; Pick the 64-bit calling convention. Only evaluated when ARCH_X86_64 is
  ; defined; otherwise neither WIN64 nor UNIX64 is defined.
  %ifdef ARCH_X86_64
      %ifnidn __OUTPUT_FORMAT__,win32
          %define UNIX64              ; every other output format
      %else
          %define WIN64               ; output format is win32
      %endif
  %endif
  29. ; FIXME: All of the 64bit asm functions that take a stride as an argument
  30. ; via register, assume that the high dword of that register is filled with 0.
  31. ; This is true in practice (since we never do any 64bit arithmetic on strides,
  32. ; and x264's strides are all positive), but is not guaranteed by the ABI.
  ; SECTION_RODATA [alignment]
  ; Switches to the section used for read-only data. The optional argument is
  ; the section alignment and defaults to 16.
  ; Kludge: Something on OS X fails to align .rodata even given an align
  ; attribute, so both Mach-O output formats use .text instead.
  %macro SECTION_RODATA 0-1 16
      %ifidn __OUTPUT_FORMAT__,macho
          SECTION .text align=%1
          fakegot:                    ; label emitted for 32-bit Mach-O only
      %elifidn __OUTPUT_FORMAT__,macho64
          SECTION .text align=%1
      %else
          SECTION .rodata align=%1
      %endif
  %endmacro
  46. ; PIC support macros.
  47. ; x86_64 can't fit 64bit address literals in most instruction types,
  48. ; so shared objects (under the assumption that they might be anywhere
  49. ; in memory) must use an address mode that does fit.
  50. ; So all accesses to global variables must use this macro, e.g.
  51. ; mov eax, [foo GLOBAL]
  52. ; instead of
  53. ; mov eax, [foo]
  54. ;
  55. ; x86_32 doesn't require PIC.
  56. ; Some distros prefer shared objects to be PIC, but nothing breaks if
  57. ; the code contains a few textrels, so we'll skip that complexity.
  ; WIN64 always defines PIC, and non-x86_64 builds always undefine it.
  ; Any other x86_64 build keeps whatever PIC definition is already in effect.
  %ifdef WIN64
      %define PIC
  %elifndef ARCH_X86_64
      %undef PIC
  %endif
  ; GLOBAL goes inside the brackets of a global-variable access, e.g.
  ; "mov eax, [foo GLOBAL]". It is empty unless PIC is in effect.
  %ifndef PIC
      %define GLOBAL
  %else
      %define GLOBAL wrt rip
  %endif
  68. ; Macros to eliminate most code duplication between x86_32 and x86_64:
  69. ; Currently this works only for leaf functions which load all their arguments
  70. ; into registers at the start, and make no other use of the stack. Luckily that
  71. ; covers most of x264's asm.
  72. ; PROLOGUE:
  73. ; %1 = number of arguments. loads them from stack if needed.
  74. ; %2 = number of registers used. pushes callee-saved regs if needed.
  75. ; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
  76. ; %4 = list of names to define to registers
  77. ; PROLOGUE can also be invoked by adding the same options to cglobal
  78. ; e.g.
  79. ; cglobal foo, 2,3,0, dst, src, tmp
  80. ; declares a function (foo), taking two args (dst and src) and one local variable (tmp)
  81. ; TODO Some functions can use some args directly from the stack. If they're the
  82. ; last args then you can just not declare them, but if they're in the middle
  83. ; we need a more flexible macro.
  84. ; RET:
  85. ; Pops anything that was pushed by PROLOGUE
  86. ; REP_RET:
  87. ; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons
  88. ; which are slow when a normal ret follows a branch.
  89. ; registers:
  90. ; rN and rNq are the native-size register holding function argument N
  91. ; rNd, rNw, rNb are dword, word, and byte size
  92. ; rNm is the original location of arg N (a register or on the stack), dword
  93. ; rNmp is native size
  ; DECLARE_REG n, native, dword, word, byte, home
  ; Creates the r<n> name family for argument slot n:
  ;   r<n>, r<n>q -> %2      r<n>d -> %3      r<n>w -> %4      r<n>b -> %5
  ;   r<n>m       -> %6, the original location of the argument
  ;   r<n>mp      -> that location at native size: the native register when
  ;                  %6 is an identifier, otherwise %6 with a qword (x86_64)
  ;                  or dword size override
  %macro DECLARE_REG 6
      %define r%1  %2
      %define r%1q %2
      %define r%1d %3
      %define r%1w %4
      %define r%1b %5
      %define r%1m %6
      %ifnid %6                       ; not an identifier, i.e. a memory operand
          %ifdef ARCH_X86_64
              %define r%1mp qword %6
          %else
              %define r%1mp dword %6
          %endif
      %else                           ; i.e. it's a register
          %define r%1mp %2
      %endif
  %endmacro
  ; DECLARE_REG_SIZE name16, name8
  ; For a legacy register given by its 16-bit name (%1) and 8-bit name (%2),
  ; defines size-suffixed aliases on both the r<name16> and e<name16> spellings:
  ;   ...q -> r<name16>   ...d -> e<name16>   ...w -> <name16>   ...b -> <name8>
  ; Without ARCH_X86_64, r<name16> itself is also mapped to e<name16>.
  %macro DECLARE_REG_SIZE 2
      ; aliases on the r-prefixed spelling
      %define r%1q r%1
      %define r%1d e%1
      %define r%1w %1
      %define r%1b %2
      ; the same aliases on the e-prefixed spelling
      %define e%1q r%1
      %define e%1d e%1
      %define e%1w %1
      %define e%1b %2
      %ifndef ARCH_X86_64
          %define r%1 e%1
      %endif
  %endmacro

  DECLARE_REG_SIZE ax, al
  DECLARE_REG_SIZE bx, bl
  DECLARE_REG_SIZE cx, cl
  DECLARE_REG_SIZE dx, dl
  DECLARE_REG_SIZE si, sil
  DECLARE_REG_SIZE di, dil
  DECLARE_REG_SIZE bp, bpl
  ; t# defines for when per-arch register allocation is more complex than just
  ; function arguments.
  ; DECLARE_REG_TMP a, b, c, ...
  ; Defines t0 as r<a>, t1 as r<b>, t2 as r<c>, and so on, in argument order.
  %macro DECLARE_REG_TMP 1-*
      %assign %%idx 0
      %rep %0
          CAT_XDEFINE t, %%idx, r%1
          %rotate 1                   ; bring the next register number into %1
          %assign %%idx %%idx+1
      %endrep
  %endmacro
  ; DECLARE_REG_TMP_SIZE n, ...
  ; For each listed n, defines t<n>q, t<n>d, t<n>w and t<n>b as the current
  ; expansion of t<n> with the matching size suffix pasted on. %define is
  ; used, so t<n> is looked up when the alias is used, not here.
  %macro DECLARE_REG_TMP_SIZE 0-*
      %rep %0
          %define t%1b t%1 %+ b
          %define t%1w t%1 %+ w
          %define t%1d t%1 %+ d
          %define t%1q t%1 %+ q
          %rotate 1
      %endrep
  %endmacro

  DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7
  ; gprsize: the number of bytes PUSH and POP add to / remove from stack_offset.
  %ifndef ARCH_X86_64
      %define gprsize 4
  %else
      %define gprsize 8
  %endif
  ; PUSH operand
  ; Emits push and then grows stack_offset by gprsize. The push comes first so
  ; that an operand written in terms of stack_offset sees the old value.
  %macro PUSH 1
      push %1
      %assign stack_offset gprsize+stack_offset
  %endmacro
  ; POP operand
  ; Emits pop and then shrinks stack_offset by gprsize (the inverse of PUSH).
  %macro POP 1
      pop %1
      %assign stack_offset (stack_offset)-gprsize
  %endmacro
  ; SUB dst, amount
  ; Emits sub. When dst is rsp, also adds the amount to stack_offset.
  %macro SUB 2
      sub %1, %2
      %ifidn %1, rsp
          %assign stack_offset (%2)+stack_offset
      %endif
  %endmacro
  ; ADD dst, amount
  ; Emits add. When dst is rsp, also subtracts the amount from stack_offset
  ; (the inverse of SUB).
  %macro ADD 2
      add %1, %2
      %ifidn %1, rsp
          %assign stack_offset (stack_offset)-(%2)
      %endif
  %endmacro
  ; movifnidn dst, src
  ; Emits "mov dst, src" unless both operands expand to identical text.
  %macro movifnidn 2
      %ifidn %1, %2
          ; identical operands: nothing to emit
      %else
          mov %1, %2
      %endif
  %endmacro
  ; movsxdifnidn dst, src
  ; Emits "movsxd dst, src" unless both operands expand to identical text.
  %macro movsxdifnidn 2
      %ifidn %1, %2
          ; identical operands: nothing to emit
      %else
          movsxd %1, %2
      %endif
  %endmacro
  ; ASSERT expr
  ; Raises an assemble-time error when the expression evaluates to zero.
  %macro ASSERT 1
      %if 0 == (%1)
          %error assert failed
      %endif
  %endmacro
  ; DEFINE_ARGS name0, name1, ...
  ; Gives symbolic names to the r# registers: <name_i>q/d/w/b/m become aliases
  ; for r<i>q/d/w/b/m. The names are remembered in arg_name<i> and their count
  ; in n_arg_names, so the next invocation can remove the previous aliases
  ; before installing the new ones.
  %macro DEFINE_ARGS 0-*
      ; Step 1: remove the aliases left by the previous invocation, if any.
      %ifdef n_arg_names
          %assign %%k 0
          %rep n_arg_names
              CAT_UNDEF arg_name %+ %%k, q
              CAT_UNDEF arg_name %+ %%k, d
              CAT_UNDEF arg_name %+ %%k, w
              CAT_UNDEF arg_name %+ %%k, b
              CAT_UNDEF arg_name %+ %%k, m
              CAT_UNDEF arg_name, %%k     ; last: the lines above still need arg_name<k>
              %assign %%k %%k+1
          %endrep
      %endif

      ; Step 2: install the new aliases and record each name.
      %assign %%k 0
      %rep %0
          %xdefine %1q r %+ %%k %+ q
          %xdefine %1d r %+ %%k %+ d
          %xdefine %1w r %+ %%k %+ w
          %xdefine %1b r %+ %%k %+ b
          %xdefine %1m r %+ %%k %+ m
          CAT_XDEFINE arg_name, %%k, %1
          %assign %%k %%k+1
          %rotate 1
      %endrep
      %assign n_arg_names %%k
  %endmacro
  %ifdef WIN64 ; Windows x64 ;=================================================

  ; Arguments 0-3 are declared with register homes (rcx, rdx, r8, r9).
  ; Arguments 4-6 are declared with stack homes and are loaded into
  ; rdi, rsi and rax by LOAD_IF_USED.
  DECLARE_REG 0, rcx, ecx, cx,  cl,  ecx
  DECLARE_REG 1, rdx, edx, dx,  dl,  edx
  DECLARE_REG 2, r8,  r8d, r8w, r8b, r8d
  DECLARE_REG 3, r9,  r9d, r9w, r9b, r9d
  DECLARE_REG 4, rdi, edi, di,  dil, [rsp + stack_offset + 40]
  DECLARE_REG 5, rsi, esi, si,  sil, [rsp + stack_offset + 48]
  DECLARE_REG 6, rax, eax, ax,  al,  [rsp + stack_offset + 56]
  ; Arguments 7 and 8 get a memory name only; no register is assigned.
  %define r7m [rsp + stack_offset + 64]
  %define r8m [rsp + stack_offset + 72]
  ; LOAD_IF_USED reg_id, number_of_args   (WIN64)
  ; Loads argument reg_id from its stack slot into r<reg_id> when the function
  ; takes more than reg_id arguments.
  %macro LOAD_IF_USED 2 ; reg_id, number_of_args
      %if %2 > %1
          mov r%1, [rsp + stack_offset + 8 + %1*8]
      %endif
  %endmacro
  ; PROLOGUE #args, #regs [, #xmm_regs [, arg_names...]]   (WIN64)
  ;   %1 = number of arguments; arguments 4-6 are loaded from the stack.
  ;   %2 = number of general registers used; r4 and r5 (rdi, rsi) are pushed
  ;        when more than 4 are used.
  ;   %3 = number of xmm registers used; xmm6 and above are saved to the stack.
  ;   %4 = names passed on to DEFINE_ARGS.
  ; Sets regs_used and xmm_regs_used and keeps stack_offset in step with every
  ; stack adjustment it emits.
  %macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
      ASSERT %2 >= %1
      %assign regs_used %2
      ASSERT regs_used <= 7
      %assign xmm_regs_used 0
      %if %0 > 2
          %assign xmm_regs_used %3
      %endif
      ASSERT xmm_regs_used <= 16
      %if regs_used > 4
          push r4
          push r5
          %assign stack_offset stack_offset+16
      %endif
      %if xmm_regs_used > 6
          ; One 16-byte slot per saved register plus 16 spare bytes. The save
          ; area starts at rsp+8, presumably so the movdqa stores are aligned.
          sub rsp, (xmm_regs_used-6)*16+16
          %assign stack_offset stack_offset+(xmm_regs_used-6)*16+16
          %assign %%x xmm_regs_used
          %rep (xmm_regs_used-6)
              %assign %%x %%x-1
              movdqa [rsp + (%%x-6)*16+8], xmm %+ %%x
          %endrep
      %endif
      LOAD_IF_USED 4, %1
      LOAD_IF_USED 5, %1
      LOAD_IF_USED 6, %1
      DEFINE_ARGS %4
  %endmacro
  ; RESTORE_XMM_INTERNAL base
  ; When more than 6 xmm registers are in use, reloads xmm6 and above from the
  ; save area addressed through base, then adds the size of that area to base.
  ; Does not change stack_offset or xmm_regs_used.
  %macro RESTORE_XMM_INTERNAL 1
      %if xmm_regs_used > 6
          %assign %%x xmm_regs_used
          %rep (xmm_regs_used-6)
              %assign %%x %%x-1
              movdqa xmm %+ %%x, [%1 + (%%x-6)*16+8]
          %endrep
          add %1, (xmm_regs_used-6)*16+16
      %endif
  %endmacro
  ; RESTORE_XMM base
  ; Restores the saved xmm registers through RESTORE_XMM_INTERNAL and then
  ; updates the bookkeeping: stack_offset drops by exactly the amount PROLOGUE
  ; added for the xmm save area, and xmm_regs_used is reset to 0 so a later
  ; RET does not restore again.
  ; stack_offset is only adjusted when more than 6 xmm registers are in use,
  ; which is the only case in which PROLOGUE allocated the area.
  %macro RESTORE_XMM 1
      RESTORE_XMM_INTERNAL %1
      %if xmm_regs_used > 6
          ; The parentheses matter: the whole (n-6)*16+16 must be subtracted.
          %assign stack_offset stack_offset-((xmm_regs_used-6)*16+16)
      %endif
      %assign xmm_regs_used 0
  %endmacro
  ; RET   (WIN64)
  ; Undoes PROLOGUE: restores the saved xmm registers, pops r5 and r4 if they
  ; were pushed, then returns. stack_offset is left unchanged.
  %macro RET 0
      RESTORE_XMM_INTERNAL rsp
      %if regs_used > 4
          pop r5
          pop r4
      %endif
      ret
  %endmacro
  ; REP_RET   (WIN64)
  ; Emits the 2-byte "rep ret" when PROLOGUE saved nothing. Otherwise falls
  ; back to RET, which restores what was saved.
  %macro REP_RET 0
      %if regs_used <= 4 && xmm_regs_used <= 6
          rep ret
      %else
          RET
      %endif
  %endmacro
  %elifdef ARCH_X86_64 ; *nix x64 ;=============================================

  ; Arguments 0-5 are declared with register homes (rdi, rsi, rdx, rcx, r8,
  ; r9). Argument 6 has a stack home and is loaded into rax by LOAD_IF_USED.
  DECLARE_REG 0, rdi, edi, di,  dil, edi
  DECLARE_REG 1, rsi, esi, si,  sil, esi
  DECLARE_REG 2, rdx, edx, dx,  dl,  edx
  DECLARE_REG 3, rcx, ecx, cx,  cl,  ecx
  DECLARE_REG 4, r8,  r8d, r8w, r8b, r8d
  DECLARE_REG 5, r9,  r9d, r9w, r9b, r9d
  DECLARE_REG 6, rax, eax, ax,  al,  [rsp + stack_offset + 8]
  ; Arguments 7 and 8 get a memory name only; no register is assigned.
  %define r7m [rsp + stack_offset + 16]
  %define r8m [rsp + stack_offset + 24]
  ; LOAD_IF_USED reg_id, number_of_args   (UNIX64)
  ; Loads argument reg_id from its stack slot into r<reg_id> when the function
  ; takes more than reg_id arguments. For reg_id 6 the address is the same as
  ; r6m, [rsp + stack_offset + 8].
  ; stack_offset is part of the address, as in the WIN64 and x86_32 variants,
  ; so the load stays correct if PROLOGUE is expanded after the stack pointer
  ; was moved with PUSH/SUB. With stack_offset == 0, which is what cglobal
  ; sets, the emitted instruction is unchanged.
  %macro LOAD_IF_USED 2 ; reg_id, number_of_args
      %if %1 < %2
          mov r%1, [rsp + stack_offset - 40 + %1*8]
      %endif
  %endmacro
  ; PROLOGUE #args, #regs [, #xmm_regs [, arg_names...]]   (UNIX64)
  ; Nothing is pushed or saved. Only argument 6 may need loading from the
  ; stack. %3 is accepted but not used in this variant.
  %macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
      ASSERT %2 >= %1
      ASSERT %2 <= 7
      LOAD_IF_USED 6, %1
      DEFINE_ARGS %4
  %endmacro
  ; RET   (UNIX64): PROLOGUE pushes nothing here, so this is a plain return.
  %macro RET 0
      ret
  %endmacro

  ; REP_RET   (UNIX64): always the 2-byte form, since RET never pops anything.
  %macro REP_RET 0
      rep ret
  %endmacro
  %else ; X86_32 ;==============================================================

  ; Every argument has a stack home. Arguments 0-6 are loaded into eax, ecx,
  ; edx, ebx, esi, edi and ebp by LOAD_IF_USED.
  ; The byte-size name of r4-r6 is the placeholder "null".
  DECLARE_REG 0, eax, eax, ax, al,   [esp + stack_offset + 4]
  DECLARE_REG 1, ecx, ecx, cx, cl,   [esp + stack_offset + 8]
  DECLARE_REG 2, edx, edx, dx, dl,   [esp + stack_offset + 12]
  DECLARE_REG 3, ebx, ebx, bx, bl,   [esp + stack_offset + 16]
  DECLARE_REG 4, esi, esi, si, null, [esp + stack_offset + 20]
  DECLARE_REG 5, edi, edi, di, null, [esp + stack_offset + 24]
  DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
  ; Arguments 7 and 8 get a memory name only; no register is assigned.
  %define r7m [esp + stack_offset + 32]
  %define r8m [esp + stack_offset + 36]
  ; Lets code written in terms of rsp assemble on 32-bit.
  %define rsp esp
  ; PUSH_IF_USED reg_id   (x86_32)
  ; Pushes r<reg_id> and adds 4 to stack_offset when reg_id is below regs_used.
  %macro PUSH_IF_USED 1 ; reg_id
      %if regs_used > %1
          push r%1
          %assign stack_offset 4+stack_offset
      %endif
  %endmacro
  ; POP_IF_USED reg_id   (x86_32)
  ; Pops r<reg_id> when reg_id is below regs_used. stack_offset is not changed.
  %macro POP_IF_USED 1 ; reg_id
      %if regs_used > %1
          pop r%1
      %endif
  %endmacro
  ; LOAD_IF_USED reg_id, number_of_args   (x86_32)
  ; Loads argument reg_id from its stack slot into r<reg_id> when the function
  ; takes more than reg_id arguments.
  %macro LOAD_IF_USED 2 ; reg_id, number_of_args
      %if %2 > %1
          mov r%1, [esp + stack_offset + 4 + %1*4]
      %endif
  %endmacro
  ; PROLOGUE #args, #regs [, #xmm_regs [, arg_names...]]   (x86_32)
  ;   %1 = number of arguments; each used one is loaded from the stack.
  ;   %2 = number of registers used; r3-r6 (ebx, esi, edi, ebp) are pushed
  ;        as needed.
  ;   %3 = accepted but not used in this variant.
  ;   %4 = names passed on to DEFINE_ARGS.
  ; The pushes come first so the loads see the updated stack_offset.
  %macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs (unused), arg_names...
      ASSERT %2 >= %1
      %assign regs_used %2
      ASSERT regs_used <= 7

      PUSH_IF_USED 3
      PUSH_IF_USED 4
      PUSH_IF_USED 5
      PUSH_IF_USED 6

      LOAD_IF_USED 0, %1
      LOAD_IF_USED 1, %1
      LOAD_IF_USED 2, %1
      LOAD_IF_USED 3, %1
      LOAD_IF_USED 4, %1
      LOAD_IF_USED 5, %1
      LOAD_IF_USED 6, %1

      DEFINE_ARGS %4
  %endmacro
  ; RET   (x86_32)
  ; Pops whichever of r6..r3 PROLOGUE pushed, in reverse push order, then
  ; returns.
  %macro RET 0
      POP_IF_USED 6
      POP_IF_USED 5
      POP_IF_USED 4
      POP_IF_USED 3
      ret
  %endmacro
  ; REP_RET   (x86_32)
  ; Emits the 2-byte "rep ret" when nothing was pushed (regs_used <= 3).
  ; Otherwise falls back to RET.
  %macro REP_RET 0
      %if regs_used <= 3
          rep ret
      %else
          RET
      %endif
  %endmacro

  %endif ;======================================================================
  374. ;=============================================================================
  375. ; arch-independent part
  376. ;=============================================================================
  ; Alignment applied in front of every function label emitted by cglobal.
  %assign function_align 16

  ; cglobal name [, PROLOGUE arguments...]
  ; Declares and opens a function:
  ;   - redefines name to ff_<name>, with a further leading underscore when
  ;     PREFIX is defined, and defines <name>.skip_prologue to match;
  ;   - exports the symbol (as a hidden function for ELF output);
  ;   - emits the aligned label, resets the mm register permutation and sets
  ;     stack_offset to 0;
  ;   - expands PROLOGUE with the remaining arguments, if any were given.
  %macro cglobal 1-2+
      ; Symbol prefix for C linkage
      %xdefine %1 ff_%1
      %ifdef PREFIX
          %xdefine %1 _ %+ %1
      %endif
      %xdefine %1.skip_prologue %1 %+ .skip_prologue
      %ifnidn __OUTPUT_FORMAT__,elf
          global %1
      %else
          global %1:function hidden
      %endif
      align function_align
      %1:
      %assign stack_offset 0
      RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
      %if %0 > 1
          PROLOGUE %2
      %endif
  %endmacro
  ; cextern name
  ; Declares an external symbol. When PREFIX is defined, name is first
  ; redefined with a leading underscore.
  %macro cextern 1
      %ifdef PREFIX
          %xdefine %1 _%1
      %endif
      extern %1
  %endmacro
  ; This is needed for ELF, otherwise the GNU linker assumes the stack is
  ; executable by default.
  %ifidn __OUTPUT_FORMAT__,elf
      SECTION .note.GNU-stack noalloc noexec nowrite progbits
  %endif

  ; Stride constants. Nothing in this file reads them.
  %assign FDEC_STRIDE 32
  %assign FENC_STRIDE 16
  ; merge mmx and sse*

  ; CAT_XDEFINE prefix, suffix, value
  ; %xdefine on a name pasted together from prefix and suffix. Going through
  ; macro arguments lets the pieces be expanded before they are joined.
  %macro CAT_XDEFINE 3
      %xdefine %1%2 %3
  %endmacro

  ; CAT_UNDEF prefix, suffix
  ; %undef on a name pasted together the same way.
  %macro CAT_UNDEF 2
      %undef %1%2
  %endmacro
  ; INIT_MMX
  ; Selects the MMX register set for the abstract names:
  ;   mmsize = 8, num_mmregs = 8, mova/movu -> movq, movh -> movd,
  ;   movnt -> movntq, m0..m7 -> mm0..mm7, nmm0..nmm7 -> 0..7.
  ; Also undefines m8..m15 and nmm8..nmm15, and makes RESET_MM_PERMUTATION
  ; re-run this macro.
  %macro INIT_MMX 0
      %define RESET_MM_PERMUTATION INIT_MMX
      %define num_mmregs 8
      %define mmsize 8
      %define movnt movntq
      %define movh movd
      %define movu movq
      %define mova movq
      %assign %%r 0
      %rep 8
          CAT_XDEFINE m, %%r, mm %+ %%r
          CAT_XDEFINE nmm, %%r, %%r
          %assign %%r %%r+1
      %endrep
      ; %%r continues from 8 here, so this loop covers indices 8..15.
      %rep 8
          CAT_UNDEF m, %%r
          CAT_UNDEF nmm, %%r
          %assign %%r %%r+1
      %endrep
  %endmacro
  ; INIT_XMM
  ; Selects the SSE register set for the abstract names:
  ;   mmsize = 16, num_mmregs = 16 with ARCH_X86_64 and 8 otherwise,
  ;   mova -> movdqa, movu -> movdqu, movh -> movq, movnt -> movntdq,
  ;   m<i> -> xmm<i> and nxmm<i> -> i for every i below num_mmregs.
  ; Also makes RESET_MM_PERMUTATION re-run this macro.
  %macro INIT_XMM 0
      %define RESET_MM_PERMUTATION INIT_XMM
      %define mmsize 16
      %ifdef ARCH_X86_64
          %define num_mmregs 16
      %else
          %define num_mmregs 8
      %endif
      %define movnt movntdq
      %define movh movq
      %define movu movdqu
      %define mova movdqa
      %assign %%r 0
      %rep num_mmregs
          CAT_XDEFINE m, %%r, xmm %+ %%r
          CAT_XDEFINE nxmm, %%r, %%r
          %assign %%r %%r+1
      %endrep
  %endmacro

  ; MMX is the register set in effect until a file selects another one.
  INIT_MMX
  457. ; I often want to use macros that permute their arguments. e.g. there's no
  458. ; efficient way to implement butterfly or transpose or dct without swapping some
  459. ; arguments.
  460. ;
  461. ; I would like to not have to manually keep track of the permutations:
  462. ; If I insert a permutation in the middle of a function, it should automatically
  463. ; change everything that follows. For more complex macros I may also have multiple
  464. ; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
  465. ;
  466. ; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
  467. ; permutes its arguments. It's equivalent to exchanging the contents of the
  468. ; registers, except that this way you exchange the register names instead, so it
  469. ; doesn't cost any cycles.
  ; PERMUTE a0, b0, a1, b1, ...   ; takes a list of pairs to swap
  ; For every pair (a, b), m<a> takes the value m<b> had before the macro
  ; started. The nm<a> name is updated from nm<b> in the same way.
  ; Two passes are used so that every source is captured before any
  ; destination is overwritten.
  %macro PERMUTE 2-*
      ; Pass 1: snapshot the source of every pair.
      %rep %0/2
          %xdefine tmp%2 m%2
          %xdefine ntmp%2 nm%2
          %rotate 2
      %endrep
      ; Pass 2: write the destinations and drop the snapshots.
      %rep %0/2
          %xdefine m%1 tmp%2
          %xdefine nm%1 ntmp%2
          %undef tmp%2
          %undef ntmp%2
          %rotate 2
      %endrep
  %endmacro
  ; SWAP a, b [, c ...]   ; swaps a single chain (sometimes more concise than pairs)
  ; Exchanges the definitions of adjacent arguments, one pair per iteration
  ; (%0-1 iterations in total), and refreshes the n<register> reverse names
  ; for the pair.
  %macro SWAP 2-*
      %rep %0-1
          %ifdef m%1
              ; Numeric form, e.g. "SWAP 0,1".
              %xdefine tmp m%1
              %xdefine m%1 m%2
              %xdefine m%2 tmp
              CAT_XDEFINE n, m%1, %1
              CAT_XDEFINE n, m%2, %2
          %else
              ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here.
              ; Be careful using this mode in nested macros though, as in some cases there may be
              ; other copies of m# that have already been dereferenced and don't get updated correctly.
              %xdefine %%n1 n %+ %1
              %xdefine %%n2 n %+ %2
              %xdefine tmp m %+ %%n1
              CAT_XDEFINE m, %%n1, m %+ %%n2
              CAT_XDEFINE m, %%n2, tmp
              CAT_XDEFINE n, m %+ %%n1, %%n1
              CAT_XDEFINE n, m %+ %%n2, %%n2
          %endif
          %undef tmp
          %rotate 1
      %endrep
  %endmacro
  ; SAVE_MM_PERMUTATION name
  ; Records the current m0..m<num_mmregs-1> mapping as <name>_m0, <name>_m1, ...
  %macro SAVE_MM_PERMUTATION 1
      %assign %%r 0
      %rep num_mmregs
          CAT_XDEFINE %1_m, %%r, m %+ %%r
          %assign %%r %%r+1
      %endrep
  %endmacro
  ; LOAD_MM_PERMUTATION name
  ; Restores the m# mapping recorded by SAVE_MM_PERMUTATION under the same
  ; name and rebuilds the matching n<register> reverse names.
  %macro LOAD_MM_PERMUTATION 1
      %assign %%r 0
      %rep num_mmregs
          CAT_XDEFINE m, %%r, %1_m %+ %%r
          CAT_XDEFINE n, m %+ %%r, %%r
          %assign %%r %%r+1
      %endrep
  %endmacro
  ; call target
  ; Emits the call. If a register permutation was saved under the target's
  ; name (<target>_m0 is defined), that permutation is loaded afterwards.
  %macro call 1
      call %1
      %ifdef %1_m0
          LOAD_MM_PERMUTATION %1
      %endif
  %endmacro
  ;Substitutions that reduce instruction size but are functionally equivalent

  ; add dst, src
  ; Emits "sub dst, -128" when src is the numeric constant 128. Every other
  ; operand, numeric or not, is emitted as a plain add.
  %macro add 2
      %ifnum %2
          %if %2 != 128
              add %1, %2
          %else
              sub %1, -128
          %endif
      %else
          add %1, %2
      %endif
  %endmacro
  ; sub dst, src
  ; Emits "add dst, -128" when src is the numeric constant 128. Every other
  ; operand, numeric or not, is emitted as a plain sub.
  %macro sub 2
      %ifnum %2
          %if %2 != 128
              sub %1, %2
          %else
              add %1, -128
          %endif
      %else
          sub %1, %2
      %endif
  %endmacro
  551. %endmacro