You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

626 lines
15KB

  1. ;*****************************************************************************
  2. ;* x86inc.asm
  3. ;*****************************************************************************
  4. ;* Copyright (C) 2005-2008 Loren Merritt <lorenm@u.washington.edu>
  5. ;*
  6. ;* This file is part of FFmpeg.
  7. ;*
  8. ;* FFmpeg is free software; you can redistribute it and/or
  9. ;* modify it under the terms of the GNU Lesser General Public
  10. ;* License as published by the Free Software Foundation; either
  11. ;* version 2.1 of the License, or (at your option) any later version.
  12. ;*
  13. ;* FFmpeg is distributed in the hope that it will be useful,
  14. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16. ;* Lesser General Public License for more details.
  17. ;*
  18. ;* You should have received a copy of the GNU Lesser General Public
  19. ;* License along with FFmpeg; if not, write to the Free Software
  20. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21. ;*****************************************************************************
  ; Pick the 64-bit calling convention from the object format: exactly one of
  ; WIN64 / UNIX64 is defined on x86_64, neither on x86_32.
  ; NOTE(review): the Windows test compares against "win32" even though this is
  ; the 64-bit branch -- confirm that is what the assembler reports for x64.
  %ifdef ARCH_X86_64
      %ifnidn __OUTPUT_FORMAT__,win32
          %define UNIX64
      %else
          %define WIN64
      %endif
  %endif
  29. ; FIXME: All of the 64bit asm functions that take a stride as an argument
  30. ; via register, assume that the high dword of that register is filled with 0.
  31. ; This is true in practice (since we never do any 64bit arithmetic on strides,
  32. ; and x264's strides are all positive), but is not guaranteed by the ABI.
  ; SECTION_RODATA [alignment]
  ; Switch to the section used for read-only data; alignment defaults to 16.
  ; Kludge: Something on OS X fails to align .rodata even given an align
  ; attribute, so both Mach-O formats place the constants in .text instead.
  %macro SECTION_RODATA 0-1 16
      %ifidn __OUTPUT_FORMAT__,macho
          SECTION .text align=%1
          fakegot:                ; label emitted for 32-bit Mach-O only
      %elifidn __OUTPUT_FORMAT__,macho64
          SECTION .text align=%1
      %else
          SECTION .rodata align=%1
      %endif
  %endmacro
  46. ; PIC support macros.
  47. ; x86_64 can't fit 64bit address literals in most instruction types,
  48. ; so shared objects (under the assumption that they might be anywhere
  49. ; in memory) must use an address mode that does fit.
  50. ; So all accesses to global variables must use this macro, e.g.
  51. ; mov eax, [foo GLOBAL]
  52. ; instead of
  53. ; mov eax, [foo]
  54. ;
  55. ; x86_32 doesn't require PIC.
  56. ; Some distros prefer shared objects to be PIC, but nothing breaks if
  57. ; the code contains a few textrels, so we'll skip that complexity.
  ; PIC policy: forced on for Win64, forced off for x86_32, and left as supplied
  ; by the build for every other 64-bit target.
  %ifdef WIN64
      %define PIC
  %elifndef ARCH_X86_64
      %undef PIC
  %endif

  ; GLOBAL is appended inside a memory operand ("[foo GLOBAL]"); it expands to
  ; "wrt rip" under PIC and to nothing otherwise.
  %ifndef PIC
      %define GLOBAL
  %else
      %define GLOBAL wrt rip
  %endif
  68. ; Macros to eliminate most code duplication between x86_32 and x86_64:
  69. ; Currently this works only for leaf functions which load all their arguments
  70. ; into registers at the start, and make no other use of the stack. Luckily that
  71. ; covers most of x264's asm.
  72. ; PROLOGUE:
  73. ; %1 = number of arguments. loads them from stack if needed.
  74. ; %2 = number of registers used. pushes callee-saved regs if needed.
  75. ; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
  76. ; %4 = list of names to define to registers
  77. ; PROLOGUE can also be invoked by adding the same options to cglobal
  78. ; e.g.
  79. ; cglobal foo, 2,3,0, dst, src, tmp
  80. ; declares a function (foo), taking two args (dst and src) and one local variable (tmp)
  81. ; TODO Some functions can use some args directly from the stack. If they're the
  82. ; last args then you can just not declare them, but if they're in the middle
  83. ; we need more flexible macro.
  84. ; RET:
  85. ; Pops anything that was pushed by PROLOGUE
  86. ; REP_RET:
  87. ; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons
  88. ; which are slow when a normal ret follows a branch.
  89. ; registers:
  90. ; rN and rNq are the native-size register holding function argument N
  91. ; rNd, rNw, rNb are dword, word, and byte size
  92. ; rNm is the original location of arg N (a register or on the stack), dword
  93. ; rNmp is native size
  ; DECLARE_REG index, qword_reg, dword_reg, word_reg, byte_reg, home
  ; Defines the r<index>{q,d,w,b,m,mp} aliases plus the bare r<index>.
  ;   home - where argument <index> lives on entry: a register name or a
  ;          bracketed stack reference.
  ; r<index>mp is the native-size view of home: the qword register when home is
  ; a register, otherwise home with an explicit qword/dword size prefix.
  %macro DECLARE_REG 6
      %define r%1q %2
      %define r%1d %3
      %define r%1w %4
      %define r%1b %5
      %define r%1m %6
      %ifnid %6
          ; home is a memory operand
          %ifdef ARCH_X86_64
              %define r%1mp qword %6
          %else
              %define r%1mp dword %6
          %endif
      %else
          ; home is a register
          %define r%1mp %2
      %endif
      %define r%1 %2
  %endmacro
  ; DECLARE_REG_SIZE word_reg, byte_reg
  ; Lets a size suffix be glued onto a named register: for "ax" this defines
  ; raxq/raxd/raxw/raxb and eaxq/eaxd/eaxw/eaxb. On x86_32 the bare r-name
  ; (e.g. rax) is additionally mapped to the e-register.
  %macro DECLARE_REG_SIZE 2
      ; r-prefixed spellings
      %define r%1q r%1
      %define r%1d e%1
      %define r%1w %1
      %define r%1b %2
      ; e-prefixed spellings
      %define e%1q r%1
      %define e%1d e%1
      %define e%1w %1
      %define e%1b %2
      %ifndef ARCH_X86_64
          %define r%1 e%1
      %endif
  %endmacro

  DECLARE_REG_SIZE ax, al
  DECLARE_REG_SIZE bx, bl
  DECLARE_REG_SIZE cx, cl
  DECLARE_REG_SIZE dx, dl
  DECLARE_REG_SIZE si, sil
  DECLARE_REG_SIZE di, dil
  DECLARE_REG_SIZE bp, bpl
  ; t# defines for when per-arch register allocation is more complex than just
  ; function arguments.
  ; DECLARE_REG_TMP n0, n1, ...   =>   t0 = r<n0>, t1 = r<n1>, ...
  %macro DECLARE_REG_TMP 1-*
      %assign %%slot 0
      %rep %0
          CAT_XDEFINE t, %%slot, r%1
          %assign %%slot %%slot+1
          %rotate 1                ; next list entry becomes %1
      %endrep
  %endmacro
  ; DECLARE_REG_TMP_SIZE n0, n1, ...
  ; For every listed index n, defines t<n>q/d/w/b as "t<n>" pasted with the size
  ; suffix, so the sized name follows whatever t<n> currently stands for.
  %macro DECLARE_REG_TMP_SIZE 0-*
      %rep %0
          %define t%1b t%1 %+ b
          %define t%1w t%1 %+ w
          %define t%1d t%1 %+ d
          %define t%1q t%1 %+ q
          %rotate 1
      %endrep
  %endmacro

  DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7
  ; gprsize: size in bytes of a general-purpose register / stack slot.
  %ifndef ARCH_X86_64
      %define gprsize 4
  %else
      %define gprsize 8
  %endif
  ; PUSH operand
  ; push, then record the extra gprsize bytes in stack_offset so the
  ; stack-relative argument aliases defined in terms of it stay correct.
  ; The operand is expanded before stack_offset changes.
  %macro PUSH 1
      push %1
      %assign stack_offset gprsize+stack_offset
  %endmacro
  ; POP operand
  ; pop, then take gprsize bytes back off the tracked stack_offset.
  %macro POP 1
      pop %1
      %assign stack_offset (stack_offset)-gprsize
  %endmacro
  ; SUB dst, amount
  ; Emits sub; when dst is spelled rsp, stack_offset also grows by amount.
  ; The amount must then be something %assign can evaluate.
  %macro SUB 2
      sub %1, %2
      %ifnidn %1, rsp
      %else
          %assign stack_offset stack_offset+(%2)
      %endif
  %endmacro
  ; ADD dst, amount
  ; Emits add; when dst is spelled rsp, stack_offset also shrinks by amount.
  ; The amount must then be something %assign can evaluate.
  %macro ADD 2
      add %1, %2
      %ifnidn %1, rsp
      %else
          %assign stack_offset stack_offset-(%2)
      %endif
  %endmacro
  ; movifnidn dst, src
  ; mov dst, src -- but emits nothing when both operands are textually
  ; identical after macro expansion.
  %macro movifnidn 2
      %ifidn %1, %2
      %else
          mov %1, %2
      %endif
  %endmacro
  ; movsxdifnidn dst, src
  ; movsxd dst, src -- but emits nothing when both operands are textually
  ; identical after macro expansion.
  %macro movsxdifnidn 2
      %ifidn %1, %2
      %else
          movsxd %1, %2
      %endif
  %endmacro
  ; ASSERT expression
  ; Assembly-time check: raises "%error assert failed" when the expression
  ; evaluates to zero.
  %macro ASSERT 1
      %if 0 == (%1)
          %error assert failed
      %endif
  %endmacro
  ; DEFINE_ARGS name0, name1, ...
  ; Gives the numbered registers symbolic names: for the i-th name,
  ; <name>q/d/w/b become aliases of r<i>q/d/w/b.
  ; The names are recorded in arg_name<i> and their count in n_arg_names, which
  ; lets the next invocation undefine the previous set before installing its
  ; own.
  %macro DEFINE_ARGS 0-*
      %ifdef n_arg_names
          ; drop the aliases left by the previous invocation
          %assign %%slot 0
          %rep n_arg_names
              CAT_UNDEF arg_name %+ %%slot, q
              CAT_UNDEF arg_name %+ %%slot, d
              CAT_UNDEF arg_name %+ %%slot, w
              CAT_UNDEF arg_name %+ %%slot, b
              CAT_UNDEF arg_name, %%slot
              %assign %%slot %%slot+1
          %endrep
      %endif

      ; install the new aliases, one register index per name
      %assign %%slot 0
      %rep %0
          %xdefine %1q r %+ %%slot %+ q
          %xdefine %1d r %+ %%slot %+ d
          %xdefine %1w r %+ %%slot %+ w
          %xdefine %1b r %+ %%slot %+ b
          CAT_XDEFINE arg_name, %%slot, %1
          %assign %%slot %%slot+1
          %rotate 1
      %endrep
      %assign n_arg_names %%slot
  %endmacro
  212. %ifdef WIN64 ; Windows x64 ;=================================================
  ; Win64 register map.
  ; Arguments 0-3 arrive in rcx, rdx, r8, r9. Arguments 4 and up are read from
  ; the stack, starting at [rsp + stack_offset + 40] in 8-byte steps; 4-6 are
  ; loaded into rdi, rsi, rax, while 7 and 8 only get a memory alias.
  ;            idx  qword dword word byte  home
  DECLARE_REG  0,   rcx,  ecx,  cx,  cl,   ecx
  DECLARE_REG  1,   rdx,  edx,  dx,  dl,   edx
  DECLARE_REG  2,   r8,   r8d,  r8w, r8b,  r8d
  DECLARE_REG  3,   r9,   r9d,  r9w, r9b,  r9d
  DECLARE_REG  4,   rdi,  edi,  di,  dil,  [rsp + stack_offset + 40]
  DECLARE_REG  5,   rsi,  esi,  si,  sil,  [rsp + stack_offset + 48]
  DECLARE_REG  6,   rax,  eax,  ax,  al,   [rsp + stack_offset + 56]
  %define r7m [rsp + stack_offset + 64]
  %define r8m [rsp + stack_offset + 72]
  ; LOAD_IF_USED reg_id, number_of_args      (Win64)
  ; Loads stack argument reg_id into r<reg_id>, but only when the function
  ; actually takes more than reg_id arguments.
  %macro LOAD_IF_USED 2
      %if %2 > %1
          mov r%1, [rsp + stack_offset + 8 + %1*8]
      %endif
  %endmacro
  ; PROLOGUE #args, #regs [, #xmm_regs [, arg_names...]]      (Win64)
  ; Records regs_used / xmm_regs_used for RET, saves what the function will
  ; clobber, loads stack arguments 4-6 and names the registers.
  ;   - more than 4 GPRs: r4 and r5 (rdi, rsi) are pushed
  ;   - more than 6 xmm regs: xmm6 .. xmm<#xmm_regs - 1> are spilled to a
  ;     freshly reserved stack area of (#xmm_regs-6)*16+16 bytes
  ; stack_offset is kept in step with every adjustment.
  %macro PROLOGUE 2-4+
      ASSERT %2 >= %1
      %assign regs_used %2
      ASSERT regs_used <= 7
      %if %0 <= 2
          %assign xmm_regs_used 0
      %else
          %assign xmm_regs_used %3
      %endif
      ASSERT xmm_regs_used <= 16

      %if regs_used > 4
          push r4
          push r5
          %assign stack_offset stack_offset+16
      %endif

      %if xmm_regs_used > 6
          sub rsp, (xmm_regs_used-6)*16+16
          %assign stack_offset stack_offset+(xmm_regs_used-6)*16+16
          ; store from the highest-numbered register downwards
          ; NOTE(review): the +8 presumably keeps the slots 16-byte aligned
          ; for movdqa -- confirm against the entry alignment of rsp.
          %assign %%xreg xmm_regs_used
          %rep (xmm_regs_used-6)
              %assign %%xreg %%xreg-1
              movdqa [rsp + (%%xreg-6)*16+8], xmm %+ %%xreg
          %endrep
      %endif

      LOAD_IF_USED 4, %1
      LOAD_IF_USED 5, %1
      LOAD_IF_USED 6, %1
      DEFINE_ARGS %4
  %endmacro
  ; RESTORE_XMM_INTERNAL base
  ; Emits the code that undoes PROLOGUE's xmm spill: reloads xmm6 and up from
  ; [base + ...] and adds the size of the save area to base. Does nothing for
  ; 6 or fewer xmm registers. No assembly-time bookkeeping is touched.
  %macro RESTORE_XMM_INTERNAL 1
      %if xmm_regs_used > 6
          %assign %%xreg xmm_regs_used
          %rep (xmm_regs_used-6)
              %assign %%xreg %%xreg-1
              movdqa xmm %+ %%xreg, [%1 + (%%xreg-6)*16+8]
          %endrep
          add %1, (xmm_regs_used-6)*16+16
      %endif
  %endmacro
  ; RESTORE_XMM base
  ; Reloads the xmm registers spilled by PROLOGUE and releases their save area
  ; (see RESTORE_XMM_INTERNAL), then updates the assembly-time state to match:
  ; stack_offset shrinks by exactly the (n-6)*16+16 bytes PROLOGUE added, and
  ; xmm_regs_used is cleared so a later RET does not restore a second time.
  %macro RESTORE_XMM 1
      RESTORE_XMM_INTERNAL %1
      %if xmm_regs_used > 6
          ; Parenthesised so the whole area size is subtracted. Nothing was
          ; reserved for 6 or fewer registers, so nothing is released then.
          %assign stack_offset stack_offset-((xmm_regs_used-6)*16+16)
      %endif
      %assign xmm_regs_used 0
  %endmacro
  ; RET      (Win64)
  ; Function epilogue: restores the spilled xmm registers relative to rsp,
  ; pops r5 then r4 when PROLOGUE pushed them, and returns.
  %macro RET 0
      RESTORE_XMM_INTERNAL rsp
      %if regs_used > 4
          pop r5              ; reverse of the push order in PROLOGUE
          pop r4
      %endif
      ret
  %endmacro
  ; REP_RET      (Win64)
  ; Same as RET, except that when the epilogue has nothing to restore it
  ; becomes the 2-byte "rep ret".
  %macro REP_RET 0
      %if regs_used <= 4 && xmm_regs_used <= 6
          rep ret
      %else
          RET
      %endif
  %endmacro
  286. %elifdef ARCH_X86_64 ; *nix x64 ;=============================================
  ; Unix x86_64 register map.
  ; Arguments 0-5 arrive in rdi, rsi, rdx, rcx, r8, r9. Argument 6 is read from
  ; [rsp + stack_offset + 8] into rax; 7 and 8 only get a memory alias.
  ;            idx  qword dword word byte  home
  DECLARE_REG  0,   rdi,  edi,  di,  dil,  edi
  DECLARE_REG  1,   rsi,  esi,  si,  sil,  esi
  DECLARE_REG  2,   rdx,  edx,  dx,  dl,   edx
  DECLARE_REG  3,   rcx,  ecx,  cx,  cl,   ecx
  DECLARE_REG  4,   r8,   r8d,  r8w, r8b,  r8d
  DECLARE_REG  5,   r9,   r9d,  r9w, r9b,  r9d
  DECLARE_REG  6,   rax,  eax,  ax,  al,   [rsp + stack_offset + 8]
  %define r7m [rsp + stack_offset + 16]
  %define r8m [rsp + stack_offset + 24]
  ; LOAD_IF_USED reg_id, number_of_args      (Unix x86_64)
  ; Loads stack argument reg_id into r<reg_id> when the function takes more
  ; than reg_id arguments. The address does not include stack_offset; for
  ; reg_id 6 it works out to [rsp + 8].
  %macro LOAD_IF_USED 2
      %if %2 > %1
          mov r%1, [rsp - 40 + %1*8]
      %endif
  %endmacro
  ; PROLOGUE #args, #regs [, #xmm_regs [, arg_names...]]      (Unix x86_64)
  ; Nothing is pushed here: the macro validates the counts, loads argument 6
  ; from the stack when there is one, and names the registers. #xmm_regs is
  ; accepted but not used.
  %macro PROLOGUE 2-4+
      ASSERT %2 >= %1
      ASSERT %2 <= 7
      LOAD_IF_USED 6, %1
      DEFINE_ARGS %4
  %endmacro
  ; RET      (Unix x86_64)
  ; PROLOGUE pushes nothing on this target, so the epilogue is a bare ret.
  %macro RET 0
      ret
  %endmacro
  ; REP_RET      (Unix x86_64)
  ; There is never anything to pop here, so this is always the 2-byte
  ; "rep ret".
  %macro REP_RET 0
      rep ret
  %endmacro
  313. %else ; X86_32 ;==============================================================
  ; x86_32 register map.
  ; Every argument lives on the stack at [esp + stack_offset + 4 + 4*idx];
  ; arguments 0-6 are loaded into eax, ecx, edx, ebx, esi, edi, ebp.
  ; esi/edi/ebp are given the byte name "null".
  ; rsp is aliased to esp so shared code can spell the stack pointer one way.
  ;            idx  qword dword word byte  home
  DECLARE_REG  0,   eax,  eax,  ax,  al,   [esp + stack_offset + 4]
  DECLARE_REG  1,   ecx,  ecx,  cx,  cl,   [esp + stack_offset + 8]
  DECLARE_REG  2,   edx,  edx,  dx,  dl,   [esp + stack_offset + 12]
  DECLARE_REG  3,   ebx,  ebx,  bx,  bl,   [esp + stack_offset + 16]
  DECLARE_REG  4,   esi,  esi,  si,  null, [esp + stack_offset + 20]
  DECLARE_REG  5,   edi,  edi,  di,  null, [esp + stack_offset + 24]
  DECLARE_REG  6,   ebp,  ebp,  bp,  null, [esp + stack_offset + 28]
  %define r7m [esp + stack_offset + 32]
  %define r8m [esp + stack_offset + 36]
  %define rsp esp
  ; PUSH_IF_USED reg_id
  ; Saves r<reg_id> on the stack when the function uses more than reg_id
  ; registers, and records the 4 extra bytes in stack_offset.
  %macro PUSH_IF_USED 1
      %if regs_used > %1
          push r%1
          %assign stack_offset stack_offset+4
      %endif
  %endmacro
  ; POP_IF_USED reg_id
  ; Restores r<reg_id> when PUSH_IF_USED saved it. stack_offset is left
  ; untouched.
  ; NOTE(review): presumably so RET can appear more than once in a function
  ; -- confirm.
  %macro POP_IF_USED 1
      %if regs_used > %1
          pop r%1
      %endif
  %endmacro
  ; LOAD_IF_USED reg_id, number_of_args      (x86_32)
  ; Loads stack argument reg_id into r<reg_id> when the function takes more
  ; than reg_id arguments.
  %macro LOAD_IF_USED 2
      %if %2 > %1
          mov r%1, [esp + stack_offset + 4 + %1*4]
      %endif
  %endmacro
  ; PROLOGUE #args, #regs [, #xmm_regs [, arg_names...]]      (x86_32)
  ; Records regs_used for RET, pushes r3-r6 (ebx, esi, edi, ebp) as far as they
  ; are used, loads every declared argument from the stack and names the
  ; registers. #xmm_regs is accepted so the call has the same shape on every
  ; target, but it is not used here; the names are taken from %4.
  %macro PROLOGUE 2-4+
      ASSERT %2 >= %1
      %assign regs_used %2
      ASSERT regs_used <= 7

      ; saves come first: each push moves the arguments further from esp, and
      ; LOAD_IF_USED relies on the updated stack_offset
      PUSH_IF_USED 3
      PUSH_IF_USED 4
      PUSH_IF_USED 5
      PUSH_IF_USED 6

      LOAD_IF_USED 0, %1
      LOAD_IF_USED 1, %1
      LOAD_IF_USED 2, %1
      LOAD_IF_USED 3, %1
      LOAD_IF_USED 4, %1
      LOAD_IF_USED 5, %1
      LOAD_IF_USED 6, %1
      DEFINE_ARGS %4
  %endmacro
  ; RET      (x86_32)
  ; Function epilogue: pops whatever PROLOGUE pushed, in reverse order, then
  ; returns.
  %macro RET 0
      POP_IF_USED 6
      POP_IF_USED 5
      POP_IF_USED 4
      POP_IF_USED 3
      ret
  %endmacro
  ; REP_RET      (x86_32)
  ; Same as RET, except that with 3 or fewer registers in use (nothing was
  ; pushed) it becomes the 2-byte "rep ret".
  %macro REP_RET 0
      %if regs_used <= 3
          rep ret
      %else
          RET
      %endif
  %endmacro
  371. %endif ;======================================================================
  372. ;=============================================================================
  373. ; arch-independent part
  374. ;=============================================================================
  ; Alignment, in bytes, that cglobal applies to every function entry point.
  %assign function_align 16
  ; Symbol prefix for C linkage
  ; cglobal name [, PROLOGUE arguments...]
  ; Declares and opens a function:
  ;   - redefines `name` to ff_<name>, with a further leading underscore when
  ;     PREFIX is defined
  ;   - exports it (as a hidden function symbol for ELF output)
  ;   - aligns to function_align and emits the label
  ;   - resets the register permutation and stack_offset
  ;   - runs PROLOGUE when any arguments follow the name
  %macro cglobal 1-2+
      %xdefine %1 ff_%1
      %ifdef PREFIX
          %xdefine %1 _ %+ %1
      %endif
      %ifnidn __OUTPUT_FORMAT__,elf
          global %1
      %else
          global %1:function hidden
      %endif
      align function_align
      %1:
      RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
      %assign stack_offset 0
      %if %0 >= 2
          PROLOGUE %2
      %endif
  %endmacro
  ; cextern name
  ; Declares an external symbol. When PREFIX is defined, `name` is first
  ; redefined with a leading underscore. Unlike cglobal, no ff_ prefix is
  ; added.
  %macro cextern 1
      %ifdef PREFIX
          %xdefine %1 _%1
      %endif
      extern %1
  %endmacro
  ; This is needed for ELF, otherwise the GNU linker assumes the stack is
  ; executable by default. The section carries no data; it is only a marker.
  %ifidn __OUTPUT_FORMAT__,elf
      SECTION .note.GNU-stack noalloc noexec nowrite progbits
  %endif
  ; Fixed stride constants.
  ; NOTE(review): nothing in this file uses them; presumably consumed by the
  ; files that include it -- confirm.
  %assign FENC_STRIDE 16
  %assign FDEC_STRIDE 32
  ; merge mmx and sse*

  ; CAT_XDEFINE prefix, suffix, value
  ; %xdefine of the name formed by joining prefix and suffix. Going through a
  ; macro lets callers pass pieces that must be expanded (loop counters, other
  ; macros) before the name is built.
  %macro CAT_XDEFINE 3
      %xdefine %1%2 %3
  %endmacro
  ; CAT_UNDEF prefix, suffix
  ; %undef of the name formed by joining prefix and suffix (counterpart of
  ; CAT_XDEFINE).
  %macro CAT_UNDEF 2
      %undef %1%2
  %endmacro
  ; INIT_MMX
  ; Selects MMX as the current vector flavour:
  ;   - mmsize = 8, num_mmregs = 8
  ;   - mova/movu/movh/movnt map to movq/movq/movd/movntq
  ;   - m0..m7 name mm0..mm7, with nmm<i> = i as the reverse lookup
  ;   - m8..m15 and nmm8..nmm15 are undefined
  ; Also makes RESET_MM_PERMUTATION re-run this macro.
  %macro INIT_MMX 0
      %define RESET_MM_PERMUTATION INIT_MMX
      %define mmsize 8
      %define num_mmregs 8
      %define mova  movq
      %define movu  movq
      %define movh  movd
      %define movnt movntq
      %assign %%reg 0
      %rep 8
          CAT_XDEFINE m, %%reg, mm %+ %%reg
          CAT_XDEFINE nmm, %%reg, %%reg
          %assign %%reg %%reg+1
      %endrep
      ; the counter carries on from 8 here, covering indices 8..15
      %rep 8
          CAT_UNDEF m, %%reg
          CAT_UNDEF nmm, %%reg
          %assign %%reg %%reg+1
      %endrep
  %endmacro
  ; INIT_XMM
  ; Selects SSE (xmm) as the current vector flavour:
  ;   - mmsize = 16; num_mmregs = 16 on x86_64, 8 otherwise
  ;   - mova/movu/movh/movnt map to movdqa/movdqu/movq/movntdq
  ;   - m<i> names xmm<i>, with nxmm<i> = i as the reverse lookup
  ; Also makes RESET_MM_PERMUTATION re-run this macro.
  %macro INIT_XMM 0
      %define RESET_MM_PERMUTATION INIT_XMM
      %define mmsize 16
      %ifdef ARCH_X86_64
          %define num_mmregs 16
      %else
          %define num_mmregs 8
      %endif
      %define mova  movdqa
      %define movu  movdqu
      %define movh  movq
      %define movnt movntdq
      %assign %%reg 0
      %rep num_mmregs
          CAT_XDEFINE m, %%reg, xmm %+ %%reg
          CAT_XDEFINE nxmm, %%reg, %%reg
          %assign %%reg %%reg+1
      %endrep
  %endmacro
  ; Start out with the MMX register naming; this also defines
  ; RESET_MM_PERMUTATION, which cglobal invokes.
  INIT_MMX
  454. ; I often want to use macros that permute their arguments. e.g. there's no
  455. ; efficient way to implement butterfly or transpose or dct without swapping some
  456. ; arguments.
  457. ;
  458. ; I would like to not have to manually keep track of the permutations:
  459. ; If I insert a permutation in the middle of a function, it should automatically
  460. ; change everything that follows. For more complex macros I may also have multiple
  461. ; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
  462. ;
  463. ; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
  464. ; permutes its arguments. It's equivalent to exchanging the contents of the
  465. ; registers, except that this way you exchange the register names instead, so it
  466. ; doesn't cost any cycles.
  ; PERMUTE a0, b0, a1, b1, ...      (takes a list of pairs to swap)
  ; For every pair (a, b): m<a> takes the value m<b> had on entry, and nm<a>
  ; takes the old nm<b>. All sources are snapshotted before any destination is
  ; written, so the pairs may overlap.
  %macro PERMUTE 2-*
      ; pass 1: snapshot the source of every pair
      %rep %0/2
          %xdefine tmp%2 m%2
          %xdefine ntmp%2 nm%2
          %rotate 2
      %endrep
      ; pass 2: write the destinations and drop the snapshots
      %rep %0/2
          %xdefine m%1 tmp%2
          %xdefine nm%1 ntmp%2
          %undef tmp%2
          %undef ntmp%2
          %rotate 2
      %endrep
  %endmacro
  ; SWAP a, b [, c, ...]   (swaps a single chain; sometimes more concise than pairs)
  ; Exchanges the register names of each adjacent pair in turn: (a,b), then
  ; (b,c), and so on. The arguments are either register numbers ("SWAP 0,1")
  ; or the m# names themselves ("SWAP m0,m1").
  %macro SWAP 2-*
      %rep %0-1
          %ifdef m%1
              ; numeric form: m<%1> and m<%2> exist directly
              %xdefine tmp m%1
              %xdefine m%1 m%2
              %xdefine m%2 tmp
              CAT_XDEFINE n, m%1, %1
              CAT_XDEFINE n, m%2, %2
          %else
              ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here.
              ; Be careful using this mode in nested macros though, as in some cases there may be
              ; other copies of m# that have already been dereferenced and don't get updated correctly.
              %xdefine %%n1 n %+ %1
              %xdefine %%n2 n %+ %2
              %xdefine tmp m %+ %%n1
              CAT_XDEFINE m, %%n1, m %+ %%n2
              CAT_XDEFINE m, %%n2, tmp
              CAT_XDEFINE n, m %+ %%n1, %%n1
              CAT_XDEFINE n, m %+ %%n2, %%n2
          %endif
          %undef tmp
          %rotate 1
      %endrep
  %endmacro
  ; SAVE_MM_PERMUTATION name
  ; Snapshots the current m0..m<num_mmregs-1> mapping into <name>_m0,
  ; <name>_m1, ... so that LOAD_MM_PERMUTATION can reinstate it later.
  %macro SAVE_MM_PERMUTATION 1
      %assign %%reg 0
      %rep num_mmregs
          CAT_XDEFINE %1_m, %%reg, m %+ %%reg
          %assign %%reg %%reg+1
      %endrep
  %endmacro
  ; LOAD_MM_PERMUTATION name
  ; Reinstates the m# mapping stored under <name>_m# and rebuilds the matching
  ; reverse (n<register>) entries.
  %macro LOAD_MM_PERMUTATION 1
      %assign %%reg 0
      %rep num_mmregs
          CAT_XDEFINE m, %%reg, %1_m %+ %%reg
          ; must follow the line above: it reads the m# just installed
          CAT_XDEFINE n, m %+ %%reg, %%reg
          %assign %%reg %%reg+1
      %endrep
  %endmacro
  ; call target
  ; Wraps the call instruction: after emitting it, adopts the register
  ; permutation saved under the target's name, if one exists (detected through
  ; <target>_m0).
  %macro call 1
      call %1
      %ifdef %1_m0
          LOAD_MM_PERMUTATION %1
      %endif
  %endmacro
  ; Substitutions that reduce instruction size but are functionally equivalent.
  ; From here on, every movdqa/movdqu written in the source (including inside
  ; macros expanded later) is assembled as movaps/movups.
  %define movdqa movaps
  %define movdqu movups
  ; add dst, src
  ; Size-reducing wrapper for the add instruction: a literal 128 is rewritten
  ; as "sub dst, -128" (-128 fits a sign-extended 8-bit immediate, +128 does
  ; not). Any other operand is passed through unchanged.
  %macro add 2
      %ifnum %2
          %if %2!=128
              add %1, %2
          %else
              sub %1, -128
          %endif
      %else
          add %1, %2
      %endif
  %endmacro
  ; sub dst, src
  ; Size-reducing wrapper for the sub instruction: a literal 128 is rewritten
  ; as "add dst, -128" (-128 fits a sign-extended 8-bit immediate, +128 does
  ; not). Any other operand is passed through unchanged.
  %macro sub 2
      %ifnum %2
          %if %2!=128
              sub %1, %2
          %else
              add %1, -128
          %endif
      %else
          sub %1, %2
      %endif
  %endmacro