You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

895 lines
22KB

  1. ;*****************************************************************************
  2. ;* x86inc.asm
  3. ;*****************************************************************************
  4. ;* Copyright (C) 2005-2011 x264 project
  5. ;*
  6. ;* Authors: Loren Merritt <lorenm@u.washington.edu>
  7. ;* Anton Mitrofanov <BugMaster@narod.ru>
  8. ;* Jason Garrett-Glaser <darkshikari@gmail.com>
  9. ;*
  10. ;* Permission to use, copy, modify, and/or distribute this software for any
  11. ;* purpose with or without fee is hereby granted, provided that the above
  12. ;* copyright notice and this permission notice appear in all copies.
  13. ;*
  14. ;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  15. ;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  16. ;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
  17. ;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  18. ;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  19. ;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
  20. ;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  21. ;*****************************************************************************
  22. ; This is a header file for the x264ASM assembly language, which uses
  23. ; NASM/YASM syntax combined with a large number of macros to provide easy
  24. ; abstraction between different calling conventions (x86_32, win64, linux64).
  25. ; It also has various other useful features to simplify writing the kind of
  26. ; DSP functions that are most often used in x264.
  27. ; Unlike the rest of x264, this file is available under an ISC license, as it
  28. ; has significant usefulness outside of x264 and we want it to be available
  29. ; to the largest audience possible. Of course, if you modify it for your own
  30. ; purposes to add a new feature, we strongly encourage contributing a patch
  31. ; as this feature might be useful for others as well. Send patches or ideas
  32. ; to x264-devel@videolan.org .
; Prefix applied to every exported symbol (e.g. "ff" -> ff_foo for FFmpeg).
%define program_name ff

%ifdef ARCH_X86_64
    ; NOTE(review): this matches output format "win32" even for 64-bit
    ; Windows builds — confirm against the object-format name the build
    ; system actually passes to the assembler.
    %ifidn __OUTPUT_FORMAT__,win32
        %define WIN64
    %else
        %define UNIX64
    %endif
%endif

; Optionally prepend an underscore, for ABIs that mangle C symbols that way
; (e.g. Mach-O and some win32 configurations define PREFIX).
%ifdef PREFIX
    %define mangle(x) _ %+ x
%else
    %define mangle(x) x
%endif
; FIXME: All of the 64bit asm functions that take a stride as an argument
; via register, assume that the high dword of that register is filled with 0.
; This is true in practice (since we never do any 64bit arithmetic on strides,
; and x264's strides are all positive), but is not guaranteed by the ABI.

; Name of the .rodata section.
; Kludge: Something on OS X fails to align .rodata even given an align attribute,
; so use a different read-only section.
;
; SECTION_RODATA [alignment]
;   Switches to the read-only data section; optional %1 is the section
;   alignment in bytes (default 16).
%macro SECTION_RODATA 0-1 16
    %ifidn __OUTPUT_FORMAT__,macho64
        SECTION .text align=%1          ; Mach-O workaround: keep constants in .text
    %elifidn __OUTPUT_FORMAT__,macho
        SECTION .text align=%1
        fakegot:
    %else
        SECTION .rodata align=%1
    %endif
%endmacro
; Win64 code is always assembled position-independent here; 32-bit x86 is
; explicitly not (see comment below).
%ifdef WIN64
    %define PIC
%elifndef ARCH_X86_64
    ; x86_32 doesn't require PIC.
    ; Some distros prefer shared objects to be PIC, but nothing breaks if
    ; the code contains a few textrels, so we'll skip that complexity.
    %undef PIC
%endif
%ifdef PIC
    default rel                         ; make [sym] addressing RIP-relative by default
%endif
  74. ; Macros to eliminate most code duplication between x86_32 and x86_64:
  75. ; Currently this works only for leaf functions which load all their arguments
  76. ; into registers at the start, and make no other use of the stack. Luckily that
  77. ; covers most of x264's asm.
  78. ; PROLOGUE:
  79. ; %1 = number of arguments. loads them from stack if needed.
  80. ; %2 = number of registers used. pushes callee-saved regs if needed.
  81. ; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
  82. ; %4 = list of names to define to registers
  83. ; PROLOGUE can also be invoked by adding the same options to cglobal
  84. ; e.g.
  85. ; cglobal foo, 2,3,0, dst, src, tmp
  86. ; declares a function (foo), taking two args (dst and src) and one local variable (tmp)
  87. ; TODO Some functions can use some args directly from the stack. If they're the
  88. ; last args then you can just not declare them, but if they're in the middle
  89. ; we need more flexible macro.
  90. ; RET:
  91. ; Pops anything that was pushed by PROLOGUE
  92. ; REP_RET:
  93. ; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons
  94. ; which are slow when a normal ret follows a branch.
  95. ; registers:
  96. ; rN and rNq are the native-size register holding function argument N
  97. ; rNd, rNw, rNb are dword, word, and byte size
  98. ; rNm is the original location of arg N (a register or on the stack), dword
  99. ; rNmp is native size
; DECLARE_REG n, q, d, w, b, m
;   Defines the per-argument register aliases for argument slot n:
;   rNq/rNd/rNw/rNb = qword/dword/word/byte views of the register,
;   rNm  = the original location of the argument (a register name or a
;          stack address, as passed in %6),
;   rNmp = native-pointer-size access to that original location,
;   rN   = the native-size register itself.
%macro DECLARE_REG 6
    %define r%1q %2
    %define r%1d %3
    %define r%1w %4
    %define r%1b %5
    %define r%1m %6
    %ifid %6 ; i.e. it's a register
        %define r%1mp %2
    %elifdef ARCH_X86_64 ; memory
        %define r%1mp qword %6
    %else
        %define r%1mp dword %6
    %endif
    %define r%1 %2
%endmacro
; DECLARE_REG_SIZE name, low8
;   Defines size aliases for one legacy GPR so the r-prefixed names can be
;   used uniformly on both architectures (e.g. rax/eax/ax/al). On x86_32,
;   the native-size alias rXX maps to the 32-bit e-register.
%macro DECLARE_REG_SIZE 2
    %define r%1q r%1
    %define e%1q r%1
    %define r%1d e%1
    %define e%1d e%1
    %define r%1w %1
    %define e%1w %1
    %define r%1b %2
    %define e%1b %2
    %ifndef ARCH_X86_64
        %define r%1 e%1
    %endif
%endmacro

DECLARE_REG_SIZE ax, al
DECLARE_REG_SIZE bx, bl
DECLARE_REG_SIZE cx, cl
DECLARE_REG_SIZE dx, dl
DECLARE_REG_SIZE si, sil
DECLARE_REG_SIZE di, dil
DECLARE_REG_SIZE bp, bpl
; t# defines for when per-arch register allocation is more complex than just function arguments
;
; DECLARE_REG_TMP a, b, ...
;   Maps t0 -> r<a>, t1 -> r<b>, ... (one temporary per listed argument index).
%macro DECLARE_REG_TMP 1-*
    %assign %%i 0
    %rep %0
        CAT_XDEFINE t, %%i, r%1
        %assign %%i %%i+1
        %rotate 1
    %endrep
%endmacro

; DECLARE_REG_TMP_SIZE i, j, ...
;   For each listed temporary index, defines tNq/tNd/tNw/tNb size variants
;   that forward to the underlying rN aliases.
%macro DECLARE_REG_TMP_SIZE 0-*
    %rep %0
        %define t%1q t%1 %+ q
        %define t%1d t%1 %+ d
        %define t%1w t%1 %+ w
        %define t%1b t%1 %+ b
        %rotate 1
    %endrep
%endmacro

DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9
; Size of a general-purpose register in bytes.
%ifdef ARCH_X86_64
    %define gprsize 8
%else
    %define gprsize 4
%endif

; PUSH/POP/SUB/ADD wrappers keep stack_offset (the current distance of rsp
; below its value at function entry) up to date, so the rNm stack-argument
; addresses computed from stack_offset remain correct.
%macro PUSH 1
    push %1
    %assign stack_offset stack_offset+gprsize
%endmacro

%macro POP 1
    pop %1
    %assign stack_offset stack_offset-gprsize
%endmacro

%macro SUB 2
    sub %1, %2
    %ifidn %1, rsp                      ; only rsp adjustments affect stack_offset
        %assign stack_offset stack_offset+(%2)
    %endif
%endmacro

%macro ADD 2
    add %1, %2
    %ifidn %1, rsp
        %assign stack_offset stack_offset-(%2)
    %endif
%endmacro

; mov, but elided entirely when source and destination are the same token.
%macro movifnidn 2
    %ifnidn %1, %2
        mov %1, %2
    %endif
%endmacro

; movsxd, elided when source and destination are the same token.
%macro movsxdifnidn 2
    %ifnidn %1, %2
        movsxd %1, %2
    %endif
%endmacro

; Assembly-time assertion: aborts the build if the expression is zero.
%macro ASSERT 1
    %if (%1) == 0
        %error assert failed
    %endif
%endmacro
; DEFINE_ARGS name0, name1, ...
;   Gives symbolic names to the argument registers: for each name, defines
;   name{q,d,w,b,m} as aliases for rN{q,d,w,b,m}. Any names created by a
;   previous invocation are undefined first (tracked via arg_name#/n_arg_names).
%macro DEFINE_ARGS 0-*
    %ifdef n_arg_names
        ; tear down the previous set of argument-name aliases
        %assign %%i 0
        %rep n_arg_names
            CAT_UNDEF arg_name %+ %%i, q
            CAT_UNDEF arg_name %+ %%i, d
            CAT_UNDEF arg_name %+ %%i, w
            CAT_UNDEF arg_name %+ %%i, b
            CAT_UNDEF arg_name %+ %%i, m
            CAT_UNDEF arg_name, %%i
            %assign %%i %%i+1
        %endrep
    %endif

    %assign %%i 0
    %rep %0
        %xdefine %1q r %+ %%i %+ q
        %xdefine %1d r %+ %%i %+ d
        %xdefine %1w r %+ %%i %+ w
        %xdefine %1b r %+ %%i %+ b
        %xdefine %1m r %+ %%i %+ m
        CAT_XDEFINE arg_name, %%i, %1   ; remember the name for the next teardown
        %assign %%i %%i+1
        %rotate 1
    %endrep
    %assign n_arg_names %%i
%endmacro
%ifdef WIN64 ; Windows x64 ;=================================================

; Microsoft x64 ABI: first four args in rcx, rdx, r8, r9; further args on the
; stack above the 32-byte shadow space (hence the +40 for arg 4).
; rdi/rsi are callee-saved here, unlike on SysV.
DECLARE_REG 0, rcx, ecx, cx, cl, ecx
DECLARE_REG 1, rdx, edx, dx, dl, edx
DECLARE_REG 2, r8, r8d, r8w, r8b, r8d
DECLARE_REG 3, r9, r9d, r9w, r9b, r9d
DECLARE_REG 4, rdi, edi, di, dil, [rsp + stack_offset + 40]
DECLARE_REG 5, rsi, esi, si, sil, [rsp + stack_offset + 48]
DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56]
%define r7m [rsp + stack_offset + 64]
%define r8m [rsp + stack_offset + 72]

; Load argument %1 from its stack home into its assigned register,
; but only if the function actually declared that many arguments.
%macro LOAD_IF_USED 2 ; reg_id, number_of_args
    %if %1 < %2
        mov r%1, [rsp + stack_offset + 8 + %1*8]
    %endif
%endmacro

%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names...
    ASSERT %2 >= %1
    %assign regs_used %2
    ASSERT regs_used <= 7
    %if regs_used > 4
        push r4                         ; rdi/rsi are callee-saved on win64
        push r5
        %assign stack_offset stack_offset+16
    %endif
    WIN64_SPILL_XMM %3
    LOAD_IF_USED 4, %1                  ; args 4+ arrive on the stack
    LOAD_IF_USED 5, %1
    LOAD_IF_USED 6, %1
    DEFINE_ARGS %4
%endmacro
; Spill the callee-saved registers xmm6..xmm(%1-1) to a newly allocated
; stack area. The extra +16 keeps the 16-byte-aligned save slots addressable
; at rsp+8 (rsp % 16 == 8 on function entry after the call pushed the
; return address). Records the allocation in stack_offset.
%macro WIN64_SPILL_XMM 1
    %assign xmm_regs_used %1
    ASSERT xmm_regs_used <= 16
    %if xmm_regs_used > 6               ; xmm0-5 are volatile; nothing to save
        sub rsp, (xmm_regs_used-6)*16+16
        %assign stack_offset stack_offset+(xmm_regs_used-6)*16+16
        %assign %%i xmm_regs_used
        %rep (xmm_regs_used-6)
            %assign %%i %%i-1
            movdqa [rsp + (%%i-6)*16+8], xmm %+ %%i
        %endrep
    %endif
%endmacro
; Reload the spilled callee-saved xmm registers from the save area addressed
; via %1 (normally rsp) and release the area by advancing %1. Deliberately
; does NOT touch stack_offset — RET uses this directly, where bookkeeping
; no longer matters.
%macro WIN64_RESTORE_XMM_INTERNAL 1
    %if xmm_regs_used > 6
        %assign %%i xmm_regs_used
        %rep (xmm_regs_used-6)
            %assign %%i %%i-1
            movdqa xmm %+ %%i, [%1 + (%%i-6)*16+8]
        %endrep
        add %1, (xmm_regs_used-6)*16+16 ; free the save area allocated by WIN64_SPILL_XMM
    %endif
%endmacro
  273. %macro WIN64_RESTORE_XMM 1
  274. WIN64_RESTORE_XMM_INTERNAL %1
  275. %assign stack_offset stack_offset-(xmm_regs_used-6)*16+16
  276. %assign xmm_regs_used 0
  277. %endmacro
; Epilogue: reload spilled xmm registers (adjusting rsp itself) and pop the
; callee-saved GPRs pushed by PROLOGUE, then return.
%macro RET 0
    WIN64_RESTORE_XMM_INTERNAL rsp
    %if regs_used > 4
        pop r5
        pop r4
    %endif
    ret
%endmacro

; Like RET, but emits a 2-byte "rep ret" when no epilogue work is needed —
; avoids the branch-predictor penalty of a bare ret after a jump on AMD K8/K10.
%macro REP_RET 0
    %if regs_used > 4 || xmm_regs_used > 6
        RET
    %else
        rep ret
    %endif
%endmacro
%elifdef ARCH_X86_64 ; *nix x64 ;=============================================

; System V AMD64 ABI: first six args in rdi, rsi, rdx, rcx, r8, r9. All six,
; plus rax, are caller-saved, so the prologue never needs to push anything.
DECLARE_REG 0, rdi, edi, di, dil, edi
DECLARE_REG 1, rsi, esi, si, sil, esi
DECLARE_REG 2, rdx, edx, dx, dl, edx
DECLARE_REG 3, rcx, ecx, cx, cl, ecx
DECLARE_REG 4, r8, r8d, r8w, r8b, r8d
DECLARE_REG 5, r9, r9d, r9w, r9b, r9d
DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 8]
%define r7m [rsp + stack_offset + 16]
%define r8m [rsp + stack_offset + 24]

; Load argument %1 into its register if the function declared that many args.
; NOTE(review): only ever invoked with reg_id 6 below; for %1 == 6 the address
; resolves to [rsp + 8], the first stack argument at function entry. The
; "- 40" form presumably exists for symmetry with hypothetical ids 5..6 —
; confirm before using with other ids, as it ignores stack_offset.
%macro LOAD_IF_USED 2 ; reg_id, number_of_args
    %if %1 < %2
        mov r%1, [rsp - 40 + %1*8]
    %endif
%endmacro

%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
    ASSERT %2 >= %1
    ASSERT %2 <= 7
    LOAD_IF_USED 6, %1                  ; the 7th argument arrives on the stack
    DEFINE_ARGS %4
%endmacro

%macro RET 0
    ret
%endmacro

%macro REP_RET 0
    rep ret                             ; see the win64 REP_RET comment (AMD K8/K10)
%endmacro
%else ; X86_32 ;==============================================================

; cdecl: every argument arrives on the stack; ebx/esi/edi/ebp are
; callee-saved. esi/edi/ebp have no byte-size form, hence "null" for the
; b-size alias.
DECLARE_REG 0, eax, eax, ax, al, [esp + stack_offset + 4]
DECLARE_REG 1, ecx, ecx, cx, cl, [esp + stack_offset + 8]
DECLARE_REG 2, edx, edx, dx, dl, [esp + stack_offset + 12]
DECLARE_REG 3, ebx, ebx, bx, bl, [esp + stack_offset + 16]
DECLARE_REG 4, esi, esi, si, null, [esp + stack_offset + 20]
DECLARE_REG 5, edi, edi, di, null, [esp + stack_offset + 24]
DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
%define r7m [esp + stack_offset + 32]
%define r8m [esp + stack_offset + 36]
%define rsp esp

; Push callee-saved register %1 if the function uses that many registers.
%macro PUSH_IF_USED 1 ; reg_id
    %if %1 < regs_used
        push r%1
        %assign stack_offset stack_offset+4
    %endif
%endmacro

%macro POP_IF_USED 1 ; reg_id
    %if %1 < regs_used
        pop r%1
    %endif
%endmacro

; Load argument %1 from the stack if the function declared that many args.
%macro LOAD_IF_USED 2 ; reg_id, number_of_args
    %if %1 < %2
        mov r%1, [esp + stack_offset + 4 + %1*4]
    %endif
%endmacro

%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
    ASSERT %2 >= %1
    %assign regs_used %2
    ASSERT regs_used <= 7
    PUSH_IF_USED 3                      ; r3..r6 map to callee-saved ebx/esi/edi/ebp
    PUSH_IF_USED 4
    PUSH_IF_USED 5
    PUSH_IF_USED 6
    LOAD_IF_USED 0, %1                  ; in cdecl, all args arrive on the stack
    LOAD_IF_USED 1, %1
    LOAD_IF_USED 2, %1
    LOAD_IF_USED 3, %1
    LOAD_IF_USED 4, %1
    LOAD_IF_USED 5, %1
    LOAD_IF_USED 6, %1
    DEFINE_ARGS %4
%endmacro

%macro RET 0
    POP_IF_USED 6                       ; reverse order of the PROLOGUE pushes
    POP_IF_USED 5
    POP_IF_USED 4
    POP_IF_USED 3
    ret
%endmacro

%macro REP_RET 0
    %if regs_used > 3
        RET
    %else
        rep ret
    %endif
%endmacro

%endif ;======================================================================
; No-op stubs so arch-independent code can invoke the WIN64 xmm spill/restore
; macros unconditionally.
%ifndef WIN64
    %macro WIN64_SPILL_XMM 1
    %endmacro
    %macro WIN64_RESTORE_XMM 1
    %endmacro
%endif
;=============================================================================
; arch-independent part
;=============================================================================

%assign function_align 16

; Symbol prefix for C linkage
;
; cglobal name[, PROLOGUE args...]
;   Declares an exported C-callable function: mangles the name with
;   program_name, emits the global directive (hidden visibility on ELF),
;   aligns the entry point, resets the mm register permutation, zeroes
;   stack_offset, and runs PROLOGUE if extra arguments were supplied.
%macro cglobal 1-2+
    %xdefine %1 mangle(program_name %+ _ %+ %1)
    %xdefine %1.skip_prologue %1 %+ .skip_prologue
    %ifidn __OUTPUT_FORMAT__,elf
        global %1:function hidden
    %else
        global %1
    %endif
    align function_align
    %1:
    RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
    %assign stack_offset 0
    %if %0 > 1
        PROLOGUE %2
    %endif
%endmacro
; Declares an external symbol, applying the program_name prefix and mangling.
%macro cextern 1
    %xdefine %1 mangle(program_name %+ _ %+ %1)
    extern %1
%endmacro

;like cextern, but without the prefix
%macro cextern_naked 1
    %xdefine %1 mangle(%1)
    extern %1
%endmacro

; const name, data...
;   Defines an exported, mangled data symbol initialized with %2.
%macro const 2+
    %xdefine %1 mangle(program_name %+ _ %+ %1)
    global %1
    %1: %2
%endmacro

; This is needed for ELF, otherwise the GNU linker assumes the stack is
; executable by default.
%ifidn __OUTPUT_FORMAT__,elf
SECTION .note.GNU-stack noalloc noexec nowrite progbits
%endif
; merge mmx and sse*

; Token-pasting helpers: define or undefine the symbol formed by
; concatenating %1 and %2 (works around %define's literal-name limitation).
%macro CAT_XDEFINE 3
    %xdefine %1%2 %3
%endmacro

%macro CAT_UNDEF 2
    %undef %1%2
%endmacro
; Select the MMX register set: m0..m7 alias mm0..mm7, 8-byte vectors, and the
; mova/movu/movh/movnta generics map to the MMX move instructions.
%macro INIT_MMX 0
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_MMX
    %define mmsize 8
    %define num_mmregs 8
    %define mova movq
    %define movu movq
    %define movh movd
    %define movnta movntq
    %assign %%i 0
    %rep 8
        CAT_XDEFINE m, %%i, mm %+ %%i
        CAT_XDEFINE nmm, %%i, %%i
        %assign %%i %%i+1
    %endrep
    ; %%i continues at 8: undefine m8..m15 (and nmm8..nmm15 if present),
    ; which may linger from a previous INIT_XMM on x86-64.
    %rep 8
        CAT_UNDEF m, %%i
        CAT_UNDEF nmm, %%i
        %assign %%i %%i+1
    %endrep
%endmacro
; Select the SSE register set: m0..m(N-1) alias xmm regs (8 on x86_32,
; 16 on x86_64), 16-byte vectors, SSE2 move generics.
%macro INIT_XMM 0
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_XMM
    %define mmsize 16
    %define num_mmregs 8
    %ifdef ARCH_X86_64
        %define num_mmregs 16
    %endif
    %define mova movdqa
    %define movu movdqu
    %define movh movq
    %define movnta movntdq
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, xmm %+ %%i
        CAT_XDEFINE nxmm, %%i, %%i
        %assign %%i %%i+1
    %endrep
%endmacro

; SSE register set, but with VEX (AVX) encodings enabled in RUN_AVX_INSTR.
%macro INIT_AVX 0
    INIT_XMM
    %assign avx_enabled 1
    %define PALIGNR PALIGNR_SSSE3
    %define RESET_MM_PERMUTATION INIT_AVX
%endmacro

; Select the 32-byte ymm register set (AVX).
%macro INIT_YMM 0
    %assign avx_enabled 1
    %define RESET_MM_PERMUTATION INIT_YMM
    %define mmsize 32
    %define num_mmregs 8
    %ifdef ARCH_X86_64
        %define num_mmregs 16
    %endif
    %define mova vmovaps
    %define movu vmovups
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, ymm %+ %%i
        CAT_XDEFINE nymm, %%i, %%i
        %assign %%i %%i+1
    %endrep
%endmacro

INIT_MMX                                ; default register set until a function overrides it
  496. ; I often want to use macros that permute their arguments. e.g. there's no
  497. ; efficient way to implement butterfly or transpose or dct without swapping some
  498. ; arguments.
  499. ;
  500. ; I would like to not have to manually keep track of the permutations:
  501. ; If I insert a permutation in the middle of a function, it should automatically
  502. ; change everything that follows. For more complex macros I may also have multiple
  503. ; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
  504. ;
  505. ; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
  506. ; permutes its arguments. It's equivalent to exchanging the contents of the
  507. ; registers, except that this way you exchange the register names instead, so it
  508. ; doesn't cost any cycles.
; PERMUTE dst0, src0, dst1, src1, ...
;   Renames register aliases so that m<dst> refers to what m<src> was before
;   the call. All sources are captured into temporaries first, so swaps and
;   longer cycles resolve correctly.
%macro PERMUTE 2-* ; takes a list of pairs to swap
    %rep %0/2
        %xdefine tmp%2 m%2              ; snapshot every source first
        %xdefine ntmp%2 nm%2
        %rotate 2
    %endrep
    %rep %0/2
        %xdefine m%1 tmp%2              ; then retarget each destination
        %xdefine nm%1 ntmp%2
        %undef tmp%2
        %undef ntmp%2
        %rotate 2
    %endrep
%endmacro
; SWAP a, b[, c...]
;   Exchanges register aliases along a chain: m<a> <-> m<b>, then the new
;   m<b> <-> m<c>, and so on. Also updates the reverse n<reg> -> index map.
%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs)
    %rep %0-1
        %ifdef m%1                      ; numeric form: args are indices
            %xdefine tmp m%1
            %xdefine m%1 m%2
            %xdefine m%2 tmp
            CAT_XDEFINE n, m%1, %1
            CAT_XDEFINE n, m%2, %2
        %else
            ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here.
            ; Be careful using this mode in nested macros though, as in some cases there may be
            ; other copies of m# that have already been dereferenced and don't get updated correctly.
            %xdefine %%n1 n %+ %1
            %xdefine %%n2 n %+ %2
            %xdefine tmp m %+ %%n1
            CAT_XDEFINE m, %%n1, m %+ %%n2
            CAT_XDEFINE m, %%n2, tmp
            CAT_XDEFINE n, m %+ %%n1, %%n1
            CAT_XDEFINE n, m %+ %%n2, %%n2
        %endif
        %undef tmp
        %rotate 1
    %endrep
%endmacro
; If SAVE_MM_PERMUTATION is placed at the end of a function and given the
; function name, then any later calls to that function will automatically
; load the permutation, so values can be returned in mmregs.
%macro SAVE_MM_PERMUTATION 1 ; name to save as
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE %1_m, %%i, m %+ %%i ; snapshot m# under <name>_m#
        %assign %%i %%i+1
    %endrep
%endmacro

%macro LOAD_MM_PERMUTATION 1 ; name to load from
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, %1_m %+ %%i
        CAT_XDEFINE n, m %+ %%i, %%i    ; rebuild the reverse reg -> index map
        %assign %%i %%i+1
    %endrep
%endmacro
; Overridden "call": after calling a function that ended with
; SAVE_MM_PERMUTATION (detected by the existence of <name>_m0), adopt the
; callee's register permutation in the caller.
%macro call 1
    call %1
    %ifdef %1_m0
        LOAD_MM_PERMUTATION %1
    %endif
%endmacro
; Substitutions that reduce instruction size but are functionally equivalent
; ("add x, 128" needs a 4-byte immediate; "sub x, -128" fits in a signed
; 1-byte immediate, and vice versa).
%macro add 2
    %ifnum %2
        %if %2==128
            sub %1, -128
        %else
            add %1, %2
        %endif
    %else
        add %1, %2
    %endif
%endmacro

%macro sub 2
    %ifnum %2
        %if %2==128
            add %1, -128
        %else
            sub %1, %2
        %endif
    %else
        sub %1, %2
    %endif
%endmacro
;=============================================================================
; AVX abstraction layer
;=============================================================================

; Define sizeofmm0..7 = 8, sizeofxmm0..15 = 16, sizeofymm0..15 = 32 so that
; RUN_AVX_INSTR can resolve "sizeof<register>" by token pasting.
%assign i 0
%rep 16
    %if i < 8
        CAT_XDEFINE sizeofmm, i, 8
    %endif
    CAT_XDEFINE sizeofxmm, i, 16
    CAT_XDEFINE sizeofymm, i, 32
    %assign i i+1
%endrep
%undef i
;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 0 if 3-operand (xmm, xmm, xmm), 1 if 4-operand (xmm, xmm, xmm, imm)
;%4 == number of operands given
;%5+: operands
;
; Emits either the VEX-encoded 3-operand AVX form (v-prefixed) or an
; SSE-compatible emulation (mov into dst, then the 2-operand instruction).
%macro RUN_AVX_INSTR 6-7+
    %if sizeof%5==32
        v%1 %5, %6, %7                  ; ymm operands always use the AVX form
    %else
        ; choose the register-copy instruction used for the SSE emulation
        %if sizeof%5==8
            %define %%regmov movq
        %elif %2
            %define %%regmov movaps     ; float domain
        %else
            %define %%regmov movdqa     ; int domain
        %endif

        %if %4>=3+%3                    ; separate dst and src operands given
            %ifnidn %5, %6
                %if avx_enabled && sizeof%5==16
                    v%1 %5, %6, %7      ; native non-destructive 3-operand form
                %else
                    %%regmov %5, %6     ; emulate: dst = src1, then dst op= rest
                    %1 %5, %7
                %endif
            %else
                %1 %5, %7               ; dst == src1: plain destructive form
            %endif
        %elif %3
            %1 %5, %6, %7               ; 2 operands + immediate
        %else
            %1 %5, %6
        %endif
    %endif
%endmacro
;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 0 if 3-operand (xmm, xmm, xmm), 1 if 4-operand (xmm, xmm, xmm, imm)
;
; Defines a macro named after the instruction itself that counts how many
; operands were actually supplied (via the "fnord" default sentinels) and
; dispatches to RUN_AVX_INSTR accordingly.
%macro AVX_INSTR 3
    %macro %1 2-8 fnord, fnord, fnord, %1, %2, %3
        %ifidn %3, fnord
            RUN_AVX_INSTR %6, %7, %8, 2, %1, %2
        %elifidn %4, fnord
            RUN_AVX_INSTR %6, %7, %8, 3, %1, %2, %3
        %elifidn %5, fnord
            RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4
        %else
            RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5
        %endif
    %endmacro
%endmacro
; Instantiate the overriding macro for every SSE/SSE2/SSSE3/SSE4 instruction
; that has a VEX-encoded AVX counterpart:
;   AVX_INSTR name, is_float_domain, has_immediate_operand
AVX_INSTR addpd, 1, 0
AVX_INSTR addps, 1, 0
AVX_INSTR addsd, 1, 0
AVX_INSTR addss, 1, 0
AVX_INSTR addsubpd, 1, 0
AVX_INSTR addsubps, 1, 0
AVX_INSTR andpd, 1, 0
AVX_INSTR andps, 1, 0
AVX_INSTR andnpd, 1, 0
AVX_INSTR andnps, 1, 0
AVX_INSTR blendpd, 1, 0
AVX_INSTR blendps, 1, 0
AVX_INSTR blendvpd, 1, 0
AVX_INSTR blendvps, 1, 0
AVX_INSTR cmppd, 1, 0
AVX_INSTR cmpps, 1, 0
AVX_INSTR cmpsd, 1, 0
AVX_INSTR cmpss, 1, 0
AVX_INSTR divpd, 1, 0
AVX_INSTR divps, 1, 0
AVX_INSTR divsd, 1, 0
AVX_INSTR divss, 1, 0
AVX_INSTR dppd, 1, 0
AVX_INSTR dpps, 1, 0
AVX_INSTR haddpd, 1, 0
AVX_INSTR haddps, 1, 0
AVX_INSTR hsubpd, 1, 0
AVX_INSTR hsubps, 1, 0
AVX_INSTR maxpd, 1, 0
AVX_INSTR maxps, 1, 0
AVX_INSTR maxsd, 1, 0
AVX_INSTR maxss, 1, 0
AVX_INSTR minpd, 1, 0
AVX_INSTR minps, 1, 0
AVX_INSTR minsd, 1, 0
AVX_INSTR minss, 1, 0
AVX_INSTR mpsadbw, 0, 1
AVX_INSTR mulpd, 1, 0
AVX_INSTR mulps, 1, 0
AVX_INSTR mulsd, 1, 0
AVX_INSTR mulss, 1, 0
AVX_INSTR orpd, 1, 0
AVX_INSTR orps, 1, 0
AVX_INSTR packsswb, 0, 0
AVX_INSTR packssdw, 0, 0
AVX_INSTR packuswb, 0, 0
AVX_INSTR packusdw, 0, 0
AVX_INSTR paddb, 0, 0
AVX_INSTR paddw, 0, 0
AVX_INSTR paddd, 0, 0
AVX_INSTR paddq, 0, 0
AVX_INSTR paddsb, 0, 0
AVX_INSTR paddsw, 0, 0
AVX_INSTR paddusb, 0, 0
AVX_INSTR paddusw, 0, 0
AVX_INSTR palignr, 0, 1
AVX_INSTR pand, 0, 0
AVX_INSTR pandn, 0, 0
AVX_INSTR pavgb, 0, 0
AVX_INSTR pavgw, 0, 0
AVX_INSTR pblendvb, 0, 0
AVX_INSTR pblendw, 0, 1
AVX_INSTR pcmpestri, 0, 0
AVX_INSTR pcmpestrm, 0, 0
AVX_INSTR pcmpistri, 0, 0
AVX_INSTR pcmpistrm, 0, 0
AVX_INSTR pcmpeqb, 0, 0
AVX_INSTR pcmpeqw, 0, 0
AVX_INSTR pcmpeqd, 0, 0
AVX_INSTR pcmpeqq, 0, 0
AVX_INSTR pcmpgtb, 0, 0
AVX_INSTR pcmpgtw, 0, 0
AVX_INSTR pcmpgtd, 0, 0
AVX_INSTR pcmpgtq, 0, 0
AVX_INSTR phaddw, 0, 0
AVX_INSTR phaddd, 0, 0
AVX_INSTR phaddsw, 0, 0
AVX_INSTR phsubw, 0, 0
AVX_INSTR phsubd, 0, 0
AVX_INSTR phsubsw, 0, 0
AVX_INSTR pmaddwd, 0, 0
AVX_INSTR pmaddubsw, 0, 0
AVX_INSTR pmaxsb, 0, 0
AVX_INSTR pmaxsw, 0, 0
AVX_INSTR pmaxsd, 0, 0
AVX_INSTR pmaxub, 0, 0
AVX_INSTR pmaxuw, 0, 0
AVX_INSTR pmaxud, 0, 0
AVX_INSTR pminsb, 0, 0
AVX_INSTR pminsw, 0, 0
AVX_INSTR pminsd, 0, 0
AVX_INSTR pminub, 0, 0
AVX_INSTR pminuw, 0, 0
AVX_INSTR pminud, 0, 0
AVX_INSTR pmulhuw, 0, 0
AVX_INSTR pmulhrsw, 0, 0
AVX_INSTR pmulhw, 0, 0
AVX_INSTR pmullw, 0, 0
AVX_INSTR pmulld, 0, 0
AVX_INSTR pmuludq, 0, 0
AVX_INSTR pmuldq, 0, 0
AVX_INSTR por, 0, 0
AVX_INSTR psadbw, 0, 0
AVX_INSTR pshufb, 0, 0
AVX_INSTR psignb, 0, 0
AVX_INSTR psignw, 0, 0
AVX_INSTR psignd, 0, 0
AVX_INSTR psllw, 0, 0
AVX_INSTR pslld, 0, 0
AVX_INSTR psllq, 0, 0
AVX_INSTR pslldq, 0, 0
AVX_INSTR psraw, 0, 0
AVX_INSTR psrad, 0, 0
AVX_INSTR psrlw, 0, 0
AVX_INSTR psrld, 0, 0
AVX_INSTR psrlq, 0, 0
AVX_INSTR psrldq, 0, 0
AVX_INSTR psubb, 0, 0
AVX_INSTR psubw, 0, 0
AVX_INSTR psubd, 0, 0
AVX_INSTR psubq, 0, 0
AVX_INSTR psubsb, 0, 0
AVX_INSTR psubsw, 0, 0
AVX_INSTR psubusb, 0, 0
AVX_INSTR psubusw, 0, 0
AVX_INSTR punpckhbw, 0, 0
AVX_INSTR punpckhwd, 0, 0
AVX_INSTR punpckhdq, 0, 0
AVX_INSTR punpckhqdq, 0, 0
AVX_INSTR punpcklbw, 0, 0
AVX_INSTR punpcklwd, 0, 0
AVX_INSTR punpckldq, 0, 0
AVX_INSTR punpcklqdq, 0, 0
AVX_INSTR pxor, 0, 0
AVX_INSTR shufps, 0, 1
AVX_INSTR subpd, 1, 0
AVX_INSTR subps, 1, 0
AVX_INSTR subsd, 1, 0
AVX_INSTR subss, 1, 0
AVX_INSTR unpckhpd, 1, 0
AVX_INSTR unpckhps, 1, 0
AVX_INSTR unpcklpd, 1, 0
AVX_INSTR unpcklps, 1, 0
AVX_INSTR xorpd, 1, 0
AVX_INSTR xorps, 1, 0
; 3DNow instructions, for sharing code between AVX, SSE and 3DN
AVX_INSTR pfadd, 1, 0
AVX_INSTR pfsub, 1, 0
AVX_INSTR pfmul, 1, 0