;*****************************************************************************
;* x86inc.asm
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Anton Mitrofanov <BugMaster@narod.ru>
;*          Jason Garrett-Glaser <darkshikari@gmail.com>
;*
;* Permission to use, copy, modify, and/or distribute this software for any
;* purpose with or without fee is hereby granted, provided that the above
;* copyright notice and this permission notice appear in all copies.
;*
;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;*****************************************************************************

; This is a header file for the x264ASM assembly language, which uses
; NASM/YASM syntax combined with a large number of macros to provide easy
; abstraction between different calling conventions (x86_32, win64, linux64).
; It also has various other useful features to simplify writing the kind of
; DSP functions that are most often used in x264.

; Unlike the rest of x264, this file is available under an ISC license, as it
; has significant usefulness outside of x264 and we want it to be available
; to the largest audience possible.  Of course, if you modify it for your own
; purposes to add a new feature, we strongly encourage contributing a patch
; as this feature might be useful for others as well.  Send patches or ideas
; to x264-devel@videolan.org .

%ifndef THIRD_PARTY_X86INC_X86INC_ASM_
%define THIRD_PARTY_X86INC_X86INC_ASM_

; TODO(wolenetz): Consider either updating this customized version to base from
; a more recent original, or switching to using third_party/ffmpeg's version of
; this abstraction layer.  See http://crbug.com/175029

%define program_name ff

%ifdef ARCH_X86_64
    %ifidn __OUTPUT_FORMAT__,win32
        %define WIN64
    %elifidn __OUTPUT_FORMAT__,win64
        %define WIN64
    %else
        %define UNIX64
    %endif
%endif

%ifdef PREFIX
    %define mangle(x) _ %+ x
%else
    %define mangle(x) x
%endif

; FIXME: All of the 64bit asm functions that take a stride as an argument
; via register assume that the high dword of that register is filled with 0.
; This is true in practice (since we never do any 64bit arithmetic on strides,
; and x264's strides are all positive), but is not guaranteed by the ABI.

; Name of the .rodata section.
; Kludge: Something on OS X fails to align .rodata even given an align attribute,
; so use a different read-only section.
%ifdef CHROMIUM
%macro SECTION_RODATA 0-1 16
    %ifidn __OUTPUT_FORMAT__,macho64
        SECTION .text align=%1
    %elifidn __OUTPUT_FORMAT__,macho
        SECTION .text align=%1
        fakegot:
    %elifidn __OUTPUT_FORMAT__,aout
        section .text
    %else
        SECTION .rodata align=%1
    %endif
%endmacro
%else
%macro SECTION_RODATA 0-1 16
    %ifidn __OUTPUT_FORMAT__,aout
        section .text
    %else
        SECTION .rodata align=%1
    %endif
%endmacro
%endif

; aout does not support align=
%macro SECTION_TEXT 0-1 16
    %ifidn __OUTPUT_FORMAT__,aout
        SECTION .text
    %else
        SECTION .text align=%1
    %endif
%endmacro

%ifdef WIN64
    %define PIC
%elifndef ARCH_X86_64
; For Chromium we may build PIC code even on 32-bit systems.
%ifndef CHROMIUM
; x86_32 doesn't require PIC.
; Some distros prefer shared objects to be PIC, but nothing breaks if
; the code contains a few textrels, so we'll skip that complexity.
    %undef PIC
%endif
%endif
%ifdef PIC
    default rel
%endif

; Macros to eliminate most code duplication between x86_32 and x86_64:
; Currently this works only for leaf functions which load all their arguments
; into registers at the start, and make no other use of the stack. Luckily that
; covers most of x264's asm.

; PROLOGUE:
; %1 = number of arguments. loads them from stack if needed.
; %2 = number of registers used. pushes callee-saved regs if needed.
; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
; %4 = list of names to define to registers
; PROLOGUE can also be invoked by adding the same options to cglobal

; e.g.
; cglobal foo, 2,3,0, dst, src, tmp
; declares a function (foo), taking two args (dst and src) and one local variable (tmp)

; TODO Some functions can use some args directly from the stack. If they're the
; last args then you can just not declare them, but if they're in the middle
; we'd need a more flexible macro.

; RET:
; Pops anything that was pushed by PROLOGUE

; REP_RET:
; Same, but if it doesn't pop anything it becomes a 2-byte ret, for Athlons,
; which are slow when a normal ret follows a branch.

; registers:
; rN and rNq are the native-size register holding function argument N
; rNd, rNw, rNb are dword, word, and byte size
; rNm is the original location of arg N (a register or on the stack), dword
; rNmp is native size
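
; For example (an illustrative sketch, not a definition made by this file):
; after "cglobal foo, 2,2, dst, src" on x86_64, r0/r0q, r0d, r0w, r0b all name
; the register holding arg 0, and the DEFINE_ARGS aliases refer to the same
; registers:
;   mov r0d, r1d    ; 32-bit move between args 0 and 1
;   mov dstq, srcq  ; the same move, using the argument names
;   mov r0d, r0m    ; reload arg 0 from its original location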

%macro DECLARE_REG 6
    %define r%1q %2
    %define r%1d %3
    %define r%1w %4
    %define r%1b %5
    %define r%1m %6
    %ifid %6 ; i.e. it's a register
        %define r%1mp %2
    %elifdef ARCH_X86_64 ; memory
        %define r%1mp qword %6
    %else
        %define r%1mp dword %6
    %endif
    %define r%1  %2
%endmacro

%macro DECLARE_REG_SIZE 2
    %define r%1q r%1
    %define e%1q r%1
    %define r%1d e%1
    %define e%1d e%1
    %define r%1w %1
    %define e%1w %1
    %define r%1b %2
    %define e%1b %2
%ifndef ARCH_X86_64
    %define r%1  e%1
%endif
%endmacro

DECLARE_REG_SIZE ax, al
DECLARE_REG_SIZE bx, bl
DECLARE_REG_SIZE cx, cl
DECLARE_REG_SIZE dx, dl
DECLARE_REG_SIZE si, sil
DECLARE_REG_SIZE di, dil
DECLARE_REG_SIZE bp, bpl

; t# defines for when per-arch register allocation is more complex than just function arguments

%macro DECLARE_REG_TMP 1-*
    %assign %%i 0
    %rep %0
        CAT_XDEFINE t, %%i, r%1
        %assign %%i %%i+1
        %rotate 1
    %endrep
%endmacro
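
; For instance (a hypothetical invocation, not made by this file):
;   DECLARE_REG_TMP 2,0,1
; defines t0=r2, t1=r0, t2=r1, so per-arch code can write "mov t0d, 5"
; without caring which argument registers those happen to be.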

%macro DECLARE_REG_TMP_SIZE 0-*
    %rep %0
        %define t%1q t%1 %+ q
        %define t%1d t%1 %+ d
        %define t%1w t%1 %+ w
        %define t%1b t%1 %+ b
        %rotate 1
    %endrep
%endmacro

DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9

%ifdef ARCH_X86_64
    %define gprsize 8
%else
    %define gprsize 4
%endif

%macro PUSH 1
    push %1
    %assign stack_offset stack_offset+gprsize
%endmacro

%macro POP 1
    pop %1
    %assign stack_offset stack_offset-gprsize
%endmacro

%macro SUB 2
    sub %1, %2
    %ifidn %1, rsp
        %assign stack_offset stack_offset+(%2)
    %endif
%endmacro

%macro ADD 2
    add %1, %2
    %ifidn %1, rsp
        %assign stack_offset stack_offset-(%2)
    %endif
%endmacro

%macro movifnidn 2
    %ifnidn %1, %2
        mov %1, %2
    %endif
%endmacro
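
; e.g. (illustrative only): "movifnidn r0, r0mp" emits nothing on ABIs where
; arg 0 already arrives in a register, and assembles to a mov from the stack
; on ABIs where it doesn't, so a single code path covers both.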

%macro movsxdifnidn 2
    %ifnidn %1, %2
        movsxd %1, %2
    %endif
%endmacro

%macro ASSERT 1
    %if (%1) == 0
        %error assert failed
    %endif
%endmacro

%macro DEFINE_ARGS 0-*
    %ifdef n_arg_names
        %assign %%i 0
        %rep n_arg_names
            CAT_UNDEF arg_name %+ %%i, q
            CAT_UNDEF arg_name %+ %%i, d
            CAT_UNDEF arg_name %+ %%i, w
            CAT_UNDEF arg_name %+ %%i, b
            CAT_UNDEF arg_name %+ %%i, m
            CAT_UNDEF arg_name, %%i
            %assign %%i %%i+1
        %endrep
    %endif

    %assign %%i 0
    %rep %0
        %xdefine %1q r %+ %%i %+ q
        %xdefine %1d r %+ %%i %+ d
        %xdefine %1w r %+ %%i %+ w
        %xdefine %1b r %+ %%i %+ b
        %xdefine %1m r %+ %%i %+ m
        CAT_XDEFINE arg_name, %%i, %1
        %assign %%i %%i+1
        %rotate 1
    %endrep
    %assign n_arg_names %%i
%endmacro
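
; As a sketch (argument names invented for illustration):
;   DEFINE_ARGS dst, src, len
; makes dstq/dstd/dstw/dstb/dstm aliases of the r0 family, src* of r1 and
; len* of r2; a later DEFINE_ARGS in the same function first undefines the
; old names.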

%ifdef WIN64 ; Windows x64 ;=================================================

DECLARE_REG 0, rcx, ecx, cx,  cl,  ecx
DECLARE_REG 1, rdx, edx, dx,  dl,  edx
DECLARE_REG 2, r8,  r8d, r8w, r8b, r8d
DECLARE_REG 3, r9,  r9d, r9w, r9b, r9d
DECLARE_REG 4, rdi, edi, di,  dil, [rsp + stack_offset + 40]
DECLARE_REG 5, rsi, esi, si,  sil, [rsp + stack_offset + 48]
DECLARE_REG 6, rax, eax, ax,  al,  [rsp + stack_offset + 56]
%define r7m [rsp + stack_offset + 64]
%define r8m [rsp + stack_offset + 72]

%macro LOAD_IF_USED 2 ; reg_id, number_of_args
    %if %1 < %2
        mov r%1, [rsp + stack_offset + 8 + %1*8]
    %endif
%endmacro

%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names...
    ASSERT %2 >= %1
    %assign regs_used %2
    ASSERT regs_used <= 7
    %if regs_used > 4
        push r4
        push r5
        %assign stack_offset stack_offset+16
    %endif
    WIN64_SPILL_XMM %3
    LOAD_IF_USED 4, %1
    LOAD_IF_USED 5, %1
    LOAD_IF_USED 6, %1
    DEFINE_ARGS %4
%endmacro

%macro WIN64_SPILL_XMM 1
    %assign xmm_regs_used %1
    ASSERT xmm_regs_used <= 16
    %if xmm_regs_used > 6
        sub rsp, (xmm_regs_used-6)*16+16
        %assign stack_offset stack_offset+(xmm_regs_used-6)*16+16
        %assign %%i xmm_regs_used
        %rep (xmm_regs_used-6)
            %assign %%i %%i-1
            movdqa [rsp + (%%i-6)*16+8], xmm %+ %%i
        %endrep
    %endif
%endmacro

%macro WIN64_RESTORE_XMM_INTERNAL 1
    %if xmm_regs_used > 6
        %assign %%i xmm_regs_used
        %rep (xmm_regs_used-6)
            %assign %%i %%i-1
            movdqa xmm %+ %%i, [%1 + (%%i-6)*16+8]
        %endrep
        add %1, (xmm_regs_used-6)*16+16
    %endif
%endmacro

%macro WIN64_RESTORE_XMM 1
    WIN64_RESTORE_XMM_INTERNAL %1
    %if xmm_regs_used > 6
        ; undo the full stack_offset adjustment made by WIN64_SPILL_XMM
        %assign stack_offset stack_offset-((xmm_regs_used-6)*16+16)
    %endif
    %assign xmm_regs_used 0
%endmacro

%macro RET 0
    WIN64_RESTORE_XMM_INTERNAL rsp
    %if regs_used > 4
        pop r5
        pop r4
    %endif
    ret
%endmacro

%macro REP_RET 0
    %if regs_used > 4 || xmm_regs_used > 6
        RET
    %else
        rep ret
    %endif
%endmacro

%elifdef ARCH_X86_64 ; *nix x64 ;=============================================

DECLARE_REG 0, rdi, edi, di,  dil, edi
DECLARE_REG 1, rsi, esi, si,  sil, esi
DECLARE_REG 2, rdx, edx, dx,  dl,  edx
DECLARE_REG 3, rcx, ecx, cx,  cl,  ecx
DECLARE_REG 4, r8,  r8d, r8w, r8b, r8d
DECLARE_REG 5, r9,  r9d, r9w, r9b, r9d
DECLARE_REG 6, rax, eax, ax,  al,  [rsp + stack_offset + 8]
%define r7m [rsp + stack_offset + 16]
%define r8m [rsp + stack_offset + 24]

%macro LOAD_IF_USED 2 ; reg_id, number_of_args
    %if %1 < %2
        mov r%1, [rsp - 40 + %1*8]
    %endif
%endmacro

%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
    ASSERT %2 >= %1
    ASSERT %2 <= 7
    LOAD_IF_USED 6, %1
    DEFINE_ARGS %4
%endmacro

%macro RET 0
    ret
%endmacro

%macro REP_RET 0
    rep ret
%endmacro

%else ; X86_32 ;==============================================================

; Begin chromium edits
%ifdef CHROMIUM
; Change the order of registers so we can access the low 8 bits of the 5th and
; 6th arguments (esi/edi have no byte subregisters, so give them args 0 and 1).
DECLARE_REG 0, esi, esi, si, null, [esp + stack_offset + 4]
DECLARE_REG 1, edi, edi, di, null, [esp + stack_offset + 8]
DECLARE_REG 2, ecx, ecx, cx, cl,   [esp + stack_offset + 12]
DECLARE_REG 3, edx, edx, dx, dl,   [esp + stack_offset + 16]
DECLARE_REG 4, eax, eax, ax, al,   [esp + stack_offset + 20]
DECLARE_REG 5, ebx, ebx, bx, bl,   [esp + stack_offset + 24]
%else
DECLARE_REG 0, eax, eax, ax, al,   [esp + stack_offset + 4]
DECLARE_REG 1, ecx, ecx, cx, cl,   [esp + stack_offset + 8]
DECLARE_REG 2, edx, edx, dx, dl,   [esp + stack_offset + 12]
DECLARE_REG 3, ebx, ebx, bx, bl,   [esp + stack_offset + 16]
DECLARE_REG 4, esi, esi, si, null, [esp + stack_offset + 20]
DECLARE_REG 5, edi, edi, di, null, [esp + stack_offset + 24]
%endif
; End chromium edits
DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
%define r7m [esp + stack_offset + 32]
%define r8m [esp + stack_offset + 36]
%define rsp esp

%macro PUSH_IF_USED 1 ; reg_id
    %if %1 < regs_used
        push r%1
        %assign stack_offset stack_offset+4
    %endif
%endmacro

%macro POP_IF_USED 1 ; reg_id
    %if %1 < regs_used
        pop r%1
    %endif
%endmacro

%macro LOAD_IF_USED 2 ; reg_id, number_of_args
    %if %1 < %2
        mov r%1, [esp + stack_offset + 4 + %1*4]
    %endif
%endmacro

%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
    ASSERT %2 >= %1
    %assign regs_used %2
    ASSERT regs_used <= 7
%ifdef CHROMIUM
    PUSH_IF_USED 0
    PUSH_IF_USED 1
    PUSH_IF_USED 5
%else
    PUSH_IF_USED 3
    PUSH_IF_USED 4
    PUSH_IF_USED 5
%endif
    PUSH_IF_USED 6
    LOAD_IF_USED 0, %1
    LOAD_IF_USED 1, %1
    LOAD_IF_USED 2, %1
    LOAD_IF_USED 3, %1
    LOAD_IF_USED 4, %1
    LOAD_IF_USED 5, %1
    LOAD_IF_USED 6, %1
    DEFINE_ARGS %4
%endmacro

%macro RET 0
    POP_IF_USED 6
%ifdef CHROMIUM
    POP_IF_USED 5
    POP_IF_USED 1
    POP_IF_USED 0
%else
    POP_IF_USED 5
    POP_IF_USED 4
    POP_IF_USED 3
%endif
    ret
%endmacro

%macro REP_RET 0
%ifdef CHROMIUM
    ; with the Chromium register order, callee-saved r0/r1 are pushed whenever
    ; they are used, so a bare rep ret is only safe when nothing was pushed
    %if regs_used > 0
        RET
    %else
        rep ret
    %endif
%else
    %if regs_used > 3
        RET
    %else
        rep ret
    %endif
%endif
%endmacro

%endif ;======================================================================

%ifndef WIN64
%macro WIN64_SPILL_XMM 1
%endmacro
%macro WIN64_RESTORE_XMM 1
%endmacro
%endif



;=============================================================================
; arch-independent part
;=============================================================================

%assign function_align 16

; Symbol prefix for C linkage
%macro cglobal 1-2+
    %xdefine %1 mangle(program_name %+ _ %+ %1)
    %xdefine %1.skip_prologue %1 %+ .skip_prologue
    %ifidn __OUTPUT_FORMAT__,elf
        global %1:function hidden
    %else
        global %1
    %endif
    align function_align
    %1:
    RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
    %assign stack_offset 0
    %if %0 > 1
        PROLOGUE %2
    %endif
%endmacro
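
; A minimal sketch of the resulting pattern (function and argument names are
; invented for illustration, not part of this file):
;   INIT_XMM
;   cglobal my_copy16, 2,2,1, dst, src  ; 2 args, 2 gprs, 1 xmm reg
;       mova  m0, [srcq]
;       mova  [dstq], m0
;       RET
; On win64 the args arrive in rcx/rdx, on linux64 in rdi/rsi, and on x86_32
; they are loaded from the stack by PROLOGUE; the body stays identical.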

%macro cextern 1
    %xdefine %1 mangle(program_name %+ _ %+ %1)
    extern %1
%endmacro

; like cextern, but without the prefix
%macro cextern_naked 1
    %xdefine %1 mangle(%1)
    extern %1
%endmacro

%macro const 2+
    %xdefine %1 mangle(program_name %+ _ %+ %1)
    global %1
    %1: %2
%endmacro

; This is needed for ELF, otherwise the GNU linker assumes the stack is
; executable by default.
%ifidn __OUTPUT_FORMAT__,elf
SECTION .note.GNU-stack noalloc noexec nowrite progbits
%endif

; merge mmx and sse*

%macro CAT_XDEFINE 3
    %xdefine %1%2 %3
%endmacro

%macro CAT_UNDEF 2
    %undef %1%2
%endmacro

%macro INIT_MMX 0
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_MMX
    %define mmsize 8
    %define num_mmregs 8
    %define mova movq
    %define movu movq
    %define movh movd
    %define movnta movntq
    %assign %%i 0
    %rep 8
    CAT_XDEFINE m, %%i, mm %+ %%i
    CAT_XDEFINE nmm, %%i, %%i
    %assign %%i %%i+1
    %endrep
    ; clear the m8..m15 mappings a previous INIT_XMM/INIT_YMM may have left
    ; behind on x86_64 (only m0..m7 exist for mmx)
    %rep 8
    CAT_UNDEF m, %%i
    CAT_UNDEF nmm, %%i
    %assign %%i %%i+1
    %endrep
%endmacro

%macro INIT_XMM 0
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_XMM
    %define mmsize 16
    %define num_mmregs 8
    %ifdef ARCH_X86_64
    %define num_mmregs 16
    %endif
    %define mova movdqa
    %define movu movdqu
    %define movh movq
    %define movnta movntdq
    %assign %%i 0
    %rep num_mmregs
    CAT_XDEFINE m, %%i, xmm %+ %%i
    CAT_XDEFINE nxmm, %%i, %%i
    %assign %%i %%i+1
    %endrep
%endmacro

%macro INIT_AVX 0
    INIT_XMM
    %assign avx_enabled 1
    %define PALIGNR PALIGNR_SSSE3
    %define RESET_MM_PERMUTATION INIT_AVX
%endmacro

%macro INIT_YMM 0
    %assign avx_enabled 1
    %define RESET_MM_PERMUTATION INIT_YMM
    %define mmsize 32
    %define num_mmregs 8
    %ifdef ARCH_X86_64
    %define num_mmregs 16
    %endif
    %define mova vmovaps
    %define movu vmovups
    %assign %%i 0
    %rep num_mmregs
    CAT_XDEFINE m, %%i, ymm %+ %%i
    CAT_XDEFINE nymm, %%i, %%i
    %assign %%i %%i+1
    %endrep
%endmacro

INIT_MMX

; I often want to use macros that permute their arguments. e.g. there's no
; efficient way to implement butterfly or transpose or dct without swapping some
; arguments.
;
; I would like to not have to manually keep track of the permutations:
; If I insert a permutation in the middle of a function, it should automatically
; change everything that follows. For more complex macros I may also have multiple
; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
;
; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
; permutes its arguments. It's equivalent to exchanging the contents of the
; registers, except that this way you exchange the register names instead, so it
; doesn't cost any cycles.

%macro PERMUTE 2-* ; takes a list of pairs to swap
%rep %0/2
    %xdefine tmp%2 m%2
    %xdefine ntmp%2 nm%2
    %rotate 2
%endrep
%rep %0/2
    %xdefine m%1 tmp%2
    %xdefine nm%1 ntmp%2
    %undef tmp%2
    %undef ntmp%2
    %rotate 2
%endrep
%endmacro

%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs)
%rep %0-1
%ifdef m%1
    %xdefine tmp m%1
    %xdefine m%1 m%2
    %xdefine m%2 tmp
    CAT_XDEFINE n, m%1, %1
    CAT_XDEFINE n, m%2, %2
%else
    ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here.
    ; Be careful using this mode in nested macros though, as in some cases there may be
    ; other copies of m# that have already been dereferenced and don't get updated correctly.
    %xdefine %%n1 n %+ %1
    %xdefine %%n2 n %+ %2
    %xdefine tmp m %+ %%n1
    CAT_XDEFINE m, %%n1, m %+ %%n2
    CAT_XDEFINE m, %%n2, tmp
    CAT_XDEFINE n, m %+ %%n1, %%n1
    CAT_XDEFINE n, m %+ %%n2, %%n2
%endif
    %undef tmp
    %rotate 1
%endrep
%endmacro
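
; For example (illustrative): after "SWAP 0, 1", the name m0 refers to the
; hardware register previously called m1 and vice versa, so a following
; "mova [r0], m0" stores what earlier code computed into m1, without
; spending an actual move instruction.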

; If SAVE_MM_PERMUTATION is placed at the end of a function and given the
; function name, then any later calls to that function will automatically
; load the permutation, so values can be returned in mmregs.
%macro SAVE_MM_PERMUTATION 1 ; name to save as
    %assign %%i 0
    %rep num_mmregs
    CAT_XDEFINE %1_m, %%i, m %+ %%i
    %assign %%i %%i+1
    %endrep
%endmacro

%macro LOAD_MM_PERMUTATION 1 ; name to load from
    %assign %%i 0
    %rep num_mmregs
    CAT_XDEFINE m, %%i, %1_m %+ %%i
    CAT_XDEFINE n, m %+ %%i, %%i
    %assign %%i %%i+1
    %endrep
%endmacro

%macro call 1
    call %1
    %ifdef %1_m0
        LOAD_MM_PERMUTATION %1
    %endif
%endmacro
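
; A sketch of the mechanism (function name invented for illustration):
;   cglobal helper, 0,0
;       SWAP 0, 2                    ; result ends up under the name m0
;       SAVE_MM_PERMUTATION helper
;       RET
; A later "call helper" (via the call macro above) sees helper_m0 defined,
; runs LOAD_MM_PERMUTATION, and so the caller's m0 names the same hardware
; register the callee left its result in.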

; Substitutions that reduce instruction size but are functionally equivalent
%macro add 2
    %ifnum %2
        %if %2==128
            sub %1, -128
        %else
            add %1, %2
        %endif
    %else
        add %1, %2
    %endif
%endmacro

%macro sub 2
    %ifnum %2
        %if %2==128
            add %1, -128
        %else
            sub %1, %2
        %endif
    %else
        sub %1, %2
    %endif
%endmacro
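
; Background on why this helps (not from the original file): x86 immediates
; in [-128,127] fit a sign-extended imm8 encoding, while +128 needs a full
; imm32, so e.g. "add rax, 128" (7 bytes) can be encoded as the equivalent
; "sub rax, -128" (4 bytes).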

;=============================================================================
; AVX abstraction layer
;=============================================================================

%assign i 0
%rep 16
    %if i < 8
        CAT_XDEFINE sizeofmm, i, 8
    %endif
    CAT_XDEFINE sizeofxmm, i, 16
    CAT_XDEFINE sizeofymm, i, 32
%assign i i+1
%endrep
%undef i

;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 0 if 3-operand (xmm, xmm, xmm), 1 if 4-operand (xmm, xmm, xmm, imm)
;%4 == number of operands given
;%5+: operands
%macro RUN_AVX_INSTR 6-7+
    %if sizeof%5==32
        v%1 %5, %6, %7
    %else
        %if sizeof%5==8
            %define %%regmov movq
        %elif %2
            %define %%regmov movaps
        %else
            %define %%regmov movdqa
        %endif

        %if %4>=3+%3
            %ifnidn %5, %6
                %if avx_enabled && sizeof%5==16
                    v%1 %5, %6, %7
                %else
                    %%regmov %5, %6
                    %1 %5, %7
                %endif
            %else
                %1 %5, %7
            %endif
        %elif %3
            %1 %5, %6, %7
        %else
            %1 %5, %6
        %endif
    %endif
%endmacro

;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 0 if 3-operand (xmm, xmm, xmm), 1 if 4-operand (xmm, xmm, xmm, imm)
%macro AVX_INSTR 3
    %macro %1 2-8 fnord, fnord, fnord, %1, %2, %3
        %ifidn %3, fnord
            RUN_AVX_INSTR %6, %7, %8, 2, %1, %2
        %elifidn %4, fnord
            RUN_AVX_INSTR %6, %7, %8, 3, %1, %2, %3
        %elifidn %5, fnord
            RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4
        %else
            RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5
        %endif
    %endmacro
%endmacro
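
; For example (an illustrative sketch): after "AVX_INSTR paddw, 0, 0" below,
; writing "paddw m0, m1, m2" emits "vpaddw xmm0, xmm1, xmm2" when avx_enabled,
; and otherwise the SSE2 fallback "movdqa xmm0, xmm1" + "paddw xmm0, xmm2"
; (the copy is skipped when the destination and first source are identical).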

AVX_INSTR addpd, 1, 0
AVX_INSTR addps, 1, 0
AVX_INSTR addsd, 1, 0
AVX_INSTR addss, 1, 0
AVX_INSTR addsubpd, 1, 0
AVX_INSTR addsubps, 1, 0
AVX_INSTR andpd, 1, 0
AVX_INSTR andps, 1, 0
AVX_INSTR andnpd, 1, 0
AVX_INSTR andnps, 1, 0
AVX_INSTR blendpd, 1, 0
AVX_INSTR blendps, 1, 0
AVX_INSTR blendvpd, 1, 0
AVX_INSTR blendvps, 1, 0
AVX_INSTR cmppd, 1, 0
AVX_INSTR cmpps, 1, 0
AVX_INSTR cmpsd, 1, 0
AVX_INSTR cmpss, 1, 0
AVX_INSTR divpd, 1, 0
AVX_INSTR divps, 1, 0
AVX_INSTR divsd, 1, 0
AVX_INSTR divss, 1, 0
AVX_INSTR dppd, 1, 0
AVX_INSTR dpps, 1, 0
AVX_INSTR haddpd, 1, 0
AVX_INSTR haddps, 1, 0
AVX_INSTR hsubpd, 1, 0
AVX_INSTR hsubps, 1, 0
AVX_INSTR maxpd, 1, 0
AVX_INSTR maxps, 1, 0
AVX_INSTR maxsd, 1, 0
AVX_INSTR maxss, 1, 0
AVX_INSTR minpd, 1, 0
AVX_INSTR minps, 1, 0
AVX_INSTR minsd, 1, 0
AVX_INSTR minss, 1, 0
AVX_INSTR mpsadbw, 0, 1
AVX_INSTR mulpd, 1, 0
AVX_INSTR mulps, 1, 0
AVX_INSTR mulsd, 1, 0
AVX_INSTR mulss, 1, 0
AVX_INSTR orpd, 1, 0
AVX_INSTR orps, 1, 0
AVX_INSTR packsswb, 0, 0
AVX_INSTR packssdw, 0, 0
AVX_INSTR packuswb, 0, 0
AVX_INSTR packusdw, 0, 0
AVX_INSTR paddb, 0, 0
AVX_INSTR paddw, 0, 0
AVX_INSTR paddd, 0, 0
AVX_INSTR paddq, 0, 0
AVX_INSTR paddsb, 0, 0
AVX_INSTR paddsw, 0, 0
AVX_INSTR paddusb, 0, 0
AVX_INSTR paddusw, 0, 0
AVX_INSTR palignr, 0, 1
AVX_INSTR pand, 0, 0
AVX_INSTR pandn, 0, 0
AVX_INSTR pavgb, 0, 0
AVX_INSTR pavgw, 0, 0
AVX_INSTR pblendvb, 0, 0
AVX_INSTR pblendw, 0, 1
AVX_INSTR pcmpestri, 0, 0
AVX_INSTR pcmpestrm, 0, 0
AVX_INSTR pcmpistri, 0, 0
AVX_INSTR pcmpistrm, 0, 0
AVX_INSTR pcmpeqb, 0, 0
AVX_INSTR pcmpeqw, 0, 0
AVX_INSTR pcmpeqd, 0, 0
AVX_INSTR pcmpeqq, 0, 0
AVX_INSTR pcmpgtb, 0, 0
AVX_INSTR pcmpgtw, 0, 0
AVX_INSTR pcmpgtd, 0, 0
AVX_INSTR pcmpgtq, 0, 0
AVX_INSTR phaddw, 0, 0
AVX_INSTR phaddd, 0, 0
AVX_INSTR phaddsw, 0, 0
AVX_INSTR phsubw, 0, 0
AVX_INSTR phsubd, 0, 0
AVX_INSTR phsubsw, 0, 0
AVX_INSTR pmaddwd, 0, 0
AVX_INSTR pmaddubsw, 0, 0
AVX_INSTR pmaxsb, 0, 0
AVX_INSTR pmaxsw, 0, 0
AVX_INSTR pmaxsd, 0, 0
AVX_INSTR pmaxub, 0, 0
AVX_INSTR pmaxuw, 0, 0
AVX_INSTR pmaxud, 0, 0
AVX_INSTR pminsb, 0, 0
AVX_INSTR pminsw, 0, 0
AVX_INSTR pminsd, 0, 0
AVX_INSTR pminub, 0, 0
AVX_INSTR pminuw, 0, 0
AVX_INSTR pminud, 0, 0
AVX_INSTR pmulhuw, 0, 0
AVX_INSTR pmulhrsw, 0, 0
AVX_INSTR pmulhw, 0, 0
AVX_INSTR pmullw, 0, 0
AVX_INSTR pmulld, 0, 0
AVX_INSTR pmuludq, 0, 0
AVX_INSTR pmuldq, 0, 0
AVX_INSTR por, 0, 0
AVX_INSTR psadbw, 0, 0
AVX_INSTR pshufb, 0, 0
AVX_INSTR psignb, 0, 0
AVX_INSTR psignw, 0, 0
AVX_INSTR psignd, 0, 0
AVX_INSTR psllw, 0, 0
AVX_INSTR pslld, 0, 0
AVX_INSTR psllq, 0, 0
AVX_INSTR pslldq, 0, 0
AVX_INSTR psraw, 0, 0
AVX_INSTR psrad, 0, 0
AVX_INSTR psrlw, 0, 0
AVX_INSTR psrld, 0, 0
AVX_INSTR psrlq, 0, 0
AVX_INSTR psrldq, 0, 0
AVX_INSTR psubb, 0, 0
AVX_INSTR psubw, 0, 0
AVX_INSTR psubd, 0, 0
AVX_INSTR psubq, 0, 0
AVX_INSTR psubsb, 0, 0
AVX_INSTR psubsw, 0, 0
AVX_INSTR psubusb, 0, 0
AVX_INSTR psubusw, 0, 0
AVX_INSTR punpckhbw, 0, 0
AVX_INSTR punpckhwd, 0, 0
AVX_INSTR punpckhdq, 0, 0
AVX_INSTR punpckhqdq, 0, 0
AVX_INSTR punpcklbw, 0, 0
AVX_INSTR punpcklwd, 0, 0
AVX_INSTR punpckldq, 0, 0
AVX_INSTR punpcklqdq, 0, 0
AVX_INSTR pxor, 0, 0
AVX_INSTR shufps, 0, 1
AVX_INSTR subpd, 1, 0
AVX_INSTR subps, 1, 0
AVX_INSTR subsd, 1, 0
AVX_INSTR subss, 1, 0
AVX_INSTR unpckhpd, 1, 0
AVX_INSTR unpckhps, 1, 0
AVX_INSTR unpcklpd, 1, 0
AVX_INSTR unpcklps, 1, 0
AVX_INSTR xorpd, 1, 0
AVX_INSTR xorps, 1, 0

; 3DNow! instructions, for sharing code between AVX, SSE and 3DNow!
AVX_INSTR pfadd, 1, 0
AVX_INSTR pfsub, 1, 0
AVX_INSTR pfmul, 1, 0

;=============================================================================
; Chromium extensions
;=============================================================================

%ifdef CHROMIUM
; Always build PIC code on Mac or Linux for Chromium.
%ifdef MACHO
%define PIC
%endif
%ifdef ELF
%define PIC
%endif

;
; LOAD_SYM %1 (reg), %2 (sym)
; Copies the address of a local symbol to the specified register.
;

%macro LOAD_SYM 2

%ifdef PIC
  ; read our own return address to get the instruction pointer, then add
  ; the displacement from that instruction to the symbol
  call      %%geteip
  add       %1, %2 - $
  jmp       %%end
%%geteip:
  mov       %1, [rsp]
  ret
%%end:

%else
  lea       %1, [%2]
%endif

%endmacro
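
; e.g. (hypothetical symbol name, for illustration):
;   LOAD_SYM r0, my_table
; leaves the address of my_table in r0 whether or not PIC is defined; the
; PIC path derives it from the instruction pointer instead of a textrel.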

;
; MOVq %1 (xmm), %2 (reg)
; MOVq %1 (reg), %2 (xmm)
; Copies a general-purpose register to an XMM register, and vice versa.
;
%macro MOVq 2
%if gprsize == 8
  movq      %1, %2
%else
  movd      %1, %2
%endif
%endmacro
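
; e.g. (illustrative): "MOVq xmm0, r0" selects movq on x86_64 (64-bit GPR)
; and movd on x86_32 (32-bit GPR), so callers don't need their own %ifdef.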

%endif  ; CHROMIUM

%endif  ; THIRD_PARTY_X86INC_X86INC_ASM_