Home | History | Annotate | Download | only in simd
      1 ;
      2 ; jsimdext.inc - common declarations
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman (a] cendio.se> for Cendio AB
      5 ; Copyright 2010 D. R. Commander
      6 ;
      7 ; Based on
      8 ; x86 SIMD extension for IJG JPEG library - version 1.02
      9 ;
     10 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
     11 ;
     12 ; This software is provided 'as-is', without any express or implied
     13 ; warranty.  In no event will the authors be held liable for any damages
     14 ; arising from the use of this software.
     15 ;
     16 ; Permission is granted to anyone to use this software for any purpose,
     17 ; including commercial applications, and to alter it and redistribute it
     18 ; freely, subject to the following restrictions:
     19 ;
     20 ; 1. The origin of this software must not be misrepresented; you must not
     21 ;    claim that you wrote the original software. If you use this software
     22 ;    in a product, an acknowledgment in the product documentation would be
     23 ;    appreciated but is not required.
     24 ; 2. Altered source versions must be plainly marked as such, and must not be
     25 ;    misrepresented as being the original software.
     26 ; 3. This notice may not be removed or altered from any source distribution.
     27 ;
     28 ; [TAB8]
     29 
     30 ; ==========================================================================
     31 ;  System-dependent configurations
        ;
        ;  The object-format symbols WIN32, WIN64, OBJ32, ELF, AOUT and MACHO are
        ;  tested in that order and only the first one that is defined takes
        ;  effect; if none is defined, the final "Other case" branch is used.
        ;  Every branch defines SEG_TEXT and SEG_CONST (a section name followed by
        ;  its attributes).  In addition:
        ;    * WIN64 and ELF define EXTN(name) as the bare name,
        ;    * ELF, AOUT and MACHO define GOT_SYMBOL,
        ;    * MACHO defines PIC unconditionally.
     32 
     33 %ifdef WIN32    ; ----(nasm -fwin32 -DWIN32 ...)--------
     34 ; * Microsoft Visual C++
     35 ; * MinGW (Minimalist GNU for Windows)
     36 ; * CygWin
     37 ; * LCC-Win32
     38 
     39 ; -- segment definition --
     40 ; (under YASM only the alignment is given; otherwise the public/use32/class
        ;  qualifiers are added as well)
     41 %ifdef __YASM_VER__
     42 %define SEG_TEXT    .text  align=16
     43 %define SEG_CONST   .rdata align=16
     44 %else
     45 %define SEG_TEXT    .text  align=16 public use32 class=CODE
     46 %define SEG_CONST   .rdata align=16 public use32 class=CONST
     47 %endif
     48 
     49 %elifdef WIN64  ; ----(nasm -fwin64 -DWIN64 ...)--------
     50 ; * Microsoft Visual C++
     51 
     52 ; -- segment definition --
     53 ; (same YASM / non-YASM split as WIN32, with use64 instead of use32)
     54 %ifdef __YASM_VER__
     55 %define SEG_TEXT    .text  align=16
     56 %define SEG_CONST   .rdata align=16
     57 %else
     58 %define SEG_TEXT    .text  align=16 public use64 class=CODE
     59 %define SEG_CONST   .rdata align=16 public use64 class=CONST
     60 %endif
     61 %define EXTN(name)  name                        ; foo() -> foo
     62 
     63 %elifdef OBJ32  ; ----(nasm -fobj -DOBJ32 ...)----------
     64 ; * Borland C++ (Win32)
     65 
     66 ; -- segment definition --
     67 ; (constants go to a section named _data with class=DATA in this format)
     68 %define SEG_TEXT    _text  align=16 public use32 class=CODE
     69 %define SEG_CONST   _data  align=16 public use32 class=DATA
     70 
     71 %elifdef ELF    ; ----(nasm -felf[64] -DELF ...)------------
     72 ; * Linux
     73 ; * *BSD family Unix using elf format
     74 ; * Unix System V, including Solaris x86, UnixWare and SCO Unix
     75 
     76 ; mark stack as non-executable
        ; (this is a real section directive, emitted as soon as the file is included)
     77 section .note.GNU-stack noalloc noexec nowrite progbits
     78 
     79 ; -- segment definition --
     80 ; (the x86_64 variant omits the explicit alloc/exec/nowrite flags)
     81 %ifdef __x86_64__
     82 %define SEG_TEXT    .text   progbits align=16
     83 %define SEG_CONST   .rodata progbits align=16
     84 %else
     85 %define SEG_TEXT    .text   progbits alloc exec   nowrite align=16
     86 %define SEG_CONST   .rodata progbits alloc noexec nowrite align=16
     87 %endif
     88 
     89 ; To make the code position-independent, append -DPIC to the commandline
     90 ;
     91 %define GOT_SYMBOL  _GLOBAL_OFFSET_TABLE_       ; ELF supports PIC
     92 %define EXTN(name)  name                        ; foo() -> foo
     93 
     94 %elifdef AOUT   ; ----(nasm -faoutb/aout -DAOUT ...)----
     95 ; * Older Linux using a.out format  (nasm -f aout -DAOUT ...)
     96 ; * *BSD family Unix using a.out format  (nasm -f aoutb -DAOUT ...)
     97 
     98 ; -- segment definition --
     99 ; (no attributes are given; constants share the .data section)
    100 %define SEG_TEXT    .text
    101 %define SEG_CONST   .data
    102 
    103 ; To make the code position-independent, append -DPIC to the commandline
    104 ;
    105 %define GOT_SYMBOL  __GLOBAL_OFFSET_TABLE_      ; BSD-style a.out supports PIC
    106 
    107 %elifdef MACHO  ; ----(nasm -fmacho -DMACHO ...)--------
    108 ; * NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format)
    109 
    110 ; -- segment definition --
    111 ; (the align=16 attribute is deliberately commented out for .text only)
    112 %define SEG_TEXT    .text  ;align=16    ; nasm doesn't accept align=16. why?
    113 %define SEG_CONST   .rodata align=16
    114 
    115 ; The generation of position-independent code (PIC) is the default on Darwin.
    116 ; PIC is therefore forced on here, regardless of the command line.
    117 %define PIC
    118 %define GOT_SYMBOL  _MACHO_PIC_         ; Mach-O style code-relative addressing
    119 
    120 %else           ; ----(Other case)----------------------
    121 
    122 ; -- segment definition --
    123 ; (fallback: plain section names, no attributes, no GOT_SYMBOL, no EXTN)
    124 %define SEG_TEXT    .text
    125 %define SEG_CONST   .data
    126 
    127 %endif  ; ----------------------------------------------
    128 
    129 ; ==========================================================================
    130 
    131 ; --------------------------------------------------------------------------
    132 ;  Common types: primitive sizes first, then the named types built on them
    133 ;
    134 %define SIZEOF_BYTE             1               ; bytes in a BYTE
    135 %define SIZEOF_WORD             2               ; bytes in a WORD
    136 %define SIZEOF_DWORD            4               ; bytes in a DWORD
    137 %define SIZEOF_QWORD            8               ; bytes in a QWORD
    138 %define SIZEOF_OWORD            16              ; bytes in an OWORD
    139 
    140 %define BYTE_BIT                8               ; bits per byte (CHAR_BIT in C)
    141 %define WORD_BIT                16              ; SIZEOF_WORD  * BYTE_BIT
    142 %define DWORD_BIT               32              ; SIZEOF_DWORD * BYTE_BIT
    143 %define QWORD_BIT               64              ; SIZEOF_QWORD * BYTE_BIT
    144 %define OWORD_BIT               128             ; SIZEOF_OWORD * BYTE_BIT
    145 
    146 %ifndef __x86_64__
    147 %define POINTER                 dword           ; pointers are 32-bit here
    148 %define SIZEOF_POINTER          SIZEOF_DWORD    ; bytes in a POINTER
    149 %define POINTER_BIT             DWORD_BIT       ; bits in a POINTER
    150 %else
    151 %define POINTER                 qword           ; pointers are 64-bit here
    152 %define SIZEOF_POINTER          SIZEOF_QWORD    ; bytes in a POINTER
    153 %define POINTER_BIT             QWORD_BIT       ; bits in a POINTER
    154 %endif
    155 
    156 %define INT                     dword           ; C int (signed)
    157 %define SIZEOF_INT              SIZEOF_DWORD    ; bytes in an INT
    158 %define INT_BIT                 DWORD_BIT       ; bits in an INT
    159 
    160 %define FP32                    dword           ; single-precision IEEE754 float
    161 %define SIZEOF_FP32             SIZEOF_DWORD    ; bytes in an FP32
    162 %define FP32_BIT                DWORD_BIT       ; bits in an FP32
    163 
    164 %define MMWORD                  qword           ; 64-bit operand (MMX register)
    165 %define SIZEOF_MMWORD           SIZEOF_QWORD    ; bytes in an MMWORD
    166 %define MMWORD_BIT              QWORD_BIT       ; bits in an MMWORD
    167 
    168 ; XMMWORD is deliberately defined as nothing: NASM is buggy and mishandles
    169 ; operand-size keywords on SSE instructions, so none is emitted for now.
    170 %define XMMWORD                                 ; 128-bit operand (SSE register)
    171 %define SIZEOF_XMMWORD          SIZEOF_OWORD    ; bytes in an XMMWORD
    172 %define XMMWORD_BIT             OWORD_BIT       ; bits in an XMMWORD
    173 
    174 ; Same workaround for dword / MMWORD loads into an xmm# register: both empty
    175 %define XMM_DWORD
    176 %define XMM_MMWORD
    177 
    178 ; --------------------------------------------------------------------------
    179 ;  External Symbol Name
    180 ;
    181 %ifndef EXTN
    182 ; Android Modification:
    183 ; The unmodified code from upstream prepends an underscore to
    184 ; "name" here.  It is unclear why.  Before removing the underscore, the
    185 ; code failed to link because the function names in the SIMD code did not
    186 ; match the callers (because of the extra underscore).  This fix only
    187 ; applies to x86 SIMD code.  x86_64 is handled properly by the code above.
        ;
        ; This fallback is used by every configuration that has not already
        ; defined EXTN (in this file: WIN32, OBJ32, AOUT, MACHO and the
        ; "Other case" branch).  It maps a C symbol to the identical assembler
        ; symbol.
        ; NOTE(review): some of those object formats conventionally decorate C
        ; symbols with a leading '_' -- confirm this fallback is only reached on
        ; targets where the undecorated name is what the C callers reference.
    188 %define EXTN(name)  name                        ; foo() -> foo
    189 %endif
    190 
    191 ; --------------------------------------------------------------------------
    192 ;  Macros for position-independent code (PIC) support
    193 ;
        ;  Defined here, in every configuration:
        ;    GOTOFF(got,sym)  expression used to address `sym` via base `got`
        ;    get_GOT reg      load the PIC base into `reg`
        ;    pushpic reg      push `reg`        (expands to nothing when !PIC)
        ;    poppic  reg      pop  `reg`        (expands to nothing when !PIC)
        ;    movpic  dst,src  mov  `dst`,`src`  (expands to nothing when !PIC)
        ;
        ;  PIC is silently dropped when the selected object format did not
        ;  define GOT_SYMBOL.
    194 %ifndef GOT_SYMBOL
    195 %undef PIC
    196 %endif
    197 
    198 %ifdef PIC ; -------------------------------------------
    199 
    200 %ifidn GOT_SYMBOL,_MACHO_PIC_ ; --------------------
    201 
    202 ; At present, nasm doesn't seem to support PIC generation for Mach-O.
    203 ; The PIC support code below is a little tricky.
        ;
        ; Instead of a GOT, the start of the constant section (const_base) is
        ; used as the base address: get_GOT computes the run-time address of
        ; const_base, and GOTOFF(got,sym) adds the link-time distance
        ; (sym - const_base) to it.
        ; Note that including this file in this configuration switches to
        ; SEG_CONST and emits the const_base label right here.
    204 
    205         SECTION SEG_CONST
    206 const_base:
    207 
    208 %define GOTOFF(got,sym) (got) + (sym) - const_base
    209 
        ; get_GOT reg
        ;   %1 = register that receives the run-time address of const_base.
        ;   Sequence: the call/ret pair loads ecx with the address of the
        ;   `add` instruction, and the `add` turns that into the address of
        ;   %%ref.  The two db bytes followed by the bytes of `jmp near
        ;   const_base` are executed as ONE instruction,
        ;       lea reg32, [ecx + ebp*8 + (const_base - %%ref)]
        ;   where the jmp opcode byte (E9) serves as the SIB byte and the
        ;   jmp's 32-bit relative offset serves as the displacement.  ebp is
        ;   zeroed first so the index term vanishes; it is saved and restored
        ;   with push/pop around the sequence.
    210 %imacro get_GOT 1
    211         ; NOTE: this macro destroys ecx register.
    212         call    %%geteip
    213         add     ecx, byte (%%ref - $)
    214         jmp     short %%adjust
    215 %%geteip:
    216         mov     ecx, POINTER [esp]
    217         ret
    218 %%adjust:
    219         push    ebp
    220         xor     ebp,ebp         ; ebp = 0
    221 %ifidni %1,ebx  ; (%1 == ebx)
    222         ; db 0x8D,0x9C + jmp near const_base =
    223         ;   lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32)
    224         db      0x8D,0x9C               ; 8D,9C
    225         jmp     near const_base         ; E9,(const_base-%%ref)
    226 %%ref:
    227 %else  ; (%1 != ebx)
    228         ; db 0x8D,0x8C + jmp near const_base =
    229         ;   lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32)
    230         db      0x8D,0x8C               ; 8D,8C
    231         jmp     near const_base         ; E9,(const_base-%%ref)
    232 %%ref:  mov     %1, ecx
    233 %endif ; (%1 == ebx)
    234         pop     ebp
    235 %endmacro
    236 
    237 %else   ; GOT_SYMBOL != _MACHO_PIC_ ----------------
    238 
        ; GOTOFF(got,sym): address of `sym` expressed relative to the GOT base
        ; held in `got`, using the assembler's "wrt ..gotoff" relocation.
    239 %define GOTOFF(got,sym) (got) + (sym) wrt ..gotoff
    240 
        ; get_GOT reg
        ;   %1 = register that receives the PIC base.  The call/ret pair
        ;   loads %1 with the address of the `add` instruction (the call's
        ;   return address); the `add` then applies a "wrt ..gotpc" expression
        ;   based on GOT_SYMBOL.
        ;   NOTE(review): presumably this leaves the address of the global
        ;   offset table in %1 -- confirm against the assembler's PIC docs.
    241 %imacro get_GOT 1
    242         extern  GOT_SYMBOL
    243         call    %%geteip
    244         add     %1, GOT_SYMBOL + $$ - $ wrt ..gotpc
    245         jmp     short %%done
    246 %%geteip:
    247         mov     %1, POINTER [esp]
    248         ret
    249 %%done:
    250 %endmacro
    251 
    252 %endif  ; GOT_SYMBOL == _MACHO_PIC_ ----------------
    253 
        ; PIC builds: these three helpers emit a real push / pop / mov.
    254 %imacro pushpic 1.nolist
    255         push    %1
    256 %endmacro
    257 %imacro poppic  1.nolist
    258         pop     %1
    259 %endmacro
    260 %imacro movpic  2.nolist
    261         mov     %1,%2
    262 %endmacro
    263 
    264 %else   ; !PIC -----------------------------------------
    265 
        ; Non-PIC builds: GOTOFF ignores its base argument and yields the bare
        ; symbol; get_GOT and the three helpers expand to nothing.
    266 %define GOTOFF(got,sym) (sym)
    267 
    268 %imacro get_GOT 1.nolist
    269 %endmacro
    270 %imacro pushpic 1.nolist
    271 %endmacro
    272 %imacro poppic  1.nolist
    273 %endmacro
    274 %imacro movpic  2.nolist
    275 %endmacro
    276 
    277 %endif  ;  PIC -----------------------------------------
    278 
    279 ; --------------------------------------------------------------------------
    280 ;  Align the next instruction on {2,4,8,16,..}-byte boundary.
    281 ;  ".balign n,,m" in GNU as
    282 ;
        ;  MSKLE(x,y)  "mask if x <= y" on the low 16 bits of both operands:
        ;              evaluates to 0 when x > y, otherwise to a mask with the
        ;              low bits all set (this relies on '>>' being an unsigned
        ;              shift in the assembler).  It is AND-ed with a `times`
        ;              count to switch that line on or off.
        ;  FILLB(b,n)  number of bytes between position b and the next
        ;              multiple of n, measured from the section start ($$);
        ;              n must be a power of two.
        ;
        ;  alignx n [,m]
        ;    %1 = alignment in bytes.
        ;    %2 = optional upper bound on the padding (default 0xFFFF): when
        ;         more than %2 bytes would be needed, nothing is emitted.
        ;    If 16 or more bytes are needed they are all single-byte nops.
        ;    Otherwise each following `times` line re-evaluates the bytes still
        ;    missing at `$` and emits the largest filler that fits (7, 6, 4,
        ;    3, 2, then 1 bytes).
        ;    NOTE(review): the multi-byte fillers are 32-bit encodings that
        ;    write ebx / ebp; in 64-bit code such writes would clear the upper
        ;    halves of rbx / rbp -- confirm how alignx is used in x86_64
        ;    sources.
    283 %define MSKLE(x,y)  (~(((y) & 0xFFFF) - ((x) & 0xFFFF)) >> 16)
    284 %define FILLB(b,n)  (($$-(b)) & ((n)-1))
    285 
    286 %imacro alignx 1-2.nolist 0xFFFF
    287 %%bs:   times MSKLE(FILLB(%%bs,%1),%2) & MSKLE(16,FILLB($,%1)) & FILLB($,%1) \
    288                db 0x90                               ; nop
    289         times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/9 \
    290                db 0x8D,0x9C,0x23,0x00,0x00,0x00,0x00 ; lea ebx,[ebx+0x00000000]
    291         times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/7 \
    292                db 0x8D,0xAC,0x25,0x00,0x00,0x00,0x00 ; lea ebp,[ebp+0x00000000]
    293         times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/6 \
    294                db 0x8D,0xAD,0x00,0x00,0x00,0x00      ; lea ebp,[ebp+0x00000000]
    295         times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/4 \
    296                db 0x8D,0x6C,0x25,0x00                ; lea ebp,[ebp+0x00]
    297         times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/3 \
    298                db 0x8D,0x6D,0x00                     ; lea ebp,[ebp+0x00]
    299         times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/2 \
    300                db 0x8B,0xED                          ; mov ebp,ebp
    301         times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/1 \
    302                db 0x90                               ; nop
    303 %endmacro
    304 
    305 ; Align the next data on {2,4,8,16,..}-byte boundary.
    306 ; alignz n: %1 = alignment in bytes; the gap is filled with zero bytes.
    307 %imacro alignz 1.nolist
    308         align %1, db 0          ; filling zeros
    309 %endmacro
    310 
    311 %ifdef __x86_64__
        ; collect_args / uncollect_args (x86_64 builds only)
        ;
        ; collect_args copies the incoming arguments into r10..r15 so that
        ; code using it can refer to the same registers in both variants
        ; below; uncollect_args restores, in reverse order, everything
        ; collect_args pushed.  Both macros take no parameters and must be
        ; used as a matched pair.
    312 
    313 %ifndef WIN64
    314 
        ; Non-WIN64 variant: arguments arrive in rdi, rsi, rdx, rcx, r8, r9.
        ; Each destination register is saved immediately before it is
        ; overwritten.
    315 %imacro collect_args 0
    316         push    r10
    317         mov     r10, rdi                ; r10 = 1st argument
    318         push    r11
    319         mov     r11, rsi                ; r11 = 2nd argument
    320         push    r12
    321         mov     r12, rdx                ; r12 = 3rd argument
    322         push    r13
    323         mov     r13, rcx                ; r13 = 4th argument
    324         push    r14
    325         mov     r14, r8                 ; r14 = 5th argument
    326         push    r15
    327         mov     r15, r9                 ; r15 = 6th argument
    328 %endmacro
    329 
    330 %imacro uncollect_args 0
    331         pop     r15                     ; reverse of the pushes above
    332         pop     r14
    333         pop     r13
    334         pop     r12
    335         pop     r11
    336         pop     r10
    337 %endmacro
    338 
    339 %else
    340 
        ; WIN64 variant: the first four arguments arrive in rcx, rdx, r8, r9;
        ; two further values are read from [rax+48] and [rax+56].
        ; NOTE(review): rax is not set inside this macro -- presumably the
        ; invoking code loads it with a stack-pointer value beforehand so
        ; these are the 5th and 6th (stack) arguments; confirm at the call
        ; sites.  movaps also needs rsp to be 16-byte aligned at that point.
        ; Saved and later restored: r12-r15, rsi, rdi, xmm6, xmm7.
    341 %imacro collect_args 0
    342         push    r12
    343         push    r13
    344         push    r14
    345         push    r15
    346         push    rsi
    347         push    rdi
    348         mov     r10, rcx                ; r10 = 1st argument
    349         mov     r11, rdx                ; r11 = 2nd argument
    350         mov     r12, r8                 ; r12 = 3rd argument
    351         mov     r13, r9                 ; r13 = 4th argument
    352         mov     r14, [rax+48]           ; r14 = value at rax+48
    353         mov     r15, [rax+56]           ; r15 = value at rax+56
    354         sub     rsp, SIZEOF_XMMWORD     ; make room for xmm6
    355         movaps  XMMWORD [rsp], xmm6
    356         sub     rsp, SIZEOF_XMMWORD     ; make room for xmm7
    357         movaps  XMMWORD [rsp], xmm7
    358 %endmacro
    359 
    360 %imacro uncollect_args 0
    361         movaps  xmm7, XMMWORD [rsp]     ; xmm7 was stored last
    362         add     rsp, SIZEOF_XMMWORD
    363         movaps  xmm6, XMMWORD [rsp]
    364         add     rsp, SIZEOF_XMMWORD
    365         pop     rdi                     ; reverse of the pushes above
    366         pop     rsi
    367         pop     r15
    368         pop     r14
    369         pop     r13
    370         pop     r12
    371 %endmacro
    372 
    373 %endif
    374 
    375 %endif
    376 
    377 ; --------------------------------------------------------------------------
    378 ;  Defines picked up from the C headers
    379 ;
    380 %include "jsimdcfg.inc"
    381 
    382 ; --------------------------------------------------------------------------
    383