Home | History | Annotate | Download | only in m_syswrap
      1 
      2 /*--------------------------------------------------------------------*/
      3 /*--- Handle system calls.                          syswrap-main.c ---*/
      4 /*--------------------------------------------------------------------*/
      5 
      6 /*
      7    This file is part of Valgrind, a dynamic binary instrumentation
      8    framework.
      9 
     10    Copyright (C) 2000-2013 Julian Seward
     11       jseward (at) acm.org
     12 
     13    This program is free software; you can redistribute it and/or
     14    modify it under the terms of the GNU General Public License as
     15    published by the Free Software Foundation; either version 2 of the
     16    License, or (at your option) any later version.
     17 
     18    This program is distributed in the hope that it will be useful, but
     19    WITHOUT ANY WARRANTY; without even the implied warranty of
     20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     21    General Public License for more details.
     22 
     23    You should have received a copy of the GNU General Public License
     24    along with this program; if not, write to the Free Software
     25    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
     26    02111-1307, USA.
     27 
     28    The GNU General Public License is contained in the file COPYING.
     29 */
     30 
     31 #include "libvex_guest_offsets.h"
     32 #include "libvex_trc_values.h"
     33 #include "pub_core_basics.h"
     34 #include "pub_core_aspacemgr.h"
     35 #include "pub_core_vki.h"
     36 #include "pub_core_vkiscnums.h"
     37 #include "pub_core_libcsetjmp.h"    // to keep _threadstate.h happy
     38 #include "pub_core_threadstate.h"
     39 #include "pub_core_libcbase.h"
     40 #include "pub_core_libcassert.h"
     41 #include "pub_core_libcprint.h"
     42 #include "pub_core_libcproc.h"      // For VG_(getpid)()
     43 #include "pub_core_libcsignal.h"
     44 #include "pub_core_scheduler.h"     // For VG_({acquire,release}_BigLock),
     45                                     //   and VG_(vg_yield)
     46 #include "pub_core_stacktrace.h"    // For VG_(get_and_pp_StackTrace)()
     47 #include "pub_core_tooliface.h"
     48 #include "pub_core_options.h"
     49 #include "pub_core_signals.h"       // For VG_SIGVGKILL, VG_(poll_signals)
     50 #include "pub_core_syscall.h"
     51 #include "pub_core_machine.h"
     52 #include "pub_core_syswrap.h"
     53 
     54 #include "priv_types_n_macros.h"
     55 #include "priv_syswrap-main.h"
     56 
     57 #if defined(VGO_darwin)
     58 #include "priv_syswrap-darwin.h"
     59 #endif
     60 
     61 /* Useful info which needs to be recorded somewhere:
     62    Use of registers in syscalls is:
     63 
     64           NUM   ARG1 ARG2 ARG3 ARG4 ARG5 ARG6 ARG7 ARG8 RESULT
     65    LINUX:
     66    x86    eax   ebx  ecx  edx  esi  edi  ebp  n/a  n/a  eax       (== NUM)
     67    amd64  rax   rdi  rsi  rdx  r10  r8   r9   n/a  n/a  rax       (== NUM)
     68    ppc32  r0    r3   r4   r5   r6   r7   r8   n/a  n/a  r3+CR0.SO (== ARG1)
     69    ppc64  r0    r3   r4   r5   r6   r7   r8   n/a  n/a  r3+CR0.SO (== ARG1)
     70    arm    r7    r0   r1   r2   r3   r4   r5   n/a  n/a  r0        (== ARG1)
     71    mips32 v0    a0   a1   a2   a3 stack stack n/a  n/a  v0        (== NUM)
     72    mips64 v0    a0   a1   a2   a3   a4   a5   a6   a7   v0        (== NUM)
     73    arm64  x8    x0   x1   x2   x3   x4   x5   n/a  n/a  x0 ??     (== ARG1??)
     74 
     75    On s390x the svc instruction is used for system calls. The system call
     76    number is encoded in the instruction (8 bit immediate field). Since Linux
     77    2.6 it is also allowed to use svc 0 with the system call number in r1.
     78    This was introduced for system calls >255, but works for all. It is
     79    also possible to see the svc 0 together with an EXecute instruction, that
     80    fills in the immediate field.
     81    s390x r1/SVC r2   r3   r4   r5   r6   r7   n/a  n/a  r2        (== ARG1)
     82 
     83    DARWIN:
     84    x86    eax +4   +8   +12  +16  +20  +24  +28  +32  edx:eax, eflags.c
     85    amd64  rax rdi  rsi  rdx  rcx  r8   r9   +8   +16  rdx:rax, rflags.c
     86 
     87    For x86-darwin, "+N" denotes "in memory at N(%esp)"; ditto
     88    amd64-darwin.  Apparently 0(%esp) is some kind of return address
     89    (perhaps for syscalls done with "sysenter"?)  I don't think it is
     90    relevant for syscalls done with "int $0x80/1/2".
     91 */
     92 
     93 /* This is the top level of the system-call handler module.  All
     94    system calls are channelled through here, doing two things:
     95 
     96    * notify the tool of the events (mem/reg reads, writes) happening
     97 
     98    * perform the syscall, usually by passing it along to the kernel
     99      unmodified.
    100 
    101    A magical piece of assembly code, do_syscall_for_client_WRK, in
    102    syscall-$PLATFORM.S does the tricky bit of passing a syscall to the
    103    kernel, whilst having the simulator retain control.
    104 */
    105 
    106 /* The main function is VG_(client_syscall).  The simulation calls it
    107    whenever a client thread wants to do a syscall.  The following is a
    108    sketch of what it does.
    109 
    110    * Ensures the root thread's stack is suitably mapped.  Tedious and
    111      arcane.  See big big comment in VG_(client_syscall).
    112 
    113    * First, it rounds up the syscall number and args (which is a
    114      platform dependent activity) and puts them in a struct ("args")
    115      and also a copy in "orig_args".
    116 
    117      The pre/post wrappers refer to these structs and so no longer
    118      need magic macros to access any specific registers.  This struct
    119      is stored in thread-specific storage.
    120 
    121 
    122    * The pre-wrapper is called, passing it a pointer to struct
    123      "args".
    124 
    125 
    126    * The pre-wrapper examines the args and pokes the tool
    127      appropriately.  It may modify the args; this is why "orig_args"
    128      is also stored.
    129 
    130      The pre-wrapper may choose to 'do' the syscall itself, and
    131      concludes one of three outcomes:
    132 
    133        Success(N)    -- syscall is already complete, with success;
    134                         result is N
    135 
    136        Fail(N)       -- syscall is already complete, with failure;
    137                         error code is N
    138 
    139        HandToKernel  -- (the usual case): this needs to be given to
    140                         the kernel to be done, using the values in
    141                         the possibly-modified "args" struct.
    142 
    143      In addition, the pre-wrapper may set some flags:
    144 
    145        MayBlock   -- only applicable when outcome==HandToKernel
    146 
    147        PostOnFail -- only applicable when outcome==HandToKernel or Fail
    148 
    149 
    150    * If the pre-outcome is HandToKernel, the syscall is duly handed
    151      off to the kernel (perhaps involving some thread switchery, but
    152      that's not important).  This reduces the possible set of outcomes
    153      to either Success(N) or Fail(N).
    154 
    155 
    156    * The outcome (Success(N) or Fail(N)) is written back to the guest
    157      register(s).  This is platform specific:
    158 
    159      x86:    Success(N) ==>  eax = N
    160              Fail(N)    ==>  eax = -N
    161 
    162      ditto amd64
    163 
    164      ppc32:  Success(N) ==>  r3 = N, CR0.SO = 0
    165              Fail(N) ==>     r3 = N, CR0.SO = 1
    166 
    167      Darwin:
    168      x86:    Success(N) ==>  edx:eax = N, cc = 0
    169              Fail(N)    ==>  edx:eax = N, cc = 1
    170 
    171      s390x:  Success(N) ==>  r2 = N
    172              Fail(N)    ==>  r2 = -N
    173 
    174    * The post wrapper is called if:
    175 
    176      - it exists, and
    177      - outcome==Success or (outcome==Fail and PostOnFail is set)
    178 
    179      The post wrapper is passed the adulterated syscall args (struct
    180      "args"), and the syscall outcome (viz, Success(N) or Fail(N)).
    181 
    182    There are several other complications, primarily to do with
    183    syscalls getting interrupted, explained in comments in the code.
    184 */
    185 
    186 /* CAVEATS for writing wrappers.  It is important to follow these!
    187 
    188    The macros defined in priv_types_n_macros.h are designed to help
    189    decouple the wrapper logic from the actual representation of
    190    syscall args/results, since these wrappers are designed to work on
    191    multiple platforms.
    192 
    193    Sometimes a PRE wrapper will complete the syscall itself, without
    194    handing it to the kernel.  It will use one of SET_STATUS_Success,
    195    SET_STATUS_Failure or SET_STATUS_from_SysRes to set the return
    196    value.  It is critical to appreciate that use of the macro does not
    197    immediately cause the underlying guest state to be updated -- that
    198    is done by the driver logic in this file, when the wrapper returns.
    199 
    200    As a result, PRE wrappers of the following form will malfunction:
    201 
    202    PRE(fooble)
    203    {
    204       ... do stuff ...
    205       SET_STATUS_Somehow(...)
    206 
    207       // do something that assumes guest state is up to date
    208    }
    209 
    210    In particular, direct or indirect calls to VG_(poll_signals) after
    211    setting STATUS can cause the guest state to be read (in order to
    212    build signal frames).  Do not do this.  If you want a signal poll
    213    after the syscall goes through, do "*flags |= SfPollAfter" and the
    214    driver logic will do it for you.
    215 
    216    -----------
    217 
    218    Another critical requirement following introduction of new address
    219    space manager (JRS, 20050923):
    220 
    221    In a situation where the mappedness of memory has changed, aspacem
    222    should be notified BEFORE the tool.  Hence the following is
    223    correct:
    224 
    225       Bool d = VG_(am_notify_munmap)(s->start, s->end+1 - s->start);
    226       VG_TRACK( die_mem_munmap, s->start, s->end+1 - s->start );
    227       if (d)
    228          VG_(discard_translations)(s->start, s->end+1 - s->start);
    229 
    230    whilst this is wrong:
    231 
    232       VG_TRACK( die_mem_munmap, s->start, s->end+1 - s->start );
    233       Bool d = VG_(am_notify_munmap)(s->start, s->end+1 - s->start);
    234       if (d)
    235          VG_(discard_translations)(s->start, s->end+1 - s->start);
    236 
    237    The reason is that the tool may itself ask aspacem for more shadow
    238    memory as a result of the VG_TRACK call.  In such a situation it is
    239    critical that aspacem's segment array is up to date -- hence the
    240    need to notify aspacem first.
    241 
    242    -----------
    243 
    244    Also .. take care to call VG_(discard_translations) whenever
    245    memory with execute permissions is unmapped.
    246 */
    247 
    248 
    249 /* ---------------------------------------------------------------------
    250    Do potentially blocking syscall for the client, and mess with
    251    signal masks at the same time.
    252    ------------------------------------------------------------------ */
    253 
    254 /* Perform a syscall on behalf of a client thread, using a specific
    255    signal mask.  On completion, the signal mask is set to restore_mask
    256    (which presumably blocks almost everything).  If a signal happens
    257    during the syscall, the handler should call
    258    VG_(fixup_guest_state_after_syscall_interrupted) to adjust the
    259    thread's context to do the right thing.
    260 
    261    The _WRK function is handwritten assembly, implemented per-platform
    262    in coregrind/m_syswrap/syscall-$PLAT.S.  It has some very magic
    263    properties.  See comments at the top of
    264    VG_(fixup_guest_state_after_syscall_interrupted) below for details.
    265 
    266    This function (these functions) are required to return zero in case
    267    of success (even if the syscall itself failed), and nonzero if the
    268    sigprocmask-swizzling calls failed.  We don't actually care about
    269    the failure values from sigprocmask, although most of the assembly
    270    implementations do attempt to return that, using the convention
    271    0 for success, or 0x8000 | error-code for failure.
    272 */
#if defined(VGO_linux)
/* Linux: a single handwritten helper covers all syscalls. */
extern
UWord ML_(do_syscall_for_client_WRK)( Word syscallno,
                                      void* guest_state,
                                      const vki_sigset_t *syscall_mask,
                                      const vki_sigset_t *restore_mask,
                                      Word sigsetSzB );
#elif defined(VGO_darwin)
/* Darwin: one helper per syscall class (Unix / Mach / mdep); the
   sigsetSzB parameter is accepted but ignored by all three. */
extern
UWord ML_(do_syscall_for_client_unix_WRK)( Word syscallno,
                                           void* guest_state,
                                           const vki_sigset_t *syscall_mask,
                                           const vki_sigset_t *restore_mask,
                                           Word sigsetSzB ); /* unused */
extern
UWord ML_(do_syscall_for_client_mach_WRK)( Word syscallno,
                                           void* guest_state,
                                           const vki_sigset_t *syscall_mask,
                                           const vki_sigset_t *restore_mask,
                                           Word sigsetSzB ); /* unused */
extern
UWord ML_(do_syscall_for_client_mdep_WRK)( Word syscallno,
                                           void* guest_state,
                                           const vki_sigset_t *syscall_mask,
                                           const vki_sigset_t *restore_mask,
                                           Word sigsetSzB ); /* unused */
#else
#  error "Unknown OS"
#endif
    302 
    303 
static
void do_syscall_for_client ( Int syscallno,
                             ThreadState* tst,
                             const vki_sigset_t* syscall_mask )
{
   /* Hand SYSCALLNO for thread TST to the kernel via the per-platform
      assembly helper, running it with SYSCALL_MASK in force.  The
      helper writes the previously-blocked set into 'saved' and
      restores it afterwards; see the long comment above for the full
      contract. */
   vki_sigset_t saved;
   UWord err;
#  if defined(VGO_linux)
   err = ML_(do_syscall_for_client_WRK)(
            syscallno, &tst->arch.vex,
            syscall_mask, &saved, sizeof(vki_sigset_t)
         );
#  elif defined(VGO_darwin)
   /* Darwin encodes the syscall class in the number itself; pick the
      matching helper and strip the class bits before entering the
      kernel. */
   switch (VG_DARWIN_SYSNO_CLASS(syscallno)) {
      case VG_DARWIN_SYSCALL_CLASS_UNIX:
         err = ML_(do_syscall_for_client_unix_WRK)(
                  VG_DARWIN_SYSNO_FOR_KERNEL(syscallno), &tst->arch.vex,
                  syscall_mask, &saved, 0/*unused:sigsetSzB*/
               );
         break;
      case VG_DARWIN_SYSCALL_CLASS_MACH:
         err = ML_(do_syscall_for_client_mach_WRK)(
                  VG_DARWIN_SYSNO_FOR_KERNEL(syscallno), &tst->arch.vex,
                  syscall_mask, &saved, 0/*unused:sigsetSzB*/
               );
         break;
      case VG_DARWIN_SYSCALL_CLASS_MDEP:
         err = ML_(do_syscall_for_client_mdep_WRK)(
                  VG_DARWIN_SYSNO_FOR_KERNEL(syscallno), &tst->arch.vex,
                  syscall_mask, &saved, 0/*unused:sigsetSzB*/
               );
         break;
      default:
         vg_assert(0);
         /*NOTREACHED*/
         break;
   }
#  else
#    error "Unknown OS"
#  endif
   /* Nonzero means the sigprocmask-swizzling inside the helper failed,
      which should never happen; the syscall's own failure is reported
      through the guest state, not through 'err'. */
   vg_assert2(
      err == 0,
      "ML_(do_syscall_for_client_WRK): sigprocmask error %d",
      (Int)(err & 0xFFF)
   );
}
    350 
    351 
    352 /* ---------------------------------------------------------------------
    353    Impedance matchers and misc helpers
    354    ------------------------------------------------------------------ */
    355 
    356 static
    357 Bool eq_SyscallArgs ( SyscallArgs* a1, SyscallArgs* a2 )
    358 {
    359    return a1->sysno == a2->sysno
    360           && a1->arg1 == a2->arg1
    361           && a1->arg2 == a2->arg2
    362           && a1->arg3 == a2->arg3
    363           && a1->arg4 == a2->arg4
    364           && a1->arg5 == a2->arg5
    365           && a1->arg6 == a2->arg6
    366           && a1->arg7 == a2->arg7
    367           && a1->arg8 == a2->arg8;
    368 }
    369 
    370 static
    371 Bool eq_SyscallStatus ( SyscallStatus* s1, SyscallStatus* s2 )
    372 {
    373    /* was: return s1->what == s2->what && sr_EQ( s1->sres, s2->sres ); */
    374    if (s1->what == s2->what && sr_EQ( s1->sres, s2->sres ))
    375       return True;
    376 #  if defined(VGO_darwin)
    377    /* Darwin-specific debugging guff */
    378    vg_assert(s1->what == s2->what);
    379    VG_(printf)("eq_SyscallStatus:\n");
    380    VG_(printf)("  {%lu %lu %u}\n", s1->sres._wLO, s1->sres._wHI, s1->sres._mode);
    381    VG_(printf)("  {%lu %lu %u}\n", s2->sres._wLO, s2->sres._wHI, s2->sres._mode);
    382    vg_assert(0);
    383 #  endif
    384    return False;
    385 }
    386 
    387 /* Convert between SysRes and SyscallStatus, to the extent possible. */
    388 
    389 static
    390 SyscallStatus convert_SysRes_to_SyscallStatus ( SysRes res )
    391 {
    392    SyscallStatus status;
    393    status.what = SsComplete;
    394    status.sres = res;
    395    return status;
    396 }
    397 
    398 
    399 /* Impedance matchers.  These convert syscall arg or result data from
    400    the platform-specific in-guest-state format to the canonical
    401    formats, and back. */
    402 
    403 static
    404 void getSyscallArgsFromGuestState ( /*OUT*/SyscallArgs*       canonical,
    405                                     /*IN*/ VexGuestArchState* gst_vanilla,
    406                                     /*IN*/ UInt trc )
    407 {
    408 #if defined(VGP_x86_linux)
    409    VexGuestX86State* gst = (VexGuestX86State*)gst_vanilla;
    410    canonical->sysno = gst->guest_EAX;
    411    canonical->arg1  = gst->guest_EBX;
    412    canonical->arg2  = gst->guest_ECX;
    413    canonical->arg3  = gst->guest_EDX;
    414    canonical->arg4  = gst->guest_ESI;
    415    canonical->arg5  = gst->guest_EDI;
    416    canonical->arg6  = gst->guest_EBP;
    417    canonical->arg7  = 0;
    418    canonical->arg8  = 0;
    419 
    420 #elif defined(VGP_amd64_linux)
    421    VexGuestAMD64State* gst = (VexGuestAMD64State*)gst_vanilla;
    422    canonical->sysno = gst->guest_RAX;
    423    canonical->arg1  = gst->guest_RDI;
    424    canonical->arg2  = gst->guest_RSI;
    425    canonical->arg3  = gst->guest_RDX;
    426    canonical->arg4  = gst->guest_R10;
    427    canonical->arg5  = gst->guest_R8;
    428    canonical->arg6  = gst->guest_R9;
    429    canonical->arg7  = 0;
    430    canonical->arg8  = 0;
    431 
    432 #elif defined(VGP_ppc32_linux)
    433    VexGuestPPC32State* gst = (VexGuestPPC32State*)gst_vanilla;
    434    canonical->sysno = gst->guest_GPR0;
    435    canonical->arg1  = gst->guest_GPR3;
    436    canonical->arg2  = gst->guest_GPR4;
    437    canonical->arg3  = gst->guest_GPR5;
    438    canonical->arg4  = gst->guest_GPR6;
    439    canonical->arg5  = gst->guest_GPR7;
    440    canonical->arg6  = gst->guest_GPR8;
    441    canonical->arg7  = 0;
    442    canonical->arg8  = 0;
    443 
    444 #elif defined(VGP_ppc64_linux)
    445    VexGuestPPC64State* gst = (VexGuestPPC64State*)gst_vanilla;
    446    canonical->sysno = gst->guest_GPR0;
    447    canonical->arg1  = gst->guest_GPR3;
    448    canonical->arg2  = gst->guest_GPR4;
    449    canonical->arg3  = gst->guest_GPR5;
    450    canonical->arg4  = gst->guest_GPR6;
    451    canonical->arg5  = gst->guest_GPR7;
    452    canonical->arg6  = gst->guest_GPR8;
    453    canonical->arg7  = 0;
    454    canonical->arg8  = 0;
    455 
    456 #elif defined(VGP_arm_linux)
    457    VexGuestARMState* gst = (VexGuestARMState*)gst_vanilla;
    458    canonical->sysno = gst->guest_R7;
    459    canonical->arg1  = gst->guest_R0;
    460    canonical->arg2  = gst->guest_R1;
    461    canonical->arg3  = gst->guest_R2;
    462    canonical->arg4  = gst->guest_R3;
    463    canonical->arg5  = gst->guest_R4;
    464    canonical->arg6  = gst->guest_R5;
    465    canonical->arg7  = 0;
    466    canonical->arg8  = 0;
    467 
    468 #elif defined(VGP_arm64_linux)
    469    VexGuestARM64State* gst = (VexGuestARM64State*)gst_vanilla;
    470    canonical->sysno = gst->guest_X8;
    471    canonical->arg1  = gst->guest_X0;
    472    canonical->arg2  = gst->guest_X1;
    473    canonical->arg3  = gst->guest_X2;
    474    canonical->arg4  = gst->guest_X3;
    475    canonical->arg5  = gst->guest_X4;
    476    canonical->arg6  = gst->guest_X5;
    477    canonical->arg7  = 0;
    478    canonical->arg8  = 0;
    479 
    480 #elif defined(VGP_mips32_linux)
    481    VexGuestMIPS32State* gst = (VexGuestMIPS32State*)gst_vanilla;
    482    canonical->sysno = gst->guest_r2;    // v0
    483    if (canonical->sysno == __NR_exit) {
    484       canonical->arg1 = gst->guest_r4;    // a0
    485       canonical->arg2 = 0;
    486       canonical->arg3 = 0;
    487       canonical->arg4 = 0;
    488       canonical->arg5 = 0;
    489       canonical->arg6 = 0;
    490       canonical->arg8 = 0;
    491    } else if (canonical->sysno != __NR_syscall) {
    492       canonical->arg1  = gst->guest_r4;    // a0
    493       canonical->arg2  = gst->guest_r5;    // a1
    494       canonical->arg3  = gst->guest_r6;    // a2
    495       canonical->arg4  = gst->guest_r7;    // a3
    496       canonical->arg5  = *((UInt*) (gst->guest_r29 + 16));    // 16(guest_SP/sp)
    497       canonical->arg6  = *((UInt*) (gst->guest_r29 + 20));    // 20(sp)
    498       canonical->arg8 = 0;
    499    } else {
    500       // Fixme hack handle syscall()
    501       canonical->sysno = gst->guest_r4;    // a0
    502       canonical->arg1  = gst->guest_r5;    // a1
    503       canonical->arg2  = gst->guest_r6;    // a2
    504       canonical->arg3  = gst->guest_r7;    // a3
    505       canonical->arg4  = *((UInt*) (gst->guest_r29 + 16));    // 16(guest_SP/sp)
    506       canonical->arg5  = *((UInt*) (gst->guest_r29 + 20));    // 20(guest_SP/sp)
    507       canonical->arg6  = *((UInt*) (gst->guest_r29 + 24));    // 24(guest_SP/sp)
    508       canonical->arg8 = __NR_syscall;
    509    }
    510 
    511 #elif defined(VGP_mips64_linux)
    512    VexGuestMIPS64State* gst = (VexGuestMIPS64State*)gst_vanilla;
    513    canonical->sysno = gst->guest_r2;    // v0
    514    canonical->arg1  = gst->guest_r4;    // a0
    515    canonical->arg2  = gst->guest_r5;    // a1
    516    canonical->arg3  = gst->guest_r6;    // a2
    517    canonical->arg4  = gst->guest_r7;    // a3
    518    canonical->arg5  = gst->guest_r8;    // a4
    519    canonical->arg6  = gst->guest_r9;    // a5
    520 
    521 #elif defined(VGP_x86_darwin)
    522    VexGuestX86State* gst = (VexGuestX86State*)gst_vanilla;
    523    UWord *stack = (UWord *)gst->guest_ESP;
    524    // GrP fixme hope syscalls aren't called with really shallow stacks...
    525    canonical->sysno = gst->guest_EAX;
    526    if (canonical->sysno != 0) {
    527       // stack[0] is return address
    528       canonical->arg1  = stack[1];
    529       canonical->arg2  = stack[2];
    530       canonical->arg3  = stack[3];
    531       canonical->arg4  = stack[4];
    532       canonical->arg5  = stack[5];
    533       canonical->arg6  = stack[6];
    534       canonical->arg7  = stack[7];
    535       canonical->arg8  = stack[8];
    536    } else {
    537       // GrP fixme hack handle syscall()
    538       // GrP fixme what about __syscall() ?
    539       // stack[0] is return address
    540       // DDD: the tool can't see that the params have been shifted!  Can
    541       //      lead to incorrect checking, I think, because the PRRAn/PSARn
    542       //      macros will mention the pre-shifted args.
    543       canonical->sysno = stack[1];
    544       vg_assert(canonical->sysno != 0);
    545       canonical->arg1  = stack[2];
    546       canonical->arg2  = stack[3];
    547       canonical->arg3  = stack[4];
    548       canonical->arg4  = stack[5];
    549       canonical->arg5  = stack[6];
    550       canonical->arg6  = stack[7];
    551       canonical->arg7  = stack[8];
    552       canonical->arg8  = stack[9];
    553 
    554       PRINT("SYSCALL[%d,?](%s) syscall(%s, ...); please stand by...\n",
    555             VG_(getpid)(), /*tid,*/
    556             VG_SYSNUM_STRING(0), VG_SYSNUM_STRING(canonical->sysno));
    557    }
    558 
    559    // Here we determine what kind of syscall it was by looking at the
    560    // interrupt kind, and then encode the syscall number using the 64-bit
    561    // encoding for Valgrind's internal use.
    562    //
    563    // DDD: Would it be better to stash the JMP kind into the Darwin
    564    // thread state rather than passing in the trc?
    565    switch (trc) {
    566    case VEX_TRC_JMP_SYS_INT128:
    567       // int $0x80 = Unix, 64-bit result
    568       vg_assert(canonical->sysno >= 0);
    569       canonical->sysno = VG_DARWIN_SYSCALL_CONSTRUCT_UNIX(canonical->sysno);
    570       break;
    571    case VEX_TRC_JMP_SYS_SYSENTER:
    572       // syscall = Unix, 32-bit result
    573       // OR        Mach, 32-bit result
    574       if (canonical->sysno >= 0) {
    575          // GrP fixme hack:  0xffff == I386_SYSCALL_NUMBER_MASK
    576          canonical->sysno = VG_DARWIN_SYSCALL_CONSTRUCT_UNIX(canonical->sysno
    577                                                              & 0xffff);
    578       } else {
    579          canonical->sysno = VG_DARWIN_SYSCALL_CONSTRUCT_MACH(-canonical->sysno);
    580       }
    581       break;
    582    case VEX_TRC_JMP_SYS_INT129:
    583       // int $0x81 = Mach, 32-bit result
    584       vg_assert(canonical->sysno < 0);
    585       canonical->sysno = VG_DARWIN_SYSCALL_CONSTRUCT_MACH(-canonical->sysno);
    586       break;
    587    case VEX_TRC_JMP_SYS_INT130:
    588       // int $0x82 = mdep, 32-bit result
    589       vg_assert(canonical->sysno >= 0);
    590       canonical->sysno = VG_DARWIN_SYSCALL_CONSTRUCT_MDEP(canonical->sysno);
    591       break;
    592    default:
    593       vg_assert(0);
    594       break;
    595    }
    596 
    597 #elif defined(VGP_amd64_darwin)
    598    VexGuestAMD64State* gst = (VexGuestAMD64State*)gst_vanilla;
    599    UWord *stack = (UWord *)gst->guest_RSP;
    600 
    601    vg_assert(trc == VEX_TRC_JMP_SYS_SYSCALL);
    602 
    603    // GrP fixme hope syscalls aren't called with really shallow stacks...
    604    canonical->sysno = gst->guest_RAX;
    605    if (canonical->sysno != __NR_syscall) {
    606       // stack[0] is return address
    607       canonical->arg1  = gst->guest_RDI;
    608       canonical->arg2  = gst->guest_RSI;
    609       canonical->arg3  = gst->guest_RDX;
    610       canonical->arg4  = gst->guest_R10;  // not rcx with syscall insn
    611       canonical->arg5  = gst->guest_R8;
    612       canonical->arg6  = gst->guest_R9;
    613       canonical->arg7  = stack[1];
    614       canonical->arg8  = stack[2];
    615    } else {
    616       // GrP fixme hack handle syscall()
    617       // GrP fixme what about __syscall() ?
    618       // stack[0] is return address
    619       // DDD: the tool can't see that the params have been shifted!  Can
    620       //      lead to incorrect checking, I think, because the PRRAn/PSARn
    621       //      macros will mention the pre-shifted args.
    622       canonical->sysno = VG_DARWIN_SYSCALL_CONSTRUCT_UNIX(gst->guest_RDI);
    623       vg_assert(canonical->sysno != __NR_syscall);
    624       canonical->arg1  = gst->guest_RSI;
    625       canonical->arg2  = gst->guest_RDX;
    626       canonical->arg3  = gst->guest_R10;  // not rcx with syscall insn
    627       canonical->arg4  = gst->guest_R8;
    628       canonical->arg5  = gst->guest_R9;
    629       canonical->arg6  = stack[1];
    630       canonical->arg7  = stack[2];
    631       canonical->arg8  = stack[3];
    632 
    633       PRINT("SYSCALL[%d,?](%s) syscall(%s, ...); please stand by...\n",
    634             VG_(getpid)(), /*tid,*/
    635             VG_SYSNUM_STRING(0), VG_SYSNUM_STRING(canonical->sysno));
    636    }
    637 
    638    // no canonical->sysno adjustment needed
    639 
    640 #elif defined(VGP_s390x_linux)
    641    VexGuestS390XState* gst = (VexGuestS390XState*)gst_vanilla;
    642    canonical->sysno = gst->guest_SYSNO;
    643    canonical->arg1  = gst->guest_r2;
    644    canonical->arg2  = gst->guest_r3;
    645    canonical->arg3  = gst->guest_r4;
    646    canonical->arg4  = gst->guest_r5;
    647    canonical->arg5  = gst->guest_r6;
    648    canonical->arg6  = gst->guest_r7;
    649    canonical->arg7  = 0;
    650    canonical->arg8  = 0;
    651 #else
    652 #  error "getSyscallArgsFromGuestState: unknown arch"
    653 #endif
    654 }
    655 
/* Write a canonical (platform-independent) syscall-args record back
   into the guest register state, following each platform's syscall
   calling convention: the syscall number and up to 8 arguments are
   stored into the registers (or guest stack slots) from which the
   kernel would read them. */
static
void putSyscallArgsIntoGuestState ( /*IN*/ SyscallArgs*       canonical,
                                    /*OUT*/VexGuestArchState* gst_vanilla )
{
#if defined(VGP_x86_linux)
   /* x86-linux: number in EAX, args in EBX,ECX,EDX,ESI,EDI,EBP. */
   VexGuestX86State* gst = (VexGuestX86State*)gst_vanilla;
   gst->guest_EAX = canonical->sysno;
   gst->guest_EBX = canonical->arg1;
   gst->guest_ECX = canonical->arg2;
   gst->guest_EDX = canonical->arg3;
   gst->guest_ESI = canonical->arg4;
   gst->guest_EDI = canonical->arg5;
   gst->guest_EBP = canonical->arg6;

#elif defined(VGP_amd64_linux)
   /* amd64-linux: number in RAX, args in RDI,RSI,RDX,R10,R8,R9. */
   VexGuestAMD64State* gst = (VexGuestAMD64State*)gst_vanilla;
   gst->guest_RAX = canonical->sysno;
   gst->guest_RDI = canonical->arg1;
   gst->guest_RSI = canonical->arg2;
   gst->guest_RDX = canonical->arg3;
   gst->guest_R10 = canonical->arg4;
   gst->guest_R8  = canonical->arg5;
   gst->guest_R9  = canonical->arg6;

#elif defined(VGP_ppc32_linux)
   /* ppc32-linux: number in GPR0, args in GPR3..GPR8. */
   VexGuestPPC32State* gst = (VexGuestPPC32State*)gst_vanilla;
   gst->guest_GPR0 = canonical->sysno;
   gst->guest_GPR3 = canonical->arg1;
   gst->guest_GPR4 = canonical->arg2;
   gst->guest_GPR5 = canonical->arg3;
   gst->guest_GPR6 = canonical->arg4;
   gst->guest_GPR7 = canonical->arg5;
   gst->guest_GPR8 = canonical->arg6;

#elif defined(VGP_ppc64_linux)
   /* ppc64-linux: same register assignment as ppc32. */
   VexGuestPPC64State* gst = (VexGuestPPC64State*)gst_vanilla;
   gst->guest_GPR0 = canonical->sysno;
   gst->guest_GPR3 = canonical->arg1;
   gst->guest_GPR4 = canonical->arg2;
   gst->guest_GPR5 = canonical->arg3;
   gst->guest_GPR6 = canonical->arg4;
   gst->guest_GPR7 = canonical->arg5;
   gst->guest_GPR8 = canonical->arg6;

#elif defined(VGP_arm_linux)
   /* arm-linux (EABI): number in R7, args in R0..R5. */
   VexGuestARMState* gst = (VexGuestARMState*)gst_vanilla;
   gst->guest_R7 = canonical->sysno;
   gst->guest_R0 = canonical->arg1;
   gst->guest_R1 = canonical->arg2;
   gst->guest_R2 = canonical->arg3;
   gst->guest_R3 = canonical->arg4;
   gst->guest_R4 = canonical->arg5;
   gst->guest_R5 = canonical->arg6;

#elif defined(VGP_arm64_linux)
   /* arm64-linux: number in X8, args in X0..X5. */
   VexGuestARM64State* gst = (VexGuestARM64State*)gst_vanilla;
   gst->guest_X8 = canonical->sysno;
   gst->guest_X0 = canonical->arg1;
   gst->guest_X1 = canonical->arg2;
   gst->guest_X2 = canonical->arg3;
   gst->guest_X3 = canonical->arg4;
   gst->guest_X4 = canonical->arg5;
   gst->guest_X5 = canonical->arg6;

#elif defined(VGP_x86_darwin)
   /* x86-darwin: number in EAX (re-encoded for the kernel); all
      arguments are passed on the guest stack in C convention,
      starting one word above the return address. */
   VexGuestX86State* gst = (VexGuestX86State*)gst_vanilla;
   UWord *stack = (UWord *)gst->guest_ESP;

   gst->guest_EAX = VG_DARWIN_SYSNO_FOR_KERNEL(canonical->sysno);

   // GrP fixme? gst->guest_TEMP_EFLAG_C = 0;
   // stack[0] is return address
   stack[1] = canonical->arg1;
   stack[2] = canonical->arg2;
   stack[3] = canonical->arg3;
   stack[4] = canonical->arg4;
   stack[5] = canonical->arg5;
   stack[6] = canonical->arg6;
   stack[7] = canonical->arg7;
   stack[8] = canonical->arg8;

#elif defined(VGP_amd64_darwin)
   /* amd64-darwin: first six args in registers (note arg4 in RCX,
      unlike Linux's R10), args 7 and 8 spill to the guest stack. */
   VexGuestAMD64State* gst = (VexGuestAMD64State*)gst_vanilla;
   UWord *stack = (UWord *)gst->guest_RSP;

   gst->guest_RAX = VG_DARWIN_SYSNO_FOR_KERNEL(canonical->sysno);
   // GrP fixme? gst->guest_TEMP_EFLAG_C = 0;

   // stack[0] is return address
   gst->guest_RDI = canonical->arg1;
   gst->guest_RSI = canonical->arg2;
   gst->guest_RDX = canonical->arg3;
   gst->guest_RCX = canonical->arg4;
   gst->guest_R8  = canonical->arg5;
   gst->guest_R9  = canonical->arg6;
   stack[1]       = canonical->arg7;
   stack[2]       = canonical->arg8;

#elif defined(VGP_s390x_linux)
   /* s390x-linux: number in the SYSNO pseudo-register, args in r2..r7. */
   VexGuestS390XState* gst = (VexGuestS390XState*)gst_vanilla;
   gst->guest_SYSNO  = canonical->sysno;
   gst->guest_r2     = canonical->arg1;
   gst->guest_r3     = canonical->arg2;
   gst->guest_r4     = canonical->arg3;
   gst->guest_r5     = canonical->arg4;
   gst->guest_r6     = canonical->arg5;
   gst->guest_r7     = canonical->arg6;

#elif defined(VGP_mips32_linux)
   /* mips32-linux: number in r2 (v0), args in r4..r7, then on the
      stack at 16(sp) and 20(sp).  arg8 doubles as a marker for the
      indirect syscall(2) form — presumably set up by the matching
      'get' routine (TODO confirm): when it holds __NR_syscall, the
      real number goes in r4 and every argument shifts down one slot. */
   VexGuestMIPS32State* gst = (VexGuestMIPS32State*)gst_vanilla;
   if (canonical->arg8 != __NR_syscall) {
      gst->guest_r2 = canonical->sysno;
      gst->guest_r4 = canonical->arg1;
      gst->guest_r5 = canonical->arg2;
      gst->guest_r6 = canonical->arg3;
      gst->guest_r7 = canonical->arg4;
      *((UInt*) (gst->guest_r29 + 16)) = canonical->arg5; // 16(guest_GPR29/sp)
      *((UInt*) (gst->guest_r29 + 20)) = canonical->arg6; // 20(sp)
   } else {
      canonical->arg8 = 0;
      gst->guest_r2 = __NR_syscall;
      gst->guest_r4 = canonical->sysno;
      gst->guest_r5 = canonical->arg1;
      gst->guest_r6 = canonical->arg2;
      gst->guest_r7 = canonical->arg3;
      *((UInt*) (gst->guest_r29 + 16)) = canonical->arg4; // 16(guest_GPR29/sp)
      *((UInt*) (gst->guest_r29 + 20)) = canonical->arg5; // 20(sp)
      *((UInt*) (gst->guest_r29 + 24)) = canonical->arg6; // 24(sp)
   }

#elif defined(VGP_mips64_linux)
   /* mips64-linux: number in r2 (v0), args in r4..r9 — no stack args. */
   VexGuestMIPS64State* gst = (VexGuestMIPS64State*)gst_vanilla;
   gst->guest_r2 = canonical->sysno;
   gst->guest_r4 = canonical->arg1;
   gst->guest_r5 = canonical->arg2;
   gst->guest_r6 = canonical->arg3;
   gst->guest_r7 = canonical->arg4;
   gst->guest_r8 = canonical->arg5;
   gst->guest_r9 = canonical->arg6;
#else
#  error "putSyscallArgsIntoGuestState: unknown arch"
#endif
}
    799 
/* Read the just-completed syscall's outcome out of the guest register
   state and convert it into a canonical SysRes, using each platform's
   syscall return convention.  Always marks the status SsComplete. */
static
void getSyscallStatusFromGuestState ( /*OUT*/SyscallStatus*     canonical,
                                      /*IN*/ VexGuestArchState* gst_vanilla )
{
#  if defined(VGP_x86_linux)
   /* x86-linux: result (or negated errno) in EAX. */
   VexGuestX86State* gst = (VexGuestX86State*)gst_vanilla;
   canonical->sres = VG_(mk_SysRes_x86_linux)( gst->guest_EAX );
   canonical->what = SsComplete;

#  elif defined(VGP_amd64_linux)
   /* amd64-linux: result (or negated errno) in RAX. */
   VexGuestAMD64State* gst = (VexGuestAMD64State*)gst_vanilla;
   canonical->sres = VG_(mk_SysRes_amd64_linux)( gst->guest_RAX );
   canonical->what = SsComplete;

#  elif defined(VGP_ppc32_linux)
   /* ppc32-linux: result in GPR3; bit CR0.SO (bit 28 of the packed
      CR image) signals error. */
   VexGuestPPC32State* gst   = (VexGuestPPC32State*)gst_vanilla;
   UInt                cr    = LibVEX_GuestPPC32_get_CR( gst );
   UInt                cr0so = (cr >> 28) & 1;
   canonical->sres = VG_(mk_SysRes_ppc32_linux)( gst->guest_GPR3, cr0so );
   canonical->what = SsComplete;

#  elif defined(VGP_ppc64_linux)
   /* ppc64-linux: same convention as ppc32. */
   VexGuestPPC64State* gst   = (VexGuestPPC64State*)gst_vanilla;
   UInt                cr    = LibVEX_GuestPPC64_get_CR( gst );
   UInt                cr0so = (cr >> 28) & 1;
   canonical->sres = VG_(mk_SysRes_ppc64_linux)( gst->guest_GPR3, cr0so );
   canonical->what = SsComplete;

#  elif defined(VGP_arm_linux)
   /* arm-linux: result (or negated errno) in R0. */
   VexGuestARMState* gst = (VexGuestARMState*)gst_vanilla;
   canonical->sres = VG_(mk_SysRes_arm_linux)( gst->guest_R0 );
   canonical->what = SsComplete;

#  elif defined(VGP_arm64_linux)
   /* arm64-linux: result (or negated errno) in X0. */
   VexGuestARM64State* gst = (VexGuestARM64State*)gst_vanilla;
   canonical->sres = VG_(mk_SysRes_arm64_linux)( gst->guest_X0 );
   canonical->what = SsComplete;

#  elif defined(VGP_mips32_linux)
   /* mips32-linux: results in v0/v1, with a3 passed along as well
      (a3 appears to be the kernel's error indicator — confirm against
      the MIPS syscall ABI). */
   VexGuestMIPS32State* gst = (VexGuestMIPS32State*)gst_vanilla;
   UInt                v0 = gst->guest_r2;    // v0
   UInt                v1 = gst->guest_r3;    // v1
   UInt                a3 = gst->guest_r7;    // a3
   canonical->sres = VG_(mk_SysRes_mips32_linux)( v0, v1, a3 );
   canonical->what = SsComplete;

#  elif defined(VGP_mips64_linux)
   /* mips64-linux: same registers as mips32, but 64-bit wide. */
   VexGuestMIPS64State* gst = (VexGuestMIPS64State*)gst_vanilla;
   ULong                v0 = gst->guest_r2;    // v0
   ULong                v1 = gst->guest_r3;    // v1
   ULong                a3 = gst->guest_r7;    // a3
   canonical->sres = VG_(mk_SysRes_mips64_linux)(v0, v1, a3);
   canonical->what = SsComplete;

#  elif defined(VGP_x86_darwin)
   /* duplicates logic in m_signals.VG_UCONTEXT_SYSCALL_SYSRES */
   /* x86-darwin: the result encoding depends on the syscall class;
      only the Unix class uses the carry flag as the error indicator
      and returns a 64-bit EDX:EAX result. */
   VexGuestX86State* gst = (VexGuestX86State*)gst_vanilla;
   UInt carry = 1 & LibVEX_GuestX86_get_eflags(gst);
   UInt err = 0;
   UInt wLO = 0;
   UInt wHI = 0;
   switch (gst->guest_SC_CLASS) {
      case VG_DARWIN_SYSCALL_CLASS_UNIX:
         // int $0x80 = Unix, 64-bit result
         err = carry;
         wLO = gst->guest_EAX;
         wHI = gst->guest_EDX;
         break;
      case VG_DARWIN_SYSCALL_CLASS_MACH:
         // int $0x81 = Mach, 32-bit result
         wLO = gst->guest_EAX;
         break;
      case VG_DARWIN_SYSCALL_CLASS_MDEP:
         // int $0x82 = mdep, 32-bit result
         wLO = gst->guest_EAX;
         break;
      default:
         vg_assert(0);
         break;
   }
   canonical->sres = VG_(mk_SysRes_x86_darwin)(
                        gst->guest_SC_CLASS, err ? True : False,
                        wHI, wLO
                     );
   canonical->what = SsComplete;

#  elif defined(VGP_amd64_darwin)
   /* duplicates logic in m_signals.VG_UCONTEXT_SYSCALL_SYSRES */
   /* amd64-darwin: analogous to x86-darwin, with RDX:RAX for the
      Unix class's 128-bit result. */
   VexGuestAMD64State* gst = (VexGuestAMD64State*)gst_vanilla;
   ULong carry = 1 & LibVEX_GuestAMD64_get_rflags(gst);
   ULong err = 0;
   ULong wLO = 0;
   ULong wHI = 0;
   switch (gst->guest_SC_CLASS) {
      case VG_DARWIN_SYSCALL_CLASS_UNIX:
         // syscall = Unix, 128-bit result
         err = carry;
         wLO = gst->guest_RAX;
         wHI = gst->guest_RDX;
         break;
      case VG_DARWIN_SYSCALL_CLASS_MACH:
         // syscall = Mach, 64-bit result
         wLO = gst->guest_RAX;
         break;
      case VG_DARWIN_SYSCALL_CLASS_MDEP:
         // syscall = mdep, 64-bit result
         wLO = gst->guest_RAX;
         break;
      default:
         vg_assert(0);
         break;
   }
   canonical->sres = VG_(mk_SysRes_amd64_darwin)(
                        gst->guest_SC_CLASS, err ? True : False,
                        wHI, wLO
                     );
   canonical->what = SsComplete;

#  elif defined(VGP_s390x_linux)
   /* s390x-linux: result (or negated errno) in r2. */
   VexGuestS390XState* gst   = (VexGuestS390XState*)gst_vanilla;
   canonical->sres = VG_(mk_SysRes_s390x_linux)( gst->guest_r2 );
   canonical->what = SsComplete;

#  else
#    error "getSyscallStatusFromGuestState: unknown arch"
#  endif
}
    927 
    928 static
    929 void putSyscallStatusIntoGuestState ( /*IN*/ ThreadId tid,
    930                                       /*IN*/ SyscallStatus*     canonical,
    931                                       /*OUT*/VexGuestArchState* gst_vanilla )
    932 {
    933 #  if defined(VGP_x86_linux)
    934    VexGuestX86State* gst = (VexGuestX86State*)gst_vanilla;
    935    vg_assert(canonical->what == SsComplete);
    936    if (sr_isError(canonical->sres)) {
    937       /* This isn't exactly right, in that really a Failure with res
    938          not in the range 1 .. 4095 is unrepresentable in the
    939          Linux-x86 scheme.  Oh well. */
    940       gst->guest_EAX = - (Int)sr_Err(canonical->sres);
    941    } else {
    942       gst->guest_EAX = sr_Res(canonical->sres);
    943    }
    944    VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
    945              OFFSET_x86_EAX, sizeof(UWord) );
    946 
    947 #  elif defined(VGP_amd64_linux)
    948    VexGuestAMD64State* gst = (VexGuestAMD64State*)gst_vanilla;
    949    vg_assert(canonical->what == SsComplete);
    950    if (sr_isError(canonical->sres)) {
    951       /* This isn't exactly right, in that really a Failure with res
    952          not in the range 1 .. 4095 is unrepresentable in the
    953          Linux-amd64 scheme.  Oh well. */
    954       gst->guest_RAX = - (Long)sr_Err(canonical->sres);
    955    } else {
    956       gst->guest_RAX = sr_Res(canonical->sres);
    957    }
    958    VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
    959              OFFSET_amd64_RAX, sizeof(UWord) );
    960 
    961 #  elif defined(VGP_ppc32_linux)
    962    VexGuestPPC32State* gst = (VexGuestPPC32State*)gst_vanilla;
    963    UInt old_cr = LibVEX_GuestPPC32_get_CR(gst);
    964    vg_assert(canonical->what == SsComplete);
    965    if (sr_isError(canonical->sres)) {
    966       /* set CR0.SO */
    967       LibVEX_GuestPPC32_put_CR( old_cr | (1<<28), gst );
    968       gst->guest_GPR3 = sr_Err(canonical->sres);
    969    } else {
    970       /* clear CR0.SO */
    971       LibVEX_GuestPPC32_put_CR( old_cr & ~(1<<28), gst );
    972       gst->guest_GPR3 = sr_Res(canonical->sres);
    973    }
    974    VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
    975              OFFSET_ppc32_GPR3, sizeof(UWord) );
    976    VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
    977              OFFSET_ppc32_CR0_0, sizeof(UChar) );
    978 
    979 #  elif defined(VGP_ppc64_linux)
    980    VexGuestPPC64State* gst = (VexGuestPPC64State*)gst_vanilla;
    981    UInt old_cr = LibVEX_GuestPPC64_get_CR(gst);
    982    vg_assert(canonical->what == SsComplete);
    983    if (sr_isError(canonical->sres)) {
    984       /* set CR0.SO */
    985       LibVEX_GuestPPC64_put_CR( old_cr | (1<<28), gst );
    986       gst->guest_GPR3 = sr_Err(canonical->sres);
    987    } else {
    988       /* clear CR0.SO */
    989       LibVEX_GuestPPC64_put_CR( old_cr & ~(1<<28), gst );
    990       gst->guest_GPR3 = sr_Res(canonical->sres);
    991    }
    992    VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
    993              OFFSET_ppc64_GPR3, sizeof(UWord) );
    994    VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
    995              OFFSET_ppc64_CR0_0, sizeof(UChar) );
    996 
    997 #  elif defined(VGP_arm_linux)
    998    VexGuestARMState* gst = (VexGuestARMState*)gst_vanilla;
    999    vg_assert(canonical->what == SsComplete);
   1000    if (sr_isError(canonical->sres)) {
   1001       /* This isn't exactly right, in that really a Failure with res
   1002          not in the range 1 .. 4095 is unrepresentable in the
   1003          Linux-arm scheme.  Oh well. */
   1004       gst->guest_R0 = - (Int)sr_Err(canonical->sres);
   1005    } else {
   1006       gst->guest_R0 = sr_Res(canonical->sres);
   1007    }
   1008    VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
   1009              OFFSET_arm_R0, sizeof(UWord) );
   1010 
   1011 #  elif defined(VGP_arm64_linux)
   1012    VexGuestARM64State* gst = (VexGuestARM64State*)gst_vanilla;
   1013    vg_assert(canonical->what == SsComplete);
   1014    if (sr_isError(canonical->sres)) {
   1015       /* This isn't exactly right, in that really a Failure with res
   1016          not in the range 1 .. 4095 is unrepresentable in the
   1017          Linux-arm64 scheme.  Oh well. */
   1018       gst->guest_X0 = - (Long)sr_Err(canonical->sres);
   1019    } else {
   1020       gst->guest_X0 = sr_Res(canonical->sres);
   1021    }
   1022    VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
   1023              OFFSET_arm64_X0, sizeof(UWord) );
   1024 
   1025 #elif defined(VGP_x86_darwin)
   1026    VexGuestX86State* gst = (VexGuestX86State*)gst_vanilla;
   1027    SysRes sres = canonical->sres;
   1028    vg_assert(canonical->what == SsComplete);
   1029    /* Unfortunately here we have to break abstraction and look
   1030       directly inside 'res', in order to decide what to do. */
   1031    switch (sres._mode) {
   1032       case SysRes_MACH: // int $0x81 = Mach, 32-bit result
   1033       case SysRes_MDEP: // int $0x82 = mdep, 32-bit result
   1034          gst->guest_EAX = sres._wLO;
   1035          VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
   1036                    OFFSET_x86_EAX, sizeof(UInt) );
   1037          break;
   1038       case SysRes_UNIX_OK:  // int $0x80 = Unix, 64-bit result
   1039       case SysRes_UNIX_ERR: // int $0x80 = Unix, 64-bit error
   1040          gst->guest_EAX = sres._wLO;
   1041          VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
   1042                    OFFSET_x86_EAX, sizeof(UInt) );
   1043          gst->guest_EDX = sres._wHI;
   1044          VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
   1045                    OFFSET_x86_EDX, sizeof(UInt) );
   1046          LibVEX_GuestX86_put_eflag_c( sres._mode==SysRes_UNIX_ERR ? 1 : 0,
   1047                                       gst );
   1048          // GrP fixme sets defined for entire eflags, not just bit c
   1049          // DDD: this breaks exp-ptrcheck.
   1050          VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
   1051                    offsetof(VexGuestX86State, guest_CC_DEP1), sizeof(UInt) );
   1052          break;
   1053       default:
   1054          vg_assert(0);
   1055          break;
   1056    }
   1057 
   1058 #elif defined(VGP_amd64_darwin)
   1059    VexGuestAMD64State* gst = (VexGuestAMD64State*)gst_vanilla;
   1060    SysRes sres = canonical->sres;
   1061    vg_assert(canonical->what == SsComplete);
   1062    /* Unfortunately here we have to break abstraction and look
   1063       directly inside 'res', in order to decide what to do. */
   1064    switch (sres._mode) {
   1065       case SysRes_MACH: // syscall = Mach, 64-bit result
   1066       case SysRes_MDEP: // syscall = mdep, 64-bit result
   1067          gst->guest_RAX = sres._wLO;
   1068          VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
   1069                    OFFSET_amd64_RAX, sizeof(ULong) );
   1070          break;
   1071       case SysRes_UNIX_OK:  // syscall = Unix, 128-bit result
   1072       case SysRes_UNIX_ERR: // syscall = Unix, 128-bit error
   1073          gst->guest_RAX = sres._wLO;
   1074          VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
   1075                    OFFSET_amd64_RAX, sizeof(ULong) );
   1076          gst->guest_RDX = sres._wHI;
   1077          VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
   1078                    OFFSET_amd64_RDX, sizeof(ULong) );
   1079          LibVEX_GuestAMD64_put_rflag_c( sres._mode==SysRes_UNIX_ERR ? 1 : 0,
   1080                                         gst );
   1081          // GrP fixme sets defined for entire rflags, not just bit c
   1082          // DDD: this breaks exp-ptrcheck.
   1083          VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
   1084                    offsetof(VexGuestAMD64State, guest_CC_DEP1), sizeof(ULong) );
   1085          break;
   1086       default:
   1087          vg_assert(0);
   1088          break;
   1089    }
   1090 
   1091 #  elif defined(VGP_s390x_linux)
   1092    VexGuestS390XState* gst = (VexGuestS390XState*)gst_vanilla;
   1093    vg_assert(canonical->what == SsComplete);
   1094    if (sr_isError(canonical->sres)) {
   1095       gst->guest_r2 = - (Long)sr_Err(canonical->sres);
   1096    } else {
   1097       gst->guest_r2 = sr_Res(canonical->sres);
   1098    }
   1099 
   1100 #  elif defined(VGP_mips32_linux)
   1101    VexGuestMIPS32State* gst = (VexGuestMIPS32State*)gst_vanilla;
   1102    vg_assert(canonical->what == SsComplete);
   1103    if (sr_isError(canonical->sres)) {
   1104       gst->guest_r2 = (Int)sr_Err(canonical->sres);
   1105       gst->guest_r7 = (Int)sr_Err(canonical->sres);
   1106    } else {
   1107       gst->guest_r2 = sr_Res(canonical->sres);
   1108       gst->guest_r3 = sr_ResEx(canonical->sres);
   1109       gst->guest_r7 = (Int)sr_Err(canonical->sres);
   1110    }
   1111    VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
   1112              OFFSET_mips32_r2, sizeof(UWord) );
   1113    VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
   1114              OFFSET_mips32_r3, sizeof(UWord) );
   1115    VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
   1116              OFFSET_mips32_r7, sizeof(UWord) );
   1117 
   1118 #  elif defined(VGP_mips64_linux)
   1119    VexGuestMIPS64State* gst = (VexGuestMIPS64State*)gst_vanilla;
   1120    vg_assert(canonical->what == SsComplete);
   1121    if (sr_isError(canonical->sres)) {
   1122       gst->guest_r2 = (Int)sr_Err(canonical->sres);
   1123       gst->guest_r7 = (Int)sr_Err(canonical->sres);
   1124    } else {
   1125       gst->guest_r2 = sr_Res(canonical->sres);
   1126       gst->guest_r3 = sr_ResEx(canonical->sres);
   1127       gst->guest_r7 = (Int)sr_Err(canonical->sres);
   1128    }
   1129    VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
   1130              OFFSET_mips64_r2, sizeof(UWord) );
   1131    VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
   1132              OFFSET_mips64_r3, sizeof(UWord) );
   1133    VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
   1134              OFFSET_mips64_r7, sizeof(UWord) );
   1135 
   1136 #  else
   1137 #    error "putSyscallStatusIntoGuestState: unknown arch"
   1138 #  endif
   1139 }
   1140 
   1141 
   1142 /* Tell me the offsets in the guest state of the syscall params, so
   1143    that the scalar argument checkers don't have to have this info
   1144    hardwired. */
   1145 
/* Report where, in the guest state, the syscall number and arguments
   live on this platform, so scalar argument checkers need not
   hard-wire the info.  o_* fields are guest-state register offsets,
   s_* fields are offsets from the guest stack pointer, and uu_*
   fields mark argument slots this platform never uses (-1). */
static
void getSyscallArgLayout ( /*OUT*/SyscallArgLayout* layout )
{
   /* Start from all-zeroes so unset fields are deterministic. */
   VG_(bzero_inline)(layout, sizeof(*layout));

#if defined(VGP_x86_linux)
   layout->o_sysno  = OFFSET_x86_EAX;
   layout->o_arg1   = OFFSET_x86_EBX;
   layout->o_arg2   = OFFSET_x86_ECX;
   layout->o_arg3   = OFFSET_x86_EDX;
   layout->o_arg4   = OFFSET_x86_ESI;
   layout->o_arg5   = OFFSET_x86_EDI;
   layout->o_arg6   = OFFSET_x86_EBP;
   layout->uu_arg7  = -1; /* impossible value */
   layout->uu_arg8  = -1; /* impossible value */

#elif defined(VGP_amd64_linux)
   layout->o_sysno  = OFFSET_amd64_RAX;
   layout->o_arg1   = OFFSET_amd64_RDI;
   layout->o_arg2   = OFFSET_amd64_RSI;
   layout->o_arg3   = OFFSET_amd64_RDX;
   layout->o_arg4   = OFFSET_amd64_R10;
   layout->o_arg5   = OFFSET_amd64_R8;
   layout->o_arg6   = OFFSET_amd64_R9;
   layout->uu_arg7  = -1; /* impossible value */
   layout->uu_arg8  = -1; /* impossible value */

#elif defined(VGP_ppc32_linux)
   layout->o_sysno  = OFFSET_ppc32_GPR0;
   layout->o_arg1   = OFFSET_ppc32_GPR3;
   layout->o_arg2   = OFFSET_ppc32_GPR4;
   layout->o_arg3   = OFFSET_ppc32_GPR5;
   layout->o_arg4   = OFFSET_ppc32_GPR6;
   layout->o_arg5   = OFFSET_ppc32_GPR7;
   layout->o_arg6   = OFFSET_ppc32_GPR8;
   layout->uu_arg7  = -1; /* impossible value */
   layout->uu_arg8  = -1; /* impossible value */

#elif defined(VGP_ppc64_linux)
   layout->o_sysno  = OFFSET_ppc64_GPR0;
   layout->o_arg1   = OFFSET_ppc64_GPR3;
   layout->o_arg2   = OFFSET_ppc64_GPR4;
   layout->o_arg3   = OFFSET_ppc64_GPR5;
   layout->o_arg4   = OFFSET_ppc64_GPR6;
   layout->o_arg5   = OFFSET_ppc64_GPR7;
   layout->o_arg6   = OFFSET_ppc64_GPR8;
   layout->uu_arg7  = -1; /* impossible value */
   layout->uu_arg8  = -1; /* impossible value */

#elif defined(VGP_arm_linux)
   layout->o_sysno  = OFFSET_arm_R7;
   layout->o_arg1   = OFFSET_arm_R0;
   layout->o_arg2   = OFFSET_arm_R1;
   layout->o_arg3   = OFFSET_arm_R2;
   layout->o_arg4   = OFFSET_arm_R3;
   layout->o_arg5   = OFFSET_arm_R4;
   layout->o_arg6   = OFFSET_arm_R5;
   layout->uu_arg7  = -1; /* impossible value */
   layout->uu_arg8  = -1; /* impossible value */

#elif defined(VGP_arm64_linux)
   layout->o_sysno  = OFFSET_arm64_X8;
   layout->o_arg1   = OFFSET_arm64_X0;
   layout->o_arg2   = OFFSET_arm64_X1;
   layout->o_arg3   = OFFSET_arm64_X2;
   layout->o_arg4   = OFFSET_arm64_X3;
   layout->o_arg5   = OFFSET_arm64_X4;
   layout->o_arg6   = OFFSET_arm64_X5;
   layout->uu_arg7  = -1; /* impossible value */
   layout->uu_arg8  = -1; /* impossible value */

#elif defined(VGP_mips32_linux)
   /* mips32 passes args 5 and 6 on the stack (16(sp) and 20(sp)),
      hence s_* rather than o_* for those two. */
   layout->o_sysno  = OFFSET_mips32_r2;
   layout->o_arg1   = OFFSET_mips32_r4;
   layout->o_arg2   = OFFSET_mips32_r5;
   layout->o_arg3   = OFFSET_mips32_r6;
   layout->o_arg4   = OFFSET_mips32_r7;
   layout->s_arg5   = sizeof(UWord) * 4;
   layout->s_arg6   = sizeof(UWord) * 5;
   layout->uu_arg7  = -1; /* impossible value */
   layout->uu_arg8  = -1; /* impossible value */

#elif defined(VGP_mips64_linux)
   layout->o_sysno  = OFFSET_mips64_r2;
   layout->o_arg1   = OFFSET_mips64_r4;
   layout->o_arg2   = OFFSET_mips64_r5;
   layout->o_arg3   = OFFSET_mips64_r6;
   layout->o_arg4   = OFFSET_mips64_r7;
   layout->o_arg5   = OFFSET_mips64_r8;
   layout->o_arg6   = OFFSET_mips64_r9;
   layout->uu_arg7  = -1; /* impossible value */
   layout->uu_arg8  = -1; /* impossible value */

#elif defined(VGP_x86_darwin)
   layout->o_sysno  = OFFSET_x86_EAX;
   // syscall parameters are on stack in C convention
   layout->s_arg1   = sizeof(UWord) * 1;
   layout->s_arg2   = sizeof(UWord) * 2;
   layout->s_arg3   = sizeof(UWord) * 3;
   layout->s_arg4   = sizeof(UWord) * 4;
   layout->s_arg5   = sizeof(UWord) * 5;
   layout->s_arg6   = sizeof(UWord) * 6;
   layout->s_arg7   = sizeof(UWord) * 7;
   layout->s_arg8   = sizeof(UWord) * 8;

#elif defined(VGP_amd64_darwin)
   /* amd64-darwin: first six args in registers, 7 and 8 on the stack. */
   layout->o_sysno  = OFFSET_amd64_RAX;
   layout->o_arg1   = OFFSET_amd64_RDI;
   layout->o_arg2   = OFFSET_amd64_RSI;
   layout->o_arg3   = OFFSET_amd64_RDX;
   layout->o_arg4   = OFFSET_amd64_RCX;
   layout->o_arg5   = OFFSET_amd64_R8;
   layout->o_arg6   = OFFSET_amd64_R9;
   layout->s_arg7   = sizeof(UWord) * 1;
   layout->s_arg8   = sizeof(UWord) * 2;

#elif defined(VGP_s390x_linux)
   layout->o_sysno  = OFFSET_s390x_SYSNO;
   layout->o_arg1   = OFFSET_s390x_r2;
   layout->o_arg2   = OFFSET_s390x_r3;
   layout->o_arg3   = OFFSET_s390x_r4;
   layout->o_arg4   = OFFSET_s390x_r5;
   layout->o_arg5   = OFFSET_s390x_r6;
   layout->o_arg6   = OFFSET_s390x_r7;
   layout->uu_arg7  = -1; /* impossible value */
   layout->uu_arg8  = -1; /* impossible value */
#else
#  error "getSyscallLayout: unknown arch"
#endif
}
   1276 
   1277 
   1278 /* ---------------------------------------------------------------------
   1279    The main driver logic
   1280    ------------------------------------------------------------------ */
   1281 
   1282 /* Finding the handlers for a given syscall, or faking up one
   1283    when no handler is found. */
   1284 
/* PRE handler used when no wrapper exists for a syscall: warn the
   user (with a stack trace at higher verbosities), and fail the
   syscall with ENOSYS without ever running it. */
static
void bad_before ( ThreadId              tid,
                  SyscallArgLayout*     layout,
                  /*MOD*/SyscallArgs*   args,
                  /*OUT*/SyscallStatus* status,
                  /*OUT*/UWord*         flags )
{
   VG_(dmsg)("WARNING: unhandled syscall: %s\n",
      VG_SYSNUM_STRING_EXTRA(args->sysno));
   if (VG_(clo_verbosity) > 1) {
      VG_(get_and_pp_StackTrace)(tid, VG_(clo_backtrace_size));
   }
   VG_(dmsg)("You may be able to write your own handler.\n");
   VG_(dmsg)("Read the file README_MISSING_SYSCALL_OR_IOCTL.\n");
   VG_(dmsg)("Nevertheless we consider this a bug.  Please report\n");
   VG_(dmsg)("it at http://valgrind.org/support/bug_reports.html.\n");

   /* Report ENOSYS, exactly as the kernel would for an unknown number. */
   SET_STATUS_Failure(VKI_ENOSYS);
}
   1304 
/* Fallback table entry for unknown syscalls: bad_before as the PRE
   handler, no POST handler. */
static SyscallTableEntry bad_sys =
   { bad_before, NULL };
   1307 
   1308 static const SyscallTableEntry* get_syscall_entry ( Int syscallno )
   1309 {
   1310    const SyscallTableEntry* sys = NULL;
   1311 
   1312 #  if defined(VGO_linux)
   1313    sys = ML_(get_linux_syscall_entry)( syscallno );
   1314 
   1315 #  elif defined(VGO_darwin)
   1316    Int idx = VG_DARWIN_SYSNO_INDEX(syscallno);
   1317 
   1318    switch (VG_DARWIN_SYSNO_CLASS(syscallno)) {
   1319    case VG_DARWIN_SYSCALL_CLASS_UNIX:
   1320       if (idx >= 0 && idx < ML_(syscall_table_size) &&
   1321           ML_(syscall_table)[idx].before != NULL)
   1322          sys = &ML_(syscall_table)[idx];
   1323          break;
   1324    case VG_DARWIN_SYSCALL_CLASS_MACH:
   1325       if (idx >= 0 && idx < ML_(mach_trap_table_size) &&
   1326           ML_(mach_trap_table)[idx].before != NULL)
   1327          sys = &ML_(mach_trap_table)[idx];
   1328          break;
   1329    case VG_DARWIN_SYSCALL_CLASS_MDEP:
   1330       if (idx >= 0 && idx < ML_(mdep_trap_table_size) &&
   1331           ML_(mdep_trap_table)[idx].before != NULL)
   1332          sys = &ML_(mdep_trap_table)[idx];
   1333          break;
   1334    default:
   1335       vg_assert(0);
   1336       break;
   1337    }
   1338 
   1339 #  else
   1340 #    error Unknown OS
   1341 #  endif
   1342 
   1343    return sys == NULL  ? &bad_sys  : sys;
   1344 }
   1345 
   1346 
/* Add and remove signals from mask so that we end up telling the
   kernel the state we actually want rather than what the client
   wants. */
static void sanitize_client_sigmask(vki_sigset_t *mask)
{
   /* SIGKILL and SIGSTOP cannot be blocked anyway; VG_SIGVGKILL is
      Valgrind's own thread-kill signal and must never be blocked. */
   VG_(sigdelset)(mask, VKI_SIGKILL);
   VG_(sigdelset)(mask, VKI_SIGSTOP);
   VG_(sigdelset)(mask, VG_SIGVGKILL); /* never block */
}
   1356 
/* Per-thread record of an in-flight (or most recent) syscall:
   the arguments as originally fetched, the possibly-modified copy
   the wrappers work on, the completion status, and wrapper flags. */
typedef
   struct {
      SyscallArgs   orig_args;   /* args as read from the guest state */
      SyscallArgs   args;        /* working copy, PRE handlers may modify */
      SyscallStatus status;      /* SsIdle / in-progress / SsComplete */
      UWord         flags;       /* per-syscall wrapper flags */
   }
   SyscallInfo;

/* One record per possible thread.  NOTE(review): this looks like it
   should be 'static' — only VG_(clear_syscallInfo) is exported, not
   the array itself; confirm no other translation unit references it. */
SyscallInfo syscallInfo[VG_N_THREADS];
   1367 
   1368 
   1369 /* The scheduler needs to be able to zero out these records after a
   1370    fork, hence this is exported from m_syswrap. */
   1371 void VG_(clear_syscallInfo) ( Int tid )
   1372 {
   1373    vg_assert(tid >= 0 && tid < VG_N_THREADS);
   1374    VG_(memset)( & syscallInfo[tid], 0, sizeof( syscallInfo[tid] ));
   1375    syscallInfo[tid].status.what = SsIdle;
   1376 }
   1377 
   1378 static void ensure_initialised ( void )
   1379 {
   1380    Int i;
   1381    static Bool init_done = False;
   1382    if (init_done)
   1383       return;
   1384    init_done = True;
   1385    for (i = 0; i < VG_N_THREADS; i++) {
   1386       VG_(clear_syscallInfo)( i );
   1387    }
   1388 }
   1389 
   1390 /* --- This is the main function of this file. --- */
   1391 
/* Execute one client syscall on behalf of thread 'tid', which trapped
   out of generated code with trap reason 'trc'.  This drives the whole
   syscall lifecycle: collect the args from the guest state into
   syscallInfo[tid], run the pre-handler, hand the call to the kernel
   (synchronously, or asynchronously with the big lock dropped if the
   pre-handler flags SfMayBlock), write the result back into the guest
   state, and finally run VG_(post_syscall). */
void VG_(client_syscall) ( ThreadId tid, UInt trc )
{
   Word                     sysno;
   ThreadState*             tst;
   const SyscallTableEntry* ent;
   SyscallArgLayout         layout;
   SyscallInfo*             sci;

   ensure_initialised();

   vg_assert(VG_(is_valid_tid)(tid));
   vg_assert(tid >= 1 && tid < VG_N_THREADS);
   vg_assert(VG_(is_running_thread)(tid));

   tst = VG_(get_ThreadState)(tid);

   /* BEGIN ensure root thread's stack is suitably mapped */
   /* In some rare circumstances, we may do the syscall without the
      bottom page of the stack being mapped, because the stack pointer
      was moved down just a few instructions before the syscall
      instruction, and there have been no memory references since
      then, that would cause a call to VG_(extend_stack) to have
      happened.

      In native execution that's OK: the kernel automagically extends
      the stack's mapped area down to cover the stack pointer (or sp -
      redzone, really).  In simulated normal execution that's OK too,
      since any signals we get from accessing below the mapped area of
      the (guest's) stack lead us to VG_(extend_stack), where we
      simulate the kernel's stack extension logic.  But that leaves
      the problem of entering a syscall with the SP unmapped.  Because
      the kernel doesn't know that the segment immediately above SP is
      supposed to be a grow-down segment, it causes the syscall to
      fail, and thereby causes a divergence between native behaviour
      (syscall succeeds) and simulated behaviour (syscall fails).

      This is quite a rare failure mode.  It has only been seen
      affecting calls to sys_readlink on amd64-linux, and even then it
      requires a certain code sequence around the syscall to trigger
      it.  Here is one:

      extern int my_readlink ( const char* path );
      asm(
      ".text\n"
      ".globl my_readlink\n"
      "my_readlink:\n"
      "\tsubq    $0x1008,%rsp\n"
      "\tmovq    %rdi,%rdi\n"              // path is in rdi
      "\tmovq    %rsp,%rsi\n"              // &buf[0] -> rsi
      "\tmovl    $0x1000,%edx\n"           // sizeof(buf) in rdx
      "\tmovl    $"__NR_READLINK",%eax\n"  // syscall number
      "\tsyscall\n"
      "\taddq    $0x1008,%rsp\n"
      "\tret\n"
      ".previous\n"
      );

      For more details, see bug #156404
      (https://bugs.kde.org/show_bug.cgi?id=156404).

      The fix is actually very simple.  We simply need to call
      VG_(extend_stack) for this thread, handing it the lowest
      possible valid address for stack (sp - redzone), to ensure the
      pages all the way down to that address, are mapped.  Because
      this is a potentially expensive and frequent operation, we
      filter in two ways:

      First, only the main thread (tid=1) has a growdown stack.  So
      ignore all others.  It is conceivable, although highly unlikely,
      that the main thread exits, and later another thread is
      allocated tid=1, but that's harmless, I believe;
      VG_(extend_stack) will do nothing when applied to a non-root
      thread.

      Secondly, first call VG_(am_find_nsegment) directly, to see if
      the page holding (sp - redzone) is mapped correctly.  If so, do
      nothing.  This is almost always the case.  VG_(extend_stack)
      calls VG_(am_find_nsegment) twice, so this optimisation -- and
      that's all it is -- more or less halves the number of calls to
      VG_(am_find_nsegment) required.

      TODO: the test "seg->kind == SkAnonC" is really inadequate,
      because although it tests whether the segment is mapped
      _somehow_, it doesn't check that it has the right permissions
      (r,w, maybe x) ?  We could test that here, but it will also be
      necessary to fix the corresponding test in VG_(extend_stack).

      All this guff is of course Linux-specific.  Hence the ifdef.
   */
#  if defined(VGO_linux)
   if (tid == 1/*ROOT THREAD*/) {
      Addr     stackMin   = VG_(get_SP)(tid) - VG_STACK_REDZONE_SZB;
      NSegment const* seg = VG_(am_find_nsegment)(stackMin);
      if (seg && seg->kind == SkAnonC) {
         /* stackMin is already mapped.  Nothing to do. */
      } else {
         (void)VG_(extend_stack)( stackMin,
                                  tst->client_stack_szB );
      }
   }
#  endif
   /* END ensure root thread's stack is suitably mapped */

   /* First off, get the syscall args and number.  This is a
      platform-dependent action. */

   sci = & syscallInfo[tid];
   vg_assert(sci->status.what == SsIdle);

   getSyscallArgsFromGuestState( &sci->orig_args, &tst->arch.vex, trc );

   /* Copy .orig_args to .args.  The pre-handler may modify .args, but
      we want to keep the originals too, just in case. */
   sci->args = sci->orig_args;

   /* Save the syscall number in the thread state in case the syscall
      is interrupted by a signal. */
   sysno = sci->orig_args.sysno;

   /* It's sometimes useful, as a crude debugging hack, to get a
      stack trace at each (or selected) syscalls. */
   if (0 && sysno == __NR_ioctl) {
      VG_(umsg)("\nioctl:\n");
      VG_(get_and_pp_StackTrace)(tid, 10);
      VG_(umsg)("\n");
   }

#  if defined(VGO_darwin)
   /* Record syscall class.  But why?  Because the syscall might be
      interrupted by a signal, and in the signal handler (which will
      be m_signals.async_signalhandler) we will need to build a SysRes
      reflecting the syscall return result.  In order to do that we
      need to know the syscall class.  Hence stash it in the guest
      state of this thread.  This madness is not needed on Linux
      because it only has a single syscall return convention and so
      there is no ambiguity involved in converting the post-signal
      machine state into a SysRes. */
   tst->arch.vex.guest_SC_CLASS = VG_DARWIN_SYSNO_CLASS(sysno);
#  endif

   /* The default what-to-do-next thing is hand the syscall to the
      kernel, so we pre-set that here.  Set .sres to something
      harmless looking (is irrelevant because .what is not
      SsComplete.) */
   sci->status.what = SsHandToKernel;
   sci->status.sres = VG_(mk_SysRes_Error)(0);
   sci->flags       = 0;

   /* Fetch the syscall's handlers.  If no handlers exist for this
      syscall, we are given dummy handlers which force an immediate
      return with ENOSYS. */
   ent = get_syscall_entry(sysno);

   /* Fetch the layout information, which tells us where in the guest
      state the syscall args reside.  This is a platform-dependent
      action.  This info is needed so that the scalar syscall argument
      checks (PRE_REG_READ calls) know which bits of the guest state
      they need to inspect. */
   getSyscallArgLayout( &layout );

   /* Make sure the tmp signal mask matches the real signal mask;
      sigsuspend may change this. */
   vg_assert(VG_(iseqsigset)(&tst->sig_mask, &tst->tmp_sig_mask));

   /* Right, we're finally ready to Party.  Call the pre-handler and
      see what we get back.  At this point:

        sci->status.what  is Unset (we don't know yet).
        sci->orig_args    contains the original args.
        sci->args         is the same as sci->orig_args.
        sci->flags        is zero.
   */

   PRINT("SYSCALL[%d,%d](%s) ",
      VG_(getpid)(), tid, VG_SYSNUM_STRING(sysno));

   /* Do any pre-syscall actions */
   if (VG_(needs).syscall_wrapper) {
      UWord tmpv[8];
      tmpv[0] = sci->orig_args.arg1;
      tmpv[1] = sci->orig_args.arg2;
      tmpv[2] = sci->orig_args.arg3;
      tmpv[3] = sci->orig_args.arg4;
      tmpv[4] = sci->orig_args.arg5;
      tmpv[5] = sci->orig_args.arg6;
      tmpv[6] = sci->orig_args.arg7;
      tmpv[7] = sci->orig_args.arg8;
      VG_TDICT_CALL(tool_pre_syscall, tid, sysno,
                    &tmpv[0], sizeof(tmpv)/sizeof(tmpv[0]));
   }

   vg_assert(ent);
   vg_assert(ent->before);
   (ent->before)( tid,
                  &layout,
                  &sci->args, &sci->status, &sci->flags );

   /* The pre-handler may have modified:
         sci->args
         sci->status
         sci->flags
      All else remains unchanged.
      Although the args may be modified, pre handlers are not allowed
      to change the syscall number.
   */
   /* Now we proceed according to what the pre-handler decided. */
   vg_assert(sci->status.what == SsHandToKernel
             || sci->status.what == SsComplete);
   vg_assert(sci->args.sysno == sci->orig_args.sysno);

   if (sci->status.what == SsComplete && !sr_isError(sci->status.sres)) {
      /* The pre-handler completed the syscall itself, declaring
         success. */
      if (sci->flags & SfNoWriteResult) {
         PRINT(" --> [pre-success] NoWriteResult");
      } else {
         PRINT(" --> [pre-success] Success(0x%llx:0x%llx)",
               (ULong)sr_ResHI(sci->status.sres),
               (ULong)sr_Res(sci->status.sres));
      }
      /* In this case the allowable flags are to ask for a signal-poll
         and/or a yield after the call.  Changing the args isn't
         allowed. */
      vg_assert(0 == (sci->flags
                      & ~(SfPollAfter | SfYieldAfter | SfNoWriteResult)));
      vg_assert(eq_SyscallArgs(&sci->args, &sci->orig_args));
   }

   else
   if (sci->status.what == SsComplete && sr_isError(sci->status.sres)) {
      /* The pre-handler decided to fail syscall itself. */
      PRINT(" --> [pre-fail] Failure(0x%llx)", (ULong)sr_Err(sci->status.sres));
      /* In this case, the pre-handler is also allowed to ask for the
         post-handler to be run anyway.  Changing the args is not
         allowed. */
      vg_assert(0 == (sci->flags & ~(SfMayBlock | SfPostOnFail | SfPollAfter)));
      vg_assert(eq_SyscallArgs(&sci->args, &sci->orig_args));
   }

   else
   if (sci->status.what != SsHandToKernel) {
      /* huh?! */
      vg_assert(0);
   }

   else /* (sci->status.what == HandToKernel) */ {
      /* Ok, this is the usual case -- and the complicated one.  There
         are two subcases: sync and async.  async is the general case
         and is to be used when there is any possibility that the
         syscall might block [a fact that the pre-handler must tell us
         via the sci->flags field.]  Because the tidying-away /
         context-switch overhead of the async case could be large, if
         we are sure that the syscall will not block, we fast-track it
         by doing it directly in this thread, which is a lot
         simpler. */

      /* Check that the given flags are allowable: MayBlock, PollAfter
         and PostOnFail are ok. */
      vg_assert(0 == (sci->flags & ~(SfMayBlock | SfPostOnFail | SfPollAfter)));

      if (sci->flags & SfMayBlock) {

         /* Syscall may block, so run it asynchronously */
         vki_sigset_t mask;

         PRINT(" --> [async] ... \n");

         mask = tst->sig_mask;
         sanitize_client_sigmask(&mask);

         /* Gack.  More impedance matching.  Copy the possibly
            modified syscall args back into the guest state. */
         /* JRS 2009-Mar-16: if the syscall args are possibly modified,
            then this assertion is senseless:
              vg_assert(eq_SyscallArgs(&sci->args, &sci->orig_args));
            The case that exposed it was sys_posix_spawn on Darwin,
            which heavily modifies its arguments but then lets the call
            go through anyway, with SfToBlock set, hence we end up here. */
         putSyscallArgsIntoGuestState( &sci->args, &tst->arch.vex );

         /* Drop the bigLock */
         VG_(release_BigLock)(tid, VgTs_WaitSys, "VG_(client_syscall)[async]");
         /* Urr.  We're now in a race against other threads trying to
            acquire the bigLock.  I guess that doesn't matter provided
            that do_syscall_for_client only touches thread-local
            state. */

         /* Do the call, which operates directly on the guest state,
            not on our abstracted copies of the args/result. */
         do_syscall_for_client(sysno, tst, &mask);

         /* do_syscall_for_client may not return if the syscall was
            interrupted by a signal.  In that case, flow of control is
            first to m_signals.async_signalhandler, which calls
            VG_(fixup_guest_state_after_syscall_interrupted), which
            fixes up the guest state, and possibly calls
            VG_(post_syscall).  Once that's done, control drops back
            to the scheduler.  */

         /* Darwin: do_syscall_for_client may not return if the
            syscall was workq_ops(WQOPS_THREAD_RETURN) and the kernel
            responded by starting the thread at wqthread_hijack(reuse=1)
            (to run another workqueue item). In that case, wqthread_hijack
            calls ML_(wqthread_continue), which is similar to
            VG_(fixup_guest_state_after_syscall_interrupted). */

         /* Reacquire the lock */
         VG_(acquire_BigLock)(tid, "VG_(client_syscall)[async]");

         /* Even more impedance matching.  Extract the syscall status
            from the guest state. */
         getSyscallStatusFromGuestState( &sci->status, &tst->arch.vex );
         vg_assert(sci->status.what == SsComplete);

         /* Be decorative, if required. */
         if (VG_(clo_trace_syscalls)) {
            Bool failed = sr_isError(sci->status.sres);
            if (failed) {
               PRINT("SYSCALL[%d,%d](%s) ... [async] --> Failure(0x%llx)",
                     VG_(getpid)(), tid, VG_SYSNUM_STRING(sysno),
                     (ULong)sr_Err(sci->status.sres));
            } else {
               PRINT("SYSCALL[%d,%d](%s) ... [async] --> "
                     "Success(0x%llx:0x%llx)",
                     VG_(getpid)(), tid, VG_SYSNUM_STRING(sysno),
                     (ULong)sr_ResHI(sci->status.sres),
                     (ULong)sr_Res(sci->status.sres) );
            }
         }

      } else {

         /* run the syscall directly */
         /* The pre-handler may have modified the syscall args, but
            since we're passing values in ->args directly to the
            kernel, there's no point in flushing them back to the
            guest state.  Indeed doing so could be construed as
            incorrect. */
         SysRes sres
            = VG_(do_syscall)(sysno, sci->args.arg1, sci->args.arg2,
                                     sci->args.arg3, sci->args.arg4,
                                     sci->args.arg5, sci->args.arg6,
                                     sci->args.arg7, sci->args.arg8 );
         sci->status = convert_SysRes_to_SyscallStatus(sres);

         /* Be decorative, if required. */
         if (VG_(clo_trace_syscalls)) {
            Bool failed = sr_isError(sci->status.sres);
            if (failed) {
               PRINT("[sync] --> Failure(0x%llx)",
                     (ULong)sr_Err(sci->status.sres) );
            } else {
               PRINT("[sync] --> Success(0x%llx:0x%llx)",
                     (ULong)sr_ResHI(sci->status.sres),
                     (ULong)sr_Res(sci->status.sres) );
            }
         }
      }
   }

   vg_assert(sci->status.what == SsComplete);

   vg_assert(VG_(is_running_thread)(tid));

   /* Dump the syscall result back in the guest state.  This is
      a platform-specific action. */
   if (!(sci->flags & SfNoWriteResult))
      putSyscallStatusIntoGuestState( tid, &sci->status, &tst->arch.vex );

   /* Situation now:
      - the guest state is now correctly modified following the syscall
      - modified args, original args and syscall status are still
        available in the syscallInfo[] entry for this syscall.

      Now go on to do the post-syscall actions (read on down ..)
   */
   PRINT(" ");
   VG_(post_syscall)(tid);
   PRINT("\n");
}
   1772 
   1773 
   1774 /* Perform post syscall actions.  The expected state on entry is
   1775    precisely as at the end of VG_(client_syscall), that is:
   1776 
   1777    - guest state up to date following the syscall
   1778    - modified args, original args and syscall status are still
   1779      available in the syscallInfo[] entry for this syscall.
   1780    - syscall status matches what's in the guest state.
   1781 
   1782    There are two ways to get here: the normal way -- being called by
   1783    VG_(client_syscall), and the unusual way, from
   1784    VG_(fixup_guest_state_after_syscall_interrupted).
   1785    Darwin: there's a third way, ML_(wqthread_continue).
   1786 */
/* Finish off a completed syscall for thread 'tid': sanity-check the
   recorded status against the guest state, run the post-handler (on
   success, or on failure if SfPostOnFail was set), re-commit the
   possibly-changed status to the guest state, notify the tool, and
   honour any SfPollAfter / SfYieldAfter requests.  Safe to call even
   when no syscall is in progress (then it just resets to SsIdle). */
void VG_(post_syscall) (ThreadId tid)
{
   SyscallInfo*             sci;
   const SyscallTableEntry* ent;
   SyscallStatus            test_status;
   ThreadState*             tst;
   Word sysno;

   /* Preliminaries */
   vg_assert(VG_(is_valid_tid)(tid));
   vg_assert(tid >= 1 && tid < VG_N_THREADS);
   vg_assert(VG_(is_running_thread)(tid));

   tst = VG_(get_ThreadState)(tid);
   sci = & syscallInfo[tid];

   /* m_signals.sigvgkill_handler might call here even when not in
      a syscall. */
   if (sci->status.what == SsIdle || sci->status.what == SsHandToKernel) {
      sci->status.what = SsIdle;
      return;
   }

   /* Validate current syscallInfo entry.  In particular we require
      that the current .status matches what's actually in the guest
      state.  At least in the normal case where we have actually
      previously written the result into the guest state. */
   vg_assert(sci->status.what == SsComplete);

   getSyscallStatusFromGuestState( &test_status, &tst->arch.vex );
   if (!(sci->flags & SfNoWriteResult))
      vg_assert(eq_SyscallStatus( &sci->status, &test_status ));
   /* Failure of the above assertion on Darwin can indicate a problem
      in the syscall wrappers that pre-fail or pre-succeed the
      syscall, by calling SET_STATUS_Success or SET_STATUS_Failure,
      when they really should call SET_STATUS_from_SysRes.  The former
      create a UNIX-class syscall result on Darwin, which may not be
      correct for the syscall; if that's the case then this assertion
      fires.  See PRE(thread_fast_set_cthread_self) for an example.  On
      non-Darwin platforms this assertion should never fail, and this
      comment is completely irrelevant. */
   /* Ok, looks sane */

   /* Get the system call number.  Because the pre-handler isn't
      allowed to mess with it, it should be the same for both the
      original and potentially-modified args. */
   vg_assert(sci->args.sysno == sci->orig_args.sysno);
   sysno = sci->args.sysno;
   ent = get_syscall_entry(sysno);

   /* pre: status == Complete (asserted above) */
   /* Consider either success or failure.  Now run the post handler if:
      - it exists, and
      - Success or (Failure and PostOnFail is set)
   */
   if (ent->after
       && ((!sr_isError(sci->status.sres))
           || (sr_isError(sci->status.sres)
               && (sci->flags & SfPostOnFail) ))) {

      (ent->after)( tid, &sci->args, &sci->status );
   }

   /* Because the post handler might have changed the status (eg, the
      post-handler for sys_open can change the result from success to
      failure if the kernel supplied a fd that it doesn't like), once
      again dump the syscall result back in the guest state.*/
   if (!(sci->flags & SfNoWriteResult))
      putSyscallStatusIntoGuestState( tid, &sci->status, &tst->arch.vex );

   /* Do any post-syscall actions required by the tool. */
   if (VG_(needs).syscall_wrapper) {
      UWord tmpv[8];
      tmpv[0] = sci->orig_args.arg1;
      tmpv[1] = sci->orig_args.arg2;
      tmpv[2] = sci->orig_args.arg3;
      tmpv[3] = sci->orig_args.arg4;
      tmpv[4] = sci->orig_args.arg5;
      tmpv[5] = sci->orig_args.arg6;
      tmpv[6] = sci->orig_args.arg7;
      tmpv[7] = sci->orig_args.arg8;
      VG_TDICT_CALL(tool_post_syscall, tid,
                    sysno,
                    &tmpv[0], sizeof(tmpv)/sizeof(tmpv[0]),
                    sci->status.sres);
   }

   /* The syscall is done. */
   vg_assert(sci->status.what == SsComplete);
   sci->status.what = SsIdle;

   /* The pre/post wrappers may have concluded that pending signals
      might have been created, and will have set SfPollAfter to
      request a poll for them once the syscall is done. */
   if (sci->flags & SfPollAfter)
      VG_(poll_signals)(tid);

   /* Similarly, the wrappers might have asked for a yield
      afterwards. */
   if (sci->flags & SfYieldAfter)
      VG_(vg_yield)();
}
   1889 
   1890 
   1891 /* ---------------------------------------------------------------------
   1892    Dealing with syscalls which get interrupted by a signal:
   1893    VG_(fixup_guest_state_after_syscall_interrupted)
   1894    ------------------------------------------------------------------ */
   1895 
   1896 /* Syscalls done on behalf of the client are finally handed off to the
   1897    kernel in VG_(client_syscall) above, either by calling
   1898    do_syscall_for_client (the async case), or by calling
   1899    VG_(do_syscall6) (the sync case).
   1900 
   1901    If the syscall is not interrupted by a signal (it may block and
   1902    later unblock, but that's irrelevant here) then those functions
   1903    eventually return and so control is passed to VG_(post_syscall).
   1904    NB: not sure if the sync case can actually get interrupted, as it
   1905    operates with all signals masked.
   1906 
   1907    However, the syscall may get interrupted by an async-signal.  In
   1908    that case do_syscall_for_client/VG_(do_syscall6) do not
   1909    return.  Instead we wind up in m_signals.async_sighandler.  We need
   1910    to fix up the guest state to make it look like the syscall was
   1911    interrupted for guest.  So async_sighandler calls here, and this
   1912    does the fixup.  Note that from here we wind up calling
   1913    VG_(post_syscall) too.
   1914 */
   1915 
   1916 
   1917 /* These are addresses within ML_(do_syscall_for_client_WRK).  See
   1918    syscall-$PLAT.S for details.
   1919 */
   1920 #if defined(VGO_linux)
   1921   extern const Addr ML_(blksys_setup);
   1922   extern const Addr ML_(blksys_restart);
   1923   extern const Addr ML_(blksys_complete);
   1924   extern const Addr ML_(blksys_committed);
   1925   extern const Addr ML_(blksys_finished);
   1926 #elif defined(VGO_darwin)
  /* Darwin requires extra ugliness */
   1928   extern const Addr ML_(blksys_setup_MACH);
   1929   extern const Addr ML_(blksys_restart_MACH);
   1930   extern const Addr ML_(blksys_complete_MACH);
   1931   extern const Addr ML_(blksys_committed_MACH);
   1932   extern const Addr ML_(blksys_finished_MACH);
   1933   extern const Addr ML_(blksys_setup_MDEP);
   1934   extern const Addr ML_(blksys_restart_MDEP);
   1935   extern const Addr ML_(blksys_complete_MDEP);
   1936   extern const Addr ML_(blksys_committed_MDEP);
   1937   extern const Addr ML_(blksys_finished_MDEP);
   1938   extern const Addr ML_(blksys_setup_UNIX);
   1939   extern const Addr ML_(blksys_restart_UNIX);
   1940   extern const Addr ML_(blksys_complete_UNIX);
   1941   extern const Addr ML_(blksys_committed_UNIX);
   1942   extern const Addr ML_(blksys_finished_UNIX);
   1943 #else
   1944 # error "Unknown OS"
   1945 #endif
   1946 
   1947 
   1948 /* Back up guest state to restart a system call. */
   1949 
/* Wind the guest program counter back over the just-executed syscall
   instruction, so that when the thread resumes it re-executes the
   syscall (used when an interrupted syscall must be restarted).  Each
   per-platform branch also asserts that the bytes at the backed-up PC
   really encode that platform's syscall instruction, as a sanity check
   on the caller. */
void ML_(fixup_guest_state_to_restart_syscall) ( ThreadArchState* arch )
{
#if defined(VGP_x86_linux)
   arch->vex.guest_EIP -= 2;             // sizeof(int $0x80)

   /* Make sure our caller is actually sane, and we're really backing
      back over a syscall.

      int $0x80 == CD 80
   */
   {
      UChar *p = (UChar *)arch->vex.guest_EIP;

      if (p[0] != 0xcd || p[1] != 0x80)
         VG_(message)(Vg_DebugMsg,
                      "?! restarting over syscall at %#x %02x %02x\n",
                      arch->vex.guest_EIP, p[0], p[1]);

      vg_assert(p[0] == 0xcd && p[1] == 0x80);
   }

#elif defined(VGP_amd64_linux)
   arch->vex.guest_RIP -= 2;             // sizeof(syscall)

   /* Make sure our caller is actually sane, and we're really backing
      back over a syscall.

      syscall == 0F 05
   */
   {
      UChar *p = (UChar *)arch->vex.guest_RIP;

      if (p[0] != 0x0F || p[1] != 0x05)
         VG_(message)(Vg_DebugMsg,
                      "?! restarting over syscall at %#llx %02x %02x\n",
                      arch->vex.guest_RIP, p[0], p[1]);

      vg_assert(p[0] == 0x0F && p[1] == 0x05);
   }

#elif defined(VGP_ppc32_linux) || defined(VGP_ppc64_linux)
   arch->vex.guest_CIA -= 4;             // sizeof(ppc32 instr)

   /* Make sure our caller is actually sane, and we're really backing
      back over a syscall.

      sc == 44 00 00 02
   */
   {
      UChar *p = (UChar *)arch->vex.guest_CIA;

      if (p[0] != 0x44 || p[1] != 0x0 || p[2] != 0x0 || p[3] != 0x02)
         VG_(message)(Vg_DebugMsg,
                      "?! restarting over syscall at %#llx %02x %02x %02x %02x\n",
                      arch->vex.guest_CIA + 0ULL, p[0], p[1], p[2], p[3]);

      vg_assert(p[0] == 0x44 && p[1] == 0x0 && p[2] == 0x0 && p[3] == 0x2);
   }

#elif defined(VGP_arm_linux)
   if (arch->vex.guest_R15T & 1) {
      // Thumb mode.  SVC is a encoded as
      //   1101 1111 imm8
      // where imm8 is the SVC number, and we only accept 0.
      arch->vex.guest_R15T -= 2;   // sizeof(thumb 16 bit insn)
      UChar* p     = (UChar*)(arch->vex.guest_R15T - 1);
      Bool   valid = p[0] == 0 && p[1] == 0xDF;
      if (!valid) {
         VG_(message)(Vg_DebugMsg,
                      "?! restarting over (Thumb) syscall that is not syscall "
                      "at %#llx %02x %02x\n",
                      arch->vex.guest_R15T - 1ULL, p[0], p[1]);
      }
      vg_assert(valid);
      // FIXME: NOTE, this really isn't right.  We need to back up
      // ITSTATE to what it was before the SVC instruction, but we
      // don't know what it was.  At least assert that it is now
      // zero, because if it is nonzero then it must also have
      // been nonzero for the SVC itself, which means it was
      // conditional.  Urk.
      vg_assert(arch->vex.guest_ITSTATE == 0);
   } else {
      // ARM mode.  SVC is encoded as
      //   cond 1111 imm24
      // where imm24 is the SVC number, and we only accept 0.
      arch->vex.guest_R15T -= 4;   // sizeof(arm instr)
      UChar* p     = (UChar*)arch->vex.guest_R15T;
      Bool   valid = p[0] == 0 && p[1] == 0 && p[2] == 0
                     && (p[3] & 0xF) == 0xF;
      if (!valid) {
         VG_(message)(Vg_DebugMsg,
                      "?! restarting over (ARM) syscall that is not syscall "
                      "at %#llx %02x %02x %02x %02x\n",
                      arch->vex.guest_R15T + 0ULL, p[0], p[1], p[2], p[3]);
      }
      vg_assert(valid);
   }

#elif defined(VGP_arm64_linux)
   arch->vex.guest_PC -= 4;             // sizeof(arm64 instr)

   /* Make sure our caller is actually sane, and we're really backing
      back over a syscall.

      svc #0 == d4 00 00 01
   */
   {
      UChar *p = (UChar *)arch->vex.guest_PC;

      if (p[0] != 0x01 || p[1] != 0x00 || p[2] != 0x00 || p[3] != 0xD4)
         VG_(message)(
            Vg_DebugMsg,
            "?! restarting over syscall at %#llx %02x %02x %02x %02x\n",
            arch->vex.guest_PC + 0ULL, p[0], p[1], p[2], p[3]
          );

      vg_assert(p[0] == 0x01 && p[1] == 0x00 && p[2] == 0x00 && p[3] == 0xD4);
   }

#elif defined(VGP_x86_darwin)
   arch->vex.guest_EIP = arch->vex.guest_IP_AT_SYSCALL;

   /* Make sure our caller is actually sane, and we're really backing
      back over a syscall.

      int $0x80 == CD 80
      int $0x81 == CD 81
      int $0x82 == CD 82
      sysenter  == 0F 34
   */
   {
       UChar *p = (UChar *)arch->vex.guest_EIP;
       Bool  ok = (p[0] == 0xCD && p[1] == 0x80)
                  || (p[0] == 0xCD && p[1] == 0x81)
                  || (p[0] == 0xCD && p[1] == 0x82)
                  || (p[0] == 0x0F && p[1] == 0x34);
       if (!ok)
           VG_(message)(Vg_DebugMsg,
                        "?! restarting over syscall at %#x %02x %02x\n",
                        arch->vex.guest_EIP, p[0], p[1]);
       vg_assert(ok);
   }

#elif defined(VGP_amd64_darwin)
   // DDD: #warning GrP fixme amd64 restart unimplemented
   vg_assert(0);

#elif defined(VGP_s390x_linux)
   arch->vex.guest_IA -= 2;             // sizeof(syscall)

   /* Make sure our caller is actually sane, and we're really backing
      back over a syscall.

      syscall == 0A <num>
   */
   {
      UChar *p = (UChar *)arch->vex.guest_IA;
      if (p[0] != 0x0A)
         VG_(message)(Vg_DebugMsg,
                      "?! restarting over syscall at %#llx %02x %02x\n",
                      arch->vex.guest_IA, p[0], p[1]);

      vg_assert(p[0] == 0x0A);
   }

#elif defined(VGP_mips32_linux) || defined(VGP_mips64_linux)

   arch->vex.guest_PC -= 4;             // sizeof(mips instr)

   /* Make sure our caller is actually sane, and we're really backing
      back over a syscall.

      syscall == 00 00 00 0C
      big endian
      syscall == 0C 00 00 00
   */
   {
      UChar *p = (UChar *)(arch->vex.guest_PC);
#     if defined (VG_LITTLEENDIAN)
      if (p[0] != 0x0c || p[1] != 0x00 || p[2] != 0x00 || p[3] != 0x00)
         VG_(message)(Vg_DebugMsg,
                      "?! restarting over syscall at %#llx %02x %02x %02x %02x\n",
                      (ULong)arch->vex.guest_PC, p[0], p[1], p[2], p[3]);

      vg_assert(p[0] == 0x0c && p[1] == 0x00 && p[2] == 0x00 && p[3] == 0x00);
#     elif defined (VG_BIGENDIAN)
      if (p[0] != 0x00 || p[1] != 0x00 || p[2] != 0x00 || p[3] != 0x0c)
         VG_(message)(Vg_DebugMsg,
                      "?! restarting over syscall at %#llx %02x %02x %02x %02x\n",
                      (ULong)arch->vex.guest_PC, p[0], p[1], p[2], p[3]);

      vg_assert(p[0] == 0x00 && p[1] == 0x00 && p[2] == 0x00 && p[3] == 0x0c);
#     else
#        error "Unknown endianness"
#     endif
   }

#else
#  error "ML_(fixup_guest_state_to_restart_syscall): unknown plat"
#endif
}
   2151 
   2152 
   2153 /*
   2154    Fix up the guest state when a syscall is interrupted by a signal
   2155    and so has been forced to return 'sysret'.
   2156 
   2157    To do this, we determine the precise state of the syscall by
   2158    looking at the (real) IP at the time the signal happened.  The
   2159    syscall sequence looks like:
   2160 
   2161      1. unblock signals
   2162      2. perform syscall
   2163      3. save result to guest state (EAX, RAX, R3+CR0.SO, R0, V0)
   2164      4. re-block signals
   2165 
   2166    If a signal
   2167    happens at      Then     Why?
   [1-2)           restart  nothing has happened (restart syscall)
   [2]             restart  syscall hasn't started, or kernel wants to restart
   [2-3)           save     syscall complete, but results not saved
   [3-4)           nothing  syscall complete, results saved
   2172 
   2173    Sometimes we never want to restart an interrupted syscall (because
   2174    sigaction says not to), so we only restart if "restart" is True.
   2175 
   2176    This will also call VG_(post_syscall) if the syscall has actually
   2177    completed (either because it was interrupted, or because it
   2178    actually finished).  It will not call VG_(post_syscall) if the
   2179    syscall is set up for restart, which means that the pre-wrapper may
   2180    get called multiple times.
   2181 */
   2182 
void
VG_(fixup_guest_state_after_syscall_interrupted)( ThreadId tid,
                                                  Addr     ip,
                                                  SysRes   sres,
                                                  Bool     restart)
{
   /* Note that we don't know the syscall number here, since (1) in
      general there's no reliable way to get hold of it short of
      stashing it in the guest state before the syscall, and (2) in
      any case we don't need to know it for the actions done by this
      routine.

      Furthermore, 'sres' is only used in the case where the syscall
      is complete, but the result has not been committed to the guest
      state yet.  In any other situation it will be meaningless and
      therefore ignored. */

   ThreadState*     tst;
   SyscallStatus    canonical;
   ThreadArchState* th_regs;
   SyscallInfo*     sci;

   /* Compute some Booleans indicating which range we're in.  The
      ranges are delimited by the ML_(blksys_*) labels, which mark the
      numbered phases of the do-syscall sequence in the platform's
      syscall assembly stub (see the phase list in the comment above
      this function). */
   Bool outside_range,
        in_setup_to_restart,      // [1,2) in the .S files
        at_restart,               // [2]   in the .S files
        in_complete_to_committed, // [3,4) in the .S files
        in_committed_to_finished; // [4,5) in the .S files

#  if defined(VGO_linux)
   outside_range
      = ip < ML_(blksys_setup) || ip >= ML_(blksys_finished);
   in_setup_to_restart
      = ip >= ML_(blksys_setup) && ip < ML_(blksys_restart);
   at_restart
      = ip == ML_(blksys_restart);
   in_complete_to_committed
      = ip >= ML_(blksys_complete) && ip < ML_(blksys_committed);
   in_committed_to_finished
      = ip >= ML_(blksys_committed) && ip < ML_(blksys_finished);
#  elif defined(VGO_darwin)
   /* Darwin has three separate syscall entry stubs (MACH, MDEP, UNIX),
      each with its own full set of phase labels, so each range is the
      union of the corresponding ranges across all three stubs. */
   outside_range
      =  (ip < ML_(blksys_setup_MACH) || ip >= ML_(blksys_finished_MACH))
      && (ip < ML_(blksys_setup_MDEP) || ip >= ML_(blksys_finished_MDEP))
      && (ip < ML_(blksys_setup_UNIX) || ip >= ML_(blksys_finished_UNIX));
   in_setup_to_restart
      =  (ip >= ML_(blksys_setup_MACH) && ip < ML_(blksys_restart_MACH))
      || (ip >= ML_(blksys_setup_MDEP) && ip < ML_(blksys_restart_MDEP))
      || (ip >= ML_(blksys_setup_UNIX) && ip < ML_(blksys_restart_UNIX));
   at_restart
      =  (ip == ML_(blksys_restart_MACH))
      || (ip == ML_(blksys_restart_MDEP))
      || (ip == ML_(blksys_restart_UNIX));
   in_complete_to_committed
      =  (ip >= ML_(blksys_complete_MACH) && ip < ML_(blksys_committed_MACH))
      || (ip >= ML_(blksys_complete_MDEP) && ip < ML_(blksys_committed_MDEP))
      || (ip >= ML_(blksys_complete_UNIX) && ip < ML_(blksys_committed_UNIX));
   in_committed_to_finished
      =  (ip >= ML_(blksys_committed_MACH) && ip < ML_(blksys_finished_MACH))
      || (ip >= ML_(blksys_committed_MDEP) && ip < ML_(blksys_finished_MDEP))
      || (ip >= ML_(blksys_committed_UNIX) && ip < ML_(blksys_finished_UNIX));
   /* Wasn't that just So Much Fun?  Does your head hurt yet?  Mine does. */
#  else
#    error "Unknown OS"
#  endif

   if (VG_(clo_trace_signals))
      VG_(message)( Vg_DebugMsg,
                    "interrupted_syscall: tid=%d, ip=0x%llx, "
                    "restart=%s, sres.isErr=%s, sres.val=%lld\n",
                    (Int)tid,
                    (ULong)ip,
                    restart ? "True" : "False",
                    sr_isError(sres) ? "True" : "False",
                    (Long)(sr_isError(sres) ? sr_Err(sres) : sr_Res(sres)) );

   /* Sanity: tid must denote a valid, currently-running thread. */
   vg_assert(VG_(is_valid_tid)(tid));
   vg_assert(tid >= 1 && tid < VG_N_THREADS);
   vg_assert(VG_(is_running_thread)(tid));

   tst     = VG_(get_ThreadState)(tid);
   th_regs = &tst->arch;
   sci     = & syscallInfo[tid];

   /* Figure out what the state of the syscall was by examining the
      (real) IP at the time of the signal, and act accordingly. */
   if (outside_range) {
      if (VG_(clo_trace_signals))
         VG_(message)( Vg_DebugMsg,
                       "  not in syscall at all: hmm, very suspicious\n" );
      /* Looks like we weren't in a syscall at all.  Hmm. */
      vg_assert(sci->status.what != SsIdle);
      return;
   }

   /* We should not be here unless this thread had first started up
      the machinery for a syscall by calling VG_(client_syscall).
      Hence: */
   vg_assert(sci->status.what != SsIdle);

   /* now, do one of four fixup actions, depending on where the IP has
      got to. */

   if (in_setup_to_restart) {
      /* syscall hasn't even started; go around again */
      if (VG_(clo_trace_signals))
         VG_(message)( Vg_DebugMsg, "  not started: restarting\n");
      vg_assert(sci->status.what == SsHandToKernel);
      ML_(fixup_guest_state_to_restart_syscall)(th_regs);
   }

   else
   if (at_restart) {
      /* We're either about to run the syscall, or it was interrupted
         and the kernel restarted it.  Restart if asked, otherwise
         EINTR it. */
      if (restart) {
         if (VG_(clo_trace_signals))
            VG_(message)( Vg_DebugMsg, "  at syscall instr: restarting\n");
         ML_(fixup_guest_state_to_restart_syscall)(th_regs);
      } else {
         if (VG_(clo_trace_signals))
            VG_(message)( Vg_DebugMsg, "  at syscall instr: returning EINTR\n");
         /* Fabricate an EINTR failure as the canonical result ... */
         canonical = convert_SysRes_to_SyscallStatus(
                        VG_(mk_SysRes_Error)( VKI_EINTR )
                     );
         /* ... and commit it to the guest's result register, unless
            the wrapper asked (via SfNoWriteResult) that the guest
            state be left untouched. */
         if (!(sci->flags & SfNoWriteResult))
            putSyscallStatusIntoGuestState( tid, &canonical, &th_regs->vex );
         sci->status = canonical;
         VG_(post_syscall)(tid);
      }
   }

   else
   if (in_complete_to_committed) {
      /* Syscall complete, but result hasn't been written back yet.
         Write the SysRes we were supplied with back to the guest
         state. */
      if (VG_(clo_trace_signals))
         VG_(message)( Vg_DebugMsg,
                       "  completed, but uncommitted: committing\n");
      canonical = convert_SysRes_to_SyscallStatus( sres );
      /* As above, honour SfNoWriteResult. */
      if (!(sci->flags & SfNoWriteResult))
         putSyscallStatusIntoGuestState( tid, &canonical, &th_regs->vex );
      sci->status = canonical;
      VG_(post_syscall)(tid);
   }

   else
   if (in_committed_to_finished) {
      /* Result committed, but the signal mask has not been restored;
         we expect our caller (the signal handler) will have fixed
         this up. */
      if (VG_(clo_trace_signals))
         VG_(message)( Vg_DebugMsg,
                       "  completed and committed: nothing to do\n");
      /* The result is already in the guest state; read it back so
         sci->status agrees with it before running the post-wrapper. */
      getSyscallStatusFromGuestState( &sci->status, &th_regs->vex );
      vg_assert(sci->status.what == SsComplete);
      VG_(post_syscall)(tid);
   }

   else
      VG_(core_panic)("?? strange syscall interrupt state?");

   /* In all cases, the syscall is now finished (even if we called
      ML_(fixup_guest_state_to_restart_syscall), since that just
      re-positions the guest's IP for another go at it).  So we need
      to record that fact. */
   sci->status.what = SsIdle;
}
   2353 
   2354 
   2355 #if defined(VGO_darwin)
   2356 // Clean up after workq_ops(WQOPS_THREAD_RETURN) jumped to wqthread_hijack.
   2357 // This is similar to VG_(fixup_guest_state_after_syscall_interrupted).
   2358 // This longjmps back to the scheduler.
/* Wind up the current workq_ops(WQOPS_THREAD_RETURN) syscall as if it
   had completed successfully (without writing a result into the guest
   state), run the post-syscall machinery, and then longjmp back into
   the scheduler.  Never returns to the caller. */
void ML_(wqthread_continue_NORETURN)(ThreadId tid)
{
   ThreadState*     tst;
   SyscallInfo*     sci;

   VG_(acquire_BigLock)(tid, "wqthread_continue_NORETURN");

   PRINT("SYSCALL[%d,%d](%s) workq_ops() starting new workqueue item\n",
         VG_(getpid)(), tid, VG_SYSNUM_STRING(__NR_workq_ops));

   /* Sanity: tid must denote a valid, currently-running thread. */
   vg_assert(VG_(is_valid_tid)(tid));
   vg_assert(tid >= 1 && tid < VG_N_THREADS);
   vg_assert(VG_(is_running_thread)(tid));

   tst     = VG_(get_ThreadState)(tid);
   sci     = & syscallInfo[tid];
   /* A syscall must actually be in progress on this thread. */
   vg_assert(sci->status.what != SsIdle);
   vg_assert(tst->os_state.wq_jmpbuf_valid);  // check this BEFORE post_syscall

   // Pretend the syscall completed normally, but don't touch the thread state.
   sci->status = convert_SysRes_to_SyscallStatus( VG_(mk_SysRes_Success)(0) );
   sci->flags |= SfNoWriteResult;
   VG_(post_syscall)(tid);

   /* Syscall bookkeeping done; mark the slot free before leaving. */
   sci->status.what = SsIdle;

   /* Hand control back to the scheduler loop via its jump buffer. */
   vg_assert(tst->sched_jmpbuf_valid);
   VG_MINIMAL_LONGJMP(tst->sched_jmpbuf);

   /* NOTREACHED */
   vg_assert(0);
}
   2391 #endif
   2392 
   2393 
   2394 /* ---------------------------------------------------------------------
   2395    A place to store the where-to-call-when-really-done pointer
   2396    ------------------------------------------------------------------ */
   2397 
// When the final thread is done, where shall I call to shutdown the
// system cleanly?  Is set once at startup (in m_main) and never
// changes after that.  Is basically a pointer to the exit
// continuation.  This is all just a nasty hack to avoid calling
// directly from m_syswrap to m_main at exit, since that would cause
// m_main to become part of a module cycle, which is silly.  The
// continuation receives the exiting thread's ThreadId and its
// VgSchedReturnCode, and does not return.
void (* VG_(address_of_m_main_shutdown_actions_NORETURN) )
       (ThreadId,VgSchedReturnCode)
   = NULL;
   2407 
   2408 /*--------------------------------------------------------------------*/
   2409 /*--- end                                                          ---*/
   2410 /*--------------------------------------------------------------------*/
   2411