      1 
      2 /*--------------------------------------------------------------------*/
      3 /*--- Thread scheduling.                               scheduler.c ---*/
      4 /*--------------------------------------------------------------------*/
      5 
      6 /*
      7    This file is part of Valgrind, a dynamic binary instrumentation
      8    framework.
      9 
     10    Copyright (C) 2000-2012 Julian Seward
      11       jseward@acm.org
     12 
     13    This program is free software; you can redistribute it and/or
     14    modify it under the terms of the GNU General Public License as
     15    published by the Free Software Foundation; either version 2 of the
     16    License, or (at your option) any later version.
     17 
     18    This program is distributed in the hope that it will be useful, but
     19    WITHOUT ANY WARRANTY; without even the implied warranty of
     20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     21    General Public License for more details.
     22 
     23    You should have received a copy of the GNU General Public License
     24    along with this program; if not, write to the Free Software
     25    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
     26    02111-1307, USA.
     27 
     28    The GNU General Public License is contained in the file COPYING.
     29 */
     30 
     31 /*
     32    Overview
     33 
     34    Valgrind tries to emulate the kernel's threading as closely as
     35    possible.  The client does all threading via the normal syscalls
     36    (on Linux: clone, etc).  Valgrind emulates this by creating exactly
     37    the same process structure as would be created without Valgrind.
     38    There are no extra threads.
     39 
     40    The main difference is that Valgrind only allows one client thread
     41    to run at once.  This is controlled with the CPU Big Lock,
     42    "the_BigLock".  Any time a thread wants to run client code or
     43    manipulate any shared state (which is anything other than its own
     44    ThreadState entry), it must hold the_BigLock.
     45 
     46    When a thread is about to block in a blocking syscall, it releases
     47    the_BigLock, and re-takes it when it becomes runnable again (either
     48    because the syscall finished, or we took a signal).
     49 
     50    VG_(scheduler) therefore runs in each thread.  It returns only when
     51    the thread is exiting, either because it exited itself, or it was
     52    told to exit by another thread.
     53 
     54    This file is almost entirely OS-independent.  The details of how
     55    the OS handles threading and signalling are abstracted away and
     56    implemented elsewhere.  [Some of the functions have worked their
     57    way back for the moment, until we do an OS port in earnest...]
     58 */
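/* A rough sketch (not real code) of the locking protocol described above,
   as it plays out around a blocking syscall; the real sequence lives in the
   syscall handling code and uses VG_(release_BigLock)/VG_(acquire_BigLock),
   defined below:

      VG_(release_BigLock)(tid, VgTs_WaitSys, "syscall");
      ... do the kernel syscall; this may block, and another thread
          may take the_BigLock and run in the meantime ...
      VG_(acquire_BigLock)(tid, "syscall done");
*/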
     59 
     60 
     61 #include "pub_core_basics.h"
     62 #include "pub_core_debuglog.h"
     63 #include "pub_core_vki.h"
     64 #include "pub_core_vkiscnums.h"    // __NR_sched_yield
     65 #include "pub_core_libcsetjmp.h"   // to keep _threadstate.h happy
     66 #include "pub_core_threadstate.h"
     67 #include "pub_core_aspacemgr.h"
     68 #include "pub_core_clreq.h"         // for VG_USERREQ__*
     69 #include "pub_core_dispatch.h"
     70 #include "pub_core_errormgr.h"      // For VG_(get_n_errs_found)()
     71 #include "pub_core_gdbserver.h"     // for VG_(gdbserver) and VG_(gdbserver_activity)
     72 #include "pub_core_libcbase.h"
     73 #include "pub_core_libcassert.h"
     74 #include "pub_core_libcprint.h"
     75 #include "pub_core_libcproc.h"
     76 #include "pub_core_libcsignal.h"
     77 #if defined(VGO_darwin)
     78 #include "pub_core_mach.h"
     79 #endif
     80 #include "pub_core_machine.h"
     81 #include "pub_core_mallocfree.h"
     82 #include "pub_core_options.h"
     83 #include "pub_core_replacemalloc.h"
     84 #include "pub_core_signals.h"
     85 #include "pub_core_stacks.h"
     86 #include "pub_core_stacktrace.h"    // For VG_(get_and_pp_StackTrace)()
     87 #include "pub_core_syscall.h"
     88 #include "pub_core_syswrap.h"
     89 #include "pub_core_tooliface.h"
     90 #include "pub_core_translate.h"     // For VG_(translate)()
     91 #include "pub_core_transtab.h"
     92 #include "pub_core_debuginfo.h"     // VG_(di_notify_pdb_debuginfo)
     93 #include "priv_sched-lock.h"
     94 #include "pub_core_scheduler.h"     // self
     95 #include "pub_core_redir.h"
     96 
     97 
     98 /* ---------------------------------------------------------------------
     99    Types and globals for the scheduler.
    100    ------------------------------------------------------------------ */
    101 
     102 /* ThreadId and ThreadState are defined elsewhere. */
    103 
    104 /* Defines the thread-scheduling timeslice, in terms of the number of
    105    basic blocks we attempt to run each thread for.  Smaller values
    106    give finer interleaving but much increased scheduling overheads. */
    107 #define SCHEDULING_QUANTUM   100000
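/* Note: VG_(scheduler) (below) loads this value into its local
   dispatch_ctr at the start of each timeslice; run_thread_for_a_while
   then hands that counter to the dispatcher, which counts blocks off
   against it and eventually comes back with VG_TRC_INNER_COUNTERZERO
   when the timeslice is used up. */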
    108 
    109 /* If False, a fault is Valgrind-internal (ie, a bug) */
    110 Bool VG_(in_generated_code) = False;
    111 
    112 /* 64-bit counter for the number of basic blocks done. */
    113 static ULong bbs_done = 0;
    114 
     115 /* Threshold for polling vgdb (gdbserver) activity.
     116    When the number of bbs done reaches vgdb_next_poll, the scheduler
     117    will poll for gdbserver activity.  VG_(force_vgdb_poll) and
     118    VG_(disable_vgdb_poll) allow the valgrind core (e.g. m_gdbserver)
     119    to control when the next poll will be done. */
    120 static ULong vgdb_next_poll;
    121 
    122 /* Forwards */
    123 static void do_client_request ( ThreadId tid );
    124 static void scheduler_sanity ( ThreadId tid );
    125 static void mostly_clear_thread_record ( ThreadId tid );
    126 
    127 /* Stats. */
    128 static ULong n_scheduling_events_MINOR = 0;
    129 static ULong n_scheduling_events_MAJOR = 0;
    130 
    131 /* Stats: number of XIndirs, and number that missed in the fast
    132    cache. */
    133 static ULong stats__n_xindirs = 0;
    134 static ULong stats__n_xindir_misses = 0;
    135 
    136 /* And 32-bit temp bins for the above, so that 32-bit platforms don't
    137    have to do 64 bit incs on the hot path through
     138    VG_(disp_cp_xindir). */
    139 /*global*/ UInt VG_(stats__n_xindirs_32) = 0;
    140 /*global*/ UInt VG_(stats__n_xindir_misses_32) = 0;
    141 
    142 /* Sanity checking counts. */
    143 static UInt sanity_fast_count = 0;
    144 static UInt sanity_slow_count = 0;
    145 
    146 void VG_(print_scheduler_stats)(void)
    147 {
    148    VG_(message)(Vg_DebugMsg,
    149       "scheduler: %'llu event checks.\n", bbs_done );
    150    VG_(message)(Vg_DebugMsg,
    151                 "scheduler: %'llu indir transfers, %'llu misses (1 in %llu)\n",
    152                 stats__n_xindirs, stats__n_xindir_misses,
    153                 stats__n_xindirs / (stats__n_xindir_misses
    154                                     ? stats__n_xindir_misses : 1));
    155    VG_(message)(Vg_DebugMsg,
    156       "scheduler: %'llu/%'llu major/minor sched events.\n",
    157       n_scheduling_events_MAJOR, n_scheduling_events_MINOR);
    158    VG_(message)(Vg_DebugMsg,
    159                 "   sanity: %d cheap, %d expensive checks.\n",
    160                 sanity_fast_count, sanity_slow_count );
    161 }
    162 
    163 /*
    164  * Mutual exclusion object used to serialize threads.
    165  */
    166 static struct sched_lock *the_BigLock;
    167 
    168 
    169 /* ---------------------------------------------------------------------
    170    Helper functions for the scheduler.
    171    ------------------------------------------------------------------ */
    172 
    173 static
    174 void print_sched_event ( ThreadId tid, Char* what )
    175 {
    176    VG_(message)(Vg_DebugMsg, "  SCHED[%d]: %s\n", tid, what );
    177 }
    178 
    179 /* For showing SB counts, if the user asks to see them. */
    180 #define SHOW_SBCOUNT_EVERY (20ULL * 1000 * 1000)
    181 static ULong bbs_done_lastcheck = 0;
    182 
    183 static
    184 void maybe_show_sb_counts ( void )
    185 {
    186    Long delta = bbs_done - bbs_done_lastcheck;
    187    vg_assert(delta >= 0);
    188    if (UNLIKELY(delta >= SHOW_SBCOUNT_EVERY)) {
    189       VG_(umsg)("%'lld superblocks executed\n", bbs_done);
    190       bbs_done_lastcheck = bbs_done;
    191    }
    192 }
    193 
    194 static
    195 HChar* name_of_sched_event ( UInt event )
    196 {
    197    switch (event) {
    198       case VEX_TRC_JMP_TINVAL:         return "TINVAL";
    199       case VEX_TRC_JMP_NOREDIR:        return "NOREDIR";
    200       case VEX_TRC_JMP_SIGTRAP:        return "SIGTRAP";
    201       case VEX_TRC_JMP_SIGSEGV:        return "SIGSEGV";
    202       case VEX_TRC_JMP_SIGBUS:         return "SIGBUS";
    203       case VEX_TRC_JMP_EMWARN:         return "EMWARN";
    204       case VEX_TRC_JMP_EMFAIL:         return "EMFAIL";
    205       case VEX_TRC_JMP_CLIENTREQ:      return "CLIENTREQ";
    206       case VEX_TRC_JMP_YIELD:          return "YIELD";
    207       case VEX_TRC_JMP_NODECODE:       return "NODECODE";
    208       case VEX_TRC_JMP_MAPFAIL:        return "MAPFAIL";
    209       case VEX_TRC_JMP_SYS_SYSCALL:    return "SYSCALL";
    210       case VEX_TRC_JMP_SYS_INT32:      return "INT32";
    211       case VEX_TRC_JMP_SYS_INT128:     return "INT128";
    212       case VEX_TRC_JMP_SYS_INT129:     return "INT129";
    213       case VEX_TRC_JMP_SYS_INT130:     return "INT130";
    214       case VEX_TRC_JMP_SYS_SYSENTER:   return "SYSENTER";
    215       case VEX_TRC_JMP_BORING:         return "VEX_BORING";
    216 
    217       case VG_TRC_BORING:              return "VG_BORING";
    218       case VG_TRC_INNER_FASTMISS:      return "FASTMISS";
    219       case VG_TRC_INNER_COUNTERZERO:   return "COUNTERZERO";
    220       case VG_TRC_FAULT_SIGNAL:        return "FAULTSIGNAL";
    221       case VG_TRC_INVARIANT_FAILED:    return "INVFAILED";
    222       case VG_TRC_CHAIN_ME_TO_SLOW_EP: return "CHAIN_ME_SLOW";
    223       case VG_TRC_CHAIN_ME_TO_FAST_EP: return "CHAIN_ME_FAST";
    224       default:                         return "??UNKNOWN??";
    225   }
    226 }
    227 
    228 /* Allocate a completely empty ThreadState record. */
    229 ThreadId VG_(alloc_ThreadState) ( void )
    230 {
    231    Int i;
    232    for (i = 1; i < VG_N_THREADS; i++) {
    233       if (VG_(threads)[i].status == VgTs_Empty) {
    234 	 VG_(threads)[i].status = VgTs_Init;
    235 	 VG_(threads)[i].exitreason = VgSrc_None;
    236          return i;
    237       }
    238    }
    239    VG_(printf)("vg_alloc_ThreadState: no free slots available\n");
    240    VG_(printf)("Increase VG_N_THREADS, rebuild and try again.\n");
    241    VG_(core_panic)("VG_N_THREADS is too low");
    242    /*NOTREACHED*/
    243 }
    244 
    245 /*
    246    Mark a thread as Runnable.  This will block until the_BigLock is
    247    available, so that we get exclusive access to all the shared
    248    structures and the CPU.  Up until we get the_BigLock, we must not
    249    touch any shared state.
    250 
    251    When this returns, we'll actually be running.
    252  */
    253 void VG_(acquire_BigLock)(ThreadId tid, HChar* who)
    254 {
    255    ThreadState *tst;
    256 
    257 #if 0
    258    if (VG_(clo_trace_sched)) {
    259       HChar buf[100];
    260       vg_assert(VG_(strlen)(who) <= 100-50);
    261       VG_(sprintf)(buf, "waiting for lock (%s)", who);
    262       print_sched_event(tid, buf);
    263    }
    264 #endif
    265 
    266    /* First, acquire the_BigLock.  We can't do anything else safely
    267       prior to this point.  Even doing debug printing prior to this
    268       point is, technically, wrong. */
    269    VG_(acquire_BigLock_LL)(NULL);
    270 
    271    tst = VG_(get_ThreadState)(tid);
    272 
    273    vg_assert(tst->status != VgTs_Runnable);
    274 
    275    tst->status = VgTs_Runnable;
    276 
    277    if (VG_(running_tid) != VG_INVALID_THREADID)
    278       VG_(printf)("tid %d found %d running\n", tid, VG_(running_tid));
    279    vg_assert(VG_(running_tid) == VG_INVALID_THREADID);
    280    VG_(running_tid) = tid;
    281 
    282    { Addr gsp = VG_(get_SP)(tid);
    283      VG_(unknown_SP_update)(gsp, gsp, 0/*unknown origin*/);
    284    }
    285 
    286    if (VG_(clo_trace_sched)) {
    287       HChar buf[150];
    288       vg_assert(VG_(strlen)(who) <= 150-50);
    289       VG_(sprintf)(buf, " acquired lock (%s)", who);
    290       print_sched_event(tid, buf);
    291    }
    292 }
    293 
    294 /*
    295    Set a thread into a sleeping state, and give up exclusive access to
    296    the CPU.  On return, the thread must be prepared to block until it
    297    is ready to run again (generally this means blocking in a syscall,
    298    but it may mean that we remain in a Runnable state and we're just
    299    yielding the CPU to another thread).
    300  */
    301 void VG_(release_BigLock)(ThreadId tid, ThreadStatus sleepstate, HChar* who)
    302 {
    303    ThreadState *tst = VG_(get_ThreadState)(tid);
    304 
    305    vg_assert(tst->status == VgTs_Runnable);
    306 
    307    vg_assert(sleepstate == VgTs_WaitSys ||
    308 	     sleepstate == VgTs_Yielding);
    309 
    310    tst->status = sleepstate;
    311 
    312    vg_assert(VG_(running_tid) == tid);
    313    VG_(running_tid) = VG_INVALID_THREADID;
    314 
    315    if (VG_(clo_trace_sched)) {
    316       Char buf[200];
    317       vg_assert(VG_(strlen)(who) <= 200-100);
    318       VG_(sprintf)(buf, "releasing lock (%s) -> %s",
    319                         who, VG_(name_of_ThreadStatus)(sleepstate));
    320       print_sched_event(tid, buf);
    321    }
    322 
    323    /* Release the_BigLock; this will reschedule any runnable
    324       thread. */
    325    VG_(release_BigLock_LL)(NULL);
    326 }
    327 
    328 static void init_BigLock(void)
    329 {
    330    vg_assert(!the_BigLock);
    331    the_BigLock = ML_(create_sched_lock)();
    332 }
    333 
    334 static void deinit_BigLock(void)
    335 {
    336    ML_(destroy_sched_lock)(the_BigLock);
    337    the_BigLock = NULL;
    338 }
    339 
    340 /* See pub_core_scheduler.h for description */
    341 void VG_(acquire_BigLock_LL) ( HChar* who )
    342 {
    343    ML_(acquire_sched_lock)(the_BigLock);
    344 }
    345 
    346 /* See pub_core_scheduler.h for description */
    347 void VG_(release_BigLock_LL) ( HChar* who )
    348 {
    349    ML_(release_sched_lock)(the_BigLock);
    350 }
    351 
    352 Bool VG_(owns_BigLock_LL) ( ThreadId tid )
    353 {
    354    return (ML_(get_sched_lock_owner)(the_BigLock)
    355            == VG_(threads)[tid].os_state.lwpid);
    356 }
    357 
    358 
    359 /* Clear out the ThreadState and release the semaphore. Leaves the
    360    ThreadState in VgTs_Zombie state, so that it doesn't get
    361    reallocated until the caller is really ready. */
    362 void VG_(exit_thread)(ThreadId tid)
    363 {
    364    vg_assert(VG_(is_valid_tid)(tid));
    365    vg_assert(VG_(is_running_thread)(tid));
    366    vg_assert(VG_(is_exiting)(tid));
    367 
    368    mostly_clear_thread_record(tid);
    369    VG_(running_tid) = VG_INVALID_THREADID;
    370 
    371    /* There should still be a valid exitreason for this thread */
    372    vg_assert(VG_(threads)[tid].exitreason != VgSrc_None);
    373 
    374    if (VG_(clo_trace_sched))
    375       print_sched_event(tid, "release lock in VG_(exit_thread)");
    376 
    377    VG_(release_BigLock_LL)(NULL);
    378 }
    379 
    380 /* If 'tid' is blocked in a syscall, send it SIGVGKILL so as to get it
    381    out of the syscall and onto doing the next thing, whatever that is.
     382    If it isn't blocked in a syscall, this has no effect on the thread. */
    383 void VG_(get_thread_out_of_syscall)(ThreadId tid)
    384 {
    385    vg_assert(VG_(is_valid_tid)(tid));
    386    vg_assert(!VG_(is_running_thread)(tid));
    387 
    388    if (VG_(threads)[tid].status == VgTs_WaitSys) {
    389       if (VG_(clo_trace_signals)) {
    390 	 VG_(message)(Vg_DebugMsg,
    391                       "get_thread_out_of_syscall zaps tid %d lwp %d\n",
    392 		      tid, VG_(threads)[tid].os_state.lwpid);
    393       }
    394 #     if defined(VGO_darwin)
    395       {
    396          // GrP fixme use mach primitives on darwin?
    397          // GrP fixme thread_abort_safely?
    398          // GrP fixme race for thread with WaitSys set but not in syscall yet?
    399          extern kern_return_t thread_abort(mach_port_t);
    400          thread_abort(VG_(threads)[tid].os_state.lwpid);
    401       }
    402 #     else
    403       {
    404          __attribute__((unused))
    405          Int r = VG_(tkill)(VG_(threads)[tid].os_state.lwpid, VG_SIGVGKILL);
    406          /* JRS 2009-Mar-20: should we assert for r==0 (tkill succeeded)?
    407             I'm really not sure.  Here's a race scenario which argues
    408             that we shoudn't; but equally I'm not sure the scenario is
     409             that we shouldn't; but equally I'm not sure the scenario is
    410             of who holds the BigLock when.
    411 
    412             Target thread tid does sys_read on a socket and blocks.  This
    413             function gets called, and we observe correctly that tid's
    414             status is WaitSys but then for whatever reason this function
    415             goes very slowly for a while.  Then data arrives from
    416             wherever, tid's sys_read returns, tid exits.  Then we do
    417             tkill on tid, but tid no longer exists; tkill returns an
    418             error code and the assert fails. */
    419          /* vg_assert(r == 0); */
    420       }
    421 #     endif
    422    }
    423 }
    424 
    425 /*
    426    Yield the CPU for a short time to let some other thread run.
    427  */
    428 void VG_(vg_yield)(void)
    429 {
    430    ThreadId tid = VG_(running_tid);
    431 
    432    vg_assert(tid != VG_INVALID_THREADID);
    433    vg_assert(VG_(threads)[tid].os_state.lwpid == VG_(gettid)());
    434 
    435    VG_(release_BigLock)(tid, VgTs_Yielding, "VG_(vg_yield)");
    436 
    437    /*
    438       Tell the kernel we're yielding.
    439     */
    440    VG_(do_syscall0)(__NR_sched_yield);
    441 
    442    VG_(acquire_BigLock)(tid, "VG_(vg_yield)");
    443 }
    444 
    445 
    446 /* Set the standard set of blocked signals, used whenever we're not
    447    running a client syscall. */
    448 static void block_signals(void)
    449 {
    450    vki_sigset_t mask;
    451 
    452    VG_(sigfillset)(&mask);
    453 
    454    /* Don't block these because they're synchronous */
    455    VG_(sigdelset)(&mask, VKI_SIGSEGV);
    456    VG_(sigdelset)(&mask, VKI_SIGBUS);
    457    VG_(sigdelset)(&mask, VKI_SIGFPE);
    458    VG_(sigdelset)(&mask, VKI_SIGILL);
    459    VG_(sigdelset)(&mask, VKI_SIGTRAP);
    460 
    461    /* Can't block these anyway */
    462    VG_(sigdelset)(&mask, VKI_SIGSTOP);
    463    VG_(sigdelset)(&mask, VKI_SIGKILL);
    464 
    465    VG_(sigprocmask)(VKI_SIG_SETMASK, &mask, NULL);
    466 }
    467 
    468 static void os_state_clear(ThreadState *tst)
    469 {
    470    tst->os_state.lwpid       = 0;
    471    tst->os_state.threadgroup = 0;
    472 #  if defined(VGO_linux)
    473    /* no other fields to clear */
    474 #  elif defined(VGO_darwin)
    475    tst->os_state.post_mach_trap_fn = NULL;
    476    tst->os_state.pthread           = 0;
    477    tst->os_state.func_arg          = 0;
    478    VG_(memset)(&tst->os_state.child_go, 0, sizeof(tst->os_state.child_go));
    479    VG_(memset)(&tst->os_state.child_done, 0, sizeof(tst->os_state.child_done));
    480    tst->os_state.wq_jmpbuf_valid   = False;
    481    tst->os_state.remote_port       = 0;
    482    tst->os_state.msgh_id           = 0;
    483    VG_(memset)(&tst->os_state.mach_args, 0, sizeof(tst->os_state.mach_args));
    484 #  else
    485 #    error "Unknown OS"
    486 #  endif
    487 }
    488 
    489 static void os_state_init(ThreadState *tst)
    490 {
    491    tst->os_state.valgrind_stack_base    = 0;
    492    tst->os_state.valgrind_stack_init_SP = 0;
    493    os_state_clear(tst);
    494 }
    495 
    496 static
    497 void mostly_clear_thread_record ( ThreadId tid )
    498 {
    499    vki_sigset_t savedmask;
    500 
    501    vg_assert(tid >= 0 && tid < VG_N_THREADS);
    502    VG_(cleanup_thread)(&VG_(threads)[tid].arch);
    503    VG_(threads)[tid].tid = tid;
    504 
    505    /* Leave the thread in Zombie, so that it doesn't get reallocated
    506       until the caller is finally done with the thread stack. */
    507    VG_(threads)[tid].status               = VgTs_Zombie;
    508 
    509    VG_(sigemptyset)(&VG_(threads)[tid].sig_mask);
    510    VG_(sigemptyset)(&VG_(threads)[tid].tmp_sig_mask);
    511 
    512    os_state_clear(&VG_(threads)[tid]);
    513 
    514    /* start with no altstack */
    515    VG_(threads)[tid].altstack.ss_sp = (void *)0xdeadbeef;
    516    VG_(threads)[tid].altstack.ss_size = 0;
    517    VG_(threads)[tid].altstack.ss_flags = VKI_SS_DISABLE;
    518 
    519    VG_(clear_out_queued_signals)(tid, &savedmask);
    520 
    521    VG_(threads)[tid].sched_jmpbuf_valid = False;
    522 }
    523 
    524 /*
    525    Called in the child after fork.  If the parent has multiple
    526    threads, then we've inherited a VG_(threads) array describing them,
    527    but only the thread which called fork() is actually alive in the
     528    child.  This function needs to clean up all those other thread
    529    structures.
    530 
     531    Whichever tid in the parent called fork() becomes the
    532    master_tid in the child.  That's because the only living slot in
    533    VG_(threads) in the child after fork is VG_(threads)[tid], and it
    534    would be too hard to try to re-number the thread and relocate the
    535    thread state down to VG_(threads)[1].
    536 
    537    This function also needs to reinitialize the_BigLock, since
    538    otherwise we may end up sharing its state with the parent, which
    539    would be deeply confusing.
    540 */
    541 static void sched_fork_cleanup(ThreadId me)
    542 {
    543    ThreadId tid;
    544    vg_assert(VG_(running_tid) == me);
    545 
    546 #  if defined(VGO_darwin)
    547    // GrP fixme hack reset Mach ports
    548    VG_(mach_init)();
    549 #  endif
    550 
    551    VG_(threads)[me].os_state.lwpid = VG_(gettid)();
    552    VG_(threads)[me].os_state.threadgroup = VG_(getpid)();
    553 
    554    /* clear out all the unused thread slots */
    555    for (tid = 1; tid < VG_N_THREADS; tid++) {
    556       if (tid != me) {
    557          mostly_clear_thread_record(tid);
    558 	 VG_(threads)[tid].status = VgTs_Empty;
    559          VG_(clear_syscallInfo)(tid);
    560       }
    561    }
    562 
    563    /* re-init and take the sema */
    564    deinit_BigLock();
    565    init_BigLock();
    566    VG_(acquire_BigLock_LL)(NULL);
    567 }
    568 
    569 
    570 /* First phase of initialisation of the scheduler.  Initialise the
    571    bigLock, zeroise the VG_(threads) structure and decide on the
    572    ThreadId of the root thread.
    573 */
    574 ThreadId VG_(scheduler_init_phase1) ( void )
    575 {
    576    Int i;
    577    ThreadId tid_main;
    578 
    579    VG_(debugLog)(1,"sched","sched_init_phase1\n");
    580 
    581    if (VG_(clo_fair_sched) != disable_fair_sched
    582        && !ML_(set_sched_lock_impl)(sched_lock_ticket)
    583        && VG_(clo_fair_sched) == enable_fair_sched)
    584    {
    585       VG_(printf)("Error: fair scheduling is not supported on this system.\n");
    586       VG_(exit)(1);
    587    }
    588 
    589    if (VG_(clo_verbosity) > 1) {
    590       VG_(message)(Vg_DebugMsg,
    591                    "Scheduler: using %s scheduler lock implementation.\n",
    592                    ML_(get_sched_lock_name)());
    593    }
    594 
    595    init_BigLock();
    596 
    597    for (i = 0 /* NB; not 1 */; i < VG_N_THREADS; i++) {
    598       /* Paranoia .. completely zero it out. */
    599       VG_(memset)( & VG_(threads)[i], 0, sizeof( VG_(threads)[i] ) );
    600 
    601       VG_(threads)[i].sig_queue = NULL;
    602 
    603       os_state_init(&VG_(threads)[i]);
    604       mostly_clear_thread_record(i);
    605 
    606       VG_(threads)[i].status                    = VgTs_Empty;
    607       VG_(threads)[i].client_stack_szB          = 0;
    608       VG_(threads)[i].client_stack_highest_word = (Addr)NULL;
    609       VG_(threads)[i].err_disablement_level     = 0;
    610    }
    611 
    612    tid_main = VG_(alloc_ThreadState)();
    613 
    614    /* Bleh.  Unfortunately there are various places in the system that
    615       assume that the main thread has a ThreadId of 1.
    616       - Helgrind (possibly)
    617       - stack overflow message in default_action() in m_signals.c
    618       - definitely a lot more places
    619    */
    620    vg_assert(tid_main == 1);
    621 
    622    return tid_main;
    623 }
    624 
    625 
    626 /* Second phase of initialisation of the scheduler.  Given the root
    627    ThreadId computed by first phase of initialisation, fill in stack
    628    details and acquire bigLock.  Initialise the scheduler.  This is
    629    called at startup.  The caller subsequently initialises the guest
    630    state components of this main thread.
    631 */
    632 void VG_(scheduler_init_phase2) ( ThreadId tid_main,
    633                                   Addr     clstack_end,
    634                                   SizeT    clstack_size )
    635 {
    636    VG_(debugLog)(1,"sched","sched_init_phase2: tid_main=%d, "
    637                    "cls_end=0x%lx, cls_sz=%ld\n",
    638                    tid_main, clstack_end, clstack_size);
    639 
    640    vg_assert(VG_IS_PAGE_ALIGNED(clstack_end+1));
    641    vg_assert(VG_IS_PAGE_ALIGNED(clstack_size));
    642 
    643    VG_(threads)[tid_main].client_stack_highest_word
    644       = clstack_end + 1 - sizeof(UWord);
    645    VG_(threads)[tid_main].client_stack_szB
    646       = clstack_size;
    647 
    648    VG_(atfork)(NULL, NULL, sched_fork_cleanup);
    649 }
    650 
    651 
    652 /* ---------------------------------------------------------------------
    653    Helpers for running translations.
    654    ------------------------------------------------------------------ */
    655 
    656 /* Use gcc's built-in setjmp/longjmp.  longjmp must not restore signal
    657    mask state, but does need to pass "val" through.  jumped must be a
    658    volatile UWord. */
    659 #define SCHEDSETJMP(tid, jumped, stmt)					\
    660    do {									\
    661       ThreadState * volatile _qq_tst = VG_(get_ThreadState)(tid);	\
    662 									\
    663       (jumped) = VG_MINIMAL_SETJMP(_qq_tst->sched_jmpbuf);              \
    664       if ((jumped) == ((UWord)0)) {                                     \
    665 	 vg_assert(!_qq_tst->sched_jmpbuf_valid);			\
    666 	 _qq_tst->sched_jmpbuf_valid = True;				\
    667 	 stmt;								\
    668       }	else if (VG_(clo_trace_sched))					\
    669 	 VG_(printf)("SCHEDSETJMP(line %d) tid %d, jumped=%ld\n",       \
    670                      __LINE__, tid, jumped);                            \
    671       vg_assert(_qq_tst->sched_jmpbuf_valid);				\
    672       _qq_tst->sched_jmpbuf_valid = False;				\
    673    } while(0)
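/* Typical use, as in handle_syscall() and run_thread_for_a_while()
   below -- a sketch:

      volatile UWord jumped;
      SCHEDSETJMP(tid, jumped, VG_(client_syscall)(tid, trc));
      if (jumped != (UWord)0) {
         ... we came back via a longjmp from the signal machinery,
             rather than by 'stmt' returning normally ...
      }
*/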
    674 
    675 
    676 /* Do various guest state alignment checks prior to running a thread.
    677    Specifically, check that what we have matches Vex's guest state
    678    layout requirements.  See libvex.h for details, but in short the
    679    requirements are: There must be no holes in between the primary
     680    guest state, its two copies, and the spill area.  That is, all 4
    681    areas must have a 16-aligned size and be 16-aligned, and placed
    682    back-to-back. */
    683 static void do_pre_run_checks ( ThreadState* tst )
    684 {
    685    Addr a_vex     = (Addr) & tst->arch.vex;
    686    Addr a_vexsh1  = (Addr) & tst->arch.vex_shadow1;
    687    Addr a_vexsh2  = (Addr) & tst->arch.vex_shadow2;
    688    Addr a_spill   = (Addr) & tst->arch.vex_spill;
    689    UInt sz_vex    = (UInt) sizeof tst->arch.vex;
    690    UInt sz_vexsh1 = (UInt) sizeof tst->arch.vex_shadow1;
    691    UInt sz_vexsh2 = (UInt) sizeof tst->arch.vex_shadow2;
    692    UInt sz_spill  = (UInt) sizeof tst->arch.vex_spill;
    693 
    694    if (0)
    695    VG_(printf)("gst %p %d, sh1 %p %d, "
    696                "sh2 %p %d, spill %p %d\n",
    697                (void*)a_vex, sz_vex,
    698                (void*)a_vexsh1, sz_vexsh1,
    699                (void*)a_vexsh2, sz_vexsh2,
    700                (void*)a_spill, sz_spill );
    701 
    702    vg_assert(VG_IS_16_ALIGNED(sz_vex));
    703    vg_assert(VG_IS_16_ALIGNED(sz_vexsh1));
    704    vg_assert(VG_IS_16_ALIGNED(sz_vexsh2));
    705    vg_assert(VG_IS_16_ALIGNED(sz_spill));
    706 
    707    vg_assert(VG_IS_16_ALIGNED(a_vex));
    708    vg_assert(VG_IS_16_ALIGNED(a_vexsh1));
    709    vg_assert(VG_IS_16_ALIGNED(a_vexsh2));
    710    vg_assert(VG_IS_16_ALIGNED(a_spill));
    711 
    712    /* Check that the guest state and its two shadows have the same
    713       size, and that there are no holes in between.  The latter is
    714       important because Memcheck assumes that it can reliably access
    715       the shadows by indexing off a pointer to the start of the
    716       primary guest state area. */
    717    vg_assert(sz_vex == sz_vexsh1);
    718    vg_assert(sz_vex == sz_vexsh2);
    719    vg_assert(a_vex + 1 * sz_vex == a_vexsh1);
    720    vg_assert(a_vex + 2 * sz_vex == a_vexsh2);
    721    /* Also check there's no hole between the second shadow area and
    722       the spill area. */
    723    vg_assert(sz_spill == LibVEX_N_SPILL_BYTES);
    724    vg_assert(a_vex + 3 * sz_vex == a_spill);
    725 
    726 #  if defined(VGA_x86)
    727    /* x86 XMM regs must form an array, ie, have no holes in
    728       between. */
    729    vg_assert(
    730       (offsetof(VexGuestX86State,guest_XMM7)
    731        - offsetof(VexGuestX86State,guest_XMM0))
    732       == (8/*#regs*/-1) * 16/*bytes per reg*/
    733    );
    734    vg_assert(VG_IS_16_ALIGNED(offsetof(VexGuestX86State,guest_XMM0)));
    735    vg_assert(VG_IS_8_ALIGNED(offsetof(VexGuestX86State,guest_FPREG)));
    736    vg_assert(8 == offsetof(VexGuestX86State,guest_EAX));
    737    vg_assert(VG_IS_4_ALIGNED(offsetof(VexGuestX86State,guest_EAX)));
    738    vg_assert(VG_IS_4_ALIGNED(offsetof(VexGuestX86State,guest_EIP)));
    739 #  endif
    740 
    741 #  if defined(VGA_amd64)
    742    /* amd64 YMM regs must form an array, ie, have no holes in
    743       between. */
    744    vg_assert(
    745       (offsetof(VexGuestAMD64State,guest_YMM16)
    746        - offsetof(VexGuestAMD64State,guest_YMM0))
    747       == (17/*#regs*/-1) * 32/*bytes per reg*/
    748    );
    749    vg_assert(VG_IS_16_ALIGNED(offsetof(VexGuestAMD64State,guest_YMM0)));
    750    vg_assert(VG_IS_8_ALIGNED(offsetof(VexGuestAMD64State,guest_FPREG)));
    751    vg_assert(16 == offsetof(VexGuestAMD64State,guest_RAX));
    752    vg_assert(VG_IS_8_ALIGNED(offsetof(VexGuestAMD64State,guest_RAX)));
    753    vg_assert(VG_IS_8_ALIGNED(offsetof(VexGuestAMD64State,guest_RIP)));
    754 #  endif
    755 
    756 #  if defined(VGA_ppc32) || defined(VGA_ppc64)
    757    /* ppc guest_state vector regs must be 16 byte aligned for
    758       loads/stores.  This is important! */
    759    vg_assert(VG_IS_16_ALIGNED(& tst->arch.vex.guest_VSR0));
    760    vg_assert(VG_IS_16_ALIGNED(& tst->arch.vex_shadow1.guest_VSR0));
    761    vg_assert(VG_IS_16_ALIGNED(& tst->arch.vex_shadow2.guest_VSR0));
    762    /* be extra paranoid .. */
    763    vg_assert(VG_IS_16_ALIGNED(& tst->arch.vex.guest_VSR1));
    764    vg_assert(VG_IS_16_ALIGNED(& tst->arch.vex_shadow1.guest_VSR1));
    765    vg_assert(VG_IS_16_ALIGNED(& tst->arch.vex_shadow2.guest_VSR1));
    766 #  endif
    767 
    768 #  if defined(VGA_arm)
    769    /* arm guest_state VFP regs must be 8 byte aligned for
    770       loads/stores.  Let's use 16 just to be on the safe side. */
    771    vg_assert(VG_IS_16_ALIGNED(& tst->arch.vex.guest_D0));
    772    vg_assert(VG_IS_16_ALIGNED(& tst->arch.vex_shadow1.guest_D0));
    773    vg_assert(VG_IS_16_ALIGNED(& tst->arch.vex_shadow2.guest_D0));
    774    /* be extra paranoid .. */
    775    vg_assert(VG_IS_8_ALIGNED(& tst->arch.vex.guest_D1));
    776    vg_assert(VG_IS_8_ALIGNED(& tst->arch.vex_shadow1.guest_D1));
    777    vg_assert(VG_IS_8_ALIGNED(& tst->arch.vex_shadow2.guest_D1));
    778 #  endif
    779 
    780 #  if defined(VGA_s390x)
    781    /* no special requirements */
    782 #  endif
    783 
    784 #  if defined(VGA_mips32)
    785   /* no special requirements */
    786 #  endif
    787 }
    788 
    789 // NO_VGDB_POLL value ensures vgdb is not polled, while
    790 // VGDB_POLL_ASAP ensures that the next scheduler call
    791 // will cause a poll.
    792 #define NO_VGDB_POLL    0xffffffffffffffffULL
    793 #define VGDB_POLL_ASAP  0x0ULL
    794 
    795 void VG_(disable_vgdb_poll) (void )
    796 {
    797    vgdb_next_poll = NO_VGDB_POLL;
    798 }
    799 void VG_(force_vgdb_poll) ( void )
    800 {
    801    vgdb_next_poll = VGDB_POLL_ASAP;
    802 }
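/* run_thread_for_a_while (below) compares bbs_done against
   vgdb_next_poll after each run of generated code, so
   VG_(force_vgdb_poll) (vgdb_next_poll = 0) makes the very next check
   fire a poll, while VG_(disable_vgdb_poll) (vgdb_next_poll = ~0ULL)
   effectively disables polling. */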
    803 
    804 /* Run the thread tid for a while, and return a VG_TRC_* value
    805    indicating why VG_(disp_run_translations) stopped, and possibly an
    806    auxiliary word.  Also, only allow the thread to run for at most
    807    *dispatchCtrP events.  If (as is the normal case) use_alt_host_addr
    808    is False, we are running ordinary redir'd translations, and we
    809    should therefore start by looking up the guest next IP in TT.  If
    810    it is True then we ignore the guest next IP and just run from
    811    alt_host_addr, which presumably points at host code for a no-redir
    812    translation.
    813 
    814    Return results are placed in two_words.  two_words[0] is set to the
    815    TRC.  In the case where that is VG_TRC_CHAIN_ME_TO_{SLOW,FAST}_EP,
    816    the address to patch is placed in two_words[1].
    817 */
    818 static
    819 void run_thread_for_a_while ( /*OUT*/HWord* two_words,
    820                               /*MOD*/Int*   dispatchCtrP,
    821                               ThreadId      tid,
    822                               HWord         alt_host_addr,
    823                               Bool          use_alt_host_addr )
    824 {
    825    volatile HWord        jumped         = 0;
    826    volatile ThreadState* tst            = NULL; /* stop gcc complaining */
    827    volatile Int          done_this_time = 0;
    828    volatile HWord        host_code_addr = 0;
    829 
    830    /* Paranoia */
    831    vg_assert(VG_(is_valid_tid)(tid));
    832    vg_assert(VG_(is_running_thread)(tid));
    833    vg_assert(!VG_(is_exiting)(tid));
    834    vg_assert(*dispatchCtrP > 0);
    835 
    836    tst = VG_(get_ThreadState)(tid);
    837    do_pre_run_checks( (ThreadState*)tst );
    838    /* end Paranoia */
    839 
    840    /* Futz with the XIndir stats counters. */
    841    vg_assert(VG_(stats__n_xindirs_32) == 0);
    842    vg_assert(VG_(stats__n_xindir_misses_32) == 0);
    843 
    844    /* Clear return area. */
    845    two_words[0] = two_words[1] = 0;
    846 
    847    /* Figure out where we're starting from. */
    848    if (use_alt_host_addr) {
    849       /* unusual case -- no-redir translation */
    850       host_code_addr = alt_host_addr;
    851    } else {
    852       /* normal case -- redir translation */
    853       UInt cno = (UInt)VG_TT_FAST_HASH((Addr)tst->arch.vex.VG_INSTR_PTR);
    854       if (LIKELY(VG_(tt_fast)[cno].guest == (Addr)tst->arch.vex.VG_INSTR_PTR))
    855          host_code_addr = VG_(tt_fast)[cno].host;
    856       else {
    857          AddrH res   = 0;
     858          /* Not found in VG_(tt_fast).  Searching the transtab here
     859             improves performance compared to returning directly
     860             to the scheduler. */
    861          Bool  found = VG_(search_transtab)(&res, NULL, NULL,
    862                                             (Addr)tst->arch.vex.VG_INSTR_PTR,
    863                                             True/*upd cache*/
    864                                             );
    865          if (LIKELY(found)) {
    866             host_code_addr = res;
    867          } else {
    868             /* At this point, we know that we intended to start at a
    869                normal redir translation, but it was not found.  In
    870                which case we can return now claiming it's not
    871                findable. */
    872             two_words[0] = VG_TRC_INNER_FASTMISS; /* hmm, is that right? */
    873             return;
    874          }
    875       }
    876    }
    877    /* We have either a no-redir or a redir translation. */
    878    vg_assert(host_code_addr != 0); /* implausible */
    879 
    880    /* there should be no undealt-with signals */
    881    //vg_assert(VG_(threads)[tid].siginfo.si_signo == 0);
    882 
    883    /* Set up event counter stuff for the run. */
    884    tst->arch.vex.host_EvC_COUNTER = *dispatchCtrP;
    885    tst->arch.vex.host_EvC_FAILADDR
    886       = (HWord)VG_(fnptr_to_fnentry)( &VG_(disp_cp_evcheck_fail) );
    887 
    888    if (0) {
    889       vki_sigset_t m;
    890       Int i, err = VG_(sigprocmask)(VKI_SIG_SETMASK, NULL, &m);
    891       vg_assert(err == 0);
    892       VG_(printf)("tid %d: entering code with unblocked signals: ", tid);
    893       for (i = 1; i <= _VKI_NSIG; i++)
    894          if (!VG_(sigismember)(&m, i))
    895             VG_(printf)("%d ", i);
    896       VG_(printf)("\n");
    897    }
    898 
    899    /* Set up return-value area. */
    900 
    901    // Tell the tool this thread is about to run client code
    902    VG_TRACK( start_client_code, tid, bbs_done );
    903 
    904    vg_assert(VG_(in_generated_code) == False);
    905    VG_(in_generated_code) = True;
    906 
    907    SCHEDSETJMP(
    908       tid,
    909       jumped,
    910       VG_(disp_run_translations)(
    911          two_words,
    912          (void*)&tst->arch.vex,
    913          host_code_addr
    914       )
    915    );
    916 
    917    vg_assert(VG_(in_generated_code) == True);
    918    VG_(in_generated_code) = False;
    919 
    920    if (jumped != (HWord)0) {
    921       /* We get here if the client took a fault that caused our signal
    922          handler to longjmp. */
    923       vg_assert(two_words[0] == 0 && two_words[1] == 0); // correct?
    924       two_words[0] = VG_TRC_FAULT_SIGNAL;
    925       two_words[1] = 0;
    926       block_signals();
    927    }
    928 
    929    /* Merge the 32-bit XIndir/miss counters into the 64 bit versions,
    930       and zero out the 32-bit ones in preparation for the next run of
    931       generated code. */
    932    stats__n_xindirs += (ULong)VG_(stats__n_xindirs_32);
    933    VG_(stats__n_xindirs_32) = 0;
    934    stats__n_xindir_misses += (ULong)VG_(stats__n_xindir_misses_32);
    935    VG_(stats__n_xindir_misses_32) = 0;
    936 
    937    /* Inspect the event counter. */
    938    vg_assert((Int)tst->arch.vex.host_EvC_COUNTER >= -1);
    939    vg_assert(tst->arch.vex.host_EvC_FAILADDR
    940              == (HWord)VG_(fnptr_to_fnentry)( &VG_(disp_cp_evcheck_fail)) );
    941 
    942    done_this_time = *dispatchCtrP - ((Int)tst->arch.vex.host_EvC_COUNTER + 1);
    943 
    944    vg_assert(done_this_time >= 0);
    945    bbs_done += (ULong)done_this_time;
    946 
    947    *dispatchCtrP -= done_this_time;
    948    vg_assert(*dispatchCtrP >= 0);
    949 
    950    // Tell the tool this thread has stopped running client code
    951    VG_TRACK( stop_client_code, tid, bbs_done );
    952 
    953    if (bbs_done >= vgdb_next_poll) {
    954       if (VG_(clo_vgdb_poll))
    955          vgdb_next_poll = bbs_done + (ULong)VG_(clo_vgdb_poll);
    956       else
    957          /* value was changed due to gdbserver invocation via ptrace */
    958          vgdb_next_poll = NO_VGDB_POLL;
    959       if (VG_(gdbserver_activity) (tid))
    960          VG_(gdbserver) (tid);
    961    }
    962 
    963    /* TRC value and possible auxiliary patch-address word are already
    964       in two_words[0] and [1] respectively, as a result of the call to
     965       VG_(disp_run_translations). */
    966    /* Stay sane .. */
    967    if (two_words[0] == VG_TRC_CHAIN_ME_TO_SLOW_EP
    968        || two_words[0] == VG_TRC_CHAIN_ME_TO_FAST_EP) {
    969       vg_assert(two_words[1] != 0); /* we have a legit patch addr */
    970    } else {
    971       vg_assert(two_words[1] == 0); /* nobody messed with it */
    972    }
    973 }
    974 
    975 
    976 /* ---------------------------------------------------------------------
    977    The scheduler proper.
    978    ------------------------------------------------------------------ */
    979 
    980 static void handle_tt_miss ( ThreadId tid )
    981 {
    982    Bool found;
    983    Addr ip = VG_(get_IP)(tid);
    984 
    985    /* Trivial event.  Miss in the fast-cache.  Do a full
    986       lookup for it. */
    987    found = VG_(search_transtab)( NULL, NULL, NULL,
    988                                  ip, True/*upd_fast_cache*/ );
    989    if (UNLIKELY(!found)) {
    990       /* Not found; we need to request a translation. */
    991       if (VG_(translate)( tid, ip, /*debug*/False, 0/*not verbose*/,
    992                           bbs_done, True/*allow redirection*/ )) {
    993          found = VG_(search_transtab)( NULL, NULL, NULL,
    994                                        ip, True );
    995          vg_assert2(found, "handle_tt_miss: missing tt_fast entry");
    996 
    997       } else {
    998 	 // If VG_(translate)() fails, it's because it had to throw a
    999 	 // signal because the client jumped to a bad address.  That
   1000 	 // means that either a signal has been set up for delivery,
   1001 	 // or the thread has been marked for termination.  Either
   1002 	 // way, we just need to go back into the scheduler loop.
   1003       }
   1004    }
   1005 }
   1006 
   1007 static
   1008 void handle_chain_me ( ThreadId tid, void* place_to_chain, Bool toFastEP )
   1009 {
   1010    Bool found          = False;
   1011    Addr ip             = VG_(get_IP)(tid);
   1012    UInt to_sNo         = (UInt)-1;
   1013    UInt to_tteNo       = (UInt)-1;
   1014 
   1015    found = VG_(search_transtab)( NULL, &to_sNo, &to_tteNo,
   1016                                  ip, False/*dont_upd_fast_cache*/ );
   1017    if (!found) {
   1018       /* Not found; we need to request a translation. */
   1019       if (VG_(translate)( tid, ip, /*debug*/False, 0/*not verbose*/,
   1020                           bbs_done, True/*allow redirection*/ )) {
   1021          found = VG_(search_transtab)( NULL, &to_sNo, &to_tteNo,
   1022                                        ip, False );
   1023          vg_assert2(found, "handle_chain_me: missing tt_fast entry");
   1024       } else {
   1025 	 // If VG_(translate)() fails, it's because it had to throw a
   1026 	 // signal because the client jumped to a bad address.  That
   1027 	 // means that either a signal has been set up for delivery,
   1028 	 // or the thread has been marked for termination.  Either
   1029 	 // way, we just need to go back into the scheduler loop.
   1030         return;
   1031       }
   1032    }
   1033    vg_assert(found);
   1034    vg_assert(to_sNo != -1);
   1035    vg_assert(to_tteNo != -1);
   1036 
   1037    /* So, finally we know where to patch through to.  Do the patching
   1038       and update the various admin tables that allow it to be undone
   1039       in the case that the destination block gets deleted. */
   1040    VG_(tt_tc_do_chaining)( place_to_chain,
   1041                            to_sNo, to_tteNo, toFastEP );
   1042 }
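/* Note on the chaining flow: the dispatcher returns
   VG_TRC_CHAIN_ME_TO_SLOW_EP or VG_TRC_CHAIN_ME_TO_FAST_EP with the
   address to patch in two_words[1] (see run_thread_for_a_while and the
   scheduler's switch below); handle_chain_me then looks up or creates
   the destination translation and hands the actual patching over to
   VG_(tt_tc_do_chaining). */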
   1043 
   1044 static void handle_syscall(ThreadId tid, UInt trc)
   1045 {
   1046    ThreadState * volatile tst = VG_(get_ThreadState)(tid);
   1047    volatile UWord jumped;
   1048 
   1049    /* Syscall may or may not block; either way, it will be
   1050       complete by the time this call returns, and we'll be
   1051       runnable again.  We could take a signal while the
   1052       syscall runs. */
   1053 
    1054    if (VG_(clo_sanity_level) >= 3)
   1055       VG_(am_do_sync_check)("(BEFORE SYSCALL)",__FILE__,__LINE__);
   1056 
   1057    SCHEDSETJMP(tid, jumped, VG_(client_syscall)(tid, trc));
   1058 
    1059    if (VG_(clo_sanity_level) >= 3)
   1060       VG_(am_do_sync_check)("(AFTER SYSCALL)",__FILE__,__LINE__);
   1061 
   1062    if (!VG_(is_running_thread)(tid))
   1063       VG_(printf)("tid %d not running; VG_(running_tid)=%d, tid %d status %d\n",
   1064 		  tid, VG_(running_tid), tid, tst->status);
   1065    vg_assert(VG_(is_running_thread)(tid));
   1066 
   1067    if (jumped != (UWord)0) {
   1068       block_signals();
   1069       VG_(poll_signals)(tid);
   1070    }
   1071 }
   1072 
   1073 /* tid just requested a jump to the noredir version of its current
   1074    program counter.  So make up that translation if needed, run it,
   1075    and return the resulting thread return code in two_words[]. */
   1076 static
   1077 void handle_noredir_jump ( /*OUT*/HWord* two_words,
   1078                            /*MOD*/Int*   dispatchCtrP,
   1079                            ThreadId tid )
   1080 {
   1081    /* Clear return area. */
   1082    two_words[0] = two_words[1] = 0;
   1083 
   1084    AddrH hcode = 0;
   1085    Addr  ip    = VG_(get_IP)(tid);
   1086 
   1087    Bool  found = VG_(search_unredir_transtab)( &hcode, ip );
   1088    if (!found) {
   1089       /* Not found; we need to request a translation. */
   1090       if (VG_(translate)( tid, ip, /*debug*/False, 0/*not verbose*/, bbs_done,
   1091                           False/*NO REDIRECTION*/ )) {
   1092 
   1093          found = VG_(search_unredir_transtab)( &hcode, ip );
   1094          vg_assert2(found, "unredir translation missing after creation?!");
   1095       } else {
   1096 	 // If VG_(translate)() fails, it's because it had to throw a
   1097 	 // signal because the client jumped to a bad address.  That
   1098 	 // means that either a signal has been set up for delivery,
   1099 	 // or the thread has been marked for termination.  Either
   1100 	 // way, we just need to go back into the scheduler loop.
   1101          two_words[0] = VG_TRC_BORING;
   1102          return;
   1103       }
   1104 
   1105    }
   1106 
   1107    vg_assert(found);
   1108    vg_assert(hcode != 0);
   1109 
   1110    /* Otherwise run it and return the resulting VG_TRC_* value. */
   1111    vg_assert(*dispatchCtrP > 0); /* so as to guarantee progress */
   1112    run_thread_for_a_while( two_words, dispatchCtrP, tid,
   1113                            hcode, True/*use hcode*/ );
   1114 }
   1115 
   1116 
   1117 /*
   1118    Run a thread until it wants to exit.
   1119 
   1120    We assume that the caller has already called VG_(acquire_BigLock) for
   1121    us, so we own the VCPU.  Also, all signals are blocked.
   1122  */
   1123 VgSchedReturnCode VG_(scheduler) ( ThreadId tid )
   1124 {
   1125    /* Holds the remaining size of this thread's "timeslice". */
   1126    Int dispatch_ctr = 0;
   1127 
   1128    ThreadState *tst = VG_(get_ThreadState)(tid);
   1129    static Bool vgdb_startup_action_done = False;
   1130 
   1131    if (VG_(clo_trace_sched))
   1132       print_sched_event(tid, "entering VG_(scheduler)");
   1133 
   1134    /* Do vgdb initialization (but once). Only the first (main) task
   1135       starting up will do the below.
    1136       Initializing gdbserver earlier than at the first
    1137       VG_(scheduler) call causes problems:
   1138       * at the end of VG_(scheduler_init_phase2) :
   1139         The main thread is in VgTs_Init state, but in a not yet
   1140         consistent state => the thread cannot be reported to gdb
   1141         (e.g. causes an assert in LibVEX_GuestX86_get_eflags when giving
   1142         back the guest registers to gdb).
   1143       * at end of valgrind_main, just
   1144         before VG_(main_thread_wrapper_NORETURN)(1) :
   1145         The main thread is still in VgTs_Init state but in a
   1146         more advanced state. However, the thread state is not yet
    1147         completely initialized: among other things, the os_state is not yet fully
   1148         set => the thread is then not properly reported to gdb,
   1149         which is then confused (causing e.g. a duplicate thread be
    1150         which is then confused (causing e.g. a duplicate thread to be
   1151       * it would be possible to initialize gdbserver "lower" in the
   1152         call stack (e.g. in VG_(main_thread_wrapper_NORETURN)) but
   1153         these are platform dependent and the place at which
   1154         the thread state is completely initialized is not
   1155         specific anymore to the main thread (so a similar "do it only
   1156         once" would be needed).
   1157 
   1158         => a "once only" initialization here is the best compromise. */
   1159    if (!vgdb_startup_action_done) {
   1160       vg_assert(tid == 1); // it must be the main thread.
   1161       vgdb_startup_action_done = True;
   1162       if (VG_(clo_vgdb) != Vg_VgdbNo) {
    1163          /* If we have to poll, ensure we do an initial poll at the first
    1164             scheduler call.  Otherwise, ensure no poll (unless interrupted
   1165             by ptrace). */
   1166          if (VG_(clo_vgdb_poll))
   1167             VG_(force_vgdb_poll) ();
   1168          else
   1169             VG_(disable_vgdb_poll) ();
   1170 
   1171          vg_assert (VG_(dyn_vgdb_error) == VG_(clo_vgdb_error));
   1172          /* As we are initializing, VG_(dyn_vgdb_error) can't have been
   1173             changed yet. */
   1174 
   1175          VG_(gdbserver_prerun_action) (1);
   1176       } else {
   1177          VG_(disable_vgdb_poll) ();
   1178       }
   1179    }
   1180 
   1181    /* set the proper running signal mask */
   1182    block_signals();
   1183 
   1184    vg_assert(VG_(is_running_thread)(tid));
   1185 
   1186    dispatch_ctr = SCHEDULING_QUANTUM;
   1187 
   1188    while (!VG_(is_exiting)(tid)) {
   1189 
   1190       vg_assert(dispatch_ctr >= 0);
   1191       if (dispatch_ctr == 0) {
   1192 
   1193 	 /* Our slice is done, so yield the CPU to another thread.  On
   1194             Linux, this doesn't sleep between sleeping and running,
   1195             since that would take too much time. */
   1196 
   1197 	 /* 4 July 06: it seems that a zero-length nsleep is needed to
   1198             cause async thread cancellation (canceller.c) to terminate
   1199             in finite time; else it is in some kind of race/starvation
   1200             situation and completion is arbitrarily delayed (although
   1201             this is not a deadlock).
   1202 
   1203             Unfortunately these sleeps cause MPI jobs not to terminate
   1204             sometimes (some kind of livelock).  So sleeping once
   1205             every N opportunities appears to work. */
   1206 
   1207 	 /* 3 Aug 06: doing sys__nsleep works but crashes some apps.
   1208             sys_yield also helps the problem, whilst not crashing apps. */
   1209 
   1210 	 VG_(release_BigLock)(tid, VgTs_Yielding,
   1211                                    "VG_(scheduler):timeslice");
   1212 	 /* ------------ now we don't have The Lock ------------ */
   1213 
   1214 	 VG_(acquire_BigLock)(tid, "VG_(scheduler):timeslice");
   1215 	 /* ------------ now we do have The Lock ------------ */
   1216 
   1217 	 /* OK, do some relatively expensive housekeeping stuff */
   1218 	 scheduler_sanity(tid);
   1219 	 VG_(sanity_check_general)(False);
   1220 
   1221 	 /* Look for any pending signals for this thread, and set them up
   1222 	    for delivery */
   1223 	 VG_(poll_signals)(tid);
   1224 
   1225 	 if (VG_(is_exiting)(tid))
   1226 	    break;		/* poll_signals picked up a fatal signal */
   1227 
   1228 	 /* For stats purposes only. */
   1229 	 n_scheduling_events_MAJOR++;
   1230 
   1231 	 /* Figure out how many bbs to ask vg_run_innerloop to do.  Note
   1232 	    that it decrements the counter before testing it for zero, so
   1233 	    that if tst->dispatch_ctr is set to N you get at most N-1
   1234 	    iterations.  Also this means that tst->dispatch_ctr must
   1235 	    exceed zero before entering the innerloop.  Also also, the
   1236 	    decrement is done before the bb is actually run, so you
   1237 	    always get at least one decrement even if nothing happens. */
   1238          // FIXME is this right?
   1239          dispatch_ctr = SCHEDULING_QUANTUM;
   1240 
   1241 	 /* paranoia ... */
   1242 	 vg_assert(tst->tid == tid);
   1243 	 vg_assert(tst->os_state.lwpid == VG_(gettid)());
   1244       }
   1245 
   1246       /* For stats purposes only. */
   1247       n_scheduling_events_MINOR++;
   1248 
   1249       if (0)
   1250          VG_(message)(Vg_DebugMsg, "thread %d: running for %d bbs\n",
   1251                                    tid, dispatch_ctr - 1 );
   1252 
   1253       HWord trc[2]; /* "two_words" */
   1254       run_thread_for_a_while( &trc[0],
   1255                               &dispatch_ctr,
   1256                               tid, 0/*ignored*/, False );
   1257 
   1258       if (VG_(clo_trace_sched) && VG_(clo_verbosity) > 2) {
   1259 	 HChar buf[50];
   1260 	 VG_(sprintf)(buf, "TRC: %s", name_of_sched_event(trc[0]));
   1261 	 print_sched_event(tid, buf);
   1262       }
   1263 
   1264       if (trc[0] == VEX_TRC_JMP_NOREDIR) {
   1265          /* If we got a request to run a no-redir version of
   1266             something, do so now -- handle_noredir_jump just (creates
   1267             and) runs that one translation.  The flip side is that the
   1268             noredir translation can't itself return another noredir
   1269             request -- that would be nonsensical.  It can, however,
   1270             return VG_TRC_BORING, which just means keep going as
   1271             normal. */
   1272          /* Note that the fact that we need to continue with a
   1273             no-redir jump is not recorded anywhere else in this
   1274             thread's state.  So we *must* execute the block right now
   1275             -- we can't fail to execute it and later resume with it,
   1276             because by then we'll have forgotten the fact that it
   1277             should be run as no-redir, but will get run as a normal
   1278             potentially-redir'd, hence screwing up.  This really ought
   1279             to be cleaned up, by noting in the guest state that the
   1280             next block to be executed should be no-redir.  Then we can
   1281             suspend and resume at any point, which isn't the case at
   1282             the moment. */
   1283          handle_noredir_jump( &trc[0],
   1284                               &dispatch_ctr,
   1285                               tid );
   1286          vg_assert(trc[0] != VEX_TRC_JMP_NOREDIR);
   1287 
   1288          /* This can't be allowed to happen, since it means the block
   1289             didn't execute, and we have no way to resume-as-noredir
   1290             after we get more timeslice.  But I don't think it ever
   1291             can, since handle_noredir_jump will assert if the counter
   1292             is zero on entry. */
   1293          vg_assert(trc[0] != VG_TRC_INNER_COUNTERZERO);
   1294 
   1295          /* A no-redir translation can't return with a chain-me
   1296             request, since chaining in the no-redir cache is too
   1297             complex. */
   1298          vg_assert(trc[0] != VG_TRC_CHAIN_ME_TO_SLOW_EP
   1299                    && trc[0] != VG_TRC_CHAIN_ME_TO_FAST_EP);
   1300       }
   1301 
   1302       switch (trc[0]) {
   1303       case VEX_TRC_JMP_BORING:
   1304          /* assisted dispatch, no event.  Used by no-redir
   1305             translations to force return to the scheduler. */
   1306       case VG_TRC_BORING:
   1307          /* no special event, just keep going. */
   1308          break;
   1309 
   1310       case VG_TRC_INNER_FASTMISS:
   1311 	 vg_assert(dispatch_ctr > 0);
   1312 	 handle_tt_miss(tid);
   1313 	 break;
   1314 
   1315       case VG_TRC_CHAIN_ME_TO_SLOW_EP: {
   1316          if (0) VG_(printf)("sched: CHAIN_TO_SLOW_EP: %p\n", (void*)trc[1] );
   1317          handle_chain_me(tid, (void*)trc[1], False);
   1318          break;
   1319       }
   1320 
   1321       case VG_TRC_CHAIN_ME_TO_FAST_EP: {
   1322          if (0) VG_(printf)("sched: CHAIN_TO_FAST_EP: %p\n", (void*)trc[1] );
   1323          handle_chain_me(tid, (void*)trc[1], True);
   1324          break;
   1325       }
   1326 
   1327       case VEX_TRC_JMP_CLIENTREQ:
   1328 	 do_client_request(tid);
   1329 	 break;
   1330 
   1331       case VEX_TRC_JMP_SYS_INT128:  /* x86-linux */
   1332       case VEX_TRC_JMP_SYS_INT129:  /* x86-darwin */
   1333       case VEX_TRC_JMP_SYS_INT130:  /* x86-darwin */
   1334       case VEX_TRC_JMP_SYS_SYSCALL: /* amd64-linux, ppc32-linux, amd64-darwin */
   1335 	 handle_syscall(tid, trc[0]);
   1336 	 if (VG_(clo_sanity_level) > 2)
   1337 	    VG_(sanity_check_general)(True); /* sanity-check every syscall */
   1338 	 break;
   1339 
   1340       case VEX_TRC_JMP_YIELD:
   1341 	 /* Explicit yield, because this thread is in a spin-lock
   1342 	    or something.  Only let the thread run for a short while
   1343             longer.  Because swapping to another thread is expensive,
   1344             we're prepared to let this thread eat a little more CPU
   1345             before swapping to another.  That means that short term
   1346             spins waiting for hardware to poke memory won't cause a
   1347             thread swap. */
   1348 	 if (dispatch_ctr > 2000)
   1349             dispatch_ctr = 2000;
   1350 	 break;
   1351 
   1352       case VG_TRC_INNER_COUNTERZERO:
   1353 	 /* Timeslice is out.  Let a new thread be scheduled. */
   1354 	 vg_assert(dispatch_ctr == 0);
   1355 	 break;
   1356 
   1357       case VG_TRC_FAULT_SIGNAL:
   1358 	 /* Everything should be set up (either we're exiting, or
   1359 	    about to start in a signal handler). */
   1360 	 break;
   1361 
   1362       case VEX_TRC_JMP_MAPFAIL:
   1363          /* Failure of arch-specific address translation (x86/amd64
   1364             segment override use) */
   1365          /* jrs 2005 03 11: is this correct? */
   1366          VG_(synth_fault)(tid);
   1367          break;
   1368 
   1369       case VEX_TRC_JMP_EMWARN: {
   1370          static Int  counts[EmWarn_NUMBER];
   1371          static Bool counts_initted = False;
   1372          VexEmWarn ew;
   1373          HChar*    what;
   1374          Bool      show;
   1375          Int       q;
   1376          if (!counts_initted) {
   1377             counts_initted = True;
   1378             for (q = 0; q < EmWarn_NUMBER; q++)
   1379                counts[q] = 0;
   1380          }
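         /* Rate-limiting: each known warning kind is reported at most
            three times per run (see the counts[] test below); unknown,
            out-of-range kinds are always shown. */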
   1381          ew   = (VexEmWarn)VG_(threads)[tid].arch.vex.guest_EMWARN;
   1382          what = (ew < 0 || ew >= EmWarn_NUMBER)
   1383                    ? "unknown (?!)"
   1384                    : LibVEX_EmWarn_string(ew);
   1385          show = (ew < 0 || ew >= EmWarn_NUMBER)
   1386                    ? True
   1387                    : counts[ew]++ < 3;
   1388          if (show && VG_(clo_show_emwarns) && !VG_(clo_xml)) {
   1389             VG_(message)( Vg_UserMsg,
   1390                           "Emulation warning: unsupported action:\n");
   1391             VG_(message)( Vg_UserMsg, "  %s\n", what);
   1392             VG_(get_and_pp_StackTrace)( tid, VG_(clo_backtrace_size) );
   1393          }
   1394          break;
   1395       }
   1396 
   1397       case VEX_TRC_JMP_EMFAIL: {
   1398          VexEmWarn ew;
   1399          HChar*    what;
   1400          ew   = (VexEmWarn)VG_(threads)[tid].arch.vex.guest_EMWARN;
   1401          what = (ew < 0 || ew >= EmWarn_NUMBER)
   1402                    ? "unknown (?!)"
   1403                    : LibVEX_EmWarn_string(ew);
   1404          VG_(message)( Vg_UserMsg,
   1405                        "Emulation fatal error -- Valgrind cannot continue:\n");
   1406          VG_(message)( Vg_UserMsg, "  %s\n", what);
   1407          VG_(get_and_pp_StackTrace)( tid, VG_(clo_backtrace_size) );
   1408          VG_(message)(Vg_UserMsg, "\n");
   1409          VG_(message)(Vg_UserMsg, "Valgrind has to exit now.  Sorry.\n");
   1410          VG_(message)(Vg_UserMsg, "\n");
   1411          VG_(exit)(1);
   1412          break;
   1413       }
   1414 
   1415       case VEX_TRC_JMP_SIGTRAP:
   1416          VG_(synth_sigtrap)(tid);
   1417          break;
   1418 
   1419       case VEX_TRC_JMP_SIGSEGV:
   1420          VG_(synth_fault)(tid);
   1421          break;
   1422 
   1423       case VEX_TRC_JMP_SIGBUS:
   1424          VG_(synth_sigbus)(tid);
   1425          break;
   1426 
   1427       case VEX_TRC_JMP_NODECODE: {
   1428          Addr addr = VG_(get_IP)(tid);
   1429 
   1430          VG_(umsg)(
   1431             "valgrind: Unrecognised instruction at address %#lx.\n", addr);
   1432          VG_(get_and_pp_StackTrace)(tid, VG_(clo_backtrace_size));
    1433 #define M(a) VG_(umsg)(a "\n")
   1434    M("Your program just tried to execute an instruction that Valgrind" );
   1435    M("did not recognise.  There are two possible reasons for this."    );
   1436    M("1. Your program has a bug and erroneously jumped to a non-code"  );
   1437    M("   location.  If you are running Memcheck and you just saw a"    );
   1438    M("   warning about a bad jump, it's probably your program's fault.");
   1439    M("2. The instruction is legitimate but Valgrind doesn't handle it,");
   1440    M("   i.e. it's Valgrind's fault.  If you think this is the case or");
   1441    M("   you are not sure, please let us know and we'll try to fix it.");
   1442    M("Either way, Valgrind will now raise a SIGILL signal which will"  );
   1443    M("probably kill your program."                                     );
   1444 #undef M
   1445 
   1446 #if defined(VGA_s390x)
   1447          /* Now that the complaint is out we need to adjust the guest_IA. The
   1448             reason is that -- after raising the exception -- execution will
   1449             continue with the insn that follows the invalid insn. As the first
   1450             2 bits of the invalid insn determine its length in the usual way,
   1451             we can compute the address of the next insn here and adjust the
   1452             guest_IA accordingly. This adjustment is essential and tested by
   1453             none/tests/s390x/op_exception.c (which would loop forever
   1454             otherwise) */
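         /* Worked example of the length computation below: the top two
            bits of the first opcode byte give the insn length in the
            usual s390x way,
               00 -> 2 bytes, 01 -> 4, 10 -> 4, 11 -> 6,
            which is exactly what ((((byte >> 6) + 1) >> 1) + 1) << 1
            produces for byte >> 6 == 0, 1, 2, 3. */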
   1455          UChar byte = ((UChar *)addr)[0];
   1456          UInt  insn_length = ((((byte >> 6) + 1) >> 1) + 1) << 1;
   1457          Addr  next_insn_addr = addr + insn_length;
   1458 
   1459          VG_(set_IP)(tid, next_insn_addr);
   1460 #endif
   1461          VG_(synth_sigill)(tid, addr);
   1462          break;
   1463       }
   1464       case VEX_TRC_JMP_TINVAL:
   1465          VG_(discard_translations)(
   1466             (Addr64)VG_(threads)[tid].arch.vex.guest_TISTART,
   1467             VG_(threads)[tid].arch.vex.guest_TILEN,
   1468             "scheduler(VEX_TRC_JMP_TINVAL)"
   1469          );
   1470          if (0)
    1471             VG_(printf)("discard translations done.\n");
   1472          break;
   1473 
   1474       case VG_TRC_INVARIANT_FAILED:
   1475          /* This typically happens if, after running generated code,
   1476             it is detected that host CPU settings (eg, FPU/Vector
   1477             control words) are not as they should be.  Vex's code
   1478             generation specifies the state such control words should
   1479             be in on entry to Vex-generated code, and they should be
   1480             unchanged on exit from it.  Failure of this assertion
   1481             usually means a bug in Vex's code generation. */
   1482          //{ UInt xx;
   1483          //  __asm__ __volatile__ (
   1484          //     "\t.word 0xEEF12A10\n"  // fmrx r2,fpscr
   1485          //     "\tmov %0, r2" : "=r"(xx) : : "r2" );
   1486          //  VG_(printf)("QQQQ new fpscr = %08x\n", xx);
   1487          //}
   1488          vg_assert2(0, "VG_(scheduler), phase 3: "
    1489                        "the dispatcher detected host "
   1490                        "state invariant failure", trc);
   1491 
   1492       case VEX_TRC_JMP_SYS_SYSENTER:
   1493          /* Do whatever simulation is appropriate for an x86 sysenter
   1494             instruction.  Note that it is critical to set this thread's
   1495             guest_EIP to point at the code to execute after the
   1496             sysenter, since Vex-generated code will not have set it --
   1497             vex does not know what it should be.  Vex sets the next
   1498             address to zero, so if you don't set guest_EIP, the thread
   1499             will jump to zero afterwards and probably die as a result. */
   1500 #        if defined(VGP_x86_linux)
   1501          vg_assert2(0, "VG_(scheduler), phase 3: "
   1502                        "sysenter_x86 on x86-linux is not supported");
   1503 #        elif defined(VGP_x86_darwin)
   1504          /* return address in client edx */
   1505          VG_(threads)[tid].arch.vex.guest_EIP
   1506             = VG_(threads)[tid].arch.vex.guest_EDX;
   1507          handle_syscall(tid, trc[0]);
   1508 #        else
   1509          vg_assert2(0, "VG_(scheduler), phase 3: "
   1510                        "sysenter_x86 on non-x86 platform?!?!");
   1511 #        endif
   1512          break;
   1513 
   1514       default:
   1515 	 vg_assert2(0, "VG_(scheduler), phase 3: "
   1516                        "unexpected thread return code (%u)", trc[0]);
   1517 	 /* NOTREACHED */
   1518 	 break;
   1519 
   1520       } /* switch (trc) */
   1521 
   1522       if (0)
   1523          maybe_show_sb_counts();
   1524    }
   1525 
   1526    if (VG_(clo_trace_sched))
   1527       print_sched_event(tid, "exiting VG_(scheduler)");
   1528 
   1529    vg_assert(VG_(is_exiting)(tid));
   1530 
   1531    return tst->exitreason;
   1532 }
   1533 
   1534 
   1535 /*
    1536    This causes all threads to forcibly exit.  They aren't actually
   1537    dead by the time this returns; you need to call
   1538    VG_(reap_threads)() to wait for them.
   1539  */
   1540 void VG_(nuke_all_threads_except) ( ThreadId me, VgSchedReturnCode src )
   1541 {
   1542    ThreadId tid;
   1543 
   1544    vg_assert(VG_(is_running_thread)(me));
   1545 
   1546    for (tid = 1; tid < VG_N_THREADS; tid++) {
   1547       if (tid == me
   1548           || VG_(threads)[tid].status == VgTs_Empty)
   1549          continue;
   1550       if (0)
   1551          VG_(printf)(
   1552             "VG_(nuke_all_threads_except): nuking tid %d\n", tid);
   1553 
   1554       VG_(threads)[tid].exitreason = src;
   1555       if (src == VgSrc_FatalSig)
   1556          VG_(threads)[tid].os_state.fatalsig = VKI_SIGKILL;
   1557       VG_(get_thread_out_of_syscall)(tid);
   1558    }
   1559 }
   1560 
   1561 
   1562 /* ---------------------------------------------------------------------
   1563    Specifying shadow register values
   1564    ------------------------------------------------------------------ */
   1565 
   1566 #if defined(VGA_x86)
   1567 #  define VG_CLREQ_ARGS       guest_EAX
   1568 #  define VG_CLREQ_RET        guest_EDX
   1569 #elif defined(VGA_amd64)
   1570 #  define VG_CLREQ_ARGS       guest_RAX
   1571 #  define VG_CLREQ_RET        guest_RDX
   1572 #elif defined(VGA_ppc32) || defined(VGA_ppc64)
   1573 #  define VG_CLREQ_ARGS       guest_GPR4
   1574 #  define VG_CLREQ_RET        guest_GPR3
   1575 #elif defined(VGA_arm)
   1576 #  define VG_CLREQ_ARGS       guest_R4
   1577 #  define VG_CLREQ_RET        guest_R3
   1578 #elif defined (VGA_s390x)
   1579 #  define VG_CLREQ_ARGS       guest_r2
   1580 #  define VG_CLREQ_RET        guest_r3
   1581 #elif defined(VGA_mips32)
   1582 #  define VG_CLREQ_ARGS       guest_r12
   1583 #  define VG_CLREQ_RET        guest_r11
   1584 #else
   1585 #  error Unknown arch
   1586 #endif
   1587 
   1588 #define CLREQ_ARGS(regs)   ((regs).vex.VG_CLREQ_ARGS)
   1589 #define CLREQ_RET(regs)    ((regs).vex.VG_CLREQ_RET)
   1590 #define O_CLREQ_RET        (offsetof(VexGuestArchState, VG_CLREQ_RET))
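// In other words: on each target, VG_CLREQ_ARGS names the guest register
// holding the pointer to the client's UWord argument block (read below in
// do_client_request via CLREQ_ARGS), and VG_CLREQ_RET names the register
// that the result is written back to via SET_CLREQ_RETVAL/SET_CLCALL_RETVAL.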
   1591 
   1592 // These macros write a value to a client's thread register, and tell the
   1593 // tool that it's happened (if necessary).
   1594 
   1595 #define SET_CLREQ_RETVAL(zztid, zzval) \
   1596    do { CLREQ_RET(VG_(threads)[zztid].arch) = (zzval); \
   1597         VG_TRACK( post_reg_write, \
   1598                   Vg_CoreClientReq, zztid, O_CLREQ_RET, sizeof(UWord)); \
   1599    } while (0)
   1600 
   1601 #define SET_CLCALL_RETVAL(zztid, zzval, f) \
   1602    do { CLREQ_RET(VG_(threads)[zztid].arch) = (zzval); \
   1603         VG_TRACK( post_reg_write_clientcall_return, \
   1604                   zztid, O_CLREQ_RET, sizeof(UWord), f); \
   1605    } while (0)
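// For illustration, a typical request handler below hands a value back to
// the client simply with, e.g.
//
//    SET_CLREQ_RETVAL( tid, VG_(get_n_errs_found)() );
//
// which writes the guest return register and notifies the tool of that
// register write via VG_TRACK.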
   1606 
   1607 
   1608 /* ---------------------------------------------------------------------
   1609    Handle client requests.
   1610    ------------------------------------------------------------------ */
   1611 
   1612 // OS-specific(?) client requests
   1613 static Bool os_client_request(ThreadId tid, UWord *args)
   1614 {
   1615    Bool handled = True;
   1616 
   1617    vg_assert(VG_(is_running_thread)(tid));
   1618 
   1619    switch(args[0]) {
   1620    case VG_USERREQ__LIBC_FREERES_DONE:
   1621       /* This is equivalent to an exit() syscall, but we don't set the
   1622 	 exitcode (since it might already be set) */
   1623       if (0 || VG_(clo_trace_syscalls) || VG_(clo_trace_sched))
   1624          VG_(message)(Vg_DebugMsg,
   1625                       "__libc_freeres() done; really quitting!\n");
   1626       VG_(threads)[tid].exitreason = VgSrc_ExitThread;
   1627       break;
   1628 
   1629    default:
   1630       handled = False;
   1631       break;
   1632    }
   1633 
   1634    return handled;
   1635 }
   1636 
   1637 
   1638 /* Do a client request for the thread tid.  After the request, tid may
   1639    or may not still be runnable; if not, the scheduler will have to
   1640    choose a new thread to run.
   1641 */
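/* For illustration only (valgrind.h is the authoritative reference):
   client code normally reaches this point via the wrapper macros in
   valgrind.h, e.g.

      int on_vg = RUNNING_ON_VALGRIND;

   which emit the magic client-request instruction sequence.  The
   dispatcher recognises that sequence and hands VEX_TRC_JMP_CLIENTREQ
   back to the scheduler, which calls here with arg[0] holding the
   request number and the remaining arg[] slots holding its operands. */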
   1642 static
   1643 void do_client_request ( ThreadId tid )
   1644 {
   1645    UWord* arg = (UWord*)(CLREQ_ARGS(VG_(threads)[tid].arch));
   1646    UWord req_no = arg[0];
   1647 
   1648    if (0)
   1649       VG_(printf)("req no = 0x%llx, arg = %p\n", (ULong)req_no, arg);
   1650    switch (req_no) {
   1651 
   1652       case VG_USERREQ__CLIENT_CALL0: {
   1653          UWord (*f)(ThreadId) = (void*)arg[1];
   1654 	 if (f == NULL)
   1655 	    VG_(message)(Vg_DebugMsg, "VG_USERREQ__CLIENT_CALL0: func=%p\n", f);
   1656 	 else
   1657 	    SET_CLCALL_RETVAL(tid, f ( tid ), (Addr)f);
   1658          break;
   1659       }
   1660       case VG_USERREQ__CLIENT_CALL1: {
   1661          UWord (*f)(ThreadId, UWord) = (void*)arg[1];
   1662 	 if (f == NULL)
   1663 	    VG_(message)(Vg_DebugMsg, "VG_USERREQ__CLIENT_CALL1: func=%p\n", f);
   1664 	 else
   1665 	    SET_CLCALL_RETVAL(tid, f ( tid, arg[2] ), (Addr)f );
   1666          break;
   1667       }
   1668       case VG_USERREQ__CLIENT_CALL2: {
   1669          UWord (*f)(ThreadId, UWord, UWord) = (void*)arg[1];
   1670 	 if (f == NULL)
   1671 	    VG_(message)(Vg_DebugMsg, "VG_USERREQ__CLIENT_CALL2: func=%p\n", f);
   1672 	 else
   1673 	    SET_CLCALL_RETVAL(tid, f ( tid, arg[2], arg[3] ), (Addr)f );
   1674          break;
   1675       }
   1676       case VG_USERREQ__CLIENT_CALL3: {
   1677          UWord (*f)(ThreadId, UWord, UWord, UWord) = (void*)arg[1];
   1678 	 if (f == NULL)
   1679 	    VG_(message)(Vg_DebugMsg, "VG_USERREQ__CLIENT_CALL3: func=%p\n", f);
   1680 	 else
   1681 	    SET_CLCALL_RETVAL(tid, f ( tid, arg[2], arg[3], arg[4] ), (Addr)f );
   1682          break;
   1683       }
   1684 
   1685       // Nb: this looks like a circular definition, because it kind of is.
   1686       // See comment in valgrind.h to understand what's going on.
   1687       case VG_USERREQ__RUNNING_ON_VALGRIND:
   1688          SET_CLREQ_RETVAL(tid, RUNNING_ON_VALGRIND+1);
   1689          break;
   1690 
   1691       case VG_USERREQ__PRINTF: {
   1692          /* JRS 2010-Jan-28: this is DEPRECATED; use the
   1693             _VALIST_BY_REF version instead */
   1694          if (sizeof(va_list) != sizeof(UWord))
   1695             goto va_list_casting_error_NORETURN;
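         /* The union below reinterprets the UWord in arg[2] as a va_list.
            That is only meaningful where a va_list fits in a single
            machine word (checked just above), which is why the
            _VALIST_BY_REF variants are preferred. */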
   1696          union {
   1697             va_list vargs;
   1698             unsigned long uw;
   1699          } u;
   1700          u.uw = (unsigned long)arg[2];
   1701          Int count =
   1702             VG_(vmessage)( Vg_ClientMsg, (char *)arg[1], u.vargs );
   1703          VG_(message_flush)();
   1704          SET_CLREQ_RETVAL( tid, count );
   1705          break;
   1706       }
   1707 
   1708       case VG_USERREQ__PRINTF_BACKTRACE: {
   1709          /* JRS 2010-Jan-28: this is DEPRECATED; use the
   1710             _VALIST_BY_REF version instead */
   1711          if (sizeof(va_list) != sizeof(UWord))
   1712             goto va_list_casting_error_NORETURN;
   1713          union {
   1714             va_list vargs;
   1715             unsigned long uw;
   1716          } u;
   1717          u.uw = (unsigned long)arg[2];
   1718          Int count =
   1719             VG_(vmessage)( Vg_ClientMsg, (char *)arg[1], u.vargs );
   1720          VG_(message_flush)();
   1721          VG_(get_and_pp_StackTrace)( tid, VG_(clo_backtrace_size) );
   1722          SET_CLREQ_RETVAL( tid, count );
   1723          break;
   1724       }
   1725 
   1726       case VG_USERREQ__PRINTF_VALIST_BY_REF: {
   1727          va_list* vargsp = (va_list*)arg[2];
   1728          Int count =
   1729             VG_(vmessage)( Vg_ClientMsg, (char *)arg[1], *vargsp );
   1730          VG_(message_flush)();
   1731          SET_CLREQ_RETVAL( tid, count );
   1732          break;
   1733       }
   1734 
   1735       case VG_USERREQ__PRINTF_BACKTRACE_VALIST_BY_REF: {
   1736          va_list* vargsp = (va_list*)arg[2];
   1737          Int count =
   1738             VG_(vmessage)( Vg_ClientMsg, (char *)arg[1], *vargsp );
   1739          VG_(message_flush)();
   1740          VG_(get_and_pp_StackTrace)( tid, VG_(clo_backtrace_size) );
   1741          SET_CLREQ_RETVAL( tid, count );
   1742          break;
   1743       }
   1744 
   1745       case VG_USERREQ__INTERNAL_PRINTF_VALIST_BY_REF: {
   1746          va_list* vargsp = (va_list*)arg[2];
   1747          Int count =
   1748             VG_(vmessage)( Vg_DebugMsg, (char *)arg[1], *vargsp );
   1749          VG_(message_flush)();
   1750          SET_CLREQ_RETVAL( tid, count );
   1751          break;
   1752       }
   1753 
   1754       case VG_USERREQ__ADD_IFUNC_TARGET: {
   1755          VG_(redir_add_ifunc_target)( arg[1], arg[2] );
   1756          SET_CLREQ_RETVAL( tid, 0);
   1757          break; }
   1758 
   1759       case VG_USERREQ__STACK_REGISTER: {
   1760          UWord sid = VG_(register_stack)((Addr)arg[1], (Addr)arg[2]);
   1761          SET_CLREQ_RETVAL( tid, sid );
   1762          break; }
   1763 
   1764       case VG_USERREQ__STACK_DEREGISTER: {
   1765          VG_(deregister_stack)(arg[1]);
   1766          SET_CLREQ_RETVAL( tid, 0 );     /* return value is meaningless */
   1767          break; }
   1768 
   1769       case VG_USERREQ__STACK_CHANGE: {
   1770          VG_(change_stack)(arg[1], (Addr)arg[2], (Addr)arg[3]);
   1771          SET_CLREQ_RETVAL( tid, 0 );     /* return value is meaningless */
   1772          break; }
   1773 
   1774       case VG_USERREQ__GET_MALLOCFUNCS: {
   1775 	 struct vg_mallocfunc_info *info = (struct vg_mallocfunc_info *)arg[1];
   1776 
   1777 	 info->tl_malloc               = VG_(tdict).tool_malloc;
   1778 	 info->tl_calloc               = VG_(tdict).tool_calloc;
   1779 	 info->tl_realloc              = VG_(tdict).tool_realloc;
   1780 	 info->tl_memalign             = VG_(tdict).tool_memalign;
   1781 	 info->tl___builtin_new        = VG_(tdict).tool___builtin_new;
   1782 	 info->tl___builtin_vec_new    = VG_(tdict).tool___builtin_vec_new;
   1783 	 info->tl_free                 = VG_(tdict).tool_free;
   1784 	 info->tl___builtin_delete     = VG_(tdict).tool___builtin_delete;
   1785 	 info->tl___builtin_vec_delete = VG_(tdict).tool___builtin_vec_delete;
   1786          info->tl_malloc_usable_size   = VG_(tdict).tool_malloc_usable_size;
   1787 
   1788 	 info->mallinfo                = VG_(mallinfo);
   1789 	 info->clo_trace_malloc        = VG_(clo_trace_malloc);
   1790 
   1791          SET_CLREQ_RETVAL( tid, 0 );     /* return value is meaningless */
   1792 
   1793 	 break;
   1794       }
   1795 
   1796       /* Requests from the client program */
   1797 
   1798       case VG_USERREQ__DISCARD_TRANSLATIONS:
   1799          if (VG_(clo_verbosity) > 2)
   1800             VG_(printf)( "client request: DISCARD_TRANSLATIONS,"
   1801                          " addr %p,  len %lu\n",
   1802                          (void*)arg[1], arg[2] );
   1803 
   1804          VG_(discard_translations)(
   1805             arg[1], arg[2], "scheduler(VG_USERREQ__DISCARD_TRANSLATIONS)"
   1806          );
   1807 
   1808          SET_CLREQ_RETVAL( tid, 0 );     /* return value is meaningless */
   1809 	 break;
   1810 
   1811       case VG_USERREQ__COUNT_ERRORS:
   1812          SET_CLREQ_RETVAL( tid, VG_(get_n_errs_found)() );
   1813          break;
   1814 
   1815       case VG_USERREQ__LOAD_PDB_DEBUGINFO:
   1816          VG_(di_notify_pdb_debuginfo)( arg[1], arg[2], arg[3], arg[4] );
   1817          SET_CLREQ_RETVAL( tid, 0 );     /* return value is meaningless */
   1818          break;
   1819 
   1820       case VG_USERREQ__MAP_IP_TO_SRCLOC: {
   1821          Addr   ip    = arg[1];
   1822          UChar* buf64 = (UChar*)arg[2];
   1823 
   1824          VG_(memset)(buf64, 0, 64);
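         /* Buffer layout: the filename goes in the first 50 bytes of the
            client-supplied 64-byte buffer, and ":<linenum>" is appended
            at the terminating zero.  Even in the worst case (zero at
            index 49, a 10-digit line number) that adds only 12 more
            bytes, which still fits in the 64. */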
   1825          UInt linenum = 0;
   1826          Bool ok = VG_(get_filename_linenum)(
   1827                       ip, &buf64[0], 50, NULL, 0, NULL, &linenum
   1828                    );
   1829          if (ok) {
   1830             /* Find the terminating zero in the first 50 bytes. */
   1831             UInt i;
   1832             for (i = 0; i < 50; i++) {
   1833                if (buf64[i] == 0)
   1834                   break;
   1835             }
   1836             /* We must find a zero somewhere in 0 .. 49.  Else
   1837                VG_(get_filename_linenum) is not properly zero
   1838                terminating. */
   1839             vg_assert(i < 50);
   1840             VG_(sprintf)(&buf64[i], ":%u", linenum);
   1841          } else {
   1842             buf64[0] = 0;
   1843          }
   1844 
   1845          SET_CLREQ_RETVAL( tid, 0 ); /* return value is meaningless */
   1846          break;
   1847       }
   1848 
   1849       case VG_USERREQ__CHANGE_ERR_DISABLEMENT: {
   1850          Word delta = arg[1];
   1851          vg_assert(delta == 1 || delta == -1);
   1852          ThreadState* tst = VG_(get_ThreadState)(tid);
   1853          vg_assert(tst);
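         /* Saturating update: the level can neither wrap below zero nor
            overflow past 0xFFFFFFFF, however unbalanced the client's
            enable/disable requests are. */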
   1854          if (delta == 1 && tst->err_disablement_level < 0xFFFFFFFF) {
   1855             tst->err_disablement_level++;
   1856          }
   1857          else
   1858          if (delta == -1 && tst->err_disablement_level > 0) {
   1859             tst->err_disablement_level--;
   1860          }
   1861          SET_CLREQ_RETVAL( tid, 0 ); /* return value is meaningless */
   1862          break;
   1863       }
   1864 
   1865       case VG_USERREQ__MALLOCLIKE_BLOCK:
   1866       case VG_USERREQ__RESIZEINPLACE_BLOCK:
   1867       case VG_USERREQ__FREELIKE_BLOCK:
   1868          // Ignore them if the addr is NULL;  otherwise pass onto the tool.
   1869          if (!arg[1]) {
   1870             SET_CLREQ_RETVAL( tid, 0 );     /* return value is meaningless */
   1871             break;
   1872          } else {
   1873             goto my_default;
   1874          }
   1875 
   1876       default:
   1877        my_default:
   1878 	 if (os_client_request(tid, arg)) {
   1879 	    // do nothing, os_client_request() handled it
   1880          } else if (VG_(needs).client_requests) {
   1881 	    UWord ret;
   1882 
   1883             if (VG_(clo_verbosity) > 2)
   1884                VG_(printf)("client request: code %lx,  addr %p,  len %lu\n",
   1885                            arg[0], (void*)arg[1], arg[2] );
   1886 
   1887 	    if ( VG_TDICT_CALL(tool_handle_client_request, tid, arg, &ret) )
   1888 	       SET_CLREQ_RETVAL(tid, ret);
   1889          } else {
   1890 	    static Bool whined = False;
   1891 
   1892 	    if (!whined && VG_(clo_verbosity) > 2) {
    1893                // Allow for requests in the core's range but defined by tools,
    1894                // which have zero in each of their two high bytes.
   1895                Char c1 = (arg[0] >> 24) & 0xff;
   1896                Char c2 = (arg[0] >> 16) & 0xff;
   1897                if (c1 == 0) c1 = '_';
   1898                if (c2 == 0) c2 = '_';
   1899 	       VG_(message)(Vg_UserMsg, "Warning:\n"
   1900                    "  unhandled client request: 0x%lx (%c%c+0x%lx).  Perhaps\n"
   1901 		   "  VG_(needs).client_requests should be set?\n",
   1902 			    arg[0], c1, c2, arg[0] & 0xffff);
   1903 	       whined = True;
   1904 	    }
   1905          }
   1906          break;
   1907    }
   1908    return;
   1909 
   1910    /*NOTREACHED*/
   1911   va_list_casting_error_NORETURN:
   1912    VG_(umsg)(
   1913       "Valgrind: fatal error - cannot continue: use of the deprecated\n"
   1914       "client requests VG_USERREQ__PRINTF or VG_USERREQ__PRINTF_BACKTRACE\n"
   1915       "on a platform where they cannot be supported.  Please use the\n"
   1916       "equivalent _VALIST_BY_REF versions instead.\n"
   1917       "\n"
   1918       "This is a binary-incompatible change in Valgrind's client request\n"
   1919       "mechanism.  It is unfortunate, but difficult to avoid.  End-users\n"
   1920       "are expected to almost never see this message.  The only case in\n"
   1921       "which you might see this message is if your code uses the macros\n"
   1922       "VALGRIND_PRINTF or VALGRIND_PRINTF_BACKTRACE.  If so, you will need\n"
   1923       "to recompile such code, using the header files from this version of\n"
   1924       "Valgrind, and not any previous version.\n"
   1925       "\n"
    1926       "If you see this message in any other circumstances, it is probably\n"
   1927       "a bug in Valgrind.  In this case, please file a bug report at\n"
   1928       "\n"
   1929       "   http://www.valgrind.org/support/bug_reports.html\n"
   1930       "\n"
   1931       "Will now abort.\n"
   1932    );
   1933    vg_assert(0);
   1934 }
   1935 
   1936 
   1937 /* ---------------------------------------------------------------------
   1938    Sanity checking (permanently engaged)
   1939    ------------------------------------------------------------------ */
   1940 
   1941 /* Internal consistency checks on the sched structures. */
   1942 static
   1943 void scheduler_sanity ( ThreadId tid )
   1944 {
   1945    Bool bad = False;
   1946    static UInt lasttime = 0;
   1947    UInt now;
   1948    Int lwpid = VG_(gettid)();
   1949 
   1950    if (!VG_(is_running_thread)(tid)) {
   1951       VG_(message)(Vg_DebugMsg,
   1952 		   "Thread %d is supposed to be running, "
   1953                    "but doesn't own the_BigLock (owned by %d)\n",
   1954 		   tid, VG_(running_tid));
   1955       bad = True;
   1956    }
   1957 
   1958    if (lwpid != VG_(threads)[tid].os_state.lwpid) {
   1959       VG_(message)(Vg_DebugMsg,
    1960                    "Thread %d is supposed to be in LWP %d, but we're actually %d\n",
   1961                    tid, VG_(threads)[tid].os_state.lwpid, VG_(gettid)());
   1962       bad = True;
   1963    }
   1964 
   1965    if (lwpid != ML_(get_sched_lock_owner)(the_BigLock)) {
   1966       VG_(message)(Vg_DebugMsg,
   1967                    "Thread (LWPID) %d doesn't own the_BigLock\n",
   1968                    tid);
   1969       bad = True;
   1970    }
   1971 
   1972    /* Periodically show the state of all threads, for debugging
   1973       purposes. */
   1974    now = VG_(read_millisecond_timer)();
   1975    if (0 && (!bad) && (lasttime + 4000/*ms*/ <= now)) {
   1976       lasttime = now;
   1977       VG_(printf)("\n------------ Sched State at %d ms ------------\n",
   1978                   (Int)now);
   1979       VG_(show_sched_status)();
   1980    }
   1981 
   1982    /* core_panic also shows the sched status, which is why we don't
   1983       show it above if bad==True. */
   1984    if (bad)
   1985       VG_(core_panic)("scheduler_sanity: failed");
   1986 }
   1987 
   1988 void VG_(sanity_check_general) ( Bool force_expensive )
   1989 {
   1990    ThreadId tid;
   1991 
   1992    static UInt next_slow_check_at = 1;
   1993    static UInt slow_check_interval = 25;
   1994 
   1995    if (VG_(clo_sanity_level) < 1) return;
   1996 
   1997    /* --- First do all the tests that we can do quickly. ---*/
   1998 
   1999    sanity_fast_count++;
   2000 
   2001    /* Check stuff pertaining to the memory check system. */
   2002 
   2003    /* Check that nobody has spuriously claimed that the first or
   2004       last 16 pages of memory have become accessible [...] */
   2005    if (VG_(needs).sanity_checks) {
   2006       vg_assert(VG_TDICT_CALL(tool_cheap_sanity_check));
   2007    }
   2008 
   2009    /* --- Now some more expensive checks. ---*/
   2010 
   2011    /* Once every now and again, check some more expensive stuff.
   2012       Gradually increase the interval between such checks so as not to
   2013       burden long-running programs too much. */
   2014    if ( force_expensive
   2015         || VG_(clo_sanity_level) > 1
   2016         || (VG_(clo_sanity_level) == 1
   2017             && sanity_fast_count == next_slow_check_at)) {
   2018 
   2019       if (0) VG_(printf)("SLOW at %d\n", sanity_fast_count-1);
   2020 
   2021       next_slow_check_at = sanity_fast_count - 1 + slow_check_interval;
   2022       slow_check_interval++;
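      /* Worked example of the resulting cadence: with the initial values
         above (next_slow_check_at = 1, slow_check_interval = 25), slow
         checks fire at fast-check counts 1, 25, 50, 76, 103, ..., i.e.
         the gap between them grows by one each time. */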
   2023       sanity_slow_count++;
   2024 
   2025       if (VG_(needs).sanity_checks) {
   2026           vg_assert(VG_TDICT_CALL(tool_expensive_sanity_check));
   2027       }
   2028 
   2029       /* Look for stack overruns.  Visit all threads. */
   2030       for (tid = 1; tid < VG_N_THREADS; tid++) {
   2031 	 SizeT    remains;
   2032          VgStack* stack;
   2033 
   2034 	 if (VG_(threads)[tid].status == VgTs_Empty ||
   2035 	     VG_(threads)[tid].status == VgTs_Zombie)
   2036 	    continue;
   2037 
   2038          stack
   2039             = (VgStack*)
   2040               VG_(get_ThreadState)(tid)->os_state.valgrind_stack_base;
   2041          SizeT limit
   2042             = 4096; // Let's say.  Checking more causes lots of L2 misses.
   2043 	 remains
   2044             = VG_(am_get_VgStack_unused_szB)(stack, limit);
   2045 	 if (remains < limit)
   2046 	    VG_(message)(Vg_DebugMsg,
    2047                          "WARNING: Thread %d is within %lu bytes "
   2048                          "of running out of stack!\n",
   2049 		         tid, remains);
   2050       }
   2051    }
   2052 
   2053    if (VG_(clo_sanity_level) > 1) {
   2054       /* Check sanity of the low-level memory manager.  Note that bugs
   2055          in the client's code can cause this to fail, so we don't do
   2056          this check unless specially asked for.  And because it's
   2057          potentially very expensive. */
   2058       VG_(sanity_check_malloc_all)();
   2059    }
   2060 }
   2061 
   2062 /*--------------------------------------------------------------------*/
   2063 /*--- end                                                          ---*/
   2064 /*--------------------------------------------------------------------*/
   2065