// Copyright 2006 Google Inc. All Rights Reserved.

// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at

//      http://www.apache.org/licenses/LICENSE-2.0

// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// worker.cc : individual tasks that can be run in combination to
// stress the system

#include <errno.h>
#include <pthread.h>
#include <sched.h>
#include <signal.h>
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

#include <sys/select.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/times.h>

// These are necessary, but on by default
// #define __USE_GNU
// #define __USE_LARGEFILE64
#include <fcntl.h>
#include <sys/socket.h>
#include <netdb.h>
#include <arpa/inet.h>
#include <linux/unistd.h>  // for gettid

// For size of block device
#include <sys/ioctl.h>
#include <linux/fs.h>
// For asynchronous I/O
#ifdef HAVE_LIBAIO_H
#include <libaio.h>
#endif

#include <sys/syscall.h>

#include <set>
#include <string>

// This file must work with autoconf on its public version,
// so these includes are correct.
#include "error_diag.h"  // NOLINT
#include "os.h"          // NOLINT
#include "pattern.h"     // NOLINT
#include "queue.h"       // NOLINT
#include "sat.h"         // NOLINT
#include "sattypes.h"    // NOLINT
#include "worker.h"      // NOLINT

// Syscalls
// Why, Ubuntu, do you hate gettid so badly?
#if !defined(__NR_gettid)
  #define __NR_gettid             224
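  // Note: 224 is the i386 syscall number; other architectures use different
  // numbers, but this fallback only matters when the system headers don't
  // define __NR_gettid.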
#endif

#define gettid() syscall(__NR_gettid)
#if !defined(CPU_SETSIZE)
_syscall3(int, sched_getaffinity, pid_t, pid,
          unsigned int, len, cpu_set_t*, mask)
_syscall3(int, sched_setaffinity, pid_t, pid,
          unsigned int, len, cpu_set_t*, mask)
#endif

namespace {
  // Work around the sad fact that there are two (gnu, xsi) incompatible
  // versions of strerror_r floating around google. Awesome.
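  // Roughly: the XSI version returns an int (0 on success) and fills buf,
  // while the GNU version returns a char* that may point at a static string
  // rather than buf. The pointer/integer juggling below handles either one.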
  bool sat_strerror(int err, char *buf, int len) {
    buf[0] = 0;
    char *errmsg = reinterpret_cast<char*>(strerror_r(err, buf, len));
    int retval = reinterpret_cast<int64>(errmsg);
    if (retval == 0)
      return true;
    if (retval == -1)
      return false;
    if (errmsg != buf) {
      strncpy(buf, errmsg, len);
      buf[len - 1] = 0;
    }
    return true;
  }

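  // A tag is simply the word's own virtual address, so a misplaced tag
  // reveals where the data that carried it was originally written.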
  inline uint64 addr_to_tag(void *address) {
    return reinterpret_cast<uint64>(address);
  }
}  // namespace

#if !defined(O_DIRECT)
// Sometimes this isn't available.
// Disregard if it's not defined.
  #define O_DIRECT            0
#endif

// A struct to hold captured errors, for later reporting.
struct ErrorRecord {
  uint64 actual;  // This is the actual value read.
  uint64 reread;  // This is the actual value, reread.
  uint64 expected;  // This is what it should have been.
  uint64 *vaddr;  // This is where it was (or wasn't).
  char *vbyteaddr;  // This is byte specific where the data was (or wasn't).
  uint64 paddr;  // This is the bus address, if available.
  uint64 *tagvaddr;  // This holds the tag value if this data was tagged.
  uint64 tagpaddr;  // This holds the physical address corresponding to the tag.
};

// This is a helper function to create new threads with pthreads.
static void *ThreadSpawnerGeneric(void *ptr) {
  WorkerThread *worker = static_cast<WorkerThread*>(ptr);
  worker->StartRoutine();
  return NULL;
}
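
// Typical worker lifecycle, as a sketch (not code that appears in this
// file):
//   WorkerThread thread;
//   thread.InitThread(0, sat, os, patterns, &worker_status);
//   thread.SpawnThread();   // Runs StartRoutine() on a new pthread.
//   ...
//   worker_status.StopWorkers();
//   thread.JoinThread();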

void WorkerStatus::Initialize() {
  sat_assert(0 == pthread_mutex_init(&num_workers_mutex_, NULL));
  sat_assert(0 == pthread_rwlock_init(&status_rwlock_, NULL));
#ifdef HAVE_PTHREAD_BARRIERS
  sat_assert(0 == pthread_barrier_init(&pause_barrier_, NULL,
                                       num_workers_ + 1));
#endif
}

void WorkerStatus::Destroy() {
  sat_assert(0 == pthread_mutex_destroy(&num_workers_mutex_));
  sat_assert(0 == pthread_rwlock_destroy(&status_rwlock_));
#ifdef HAVE_PTHREAD_BARRIERS
  sat_assert(0 == pthread_barrier_destroy(&pause_barrier_));
#endif
}

void WorkerStatus::PauseWorkers() {
  if (SetStatus(PAUSE) != PAUSE)
    WaitOnPauseBarrier();
}

void WorkerStatus::ResumeWorkers() {
  if (SetStatus(RUN) == PAUSE)
    WaitOnPauseBarrier();
}

void WorkerStatus::StopWorkers() {
  if (SetStatus(STOP) == PAUSE)
    WaitOnPauseBarrier();
}

bool WorkerStatus::ContinueRunning(bool *paused) {
  // This loop is an optimization.  We use it to immediately re-check the status
  // after resuming from a pause, instead of returning and waiting for the next
  // call to this function.
  if (paused) {
    *paused = false;
  }
  for (;;) {
    switch (GetStatus()) {
      case RUN:
        return true;
      case PAUSE:
        // Wait for the other workers to call this function so that
        // PauseWorkers() can return.
        WaitOnPauseBarrier();
        // Wait for ResumeWorkers() to be called.
        WaitOnPauseBarrier();
        // Indicate that a pause occurred.
        if (paused) {
          *paused = true;
        }
        break;
      case STOP:
        return false;
    }
  }
}

bool WorkerStatus::ContinueRunningNoPause() {
  return (GetStatus() != STOP);
}

void WorkerStatus::RemoveSelf() {
  // Acquire a read lock on status_rwlock_ while (status_ != PAUSE).
  for (;;) {
    AcquireStatusReadLock();
    if (status_ != PAUSE)
      break;
    // We need to obey PauseWorkers() just like ContinueRunning() would, so that
    // the other threads won't wait on pause_barrier_ forever.
    ReleaseStatusLock();
    // Wait for the other workers to call this function so that PauseWorkers()
    // can return.
    WaitOnPauseBarrier();
    // Wait for ResumeWorkers() to be called.
    WaitOnPauseBarrier();
  }

  // This lock would be unnecessary if we held a write lock instead of a read
  // lock on status_rwlock_, but that would also force all threads calling
  // ContinueRunning() to wait on this one.  Using a separate lock avoids that.
  AcquireNumWorkersLock();
  // Decrement num_workers_ and reinitialize pause_barrier_, which we know isn't
  // in use because (status != PAUSE).
#ifdef HAVE_PTHREAD_BARRIERS
  sat_assert(0 == pthread_barrier_destroy(&pause_barrier_));
  sat_assert(0 == pthread_barrier_init(&pause_barrier_, NULL, num_workers_));
#endif
  --num_workers_;
  ReleaseNumWorkersLock();

  // Release status_rwlock_.
  ReleaseStatusLock();
}


// Parent thread class.
WorkerThread::WorkerThread() {
  status_ = false;
  pages_copied_ = 0;
  errorcount_ = 0;
  runduration_usec_ = 1;
  priority_ = Normal;
  worker_status_ = NULL;
  thread_spawner_ = &ThreadSpawnerGeneric;
  tag_mode_ = false;
}

WorkerThread::~WorkerThread() {}

// Constructors. Just init some default values.
FillThread::FillThread() {
  num_pages_to_fill_ = 0;
}

// Initialize file name to empty.
FileThread::FileThread() {
  filename_ = "";
  devicename_ = "";
  pass_ = 0;
  page_io_ = true;
  crc_page_ = -1;
  local_page_ = NULL;
}

// If file thread used bounce buffer in memory, account for the extra
// copy for memory bandwidth calculation.
float FileThread::GetMemoryCopiedData() {
  if (!os_->normal_mem())
    return GetCopiedData();
  else
    return 0;
}

// Initialize target hostname to be invalid.
NetworkThread::NetworkThread() {
  snprintf(ipaddr_, sizeof(ipaddr_), "Unknown");
  sock_ = 0;
}

// Nothing extra to initialize.
NetworkSlaveThread::NetworkSlaveThread() {
}

// Nothing extra to initialize.
NetworkListenThread::NetworkListenThread() {
}

// Init member variables.
void WorkerThread::InitThread(int thread_num_init,
                              class Sat *sat_init,
                              class OsLayer *os_init,
                              class PatternList *patternlist_init,
                              WorkerStatus *worker_status) {
  sat_assert(worker_status);
  worker_status->AddWorkers(1);

  thread_num_ = thread_num_init;
  sat_ = sat_init;
  os_ = os_init;
  patternlist_ = patternlist_init;
  worker_status_ = worker_status;

  AvailableCpus(&cpu_mask_);
  tag_ = 0xffffffff;

  tag_mode_ = sat_->tag_mode();
}


// Use pthreads to prioritize a system thread.
bool WorkerThread::InitPriority() {
  // This doesn't affect performance that much, and may not be too safe.

  bool ret = BindToCpus(&cpu_mask_);
  if (!ret)
    logprintf(11, "Log: Bind to %s failed.\n",
              cpuset_format(&cpu_mask_).c_str());

  logprintf(11, "Log: Thread %d running on core ID %d mask %s (%s).\n",
            thread_num_, sched_getcpu(),
            CurrentCpusFormat().c_str(),
            cpuset_format(&cpu_mask_).c_str());
#if 0
  if (priority_ == High) {
    sched_param param;
    param.sched_priority = 1;
    // Set the priority; others are unchanged.
    logprintf(0, "Log: Changing priority to SCHED_FIFO %d\n",
              param.sched_priority);
    if (sched_setscheduler(0, SCHED_FIFO, &param)) {
      char buf[256];
      sat_strerror(errno, buf, sizeof(buf));
      logprintf(0, "Process Error: sched_setscheduler "
                   "failed - error %d %s\n",
                errno, buf);
    }
  }
#endif
  return true;
}

// Use pthreads to create a system thread.
int WorkerThread::SpawnThread() {
  // Create the new thread.
  int result = pthread_create(&thread_, NULL, thread_spawner_, this);
  if (result) {
    char buf[256];
    sat_strerror(result, buf, sizeof(buf));
    logprintf(0, "Process Error: pthread_create "
                  "failed - error %d %s\n", result,
              buf);
    status_ = false;
    return false;
  }

  // 0 is pthreads success.
  return true;
}

// Kill the worker thread with SIGINT.
bool WorkerThread::KillThread() {
  return (pthread_kill(thread_, SIGINT) == 0);
}

// Block until thread has exited.
bool WorkerThread::JoinThread() {
  int result = pthread_join(thread_, NULL);

  if (result) {
    logprintf(0, "Process Error: pthread_join failed - error %d\n", result);
    status_ = false;
  }

  // 0 is pthreads success.
  return (!result);
}


void WorkerThread::StartRoutine() {
  InitPriority();
  StartThreadTimer();
  Work();
  StopThreadTimer();
  worker_status_->RemoveSelf();
}


// Thread work loop. Execute until marked finished.
bool WorkerThread::Work() {
  do {
    logprintf(9, "Log: ...\n");
    // Sleep for 1 second.
    sat_sleep(1);
  } while (IsReadyToRun());

  return false;
}


// Returns CPU mask of CPUs available to this process.
// Conceptually, each bit represents a logical CPU, ie:
//   mask = 3  (11b):   cpu0, 1
//   mask = 13 (1101b): cpu0, 2, 3
bool WorkerThread::AvailableCpus(cpu_set_t *cpuset) {
  CPU_ZERO(cpuset);
#ifdef HAVE_SCHED_GETAFFINITY
  return sched_getaffinity(getppid(), sizeof(*cpuset), cpuset) == 0;
#else
  return 0;
#endif
}


// Returns CPU mask of CPUs this thread is bound to.
// Conceptually, each bit represents a logical CPU, ie:
//   mask = 3  (11b):   cpu0, 1
//   mask = 13 (1101b): cpu0, 2, 3
bool WorkerThread::CurrentCpus(cpu_set_t *cpuset) {
  CPU_ZERO(cpuset);
#ifdef HAVE_SCHED_GETAFFINITY
  return sched_getaffinity(0, sizeof(*cpuset), cpuset) == 0;
#else
  return 0;
#endif
}


// Bind worker thread to specified CPU(s)
//   Args:
//     thread_mask: cpu_set_t representing CPUs, ie
//                  mask = 1  (01b):   cpu0
//                  mask = 3  (11b):   cpu0, 1
//                  mask = 13 (1101b): cpu0, 2, 3
//
//   Returns true on success, false otherwise.
bool WorkerThread::BindToCpus(const cpu_set_t *thread_mask) {
  cpu_set_t process_mask;
  AvailableCpus(&process_mask);
  if (cpuset_isequal(thread_mask, &process_mask))
    return true;

  logprintf(11, "Log: available CPU mask - %s\n",
            cpuset_format(&process_mask).c_str());
  if (!cpuset_issubset(thread_mask, &process_mask)) {
    // Invalid cpu_mask, ie cpu not allocated to this process or doesn't exist.
    logprintf(0, "Log: requested CPUs %s not a subset of available %s\n",
              cpuset_format(thread_mask).c_str(),
              cpuset_format(&process_mask).c_str());
    return false;
  }
#ifdef HAVE_SCHED_GETAFFINITY
  return (sched_setaffinity(gettid(), sizeof(*thread_mask), thread_mask) == 0);
#else
  return 0;
#endif
}


// A worker thread can yield itself to give up CPU until it's scheduled again.
//   Returns true on success, false on error.
bool WorkerThread::YieldSelf() {
  return (sched_yield() == 0);
}


// Fill this page with its pattern.
bool WorkerThread::FillPage(struct page_entry *pe) {
  // Error check arguments.
  if (pe == 0) {
    logprintf(0, "Process Error: Fill Page entry null\n");
    return 0;
  }

  // Mask is the bitmask of indexes used by the pattern.
  // It is the pattern size -1. Size is always a power of 2.
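  // In tag mode, every 8th 64-bit word (one per 64-byte cache line) is
  // written with its own virtual address via addr_to_tag(); the remaining
  // words of the line hold pattern data.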
  uint64 *memwords = static_cast<uint64*>(pe->addr);
  int length = sat_->page_length();

  if (tag_mode_) {
    // Select tag or data as appropriate.
    for (int i = 0; i < length / wordsize_; i++) {
      datacast_t data;

      if ((i & 0x7) == 0) {
        data.l64 = addr_to_tag(&memwords[i]);
      } else {
        data.l32.l = pe->pattern->pattern(i << 1);
        data.l32.h = pe->pattern->pattern((i << 1) + 1);
      }
      memwords[i] = data.l64;
    }
  } else {
    // Just fill in untagged data directly.
    for (int i = 0; i < length / wordsize_; i++) {
      datacast_t data;

      data.l32.l = pe->pattern->pattern(i << 1);
      data.l32.h = pe->pattern->pattern((i << 1) + 1);
      memwords[i] = data.l64;
    }
  }

  return 1;
}


// Tell the thread how many pages to fill.
void FillThread::SetFillPages(int64 num_pages_to_fill_init) {
  num_pages_to_fill_ = num_pages_to_fill_init;
}

// Fill this page with a random pattern.
bool FillThread::FillPageRandom(struct page_entry *pe) {
  // Error check arguments.
  if (pe == 0) {
    logprintf(0, "Process Error: Fill Page entry null\n");
    return 0;
  }
  if ((patternlist_ == 0) || (patternlist_->Size() == 0)) {
    logprintf(0, "Process Error: No data patterns available\n");
    return 0;
  }

  // Choose a random pattern for this block.
  pe->pattern = patternlist_->GetRandomPattern();
  if (pe->pattern == 0) {
    logprintf(0, "Process Error: Null data pattern\n");
    return 0;
  }

  // Actually fill the page.
  return FillPage(pe);
}


// Memory fill work loop. Execute until allotted pages are filled.
bool FillThread::Work() {
  bool result = true;

  logprintf(9, "Log: Starting fill thread %d\n", thread_num_);

  // We want to fill num_pages_to_fill pages, and
  // stop when we've filled that many.
  // We also want to capture early break
  struct page_entry pe;
  int64 loops = 0;
  while (IsReadyToRun() && (loops < num_pages_to_fill_)) {
    result = result && sat_->GetEmpty(&pe);
    if (!result) {
      logprintf(0, "Process Error: fill_thread failed to pop pages, "
                "bailing\n");
      break;
    }

    // Fill the page with pattern
    result = result && FillPageRandom(&pe);
    if (!result) break;

    // Put the page back on the queue.
    result = result && sat_->PutValid(&pe);
    if (!result) {
      logprintf(0, "Process Error: fill_thread failed to push pages, "
                "bailing\n");
      break;
    }
    loops++;
  }

  // Fill in thread status.
  pages_copied_ = loops;
  status_ = result;
  logprintf(9, "Log: Completed %d: Fill thread. Status %d, %d pages filled\n",
            thread_num_, status_, pages_copied_);
  return result;
}


// Print error information about a data miscompare.
void WorkerThread::ProcessError(struct ErrorRecord *error,
                                int priority,
                                const char *message) {
  char dimm_string[256] = "";

  int core_id = sched_getcpu();

  // Determine if this is a write or read error.
  os_->Flush(error->vaddr);
  error->reread = *(error->vaddr);

  char *good = reinterpret_cast<char*>(&(error->expected));
  char *bad = reinterpret_cast<char*>(&(error->actual));

  sat_assert(error->expected != error->actual);
  unsigned int offset = 0;
  for (offset = 0; offset < (sizeof(error->expected) - 1); offset++) {
    if (good[offset] != bad[offset])
      break;
  }

  error->vbyteaddr = reinterpret_cast<char*>(error->vaddr) + offset;

  // Find physical address if possible.
  error->paddr = os_->VirtualToPhysical(error->vbyteaddr);

  // Pretty print DIMM mapping if available.
  os_->FindDimm(error->paddr, dimm_string, sizeof(dimm_string));

  // Report parseable error.
  if (priority < 5) {
    // Run miscompare error through diagnoser for logging and reporting.
    os_->error_diagnoser_->AddMiscompareError(dimm_string,
                                              reinterpret_cast<uint64>
                                              (error->vaddr), 1);

    logprintf(priority,
              "%s: miscompare on CPU %d(0x%s) at %p(0x%llx:%s): "
              "read:0x%016llx, reread:0x%016llx expected:0x%016llx\n",
              message,
              core_id,
              CurrentCpusFormat().c_str(),
              error->vaddr,
              error->paddr,
              dimm_string,
              error->actual,
              error->reread,
              error->expected);
  }


  // Overwrite incorrect data with correct data to prevent
  // future miscompares when this data is reused.
  *(error->vaddr) = error->expected;
  os_->Flush(error->vaddr);
}



// Print error information about a data miscompare.
void FileThread::ProcessError(struct ErrorRecord *error,
                              int priority,
                              const char *message) {
  char dimm_string[256] = "";

  // Determine if this is a write or read error.
  os_->Flush(error->vaddr);
  error->reread = *(error->vaddr);

  char *good = reinterpret_cast<char*>(&(error->expected));
  char *bad = reinterpret_cast<char*>(&(error->actual));

  sat_assert(error->expected != error->actual);
  unsigned int offset = 0;
  for (offset = 0; offset < (sizeof(error->expected) - 1); offset++) {
    if (good[offset] != bad[offset])
      break;
  }

  error->vbyteaddr = reinterpret_cast<char*>(error->vaddr) + offset;

  // Find physical address if possible.
  error->paddr = os_->VirtualToPhysical(error->vbyteaddr);

  // Pretty print DIMM mapping if available.
  os_->FindDimm(error->paddr, dimm_string, sizeof(dimm_string));

  // If crc_page_ is valid, ie checking content read back from file,
  // track src/dst memory addresses. Otherwise categorize as a general
  // memory miscompare for CRC checking everywhere else.
  if (crc_page_ != -1) {
    int miscompare_byteoffset = static_cast<char*>(error->vbyteaddr) -
                                static_cast<char*>(page_recs_[crc_page_].dst);
    os_->error_diagnoser_->AddHDDMiscompareError(devicename_,
                                                 crc_page_,
                                                 miscompare_byteoffset,
                                                 page_recs_[crc_page_].src,
                                                 page_recs_[crc_page_].dst);
  } else {
    os_->error_diagnoser_->AddMiscompareError(dimm_string,
                                              reinterpret_cast<uint64>
                                              (error->vaddr), 1);
  }

  logprintf(priority,
            "%s: miscompare on %s at %p(0x%llx:%s): read:0x%016llx, "
            "reread:0x%016llx expected:0x%016llx\n",
            message,
            devicename_.c_str(),
            error->vaddr,
            error->paddr,
            dimm_string,
            error->actual,
            error->reread,
            error->expected);

  // Overwrite incorrect data with correct data to prevent
  // future miscompares when this data is reused.
  *(error->vaddr) = error->expected;
  os_->Flush(error->vaddr);
}


// Do a word by word result check of a region.
// Print errors on mismatches.
int WorkerThread::CheckRegion(void *addr,
                              class Pattern *pattern,
                              int64 length,
                              int offset,
                              int64 pattern_offset) {
  uint64 *memblock = static_cast<uint64*>(addr);
  const int kErrorLimit = 128;
  int errors = 0;
  int overflowerrors = 0;  // Count of overflowed errors.
  bool page_error = false;
  string errormessage("Hardware Error");
  struct ErrorRecord
    recorded[kErrorLimit];  // Queued errors for later printing.

  // For each word in the data region.
  for (int i = 0; i < length / wordsize_; i++) {
    uint64 actual = memblock[i];
    uint64 expected;

    // Determine the value that should be there.
    datacast_t data;
    int index = 2 * i + pattern_offset;
    data.l32.l = pattern->pattern(index);
    data.l32.h = pattern->pattern(index + 1);
    expected = data.l64;
    // Check tags if necessary.
    if (tag_mode_ && ((reinterpret_cast<uint64>(&memblock[i]) & 0x3f) == 0)) {
      expected = addr_to_tag(&memblock[i]);
    }


    // If the value is incorrect, save an error record for later printing.
    if (actual != expected) {
      if (errors < kErrorLimit) {
        recorded[errors].actual = actual;
        recorded[errors].expected = expected;
        recorded[errors].vaddr = &memblock[i];
        errors++;
      } else {
        page_error = true;
        // If we have overflowed the error queue, just print the errors now.
        logprintf(10, "Log: Error record overflow, too many miscompares!\n");
        errormessage = "Page Error";
        break;
      }
    }
  }

  // Find if this is a whole block corruption.
  if (page_error && !tag_mode_) {
    int patsize = patternlist_->Size();
    for (int pat = 0; pat < patsize; pat++) {
      class Pattern *altpattern = patternlist_->GetPattern(pat);
      const int kGood = 0;
      const int kBad = 1;
      const int kGoodAgain = 2;
      const int kNoMatch = 3;
      int state = kGood;
      unsigned int badstart = 0;
      unsigned int badend = 0;
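      // State machine sketch: a run of words matching altpattern instead of
      // pattern (kBad, optionally followed by kGoodAgain once words match
      // pattern again) suggests a block-sized chunk was written with the
      // wrong pattern. A word matching neither (kNoMatch) rules this
      // altpattern out.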

      // Don't match against ourself!
      if (pattern == altpattern)
        continue;

      for (int i = 0; i < length / wordsize_; i++) {
        uint64 actual = memblock[i];
        datacast_t expected;
        datacast_t possible;

        // Determine the value that should be there.
        int index = 2 * i + pattern_offset;

        expected.l32.l = pattern->pattern(index);
        expected.l32.h = pattern->pattern(index + 1);

        possible.l32.l = altpattern->pattern(index);
        possible.l32.h = altpattern->pattern(index + 1);

        if (state == kGood) {
          if (actual == expected.l64) {
            continue;
          } else if (actual == possible.l64) {
            badstart = i;
            badend = i;
            state = kBad;
            continue;
          } else {
            state = kNoMatch;
            break;
          }
        } else if (state == kBad) {
          if (actual == possible.l64) {
            badend = i;
            continue;
          } else if (actual == expected.l64) {
            state = kGoodAgain;
            continue;
          } else {
            state = kNoMatch;
            break;
          }
        } else if (state == kGoodAgain) {
          if (actual == expected.l64) {
            continue;
          } else {
            state = kNoMatch;
            break;
          }
        }
      }

      if ((state == kGoodAgain) || (state == kBad)) {
        unsigned int blockerrors = badend - badstart + 1;
        errormessage = "Block Error";
        // It's okay for the 1st entry to be corrected multiple times,
        // it will simply be reported twice. Once here and once below
        // when processing the error queue.
        ProcessError(&recorded[0], 0, errormessage.c_str());
        logprintf(0, "Block Error: (%p) pattern %s instead of %s, "
                  "%d bytes from offset 0x%x to 0x%x\n",
                  &memblock[badstart],
                  altpattern->name(), pattern->name(),
                  blockerrors * wordsize_,
                  offset + badstart * wordsize_,
                  offset + badend * wordsize_);
      }
    }
  }


  // Process error queue after all errors have been recorded.
  for (int err = 0; err < errors; err++) {
    int priority = 5;
    if (errorcount_ + err < 30)
      priority = 0;  // Bump up the priority for the first few errors.
    ProcessError(&recorded[err], priority, errormessage.c_str());
  }

  if (page_error) {
    // For each word in the data region.
    for (int i = 0; i < length / wordsize_; i++) {
      uint64 actual = memblock[i];
      uint64 expected;
      datacast_t data;
      // Determine the value that should be there.
      int index = 2 * i + pattern_offset;

      data.l32.l = pattern->pattern(index);
      data.l32.h = pattern->pattern(index + 1);
      expected = data.l64;

      // Check tags if necessary.
      if (tag_mode_ && ((reinterpret_cast<uint64>(&memblock[i]) & 0x3f) == 0)) {
        expected = addr_to_tag(&memblock[i]);
      }

      // If the value is incorrect, save an error record for later printing.
      if (actual != expected) {
        // If we have overflowed the error queue, print the errors now.
        struct ErrorRecord er;
        er.actual = actual;
        er.expected = expected;
        er.vaddr = &memblock[i];

        // Do the error printout. This will take a long time and
        // likely change the machine state.
        ProcessError(&er, 12, errormessage.c_str());
        overflowerrors++;
      }
    }
  }

  // Keep track of observed errors.
  errorcount_ += errors + overflowerrors;
  return errors + overflowerrors;
}

float WorkerThread::GetCopiedData() {
  return pages_copied_ * sat_->page_length() / kMegabyte;
}

// Calculate the CRC of a region.
// Fall back to a word-by-word result check if the CRC mismatches.
int WorkerThread::CrcCheckPage(struct page_entry *srcpe) {
  const int blocksize = 4096;
  const int blockwords = blocksize / wordsize_;
  int errors = 0;

  const AdlerChecksum *expectedcrc = srcpe->pattern->crc();
  uint64 *memblock = static_cast<uint64*>(srcpe->addr);
  int blocks = sat_->page_length() / blocksize;
  for (int currentblock = 0; currentblock < blocks; currentblock++) {
    uint64 *memslice = memblock + currentblock * blockwords;

    AdlerChecksum crc;
    if (tag_mode_) {
      AdlerAddrCrcC(memslice, blocksize, &crc, srcpe);
    } else {
      CalculateAdlerChecksum(memslice, blocksize, &crc);
    }

    // If the CRC does not match, we'd better look closer.
    if (!crc.Equals(*expectedcrc)) {
      logprintf(11, "Log: CrcCheckPage Falling through to slow compare, "
                "CRC mismatch %s != %s\n",
                crc.ToHexString().c_str(),
                expectedcrc->ToHexString().c_str());
      int errorcount = CheckRegion(memslice,
                                   srcpe->pattern,
                                   blocksize,
                                   currentblock * blocksize, 0);
      if (errorcount == 0) {
        logprintf(0, "Log: CrcCheckPage CRC mismatch %s != %s, "
                     "but no miscompares found.\n",
                  crc.ToHexString().c_str(),
                  expectedcrc->ToHexString().c_str());
      }
      errors += errorcount;
    }
  }

  // For odd length transfers, we should never hit this.
  int leftovers = sat_->page_length() % blocksize;
  if (leftovers) {
    uint64 *memslice = memblock + blocks * blockwords;
    errors += CheckRegion(memslice,
                          srcpe->pattern,
                          leftovers,
                          blocks * blocksize, 0);
  }
  return errors;
}


// Print error information about a data miscompare.
void WorkerThread::ProcessTagError(struct ErrorRecord *error,
                                   int priority,
                                   const char *message) {
  char dimm_string[256] = "";
  char tag_dimm_string[256] = "";
  bool read_error = false;

  int core_id = sched_getcpu();

  // Determine if this is a write or read error.
  os_->Flush(error->vaddr);
  error->reread = *(error->vaddr);

  // Distinguish read and write errors.
  if (error->actual != error->reread) {
    read_error = true;
  }

  sat_assert(error->expected != error->actual);

  error->vbyteaddr = reinterpret_cast<char*>(error->vaddr);

  // Find physical address if possible.
  error->paddr = os_->VirtualToPhysical(error->vbyteaddr);
  error->tagpaddr = os_->VirtualToPhysical(error->tagvaddr);

  // Pretty print DIMM mapping if available.
  os_->FindDimm(error->paddr, dimm_string, sizeof(dimm_string));
  // Pretty print DIMM mapping if available.
  os_->FindDimm(error->tagpaddr, tag_dimm_string, sizeof(tag_dimm_string));

  // Report parseable error.
  if (priority < 5) {
    logprintf(priority,
              "%s: Tag from %p(0x%llx:%s) (%s) "
              "miscompare on CPU %d(0x%s) at %p(0x%llx:%s): "
              "read:0x%016llx, reread:0x%016llx expected:0x%016llx\n",
              message,
              error->tagvaddr, error->tagpaddr,
              tag_dimm_string,
              read_error ? "read error" : "write error",
              core_id,
              CurrentCpusFormat().c_str(),
              error->vaddr,
              error->paddr,
              dimm_string,
              error->actual,
              error->reread,
              error->expected);
  }

  errorcount_ += 1;

  // Overwrite incorrect data with correct data to prevent
  // future miscompares when this data is reused.
  *(error->vaddr) = error->expected;
  os_->Flush(error->vaddr);
}


// Print out and log a tag error.
bool WorkerThread::ReportTagError(
    uint64 *mem64,
    uint64 actual,
    uint64 tag) {
  struct ErrorRecord er;
  er.actual = actual;

  er.expected = tag;
  er.vaddr = mem64;

  // Generate vaddr from tag.
  er.tagvaddr = reinterpret_cast<uint64*>(actual);

  ProcessTagError(&er, 0, "Hardware Error");
  return true;
}

// C implementation of Adler memory copy, with memory tagging.
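// The checksum below is a 64-bit Adler variant computed in two independent
// lanes: even-indexed words feed a1/b1 and odd-indexed words feed a2/b2, so
// each loop iteration can overlap two accumulations. The four accumulators
// are combined by AdlerChecksum::Set().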
bool WorkerThread::AdlerAddrMemcpyC(uint64 *dstmem64,
                                    uint64 *srcmem64,
                                    unsigned int size_in_bytes,
                                    AdlerChecksum *checksum,
                                    struct page_entry *pe) {
  // Use this data wrapper to access memory with 64bit read/write.
  datacast_t data;
  datacast_t dstdata;
  unsigned int count = size_in_bytes / sizeof(data);

  if (count > ((1U) << 19)) {
    // Size is too large; must be strictly less than 512 K words (4 MB).
    return false;
  }

  uint64 a1 = 1;
  uint64 a2 = 1;
  uint64 b1 = 0;
  uint64 b2 = 0;

  class Pattern *pattern = pe->pattern;

  unsigned int i = 0;
  while (i < count) {
    // Process 64 bits at a time.
    if ((i & 0x7) == 0) {
      data.l64 = srcmem64[i];
      dstdata.l64 = dstmem64[i];
      uint64 src_tag = addr_to_tag(&srcmem64[i]);
      uint64 dst_tag = addr_to_tag(&dstmem64[i]);
      // Detect if tags have been corrupted.
      if (data.l64 != src_tag)
        ReportTagError(&srcmem64[i], data.l64, src_tag);
      if (dstdata.l64 != dst_tag)
        ReportTagError(&dstmem64[i], dstdata.l64, dst_tag);

      data.l32.l = pattern->pattern(i << 1);
      data.l32.h = pattern->pattern((i << 1) + 1);
      a1 = a1 + data.l32.l;
      b1 = b1 + a1;
      a1 = a1 + data.l32.h;
      b1 = b1 + a1;

      data.l64  = dst_tag;
      dstmem64[i] = data.l64;

    } else {
      data.l64 = srcmem64[i];
      a1 = a1 + data.l32.l;
      b1 = b1 + a1;
      a1 = a1 + data.l32.h;
      b1 = b1 + a1;
      dstmem64[i] = data.l64;
    }
    i++;

    data.l64 = srcmem64[i];
    a2 = a2 + data.l32.l;
    b2 = b2 + a2;
    a2 = a2 + data.l32.h;
    b2 = b2 + a2;
    dstmem64[i] = data.l64;
    i++;
  }
  checksum->Set(a1, a2, b1, b2);
  return true;
}

// x86_64 SSE2 assembly implementation of Adler memory copy, with address
// tagging added as a second step. This is useful for debugging failures
// that only occur when SSE / nontemporal writes are used.
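// The copy itself is delegated to the OS layer's assembly routine; the
// checksum is then recomputed from the flushed source so that verification
// reads actual memory contents rather than cached data.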
bool WorkerThread::AdlerAddrMemcpyWarm(uint64 *dstmem64,
                                       uint64 *srcmem64,
                                       unsigned int size_in_bytes,
                                       AdlerChecksum *checksum,
                                       struct page_entry *pe) {
  // Do ASM copy, ignore checksum.
  AdlerChecksum ignored_checksum;
  os_->AdlerMemcpyWarm(dstmem64, srcmem64, size_in_bytes, &ignored_checksum);

  // Force cache flush of both the source and destination addresses.
  //  length - length of block to flush in cachelines.
  //  mem_increment - number of dstmem/srcmem values per cacheline.
  int length = size_in_bytes / kCacheLineSize;
  int mem_increment = kCacheLineSize / sizeof(*dstmem64);
  OsLayer::FastFlushSync();
  for (int i = 0; i < length; ++i) {
    OsLayer::FastFlushHint(dstmem64 + (i * mem_increment));
    OsLayer::FastFlushHint(srcmem64 + (i * mem_increment));
  }
  OsLayer::FastFlushSync();

  // Check results.
  AdlerAddrCrcC(srcmem64, size_in_bytes, checksum, pe);
  // Patch up address tags.
  TagAddrC(dstmem64, size_in_bytes);
  return true;
}

// Retag pages.
bool WorkerThread::TagAddrC(uint64 *memwords,
                            unsigned int size_in_bytes) {
  // Write a tag (the word's own virtual address) every 8 words, i.e. once
  // per 64-byte cache line.
  int length = size_in_bytes / wordsize_;
  for (int i = 0; i < length; i += 8) {
    datacast_t data;
    data.l64 = addr_to_tag(&memwords[i]);
    memwords[i] = data.l64;
  }
  return true;
}

// C implementation of Adler memory crc.
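// Same dual-lane Adler scheme as AdlerAddrMemcpyC above, minus the copy:
// tags are verified and the checksum accumulated in a1/b1 and a2/b2.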
bool WorkerThread::AdlerAddrCrcC(uint64 *srcmem64,
                                 unsigned int size_in_bytes,
                                 AdlerChecksum *checksum,
                                 struct page_entry *pe) {
  // Use this data wrapper to access memory with 64bit read/write.
  datacast_t data;
  unsigned int count = size_in_bytes / sizeof(data);

  if (count > ((1U) << 19)) {
    // Size is too large; must be strictly less than 512 K words (4 MB).
    return false;
  }

  uint64 a1 = 1;
  uint64 a2 = 1;
  uint64 b1 = 0;
  uint64 b2 = 0;

  class Pattern *pattern = pe->pattern;

  unsigned int i = 0;
  while (i < count) {
    // Process 64 bits at a time.
    if ((i & 0x7) == 0) {
      data.l64 = srcmem64[i];
      uint64 src_tag = addr_to_tag(&srcmem64[i]);
      // Check that tags match expected.
      if (data.l64 != src_tag)
        ReportTagError(&srcmem64[i], data.l64, src_tag);

      data.l32.l = pattern->pattern(i << 1);
      data.l32.h = pattern->pattern((i << 1) + 1);
      a1 = a1 + data.l32.l;
      b1 = b1 + a1;
      a1 = a1 + data.l32.h;
      b1 = b1 + a1;
    } else {
      data.l64 = srcmem64[i];
      a1 = a1 + data.l32.l;
      b1 = b1 + a1;
      a1 = a1 + data.l32.h;
      b1 = b1 + a1;
    }
    i++;

    data.l64 = srcmem64[i];
    a2 = a2 + data.l32.l;
    b2 = b2 + a2;
    a2 = a2 + data.l32.h;
    b2 = b2 + a2;
    i++;
  }
  checksum->Set(a1, a2, b1, b2);
  return true;
}

// Copy a block of memory quickly, while keeping a CRC of the data.
// Fall back to a word-by-word result check if the CRC mismatches.
int WorkerThread::CrcCopyPage(struct page_entry *dstpe,
                              struct page_entry *srcpe) {
  int errors = 0;
  const int blocksize = 4096;
  const int blockwords = blocksize / wordsize_;
  int blocks = sat_->page_length() / blocksize;

  // Base addresses for memory copy
  uint64 *targetmembase = static_cast<uint64*>(dstpe->addr);
  uint64 *sourcemembase = static_cast<uint64*>(srcpe->addr);
  // Remember the expected CRC
  const AdlerChecksum *expectedcrc = srcpe->pattern->crc();

  for (int currentblock = 0; currentblock < blocks; currentblock++) {
    uint64 *targetmem = targetmembase + currentblock * blockwords;
    uint64 *sourcemem = sourcemembase + currentblock * blockwords;

    AdlerChecksum crc;
    if (tag_mode_) {
      AdlerAddrMemcpyC(targetmem, sourcemem, blocksize, &crc, srcpe);
    } else {
      AdlerMemcpyC(targetmem, sourcemem, blocksize, &crc);
    }

    // Investigate miscompares.
    if (!crc.Equals(*expectedcrc)) {
      logprintf(11, "Log: CrcCopyPage Falling through to slow compare, "
                "CRC mismatch %s != %s\n", crc.ToHexString().c_str(),
                expectedcrc->ToHexString().c_str());
      int errorcount = CheckRegion(sourcemem,
                                   srcpe->pattern,
                                   blocksize,
                                   currentblock * blocksize, 0);
      if (errorcount == 0) {
        logprintf(0, "Log: CrcCopyPage CRC mismatch %s != %s, "
                     "but no miscompares found. Retrying with fresh data.\n",
                  crc.ToHexString().c_str(),
                  expectedcrc->ToHexString().c_str());
        if (!tag_mode_) {
          // Copy the data originally read from this region back again.
          // This data should contain any corruption originally read while
          // calculating the CRC.
          memcpy(sourcemem, targetmem, blocksize);
          errorcount = CheckRegion(sourcemem,
                                   srcpe->pattern,
                                   blocksize,
                                   currentblock * blocksize, 0);
          if (errorcount == 0) {
            int core_id = sched_getcpu();
            logprintf(0, "Process Error: CPU %d(0x%s) CrcCopyPage "
                         "CRC mismatch %s != %s, "
                         "but no miscompares found on second pass.\n",
                      core_id, CurrentCpusFormat().c_str(),
                      crc.ToHexString().c_str(),
                      expectedcrc->ToHexString().c_str());
            struct ErrorRecord er;
            er.actual = sourcemem[0];
            er.expected = 0x0;
            er.vaddr = sourcemem;
            ProcessError(&er, 0, "Hardware Error");
          }
        }
      }
      errors += errorcount;
    }
  }

  // For odd length transfers, we should never hit this.
  int leftovers = sat_->page_length() % blocksize;
  if (leftovers) {
    uint64 *targetmem = targetmembase + blocks * blockwords;
    uint64 *sourcemem = sourcemembase + blocks * blockwords;

    errors += CheckRegion(sourcemem,
                          srcpe->pattern,
                          leftovers,
                          blocks * blocksize, 0);
    int leftoverwords = leftovers / wordsize_;
    for (int i = 0; i < leftoverwords; i++) {
      targetmem[i] = sourcemem[i];
    }
  }

  // Update pattern reference to reflect new contents.
  dstpe->pattern = srcpe->pattern;

  // Clean clean clean the errors away.
  if (errors) {
    // TODO(nsanders): Maybe we should patch rather than fill? Filling may
    // cause bad data to be propagated across the page.
    FillPage(dstpe);
  }
  return errors;
}



// Invert a block of memory quickly, traversing downwards.
int InvertThread::InvertPageDown(struct page_entry *srcpe) {
  const int blocksize = 4096;
  const int blockwords = blocksize / wordsize_;
  int blocks = sat_->page_length() / blocksize;

  // Base addresses for memory copy
  unsigned int *sourcemembase = static_cast<unsigned int *>(srcpe->addr);

  for (int currentblock = blocks-1; currentblock >= 0; currentblock--) {
    unsigned int *sourcemem = sourcemembase + currentblock * blockwords;
    for (int i = blockwords - 32; i >= 0; i -= 32) {
      for (int index = i + 31; index >= i; --index) {
        unsigned int actual = sourcemem[index];
        sourcemem[index] = ~actual;
      }
      OsLayer::FastFlush(&sourcemem[i]);
    }
  }

  return 0;
}

// Invert a block of memory, traversing upwards.
int InvertThread::InvertPageUp(struct page_entry *srcpe) {
  const int blocksize = 4096;
  const int blockwords = blocksize / wordsize_;
  int blocks = sat_->page_length() / blocksize;

  // Base addresses for memory copy
  unsigned int *sourcemembase = static_cast<unsigned int *>(srcpe->addr);

  for (int currentblock = 0; currentblock < blocks; currentblock++) {
    unsigned int *sourcemem = sourcemembase + currentblock * blockwords;
    for (int i = 0; i < blockwords; i += 32) {
      for (int index = i; index <= i + 31; ++index) {
        unsigned int actual = sourcemem[index];
        sourcemem[index] = ~actual;
      }
      OsLayer::FastFlush(&sourcemem[i]);
    }
  }
  return 0;
}

// Copy a block of memory quickly, while keeping a CRC of the data.
// Fall back to a word-by-word result check if the CRC mismatches.
// Warm the CPU while running.
int WorkerThread::CrcWarmCopyPage(struct page_entry *dstpe,
                                  struct page_entry *srcpe) {
  int errors = 0;
  const int blocksize = 4096;
  const int blockwords = blocksize / wordsize_;
  int blocks = sat_->page_length() / blocksize;

  // Base addresses for memory copy
  uint64 *targetmembase = static_cast<uint64*>(dstpe->addr);
  uint64 *sourcemembase = static_cast<uint64*>(srcpe->addr);
  // Remember the expected CRC
  const AdlerChecksum *expectedcrc = srcpe->pattern->crc();

  for (int currentblock = 0; currentblock < blocks; currentblock++) {
    uint64 *targetmem = targetmembase + currentblock * blockwords;
    uint64 *sourcemem = sourcemembase + currentblock * blockwords;

    AdlerChecksum crc;
    if (tag_mode_) {
      AdlerAddrMemcpyWarm(targetmem, sourcemem, blocksize, &crc, srcpe);
    } else {
      os_->AdlerMemcpyWarm(targetmem, sourcemem, blocksize, &crc);
    }

    // Investigate miscompares.
    if (!crc.Equals(*expectedcrc)) {
      logprintf(11, "Log: CrcWarmCopyPage Falling through to slow compare, "
                "CRC mismatch %s != %s\n", crc.ToHexString().c_str(),
                expectedcrc->ToHexString().c_str());
      int errorcount = CheckRegion(sourcemem,
                                   srcpe->pattern,
                                   blocksize,
                                   currentblock * blocksize, 0);
      if (errorcount == 0) {
        logprintf(0, "Log: CrcWarmCopyPage CRC mismatch expected: %s != "
                     "actual: %s, but no miscompares found. "
                     "Retrying with fresh data.\n",
                  expectedcrc->ToHexString().c_str(),
                  crc.ToHexString().c_str());
        if (!tag_mode_) {
          // Copy the data originally read from this region back again.
          // This data should contain any corruption originally read while
          // calculating the CRC.
          memcpy(sourcemem, targetmem, blocksize);
          errorcount = CheckRegion(sourcemem,
                                   srcpe->pattern,
                                   blocksize,
                                   currentblock * blocksize, 0);
          if (errorcount == 0) {
            int core_id = sched_getcpu();
            logprintf(0, "Process Error: CPU %d(0x%s) CrcWarmCopyPage "
                         "CRC mismatch %s != %s, "
                         "but no miscompares found on second pass.\n",
                      core_id, CurrentCpusFormat().c_str(),
                      crc.ToHexString().c_str(),
                      expectedcrc->ToHexString().c_str());
            struct ErrorRecord er;
            er.actual = sourcemem[0];
            er.expected = 0xbad;
            er.vaddr = sourcemem;
            ProcessError(&er, 0, "Hardware Error");
          }
        }
      }
      errors += errorcount;
    }
  }

  // For odd length transfers, we should never hit this.
  int leftovers = sat_->page_length() % blocksize;
  if (leftovers) {
    uint64 *targetmem = targetmembase + blocks * blockwords;
    uint64 *sourcemem = sourcemembase + blocks * blockwords;

    errors += CheckRegion(sourcemem,
                          srcpe->pattern,
                          leftovers,
                          blocks * blocksize, 0);
    int leftoverwords = leftovers / wordsize_;
    for (int i = 0; i < leftoverwords; i++) {
      targetmem[i] = sourcemem[i];
    }
  }

  // Update pattern reference to reflect new contents.
  dstpe->pattern = srcpe->pattern;

  // Clean clean clean the errors away.
  if (errors) {
    // TODO(nsanders): Maybe we should patch rather than fill? Filling may
    // cause bad data to be propagated across the page.
    FillPage(dstpe);
  }
  return errors;
}



// Memory check work loop. Execute until done, then exhaust pages.
bool CheckThread::Work() {
  struct page_entry pe;
  bool result = true;
  int64 loops = 0;

  logprintf(9, "Log: Starting Check thread %d\n", thread_num_);

  // We want to check all the pages, and
  // stop when there aren't any left.
  while (true) {
    result = result && sat_->GetValid(&pe);
    if (!result) {
      if (IsReadyToRunNoPause())
        logprintf(0, "Process Error: check_thread failed to pop pages, "
                  "bailing\n");
      else
        result = true;
      break;
    }

    // Do the result check.
    CrcCheckPage(&pe);

    // Push pages back on the valid queue if we are still going,
    // throw them out otherwise.
    if (IsReadyToRunNoPause())
      result = result && sat_->PutValid(&pe);
    else
      result = result && sat_->PutEmpty(&pe);
    if (!result) {
      logprintf(0, "Process Error: check_thread failed to push pages, "
                "bailing\n");
      break;
    }
    loops++;
  }

  pages_copied_ = loops;
  status_ = result;
  logprintf(9, "Log: Completed %d: Check thread. Status %d, %d pages checked\n",
            thread_num_, status_, pages_copied_);
  return result;
}


// Memory copy work loop. Execute until marked done.
bool CopyThread::Work() {
  struct page_entry src;
  struct page_entry dst;
  bool result = true;
  int64 loops = 0;

  logprintf(9, "Log: Starting copy thread %d: cpu %s, mem %x\n",
            thread_num_, cpuset_format(&cpu_mask_).c_str(), tag_);

  while (IsReadyToRun()) {
    // Pop the needed pages.
    result = result && sat_->GetValid(&src, tag_);
    result = result && sat_->GetEmpty(&dst, tag_);
    if (!result) {
      logprintf(0, "Process Error: copy_thread failed to pop pages, "
                "bailing\n");
      break;
    }

    // Force errors for unittests.
    if (sat_->error_injection()) {
      if (loops == 8) {
        char *addr = reinterpret_cast<char*>(src.addr);
        int offset = random() % sat_->page_length();
        addr[offset] = 0xba;
      }
    }

    // We can use memcpy, or CRC check while we copy.
    if (sat_->warm()) {
      CrcWarmCopyPage(&dst, &src);
    } else if (sat_->strict()) {
      CrcCopyPage(&dst, &src);
    } else {
      memcpy(dst.addr, src.addr, sat_->page_length());
      dst.pattern = src.pattern;
    }

    result = result && sat_->PutValid(&dst);
    result = result && sat_->PutEmpty(&src);

   1513     // Copy worker-threads yield themselves at the end of each copy loop
   1514     // to avoid preempting each other in the middle of the inner
   1515     // copy-loop. Cooperation between copy worker-threads results in less
   1516     // unnecessary cache thrashing (which happens when context-switching
   1517     // in the middle of the inner copy-loop).
   1518     YieldSelf();
   1519 
   1520     if (!result) {
   1521       logprintf(0, "Process Error: copy_thread failed to push pages, "
   1522                 "bailing\n");
   1523       break;
   1524     }
   1525     loops++;
   1526   }
   1527 
   1528   pages_copied_ = loops;
   1529   status_ = result;
   1530   logprintf(9, "Log: Completed %d: Copy thread. Status %d, %d pages copied\n",
   1531             thread_num_, status_, pages_copied_);
   1532   return result;
   1533 }
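
// The yield above goes through a thin wrapper over the scheduler. A
// minimal sketch of such a wrapper, assuming plain sched_yield() is the
// underlying call (the real YieldSelf() is defined elsewhere in this
// class hierarchy; this is only an illustration):
//
//   #include <sched.h>
//
//   void ExampleYieldSelf() {
//     // Give up the rest of this timeslice so peer copy threads can
//     // finish their inner copy loops without being preempted mid-copy.
//     sched_yield();
//   }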
   1534 
   1535 // Memory invert work loop. Execute until marked done.
   1536 bool InvertThread::Work() {
   1537   struct page_entry src;
   1538   bool result = true;
   1539   int64 loops = 0;
   1540 
   1541   logprintf(9, "Log: Starting invert thread %d\n", thread_num_);
   1542 
   1543   while (IsReadyToRun()) {
   1544     // Pop the needed pages.
   1545     result = result && sat_->GetValid(&src);
   1546     if (!result) {
   1547       logprintf(0, "Process Error: invert_thread failed to pop pages, "
   1548                 "bailing\n");
   1549       break;
   1550     }
   1551 
   1552     if (sat_->strict())
   1553       CrcCheckPage(&src);
   1554 
   1555     // For the same reason CopyThread yields itself (see the YieldSelf
   1556     // comment in CopyThread::Work()), InvertThread yields itself after
   1557     // each invert operation to improve cooperation between different
   1558     // worker threads stressing the memory/cache.
   1559     InvertPageUp(&src);
   1560     YieldSelf();
   1561     InvertPageDown(&src);
   1562     YieldSelf();
   1563     InvertPageDown(&src);
   1564     YieldSelf();
   1565     InvertPageUp(&src);
   1566     YieldSelf();
   1567 
   1568     if (sat_->strict())
   1569       CrcCheckPage(&src);
   1570 
   1571     result = result && sat_->PutValid(&src);
   1572     if (!result) {
   1573       logprintf(0, "Process Error: invert_thread failed to push pages, "
   1574                 "bailing\n");
   1575       break;
   1576     }
   1577     loops++;
   1578   }
   1579 
   1580   pages_copied_ = loops * 2;
   1581   status_ = result;
   1582   logprintf(9, "Log: Completed %d: Invert thread. Status %d, %d pages copied\n",
   1583             thread_num_, status_, pages_copied_);
   1584   return result;
   1585 }
   1586 
   1587 
   1588 // Set file name to use for File IO.
   1589 void FileThread::SetFile(const char *filename_init) {
   1590   filename_ = filename_init;
   1591   devicename_ = os_->FindFileDevice(filename_);
   1592 }
   1593 
   1594 // Open the file for access.
   1595 bool FileThread::OpenFile(int *pfile) {
   1596   int flags = O_RDWR | O_CREAT | O_SYNC;
   1597   int fd = open(filename_.c_str(), flags | O_DIRECT, 0644);
   1598   if (O_DIRECT != 0 && fd < 0 && errno == EINVAL) {
   1599     fd = open(filename_.c_str(), flags, 0644);  // Try without O_DIRECT
   1600     os_->ActivateFlushPageCache();  // Not using O_DIRECT fixed EINVAL
   1601   }
   1602   if (fd < 0) {
   1603     logprintf(0, "Process Error: Failed to create file %s!!\n",
   1604               filename_.c_str());
   1605     pages_copied_ = 0;
   1606     return false;
   1607   }
   1608   *pfile = fd;
   1609   return true;
   1610 }
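
// The open-with-fallback idiom above is worth calling out. A self-contained
// sketch of the same pattern, assuming the caller prefers O_DIRECT when the
// filesystem supports it (names here are illustrative, not part of this
// file):
//
//   int ExampleOpenDirect(const char *path) {
//     int flags = O_RDWR | O_CREAT | O_SYNC;
//     int fd = open(path, flags | O_DIRECT, 0644);
//     if (fd < 0 && errno == EINVAL) {
//       // Filesystem rejected O_DIRECT; fall back to cached IO. The
//       // caller must then flush the page cache between writes and reads.
//       fd = open(path, flags, 0644);
//     }
//     return fd;  // Still negative if both attempts failed.
//   }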
   1611 
   1612 // Close the file.
   1613 bool FileThread::CloseFile(int fd) {
   1614   close(fd);
   1615   return true;
   1616 }
   1617 
   1618 // Tag each sector of a page for later validation.
   1619 bool FileThread::SectorTagPage(struct page_entry *src, int block) {
   1620   int page_length = sat_->page_length();
   1621   struct FileThread::SectorTag *tag =
   1622     (struct FileThread::SectorTag *)(src->addr);
   1623 
   1624   // Tag each sector.
   1625   unsigned char magic = ((0xba + thread_num_) & 0xff);
   1626   for (int sec = 0; sec < page_length / 512; sec++) {
   1627     tag[sec].magic = magic;
   1628     tag[sec].block = block & 0xff;
   1629     tag[sec].sector = sec & 0xff;
   1630     tag[sec].pass = pass_ & 0xff;
   1631   }
   1632   return true;
   1633 }
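
// The tag layout itself is defined in worker.h; the sat_assert() in
// SectorValidatePage() below implies each tag occupies a full 512-byte
// sector. A plausible sketch of that layout (an assumption for
// illustration, not the authoritative definition):
//
//   struct SectorTag {
//     volatile uint8 magic;   // Per-thread magic: (0xba + thread_num) & 0xff.
//     volatile uint8 block;   // Low byte of the block number.
//     volatile uint8 sector;  // Low byte of the sector index in the page.
//     volatile uint8 pass;    // Low byte of the current pass counter.
//     char pad[512 - 4];      // Pad out to one full sector.
//   };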
   1634 
   1635 bool FileThread::WritePageToFile(int fd, struct page_entry *src) {
   1636   int page_length = sat_->page_length();
   1637   // Fill the file with our data.
   1638   int64 size = write(fd, src->addr, page_length);
   1639 
   1640   if (size != page_length) {
   1641     os_->ErrorReport(devicename_.c_str(), "write-error", 1);
   1642     errorcount_++;
   1643     logprintf(0, "Block Error: file_thread failed to write, "
   1644               "bailing\n");
   1645     return false;
   1646   }
   1647   return true;
   1648 }
   1649 
   1650 // Write the data to the file.
   1651 bool FileThread::WritePages(int fd) {
   1652   int strict = sat_->strict();
   1653 
   1654   // Start fresh at beginning of file for each batch of pages.
   1655   lseek64(fd, 0, SEEK_SET);
   1656   for (int i = 0; i < sat_->disk_pages(); i++) {
   1657     struct page_entry src;
   1658     if (!GetValidPage(&src))
   1659       return false;
   1660     // Save expected pattern.
   1661     page_recs_[i].pattern = src.pattern;
   1662     page_recs_[i].src = src.addr;
   1663 
   1664     // Check data correctness.
   1665     if (strict)
   1666       CrcCheckPage(&src);
   1667 
   1668     SectorTagPage(&src, i);
   1669 
   1670     bool result = WritePageToFile(fd, &src);
   1671 
   1672     if (!PutEmptyPage(&src))
   1673       return false;
   1674 
   1675     if (!result)
   1676       return false;
   1677   }
   1678   return os_->FlushPageCache();  // If O_DIRECT worked, this will be a NOP.
   1679 }
   1680 
   1681 // Copy data from file into memory block.
   1682 bool FileThread::ReadPageFromFile(int fd, struct page_entry *dst) {
   1683   int page_length = sat_->page_length();
   1684 
   1685   // Do the actual read.
   1686   int64 size = read(fd, dst->addr, page_length);
   1687   if (size != page_length) {
   1688     os_->ErrorReport(devicename_.c_str(), "read-error", 1);
   1689     logprintf(0, "Block Error: file_thread failed to read, "
   1690               "bailing\n");
   1691     errorcount_++;
   1692     return false;
   1693   }
   1694   return true;
   1695 }
   1696 
   1697 // Check sector tagging.
   1698 bool FileThread::SectorValidatePage(const struct PageRec &page,
   1699                                     struct page_entry *dst, int block) {
   1700   // Error injection.
   1701   static int calls = 0;
   1702   calls++;
   1703 
   1704   // Do sector tag compare.
   1705   int firstsector = -1;
   1706   int lastsector = -1;
   1707   bool badsector = false;
   1708   int page_length = sat_->page_length();
   1709 
   1710   // Cast data block into an array of tagged sectors.
   1711   struct FileThread::SectorTag *tag =
   1712       (struct FileThread::SectorTag *)(dst->addr);
   1713 
   1714   sat_assert(sizeof(*tag) == 512);
   1715 
   1716   // Error injection.
   1717   if (sat_->error_injection()) {
   1718     if (calls == 2) {
   1719       for (int badsec = 8; badsec < 17; badsec++)
   1720         tag[badsec].pass = 27;
   1721     }
   1722     if (calls == 18) {
   1723       (static_cast<int32*>(dst->addr))[27] = 0xbadda7a;
   1724     }
   1725   }
   1726 
   1727   // Check each sector for the correct tag we added earlier,
   1728   // then revert the tag to the normal data pattern.
   1729   unsigned char magic = ((0xba + thread_num_) & 0xff);
   1730   for (int sec = 0; sec < page_length / 512; sec++) {
   1731     // Check magic tag.
   1732     if ((tag[sec].magic != magic) ||
   1733         (tag[sec].block != (block & 0xff)) ||
   1734         (tag[sec].sector != (sec & 0xff)) ||
   1735         (tag[sec].pass != (pass_ & 0xff))) {
   1736       // Offset calculation for tag location.
   1737       int offset = sec * sizeof(SectorTag);
   1738       if (tag[sec].block != (block & 0xff))
   1739         offset += 1 * sizeof(uint8);
   1740       else if (tag[sec].sector != (sec & 0xff))
   1741         offset += 2 * sizeof(uint8);
   1742       else if (tag[sec].pass != (pass_ & 0xff))
   1743         offset += 3 * sizeof(uint8);
   1744 
   1745       // Run sector tag error through diagnoser for logging and reporting.
   1746       errorcount_ += 1;
   1747       os_->error_diagnoser_->AddHDDSectorTagError(devicename_, tag[sec].block,
   1748                                                   offset,
   1749                                                   tag[sec].sector,
   1750                                                   page.src, page.dst);
   1751 
   1752       logprintf(5, "Sector Error: Sector tag @ 0x%x, pass %d/%d. "
   1753                 "sec %x/%x, block %d/%d, magic %x/%x, File: %s \n",
   1754                 block * page_length + 512 * sec,
   1755                 (pass_ & 0xff), (unsigned int)tag[sec].pass,
   1756                 sec, (unsigned int)tag[sec].sector,
   1757                 block, (unsigned int)tag[sec].block,
   1758                 magic, (unsigned int)tag[sec].magic,
   1759                 filename_.c_str());
   1760 
   1761       // Keep track of first and last bad sector.
   1762       if (firstsector == -1)
   1763         firstsector = (block * page_length / 512) + sec;
   1764       lastsector = (block * page_length / 512) + sec;
   1765       badsector = true;
   1766     }
   1767     // Patch tag back to proper pattern.
   1768     unsigned int *addr = (unsigned int *)(&tag[sec]);
   1769     *addr = dst->pattern->pattern(512 * sec / sizeof(*addr));
   1770   }
   1771 
   1772   // If we found sector errors:
   1773   if (badsector == true) {
   1774     logprintf(5, "Log: file sector miscompare at offset %x-%x. File: %s\n",
   1775               firstsector * 512,
   1776               ((lastsector + 1) * 512) - 1,
   1777               filename_.c_str());
   1778 
   1779     // Either exit immediately, or patch the data up and continue.
   1780     if (sat_->stop_on_error()) {
   1781       exit(1);
   1782     } else {
   1783       // Patch up bad pages.
   1784       for (int block = (firstsector * 512) / page_length;
   1785           block <= (lastsector * 512) / page_length;
   1786           block++) {
   1787         unsigned int *memblock = static_cast<unsigned int *>(dst->addr);
   1788         int length = page_length / wordsize_;
   1789         for (int i = 0; i < length; i++) {
   1790           memblock[i] = dst->pattern->pattern(i);
   1791         }
   1792       }
   1793     }
   1794   }
   1795   return true;
   1796 }
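
// A worked example of the offset arithmetic above: if sector 3 of a page
// fails only its 'pass' byte, the reported offset is
// 3 * sizeof(SectorTag) + 3 * sizeof(uint8) = 3 * 512 + 3 = 1539 bytes
// into the page, i.e. the fourth byte of the fourth sector tag.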
   1797 
   1798 // Prepare memory for data transfers.
   1799 bool FileThread::PagePrepare() {
   1800   // We can only do direct IO to SAT pages if it is normal mem.
   1801   page_io_ = os_->normal_mem();
   1802 
   1803   // Init a local buffer if we need it.
   1804   if (!page_io_) {
   1805 #ifdef HAVE_POSIX_MEMALIGN
   1806     int result = posix_memalign(&local_page_, 512, sat_->page_length());
   1807 #else
   1808     local_page_ = memalign(512, sat_->page_length());
   1809     int result = (local_page_ == 0);
   1810 #endif
   1811     if (result) {
   1812       logprintf(0, "Process Error: disk thread posix_memalign "
   1813                    "returned %d (fail)\n",
   1814                 result);
   1815       status_ = false;
   1816       return false;
   1817     }
   1818   }
   1819   return true;
   1820 }
   1821 
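
// The 512-byte alignment requested above matches the traditional O_DIRECT
// requirement that user buffers be aligned to the device's sector size. A
// cheap debug-build check of that invariant (illustrative, not present in
// the original):
//
//   assert(reinterpret_cast<uintptr_t>(local_page_) % 512 == 0);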
   1822 
   1823 // Remove memory allocated for data transfer.
   1824 bool FileThread::PageTeardown() {
   1825   // Free a local buffer if we need to.
   1826   if (!page_io_) {
   1827     free(local_page_);
   1828   }
   1829   return true;
   1830 }
   1831 
   1832 
   1833 
   1834 // Get memory for an incoming data transfer.
   1835 bool FileThread::GetEmptyPage(struct page_entry *dst) {
   1836   if (page_io_) {
   1837     if (!sat_->GetEmpty(dst))
   1838       return false;
   1839   } else {
   1840     dst->addr = local_page_;
   1841     dst->offset = 0;
   1842     dst->pattern = 0;
   1843   }
   1844   return true;
   1845 }
   1846 
   1847 // Get memory for an outgoing data transfer.
   1848 bool FileThread::GetValidPage(struct page_entry *src) {
   1849   struct page_entry tmp;
   1850   if (!sat_->GetValid(&tmp))
   1851     return false;
   1852   if (page_io_) {
   1853     *src = tmp;
   1854     return true;
   1855   } else {
   1856     src->addr = local_page_;
   1857     src->offset = 0;
   1858     CrcCopyPage(src, &tmp);
   1859     if (!sat_->PutValid(&tmp))
   1860       return false;
   1861   }
   1862   return true;
   1863 }
   1864 
   1865 
   1866 // Throw out a used empty page.
   1867 bool FileThread::PutEmptyPage(struct page_entry *src) {
   1868   if (page_io_) {
   1869     if (!sat_->PutEmpty(src))
   1870       return false;
   1871   }
   1872   return true;
   1873 }
   1874 
   1875 // Throw out a used, filled page.
   1876 bool FileThread::PutValidPage(struct page_entry *src) {
   1877   if (page_io_) {
   1878     if (!sat_->PutValid(src))
   1879       return false;
   1880   }
   1881   return true;
   1882 }
   1883 
   1884 // Copy data from file into memory blocks.
   1885 bool FileThread::ReadPages(int fd) {
   1886   int page_length = sat_->page_length();
   1887   int strict = sat_->strict();
   1888   bool result = true;
   1889 
   1890   // Read our data back out of the file, into its new location.
   1891   lseek64(fd, 0, SEEK_SET);
   1892   for (int i = 0; i < sat_->disk_pages(); i++) {
   1893     struct page_entry dst;
   1894     if (!GetEmptyPage(&dst))
   1895       return false;
   1896     // Retrieve expected pattern.
   1897     dst.pattern = page_recs_[i].pattern;
   1898     // Update the page record.
   1899     page_recs_[i].dst = dst.addr;
   1900 
   1901     // Read from the file into destination page.
   1902     if (!ReadPageFromFile(fd, &dst)) {
   1903       PutEmptyPage(&dst);
   1904       return false;
   1905     }
   1906 
   1907     SectorValidatePage(page_recs_[i], &dst, i);
   1908 
   1909     // Ensure that the transfer ended up with correct data.
   1910     if (strict) {
   1911       // Record page index currently CRC checked.
   1912       crc_page_ = i;
   1913       int errors = CrcCheckPage(&dst);
   1914       if (errors) {
   1915         logprintf(5, "Log: file miscompare at block %d, "
   1916                   "offset %x-%x. File: %s\n",
   1917                   i, i * page_length, ((i + 1) * page_length) - 1,
   1918                   filename_.c_str());
   1919         result = false;
   1920       }
   1921       crc_page_ = -1;
   1922       errorcount_ += errors;
   1923     }
   1924     if (!PutValidPage(&dst))
   1925       return false;
   1926   }
   1927   return result;
   1928 }
   1929 
   1930 // File IO work loop. Execute until marked done.
   1931 bool FileThread::Work() {
   1932   bool result = true;
   1933   int64 loops = 0;
   1934 
   1935   logprintf(9, "Log: Starting file thread %d, file %s, device %s\n",
   1936             thread_num_,
   1937             filename_.c_str(),
   1938             devicename_.c_str());
   1939 
   1940   if (!PagePrepare()) {
   1941     status_ = false;
   1942     return false;
   1943   }
   1944 
   1945   // Open the data IO file.
   1946   int fd = 0;
   1947   if (!OpenFile(&fd)) {
   1948     status_ = false;
   1949     return false;
   1950   }
   1951 
   1952   pass_ = 0;
   1953 
   1954   // Load patterns into page records.
   1955   page_recs_ = new struct PageRec[sat_->disk_pages()];
   1956   for (int i = 0; i < sat_->disk_pages(); i++) {
   1957     page_recs_[i].pattern = new class Pattern();
   1958   }
   1959 
   1960   // Loop until done.
   1961   while (IsReadyToRun()) {
   1962     // Do the file write.
   1963     if (!(result = result && WritePages(fd)))
   1964       break;
   1965 
   1966     // Do the file read.
   1967     if (!(result = result && ReadPages(fd)))
   1968       break;
   1969 
   1970     loops++;
   1971     pass_ = loops;
   1972   }
   1973 
   1974   pages_copied_ = loops * sat_->disk_pages();
   1975 
   1976   // Clean up.
   1977   CloseFile(fd);
   1978   PageTeardown();
   1979 
   1980   logprintf(9, "Log: Completed %d: file thread status %d, %d pages copied\n",
   1981             thread_num_, status_, pages_copied_);
   1982   // Failure to read from the device indicates a hardware error,
   1983   // rather than a procedural SW error.
   1984   status_ = true;
   1985   return true;
   1986 }
   1987 
   1988 bool NetworkThread::IsNetworkStopSet() {
   1989   return !IsReadyToRunNoPause();
   1990 }
   1991 
   1992 bool NetworkSlaveThread::IsNetworkStopSet() {
   1993   // This thread has no completion status.
   1994   // It finishes whenever there is no more data to be
   1995   // passed back.
   1996   return true;
   1997 }
   1998 
   1999 // Set ip name to use for Network IO.
   2000 void NetworkThread::SetIP(const char *ipaddr_init) {
   2001   strncpy(ipaddr_, ipaddr_init, 256);
   2002 }
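
// Note that strncpy() does not NUL-terminate on truncation. A defensive
// variant, assuming ipaddr_ is the 256-byte array the copy length
// suggests (a sketch, not the original code):
//
//   strncpy(ipaddr_, ipaddr_init, sizeof(ipaddr_) - 1);
//   ipaddr_[sizeof(ipaddr_) - 1] = '\0';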
   2003 
   2004 // Create a socket.
   2005 // Return false on error.
   2006 bool NetworkThread::CreateSocket(int *psocket) {
   2007   int sock = socket(AF_INET, SOCK_STREAM, 0);
   2008   if (sock == -1) {
   2009     logprintf(0, "Process Error: Cannot open socket\n");
   2010     pages_copied_ = 0;
   2011     status_ = false;
   2012     return false;
   2013   }
   2014   *psocket = sock;
   2015   return true;
   2016 }
   2017 
   2018 // Close the socket.
   2019 bool NetworkThread::CloseSocket(int sock) {
   2020   close(sock);
   2021   return true;
   2022 }
   2023 
   2024 // Initiate the tcp connection.
   2025 bool NetworkThread::Connect(int sock) {
   2026   struct sockaddr_in dest_addr;
   2027   dest_addr.sin_family = AF_INET;
   2028   dest_addr.sin_port = htons(kNetworkPort);
   2029   memset(&(dest_addr.sin_zero), '\0', sizeof(dest_addr.sin_zero));
   2030 
   2031   // Translate dot notation to u32.
   2032   if (inet_aton(ipaddr_, &dest_addr.sin_addr) == 0) {
   2033     logprintf(0, "Process Error: Cannot resolve %s\n", ipaddr_);
   2034     pages_copied_ = 0;
   2035     status_ = false;
   2036     return false;
   2037   }
   2038 
   2039   if (-1 == connect(sock, reinterpret_cast<struct sockaddr *>(&dest_addr),
   2040                     sizeof(struct sockaddr))) {
   2041     logprintf(0, "Process Error: Cannot connect %s\n", ipaddr_);
   2042     pages_copied_ = 0;
   2043     status_ = false;
   2044     return false;
   2045   }
   2046   return true;
   2047 }
   2048 
   2049 // Initiate the tcp connection.
   2050 bool NetworkListenThread::Listen() {
   2051   struct sockaddr_in sa;
   2052 
   2053   memset(&(sa.sin_zero), '\0', sizeof(sa.sin_zero));
   2054 
   2055   sa.sin_family = AF_INET;
   2056   sa.sin_addr.s_addr = INADDR_ANY;
   2057   sa.sin_port = htons(kNetworkPort);
   2058 
   2059   if (-1 == ::bind(sock_, (struct sockaddr*)&sa, sizeof(struct sockaddr))) {
   2060     char buf[256];
   2061     sat_strerror(errno, buf, sizeof(buf));
   2062     logprintf(0, "Process Error: Cannot bind socket: %s\n", buf);
   2063     pages_copied_ = 0;
   2064     status_ = false;
   2065     return false;
   2066   }
   2067   listen(sock_, 3);
   2068   return true;
   2069 }
   2070 
   2071 // Wait for a connection from a network traffic generation thread.
   2072 bool NetworkListenThread::Wait() {
   2073     fd_set rfds;
   2074     struct timeval tv;
   2075     int retval;
   2076 
   2077     // Watch sock_ to see when it has input.
   2078     FD_ZERO(&rfds);
   2079     FD_SET(sock_, &rfds);
   2080     // Wait up to five seconds.
   2081     tv.tv_sec = 5;
   2082     tv.tv_usec = 0;
   2083 
   2084     retval = select(sock_ + 1, &rfds, NULL, NULL, &tv);
   2085 
   2086     return (retval > 0);
   2087 }
   2088 
   2089 // Wait for a connection from a network traffic generation thread.
   2090 bool NetworkListenThread::GetConnection(int *pnewsock) {
   2091   struct sockaddr_in sa;
   2092   socklen_t size = sizeof(struct sockaddr_in);
   2093 
   2094   int newsock = accept(sock_, reinterpret_cast<struct sockaddr *>(&sa), &size);
   2095   if (newsock < 0)  {
   2096     logprintf(0, "Process Error: Did not receive connection\n");
   2097     pages_copied_ = 0;
   2098     status_ = false;
   2099     return false;
   2100   }
   2101   *pnewsock = newsock;
   2102   return true;
   2103 }
   2104 
   2105 // Send a page, return false if a page was not sent.
   2106 bool NetworkThread::SendPage(int sock, struct page_entry *src) {
   2107   int page_length = sat_->page_length();
   2108   char *address = static_cast<char*>(src->addr);
   2109 
   2110   // Send our data over the network.
   2111   int size = page_length;
   2112   while (size) {
   2113     int transferred = send(sock, address + (page_length - size), size, 0);
   2114     if ((transferred == 0) || (transferred == -1)) {
   2115       if (!IsNetworkStopSet()) {
   2116         char buf[256] = "";
   2117         sat_strerror(errno, buf, sizeof(buf));
   2118         logprintf(0, "Process Error: Thread %d, "
   2119                      "Network write failed, bailing. (%s)\n",
   2120                   thread_num_, buf);
   2121         status_ = false;
   2122       }
   2123       return false;
   2124     }
   2125     size = size - transferred;
   2126   }
   2127   return true;
   2128 }
   2129 
   2130 // Receive a page. Return false if a page was not received.
   2131 bool NetworkThread::ReceivePage(int sock, struct page_entry *dst) {
   2132   int page_length = sat_->page_length();
   2133   char *address = static_cast<char*>(dst->addr);
   2134 
   2135   // Maybe we will get our data back again, maybe not.
   2136   int size = page_length;
   2137   while (size) {
   2138     int transferred = recv(sock, address + (page_length - size), size, 0);
   2139     if ((transferred == 0) || (transferred == -1)) {
   2140       // Typically the network slave thread exits once the network
   2141       // master thread stops sending data.
   2142       if (IsNetworkStopSet()) {
   2143         int err = errno;
   2144         if (transferred == 0 && err == 0) {
   2145           // Two system setups will not sync exactly,
   2146           // allow early exit, but log it.
   2147           logprintf(0, "Log: Net thread did not receive any data, exiting.\n");
   2148         } else {
   2149           char buf[256] = "";
   2150           sat_strerror(err, buf, sizeof(buf));
   2151           // Print why we failed.
   2152           logprintf(0, "Process Error: Thread %d, "
   2153                        "Network read failed, bailing (%s).\n",
   2154                     thread_num_, buf);
   2155           status_ = false;
   2156           // Print arguments and results.
   2157           logprintf(0, "Log: recv(%d, address %x, size %x, 0) == %x, err %d\n",
   2158                     sock, address + (page_length - size),
   2159                     size, transferred, err);
   2160           if ((transferred == 0) &&
   2161               (page_length - size < 512) &&
   2162               (page_length - size > 0)) {
   2163             // Print the null-terminated data received, to see who's been
   2164             // sending us suspicious unwanted data.
   2165             address[page_length - size] = 0;
   2166             logprintf(0, "Log: received  %d bytes: '%s'\n",
   2167                       page_length - size, address);
   2168           }
   2169         }
   2170       }
   2171       return false;
   2172     }
   2173     size = size - transferred;
   2174   }
   2175   return true;
   2176 }
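
// Both transfer loops above maintain the same invariant: 'size' counts the
// bytes still outstanding, so 'address + (page_length - size)' is always
// the resume point. For example, with page_length = 4096, after partial
// transfers of 1500 and 1500 bytes, size = 1096 and the next call resumes
// at byte offset 3000.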
   2177 
   2178 // Network IO work loop. Execute until marked done.
   2179 // Return true if the thread ran as expected.
   2180 bool NetworkThread::Work() {
   2181   logprintf(9, "Log: Starting network thread %d, ip %s\n",
   2182             thread_num_,
   2183             ipaddr_);
   2184 
   2185   // Make a socket.
   2186   int sock = 0;
   2187   if (!CreateSocket(&sock))
   2188     return false;
   2189 
   2190   // Network IO loop requires network slave thread to have already initialized.
   2191   // We will sleep here for a while to ensure that the slave thread will be
   2192   // listening by the time we connect.
   2193   // Sleep for 15 seconds.
   2194   sat_sleep(15);
   2195   logprintf(9, "Log: Starting execution of network thread %d, ip %s\n",
   2196             thread_num_,
   2197             ipaddr_);
   2198 
   2199 
   2200   // Connect to a slave thread.
   2201   if (!Connect(sock))
   2202     return false;
   2203 
   2204   // Loop until done.
   2205   bool result = true;
   2206   int strict = sat_->strict();
   2207   int64 loops = 0;
   2208   while (IsReadyToRun()) {
   2209     struct page_entry src;
   2210     struct page_entry dst;
   2211     result = result && sat_->GetValid(&src);
   2212     result = result && sat_->GetEmpty(&dst);
   2213     if (!result) {
   2214       logprintf(0, "Process Error: net_thread failed to pop pages, "
   2215                 "bailing\n");
   2216       break;
   2217     }
   2218 
   2219     // Check data correctness.
   2220     if (strict)
   2221       CrcCheckPage(&src);
   2222 
   2223     // Do the network write.
   2224     if (!(result = result && SendPage(sock, &src)))
   2225       break;
   2226 
   2227     // Update pattern reference to reflect new contents.
   2228     dst.pattern = src.pattern;
   2229 
   2230     // Do the network read.
   2231     if (!(result = result && ReceivePage(sock, &dst)))
   2232       break;
   2233 
   2234     // Ensure that the transfer ended up with correct data.
   2235     if (strict)
   2236       CrcCheckPage(&dst);
   2237 
   2238     // Return all of our pages to the queue.
   2239     result = result && sat_->PutValid(&dst);
   2240     result = result && sat_->PutEmpty(&src);
   2241     if (!result) {
   2242       logprintf(0, "Process Error: net_thread failed to push pages, "
   2243                 "bailing\n");
   2244       break;
   2245     }
   2246     loops++;
   2247   }
   2248 
   2249   pages_copied_ = loops;
   2250   status_ = result;
   2251 
   2252   // Clean up.
   2253   CloseSocket(sock);
   2254 
   2255   logprintf(9, "Log: Completed %d: network thread status %d, "
   2256                "%d pages copied\n",
   2257             thread_num_, status_, pages_copied_);
   2258   return result;
   2259 }
   2260 
   2261 // Spawn slave threads for incoming connections.
   2262 bool NetworkListenThread::SpawnSlave(int newsock, int threadid) {
   2263   logprintf(12, "Log: Listen thread spawning slave\n");
   2264 
   2265   // Spawn slave thread, to reflect network traffic back to sender.
   2266   ChildWorker *child_worker = new ChildWorker;
   2267   child_worker->thread.SetSock(newsock);
   2268   child_worker->thread.InitThread(threadid, sat_, os_, patternlist_,
   2269                                   &child_worker->status);
   2270   child_worker->status.Initialize();
   2271   child_worker->thread.SpawnThread();
   2272   child_workers_.push_back(child_worker);
   2273 
   2274   return true;
   2275 }
   2276 
   2277 // Reap slave threads.
   2278 bool NetworkListenThread::ReapSlaves() {
   2279   bool result = true;
   2280   // Gather status and reap threads.
   2281   logprintf(12, "Log: Joining all outstanding threads\n");
   2282 
   2283   for (size_t i = 0; i < child_workers_.size(); i++) {
   2284     NetworkSlaveThread& child_thread = child_workers_[i]->thread;
   2285     logprintf(12, "Log: Joining slave thread %d\n", i);
   2286     child_thread.JoinThread();
   2287     if (child_thread.GetStatus() != 1) {
   2288       logprintf(0, "Process Error: Slave Thread %d failed with status %d\n", i,
   2289                 child_thread.GetStatus());
   2290       result = false;
   2291     }
   2292     errorcount_ += child_thread.GetErrorCount();
   2293     logprintf(9, "Log: Slave Thread %d found %lld miscompares\n", i,
   2294               child_thread.GetErrorCount());
   2295     pages_copied_ += child_thread.GetPageCount();
   2296   }
   2297 
   2298   return result;
   2299 }
   2300 
   2301 // Network listener IO work loop. Execute until marked done.
   2302 // Return false on fatal software error.
   2303 bool NetworkListenThread::Work() {
   2304   logprintf(9, "Log: Starting network listen thread %d\n",
   2305             thread_num_);
   2306 
   2307   // Make a socket.
   2308   sock_ = 0;
   2309   if (!CreateSocket(&sock_)) {
   2310     status_ = false;
   2311     return false;
   2312   }
   2313   logprintf(9, "Log: Listen thread created sock\n");
   2314 
   2315   // Allow incoming connections to be queued up by the socket library.
   2316   int newsock = 0;
   2317   Listen();
   2318   logprintf(12, "Log: Listen thread waiting for incoming connections\n");
   2319 
   2320   // Wait on incoming connections, and spawn worker threads for them.
   2321   int threadcount = 0;
   2322   while (IsReadyToRun()) {
   2323     // Poll for connections that we can accept().
   2324     if (Wait()) {
   2325       // Accept those connections.
   2326       logprintf(12, "Log: Listen thread found incoming connection\n");
   2327       if (GetConnection(&newsock)) {
   2328         SpawnSlave(newsock, threadcount);
   2329         threadcount++;
   2330       }
   2331     }
   2332   }
   2333 
   2334   // Gather status and join spawned threads.
   2335   ReapSlaves();
   2336 
   2337   // Delete the child workers.
   2338   for (ChildVector::iterator it = child_workers_.begin();
   2339        it != child_workers_.end(); ++it) {
   2340     (*it)->status.Destroy();
   2341     delete *it;
   2342   }
   2343   child_workers_.clear();
   2344 
   2345   CloseSocket(sock_);
   2346 
   2347   status_ = true;
   2348   logprintf(9,
   2349             "Log: Completed %d: network listen thread status %d, "
   2350             "%d pages copied\n",
   2351             thread_num_, status_, pages_copied_);
   2352   return true;
   2353 }
   2354 
   2355 // Set network reflector socket struct.
   2356 void NetworkSlaveThread::SetSock(int sock) {
   2357   sock_ = sock;
   2358 }
   2359 
   2360 // Network reflector IO work loop. Execute until marked done.
   2361 // Return false on fatal software error.
   2362 bool NetworkSlaveThread::Work() {
   2363   logprintf(9, "Log: Starting network slave thread %d\n",
   2364             thread_num_);
   2365 
   2366   // Verify that we have a socket.
   2367   int sock = sock_;
   2368   if (!sock) {
   2369     status_ = false;
   2370     return false;
   2371   }
   2372 
   2373   // Loop until done.
   2374   int64 loops = 0;
   2375   // Init a local buffer for storing data.
   2376   void *local_page = NULL;
   2377 #ifdef HAVE_POSIX_MEMALIGN
   2378   int result = posix_memalign(&local_page, 512, sat_->page_length());
   2379 #else
   2380   local_page = memalign(512, sat_->page_length());
   2381   int result = (local_page == 0);
   2382 #endif
   2383   if (result) {
   2384     logprintf(0, "Process Error: net slave posix_memalign "
   2385                  "returned %d (fail)\n",
   2386               result);
   2387     status_ = false;
   2388     return false;
   2389   }
   2390 
   2391   struct page_entry page;
   2392   page.addr = local_page;
   2393 
   2394   // This thread will continue to run as long as the thread on the other end of
   2395   // the socket is still sending and receiving data.
   2396   while (1) {
   2397     // Do the network read.
   2398     if (!ReceivePage(sock, &page))
   2399       break;
   2400 
   2401     // Do the network write.
   2402     if (!SendPage(sock, &page))
   2403       break;
   2404 
   2405     loops++;
   2406   }
   2407 
   2408   pages_copied_ = loops;
   2409   // No results provided from this type of thread.
   2410   status_ = true;
   2411 
   2412   // Clean up.
   2413   CloseSocket(sock);
   2414 
   2415   logprintf(9,
   2416             "Log: Completed %d: network slave thread status %d, "
   2417             "%d pages copied\n",
   2418             thread_num_, status_, pages_copied_);
   2419   return true;
   2420 }
   2421 
   2422 // Thread work loop. Execute until marked finished.
   2423 bool ErrorPollThread::Work() {
   2424   logprintf(9, "Log: Starting system error poll thread %d\n", thread_num_);
   2425 
   2426   // This calls a generic error polling function in the Os abstraction layer.
   2427   do {
   2428     errorcount_ += os_->ErrorPoll();
   2429     os_->ErrorWait();
   2430   } while (IsReadyToRun());
   2431 
   2432   logprintf(9, "Log: Finished system error poll thread %d: %d errors\n",
   2433             thread_num_, errorcount_);
   2434   status_ = true;
   2435   return true;
   2436 }
   2437 
   2438 // Worker thread to heat up CPU.
   2439 // This thread does not evaluate pass/fail or software error.
   2440 bool CpuStressThread::Work() {
   2441   logprintf(9, "Log: Starting CPU stress thread %d\n", thread_num_);
   2442 
   2443   do {
   2444     // Run ludloff's platform/CPU-specific assembly workload.
   2445     os_->CpuStressWorkload();
   2446     YieldSelf();
   2447   } while (IsReadyToRun());
   2448 
   2449   logprintf(9, "Log: Finished CPU stress thread %d:\n",
   2450             thread_num_);
   2451   status_ = true;
   2452   return true;
   2453 }
   2454 
   2455 CpuCacheCoherencyThread::CpuCacheCoherencyThread(cc_cacheline_data *data,
   2456                                                  int cacheline_count,
   2457                                                  int thread_num,
   2458                                                  int thread_count,
   2459                                                  int inc_count) {
   2460   cc_cacheline_data_ = data;
   2461   cc_cacheline_count_ = cacheline_count;
   2462   cc_thread_num_ = thread_num;
   2463   cc_thread_count_ = thread_count;
   2464   cc_inc_count_ = inc_count;
   2465 }
   2466 
   2467 // A very simple pseudorandom generator.  Since the random number is based
   2468 // on only a few simple logic operations, it can be done quickly in registers
   2469 // and the compiler can inline it.
   2470 uint64 CpuCacheCoherencyThread::SimpleRandom(uint64 seed) {
   2471   return (seed >> 1) ^ (-(seed & 1) & kRandomPolynomial);
   2472 }
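
// SimpleRandom() is one step of a right-shifting Galois LFSR: the
// -(seed & 1) term is all-ones exactly when the bit shifted out is 1, in
// which case the tap mask kRandomPolynomial (defined elsewhere in this
// file) is XORed in. With a maximal-length tap mask the nonzero states
// form a single cycle, so a nonzero seed never degenerates to 0. A usage
// sketch (names illustrative):
//
//   uint64 r = nonzero_seed;
//   for (int i = 0; i < n; i++) {
//     r = SimpleRandom(r);
//     UseIndex(r % cc_cacheline_count_);
//   }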
   2473 
   2474 // Worker thread to test the cache coherency of the CPUs.
   2475 // Return false on fatal sw error.
   2476 bool CpuCacheCoherencyThread::Work() {
   2477   logprintf(9, "Log: Starting the Cache Coherency thread %d\n",
   2478             cc_thread_num_);
   2479   uint64 time_start, time_end;
   2480   struct timeval tv;
   2481 
   2482   // Use a slightly more robust random number for the initial
   2483   // value, so the random sequences from the simple generator will
   2484   // be more divergent.
   2485 #ifdef HAVE_RAND_R
   2486   unsigned int seed = static_cast<unsigned int>(gettid());
   2487   uint64 r = static_cast<uint64>(rand_r(&seed));
   2488   r |= static_cast<uint64>(rand_r(&seed)) << 32;
   2489 #else
   2490   srand(time(NULL));
   2491   uint64 r = static_cast<uint64>(rand());  // NOLINT
   2492   r |= static_cast<uint64>(rand()) << 32;  // NOLINT
   2493 #endif
   2494 
   2495   gettimeofday(&tv, NULL);  // Get the timestamp before increments.
   2496   time_start = tv.tv_sec * 1000000ULL + tv.tv_usec;
   2497 
   2498   uint64 total_inc = 0;  // Total increments done by the thread.
   2499   while (IsReadyToRun()) {
   2500     for (int i = 0; i < cc_inc_count_; i++) {
   2501       // Choose a data structure at random and increment the appropriate
   2502       // member according to the offset (which is the same as the
   2503       // thread number).
   2504       r = SimpleRandom(r);
   2505       int cline_num = r % cc_cacheline_count_;
   2506       int offset;
   2507       // Reverse the order for odd numbered threads in odd numbered cache
   2508       // lines.  This is designed for massively multi-core systems where the
   2509       // number of cores exceeds the bytes in a cache line, so "distant" cores
   2510       // get a chance to exercise cache coherency between them.
   2511       if (cline_num & cc_thread_num_ & 1)
   2512         offset = (cc_thread_count_ & ~1) - cc_thread_num_;
   2513       else
   2514         offset = cc_thread_num_;
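      // Worked example of the reversal: with cc_thread_count_ = 8,
      // thread 3 uses offset (8 & ~1) - 3 = 5 on odd-numbered cache lines
      // and offset 3 on even ones, while thread 5 does the opposite, so
      // distant cores end up dirtying each other's usual byte positions.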
   2515       // Increment the member of the randomly selected structure.
   2516       (cc_cacheline_data_[cline_num].num[offset])++;
   2517     }
   2518 
   2519     total_inc += cc_inc_count_;
   2520 
   2521     // Check whether the local counter matches the global value
   2522     // summed across all the cache line structures for this thread.
   2523     int cc_global_num = 0;
   2524     for (int cline_num = 0; cline_num < cc_cacheline_count_; cline_num++) {
   2525       int offset;
   2526       // Perform the same offset calculation from above.
   2527       if (cline_num & cc_thread_num_ & 1)
   2528         offset = (cc_thread_count_ & ~1) - cc_thread_num_;
   2529       else
   2530         offset = cc_thread_num_;
   2531       cc_global_num += cc_cacheline_data_[cline_num].num[offset];
   2532       // Reset the cacheline member's value for the next run.
   2533       cc_cacheline_data_[cline_num].num[offset] = 0;
   2534     }
   2535     if (sat_->error_injection())
   2536       cc_global_num = -1;
   2537 
   2538     // Since the count is only stored in a byte, to squeeze more into a
   2539     // single cache line, only compare it as a byte.  If corruption does
   2540     // occur, the chance that a single thread misses it is 1 in 256; if
   2541     // it affects all cores, the chance of it going entirely unnoticed is
   2542     // vanishingly small.  It seems unlikely any failure case would be
   2543     // off by more than a small number.
   2544     if ((cc_global_num & 0xff) != (cc_inc_count_ & 0xff)) {
   2545       errorcount_++;
   2546       logprintf(0, "Hardware Error: global(%d) and local(%d) do not match\n",
   2547                 cc_global_num, cc_inc_count_);
   2548     }
   2549   }
   2550   gettimeofday(&tv, NULL);  // Get the timestamp at the end.
   2551   time_end = tv.tv_sec * 1000000ULL + tv.tv_usec;
   2552 
   2553   uint64 us_elapsed = time_end - time_start;
   2554   // inc_rate is the no. of increments per second.
   2555   double inc_rate = total_inc * 1e6 / us_elapsed;
   2556 
   2557   logprintf(4, "Stats: CC Thread(%d): Time=%llu us,"
   2558             " Increments=%llu, Increments/sec = %.6lf\n",
   2559             cc_thread_num_, us_elapsed, total_inc, inc_rate);
   2560   logprintf(9, "Log: Finished CPU Cache Coherency thread %d:\n",
   2561             cc_thread_num_);
   2562   status_ = true;
   2563   return true;
   2564 }
   2565 
   2566 DiskThread::DiskThread(DiskBlockTable *block_table) {
   2567   read_block_size_ = kSectorSize;   // default 1 sector (512 bytes)
   2568   write_block_size_ = kSectorSize;  // this assumes read and write block size
   2569                                     // are the same
   2570   segment_size_ = -1;               // use the entire disk as one segment
   2571   cache_size_ = 16 * 1024 * 1024;   // assume 16MiB cache by default
   2572   // Use a queue such that 3/2 times as much data as the cache can hold
   2573   // is written before it is read so that there is little chance the read
   2574   // data is in the cache.
   2575   queue_size_ = ((cache_size_ / write_block_size_) * 3) / 2;
   2576   blocks_per_segment_ = 32;
   2577 
   2578   read_threshold_ = 100000;         // 100ms is a reasonable limit for
   2579   write_threshold_ = 100000;        // reading/writing a sector
   2580 
   2581   read_timeout_ = 5000000;          // 5 seconds should be a long enough
   2582   write_timeout_ = 5000000;         // timeout for reading/writing a block
   2583 
   2584   device_sectors_ = 0;
   2585   non_destructive_ = 0;
   2586 
   2587 #ifdef HAVE_LIBAIO_H
   2588   aio_ctx_ = 0;
   2589 #endif
   2590   block_table_ = block_table;
   2591   update_block_table_ = 1;
   2592 
   2593   block_buffer_ = NULL;
   2594 
   2595   blocks_written_ = 0;
   2596   blocks_read_ = 0;
   2597 }
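
// With the defaults above, the write queue works out to
// queue_size_ = (16 MiB / 512 B) * 3 / 2 = 32768 * 3 / 2 = 49152 blocks,
// i.e. 24 MiB of writes issued before the first read, comfortably more
// than the assumed 16 MiB on-disk cache.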
   2598 
   2599 DiskThread::~DiskThread() {
   2600   if (block_buffer_)
   2601     free(block_buffer_);
   2602 }
   2603 
   2604 // Set filename for device file (in /dev).
   2605 void DiskThread::SetDevice(const char *device_name) {
   2606   device_name_ = device_name;
   2607 }
   2608 
   2609 // Set various parameters that control the behaviour of the test.
   2610 // -1 is used as a sentinel value on each parameter (except non_destructive)
   2611 // to indicate that the parameter should be left unset.
   2612 bool DiskThread::SetParameters(int read_block_size,
   2613                                int write_block_size,
   2614                                int64 segment_size,
   2615                                int64 cache_size,
   2616                                int blocks_per_segment,
   2617                                int64 read_threshold,
   2618                                int64 write_threshold,
   2619                                int non_destructive) {
   2620   if (read_block_size != -1) {
   2621     // Blocks must be aligned to the disk's sector size.
   2622     if (read_block_size % kSectorSize != 0) {
   2623       logprintf(0, "Process Error: Block size must be a multiple of %d "
   2624                 "(thread %d).\n", kSectorSize, thread_num_);
   2625       return false;
   2626     }
   2627 
   2628     read_block_size_ = read_block_size;
   2629   }
   2630 
   2631   if (write_block_size != -1) {
   2632     // Write blocks must be aligned to the disk's sector size and to the
   2633     // block size.
   2634     if (write_block_size % kSectorSize != 0) {
   2635       logprintf(0, "Process Error: Write block size must be a multiple "
   2636                 "of %d (thread %d).\n", kSectorSize, thread_num_);
   2637       return false;
   2638     }
   2639     if (write_block_size % read_block_size_ != 0) {
   2640       logprintf(0, "Process Error: Write block size must be a multiple "
   2641                 "of the read block size, which is %d (thread %d).\n",
   2642                 read_block_size_, thread_num_);
   2643       return false;
   2644     }
   2645 
   2646     write_block_size_ = write_block_size;
   2647 
   2648   } else {
   2649     // Make sure write_block_size_ is still valid.
   2650     if (read_block_size_ > write_block_size_) {
   2651       logprintf(5, "Log: Assuming write block size equal to read block size, "
   2652                 "which is %d (thread %d).\n", read_block_size_,
   2653                 thread_num_);
   2654       write_block_size_ = read_block_size_;
   2655     } else {
   2656       if (write_block_size_ % read_block_size_ != 0) {
   2657         logprintf(0, "Process Error: Write block size (defined as %d) must "
   2658                   "be a multiple of the read block size, which is %d "
   2659                   "(thread %d).\n", write_block_size_, read_block_size_,
   2660                   thread_num_);
   2661         return false;
   2662       }
   2663     }
   2664   }
   2665 
   2666   if (cache_size != -1) {
   2667     cache_size_ = cache_size;
   2668   }
   2669 
   2670   if (blocks_per_segment != -1) {
   2671     if (blocks_per_segment <= 0) {
   2672       logprintf(0, "Process Error: Blocks per segment must be greater than "
   2673                    "zero (thread %d).\n", thread_num_);
   2674       return false;
   2675     }
   2676 
   2677     blocks_per_segment_ = blocks_per_segment;
   2678   }
   2679 
   2680   if (read_threshold != -1) {
   2681     if (read_threshold <= 0) {
   2682       logprintf(0, "Process Error: Read threshold must be greater than "
   2683                    "zero (thread %d).\n", thread_num_);
   2684       return false;
   2685     }
   2686 
   2687     read_threshold_ = read_threshold;
   2688   }
   2689 
   2690   if (write_threshold != -1) {
   2691     if (write_threshold <= 0) {
   2692       logprintf(0, "Process Error: Write threshold must be greater than "
   2693                    "zero (thread %d).\n", thread_num_);
   2694       return false;
   2695     }
   2696 
   2697     write_threshold_ = write_threshold;
   2698   }
   2699 
   2700   if (segment_size != -1) {
   2701     // Segments must be aligned to the disk's sector size.
   2702     if (segment_size % kSectorSize != 0) {
   2703       logprintf(0, "Process Error: Segment size must be a multiple of %d"
   2704                 " (thread %d).\n", kSectorSize, thread_num_);
   2705       return false;
   2706     }
   2707 
   2708     segment_size_ = segment_size / kSectorSize;
   2709   }
   2710 
   2711   non_destructive_ = non_destructive;
   2712 
   2713   // Having a queue of 150% of the blocks that fit in the disk's cache
   2714   // should be enough to force out the oldest block before it is read,
   2715   // hence making sure the data comes from the disk and not the cache.
   2716   queue_size_ = ((cache_size_ / write_block_size_) * 3) / 2;
   2717   // Updating DiskBlockTable parameters
   2718   if (update_block_table_) {
   2719     block_table_->SetParameters(kSectorSize, write_block_size_,
   2720                                 device_sectors_, segment_size_,
   2721                                 device_name_);
   2722   }
   2723   return true;
   2724 }
   2725 
   2726 // Open a device, return false on failure.
   2727 bool DiskThread::OpenDevice(int *pfile) {
   2728   int flags = O_RDWR | O_SYNC | O_LARGEFILE;
   2729   int fd = open(device_name_.c_str(), flags | O_DIRECT, 0);
   2730   if (O_DIRECT != 0 && fd < 0 && errno == EINVAL) {
   2731     fd = open(device_name_.c_str(), flags, 0);  // Try without O_DIRECT
   2732     os_->ActivateFlushPageCache();
   2733   }
   2734   if (fd < 0) {
   2735     logprintf(0, "Process Error: Failed to open device %s (thread %d)!!\n",
   2736               device_name_.c_str(), thread_num_);
   2737     return false;
   2738   }
   2739   *pfile = fd;
   2740 
   2741   return GetDiskSize(fd);
   2742 }
   2743 
   2744 // Retrieves the size (in bytes) of the disk/file.
   2745 // Return false on failure.
   2746 bool DiskThread::GetDiskSize(int fd) {
   2747   struct stat device_stat;
   2748   if (fstat(fd, &device_stat) == -1) {
   2749     logprintf(0, "Process Error: Unable to fstat disk %s (thread %d).\n",
   2750               device_name_.c_str(), thread_num_);
   2751     return false;
   2752   }
   2753 
   2754   // For a block device, an ioctl is needed to get the size since the size
   2755   // of the device file (i.e. /dev/sdb) is 0.
   2756   if (S_ISBLK(device_stat.st_mode)) {
   2757     uint64 block_size = 0;
   2758 
   2759     if (ioctl(fd, BLKGETSIZE64, &block_size) == -1) {
   2760       logprintf(0, "Process Error: Unable to ioctl disk %s (thread %d).\n",
   2761                 device_name_.c_str(), thread_num_);
   2762       return false;
   2763     }
   2764 
   2765     // Zero size indicates a nonworking device.
   2766     if (block_size == 0) {
   2767       os_->ErrorReport(device_name_.c_str(), "device-size-zero", 1);
   2768       ++errorcount_;
   2769       status_ = true;  // Avoid a procedural error.
   2770       return false;
   2771     }
   2772 
   2773     device_sectors_ = block_size / kSectorSize;
   2774 
   2775   } else if (S_ISREG(device_stat.st_mode)) {
   2776     device_sectors_ = device_stat.st_size / kSectorSize;
   2777 
   2778   } else {
   2779     logprintf(0, "Process Error: %s is not a regular file or block "
   2780               "device (thread %d).\n", device_name_.c_str(),
   2781               thread_num_);
   2782     return false;
   2783   }
   2784 
   2785   logprintf(12, "Log: Device sectors: %lld on disk %s (thread %d).\n",
   2786             device_sectors_, device_name_.c_str(), thread_num_);
   2787 
   2788   if (update_block_table_) {
   2789     block_table_->SetParameters(kSectorSize, write_block_size_,
   2790                                 device_sectors_, segment_size_,
   2791                                 device_name_);
   2792   }
   2793 
   2794   return true;
   2795 }
   2796 
   2797 bool DiskThread::CloseDevice(int fd) {
   2798   close(fd);
   2799   return true;
   2800 }
   2801 
   2802 // Return the time in microseconds.
   2803 int64 DiskThread::GetTime() {
   2804   struct timeval tv;
   2805   gettimeofday(&tv, NULL);
   2806   return tv.tv_sec * 1000000 + tv.tv_usec;
   2807 }
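
// Note: gettimeofday() follows the wall clock, which can step under NTP
// adjustment. If drift-free intervals were ever needed here, a monotonic
// equivalent would look like this (a sketch, not the original code):
//
//   struct timespec ts;
//   clock_gettime(CLOCK_MONOTONIC, &ts);
//   return ts.tv_sec * 1000000LL + ts.tv_nsec / 1000;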
   2808 
   2809 // Do randomized reads and (possibly) writes on a device.
   2810 // Return false on fatal SW error, true on SW success,
   2811 // regardless of whether HW failed.
   2812 bool DiskThread::DoWork(int fd) {
   2813   int64 block_num = 0;
   2814   int64 num_segments;
   2815 
   2816   if (segment_size_ == -1) {
   2817     num_segments = 1;
   2818   } else {
   2819     num_segments = device_sectors_ / segment_size_;
   2820     if (device_sectors_ % segment_size_ != 0)
   2821       num_segments++;
   2822   }
   2823 
   2824   // Disk size should be at least 3x cache size.  See comment later for
   2825   // details.
   2826   sat_assert(device_sectors_ * kSectorSize > 3 * cache_size_);
   2827 
   2828   // This disk test works by writing blocks with a certain pattern to
   2829   // disk, then reading them back and verifying it against the pattern
   2830   // at a later time.  A failure happens when either the block cannot
   2831   // be written/read or when the read block is different than what was
   2832   // written.  If a block takes too long to write/read, then a warning
   2833   // is given instead of an error since taking too long is not
   2834   // necessarily an error.
   2835   //
   2836   // To prevent the read blocks from coming from the disk cache,
   2837   // enough blocks are written before read such that a block would
   2838   // be ejected from the disk cache by the time it is read.
   2839   //
   2840   // TODO(amistry): Implement some sort of read/write throttling.  The
   2841   //                flood of asynchronous I/O requests when a drive is
   2842   //                unplugged is causing the application and kernel to
   2843   //                become unresponsive.
   2844 
   2845   while (IsReadyToRun()) {
   2846     // Write blocks to disk.
   2847     logprintf(16, "Log: Write phase %sfor disk %s (thread %d).\n",
   2848               non_destructive_ ? "(disabled) " : "",
   2849               device_name_.c_str(), thread_num_);
   2850     while (IsReadyToRunNoPause() &&
   2851            in_flight_sectors_.size() <
   2852                static_cast<size_t>(queue_size_ + 1)) {
   2853       // Confine testing to a particular segment of the disk.
   2854       int64 segment = (block_num / blocks_per_segment_) % num_segments;
   2855       if (!non_destructive_ &&
   2856           (block_num % blocks_per_segment_ == 0)) {
   2857         logprintf(20, "Log: Starting to write segment %lld out of "
   2858                   "%lld on disk %s (thread %d).\n",
   2859                   segment, num_segments, device_name_.c_str(),
   2860                   thread_num_);
   2861       }
   2862       block_num++;
   2863 
   2864       BlockData *block = block_table_->GetUnusedBlock(segment);
   2865 
   2866       // If an unused sequence of sectors could not be found, skip to the
   2867       // next block to process.  Soon, a new segment will come and new
   2868       // sectors will be able to be allocated.  This effectively puts a
   2869       // minimum on the disk size of 3x the stated cache size, or 48MiB
   2870       // if a cache size is not given (since the cache defaults to 16MiB).
   2871       // Given that today's caches are in the low-MiB range and drive
   2872       // sizes in the mid-GB range, this shouldn't pose a problem.
   2873       // The 3x minimum comes from the following:
   2874       //   1. In order to allocate 'y' blocks from a segment, the
   2875       //      segment must contain at least 2y blocks or else an
   2876       //      allocation may not succeed.
   2877       //   2. Assume the entire disk is one segment.
   2878       //   3. A full write phase consists of writing blocks corresponding to
   2879       //      3/2 cache size.
   2880       //   4. Therefore, the one segment must have 2 * 3/2 * cache
   2881       //      size worth of blocks = 3 * cache size worth of blocks
   2882       //      to complete.
   2883       // In non-destructive mode, don't write anything to disk.
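      // Plugging in the defaults: 2 * (3/2) * 16 MiB = 48 MiB worth of
      // blocks, matching the 3x-cache-size minimum asserted at the top of
      // DoWork().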
   2884       if (!non_destructive_) {
   2885         if (!WriteBlockToDisk(fd, block)) {
   2886           block_table_->RemoveBlock(block);
   2887           return true;
   2888         }
   2889         blocks_written_++;
   2890       }
   2891       // The block is initialized either by writing or, in the
   2892       // non-destructive case, by being added for later reading.
   2893       // initialized by being added into the datastructure for later reading.
   2894       block->initialized();
   2895 
   2896       in_flight_sectors_.push(block);
   2897     }
   2898     if (!os_->FlushPageCache())  // If O_DIRECT worked, this will be a NOP.
   2899       return false;
   2900 
   2901     // Verify blocks on disk.
   2902     logprintf(20, "Log: Read phase for disk %s (thread %d).\n",
   2903               device_name_.c_str(), thread_num_);
   2904     while (IsReadyToRunNoPause() && !in_flight_sectors_.empty()) {
   2905       BlockData *block = in_flight_sectors_.front();
   2906       in_flight_sectors_.pop();
   2907       if (!ValidateBlockOnDisk(fd, block))
   2908         return true;
   2909       block_table_->RemoveBlock(block);
   2910       blocks_read_++;
   2911     }
   2912   }
   2913 
   2914   pages_copied_ = blocks_written_ + blocks_read_;
   2915   return true;
   2916 }
   2917 
   2918 // Do an asynchronous disk I/O operation.
   2919 // Return false if the IO is not set up.
   2920 bool DiskThread::AsyncDiskIO(IoOp op, int fd, void *buf, int64 size,
   2921                             int64 offset, int64 timeout) {
   2922 #ifdef HAVE_LIBAIO_H
   2923   // Use the Linux native asynchronous I/O interface for reading/writing.
   2924   // A read/write consists of three basic steps:
   2925   //    1. create an io context.
   2926   //    2. prepare and submit an io request to the context
   2927   //    3. wait for an event on the context.
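          //
          // A minimal sketch of that sequence for a read, using the libaio
          // helpers (io_prep_pread fills in the iocb the same way the code
          // below does by hand):
          //
          //   io_context_t ctx = 0;
          //   io_setup(5, &ctx);                      // 1. create context
          //   struct iocb cb;
          //   io_prep_pread(&cb, fd, buf, size, offset);
          //   struct iocb *cbs[] = { &cb };
          //   io_submit(ctx, 1, cbs);                 // 2. submit request
          //   struct io_event event;
          //   io_getevents(ctx, 1, 1, &event, NULL);  // 3. wait for event
          //   io_destroy(ctx);
          //
          // The code below fills in the iocb manually so one path can handle
          // both reads and writes.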
   2928 
   2929   struct {
   2930     const int opcode;
   2931     const char *op_str;
   2932     const char *error_str;
   2933   } operations[2] = {
   2934     { IO_CMD_PREAD, "read", "disk-read-error" },
   2935     { IO_CMD_PWRITE, "write", "disk-write-error" }
   2936   };
   2937 
   2938   struct iocb cb;
   2939   memset(&cb, 0, sizeof(cb));
   2940 
   2941   cb.aio_fildes = fd;
   2942   cb.aio_lio_opcode = operations[op].opcode;
   2943   cb.u.c.buf = buf;
   2944   cb.u.c.nbytes = size;
   2945   cb.u.c.offset = offset;
   2946 
   2947   struct iocb *cbs[] = { &cb };
   2948   if (io_submit(aio_ctx_, 1, cbs) != 1) {
   2949     int error = errno;
   2950     char errbuf[256];
   2951     sat_strerror(error, errbuf, sizeof(errbuf));
   2952     logprintf(0, "Process Error: Unable to submit async %s "
   2953                  "on disk %s (thread %d). Error %d, %s\n",
   2954               operations[op].op_str, device_name_.c_str(),
   2955               thread_num_, error, errbuf);
   2956     return false;
   2957   }
   2958 
   2959   struct io_event event;
   2960   memset(&event, 0, sizeof(event));
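          // The timeout argument is in microseconds; split it into the whole
          // seconds and nanosecond remainder that timespec expects.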
   2961   struct timespec tv;
   2962   tv.tv_sec = timeout / 1000000;
   2963   tv.tv_nsec = (timeout % 1000000) * 1000;
   2964   if (io_getevents(aio_ctx_, 1, 1, &event, &tv) != 1) {
   2965     // A ctrl-c from the keyboard will cause io_getevents to fail with an
   2966     // EINTR error code.  This is not an error, so don't treat it as one,
   2967     // but do log it.
   2968     int error = errno;
   2969     if (error == EINTR) {
   2970       logprintf(5, "Log: %s interrupted on disk %s (thread %d).\n",
   2971                 operations[op].op_str, device_name_.c_str(),
   2972                 thread_num_);
   2973     } else {
   2974       os_->ErrorReport(device_name_.c_str(), operations[op].error_str, 1);
   2975       errorcount_ += 1;
   2976       logprintf(0, "Hardware Error: Timeout doing async %s to sectors "
   2977                    "starting at %lld on disk %s (thread %d).\n",
   2978                 operations[op].op_str, offset / kSectorSize,
   2979                 device_name_.c_str(), thread_num_);
   2980     }
   2981 
   2982     // Don't bother checking return codes since io_cancel seems to always fail.
   2983     // Since io_cancel is always failing, destroying and recreating an I/O
   2984     // context is a workaround for canceling an in-progress I/O operation.
   2985     // TODO(amistry): Find out why io_cancel isn't working and make it work.
   2986     io_cancel(aio_ctx_, &cb, &event);
   2987     io_destroy(aio_ctx_);
   2988     aio_ctx_ = 0;
   2989     if (io_setup(5, &aio_ctx_)) {
   2990       int error = errno;
   2991       char errbuf[256];
   2992       sat_strerror(error, errbuf, sizeof(errbuf));
   2993       logprintf(0, "Process Error: Unable to create aio context on disk %s"
   2994                 " (thread %d). Error %d, %s\n",
   2995                 device_name_.c_str(), thread_num_, error, errbuf);
   2996     }
   2997 
   2998     return false;
   2999   }
   3000 
   3001   // event.res contains the number of bytes transferred when the I/O
   3002   // succeeds, or the negated errno value when it fails.
   3003   if (event.res != static_cast<uint64>(size)) {
   3004     errorcount_++;
   3005     os_->ErrorReport(device_name_.c_str(), operations[op].error_str, 1);
   3006 
   3007     int64 result = static_cast<int64>(event.res);
   3008     if (result < 0) {
   3009       switch (result) {
   3010         case -EIO:
   3011           logprintf(0, "Hardware Error: Low-level I/O error while doing %s to "
   3012                        "sectors starting at %lld on disk %s (thread %d).\n",
   3013                     operations[op].op_str, offset / kSectorSize,
   3014                     device_name_.c_str(), thread_num_);
   3015           break;
   3016         default:
   3017           logprintf(0, "Hardware Error: Unknown error while doing %s to "
   3018                        "sectors starting at %lld on disk %s (thread %d).\n",
   3019                     operations[op].op_str, offset / kSectorSize,
   3020                     device_name_.c_str(), thread_num_);
   3021       }
   3022     } else {
   3023       logprintf(0, "Hardware Error: Unable to %s to sectors starting at "
   3024                    "%lld on disk %s (thread %d).\n",
   3025                 operations[op].op_str, offset / kSectorSize,
   3026                 device_name_.c_str(), thread_num_);
   3027     }
   3028     return false;
   3029   }
   3030 
   3031   return true;
   3032 #else  // !HAVE_LIBAIO_H
   3033   return false;
   3034 #endif
   3035 }
   3036 
   3037 // Write a block to disk.
   3038 // Return false if the block is not written.
   3039 bool DiskThread::WriteBlockToDisk(int fd, BlockData *block) {
   3040   memset(block_buffer_, 0, block->size());
   3041 
   3042   // Fill block buffer with a pattern
   3043   struct page_entry pe;
   3044   if (!sat_->GetValid(&pe)) {
   3045     // Even though a valid page could not be obtained, it is not an error
   3046     // since we can always fill in a pattern directly, albeit more slowly.
   3047     unsigned int *memblock = static_cast<unsigned int *>(block_buffer_);
   3048     block->set_pattern(patternlist_->GetRandomPattern());
   3049 
   3050     logprintf(11, "Log: Warning, using pattern fill fallback in "
   3051                   "DiskThread::WriteBlockToDisk on disk %s (thread %d).\n",
   3052               device_name_.c_str(), thread_num_);
   3053 
   3054     for (unsigned int i = 0; i < block->size()/wordsize_; i++) {
   3055       memblock[i] = block->pattern()->pattern(i);
   3056     }
   3057   } else {
   3058     memcpy(block_buffer_, pe.addr, block->size());
   3059     block->set_pattern(pe.pattern);
   3060     sat_->PutValid(&pe);
   3061   }
   3062 
   3063   logprintf(12, "Log: Writing %lld sectors starting at %lld on disk %s"
   3064             " (thread %d).\n",
   3065             block->size()/kSectorSize, block->address(),
   3066             device_name_.c_str(), thread_num_);
   3067 
   3068   int64 start_time = GetTime();
   3069 
   3070   if (!AsyncDiskIO(ASYNC_IO_WRITE, fd, block_buffer_, block->size(),
   3071                    block->address() * kSectorSize, write_timeout_)) {
   3072     return false;
   3073   }
   3074 
   3075   int64 end_time = GetTime();
   3076   logprintf(12, "Log: Writing time: %lld us (thread %d).\n",
   3077             end_time - start_time, thread_num_);
   3078   if (end_time - start_time > write_threshold_) {
   3079     logprintf(5, "Log: Write took %lld us which is longer than threshold "
   3080                  "%lld us on disk %s (thread %d).\n",
   3081               end_time - start_time, write_threshold_, device_name_.c_str(),
   3082               thread_num_);
   3083   }
   3084 
   3085   return true;
   3086 }
   3087 
   3088 // Verify a block on disk.
   3089 // Return true if the block was read; also increments errorcount_
   3090 // if the block had data errors or performance problems.
   3091 bool DiskThread::ValidateBlockOnDisk(int fd, BlockData *block) {
   3092   int64 blocks = block->size() / read_block_size_;
   3093   int64 bytes_read = 0;
   3094   int64 current_blocks;
   3095   int64 current_bytes;
   3096   uint64 address = block->address();
   3097 
   3098   logprintf(20, "Log: Reading sectors starting at %lld on disk %s "
   3099             "(thread %d).\n",
   3100             address, device_name_.c_str(), thread_num_);
   3101 
   3102   // Read block from disk and time the read.  If it takes longer than the
   3103   // threshold, complain.
   3104   if (lseek64(fd, address * kSectorSize, SEEK_SET) == -1) {
   3105     logprintf(0, "Process Error: Unable to seek to sector %lld in "
   3106               "DiskThread::ValidateSectorsOnDisk on disk %s "
   3107               "(thread %d).\n", address, device_name_.c_str(), thread_num_);
   3108     return false;
   3109   }
   3110   int64 start_time = GetTime();
   3111 
   3112   // Split a large write-sized block into small read-sized blocks and
   3113   // read them in groups of randomly-sized multiples of the read block
   3114   // size.  This ensures that all data written to disk by this
   3115   // particular block is tested using a random read pattern.
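          // For example (hypothetical sizes): a write block spanning eight
          // read blocks might be covered by random groups of 3, 1, and 4
          // read blocks; every sector is still read exactly once.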
   3116   while (blocks != 0) {
   3117     // Test all read blocks in a written block.
   3118     current_blocks = (random() % blocks) + 1;
   3119     current_bytes = current_blocks * read_block_size_;
   3120 
   3121     memset(block_buffer_, 0, current_bytes);
   3122 
   3123     logprintf(20, "Log: Reading %lld sectors starting at sector %lld on "
   3124               "disk %s (thread %d)\n",
   3125               current_bytes / kSectorSize,
   3126               (address * kSectorSize + bytes_read) / kSectorSize,
   3127               device_name_.c_str(), thread_num_);
   3128 
   3129     if (!AsyncDiskIO(ASYNC_IO_READ, fd, block_buffer_, current_bytes,
   3130                      address * kSectorSize + bytes_read,
   3131                      write_timeout_)) {
   3132       return false;
   3133     }
   3134 
   3135     int64 end_time = GetTime();
   3136     logprintf(20, "Log: Reading time: %lld us (thread %d).\n",
   3137               end_time - start_time, thread_num_);
   3138     if (end_time - start_time > read_threshold_) {
   3139       logprintf(5, "Log: Read took %lld us which is longer than threshold "
   3140                 "%lld us on disk %s (thread %d).\n",
   3141                 end_time - start_time, read_threshold_,
   3142                 device_name_.c_str(), thread_num_);
   3143     }
   3144 
   3145     // In non-destructive mode, don't compare the block to the pattern since
   3146     // the block was never written to disk in the first place.
   3147     if (!non_destructive_) {
   3148       if (CheckRegion(block_buffer_, block->pattern(), current_bytes,
   3149                       0, bytes_read)) {
   3150         os_->ErrorReport(device_name_.c_str(), "disk-pattern-error", 1);
   3151         errorcount_ += 1;
   3152         logprintf(0, "Hardware Error: Pattern mismatch in block starting at "
   3153                   "sector %lld in DiskThread::ValidateSectorsOnDisk on "
   3154                   "disk %s (thread %d).\n",
   3155                   address, device_name_.c_str(), thread_num_);
   3156       }
   3157     }
   3158 
   3159     bytes_read += current_blocks * read_block_size_;
   3160     blocks -= current_blocks;
   3161   }
   3162 
   3163   return true;
   3164 }
   3165 
   3166 // Direct device access thread.
   3167 // Return false on software error.
   3168 bool DiskThread::Work() {
   3169   int fd;
   3170 
   3171   logprintf(9, "Log: Starting disk thread %d, disk %s\n",
   3172             thread_num_, device_name_.c_str());
   3173 
   3174   srandom(time(NULL));
   3175 
   3176   if (!OpenDevice(&fd)) {
   3177     status_ = false;
   3178     return false;
   3179   }
   3180 
   3181   // Allocate a block buffer aligned to 512 bytes since the kernel requires it
   3182   // when using direct IO.
   3183 #ifdef HAVE_POSIX_MEMALIGN
   3184   int memalign_result = posix_memalign(&block_buffer_, kBufferAlignment,
   3185                                        sat_->page_length());
   3186 #else
   3187   block_buffer_ = memalign(kBufferAlignment, sat_->page_length());
   3188   int memalign_result = (block_buffer_ == 0);
   3189 #endif
   3190   if (memalign_result) {
   3191     CloseDevice(fd);
   3192     logprintf(0, "Process Error: Unable to allocate memory for buffers "
   3193                  "for disk %s (thread %d) posix memalign returned %d.\n",
   3194               device_name_.c_str(), thread_num_, memalign_result);
   3195     status_ = false;
   3196     return false;
   3197   }
   3198 
   3199 #ifdef HAVE_LIBAIO_H
   3200   if (io_setup(5, &aio_ctx_)) {
   3201     CloseDevice(fd);
   3202     logprintf(0, "Process Error: Unable to create aio context for disk %s"
   3203               " (thread %d).\n",
   3204               device_name_.c_str(), thread_num_);
   3205     status_ = false;
   3206     return false;
   3207   }
   3208 #endif
   3209 
   3210   bool result = DoWork(fd);
   3211 
   3212   status_ = result;
   3213 
   3214 #ifdef HAVE_LIBAIO_H
   3215   io_destroy(aio_ctx_);
   3216 #endif
   3217   CloseDevice(fd);
   3218 
   3219   logprintf(9, "Log: Completed %d (disk %s): disk thread status %d, "
   3220                "%d pages copied\n",
   3221             thread_num_, device_name_.c_str(), status_, pages_copied_);
   3222   return result;
   3223 }
   3224 
   3225 RandomDiskThread::RandomDiskThread(DiskBlockTable *block_table)
   3226     : DiskThread(block_table) {
   3227   update_block_table_ = 0;
   3228 }
   3229 
   3230 RandomDiskThread::~RandomDiskThread() {
   3231 }
   3232 
   3233 // Workload for random disk thread.
   3234 bool RandomDiskThread::DoWork(int fd) {
   3235   logprintf(11, "Log: Random phase for disk %s (thread %d).\n",
   3236             device_name_.c_str(), thread_num_);
   3237   while (IsReadyToRun()) {
   3238     BlockData *block = block_table_->GetRandomBlock();
   3239     if (block == NULL) {
   3240       logprintf(12, "Log: No block available for device %s (thread %d).\n",
   3241                 device_name_.c_str(), thread_num_);
   3242     } else {
   3243       ValidateBlockOnDisk(fd, block);
   3244       block_table_->ReleaseBlock(block);
   3245       blocks_read_++;
   3246     }
   3247   }
   3248   pages_copied_ = blocks_read_;
   3249   return true;
   3250 }
   3251 
   3252 MemoryRegionThread::MemoryRegionThread() {
   3253   error_injection_ = false;
   3254   pages_ = NULL;
   3255 }
   3256 
   3257 MemoryRegionThread::~MemoryRegionThread() {
   3258   if (pages_ != NULL)
   3259     delete pages_;
   3260 }
   3261 
   3262 // Set a region of memory or MMIO to be tested.
   3263 // Return false if region could not be mapped.
   3264 bool MemoryRegionThread::SetRegion(void *region, int64 size) {
   3265   int plength = sat_->page_length();
   3266   int npages = size / plength;
   3267   if (size % plength) {
   3268     logprintf(0, "Process Error: region size is not a multiple of SAT "
   3269               "page length\n");
   3270     return false;
   3271   } else {
   3272     if (pages_ != NULL)
   3273       delete pages_;
   3274     pages_ = new PageEntryQueue(npages);
   3275     char *base_addr = reinterpret_cast<char*>(region);
   3276     region_ = base_addr;
   3277     for (int i = 0; i < npages; i++) {
   3278       struct page_entry pe;
   3279       init_pe(&pe);
   3280       pe.addr = reinterpret_cast<void*>(base_addr + i * plength);
   3281       pe.offset = i * plength;
   3282 
   3283       pages_->Push(&pe);
   3284     }
   3285     return true;
   3286   }
   3287 }
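
        // Usage sketch (hypothetical names and sizes; assumes the thread has
        // been initialized so sat_ is valid and page_length() is 4096):
        //   MemoryRegionThread thread;
        //   ...
        //   thread.SetRegion(mmio_base, 8 * 1024);  // mmio_base: pointer to
        //                                           // the region under test;
        //                                           // yields a 2-entry queue.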
   3288 
   3289 // More detailed error printout for hardware errors in memory or MMIO
   3290 // regions.
   3291 void MemoryRegionThread::ProcessError(struct ErrorRecord *error,
   3292                                       int priority,
   3293                                       const char *message) {
   3294   int64 buffer_offset;
   3295   if (phase_ == kPhaseCopy) {
   3296     // If the error occurred during the copy phase, it means that
   3297     // the source data (i.e., the main memory) is wrong, so
   3298     // just pass it to the original ProcessError to report a
   3299     // bad-dimm error.
   3300     WorkerThread::ProcessError(error, priority, message);
   3301   } else if (phase_ == kPhaseCheck) {
   3302     // An error during the check phase means that the memory region
   3303     // being tested has an error.  Gather more information and then
   3304     // report the error.
   3305     // Determine if this is a write or read error.
   3306     os_->Flush(error->vaddr);
   3307     error->reread = *(error->vaddr);
   3308     char *good = reinterpret_cast<char*>(&(error->expected));
   3309     char *bad = reinterpret_cast<char*>(&(error->actual));
   3310     sat_assert(error->expected != error->actual);
   3311     unsigned int offset = 0;
   3312     for (offset = 0; offset < (sizeof(error->expected) - 1); offset++) {
   3313       if (good[offset] != bad[offset])
   3314         break;
   3315     }
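            // For example, on a little-endian machine, expected
            // 0x0000000000000000 vs. actual 0x0000000000ba0000 first differ
            // at byte offset 2, so vbyteaddr below points at vaddr + 2.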
   3316 
   3317     error->vbyteaddr = reinterpret_cast<char*>(error->vaddr) + offset;
   3318 
   3319     buffer_offset = error->vbyteaddr - region_;
   3320 
   3321     // Find physical address if possible.
   3322     error->paddr = os_->VirtualToPhysical(error->vbyteaddr);
   3323     logprintf(priority,
   3324               "%s: miscompare on %s, CRC check at %p(0x%llx), "
   3325               "offset %llx: read:0x%016llx, reread:0x%016llx "
   3326               "expected:0x%016llx\n",
   3327               message,
   3328               identifier_.c_str(),
   3329               error->vaddr,
   3330               error->paddr,
   3331               buffer_offset,
   3332               error->actual,
   3333               error->reread,
   3334               error->expected);
   3335   } else {
   3336     logprintf(0, "Process Error: memory region thread raised an "
   3337               "unexpected error.");
   3338   }
   3339 }
   3340 
   3341 // Workload for testing memory or MMIO regions.
   3342 // Return false on software error.
   3343 bool MemoryRegionThread::Work() {
   3344   struct page_entry source_pe;
   3345   struct page_entry memregion_pe;
   3346   bool result = true;
   3347   int64 loops = 0;
   3348   const uint64 error_constant = 0x00ba00000000ba00LL;
   3349 
   3350   // For error injection.
   3351   int64 *addr = 0x0;
   3352   int offset = 0;
   3353   int64 data = 0;
   3354 
   3355   logprintf(9, "Log: Starting Memory Region thread %d\n", thread_num_);
   3356 
   3357   while (IsReadyToRun()) {
   3358     // Getting pages from SAT and queue.
   3359     phase_ = kPhaseNoPhase;
   3360     result = result && sat_->GetValid(&source_pe);
   3361     if (!result) {
   3362       logprintf(0, "Process Error: memory region thread failed to pop "
   3363                 "pages from SAT, bailing\n");
   3364       break;
   3365     }
   3366 
   3367     result = result && pages_->PopRandom(&memregion_pe);
   3368     if (!result) {
   3369       logprintf(0, "Process Error: memory region thread failed to pop "
   3370                 "pages from queue, bailing\n");
   3371       break;
   3372     }
   3373 
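            // The two injection points below corrupt one word of the source
            // page on loop 1 (to exercise detection in the copy phase) and
            // one word of the region page on loop 2 (to exercise detection
            // in the check phase); the original word is restored at the
            // bottom of the loop.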
   3374     // Error injection for CRC copy.
   3375     if ((sat_->error_injection() || error_injection_) && loops == 1) {
   3376       addr = reinterpret_cast<int64*>(source_pe.addr);
   3377       offset = random() % (sat_->page_length() / wordsize_);
   3378       data = addr[offset];
   3379       addr[offset] = error_constant;
   3380     }
   3381 
   3382     // Copying SAT page into memory region.
   3383     phase_ = kPhaseCopy;
   3384     CrcCopyPage(&memregion_pe, &source_pe);
   3385     memregion_pe.pattern = source_pe.pattern;
   3386 
   3387     // Error injection for CRC Check.
   3388     if ((sat_->error_injection() || error_injection_) && loops == 2) {
   3389       addr = reinterpret_cast<int64*>(memregion_pe.addr);
   3390       offset = random() % (sat_->page_length() / wordsize_);
   3391       data = addr[offset];
   3392       addr[offset] = error_constant;
   3393     }
   3394 
   3395     // Checking page content in memory region.
   3396     phase_ = kPhaseCheck;
   3397     CrcCheckPage(&memregion_pe);
   3398 
   3399     phase_ = kPhaseNoPhase;
   3400     // Storing pages on their proper queues.
   3401     result = result && sat_->PutValid(&source_pe);
   3402     if (!result) {
   3403       logprintf(0, "Process Error: memory region thread failed to push "
   3404                 "pages into SAT, bailing\n");
   3405       break;
   3406     }
   3407     result = result && pages_->Push(&memregion_pe);
   3408     if (!result) {
   3409       logprintf(0, "Process Error: memory region thread failed to push "
   3410                 "pages into queue, bailing\n");
   3411       break;
   3412     }
   3413 
   3414     if ((sat_->error_injection() || error_injection_) &&
   3415         loops >= 1 && loops <= 2) {
   3416       addr[offset] = data;
   3417     }
   3418 
   3419     loops++;
   3420     YieldSelf();
   3421   }
   3422 
   3423   pages_copied_ = loops;
   3424   status_ = result;
   3425   logprintf(9, "Log: Completed %d: Memory Region thread. Status %d, %d "
   3426             "pages checked\n", thread_num_, status_, pages_copied_);
   3427   return result;
   3428 }
   3429 
   3430 // The list of MSRs to read from each cpu.
   3431 const CpuFreqThread::CpuRegisterType CpuFreqThread::kCpuRegisters[] = {
   3432   { kMsrTscAddr, "TSC" },
   3433   { kMsrAperfAddr, "APERF" },
   3434   { kMsrMperfAddr, "MPERF" },
   3435 };
   3436 
   3437 CpuFreqThread::CpuFreqThread(int num_cpus, int freq_threshold, int round)
   3438   : num_cpus_(num_cpus),
   3439     freq_threshold_(freq_threshold),
   3440     round_(round) {
   3441   sat_assert(round >= 0);
   3442   if (round == 0) {
   3443     // If rounding is off, force rounding to the nearest MHz.
   3444     round_ = 1;
   3445     round_value_ = 0.5;
   3446   } else {
   3447     round_value_ = round/2.0;
   3448   }
   3449 }
   3450 
   3451 CpuFreqThread::~CpuFreqThread() {
   3452 }
   3453 
   3454 // Compute the difference between the currently read MSR values and the
   3455 // previously read values and store the results in delta. If any of the
   3456 // values did not increase, or the TSC value is too small, returns false.
   3457 // Otherwise, returns true.
   3458 bool CpuFreqThread::ComputeDelta(CpuDataType *current, CpuDataType *previous,
   3459                                  CpuDataType *delta) {
   3460   // Loop through the msrs.
   3461   for (int msr = 0; msr < kMsrLast; msr++) {
   3462     if (previous->msrs[msr] > current->msrs[msr]) {
   3463       logprintf(0, "Log: Register %s went backwards 0x%llx to 0x%llx "
   3464                 "skipping interval\n", kCpuRegisters[msr], previous->msrs[msr],
   3465                 current->msrs[msr]);
   3466       return false;
   3467     } else {
   3468       delta->msrs[msr] = current->msrs[msr] - previous->msrs[msr];
   3469     }
   3470   }
   3471 
   3472   // Check for TSC < 1 Mcycles over interval.
   3473   if (delta->msrs[kMsrTsc] < (1000 * 1000)) {
   3474     logprintf(0, "Log: Insanely slow TSC rate, TSC stops in idle?\n");
   3475     return false;
   3476   }
   3477   timersub(&current->tv, &previous->tv, &delta->tv);
   3478 
   3479   return true;
   3480 }
   3481 
   3482 // Compute the change in values of the MSRs between current and previous,
   3483 // set the frequency in MHz of the cpu. If there is an error computing
   3484 // the delta, return false. Otherwise, return true.
   3485 bool CpuFreqThread::ComputeFrequency(CpuDataType *current,
   3486                                      CpuDataType *previous, int *freq) {
   3487   CpuDataType delta;
   3488   if (!ComputeDelta(current, previous, &delta)) {
   3489     return false;
   3490   }
   3491 
   3492   double interval = delta.tv.tv_sec + delta.tv.tv_usec / 1000000.0;
   3493   double frequency = 1.0 * delta.msrs[kMsrTsc] / 1000000
   3494                      * delta.msrs[kMsrAperf] / delta.msrs[kMsrMperf] / interval;
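          // Unit check: delta TSC / interval is cycles/second at the base
          // (TSC) rate; dividing by 1000000 converts that to MHz, and the
          // APERF/MPERF ratio scales the base rate to the actual average
          // frequency over the interval.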
   3495 
   3496   // Use the rounding value to round up properly.
   3497   int computed = static_cast<int>(frequency + round_value_);
   3498   *freq = computed - (computed % round_);
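          // Example (hypothetical numbers): with round_ of 10 MHz,
          // round_value_ is 5.0, so a measured 1497.3 MHz gives
          // computed = 1502 and *freq = 1502 - (1502 % 10) = 1500.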
   3499   return true;
   3500 }
   3501 
   3502 // This is the task function that the thread executes.
   3503 bool CpuFreqThread::Work() {
   3504   cpu_set_t cpuset;
   3505   if (!AvailableCpus(&cpuset)) {
   3506     logprintf(0, "Process Error: Cannot get information about the cpus.\n");
   3507     return false;
   3508   }
   3509 
   3510   // Start off indicating the test is passing.
   3511   status_ = true;
   3512 
   3513   int curr = 0;
   3514   int prev = 1;
   3515   uint32 num_intervals = 0;
   3516   bool paused = false;
   3517   bool valid;
   3518   bool pass = true;
   3519 
   3520   vector<CpuDataType> data[2];
   3521   data[0].resize(num_cpus_);
   3522   data[1].resize(num_cpus_);
   3523   while (IsReadyToRun(&paused)) {
   3524     if (paused) {
   3525       // Reset the intervals and restart logic after the pause.
   3526       num_intervals = 0;
   3527     }
   3528     if (num_intervals == 0) {
   3529       // If this is the first interval, then always wait a bit before
   3530       // starting to collect data.
   3531       sat_sleep(kStartupDelay);
   3532     }
   3533 
   3534     // Get the per cpu counters.
   3535     valid = true;
   3536     for (int cpu = 0; cpu < num_cpus_; cpu++) {
   3537       if (CPU_ISSET(cpu, &cpuset)) {
   3538         if (!GetMsrs(cpu, &data[curr][cpu])) {
   3539           logprintf(0, "Failed to get msrs on cpu %d.\n", cpu);
   3540           valid = false;
   3541           break;
   3542         }
   3543       }
   3544     }
   3545     if (!valid) {
   3546       // Reset the number of collected intervals since something bad happened.
   3547       num_intervals = 0;
   3548       continue;
   3549     }
   3550 
   3551     num_intervals++;
   3552 
   3553     // Only compute a delta once more than two intervals of data have been collected.
   3554     if (num_intervals > 2) {
   3555       for (int cpu = 0; cpu < num_cpus_; cpu++) {
   3556         if (CPU_ISSET(cpu, &cpuset)) {
   3557           int freq;
   3558           if (!ComputeFrequency(&data[curr][cpu], &data[prev][cpu],
   3559                                 &freq)) {
   3560             // Reset the number of collected intervals since an unknown
   3561             // error occurred.
   3562             logprintf(0, "Log: Cannot get frequency of cpu %d.\n", cpu);
   3563             num_intervals = 0;
   3564             break;
   3565           }
   3566           logprintf(15, "Cpu %d Freq %d\n", cpu, freq);
   3567           if (freq < freq_threshold_) {
   3568             errorcount_++;
   3569             pass = false;
   3570             logprintf(0, "Log: Cpu %d frequency is too low, frequency %d MHz "
   3571                       "threshold %d MHz.\n", cpu, freq, freq_threshold_);
   3572           }
   3573         }
   3574       }
   3575     }
   3576 
   3577     sat_sleep(kIntervalPause);
   3578 
   3579     // Swap the values in curr and prev (these values flip between 0 and 1).
   3580     curr ^= 1;
   3581     prev ^= 1;
   3582   }
   3583 
   3584   return pass;
   3585 }
   3586 
   3587 
   3588 // Get the MSR values for this particular cpu and save them in data. If
   3589 // any error is encountered, returns false. Otherwise, returns true.
   3590 bool CpuFreqThread::GetMsrs(int cpu, CpuDataType *data) {
   3591   for (int msr = 0; msr < kMsrLast; msr++) {
   3592     if (!os_->ReadMSR(cpu, kCpuRegisters[msr].msr, &data->msrs[msr])) {
   3593       return false;
   3594     }
   3595   }
   3596   // Save the time at which we acquired these values.
   3597   gettimeofday(&data->tv, NULL);
   3598 
   3599   return true;
   3600 }
   3601 
   3602 // Returns true if this test can run on the current machine. Otherwise,
   3603 // returns false.
   3604 bool CpuFreqThread::CanRun() {
   3605 #if defined(STRESSAPPTEST_CPU_X86_64) || defined(STRESSAPPTEST_CPU_I686)
   3606   unsigned int eax, ebx, ecx, edx;
   3607 
   3608   // Check that the TSC feature is supported.
   3609   // This check is valid for both Intel and AMD.
   3610   eax = 1;
   3611   cpuid(&eax, &ebx, &ecx, &edx);
   3612   if (!(edx & (1 << 5))) {
   3613     logprintf(0, "Process Error: No TSC support.\n");
   3614     return false;
   3615   }
   3616 
   3617   // Check the highest extended function level supported.
   3618   // This check is valid for both Intel and AMD.
   3619   eax = 0x80000000;
   3620   cpuid(&eax, &ebx, &ecx, &edx);
   3621   if (eax < 0x80000007) {
   3622     logprintf(0, "Process Error: No invariant TSC support.\n");
   3623     return false;
   3624   }
   3625 
   3626   // Non-Stop TSC is advertised by CPUID.EAX=0x80000007: EDX.bit8
   3627   // This check is valid for both Intel and AMD.
   3628   eax = 0x80000007;
   3629   cpuid(&eax, &ebx, &ecx, &edx);
   3630   if ((edx & (1 << 8)) == 0) {
   3631     logprintf(0, "Process Error: No non-stop TSC support.\n");
   3632     return false;
   3633   }
   3634 
   3635   // APERF/MPERF is advertised by CPUID.EAX=0x6: ECX.bit0
   3636   // This check is valid for both Intel and AMD.
   3637   eax = 0x6;
   3638   cpuid(&eax, &ebx, &ecx, &edx);
   3639   if ((ecx & 1) == 0) {
   3640     logprintf(0, "Process Error: No APERF MSR support.\n");
   3641     return false;
   3642   }
   3643   return true;
   3644 #else
   3645   logprintf(0, "Process Error: "
   3646                "cpu_freq_test is only supported on X86 processors.\n");
   3647   return false;
   3648 #endif
   3649 }
   3650