1 // Copyright 2006 Google Inc. All Rights Reserved. 2 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 7 // http://www.apache.org/licenses/LICENSE-2.0 8 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // sat.cc : a stress test for stressful testing 16 17 // stressapptest (or SAT, from Stressful Application Test) is a test 18 // designed to stress the system, as well as provide a comprehensive 19 // memory interface test. 20 21 // stressapptest can be run using memory only, or using many system components. 22 23 #include <errno.h> 24 #include <pthread.h> 25 #include <signal.h> 26 #include <stdarg.h> 27 #include <stdio.h> 28 #include <stdlib.h> 29 #include <string.h> 30 #include <unistd.h> 31 32 #include <sys/stat.h> 33 #include <sys/times.h> 34 35 // #define __USE_GNU 36 // #define __USE_LARGEFILE64 37 #include <fcntl.h> 38 39 #include <list> 40 #include <string> 41 42 // This file must work with autoconf on its public version, 43 // so these includes are correct. 44 #include "disk_blocks.h" 45 #include "logger.h" 46 #include "os.h" 47 #include "sat.h" 48 #include "sattypes.h" 49 #include "worker.h" 50 51 // stressapptest versioning here. 52 #ifndef PACKAGE_VERSION 53 static const char* kVersion = "1.0.0"; 54 #else 55 static const char* kVersion = PACKAGE_VERSION; 56 #endif 57 58 // Global stressapptest reference, for use by signal handler. 59 // This makes Sat objects not safe for multiple instances. 60 namespace { 61 Sat *g_sat = NULL; 62 63 // Signal handler for catching break or kill. 64 // 65 // This must be installed after g_sat is assigned and while there is a single 66 // thread. 67 // 68 // This must be uninstalled while there is only a single thread, and of course 69 // before g_sat is cleared or deleted. 70 void SatHandleBreak(int signal) { 71 g_sat->Break(); 72 } 73 } 74 75 // Opens the logfile for writing if necessary 76 bool Sat::InitializeLogfile() { 77 // Open logfile. 78 if (use_logfile_) { 79 logfile_ = open(logfilename_, 80 #if defined(O_DSYNC) 81 O_DSYNC | 82 #elif defined(O_SYNC) 83 O_SYNC | 84 #elif defined(O_FSYNC) 85 O_FSYNC | 86 #endif 87 O_WRONLY | O_CREAT, 88 S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); 89 if (logfile_ < 0) { 90 printf("Fatal Error: cannot open file %s for logging\n", 91 logfilename_); 92 bad_status(); 93 return false; 94 } 95 // We seek to the end once instead of opening in append mode because no 96 // other processes should be writing to it while this one exists. 97 if (lseek(logfile_, 0, SEEK_END) == -1) { 98 printf("Fatal Error: cannot seek to end of logfile (%s)\n", 99 logfilename_); 100 bad_status(); 101 return false; 102 } 103 Logger::GlobalLogger()->SetLogFd(logfile_); 104 } 105 return true; 106 } 107 108 // Check that the environment is known and safe to run on. 109 // Return 1 if good, 0 if unsuppported. 110 bool Sat::CheckEnvironment() { 111 // Check that this is not a debug build. Debug builds lack 112 // enough performance to stress the system. 113 #if !defined NDEBUG 114 if (run_on_anything_) { 115 logprintf(1, "Log: Running DEBUG version of SAT, " 116 "with significantly reduced coverage.\n"); 117 } else { 118 logprintf(0, "Process Error: Running DEBUG version of SAT, " 119 "with significantly reduced coverage.\n"); 120 logprintf(0, "Log: Command line option '-A' bypasses this error.\n"); 121 bad_status(); 122 return false; 123 } 124 #elif !defined CHECKOPTS 125 #error Build system regression - COPTS disregarded. 126 #endif 127 128 // Check if the cpu frequency test is enabled and able to run. 129 if (cpu_freq_test_) { 130 if (!CpuFreqThread::CanRun()) { 131 logprintf(0, "Process Error: This platform does not support this " 132 "test.\n"); 133 bad_status(); 134 return false; 135 } else if (cpu_freq_threshold_ <= 0) { 136 logprintf(0, "Process Error: The cpu frequency test requires " 137 "--cpu_freq_threshold set to a value > 0\n"); 138 bad_status(); 139 return false; 140 } else if (cpu_freq_round_ < 0) { 141 logprintf(0, "Process Error: The --cpu_freq_round option must be greater" 142 " than or equal to zero. A value of zero means no rounding.\n"); 143 bad_status(); 144 return false; 145 } 146 } 147 148 // Use all CPUs if nothing is specified. 149 if (memory_threads_ == -1) { 150 memory_threads_ = os_->num_cpus(); 151 logprintf(7, "Log: Defaulting to %d copy threads\n", memory_threads_); 152 } 153 154 // Use all memory if no size is specified. 155 if (size_mb_ == 0) 156 size_mb_ = os_->FindFreeMemSize() / kMegabyte; 157 size_ = static_cast<int64>(size_mb_) * kMegabyte; 158 159 // Autodetect file locations. 160 if (findfiles_ && (file_threads_ == 0)) { 161 // Get a space separated sting of disk locations. 162 list<string> locations = os_->FindFileDevices(); 163 164 // Extract each one. 165 while (!locations.empty()) { 166 // Copy and remove the disk name. 167 string disk = locations.back(); 168 locations.pop_back(); 169 170 logprintf(12, "Log: disk at %s\n", disk.c_str()); 171 file_threads_++; 172 filename_.push_back(disk + "/sat_disk.a"); 173 file_threads_++; 174 filename_.push_back(disk + "/sat_disk.b"); 175 } 176 } 177 178 // We'd better have some memory by this point. 179 if (size_ < 1) { 180 logprintf(0, "Process Error: No memory found to test.\n"); 181 bad_status(); 182 return false; 183 } 184 185 if (tag_mode_ && ((file_threads_ > 0) || 186 (disk_threads_ > 0) || 187 (net_threads_ > 0))) { 188 logprintf(0, "Process Error: Memory tag mode incompatible " 189 "with disk/network DMA.\n"); 190 bad_status(); 191 return false; 192 } 193 194 // If platform is 32 bit Xeon, floor memory size to multiple of 4. 195 if (address_mode_ == 32) { 196 size_mb_ = (size_mb_ / 4) * 4; 197 size_ = size_mb_ * kMegabyte; 198 logprintf(1, "Log: Flooring memory allocation to multiple of 4: %lldMB\n", 199 size_mb_); 200 } 201 202 // Check if this system is on the whitelist for supported systems. 203 if (!os_->IsSupported()) { 204 if (run_on_anything_) { 205 logprintf(1, "Log: Unsupported system. Running with reduced coverage.\n"); 206 // This is ok, continue on. 207 } else { 208 logprintf(0, "Process Error: Unsupported system, " 209 "no error reporting available\n"); 210 logprintf(0, "Log: Command line option '-A' bypasses this error.\n"); 211 bad_status(); 212 return false; 213 } 214 } 215 216 return true; 217 } 218 219 // Allocates memory to run the test on 220 bool Sat::AllocateMemory() { 221 // Allocate our test memory. 222 bool result = os_->AllocateTestMem(size_, paddr_base_); 223 if (!result) { 224 logprintf(0, "Process Error: failed to allocate memory\n"); 225 bad_status(); 226 return false; 227 } 228 return true; 229 } 230 231 // Sets up access to data patterns 232 bool Sat::InitializePatterns() { 233 // Initialize pattern data. 234 patternlist_ = new PatternList(); 235 if (!patternlist_) { 236 logprintf(0, "Process Error: failed to allocate patterns\n"); 237 bad_status(); 238 return false; 239 } 240 if (!patternlist_->Initialize()) { 241 logprintf(0, "Process Error: failed to initialize patternlist\n"); 242 bad_status(); 243 return false; 244 } 245 return true; 246 } 247 248 // Get any valid page, no tag specified. 249 bool Sat::GetValid(struct page_entry *pe) { 250 return GetValid(pe, kDontCareTag); 251 } 252 253 254 // Fetch and return empty and full pages into the empty and full pools. 255 bool Sat::GetValid(struct page_entry *pe, int32 tag) { 256 bool result = false; 257 // Get valid page depending on implementation. 258 if (pe_q_implementation_ == SAT_FINELOCK) 259 result = finelock_q_->GetValid(pe, tag); 260 else if (pe_q_implementation_ == SAT_ONELOCK) 261 result = valid_->PopRandom(pe); 262 263 if (result) { 264 pe->addr = os_->PrepareTestMem(pe->offset, page_length_); // Map it. 265 266 // Tag this access and current pattern. 267 pe->ts = os_->GetTimestamp(); 268 pe->lastpattern = pe->pattern; 269 270 return (pe->addr != 0); // Return success or failure. 271 } 272 return false; 273 } 274 275 bool Sat::PutValid(struct page_entry *pe) { 276 if (pe->addr != 0) 277 os_->ReleaseTestMem(pe->addr, pe->offset, page_length_); // Unmap the page. 278 pe->addr = 0; 279 280 // Put valid page depending on implementation. 281 if (pe_q_implementation_ == SAT_FINELOCK) 282 return finelock_q_->PutValid(pe); 283 else if (pe_q_implementation_ == SAT_ONELOCK) 284 return valid_->Push(pe); 285 else 286 return false; 287 } 288 289 // Get an empty page with any tag. 290 bool Sat::GetEmpty(struct page_entry *pe) { 291 return GetEmpty(pe, kDontCareTag); 292 } 293 294 bool Sat::GetEmpty(struct page_entry *pe, int32 tag) { 295 bool result = false; 296 // Get empty page depending on implementation. 297 if (pe_q_implementation_ == SAT_FINELOCK) 298 result = finelock_q_->GetEmpty(pe, tag); 299 else if (pe_q_implementation_ == SAT_ONELOCK) 300 result = empty_->PopRandom(pe); 301 302 if (result) { 303 pe->addr = os_->PrepareTestMem(pe->offset, page_length_); // Map it. 304 return (pe->addr != 0); // Return success or failure. 305 } 306 return false; 307 } 308 309 bool Sat::PutEmpty(struct page_entry *pe) { 310 if (pe->addr != 0) 311 os_->ReleaseTestMem(pe->addr, pe->offset, page_length_); // Unmap the page. 312 pe->addr = 0; 313 314 // Put empty page depending on implementation. 315 if (pe_q_implementation_ == SAT_FINELOCK) 316 return finelock_q_->PutEmpty(pe); 317 else if (pe_q_implementation_ == SAT_ONELOCK) 318 return empty_->Push(pe); 319 else 320 return false; 321 } 322 323 // Set up the bitmap of physical pages in case we want to see which pages were 324 // accessed under this run of SAT. 325 void Sat::AddrMapInit() { 326 if (!do_page_map_) 327 return; 328 // Find about how much physical mem is in the system. 329 // TODO(nsanders): Find some way to get the max 330 // and min phys addr in the system. 331 uint64 maxsize = os_->FindFreeMemSize() * 4; 332 sat_assert(maxsize != 0); 333 334 // Make a bitmask of this many pages. Assume that the memory is relatively 335 // zero based. This is true on x86, typically. 336 // This is one bit per page. 337 uint64 arraysize = maxsize / 4096 / 8; 338 unsigned char *bitmap = new unsigned char[arraysize]; 339 sat_assert(bitmap); 340 341 // Mark every page as 0, not seen. 342 memset(bitmap, 0, arraysize); 343 344 page_bitmap_size_ = maxsize; 345 page_bitmap_ = bitmap; 346 } 347 348 // Add the 4k pages in this block to the array of pages SAT has seen. 349 void Sat::AddrMapUpdate(struct page_entry *pe) { 350 if (!do_page_map_) 351 return; 352 353 // Go through 4k page blocks. 354 uint64 arraysize = page_bitmap_size_ / 4096 / 8; 355 356 char *base = reinterpret_cast<char*>(pe->addr); 357 for (int i = 0; i < page_length_; i += 4096) { 358 uint64 paddr = os_->VirtualToPhysical(base + i); 359 360 uint32 offset = paddr / 4096 / 8; 361 unsigned char mask = 1 << ((paddr / 4096) % 8); 362 363 if (offset >= arraysize) { 364 logprintf(0, "Process Error: Physical address %#llx is " 365 "greater than expected %#llx.\n", 366 paddr, page_bitmap_size_); 367 sat_assert(0); 368 } 369 page_bitmap_[offset] |= mask; 370 } 371 } 372 373 // Print out the physical memory ranges that SAT has accessed. 374 void Sat::AddrMapPrint() { 375 if (!do_page_map_) 376 return; 377 378 uint64 pages = page_bitmap_size_ / 4096; 379 380 uint64 last_page = 0; 381 bool valid_range = false; 382 383 logprintf(4, "Log: Printing tested physical ranges.\n"); 384 385 for (uint64 i = 0; i < pages; i ++) { 386 int offset = i / 8; 387 unsigned char mask = 1 << (i % 8); 388 389 bool touched = page_bitmap_[offset] & mask; 390 if (touched && !valid_range) { 391 valid_range = true; 392 last_page = i * 4096; 393 } else if (!touched && valid_range) { 394 valid_range = false; 395 logprintf(4, "Log: %#016llx - %#016llx\n", last_page, (i * 4096) - 1); 396 } 397 } 398 logprintf(4, "Log: Done printing physical ranges.\n"); 399 } 400 401 // Initializes page lists and fills pages with data patterns. 402 bool Sat::InitializePages() { 403 int result = 1; 404 // Calculate needed page totals. 405 int64 neededpages = memory_threads_ + 406 invert_threads_ + 407 check_threads_ + 408 net_threads_ + 409 file_threads_; 410 411 // Empty-valid page ratio is adjusted depending on queue implementation. 412 // since fine-grain-locked queue keeps both valid and empty entries in the 413 // same queue and randomly traverse to find pages, the empty-valid ratio 414 // should be more even. 415 if (pe_q_implementation_ == SAT_FINELOCK) 416 freepages_ = pages_ / 5 * 2; // Mark roughly 2/5 of all pages as Empty. 417 else 418 freepages_ = (pages_ / 100) + (2 * neededpages); 419 420 if (freepages_ < neededpages) { 421 logprintf(0, "Process Error: freepages < neededpages.\n"); 422 logprintf(1, "Stats: Total: %lld, Needed: %lld, Marked free: %lld\n", 423 static_cast<int64>(pages_), 424 static_cast<int64>(neededpages), 425 static_cast<int64>(freepages_)); 426 bad_status(); 427 return false; 428 } 429 430 if (freepages_ > pages_/2) { 431 logprintf(0, "Process Error: not enough pages for IO\n"); 432 logprintf(1, "Stats: Total: %lld, Needed: %lld, Available: %lld\n", 433 static_cast<int64>(pages_), 434 static_cast<int64>(freepages_), 435 static_cast<int64>(pages_/2)); 436 bad_status(); 437 return false; 438 } 439 logprintf(12, "Log: Allocating pages, Total: %lld Free: %lld\n", 440 pages_, 441 freepages_); 442 443 // Initialize page locations. 444 for (int64 i = 0; i < pages_; i++) { 445 struct page_entry pe; 446 init_pe(&pe); 447 pe.offset = i * page_length_; 448 result &= PutEmpty(&pe); 449 } 450 451 if (!result) { 452 logprintf(0, "Process Error: while initializing empty_ list\n"); 453 bad_status(); 454 return false; 455 } 456 457 // Fill valid pages with test patterns. 458 // Use fill threads to do this. 459 WorkerStatus fill_status; 460 WorkerVector fill_vector; 461 462 logprintf(12, "Starting Fill threads: %d threads, %d pages\n", 463 fill_threads_, pages_); 464 // Initialize the fill threads. 465 for (int i = 0; i < fill_threads_; i++) { 466 FillThread *thread = new FillThread(); 467 thread->InitThread(i, this, os_, patternlist_, &fill_status); 468 if (i != fill_threads_ - 1) { 469 logprintf(12, "Starting Fill Threads %d: %d pages\n", 470 i, pages_ / fill_threads_); 471 thread->SetFillPages(pages_ / fill_threads_); 472 // The last thread finishes up all the leftover pages. 473 } else { 474 logprintf(12, "Starting Fill Threads %d: %d pages\n", 475 i, pages_ - pages_ / fill_threads_ * i); 476 thread->SetFillPages(pages_ - pages_ / fill_threads_ * i); 477 } 478 fill_vector.push_back(thread); 479 } 480 481 // Spawn the fill threads. 482 fill_status.Initialize(); 483 for (WorkerVector::const_iterator it = fill_vector.begin(); 484 it != fill_vector.end(); ++it) 485 (*it)->SpawnThread(); 486 487 // Reap the finished fill threads. 488 for (WorkerVector::const_iterator it = fill_vector.begin(); 489 it != fill_vector.end(); ++it) { 490 (*it)->JoinThread(); 491 if ((*it)->GetStatus() != 1) { 492 logprintf(0, "Thread %d failed with status %d at %.2f seconds\n", 493 (*it)->ThreadID(), (*it)->GetStatus(), 494 (*it)->GetRunDurationUSec() * 1.0/1000000); 495 bad_status(); 496 return false; 497 } 498 delete (*it); 499 } 500 fill_vector.clear(); 501 fill_status.Destroy(); 502 logprintf(12, "Log: Done filling pages.\n"); 503 logprintf(12, "Log: Allocating pages.\n"); 504 505 AddrMapInit(); 506 507 // Initialize page locations. 508 for (int64 i = 0; i < pages_; i++) { 509 struct page_entry pe; 510 // Only get valid pages with uninitialized tags here. 511 if (GetValid(&pe, kInvalidTag)) { 512 int64 paddr = os_->VirtualToPhysical(pe.addr); 513 int32 region = os_->FindRegion(paddr); 514 region_[region]++; 515 pe.paddr = paddr; 516 pe.tag = 1 << region; 517 region_mask_ |= pe.tag; 518 519 // Generate a physical region map 520 AddrMapUpdate(&pe); 521 522 // Note: this does not allocate free pages among all regions 523 // fairly. However, with large enough (thousands) random number 524 // of pages being marked free in each region, the free pages 525 // count in each region end up pretty balanced. 526 if (i < freepages_) { 527 result &= PutEmpty(&pe); 528 } else { 529 result &= PutValid(&pe); 530 } 531 } else { 532 logprintf(0, "Log: didn't tag all pages. %d - %d = %d\n", 533 pages_, i, pages_ - i); 534 return false; 535 } 536 } 537 logprintf(12, "Log: Done allocating pages.\n"); 538 539 AddrMapPrint(); 540 541 for (int i = 0; i < 32; i++) { 542 if (region_mask_ & (1 << i)) { 543 region_count_++; 544 logprintf(12, "Log: Region %d: %d.\n", i, region_[i]); 545 } 546 } 547 logprintf(5, "Log: Region mask: 0x%x\n", region_mask_); 548 549 return true; 550 } 551 552 // Print SAT version info. 553 bool Sat::PrintVersion() { 554 logprintf(1, "Stats: SAT revision %s, %d bit binary\n", 555 kVersion, address_mode_); 556 logprintf(5, "Log: %s from %s\n", Timestamp(), BuildChangelist()); 557 558 return true; 559 } 560 561 562 // Initializes the resources that SAT needs to run. 563 // This needs to be called before Run(), and after ParseArgs(). 564 // Returns true on success, false on error, and will exit() on help message. 565 bool Sat::Initialize() { 566 g_sat = this; 567 568 // Initializes sync'd log file to ensure output is saved. 569 if (!InitializeLogfile()) 570 return false; 571 Logger::GlobalLogger()->SetTimestampLogging(log_timestamps_); 572 Logger::GlobalLogger()->StartThread(); 573 574 logprintf(5, "Log: Commandline - %s\n", cmdline_.c_str()); 575 PrintVersion(); 576 577 std::map<std::string, std::string> options; 578 579 GoogleOsOptions(&options); 580 581 // Initialize OS/Hardware interface. 582 os_ = OsLayerFactory(options); 583 if (!os_) { 584 bad_status(); 585 return false; 586 } 587 588 if (min_hugepages_mbytes_ > 0) 589 os_->SetMinimumHugepagesSize(min_hugepages_mbytes_ * kMegabyte); 590 591 if (reserve_mb_ > 0) 592 os_->SetReserveSize(reserve_mb_); 593 594 if (channels_.size() > 0) { 595 logprintf(6, "Log: Decoding memory: %dx%d bit channels," 596 "%d modules per channel (x%d), decoding hash 0x%x\n", 597 channels_.size(), channel_width_, channels_[0].size(), 598 channel_width_/channels_[0].size(), channel_hash_); 599 os_->SetDramMappingParams(channel_hash_, channel_width_, &channels_); 600 } 601 602 if (!os_->Initialize()) { 603 logprintf(0, "Process Error: Failed to initialize OS layer\n"); 604 bad_status(); 605 delete os_; 606 return false; 607 } 608 609 // Checks that OS/Build/Platform is supported. 610 if (!CheckEnvironment()) 611 return false; 612 613 if (error_injection_) 614 os_->set_error_injection(true); 615 616 // Run SAT in monitor only mode, do not continue to allocate resources. 617 if (monitor_mode_) { 618 logprintf(5, "Log: Running in monitor-only mode. " 619 "Will not allocate any memory nor run any stress test. " 620 "Only polling ECC errors.\n"); 621 return true; 622 } 623 624 // Allocate the memory to test. 625 if (!AllocateMemory()) 626 return false; 627 628 logprintf(5, "Stats: Starting SAT, %dM, %d seconds\n", 629 static_cast<int>(size_/kMegabyte), 630 runtime_seconds_); 631 632 if (!InitializePatterns()) 633 return false; 634 635 // Initialize memory allocation. 636 pages_ = size_ / page_length_; 637 638 // Allocate page queue depending on queue implementation switch. 639 if (pe_q_implementation_ == SAT_FINELOCK) { 640 finelock_q_ = new FineLockPEQueue(pages_, page_length_); 641 if (finelock_q_ == NULL) 642 return false; 643 finelock_q_->set_os(os_); 644 os_->set_err_log_callback(finelock_q_->get_err_log_callback()); 645 } else if (pe_q_implementation_ == SAT_ONELOCK) { 646 empty_ = new PageEntryQueue(pages_); 647 valid_ = new PageEntryQueue(pages_); 648 if ((empty_ == NULL) || (valid_ == NULL)) 649 return false; 650 } 651 652 if (!InitializePages()) { 653 logprintf(0, "Process Error: Initialize Pages failed\n"); 654 return false; 655 } 656 657 return true; 658 } 659 660 // Constructor and destructor. 661 Sat::Sat() { 662 // Set defaults, command line might override these. 663 runtime_seconds_ = 20; 664 page_length_ = kSatPageSize; 665 disk_pages_ = kSatDiskPage; 666 pages_ = 0; 667 size_mb_ = 0; 668 size_ = size_mb_ * kMegabyte; 669 reserve_mb_ = 0; 670 min_hugepages_mbytes_ = 0; 671 freepages_ = 0; 672 paddr_base_ = 0; 673 channel_hash_ = kCacheLineSize; 674 channel_width_ = 64; 675 676 user_break_ = false; 677 verbosity_ = 8; 678 Logger::GlobalLogger()->SetVerbosity(verbosity_); 679 print_delay_ = 10; 680 strict_ = 1; 681 warm_ = 0; 682 run_on_anything_ = 0; 683 use_logfile_ = 0; 684 logfile_ = 0; 685 log_timestamps_ = true; 686 // Detect 32/64 bit binary. 687 void *pvoid = 0; 688 address_mode_ = sizeof(pvoid) * 8; 689 error_injection_ = false; 690 crazy_error_injection_ = false; 691 max_errorcount_ = 0; // Zero means no early exit. 692 stop_on_error_ = false; 693 error_poll_ = true; 694 findfiles_ = false; 695 696 do_page_map_ = false; 697 page_bitmap_ = 0; 698 page_bitmap_size_ = 0; 699 700 // Cache coherency data initialization. 701 cc_test_ = false; // Flag to trigger cc threads. 702 cc_cacheline_count_ = 2; // Two datastructures of cache line size. 703 cc_cacheline_size_ = 0; // Size of a cacheline (0 for auto-detect). 704 cc_inc_count_ = 1000; // Number of times to increment the shared variable. 705 cc_cacheline_data_ = 0; // Cache Line size datastructure. 706 707 // Cpu frequency data initialization. 708 cpu_freq_test_ = false; // Flag to trigger cpu frequency thread. 709 cpu_freq_threshold_ = 0; // Threshold, in MHz, at which a cpu fails. 710 cpu_freq_round_ = 10; // Round the computed frequency to this value. 711 712 sat_assert(0 == pthread_mutex_init(&worker_lock_, NULL)); 713 file_threads_ = 0; 714 net_threads_ = 0; 715 listen_threads_ = 0; 716 // Default to autodetect number of cpus, and run that many threads. 717 memory_threads_ = -1; 718 invert_threads_ = 0; 719 fill_threads_ = 8; 720 check_threads_ = 0; 721 cpu_stress_threads_ = 0; 722 disk_threads_ = 0; 723 total_threads_ = 0; 724 725 region_mask_ = 0; 726 region_count_ = 0; 727 for (int i = 0; i < 32; i++) { 728 region_[i] = 0; 729 } 730 region_mode_ = 0; 731 732 errorcount_ = 0; 733 statuscount_ = 0; 734 735 valid_ = 0; 736 empty_ = 0; 737 finelock_q_ = 0; 738 // Default to use fine-grain lock for better performance. 739 pe_q_implementation_ = SAT_FINELOCK; 740 741 os_ = 0; 742 patternlist_ = 0; 743 logfilename_[0] = 0; 744 745 read_block_size_ = 512; 746 write_block_size_ = -1; 747 segment_size_ = -1; 748 cache_size_ = -1; 749 blocks_per_segment_ = -1; 750 read_threshold_ = -1; 751 write_threshold_ = -1; 752 non_destructive_ = 1; 753 monitor_mode_ = 0; 754 tag_mode_ = 0; 755 random_threads_ = 0; 756 757 pause_delay_ = 600; 758 pause_duration_ = 15; 759 } 760 761 // Destructor. 762 Sat::~Sat() { 763 // We need to have called Cleanup() at this point. 764 // We should probably enforce this. 765 } 766 767 768 #define ARG_KVALUE(argument, variable, value) \ 769 if (!strcmp(argv[i], argument)) { \ 770 variable = value; \ 771 continue; \ 772 } 773 774 #define ARG_IVALUE(argument, variable) \ 775 if (!strcmp(argv[i], argument)) { \ 776 i++; \ 777 if (i < argc) \ 778 variable = strtoull(argv[i], NULL, 0); \ 779 continue; \ 780 } 781 782 #define ARG_SVALUE(argument, variable) \ 783 if (!strcmp(argv[i], argument)) { \ 784 i++; \ 785 if (i < argc) \ 786 snprintf(variable, sizeof(variable), "%s", argv[i]); \ 787 continue; \ 788 } 789 790 // Configures SAT from command line arguments. 791 // This will call exit() given a request for 792 // self-documentation or unexpected args. 793 bool Sat::ParseArgs(int argc, char **argv) { 794 int i; 795 uint64 filesize = page_length_ * disk_pages_; 796 797 // Parse each argument. 798 for (i = 1; i < argc; i++) { 799 // Switch to fall back to corase-grain-lock queue. (for benchmarking) 800 ARG_KVALUE("--coarse_grain_lock", pe_q_implementation_, SAT_ONELOCK); 801 802 // Set number of megabyte to use. 803 ARG_IVALUE("-M", size_mb_); 804 805 // Specify the amount of megabytes to be reserved for system. 806 ARG_IVALUE("--reserve_memory", reserve_mb_); 807 808 // Set minimum megabytes of hugepages to require. 809 ARG_IVALUE("-H", min_hugepages_mbytes_); 810 811 // Set number of seconds to run. 812 ARG_IVALUE("-s", runtime_seconds_); 813 814 // Set number of memory copy threads. 815 ARG_IVALUE("-m", memory_threads_); 816 817 // Set number of memory invert threads. 818 ARG_IVALUE("-i", invert_threads_); 819 820 // Set number of check-only threads. 821 ARG_IVALUE("-c", check_threads_); 822 823 // Set number of cache line size datastructures. 824 ARG_IVALUE("--cc_inc_count", cc_inc_count_); 825 826 // Set number of cache line size datastructures 827 ARG_IVALUE("--cc_line_count", cc_cacheline_count_); 828 829 // Override the detected or assumed cache line size. 830 ARG_IVALUE("--cc_line_size", cc_cacheline_size_); 831 832 // Flag set when cache coherency tests need to be run 833 ARG_KVALUE("--cc_test", cc_test_, true); 834 835 // Set when the cpu_frequency test needs to be run 836 ARG_KVALUE("--cpu_freq_test", cpu_freq_test_, true); 837 838 // Set the threshold in MHz at which the cpu frequency test will fail. 839 ARG_IVALUE("--cpu_freq_threshold", cpu_freq_threshold_); 840 841 // Set the rounding value for the cpu frequency test. The default is to 842 // round to the nearest 10s value. 843 ARG_IVALUE("--cpu_freq_round", cpu_freq_round_); 844 845 // Set number of CPU stress threads. 846 ARG_IVALUE("-C", cpu_stress_threads_); 847 848 // Set logfile name. 849 ARG_SVALUE("-l", logfilename_); 850 851 // Verbosity level. 852 ARG_IVALUE("-v", verbosity_); 853 854 // Chatty printout level. 855 ARG_IVALUE("--printsec", print_delay_); 856 857 // Turn off timestamps logging. 858 ARG_KVALUE("--no_timestamps", log_timestamps_, false); 859 860 // Set maximum number of errors to collect. Stop running after this many. 861 ARG_IVALUE("--max_errors", max_errorcount_); 862 863 // Set pattern block size. 864 ARG_IVALUE("-p", page_length_); 865 866 // Set pattern block size. 867 ARG_IVALUE("--filesize", filesize); 868 869 // NUMA options. 870 ARG_KVALUE("--local_numa", region_mode_, kLocalNuma); 871 ARG_KVALUE("--remote_numa", region_mode_, kRemoteNuma); 872 873 // Autodetect tempfile locations. 874 ARG_KVALUE("--findfiles", findfiles_, 1); 875 876 // Inject errors to force miscompare code paths 877 ARG_KVALUE("--force_errors", error_injection_, true); 878 ARG_KVALUE("--force_errors_like_crazy", crazy_error_injection_, true); 879 if (crazy_error_injection_) 880 error_injection_ = true; 881 882 // Stop immediately on any arror, for debugging HW problems. 883 ARG_KVALUE("--stop_on_errors", stop_on_error_, 1); 884 885 // Don't use internal error polling, allow external detection. 886 ARG_KVALUE("--no_errors", error_poll_, 0); 887 888 // Never check data as you go. 889 ARG_KVALUE("-F", strict_, 0); 890 891 // Warm the cpu as you go. 892 ARG_KVALUE("-W", warm_, 1); 893 894 // Allow runnign on unknown systems with base unimplemented OsLayer 895 ARG_KVALUE("-A", run_on_anything_, 1); 896 897 // Size of read blocks for disk test. 898 ARG_IVALUE("--read-block-size", read_block_size_); 899 900 // Size of write blocks for disk test. 901 ARG_IVALUE("--write-block-size", write_block_size_); 902 903 // Size of segment for disk test. 904 ARG_IVALUE("--segment-size", segment_size_); 905 906 // Size of disk cache size for disk test. 907 ARG_IVALUE("--cache-size", cache_size_); 908 909 // Number of blocks to test per segment. 910 ARG_IVALUE("--blocks-per-segment", blocks_per_segment_); 911 912 // Maximum time a block read should take before warning. 913 ARG_IVALUE("--read-threshold", read_threshold_); 914 915 // Maximum time a block write should take before warning. 916 ARG_IVALUE("--write-threshold", write_threshold_); 917 918 // Do not write anything to disk in the disk test. 919 ARG_KVALUE("--destructive", non_destructive_, 0); 920 921 // Run SAT in monitor mode. No test load at all. 922 ARG_KVALUE("--monitor_mode", monitor_mode_, true); 923 924 // Run SAT in address mode. Tag all cachelines by virt addr. 925 ARG_KVALUE("--tag_mode", tag_mode_, true); 926 927 // Dump range map of tested pages.. 928 ARG_KVALUE("--do_page_map", do_page_map_, true); 929 930 // Specify the physical address base to test. 931 ARG_IVALUE("--paddr_base", paddr_base_); 932 933 // Specify the frequency for power spikes. 934 ARG_IVALUE("--pause_delay", pause_delay_); 935 936 // Specify the duration of each pause (for power spikes). 937 ARG_IVALUE("--pause_duration", pause_duration_); 938 939 // Disk device names 940 if (!strcmp(argv[i], "-d")) { 941 i++; 942 if (i < argc) { 943 disk_threads_++; 944 diskfilename_.push_back(string(argv[i])); 945 blocktables_.push_back(new DiskBlockTable()); 946 } 947 continue; 948 } 949 950 // Set number of disk random threads for each disk write thread. 951 ARG_IVALUE("--random-threads", random_threads_); 952 953 // Set a tempfile to use in a file thread. 954 if (!strcmp(argv[i], "-f")) { 955 i++; 956 if (i < argc) { 957 file_threads_++; 958 filename_.push_back(string(argv[i])); 959 } 960 continue; 961 } 962 963 // Set a hostname to use in a network thread. 964 if (!strcmp(argv[i], "-n")) { 965 i++; 966 if (i < argc) { 967 net_threads_++; 968 ipaddrs_.push_back(string(argv[i])); 969 } 970 continue; 971 } 972 973 // Run threads that listen for incoming SAT net connections. 974 ARG_KVALUE("--listen", listen_threads_, 1); 975 976 if (CheckGoogleSpecificArgs(argc, argv, &i)) { 977 continue; 978 } 979 980 ARG_IVALUE("--channel_hash", channel_hash_); 981 ARG_IVALUE("--channel_width", channel_width_); 982 983 if (!strcmp(argv[i], "--memory_channel")) { 984 i++; 985 if (i < argc) { 986 char *channel = argv[i]; 987 channels_.push_back(vector<string>()); 988 while (char* next = strchr(channel, ',')) { 989 channels_.back().push_back(string(channel, next - channel)); 990 channel = next + 1; 991 } 992 channels_.back().push_back(string(channel)); 993 } 994 continue; 995 } 996 997 // Default: 998 PrintVersion(); 999 PrintHelp(); 1000 if (strcmp(argv[i], "-h") && strcmp(argv[i], "--help")) { 1001 printf("\n Unknown argument %s\n", argv[i]); 1002 bad_status(); 1003 exit(1); 1004 } 1005 // Forget it, we printed the help, just bail. 1006 // We don't want to print test status, or any log parser stuff. 1007 exit(0); 1008 } 1009 1010 Logger::GlobalLogger()->SetVerbosity(verbosity_); 1011 1012 // Update relevant data members with parsed input. 1013 // Translate MB into bytes. 1014 size_ = static_cast<int64>(size_mb_) * kMegabyte; 1015 1016 // Set logfile flag. 1017 if (strcmp(logfilename_, "")) 1018 use_logfile_ = 1; 1019 // Checks valid page length. 1020 if (page_length_ && 1021 !(page_length_ & (page_length_ - 1)) && 1022 (page_length_ > 1023)) { 1023 // Prints if we have changed from default. 1024 if (page_length_ != kSatPageSize) 1025 logprintf(12, "Log: Updating page size to %d\n", page_length_); 1026 } else { 1027 // Revert to default page length. 1028 logprintf(6, "Process Error: " 1029 "Invalid page size %d\n", page_length_); 1030 page_length_ = kSatPageSize; 1031 return false; 1032 } 1033 1034 // Set disk_pages_ if filesize or page size changed. 1035 if (filesize != static_cast<uint64>(page_length_) * 1036 static_cast<uint64>(disk_pages_)) { 1037 disk_pages_ = filesize / page_length_; 1038 if (disk_pages_ == 0) 1039 disk_pages_ = 1; 1040 } 1041 1042 // Validate memory channel parameters if supplied 1043 if (channels_.size()) { 1044 if (channels_.size() == 1) { 1045 channel_hash_ = 0; 1046 logprintf(7, "Log: " 1047 "Only one memory channel...deactivating interleave decoding.\n"); 1048 } else if (channels_.size() > 2) { 1049 logprintf(6, "Process Error: " 1050 "Triple-channel mode not yet supported... sorry.\n"); 1051 bad_status(); 1052 return false; 1053 } 1054 for (uint i = 0; i < channels_.size(); i++) 1055 if (channels_[i].size() != channels_[0].size()) { 1056 logprintf(6, "Process Error: " 1057 "Channels 0 and %d have a different count of dram modules.\n", i); 1058 bad_status(); 1059 return false; 1060 } 1061 if (channels_[0].size() & (channels_[0].size() - 1)) { 1062 logprintf(6, "Process Error: " 1063 "Amount of modules per memory channel is not a power of 2.\n"); 1064 bad_status(); 1065 return false; 1066 } 1067 if (channel_width_ < 16 1068 || channel_width_ & (channel_width_ - 1)) { 1069 logprintf(6, "Process Error: " 1070 "Channel width %d is invalid.\n", channel_width_); 1071 bad_status(); 1072 return false; 1073 } 1074 if (channel_width_ / channels_[0].size() < 8) { 1075 logprintf(6, "Process Error: Chip width x%d must be x8 or greater.\n", 1076 channel_width_ / channels_[0].size()); 1077 bad_status(); 1078 return false; 1079 } 1080 } 1081 1082 1083 // Print each argument. 1084 for (int i = 0; i < argc; i++) { 1085 if (i) 1086 cmdline_ += " "; 1087 cmdline_ += argv[i]; 1088 } 1089 1090 return true; 1091 } 1092 1093 void Sat::PrintHelp() { 1094 printf("Usage: ./sat(32|64) [options]\n" 1095 " -M mbytes megabytes of ram to test\n" 1096 " --reserve-memory If not using hugepages, the amount of memory to " 1097 " reserve for the system\n" 1098 " -H mbytes minimum megabytes of hugepages to require\n" 1099 " -s seconds number of seconds to run\n" 1100 " -m threads number of memory copy threads to run\n" 1101 " -i threads number of memory invert threads to run\n" 1102 " -C threads number of memory CPU stress threads to run\n" 1103 " --findfiles find locations to do disk IO automatically\n" 1104 " -d device add a direct write disk thread with block " 1105 "device (or file) 'device'\n" 1106 " -f filename add a disk thread with " 1107 "tempfile 'filename'\n" 1108 " -l logfile log output to file 'logfile'\n" 1109 " --no_timestamps do not prefix timestamps to log messages\n" 1110 " --max_errors n exit early after finding 'n' errors\n" 1111 " -v level verbosity (0-20), default is 8\n" 1112 " --printsec secs How often to print 'seconds remaining'\n" 1113 " -W Use more CPU-stressful memory copy\n" 1114 " -A run in degraded mode on incompatible systems\n" 1115 " -p pagesize size in bytes of memory chunks\n" 1116 " --filesize size size of disk IO tempfiles\n" 1117 " -n ipaddr add a network thread connecting to " 1118 "system at 'ipaddr'\n" 1119 " --listen run a thread to listen for and respond " 1120 "to network threads.\n" 1121 " --no_errors run without checking for ECC or other errors\n" 1122 " --force_errors inject false errors to test error handling\n" 1123 " --force_errors_like_crazy inject a lot of false errors " 1124 "to test error handling\n" 1125 " -F don't result check each transaction\n" 1126 " --stop_on_errors Stop after finding the first error.\n" 1127 " --read-block-size size of block for reading (-d)\n" 1128 " --write-block-size size of block for writing (-d). If not " 1129 "defined, the size of block for writing will be defined as the " 1130 "size of block for reading\n" 1131 " --segment-size size of segments to split disk into (-d)\n" 1132 " --cache-size size of disk cache (-d)\n" 1133 " --blocks-per-segment number of blocks to read/write per " 1134 "segment per iteration (-d)\n" 1135 " --read-threshold maximum time (in us) a block read should " 1136 "take (-d)\n" 1137 " --write-threshold maximum time (in us) a block write " 1138 "should take (-d)\n" 1139 " --random-threads number of random threads for each disk " 1140 "write thread (-d)\n" 1141 " --destructive write/wipe disk partition (-d)\n" 1142 " --monitor_mode only do ECC error polling, no stress load.\n" 1143 " --cc_test do the cache coherency testing\n" 1144 " --cc_inc_count number of times to increment the " 1145 "cacheline's member\n" 1146 " --cc_line_count number of cache line sized datastructures " 1147 "to allocate for the cache coherency threads to operate\n" 1148 " --cc_line_size override the auto-detected cache line size\n" 1149 " --cpu_freq_test enable the cpu frequency test (requires the " 1150 "--cpu_freq_threshold argument to be set)\n" 1151 " --cpu_freq_threshold fail the cpu frequency test if the frequency " 1152 "goes below this value (specified in MHz)\n" 1153 " --cpu_freq_round round the computed frequency to this value, if set" 1154 " to zero, only round to the nearest MHz\n" 1155 " --paddr_base allocate memory starting from this address\n" 1156 " --pause_delay delay (in seconds) between power spikes\n" 1157 " --pause_duration duration (in seconds) of each pause\n" 1158 " --local_numa choose memory regions associated with " 1159 "each CPU to be tested by that CPU\n" 1160 " --remote_numa choose memory regions not associated with " 1161 "each CPU to be tested by that CPU\n" 1162 " --channel_hash mask of address bits XORed to determine channel. " 1163 "Mask 0x40 interleaves cachelines between channels\n" 1164 " --channel_width bits width in bits of each memory channel\n" 1165 " --memory_channel u1,u2 defines a comma-separated list of names " 1166 "for dram packages in a memory channel. Use multiple times to " 1167 "define multiple channels.\n"); 1168 } 1169 1170 bool Sat::CheckGoogleSpecificArgs(int argc, char **argv, int *i) { 1171 // Do nothing, no google-specific argument on public stressapptest 1172 return false; 1173 } 1174 1175 void Sat::GoogleOsOptions(std::map<std::string, std::string> *options) { 1176 // Do nothing, no OS-specific argument on public stressapptest 1177 } 1178 1179 // Launch the SAT task threads. Returns 0 on error. 1180 void Sat::InitializeThreads() { 1181 // Memory copy threads. 1182 AcquireWorkerLock(); 1183 1184 logprintf(12, "Log: Starting worker threads\n"); 1185 WorkerVector *memory_vector = new WorkerVector(); 1186 1187 // Error polling thread. 1188 // This may detect ECC corrected errors, disk problems, or 1189 // any other errors normally hidden from userspace. 1190 WorkerVector *error_vector = new WorkerVector(); 1191 if (error_poll_) { 1192 ErrorPollThread *thread = new ErrorPollThread(); 1193 thread->InitThread(total_threads_++, this, os_, patternlist_, 1194 &continuous_status_); 1195 1196 error_vector->insert(error_vector->end(), thread); 1197 } else { 1198 logprintf(5, "Log: Skipping error poll thread due to --no_errors flag\n"); 1199 } 1200 workers_map_.insert(make_pair(kErrorType, error_vector)); 1201 1202 // Only start error poll threads for monitor-mode SAT, 1203 // skip all other types of worker threads. 1204 if (monitor_mode_) { 1205 ReleaseWorkerLock(); 1206 return; 1207 } 1208 1209 for (int i = 0; i < memory_threads_; i++) { 1210 CopyThread *thread = new CopyThread(); 1211 thread->InitThread(total_threads_++, this, os_, patternlist_, 1212 &power_spike_status_); 1213 1214 if ((region_count_ > 1) && (region_mode_)) { 1215 int32 region = region_find(i % region_count_); 1216 cpu_set_t *cpuset = os_->FindCoreMask(region); 1217 sat_assert(cpuset); 1218 if (region_mode_ == kLocalNuma) { 1219 // Choose regions associated with this CPU. 1220 thread->set_cpu_mask(cpuset); 1221 thread->set_tag(1 << region); 1222 } else if (region_mode_ == kRemoteNuma) { 1223 // Choose regions not associated with this CPU.. 1224 thread->set_cpu_mask(cpuset); 1225 thread->set_tag(region_mask_ & ~(1 << region)); 1226 } 1227 } else { 1228 cpu_set_t available_cpus; 1229 thread->AvailableCpus(&available_cpus); 1230 int cores = cpuset_count(&available_cpus); 1231 // Don't restrict thread location if we have more than one 1232 // thread per core. Not so good for performance. 1233 if (cpu_stress_threads_ + memory_threads_ <= cores) { 1234 // Place a thread on alternating cores first. 1235 // This assures interleaved core use with no overlap. 1236 int nthcore = i; 1237 int nthbit = (((2 * nthcore) % cores) + 1238 (((2 * nthcore) / cores) % 2)) % cores; 1239 cpu_set_t all_cores; 1240 cpuset_set_ab(&all_cores, 0, cores); 1241 if (!cpuset_isequal(&available_cpus, &all_cores)) { 1242 // We are assuming the bits are contiguous. 1243 // Complain if this is not so. 1244 logprintf(0, "Log: cores = %s, expected %s\n", 1245 cpuset_format(&available_cpus).c_str(), 1246 cpuset_format(&all_cores).c_str()); 1247 } 1248 1249 // Set thread affinity. 1250 thread->set_cpu_mask_to_cpu(nthbit); 1251 } 1252 } 1253 memory_vector->insert(memory_vector->end(), thread); 1254 } 1255 workers_map_.insert(make_pair(kMemoryType, memory_vector)); 1256 1257 // File IO threads. 1258 WorkerVector *fileio_vector = new WorkerVector(); 1259 for (int i = 0; i < file_threads_; i++) { 1260 FileThread *thread = new FileThread(); 1261 thread->InitThread(total_threads_++, this, os_, patternlist_, 1262 &power_spike_status_); 1263 thread->SetFile(filename_[i].c_str()); 1264 // Set disk threads high priority. They don't take much processor time, 1265 // but blocking them will delay disk IO. 1266 thread->SetPriority(WorkerThread::High); 1267 1268 fileio_vector->insert(fileio_vector->end(), thread); 1269 } 1270 workers_map_.insert(make_pair(kFileIOType, fileio_vector)); 1271 1272 // Net IO threads. 1273 WorkerVector *netio_vector = new WorkerVector(); 1274 WorkerVector *netslave_vector = new WorkerVector(); 1275 if (listen_threads_ > 0) { 1276 // Create a network slave thread. This listens for connections. 1277 NetworkListenThread *thread = new NetworkListenThread(); 1278 thread->InitThread(total_threads_++, this, os_, patternlist_, 1279 &continuous_status_); 1280 1281 netslave_vector->insert(netslave_vector->end(), thread); 1282 } 1283 for (int i = 0; i < net_threads_; i++) { 1284 NetworkThread *thread = new NetworkThread(); 1285 thread->InitThread(total_threads_++, this, os_, patternlist_, 1286 &continuous_status_); 1287 thread->SetIP(ipaddrs_[i].c_str()); 1288 1289 netio_vector->insert(netio_vector->end(), thread); 1290 } 1291 workers_map_.insert(make_pair(kNetIOType, netio_vector)); 1292 workers_map_.insert(make_pair(kNetSlaveType, netslave_vector)); 1293 1294 // Result check threads. 1295 WorkerVector *check_vector = new WorkerVector(); 1296 for (int i = 0; i < check_threads_; i++) { 1297 CheckThread *thread = new CheckThread(); 1298 thread->InitThread(total_threads_++, this, os_, patternlist_, 1299 &continuous_status_); 1300 1301 check_vector->insert(check_vector->end(), thread); 1302 } 1303 workers_map_.insert(make_pair(kCheckType, check_vector)); 1304 1305 // Memory invert threads. 1306 logprintf(12, "Log: Starting invert threads\n"); 1307 WorkerVector *invert_vector = new WorkerVector(); 1308 for (int i = 0; i < invert_threads_; i++) { 1309 InvertThread *thread = new InvertThread(); 1310 thread->InitThread(total_threads_++, this, os_, patternlist_, 1311 &continuous_status_); 1312 1313 invert_vector->insert(invert_vector->end(), thread); 1314 } 1315 workers_map_.insert(make_pair(kInvertType, invert_vector)); 1316 1317 // Disk stress threads. 1318 WorkerVector *disk_vector = new WorkerVector(); 1319 WorkerVector *random_vector = new WorkerVector(); 1320 logprintf(12, "Log: Starting disk stress threads\n"); 1321 for (int i = 0; i < disk_threads_; i++) { 1322 // Creating write threads 1323 DiskThread *thread = new DiskThread(blocktables_[i]); 1324 thread->InitThread(total_threads_++, this, os_, patternlist_, 1325 &power_spike_status_); 1326 thread->SetDevice(diskfilename_[i].c_str()); 1327 if (thread->SetParameters(read_block_size_, write_block_size_, 1328 segment_size_, cache_size_, 1329 blocks_per_segment_, 1330 read_threshold_, write_threshold_, 1331 non_destructive_)) { 1332 disk_vector->insert(disk_vector->end(), thread); 1333 } else { 1334 logprintf(12, "Log: DiskThread::SetParameters() failed\n"); 1335 delete thread; 1336 } 1337 1338 for (int j = 0; j < random_threads_; j++) { 1339 // Creating random threads 1340 RandomDiskThread *rthread = new RandomDiskThread(blocktables_[i]); 1341 rthread->InitThread(total_threads_++, this, os_, patternlist_, 1342 &power_spike_status_); 1343 rthread->SetDevice(diskfilename_[i].c_str()); 1344 if (rthread->SetParameters(read_block_size_, write_block_size_, 1345 segment_size_, cache_size_, 1346 blocks_per_segment_, 1347 read_threshold_, write_threshold_, 1348 non_destructive_)) { 1349 random_vector->insert(random_vector->end(), rthread); 1350 } else { 1351 logprintf(12, "Log: RandomDiskThread::SetParameters() failed\n"); 1352 delete rthread; 1353 } 1354 } 1355 } 1356 1357 workers_map_.insert(make_pair(kDiskType, disk_vector)); 1358 workers_map_.insert(make_pair(kRandomDiskType, random_vector)); 1359 1360 // CPU stress threads. 1361 WorkerVector *cpu_vector = new WorkerVector(); 1362 logprintf(12, "Log: Starting cpu stress threads\n"); 1363 for (int i = 0; i < cpu_stress_threads_; i++) { 1364 CpuStressThread *thread = new CpuStressThread(); 1365 thread->InitThread(total_threads_++, this, os_, patternlist_, 1366 &continuous_status_); 1367 1368 // Don't restrict thread location if we have more than one 1369 // thread per core. Not so good for performance. 1370 cpu_set_t available_cpus; 1371 thread->AvailableCpus(&available_cpus); 1372 int cores = cpuset_count(&available_cpus); 1373 if (cpu_stress_threads_ + memory_threads_ <= cores) { 1374 // Place a thread on alternating cores first. 1375 // Go in reverse order for CPU stress threads. This assures interleaved 1376 // core use with no overlap. 1377 int nthcore = (cores - 1) - i; 1378 int nthbit = (((2 * nthcore) % cores) + 1379 (((2 * nthcore) / cores) % 2)) % cores; 1380 cpu_set_t all_cores; 1381 cpuset_set_ab(&all_cores, 0, cores); 1382 if (!cpuset_isequal(&available_cpus, &all_cores)) { 1383 logprintf(0, "Log: cores = %s, expected %s\n", 1384 cpuset_format(&available_cpus).c_str(), 1385 cpuset_format(&all_cores).c_str()); 1386 } 1387 1388 // Set thread affinity. 1389 thread->set_cpu_mask_to_cpu(nthbit); 1390 } 1391 1392 1393 cpu_vector->insert(cpu_vector->end(), thread); 1394 } 1395 workers_map_.insert(make_pair(kCPUType, cpu_vector)); 1396 1397 // CPU Cache Coherency Threads - one for each core available. 1398 if (cc_test_) { 1399 WorkerVector *cc_vector = new WorkerVector(); 1400 logprintf(12, "Log: Starting cpu cache coherency threads\n"); 1401 1402 // Allocate the shared datastructure to be worked on by the threads. 1403 cc_cacheline_data_ = reinterpret_cast<cc_cacheline_data*>( 1404 malloc(sizeof(cc_cacheline_data) * cc_cacheline_count_)); 1405 sat_assert(cc_cacheline_data_ != NULL); 1406 1407 // Initialize the strucutre. 1408 memset(cc_cacheline_data_, 0, 1409 sizeof(cc_cacheline_data) * cc_cacheline_count_); 1410 1411 int num_cpus = CpuCount(); 1412 char *num; 1413 // Calculate the number of cache lines needed just to give each core 1414 // its own counter. 1415 int line_size = cc_cacheline_size_; 1416 if (line_size <= 0) { 1417 line_size = CacheLineSize(); 1418 if (line_size < kCacheLineSize) 1419 line_size = kCacheLineSize; 1420 logprintf(12, "Log: Using %d as cache line size\n", line_size); 1421 } 1422 // The number of cache lines needed to hold an array of num_cpus. 1423 // "num" must be the same type as cc_cacheline_data[X].num or the memory 1424 // size calculations will fail. 1425 int needed_lines = (sizeof(*num) * num_cpus + line_size - 1) / line_size; 1426 // Allocate all the nums once so that we get a single chunk 1427 // of contiguous memory. 1428 #ifdef HAVE_POSIX_MEMALIGN 1429 int err_result = posix_memalign( 1430 reinterpret_cast<void**>(&num), 1431 line_size, line_size * needed_lines * cc_cacheline_count_); 1432 #else 1433 num = reinterpret_cast<char*>(memalign( 1434 line_size, line_size * needed_lines * cc_cacheline_count_)); 1435 int err_result = (num == 0); 1436 #endif 1437 sat_assert(err_result == 0); 1438 1439 int cline; 1440 for (cline = 0; cline < cc_cacheline_count_; cline++) { 1441 memset(num, 0, sizeof(*num) * num_cpus); 1442 cc_cacheline_data_[cline].num = num; 1443 num += (line_size * needed_lines) / sizeof(*num); 1444 } 1445 1446 int tnum; 1447 for (tnum = 0; tnum < num_cpus; tnum++) { 1448 CpuCacheCoherencyThread *thread = 1449 new CpuCacheCoherencyThread(cc_cacheline_data_, cc_cacheline_count_, 1450 tnum, num_cpus, cc_inc_count_); 1451 thread->InitThread(total_threads_++, this, os_, patternlist_, 1452 &continuous_status_); 1453 // Pin the thread to a particular core. 1454 thread->set_cpu_mask_to_cpu(tnum); 1455 1456 // Insert the thread into the vector. 1457 cc_vector->insert(cc_vector->end(), thread); 1458 } 1459 workers_map_.insert(make_pair(kCCType, cc_vector)); 1460 } 1461 1462 if (cpu_freq_test_) { 1463 // Create the frequency test thread. 1464 logprintf(5, "Log: Running cpu frequency test: threshold set to %dMHz.\n", 1465 cpu_freq_threshold_); 1466 CpuFreqThread *thread = new CpuFreqThread(CpuCount(), cpu_freq_threshold_, 1467 cpu_freq_round_); 1468 // This thread should be paused when other threads are paused. 1469 thread->InitThread(total_threads_++, this, os_, NULL, 1470 &power_spike_status_); 1471 1472 WorkerVector *cpu_freq_vector = new WorkerVector(); 1473 cpu_freq_vector->insert(cpu_freq_vector->end(), thread); 1474 workers_map_.insert(make_pair(kCPUFreqType, cpu_freq_vector)); 1475 } 1476 1477 ReleaseWorkerLock(); 1478 } 1479 1480 // Return the number of cpus actually present in the machine. 1481 int Sat::CpuCount() { 1482 return sysconf(_SC_NPROCESSORS_CONF); 1483 } 1484 1485 // Return the worst case (largest) cache line size of the various levels of 1486 // cache actually prsent in the machine. 1487 int Sat::CacheLineSize() { 1488 int max_linesize = sysconf(_SC_LEVEL1_DCACHE_LINESIZE); 1489 int linesize = sysconf(_SC_LEVEL2_CACHE_LINESIZE); 1490 if (linesize > max_linesize) max_linesize = linesize; 1491 linesize = sysconf(_SC_LEVEL3_CACHE_LINESIZE); 1492 if (linesize > max_linesize) max_linesize = linesize; 1493 linesize = sysconf(_SC_LEVEL4_CACHE_LINESIZE); 1494 if (linesize > max_linesize) max_linesize = linesize; 1495 return max_linesize; 1496 } 1497 1498 // Notify and reap worker threads. 1499 void Sat::JoinThreads() { 1500 logprintf(12, "Log: Joining worker threads\n"); 1501 power_spike_status_.StopWorkers(); 1502 continuous_status_.StopWorkers(); 1503 1504 AcquireWorkerLock(); 1505 for (WorkerMap::const_iterator map_it = workers_map_.begin(); 1506 map_it != workers_map_.end(); ++map_it) { 1507 for (WorkerVector::const_iterator it = map_it->second->begin(); 1508 it != map_it->second->end(); ++it) { 1509 logprintf(12, "Log: Joining thread %d\n", (*it)->ThreadID()); 1510 (*it)->JoinThread(); 1511 } 1512 } 1513 ReleaseWorkerLock(); 1514 1515 QueueStats(); 1516 1517 // Finish up result checking. 1518 // Spawn 4 check threads to minimize check time. 1519 logprintf(12, "Log: Finished countdown, begin to result check\n"); 1520 WorkerStatus reap_check_status; 1521 WorkerVector reap_check_vector; 1522 1523 // No need for check threads for monitor mode. 1524 if (!monitor_mode_) { 1525 // Initialize the check threads. 1526 for (int i = 0; i < fill_threads_; i++) { 1527 CheckThread *thread = new CheckThread(); 1528 thread->InitThread(total_threads_++, this, os_, patternlist_, 1529 &reap_check_status); 1530 logprintf(12, "Log: Finished countdown, begin to result check\n"); 1531 reap_check_vector.push_back(thread); 1532 } 1533 } 1534 1535 reap_check_status.Initialize(); 1536 // Check threads should be marked to stop ASAP. 1537 reap_check_status.StopWorkers(); 1538 1539 // Spawn the check threads. 1540 for (WorkerVector::const_iterator it = reap_check_vector.begin(); 1541 it != reap_check_vector.end(); ++it) { 1542 logprintf(12, "Log: Spawning thread %d\n", (*it)->ThreadID()); 1543 (*it)->SpawnThread(); 1544 } 1545 1546 // Join the check threads. 1547 for (WorkerVector::const_iterator it = reap_check_vector.begin(); 1548 it != reap_check_vector.end(); ++it) { 1549 logprintf(12, "Log: Joining thread %d\n", (*it)->ThreadID()); 1550 (*it)->JoinThread(); 1551 } 1552 1553 // Reap all children. Stopped threads should have already ended. 1554 // Result checking threads will end when they have finished 1555 // result checking. 1556 logprintf(12, "Log: Join all outstanding threads\n"); 1557 1558 // Find all errors. 1559 errorcount_ = GetTotalErrorCount(); 1560 1561 AcquireWorkerLock(); 1562 for (WorkerMap::const_iterator map_it = workers_map_.begin(); 1563 map_it != workers_map_.end(); ++map_it) { 1564 for (WorkerVector::const_iterator it = map_it->second->begin(); 1565 it != map_it->second->end(); ++it) { 1566 logprintf(12, "Log: Reaping thread status %d\n", (*it)->ThreadID()); 1567 if ((*it)->GetStatus() != 1) { 1568 logprintf(0, "Process Error: Thread %d failed with status %d at " 1569 "%.2f seconds\n", 1570 (*it)->ThreadID(), (*it)->GetStatus(), 1571 (*it)->GetRunDurationUSec()*1.0/1000000); 1572 bad_status(); 1573 } 1574 int priority = 12; 1575 if ((*it)->GetErrorCount()) 1576 priority = 5; 1577 logprintf(priority, "Log: Thread %d found %lld hardware incidents\n", 1578 (*it)->ThreadID(), (*it)->GetErrorCount()); 1579 } 1580 } 1581 ReleaseWorkerLock(); 1582 1583 1584 // Add in any errors from check threads. 1585 for (WorkerVector::const_iterator it = reap_check_vector.begin(); 1586 it != reap_check_vector.end(); ++it) { 1587 logprintf(12, "Log: Reaping thread status %d\n", (*it)->ThreadID()); 1588 if ((*it)->GetStatus() != 1) { 1589 logprintf(0, "Process Error: Thread %d failed with status %d at " 1590 "%.2f seconds\n", 1591 (*it)->ThreadID(), (*it)->GetStatus(), 1592 (*it)->GetRunDurationUSec()*1.0/1000000); 1593 bad_status(); 1594 } 1595 errorcount_ += (*it)->GetErrorCount(); 1596 int priority = 12; 1597 if ((*it)->GetErrorCount()) 1598 priority = 5; 1599 logprintf(priority, "Log: Thread %d found %lld hardware incidents\n", 1600 (*it)->ThreadID(), (*it)->GetErrorCount()); 1601 delete (*it); 1602 } 1603 reap_check_vector.clear(); 1604 reap_check_status.Destroy(); 1605 } 1606 1607 // Print queuing information. 1608 void Sat::QueueStats() { 1609 finelock_q_->QueueAnalysis(); 1610 } 1611 1612 void Sat::AnalysisAllStats() { 1613 float max_runtime_sec = 0.; 1614 float total_data = 0.; 1615 float total_bandwidth = 0.; 1616 float thread_runtime_sec = 0.; 1617 1618 for (WorkerMap::const_iterator map_it = workers_map_.begin(); 1619 map_it != workers_map_.end(); ++map_it) { 1620 for (WorkerVector::const_iterator it = map_it->second->begin(); 1621 it != map_it->second->end(); ++it) { 1622 thread_runtime_sec = (*it)->GetRunDurationUSec()*1.0/1000000.; 1623 total_data += (*it)->GetMemoryCopiedData(); 1624 total_data += (*it)->GetDeviceCopiedData(); 1625 if (thread_runtime_sec > max_runtime_sec) { 1626 max_runtime_sec = thread_runtime_sec; 1627 } 1628 } 1629 } 1630 1631 total_bandwidth = total_data / max_runtime_sec; 1632 1633 logprintf(0, "Stats: Completed: %.2fM in %.2fs %.2fMB/s, " 1634 "with %d hardware incidents, %d errors\n", 1635 total_data, 1636 max_runtime_sec, 1637 total_bandwidth, 1638 errorcount_, 1639 statuscount_); 1640 } 1641 1642 void Sat::MemoryStats() { 1643 float memcopy_data = 0.; 1644 float memcopy_bandwidth = 0.; 1645 WorkerMap::const_iterator mem_it = workers_map_.find( 1646 static_cast<int>(kMemoryType)); 1647 WorkerMap::const_iterator file_it = workers_map_.find( 1648 static_cast<int>(kFileIOType)); 1649 sat_assert(mem_it != workers_map_.end()); 1650 sat_assert(file_it != workers_map_.end()); 1651 for (WorkerVector::const_iterator it = mem_it->second->begin(); 1652 it != mem_it->second->end(); ++it) { 1653 memcopy_data += (*it)->GetMemoryCopiedData(); 1654 memcopy_bandwidth += (*it)->GetMemoryBandwidth(); 1655 } 1656 for (WorkerVector::const_iterator it = file_it->second->begin(); 1657 it != file_it->second->end(); ++it) { 1658 memcopy_data += (*it)->GetMemoryCopiedData(); 1659 memcopy_bandwidth += (*it)->GetMemoryBandwidth(); 1660 } 1661 GoogleMemoryStats(&memcopy_data, &memcopy_bandwidth); 1662 logprintf(4, "Stats: Memory Copy: %.2fM at %.2fMB/s\n", 1663 memcopy_data, 1664 memcopy_bandwidth); 1665 } 1666 1667 void Sat::GoogleMemoryStats(float *memcopy_data, 1668 float *memcopy_bandwidth) { 1669 // Do nothing, should be implemented by subclasses. 1670 } 1671 1672 void Sat::FileStats() { 1673 float file_data = 0.; 1674 float file_bandwidth = 0.; 1675 WorkerMap::const_iterator file_it = workers_map_.find( 1676 static_cast<int>(kFileIOType)); 1677 sat_assert(file_it != workers_map_.end()); 1678 for (WorkerVector::const_iterator it = file_it->second->begin(); 1679 it != file_it->second->end(); ++it) { 1680 file_data += (*it)->GetDeviceCopiedData(); 1681 file_bandwidth += (*it)->GetDeviceBandwidth(); 1682 } 1683 logprintf(4, "Stats: File Copy: %.2fM at %.2fMB/s\n", 1684 file_data, 1685 file_bandwidth); 1686 } 1687 1688 void Sat::CheckStats() { 1689 float check_data = 0.; 1690 float check_bandwidth = 0.; 1691 WorkerMap::const_iterator check_it = workers_map_.find( 1692 static_cast<int>(kCheckType)); 1693 sat_assert(check_it != workers_map_.end()); 1694 for (WorkerVector::const_iterator it = check_it->second->begin(); 1695 it != check_it->second->end(); ++it) { 1696 check_data += (*it)->GetMemoryCopiedData(); 1697 check_bandwidth += (*it)->GetMemoryBandwidth(); 1698 } 1699 logprintf(4, "Stats: Data Check: %.2fM at %.2fMB/s\n", 1700 check_data, 1701 check_bandwidth); 1702 } 1703 1704 void Sat::NetStats() { 1705 float net_data = 0.; 1706 float net_bandwidth = 0.; 1707 WorkerMap::const_iterator netio_it = workers_map_.find( 1708 static_cast<int>(kNetIOType)); 1709 WorkerMap::const_iterator netslave_it = workers_map_.find( 1710 static_cast<int>(kNetSlaveType)); 1711 sat_assert(netio_it != workers_map_.end()); 1712 sat_assert(netslave_it != workers_map_.end()); 1713 for (WorkerVector::const_iterator it = netio_it->second->begin(); 1714 it != netio_it->second->end(); ++it) { 1715 net_data += (*it)->GetDeviceCopiedData(); 1716 net_bandwidth += (*it)->GetDeviceBandwidth(); 1717 } 1718 for (WorkerVector::const_iterator it = netslave_it->second->begin(); 1719 it != netslave_it->second->end(); ++it) { 1720 net_data += (*it)->GetDeviceCopiedData(); 1721 net_bandwidth += (*it)->GetDeviceBandwidth(); 1722 } 1723 logprintf(4, "Stats: Net Copy: %.2fM at %.2fMB/s\n", 1724 net_data, 1725 net_bandwidth); 1726 } 1727 1728 void Sat::InvertStats() { 1729 float invert_data = 0.; 1730 float invert_bandwidth = 0.; 1731 WorkerMap::const_iterator invert_it = workers_map_.find( 1732 static_cast<int>(kInvertType)); 1733 sat_assert(invert_it != workers_map_.end()); 1734 for (WorkerVector::const_iterator it = invert_it->second->begin(); 1735 it != invert_it->second->end(); ++it) { 1736 invert_data += (*it)->GetMemoryCopiedData(); 1737 invert_bandwidth += (*it)->GetMemoryBandwidth(); 1738 } 1739 logprintf(4, "Stats: Invert Data: %.2fM at %.2fMB/s\n", 1740 invert_data, 1741 invert_bandwidth); 1742 } 1743 1744 void Sat::DiskStats() { 1745 float disk_data = 0.; 1746 float disk_bandwidth = 0.; 1747 WorkerMap::const_iterator disk_it = workers_map_.find( 1748 static_cast<int>(kDiskType)); 1749 WorkerMap::const_iterator random_it = workers_map_.find( 1750 static_cast<int>(kRandomDiskType)); 1751 sat_assert(disk_it != workers_map_.end()); 1752 sat_assert(random_it != workers_map_.end()); 1753 for (WorkerVector::const_iterator it = disk_it->second->begin(); 1754 it != disk_it->second->end(); ++it) { 1755 disk_data += (*it)->GetDeviceCopiedData(); 1756 disk_bandwidth += (*it)->GetDeviceBandwidth(); 1757 } 1758 for (WorkerVector::const_iterator it = random_it->second->begin(); 1759 it != random_it->second->end(); ++it) { 1760 disk_data += (*it)->GetDeviceCopiedData(); 1761 disk_bandwidth += (*it)->GetDeviceBandwidth(); 1762 } 1763 1764 logprintf(4, "Stats: Disk: %.2fM at %.2fMB/s\n", 1765 disk_data, 1766 disk_bandwidth); 1767 } 1768 1769 // Process worker thread data for bandwidth information, and error results. 1770 // You can add more methods here just subclassing SAT. 1771 void Sat::RunAnalysis() { 1772 AnalysisAllStats(); 1773 MemoryStats(); 1774 FileStats(); 1775 NetStats(); 1776 CheckStats(); 1777 InvertStats(); 1778 DiskStats(); 1779 } 1780 1781 // Get total error count, summing across all threads.. 1782 int64 Sat::GetTotalErrorCount() { 1783 int64 errors = 0; 1784 1785 AcquireWorkerLock(); 1786 for (WorkerMap::const_iterator map_it = workers_map_.begin(); 1787 map_it != workers_map_.end(); ++map_it) { 1788 for (WorkerVector::const_iterator it = map_it->second->begin(); 1789 it != map_it->second->end(); ++it) { 1790 errors += (*it)->GetErrorCount(); 1791 } 1792 } 1793 ReleaseWorkerLock(); 1794 return errors; 1795 } 1796 1797 1798 void Sat::SpawnThreads() { 1799 logprintf(12, "Log: Initializing WorkerStatus objects\n"); 1800 power_spike_status_.Initialize(); 1801 continuous_status_.Initialize(); 1802 logprintf(12, "Log: Spawning worker threads\n"); 1803 for (WorkerMap::const_iterator map_it = workers_map_.begin(); 1804 map_it != workers_map_.end(); ++map_it) { 1805 for (WorkerVector::const_iterator it = map_it->second->begin(); 1806 it != map_it->second->end(); ++it) { 1807 logprintf(12, "Log: Spawning thread %d\n", (*it)->ThreadID()); 1808 (*it)->SpawnThread(); 1809 } 1810 } 1811 } 1812 1813 // Delete used worker thread objects. 1814 void Sat::DeleteThreads() { 1815 logprintf(12, "Log: Deleting worker threads\n"); 1816 for (WorkerMap::const_iterator map_it = workers_map_.begin(); 1817 map_it != workers_map_.end(); ++map_it) { 1818 for (WorkerVector::const_iterator it = map_it->second->begin(); 1819 it != map_it->second->end(); ++it) { 1820 logprintf(12, "Log: Deleting thread %d\n", (*it)->ThreadID()); 1821 delete (*it); 1822 } 1823 delete map_it->second; 1824 } 1825 workers_map_.clear(); 1826 logprintf(12, "Log: Destroying WorkerStatus objects\n"); 1827 power_spike_status_.Destroy(); 1828 continuous_status_.Destroy(); 1829 } 1830 1831 namespace { 1832 // Calculates the next time an action in Sat::Run() should occur, based on a 1833 // schedule derived from a start point and a regular frequency. 1834 // 1835 // Using frequencies instead of intervals with their accompanying drift allows 1836 // users to better predict when the actions will occur throughout a run. 1837 // 1838 // Arguments: 1839 // frequency: seconds 1840 // start: unixtime 1841 // now: unixtime 1842 // 1843 // Returns: unixtime 1844 inline time_t NextOccurance(time_t frequency, time_t start, time_t now) { 1845 return start + frequency + (((now - start) / frequency) * frequency); 1846 } 1847 } 1848 1849 // Run the actual test. 1850 bool Sat::Run() { 1851 // Install signal handlers to gracefully exit in the middle of a run. 1852 // 1853 // Why go through this whole rigmarole? It's the only standards-compliant 1854 // (C++ and POSIX) way to handle signals in a multithreaded program. 1855 // Specifically: 1856 // 1857 // 1) (C++) The value of a variable not of type "volatile sig_atomic_t" is 1858 // unspecified upon entering a signal handler and, if modified by the 1859 // handler, is unspecified after leaving the handler. 1860 // 1861 // 2) (POSIX) After the value of a variable is changed in one thread, another 1862 // thread is only guaranteed to see the new value after both threads have 1863 // acquired or released the same mutex or rwlock, synchronized to the 1864 // same barrier, or similar. 1865 // 1866 // #1 prevents the use of #2 in a signal handler, so the signal handler must 1867 // be called in the same thread that reads the "volatile sig_atomic_t" 1868 // variable it sets. We enforce that by blocking the signals in question in 1869 // the worker threads, forcing them to be handled by this thread. 1870 logprintf(12, "Log: Installing signal handlers\n"); 1871 sigset_t new_blocked_signals; 1872 sigemptyset(&new_blocked_signals); 1873 sigaddset(&new_blocked_signals, SIGINT); 1874 sigaddset(&new_blocked_signals, SIGTERM); 1875 sigset_t prev_blocked_signals; 1876 pthread_sigmask(SIG_BLOCK, &new_blocked_signals, &prev_blocked_signals); 1877 sighandler_t prev_sigint_handler = signal(SIGINT, SatHandleBreak); 1878 sighandler_t prev_sigterm_handler = signal(SIGTERM, SatHandleBreak); 1879 1880 // Kick off all the worker threads. 1881 logprintf(12, "Log: Launching worker threads\n"); 1882 InitializeThreads(); 1883 SpawnThreads(); 1884 pthread_sigmask(SIG_SETMASK, &prev_blocked_signals, NULL); 1885 1886 logprintf(12, "Log: Starting countdown with %d seconds\n", runtime_seconds_); 1887 1888 // In seconds. 1889 static const time_t kSleepFrequency = 5; 1890 // All of these are in seconds. You probably want them to be >= 1891 // kSleepFrequency and multiples of kSleepFrequency, but neither is necessary. 1892 static const time_t kInjectionFrequency = 10; 1893 // print_delay_ determines "seconds remaining" chatty update. 1894 1895 const time_t start = time(NULL); 1896 const time_t end = start + runtime_seconds_; 1897 time_t now = start; 1898 time_t next_print = start + print_delay_; 1899 time_t next_pause = start + pause_delay_; 1900 time_t next_resume = 0; 1901 time_t next_injection; 1902 if (crazy_error_injection_) { 1903 next_injection = start + kInjectionFrequency; 1904 } else { 1905 next_injection = 0; 1906 } 1907 1908 while (now < end) { 1909 // This is an int because it's for logprintf(). 1910 const int seconds_remaining = end - now; 1911 1912 if (user_break_) { 1913 // Handle early exit. 1914 logprintf(0, "Log: User exiting early (%d seconds remaining)\n", 1915 seconds_remaining); 1916 break; 1917 } 1918 1919 // If we have an error limit, check it here and see if we should exit. 1920 if (max_errorcount_ != 0) { 1921 uint64 errors = GetTotalErrorCount(); 1922 if (errors > max_errorcount_) { 1923 logprintf(0, "Log: Exiting early (%d seconds remaining) " 1924 "due to excessive failures (%lld)\n", 1925 seconds_remaining, 1926 errors); 1927 break; 1928 } 1929 } 1930 1931 if (now >= next_print) { 1932 // Print a count down message. 1933 logprintf(5, "Log: Seconds remaining: %d\n", seconds_remaining); 1934 next_print = NextOccurance(print_delay_, start, now); 1935 } 1936 1937 if (next_injection && now >= next_injection) { 1938 // Inject an error. 1939 logprintf(4, "Log: Injecting error (%d seconds remaining)\n", 1940 seconds_remaining); 1941 struct page_entry src; 1942 GetValid(&src); 1943 src.pattern = patternlist_->GetPattern(0); 1944 PutValid(&src); 1945 next_injection = NextOccurance(kInjectionFrequency, start, now); 1946 } 1947 1948 if (next_pause && now >= next_pause) { 1949 // Tell worker threads to pause in preparation for a power spike. 1950 logprintf(4, "Log: Pausing worker threads in preparation for power spike " 1951 "(%d seconds remaining)\n", seconds_remaining); 1952 power_spike_status_.PauseWorkers(); 1953 logprintf(12, "Log: Worker threads paused\n"); 1954 next_pause = 0; 1955 next_resume = now + pause_duration_; 1956 } 1957 1958 if (next_resume && now >= next_resume) { 1959 // Tell worker threads to resume in order to cause a power spike. 1960 logprintf(4, "Log: Resuming worker threads to cause a power spike (%d " 1961 "seconds remaining)\n", seconds_remaining); 1962 power_spike_status_.ResumeWorkers(); 1963 logprintf(12, "Log: Worker threads resumed\n"); 1964 next_pause = NextOccurance(pause_delay_, start, now); 1965 next_resume = 0; 1966 } 1967 1968 sat_sleep(NextOccurance(kSleepFrequency, start, now) - now); 1969 now = time(NULL); 1970 } 1971 1972 JoinThreads(); 1973 1974 logprintf(0, "Stats: Found %lld hardware incidents\n", errorcount_); 1975 1976 if (!monitor_mode_) 1977 RunAnalysis(); 1978 1979 DeleteThreads(); 1980 1981 logprintf(12, "Log: Uninstalling signal handlers\n"); 1982 signal(SIGINT, prev_sigint_handler); 1983 signal(SIGTERM, prev_sigterm_handler); 1984 1985 return true; 1986 } 1987 1988 // Clean up all resources. 1989 bool Sat::Cleanup() { 1990 g_sat = NULL; 1991 Logger::GlobalLogger()->StopThread(); 1992 Logger::GlobalLogger()->SetStdoutOnly(); 1993 if (logfile_) { 1994 close(logfile_); 1995 logfile_ = 0; 1996 } 1997 if (patternlist_) { 1998 patternlist_->Destroy(); 1999 delete patternlist_; 2000 patternlist_ = 0; 2001 } 2002 if (os_) { 2003 os_->FreeTestMem(); 2004 delete os_; 2005 os_ = 0; 2006 } 2007 if (empty_) { 2008 delete empty_; 2009 empty_ = 0; 2010 } 2011 if (valid_) { 2012 delete valid_; 2013 valid_ = 0; 2014 } 2015 if (finelock_q_) { 2016 delete finelock_q_; 2017 finelock_q_ = 0; 2018 } 2019 if (page_bitmap_) { 2020 delete[] page_bitmap_; 2021 } 2022 2023 for (size_t i = 0; i < blocktables_.size(); i++) { 2024 delete blocktables_[i]; 2025 } 2026 2027 if (cc_cacheline_data_) { 2028 // The num integer arrays for all the cacheline structures are 2029 // allocated as a single chunk. The pointers in the cacheline struct 2030 // are populated accordingly. Hence calling free on the first 2031 // cacheline's num's address is going to free the entire array. 2032 // TODO(aganti): Refactor this to have a class for the cacheline 2033 // structure (currently defined in worker.h) and clean this up 2034 // in the destructor of that class. 2035 if (cc_cacheline_data_[0].num) { 2036 free(cc_cacheline_data_[0].num); 2037 } 2038 free(cc_cacheline_data_); 2039 } 2040 2041 sat_assert(0 == pthread_mutex_destroy(&worker_lock_)); 2042 2043 return true; 2044 } 2045 2046 2047 // Pretty print really obvious results. 2048 bool Sat::PrintResults() { 2049 bool result = true; 2050 2051 logprintf(4, "\n"); 2052 if (statuscount_) { 2053 logprintf(4, "Status: FAIL - test encountered procedural errors\n"); 2054 result = false; 2055 } else if (errorcount_) { 2056 logprintf(4, "Status: FAIL - test discovered HW problems\n"); 2057 result = false; 2058 } else { 2059 logprintf(4, "Status: PASS - please verify no corrected errors\n"); 2060 } 2061 logprintf(4, "\n"); 2062 2063 return result; 2064 } 2065 2066 // Helper functions. 2067 void Sat::AcquireWorkerLock() { 2068 sat_assert(0 == pthread_mutex_lock(&worker_lock_)); 2069 } 2070 void Sat::ReleaseWorkerLock() { 2071 sat_assert(0 == pthread_mutex_unlock(&worker_lock_)); 2072 } 2073 2074 void logprintf(int priority, const char *format, ...) { 2075 va_list args; 2076 va_start(args, format); 2077 Logger::GlobalLogger()->VLogF(priority, format, args); 2078 va_end(args); 2079 } 2080 2081 // Stop the logging thread and verify any pending data is written to the log. 2082 void logstop() { 2083 Logger::GlobalLogger()->StopThread(); 2084 } 2085 2086