1 // Copyright 2006 Google Inc. All Rights Reserved. 2 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 7 // http://www.apache.org/licenses/LICENSE-2.0 8 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // sat.cc : a stress test for stressful testing 16 17 // stressapptest (or SAT, from Stressful Application Test) is a test 18 // designed to stress the system, as well as provide a comprehensive 19 // memory interface test. 20 21 // stressapptest can be run using memory only, or using many system components. 22 23 #include <errno.h> 24 #include <pthread.h> 25 #include <signal.h> 26 #include <stdarg.h> 27 #include <stdio.h> 28 #include <stdlib.h> 29 #include <string.h> 30 #include <unistd.h> 31 32 #include <sys/stat.h> 33 #include <sys/times.h> 34 35 // #define __USE_GNU 36 // #define __USE_LARGEFILE64 37 #include <fcntl.h> 38 39 #include <list> 40 #include <string> 41 42 // This file must work with autoconf on its public version, 43 // so these includes are correct. 44 #include "disk_blocks.h" 45 #include "logger.h" 46 #include "os.h" 47 #include "sat.h" 48 #include "sattypes.h" 49 #include "worker.h" 50 51 // stressapptest versioning here. 52 #ifndef PACKAGE_VERSION 53 static const char* kVersion = "1.0.0"; 54 #else 55 static const char* kVersion = PACKAGE_VERSION; 56 #endif 57 58 // Global stressapptest reference, for use by signal handler. 59 // This makes Sat objects not safe for multiple instances. 60 namespace { 61 Sat *g_sat = NULL; 62 63 // Signal handler for catching break or kill. 64 // 65 // This must be installed after g_sat is assigned and while there is a single 66 // thread. 67 // 68 // This must be uninstalled while there is only a single thread, and of course 69 // before g_sat is cleared or deleted. 70 void SatHandleBreak(int signal) { 71 g_sat->Break(); 72 } 73 } 74 75 // Opens the logfile for writing if necessary 76 bool Sat::InitializeLogfile() { 77 // Open logfile. 78 if (use_logfile_) { 79 logfile_ = open(logfilename_, 80 #if defined(O_DSYNC) 81 O_DSYNC | 82 #elif defined(O_SYNC) 83 O_SYNC | 84 #elif defined(O_FSYNC) 85 O_FSYNC | 86 #endif 87 O_WRONLY | O_CREAT, 88 S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); 89 if (logfile_ < 0) { 90 printf("Fatal Error: cannot open file %s for logging\n", 91 logfilename_); 92 bad_status(); 93 return false; 94 } 95 // We seek to the end once instead of opening in append mode because no 96 // other processes should be writing to it while this one exists. 97 if (lseek(logfile_, 0, SEEK_END) == -1) { 98 printf("Fatal Error: cannot seek to end of logfile (%s)\n", 99 logfilename_); 100 bad_status(); 101 return false; 102 } 103 Logger::GlobalLogger()->SetLogFd(logfile_); 104 } 105 return true; 106 } 107 108 // Check that the environment is known and safe to run on. 109 // Return 1 if good, 0 if unsuppported. 110 bool Sat::CheckEnvironment() { 111 // Check that this is not a debug build. Debug builds lack 112 // enough performance to stress the system. 113 #if !defined NDEBUG 114 if (run_on_anything_) { 115 logprintf(1, "Log: Running DEBUG version of SAT, " 116 "with significantly reduced coverage.\n"); 117 } else { 118 logprintf(0, "Process Error: Running DEBUG version of SAT, " 119 "with significantly reduced coverage.\n"); 120 logprintf(0, "Log: Command line option '-A' bypasses this error.\n"); 121 bad_status(); 122 return false; 123 } 124 #elif !defined CHECKOPTS 125 #error Build system regression - COPTS disregarded. 126 #endif 127 128 // Use all CPUs if nothing is specified. 129 if (memory_threads_ == -1) { 130 memory_threads_ = os_->num_cpus(); 131 logprintf(7, "Log: Defaulting to %d copy threads\n", memory_threads_); 132 } 133 134 // Use all memory if no size is specified. 135 if (size_mb_ == 0) 136 size_mb_ = os_->FindFreeMemSize() / kMegabyte; 137 size_ = static_cast<int64>(size_mb_) * kMegabyte; 138 139 // Autodetect file locations. 140 if (findfiles_ && (file_threads_ == 0)) { 141 // Get a space separated sting of disk locations. 142 list<string> locations = os_->FindFileDevices(); 143 144 // Extract each one. 145 while (!locations.empty()) { 146 // Copy and remove the disk name. 147 string disk = locations.back(); 148 locations.pop_back(); 149 150 logprintf(12, "Log: disk at %s\n", disk.c_str()); 151 file_threads_++; 152 filename_.push_back(disk + "/sat_disk.a"); 153 file_threads_++; 154 filename_.push_back(disk + "/sat_disk.b"); 155 } 156 } 157 158 // We'd better have some memory by this point. 159 if (size_ < 1) { 160 logprintf(0, "Process Error: No memory found to test.\n"); 161 bad_status(); 162 return false; 163 } 164 165 if (tag_mode_ && ((file_threads_ > 0) || 166 (disk_threads_ > 0) || 167 (net_threads_ > 0))) { 168 logprintf(0, "Process Error: Memory tag mode incompatible " 169 "with disk/network DMA.\n"); 170 bad_status(); 171 return false; 172 } 173 174 // If platform is 32 bit Xeon, floor memory size to multiple of 4. 175 if (address_mode_ == 32) { 176 size_mb_ = (size_mb_ / 4) * 4; 177 size_ = size_mb_ * kMegabyte; 178 logprintf(1, "Log: Flooring memory allocation to multiple of 4: %lldMB\n", 179 size_mb_); 180 } 181 182 // Check if this system is on the whitelist for supported systems. 183 if (!os_->IsSupported()) { 184 if (run_on_anything_) { 185 logprintf(1, "Log: Unsupported system. Running with reduced coverage.\n"); 186 // This is ok, continue on. 187 } else { 188 logprintf(0, "Process Error: Unsupported system, " 189 "no error reporting available\n"); 190 logprintf(0, "Log: Command line option '-A' bypasses this error.\n"); 191 bad_status(); 192 return false; 193 } 194 } 195 196 return true; 197 } 198 199 // Allocates memory to run the test on 200 bool Sat::AllocateMemory() { 201 // Allocate our test memory. 202 bool result = os_->AllocateTestMem(size_, paddr_base_); 203 if (!result) { 204 logprintf(0, "Process Error: failed to allocate memory\n"); 205 bad_status(); 206 return false; 207 } 208 return true; 209 } 210 211 // Sets up access to data patterns 212 bool Sat::InitializePatterns() { 213 // Initialize pattern data. 214 patternlist_ = new PatternList(); 215 if (!patternlist_) { 216 logprintf(0, "Process Error: failed to allocate patterns\n"); 217 bad_status(); 218 return false; 219 } 220 if (!patternlist_->Initialize()) { 221 logprintf(0, "Process Error: failed to initialize patternlist\n"); 222 bad_status(); 223 return false; 224 } 225 return true; 226 } 227 228 // Get any valid page, no tag specified. 229 bool Sat::GetValid(struct page_entry *pe) { 230 return GetValid(pe, kDontCareTag); 231 } 232 233 234 // Fetch and return empty and full pages into the empty and full pools. 235 bool Sat::GetValid(struct page_entry *pe, int32 tag) { 236 bool result = false; 237 // Get valid page depending on implementation. 238 if (pe_q_implementation_ == SAT_FINELOCK) 239 result = finelock_q_->GetValid(pe, tag); 240 else if (pe_q_implementation_ == SAT_ONELOCK) 241 result = valid_->PopRandom(pe); 242 243 if (result) { 244 pe->addr = os_->PrepareTestMem(pe->offset, page_length_); // Map it. 245 246 // Tag this access and current pattern. 247 pe->ts = os_->GetTimestamp(); 248 pe->lastpattern = pe->pattern; 249 250 return (pe->addr != 0); // Return success or failure. 251 } 252 return false; 253 } 254 255 bool Sat::PutValid(struct page_entry *pe) { 256 if (pe->addr != 0) 257 os_->ReleaseTestMem(pe->addr, pe->offset, page_length_); // Unmap the page. 258 pe->addr = 0; 259 260 // Put valid page depending on implementation. 261 if (pe_q_implementation_ == SAT_FINELOCK) 262 return finelock_q_->PutValid(pe); 263 else if (pe_q_implementation_ == SAT_ONELOCK) 264 return valid_->Push(pe); 265 else 266 return false; 267 } 268 269 // Get an empty page with any tag. 270 bool Sat::GetEmpty(struct page_entry *pe) { 271 return GetEmpty(pe, kDontCareTag); 272 } 273 274 bool Sat::GetEmpty(struct page_entry *pe, int32 tag) { 275 bool result = false; 276 // Get empty page depending on implementation. 277 if (pe_q_implementation_ == SAT_FINELOCK) 278 result = finelock_q_->GetEmpty(pe, tag); 279 else if (pe_q_implementation_ == SAT_ONELOCK) 280 result = empty_->PopRandom(pe); 281 282 if (result) { 283 pe->addr = os_->PrepareTestMem(pe->offset, page_length_); // Map it. 284 return (pe->addr != 0); // Return success or failure. 285 } 286 return false; 287 } 288 289 bool Sat::PutEmpty(struct page_entry *pe) { 290 if (pe->addr != 0) 291 os_->ReleaseTestMem(pe->addr, pe->offset, page_length_); // Unmap the page. 292 pe->addr = 0; 293 294 // Put empty page depending on implementation. 295 if (pe_q_implementation_ == SAT_FINELOCK) 296 return finelock_q_->PutEmpty(pe); 297 else if (pe_q_implementation_ == SAT_ONELOCK) 298 return empty_->Push(pe); 299 else 300 return false; 301 } 302 303 // Set up the bitmap of physical pages in case we want to see which pages were 304 // accessed under this run of SAT. 305 void Sat::AddrMapInit() { 306 if (!do_page_map_) 307 return; 308 // Find about how much physical mem is in the system. 309 // TODO(nsanders): Find some way to get the max 310 // and min phys addr in the system. 311 uint64 maxsize = os_->FindFreeMemSize() * 4; 312 sat_assert(maxsize != 0); 313 314 // Make a bitmask of this many pages. Assume that the memory is relatively 315 // zero based. This is true on x86, typically. 316 // This is one bit per page. 317 uint64 arraysize = maxsize / 4096 / 8; 318 unsigned char *bitmap = new unsigned char[arraysize]; 319 sat_assert(bitmap); 320 321 // Mark every page as 0, not seen. 322 memset(bitmap, 0, arraysize); 323 324 page_bitmap_size_ = maxsize; 325 page_bitmap_ = bitmap; 326 } 327 328 // Add the 4k pages in this block to the array of pages SAT has seen. 329 void Sat::AddrMapUpdate(struct page_entry *pe) { 330 if (!do_page_map_) 331 return; 332 333 // Go through 4k page blocks. 334 uint64 arraysize = page_bitmap_size_ / 4096 / 8; 335 336 char *base = reinterpret_cast<char*>(pe->addr); 337 for (int i = 0; i < page_length_; i += 4096) { 338 uint64 paddr = os_->VirtualToPhysical(base + i); 339 340 uint32 offset = paddr / 4096 / 8; 341 unsigned char mask = 1 << ((paddr / 4096) % 8); 342 343 if (offset >= arraysize) { 344 logprintf(0, "Process Error: Physical address %#llx is " 345 "greater than expected %#llx.\n", 346 paddr, page_bitmap_size_); 347 sat_assert(0); 348 } 349 page_bitmap_[offset] |= mask; 350 } 351 } 352 353 // Print out the physical memory ranges that SAT has accessed. 354 void Sat::AddrMapPrint() { 355 if (!do_page_map_) 356 return; 357 358 uint64 pages = page_bitmap_size_ / 4096; 359 360 uint64 last_page = 0; 361 bool valid_range = false; 362 363 logprintf(4, "Log: Printing tested physical ranges.\n"); 364 365 for (uint64 i = 0; i < pages; i ++) { 366 int offset = i / 8; 367 unsigned char mask = 1 << (i % 8); 368 369 bool touched = page_bitmap_[offset] & mask; 370 if (touched && !valid_range) { 371 valid_range = true; 372 last_page = i * 4096; 373 } else if (!touched && valid_range) { 374 valid_range = false; 375 logprintf(4, "Log: %#016llx - %#016llx\n", last_page, (i * 4096) - 1); 376 } 377 } 378 logprintf(4, "Log: Done printing physical ranges.\n"); 379 } 380 381 // Initializes page lists and fills pages with data patterns. 382 bool Sat::InitializePages() { 383 int result = 1; 384 // Calculate needed page totals. 385 int64 neededpages = memory_threads_ + 386 invert_threads_ + 387 check_threads_ + 388 net_threads_ + 389 file_threads_; 390 391 // Empty-valid page ratio is adjusted depending on queue implementation. 392 // since fine-grain-locked queue keeps both valid and empty entries in the 393 // same queue and randomly traverse to find pages, the empty-valid ratio 394 // should be more even. 395 if (pe_q_implementation_ == SAT_FINELOCK) 396 freepages_ = pages_ / 5 * 2; // Mark roughly 2/5 of all pages as Empty. 397 else 398 freepages_ = (pages_ / 100) + (2 * neededpages); 399 400 if (freepages_ < neededpages) { 401 logprintf(0, "Process Error: freepages < neededpages.\n"); 402 logprintf(1, "Stats: Total: %lld, Needed: %lld, Marked free: %lld\n", 403 static_cast<int64>(pages_), 404 static_cast<int64>(neededpages), 405 static_cast<int64>(freepages_)); 406 bad_status(); 407 return false; 408 } 409 410 if (freepages_ > pages_/2) { 411 logprintf(0, "Process Error: not enough pages for IO\n"); 412 logprintf(1, "Stats: Total: %lld, Needed: %lld, Available: %lld\n", 413 static_cast<int64>(pages_), 414 static_cast<int64>(freepages_), 415 static_cast<int64>(pages_/2)); 416 bad_status(); 417 return false; 418 } 419 logprintf(12, "Log: Allocating pages, Total: %lld Free: %lld\n", 420 pages_, 421 freepages_); 422 423 // Initialize page locations. 424 for (int64 i = 0; i < pages_; i++) { 425 struct page_entry pe; 426 init_pe(&pe); 427 pe.offset = i * page_length_; 428 result &= PutEmpty(&pe); 429 } 430 431 if (!result) { 432 logprintf(0, "Process Error: while initializing empty_ list\n"); 433 bad_status(); 434 return false; 435 } 436 437 // Fill valid pages with test patterns. 438 // Use fill threads to do this. 439 WorkerStatus fill_status; 440 WorkerVector fill_vector; 441 442 logprintf(12, "Starting Fill threads: %d threads, %d pages\n", 443 fill_threads_, pages_); 444 // Initialize the fill threads. 445 for (int i = 0; i < fill_threads_; i++) { 446 FillThread *thread = new FillThread(); 447 thread->InitThread(i, this, os_, patternlist_, &fill_status); 448 if (i != fill_threads_ - 1) { 449 logprintf(12, "Starting Fill Threads %d: %d pages\n", 450 i, pages_ / fill_threads_); 451 thread->SetFillPages(pages_ / fill_threads_); 452 // The last thread finishes up all the leftover pages. 453 } else { 454 logprintf(12, "Starting Fill Threads %d: %d pages\n", 455 i, pages_ - pages_ / fill_threads_ * i); 456 thread->SetFillPages(pages_ - pages_ / fill_threads_ * i); 457 } 458 fill_vector.push_back(thread); 459 } 460 461 // Spawn the fill threads. 462 fill_status.Initialize(); 463 for (WorkerVector::const_iterator it = fill_vector.begin(); 464 it != fill_vector.end(); ++it) 465 (*it)->SpawnThread(); 466 467 // Reap the finished fill threads. 468 for (WorkerVector::const_iterator it = fill_vector.begin(); 469 it != fill_vector.end(); ++it) { 470 (*it)->JoinThread(); 471 if ((*it)->GetStatus() != 1) { 472 logprintf(0, "Thread %d failed with status %d at %.2f seconds\n", 473 (*it)->ThreadID(), (*it)->GetStatus(), 474 (*it)->GetRunDurationUSec() * 1.0/1000000); 475 bad_status(); 476 return false; 477 } 478 delete (*it); 479 } 480 fill_vector.clear(); 481 fill_status.Destroy(); 482 logprintf(12, "Log: Done filling pages.\n"); 483 logprintf(12, "Log: Allocating pages.\n"); 484 485 AddrMapInit(); 486 487 // Initialize page locations. 488 for (int64 i = 0; i < pages_; i++) { 489 struct page_entry pe; 490 // Only get valid pages with uninitialized tags here. 491 char buf[256]; 492 if (GetValid(&pe, kInvalidTag)) { 493 int64 paddr = os_->VirtualToPhysical(pe.addr); 494 int32 region = os_->FindRegion(paddr); 495 496 os_->FindDimm(paddr, buf, sizeof(buf)); 497 if (i < 256) { 498 logprintf(12, "Log: address: %#llx, %s\n", paddr, buf); 499 } 500 region_[region]++; 501 pe.paddr = paddr; 502 pe.tag = 1 << region; 503 region_mask_ |= pe.tag; 504 505 // Generate a physical region map 506 AddrMapUpdate(&pe); 507 508 // Note: this does not allocate free pages among all regions 509 // fairly. However, with large enough (thousands) random number 510 // of pages being marked free in each region, the free pages 511 // count in each region end up pretty balanced. 512 if (i < freepages_) { 513 result &= PutEmpty(&pe); 514 } else { 515 result &= PutValid(&pe); 516 } 517 } else { 518 logprintf(0, "Log: didn't tag all pages. %d - %d = %d\n", 519 pages_, i, pages_ - i); 520 return false; 521 } 522 } 523 logprintf(12, "Log: Done allocating pages.\n"); 524 525 AddrMapPrint(); 526 527 for (int i = 0; i < 32; i++) { 528 if (region_mask_ & (1 << i)) { 529 region_count_++; 530 logprintf(12, "Log: Region %d: %d.\n", i, region_[i]); 531 } 532 } 533 logprintf(5, "Log: Region mask: 0x%x\n", region_mask_); 534 535 return true; 536 } 537 538 // Print SAT version info. 539 bool Sat::PrintVersion() { 540 logprintf(1, "Stats: SAT revision %s, %d bit binary\n", 541 kVersion, address_mode_); 542 logprintf(5, "Log: %s from %s\n", Timestamp(), BuildChangelist()); 543 544 return true; 545 } 546 547 548 // Initializes the resources that SAT needs to run. 549 // This needs to be called before Run(), and after ParseArgs(). 550 // Returns true on success, false on error, and will exit() on help message. 551 bool Sat::Initialize() { 552 g_sat = this; 553 554 // Initializes sync'd log file to ensure output is saved. 555 if (!InitializeLogfile()) 556 return false; 557 Logger::GlobalLogger()->StartThread(); 558 559 logprintf(5, "Log: Commandline - %s\n", cmdline_.c_str()); 560 PrintVersion(); 561 562 std::map<std::string, std::string> options; 563 564 GoogleOsOptions(&options); 565 566 // Initialize OS/Hardware interface. 567 os_ = OsLayerFactory(options); 568 if (!os_) { 569 bad_status(); 570 return false; 571 } 572 573 if (min_hugepages_mbytes_ > 0) 574 os_->SetMinimumHugepagesSize(min_hugepages_mbytes_ * kMegabyte); 575 576 if (!os_->Initialize()) { 577 logprintf(0, "Process Error: Failed to initialize OS layer\n"); 578 bad_status(); 579 delete os_; 580 return false; 581 } 582 583 // Checks that OS/Build/Platform is supported. 584 if (!CheckEnvironment()) 585 return false; 586 587 if (error_injection_) 588 os_->set_error_injection(true); 589 590 // Run SAT in monitor only mode, do not continue to allocate resources. 591 if (monitor_mode_) { 592 logprintf(5, "Log: Running in monitor-only mode. " 593 "Will not allocate any memory nor run any stress test. " 594 "Only polling ECC errors.\n"); 595 return true; 596 } 597 598 // Allocate the memory to test. 599 if (!AllocateMemory()) 600 return false; 601 602 logprintf(5, "Stats: Starting SAT, %dM, %d seconds\n", 603 static_cast<int>(size_/kMegabyte), 604 runtime_seconds_); 605 606 if (!InitializePatterns()) 607 return false; 608 609 // Initialize memory allocation. 610 pages_ = size_ / page_length_; 611 612 // Allocate page queue depending on queue implementation switch. 613 if (pe_q_implementation_ == SAT_FINELOCK) { 614 finelock_q_ = new FineLockPEQueue(pages_, page_length_); 615 if (finelock_q_ == NULL) 616 return false; 617 finelock_q_->set_os(os_); 618 os_->set_err_log_callback(finelock_q_->get_err_log_callback()); 619 } else if (pe_q_implementation_ == SAT_ONELOCK) { 620 empty_ = new PageEntryQueue(pages_); 621 valid_ = new PageEntryQueue(pages_); 622 if ((empty_ == NULL) || (valid_ == NULL)) 623 return false; 624 } 625 626 if (!InitializePages()) { 627 logprintf(0, "Process Error: Initialize Pages failed\n"); 628 return false; 629 } 630 631 return true; 632 } 633 634 // Constructor and destructor. 635 Sat::Sat() { 636 // Set defaults, command line might override these. 637 runtime_seconds_ = 20; 638 page_length_ = kSatPageSize; 639 disk_pages_ = kSatDiskPage; 640 pages_ = 0; 641 size_mb_ = 0; 642 size_ = size_mb_ * kMegabyte; 643 min_hugepages_mbytes_ = 0; 644 freepages_ = 0; 645 paddr_base_ = 0; 646 647 user_break_ = false; 648 verbosity_ = 8; 649 Logger::GlobalLogger()->SetVerbosity(verbosity_); 650 strict_ = 1; 651 warm_ = 0; 652 run_on_anything_ = 0; 653 use_logfile_ = 0; 654 logfile_ = 0; 655 // Detect 32/64 bit binary. 656 void *pvoid = 0; 657 address_mode_ = sizeof(pvoid) * 8; 658 error_injection_ = false; 659 crazy_error_injection_ = false; 660 max_errorcount_ = 0; // Zero means no early exit. 661 stop_on_error_ = false; 662 error_poll_ = true; 663 findfiles_ = false; 664 665 do_page_map_ = false; 666 page_bitmap_ = 0; 667 page_bitmap_size_ = 0; 668 669 // Cache coherency data initialization. 670 cc_test_ = false; // Flag to trigger cc threads. 671 cc_cacheline_count_ = 2; // Two datastructures of cache line size. 672 cc_inc_count_ = 1000; // Number of times to increment the shared variable. 673 cc_cacheline_data_ = 0; // Cache Line size datastructure. 674 675 sat_assert(0 == pthread_mutex_init(&worker_lock_, NULL)); 676 file_threads_ = 0; 677 net_threads_ = 0; 678 listen_threads_ = 0; 679 // Default to autodetect number of cpus, and run that many threads. 680 memory_threads_ = -1; 681 invert_threads_ = 0; 682 fill_threads_ = 8; 683 check_threads_ = 0; 684 cpu_stress_threads_ = 0; 685 disk_threads_ = 0; 686 total_threads_ = 0; 687 688 region_mask_ = 0; 689 region_count_ = 0; 690 for (int i = 0; i < 32; i++) { 691 region_[i] = 0; 692 } 693 region_mode_ = 0; 694 695 errorcount_ = 0; 696 statuscount_ = 0; 697 698 valid_ = 0; 699 empty_ = 0; 700 finelock_q_ = 0; 701 // Default to use fine-grain lock for better performance. 702 pe_q_implementation_ = SAT_FINELOCK; 703 704 os_ = 0; 705 patternlist_ = 0; 706 logfilename_[0] = 0; 707 708 read_block_size_ = 512; 709 write_block_size_ = -1; 710 segment_size_ = -1; 711 cache_size_ = -1; 712 blocks_per_segment_ = -1; 713 read_threshold_ = -1; 714 write_threshold_ = -1; 715 non_destructive_ = 1; 716 monitor_mode_ = 0; 717 tag_mode_ = 0; 718 random_threads_ = 0; 719 720 pause_delay_ = 600; 721 pause_duration_ = 15; 722 } 723 724 // Destructor. 725 Sat::~Sat() { 726 // We need to have called Cleanup() at this point. 727 // We should probably enforce this. 728 } 729 730 731 #define ARG_KVALUE(argument, variable, value) \ 732 if (!strcmp(argv[i], argument)) { \ 733 variable = value; \ 734 continue; \ 735 } 736 737 #define ARG_IVALUE(argument, variable) \ 738 if (!strcmp(argv[i], argument)) { \ 739 i++; \ 740 if (i < argc) \ 741 variable = strtoull(argv[i], NULL, 0); \ 742 continue; \ 743 } 744 745 #define ARG_SVALUE(argument, variable) \ 746 if (!strcmp(argv[i], argument)) { \ 747 i++; \ 748 if (i < argc) \ 749 snprintf(variable, sizeof(variable), "%s", argv[i]); \ 750 continue; \ 751 } 752 753 // Configures SAT from command line arguments. 754 // This will call exit() given a request for 755 // self-documentation or unexpected args. 756 bool Sat::ParseArgs(int argc, char **argv) { 757 int i; 758 uint64 filesize = page_length_ * disk_pages_; 759 760 // Parse each argument. 761 for (i = 1; i < argc; i++) { 762 // Switch to fall back to corase-grain-lock queue. (for benchmarking) 763 ARG_KVALUE("--coarse_grain_lock", pe_q_implementation_, SAT_ONELOCK); 764 765 // Set number of megabyte to use. 766 ARG_IVALUE("-M", size_mb_); 767 768 // Set minimum megabytes of hugepages to require. 769 ARG_IVALUE("-H", min_hugepages_mbytes_); 770 771 // Set number of seconds to run. 772 ARG_IVALUE("-s", runtime_seconds_); 773 774 // Set number of memory copy threads. 775 ARG_IVALUE("-m", memory_threads_); 776 777 // Set number of memory invert threads. 778 ARG_IVALUE("-i", invert_threads_); 779 780 // Set number of check-only threads. 781 ARG_IVALUE("-c", check_threads_); 782 783 // Set number of cache line size datastructures. 784 ARG_IVALUE("--cc_inc_count", cc_inc_count_); 785 786 // Set number of cache line size datastructures 787 ARG_IVALUE("--cc_line_count", cc_cacheline_count_); 788 789 // Flag set when cache coherency tests need to be run 790 ARG_KVALUE("--cc_test", cc_test_, 1); 791 792 // Set number of CPU stress threads. 793 ARG_IVALUE("-C", cpu_stress_threads_); 794 795 // Set logfile name. 796 ARG_SVALUE("-l", logfilename_); 797 798 // Verbosity level. 799 ARG_IVALUE("-v", verbosity_); 800 801 // Set maximum number of errors to collect. Stop running after this many. 802 ARG_IVALUE("--max_errors", max_errorcount_); 803 804 // Set pattern block size. 805 ARG_IVALUE("-p", page_length_); 806 807 // Set pattern block size. 808 ARG_IVALUE("--filesize", filesize); 809 810 // NUMA options. 811 ARG_KVALUE("--local_numa", region_mode_, kLocalNuma); 812 ARG_KVALUE("--remote_numa", region_mode_, kRemoteNuma); 813 814 // Autodetect tempfile locations. 815 ARG_KVALUE("--findfiles", findfiles_, 1); 816 817 // Inject errors to force miscompare code paths 818 ARG_KVALUE("--force_errors", error_injection_, true); 819 ARG_KVALUE("--force_errors_like_crazy", crazy_error_injection_, true); 820 if (crazy_error_injection_) 821 error_injection_ = true; 822 823 // Stop immediately on any arror, for debugging HW problems. 824 ARG_KVALUE("--stop_on_errors", stop_on_error_, 1); 825 826 // Don't use internal error polling, allow external detection. 827 ARG_KVALUE("--no_errors", error_poll_, 0); 828 829 // Never check data as you go. 830 ARG_KVALUE("-F", strict_, 0); 831 832 // Warm the cpu as you go. 833 ARG_KVALUE("-W", warm_, 1); 834 835 // Allow runnign on unknown systems with base unimplemented OsLayer 836 ARG_KVALUE("-A", run_on_anything_, 1); 837 838 // Size of read blocks for disk test. 839 ARG_IVALUE("--read-block-size", read_block_size_); 840 841 // Size of write blocks for disk test. 842 ARG_IVALUE("--write-block-size", write_block_size_); 843 844 // Size of segment for disk test. 845 ARG_IVALUE("--segment-size", segment_size_); 846 847 // Size of disk cache size for disk test. 848 ARG_IVALUE("--cache-size", cache_size_); 849 850 // Number of blocks to test per segment. 851 ARG_IVALUE("--blocks-per-segment", blocks_per_segment_); 852 853 // Maximum time a block read should take before warning. 854 ARG_IVALUE("--read-threshold", read_threshold_); 855 856 // Maximum time a block write should take before warning. 857 ARG_IVALUE("--write-threshold", write_threshold_); 858 859 // Do not write anything to disk in the disk test. 860 ARG_KVALUE("--destructive", non_destructive_, 0); 861 862 // Run SAT in monitor mode. No test load at all. 863 ARG_KVALUE("--monitor_mode", monitor_mode_, true); 864 865 // Run SAT in address mode. Tag all cachelines by virt addr. 866 ARG_KVALUE("--tag_mode", tag_mode_, true); 867 868 // Dump range map of tested pages.. 869 ARG_KVALUE("--do_page_map", do_page_map_, true); 870 871 // Specify the physical address base to test. 872 ARG_IVALUE("--paddr_base", paddr_base_); 873 874 // Specify the frequency for power spikes. 875 ARG_IVALUE("--pause_delay", pause_delay_); 876 877 // Specify the duration of each pause (for power spikes). 878 ARG_IVALUE("--pause_duration", pause_duration_); 879 880 // Disk device names 881 if (!strcmp(argv[i], "-d")) { 882 i++; 883 if (i < argc) { 884 disk_threads_++; 885 diskfilename_.push_back(string(argv[i])); 886 blocktables_.push_back(new DiskBlockTable()); 887 } 888 continue; 889 } 890 891 // Set number of disk random threads for each disk write thread. 892 ARG_IVALUE("--random-threads", random_threads_); 893 894 // Set a tempfile to use in a file thread. 895 if (!strcmp(argv[i], "-f")) { 896 i++; 897 if (i < argc) { 898 file_threads_++; 899 filename_.push_back(string(argv[i])); 900 } 901 continue; 902 } 903 904 // Set a hostname to use in a network thread. 905 if (!strcmp(argv[i], "-n")) { 906 i++; 907 if (i < argc) { 908 net_threads_++; 909 ipaddrs_.push_back(string(argv[i])); 910 } 911 continue; 912 } 913 914 // Run threads that listen for incoming SAT net connections. 915 ARG_KVALUE("--listen", listen_threads_, 1); 916 917 if (CheckGoogleSpecificArgs(argc, argv, &i)) { 918 continue; 919 } 920 921 // Default: 922 PrintVersion(); 923 PrintHelp(); 924 if (strcmp(argv[i], "-h") && strcmp(argv[i], "--help")) { 925 printf("\n Unknown argument %s\n", argv[i]); 926 bad_status(); 927 exit(1); 928 } 929 // Forget it, we printed the help, just bail. 930 // We don't want to print test status, or any log parser stuff. 931 exit(0); 932 } 933 934 Logger::GlobalLogger()->SetVerbosity(verbosity_); 935 936 // Update relevant data members with parsed input. 937 // Translate MB into bytes. 938 size_ = static_cast<int64>(size_mb_) * kMegabyte; 939 940 // Set logfile flag. 941 if (strcmp(logfilename_, "")) 942 use_logfile_ = 1; 943 // Checks valid page length. 944 if (page_length_ && 945 !(page_length_ & (page_length_ - 1)) && 946 (page_length_ > 1023)) { 947 // Prints if we have changed from default. 948 if (page_length_ != kSatPageSize) 949 logprintf(12, "Log: Updating page size to %d\n", page_length_); 950 } else { 951 // Revert to default page length. 952 logprintf(6, "Process Error: " 953 "Invalid page size %d\n", page_length_); 954 page_length_ = kSatPageSize; 955 return false; 956 } 957 958 // Set disk_pages_ if filesize or page size changed. 959 if (filesize != static_cast<uint64>(page_length_) * 960 static_cast<uint64>(disk_pages_)) { 961 disk_pages_ = filesize / page_length_; 962 if (disk_pages_ == 0) 963 disk_pages_ = 1; 964 } 965 966 // Print each argument. 967 for (int i = 0; i < argc; i++) { 968 if (i) 969 cmdline_ += " "; 970 cmdline_ += argv[i]; 971 } 972 973 return true; 974 } 975 976 void Sat::PrintHelp() { 977 printf("Usage: ./sat(32|64) [options]\n" 978 " -M mbytes megabytes of ram to test\n" 979 " -H mbytes minimum megabytes of hugepages to require\n" 980 " -s seconds number of seconds to run\n" 981 " -m threads number of memory copy threads to run\n" 982 " -i threads number of memory invert threads to run\n" 983 " -C threads number of memory CPU stress threads to run\n" 984 " --findfiles find locations to do disk IO automatically\n" 985 " -d device add a direct write disk thread with block " 986 "device (or file) 'device'\n" 987 " -f filename add a disk thread with " 988 "tempfile 'filename'\n" 989 " -l logfile log output to file 'logfile'\n" 990 " --max_errors n exit early after finding 'n' errors\n" 991 " -v level verbosity (0-20), default is 8\n" 992 " -W Use more CPU-stressful memory copy\n" 993 " -A run in degraded mode on incompatible systems\n" 994 " -p pagesize size in bytes of memory chunks\n" 995 " --filesize size size of disk IO tempfiles\n" 996 " -n ipaddr add a network thread connecting to " 997 "system at 'ipaddr'\n" 998 " --listen run a thread to listen for and respond " 999 "to network threads.\n" 1000 " --no_errors run without checking for ECC or other errors\n" 1001 " --force_errors inject false errors to test error handling\n" 1002 " --force_errors_like_crazy inject a lot of false errors " 1003 "to test error handling\n" 1004 " -F don't result check each transaction\n" 1005 " --stop_on_errors Stop after finding the first error.\n" 1006 " --read-block-size size of block for reading (-d)\n" 1007 " --write-block-size size of block for writing (-d). If not " 1008 "defined, the size of block for writing will be defined as the " 1009 "size of block for reading\n" 1010 " --segment-size size of segments to split disk into (-d)\n" 1011 " --cache-size size of disk cache (-d)\n" 1012 " --blocks-per-segment number of blocks to read/write per " 1013 "segment per iteration (-d)\n" 1014 " --read-threshold maximum time (in us) a block read should " 1015 "take (-d)\n" 1016 " --write-threshold maximum time (in us) a block write " 1017 "should take (-d)\n" 1018 " --random-threads number of random threads for each disk " 1019 "write thread (-d)\n" 1020 " --destructive write/wipe disk partition (-d)\n" 1021 " --monitor_mode only do ECC error polling, no stress load.\n" 1022 " --cc_test do the cache coherency testing\n" 1023 " --cc_inc_count number of times to increment the " 1024 "cacheline's member\n" 1025 " --cc_line_count number of cache line sized datastructures " 1026 "to allocate for the cache coherency threads to operate\n" 1027 " --paddr_base allocate memory starting from this address\n" 1028 " --pause_delay delay (in seconds) between power spikes\n" 1029 " --pause_duration duration (in seconds) of each pause\n" 1030 " --local_numa : choose memory regions associated with " 1031 "each CPU to be tested by that CPU\n" 1032 " --remote_numa : choose memory regions not associated with " 1033 "each CPU to be tested by that CPU\n"); 1034 } 1035 1036 bool Sat::CheckGoogleSpecificArgs(int argc, char **argv, int *i) { 1037 // Do nothing, no google-specific argument on public stressapptest 1038 return false; 1039 } 1040 1041 void Sat::GoogleOsOptions(std::map<std::string, std::string> *options) { 1042 // Do nothing, no OS-specific argument on public stressapptest 1043 } 1044 1045 // Launch the SAT task threads. Returns 0 on error. 1046 void Sat::InitializeThreads() { 1047 // Memory copy threads. 1048 AcquireWorkerLock(); 1049 1050 logprintf(12, "Log: Starting worker threads\n"); 1051 WorkerVector *memory_vector = new WorkerVector(); 1052 1053 // Error polling thread. 1054 // This may detect ECC corrected errors, disk problems, or 1055 // any other errors normally hidden from userspace. 1056 WorkerVector *error_vector = new WorkerVector(); 1057 if (error_poll_) { 1058 ErrorPollThread *thread = new ErrorPollThread(); 1059 thread->InitThread(total_threads_++, this, os_, patternlist_, 1060 &continuous_status_); 1061 1062 error_vector->insert(error_vector->end(), thread); 1063 } else { 1064 logprintf(5, "Log: Skipping error poll thread due to --no_errors flag\n"); 1065 } 1066 workers_map_.insert(make_pair(kErrorType, error_vector)); 1067 1068 // Only start error poll threads for monitor-mode SAT, 1069 // skip all other types of worker threads. 1070 if (monitor_mode_) { 1071 ReleaseWorkerLock(); 1072 return; 1073 } 1074 1075 for (int i = 0; i < memory_threads_; i++) { 1076 CopyThread *thread = new CopyThread(); 1077 thread->InitThread(total_threads_++, this, os_, patternlist_, 1078 &power_spike_status_); 1079 1080 if ((region_count_ > 1) && (region_mode_)) { 1081 int32 region = region_find(i % region_count_); 1082 cpu_set_t *cpuset = os_->FindCoreMask(region); 1083 sat_assert(cpuset); 1084 if (region_mode_ == kLocalNuma) { 1085 // Choose regions associated with this CPU. 1086 thread->set_cpu_mask(cpuset); 1087 thread->set_tag(1 << region); 1088 } else if (region_mode_ == kRemoteNuma) { 1089 // Choose regions not associated with this CPU.. 1090 thread->set_cpu_mask(cpuset); 1091 thread->set_tag(region_mask_ & ~(1 << region)); 1092 } 1093 } else { 1094 cpu_set_t available_cpus; 1095 thread->AvailableCpus(&available_cpus); 1096 int cores = cpuset_count(&available_cpus); 1097 // Don't restrict thread location if we have more than one 1098 // thread per core. Not so good for performance. 1099 if (cpu_stress_threads_ + memory_threads_ <= cores) { 1100 // Place a thread on alternating cores first. 1101 // This assures interleaved core use with no overlap. 1102 int nthcore = i; 1103 int nthbit = (((2 * nthcore) % cores) + 1104 (((2 * nthcore) / cores) % 2)) % cores; 1105 cpu_set_t all_cores; 1106 cpuset_set_ab(&all_cores, 0, cores); 1107 if (!cpuset_isequal(&available_cpus, &all_cores)) { 1108 // We are assuming the bits are contiguous. 1109 // Complain if this is not so. 1110 logprintf(0, "Log: cores = %s, expected %s\n", 1111 cpuset_format(&available_cpus).c_str(), 1112 cpuset_format(&all_cores).c_str()); 1113 } 1114 1115 // Set thread affinity. 1116 thread->set_cpu_mask_to_cpu(nthbit); 1117 } 1118 } 1119 memory_vector->insert(memory_vector->end(), thread); 1120 } 1121 workers_map_.insert(make_pair(kMemoryType, memory_vector)); 1122 1123 // File IO threads. 1124 WorkerVector *fileio_vector = new WorkerVector(); 1125 for (int i = 0; i < file_threads_; i++) { 1126 FileThread *thread = new FileThread(); 1127 thread->InitThread(total_threads_++, this, os_, patternlist_, 1128 &power_spike_status_); 1129 thread->SetFile(filename_[i].c_str()); 1130 // Set disk threads high priority. They don't take much processor time, 1131 // but blocking them will delay disk IO. 1132 thread->SetPriority(WorkerThread::High); 1133 1134 fileio_vector->insert(fileio_vector->end(), thread); 1135 } 1136 workers_map_.insert(make_pair(kFileIOType, fileio_vector)); 1137 1138 // Net IO threads. 1139 WorkerVector *netio_vector = new WorkerVector(); 1140 WorkerVector *netslave_vector = new WorkerVector(); 1141 if (listen_threads_ > 0) { 1142 // Create a network slave thread. This listens for connections. 1143 NetworkListenThread *thread = new NetworkListenThread(); 1144 thread->InitThread(total_threads_++, this, os_, patternlist_, 1145 &continuous_status_); 1146 1147 netslave_vector->insert(netslave_vector->end(), thread); 1148 } 1149 for (int i = 0; i < net_threads_; i++) { 1150 NetworkThread *thread = new NetworkThread(); 1151 thread->InitThread(total_threads_++, this, os_, patternlist_, 1152 &continuous_status_); 1153 thread->SetIP(ipaddrs_[i].c_str()); 1154 1155 netio_vector->insert(netio_vector->end(), thread); 1156 } 1157 workers_map_.insert(make_pair(kNetIOType, netio_vector)); 1158 workers_map_.insert(make_pair(kNetSlaveType, netslave_vector)); 1159 1160 // Result check threads. 1161 WorkerVector *check_vector = new WorkerVector(); 1162 for (int i = 0; i < check_threads_; i++) { 1163 CheckThread *thread = new CheckThread(); 1164 thread->InitThread(total_threads_++, this, os_, patternlist_, 1165 &continuous_status_); 1166 1167 check_vector->insert(check_vector->end(), thread); 1168 } 1169 workers_map_.insert(make_pair(kCheckType, check_vector)); 1170 1171 // Memory invert threads. 1172 logprintf(12, "Log: Starting invert threads\n"); 1173 WorkerVector *invert_vector = new WorkerVector(); 1174 for (int i = 0; i < invert_threads_; i++) { 1175 InvertThread *thread = new InvertThread(); 1176 thread->InitThread(total_threads_++, this, os_, patternlist_, 1177 &continuous_status_); 1178 1179 invert_vector->insert(invert_vector->end(), thread); 1180 } 1181 workers_map_.insert(make_pair(kInvertType, invert_vector)); 1182 1183 // Disk stress threads. 1184 WorkerVector *disk_vector = new WorkerVector(); 1185 WorkerVector *random_vector = new WorkerVector(); 1186 logprintf(12, "Log: Starting disk stress threads\n"); 1187 for (int i = 0; i < disk_threads_; i++) { 1188 // Creating write threads 1189 DiskThread *thread = new DiskThread(blocktables_[i]); 1190 thread->InitThread(total_threads_++, this, os_, patternlist_, 1191 &power_spike_status_); 1192 thread->SetDevice(diskfilename_[i].c_str()); 1193 if (thread->SetParameters(read_block_size_, write_block_size_, 1194 segment_size_, cache_size_, 1195 blocks_per_segment_, 1196 read_threshold_, write_threshold_, 1197 non_destructive_)) { 1198 disk_vector->insert(disk_vector->end(), thread); 1199 } else { 1200 logprintf(12, "Log: DiskThread::SetParameters() failed\n"); 1201 delete thread; 1202 } 1203 1204 for (int j = 0; j < random_threads_; j++) { 1205 // Creating random threads 1206 RandomDiskThread *rthread = new RandomDiskThread(blocktables_[i]); 1207 rthread->InitThread(total_threads_++, this, os_, patternlist_, 1208 &power_spike_status_); 1209 rthread->SetDevice(diskfilename_[i].c_str()); 1210 if (rthread->SetParameters(read_block_size_, write_block_size_, 1211 segment_size_, cache_size_, 1212 blocks_per_segment_, 1213 read_threshold_, write_threshold_, 1214 non_destructive_)) { 1215 random_vector->insert(random_vector->end(), rthread); 1216 } else { 1217 logprintf(12, "Log: RandomDiskThread::SetParameters() failed\n"); 1218 delete rthread; 1219 } 1220 } 1221 } 1222 1223 workers_map_.insert(make_pair(kDiskType, disk_vector)); 1224 workers_map_.insert(make_pair(kRandomDiskType, random_vector)); 1225 1226 // CPU stress threads. 1227 WorkerVector *cpu_vector = new WorkerVector(); 1228 logprintf(12, "Log: Starting cpu stress threads\n"); 1229 for (int i = 0; i < cpu_stress_threads_; i++) { 1230 CpuStressThread *thread = new CpuStressThread(); 1231 thread->InitThread(total_threads_++, this, os_, patternlist_, 1232 &continuous_status_); 1233 1234 // Don't restrict thread location if we have more than one 1235 // thread per core. Not so good for performance. 1236 cpu_set_t available_cpus; 1237 thread->AvailableCpus(&available_cpus); 1238 int cores = cpuset_count(&available_cpus); 1239 if (cpu_stress_threads_ + memory_threads_ <= cores) { 1240 // Place a thread on alternating cores first. 1241 // Go in reverse order for CPU stress threads. This assures interleaved 1242 // core use with no overlap. 1243 int nthcore = (cores - 1) - i; 1244 int nthbit = (((2 * nthcore) % cores) + 1245 (((2 * nthcore) / cores) % 2)) % cores; 1246 cpu_set_t all_cores; 1247 cpuset_set_ab(&all_cores, 0, cores); 1248 if (!cpuset_isequal(&available_cpus, &all_cores)) { 1249 logprintf(0, "Log: cores = %s, expected %s\n", 1250 cpuset_format(&available_cpus).c_str(), 1251 cpuset_format(&all_cores).c_str()); 1252 } 1253 1254 // Set thread affinity. 1255 thread->set_cpu_mask_to_cpu(nthbit); 1256 } 1257 1258 1259 cpu_vector->insert(cpu_vector->end(), thread); 1260 } 1261 workers_map_.insert(make_pair(kCPUType, cpu_vector)); 1262 1263 // CPU Cache Coherency Threads - one for each core available. 1264 if (cc_test_) { 1265 WorkerVector *cc_vector = new WorkerVector(); 1266 logprintf(12, "Log: Starting cpu cache coherency threads\n"); 1267 1268 // Allocate the shared datastructure to be worked on by the threads. 1269 cc_cacheline_data_ = reinterpret_cast<cc_cacheline_data*>( 1270 malloc(sizeof(cc_cacheline_data) * cc_cacheline_count_)); 1271 sat_assert(cc_cacheline_data_ != NULL); 1272 1273 // Initialize the strucutre. 1274 memset(cc_cacheline_data_, 0, 1275 sizeof(cc_cacheline_data) * cc_cacheline_count_); 1276 1277 int num_cpus = CpuCount(); 1278 // Allocate all the nums once so that we get a single chunk 1279 // of contiguous memory. 1280 int *num; 1281 #ifdef HAVE_POSIX_MEMALIGN 1282 int err_result = posix_memalign( 1283 reinterpret_cast<void**>(&num), 1284 kCacheLineSize, sizeof(*num) * num_cpus * cc_cacheline_count_); 1285 #else 1286 num = reinterpret_cast<int*>(memalign(kCacheLineSize, 1287 sizeof(*num) * num_cpus * cc_cacheline_count_)); 1288 int err_result = (num == 0); 1289 #endif 1290 sat_assert(err_result == 0); 1291 1292 int cline; 1293 for (cline = 0; cline < cc_cacheline_count_; cline++) { 1294 memset(num, 0, sizeof(num_cpus) * num_cpus); 1295 cc_cacheline_data_[cline].num = num; 1296 num += num_cpus; 1297 } 1298 1299 int tnum; 1300 for (tnum = 0; tnum < num_cpus; tnum++) { 1301 CpuCacheCoherencyThread *thread = 1302 new CpuCacheCoherencyThread(cc_cacheline_data_, cc_cacheline_count_, 1303 tnum, cc_inc_count_); 1304 thread->InitThread(total_threads_++, this, os_, patternlist_, 1305 &continuous_status_); 1306 // Pin the thread to a particular core. 1307 thread->set_cpu_mask_to_cpu(tnum); 1308 1309 // Insert the thread into the vector. 1310 cc_vector->insert(cc_vector->end(), thread); 1311 } 1312 workers_map_.insert(make_pair(kCCType, cc_vector)); 1313 } 1314 ReleaseWorkerLock(); 1315 } 1316 1317 // Return the number of cpus actually present in the machine. 1318 int Sat::CpuCount() { 1319 return sysconf(_SC_NPROCESSORS_CONF); 1320 } 1321 1322 // Notify and reap worker threads. 1323 void Sat::JoinThreads() { 1324 logprintf(12, "Log: Joining worker threads\n"); 1325 power_spike_status_.StopWorkers(); 1326 continuous_status_.StopWorkers(); 1327 1328 AcquireWorkerLock(); 1329 for (WorkerMap::const_iterator map_it = workers_map_.begin(); 1330 map_it != workers_map_.end(); ++map_it) { 1331 for (WorkerVector::const_iterator it = map_it->second->begin(); 1332 it != map_it->second->end(); ++it) { 1333 logprintf(12, "Log: Joining thread %d\n", (*it)->ThreadID()); 1334 (*it)->JoinThread(); 1335 } 1336 } 1337 ReleaseWorkerLock(); 1338 1339 QueueStats(); 1340 1341 // Finish up result checking. 1342 // Spawn 4 check threads to minimize check time. 1343 logprintf(12, "Log: Finished countdown, begin to result check\n"); 1344 WorkerStatus reap_check_status; 1345 WorkerVector reap_check_vector; 1346 1347 // No need for check threads for monitor mode. 1348 if (!monitor_mode_) { 1349 // Initialize the check threads. 1350 for (int i = 0; i < fill_threads_; i++) { 1351 CheckThread *thread = new CheckThread(); 1352 thread->InitThread(total_threads_++, this, os_, patternlist_, 1353 &reap_check_status); 1354 logprintf(12, "Log: Finished countdown, begin to result check\n"); 1355 reap_check_vector.push_back(thread); 1356 } 1357 } 1358 1359 reap_check_status.Initialize(); 1360 // Check threads should be marked to stop ASAP. 1361 reap_check_status.StopWorkers(); 1362 1363 // Spawn the check threads. 1364 for (WorkerVector::const_iterator it = reap_check_vector.begin(); 1365 it != reap_check_vector.end(); ++it) { 1366 logprintf(12, "Log: Spawning thread %d\n", (*it)->ThreadID()); 1367 (*it)->SpawnThread(); 1368 } 1369 1370 // Join the check threads. 1371 for (WorkerVector::const_iterator it = reap_check_vector.begin(); 1372 it != reap_check_vector.end(); ++it) { 1373 logprintf(12, "Log: Joining thread %d\n", (*it)->ThreadID()); 1374 (*it)->JoinThread(); 1375 } 1376 1377 // Reap all children. Stopped threads should have already ended. 1378 // Result checking threads will end when they have finished 1379 // result checking. 1380 logprintf(12, "Log: Join all outstanding threads\n"); 1381 1382 // Find all errors. 1383 errorcount_ = GetTotalErrorCount(); 1384 1385 AcquireWorkerLock(); 1386 for (WorkerMap::const_iterator map_it = workers_map_.begin(); 1387 map_it != workers_map_.end(); ++map_it) { 1388 for (WorkerVector::const_iterator it = map_it->second->begin(); 1389 it != map_it->second->end(); ++it) { 1390 logprintf(12, "Log: Reaping thread status %d\n", (*it)->ThreadID()); 1391 if ((*it)->GetStatus() != 1) { 1392 logprintf(0, "Process Error: Thread %d failed with status %d at " 1393 "%.2f seconds\n", 1394 (*it)->ThreadID(), (*it)->GetStatus(), 1395 (*it)->GetRunDurationUSec()*1.0/1000000); 1396 bad_status(); 1397 } 1398 int priority = 12; 1399 if ((*it)->GetErrorCount()) 1400 priority = 5; 1401 logprintf(priority, "Log: Thread %d found %lld hardware incidents\n", 1402 (*it)->ThreadID(), (*it)->GetErrorCount()); 1403 } 1404 } 1405 ReleaseWorkerLock(); 1406 1407 1408 // Add in any errors from check threads. 1409 for (WorkerVector::const_iterator it = reap_check_vector.begin(); 1410 it != reap_check_vector.end(); ++it) { 1411 logprintf(12, "Log: Reaping thread status %d\n", (*it)->ThreadID()); 1412 if ((*it)->GetStatus() != 1) { 1413 logprintf(0, "Process Error: Thread %d failed with status %d at " 1414 "%.2f seconds\n", 1415 (*it)->ThreadID(), (*it)->GetStatus(), 1416 (*it)->GetRunDurationUSec()*1.0/1000000); 1417 bad_status(); 1418 } 1419 errorcount_ += (*it)->GetErrorCount(); 1420 int priority = 12; 1421 if ((*it)->GetErrorCount()) 1422 priority = 5; 1423 logprintf(priority, "Log: Thread %d found %lld hardware incidents\n", 1424 (*it)->ThreadID(), (*it)->GetErrorCount()); 1425 delete (*it); 1426 } 1427 reap_check_vector.clear(); 1428 reap_check_status.Destroy(); 1429 } 1430 1431 // Print queuing information. 1432 void Sat::QueueStats() { 1433 finelock_q_->QueueAnalysis(); 1434 } 1435 1436 void Sat::AnalysisAllStats() { 1437 float max_runtime_sec = 0.; 1438 float total_data = 0.; 1439 float total_bandwidth = 0.; 1440 float thread_runtime_sec = 0.; 1441 1442 for (WorkerMap::const_iterator map_it = workers_map_.begin(); 1443 map_it != workers_map_.end(); ++map_it) { 1444 for (WorkerVector::const_iterator it = map_it->second->begin(); 1445 it != map_it->second->end(); ++it) { 1446 thread_runtime_sec = (*it)->GetRunDurationUSec()*1.0/1000000; 1447 total_data += (*it)->GetMemoryCopiedData(); 1448 total_data += (*it)->GetDeviceCopiedData(); 1449 if (thread_runtime_sec > max_runtime_sec) { 1450 max_runtime_sec = thread_runtime_sec; 1451 } 1452 } 1453 } 1454 1455 total_bandwidth = total_data / max_runtime_sec; 1456 1457 logprintf(0, "Stats: Completed: %.2fM in %.2fs %.2fMB/s, " 1458 "with %d hardware incidents, %d errors\n", 1459 total_data, 1460 max_runtime_sec, 1461 total_bandwidth, 1462 errorcount_, 1463 statuscount_); 1464 } 1465 1466 void Sat::MemoryStats() { 1467 float memcopy_data = 0.; 1468 float memcopy_bandwidth = 0.; 1469 WorkerMap::const_iterator mem_it = workers_map_.find( 1470 static_cast<int>(kMemoryType)); 1471 WorkerMap::const_iterator file_it = workers_map_.find( 1472 static_cast<int>(kFileIOType)); 1473 sat_assert(mem_it != workers_map_.end()); 1474 sat_assert(file_it != workers_map_.end()); 1475 for (WorkerVector::const_iterator it = mem_it->second->begin(); 1476 it != mem_it->second->end(); ++it) { 1477 memcopy_data += (*it)->GetMemoryCopiedData(); 1478 memcopy_bandwidth += (*it)->GetMemoryBandwidth(); 1479 } 1480 for (WorkerVector::const_iterator it = file_it->second->begin(); 1481 it != file_it->second->end(); ++it) { 1482 memcopy_data += (*it)->GetMemoryCopiedData(); 1483 memcopy_bandwidth += (*it)->GetMemoryBandwidth(); 1484 } 1485 GoogleMemoryStats(&memcopy_data, &memcopy_bandwidth); 1486 logprintf(4, "Stats: Memory Copy: %.2fM at %.2fMB/s\n", 1487 memcopy_data, 1488 memcopy_bandwidth); 1489 } 1490 1491 void Sat::GoogleMemoryStats(float *memcopy_data, 1492 float *memcopy_bandwidth) { 1493 // Do nothing, should be implemented by subclasses. 1494 } 1495 1496 void Sat::FileStats() { 1497 float file_data = 0.; 1498 float file_bandwidth = 0.; 1499 WorkerMap::const_iterator file_it = workers_map_.find( 1500 static_cast<int>(kFileIOType)); 1501 sat_assert(file_it != workers_map_.end()); 1502 for (WorkerVector::const_iterator it = file_it->second->begin(); 1503 it != file_it->second->end(); ++it) { 1504 file_data += (*it)->GetDeviceCopiedData(); 1505 file_bandwidth += (*it)->GetDeviceBandwidth(); 1506 } 1507 logprintf(4, "Stats: File Copy: %.2fM at %.2fMB/s\n", 1508 file_data, 1509 file_bandwidth); 1510 } 1511 1512 void Sat::CheckStats() { 1513 float check_data = 0.; 1514 float check_bandwidth = 0.; 1515 WorkerMap::const_iterator check_it = workers_map_.find( 1516 static_cast<int>(kCheckType)); 1517 sat_assert(check_it != workers_map_.end()); 1518 for (WorkerVector::const_iterator it = check_it->second->begin(); 1519 it != check_it->second->end(); ++it) { 1520 check_data += (*it)->GetMemoryCopiedData(); 1521 check_bandwidth += (*it)->GetMemoryBandwidth(); 1522 } 1523 logprintf(4, "Stats: Data Check: %.2fM at %.2fMB/s\n", 1524 check_data, 1525 check_bandwidth); 1526 } 1527 1528 void Sat::NetStats() { 1529 float net_data = 0.; 1530 float net_bandwidth = 0.; 1531 WorkerMap::const_iterator netio_it = workers_map_.find( 1532 static_cast<int>(kNetIOType)); 1533 WorkerMap::const_iterator netslave_it = workers_map_.find( 1534 static_cast<int>(kNetSlaveType)); 1535 sat_assert(netio_it != workers_map_.end()); 1536 sat_assert(netslave_it != workers_map_.end()); 1537 for (WorkerVector::const_iterator it = netio_it->second->begin(); 1538 it != netio_it->second->end(); ++it) { 1539 net_data += (*it)->GetDeviceCopiedData(); 1540 net_bandwidth += (*it)->GetDeviceBandwidth(); 1541 } 1542 for (WorkerVector::const_iterator it = netslave_it->second->begin(); 1543 it != netslave_it->second->end(); ++it) { 1544 net_data += (*it)->GetDeviceCopiedData(); 1545 net_bandwidth += (*it)->GetDeviceBandwidth(); 1546 } 1547 logprintf(4, "Stats: Net Copy: %.2fM at %.2fMB/s\n", 1548 net_data, 1549 net_bandwidth); 1550 } 1551 1552 void Sat::InvertStats() { 1553 float invert_data = 0.; 1554 float invert_bandwidth = 0.; 1555 WorkerMap::const_iterator invert_it = workers_map_.find( 1556 static_cast<int>(kInvertType)); 1557 sat_assert(invert_it != workers_map_.end()); 1558 for (WorkerVector::const_iterator it = invert_it->second->begin(); 1559 it != invert_it->second->end(); ++it) { 1560 invert_data += (*it)->GetMemoryCopiedData(); 1561 invert_bandwidth += (*it)->GetMemoryBandwidth(); 1562 } 1563 logprintf(4, "Stats: Invert Data: %.2fM at %.2fMB/s\n", 1564 invert_data, 1565 invert_bandwidth); 1566 } 1567 1568 void Sat::DiskStats() { 1569 float disk_data = 0.; 1570 float disk_bandwidth = 0.; 1571 WorkerMap::const_iterator disk_it = workers_map_.find( 1572 static_cast<int>(kDiskType)); 1573 WorkerMap::const_iterator random_it = workers_map_.find( 1574 static_cast<int>(kRandomDiskType)); 1575 sat_assert(disk_it != workers_map_.end()); 1576 sat_assert(random_it != workers_map_.end()); 1577 for (WorkerVector::const_iterator it = disk_it->second->begin(); 1578 it != disk_it->second->end(); ++it) { 1579 disk_data += (*it)->GetDeviceCopiedData(); 1580 disk_bandwidth += (*it)->GetDeviceBandwidth(); 1581 } 1582 for (WorkerVector::const_iterator it = random_it->second->begin(); 1583 it != random_it->second->end(); ++it) { 1584 disk_data += (*it)->GetDeviceCopiedData(); 1585 disk_bandwidth += (*it)->GetDeviceBandwidth(); 1586 } 1587 1588 logprintf(4, "Stats: Disk: %.2fM at %.2fMB/s\n", 1589 disk_data, 1590 disk_bandwidth); 1591 } 1592 1593 // Process worker thread data for bandwidth information, and error results. 1594 // You can add more methods here just subclassing SAT. 1595 void Sat::RunAnalysis() { 1596 AnalysisAllStats(); 1597 MemoryStats(); 1598 FileStats(); 1599 NetStats(); 1600 CheckStats(); 1601 InvertStats(); 1602 DiskStats(); 1603 } 1604 1605 // Get total error count, summing across all threads.. 1606 int64 Sat::GetTotalErrorCount() { 1607 int64 errors = 0; 1608 1609 AcquireWorkerLock(); 1610 for (WorkerMap::const_iterator map_it = workers_map_.begin(); 1611 map_it != workers_map_.end(); ++map_it) { 1612 for (WorkerVector::const_iterator it = map_it->second->begin(); 1613 it != map_it->second->end(); ++it) { 1614 errors += (*it)->GetErrorCount(); 1615 } 1616 } 1617 ReleaseWorkerLock(); 1618 return errors; 1619 } 1620 1621 1622 void Sat::SpawnThreads() { 1623 logprintf(12, "Log: Initializing WorkerStatus objects\n"); 1624 power_spike_status_.Initialize(); 1625 continuous_status_.Initialize(); 1626 logprintf(12, "Log: Spawning worker threads\n"); 1627 for (WorkerMap::const_iterator map_it = workers_map_.begin(); 1628 map_it != workers_map_.end(); ++map_it) { 1629 for (WorkerVector::const_iterator it = map_it->second->begin(); 1630 it != map_it->second->end(); ++it) { 1631 logprintf(12, "Log: Spawning thread %d\n", (*it)->ThreadID()); 1632 (*it)->SpawnThread(); 1633 } 1634 } 1635 } 1636 1637 // Delete used worker thread objects. 1638 void Sat::DeleteThreads() { 1639 logprintf(12, "Log: Deleting worker threads\n"); 1640 for (WorkerMap::const_iterator map_it = workers_map_.begin(); 1641 map_it != workers_map_.end(); ++map_it) { 1642 for (WorkerVector::const_iterator it = map_it->second->begin(); 1643 it != map_it->second->end(); ++it) { 1644 logprintf(12, "Log: Deleting thread %d\n", (*it)->ThreadID()); 1645 delete (*it); 1646 } 1647 delete map_it->second; 1648 } 1649 workers_map_.clear(); 1650 logprintf(12, "Log: Destroying WorkerStatus objects\n"); 1651 power_spike_status_.Destroy(); 1652 continuous_status_.Destroy(); 1653 } 1654 1655 namespace { 1656 // Calculates the next time an action in Sat::Run() should occur, based on a 1657 // schedule derived from a start point and a regular frequency. 1658 // 1659 // Using frequencies instead of intervals with their accompanying drift allows 1660 // users to better predict when the actions will occur throughout a run. 1661 // 1662 // Arguments: 1663 // frequency: seconds 1664 // start: unixtime 1665 // now: unixtime 1666 // 1667 // Returns: unixtime 1668 inline time_t NextOccurance(time_t frequency, time_t start, time_t now) { 1669 return start + frequency + (((now - start) / frequency) * frequency); 1670 } 1671 } 1672 1673 // Run the actual test. 1674 bool Sat::Run() { 1675 // Install signal handlers to gracefully exit in the middle of a run. 1676 // 1677 // Why go through this whole rigmarole? It's the only standards-compliant 1678 // (C++ and POSIX) way to handle signals in a multithreaded program. 1679 // Specifically: 1680 // 1681 // 1) (C++) The value of a variable not of type "volatile sig_atomic_t" is 1682 // unspecified upon entering a signal handler and, if modified by the 1683 // handler, is unspecified after leaving the handler. 1684 // 1685 // 2) (POSIX) After the value of a variable is changed in one thread, another 1686 // thread is only guaranteed to see the new value after both threads have 1687 // acquired or released the same mutex or rwlock, synchronized to the 1688 // same barrier, or similar. 1689 // 1690 // #1 prevents the use of #2 in a signal handler, so the signal handler must 1691 // be called in the same thread that reads the "volatile sig_atomic_t" 1692 // variable it sets. We enforce that by blocking the signals in question in 1693 // the worker threads, forcing them to be handled by this thread. 1694 logprintf(12, "Log: Installing signal handlers\n"); 1695 sigset_t new_blocked_signals; 1696 sigemptyset(&new_blocked_signals); 1697 sigaddset(&new_blocked_signals, SIGINT); 1698 sigaddset(&new_blocked_signals, SIGTERM); 1699 sigset_t prev_blocked_signals; 1700 pthread_sigmask(SIG_BLOCK, &new_blocked_signals, &prev_blocked_signals); 1701 sighandler_t prev_sigint_handler = signal(SIGINT, SatHandleBreak); 1702 sighandler_t prev_sigterm_handler = signal(SIGTERM, SatHandleBreak); 1703 1704 // Kick off all the worker threads. 1705 logprintf(12, "Log: Launching worker threads\n"); 1706 InitializeThreads(); 1707 SpawnThreads(); 1708 pthread_sigmask(SIG_SETMASK, &prev_blocked_signals, NULL); 1709 1710 logprintf(12, "Log: Starting countdown with %d seconds\n", runtime_seconds_); 1711 1712 // In seconds. 1713 static const time_t kSleepFrequency = 5; 1714 // All of these are in seconds. You probably want them to be >= 1715 // kSleepFrequency and multiples of kSleepFrequency, but neither is necessary. 1716 static const time_t kInjectionFrequency = 10; 1717 static const time_t kPrintFrequency = 10; 1718 1719 const time_t start = time(NULL); 1720 const time_t end = start + runtime_seconds_; 1721 time_t now = start; 1722 time_t next_print = start + kPrintFrequency; 1723 time_t next_pause = start + pause_delay_; 1724 time_t next_resume = 0; 1725 time_t next_injection; 1726 if (crazy_error_injection_) { 1727 next_injection = start + kInjectionFrequency; 1728 } else { 1729 next_injection = 0; 1730 } 1731 1732 while (now < end) { 1733 // This is an int because it's for logprintf(). 1734 const int seconds_remaining = end - now; 1735 1736 if (user_break_) { 1737 // Handle early exit. 1738 logprintf(0, "Log: User exiting early (%d seconds remaining)\n", 1739 seconds_remaining); 1740 break; 1741 } 1742 1743 // If we have an error limit, check it here and see if we should exit. 1744 if (max_errorcount_ != 0) { 1745 uint64 errors = GetTotalErrorCount(); 1746 if (errors > max_errorcount_) { 1747 logprintf(0, "Log: Exiting early (%d seconds remaining) " 1748 "due to excessive failures (%lld)\n", 1749 seconds_remaining, 1750 errors); 1751 break; 1752 } 1753 } 1754 1755 if (now >= next_print) { 1756 // Print a count down message. 1757 logprintf(5, "Log: Seconds remaining: %d\n", seconds_remaining); 1758 next_print = NextOccurance(kPrintFrequency, start, now); 1759 } 1760 1761 if (next_injection && now >= next_injection) { 1762 // Inject an error. 1763 logprintf(4, "Log: Injecting error (%d seconds remaining)\n", 1764 seconds_remaining); 1765 struct page_entry src; 1766 GetValid(&src); 1767 src.pattern = patternlist_->GetPattern(0); 1768 PutValid(&src); 1769 next_injection = NextOccurance(kInjectionFrequency, start, now); 1770 } 1771 1772 if (next_pause && now >= next_pause) { 1773 // Tell worker threads to pause in preparation for a power spike. 1774 logprintf(4, "Log: Pausing worker threads in preparation for power spike " 1775 "(%d seconds remaining)\n", seconds_remaining); 1776 power_spike_status_.PauseWorkers(); 1777 logprintf(12, "Log: Worker threads paused\n"); 1778 next_pause = 0; 1779 next_resume = now + pause_duration_; 1780 } 1781 1782 if (next_resume && now >= next_resume) { 1783 // Tell worker threads to resume in order to cause a power spike. 1784 logprintf(4, "Log: Resuming worker threads to cause a power spike (%d " 1785 "seconds remaining)\n", seconds_remaining); 1786 power_spike_status_.ResumeWorkers(); 1787 logprintf(12, "Log: Worker threads resumed\n"); 1788 next_pause = NextOccurance(pause_delay_, start, now); 1789 next_resume = 0; 1790 } 1791 1792 sat_sleep(NextOccurance(kSleepFrequency, start, now) - now); 1793 now = time(NULL); 1794 } 1795 1796 JoinThreads(); 1797 1798 logprintf(0, "Stats: Found %lld hardware incidents\n", errorcount_); 1799 1800 if (!monitor_mode_) 1801 RunAnalysis(); 1802 1803 DeleteThreads(); 1804 1805 logprintf(12, "Log: Uninstalling signal handlers\n"); 1806 signal(SIGINT, prev_sigint_handler); 1807 signal(SIGTERM, prev_sigterm_handler); 1808 1809 return true; 1810 } 1811 1812 // Clean up all resources. 1813 bool Sat::Cleanup() { 1814 g_sat = NULL; 1815 Logger::GlobalLogger()->StopThread(); 1816 Logger::GlobalLogger()->SetStdoutOnly(); 1817 if (logfile_) { 1818 close(logfile_); 1819 logfile_ = 0; 1820 } 1821 if (patternlist_) { 1822 patternlist_->Destroy(); 1823 delete patternlist_; 1824 patternlist_ = 0; 1825 } 1826 if (os_) { 1827 os_->FreeTestMem(); 1828 delete os_; 1829 os_ = 0; 1830 } 1831 if (empty_) { 1832 delete empty_; 1833 empty_ = 0; 1834 } 1835 if (valid_) { 1836 delete valid_; 1837 valid_ = 0; 1838 } 1839 if (finelock_q_) { 1840 delete finelock_q_; 1841 finelock_q_ = 0; 1842 } 1843 if (page_bitmap_) { 1844 delete[] page_bitmap_; 1845 } 1846 1847 for (size_t i = 0; i < blocktables_.size(); i++) { 1848 delete blocktables_[i]; 1849 } 1850 1851 if (cc_cacheline_data_) { 1852 // The num integer arrays for all the cacheline structures are 1853 // allocated as a single chunk. The pointers in the cacheline struct 1854 // are populated accordingly. Hence calling free on the first 1855 // cacheline's num's address is going to free the entire array. 1856 // TODO(aganti): Refactor this to have a class for the cacheline 1857 // structure (currently defined in worker.h) and clean this up 1858 // in the destructor of that class. 1859 if (cc_cacheline_data_[0].num) { 1860 free(cc_cacheline_data_[0].num); 1861 } 1862 free(cc_cacheline_data_); 1863 } 1864 1865 sat_assert(0 == pthread_mutex_destroy(&worker_lock_)); 1866 1867 return true; 1868 } 1869 1870 1871 // Pretty print really obvious results. 1872 bool Sat::PrintResults() { 1873 bool result = true; 1874 1875 logprintf(4, "\n"); 1876 if (statuscount_) { 1877 logprintf(4, "Status: FAIL - test encountered procedural errors\n"); 1878 result = false; 1879 } else if (errorcount_) { 1880 logprintf(4, "Status: FAIL - test discovered HW problems\n"); 1881 result = false; 1882 } else { 1883 logprintf(4, "Status: PASS - please verify no corrected errors\n"); 1884 } 1885 logprintf(4, "\n"); 1886 1887 return result; 1888 } 1889 1890 // Helper functions. 1891 void Sat::AcquireWorkerLock() { 1892 sat_assert(0 == pthread_mutex_lock(&worker_lock_)); 1893 } 1894 void Sat::ReleaseWorkerLock() { 1895 sat_assert(0 == pthread_mutex_unlock(&worker_lock_)); 1896 } 1897 1898 void logprintf(int priority, const char *format, ...) { 1899 va_list args; 1900 va_start(args, format); 1901 Logger::GlobalLogger()->VLogF(priority, format, args); 1902 va_end(args); 1903 } 1904