Home | History | Annotate | Download | only in src
      1 // Copyright 2008 Google Inc. All Rights Reserved.
      2 
      3 // Licensed under the Apache License, Version 2.0 (the "License");
      4 // you may not use this file except in compliance with the License.
      5 // You may obtain a copy of the License at
      6 
      7 //      http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 // Unless required by applicable law or agreed to in writing, software
     10 // distributed under the License is distributed on an "AS IS" BASIS,
     11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 // See the License for the specific language governing permissions and
     13 // limitations under the License.
     14 
// error_diag.cc: Collects device errors for analysis so that the failed
//                component can be pinpointed more accurately.
     17 
#include <list>
#include <map>
#include <new>
#include <set>
     21 
     22 // This file must work with autoconf on its public version,
     23 // so these includes are correct.
     24 #include "error_diag.h"
     25 #include "sattypes.h"
     26 
     27 
     28 // DeviceTree constructor.
     29 DeviceTree::DeviceTree(string name)
     30   : parent_(0), name_(name) {
     31   pthread_mutex_init(&device_tree_mutex_, NULL);
     32 }
     33 
     34 // DeviceTree destructor.
     35 DeviceTree::~DeviceTree() {
     36   // Deallocate subtree devices.
     37   for (std::map<string, DeviceTree*>::iterator itr = subdevices_.begin();
     38       itr != subdevices_.end();
     39       ++itr) {
     40     delete itr->second;
     41   }
     42   // Deallocate device errors.
     43   for (std::list<ErrorInstance*>::iterator itr = errors_.begin();
     44       itr != errors_.end();
     45       ++itr) {
     46     delete (*itr);
     47   }
     48   pthread_mutex_destroy(&device_tree_mutex_);
     49 }
     50 
     51 // Atomically find named device in sub device tree.
     52 // Returns 0 if not found
     53 DeviceTree *DeviceTree::FindInSubTree(string name) {
     54   DeviceTree *ret;
     55   pthread_mutex_lock(&device_tree_mutex_);
     56   ret = UnlockedFindInSubTree(name);
     57   pthread_mutex_unlock(&device_tree_mutex_);
     58   return ret;
     59 }
     60 
     61 // Find named device in sub device tree (Non-atomic).
     62 // Returns 0 if not found
     63 DeviceTree *DeviceTree::UnlockedFindInSubTree(string name) {
     64   std::map<string, DeviceTree*>::iterator itr = subdevices_.find(name);
     65   if (itr != subdevices_.end()) {
     66     return itr->second;
     67   } else {
     68     // Search sub-tree.
     69     for (std::map<string, DeviceTree*>::iterator itr = subdevices_.begin();
     70         itr != subdevices_.end();
     71         ++itr) {
     72       DeviceTree *result = itr->second->UnlockedFindInSubTree(name);
     73       if (result != 0)
     74         return result;
     75     }
     76     return 0;
     77   }
     78 }
     79 
     80 // Atomically add error instance to device.
     81 void DeviceTree::AddErrorInstance(ErrorInstance *error_instance) {
     82   pthread_mutex_lock(&device_tree_mutex_);
     83   errors_.push_back(error_instance);
     84   pthread_mutex_unlock(&device_tree_mutex_);
     85 }
     86 
     87 // Find or add queried device as necessary.
     88 DeviceTree *DeviceTree::FindOrAddDevice(string name) {
     89   // Assume named device does not exist and try to insert the device anyway.
     90   // No-op if named device already exists.
     91   InsertSubDevice(name);
     92   // Find and return sub device pointer.
     93   return FindInSubTree(name);
     94 }
     95 
     96 // Pretty prints device tree.
     97 void DeviceTree::PrettyPrint(string spacer) {
     98   for (std::map<string, DeviceTree*>::iterator itr = subdevices_.begin();
     99       itr != subdevices_.end();
    100       ++itr) {
    101     printf("%s%s\n", spacer.c_str(), itr->first.c_str());
    102     itr->second->PrettyPrint(spacer+spacer);
    103   }
    104 }
    105 
    106 // Atomically add sub device.
    107 // No-op if named device already exists.
    108 void DeviceTree::InsertSubDevice(string name) {
    109   pthread_mutex_lock(&device_tree_mutex_);
    110   if (UnlockedFindInSubTree(name) != 0) {
    111     pthread_mutex_unlock(&device_tree_mutex_);
    112     return;
    113   }
    114   subdevices_[name] = new DeviceTree(name);
    115   subdevices_[name]->parent_ = this;
    116   pthread_mutex_unlock(&device_tree_mutex_);
    117 }
    118 
    119 
    120 // Returns true of any error associated with this device is fatal.
    121 bool DeviceTree::KnownBad() {
    122   pthread_mutex_lock(&device_tree_mutex_);
    123   for (std::list<ErrorInstance*>::iterator itr = errors_.begin();
    124       itr != errors_.end();
    125       ++itr) {
    126     if ((*itr)->severity_ == SAT_ERROR_FATAL) {
    127       pthread_mutex_unlock(&device_tree_mutex_);
    128       return true;
    129     }
    130   }
    131   pthread_mutex_unlock(&device_tree_mutex_);
    132   return false;
    133 }
    134 
    135 
    136 // ErrorDiag constructor.
    137 ErrorDiag::ErrorDiag() {
    138   os_ = 0;
    139   system_tree_root_ = 0;
    140 }
    141 
    142 // ErrorDiag destructor.
    143 ErrorDiag::~ErrorDiag() {
    144   if (system_tree_root_)
    145     delete system_tree_root_;
    146 }
    147 
    148 // Set platform specific handle and initialize device tree.
    149 // Returns false on error. true otherwise.
    150 bool ErrorDiag::set_os(OsLayer *os) {
    151   os_ = os;
    152   return(InitializeDeviceTree());
    153 }
    154 
    155 // Create and initialize system device tree.
    156 // Returns false on error. true otherwise.
    157 bool ErrorDiag::InitializeDeviceTree() {
    158   system_tree_root_ = new DeviceTree("system_root");
    159   if (!system_tree_root_)
    160     return false;
    161   return true;
    162 }
    163 
    164 // Logs info about a CECC.
    165 // Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise.
    166 int ErrorDiag::AddCeccError(string dimm_string) {
    167   DeviceTree *dimm_device = system_tree_root_->FindOrAddDevice(dimm_string);
    168   ECCErrorInstance *error = new ECCErrorInstance;
    169   if (!error)
    170     return -1;
    171   error->severity_ = SAT_ERROR_CORRECTABLE;
    172   dimm_device->AddErrorInstance(error);
    173   return 0;
    174 }
    175 
    176 // Logs info about a UECC.
    177 // Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise.
    178 int ErrorDiag::AddUeccError(string dimm_string) {
    179   DeviceTree *dimm_device = system_tree_root_->FindOrAddDevice(dimm_string);
    180   ECCErrorInstance *error = new ECCErrorInstance;
    181   if (!error)
    182     return -1;
    183   error->severity_ = SAT_ERROR_FATAL;
    184   dimm_device->AddErrorInstance(error);
    185   return 0;
    186 }
    187 
    188 // Logs info about a miscompare.
    189 // Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise.
    190 int ErrorDiag::AddMiscompareError(string dimm_string, uint64 addr, int count) {
    191   DeviceTree *dimm_device = system_tree_root_->FindOrAddDevice(dimm_string);
    192   MiscompareErrorInstance *error = new MiscompareErrorInstance;
    193   if (!error)
    194     return -1;
    195   error->severity_ = SAT_ERROR_FATAL;
    196   error->addr_ = addr;
    197   dimm_device->AddErrorInstance(error);
    198   os_->ErrorReport(dimm_string.c_str(), "miscompare", count);
    199   return 1;
    200 }
    201 
    202 // Utility Function to translate a virtual address to DIMM number.
    203 // Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise.
    204 string ErrorDiag::AddressToDimmString(OsLayer *os, void *addr, int offset) {
    205   char dimm_string[256] = "";
    206   char *vbyteaddr = reinterpret_cast<char*>(addr) + offset;
    207   uint64 paddr = os->VirtualToPhysical(vbyteaddr);
    208   os->FindDimm(paddr, dimm_string, sizeof(dimm_string));
    209   return string(dimm_string);
    210 }
    211 
    212 // Info about a miscompare from a drive.
    213 // Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise.
    214 int ErrorDiag::AddHDDMiscompareError(string devicename, int block, int offset,
    215                                      void *src_addr, void *dst_addr) {
    216   bool mask_hdd_error = false;
    217 
    218   HDDMiscompareErrorInstance *error = new HDDMiscompareErrorInstance;
    219   if (!error)
    220     return -1;
    221 
    222   error->addr_ = reinterpret_cast<uint64>(src_addr);
    223   error->addr2_ = reinterpret_cast<uint64>(dst_addr);
    224   error->offset_ = offset;
    225   error->block_ = block;
    226 
    227   string src_dimm = AddressToDimmString(os_, src_addr, offset);
    228   string dst_dimm = AddressToDimmString(os_, dst_addr, offset);
    229 
    230   // DIMM name look up success
    231   if (src_dimm.compare("DIMM Unknown")) {
    232     // Add src DIMM as possible miscompare cause.
    233     DeviceTree *src_dimm_dev = system_tree_root_->FindOrAddDevice(src_dimm);
    234     error->causes_.insert(src_dimm_dev);
    235     if (src_dimm_dev->KnownBad()) {
    236       mask_hdd_error = true;
    237       logprintf(5, "Log: supressed %s miscompare report: "
    238                 "known bad source: %s\n", devicename.c_str(), src_dimm.c_str());
    239     }
    240   }
    241   if (dst_dimm.compare("DIMM Unknown")) {
    242     // Add dst DIMM as possible miscompare cause.
    243     DeviceTree *dst_dimm_dev = system_tree_root_->FindOrAddDevice(dst_dimm);
    244     error->causes_.insert(dst_dimm_dev);
    245     if (dst_dimm_dev->KnownBad()) {
    246       mask_hdd_error = true;
    247       logprintf(5, "Log: supressed %s miscompare report: "
    248                 "known bad destination: %s\n", devicename.c_str(),
    249                 dst_dimm.c_str());
    250     }
    251   }
    252 
    253   DeviceTree *hdd_dev = system_tree_root_->FindOrAddDevice(devicename);
    254   hdd_dev->AddErrorInstance(error);
    255 
    256   // HDD error was not masked by bad DIMMs: report bad HDD.
    257   if (!mask_hdd_error) {
    258     os_->ErrorReport(devicename.c_str(), "miscompare", 1);
    259     error->severity_ = SAT_ERROR_FATAL;
    260     return 1;
    261   }
    262   return 0;
    263 }
    264 
    265 // Info about a sector tag miscompare from a drive.
    266 // Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise.
    267 int ErrorDiag::AddHDDSectorTagError(string devicename, int block, int offset,
    268                                     int sector, void *src_addr,
    269                                     void *dst_addr) {
    270   bool mask_hdd_error = false;
    271 
    272   HDDSectorTagErrorInstance *error = new HDDSectorTagErrorInstance;
    273   if (!error)
    274     return -1;
    275 
    276   error->addr_ = reinterpret_cast<uint64>(src_addr);
    277   error->addr2_ = reinterpret_cast<uint64>(dst_addr);
    278   error->sector_ = sector;
    279   error->block_ = block;
    280 
    281   string src_dimm = AddressToDimmString(os_, src_addr, offset);
    282   string dst_dimm = AddressToDimmString(os_, dst_addr, offset);
    283 
    284   // DIMM name look up success
    285   if (src_dimm.compare("DIMM Unknown")) {
    286     // Add src DIMM as possible miscompare cause.
    287     DeviceTree *src_dimm_dev = system_tree_root_->FindOrAddDevice(src_dimm);
    288     error->causes_.insert(src_dimm_dev);
    289     if (src_dimm_dev->KnownBad()) {
    290       mask_hdd_error = true;
    291       logprintf(5, "Log: supressed %s sector tag error report: "
    292                 "known bad source: %s\n", devicename.c_str(), src_dimm.c_str());
    293     }
    294   }
    295   if (dst_dimm.compare("DIMM Unknown")) {
    296     // Add dst DIMM as possible miscompare cause.
    297     DeviceTree *dst_dimm_dev = system_tree_root_->FindOrAddDevice(dst_dimm);
    298     error->causes_.insert(dst_dimm_dev);
    299     if (dst_dimm_dev->KnownBad()) {
    300       mask_hdd_error = true;
    301       logprintf(5, "Log: supressed %s sector tag error report: "
    302                 "known bad destination: %s\n", devicename.c_str(),
    303                 dst_dimm.c_str());
    304     }
    305   }
    306 
    307   DeviceTree *hdd_dev = system_tree_root_->FindOrAddDevice(devicename);
    308   hdd_dev->AddErrorInstance(error);
    309 
    310   // HDD error was not masked by bad DIMMs: report bad HDD.
    311   if (!mask_hdd_error) {
    312     os_->ErrorReport(devicename.c_str(), "sector", 1);
    313     error->severity_ = SAT_ERROR_FATAL;
    314     return 1;
    315   }
    316   return 0;
    317 }
    318