1 // Copyright 2008 Google Inc. All Rights Reserved. 2 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 7 // http://www.apache.org/licenses/LICENSE-2.0 8 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // error_diag.cc: Collects device errors for analysis to more accurately 16 // pin-point failed component. 17 18 #include <set> 19 #include <list> 20 #include <map> 21 22 // This file must work with autoconf on its public version, 23 // so these includes are correct. 24 #include "error_diag.h" 25 #include "sattypes.h" 26 27 28 // DeviceTree constructor. 29 DeviceTree::DeviceTree(string name) 30 : parent_(0), name_(name) { 31 pthread_mutex_init(&device_tree_mutex_, NULL); 32 } 33 34 // DeviceTree destructor. 35 DeviceTree::~DeviceTree() { 36 // Deallocate subtree devices. 37 for (std::map<string, DeviceTree*>::iterator itr = subdevices_.begin(); 38 itr != subdevices_.end(); 39 ++itr) { 40 delete itr->second; 41 } 42 // Deallocate device errors. 43 for (std::list<ErrorInstance*>::iterator itr = errors_.begin(); 44 itr != errors_.end(); 45 ++itr) { 46 delete (*itr); 47 } 48 pthread_mutex_destroy(&device_tree_mutex_); 49 } 50 51 // Atomically find named device in sub device tree. 52 // Returns 0 if not found 53 DeviceTree *DeviceTree::FindInSubTree(string name) { 54 DeviceTree *ret; 55 pthread_mutex_lock(&device_tree_mutex_); 56 ret = UnlockedFindInSubTree(name); 57 pthread_mutex_unlock(&device_tree_mutex_); 58 return ret; 59 } 60 61 // Find named device in sub device tree (Non-atomic). 62 // Returns 0 if not found 63 DeviceTree *DeviceTree::UnlockedFindInSubTree(string name) { 64 std::map<string, DeviceTree*>::iterator itr = subdevices_.find(name); 65 if (itr != subdevices_.end()) { 66 return itr->second; 67 } else { 68 // Search sub-tree. 69 for (std::map<string, DeviceTree*>::iterator itr = subdevices_.begin(); 70 itr != subdevices_.end(); 71 ++itr) { 72 DeviceTree *result = itr->second->UnlockedFindInSubTree(name); 73 if (result != 0) 74 return result; 75 } 76 return 0; 77 } 78 } 79 80 // Atomically add error instance to device. 81 void DeviceTree::AddErrorInstance(ErrorInstance *error_instance) { 82 pthread_mutex_lock(&device_tree_mutex_); 83 errors_.push_back(error_instance); 84 pthread_mutex_unlock(&device_tree_mutex_); 85 } 86 87 // Find or add queried device as necessary. 88 DeviceTree *DeviceTree::FindOrAddDevice(string name) { 89 // Assume named device does not exist and try to insert the device anyway. 90 // No-op if named device already exists. 91 InsertSubDevice(name); 92 // Find and return sub device pointer. 93 return FindInSubTree(name); 94 } 95 96 // Pretty prints device tree. 97 void DeviceTree::PrettyPrint(string spacer) { 98 for (std::map<string, DeviceTree*>::iterator itr = subdevices_.begin(); 99 itr != subdevices_.end(); 100 ++itr) { 101 printf("%s%s\n", spacer.c_str(), itr->first.c_str()); 102 itr->second->PrettyPrint(spacer+spacer); 103 } 104 } 105 106 // Atomically add sub device. 107 // No-op if named device already exists. 108 void DeviceTree::InsertSubDevice(string name) { 109 pthread_mutex_lock(&device_tree_mutex_); 110 if (UnlockedFindInSubTree(name) != 0) { 111 pthread_mutex_unlock(&device_tree_mutex_); 112 return; 113 } 114 subdevices_[name] = new DeviceTree(name); 115 subdevices_[name]->parent_ = this; 116 pthread_mutex_unlock(&device_tree_mutex_); 117 } 118 119 120 // Returns true of any error associated with this device is fatal. 121 bool DeviceTree::KnownBad() { 122 pthread_mutex_lock(&device_tree_mutex_); 123 for (std::list<ErrorInstance*>::iterator itr = errors_.begin(); 124 itr != errors_.end(); 125 ++itr) { 126 if ((*itr)->severity_ == SAT_ERROR_FATAL) { 127 pthread_mutex_unlock(&device_tree_mutex_); 128 return true; 129 } 130 } 131 pthread_mutex_unlock(&device_tree_mutex_); 132 return false; 133 } 134 135 136 // ErrorDiag constructor. 137 ErrorDiag::ErrorDiag() { 138 os_ = 0; 139 system_tree_root_ = 0; 140 } 141 142 // ErrorDiag destructor. 143 ErrorDiag::~ErrorDiag() { 144 if (system_tree_root_) 145 delete system_tree_root_; 146 } 147 148 // Set platform specific handle and initialize device tree. 149 // Returns false on error. true otherwise. 150 bool ErrorDiag::set_os(OsLayer *os) { 151 os_ = os; 152 return(InitializeDeviceTree()); 153 } 154 155 // Create and initialize system device tree. 156 // Returns false on error. true otherwise. 157 bool ErrorDiag::InitializeDeviceTree() { 158 system_tree_root_ = new DeviceTree("system_root"); 159 if (!system_tree_root_) 160 return false; 161 return true; 162 } 163 164 // Logs info about a CECC. 165 // Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise. 166 int ErrorDiag::AddCeccError(string dimm_string) { 167 DeviceTree *dimm_device = system_tree_root_->FindOrAddDevice(dimm_string); 168 ECCErrorInstance *error = new ECCErrorInstance; 169 if (!error) 170 return -1; 171 error->severity_ = SAT_ERROR_CORRECTABLE; 172 dimm_device->AddErrorInstance(error); 173 return 0; 174 } 175 176 // Logs info about a UECC. 177 // Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise. 178 int ErrorDiag::AddUeccError(string dimm_string) { 179 DeviceTree *dimm_device = system_tree_root_->FindOrAddDevice(dimm_string); 180 ECCErrorInstance *error = new ECCErrorInstance; 181 if (!error) 182 return -1; 183 error->severity_ = SAT_ERROR_FATAL; 184 dimm_device->AddErrorInstance(error); 185 return 0; 186 } 187 188 // Logs info about a miscompare. 189 // Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise. 190 int ErrorDiag::AddMiscompareError(string dimm_string, uint64 addr, int count) { 191 DeviceTree *dimm_device = system_tree_root_->FindOrAddDevice(dimm_string); 192 MiscompareErrorInstance *error = new MiscompareErrorInstance; 193 if (!error) 194 return -1; 195 error->severity_ = SAT_ERROR_FATAL; 196 error->addr_ = addr; 197 dimm_device->AddErrorInstance(error); 198 os_->ErrorReport(dimm_string.c_str(), "miscompare", count); 199 return 1; 200 } 201 202 // Utility Function to translate a virtual address to DIMM number. 203 // Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise. 204 string ErrorDiag::AddressToDimmString(OsLayer *os, void *addr, int offset) { 205 char dimm_string[256] = ""; 206 char *vbyteaddr = reinterpret_cast<char*>(addr) + offset; 207 uint64 paddr = os->VirtualToPhysical(vbyteaddr); 208 os->FindDimm(paddr, dimm_string, sizeof(dimm_string)); 209 return string(dimm_string); 210 } 211 212 // Info about a miscompare from a drive. 213 // Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise. 214 int ErrorDiag::AddHDDMiscompareError(string devicename, int block, int offset, 215 void *src_addr, void *dst_addr) { 216 bool mask_hdd_error = false; 217 218 HDDMiscompareErrorInstance *error = new HDDMiscompareErrorInstance; 219 if (!error) 220 return -1; 221 222 error->addr_ = reinterpret_cast<uint64>(src_addr); 223 error->addr2_ = reinterpret_cast<uint64>(dst_addr); 224 error->offset_ = offset; 225 error->block_ = block; 226 227 string src_dimm = AddressToDimmString(os_, src_addr, offset); 228 string dst_dimm = AddressToDimmString(os_, dst_addr, offset); 229 230 // DIMM name look up success 231 if (src_dimm.compare("DIMM Unknown")) { 232 // Add src DIMM as possible miscompare cause. 233 DeviceTree *src_dimm_dev = system_tree_root_->FindOrAddDevice(src_dimm); 234 error->causes_.insert(src_dimm_dev); 235 if (src_dimm_dev->KnownBad()) { 236 mask_hdd_error = true; 237 logprintf(5, "Log: supressed %s miscompare report: " 238 "known bad source: %s\n", devicename.c_str(), src_dimm.c_str()); 239 } 240 } 241 if (dst_dimm.compare("DIMM Unknown")) { 242 // Add dst DIMM as possible miscompare cause. 243 DeviceTree *dst_dimm_dev = system_tree_root_->FindOrAddDevice(dst_dimm); 244 error->causes_.insert(dst_dimm_dev); 245 if (dst_dimm_dev->KnownBad()) { 246 mask_hdd_error = true; 247 logprintf(5, "Log: supressed %s miscompare report: " 248 "known bad destination: %s\n", devicename.c_str(), 249 dst_dimm.c_str()); 250 } 251 } 252 253 DeviceTree *hdd_dev = system_tree_root_->FindOrAddDevice(devicename); 254 hdd_dev->AddErrorInstance(error); 255 256 // HDD error was not masked by bad DIMMs: report bad HDD. 257 if (!mask_hdd_error) { 258 os_->ErrorReport(devicename.c_str(), "miscompare", 1); 259 error->severity_ = SAT_ERROR_FATAL; 260 return 1; 261 } 262 return 0; 263 } 264 265 // Info about a sector tag miscompare from a drive. 266 // Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise. 267 int ErrorDiag::AddHDDSectorTagError(string devicename, int block, int offset, 268 int sector, void *src_addr, 269 void *dst_addr) { 270 bool mask_hdd_error = false; 271 272 HDDSectorTagErrorInstance *error = new HDDSectorTagErrorInstance; 273 if (!error) 274 return -1; 275 276 error->addr_ = reinterpret_cast<uint64>(src_addr); 277 error->addr2_ = reinterpret_cast<uint64>(dst_addr); 278 error->sector_ = sector; 279 error->block_ = block; 280 281 string src_dimm = AddressToDimmString(os_, src_addr, offset); 282 string dst_dimm = AddressToDimmString(os_, dst_addr, offset); 283 284 // DIMM name look up success 285 if (src_dimm.compare("DIMM Unknown")) { 286 // Add src DIMM as possible miscompare cause. 287 DeviceTree *src_dimm_dev = system_tree_root_->FindOrAddDevice(src_dimm); 288 error->causes_.insert(src_dimm_dev); 289 if (src_dimm_dev->KnownBad()) { 290 mask_hdd_error = true; 291 logprintf(5, "Log: supressed %s sector tag error report: " 292 "known bad source: %s\n", devicename.c_str(), src_dimm.c_str()); 293 } 294 } 295 if (dst_dimm.compare("DIMM Unknown")) { 296 // Add dst DIMM as possible miscompare cause. 297 DeviceTree *dst_dimm_dev = system_tree_root_->FindOrAddDevice(dst_dimm); 298 error->causes_.insert(dst_dimm_dev); 299 if (dst_dimm_dev->KnownBad()) { 300 mask_hdd_error = true; 301 logprintf(5, "Log: supressed %s sector tag error report: " 302 "known bad destination: %s\n", devicename.c_str(), 303 dst_dimm.c_str()); 304 } 305 } 306 307 DeviceTree *hdd_dev = system_tree_root_->FindOrAddDevice(devicename); 308 hdd_dev->AddErrorInstance(error); 309 310 // HDD error was not masked by bad DIMMs: report bad HDD. 311 if (!mask_hdd_error) { 312 os_->ErrorReport(devicename.c_str(), "sector", 1); 313 error->severity_ = SAT_ERROR_FATAL; 314 return 1; 315 } 316 return 0; 317 } 318