Home | History | Annotate | Download | only in src
      1 // Copyright 2008 Google Inc. All Rights Reserved.
      2 
      3 // Licensed under the Apache License, Version 2.0 (the "License");
      4 // you may not use this file except in compliance with the License.
      5 // You may obtain a copy of the License at
      6 
      7 //      http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 // Unless required by applicable law or agreed to in writing, software
     10 // distributed under the License is distributed on an "AS IS" BASIS,
     11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 // See the License for the specific language governing permissions and
     13 // limitations under the License.
     14 
     15 // error_diag.h: Ambiguous error diagnosis class
     16 
     17 #ifndef STRESSAPPTEST_ERROR_DIAG_H_
     18 #define STRESSAPPTEST_ERROR_DIAG_H_
     19 
     20 #include <pthread.h>
     21 #include <list>
     22 #include <map>
     23 #include <set>
     24 #include <string>
     25 
     26 // This file must work with autoconf on its public version,
     27 // so these includes are correct.
     28 #include "sattypes.h"
     29 #include "os.h"
     30 
     31 class ErrorInstance;
     32 
     33 // This describes the components of the system.
     34 class DeviceTree {
     35  public:
     36   explicit DeviceTree(string name);
     37   ~DeviceTree();
     38 
     39   // Atomically find arbitrary device in subtree.
     40   DeviceTree *FindInSubTree(string name);
     41   // Find or add named device.
     42   DeviceTree *FindOrAddDevice(string name);
     43   // Atomically add sub device.
     44   void InsertSubDevice(string name);
     45   // Returns parent device.
     46   DeviceTree *GetParent() { return parent_; }
     47   // Pretty prints device tree.
     48   void PrettyPrint(string spacer = " ");
     49   // Atomically add error instance to device.
     50   void AddErrorInstance(ErrorInstance *error_instance);
     51   // Returns true of device is known to be bad.
     52   bool KnownBad();
     53   // Returns number of direct sub devices.
     54   int NumDirectSubDevices() { return subdevices_.size(); }
     55 
     56  private:
     57   // Unlocked version of FindInSubTree.
     58   DeviceTree *UnlockedFindInSubTree(string name);
     59 
     60   std::map<string, DeviceTree*> subdevices_;    // Map of sub-devices.
     61   std::list<ErrorInstance*> errors_;            // Log of errors.
     62   DeviceTree *parent_;                          // Pointer to parent device.
     63   string name_;                                 // Device name.
     64   pthread_mutex_t device_tree_mutex_;           // Mutex protecting device tree.
     65 };
     66 
     67 
     68 // enum type for collected errors.
     69 enum SATErrorType {
     70   SAT_ERROR_NONE = 0,
     71   SAT_ERROR_ECC,
     72   SAT_ERROR_MISCOMPARE,
     73   SAT_ERROR_SECTOR_TAG,
     74 };
     75 
     76 // enum type for error severity.
     77 enum SATErrorSeverity {
     78   SAT_ERROR_CORRECTABLE = 0,
     79   SAT_ERROR_FATAL,
     80 };
     81 
     82 // This describes an error and it's likely causes.
     83 class ErrorInstance {
     84  public:
     85   ErrorInstance(): type_(SAT_ERROR_NONE), severity_(SAT_ERROR_CORRECTABLE) {}
     86 
     87   SATErrorType type_;             // Type of error: ECC, miscompare, sector.
     88   SATErrorSeverity severity_;     // Correctable, or fatal.
     89   std::set<DeviceTree*> causes_;  // Devices that can cause this type of error.
     90 };
     91 
     92 // This describes ECC errors.
     93 class ECCErrorInstance: public ErrorInstance {
     94  public:
     95   ECCErrorInstance() { type_ = SAT_ERROR_ECC; }
     96 
     97   uint64 addr_;               // Address where error occured.
     98 };
     99 
    100 // This describes miscompare errors.
    101 class MiscompareErrorInstance: public ErrorInstance {
    102  public:
    103   MiscompareErrorInstance() { type_ = SAT_ERROR_MISCOMPARE; }
    104 
    105   uint64 addr_;               // Address where miscompare occured.
    106 };
    107 
    108 // This describes HDD miscompare errors.
    109 class HDDMiscompareErrorInstance: public MiscompareErrorInstance {
    110  public:
    111   uint64 addr2_;             // addr_ and addr2_ are src and dst memory addr.
    112   int offset_;               // offset.
    113   int block_;                // error block.
    114 };
    115 
    116 // This describes HDD miscompare errors.
    117 class HDDSectorTagErrorInstance: public ErrorInstance {
    118  public:
    119   HDDSectorTagErrorInstance() { type_ = SAT_ERROR_SECTOR_TAG; }
    120 
    121   uint64 addr_;
    122   uint64 addr2_;             // addr_ and addr2_ are src and dst memory addr.
    123   int sector_;               // error sector.
    124   int block_;                // error block.
    125 };
    126 
    127 // Generic error storage and sorting class.
    128 class ErrorDiag {
    129  public:
    130   ErrorDiag();
    131   virtual ~ErrorDiag();
    132 
    133   // Add info about a CECC.
    134   virtual int AddCeccError(string dimm_string);
    135 
    136   // Add info about a UECC.
    137   virtual int AddUeccError(string dimm_string);
    138 
    139   // Add info about a miscompare.
    140   virtual int AddMiscompareError(string dimm_string, uint64 addr, int count);
    141 
    142   // Add info about a miscompare from a drive.
    143   virtual int AddHDDMiscompareError(string devicename, int block, int offset,
    144                             void *src_addr, void *dst_addr);
    145 
    146   // Add info about a sector tag miscompare from a drive.
    147   virtual int AddHDDSectorTagError(string devicename, int block, int offset,
    148                            int sector, void *src_addr, void *dst_addr);
    149 
    150   // Set platform specific handle and initialize device tree.
    151   bool set_os(OsLayer *os);
    152 
    153  protected:
    154   // Create and initialize system device tree.
    155   virtual bool InitializeDeviceTree();
    156 
    157   // Utility Function to translate a virtual address to DIMM number.
    158   string AddressToDimmString(OsLayer *os, void *addr, int offset);
    159 
    160   DeviceTree *system_tree_root_;  // System device tree.
    161   OsLayer *os_;                   // Platform handle.
    162 
    163  private:
    164   DISALLOW_COPY_AND_ASSIGN(ErrorDiag);
    165 };
    166 
    167 #endif  // STRESSAPPTEST_ERROR_DIAG_H_
    168