Home | History | Annotate | Download | only in monitoring
      1 /* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
      2 
      3 Licensed under the Apache License, Version 2.0 (the "License");
      4 you may not use this file except in compliance with the License.
      5 You may obtain a copy of the License at
      6 
      7     http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 Unless required by applicable law or agreed to in writing, software
     10 distributed under the License is distributed on an "AS IS" BASIS,
     11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 See the License for the specific language governing permissions and
     13 limitations under the License.
     14 ==============================================================================*/
     15 
     16 // Standard format in which the metrics are collected, before being exported.
     17 // These are to be used only by the CollectionRegistry and exporters which
     18 // collect metrics using the CollectionRegistry.
     19 
     20 #ifndef TENSORFLOW_CORE_LIB_MONITORING_COLLECTED_METRICS_H_
     21 #define TENSORFLOW_CORE_LIB_MONITORING_COLLECTED_METRICS_H_
     22 
     23 #include <map>
     24 #include <memory>
     25 #include <string>
     26 #include <vector>
     27 
     28 #include "tensorflow/core/framework/summary.pb.h"
     29 #include "tensorflow/core/lib/monitoring/metric_def.h"
     30 
     31 namespace tensorflow {
     32 namespace monitoring {
     33 
     34 // A metric is a statistic about a monitorable entity.
     35 //
     36 // Metrics are named with path-like strings, which must conform to the regular
     37 // expression (/[a-zA-Z0-9_-]+)+.  For example:
     38 //
     39 //     /proc/cpu_usage
     40 //     /rpc/client/count
     41 //
     42 // Metrics may optionally have labels, which are additional dimensions used to
     43 // identify the metric's values.  For example, the metric /rpc/client/count
     44 // might have two labels named "rpc_service" and "rpc_method".
     45 //
     46 // A label name must be an identifier, which conform to the regular expression
     47 // [a-zA-Z_][a-zA-Z_0-9]*, and is only unique within the context of the metric
     48 // it is a label for.
     49 //
     50 // MetricDescriptor defines the structure of the metric (e.g. the fact that it's
     51 // a counter and that it has two labels named "rpc_service" and "rpc_method").
     52 // Individual points will provide a value for the metric (e.g. the counter
     53 // value) and specific values for each of the labels.
     54 //
     55 // There's no scoping relationship between metrics and monitorable entities: the
     56 // metric /rpc/client/count should be defined the same way no matter which
     57 // monitorable entity is exporting it.
     58 struct MetricDescriptor {
     59   // Metric names are path-like.  E.g., "/mycomponent/mymetric".
     60   string name;
     61 
     62   // A human-readable description of what this metric measures.
     63   string description;
     64 
     65   // Label names for the metric.
     66   // See the example in the top level comment for MetricDescriptor.
     67   std::vector<string> label_names;
     68 
     69   MetricKind metric_kind;
     70 
     71   ValueType value_type;
     72 };
     73 
     74 struct Point {
     75   // Usually a Point should provide a |label| field for each of the labels
     76   // defined in the corresponding MetricDescriptor.  During transitions in
     77   // metric definitions, however, there may be times when a Point provides more
     78   // or fewer labels than those that appear in the MetricDescriptor.
     79   struct Label {
     80     // The |name| field must match the |label_name| field in the
     81     // MetricDescriptor for this Point.
     82     string name;
     83     string value;
     84   };
     85   std::vector<Label> labels;
     86 
     87   // The actual metric value, dependent on the value_type enum.
     88   ValueType value_type;
     89   int64 int64_value;
     90   string string_value;
     91   bool bool_value;
     92   HistogramProto histogram_value;
     93 
     94   // start_timestamp and end_timestamp indicate the time period over which this
     95   // point's value measurement applies.
     96   //
     97   // A cumulative metric like /rpc/client/count typically has runs of
     98   // consecutive points that share a common start_timestamp, which is often
     99   // the time at which the exporting process started.  For example:
    100   //
    101   //   value:  3  start_timestamp: 1000  end_timestamp: 1234
    102   //   value:  7  start_timestamp: 1000  end_timestamp: 1245
    103   //   value: 10  start_timestamp: 1000  end_timestamp: 1256
    104   //   value: 15  start_timestamp: 1000  end_timestamp: 1267
    105   //   value: 21  start_timestamp: 1000  end_timestamp: 1278
    106   //   value:  4  start_timestamp: 1300  end_timestamp: 1400
    107   //
    108   // The meaning of each point is: "Over the time period from
    109   // 'start_timestamp' to 'end_timestamp', 'value' client RPCs finished."
    110   //
    111   // Note the changed start_timestamp and the decrease in 'value' in the
    112   // last line; those are the effects of the process restarting.
    113   //
    114   // Delta metrics have the same interpretation of the timestamps and values,
    115   // but the time ranges of two points do not overlap.  The delta form of the
    116   // above sequence would be:
    117   //
    118   //   value:  3  start_timestamp: 1000  end_timestamp: 1234
    119   //   value:  4  start_timestamp: 1235  end_timestamp: 1245
    120   //   value:  3  start_timestamp: 1246  end_timestamp: 1256
    121   //   value:  5  start_timestamp: 1257  end_timestamp: 1267
    122   //   value:  6  start_timestamp: 1268  end_timestamp: 1278
    123   //   value:  4  start_timestamp: 1300  end_timestamp: 1400
    124   //
    125   // For gauge metrics whose values are instantaneous measurements,
    126   // start_timestamp and end_timestamp may be identical.  I.e., there is no need
    127   // to strictly measure the time period during which the value measurement was
    128   // made.
    129   //
    130   // start_timestamp must not be younger than end_timestamp.
    131   uint64 start_timestamp_millis;
    132   uint64 end_timestamp_millis;
    133 };
    134 
    135 // A set of points belonging to a metric.
    136 struct PointSet {
    137   // This must match a name defined by a MetricDescriptor message.
    138   string metric_name;
    139 
    140   // No two Points in the same PointSet should have the same set of labels.
    141   std::vector<std::unique_ptr<Point>> points;
    142 };
    143 
    144 // Standard format in which the metrics are collected, before being exported.
    145 struct CollectedMetrics {
    146   // The keys are the metric-names.
    147   std::map<string, std::unique_ptr<MetricDescriptor>> metric_descriptor_map;
    148   std::map<string, std::unique_ptr<PointSet>> point_set_map;
    149 };
    150 
    151 }  // namespace monitoring
    152 }  // namespace tensorflow
    153 
    154 #endif  // TENSORFLOW_CORE_LIB_MONITORING_COLLECTED_METRICS_H_
    155