syntax = "proto3";

package tensorflow;
option cc_enable_arenas = true;
option java_outer_classname = "ConfigProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.framework";

import "tensorflow/core/framework/cost_graph.proto";
import "tensorflow/core/framework/graph.proto";
import "tensorflow/core/framework/step_stats.proto";
import "tensorflow/core/protobuf/debug.proto";
import "tensorflow/core/protobuf/cluster.proto";
import "tensorflow/core/protobuf/rewriter_config.proto";

message GPUOptions {
  // A value between 0 and 1 that indicates what fraction of the
  // available GPU memory to pre-allocate for each process.  1 means
  // to pre-allocate all of the GPU memory, 0.5 means the process
  // allocates ~50% of the available GPU memory.
  double per_process_gpu_memory_fraction = 1;

  // The type of GPU allocation strategy to use.
  //
  // Allowed values:
  // "": The empty string (default) uses a system-chosen default
  //     which may change over time.
  //
  // "BFC": A "Best-fit with coalescing" algorithm, simplified from a
  //        version of dlmalloc.
  string allocator_type = 2;

  // Delay deletion of up to this many bytes to reduce the number of
  // interactions with gpu driver code.  If 0, the system chooses
  // a reasonable default (several MBs).
  int64 deferred_deletion_bytes = 3;

  // If true, the allocator does not pre-allocate the entire specified
  // GPU memory region, instead starting small and growing as needed.
  bool allow_growth = 4;

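  // Illustrative example (text-format GPUOptions; the values are made up, not
  // recommendations): either cap the pre-allocated fraction, or start small
  // and grow on demand.
  //   per_process_gpu_memory_fraction: 0.4
  // or
  //   allow_growth: true
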
  // A comma-separated list of GPU ids that determines the 'visible'
  // to 'virtual' mapping of GPU devices.  For example, if TensorFlow
  // can see 8 GPU devices in the process, and one wanted to map
  // visible GPU devices 5 and 3 as "/device:GPU:0" and "/device:GPU:1",
  // then one would specify this field as "5,3".  This field is similar in
  // spirit to the CUDA_VISIBLE_DEVICES environment variable, except
  // it applies to the visible GPU devices in the process.
  //
  // NOTE:
  // 1. The GPU driver provides the process with the visible GPUs
  //    in an order which is not guaranteed to have any correlation to
  //    the *physical* GPU id in the machine.  This field is used for
  //    remapping "visible" to "virtual", which means this operates only
  //    after the process starts.  Users are required to use vendor-specific
  //    mechanisms (e.g., CUDA_VISIBLE_DEVICES) to control the
  //    physical to visible device mapping prior to invoking TensorFlow.
  // 2. In the code, the ids in this list are also called "CUDA GPU id"s,
  //    and the 'virtual' ids of GPU devices (i.e. the ids in the device
  //    name "/device:GPU:<id>") are also called "TF GPU id"s. Please
  //    refer to third_party/tensorflow/core/common_runtime/gpu/gpu_id.h
  //    for more information.
  string visible_device_list = 5;

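  // Illustrative example (assuming a machine with four physical GPUs): with
  // CUDA_VISIBLE_DEVICES=2,3 the process sees two visible GPUs: 0 (physical 2)
  // and 1 (physical 3).  Setting
  //   visible_device_list: "1,0"
  // then maps visible GPU 1 (physical 3) to "/device:GPU:0" and visible GPU 0
  // (physical 2) to "/device:GPU:1".
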
  // In the event polling loop sleep this many microseconds between
  // PollEvents calls, when the queue is not empty.  If the value is not
  // set or is set to 0, it is set to a non-zero default.
  int32 polling_active_delay_usecs = 6;

  // In the event polling loop sleep this many milliseconds between
  // PollEvents calls, when the queue is empty.  If the value is not
  // set or is set to 0, it is set to a non-zero default.
  int32 polling_inactive_delay_msecs = 7;

  // Force all tensors to be gpu_compatible. On a GPU-enabled TensorFlow,
  // enabling this option forces all CPU tensors to be allocated with CUDA
  // pinned memory. Normally, TensorFlow will infer which tensors should be
  // allocated as pinned memory. But in cases where that inference is
  // incomplete, this option can significantly speed up cross-device memory
  // copy performance, as long as the tensors fit in memory.
  // Note that this option should not be enabled by default for unknown or
  // very large models, since all CUDA pinned memory is unpageable; having
  // too much pinned memory might negatively impact overall host system
  // performance.
  bool force_gpu_compatible = 8;

  // Everything inside Experimental is subject to change and is not covered
  // by the API stability guarantees in
  // https://www.tensorflow.org/programmers_guide/version_compat.
  message Experimental {
    // Configuration for breaking down a visible GPU into multiple "virtual"
    // devices.
    message VirtualDevices {
      // Per "virtual" device memory limit, in MB. The number of elements in
      // the list is the number of virtual devices to create on the
      // corresponding visible GPU (see "virtual_devices" below).
      // If empty, a single virtual device is created, taking all available
      // memory from the device.
      //
      // For the concept of "visible" and "virtual" GPUs, see the comments for
      // "visible_device_list" above.
      repeated float memory_limit_mb = 1;
    }

    // The multi virtual device settings. If empty (not set), a single virtual
    // device is created on each visible GPU, according to the settings
    // in "visible_device_list" above. Otherwise, the number of elements in the
    // list must be the same as the number of visible GPUs (after
    // "visible_device_list" filtering if it is set), and the string-represented
    // device names (e.g. /device:GPU:<id>) will refer to the virtual
    // devices and have the <id> field assigned sequentially starting from 0,
    // according to the order they appear in this list and the "memory_limit_mb"
    // list inside each element. For example,
    //   visible_device_list = "1,0"
    //   virtual_devices { memory_limit_mb: 1024 memory_limit_mb: 2048 }
    //   virtual_devices {}
    // will create three virtual devices as:
    //   /device:GPU:0 -> visible GPU 1 with 1GB memory
    //   /device:GPU:1 -> visible GPU 1 with 2GB memory
    //   /device:GPU:2 -> visible GPU 0 with all available memory
    //
    // NOTE:
    // 1. It's invalid to set both this and "per_process_gpu_memory_fraction"
    //    at the same time.
    // 2. Currently this setting is per-process, not per-session. Using
    //    different settings in different sessions within the same process will
    //    result in undefined behavior.
    repeated VirtualDevices virtual_devices = 1;
  }

  Experimental experimental = 9;
};

// Options passed to the graph optimizer
message OptimizerOptions {
  // If true, optimize the graph using common subexpression elimination.
  bool do_common_subexpression_elimination = 1;

  // If true, perform constant folding optimization on the graph.
  bool do_constant_folding = 2;

  // Constant folding optimization replaces tensors whose values can be
  // predetermined, with constant nodes. To avoid inserting too large constants,
  // the size of each constant created can be limited. If this value is zero, a
  // default limit of 10 MiB will be applied. If constant folding optimization
  // is disabled, this value is ignored.
  int64 max_folded_constant_in_bytes = 6;

  // If true, perform function inlining on the graph.
  bool do_function_inlining = 4;

  // Optimization level
  enum Level {
    // L1 is the default level.
    // Optimizations performed at L1:
    // 1. Common subexpression elimination
    // 2. Constant folding
    L1 = 0;

    // No optimizations
    L0 = -1;
  }

  // Overall optimization level. The actual optimizations applied will be the
  // logical OR of the flags that this level implies and any flags already set.
  Level opt_level = 3;

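  // Illustrative example (text-format OptimizerOptions; values are examples
  // only): because the level and the individual flags are OR'ed together,
  // combining L0 with one explicit flag enables only that optimization.
  //   opt_level: L0
  //   do_function_inlining: true
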
  // Control the use of the compiler/jit.  Experimental.
  enum GlobalJitLevel {
    DEFAULT = 0;  // Default setting ("off" now, but later expected to be "on")
    OFF = -1;
    // The following settings turn on compilation, with higher values being
    // more aggressive.  Higher values may reduce opportunities for parallelism
    // and may use more memory.  (At present, there is no distinction, but this
    // is expected to change.)
    ON_1 = 1;
    ON_2 = 2;
  }
  GlobalJitLevel global_jit_level = 5;
}

message GraphOptions {
  // Removed, use optimizer_options below.
  reserved "skip_common_subexpression_elimination";
  reserved 1;

  // If true, use control flow to schedule the activation of Recv nodes.
  // (Currently ignored.)
  bool enable_recv_scheduling = 2;

  // Options controlling how the graph is optimized.
  OptimizerOptions optimizer_options = 3;

  // The number of steps to run before returning a cost model detailing
  // the memory usage and performance of each node of the graph. 0 means
  // no cost model.
  int64 build_cost_model = 4;

  // The number of steps to skip before collecting statistics for the
  // cost model.
  int64 build_cost_model_after = 9;

  // Annotate each Node with Op output shape data, to the extent it can
  // be statically inferred.
  bool infer_shapes = 5;

  // Only place the subgraphs that are run, rather than the entire graph.
  //
  // This is useful for interactive graph building, where one might
  // produce graphs that cannot be placed during the debugging
  // process.  In particular, it allows the client to continue work in
  // a session after adding a node to a graph whose placement
  // constraints are unsatisfiable.
  bool place_pruned_graph = 6;

  // If true, transfer float values between processes as bfloat16.
  bool enable_bfloat16_sendrecv = 7;

  // If > 0, record a timeline every this many steps.
  // EXPERIMENTAL: This currently has no effect in MasterSession.
  int32 timeline_step = 8;

  // Options that control the type and amount of graph rewriting.
  // Not currently configurable via the public Python API (i.e. there is no API
  // stability guarantee if you import RewriterConfig explicitly).
  RewriterConfig rewrite_options = 10;
};

message ThreadPoolOptionProto {
  // The number of threads in the pool.
  //
  // 0 means the system picks a value based on where this option proto is used
  // (see the declaration of the specific field for more info).
  int32 num_threads = 1;

  // The global name of the threadpool.
  //
  // If empty, then the threadpool is made and used according to the scope it's
  // in - e.g., for a session threadpool, it is used by that session only.
  //
  // If non-empty, then:
  // - a global threadpool associated with this name is looked
  //   up or created. This allows, for example, sharing one threadpool across
  //   many sessions (e.g., like the default behavior, if
  //   inter_op_parallelism_threads is not configured), but still partitioning
  //   into a large and small pool.
  // - if the threadpool for this global_name already exists, then it is an
  //   error if the existing pool was created using a num_threads value
  //   different from the one specified on this call.
  // - threadpools created this way are never garbage collected.
  string global_name = 2;
};

message RPCOptions {
  // If true, always use RPC to contact the session target.
  //
  // If false (the default option), TensorFlow may use an optimized
  // transport for client-master communication that avoids the RPC
  // stack. This option is primarily used for testing the RPC stack.
  bool use_rpc_for_inprocess_master = 1;
};

// Session configuration parameters.
// The system picks appropriate values for fields that are not set.
message ConfigProto {
  // Map from device type name (e.g., "CPU" or "GPU") to maximum
  // number of devices of that type to use.  If a particular device
  // type is not found in the map, the system picks an appropriate
  // number.
  map<string, int32> device_count = 1;

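  // Illustrative example (text format): a common use is forcing a session onto
  // CPU by declaring that zero GPU devices may be used.
  //   device_count { key: "GPU" value: 0 }
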
  // The execution of an individual op (for some op types) can be
  // parallelized on a pool of intra_op_parallelism_threads.
  // 0 means the system picks an appropriate number.
  int32 intra_op_parallelism_threads = 2;

  // Nodes that perform blocking operations are enqueued on a pool of
  // inter_op_parallelism_threads available in each process.
  //
  // 0 means the system picks an appropriate number.
  //
  // Note that the first Session created in the process sets the
  // number of threads for all future sessions unless use_per_session_threads is
  // true or session_inter_op_thread_pool is configured.
  int32 inter_op_parallelism_threads = 5;

  // If true, use a new set of threads for this session rather than the global
  // pool of threads. Only supported by direct sessions.
  //
  // If false, use the global threads created by the first session, or the
  // per-session thread pools configured by session_inter_op_thread_pool.
  //
  // This option is deprecated. The same effect can be achieved by setting
  // session_inter_op_thread_pool to have one element, whose num_threads equals
  // inter_op_parallelism_threads.
  bool use_per_session_threads = 9;

  // This option is experimental - it may be replaced with a different mechanism
  // in the future.
  //
  // Configures session thread pools. If this is configured, then RunOptions for
  // a Run call can select the thread pool to use.
  //
  // The intended use is for when some session invocations need to run in a
  // background pool limited to a small number of threads:
  // - For example, a session may be configured to have one large pool (for
  // regular compute) and one small pool (for periodic, low priority work);
  // using the small pool is currently the mechanism for limiting the inter-op
  // parallelism of the low priority work.  Note that it does not limit the
  // parallelism of work spawned by a single op kernel implementation.
  // - Using this setting is normally not needed in training, but may help some
  // serving use cases.
  // - It is also generally recommended to set the global_name field of this
  // proto, to avoid creating multiple large pools. It is typically better to
  // run the non-low-priority work, even across sessions, in a single large
  // pool.
  repeated ThreadPoolOptionProto session_inter_op_thread_pool = 12;

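  // Illustrative example (text format; the pool name and sizes are made up):
  // one shared large pool for regular compute plus a small per-session pool
  // for low priority work.
  //   session_inter_op_thread_pool { global_name: "shared_large_pool" }
  //   session_inter_op_thread_pool { num_threads: 1 }
  // A RunOptions.inter_op_thread_pool value of 1 then selects the small pool
  // for that Run() call.
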
  // Assignment of Nodes to Devices is recomputed every placement_period
  // steps until the system warms up (at which point the recomputation
  // typically slows down automatically).
  int32 placement_period = 3;

  // When any filters are present, sessions will ignore all devices which do
  // not match the filters. Each filter can be partially specified, e.g.
  // "/job:ps", "/job:worker/replica:3", etc.
  repeated string device_filters = 4;

  // Options that apply to all GPUs.
  GPUOptions gpu_options = 6;

  // Whether soft placement is allowed. If allow_soft_placement is true,
  // an op will be placed on CPU if
  //   1. there's no GPU implementation for the OP
  // or
  //   2. no GPU devices are known or registered
  // or
  //   3. it needs to be co-located with reftype input(s) which are from CPU.
  bool allow_soft_placement = 7;

  // Whether device placements should be logged.
  bool log_device_placement = 8;

  // Options that apply to all graphs.
  GraphOptions graph_options = 10;

  // Global timeout for all blocking operations in this session.  If non-zero,
  // and not overridden on a per-operation basis, this value will be used as the
  // deadline for all blocking operations.
  int64 operation_timeout_in_ms = 11;

  // Options that apply when this session uses the distributed runtime.
  RPCOptions rpc_options = 13;

  // Optional list of all workers to use in this session.
  ClusterDef cluster_def = 14;

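  // Illustrative example (text format; addresses are placeholders), assuming
  // the JobDef layout defined in cluster.proto:
  //   cluster_def {
  //     job { name: "worker" tasks { key: 0 value: "worker0.example.com:2222" } }
  //     job { name: "ps" tasks { key: 0 value: "ps0.example.com:2222" } }
  //   }
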
  // If true, any resources such as Variables used in the session will not be
  // shared with other sessions.
  bool isolate_session_state = 15;

  // Next: 16
};

// Options for a single Run() call.
message RunOptions {
  // TODO(pbar) Turn this into a TraceOptions proto which allows
  // tracing to be controlled in a more orthogonal manner?
  enum TraceLevel {
    NO_TRACE = 0;
    SOFTWARE_TRACE = 1;
    HARDWARE_TRACE = 2;
    FULL_TRACE = 3;
  }
  TraceLevel trace_level = 1;

  // Time to wait for operation to complete in milliseconds.
  int64 timeout_in_ms = 2;

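  // Illustrative example (text-format RunOptions; values are examples only):
  // collect a full trace for this step and give up after ten seconds.
  //   trace_level: FULL_TRACE
  //   timeout_in_ms: 10000
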
  // The thread pool to use, if session_inter_op_thread_pool is configured.
  int32 inter_op_thread_pool = 3;

  // Whether the partition graph(s) executed by the executor(s) should be
  // output via RunMetadata.
  bool output_partition_graphs = 5;

  // EXPERIMENTAL.  Options used to initialize DebuggerState, if enabled.
  DebugOptions debug_options = 6;

  // When enabled, causes tensor allocation information to be included in
  // the error message when the Run() call fails because the allocator ran
  // out of memory (OOM).
  //
  // Enabling this option can slow down the Run() call.
  bool report_tensor_allocations_upon_oom = 7;

  reserved 4;
}

// Metadata output (i.e., non-Tensor) for a single Run() call.
message RunMetadata {
  // Statistics traced for this step. Populated if tracing is turned on via the
  // "RunOptions" proto.
  // EXPERIMENTAL: The format and set of events may change in future versions.
  StepStats step_stats = 1;

  // The cost graph for the computation defined by the run call.
  CostGraphDef cost_graph = 2;

  // Graphs of the partitions executed by executors.
  repeated GraphDef partition_graphs = 3;
}
    413