# Home | History | Annotate | Download | only in training
      1 # Copyright 2016 The TensorFlow Authors. All Rights Reserved.
      2 #
      3 # Licensed under the Apache License, Version 2.0 (the "License");
      4 # you may not use this file except in compliance with the License.
      5 # You may obtain a copy of the License at
      6 #
      7 #     http://www.apache.org/licenses/LICENSE-2.0
      8 #
      9 # Unless required by applicable law or agreed to in writing, software
     10 # distributed under the License is distributed on an "AS IS" BASIS,
     11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 # See the License for the specific language governing permissions and
     13 # limitations under the License.
     14 # ==============================================================================
     15 """Adagrad Dual Averaging for TensorFlow."""
     16 from __future__ import absolute_import
     17 from __future__ import division
     18 from __future__ import print_function
     19 
     20 from tensorflow.python.framework import constant_op
     21 from tensorflow.python.framework import ops
     22 from tensorflow.python.ops import array_ops
     23 from tensorflow.python.ops import math_ops
     24 from tensorflow.python.training import optimizer
     25 from tensorflow.python.training import training_ops
     26 from tensorflow.python.util.tf_export import tf_export
     27 
     28 
     29 @tf_export("train.AdagradDAOptimizer")
     30 class AdagradDAOptimizer(optimizer.Optimizer):
     31   """Adagrad Dual Averaging algorithm for sparse linear models.
     32 
     33   See this [paper](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf).
     34 
     35   This optimizer takes care of regularization of unseen features in a mini batch
     36   by updating them when they are seen with a closed form update rule that is
     37   equivalent to having updated them on every mini-batch.
     38 
     39   AdagradDA is typically used when there is a need for large sparsity in the
     40   trained model. This optimizer only guarantees sparsity for linear models. Be
     41   careful when using AdagradDA for deep networks as it will require careful
     42   initialization of the gradient accumulators for it to train.
     43   """
     44 
     45   def __init__(self,
     46                learning_rate,
     47                global_step,
     48                initial_gradient_squared_accumulator_value=0.1,
     49                l1_regularization_strength=0.0,
     50                l2_regularization_strength=0.0,
     51                use_locking=False,
     52                name="AdagradDA"):
     53     """Construct a new AdagradDA optimizer.
     54 
     55     Args:
     56       learning_rate: A `Tensor` or a floating point value.  The learning rate.
     57       global_step: A `Tensor` containing the current training step number.
     58       initial_gradient_squared_accumulator_value: A floating point value.
     59         Starting value for the accumulators, must be positive.
     60       l1_regularization_strength: A float value, must be greater than or
     61         equal to zero.
     62       l2_regularization_strength: A float value, must be greater than or
     63         equal to zero.
     64       use_locking: If `True` use locks for update operations.
     65       name: Optional name prefix for the operations created when applying
     66         gradients.  Defaults to "AdagradDA".
     67 
     68     Raises:
     69       ValueError: If the `initial_gradient_squared_accumulator_value` is
     70       invalid.
     71     """
     72     if initial_gradient_squared_accumulator_value <= 0.0:
     73       raise ValueError("initial_gradient_squared_accumulator_value must be "
     74                        "positive: %s" %
     75                        initial_gradient_squared_accumulator_value)
     76     super(AdagradDAOptimizer, self).__init__(use_locking, name)
     77     self._learning_rate = learning_rate
     78     self._initial_gradient_squared_accumulator_value = (
     79         initial_gradient_squared_accumulator_value)
     80     # Created in Initialize.
     81     self._learning_rate_tensor = None
     82     self._l1_regularization_strength = l1_regularization_strength
     83     self._l2_regularization_strength = l2_regularization_strength
     84     self._global_step = global_step
     85     self._global_step_on_worker = None
     86 
     87   def _create_slots(self, var_list):
     88     for v in var_list:
     89       with ops.colocate_with(v):
     90         g_val = constant_op.constant(
     91             0.0, shape=v.get_shape(), dtype=v.dtype.base_dtype)
     92         gg_val = constant_op.constant(
     93             self._initial_gradient_squared_accumulator_value,
     94             shape=v.get_shape(),
     95             dtype=v.dtype.base_dtype)
     96       self._get_or_make_slot(v, g_val, "gradient_accumulator", self._name)
     97       self._get_or_make_slot(v, gg_val, "gradient_squared_accumulator",
     98                              self._name)
     99 
    100   def _prepare(self):
    101     self._learning_rate_tensor = ops.convert_to_tensor(
    102         self._learning_rate, name="learning_rate")
    103     # Performance optimization so that worker creates a copy of the global step
    104     # to avoid overloading the parameter server holding the global step.
    105     with ops.colocate_with(self._learning_rate_tensor):
    106       self._global_step_on_worker = array_ops.identity(self._global_step) + 1
    107 
    108   def _apply_dense(self, grad, var):
    109     g_acc = self.get_slot(var, "gradient_accumulator")
    110     gg_acc = self.get_slot(var, "gradient_squared_accumulator")
    111     with ops.device(var.device):
    112       global_step = array_ops.identity(self._global_step_on_worker)
    113     return training_ops.apply_adagrad_da(
    114         var,
    115         g_acc,
    116         gg_acc,
    117         grad,
    118         math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
    119         math_ops.cast(self._l1_regularization_strength, var.dtype.base_dtype),
    120         math_ops.cast(self._l2_regularization_strength, var.dtype.base_dtype),
    121         global_step,
    122         use_locking=self._use_locking)
    123 
    124   def _resource_apply_dense(self, grad, var):
    125     g_acc = self.get_slot(var, "gradient_accumulator")
    126     gg_acc = self.get_slot(var, "gradient_squared_accumulator")
    127     with ops.device(var.device):
    128       global_step = array_ops.identity(self._global_step_on_worker)
    129     return training_ops.resource_apply_adagrad_da(
    130         var.handle,
    131         g_acc.handle,
    132         gg_acc.handle,
    133         grad,
    134         math_ops.cast(self._learning_rate_tensor, grad.dtype.base_dtype),
    135         math_ops.cast(self._l1_regularization_strength, grad.dtype.base_dtype),
    136         math_ops.cast(self._l2_regularization_strength, grad.dtype.base_dtype),
    137         global_step,
    138         use_locking=self._use_locking)
    139 
    140   def _apply_sparse(self, grad, var):
    141     g_acc = self.get_slot(var, "gradient_accumulator")
    142     gg_acc = self.get_slot(var, "gradient_squared_accumulator")
    143     with ops.device(var.device):
    144       global_step = array_ops.identity(self._global_step_on_worker)
    145     return training_ops.sparse_apply_adagrad_da(
    146         var,
    147         g_acc,
    148         gg_acc,
    149         grad.values,
    150         grad.indices,
    151         math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
    152         math_ops.cast(self._l1_regularization_strength, var.dtype.base_dtype),
    153         math_ops.cast(self._l2_regularization_strength, var.dtype.base_dtype),
    154         global_step,
    155         use_locking=self._use_locking)
    156 
    157   def _resource_apply_sparse(self, grad, var, indices):
    158     g_acc = self.get_slot(var, "gradient_accumulator")
    159     gg_acc = self.get_slot(var, "gradient_squared_accumulator")
    160     with ops.device(var.device):
    161       global_step = array_ops.identity(self._global_step_on_worker)
    162     return training_ops.resource_sparse_apply_adagrad_da(
    163         var.handle,
    164         g_acc.handle,
    165         gg_acc.handle,
    166         grad,
    167         indices,
    168         math_ops.cast(self._learning_rate_tensor, grad.dtype),
    169         math_ops.cast(self._l1_regularization_strength, grad.dtype),
    170         math_ops.cast(self._l2_regularization_strength, grad.dtype),
    171         global_step,
    172         use_locking=self._use_locking)
    173