      1 # Copyright 2015 The TensorFlow Authors. All Rights Reserved.
      2 #
      3 # Licensed under the Apache License, Version 2.0 (the "License");
      4 # you may not use this file except in compliance with the License.
      5 # You may obtain a copy of the License at
      6 #
      7 #     http://www.apache.org/licenses/LICENSE-2.0
      8 #
      9 # Unless required by applicable law or agreed to in writing, software
     10 # distributed under the License is distributed on an "AS IS" BASIS,
     11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 # See the License for the specific language governing permissions and
     13 # limitations under the License.
     14 # ==============================================================================
     15 """Various learning rate decay functions."""
     16 from __future__ import absolute_import
     17 from __future__ import division
     18 from __future__ import print_function
     19 
     20 import math
     21 
     22 from tensorflow.python.framework import constant_op
     23 from tensorflow.python.framework import dtypes
     24 from tensorflow.python.framework import ops
     25 from tensorflow.python.ops import control_flow_ops
     26 from tensorflow.python.ops import math_ops
     27 from tensorflow.python.ops import random_ops
     28 from tensorflow.python.util.tf_export import tf_export
     29 
     30 
     31 @tf_export("train.exponential_decay")
     32 def exponential_decay(learning_rate,
     33                       global_step,
     34                       decay_steps,
     35                       decay_rate,
     36                       staircase=False,
     37                       name=None):
     38   """Applies exponential decay to the learning rate.
     39 
     40   When training a model, it is often recommended to lower the learning rate as
     41   the training progresses.  This function applies an exponential decay function
     42   to a provided initial learning rate.  It requires a `global_step` value to
     43   compute the decayed learning rate.  You can just pass a TensorFlow variable
     44   that you increment at each training step.
     45 
     46   The function returns the decayed learning rate.  It is computed as:
     47 
     48   ```python
     49   decayed_learning_rate = learning_rate *
     50                           decay_rate ^ (global_step / decay_steps)
     51   ```
     52 
     53   If the argument `staircase` is `True`, then `global_step / decay_steps` is an
     54   integer division and the decayed learning rate follows a staircase function.
     55 
     56   Example: decay every 100000 steps with a base of 0.96:
     57 
     58   ```python
     59   ...
     60   global_step = tf.Variable(0, trainable=False)
     61   starter_learning_rate = 0.1
     62   learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step,
     63                                              100000, 0.96, staircase=True)
     64   # Passing global_step to minimize() will increment it at each step.
     65   learning_step = (
     66       tf.train.GradientDescentOptimizer(learning_rate)
     67       .minimize(...my loss..., global_step=global_step)
     68   )
     69   ```
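
  As a rough sanity check, the staircase schedule above can be previewed with
  a short pure-Python sketch (the `expected_lr` helper is hypothetical and
  only illustrates the formula; drop the `//` floor division for
  `staircase=False`):

  ```python
  def expected_lr(step, base_lr=0.1, decay_rate=0.96, decay_steps=100000):
    # Mirrors: learning_rate * decay_rate ^ floor(step / decay_steps)
    return base_lr * decay_rate ** (step // decay_steps)

  expected_lr(250000)  # => 0.1 * 0.96 ** 2
  ```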
     70 
     71   Args:
     72     learning_rate: A scalar `float32` or `float64` `Tensor` or a
     73       Python number.  The initial learning rate.
     74     global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
     75       Global step to use for the decay computation.  Must not be negative.
     76     decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
     77       Must be positive.  See the decay computation above.
     78     decay_rate: A scalar `float32` or `float64` `Tensor` or a
     79       Python number.  The decay rate.
    staircase: Boolean.  If `True`, decay the learning rate at discrete
      intervals.
     81     name: String.  Optional name of the operation.  Defaults to
     82       'ExponentialDecay'.
     83 
     84   Returns:
     85     A scalar `Tensor` of the same type as `learning_rate`.  The decayed
     86     learning rate.
     87 
     88   Raises:
     89     ValueError: if `global_step` is not supplied.
     90   """
     91   if global_step is None:
     92     raise ValueError("global_step is required for exponential_decay.")
     93   with ops.name_scope(
     94       name, "ExponentialDecay",
     95       [learning_rate, global_step, decay_steps, decay_rate]) as name:
     96     learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
     97     dtype = learning_rate.dtype
     98     global_step = math_ops.cast(global_step, dtype)
     99     decay_steps = math_ops.cast(decay_steps, dtype)
    100     decay_rate = math_ops.cast(decay_rate, dtype)
    101     p = global_step / decay_steps
    102     if staircase:
    103       p = math_ops.floor(p)
    104     return math_ops.multiply(
    105         learning_rate, math_ops.pow(decay_rate, p), name=name)
    106 
    107 
    108 @tf_export("train.piecewise_constant")
    109 def piecewise_constant(x, boundaries, values, name=None):
    110   """Piecewise constant from boundaries and interval values.
    111 
    112   Example: use a learning rate that's 1.0 for the first 100001 steps, 0.5
    113     for the next 10000 steps, and 0.1 for any additional steps.
    114 
    115   ```python
    116   global_step = tf.Variable(0, trainable=False)
    117   boundaries = [100000, 110000]
    118   values = [1.0, 0.5, 0.1]
    119   learning_rate = tf.train.piecewise_constant(global_step, boundaries, values)
    120 
    121   # Later, whenever we perform an optimization step, we increment global_step.
    122   ```
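
  For scalar Python inputs, the same lookup can be sketched with `bisect`
  (illustrative only; assumes `boundaries` and `values` are plain Python
  lists):

  ```python
  import bisect

  def piecewise_value(step, boundaries, values):
    # values[0] for step <= boundaries[0], values[i] for
    # boundaries[i-1] < step <= boundaries[i], values[-1] otherwise.
    return values[bisect.bisect_left(boundaries, step)]

  piecewise_value(105000, [100000, 110000], [1.0, 0.5, 0.1])  # => 0.5
  ```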
    123 
    124   Args:
    125     x: A 0-D scalar `Tensor`. Must be one of the following types: `float32`,
    126       `float64`, `uint8`, `int8`, `int16`, `int32`, `int64`.
    127     boundaries: A list of `Tensor`s or `int`s or `float`s with strictly
    128       increasing entries, and with all elements having the same type as `x`.
    129     values: A list of `Tensor`s or `float`s or `int`s that specifies the values
    130       for the intervals defined by `boundaries`. It should have one more element
    131       than `boundaries`, and all elements should have the same type.
    132     name: A string. Optional name of the operation. Defaults to
    133       'PiecewiseConstant'.
    134 
    135   Returns:
    136     A 0-D Tensor. Its value is `values[0]` when `x <= boundaries[0]`,
    137     `values[1]` when `x > boundaries[0]` and `x <= boundaries[1]`, ...,
    and `values[-1]` when `x > boundaries[-1]`.
    139 
    140   Raises:
    141     ValueError: if types of `x` and `boundaries` do not match, or types of all
    142         `values` do not match or
    143         the number of elements in the lists does not match.
    144   """
    145   if len(boundaries) != len(values) - 1:
    146     raise ValueError(
    147         "The length of boundaries should be 1 less than the length of values")
    148   with ops.name_scope(name, "PiecewiseConstant",
    149                       [x, boundaries, values, name]) as name:
    150     x = ops.convert_to_tensor(x)
    151     # Avoid explicit conversion to x's dtype. This could result in faulty
    152     # comparisons, for example if floats are converted to integers.
    153     boundaries = ops.convert_n_to_tensor(boundaries)
    154     for i, b in enumerate(boundaries):
    155       if b.dtype.base_dtype != x.dtype.base_dtype:
    156         # We can promote int32 boundaries to int64 without loss of precision.
    157         # This covers the most common case where the user passes in boundaries
    158         # as an array of Python integers.
    159         if (b.dtype.base_dtype == dtypes.int32 and
    160             x.dtype.base_dtype == dtypes.int64):
    161           b = math_ops.cast(b, x.dtype.base_dtype)
    162           boundaries[i] = b
    163         else:
    164           raise ValueError(
    165               "Boundaries (%s) must have the same dtype as x (%s)." %
    166               (b.dtype.base_dtype, x.dtype.base_dtype))
    167     # TODO(rdipietro): Ensure that boundaries' elements are strictly increasing.
    168     values = ops.convert_n_to_tensor(values)
    169     for v in values[1:]:
    170       if v.dtype.base_dtype != values[0].dtype.base_dtype:
    171         raise ValueError(
    172             "Values must have elements all with the same dtype (%s vs %s)." %
    173             (values[0].dtype.base_dtype, v.dtype.base_dtype))
    174     pred_fn_pairs = []
    175     pred_fn_pairs.append((x <= boundaries[0], lambda: values[0]))
    176     pred_fn_pairs.append((x > boundaries[-1], lambda: values[-1]))
    177     for low, high, v in zip(boundaries[:-1], boundaries[1:], values[1:-1]):
    178       # Need to bind v here; can do this with lambda v=v: ...
    179       pred = (x > low) & (x <= high)
    180       pred_fn_pairs.append((pred, lambda v=v: v))
    181 
    182     # The default isn't needed here because our conditions are mutually
    183     # exclusive and exhaustive, but tf.case requires it.
    184     default = lambda: values[0]
    185     return control_flow_ops.case(pred_fn_pairs, default, exclusive=True)
    186 
    187 
    188 @tf_export("train.polynomial_decay")
    189 def polynomial_decay(learning_rate,
    190                      global_step,
    191                      decay_steps,
    192                      end_learning_rate=0.0001,
    193                      power=1.0,
    194                      cycle=False,
    195                      name=None):
    196   """Applies a polynomial decay to the learning rate.
    197 
    198   It is commonly observed that a monotonically decreasing learning rate, whose
    199   degree of change is carefully chosen, results in a better performing model.
    200   This function applies a polynomial decay function to a provided initial
    201   `learning_rate` to reach an `end_learning_rate` in the given `decay_steps`.
    202 
    203   It requires a `global_step` value to compute the decayed learning rate.  You
    204   can just pass a TensorFlow variable that you increment at each training step.
    205 
    206   The function returns the decayed learning rate.  It is computed as:
    207 
    208   ```python
    209   global_step = min(global_step, decay_steps)
    210   decayed_learning_rate = (learning_rate - end_learning_rate) *
    211                           (1 - global_step / decay_steps) ^ (power) +
    212                           end_learning_rate
    213 
    214   ```
    215 
  If `cycle` is `True`, then a multiple of `decay_steps` is used, the first one
  that is bigger than `global_step`.
    218 
    219   ```python
    220   decay_steps = decay_steps * ceil(global_step / decay_steps)
    221   decayed_learning_rate = (learning_rate - end_learning_rate) *
    222                           (1 - global_step / decay_steps) ^ (power) +
    223                           end_learning_rate
    224 
    225   ```
    226 
    227   Example: decay from 0.1 to 0.01 in 10000 steps using sqrt (i.e. power=0.5):
    228 
    229   ```python
    230   ...
    231   global_step = tf.Variable(0, trainable=False)
    232   starter_learning_rate = 0.1
    233   end_learning_rate = 0.01
    234   decay_steps = 10000
    235   learning_rate = tf.train.polynomial_decay(starter_learning_rate, global_step,
    236                                             decay_steps, end_learning_rate,
    237                                             power=0.5)
    238   # Passing global_step to minimize() will increment it at each step.
    239   learning_step = (
    240       tf.train.GradientDescentOptimizer(learning_rate)
    241       .minimize(...my loss..., global_step=global_step)
    242   )
    243   ```
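
  With the settings above, the resulting schedule can be previewed with a
  minimal pure-Python sketch (the `expected_lr` helper is hypothetical, for
  illustration only):

  ```python
  def expected_lr(step, decay_steps=10000, power=0.5):
    step = min(step, decay_steps)
    return (0.1 - 0.01) * (1 - step / decay_steps) ** power + 0.01

  expected_lr(0)      # => 0.1
  expected_lr(10000)  # => 0.01
  ```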
    244 
    245   Args:
    246     learning_rate: A scalar `float32` or `float64` `Tensor` or a
    247       Python number.  The initial learning rate.
    248     global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
    249       Global step to use for the decay computation.  Must not be negative.
    250     decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
    251       Must be positive.  See the decay computation above.
    252     end_learning_rate: A scalar `float32` or `float64` `Tensor` or a
    253       Python number.  The minimal end learning rate.
    254     power: A scalar `float32` or `float64` `Tensor` or a
    255       Python number.  The power of the polynomial. Defaults to linear, 1.0.
    256     cycle: A boolean, whether or not it should cycle beyond decay_steps.
    257     name: String.  Optional name of the operation. Defaults to
    258       'PolynomialDecay'.
    259 
    260   Returns:
    261     A scalar `Tensor` of the same type as `learning_rate`.  The decayed
    262     learning rate.
    263 
    264   Raises:
    265     ValueError: if `global_step` is not supplied.
    266   """
    267   if global_step is None:
    268     raise ValueError("global_step is required for polynomial_decay.")
    269   with ops.name_scope(
    270       name, "PolynomialDecay",
    271       [learning_rate, global_step, decay_steps, end_learning_rate, power
    272       ]) as name:
    273     learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
    274     dtype = learning_rate.dtype
    275     global_step = math_ops.cast(global_step, dtype)
    276     decay_steps = math_ops.cast(decay_steps, dtype)
    277     end_learning_rate = math_ops.cast(end_learning_rate, dtype)
    278     power = math_ops.cast(power, dtype)
    279     if cycle:
    280       # Find the first multiple of decay_steps that is bigger than global_step.
    281       # If global_step is zero set the multiplier to 1
    282       multiplier = control_flow_ops.cond(
    283           math_ops.equal(global_step, 0), lambda: 1.0,
    284           lambda: math_ops.ceil(global_step / decay_steps))
    285       decay_steps = math_ops.multiply(decay_steps, multiplier)
    286     else:
    287       # Make sure that the global_step used is not bigger than decay_steps.
    288       global_step = math_ops.minimum(global_step, decay_steps)
    289 
    290     p = math_ops.div(global_step, decay_steps)
    291     return math_ops.add(
    292         math_ops.multiply(learning_rate - end_learning_rate,
    293                           math_ops.pow(1 - p, power)),
    294         end_learning_rate,
    295         name=name)
    296 
    297 
    298 @tf_export("train.natural_exp_decay")
    299 def natural_exp_decay(learning_rate,
    300                       global_step,
    301                       decay_steps,
    302                       decay_rate,
    303                       staircase=False,
    304                       name=None):
    305   """Applies natural exponential decay to the initial learning rate.
    306 
    307   When training a model, it is often recommended to lower the learning rate as
    308   the training progresses.  This function applies an exponential decay function
  to a provided initial learning rate.  It requires a `global_step` value to
    310   compute the decayed learning rate.  You can just pass a TensorFlow variable
    311   that you increment at each training step.
    312 
    313   The function returns the decayed learning rate.  It is computed as:
    314 
    315   ```python
  decayed_learning_rate = learning_rate * exp(
      -decay_rate * global_step / decay_steps)
    317   ```
    318 
  Example: decay exponentially with a rate of 0.5:
    320 
    321   ```python
    322   ...
    323   global_step = tf.Variable(0, trainable=False)
    324   learning_rate = 0.1
  decay_steps = 5
  k = 0.5
  learning_rate = tf.train.natural_exp_decay(learning_rate, global_step,
                                             decay_steps, k)
    327 
    328   # Passing global_step to minimize() will increment it at each step.
    329   learning_step = (
    330       tf.train.GradientDescentOptimizer(learning_rate)
    331       .minimize(...my loss..., global_step=global_step)
    332   )
    333   ```
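
  Equivalently, the continuous schedule in this example is (a pure-Python
  sketch for illustration only, using the hypothetical `expected_lr` helper):

  ```python
  import math

  def expected_lr(step, decay_steps=5, k=0.5):
    return 0.1 * math.exp(-k * step / decay_steps)
  ```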
    334 
    335   Args:
    336     learning_rate: A scalar `float32` or `float64` `Tensor` or a
    337       Python number.  The initial learning rate.
    global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
      Global step to use for the decay computation.  Must not be negative.
    decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
      How often to apply decay.
    decay_rate: A scalar `float32` or `float64` `Tensor` or a Python number.
      The decay rate.
    342     staircase: Whether to apply decay in a discrete staircase, as opposed to
    343       continuous, fashion.
    344     name: String.  Optional name of the operation.  Defaults to
      'NaturalExpDecay'.
    346 
    347   Returns:
    348     A scalar `Tensor` of the same type as `learning_rate`.  The decayed
    349     learning rate.
    350 
    351   Raises:
    352     ValueError: if `global_step` is not supplied.
    353   """
    354   if global_step is None:
    355     raise ValueError("global_step is required for natural_exp_decay.")
    356   with ops.name_scope(name, "NaturalExpDecay",
    357                       [learning_rate, global_step, decay_rate]) as name:
    358     learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
    359     dtype = learning_rate.dtype
    360     global_step = math_ops.cast(global_step, dtype)
    361     decay_steps = math_ops.cast(decay_steps, dtype)
    362     decay_rate = math_ops.cast(decay_rate, dtype)
    363     p = global_step / decay_steps
    364     if staircase:
    365       p = math_ops.floor(p)
    366     exponent = math_ops.exp(math_ops.multiply(math_ops.negative(decay_rate), p))
    367     return math_ops.multiply(learning_rate, exponent, name=name)
    368 
    369 
    370 @tf_export("train.inverse_time_decay")
    371 def inverse_time_decay(learning_rate,
    372                        global_step,
    373                        decay_steps,
    374                        decay_rate,
    375                        staircase=False,
    376                        name=None):
    377   """Applies inverse time decay to the initial learning rate.
    378 
    379   When training a model, it is often recommended to lower the learning rate as
    380   the training progresses.  This function applies an inverse decay function
  to a provided initial learning rate.  It requires a `global_step` value to
    382   compute the decayed learning rate.  You can just pass a TensorFlow variable
    383   that you increment at each training step.
    384 
    385   The function returns the decayed learning rate.  It is computed as:
    386 
    387   ```python
  decayed_learning_rate = learning_rate / (1 + decay_rate * global_step /
  decay_steps)
    390   ```
    391 
    392   or, if `staircase` is `True`, as:
    393 
    394   ```python
  decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step /
  decay_steps))
    397   ```
    398 
    399   Example: decay 1/t with a rate of 0.5:
    400 
    401   ```python
    402   ...
    403   global_step = tf.Variable(0, trainable=False)
    404   learning_rate = 0.1
    405   decay_steps = 1.0
    406   decay_rate = 0.5
    407   learning_rate = tf.train.inverse_time_decay(learning_rate, global_step,
    408   decay_steps, decay_rate)
    409 
    410   # Passing global_step to minimize() will increment it at each step.
    411   learning_step = (
    412       tf.train.GradientDescentOptimizer(learning_rate)
    413       .minimize(...my loss..., global_step=global_step)
    414   )
    415   ```
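
  which, for these values, produces the familiar 1/t curve.  A pure-Python
  sketch (illustration only, with a hypothetical `expected_lr` helper):

  ```python
  def expected_lr(step, decay_steps=1.0, decay_rate=0.5):
    return 0.1 / (1 + decay_rate * step / decay_steps)

  expected_lr(0)  # => 0.1
  expected_lr(2)  # => 0.05
  ```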
    416 
    417   Args:
    418     learning_rate: A scalar `float32` or `float64` `Tensor` or a
    419       Python number.  The initial learning rate.
    global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
      Global step to use for the decay computation.  Must not be negative.
    decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
      How often to apply decay.
    decay_rate: A scalar `float32` or `float64` `Tensor` or a Python number.
      The decay rate.
    424     staircase: Whether to apply decay in a discrete staircase, as opposed to
    425       continuous, fashion.
    426     name: String.  Optional name of the operation.  Defaults to
    427       'InverseTimeDecay'.
    428 
    429   Returns:
    430     A scalar `Tensor` of the same type as `learning_rate`.  The decayed
    431     learning rate.
    432 
    433   Raises:
    434     ValueError: if `global_step` is not supplied.
    435   """
    436   if global_step is None:
    437     raise ValueError("global_step is required for inverse_time_decay.")
    438   with ops.name_scope(name, "InverseTimeDecay",
    439                       [learning_rate, global_step, decay_rate]) as name:
    440     learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
    441     dtype = learning_rate.dtype
    442     global_step = math_ops.cast(global_step, dtype)
    443     decay_steps = math_ops.cast(decay_steps, dtype)
    444     decay_rate = math_ops.cast(decay_rate, dtype)
    445     p = global_step / decay_steps
    446     if staircase:
    447       p = math_ops.floor(p)
    448     const = math_ops.cast(constant_op.constant(1), learning_rate.dtype)
    449     denom = math_ops.add(const, math_ops.multiply(decay_rate, p))
    450     return math_ops.div(learning_rate, denom, name=name)
    451 
    452 
    453 @tf_export("train.cosine_decay")
    454 def cosine_decay(learning_rate, global_step, decay_steps, alpha=0.0, name=None):
    455   """Applies cosine decay to the learning rate.
    456 
  See [Loshchilov & Hutter, ICLR2017], SGDR: Stochastic Gradient Descent
    458   with Warm Restarts. https://arxiv.org/abs/1608.03983
    459 
    460   When training a model, it is often recommended to lower the learning rate as
    461   the training progresses.  This function applies a cosine decay function
    462   to a provided initial learning rate.  It requires a `global_step` value to
    463   compute the decayed learning rate.  You can just pass a TensorFlow variable
    464   that you increment at each training step.
    465 
    466   The function returns the decayed learning rate.  It is computed as:
    467   ```python
    468   global_step = min(global_step, decay_steps)
    469   cosine_decay = 0.5 * (1 + cos(pi * global_step / decay_steps))
    470   decayed = (1 - alpha) * cosine_decay + alpha
    471   decayed_learning_rate = learning_rate * decayed
    472   ```
    473 
    474   Example usage:
    475   ```python
    476   decay_steps = 1000
    477   lr_decayed = cosine_decay(learning_rate, global_step, decay_steps)
    478   ```
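
  The resulting schedule can be previewed with a short pure-Python sketch (the
  `expected_lr` helper and the 0.1 base rate are hypothetical, for
  illustration only):

  ```python
  import math

  def expected_lr(step, learning_rate=0.1, decay_steps=1000, alpha=0.0):
    step = min(step, decay_steps)
    cosine = 0.5 * (1 + math.cos(math.pi * step / decay_steps))
    return learning_rate * ((1 - alpha) * cosine + alpha)

  expected_lr(0)     # => 0.1
  expected_lr(1000)  # ~ 0.0 when alpha == 0
  ```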
    479 
    480   Args:
    481     learning_rate: A scalar `float32` or `float64` Tensor or a Python number.
    482       The initial learning rate.
    483     global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
    484       Global step to use for the decay computation.
    485     decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
    486       Number of steps to decay over.
    487     alpha: A scalar `float32` or `float64` Tensor or a Python number.
    488       Minimum learning rate value as a fraction of learning_rate.
    489     name: String. Optional name of the operation.  Defaults to 'CosineDecay'.
    490   Returns:
    491     A scalar `Tensor` of the same type as `learning_rate`.  The decayed
    492     learning rate.
    493   Raises:
    494     ValueError: if `global_step` is not supplied.
    495   """
    496   if global_step is None:
    497     raise ValueError("cosine decay requires global_step")
    498   with ops.name_scope(name, "CosineDecay",
    499                       [learning_rate, global_step]) as name:
    500     learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
    501     dtype = learning_rate.dtype
    502     global_step = math_ops.cast(global_step, dtype)
    503     decay_steps = math_ops.cast(decay_steps, dtype)
    504     global_step = math_ops.minimum(global_step, decay_steps)
    505     completed_fraction = global_step / decay_steps
    506     cosine_decayed = 0.5 * (
    507         1.0 + math_ops.cos(constant_op.constant(math.pi) * completed_fraction))
    508 
    509     decayed = (1 - alpha) * cosine_decayed + alpha
    return math_ops.multiply(learning_rate, decayed, name=name)
    511 
    512 
    513 @tf_export("train.cosine_decay_restarts")
    514 def cosine_decay_restarts(learning_rate,
    515                           global_step,
    516                           first_decay_steps,
    517                           t_mul=2.0,
    518                           m_mul=1.0,
    519                           alpha=0.0,
    520                           name=None):
    521   """Applies cosine decay with restarts to the learning rate.
    522 
  See [Loshchilov & Hutter, ICLR2017], SGDR: Stochastic Gradient Descent
    524   with Warm Restarts. https://arxiv.org/abs/1608.03983
    525 
    526   When training a model, it is often recommended to lower the learning rate as
    527   the training progresses.  This function applies a cosine decay function with
    528   restarts to a provided initial learning rate.  It requires a `global_step`
    529   value to compute the decayed learning rate.  You can just pass a TensorFlow
    530   variable that you increment at each training step.
    531 
    532   The function returns the decayed learning rate while taking into account
    533   possible warm restarts. The learning rate multiplier first decays
    534   from 1 to `alpha` for `first_decay_steps` steps. Then, a warm
    535   restart is performed. Each new warm restart runs for `t_mul` times more steps
    536   and with `m_mul` times smaller initial learning rate.
    537 
    538   Example usage:
    539   ```python
    540   first_decay_steps = 1000
    541   lr_decayed = cosine_decay_restarts(learning_rate, global_step,
    542                                      first_decay_steps)
    543   ```
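
  One way to preview the restart behavior is a rough pure-Python sketch (the
  `expected_lr` helper is hypothetical; it walks the geometrically growing
  periods instead of the closed form used by the implementation):

  ```python
  import math

  def expected_lr(step, learning_rate=0.1, first_decay_steps=1000,
                  t_mul=2.0, m_mul=1.0, alpha=0.0):
    start, period, m_fac = 0.0, float(first_decay_steps), 1.0
    while step >= start + period:  # skip over completed periods
      start += period
      period *= t_mul
      m_fac *= m_mul
    cosine = 0.5 * m_fac * (1 + math.cos(math.pi * (step - start) / period))
    return learning_rate * ((1 - alpha) * cosine + alpha)
  ```

  With `t_mul=2.0`, warm restarts therefore occur after 1000, 3000, 7000, ...
  steps, each period twice as long as the previous one.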
    544 
    545   Args:
    546     learning_rate: A scalar `float32` or `float64` Tensor or a Python number.
    547       The initial learning rate.
    548     global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
    549       Global step to use for the decay computation.
    550     first_decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
    551       Number of steps to decay over.
    552     t_mul: A scalar `float32` or `float64` `Tensor` or a Python number.
      Used to derive the number of iterations in the i-th period.
    554     m_mul: A scalar `float32` or `float64` `Tensor` or a Python number.
      Used to derive the initial learning rate of the i-th period.
    556     alpha: A scalar `float32` or `float64` Tensor or a Python number.
    557       Minimum learning rate value as a fraction of the learning_rate.
    558     name: String. Optional name of the operation.  Defaults to 'SGDRDecay'.
    559   Returns:
    560     A scalar `Tensor` of the same type as `learning_rate`.  The decayed
    561     learning rate.
    562   Raises:
    563     ValueError: if `global_step` is not supplied.
    564   """
    565   if global_step is None:
    566     raise ValueError("cosine decay restarts requires global_step")
    567   with ops.name_scope(name, "SGDRDecay", [learning_rate, global_step]) as name:
    568     learning_rate = ops.convert_to_tensor(
    569         learning_rate, name="initial_learning_rate")
    570     dtype = learning_rate.dtype
    571     global_step = math_ops.cast(global_step, dtype)
    572     first_decay_steps = math_ops.cast(first_decay_steps, dtype)
    573     alpha = math_ops.cast(alpha, dtype)
    574     t_mul = math_ops.cast(t_mul, dtype)
    575     m_mul = math_ops.cast(m_mul, dtype)
    576 
    577     completed_fraction = global_step / first_decay_steps
    578 
    579     def compute_step(completed_fraction, geometric=False):
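      # Period lengths grow geometrically: after i complete periods,
      # first_decay_steps * (1 + t_mul + ... + t_mul**(i-1))
      #   = first_decay_steps * (1 - t_mul**i) / (1 - t_mul)
      # steps have elapsed, so i_restart below solves that sum for i (the
      # t_mul == 1 case degenerates to equal-length periods and a plain floor).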
    580       if geometric:
    581         i_restart = math_ops.floor(
    582             math_ops.log(1.0 - completed_fraction * (1.0 - t_mul)) /
    583             math_ops.log(t_mul))
    584 
    585         sum_r = (1.0 - t_mul**i_restart) / (1.0 - t_mul)
    586         completed_fraction = (completed_fraction - sum_r) / t_mul**i_restart
    587 
    588       else:
    589         i_restart = math_ops.floor(completed_fraction)
    590         completed_fraction = completed_fraction - i_restart
    591 
    592       return i_restart, completed_fraction
    593 
    594     i_restart, completed_fraction = control_flow_ops.cond(
    595         math_ops.equal(t_mul, 1.0),
    596         lambda: compute_step(completed_fraction, geometric=False),
    597         lambda: compute_step(completed_fraction, geometric=True))
    598 
    599     m_fac = m_mul**i_restart
    600     cosine_decayed = 0.5 * m_fac * (
    601         1.0 + math_ops.cos(constant_op.constant(math.pi) * completed_fraction))
    602     decayed = (1 - alpha) * cosine_decayed + alpha
    603 
    604   return math_ops.multiply(learning_rate, decayed, name=name)
    605 
    606 
    607 @tf_export("train.linear_cosine_decay")
    608 def linear_cosine_decay(learning_rate,
    609                         global_step,
    610                         decay_steps,
    611                         num_periods=0.5,
    612                         alpha=0.0,
    613                         beta=0.001,
    614                         name=None):
    615   """Applies linear cosine decay to the learning rate.
    616 
    617   See [Bello et al., ICML2017] Neural Optimizer Search with RL.
    618   https://arxiv.org/abs/1709.07417
    619 
    620   For the idea of warm starts here controlled by `num_periods`,
  see [Loshchilov & Hutter, ICLR2017] SGDR: Stochastic Gradient Descent
    622   with Warm Restarts. https://arxiv.org/abs/1608.03983
    623 
    624   Note that linear cosine decay is more aggressive than cosine decay and
    625   larger initial learning rates can typically be used.
    626 
    627   When training a model, it is often recommended to lower the learning rate as
    628   the training progresses.  This function applies a linear cosine decay function
    629   to a provided initial learning rate.  It requires a `global_step` value to
    630   compute the decayed learning rate.  You can just pass a TensorFlow variable
    631   that you increment at each training step.
    632 
    633   The function returns the decayed learning rate.  It is computed as:
    634   ```python
    635   global_step = min(global_step, decay_steps)
  linear_decay = (decay_steps - global_step) / decay_steps
    637   cosine_decay = 0.5 * (
    638       1 + cos(pi * 2 * num_periods * global_step / decay_steps))
    639   decayed = (alpha + linear_decay) * cosine_decay + beta
    640   decayed_learning_rate = learning_rate * decayed
    641   ```
    642 
    643   Example usage:
    644   ```python
    645   decay_steps = 1000
    646   lr_decayed = linear_cosine_decay(learning_rate, global_step, decay_steps)
    647   ```
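
  A rough pure-Python preview of the schedule (the `expected_lr` helper and
  the 0.1 base rate are hypothetical, for illustration only):

  ```python
  import math

  def expected_lr(step, learning_rate=0.1, decay_steps=1000,
                  num_periods=0.5, alpha=0.0, beta=0.001):
    step = min(step, decay_steps)
    linear = (decay_steps - step) / decay_steps
    cosine = 0.5 * (
        1 + math.cos(math.pi * 2 * num_periods * step / decay_steps))
    return learning_rate * ((alpha + linear) * cosine + beta)
  ```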
    648 
    649   Args:
    650     learning_rate: A scalar `float32` or `float64` Tensor or a Python number.
    651       The initial learning rate.
    652     global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
    653       Global step to use for the decay computation.
    654     decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
    655       Number of steps to decay over.
    656     num_periods: Number of periods in the cosine part of the decay.
    657       See computation above.
    658     alpha: See computation above.
    659     beta: See computation above.
    660     name: String.  Optional name of the operation.  Defaults to
    661       'LinearCosineDecay'.
    662   Returns:
    663     A scalar `Tensor` of the same type as `learning_rate`.  The decayed
    664     learning rate.
    665   Raises:
    666     ValueError: if `global_step` is not supplied.
    667   """
    668   if global_step is None:
    669     raise ValueError("linear cosine decay requires global_step")
    670   with ops.name_scope(name, "LinearCosineDecay",
    671                       [learning_rate, global_step]) as name:
    672     learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
    673     dtype = learning_rate.dtype
    674     global_step = math_ops.cast(global_step, dtype)
    675     decay_steps = math_ops.cast(decay_steps, dtype)
    676     num_periods = math_ops.cast(num_periods, dtype)
    677     global_step = math_ops.minimum(global_step, decay_steps)
    678     alpha = math_ops.cast(alpha, dtype)
    679     beta = math_ops.cast(beta, dtype)
    680 
    681     linear_decayed = (decay_steps - global_step) / decay_steps
    682     completed_fraction = global_step / decay_steps
    683     fraction = 2.0 * num_periods * completed_fraction
    684     cosine_decayed = 0.5 * (
    685         1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction))
    686 
    687     linear_cosine_decayed = (alpha + linear_decayed) * cosine_decayed + beta
    688     return math_ops.multiply(learning_rate, linear_cosine_decayed, name=name)
    689 
    690 
    691 @tf_export("train.noisy_linear_cosine_decay")
    692 def noisy_linear_cosine_decay(learning_rate,
    693                               global_step,
    694                               decay_steps,
    695                               initial_variance=1.0,
    696                               variance_decay=0.55,
    697                               num_periods=0.5,
    698                               alpha=0.0,
    699                               beta=0.001,
    700                               name=None):
    701   """Applies noisy linear cosine decay to the learning rate.
    702 
    703   See [Bello et al., ICML2017] Neural Optimizer Search with RL.
    704   https://arxiv.org/abs/1709.07417
    705 
    706   For the idea of warm starts here controlled by `num_periods`,
  see [Loshchilov & Hutter, ICLR2017] SGDR: Stochastic Gradient Descent
    708   with Warm Restarts. https://arxiv.org/abs/1608.03983
    709 
    710   Note that linear cosine decay is more aggressive than cosine decay and
    711   larger initial learning rates can typically be used.
    712 
    713   When training a model, it is often recommended to lower the learning rate as
    714   the training progresses.  This function applies a noisy linear
    715   cosine decay function to a provided initial learning rate.
    716   It requires a `global_step` value to compute the decayed learning rate.
    717   You can just pass a TensorFlow variable that you increment at each
    718   training step.
    719 
    720   The function returns the decayed learning rate.  It is computed as:
    721   ```python
    722   global_step = min(global_step, decay_steps)
  linear_decay = (decay_steps - global_step) / decay_steps
    724   cosine_decay = 0.5 * (
    725       1 + cos(pi * 2 * num_periods * global_step / decay_steps))
    726   decayed = (alpha + linear_decay + eps_t) * cosine_decay + beta
    727   decayed_learning_rate = learning_rate * decayed
    728   ```
    729   where eps_t is 0-centered gaussian noise with variance
    730   initial_variance / (1 + global_step) ** variance_decay
    731 
    732   Example usage:
    733   ```python
    734   decay_steps = 1000
    735   lr_decayed = noisy_linear_cosine_decay(
    736     learning_rate, global_step, decay_steps)
    737   ```
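
  The noise term `eps_t` above can be previewed on its own (illustration only;
  `noise_std` is a hypothetical helper mirroring the variance formula):

  ```python
  import random

  def noise_std(step, initial_variance=1.0, variance_decay=0.55):
    return (initial_variance / (1 + step) ** variance_decay) ** 0.5

  eps_t = random.gauss(0.0, noise_std(step=100))
  ```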
    738 
    739   Args:
    740     learning_rate: A scalar `float32` or `float64` Tensor or a Python number.
    741       The initial learning rate.
    742     global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
    743       Global step to use for the decay computation.
    744     decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
    745       Number of steps to decay over.
    746     initial_variance: initial variance for the noise. See computation above.
    747     variance_decay: decay for the noise's variance. See computation above.
    748     num_periods: Number of periods in the cosine part of the decay.
    749       See computation above.
    750     alpha: See computation above.
    751     beta: See computation above.
    752     name: String.  Optional name of the operation.  Defaults to
    753       'NoisyLinearCosineDecay'.
    754   Returns:
    755     A scalar `Tensor` of the same type as `learning_rate`.  The decayed
    756     learning rate.
    757   Raises:
    758     ValueError: if `global_step` is not supplied.
    759   """
    760   if global_step is None:
    761     raise ValueError("noisy linear cosine decay requires global_step")
    762   with ops.name_scope(name, "NoisyLinearCosineDecay",
    763                       [learning_rate, global_step]) as name:
    764     learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
    765     dtype = learning_rate.dtype
    766     global_step = math_ops.cast(global_step, dtype)
    767     decay_steps = math_ops.cast(decay_steps, dtype)
    768     global_step = math_ops.minimum(global_step, decay_steps)
    769     initial_variance = math_ops.cast(initial_variance, dtype)
    770     variance_decay = math_ops.cast(variance_decay, dtype)
    771     num_periods = math_ops.cast(num_periods, dtype)
    772     alpha = math_ops.cast(alpha, dtype)
    773     beta = math_ops.cast(beta, dtype)
    774 
    775     linear_decayed = (decay_steps - global_step) / decay_steps
    776     variance = initial_variance / (
    777         math_ops.pow(1.0 + global_step, variance_decay))
    778     std = math_ops.sqrt(variance)
    779     noisy_linear_decayed = (
    780         linear_decayed +
    781         random_ops.random_normal(linear_decayed.shape, stddev=std))
    782 
    783     completed_fraction = global_step / decay_steps
    784     fraction = 2.0 * num_periods * completed_fraction
    785     cosine_decayed = 0.5 * (
    786         1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction))
    787     noisy_linear_cosine_decayed = (
    788         (alpha + noisy_linear_decayed) * cosine_decayed + beta)
    789 
    790     return math_ops.multiply(
    791         learning_rate, noisy_linear_cosine_decayed, name=name)
    792