Home | History | Annotate | Download | only in layers
      1 # Copyright 2015 The TensorFlow Authors. All Rights Reserved.
      2 #
      3 # Licensed under the Apache License, Version 2.0 (the "License");
      4 # you may not use this file except in compliance with the License.
      5 # You may obtain a copy of the License at
      6 #
      7 #     http://www.apache.org/licenses/LICENSE-2.0
      8 #
      9 # Unless required by applicable law or agreed to in writing, software
     10 # distributed under the License is distributed on an "AS IS" BASIS,
     11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 # See the License for the specific language governing permissions and
     13 # limitations under the License.
     14 # ==============================================================================
     15 """Utilities related to FeatureColumn."""
     16 
     17 from __future__ import absolute_import
     18 from __future__ import division
     19 from __future__ import print_function
     20 
     21 import functools
     22 
     23 from tensorflow.contrib.framework.python.framework import experimental
     24 from tensorflow.contrib.framework.python.ops import variables as contrib_variables
     25 from tensorflow.contrib.layers.python.layers import embedding_ops
     26 from tensorflow.contrib.layers.python.layers import feature_column as fc
     27 from tensorflow.contrib.layers.python.layers import layers
     28 from tensorflow.python.framework import dtypes
     29 from tensorflow.python.framework import ops
     30 from tensorflow.python.framework import sparse_tensor as sparse_tensor_py
     31 from tensorflow.python.ops import array_ops
     32 from tensorflow.python.ops import init_ops
     33 from tensorflow.python.ops import math_ops
     34 from tensorflow.python.ops import nn_ops
     35 from tensorflow.python.ops import parsing_ops
     36 from tensorflow.python.ops import sparse_ops
     37 from tensorflow.python.ops import variable_scope
     38 from tensorflow.python.platform import tf_logging as logging
     39 from tensorflow.python.util import nest
     40 
     41 
     42 def _maybe_reshape_input_tensor(tensor, column_name, output_rank):
     43   """Reshape the input tensor by the following rule.
     44 
     45   1. If `output_rank > input_rank + 1`, raise a `ValueError`.
     46   2. If `output_rank == input_rank + 1`, expand the tensor by one dimension.
     47   3. If `output_rank == input_rank`, do nothing.
     48   4. If `output_rank < input_rank`, flatten the inner dimensions of the tensor.
     49 
     50   Args:
     51     tensor: A Tensor or SparseTensor to be reshaped.
     52     column_name: A string name of the feature column for the tensor.
     53     output_rank: the desired rank of the tensor.
     54   Returns:
     55     A reshaped Tensor or SparseTensor.
     56   Raises:
     57     ValueError: if `output_rank > input_rank + 1` for the input tensor.
     58   """
     59   input_rank = tensor.get_shape().ndims
     60 
     61   if input_rank is None and isinstance(tensor, sparse_tensor_py.SparseTensor):
     62     # Try to get the rank of a sparse tensor by its dense_shape's shape.
     63     input_rank = tensor.dense_shape.get_shape().as_list()[0]
     64 
     65   if input_rank is None:
     66     raise ValueError('Error while processing column {}. Rank of input Tensor '
     67                      'can not be None.'.format(column_name))
     68 
     69   if output_rank > input_rank + 1:
     70     raise ValueError('Error while processing column {}. Rank of input Tensor '
     71                      '({}) should be the same as output_rank ({}). For '
     72                      'example, sequence data should typically be 3 '
     73                      'dimensional (rank 3) while non-sequence data is '
     74                      'typically 2 dimensional (rank 2).'.format(
     75                          column_name, input_rank, output_rank))
     76   elif output_rank == input_rank + 1:
     77     # Expand the tensor's shape by 1 dimension.
     78     if isinstance(tensor, sparse_tensor_py.SparseTensor):
     79       output_shape = array_ops.concat([tensor.dense_shape, [1]], 0)
     80       return sparse_ops.sparse_reshape(tensor, output_shape)
     81     else:
     82       reshaped = array_ops.expand_dims(tensor, -1)
     83       # Try to calculate the new shape.
     84       static_shape = tensor.get_shape()
     85       if static_shape is not None and static_shape.dims is not None:
     86         reshaped.set_shape(static_shape.as_list() + [1])
     87       return reshaped
     88   elif output_rank < input_rank:
     89     return layers._inner_flatten(tensor, output_rank)  # pylint: disable=protected-access
     90   else:
     91     return tensor
     92 
     93 
def _input_from_feature_columns(columns_to_tensors,
                                feature_columns,
                                weight_collections,
                                trainable,
                                scope,
                                output_rank,
                                default_name,
                                cols_to_outs=None):
  """Implementation of `input_from(_sequence)_feature_columns`.

  Transforms every distinct column in `feature_columns` (in `key` order),
  turns each into a dense input tensor and concatenates the results along the
  last axis (`output_rank - 1`).

  Args:
    columns_to_tensors: A mapping of features to tensors. A shallow copy is
      taken before it is handed to `_Transformer`, so the name bound in the
      caller keeps pointing at the original mapping.
    feature_columns: An iterable of feature columns. Duplicates are removed
      with `set()` and the remaining columns are processed sorted by `key`.
    weight_collections: Collections passed on when creating the per-column
      variables. When non-empty, `GraphKeys.GLOBAL_VARIABLES` is added to it.
    trainable: Passed through to the per-column layer builders.
    scope: Optional scope for the enclosing `variable_scope`.
    output_rank: Rank of the tensor to build. Transformed tensors are only
      reshaped when this is 3.
    default_name: Default name for the enclosing `variable_scope`.
    cols_to_outs: Optional dict. When given, it is filled in place with
      `column -> output tensor` for every processed column.

  Returns:
    The per-column output tensors concatenated along axis `output_rank - 1`.

  Raises:
    ValueError: If `cols_to_outs` is neither `None` nor a dict, or if the
      fallback `_to_dnn_input_layer` call fails for a column.
  """
  columns_to_tensors = columns_to_tensors.copy()
  check_feature_columns(feature_columns)
  if cols_to_outs is not None and not isinstance(cols_to_outs, dict):
    raise ValueError('cols_to_outs must be a dict unless None')
  with variable_scope.variable_scope(scope,
                                     default_name=default_name,
                                     values=columns_to_tensors.values()):
    output_tensors = []
    transformer = _Transformer(columns_to_tensors)
    if weight_collections:
      # Make sure GLOBAL_VARIABLES is always among the (de-duplicated)
      # collections when the caller supplied custom ones.
      weight_collections = list(set(list(weight_collections) +
                                    [ops.GraphKeys.GLOBAL_VARIABLES]))

    # Sorting by key gives a deterministic processing (and concat) order.
    for column in sorted(set(feature_columns), key=lambda x: x.key):
      with variable_scope.variable_scope(None,
                                         default_name=column.name,
                                         values=columns_to_tensors.values()):
        transformed_tensor = transformer.transform(column)
        if output_rank == 3:
          # `transformed_tensor` may be a nested structure; reshape each leaf.
          transformed_tensor = nest.map_structure(
              functools.partial(
                  _maybe_reshape_input_tensor,
                  column_name=column.name,
                  output_rank=output_rank), transformed_tensor)
        try:
          # First choice: treat the column as an embedding lookup.
          # pylint: disable=protected-access
          arguments = column._deep_embedding_lookup_arguments(
              transformed_tensor)
          output_tensors.append(
              fc._embeddings_from_arguments(  # pylint: disable=protected-access
                  column,
                  arguments,
                  weight_collections,
                  trainable,
                  output_rank=output_rank))

        except NotImplementedError as ee:
          # Fallback: ask the column to build its own DNN input layer. A
          # NotImplementedError raised anywhere in the `try` body above lands
          # here.
          try:
            # pylint: disable=protected-access
            output_tensors.append(column._to_dnn_input_layer(
                transformed_tensor,
                weight_collections,
                trainable,
                output_rank=output_rank))
          except ValueError as e:
            # Report both the fallback failure and the original
            # NotImplementedError, prefixed with the column name.
            raise ValueError('Error creating input layer for column: {}.\n'
                             '{}, {}'.format(column.name, e, ee))
        if cols_to_outs is not None:
          # Record the tensor that was just appended for this column.
          cols_to_outs[column] = output_tensors[-1]
    return array_ops.concat(output_tensors, output_rank - 1)
    153 
    154 
    155 def input_from_feature_columns(columns_to_tensors,
    156                                feature_columns,
    157                                weight_collections=None,
    158                                trainable=True,
    159                                scope=None,
    160                                cols_to_outs=None):
    161   """A tf.contrib.layers style input layer builder based on FeatureColumns.
    162 
    163   Generally a single example in training data is described with feature columns.
    164   At the first layer of the model, this column oriented data should be converted
    165   to a single tensor. Each feature column needs a different kind of operation
    166   during this conversion. For example sparse features need a totally different
    167   handling than continuous features.
    168 
    169   Example:
    170 
    171   ```python
    172     # Building model for training
    173     columns_to_tensor = tf.parse_example(...)
    174     first_layer = input_from_feature_columns(
    175         columns_to_tensors=columns_to_tensor,
    176         feature_columns=feature_columns)
    177     second_layer = fully_connected(inputs=first_layer, ...)
    178     ...
    179   ```
    180 
    181   where feature_columns can be defined as follows:
    182 
    183   ```python
    184     sparse_feature = sparse_column_with_hash_bucket(
    185         column_name="sparse_col", ...)
    186     sparse_feature_emb = embedding_column(sparse_id_column=sparse_feature, ...)
    187     real_valued_feature = real_valued_column(...)
    188     real_valued_buckets = bucketized_column(
    189         source_column=real_valued_feature, ...)
    190 
    191     feature_columns=[sparse_feature_emb, real_valued_buckets]
    192   ```
    193 
    194   Args:
    195     columns_to_tensors: A mapping from feature column to tensors. 'string' key
    196       means a base feature (not-transformed). It can have FeatureColumn as a
    197       key too. That means that FeatureColumn is already transformed by input
    198       pipeline.
    199     feature_columns: A set containing all the feature columns. All items in the
    200       set should be instances of classes derived by FeatureColumn.
    201     weight_collections: List of graph collections to which weights are added.
    202     trainable: If `True` also add variables to the graph collection
    203       `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
    204     scope: Optional scope for variable_scope.
    205     cols_to_outs: Optional dict from feature column to output tensor,
    206       which is concatenated into the returned tensor.
    207 
    208   Returns:
    209     A Tensor which can be consumed by hidden layers in the neural network.
    210 
    211   Raises:
    212     ValueError: if FeatureColumn cannot be consumed by a neural network.
    213   """
    214   return _input_from_feature_columns(columns_to_tensors,
    215                                      feature_columns,
    216                                      weight_collections,
    217                                      trainable,
    218                                      scope,
    219                                      output_rank=2,
    220                                      default_name='input_from_feature_columns',
    221                                      cols_to_outs=cols_to_outs)
    222 
    223 
    224 @experimental
    225 def sequence_input_from_feature_columns(columns_to_tensors,
    226                                         feature_columns,
    227                                         weight_collections=None,
    228                                         trainable=True,
    229                                         scope=None):
    230   """Builds inputs for sequence models from `FeatureColumn`s.
    231 
    232   See documentation for `input_from_feature_columns`. The following types of
    233   `FeatureColumn` are permitted in `feature_columns`: `_OneHotColumn`,
    234   `_EmbeddingColumn`, `_ScatteredEmbeddingColumn`, `_RealValuedColumn`,
    235   `_DataFrameColumn`. In addition, columns in `feature_columns` may not be
    236   constructed using any of the following: `ScatteredEmbeddingColumn`,
    237   `BucketizedColumn`, `CrossedColumn`.
    238 
    239   Args:
    240     columns_to_tensors: A mapping from feature column to tensors. 'string' key
    241       means a base feature (not-transformed). It can have FeatureColumn as a
    242       key too. That means that FeatureColumn is already transformed by input
    243       pipeline.
    244     feature_columns: A set containing all the feature columns. All items in the
    245       set should be instances of classes derived by FeatureColumn.
    246     weight_collections: List of graph collections to which weights are added.
    247     trainable: If `True` also add variables to the graph collection
    248       `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
    249     scope: Optional scope for variable_scope.
    250 
    251   Returns:
    252     A Tensor which can be consumed by hidden layers in the neural network.
    253 
    254   Raises:
    255     ValueError: if FeatureColumn cannot be consumed by a neural network.
    256   """
    257   _check_supported_sequence_columns(feature_columns)
    258   _check_forbidden_sequence_columns(feature_columns)
    259 
    260   return _input_from_feature_columns(
    261       columns_to_tensors,
    262       feature_columns,
    263       weight_collections,
    264       trainable,
    265       scope,
    266       output_rank=3,
    267       default_name='sequence_input_from_feature_columns')
    268 
    269 
    270 def _create_embedding_lookup(column,
    271                              columns_to_tensors,
    272                              embedding_lookup_arguments,
    273                              num_outputs,
    274                              trainable,
    275                              weight_collections):
    276   """Creates variables and returns predictions for linear weights in a model.
    277 
    278   Args:
    279    column: the column we're working on.
    280    columns_to_tensors: a map from column name to tensors.
    281    embedding_lookup_arguments: arguments for embedding lookup.
    282    num_outputs: how many outputs.
    283    trainable: whether the variable we create is trainable.
    284    weight_collections: weights will be placed here.
    285 
    286   Returns:
    287   variables: the created embeddings.
    288   predictions: the computed predictions.
    289   """
    290   with variable_scope.variable_scope(
    291       None, default_name=column.name, values=columns_to_tensors.values()):
    292     variable = contrib_variables.model_variable(
    293         name='weights',
    294         shape=[embedding_lookup_arguments.vocab_size, num_outputs],
    295         dtype=dtypes.float32,
    296         initializer=embedding_lookup_arguments.initializer,
    297         trainable=trainable,
    298         collections=weight_collections)
    299     if fc._is_variable(variable):  # pylint: disable=protected-access
    300       variable = [variable]
    301     else:
    302       variable = variable._get_variable_list()  # pylint: disable=protected-access
    303     predictions = embedding_ops.safe_embedding_lookup_sparse(
    304         variable,
    305         embedding_lookup_arguments.input_tensor,
    306         sparse_weights=embedding_lookup_arguments.weight_tensor,
    307         combiner=embedding_lookup_arguments.combiner,
    308         name=column.name + '_weights')
    309     return variable, predictions
    310 
    311 
    312 def _create_joint_embedding_lookup(columns_to_tensors,
    313                                    embedding_lookup_arguments,
    314                                    num_outputs,
    315                                    trainable,
    316                                    weight_collections):
    317   """Creates an embedding lookup for all columns sharing a single weight."""
    318   for arg in embedding_lookup_arguments:
    319     assert arg.weight_tensor is None, (
    320         'Joint sums for weighted sparse columns are not supported. '
    321         'Please use weighted_sum_from_feature_columns instead.')
    322     assert arg.combiner == 'sum', (
    323         'Combiners other than sum are not supported for joint sums. '
    324         'Please use weighted_sum_from_feature_columns instead.')
    325   assert len(embedding_lookup_arguments) >= 1, (
    326       'At least one column must be in the model.')
    327   prev_size = 0
    328   sparse_tensors = []
    329   for a in embedding_lookup_arguments:
    330     t = a.input_tensor
    331     values = t.values + prev_size
    332     prev_size += a.vocab_size
    333     sparse_tensors.append(
    334         sparse_tensor_py.SparseTensor(t.indices,
    335                                       values,
    336                                       t.dense_shape))
    337   sparse_tensor = sparse_ops.sparse_concat(1, sparse_tensors)
    338   with variable_scope.variable_scope(
    339       None, default_name='linear_weights', values=columns_to_tensors.values()):
    340     variable = contrib_variables.model_variable(
    341         name='weights',
    342         shape=[prev_size, num_outputs],
    343         dtype=dtypes.float32,
    344         initializer=init_ops.zeros_initializer(),
    345         trainable=trainable,
    346         collections=weight_collections)
    347     if fc._is_variable(variable):  # pylint: disable=protected-access
    348       variable = [variable]
    349     else:
    350       variable = variable._get_variable_list()  # pylint: disable=protected-access
    351     predictions = embedding_ops.safe_embedding_lookup_sparse(
    352         variable,
    353         sparse_tensor,
    354         sparse_weights=None,
    355         combiner='sum',
    356         name='_weights')
    357     return variable, predictions
    358 
    359 
    360 def joint_weighted_sum_from_feature_columns(columns_to_tensors,
    361                                             feature_columns,
    362                                             num_outputs,
    363                                             weight_collections=None,
    364                                             trainable=True,
    365                                             scope=None):
    366   """A restricted linear prediction builder based on FeatureColumns.
    367 
    368   As long as all feature columns are unweighted sparse columns this computes the
    369   prediction of a linear model which stores all weights in a single variable.
    370 
    371   Args:
    372     columns_to_tensors: A mapping from feature column to tensors. 'string' key
    373       means a base feature (not-transformed). It can have FeatureColumn as a
    374       key too. That means that FeatureColumn is already transformed by input
    375       pipeline. For example, `inflow` may have handled transformations.
    376     feature_columns: A set containing all the feature columns. All items in the
    377       set should be instances of classes derived from FeatureColumn.
    378     num_outputs: An integer specifying number of outputs. Default value is 1.
    379     weight_collections: List of graph collections to which weights are added.
    380     trainable: If `True` also add variables to the graph collection
    381       `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
    382     scope: Optional scope for variable_scope.
    383 
    384   Returns:
    385     A tuple containing:
    386 
    387     * A Tensor which represents predictions of a linear model.
    388     * A list of Variables storing the weights.
    389     * A Variable which is used for bias.
    390 
    391   Raises:
    392     ValueError: if FeatureColumn cannot be used for linear predictions.
    393 
    394   """
    395   columns_to_tensors = columns_to_tensors.copy()
    396   check_feature_columns(feature_columns)
    397   with variable_scope.variable_scope(
    398       scope,
    399       default_name='joint_weighted_sum_from_feature_columns',
    400       values=columns_to_tensors.values()):
    401     transformer = _Transformer(columns_to_tensors)
    402     embedding_lookup_arguments = []
    403     for column in sorted(set(feature_columns), key=lambda x: x.key):
    404       transformed_tensor = transformer.transform(column)
    405       try:
    406         embedding_lookup_arguments.append(
    407             column._wide_embedding_lookup_arguments(transformed_tensor))   # pylint: disable=protected-access
    408       except NotImplementedError:
    409         raise NotImplementedError('Real-valued columns are not supported. '
    410                                   'Use weighted_sum_from_feature_columns '
    411                                   'instead, or bucketize these columns.')
    412 
    413     variable, predictions_no_bias = _create_joint_embedding_lookup(
    414         columns_to_tensors,
    415         embedding_lookup_arguments,
    416         num_outputs,
    417         trainable,
    418         weight_collections)
    419     bias = contrib_variables.model_variable(
    420         'bias_weight',
    421         shape=[num_outputs],
    422         initializer=init_ops.zeros_initializer(),
    423         trainable=trainable,
    424         collections=_add_variable_collection(weight_collections))
    425     _log_variable(bias)
    426     predictions = nn_ops.bias_add(predictions_no_bias, bias)
    427 
    428     return predictions, variable, bias
    429 
    430 
def weighted_sum_from_feature_columns(columns_to_tensors,
                                      feature_columns,
                                      num_outputs,
                                      weight_collections=None,
                                      trainable=True,
                                      scope=None):
  """A tf.contrib.layers style linear prediction builder based on FeatureColumn.

  Generally a single example in training data is described with feature columns.
  This function generates weighted sum for each num_outputs. Weighted sum refers
  to logits in classification problems. It refers to prediction itself for
  linear regression problems.

  Each distinct column (processed in `key` order) contributes a
  `[-1, num_outputs]` tensor; the contributions are summed and a bias is added.

  Example:

    ```
    # Building model for training
    feature_columns = (
        real_valued_column("my_feature1"),
        ...
    )
    columns_to_tensor = tf.parse_example(...)
    logits = weighted_sum_from_feature_columns(
        columns_to_tensors=columns_to_tensor,
        feature_columns=feature_columns,
        num_outputs=1)
    loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels,
                                                   logits=logits)
    ```

  Args:
    columns_to_tensors: A mapping from feature column to tensors. 'string' key
      means a base feature (not-transformed). It can have FeatureColumn as a
      key too. That means that FeatureColumn is already transformed by input
      pipeline. For example, `inflow` may have handled transformations.
    feature_columns: A set containing all the feature columns. All items in the
      set should be instances of classes derived from FeatureColumn.
    num_outputs: An integer specifying number of outputs (required; there is
      no default).
    weight_collections: List of graph collections to which weights are added.
    trainable: If `True` also add variables to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
    scope: Optional scope for variable_scope.

  Returns:
    A tuple containing:

      * A Tensor which represents predictions of a linear model.
      * A dictionary which maps feature_column to the list of Variable(s)
        holding its weights.
      * A Variable which is used for bias.

  Raises:
    ValueError: if FeatureColumn cannot be used for linear predictions.
  """
  columns_to_tensors = columns_to_tensors.copy()
  check_feature_columns(feature_columns)
  with variable_scope.variable_scope(
      scope,
      default_name='weighted_sum_from_feature_columns',
      values=columns_to_tensors.values()):
    output_tensors = []
    column_to_variable = dict()
    transformer = _Transformer(columns_to_tensors)
    # pylint: disable=protected-access
    for column in sorted(set(feature_columns), key=lambda x: x.key):
      transformed_tensor = transformer.transform(column)
      try:
        # First choice: treat the column as a sparse embedding lookup.
        embedding_lookup_arguments = column._wide_embedding_lookup_arguments(
            transformed_tensor)
        variable, predictions = _create_embedding_lookup(
            column,
            columns_to_tensors,
            embedding_lookup_arguments,
            num_outputs,
            trainable,
            weight_collections)
      except NotImplementedError:
        # Fallback for columns without wide embedding lookup arguments:
        # convert to a dense rank 2 tensor and multiply it with a
        # zero-initialized weight matrix.
        with variable_scope.variable_scope(
            None,
            default_name=column.name,
            values=columns_to_tensors.values()):
          tensor = column._to_dense_tensor(transformed_tensor)
          tensor = _maybe_reshape_input_tensor(
              tensor, column.name, output_rank=2)
          # Kept as a one-element list so that `variable` is a list on both
          # code paths.
          variable = [
              contrib_variables.model_variable(
                  name='weight',
                  shape=[tensor.get_shape()[1], num_outputs],
                  initializer=init_ops.zeros_initializer(),
                  trainable=trainable,
                  collections=weight_collections)
          ]
          predictions = math_ops.matmul(tensor, variable[0], name='matmul')
      except ValueError as ee:
        # Only ValueErrors raised in the `try` body are caught here; they are
        # re-raised with the column name prepended. Errors raised inside the
        # NotImplementedError handler above propagate unchanged.
        raise ValueError('Error creating weighted sum for column: {}.\n'
                         '{}'.format(column.name, ee))
      output_tensors.append(array_ops.reshape(
          predictions, shape=(-1, num_outputs)))
      column_to_variable[column] = variable
      _log_variable(variable)
      fc._maybe_restore_from_checkpoint(column._checkpoint_path(), variable)  # pylint: disable=protected-access
    # pylint: enable=protected-access
    # Sum the per-column contributions, then add the shared bias.
    predictions_no_bias = math_ops.add_n(output_tensors)
    bias = contrib_variables.model_variable(
        'bias_weight',
        shape=[num_outputs],
        initializer=init_ops.zeros_initializer(),
        trainable=trainable,
        collections=_add_variable_collection(weight_collections))
    _log_variable(bias)
    predictions = nn_ops.bias_add(predictions_no_bias, bias)

    return predictions, column_to_variable, bias
    543 
    544 
    545 def parse_feature_columns_from_examples(serialized,
    546                                         feature_columns,
    547                                         name=None,
    548                                         example_names=None):
    549   """Parses tf.Examples to extract tensors for given feature_columns.
    550 
    551   This is a wrapper of 'tf.parse_example'.
    552 
    553   Example:
    554 
    555   ```python
    556   columns_to_tensor = parse_feature_columns_from_examples(
    557       serialized=my_data,
    558       feature_columns=my_features)
    559 
    560   # Where my_features are:
    561   # Define features and transformations
    562   sparse_feature_a = sparse_column_with_keys(
    563       column_name="sparse_feature_a", keys=["AB", "CD", ...])
    564 
    565   embedding_feature_a = embedding_column(
    566       sparse_id_column=sparse_feature_a, dimension=3, combiner="sum")
    567 
    568   sparse_feature_b = sparse_column_with_hash_bucket(
    569       column_name="sparse_feature_b", hash_bucket_size=1000)
    570 
    571   embedding_feature_b = embedding_column(
    572       sparse_id_column=sparse_feature_b, dimension=16, combiner="sum")
    573 
    574   crossed_feature_a_x_b = crossed_column(
    575       columns=[sparse_feature_a, sparse_feature_b], hash_bucket_size=10000)
    576 
    577   real_feature = real_valued_column("real_feature")
    578   real_feature_buckets = bucketized_column(
    579       source_column=real_feature, boundaries=[...])
    580 
    581   my_features = [embedding_feature_b, real_feature_buckets, embedding_feature_a]
    582   ```
    583 
    584   Args:
    585     serialized: A vector (1-D Tensor) of strings, a batch of binary
    586       serialized `Example` protos.
    587     feature_columns: An iterable containing all the feature columns. All items
    588       should be instances of classes derived from _FeatureColumn.
    589     name: A name for this operation (optional).
    590     example_names: A vector (1-D Tensor) of strings (optional), the names of
    591       the serialized protos in the batch.
    592 
    593   Returns:
    594     A `dict` mapping FeatureColumn to `Tensor` and `SparseTensor` values.
    595   """
    596   check_feature_columns(feature_columns)
    597   columns_to_tensors = parsing_ops.parse_example(
    598       serialized=serialized,
    599       features=fc.create_feature_spec_for_parsing(feature_columns),
    600       name=name,
    601       example_names=example_names)
    602 
    603   transformer = _Transformer(columns_to_tensors)
    604   for column in sorted(set(feature_columns), key=lambda x: x.key):
    605     transformer.transform(column)
    606   return columns_to_tensors
    607 
    608 
    609 def transform_features(features, feature_columns):
    610   """Returns transformed features based on features columns passed in.
    611 
    612   Example:
    613 
    614   ```python
    615   columns_to_tensor = transform_features(features=features,
    616                                          feature_columns=feature_columns)
    617 
    618   # Where my_features are:
    619   # Define features and transformations
    620   sparse_feature_a = sparse_column_with_keys(
    621       column_name="sparse_feature_a", keys=["AB", "CD", ...])
    622 
    623   embedding_feature_a = embedding_column(
    624       sparse_id_column=sparse_feature_a, dimension=3, combiner="sum")
    625 
    626   sparse_feature_b = sparse_column_with_hash_bucket(
    627       column_name="sparse_feature_b", hash_bucket_size=1000)
    628 
    629   embedding_feature_b = embedding_column(
    630       sparse_id_column=sparse_feature_b, dimension=16, combiner="sum")
    631 
    632   crossed_feature_a_x_b = crossed_column(
    633       columns=[sparse_feature_a, sparse_feature_b], hash_bucket_size=10000)
    634 
    635   real_feature = real_valued_column("real_feature")
    636   real_feature_buckets = bucketized_column(
    637       source_column=real_feature, boundaries=[...])
    638 
    639   feature_columns = [embedding_feature_b,
    640                      real_feature_buckets,
    641                      embedding_feature_a]
    642   ```
    643 
    644   Args:
    645     features: A dictionary of features.
    646     feature_columns: An iterable containing all the feature columns. All items
    647       should be instances of classes derived from _FeatureColumn.
    648 
    649   Returns:
    650     A `dict` mapping FeatureColumn to `Tensor` and `SparseTensor` values.
    651   """
    652   columns_to_tensor = features.copy()
    653   check_feature_columns(feature_columns)
    654   transformer = _Transformer(columns_to_tensor)
    655   for column in sorted(set(feature_columns), key=lambda x: x.key):
    656     transformer.transform(column)
    657   keys = list(columns_to_tensor.keys())
    658   for k in keys:
    659     if k not in feature_columns:
    660       columns_to_tensor.pop(k)
    661   return columns_to_tensor
    662 
    663 
    664 def parse_feature_columns_from_sequence_examples(
    665     serialized,
    666     context_feature_columns,
    667     sequence_feature_columns,
    668     name=None,
    669     example_name=None):
    670   """Parses tf.SequenceExamples to extract tensors for given `FeatureColumn`s.
    671 
    672   Args:
    673     serialized: A scalar (0-D Tensor) of type string, a single serialized
    674       `SequenceExample` proto.
    675     context_feature_columns: An iterable containing the feature columns for
    676       context features. All items should be instances of classes derived from
    677       `_FeatureColumn`. Can be `None`.
    678     sequence_feature_columns: An iterable containing the feature columns for
    679       sequence features. All items should be instances of classes derived from
    680       `_FeatureColumn`. Can be `None`.
    681     name: A name for this operation (optional).
    682     example_name: A scalar (0-D Tensor) of type string (optional), the names of
    683       the serialized proto.
    684 
    685   Returns:
    686     A tuple consisting of:
    687     context_features: a dict mapping `FeatureColumns` from
    688       `context_feature_columns` to their parsed `Tensors`/`SparseTensor`s.
    689     sequence_features: a dict mapping `FeatureColumns` from
    690       `sequence_feature_columns` to their parsed `Tensors`/`SparseTensor`s.
    691   """
    692   # Sequence example parsing requires a single (scalar) example.
    693   try:
    694     serialized = array_ops.reshape(serialized, [])
    695   except ValueError as e:
    696     raise ValueError(
    697         'serialized must contain as single sequence example. Batching must be '
    698         'done after parsing for sequence examples. Error: {}'.format(e))
    699 
    700   if context_feature_columns is None:
    701     context_feature_columns = []
    702   if sequence_feature_columns is None:
    703     sequence_feature_columns = []
    704 
    705   check_feature_columns(context_feature_columns)
    706   context_feature_spec = fc.create_feature_spec_for_parsing(
    707       context_feature_columns)
    708 
    709   check_feature_columns(sequence_feature_columns)
    710   sequence_feature_spec = fc._create_sequence_feature_spec_for_parsing(  # pylint: disable=protected-access
    711       sequence_feature_columns, allow_missing_by_default=False)
    712 
    713   return parsing_ops.parse_single_sequence_example(serialized,
    714                                                    context_feature_spec,
    715                                                    sequence_feature_spec,
    716                                                    example_name,
    717                                                    name)
    718 
    719 
    720 def _log_variable(variable):
    721   if isinstance(variable, list):
    722     for var in variable:
    723       if fc._is_variable(variable):  # pylint: disable=protected-access
    724         logging.info('Created variable %s, with device=%s', var.name,
    725                      var.device)
    726   elif fc._is_variable(variable):  # pylint: disable=protected-access
    727     logging.info('Created variable %s, with device=%s', variable.name,
    728                  variable.device)
    729 
    730 
    731 def _infer_real_valued_column_for_tensor(name, tensor):
    732   """Creates a real_valued_column for given tensor and name."""
    733   if isinstance(tensor, sparse_tensor_py.SparseTensor):
    734     raise ValueError(
    735         'SparseTensor is not supported for auto detection. Please define '
    736         'corresponding FeatureColumn for tensor {} {}.', name, tensor)
    737 
    738   if not (tensor.dtype.is_integer or tensor.dtype.is_floating):
    739     raise ValueError(
    740         'Non integer or non floating types are not supported for auto detection'
    741         '. Please define corresponding FeatureColumn for tensor {} {}.', name,
    742         tensor)
    743 
    744   shape = tensor.get_shape().as_list()
    745   dimension = 1
    746   for i in range(1, len(shape)):
    747     dimension *= shape[i]
    748   return fc.real_valued_column(name, dimension=dimension, dtype=tensor.dtype)
    749 
    750 
    751 def infer_real_valued_columns(features):
    752   if not isinstance(features, dict):
    753     return [_infer_real_valued_column_for_tensor('', features)]
    754 
    755   feature_columns = []
    756   for key, value in features.items():
    757     feature_columns.append(_infer_real_valued_column_for_tensor(key, value))
    758 
    759   return feature_columns
    760 
    761 
    762 def check_feature_columns(feature_columns):
    763   """Checks the validity of the set of FeatureColumns.
    764 
    765   Args:
    766     feature_columns: An iterable of instances or subclasses of FeatureColumn.
    767 
    768   Raises:
    769     ValueError: If `feature_columns` is a dict.
    770     ValueError: If there are duplicate feature column keys.
    771   """
    772   if isinstance(feature_columns, dict):
    773     raise ValueError('Expected feature_columns to be iterable, found dict.')
    774   seen_keys = set()
    775   for f in feature_columns:
    776     key = f.key
    777     if key in seen_keys:
    778       raise ValueError('Duplicate feature column key found for column: {}. '
    779                        'This usually means that the column is almost identical '
    780                        'to another column, and one must be discarded.'.format(
    781                            f.name))
    782     seen_keys.add(key)
    783 
    784 
    785 class _Transformer(object):
    786   """Handles all the transformations defined by FeatureColumn if needed.
    787 
    788   FeatureColumn specifies how to digest an input column to the network. Some
    789   feature columns require data transformations. This class handles those
    790   transformations if they are not handled already.
    791 
    792   Some features may be used in more than one place. For example, one can use a
    793   bucketized feature by itself and a cross with it. In that case Transformer
    794   should create only one bucketization op instead of multiple ops for each
    795   feature column. To handle re-use of transformed columns, Transformer keeps all
    796   previously transformed columns.
    797 
    798   Example:
    799 
    800   ```python
    801     sparse_feature = sparse_column_with_hash_bucket(...)
    802     real_valued_feature = real_valued_column(...)
    803     real_valued_buckets = bucketized_column(source_column=real_valued_feature,
    804                                             ...)
    805     sparse_x_real = crossed_column(
    806         columns=[sparse_feature, real_valued_buckets], hash_bucket_size=10000)
    807 
    808     columns_to_tensor = tf.parse_example(...)
    809     transformer = Transformer(columns_to_tensor)
    810 
    811     sparse_x_real_tensor = transformer.transform(sparse_x_real)
    812     sparse_tensor = transformer.transform(sparse_feature)
    813     real_buckets_tensor = transformer.transform(real_valued_buckets)
    814   ```
    815   """
    816 
    817   def __init__(self, columns_to_tensors):
    818     """Initializes transfomer.
    819 
    820     Args:
    821       columns_to_tensors: A mapping from feature columns to tensors. 'string'
    822         key means a base feature (not-transformed). It can have FeatureColumn as
    823         a key too. That means that FeatureColumn is already transformed by input
    824         pipeline. For example, `inflow` may have handled transformations.
    825         Transformed features are inserted in columns_to_tensors.
    826     """
    827     self._columns_to_tensors = columns_to_tensors
    828 
    829   def transform(self, feature_column):
    830     """Returns a Tensor which represents given feature_column.
    831 
    832     Args:
    833       feature_column: An instance of FeatureColumn.
    834 
    835     Returns:
    836       A Tensor which represents given feature_column. It may create a new Tensor
    837       or re-use an existing one.
    838 
    839     Raises:
    840       ValueError: if FeatureColumn cannot be handled by this Transformer.
    841     """
    842     logging.debug('Transforming feature_column %s', feature_column)
    843     if feature_column in self._columns_to_tensors:
    844       # Feature_column is already transformed.
    845       return self._columns_to_tensors[feature_column]
    846 
    847     feature_column.insert_transformed_feature(self._columns_to_tensors)
    848 
    849     if feature_column not in self._columns_to_tensors:
    850       raise ValueError('Column {} is not supported.'.format(
    851           feature_column.name))
    852 
    853     return self._columns_to_tensors[feature_column]
    854 
    855 
    856 def _add_variable_collection(weight_collections):
    857   if weight_collections:
    858     weight_collections = list(
    859         set(list(weight_collections) + [ops.GraphKeys.GLOBAL_VARIABLES]))
    860   return weight_collections
    861 
    862 
# TODO(jamieas): remove the following logic once all FeatureColumn types are
# supported for sequences.
# pylint: disable=protected-access
# Column types accepted by `_check_supported_sequence_columns`; a column that
# is not an instance of one of these is rejected with a ValueError.
_SUPPORTED_SEQUENCE_COLUMNS = (fc._OneHotColumn,
                               fc._EmbeddingColumn,
                               fc._RealValuedColumn,
                               fc._RealValuedVarLenColumn)

# Column types rejected by `_check_forbidden_sequence_columns`, which inspects
# the given columns as well as every column they are derived from.
_FORBIDDEN_SEQUENCE_COLUMNS = (fc._ScatteredEmbeddingColumn,
                               fc._BucketizedColumn,
                               fc._CrossedColumn)
    874 
    875 
    876 def _check_supported_sequence_columns(feature_columns):
    877   """Asserts `feature_columns` are in `_SUPPORTED_SEQUENCE_COLUMNS`."""
    878   for col in feature_columns:
    879     if not isinstance(col, _SUPPORTED_SEQUENCE_COLUMNS):
    880       raise ValueError(
    881           'FeatureColumn type {} is not currently supported for sequence data.'.
    882           format(type(col).__name__))
    883 
    884 
    885 def _get_parent_columns(feature_column):
    886   """Returns the tuple of `FeatureColumn`s that `feature_column` depends on."""
    887   if isinstance(feature_column, (fc._WeightedSparseColumn,
    888                                  fc._OneHotColumn,
    889                                  fc._EmbeddingColumn,)):
    890     return (feature_column.sparse_id_column,)
    891   if isinstance(feature_column, (fc._BucketizedColumn,)):
    892     return (feature_column.source_column,)
    893   if isinstance(feature_column, (fc._CrossedColumn)):
    894     return tuple(feature_column.columns)
    895   return tuple()
    896 
    897 
    898 def _gather_feature_columns(feature_columns):
    899   """Returns a list of all ancestor `FeatureColumns` of `feature_columns`."""
    900   gathered = list(feature_columns)
    901   i = 0
    902   while i < len(gathered):
    903     for column in _get_parent_columns(gathered[i]):
    904       if column not in gathered:
    905         gathered.append(column)
    906     i += 1
    907   return gathered
    908 
    909 
    910 def _check_forbidden_sequence_columns(feature_columns):
    911   """Recursively cecks `feature_columns` for `_FORBIDDEN_SEQUENCE_COLUMNS`."""
    912   all_feature_columns = _gather_feature_columns(feature_columns)
    913   for feature_column in all_feature_columns:
    914     if isinstance(feature_column, _FORBIDDEN_SEQUENCE_COLUMNS):
    915       raise ValueError(
    916           'Column {} is of type {}, which is not currently supported for '
    917           'sequences.'.format(feature_column.name,
    918                               type(feature_column).__name__))
    919