# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
     15 """This API defines FeatureColumn abstraction.
     16 
     17 FeatureColumns provide a high level abstraction for ingesting and representing
     18 features. FeatureColumns are also the primary way of encoding features for
     19 canned ${tf.estimator.Estimator}s.
     20 
     21 When using FeatureColumns with `Estimators`, the type of feature column you
     22 should choose depends on (1) the feature type and (2) the model type.
     23 
     24 1. Feature type:
     25 
     26   * Continuous features can be represented by `numeric_column`.
     27   * Categorical features can be represented by any `categorical_column_with_*`
     28   column:
     29     - `categorical_column_with_vocabulary_list`
     30     - `categorical_column_with_vocabulary_file`
     31     - `categorical_column_with_hash_bucket`
     32     - `categorical_column_with_identity`
     33     - `weighted_categorical_column`
     34 
     35 2. Model type:
     36 
     37   * Deep neural network models (`DNNClassifier`, `DNNRegressor`).
     38 
     39     Continuous features can be directly fed into deep neural network models.
     40 
     41       age_column = numeric_column("age")
     42 
     43     To feed sparse features into DNN models, wrap the column with
     44     `embedding_column` or `indicator_column`. `indicator_column` is recommended
     45     for features with only a few possible values. For features with many
     46     possible values, to reduce the size of your model, `embedding_column` is
     47     recommended.
     48 
     49       embedded_dept_column = embedding_column(
     50           categorical_column_with_vocabulary_list(
     51               "department", ["math", "philosphy", ...]), dimension=10)
     52 
     53   * Wide (aka linear) models (`LinearClassifier`, `LinearRegressor`).
     54 
     55     Sparse features can be fed directly into linear models. They behave like an
     56     indicator column but with an efficient implementation.
     57 
     58       dept_column = categorical_column_with_vocabulary_list("department",
     59           ["math", "philosophy", "english"])
     60 
     61     It is recommended that continuous features be bucketized before being
     62     fed into linear models.
     63 
     64       bucketized_age_column = bucketized_column(
     65           source_column=age_column,
     66           boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
     67 
     68     Sparse features can be crossed (also known as conjuncted or combined) in
     69     order to form non-linearities, and then fed into linear models.
     70 
     71       cross_dept_age_column = crossed_column(
     72           columns=["department", bucketized_age_column],
     73           hash_bucket_size=1000)
     74 
     75 Example of building canned `Estimator`s using FeatureColumns:
     76 
     77   ```python
     78   # Define features and transformations
     79   deep_feature_columns = [age_column, embedded_dept_column]
     80   wide_feature_columns = [dept_column, bucketized_age_column,
     81       cross_dept_age_column]
     82 
     83   # Build deep model
     84   estimator = DNNClassifier(
     85       feature_columns=deep_feature_columns,
     86       hidden_units=[500, 250, 50])
     87   estimator.train(...)
     88 
     89   # Or build a wide model
     90   estimator = LinearClassifier(
     91       feature_columns=wide_feature_columns)
     92   estimator.train(...)
     93 
     94   # Or build a wide and deep model!
     95   estimator = DNNLinearCombinedClassifier(
     96       linear_feature_columns=wide_feature_columns,
     97       dnn_feature_columns=deep_feature_columns,
     98       dnn_hidden_units=[500, 250, 50])
     99   estimator.train(...)
    100   ```
    101 
    102 
    103 FeatureColumns can also be transformed into a generic input layer for
    104 custom models using `input_layer`.
    105 
    106 Example of building model using FeatureColumns, this can be used in a
    107 `model_fn` which is given to the {tf.estimator.Estimator}:
    108 
    109   ```python
    110   # Building model via layers
    111 
    112   deep_feature_columns = [age_column, embedded_dept_column]
    113   columns_to_tensor = parse_feature_columns_from_examples(
    114       serialized=my_data,
    115       feature_columns=deep_feature_columns)
    116   first_layer = input_layer(
    117       features=columns_to_tensor,
    118       feature_columns=deep_feature_columns)
    119   second_layer = fully_connected(first_layer, ...)
    120   ```
    121 
    122 NOTE: Functions prefixed with "_" indicate experimental or private parts of
    123 the API subject to change, and should not be relied upon!
    124 """

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import abc
import collections
import math

import numpy as np
import six


from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
from tensorflow.python.framework import tensor_shape
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import check_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import embedding_ops
from tensorflow.python.ops import init_ops
from tensorflow.python.ops import lookup_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import nn_ops
from tensorflow.python.ops import parsing_ops
from tensorflow.python.ops import sparse_ops
from tensorflow.python.ops import string_ops
from tensorflow.python.ops import template
from tensorflow.python.ops import variable_scope
from tensorflow.python.ops import variables
from tensorflow.python.platform import gfile
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.training import checkpoint_utils
from tensorflow.python.util import nest
from tensorflow.python.util.tf_export import tf_export


def _internal_input_layer(features,
                          feature_columns,
                          weight_collections=None,
                          trainable=True,
                          cols_to_vars=None,
                          scope=None):
  """See input_layer. `scope` is a name or variable scope to use."""

  feature_columns = _clean_feature_columns(feature_columns)
  for column in feature_columns:
    if not isinstance(column, _DenseColumn):
      raise ValueError(
          'Items of feature_columns must be a _DenseColumn. '
          'You can wrap a categorical column with an '
          'embedding_column or indicator_column. Given: {}'.format(column))
  weight_collections = list(weight_collections or [])
  if ops.GraphKeys.GLOBAL_VARIABLES not in weight_collections:
    weight_collections.append(ops.GraphKeys.GLOBAL_VARIABLES)
  if ops.GraphKeys.MODEL_VARIABLES not in weight_collections:
    weight_collections.append(ops.GraphKeys.MODEL_VARIABLES)

  # A non-None `scope` allows for variable reuse, when, e.g., this function
  # is wrapped by a `make_template`.
  with variable_scope.variable_scope(
      scope, default_name='input_layer', values=features.values()):
    builder = _LazyBuilder(features)
    output_tensors = []
    ordered_columns = []
    for column in sorted(feature_columns, key=lambda x: x.name):
      ordered_columns.append(column)
      with variable_scope.variable_scope(
          None, default_name=column._var_scope_name):  # pylint: disable=protected-access
        tensor = column._get_dense_tensor(  # pylint: disable=protected-access
            builder,
            weight_collections=weight_collections,
            trainable=trainable)
        num_elements = column._variable_shape.num_elements()  # pylint: disable=protected-access
        batch_size = array_ops.shape(tensor)[0]
        output_tensors.append(
            array_ops.reshape(tensor, shape=(batch_size, num_elements)))
        if cols_to_vars is not None:
          # Retrieve any variables created (some _DenseColumns don't create
          # variables, in which case an empty list is returned).
          cols_to_vars[column] = ops.get_collection(
              ops.GraphKeys.GLOBAL_VARIABLES,
              scope=variable_scope.get_variable_scope().name)
    _verify_static_batch_size_equality(output_tensors, ordered_columns)
    return array_ops.concat(output_tensors, 1)


@tf_export('feature_column.input_layer')
def input_layer(features,
                feature_columns,
                weight_collections=None,
                trainable=True,
                cols_to_vars=None):
  """Returns a dense `Tensor` as an input layer based on `feature_columns`.

  Generally a single example in training data is described with FeatureColumns.
  At the first layer of the model, this column-oriented data should be
  converted to a single `Tensor`.

  Example:

  ```python
  price = numeric_column('price')
  keywords_embedded = embedding_column(
      categorical_column_with_hash_bucket("keywords", 10000), dimension=16)
  columns = [price, keywords_embedded, ...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  dense_tensor = input_layer(features, columns)
  for units in [128, 64, 32]:
    dense_tensor = tf.layers.dense(dense_tensor, units, tf.nn.relu)
  prediction = tf.layers.dense(dense_tensor, 1)
  ```

  Args:
    features: A mapping from key to tensors. `_FeatureColumn`s look up via these
      keys. For example `numeric_column('price')` will look at 'price' key in
      this dict. Values can be a `SparseTensor` or a `Tensor` depending on the
      corresponding `_FeatureColumn`.
    feature_columns: An iterable containing the FeatureColumns to use as inputs
      to your model. All items should be instances of classes derived from
      `_DenseColumn` such as `numeric_column`, `embedding_column`,
      `bucketized_column`, `indicator_column`. If you have categorical features,
      you can wrap them with an `embedding_column` or `indicator_column`.
    weight_collections: A list of collection names to which the Variable will be
      added. Note that variables will also be added to collections
      `tf.GraphKeys.GLOBAL_VARIABLES` and `ops.GraphKeys.MODEL_VARIABLES`.
    trainable: If `True` also add the variable to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
    cols_to_vars: If not `None`, must be a dictionary that will be filled with a
      mapping from `_FeatureColumn` to list of `Variable`s.  For example, after
      the call, we might have cols_to_vars =
      {_EmbeddingColumn(
        categorical_column=_HashedCategoricalColumn(
          key='sparse_feature', hash_bucket_size=5, dtype=tf.string),
        dimension=10): [<tf.Variable 'some_variable:0' shape=(5, 10)>,
                        <tf.Variable 'some_variable:1' shape=(5, 10)>]}
      If a column creates no variables, its value will be an empty list.

  Returns:
    A `Tensor` which represents the input layer of a model. Its shape
    is (batch_size, first_layer_dimension) and its dtype is `float32`.
    first_layer_dimension is determined based on given `feature_columns`.

  Raises:
    ValueError: if an item in `feature_columns` is not a `_DenseColumn`.
  """
  return _internal_input_layer(features, feature_columns, weight_collections,
                               trainable, cols_to_vars)


# TODO(akshayka): InputLayer should be a subclass of Layer, and it
# should implement the logic in input_layer using Layer's build-and-call
# paradigm; input_layer should create an instance of InputLayer and
# return the result of invoking its apply method, just as functional layers do.
class InputLayer(object):
  """An object-oriented version of `input_layer` that reuses variables.
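
  A minimal usage sketch (the `price` and `keywords_embedded` columns are
  assumed to be defined as in the `input_layer` example above; this class
  wraps `input_layer` in a template so that repeated calls share the same
  variables):

  ```python
  input_layer_fn = InputLayer([price, keywords_embedded])
  dense_tensor = input_layer_fn(features)        # Creates the variables.
  other_dense_tensor = input_layer_fn(features)  # Reuses the same variables.
  ```
  """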

  def __init__(self,
               feature_columns,
               weight_collections=None,
               trainable=True,
               cols_to_vars=None):
    """See `input_layer`."""

    self._feature_columns = feature_columns
    self._weight_collections = weight_collections
    self._trainable = trainable
    self._cols_to_vars = cols_to_vars
    self._input_layer_template = template.make_template(
        'feature_column_input_layer',
        _internal_input_layer,
        create_scope_now_=True)
    self._scope = self._input_layer_template.variable_scope

  def __call__(self, features):
    return self._input_layer_template(
        features=features,
        feature_columns=self._feature_columns,
        weight_collections=self._weight_collections,
        trainable=self._trainable,
        cols_to_vars=None,
        scope=self._scope)

  @property
  def non_trainable_variables(self):
    return self._input_layer_template.non_trainable_variables

  @property
  def non_trainable_weights(self):
    return self._input_layer_template.non_trainable_weights

  @property
  def trainable_variables(self):
    return self._input_layer_template.trainable_variables

  @property
  def trainable_weights(self):
    return self._input_layer_template.trainable_weights

  @property
  def variables(self):
    return self._input_layer_template.variables

  @property
  def weights(self):
    return self._input_layer_template.weights


@tf_export('feature_column.linear_model')
def linear_model(features,
                 feature_columns,
                 units=1,
                 sparse_combiner='sum',
                 weight_collections=None,
                 trainable=True,
                 cols_to_vars=None):
  """Returns a linear prediction `Tensor` based on given `feature_columns`.

  This function generates a weighted sum based on output dimension `units`.
  Weighted sum refers to logits in classification problems. It refers to the
  prediction itself for linear regression problems.

  Note on supported columns: `linear_model` treats categorical columns as
  `indicator_column`s while `input_layer` explicitly requires wrapping each
  of them with an `embedding_column` or an `indicator_column`.

  Example:

  ```python
  price = numeric_column('price')
  price_buckets = bucketized_column(price, boundaries=[0., 10., 100., 1000.])
  keywords = categorical_column_with_hash_bucket("keywords", 10000)
  keywords_price = crossed_column(['keywords', price_buckets], ...)
  columns = [price_buckets, keywords, keywords_price, ...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  prediction = linear_model(features, columns)
  ```

  Args:
    features: A mapping from key to tensors. `_FeatureColumn`s look up via these
      keys. For example `numeric_column('price')` will look at 'price' key in
      this dict. Values are `Tensor` or `SparseTensor` depending on the
      corresponding `_FeatureColumn`.
    feature_columns: An iterable containing the FeatureColumns to use as inputs
      to your model. All items should be instances of classes derived from
      `_FeatureColumn`.
    units: An integer, dimensionality of the output space. Default value is 1.
    sparse_combiner: A string specifying how to reduce if a sparse column is
      multivalent. Currently "mean", "sqrtn" and "sum" are supported, with "sum"
      the default. "sqrtn" often achieves good accuracy, in particular with
      bag-of-words columns. Each sparse column is combined independently.
        * "sum": do not normalize features in the column
        * "mean": do l1 normalization on features in the column
        * "sqrtn": do l2 normalization on features in the column
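      As a sketch of the arithmetic: if one example's multivalent column
      contains three ids (each appearing once) whose learned weights are
      w1, w2 and w3, then with `units=1` the column contributes
      w1 + w2 + w3 to the prediction under "sum",
      (w1 + w2 + w3) / 3 under "mean", and
      (w1 + w2 + w3) / sqrt(3) under "sqrtn".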
    weight_collections: A list of collection names to which the Variable will be
      added. Note that variables will also be added to collections
      `tf.GraphKeys.GLOBAL_VARIABLES` and `ops.GraphKeys.MODEL_VARIABLES`.
    trainable: If `True` also add the variable to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
    cols_to_vars: If not `None`, must be a dictionary that will be filled with a
      mapping from `_FeatureColumn` to associated list of `Variable`s.  For
      example, after the call, we might have cols_to_vars = {
        _NumericColumn(
          key='numeric_feature1', shape=(1,)):
        [<tf.Variable 'linear_model/price2/weights:0' shape=(1, 1)>],
        'bias': [<tf.Variable 'linear_model/bias_weights:0' shape=(1,)>],
        _NumericColumn(
          key='numeric_feature2', shape=(2,)):
        [<tf.Variable 'linear_model/price1/weights:0' shape=(2, 1)>]}
      If a column creates no variables, its value will be an empty list. Note
      that cols_to_vars will also contain a string key 'bias' that maps to a
      list of Variables.

  Returns:
    A `Tensor` which represents predictions/logits of a linear model. Its shape
    is (batch_size, units) and its dtype is `float32`.

  Raises:
    ValueError: if an item in `feature_columns` is neither a `_DenseColumn`
      nor `_CategoricalColumn`.
  """
  feature_columns = _clean_feature_columns(feature_columns)
  for column in feature_columns:
    if not isinstance(column, (_DenseColumn, _CategoricalColumn)):
      raise ValueError('Items of feature_columns must be either a _DenseColumn '
                       'or _CategoricalColumn. Given: {}'.format(column))
  weight_collections = list(weight_collections or [])
  if ops.GraphKeys.GLOBAL_VARIABLES not in weight_collections:
    weight_collections.append(ops.GraphKeys.GLOBAL_VARIABLES)
  if ops.GraphKeys.MODEL_VARIABLES not in weight_collections:
    weight_collections.append(ops.GraphKeys.MODEL_VARIABLES)
  with variable_scope.variable_scope(
      None, default_name='linear_model', values=features.values()):
    weighted_sums = []
    ordered_columns = []
    builder = _LazyBuilder(features)
    for column in sorted(feature_columns, key=lambda x: x.name):
      with variable_scope.variable_scope(
          None, default_name=column._var_scope_name):  # pylint: disable=protected-access
        ordered_columns.append(column)
        weighted_sum = _create_weighted_sum(
            column=column,
            builder=builder,
            units=units,
            sparse_combiner=sparse_combiner,
            weight_collections=weight_collections,
            trainable=trainable)
        weighted_sums.append(weighted_sum)
        if cols_to_vars is not None:
          # Retrieve the variables created.
          cols_to_vars[column] = ops.get_collection(
              ops.GraphKeys.GLOBAL_VARIABLES,
              scope=variable_scope.get_variable_scope().name)
    _verify_static_batch_size_equality(weighted_sums, ordered_columns)
    predictions_no_bias = math_ops.add_n(
        weighted_sums, name='weighted_sum_no_bias')
    bias = variable_scope.get_variable(
        'bias_weights',
        shape=[units],
        initializer=init_ops.zeros_initializer(),
        trainable=trainable,
        collections=weight_collections)
    predictions = nn_ops.bias_add(
        predictions_no_bias, bias, name='weighted_sum')
    if cols_to_vars is not None:
      # Add the bias to cols_to_vars as well, converting the Variable or
      # PartitionedVariable to a list of Variables.
      if isinstance(bias, variables.Variable):
        cols_to_vars['bias'] = [bias]
      else:  # Must be a PartitionedVariable.
        cols_to_vars['bias'] = list(bias)
    return predictions


def _transform_features(features, feature_columns):
  """Returns transformed features based on the feature columns passed in.

  Note that you most likely will not need to use this function. Please check
  `input_layer` and `linear_model` to see whether they satisfy your use case.

  Example:

  ```python
  # Define features and transformations
  crosses_a_x_b = crossed_column(
      columns=["sparse_feature_a", "sparse_feature_b"], hash_bucket_size=10000)
  price_buckets = bucketized_column(
      source_column=numeric_column("price"), boundaries=[...])

  columns = [crosses_a_x_b, price_buckets]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  transformed = _transform_features(features=features, feature_columns=columns)

  assertCountEqual(columns, transformed.keys())
  ```

  Args:
    features: A mapping from key to tensors. `_FeatureColumn`s look up via these
      keys. For example `numeric_column('price')` will look at 'price' key in
      this dict. Values can be a `SparseTensor` or a `Tensor` depending on the
      corresponding `_FeatureColumn`.
    feature_columns: An iterable containing all the `_FeatureColumn`s.

  Returns:
    A `dict` mapping `_FeatureColumn` to `Tensor` and `SparseTensor` values.
  """
  feature_columns = _clean_feature_columns(feature_columns)
  outputs = {}
  with ops.name_scope(
      None, default_name='transform_features', values=features.values()):
    builder = _LazyBuilder(features)
    for column in sorted(feature_columns, key=lambda x: x.name):
      with ops.name_scope(None, default_name=column.name):
        outputs[column] = builder.get(column)
  return outputs


@tf_export('feature_column.make_parse_example_spec')
def make_parse_example_spec(feature_columns):
  """Creates a parsing spec dictionary from the input feature_columns.

  The returned dictionary can be used as the `features` argument to
  `tf.parse_example`.

  Typical usage example:

  ```python
  # Define features and transformations
  feature_a = categorical_column_with_vocabulary_file(...)
  feature_b = numeric_column(...)
  feature_c_bucketized = bucketized_column(numeric_column("feature_c"), ...)
  feature_a_x_feature_c = crossed_column(
      columns=["feature_a", feature_c_bucketized], ...)

  feature_columns = set(
      [feature_b, feature_c_bucketized, feature_a_x_feature_c])
  features = tf.parse_example(
      serialized=serialized_examples,
      features=make_parse_example_spec(feature_columns))
  ```

  For the above example, make_parse_example_spec would return the dict:

  ```python
  {
      "feature_a": parsing_ops.VarLenFeature(tf.string),
      "feature_b": parsing_ops.FixedLenFeature([1], dtype=tf.float32),
      "feature_c": parsing_ops.FixedLenFeature([1], dtype=tf.float32)
  }
  ```

  Args:
    feature_columns: An iterable containing all feature columns. All items
      should be instances of classes derived from `_FeatureColumn`.

  Returns:
    A dict mapping each feature key to a `FixedLenFeature` or `VarLenFeature`
    value.

  Raises:
    ValueError: If any of the given `feature_columns` is not a `_FeatureColumn`
      instance.
  """
  result = {}
  for column in feature_columns:
    if not isinstance(column, _FeatureColumn):
      raise ValueError(
          'All feature_columns must be _FeatureColumn instances. '
          'Given: {}'.format(column))
    config = column._parse_example_spec  # pylint: disable=protected-access
    for key, value in six.iteritems(config):
      if key in result and value != result[key]:
        raise ValueError(
            'feature_columns contain different parse_spec for key '
            '{}. Given {} and {}'.format(key, value, result[key]))
    result.update(config)
  return result


@tf_export('feature_column.embedding_column')
def embedding_column(
    categorical_column, dimension, combiner='mean', initializer=None,
    ckpt_to_load_from=None, tensor_name_in_ckpt=None, max_norm=None,
    trainable=True):
  """`_DenseColumn` that converts from sparse, categorical input.

  Use this when your inputs are sparse, but you want to convert them to a dense
  representation (e.g., to feed to a DNN).

  Inputs must be a `_CategoricalColumn` created by any of the
  `categorical_column_*` functions. Here is an example of using
  `embedding_column` with `DNNClassifier`:

  ```python
  video_id = categorical_column_with_identity(
      key='video_id', num_buckets=1000000, default_value=0)
  columns = [embedding_column(video_id, 9),...]

  estimator = tf.estimator.DNNClassifier(feature_columns=columns, ...)

  label_column = ...
  def input_fn():
    features = tf.parse_example(
        ..., features=make_parse_example_spec(columns + [label_column]))
    labels = features.pop(label_column.name)
    return features, labels

  estimator.train(input_fn=input_fn, steps=100)
  ```

  Here is an example using `embedding_column` with model_fn:

  ```python
  def model_fn(features, ...):
    video_id = categorical_column_with_identity(
        key='video_id', num_buckets=1000000, default_value=0)
    columns = [embedding_column(video_id, 9),...]
    dense_tensor = input_layer(features, columns)
    # Form DNN layers, calculate loss, and return EstimatorSpec.
    ...
  ```
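
  A custom `initializer` can also be supplied; a minimal sketch (the stddev
  value here is an illustrative assumption, not a recommendation):

  ```python
  columns = [embedding_column(
      video_id, 9,
      initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.02))]
  ```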

  Args:
    categorical_column: A `_CategoricalColumn` created by a
      `categorical_column_with_*` function. This column produces the sparse IDs
      that are inputs to the embedding lookup.
    dimension: An integer specifying dimension of the embedding, must be > 0.
    combiner: A string specifying how to reduce if there are multiple entries
      in a single row. Currently 'mean', 'sqrtn' and 'sum' are supported, with
      'mean' the default. 'sqrtn' often achieves good accuracy, in particular
      with bag-of-words columns. Each of these can be thought of as an example
      level normalization on the column. For more information, see
      `tf.embedding_lookup_sparse`.
    initializer: A variable initializer function to be used in embedding
      variable initialization. If not specified, defaults to
      `tf.truncated_normal_initializer` with mean `0.0` and standard deviation
      `1/sqrt(dimension)`.
    ckpt_to_load_from: String representing checkpoint name/pattern from which to
      restore column weights. Required if `tensor_name_in_ckpt` is not `None`.
    tensor_name_in_ckpt: Name of the `Tensor` in `ckpt_to_load_from` from
      which to restore the column weights. Required if `ckpt_to_load_from` is
      not `None`.
    max_norm: If not `None`, embedding values are l2-normalized to this value.
    trainable: Whether or not the embedding is trainable. Default is True.

  Returns:
    `_DenseColumn` that converts from sparse input.

  Raises:
    ValueError: if `dimension` not > 0.
    ValueError: if exactly one of `ckpt_to_load_from` and `tensor_name_in_ckpt`
      is specified.
    ValueError: if `initializer` is specified and is not callable.
    RuntimeError: If eager execution is enabled.
  """
  if (dimension is None) or (dimension < 1):
    raise ValueError('Invalid dimension {}.'.format(dimension))
  if (ckpt_to_load_from is None) != (tensor_name_in_ckpt is None):
    raise ValueError('Must specify both `ckpt_to_load_from` and '
                     '`tensor_name_in_ckpt` or none of them.')

  if (initializer is not None) and (not callable(initializer)):
    raise ValueError('initializer must be callable if specified. '
                     'Embedding of column_name: {}'.format(
                         categorical_column.name))
  if initializer is None:
    initializer = init_ops.truncated_normal_initializer(
        mean=0.0, stddev=1 / math.sqrt(dimension))

  return _EmbeddingColumn(
      categorical_column=categorical_column,
      dimension=dimension,
      combiner=combiner,
      initializer=initializer,
      ckpt_to_load_from=ckpt_to_load_from,
      tensor_name_in_ckpt=tensor_name_in_ckpt,
      max_norm=max_norm,
      trainable=trainable)


@tf_export('feature_column.shared_embedding_columns')
def shared_embedding_columns(
    categorical_columns, dimension, combiner='mean', initializer=None,
    shared_embedding_collection_name=None, ckpt_to_load_from=None,
    tensor_name_in_ckpt=None, max_norm=None, trainable=True):
  """List of dense columns that convert from sparse, categorical input.

  This is similar to `embedding_column`, except that it produces a list of
  embedding columns that share the same embedding weights.

  Use this when your inputs are sparse and of the same type (e.g. watched and
  impression video IDs that share the same vocabulary), and you want to convert
  them to a dense representation (e.g., to feed to a DNN).

  Inputs must be a list of categorical columns created by any of the
  `categorical_column_*` functions. They must all be of the same type and have
  the same arguments except `key`. E.g. they can be
  `categorical_column_with_vocabulary_file` with the same `vocabulary_file`.
  Some or all columns could also be `weighted_categorical_column`.

  Here is an example embedding of two features for a DNNClassifier model:

  ```python
  watched_video_id = categorical_column_with_vocabulary_file(
      'watched_video_id', video_vocabulary_file, video_vocabulary_size)
  impression_video_id = categorical_column_with_vocabulary_file(
      'impression_video_id', video_vocabulary_file, video_vocabulary_size)
  columns = shared_embedding_columns(
      [watched_video_id, impression_video_id], dimension=10)

  estimator = tf.estimator.DNNClassifier(feature_columns=columns, ...)

  label_column = ...
  def input_fn():
    features = tf.parse_example(
        ..., features=make_parse_example_spec(columns + [label_column]))
    labels = features.pop(label_column.name)
    return features, labels

  estimator.train(input_fn=input_fn, steps=100)
  ```

  Here is an example using `shared_embedding_columns` with model_fn:

  ```python
  def model_fn(features, ...):
    watched_video_id = categorical_column_with_vocabulary_file(
        'watched_video_id', video_vocabulary_file, video_vocabulary_size)
    impression_video_id = categorical_column_with_vocabulary_file(
        'impression_video_id', video_vocabulary_file, video_vocabulary_size)
    columns = shared_embedding_columns(
        [watched_video_id, impression_video_id], dimension=10)
    dense_tensor = input_layer(features, columns)
    # Form DNN layers, calculate loss, and return EstimatorSpec.
    ...
  ```

  Args:
    categorical_columns: List of categorical columns created by a
      `categorical_column_with_*` function. These columns produce the sparse IDs
      that are inputs to the embedding lookup. All columns must be of the same
      type and have the same arguments except `key`. E.g. they can be
      `categorical_column_with_vocabulary_file` with the same `vocabulary_file`.
      Some or all columns could also be `weighted_categorical_column`.
    dimension: An integer specifying dimension of the embedding, must be > 0.
    combiner: A string specifying how to reduce if there are multiple entries
      in a single row. Currently 'mean', 'sqrtn' and 'sum' are supported, with
      'mean' the default. 'sqrtn' often achieves good accuracy, in particular
      with bag-of-words columns. Each of these can be thought of as an example
      level normalization on the column. For more information, see
      `tf.embedding_lookup_sparse`.
    initializer: A variable initializer function to be used in embedding
      variable initialization. If not specified, defaults to
      `tf.truncated_normal_initializer` with mean `0.0` and standard deviation
      `1/sqrt(dimension)`.
    shared_embedding_collection_name: Optional name of the collection where
      shared embedding weights are added. If not given, a reasonable name will
      be chosen based on the names of `categorical_columns`. This is also used
      in `variable_scope` when creating shared embedding weights.
    ckpt_to_load_from: String representing checkpoint name/pattern from which to
      restore column weights. Required if `tensor_name_in_ckpt` is not `None`.
    tensor_name_in_ckpt: Name of the `Tensor` in `ckpt_to_load_from` from
      which to restore the column weights. Required if `ckpt_to_load_from` is
      not `None`.
    max_norm: If not `None`, embedding values are l2-normalized to this value.
    trainable: Whether or not the embedding is trainable. Default is True.

  Returns:
    A list of dense columns that converts from sparse input. The order of
    results follows the ordering of `categorical_columns`.

  Raises:
    ValueError: if `dimension` not > 0.
    ValueError: if any of the given `categorical_columns` is of different type
      or has different arguments than the others.
    ValueError: if exactly one of `ckpt_to_load_from` and `tensor_name_in_ckpt`
      is specified.
    ValueError: if `initializer` is specified and is not callable.
  """
  if (dimension is None) or (dimension < 1):
    raise ValueError('Invalid dimension {}.'.format(dimension))
  if (ckpt_to_load_from is None) != (tensor_name_in_ckpt is None):
    raise ValueError('Must specify both `ckpt_to_load_from` and '
                     '`tensor_name_in_ckpt` or none of them.')

  if (initializer is not None) and (not callable(initializer)):
    raise ValueError('initializer must be callable if specified.')
  if initializer is None:
    initializer = init_ops.truncated_normal_initializer(
        mean=0.0, stddev=1. / math.sqrt(dimension))

  # Sort the columns so the default collection name is deterministic even if
  # the user passes columns from an unsorted collection, such as dict.values().
  sorted_columns = sorted(categorical_columns, key=lambda x: x.name)

  c0 = sorted_columns[0]
  if not isinstance(c0, _CategoricalColumn):
    raise ValueError(
        'All categorical_columns must be subclasses of _CategoricalColumn. '
        'Given: {}, of type: {}'.format(c0, type(c0)))
  if isinstance(c0, _WeightedCategoricalColumn):
    c0 = c0.categorical_column
  for c in sorted_columns[1:]:
    if isinstance(c, _WeightedCategoricalColumn):
      c = c.categorical_column
    if not isinstance(c, type(c0)):
      raise ValueError(
          'To use shared_embedding_column, all categorical_columns must have '
          'the same type, or be weighted_categorical_column of the same type. '
          'Given column: {} of type: {} does not match given column: {} of '
          'type: {}'.format(c0, type(c0), c, type(c)))

  if not shared_embedding_collection_name:
    shared_embedding_collection_name = '_'.join(c.name for c in sorted_columns)
    shared_embedding_collection_name += '_shared_embedding'

  result = []
  for column in categorical_columns:
    result.append(_SharedEmbeddingColumn(
        categorical_column=column,
        dimension=dimension,
        combiner=combiner,
        initializer=initializer,
        shared_embedding_collection_name=shared_embedding_collection_name,
        ckpt_to_load_from=ckpt_to_load_from,
        tensor_name_in_ckpt=tensor_name_in_ckpt,
        max_norm=max_norm,
        trainable=trainable))
  return result


@tf_export('feature_column.numeric_column')
def numeric_column(key,
                   shape=(1,),
                   default_value=None,
                   dtype=dtypes.float32,
                   normalizer_fn=None):
  """Represents real valued or numerical features.

  Example:

  ```python
  price = numeric_column('price')
  columns = [price, ...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  dense_tensor = input_layer(features, columns)

  # or
  bucketized_price = bucketized_column(price, boundaries=[...])
  columns = [bucketized_price, ...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)
  ```
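
  A `normalizer_fn` can be supplied to rescale parsed values; a minimal
  sketch (the shift and scale constants are illustrative assumptions):

  ```python
  price_normalized = numeric_column(
      'price', normalizer_fn=lambda x: (x - 3.0) / 4.2)
  ```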

  Args:
    key: A unique string identifying the input feature. It is used as the
      column name and the dictionary key for feature parsing configs, feature
      `Tensor` objects, and feature columns.
    shape: An iterable of integers specifying the shape of the `Tensor`. An
      integer can be given, which means a single dimension `Tensor` with given
      width. The `Tensor` representing the column will have the shape of
      [batch_size] + `shape`.
    default_value: A single value compatible with `dtype` or an iterable of
      values compatible with `dtype` which the column takes on during
      `tf.Example` parsing if data is missing. A default value of `None` will
      cause `tf.parse_example` to fail if an example does not contain this
      column. If a single value is provided, the same value will be applied as
      the default value for every item. If an iterable of values is provided,
      the shape of the `default_value` should be equal to the given `shape`.
    dtype: defines the type of values. Default value is `tf.float32`. Must be a
      non-quantized, real integer or floating point type.
    normalizer_fn: If not `None`, a function that can be used to normalize the
      value of the tensor after `default_value` is applied for parsing. The
      normalizer function takes the input `Tensor` as its argument, and returns
      the output `Tensor` (e.g. `lambda x: (x - 3.0) / 4.2`). Please note that
      even though the most common use case of this function is normalization, it
      can be used for any kind of TensorFlow transformation.

  Returns:
    A `_NumericColumn`.

  Raises:
    TypeError: if any dimension in shape is not an int.
    ValueError: if any dimension in shape is not a positive integer.
    TypeError: if `default_value` is an iterable but not compatible with `shape`.
    TypeError: if `default_value` is not compatible with `dtype`.
    ValueError: if `dtype` is not convertible to `tf.float32`.
  """
  shape = _check_shape(shape, key)
  if not (dtype.is_integer or dtype.is_floating):
    raise ValueError('dtype must be convertible to float. '
                     'dtype: {}, key: {}'.format(dtype, key))
  default_value = _check_default_value(shape, default_value, dtype, key)

  if normalizer_fn is not None and not callable(normalizer_fn):
    raise TypeError(
        'normalizer_fn must be a callable. Given: {}'.format(normalizer_fn))

  return _NumericColumn(
      key,
      shape=shape,
      default_value=default_value,
      dtype=dtype,
      normalizer_fn=normalizer_fn)


@tf_export('feature_column.bucketized_column')
def bucketized_column(source_column, boundaries):
  """Represents discretized dense input.

  Buckets include the left boundary, and exclude the right boundary. Namely,
  `boundaries=[0., 1., 2.]` generates buckets `(-inf, 0.)`, `[0., 1.)`,
  `[1., 2.)`, and `[2., +inf)`.

  For example, if the inputs are

  ```python
  boundaries = [0, 10, 100]
  input tensor = [[-5, 10000]
                  [150,   10]
                  [5,    100]]
  ```

  then the output will be

  ```python
  output = [[0, 3]
            [3, 2]
            [1, 3]]
  ```

  Example:

  ```python
  price = numeric_column('price')
  bucketized_price = bucketized_column(price, boundaries=[...])
  columns = [bucketized_price, ...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)

  # or
  columns = [bucketized_price, ...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  dense_tensor = input_layer(features, columns)
  ```

  A `bucketized_column` can also be crossed with another categorical column
  using `crossed_column`:

  ```python
  price = numeric_column('price')
  # bucketized_column converts a numerical feature to a categorical one.
  bucketized_price = bucketized_column(price, boundaries=[...])
  # 'keywords' is a string feature.
  price_x_keywords = crossed_column([bucketized_price, 'keywords'], 50000)
  columns = [price_x_keywords, ...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)
  ```

  Args:
    source_column: A one-dimensional dense column which is generated with
      `numeric_column`.
    boundaries: A sorted list or tuple of floats specifying the boundaries.

  Returns:
    A `_BucketizedColumn`.

  Raises:
    ValueError: If `source_column` is not a numeric column, or if it is not
      one-dimensional.
    ValueError: If `boundaries` is not a sorted list or tuple.
  """
  if not isinstance(source_column, _NumericColumn):
    raise ValueError(
        'source_column must be a column generated with numeric_column(). '
        'Given: {}'.format(source_column))
  if len(source_column.shape) > 1:
    raise ValueError(
        'source_column must be one-dimensional column. '
        'Given: {}'.format(source_column))
  if (not boundaries or
      not (isinstance(boundaries, list) or isinstance(boundaries, tuple))):
    raise ValueError('boundaries must be a sorted list.')
  for i in range(len(boundaries) - 1):
    if boundaries[i] >= boundaries[i + 1]:
      raise ValueError('boundaries must be a sorted list.')
  return _BucketizedColumn(source_column, tuple(boundaries))


def _assert_string_or_int(dtype, prefix):
  if (dtype != dtypes.string) and (not dtype.is_integer):
    raise ValueError(
        '{} dtype must be string or integer. dtype: {}.'.format(prefix, dtype))


@tf_export('feature_column.categorical_column_with_hash_bucket')
def categorical_column_with_hash_bucket(key,
                                        hash_bucket_size,
                                        dtype=dtypes.string):
  """Represents a sparse feature where ids are set by hashing.

  Use this when your sparse features are in string or integer format, and you
  want to distribute your inputs into a finite number of buckets by hashing:
  `output_id = Hash(input_feature_string) % bucket_size`.

  For input dictionary `features`, `features[key]` is either `Tensor` or
  `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for int
  and `''` for string. Note that these values are independent of the
  `default_value` argument.

  Example:

  ```python
  keywords = categorical_column_with_hash_bucket("keywords", 10000)
  columns = [keywords, ...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)

  # or
  keywords_embedded = embedding_column(keywords, 16)
  columns = [keywords_embedded, ...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  dense_tensor = input_layer(features, columns)
  ```

  Args:
    key: A unique string identifying the input feature. It is used as the
      column name and the dictionary key for feature parsing configs, feature
      `Tensor` objects, and feature columns.
    hash_bucket_size: An int >= 1. The number of buckets.
    dtype: The type of features. Only string and integer types are supported.

  Returns:
    A `_HashedCategoricalColumn`.

  Raises:
    ValueError: `hash_bucket_size` is less than 1.
    ValueError: `dtype` is neither string nor integer.
  """
  if hash_bucket_size is None:
    raise ValueError('hash_bucket_size must be set. key: {}'.format(key))

  if hash_bucket_size < 1:
    raise ValueError('hash_bucket_size must be at least 1. '
                     'hash_bucket_size: {}, key: {}'.format(
                         hash_bucket_size, key))

  _assert_string_or_int(dtype, prefix='column_name: {}'.format(key))

  return _HashedCategoricalColumn(key, hash_bucket_size, dtype)


@tf_export('feature_column.categorical_column_with_vocabulary_file')
def categorical_column_with_vocabulary_file(key,
                                            vocabulary_file,
                                            vocabulary_size=None,
                                            num_oov_buckets=0,
                                            default_value=None,
                                            dtype=dtypes.string):
  """A `_CategoricalColumn` with a vocabulary file.

  Use this when your inputs are in string or integer format, and you have a
  vocabulary file that maps each value to an integer ID. By default,
  out-of-vocabulary values are ignored. Use either (but not both) of
  `num_oov_buckets` and `default_value` to specify how to include
  out-of-vocabulary values.

  For input dictionary `features`, `features[key]` is either `Tensor` or
  `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for int
  and `''` for string. Note that these values are independent of the
  `default_value` argument.

  Example with `num_oov_buckets`:
  File '/us/states.txt' contains 50 lines, each with a 2-character U.S. state
  abbreviation. All inputs with values in that file are assigned an ID 0-49,
  corresponding to their line numbers. All other values are hashed and assigned
  an ID 50-54.

  ```python
  states = categorical_column_with_vocabulary_file(
      key='states', vocabulary_file='/us/states.txt', vocabulary_size=50,
      num_oov_buckets=5)
  columns = [states, ...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)
  ```

  Example with `default_value`:
  File '/us/states.txt' contains 51 lines - the first line is 'XX', and the
  other 50 each have a 2-character U.S. state abbreviation. Both a literal 'XX'
  in input, and other values missing from the file, will be assigned ID 0. All
  others are assigned the corresponding line number 1-50.

  ```python
  states = categorical_column_with_vocabulary_file(
      key='states', vocabulary_file='/us/states.txt', vocabulary_size=51,
      default_value=0)
  columns = [states, ...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)
  ```

  And to make an embedding with either of these columns:

  ```python
  columns = [embedding_column(states, 3),...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  dense_tensor = input_layer(features, columns)
  ```

  Args:
    key: A unique string identifying the input feature. It is used as the
      column name and the dictionary key for feature parsing configs, feature
      `Tensor` objects, and feature columns.
    vocabulary_file: The vocabulary file name.
    vocabulary_size: Number of elements in the vocabulary. This must be no
      greater than the number of lines in `vocabulary_file`; if it is less,
      later values are ignored. If None, it is set to the number of lines in
      `vocabulary_file`.
    num_oov_buckets: Non-negative integer, the number of out-of-vocabulary
      buckets. All out-of-vocabulary inputs will be assigned IDs in the range
      `[vocabulary_size, vocabulary_size+num_oov_buckets)` based on a hash of
      the input value. A positive `num_oov_buckets` cannot be specified with
      `default_value`.
    default_value: The integer ID value to return for out-of-vocabulary feature
      values, defaults to `-1`. This cannot be specified with a positive
      `num_oov_buckets`.
    dtype: The type of features. Only string and integer types are supported.

  Returns:
    A `_CategoricalColumn` with a vocabulary file.

  Raises:
    ValueError: `vocabulary_file` is missing or cannot be opened.
    ValueError: `vocabulary_size` is missing or < 1.
    ValueError: `num_oov_buckets` is a negative integer.
    ValueError: `num_oov_buckets` and `default_value` are both specified.
    ValueError: `dtype` is neither string nor integer.
  """
  if not vocabulary_file:
    raise ValueError('Missing vocabulary_file in {}.'.format(key))

  if vocabulary_size is None:
    if not gfile.Exists(vocabulary_file):
      raise ValueError('vocabulary_file in {} does not exist.'.format(key))

    with gfile.GFile(vocabulary_file) as f:
      vocabulary_size = sum(1 for _ in f)
    logging.info(
        'vocabulary_size = %d in %s is inferred from the number of elements '
        'in the vocabulary_file %s.', vocabulary_size, key, vocabulary_file)

  # `vocabulary_size` isn't required for lookup, but it is for `_num_buckets`.
  if vocabulary_size < 1:
    raise ValueError('Invalid vocabulary_size in {}.'.format(key))
  if num_oov_buckets:
    if default_value is not None:
      raise ValueError(
          'Can\'t specify both num_oov_buckets and default_value in {}.'.format(
              key))
    if num_oov_buckets < 0:
      raise ValueError('Invalid num_oov_buckets {} in {}.'.format(
          num_oov_buckets, key))
  _assert_string_or_int(dtype, prefix='column_name: {}'.format(key))
  return _VocabularyFileCategoricalColumn(
      key=key,
      vocabulary_file=vocabulary_file,
      vocabulary_size=vocabulary_size,
      num_oov_buckets=0 if num_oov_buckets is None else num_oov_buckets,
      default_value=-1 if default_value is None else default_value,
      dtype=dtype)


   1160 @tf_export('feature_column.categorical_column_with_vocabulary_list')
   1161 def categorical_column_with_vocabulary_list(
   1162     key, vocabulary_list, dtype=None, default_value=-1, num_oov_buckets=0):
   1163   """A `_CategoricalColumn` with in-memory vocabulary.
   1164 
   1165   Use this when your inputs are in string or integer format, and you have an
   1166   in-memory vocabulary mapping each value to an integer ID. By default,
   1167   out-of-vocabulary values are ignored. Use either (but not both) of
   1168   `num_oov_buckets` and `default_value` to specify how to include
   1169   out-of-vocabulary values.
   1170 
   1171   For input dictionary `features`, `features[key]` is either `Tensor` or
   1172   `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for int
   1173   and `''` for string. Note that these values are independent of the
   1174   `default_value` argument.
   1175 
   1176   Example with `num_oov_buckets`:
   1177   In the following example, each input in `vocabulary_list` is assigned an ID
   1178   0-3 corresponding to its index (e.g., input 'B' produces output 2). All other
   1179   inputs are hashed and assigned an ID 4-5.
   1180 
   1181   ```python
   1182   colors = categorical_column_with_vocabulary_list(
   1183       key='colors', vocabulary_list=('R', 'G', 'B', 'Y'),
   1184       num_oov_buckets=2)
   1185   columns = [colors, ...]
   1186   features = tf.parse_example(..., features=make_parse_example_spec(columns))
   1187   linear_prediction, _, _ = linear_model(features, columns)
   1188   ```
   1189 
   1190   Example with `default_value`:
   1191   In the following example, each input in `vocabulary_list` is assigned an ID
   1192   0-4 corresponding to its index (e.g., input 'B' produces output 3). All other
   1193   inputs are assigned `default_value` 0.
   1194 
   1195 
   1196   ```python
   1197   colors = categorical_column_with_vocabulary_list(
   1198       key='colors', vocabulary_list=('X', 'R', 'G', 'B', 'Y'), default_value=0)
   1199   columns = [colors, ...]
   1200   features = tf.parse_example(..., features=make_parse_example_spec(columns))
   1201   linear_prediction, _, _ = linear_model(features, columns)
   1202   ```
   1203 
   1204   And to make an embedding with either:
   1205 
   1206   ```python
   1207   columns = [embedding_column(colors, 3),...]
   1208   features = tf.parse_example(..., features=make_parse_example_spec(columns))
   1209   dense_tensor = input_layer(features, columns)
   1210   ```
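
  As a rough illustration of the ID assignment described above (not the
  actual implementation, which uses a TF lookup table), consider this
  pure-Python sketch; `hash` stands in for TF's stable fingerprint function:

  ```python
  def vocab_id(value, vocabulary_list, default_value=-1, num_oov_buckets=0):
    if value in vocabulary_list:
      # In-vocabulary values map to their index in the list.
      return list(vocabulary_list).index(value)
    if num_oov_buckets:
      # Out-of-vocabulary values hash into the trailing OOV buckets.
      return len(vocabulary_list) + hash(value) % num_oov_buckets
    return default_value
  ```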
   1211 
   1212   Args:
   1213     key: A unique string identifying the input feature. It is used as the
   1214       column name and the dictionary key for feature parsing configs, feature
   1215       `Tensor` objects, and feature columns.
   1216     vocabulary_list: An ordered iterable defining the vocabulary. Each feature
   1217       is mapped to the index of its value (if present) in `vocabulary_list`.
   1218       Must be castable to `dtype`.
   1219     dtype: The type of features. Only string and integer types are supported.
   1220       If `None`, it will be inferred from `vocabulary_list`.
   1221     default_value: The integer ID value to return for out-of-vocabulary feature
      values, defaults to `-1`. This cannot be specified with a positive
   1223       `num_oov_buckets`.
   1224     num_oov_buckets: Non-negative integer, the number of out-of-vocabulary
   1225       buckets. All out-of-vocabulary inputs will be assigned IDs in the range
   1226       `[len(vocabulary_list), len(vocabulary_list)+num_oov_buckets)` based on a
      hash of the input value. A positive `num_oov_buckets` cannot be
      specified with `default_value`.
   1229 
   1230   Returns:
   1231     A `_CategoricalColumn` with in-memory vocabulary.
   1232 
   1233   Raises:
   1234     ValueError: if `vocabulary_list` is empty, or contains duplicate keys.
   1235     ValueError: `num_oov_buckets` is a negative integer.
   1236     ValueError: `num_oov_buckets` and `default_value` are both specified.
   1237     ValueError: if `dtype` is not integer or string.
   1238   """
   1239   if (vocabulary_list is None) or (len(vocabulary_list) < 1):
   1240     raise ValueError(
   1241         'vocabulary_list {} must be non-empty, column_name: {}'.format(
   1242             vocabulary_list, key))
   1243   if len(set(vocabulary_list)) != len(vocabulary_list):
   1244     raise ValueError(
   1245         'Duplicate keys in vocabulary_list {}, column_name: {}'.format(
   1246             vocabulary_list, key))
   1247   vocabulary_dtype = dtypes.as_dtype(np.array(vocabulary_list).dtype)
   1248   if num_oov_buckets:
   1249     if default_value != -1:
   1250       raise ValueError(
   1251           'Can\'t specify both num_oov_buckets and default_value in {}.'.format(
   1252               key))
   1253     if num_oov_buckets < 0:
   1254       raise ValueError('Invalid num_oov_buckets {} in {}.'.format(
   1255           num_oov_buckets, key))
   1256   _assert_string_or_int(
   1257       vocabulary_dtype, prefix='column_name: {} vocabulary'.format(key))
   1258   if dtype is None:
   1259     dtype = vocabulary_dtype
   1260   elif dtype.is_integer != vocabulary_dtype.is_integer:
   1261     raise ValueError(
   1262         'dtype {} and vocabulary dtype {} do not match, column_name: {}'.format(
   1263             dtype, vocabulary_dtype, key))
   1264   _assert_string_or_int(dtype, prefix='column_name: {}'.format(key))
   1265 
   1266   return _VocabularyListCategoricalColumn(
   1267       key=key, vocabulary_list=tuple(vocabulary_list), dtype=dtype,
   1268       default_value=default_value, num_oov_buckets=num_oov_buckets)
   1269 
   1270 
   1271 @tf_export('feature_column.categorical_column_with_identity')
   1272 def categorical_column_with_identity(key, num_buckets, default_value=None):
   1273   """A `_CategoricalColumn` that returns identity values.
   1274 
   1275   Use this when your inputs are integers in the range `[0, num_buckets)`, and
  you want to use the input value itself as the categorical ID. Values outside
  this range will be mapped to `default_value` if it is specified; otherwise
  the graph operation will fail.
   1279 
   1280   Typically, this is used for contiguous ranges of integer indexes, but
  it doesn't have to be. This might be inefficient, however, if many IDs
  are unused. Consider `categorical_column_with_hash_bucket` in that case.
   1283 
   1284   For input dictionary `features`, `features[key]` is either `Tensor` or
   1285   `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for int
   1286   and `''` for string. Note that these values are independent of the
   1287   `default_value` argument.
   1288 
  In the following examples, each input in the range `[0, 1000000)` is assigned
  its own value as its categorical ID. All other inputs are assigned
  `default_value` 0. Note that a literal 0 in the inputs will produce the same
  ID as the default.
   1292 
   1293   Linear model:
   1294 
   1295   ```python
   1296   video_id = categorical_column_with_identity(
   1297       key='video_id', num_buckets=1000000, default_value=0)
   1298   columns = [video_id, ...]
   1299   features = tf.parse_example(..., features=make_parse_example_spec(columns))
   1300   linear_prediction, _, _ = linear_model(features, columns)
   1301   ```
   1302 
   1303   Embedding for a DNN model:
   1304 
   1305   ```python
   1306   columns = [embedding_column(video_id, 9),...]
   1307   features = tf.parse_example(..., features=make_parse_example_spec(columns))
   1308   dense_tensor = input_layer(features, columns)
   1309   ```
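
  For intuition, the mapping is equivalent to the following hedged
  pure-Python sketch (the real column operates on `SparseTensor`s in the
  graph):

  ```python
  def identity_id(value, num_buckets, default_value=None):
    if 0 <= value < num_buckets:
      return value  # The input value is its own categorical ID.
    if default_value is not None:
      return default_value
    raise ValueError('out-of-range input and no default_value given')
  ```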
   1310 
   1311   Args:
   1312     key: A unique string identifying the input feature. It is used as the
   1313       column name and the dictionary key for feature parsing configs, feature
   1314       `Tensor` objects, and feature columns.
   1315     num_buckets: Range of inputs and outputs is `[0, num_buckets)`.
   1316     default_value: If `None`, this column's graph operations will fail for
   1317       out-of-range inputs. Otherwise, this value must be in the range
      `[0, num_buckets)`, and will replace out-of-range inputs.
   1319 
   1320   Returns:
   1321     A `_CategoricalColumn` that returns identity values.
   1322 
   1323   Raises:
   1324     ValueError: if `num_buckets` is less than one.
   1325     ValueError: if `default_value` is not in range `[0, num_buckets)`.
   1326   """
   1327   if num_buckets < 1:
   1328     raise ValueError(
   1329         'num_buckets {} < 1, column_name {}'.format(num_buckets, key))
   1330   if (default_value is not None) and (
   1331       (default_value < 0) or (default_value >= num_buckets)):
   1332     raise ValueError(
   1333         'default_value {} not in range [0, {}), column_name {}'.format(
   1334             default_value, num_buckets, key))
   1335   return _IdentityCategoricalColumn(
   1336       key=key, num_buckets=num_buckets, default_value=default_value)
   1337 
   1338 
   1339 @tf_export('feature_column.indicator_column')
   1340 def indicator_column(categorical_column):
  """Represents the multi-hot representation of the given categorical column.
   1342 
   1343   Used to wrap any `categorical_column_*` (e.g., to feed to DNN). Use
   1344   `embedding_column` if the inputs are sparse.
   1345 
   1346   ```python
  name = indicator_column(categorical_column_with_vocabulary_list(
      'name', ['bob', 'george', 'wanda']))
   1349   columns = [name, ...]
   1350   features = tf.parse_example(..., features=make_parse_example_spec(columns))
   1351   dense_tensor = input_layer(features, columns)
   1352 
   1353   dense_tensor == [[1, 0, 0]]  # If "name" bytes_list is ["bob"]
   1354   dense_tensor == [[1, 0, 1]]  # If "name" bytes_list is ["bob", "wanda"]
   1355   dense_tensor == [[2, 0, 0]]  # If "name" bytes_list is ["bob", "bob"]
   1356   ```
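
  The counting behavior shown in the comments above can be sketched with
  numpy (an illustration only; the column itself builds the dense tensor
  inside the graph):

  ```python
  import numpy as np

  def multi_hot(ids, num_buckets):
    out = np.zeros(num_buckets)
    for i in ids:
      out[i] += 1  # Repeated IDs accumulate, as in the ["bob", "bob"] case.
    return out

  multi_hot([0, 0], 3)  # -> array([2., 0., 0.])
  ```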
   1357 
   1358   Args:
   1359     categorical_column: A `_CategoricalColumn` which is created by
   1360       `categorical_column_with_*` or `crossed_column` functions.
   1361 
   1362   Returns:
   1363     An `_IndicatorColumn`.
   1364   """
   1365   return _IndicatorColumn(categorical_column)
   1366 
   1367 
   1368 @tf_export('feature_column.weighted_categorical_column')
   1369 def weighted_categorical_column(
   1370     categorical_column, weight_feature_key, dtype=dtypes.float32):
   1371   """Applies weight values to a `_CategoricalColumn`.
   1372 
   1373   Use this when each of your sparse inputs has both an ID and a value. For
   1374   example, if you're representing text documents as a collection of word
   1375   frequencies, you can provide 2 parallel sparse input features ('terms' and
   1376   'frequencies' below).
   1377 
   1378   Example:
   1379 
   1380   Input `tf.Example` objects:
   1381 
   1382   ```proto
   1383   [
   1384     features {
   1385       feature {
   1386         key: "terms"
   1387         value {bytes_list {value: "very" value: "model"}}
   1388       }
   1389       feature {
   1390         key: "frequencies"
   1391         value {float_list {value: 0.3 value: 0.1}}
   1392       }
   1393     },
   1394     features {
   1395       feature {
   1396         key: "terms"
   1397         value {bytes_list {value: "when" value: "course" value: "human"}}
   1398       }
   1399       feature {
   1400         key: "frequencies"
   1401         value {float_list {value: 0.4 value: 0.1 value: 0.2}}
   1402       }
   1403     }
   1404   ]
   1405   ```
   1406 
   1407   ```python
   1408   categorical_column = categorical_column_with_hash_bucket(
   1409       column_name='terms', hash_bucket_size=1000)
   1410   weighted_column = weighted_categorical_column(
   1411       categorical_column=categorical_column, weight_feature_key='frequencies')
   1412   columns = [weighted_column, ...]
   1413   features = tf.parse_example(..., features=make_parse_example_spec(columns))
   1414   linear_prediction, _, _ = linear_model(features, columns)
   1415   ```
   1416 
   1417   This assumes the input dictionary contains a `SparseTensor` for key
   1418   'terms', and a `SparseTensor` for key 'frequencies'. These 2 tensors must have
   1419   the same indices and dense shape.
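
  For example, the parsed features for the first `tf.Example` above would
  look roughly like the following parallel pair (a hedged illustration of the
  required structure, not the output of any particular parser call):

  ```python
  terms = tf.SparseTensor(
      indices=[[0, 0], [0, 1]], values=['very', 'model'], dense_shape=[1, 2])
  frequencies = tf.SparseTensor(
      indices=[[0, 0], [0, 1]], values=[0.3, 0.1], dense_shape=[1, 2])
  ```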
   1420 
   1421   Args:
   1422     categorical_column: A `_CategoricalColumn` created by
   1423       `categorical_column_with_*` functions.
   1424     weight_feature_key: String key for weight values.
   1425     dtype: Type of weights, such as `tf.float32`. Only float and integer weights
   1426       are supported.
   1427 
   1428   Returns:
    A `_CategoricalColumn` composed of two sparse features: one represents the
    IDs, the other represents the weight (value) of each ID in that example.
   1431 
   1432   Raises:
   1433     ValueError: if `dtype` is not convertible to float.
   1434   """
   1435   if (dtype is None) or not (dtype.is_integer or dtype.is_floating):
   1436     raise ValueError('dtype {} is not convertible to float.'.format(dtype))
   1437   return _WeightedCategoricalColumn(
   1438       categorical_column=categorical_column,
   1439       weight_feature_key=weight_feature_key,
   1440       dtype=dtype)
   1441 
   1442 
   1443 @tf_export('feature_column.crossed_column')
   1444 def crossed_column(keys, hash_bucket_size, hash_key=None):
   1445   """Returns a column for performing crosses of categorical features.
   1446 
   1447   Crossed features will be hashed according to `hash_bucket_size`. Conceptually,
   1448   the transformation can be thought of as:
   1449     Hash(cartesian product of features) % `hash_bucket_size`
   1450 
   1451   For example, if the input features are:
   1452 
   1453   * SparseTensor referred by first key:
   1454 
   1455     ```python
   1456     shape = [2, 2]
   1457     {
   1458         [0, 0]: "a"
   1459         [1, 0]: "b"
   1460         [1, 1]: "c"
   1461     }
   1462     ```
   1463 
   1464   * SparseTensor referred by second key:
   1465 
   1466     ```python
   1467     shape = [2, 1]
   1468     {
   1469         [0, 0]: "d"
   1470         [1, 0]: "e"
   1471     }
   1472     ```
   1473 
  then the crossed feature will look like:

  ```python
  shape = [2, 2]
   1478   {
   1479       [0, 0]: Hash64("d", Hash64("a")) % hash_bucket_size
   1480       [1, 0]: Hash64("e", Hash64("b")) % hash_bucket_size
   1481       [1, 1]: Hash64("e", Hash64("c")) % hash_bucket_size
   1482   }
   1483   ```
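
  In pure Python, that transformation looks roughly like the sketch below,
  where `hash64` is an assumed stand-in for TF's fingerprint function, not a
  real API:

  ```python
  def cross_id(values, hash_bucket_size, hash64):
    fingerprint = hash64(values[0])
    for value in values[1:]:
      fingerprint = hash64(value, fingerprint)
    return fingerprint % hash_bucket_size
  ```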
   1484 
   1485   Here is an example to create a linear model with crosses of string features:
   1486 
   1487   ```python
  keywords_x_doc_terms = crossed_column(['keywords', 'doc_terms'], 50000)
   1489   columns = [keywords_x_doc_terms, ...]
   1490   features = tf.parse_example(..., features=make_parse_example_spec(columns))
   1491   linear_prediction = linear_model(features, columns)
   1492   ```
   1493 
   1494   You could also use vocabulary lookup before crossing:
   1495 
   1496   ```python
   1497   keywords = categorical_column_with_vocabulary_file(
      'keywords', '/path/to/vocabulary/file', vocabulary_size=1000)
  keywords_x_doc_terms = crossed_column([keywords, 'doc_terms'], 50000)
   1500   columns = [keywords_x_doc_terms, ...]
   1501   features = tf.parse_example(..., features=make_parse_example_spec(columns))
   1502   linear_prediction = linear_model(features, columns)
   1503   ```
   1504 
   1505   If an input feature is of numeric type, you can use
   1506   `categorical_column_with_identity`, or `bucketized_column`, as in the example:
   1507 
   1508   ```python
   1509   # vertical_id is an integer categorical feature.
  vertical_id = categorical_column_with_identity('vertical_id', 10000)
   1511   price = numeric_column('price')
   1512   # bucketized_column converts numerical feature to a categorical one.
   1513   bucketized_price = bucketized_column(price, boundaries=[...])
  vertical_id_x_price = crossed_column([vertical_id, bucketized_price], 50000)
   1515   columns = [vertical_id_x_price, ...]
   1516   features = tf.parse_example(..., features=make_parse_example_spec(columns))
   1517   linear_prediction = linear_model(features, columns)
   1518   ```
   1519 
  To use a crossed column in a DNN model, you need to wrap it in an embedding
  column, as in this example:
   1522 
   1523   ```python
  vertical_id_x_price = crossed_column([vertical_id, bucketized_price], 50000)
   1525   vertical_id_x_price_embedded = embedding_column(vertical_id_x_price, 10)
   1526   dense_tensor = input_layer(features, [vertical_id_x_price_embedded, ...])
   1527   ```
   1528 
   1529   Args:
   1530     keys: An iterable identifying the features to be crossed. Each element can
   1531       be either:
   1532       * string: Will use the corresponding feature which must be of string type.
   1533       * `_CategoricalColumn`: Will use the transformed tensor produced by this
   1534         column. Does not support hashed categorical column.
    hash_bucket_size: An int >= 1. The number of buckets.
    hash_key: Optional. Specifies the hash_key that will be used by the
      `FingerprintCat64` function to combine the fingerprints of the crossed
      values in `SparseCrossOp`.
   1538 
   1539   Returns:
   1540     A `_CrossedColumn`.
   1541 
   1542   Raises:
   1543     ValueError: If `len(keys) < 2`.
   1544     ValueError: If any of the keys is neither a string nor `_CategoricalColumn`.
   1545     ValueError: If any of the keys is `_HashedCategoricalColumn`.
   1546     ValueError: If `hash_bucket_size < 1`.
   1547   """
   1548   if not hash_bucket_size or hash_bucket_size < 1:
    raise ValueError('hash_bucket_size must be at least 1. '
                     'hash_bucket_size: {}'.format(hash_bucket_size))
   1551   if not keys or len(keys) < 2:
   1552     raise ValueError(
   1553         'keys must be a list with length > 1. Given: {}'.format(keys))
   1554   for key in keys:
   1555     if (not isinstance(key, six.string_types) and
   1556         not isinstance(key, _CategoricalColumn)):
   1557       raise ValueError(
          'Unsupported key type. All keys must be either a string or a '
          'categorical column other than _HashedCategoricalColumn. '
   1560           'Given: {}'.format(key))
   1561     if isinstance(key, _HashedCategoricalColumn):
   1562       raise ValueError(
   1563           'categorical_column_with_hash_bucket is not supported for crossing. '
          'Hashing before crossing will increase the probability of collision. '
   1565           'Instead, use the feature name as a string. Given: {}'.format(key))
   1566   return _CrossedColumn(
   1567       keys=tuple(keys), hash_bucket_size=hash_bucket_size,
   1568       hash_key=hash_key)
   1569 
   1570 
   1571 class _FeatureColumn(object):
   1572   """Represents a feature column abstraction.
   1573 
   1574   WARNING: Do not subclass this layer unless you know what you are doing:
   1575   the API is subject to future changes.
   1576 
  To distinguish between the concept of a feature family and a specific binary
  feature within a family, we refer to a feature family like "country" as a
  feature column. The following is an example feature in `tf.Example` format:
    {key: "country",  value: [ "US" ]}
  In this example the value of the feature is "US" and "country" refers to the
  feature's column.
   1583 
  This is an abstract class. Users should not create instances of it.
   1585   """
   1586   __metaclass__ = abc.ABCMeta
   1587 
   1588   @abc.abstractproperty
   1589   def name(self):
   1590     """Returns string. Used for naming and for name_scope."""
   1591     pass
   1592 
   1593   @property
   1594   def _var_scope_name(self):
   1595     """Returns string. Used for variable_scope. Defaults to self.name."""
   1596     return self.name
   1597 
   1598   @abc.abstractmethod
   1599   def _transform_feature(self, inputs):
   1600     """Returns intermediate representation (usually a `Tensor`).
   1601 
   1602     Uses `inputs` to create an intermediate representation (usually a `Tensor`)
   1603     that other feature columns can use.
   1604 
   1605     Example usage of `inputs`:
    Let's say a feature column depends on a raw feature ('raw') and another
    `_FeatureColumn` (`input_fc`). To access the corresponding `Tensor`s,
    `inputs` is used as follows:
   1609 
   1610     ```python
   1611     raw_tensor = inputs.get('raw')
   1612     fc_tensor = inputs.get(input_fc)
   1613     ```
   1614 
   1615     Args:
   1616       inputs: A `_LazyBuilder` object to access inputs.
   1617 
   1618     Returns:
   1619       Transformed feature `Tensor`.
   1620     """
   1621     pass
   1622 
   1623   @abc.abstractproperty
   1624   def _parse_example_spec(self):
   1625     """Returns a `tf.Example` parsing spec as dict.
   1626 
    It is used to get the parsing spec for `tf.parse_example`. The returned
    spec is a dict from string keys to `VarLenFeature`, `FixedLenFeature`, and
    other supported objects. Please check the documentation of
    ${tf.parse_example} for all supported spec objects.
   1631 
    Let's say a feature column depends on a raw feature ('raw') and another
    `_FeatureColumn` (`input_fc`). One possible implementation of
   1634     _parse_example_spec is as follows:
   1635 
   1636     ```python
   1637     spec = {'raw': tf.FixedLenFeature(...)}
   1638     spec.update(input_fc._parse_example_spec)
   1639     return spec
   1640     ```
   1641     """
   1642     pass
   1643 
   1644 
   1645 class _DenseColumn(_FeatureColumn):
  """Represents a column which can be represented as a dense `Tensor`.
   1647 
   1648   WARNING: Do not subclass this layer unless you know what you are doing:
   1649   the API is subject to future changes.
   1650 
   1651   Some examples of this type are: numeric_column, embedding_column,
   1652   indicator_column.
   1653   """
   1654 
   1655   __metaclass__ = abc.ABCMeta
   1656 
   1657   @abc.abstractproperty
   1658   def _variable_shape(self):
   1659     """`TensorShape` of `_get_dense_tensor`, without batch dimension."""
   1660     pass
   1661 
   1662   @abc.abstractmethod
   1663   def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
   1664     """Returns a `Tensor`.
   1665 
    The output of this function will be used by model builder functions. For
    example, the pseudocode of `input_layer` looks like:
   1668 
   1669     ```python
   1670     def input_layer(features, feature_columns, ...):
   1671       outputs = [fc._get_dense_tensor(...) for fc in feature_columns]
   1672       return tf.concat(outputs)
   1673     ```
   1674 
   1675     Args:
   1676       inputs: A `_LazyBuilder` object to access inputs.
      weight_collections: List of graph collections to which variables (if
        any are created) are added.
   1679       trainable: If `True` also add variables to the graph collection
   1680         `GraphKeys.TRAINABLE_VARIABLES` (see ${tf.Variable}).
   1681 
   1682     Returns:
   1683       `Tensor` of shape [batch_size] + `_variable_shape`.
   1684     """
   1685     pass
   1686 
   1687 
   1688 def _create_weighted_sum(
   1689     column,
   1690     builder,
   1691     units,
   1692     sparse_combiner,
   1693     weight_collections,
   1694     trainable):
   1695   """Creates a weighted sum for a dense or sparse column for linear_model."""
   1696   if isinstance(column, _CategoricalColumn):
   1697     return _create_categorical_column_weighted_sum(
   1698         column=column,
   1699         builder=builder,
   1700         units=units,
   1701         sparse_combiner=sparse_combiner,
   1702         weight_collections=weight_collections,
   1703         trainable=trainable)
   1704   else:
   1705     return _create_dense_column_weighted_sum(
   1706         column=column,
   1707         builder=builder,
   1708         units=units,
   1709         weight_collections=weight_collections,
   1710         trainable=trainable)
   1711 
   1712 
   1713 def _create_dense_column_weighted_sum(
   1714     column, builder, units, weight_collections, trainable):
   1715   """Create a weighted sum of a dense column for linear_model."""
   1716   tensor = column._get_dense_tensor(  # pylint: disable=protected-access
   1717       builder,
   1718       weight_collections=weight_collections,
   1719       trainable=trainable)
   1720   num_elements = column._variable_shape.num_elements()  # pylint: disable=protected-access
   1721   batch_size = array_ops.shape(tensor)[0]
   1722   tensor = array_ops.reshape(tensor, shape=(batch_size, num_elements))
   1723   weight = variable_scope.get_variable(
   1724       name='weights',
   1725       shape=[num_elements, units],
   1726       initializer=init_ops.zeros_initializer(),
   1727       trainable=trainable,
   1728       collections=weight_collections)
   1729   return math_ops.matmul(tensor, weight, name='weighted_sum')
   1730 
   1731 
   1732 class _CategoricalColumn(_FeatureColumn):
   1733   """Represents a categorical feature.
   1734 
   1735   WARNING: Do not subclass this layer unless you know what you are doing:
   1736   the API is subject to future changes.
   1737 
  A categorical feature is typically handled with a ${tf.SparseTensor} of IDs.
   1739   """
   1740   __metaclass__ = abc.ABCMeta
   1741 
   1742   IdWeightPair = collections.namedtuple(  # pylint: disable=invalid-name
   1743       'IdWeightPair', ['id_tensor', 'weight_tensor'])
   1744 
   1745   @abc.abstractproperty
   1746   def _num_buckets(self):
   1747     """Returns number of buckets in this sparse feature."""
   1748     pass
   1749 
   1750   @abc.abstractmethod
   1751   def _get_sparse_tensors(self,
   1752                           inputs,
   1753                           weight_collections=None,
   1754                           trainable=None):
   1755     """Returns an IdWeightPair.
   1756 
   1757     `IdWeightPair` is a pair of `SparseTensor`s which represents ids and
   1758     weights.
   1759 
   1760     `IdWeightPair.id_tensor` is typically a `batch_size` x `num_buckets`
   1761     `SparseTensor` of `int64`. `IdWeightPair.weight_tensor` is either a
   1762     `SparseTensor` of `float` or `None` to indicate all weights should be
    taken to be 1. If specified, `weight_tensor` must have exactly the same
    shape and indices as `id_tensor`. The expected `SparseTensor` is the same
    as the parsing output of a `VarLenFeature`, which is a ragged matrix.
   1766 
   1767     Args:
      inputs: A `_LazyBuilder` as a cache to get input tensors required to
   1769         create `IdWeightPair`.
      weight_collections: List of graph collections to which variables (if
        any are created) are added.
   1772       trainable: If `True` also add variables to the graph collection
   1773         `GraphKeys.TRAINABLE_VARIABLES` (see ${tf.get_variable}).
   1774     """
   1775     pass
   1776 
   1777 
   1778 def _create_categorical_column_weighted_sum(
   1779     column, builder, units, sparse_combiner, weight_collections, trainable):
   1780   """Create a weighted sum of a categorical column for linear_model."""
   1781   sparse_tensors = column._get_sparse_tensors(  # pylint: disable=protected-access
   1782       builder,
   1783       weight_collections=weight_collections,
   1784       trainable=trainable)
   1785   id_tensor = sparse_ops.sparse_reshape(sparse_tensors.id_tensor, [
   1786       array_ops.shape(sparse_tensors.id_tensor)[0], -1
   1787   ])
   1788   weight_tensor = sparse_tensors.weight_tensor
   1789   if weight_tensor is not None:
   1790     weight_tensor = sparse_ops.sparse_reshape(
   1791         weight_tensor, [array_ops.shape(weight_tensor)[0], -1])
   1792 
   1793   weight = variable_scope.get_variable(
   1794       name='weights',
   1795       shape=(column._num_buckets, units),  # pylint: disable=protected-access
   1796       initializer=init_ops.zeros_initializer(),
   1797       trainable=trainable,
   1798       collections=weight_collections)
   1799   return _safe_embedding_lookup_sparse(
   1800       weight,
   1801       id_tensor,
   1802       sparse_weights=weight_tensor,
   1803       combiner=sparse_combiner,
   1804       name='weighted_sum')
   1805 
   1806 
   1807 class _LazyBuilder(object):
   1808   """Handles caching of transformations while building the model.
   1809 
   1810   `_FeatureColumn` specifies how to digest an input column to the network. Some
   1811   feature columns require data transformations. This class caches those
   1812   transformations.
   1813 
  Some features may be used in more than one place. For example, one can use a
  bucketized feature by itself and also in a cross with it. In that case we
   1816   should create only one bucketization op instead of creating ops for each
   1817   feature column separately. To handle re-use of transformed columns,
   1818   `_LazyBuilder` caches all previously transformed columns.
   1819 
   1820   Example:
   1821   We're trying to use the following `_FeatureColumn`s:
   1822 
   1823   ```python
   1824   bucketized_age = fc.bucketized_column(fc.numeric_column("age"), ...)
  keywords = fc.categorical_column_with_hash_bucket("keywords", ...)
  age_X_keywords = fc.crossed_column([bucketized_age, "keywords"])
  ... = linear_model(features,
                     [bucketized_age, keywords, age_X_keywords])
   1829   ```
   1830 
   1831   If we transform each column independently, then we'll get duplication of
  bucketization (one for the cross, one for the bucketized column itself).
   1833   The `_LazyBuilder` eliminates this duplication.
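
  For example, repeated `get` calls reuse the cached transformation (a
  sketch, assuming `features` contains the raw "age" tensor):

  ```python
  builder = _LazyBuilder(features)
  t1 = builder.get(bucketized_age)  # Transforms and caches.
  t2 = builder.get(bucketized_age)  # Returns the cached Tensor; no new ops.
  ```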
   1834   """
   1835 
   1836   def __init__(self, features):
   1837     """Creates a `_LazyBuilder`.
   1838 
   1839     Args:
   1840       features: A mapping from feature column to objects that are `Tensor` or
   1841         `SparseTensor`, or can be converted to same via
   1842         `sparse_tensor.convert_to_tensor_or_sparse_tensor`. A `string` key
        signifies a base feature (not transformed). A `_FeatureColumn` key
   1844         means that this `Tensor` is the output of an existing `_FeatureColumn`
   1845         which can be reused.
   1846     """
   1847     self._features = features.copy()
   1848     self._feature_tensors = {}
   1849 
   1850   def get(self, key):
   1851     """Returns a `Tensor` for the given key.
   1852 
    A `str` key is used to access a base feature (not transformed). When a
   1854     `_FeatureColumn` is passed, the transformed feature is returned if it
   1855     already exists, otherwise the given `_FeatureColumn` is asked to provide its
   1856     transformed output, which is then cached.
   1857 
   1858     Args:
   1859       key: a `str` or a `_FeatureColumn`.
   1860 
   1861     Returns:
   1862       The transformed `Tensor` corresponding to the `key`.
   1863 
   1864     Raises:
   1865       ValueError: if key is not found or a transformed `Tensor` cannot be
   1866         computed.
   1867     """
   1868     if key in self._feature_tensors:
   1869       # FeatureColumn is already transformed or converted.
   1870       return self._feature_tensors[key]
   1871 
   1872     if key in self._features:
   1873       feature_tensor = self._get_raw_feature_as_tensor(key)
   1874       self._feature_tensors[key] = feature_tensor
   1875       return feature_tensor
   1876 
   1877     if not isinstance(key, (str, _FeatureColumn)):
   1878       raise TypeError('"key" must be either a "str" or "_FeatureColumn". '
   1879                       'Provided: {}'.format(key))
   1880 
   1881     if not isinstance(key, _FeatureColumn):
   1882       raise ValueError('Feature {} is not in features dictionary.'.format(key))
   1883 
   1884     column = key
   1885     logging.debug('Transforming feature_column %s.', column)
   1886     transformed = column._transform_feature(self)  # pylint: disable=protected-access
   1887     if transformed is None:
   1888       raise ValueError('Column {} is not supported.'.format(column.name))
   1889     self._feature_tensors[column] = transformed
   1890     return transformed
   1891 
   1892   def _get_raw_feature_as_tensor(self, key):
   1893     """Gets the raw_feature (keyed by `key`) as `tensor`.
   1894 
    The raw feature is converted to a (sparse) tensor and its dimensions may
    be expanded.

    For both `Tensor` and `SparseTensor`, the rank is expanded (to 2) if the
    rank is 1. Dynamic rank is also supported. A rank-0 raw feature is not
    supported and will raise an error.
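
    For example, a `Tensor` of shape `[batch_size]` is reshaped to
    `[batch_size, 1]`, while a `Tensor` or `SparseTensor` of shape
    `[batch_size, k]` is returned unchanged.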
   1900 
   1901     Args:
   1902       key: A `str` key to access the raw feature.
   1903 
   1904     Returns:
   1905       A `Tensor` or `SparseTensor`.
   1906 
   1907     Raises:
   1908       ValueError: if the raw feature has rank 0.
   1909     """
   1910     raw_feature = self._features[key]
   1911     feature_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(
   1912         raw_feature)
   1913 
   1914     def expand_dims(input_tensor):
   1915       # Input_tensor must have rank 1.
   1916       if isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
   1917         return sparse_ops.sparse_reshape(
   1918             input_tensor, [array_ops.shape(input_tensor)[0], -1])
   1919       else:
   1920         return array_ops.expand_dims(input_tensor, -1)
   1921 
   1922     rank = feature_tensor.get_shape().ndims
   1923     if rank is not None:
   1924       if rank == 0:
   1925         raise ValueError(
            'Feature (key: {}) cannot have rank 0. Given: {}'.format(
   1927                 key, feature_tensor))
   1928       return feature_tensor if rank != 1 else expand_dims(feature_tensor)
   1929 
   1930     # Handle dynamic rank.
   1931     with ops.control_dependencies([
   1932         check_ops.assert_positive(
   1933             array_ops.rank(feature_tensor),
   1934             message='Feature (key: {}) cannot have rank 0. Given: {}'.format(
   1935                 key, feature_tensor))]):
   1936       return control_flow_ops.cond(
   1937           math_ops.equal(1, array_ops.rank(feature_tensor)),
   1938           lambda: expand_dims(feature_tensor),
   1939           lambda: feature_tensor)
   1940 
   1941 
   1942 # TODO(ptucker): Move to third_party/tensorflow/python/ops/sparse_ops.py
   1943 def _shape_offsets(shape):
   1944   """Returns moving offset for each dimension given shape."""
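  # For example (illustration): _shape_offsets([2, 3, 4]) returns [24, 12, 4];
  # each entry is the product of that dimension and all following dimensions.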
   1945   offsets = []
   1946   for dim in reversed(shape):
   1947     if offsets:
   1948       offsets.append(dim * offsets[-1])
   1949     else:
   1950       offsets.append(dim)
   1951   offsets.reverse()
   1952   return offsets
   1953 
   1954 
   1955 # TODO(ptucker): Move to third_party/tensorflow/python/ops/sparse_ops.py
   1956 def _to_sparse_input(input_tensor, ignore_value=None):
   1957   """Converts a `Tensor` to a `SparseTensor`, dropping ignore_value cells.
   1958 
   1959   If `input_tensor` is already a `SparseTensor`, just return it.
   1960 
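
  For example (a hedged illustration), a dense string `Tensor`
  `[['a', ''], ['b', 'c']]` with the default `ignore_value` `''` becomes a
  `SparseTensor` with `indices=[[0, 0], [1, 0], [1, 1]]`,
  `values=['a', 'b', 'c']` and `dense_shape=[2, 2]`.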
   1961   Args:
   1962     input_tensor: A string or integer `Tensor`.
    ignore_value: Entries in `input_tensor` equal to this value will be
      absent from the resulting `SparseTensor`. If `None`, the default value
      of `input_tensor`'s dtype is used ('' for `str`, -1 for `int`).
   1966 
   1967   Returns:
   1968     A `SparseTensor` with the same shape as `input_tensor`.
   1969 
   1970   Raises:
   1971     ValueError: when `input_tensor`'s rank is `None`.
   1972   """
   1973   input_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(
   1974       input_tensor)
   1975   if isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
   1976     return input_tensor
   1977   with ops.name_scope(None, 'to_sparse_input', (input_tensor, ignore_value,)):
   1978     if ignore_value is None:
   1979       if input_tensor.dtype == dtypes.string:
        # Strings are special-cased: the generic `as_numpy_dtype()` path
        # below would yield a numpy object rather than `''`.
   1981         ignore_value = ''
   1982       elif input_tensor.dtype.is_integer:
   1983         ignore_value = -1  # -1 has a special meaning of missing feature
   1984       else:
   1985         # NOTE: `as_numpy_dtype` is a property, so with the parentheses this is
   1986         # constructing a new numpy object of the given type, which yields the
   1987         # default value for that type.
   1988         ignore_value = input_tensor.dtype.as_numpy_dtype()
   1989     ignore_value = math_ops.cast(
   1990         ignore_value, input_tensor.dtype, name='ignore_value')
   1991     indices = array_ops.where(
   1992         math_ops.not_equal(input_tensor, ignore_value), name='indices')
   1993     return sparse_tensor_lib.SparseTensor(
   1994         indices=indices,
   1995         values=array_ops.gather_nd(input_tensor, indices, name='values'),
   1996         dense_shape=array_ops.shape(
   1997             input_tensor, out_type=dtypes.int64, name='dense_shape'))
   1998 
   1999 
   2000 def _clean_feature_columns(feature_columns):
   2001   """Verifies and normalizes `feature_columns` input."""
   2002   if isinstance(feature_columns, _FeatureColumn):
   2003     feature_columns = [feature_columns]
   2004 
   2005   if isinstance(feature_columns, collections.Iterator):
   2006     feature_columns = list(feature_columns)
   2007 
   2008   if isinstance(feature_columns, dict):
   2009     raise ValueError('Expected feature_columns to be iterable, found dict.')
   2010 
   2011   for column in feature_columns:
   2012     if not isinstance(column, _FeatureColumn):
   2013       raise ValueError('Items of feature_columns must be a _FeatureColumn. '
   2014                        'Given (type {}): {}.'.format(type(column), column))
   2015   if not feature_columns:
   2016     raise ValueError('feature_columns must not be empty.')
   2017   name_to_column = dict()
   2018   for column in feature_columns:
   2019     if column.name in name_to_column:
   2020       raise ValueError('Duplicate feature column name found for columns: {} '
   2021                        'and {}. This usually means that these columns refer to '
                       'the same base feature. Either one must be discarded '
                       'or a duplicated but renamed item must be inserted in '
                       'the features dict.'.format(column,
   2025                                                name_to_column[column.name]))
   2026     name_to_column[column.name] = column
   2027 
   2028   return feature_columns
   2029 
   2030 
   2031 class _NumericColumn(_DenseColumn,
   2032                      collections.namedtuple('_NumericColumn', [
   2033                          'key', 'shape', 'default_value', 'dtype',
   2034                          'normalizer_fn'
   2035                      ])):
  """See `numeric_column`."""
   2037 
   2038   @property
   2039   def name(self):
   2040     return self.key
   2041 
   2042   @property
   2043   def _parse_example_spec(self):
   2044     return {
   2045         self.key:
   2046             parsing_ops.FixedLenFeature(self.shape, self.dtype,
   2047                                         self.default_value)
   2048     }
   2049 
   2050   def _transform_feature(self, inputs):
   2051     input_tensor = inputs.get(self.key)
   2052     if isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
   2053       raise ValueError(
          'The corresponding Tensor of a numeric column must be a Tensor. '
   2055           'SparseTensor is not supported. key: {}'.format(self.key))
   2056     if self.normalizer_fn is not None:
   2057       input_tensor = self.normalizer_fn(input_tensor)
   2058     return math_ops.to_float(input_tensor)
   2059 
   2060   @property
   2061   def _variable_shape(self):
   2062     return tensor_shape.TensorShape(self.shape)
   2063 
   2064   def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
   2065     """Returns dense `Tensor` representing numeric feature.
   2066 
   2067     Args:
   2068       inputs: A `_LazyBuilder` object to access inputs.
   2069       weight_collections: Unused `weight_collections` since no variables are
   2070         created in this function.
   2071       trainable: Unused `trainable` bool since no variables are created in
   2072         this function.
   2073 
   2074     Returns:
   2075       Dense `Tensor` created within `_transform_feature`.
   2076     """
   2077     # Do nothing with weight_collections and trainable since no variables are
   2078     # created in this function.
   2079     del weight_collections
   2080     del trainable
   2081     # Feature has been already transformed. Return the intermediate
   2082     # representation created by _transform_feature.
   2083     return inputs.get(self)
   2084 
   2085 
   2086 class _BucketizedColumn(_DenseColumn, _CategoricalColumn,
   2087                         collections.namedtuple('_BucketizedColumn', [
   2088                             'source_column', 'boundaries'])):
   2089   """See `bucketized_column`."""
   2090 
   2091   @property
   2092   def name(self):
   2093     return '{}_bucketized'.format(self.source_column.name)
   2094 
   2095   @property
   2096   def _parse_example_spec(self):
   2097     return self.source_column._parse_example_spec  # pylint: disable=protected-access
   2098 
   2099   def _transform_feature(self, inputs):
   2100     source_tensor = inputs.get(self.source_column)
   2101     return math_ops._bucketize(  # pylint: disable=protected-access
   2102         source_tensor,
   2103         boundaries=self.boundaries)
   2104 
   2105   @property
   2106   def _variable_shape(self):
   2107     return tensor_shape.TensorShape(
   2108         tuple(self.source_column.shape) + (len(self.boundaries) + 1,))
   2109 
   2110   def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
   2111     del weight_collections
   2112     del trainable
   2113     input_tensor = inputs.get(self)
   2114     return array_ops.one_hot(
   2115         indices=math_ops.to_int64(input_tensor),
   2116         depth=len(self.boundaries) + 1,
   2117         on_value=1.,
   2118         off_value=0.)
   2119 
   2120   @property
   2121   def _num_buckets(self):
   2122     # By construction, source_column is always one-dimensional.
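    # E.g. boundaries [0., 10.] yield 3 buckets per source dimension.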
   2123     return (len(self.boundaries) + 1) * self.source_column.shape[0]
   2124 
   2125   def _get_sparse_tensors(self, inputs, weight_collections=None,
   2126                           trainable=None):
   2127     input_tensor = inputs.get(self)
   2128     batch_size = array_ops.shape(input_tensor)[0]
   2129     # By construction, source_column is always one-dimensional.
   2130     source_dimension = self.source_column.shape[0]
   2131 
   2132     i1 = array_ops.reshape(
   2133         array_ops.tile(
   2134             array_ops.expand_dims(math_ops.range(0, batch_size), 1),
   2135             [1, source_dimension]),
   2136         (-1,))
   2137     i2 = array_ops.tile(math_ops.range(0, source_dimension), [batch_size])
   2138     # Flatten the bucket indices and unique them across dimensions
   2139     # E.g. 2nd dimension indices will range from k to 2*k-1 with k buckets
   2140     bucket_indices = (
   2141         array_ops.reshape(input_tensor, (-1,)) +
   2142         (len(self.boundaries) + 1) * i2)
   2143 
   2144     indices = math_ops.to_int64(array_ops.transpose(array_ops.stack((i1, i2))))
   2145     dense_shape = math_ops.to_int64(array_ops.stack(
   2146         [batch_size, source_dimension]))
   2147     sparse_tensor = sparse_tensor_lib.SparseTensor(
   2148         indices=indices,
   2149         values=bucket_indices,
   2150         dense_shape=dense_shape)
   2151     return _CategoricalColumn.IdWeightPair(sparse_tensor, None)
   2152 
   2153 
   2154 class _EmbeddingColumn(
   2155     _DenseColumn,
   2156     collections.namedtuple('_EmbeddingColumn', (
   2157         'categorical_column', 'dimension', 'combiner', 'initializer',
   2158         'ckpt_to_load_from', 'tensor_name_in_ckpt', 'max_norm', 'trainable'
   2159     ))):
   2160   """See `embedding_column`."""
   2161 
   2162   @property
   2163   def name(self):
   2164     if not hasattr(self, '_name'):
   2165       self._name = '{}_embedding'.format(self.categorical_column.name)
   2166     return self._name
   2167 
   2168   @property
   2169   def _parse_example_spec(self):
   2170     return self.categorical_column._parse_example_spec  # pylint: disable=protected-access
   2171 
   2172   def _transform_feature(self, inputs):
   2173     return inputs.get(self.categorical_column)
   2174 
   2175   @property
   2176   def _variable_shape(self):
   2177     if not hasattr(self, '_shape'):
   2178       self._shape = tensor_shape.vector(self.dimension)
   2179     return self._shape
   2180 
   2181   def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
   2182     # Get sparse IDs and weights.
   2183     sparse_tensors = self.categorical_column._get_sparse_tensors(  # pylint: disable=protected-access
   2184         inputs, weight_collections=weight_collections, trainable=trainable)
   2185     sparse_ids = sparse_tensors.id_tensor
   2186     sparse_weights = sparse_tensors.weight_tensor
   2187 
   2188     embedding_shape = (self.categorical_column._num_buckets, self.dimension)  # pylint: disable=protected-access
   2189     embedding_weights = variable_scope.get_variable(
   2190         name='embedding_weights',
   2191         shape=embedding_shape,
   2192         dtype=dtypes.float32,
   2193         initializer=self.initializer,
   2194         trainable=self.trainable and trainable,
   2195         collections=weight_collections)
   2196     if self.ckpt_to_load_from is not None:
   2197       to_restore = embedding_weights
   2198       if isinstance(to_restore, variables.PartitionedVariable):
   2199         to_restore = to_restore._get_variable_list()  # pylint: disable=protected-access
   2200       checkpoint_utils.init_from_checkpoint(self.ckpt_to_load_from, {
   2201           self.tensor_name_in_ckpt: to_restore
   2202       })
   2203 
   2204     # Return embedding lookup result.
   2205     return _safe_embedding_lookup_sparse(
   2206         embedding_weights=embedding_weights,
   2207         sparse_ids=sparse_ids,
   2208         sparse_weights=sparse_weights,
   2209         combiner=self.combiner,
   2210         name='%s_weights' % self.name,
   2211         max_norm=self.max_norm)
   2212 
   2213 
   2214 class _SharedEmbeddingColumn(
   2215     _DenseColumn,
   2216     collections.namedtuple('_SharedEmbeddingColumn', (
   2217         'categorical_column', 'dimension', 'combiner', 'initializer',
   2218         'shared_embedding_collection_name', 'ckpt_to_load_from',
   2219         'tensor_name_in_ckpt', 'max_norm', 'trainable'
   2220     ))):
   2221   """See `embedding_column`."""
   2222 
   2223   @property
   2224   def name(self):
   2225     if not hasattr(self, '_name'):
   2226       self._name = '{}_shared_embedding'.format(self.categorical_column.name)
   2227     return self._name
   2228 
   2229   @property
   2230   def _var_scope_name(self):
   2231     return self.shared_embedding_collection_name
   2232 
   2233   @property
   2234   def _parse_example_spec(self):
   2235     return self.categorical_column._parse_example_spec  # pylint: disable=protected-access
   2236 
   2237   def _transform_feature(self, inputs):
   2238     return inputs.get(self.categorical_column)
   2239 
   2240   @property
   2241   def _variable_shape(self):
   2242     if not hasattr(self, '_shape'):
   2243       self._shape = tensor_shape.vector(self.dimension)
   2244     return self._shape
   2245 
   2246   def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
   2247     # This method is called from a variable_scope with name _var_scope_name,
   2248     # which is shared among all shared embeddings. Open a name_scope here, so
   2249     # that the ops for different columns have distinct names.
   2250     with ops.name_scope(None, default_name=self.name):
   2251       # Get sparse IDs and weights.
   2252       sparse_tensors = self.categorical_column._get_sparse_tensors(  # pylint: disable=protected-access
   2253           inputs, weight_collections=weight_collections, trainable=trainable)
   2254       sparse_ids = sparse_tensors.id_tensor
   2255       sparse_weights = sparse_tensors.weight_tensor
   2256 
   2257       embedding_shape = (self.categorical_column._num_buckets, self.dimension)  # pylint: disable=protected-access
   2258       shared_embedding_collection = ops.get_collection(
   2259           self.shared_embedding_collection_name)
   2260       if shared_embedding_collection:
   2261         if len(shared_embedding_collection) > 1:
   2262           raise ValueError(
   2263               'Collection {} can only contain one variable. '
   2264               'Suggested fix A: Choose a unique name for this collection. '
   2265               'Suggested fix B: Do not add any variables to this collection. '
   2266               'The feature_column library already adds a variable under the '
   2267               'hood.'.format(shared_embedding_collection))
   2268         embedding_weights = shared_embedding_collection[0]
   2269         if embedding_weights.get_shape() != embedding_shape:
   2270           raise ValueError(
   2271               'Shared embedding collection {} contains variable {} of '
   2272               'unexpected shape {}. Expected shape is {}. '
   2273               'Suggested fix A: Choose a unique name for this collection. '
   2274               'Suggested fix B: Do not add any variables to this collection. '
   2275               'The feature_column library already adds a variable under the '
   2276               'hood.'.format(
   2277                   self.shared_embedding_collection_name, embedding_weights.name,
   2278                   embedding_weights.get_shape(), embedding_shape))
   2279       else:
   2280         embedding_weights = variable_scope.get_variable(
   2281             name='embedding_weights',
   2282             shape=embedding_shape,
   2283             dtype=dtypes.float32,
   2284             initializer=self.initializer,
   2285             trainable=self.trainable and trainable,
   2286             collections=weight_collections)
   2287         ops.add_to_collection(
   2288             self.shared_embedding_collection_name, embedding_weights)
   2289       if self.ckpt_to_load_from is not None:
   2290         to_restore = embedding_weights
   2291         if isinstance(to_restore, variables.PartitionedVariable):
   2292           to_restore = to_restore._get_variable_list()  # pylint: disable=protected-access
   2293         checkpoint_utils.init_from_checkpoint(self.ckpt_to_load_from, {
   2294             self.tensor_name_in_ckpt: to_restore
   2295         })
   2296 
   2297       # Return embedding lookup result.
   2298       return _safe_embedding_lookup_sparse(
   2299           embedding_weights=embedding_weights,
   2300           sparse_ids=sparse_ids,
   2301           sparse_weights=sparse_weights,
   2302           combiner=self.combiner,
   2303           name='%s_weights' % self.name,
   2304           max_norm=self.max_norm)
   2305 
   2306 
   2307 def _create_tuple(shape, value):
   2308   """Returns a tuple with given shape and filled with value."""
   2309   if shape:
   2310     return tuple([_create_tuple(shape[1:], value) for _ in range(shape[0])])
   2311   return value
   2312 
   2313 
   2314 def _as_tuple(value):
   2315   if not nest.is_sequence(value):
   2316     return value
   2317   return tuple([_as_tuple(v) for v in value])
   2318 
   2319 
   2320 def _check_shape(shape, key):
   2321   """Returns shape if it's valid, raises error otherwise."""
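  # For example (illustration): _check_shape(4, 'age') returns (4,), and
  # _check_shape([2, 3], 'image') returns (2, 3).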
   2322   assert shape is not None
   2323   if not nest.is_sequence(shape):
   2324     shape = [shape]
   2325   shape = tuple(shape)
   2326   for dimension in shape:
   2327     if not isinstance(dimension, int):
   2328       raise TypeError('shape dimensions must be integer. '
   2329                       'shape: {}, key: {}'.format(shape, key))
   2330     if dimension < 1:
   2331       raise ValueError('shape dimensions must be greater than 0. '
   2332                        'shape: {}, key: {}'.format(shape, key))
   2333   return shape
   2334 
   2335 
   2336 def _is_shape_and_default_value_compatible(default_value, shape):
   2337   """Verifies compatibility of shape and default_value."""
  # Invalid conditions:
  #  * default_value is a sequence, but shape is empty
  #  * default_value is a scalar (not a sequence), but shape is not empty
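  # E.g. default_value=[[0.], [1.]] is compatible with shape=(2, 1), while a
  # scalar default_value is not compatible with shape=(1,).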
   2341   if nest.is_sequence(default_value) != bool(shape):
   2342     return False
   2343   if not shape:
   2344     return True
   2345   if len(default_value) != shape[0]:
   2346     return False
   2347   for i in range(shape[0]):
   2348     if not _is_shape_and_default_value_compatible(default_value[i], shape[1:]):
   2349       return False
   2350   return True
   2351 
   2352 
   2353 def _check_default_value(shape, default_value, dtype, key):
   2354   """Returns default value as tuple if it's valid, otherwise raises errors.
   2355 
   2356   This function verifies that `default_value` is compatible with both `shape`
   2357   and `dtype`. If it is not compatible, it raises an error. If it is compatible,
   2358   it casts default_value to a tuple and returns it. `key` is used only
   2359   for error message.
   2360 
   2361   Args:
    shape: An iterable of integers that specifies the shape of the `Tensor`.
   2363     default_value: If a single value is provided, the same value will be applied
   2364       as the default value for every item. If an iterable of values is
   2365       provided, the shape of the `default_value` should be equal to the given
   2366       `shape`.
    dtype: Defines the type of values. Defaults to `tf.float32`. Must be a
   2368       non-quantized, real integer or floating point type.
   2369     key: Column name, used only for error messages.
   2370 
   2371   Returns:
   2372     A tuple which will be used as default value.
   2373 
   2374   Raises:
    TypeError: if `default_value` is an iterable but not compatible with
      `shape`.
    TypeError: if `default_value` is not compatible with `dtype`.
   2377     ValueError: if `dtype` is not convertible to `tf.float32`.
   2378   """
   2379   if default_value is None:
   2380     return None
   2381 
   2382   if isinstance(default_value, int):
   2383     return _create_tuple(shape, default_value)
   2384 
   2385   if isinstance(default_value, float) and dtype.is_floating:
   2386     return _create_tuple(shape, default_value)
   2387 
   2388   if callable(getattr(default_value, 'tolist', None)):  # Handles numpy arrays
   2389     default_value = default_value.tolist()
   2390 
   2391   if nest.is_sequence(default_value):
   2392     if not _is_shape_and_default_value_compatible(default_value, shape):
   2393       raise ValueError(
   2394           'The shape of default_value must be equal to given shape. '
   2395           'default_value: {}, shape: {}, key: {}'.format(
   2396               default_value, shape, key))
   2397     # Check if the values in the list are all integers or are convertible to
   2398     # floats.
   2399     is_list_all_int = all(
   2400         isinstance(v, int) for v in nest.flatten(default_value))
   2401     is_list_has_float = any(
   2402         isinstance(v, float) for v in nest.flatten(default_value))
   2403     if is_list_all_int:
   2404       return _as_tuple(default_value)
   2405     if is_list_has_float and dtype.is_floating:
   2406       return _as_tuple(default_value)
   2407   raise TypeError('default_value must be compatible with dtype. '
   2408                   'default_value: {}, dtype: {}, key: {}'.format(
   2409                       default_value, dtype, key))
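
# Illustrative sketch of `_check_default_value` (hypothetical arguments):
#   _check_default_value((2,), [1, 2], dtypes.int64, 'f')
#   # -> (1, 2)
#   _check_default_value((2, 2), 3.0, dtypes.float32, 'f')
#   # -> ((3.0, 3.0), (3.0, 3.0)); the scalar is broadcast via _create_tuple.
#   _check_default_value((2,), [1.0, 2.0], dtypes.int64, 'f')
#   # raises TypeError: float defaults are rejected for an integer dtype.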


class _HashedCategoricalColumn(
    _CategoricalColumn,
    collections.namedtuple('_HashedCategoricalColumn',
                           ['key', 'hash_bucket_size', 'dtype'])):
  """See `categorical_column_with_hash_bucket`."""

  @property
  def name(self):
    return self.key

  @property
  def _parse_example_spec(self):
    return {self.key: parsing_ops.VarLenFeature(self.dtype)}

  def _transform_feature(self, inputs):
    input_tensor = _to_sparse_input(inputs.get(self.key))
    if not isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
      raise ValueError('SparseColumn input must be a SparseTensor.')

    _assert_string_or_int(
        input_tensor.dtype,
        prefix='column_name: {} input_tensor'.format(self.key))

    if self.dtype.is_integer != input_tensor.dtype.is_integer:
      raise ValueError(
          'Column dtype and SparseTensor\'s dtype must be compatible. '
          'key: {}, column dtype: {}, tensor dtype: {}'.format(
              self.key, self.dtype, input_tensor.dtype))

    if self.dtype == dtypes.string:
      sparse_values = input_tensor.values
    else:
      sparse_values = string_ops.as_string(input_tensor.values)

    sparse_id_values = string_ops.string_to_hash_bucket_fast(
        sparse_values, self.hash_bucket_size, name='lookup')
    return sparse_tensor_lib.SparseTensor(
        input_tensor.indices, sparse_id_values, input_tensor.dense_shape)

  @property
  def _num_buckets(self):
    """Returns number of buckets in this sparse feature."""
    return self.hash_bucket_size

  def _get_sparse_tensors(self, inputs, weight_collections=None,
                          trainable=None):
    return _CategoricalColumn.IdWeightPair(inputs.get(self), None)
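
# Minimal usage sketch (hypothetical feature name): non-string values are
# first converted with as_string, then fingerprint-hashed into
# [0, hash_bucket_size).
#   terms = categorical_column_with_hash_bucket('terms', hash_bucket_size=100)
#   # 'sports' always maps to the same bucket id in [0, 100); distinct
#   # strings may collide.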


class _VocabularyFileCategoricalColumn(
    _CategoricalColumn,
    collections.namedtuple('_VocabularyFileCategoricalColumn', (
        'key', 'vocabulary_file', 'vocabulary_size', 'num_oov_buckets', 'dtype',
        'default_value'
    ))):
  """See `categorical_column_with_vocabulary_file`."""

  @property
  def name(self):
    return self.key

  @property
  def _parse_example_spec(self):
    return {self.key: parsing_ops.VarLenFeature(self.dtype)}

  def _transform_feature(self, inputs):
    input_tensor = _to_sparse_input(inputs.get(self.key))

    if self.dtype.is_integer != input_tensor.dtype.is_integer:
      raise ValueError(
          'Column dtype and SparseTensor\'s dtype must be compatible. '
          'key: {}, column dtype: {}, tensor dtype: {}'.format(
              self.key, self.dtype, input_tensor.dtype))

    _assert_string_or_int(
        input_tensor.dtype,
        prefix='column_name: {} input_tensor'.format(self.key))

    key_dtype = self.dtype
    if input_tensor.dtype.is_integer:
      # `index_table_from_file` requires 64-bit integer keys.
      key_dtype = dtypes.int64
      input_tensor = math_ops.to_int64(input_tensor)

    return lookup_ops.index_table_from_file(
        vocabulary_file=self.vocabulary_file,
        num_oov_buckets=self.num_oov_buckets,
        vocab_size=self.vocabulary_size,
        default_value=self.default_value,
        key_dtype=key_dtype,
        name='{}_lookup'.format(self.key)).lookup(input_tensor)

  @property
  def _num_buckets(self):
    """Returns number of buckets in this sparse feature."""
    return self.vocabulary_size + self.num_oov_buckets

  def _get_sparse_tensors(
      self, inputs, weight_collections=None, trainable=None):
    return _CategoricalColumn.IdWeightPair(inputs.get(self), None)
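
# Minimal usage sketch (hypothetical file path): in-vocabulary values map to
# [0, vocabulary_size); OOV values are hashed into the trailing OOV buckets.
#   states = categorical_column_with_vocabulary_file(
#       key='state', vocabulary_file='/path/to/states.txt',
#       vocabulary_size=50, num_oov_buckets=5)
#   # _num_buckets == 55; an unseen state maps to an id in [50, 55).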


class _VocabularyListCategoricalColumn(
    _CategoricalColumn,
    collections.namedtuple('_VocabularyListCategoricalColumn', (
        'key', 'vocabulary_list', 'dtype', 'default_value', 'num_oov_buckets'
    ))):
  """See `categorical_column_with_vocabulary_list`."""

  @property
  def name(self):
    return self.key

  @property
  def _parse_example_spec(self):
    return {self.key: parsing_ops.VarLenFeature(self.dtype)}

  def _transform_feature(self, inputs):
    input_tensor = _to_sparse_input(inputs.get(self.key))

    if self.dtype.is_integer != input_tensor.dtype.is_integer:
      raise ValueError(
          'Column dtype and SparseTensor\'s dtype must be compatible. '
          'key: {}, column dtype: {}, tensor dtype: {}'.format(
              self.key, self.dtype, input_tensor.dtype))

    _assert_string_or_int(
        input_tensor.dtype,
        prefix='column_name: {} input_tensor'.format(self.key))

    key_dtype = self.dtype
    if input_tensor.dtype.is_integer:
      # `index_table_from_tensor` requires 64-bit integer keys.
      key_dtype = dtypes.int64
      input_tensor = math_ops.to_int64(input_tensor)

    return lookup_ops.index_table_from_tensor(
        vocabulary_list=tuple(self.vocabulary_list),
        default_value=self.default_value,
        num_oov_buckets=self.num_oov_buckets,
        dtype=key_dtype,
        name='{}_lookup'.format(self.key)).lookup(input_tensor)

  @property
  def _num_buckets(self):
    """Returns number of buckets in this sparse feature."""
    return len(self.vocabulary_list) + self.num_oov_buckets

  def _get_sparse_tensors(
      self, inputs, weight_collections=None, trainable=None):
    return _CategoricalColumn.IdWeightPair(inputs.get(self), None)
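
# Minimal usage sketch (hypothetical values):
#   colors = categorical_column_with_vocabulary_list(
#       key='color', vocabulary_list=('R', 'G', 'B'), num_oov_buckets=2)
#   # 'G' -> 1; an out-of-vocabulary string maps into ids [3, 5).
#   # _num_buckets == len(vocabulary_list) + num_oov_buckets == 5.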


class _IdentityCategoricalColumn(
    _CategoricalColumn,
    collections.namedtuple('_IdentityCategoricalColumn', (
        'key', 'num_buckets', 'default_value'
    ))):
  """See `categorical_column_with_identity`."""

  @property
  def name(self):
    return self.key

  @property
  def _parse_example_spec(self):
    return {self.key: parsing_ops.VarLenFeature(dtypes.int64)}

  def _transform_feature(self, inputs):
    input_tensor = _to_sparse_input(inputs.get(self.key))

    if not input_tensor.dtype.is_integer:
      raise ValueError(
          'Invalid input, not integer. key: {} dtype: {}'.format(
              self.key, input_tensor.dtype))

    values = math_ops.to_int64(input_tensor.values, name='values')
    num_buckets = math_ops.to_int64(self.num_buckets, name='num_buckets')
    zero = math_ops.to_int64(0, name='zero')
    if self.default_value is None:
      # Fail if values are out-of-range.
      assert_less = check_ops.assert_less(
          values, num_buckets, data=(values, num_buckets),
          name='assert_less_than_num_buckets')
      assert_greater = check_ops.assert_greater_equal(
          values, zero, data=(values,),
          name='assert_greater_or_equal_0')
      with ops.control_dependencies((assert_less, assert_greater)):
        values = array_ops.identity(values)
    else:
      # Assign default for out-of-range values.
      values = array_ops.where(
          math_ops.logical_or(
              values < zero, values >= num_buckets, name='out_of_range'),
          array_ops.fill(
              dims=array_ops.shape(values),
              value=math_ops.to_int64(self.default_value),
              name='default_values'),
          values)

    return sparse_tensor_lib.SparseTensor(
        indices=input_tensor.indices,
        values=values,
        dense_shape=input_tensor.dense_shape)

  @property
  def _num_buckets(self):
    """Returns number of buckets in this sparse feature."""
    return self.num_buckets

  def _get_sparse_tensors(
      self, inputs, weight_collections=None, trainable=None):
    return _CategoricalColumn.IdWeightPair(inputs.get(self), None)
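
# Minimal usage sketch (hypothetical values): integer inputs are used as ids
# directly.
#   video = categorical_column_with_identity(
#       key='video_id', num_buckets=1000, default_value=0)
#   # 7 -> 7; an out-of-range input such as 1000 is replaced by 0.
#   # With default_value=None, out-of-range inputs fail at run time instead.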


class _WeightedCategoricalColumn(
    _CategoricalColumn,
    collections.namedtuple('_WeightedCategoricalColumn', (
        'categorical_column', 'weight_feature_key', 'dtype'
    ))):
  """See `weighted_categorical_column`."""

  @property
  def name(self):
    return '{}_weighted_by_{}'.format(
        self.categorical_column.name, self.weight_feature_key)

  @property
  def _parse_example_spec(self):
    config = self.categorical_column._parse_example_spec  # pylint: disable=protected-access
    if self.weight_feature_key in config:
      raise ValueError('Parse config {} already exists for {}.'.format(
          config[self.weight_feature_key], self.weight_feature_key))
    config[self.weight_feature_key] = parsing_ops.VarLenFeature(self.dtype)
    return config

  @property
  def _num_buckets(self):
    return self.categorical_column._num_buckets  # pylint: disable=protected-access

  def _transform_feature(self, inputs):
    weight_tensor = inputs.get(self.weight_feature_key)
    if weight_tensor is None:
      raise ValueError('Missing weights {}.'.format(self.weight_feature_key))
    weight_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(
        weight_tensor)
    if self.dtype != weight_tensor.dtype.base_dtype:
      raise ValueError('Bad dtype, expected {}, but got {}.'.format(
          self.dtype, weight_tensor.dtype))
    if not isinstance(weight_tensor, sparse_tensor_lib.SparseTensor):
      # The weight tensor can be a regular Tensor. In this case, sparsify it.
      weight_tensor = _to_sparse_input(weight_tensor, ignore_value=0.0)
    if not weight_tensor.dtype.is_floating:
      weight_tensor = math_ops.to_float(weight_tensor)
    return (inputs.get(self.categorical_column), weight_tensor)

  def _get_sparse_tensors(
      self, inputs, weight_collections=None, trainable=None):
    del weight_collections
    del trainable
    tensors = inputs.get(self)
    return _CategoricalColumn.IdWeightPair(tensors[0], tensors[1])
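
# Minimal usage sketch (hypothetical feature keys): pairs each categorical id
# with a weight read from a second feature.
#   terms = categorical_column_with_vocabulary_list('terms', ('cat', 'dog'))
#   weighted = weighted_categorical_column(
#       categorical_column=terms, weight_feature_key='frequencies')
#   # features == {'terms': [['cat', 'dog']], 'frequencies': [[0.9, 0.1]]}
#   # yields id 0 with weight 0.9 and id 1 with weight 0.1.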


class _CrossedColumn(
    _CategoricalColumn,
    collections.namedtuple('_CrossedColumn',
                           ['keys', 'hash_bucket_size', 'hash_key'])):
  """See `crossed_column`."""

  @property
  def name(self):
    feature_names = []
    for key in _collect_leaf_level_keys(self):
      if isinstance(key, _FeatureColumn):
        feature_names.append(key.name)
      else:  # key must be a string
        feature_names.append(key)
    return '_X_'.join(sorted(feature_names))

  @property
  def _parse_example_spec(self):
    config = {}
    for key in self.keys:
      if isinstance(key, _FeatureColumn):
        config.update(key._parse_example_spec)  # pylint: disable=protected-access
      else:  # key must be a string
        config.update({key: parsing_ops.VarLenFeature(dtypes.string)})
    return config

  def _transform_feature(self, inputs):
    feature_tensors = []
    for key in _collect_leaf_level_keys(self):
      if isinstance(key, six.string_types):
        feature_tensors.append(inputs.get(key))
      elif isinstance(key, _CategoricalColumn):
        ids_and_weights = key._get_sparse_tensors(inputs)  # pylint: disable=protected-access
        if ids_and_weights.weight_tensor is not None:
          raise ValueError(
              'crossed_column does not support weight_tensor, but the given '
              'column populates weight_tensor. '
              'Given column: {}'.format(key.name))
        feature_tensors.append(ids_and_weights.id_tensor)
      else:
        raise ValueError('Unsupported column type. Given: {}'.format(key))
    return sparse_ops._sparse_cross_hashed(  # pylint: disable=protected-access
        inputs=feature_tensors,
        num_buckets=self.hash_bucket_size,
        hash_key=self.hash_key)

  @property
  def _num_buckets(self):
    """Returns number of buckets in this sparse feature."""
    return self.hash_bucket_size

  def _get_sparse_tensors(self, inputs, weight_collections=None,
                          trainable=None):
    return _CategoricalColumn.IdWeightPair(inputs.get(self), None)
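
# Minimal usage sketch (hypothetical keys): every combination of values is
# hashed into [0, hash_bucket_size).
#   zip_x_street = crossed_column(['zip', 'street'], hash_bucket_size=10000)
#   # The pair ('94107', 'market') is hashed to a single id in [0, 10000).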


def _collect_leaf_level_keys(cross):
  """Collects base keys by expanding all nested crosses.

  Args:
    cross: A `_CrossedColumn`.

  Returns:
    A list of strings or `_CategoricalColumn` instances.
  """
  leaf_level_keys = []
  for k in cross.keys:
    if isinstance(k, _CrossedColumn):
      leaf_level_keys.extend(_collect_leaf_level_keys(k))
    else:
      leaf_level_keys.append(k)
  return leaf_level_keys
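
# Illustrative sketch (hypothetical columns): nested crosses are flattened.
#   a_x_b = crossed_column(['a', 'b'], hash_bucket_size=10)
#   a_x_b_x_c = crossed_column([a_x_b, 'c'], hash_bucket_size=10)
#   _collect_leaf_level_keys(a_x_b_x_c)  # -> ['a', 'b', 'c']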


# TODO(zakaria): Move this to embedding_ops and make it public.
def _safe_embedding_lookup_sparse(embedding_weights,
                                  sparse_ids,
                                  sparse_weights=None,
                                  combiner='mean',
                                  default_id=None,
                                  name=None,
                                  partition_strategy='div',
                                  max_norm=None):
  """Looks up embedding results, accounting for invalid IDs and empty features.

  The partitioned embeddings in `embedding_weights` must all have the same
  shape except for the first dimension. The first dimension is allowed to vary
  as the vocabulary size is not necessarily a multiple of `P`.
  `embedding_weights` may be a `PartitionedVariable` as returned by
  `tf.get_variable()` with a partitioner.

  Invalid IDs (< 0) are pruned from input IDs and weights, as well as any IDs
  with non-positive weight. For an entry with no features, the embedding vector
  for `default_id` is returned, or the 0-vector if `default_id` is not supplied.

  The ids and weights may be multi-dimensional. Embeddings are always aggregated
  along the last dimension.

  Args:
    embedding_weights:  A list of `P` float `Tensor`s or values representing
        partitioned embedding `Tensor`s.  Alternatively, a `PartitionedVariable`
        created by partitioning along dimension 0.  The total unpartitioned
        shape should be `[e_0, e_1, ..., e_m]`, where `e_0` represents the
        vocab size and `e_1, ..., e_m` are the embedding dimensions.
    sparse_ids: `SparseTensor` of shape `[d_0, d_1, ..., d_n]` containing the
        ids. `d_0` is typically batch size.
    sparse_weights: `SparseTensor` of same shape as `sparse_ids`, containing
        float weights corresponding to `sparse_ids`, or `None` if all weights
        are assumed to be 1.0.
    combiner: A string specifying how to combine embedding results for each
        entry. Currently "mean", "sqrtn" and "sum" are supported, with "mean"
        the default.
    default_id: The id to use for an entry with no features.
    name: A name for this operation (optional).
    partition_strategy: A string specifying the partitioning strategy.
        Currently `"div"` and `"mod"` are supported. Default is `"div"`.
    max_norm: If not `None`, all embeddings are l2-normalized to max_norm before
        combining.

  Returns:
    Dense `Tensor` of shape `[d_0, d_1, ..., d_{n-1}, e_1, ..., e_m]`.

  Raises:
    ValueError: if `embedding_weights` is empty.
  """
  if embedding_weights is None:
    raise ValueError('Missing embedding_weights %s.' % embedding_weights)
  if isinstance(embedding_weights, variables.PartitionedVariable):
    embedding_weights = list(embedding_weights)  # get underlying Variables.
  if not isinstance(embedding_weights, list):
    embedding_weights = [embedding_weights]
  if len(embedding_weights) < 1:
    raise ValueError('Missing embedding_weights %s.' % embedding_weights)

  dtype = sparse_weights.dtype if sparse_weights is not None else None
  embedding_weights = [
      ops.convert_to_tensor(w, dtype=dtype) for w in embedding_weights
  ]

  with ops.name_scope(name, 'embedding_lookup',
                      embedding_weights + [sparse_ids,
                                           sparse_weights]) as scope:
    # Reshape higher-rank sparse ids and weights to linear segment ids.
    original_shape = sparse_ids.dense_shape
    original_rank_dim = sparse_ids.dense_shape.get_shape()[0]
    original_rank = (
        array_ops.size(original_shape)
        if original_rank_dim.value is None
        else original_rank_dim.value)
    sparse_ids = sparse_ops.sparse_reshape(sparse_ids, [
        math_ops.reduce_prod(
            array_ops.slice(original_shape, [0], [original_rank - 1])),
        array_ops.gather(original_shape, original_rank - 1)])
    if sparse_weights is not None:
      sparse_weights = sparse_tensor_lib.SparseTensor(
          sparse_ids.indices,
          sparse_weights.values, sparse_ids.dense_shape)

    # Prune invalid ids and weights.
    sparse_ids, sparse_weights = _prune_invalid_ids(sparse_ids, sparse_weights)

    # Fill in dummy values for empty features, if necessary.
    sparse_ids, is_row_empty = sparse_ops.sparse_fill_empty_rows(sparse_ids,
                                                                 default_id or
                                                                 0)
    if sparse_weights is not None:
      sparse_weights, _ = sparse_ops.sparse_fill_empty_rows(sparse_weights, 1.0)

    result = embedding_ops.embedding_lookup_sparse(
        embedding_weights,
        sparse_ids,
        sparse_weights,
        combiner=combiner,
        partition_strategy=partition_strategy,
        name=None if default_id is None else scope,
        max_norm=max_norm)

    if default_id is None:
      # Broadcast is_row_empty to the same shape as embedding_lookup_result,
      # for use in Select.
      is_row_empty = array_ops.tile(
          array_ops.reshape(is_row_empty, [-1, 1]),
          array_ops.stack([1, array_ops.shape(result)[1]]))

      result = array_ops.where(is_row_empty,
                               array_ops.zeros_like(result),
                               result,
                               name=scope)

    # Reshape from linear ids back into the higher-dimensional dense result.
    final_result = array_ops.reshape(
        result,
        array_ops.concat([
            array_ops.slice(
                math_ops.cast(original_shape, dtypes.int32), [0],
                [original_rank - 1]),
            array_ops.slice(array_ops.shape(result), [1], [-1])
        ], 0))
    final_result.set_shape(tensor_shape.unknown_shape(
        (original_rank_dim - 1).value).concatenate(result.get_shape()[1:]))
    return final_result
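
# Shape sketch (assumed sizes, not a runnable test): with a single
# embedding_weights tensor of shape [vocab_size, 4] and sparse_ids with dense
# shape [batch_size, 5], the result has shape [batch_size, 4]. A row with no
# ids (or only pruned ids) yields the zero vector, or the embedding of
# `default_id` when one is supplied.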


def _prune_invalid_ids(sparse_ids, sparse_weights):
  """Prune invalid IDs (< 0) from the input ids and weights."""
  is_id_valid = math_ops.greater_equal(sparse_ids.values, 0)
  if sparse_weights is not None:
    is_id_valid = math_ops.logical_and(
        is_id_valid, math_ops.greater(sparse_weights.values, 0))
  sparse_ids = sparse_ops.sparse_retain(sparse_ids, is_id_valid)
  if sparse_weights is not None:
    sparse_weights = sparse_ops.sparse_retain(sparse_weights, is_id_valid)
  return sparse_ids, sparse_weights
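
# Illustrative sketch (hypothetical values): with sparse_ids.values == [-1, 3]
# and sparse_weights.values == [0.5, 0.0], both entries are pruned: -1 is a
# negative id, and id 3 carries a non-positive weight.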


class _IndicatorColumn(_DenseColumn,
                       collections.namedtuple('_IndicatorColumn',
                                              ['categorical_column'])):
  """Represents a one-hot column for use in deep networks.

  Args:
    categorical_column: A `_CategoricalColumn` created by a
      `categorical_column_with_*` function.
  """

  @property
  def name(self):
    return '{}_indicator'.format(self.categorical_column.name)

  def _transform_feature(self, inputs):
    """Returns dense `Tensor` representing feature.

    Args:
      inputs: A `_LazyBuilder` object to access inputs.

    Returns:
      Transformed feature `Tensor`.

    Raises:
      ValueError: if input rank is not known at graph building time.
    """
    id_weight_pair = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
    id_tensor = id_weight_pair.id_tensor
    weight_tensor = id_weight_pair.weight_tensor

    # If the underlying column is weighted, return the input as a dense tensor.
    if weight_tensor is not None:
      weighted_column = sparse_ops.sparse_merge(
          sp_ids=id_tensor,
          sp_values=weight_tensor,
          vocab_size=int(self._variable_shape[-1]))
      # Remove (?, -1) index.
      weighted_column = sparse_ops.sparse_slice(weighted_column, [0, 0],
                                                weighted_column.dense_shape)
      return sparse_ops.sparse_tensor_to_dense(weighted_column)

    dense_id_tensor = sparse_ops.sparse_tensor_to_dense(
        id_tensor, default_value=-1)

    # One hot must be float for tf.concat reasons since all other inputs to
    # input_layer are float32.
    one_hot_id_tensor = array_ops.one_hot(
        dense_id_tensor,
        depth=self._variable_shape[-1],
        on_value=1.0,
        off_value=0.0)

    # Reduce to get a multi-hot per example.
    return math_ops.reduce_sum(one_hot_id_tensor, axis=[-2])

  @property
  def _parse_example_spec(self):
    return self.categorical_column._parse_example_spec  # pylint: disable=protected-access

  @property
  def _variable_shape(self):
    """Returns a `TensorShape` representing the shape of the dense `Tensor`."""
    return tensor_shape.TensorShape([1, self.categorical_column._num_buckets])  # pylint: disable=protected-access

  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
    """Returns dense `Tensor` representing feature.

    Args:
      inputs: A `_LazyBuilder` object to access inputs.
      weight_collections: Unused `weight_collections` since no variables are
        created in this function.
      trainable: Unused `trainable` bool since no variables are created in
        this function.

    Returns:
      Dense `Tensor` created within `_transform_feature`.
    """
    # Do nothing with weight_collections and trainable since no variables are
    # created in this function.
    del weight_collections
    del trainable
    # Feature has already been transformed. Return the intermediate
    # representation created by _transform_feature.
    return inputs.get(self)
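
# Minimal usage sketch (hypothetical values): the indicator is a multi-hot
# count vector over the wrapped column's buckets.
#   animal = indicator_column(
#       categorical_column_with_vocabulary_list('animal', ('cat', 'dog', 'fox')))
#   # ids [[0, 2]] -> [[1., 0., 1.]]; repeated ids are summed, not clipped.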


def _verify_static_batch_size_equality(tensors, columns):
  # batch_size is a tf.Dimension object.
  expected_batch_size = None
  for i in range(len(tensors)):
    if tensors[i].shape[0].value is not None:
      if expected_batch_size is None:
        batch_size_column_index = i
        expected_batch_size = tensors[i].shape[0]
      elif not expected_batch_size.is_compatible_with(tensors[i].shape[0]):
        raise ValueError(
            'Batch size (first dimension) of each feature must be the same. '
            'Batch size of columns ({}, {}): ({}, {})'.format(
                columns[batch_size_column_index].name, columns[i].name,
                expected_batch_size, tensors[i].shape[0]))
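
# Illustrative sketch (assumed static shapes): given two feature tensors whose
# first dimensions are statically 32 and 64, this raises ValueError naming the
# two offending columns; tensors with an unknown (None) batch dimension are
# skipped.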
   2993