# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""This API defines the FeatureColumn abstraction.

FeatureColumns provide a high-level abstraction for ingesting and representing
features in `Estimator` models.

FeatureColumns are the primary way of encoding features for pre-canned
`Estimator` models.

When using FeatureColumns with `Estimator` models, the type of feature column
you should choose depends on (1) the feature type and (2) the model type.

(1) Feature type:

 * Continuous features can be represented by `real_valued_column`.
 * Categorical features can be represented by any `sparse_column_with_*`
 column (`sparse_column_with_keys`, `sparse_column_with_vocabulary_file`,
 `sparse_column_with_hash_bucket`, `sparse_column_with_integerized_feature`).

(2) Model type:

 * Deep neural network models (`DNNClassifier`, `DNNRegressor`).

   Continuous features can be directly fed into deep neural network models.

     age_column = real_valued_column("age")

   To feed sparse features into DNN models, wrap the column with
   `embedding_column` or `one_hot_column`. `one_hot_column` creates a dense
   boolean tensor with an entry for each possible value, so the computation
   cost is linear in the number of possible values rather than in the number
   of values that actually occur in the sparse tensor. Using `one_hot_column`
   is therefore recommended only for features with a small number of possible
   values. For features with many possible values, or for very sparse
   features, `embedding_column` is recommended.

     embedded_dept_column = embedding_column(
       sparse_column_with_keys("department", ["math", "philosophy", ...]),
       dimension=10)

 * Wide (aka linear) models (`LinearClassifier`, `LinearRegressor`).

   Sparse features can be fed directly into linear models. When doing so,
   embedding lookups are used to efficiently perform the sparse matrix
   multiplication.

     dept_column = sparse_column_with_keys("department",
       ["math", "philosophy", "english"])

   It is recommended that continuous features be bucketized before being
   fed into linear models.

     bucketized_age_column = bucketized_column(
       source_column=age_column,
       boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])

   Sparse features can be crossed (also known as conjuncted or combined) in
   order to form non-linearities, and then fed into linear models.

     cross_dept_age_column = crossed_column(
       columns=[department_column, bucketized_age_column],
       hash_bucket_size=1000)

Example of building an `Estimator` model using FeatureColumns:

  # Define features and transformations
  deep_feature_columns = [age_column, embedded_dept_column]
  wide_feature_columns = [dept_column, bucketized_age_column,
      cross_dept_age_column]

  # Build deep model
  estimator = DNNClassifier(
      feature_columns=deep_feature_columns,
      hidden_units=[500, 250, 50])
  estimator.train(...)

  # Or build a wide model
  estimator = LinearClassifier(
      feature_columns=wide_feature_columns)
  estimator.train(...)

  # Or build a wide and deep model!
  estimator = DNNLinearCombinedClassifier(
      linear_feature_columns=wide_feature_columns,
      dnn_feature_columns=deep_feature_columns,
      dnn_hidden_units=[500, 250, 50])
  estimator.train(...)


FeatureColumns can also be transformed into a generic input layer for
custom models using `input_from_feature_columns` within
`feature_column_ops.py`.

Example of building a non-`Estimator` model using FeatureColumns:

  # Building model via layers

  deep_feature_columns = [age_column, embedded_dept_column]
  columns_to_tensor = parse_feature_columns_from_examples(
      serialized=my_data,
      feature_columns=deep_feature_columns)
  first_layer = input_from_feature_columns(
      columns_to_tensors=columns_to_tensor,
      feature_columns=deep_feature_columns)
  second_layer = fully_connected(first_layer, ...)

See feature_column_ops_test for more examples.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import abc
import collections
import math

import six

from tensorflow.contrib import lookup
from tensorflow.contrib.framework.python.framework import checkpoint_utils
from tensorflow.contrib.framework.python.framework import experimental
from tensorflow.contrib.framework.python.ops import variables as contrib_variables
from tensorflow.contrib.layers.python.layers import embedding_ops
from tensorflow.contrib.layers.python.layers import layers
from tensorflow.contrib.layers.python.ops import bucketization_op
from tensorflow.contrib.layers.python.ops import sparse_feature_cross_op
from tensorflow.contrib.layers.python.ops import sparse_ops as contrib_sparse_ops
from tensorflow.python.feature_column import feature_column as fc_core
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.framework import sparse_tensor as sparse_tensor_py
from tensorflow.python.framework import tensor_shape
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import init_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import parsing_ops
from tensorflow.python.ops import resource_variable_ops
from tensorflow.python.ops import sparse_ops
from tensorflow.python.ops import string_ops
from tensorflow.python.ops import variables
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.util import deprecation
from tensorflow.python.util import nest


# Imports the core `InputLayer` symbol in contrib during development.
InputLayer = fc_core.InputLayer  # pylint: disable=invalid-name


class _LinearEmbeddingLookupArguments(
    collections.namedtuple("_LinearEmbeddingLookupArguments",
                           ["input_tensor",
                            "weight_tensor",
                            "vocab_size",
                            "initializer",
                            "combiner"])):
  """Represents the information needed from a column for embedding lookup.

  Used to compute DNN inputs and weighted sum.
  """
  pass


class _DeepEmbeddingLookupArguments(
    collections.namedtuple("_DeepEmbeddingLookupArguments",
                           ["input_tensor",
                            "weight_tensor",
                            "vocab_size",
                            "initializer",
                            "combiner",
                            "dimension",
                            "shared_embedding_name",
                            "hash_key",
                            "max_norm",
                            "trainable"])):
  """Represents the information needed from a column for embedding lookup.

  Used to compute DNN inputs and weighted sum.
  """
  pass


class _FeatureColumn(object):
  """Represents a feature column abstraction.

  To distinguish the concept of a feature family from a specific binary
  feature within a family, we refer to a feature family like "country" as a
  feature column. For example, "country:US" is a feature which is in the
  "country" feature column and has a feature value ("US").
  This class is abstract; users should not create instances of it. The
  following classes (_SparseColumn, _RealValuedColumn, ...) are concrete
  subclasses.
  """
  __metaclass__ = abc.ABCMeta

  @abc.abstractproperty
  @deprecation.deprecated(
      "2016-09-25",
      "Should be private.")
  def name(self):
    """Returns the name of the column or transformed column."""
    pass

  @abc.abstractproperty
  @deprecation.deprecated(
      "2016-09-25",
      "Should be private.")
  def config(self):
    """Returns configuration of the base feature for `tf.parse_example`."""
    pass

  @abc.abstractproperty
  @deprecation.deprecated(
      "2016-09-25",
      "Should be private.")
  def key(self):
    """Returns a string which will be used as a key when we do sorting."""
    pass

  @abc.abstractmethod
  @deprecation.deprecated(
      "2016-09-25",
      "Should be private.")
  def insert_transformed_feature(self, columns_to_tensors):
    """Applies the transformation and inserts it into columns_to_tensors.

    Args:
      columns_to_tensors: A mapping from feature columns to tensors. A string
        key means a base (untransformed) feature. The mapping can also have a
        _FeatureColumn as a key, which means that _FeatureColumn has already
        been transformed.
    """
    raise NotImplementedError("Transform is not implemented for {}.".format(
        self))

  # pylint: disable=unused-argument
  def _to_dnn_input_layer(self,
                          input_tensor,
                          weight_collection=None,
                          trainable=True,
                          output_rank=2):
    """Returns a Tensor as an input to the first layer of the neural network."""
    raise ValueError("Calling an abstract method.")

  def _deep_embedding_lookup_arguments(self, input_tensor):
    """Returns arguments to embedding lookup to build an input layer."""
    raise NotImplementedError(
        "No deep embedding lookup arguments for column {}.".format(self))

  # It is expected that classes implement either
  # _wide_embedding_lookup_arguments or _to_dense_tensor to be used in linear
  # models.
  # pylint: disable=unused-argument
  def _wide_embedding_lookup_arguments(self, input_tensor):
    """Returns arguments to look up embeddings for this column."""
    raise NotImplementedError(
        "No wide embedding lookup arguments for column {}.".format(self))

  # pylint: disable=unused-argument
  def _to_dense_tensor(self, input_tensor):
    """Returns a dense tensor representing this column's values."""
    raise NotImplementedError(
        "No dense tensor representation for column {}.".format(self))

  def _checkpoint_path(self):
    """Returns None, or a (path, tensor_name) pair to load a checkpoint from."""
    return None

  def _key_without_properties(self, properties):
    """Helper method for self.key() that omits particular properties."""
    fields_values = []
    # pylint: disable=protected-access
    for i, k in enumerate(self._fields):
      if k in properties:
        # Excludes a property from the key.
        # For instance, exclude `initializer` from the key of EmbeddingColumn
        # since we don't support users specifying different initializers for
        # the same embedding column. Ditto for `normalizer` and
        # RealValuedColumn.
        # Special treatment is needed since the default str form of a
        # function contains its address, which could introduce non-determinism
        # in sorting.
        continue
      fields_values.append("{}={}".format(k, self[i]))
    # pylint: enable=protected-access

    # This is effectively the same format as str(self), except with our special
    # treatment.
    return "{}({})".format(type(self).__name__, ", ".join(fields_values))


# TODO(b/30410315): Support warm starting in all feature columns.
class _SparseColumn(
    _FeatureColumn,
    fc_core._CategoricalColumn,  # pylint: disable=protected-access
    collections.namedtuple("_SparseColumn", [
        "column_name", "is_integerized", "bucket_size", "lookup_config",
        "combiner", "dtype"
    ])):
  """Represents a sparse feature column, also known as a categorical feature.

  Instances of this class are immutable. A sparse column means the features
  are sparse, and the dictionary returned by InputBuilder contains a
  ("column_name", SparseTensor) pair.
  One and only one of bucket_size or lookup_config should be set. If
  is_integerized is True then bucket_size should be set.

  Attributes:
    column_name: A string defining the sparse column name.
    is_integerized: A bool. If True, the feature is an integer and can itself
      be used as the id.
    bucket_size: An int that is > 0. The number of buckets.
    lookup_config: A _SparseIdLookupConfig defining the feature-to-id lookup
      configuration.
    combiner: A string specifying how to reduce if the sparse column is
      multivalent. Currently "mean", "sqrtn" and "sum" are supported, with
      "sum" the default. "sqrtn" often achieves good accuracy, in particular
      with bag-of-words columns.
        * "sum": do not normalize features in the column
        * "mean": do l1 normalization on features in the column
        * "sqrtn": do l2 normalization on features in the column
      For more information: `tf.embedding_lookup_sparse`.
    dtype: Type of features, either `tf.string` or `tf.int64`.

  Raises:
    TypeError: if lookup_config is not a _SparseIdLookupConfig.
    ValueError: if the above expectations about the input fail.
  """

  def __new__(cls,
              column_name,
              is_integerized=False,
              bucket_size=None,
              lookup_config=None,
              combiner="sum",
              dtype=dtypes.string):
    if is_integerized and bucket_size is None:
      raise ValueError("bucket_size must be set if is_integerized is True. "
                       "column_name: {}".format(column_name))

    if is_integerized and not dtype.is_integer:
      raise ValueError("dtype must be an integer if is_integerized is True. "
                       "dtype: {}, column_name: {}.".format(dtype, column_name))
    if dtype != dtypes.string and not dtype.is_integer:
      raise ValueError("dtype must be string or integer. "
                       "dtype: {}, column_name: {}".format(dtype, column_name))

    if bucket_size is None and lookup_config is None:
      raise ValueError("one of bucket_size or lookup_config must be set. "
                       "column_name: {}".format(column_name))

    if bucket_size is not None and lookup_config:
      raise ValueError("one and only one of bucket_size or lookup_config "
                       "must be set. column_name: {}".format(column_name))

    if bucket_size is not None and bucket_size < 1:
      raise ValueError("bucket_size must be at least 1. "
                       "bucket_size: {}, column_name: {}".format(bucket_size,
                                                                 column_name))

    if ((lookup_config) and
        (not isinstance(lookup_config, _SparseIdLookupConfig))):
      raise TypeError(
          "lookup_config must be an instance of _SparseIdLookupConfig. "
          "Given one is in type {} for column_name {}".format(
              type(lookup_config), column_name))

    if (lookup_config and lookup_config.vocabulary_file and
        lookup_config.vocab_size is None):
      raise ValueError("vocab_size must be defined. "
                       "column_name: {}".format(column_name))

    return super(_SparseColumn, cls).__new__(
        cls,
        column_name,
        is_integerized=is_integerized,
        bucket_size=bucket_size,
        lookup_config=lookup_config,
        combiner=combiner,
        dtype=dtype)

  @property
  def name(self):
    return self.column_name

  @property
  def length(self):
    """Returns vocabulary or hash_bucket size."""
    if self.bucket_size is not None:
      return self.bucket_size
    return self.lookup_config.vocab_size + self.lookup_config.num_oov_buckets

  @property
  def config(self):
    return {self.column_name: parsing_ops.VarLenFeature(self.dtype)}

  @property
  def key(self):
    """Returns a string which will be used as a key when we do sorting."""
    return "{}".format(self)

  def id_tensor(self, input_tensor):
    """Returns the id tensor from the given transformed input_tensor."""
    return input_tensor

  # pylint: disable=unused-argument
  def weight_tensor(self, input_tensor):
    """Returns the weight tensor from the given transformed input_tensor."""
    return None

  # pylint: disable=unused-argument
  def _to_dnn_input_layer(self,
                          input_tensor,
                          weight_collections=None,
                          trainable=True,
                          output_rank=2):
    raise ValueError(
        "SparseColumn is not supported in DNN. "
        "Please use embedding_column or one_hot_column. column: {}".format(
            self))

  def _wide_embedding_lookup_arguments(self, input_tensor):
    return _LinearEmbeddingLookupArguments(
        input_tensor=self.id_tensor(input_tensor),
        weight_tensor=self.weight_tensor(input_tensor),
        vocab_size=self.length,
        initializer=init_ops.zeros_initializer(),
        combiner=self.combiner)

  def _get_input_sparse_tensor(self, input_tensor):
    """Sparsifies input_tensor if it is dense."""
    if not isinstance(input_tensor, sparse_tensor_py.SparseTensor):
      # To avoid making any assumptions about which values are to be ignored,
      # we set ignore_value to -1 for numeric tensors to avoid excluding valid
      # indices.
      if input_tensor.dtype == dtypes.string:
        ignore_value = ""
      else:
        ignore_value = -1
      input_tensor = _reshape_real_valued_tensor(input_tensor, 2, self.name)
      input_tensor = contrib_sparse_ops.dense_to_sparse_tensor(
          input_tensor, ignore_value=ignore_value)

    return input_tensor

  def is_compatible(self, other_column):
    """Checks compatibility of two sparse columns."""
    if self.lookup_config and other_column.lookup_config:
      return self.lookup_config == other_column.lookup_config
    compatible = (self.length == other_column.length and
                  (self.dtype == other_column.dtype or
                   (self.dtype.is_integer and other_column.dtype.is_integer)))
    if compatible:
      logging.warn("Column {} and {} may not have the same vocabulary.".
                   format(self.name, other_column.name))
    return compatible

  @abc.abstractmethod
  def _do_transform(self, input_tensor):
    pass

  def insert_transformed_feature(self, columns_to_tensors):
    """Handles sparse column to id conversion."""
    input_tensor = self._get_input_sparse_tensor(columns_to_tensors[self.name])
    columns_to_tensors[self] = self._do_transform(input_tensor)

  def _transform_feature(self, inputs):
    input_tensor = self._get_input_sparse_tensor(inputs.get(self.name))
    return self._do_transform(input_tensor)

  @property
  def _parse_example_spec(self):
    return self.config

  @property
  def _num_buckets(self):
    return self.length

  def _get_sparse_tensors(self, inputs, weight_collections=None,
                          trainable=None):
    del weight_collections
    del trainable
    input_tensor = inputs.get(self)
    return fc_core._CategoricalColumn.IdWeightPair(  # pylint: disable=protected-access
        self.id_tensor(input_tensor), self.weight_tensor(input_tensor))


class _SparseColumnIntegerized(_SparseColumn):
  """See `sparse_column_with_integerized_feature`."""

  def _do_transform(self, input_tensor):
    sparse_id_values = math_ops.mod(input_tensor.values, self.bucket_size,
                                    name="mod")
    return sparse_tensor_py.SparseTensor(input_tensor.indices, sparse_id_values,
                                         input_tensor.dense_shape)


def sparse_column_with_integerized_feature(column_name,
                                           bucket_size,
                                           combiner="sum",
                                           dtype=dtypes.int64):
  """Creates an integerized _SparseColumn.

  Use this when your features are already pre-integerized into int64 IDs, that
  is, when the feature values themselves are the desired output IDs.
  Integerized means we can use the feature value itself as the id.

  Typically this is used for reading contiguous ranges of integer indices, but
  it doesn't have to be. The output value is simply copied from the input
  feature, whatever it is. Just be aware that if you have large gaps of unused
  integers, those unused ids still take up space (for instance, if you build a
  one-hot tensor from these, the unused integers will appear as entries that
  are always zero).
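
  Example (an illustrative sketch; "video_id" is a hypothetical feature name):

    ```python
    watched_video_id = sparse_column_with_integerized_feature(
        column_name="video_id", bucket_size=1000000)
    ```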

  Args:
    column_name: A string defining the sparse column name.
    bucket_size: An int that is >= 1. The number of buckets. It should be
      bigger than the maximum feature value. In other words, features in this
      column should be int64 values in the range [0, bucket_size).
    combiner: A string specifying how to reduce if the sparse column is
      multivalent. Currently "mean", "sqrtn" and "sum" are supported, with
      "sum" the default. "sqrtn" often achieves good accuracy, in particular
      with bag-of-words columns.
        * "sum": do not normalize features in the column
        * "mean": do l1 normalization on features in the column
        * "sqrtn": do l2 normalization on features in the column
      For more information: `tf.embedding_lookup_sparse`.
    dtype: Type of features. It should be an integer type. Default value is
      dtypes.int64.

  Returns:
    An integerized _SparseColumn definition.

  Raises:
    ValueError: bucket_size is less than 1.
    ValueError: dtype is not integer.
  """
  return _SparseColumnIntegerized(
      column_name, is_integerized=True, bucket_size=bucket_size,
      combiner=combiner, dtype=dtype)


class _SparseColumnHashed(_SparseColumn):
  """See `sparse_column_with_hash_bucket`."""

  def __new__(cls,
              column_name,
              is_integerized=False,
              bucket_size=None,
              lookup_config=None,
              combiner="sum",
              dtype=dtypes.string,
              hash_keys=None):
    if hash_keys is not None:
      if not isinstance(hash_keys, list) or not hash_keys:
        raise ValueError("hash_keys must be a non-empty list.")
      if (any([not isinstance(key_pair, list) for key_pair in hash_keys]) or
          any([len(key_pair) != 2 for key_pair in hash_keys]) or
          any([not isinstance(key, int) for key in nest.flatten(hash_keys)])):
        raise ValueError(
            "Each element of hash_keys must be a pair of integers.")
    obj = super(_SparseColumnHashed, cls).__new__(
        cls,
        column_name,
        is_integerized=is_integerized,
        bucket_size=bucket_size,
        lookup_config=lookup_config,
        combiner=combiner,
        dtype=dtype)
    obj.hash_keys = hash_keys
    return obj

  def _do_transform(self, input_tensor):
    if self.dtype.is_integer:
      sparse_values = string_ops.as_string(input_tensor.values)
    else:
      sparse_values = input_tensor.values

    if self.hash_keys:
      result = []
      for key in self.hash_keys:
        sparse_id_values = string_ops.string_to_hash_bucket_strong(
            sparse_values, self.bucket_size, key)
        result.append(
            sparse_tensor_py.SparseTensor(input_tensor.indices,
                                          sparse_id_values,
                                          input_tensor.dense_shape))
      return sparse_ops.sparse_concat(axis=1, sp_inputs=result, name="lookup")
    else:
      sparse_id_values = string_ops.string_to_hash_bucket_fast(
          sparse_values, self.bucket_size, name="lookup")
      return sparse_tensor_py.SparseTensor(
          input_tensor.indices, sparse_id_values, input_tensor.dense_shape)


def sparse_column_with_hash_bucket(column_name,
                                   hash_bucket_size,
                                   combiner="sum",
                                   dtype=dtypes.string,
                                   hash_keys=None):
  """Creates a _SparseColumn with hashed bucket configuration.

  Use this when your sparse features are in string or integer format, but you
  don't have a vocab file that maps each value to an integer ID:
  output_id = Hash(input_feature_string) % bucket_size

  When hash_keys is set, multiple integer IDs are created, one for each key
  pair in `hash_keys`. This is useful for reducing collisions of the hashed
  ids.
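
  Example (an illustrative sketch; "country" is a hypothetical feature name,
  and the hash_keys values are arbitrary):

    ```python
    country = sparse_column_with_hash_bucket(
        column_name="country", hash_bucket_size=1000)
    # Multiple strong hash ids, one per pair of uint64 keys:
    country_multi = sparse_column_with_hash_bucket(
        column_name="country", hash_bucket_size=1000,
        hash_keys=[[12345, 67890], [11111, 22222]])
    ```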

  Args:
    column_name: A string defining the sparse column name.
    hash_bucket_size: An int that is >= 1. The number of buckets.
    combiner: A string specifying how to reduce if the sparse column is
      multivalent. Currently "mean", "sqrtn" and "sum" are supported, with
      "sum" the default. "sqrtn" often achieves good accuracy, in particular
      with bag-of-words columns.
        * "sum": do not normalize features in the column
        * "mean": do l1 normalization on features in the column
        * "sqrtn": do l2 normalization on features in the column
      For more information: `tf.embedding_lookup_sparse`.
    dtype: The type of features. Only string and integer types are supported.
    hash_keys: The hash keys to use. It is a list of lists of two uint64s. If
      None, a simple and fast hashing algorithm is used. Otherwise, multiple
      strong hash ids are produced, one for each pair of uint64s in this
      argument.

  Returns:
    A _SparseColumn with hashed bucket configuration.

  Raises:
    ValueError: hash_bucket_size is less than 1.
    ValueError: dtype is neither string nor integer.
  """
  return _SparseColumnHashed(
      column_name,
      bucket_size=hash_bucket_size,
      combiner=combiner,
      dtype=dtype,
      hash_keys=hash_keys)


class _SparseColumnKeys(_SparseColumn):
  """See `sparse_column_with_keys`."""

  def _do_transform(self, input_tensor):
    table = lookup.index_table_from_tensor(
        mapping=tuple(self.lookup_config.keys),
        default_value=self.lookup_config.default_value,
        dtype=self.dtype,
        name="lookup")
    return table.lookup(input_tensor)


def sparse_column_with_keys(
    column_name, keys, default_value=-1, combiner="sum", dtype=dtypes.string):
  """Creates a _SparseColumn with keys.

  Lookup logic is as follows:
  lookup_id = index_of_feature_in_keys if feature in keys else default_value
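
  Example (an illustrative sketch; the feature name and keys are hypothetical):

    ```python
    eye_color = sparse_column_with_keys(
        column_name="eye_color", keys=["blue", "brown", "green"])
    ```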

  Args:
    column_name: A string defining the sparse column name.
    keys: A list or tuple defining the vocabulary. Must be castable to `dtype`.
    default_value: The value to use for out-of-vocabulary feature values.
      Default is -1.
    combiner: A string specifying how to reduce if the sparse column is
      multivalent. Currently "mean", "sqrtn" and "sum" are supported, with
      "sum" the default. "sqrtn" often achieves good accuracy, in particular
      with bag-of-words columns.
        * "sum": do not normalize features in the column
        * "mean": do l1 normalization on features in the column
        * "sqrtn": do l2 normalization on features in the column
      For more information: `tf.embedding_lookup_sparse`.
    dtype: Type of features. Only integer and string are supported.

  Returns:
    A _SparseColumnKeys with keys configuration.
  """
  keys = tuple(keys)
  return _SparseColumnKeys(
      column_name,
      lookup_config=_SparseIdLookupConfig(
          keys=keys, vocab_size=len(keys), default_value=default_value),
      combiner=combiner,
      dtype=dtype)


class _SparseColumnVocabulary(_SparseColumn):
  """See `sparse_column_with_vocabulary_file`."""

  def _do_transform(self, st):
    if self.dtype.is_integer:
      sparse_string_values = string_ops.as_string(st.values)
      sparse_string_tensor = sparse_tensor_py.SparseTensor(st.indices,
                                                           sparse_string_values,
                                                           st.dense_shape)
    else:
      sparse_string_tensor = st

    table = lookup.index_table_from_file(
        vocabulary_file=self.lookup_config.vocabulary_file,
        num_oov_buckets=self.lookup_config.num_oov_buckets,
        vocab_size=self.lookup_config.vocab_size,
        default_value=self.lookup_config.default_value,
        name=self.name + "_lookup")
    return table.lookup(sparse_string_tensor)


def sparse_column_with_vocabulary_file(column_name,
                                       vocabulary_file,
                                       num_oov_buckets=0,
                                       vocab_size=None,
                                       default_value=-1,
                                       combiner="sum",
                                       dtype=dtypes.string):
  """Creates a _SparseColumn with vocabulary file configuration.

  Use this when your sparse features are in string or integer format, and you
  have a vocab file that maps each value to an integer ID:
  output_id = LookupIdFromVocab(input_feature_string)
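
  Example (an illustrative sketch; the feature name and file path are
  hypothetical):

    ```python
    state = sparse_column_with_vocabulary_file(
        column_name="state", vocabulary_file="/path/to/states.txt",
        vocab_size=50, num_oov_buckets=5)
    ```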

  Args:
    column_name: A string defining the sparse column name.
    vocabulary_file: The vocabulary filename.
    num_oov_buckets: The number of out-of-vocabulary buckets. If zero, all
      out-of-vocabulary features will be ignored.
    vocab_size: Number of elements in the vocabulary.
    default_value: The value to use for out-of-vocabulary feature values.
      Defaults to -1.
    combiner: A string specifying how to reduce if the sparse column is
      multivalent. Currently "mean", "sqrtn" and "sum" are supported, with
      "sum" the default. "sqrtn" often achieves good accuracy, in particular
      with bag-of-words columns.
        * "sum": do not normalize features in the column
        * "mean": do l1 normalization on features in the column
        * "sqrtn": do l2 normalization on features in the column
      For more information: `tf.embedding_lookup_sparse`.
    dtype: The type of features. Only string and integer types are supported.

  Returns:
    A _SparseColumn with vocabulary file configuration.

  Raises:
    ValueError: vocab_size is not defined.
    ValueError: dtype is neither string nor integer.
  """
  if vocab_size is None:
    raise ValueError("vocab_size should be defined. "
                     "column_name: {}".format(column_name))

  return _SparseColumnVocabulary(
      column_name,
      lookup_config=_SparseIdLookupConfig(
          vocabulary_file=vocabulary_file,
          num_oov_buckets=num_oov_buckets,
          vocab_size=vocab_size,
          default_value=default_value),
      combiner=combiner,
      dtype=dtype)


class _WeightedSparseColumn(
    _FeatureColumn,
    fc_core._CategoricalColumn,  # pylint: disable=protected-access
    collections.namedtuple("_WeightedSparseColumn",
                           ["sparse_id_column", "weight_column_name",
                            "dtype"])):
  """See `weighted_sparse_column`."""

  def __new__(cls, sparse_id_column, weight_column_name, dtype):
    return super(_WeightedSparseColumn, cls).__new__(cls, sparse_id_column,
                                                     weight_column_name, dtype)

  @property
  def name(self):
    return "{}_weighted_by_{}".format(self.sparse_id_column.name,
                                      self.weight_column_name)

  @property
  def length(self):
    """Returns id size."""
    return self.sparse_id_column.length

  @property
  def config(self):
    config = _get_feature_config(self.sparse_id_column)
    config.update(
        {self.weight_column_name: parsing_ops.VarLenFeature(self.dtype)})
    return config

  @property
  def lookup_config(self):
    return self.sparse_id_column.lookup_config

  @property
  def key(self):
    """Returns a string which will be used as a key when we do sorting."""
    return "{}".format(self)

  def id_tensor(self, input_tensor):
    """Returns the id tensor from the given transformed input_tensor."""
    return input_tensor[0]

  def weight_tensor(self, input_tensor):
    """Returns the weight tensor from the given transformed input_tensor."""
    return input_tensor[1]

  # pylint: disable=unused-argument
  def _to_dnn_input_layer(self,
                          input_tensor,
                          weight_collections=None,
                          trainable=True,
                          output_rank=2):
    raise ValueError(
        "WeightedSparseColumn is not supported in DNN. "
        "Please use embedding_column or one_hot_column. column: {}".format(
            self))

  def _wide_embedding_lookup_arguments(self, input_tensor):
    return _LinearEmbeddingLookupArguments(
        input_tensor=self.id_tensor(input_tensor),
        weight_tensor=self.weight_tensor(input_tensor),
        vocab_size=self.length,
        initializer=init_ops.zeros_initializer(),
        combiner=self.sparse_id_column.combiner)

  def _do_transform(self, id_tensor, weight_tensor):
    if not isinstance(weight_tensor, sparse_tensor_py.SparseTensor):
      # The weight tensor can be a regular Tensor. In such a case, sparsify it.
      weight_tensor = contrib_sparse_ops.dense_to_sparse_tensor(weight_tensor)
    if not self.dtype.is_floating:
      weight_tensor = math_ops.to_float(weight_tensor)
    return tuple([id_tensor, weight_tensor])

  def insert_transformed_feature(self, columns_to_tensors):
    """Inserts a tuple with the id and weight tensors."""
    if self.sparse_id_column not in columns_to_tensors:
      self.sparse_id_column.insert_transformed_feature(columns_to_tensors)

    weight_tensor = columns_to_tensors[self.weight_column_name]
    columns_to_tensors[self] = self._do_transform(
        columns_to_tensors[self.sparse_id_column], weight_tensor)

  def _transform_feature(self, inputs):
    return self._do_transform(
        inputs.get(self.sparse_id_column), inputs.get(self.weight_column_name))

  @property
  def _parse_example_spec(self):
    return self.config

  @property
  def _num_buckets(self):
    return self.length

  def _get_sparse_tensors(self, inputs, weight_collections=None,
                          trainable=None):
    del weight_collections
    del trainable
    input_tensor = inputs.get(self)
    return fc_core._CategoricalColumn.IdWeightPair(  # pylint: disable=protected-access
        self.id_tensor(input_tensor), self.weight_tensor(input_tensor))

  def is_compatible(self, other_column):
    """Checks compatibility with another sparse column."""
    if isinstance(other_column, _WeightedSparseColumn):
      return self.sparse_id_column.is_compatible(other_column.sparse_id_column)
    return self.sparse_id_column.is_compatible(other_column)


def weighted_sparse_column(sparse_id_column,
                           weight_column_name,
                           dtype=dtypes.float32):
  """Creates a _WeightedSparseColumn by pairing sparse_id_column with weights.

  Example:

    ```python
    sparse_feature = sparse_column_with_hash_bucket(column_name="sparse_col",
                                                    hash_bucket_size=1000)
    weighted_feature = weighted_sparse_column(sparse_id_column=sparse_feature,
                                              weight_column_name="weights_col")
    ```

    This configuration assumes that the input dictionary of the model contains
    the following two items:
      * (key="sparse_col", value=sparse_tensor) where sparse_tensor is
        a SparseTensor.
      * (key="weights_col", value=weights_tensor) where weights_tensor
        is a SparseTensor.
    The following are assumed to be true:
      * sparse_tensor.indices = weights_tensor.indices
      * sparse_tensor.dense_shape = weights_tensor.dense_shape

  Args:
    sparse_id_column: A `_SparseColumn` which is created by
      `sparse_column_with_*` functions.
    weight_column_name: A string defining a sparse column name which represents
      the weight or value of the corresponding sparse id feature.
    dtype: Type of weights, such as `tf.float32`. Only floating and integer
      weights are supported.

  Returns:
    A _WeightedSparseColumn composed of two sparse features: one represents the
    id, the other represents the weight (value) of the id feature in that
    example.

  Raises:
    ValueError: if dtype is not convertible to float.
  """
  if not (dtype.is_integer or dtype.is_floating):
    raise ValueError("dtype is not convertible to float. Given {}".format(
        dtype))

  return _WeightedSparseColumn(sparse_id_column, weight_column_name, dtype)


class _OneHotColumn(
    _FeatureColumn,
    fc_core._DenseColumn,  # pylint: disable=protected-access
    collections.namedtuple("_OneHotColumn", ["sparse_id_column"])):
  """Represents a one-hot column for use in deep networks.

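  Example (an illustrative sketch; `one_hot_column` is defined elsewhere in
  this module, and the feature name is hypothetical):

    ```python
    country = sparse_column_with_hash_bucket("country", hash_bucket_size=100)
    country_one_hot = one_hot_column(country)
    ```
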
  Args:
    sparse_id_column: A _SparseColumn which is created by a
      `sparse_column_with_*` function.
  """

  @property
  def name(self):
    return "{}_one_hot".format(self.sparse_id_column.name)

  @property
  def length(self):
    """Returns vocabulary or hash_bucket size."""
    return self.sparse_id_column.length

  @property
  def config(self):
    """Returns the parsing config of the origin column."""
    return _get_feature_config(self.sparse_id_column)

  @property
  def key(self):
    """Returns a string which will be used as a key when we do sorting."""
    return "{}".format(self)

  def insert_transformed_feature(self, columns_to_tensors):
    """Used by the Transformer to prevent double transformations."""
    if self.sparse_id_column not in columns_to_tensors:
      self.sparse_id_column.insert_transformed_feature(columns_to_tensors)
    columns_to_tensors[self] = columns_to_tensors[self.sparse_id_column]

  def _to_dnn_input_layer(self,
                          transformed_input_tensor,
                          unused_weight_collections=None,
                          unused_trainable=False,
                          output_rank=2):
    """Returns a Tensor as an input to the first layer of the neural network.

    Args:
      transformed_input_tensor: A tensor that has undergone the transformations
        in `insert_transformed_feature`. Rank should be >= `output_rank`.
      unused_weight_collections: Unused. One-hot encodings are not variables.
      unused_trainable: Unused. One-hot encodings are not trainable.
      output_rank: the desired rank of the output `Tensor`.

    Returns:
      A multi-hot Tensor to be fed into the first layer of the neural network.

    Raises:
      ValueError: When using one_hot_column with weighted_sparse_column.
        This is not yet supported.
    """

    # Reshape ID column to `output_rank`.
    sparse_id_column = self.sparse_id_column.id_tensor(transformed_input_tensor)
    # pylint: disable=protected-access
    sparse_id_column = layers._inner_flatten(sparse_id_column, output_rank)

    weight_tensor = self.sparse_id_column.weight_tensor(
        transformed_input_tensor)
    if weight_tensor is not None:
      weighted_column = sparse_ops.sparse_merge(sp_ids=sparse_id_column,
                                                sp_values=weight_tensor,
                                                vocab_size=self.length)
      # Remove the (?, -1) index.
      weighted_column = sparse_ops.sparse_slice(
          weighted_column,
          [0, 0],
          weighted_column.dense_shape)
      return sparse_ops.sparse_tensor_to_dense(weighted_column)

    dense_id_tensor = sparse_ops.sparse_tensor_to_dense(sparse_id_column,
                                                        default_value=-1)

    # The one-hot encoding must be float for tf.concat reasons, since all
    # other inputs to input_layer are float32.
    one_hot_id_tensor = array_ops.one_hot(
        dense_id_tensor, depth=self.length, on_value=1.0, off_value=0.0)

    # Reduce to get a multi-hot per example.
    return math_ops.reduce_sum(
        one_hot_id_tensor, reduction_indices=[output_rank - 1])

  @property
  def _variable_shape(self):
    return tensor_shape.TensorShape([self.length])

  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
    del weight_collections
    del trainable
    return inputs.get(self)

  def _transform_feature(self, inputs):
    return self._to_dnn_input_layer(inputs.get(self.sparse_id_column))

  @property
  def _parse_example_spec(self):
    return self.config


class _EmbeddingColumn(
    _FeatureColumn,
    fc_core._DenseColumn,  # pylint: disable=protected-access
    collections.namedtuple("_EmbeddingColumn", [
        "sparse_id_column", "dimension", "combiner", "initializer",
        "ckpt_to_load_from", "tensor_name_in_ckpt", "shared_embedding_name",
        "shared_vocab_size", "max_norm", "trainable"
    ])):
  """Represents an embedding column.

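  Example (an illustrative sketch; `embedding_column` is defined elsewhere in
  this module, and the feature name is hypothetical):

    ```python
    dept = sparse_column_with_keys("department", ["math", "philosophy"])
    dept_embedding = embedding_column(dept, dimension=8, combiner="mean")
    ```
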
  Args:
    sparse_id_column: A `_SparseColumn` which is created by
      `sparse_column_with_*` or `weighted_sparse_column` functions.
    dimension: An integer specifying the dimension of the embedding.
    combiner: A string specifying how to reduce if there are multiple entries
      in a single row. Currently "mean", "sqrtn" and "sum" are supported, with
      "mean" the default. "sqrtn" often achieves good accuracy, in particular
      with bag-of-words columns. Each of these can be thought of as an
      example-level normalization on the column:
        * "sum": do not normalize features in the column
        * "mean": do l1 normalization on features in the column
        * "sqrtn": do l2 normalization on features in the column
      For more information: `tf.embedding_lookup_sparse`.
    initializer: A variable initializer function to be used in embedding
      variable initialization. If not specified, defaults to
      `tf.truncated_normal_initializer` with mean 0.0 and standard deviation
      1/sqrt(sparse_id_column.length).
    ckpt_to_load_from: (Optional). String representing the checkpoint
      name/pattern to restore the column weights from. Required if
      `tensor_name_in_ckpt` is not None.
    tensor_name_in_ckpt: (Optional). Name of the `Tensor` in the provided
      checkpoint from which to restore the column weights. Required if
      `ckpt_to_load_from` is not None.
    shared_embedding_name: (Optional). The common name for a shared embedding.
    shared_vocab_size: (Optional). The common vocab_size used for a shared
      embedding space.
    max_norm: (Optional). If not None, embedding values are l2-normalized to
      the value of max_norm.
    trainable: (Optional). Whether the embedding is trainable. Default is True.

  Raises:
    ValueError: if `initializer` is specified and is not callable. Also,
      if only one of `ckpt_to_load_from` and `tensor_name_in_ckpt` is
      specified.
  """

  def __new__(cls,
              sparse_id_column,
              dimension,
              combiner="mean",
              initializer=None,
              ckpt_to_load_from=None,
              tensor_name_in_ckpt=None,
              shared_embedding_name=None,
              shared_vocab_size=None,
              max_norm=None,
              trainable=True):
    if initializer is not None and not callable(initializer):
      raise ValueError("initializer must be callable if specified. "
                       "Embedding of column_name: {}".format(
                           sparse_id_column.name))

    if (ckpt_to_load_from is None) != (tensor_name_in_ckpt is None):
      raise ValueError("Must specify both `ckpt_to_load_from` and "
                       "`tensor_name_in_ckpt` or none of them.")
    if initializer is None:
      logging.warn("The default stddev value of initializer will change from "
                   "\"1/sqrt(vocab_size)\" to \"1/sqrt(dimension)\" after "
                   "2017/02/25.")
      stddev = 1 / math.sqrt(sparse_id_column.length)
      initializer = init_ops.truncated_normal_initializer(
          mean=0.0, stddev=stddev)
    return super(_EmbeddingColumn, cls).__new__(cls, sparse_id_column,
                                                dimension, combiner,
                                                initializer, ckpt_to_load_from,
                                                tensor_name_in_ckpt,
                                                shared_embedding_name,
                                                shared_vocab_size,
                                                max_norm,
                                                trainable)

  @property
  def name(self):
    if self.shared_embedding_name is None:
      return "{}_embedding".format(self.sparse_id_column.name)
    else:
      return "{}_shared_embedding".format(self.sparse_id_column.name)

  @property
  def length(self):
    """Returns id size."""
    if self.shared_vocab_size is None:
      return self.sparse_id_column.length
    else:
      return self.shared_vocab_size

  @property
  def config(self):
    return _get_feature_config(self.sparse_id_column)

  @property
  def key(self):
    """Returns a string which will be used as a key when we do sorting."""
    return self._key_without_properties(["initializer"])

  def insert_transformed_feature(self, columns_to_tensors):
    if self.sparse_id_column not in columns_to_tensors:
      self.sparse_id_column.insert_transformed_feature(columns_to_tensors)
    columns_to_tensors[self] = columns_to_tensors[self.sparse_id_column]

  def _deep_embedding_lookup_arguments(self, input_tensor):
    return _DeepEmbeddingLookupArguments(
        input_tensor=self.sparse_id_column.id_tensor(input_tensor),
        weight_tensor=self.sparse_id_column.weight_tensor(input_tensor),
        vocab_size=self.length,
        dimension=self.dimension,
        initializer=self.initializer,
        combiner=self.combiner,
        shared_embedding_name=self.shared_embedding_name,
        hash_key=None,
        max_norm=self.max_norm,
        trainable=self.trainable)

  def _checkpoint_path(self):
    if self.ckpt_to_load_from is not None:
      return self.ckpt_to_load_from, self.tensor_name_in_ckpt
    return None

  # pylint: disable=unused-argument
  def _wide_embedding_lookup_arguments(self, input_tensor):
    raise ValueError("Column {} is not supported in linear models. "
                     "Please use sparse_column.".format(self))

  @property
  def _variable_shape(self):
    return tensor_shape.TensorShape([self.dimension])

  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
    return _embeddings_from_arguments(
        self,
        self._deep_embedding_lookup_arguments(inputs.get(self)),
        weight_collections, trainable)

  def _transform_feature(self, inputs):
    return inputs.get(self.sparse_id_column)

  @property
  def _parse_example_spec(self):
    return self.config


def _is_variable(v):
  """Returns true if `v` is a variable."""
  return isinstance(v, (variables.Variable,
                        resource_variable_ops.ResourceVariable))


def _embeddings_from_arguments(column,
                               args,
                               weight_collections,
                               trainable,
                               output_rank=2):
  """Returns embeddings for a column based on the computed arguments.

  Args:
   column: the feature column for which embeddings are computed.
   args: the _DeepEmbeddingLookupArguments for this column.
   weight_collections: collections to store weights in.
   trainable: whether these embeddings should be trainable.
   output_rank: the desired rank of the returned `Tensor`. Inner dimensions
     will be combined to produce the desired rank.

  Returns:
   the embeddings.

  Raises:
   ValueError: if not possible to create.
  """
  # pylint: disable=protected-access
  input_tensor = layers._inner_flatten(args.input_tensor, output_rank)
  weight_tensor = None
  if args.weight_tensor is not None:
    weight_tensor = layers._inner_flatten(args.weight_tensor, output_rank)
  # pylint: enable=protected-access

  # This option is only enabled for scattered_embedding_column.
  if args.hash_key:
    embeddings = contrib_variables.model_variable(
        name="weights",
        shape=[args.vocab_size],
        dtype=dtypes.float32,
        initializer=args.initializer,
        trainable=(trainable and args.trainable),
        collections=weight_collections)

    return embedding_ops.scattered_embedding_lookup_sparse(
        embeddings,
        input_tensor,
        args.dimension,
        hash_key=args.hash_key,
        combiner=args.combiner,
        name="lookup")

  if args.shared_embedding_name is not None:
    shared_embedding_collection_name = (
        "SHARED_EMBEDDING_COLLECTION_" + args.shared_embedding_name.upper())
    graph = ops.get_default_graph()
    shared_embedding_collection = (
        graph.get_collection_ref(shared_embedding_collection_name))
    shape = [args.vocab_size, args.dimension]
    if shared_embedding_collection:
      if len(shared_embedding_collection) > 1:
        raise ValueError(
            "Collection %s can only contain one "
            "(partitioned) variable." % shared_embedding_collection_name)
      else:
        embeddings = shared_embedding_collection[0]
        if embeddings.get_shape() != shape:
          raise ValueError(
              "The embedding variable with name {} already "
              "exists, but its shape does not match required "
              "embedding shape here. Please make sure to use "
              "different shared_embedding_name for different "
              "shared embeddings.".format(args.shared_embedding_name))
    else:
      embeddings = contrib_variables.model_variable(
          name=args.shared_embedding_name,
          shape=shape,
          dtype=dtypes.float32,
          initializer=args.initializer,
          trainable=(trainable and args.trainable),
          collections=weight_collections)
      graph.add_to_collection(shared_embedding_collection_name, embeddings)
  else:
    embeddings = contrib_variables.model_variable(
        name="weights",
        shape=[args.vocab_size, args.dimension],
        dtype=dtypes.float32,
        initializer=args.initializer,
        trainable=(trainable and args.trainable),
        collections=weight_collections)

  if _is_variable(embeddings):
    embeddings = [embeddings]
  else:
    embeddings = embeddings._get_variable_list()  # pylint: disable=protected-access
   1278   # pylint: disable=protected-access
   1279   _maybe_restore_from_checkpoint(column._checkpoint_path(), embeddings)
   1280   return embedding_ops.safe_embedding_lookup_sparse(
   1281       embeddings,
   1282       input_tensor,
   1283       sparse_weights=weight_tensor,
   1284       combiner=args.combiner,
    1285       name=column.name + "_weights",
   1286       max_norm=args.max_norm)
   1287 
   1288 
   1289 def _maybe_restore_from_checkpoint(checkpoint_path, variable):
   1290   if checkpoint_path is not None:
   1291     path, tensor_name = checkpoint_path
   1292     weights_to_restore = variable
   1293     if len(variable) == 1:
   1294       weights_to_restore = variable[0]
   1295     checkpoint_utils.init_from_checkpoint(path,
   1296                                           {tensor_name: weights_to_restore})
   1297 
   1298 
   1299 def one_hot_column(sparse_id_column):
    1300   """Creates an `_OneHotColumn` for a one-hot or multi-hot DNN representation.
   1301 
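           For example, a minimal sketch (the "eye_color" feature name and its
           keys are illustrative, not part of the API):

               eye_color = sparse_column_with_keys(
                   "eye_color", keys=["blue", "brown", "green"])
               eye_color_one_hot = one_hot_column(eye_color)
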
   1302   Args:
    1303     sparse_id_column: A `_SparseColumn` which is created by
    1304       `sparse_column_with_*` or `crossed_column` functions. Note that
    1305       `combiner` defined in `sparse_id_column` is ignored.
   1307 
   1308   Returns:
   1309     An _OneHotColumn.
   1310   """
   1311   return _OneHotColumn(sparse_id_column)
   1312 
   1313 
   1314 def embedding_column(sparse_id_column,
   1315                      dimension,
   1316                      combiner="mean",
   1317                      initializer=None,
   1318                      ckpt_to_load_from=None,
   1319                      tensor_name_in_ckpt=None,
   1320                      max_norm=None,
   1321                      trainable=True):
   1322   """Creates an `_EmbeddingColumn` for feeding sparse data into a DNN.
   1323 
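           For example, a minimal sketch (the "occupation" feature name and
           bucket size are illustrative):

               occupation = sparse_column_with_hash_bucket(
                   "occupation", hash_bucket_size=1000)
               occupation_emb = embedding_column(occupation, dimension=16)
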
   1324   Args:
   1325     sparse_id_column: A `_SparseColumn` which is created by for example
   1326       `sparse_column_with_*` or crossed_column functions. Note that `combiner`
   1327       defined in `sparse_id_column` is ignored.
   1328     dimension: An integer specifying dimension of the embedding.
   1329     combiner: A string specifying how to reduce if there are multiple entries
   1330       in a single row. Currently "mean", "sqrtn" and "sum" are supported, with
   1331       "mean" the default. "sqrtn" often achieves good accuracy, in particular
    1332       with bag-of-words columns. Each of these can be thought of as an
    1333       example-level normalization on the column:
    1334         * "sum": do not normalize
    1335         * "mean": do l1 normalization
    1336         * "sqrtn": do l2 normalization
    1337       For more information, see `tf.embedding_lookup_sparse`.
   1338     initializer: A variable initializer function to be used in embedding
   1339       variable initialization. If not specified, defaults to
   1340       `tf.truncated_normal_initializer` with mean 0.0 and standard deviation
   1341       1/sqrt(sparse_id_column.length).
   1342     ckpt_to_load_from: (Optional). String representing checkpoint name/pattern
   1343       to restore the column weights. Required if `tensor_name_in_ckpt` is not
   1344       None.
   1345     tensor_name_in_ckpt: (Optional). Name of the `Tensor` in the provided
   1346       checkpoint from which to restore the column weights. Required if
   1347       `ckpt_to_load_from` is not None.
   1348     max_norm: (Optional). If not None, embedding values are l2-normalized to
   1349       the value of max_norm.
    1350     trainable: (Optional). Whether the embedding is trainable. Default is True.
   1351 
   1352   Returns:
   1353     An `_EmbeddingColumn`.
   1354   """
   1355   return _EmbeddingColumn(sparse_id_column, dimension, combiner, initializer,
   1356                           ckpt_to_load_from, tensor_name_in_ckpt,
   1357                           max_norm=max_norm, trainable=trainable)
   1358 
   1359 
   1360 def shared_embedding_columns(sparse_id_columns,
   1361                              dimension,
   1362                              combiner="mean",
   1363                              shared_embedding_name=None,
   1364                              initializer=None,
   1365                              ckpt_to_load_from=None,
   1366                              tensor_name_in_ckpt=None,
   1367                              max_norm=None,
   1368                              trainable=True):
   1369   """Creates a list of `_EmbeddingColumn` sharing the same embedding.
   1370 
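           For example, a minimal sketch (feature names and bucket sizes are
           illustrative):

               query_tokens = sparse_column_with_hash_bucket("query_tokens", 10000)
               title_tokens = sparse_column_with_hash_bucket("title_tokens", 10000)
               query_emb, title_emb = shared_embedding_columns(
                   [query_tokens, title_tokens], dimension=16)
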
   1371   Args:
   1372     sparse_id_columns: An iterable of `_SparseColumn`, such as those created by
   1373       `sparse_column_with_*` or crossed_column functions. Note that `combiner`
   1374       defined in each sparse_id_column is ignored.
   1375     dimension: An integer specifying dimension of the embedding.
   1376     combiner: A string specifying how to reduce if there are multiple entries
   1377       in a single row. Currently "mean", "sqrtn" and "sum" are supported, with
   1378       "mean" the default. "sqrtn" often achieves good accuracy, in particular
    1379       with bag-of-words columns. Each of these can be thought of as an
    1380       example-level normalization on the column:
    1381         * "sum": do not normalize
    1382         * "mean": do l1 normalization
    1383         * "sqrtn": do l2 normalization
    1384       For more information, see `tf.embedding_lookup_sparse`.
   1385     shared_embedding_name: (Optional). A string specifying the name of shared
   1386       embedding weights. This will be needed if you want to reference the shared
   1387       embedding separately from the generated `_EmbeddingColumn`.
   1388     initializer: A variable initializer function to be used in embedding
   1389       variable initialization. If not specified, defaults to
   1390       `tf.truncated_normal_initializer` with mean 0.0 and standard deviation
   1391       1/sqrt(sparse_id_columns[0].length).
   1392     ckpt_to_load_from: (Optional). String representing checkpoint name/pattern
   1393       to restore the column weights. Required if `tensor_name_in_ckpt` is not
   1394       None.
   1395     tensor_name_in_ckpt: (Optional). Name of the `Tensor` in the provided
   1396       checkpoint from which to restore the column weights. Required if
   1397       `ckpt_to_load_from` is not None.
   1398     max_norm: (Optional). If not None, embedding values are l2-normalized to
   1399       the value of max_norm.
    1400     trainable: (Optional). Whether the embedding is trainable. Default is True.
   1401 
   1402   Returns:
    1403     A tuple of `_EmbeddingColumn`s with a shared embedding space (a
    1404       single-element list when `sparse_id_columns` has only one element).
   1404 
   1405   Raises:
   1406     ValueError: if sparse_id_columns is empty, or its elements are not
   1407       compatible with each other.
    1408     TypeError: if `sparse_id_columns` is not a sequence or is a string, or if
    1409       at least one element of `sparse_id_columns` is not a `_SparseColumn` or
    1410       a `_WeightedSparseColumn`.
   1411   """
   1412   if (not isinstance(sparse_id_columns, collections.Sequence) or
   1413       isinstance(sparse_id_columns, six.string_types)):
   1414     raise TypeError(
    1415         "sparse_id_columns must be a non-string sequence (e.g. list or tuple) "
   1416         "instead of type {}.".format(type(sparse_id_columns)))
   1417   if len(sparse_id_columns) < 1:
   1418     raise ValueError("The input sparse_id_columns should have at least one "
   1419                      "element.")
   1420   for sparse_id_column in sparse_id_columns:
   1421     if not (isinstance(sparse_id_column, _SparseColumn) or
   1422             isinstance(sparse_id_column, _WeightedSparseColumn)):
   1423       raise TypeError("Elements of sparse_id_columns must be _SparseColumn or "
   1424                       "_WeightedSparseColumn, but {} is not."
   1425                       .format(sparse_id_column))
   1426 
   1427   if len(sparse_id_columns) == 1:
   1428     return [
   1429         _EmbeddingColumn(sparse_id_columns[0], dimension, combiner, initializer,
   1430                          ckpt_to_load_from, tensor_name_in_ckpt,
   1431                          shared_embedding_name, max_norm=max_norm,
   1432                          trainable=trainable)]
   1433   else:
   1434     # Check compatibility of sparse_id_columns
   1435     compatible = True
   1436     for column in sparse_id_columns[1:]:
   1437       if isinstance(sparse_id_columns[0], _WeightedSparseColumn):
   1438         compatible = compatible and sparse_id_columns[0].is_compatible(column)
   1439       else:
   1440         compatible = compatible and column.is_compatible(sparse_id_columns[0])
   1441     if not compatible:
   1442       raise ValueError("The input sparse id columns are not compatible.")
   1443     # Construct the shared name and size for shared embedding space.
   1444     if not shared_embedding_name:
   1445       # Sort the columns so that shared_embedding_name will be deterministic
   1446       # even if users pass in unsorted columns from a dict or something.
   1447       # Since they are different classes, ordering is SparseColumns first,
   1448       # then WeightedSparseColumns.
   1449       sparse_columns = []
   1450       weighted_sparse_columns = []
   1451       for column in sparse_id_columns:
   1452         if isinstance(column, _SparseColumn):
   1453           sparse_columns.append(column)
   1454         else:
   1455           weighted_sparse_columns.append(column)
   1456       sorted_columns = sorted(sparse_columns) + sorted(
   1457           weighted_sparse_columns, key=lambda x: x.name)
   1458       if len(sorted_columns) <= 3:
   1459         shared_embedding_name = "_".join([column.name
   1460                                           for column in sorted_columns])
   1461       else:
   1462         shared_embedding_name = "_".join([column.name
   1463                                           for column in sorted_columns[0:3]])
   1464         shared_embedding_name += (
   1465             "_plus_{}_others".format(len(sorted_columns) - 3))
   1466       shared_embedding_name += "_shared_embedding"
   1467     shared_vocab_size = sparse_id_columns[0].length
   1468 
   1469     embedded_columns = []
   1470     for column in sparse_id_columns:
   1471       embedded_columns.append(
   1472           _EmbeddingColumn(column, dimension, combiner, initializer,
   1473                            ckpt_to_load_from, tensor_name_in_ckpt,
   1474                            shared_embedding_name, shared_vocab_size,
   1475                            max_norm=max_norm, trainable=trainable))
   1476     return tuple(embedded_columns)
   1477 
   1478 
   1479 class _ScatteredEmbeddingColumn(
   1480     _FeatureColumn,
   1481     fc_core._DenseColumn,  # pylint: disable=protected-access
   1482     collections.namedtuple("_ScatteredEmbeddingColumn", [
   1483         "column_name", "size", "dimension", "hash_key", "combiner",
   1484         "initializer"
   1485     ])):
   1486   """See `scattered_embedding_column`."""
   1487 
   1488   def __new__(cls,
   1489               column_name,
   1490               size,
   1491               dimension,
   1492               hash_key,
   1493               combiner="sqrtn",
   1494               initializer=None):
   1495     if initializer is not None and not callable(initializer):
   1496       raise ValueError("initializer must be callable if specified. "
   1497                        "column_name: {}".format(column_name))
   1498     if initializer is None:
   1499       logging.warn("The default stddev value of initializer will change from "
   1500                    "\"0.1\" to \"1/sqrt(dimension)\" after 2017/02/25.")
   1501       stddev = 0.1
   1502       initializer = init_ops.truncated_normal_initializer(
   1503           mean=0.0, stddev=stddev)
   1504     return super(_ScatteredEmbeddingColumn, cls).__new__(cls, column_name, size,
   1505                                                          dimension, hash_key,
   1506                                                          combiner,
   1507                                                          initializer)
   1508 
   1509   @property
   1510   def name(self):
   1511     return "{}_scattered_embedding".format(self.column_name)
   1512 
   1513   @property
   1514   def config(self):
   1515     return {self.column_name: parsing_ops.VarLenFeature(dtypes.string)}
   1516 
   1517   @property
   1518   def key(self):
   1519     """Returns a string which will be used as a key when we do sorting."""
   1520     return self._key_without_properties(["initializer"])
   1521 
   1522   def insert_transformed_feature(self, columns_to_tensors):
   1523     columns_to_tensors[self] = columns_to_tensors[self.column_name]
   1524 
   1525   def _deep_embedding_lookup_arguments(self, input_tensor):
   1526     return _DeepEmbeddingLookupArguments(
   1527         input_tensor=input_tensor,
   1528         weight_tensor=None,
   1529         vocab_size=self.size,
   1530         initializer=self.initializer,
   1531         combiner=self.combiner,
   1532         dimension=self.dimension,
   1533         shared_embedding_name=None,
   1534         hash_key=self.hash_key,
   1535         max_norm=None,
   1536         trainable=True)
   1537 
   1538   @property
   1539   def _variable_shape(self):
   1540     return tensor_shape.TensorShape([self.dimension])
   1541 
   1542   def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
   1543     return _embeddings_from_arguments(
   1544         self,
   1545         self._deep_embedding_lookup_arguments(inputs.get(self)),
   1546         weight_collections, trainable)
   1547 
   1548   def _transform_feature(self, inputs):
   1549     return inputs.get(self.column_name)
   1550 
   1551   @property
   1552   def _parse_example_spec(self):
   1553     return self.config
   1554 
   1555 
   1556 def scattered_embedding_column(column_name,
   1557                                size,
   1558                                dimension,
   1559                                hash_key,
   1560                                combiner="mean",
   1561                                initializer=None):
   1562   """Creates an embedding column of a sparse feature using parameter hashing.
   1563 
    1564   This is a useful shorthand when you want an embedding for a sparse feature,
    1565   but, instead of allocating one embedding row per id, want each embedding
    1566   component to be looked up in a single shared parameter vector by hashing.
   1567 
   1568   Specifically, the i-th embedding component of a value v is found by retrieving
   1569   an embedding weight whose index is a fingerprint of the pair (v,i).
   1570 
   1571   An embedding column with sparse_column_with_hash_bucket such as
   1572 
   1573       embedding_column(
   1574         sparse_column_with_hash_bucket(column_name, bucket_size),
   1575         dimension)
   1576 
   1577   could be replaced by
   1578 
   1579       scattered_embedding_column(
   1580         column_name,
   1581         size=bucket_size * dimension,
   1582         dimension=dimension,
   1583         hash_key=tf.contrib.layers.SPARSE_FEATURE_CROSS_DEFAULT_HASH_KEY)
   1584 
    1585   for the same number of embedding parameters. This should reduce the impact
    1586   of collisions, at the cost of slower training.
   1587 
   1588   Args:
   1589     column_name: A string defining sparse column name.
   1590     size: An integer specifying the number of parameters in the embedding layer.
   1591     dimension: An integer specifying dimension of the embedding.
    1592     hash_key: Specify the hash_key that will be used by the `FingerprintCat64`
    1593       function to combine the crossed fingerprints in `SparseFeatureCrossOp`.
   1594     combiner: A string specifying how to reduce if there are multiple entries
   1595       in a single row. Currently "mean", "sqrtn" and "sum" are supported, with
   1596       "mean" the default. "sqrtn" often achieves good accuracy, in particular
    1597       with bag-of-words columns. Each of these can be thought of as an
    1598       example-level normalization on the column:
    1599         * "sum": do not normalize features in the column
    1600         * "mean": do l1 normalization on features in the column
    1601         * "sqrtn": do l2 normalization on features in the column
    1602       For more information, see `tf.embedding_lookup_sparse`.
   1603     initializer: A variable initializer function to be used in embedding
   1604       variable initialization. If not specified, defaults to
   1605       `tf.truncated_normal_initializer` with mean 0 and standard deviation 0.1.
   1606 
   1607   Returns:
   1608     A _ScatteredEmbeddingColumn.
   1609 
   1610   Raises:
   1611     ValueError: if dimension or size is not a positive integer; or if combiner
   1612       is not supported.
   1613 
   1614   """
   1615   if (dimension < 1) or (size < 1):
   1616     raise ValueError("Dimension and size must be greater than 0. "
   1617                      "dimension: {}, size: {}, column_name: {}".format(
   1618                          dimension, size, column_name))
   1619 
   1620   if combiner not in ("mean", "sqrtn", "sum"):
   1621     raise ValueError("Combiner must be one of 'mean', 'sqrtn' or 'sum'. "
   1622                      "combiner: {}, column_name: {}".format(combiner,
   1623                                                             column_name))
   1624 
   1625   return _ScatteredEmbeddingColumn(column_name, size, dimension, hash_key,
   1626                                    combiner, initializer)
   1627 
   1628 
   1629 def _reshape_real_valued_tensor(input_tensor, output_rank, column_name=None):
   1630   """Reshaping logic for dense, numeric `Tensors`.
   1631 
    1632   Applies the following rules:
    1633     1. If `output_rank > input_rank + 1`, raise a `ValueError`.
    1634     2. If `output_rank == input_rank + 1`, expand `input_tensor` by one
    1635        dimension and return it.
    1636     3. If `output_rank == input_rank`, return `input_tensor`.
    1637     4. If `output_rank < input_rank`, flatten the inner dimensions of
    1638        `input_tensor` and return a `Tensor` with rank `output_rank`.
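
           For example, with `output_rank=2` a `Tensor` of shape
           `[batch, d1, d2]` is flattened to `[batch, d1 * d2]` (rule 4), while
           a rank-1 `Tensor` of shape `[batch]` is expanded to `[batch, 1]`
           (rule 2).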
   1639 
   1640   Args:
   1641     input_tensor: a dense `Tensor` to be reshaped.
   1642     output_rank: the desired rank of the reshaped `Tensor`.
   1643     column_name: (optional) the name of the associated column. Used for error
   1644       messages.
   1645   Returns:
   1646     A `Tensor` with the same entries as `input_tensor` and rank `output_rank`.
   1647   Raises:
   1648     ValueError: if `output_rank > input_rank + 1`.
   1649   """
   1650   input_rank = input_tensor.get_shape().ndims
   1651   if input_rank is not None:
   1652     if output_rank > input_rank + 1:
   1653       error_string = ("Rank of input Tensor ({}) should be the same as "
   1654                       "output_rank ({}). For example, sequence data should "
   1655                       "typically be 3 dimensional (rank 3) while non-sequence "
   1656                       "data is typically 2 dimensional (rank 2).".format(
   1657                           input_rank, output_rank))
   1658       if column_name is not None:
    1659         error_string = ("Error while processing column {}. ".format(
    1660             column_name) + error_string)
   1661       raise ValueError(error_string)
   1662     if output_rank == input_rank + 1:
   1663       logging.warning(
   1664           "Rank of input Tensor ({}) should be the same as output_rank ({}) "
   1665           "for column. Will attempt to expand dims. It is highly recommended "
   1666           "that you resize your input, as this behavior may change.".format(
   1667               input_rank, output_rank))
   1668       return array_ops.expand_dims(input_tensor, -1, name="expand_dims")
   1669     if output_rank == input_rank:
   1670       return input_tensor
   1671   # Here, either `input_rank` is unknown or it is greater than `output_rank`.
   1672   return layers._inner_flatten(input_tensor, output_rank)  # pylint: disable=protected-access
   1673 
   1674 
   1675 class _RealValuedVarLenColumn(_FeatureColumn, collections.namedtuple(
   1676     "_RealValuedVarLenColumn",
   1677     ["column_name", "default_value", "dtype", "normalizer", "is_sparse"])):
   1678   """Represents a real valued feature column for variable length Features.
   1679 
   1680   Instances of this class are immutable.
   1681   If is_sparse=False, the dictionary returned by InputBuilder contains a
    1682   ("column_name", Tensor) pair with a Tensor of shape [batch_size, None].
   1683   If is_sparse=True, the dictionary contains a ("column_name", SparseTensor)
   1684   pair instead with shape inferred after parsing.
   1685   """
   1686 
   1687   @property
   1688   def name(self):
   1689     return self.column_name
   1690 
   1691   @property
   1692   def config(self):
   1693     if self.is_sparse:
   1694       return {self.column_name: parsing_ops.VarLenFeature(self.dtype)}
   1695     else:
   1696       return {self.column_name: parsing_ops.FixedLenSequenceFeature(
   1697           [], self.dtype, allow_missing=True,
   1698           default_value=self.default_value)}
   1699 
   1700   @property
   1701   def key(self):
   1702     """Returns a string which will be used as a key when we do sorting."""
   1703     return self._key_without_properties(["normalizer"])
   1704 
   1705   @property
   1706   def normalizer_fn(self):
   1707     """Returns the function used to normalize the column."""
   1708     return self.normalizer
   1709 
   1710   def _normalized_input_tensor(self, input_tensor):
   1711     """Returns the input tensor after custom normalization is applied."""
   1712     if self.normalizer is None:
   1713       return input_tensor
   1714     if self.is_sparse:
   1715       return sparse_tensor_py.SparseTensor(
   1716           input_tensor.indices,
   1717           self.normalizer(input_tensor.values),
   1718           input_tensor.dense_shape)
   1719     else:
   1720       return self.normalizer(input_tensor)
   1721 
   1722   def insert_transformed_feature(self, columns_to_tensors):
    1723     """Applies the transformation and inserts it into columns_to_tensors.
    1724 
    1725     Args:
    1726       columns_to_tensors: A mapping from feature columns to tensors. A string
    1727         key means a base (untransformed) feature. A `_FeatureColumn` key means
    1728         that the column has already been transformed.
   1729     """
   1730     # Transform the input tensor according to the normalizer function.
   1731     input_tensor = self._normalized_input_tensor(columns_to_tensors[self.name])
   1732     columns_to_tensors[self] = math_ops.to_float(input_tensor)
   1733 
   1734   # pylint: disable=unused-argument
   1735   def _to_dnn_input_layer(self,
   1736                           input_tensor,
   1737                           weight_collections=None,
   1738                           trainable=True,
   1739                           output_rank=2):
   1740     return _reshape_real_valued_tensor(
   1741         self._to_dense_tensor(input_tensor), output_rank, self.name)
   1742 
   1743   def _to_dense_tensor(self, input_tensor):
   1744     if not self.is_sparse:
   1745       return input_tensor
   1746     raise ValueError("Set is_sparse to False if you want a dense Tensor for "
   1747                      "column_name: {}".format(self.name))
   1748 
   1749 
   1750 @experimental
   1751 def _real_valued_var_len_column(column_name,
   1752                                 default_value=None,
   1753                                 dtype=dtypes.float32,
   1754                                 normalizer=None,
   1755                                 is_sparse=False):
   1756   """Creates a `_RealValuedVarLenColumn` for variable-length numeric data.
   1757 
    1758   Note: this is not integrated with any of the DNN estimators, except the RNN
    1759   ones, `DynamicRNNEstimator` and `StateSavingRNNEstimator`.
   1760 
    1761   It can create a parsing config either for a SparseTensor (with
    1762   is_sparse=True) or for a padded Tensor (with is_sparse=False).
   1763   The (dense_)shape of the result will be [batch_size, None], which can be used
   1764   with is_sparse=False as input into an RNN (see DynamicRNNEstimator or
   1765   StateSavingRNNEstimator) or with is_sparse=True as input into a tree (see
   1766   gtflow).
   1767 
    1768   Use `real_valued_column` if the Feature has a fixed length. Use a
    1769   `SparseColumn` for columns to be embedded / one-hot encoded.
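
           For example, a minimal sketch (the "measurements" feature name is
           illustrative):

               # Padded dense Tensor of shape [batch_size, None]:
               measurements = _real_valued_var_len_column(
                   "measurements", default_value=0.0, is_sparse=False)
               # Or a SparseTensor whose shape is inferred after parsing:
               sparse_measurements = _real_valued_var_len_column(
                   "measurements", is_sparse=True)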
   1770 
   1771   Args:
   1772     column_name: A string defining real valued column name.
   1773     default_value: A scalar value compatible with dtype. Needs to be specified
   1774       if is_sparse=False.
   1775     dtype: Defines the type of values. Default value is tf.float32. Needs to be
   1776       convertible to tf.float32.
   1777     normalizer: If not None, a function that can be used to normalize the value
   1778       of the real valued column after default_value is applied for parsing.
   1779       Normalizer function takes the input tensor as its argument, and returns
   1780       the output tensor. (e.g. lambda x: (x - 3.0) / 4.2). Note that for
    1781       the output tensor (e.g. lambda x: (x - 3.0) / 4.2). Note that for
    1782       is_sparse=True, the normalizer will be run on the values of the
    1783       `SparseTensor`.
   1784   Returns:
    1785     A _RealValuedVarLenColumn.
   1786   Raises:
   1787     TypeError: if default_value is not a scalar value compatible with dtype.
   1788     TypeError: if dtype is not convertible to tf.float32.
   1789     ValueError: if default_value is None and is_sparse is False.
   1790   """
   1791   if not (dtype.is_integer or dtype.is_floating):
   1792     raise TypeError("dtype must be convertible to float. "
   1793                     "dtype: {}, column_name: {}".format(dtype, column_name))
   1794 
   1795   if default_value is None and not is_sparse:
   1796     raise ValueError("default_value must be provided when is_sparse=False to "
   1797                      "parse a padded Tensor. "
   1798                      "column_name: {}".format(column_name))
   1799   if isinstance(default_value, list):
   1800     raise ValueError(
   1801         "Only scalar default value. default_value: {}, column_name: {}".format(
   1802             default_value, column_name))
   1803   if default_value is not None:
   1804     if dtype.is_integer:
   1805       default_value = int(default_value)
   1806     elif dtype.is_floating:
   1807       default_value = float(default_value)
   1808 
   1809   return _RealValuedVarLenColumn(column_name, default_value, dtype, normalizer,
   1810                                  is_sparse)
   1811 
   1812 
   1813 class _RealValuedColumn(
   1814     _FeatureColumn,
   1815     fc_core._DenseColumn,  # pylint: disable=protected-access
   1816     collections.namedtuple(
   1817         "_RealValuedColumn",
   1818         ["column_name", "dimension", "default_value", "dtype", "normalizer"])):
    1819   """Represents a real-valued feature column, also known as a continuous feature.
   1820 
   1821   Instances of this class are immutable. The dictionary returned by InputBuilder
   1822   contains a ("column_name", Tensor) pair with a Tensor shape of
   1823   (batch_size, dimension).
   1824   """
   1825 
   1826   def __new__(cls, column_name, dimension, default_value,
   1827               dtype, normalizer):
   1828     if default_value is not None:
   1829       default_value = tuple(default_value)
   1830     return super(_RealValuedColumn, cls).__new__(cls, column_name, dimension,
   1831                                                  default_value, dtype,
   1832                                                  normalizer)
   1833 
   1834   @property
   1835   def name(self):
   1836     return self.column_name
   1837 
   1838   @property
   1839   def config(self):
   1840     default_value = self.default_value
   1841     if default_value is not None:
   1842       default_value = list(default_value)
   1843     return {self.column_name: parsing_ops.FixedLenFeature([self.dimension],
   1844                                                           self.dtype,
   1845                                                           default_value)}
   1846 
   1847   @property
   1848   def key(self):
   1849     """Returns a string which will be used as a key when we do sorting."""
   1850     return self._key_without_properties(["normalizer"])
   1851 
   1852   @property
   1853   def normalizer_fn(self):
   1854     """Returns the function used to normalize the column."""
   1855     return self.normalizer
   1856 
   1857   def _normalized_input_tensor(self, input_tensor):
   1858     """Returns the input tensor after custom normalization is applied."""
   1859     return (self.normalizer(input_tensor) if self.normalizer is not None else
   1860             input_tensor)
   1861 
   1862   def insert_transformed_feature(self, columns_to_tensors):
    1863     """Applies the transformation and inserts it into columns_to_tensors.
    1864 
    1865     Args:
    1866       columns_to_tensors: A mapping from feature columns to tensors. A string
    1867         key means a base (untransformed) feature. A `_FeatureColumn` key means
    1868         that the column has already been transformed.
   1869     """
   1870     # Transform the input tensor according to the normalizer function.
   1871     input_tensor = self._normalized_input_tensor(columns_to_tensors[self.name])
   1872     columns_to_tensors[self] = math_ops.to_float(input_tensor)
   1873 
   1874   # pylint: disable=unused-argument
   1875   def _to_dnn_input_layer(self,
   1876                           input_tensor,
   1877                           weight_collections=None,
   1878                           trainable=True,
   1879                           output_rank=2):
   1880     input_tensor = self._to_dense_tensor(input_tensor)
   1881     if input_tensor.dtype != dtypes.float32:
   1882       input_tensor = math_ops.to_float(input_tensor)
   1883     return _reshape_real_valued_tensor(input_tensor, output_rank, self.name)
   1884 
   1885   def _to_dense_tensor(self, input_tensor):
   1886     return input_tensor
   1887 
   1888   @property
   1889   def _variable_shape(self):
   1890     return tensor_shape.TensorShape([self.dimension])
   1891 
   1892   def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
   1893     del weight_collections
   1894     del trainable
   1895     return inputs.get(self)
   1896 
   1897   def _transform_feature(self, inputs):
   1898     return math_ops.to_float(
   1899         self._normalized_input_tensor(inputs.get(self.name)))
   1900 
   1901   @property
   1902   def _parse_example_spec(self):
   1903     return self.config
   1904 
   1905 
   1906 def real_valued_column(column_name,
   1907                        dimension=1,
   1908                        default_value=None,
   1909                        dtype=dtypes.float32,
   1910                        normalizer=None):
   1911   """Creates a `_RealValuedColumn` for dense numeric data.
   1912 
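           For example, a minimal sketch (feature names are illustrative):

               age = real_valued_column("age")
               # A 3-dimensional column with a per-dimension parsing default:
               prices = real_valued_column("prices", dimension=3, default_value=0.0)
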
   1913   Args:
   1914     column_name: A string defining real valued column name.
   1915     dimension: An integer specifying dimension of the real valued column.
   1916       The default is 1.
   1917     default_value: A single value compatible with dtype or a list of values
   1918       compatible with dtype which the column takes on during tf.Example parsing
    1919       if data is missing. A default value of None will cause
    1920       tf.parse_example to fail if an example does not contain this column. If
    1921       a single value is provided, the same value will be applied as the
    1922       default value for every dimension. If a list of values is provided, the
    1923       length of the list should be equal to the value of `dimension`.
   1925     dtype: defines the type of values. Default value is tf.float32. Must be a
   1926       non-quantized, real integer or floating point type.
   1927     normalizer: If not None, a function that can be used to normalize the value
   1928       of the real valued column after default_value is applied for parsing.
   1929       Normalizer function takes the input tensor as its argument, and returns
   1930       the output tensor. (e.g. lambda x: (x - 3.0) / 4.2). Note that for
   1931       variable length columns, the normalizer should expect an input_tensor of
   1932       type `SparseTensor`.
   1933   Returns:
   1934     A _RealValuedColumn.
   1935   Raises:
   1936     TypeError: if dimension is not an int
   1937     ValueError: if dimension is not a positive integer
    1938     ValueError: if default_value is a list but its length is not equal to the
    1939       value of `dimension`.
   1940     TypeError: if default_value is not compatible with dtype.
   1941     ValueError: if dtype is not convertible to tf.float32.
   1942   """
   1943 
   1944   if dimension is None:
   1945     raise TypeError("dimension must be an integer. Use the "
    1946                     "_real_valued_var_len_column for variable length features. "
   1947                     "dimension: {}, column_name: {}".format(dimension,
   1948                                                             column_name))
   1949   if not isinstance(dimension, int):
   1950     raise TypeError("dimension must be an integer. "
   1951                     "dimension: {}, column_name: {}".format(dimension,
   1952                                                             column_name))
   1953   if dimension < 1:
   1954     raise ValueError("dimension must be greater than 0. "
   1955                      "dimension: {}, column_name: {}".format(dimension,
   1956                                                              column_name))
   1957 
   1958   if not (dtype.is_integer or dtype.is_floating):
   1959     raise ValueError("dtype must be convertible to float. "
   1960                      "dtype: {}, column_name: {}".format(dtype, column_name))
   1961 
   1962   if default_value is None:
   1963     return _RealValuedColumn(column_name, dimension, default_value, dtype,
   1964                              normalizer)
   1965 
   1966   if isinstance(default_value, int):
   1967     if dtype.is_integer:
   1968       default_value = ([default_value for _ in range(dimension)] if dimension
   1969                        else [default_value])
   1970       return _RealValuedColumn(column_name, dimension, default_value, dtype,
   1971                                normalizer)
   1972     if dtype.is_floating:
   1973       default_value = float(default_value)
   1974       default_value = ([default_value for _ in range(dimension)] if dimension
   1975                        else [default_value])
   1976       return _RealValuedColumn(column_name, dimension, default_value, dtype,
   1977                                normalizer)
   1978 
   1979   if isinstance(default_value, float):
   1980     if dtype.is_floating and (not dtype.is_integer):
   1981       default_value = ([default_value for _ in range(dimension)] if dimension
   1982                        else [default_value])
   1983       return _RealValuedColumn(column_name, dimension, default_value, dtype,
   1984                                normalizer)
   1985 
   1986   if isinstance(default_value, list):
   1987     if len(default_value) != dimension:
   1988       raise ValueError(
   1989           "The length of default_value must be equal to dimension. "
   1990           "default_value: {}, dimension: {}, column_name: {}".format(
   1991               default_value, dimension, column_name))
   1992     # Check if the values in the list are all integers or are convertible to
   1993     # floats.
   1994     is_list_all_int = True
   1995     is_list_all_float = True
   1996     for v in default_value:
   1997       if not isinstance(v, int):
   1998         is_list_all_int = False
   1999       if not (isinstance(v, float) or isinstance(v, int)):
   2000         is_list_all_float = False
   2001     if is_list_all_int:
   2002       if dtype.is_integer:
   2003         return _RealValuedColumn(column_name, dimension, default_value, dtype,
   2004                                  normalizer)
   2005       elif dtype.is_floating:
   2006         default_value = [float(v) for v in default_value]
   2007         return _RealValuedColumn(column_name, dimension, default_value, dtype,
   2008                                  normalizer)
   2009     if is_list_all_float:
   2010       if dtype.is_floating and (not dtype.is_integer):
   2011         default_value = [float(v) for v in default_value]
   2012         return _RealValuedColumn(column_name, dimension, default_value, dtype,
   2013                                  normalizer)
   2014 
   2015   raise TypeError("default_value must be compatible with dtype. "
   2016                   "default_value: {}, dtype: {}, column_name: {}".format(
   2017                       default_value, dtype, column_name))
   2018 
   2019 
   2020 class _BucketizedColumn(
   2021     _FeatureColumn,
   2022     fc_core._CategoricalColumn,  # pylint: disable=protected-access
   2023     fc_core._DenseColumn,  # pylint: disable=protected-access
   2024     collections.namedtuple("_BucketizedColumn", ["source_column",
   2025                                                  "boundaries"])):
   2026   """Represents a bucketization transformation also known as binning.
   2027 
   2028   Instances of this class are immutable. Values in `source_column` will be
   2029   bucketized based on `boundaries`.
   2030   For example, if the inputs are:
   2031       boundaries = [0, 10, 100]
   2032       source_column = [[-5], [150], [10], [0], [4], [19]]
   2033 
   2034   then the bucketized feature will be:
   2035       output = [[0], [3], [2], [1], [1], [2]]
   2036 
   2037   Attributes:
   2038     source_column: A _RealValuedColumn defining dense column.
   2039     boundaries: A list or tuple of floats specifying the boundaries. It has to
    2040       be sorted. [a, b, c] defines the following buckets: (-inf, a), [a, b),
    2041       [b, c), [c, inf)
   2042   Raises:
   2043     ValueError: if 'boundaries' is empty or not sorted.
   2044   """
   2045 
   2046   def __new__(cls, source_column, boundaries):
   2047     if not isinstance(source_column, _RealValuedColumn):
   2048       raise TypeError("source_column must be an instance of _RealValuedColumn. "
   2049                       "source_column: {}".format(source_column))
   2050 
   2051     if source_column.dimension is None:
   2052       raise ValueError("source_column must have a defined dimension. "
   2053                        "source_column: {}".format(source_column))
   2054 
   2055     if (not isinstance(boundaries, list) and
   2056         not isinstance(boundaries, tuple)) or not boundaries:
   2057       raise ValueError("boundaries must be a non-empty list or tuple. "
   2058                        "boundaries: {}".format(boundaries))
   2059 
    2060     # We allow bucket boundaries to be monotonically non-decreasing
    2061     # (i.e. a[i+1] >= a[i]). When two bucket boundaries are equal, we
   2062     # de-duplicate.
   2063     sanitized_boundaries = []
   2064     for i in range(len(boundaries) - 1):
   2065       if boundaries[i] == boundaries[i + 1]:
   2066         continue
   2067       elif boundaries[i] < boundaries[i + 1]:
   2068         sanitized_boundaries.append(boundaries[i])
   2069       else:
   2070         raise ValueError("boundaries must be a sorted list. "
   2071                          "boundaries: {}".format(boundaries))
   2072     sanitized_boundaries.append(boundaries[len(boundaries) - 1])
   2073 
   2074     return super(_BucketizedColumn, cls).__new__(cls, source_column,
   2075                                                  tuple(sanitized_boundaries))
   2076 
   2077   @property
   2078   def name(self):
   2079     return "{}_bucketized".format(self.source_column.name)
   2080 
   2081   @property
   2082   def length(self):
   2083     """Returns total number of buckets."""
   2084     return len(self.boundaries) + 1
   2085 
   2086   @property
   2087   def config(self):
   2088     return self.source_column.config
   2089 
   2090   @property
   2091   def key(self):
   2092     """Returns a string which will be used as a key when we do sorting."""
   2093     return "{}".format(self)
   2094 
   2095   # pylint: disable=unused-argument
   2096   def _to_dnn_input_layer(self,
   2097                           input_tensor,
   2098                           weight_collections=None,
   2099                           trainable=True,
   2100                           output_rank=2):
   2101     if output_rank != 2:
   2102       raise ValueError("BucketizedColumn currently only supports output_rank=2")
   2103     return array_ops.reshape(
   2104         array_ops.one_hot(
   2105             math_ops.to_int64(input_tensor),
   2106             self.length,
   2107             1.,
   2108             0.,
   2109             name="one_hot"), [-1, self.length * self.source_column.dimension],
   2110         name="reshape")
   2111 
   2112   def to_sparse_tensor(self, input_tensor):
   2113     """Creates a SparseTensor from the bucketized Tensor."""
   2114     dimension = self.source_column.dimension
   2115     batch_size = array_ops.shape(input_tensor, name="shape")[0]
   2116 
   2117     if dimension > 1:
   2118       i1 = array_ops.reshape(
   2119           array_ops.tile(
   2120               array_ops.expand_dims(
   2121                   math_ops.range(0, batch_size), 1, name="expand_dims"),
   2122               [1, dimension],
   2123               name="tile"), [-1],
   2124           name="reshape")
   2125       i2 = array_ops.tile(
   2126           math_ops.range(0, dimension), [batch_size], name="tile")
   2127       # Flatten the bucket indices and unique them across dimensions
   2128       # E.g. 2nd dimension indices will range from k to 2*k-1 with k buckets
   2129       bucket_indices = array_ops.reshape(
   2130           input_tensor, [-1], name="reshape") + self.length * i2
   2131     else:
   2132       # Simpler indices when dimension=1
   2133       i1 = math_ops.range(0, batch_size)
   2134       i2 = array_ops.zeros([batch_size], dtype=dtypes.int32, name="zeros")
   2135       bucket_indices = array_ops.reshape(input_tensor, [-1], name="reshape")
   2136 
   2137     indices = math_ops.to_int64(array_ops.transpose(array_ops.stack((i1, i2))))
   2138     shape = math_ops.to_int64(array_ops.stack([batch_size, dimension]))
   2139     sparse_id_values = sparse_tensor_py.SparseTensor(
   2140         indices, bucket_indices, shape)
   2141 
   2142     return sparse_id_values
   2143 
   2144   def _wide_embedding_lookup_arguments(self, input_tensor):
   2145     return _LinearEmbeddingLookupArguments(
   2146         input_tensor=self.to_sparse_tensor(input_tensor),
   2147         weight_tensor=None,
   2148         vocab_size=self.length * self.source_column.dimension,
   2149         initializer=init_ops.zeros_initializer(),
   2150         combiner="sum")
   2151 
   2152   def _transform_feature(self, inputs):
   2153     """Handles cross transformation."""
   2154     # Bucketize the source column.
   2155     return bucketization_op.bucketize(
   2156         inputs.get(self.source_column),
   2157         boundaries=list(self.boundaries),
   2158         name="bucketize")
   2159 
   2160   def insert_transformed_feature(self, columns_to_tensors):
   2161     """Handles sparse column to id conversion."""
   2162     columns_to_tensors[self] = self._transform_feature(
   2163         _LazyBuilderByColumnsToTensor(columns_to_tensors))
   2164 
   2165   @property
   2166   def _parse_example_spec(self):
   2167     return self.config
   2168 
   2169   @property
   2170   def _num_buckets(self):
   2171     return self.length * self.source_column.dimension
   2172 
   2173   def _get_sparse_tensors(self, inputs, weight_collections=None,
   2174                           trainable=None):
   2175     del weight_collections
   2176     del trainable
   2177     return fc_core._CategoricalColumn.IdWeightPair(  # pylint: disable=protected-access
   2178         self.to_sparse_tensor(inputs.get(self)), None)
   2179 
   2180   @property
   2181   def _variable_shape(self):
   2182     return tensor_shape.TensorShape(
   2183         [self.length * self.source_column.dimension])
   2184 
   2185   def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
   2186     return self._to_dnn_input_layer(
   2187         inputs.get(self), weight_collections, trainable)
   2188 
   2189 
   2190 def bucketized_column(source_column, boundaries):
   2191   """Creates a _BucketizedColumn for discretizing dense input.
   2192 
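           For example, a minimal sketch (the "age" feature and boundaries are
           illustrative):

               age = real_valued_column("age")
               age_buckets = bucketized_column(
                   source_column=age, boundaries=[18, 35, 65])
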
   2193   Args:
   2194     source_column: A _RealValuedColumn defining dense column.
   2195     boundaries: A list or tuple of floats specifying the boundaries. It has to
   2196       be sorted.
   2197 
   2198   Returns:
   2199     A _BucketizedColumn.
   2200 
   2201   Raises:
   2202     ValueError: if 'boundaries' is empty or not sorted.
   2203   """
   2204   return _BucketizedColumn(source_column, boundaries)
   2205 
   2206 
   2207 class _CrossedColumn(
   2208     _FeatureColumn,
   2209     fc_core._CategoricalColumn,  # pylint: disable=protected-access
   2210     collections.namedtuple("_CrossedColumn", [
   2211         "columns", "hash_bucket_size", "hash_key", "combiner",
   2212         "ckpt_to_load_from", "tensor_name_in_ckpt"
   2213     ])):
   2214   """Represents a cross transformation also known as conjunction or combination.
   2215 
   2216   Instances of this class are immutable. It crosses given `columns`. Crossed
   2217   column output will be hashed to hash_bucket_size.
    2218   Conceptually, the transformation can be thought of as:
   2219     Hash(cartesian product of features in columns) % `hash_bucket_size`
   2220 
   2221   For example, if the columns are
   2222 
    2223       SparseTensor referred to by the first column: shape = [2, 2]
   2224       [0, 0]: "a"
   2225       [1, 0]: "b"
   2226       [1, 1]: "c"
   2227 
    2228       SparseTensor referred to by the second column: shape = [2, 1]
   2229       [0, 0]: "d"
   2230       [1, 0]: "e"
   2231 
   2232   then crossed feature will look like:
   2233 
   2234       shape = [2, 2]
   2235       [0, 0]: Hash64("d", Hash64("a")) % hash_bucket_size
   2236       [1, 0]: Hash64("e", Hash64("b")) % hash_bucket_size
   2237       [1, 1]: Hash64("e", Hash64("c")) % hash_bucket_size
   2238 
   2239   Attributes:
   2240     columns: An iterable of _FeatureColumn. Items can be an instance of
   2241       _SparseColumn, _CrossedColumn, or _BucketizedColumn.
   2242     hash_bucket_size: An int that is > 1. The number of buckets.
   2243     combiner: A string specifying how to reduce if there are multiple entries
   2244       in a single row. Currently "mean", "sqrtn" and "sum" are supported, with
   2245       "sum" the default. "sqrtn" often achieves good accuracy, in particular
    2246       with bag-of-words columns. Each of these can be thought of as an
    2247       example-level normalization on the column:
    2248         * "sum": do not normalize
    2249         * "mean": do l1 normalization
    2250         * "sqrtn": do l2 normalization
    2251       For more information, see `tf.embedding_lookup_sparse`.
   2252     ckpt_to_load_from: (Optional). String representing checkpoint name/pattern
   2253       to restore the column weights. Required if `tensor_name_in_ckpt` is not
   2254       None.
   2255     tensor_name_in_ckpt: (Optional). Name of the `Tensor` in the provided
   2256       checkpoint from which to restore the column weights. Required if
   2257       `ckpt_to_load_from` is not None.
   2258 
   2259   Raises:
    2260     TypeError: if any item in columns is not an instance of _SparseColumn,
    2261       _CrossedColumn, or _BucketizedColumn.
   2262     ValueError: if hash_bucket_size is not > 1 or len(columns) is not > 1. Also,
   2263       if only one of `ckpt_to_load_from` and `tensor_name_in_ckpt` is specified.
   2264   """
   2265 
   2266   @staticmethod
   2267   def _assert_is_crossable(column):
   2268     if isinstance(column, (_SparseColumn, _CrossedColumn, _BucketizedColumn)):
   2269       return
   2270     raise TypeError("columns must be a set of _SparseColumn, "
   2271                     "_CrossedColumn, or _BucketizedColumn instances. "
   2272                     "(column {} is a {})".format(column,
   2273                                                  column.__class__.__name__))
   2274 
   2275   def __new__(cls,
   2276               columns,
   2277               hash_bucket_size,
   2278               hash_key,
   2279               combiner="sum",
   2280               ckpt_to_load_from=None,
   2281               tensor_name_in_ckpt=None):
   2282     for column in columns:
   2283       _CrossedColumn._assert_is_crossable(column)
   2284 
   2285     if len(columns) < 2:
   2286       raise ValueError("columns must contain at least 2 elements. "
   2287                        "columns: {}".format(columns))
   2288 
   2289     if hash_bucket_size < 2:
   2290       raise ValueError("hash_bucket_size must be at least 2. "
   2291                        "hash_bucket_size: {}".format(hash_bucket_size))
   2292 
   2293     if (ckpt_to_load_from is None) != (tensor_name_in_ckpt is None):
   2294       raise ValueError("Must specify both `ckpt_to_load_from` and "
   2295                        "`tensor_name_in_ckpt` or none of them.")
   2296 
   2297     sorted_columns = sorted(
   2298         [column for column in columns], key=lambda column: column.name)
   2299     return super(_CrossedColumn, cls).__new__(cls, tuple(sorted_columns),
   2300                                               hash_bucket_size, hash_key,
   2301                                               combiner,
   2302                                               ckpt_to_load_from,
   2303                                               tensor_name_in_ckpt)
   2304 
   2305   @property
   2306   def name(self):
   2307     sorted_names = sorted([column.name for column in self.columns])
   2308     return "_X_".join(sorted_names)
   2309 
   2310   @property
   2311   def config(self):
   2312     config = {}
   2313     for column in self.columns:
   2314       config.update(_get_feature_config(column))
   2315     return config
   2316 
   2317   @property
   2318   def length(self):
   2319     """Returns total number of buckets."""
   2320     return self.hash_bucket_size
   2321 
   2322   @property
   2323   def key(self):
   2324     """Returns a string which will be used as a key when we do sorting."""
   2325     return "{}".format(self)
   2326 
   2327   def id_tensor(self, input_tensor):
   2328     """Returns the id tensor from the given transformed input_tensor."""
   2329     return input_tensor
   2330 
   2331   def weight_tensor(self, input_tensor):
   2332     """Returns the weight tensor from the given transformed input_tensor."""
   2333     del input_tensor
   2334     return None
   2335 
   2336   def _to_dnn_input_layer(self,
   2337                           input_tensor,
   2338                           weight_collections=None,
   2339                           trainable=True,
   2340                           output_rank=2):
   2341     del input_tensor
   2342     del weight_collections
   2343     del trainable
   2344     del output_rank
   2345     raise ValueError("CrossedColumn is not supported in DNN. "
   2346                      "Please use embedding_column. column: {}".format(self))
   2347 
   2348   def _checkpoint_path(self):
   2349     if self.ckpt_to_load_from is not None:
   2350       return self.ckpt_to_load_from, self.tensor_name_in_ckpt
   2351     return None
   2352 
   2353   def _wide_embedding_lookup_arguments(self, input_tensor):
   2354     return _LinearEmbeddingLookupArguments(
   2355         input_tensor=input_tensor,
   2356         weight_tensor=None,
   2357         vocab_size=self.length,
   2358         initializer=init_ops.zeros_initializer(),
   2359         combiner=self.combiner)
   2360 
   2361   def _transform_feature(self, inputs):
   2362     """Handles cross transformation."""
   2363 
   2364     def _collect_leaf_level_columns(cross):
   2365       """Collects base columns contained in the cross."""
   2366       leaf_level_columns = []
   2367       for c in cross.columns:
   2368         if isinstance(c, _CrossedColumn):
   2369           leaf_level_columns.extend(_collect_leaf_level_columns(c))
   2370         else:
   2371           leaf_level_columns.append(c)
   2372       return leaf_level_columns
   2373 
   2374     feature_tensors = []
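    # Gather the transformed tensor for every leaf column; bucketized columns
    # are first converted to sparse tensors before being fed to the cross op.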
   2375     for c in _collect_leaf_level_columns(self):
   2376       if isinstance(c, _SparseColumn):
   2377         feature_tensors.append(inputs.get(c.name))
   2378       else:
   2379         if isinstance(c, _BucketizedColumn):
   2380           feature_tensors.append(c.to_sparse_tensor(inputs.get(c)))
   2381         else:
   2382           feature_tensors.append(inputs.get(c))
   2383     return sparse_feature_cross_op.sparse_feature_cross(
   2384         feature_tensors,
   2385         hashed_output=True,
   2386         num_buckets=self.hash_bucket_size,
   2387         hash_key=self.hash_key,
   2388         name="cross")
   2389 
   2390   def insert_transformed_feature(self, columns_to_tensors):
   2391     """Applies the cross transformation and caches the result."""
   2392     columns_to_tensors[self] = self._transform_feature(
   2393         _LazyBuilderByColumnsToTensor(columns_to_tensors))
   2394 
   2395   @property
   2396   def _parse_example_spec(self):
   2397     return self.config
   2398 
   2399   @property
   2400   def _num_buckets(self):
   2401     return self.length
   2402 
   2403   def _get_sparse_tensors(self, inputs, weight_collections=None,
   2404                           trainable=None):
   2405     del weight_collections
   2406     del trainable
   2407     return fc_core._CategoricalColumn.IdWeightPair(inputs.get(self), None)  # pylint: disable=protected-access
   2408 
   2409 
   2410 class _LazyBuilderByColumnsToTensor(object):
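  """Wraps a columns_to_tensors dict behind the core `inputs.get(key)` API.

  `get` returns the cached tensor for `key`, transforming the `_FeatureColumn`
  on demand (and caching the result) when it is not present yet.
  """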
   2411 
   2412   def __init__(self, columns_to_tensors):
   2413     self._columns_to_tensors = columns_to_tensors
   2414 
   2415   def get(self, key):
   2416     """Gets the (possibly transformed) tensor for the given key."""
   2417     if key in self._columns_to_tensors:
   2418       return self._columns_to_tensors[key]
   2419     if isinstance(key, str):
   2420       raise ValueError(
   2421           "features dictionary doesn't contain key ({})".format(key))
   2422     if not isinstance(key, _FeatureColumn):
   2423       raise TypeError('"key" must be either a "str" or "_FeatureColumn". '
   2424                       "Provided: {}".format(key))
   2425 
   2426     key.insert_transformed_feature(self._columns_to_tensors)
   2427     return self._columns_to_tensors[key]
   2428 
   2429 
   2430 def crossed_column(columns, hash_bucket_size, combiner="sum",
   2431                    ckpt_to_load_from=None,
   2432                    tensor_name_in_ckpt=None,
   2433                    hash_key=None):
   2434   """Creates a _CrossedColumn for performing feature crosses.
   2435 
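  A typical usage sketch (the column names and keys here are illustrative):

  ```python
  country = sparse_column_with_keys("country", ["US", "BR", "CA"])
  language = sparse_column_with_keys("language", ["en", "pt", "fr"])
  # Hash each (country, language) co-occurrence into one of 10000 buckets.
  country_x_language = crossed_column(
      columns=[country, language], hash_bucket_size=10000)
  ```
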
   2436   Args:
   2437     columns: An iterable of _FeatureColumn. Each item can be an instance of
   2438       _SparseColumn, _CrossedColumn, or _BucketizedColumn.
   2439     hash_bucket_size: An int that is > 1. The number of buckets.
   2440     combiner: A string specifying how to reduce if there are multiple entries
   2441       in a single row. Currently "mean", "sqrtn" and "sum" are supported, with
   2442       "sum" the default. "sqrtn" often achieves good accuracy, in particular
   2443       with bag-of-words columns. Each of these can be thought of as an
   2444       example-level normalization on the column:
   2445         * "sum": do not normalize
   2446         * "mean": do l1 normalization
   2447         * "sqrtn": do l2 normalization
   2448       For more information, see `tf.embedding_lookup_sparse`.
   2449     ckpt_to_load_from: (Optional). String representing checkpoint name/pattern
   2450       to restore the column weights. Required if `tensor_name_in_ckpt` is not
   2451       None.
   2452     tensor_name_in_ckpt: (Optional). Name of the `Tensor` in the provided
   2453       checkpoint from which to restore the column weights. Required if
   2454       `ckpt_to_load_from` is not None.
   2455     hash_key: (Optional). Specifies the hash_key that will be used by the
   2456       `FingerprintCat64` function to combine the fingerprints of the crossed
   2457       columns in `SparseFeatureCrossOp`.
   2458 
   2459   Returns:
   2460     A _CrossedColumn.
   2461 
   2462   Raises:
   2463     TypeError: if any item in columns is not an instance of _SparseColumn,
   2464       _CrossedColumn, or _BucketizedColumn, or
   2465       hash_bucket_size is not an int.
   2466     ValueError: if hash_bucket_size is not > 1 or
   2467       len(columns) is not > 1.
   2468   """
   2469   return _CrossedColumn(
   2470       columns,
   2471       hash_bucket_size,
   2472       hash_key,
   2473       combiner=combiner,
   2474       ckpt_to_load_from=ckpt_to_load_from,
   2475       tensor_name_in_ckpt=tensor_name_in_ckpt)
   2476 
   2477 
   2478 class DataFrameColumn(_FeatureColumn,
   2479                       collections.namedtuple("DataFrameColumn",
   2480                                              ["column_name", "series"])):
   2481   """Represents a feature column produced from a `DataFrame`.
   2482 
   2483   Instances of this class are immutable.  A `DataFrame` column may be dense or
   2484   sparse, and may have any shape, with the constraint that dimension 0 is
   2485   batch_size.
   2486 
   2487   Args:
   2488     column_name: a name for this column
   2489     series: a `Series` to be wrapped, which has already had its base features
   2490       substituted with `PredefinedSeries`.
   2491   """
   2492 
   2493   def __new__(cls, column_name, series):
   2494     return super(DataFrameColumn, cls).__new__(cls, column_name, series)
   2495 
   2496   @property
   2497   def name(self):
   2498     return self.column_name
   2499 
   2500   @property
   2501   def config(self):
   2502     return self.series.required_base_features()
   2503 
   2504   @property
   2505   def key(self):
   2506     """Returns a string which will be used as a key when we do sorting."""
   2507     return self.name
   2508 
   2509   def insert_transformed_feature(self, columns_to_tensors):
   2510     # The cache must already contain mappings from the expected base feature
   2511     # names to Tensors.
   2512 
   2513     # Passing columns_to_tensors as the cache here means that multiple outputs
   2514     # of the transform will be cached, keyed by the repr of their associated
   2515     # TransformedSeries.
   2516     # The specific requested output ends up in columns_to_tensors twice: once
   2517     # keyed by the TransformedSeries repr, and once keyed by this
   2518     # DataFrameColumn instance.
   2519     columns_to_tensors[self] = self.series.build(columns_to_tensors)
   2520 
   2521   # pylint: disable=unused-argument
   2522   def _to_dnn_input_layer(self,
   2523                           input_tensor,
   2524                           weight_collections=None,
   2525                           trainable=True,
   2526                           output_rank=2):
   2527     if input_tensor.dtype != dtypes.float32:
   2528       input_tensor = math_ops.to_float(input_tensor)
   2529     return _reshape_real_valued_tensor(input_tensor, output_rank, self.name)
   2530 
   2531   def _to_dense_tensor(self, input_tensor):
   2532     return self._to_dnn_input_layer(input_tensor)
   2533 
   2534   def __eq__(self, other):
   2535     if isinstance(other, self.__class__):
   2536       return self.__dict__ == other.__dict__
   2537     else:
   2538       return False
   2539 
   2540   def __ne__(self, other):
   2541     return not self.__eq__(other)
   2542 
   2543 
   2544 def _get_feature_config(feature_column):
   2545   """Returns configuration for the base feature defined in feature_column."""
   2546   if not isinstance(feature_column, _FeatureColumn):
   2547     raise TypeError(
   2548         "feature_columns should only contain instances of _FeatureColumn. "
   2549         "Given column is {}".format(feature_column))
   2550   if isinstance(feature_column, (_SparseColumn, _WeightedSparseColumn,
   2551                                  _EmbeddingColumn, _RealValuedColumn,
   2552                                  _RealValuedVarLenColumn,
   2553                                  _BucketizedColumn, _CrossedColumn,
   2554                                  _OneHotColumn, _ScatteredEmbeddingColumn)):
   2555     return feature_column.config
   2556 
   2557   raise TypeError("Unsupported _FeatureColumn type. "
   2558                   "Given column is {}".format(feature_column))
   2559 
   2560 
   2561 def create_feature_spec_for_parsing(feature_columns):
   2562   """Helper that prepares features config from input feature_columns.
   2563 
   2564   The returned feature config can be used as the 'features' arg in tf.parse_example.
   2565 
   2566   Typical usage example:
   2567 
   2568   ```python
   2569   # Define features and transformations
   2570   feature_a = sparse_column_with_vocabulary_file(...)
   2571   feature_b = real_valued_column(...)
   2572   feature_c_bucketized = bucketized_column(real_valued_column("feature_c"), ...)
   2573   feature_a_x_feature_c = crossed_column(
   2574     columns=[feature_a, feature_c_bucketized], ...)
   2575 
   2576   feature_columns = set(
   2577     [feature_b, feature_c_bucketized, feature_a_x_feature_c])
   2578   batch_examples = tf.parse_example(
   2579       serialized=serialized_examples,
   2580       features=create_feature_spec_for_parsing(feature_columns))
   2581   ```
   2582 
   2583   For the above example, create_feature_spec_for_parsing would return the dict:
   2584   {
   2585     "feature_a": parsing_ops.VarLenFeature(tf.string),
   2586     "feature_b": parsing_ops.FixedLenFeature([1], dtype=tf.float32),
   2587     "feature_c": parsing_ops.FixedLenFeature([1], dtype=tf.float32)
   2588   }
   2589 
   2590   Args:
   2591     feature_columns: An iterable containing all the feature columns. All items
   2592       should be instances of classes derived from _FeatureColumn, unless
   2593       feature_columns is a dict -- in which case, this should be true of all
   2594       values in the dict.
   2595   Returns:
   2596     A dict mapping feature keys to FixedLenFeature or VarLenFeature values.
   2597   """
   2598   if isinstance(feature_columns, dict):
   2599     feature_columns = feature_columns.values()
   2600 
   2601   features_config = {}
   2602   for column in feature_columns:
   2603     features_config.update(_get_feature_config(column))
   2604   return features_config
   2605 
   2606 
   2607 def _create_sequence_feature_spec_for_parsing(sequence_feature_columns,
   2608                                               allow_missing_by_default=False):
   2609   """Prepares a feature spec for parsing `tf.SequenceExample`s.
   2610 
   2611   Args:
   2612     sequence_feature_columns: an iterable containing all the feature columns.
   2613       All items should be instances of classes derived from `_FeatureColumn`.
   2614     allow_missing_by_default: whether to set `allow_missing=True` by default for
   2615       `FixedLenSequenceFeature`s.
   2616   Returns:
   2617     A dict mapping feature keys to `FixedLenSequenceFeature` or `VarLenFeature`.
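
  For example (an illustrative sketch; the feature names are hypothetical):

  ```python
  columns = [real_valued_column("length"),
             sparse_column_with_hash_bucket("tokens", hash_bucket_size=100)]
  spec = _create_sequence_feature_spec_for_parsing(columns)
  # spec == {
  #     "length": FixedLenSequenceFeature([1], tf.float32, allow_missing=False),
  #     "tokens": VarLenFeature(tf.string),
  # }
  ```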
   2618   """
   2619   feature_spec = create_feature_spec_for_parsing(sequence_feature_columns)
   2620   sequence_feature_spec = {}
   2621   for key, feature in feature_spec.items():
   2622     if isinstance(feature, parsing_ops.VarLenFeature):
   2623       sequence_feature = feature
   2624     elif (isinstance(feature, parsing_ops.FixedLenFeature) or
   2625           isinstance(feature, parsing_ops.FixedLenSequenceFeature)):
   2626       default_is_set = feature.default_value is not None
   2627       if default_is_set:
   2628         logging.warning(
   2629             'Found default value {} for feature "{}". Ignoring this value and '
   2630             'setting `allow_missing=True` instead.'.
   2631             format(feature.default_value, key))
   2632       sequence_feature = parsing_ops.FixedLenSequenceFeature(
   2633           shape=feature.shape,
   2634           dtype=feature.dtype,
   2635           allow_missing=(allow_missing_by_default or default_is_set))
   2636     else:
   2637       raise TypeError(
   2638           "Unsupported feature type: {}".format(type(feature).__name__))
   2639     sequence_feature_spec[key] = sequence_feature
   2640   return sequence_feature_spec
   2641 
   2642 
   2643 def make_place_holder_tensors_for_base_features(feature_columns):
   2644   """Returns placeholder tensors for inference.
   2645 
   2646   Args:
   2647     feature_columns: An iterable containing all the feature columns. All items
   2648       should be instances of classes derived from _FeatureColumn.
   2649   Returns:
   2650     A dict mapping feature keys to SparseTensors (sparse columns) or
   2651     placeholder Tensors (dense columns).
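
  A serving-time sketch (illustrative; assumes `feature_columns` was built as
  in the `create_feature_spec_for_parsing` example above):

  ```python
  placeholders = make_place_holder_tensors_for_base_features(feature_columns)
  # Dense columns yield tf.placeholder tensors of shape (None, dim); sparse
  # columns yield tf.sparse_placeholder tensors. Feed them at inference time:
  #   session.run(output, feed_dict={placeholders["feature_b"]: [[42.0]]})
  ```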
   2652   """
   2653   # Get dict mapping features to FixedLenFeature or VarLenFeature values.
   2654   dict_for_parse_example = create_feature_spec_for_parsing(feature_columns)
   2655   placeholders = {}
   2656   for column_name, column_type in dict_for_parse_example.items():
   2657     if isinstance(column_type, parsing_ops.VarLenFeature):
   2658       # Sparse placeholder for sparse tensors.
   2659       placeholders[column_name] = array_ops.sparse_placeholder(
   2660           column_type.dtype, name="Placeholder_{}".format(column_name))
   2661     else:
   2662       # Simple placeholder for dense tensors.
   2663       placeholders[column_name] = array_ops.placeholder(
   2664           column_type.dtype,
   2665           shape=(None, column_type.shape[0]),
   2666           name="Placeholder_{}".format(column_name))
   2667   return placeholders
   2668 
   2669 
   2670 class _SparseIdLookupConfig(
   2671     collections.namedtuple("_SparseIdLookupConfig",
   2672                            ["vocabulary_file", "keys", "num_oov_buckets",
   2673                             "vocab_size", "default_value"])):
   2674   """Defines lookup configuration for a sparse feature.
   2675 
   2676   An immutable object that defines the lookup table configuration used by
   2677   tf.feature_to_id_v2.
   2678 
   2679   Attributes:
   2680     vocabulary_file: The vocabulary filename. vocabulary_file cannot be combined
   2681       with keys.
   2682     keys: A 1-D string iterable that specifies the mapping of strings to
   2683       indices; each feature in keys maps to its index in keys.
   2684     num_oov_buckets: The number of out-of-vocabulary buckets. If zero, all
   2685       out-of-vocabulary features will be ignored.
   2686     vocab_size: Number of elements in the vocabulary.
   2687     default_value: The value to use for out-of-vocabulary feature values.
   2688       Defaults to -1.
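
  For example (an illustrative sketch):

  ```python
  # In-memory key list: "math" -> 0, "philosophy" -> 1; anything else -> -1.
  lookup_config = _SparseIdLookupConfig(
      keys=("math", "philosophy"), vocab_size=2, default_value=-1)
  ```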
   2689   """
   2690 
   2691   def __new__(cls,
   2692               vocabulary_file=None,
   2693               keys=None,
   2694               num_oov_buckets=0,
   2695               vocab_size=None,
   2696               default_value=-1):
   2697 
   2698     return super(_SparseIdLookupConfig, cls).__new__(cls, vocabulary_file, keys,
   2699                                                      num_oov_buckets,
   2700                                                      vocab_size, default_value)
   2701