1 # Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 # 3 # Licensed under the Apache License, Version 2.0 (the "License"); 4 # you may not use this file except in compliance with the License. 5 # You may obtain a copy of the License at 6 # 7 # http://www.apache.org/licenses/LICENSE-2.0 8 # 9 # Unless required by applicable law or agreed to in writing, software 10 # distributed under the License is distributed on an "AS IS" BASIS, 11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 # See the License for the specific language governing permissions and 13 # limitations under the License. 14 # ============================================================================== 15 """This API defines FeatureColumn abstraction. 16 17 FeatureColumns provide a high level abstraction for ingesting and representing 18 features in `Estimator` models. 19 20 FeatureColumns are the primary way of encoding features for pre-canned 21 `Estimator` models. 22 23 When using FeatureColumns with `Estimator` models, the type of feature column 24 you should choose depends on (1) the feature type and (2) the model type. 25 26 (1) Feature type: 27 28 * Continuous features can be represented by `real_valued_column`. 29 * Categorical features can be represented by any `sparse_column_with_*` 30 column (`sparse_column_with_keys`, `sparse_column_with_vocabulary_file`, 31 `sparse_column_with_hash_bucket`, `sparse_column_with_integerized_feature`). 32 33 (2) Model type: 34 35 * Deep neural network models (`DNNClassifier`, `DNNRegressor`). 36 37 Continuous features can be directly fed into deep neural network models. 38 39 age_column = real_valued_column("age") 40 41 To feed sparse features into DNN models, wrap the column with 42 `embedding_column` or `one_hot_column`. 
`one_hot_column` will create a dense 43 boolean tensor with an entry for each possible value, and thus the 44 computation cost is linear in the number of possible values versus the number 45 of values that occur in the sparse tensor. Thus using a "one_hot_column" is 46 only recommended for features with only a few possible values. For features 47 with many possible values or for very sparse features, `embedding_column` is 48 recommended. 49 50 embedded_dept_column = embedding_column( 51 sparse_column_with_keys("department", ["math", "philosophy", ...]), 52 dimension=10) 53 54 * Wide (aka linear) models (`LinearClassifier`, `LinearRegressor`). 55 56 Sparse features can be fed directly into linear models. When doing so, 57 embedding lookups are used to efficiently perform the sparse matrix 58 multiplication. 59 60 dept_column = sparse_column_with_keys("department", 61 ["math", "philosophy", "english"]) 62 63 It is recommended that continuous features be bucketized before being 64 fed into linear models. 65 66 bucketized_age_column = bucketized_column( 67 source_column=age_column, 68 boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65]) 69 70 Sparse features can be crossed (also known as conjuncted or combined) in 71 order to form non-linearities, and then fed into linear models. 72 73 cross_dept_age_column = crossed_column( 74 columns=[dept_column, bucketized_age_column], 75 hash_bucket_size=1000) 76 77 Example of building an `Estimator` model using FeatureColumns: 78 79 # Define features and transformations 80 deep_feature_columns = [age_column, embedded_dept_column] 81 wide_feature_columns = [dept_column, bucketized_age_column, 82 cross_dept_age_column] 83 84 # Build deep model 85 estimator = DNNClassifier( 86 feature_columns=deep_feature_columns, 87 hidden_units=[500, 250, 50]) 88 estimator.train(...) 89 90 # Or build a wide model 91 estimator = LinearClassifier( 92 feature_columns=wide_feature_columns) 93 estimator.train(...) 
94 95 # Or build a wide and deep model! 96 estimator = DNNLinearCombinedClassifier( 97 linear_feature_columns=wide_feature_columns, 98 dnn_feature_columns=deep_feature_columns, 99 dnn_hidden_units=[500, 250, 50]) 100 estimator.train(...) 101 102 103 FeatureColumns can also be transformed into a generic input layer for 104 custom models using `input_from_feature_columns` within 105 `feature_column_ops.py`. 106 107 Example of building a non-`Estimator` model using FeatureColumns: 108 109 # Building model via layers 110 111 deep_feature_columns = [age_column, embedded_dept_column] 112 columns_to_tensor = parse_feature_columns_from_examples( 113 serialized=my_data, 114 feature_columns=deep_feature_columns) 115 first_layer = input_from_feature_columns( 116 columns_to_tensors=columns_to_tensor, 117 feature_columns=deep_feature_columns) 118 second_layer = fully_connected(first_layer, ...) 119 120 See feature_column_ops_test for more examples. 121 """ 122 123 from __future__ import absolute_import 124 from __future__ import division 125 from __future__ import print_function 126 127 import abc 128 import collections 129 import math 130 131 import six 132 133 from tensorflow.contrib import lookup 134 from tensorflow.contrib.framework.python.framework import checkpoint_utils 135 from tensorflow.contrib.framework.python.framework import experimental 136 from tensorflow.contrib.framework.python.ops import variables as contrib_variables 137 from tensorflow.contrib.layers.python.layers import embedding_ops 138 from tensorflow.contrib.layers.python.layers import layers 139 from tensorflow.contrib.layers.python.ops import bucketization_op 140 from tensorflow.contrib.layers.python.ops import sparse_feature_cross_op 141 from tensorflow.contrib.layers.python.ops import sparse_ops as contrib_sparse_ops 142 from tensorflow.python.feature_column import feature_column as fc_core 143 from tensorflow.python.framework import dtypes 144 from tensorflow.python.framework import ops 145 from 
tensorflow.python.framework import sparse_tensor as sparse_tensor_py 146 from tensorflow.python.framework import tensor_shape 147 from tensorflow.python.ops import array_ops 148 from tensorflow.python.ops import init_ops 149 from tensorflow.python.ops import math_ops 150 from tensorflow.python.ops import parsing_ops 151 from tensorflow.python.ops import resource_variable_ops 152 from tensorflow.python.ops import sparse_ops 153 from tensorflow.python.ops import string_ops 154 from tensorflow.python.ops import variables 155 from tensorflow.python.platform import tf_logging as logging 156 from tensorflow.python.util import deprecation 157 from tensorflow.python.util import nest 158 159 160 # Imports the core `InputLayer` symbol in contrib during development. 161 InputLayer = fc_core.InputLayer # pylint: disable=invalid-name 162 163 164 class _LinearEmbeddingLookupArguments( 165 collections.namedtuple("_LinearEmbeddingLookupArguments", 166 ["input_tensor", 167 "weight_tensor", 168 "vocab_size", 169 "initializer", 170 "combiner"])): 171 """Represents the information needed from a column for embedding lookup. 172 173 Used to compute DNN inputs and weighted sum. 174 """ 175 pass 176 177 178 class _DeepEmbeddingLookupArguments( 179 collections.namedtuple("_DeepEmbeddingLookupArguments", 180 ["input_tensor", 181 "weight_tensor", 182 "vocab_size", 183 "initializer", 184 "combiner", 185 "dimension", 186 "shared_embedding_name", 187 "hash_key", 188 "max_norm", 189 "trainable"])): 190 """Represents the information needed from a column for embedding lookup. 191 192 Used to compute DNN inputs and weighted sum. 193 """ 194 pass 195 196 197 class _FeatureColumn(object): 198 """Represents a feature column abstraction. 199 200 To distinguish the concept of a feature family and a specific binary feature 201 within a family, we refer to a feature family like "country" as a feature 202 column. 
class _FeatureColumn(object):
  """Abstract interface shared by the feature columns in this module.

  A feature column names a whole feature family, e.g. "country". A single
  feature inside that family pairs the column with a value: "country:US"
  belongs to the "country" column and carries the value "US".

  This class is abstract and must not be instantiated directly; the concrete
  columns (_SparseColumn, _RealValuedColumn, ...) provide the behaviour.
  """
  __metaclass__ = abc.ABCMeta

  @abc.abstractproperty
  @deprecation.deprecated("2016-09-25", "Should be private.")
  def name(self):
    """Returns the name of column or transformed column."""

  @abc.abstractproperty
  @deprecation.deprecated("2016-09-25", "Should be private.")
  def config(self):
    """Returns configuration of the base feature for `tf.parse_example`."""

  @abc.abstractproperty
  @deprecation.deprecated("2016-09-25", "Should be private.")
  def key(self):
    """Returns a string which will be used as a key when we do sorting."""

  @abc.abstractmethod
  @deprecation.deprecated("2016-09-25", "Should be private.")
  def insert_transformed_feature(self, columns_to_tensors):
    """Applies this column's transformation and stores the result.

    Args:
      columns_to_tensors: A mapping from feature columns to tensors. A 'string'
        key denotes a base (not yet transformed) feature. A _FeatureColumn key
        denotes a column whose transformation has already been applied.

    Raises:
      NotImplementedError: always, in this base implementation.
    """
    message = "Transform is not implemented for {}.".format(self)
    raise NotImplementedError(message)

  # pylint: disable=unused-argument
  def _to_dnn_input_layer(self,
                          input_tensor,
                          weight_collection=None,
                          trainable=True,
                          output_rank=2):
    """Returns a Tensor as an input to the first layer of neural network."""
    raise ValueError("Calling an abstract method.")

  def _deep_embedding_lookup_arguments(self, input_tensor):
    """Returns arguments to embedding lookup to build an input layer."""
    message = "No deep embedding lookup arguments for column {}.".format(self)
    raise NotImplementedError(message)

  # To be usable in linear models a subclass is expected to implement either
  # _wide_embedding_lookup_arguments or _to_dense_tensor.
  # pylint: disable=unused-argument
  def _wide_embedding_lookup_arguments(self, input_tensor):
    """Returns arguments to look up embeddings for this column."""
    message = "No wide embedding lookup arguments for column {}.".format(self)
    raise NotImplementedError(message)

  # pylint: disable=unused-argument
  def _to_dense_tensor(self, input_tensor):
    """Returns a dense tensor representing this column's values."""
    message = "No dense tensor representation for column {}.".format(self)
    raise NotImplementedError(message)

  def _checkpoint_path(self):
    """Returns None, or a (path,tensor_name) to load a checkpoint from."""
    return None

  def _key_without_properties(self, properties):
    """Builds a `key`-style string that leaves out the given fields.

    The result has the same layout as `str(self)` for a namedtuple, minus the
    excluded fields. This matters for fields holding functions (for instance
    `initializer` on an embedding column or `normalizer` on a real valued
    column): the default string form of a function contains its address, which
    would make sorting by key non-deterministic.

    Args:
      properties: Container of field names to omit from the key.

    Returns:
      A string of the form "TypeName(field=value, ...)".
    """
    # pylint: disable=protected-access
    rendered = ", ".join(
        "{}={}".format(field, self[position])
        for position, field in enumerate(self._fields)
        if field not in properties)
    # pylint: enable=protected-access
    return "{}({})".format(type(self).__name__, rendered)
# TODO(b/30410315): Support warm starting in all feature columns.
class _SparseColumn(
    _FeatureColumn,
    fc_core._CategoricalColumn,  # pylint: disable=protected-access
    collections.namedtuple("_SparseColumn", [
        "column_name", "is_integerized", "bucket_size", "lookup_config",
        "combiner", "dtype"
    ])):
  """Represents a sparse feature column also known as categorical features.

  Instances are immutable. "Sparse" means the dictionary returned by
  InputBuilder holds a ("column_name", SparseTensor) pair for this column.
  Exactly one of bucket_size or lookup_config must be set, and bucket_size is
  required whenever is_integerized is True.

  Attributes:
    column_name: A string defining sparse column name.
    is_integerized: A bool; True means the feature is an integer that can be
      used directly as the id.
    bucket_size: An int that is > 0. The number of buckets.
    lookup_config: A _SparseIdLookupConfig defining feature-to-id lookup
      configuration.
    combiner: A string specifying how to reduce if the sparse column is
      multivalent. Currently "mean", "sqrtn" and "sum" are supported, with "sum"
      the default. "sqrtn" often achieves good accuracy, in particular with
      bag-of-words columns.
        * "sum": do not normalize features in the column
        * "mean": do l1 normalization on features in the column
        * "sqrtn": do l2 normalization on features in the column
      For more information: `tf.embedding_lookup_sparse`.
    dtype: Type of features, either `tf.string` or `tf.int64`.

  Raises:
    TypeError: if lookup_config is not a _SparseIdLookupConfig.
    ValueError: if above expectations about input fails.
  """

  def __new__(cls,
              column_name,
              is_integerized=False,
              bucket_size=None,
              lookup_config=None,
              combiner="sum",
              dtype=dtypes.string):
    # Constraints that only apply to integerized columns.
    if is_integerized:
      if bucket_size is None:
        raise ValueError("bucket_size must be set if is_integerized is True. "
                         "column_name: {}".format(column_name))
      if not dtype.is_integer:
        raise ValueError(
            "dtype must be an integer if is_integerized is True. "
            "dtype: {}, column_name: {}.".format(dtype, column_name))

    if dtype != dtypes.string and not dtype.is_integer:
      raise ValueError("dtype must be string or integer. "
                       "dtype: {}, column_name: {}".format(dtype, column_name))

    # bucket_size and lookup_config are mutually exclusive, and one is needed.
    if bucket_size is None:
      if lookup_config is None:
        raise ValueError("one of bucket_size or lookup_config must be set. "
                         "column_name: {}".format(column_name))
    else:
      if lookup_config:
        raise ValueError("one and only one of bucket_size or lookup_config "
                         "must be set. column_name: {}".format(column_name))
      if bucket_size < 1:
        raise ValueError("bucket_size must be at least 1. "
                         "bucket_size: {}, column_name: {}".format(
                             bucket_size, column_name))

    if lookup_config:
      if not isinstance(lookup_config, _SparseIdLookupConfig):
        raise TypeError(
            "lookup_config must be an instance of _SparseIdLookupConfig. "
            "Given one is in type {} for column_name {}".format(
                type(lookup_config), column_name))
      if lookup_config.vocabulary_file and lookup_config.vocab_size is None:
        raise ValueError("vocab_size must be defined. "
                         "column_name: {}".format(column_name))

    return super(_SparseColumn, cls).__new__(
        cls,
        column_name,
        is_integerized=is_integerized,
        bucket_size=bucket_size,
        lookup_config=lookup_config,
        combiner=combiner,
        dtype=dtype)

  @property
  def name(self):
    """Returns the column name."""
    return self.column_name

  @property
  def length(self):
    """Returns vocabulary or hash_bucket size."""
    if self.bucket_size is None:
      lookup_cfg = self.lookup_config
      return lookup_cfg.vocab_size + lookup_cfg.num_oov_buckets
    return self.bucket_size

  @property
  def config(self):
    """Returns the parsing spec: a `VarLenFeature` keyed by the column name."""
    return {self.column_name: parsing_ops.VarLenFeature(self.dtype)}

  @property
  def key(self):
    """Returns a string which will be used as a key when we do sorting."""
    return "{}".format(self)

  def id_tensor(self, input_tensor):
    """Returns the id tensor from the given transformed input_tensor."""
    return input_tensor

  # pylint: disable=unused-argument
  def weight_tensor(self, input_tensor):
    """Returns the weight tensor from the given transformed input_tensor."""
    return None

  # pylint: disable=unused-argument
  def _to_dnn_input_layer(self,
                          input_tensor,
                          weight_collections=None,
                          trainable=True,
                          output_rank=2):
    """Always raises: a bare sparse column cannot feed a DNN directly."""
    raise ValueError(
        "SparseColumn is not supported in DNN. "
        "Please use embedding_column or one_hot_column. column: {}".format(
            self))

  def _wide_embedding_lookup_arguments(self, input_tensor):
    """Packs ids, weights, vocabulary size, initializer and combiner."""
    return _LinearEmbeddingLookupArguments(
        input_tensor=self.id_tensor(input_tensor),
        weight_tensor=self.weight_tensor(input_tensor),
        vocab_size=self.length,
        initializer=init_ops.zeros_initializer(),
        combiner=self.combiner)

  def _get_input_sparse_tensor(self, input_tensor):
    """Returns input_tensor as a SparseTensor, converting it when dense."""
    if isinstance(input_tensor, sparse_tensor_py.SparseTensor):
      return input_tensor

    # Make no assumption about which values should be dropped: numeric tensors
    # use -1 as the ignore value so that valid indices are not excluded.
    ignore_value = "" if input_tensor.dtype == dtypes.string else -1
    dense_tensor = _reshape_real_valued_tensor(input_tensor, 2, self.name)
    return contrib_sparse_ops.dense_to_sparse_tensor(
        dense_tensor, ignore_value=ignore_value)

  def is_compatible(self, other_column):
    """Check compatibility of two sparse columns."""
    if self.lookup_config and other_column.lookup_config:
      return self.lookup_config == other_column.lookup_config

    # Without two lookup configs, fall back to comparing size and dtype.
    compatible = self.length == other_column.length
    if compatible:
      compatible = (self.dtype == other_column.dtype or
                    (self.dtype.is_integer and other_column.dtype.is_integer))
    if compatible:
      logging.warn("Column {} and {} may not have the same vocabulary.".format(
          self.name, other_column.name))
    return compatible

  @abc.abstractmethod
  def _do_transform(self, input_tensor):
    """Converts the sparse input into ids; implemented by subclasses."""

  def insert_transformed_feature(self, columns_to_tensors):
    """Handles sparse column to id conversion."""
    raw_tensor = columns_to_tensors[self.name]
    sparse_input = self._get_input_sparse_tensor(raw_tensor)
    columns_to_tensors[self] = self._do_transform(sparse_input)

  def _transform_feature(self, inputs):
    """Looks up this column's raw input by name and transforms it."""
    sparse_input = self._get_input_sparse_tensor(inputs.get(self.name))
    return self._do_transform(sparse_input)

  @property
  def _parse_example_spec(self):
    return self.config

  @property
  def _num_buckets(self):
    return self.length

  def _get_sparse_tensors(self, inputs, weight_collections=None,
                          trainable=None):
    """Returns the transformed input as an id/weight pair."""
    del weight_collections, trainable  # Unused.
    transformed = inputs.get(self)
    ids = self.id_tensor(transformed)
    weights = self.weight_tensor(transformed)
    return fc_core._CategoricalColumn.IdWeightPair(ids, weights)  # pylint: disable=protected-access
class _SparseColumnIntegerized(_SparseColumn):
  """See `sparse_column_with_integerized_feature`."""

  def _do_transform(self, input_tensor):
    """Reduces the sparse values modulo `bucket_size`.

    Indices and dense shape of the input are carried over unchanged.
    """
    bucketed_values = math_ops.mod(
        input_tensor.values, self.bucket_size, name="mod")
    indices = input_tensor.indices
    dense_shape = input_tensor.dense_shape
    return sparse_tensor_py.SparseTensor(indices, bucketed_values, dense_shape)
def sparse_column_with_integerized_feature(column_name,
                                           bucket_size,
                                           combiner="sum",
                                           dtype=dtypes.int64):
  """Creates an integerized _SparseColumn.

  Use this when the features are already integer IDs, i.e. the incoming value
  is itself what should be used as the id.

  Typically the input is a contiguous range of integer indexes, but that is not
  required. Be aware that large gaps of unused integers still take up space
  downstream: for instance, a one-hot tensor built from this column has an
  always-zero entry for every unused integer.

  Args:
    column_name: A string defining sparse column name.
    bucket_size: An int that is >= 1. The number of buckets. It should be bigger
      than maximum feature. In other words features in this column should be an
      int64 in range [0, bucket_size)
    combiner: A string specifying how to reduce if the sparse column is
      multivalent. Currently "mean", "sqrtn" and "sum" are supported, with "sum"
      the default. "sqrtn" often achieves good accuracy, in particular with
      bag-of-words columns.
        * "sum": do not normalize features in the column
        * "mean": do l1 normalization on features in the column
        * "sqrtn": do l2 normalization on features in the column
      For more information: `tf.embedding_lookup_sparse`.
    dtype: Type of features. It should be an integer type. Default value is
      dtypes.int64.

  Returns:
    An integerized _SparseColumn definition.

  Raises:
    ValueError: bucket_size is less than 1.
    ValueError: dtype is not integer.
  """
  integerized_column = _SparseColumnIntegerized(
      column_name,
      is_integerized=True,
      bucket_size=bucket_size,
      combiner=combiner,
      dtype=dtype)
  return integerized_column
class _SparseColumnHashed(_SparseColumn):
  """See `sparse_column_with_hash_bucket`."""

  def __new__(cls,
              column_name,
              is_integerized=False,
              bucket_size=None,
              lookup_config=None,
              combiner="sum",
              dtype=dtypes.string,
              hash_keys=None):
    """Validates `hash_keys` and builds the column.

    Args:
      column_name: A string defining sparse column name.
      is_integerized: Forwarded to `_SparseColumn`.
      bucket_size: Forwarded to `_SparseColumn`; the number of hash buckets.
      lookup_config: Forwarded to `_SparseColumn`.
      combiner: Forwarded to `_SparseColumn`.
      dtype: Forwarded to `_SparseColumn`.
      hash_keys: None, or a non-empty list whose elements are two-element lists
        of integers.

    Returns:
      A new `_SparseColumnHashed` with `hash_keys` attached as an attribute.

    Raises:
      ValueError: if `hash_keys` is given but is not a non-empty list of
        integer pairs, or if `_SparseColumn` rejects the other arguments.
    """
    if hash_keys is not None:
      if not isinstance(hash_keys, list) or not hash_keys:
        raise ValueError("hash_keys must be a non-empty list.")
      # six.integer_types also covers Python 2 `long`, which a plain `int`
      # check rejects; keys are documented as uint64 and can exceed sys.maxint.
      if (any(not isinstance(key_pair, list) for key_pair in hash_keys) or
          any(len(key_pair) != 2 for key_pair in hash_keys) or
          any(not isinstance(key, six.integer_types)
              for key in nest.flatten(hash_keys))):
        raise ValueError(
            "Each element of hash_keys must be a pair of integers.")
    obj = super(_SparseColumnHashed, cls).__new__(
        cls,
        column_name,
        is_integerized=is_integerized,
        bucket_size=bucket_size,
        lookup_config=lookup_config,
        combiner=combiner,
        dtype=dtype)
    # NOTE(review): hash_keys is a plain instance attribute, not a namedtuple
    # field, so tuple equality and the string `key` do not reflect it.
    obj.hash_keys = hash_keys
    return obj

  def _do_transform(self, input_tensor):
    """Hashes the sparse values into `bucket_size` buckets.

    Integer inputs are converted to strings first. Without `hash_keys` a single
    fast hash is used. With `hash_keys`, one strong hash is computed per key
    pair and the resulting sparse tensors are concatenated along axis 1.
    """
    if self.dtype.is_integer:
      sparse_values = string_ops.as_string(input_tensor.values)
    else:
      sparse_values = input_tensor.values

    if self.hash_keys:
      result = []
      for key in self.hash_keys:
        sparse_id_values = string_ops.string_to_hash_bucket_strong(
            sparse_values, self.bucket_size, key)
        result.append(
            sparse_tensor_py.SparseTensor(input_tensor.indices,
                                          sparse_id_values,
                                          input_tensor.dense_shape))
      return sparse_ops.sparse_concat(axis=1, sp_inputs=result, name="lookup")
    else:
      sparse_id_values = string_ops.string_to_hash_bucket_fast(
          sparse_values, self.bucket_size, name="lookup")
      return sparse_tensor_py.SparseTensor(
          input_tensor.indices, sparse_id_values, input_tensor.dense_shape)
def sparse_column_with_hash_bucket(column_name,
                                   hash_bucket_size,
                                   combiner="sum",
                                   dtype=dtypes.string,
                                   hash_keys=None):
  """Creates a _SparseColumn with hashed bucket configuration.

  Use this when your sparse features are in string or integer format, but you
  don't have a vocab file that maps each value to an integer ID.
  output_id = Hash(input_feature_string) % bucket_size

  When hash_keys is set, multiple integer IDs would be created with each key
  pair in the `hash_keys`. This is useful to reduce the collision of hashed ids.

  Args:
    column_name: A string defining sparse column name.
    hash_bucket_size: An int that is >= 1. The number of buckets.
    combiner: A string specifying how to reduce if the sparse column is
      multivalent. Currently "mean", "sqrtn" and "sum" are supported, with "sum"
      the default. "sqrtn" often achieves good accuracy, in particular with
      bag-of-words columns.
        * "sum": do not normalize features in the column
        * "mean": do l1 normalization on features in the column
        * "sqrtn": do l2 normalization on features in the column
      For more information: `tf.embedding_lookup_sparse`.
    dtype: The type of features. Only string and integer types are supported.
    hash_keys: The hash keys to use. It is a list of lists of two uint64s. If
      None, simple and fast hashing algorithm is used. Otherwise, multiple
      strong hash ids would be produced with each two uint64s in this argument.

  Returns:
    A _SparseColumn with hashed bucket configuration

  Raises:
    ValueError: hash_bucket_size is None or less than 1.
    ValueError: dtype is neither string nor integer.
    ValueError: hash_keys is given but is not a non-empty list of integer
      pairs.
  """
  return _SparseColumnHashed(
      column_name,
      bucket_size=hash_bucket_size,
      combiner=combiner,
      dtype=dtype,
      hash_keys=hash_keys)
class _SparseColumnKeys(_SparseColumn):
  """See `sparse_column_with_keys`."""

  def _do_transform(self, input_tensor):
    """Maps each feature value to its index in the configured keys."""
    lookup_cfg = self.lookup_config
    return lookup.index_table_from_tensor(
        mapping=tuple(lookup_cfg.keys),
        default_value=lookup_cfg.default_value,
        dtype=self.dtype,
        name="lookup").lookup(input_tensor)


def sparse_column_with_keys(
    column_name, keys, default_value=-1, combiner="sum", dtype=dtypes.string):
  """Creates a _SparseColumn with keys.

  Look up logic is as follows:
  lookup_id = index_of_feature_in_keys if feature in keys else default_value

  Args:
    column_name: A string defining sparse column name.
    keys: A list or tuple defining vocabulary. Must be castable to `dtype`.
    default_value: The value to use for out-of-vocabulary feature values.
      Default is -1.
    combiner: A string specifying how to reduce if the sparse column is
      multivalent. Currently "mean", "sqrtn" and "sum" are supported, with "sum"
      the default. "sqrtn" often achieves good accuracy, in particular with
      bag-of-words columns.
        * "sum": do not normalize features in the column
        * "mean": do l1 normalization on features in the column
        * "sqrtn": do l2 normalization on features in the column
      For more information: `tf.embedding_lookup_sparse`.
    dtype: Type of features. Only integer and string are supported.

  Returns:
    A _SparseColumnKeys with keys configuration.
  """
  # Freeze the vocabulary so the resulting column stays immutable.
  vocabulary = tuple(keys)
  keys_config = _SparseIdLookupConfig(
      keys=vocabulary,
      vocab_size=len(vocabulary),
      default_value=default_value)
  return _SparseColumnKeys(
      column_name,
      lookup_config=keys_config,
      combiner=combiner,
      dtype=dtype)
class _SparseColumnVocabulary(_SparseColumn):
  """See `sparse_column_with_vocabulary_file`."""

  def _do_transform(self, st):
    """Looks up each value of the sparse tensor `st` in the vocabulary file.

    Integer inputs are converted to strings before the lookup.
    """
    if self.dtype.is_integer:
      string_values = string_ops.as_string(st.values)
      lookup_input = sparse_tensor_py.SparseTensor(
          st.indices, string_values, st.dense_shape)
    else:
      lookup_input = st

    lookup_cfg = self.lookup_config
    vocabulary_table = lookup.index_table_from_file(
        vocabulary_file=lookup_cfg.vocabulary_file,
        num_oov_buckets=lookup_cfg.num_oov_buckets,
        vocab_size=lookup_cfg.vocab_size,
        default_value=lookup_cfg.default_value,
        name=self.name + "_lookup")
    return vocabulary_table.lookup(lookup_input)


def sparse_column_with_vocabulary_file(column_name,
                                       vocabulary_file,
                                       num_oov_buckets=0,
                                       vocab_size=None,
                                       default_value=-1,
                                       combiner="sum",
                                       dtype=dtypes.string):
  """Creates a _SparseColumn with vocabulary file configuration.

  Use this when your sparse features are in string or integer format, and you
  have a vocab file that maps each value to an integer ID.
  output_id = LookupIdFromVocab(input_feature_string)

  Args:
    column_name: A string defining sparse column name.
    vocabulary_file: The vocabulary filename.
    num_oov_buckets: The number of out-of-vocabulary buckets. If zero all out of
      vocabulary features will be ignored.
    vocab_size: Number of the elements in the vocabulary.
    default_value: The value to use for out-of-vocabulary feature values.
      Defaults to -1.
    combiner: A string specifying how to reduce if the sparse column is
      multivalent. Currently "mean", "sqrtn" and "sum" are supported, with "sum"
      the default. "sqrtn" often achieves good accuracy, in particular with
      bag-of-words columns.
        * "sum": do not normalize features in the column
        * "mean": do l1 normalization on features in the column
        * "sqrtn": do l2 normalization on features in the column
      For more information: `tf.embedding_lookup_sparse`.
    dtype: The type of features. Only string and integer types are supported.

  Returns:
    A _SparseColumn with vocabulary file configuration.

  Raises:
    ValueError: vocab_size is not defined.
    ValueError: dtype is neither string nor integer.
  """
  if vocab_size is None:
    raise ValueError("vocab_size should be defined. "
                     "column_name: {}".format(column_name))

  file_config = _SparseIdLookupConfig(
      vocabulary_file=vocabulary_file,
      num_oov_buckets=num_oov_buckets,
      vocab_size=vocab_size,
      default_value=default_value)
  return _SparseColumnVocabulary(
      column_name,
      lookup_config=file_config,
      combiner=combiner,
      dtype=dtype)
class _WeightedSparseColumn(
    _FeatureColumn,
    fc_core._CategoricalColumn,  # pylint: disable=protected-access
    collections.namedtuple("_WeightedSparseColumn",
                           ["sparse_id_column", "weight_column_name",
                            "dtype"])):
  """See `weighted_sparse_column`."""

  def __new__(cls, sparse_id_column, weight_column_name, dtype):
    return super(_WeightedSparseColumn, cls).__new__(
        cls, sparse_id_column, weight_column_name, dtype)

  @property
  def name(self):
    """Returns "<id column name>_weighted_by_<weight column name>"."""
    return "{}_weighted_by_{}".format(self.sparse_id_column.name,
                                      self.weight_column_name)

  @property
  def length(self):
    """Returns id size."""
    return self.sparse_id_column.length

  @property
  def config(self):
    """Returns the id column's parsing config plus the weight feature."""
    parsing_spec = _get_feature_config(self.sparse_id_column)
    weight_spec = {
        self.weight_column_name: parsing_ops.VarLenFeature(self.dtype)
    }
    parsing_spec.update(weight_spec)
    return parsing_spec

  @property
  def lookup_config(self):
    """Returns the lookup config of the wrapped id column."""
    return self.sparse_id_column.lookup_config

  @property
  def key(self):
    """Returns a string which will be used as a key when we do sorting."""
    return "{}".format(self)

  def id_tensor(self, input_tensor):
    """Returns the id tensor from the given transformed input_tensor."""
    return input_tensor[0]

  def weight_tensor(self, input_tensor):
    """Returns the weight tensor from the given transformed input_tensor."""
    return input_tensor[1]

  # pylint: disable=unused-argument
  def _to_dnn_input_layer(self,
                          input_tensor,
                          weight_collections=None,
                          trainable=True,
                          output_rank=2):
    """Always raises: a weighted sparse column cannot feed a DNN directly."""
    raise ValueError(
        "WeightedSparseColumn is not supported in DNN. "
        "Please use embedding_column or one_hot_column. column: {}".format(
            self))

  def _wide_embedding_lookup_arguments(self, input_tensor):
    """Packs ids, weights, vocabulary size, initializer and combiner."""
    return _LinearEmbeddingLookupArguments(
        input_tensor=self.id_tensor(input_tensor),
        weight_tensor=self.weight_tensor(input_tensor),
        vocab_size=self.length,
        initializer=init_ops.zeros_initializer(),
        combiner=self.sparse_id_column.combiner)

  def _do_transform(self, id_tensor, weight_tensor):
    """Returns `(id_tensor, weight_tensor)` with weights sparse and float."""
    # A regular (dense) Tensor is accepted for the weights; sparsify it.
    if not isinstance(weight_tensor, sparse_tensor_py.SparseTensor):
      weight_tensor = contrib_sparse_ops.dense_to_sparse_tensor(weight_tensor)
    if not self.dtype.is_floating:
      weight_tensor = math_ops.to_float(weight_tensor)
    return (id_tensor, weight_tensor)

  def insert_transformed_feature(self, columns_to_tensors):
    """Inserts a tuple with the id and weight tensors."""
    id_column = self.sparse_id_column
    # Transform the id column first unless that has already been done.
    if id_column not in columns_to_tensors:
      id_column.insert_transformed_feature(columns_to_tensors)

    raw_weights = columns_to_tensors[self.weight_column_name]
    columns_to_tensors[self] = self._do_transform(
        columns_to_tensors[id_column], raw_weights)

  def _transform_feature(self, inputs):
    """Fetches the id and weight inputs and transforms them."""
    return self._do_transform(
        inputs.get(self.sparse_id_column), inputs.get(self.weight_column_name))

  @property
  def _parse_example_spec(self):
    return self.config

  @property
  def _num_buckets(self):
    return self.length

  def _get_sparse_tensors(self, inputs, weight_collections=None,
                          trainable=None):
    """Returns the transformed input as an id/weight pair."""
    del weight_collections, trainable  # Unused.
    transformed = inputs.get(self)
    ids = self.id_tensor(transformed)
    weights = self.weight_tensor(transformed)
    return fc_core._CategoricalColumn.IdWeightPair(ids, weights)  # pylint: disable=protected-access

  def is_compatible(self, other_column):
    """Check compatibility with other sparse column."""
    # Compare against the wrapped id column when the other side is weighted.
    if isinstance(other_column, _WeightedSparseColumn):
      other_ids = other_column.sparse_id_column
    else:
      other_ids = other_column
    return self.sparse_id_column.is_compatible(other_ids)
def weighted_sparse_column(sparse_id_column,
                           weight_column_name,
                           dtype=dtypes.float32):
  """Creates a _SparseColumn by combining sparse_id_column with a weight column.

  Example:

    ```python
    sparse_feature = sparse_column_with_hash_bucket(column_name="sparse_col",
                                                    hash_bucket_size=1000)
    weighted_feature = weighted_sparse_column(sparse_id_column=sparse_feature,
                                              weight_column_name="weights_col")
    ```

    This configuration assumes that the input dictionary of the model contains
    the following two items:
      * (key="sparse_col", value=sparse_tensor) where sparse_tensor is
        a SparseTensor.
      * (key="weights_col", value=weights_tensor) where weights_tensor
        is a SparseTensor.
     The following are assumed to be true:
       * sparse_tensor.indices = weights_tensor.indices
       * sparse_tensor.dense_shape = weights_tensor.dense_shape

  Args:
    sparse_id_column: A `_SparseColumn` which is created by
      `sparse_column_with_*` functions.
    weight_column_name: A string defining a sparse column name which represents
      weight or value of the corresponding sparse id feature.
    dtype: Type of weights, such as `tf.float32`. Only floating and integer
      weights are supported.

  Returns:
    A _WeightedSparseColumn composed of two sparse features: one represents id,
    the other represents weight (value) of the id feature in that example.

  Raises:
    ValueError: if dtype is not convertible to float.
  """
  # Only integer and floating dtypes are accepted as weights.
  if dtype.is_integer or dtype.is_floating:
    return _WeightedSparseColumn(sparse_id_column, weight_column_name, dtype)
  raise ValueError("dtype is not convertible to float. Given {}".format(
      dtype))
class _OneHotColumn(
    _FeatureColumn,
    fc_core._DenseColumn,  # pylint: disable=protected-access
    collections.namedtuple("_OneHotColumn", ["sparse_id_column"])):
  """Represents a one-hot (or multi-hot) column for use in deep networks.

  Args:
    sparse_id_column: A _SparseColumn which is created by `sparse_column_with_*`
      function.
  """

  @property
  def name(self):
    return "{}_one_hot".format(self.sparse_id_column.name)

  @property
  def length(self):
    """Returns vocabulary or hash_bucket size of the wrapped column."""
    return self.sparse_id_column.length

  @property
  def config(self):
    """Returns the parsing config of the origin column."""
    return _get_feature_config(self.sparse_id_column)

  @property
  def key(self):
    """Returns a string which will be used as a key when we do sorting."""
    return "{}".format(self)

  def insert_transformed_feature(self, columns_to_tensors):
    """Used by the Transformer to prevent double transformations."""
    wrapped = self.sparse_id_column
    if wrapped not in columns_to_tensors:
      wrapped.insert_transformed_feature(columns_to_tensors)
    # The one-hot column reuses the wrapped column's transformed tensor as is.
    columns_to_tensors[self] = columns_to_tensors[wrapped]

  def _to_dnn_input_layer(self,
                          transformed_input_tensor,
                          unused_weight_collections=None,
                          unused_trainable=False,
                          output_rank=2):
    """Returns a Tensor as an input to the first layer of neural network.

    When the wrapped column supplies a weight tensor, ids and weights are
    merged and densified; otherwise ids are one-hot encoded and summed over
    the last axis.

    Args:
      transformed_input_tensor: A tensor that has undergone the transformations
        in `insert_transformed_feature`. Rank should be >= `output_rank`.
      unused_weight_collections: Unused. One hot encodings are not variable.
      unused_trainable: Unused. One hot encodings are not trainable.
      output_rank: the desired rank of the output `Tensor`.

    Returns:
      A multi-hot Tensor to be fed into the first layer of neural network.
    """
    wrapped = self.sparse_id_column

    # Reshape ID column to `output_rank`.
    ids = wrapped.id_tensor(transformed_input_tensor)
    # pylint: disable=protected-access
    ids = layers._inner_flatten(ids, output_rank)

    weights = wrapped.weight_tensor(transformed_input_tensor)
    if weights is None:
      dense_ids = sparse_ops.sparse_tensor_to_dense(ids, default_value=-1)

      # One hot must be float for tf.concat reasons since all other inputs to
      # input_layer are float32.
      one_hot = array_ops.one_hot(
          dense_ids, depth=self.length, on_value=1.0, off_value=0.0)

      # Reduce to get a multi-hot per example.
      return math_ops.reduce_sum(
          one_hot, reduction_indices=[output_rank - 1])

    merged = sparse_ops.sparse_merge(sp_ids=ids,
                                     sp_values=weights,
                                     vocab_size=self.length)
    # Remove (?, -1) index
    merged = sparse_ops.sparse_slice(
        merged,
        [0, 0],
        merged.dense_shape)
    return sparse_ops.sparse_tensor_to_dense(merged)

  @property
  def _variable_shape(self):
    return tensor_shape.TensorShape([self.length])

  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
    del weight_collections, trainable  # Unused.
    return inputs.get(self)

  def _transform_feature(self, inputs):
    return self._to_dnn_input_layer(inputs.get(self.sparse_id_column))

  @property
  def _parse_example_spec(self):
    return self.config
class _EmbeddingColumn(
    _FeatureColumn,
    fc_core._DenseColumn,  # pylint: disable=protected-access
    collections.namedtuple("_EmbeddingColumn", [
        "sparse_id_column", "dimension", "combiner", "initializer",
        "ckpt_to_load_from", "tensor_name_in_ckpt", "shared_embedding_name",
        "shared_vocab_size", "max_norm", "trainable"
    ])):
  """Represents an embedding column.

  Args:
    sparse_id_column: A `_SparseColumn` which is created by
      `sparse_column_with_*` or `weighted_sparse_column` functions.
    dimension: An integer specifying dimension of the embedding.
    combiner: A string specifying how to reduce if there are multiple entries
      in a single row. Currently "mean", "sqrtn" and "sum" are supported, with
      "mean" the default. "sqrtn" often achieves good accuracy, in particular
      with bag-of-words columns. Each of these can be thought of as an
      example-level normalization on the column:
        * "sum": do not normalize features in the column
        * "mean": do l1 normalization on features in the column
        * "sqrtn": do l2 normalization on features in the column
      For more information: `tf.embedding_lookup_sparse`.
    initializer: A variable initializer function to be used in embedding
      variable initialization. If not specified, defaults to
      `tf.truncated_normal_initializer` with mean 0.0 and standard deviation
      1/sqrt(sparse_id_column.length).
    ckpt_to_load_from: (Optional). String representing checkpoint name/pattern
      to restore the column weights. Required if `tensor_name_in_ckpt` is not
      None.
    tensor_name_in_ckpt: (Optional). Name of the `Tensor` in the provided
      checkpoint from which to restore the column weights. Required if
      `ckpt_to_load_from` is not None.
    shared_embedding_name: (Optional). The common name for shared embedding.
    shared_vocab_size: (Optional). The common vocab_size used for shared
      embedding space.
    max_norm: (Optional). If not None, embedding values are l2-normalized to
      the value of max_norm.
    trainable: (Optional). Should the embedding be trainable. Default is True.

  Raises:
    ValueError: if `initializer` is specified and is not callable. Also,
      if only one of `ckpt_to_load_from` and `tensor_name_in_ckpt` is specified.
  """

  def __new__(cls,
              sparse_id_column,
              dimension,
              combiner="mean",
              initializer=None,
              ckpt_to_load_from=None,
              tensor_name_in_ckpt=None,
              shared_embedding_name=None,
              shared_vocab_size=None,
              max_norm=None,
              trainable=True):
    if not (initializer is None or callable(initializer)):
      raise ValueError("initializer must be callable if specified. "
                       "Embedding of column_name: {}".format(
                           sparse_id_column.name))

    # The checkpoint path and tensor name only make sense as a pair.
    has_ckpt = ckpt_to_load_from is not None
    has_tensor_name = tensor_name_in_ckpt is not None
    if has_ckpt != has_tensor_name:
      raise ValueError("Must specify both `ckpt_to_load_from` and "
                       "`tensor_name_in_ckpt` or none of them.")

    if initializer is None:
      logging.warn("The default stddev value of initializer will change from "
                   "\"1/sqrt(vocab_size)\" to \"1/sqrt(dimension)\" after "
                   "2017/02/25.")
      # Default: truncated normal scaled by the wrapped column's id-space size.
      stddev = 1 / math.sqrt(sparse_id_column.length)
      initializer = init_ops.truncated_normal_initializer(
          mean=0.0, stddev=stddev)

    return super(_EmbeddingColumn, cls).__new__(
        cls, sparse_id_column, dimension, combiner, initializer,
        ckpt_to_load_from, tensor_name_in_ckpt, shared_embedding_name,
        shared_vocab_size, max_norm, trainable)

  @property
  def name(self):
    template = ("{}_embedding" if self.shared_embedding_name is None
                else "{}_shared_embedding")
    return template.format(self.sparse_id_column.name)

  @property
  def length(self):
    """Returns id size: the shared vocab size if set, else the column's own."""
    if self.shared_vocab_size is not None:
      return self.shared_vocab_size
    return self.sparse_id_column.length

  @property
  def config(self):
    return _get_feature_config(self.sparse_id_column)

  @property
  def key(self):
    """Returns a string which will be used as a key when we do sorting."""
    return self._key_without_properties(["initializer"])

  def insert_transformed_feature(self, columns_to_tensors):
    wrapped = self.sparse_id_column
    if wrapped not in columns_to_tensors:
      wrapped.insert_transformed_feature(columns_to_tensors)
    # The embedding column reuses the wrapped column's transformed tensor.
    columns_to_tensors[self] = columns_to_tensors[wrapped]

  def _deep_embedding_lookup_arguments(self, input_tensor):
    wrapped = self.sparse_id_column
    return _DeepEmbeddingLookupArguments(
        input_tensor=wrapped.id_tensor(input_tensor),
        weight_tensor=wrapped.weight_tensor(input_tensor),
        vocab_size=self.length,
        dimension=self.dimension,
        initializer=self.initializer,
        combiner=self.combiner,
        shared_embedding_name=self.shared_embedding_name,
        hash_key=None,
        max_norm=self.max_norm,
        trainable=self.trainable)

  def _checkpoint_path(self):
    """Returns `(ckpt_to_load_from, tensor_name_in_ckpt)`, or None if unset."""
    if self.ckpt_to_load_from is None:
      return None
    return self.ckpt_to_load_from, self.tensor_name_in_ckpt

  # pylint: disable=unused-argument
  def _wide_embedding_lookup_arguments(self, input_tensor):
    raise ValueError("Column {} is not supported in linear models. "
                     "Please use sparse_column.".format(self))

  @property
  def _variable_shape(self):
    return tensor_shape.TensorShape([self.dimension])

  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
    lookup_args = self._deep_embedding_lookup_arguments(inputs.get(self))
    return _embeddings_from_arguments(
        self, lookup_args, weight_collections, trainable)

  def _transform_feature(self, inputs):
    return inputs.get(self.sparse_id_column)

  @property
  def _parse_example_spec(self):
    return self.config
def _is_variable(v):
  """Returns true if `v` is a `Variable` or a `ResourceVariable`."""
  variable_types = (variables.Variable,
                    resource_variable_ops.ResourceVariable)
  return isinstance(v, variable_types)


def _embeddings_from_arguments(column,
                               args,
                               weight_collections,
                               trainable,
                               output_rank=2):
  """Returns embeddings for a column based on the computed arguments.

  Args:
   column: the column object; its `name` and `_checkpoint_path()` are used.
   args: the _DeepEmbeddingLookupArguments for this column.
   weight_collections: collections to store weights in.
   trainable: whether these embeddings should be trainable.
   output_rank: the desired rank of the returned `Tensor`. Inner dimensions will
     be combined to produce the desired rank.

  Returns:
   the embeddings.

  Raises:
   ValueError: if not possible to create, i.e. the shared-embedding collection
     holds more than one variable or a variable of a different shape.
  """
  # pylint: disable=protected-access
  input_tensor = layers._inner_flatten(args.input_tensor, output_rank)
  weight_tensor = (
      None if args.weight_tensor is None
      else layers._inner_flatten(args.weight_tensor, output_rank))
  # pylint: enable=protected-access

  is_trainable = trainable and args.trainable

  # This option is only enabled for scattered_embedding_column.
  if args.hash_key:
    # A single flat parameter vector; components are located by hashing.
    scattered_weights = contrib_variables.model_variable(
        name="weights",
        shape=[args.vocab_size],
        dtype=dtypes.float32,
        initializer=args.initializer,
        trainable=is_trainable,
        collections=weight_collections)

    return embedding_ops.scattered_embedding_lookup_sparse(
        scattered_weights,
        input_tensor,
        args.dimension,
        hash_key=args.hash_key,
        combiner=args.combiner,
        name="lookup")

  if args.shared_embedding_name is None:
    embeddings = contrib_variables.model_variable(
        name="weights",
        shape=[args.vocab_size, args.dimension],
        dtype=dtypes.float32,
        initializer=args.initializer,
        trainable=is_trainable,
        collections=weight_collections)
  else:
    # Shared embeddings are registered in a per-name graph collection so that
    # later columns using the same name reuse the variable created first.
    collection_name = (
        "SHARED_EMBEDDING_COLLECTION_" + args.shared_embedding_name.upper())
    graph = ops.get_default_graph()
    existing = graph.get_collection_ref(collection_name)
    shape = [args.vocab_size, args.dimension]
    if not existing:
      embeddings = contrib_variables.model_variable(
          name=args.shared_embedding_name,
          shape=shape,
          dtype=dtypes.float32,
          initializer=args.initializer,
          trainable=is_trainable,
          collections=weight_collections)
      graph.add_to_collection(collection_name, embeddings)
    elif len(existing) > 1:
      raise ValueError(
          "Collection %s can only contain one "
          "(partitioned) variable." % collection_name)
    else:
      embeddings = existing[0]
      if embeddings.get_shape() != shape:
        raise ValueError(
            "The embedding variable with name {} already "
            "exists, but its shape does not match required "
            "embedding shape here. Please make sure to use "
            "different shared_embedding_name for different "
            "shared embeddings.".format(args.shared_embedding_name))

  # Normalize to a list of variables; non-variables are asked for their list.
  if _is_variable(embeddings):
    embeddings = [embeddings]
  else:
    embeddings = embeddings._get_variable_list()  # pylint: disable=protected-access
  # pylint: disable=protected-access
  _maybe_restore_from_checkpoint(column._checkpoint_path(), embeddings)
  return embedding_ops.safe_embedding_lookup_sparse(
      embeddings,
      input_tensor,
      sparse_weights=weight_tensor,
      combiner=args.combiner,
      name=column.name + "weights",
      max_norm=args.max_norm)
def _maybe_restore_from_checkpoint(checkpoint_path, variable):
  """Initializes `variable` from a checkpoint when a path is provided.

  Args:
    checkpoint_path: None, or a `(path, tensor_name)` pair.
    variable: a list of variables; a single-element list is unwrapped before
      being handed to `init_from_checkpoint`.
  """
  if checkpoint_path is None:
    return
  path, tensor_name = checkpoint_path
  weights_to_restore = variable[0] if len(variable) == 1 else variable
  checkpoint_utils.init_from_checkpoint(path,
                                        {tensor_name: weights_to_restore})


def one_hot_column(sparse_id_column):
  """Creates an `_OneHotColumn` for a one-hot or multi-hot repr in a DNN.

  Args:
      sparse_id_column: A _SparseColumn which is created by
        `sparse_column_with_*`
        or crossed_column functions. Note that `combiner` defined in
        `sparse_id_column` is ignored.

  Returns:
    An _OneHotColumn.
  """
  column = _OneHotColumn(sparse_id_column)
  return column
def embedding_column(sparse_id_column,
                     dimension,
                     combiner="mean",
                     initializer=None,
                     ckpt_to_load_from=None,
                     tensor_name_in_ckpt=None,
                     max_norm=None,
                     trainable=True):
  """Creates an `_EmbeddingColumn` for feeding sparse data into a DNN.

  Args:
    sparse_id_column: A `_SparseColumn` which is created by for example
      `sparse_column_with_*` or crossed_column functions. Note that `combiner`
      defined in `sparse_id_column` is ignored.
    dimension: An integer specifying dimension of the embedding.
    combiner: A string specifying how to reduce if there are multiple entries
      in a single row. Currently "mean", "sqrtn" and "sum" are supported, with
      "mean" the default. "sqrtn" often achieves good accuracy, in particular
      with bag-of-words columns. Each of these can be thought of as an
      example-level normalization on the column:
        * "sum": do not normalize
        * "mean": do l1 normalization
        * "sqrtn": do l2 normalization
      For more information: `tf.embedding_lookup_sparse`.
    initializer: A variable initializer function to be used in embedding
      variable initialization. If not specified, defaults to
      `tf.truncated_normal_initializer` with mean 0.0 and standard deviation
      1/sqrt(sparse_id_column.length).
    ckpt_to_load_from: (Optional). String representing checkpoint name/pattern
      to restore the column weights. Required if `tensor_name_in_ckpt` is not
      None.
    tensor_name_in_ckpt: (Optional). Name of the `Tensor` in the provided
      checkpoint from which to restore the column weights. Required if
      `ckpt_to_load_from` is not None.
    max_norm: (Optional). If not None, embedding values are l2-normalized to
      the value of max_norm.
    trainable: (Optional). Should the embedding be trainable. Default is True.

  Returns:
    An `_EmbeddingColumn`.
  """
  # The shared-embedding fields are left at their defaults: this factory
  # always builds a stand-alone (non-shared) embedding.
  return _EmbeddingColumn(
      sparse_id_column=sparse_id_column,
      dimension=dimension,
      combiner=combiner,
      initializer=initializer,
      ckpt_to_load_from=ckpt_to_load_from,
      tensor_name_in_ckpt=tensor_name_in_ckpt,
      max_norm=max_norm,
      trainable=trainable)
def shared_embedding_columns(sparse_id_columns,
                             dimension,
                             combiner="mean",
                             shared_embedding_name=None,
                             initializer=None,
                             ckpt_to_load_from=None,
                             tensor_name_in_ckpt=None,
                             max_norm=None,
                             trainable=True):
  """Creates a list of `_EmbeddingColumn` sharing the same embedding.

  Args:
    sparse_id_columns: An iterable of `_SparseColumn`, such as those created by
      `sparse_column_with_*` or crossed_column functions. Note that `combiner`
      defined in each sparse_id_column is ignored.
    dimension: An integer specifying dimension of the embedding.
    combiner: A string specifying how to reduce if there are multiple entries
      in a single row. Currently "mean", "sqrtn" and "sum" are supported, with
      "mean" the default. "sqrtn" often achieves good accuracy, in particular
      with bag-of-words columns. Each of these can be thought of as an
      example-level normalization on the column:
        * "sum": do not normalize
        * "mean": do l1 normalization
        * "sqrtn": do l2 normalization
      For more information: `tf.embedding_lookup_sparse`.
    shared_embedding_name: (Optional). A string specifying the name of shared
      embedding weights. This will be needed if you want to reference the shared
      embedding separately from the generated `_EmbeddingColumn`. When empty and
      more than one column is given, a name is derived from the sorted column
      names.
    initializer: A variable initializer function to be used in embedding
      variable initialization. If not specified, defaults to
      `tf.truncated_normal_initializer` with mean 0.0 and standard deviation
      1/sqrt(sparse_id_columns[0].length).
    ckpt_to_load_from: (Optional). String representing checkpoint name/pattern
      to restore the column weights. Required if `tensor_name_in_ckpt` is not
      None.
    tensor_name_in_ckpt: (Optional). Name of the `Tensor` in the provided
      checkpoint from which to restore the column weights. Required if
      `ckpt_to_load_from` is not None.
    max_norm: (Optional). If not None, embedding values are l2-normalized to
      the value of max_norm.
    trainable: (Optional). Should the embedding be trainable. Default is True.

  Returns:
    A tuple of `_EmbeddingColumn` with shared embedding space. When exactly one
    column is given, a single-element list is returned instead.

  Raises:
    ValueError: if sparse_id_columns is empty, or its elements are not
      compatible with each other.
    TypeError: if `sparse_id_columns` is not a sequence or is a string. If at
      least one element of `sparse_id_columns` is not a `SparseColumn` or a
      `WeightedSparseColumn`.
  """
  # The `collections.Sequence` alias was removed in Python 3.10. The ABCs live
  # in `collections.abc` on Python 3 and directly in `collections` on Python 2.
  try:
    from collections import abc as collections_abc  # pylint: disable=g-import-not-at-top
  except ImportError:
    collections_abc = collections

  if (not isinstance(sparse_id_columns, collections_abc.Sequence) or
      isinstance(sparse_id_columns, six.string_types)):
    raise TypeError(
        "sparse_id_columns must be a non-string sequence (ex: list or tuple) "
        "instead of type {}.".format(type(sparse_id_columns)))
  if len(sparse_id_columns) < 1:
    raise ValueError("The input sparse_id_columns should have at least one "
                     "element.")
  for sparse_id_column in sparse_id_columns:
    if not isinstance(sparse_id_column,
                      (_SparseColumn, _WeightedSparseColumn)):
      raise TypeError("Elements of sparse_id_columns must be _SparseColumn or "
                      "_WeightedSparseColumn, but {} is not."
                      .format(sparse_id_column))

  if len(sparse_id_columns) == 1:
    # A single column has nothing to share its vocabulary with, so no
    # shared_vocab_size is set. The list return type is kept for callers.
    return [
        _EmbeddingColumn(sparse_id_columns[0], dimension, combiner, initializer,
                         ckpt_to_load_from, tensor_name_in_ckpt,
                         shared_embedding_name, max_norm=max_norm,
                         trainable=trainable)]

  # Check compatibility of every other column against the first one. When the
  # first column is weighted, its own `is_compatible` is the one consulted.
  first_column = sparse_id_columns[0]
  for column in sparse_id_columns[1:]:
    if isinstance(first_column, _WeightedSparseColumn):
      compatible = first_column.is_compatible(column)
    else:
      compatible = column.is_compatible(first_column)
    if not compatible:
      raise ValueError("The input sparse id columns are not compatible.")

  # Construct the shared name and size for shared embedding space.
  if not shared_embedding_name:
    # Sort the columns so that shared_embedding_name will be deterministic
    # even if users pass in unsorted columns from a dict or something.
    # Since they are different classes, ordering is SparseColumns first,
    # then WeightedSparseColumns.
    sparse_columns = [
        column for column in sparse_id_columns
        if isinstance(column, _SparseColumn)]
    weighted_sparse_columns = [
        column for column in sparse_id_columns
        if not isinstance(column, _SparseColumn)]
    sorted_columns = sorted(sparse_columns) + sorted(
        weighted_sparse_columns, key=lambda x: x.name)
    # At most three column names are spelled out; the rest are only counted.
    shared_embedding_name = "_".join(
        [column.name for column in sorted_columns[0:3]])
    num_others = len(sorted_columns) - 3
    if num_others > 0:
      shared_embedding_name += "_plus_{}_others".format(num_others)
    shared_embedding_name += "_shared_embedding"
  shared_vocab_size = first_column.length

  return tuple(
      _EmbeddingColumn(column, dimension, combiner, initializer,
                       ckpt_to_load_from, tensor_name_in_ckpt,
                       shared_embedding_name, shared_vocab_size,
                       max_norm=max_norm, trainable=trainable)
      for column in sparse_id_columns)
class _ScatteredEmbeddingColumn(
    _FeatureColumn,
    fc_core._DenseColumn,  # pylint: disable=protected-access
    collections.namedtuple("_ScatteredEmbeddingColumn", [
        "column_name", "size", "dimension", "hash_key", "combiner",
        "initializer"
    ])):
  """See `scattered_embedding_column`."""

  def __new__(cls,
              column_name,
              size,
              dimension,
              hash_key,
              combiner="sqrtn",
              initializer=None):
    if not (initializer is None or callable(initializer)):
      raise ValueError("initializer must be callable if specified. "
                       "column_name: {}".format(column_name))
    if initializer is None:
      logging.warn("The default stddev value of initializer will change from "
                   "\"0.1\" to \"1/sqrt(dimension)\" after 2017/02/25.")
      # Default: truncated normal with a fixed standard deviation of 0.1.
      stddev = 0.1
      initializer = init_ops.truncated_normal_initializer(
          mean=0.0, stddev=stddev)
    return super(_ScatteredEmbeddingColumn, cls).__new__(
        cls, column_name, size, dimension, hash_key, combiner, initializer)

  @property
  def name(self):
    return "{}_scattered_embedding".format(self.column_name)

  @property
  def config(self):
    # The raw feature is parsed as a variable-length string feature.
    feature_spec = parsing_ops.VarLenFeature(dtypes.string)
    return {self.column_name: feature_spec}

  @property
  def key(self):
    """Returns a string which will be used as a key when we do sorting."""
    return self._key_without_properties(["initializer"])

  def insert_transformed_feature(self, columns_to_tensors):
    # No transformation: the raw feature tensor is reused under this column.
    columns_to_tensors[self] = columns_to_tensors[self.column_name]

  def _deep_embedding_lookup_arguments(self, input_tensor):
    return _DeepEmbeddingLookupArguments(
        input_tensor=input_tensor,
        weight_tensor=None,
        vocab_size=self.size,
        initializer=self.initializer,
        combiner=self.combiner,
        dimension=self.dimension,
        shared_embedding_name=None,
        hash_key=self.hash_key,
        max_norm=None,
        trainable=True)

  @property
  def _variable_shape(self):
    return tensor_shape.TensorShape([self.dimension])

  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
    lookup_args = self._deep_embedding_lookup_arguments(inputs.get(self))
    return _embeddings_from_arguments(
        self, lookup_args, weight_collections, trainable)

  def _transform_feature(self, inputs):
    return inputs.get(self.column_name)

  @property
  def _parse_example_spec(self):
    return self.config
def scattered_embedding_column(column_name,
                               size,
                               dimension,
                               hash_key,
                               combiner="mean",
                               initializer=None):
  """Creates an embedding column of a sparse feature using parameter hashing.

  This is a useful shorthand when you have a sparse feature you want to use an
  embedding for, but also want to hash the embedding's values in each dimension
  to a variable based on a different hash.

  Specifically, the i-th embedding component of a value v is found by retrieving
  an embedding weight whose index is a fingerprint of the pair (v,i).

  An embedding column with sparse_column_with_hash_bucket such as

      embedding_column(
          sparse_column_with_hash_bucket(column_name, bucket_size),
          dimension)

  could be replaced by

      scattered_embedding_column(
          column_name,
          size=bucket_size * dimension,
          dimension=dimension,
          hash_key=tf.contrib.layers.SPARSE_FEATURE_CROSS_DEFAULT_HASH_KEY)

  for the same number of embedding parameters. This should hopefully reduce the
  impact of collisions, but adds the cost of slowing down training.

  Args:
    column_name: A string defining sparse column name.
    size: An integer specifying the number of parameters in the embedding layer.
    dimension: An integer specifying dimension of the embedding.
    hash_key: Specify the hash_key that will be used by the `FingerprintCat64`
      function to combine the crosses fingerprints on SparseFeatureCrossOp.
    combiner: A string specifying how to reduce if there are multiple entries
      in a single row. Currently "mean", "sqrtn" and "sum" are supported, with
      "mean" the default. "sqrtn" often achieves good accuracy, in particular
      with bag-of-words columns. Each of these can be thought of as an
      example-level normalization on the column:
        * "sum": do not normalize features in the column
        * "mean": do l1 normalization on features in the column
        * "sqrtn": do l2 normalization on features in the column
      For more information: `tf.embedding_lookup_sparse`.
    initializer: A variable initializer function to be used in embedding
      variable initialization. If not specified, defaults to
      `tf.truncated_normal_initializer` with mean 0 and standard deviation 0.1.

  Returns:
    A _ScatteredEmbeddingColumn.

  Raises:
    ValueError: if dimension or size is not a positive integer; or if combiner
      is not supported.

  """
  supported_combiners = ("mean", "sqrtn", "sum")

  # Both the embedding width and the parameter count must be at least 1.
  if dimension < 1 or size < 1:
    raise ValueError("Dimension and size must be greater than 0. "
                     "dimension: {}, size: {}, column_name: {}".format(
                         dimension, size, column_name))

  if combiner not in supported_combiners:
    raise ValueError("Combiner must be one of 'mean', 'sqrtn' or 'sum'. "
                     "combiner: {}, column_name: {}".format(combiner,
                                                            column_name))

  return _ScatteredEmbeddingColumn(column_name, size, dimension, hash_key,
                                   combiner, initializer)
def _reshape_real_valued_tensor(input_tensor, output_rank, column_name=None):
  """Reshaping logic for dense, numeric `Tensors`.

  Follows the following rules:
    1. If `output_rank > input_rank + 1` raise a `ValueError`.
    2. If `output_rank == input_rank + 1`, expand `input_tensor` by one
       dimension and return
    3. If `output_rank == input_rank`, return `input_tensor`.
    4. If `output_rank < input_rank`, flatten the inner dimensions of
       `input_tensor` and return a `Tensor` with `output_rank`

  When the static rank of `input_tensor` is unknown, the tensor is handed to
  the inner-flatten helper directly.

  Args:
    input_tensor: a dense `Tensor` to be reshaped.
    output_rank: the desired rank of the reshaped `Tensor`.
    column_name: (optional) the name of the associated column. Used for error
      messages.

  Returns:
    A `Tensor` with the same entries as `input_tensor` and rank `output_rank`.

  Raises:
    ValueError: if `output_rank > input_rank + 1`.
  """
  input_rank = input_tensor.get_shape().ndims
  if input_rank is not None:
    if output_rank > input_rank + 1:
      error_string = ("Rank of input Tensor ({}) should be the same as "
                      "output_rank ({}). For example, sequence data should "
                      "typically be 3 dimensional (rank 3) while non-sequence "
                      "data is typically 2 dimensional (rank 2).".format(
                          input_rank, output_rank))
      if column_name is not None:
        # Trailing space keeps the prefix from running into the message.
        error_string = (
            "Error while processing column {}. ".format(column_name)
            + error_string)
      raise ValueError(error_string)
    if output_rank == input_rank + 1:
      logging.warning(
          "Rank of input Tensor ({}) should be the same as output_rank ({}) "
          "for column. Will attempt to expand dims. It is highly recommended "
          "that you resize your input, as this behavior may change.".format(
              input_rank, output_rank))
      return array_ops.expand_dims(input_tensor, -1, name="expand_dims")
    if output_rank == input_rank:
      return input_tensor
  # Here, either `input_rank` is unknown or it is greater than `output_rank`.
  return layers._inner_flatten(input_tensor, output_rank)  # pylint: disable=protected-access
class _RealValuedVarLenColumn(_FeatureColumn, collections.namedtuple(
    "_RealValuedVarLenColumn",
    ["column_name", "default_value", "dtype", "normalizer", "is_sparse"])):
  """Represents a real valued feature column for variable length Features.

  Instances of this class are immutable.
  If is_sparse=False, the dictionary returned by InputBuilder contains a
  ("column_name", Tensor) pair with a Tensor shape of (batch_size, dimension).
  If is_sparse=True, the dictionary contains a ("column_name", SparseTensor)
  pair instead with shape inferred after parsing.
  """

  @property
  def name(self):
    return self.column_name

  @property
  def config(self):
    if self.is_sparse:
      feature_spec = parsing_ops.VarLenFeature(self.dtype)
    else:
      # Dense case: a padded sequence feature filled with `default_value`.
      feature_spec = parsing_ops.FixedLenSequenceFeature(
          [], self.dtype, allow_missing=True,
          default_value=self.default_value)
    return {self.column_name: feature_spec}

  @property
  def key(self):
    """Returns a string which will be used as a key when we do sorting."""
    return self._key_without_properties(["normalizer"])

  @property
  def normalizer_fn(self):
    """Returns the function used to normalize the column."""
    return self.normalizer

  def _normalized_input_tensor(self, input_tensor):
    """Returns the input tensor after custom normalization is applied."""
    normalize = self.normalizer
    if normalize is None:
      return input_tensor
    if not self.is_sparse:
      return normalize(input_tensor)
    # Sparse input: normalize only the values, keep indices and dense shape.
    return sparse_tensor_py.SparseTensor(
        input_tensor.indices,
        normalize(input_tensor.values),
        input_tensor.dense_shape)

  def insert_transformed_feature(self, columns_to_tensors):
    """Apply transformation and inserts it into columns_to_tensors.

    Args:
      columns_to_tensors: A mapping from feature columns to tensors. 'string'
        key means a base feature (not-transformed). It can have _FeatureColumn
        as a key too. That means that _FeatureColumn is already transformed.
    """
    # Transform the input tensor according to the normalizer function.
    normalized = self._normalized_input_tensor(columns_to_tensors[self.name])
    columns_to_tensors[self] = math_ops.to_float(normalized)

  # pylint: disable=unused-argument
  def _to_dnn_input_layer(self,
                          input_tensor,
                          weight_collections=None,
                          trainable=True,
                          output_rank=2):
    dense_tensor = self._to_dense_tensor(input_tensor)
    return _reshape_real_valued_tensor(dense_tensor, output_rank, self.name)

  def _to_dense_tensor(self, input_tensor):
    """Returns `input_tensor` for dense columns; raises for sparse ones."""
    if self.is_sparse:
      raise ValueError("Set is_sparse to False if you want a dense Tensor for "
                       "column_name: {}".format(self.name))
    return input_tensor
@experimental
def _real_valued_var_len_column(column_name,
                                default_value=None,
                                dtype=dtypes.float32,
                                normalizer=None,
                                is_sparse=False):
  """Creates a `_RealValuedVarLenColumn` for variable-length numeric data.

  Note, this is not integrated with any of the DNNEstimators, except the RNN
  ones DynamicRNNEstimator and the StateSavingRNNEstimator.

  It can either create a parsing config for a SparseTensor (with is_sparse=True)
  or a padded Tensor.
  The (dense_)shape of the result will be [batch_size, None], which can be used
  with is_sparse=False as input into an RNN (see DynamicRNNEstimator or
  StateSavingRNNEstimator) or with is_sparse=True as input into a tree (see
  gtflow).

  Use real_valued_column if the Feature has a fixed length. Use some
  SparseColumn for columns to be embedded / one-hot-encoded.

  Args:
    column_name: A string defining real valued column name.
    default_value: A scalar value compatible with dtype. Needs to be specified
      if is_sparse=False. It is coerced with `int()` for integer dtypes and
      with `float()` for floating dtypes.
    dtype: Defines the type of values. Default value is tf.float32. Needs to be
      convertible to tf.float32.
    normalizer: If not None, a function that can be used to normalize the value
      of the real valued column after default_value is applied for parsing.
      Normalizer function takes the input tensor as its argument, and returns
      the output tensor. (e.g. lambda x: (x - 3.0) / 4.2). Note that for
      is_sparse=True, the normalizer will be run on the values of the
      `SparseTensor`.
    is_sparse: A boolean defining whether to create a SparseTensor or a Tensor.
  Returns:
    A _RealValuedVarLenColumn.
  Raises:
    TypeError: if dtype is not convertible to tf.float32 (i.e. it is neither
      an integer nor a floating point type).
    ValueError: if default_value is None and is_sparse is False.
    ValueError: if default_value is a list; only scalar defaults are supported.
    TypeError or ValueError: propagated from `int()` / `float()` when a
      non-list default_value cannot be converted to the requested dtype.
  """
  if not (dtype.is_integer or dtype.is_floating):
    raise TypeError("dtype must be convertible to float. "
                    "dtype: {}, column_name: {}".format(dtype, column_name))

  # A padded (dense) Tensor needs a value to pad with, so the default is
  # mandatory unless the column is parsed as a SparseTensor.
  if default_value is None and not is_sparse:
    raise ValueError("default_value must be provided when is_sparse=False to "
                     "parse a padded Tensor. "
                     "column_name: {}".format(column_name))
  if isinstance(default_value, list):
    raise ValueError(
        "Only scalar default value. default_value: {}, column_name: {}".format(
            default_value, column_name))
  if default_value is not None:
    # Coerce the scalar so that it matches the Python type implied by dtype.
    if dtype.is_integer:
      default_value = int(default_value)
    elif dtype.is_floating:
      default_value = float(default_value)

  return _RealValuedVarLenColumn(column_name, default_value, dtype, normalizer,
                                 is_sparse)
The dictionary returned by InputBuilder 1822 contains a ("column_name", Tensor) pair with a Tensor shape of 1823 (batch_size, dimension). 1824 """ 1825 1826 def __new__(cls, column_name, dimension, default_value, 1827 dtype, normalizer): 1828 if default_value is not None: 1829 default_value = tuple(default_value) 1830 return super(_RealValuedColumn, cls).__new__(cls, column_name, dimension, 1831 default_value, dtype, 1832 normalizer) 1833 1834 @property 1835 def name(self): 1836 return self.column_name 1837 1838 @property 1839 def config(self): 1840 default_value = self.default_value 1841 if default_value is not None: 1842 default_value = list(default_value) 1843 return {self.column_name: parsing_ops.FixedLenFeature([self.dimension], 1844 self.dtype, 1845 default_value)} 1846 1847 @property 1848 def key(self): 1849 """Returns a string which will be used as a key when we do sorting.""" 1850 return self._key_without_properties(["normalizer"]) 1851 1852 @property 1853 def normalizer_fn(self): 1854 """Returns the function used to normalize the column.""" 1855 return self.normalizer 1856 1857 def _normalized_input_tensor(self, input_tensor): 1858 """Returns the input tensor after custom normalization is applied.""" 1859 return (self.normalizer(input_tensor) if self.normalizer is not None else 1860 input_tensor) 1861 1862 def insert_transformed_feature(self, columns_to_tensors): 1863 """Apply transformation and inserts it into columns_to_tensors. 1864 1865 Args: 1866 columns_to_tensors: A mapping from feature columns to tensors. 'string' 1867 key means a base feature (not-transformed). It can have _FeatureColumn 1868 as a key too. That means that _FeatureColumn is already transformed. 1869 """ 1870 # Transform the input tensor according to the normalizer function. 
def real_valued_column(column_name,
                       dimension=1,
                       default_value=None,
                       dtype=dtypes.float32,
                       normalizer=None):
  """Creates a `_RealValuedColumn` for dense numeric data.

  Args:
    column_name: A string defining real valued column name.
    dimension: An integer specifying dimension of the real valued column.
      The default is 1. Must be a positive integer; `None` is rejected (use
      `_real_valued_var_len_column` for variable length features).
    default_value: A single value compatible with dtype or a list of values
      compatible with dtype which the column takes on during tf.Example parsing
      if data is missing. A default value of None will cause tf.parse_example
      to fail if an example does not contain this column. If a single value is
      provided, the same value will be applied as the default value for every
      dimension. If a list of values is provided, the length of the list
      should be equal to the value of `dimension`. Integer defaults are
      accepted for both integer and floating dtypes (and converted to float
      for the latter); float defaults are accepted only for floating dtypes.
    dtype: defines the type of values. Default value is tf.float32. Must be a
      non-quantized, real integer or floating point type.
    normalizer: If not None, a function that can be used to normalize the value
      of the real valued column after default_value is applied for parsing.
      Normalizer function takes the input tensor as its argument, and returns
      the output tensor. (e.g. lambda x: (x - 3.0) / 4.2).
  Returns:
    A _RealValuedColumn.
  Raises:
    TypeError: if dimension is None or not an int.
    ValueError: if dimension is not a positive integer.
    ValueError: if dtype is not convertible to tf.float32.
    ValueError: if default_value is a list but its length is not equal to the
      value of `dimension`.
    TypeError: if default_value is not compatible with dtype.
  """
  if dimension is None:
    raise TypeError(
        "dimension must be an integer. Use the _real_valued_var_len_column "
        "for variable length features. "
        "dimension: {}, column_name: {}".format(dimension, column_name))
  if not isinstance(dimension, int):
    raise TypeError("dimension must be an integer. "
                    "dimension: {}, column_name: {}".format(dimension,
                                                            column_name))
  if dimension < 1:
    raise ValueError("dimension must be greater than 0. "
                     "dimension: {}, column_name: {}".format(dimension,
                                                             column_name))

  if not (dtype.is_integer or dtype.is_floating):
    raise ValueError("dtype must be convertible to float. "
                     "dtype: {}, column_name: {}".format(dtype, column_name))

  if default_value is None:
    return _RealValuedColumn(column_name, dimension, default_value, dtype,
                             normalizer)

  # Float-valued defaults are only acceptable for (non-integer) float dtypes.
  accepts_float_default = dtype.is_floating and (not dtype.is_integer)

  # `defaults` stays None until a default_value/dtype combination is accepted.
  # `dimension` is known to be >= 1 here, so an accepted list is never empty.
  defaults = None
  if isinstance(default_value, int):
    # A scalar is broadcast to every dimension.
    if dtype.is_integer:
      defaults = [default_value] * dimension
    elif dtype.is_floating:
      defaults = [float(default_value)] * dimension
  elif isinstance(default_value, float):
    if accepts_float_default:
      defaults = [default_value] * dimension
  elif isinstance(default_value, list):
    if len(default_value) != dimension:
      raise ValueError(
          "The length of default_value must be equal to dimension. "
          "default_value: {}, dimension: {}, column_name: {}".format(
              default_value, dimension, column_name))
    # Check if the values in the list are all integers or are convertible to
    # floats.
    all_ints = all(isinstance(v, int) for v in default_value)
    all_numbers = all(isinstance(v, (int, float)) for v in default_value)
    if all_ints and dtype.is_integer:
      defaults = default_value
    elif all_ints and dtype.is_floating:
      defaults = [float(v) for v in default_value]
    elif all_numbers and accepts_float_default:
      defaults = [float(v) for v in default_value]

  if defaults is None:
    raise TypeError("default_value must be compatible with dtype. "
                    "default_value: {}, dtype: {}, column_name: {}".format(
                        default_value, dtype, column_name))

  return _RealValuedColumn(column_name, dimension, defaults, dtype, normalizer)
2044 """ 2045 2046 def __new__(cls, source_column, boundaries): 2047 if not isinstance(source_column, _RealValuedColumn): 2048 raise TypeError("source_column must be an instance of _RealValuedColumn. " 2049 "source_column: {}".format(source_column)) 2050 2051 if source_column.dimension is None: 2052 raise ValueError("source_column must have a defined dimension. " 2053 "source_column: {}".format(source_column)) 2054 2055 if (not isinstance(boundaries, list) and 2056 not isinstance(boundaries, tuple)) or not boundaries: 2057 raise ValueError("boundaries must be a non-empty list or tuple. " 2058 "boundaries: {}".format(boundaries)) 2059 2060 # We allow bucket boundaries to be monotonically increasing 2061 # (ie a[i+1] >= a[i]). When two bucket boundaries are the same, we 2062 # de-duplicate. 2063 sanitized_boundaries = [] 2064 for i in range(len(boundaries) - 1): 2065 if boundaries[i] == boundaries[i + 1]: 2066 continue 2067 elif boundaries[i] < boundaries[i + 1]: 2068 sanitized_boundaries.append(boundaries[i]) 2069 else: 2070 raise ValueError("boundaries must be a sorted list. 
" 2071 "boundaries: {}".format(boundaries)) 2072 sanitized_boundaries.append(boundaries[len(boundaries) - 1]) 2073 2074 return super(_BucketizedColumn, cls).__new__(cls, source_column, 2075 tuple(sanitized_boundaries)) 2076 2077 @property 2078 def name(self): 2079 return "{}_bucketized".format(self.source_column.name) 2080 2081 @property 2082 def length(self): 2083 """Returns total number of buckets.""" 2084 return len(self.boundaries) + 1 2085 2086 @property 2087 def config(self): 2088 return self.source_column.config 2089 2090 @property 2091 def key(self): 2092 """Returns a string which will be used as a key when we do sorting.""" 2093 return "{}".format(self) 2094 2095 # pylint: disable=unused-argument 2096 def _to_dnn_input_layer(self, 2097 input_tensor, 2098 weight_collections=None, 2099 trainable=True, 2100 output_rank=2): 2101 if output_rank != 2: 2102 raise ValueError("BucketizedColumn currently only supports output_rank=2") 2103 return array_ops.reshape( 2104 array_ops.one_hot( 2105 math_ops.to_int64(input_tensor), 2106 self.length, 2107 1., 2108 0., 2109 name="one_hot"), [-1, self.length * self.source_column.dimension], 2110 name="reshape") 2111 2112 def to_sparse_tensor(self, input_tensor): 2113 """Creates a SparseTensor from the bucketized Tensor.""" 2114 dimension = self.source_column.dimension 2115 batch_size = array_ops.shape(input_tensor, name="shape")[0] 2116 2117 if dimension > 1: 2118 i1 = array_ops.reshape( 2119 array_ops.tile( 2120 array_ops.expand_dims( 2121 math_ops.range(0, batch_size), 1, name="expand_dims"), 2122 [1, dimension], 2123 name="tile"), [-1], 2124 name="reshape") 2125 i2 = array_ops.tile( 2126 math_ops.range(0, dimension), [batch_size], name="tile") 2127 # Flatten the bucket indices and unique them across dimensions 2128 # E.g. 
2nd dimension indices will range from k to 2*k-1 with k buckets 2129 bucket_indices = array_ops.reshape( 2130 input_tensor, [-1], name="reshape") + self.length * i2 2131 else: 2132 # Simpler indices when dimension=1 2133 i1 = math_ops.range(0, batch_size) 2134 i2 = array_ops.zeros([batch_size], dtype=dtypes.int32, name="zeros") 2135 bucket_indices = array_ops.reshape(input_tensor, [-1], name="reshape") 2136 2137 indices = math_ops.to_int64(array_ops.transpose(array_ops.stack((i1, i2)))) 2138 shape = math_ops.to_int64(array_ops.stack([batch_size, dimension])) 2139 sparse_id_values = sparse_tensor_py.SparseTensor( 2140 indices, bucket_indices, shape) 2141 2142 return sparse_id_values 2143 2144 def _wide_embedding_lookup_arguments(self, input_tensor): 2145 return _LinearEmbeddingLookupArguments( 2146 input_tensor=self.to_sparse_tensor(input_tensor), 2147 weight_tensor=None, 2148 vocab_size=self.length * self.source_column.dimension, 2149 initializer=init_ops.zeros_initializer(), 2150 combiner="sum") 2151 2152 def _transform_feature(self, inputs): 2153 """Handles cross transformation.""" 2154 # Bucketize the source column. 
def bucketized_column(source_column, boundaries):
  """Builds a `_BucketizedColumn` that discretizes a dense numeric column.

  This is the public factory for `_BucketizedColumn`; all validation is done
  by that class' constructor.

  Args:
    source_column: The `_RealValuedColumn` whose values are bucketized.
    boundaries: A non-empty list or tuple of floats giving the bucket
      boundaries, in sorted order. Equal neighbouring boundaries are
      de-duplicated by `_BucketizedColumn`.

  Returns:
    A _BucketizedColumn.

  Raises:
    TypeError: if `source_column` is not a `_RealValuedColumn`.
    ValueError: if 'boundaries' is empty or not sorted, or if `source_column`
      has no defined dimension.
  """
  bucketized = _BucketizedColumn(
      source_column=source_column, boundaries=boundaries)
  return bucketized
String representing checkpoint name/pattern 2253 to restore the column weights. Required if `tensor_name_in_ckpt` is not 2254 None. 2255 tensor_name_in_ckpt: (Optional). Name of the `Tensor` in the provided 2256 checkpoint from which to restore the column weights. Required if 2257 `ckpt_to_load_from` is not None. 2258 2259 Raises: 2260 TypeError: if all items in columns are not an instance of _SparseColumn, 2261 _CrossedColumn, or _BucketizedColumn. 2262 ValueError: if hash_bucket_size is not > 1 or len(columns) is not > 1. Also, 2263 if only one of `ckpt_to_load_from` and `tensor_name_in_ckpt` is specified. 2264 """ 2265 2266 @staticmethod 2267 def _assert_is_crossable(column): 2268 if isinstance(column, (_SparseColumn, _CrossedColumn, _BucketizedColumn)): 2269 return 2270 raise TypeError("columns must be a set of _SparseColumn, " 2271 "_CrossedColumn, or _BucketizedColumn instances. " 2272 "(column {} is a {})".format(column, 2273 column.__class__.__name__)) 2274 2275 def __new__(cls, 2276 columns, 2277 hash_bucket_size, 2278 hash_key, 2279 combiner="sum", 2280 ckpt_to_load_from=None, 2281 tensor_name_in_ckpt=None): 2282 for column in columns: 2283 _CrossedColumn._assert_is_crossable(column) 2284 2285 if len(columns) < 2: 2286 raise ValueError("columns must contain at least 2 elements. " 2287 "columns: {}".format(columns)) 2288 2289 if hash_bucket_size < 2: 2290 raise ValueError("hash_bucket_size must be at least 2. 
" 2291 "hash_bucket_size: {}".format(hash_bucket_size)) 2292 2293 if (ckpt_to_load_from is None) != (tensor_name_in_ckpt is None): 2294 raise ValueError("Must specify both `ckpt_to_load_from` and " 2295 "`tensor_name_in_ckpt` or none of them.") 2296 2297 sorted_columns = sorted( 2298 [column for column in columns], key=lambda column: column.name) 2299 return super(_CrossedColumn, cls).__new__(cls, tuple(sorted_columns), 2300 hash_bucket_size, hash_key, 2301 combiner, 2302 ckpt_to_load_from, 2303 tensor_name_in_ckpt) 2304 2305 @property 2306 def name(self): 2307 sorted_names = sorted([column.name for column in self.columns]) 2308 return "_X_".join(sorted_names) 2309 2310 @property 2311 def config(self): 2312 config = {} 2313 for column in self.columns: 2314 config.update(_get_feature_config(column)) 2315 return config 2316 2317 @property 2318 def length(self): 2319 """Returns total number of buckets.""" 2320 return self.hash_bucket_size 2321 2322 @property 2323 def key(self): 2324 """Returns a string which will be used as a key when we do sorting.""" 2325 return "{}".format(self) 2326 2327 def id_tensor(self, input_tensor): 2328 """Returns the id tensor from the given transformed input_tensor.""" 2329 return input_tensor 2330 2331 def weight_tensor(self, input_tensor): 2332 """Returns the weight tensor from the given transformed input_tensor.""" 2333 del input_tensor 2334 return None 2335 2336 def _to_dnn_input_layer(self, 2337 input_tensor, 2338 weight_collections=None, 2339 trainable=True, 2340 output_rank=2): 2341 del input_tensor 2342 del weight_collections 2343 del trainable 2344 del output_rank 2345 raise ValueError("CrossedColumn is not supported in DNN. " 2346 "Please use embedding_column. 
column: {}".format(self)) 2347 2348 def _checkpoint_path(self): 2349 if self.ckpt_to_load_from is not None: 2350 return self.ckpt_to_load_from, self.tensor_name_in_ckpt 2351 return None 2352 2353 def _wide_embedding_lookup_arguments(self, input_tensor): 2354 return _LinearEmbeddingLookupArguments( 2355 input_tensor=input_tensor, 2356 weight_tensor=None, 2357 vocab_size=self.length, 2358 initializer=init_ops.zeros_initializer(), 2359 combiner=self.combiner) 2360 2361 def _transform_feature(self, inputs): 2362 """Handles cross transformation.""" 2363 2364 def _collect_leaf_level_columns(cross): 2365 """Collects base columns contained in the cross.""" 2366 leaf_level_columns = [] 2367 for c in cross.columns: 2368 if isinstance(c, _CrossedColumn): 2369 leaf_level_columns.extend(_collect_leaf_level_columns(c)) 2370 else: 2371 leaf_level_columns.append(c) 2372 return leaf_level_columns 2373 2374 feature_tensors = [] 2375 for c in _collect_leaf_level_columns(self): 2376 if isinstance(c, _SparseColumn): 2377 feature_tensors.append(inputs.get(c.name)) 2378 else: 2379 if isinstance(c, _BucketizedColumn): 2380 feature_tensors.append(c.to_sparse_tensor(inputs.get(c))) 2381 else: 2382 feature_tensors.append(inputs.get(c)) 2383 return sparse_feature_cross_op.sparse_feature_cross( 2384 feature_tensors, 2385 hashed_output=True, 2386 num_buckets=self.hash_bucket_size, 2387 hash_key=self.hash_key, 2388 name="cross") 2389 2390 def insert_transformed_feature(self, columns_to_tensors): 2391 """Handles sparse column to id conversion.""" 2392 columns_to_tensors[self] = self._transform_feature( 2393 _LazyBuilderByColumnsToTensor(columns_to_tensors)) 2394 2395 @property 2396 def _parse_example_spec(self): 2397 return self.config 2398 2399 @property 2400 def _num_buckets(self): 2401 return self.length 2402 2403 def _get_sparse_tensors(self, inputs, weight_collections=None, 2404 trainable=None): 2405 del weight_collections 2406 del trainable 2407 return 
fc_core._CategoricalColumn.IdWeightPair(inputs.get(self), None) # pylint: disable=protected-access 2408 2409 2410 class _LazyBuilderByColumnsToTensor(object): 2411 2412 def __init__(self, columns_to_tensors): 2413 self._columns_to_tensors = columns_to_tensors 2414 2415 def get(self, key): 2416 """Gets the transformed feature column.""" 2417 if key in self._columns_to_tensors: 2418 return self._columns_to_tensors[key] 2419 if isinstance(key, str): 2420 raise ValueError( 2421 "features dictionary doesn't contain key ({})".format(key)) 2422 if not isinstance(key, _FeatureColumn): 2423 raise TypeError('"key" must be either a "str" or "_FeatureColumn". ' 2424 "Provided: {}".format(key)) 2425 2426 key.insert_transformed_feature(self._columns_to_tensors) 2427 return self._columns_to_tensors[key] 2428 2429 2430 def crossed_column(columns, hash_bucket_size, combiner="sum", 2431 ckpt_to_load_from=None, 2432 tensor_name_in_ckpt=None, 2433 hash_key=None): 2434 """Creates a _CrossedColumn for performing feature crosses. 2435 2436 Args: 2437 columns: An iterable of _FeatureColumn. Items can be an instance of 2438 _SparseColumn, _CrossedColumn, or _BucketizedColumn. 2439 hash_bucket_size: An int that is > 1. The number of buckets. 2440 combiner: A string specifying how to reduce if there are multiple entries 2441 in a single row. Currently "mean", "sqrtn" and "sum" are supported, with 2442 "sum" the default. "sqrtn" often achieves good accuracy, in particular 2443 with bag-of-words columns. Each of this can be thought as example level 2444 normalizations on the column:: 2445 * "sum": do not normalize 2446 * "mean": do l1 normalization 2447 * "sqrtn": do l2 normalization 2448 For more information: `tf.embedding_lookup_sparse`. 2449 ckpt_to_load_from: (Optional). String representing checkpoint name/pattern 2450 to restore the column weights. Required if `tensor_name_in_ckpt` is not 2451 None. 2452 tensor_name_in_ckpt: (Optional). 
class DataFrameColumn(_FeatureColumn,
                      collections.namedtuple("DataFrameColumn",
                                             ["column_name", "series"])):
  """Represents a feature column produced from a `DataFrame`.

  Instances of this class are immutable.  A `DataFrame` column may be dense or
  sparse, and may have any shape, with the constraint that dimension 0 is
  batch_size.

  Two instances compare equal when they are of compatible classes and their
  `column_name` and `series` fields are equal; the hash is derived from the
  same fields so that instances can be used as dictionary keys (as
  `insert_transformed_feature` does).

  Args:
    column_name: a name for this column
    series: a `Series` to be wrapped, which has already had its base features
      substituted with `PredefinedSeries`.
  """

  def __new__(cls, column_name, series):
    return super(DataFrameColumn, cls).__new__(cls, column_name, series)

  @property
  def name(self):
    """Returns the column name."""
    return self.column_name

  @property
  def config(self):
    """Returns the base features required by the wrapped series."""
    return self.series.required_base_features()

  @property
  def key(self):
    """Returns a string which will be used as a key when we do sorting."""
    return self.name

  def insert_transformed_feature(self, columns_to_tensors):
    """Builds the wrapped series and stores the result under this column.

    Args:
      columns_to_tensors: A mapping used both as the cache handed to
        `series.build` and as the destination for the built value.
    """
    # The cache must already contain mappings from the expected base feature
    # names to Tensors.

    # Passing columns_to_tensors as the cache here means that multiple outputs
    # of the transform will be cached, keyed by the repr of their associated
    # TransformedSeries.
    # The specific requested output ends up in columns_to_tensors twice: once
    # keyed by the TransformedSeries repr, and once keyed by this
    # DataFrameColumn instance.
    columns_to_tensors[self] = self.series.build(columns_to_tensors)

  # pylint: disable=unused-argument
  def _to_dnn_input_layer(self,
                          input_tensor,
                          weight_collections=None,
                          trainable=True,
                          output_rank=2):
    """Casts `input_tensor` to float32 if needed and reshapes it."""
    if input_tensor.dtype != dtypes.float32:
      input_tensor = math_ops.to_float(input_tensor)
    return _reshape_real_valued_tensor(input_tensor, output_rank, self.name)

  def _to_dense_tensor(self, input_tensor):
    return self._to_dnn_input_layer(input_tensor)

  def __eq__(self, other):
    if not isinstance(other, self.__class__):
      return False
    # The namedtuple fields live in the tuple itself, not in the instance
    # `__dict__` (which is normally empty), so comparing `__dict__` alone made
    # every pair of instances equal. Compare the fields explicitly and keep
    # the `__dict__` check for any attributes set on instances.
    return (tuple(self) == tuple(other) and
            self.__dict__ == other.__dict__)

  def __ne__(self, other):
    return not self.__eq__(other)

  def __hash__(self):
    # Defining `__eq__` disables the inherited hash on Python 3; restore a
    # hash consistent with `__eq__` so instances remain usable as dict keys.
    # NOTE(review): assumes `series` is hashable -- confirm against `Series`.
    return hash(tuple(self))
def _create_sequence_feature_spec_for_parsing(sequence_feature_columns,
                                              allow_missing_by_default=False):
  """Builds the feature spec used to parse `tf.SequenceExample`s.

  The spec is first created with `create_feature_spec_for_parsing` and every
  fixed-length entry is then converted to a `FixedLenSequenceFeature`;
  `VarLenFeature` entries are passed through unchanged.

  Args:
    sequence_feature_columns: an iterable containing all the feature columns.
      All items should be instances of classes derived from `_FeatureColumn`.
    allow_missing_by_default: whether to set `allow_missing=True` by default for
      `FixedLenSequenceFeature`s.
  Returns:
    A dict mapping feature keys to `FixedLenSequenceFeature` or `VarLenFeature`.
  Raises:
    TypeError: if a feature in the generated spec is not a `VarLenFeature`,
      `FixedLenFeature` or `FixedLenSequenceFeature`.
  """
  fixed_length_types = (parsing_ops.FixedLenFeature,
                        parsing_ops.FixedLenSequenceFeature)
  base_spec = create_feature_spec_for_parsing(sequence_feature_columns)

  converted = {}
  for name, spec in base_spec.items():
    if isinstance(spec, parsing_ops.VarLenFeature):
      converted[name] = spec
      continue
    if not isinstance(spec, fixed_length_types):
      raise TypeError(
          "Unsupported feature type: {}".format(type(spec).__name__))

    # A default value is dropped for sequence features; its presence instead
    # switches on `allow_missing`.
    has_default = spec.default_value is not None
    if has_default:
      logging.warning(
          'Found default value {} for feature "{}". Ignoring this value and '
          'setting `allow_missing=True` instead.'.format(
              spec.default_value, name))
    converted[name] = parsing_ops.FixedLenSequenceFeature(
        shape=spec.shape,
        dtype=spec.dtype,
        allow_missing=(allow_missing_by_default or has_default))
  return converted
2659 placeholders[column_name] = array_ops.sparse_placeholder( 2660 column_type.dtype, name="Placeholder_{}".format(column_name)) 2661 else: 2662 # Simple placeholder for dense tensors. 2663 placeholders[column_name] = array_ops.placeholder( 2664 column_type.dtype, 2665 shape=(None, column_type.shape[0]), 2666 name="Placeholder_{}".format(column_name)) 2667 return placeholders 2668 2669 2670 class _SparseIdLookupConfig( 2671 collections.namedtuple("_SparseIdLookupConfig", 2672 ["vocabulary_file", "keys", "num_oov_buckets", 2673 "vocab_size", "default_value"])): 2674 """Defines lookup configuration for a sparse feature. 2675 2676 An immutable object defines lookup table configuration used by 2677 tf.feature_to_id_v2. 2678 2679 Attributes: 2680 vocabulary_file: The vocabulary filename. vocabulary_file cannot be combined 2681 with keys. 2682 keys: A 1-D string iterable that specifies the mapping of strings to 2683 indices. It means a feature in keys will map to it's index in keys. 2684 num_oov_buckets: The number of out-of-vocabulary buckets. If zero all out of 2685 vocabulary features will be ignored. 2686 vocab_size: Number of the elements in the vocabulary. 2687 default_value: The value to use for out-of-vocabulary feature values. 2688 Defaults to -1. 2689 """ 2690 2691 def __new__(cls, 2692 vocabulary_file=None, 2693 keys=None, 2694 num_oov_buckets=0, 2695 vocab_size=None, 2696 default_value=-1): 2697 2698 return super(_SparseIdLookupConfig, cls).__new__(cls, vocabulary_file, keys, 2699 num_oov_buckets, 2700 vocab_size, default_value) 2701