# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A dataset loader for imports85.data."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections

import numpy as np
import tensorflow as tf

# pandas is optional: it is only needed by raw_dataframe() and load_data(),
# not by the tf.data-based dataset() loader.
try:
  import pandas as pd  # pylint: disable=g-import-not-at-top
except ImportError:
  pass


URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"

# Column order matters to the CSV readers, so we use an OrderedDict here.
# Each column's default value also fixes that column's type for
# `tf.decode_csv`.
defaults = collections.OrderedDict([
    ("symboling", [0]),
    ("normalized-losses", [0.0]),
    ("make", [""]),
    ("fuel-type", [""]),
    ("aspiration", [""]),
    ("num-of-doors", [""]),
    ("body-style", [""]),
    ("drive-wheels", [""]),
    ("engine-location", [""]),
    ("wheel-base", [0.0]),
    ("length", [0.0]),
    ("width", [0.0]),
    ("height", [0.0]),
    ("curb-weight", [0.0]),
    ("engine-type", [""]),
    ("num-of-cylinders", [""]),
    ("engine-size", [0.0]),
    ("fuel-system", [""]),
    ("bore", [0.0]),
    ("stroke", [0.0]),
    ("compression-ratio", [0.0]),
    ("horsepower", [0.0]),
    ("peak-rpm", [0.0]),
    ("city-mpg", [0.0]),
    ("highway-mpg", [0.0]),
    ("price", [0.0])
])  # pyformat: disable


# Map each column name to the Python type of its default value.
types = collections.OrderedDict((key, type(value[0]))
                                for key, value in defaults.items())


def _get_imports85():
  """Download the imports85 data file if needed and return its local path."""
  path = tf.contrib.keras.utils.get_file(URL.split("/")[-1], URL)
  return path


def dataset(y_name="price", train_fraction=0.7):
  """Load the imports85 data as a `(train, test)` pair of `Dataset`s.

  Each dataset generates (features_dict, label) pairs.

  Args:
    y_name: The name of the column to use as the label.
    train_fraction: A float, the fraction of data to use for training. The
        remainder will be used for evaluation.
  Returns:
    A `(train, test)` pair of `Dataset`s.
  """
  # Download and cache the data.
  path = _get_imports85()

  # Define how the lines of the file should be parsed.
  def decode_line(line):
    """Convert a CSV line into a (features_dict, label) pair."""
    # Decode the line into a tuple of items, typed according to
    # defaults.values().
    items = tf.decode_csv(line, list(defaults.values()))

    # Convert the keys and items to a dict.
    pairs = zip(defaults.keys(), items)
    features_dict = dict(pairs)

    # Remove the label from the features_dict.
    label = features_dict.pop(y_name)

    return features_dict, label
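
  # As an illustrative sketch (field values invented here, not taken from the
  # real file), decode_line would map a CSV line such as
  # "3,115.0,audi,gas,..." to roughly:
  #   ({"symboling": 3, "normalized-losses": 115.0, "make": b"audi", ...},
  #    <scalar "price" tensor>)
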
  def has_no_question_marks(line):
    """Returns True if the line of text has no question marks."""
    # Split the line into an array of characters.
    chars = tf.string_split(line[tf.newaxis], "").values
    # For each character, check whether it is a question mark.
    is_question = tf.equal(chars, "?")
    any_question = tf.reduce_any(is_question)
    no_question = ~any_question

    return no_question

  def in_training_set(line):
    """Returns a boolean tensor, true if the line is in the training set."""
    # A purely random split would give a different partition on every run, so
    # stopping and restarting training would not preserve the split. It also
    # would not work for a dataset too big to `.cache()` as we do here.
    # Instead, hash each line to a bucket and use the bucket id as a
    # per-example deterministic "random" number.
    num_buckets = 1000000
    bucket_id = tf.string_to_hash_bucket_fast(line, num_buckets)
    return bucket_id < int(train_fraction * num_buckets)
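
  # For example, with train_fraction=0.7 and num_buckets=1000000, a line
  # hashing to bucket 123456 is in the training set (123456 < 700000), while
  # a line hashing to bucket 876543 is not. A given line always hashes to the
  # same bucket, so the split is stable across runs.
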
  def in_test_set(line):
    """Returns a boolean tensor, true if the line is in the test set."""
    # Items not in the training set are in the test set.
    # This line must use `~` instead of `not` because `not` only works on
    # python booleans but we are dealing with symbolic tensors.
    return ~in_training_set(line)

  base_dataset = (tf.data
                  # Get the lines from the file.
                  .TextLineDataset(path)
                  # Drop lines containing question marks (missing values).
                  .filter(has_no_question_marks))

  train = (base_dataset
           # Take only the training-set lines.
           .filter(in_training_set)
           # Decode each line into a (features_dict, label) pair.
           .map(decode_line)
           # Cache data so you only decode the file once.
           .cache())

  # Do the same for the test set: filter, then decode, then cache, so both
  # pipelines cache decoded (features_dict, label) pairs.
  test = (base_dataset.filter(in_test_set).map(decode_line).cache())

  return train, test
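
# A minimal usage sketch, assuming the TF 1.x one-shot-iterator API (this
# snippet is not part of the original example):
#
#   train_ds, _ = dataset()
#   features, label = (train_ds.shuffle(1000).batch(32)
#                      .make_one_shot_iterator().get_next())
#   with tf.Session() as sess:
#     print(sess.run(label))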


def raw_dataframe():
  """Load the imports85 data as a pd.DataFrame."""
  # Download and cache the data.
  path = _get_imports85()

  # Load it into a pandas DataFrame. `names` must be list-like, and "?"
  # marks missing values in this file.
  df = pd.read_csv(path, names=list(types.keys()), dtype=types, na_values="?")

  return df
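
# To inspect the raw frame (the UCI page describes this dataset as 205 rows
# and 26 columns, though that count is not verified here):
#
#   df = raw_dataframe()
#   print(df.shape)
#   print(df.dtypes)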


def load_data(y_name="price", train_fraction=0.7, seed=None):
  """Get the imports85 data set.

  A description of the data is available at:
    https://archive.ics.uci.edu/ml/datasets/automobile

  The data itself can be found at:
    https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data

  Args:
    y_name: The column to return as the label.
    train_fraction: The fraction of the dataset to use for training.
    seed: The random seed to use when shuffling the data. `None` generates a
      unique shuffle every run.
  Returns:
    A pair of pairs where the first pair is the training data, and the second
    is the test data:
    `(x_train, y_train), (x_test, y_test) = load_data(...)`
    `x` contains a pandas DataFrame of features, while `y` contains the label
    array.
  """
  # Load the raw data columns.
  data = raw_dataframe()

  # Delete rows with unknowns.
  data = data.dropna()

  # Seed the global RNG, then shuffle and split the data into train/test
  # subsets; passing `random_state=seed` keeps the split reproducible.
  np.random.seed(seed)
  x_train = data.sample(frac=train_fraction, random_state=seed)
  x_test = data.drop(x_train.index)

  # Extract the label from the features DataFrame.
  y_train = x_train.pop(y_name)
  y_test = x_test.pop(y_name)

  return (x_train, y_train), (x_test, y_test)
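

if __name__ == "__main__":
  # A small smoke test (not part of the original example); it assumes pandas
  # is installed and that the UCI download succeeds.
  (x_train, y_train), (x_test, y_test) = load_data()
  print("train rows:", len(x_train), "test rows:", len(x_test))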