In [1]:
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import

In [2]:
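# First thing to do: download https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data
# and save it as 'imports-85.data' in the notebook's working directory (it is read by that relative path below).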

In [3]:
# We're using pandas to read the CSV file. This is easy for small datasets, but for large and complex datasets,
# tensorflow parsing and processing functions are more powerful.
import pandas as pd
import numpy as np

In [4]:
# The CSV file does not have a header, so we have to fill in column names.
names = [
    'symboling', 
    'normalized-losses', 
    'make', 
    'fuel-type', 
    'aspiration',
    'num-of-doors',
    'body-style',
    'drive-wheels',
    'engine-location',
    'wheel-base',
    'length',
    'width',
    'height',
    'curb-weight',
    'engine-type',
    'num-of-cylinders',
    'engine-size',
    'fuel-system',
    'bore',
    'stroke',
    'compression-ratio',
    'horsepower',
    'peak-rpm',
    'city-mpg',
    'highway-mpg',
    'price',
]

# We also have to specify dtypes.
dtypes = {
    'symboling': np.int32, 
    'normalized-losses': np.float32, 
    'make': str, 
    'fuel-type': str, 
    'aspiration': str,
    'num-of-doors': str,
    'body-style': str,
    'drive-wheels': str,
    'engine-location': str,
    'wheel-base': np.float32,
    'length': np.float32,
    'width': np.float32,
    'height': np.float32,
    'curb-weight': np.float32,
    'engine-type': str,
    'num-of-cylinders': str,
    'engine-size': np.float32,
    'fuel-system': str,
    'bore': np.float32,
    'stroke': np.float32,
    'compression-ratio': np.float32,
    'horsepower': np.float32,
    'peak-rpm': np.float32,
    'city-mpg': np.float32,
    'highway-mpg': np.float32,
    'price': np.float32,    
}

In [5]:
# Read the file.
df = pd.read_csv('imports-85.data', names=names, dtype=dtypes, na_values='?')

In [6]:
# Some rows don't have price data, we can't use those.
df = df.dropna(axis='rows', how='any', subset=['price'])

In [7]:
# Fill missing values in continuous columns with zeros instead of NaN.
float_columns = [k for k,v in dtypes.items() if v == np.float32]
df[float_columns] = df[float_columns].fillna(value=0., axis='columns')
# Fill missing values in continuous columns with '' instead of NaN (NaN mixed with strings is very bad for us).
string_columns = [k for k,v in dtypes.items() if v == str]
df[string_columns] = df[string_columns].fillna(value='', axis='columns')

In [8]:
# Split the data into a training set and an eval set.
training_data = df[:160]
eval_data = df[160:]

# Separate input features from labels
training_label = training_data.pop('price')
eval_label = eval_data.pop('price')

In [9]:
# Now we can start using some TensorFlow.
import tensorflow as tf
print('please make sure that version >= 1.2:')
print(tf.__version__)

please make sure that version >= 1.2:
1.2.0-rc0


In [10]:
# Make input function for training: 
#   num_epochs=None -> will cycle through input data forever
#   shuffle=True -> randomize order of input data
training_input_fn = tf.estimator.inputs.pandas_input_fn(x=training_data, y=training_label, batch_size=64, shuffle=True, num_epochs=None)

# Make input function for evaluation:
#   shuffle=False -> do not randomize input data
eval_input_fn = tf.estimator.inputs.pandas_input_fn(x=eval_data, y=eval_label, batch_size=64, shuffle=False)

In [11]:
# Describe how the model should interpret the inputs. The names of the feature columns have to match the names
# of the series in the dataframe.

symboling = tf.feature_column.numeric_column('symboling')
normalized_losses = tf.feature_column.numeric_column('normalized-losses')
make = tf.feature_column.categorical_column_with_hash_bucket('make', 50)
fuel_type = tf.feature_column.categorical_column_with_vocabulary_list('fuel-type', vocabulary_list=['diesel', 'gas'])
aspiration = tf.feature_column.categorical_column_with_vocabulary_list('aspiration', vocabulary_list=['std', 'turbo'])
num_of_doors = tf.feature_column.categorical_column_with_vocabulary_list('num-of-doors', vocabulary_list=['two', 'four'])
body_style = tf.feature_column.categorical_column_with_vocabulary_list('body-style', vocabulary_list=['hardtop', 'wagon', 'sedan', 'hatchback', 'convertible'])
drive_wheels = tf.feature_column.categorical_column_with_vocabulary_list('drive-wheels', vocabulary_list=['4wd', 'rwd', 'fwd'])
engine_location = tf.feature_column.categorical_column_with_vocabulary_list('engine-location', vocabulary_list=['front', 'rear'])
wheel_base = tf.feature_column.numeric_column('wheel-base')
length = tf.feature_column.numeric_column('length')
width = tf.feature_column.numeric_column('width')
height = tf.feature_column.numeric_column('height')
curb_weight = tf.feature_column.numeric_column('curb-weight')
engine_type = tf.feature_column.categorical_column_with_vocabulary_list('engine-type', ['dohc', 'dohcv', 'l', 'ohc', 'ohcf', 'ohcv', 'rotor'])
num_of_cylinders = tf.feature_column.categorical_column_with_vocabulary_list('num-of-cylinders', ['eight', 'five', 'four', 'six', 'three', 'twelve', 'two'])
engine_size = tf.feature_column.numeric_column('engine-size')
fuel_system = tf.feature_column.categorical_column_with_vocabulary_list('fuel-system', ['1bbl', '2bbl', '4bbl', 'idi', 'mfi', 'mpfi', 'spdi', 'spfi'])
bore = tf.feature_column.numeric_column('bore')
stroke = tf.feature_column.numeric_column('stroke')
compression_ratio = tf.feature_column.numeric_column('compression-ratio')
horsepower = tf.feature_column.numeric_column('horsepower')
peak_rpm = tf.feature_column.numeric_column('peak-rpm')
city_mpg = tf.feature_column.numeric_column('city-mpg')
highway_mpg = tf.feature_column.numeric_column('highway-mpg')

In [None]:
linear_features = [symboling, normalized_losses, make, fuel_type, aspiration, num_of_doors,
                   body_style, drive_wheels, engine_location, wheel_base, length, width,
                   height, curb_weight, engine_type, num_of_cylinders, engine_size, fuel_system,
                   bore, stroke, compression_ratio, horsepower, peak_rpm, city_mpg, highway_mpg]

In [21]:
regressor = tf.contrib.learn.LinearRegressor(feature_columns=linear_features)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_num_ps_replicas': 0, '_keep_checkpoint_max': 5, '_task_type': None, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fc2d61bac10>, '_model_dir': '/tmp/tmp6ZlFc0', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_session_config': None, '_tf_random_seed': None, '_environment': 'local', '_num_worker_replicas': 0, '_task_id': 0, '_save_summary_steps': 100, '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1
}
, '_evaluation_master': '', '_master': ''}


In [13]:
regressor.fit(input_fn=training_input_fn, steps=10000)

Instructions for updating:
Please switch to tf.summary.scalar. Note that tf.summary.scalar uses the node name instead of the tag. This means that TensorFlow will automatically de-duplicate summary names based on the scope they are created in. Also, passing a tensor or list of tags to a scalar summary op is no longer supported.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmpNU0eqz/model.ckpt.
INFO:tensorflow:loss = 2.413e+08, step = 1
INFO:tensorflow:global_step/sec: 176.967
INFO:tensorflow:loss = 8.0506e+07, step = 101 (0.570 sec)
INFO:tensorflow:global_step/sec: 199.97
INFO:tensorflow:loss = 6.6745e+07, step = 201 (0.496 sec)
INFO:tensorflow:global_step/sec: 216.747
INFO:tensorflow:loss = 6.00864e+07, step = 301 (0.465 sec)
INFO:tensorflow:global_step/sec: 191.496
INFO:tensorflow:loss = 3.47049e+07, step = 401 (0.522 sec)
INFO:tensorflow:global_step/sec: 208.642
INFO:tensorflow:loss = 3.39389e+07, step = 501 (0.478 sec)
INFO:tensorflo

INFO:tensorflow:global_step/sec: 191.929
INFO:tensorflow:loss = 2.00178e+07, step = 7501 (0.518 sec)
INFO:tensorflow:global_step/sec: 203.697
INFO:tensorflow:loss = 1.53318e+07, step = 7601 (0.495 sec)
INFO:tensorflow:global_step/sec: 202.307
INFO:tensorflow:loss = 2.0854e+07, step = 7701 (0.495 sec)
INFO:tensorflow:global_step/sec: 203.504
INFO:tensorflow:loss = 1.40464e+07, step = 7801 (0.490 sec)
INFO:tensorflow:global_step/sec: 194.591
INFO:tensorflow:loss = 1.54422e+07, step = 7901 (0.513 sec)
INFO:tensorflow:global_step/sec: 199.267
INFO:tensorflow:loss = 2.12707e+07, step = 8001 (0.504 sec)
INFO:tensorflow:global_step/sec: 195.141
INFO:tensorflow:loss = 2.22916e+07, step = 8101 (0.509 sec)
INFO:tensorflow:global_step/sec: 194.245
INFO:tensorflow:loss = 1.87815e+07, step = 8201 (0.518 sec)
INFO:tensorflow:global_step/sec: 205.331
INFO:tensorflow:loss = 1.70511e+07, step = 8301 (0.484 sec)
INFO:tensorflow:global_step/sec: 193.251
INFO:tensorflow:loss = 1.845e+07, step = 8401 (0.51

LinearRegressor(params={'gradient_clip_norm': None, 'head': <tensorflow.contrib.learn.python.learn.estimators.head._RegressionHead object at 0x7f34ca5ac190>, 'joint_weights': False, 'optimizer': None, 'feature_columns': [_NumericColumn(key='symboling', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), _NumericColumn(key='normalized-losses', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), _HashedCategoricalColumn(key='make', hash_bucket_size=50, dtype=tf.string), _VocabularyListCategoricalColumn(key='fuel-type', vocabulary_list=('diesel', 'gas'), dtype=tf.string, default_value=-1), _VocabularyListCategoricalColumn(key='aspiration', vocabulary_list=('std', 'turbo'), dtype=tf.string, default_value=-1), _VocabularyListCategoricalColumn(key='num-of-doors', vocabulary_list=('two', 'four'), dtype=tf.string, default_value=-1), _VocabularyListCategoricalColumn(key='body-style', vocabulary_list=('hardtop', 'wagon', 'sedan', 'hatchback', 'convertible'),

In [14]:
regressor.evaluate(input_fn=eval_input_fn)

Instructions for updating:
Please switch to tf.summary.scalar. Note that tf.summary.scalar uses the node name instead of the tag. This means that TensorFlow will automatically de-duplicate summary names based on the scope they are created in. Also, passing a tensor or list of tags to a scalar summary op is no longer supported.
INFO:tensorflow:Starting evaluation at 2017-05-17-21:48:53
INFO:tensorflow:Restoring parameters from /tmp/tmpNU0eqz/model.ckpt-10000
INFO:tensorflow:Finished evaluation at 2017-05-17-21:48:54
INFO:tensorflow:Saving dict for global step 10000: global_step = 10000, loss = 7.96542e+06


{'global_step': 10000, 'loss': 7965423.5}

In [15]:
dnn_features = [
    #numerical features
    symboling, normalized_losses, wheel_base, length, width, height, curb_weight, engine_size,
    bore, stroke, compression_ratio, horsepower, peak_rpm, city_mpg, highway_mpg,    
    # densify categorical features:
    tf.feature_column.indicator_column(make),
    tf.feature_column.indicator_column(fuel_type),
    tf.feature_column.indicator_column(aspiration),
    tf.feature_column.indicator_column(num_of_doors),
    tf.feature_column.indicator_column(body_style),
    tf.feature_column.indicator_column(drive_wheels), 
    tf.feature_column.indicator_column(engine_location),
    tf.feature_column.indicator_column(engine_type),
    tf.feature_column.indicator_column(num_of_cylinders),
    tf.feature_column.indicator_column(fuel_system),
]

In [16]:
dnnregressor = tf.contrib.learn.DNNRegressor(feature_columns=dnn_features, hidden_units=[50, 30, 10])

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_num_ps_replicas': 0, '_keep_checkpoint_max': 5, '_task_type': None, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fc2d60a9bd0>, '_model_dir': '/tmp/tmpnqPn3Q', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_session_config': None, '_tf_random_seed': None, '_environment': 'local', '_num_worker_replicas': 0, '_task_id': 0, '_save_summary_steps': 100, '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1
}
, '_evaluation_master': '', '_master': ''}


In [17]:
dnnregressor.fit(input_fn=training_input_fn, steps=10000)

Instructions for updating:
Please switch to tf.summary.scalar. Note that tf.summary.scalar uses the node name instead of the tag. This means that TensorFlow will automatically de-duplicate summary names based on the scope they are created in. Also, passing a tensor or list of tags to a scalar summary op is no longer supported.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmpnqPn3Q/model.ckpt.
INFO:tensorflow:loss = 3.22914e+08, step = 1
INFO:tensorflow:global_step/sec: 219.187
INFO:tensorflow:loss = 3.491e+07, step = 101 (0.464 sec)
INFO:tensorflow:global_step/sec: 205.938
INFO:tensorflow:loss = 1.59505e+07, step = 201 (0.482 sec)
INFO:tensorflow:global_step/sec: 205.809
INFO:tensorflow:loss = 1.67622e+07, step = 301 (0.488 sec)
INFO:tensorflow:global_step/sec: 197.773
INFO:tensorflow:loss = 1.92105e+07, step = 401 (0.507 sec)
INFO:tensorflow:global_step/sec: 195.636
INFO:tensorflow:loss = 1.33924e+07, step = 501 (0.514 sec)
INFO:tensor

INFO:tensorflow:global_step/sec: 198.116
INFO:tensorflow:loss = 4.21432e+06, step = 7401 (0.510 sec)
INFO:tensorflow:global_step/sec: 202.815
INFO:tensorflow:loss = 3.10884e+06, step = 7501 (0.493 sec)
INFO:tensorflow:global_step/sec: 199.3
INFO:tensorflow:loss = 2.30774e+06, step = 7601 (0.498 sec)
INFO:tensorflow:global_step/sec: 227.64
INFO:tensorflow:loss = 3.16538e+06, step = 7701 (0.445 sec)
INFO:tensorflow:global_step/sec: 216.803
INFO:tensorflow:loss = 2.16325e+06, step = 7801 (0.456 sec)
INFO:tensorflow:global_step/sec: 222.97
INFO:tensorflow:loss = 4.19254e+06, step = 7901 (0.446 sec)
INFO:tensorflow:global_step/sec: 222.506
INFO:tensorflow:loss = 3.27005e+06, step = 8001 (0.454 sec)
INFO:tensorflow:global_step/sec: 220.794
INFO:tensorflow:loss = 3.39485e+06, step = 8101 (0.451 sec)
INFO:tensorflow:global_step/sec: 204.882
INFO:tensorflow:loss = 2.88965e+06, step = 8201 (0.493 sec)
INFO:tensorflow:global_step/sec: 210.89
INFO:tensorflow:loss = 4.03541e+06, step = 8301 (0.473 

DNNRegressor(params={'head': <tensorflow.contrib.learn.python.learn.estimators.head._RegressionHead object at 0x7fc2d60a98d0>, 'hidden_units': [50, 30, 10], 'feature_columns': (_NumericColumn(key='symboling', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), _NumericColumn(key='normalized-losses', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), _NumericColumn(key='wheel-base', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), _NumericColumn(key='length', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), _NumericColumn(key='width', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), _NumericColumn(key='height', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), _NumericColumn(key='curb-weight', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), _NumericColumn(key='engine-size', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=Non

In [18]:
dnnregressor.evaluate(input_fn=eval_input_fn)

Instructions for updating:
Please switch to tf.summary.scalar. Note that tf.summary.scalar uses the node name instead of the tag. This means that TensorFlow will automatically de-duplicate summary names based on the scope they are created in. Also, passing a tensor or list of tags to a scalar summary op is no longer supported.
INFO:tensorflow:Starting evaluation at 2017-05-18-02:27:00
INFO:tensorflow:Restoring parameters from /tmp/tmpnqPn3Q/model.ckpt-10000
INFO:tensorflow:Finished evaluation at 2017-05-18-02:27:00
INFO:tensorflow:Saving dict for global step 10000: global_step = 10000, loss = 9.60459e+06


{'global_step': 10000, 'loss': 9604591.0}

In [19]:
def experiment_fn(run_config, params):
  # This function makes an Experiment, containing an Estimator and inputs for training and evaluation.
  # You can use params and config here to customize the Estimator depending on the cluster or to use
  # hyperparameter tuning.

  # Collect information for training
  return tf.contrib.learn.Experiment(estimator=tf.contrib.learn.LinearRegressor(
                                     feature_columns=linear_features, config=run_config),
                                     train_input_fn=training_input_fn,
                                     train_steps=10000,
                                     eval_input_fn=eval_input_fn)

In [22]:
import shutil
shutil.rmtree("/tmp/output_dir", ignore_errors=True)
tf.contrib.learn.learn_runner.run(experiment_fn, run_config=tf.contrib.learn.RunConfig(model_dir="/tmp/output_dir"))

INFO:tensorflow:Using config: {'_model_dir': '/tmp/output_dir', '_save_checkpoints_secs': 600, '_num_ps_replicas': 0, '_keep_checkpoint_max': 5, '_tf_random_seed': None, '_task_type': None, '_environment': 'local', '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fc2d5195410>, '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1
}
, '_num_worker_replicas': 0, '_task_id': 0, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_evaluation_master': '', '_keep_checkpoint_every_n_hours': 10000, '_master': '', '_session_config': None}
Instructions for updating:
Monitors are deprecated. Please use tf.train.SessionRunHook.
Instructions for updating:
Please switch to tf.summary.scalar. Note that tf.summary.scalar uses the node name instead of the tag. This means that TensorFlow will automatically de-duplicate summary names based on the scope they are created in. Also, passing a tensor or list of tags to a scalar summary op

INFO:tensorflow:loss = 1.80479e+07, step = 5201 (0.805 sec)
INFO:tensorflow:global_step/sec: 124.082
INFO:tensorflow:loss = 1.42421e+07, step = 5301 (0.806 sec)
INFO:tensorflow:global_step/sec: 115.894
INFO:tensorflow:loss = 2.45195e+07, step = 5401 (0.863 sec)
INFO:tensorflow:global_step/sec: 121.967
INFO:tensorflow:loss = 2.10585e+07, step = 5501 (0.820 sec)
INFO:tensorflow:global_step/sec: 121.443
INFO:tensorflow:loss = 1.59946e+07, step = 5601 (0.824 sec)
INFO:tensorflow:global_step/sec: 118.556
INFO:tensorflow:loss = 1.98039e+07, step = 5701 (0.845 sec)
INFO:tensorflow:global_step/sec: 117.499
INFO:tensorflow:loss = 1.51192e+07, step = 5801 (0.849 sec)
INFO:tensorflow:global_step/sec: 115.655
INFO:tensorflow:loss = 3.23047e+07, step = 5901 (0.864 sec)
INFO:tensorflow:global_step/sec: 119.006
INFO:tensorflow:loss = 2.65075e+07, step = 6001 (0.841 sec)
INFO:tensorflow:global_step/sec: 121.712
INFO:tensorflow:loss = 2.03057e+07, step = 6101 (0.822 sec)
INFO:tensorflow:global_step/sec

({'global_step': 10000, 'loss': 8276404.5}, [])