Dataset: https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data (UCI Automobile data set)
In [86]:
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import
In [87]:
# We're using pandas to read the CSV file. This is easy for small datasets, but for large and complex datasets,
# TensorFlow's parsing and processing functions are more powerful.
import pandas as pd
import numpy as np
# TensorFlow
import tensorflow as tf
print('please make sure that your TensorFlow version is >= 1.2:')
print(tf.__version__)
print('@monteirom: I made changes so it also works with 1.1.0, which is the current pip install version')
print('@monteirom: the lines that were changed are marked with an @1.2 comment')
# Layers that will define the features
#
# real_value_column: real values, float32
# sparse_column_with_hash_bucket: use this when your sparse features are in string or integer format,
#                                 but you don't have a vocab file that maps each value to an integer ID.
#                                 output_id = Hash(input_feature_string) % bucket_size
#                                 (a tiny conceptual sketch of this follows the cell below)
# sparse_column_with_keys: lookup logic is as follows:
#                          lookup_id = index_of_feature_in_keys if feature in keys else default_value.
#                          Use this when you know the full vocabulary of the feature.
# one_hot_column: creates an _OneHotColumn for a one-hot or multi-hot representation in a DNN.
#                 The input can be a _SparseColumn, which is created by the `sparse_column_with_*`
#                 or crossed_column functions.
from tensorflow.contrib.layers import real_valued_column, sparse_column_with_keys, sparse_column_with_hash_bucket
from tensorflow.contrib.layers import one_hot_column
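To make the hash-bucket comment above concrete, here is a minimal plain-Python sketch of the idea output_id = Hash(input_feature_string) % bucket_size. It is only an illustration: TensorFlow uses its own internal hash function, so the actual bucket ids it produces will differ.
In [ ]:
# Conceptual illustration only: sparse_column_with_hash_bucket uses TensorFlow's
# own hash function internally, so these bucket ids are not the real ones.
def conceptual_hash_bucket(feature_string, bucket_size):
    return hash(feature_string) % bucket_size

print(conceptual_hash_bucket('alfa-romero', 50))  # some id in [0, 50)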
Download the data from https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data and move it to data/.
So: data/imports-85.data is expected to exist!
In [88]:
# The CSV file does not have a header, so we have to fill in column names.
names = [
'symboling',
'normalized-losses',
'make',
'fuel-type',
'aspiration',
'num-of-doors',
'body-style',
'drive-wheels',
'engine-location',
'wheel-base',
'length',
'width',
'height',
'curb-weight',
'engine-type',
'num-of-cylinders',
'engine-size',
'fuel-system',
'bore',
'stroke',
'compression-ratio',
'horsepower',
'peak-rpm',
'city-mpg',
'highway-mpg',
'price',
]
# We also have to specify dtypes.
dtypes = {
'symboling': np.int32,
'normalized-losses': np.float32,
'make': str,
'fuel-type': str,
'aspiration': str,
'num-of-doors': str,
'body-style': str,
'drive-wheels': str,
'engine-location': str,
'wheel-base': np.float32,
'length': np.float32,
'width': np.float32,
'height': np.float32,
'curb-weight': np.float32,
'engine-type': str,
'num-of-cylinders': str,
'engine-size': np.float32,
'fuel-system': str,
'bore': np.float32,
'stroke': np.float32,
'compression-ratio': np.float32,
'horsepower': np.float32,
'peak-rpm': np.float32,
'city-mpg': np.float32,
'highway-mpg': np.float32,
'price': np.float32,
}
In [89]:
# Read the file.
df = pd.read_csv('data/imports-85.data', names=names, dtype=dtypes, na_values='?')
In [90]:
# Some rows don't have price data; we can't use those.
df = df.dropna(axis='rows', how='any', subset=['price'])
There are many possible ways to handle NaN values in the data; here we simply replace them with '' or 0 depending on the data type. This is the simplest approach, but it is usually not the best one, so in practice you should explore other ways of dealing with missing data, for example imputing with the column mean or median, or adding an indicator feature that marks which values were missing (a small sketch of one alternative follows the next cell).
In [91]:
# Fill missing values in continuous columns with zeros instead of NaN.
float_columns = [k for k,v in dtypes.items() if v == np.float32]
df[float_columns] = df[float_columns].fillna(value=0., axis='columns')
# Fill missing values in string columns with '' instead of NaN (NaN mixed with strings is very bad for us).
string_columns = [k for k,v in dtypes.items() if v == str]
df[string_columns] = df[string_columns].fillna(value='', axis='columns')
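For reference, here is a minimal sketch of one of the alternatives mentioned above: imputing with the column mean and keeping an indicator of which values were missing. It would have to run before the zero-fill above (it assumes the float columns still contain NaN), and it is only an illustration, not what the rest of this notebook uses.
In [ ]:
# Alternative (illustration only, assumes the float columns still contain NaN):
# impute with the per-column mean and remember which rows were missing.
df_alt = df.copy()
for col in float_columns:
    df_alt[col + '_was_missing'] = df_alt[col].isnull().astype(np.float32)
    df_alt[col] = df_alt[col].fillna(df_alt[col].mean())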
In [92]:
# We have too many variables; let's just use some of them.
df = df[['num-of-doors','num-of-cylinders', 'horsepower', 'make', 'price', 'length', 'height', 'width']]
In [93]:
# Since we're possibly dealing with features of different units and scales, we'll need to rescale our data.
# There are two main ways to do it:
# * Normalization, which scales each numeric variable into the range [0, 1].
#   Example: x_norm = (x - x.min()) / (x.max() - x.min())
# * Standardization, which transforms each variable to have zero mean and unit variance.
#   Example: x_std = (x - x.mean()) / x.std()
# Which is better? It depends on your data and your features.
# But one disadvantage of normalization over standardization is that it loses
# some information in the data, and since it loses more information it can make it harder
# for gradient descent to converge, so we'll use standardization.
# In practice: please analyse your data and see what gives you better results.
def std(x):
    return (x - x.mean()) / x.std()
before = df.length[0]
df.length = std(df.length)
df.width = std(df.width)
df.height = std(df.height)
df.horsepower = std(df.horsepower)
after = df.length[0]
print('before:', before, 'after:', after)
In [94]:
TRAINING_DATA_SIZE = 160
TEST_DATA_SIZE = 10
LABEL = 'price'
# Split the data into a training set, eval set and test set
training_data = df[:TRAINING_DATA_SIZE]
eval_data = df[TRAINING_DATA_SIZE: TRAINING_DATA_SIZE + TEST_DATA_SIZE]
test_data = df[TRAINING_DATA_SIZE + TEST_DATA_SIZE:]
# Separate input features from labels
training_label = training_data.pop(LABEL)
eval_label = eval_data.pop(LABEL)
test_label = test_data.pop(LABEL)
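A quick sanity check (not required for the rest of the notebook) that the three slices together cover the whole price-filtered dataset:
In [ ]:
# The three slices should add up to the full dataset after the price filter.
print('train:', len(training_data), 'eval:', len(eval_data), 'test:', len(test_data),
      'total:', len(training_data) + len(eval_data) + len(test_data))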
In [95]:
BATCH_SIZE = 64
# Make input function for training:
# num_epochs=None -> will cycle through input data forever
# shuffle=True -> randomize order of input data
training_input_fn = tf.estimator.inputs.pandas_input_fn(x=training_data,
                                                        y=training_label,
                                                        batch_size=BATCH_SIZE,
                                                        shuffle=True,
                                                        num_epochs=None)
# Make input function for evaluation:
# shuffle=False -> do not randomize input data
eval_input_fn = tf.estimator.inputs.pandas_input_fn(x=eval_data,
                                                    y=eval_label,
                                                    batch_size=BATCH_SIZE,
                                                    shuffle=False)
# Make input function for testing:
# shuffle=False -> do not randomize input data
test_input_fn = tf.estimator.inputs.pandas_input_fn(x=test_data,
                                                    y=test_label,
                                                    batch_size=1,
                                                    shuffle=False)
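If you want to see what the estimator actually receives, the sketch below pulls a single batch out of training_input_fn. This is purely illustrative and assumes the TF 1.x queue-based pandas input pipeline, which is why the queue runners have to be started by hand here.
In [ ]:
# Illustration only: materialize one batch from the training input function.
# pandas_input_fn is queue-based in TF 1.x, so start the queue runners manually.
example_features, example_labels = training_input_fn()
with tf.Session() as sess:
    sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    batch_features, batch_labels = sess.run([example_features, example_labels])
    coord.request_stop()
    coord.join(threads)
print({name: values[:3] for name, values in batch_features.items()})
print(batch_labels[:3])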
In [100]:
# Describe how the model should interpret the inputs. The names of the feature columns have to match the names
# of the series in the dataframe.
# @1.2.0 tf.feature_column.numeric_column -> tf.contrib.layers.real_valued_column
horsepower = real_valued_column('horsepower')
width = real_valued_column('width')
height = real_valued_column('height')
length = real_valued_column('length')
# @1.2.0 tf.feature_column.categorical_column_with_hash_bucket -> tf.contrib.layers.sparse_column_with_hash_bucket
make = sparse_column_with_hash_bucket('make', 50)
# @1.2.0 tf.feature_column.categorical_column_with_vocabulary_list -> tf.contrib.layers.sparse_column_with_keys
# fuel-type was dropped from df above, so this column is just an example and is not used in the models below
fuel_type = sparse_column_with_keys('fuel-type', keys=['diesel', 'gas'])
num_of_doors = sparse_column_with_keys('num-of-doors', keys=['two', 'four'])
num_of_cylinders = sparse_column_with_keys('num-of-cylinders', ['eight', 'five', 'four', 'six', 'three', 'twelve', 'two'])
linear_features = [horsepower, make, num_of_doors, num_of_cylinders, length, width, height]
In [101]:
regressor = tf.contrib.learn.LinearRegressor(feature_columns=linear_features, model_dir='tensorboard/linear_regressor/')
In [113]:
regressor.fit(input_fn=training_input_fn, steps=10000)
Out[113]:
In [114]:
regressor.evaluate(input_fn=eval_input_fn)
Out[114]:
In [115]:
preds = list(regressor.predict(input_fn=test_input_fn))
for i in range(TEST_DATA_SIZE):
    print('prediction:', preds[i], 'real value:', test_label.iloc[i])
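As a small follow-up, the sketch below condenses the predictions printed above into a single mean absolute error; it assumes preds holds one scalar prediction per test row, as in the loop above.
In [ ]:
# Summarize the printed predictions as a mean absolute error (in dollars).
mae = np.mean([abs(preds[i] - test_label.iloc[i]) for i in range(TEST_DATA_SIZE)])
print('mean absolute error over %d test rows: %.2f' % (TEST_DATA_SIZE, mae))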
In [105]:
# @1.2.0 tf.feature_column.indicator_column -> tf.contrib.layers.one_hot_column(tf.contrib.layers.sparse_column_with_keys(...))
dnn_features = [
    # numerical features
    length, width, height, horsepower,
    # densify categorical features (a tiny conceptual sketch follows this cell):
    one_hot_column(make),
    one_hot_column(num_of_doors),
]
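As a tiny plain-Python illustration (not TensorFlow code) of what "densifying" a categorical feature means here, the hypothetical helper below builds the one-hot vector by hand for num-of-doors with keys=['two', 'four']:
In [ ]:
# Illustration only: TensorFlow's one_hot_column builds the equivalent dense
# tensor internally from the sparse column; this helper is purely conceptual.
def conceptual_one_hot(value, keys):
    return [1.0 if value == k else 0.0 for k in keys]

print(conceptual_one_hot('four', ['two', 'four']))  # [0.0, 1.0]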
In [109]:
dnnregressor = tf.contrib.learn.DNNRegressor(feature_columns=dnn_features,
                                             hidden_units=[50, 30, 10],
                                             model_dir='tensorboard/DNN_regressor/')
In [116]:
dnnregressor.fit(input_fn=training_input_fn, steps=10000)
Out[116]:
In [117]:
dnnregressor.evaluate(input_fn=eval_input_fn)
Out[117]:
In [118]:
preds = list(dnnregressor.predict(input_fn=test_input_fn))
for i in range(TEST_DATA_SIZE):
    print('prediction:', preds[i], 'real value:', test_label.iloc[i])
In [82]:
# @1.2.0 experiment_fn(run_config, params) -> experiment_fn(output_dir)
def experiment_fn(output_dir):
    # This function makes an Experiment, containing an Estimator and inputs for training and evaluation.
    # You can use params and config here to customize the Estimator depending on the cluster or to use
    # hyperparameter tuning.
    # Collect information for training
    # @1.2.0 config=run_config -> ''
    return tf.contrib.learn.Experiment(
        estimator=tf.contrib.learn.LinearRegressor(feature_columns=linear_features,
                                                   model_dir=output_dir),
        train_input_fn=training_input_fn,
        train_steps=10000,
        eval_input_fn=eval_input_fn)
In [83]:
import shutil
# @1.2.0 learn_runner.run(experiment_fn, run_config=tf.contrib.learn.RunConfig(model_dir='/tmp/output_dir'))
#        -> tf.contrib.learn.python.learn.learn_runner.run(experiment_fn, output_dir='/tmp/output_dir')
shutil.rmtree("/tmp/output_dir", ignore_errors=True)
from tensorflow.contrib.learn.python.learn import learn_runner
learn_runner.run(experiment_fn, output_dir='/tmp/output_dir')
Out[83]: