Let's try applying some deep learning to the Housing Prices dataset in the hope of saving some time on feature extraction.


In [1]:
import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

from keras.layers import Input, Dense, Dropout, Activation
from keras.models import Model

from hyperopt import Trials, STATUS_OK, tpe
from hyperas import optim
from hyperas.distributions import choice, uniform


Using TensorFlow backend.

In [90]:
def preprocess_data(test_data=False):
    def encode_one_categorical_feature(column):
        # Label-encode, then one-hot encode, a single categorical column,
        # treating missing values as their own 'unk' category.
        le = LabelEncoder()
        ohe = OneHotEncoder(sparse=False)
        num_encoded = le.fit_transform(column.fillna('unk'))
        oh_encoded = ohe.fit_transform(num_encoded.reshape(-1, 1))
        return oh_encoded

    # Note: both branches read the training CSV; test_data=True only skips
    # the target and the train/test split.
    data = pd.read_csv('data/train.csv')
    target = ['SalePrice']
    features = data.drop(['Id'] + target, axis=1).columns
    
    dataset_types = pd.DataFrame(data[features].dtypes, columns=['datatype'])
    dataset_types.reset_index(inplace=True)
    dataset_types = dataset_types.rename(columns={"index": "feature"})

    # Numeric columns: impute missing values with the column mean, then standardize.
    numeric_features = dataset_types.feature[(dataset_types.datatype == 'float64') | (dataset_types.datatype == 'int64')]
    num_data = data[numeric_features]
    num_features = num_data.fillna(num_data.mean()).values
    scaler = StandardScaler()
    num_features_scaled = scaler.fit_transform(num_features)

    # Categorical columns: one-hot encode each and stack the results column-wise.
    categorical_features = dataset_types.feature[dataset_types.datatype == 'object']
    cat_data = data[categorical_features]
    cat_features = np.hstack([encode_one_categorical_feature(data[column]) for column in cat_data.columns])
    
    print("Of the {} features in this dataset".format(len(data.columns)))
    print("{} features are numeric".format(len(numeric_features)))
    print("and {} features are categorical.".format(len(categorical_features)))
    print("The last two are the target, which is numeric, and the id column.")
    
    X = np.hstack((num_features_scaled, cat_features))
    if test_data:
        return X
    y = data[target].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=606)
    return X_train, X_test, y_train, y_test

def plot_history(history):
    plt.plot(history.history['loss'], 'b')
    plt.plot(history.history['val_loss'], 'r')
    plt.title('model loss')
    plt.ylabel('loss') 
    plt.xlabel('epoch') 
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()
    
def keras_model(X_train, X_test, y_train, y_test):
    NUM_EPOCHS = 50
    BATCH_SIZE = 128
    
    # Infer the input width from the data instead of hard-coding it (304 here).
    inputs = Input(shape=(X_train.shape[1],))
    x = Dropout(0.2)(inputs)

    x = Dense(256)(x)
    x = Activation("relu")(x)
    x = Dropout(0.2)(x)

    # Five more identical hidden blocks with heavier dropout.
    for _ in range(5):
        x = Dense(256)(x)
        x = Activation("relu")(x)
        x = Dropout(0.4)(x)

    # Single linear output unit for the regression target.
    predictions = Dense(1)(x)

    model = Model(inputs=[inputs], outputs=[predictions])

    model.compile(loss="mse", optimizer="adam")
    history = model.fit(X_train, y_train, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS, validation_split=0.2, verbose=0)
    
    plot_history(history)
    
    score = model.evaluate(X_test, y_test, verbose=0)
    print("Test MSE is {:.2e}".format(score))
    return history, model

In [91]:
X_train, X_test, y_train, y_test = preprocess_data()


Of the 81 columns in this dataset
36 features are numeric
and 43 features are categorical.
The remaining two columns are the target, which is numeric, and the Id column.

Optimizing our model using Bayesian Optimization!

It's not reflected in this notebook, but my workflow for Keras has recently transitioned to primarily using Hyperas for hyperparameter optimization.
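
It's also not run here, but a minimal Hyperas setup for this model could look like the sketch below. The double-brace templates are the search spaces Hyperas substitutes before handing the objective to hyperopt's TPE algorithm; the specific choices and max_evals are illustrative values, not tuned ones.

# A minimal Hyperas sketch (illustrative, not the run used in this notebook).
def data():
    # Must return exactly the arguments the model function accepts.
    return preprocess_data()

def hyperas_model(X_train, X_test, y_train, y_test):
    inputs = Input(shape=(X_train.shape[1],))
    x = Dropout({{uniform(0, 0.5)}})(inputs)
    x = Dense({{choice([128, 256, 512])}})(x)
    x = Activation("relu")(x)
    x = Dropout({{uniform(0, 0.5)}})(x)
    predictions = Dense(1)(x)
    model = Model(inputs=[inputs], outputs=[predictions])
    model.compile(loss="mse", optimizer="adam")
    model.fit(X_train, y_train, batch_size=128, epochs=50,
              validation_split=0.2, verbose=0)
    val_loss = model.evaluate(X_test, y_test, verbose=0)
    # hyperopt minimizes 'loss', so hand it the held-out MSE.
    return {'loss': val_loss, 'status': STATUS_OK, 'model': model}

# In a notebook, optim.minimize may also need notebook_name='...' so
# Hyperas can find the function source.
best_run, best_model = optim.minimize(model=hyperas_model, data=data,
                                      algo=tpe.suggest, max_evals=10,
                                      trials=Trials())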


In [92]:
history, model = keras_model(X_train, X_test, y_train, y_test)


Test MSE is 1.04e+09

In [93]:
predicted = model.predict(X_test)

In [94]:
plt.plot(y_test - predicted)  # residuals on the held-out test set


Out[94]:
[<matplotlib.lines.Line2D at 0x1b62738a128>]

In [17]:
test_data = preprocess_data(test_data=True)


Of the 81 columns in this dataset
36 features are numeric
and 43 features are categorical.
The remaining two columns are the target, which is numeric, and the Id column.

Improving on our results using an autoencoder
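
The notebook stops here, but the plan would be to train an autoencoder on the standardized features and feed its compressed code to the regressor in place of the raw input. The cell below is a minimal sketch of that idea, assuming a single 64-unit bottleneck (an illustrative size, not a tuned one).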


In [ ]:
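# A minimal autoencoder sketch (assumed architecture, not from the
# original notebook): compress the inputs to a small code, then
# reconstruct them.
encoding_dim = 64  # illustrative bottleneck size

ae_inputs = Input(shape=(X_train.shape[1],))
encoded = Dense(encoding_dim, activation="relu")(ae_inputs)
decoded = Dense(X_train.shape[1], activation="linear")(encoded)

autoencoder = Model(inputs=[ae_inputs], outputs=[decoded])
autoencoder.compile(loss="mse", optimizer="adam")
# The autoencoder learns to reproduce its own input.
autoencoder.fit(X_train, X_train, batch_size=128, epochs=50,
                validation_split=0.2, verbose=0)

# The encoder half maps raw features to the learned representation,
# which could then replace X_train / X_test in keras_model.
encoder = Model(inputs=[ae_inputs], outputs=[encoded])
X_train_encoded = encoder.predict(X_train)
X_test_encoded = encoder.predict(X_test)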