In [ ]:
import itertools
import os
import sys

import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smapi

import sklearn as sk
import sklearn.preprocessing
import sklearn.model_selection
import sklearn.base 

sys.path.insert(1, os.path.join(sys.path[0], '..'))
import samlib
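
Restore variables (such as key_features) saved with the %store magic in an earlier notebook.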

In [ ]:
%store -r

In [ ]:
key_features

Load numerical data

This dataset was generated in the notebook data_exploration_numerical_features.ipynb.


In [ ]:
dfnum_t2 = pd.read_csv('transformed_dataset_dfnum_t2.csv', index_col=['Dataset','Id'])

In [ ]:
dfnum_t2.head()

In [ ]:
dfnum_t2.tail()

Recreate transformed (standardized) sale price

The sale price is divided by 1000, log-transformed with log1p, then standardized to zero mean and unit variance; the inverse transform undoes these steps in reverse order.


In [ ]:
target = pd.read_csv('../data/train_target.csv')

In [ ]:
scaler = sk.preprocessing.StandardScaler()

def standardize(df):
    # Standardize each column to zero mean and unit variance
    _values = sk.preprocessing.StandardScaler().fit_transform(df)
    return pd.DataFrame(data=_values, columns=df.columns)


def transform_target(target):
    # Map the sale price onto a standardized log scale
    logtarget = np.log1p(target / 1000)
    return scaler.fit_transform(logtarget)


def inverse_transform_target(target_t):
    # Invert transform_target, recovering prices in dollars
    logtarget = scaler.inverse_transform(target_t)
    return np.expm1(logtarget) * 1000


target_t = transform_target(target)

In [ ]:
# Test
# Round-trip test: the inverse transform should recover the original prices
# (np.allclose compares element-wise and tolerates floating-point error)
assert np.allclose(target, inverse_transform_target(target_t))

Ordinary Least Squares model with key features

We're left with 22 features. The first 4 should all be highly correlated with the price.


In [ ]:
data = dfnum_t2.loc['train',:].copy()
data['SalePrice'] = target_t

In [ ]:
fig, axes = plt.subplots(2, 2, figsize=(10, 10))
for feature, ax in zip(key_features[:4], axes.flat):
    ax.plot(data[feature], data['SalePrice'], 'o')
    ax.set(xlabel=feature, ylabel='SalePrice')

Let's build a simple linear regression model based on these 4 features.


In [ ]:
regression1 = smapi.ols("SalePrice ~ OverallQual + GrLivArea + GarageCars + GarageArea", data=data).fit()
regression1.summary()

R-squared comes out around 0.79, which is pretty good for a first try.
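
In passing, the headline numbers can be read directly off the fitted results object rather than from the summary table; these are standard attributes of statsmodels' regression results:


In [ ]:
# Individual statistics are exposed as attributes on the fitted results
print(regression1.rsquared)   # coefficient of determination
print(regression1.params)     # estimated coefficients, including the intercept

Let's see what happens if we include all our numerical features.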


In [ ]:
data.columns

Statsmodels' formula interface (patsy) requires column names to be valid Python identifiers, and names starting with a digit are not, so let's rename those columns first. (Patsy's Q() can also quote such names inside a formula, e.g. Q('1stFlrSF'), but renaming keeps the formula string simple.)


In [ ]:
def rename_columns(df):
    # DataFrame.rename with a columns mapping is the supported way to rename
    # columns (assigning to a Series' .name attribute has no effect on the frame)
    return df.rename(columns={'1stFlrSF': 'FirstFlrSF', '2ndFlrSF': 'SndFlrSF'})

data = rename_columns(data)

In [ ]:
data.columns

In [ ]:
# Iterating over a DataFrame yields its column names, so this builds a
# formula that includes every feature except the target
desc = 'SalePrice ~ ' + ' + '.join(data.drop('SalePrice', axis=1))
desc

As can be seen below, using all of our numerical features improves R-squared to 0.88, which is pretty good, though there is of course a risk of overfitting.


In [ ]:
regression2 = smapi.ols(desc, data=data).fit()
regression2.summary()
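
Since adding regressors can only increase plain R-squared, the adjusted R-squared is the fairer number to compare; it too is available directly on the results objects:


In [ ]:
# Adjusted R-squared penalizes the number of regressors
print(regression1.rsquared_adj, regression2.rsquared_adj)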

Cross validation


In [ ]:
def get_data(X, y):
    # Reattach the target so the formula can reference SalePrice by name
    df = X.copy()
    df['SalePrice'] = y
    return df

def ols1(X, y):
    # Unfitted OLS model on the four key features
    data = get_data(X, y)
    return smapi.ols("SalePrice ~ OverallQual + GrLivArea + GarageCars + GarageArea", data=data)

def ols2(X, y):
    # Unfitted OLS model on all numerical features
    data = get_data(X, y)
    return smapi.ols(desc, data=data)

Test the model

Use sklearn.model_selection.train_test_split to run some experiments and validate the models


In [ ]:
def rmse(prediction, exact):
    return np.mean((prediction - exact)**2.0)**0.5

def run_experiment(estimator, scoring=rmse):
    # Fit on a random train/test split and score on the held-out part
    Xtrain, Xtest, ytrain, ytest = sk.model_selection.train_test_split(
        data.drop('SalePrice', axis=1), data['SalePrice'])
    model = estimator(Xtrain, ytrain).fit()
    return scoring(model.predict(Xtest), ytest)

def cross_validate(estimator, cv=5):
    # Repeated random splits (Monte Carlo cross-validation) rather than k-fold
    return np.array([run_experiment(estimator) for _ in range(cv)])

for model in [ols1, ols2]:
    errors = cross_validate(model)
    print(errors, errors.mean())

Use sklearn.model_selection.cross_val_score to validate the models. Here samlib.Regressor wraps our model-building functions in sklearn's estimator interface; a sketch of what such a wrapper might look like follows.
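
A hypothetical sketch of a samlib.Regressor-style wrapper (the real implementation lives in samlib and may differ):


In [ ]:
# Hypothetical sketch only -- the actual wrapper used below is samlib.Regressor
class FormulaRegressor(sk.base.BaseEstimator, sk.base.RegressorMixin):
    def __init__(self, model_factory):
        self.model_factory = model_factory  # e.g. ols1 or ols2 above

    def fit(self, X, y):
        # Build and fit the statsmodels OLS model on the training fold
        self.results_ = self.model_factory(X, y).fit()
        return self

    def predict(self, X):
        return self.results_.predict(X)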


In [ ]:
for model in [ols1, ols2]:
    # cross_val_score returns the negated MSE for 'neg_mean_squared_error';
    # negate and take the square root to recover the RMSE
    rmse_scores = np.sqrt(-sk.model_selection.cross_val_score(
        samlib.Regressor(model), data.drop('SalePrice', axis=1), y=data['SalePrice'],
        scoring='neg_mean_squared_error', cv=5))
    print(rmse_scores, rmse_scores.mean())

Make a submission


In [ ]:
dfnum_t2 = rename_columns(dfnum_t2)
submission_t = regression2.predict(dfnum_t2.loc['test',:])

Transform the result back to dollar prices


In [ ]:
submission = inverse_transform_target(submission_t)
submission

In [ ]:
def save(filename, submission):
    df = pd.DataFrame(data={
            # Test-set Ids in the Kaggle House Prices data start at 1461
            "Id": np.arange(len(submission)) + 1461,
            "SalePrice": submission
            })
    df.to_csv(filename, index=False)

save('ols_key_numerical_features_only.csv', submission)

Regression interpretation

Statsmodels provides specialized plots for exploring the outcome of a regression model: http://statsmodels.sourceforge.net/devel/examples/notebooks/generated/example_regression_plots.html
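
As a sketch (assuming the regression1 results fitted above), plot_regress_exog draws fit and residual diagnostics for a single regressor: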


In [ ]:
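# Sketch: fit and residual diagnostics for one regressor
# (assumes the regression1 results fitted above)
import statsmodels.api as sm

fig = plt.figure(figsize=(12, 8))
sm.graphics.plot_regress_exog(regression1, 'GrLivArea', fig=fig)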