In [ ]:
import itertools
import os
import sys

import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smapi

import sklearn as sk
import sklearn.preprocessing
import sklearn.model_selection
import sklearn.base 

sys.path.insert(1, os.path.join(sys.path[0], '..'))
import samlib

In [ ]:
# Number of categorical features to include in the model.  dfcat's columns
# are sorted by "goodness" (per the CSV loaded below), so downstream cells
# take the first `ncat` of them.
ncat = 12

Load data

Generated in notebook data_exploration_numerical_features.ipynb


In [ ]:
dfnum = pd.read_csv('transformed_numerical_dataset_imputed.csv', index_col=['Dataset','Id'])

In [ ]:
dfnum.head()

In [ ]:
# Categorical features, columns sorted by "goodness" (best first, per the
# file name), indexed by (Dataset, Id) like the numerical frame.
dfcat = pd.read_csv('cleaned_categorical_vars_with_colz_sorted_by_goodness.csv', index_col=['Dataset','Id'])
dfcat.head()

In [ ]:
dfcat.head()

In [ ]:
# Combine all numerical features with the first `ncat` (best) categorical
# columns into the modeling frame.
df = pd.concat([dfnum, dfcat.iloc[:, :ncat]], axis=1)
df.shape

Recreate transformed (standardized) sale price


In [ ]:
target = pd.read_csv('../data/train_target.csv')

In [ ]:
# Module-level scaler: fitted by transform_target, reused by
# inverse_transform_target to map predictions back to the original scale.
scaler = sk.preprocessing.StandardScaler()

def transform_target(target):
    """Map sale prices to a standardized log scale: log1p(price / 1000), z-scored.

    Fits the module-level `scaler` as a side effect, so this must run
    before `inverse_transform_target` is used.
    """
    log_prices = np.log1p(target / 1000)
    return scaler.fit_transform(log_prices)


def inverse_transform_target(target_t):
    """Invert `transform_target`: undo the z-scoring, then expm1 and rescale by 1000."""
    log_prices = scaler.inverse_transform(target_t)
    return np.expm1(log_prices) * 1000


target_t = transform_target(target)

In [ ]:
# Test: the round trip must recover the original prices.
# NOTE: the previous `all(target == inverse_transform_target(target_t))`
# iterated over the DataFrame's column NAMES (truthy strings), so it was
# vacuously true; it would also be fragile under exact float comparison.
# Compare values with a float tolerance instead.
assert np.allclose(target, inverse_transform_target(target_t))

Ordinary Least Squares


In [ ]:
# Training rows only ('train' level of the Dataset index); attach the
# standardized target as a 'SalePrice' column for the formula API.
data = df.loc['train',:].copy()
data['SalePrice'] = target_t

In [ ]:
data.columns

In [ ]:
# Build the patsy formula: numeric columns enter directly; the last `ncat`
# columns are wrapped in C() so statsmodels treats them as categorical.
predictors = data.drop('SalePrice', axis=1)
numeric_terms = list(predictors.columns[:-ncat])
categorical_terms = ['C({})'.format(col) for col in predictors.columns[-ncat:]]
desc = 'SalePrice ~ ' + ' + '.join(numeric_terms + categorical_terms)
desc

As can be seen below, including more of the numerical features improves the R-squared to 0.88, which is pretty good, though there is of course a risk of overfitting.


In [ ]:
# Fit OLS on the formula built above; summary() reports R-squared,
# coefficients and p-values for every term.
regression2 = smapi.ols(desc, data=data).fit()
regression2.summary()

Cross validation


In [ ]:
def get_data(X, y):
    """Return a copy of X with y attached as a 'SalePrice' column.

    X is left unmodified; the target is added to the copy only.
    """
    combined = X.copy()
    combined['SalePrice'] = y
    return combined

def ols3(X, y):
    """Build (unfitted) an OLS model for the module-level formula `desc` on (X, y)."""
    return smapi.ols(desc, data=get_data(X, y))

Make a submission


In [ ]:
submission_t = regression2.predict(df.loc['test',:])

Scale the result


In [ ]:
# Map the standardized predictions back to the original price scale.
submission = inverse_transform_target(submission_t)
submission

In [ ]:
def save(filename, submission, start_id=1461):
    """Write predictions to a Kaggle-style submission CSV with Id/SalePrice columns.

    Parameters
    ----------
    filename : str
        Output CSV path.
    submission : array-like
        Predicted sale prices, ordered to match the test set.
    start_id : int, optional
        Id of the first test-set row.  Defaults to 1461, the first Id of
        the House Prices test set (generalized from the old hard-coded value).
    """
    df = pd.DataFrame(data={
            "Id": np.arange(len(submission)) + start_id,
            "SalePrice": submission
            })
    df.to_csv(filename, index=False)
    
save('ols_full_{}.csv'.format(ncat), submission)

Regression interpretation

Statsmodels provides special plots for exploring the outcome of a regression model: http://statsmodels.sourceforge.net/devel/examples/notebooks/generated/example_regression_plots.html


In [ ]: