Exploratory Data Analysis of Kaggle competition House Prices: Advanced Regression Techniques

Competition page

Original kernel


In [1]:
# Import needed packages
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import scipy.stats as stats
import sklearn.linear_model as linear_model
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import KFold
from IPython.display import HTML, display
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

pd.options.display.max_rows = 1000
pd.options.display.max_columns = 20
sns.set()
pd.set_option('display.float_format', lambda x: '%.3f' % x)



A. Exploratory Data Analysis


In [44]:
# Load train.csv and test.csv

train = pd.read_csv("./input/train.csv")
test = pd.read_csv("./input/test.csv")

In [3]:
train.head()
test.head()


Out[3]:
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice
0 1 60 RL 65.0 8450 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 2 2008 WD Normal 208500
1 2 20 RL 80.0 9600 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 5 2007 WD Normal 181500
2 3 60 RL 68.0 11250 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 9 2008 WD Normal 223500
3 4 70 RL 60.0 9550 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 2 2006 WD Abnorml 140000
4 5 60 RL 84.0 14260 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 12 2008 WD Normal 250000

5 rows × 81 columns

Out[3]:
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities ... ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition
0 1461 20 RH 80.0 11622 Pave NaN Reg Lvl AllPub ... 120 0 NaN MnPrv NaN 0 6 2010 WD Normal
1 1462 20 RL 81.0 14267 Pave NaN IR1 Lvl AllPub ... 0 0 NaN NaN Gar2 12500 6 2010 WD Normal
2 1463 60 RL 74.0 13830 Pave NaN IR1 Lvl AllPub ... 0 0 NaN MnPrv NaN 0 3 2010 WD Normal
3 1464 60 RL 78.0 9978 Pave NaN IR1 Lvl AllPub ... 0 0 NaN NaN NaN 0 6 2010 WD Normal
4 1465 120 RL 43.0 5005 Pave NaN IR1 HLS AllPub ... 144 0 NaN NaN NaN 0 1 2010 WD Normal

5 rows × 80 columns


In [38]:
# Select quantitative and qualitative features

quantitative = [f for f in train.columns if train.dtypes[f] != 'object']
quantitative.remove('SalePrice')
quantitative.remove('Id')
qualitative = [f for f in train.columns if train.dtypes[f] == 'object']

In [39]:
print("Number of quantitative features:", len(quantitative))
print("Number of qualitative features:", len(qualitative))


Number of quantitative features: 36
Number of qualitative features: 43

In [40]:
quantitative
qualitative


Out[40]:
['MSSubClass',
 'LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold']
Out[40]:
['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

In [7]:
# Explore datasets' sizes
print("# of samples in training set:", len(train))
print("# of samples in test set:", len(test))


# of samples in training set: 1460
# of samples in test set: 1459

In [8]:
# Explore features with missing values
missing = train.isnull().sum()
missing = missing[missing > 0]
missing.sort_values(inplace=True, ascending=False)
missing.plot.bar()


[Figure: bar chart of missing-value counts per feature]

In [9]:
# Explore distribution of the target, SalePrice
import scipy.stats as st
y = train['SalePrice']
plt.figure(1)
plt.title('Johnson SU')
sns.distplot(y, kde=False, fit=st.johnsonsu)
plt.figure(2)
plt.title('Normal')
sns.distplot(y, kde=False, fit=st.norm)
plt.figure(3)
plt.title('Log Normal')
sns.distplot(y, kde=False, fit=st.lognorm)


[Figures: SalePrice histograms with fitted Johnson SU, Normal, and Log Normal curves]

It appears that the Johnson SU distribution provides the best fit for SalePrice.
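
As a quick numeric check of this visual impression (a minimal sketch, not part of the original kernel), the three fitted distributions can also be compared via their Kolmogorov-Smirnov statistics; a smaller statistic indicates a closer fit.

# Sketch: compare goodness of fit with the Kolmogorov-Smirnov statistic (smaller = better)
for name, dist in [('Johnson SU', st.johnsonsu), ('Normal', st.norm),
                   ('Log Normal', st.lognorm)]:
    params = dist.fit(y)  # fit distribution parameters to SalePrice
    ks_stat, _ = st.kstest(y, dist.name, args=params)
    print('%-10s KS statistic: %.4f' % (name, ks_stat))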

Quantitative Features


In [10]:
# Test the quantitative variables for normality, using Shapiro Wilk, with alpha=0.01
# https://en.wikipedia.org/wiki/Shapiro%E2%80%93Wilk_test

test_normality = lambda x: stats.shapiro(x.fillna(0))[1] < 0.01
normal = pd.DataFrame(train[quantitative])
normal = normal.apply(test_normality)

# True = reject null hypothesis (i.e., NOT normally distributed)
print(normal)


MSSubClass       True
LotFrontage      True
LotArea          True
OverallQual      True
OverallCond      True
YearBuilt        True
YearRemodAdd     True
MasVnrArea       True
BsmtFinSF1       True
BsmtFinSF2       True
BsmtUnfSF        True
TotalBsmtSF      True
1stFlrSF         True
2ndFlrSF         True
LowQualFinSF     True
GrLivArea        True
BsmtFullBath     True
BsmtHalfBath     True
FullBath         True
HalfBath         True
BedroomAbvGr     True
KitchenAbvGr     True
TotRmsAbvGrd     True
Fireplaces       True
GarageYrBlt      True
GarageCars       True
GarageArea       True
WoodDeckSF       True
OpenPorchSF      True
EnclosedPorch    True
3SsnPorch        True
ScreenPorch      True
PoolArea         True
MiscVal          True
MoSold           True
YrSold           True
dtype: bool

None of the quantitative variables follows a normal distribution either, so these should be transformed as well.


In [11]:
f = pd.melt(train, value_vars=quantitative)
g = sns.FacetGrid(f, col="variable",  col_wrap=2, sharex=False, sharey=False)
g = g.map(sns.distplot, "value")


Good candidates for log transformation (see the skewness sketch after this list):

  • LotFrontage
  • LotArea
  • TotalBsmtSF
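
A minimal sketch (not part of the original kernel) of how the effect of a log transform on these three candidates could be checked numerically, by comparing their skewness before and after log1p:

from scipy.stats import skew

# Skewness before and after log1p for the three candidates listed above
for col in ['LotFrontage', 'LotArea', 'TotalBsmtSF']:
    vals = train[col].dropna()
    print('%s: skew %.2f -> %.2f after log1p' % (col, skew(vals), skew(np.log1p(vals))))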

Categorical variables


In [12]:
# Convert categorical features to dtype `category` and plot SalePrice by category level
for c in qualitative:
    train[c] = train[c].astype('category')
    
def boxplot(x, y, **kwargs):
    sns.boxplot(x=x, y=y)
    x=plt.xticks(rotation=90)

f = pd.melt(train, id_vars=['SalePrice'], value_vars=qualitative)
g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False)
g.map(boxplot, "value", "SalePrice")


[Figure: boxplots of SalePrice by level for each categorical feature]

(Below) For each categorical variable, SalePrice is partitioned into groups according to the variable's levels, and a one-way ANOVA tests whether those groups share the same mean. If the variable has little influence on SalePrice, the group means should be roughly equal; the smaller the p-value, the greater the disparity between the partitions.


In [13]:
# Apply one-way ANOVA. For each feature, null hypothesis = no difference between mean SalePrice
# for each level
def anova(frame):
    anv = pd.DataFrame()
    anv['feature'] = qualitative
    pvals = []
    for c in qualitative:
        samples = []
        for cls in frame[c].unique():
            s = frame[frame[c] == cls]['SalePrice'].values
            samples.append(s)
        
        pval = stats.f_oneway(*samples)[1]
        pvals.append(pval)
    anv['pval'] = pvals
    return anv.sort_values('pval')

a = anova(train)
a['disparity'] = np.log(1./a['pval'].values)
plt.figure(figsize=(10, 6))
sns.barplot(data=a, x='feature', y='disparity')
x=plt.xticks(rotation=90)


[Figure: bar chart of categorical features ranked by disparity, i.e. log(1/pval)]

In [14]:
# Encode categorical variables. For each categorical variable, order each level based on the mean SalePrice, 
# from lowest to highest. Then encode each level from 1 to N, and store the encoding in `feature`_E
def encode(frame, feature):
    ordering = pd.DataFrame()
    ordering['val'] = frame[feature].unique()
    ordering.index = ordering.val
    ordering['spmean'] = frame[[feature, 'SalePrice']].groupby(feature).mean()['SalePrice']
    ordering = ordering.sort_values('spmean')
    ordering['ordering'] = range(1, ordering.shape[0]+1)
    ordering = ordering['ordering'].to_dict()
    
    for cat, o in ordering.items():
        frame.loc[frame[feature] == cat, feature+'_E'] = o
    
qual_encoded = []
for q in qualitative:  
    encode(train, q)
    qual_encoded.append(q+'_E')
print(qual_encoded)


['MSZoning_E', 'Street_E', 'Alley_E', 'LotShape_E', 'LandContour_E', 'Utilities_E', 'LotConfig_E', 'LandSlope_E', 'Neighborhood_E', 'Condition1_E', 'Condition2_E', 'BldgType_E', 'HouseStyle_E', 'RoofStyle_E', 'RoofMatl_E', 'Exterior1st_E', 'Exterior2nd_E', 'MasVnrType_E', 'ExterQual_E', 'ExterCond_E', 'Foundation_E', 'BsmtQual_E', 'BsmtCond_E', 'BsmtExposure_E', 'BsmtFinType1_E', 'BsmtFinType2_E', 'Heating_E', 'HeatingQC_E', 'CentralAir_E', 'Electrical_E', 'KitchenQual_E', 'Functional_E', 'FireplaceQu_E', 'GarageType_E', 'GarageFinish_E', 'GarageQual_E', 'GarageCond_E', 'PavedDrive_E', 'PoolQC_E', 'Fence_E', 'MiscFeature_E', 'SaleType_E', 'SaleCondition_E']

In [15]:
train.head()


Out[15]:
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities ... GarageType_E GarageFinish_E GarageQual_E GarageCond_E PavedDrive_E PoolQC_E Fence_E MiscFeature_E SaleType_E SaleCondition_E
0 1 60 RL 65.0 8450 Pave NaN Reg Lvl AllPub ... 5.0 2.0 3.0 5.0 3.0 NaN NaN NaN 5.0 5.0
1 2 20 RL 80.0 9600 Pave NaN Reg Lvl AllPub ... 5.0 2.0 3.0 5.0 3.0 NaN NaN NaN 5.0 5.0
2 3 60 RL 68.0 11250 Pave NaN IR1 Lvl AllPub ... 5.0 2.0 3.0 5.0 3.0 NaN NaN NaN 5.0 5.0
3 4 70 RL 60.0 9550 Pave NaN IR1 Lvl AllPub ... 2.0 1.0 3.0 5.0 3.0 NaN NaN NaN 5.0 2.0
4 5 60 RL 84.0 14260 Pave NaN IR1 Lvl AllPub ... 5.0 2.0 3.0 5.0 3.0 NaN NaN NaN 5.0 5.0

5 rows × 124 columns

Correlations

Generally, to reduce confounding, a regression model should only include variables that are correlated with SalePrice but not strongly correlated with each other.
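
A hedged sketch (not in the original kernel) of how strongly inter-correlated predictors could be flagged before modelling; the 0.8 threshold is arbitrary.

# Flag pairs of features whose absolute Spearman correlation exceeds 0.8
feats = quantitative + qual_encoded
corr_matrix = train[feats].corr(method='spearman')
pairs = [(feats[i], feats[j], corr_matrix.iloc[i, j])
         for i in range(len(feats)) for j in range(i + 1, len(feats))
         if abs(corr_matrix.iloc[i, j]) > 0.8]
for f1, f2, c in sorted(pairs, key=lambda t: -abs(t[2])):
    print('%s ~ %s: %.2f' % (f1, f2, c))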


In [16]:
# Calculate the Spearman correlation between SalePrice and each variable
def spearman(frame, features):
    spr = pd.DataFrame()
    spr['feature'] = features
    spr['spearman'] = [
        frame[f].corr(frame['SalePrice'], 'spearman') for f in features
    ]
    spr = spr.sort_values('spearman')
    plt.figure(figsize=(6, 0.25 * len(features)))
    sns.barplot(data=spr, y='feature', x='spearman', orient='h')


features = quantitative + qual_encoded
spearman(train, features)


OverallQual and Neighborhood (encoded) have the strongest positive Spearman correlations with SalePrice.
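
To substantiate this, here is a small sketch (not part of the original kernel) that prints the ten features with the highest Spearman correlation with SalePrice:

# Numeric view of the strongest Spearman correlations with the target
spr = pd.Series({f: train[f].corr(train['SalePrice'], 'spearman')
                 for f in quantitative + qual_encoded})
print(spr.sort_values(ascending=False).head(10))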


In [17]:
# Correlation between quantitative variables
plt.figure(1)
corr = train[quantitative + ['SalePrice']].corr()
sns.heatmap(corr, center=0., cmap="coolwarm")

# Correlation between qualitative variables
plt.figure(2)
corr = train[qual_encoded + ['SalePrice']].corr()
sns.heatmap(corr, center=0., cmap="coolwarm")

# Correlation between quantitative and qualitative variables

plt.figure(3)
corr = pd.DataFrame(
    np.zeros([len(quantitative) + 1,
              len(qual_encoded) + 1]),
    index=quantitative + ['SalePrice'],
    columns=qual_encoded + ['SalePrice'])
for q1 in quantitative + ['SalePrice']:
    for q2 in qual_encoded + ['SalePrice']:
        corr.loc[q1, q2] = train[q1].corr(train[q2])
sns.heatmap(corr, center=0., cmap="coolwarm")


[Figures: correlation heatmaps for quantitative features, for encoded qualitative features, and between quantitative and encoded qualitative features]

In [18]:
def pairplot(x, y, **kwargs):
    ax = plt.gca()
    ts = pd.DataFrame({'time':x, 'val':y})
    ts=ts.groupby('time').mean()
    ts.plot(ax=ax)
    plt.xticks(rotation=90)
    
f = pd.melt(train, id_vars=['SalePrice'], value_vars=quantitative+qual_encoded)
g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False, size=5)
g = g.map(pairplot, "value", "SalePrice")


Clustering


In [19]:
features = quantitative + qual_encoded
model = TSNE(n_components=2, random_state=0, perplexity=50)
X = train[features].fillna(0.).values
tsne = model.fit_transform(X)

std = StandardScaler()
s = std.fit_transform(X)
pca = PCA(n_components=30)
pca.fit(s)
pc = pca.transform(s)
kmeans = KMeans(n_clusters=5)
kmeans.fit(pc)

fr = pd.DataFrame({'tsne1': tsne[:,0], 'tsne2': tsne[:, 1], 'cluster': kmeans.labels_})
sns.lmplot(data=fr, x='tsne1', y='tsne2', hue='cluster', fit_reg=False)
print(np.sum(pca.explained_variance_ratio_))


[Figure: t-SNE embedding of the standardized features, colored by KMeans cluster]
Total explained variance of the first 30 principal components: 0.754861813003

Set up for learning models


In [2]:
# Import sklearn
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.metrics import mean_squared_error, make_scorer

In [3]:
# Define error measure for official scoring: RMSE
# These helpers rely on the X_train / y_train and X_val / y_val splits defined further below
scorer = make_scorer(mean_squared_error, greater_is_better=False)


def rmse_cv_train(model):
    # cross_val_score yields negative MSE (greater_is_better=False), so negate before sqrt
    rmse = np.sqrt(-cross_val_score(
        model, X_train, y_train, scoring=scorer, cv=10))
    return rmse


def rmse_cv_val(model):
    rmse = np.sqrt(-cross_val_score(
        model, X_val, y_val, scoring=scorer, cv=10))
    return rmse


def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

Set up reusable data preprocessing pipeline


In [12]:
from scipy.stats import skew


class DataPreprocessor():
    """This class encapsulates the transformations that got applied to the dfing dataset. 
    It provides fit, transform, and fit_transform methods that can be used to prepare the test dataset for evaluation
    """

    def __init__(self):
        pass

    def fit(self, df):
        self._fit_transform(df)

    def fit_transform(self, df):
        return self._fit_transform(df)

    def transform(self, df):
        return self._fit_transform(df, transform_only=True)

    def _fit_transform(self, df, transform_only=False):

        df = df.copy()

        # Step 1: fill missing columns
        df = self.fill_missing_inplace(df)

        # Step 2: Replace numerical features that are really categories
        df.replace(
            {
                "MSSubClass": {
                    20: "SC20",
                    30: "SC30",
                    40: "SC40",
                    45: "SC45",
                    50: "SC50",
                    60: "SC60",
                    70: "SC70",
                    75: "SC75",
                    80: "SC80",
                    85: "SC85",
                    90: "SC90",
                    120: "SC120",
                    150: "SC150",
                    160: "SC160",
                    180: "SC180",
                    190: "SC190"
                },
                "MoSold": {
                    1: "Jan",
                    2: "Feb",
                    3: "Mar",
                    4: "Apr",
                    5: "May",
                    6: "Jun",
                    7: "Jul",
                    8: "Aug",
                    9: "Sep",
                    10: "Oct",
                    11: "Nov",
                    12: "Dec"
                }
            },
            inplace=True)

        # Step 3: Select numeric and categorical features
        if not transform_only:
            self.numerical_features, self.categorical_features = self.select_features(
                df)

        # Step 4: Encode categorical features based on level ordering

        # If fitting and transforming, then record the features encoded and the orderings
        if not transform_only:
            self.categorical_features_encoded = []
            self.categorical_features_orderings = dict()

        for feature in self.categorical_features:
            if not transform_only:
                self.categorical_features_orderings[feature] = self.encode_feature(
                    df, feature)
                self.categorical_features_encoded.append(feature + '_E')

            for cat, o in self.categorical_features_orderings[feature].items():
                df.loc[df[feature] == cat, feature + '_E'] = o
        # Drop the original categorical features now that they are encoded
        df = df.drop(self.categorical_features, axis=1)

        # Step 5: Add sqrt, squared, and cubed terms for the 10 numeric features
        # most correlated with SalePrice

        if not transform_only:
            corr = df.corr()
            corr.sort_values(["SalePrice"], ascending=False, inplace=True)

            # Select the top 10 features based on correlation with SalePrice
            i = 0
            self.features_to_expand = []
            for feature, _ in corr.SalePrice[1:].iteritems():
                if i >= 10:
                    break
                if "_E" not in feature:
                    self.features_to_expand.append(feature)
                    i += 1

        for feature in self.features_to_expand:
            df[feature + 'Sqrt'] = np.sqrt(df[feature])
            df[feature + '-2'] = df[feature]**2
            df[feature + '-3'] = df[feature]**3
            
            if not transform_only:
                self.numerical_features.extend(
                    [feature + 'Sqrt', feature + '-2', feature + '-3'])

        # Step 6: Transform skewed numerical features
        df_num = df[self.numerical_features].copy()

        if not transform_only:
            skewness = df_num.apply(lambda x: skew(x))
            skewness = skewness[abs(skewness) > 0.5]
            self.skewed_features = skewness.index

        df_num.loc[:, self.skewed_features] = np.log1p(
            df_num.loc[:, self.skewed_features])

        # Step 7: generate dummy variables for categorical features
        df_cat = df[self.categorical_features_encoded]
        df_cat_dummies = pd.get_dummies(
            df_cat, columns=self.categorical_features_encoded)
        
        if not transform_only:
            self.cat_dummy_columns = df_cat_dummies.columns
        
        # When only transforming, add any dummy columns seen during fit but missing here,
        # then drop/reorder so the columns match the training frame exactly
        else:
            for col in self.cat_dummy_columns:
                if col not in df_cat_dummies.columns:
                    df_cat_dummies[col] = 0
            df_cat_dummies = df_cat_dummies[self.cat_dummy_columns]
            

        print("df_num shape:", df_num.shape)
        print("df_cat shape:", df_cat.shape)
        print("df_cat_dummies shape:", df_cat_dummies.shape)

        # Step 8: Apply a standard scaler to the numerical features
        # (fit on the training data only; scale df_num, which is what gets returned)
        if not transform_only:
            self.stdSc = StandardScaler()
            self.stdSc.fit(df_num)
        df_num = pd.DataFrame(
            self.stdSc.transform(df_num),
            index=df_num.index,
            columns=df_num.columns)

        df_ = pd.concat([df_num, df_cat_dummies], axis=1)

        return df_

    def fill_missing_inplace(self, df_in):

        df = df_in.copy()
        # Handle missing values for features where median/mean or most common value doesn't make sense

        # Alley : data description says NA means "no alley access"
        df.loc[:, "Alley"] = df.loc[:, "Alley"].fillna("None")
        # BedroomAbvGr : NA most likely means 0
        df.loc[:, "BedroomAbvGr"] = df.loc[:, "BedroomAbvGr"].fillna(0)
        # BsmtQual etc : data description says NA for basement features is "no basement"
        df.loc[:, "BsmtQual"] = df.loc[:, "BsmtQual"].fillna("No")
        df.loc[:, "BsmtCond"] = df.loc[:, "BsmtCond"].fillna("No")
        df.loc[:, "BsmtExposure"] = df.loc[:, "BsmtExposure"].fillna("No")
        df.loc[:, "BsmtFinType1"] = df.loc[:, "BsmtFinType1"].fillna("No")
        df.loc[:, "BsmtFinType2"] = df.loc[:, "BsmtFinType2"].fillna("No")
        df.loc[:, "BsmtFullBath"] = df.loc[:, "BsmtFullBath"].fillna(0)
        df.loc[:, "BsmtHalfBath"] = df.loc[:, "BsmtHalfBath"].fillna(0)
        df.loc[:, "BsmtUnfSF"] = df.loc[:, "BsmtUnfSF"].fillna(0)
        df.loc[:, "TotalBsmtSF"] = df.loc[:, "TotalBsmtSF"].fillna(0)
        df.loc[:, "BsmtFinSF1"] = df.loc[:, "BsmtFinSF1"].fillna(0)
        df.loc[:, "BsmtFinSF2"] = df.loc[:, "BsmtFinSF2"].fillna(0)

        # CentralAir : NA most likely means No
        df.loc[:, "CentralAir"] = df.loc[:, "CentralAir"].fillna("N")
        # Condition : NA most likely means Normal
        df.loc[:, "Condition1"] = df.loc[:, "Condition1"].fillna("Norm")
        df.loc[:, "Condition2"] = df.loc[:, "Condition2"].fillna("Norm")

        # Electrical: Fill with the most common value
        df.loc[:, "Electrical"] = df.loc[:, 'Electrical'].fillna(
            df['Electrical'].value_counts().index[0])

        # EnclosedPorch : NA most likely means no enclosed porch
        df.loc[:, "EnclosedPorch"] = df.loc[:, "EnclosedPorch"].fillna(0)
        # External stuff : NA most likely means average
        df.loc[:, "ExterCond"] = df.loc[:, "ExterCond"].fillna("TA")
        df.loc[:, "ExterQual"] = df.loc[:, "ExterQual"].fillna("TA")

        # Exterior: NA = Other
        df.loc[:, "Exterior1st"] = df.loc[:, "Exterior1st"].fillna("Other")
        df.loc[:, "Exterior2nd"] = df.loc[:, "Exterior2nd"].fillna("Other")

        # Fence : data description says NA means "no fence"
        df.loc[:, "Fence"] = df.loc[:, "Fence"].fillna("No")
        # FireplaceQu : data description says NA means "no fireplace"
        df.loc[:, "FireplaceQu"] = df.loc[:, "FireplaceQu"].fillna("No")
        df.loc[:, "Fireplaces"] = df.loc[:, "Fireplaces"].fillna(0)
        # Functional : data description says NA means typical
        df.loc[:, "Functional"] = df.loc[:, "Functional"].fillna("Typ")
        # GarageType etc : data description says NA for garage features is "no garage"
        df.loc[:, "GarageType"] = df.loc[:, "GarageType"].fillna("No")
        df.loc[:, "GarageFinish"] = df.loc[:, "GarageFinish"].fillna("No")
        df.loc[:, "GarageQual"] = df.loc[:, "GarageQual"].fillna("No")
        df.loc[:, "GarageCond"] = df.loc[:, "GarageCond"].fillna("No")
        df.loc[:, "GarageArea"] = df.loc[:, "GarageArea"].fillna(0)
        df.loc[:, "GarageCars"] = df.loc[:, "GarageCars"].fillna(0)
        df.loc[:, "GarageYrBlt"] = df.loc[:, "GarageYrBlt"].fillna(0)

        # HalfBath : NA most likely means no half baths above grade
        df.loc[:, "HalfBath"] = df.loc[:, "HalfBath"].fillna(0)
        # HeatingQC : NA most likely means typical
        df.loc[:, "HeatingQC"] = df.loc[:, "HeatingQC"].fillna("TA")
        # KitchenAbvGr : NA most likely means 0
        df.loc[:, "KitchenAbvGr"] = df.loc[:, "KitchenAbvGr"].fillna(0)
        # KitchenQual : NA most likely means typical
        df.loc[:, "KitchenQual"] = df.loc[:, "KitchenQual"].fillna("TA")
        # LotFrontage : NA most likely means no lot frontage
        df.loc[:, "LotFrontage"] = df.loc[:, "LotFrontage"].fillna(0)
        # LotShape : NA most likely means regular
        df.loc[:, "LotShape"] = df.loc[:, "LotShape"].fillna("Reg")
        # MasVnrType : NA most likely means no veneer
        df.loc[:, "MasVnrType"] = df.loc[:, "MasVnrType"].fillna("None")
        df.loc[:, "MasVnrArea"] = df.loc[:, "MasVnrArea"].fillna(0)
        # MiscFeature : data description says NA means "no misc feature"
        df.loc[:, "MiscFeature"] = df.loc[:, "MiscFeature"].fillna("No")
        df.loc[:, "MiscVal"] = df.loc[:, "MiscVal"].fillna(0)

        # MSZoning: Fill with the most common value
        df.loc[:, "MSZoning"] = df.loc[:, 'MSZoning'].fillna(
            df['MSZoning'].value_counts().index[0])

        # OpenPorchSF : NA most likely means no open porch
        df.loc[:, "OpenPorchSF"] = df.loc[:, "OpenPorchSF"].fillna(0)
        # PavedDrive : NA most likely means not paved
        df.loc[:, "PavedDrive"] = df.loc[:, "PavedDrive"].fillna("N")
        # PoolQC : data description says NA means "no pool"
        df.loc[:, "PoolQC"] = df.loc[:, "PoolQC"].fillna("No")
        df.loc[:, "PoolArea"] = df.loc[:, "PoolArea"].fillna(0)
        # SaleCondition : NA most likely means normal sale
        df.loc[:, "SaleCondition"] = df.loc[:, "SaleCondition"].fillna(
            "Normal")

        # SaleType: NA most likely Oth
        df.loc[:, "SaleType"] = df.loc[:, "SaleType"].fillna("Oth")

        # ScreenPorch : NA most likely means no screen porch
        df.loc[:, "ScreenPorch"] = df.loc[:, "ScreenPorch"].fillna(0)
        # TotRmsAbvGrd : NA most likely means 0
        df.loc[:, "TotRmsAbvGrd"] = df.loc[:, "TotRmsAbvGrd"].fillna(0)
        # Utilities : NA most likely means all public utilities
        df.loc[:, "Utilities"] = df.loc[:, "Utilities"].fillna("AllPub")
        # WoodDeckSF : NA most likely means no wood deck
        df.loc[:, "WoodDeckSF"] = df.loc[:, "WoodDeckSF"].fillna(0)
        return df

    # Encode categorical variables. For each categorical variable, order the levels by mean SalePrice,
    # from lowest to highest, then map each level to its rank (0 to N-1); the encoding is stored in `feature`_E
    def encode_feature(self, frame, feature):
        ordering = pd.DataFrame()
        ordering['val'] = frame[feature].unique()
        ordering.index = ordering.val
        ordering['spmean'] = frame[[feature, 'SalePrice'
                                    ]].groupby(feature).mean()['SalePrice']
        ordering = ordering.sort_values('spmean')
        ordering['ordering'] = range(0, ordering.shape[0])
        ordering = ordering['ordering'].to_dict()

        return ordering

    def select_features(self, df):
        numerical_features = [
            f for f in df.columns if df.dtypes[f] != 'object'
        ]
        try:
            numerical_features.remove('SalePrice')
            numerical_features.remove('Id')
        except ValueError:
            # SalePrice / Id may not be present in every frame
            pass
        categorical_features = [
            f for f in df.columns if df.dtypes[f] == 'object'
        ]

        return numerical_features, categorical_features

Prepare training and test data


In [13]:
pp = DataPreprocessor()

In [14]:
# Reload train.csv and test.csv
train = pd.read_csv("./input/train.csv")
test = pd.read_csv("./input/test.csv")

In [15]:
train_transformed = pp.fit_transform(train)
train_transformed.sort_index(axis=1, inplace=True)


df_num shape: (1460, 64)
df_cat shape: (1460, 45)
df_cat_dummies shape: (1460, 292)

In [16]:
test_transformed = pp.transform(test)
test_transformed.sort_index(axis=1, inplace=True)


df_num shape: (1459, 64)
df_cat shape: (1459, 45)
df_cat_dummies shape: (1459, 292)
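
A small sanity check (sketch, not in the original notebook): after sorting, the training and test frames should expose exactly the same feature columns, otherwise fitting on one and predicting on the other would fail.

# Confirm that train and test expose identical feature columns after sorting
assert list(train_transformed.columns) == list(test_transformed.columns), \
    "train/test feature columns differ"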

In [18]:
y = np.log(train['SalePrice'])

Split into training and validation sets


In [19]:
# Partition the dataset in train + validation sets
X_train, X_val, y_train, y_val = train_test_split(train_transformed, y, test_size = 0.2, random_state = 0)
print("X_train : " + str(X_train.shape))
print("X_val : " + str(X_val.shape))
print("y_train : " + str(y_train.shape))
print("y_val : " + str(y_val.shape))


X_train : (1168, 356)
X_val : (292, 356)
y_train : (1168,)
y_val : (292,)
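
As a quick baseline sketch (not in the original notebook), the cross-validation helpers defined earlier can now be exercised, for example with a plain linear regression, before moving on to XGBoost:

# Baseline: 10-fold CV RMSE of a plain linear regression on the training split
lr = LinearRegression()
print("Linear regression CV RMSE: %.4f" % rmse_cv_train(lr).mean())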

Train with XGBoost


In [25]:
# XGBoost
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

fit_params = {'eval_metric': 'rmse'}

xgb_model = xgb.XGBRegressor()
clf = GridSearchCV(
    xgb_model, {
        'max_depth': [2, 3, 4, 5, 6],
        'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800]
    },
    fit_params=fit_params,
    verbose=1)

clf.fit(X_train, y_train)

print(clf.best_score_)
print(clf.best_params_)

y_train_pred = clf.predict(X_train)
y_val_pred = clf.predict(X_val)

# Look at predictions on training and validation set
print("RMSE on Training set :", rmse(y_train, y_train_pred))
print("RMSE on Validation set :", rmse(y_val, y_val_pred))

# Plot residuals
plt.scatter(
    y_train_pred,
    y_train_pred - y_train,
    c="blue",
    marker="s",
    label="Training data")
plt.scatter(
    y_val_pred,
    y_val_pred - y_val,
    c="lightgreen",
    marker="s",
    label="Validation data")
plt.title("Linear regression")
plt.xlabel("Predicted values")
plt.ylabel("Residuals")
plt.legend(loc="upper left")
plt.show()

# Plot predictions
plt.scatter(y_train_pred, y_train, c="blue", marker="s", label="Training data")
plt.scatter(
    y_val_pred, y_val, c="lightgreen", marker="s", label="Validation data")
plt.title("Linear regression")
plt.xlabel("Predicted values")
plt.ylabel("Real values")
plt.legend(loc="upper left")
plt.show()


Fitting 3 folds for each of 40 candidates, totalling 120 fits
[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:  1.3min finished
0.902625916044
{'n_estimators': 700, 'max_depth': 2}
RMSE on Training set : 0.0572210833068
RMSE on Validation set : 0.125626846211

In [21]:
clf.best_params_


Out[21]:
{'max_depth': 2, 'n_estimators': 700}

In [37]:
y_test_pred = clf.predict(test_transformed)

In [38]:
y_test_pred


Out[38]:
array([ 11.66674995,  12.00013351,  12.13452148, ...,  12.0263443 ,
        11.59327888,  12.29862595], dtype=float32)

In [49]:
test_SalePrice = pd.Series(np.exp(y_test_pred), index=test.index, name='SalePrice')

In [50]:
submission = pd.concat([test['Id'], test_SalePrice], axis=1)

In [53]:
submission.to_csv('./input/submission.csv', index=False)