In [1]:
# Import needed packages
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import sklearn.linear_model as linear_model
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import KFold
from IPython.display import HTML, display
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 20
sns.set()
pd.set_option('display.float_format', lambda x: '%.3f' % x)
In [44]:
# Load train and test.csv
train = pd.read_csv("./input/train.csv")
test = pd.read_csv("./input/test.csv")
In [3]:
train.head()
test.head()
Out[3]:
In [38]:
# Select quantitative and qualitative
quantitative = [f for f in train.columns if train.dtypes[f] != 'object']
quantitative.remove('SalePrice')
quantitative.remove('Id')
qualitative = [f for f in train.columns if train.dtypes[f] == 'object']
In [39]:
print("Number of quantitative features:", len(quantitative))
print("Number of qualitative features:", len(qualitative))
In [40]:
quantitative
qualitative
Out[40]:
In [7]:
# Explore datasets' sizes
print("# of samples in training set:", len(train))
print("# of samples in test set:", len(test))
In [8]:
# Explore features with missing values
missing = train.isnull().sum()
missing = missing[missing > 0]
missing.sort_values(inplace=True, ascending=False)
missing.plot.bar()
Out[8]:
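Before deciding on an imputation strategy, it can help to compare missingness across the training and test sets, since any fill rule has to cover both. A minimal sketch (assumes `train` and `test` are loaded as above; this check is not part of the original run):

missing_train = train.isnull().sum()
missing_test = test.isnull().sum()
missing_both = pd.concat([missing_train, missing_test], axis=1, keys=['train', 'test'])
# Keep only columns that are missing in at least one of the two sets
missing_both = missing_both[(missing_both > 0).any(axis=1)].sort_values('train', ascending=False)
print(missing_both)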
In [9]:
# Explore distribution of the target, SalePrice
import scipy.stats as st
y = train['SalePrice']
plt.figure(1)
plt.title('Johnson SU')
sns.distplot(y, kde=False, fit=st.johnsonsu)
plt.figure(2)
plt.title('Normal')
sns.distplot(y, kde=False, fit=st.norm)
plt.figure(3)
plt.title('Log Normal')
sns.distplot(y, kde=False, fit=st.lognorm)
Out[9]:
It appears that the Johnson SU distribution provides the best fit for SalePrice.
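To go beyond eyeballing the fitted curves, one option is to fit each candidate distribution's parameters and compare them with a Kolmogorov–Smirnov statistic (the choice of KS as the comparison metric is mine, not part of the original analysis):

# Sketch: compare candidate distributions by KS statistic (lower = better fit)
for dist in [st.johnsonsu, st.norm, st.lognorm]:
    params = dist.fit(y)
    ks_stat, ks_p = st.kstest(y, dist.name, args=params)
    print(dist.name, "KS statistic: %.4f" % ks_stat)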
In [10]:
# Test the quantitative variables for normality, using Shapiro Wilk, with alpha=0.01
# https://en.wikipedia.org/wiki/Shapiro%E2%80%93Wilk_test
test_normality = lambda x: stats.shapiro(x.fillna(0))[1] < 0.01
normal = pd.DataFrame(train[quantitative])
normal = normal.apply(test_normality)
# True = reject null hypothesis (i.e., NOT normally distributed)
print(normal)
None of the quantitative variables is normally distributed (every null hypothesis is rejected), so these should be transformed as well.
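As an illustration of why a transform helps, the sketch below applies log1p to one strongly skewed feature (GrLivArea, chosen here purely as an example) and re-runs the Shapiro–Wilk test:

# Illustrative only: Shapiro-Wilk p-value and skew before/after a log1p transform of GrLivArea
raw = train['GrLivArea'].fillna(0)
logged = np.log1p(raw)
print("raw   p-value: %.2e" % stats.shapiro(raw)[1])
print("log1p p-value: %.2e" % stats.shapiro(logged)[1])
print("skew before/after: %.2f / %.2f" % (raw.skew(), logged.skew()))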
In [11]:
f = pd.melt(train, value_vars=quantitative)
g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False)
g = g.map(sns.distplot, "value")
Many of these heavily skewed distributions are good candidates for a log transformation.
In [12]:
# Convert categorical features to dtype `category`
for c in qualitative:
    train[c] = train[c].astype('category')

def boxplot(x, y, **kwargs):
    sns.boxplot(x=x, y=y)
    x = plt.xticks(rotation=90)
f = pd.melt(train, id_vars=['SalePrice'], value_vars=qualitative)
g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False)
g.map(boxplot, "value", "SalePrice")
Out[12]:
(Below) For each categorical variable, SalePrice is partitioned into disjoint sets, one per category level. A one-way ANOVA test then checks whether those sets share the same mean. If a variable has little influence on price, the group means should be roughly equal; a decreasing p-value signals increasing disparity between the partitions.
In [13]:
# Apply one-way ANOVA. For each feature, null hypothesis = no difference between mean SalePrice
# for each level
def anova(frame):
    anv = pd.DataFrame()
    anv['feature'] = qualitative
    pvals = []
    for c in qualitative:
        samples = []
        for cls in frame[c].unique():
            s = frame[frame[c] == cls]['SalePrice'].values
            samples.append(s)
        pval = stats.f_oneway(*samples)[1]
        pvals.append(pval)
    anv['pval'] = pvals
    return anv.sort_values('pval')
a = anova(train)
a['disparity'] = np.log(1./a['pval'].values)
plt.figure(figsize=(10, 6))
sns.barplot(data=a, x='feature', y='disparity')
x=plt.xticks(rotation=90)
Out[13]:
In [14]:
# Encode categorical variables. For each categorical variable, order each level based on the mean SalePrice,
# from lowest to highest. Then encode each level from 1 to N, and store the encoding in `feature`_E
def encode(frame, feature):
    ordering = pd.DataFrame()
    ordering['val'] = frame[feature].unique()
    ordering.index = ordering.val
    ordering['spmean'] = frame[[feature, 'SalePrice']].groupby(feature).mean()['SalePrice']
    ordering = ordering.sort_values('spmean')
    ordering['ordering'] = range(1, ordering.shape[0] + 1)
    ordering = ordering['ordering'].to_dict()
    for cat, o in ordering.items():
        frame.loc[frame[feature] == cat, feature + '_E'] = o

qual_encoded = []
for q in qualitative:
    encode(train, q)
    qual_encoded.append(q + '_E')
print(qual_encoded)
In [15]:
train.head()
Out[15]:
In [16]:
# Calculate the Spearman correlation between SalePrice and each variable
def spearman(frame, features):
    spr = pd.DataFrame()
    spr['feature'] = features
    spr['spearman'] = [
        frame[f].corr(frame['SalePrice'], 'spearman') for f in features
    ]
    spr = spr.sort_values('spearman')
    plt.figure(figsize=(6, 0.25 * len(features)))
    sns.barplot(data=spr, y='feature', x='spearman', orient='h')
features = quantitative + qual_encoded
spearman(train, features)
OverallQual and the encoded Neighborhood have the strongest positive Spearman correlation with SalePrice.
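For a quick numeric shortlist rather than a plot, the same correlations can be ranked directly; a small sketch along the lines of the `spearman` helper above (not part of the original run):

# Sketch: top 10 features by absolute Spearman correlation with SalePrice
spr = pd.Series(
    {f: train[f].corr(train['SalePrice'], 'spearman') for f in features})
print(spr.reindex(spr.abs().sort_values(ascending=False).index).head(10))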
In [17]:
# Correlation between quantitative variables
plt.figure(1)
corr = train[quantitative + ['SalePrice']].corr()
sns.heatmap(corr, center=0., cmap="coolwarm")
# Correlation between qualitative variables
plt.figure(2)
corr = train[qual_encoded + ['SalePrice']].corr()
sns.heatmap(corr, center=0., cmap="coolwarm")
# Correlation between quantitative and qualitative variables
plt.figure(3)
corr = pd.DataFrame(
    np.zeros([len(quantitative) + 1, len(qual_encoded) + 1]),
    index=quantitative + ['SalePrice'],
    columns=qual_encoded + ['SalePrice'])
for q1 in quantitative + ['SalePrice']:
    for q2 in qual_encoded + ['SalePrice']:
        corr.loc[q1, q2] = train[q1].corr(train[q2])
sns.heatmap(corr, center=0., cmap="coolwarm")
Out[17]:
In [18]:
def pairplot(x, y, **kwargs):
    ax = plt.gca()
    ts = pd.DataFrame({'time': x, 'val': y})
    ts = ts.groupby('time').mean()
    ts.plot(ax=ax)
    plt.xticks(rotation=90)

f = pd.melt(train, id_vars=['SalePrice'], value_vars=quantitative + qual_encoded)
g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False, size=5)
g = g.map(pairplot, "value", "SalePrice")
In [19]:
features = quantitative + qual_encoded
model = TSNE(n_components=2, random_state=0, perplexity=50)
X = train[features].fillna(0.).values
tsne = model.fit_transform(X)
std = StandardScaler()
s = std.fit_transform(X)
pca = PCA(n_components=30)
pca.fit(s)
pc = pca.transform(s)
kmeans = KMeans(n_clusters=5)
kmeans.fit(pc)
fr = pd.DataFrame({'tsne1': tsne[:,0], 'tsne2': tsne[:, 1], 'cluster': kmeans.labels_})
sns.lmplot(data=fr, x='tsne1', y='tsne2', hue='cluster', fit_reg=False)
print(np.sum(pca.explained_variance_ratio_))
Out[19]:
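The printed value is the total variance captured by the 30 retained components. If you want to choose the component count from the data instead of fixing it at 30, a cumulative-variance check is one option (a sketch, not part of the original pipeline):

# Sketch: how many principal components are needed to explain 95% of the variance?
pca_full = PCA().fit(s)
cum_var = np.cumsum(pca_full.explained_variance_ratio_)
n_95 = np.argmax(cum_var >= 0.95) + 1
print("Components needed for 95% variance:", n_95)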
In [2]:
# Import sklearn
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.metrics import mean_squared_error, make_scorer
In [3]:
# Define the error measure used for official scoring: RMSE
scorer = make_scorer(mean_squared_error, greater_is_better=False)
def rmse_cv_train(model):
    # cross_val_score returns negated MSE (greater_is_better=False), so flip the sign before sqrt
    rmse = np.sqrt(-cross_val_score(
        model, X_train, y_train, scoring=scorer, cv=10))
    return rmse

def rmse_cv_test(model):
    # Expects X_test / y_test (e.g. a held-out set) to be defined before use
    rmse = np.sqrt(-cross_val_score(
        model, X_test, y_test, scoring=scorer, cv=10))
    return rmse

def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))
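These helpers assume the design matrices are already defined; once `X_train` and `y_train` exist (see the train/validation split further below), a quick linear baseline check might look like this (a sketch only, not part of the original scoring run):

# Sketch: cross-validated RMSE for a simple Ridge baseline, using the helper above
ridge = RidgeCV(alphas=[0.01, 0.1, 1.0, 10.0, 30.0])
print("Ridge CV RMSE: %.4f" % rmse_cv_train(ridge).mean())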
In [12]:
from scipy.stats import skew
class DataPreprocessor():
    """Encapsulates the transformations applied to the training dataset.

    Provides fit, transform, and fit_transform methods that can be used to
    prepare the test dataset for evaluation.
    """

    def __init__(self):
        pass

    def fit(self, df):
        self._fit_transform(df)

    def fit_transform(self, df):
        return self._fit_transform(df)

    def transform(self, df):
        return self._fit_transform(df, transform_only=True)

    def _fit_transform(self, df, transform_only=False):
        df = df.copy()
        # Step 1: Fill missing values
        df = self.fill_missing_inplace(df)
        # Step 2: Replace numerical features that are actually categories
        df.replace(
            {
                "MSSubClass": {
                    20: "SC20",
                    30: "SC30",
                    40: "SC40",
                    45: "SC45",
                    50: "SC50",
                    60: "SC60",
                    70: "SC70",
                    75: "SC75",
                    80: "SC80",
                    85: "SC85",
                    90: "SC90",
                    120: "SC120",
                    150: "SC150",
                    160: "SC160",
                    180: "SC180",
                    190: "SC190"
                },
                "MoSold": {
                    1: "Jan",
                    2: "Feb",
                    3: "Mar",
                    4: "Apr",
                    5: "May",
                    6: "Jun",
                    7: "Jul",
                    8: "Aug",
                    9: "Sep",
                    10: "Oct",
                    11: "Nov",
                    12: "Dec"
                }
            },
            inplace=True)
        # Step 3: Select numeric and categorical features
        if not transform_only:
            self.numerical_features, self.categorical_features = self.select_features(
                df)
        # Step 4: Encode categorical features based on level ordering.
        # When fitting, record the features encoded and their orderings.
        if not transform_only:
            self.categorical_features_encoded = []
            self.categorical_features_orderings = dict()
        for feature in self.categorical_features:
            if not transform_only:
                self.categorical_features_orderings[feature] = self.encode_feature(
                    df, feature)
                self.categorical_features_encoded.append(feature + '_E')
            for cat, o in self.categorical_features_orderings[feature].items():
                df.loc[df[feature] == cat, feature + '_E'] = o
        # Drop the original features after encoding
        df = df.drop(self.categorical_features, axis=1)
        # Step 5: Polynomial terms (sqrt, square, cube) on the top 10 existing numeric features
        if not transform_only:
            corr = df.corr()
            corr.sort_values(["SalePrice"], ascending=False, inplace=True)
            # Select the top 10 features based on correlation with SalePrice
            i = 0
            self.features_to_expand = []
            for feature, _ in corr.SalePrice[1:].items():
                if i >= 10:
                    break
                if "_E" not in feature:
                    self.features_to_expand.append(feature)
                    i += 1
        for feature in self.features_to_expand:
            df[feature + 'Sqrt'] = np.sqrt(df[feature])
            df[feature + '-2'] = df[feature]**2
            df[feature + '-3'] = df[feature]**3
            if not transform_only:
                self.numerical_features.extend(
                    [feature + 'Sqrt', feature + '-2', feature + '-3'])
        # Step 6: Transform skewed numerical features
        df_num = df[self.numerical_features].copy()
        if not transform_only:
            skewness = df_num.apply(lambda x: skew(x))
            skewness = skewness[abs(skewness) > 0.5]
            self.skewed_features = skewness.index
        df_num.loc[:, self.skewed_features] = np.log1p(
            df_num.loc[:, self.skewed_features])
        # Step 7: Generate dummy variables for categorical features
        df_cat = df[self.categorical_features_encoded]
        df_cat_dummies = pd.get_dummies(
            df_cat, columns=self.categorical_features_encoded)
        if not transform_only:
            self.cat_dummy_columns = df_cat_dummies.columns
        else:
            # Fill gaps in the dummy variables and align column order with the fitted set
            for col in self.cat_dummy_columns:
                if col not in df_cat_dummies.columns:
                    df_cat_dummies[col] = 0
            df_cat_dummies = df_cat_dummies[self.cat_dummy_columns]
        print("df_num shape:", df_num.shape)
        print("df_cat shape:", df_cat.shape)
        print("df_cat_dummies shape:", df_cat_dummies.shape)
        # Step 8: Apply a standard scaler to the numerical features
        if not transform_only:
            self.stdSc = StandardScaler()
            self.stdSc.fit(df_num.loc[:, self.numerical_features])
        df_num.loc[:, self.numerical_features] = self.stdSc.transform(
            df_num.loc[:, self.numerical_features])
        df_ = pd.concat([df_num, df_cat_dummies], axis=1)
        return df_
    def fill_missing_inplace(self, df_in):
        df = df_in.copy()
        # Handle missing values for features where median/mean or most common value doesn't make sense
        # Alley : data description says NA means "no alley access"
        df.loc[:, "Alley"] = df.loc[:, "Alley"].fillna("None")
        # BedroomAbvGr : NA most likely means 0
        df.loc[:, "BedroomAbvGr"] = df.loc[:, "BedroomAbvGr"].fillna(0)
        # BsmtQual etc : data description says NA for basement features is "no basement"
        df.loc[:, "BsmtQual"] = df.loc[:, "BsmtQual"].fillna("No")
        df.loc[:, "BsmtCond"] = df.loc[:, "BsmtCond"].fillna("No")
        df.loc[:, "BsmtExposure"] = df.loc[:, "BsmtExposure"].fillna("No")
        df.loc[:, "BsmtFinType1"] = df.loc[:, "BsmtFinType1"].fillna("No")
        df.loc[:, "BsmtFinType2"] = df.loc[:, "BsmtFinType2"].fillna("No")
        df.loc[:, "BsmtFullBath"] = df.loc[:, "BsmtFullBath"].fillna(0)
        df.loc[:, "BsmtHalfBath"] = df.loc[:, "BsmtHalfBath"].fillna(0)
        df.loc[:, "BsmtUnfSF"] = df.loc[:, "BsmtUnfSF"].fillna(0)
        df.loc[:, "TotalBsmtSF"] = df.loc[:, "TotalBsmtSF"].fillna(0)
        df.loc[:, "BsmtFinSF1"] = df.loc[:, "BsmtFinSF1"].fillna(0)
        df.loc[:, "BsmtFinSF2"] = df.loc[:, "BsmtFinSF2"].fillna(0)
        # CentralAir : NA most likely means No
        df.loc[:, "CentralAir"] = df.loc[:, "CentralAir"].fillna("N")
        # Condition : NA most likely means Normal
        df.loc[:, "Condition1"] = df.loc[:, "Condition1"].fillna("Norm")
        df.loc[:, "Condition2"] = df.loc[:, "Condition2"].fillna("Norm")
        # Electrical: Fill with the most common value
        df.loc[:, "Electrical"] = df.loc[:, 'Electrical'].fillna(
            df['Electrical'].value_counts().index[0])
        # EnclosedPorch : NA most likely means no enclosed porch
        df.loc[:, "EnclosedPorch"] = df.loc[:, "EnclosedPorch"].fillna(0)
        # External stuff : NA most likely means average
        df.loc[:, "ExterCond"] = df.loc[:, "ExterCond"].fillna("TA")
        df.loc[:, "ExterQual"] = df.loc[:, "ExterQual"].fillna("TA")
        # Exterior: NA = Other
        df.loc[:, "Exterior1st"] = df.loc[:, "Exterior1st"].fillna("Other")
        df.loc[:, "Exterior2nd"] = df.loc[:, "Exterior2nd"].fillna("Other")
        # Fence : data description says NA means "no fence"
        df.loc[:, "Fence"] = df.loc[:, "Fence"].fillna("No")
        # FireplaceQu : data description says NA means "no fireplace"
        df.loc[:, "FireplaceQu"] = df.loc[:, "FireplaceQu"].fillna("No")
        df.loc[:, "Fireplaces"] = df.loc[:, "Fireplaces"].fillna(0)
        # Functional : data description says NA means typical
        df.loc[:, "Functional"] = df.loc[:, "Functional"].fillna("Typ")
        # GarageType etc : data description says NA for garage features is "no garage"
        df.loc[:, "GarageType"] = df.loc[:, "GarageType"].fillna("No")
        df.loc[:, "GarageFinish"] = df.loc[:, "GarageFinish"].fillna("No")
        df.loc[:, "GarageQual"] = df.loc[:, "GarageQual"].fillna("No")
        df.loc[:, "GarageCond"] = df.loc[:, "GarageCond"].fillna("No")
        df.loc[:, "GarageArea"] = df.loc[:, "GarageArea"].fillna(0)
        df.loc[:, "GarageCars"] = df.loc[:, "GarageCars"].fillna(0)
        df.loc[:, "GarageYrBlt"] = df.loc[:, "GarageYrBlt"].fillna(0)
        # HalfBath : NA most likely means no half baths above grade
        df.loc[:, "HalfBath"] = df.loc[:, "HalfBath"].fillna(0)
        # HeatingQC : NA most likely means typical
        df.loc[:, "HeatingQC"] = df.loc[:, "HeatingQC"].fillna("TA")
        # KitchenAbvGr : NA most likely means 0
        df.loc[:, "KitchenAbvGr"] = df.loc[:, "KitchenAbvGr"].fillna(0)
        # KitchenQual : NA most likely means typical
        df.loc[:, "KitchenQual"] = df.loc[:, "KitchenQual"].fillna("TA")
        # LotFrontage : NA most likely means no lot frontage
        df.loc[:, "LotFrontage"] = df.loc[:, "LotFrontage"].fillna(0)
        # LotShape : NA most likely means regular
        df.loc[:, "LotShape"] = df.loc[:, "LotShape"].fillna("Reg")
        # MasVnrType : NA most likely means no veneer
        df.loc[:, "MasVnrType"] = df.loc[:, "MasVnrType"].fillna("None")
        df.loc[:, "MasVnrArea"] = df.loc[:, "MasVnrArea"].fillna(0)
        # MiscFeature : data description says NA means "no misc feature"
        df.loc[:, "MiscFeature"] = df.loc[:, "MiscFeature"].fillna("No")
        df.loc[:, "MiscVal"] = df.loc[:, "MiscVal"].fillna(0)
        # MSZoning: Fill with the most common value
        df.loc[:, "MSZoning"] = df.loc[:, 'MSZoning'].fillna(
            df['MSZoning'].value_counts().index[0])
        # OpenPorchSF : NA most likely means no open porch
        df.loc[:, "OpenPorchSF"] = df.loc[:, "OpenPorchSF"].fillna(0)
        # PavedDrive : NA most likely means not paved
        df.loc[:, "PavedDrive"] = df.loc[:, "PavedDrive"].fillna("N")
        # PoolQC : data description says NA means "no pool"
        df.loc[:, "PoolQC"] = df.loc[:, "PoolQC"].fillna("No")
        df.loc[:, "PoolArea"] = df.loc[:, "PoolArea"].fillna(0)
        # SaleCondition : NA most likely means normal sale
        df.loc[:, "SaleCondition"] = df.loc[:, "SaleCondition"].fillna(
            "Normal")
        # SaleType: NA most likely Oth
        df.loc[:, "SaleType"] = df.loc[:, "SaleType"].fillna("Oth")
        # ScreenPorch : NA most likely means no screen porch
        df.loc[:, "ScreenPorch"] = df.loc[:, "ScreenPorch"].fillna(0)
        # TotRmsAbvGrd : NA most likely means 0
        df.loc[:, "TotRmsAbvGrd"] = df.loc[:, "TotRmsAbvGrd"].fillna(0)
        # Utilities : NA most likely means all public utilities
        df.loc[:, "Utilities"] = df.loc[:, "Utilities"].fillna("AllPub")
        # WoodDeckSF : NA most likely means no wood deck
        df.loc[:, "WoodDeckSF"] = df.loc[:, "WoodDeckSF"].fillna(0)
        return df
    # Encode categorical variables. For each categorical variable, order each level based on the
    # mean SalePrice, from lowest to highest, then number the levels from 0 to N-1 and return the mapping.
    def encode_feature(self, frame, feature):
        ordering = pd.DataFrame()
        ordering['val'] = frame[feature].unique()
        ordering.index = ordering.val
        ordering['spmean'] = frame[[feature, 'SalePrice']].groupby(feature).mean()['SalePrice']
        ordering = ordering.sort_values('spmean')
        ordering['ordering'] = range(0, ordering.shape[0])
        ordering = ordering['ordering'].to_dict()
        return ordering
    def select_features(self, df):
        numerical_features = [
            f for f in df.columns if df.dtypes[f] != 'object'
        ]
        try:
            numerical_features.remove('SalePrice')
            numerical_features.remove('Id')
        except ValueError:
            pass
        categorical_features = [
            f for f in df.columns if df.dtypes[f] == 'object'
        ]
        return numerical_features, categorical_features
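To make the mean-price ordering concrete, here is a tiny illustration of what `encode_feature` returns on a toy frame (the values are invented purely for the example):

# Toy example (made-up data): levels are ranked by mean SalePrice, cheapest first
toy = pd.DataFrame({'Neighborhood': ['A', 'A', 'B', 'C', 'C'],
                    'SalePrice': [100000, 120000, 300000, 200000, 220000]})
print(DataPreprocessor().encode_feature(toy, 'Neighborhood'))
# Expected ordering: {'A': 0, 'C': 1, 'B': 2}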
In [13]:
pp = DataPreprocessor()
In [14]:
# Reload train.csv and test.csv
train = pd.read_csv("./input/train.csv")
test = pd.read_csv("./input/test.csv")
In [15]:
train_transformed = pp.fit_transform(train)
train_transformed.sort_index(axis=1, inplace=True)
In [16]:
test_transformed = pp.transform(test)
test_transformed.sort_index(axis=1, inplace=True)
In [18]:
y = np.log(train['SalePrice'])
In [19]:
# Partition the dataset into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(train_transformed, y, test_size = 0.2, random_state = 0)
print("X_train : " + str(X_train.shape))
print("X_val : " + str(X_val.shape))
print("y_train : " + str(y_train.shape))
print("y_val : " + str(y_val.shape))
In [25]:
# XGboost
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
fit_params = {'eval_metric': 'rmse'}
xgb_model = xgb.XGBRegressor()
clf = GridSearchCV(
    xgb_model, {
        'max_depth': [2, 3, 4, 5, 6],
        'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800]
    },
    fit_params=fit_params,
    verbose=1)
clf.fit(X_train, y_train)
print(clf.best_score_)
print(clf.best_params_)
y_train_pred = clf.predict(X_train)
y_val_pred = clf.predict(X_val)
# Look at predictions on training and validation set
print("RMSE on Training set :", rmse(y_train, y_train_pred))
print("RMSE on Validation set :", rmse(y_val, y_val_pred))
# Plot residuals
plt.scatter(
    y_train_pred,
    y_train_pred - y_train,
    c="blue",
    marker="s",
    label="Training data")
plt.scatter(
    y_val_pred,
    y_val_pred - y_val,
    c="lightgreen",
    marker="s",
    label="Validation data")
plt.title("XGBoost regression")
plt.xlabel("Predicted values")
plt.ylabel("Residuals")
plt.legend(loc="upper left")
plt.show()
# Plot predictions
plt.scatter(y_train_pred, y_train, c="blue", marker="s", label="Training data")
plt.scatter(
    y_val_pred, y_val, c="lightgreen", marker="s", label="Validation data")
plt.title("XGBoost regression")
plt.xlabel("Predicted values")
plt.ylabel("Real values")
plt.legend(loc="upper left")
plt.show()
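The grid above only tunes tree depth and the number of estimators. A wider search over learning rate and subsampling is a natural next step; the parameter ranges below are illustrative only, not the ones used for the submission:

# Sketch of a broader XGBoost search (illustrative parameter grid)
wider_grid = {
    'max_depth': [3, 4, 5],
    'n_estimators': [300, 600, 900],
    'learning_rate': [0.03, 0.05, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
}
clf_wide = GridSearchCV(xgb.XGBRegressor(), wider_grid, cv=5, verbose=1)
# clf_wide.fit(X_train, y_train)  # uncomment to run; this grid is large and will be slow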
In [21]:
clf.best_params_
Out[21]:
In [37]:
y_test_pred = clf.predict(test_transformed)
In [38]:
y_test_pred
Out[38]:
In [49]:
test_SalePrice = pd.Series(np.exp(y_test_pred), index=test.index, name='SalePrice')
In [50]:
submission = pd.concat([test['Id'], test_SalePrice], axis=1)
In [53]:
submission.to_csv('./input/submission.csv', index=False)
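Before uploading, a quick sanity check on the submission frame is cheap insurance (a sketch, not part of the original run):

# Sketch: basic checks before submitting
print(submission.shape)           # expect (len(test), 2)
print(submission.isnull().sum())  # expect no missing values
print(submission.head())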