In [1]:
import os
import numpy as np
import pandas as pd
from scipy.stats import kendalltau
import sklearn.linear_model as lm
import sklearn.cross_validation as cv   #pre-0.20 sklearn API (sklearn.model_selection in newer versions)
import sklearn.preprocessing as pp
from sklearn import metrics, tree, grid_search   #grid_search is likewise the pre-0.20 module
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, ExtraTreesRegressor, AdaBoostRegressor, GradientBoostingRegressor
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
os.getcwd()
Out[2]:
In [3]:
realestate = pd.read_csv('RealEstate/train.csv')
realestate_test = pd.read_csv('RealEstate/test.csv')
data_description = open('RealEstate/data_description.txt', 'r')
#print (data_description.read())
In [4]:
realestate.info()
realestate_test.info()
realestate = realestate.dropna(how='all')
realestate_test = realestate_test.dropna(how='all')
#columns with very few non-null values
few_values = ['Alley', 'PoolQC', 'MiscFeature']
realestate = realestate.drop(few_values, axis=1)
realestate_test = realestate_test.drop(few_values, axis=1)
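# A hedged optional check (not executed in the original run): counting non-null values per
# column before the drop is one way to confirm which columns hold very few entries, e.g.
# pd.read_csv('RealEstate/train.csv').count().sort_values().head(10)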
In [5]:
print(realestate.info(), realestate.shape)
print(realestate_test.info(), realestate_test.shape)
In [6]:
#some columns hold categorical variables even though their dtype is not 'object'
realestate['MSSubClass'] = realestate['MSSubClass'].astype('object')
realestate_test['MSSubClass'] = realestate_test['MSSubClass'].astype('object')
print('Done')
In [7]:
#some ordinal quality columns could be treated as quantitative variables
def change_scale(df, column, legend, scale):
    """Replace the ordinal string labels in `column` with the corresponding numeric scale."""
    df[column] = df[column].replace(dict(zip(legend, scale)))

legend = ['Ex', 'Gd', 'TA', 'Fa', 'Po']
scale = np.arange(5, 0, -1)
for column in ['HeatingQC', 'KitchenQual']:
    change_scale(realestate, column, legend, scale)
    change_scale(realestate_test, column, legend, scale)
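# Optional sanity check (a sketch, not in the original run): after the replacement the
# quality columns should hold the values 5..1 instead of 'Ex'..'Po', e.g.
# print(realestate.HeatingQC.value_counts())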
In [8]:
#Let's separate categorical from quantitative variables and make two dataframes
#Thanks to the kernel by BreadenFitz-Gerald for the idea
df = realestate
categorical = []
for col in df.columns.values:
if df[col].dtype == 'object':
categorical.append(col)
df_category = df[categorical]
df_quant = df.drop(categorical, axis=1)
df_category_test = realestate_test[categorical]
df_quant_test = realestate_test.drop(categorical, axis=1)
In [9]:
#We can calculate the correlation coefficients among variables and flag those with extremely high values
corr = df_quant[df_quant.columns[1:39]].corr()
mask = np.zeros_like(corr)
mask[np.triu_indices_from(mask)] = True
fig = plt.figure(figsize=(20,20))
plt.subplot2grid((1,1), (0,0))
with sns.axes_style("white"):
ax = sns.heatmap(corr, mask=mask, vmax=.9, square=True, annot=False)
In [10]:
#We can also calculate the skewness and notice that many variables are highly skewed
skew = df_quant[df_quant.columns[1:40]].skew()
print(skew)
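# For reference (optional), listing only the columns above the skew threshold used later (1.0)
# makes the "many skewed variables" observation concrete:
# print(skew[skew > 1.0])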
In [11]:
df_quant.info()
In [12]:
#Function to drop columns with more than N null values and fill the remaining null values with the column median:
def null_value_treatment(dataframe, thresh_null):
for col in dataframe.columns.values:
if np.sum(dataframe[col].isnull()) > thresh_null:
dataframe.drop(col, axis=1, inplace=True)
print(col)
        elif np.sum(dataframe[col].isnull()) > 0:
            median = dataframe[col].median()
            dataframe[col] = dataframe[col].fillna(median)
return
#The same operation is applied to the test dataset below; note that the threshold could in principle drop different columns there, while we want to keep the same set of predictors in both (a defensive sketch follows the calls in the next cell)
In [13]:
null_value_treatment(df_quant, 150)
null_value_treatment(df_quant_test, 150)
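# A hedged, hypothetical safeguard (not used below): if the two calls above ever dropped
# different columns, the predictor sets could be re-aligned by intersection, e.g.
# common_cols = df_quant.columns.intersection(df_quant_test.columns)
# df_quant_test = df_quant_test[common_cols]
# df_quant = df_quant[common_cols.tolist() + ['SalePrice']]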
In [14]:
def transform_skew(dataframe, skew_thresh):
for col in dataframe.columns.values:
if (dataframe[col].skew()) > skew_thresh:
dataframe[col] = np.log(dataframe[col])
dataframe[col] = dataframe[col].apply(lambda x: 0 if x == (-1*np.inf) else x)
# df_quant[col] = Normalizer().fit_transform(df_quant[col].reshape(1,-1))[0]
transform_skew(df_quant, 1.0)
transform_skew(df_quant_test, 1.0)
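# A hedged alternative (assumption, not applied here): np.log1p(x) = log(1 + x) handles zero
# values directly, so the -inf clean-up inside transform_skew would not be needed; the inverse
# at prediction time would then be np.expm1 rather than np.exp.
# def transform_skew_log1p(dataframe, skew_thresh):
#     for col in dataframe.columns.values:
#         if dataframe[col].skew() > skew_thresh:
#             dataframe[col] = np.log1p(dataframe[col])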
In [15]:
def null_value_treatment_categorical(dataframe, thresh_null):
for col in dataframe.columns.values:
if np.sum(dataframe[col].isnull()) > thresh_null:
dataframe.drop(col, axis=1, inplace=True)
print(col)
elif np.sum(dataframe[col].isnull()) > 0:
            dataframe[col] = dataframe[col].fillna('MIA')
return
null_value_treatment_categorical(df_category, 150)
print('----------------')
null_value_treatment_categorical(df_category_test, 150)
In [16]:
cat_variables = df_category.columns.values
cat_variables_test = df_category_test.columns.values
df_dummies = pd.get_dummies(df_category, columns=cat_variables)
df_dummies_test = pd.get_dummies(df_category_test, columns=cat_variables_test)
In [17]:
#The dummy frames have different widths because some category levels occur in only one of the two datasets
print(df_category.shape)
print('------------------')
print(df_category_test.shape)
print('------------------')
print(df_dummies.shape)
print('------------------')
print(df_dummies_test.shape)
print('------------------')
print(df_quant.shape) #one column more than the test set because it still contains the target variable, 'SalePrice'
print('------------------')
print(df_quant_test.shape)
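# A hedged alternative to the concat-then-split approach used below: reindex the test dummies
# onto the training dummy columns, filling indicator columns missing from the test set with 0.
# df_dummies_test_aligned = df_dummies_test.reindex(columns=df_dummies.columns, fill_value=0)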
In [18]:
#Let's check if we have the same columns
print(df_category.columns)
df_category_test.columns
Out[18]:
In [19]:
#Here we have to merge categorical datasets, then run the dummies, then separate.
df_category_joint = pd.concat([df_category, df_category_test])
df_category_joint.shape
Out[19]:
In [20]:
df_dummies_joint = pd.get_dummies(df_category_joint, columns=cat_variables, drop_first=True)
df_dummies_joint.shape
df_dummies = df_dummies_joint[0:1460]
df_dummies_test = df_dummies_joint[1460:2919]
In [21]:
#Let's verify...
print(df_dummies_joint[0:1460].shape)
print(df_dummies_joint[1460:2919].shape)
In [22]:
y_train = df_quant['SalePrice']
X_train = df_dummies.join(df_quant)
X_train = X_train.drop(['SalePrice', 'Id'], axis=1)
X_test = df_dummies_test.join(df_quant_test)
X_test = X_test.drop(['Id'], axis=1)
In [23]:
X_train.head()
Out[23]:
In [24]:
X_test.head()
Out[24]:
In [25]:
y_train.head()
y_train.shape
#---xxx---
Out[25]:
In [26]:
saved_cols = X_train.columns.values
scaler = pp.StandardScaler().fit(X_train)
X_train_scale = pd.DataFrame(scaler.transform(X_train), columns=saved_cols)
X_test_scale = pd.DataFrame(scaler.transform(X_test), columns=saved_cols)
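# Note: the tree-based models below are fit on the unscaled X_train; the scaled copies would
# mainly matter for a linear baseline. A hedged sketch (not part of the original run):
# ridge_scores = cv.cross_val_score(lm.Ridge(alpha=1.0), X_train_scale, y_train, cv=10)
# print(ridge_scores.mean())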
In [27]:
X_train_scale.head()
Out[27]:
In [28]:
# Train a random forest with 100 decision trees (max depth 15)
model_rf1 = RandomForestRegressor(n_estimators=100, max_depth=15)
In [29]:
#Fit the training data
model_rf1.fit(X_train, y_train)
Out[29]:
In [30]:
# Define folds for cross-validation
kf = cv.KFold(1460, n_folds=10, shuffle=True)
#kf = 5
scores = cv.cross_val_score(model_rf1, X_train, y_train, cv=kf)
print(scores, ' and the mean score is = ', scores.mean())
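# The default score for regressors above is R^2. A hedged sketch of the competition-style
# metric, RMSE on log prices (y_train is already on a log scale; it is exponentiated back later):
# mse_scores = cv.cross_val_score(model_rf1, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')
# print('CV RMSE (log scale) = ', np.sqrt(-mse_scores).mean())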
In [31]:
# Investigate importances of predictors
###model_rf1.feature_importances_
feature_importance = model_rf1.feature_importances_
In [32]:
# make importances relative to max importance
feature_importance = 100 * (feature_importance / feature_importance.max())
sorted_idx = np.argsort(feature_importance)
featimp = feature_importance[sorted_idx]
feat = X_train.columns[sorted_idx]
pos = np.arange(sorted_idx.shape[0]) + .5
a = 0
b = 50 #To limit the number of features
c = b - a
featimp= featimp[::-1][a:b]
feat = feat[::-1][a:b]
pos = pos[::-1][a:b]
fig = plt.figure(figsize=(8,8))
plt.subplot2grid((1,1), (0,0))
with sns.axes_style("white"):
ax = sns.barplot(y=feat, x=featimp)
plt.xlabel('Relative Importance')
plt.title('Variable Importance (first {0} variables)'.format(c))
plt.show()
In [33]:
#Let's check for a second the names of the columns to be sure what we have...
#X_train.columns.values
In [34]:
#redundant_variables = ['CentralAir_N'] #undecided whether to eliminate 'GarageArea', as it is highly correlated with 'GarageCars'
#However, in the context of an American house, buyers may weigh the garage area and the number of cars it fits separately:
#a huge 1-car garage is not necessarily equivalent to a 1-car garage that barely fits the car.
#With some research we could settle this and make an informed decision.
#X_train = X_train.drop(redundant_variables, axis=1)
#X_test = X_test.drop(redundant_variables, axis=1)
In [35]:
print(X_test.shape, X_train.shape)
In [36]:
def check_classifiers(X, y):
"""
    Returns the mean cross-validated score (R^2) of several ensemble regressors fit on the
    passed data, sorted from best to worst.
"""
params = 100
_cv = kf
classifier_score = {}
scores = cv.cross_val_score(RandomForestRegressor(n_estimators=params), X, y, cv=_cv)
classifier_score['Random Forest Regressor'] = scores.mean()
scores = cv.cross_val_score(BaggingRegressor(n_estimators=params), X, y, cv=_cv)
classifier_score['Bagging Regressor'] = scores.mean()
scores = cv.cross_val_score(ExtraTreesRegressor(n_estimators=params), X, y, cv=_cv)
classifier_score['ExtraTrees Regressor'] = scores.mean()
scores = cv.cross_val_score(AdaBoostRegressor(n_estimators=params), X, y, cv=_cv)
classifier_score['AdaBoost Regressor'] = scores.mean()
scores = cv.cross_val_score(GradientBoostingRegressor(n_estimators=params), X, y, cv=_cv)
classifier_score['Gradient Boost Regressor'] = scores.mean()
    return sorted(classifier_score.items(), key=lambda item: item[1], reverse=True)
check_classifiers(X_train, y_train)
Out[36]:
In [37]:
model_rf2 = GradientBoostingRegressor()
# Grid search over the main hyperparameters to find an 'optimal' configuration
gs = grid_search.GridSearchCV(
estimator=model_rf2,
param_grid={'loss': ['ls', 'lad', 'huber'],
'learning_rate': [0.001, 0.01, 0.1],
'n_estimators': np.arange(100, 300, 100),
'max_depth': [3, 5, 7]
},
scoring='neg_mean_squared_error',
cv=kf
)
gs.fit(X_train, y_train)
gs.grid_scores_
Out[37]:
In [38]:
print('Best cross-validated score (negative mean squared error) = ', gs.best_score_)
gs.best_estimator_
Out[38]:
In [39]:
model_rf3 = gs.best_estimator_
model_rf3.fit(X_train, y_train)
Out[39]:
In [40]:
scores = cv.cross_val_score(model_rf3, X_train, y_train, cv=kf)
scores.mean()
Out[40]:
In [41]:
y_test = model_rf3.predict(X_test)
In [42]:
y = np.exp(y_test)   #undo the log transform applied to SalePrice by transform_skew
y
Out[42]:
In [43]:
#Let's group some data...
train_diag = X_train.join(np.exp(y_train))   #exponentiate to undo the log transform on SalePrice
train_diag['dataset'] = 'train'
train_diag['Id'] = realestate['Id']
test_diag = X_test.copy()   #copy so X_test itself is not modified
test_diag['SalePrice'] = y
test_diag['dataset'] = 'test'
test_diag['Id'] = realestate_test['Id']
total_diag = pd.concat([train_diag, test_diag])
print(train_diag.shape, test_diag.shape, total_diag.shape)
test_diag.head()
#total_diag.columns.values
Out[43]:
In [44]:
fig = plt.figure(figsize=(10,15))
plt.subplot2grid((2,1), (0,0))
with sns.axes_style("white"):
sns.violinplot(x='OverallQual', y='SalePrice', hue='dataset', data=total_diag, split=True,
inner='quart', palette={'train': 'r', 'test': 'y'})
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.xlabel('Overall quality')
plt.ylabel('Sale price')
plt.title('Sale price and overall quality')
#plt.subplot2grid((2,1), (0,1))
#with sns.axes_style("white"):
#    ax = sns.jointplot(X_test['OverallQual'], y, kind="hex", stat_func=kendalltau, color="#4CB391")
#plt.xlabel('Relative Importance')
#plt.title('Variable Importance (first {0} variables)'.format(c))
plt.show()
In [45]:
submission = test_diag[['Id', 'SalePrice']]
print('Done!')
submission.to_csv("submission_realestate_tawonque2.csv", index=False)
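# Optional sanity check (a sketch): the submission should contain one row per test Id.
# print(submission.shape, realestate_test.shape[0])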
In [ ]: