In [1]:
# Handle table-like data and matrices
import numpy as np
import pandas as pd
# Modelling Algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
# Modelling Helpers
from sklearn.preprocessing import Normalizer, scale, StandardScaler
from sklearn.impute import SimpleImputer  # Imputer was removed from sklearn.preprocessing
from sklearn.model_selection import train_test_split, StratifiedKFold  # sklearn.cross_validation was removed
from sklearn.feature_selection import RFECV
# Stats helpers
from scipy.stats import norm
from scipy import stats
# Visualisation
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
# Configure visualisations
%matplotlib inline
mpl.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 8, 6
In [2]:
# get home price train & test csv files as a DataFrame
train = pd.read_csv("../Data/train.csv")
test = pd.read_csv("../Data/test.csv")
full = pd.concat([train, test], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
print (train.shape, test.shape, full.shape)
In [3]:
train.head()
Out[3]:
In [4]:
test.head()
Out[4]:
In [5]:
train.columns
Out[5]:
In [6]:
train.SalePrice.hist()
Out[6]:
In [7]:
#correlation matrix (numeric columns only)
corrmat = train.corr(numeric_only=True)
#saleprice correlation matrix
k = 10 #number of variables for heatmap
cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index
cm = np.corrcoef(train[cols].values.T)
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)
plt.show()
From this correlation map we can see which variables correlate most strongly with 'SalePrice'. We keep the top correlated numeric variables as our starting feature set:
In [8]:
col = ['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF',
       'FullBath', 'YearBuilt']
In [9]:
train_selected = train[col]
test_selected = test[col]
print(train_selected.shape, test_selected.shape)
In [10]:
#missing data in train_selected data
total = train_selected.isnull().sum().sort_values(ascending=False)
percent = (train_selected.isnull().sum()/train_selected.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(6)
Out[10]:
In [11]:
#missing data in test_selected data
total = test_selected.isnull().sum().sort_values(ascending=False)
percent = (test_selected.isnull().sum()/test_selected.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(6)
Out[11]:
Only one entry is missing a value for each of 'TotalBsmtSF' and 'GarageCars'. We will fill them with 0.
In [12]:
# assign back rather than calling fillna(inplace=True) on a column of a slice,
# which can trigger SettingWithCopyWarning
test_selected['TotalBsmtSF'] = test_selected['TotalBsmtSF'].fillna(0)
test_selected['GarageCars'] = test_selected['GarageCars'].fillna(0)
Make sure Test data has no more missing data
In [13]:
#missing data in test data
total = test_selected.isnull().sum().sort_values(ascending=False)
percent = (test_selected.isnull().sum()/test_selected.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(6)
Out[13]:
In [14]:
train.MSSubClass.isnull().sum()
Out[14]:
In [15]:
#box plot MSSubClass/saleprice
var = 'MSSubClass'
data = pd.concat([train['SalePrice'], train[var]], axis=1)
f, ax = plt.subplots(figsize=(8, 6))
fig = sns.boxplot(x=var, y="SalePrice", data=data)
fig.axis(ymin=0, ymax=800000);
Some observations:
The first option is to map each value to a binary feature, which results in 16 additional features.
The second option is to replace those 16 features with a smaller set of higher-level binary flags, for example: built in 1946 or later vs. earlier, 1 or 1-1/2 story property, 2 or 2-1/2 story property, PUD. A sketch of this second option follows.
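Here is a minimal sketch of the second option. The code groupings are assumptions taken from the Ames data description, and the column names are illustrative, not from the original notebook; verify the code-to-group mapping before using it.
# Sketch of option 2: collapse the 16 'MSSubClass' codes into a few
# higher-level binary flags (assumed groupings from the Ames data description)
newer_codes = {20, 60, 120, 160}   # styles listed as "1946 & newer"
story_1_5 = {45, 50, 150}          # 1-1/2 story styles
story_2plus = {60, 70, 75, 160}    # 2 and 2-1/2 story styles
pud_codes = {120, 150, 160, 180}   # planned unit developments
ms_high_level = pd.DataFrame({
    'MS_Newer1946': train.MSSubClass.isin(newer_codes).astype(int),
    'MS_1_5Story': train.MSSubClass.isin(story_1_5).astype(int),
    'MS_2Story': train.MSSubClass.isin(story_2plus).astype(int),
    'MS_PUD': train.MSSubClass.isin(pud_codes).astype(int),
})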
In [16]:
ms_sub_class_train = pd.get_dummies(train.MSSubClass, prefix='MSSubClass')
ms_sub_class_train.shape
Out[16]:
According to the feature description, there are 16 possible values for 'MSSubClass', but we only get 15 dummy columns, which means one value never appears in the train data. To solve this, we need to find that value and add a column of zeros to our features.
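A quick way to spot the missing level programmatically (a sketch, not part of the original notebook) is to compare the dummy columns generated from the train and test data:
train_cols = set(pd.get_dummies(train.MSSubClass, prefix='MSSubClass').columns)
test_cols = set(pd.get_dummies(test.MSSubClass, prefix='MSSubClass').columns)
print(test_cols - train_cols)  # the level missing from train: {'MSSubClass_150'}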
In [17]:
ms_sub_class_train.head()
Out[17]:
The missing value is 150, so we will add a column labelled 'MSSubClass_150'.
In [18]:
ms_sub_class_train['MSSubClass_150'] = 0
ms_sub_class_train.head()
Out[18]:
Let's do the same thing for the test data
In [19]:
ms_sub_class_test = pd.get_dummies(test.MSSubClass, prefix='MSSubClass')
ms_sub_class_test.shape
Out[19]:
In [20]:
ms_sub_class_test.head()
Out[20]:
For the test data we have all 16 values, so no columns need to be added.
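A more robust alternative (a sketch, not part of the original notebook) is to let pandas reconcile the two dummy frames directly: DataFrame.align fills any level missing on either side with zeros, so no columns have to be added by hand.
ms_sub_class_train, ms_sub_class_test = ms_sub_class_train.align(
    ms_sub_class_test, join='outer', axis=1, fill_value=0)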
In [21]:
#box plot MSZoning/saleprice
var = 'MSZoning'
data = pd.concat([train['SalePrice'], train[var]], axis=1)
f, ax = plt.subplots(figsize=(8, 6))
fig = sns.boxplot(x=var, y="SalePrice", data=data)
fig.axis(ymin=0, ymax=800000);
In [22]:
ms_zoning_train = pd.get_dummies(train.MSZoning, prefix='MSZoning')
ms_zoning_train.shape
Out[22]:
In [23]:
ms_zoning_train.head()
Out[23]:
In [24]:
ms_zoning_test = pd.get_dummies(test.MSZoning, prefix='MSZoning')
ms_zoning_test.shape
Out[24]:
In [25]:
var = 'Street'
data = pd.concat([train['SalePrice'], train[var]], axis=1)
f, ax = plt.subplots(figsize=(8, 6))
fig = sns.boxplot(x=var, y="SalePrice", data=data)
fig.axis(ymin=0, ymax=800000);
In [26]:
# Transform Street into binary values 0 and 1
street_train = pd.Series(np.where(train.Street == 'Pave', 1, 0), name='Street')
street_train.shape
Out[26]:
In [27]:
street_train.head()
Out[27]:
In [28]:
street_test = pd.Series(np.where(test.Street == 'Pave', 1, 0), name='Street')
street_test.shape
Out[28]:
In [29]:
street_test.head()
Out[29]:
In [30]:
var = 'Alley'
data = pd.concat([train['SalePrice'], train[var]], axis=1)
f, ax = plt.subplots(figsize=(8, 6))
fig = sns.boxplot(x=var, y="SalePrice", data=data)
fig.axis(ymin=0, ymax=800000);
In [31]:
alley_train = pd.get_dummies(train.Alley, prefix='Alley')
alley_train.shape
Out[31]:
In [32]:
alley_train.head()
Out[32]:
In [33]:
alley_test = pd.get_dummies(test.Alley, prefix='Alley')
alley_test.shape
Out[33]:
In [34]:
train.LotShape.isnull().sum()
Out[34]:
In [35]:
var = 'LotShape'
data = pd.concat([train['SalePrice'], train[var]], axis=1)
f, ax = plt.subplots(figsize=(8, 6))
fig = sns.boxplot(x=var, y="SalePrice", data=data)
fig.axis(ymin=0, ymax=800000);
In [36]:
lot_shape_train = pd.get_dummies(train.LotShape, prefix='LotShape')
lot_shape_train.shape
Out[36]:
In [37]:
lot_shape_test = pd.get_dummies(test.LotShape, prefix='LotShape')
lot_shape_test.shape
Out[37]:
In [38]:
lot_shape_test.head()
Out[38]:
In [39]:
train.LandContour.isnull().sum()
Out[39]:
In [40]:
var = 'LandContour'
data = pd.concat([train['SalePrice'], train[var]], axis=1)
f, ax = plt.subplots(figsize=(8, 6))
fig = sns.boxplot(x=var, y="SalePrice", data=data)
fig.axis(ymin=0, ymax=800000);
In [41]:
land_contour_train = pd.get_dummies(train.LandContour, prefix='LandContour')
land_contour_train.shape
Out[41]:
In [42]:
land_contour_test = pd.get_dummies(test.LandContour, prefix='LandContour')
land_contour_test.shape
Out[42]:
In [43]:
train.Utilities.isnull().sum()
Out[43]:
In [44]:
var = 'Utilities'
data = pd.concat([train['SalePrice'], train[var]], axis=1)
f, ax = plt.subplots(figsize=(8, 6))
fig = sns.boxplot(x=var, y="SalePrice", data=data)
fig.axis(ymin=0, ymax=800000);
In [45]:
train.LotConfig.isnull().sum()
Out[45]:
In [46]:
var = 'LotConfig'
data = pd.concat([train['SalePrice'], train[var]], axis=1)
f, ax = plt.subplots(figsize=(8, 6))
fig = sns.boxplot(x=var, y="SalePrice", data=data)
fig.axis(ymin=0, ymax=800000);
In [47]:
train.LandSlope.isnull().sum()
Out[47]:
In [48]:
var = 'LandSlope'
data = pd.concat([train['SalePrice'], train[var]], axis=1)
f, ax = plt.subplots(figsize=(8, 6))
fig = sns.boxplot(x=var, y="SalePrice", data=data)
fig.axis(ymin=0, ymax=800000);
In [49]:
train.Neighborhood.isnull().sum()
Out[49]:
In [50]:
var = 'Neighborhood'
data = pd.concat([train['SalePrice'], train[var]], axis=1)
f, ax = plt.subplots(figsize=(20, 10))
fig = sns.boxplot(x=var, y="SalePrice", data=data)
fig.axis(ymin=0, ymax=800000);
In [51]:
neighborhood_train = pd.get_dummies(train.Neighborhood, prefix='N')
neighborhood_train.shape
Out[51]:
In [52]:
neighborhood_test = pd.get_dummies(test.Neighborhood, prefix='N')
neighborhood_test.shape
Out[52]:
In [53]:
train.Condition1.isnull().sum()
Out[53]:
In [54]:
var = 'Condition1'
data = pd.concat([train['SalePrice'], train[var]], axis=1)
f, ax = plt.subplots(figsize=(20, 10))
fig = sns.boxplot(x=var, y="SalePrice", data=data)
fig.axis(ymin=0, ymax=800000);
In [55]:
train.BldgType.isnull().sum()
Out[55]:
In [56]:
var = 'BldgType'
data = pd.concat([train['SalePrice'], train[var]], axis=1)
f, ax = plt.subplots(figsize=(20, 10))
fig = sns.boxplot(x=var, y="SalePrice", data=data)
fig.axis(ymin=0, ymax=800000);
In [57]:
bldgtype_train = pd.get_dummies(train.BldgType, prefix='Bldg')
bldgtype_train.shape
Out[57]:
In [58]:
bldgtype_test = pd.get_dummies(test.BldgType, prefix='Bldg')
bldgtype_test.shape
Out[58]:
In [59]:
train.BsmtCond.isnull().sum()
Out[59]:
In [60]:
var = 'BsmtCond'
data = pd.concat([train['SalePrice'], train[var]], axis=1)
f, ax = plt.subplots(figsize=(20, 10))
fig = sns.boxplot(x=var, y="SalePrice", data=data)
fig.axis(ymin=0, ymax=800000);
In [61]:
bsmtCond_train = pd.get_dummies(train.BsmtCond, prefix='BsmtCond')
bsmtCond_train.shape
Out[61]:
In [62]:
bsmtCond_test = pd.get_dummies(test.BsmtCond, prefix='BsmtCond')
bsmtCond_test.shape
Out[62]:
In [63]:
train.SaleCondition.isnull().sum()
Out[63]:
In [64]:
var = 'SaleCondition'
data = pd.concat([train['SalePrice'], train[var]], axis=1)
f, ax = plt.subplots(figsize=(20, 10))
fig = sns.boxplot(x=var, y="SalePrice", data=data)
fig.axis(ymin=0, ymax=800000);
In [65]:
saleCond_train = pd.get_dummies(train.SaleCondition, prefix='saleCond')
saleCond_train.shape
Out[65]:
In [66]:
saleCond_test = pd.get_dummies(test.SaleCondition, prefix='saleCond')
saleCond_test.shape
Out[66]:
Let's concatenate the additional features to the train and test data. Based on the experiments above, we keep the dummy features that improved the validation error: 'MSZoning', 'Alley', and 'LandContour'.
In [67]:
train_selected = pd.concat([train_selected,
                            ms_zoning_train,
                            alley_train,
                            land_contour_train], axis=1)
train_selected.shape
Out[67]:
In [68]:
test_selected = pd.concat([test_selected,
                           ms_zoning_test,
                           alley_test,
                           land_contour_test], axis=1)
test_selected.shape
Out[68]:
In [69]:
#train_selected_y = train.SalePrice
train_selected_y = np.log1p(train["SalePrice"])
train_selected_y.head()
Out[69]:
In [70]:
train_x, valid_x, train_y, valid_y = train_test_split(train_selected,
                                                      train_selected_y,
                                                      train_size=0.7)
train_x.shape, valid_x.shape, train_y.shape, valid_y.shape, test_selected.shape
Out[70]:
In [71]:
model = RandomForestRegressor(n_estimators=100)
# alternatives tried earlier; note that SVC, KNeighborsClassifier, GaussianNB and
# LogisticRegression are classifiers and are not appropriate for this regression task
#model = SVC()
#model = GradientBoostingRegressor()
#model = KNeighborsClassifier(n_neighbors = 3)
#model = GaussianNB()
#model = LogisticRegression()
In [72]:
model.fit(train_x, train_y)
Out[72]:
In [73]:
# Score the model
print (model.score(train_x, train_y), model.score(valid_x, valid_y))
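Note that model.score for a regressor reports R^2, not the competition metric. The Kaggle metric is RMSE on the log of the sale price, and since train_selected_y is already log1p-transformed, the validation RMSE below (a sketch assuming the split above) approximates the leaderboard error:
from sklearn.metrics import mean_squared_error
valid_rmse = np.sqrt(mean_squared_error(valid_y, model.predict(valid_x)))
print(valid_rmse)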
In [75]:
model.fit(train_selected, train_selected_y)
Out[75]:
In [76]:
# predict on the test set and invert the log1p transform back to sale prices
test_y = model.predict(test_selected)
test_y = np.expm1(test_y)
test_id = test.Id
test_submit = pd.DataFrame({'Id': test_id, 'SalePrice': test_y})
test_submit.shape
test_submit.head()
test_submit.to_csv('house_price_pred_log.csv', index=False)
Using the correlation method, we were able to go from 36 variables to only 6. Performance-wise, the score dropped from 0.22628 to 0.22856 using a Random Forest model. I believe we can further improve it by analysing the categorical variables.
Using binary variables for the categorical feature 'MSSubClass' seemed to decrease the performance of the prediction.
Using binary variables for the categorical feature 'MSZoning' improved the error of the model from 0.22628 to 0.21959.
Using binary variables for the categorical feature 'Street' decreased the performance of the model.
Using binary variables for the categorical feature 'Alley' improved the error of the model from 0.21959 to 0.21904.
Using binary variables for the categorical feature 'LotShape' decreased the performance of the model.
Using binary variables for the categorical feature 'LandContour' improved the error from 0.21904 to 0.21623.
Using binary variables for the categorical feature 'Neighborhood' decreased the performance of the model.
Using binary variables for the categorical feature 'BldgType' decreased the performance of the model.
Using binary variables for the categorical feature 'BsmtCond' decreased the performance of the model.
Using binary variables for the categorical feature 'SaleCondition' decreased the performance of the model.
Never, EVER, use a classification model for regression! Changing RandomForestClassifier to RandomForestRegressor improved the error from 0.21623 to 0.16517.
Applied log1p to 'SalePrice' to remove skewness; the error improved from 0.16517 to 0.16083.
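A quick check of the skewness reduction (a sketch, not in the original notebook) using scipy.stats.skew, comparing 'SalePrice' before and after the transform:
from scipy.stats import skew
print(skew(train.SalePrice), skew(np.log1p(train.SalePrice)))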
Many of the analyses and code snippets are from this very detailed post: https://www.kaggle.com/pmarcelino/comprehensive-data-exploration-with-python