In [1]:
# A logistic regression learning algorithm example using TensorFlow library.

# Author: Alaa Awad
# Project: https://www.kaggle.com/c/house-prices-advanced-regression-techniques

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib

import matplotlib.pyplot as plt
from scipy.stats import skew
from scipy.stats.stats import pearsonr

%config InlineBackend.figure_format = 'png' #set 'png' here when working on notebook
%matplotlib inline

In [3]:
train = pd.read_csv('../input/train.csv.gz')
test = pd.read_csv('../input/test.csv.gz')

In [4]:
train.head()


Out[4]:
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice
0 1 60 RL 65.0 8450 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 2 2008 WD Normal 208500
1 2 20 RL 80.0 9600 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 5 2007 WD Normal 181500
2 3 60 RL 68.0 11250 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 9 2008 WD Normal 223500
3 4 70 RL 60.0 9550 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 2 2006 WD Abnorml 140000
4 5 60 RL 84.0 14260 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 12 2008 WD Normal 250000

5 rows × 81 columns


In [7]:
all_data = pd.concat((train.loc[:,'MSSubClass':'SaleCondition'],
                      test.loc[:,'MSSubClass':'SaleCondition']))
labels = train['SalePrice']

In [8]:
all_data.head()


Out[8]:
MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities LotConfig ... ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition
0 60 RL 65.0 8450 Pave NaN Reg Lvl AllPub Inside ... 0 0 NaN NaN NaN 0 2 2008 WD Normal
1 20 RL 80.0 9600 Pave NaN Reg Lvl AllPub FR2 ... 0 0 NaN NaN NaN 0 5 2007 WD Normal
2 60 RL 68.0 11250 Pave NaN IR1 Lvl AllPub Inside ... 0 0 NaN NaN NaN 0 9 2008 WD Normal
3 70 RL 60.0 9550 Pave NaN IR1 Lvl AllPub Corner ... 0 0 NaN NaN NaN 0 2 2006 WD Abnorml
4 60 RL 84.0 14260 Pave NaN IR1 Lvl AllPub FR2 ... 0 0 NaN NaN NaN 0 12 2008 WD Normal

5 rows × 79 columns


In [8]:
all_data.describe()


Out[8]:
MSSubClass LotFrontage LotArea OverallQual OverallCond YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 BsmtFinSF2 ... GarageArea WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch PoolArea MiscVal MoSold YrSold
count 2919.000000 2433.000000 2919.000000 2919.000000 2919.000000 2919.000000 2919.000000 2896.000000 2918.000000 2918.000000 ... 2918.000000 2919.000000 2919.000000 2919.000000 2919.000000 2919.000000 2919.000000 2919.000000 2919.000000 2919.000000
mean 57.137718 69.305795 10168.114080 6.089072 5.564577 1971.312778 1984.264474 102.201312 441.423235 49.582248 ... 472.874572 93.709832 47.486811 23.098321 2.602261 16.062350 2.251799 50.825968 6.213087 2007.792737
std 42.517628 23.344905 7886.996359 1.409947 1.113131 30.291442 20.894344 179.334253 455.610826 169.205611 ... 215.394815 126.526589 67.575493 64.244246 25.188169 56.184365 35.663946 567.402211 2.714762 1.314964
min 20.000000 21.000000 1300.000000 1.000000 1.000000 1872.000000 1950.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 2006.000000
25% 20.000000 59.000000 7478.000000 5.000000 5.000000 1953.500000 1965.000000 0.000000 0.000000 0.000000 ... 320.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 4.000000 2007.000000
50% 50.000000 68.000000 9453.000000 6.000000 5.000000 1973.000000 1993.000000 0.000000 368.500000 0.000000 ... 480.000000 0.000000 26.000000 0.000000 0.000000 0.000000 0.000000 0.000000 6.000000 2008.000000
75% 70.000000 80.000000 11570.000000 7.000000 6.000000 2001.000000 2004.000000 164.000000 733.000000 0.000000 ... 576.000000 168.000000 70.000000 0.000000 0.000000 0.000000 0.000000 0.000000 8.000000 2009.000000
max 190.000000 313.000000 215245.000000 10.000000 9.000000 2010.000000 2010.000000 1600.000000 5644.000000 1526.000000 ... 1488.000000 1424.000000 742.000000 1012.000000 508.000000 576.000000 800.000000 17000.000000 12.000000 2010.000000

8 rows × 36 columns


In [10]:
all_data.columns


Out[10]:
Index([u'MSSubClass', u'MSZoning', u'LotFrontage', u'LotArea', u'Street',
       u'Alley', u'LotShape', u'LandContour', u'Utilities', u'LotConfig',
       u'LandSlope', u'Neighborhood', u'Condition1', u'Condition2',
       u'BldgType', u'HouseStyle', u'OverallQual', u'OverallCond',
       u'YearBuilt', u'YearRemodAdd', u'RoofStyle', u'RoofMatl',
       u'Exterior1st', u'Exterior2nd', u'MasVnrType', u'MasVnrArea',
       u'ExterQual', u'ExterCond', u'Foundation', u'BsmtQual', u'BsmtCond',
       u'BsmtExposure', u'BsmtFinType1', u'BsmtFinSF1', u'BsmtFinType2',
       u'BsmtFinSF2', u'BsmtUnfSF', u'TotalBsmtSF', u'Heating', u'HeatingQC',
       u'CentralAir', u'Electrical', u'1stFlrSF', u'2ndFlrSF', u'LowQualFinSF',
       u'GrLivArea', u'BsmtFullBath', u'BsmtHalfBath', u'FullBath',
       u'HalfBath', u'BedroomAbvGr', u'KitchenAbvGr', u'KitchenQual',
       u'TotRmsAbvGrd', u'Functional', u'Fireplaces', u'FireplaceQu',
       u'GarageType', u'GarageYrBlt', u'GarageFinish', u'GarageCars',
       u'GarageArea', u'GarageQual', u'GarageCond', u'PavedDrive',
       u'WoodDeckSF', u'OpenPorchSF', u'EnclosedPorch', u'3SsnPorch',
       u'ScreenPorch', u'PoolArea', u'PoolQC', u'Fence', u'MiscFeature',
       u'MiscVal', u'MoSold', u'YrSold', u'SaleType', u'SaleCondition'],
      dtype='object')

In [11]:
print("Some housing prices statistics\n")
print(train['SalePrice'].describe())
print("\nThe median of the Housing Price is: ", train['SalePrice'].median(axis = 0))


Some housing prices statistics

count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64
('\nThe median of the Housing Price is: ', 163000.0)

Data preprocessing:

We're not going to do anything fancy here:

  • First I'll transform the skewed numeric features by taking log(feature + 1) - this will make the features more normal
  • Create Dummy variables for the categorical features
  • Replace the numeric missing values (NaN's) with the mean of their respective columns

In [15]:
matplotlib.rcParams['figure.figsize'] = (12.0, 6.0)
prices = pd.DataFrame({"price":train["SalePrice"], "log(price + 1)":np.log1p(train["SalePrice"])})
prices.hist()


Out[15]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x117d69290>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x117e075d0>]], dtype=object)

In [24]:
sns.distplot(train["SalePrice"], bins=30, kde = False, color = 'b', hist_kws={'alpha': 0.9})


Out[24]:
<matplotlib.axes._subplots.AxesSubplot at 0x11b6e3dd0>

In [22]:
sns.distplot(np.log1p(train["SalePrice"]), bins=30, kde = False, color = 'b', hist_kws={'alpha': 0.9})


Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x11b39c410>

Numerical Features


In [25]:
corr = train.select_dtypes(include = ['float64', 'int64']).iloc[:, 1:].corr()
plt.figure(figsize=(12, 12))
sns.heatmap(corr, vmax=1, square=True)


//anaconda/lib/python2.7/site-packages/matplotlib/collections.py:571: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):
Out[25]:
<matplotlib.axes._subplots.AxesSubplot at 0x11b9e5fd0>

In [28]:
cor_dict = corr['SalePrice'].to_dict()
del cor_dict['SalePrice']
print("List the numerical features decendingly by their correlation with Sale Price:\n")
for ele in sorted(cor_dict.items(), key = lambda x: -abs(x[1])):
    print("{0}: \t{1}".format(*ele))


List the numerical features decendingly by their correlation with Sale Price:

OverallQual: 	0.790981600584
GrLivArea: 	0.708624477613
GarageCars: 	0.640409197258
GarageArea: 	0.623431438918
TotalBsmtSF: 	0.613580551559
1stFlrSF: 	0.605852184692
FullBath: 	0.560663762748
TotRmsAbvGrd: 	0.533723155582
YearBuilt: 	0.522897332879
YearRemodAdd: 	0.507100967111
GarageYrBlt: 	0.486361677488
MasVnrArea: 	0.477493047096
Fireplaces: 	0.466928836752
BsmtFinSF1: 	0.386419806242
LotFrontage: 	0.351799096571
WoodDeckSF: 	0.324413444568
2ndFlrSF: 	0.319333802832
OpenPorchSF: 	0.315856227116
HalfBath: 	0.284107675595
LotArea: 	0.263843353871
BsmtFullBath: 	0.227122233131
BsmtUnfSF: 	0.214479105547
BedroomAbvGr: 	0.168213154301
KitchenAbvGr: 	-0.135907370842
EnclosedPorch: 	-0.128577957926
ScreenPorch: 	0.111446571143
PoolArea: 	0.0924035494919
MSSubClass: 	-0.0842841351266
OverallCond: 	-0.0778558940487
MoSold: 	0.0464322452238
3SsnPorch: 	0.0445836653357
YrSold: 	-0.0289225851687
LowQualFinSF: 	-0.0256061300007
MiscVal: 	-0.0211895796403
BsmtHalfBath: 	-0.0168441542974
BsmtFinSF2: 	-0.0113781214502

The housing price correlates strongly with `OverallQual, GrLivArea(GarageCars), GargeArea, TotalBsmtSF, 1stFlrSF, FullBath, TotRmsAbvGrd, YearBuilt, YearRemodAdd, GargeYrBlt, MasVnrArea` and `Fireplaces`. But some of those features are highly correlated among each others.


In [30]:
sns.regplot(x = 'OverallQual', y = 'SalePrice', data = train, color = 'Orange')


Out[30]:
<matplotlib.axes._subplots.AxesSubplot at 0x11bd9c150>

In [36]:
plt.figure(1)

f, axarr = plt.subplots(3, 3, figsize=(10, 9))
price = train.SalePrice.values
axarr[0, 0].scatter(train.GrLivArea.values, price)
axarr[0, 0].set_title('GrLiveArea')
axarr[0, 1].scatter(train.GarageArea.values, price)
axarr[0, 1].set_title('GarageArea')
axarr[0, 2].scatter(train.GarageCars.values, price)
axarr[0, 2].set_title('GarageCars')
axarr[1, 0].scatter(train.TotalBsmtSF.values, price)
axarr[1, 0].set_title('TotalBsmtSF')
axarr[1, 1].scatter(train['1stFlrSF'].values, price)
axarr[1, 1].set_title('1stFlrSF')
axarr[1, 2].scatter(train.OverallQual.values, price)
axarr[1, 2].set_title('OverallQual')
axarr[2, 0].scatter(train.TotRmsAbvGrd.values, price)
axarr[2, 0].set_title('TotRmsAbvGrd')
axarr[2, 1].scatter(train.MasVnrArea.values, price)
axarr[2, 1].set_title('MasVnrArea')
axarr[2, 2].scatter(train.YearBuilt.values, price)
axarr[2, 2].set_title('YearBuilt')

f.text(-0.01, 0.5, 'Sale Price', va='center', rotation='vertical', fontsize = 12)
plt.tight_layout()
plt.show()


<matplotlib.figure.Figure at 0x11d68bd90>

In [38]:
fig = plt.figure(2, figsize=(9, 7))
plt.subplot(211)
plt.scatter(train.YearRemodAdd.values, price)
plt.title('YearRemodAdd')

plt.subplot(212)
plt.scatter(train.YearBuilt.values, price)
plt.title('YearBuilt')

fig.text(-0.01, 0.5, 'Sale Price', va = 'center', rotation = 'vertical', fontsize = 12)

plt.tight_layout()


Categorical Features


In [40]:
print(train.select_dtypes(include=['object']).columns.values)


['MSZoning' 'Street' 'Alley' 'LotShape' 'LandContour' 'Utilities'
 'LotConfig' 'LandSlope' 'Neighborhood' 'Condition1' 'Condition2'
 'BldgType' 'HouseStyle' 'RoofStyle' 'RoofMatl' 'Exterior1st' 'Exterior2nd'
 'MasVnrType' 'ExterQual' 'ExterCond' 'Foundation' 'BsmtQual' 'BsmtCond'
 'BsmtExposure' 'BsmtFinType1' 'BsmtFinType2' 'Heating' 'HeatingQC'
 'CentralAir' 'Electrical' 'KitchenQual' 'Functional' 'FireplaceQu'
 'GarageType' 'GarageFinish' 'GarageQual' 'GarageCond' 'PavedDrive'
 'PoolQC' 'Fence' 'MiscFeature' 'SaleType' 'SaleCondition']

In [41]:
plt.figure(figsize = (12, 6))
sns.boxplot(x = 'Neighborhood', y = 'SalePrice',  data = train)
xt = plt.xticks(rotation=45)



In [42]:
plt.figure(figsize = (12, 6))
sns.countplot(x = 'Neighborhood', data = train)
xt = plt.xticks(rotation=45)


Could group those Neighborhoods with similar housing price into a same bucket for dimension-reduction.

Housing Price vs Sales

  • Sale Type & Condition
  • Sales Seasonality

In [44]:
fig, ax = plt.subplots(1, 2, figsize = (10, 6))
sns.boxplot(x = 'SaleType', y = 'SalePrice', data = train, ax = ax[0])
sns.boxplot(x = 'SaleCondition', y = 'SalePrice', data = train, ax = ax[1])
plt.tight_layout()



In [47]:
g = sns.FacetGrid(train, col = 'YrSold', col_wrap = 3)
g.map(sns.boxplot, 'MoSold', 'SalePrice', palette='Set2', order = range(1, 13))\
.set(ylim = (0, 500000))
plt.tight_layout()


Sale's timing does not seem to hugely affect the house.

  • Housing Style

In [49]:
fig, ax = plt.subplots(2, 1, figsize = (10, 8))
sns.boxplot(x = 'BldgType', y = 'SalePrice', data = train, ax = ax[0])
sns.boxplot(x = 'HouseStyle', y = 'SalePrice', data = train, ax = ax[1])


Out[49]:
<matplotlib.axes._subplots.AxesSubplot at 0x11de83950>
  • House Condition

In [51]:
fig, ax = plt.subplots(2, 1, figsize = (10, 8))
sns.boxplot(x = 'Condition1', y = 'SalePrice', data = train, ax = ax[0])
sns.boxplot(x = 'Exterior1st', y = 'SalePrice', data = train, ax = ax[1])
x = plt.xticks(rotation = 45)
plt.show()



In [53]:
fig, ax = plt.subplots(2, 2, figsize = (10, 8))
sns.boxplot('BsmtCond', 'SalePrice', data = train, ax = ax[0, 0])
sns.boxplot('BsmtQual', 'SalePrice', data = train, ax = ax[0, 1])
sns.boxplot('BsmtExposure', 'SalePrice', data = train, ax = ax[1, 0])
sns.boxplot('BsmtFinType1', 'SalePrice', data = train, ax = ax[1, 1])


Out[53]:
<matplotlib.axes._subplots.AxesSubplot at 0x122813190>
  • Home Functionality

In [56]:
sns.violinplot('Functional', 'SalePrice', data = train)


Out[56]:
<matplotlib.axes._subplots.AxesSubplot at 0x11e095490>
  • FirePlaceQu

In [63]:
sns.factorplot('FireplaceQu', 'SalePrice', data = train, color = 'm', \
               estimator = np.median, order = ['Ex', 'Gd', 'TA', 'Fa', 'Po'], size = 4.5,  aspect=1.35)


Out[63]:
<seaborn.axisgrid.FacetGrid at 0x1238cb710>

In [65]:
pd.crosstab(train.Fireplaces, train.FireplaceQu)


Out[65]:
FireplaceQu Ex Fa Gd Po TA
Fireplaces
1 19 28 324 20 259
2 4 4 54 0 53
3 1 1 2 0 1

In [67]:
g = sns.FacetGrid(train, col = 'FireplaceQu', col_wrap = 3, col_order=['Ex', 'Gd', 'TA', 'Fa', 'Po'])
g.map(sns.boxplot, 'Fireplaces', 'SalePrice', order = [1, 2, 3], palette = 'Set2')


Out[67]:
<seaborn.axisgrid.FacetGrid at 0x123d247d0>
  • Heating

Ames is a cold place in winter, so heating (as well as fireplace qualities) are quite important.


In [69]:
pd.crosstab(train.HeatingQC, train.CentralAir)


Out[69]:
CentralAir N Y
HeatingQC
Ex 8 733
Fa 24 25
Gd 13 228
Po 1 0
TA 49 379

In [70]:
pd.crosstab(train.HeatingQC, train.FireplaceQu)


Out[70]:
FireplaceQu Ex Fa Gd Po TA
HeatingQC
Ex 22 14 254 4 160
Fa 0 1 13 1 5
Gd 2 3 45 5 57
TA 0 15 68 10 91

In [73]:
sns.factorplot('HeatingQC', 'SalePrice', hue = 'CentralAir', estimator = np.mean, data = train, 
             size = 4.5, aspect = 1.4)


Out[73]:
<seaborn.axisgrid.FacetGrid at 0x120422d90>

In [75]:
fig, ax = plt.subplots(1, 2, figsize = (10, 4))
sns.boxplot('Electrical', 'SalePrice', data = train, ax = ax[0]).set(ylim = (0, 400000))
sns.countplot('Electrical', data = train)
plt.tight_layout()


  • Kitchen Quality

In [77]:
sns.factorplot('KitchenQual', 'SalePrice', estimator = np.mean, 
               size = 4.5, aspect = 1.4, data = train, order = ['Ex', 'Gd', 'TA', 'Fa'])


Out[77]:
<seaborn.axisgrid.FacetGrid at 0x125577e50>
  • MSZonig

In [79]:
sns.boxplot(x = 'MSZoning', y = 'SalePrice', data = train)


Out[79]:
<matplotlib.axes._subplots.AxesSubplot at 0x125d2df90>
  • Street & Alley Access

In [82]:
fig, ax = plt.subplots(1, 2, figsize = (10, 4))
sns.boxplot(x = 'Street', y = 'SalePrice', data = train, ax = ax[0], order=['Grvl','Pave'])
sns.boxplot(x = 'Alley', y = 'SalePrice', data = train, ax = ax[1], order=['Grvl','Pave'])
plt.tight_layout()



In [85]:
print("The NA's in Alley is: ", train['Alley'].isnull().sum())
print("\nThere are so many NA's in Alley. When Alley is NA, Street = ", 
      train[train.Alley.notnull()].Street.unique())
print("\n", pd.crosstab(train.Street, train.Alley))


("The NA's in Alley is: ", 1369)
("\nThere are so many NA's in Alley. When Alley is NA, Street = ", array(['Pave'], dtype=object))
('\n', Alley   Grvl  Pave
Street            
Pave      50    41)

In [ ]: