Load Data


In [2]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import colors
import seaborn as sns

# Load both splits; the first row of each CSV holds the column names.
train = pd.read_csv('train.csv', header=0)
test = pd.read_csv('test.csv', header=0)

# Stack the predictors of both splits into one frame. The response
# (SalePrice) only exists in `train`, so it is dropped before concatenating.
full = pd.concat([train.drop('SalePrice', axis=1), test], ignore_index=True)

# Print a column/dtype/null-count summary for each frame.
# print(...) with a single argument behaves identically on Python 2 and 3.
for label, frame in [('train', train), ('test', test), ('full', full)]:
    print(label)
    frame.info()

full.head()


train
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            91 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-null object
Exterior2nd      1460 non-null object
MasVnrType       1452 non-null object
MasVnrArea       1452 non-null float64
ExterQual        1460 non-null object
ExterCond        1460 non-null object
Foundation       1460 non-null object
BsmtQual         1423 non-null object
BsmtCond         1423 non-null object
BsmtExposure     1422 non-null object
BsmtFinType1     1423 non-null object
BsmtFinSF1       1460 non-null int64
BsmtFinType2     1422 non-null object
BsmtFinSF2       1460 non-null int64
BsmtUnfSF        1460 non-null int64
TotalBsmtSF      1460 non-null int64
Heating          1460 non-null object
HeatingQC        1460 non-null object
CentralAir       1460 non-null object
Electrical       1459 non-null object
1stFlrSF         1460 non-null int64
2ndFlrSF         1460 non-null int64
LowQualFinSF     1460 non-null int64
GrLivArea        1460 non-null int64
BsmtFullBath     1460 non-null int64
BsmtHalfBath     1460 non-null int64
FullBath         1460 non-null int64
HalfBath         1460 non-null int64
BedroomAbvGr     1460 non-null int64
KitchenAbvGr     1460 non-null int64
KitchenQual      1460 non-null object
TotRmsAbvGrd     1460 non-null int64
Functional       1460 non-null object
Fireplaces       1460 non-null int64
FireplaceQu      770 non-null object
GarageType       1379 non-null object
GarageYrBlt      1379 non-null float64
GarageFinish     1379 non-null object
GarageCars       1460 non-null int64
GarageArea       1460 non-null int64
GarageQual       1379 non-null object
GarageCond       1379 non-null object
PavedDrive       1460 non-null object
WoodDeckSF       1460 non-null int64
OpenPorchSF      1460 non-null int64
EnclosedPorch    1460 non-null int64
3SsnPorch        1460 non-null int64
ScreenPorch      1460 non-null int64
PoolArea         1460 non-null int64
PoolQC           7 non-null object
Fence            281 non-null object
MiscFeature      54 non-null object
MiscVal          1460 non-null int64
MoSold           1460 non-null int64
YrSold           1460 non-null int64
SaleType         1460 non-null object
SaleCondition    1460 non-null object
SalePrice        1460 non-null int64
dtypes: float64(3), int64(35), object(43)
memory usage: 924.0+ KB
test
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
Id               1459 non-null int64
MSSubClass       1459 non-null int64
MSZoning         1455 non-null object
LotFrontage      1232 non-null float64
LotArea          1459 non-null int64
Street           1459 non-null object
Alley            107 non-null object
LotShape         1459 non-null object
LandContour      1459 non-null object
Utilities        1457 non-null object
LotConfig        1459 non-null object
LandSlope        1459 non-null object
Neighborhood     1459 non-null object
Condition1       1459 non-null object
Condition2       1459 non-null object
BldgType         1459 non-null object
HouseStyle       1459 non-null object
OverallQual      1459 non-null int64
OverallCond      1459 non-null int64
YearBuilt        1459 non-null int64
YearRemodAdd     1459 non-null int64
RoofStyle        1459 non-null object
RoofMatl         1459 non-null object
Exterior1st      1458 non-null object
Exterior2nd      1458 non-null object
MasVnrType       1443 non-null object
MasVnrArea       1444 non-null float64
ExterQual        1459 non-null object
ExterCond        1459 non-null object
Foundation       1459 non-null object
BsmtQual         1415 non-null object
BsmtCond         1414 non-null object
BsmtExposure     1415 non-null object
BsmtFinType1     1417 non-null object
BsmtFinSF1       1458 non-null float64
BsmtFinType2     1417 non-null object
BsmtFinSF2       1458 non-null float64
BsmtUnfSF        1458 non-null float64
TotalBsmtSF      1458 non-null float64
Heating          1459 non-null object
HeatingQC        1459 non-null object
CentralAir       1459 non-null object
Electrical       1459 non-null object
1stFlrSF         1459 non-null int64
2ndFlrSF         1459 non-null int64
LowQualFinSF     1459 non-null int64
GrLivArea        1459 non-null int64
BsmtFullBath     1457 non-null float64
BsmtHalfBath     1457 non-null float64
FullBath         1459 non-null int64
HalfBath         1459 non-null int64
BedroomAbvGr     1459 non-null int64
KitchenAbvGr     1459 non-null int64
KitchenQual      1458 non-null object
TotRmsAbvGrd     1459 non-null int64
Functional       1457 non-null object
Fireplaces       1459 non-null int64
FireplaceQu      729 non-null object
GarageType       1383 non-null object
GarageYrBlt      1381 non-null float64
GarageFinish     1381 non-null object
GarageCars       1458 non-null float64
GarageArea       1458 non-null float64
GarageQual       1381 non-null object
GarageCond       1381 non-null object
PavedDrive       1459 non-null object
WoodDeckSF       1459 non-null int64
OpenPorchSF      1459 non-null int64
EnclosedPorch    1459 non-null int64
3SsnPorch        1459 non-null int64
ScreenPorch      1459 non-null int64
PoolArea         1459 non-null int64
PoolQC           3 non-null object
Fence            290 non-null object
MiscFeature      51 non-null object
MiscVal          1459 non-null int64
MoSold           1459 non-null int64
YrSold           1459 non-null int64
SaleType         1458 non-null object
SaleCondition    1459 non-null object
dtypes: float64(11), int64(26), object(43)
memory usage: 911.9+ KB
full
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2919 entries, 0 to 2918
Data columns (total 80 columns):
Id               2919 non-null int64
MSSubClass       2919 non-null int64
MSZoning         2915 non-null object
LotFrontage      2433 non-null float64
LotArea          2919 non-null int64
Street           2919 non-null object
Alley            198 non-null object
LotShape         2919 non-null object
LandContour      2919 non-null object
Utilities        2917 non-null object
LotConfig        2919 non-null object
LandSlope        2919 non-null object
Neighborhood     2919 non-null object
Condition1       2919 non-null object
Condition2       2919 non-null object
BldgType         2919 non-null object
HouseStyle       2919 non-null object
OverallQual      2919 non-null int64
OverallCond      2919 non-null int64
YearBuilt        2919 non-null int64
YearRemodAdd     2919 non-null int64
RoofStyle        2919 non-null object
RoofMatl         2919 non-null object
Exterior1st      2918 non-null object
Exterior2nd      2918 non-null object
MasVnrType       2895 non-null object
MasVnrArea       2896 non-null float64
ExterQual        2919 non-null object
ExterCond        2919 non-null object
Foundation       2919 non-null object
BsmtQual         2838 non-null object
BsmtCond         2837 non-null object
BsmtExposure     2837 non-null object
BsmtFinType1     2840 non-null object
BsmtFinSF1       2918 non-null float64
BsmtFinType2     2839 non-null object
BsmtFinSF2       2918 non-null float64
BsmtUnfSF        2918 non-null float64
TotalBsmtSF      2918 non-null float64
Heating          2919 non-null object
HeatingQC        2919 non-null object
CentralAir       2919 non-null object
Electrical       2918 non-null object
1stFlrSF         2919 non-null int64
2ndFlrSF         2919 non-null int64
LowQualFinSF     2919 non-null int64
GrLivArea        2919 non-null int64
BsmtFullBath     2917 non-null float64
BsmtHalfBath     2917 non-null float64
FullBath         2919 non-null int64
HalfBath         2919 non-null int64
BedroomAbvGr     2919 non-null int64
KitchenAbvGr     2919 non-null int64
KitchenQual      2918 non-null object
TotRmsAbvGrd     2919 non-null int64
Functional       2917 non-null object
Fireplaces       2919 non-null int64
FireplaceQu      1499 non-null object
GarageType       2762 non-null object
GarageYrBlt      2760 non-null float64
GarageFinish     2760 non-null object
GarageCars       2918 non-null float64
GarageArea       2918 non-null float64
GarageQual       2760 non-null object
GarageCond       2760 non-null object
PavedDrive       2919 non-null object
WoodDeckSF       2919 non-null int64
OpenPorchSF      2919 non-null int64
EnclosedPorch    2919 non-null int64
3SsnPorch        2919 non-null int64
ScreenPorch      2919 non-null int64
PoolArea         2919 non-null int64
PoolQC           10 non-null object
Fence            571 non-null object
MiscFeature      105 non-null object
MiscVal          2919 non-null int64
MoSold           2919 non-null int64
YrSold           2919 non-null int64
SaleType         2918 non-null object
SaleCondition    2919 non-null object
dtypes: float64(11), int64(26), object(43)
memory usage: 1.8+ MB
Out[2]:
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities ... ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition
0 1 60 RL 65.0 8450 Pave NaN Reg Lvl AllPub ... 0 0 NaN NaN NaN 0 2 2008 WD Normal
1 2 20 RL 80.0 9600 Pave NaN Reg Lvl AllPub ... 0 0 NaN NaN NaN 0 5 2007 WD Normal
2 3 60 RL 68.0 11250 Pave NaN IR1 Lvl AllPub ... 0 0 NaN NaN NaN 0 9 2008 WD Normal
3 4 70 RL 60.0 9550 Pave NaN IR1 Lvl AllPub ... 0 0 NaN NaN NaN 0 2 2006 WD Abnorml
4 5 60 RL 84.0 14260 Pave NaN IR1 Lvl AllPub ... 0 0 NaN NaN NaN 0 12 2008 WD Normal

5 rows × 80 columns

Understand Data

Check Date Coverage


In [2]:
# Compare the sale years covered by the train and test splits.
# Only the year is used here; a month-level view could be built from
# YrSold/MoSold with pd.to_datetime if finer resolution is needed.
ax = sns.distplot(train['YrSold'], kde=False, rug=True, label='train')
sns.distplot(test['YrSold'], kde=False, rug=True, label='test', ax=ax)
# Both histograms share one axes, so a legend is needed to tell them apart.
ax.legend()
ax


Out[2]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fe0480536d0>

The train and test data span the same date range.

Check Plausibility (e.g. outliers)


In [31]:
# Histogram + rug plot for every int64 column of `full` (Id excluded),
# laid out three panels per row.
integers = full.drop('Id', axis=1).select_dtypes(include=['int64'])
n_rows = int(np.ceil(integers.shape[1] / 3.0))
fig, axs = plt.subplots(n_rows, 3, figsize=(14, 24))
# axs.flat walks the grid row by row; surplus panels simply stay empty.
for panel, column in zip(axs.flat, integers.columns):
    sns.distplot(integers[column], kde=False, rug=True, ax=panel)
sns.despine()
plt.tight_layout()


Some series might be more appropriately represented on a logarithmic x-axis:


In [30]:
# Columns to compare on a linear (left) vs. logarithmic (right) x-axis.
log_cols = ['LotArea', 'LowQualFinSF', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', 'MiscVal']
fig, axs = plt.subplots(len(log_cols), 2, figsize=(10, 16))
for i, name in enumerate(log_cols):
    # Shift by one so that any zero entries remain representable on a log axis.
    shifted = full[name] + 1

    # Left panel: raw values, default bins.
    sns.distplot(full[name], kde=False, rug=True, ax=axs[i, 0])

    # Right panel: shifted values with log-spaced bins. Both bin limits are
    # taken from the shifted series; using the unshifted maximum as the upper
    # edge would leave the largest observation outside the last bin.
    log_bins = np.logspace(np.log10(shifted.min()), np.log10(shifted.max()), 50)
    sns.distplot(shifted, kde=False, rug=True, ax=axs[i, 1], color='purple', bins=log_bins)
    axs[i, 1].set_xscale('log')
sns.despine()
plt.tight_layout()


Living area sanity checks:


In [26]:
# Find rows where GrLivArea is larger than 1stFlrSF + 2ndFlrSF.
livarea_diff = pd.DataFrame({'livarea_diff': full['GrLivArea'] - (full['1stFlrSF'] + full['2ndFlrSF'])})
livarea_diff = livarea_diff.loc[livarea_diff['livarea_diff'] > 0]
# Inner join on the index keeps only the flagged rows, with all original columns.
livarea = pd.merge(full, livarea_diff, left_index=True, right_index=True).sort_values(by='livarea_diff')
# NOTE(review): '2.5Fin' houses are excluded, presumably because a finished
# half story legitimately adds area beyond the two floors - confirm.
livarea = livarea.loc[livarea['HouseStyle'] != '2.5Fin']
# print(...) with a single argument behaves identically on Python 2 and 3.
print('%d out of %d rows' % (len(livarea), len(full)))
livarea[['GrLivArea', '1stFlrSF', '2ndFlrSF', 'livarea_diff', 'HouseStyle']].head(40)
# Further consistency checks to consider:
# 'Bedroom' + 'Kitchen' = 'TotRmsAbvGrd'
# 'GarageCars' <=> 'GarageArea'
# 'PoolArea' > 0 <=> 'PoolQC' != NA && 'PoolQC' != None
# 'MiscVal' > 0 <=> 'MiscFeature' != NA && 'MiscFeature' != None


33 out of 2919 rows
Out[26]:
GrLivArea 1stFlrSF 2ndFlrSF livarea_diff HouseStyle
868 2320 1547 720 53 2Story
2713 1223 520 623 80 2Story
829 1223 520 623 80 2Story
1364 1200 520 600 80 2Story
831 1200 520 600 80 2Story
2090 1594 828 658 108 1.5Fin
2473 1484 866 504 114 1.5Fin
945 1869 1188 561 120 1.5Fin
2818 1700 880 680 140 1.5Fin
187 1656 808 704 144 1.5Fin
589 935 779 0 156 1Story
2578 845 640 0 205 1.5Fin
1349 2358 938 1215 205 2Story
873 1268 1036 0 232 1Story
125 754 520 0 234 1.5Fin
1844 1774 904 611 259 SLvl
1912 2168 928 928 312 2Story
51 1176 816 0 360 1.5Fin
729 1208 848 0 360 1.5Fin
1570 1531 1169 0 362 1.5Fin
263 1316 926 0 390 1.5Fin
197 3112 1360 1360 392 2Story
1801 1495 1064 0 431 1.5Fin
1820 1480 1044 0 436 1.5Fin
2870 1414 964 0 450 1.5Fin
406 1639 1166 0 473 1.5Fin
1173 3086 1636 971 479 1.5Fin
2423 1728 1216 0 512 1.5Fin
88 1526 1013 0 513 1.5Fin
1009 1522 1008 0 514 1.5Fin
170 1382 854 0 528 1.5Fin
2130 1969 1272 0 697 1.5Fin
1785 2377 1313 0 1064 1.5Fin

In [27]:
# Find rows where TotalBsmtSF is larger than the sum of its three parts
# (BsmtFinSF1 + BsmtFinSF2 + BsmtUnfSF).
bsmtarea_diff = pd.DataFrame({'bsmtarea_diff': full['TotalBsmtSF'] -
                              (full['BsmtFinSF1'] + full['BsmtFinSF2'] + full['BsmtUnfSF'])})
# Rows with a missing basement value yield NaN here; NaN > 0 is False, so
# they are not reported by this check.
bsmtarea_diff = bsmtarea_diff.loc[bsmtarea_diff['bsmtarea_diff'] > 0]
# Inner join on the index keeps only the flagged rows, with all original columns.
bsmtarea = pd.merge(full, bsmtarea_diff, left_index=True, right_index=True).sort_values(by='bsmtarea_diff')
# print(...) with a single argument behaves identically on Python 2 and 3.
print('%d out of %d rows' % (len(bsmtarea), len(full)))
bsmtarea[['TotalBsmtSF', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'bsmtarea_diff']].head(40)


0 out of 2919 rows
Out[27]:
TotalBsmtSF BsmtFinSF1 BsmtFinSF2 BsmtUnfSF bsmtarea_diff

In [33]:
# Find rows where TotRmsAbvGrd exceeds BedroomAbvGr + KitchenAbvGr by more
# than six, i.e. houses with unusually many rooms that are neither bedrooms
# nor kitchens.
rooms_diff = pd.DataFrame({'rooms_diff': full['TotRmsAbvGrd'] - (full['BedroomAbvGr'] + full['KitchenAbvGr'])})
rooms_diff = rooms_diff.loc[rooms_diff['rooms_diff'] > 6]
# Inner join on the index keeps only the flagged rows, with all original columns.
rooms = pd.merge(full, rooms_diff, left_index=True, right_index=True).sort_values(by='rooms_diff')
# print(...) with a single argument behaves identically on Python 2 and 3.
print('%d out of %d rows' % (len(rooms), len(full)))
rooms[['TotRmsAbvGrd', 'BedroomAbvGr', 'KitchenAbvGr', 'rooms_diff']].head(40)


16 out of 2919 rows
Out[33]:
TotRmsAbvGrd BedroomAbvGr KitchenAbvGr rooms_diff
178 9 1 1 7
185 12 4 1 7
440 10 2 1 7
523 11 3 1 7
527 11 3 1 7
769 12 4 1 7
803 12 4 1 7
1024 10 2 1 7
1440 11 3 1 7
1998 12 4 1 7
2627 11 3 1 7
2689 11 3 1 7
898 11 2 1 8
1173 12 3 1 8
1298 12 3 1 8
2549 15 2 1 12

In [9]:
# Histogram + rug plot for every float64 column of `full`, three panels per row.
floats = full.drop('Id', axis=1).select_dtypes(include=['float64'])
n_rows = int(np.ceil(floats.shape[1] / 3.0))
fig, axs = plt.subplots(n_rows, 3, figsize=(14, 14))
panels = axs.ravel()
for idx, column in enumerate(floats.columns):
    # Missing values are dropped before plotting.
    observed = floats[column].dropna()
    sns.distplot(observed, kde=False, rug=True, ax=panels[idx])
sns.despine()
plt.tight_layout()


Check Distribution of Response Variable


In [39]:
# SalePrice on a linear axis (left) and with log-spaced bins on a log axis (right).
fig, axs = plt.subplots(1, 2, figsize=(14, 6))
price = train['SalePrice']
log_edges = np.logspace(np.log10(price.min()), np.log10(price.max()), 50)
sns.distplot(price, kde=False, rug=True, ax=axs[0])
sns.distplot(price, kde=False, rug=True, ax=axs[1], color='purple', bins=log_edges)
axs[1].set_xscale('log')
sns.despine()
plt.tight_layout()



In [40]:
# Same linear/log comparison as before, restricted to houses with
# GrLivArea below 4000.
fig, axs = plt.subplots(1, 2, figsize=(14, 6))
price = train.loc[train['GrLivArea'] < 4000, 'SalePrice']
log_edges = np.logspace(np.log10(price.min()), np.log10(price.max()), 50)
sns.distplot(price, kde=False, rug=True, ax=axs[0])
sns.distplot(price, kde=False, rug=True, ax=axs[1], color='purple', bins=log_edges)
axs[1].set_xscale('log')
sns.despine()
plt.tight_layout()



In [41]:
fig, axs = plt.subplots(1, 2, figsize=(14, 6))
sns.distplot(train['SalePrice'][train['GrLivArea'] < 4000], kde=False, rug=True, ax=axs[0])
sns.distplot(np.sqrt(train['SalePrice'][train['GrLivArea'] < 4000]), kde=False, rug=True, ax=axs[1], color='purple')
sns.despine()
plt.tight_layout()


The sale price distribution is slightly skewed. We might want to model it on a log scale later on (see the log-scaled right-hand panels above).

Clean Data

Imputation

Fix Outliers

Data Transformation


In [ ]:
# Convert numerical categorical data to categorical type: MSSubClass

In [3]:
# Convert ordinal categorical data to numeric types

# Start from a copy of the training frame so this cell runs on a fresh kernel
# (no earlier cell defines `df`) and leaves `train` untouched.
df = train.copy()

# Each label is mapped to its position in the list, so a larger code means a
# better grade.
# NOTE: pd.read_csv parses the literal string 'NA' as a missing value by
# default, so the 'NA' -> 0 entries below never match and those cells stay
# NaN (which is why the affected columns end up as float64). Fill them during
# imputation if 0 is the intended code.
quality = {v: k for k, v in enumerate(['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'])}
quality_cols = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual', 'FireplaceQu',
                'GarageQual', 'GarageCond', 'PoolQC']
rating = {v: k for k, v in enumerate(['NA', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'])}

# One {column: {label: code}} mapping for all ordinal columns.
ordinal_maps = {c: quality for c in quality_cols}
ordinal_maps.update({c: rating for c in ['BsmtFinType1', 'BsmtFinType2']})
ordinal_maps['BsmtExposure'] = {v: k for k, v in enumerate(['NA', 'No', 'Mn', 'Av', 'Gd'])}
ordinal_maps['Functional'] = {v: k for k, v in
                              enumerate(['Sal', 'Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1', 'Typ'])}
ordinal_maps['GarageFinish'] = {v: k for k, v in enumerate(['NA', 'Unf', 'RFn', 'Fin'])}
ordinal_maps['PavedDrive'] = {v: k for k, v in enumerate(['N', 'P', 'Y'])}

# Non-inplace replace: re-running the cell always starts from the raw labels.
df = df.replace(ordinal_maps)

cols_cat_num = quality_cols + ['BsmtFinType1', 'BsmtFinType2', 'BsmtExposure', 'Functional', 'GarageFinish',
                               'PavedDrive']
# After replacement the columns may still have object dtype; coerce them to numeric.
for name in cols_cat_num:
    df[name] = pd.to_numeric(df[name])
df[cols_cat_num].info()
df[cols_cat_num].head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 16 columns):
ExterQual       1460 non-null int64
ExterCond       1460 non-null int64
BsmtQual        1423 non-null float64
BsmtCond        1423 non-null float64
HeatingQC       1460 non-null int64
KitchenQual     1460 non-null int64
FireplaceQu     770 non-null float64
GarageQual      1379 non-null float64
GarageCond      1379 non-null float64
PoolQC          7 non-null float64
BsmtFinType1    1423 non-null float64
BsmtFinType2    1422 non-null float64
BsmtExposure    1422 non-null float64
Functional      1460 non-null int64
GarageFinish    1379 non-null float64
PavedDrive      1460 non-null int64
dtypes: float64(10), int64(6)
memory usage: 182.6 KB
Out[3]:
ExterQual ExterCond BsmtQual BsmtCond HeatingQC KitchenQual FireplaceQu GarageQual GarageCond PoolQC BsmtFinType1 BsmtFinType2 BsmtExposure Functional GarageFinish PavedDrive
0 4 3 4.0 3.0 5 4 NaN 3.0 3.0 NaN 6.0 1.0 1.0 7 2.0 2
1 3 3 4.0 3.0 5 3 3.0 3.0 3.0 NaN 5.0 1.0 4.0 7 2.0 2
2 4 3 4.0 3.0 5 4 3.0 3.0 3.0 NaN 6.0 1.0 2.0 7 2.0 2
3 3 3 3.0 4.0 4 4 4.0 3.0 3.0 NaN 5.0 1.0 1.0 7 1.0 2
4 4 3 4.0 3.0 5 4 3.0 3.0 3.0 NaN 6.0 1.0 3.0 7 2.0 2

In [5]:
# Convert ordinal categorical data to categorical type
cols_cat_ord = [
    'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual',
    'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC',
    'BsmtFinType1', 'BsmtFinType2', 'BsmtExposure',
    'Functional', 'GarageFinish', 'PavedDrive',
]
# Re-type each column in place as a pandas categorical; the categories are
# inferred from the values present in the column.
for ordinal_col in cols_cat_ord:
    df[ordinal_col] = pd.Categorical(df[ordinal_col])

ordinal_view = df[cols_cat_ord]
ordinal_view.info()
ordinal_view.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 16 columns):
ExterQual       1460 non-null category
ExterCond       1460 non-null category
BsmtQual        1423 non-null category
BsmtCond        1423 non-null category
HeatingQC       1460 non-null category
KitchenQual     1460 non-null category
FireplaceQu     770 non-null category
GarageQual      1379 non-null category
GarageCond      1379 non-null category
PoolQC          7 non-null category
BsmtFinType1    1423 non-null category
BsmtFinType2    1422 non-null category
BsmtExposure    1422 non-null category
Functional      1460 non-null category
GarageFinish    1379 non-null category
PavedDrive      1460 non-null category
dtypes: category(16)
memory usage: 23.5 KB
Out[5]:
ExterQual ExterCond BsmtQual BsmtCond HeatingQC KitchenQual FireplaceQu GarageQual GarageCond PoolQC BsmtFinType1 BsmtFinType2 BsmtExposure Functional GarageFinish PavedDrive
0 Gd TA Gd TA Ex Gd NaN TA TA NaN GLQ Unf No Typ RFn Y
1 TA TA Gd TA Ex TA TA TA TA NaN ALQ Unf Gd Typ RFn Y
2 Gd TA Gd TA Ex Gd TA TA TA NaN GLQ Unf Mn Typ RFn Y
3 TA TA TA Gd Gd Gd Gd TA TA NaN ALQ Unf No Typ Unf Y
4 Gd TA Gd TA Ex Gd TA TA TA NaN GLQ Unf Av Typ RFn Y

In [6]:
# Convert non-ordinal categorical data to categorical type
cols_cat = [
    'MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
    'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
    'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
    'Foundation', 'Heating', 'CentralAir', 'Electrical', 'GarageType', 'Fence',
    'MiscFeature', 'SaleType', 'SaleCondition', 'MoSold',
]
# MSSubClass and MoSold are stored as integers but are re-typed as categories
# here together with the string-valued columns.
for nominal_col in cols_cat:
    df[nominal_col] = pd.Categorical(df[nominal_col])

nominal_view = df[cols_cat]
nominal_view.info()
nominal_view.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 29 columns):
MSSubClass       1460 non-null category
MSZoning         1460 non-null category
Street           1460 non-null category
Alley            91 non-null category
LotShape         1460 non-null category
LandContour      1460 non-null category
Utilities        1460 non-null category
LotConfig        1460 non-null category
LandSlope        1460 non-null category
Neighborhood     1460 non-null category
Condition1       1460 non-null category
Condition2       1460 non-null category
BldgType         1460 non-null category
HouseStyle       1460 non-null category
RoofStyle        1460 non-null category
RoofMatl         1460 non-null category
Exterior1st      1460 non-null category
Exterior2nd      1460 non-null category
MasVnrType       1452 non-null category
Foundation       1460 non-null category
Heating          1460 non-null category
CentralAir       1460 non-null category
Electrical       1459 non-null category
GarageType       1379 non-null category
Fence            281 non-null category
MiscFeature      54 non-null category
SaleType         1460 non-null category
SaleCondition    1460 non-null category
MoSold           1460 non-null category
dtypes: category(29)
memory usage: 43.0 KB
Out[6]:
MSSubClass MSZoning Street Alley LotShape LandContour Utilities LotConfig LandSlope Neighborhood ... Foundation Heating CentralAir Electrical GarageType Fence MiscFeature SaleType SaleCondition MoSold
0 60 RL Pave NaN Reg Lvl AllPub Inside Gtl CollgCr ... PConc GasA Y SBrkr Attchd NaN NaN WD Normal 2
1 20 RL Pave NaN Reg Lvl AllPub FR2 Gtl Veenker ... CBlock GasA Y SBrkr Attchd NaN NaN WD Normal 5
2 60 RL Pave NaN IR1 Lvl AllPub Inside Gtl CollgCr ... PConc GasA Y SBrkr Attchd NaN NaN WD Normal 9
3 70 RL Pave NaN IR1 Lvl AllPub Corner Gtl Crawfor ... BrkTil GasA Y SBrkr Detchd NaN NaN WD Abnorml 2
4 60 RL Pave NaN IR1 Lvl AllPub FR2 Gtl NoRidge ... PConc GasA Y SBrkr Attchd NaN NaN WD Normal 12

5 rows × 29 columns


In [11]:
# Pairwise correlations between the numeric columns of `df`.
# .iloc[:, 1:] skips the first numeric column (presumably Id - verify).
corr = df.select_dtypes(include=['float64', 'int64']).iloc[:, 1:].corr()
# TODO: sort by correlation with SalePrice
plt.figure(figsize=(15, 15))
# Mask the diagonal and the upper triangle so each pair is drawn only once.
# The builtin `bool` is used because the `np.bool` alias is deprecated and
# absent from newer NumPy releases.
mask = np.triu(np.ones(corr.shape, dtype=bool))
sns.heatmap(corr, vmin=-1, vmax=1, square=True, mask=mask)


Out[11]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f4a9a537690>

In [4]:
# Jittered strip plot of SalePrice against every column listed in `cols_cat`,
# three panels per row.
# Alternative for a single column: sns.swarmplot(x='MSSubClass', y='SalePrice', data=df)
n_rows = int(np.ceil(len(cols_cat) / 3.0))
fig, axs = plt.subplots(n_rows, 3, figsize=(14, 24))
# axs.flat walks the grid row by row; surplus panels simply stay empty.
for panel, feature in zip(axs.flat, cols_cat):
    sns.stripplot(x=feature, y='SalePrice', data=df, jitter=True, ax=panel)
sns.despine()
plt.tight_layout()



In [6]:
# Jittered strip plot of SalePrice against every column listed in
# `cols_cat_ord`, three panels per row.
n_rows = int(np.ceil(len(cols_cat_ord) / 3.0))
fig, axs = plt.subplots(n_rows, 3, figsize=(14, 24))
panels = axs.ravel()
for idx, feature in enumerate(cols_cat_ord):
    sns.stripplot(x=feature, y='SalePrice', data=df, jitter=True, ax=panels[idx])
sns.despine()
plt.tight_layout()



In [12]:
# Pairwise hexbin plots for all int64 columns of `df` except Id.
numeric = df.select_dtypes(include=['int64']).drop('Id', axis=1)
# To speed things up while experimenting, restrict the grid to the first columns:
#numeric = numeric.drop(numeric.columns[[i for i in range(7, numeric.shape[-1])]], axis=1)
numeric.info()

# No earlier cell defines a colormap, so one is set here to keep the cell
# runnable on a fresh kernel.
# NOTE(review): the originally intended colormap is unknown - adjust as needed.
cmap = 'Blues'

g = sns.PairGrid(numeric)
# Progress messages; print(...) with a single argument behaves identically on
# Python 2 and 3.
print('mapping diag')
# The diagonal and the upper triangle are intentionally left blank.
g.map_diag(lambda *args, **kwargs: None)
print('mapping lower')
g.map_lower(plt.hexbin, gridsize=15, cmap=cmap)
print('mapping upper')
g.map_upper(lambda *args, **kwargs: None)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 32 columns):
LotArea          1460 non-null int64
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
BsmtFinSF1       1460 non-null int64
BsmtFinSF2       1460 non-null int64
BsmtUnfSF        1460 non-null int64
TotalBsmtSF      1460 non-null int64
1stFlrSF         1460 non-null int64
2ndFlrSF         1460 non-null int64
LowQualFinSF     1460 non-null int64
GrLivArea        1460 non-null int64
BsmtFullBath     1460 non-null int64
BsmtHalfBath     1460 non-null int64
FullBath         1460 non-null int64
HalfBath         1460 non-null int64
BedroomAbvGr     1460 non-null int64
KitchenAbvGr     1460 non-null int64
TotRmsAbvGrd     1460 non-null int64
Fireplaces       1460 non-null int64
GarageCars       1460 non-null int64
GarageArea       1460 non-null int64
WoodDeckSF       1460 non-null int64
OpenPorchSF      1460 non-null int64
EnclosedPorch    1460 non-null int64
3SsnPorch        1460 non-null int64
ScreenPorch      1460 non-null int64
PoolArea         1460 non-null int64
MiscVal          1460 non-null int64
YrSold           1460 non-null int64
SalePrice        1460 non-null int64
dtypes: int64(32)
memory usage: 365.1 KB
mapping diag
mapping lower
mapping upper
Out[12]:
<seaborn.axisgrid.PairGrid at 0x7fdc22580310>