In [2]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import colors
import seaborn as sns
# Load the raw train/test CSVs; header=0 takes the column names from the first row.
train = pd.read_csv('train.csv', header=0)
test = pd.read_csv('test.csv', header=0)

# Stack the feature columns of both sets into one frame with a fresh 0..n-1 index.
# 'SalePrice' is dropped from train first (presumably because test has no such
# column -- TODO confirm).
full = pd.concat([train.drop('SalePrice', axis=1), test], ignore_index=True)

# print(...) with a single argument behaves identically on Python 2 and 3,
# whereas the bare print statement is a SyntaxError on Python 3.
for label, frame in (('train', train), ('test', test), ('full', full)):
    print(label)
    frame.info()

full.head()
Out[2]:
In [2]:
# Overlay the 'YrSold' histograms (with rug marks) of the train and test sets.
# Both series are drawn on one explicit axis and labelled, so the legend shows
# which colour belongs to which set.
fig, ax = plt.subplots()
sns.distplot(train['YrSold'], kde=False, rug=True, label='train', ax=ax)
sns.distplot(test['YrSold'], kde=False, rug=True, label='test', ax=ax)
ax.legend()
Out[2]:
The train and test data span the same date range.
In [31]:
# One histogram (plus rug marks) per int64 column of `full`, 'Id' excluded,
# laid out three panels per row.
integers = full.drop('Id', axis=1).select_dtypes(include=['int64'])
int_names = integers.axes[1]
n_rows = int(np.ceil(len(int_names) / 3.0))
fig, axs = plt.subplots(n_rows, 3, figsize=(14, 24))
# axs.flat walks the grid row by row, matching the column order.
for panel, name in zip(axs.flat, int_names):
    sns.distplot(integers[name], kde=False, rug=True, ax=panel)
sns.despine()
plt.tight_layout()
Some series might be more appropriately represented on a logarithmic x-axis:
In [30]:
# For each listed column draw the histogram twice: on a linear axis (left
# panel) and, shifted by one, on a logarithmic x-axis (right panel).
log_cols = ['LotArea', 'LowQualFinSF', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', 'MiscVal']
fig, axs = plt.subplots(len(log_cols), 2, figsize=(10, 16))
for i, name in enumerate(log_cols):
    sns.distplot(full[name], kde=False, rug=True, ax=axs[i, 0])
    # Adding one keeps any zero values finite under log10.
    shifted = full[name] + 1
    # The bin edges must cover the *shifted* data. The previous upper edge was
    # log10(max) instead of log10(max + 1), which left the largest
    # observations outside the last bin.
    log_bins = np.logspace(np.log10(shifted.min()), np.log10(shifted.max()), 50)
    sns.distplot(shifted, kde=False, rug=True, ax=axs[i, 1], color='purple', bins=log_bins)
    axs[i, 1].set_xscale('log')
sns.despine()
plt.tight_layout()
Living area sanity checks:
In [26]:
# Sanity check: list rows whose 'GrLivArea' is larger than the sum of the
# first- and second-floor areas.
floor_sum = full['1stFlrSF'] + full['2ndFlrSF']
livarea_diff = pd.DataFrame({'livarea_diff': full['GrLivArea'] - floor_sum})
livarea_diff = livarea_diff.loc[livarea_diff['livarea_diff'] > 0]
# Joining on the index keeps only the rows that survived the filter above.
livarea = pd.merge(full, livarea_diff, left_index=True, right_index=True).sort_values(by='livarea_diff')
# '2.5Fin' houses are excluded -- presumably because their finished half
# storey legitimately adds area beyond the two floors (TODO confirm).
livarea = livarea.loc[livarea['HouseStyle'] != '2.5Fin']
# Single-argument print(...) works on both Python 2 and Python 3.
print('%d out of %d rows' % (len(livarea), len(full)))
livarea[['GrLivArea', '1stFlrSF', '2ndFlrSF', 'livarea_diff', 'HouseStyle']].head(40)
# Further consistency checks to consider:
# 'Bedroom' + 'Kitchen' = 'TotRmsAbvGrd'
# 'GarageCars' <=> 'GarageArea'
# 'PoolArea' > 0 <=> 'PoolQC' != NA && 'PoolQC' != None
# 'MiscVal' > 0 <=> 'MiscFeature' != NA && 'MiscFeature' != None
Out[26]:
In [27]:
# Sanity check: list rows whose 'TotalBsmtSF' is larger than the sum of the
# two finished basement areas and the unfinished area.
bsmt_parts = full['BsmtFinSF1'] + full['BsmtFinSF2'] + full['BsmtUnfSF']
bsmtarea_diff = pd.DataFrame({'bsmtarea_diff': full['TotalBsmtSF'] - bsmt_parts})
bsmtarea_diff = bsmtarea_diff.loc[bsmtarea_diff['bsmtarea_diff'] > 0]
# Joining on the index keeps only the rows that survived the filter above.
bsmtarea = pd.merge(full, bsmtarea_diff, left_index=True, right_index=True).sort_values(by='bsmtarea_diff')
# Single-argument print(...) works on both Python 2 and Python 3.
print('%d out of %d rows' % (len(bsmtarea), len(full)))
bsmtarea[['TotalBsmtSF', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'bsmtarea_diff']].head(40)
Out[27]:
In [33]:
# Sanity check: compare 'TotRmsAbvGrd' with the number of bedrooms plus
# kitchens above ground and list the rows where more than 6 rooms are left
# unaccounted for.
bed_and_kitchen = full['BedroomAbvGr'] + full['KitchenAbvGr']
rooms_diff = pd.DataFrame({'rooms_diff': full['TotRmsAbvGrd'] - bed_and_kitchen})
rooms_diff = rooms_diff.loc[rooms_diff['rooms_diff'] > 6]
# Joining on the index keeps only the rows that survived the filter above.
rooms = pd.merge(full, rooms_diff, left_index=True, right_index=True).sort_values(by='rooms_diff')
# Single-argument print(...) works on both Python 2 and Python 3.
print('%d out of %d rows' % (len(rooms), len(full)))
rooms[['TotRmsAbvGrd', 'BedroomAbvGr', 'KitchenAbvGr', 'rooms_diff']].head(40)
Out[33]:
In [9]:
# One histogram (plus rug marks) per float64 column of `full`, 'Id' excluded,
# three panels per row. Missing values are dropped column by column before
# plotting.
floats = full.drop('Id', axis=1).select_dtypes(include=['float64'])
float_names = floats.axes[1]
n_rows = int(np.ceil(len(float_names) / 3.0))
fig, axs = plt.subplots(n_rows, 3, figsize=(14, 14))
# axs.flat walks the grid row by row, matching the column order.
for panel, name in zip(axs.flat, float_names):
    observed = floats[name].dropna()
    sns.distplot(observed, kde=False, rug=True, ax=panel)
sns.despine()
plt.tight_layout()
In [39]:
# 'SalePrice' histogram on a linear axis (left) and with 50 logarithmically
# spaced bin edges on a log x-axis (right).
price = train['SalePrice']
fig, axs = plt.subplots(1, 2, figsize=(14, 6))
ax_lin, ax_log = axs
sns.distplot(price, kde=False, rug=True, ax=ax_lin)
log_edges = np.logspace(np.log10(price.min()), np.log10(price.max()), 50)
sns.distplot(price, kde=False, rug=True, ax=ax_log, color='purple', bins=log_edges)
ax_log.set_xscale('log')
sns.despine()
plt.tight_layout()
In [40]:
# Same linear/log comparison as for the full training set, restricted to the
# rows with 'GrLivArea' below 4000.
price_sub = train.loc[train['GrLivArea'] < 4000, 'SalePrice']
fig, axs = plt.subplots(1, 2, figsize=(14, 6))
ax_lin, ax_log = axs
sns.distplot(price_sub, kde=False, rug=True, ax=ax_lin)
log_edges = np.logspace(np.log10(price_sub.min()), np.log10(price_sub.max()), 50)
sns.distplot(price_sub, kde=False, rug=True, ax=ax_log, color='purple', bins=log_edges)
ax_log.set_xscale('log')
sns.despine()
plt.tight_layout()
In [41]:
# 'SalePrice' for rows with 'GrLivArea' below 4000: raw values (left) next to
# their square root (right).
price_sub = train.loc[train['GrLivArea'] < 4000, 'SalePrice']
fig, axs = plt.subplots(1, 2, figsize=(14, 6))
ax_raw, ax_sqrt = axs
sns.distplot(price_sub, kde=False, rug=True, ax=ax_raw)
sns.distplot(np.sqrt(price_sub), kde=False, rug=True, ax=ax_sqrt, color='purple')
sns.despine()
plt.tight_layout()
The sale price data is slightly skewed. We might want to transform it to a log scale in our model later on (see second plot).
In [ ]:
# Convert numerical categorical data to categorical type: MSSubClass
In [3]:
# Convert ordinal categorical data to numeric types
# NOTE(review): `df` is not defined in any visible cell of this notebook; it
# must already exist (with these columns) before this cell runs.
quality = {v: k for k, v in enumerate(['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'])}
quality_cols = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual', 'FireplaceQu',
                'GarageQual', 'GarageCond', 'PoolQC']
rating = {v: k for k, v in enumerate(['NA', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'])}

# column name -> {level label: integer rank}
ordinal_maps = {c: quality for c in quality_cols}
ordinal_maps.update({c: rating for c in ['BsmtFinType1', 'BsmtFinType2']})
ordinal_maps['BsmtExposure'] = {v: k for k, v in enumerate(['NA', 'No', 'Mn', 'Av', 'Gd'])}
ordinal_maps['Functional'] = {v: k for k, v in
                              enumerate(['Sal', 'Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1', 'Typ'])}
ordinal_maps['GarageFinish'] = {v: k for k, v in enumerate(['NA', 'Unf', 'RFn', 'Fin'])}
ordinal_maps['PavedDrive'] = {v: k for k, v in enumerate(['N', 'P', 'Y'])}

cols_cat_num = quality_cols + ['BsmtFinType1', 'BsmtFinType2', 'BsmtExposure', 'Functional', 'GarageFinish',
                               'PavedDrive']
for name in cols_cat_num:
    levels = ordinal_maps[name]
    codes = pd.to_numeric(df[name].replace(levels))
    # pandas.read_csv parses the literal string 'NA' as NaN by default, so if
    # df was loaded that way (as train/test are in this notebook) the 'NA' key
    # never matches and those rows would stay NaN. Give them the rank the
    # mapping assigns to 'NA'. Scales without an 'NA' level keep their NaNs.
    if 'NA' in levels:
        codes = codes.fillna(levels['NA'])
    df[name] = codes
df[cols_cat_num].info()
df[cols_cat_num].head()
Out[3]:
In [5]:
# Convert ordinal categorical data to categorical type
cols_cat_ord = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual', 'FireplaceQu',
'GarageQual', 'GarageCond', 'PoolQC', 'BsmtFinType1', 'BsmtFinType2', 'BsmtExposure', 'Functional',
'GarageFinish', 'PavedDrive']
for name in cols_cat_ord:
df[name] = df[name].astype('category')
df[cols_cat_ord].info()
df[cols_cat_ord].head()
Out[5]:
In [6]:
# Convert non-ordinal categorical data to categorical type
cols_cat = ['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle',
'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation', 'Heating', 'CentralAir',
'Electrical', 'GarageType', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition', 'MoSold']
for name in cols_cat:
df[name] = df[name].astype('category')
df[cols_cat].info()
df[cols_cat].head()
Out[6]:
In [11]:
# Pairwise correlations between the float64/int64 columns of `df`.
# .iloc[:, 1:] skips the first of those columns (presumably 'Id' -- TODO
# confirm against df's column order).
corr = df.select_dtypes(include=['float64', 'int64']).iloc[:, 1:].corr()
# TODO: sort by correlation with SalePrice
plt.figure(figsize=(15, 15))
# Mask the upper triangle and the diagonal so each pair is drawn once.
# The builtin `bool` is used because the `np.bool` alias was removed in
# NumPy 1.24.
mask = np.triu(np.ones(corr.values.shape)).astype(bool)
sns.heatmap(corr, vmin=-1, vmax=1, square=True, mask=mask)
Out[11]:
In [4]:
# 'SalePrice' against every nominal column in `cols_cat`: one jittered strip
# plot per column, three panels per row.
# (An earlier draft also experimented with sns.swarmplot and custom palettes.)
n_rows = int(np.ceil(len(cols_cat) / 3.0))
fig, axs = plt.subplots(n_rows, 3, figsize=(14, 24))
# axs.flat walks the grid row by row, matching the order of cols_cat.
for panel, name in zip(axs.flat, cols_cat):
    sns.stripplot(x=name, y='SalePrice', data=df, jitter=True, ax=panel)
sns.despine()
plt.tight_layout()
In [6]:
# 'SalePrice' against every ordinal column in `cols_cat_ord`: one jittered
# strip plot per column, three panels per row.
n_rows = int(np.ceil(len(cols_cat_ord) / 3.0))
fig, axs = plt.subplots(n_rows, 3, figsize=(14, 24))
# axs.flat walks the grid row by row, matching the order of cols_cat_ord.
for panel, name in zip(axs.flat, cols_cat_ord):
    sns.stripplot(x=name, y='SalePrice', data=df, jitter=True, ax=panel)
sns.despine()
plt.tight_layout()
In [12]:
# Pairwise hexbin plots of the int64 columns of `df` ('Id' excluded). Only the
# lower triangle is drawn; diagonal and upper panels are deliberately left
# blank.
numeric = df.select_dtypes(include=['int64']).drop('Id', axis=1)
numeric.info()

# `cmap` was referenced here without being defined in any visible cell, which
# raises NameError on a fresh kernel. A sequential cubehelix colormap is used
# as a stand-in -- NOTE(review): swap in the originally intended colormap if
# it is known.
cmap = sns.cubehelix_palette(start=.5, rot=-.75, as_cmap=True)

g = sns.PairGrid(numeric)
# Single-argument print(...) works on both Python 2 and Python 3.
print('mapping diag')
g.map_diag(lambda *args, **kwargs: None)
print('mapping lower')
g.map_lower(plt.hexbin, gridsize=15, cmap=cmap)
print('mapping upper')
g.map_upper(lambda *args, **kwargs: None)
Out[12]: