In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
In [ ]:
# Load the prepared training data from CSV (path is relative to the working directory).
df = pd.read_csv('data/train_prepared_light.csv')
In [ ]:
# Show the first rows of df as a quick sanity check of the load.
df.head()
In [ ]:
# (rows, columns) of the loaded frame.
df.shape
In [ ]:
# Count the columns whose dtype compares equal to `int`.
# NOTE(review): going by the variable name, integer columns are presumably the
# encoded categorical features -- confirm against the data preparation step.
ncategories = (df.dtypes == int).sum()
ncategories
In [ ]:
# Load the target values from their own CSV; the whole file is kept as a DataFrame.
target = pd.read_csv('data/train_target.csv')
In [ ]:
# (rows, columns) of the target frame.
# NOTE(review): the row count is expected to match df; nothing here checks it.
target.shape
In [ ]:
# List every column name of df.
df.columns
In [ ]:
def plot_corr(df, ax=None, log=False):
    """Plot the pairwise column correlation matrix of a DataFrame.

    The diagonal of the matrix is subtracted out before drawing, so the
    self-correlation entries are shown as zero.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose columns are correlated with ``df.corr()``.
    ax : matplotlib axes, optional
        Axes to draw on. Defaults to the current axes (``plt.gca()``).
        The image, tick labels and axis limits are all applied to this axes.
    log : bool, optional
        If true, plot ``log(abs(corr))`` instead of the raw coefficients.
        A coefficient of exactly zero becomes ``-inf`` in that case.

    Returns
    -------
    pandas.DataFrame
        The matrix that was drawn: the (optionally log-scaled) correlation
        matrix with its diagonal removed.
    """
    corr = df.corr()
    if log:
        corr = np.log(np.abs(corr))
    if ax is None:
        ax = plt.gca()

    # Build a matrix holding only the diagonal and subtract it once; the same
    # result is both drawn and returned.
    off_diagonal = corr - np.diag(np.diag(corr.values))
    ax.matshow(off_diagonal)

    # Label through the axes object rather than pyplot so that a caller-supplied
    # `ax` is the one that actually receives the ticks.
    positions = np.arange(len(corr.columns))
    ax.set_xticks(positions)
    ax.set_xticklabels(corr.columns, rotation=90)
    ax.set_yticks(positions)
    ax.set_yticklabels(corr.columns, rotation=0)
    ax.axis('tight')
    return off_diagonal
In [ ]:
# Select figure number 0 at 15x15 inches and wipe anything already drawn on
# it, then draw the correlation matrix of the full frame there.
plt.figure(num=0, figsize=(15, 15)).clf()
corr = plot_corr(df)
In [ ]:
# Per-column summary statistics of the correlation matrix returned by plot_corr.
corr.describe()
In [ ]:
# For every column of `corr`, plot the largest entry in that column as a dot,
# with the column names along the x axis.
max_corr = corr.max()
plt.figure(figsize=(20, 10))
plt.plot(max_corr.values, 'o')
plt.xticks(range(len(max_corr)), max_corr.index, rotation=90);
In [ ]:
# Keep only the columns whose largest correlation entry exceeds 0.63.
is_highly_correlated = corr.max() > 0.63
hcorr_columns = corr.columns[is_highly_correlated.values]
sorted(hcorr_columns)
In [ ]:
# Restrict the data to the highly correlated columns and preview the result.
dfc = df[hcorr_columns]
dfc.head()
In [ ]:
# Correlation plot restricted to the highly correlated columns; the trailing
# ';' keeps the notebook from echoing the returned matrix.
plot_corr(dfc);
In [ ]:
# Hand-picked subset of features used for the rest of the exploration.
important_columns = [
    'Neighborhood',
    'YearBuilt',
    'OverallQual',
    'OverallCond',
    'ExterQual',
    'ExterCond',
    'MSSubClass',
    'MSZoning',
    'HeatingQC',
    'TotRmsAbvGrd',
    'GarageArea',
    'GarageCond',
    'Fence',
    'GrLivArea',
]
dfi = df[important_columns]
In [ ]:
# Boolean mask (indexed by column name) marking the float-typed columns;
# it is reused further down, negated, to select the remaining columns.
isnum = dfi.dtypes == float
# Float-typed columns only.
dfinum = dfi.loc[:, isnum]
dfinum.shape
In [ ]:
from itertools import chain

# Two box plots per row. Round the row count up so an odd number of numeric
# columns still gets a slot, and size the grid from the data instead of a
# fixed 3x2 layout that would silently drop any column past the sixth.
n_rows = max(1, (len(dfinum.columns) + 1) // 2)
# squeeze=False keeps `axes` two-dimensional even when there is a single row,
# so flattening it with chain.from_iterable always works.
fig, axes = plt.subplots(n_rows, 2, figsize=(10, 10), squeeze=False)
for ax, col in zip(chain.from_iterable(axes), dfinum.columns):
    # Pass the values as `x` by keyword: newer seaborn releases interpret the
    # first positional argument as `data`, which changes the plot orientation.
    sns.boxplot(x=dfinum[col], ax=ax)
plt.tight_layout();
In [ ]:
# Scatter-plot matrix of the float-typed columns (dfinum is dfi.loc[:, isnum]).
sns.pairplot(dfinum)
In [ ]:
# Distribution of the target after dividing it by 1000.
scaled_target = target / 1000
sns.distplot(scaled_target);
In [ ]:
# Copy of the selected features with the target (divided by 1000) attached as
# an extra 'target' column; dfi itself is left untouched.
# NOTE(review): `target` is the whole frame read from CSV, so this assumes it
# holds exactly one column -- confirm.
dfit = dfi.assign(target=target / 1000)
In [ ]:
# One panel per float-typed feature (y axis), each plotted against the scaled
# target (x axis).
sns.pairplot(
    data=dfit,
    x_vars=['target'],
    y_vars=dfinum.columns,
    size=5,
)
The next step is to facet our plots using categorical data. Let's look for the categorical features that have only a few categories.
In [ ]:
# Every column that is not float-typed is handled as categorical from here on.
is_categorical = ~isnum
dficat = dfi.loc[:, is_categorical]
dficat.shape
In [ ]:
# Map each categorical column name to the number of distinct values it holds
# (as reported by Series.unique()).
categories = {
    name: len(dficat[name].unique())
    for name in dficat.columns
}
In [ ]:
# Display the number of distinct values found for each categorical column.
categories
Let's try faceting with every categorical feature that has at most six categories, such as 'ExterQual'.
In [ ]:
# Imported here as well so that this cell does not depend on an earlier,
# unrelated cell having been run first.
from itertools import chain

# Two scatter plots per row. Round up so that the last feature is not dropped
# when the number of numeric features is odd, and never ask for zero rows.
n_rows = max(1, (len(dfinum.columns) + 1) // 2)

for category, n_levels in categories.items():
    # Colouring by a feature with many levels gives an unreadable legend.
    if n_levels > 6:
        continue
    # squeeze=False keeps `axes` two-dimensional even for a single row, which
    # chain.from_iterable needs in order to flatten it.
    fig, axes = plt.subplots(n_rows, 2, figsize=(10, 10), squeeze=False)
    # The FacetGrid is used only for its per-hue-level dispatch: g.map() is
    # pointed at the scatter method of our own axes, and the grid's own figure
    # is closed below without ever being shown.
    g = sns.FacetGrid(dfit, hue=category, size=5)
    for feature, ax in zip(dfinum.columns, chain.from_iterable(axes)):
        g.map(ax.scatter, feature, 'target', alpha=0.7)
        ax.set(xlabel=feature, ylabel='target')
        ax.legend(loc='upper left')
    plt.close(g.fig)
    fig.suptitle(category)
So exterior quality, heating QC, and MSZoning seem to be the most helpful categories. Let's see if we can confirm that using more advanced visualisations.
In [ ]:
# The plotting helpers live in `pandas.plotting` in current pandas; the old
# `pandas.tools.plotting` location no longer exists there. Fall back to the
# old path only when running against a pandas release that predates the move.
try:
    import pandas.plotting as pdp
except ImportError:
    import pandas.tools.plotting as pdp
In [ ]:
# RadViz plot of dfi with 'HeatingQC' as the class column.
pdp.radviz(dfi, 'HeatingQC') # not very helpful
In [ ]:
# Andrews curves of dfi with 'HeatingQC' as the class column.
pdp.andrews_curves(dfi, 'HeatingQC') # not very helpful
In [ ]:
# RadViz plot of dfi with 'ExterQual' as the class column.
pdp.radviz(dfi, 'ExterQual') # not very helpful
In [ ]:
from sklearn.decomposition import PCA

# Fit a 5-component PCA on the selected features and project the same data.
pca = PCA(n_components=5)
# Transposed so that each row holds the scores of one component.
component_scores = pca.fit(dfi).transform(dfi).T
x1, x2, x3, x4, x5 = component_scores

# NOTE(review): `target` is the whole frame read from CSV; using it as a single
# column here assumes it has exactly one column -- confirm.
dfpca = pd.DataFrame(
    data={
        'x1': x1,
        'x2': x2,
        'x3': x3,
        'x4': x4,
        'x5': x5,
        'target': target,
    }
)
In [ ]:
# Component loadings as a table: one row per PCA direction ('x1'..'x5'), one
# column per input feature.
xlabels = ['x{}'.format(k) for k in range(1, 6)]
components = pd.DataFrame(pca.components_, index=xlabels, columns=dfi.columns)
components
In [ ]:
# Side-by-side bar charts of the feature loadings of the first two PCA
# directions, sharing the y axis so the bar heights are directly comparable.
fig, axes = plt.subplots(1, 2, figsize=(12, 6), sharey=True)
for panel, direction in zip(axes, ('x1', 'x2')):
    components.loc[direction].plot(kind='bar', ax=panel)
    panel.set_xticklabels(components.columns, rotation=30)
In [ ]:
# Stem plot of the variance explained by each PCA direction.
plt.stem(pca.explained_variance_, basefmt='')

# One tick per component, so the tick positions and the tick labels always
# have the same length (a mismatch mislabels the axis or raises, depending on
# the matplotlib version).
n_directions = len(pca.explained_variance_)
stem_axes = plt.gca()
stem_axes.set_xticks(list(range(n_directions)))
stem_axes.set_xticklabels(['x' + str(i + 1) for i in range(n_directions)])
plt.xlabel('PCA directions')
plt.ylabel('PCA explained variance')