In [ ]:
from itertools import chain

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA

In [ ]:
# Load the feature table; the path is relative to the notebook's working directory.
df = pd.read_csv('data/train_prepared_light.csv')

In [ ]:
# Preview the first rows of the loaded feature table.
df.head()

In [ ]:
# (number of rows, number of columns) of the feature table.
df.shape

In [ ]:
# Count the columns whose dtype equals the platform's default integer dtype.
# NOTE(review): presumably integer columns hold category codes — confirm against
# how the prepared CSV was generated.
is_int_column = df.dtypes == int
ncategories = is_int_column.sum()
ncategories

In [ ]:
# Load the target values from their own CSV (read_csv returns a DataFrame, not a Series).
target = pd.read_csv('data/train_target.csv')

In [ ]:
# (rows, columns) of the target table — the row count should match df.shape[0].
target.shape

Exploration

The dataset is massive, so let's pick a few columns to get started. We are looking for features that explain the sale prices, so these features should be correlated. One way to find them is to plot the correlation matrix and look for correlated variables.


In [ ]:
# List every column name available in the feature table.
df.columns

Using correlation to identify useful features


In [ ]:
def plot_corr(df, ax=None, log=False):
    """Draw the pairwise correlation matrix of ``df`` as an image.

    The diagonal of the matrix is subtracted out before plotting so the
    self-correlations do not dominate the colour scale.

    Input:
        df: pandas DataFrame; correlations are computed with ``df.corr()``
        ax: matplotlib Axes to draw on; defaults to pyplot's current axes
        log: if True, plot ``log(|corr|)`` instead of the raw coefficients

    Returns:
        DataFrame holding the plotted matrix (correlations with the
        diagonal subtracted).
    """
    corr = df.corr()
    if log:
        corr = np.log(np.abs(corr))
    if ax is None:
        ax = plt.gca()
    # Remove the diagonal once and reuse the result for the plot and the return value.
    mat = corr - np.diag(np.diagonal(corr))
    ax.matshow(mat)
    # Configure ticks through the Axes object so the labels land on ``ax`` even
    # when it is not pyplot's current axes (e.g. one panel of a subplot grid).
    positions = list(range(len(corr.columns)))
    ax.set_xticks(positions)
    ax.set_xticklabels(corr.columns, rotation=90)
    ax.set_yticks(positions)
    ax.set_yticklabels(corr.columns, rotation=0)
    ax.axis('tight')
    return mat

In [ ]:
# Reuse figure 0 with a large canvas, wipe it, and draw the full correlation matrix.
fig = plt.figure(num=0, figsize=(15, 15))
fig.clf()
corr = plot_corr(df, ax=fig.gca());

In [ ]:
# Per-column summary statistics of the (diagonal-removed) correlation matrix.
corr.describe()

In [ ]:
# For every column, plot its largest correlation with any other column.
fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(corr.max().values, 'o')
ax.set_xticks(range(corr.shape[1]))
ax.set_xticklabels(corr.columns, rotation=90);

In [ ]:
# Keep the columns whose strongest positive correlation with another column exceeds 0.63.
max_corr = corr.max()
hcorr_columns = corr.columns[(max_corr > 0.63).values]
sorted(hcorr_columns)

In [ ]:
# Restrict the feature table to the highly correlated columns only.
dfc = df.loc[:, hcorr_columns]
dfc.head()

In [ ]:
# Correlation matrix restricted to the highly correlated columns.
plot_corr(dfc);

Pick important features


In [ ]:
# Hand-picked subset of columns to explore in the rest of the notebook.
important_columns = [
    'Neighborhood',
    'YearBuilt',
    'OverallQual',
    'OverallCond',
    'ExterQual',
    'ExterCond',
    'MSSubClass',
    'MSZoning',
    'HeatingQC',
    'TotRmsAbvGrd',
    'GarageArea',
    'GarageCond',
    'Fence',
    'GrLivArea',
]
dfi = df.loc[:, important_columns]

Look at some numerical columns


In [ ]:
# Boolean mask over dfi's columns: True where the column dtype is float.
isnum = dfi.dtypes.eq(float)
dfinum = dfi[dfi.columns[isnum.values]]
dfinum.shape

In [ ]:
# One box plot per numerical column, laid out on a two-column grid.
n_box_cols = 2
# Ceiling division so every column gets a panel, whatever their number.
n_box_rows = max(1, -(-dfinum.shape[1] // n_box_cols))
fig, axes = plt.subplots(n_box_rows, n_box_cols, figsize=(10, 10), squeeze=False)
panels = axes.ravel()
for ax, col in zip(panels, dfinum.columns):
    # Pass the series as ``x`` explicitly: the meaning of the first positional
    # argument of sns.boxplot differs between seaborn versions.
    sns.boxplot(x=dfinum[col], ax=ax)
# Hide panels left over when the column count does not fill the grid.
for ax in panels[dfinum.shape[1]:]:
    ax.set_visible(False)
fig.tight_layout();

In [ ]:
# Pairwise scatter plots (with per-column distributions) of the float columns.
sns.pairplot(dfinum)

Look at the target


In [ ]:
# Distribution of the target, scaled down by 1000 for readable axis values.
# sns.distplot is deprecated/removed in current seaborn; histplot with a KDE
# overlay and density normalisation is its documented replacement (seaborn >= 0.11).
sns.histplot(target / 1000, kde=True, stat='density');

Compare our numerical columns with the target


In [ ]:
# Copy of the selected features with the target (divided by 1000) attached as a column.
# NOTE(review): `target` is a DataFrame, so this assumes it has exactly one column — confirm.
dfit = dfi.assign(target=target / 1000)

In [ ]:
# Scatter each numerical feature (rows) against the target (single column).
# seaborn renamed the ``size`` argument to ``height`` in 0.9; the old name is no
# longer accepted by current releases.
sns.pairplot(dfit, y_vars=dfinum.columns, x_vars=['target'], height=5)

Conditional plots

The next step is to facet our plots using categorical data. Let's look for the categorical features that have only a few categories.


In [ ]:
# Everything that is not a float column is treated as categorical here.
iscat = ~isnum
dficat = dfi.loc[:, iscat]
dficat.shape

In [ ]:
# Map each non-float column to its number of distinct values.
categories = {col: len(dficat[col].unique()) for col in dficat.columns}

In [ ]:
# Show the distinct-value count per non-float column.
categories

Let's try faceting by each categorical feature that has at most six categories, such as 'ExterQual'.


In [ ]:
# For every low-cardinality categorical feature, draw one figure in which each
# numerical feature is scattered against the target, coloured by category level.
max_levels = 6   # categories with more distinct values than this are skipped
n_facet_cols = 2
for category, n_levels in categories.items():
    if n_levels > max_levels:
        continue
    # Ceiling division: an odd number of features still gets one panel each,
    # and a single feature no longer produces a zero-row grid.
    n_facet_rows = max(1, -(-len(dfinum.columns) // n_facet_cols))
    fig, axes = plt.subplots(n_facet_rows, n_facet_cols, figsize=(10, 10), squeeze=False)
    panels = axes.ravel()
    for feature, ax in zip(dfinum.columns, panels):
        # One scatter call per level gives each level its own colour and legend
        # entry, without building (and discarding) a seaborn FacetGrid figure.
        for level, group in dfit.groupby(category):
            ax.scatter(group[feature], group['target'], alpha=0.7, label=level)
        ax.set(xlabel=feature, ylabel='target')
        ax.legend(loc='upper left')
    # Hide panels left over when the feature count does not fill the grid.
    for ax in panels[len(dfinum.columns):]:
        ax.set_visible(False)
    fig.suptitle(category)

So exterior quality, heating QC, and MSZoning seem to be the most helpful categories. Let's see if we can confirm that using more advanced visualisations.

Not very helpful advanced visualizations


In [ ]:
# The plotting helpers moved from pandas.tools.plotting to pandas.plotting in
# pandas 0.20; the old module path no longer exists in current releases.
import pandas.plotting as pdp

In [ ]:
# RadViz projection of the selected features, with 'HeatingQC' as the class column.
pdp.radviz(dfi, 'HeatingQC')  # not very helpful

In [ ]:
# Andrews curves of the selected features, with 'HeatingQC' as the class column.
pdp.andrews_curves(dfi, 'HeatingQC')  # not very helpful

In [ ]:
# Same RadViz projection, this time with 'ExterQual' as the class column.
pdp.radviz(dfi, 'ExterQual')  # not very helpful

Reduce dimensionality using PCA


In [ ]:
# Project the selected features onto their first five principal components.
# NOTE(review): dfi is passed to PCA as-is; if its columns are on different
# scales the leading directions will follow the largest-variance columns —
# confirm whether standardising first is wanted.
pca = PCA(n_components=5)
projected = pca.fit_transform(dfi)  # fit and project in a single pass
x1, x2, x3, x4, x5 = projected.T
# NOTE(review): `target` is a DataFrame here, not a Series — verify it has a
# single column so it can be used as one column of dfpca.
dfpca = pd.DataFrame(data={'x1': x1, 'x2': x2, 'x3': x3, 'x4': x4, 'x5': x5, 'target':target})

In [ ]:
# Loadings table: one row per principal direction, one column per input feature.
# Row labels are derived from the fitted PCA so they always match its
# component count instead of a hard-coded 5.
xlabels = ['x' + str(i + 1) for i in range(pca.components_.shape[0])]
components = pd.DataFrame(data=pca.components_, columns=dfi.columns, index=xlabels)
components

In [ ]:
# Bar charts of the feature loadings for the first two principal directions,
# side by side on a shared y axis.
fig, axes = plt.subplots(1, 2, figsize=(12, 6), sharey=True)
for ax, direction in zip(axes, ('x1', 'x2')):
    components.loc[direction].plot(kind='bar', ax=ax)
    ax.set_xticklabels(components.columns, rotation=30)

In [ ]:
# Stem plot of the variance explained by each principal direction.
n_directions = len(pca.explained_variance_)
fig, ax = plt.subplots()
ax.stem(pca.explained_variance_, basefmt='')
# One tick per direction: tick positions and labels must have the same length,
# otherwise matplotlib raises (or, on old versions, labels only the first ticks).
ax.set_xticks(range(n_directions))
ax.set_xticklabels(['x' + str(i + 1) for i in range(n_directions)])
ax.set_xlabel('PCA directions')
ax.set_ylabel('PCA explained variance');