In [ ]:

    
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns



In [ ]:

    
# Load the prepared training features from the project's data folder.
TRAIN_FEATURES_CSV = 'data/train_prepared_light.csv'
df = pd.read_csv(TRAIN_FEATURES_CSV)



In [ ]:

    
# Preview the first rows of the feature table.
df.head()



In [ ]:

    
# (rows, columns) of the feature table.
df.shape



In [ ]:

    
# Count the integer-typed columns.
# NOTE(review): the name suggests the integer columns hold encoded categories — confirm.
# `select_dtypes` matches every integer width, whereas comparing a dtype with the
# builtin `int` depends on the platform's default integer size.
ncategories = df.select_dtypes(include=[np.integer]).shape[1]
ncategories



In [ ]:

    
# Load the training target from the project's data folder.
TRAIN_TARGET_CSV = 'data/train_target.csv'
target = pd.read_csv(TRAIN_TARGET_CSV)



In [ ]:

    
# (rows, columns) of the target table.
target.shape

Exploration

The dataset is massive, so let's pick a few columns to get started. We are looking for features that explain the sale prices, so these features should be correlated. One way to find them is to plot the correlation matrix and look for correlated variables.



In [ ]:

    
# List every column name of the feature table.
df.columns

Using correlation to identify useful features



In [ ]:

    
def plot_corr(df, ax=None, log=False):
    """Draw the pairwise correlation matrix of the dataframe columns as an image.

    The diagonal of the (optionally log-scaled) correlation matrix is
    subtracted before plotting, so the self-correlations are shown as zero.

    Input:
        df: pandas DataFrame; its columns are correlated with ``df.corr()``
        ax: matplotlib Axes to draw on; the current axes are used when None
        log: when True, plot ``log(|corr|)`` instead of the raw coefficients

    Returns:
        DataFrame holding the plotted matrix (correlations minus their diagonal).
    """
    corr = df.corr()
    if log:
        corr = np.log(np.abs(corr))
    if ax is None:
        ax = plt.gca()
    mat = corr - np.diag(np.diagonal(corr))
    ax.matshow(mat)
    # Label through `ax` itself so that a caller-supplied axes receives the ticks
    # even when it is not pyplot's current axes.
    positions = list(range(len(corr.columns)))
    ax.set_xticks(positions)
    ax.set_xticklabels(corr.columns, rotation=90)
    ax.set_yticks(positions)
    ax.set_yticklabels(corr.columns, rotation=0)
    ax.axis('tight')
    return mat



In [ ]:

    
# Draw the correlation matrix of all features on a large, freshly cleared figure 0.
corr_fig = plt.figure(num=0, figsize=(15, 15))
corr_fig.clf()
corr = plot_corr(df);



In [ ]:

    
# Summary statistics of each column of the matrix returned by plot_corr.
corr.describe()



In [ ]:

    
# Largest entry of every column of the diagonal-free correlation matrix,
# one marker per column.
max_fig, max_ax = plt.subplots(figsize=(20, 10))
column_max = corr.max()
max_ax.plot(column_max.values, 'o')
max_ax.set_xticks(range(corr.shape[1]))
max_ax.set_xticklabels(corr.columns, rotation=90);



In [ ]:

    
# Keep the columns whose largest correlation with another column exceeds the cut-off.
HIGH_CORR_THRESHOLD = 0.63
is_highly_correlated = corr.max() > HIGH_CORR_THRESHOLD
hcorr_columns = corr.columns[is_highly_correlated.values]
sorted(hcorr_columns)



In [ ]:

    
# Restrict the feature table to the highly correlated columns and preview it.
dfc = df.loc[:, hcorr_columns]
dfc.head()



In [ ]:

    
# Correlation matrix restricted to the highly correlated columns.
plot_corr(dfc);

Pick important features



In [ ]:

    
# Hand-picked subset of features used for the rest of the exploration.
important_columns = [
    'Neighborhood',
    'YearBuilt',
    'OverallQual',
    'OverallCond',
    'ExterQual',
    'ExterCond',
    'MSSubClass',
    'MSZoning',
    'HeatingQC',
    'TotRmsAbvGrd',
    'GarageArea',
    'GarageCond',
    'Fence',
    'GrLivArea',
]
dfi = df.loc[:, important_columns]

Look at some numerical columns



In [ ]:

    
# Float-typed columns are treated as the numerical features.
isnum = (dfi.dtypes == float)
numerical_names = dfi.columns[isnum.values]
dfinum = dfi[numerical_names]
dfinum.shape



In [ ]:

    
# `chain` is imported in this cell; the faceting loop further down uses it too.
from itertools import chain

# One box plot per numerical column. The grid is sized from the number of
# columns so that none is silently dropped and no panel is left blank.
box_ncols = 2
box_nrows = max(1, (len(dfinum.columns) + box_ncols - 1) // box_ncols)
fig, axes = plt.subplots(box_nrows, box_ncols, figsize=(10, 10), squeeze=False)
box_panels = list(chain.from_iterable(axes))
for ax, col in zip(box_panels, dfinum.columns):
    # Pass the values by keyword so the call does not depend on the order of
    # seaborn's positional arguments.
    sns.boxplot(x=dfinum[col], ax=ax)
# Hide any panel that did not receive a column.
for ax in box_panels[len(dfinum.columns):]:
    ax.set_visible(False)
plt.tight_layout();



In [ ]:

    
# Pairwise scatter plots of the numerical (float-typed) features.
sns.pairplot(dfinum)

Look at the target



In [ ]:

    
# Distribution of the target, divided by 1000.
# NOTE(review): `distplot` is deprecated in newer seaborn releases — confirm the installed version still provides it.
sns.distplot(target / 1000);

Compare our numerical columns with the target



In [ ]:

    
# Copy of the selected features with the target (divided by 1000) appended as a column.
dfit = dfi.assign(target=target / 1000)



In [ ]:

    
# Scatter each numerical feature (one row per feature) against the rescaled target.
# NOTE(review): newer seaborn releases call the `size` argument `height` — confirm against the installed version.
sns.pairplot(dfit, y_vars=dfinum.columns, x_vars=['target'], size=5)

Conditional plots

The next step is to facet our plots using categorical data. Let's look for the categorical features that have only a few categories.



In [ ]:

    
# Every selected column that is not float-typed is treated as categorical.
categorical_names = dfi.columns[~isnum.values]
dficat = dfi[categorical_names]
dficat.shape



In [ ]:

    
# Map every categorical column to its number of distinct values.
categories = {
    name: len(dficat[name].unique())
    for name in dficat.columns
}



In [ ]:

    
# Show the number of distinct values per categorical column.
categories

Let's try faceting with every categorical feature that has at most six categories, such as 'ExterQual'.



In [ ]:

    
# One figure per low-cardinality categorical feature: every numerical feature
# is scattered against the target, coloured by the levels of that category.
MAX_FACET_LEVELS = 6
n_facet_features = len(dfinum.columns)
# Round up so that an odd number of features keeps its last panel.
n_facet_rows = max(1, (n_facet_features + 1) // 2)
for category, n_levels in categories.items():
    if n_levels > MAX_FACET_LEVELS:
        continue
    # squeeze=False keeps `axes` two-dimensional even when there is a single row.
    fig, axes = plt.subplots(n_facet_rows, 2, figsize=(10, 10), squeeze=False)
    facet_panels = axes.ravel()
    # The FacetGrid only supplies the grouping by `category`; the points are
    # drawn on our own axes and the grid's own figure is closed afterwards.
    g = sns.FacetGrid(dfit, hue=category, size=5)
    for feature, ax in zip(dfinum.columns, facet_panels):
        g.map(ax.scatter, feature, 'target', alpha=0.7)
        ax.set(xlabel=feature, ylabel='target')
        ax.legend(loc='upper left')
    # Hide any panel that did not receive a feature.
    for ax in facet_panels[n_facet_features:]:
        ax.set_visible(False)
    plt.close(g.fig)
    fig.suptitle(category)

So exterior quality, heating QC, and MSZoning seem to be the most helpful categories. Let's see if we can confirm that using more advanced visualisations.

Not very helpful advanced visualizations



In [ ]:

    
# The plotting helpers (radviz, andrews_curves, ...) live in `pandas.plotting`;
# fall back to the legacy `pandas.tools.plotting` location on old pandas releases.
try:
    import pandas.plotting as pdp
except ImportError:
    import pandas.tools.plotting as pdp



In [ ]:

    
# RadViz plot of the selected features, with 'HeatingQC' as the class column.
pdp.radviz(dfi, 'HeatingQC')  # not very helpful



In [ ]:

    
# Andrews curves of the selected features, with 'HeatingQC' as the class column.
pdp.andrews_curves(dfi, 'HeatingQC')  # not very helpful



In [ ]:

    
# RadViz plot of the selected features, with 'ExterQual' as the class column.
pdp.radviz(dfi, 'ExterQual')  # not very helpful

Reduce dimensionality using PCA



In [ ]:

    
from sklearn.decomposition import PCA

# Fit a five-component PCA on the selected features and project them onto it.
pca = PCA(n_components=5)
projected = pca.fit(dfi).transform(dfi)
# One array per principal direction (the columns of the projection).
x1, x2, x3, x4, x5 = projected.T
# Projected coordinates together with the target.
# NOTE(review): `target` comes straight from read_csv — confirm it is accepted as a single column here.
dfpca = pd.DataFrame(data=dict(x1=x1, x2=x2, x3=x3, x4=x4, x5=x5, target=target))



In [ ]:

    
# Loadings of every original feature on each of the five principal directions.
xlabels = ['x{}'.format(k) for k in range(1, 6)]
components = pd.DataFrame(pca.components_, index=xlabels, columns=dfi.columns)
components



In [ ]:

    
# Bar charts of the feature loadings on the first two principal directions,
# drawn side by side on a shared y-axis.
fig, axes = plt.subplots(1, 2, figsize=(12, 6), sharey=True)
for ax, direction in zip(axes, ['x1', 'x2']):
    components.loc[direction].plot(kind='bar', ax=ax)
    ax.set_xticklabels(components.columns, rotation=30)



In [ ]:

    
# Stem plot of the variance explained by each fitted principal direction.
n_directions = len(pca.explained_variance_)
plt.stem(pca.explained_variance_, basefmt='')
stem_ax = plt.gca()
# One tick per direction: the number of tick positions has to match the number
# of labels, otherwise the labels end up on the wrong stems.
stem_ax.set_xticks(range(n_directions))
stem_ax.set_xticklabels(['x' + str(i + 1) for i in range(n_directions)])
plt.xlabel('PCA directions')
plt.ylabel('PCA explained variance');