In [ ]:
import itertools
import os
import sys

import pandas as pd
import numpy as np
import scipy as sp
import scipy.stats  # scipy does not import its submodules automatically, so make sp.stats available
import sklearn as sk
import sklearn.preprocessing

import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smapi

In [ ]:
sys.path.insert(1, os.path.join(sys.path[0], '..'))  # add parent directory to path
import samlib

Sale price distribution

The first step is to look at the target sale price for the training dataset, i.e. the column we're trying to predict.


In [ ]:
target = pd.read_csv('../data/train_target.csv')

In [ ]:
target.describe()

The sale price is in the hundreds of thousands, so let's divide the price by 1000 to get more manageable numbers.


In [ ]:
target = target / 1000

In [ ]:
sns.distplot(target);
plt.title('SalePrice')

In [ ]:
sp.stats.skew(target)

In [ ]:
sp.stats.skewtest(target)

The distribution is skewed, as shown by the large z-score and small p-value of the skew test. It is right-skewed (the skew is positive). Skewed distributions are not ideal for linear models, which typically assume normally distributed errors. One way to correct for right-skewness is to take the log [1,2,3]

We apply the function $x \rightarrow \log(1 + x)$ because it is defined and non-negative for all $x \geq 0$.


In [ ]:
logtarget = np.log1p(target)
print('skewness of logtarget = ', sp.stats.skew(logtarget)[0])
print('skewness test of logtarget = ', sp.stats.skewtest(logtarget))
sns.distplot(logtarget)
plt.title(r'log(1 + SalePrice)')
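
For later use, note that this transform is undone by its inverse, $x \rightarrow e^x - 1$ (np.expm1); a quick sanity check, assuming predictions made on the log scale will eventually be mapped back to the original price scale:


In [ ]:
# np.expm1 inverts np.log1p, so quantities modelled on the log scale
# can be mapped back to thousands of dollars
roundtrip = np.expm1(logtarget)
assert np.allclose(roundtrip, target)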

Merge the training and test datasets for data preparation

We're going to explore the training dataset and apply some transformations to it (fixing missing values, transforming columns, etc.). We'll need to apply the same transformations to the test dataset. To make that easy, let's put the training and test datasets into one dataframe.


In [ ]:
raw_train = pd.read_csv('../data/train_prepared_light.csv')
raw_test = pd.read_csv('../data/test_prepared_light.csv')
df = pd.concat([raw_train, raw_test], keys=['train', 'test'])
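
Because we concatenate with keys, the two datasets can be recovered at any time from the outer index level; a small sketch (the names train_part and test_part are just illustrative):


In [ ]:
# the outer index level ('train' / 'test') lets us split the frame back apart
train_part = df.loc['train']
test_part = df.loc['test']
train_part.shape, test_part.shape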

In [ ]:
df.shape

In [ ]:
ncategories = sum(df.dtypes == object)
ncategories

In [ ]:
df.head()

In [ ]:
df.tail()

Features

The dataset is wide with 78 features.


In [ ]:
df.columns, len(df.columns)

We've got 3 data types: int, float and object


In [ ]:
df.dtypes.unique()

Split the data between categorical and numerical features


In [ ]:
is_categorical = (df.dtypes == object)
is_numerical = ~is_categorical

Numerical features

Create a numerical dataset to keep track of the features


In [ ]:
dfnum = df.loc[:, is_numerical].copy()

In [ ]:
dfnum.columns, len(dfnum.columns)

We've got 36 numerical features. We can use the describe method to get some statistics:


In [ ]:
dfnum.describe()

But that's a lot of numbers to digest. Better get started plotting! To help with plotting, but also to improve linear regression models, we're going to standardize our data (see http://sebastianraschka.com/Articles/2014_about_feature_scaling.html). But before that we must deal with the NaN values.
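
Before that, a quick look at why scaling matters here: the raw numerical features have spreads that differ by orders of magnitude (just a sanity-check sketch):


In [ ]:
# the raw features differ in spread by orders of magnitude, hence the standardization
dfnum.std().sort_values(ascending=False)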

Deal with NaN values


In [ ]:
cols_with_nulls = dfnum.columns[dfnum.isnull().sum() > 0]
cols_with_nulls

In [ ]:
dfnum.shape

In [ ]:
dfnum[cols_with_nulls].isnull().sum().sort_values(ascending=False)
#.plot(kind='bar')

Based on the data description, the null values for MasVnrArea should be 0 (no masonry veneer).


In [ ]:
# We may want to refine this in the future. Perhaps build a model to predict the missing GarageCars from the other features?
median_list = 'LotFrontage', 'BsmtFullBath','BsmtHalfBath', 'GarageCars', 'GarageArea'
zero_list = 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'TotalBsmtSF', 'BsmtUnfSF'

In [ ]:
for feature in median_list:
    dfnum[feature].fillna(dfnum[feature].median(), inplace=True)

In [ ]:
for feature in zero_list:
    dfnum[feature].fillna(0, inplace=True)

For GarageYrBlt, replace missing values with the year the house was built.


In [ ]:
dfnum.GarageYrBlt.fillna(dfnum.YearBuilt[dfnum.GarageYrBlt.isnull()], inplace=True)

In [ ]:
# Check that we got rid of the nulls
dfnum.isnull().sum().any()
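
The comment above floats the idea of modelling the missing values instead of filling them with medians. One possible direction is a nearest-neighbour imputation; the sketch below uses sklearn's KNNImputer with n_neighbors=5, both of which are illustrative assumptions rather than what this notebook actually does.


In [ ]:
from sklearn.impute import KNNImputer

# Illustrative alternative to the median fill: impute each missing value
# from the 5 most similar rows (not used in the rest of this notebook).
numeric_raw = df.loc[:, is_numerical]
knn_filled = pd.DataFrame(KNNImputer(n_neighbors=5).fit_transform(numeric_raw),
                          columns=numeric_raw.columns, index=numeric_raw.index)
knn_filled.isnull().sum().any()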

In [ ]:
# Assign to the slice (see the copy / write problem in Pandas)
df.loc[:, is_numerical] = dfnum

Standardize the data


In [ ]:
def standardize(series):
    """Scale a single column to zero mean and unit variance."""
    # StandardScaler expects a 2-D array, so reshape the column and flatten the result
    scaled = sk.preprocessing.StandardScaler().fit_transform(series.values.reshape(-1, 1))
    return scaled.ravel()

dfnum_t = dfnum.apply(standardize)
dfnum_t.head()
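
As a quick sanity check (nothing more), the standardized columns should now have mean roughly 0 and standard deviation roughly 1:


In [ ]:
# means should be ~0 and standard deviations ~1 after standardization
dfnum_t.mean().abs().max(), dfnum_t.std().max()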

Plot violinplots for each feature

The violin plots give us some idea of the distribution of data for each feature. We can look for things like skewness, non-normality, and the presence of outliers.


In [ ]:
def violinplot(df, ax=None):
    if ax is None:
        ax = plt.gca()
    sns.violinplot(data=df, ax=ax)  # wide-form data: one violin per column
    for xlab in ax.get_xticklabels():
        xlab.set_rotation(30)

In [ ]:
def featureplot(df, nrows=1, figsize=(12,8), plotfunc=violinplot):
    """Plot the dataframe features, splitting the columns over `nrows` rows of axes."""
    width, height = figsize
    fig, axes = plt.subplots(nrows, 1, figsize=(width, height * nrows))
    if nrows == 1:
        axes = [axes]  # make the single Axes object iterable
    # chop the columns into `nrows` consecutive chunks and plot one chunk per axis
    plots_per_figure = df.shape[1] // nrows
    i = 0
    for j, ax in zip(range(plots_per_figure, df.shape[1] + 1, plots_per_figure), axes):
        plotfunc(df.iloc[:, i:j], ax=ax)
        i = j

In [ ]:
dfnum_t.head()

In [ ]:
train = dfnum.loc['train',:]
train_t = dfnum_t.loc['train',:]

Many of the features are highly skewed, with very long tails.


In [ ]:
featureplot(train_t.iloc[:, 0:9])

Most of these are right skewed as well. BsmtFullBath has some discrete values (number of bathrooms).


In [ ]:
featureplot(train_t.iloc[:, 9:18])

Some features, such as BsmtFinSF2, are almost constant (a blob with a long tail), as can be seen below.


In [ ]:
fig, ax = plt.subplots(1,1, figsize=(4, 4))
sns.distplot(train_t['BsmtFinSF2'], ax=ax)
ax.set_title('Distribution of BsmtFinSF2')

Drop nearly constant features


In [ ]:
def test_nearly_constant(series):
    """Return True if fewer than 25% of the values differ from the most frequent value."""
    counts = series.value_counts()
    max_val_count = counts.max()
    # total count of every value other than the most frequent one
    other_val_count = counts.drop(counts.idxmax()).sum()
    return other_val_count / max_val_count < 0.25

is_nearly_constant = train_t.apply(test_nearly_constant)
is_nearly_constant.value_counts()

In [ ]:
dropme = train_t.columns[is_nearly_constant]
dropme

In [ ]:
df = df.drop(dropme, axis=1)
train = train.drop(dropme, axis=1)
train_t = train_t.drop(dropme, axis=1)

Log transform the other features if they have a high skewness

Using a log transformation for some of the skewed features should help, as illustrated below. We use the raw data (not the standardized one) because we need positive values for the log function (we'll standardize the transformed variables later).


In [ ]:
fig, axes = plt.subplots(1,2, figsize=(8, 4))
sns.distplot(train['LotArea'], ax=axes[0])
sns.distplot(np.log1p(train['LotArea']), ax=axes[1])

In [ ]:
zfactors = sp.stats.skewtest(train)[0]
sns.distplot(zfactors)

In [ ]:
is_skewed = np.abs(zfactors) > 10
pd.Series(data=zfactors, index=train.columns)[is_skewed].sort_values().plot(kind='barh')
plt.title('Z-factor for skewtest')

Check the sign of the skewness for all these


In [ ]:
assert all(np.sign(sp.stats.skew(train)[is_skewed]) > 0)

Let's apply a log1p transform to all these and plot the distributions again


In [ ]:
def transform_skewed_columns(dfnum, is_skewed=is_skewed):
    """
    dfnum: dataframe to transform
    is_skewed: boolean iterable, one entry per column of dfnum, indicating whether that column is skewed
    """
    dfnum2 = dfnum.copy()
    for feature, skewed_feature in zip(dfnum.columns, is_skewed):
        if skewed_feature:
            dfnum2[feature] = np.log1p(dfnum[feature])

    dfnum_t2 = dfnum2.apply(standardize)
    return dfnum_t2

# df already has the nearly constant columns dropped, so this selects just the remaining numerical features
dfnum_t2 = transform_skewed_columns(df.loc[:, is_numerical])

In [ ]:
dfnum_t2.iloc[:, is_skewed].columns

In [ ]:
zfactors2 = sp.stats.skewtest(dfnum_t2)[0]
pd.Series(data=zfactors2, index=dfnum_t2.columns)[is_skewed].sort_values().plot(kind='barh')

Now our originally skewed features look more symmetric.


In [ ]:
featureplot(dfnum_t2.iloc[:, is_skewed], nrows=2, figsize=(10,5))

In [ ]:
featureplot(dfnum_t2.iloc[:, ~is_skewed], nrows=2, figsize=(10, 5))

Save transformed numerical data

Save the transformed data to CSV (and use the %store magic below for smaller objects) so that other notebooks can pick it up.


In [ ]:
dfnum_t2.index.names = ['Dataset', 'Id']
dfnum_t2.head()

In [ ]:
dfnum_t2.to_csv('transformed_dataset_dfnum_t2.csv', index=True)
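
For reference, a downstream notebook could read this file back with something like the following (a sketch; index_col matches the Dataset/Id index saved above):


In [ ]:
# reload the transformed features with the same two-level (Dataset, Id) index
reloaded = pd.read_csv('transformed_dataset_dfnum_t2.csv', index_col=['Dataset', 'Id'])
reloaded.head()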

In [ ]:
nfeatures = dfnum_t2.columns
target_t = logtarget.apply(standardize)
target_t.head()

Feature selection

We're now in a good position to identify the key numerical features. Those should be highly correlated with the sale price.


In [ ]:
dfnum_t2.head()

In [ ]:
# Pearson correlation (and p-value) of each feature with the transformed sale price
corr = pd.DataFrame(
    [tuple(sp.stats.pearsonr(dfnum_t2.loc['train', feature], target_t['SalePrice']))
     for feature in dfnum_t2.columns],
    index=dfnum_t2.columns, columns=['correlation', 'pvalue'])

In [ ]:
corr.head()

In [ ]:
corr.sort_values('pvalue', ascending=False)['correlation'].plot(kind='barh')

In [ ]:
corr.sort_values('pvalue').head()

In [ ]:
corr.sort_values('pvalue').tail()

Let's keep only the features whose correlation with the price is high enough (absolute correlation greater than 0.2).


In [ ]:
min_correlation = 0.2
key_features = corr[np.abs(corr['correlation']) > min_correlation].sort_values(by='correlation', ascending=False).index.values
key_features, key_features.size

In [ ]:
%store key_features
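
A downstream notebook can then restore the stored variable with the -r flag; a usage sketch:


In [ ]:
# restore key_features from the IPython store in another notebook
%store -r key_features
key_features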

In [ ]: