In [ ]:
import itertools
import os
import sys

import pandas as pd
import numpy as np
import scipy as sp
import scipy.stats  # needed so sp.stats is available below
import sklearn as sk
import sklearn.preprocessing

import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smapi

In [ ]:
sys.path.insert(1, os.path.join(sys.path[0], '..'))  # add parent directory to path
import samlib

We use the cleaner method-chaining style for transforming the data (see https://tomaugspurger.github.io/method-chaining.html).
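
As a small illustration of that style (the dataframe and column names here are made up, not part of the dataset), transformations are chained with pipe and assign instead of creating intermediate variables:


In [ ]:
# Illustration only: method chaining on a toy dataframe.
demo = pd.DataFrame({'Price': [200000, 350000, 125000]})
(demo
 .pipe(lambda d: d / 1000)                          # rescale every column
 .assign(LogPrice=lambda d: np.log1p(d['Price'])))  # add a derived column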

Sale price distribution

The first step is to look at the target sale price for the training data set, i.e. the column we're trying to predict.


In [ ]:
target = pd.read_csv('../data/train_target.csv')

In [ ]:
target.describe()

The sale price is in the hundreds of thousands, so let's divide the price by 1000 to get more manageable numbers.


In [ ]:
target = target / 1000

In [ ]:
sns.distplot(target);
plt.title('SalePrice')

In [ ]:
import scipy as sp
sp.stats.skew(target)

In [ ]:
sp.stats.skewtest(target)

The distribution is skewed, as demonstrated by the large z-score (and small p-value) of the skew test. It is right-skewed (the skewness is positive). Skewed distributions are not ideal for linear models, which typically assume normally distributed errors. One way to correct for right-skewness is to take the log [1,2,3].

We apply the function $x \rightarrow \log(1 + x)$ rather than $\log(x)$ because it is well defined (and non-negative) for all $x \geq 0$, including zero.
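
For example, the transform maps 0 to 0 and compresses large values:


In [ ]:
np.log1p([0, 9, 99, 999])  # -> [0., ~2.30, ~4.61, ~6.91]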


In [ ]:
logtarget = np.log1p(target)
print('skewness of logtarget = ', sp.stats.skew(logtarget)[0])
print('skewness test of logtarget = ', sp.stats.skewtest(logtarget))
sns.distplot(logtarget)
plt.title(r'log(1 + SalePrice)')

Merge the training and test datasets for data preparation

We're going to explore the training dataset and apply some transformations to it (filling missing values, transforming columns, etc.). We'll need to apply the same transformations to the test dataset. To make that easy, let's put the training and test datasets into one dataframe.


In [ ]:
def read():
    """Read training and test data and return a dataframe with ['Dataset','Id'] multi-index
    """
    raw_train = pd.read_csv('../data/train_prepared_light.csv')
    raw_test = pd.read_csv('../data/test_prepared_light.csv')
    df = pd.concat([raw_train, raw_test], keys=['train', 'test'])
    df.index.names = 'Dataset', 'Id'
    return df
    
df = read()

In [ ]:
df.shape

In [ ]:
df.head()

In [ ]:
df.tail()

Initialize the pipeline with the raw data. We can always re-apply all the transformations registered in the pipeline and get the transformed data by calling pp().


In [ ]:
pp = samlib.Pipeline(df.copy())  
assert pp == df  # the pipeline output equals df
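
samlib.Pipeline is a small local helper whose implementation isn't shown in this notebook. Here is a minimal sketch (names and details are assumptions; the real class in samlib may differ) of a class that would support the append / call / comparison pattern used below:


In [ ]:
# Hypothetical sketch of samlib.Pipeline: it stores the raw data plus a list
# of transformation functions and re-applies them all when called.
class PipelineSketch:
    def __init__(self, data):
        self.data = data     # raw input dataframe
        self.steps = []      # transformation functions, applied in order

    def append(self, func):
        self.steps.append(func)

    def __call__(self):
        out = self.data
        for step in self.steps:
            out = step(out)
        return out

    def __eq__(self, other):
        # compare the pipeline output to a dataframe
        return self().equals(other)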

Select numerical features

The dataset is wide, with 78 features. Let's create a dataframe containing only the numerical features.


In [ ]:
df.columns, len(df.columns)

We've got 3 data types: int, float, and object.


In [ ]:
df.dtypes.value_counts()

Split the data between categorical and numerical features


In [ ]:
is_categorical = (df.dtypes == object)
is_numerical = ~is_categorical

In [ ]:
dfnum = df.loc[:, is_numerical].copy()

In [ ]:
dfnum.columns, len(dfnum.columns)

We've got 36 numerical features. We can use the describe method to get some statistics:


In [ ]:
dfnum.describe()

But that's a lot of numbers to digest. Better get started plotting!


In [ ]:
def select_numerical_features(df):
    return df.loc[:, df.dtypes != object]

pp.append(select_numerical_features)
# Check the pipeline
pp == dfnum

Deal with NaN values


In [ ]:
cols_with_nulls = dfnum.columns[dfnum.isnull().sum() > 0]
cols_with_nulls

In [ ]:
dfnum[cols_with_nulls].isnull().sum().sort_values(ascending=False)

Based on the data description, the null values for MasVnrArea should be 0 (no masonry veneer).


In [ ]:
# We may want to refine this in the future. Perhaps build a model to predict the missing GarageCars from the other features?
median_list = 'LotFrontage', 'BsmtFullBath','BsmtHalfBath', 'GarageCars', 'GarageArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'TotalBsmtSF', 'BsmtUnfSF'
zero_list = []

def fillnans(dfnum):
    return (dfnum
            .pipe(samlib.fillna, 'median', median_list)
            .pipe(samlib.fillna, lambda df: 0, zero_list)
            # fill missing GarageYrBlt with the year the house was built
            .assign(GarageYrBlt=dfnum.GarageYrBlt.fillna(
                dfnum.YearBuilt[dfnum.GarageYrBlt.isnull()])))

In [ ]:
dfnum = fillnans(dfnum)

In [ ]:
# Check that we got rid of the nulls
assert not samlib.has_nulls(dfnum)
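
samlib.fillna and samlib.has_nulls are also local helpers. A rough sketch of what they might do, assuming fillna fills NaNs in the given columns with either the column median or the value returned by a callable:


In [ ]:
# Hypothetical sketches of the samlib helpers used above (the real
# implementations live in the local samlib module).
def fillna_sketch(df, how, columns):
    """Fill NaNs in `columns`; `how` is 'median' or a callable df -> fill value."""
    df = df.copy()
    for col in columns:
        fill = df[col].median() if how == 'median' else how(df)
        df[col] = df[col].fillna(fill)
    return df

def has_nulls_sketch(df):
    """Return True if the dataframe contains any missing values."""
    return df.isnull().values.any()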

In [ ]:
pp.append(fillnans)
# Check the pipeline
pp == dfnum

Order the columns alphabetically


In [ ]:
def order_columns(df):
    return df.reindex(columns=df.columns.sort_values())

In [ ]:
pp.append(order_columns)

In [ ]:
pp().head()

In [ ]:
dfnum = pp()
dfnum.head()

Plot violin plots for each feature

The violin plots give us some idea of the distribution of data for each feature. We can look for things like skewness, non-normality, and the presence of outliers.


In [ ]:
dfnum.shape

In [ ]:
samlib.featureplot(dfnum, ncols=6, nrows=6, figsize=(12, 4))
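
samlib.featureplot is another local helper. A rough sketch of the kind of grid it produces, assuming it simply draws one violin plot per column:


In [ ]:
# Hypothetical sketch of samlib.featureplot (the real helper is in samlib):
# one small violin plot per column, laid out on an nrows x ncols grid.
def featureplot_sketch(df, nrows, ncols, figsize):
    fig, axes = plt.subplots(nrows, ncols, figsize=figsize)
    for ax, col in zip(np.ravel(axes), df.columns):
        sns.violinplot(y=df[col], ax=ax)
        ax.set_title(col)
        ax.set_ylabel('')
    fig.tight_layout()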

Many of the features are highly skewed and some have very long tails. Some have discrete values (YrSold, Fireplaces).

The features with very long, thin tails, such as ScreenPorch, are almost constant (a blob with a long tail), as can be seen below.


In [ ]:
fig, ax = plt.subplots(1,1, figsize=(4, 4))
sns.distplot(dfnum.ScreenPorch, ax=ax)
ax.set_title('Distribution of ScreenPorch')

Drop nearly constant features


In [ ]:
def test_nearly_constant(series):
    """Return True if the most frequent value dominates the column."""
    counts = series.value_counts()
    max_val_count = max(counts)
    other_val_count = counts.drop(counts.idxmax()).sum()
    # nearly constant if all other values together amount to less than 25% of the modal count
    return other_val_count / max_val_count < 0.25

is_nearly_constant = dfnum.apply(test_nearly_constant)
is_nearly_constant.value_counts()

In [ ]:
dropme = dfnum.columns[is_nearly_constant]
dropme

In [ ]:
def drop_constant_features(df):
    return df.drop(df.columns[df.apply(test_nearly_constant)], axis=1)

pp.append(drop_constant_features)
pp == dfnum.drop(dropme, axis=1)

In [ ]:
dfnum = dfnum.drop(dropme, axis=1)

Log-transform the remaining features if they are highly skewed

Using a log transformation on some of the skewed features should help, as illustrated below. We use the raw data (not standardized data) because the log transform needs non-negative values (we'll standardize the transformed variables later).


In [ ]:
fig, axes = plt.subplots(1,2, figsize=(8, 4))
sns.distplot(dfnum['LotArea'], ax=axes[0])
sns.distplot(np.log1p(dfnum['LotArea']), ax=axes[1])

Work with DataFrames and Series whenever possible, for maximum flexibility (see below).


In [ ]:
def skewtest(train, sort=True, ascending=True):
    """Return dataframe of zfactor and pvalue for skew test"""
    test = sp.stats.skewtest(train)
    zfactor = test[0]
    pvalue = test[1]
    df = pd.DataFrame(dict(zfactor=zfactor, pvalue=pvalue), index=train.columns)
    if sort:
        return df.sort_values(by='zfactor', ascending=ascending)
    else:
        return df

skewtest(dfnum).head()

In [ ]:
def is_skewed(train, min_zfactor=10, plot=False):
    """Return series of booleans indicating whether a column is skewed or not.
    """
    sk = skewtest(train)
    if plot:
        plt.figure(1)
        plt.title('Z-factor distribution from skewtest')
        plt.xlabel('Z-factor')
        sns.distplot(sk.zfactor)
        plt.figure(2)
        sk.zfactor.plot(kind='barh')
        plt.title('Z-factor for skewtest')
    return sk.zfactor > min_zfactor

In [ ]:
is_skewed(dfnum, min_zfactor=10, plot=True)

Let's apply a log1p transform to all these and plot the distributions again


In [ ]:
def transform_skewed_colums(dfnum):
    """Apply log1p to the columns of dfnum that the skew test flags as skewed."""
    dfnum2 = dfnum.copy()
    skewed_colz = is_skewed(dfnum)
    dfnum2.loc[:, skewed_colz] = dfnum2.loc[:, skewed_colz].apply(np.log1p)
    return dfnum2

pp.append(transform_skewed_colums)

# the transformed dataset has fewer columns and we only want those
dfnum2 = pp()

In [ ]:
dfnum2.columns

In [ ]:
is_skewed(dfnum2)

In [ ]:
sorted(sp.stats.skewtest(dfnum2)[0])

In [ ]:
zfactors2 = sp.stats.skewtest(dfnum2)[0]
pd.Series(data=zfactors2, index=dfnum2.columns)[is_skewed(dfnum)].sort_values().plot(kind='barh')

Now our originally skewed features look more symmetric.

Check that the distributions are less skewed


In [ ]:
skewed = is_skewed(dfnum)
skewed.value_counts()

In [ ]:
dfnum.shape

In [ ]:
samlib.featureplot(dfnum2.loc[:, skewed], nrows=3, ncols=6, figsize=(10,3))

In [ ]:
samlib.featureplot(dfnum2.loc[:, ~skewed], nrows=2, ncols=5, figsize=(10, 3))

Save transformed numerical data

Write the result to CSV so other notebooks can load it (IPython's %store magic would be an alternative way to share data between notebooks).


In [ ]:
dfnum2.to_csv('transformed_dataset_dfnum2.csv', index=True)

Correlations

We're now in a good position to identify the key numerical features. Those should be highly correlated with the sale price.
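
As a quick cross-check of the helper defined in the next cell, pandas can compute the same Pearson correlations directly with corrwith (though without the p-values):


In [ ]:
# Quick cross-check: Pearson correlation of every numerical feature with the
# log-transformed sale price, computed directly with pandas.
dfnum2.loc['train'].corrwith(logtarget['SalePrice']).sort_values()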


In [ ]:
def correlation(train, target_t):
    corr = pd.DataFrame(data=train.apply(lambda feature: sp.stats.pearsonr(feature, target_t['SalePrice'])), 
                        columns=['pearsonr'])
    corr = corr.assign(correlation=corr.applymap(lambda x: x[0]),
                       pvalue=corr.applymap(lambda x: x[1]))
    corr = corr.drop('pearsonr', axis=1)
    return corr.sort_values('pvalue', ascending=False)['correlation']

In [ ]:
correlation(dfnum2.loc['train', :], logtarget).plot(kind='barh')

Sort columns in dfnum2 by correlation.


In [ ]:
def sort_columns_by_correlation(dfnum2, target_t=logtarget):
    corr = correlation(dfnum2.loc['train', :], target_t)
    return dfnum2.reindex(columns=corr[::-1].index)

#sort_columns_by_correlation(dfnum2)
pp.append(sort_columns_by_correlation)

In [ ]:
pp().head()

In [ ]:
pp().to_csv('transformed_numerical_dataset.csv', index=True)

Scatter plots

A high correlation coefficient doesn't guarantee a clean linear relationship between a feature and the target (outliers or non-linear trends can also produce large coefficients), so we should also check the scatter plots.
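
As a quick aside using synthetic data (not the housing data), a strongly non-linear relationship can still give a large Pearson coefficient:


In [ ]:
# Illustration with synthetic data: x vs x**2 is clearly non-linear,
# yet the Pearson correlation coefficient is roughly 0.97.
xs = np.linspace(0, 10, 100)
sp.stats.pearsonr(xs, xs ** 2)[0]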


In [ ]:
train = pp().loc['train'].assign(target=logtarget['SalePrice'])

In [ ]:
samlib.featureplot2(train, ncols=4, size=3, aspect=1.0, plotfunc=sns.regplot, y="target", data=train)
#(train.iloc[:, :-1], ncols=4, nrows=7, plotfunc=scatter, figsize=(12,3))

Some features have a sort of bimodal distribution, with a lot of zero values.


In [ ]:
cols_with_zeros = ['OpenPorchSF', 'MasVnrArea', 'TotalBsmtSF', 'WoodDeckSF', 'BsmtUnfSF', 'BsmtFinSF1', '2ndFlrSF']

In [ ]:
not_oktrain = train.loc[:, cols_with_zeros + ["target"]]
samlib.featureplot2(not_oktrain, ncols=4, size=3, aspect=1.0, 
                    plotfunc=sns.regplot, y="target", data=not_oktrain)

Dealing with the zeros

Let's recompute the correlations after dropping the zeros.


In [ ]:
notok = not_oktrain[not_oktrain != 0].drop('target', axis=1)
#correlation(notok, logtarget)
notok.head()

In [ ]:
def correlation2(train, target=logtarget['SalePrice'], ignorena=True):

    def corr(series):
        if ignorena:
            mask = ~series.isnull()
            return sp.stats.pearsonr(series[mask], target[mask])
        return sp.stats.pearsonr(series, target)

    df = pd.DataFrame(data=train.apply(corr), columns=['pearsonr'])
    return df.assign(pearson=df.applymap(lambda x: x[0]),
                     pvalue=df.applymap(lambda x: x[1])).drop('pearsonr', axis=1)

notok_corrs = correlation2(notok).sort_values('pearson')

In [ ]:
corrs_with_zeros = correlation(not_oktrain, logtarget).reindex(notok_corrs.index)
corrs_without_zeros = notok_corrs['pearson']

In [ ]:
pd.DataFrame(dict(with_zeros=corrs_with_zeros, no_zeros=corrs_without_zeros)).plot(kind='barh')
plt.title('Effect of removing zeros on correlation')

In [ ]:
not_oktrain2 = not_oktrain.reindex(columns=notok_corrs.index[::-1])

In [ ]:
def regplot_dropzeros(data=not_oktrain, drop_zeros=False, **kwargs):
    col = data.columns[0]
    if drop_zeros:
        mask = data[col] != 0
        xt = data[mask].squeeze()
        yt = logtarget[mask].squeeze()
    else:
        xt = data.squeeze()
        yt = logtarget.squeeze()
    sns.regplot(x=xt, y=yt, **kwargs)

With the zeros


In [ ]:
samlib.featureplot(not_oktrain2, ncols=4, nrows=2, figsize=(12, 3), plotfunc=regplot_dropzeros, drop_zeros=False)

Without the zeros


In [ ]:
samlib.featureplot(not_oktrain2, ncols=4, nrows=2, figsize=(12, 3), plotfunc=regplot_dropzeros, drop_zeros=True)

Data imputation

For the "not ok" columns, treat the zeros as missing values and impute them from the rest of the dataset using statsmodels' MICE (Multivariate Imputation by Chained Equations).


In [ ]:
import statsmodels.api as sm

In [ ]:
from statsmodels.imputation import mice

In [ ]:
df = pp()

In [ ]:
# Replace zeros with NaNs in the columns that have spurious zeros
df.loc[:, cols_with_zeros] = df.loc[:, cols_with_zeros].replace(0, np.nan)

In [ ]:
# MICE builds model formulas from the column names, which must not start with a digit
df = df.rename(columns={'1stFlrSF': 'FrstFlrSF', '2ndFlrSF': 'SndFlrSF'})
df.loc['train', 'SalePrice'] = logtarget['SalePrice'].values
df.head()

In [ ]:
samlib.has_nulls(df)

In [ ]:
imp = mice.MICEData(df)

In [ ]:
imp.update_all()
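
For comparison, scikit-learn ships a chained-equations imputer of its own. A minimal sketch of the same imputation done with IterativeImputer (an alternative, not what this notebook uses):


In [ ]:
# Alternative sketch (not used further in this notebook): scikit-learn's
# IterativeImputer performs a similar round-robin regression imputation.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer

imputed = IterativeImputer(random_state=0).fit_transform(df)
df_sklearn_imputed = pd.DataFrame(imputed, index=df.index, columns=df.columns)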

In [ ]:
imp.data.head()

In [ ]:
cols = ['OpenPorchSF', 'MasVnrArea', 'TotalBsmtSF', 'WoodDeckSF', 'BsmtUnfSF', 'BsmtFinSF1', 'SndFlrSF','SalePrice']

In [ ]:
imputed_notok = imp.data.loc[:, cols]

In [ ]:
imputed_notok.columns

In [ ]:
samlib.featureplot2(imputed_notok, ncols=4, size=3, aspect=1.0, 
                    plotfunc=sns.regplot, y="SalePrice", data=imputed_notok)

In [ ]:
imp.data.shape

In [ ]:
df.shape

In [ ]:
imp.data.index = df.index  # make sure imp.data carries the original ('Dataset', 'Id') index

In [ ]:
imp.data.head()

In [ ]:
imp.data.reindex_like(df).to_csv('transformed_numerical_dataset_imputed.csv', index=True)