In [ ]:
import itertools
import os
import sys
import pandas as pd
import numpy as np
import scipy as sp
import scipy.stats  # 'import scipy' alone does not reliably expose sp.stats
import sklearn as sk
import sklearn.preprocessing
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smapi
In [ ]:
sys.path.insert(1, os.path.join(sys.path[0], '..')) # add parent directory to path
import samlib
We use the cleaner method-chaining style for transforming the data (see https://tomaugspurger.github.io/method-chaining.html).
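To illustrate the style on a toy frame (hypothetical data, not the housing set): each transformation is one link in the chain, so there are no throwaway intermediate variables.
In [ ]:
# Toy illustration of method chaining (hypothetical data)
toy = pd.DataFrame({'price': [200000, 350000, 125000], 'area': [1500, 2400, 900]})
(toy.assign(price_k=lambda d: d.price / 1000)  # derived column, no temp variable
    .query('area > 1000')                      # filter as another chained step
    .rename(columns=str.title))                # final tidy-up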
In [ ]:
target = pd.read_csv('../data/train_target.csv')
In [ ]:
target.describe()
The sale price is in the hundreds of thousands, so let's divide the price by 1000 to get more manageable numbers.
In [ ]:
target = target / 1000
In [ ]:
sns.distplot(target);
plt.title('SalePrice')
In [ ]:
sp.stats.skew(target)
In [ ]:
sp.stats.skewtest(target)
The distribution is skewed, as demonstrated by the large z-score (and small p-value) of the skew test. It is right-skewed (the skew is positive). Skewed distributions are not ideal for linear models, which often assume a normal distribution. One way to correct for right-skewness is to take the log [1,2,3].
We apply the function $x \rightarrow \log(1 + x)$ rather than a plain $\log$ because it is defined and non-negative for every $x \geq 0$, so zero values are handled gracefully.
In [ ]:
logtarget = np.log1p(target)
print('skewness of logtarget = ', sp.stats.skew(logtarget)[0])
print('skewness test of logtarget = ', sp.stats.skewtest(logtarget))
sns.distplot(logtarget)
plt.title(r'log(1 + SalePrice)')
We're going to explore the training dataset and apply some transformations to it (fixing missing values, transforming columns, etc.). We'll need to apply the same transformations to the test dataset, so to make that easy, let's put the training and test datasets into one dataframe.
In [ ]:
def read():
    """Read the training and test data and return a dataframe with a ['Dataset', 'Id'] multi-index."""
    raw_train = pd.read_csv('../data/train_prepared_light.csv')
    raw_test = pd.read_csv('../data/test_prepared_light.csv')
    df = pd.concat([raw_train, raw_test], keys=['train', 'test'])
    df.index.names = 'Dataset', 'Id'
    return df

df = read()
In [ ]:
df.shape
In [ ]:
df.head()
In [ ]:
df.tail()
Initialize the pipeline with the raw data. We can always recover the data with all the transformations in the pipeline applied by calling pp().
In [ ]:
pp = samlib.Pipeline(df.copy())
assert pp == df # the pipeline output equals df
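samlib.Pipeline is a local helper that isn't shown in this notebook. As a mental model only (a minimal hypothetical sketch, not samlib's actual implementation), a pipeline with this interface stores the raw data plus an ordered list of transforms and replays them on every call:
In [ ]:
# Hypothetical sketch of the Pipeline interface used below
class PipelineSketch:
    def __init__(self, data):
        self.data = data          # raw starting point
        self.transforms = []      # transformations, applied in order

    def append(self, func):
        self.transforms.append(func)

    def __call__(self):
        out = self.data
        for func in self.transforms:
            out = func(out)       # replay every step on the raw data
        return out

    def __eq__(self, other):      # pp == df compares the replayed output
        return self().equals(other)
This is why a pp == ... check appears after each pp.append(...) below: it verifies that replaying the whole pipeline reproduces the dataframe we built by hand.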
The dataset is wide, with 78 features. Let's create a dataframe containing only the numerical features.
In [ ]:
df.columns, len(df.columns)
We've got 3 data types: int, float and object
In [ ]:
df.dtypes.value_counts()
Split the data between categorical and numerical features
In [ ]:
is_categorical = (df.dtypes == object)
is_numerical = ~is_categorical
In [ ]:
dfnum = df.loc[:, is_numerical].copy()
In [ ]:
dfnum.columns, len(dfnum.columns)
We've got 36 numerical features. We can use the describe method to get some statistics:
In [ ]:
dfnum.describe()
But that's a lot of numbers to digest. Better get started plotting!
In [ ]:
def select_numerical_features(df):
    return df.loc[:, df.dtypes != object]

pp.append(select_numerical_features)

# Check the pipeline
pp == dfnum
In [ ]:
cols_with_nulls = dfnum.columns[dfnum.isnull().sum() > 0]
cols_with_nulls
In [ ]:
dfnum[cols_with_nulls].isnull().sum().sort_values(ascending=False)
Based on the data description, the null values for MasVnrArea should be 0 (no masonry veneer type).
In [ ]:
# We may want to refine this in the future. Perhaps build a model to predict the missing GarageCars from the other features?
median_list = ('LotFrontage', 'BsmtFullBath', 'BsmtHalfBath', 'GarageCars', 'GarageArea',
               'BsmtFinSF1', 'BsmtFinSF2', 'TotalBsmtSF', 'BsmtUnfSF')
zero_list = ['MasVnrArea']  # per the data description, a missing veneer area means no veneer

def fillnans(dfnum):
    return (dfnum.pipe(samlib.fillna, 'median', median_list)
                 .pipe(samlib.fillna, lambda df: 0, zero_list)
                 .assign(GarageYrBlt=dfnum.GarageYrBlt.fillna(
                     dfnum.YearBuilt[dfnum.GarageYrBlt.isnull()])))  # assume the garage was built with the house
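samlib.fillna isn't shown either; judging from how it's called, a plausible sketch (hypothetical, the real helper may differ) fills the NaNs of the listed columns with either the column median or the value produced by a callable:
In [ ]:
# Hypothetical sketch of samlib.fillna(df, how, columns)
def fillna_sketch(df, how, columns):
    out = df.copy()
    for col in columns:
        value = out[col].median() if how == 'median' else how(out)
        out[col] = out[col].fillna(value)
    return out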
In [ ]:
dfnum = fillnans(dfnum)
In [ ]:
# Check that we got rid of the nulls
assert not samlib.has_nulls(dfnum)
In [ ]:
pp.append(fillnans)
# Check the pipeline
pp == dfnum
In [ ]:
def order_columns(df):
    # reindex_axis is deprecated; reindex(columns=...) is the modern equivalent
    return df.reindex(columns=df.columns.sort_values())
In [ ]:
pp.append(order_columns)
In [ ]:
pp().head()
In [ ]:
dfnum = pp()
dfnum.head()
In [ ]:
dfnum.shape
In [ ]:
samlib.featureplot(dfnum, ncols=6, nrows=6, figsize=(12, 4))
Many of the features are highly skewed and some have very long tails. Some take discrete values (YrSold, Fireplaces).
The features with very long, thin tails, such as ScreenPorch, are almost constant (a single dominant value plus a long tail), as can be seen below.
In [ ]:
fig, ax = plt.subplots(1,1, figsize=(4, 4))
sns.distplot(dfnum.ScreenPorch, ax=ax)
ax.set_title('Distribution of ScreenPorch')
In [ ]:
def test_nearly_constant(series):
    """A feature is 'nearly constant' if all other values combined occur
    less than a quarter as often as the most common value."""
    counts = series.value_counts()
    max_val_count = max(counts)
    other_val_count = counts.drop(counts.idxmax()).sum()
    return other_val_count / max_val_count < 0.25

is_nearly_constant = dfnum.apply(test_nearly_constant)
is_nearly_constant.value_counts()
In [ ]:
dropme = dfnum.columns[is_nearly_constant]
dropme
In [ ]:
def drop_constant_features(df):
    return df.drop(df.columns[df.apply(test_nearly_constant)], axis=1)

pp.append(drop_constant_features)
pp == dfnum.drop(dropme, axis=1)
In [ ]:
dfnum = dfnum.drop(dropme, axis=1)
Using a log transformation for some of the skewed features should help, as illustrated below. We use the raw data (not standardized data) because the log1p transform requires non-negative values (we'll standardize the transformed variables later).
In [ ]:
fig, axes = plt.subplots(1,2, figsize=(8, 4))
sns.distplot(dfnum['LotArea'], ax=axes[0])
sns.distplot(np.log1p(dfnum['LotArea']), ax=axes[1])
We work with dataframes and series whenever possible, for maximum flexibility (see below).
In [ ]:
def skewtest(train, sort=True, ascending=True):
    """Return a dataframe with the z-score and p-value of the skew test for each column."""
    zfactor, pvalue = sp.stats.skewtest(train)
    df = pd.DataFrame(dict(zfactor=zfactor, pvalue=pvalue), index=train.columns)
    if sort:
        return df.sort_values(by='zfactor', ascending=ascending)
    return df

skewtest(dfnum).head()
In [ ]:
def is_skewed(train, min_zfactor=10, plot=False):
    """Return a series of booleans indicating whether each column is skewed or not."""
    sk = skewtest(train)
    if plot:
        plt.figure(1)
        plt.title('Z-factor distribution from skewtest')
        plt.xlabel('Z-factor')
        sns.distplot(sk.zfactor)
        plt.figure(2)
        sk.zfactor.plot(kind='barh')
        plt.title('Z-factor for skewtest')
    return sk.zfactor > min_zfactor
In [ ]:
is_skewed(dfnum, min_zfactor=10, plot=True)
Let's apply a log1p transform to all of these and plot the distributions again.
In [ ]:
def transform_skewed_columns(dfnum):
    """Apply log1p to the skewed columns of dfnum and return the result as a copy."""
    dfnum2 = dfnum.copy()
    skewed_cols = is_skewed(dfnum)
    dfnum2.loc[:, skewed_cols] = dfnum2.loc[:, skewed_cols].apply(np.log1p)
    return dfnum2

pp.append(transform_skewed_columns)
# the transformed dataset has fewer columns and we only want those
dfnum2 = pp()
In [ ]:
dfnum2.columns
In [ ]:
is_skewed(dfnum2)
In [ ]:
sorted(sp.stats.skewtest(dfnum2)[0])
In [ ]:
zfactors2 = sp.stats.skewtest(dfnum2)[0]
pd.Series(data=zfactors2, index=dfnum2.columns)[is_skewed(dfnum)].sort_values().plot(kind='barh')
Now our originally skewed features look more symmetric.
In [ ]:
skewed = is_skewed(dfnum)
skewed.value_counts()
In [ ]:
dfnum.shape
In [ ]:
samlib.featureplot(dfnum2.loc[:, skewed], nrows=3, ncols=6, figsize=(10,3))
In [ ]:
samlib.featureplot(dfnum2.loc[:, ~skewed], nrows=2, ncols=5, figsize=(10, 3))
In [ ]:
dfnum2.to_csv('transformed_dataset_dfnum2.csv', index=True)
In [ ]:
def correlation(train, target_t):
    """Pearson correlation of each feature with the target, sorted by p-value."""
    results = train.apply(lambda feature: sp.stats.pearsonr(feature, target_t['SalePrice']))
    corr = pd.DataFrame(results.tolist(), index=train.columns, columns=['correlation', 'pvalue'])
    return corr.sort_values('pvalue', ascending=False)['correlation']
In [ ]:
correlation(dfnum2.loc['train', :], logtarget).plot(kind='barh')
Sort columns in dfnum2 by correlation.
In [ ]:
def sort_columns_by_correlation(dfnum2, target_t=logtarget):
    corr = correlation(dfnum2.loc['train', :], target_t)
    return dfnum2.reindex(columns=corr[::-1].index)

# sort_columns_by_correlation(dfnum2)
pp.append(sort_columns_by_correlation)
In [ ]:
pp().head()
In [ ]:
pp().to_csv('transformed_numerical_dataset.csv', index=True)
In [ ]:
train = pp().loc['train'].assign(target=logtarget['SalePrice'].values)
In [ ]:
samlib.featureplot2(train, ncols=4, size=3, aspect=1.0, plotfunc=sns.regplot, y="target", data=train)
Some features have a sort of bimodal distribution, with a lot of zero values.
In [ ]:
cols_with_zeros = ['OpenPorchSF', 'MasVnrArea', 'TotalBsmtSF', 'WoodDeckSF', 'BsmtUnfSF', 'BsmtFinSF1', '2ndFlrSF']
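As a quick check, the fraction of zero entries in each of these columns:
In [ ]:
# Fraction of zeros per suspect column
(train[cols_with_zeros] == 0).mean().sort_values(ascending=False)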
In [ ]:
not_oktrain = train.loc[:, cols_with_zeros + ["target"]]
samlib.featureplot2(not_oktrain, ncols=4, size=3, aspect=1.0,
                    plotfunc=sns.regplot, y="target", data=not_oktrain)
In [ ]:
notok = not_oktrain[not_oktrain != 0].drop('target', axis=1)  # masking turns the zeros into NaNs
# correlation(notok, logtarget)
notok.head()
In [ ]:
def correlation2(train, target=logtarget['SalePrice'], ignorena=True):
    """Pearson correlation of each feature with the target, optionally ignoring NaN entries."""
    def corr(series):
        if ignorena:
            mask = ~series.isnull()
            return sp.stats.pearsonr(series[mask], target[mask])
        return sp.stats.pearsonr(series, target)
    results = train.apply(corr)
    return pd.DataFrame(results.tolist(), index=train.columns, columns=['pearson', 'pvalue'])

notok_corrs = correlation2(notok).sort_values('pearson')
In [ ]:
corrs_with_zeros = correlation(not_oktrain, logtarget).reindex(notok_corrs.index)
corrs_without_zeros = notok_corrs['pearson']
In [ ]:
pd.DataFrame(dict(with_zeros=corrs_with_zeros, no_zeros=corrs_without_zeros)).plot(kind='barh')
plt.title('Effect of removing zeros on correlation')
In [ ]:
not_oktrain2 = not_oktrain.reindex(columns=notok_corrs.index[::-1])
In [ ]:
def regplot_dropzeros(data=not_oktrain, drop_zeros=False, **kwargs):
    col = data.columns[0]
    if drop_zeros:
        mask = data[col] != 0
        xt = data[mask].squeeze()
        yt = logtarget[mask].squeeze()
    else:
        xt = data.squeeze()
        yt = logtarget.squeeze()
    sns.regplot(xt, yt, **kwargs)
With the zeros
In [ ]:
samlib.featureplot(not_oktrain2, ncols=4, nrows=2, figsize=(12, 3), plotfunc=regplot_dropzeros, drop_zeros=False)
Without the zeros
In [ ]:
samlib.featureplot(not_oktrain2, ncols=4, nrows=2, figsize=(12, 3), plotfunc=regplot_dropzeros, drop_zeros=True)
In [ ]:
import statsmodels.api as sm
In [ ]:
# MICE: Multiple Imputation by Chained Equations
from statsmodels.imputation import mice
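MICE models each column that has missing values as a regression on the other columns and cycles through the columns, refining the imputed values on each pass. A minimal toy run on synthetic data (independent of our dataset):
In [ ]:
# Toy MICE run: impute gaps in a small synthetic frame
toy = pd.DataFrame(dict(x=np.random.randn(100), y=np.random.randn(100)))
toy.loc[::10, 'x'] = np.nan        # knock out every tenth x value
toy_imp = mice.MICEData(toy)
toy_imp.update_all()               # one full cycle of chained imputations
assert not toy_imp.data.isnull().values.any()  # the gaps are filled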
In [ ]:
df = pp()
In [ ]:
# Replace zeros by NaNs so that MICE will impute them along with the other missing values
df[df.loc[:, cols_with_zeros] == 0] = np.nan
In [ ]:
df = df.rename(columns={'1stFlrSF': 'FrstFlrSF', '2ndFlrSF': 'SndFlrSF'})  # MICE builds formulas, so names can't start with a digit
df.loc['train', 'SalePrice'] = logtarget['SalePrice'].values
df.head()
In [ ]:
samlib.has_nulls(df)
In [ ]:
imp = mice.MICEData(df)
In [ ]:
imp.update_all()  # run a full cycle of chained-equation imputations over the columns with missing data
In [ ]:
imp.data.head()
In [ ]:
cols = ['OpenPorchSF', 'MasVnrArea', 'TotalBsmtSF', 'WoodDeckSF', 'BsmtUnfSF', 'BsmtFinSF1', 'SndFlrSF','SalePrice']
In [ ]:
imputed_notok = imp.data.loc[:, cols]
In [ ]:
imputed_notok.columns
In [ ]:
samlib.featureplot2(imputed_notok, ncols=4, size=3, aspect=1.0,
plotfunc=sns.regplot, y="SalePrice", data=imputed_notok)
In [ ]:
imp.data.shape
In [ ]:
df.shape
In [ ]:
imp.data.index = df.index  # restore the ['Dataset', 'Id'] multi-index
In [ ]:
imp.data.head()
In [ ]:
imp.data.reindex_like(df).to_csv('transformed_numerical_dataset_imputed.csv', index=True)