In [ ]:
import itertools
import os
import sys
import pandas as pd
import numpy as np
import scipy as sp
import sklearn as sk
import sklearn.preprocessing
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smapi
In [ ]:
sys.path.insert(1, os.path.join(sys.path[0], '..')) # add parent directory to path
import samlib
In [ ]:
target = pd.read_csv('../data/train_target.csv')
In [ ]:
target.describe()
The sale price is in the hundreds of thousands, so let's divide it by 1000 to get more manageable numbers.
In [ ]:
target = target / 1000
In [ ]:
sns.distplot(target);
plt.title('SalePrice')
In [ ]:
sp.stats.skew(target)
In [ ]:
sp.stats.skewtest(target)
The distribution is skewed, as demonstrated by the large z-score (and small p-value) of the skew test. It is right-skewed (the skew is positive). Skewed distributions are not ideal for linear models, which often assume approximate normality. One way to correct for right-skewness is to take the log [1,2,3].
We apply the function $x \rightarrow \log(1 + x)$ rather than a plain logarithm because it is well-defined and non-negative for all $x \geq 0$ (it avoids $\log(0)$).
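As a quick, purely illustrative check (on a synthetic right-skewed sample, not our target), applying log1p substantially reduces the skewness:
In [ ]:
# Hypothetical example: log1p applied to a right-skewed (lognormal) sample
rng = np.random.RandomState(0)
sample = rng.lognormal(mean=0, sigma=1, size=10000)
print('skewness before log1p =', sp.stats.skew(sample))
print('skewness after log1p  =', sp.stats.skew(np.log1p(sample)))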
In [ ]:
logtarget = np.log1p(target)
print('skewness of logtarget = ', sp.stats.skew(logtarget)[0])
print('skewness test of logtarget = ', sp.stats.skewtest(logtarget))
sns.distplot(logtarget)
plt.title(r'log(1 + SalePrice)')
We're going to explore the training dataset and apply some transformations to it (fixing missing values, transforming columns etc). We'll need to apply the same transformations to the test dataset. To make that easy, let's put the training and test datasets into one dataframe.
In [ ]:
raw_train = pd.read_csv('../data/train_prepared_light.csv')
raw_test = pd.read_csv('../data/test_prepared_light.csv')
df = pd.concat([raw_train, raw_test], keys=['train', 'test'])
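Because we passed keys to pd.concat, the combined dataframe gets a two-level index, so each dataset stays easy to recover later with .loc (a quick sanity check below):
In [ ]:
# The 'train'/'test' keys form the outer index level, so either subset is one .loc away
df.loc['train'].shape, df.loc['test'].shape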
In [ ]:
df.shape
In [ ]:
ncategories = sum(df.dtypes == object)
ncategories
In [ ]:
df.head()
In [ ]:
df.tail()
In [ ]:
df.columns, len(df.columns)
We've got 3 data types: int, float and object
In [ ]:
df.dtypes.unique()
Split the data between categorical and numerical features
In [ ]:
is_categorical = (df.dtypes == object)
is_numerical = ~is_categorical
Create a numerical dataframe to keep track of these features:
In [ ]:
dfnum = df.loc[:, is_numerical].copy()
In [ ]:
dfnum.columns, len(dfnum.columns)
We've got 36 numerical features. We can use the describe method to get some statistics:
In [ ]:
dfnum.describe()
But that's a lot of numbers to digest, so let's get plotting. To help with plotting, and also to improve linear regression models, we're going to standardize our data. Before that, however, we must deal with the NaN values. See http://sebastianraschka.com/Articles/2014_about_feature_scaling.html for more on feature scaling.
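As a reminder, standardizing a feature $x$ means rescaling it to zero mean and unit variance, $x \rightarrow (x - \mu) / \sigma$, where $\mu$ and $\sigma$ are the mean and standard deviation of that column; this puts all features on a comparable scale.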
In [ ]:
cols_with_nulls = dfnum.columns[dfnum.isnull().sum() > 0]
cols_with_nulls
In [ ]:
dfnum.shape
In [ ]:
dfnum[cols_with_nulls].isnull().sum().sort_values(ascending=False)
#.plot(kind='bar')
Based on the data description, the null values for MasVnrArea should be 0 (no masonry veneer).
In [ ]:
# We may want to refine this in the future. Perhaps build a model to predict the missing GarageCars from the other features?
median_list = ['LotFrontage', 'BsmtFullBath', 'BsmtHalfBath', 'GarageCars', 'GarageArea']
zero_list = ['MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'TotalBsmtSF', 'BsmtUnfSF']
In [ ]:
for feature in median_list:
    dfnum[feature] = dfnum[feature].fillna(dfnum[feature].median())
In [ ]:
for feature in zero_list:
    dfnum[feature] = dfnum[feature].fillna(0)
For GarageYrBlt, replace missing values with the year the house was built (fillna aligns on the index, so only the null entries are filled).
In [ ]:
dfnum['GarageYrBlt'] = dfnum['GarageYrBlt'].fillna(dfnum['YearBuilt'])
In [ ]:
# Check that we got rid of the nulls (this should be False)
dfnum.isnull().sum().any()
In [ ]:
# Assign to the slice (see the copy / write problem in Pandas)
df.loc[:, is_numerical] = dfnum
In [ ]:
def standardize(col):
    """Standardize a column to zero mean and unit variance."""
    # StandardScaler expects a 2D array, so reshape the column before scaling
    return sk.preprocessing.StandardScaler().fit_transform(col.values.reshape(-1, 1)).ravel()
dfnum_t = dfnum.apply(standardize)
dfnum_t.head()
In [ ]:
def violinplot(df, ax=None):
    """Draw one violin per column of df on a single axis, with rotated x labels."""
    if ax is None:
        ax = plt.gca()
    sns.violinplot(data=df, ax=ax)
    for xlab in ax.get_xticklabels():
        xlab.set_rotation(30)
In [ ]:
def featureplot(df, nrows=1, figsize=(12, 8), plotfunc=violinplot):
    """Plot the dataframe features, spreading the columns over nrows rows of plots."""
    width, height = figsize
    fig, axes = plt.subplots(nrows, 1, figsize=(width, height * nrows))
    if nrows == 1:
        axes = [axes]
    plots_per_figure = df.shape[1] // nrows
    i = 0
    for k, ax in enumerate(axes):
        # give the last row whatever columns remain so none are dropped
        j = df.shape[1] if k == nrows - 1 else i + plots_per_figure
        plotfunc(df.iloc[:, i:j], ax=ax)
        i = j
In [ ]:
dfnum_t.head()
In [ ]:
train = dfnum.loc['train',:]
train_t = dfnum_t.loc['train',:]
Many of the features are highly skewed, with very long tails.
In [ ]:
featureplot(train_t.iloc[:, 0:9])
Most of these are right-skewed as well. BsmtFullBath takes discrete values (the number of bathrooms).
In [ ]:
featureplot(train_t.iloc[:, 9:18])
Some features, such as BsmtFinSF2, are almost constant (a single blob with a long tail), as can be seen below:
In [ ]:
fig, ax = plt.subplots(1,1, figsize=(4, 4))
sns.distplot(train_t['BsmtFinSF2'], ax=ax)
ax.set_title('Distribution of BsmtFinSF2')
In [ ]:
def test_nearly_constant(series):
    """Return True if a single value accounts for the vast majority of the series."""
    counts = series.value_counts()
    max_val_count = counts.max()
    other_val_count = counts.drop(counts.idxmax()).sum()
    return other_val_count / max_val_count < 0.25
is_nearly_constant = train_t.apply(test_nearly_constant)
is_nearly_constant.value_counts()
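As a quick sanity check on made-up data (not part of our dataset), a series dominated by a single value should be flagged as nearly constant, while a balanced one should not:
In [ ]:
# Hypothetical check: 95% zeros -> nearly constant (True); a 50/50 split -> not (False)
test_nearly_constant(pd.Series([0] * 95 + [1] * 5)), test_nearly_constant(pd.Series([0, 1] * 50))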
In [ ]:
dropme = train_t.columns[is_nearly_constant]
dropme
In [ ]:
df = df.drop(dropme, axis=1)
train = train.drop(dropme, axis=1)
train_t = train_t.drop(dropme, axis=1)
Using a log transformation for some of the skewed features should help, as illustrated below. We use the raw data (not the standardized version) because the log transform needs non-negative values; we'll standardize the transformed variables later.
In [ ]:
fig, axes = plt.subplots(1,2, figsize=(8, 4))
sns.distplot(train['LotArea'], ax=axes[0])
sns.distplot(np.log1p(train['LotArea']), ax=axes[1])
In [ ]:
zfactors = sp.stats.skewtest(train)[0]
sns.distplot(zfactors)
In [ ]:
is_skewed = np.abs(zfactors) > 10
pd.Series(data=zfactors, index=train.columns)[is_skewed].sort_values().plot(kind='barh')
plt.title('Z-factor for skewtest')
Check that the skew is positive (right skew) for all of these features:
In [ ]:
assert all(np.sign(sp.stats.skew(train)[is_skewed]) > 0)
Let's apply a log1p transform to all of these features and plot the distributions again.
In [ ]:
def transform_skewed_columns(dfnum, is_skewed=is_skewed):
    """
    dfnum: dataframe to transform
    is_skewed: boolean iterable, one entry per column of dfnum, indicating if the column is skewed
    """
    dfnum2 = dfnum.copy()
    for feature, skewed_feature in zip(dfnum.columns, is_skewed):
        if skewed_feature:
            dfnum2[feature] = np.log1p(dfnum[feature])
    dfnum_t2 = dfnum2.apply(standardize)
    return dfnum_t2
# df has had the nearly-constant columns dropped, so select its numerical columns afresh
# rather than reusing the old is_numerical mask (which still refers to the dropped columns)
dfnum_t2 = transform_skewed_columns(df.loc[:, df.dtypes != object])
In [ ]:
dfnum_t2.iloc[:, is_skewed].columns
In [ ]:
zfactors2 = sp.stats.skewtest(dfnum_t2)[0]
pd.Series(data=zfactors2, index=dfnum_t2.columns)[is_skewed].sort_values().plot(kind='barh')
Now our originally skewed features look more symmetric.
In [ ]:
featureplot(dfnum_t2.iloc[:, is_skewed], nrows=2, figsize=(10,5))
In [ ]:
featureplot(dfnum_t2.iloc[:, ~is_skewed], nrows=2, figsize=(10, 5))
In [ ]:
dfnum_t2.index.names = ['Dataset', 'Id']
dfnum_t2.head()
In [ ]:
dfnum_t2.to_csv('transformed_dataset_dfnum_t2.csv', index=True)
In [ ]:
nfeatures = dfnum_t2.columns
target_t = logtarget.apply(standardize)
target_t.head()
In [ ]:
dfnum_t2.head()
In [ ]:
corr = dfnum_t2.loc['train', :].apply(
    lambda feature: sp.stats.pearsonr(feature, target_t['SalePrice'])).to_frame('pearsonr')
corr = corr.assign(correlation=corr['pearsonr'].map(lambda x: x[0]),
                   pvalue=corr['pearsonr'].map(lambda x: x[1]))
corr = corr.drop('pearsonr', axis=1)
In [ ]:
corr.head()
In [ ]:
corr.sort_values('pvalue', ascending=False)['correlation'].plot(kind='barh')
In [ ]:
corr.sort_values('pvalue').head()
In [ ]:
corr.sort_values('pvalue').tail()
Let's keep only the features that are sufficiently correlated with the price (absolute correlation greater than 0.2).
In [ ]:
min_correlation = 0.2
key_features = (corr[np.abs(corr['correlation']) > min_correlation]
                .sort_values(by='correlation', ascending=False).index.values)
key_features, key_features.size
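Just as a sanity check (not part of the original flow), the transformed dataset restricted to these key features looks like this:
In [ ]:
# Restrict the transformed dataset to the selected key features
dfnum_t2[key_features].head()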
In [ ]:
%store key_features
In [ ]: