In [ ]:
import itertools
import os
import sys
import pandas as pd
import numpy as np
import scipy as sp
import sklearn as sk
import sklearn.preprocessing
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smapi
In [ ]:
sys.path.insert(1, os.path.join(sys.path[0], '..')) # add parent directory to path
import samlib
import importlib
importlib.reload(samlib);
In [ ]:
import logging
logging.basicConfig(level=logging.DEBUG)
In [ ]:
target = pd.read_csv('../data/train_target.csv')
In [ ]:
target.describe()
The sale price is in the hundreds of thousands, so let's divide it by 1000 to get more manageable numbers.
In [ ]:
target = target / 1000
In [ ]:
logtarget = np.log1p(target)
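Note that np.log1p computes log(1 + x) and np.expm1 inverts it exactly, so predictions made on the log scale can be mapped back to prices later. A quick sanity check (just a verification, not part of the pipeline):
In [ ]:
# expm1 undoes log1p, so we can recover the prices (in $1000s) from logtarget
assert np.allclose(np.expm1(logtarget), target)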
In [ ]:
def read():
    """Read the training and test data and return a single dataframe
    with a ['Dataset', 'Id'] multi-index."""
    raw_train = pd.read_csv('../data/train_prepared_light.csv')
    raw_test = pd.read_csv('../data/test_prepared_light.csv')
    df = pd.concat([raw_train, raw_test], keys=['train', 'test'])
    df.index.names = 'Dataset', 'Id'
    return df

df = read()
In [ ]:
pp = samlib.Pipeline(df.copy())
assert pp == df # the pipeline output equals df
In [ ]:
def select_categorical_features(df):
    return df.loc[:, df.dtypes == object]

pp.append(select_categorical_features)
We've got 42 categorical features. We can use the describe method to get some statistics:
In [ ]:
pp().shape
In [ ]:
pp().describe()
In [ ]:
pp().isnull().sum()
Number of categories per feature
In [ ]:
plt.figure(figsize=(12, 10))
pp().describe().loc['unique'].sort_values(ascending=False).plot(kind='barh')
plt.title('Number of categories per feature')
Number of nulls per feature
In [ ]:
nulls = pp().isnull().sum()
In [ ]:
plt.figure(figsize=(12, 10))
ax = nulls[nulls > 0].sort_values(ascending=False).plot(kind='barh')
plt.title('Number of nulls per feature')
But that's a lot of numbers to digest. Better get started plotting! To help with plotting, but also to improve linear regression models, we're going to standardize our data. Before that, though, we must deal with the NaN values. See http://sebastianraschka.com/Articles/2014_about_feature_scaling.html for background on feature scaling.
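For reference, the standardization itself is a one-liner with scikit-learn once the features are numeric. A minimal sketch, assuming a hypothetical all-numeric dataframe num_df (our categorical columns must be encoded and their NaNs filled before this applies):
In [ ]:
# Sketch only: center each column to mean 0 and scale to unit variance.
# num_df is a hypothetical all-numeric dataframe, not defined in this notebook.
scaler = sk.preprocessing.StandardScaler()
num_df_std = pd.DataFrame(scaler.fit_transform(num_df),
                          index=num_df.index, columns=num_df.columns)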
In [ ]:
def replace_bad_nulls(df):
    """Replace the NaNs that really mean 'feature absent' with the string 'none'."""
    nulls = df.isnull().sum()
    # Columns with at least as many nulls as 'BsmtFinType1' (in ascending
    # null-count order): for these, NaN denotes an absent feature, e.g. no basement.
    bad_nulls_colz = nulls[nulls > 0].sort_values()['BsmtFinType1':].index
    return samlib.fillna(df, lambda x: 'none', bad_nulls_colz)

pp.append(replace_bad_nulls)
Next, replace the remaining (true) nulls with each column's mode (work in progress).
In [ ]:
def fill_nulls_with_mode(df):
    """Fill the remaining NaNs with the most frequent value of each column."""
    return samlib.fillna(df, lambda dg: dg.mode().loc[0])

pp.append(fill_nulls_with_mode)
In [ ]:
pp()['LotShape'].head()
In [ ]:
def ordered_categories(series):
    """Return the per-category median log prices, sorted in descending order."""
    dg = pd.DataFrame(series).copy()
    dg.loc['train', 'LogSalePrice'] = logtarget.values
    return dg.groupby(dg.columns[0]).median().sort_values('LogSalePrice', ascending=False)

ordered_categories(pp()['LotShape'])
In [ ]:
def categorize(feature):
    """Convert an object column into a Categorical whose categories are
    listed in descending median log price order."""
    feature = feature.copy()
    return pd.Categorical(feature, ordered_categories(feature).index)

categorize(pp()['LotShape'])
In [ ]:
def objects_to_categories(df):
    return df.apply(categorize)

objects_to_categories(pp())['LotShape'].head()
In [ ]:
pp.append(objects_to_categories)
pp()['LotShape'].head()
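Since every column is now a Categorical whose categories are listed in median-price order, a simple ordinal encoding falls out of .cat.codes, where code 0 is the category with the highest median log price. A possible next step, sketched here rather than added to the pipeline:
In [ ]:
# Sketch: map each category to its rank in the median-price ordering
# (code 0 = highest median log price). Shown only to illustrate what
# the ordering buys us; not appended to the pipeline.
codes = pp().apply(lambda col: col.cat.codes)
codes['LotShape'].head()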
In [ ]:
def order_columns_by_uniques(df):
    """Order the columns by decreasing number of unique categories."""
    colz_ordered_by_unique = df.describe().loc['unique'].sort_values(ascending=False).index
    return df.reindex(columns=colz_ordered_by_unique)

pp.append(order_columns_by_uniques)
pp().head()
See http://seaborn.pydata.org/tutorial/categorical.html for some ideas
In [ ]:
df = pp()
df.shape
In [ ]:
train = pp().loc['train']
In [ ]:
def plot_price_dist(y='LotShape', data=train, logtarget=logtarget, **kwargs):
    """Plot the log price distribution for each category of feature `y`."""
    dg = data[[y]].copy()
    dg['LogSalePrice'] = logtarget
    # The categories are already ordered by median log price (see categorize)
    sns.violinplot(x="LogSalePrice", y=y, data=dg, scale='width', **kwargs)
    # sns.swarmplot(x="LogSalePrice", y=y, data=dg, color="w", alpha=.5)
In [ ]:
def plot_value_counts(y=None, data=df, **kwargs):
    """Plot the value counts of column `y`, or of a single-column dataframe."""
    if y is None:
        if data.shape[1] == 1:
            y = data.columns[0]
        else:
            raise ValueError('Must pass y or a dataframe with a single column')
    return sns.countplot(y=y, data=data, **kwargs)
In [ ]:
train.shape
Use samlib.featureplots to plot the price distribution and the value counts of each feature. This gives an idea of how the values are distributed for each categorical variable. We can see that some features, such as 'Condition2', are almost constant and so are unlikely to have much impact on predicting the sale price.
In [ ]:
samlib.featureplots(train, nrows=21, ncols=4, figsize=(2, 8), plotfuncs=(plot_price_dist, plot_value_counts), axis=1)
Let's build an indicator to detect the bad features, based on two criteria:
In [ ]:
looks_good = 'Neighborhood'
looks_bad = 'Condition1'
First, the medians of the log price within each category should have maximum variance: the lower the variance, the less the categories help us distinguish prices.
In [ ]:
# Spread of the per-category median log prices, normalized by the overall median
med_price = logtarget.median().values
df = pp()
sharps = df.apply(lambda col: ordered_categories(col).std().values).iloc[0] / med_price
In [ ]:
fig, ax = plt.subplots(1,1, figsize=(12, 12))
sharps.sort_values().plot(kind='barh', ax=ax)
plt.title('Std of per-category median log prices, by feature (higher is better)')
Second, the entropy of the value counts should be large: the higher the entropy, the more uniformly the values are distributed across the categories.
In [ ]:
from scipy.stats import entropy
df = pp()
unicounts = df.apply(lambda ser: entropy(ser.value_counts()))
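To build some intuition for the numbers: scipy's entropy normalizes the counts to probabilities, so four equal counts score ln(4) ≈ 1.39, while a nearly constant feature scores close to zero.
In [ ]:
# Uniform counts maximize entropy; a near-constant feature is close to 0
entropy([25, 25, 25, 25]), entropy([97, 1, 1, 1])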
In [ ]:
fig, ax = plt.subplots(1,1, figsize=(12, 12))
unicounts.sort_values().plot(kind='barh', ax=ax)
plt.title('Entropy of value counts, by feature (higher is better)')
Good features combine high variability of the per-category medians with more uniform counts, so we take the product of the two measures as our 'goodness' score.
In [ ]:
goodness = sharps * unicounts
In [ ]:
fig, ax = plt.subplots(1,1, figsize=(12, 12))
goodness.sort_values().plot(kind='barh', ax=ax)
plt.title('Goodness of each feature (higher is better)')
In [ ]:
topcolz = goodness.sort_values(ascending=False)[:12].index
topcolz
Let's plot our top features and check that they indeed appear helpful (good variability in the medians and high entropy in the counts).
In [ ]:
samlib.featureplots(train[topcolz], nrows=6, ncols=4, figsize=(2, 8), plotfuncs=(plot_price_dist, plot_value_counts), axis=1)
In [ ]:
df = pp()
df = df[goodness.sort_values(ascending=False).index]  # reorder the columns by decreasing goodness
In [ ]:
df.to_csv('cleaned_categorical_vars_with_colz_sorted_by_goodness.csv', index=True)
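The ['Dataset', 'Id'] multi-index is written out as two ordinary columns, so it must be restored explicitly when the file is read back (note that CSV does not preserve the Categorical dtypes):
In [ ]:
# Round-trip check: restore the multi-index on load
df_check = pd.read_csv('cleaned_categorical_vars_with_colz_sorted_by_goodness.csv',
                       index_col=['Dataset', 'Id'])
df_check.shape == df.shape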
In [ ]: