In [1]:
# Import needed packages
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import sklearn.linear_model as linear_model
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import KFold
from IPython.display import HTML, display
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 20
sns.set()
pd.set_option('display.float_format', lambda x: '%.3f' % x)
In [44]:
# Load train and test.csv
train = pd.read_csv("./input/train.csv")
test = pd.read_csv("./input/test.csv")
In [3]:
train.head()
test.head()
Out[3]:
In [38]:
# Select quantitative and qualitative
quantitative = [f for f in train.columns if train.dtypes[f] != 'object']
quantitative.remove('SalePrice')
quantitative.remove('Id')
qualitative = [f for f in train.columns if train.dtypes[f] == 'object']
In [39]:
print("Number of quantitative features:", len(quantitative))
print("Number of qualitative features:", len(qualitative))
In [40]:
quantitative
qualitative
Out[40]:
In [7]:
# Explore datasets' sizes
print("# of samples in training set:", len(train))
print("# of samples in test set:", len(test))
In [8]:
# Explore features with missing values
missing = train.isnull().sum()
missing = missing[missing > 0]
missing.sort_values(inplace=True, ascending=False)
missing.plot.bar()
Out[8]:
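Before deciding on an imputation strategy, it can help to compare missingness across the training and test sets, since any fill rule has to cover both. A minimal sketch (assumes `train` and `test` are loaded as above; this check is not part of the original run):

missing_train = train.isnull().sum()
missing_test = test.isnull().sum()
missing_both = pd.concat([missing_train, missing_test], axis=1, keys=['train', 'test'])
# Keep only columns that are missing in at least one of the two sets
missing_both = missing_both[(missing_both > 0).any(axis=1)].sort_values('train', ascending=False)
print(missing_both)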
In [9]:
# Explore distribution of the target, SalePrice
import scipy.stats as st
y = train['SalePrice']
plt.figure(1)
plt.title('Johnson SU')
sns.distplot(y, kde=False, fit=st.johnsonsu)
plt.figure(2)
plt.title('Normal')
sns.distplot(y, kde=False, fit=st.norm)
plt.figure(3)
plt.title('Log Normal')
sns.distplot(y, kde=False, fit=st.lognorm)
Out[9]:
It appears that the Johnson SU distribution provides the best fit for SalePrice.
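To go beyond eyeballing the fitted curves, one option is to fit each candidate distribution's parameters and compare them with a Kolmogorov–Smirnov statistic (the choice of KS as the comparison metric is mine, not part of the original analysis):

# Sketch: compare candidate distributions by KS statistic (lower = better fit)
for dist in [st.johnsonsu, st.norm, st.lognorm]:
    params = dist.fit(y)
    ks_stat, ks_p = st.kstest(y, dist.name, args=params)
    print(dist.name, "KS statistic: %.4f" % ks_stat)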
In [10]:
# Test the quantitative variables for normality, using Shapiro Wilk, with alpha=0.01
# https://en.wikipedia.org/wiki/Shapiro%E2%80%93Wilk_test
test_normality = lambda x: stats.shapiro(x.fillna(0))[1] < 0.01
normal = pd.DataFrame(train[quantitative])
normal = normal.apply(test_normality)
# True = reject null hypothesis (i.e., NOT normally distributed)
print(normal)
None of the quantitative variables is normally distributed (every null hypothesis is rejected), so these should be transformed as well.
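As an illustration of why a transform helps, the sketch below applies log1p to one strongly skewed feature (GrLivArea, chosen here purely as an example) and re-runs the Shapiro–Wilk test:

# Illustrative only: Shapiro-Wilk p-value and skew before/after a log1p transform of GrLivArea
raw = train['GrLivArea'].fillna(0)
logged = np.log1p(raw)
print("raw   p-value: %.2e" % stats.shapiro(raw)[1])
print("log1p p-value: %.2e" % stats.shapiro(logged)[1])
print("skew before/after: %.2f / %.2f" % (raw.skew(), logged.skew()))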
In [11]:
f = pd.melt(train, value_vars=quantitative)
g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False)
g = g.map(sns.distplot, "value")
Many of these heavily skewed distributions are good candidates for a log transformation.
In [12]:
# Convert categorical features to dtype `category`
for c in qualitative:
    train[c] = train[c].astype('category')

def boxplot(x, y, **kwargs):
    sns.boxplot(x=x, y=y)
    x = plt.xticks(rotation=90)
f = pd.melt(train, id_vars=['SalePrice'], value_vars=qualitative)
g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False)
g.map(boxplot, "value", "SalePrice")
Out[12]:
(Below) For each categorical variable, SalePrice is partitioned into disjoint sets, one per category level. A one-way ANOVA test then checks whether those sets share the same mean. If a variable has little influence on price, the group means should be roughly equal; a decreasing p-value signals increasing disparity between the partitions.
In [13]:
# Apply one-way ANOVA. For each feature, null hypothesis = no difference between mean SalePrice
# for each level
def anova(frame):
    anv = pd.DataFrame()
    anv['feature'] = qualitative
    pvals = []
    for c in qualitative:
        samples = []
        for cls in frame[c].unique():
            s = frame[frame[c] == cls]['SalePrice'].values
            samples.append(s)
        pval = stats.f_oneway(*samples)[1]
        pvals.append(pval)
    anv['pval'] = pvals
    return anv.sort_values('pval')
a = anova(train)
a['disparity'] = np.log(1./a['pval'].values)
plt.figure(figsize=(10, 6))
sns.barplot(data=a, x='feature', y='disparity')
x=plt.xticks(rotation=90)
Out[13]:
In [14]:
# Encode categorical variables. For each categorical variable, order each level based on the mean SalePrice,
# from lowest to highest. Then encode each level from 1 to N, and store the encoding in `feature`_E
def encode(frame, feature):
    ordering = pd.DataFrame()
    ordering['val'] = frame[feature].unique()
    ordering.index = ordering.val
    ordering['spmean'] = frame[[feature, 'SalePrice']].groupby(feature).mean()['SalePrice']
    ordering = ordering.sort_values('spmean')
    ordering['ordering'] = range(1, ordering.shape[0] + 1)
    ordering = ordering['ordering'].to_dict()
    for cat, o in ordering.items():
        frame.loc[frame[feature] == cat, feature + '_E'] = o

qual_encoded = []
for q in qualitative:
    encode(train, q)
    qual_encoded.append(q + '_E')
print(qual_encoded)
In [15]:
train.head()
Out[15]:
In [16]:
# Calculate the Spearman correlation between SalePrice and each variable
def spearman(frame, features):
    spr = pd.DataFrame()
    spr['feature'] = features
    spr['spearman'] = [
        frame[f].corr(frame['SalePrice'], 'spearman') for f in features
    ]
    spr = spr.sort_values('spearman')
    plt.figure(figsize=(6, 0.25 * len(features)))
    sns.barplot(data=spr, y='feature', x='spearman', orient='h')
features = quantitative + qual_encoded
spearman(train, features)
OverallQual and the encoded Neighborhood have the strongest positive Spearman correlation with SalePrice.
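For a quick numeric shortlist rather than a plot, the same correlations can be ranked directly; a small sketch along the lines of the `spearman` helper above (not part of the original run):

# Sketch: top 10 features by absolute Spearman correlation with SalePrice
spr = pd.Series(
    {f: train[f].corr(train['SalePrice'], 'spearman') for f in features})
print(spr.reindex(spr.abs().sort_values(ascending=False).index).head(10))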
In [17]:
# Correlation between quantitative variables
plt.figure(1)
corr = train[quantitative + ['SalePrice']].corr()
sns.heatmap(corr, center=0., cmap="coolwarm")
# Correlation between qualitative variables
plt.figure(2)
corr = train[qual_encoded + ['SalePrice']].corr()
sns.heatmap(corr, center=0., cmap="coolwarm")
# Correlation between quantitative and qualitative variables
plt.figure(3)
corr = pd.DataFrame(
    np.zeros([len(quantitative) + 1, len(qual_encoded) + 1]),
    index=quantitative + ['SalePrice'],
    columns=qual_encoded + ['SalePrice'])
for q1 in quantitative + ['SalePrice']:
    for q2 in qual_encoded + ['SalePrice']:
        corr.loc[q1, q2] = train[q1].corr(train[q2])
sns.heatmap(corr, center=0., cmap="coolwarm")
Out[17]:
In [18]:
def pairplot(x, y, **kwargs):
    ax = plt.gca()
    ts = pd.DataFrame({'time': x, 'val': y})
    ts = ts.groupby('time').mean()
    ts.plot(ax=ax)
    plt.xticks(rotation=90)

f = pd.melt(train, id_vars=['SalePrice'], value_vars=quantitative + qual_encoded)
g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False, size=5)
g = g.map(pairplot, "value", "SalePrice")
In [19]:
features = quantitative + qual_encoded
model = TSNE(n_components=2, random_state=0, perplexity=50)
X = train[features].fillna(0.).values
tsne = model.fit_transform(X)
std = StandardScaler()
s = std.fit_transform(X)
pca = PCA(n_components=30)
pca.fit(s)
pc = pca.transform(s)
kmeans = KMeans(n_clusters=5)
kmeans.fit(pc)
fr = pd.DataFrame({'tsne1': tsne[:,0], 'tsne2': tsne[:, 1], 'cluster': kmeans.labels_})
sns.lmplot(data=fr, x='tsne1', y='tsne2', hue='cluster', fit_reg=False)
print(np.sum(pca.explained_variance_ratio_))
Out[19]:
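The printed value is the total variance captured by the 30 retained components. If you want to choose the component count from the data instead of fixing it at 30, a cumulative-variance check is one option (a sketch, not part of the original pipeline):

# Sketch: how many principal components are needed to explain 95% of the variance?
pca_full = PCA().fit(s)
cum_var = np.cumsum(pca_full.explained_variance_ratio_)
n_95 = np.argmax(cum_var >= 0.95) + 1
print("Components needed for 95% variance:", n_95)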
In [2]:
# Import sklearn
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.metrics import mean_squared_error, make_scorer
In [3]:
# Define the error measure used for official scoring: RMSE
scorer = make_scorer(mean_squared_error, greater_is_better=False)
def rmse_cv_train(model):
    # cross_val_score returns negated MSE (greater_is_better=False), so flip the sign before sqrt
    rmse = np.sqrt(-cross_val_score(
        model, X_train, y_train, scoring=scorer, cv=10))
    return rmse

def rmse_cv_test(model):
    # Expects X_test / y_test (e.g. a held-out set) to be defined before use
    rmse = np.sqrt(-cross_val_score(
        model, X_test, y_test, scoring=scorer, cv=10))
    return rmse

def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))
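These helpers assume the design matrices are already defined; once `X_train` and `y_train` exist (see the train/validation split further below), a quick linear baseline check might look like this (a sketch only, not part of the original scoring run):

# Sketch: cross-validated RMSE for a simple Ridge baseline, using the helper above
ridge = RidgeCV(alphas=[0.01, 0.1, 1.0, 10.0, 30.0])
print("Ridge CV RMSE: %.4f" % rmse_cv_train(ridge).mean())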
In [12]:
from scipy.stats import skew
class DataPreprocessor():
    """Encapsulates the transformations applied to the training dataset.

    Provides fit, transform, and fit_transform methods that can be used to
    prepare the test dataset for evaluation.
    """

    def __init__(self):
        pass

    def fit(self, df):
        self._fit_transform(df)

    def fit_transform(self, df):
        return self._fit_transform(df)

    def transform(self, df):
        return self._fit_transform(df, transform_only=True)

    def _fit_transform(self, df, transform_only=False):
        df = df.copy()
        # Step 1: Fill missing values
        df = self.fill_missing_inplace(df)
        # Step 2: Replace numerical features that are actually categories
        df.replace(
            {
                "MSSubClass": {
                    20: "SC20",
                    30: "SC30",
                    40: "SC40",
                    45: "SC45",
                    50: "SC50",
                    60: "SC60",
                    70: "SC70",
                    75: "SC75",
                    80: "SC80",
                    85: "SC85",
                    90: "SC90",
                    120: "SC120",
                    150: "SC150",
                    160: "SC160",
                    180: "SC180",
                    190: "SC190"
                },
                "MoSold": {
                    1: "Jan",
                    2: "Feb",
                    3: "Mar",
                    4: "Apr",
                    5: "May",
                    6: "Jun",
                    7: "Jul",
                    8: "Aug",
                    9: "Sep",
                    10: "Oct",
                    11: "Nov",
                    12: "Dec"
                }
            },
            inplace=True)
        # Step 3: Select numeric and categorical features
        if not transform_only:
            self.numerical_features, self.categorical_features = self.select_features(
                df)
        # Step 4: Encode categorical features based on level ordering.
        # When fitting, record the features encoded and their orderings.
        if not transform_only:
            self.categorical_features_encoded = []
            self.categorical_features_orderings = dict()
        for feature in self.categorical_features:
            if not transform_only:
                self.categorical_features_orderings[feature] = self.encode_feature(
                    df, feature)
                self.categorical_features_encoded.append(feature + '_E')
            for cat, o in self.categorical_features_orderings[feature].items():
                df.loc[df[feature] == cat, feature + '_E'] = o
        # Drop the original features after encoding
        df = df.drop(self.categorical_features, axis=1)
        # Step 5: Polynomial terms (sqrt, square, cube) on the top 10 existing numeric features
        if not transform_only:
            corr = df.corr()
            corr.sort_values(["SalePrice"], ascending=False, inplace=True)
            # Select the top 10 features based on correlation with SalePrice
            i = 0
            self.features_to_expand = []
            for feature, _ in corr.SalePrice[1:].items():
                if i >= 10:
                    break
                if "_E" not in feature:
                    self.features_to_expand.append(feature)
                    i += 1
        for feature in self.features_to_expand:
            df[feature + 'Sqrt'] = np.sqrt(df[feature])
            df[feature + '-2'] = df[feature]**2
            df[feature + '-3'] = df[feature]**3
            if not transform_only:
                self.numerical_features.extend(
                    [feature + 'Sqrt', feature + '-2', feature + '-3'])
        # Step 6: Transform skewed numerical features
        df_num = df[self.numerical_features].copy()
        if not transform_only:
            skewness = df_num.apply(lambda x: skew(x))
            skewness = skewness[abs(skewness) > 0.5]
            self.skewed_features = skewness.index
        df_num.loc[:, self.skewed_features] = np.log1p(
            df_num.loc[:, self.skewed_features])
        # Step 7: Generate dummy variables for categorical features
        df_cat = df[self.categorical_features_encoded]
        df_cat_dummies = pd.get_dummies(
            df_cat, columns=self.categorical_features_encoded)
        if not transform_only:
            self.cat_dummy_columns = df_cat_dummies.columns
        else:
            # Fill gaps in the dummy variables and align column order with the fitted set
            for col in self.cat_dummy_columns:
                if col not in df_cat_dummies.columns:
                    df_cat_dummies[col] = 0
            df_cat_dummies = df_cat_dummies[self.cat_dummy_columns]
        print("df_num shape:", df_num.shape)
        print("df_cat shape:", df_cat.shape)
        print("df_cat_dummies shape:", df_cat_dummies.shape)
        # Step 8: Apply a standard scaler to the numerical features
        if not transform_only:
            self.stdSc = StandardScaler()
            self.stdSc.fit(df_num.loc[:, self.numerical_features])
        df_num.loc[:, self.numerical_features] = self.stdSc.transform(
            df_num.loc[:, self.numerical_features])
        df_ = pd.concat([df_num, df_cat_dummies], axis=1)
        return df_
    def fill_missing_inplace(self, df_in):
        df = df_in.copy()
        # Handle missing values for features where median/mean or most common value doesn't make sense
        # Alley : data description says NA means "no alley access"
        df.loc[:, "Alley"] = df.loc[:, "Alley"].fillna("None")
        # BedroomAbvGr : NA most likely means 0
        df.loc[:, "BedroomAbvGr"] = df.loc[:, "BedroomAbvGr"].fillna(0)
        # BsmtQual etc : data description says NA for basement features is "no basement"
        df.loc[:, "BsmtQual"] = df.loc[:, "BsmtQual"].fillna("No")
        df.loc[:, "BsmtCond"] = df.loc[:, "BsmtCond"].fillna("No")
        df.loc[:, "BsmtExposure"] = df.loc[:, "BsmtExposure"].fillna("No")
        df.loc[:, "BsmtFinType1"] = df.loc[:, "BsmtFinType1"].fillna("No")
        df.loc[:, "BsmtFinType2"] = df.loc[:, "BsmtFinType2"].fillna("No")
        df.loc[:, "BsmtFullBath"] = df.loc[:, "BsmtFullBath"].fillna(0)
        df.loc[:, "BsmtHalfBath"] = df.loc[:, "BsmtHalfBath"].fillna(0)
        df.loc[:, "BsmtUnfSF"] = df.loc[:, "BsmtUnfSF"].fillna(0)
        df.loc[:, "TotalBsmtSF"] = df.loc[:, "TotalBsmtSF"].fillna(0)
        df.loc[:, "BsmtFinSF1"] = df.loc[:, "BsmtFinSF1"].fillna(0)
        df.loc[:, "BsmtFinSF2"] = df.loc[:, "BsmtFinSF2"].fillna(0)
        # CentralAir : NA most likely means No
        df.loc[:, "CentralAir"] = df.loc[:, "CentralAir"].fillna("N")
        # Condition : NA most likely means Normal
        df.loc[:, "Condition1"] = df.loc[:, "Condition1"].fillna("Norm")
        df.loc[:, "Condition2"] = df.loc[:, "Condition2"].fillna("Norm")
        # Electrical: Fill with the most common value
        df.loc[:, "Electrical"] = df.loc[:, 'Electrical'].fillna(
            df['Electrical'].value_counts().index[0])
        # EnclosedPorch : NA most likely means no enclosed porch
        df.loc[:, "EnclosedPorch"] = df.loc[:, "EnclosedPorch"].fillna(0)
        # External stuff : NA most likely means average
        df.loc[:, "ExterCond"] = df.loc[:, "ExterCond"].fillna("TA")
        df.loc[:, "ExterQual"] = df.loc[:, "ExterQual"].fillna("TA")
        # Exterior: NA = Other
        df.loc[:, "Exterior1st"] = df.loc[:, "Exterior1st"].fillna("Other")
        df.loc[:, "Exterior2nd"] = df.loc[:, "Exterior2nd"].fillna("Other")
        # Fence : data description says NA means "no fence"
        df.loc[:, "Fence"] = df.loc[:, "Fence"].fillna("No")
        # FireplaceQu : data description says NA means "no fireplace"
        df.loc[:, "FireplaceQu"] = df.loc[:, "FireplaceQu"].fillna("No")
        df.loc[:, "Fireplaces"] = df.loc[:, "Fireplaces"].fillna(0)
        # Functional : data description says NA means typical
        df.loc[:, "Functional"] = df.loc[:, "Functional"].fillna("Typ")
        # GarageType etc : data description says NA for garage features is "no garage"
        df.loc[:, "GarageType"] = df.loc[:, "GarageType"].fillna("No")
        df.loc[:, "GarageFinish"] = df.loc[:, "GarageFinish"].fillna("No")
        df.loc[:, "GarageQual"] = df.loc[:, "GarageQual"].fillna("No")
        df.loc[:, "GarageCond"] = df.loc[:, "GarageCond"].fillna("No")
        df.loc[:, "GarageArea"] = df.loc[:, "GarageArea"].fillna(0)
        df.loc[:, "GarageCars"] = df.loc[:, "GarageCars"].fillna(0)
        df.loc[:, "GarageYrBlt"] = df.loc[:, "GarageYrBlt"].fillna(0)
        # HalfBath : NA most likely means no half baths above grade
        df.loc[:, "HalfBath"] = df.loc[:, "HalfBath"].fillna(0)
        # HeatingQC : NA most likely means typical
        df.loc[:, "HeatingQC"] = df.loc[:, "HeatingQC"].fillna("TA")
        # KitchenAbvGr : NA most likely means 0
        df.loc[:, "KitchenAbvGr"] = df.loc[:, "KitchenAbvGr"].fillna(0)
        # KitchenQual : NA most likely means typical
        df.loc[:, "KitchenQual"] = df.loc[:, "KitchenQual"].fillna("TA")
        # LotFrontage : NA most likely means no lot frontage
        df.loc[:, "LotFrontage"] = df.loc[:, "LotFrontage"].fillna(0)
        # LotShape : NA most likely means regular
        df.loc[:, "LotShape"] = df.loc[:, "LotShape"].fillna("Reg")
        # MasVnrType : NA most likely means no veneer
        df.loc[:, "MasVnrType"] = df.loc[:, "MasVnrType"].fillna("None")
        df.loc[:, "MasVnrArea"] = df.loc[:, "MasVnrArea"].fillna(0)
        # MiscFeature : data description says NA means "no misc feature"
        df.loc[:, "MiscFeature"] = df.loc[:, "MiscFeature"].fillna("No")
        df.loc[:, "MiscVal"] = df.loc[:, "MiscVal"].fillna(0)
        # MSZoning: Fill with the most common value
        df.loc[:, "MSZoning"] = df.loc[:, 'MSZoning'].fillna(
            df['MSZoning'].value_counts().index[0])
        # OpenPorchSF : NA most likely means no open porch
        df.loc[:, "OpenPorchSF"] = df.loc[:, "OpenPorchSF"].fillna(0)
        # PavedDrive : NA most likely means not paved
        df.loc[:, "PavedDrive"] = df.loc[:, "PavedDrive"].fillna("N")
        # PoolQC : data description says NA means "no pool"
        df.loc[:, "PoolQC"] = df.loc[:, "PoolQC"].fillna("No")
        df.loc[:, "PoolArea"] = df.loc[:, "PoolArea"].fillna(0)
        # SaleCondition : NA most likely means normal sale
        df.loc[:, "SaleCondition"] = df.loc[:, "SaleCondition"].fillna(
            "Normal")
        # SaleType: NA most likely Oth
        df.loc[:, "SaleType"] = df.loc[:, "SaleType"].fillna("Oth")
        # ScreenPorch : NA most likely means no screen porch
        df.loc[:, "ScreenPorch"] = df.loc[:, "ScreenPorch"].fillna(0)
        # TotRmsAbvGrd : NA most likely means 0
        df.loc[:, "TotRmsAbvGrd"] = df.loc[:, "TotRmsAbvGrd"].fillna(0)
        # Utilities : NA most likely means all public utilities
        df.loc[:, "Utilities"] = df.loc[:, "Utilities"].fillna("AllPub")
        # WoodDeckSF : NA most likely means no wood deck
        df.loc[:, "WoodDeckSF"] = df.loc[:, "WoodDeckSF"].fillna(0)
        return df
    # Encode categorical variables. For each categorical variable, order each level based on the
    # mean SalePrice, from lowest to highest, then number the levels from 0 to N-1 and return the mapping.
    def encode_feature(self, frame, feature):
        ordering = pd.DataFrame()
        ordering['val'] = frame[feature].unique()
        ordering.index = ordering.val
        ordering['spmean'] = frame[[feature, 'SalePrice']].groupby(feature).mean()['SalePrice']
        ordering = ordering.sort_values('spmean')
        ordering['ordering'] = range(0, ordering.shape[0])
        ordering = ordering['ordering'].to_dict()
        return ordering
    def select_features(self, df):
        numerical_features = [
            f for f in df.columns if df.dtypes[f] != 'object'
        ]
        try:
            numerical_features.remove('SalePrice')
            numerical_features.remove('Id')
        except ValueError:
            pass
        categorical_features = [
            f for f in df.columns if df.dtypes[f] == 'object'
        ]
        return numerical_features, categorical_features
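To make the mean-price ordering concrete, here is a tiny illustration of what `encode_feature` returns on a toy frame (the values are invented purely for the example):

# Toy example (made-up data): levels are ranked by mean SalePrice, cheapest first
toy = pd.DataFrame({'Neighborhood': ['A', 'A', 'B', 'C', 'C'],
                    'SalePrice': [100000, 120000, 300000, 200000, 220000]})
print(DataPreprocessor().encode_feature(toy, 'Neighborhood'))
# Expected ordering: {'A': 0, 'C': 1, 'B': 2}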
In [13]:
pp = DataPreprocessor()
In [14]:
# Reload train.csv and test.csv
train = pd.read_csv("./input/train.csv")
test = pd.read_csv("./input/test.csv")
In [15]:
train_transformed = pp.fit_transform(train)
train_transformed.sort_index(axis=1, inplace=True)
In [16]:
test_transformed = pp.transform(test)
test_transformed.sort_index(axis=1, inplace=True)
In [18]:
y = np.log(train['SalePrice'])
In [19]:
# Partition the dataset into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(train_transformed, y, test_size = 0.2, random_state = 0)
print("X_train : " + str(X_train.shape))
print("X_val : " + str(X_val.shape))
print("y_train : " + str(y_train.shape))
print("y_val : " + str(y_val.shape))
In [25]:
# XGboost
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
fit_params = {'eval_metric': 'rmse'}
xgb_model = xgb.XGBRegressor()
clf = GridSearchCV(
    xgb_model, {
        'max_depth': [2, 3, 4, 5, 6],
        'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800]
    },
    fit_params=fit_params,
    verbose=1)
clf.fit(X_train, y_train)
print(clf.best_score_)
print(clf.best_params_)
y_train_pred = clf.predict(X_train)
y_val_pred = clf.predict(X_val)
# Look at predictions on training and validation set
print("RMSE on Training set :", rmse(y_train, y_train_pred))
print("RMSE on Validation set :", rmse(y_val, y_val_pred))
# Plot residuals
plt.scatter(
    y_train_pred,
    y_train_pred - y_train,
    c="blue",
    marker="s",
    label="Training data")
plt.scatter(
    y_val_pred,
    y_val_pred - y_val,
    c="lightgreen",
    marker="s",
    label="Validation data")
plt.title("XGBoost regression")
plt.xlabel("Predicted values")
plt.ylabel("Residuals")
plt.legend(loc="upper left")
plt.show()
# Plot predictions
plt.scatter(y_train_pred, y_train, c="blue", marker="s", label="Training data")
plt.scatter(
    y_val_pred, y_val, c="lightgreen", marker="s", label="Validation data")
plt.title("XGBoost regression")
plt.xlabel("Predicted values")
plt.ylabel("Real values")
plt.legend(loc="upper left")
plt.show()
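The grid above only tunes tree depth and the number of estimators. A wider search over learning rate and subsampling is a natural next step; the parameter ranges below are illustrative only, not the ones used for the submission:

# Sketch of a broader XGBoost search (illustrative parameter grid)
wider_grid = {
    'max_depth': [3, 4, 5],
    'n_estimators': [300, 600, 900],
    'learning_rate': [0.03, 0.05, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
}
clf_wide = GridSearchCV(xgb.XGBRegressor(), wider_grid, cv=5, verbose=1)
# clf_wide.fit(X_train, y_train)  # uncomment to run; this grid is large and will be slow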
In [21]:
clf.best_params_
Out[21]:
In [37]:
y_test_pred = clf.predict(test_transformed)
In [38]:
y_test_pred
Out[38]:
In [49]:
test_SalePrice = pd.Series(np.exp(y_test_pred), index=test.index, name='SalePrice')
In [50]:
submission = pd.concat([test['Id'], test_SalePrice], axis=1)
In [53]:
submission.to_csv('./input/submission.csv', index=False)
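Before uploading, a quick sanity check on the submission frame is cheap insurance (a sketch, not part of the original run):

# Sketch: basic checks before submitting
print(submission.shape)           # expect (len(test), 2)
print(submission.isnull().sum())  # expect no missing values
print(submission.head())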