In [80]:
import math
import warnings
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import tensorflow as tf
from IPython import display
from scipy import stats
from sklearn import linear_model
from sklearn import metrics
from sklearn import model_selection
from sklearn import preprocessing
from tensorflow.python.data import Dataset
warnings.filterwarnings('ignore')
%matplotlib inline
# Citations: Feature selection section adapted from Comprehensive Data Exploration with Python by Pedro Marcelino
In [81]:
# Load training examples
df_train = pd.read_csv('../datasets/house_prices/train.csv')
# Randomize the dataset so that observations are drawn independently and
# identically from the training distribution (the i.i.d. property)
df_train = df_train.reindex(np.random.permutation(df_train.index))
# Load test examples
df_test = pd.read_csv('../datasets/house_prices/test.csv')
df_test = df_test.reindex(np.random.permutation(df_test.index))
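# A seedable alternative (a sketch, not part of the original run): sample(frac=1)
# shuffles the whole frame, and random_state makes the shuffle reproducible.
df_train_shuffled = df_train.sample(frac=1, random_state=42)  # 42 is an arbitrary example seed; df_train_shuffled is illustrative only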
In [82]:
df_train.columns
Out[82]:
In [83]:
# PART 1: Analyze label field, SalePrice
df_train['SalePrice'].describe()
Out[83]:
In [84]:
# Look at distribution
sns.distplot(df_train['SalePrice'])
fig = plt.figure()
res = stats.probplot(df_train['SalePrice'], plot=plt)
In [85]:
# The plots above show SalePrice deviates from a normal distribution; we'll address that in Part 5.
# PART 2: Feature selection
# First, let's look at a correlation matrix between all of our variables
correlation_matrix = df_train.corr() # Calculates pairwise correlation of columns; defaults to the 'pearson' correlation coefficient.
_ = plt.subplots(figsize=(12, 9))
sns.heatmap(correlation_matrix, vmax=0.8, square=True)
Out[85]:
In [86]:
# Observations
# 1) TotalBsmtSF and 1stFlrSF are highly correlated; they likely encode the
# same information, so if we were to choose them as features for our model,
# we would only need one of them.
# 2) GarageCars and GarageArea are also highly correlated; same as above.
# 3) There are other pairs of highly correlated columns as well.
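# A quick numeric check of the pairs called out above (a sketch, using the
# same training frame):
print('TotalBsmtSF vs 1stFlrSF: %.2f' % df_train['TotalBsmtSF'].corr(df_train['1stFlrSF']))
print('GarageCars vs GarageArea: %.2f' % df_train['GarageCars'].corr(df_train['GarageArea']))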
In [87]:
# Let's look at a correlation matrix between the k variables that are
# most highly correlated with the label variable, SalePrice:
k = 10
variable_columns = correlation_matrix.nlargest(k, 'SalePrice')['SalePrice'].index
label_correlation_matrix = np.corrcoef(df_train[variable_columns].values.T)
sns.set(font_scale=1.25)
heatmap = sns.heatmap(label_correlation_matrix, cbar=True, annot=True, square=True, fmt='0.2f', annot_kws={'size': 10}, yticklabels=variable_columns.values, xticklabels=variable_columns.values)
In [88]:
# Selected features
# 1) OverallQual: high correlation with SalePrice and not much correlation
# with other features
# 2) GrLivArea: same as above
# 3) GarageCars: same as above; we don't need GarageArea since it encodes the
# same information as this variable.
# 4) TotalBsmtSF: same as above; we don't need 1stFlrSF
# 5) Let's use YearBuilt, too.
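# A quick check of the choices above (a sketch): each selected feature's
# correlation with the label, read straight from the matrix computed earlier.
print(correlation_matrix['SalePrice'][['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'YearBuilt']])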
In [89]:
# Now that we've selected features, let's look at correlation between features and label, SalePrice
sns.set()
cols = ['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'YearBuilt']
sns.pairplot(df_train[cols], size=2.5)
plt.show()
In [90]:
# PART 3: Clean missing data
total = df_train.isnull().sum().sort_values(ascending=False)
percent = (df_train.isnull().sum() / df_train.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)
Out[90]:
In [91]:
# Let's delete the variables (columns) with missing data. Note that none of
# the variables with missing data are ones we selected as features.
df_train = df_train.drop(missing_data[missing_data['Total'] > 1].index, axis=1)
# Let's also drop the one observation missing an 'Electrical' field.
df_train = df_train.drop(df_train.loc[df_train['Electrical'].isnull()].index)
# Double-check that no data is missing anymore
df_train.isnull().sum().sort_values(ascending=False)
Out[91]:
In [92]:
# PART 4: Remove outliers
# First, let's look at SalePrice to determine if there are any outliers that
# we need to remove
saleprice_scaled = preprocessing.StandardScaler().fit_transform(df_train['SalePrice'].values[:, np.newaxis])
low_range = saleprice_scaled[saleprice_scaled[:, 0].argsort()][:10]
high_range = saleprice_scaled[saleprice_scaled[:, 0].argsort()][-10:]
print('outer range (low) of the distribution:')
print(low_range)
print('\nouter range (high) of the distribution:')
print(high_range)
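# A programmatic version of the same check (a sketch; the |z| > 3 cutoff is a
# common convention, not from the original analysis):
print('observations with |z| > 3: %d' % (np.abs(saleprice_scaled[:, 0]) > 3).sum())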
In [93]:
# Hmm, looks like there are some outliers (the z-scores above 7).
# They still follow the overall trend, though, so we'll keep them.
# Let's look at GrLivArea vs SalePrice now:
data = pd.concat([df_train['SalePrice'], df_train['GrLivArea']], axis=1)
data.plot.scatter(x='GrLivArea', y='SalePrice', ylim=(0, 800000))
Out[93]:
In [94]:
# Looks like there are some GrLivArea outliers (the two farthest right points
# that don't seem to be following the overall trend). Let's drop those
# observations:
# Identify the Ids of the two largest-GrLivArea observations:
display.display(df_train.sort_values(by='GrLivArea', ascending=False)[:2])
df_train = df_train.drop(df_train[df_train['Id'] == 1299].index)
df_train = df_train.drop(df_train[df_train['Id'] == 524].index)
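# An Id-independent alternative (a sketch; the 4000 sq ft and $300,000 cutoffs
# are illustrative thresholds, not from the original): drop large-area,
# low-price points by condition instead of by hard-coded Id.
df_train = df_train.drop(df_train[(df_train['GrLivArea'] > 4000) & (df_train['SalePrice'] < 300000)].index)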
In [95]:
# PART 5: Scale label field if needed
# Observations:
# 1) Hmm, SalePrice isn't quite normally distributed (not symmetrical about
# the center; mean > median > mode)
# 2) The distribution has positive skewness (a longer right tail), so the
# data can be transformed with a log function to reduce the skew:
df_train['SalePrice'] = np.log(df_train['SalePrice'])
# Distribution after the log transformation (fit=stats.norm also graphs the normal curve)
sns.distplot(df_train['SalePrice'], fit=stats.norm)
fig = plt.figure()
res = stats.probplot(df_train['SalePrice'], plot=plt)
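# A numeric confirmation (a sketch): after the log transform, the skewness
# should be close to 0.
print('SalePrice skewness after log transform: %.2f' % df_train['SalePrice'].skew())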
In [96]:
# PART 6: Scale features if needed
sns.distplot(df_train['GrLivArea'], fit=stats.norm)
fig = plt.figure()
res = stats.probplot(df_train['GrLivArea'], plot=plt)
In [97]:
# Looks like GrLivArea also has a bit of a positive skew to its distribution.
df_train['GrLivArea'] = np.log(df_train['GrLivArea'])
sns.distplot(df_train['GrLivArea'], fit=stats.norm)
fig = plt.figure()
res = stats.probplot(df_train['GrLivArea'], plot=plt)
In [98]:
sns.distplot(df_train['TotalBsmtSF'], fit=stats.norm);
fig = plt.figure()
res = stats.probplot(df_train['TotalBsmtSF'], plot=plt)
In [99]:
# Create a column for a new binary categorical variable, HasBsmt:
# 1 if TotalBsmtSF > 0 (the house has a basement), 0 otherwise.
df_train['HasBsmt'] = 0
df_train.loc[df_train['TotalBsmtSF'] > 0, 'HasBsmt'] = 1
In [100]:
# Log transform TotalBsmtSF for observations that have basements.
df_train.loc[df_train['HasBsmt']==1,'TotalBsmtSF'] = np.log(df_train['TotalBsmtSF'])
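# An alternative worth noting (a sketch, not used above): np.log1p computes
# log(1 + x), which handles the zero-basement rows directly and avoids the
# HasBsmt flag. Shown on a fresh copy, since df_train's TotalBsmtSF has
# already been transformed at this point.
raw_bsmt = pd.read_csv('../datasets/house_prices/train.csv')['TotalBsmtSF']  # raw_bsmt is illustrative only
bsmt_log1p = np.log1p(raw_bsmt)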
In [101]:
sns.distplot(df_train[df_train['TotalBsmtSF']>0]['TotalBsmtSF'], fit=stats.norm);
fig = plt.figure()
res = stats.probplot(df_train[df_train['TotalBsmtSF']>0]['TotalBsmtSF'], plot=plt)
In [102]:
# Check homoscedasticity between variables:
# Linear regression models depend on another assumption: homoscedasticity.
# Homoscedasticity (meaning "same variance") is the property that the label
# variable shows a similar level of variance across the range of each feature
# variable.
# Heteroscedasticity (the opposite of homoscedasticity) can be detected using
# a bivariate graphical approach: a cone shape in the scatter plot of two
# variables indicates heteroscedasticity.
# Note: most of the time, if you can ensure normality, homoscedasticity follows.
plt.scatter(df_train['GrLivArea'], df_train['SalePrice'])
Out[102]:
In [103]:
plt.scatter(df_train[df_train['TotalBsmtSF']>0]['TotalBsmtSF'], df_train[df_train['TotalBsmtSF']>0]['SalePrice']);
In [205]:
# Load training examples
df_train = pd.read_csv('../datasets/house_prices/train.csv')
# Randomize the dataset so that observations are drawn independently and
# identically from the training distribution (the i.i.d. property)
df_train = df_train.reindex(np.random.permutation(df_train.index))
# Load test examples
df_test = pd.read_csv('../datasets/house_prices/test.csv')
df_test = df_test.reindex(np.random.permutation(df_test.index))
In [224]:
# PART 7: Training models
FEATURE_FIELDS = ['OverallQual',
                  'GrLivArea',
                  'GarageCars',
                  'TotalBsmtSF',
                  'YearBuilt']
TARGET_FIELD = 'SalePrice'
# We need the min and max years to one-hot encode YearBuilt.
# Certain years show up in only one of the train or test subsets, which would
# make the number of one-hot encoded features differ, leading to errors at prediction time.
MIN_YEAR = min(df_train['YearBuilt'].min(), df_test['YearBuilt'].min())
MAX_YEAR = max(df_train['YearBuilt'].max(), df_test['YearBuilt'].max())
MIN_GARAGE_CARS = int(min(df_train['GarageCars'].min(), df_test['GarageCars'].min()))
MAX_GARAGE_CARS = int(max(df_train['GarageCars'].max(), df_test['GarageCars'].max()))
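# A minimal sketch of why these bounds matter (not part of the original
# pipeline): pinning the category list keeps one-hot columns aligned between
# train and test even when a year appears in only one of them.
year_categories = list(range(MIN_YEAR, MAX_YEAR + 1))  # year_categories/aligned are illustrative names
aligned = pd.get_dummies(
    pd.Series(pd.Categorical(df_train['YearBuilt'], categories=year_categories)),
    prefix='YearBuilt')
print(aligned.shape)  # column count is fixed by year_categories, not by the data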
# Write some helper functions to be reused throughout
def preprocess_features(dataframe):
    """Prepares features from dataframe for model training.

    Args:
        dataframe: A Pandas DataFrame containing housing data.
    Returns:
        A DataFrame containing preprocessed features to be used for the model.
    """
    processed_features = dataframe[FEATURE_FIELDS].copy()
    # Numerical features (scale them if needed to achieve normality)
    processed_features['GrLivArea'] = np.log(processed_features['GrLivArea'])
    # Some TotalBsmtSF values are 0 for houses without a basement.
    # Since we cannot log transform 0 values, we add a temporary HasBsmt
    # indicator to determine which values to log transform, leaving 0 values
    # as 0.
    processed_features['HasBsmt'] = 0
    processed_features.loc[processed_features['TotalBsmtSF'] > 0, 'HasBsmt'] = 1
    # Log transform TotalBsmtSF if it is not 0 (the house has a basement)
    processed_features.loc[processed_features['HasBsmt'] == 1, 'TotalBsmtSF'] = np.log(processed_features['TotalBsmtSF'])
    # The HasBsmt field is no longer needed, so remove it
    processed_features = processed_features.drop(labels='HasBsmt', axis=1)
    # Impute missing fields with the column mean
    processed_features = processed_features.fillna(processed_features.mean())
    # Categorical features (one-hot encoding, currently disabled):
    # First, pin the category lists so that values which appear in only one of
    # the train/test subsets still produce the same columns.
    # processed_features['YearBuilt'] = pd.Categorical(
    #     processed_features['YearBuilt'],
    #     categories=range(MIN_YEAR, MAX_YEAR + 1))
    # processed_features['GarageCars'] = pd.Categorical(
    #     processed_features['GarageCars'].astype(int),
    #     categories=range(MIN_GARAGE_CARS, MAX_GARAGE_CARS + 1))
    # year_built_one_hot = pd.get_dummies(processed_features['YearBuilt'], prefix='YearBuilt', drop_first=True)
    # garage_cars_one_hot = pd.get_dummies(processed_features['GarageCars'], prefix='GarageCars', drop_first=True)
    # processed_features = processed_features.drop(labels=['YearBuilt', 'GarageCars'], axis=1)
    # processed_features = pd.concat([processed_features, year_built_one_hot, garage_cars_one_hot], axis=1)
    return processed_features

def _remove_missing_data(dataframe):
    """Removes observations that have missing fields from dataframe."""
    before_length = len(dataframe.index)
    rows_with_missing_data = dataframe.loc[dataframe[FEATURE_FIELDS].isnull().sum(axis=1) > 0].index
    print("Rows with missing data: %s" % rows_with_missing_data)
    dataframe = dataframe.drop(rows_with_missing_data)
    after_length = len(dataframe.index)
    num_rows_removed = before_length - after_length
    print("Removed %d rows with missing features." % num_rows_removed)
    return dataframe
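# Usage note (a sketch; this helper is defined but not invoked above): it
# offers a row-dropping alternative to the mean imputation in
# preprocess_features, e.g.:
# df_train = _remove_missing_data(df_train)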
def preprocess_targets(dataframe):
    """Prepares target features (labels) from dataframe for model training.

    Args:
        dataframe: A Pandas DataFrame containing housing data.
    Returns:
        A DataFrame containing preprocessed targets to be used for model.
    """
    processed_targets = pd.DataFrame()
    processed_targets[TARGET_FIELD] = np.log(dataframe[TARGET_FIELD])
    return processed_targets

# Prepare train targets (labels)
training_targets = preprocess_targets(df_train)
# Split the training dataset into training and validation subsets
training_examples = preprocess_features(df_train)
training_examples, validation_examples, training_targets, validation_targets = model_selection.train_test_split(
    training_examples, training_targets, test_size=0.25)
# Prepare test examples to make predictions on
test_examples = preprocess_features(df_test)
print("Training examples summary:")
display.display(training_examples.describe())
print("Training targets summary:")
display.display(training_targets.describe())
print("Validation examples summary:")
display.display(validation_examples.describe())
print("Validation targets summary:")
display.display(validation_targets.describe())
print("Test examples summary:")
display.display(test_examples.describe())
print("Test targets summary:")
display.display(test_targets.describe())
In [225]:
def output_model_eval(training_targets, training_predictions, validation_targets, validation_predictions):
    # Calculate root mean squared error
    mse = metrics.mean_squared_error(training_targets, training_predictions)
    rmse = math.sqrt(mse)
    print("Training mean squared error: %.4f" % mse)
    print("Training root mean squared error: %.2f" % rmse)
    mse = metrics.mean_squared_error(validation_targets, validation_predictions)
    rmse = math.sqrt(mse)
    print("Validation mean squared error: %.4f" % mse)
    print("Validation root mean squared error: %.2f" % rmse)
    # Variance score: 1 is a perfect prediction
    r2_score = metrics.r2_score(training_targets, training_predictions)
    print("Training variance (r2) score: %.4f" % r2_score)
    r2_score = metrics.r2_score(validation_targets, validation_predictions)
    print("Validation variance (r2) score: %.4f" % r2_score)
    # Plot actual values vs. predictions for both training and validation sets,
    # with the identity line for reference
    fig, ax = plt.subplots(figsize=(12, 9))
    validation_plt = plt.scatter(validation_targets, validation_predictions, color='red', s=10)
    training_plt = plt.scatter(training_targets, training_predictions, color='blue', s=10)
    lims = [
        np.min([ax.get_xlim(), ax.get_ylim()]),  # min of both axes
        np.max([ax.get_xlim(), ax.get_ylim()]),  # max of both axes
    ]
    # Plot both limits against each other (the identity line)
    ax.plot(lims, lims, 'k-', alpha=0.75, zorder=0)
    ax.set_aspect('equal')
    ax.set_xlim(lims)
    plt.legend([validation_plt, training_plt], ['Validation', 'Training'])
    plt.title('Actual vs. predicted')
    plt.xlabel('Actual')
    plt.ylabel('Predicted')
    plt.show()
In [226]:
def linear_regression(training_examples,
                      training_targets,
                      validation_examples,
                      validation_targets):
    # For models with one-hot encoded variables, you need to set
    # 'fit_intercept' to False to avoid the dummy variable trap.
    model = linear_model.LinearRegression()
    model.fit(training_examples, training_targets)
    training_predictions = model.predict(training_examples)
    validation_predictions = model.predict(validation_examples)
    output_model_eval(training_targets, training_predictions, validation_targets, validation_predictions)
    return model

def ridgeCV_regression(training_examples,
                       training_targets,
                       validation_examples,
                       validation_targets):
    model = linear_model.RidgeCV()
    model.fit(training_examples, training_targets)
    training_predictions = model.predict(training_examples)
    validation_predictions = model.predict(validation_examples)
    output_model_eval(training_targets, training_predictions, validation_targets, validation_predictions)
    return model

model = linear_regression(training_examples, training_targets, validation_examples, validation_targets)
# Apply np.exp to undo the log transform done on the label, SalePrice
test_predictions = np.exp(model.predict(test_examples))
# Keep test_examples' index so the predictions align with the right rows later
test_predictions_df = pd.DataFrame(test_predictions, index=test_examples.index)
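# For comparison (a sketch, not run in the original): the RidgeCV model
# defined above can be trained and evaluated the same way.
ridge_model = ridgeCV_regression(training_examples, training_targets, validation_examples, validation_targets)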
In [245]:
# Prepare final dataframe for submission
test_examples_with_id = pd.concat([df_test['Id'], test_examples], axis=1)
test_examples_with_id['SalePrice'] = test_predictions_df
submission_df = pd.DataFrame({'Id': test_examples_with_id.Id, 'SalePrice': test_examples_with_id.SalePrice})
# Spot-check a single prediction by Id
test_examples_with_id.loc[test_examples_with_id['Id'] == 2036]
Out[245]:
In [246]:
# Convert submission_df to csv
submission_df.to_csv('submission.csv', index=False)