In [ ]:
import itertools
import os
import sys
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smapi
import sklearn as sk
import sklearn.preprocessing
import sklearn.model_selection
import sklearn.base
sys.path.insert(1, os.path.join(sys.path[0], '..'))
import samlib
In [ ]:
%store -r
In [ ]:
key_features
In [ ]:
dfnum_t2 = pd.read_csv('transformed_dataset_dfnum_t2.csv', index_col=['Dataset','Id'])
In [ ]:
dfnum_t2.head()
In [ ]:
dfnum_t2.tail()
In [ ]:
target = pd.read_csv('../data/train_target.csv')
In [ ]:
scaler = sk.preprocessing.StandardScaler()

def standardize(df):
    # Scale each column to zero mean and unit variance, keeping the column names.
    _values = sk.preprocessing.StandardScaler().fit_transform(df)
    return pd.DataFrame(data=_values, columns=df.columns)

def transform_target(target):
    # Work with log-prices (in thousands of dollars) and standardize them.
    logtarget = np.log1p(target / 1000)
    return scaler.fit_transform(logtarget)

def inverse_transform_target(target_t):
    # Undo the standardization and the log transform to get prices back.
    logtarget = scaler.inverse_transform(target_t)
    return np.expm1(logtarget) * 1000

target_t = transform_target(target)
In [ ]:
# Test: the round trip should recover the original prices (up to floating-point error)
assert np.allclose(target, inverse_transform_target(target_t))
In [ ]:
data = dfnum_t2.loc['train',:].copy()
data['SalePrice'] = target_t
In [ ]:
fig, axes = plt.subplots(2, 2, figsize=(10, 10))
for feature, ax in zip(key_features[:4], itertools.chain.from_iterable(axes)):
    ax.plot(data[feature], data['SalePrice'], 'o')
    ax.set(xlabel=feature, ylabel='SalePrice')
Let's build a simple linear regression model based on these 4 features.
In [ ]:
regression1 = smapi.ols("SalePrice ~ OverallQual + GrLivArea + GarageCars + GarageArea", data=data).fit()
regression1.summary()
R-squared equals 0.79, which is pretty good for a first try. Let's see what happens if we include all of our numerical features.
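For reference, the same number can be read directly off the fitted results object (rsquared is a standard statsmodels attribute):
In [ ]:
# R-squared of the 4-feature fit, taken from the statsmodels results object
regression1.rsquared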
In [ ]:
data.columns
Statsmodels gets confused by column names that start with a digit, so let's rename those columns first.
In [ ]:
def rename_columns(df):
    # Statsmodels formulas cannot handle names that begin with a digit.
    return df.rename(columns={'1stFlrSF': 'FirstFlrSF', '2ndFlrSF': 'SndFlrSF'})

data = rename_columns(data)
In [ ]:
data.columns
In [ ]:
desc = 'SalePrice ~ ' + ' + '.join(data.drop('SalePrice', axis=1).columns)
desc
As can be seen below, using all of the numerical features improves R-squared to 0.88, which is pretty good, though there is of course a risk of overfitting.
In [ ]:
regression2 = smapi.ols(desc, data=data).fit()
regression2.summary()
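Since adding many regressors inflates plain R-squared, it is worth comparing the adjusted R-squared and information criteria of the two fits as well; a quick sketch using standard statsmodels result attributes:
In [ ]:
# Compare the two fits: adjusted R-squared penalizes extra regressors,
# and lower AIC/BIC indicates a better fit/complexity trade-off.
for name, results in [('4 features', regression1), ('all numerical', regression2)]:
    print(name, results.rsquared, results.rsquared_adj, results.aic, results.bic)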
In [ ]:
def get_data(X, y):
    df = X.copy()
    df['SalePrice'] = y
    return df

def ols1(X, y):
    data = get_data(X, y)
    return smapi.ols("SalePrice ~ OverallQual + GrLivArea + GarageCars + GarageArea", data=data)

def ols2(X, y):
    data = get_data(X, y)
    return smapi.ols(desc, data=data)
In [ ]:
def rmse(prediction, exact):
    return np.mean((prediction - exact)**2.0)**0.5

def run_experiment(estimator, scoring=rmse):
    Xtrain, Xtest, ytrain, ytest = sk.model_selection.train_test_split(
        data.drop('SalePrice', axis=1), data['SalePrice'])
    model = estimator(Xtrain, ytrain).fit()
    return scoring(model.predict(Xtest), ytest)

def cross_validate(estimator, cv=5):
    return np.array([run_experiment(estimator) for _ in range(cv)])

for model in [ols1, ols2]:
    errors = cross_validate(model)
    print(errors, errors.mean())
In [ ]:
for model in [ols1, ols2]:
    # cross_val_score returns negative MSE, so negate it and take the square root to get RMSE.
    rmse_scores = np.sqrt(-sk.model_selection.cross_val_score(
        samlib.Regressor(model), data.drop('SalePrice', axis=1), y=data['SalePrice'],
        scoring='neg_mean_squared_error', cv=5))
    print(rmse_scores, rmse_scores.mean())
In [ ]:
dfnum_t2 = rename_columns(dfnum_t2)
submission_t = regression2.predict(dfnum_t2.loc['test',:])
In [ ]:
submission = inverse_transform_target(submission_t)
submission
In [ ]:
def save(filename, submission):
    # The submission Ids continue after the training set, starting at 1461.
    df = pd.DataFrame(data={
        "Id": np.arange(len(submission)) + 1461,
        "SalePrice": submission
    })
    df.to_csv(filename, index=False)

save('ols_key_numerical_features_only.csv', submission)
Statsmodels has special plots for exploring the outcome of a regression model: http://statsmodels.sourceforge.net/devel/examples/notebooks/generated/example_regression_plots.html
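For instance, plot_regress_exog draws the fitted values, residuals, and partial-regression plots for one regressor at a time; a minimal sketch for the 4-feature model above (the choice of GrLivArea is arbitrary):
In [ ]:
# Diagnostic plots for a single regressor of the 4-feature model
import statsmodels.api as sm

fig = plt.figure(figsize=(12, 8))
sm.graphics.plot_regress_exog(regression1, 'GrLivArea', fig=fig)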
In [ ]: