In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from autolearn.autolearn import AutoLearn

In [2]:
%matplotlib inline
float_formatter = lambda x: "%.2f" % x
np.set_printoptions(formatter={'float_kind': float_formatter})

• TODO: Add an option to remove outliers from all columns of the dataset?
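A rough sketch of what that option might do (a hypothetical helper, not part of AutoLearn): drop any row that falls outside 1.5×IQR on a numeric column.

def remove_outliers(df, k=1.5):
    """Drop rows outside k*IQR on any numeric column (hypothetical sketch)."""
    numeric = df.select_dtypes(include=[np.number])
    q1, q3 = numeric.quantile(0.25), numeric.quantile(0.75)
    iqr = q3 - q1
    outside = (numeric < q1 - k * iqr) | (numeric > q3 + k * iqr)
    return df[~outside.any(axis=1)]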

Import Data


In [3]:
train_file = os.path.join(os.getcwd(), 'data', 'train.csv')
test_file = os.path.join(os.getcwd(), 'data', 'test.csv')
submission_folder = os.path.join(os.getcwd(), 'submissions')

In [4]:
autolearn = AutoLearn(encode_categoricals=True, impute=True, onehot=True,
                      standardize=False, decompose=False, target='SalePrice',
                      id_col='Id', error_metric='rmsle', impute_strategy='median')
# Alternative run: identical settings with error_metric='mse'.
training_data, X_train, y = autolearn.process_training_data(train_file)
training_data.head()


Out[4]:
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice
0 1 60 3 65.0 8450 1 NaN 3 3 0 ... 0 NaN NaN NaN 0 2 2008 8 4 208500
1 2 20 3 80.0 9600 1 NaN 3 3 0 ... 0 NaN NaN NaN 0 5 2007 8 4 181500
2 3 60 3 68.0 11250 1 NaN 0 3 0 ... 0 NaN NaN NaN 0 9 2008 8 4 223500
3 4 70 3 60.0 9550 1 NaN 0 3 0 ... 0 NaN NaN NaN 0 2 2006 8 0 140000
4 5 60 3 84.0 14260 1 NaN 0 3 0 ... 0 NaN NaN NaN 0 12 2008 8 4 250000

5 rows × 81 columns
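The error_metric='rmsle' setting selects root mean squared logarithmic error. AutoLearn's implementation (it surfaces in a warning during cross-validation below) is equivalent to:

def rmsle(y, y_pred):
    # RMSLE as computed in autolearn.py: root of the mean squared
    # difference between log1p-transformed predictions and targets.
    return np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y), 2)))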


In [5]:
print(training_data.shape)
print(X_train.shape)


(1460, 81)
(1460, 133)
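The jump from 81 columns to 133 reflects the feature processing: the Id and SalePrice columns are presumably excluded from the feature matrix, and onehot=True expands the encoded categorical columns into indicator columns.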

In [6]:
test_data, X_test = autolearn.process_test_data(test_file)
test_data.head()


Out[6]:
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities ... ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition
0 1461 20 2 80.0 11622 1 NaN 3 3 0 ... 120 0 NaN 2 NaN 0 6 2010 8 4
1 1462 20 3 81.0 14267 1 NaN 0 3 0 ... 0 0 NaN NaN 0 12500 6 2010 8 4
2 1463 60 3 74.0 13830 1 NaN 0 3 0 ... 0 0 NaN 2 NaN 0 3 2010 8 4
3 1464 60 3 78.0 9978 1 NaN 0 3 0 ... 0 0 NaN NaN NaN 0 6 2010 8 4
4 1465 120 3 43.0 5005 1 NaN 0 1 0 ... 144 0 NaN NaN NaN 0 1 2010 8 4

5 rows × 80 columns

Models


In [7]:
autolearn.train_all(X_train, y)


100% (  7 of 7) |#########################| Elapsed Time: 0:00:02 Time: 0:00:02

In [8]:
autolearn.predict_all(X_train)


100% (  7 of 7) |#########################| Elapsed Time: 0:00:02 Time: 0:00:02

In [9]:
autolearn.visualize_all(y)


100% (  7 of 7) |#########################| Elapsed Time: 0:00:04 Time: 0:00:04

In [10]:
autolearn.score_all(y)


100% (  7 of 7) |#########################| Elapsed Time: 0:00:00 Time: 0:00:00

In [11]:
autolearn.cross_validate_all(X_train, y)


N/A% (0 of N/A) |                        | Elapsed Time: 0:00:00 ETA:  --:--:--/Users/Austin/anaconda/envs/kaggle/lib/python3.5/site-packages/sklearn/model_selection/_split.py:581: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of groups for any class cannot be less than n_splits=3.
  % (min_groups, self.n_splits)), Warning)
 28% (  2 of 7) |#######                   | Elapsed Time: 0:00:02 ETA: 0:00:06/Users/Austin/projects/autolearn/autolearn/autolearn.py:223: RuntimeWarning: invalid value encountered in log1p
  return np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y), 2)))
 85% (  6 of 7) |######################    | Elapsed Time: 0:00:07 ETA: 0:00:01/Users/Austin/anaconda/envs/kaggle/lib/python3.5/site-packages/sklearn/model_selection/_split.py:581: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of groups for any class cannot be less than n_splits=3.
  % (min_groups, self.n_splits)), Warning)
/Users/Austin/projects/autolearn/autolearn/autolearn.py:223: RuntimeWarning: invalid value encountered in log1p
  return np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y), 2)))
100% (  7 of 7) |#########################| Elapsed Time: 0:00:07 Time: 0:00:07
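These warnings account for the nan entries in the cross-validation results below. The "least populated class" warning likely comes from sklearn stratifying folds for the classifier-style models on what is really a continuous target, and the log1p RuntimeWarning means some held-out predictions from the linear-family models were negative: np.log1p(x) is undefined for x <= -1, so those folds' RMSLE comes out nan.

# Minimal reproduction of the RMSLE failure mode (assumed cause):
np.log1p(np.array([-5000.0]))  # -> array([nan]) plus a RuntimeWarning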

In [12]:
results = autolearn.get_results()
results


Out[12]:
score cv variance parameters
forest 0.070293 [0.193934693545, 0.145764425462, 0.154033980036] 5.745238e+09 (DecisionTreeRegressor(criterion='mse', max_de...
bayes 0.085183 [0.332962710143, 0.212423251971, 0.224180451795] 6.426012e+09 GaussianNB(priors=None)
boost 0.089811 [0.168805473538, 0.123528263815, 0.127427070759] 5.859817e+09 ([DecisionTreeRegressor(criterion='friedman_ms...
bayes_ridge 0.142163 [0.180493099145, 0.15372542792, 0.172257400999] 5.304830e+09 BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, co...
ridge 0.143071 [0.183321595536, nan, 0.168031824507] 5.448559e+09 Ridge(alpha=1.0, copy_X=True, fit_intercept=Tr...
lasso 0.154537 [nan, nan, 0.166198334168] 5.500244e+09 Lasso(alpha=1.0, copy_X=True, fit_intercept=Tr...
linear 0.155034 [0.208602533925, nan, 0.166108985072] 5.502522e+09 LinearRegression(copy_X=True, fit_intercept=Tr...

In [13]:
sns.stripplot(x=results.index, y='score', data=results, color='r', label='score');  # overlaying score on this axis only makes sense when score shares variance's scale (i.e., mse)
sns.stripplot(x=results.index, y='variance', data=results, color='b', label='variance');
plt.xticks(rotation=45);
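Because variance is on the order of 1e9 while the RMSLE scores are below 1, the scores are invisible on a shared axis. A sketch using separate y-axes instead:

fig, ax1 = plt.subplots()
ax1.plot(range(len(results)), results['score'], 'ro', label='score')
ax1.set_ylabel('score (rmsle)')
ax2 = ax1.twinx()  # second y-axis for the much larger variance values
ax2.plot(range(len(results)), results['variance'], 'bo', label='variance')
ax2.set_ylabel('variance')
ax1.set_xticks(range(len(results)))
ax1.set_xticklabels(results.index, rotation=45);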



In [14]:
s = results.apply(lambda x: pd.Series(x['cv']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'cv'
results2 = results.drop(['score', 'variance', 'cv', 'parameters'], axis=1).join(s)
results2


Out[14]:
cv
bayes 0.332963
bayes 0.212423
bayes 0.224180
bayes_ridge 0.180493
bayes_ridge 0.153725
bayes_ridge 0.172257
boost 0.168805
boost 0.123528
boost 0.127427
forest 0.193935
forest 0.145764
forest 0.154034
lasso 0.166198
linear 0.208603
linear 0.166109
ridge 0.183322
ridge 0.168032
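Note that stack() silently drops the nan folds, which is why lasso, linear, and ridge show fewer than three rows each. On pandas >= 0.25 the same long-form reshape is a one-liner (a sketch, assuming a newer pandas than the environment used here):

# explode() keeps the nan folds that stack() drops, so add dropna() to match.
results2_alt = results['cv'].explode().astype(float).dropna().to_frame()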

In [15]:
sns.boxplot(x=results2.index, y='cv', data=results2, linewidth=2.5);


Linear Regression


In [16]:
model = autolearn.train(X_train, y, 'linear')
y_predictions = autolearn.predict(X_train, model)
score = autolearn.score(y, y_predictions)
autolearn.visualize(y, y_predictions)
print(score)


0.155033780192

Logistic Regression


In [17]:
model = autolearn.train(X_train, y, 'logistic')
y_predictions = autolearn.predict(X_train, model)
score = autolearn.score(y, y_predictions)
autolearn.visualize(y, y_predictions)
print(score)


0.047023527463
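Logistic regression is a classifier treating each distinct SalePrice as a class label, so this unusually low in-sample score should be read skeptically; note it does not appear in the cross-validated results above.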

Ridge Regression


In [18]:
model = autolearn.train(X_train, y, 'ridge')
y_predictions = autolearn.predict(X_train, model)
score = autolearn.score(y, y_predictions)
autolearn.visualize(y, y_predictions)
print(score)


0.143071036816

Lasso


In [19]:
model = autolearn.train(X_train, y, 'lasso')
y_predictions = autolearn.predict(X_train, model)
score = autolearn.score(y, y_predictions)
autolearn.visualize(y, y_predictions)
print(score)


0.154536567815

Bayes


In [20]:
model = autolearn.train(X_train, y, 'bayes')
y_predictions = autolearn.predict(X_train, model)
score = autolearn.score(y, y_predictions)
autolearn.visualize(y, y_predictions)
print(score)


0.0851826565966

Bayes Ridge


In [22]:
model = autolearn.train(X_train, y, 'bayes_ridge')
y_predictions = autolearn.predict(X_train, model)
score = autolearn.score(y, y_predictions)
autolearn.visualize(y, y_predictions)
print(score)


0.142163076189

Forest


In [23]:
model = autolearn.train(X_train, y, 'forest')
y_predictions = autolearn.predict(X_train, model)
score = autolearn.score(y, y_predictions)
autolearn.visualize(y, y_predictions)
print(score)


0.0714718256863
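submission_folder was set up in In [3] but never used. A sketch of writing a Kaggle submission from the forest model trained above (hypothetical file name; AutoLearn may well provide its own helper for this):

# Predict on the processed test features and write the Id/SalePrice pairs.
test_predictions = autolearn.predict(X_test, model)  # model = forest from In [23]
submission = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': test_predictions})
submission.to_csv(os.path.join(submission_folder, 'forest.csv'), index=False)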

In [ ]: