In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from autolearn.autolearn import AutoLearn

In [2]:
%matplotlib inline
float_formatter = lambda x: "%.2f" % x
np.set_printoptions(formatter={'float_kind': float_formatter})

• TODO: Add an option to remove outliers from all columns of the dataset?
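A rough sketch of what that option might do (a hypothetical helper, not part of AutoLearn): drop any row that falls outside 1.5×IQR on a numeric column.

def remove_outliers(df, k=1.5):
    """Drop rows outside k*IQR on any numeric column (hypothetical sketch)."""
    numeric = df.select_dtypes(include=[np.number])
    q1, q3 = numeric.quantile(0.25), numeric.quantile(0.75)
    iqr = q3 - q1
    outside = (numeric < q1 - k * iqr) | (numeric > q3 + k * iqr)
    return df[~outside.any(axis=1)]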

Import Data


In [3]:
train_file = os.path.join(os.getcwd(), 'data', 'train.csv')
test_file = os.path.join(os.getcwd(), 'data', 'test.csv')
submission_folder = os.path.join(os.getcwd(), 'submissions')

In [4]:
autolearn = AutoLearn(encode_categoricals=True, impute=True, onehot=True,
                      standardize=False, decompose=False, target='SalePrice',
                      id_col='Id', error_metric='rmsle', impute_strategy='median')
# Alternative run: identical settings with error_metric='mse'.
training_data, X_train, y = autolearn.process_training_data(train_file)
training_data.head()


Out[4]:
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice
0 1 60 3 65.0 8450 1 NaN 3 3 0 ... 0 NaN NaN NaN 0 2 2008 8 4 208500
1 2 20 3 80.0 9600 1 NaN 3 3 0 ... 0 NaN NaN NaN 0 5 2007 8 4 181500
2 3 60 3 68.0 11250 1 NaN 0 3 0 ... 0 NaN NaN NaN 0 9 2008 8 4 223500
3 4 70 3 60.0 9550 1 NaN 0 3 0 ... 0 NaN NaN NaN 0 2 2006 8 0 140000
4 5 60 3 84.0 14260 1 NaN 0 3 0 ... 0 NaN NaN NaN 0 12 2008 8 4 250000

5 rows × 81 columns
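The error_metric='rmsle' setting selects root mean squared logarithmic error. AutoLearn's implementation (it surfaces in a warning during cross-validation below) is equivalent to:

def rmsle(y, y_pred):
    # RMSLE as computed in autolearn.py: root of the mean squared
    # difference between log1p-transformed predictions and targets.
    return np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y), 2)))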


In [5]:
print(training_data.shape)
print(X_train.shape)


(1460, 81)
(1460, 133)
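The jump from 81 columns to 133 reflects the feature processing: the Id and SalePrice columns are presumably excluded from the feature matrix, and onehot=True expands the encoded categorical columns into indicator columns.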

In [6]:
test_data, X_test = autolearn.process_test_data(test_file)
test_data.head()


Out[6]:
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities ... ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition
0 1461 20 2 80.0 11622 1 NaN 3 3 0 ... 120 0 NaN 2 NaN 0 6 2010 8 4
1 1462 20 3 81.0 14267 1 NaN 0 3 0 ... 0 0 NaN NaN 0 12500 6 2010 8 4
2 1463 60 3 74.0 13830 1 NaN 0 3 0 ... 0 0 NaN 2 NaN 0 3 2010 8 4
3 1464 60 3 78.0 9978 1 NaN 0 3 0 ... 0 0 NaN NaN NaN 0 6 2010 8 4
4 1465 120 3 43.0 5005 1 NaN 0 1 0 ... 144 0 NaN NaN NaN 0 1 2010 8 4

5 rows × 80 columns

Models


In [7]:
autolearn.train_all(X_train, y)


100% (  7 of 7) |#########################| Elapsed Time: 0:00:02 Time: 0:00:02

In [8]:
autolearn.predict_all(X_train)


100% (  7 of 7) |#########################| Elapsed Time: 0:00:02 Time: 0:00:02

In [9]:
autolearn.visualize_all(y)


100% (  7 of 7) |#########################| Elapsed Time: 0:00:04 Time: 0:00:04

In [10]:
autolearn.score_all(y)


100% (  7 of 7) |#########################| Elapsed Time: 0:00:00 Time: 0:00:00

In [11]:
autolearn.cross_validate_all(X_train, y)


N/A% (0 of N/A) |                        | Elapsed Time: 0:00:00 ETA:  --:--:--/Users/Austin/anaconda/envs/kaggle/lib/python3.5/site-packages/sklearn/model_selection/_split.py:581: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of groups for any class cannot be less than n_splits=3.
  % (min_groups, self.n_splits)), Warning)
 28% (  2 of 7) |#######                   | Elapsed Time: 0:00:02 ETA: 0:00:06/Users/Austin/projects/autolearn/autolearn/autolearn.py:223: RuntimeWarning: invalid value encountered in log1p
  return np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y), 2)))
 85% (  6 of 7) |######################    | Elapsed Time: 0:00:07 ETA: 0:00:01/Users/Austin/anaconda/envs/kaggle/lib/python3.5/site-packages/sklearn/model_selection/_split.py:581: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of groups for any class cannot be less than n_splits=3.
  % (min_groups, self.n_splits)), Warning)
/Users/Austin/projects/autolearn/autolearn/autolearn.py:223: RuntimeWarning: invalid value encountered in log1p
  return np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y), 2)))
100% (  7 of 7) |#########################| Elapsed Time: 0:00:07 Time: 0:00:07
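These warnings account for the nan entries in the cross-validation results below. The "least populated class" warning likely comes from sklearn stratifying folds for the classifier-style models on what is really a continuous target, and the log1p RuntimeWarning means some held-out predictions from the linear-family models were negative: np.log1p(x) is undefined for x <= -1, so those folds' RMSLE comes out nan.

# Minimal reproduction of the RMSLE failure mode (assumed cause):
np.log1p(np.array([-5000.0]))  # -> array([nan]) plus a RuntimeWarning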

In [12]:
results = autolearn.get_results()
results


Out[12]:
score cv variance parameters
forest 0.070293 [0.193934693545, 0.145764425462, 0.154033980036] 5.745238e+09 (DecisionTreeRegressor(criterion='mse', max_de...
bayes 0.085183 [0.332962710143, 0.212423251971, 0.224180451795] 6.426012e+09 GaussianNB(priors=None)
boost 0.089811 [0.168805473538, 0.123528263815, 0.127427070759] 5.859817e+09 ([DecisionTreeRegressor(criterion='friedman_ms...
bayes_ridge 0.142163 [0.180493099145, 0.15372542792, 0.172257400999] 5.304830e+09 BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, co...
ridge 0.143071 [0.183321595536, nan, 0.168031824507] 5.448559e+09 Ridge(alpha=1.0, copy_X=True, fit_intercept=Tr...
lasso 0.154537 [nan, nan, 0.166198334168] 5.500244e+09 Lasso(alpha=1.0, copy_X=True, fit_intercept=Tr...
linear 0.155034 [0.208602533925, nan, 0.166108985072] 5.502522e+09 LinearRegression(copy_X=True, fit_intercept=Tr...

In [13]:
sns.stripplot(x=results.index, y='score', data=results, color='r', label='score');  # overlaying score on this axis only makes sense when score shares variance's scale (i.e., mse)
sns.stripplot(x=results.index, y='variance', data=results, color='b', label='variance');
plt.xticks(rotation=45);
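Because variance is on the order of 1e9 while the RMSLE scores are below 1, the scores are invisible on a shared axis. A sketch using separate y-axes instead:

fig, ax1 = plt.subplots()
ax1.plot(range(len(results)), results['score'], 'ro', label='score')
ax1.set_ylabel('score (rmsle)')
ax2 = ax1.twinx()  # second y-axis for the much larger variance values
ax2.plot(range(len(results)), results['variance'], 'bo', label='variance')
ax2.set_ylabel('variance')
ax1.set_xticks(range(len(results)))
ax1.set_xticklabels(results.index, rotation=45);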



In [14]:
s = results.apply(lambda x: pd.Series(x['cv']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'cv'
results2 = results.drop(['score', 'variance', 'cv', 'parameters'], axis=1).join(s)
results2


Out[14]:
cv
bayes 0.332963
bayes 0.212423
bayes 0.224180
bayes_ridge 0.180493
bayes_ridge 0.153725
bayes_ridge 0.172257
boost 0.168805
boost 0.123528
boost 0.127427
forest 0.193935
forest 0.145764
forest 0.154034
lasso 0.166198
linear 0.208603
linear 0.166109
ridge 0.183322
ridge 0.168032
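Note that stack() silently drops the nan folds, which is why lasso, linear, and ridge show fewer than three rows each. On pandas >= 0.25 the same long-form reshape is a one-liner (a sketch, assuming a newer pandas than the environment used here):

# explode() keeps the nan folds that stack() drops, so add dropna() to match.
results2_alt = results['cv'].explode().astype(float).dropna().to_frame()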

In [15]:
sns.boxplot(x=results2.index, y='cv', data=results2, linewidth=2.5);


Linear Regression


In [16]:
model = autolearn.train(X_train, y, 'linear')
y_predictions = autolearn.predict(X_train, model)
score = autolearn.score(y, y_predictions)
autolearn.visualize(y, y_predictions)
print(score)


0.155033780192

Logistic Regression


In [17]:
model = autolearn.train(X_train, y, 'logistic')
y_predictions = autolearn.predict(X_train, model)
score = autolearn.score(y, y_predictions)
autolearn.visualize(y, y_predictions)
print(score)


0.047023527463
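Logistic regression is a classifier treating each distinct SalePrice as a class label, so this unusually low in-sample score should be read skeptically; note it does not appear in the cross-validated results above.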

Ridge Regression


In [18]:
model = autolearn.train(X_train, y, 'ridge')
y_predictions = autolearn.predict(X_train, model)
score = autolearn.score(y, y_predictions)
autolearn.visualize(y, y_predictions)
print(score)


0.143071036816

Lasso


In [19]:
model = autolearn.train(X_train, y, 'lasso')
y_predictions = autolearn.predict(X_train, model)
score = autolearn.score(y, y_predictions)
autolearn.visualize(y, y_predictions)
print(score)


0.154536567815

Bayes


In [20]:
model = autolearn.train(X_train, y, 'bayes')
y_predictions = autolearn.predict(X_train, model)
score = autolearn.score(y, y_predictions)
autolearn.visualize(y, y_predictions)
print(score)


0.0851826565966

Bayes Ridge


In [22]:
model = autolearn.train(X_train, y, 'bayes_ridge')
y_predictions = autolearn.predict(X_train, model)
score = autolearn.score(y, y_predictions)
autolearn.visualize(y, y_predictions)
print(score)


0.142163076189

Forest


In [23]:
model = autolearn.train(X_train, y, 'forest')
y_predictions = autolearn.predict(X_train, model)
score = autolearn.score(y, y_predictions)
autolearn.visualize(y, y_predictions)
print(score)


0.0714718256863
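submission_folder was set up in In [3] but never used. A sketch of writing a Kaggle submission from the forest model trained above (hypothetical file name; AutoLearn may well provide its own helper for this):

# Predict on the processed test features and write the Id/SalePrice pairs.
test_predictions = autolearn.predict(X_test, model)  # model = forest from In [23]
submission = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': test_predictions})
submission.to_csv(os.path.join(submission_folder, 'forest.csv'), index=False)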

In [ ]: