In [1]:
    
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from autolearn.autolearn import AutoLearn
    
In [2]:
    
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
def float_formatter(x):
    """Return ``x`` formatted as fixed-point text with two decimal places."""
    return "%.2f" % x


# Register the formatter so NumPy prints every float element with two decimals.
np.set_printoptions(formatter={'float_kind': float_formatter})
    
In [3]:
    
# Locations of the input CSVs and the output folder, resolved against the
# current working directory (so the notebook must be started from the project
# root that contains `data/` and `submissions/`).
#
# os.path.join is used rather than string concatenation: concatenating
# '/data/...' onto a working directory that already ends in a separator (e.g.
# the filesystem root) produces a doubled leading slash, which normpath
# deliberately preserves on POSIX.
train_file = os.path.normpath(os.path.join(os.getcwd(), 'data', 'train.csv'))
test_file = os.path.normpath(os.path.join(os.getcwd(), 'data', 'test.csv'))
# normpath also guarantees there is no trailing separator on the folder path.
submission_folder = os.path.normpath(os.path.join(os.getcwd(), 'submissions'))
    
In [4]:
    
# Build the AutoLearn pipeline object used by every later cell.
# The target column is 'SalePrice', the identifier column is 'Id', and the
# error metric is 'rmsle'. To experiment with mean squared error instead,
# change error_metric to 'mse' and leave the other arguments as they are.
autolearn = AutoLearn(
    encode_categoricals=True,
    impute=True,
    onehot=True,
    standardize=False,
    decompose=False,
    target='SalePrice',
    id_col='Id',
    error_metric='rmsle',
    impute_strategy='median',
)

# process_training_data returns three objects: the processed training data,
# the feature matrix handed to the models, and the target values.
training_data, X_train, y = autolearn.process_training_data(train_file)

# Preview the processed training data.
training_data.head()
    
    Out[4]:
In [5]:
    
# Sanity check: show the shape of the processed training data and of the
# feature matrix, one per line.
print(training_data.shape, X_train.shape, sep='\n')
    
    
In [6]:
    
# Run the test CSV through the same AutoLearn instance; the call returns the
# processed test data and its feature matrix.
# NOTE(review): presumably this reuses the encoders/imputers fitted by
# process_training_data, so it must run after that cell — confirm in AutoLearn.
test_data, X_test = autolearn.process_test_data(test_file)
# Preview the processed test data.
test_data.head()
    
    Out[6]:
In [7]:
    
# Call train_all with the processed training features and the target.
# NOTE(review): presumably this fits every model AutoLearn supports and keeps
# them on the instance for the *_all calls below — confirm in autolearn.autolearn.
autolearn.train_all(X_train, y)
    
    
In [8]:
    
# Call predict_all on X_train — the same feature matrix that was passed to
# train_all — so anything derived from these predictions is in-sample.
# NOTE(review): no return value is captured; presumably the predictions are
# stored on the AutoLearn instance — confirm.
autolearn.predict_all(X_train)
    
    
In [9]:
    
# Call visualize_all with the true target values.
# NOTE(review): only y is passed, so this presumably plots against predictions
# cached by predict_all and therefore must run after that cell — confirm.
autolearn.visualize_all(y)
    
    
    
    
    
    
    
    
    
In [10]:
    
# Call score_all with the true target values.
# NOTE(review): only y is passed, so the scores presumably come from the
# predictions made on X_train by predict_all (i.e. training-set scores) — confirm.
autolearn.score_all(y)
    
    
In [11]:
    
# Call cross_validate_all on the training features and target.
# NOTE(review): fold count and splitting strategy are decided inside AutoLearn
# and are not visible here — confirm before comparing models on these numbers.
autolearn.cross_validate_all(X_train, y)
    
    
In [12]:
    
# Fetch the accumulated results from AutoLearn and display them.
# Later cells use this object as a DataFrame and read its 'score', 'variance',
# 'cv' and 'parameters' columns, plotting against its index.
results = autolearn.get_results()
results
    
    Out[12]:
In [13]:
    
# Overlay two strip plots on the same axes, one point per row of `results`:
# the 'score' column in red and the 'variance' column in blue.
# Caveat: this only really works if score is mse.
for column, colour in (('score', 'r'), ('variance', 'b')):
    sns.stripplot(x=results.index, y=column, data=results, color=colour, label=column)

# Tilt the x tick labels so they do not overlap.
plt.xticks(rotation=45);
    
    
In [14]:
    
# Turn the sequence stored in each row's 'cv' cell into a long Series with one
# entry per value, indexed by the original row label:
#   1. expand each 'cv' sequence into its own columns,
#   2. stack those columns into a (row label, position) MultiIndex,
#   3. drop the position level so only the row label remains,
#   4. name the Series 'cv' so it joins back as a column of that name.
# NOTE(review): 'cv' presumably holds the per-fold cross-validation scores — confirm.
s = (
    results
    .apply(lambda row: pd.Series(row['cv']), axis=1)
    .stack()
    .reset_index(level=1, drop=True)
    .rename('cv')
)

# Remove the summary columns (including the original sequence-valued 'cv') and
# join the long-form values back on the index; each original row is repeated
# once per 'cv' value.
results2 = results.drop(columns=['score', 'variance', 'cv', 'parameters']).join(s)
results2
    
    Out[14]:
In [15]:
    
# Box plot of the long-form 'cv' values, one box per index label of results2.
# The trailing semicolon suppresses the Axes repr in the cell output.
sns.boxplot(x=results2.index, y='cv', data=results2, linewidth=2.5);
    
    
In [16]:
    
# Single-model run for the model AutoLearn selects by the key 'linear':
# train on (X_train, y), predict on X_train, score the predictions against y,
# plot them, and print the score.
# Predictions are made on the same X_train used for training, so the printed
# score is an in-sample figure.
# Note: this cell overwrites the globals `model`, `y_predictions` and `score`.
model = autolearn.train(X_train, y, 'linear')
y_predictions = autolearn.predict(X_train, model)
score = autolearn.score(y, y_predictions)
autolearn.visualize(y, y_predictions)
print(score)
    
    
    
In [17]:
    
# Single-model run for the model AutoLearn selects by the key 'logistic':
# train on (X_train, y), predict on X_train, score the predictions against y,
# plot them, and print the score.
# Predictions are made on the same X_train used for training, so the printed
# score is an in-sample figure.
# NOTE(review): the target configured for this notebook is 'SalePrice'; if
# 'logistic' maps to a logistic-regression classifier, fitting it to a price
# target may not be meaningful — confirm what AutoLearn does for this key.
# Note: this cell overwrites the globals `model`, `y_predictions` and `score`.
model = autolearn.train(X_train, y, 'logistic')
y_predictions = autolearn.predict(X_train, model)
score = autolearn.score(y, y_predictions)
autolearn.visualize(y, y_predictions)
print(score)
    
    
    
In [18]:
    
# Single-model run for the model AutoLearn selects by the key 'ridge':
# train on (X_train, y), predict on X_train, score the predictions against y,
# plot them, and print the score.
# Predictions are made on the same X_train used for training, so the printed
# score is an in-sample figure.
# Note: this cell overwrites the globals `model`, `y_predictions` and `score`.
model = autolearn.train(X_train, y, 'ridge')
y_predictions = autolearn.predict(X_train, model)
score = autolearn.score(y, y_predictions)
autolearn.visualize(y, y_predictions)
print(score)
    
    
    
In [19]:
    
# Single-model run for the model AutoLearn selects by the key 'lasso':
# train on (X_train, y), predict on X_train, score the predictions against y,
# plot them, and print the score.
# Predictions are made on the same X_train used for training, so the printed
# score is an in-sample figure.
# Note: this cell overwrites the globals `model`, `y_predictions` and `score`.
model = autolearn.train(X_train, y, 'lasso')
y_predictions = autolearn.predict(X_train, model)
score = autolearn.score(y, y_predictions)
autolearn.visualize(y, y_predictions)
print(score)
    
    
    
In [20]:
    
# Single-model run for the model AutoLearn selects by the key 'bayes':
# train on (X_train, y), predict on X_train, score the predictions against y,
# plot them, and print the score.
# Predictions are made on the same X_train used for training, so the printed
# score is an in-sample figure.
# NOTE(review): which estimator 'bayes' maps to (as opposed to 'bayes_ridge')
# is not visible here — confirm in AutoLearn.
# Note: this cell overwrites the globals `model`, `y_predictions` and `score`.
model = autolearn.train(X_train, y, 'bayes')
y_predictions = autolearn.predict(X_train, model)
score = autolearn.score(y, y_predictions)
autolearn.visualize(y, y_predictions)
print(score)
    
    
    
In [22]:
    
# Single-model run for the model AutoLearn selects by the key 'bayes_ridge':
# train on (X_train, y), predict on X_train, score the predictions against y,
# plot them, and print the score.
# Predictions are made on the same X_train used for training, so the printed
# score is an in-sample figure.
# Note: this cell overwrites the globals `model`, `y_predictions` and `score`.
model = autolearn.train(X_train, y, 'bayes_ridge')
y_predictions = autolearn.predict(X_train, model)
score = autolearn.score(y, y_predictions)
autolearn.visualize(y, y_predictions)
print(score)
    
    
    
In [23]:
    
# Single-model run for the model AutoLearn selects by the key 'forest':
# train on (X_train, y), predict on X_train, score the predictions against y,
# plot them, and print the score.
# Predictions are made on the same X_train used for training, so the printed
# score is an in-sample figure.
# NOTE(review): if 'forest' is a randomised estimator, no seed is set in this
# notebook, so the score may vary between runs — confirm.
# Note: this cell overwrites the globals `model`, `y_predictions` and `score`.
model = autolearn.train(X_train, y, 'forest')
y_predictions = autolearn.predict(X_train, model)
score = autolearn.score(y, y_predictions)
autolearn.visualize(y, y_predictions)
print(score)
    
    
    
In [ ]: