In [1]:
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from autolearn.autolearn import AutoLearn
In [2]:
# Render matplotlib/seaborn figures directly below the cell that creates them.
# Recent Jupyter/IPython versions do this by default, so the magic is harmless
# but kept for older environments.
%matplotlib inline
def float_formatter(x):
    """Format a float with exactly two digits after the decimal point."""
    return "%.2f" % x


# Use the two-decimal formatter whenever NumPy prints floating-point arrays.
np.set_printoptions(formatter={'float_kind': float_formatter})
In [3]:
# Resolve all input/output locations relative to the directory the notebook
# is started from. os.path.join is used instead of string concatenation so the
# separators are correct on every platform and a root working directory does
# not produce a doubled leading slash.
working_dir = os.getcwd()
train_file = os.path.normpath(os.path.join(working_dir, 'data', 'train.csv'))
test_file = os.path.normpath(os.path.join(working_dir, 'data', 'test.csv'))
# Directory (no trailing separator) intended for submission files.
submission_folder = os.path.normpath(os.path.join(working_dir, 'submissions'))
In [4]:
# Configure the AutoLearn pipeline: target column 'SalePrice', id column 'Id',
# categorical encoding + one-hot encoding + median imputation switched on,
# standardisation and decomposition switched off.
# To evaluate with mean squared error instead, pass error_metric='mse'.
autolearn = AutoLearn(
    encode_categoricals=True,
    impute=True,
    onehot=True,
    standardize=False,
    decompose=False,
    target='SalePrice',
    id_col='Id',
    error_metric='rmsle',
    impute_strategy='median',
)

# process_training_data yields three objects; X_train and y feed the
# training / scoring cells further down, training_data is previewed here.
training_data, X_train, y = autolearn.process_training_data(train_file)
training_data.head()
Out[4]:
In [5]:
# Show the dimensions of the processed frame and of X_train, one per line.
print(training_data.shape, X_train.shape, sep='\n')
In [6]:
# Run the test CSV through the same AutoLearn instance that processed the
# training file (presumably re-applying the preprocessing configured there --
# verify in AutoLearn.process_test_data). Returns two objects; only the first
# is previewed below.
test_data, X_test = autolearn.process_test_data(test_file)
test_data.head()
Out[6]:
In [7]:
# Train all of AutoLearn's models on the processed training set. The later
# *_all calls take no model argument, so the fitted models are presumably kept
# on the `autolearn` object -- this cell must run before them.
autolearn.train_all(X_train, y)
In [8]:
# Predict on the training features themselves, so anything derived from these
# predictions is an in-sample result. The predictions are presumably stored on
# `autolearn`, since visualize_all / score_all only receive y.
autolearn.predict_all(X_train)
In [9]:
# Plot each model's results against the true targets y (the kind of plot is
# decided inside AutoLearn and is not visible from this notebook).
autolearn.visualize_all(y)
In [10]:
# Score every model against the true targets y -- presumably using the
# error_metric passed to the AutoLearn constructor ('rmsle'); confirm.
autolearn.score_all(y)
In [11]:
# Cross-validate every model on the training set. The number of folds and the
# splitting strategy are chosen inside AutoLearn and are not visible here.
autolearn.cross_validate_all(X_train, y)
In [12]:
# Collect the accumulated results into one table. Later cells read its
# 'score', 'variance', 'cv' and 'parameters' columns and plot against its index.
results = autolearn.get_results()
results
Out[12]:
In [13]:
# Overlay the 'score' (red) and 'variance' (blue) columns of `results` on one
# strip plot, one x position per index label.
# Original author's caveat: this only really works if score is mse.
for column, colour in (('score', 'r'), ('variance', 'b')):
    sns.stripplot(x=results.index, y=column, data=results, color=colour, label=column)
# Tilt the x tick labels so they do not overlap; ';' hides the return value.
plt.xticks(rotation=45);
In [14]:
# Reshape `results` to long form: every entry in a row's 'cv' value becomes
# its own row, still labelled with that row's original index.
s = (
    results
    .apply(lambda row: pd.Series(row['cv']), axis=1)  # one column per cv entry
    .stack()                                          # -> (index, position) Series
    .reset_index(level=1, drop=True)                  # discard the position level
    .rename('cv')
)
# Drop the per-model summary columns (and the original 'cv' column) before
# joining the one-value-per-row 'cv' series back on the index.
summary_columns = ['score', 'variance', 'cv', 'parameters']
results2 = results.drop(columns=summary_columns).join(s)
results2
Out[14]:
In [15]:
# Box plot of the 'cv' values in results2, one box per distinct index label.
# The trailing ';' suppresses the Axes repr in the cell output.
sns.boxplot(x=results2.index, y='cv', data=results2, linewidth=2.5);
In [16]:
# Fit, predict, score and plot each model type on its own. This replaces seven
# copy-pasted cells that differed only in the model name passed to train().
# Predictions are made on X_train, so every score printed here is in-sample.
# NOTE(review): 'logistic' usually denotes a classifier -- confirm it is meant
# to be fitted against the 'SalePrice' target.
MODEL_NAMES = ('linear', 'logistic', 'ridge', 'lasso', 'bayes', 'bayes_ridge', 'forest')

for model_name in MODEL_NAMES:
    model = autolearn.train(X_train, y, model_name)
    y_predictions = autolearn.predict(X_train, model)
    score = autolearn.score(y, y_predictions)
    autolearn.visualize(y, y_predictions)
    # Flush the figure now so each model gets its own plot instead of all
    # iterations drawing onto whatever figure is still open.
    plt.show()
    # Label the score; the original cells printed a bare number.
    print('%s: %s' % (model_name, score))

# After the loop, `model`, `y_predictions` and `score` hold the values for the
# last entry in MODEL_NAMES ('forest'), as they did after the original cells.
In [ ]: