In [1]:
from modelFactoryPy import main
from modelFactoryPy import get
from modelFactoryPy import store
from modelFactoryPy import pull
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
main.getConnection()
# this will also create main.engine variable
Out[2]:
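main.engine appears to be a SQLAlchemy engine (an assumption; it is only confirmed indirectly by the pd.read_sql call at the end of this notebook), so it can be passed straight to pandas, for example:

# illustrative only; model_factory.run_history is the table queried again at the end of this notebook
pd.read_sql("select * from model_factory.run_history limit 5", main.engine)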
In [3]:
model_id = 'titanic_training'
In [4]:
#main.addModelId('titanic_training','Training on titanic data','passengerid')
main.getSessionId(model_id)
# this will also create main.session_id variable
Out[4]:
In [5]:
df = pd.read_csv('../data/titanic.csv')
In [6]:
df.head()
Out[6]:
In [7]:
summary = get.getSummary(df)
In [8]:
summary.head()
Out[8]:
In [9]:
store.storeSummary(summary)
In [10]:
## verify that the summary has actually been stored
pull.pullSummary(main.session_id)
Out[10]:
In [11]:
y = df['survived_int']
X = df[['sex','pclass','embarked','title','age','family']]
X.index = df["passengerid"].tolist()
In [12]:
def preprocess_features(X):
    # Initialize new output DataFrame
    output = pd.DataFrame(index=X.index)
    # Investigate each feature column of the data
    for col, col_data in X.items():
        # If the data type is categorical, convert to dummy variables
        if col_data.dtype == object:
            col_data = pd.get_dummies(col_data, prefix=col)
        # Collect the revised columns
        output = output.join(col_data)
    return output
X = preprocess_features(X)
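For reference, this is what pd.get_dummies does to a single categorical column; the values below are illustrative only, not read from the dataset:

demo = pd.Series(['male', 'female', 'male'], name='sex')
pd.get_dummies(demo, prefix='sex')
# -> two indicator columns, sex_female and sex_male, with a 0/1 flag per row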
In [13]:
random.seed(0)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=0)
In [14]:
clf = RandomForestClassifier(random_state=0) # just a basic random forest model
clf.fit(X_train, y_train)
## predict on the test set:
probs = clf.predict_proba(X_test)
score = [p[1] for p in probs]
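predict_proba returns one column per class, ordered as in clf.classes_; assuming the positive class is labelled 1 (as survived_int suggests), the list comprehension above is equivalent to slicing the second column:

score = clf.predict_proba(X_test)[:, 1].tolist()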
In [15]:
test_results = get.getTestResults(score, y_test)
test_results.head(10)
Out[15]:
In [16]:
store.storeTestResults(test_results)
In [17]:
## verify that the test results have actually been stored
pull.pullTestResults(main.session_id).head()
Out[17]:
In [18]:
roc = pull.pullROC(main.session_id)
liftchart = pull.pullLiftChart(main.session_id)
In [19]:
fg = plt.figure(figsize=(10,5))
adj = plt.subplots_adjust(hspace=0.4,wspace=0.2)
sp = plt.subplot(1,2,1)
l1 = plt.plot(roc.false_positive_rate, roc.true_positive_rate)
tl = plt.title("ROC curve")
sp = plt.subplot(1,2,2)
l1 = plt.plot(liftchart.population, liftchart.target_population)
tl = plt.title("Liftchart")
plt.show()
In [20]:
pull.pullAccuracy(main.session_id, 0.5, 'population')
Out[20]:
In [22]:
pull.pullAccuracy(main.session_id, 0.5, 'probability')
Out[22]:
In [23]:
store.storeModelScores(X_test.index, score)
In [24]:
## verify that the model scores have actually been stored
pull.pullModelScores(main.session_id).head()
Out[24]:
In [25]:
main.closeSession()
In [26]:
## after closing the session, the end time is filled in for this session
pd.read_sql("select * from model_factory.run_history where session_id='"+main.session_id+"'", main.engine)
Out[26]: