In [2]:
from modelFactoryPy import main
from modelFactoryPy import get
from modelFactoryPy import store
from modelFactoryPy import pull
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in newer scikit-learn releases
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
%matplotlib inline
In [3]:
main.getConnection()
# this also creates the main.engine variable
Out[3]:
In [4]:
model_id = 'titanic_training'
In [5]:
try:
    main.addModelId(model_id, 'Training on titanic data', 'passengerid')
except Exception:
    # ignore the error if this model_id is already registered
    pass
main.getSessionId(model_id)
# this also creates the main.session_id variable
Out[5]:
In [6]:
df = pd.read_csv('../data/titanic.csv')
In [7]:
df.head()
Out[7]:
In [8]:
summary = get.getSummary(df)
In [9]:
summary.head()
Out[9]:
In [10]:
store.storeSummary(summary)
In [11]:
## the summary has been stored and can be pulled back for this session
pull.pullSummary(main.session_id)
Out[11]:
In [12]:
y = df['survived_int']
X = df[['sex','pclass','embarked','title','age','family']]
X.index = df["passengerid"].tolist()
In [13]:
def preprocess_features(X):
    # Initialize a new output DataFrame with the same index
    output = pd.DataFrame(index=X.index)
    # Inspect each feature column of the data
    for col, col_data in X.items():  # .iteritems() was removed in pandas 2.0
        # If the data type is categorical, convert to dummy variables
        if col_data.dtype == object:
            col_data = pd.get_dummies(col_data, prefix=col)
        # Collect the revised columns
        output = output.join(col_data)
    return output

X = preprocess_features(X)
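## a toy illustration of the dummy encoding above (hypothetical data, not the
## titanic frame): each categorical column becomes one 0/1 column per category
toy = pd.DataFrame({'sex': ['male', 'female', 'male']}, index=[101, 102, 103])
pd.get_dummies(toy['sex'], prefix='sex')  # -> columns sex_female, sex_male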
In [14]:
random.seed(0)  # not strictly needed here: random_state=0 below already fixes the split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=0)
In [15]:
clf = RandomForestClassifier(random_state=0)  # a basic random forest with default hyperparameters
clf.fit(X_train, y_train)
## predict on the test set:
probs = clf.predict_proba(X_test)
score = [p[1] for p in probs]  # probability of the positive class for each test passenger
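## equivalently (predict_proba returns a NumPy array), the positive-class
## column can be sliced directly; same values as the comprehension above:
score = list(probs[:, 1])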
In [16]:
test_results = get.getTestResults(score, y_test)
test_results.head(10)
Out[16]:
In [17]:
store.storeTestResults(test_results)
In [18]:
## the test results have been stored and can be pulled back for this session
pull.pullTestResults(main.session_id).head()
Out[18]:
In [19]:
roc = pull.pullROC(main.session_id)
liftchart = pull.pullLiftChart(main.session_id)
In [20]:
fig = plt.figure(figsize=(10, 5))
plt.subplots_adjust(hspace=0.4, wspace=0.2)
plt.subplot(1, 2, 1)
plt.plot(roc.false_positive_rate, roc.true_positive_rate)
plt.title("ROC curve")
plt.subplot(1, 2, 2)
plt.plot(liftchart.population, liftchart.target_population)
plt.title("Lift chart")
plt.show()
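## local cross-check (a sketch, not part of modelFactoryPy): recompute the ROC
## from the held-out labels and scores with sklearn
from sklearn.metrics import roc_curve, auc
fpr, tpr, _ = roc_curve(y_test, score)
print("test-set AUC: %.3f" % auc(fpr, tpr))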
In [21]:
pull.pullAccuracy(main.session_id, 0.5, 'population')
Out[21]:
In [22]:
pull.pullAccuracy(main.session_id, 0.5, 'probability')
Out[22]:
In [23]:
pull.pullConfMatrix(main.session_id, 0.5, 'probability')
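## cross-check (a sketch, independent of modelFactoryPy): the same kind of
## confusion matrix can be rebuilt locally by thresholding the scores at 0.5
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, [1 if s >= 0.5 else 0 for s in score])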
Out[23]:
In [24]:
threshold_value = 0.5
threshold_type = "population"
main.updateThreshold(model_id, threshold_value, threshold_type)
In [25]:
store.storeModelScores(X_test.index, score)
In [26]:
## the model scores have been stored and can be pulled back for this session
pull.pullModelScores(main.session_id).head()
Out[26]:
In [27]:
main.closeSession()
In [28]:
## the run_history record for this session now has its end time filled in
pd.read_sql("select * from model_factory.run_history where session_id='"+main.session_id+"'", main.engine)
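## a parameterized variant of the same query (a sketch, assuming the SQLAlchemy
## engine's driver supports named bind parameters):
from sqlalchemy import text
pd.read_sql(text("select * from model_factory.run_history where session_id = :sid"),
            main.engine, params={"sid": main.session_id})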
Out[28]: