In [1]:
from modelFactoryPy import main
from modelFactoryPy import get
from modelFactoryPy import store
from modelFactoryPy import pull
import numpy as np
import pandas as pd
import random
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
% matplotlib inline
import sqlalchemy_mf_aster
from sqlalchemy.dialects import registry
from sqlalchemy_mf_aster import base

1. Get connection; define model_id, get session id


In [2]:
registry.register('aster', 'sqlalchemy_mf_aster.jdbc', 'AsterDialect_jdbc')

In [6]:
main.getConnection('aster')
# this will also create main.engine variable

In [4]:
model_id = 'titanic_training'

In [5]:
#main.addModelId('titanic_training','Training on titanic data','passengerid')
main.getSessionId(model_id)
# this will also create main.session_id variable


Out[5]:
'vecht499_titanic_training_20161220_537318'

2. Load the data

Could be the data from postgres SQL as well -> you would use pd.read_sql


In [33]:
df = pd.read_csv('../data/titanic.csv')

In [34]:
df.head()


Out[34]:
passengerid survived pclass name sex age sibsp parch ticket fare cabin embarked survived_int title family train
0 799 Perished Third Ibrahim Shawah, Mr. Yousseff male 30.000000 0 0 2685 7.2292 NaN C 0 Mr 0 1
1 237 Perished Second Hold, Mr. Stephen male 44.000000 1 0 26707 26.0000 NaN S 0 Mr 1 1
2 331 Survived Third McCoy, Miss. Agnes female 21.477273 2 0 367226 23.2500 NaN Q 1 Miss 2 1
3 509 Perished Third Olsen, Mr. Henry Margido male 28.000000 0 0 C 4001 22.5250 NaN S 0 Mr 0 1
4 806 Perished Third Johansson, Mr. Karl Johan male 31.000000 0 0 347063 7.7750 NaN S 0 Mr 0 1

3. Get and store the summary


In [35]:
summary = get.getSummary(df)

In [36]:
summary.head()


Out[36]:
mean sd median min max n n_na variable
0 446.000000 257.353842 446.000000 1.0000 891.0000 891 0 passengerid
1 29.603610 13.490785 29.328244 0.4200 80.0000 891 0 age
2 0.523008 1.102743 0.000000 0.0000 8.0000 891 0 sibsp
3 0.381594 0.806057 0.000000 0.0000 6.0000 891 0 parch
4 32.689702 49.607298 14.500000 4.0125 512.3292 891 0 fare

In [37]:
store.storeSummary(summary)

In [38]:
## as we can see, the data is actually stored
pull.pullSummary(main.session_id)


Out[38]:
session_id variable mean sd median min max n n_na
0 vecht499_titanic_training_20161130_844578 age 29.603610 13.490785 29.328244 0.4200 80.0000 891 0
1 vecht499_titanic_training_20161130_844578 sibsp 0.523008 1.102743 0.000000 0.0000 8.0000 891 0
2 vecht499_titanic_training_20161130_844578 passengerid 446.000000 257.353842 446.000000 1.0000 891.0000 891 0
3 vecht499_titanic_training_20161130_844578 survived_int 0.383838 0.486592 0.000000 0.0000 1.0000 891 0
4 vecht499_titanic_training_20161130_844578 family 0.904602 1.613459 0.000000 0.0000 10.0000 891 0
5 vecht499_titanic_training_20161130_844578 parch 0.381594 0.806057 0.000000 0.0000 6.0000 891 0
6 vecht499_titanic_training_20161130_844578 fare 32.689702 49.607298 14.500000 4.0125 512.3292 891 0
7 vecht499_titanic_training_20161130_844578 train 0.749719 0.433418 1.000000 0.0000 1.0000 891 0

4. Create features matrix, train and test set, build a model on the training set -> predict


In [39]:
y = df['survived_int']
X = df[['sex','pclass','embarked','title','age','family']]
X.index = df["passengerid"].tolist()

In [40]:
def preprocess_features(X):
    
    # Initialize new output DataFrame
    output = pd.DataFrame(index = X.index)

    # Investigate each feature column for the data
    for col, col_data in X.iteritems():

        # If data type is categorical, convert to dummy variables
        if col_data.dtype == object:
            col_data = pd.get_dummies(col_data, prefix = col)  
        
        # Collect the revised columns
        output = output.join(col_data)    
    return output

X = preprocess_features(X)

In [41]:
random.seed(0)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8,  random_state=0)

In [42]:
clf = RandomForestClassifier(random_state=0) # just a basic random forest model
clf.fit(X_train, y_train)
## predict on the test set:
probs = clf.predict_proba(X_test)
score=[probs[x][1] for x in range(len(probs)) ]

5. getTestResults; storeTestResults; pull some statistics


In [43]:
test_results = get.getTestResults(score, y_test)
test_results.head(10)


Out[43]:
score label population target_population true_positives false_positives true_negatives false_negatives
0 1 1 0.005587 0.015152 1 0 113 65
1 1 1 0.011173 0.030303 2 0 113 64
2 1 1 0.016760 0.045455 3 0 113 63
3 1 1 0.022346 0.060606 4 0 113 62
4 1 1 0.027933 0.075758 5 0 113 61
5 1 1 0.033520 0.090909 6 0 113 60
6 1 1 0.039106 0.106061 7 0 113 59
7 1 1 0.044693 0.121212 8 0 113 58
8 1 0 0.050279 0.121212 8 1 112 58
9 1 1 0.055866 0.136364 9 1 112 57

In [44]:
store.storeTestResults(test_results)

In [45]:
## as we can see, the data is actually stored
pull.pullTestResults(main.session_id).head()


Out[45]:
session_id score label population target_population true_positives false_positives true_negatives false_negatives
0 vecht499_titanic_training_20161130_844578 0.231667 0 0.441341 0.742424 49 30 83 17
1 vecht499_titanic_training_20161130_844578 0.223810 0 0.446927 0.742424 49 31 82 17
2 vecht499_titanic_training_20161130_844578 0.223810 0 0.452514 0.742424 49 32 81 17
3 vecht499_titanic_training_20161130_844578 0.223810 0 0.458101 0.742424 49 33 80 17
4 vecht499_titanic_training_20161130_844578 0.223810 0 0.463687 0.742424 49 34 79 17

In [46]:
roc = pull.pullROC(main.session_id).sort('population', ascending = 1)
liftchart = pull.pullLiftChart(main.session_id).sort('population', ascending = 1)

In [47]:
fg = plt.figure(figsize=(10,5))
adj = plt.subplots_adjust(hspace=0.4,wspace=0.2)

sp = plt.subplot(1,2,1)
l1 = plt.plot(roc.false_positive_rate, roc.true_positive_rate)
tl = plt.title("ROC curve")

sp = plt.subplot(1,2,2)
l1 = plt.plot(liftchart.population, liftchart.target_population)
tl = plt.title("Liftchart")

plt.show()



In [48]:
pull.pullAccuracy(main.session_id, 0.5, 'population')


Out[48]:
0.6927374301675978

In [49]:
pull.pullAccuracy(main.session_id, 0.5, 'probability')


Out[49]:
0.776536312849162

In [50]:
pull.pullConfMatrix(main.session_id, 0.5, 'probability')


Out[50]:
predicted_positives predicted_negatives
actual_positives 46 20
actual_negatives 20 93

6. Store the scores


In [51]:
store.storeModelScores(X_test.index, score)

In [52]:
## as we can see, the data is actually stored
pull.pullModelScores(main.session_id).head()


Out[52]:
session_id id scores scores_class
0 vecht499_titanic_training_20161130_844578 798 0.400000 None
1 vecht499_titanic_training_20161130_844578 718 1.000000 None
2 vecht499_titanic_training_20161130_844578 565 0.466667 None
3 vecht499_titanic_training_20161130_844578 888 1.000000 None
4 vecht499_titanic_training_20161130_844578 152 0.900000 None

7. Close the session


In [53]:
main.closeSession()

In [54]:
## as we can see, the end time is filled in
connection = main.engine.connect()
a = connection.execute("select * from model_factory.run_history where session_id='"+main.session_id+"'")
b = a.fetchall()
pd.DataFrame.from_records(b, columns=a.keys())


ERROR:sqlalchemy.pool.SingletonThreadPool:Exception during reset or similar
Traceback (most recent call last):
  File "c:\users\vecht499\github\sqlalchemy\lib\sqlalchemy\pool.py", line 687, in _finalize_fairy
    fairy._reset(pool)
  File "c:\users\vecht499\github\sqlalchemy\lib\sqlalchemy\pool.py", line 829, in _reset
    pool._dialect.do_rollback(self)
  File "c:\users\vecht499\github\sqlalchemy\lib\sqlalchemy\engine\default.py", line 439, in do_rollback
    dbapi_connection.rollback()
  File "C:\Users\vecht499\Anaconda2\lib\site-packages\jaydebeapi\__init__.py", line 397, in rollback
    _handle_sql_exception()
  File "C:\Users\vecht499\Anaconda2\lib\site-packages\jaydebeapi\__init__.py", line 148, in _handle_sql_exception_jpype
    reraise(exc_type, exc_info[1], exc_info[2])
  File "C:\Users\vecht499\Anaconda2\lib\site-packages\jaydebeapi\__init__.py", line 395, in rollback
    self.jconn.rollback()
DatabaseError: java.sql.SQLException: [AsterData][ASTERJDBCDSII](34) Normal: Mule Exception (AD_MULE_ERROR)
Out[54]:
session_id user_id model_id start_time end_time last_exported
0 vecht499_titanic_training_20161130_844578 vecht499 titanic_training 2016-11-30 15:28:05 2016-11-30 15:28:43 None