In [1]:

    
from modelFactoryPy import main
from modelFactoryPy import get
from modelFactoryPy import store
from modelFactoryPy import pull
import numpy as np
import pandas as pd
import random
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
% matplotlib inline
import sqlalchemy_mf_aster
from sqlalchemy.dialects import registry
from sqlalchemy_mf_aster import base

1. Get connection; define model_id, get session id



In [2]:

    
registry.register('aster', 'sqlalchemy_mf_aster.jdbc', 'AsterDialect_jdbc')



In [6]:

    
main.getConnection('aster')
# this will also create main.engine variable



In [4]:

    
model_id = 'titanic_training'



In [5]:

    
#main.addModelId('titanic_training','Training on titanic data','passengerid')
main.getSessionId(model_id)
# this will also create main.session_id variable









    Out[5]:





'vecht499_titanic_training_20161220_537318'

2. Load the data

Could be the data from postgres SQL as well -> you would use pd.read_sql



In [33]:

    
df = pd.read_csv('../data/titanic.csv')



In [34]:

    
df.head()









    Out[34]:






  
    
      
      passengerid
      survived
      pclass
      name
      sex
      age
      sibsp
      parch
      ticket
      fare
      cabin
      embarked
      survived_int
      title
      family
      train
    
  
  
    
      0
      799
      Perished
      Third
      Ibrahim Shawah, Mr. Yousseff
      male
      30.000000
      0
      0
      2685
      7.2292
      NaN
      C
      0
      Mr
      0
      1
    
    
      1
      237
      Perished
      Second
      Hold, Mr. Stephen
      male
      44.000000
      1
      0
      26707
      26.0000
      NaN
      S
      0
      Mr
      1
      1
    
    
      2
      331
      Survived
      Third
      McCoy, Miss. Agnes
      female
      21.477273
      2
      0
      367226
      23.2500
      NaN
      Q
      1
      Miss
      2
      1
    
    
      3
      509
      Perished
      Third
      Olsen, Mr. Henry Margido
      male
      28.000000
      0
      0
      C 4001
      22.5250
      NaN
      S
      0
      Mr
      0
      1
    
    
      4
      806
      Perished
      Third
      Johansson, Mr. Karl Johan
      male
      31.000000
      0
      0
      347063
      7.7750
      NaN
      S
      0
      Mr
      0
      1

3. Get and store the summary



In [35]:

    
summary = get.getSummary(df)



In [36]:

    
summary.head()









    Out[36]:






  
    
      
      mean
      sd
      median
      min
      max
      n
      n_na
      variable
    
  
  
    
      0
      446.000000
      257.353842
      446.000000
      1.0000
      891.0000
      891
      0
      passengerid
    
    
      1
      29.603610
      13.490785
      29.328244
      0.4200
      80.0000
      891
      0
      age
    
    
      2
      0.523008
      1.102743
      0.000000
      0.0000
      8.0000
      891
      0
      sibsp
    
    
      3
      0.381594
      0.806057
      0.000000
      0.0000
      6.0000
      891
      0
      parch
    
    
      4
      32.689702
      49.607298
      14.500000
      4.0125
      512.3292
      891
      0
      fare



In [37]:

    
store.storeSummary(summary)



In [38]:

    
## as we can see, the data is actually stored
pull.pullSummary(main.session_id)









    Out[38]:






  
    
      
      session_id
      variable
      mean
      sd
      median
      min
      max
      n
      n_na
    
  
  
    
      0
      vecht499_titanic_training_20161130_844578
      age
      29.603610
      13.490785
      29.328244
      0.4200
      80.0000
      891
      0
    
    
      1
      vecht499_titanic_training_20161130_844578
      sibsp
      0.523008
      1.102743
      0.000000
      0.0000
      8.0000
      891
      0
    
    
      2
      vecht499_titanic_training_20161130_844578
      passengerid
      446.000000
      257.353842
      446.000000
      1.0000
      891.0000
      891
      0
    
    
      3
      vecht499_titanic_training_20161130_844578
      survived_int
      0.383838
      0.486592
      0.000000
      0.0000
      1.0000
      891
      0
    
    
      4
      vecht499_titanic_training_20161130_844578
      family
      0.904602
      1.613459
      0.000000
      0.0000
      10.0000
      891
      0
    
    
      5
      vecht499_titanic_training_20161130_844578
      parch
      0.381594
      0.806057
      0.000000
      0.0000
      6.0000
      891
      0
    
    
      6
      vecht499_titanic_training_20161130_844578
      fare
      32.689702
      49.607298
      14.500000
      4.0125
      512.3292
      891
      0
    
    
      7
      vecht499_titanic_training_20161130_844578
      train
      0.749719
      0.433418
      1.000000
      0.0000
      1.0000
      891
      0

4. Create features matrix, train and test set, build a model on the training set -> predict



In [39]:

    
y = df['survived_int']
X = df[['sex','pclass','embarked','title','age','family']]
X.index = df["passengerid"].tolist()



In [40]:

    
def preprocess_features(X):
    
    # Initialize new output DataFrame
    output = pd.DataFrame(index = X.index)

    # Investigate each feature column for the data
    for col, col_data in X.iteritems():

        # If data type is categorical, convert to dummy variables
        if col_data.dtype == object:
            col_data = pd.get_dummies(col_data, prefix = col)  
        
        # Collect the revised columns
        output = output.join(col_data)    
    return output

X = preprocess_features(X)



In [41]:

    
random.seed(0)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8,  random_state=0)



In [42]:

    
clf = RandomForestClassifier(random_state=0) # just a basic random forest model
clf.fit(X_train, y_train)
## predict on the test set:
probs = clf.predict_proba(X_test)
score=[probs[x][1] for x in range(len(probs)) ]

5. getTestResults; storeTestResults; pull some statistics



In [43]:

    
test_results = get.getTestResults(score, y_test)
test_results.head(10)









    Out[43]:






  
    
      
      score
      label
      population
      target_population
      true_positives
      false_positives
      true_negatives
      false_negatives
    
  
  
    
      0
      1
      1
      0.005587
      0.015152
      1
      0
      113
      65
    
    
      1
      1
      1
      0.011173
      0.030303
      2
      0
      113
      64
    
    
      2
      1
      1
      0.016760
      0.045455
      3
      0
      113
      63
    
    
      3
      1
      1
      0.022346
      0.060606
      4
      0
      113
      62
    
    
      4
      1
      1
      0.027933
      0.075758
      5
      0
      113
      61
    
    
      5
      1
      1
      0.033520
      0.090909
      6
      0
      113
      60
    
    
      6
      1
      1
      0.039106
      0.106061
      7
      0
      113
      59
    
    
      7
      1
      1
      0.044693
      0.121212
      8
      0
      113
      58
    
    
      8
      1
      0
      0.050279
      0.121212
      8
      1
      112
      58
    
    
      9
      1
      1
      0.055866
      0.136364
      9
      1
      112
      57



In [44]:

    
store.storeTestResults(test_results)



In [45]:

    
## as we can see, the data is actually stored
pull.pullTestResults(main.session_id).head()









    Out[45]:






  
    
      
      session_id
      score
      label
      population
      target_population
      true_positives
      false_positives
      true_negatives
      false_negatives
    
  
  
    
      0
      vecht499_titanic_training_20161130_844578
      0.231667
      0
      0.441341
      0.742424
      49
      30
      83
      17
    
    
      1
      vecht499_titanic_training_20161130_844578
      0.223810
      0
      0.446927
      0.742424
      49
      31
      82
      17
    
    
      2
      vecht499_titanic_training_20161130_844578
      0.223810
      0
      0.452514
      0.742424
      49
      32
      81
      17
    
    
      3
      vecht499_titanic_training_20161130_844578
      0.223810
      0
      0.458101
      0.742424
      49
      33
      80
      17
    
    
      4
      vecht499_titanic_training_20161130_844578
      0.223810
      0
      0.463687
      0.742424
      49
      34
      79
      17



In [46]:

    
roc = pull.pullROC(main.session_id).sort('population', ascending = 1)
liftchart = pull.pullLiftChart(main.session_id).sort('population', ascending = 1)



In [47]:

    
fg = plt.figure(figsize=(10,5))
adj = plt.subplots_adjust(hspace=0.4,wspace=0.2)

sp = plt.subplot(1,2,1)
l1 = plt.plot(roc.false_positive_rate, roc.true_positive_rate)
tl = plt.title("ROC curve")

sp = plt.subplot(1,2,2)
l1 = plt.plot(liftchart.population, liftchart.target_population)
tl = plt.title("Liftchart")

plt.show()



In [48]:

    
pull.pullAccuracy(main.session_id, 0.5, 'population')









    Out[48]:





0.6927374301675978



In [49]:

    
pull.pullAccuracy(main.session_id, 0.5, 'probability')









    Out[49]:





0.776536312849162



In [50]:

    
pull.pullConfMatrix(main.session_id, 0.5, 'probability')









    Out[50]:






  
    
      
      predicted_positives
      predicted_negatives
    
  
  
    
      actual_positives
      46
      20
    
    
      actual_negatives
      20
      93

6. Store the scores



In [51]:

    
store.storeModelScores(X_test.index, score)



In [52]:

    
## as we can see, the data is actually stored
pull.pullModelScores(main.session_id).head()









    Out[52]:






  
    
      
      session_id
      id
      scores
      scores_class
    
  
  
    
      0
      vecht499_titanic_training_20161130_844578
      798
      0.400000
      None
    
    
      1
      vecht499_titanic_training_20161130_844578
      718
      1.000000
      None
    
    
      2
      vecht499_titanic_training_20161130_844578
      565
      0.466667
      None
    
    
      3
      vecht499_titanic_training_20161130_844578
      888
      1.000000
      None
    
    
      4
      vecht499_titanic_training_20161130_844578
      152
      0.900000
      None

7. Close the session



In [53]:

    
main.closeSession()



In [54]:

    
## as we can see, the end time is filled in
connection = main.engine.connect()
a = connection.execute("select * from model_factory.run_history where session_id='"+main.session_id+"'")
b = a.fetchall()
pd.DataFrame.from_records(b, columns=a.keys())









    



ERROR:sqlalchemy.pool.SingletonThreadPool:Exception during reset or similar
Traceback (most recent call last):
  File "c:\users\vecht499\github\sqlalchemy\lib\sqlalchemy\pool.py", line 687, in _finalize_fairy
    fairy._reset(pool)
  File "c:\users\vecht499\github\sqlalchemy\lib\sqlalchemy\pool.py", line 829, in _reset
    pool._dialect.do_rollback(self)
  File "c:\users\vecht499\github\sqlalchemy\lib\sqlalchemy\engine\default.py", line 439, in do_rollback
    dbapi_connection.rollback()
  File "C:\Users\vecht499\Anaconda2\lib\site-packages\jaydebeapi\__init__.py", line 397, in rollback
    _handle_sql_exception()
  File "C:\Users\vecht499\Anaconda2\lib\site-packages\jaydebeapi\__init__.py", line 148, in _handle_sql_exception_jpype
    reraise(exc_type, exc_info[1], exc_info[2])
  File "C:\Users\vecht499\Anaconda2\lib\site-packages\jaydebeapi\__init__.py", line 395, in rollback
    self.jconn.rollback()
DatabaseError: java.sql.SQLException: [AsterData][ASTERJDBCDSII](34) Normal: Mule Exception (AD_MULE_ERROR)






    Out[54]:






  
    
      
      session_id
      user_id
      model_id
      start_time
      end_time
      last_exported
    
  
  
    
      0
      vecht499_titanic_training_20161130_844578
      vecht499
      titanic_training
      2016-11-30 15:28:05
      2016-11-30 15:28:43
      None

	passengerid	survived	pclass	name	sex	age	sibsp	ticket	fare	cabin	embarked	survived_int	title	family	train
0	799	Perished	Third	Ibrahim Shawah, Mr. Yousseff	male	30.000000	0	2685	7.2292	NaN	C	0	Mr	0	1
1	237	Perished	Second	Hold, Mr. Stephen	male	44.000000	1	26707	26.0000	NaN	S	0	Mr	1	1
2	331	Survived	Third	McCoy, Miss. Agnes	female	21.477273	2	367226	23.2500	NaN	Q	1	Miss	2	1
3	509	Perished	Third	Olsen, Mr. Henry Margido	male	28.000000	0	C 4001	22.5250	NaN	S	0	Mr	0	1
4	806	Perished	Third	Johansson, Mr. Karl Johan	male	31.000000	0	347063	7.7750	NaN	S	0	Mr	0	1

	mean	sd	median	min	max	n	variable
0	446.000000	257.353842	446.000000	1.0000	891.0000	891	passengerid
1	29.603610	13.490785	29.328244	0.4200	80.0000	891	age
2	0.523008	1.102743	0.000000	0.0000	8.0000	891	sibsp
3	0.381594	0.806057	0.000000	0.0000	6.0000	891	parch
4	32.689702	49.607298	14.500000	4.0125	512.3292	891	fare

	session_id	variable	mean	sd	median	min	max	n
0	vecht499_titanic_training_20161130_844578	age	29.603610	13.490785	29.328244	0.4200	80.0000	891
1	vecht499_titanic_training_20161130_844578	sibsp	0.523008	1.102743	0.000000	0.0000	8.0000	891
2	vecht499_titanic_training_20161130_844578	passengerid	446.000000	257.353842	446.000000	1.0000	891.0000	891
3	vecht499_titanic_training_20161130_844578	survived_int	0.383838	0.486592	0.000000	0.0000	1.0000	891
4	vecht499_titanic_training_20161130_844578	family	0.904602	1.613459	0.000000	0.0000	10.0000	891
5	vecht499_titanic_training_20161130_844578	parch	0.381594	0.806057	0.000000	0.0000	6.0000	891
6	vecht499_titanic_training_20161130_844578	fare	32.689702	49.607298	14.500000	4.0125	512.3292	891
7	vecht499_titanic_training_20161130_844578	train	0.749719	0.433418	1.000000	0.0000	1.0000	891

	score	label	population	target_population	true_positives	false_positives	true_negatives	false_negatives
0	1	1	0.005587	0.015152	1	0	113	65
1	1	1	0.011173	0.030303	2	0	113	64
2	1	1	0.016760	0.045455	3	0	113	63
3	1	1	0.022346	0.060606	4	0	113	62
4	1	1	0.027933	0.075758	5	0	113	61
5	1	1	0.033520	0.090909	6	0	113	60
6	1	1	0.039106	0.106061	7	0	113	59
7	1	1	0.044693	0.121212	8	0	113	58
8	1	0	0.050279	0.121212	8	1	112	58
9	1	1	0.055866	0.136364	9	1	112	57

	session_id	score	population	target_population	true_positives	false_positives	true_negatives	false_negatives
0	vecht499_titanic_training_20161130_844578	0.231667	0.441341	0.742424	49	30	83	17
1	vecht499_titanic_training_20161130_844578	0.223810	0.446927	0.742424	49	31	82	17
2	vecht499_titanic_training_20161130_844578	0.223810	0.452514	0.742424	49	32	81	17
3	vecht499_titanic_training_20161130_844578	0.223810	0.458101	0.742424	49	33	80	17
4	vecht499_titanic_training_20161130_844578	0.223810	0.463687	0.742424	49	34	79	17

	session_id	id	scores	scores_class
0	vecht499_titanic_training_20161130_844578	798	0.400000	None
1	vecht499_titanic_training_20161130_844578	718	1.000000	None
2	vecht499_titanic_training_20161130_844578	565	0.466667	None
3	vecht499_titanic_training_20161130_844578	888	1.000000	None
4	vecht499_titanic_training_20161130_844578	152	0.900000	None