In [1]:
from modelFactoryPy import main
from modelFactoryPy import get
from modelFactoryPy import store
from modelFactoryPy import pull
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
main.getConnection()
# this will also create main.engine variable
Out[2]:
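main.engine appears to be a SQLAlchemy engine (an assumption; it is only confirmed indirectly by the pd.read_sql call at the end of this notebook), so it can be passed straight to pandas, for example:

# illustrative only; model_factory.run_history is the table queried again at the end of this notebook
pd.read_sql("select * from model_factory.run_history limit 5", main.engine)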
In [3]:
model_id = 'titanic_training'
In [4]:
#main.addModelId('titanic_training','Training on titanic data','passengerid')
main.getSessionId(model_id)
# this will also create main.session_id variable
Out[4]:
In [5]:
df = pd.read_csv('../data/titanic.csv')
In [6]:
df.head()
Out[6]:
In [7]:
summary = get.getSummary(df)
In [8]:
summary.head()
Out[8]:
In [9]:
store.storeSummary(summary)
In [10]:
## verify that the summary has actually been stored
pull.pullSummary(main.session_id)
Out[10]:
In [11]:
y = df['survived_int']
X = df[['sex','pclass','embarked','title','age','family']]
X.index = df["passengerid"].tolist()
In [12]:
def preprocess_features(X):
    # Initialize new output DataFrame
    output = pd.DataFrame(index=X.index)
    # Investigate each feature column of the data
    for col, col_data in X.items():
        # If the data type is categorical, convert to dummy variables
        if col_data.dtype == object:
            col_data = pd.get_dummies(col_data, prefix=col)
        # Collect the revised columns
        output = output.join(col_data)
    return output
X = preprocess_features(X)
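For reference, this is what pd.get_dummies does to a single categorical column; the values below are illustrative only, not read from the dataset:

demo = pd.Series(['male', 'female', 'male'], name='sex')
pd.get_dummies(demo, prefix='sex')
# -> two indicator columns, sex_female and sex_male, with a 0/1 flag per row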
In [13]:
random.seed(0)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=0)
In [14]:
clf = RandomForestClassifier(random_state=0) # just a basic random forest model
clf.fit(X_train, y_train)
## predict on the test set:
probs = clf.predict_proba(X_test)
score = [p[1] for p in probs]
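predict_proba returns one column per class, ordered as in clf.classes_; assuming the positive class is labelled 1 (as survived_int suggests), the list comprehension above is equivalent to slicing the second column:

score = clf.predict_proba(X_test)[:, 1].tolist()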
In [15]:
test_results = get.getTestResults(score, y_test)
test_results.head(10)
Out[15]:
In [16]:
store.storeTestResults(test_results)
In [17]:
## verify that the test results have actually been stored
pull.pullTestResults(main.session_id).head()
Out[17]:
In [18]:
roc = pull.pullROC(main.session_id)
liftchart = pull.pullLiftChart(main.session_id)
In [19]:
fg = plt.figure(figsize=(10,5))
adj = plt.subplots_adjust(hspace=0.4,wspace=0.2)
sp = plt.subplot(1,2,1)
l1 = plt.plot(roc.false_positive_rate, roc.true_positive_rate)
tl = plt.title("ROC curve")
sp = plt.subplot(1,2,2)
l1 = plt.plot(liftchart.population, liftchart.target_population)
tl = plt.title("Liftchart")
plt.show()
In [20]:
pull.pullAccuracy(main.session_id, 0.5, 'population')
Out[20]:
In [22]:
pull.pullAccuracy(main.session_id, 0.5, 'probability')
Out[22]:
In [23]:
store.storeModelScores(X_test.index, score)
In [24]:
## verify that the model scores have actually been stored
pull.pullModelScores(main.session_id).head()
Out[24]:
In [25]:
main.closeSession()
In [26]:
## after closing the session, the end time is filled in for this session
pd.read_sql("select * from model_factory.run_history where session_id='"+main.session_id+"'", main.engine)
Out[26]: