In [1]:
from modelFactoryPy import main
from modelFactoryPy import get
from modelFactoryPy import store
from modelFactoryPy import pull
import numpy as np
import pandas as pd
import random
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
% matplotlib inline


C:\Users\vecht499\Anaconda2\lib\site-packages\pandas\computation\__init__.py:19: UserWarning: The installed version of numexpr 2.4.4 is not supported in pandas and will be not be used

  UserWarning)

1. Get connection; define model_id, get session id


In [2]:
main.getConnection()
# this will also create main.engine variable


Out[2]:
Engine(postgresql://postgres:***@localhost:5432/postgres)

In [3]:
model_id = 'titanic_training'

In [4]:
# One-time setup, left commented out: presumably registers the model id before its
# first use -- TODO confirm against modelFactoryPy.main.addModelId
#main.addModelId('titanic_training','Training on titanic data','passengerid')
main.getSessionId(model_id)
# this will also create main.session_id variable


Out[4]:
'postgres_titanic_training_20161223_848217'

2. Load the data

The data could also come from a PostgreSQL database; in that case you would load it with pd.read_sql instead of pd.read_csv.


In [5]:
df = pd.read_csv('../data/titanic.csv')

In [6]:
df.head()


Out[6]:
passengerid survived pclass name sex age sibsp parch ticket fare cabin embarked survived_int title family train
0 799 Perished Third Ibrahim Shawah, Mr. Yousseff male 30.000000 0 0 2685 7.2292 NaN C 0 Mr 0 1
1 237 Perished Second Hold, Mr. Stephen male 44.000000 1 0 26707 26.0000 NaN S 0 Mr 1 1
2 331 Survived Third McCoy, Miss. Agnes female 21.477273 2 0 367226 23.2500 NaN Q 1 Miss 2 1
3 509 Perished Third Olsen, Mr. Henry Margido male 28.000000 0 0 C 4001 22.5250 NaN S 0 Mr 0 1
4 806 Perished Third Johansson, Mr. Karl Johan male 31.000000 0 0 347063 7.7750 NaN S 0 Mr 0 1

3. Get and store the summary


In [7]:
summary = get.getSummary(df)

In [8]:
summary.head()


Out[8]:
mean sd median min max n n_na variable
0 446.000000 257.353842 446.000000 1.0000 891.0000 891 0 passengerid
1 29.603610 13.490785 29.328244 0.4200 80.0000 891 0 age
2 0.523008 1.102743 0.000000 0.0000 8.0000 891 0 sibsp
3 0.381594 0.806057 0.000000 0.0000 6.0000 891 0 parch
4 32.689702 49.607298 14.500000 4.0125 512.3292 891 0 fare

In [9]:
store.storeSummary(summary)

In [10]:
## as we can see, the data is actually stored
pull.pullSummary(main.session_id)


Out[10]:
session_id variable mean sd median min max n n_na
0 postgres_titanic_training_20161223_848217 passengerid 446.000000 257.353842 446.000000 1.000000 891.000000 891 0
1 postgres_titanic_training_20161223_848217 age 29.603610 13.490785 29.328244 0.420000 80.000000 891 0
2 postgres_titanic_training_20161223_848217 sibsp 0.523008 1.102743 0.000000 0.000000 8.000000 891 0
3 postgres_titanic_training_20161223_848217 parch 0.381594 0.806057 0.000000 0.000000 6.000000 891 0
4 postgres_titanic_training_20161223_848217 fare 32.689702 49.607298 14.500000 4.012500 512.329200 891 0
5 postgres_titanic_training_20161223_848217 survived_int 0.383838 0.486592 0.000000 0.000000 1.000000 891 0
6 postgres_titanic_training_20161223_848217 family 0.904602 1.613459 0.000000 0.000000 10.000000 891 0
7 postgres_titanic_training_20161223_848217 train 0.749719 0.433418 1.000000 0.000000 1.000000 891 0

4. Create the feature matrix, split it into training and test sets, build a model on the training set, and predict on the test set


In [11]:
# Target column; it keeps df's own row index
y = df['survived_int']
# Feature columns, selected as a new frame and then re-labelled by passenger id.
# After this X and y carry different indexes, so they are aligned by row position only.
X = df.loc[:, ['sex','pclass','embarked','title','age','family']]
X.index = df["passengerid"].tolist()

In [12]:
def preprocess_features(X):
    """One-hot encode the object-dtype columns of ``X`` and pass the rest through.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame. Columns whose dtype is ``object`` are replaced by dummy
        columns named ``<column>_<value>``; all other columns are copied as-is.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index as ``X`` and the columns in their
        original order, each object column expanded in place into its dummy
        columns. ``X`` itself is not modified.
    """
    # Start from an empty frame on X's index so that every join below aligns on it
    output = pd.DataFrame(index = X.index)

    # Walk the columns by position rather than with DataFrame.iteritems(), which was
    # removed in pandas 2.0; positional access works on old and new pandas alike.
    for position, col in enumerate(X.columns):
        col_data = X.iloc[:, position]

        # If data type is categorical (stored as object), convert to dummy variables
        if col_data.dtype == object:
            col_data = pd.get_dummies(col_data, prefix = col)

        # Collect the revised columns
        output = output.join(col_data)
    return output

# Swap the raw feature frame for its encoded version: object columns become dummy columns
X = preprocess_features(X)

In [13]:
# NOTE(review): this seeds Python's stdlib `random` module only; the split below is
# made reproducible by its own random_state=0 argument.
random.seed(0)
# 80% of the rows go to the training set (train_size=0.8); the remainder is the test set
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8,  random_state=0)

In [14]:
# Baseline model: a random forest with default settings and a fixed seed
clf = RandomForestClassifier(random_state=0)
clf.fit(X_train, y_train)
## predict on the test set:
probs = clf.predict_proba(X_test)
# Keep the second probability column (index 1) of every row as the score
score = [row[1] for row in probs]

5. getTestResults; storeTestResults; pull some statistics


In [15]:
test_results = get.getTestResults(score, y_test)
test_results.head(10)


c:\users\vecht499\github\modelfactorypy\modelFactoryPy\get.py:22: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
  a = a.sort('score', ascending = 0)
Out[15]:
score label population target_population true_positives false_positives true_negatives false_negatives
0 1.0 1 0.005587 0.015152 1 0 113 65
1 1.0 1 0.011173 0.030303 2 0 113 64
2 1.0 1 0.016760 0.045455 3 0 113 63
3 1.0 1 0.022346 0.060606 4 0 113 62
4 1.0 1 0.027933 0.075758 5 0 113 61
5 1.0 1 0.033520 0.090909 6 0 113 60
6 1.0 1 0.039106 0.106061 7 0 113 59
7 1.0 1 0.044693 0.121212 8 0 113 58
8 1.0 0 0.050279 0.121212 8 1 112 58
9 1.0 1 0.055866 0.136364 9 1 112 57

In [16]:
store.storeTestResults(test_results)

In [17]:
## as we can see, the data is actually stored
pull.pullTestResults(main.session_id).head()


Out[17]:
session_id score label population target_population true_positives false_positives true_negatives false_negatives
0 postgres_titanic_training_20161223_848217 1.000000 1.000000 0.005587 0.015152 1 0 113 65
1 postgres_titanic_training_20161223_848217 1.000000 1.000000 0.011173 0.030303 2 0 113 64
2 postgres_titanic_training_20161223_848217 1.000000 1.000000 0.016760 0.045455 3 0 113 63
3 postgres_titanic_training_20161223_848217 1.000000 1.000000 0.022346 0.060606 4 0 113 62
4 postgres_titanic_training_20161223_848217 1.000000 1.000000 0.027933 0.075758 5 0 113 61

In [18]:
roc = pull.pullROC(main.session_id)
liftchart = pull.pullLiftChart(main.session_id)

In [19]:
# Draw the ROC curve and the lift chart side by side on explicitly named axes,
# instead of relying on pyplot's implicit "current axes" and throwaway variables.
fig, (ax_roc, ax_lift) = plt.subplots(1, 2, figsize=(10,5))
fig.subplots_adjust(hspace=0.4, wspace=0.2)

ax_roc.plot(roc.false_positive_rate, roc.true_positive_rate)
ax_roc.set_title("ROC curve")
# Axis labels so that each panel can be read on its own
ax_roc.set_xlabel("False positive rate")
ax_roc.set_ylabel("True positive rate")

ax_lift.plot(liftchart.population, liftchart.target_population)
ax_lift.set_title("Liftchart")
ax_lift.set_xlabel("Population")
ax_lift.set_ylabel("Target population")

plt.show()



In [20]:
pull.pullAccuracy(main.session_id, 0.5, 'population')


Out[20]:
0.6927374301675978

In [22]:
pull.pullAccuracy(main.session_id, 0.5, 'probability')


Out[22]:
0.776536312849162

6. Store the scores


In [23]:
store.storeModelScores(X_test.index, score)

In [24]:
## as we can see, the data is actually stored
pull.pullModelScores(main.session_id).head()


Out[24]:
session_id id scores scores_class
0 postgres_titanic_training_20161223_848217 763 0.050000 None
1 postgres_titanic_training_20161223_848217 314 0.000000 None
2 postgres_titanic_training_20161223_848217 112 0.700000 None
3 postgres_titanic_training_20161223_848217 415 0.000000 None
4 postgres_titanic_training_20161223_848217 309 0.333333 None

7. Close the session


In [25]:
main.closeSession()


postgres_titanic_training_20161223_848217

In [26]:
## as we can see, the end time is filled in
# Bind the session id as a query parameter instead of splicing it into the SQL text,
# so that quoting is handled by the database driver. The %(name)s placeholder is the
# style used by the psycopg2 driver behind the postgresql:// engine.
pd.read_sql(
    "select * from model_factory.run_history where session_id = %(session_id)s",
    main.engine,
    params={"session_id": main.session_id}
)


Out[26]:
session_id user_id model_id start_time end_time last_exported
0 postgres_titanic_training_20161223_848217 postgres titanic_training 2016-12-23 17:16:40 2016-12-23 17:16:41 None