In [1]:
from Stacked_Generalization.stackgen import StackGen

In [2]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB 
from sklearn.model_selection import train_test_split
import numpy as np

# Load the CTG dataset; drop the header row
data = np.genfromtxt('CTG_Dataset.csv', delimiter=',')

X = data[1:, 0:21]             # first 21 columns are the features
Y = data[1:, 21:22].flatten()  # last column holds the class labels

X_TR, X_TE, y_TR, y_TE = train_test_split(X, Y, test_size=0.3, stratify=Y, random_state=9)
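A quick sanity check (a sketch, not from the original run): because the split is stratified on Y, the class proportions in y_TR and y_TE should closely match those of the full label vector.

# Illustrative check that stratification preserved the class proportions
for name, labels in (('Y', Y), ('y_TR', y_TR), ('y_TE', y_TE)):
    classes, counts = np.unique(labels, return_counts=True)
    print(name, dict(zip(classes, np.round(counts / counts.sum(), 3))))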

In [4]:
# Averaging the base models' out-of-sample (OOS) predictions (no stacking model)

stacked_classifier = StackGen(base_models=[KNeighborsClassifier(n_neighbors=10),
                                           LogisticRegression(solver='newton-cg'),
                                           GaussianNB()],
                              stacker=None,
                              classification=True,
                              n_folds=5,
                              stratified=True,
                              kf_random_state=9,
                              stack_with_orig=False,
                              save_results=0)
final_result = stacked_classifier.fit_predict(X_TR, y_TR, X_TE, y_TE)  # Log loss - OOS error --->  0.261039325565


### Fitting model KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=10, p=2,
           weights='uniform') ###
Fold # of CV -> 1
Error for fold 1 is 0.815280754594
Fold # of CV -> 2
Error for fold 2 is 0.886837877026
Fold # of CV -> 3
Error for fold 3 is 0.58497288391
Fold # of CV -> 4
Error for fold 4 is 0.936253446539
Fold # of CV -> 5
Error for fold 5 is 0.810556077683
Average CV error is  0.80678020795
OOS error --->  0.339359399417

### Fitting model LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='newton-cg', tol=0.0001,
          verbose=0, warm_start=False) ###
Fold # of CV -> 1
Error for fold 1 is 0.368839670768
Fold # of CV -> 2
Error for fold 2 is 0.314639127104
Fold # of CV -> 3
Error for fold 3 is 0.299289908198
Fold # of CV -> 4
Error for fold 4 is 0.37031742597
Fold # of CV -> 5
Error for fold 5 is 0.314993716533
Average CV error is  0.333615969715
OOS error --->  0.292313606775

### Fitting model GaussianNB(priors=None) ###
Fold # of CV -> 1
Error for fold 1 is 1.39569120634
Fold # of CV -> 2
Error for fold 2 is 1.58689959906
Fold # of CV -> 3
Error for fold 3 is 1.2151105327
Fold # of CV -> 4
Error for fold 4 is 1.68324715717
Fold # of CV -> 5
Error for fold 5 is 1.12370852994
Average CV error is  1.40093140504
OOS error --->  1.07114753643

Stacking model not provided. Averaging predictions of base models...
OOS error --->  0.261039325565
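With stacker=None the final prediction is simply the mean of the base models' OOS class-probability matrices. A minimal sketch of that averaging using sklearn's log_loss; the numbers above come from StackGen's own out-of-fold procedure, so this hand-rolled version is only an approximation:

from sklearn.metrics import log_loss

# Fit each base model on the full training split, average their predicted
# class probabilities on X_TE, and score the averaged ensemble with log loss.
models = [KNeighborsClassifier(n_neighbors=10),
          LogisticRegression(solver='newton-cg'),
          GaussianNB()]
avg_proba = np.mean([m.fit(X_TR, y_TR).predict_proba(X_TE) for m in models], axis=0)
print(log_loss(y_TE, avg_proba))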

In [5]:
# Stacking with a Random Forest classifier (meta-features only)

stacked_classifier = StackGen(base_models=[KNeighborsClassifier(n_neighbors=10),
                                           LogisticRegression(solver='newton-cg'),
                                           GaussianNB()],
                              stacker=RandomForestClassifier(random_state=9),
                              classification=True,
                              n_folds=5,
                              stratified=True,
                              kf_random_state=9,
                              stack_with_orig=False,
                              save_results=0)

_ = stacked_classifier.fit_predict(X_TR, y_TR, X_TE, y_TE)  # Log loss - OOS error --->  0.219686600977


### Fitting model KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=10, p=2,
           weights='uniform') ###
Fold # of CV -> 1
Error for fold 1 is 0.815280754594
Fold # of CV -> 2
Error for fold 2 is 0.886837877026
Fold # of CV -> 3
Error for fold 3 is 0.58497288391
Fold # of CV -> 4
Error for fold 4 is 0.936253446539
Fold # of CV -> 5
Error for fold 5 is 0.810556077683
Average CV error is  0.80678020795
OOS error --->  0.339359399417

### Fitting model LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='newton-cg', tol=0.0001,
          verbose=0, warm_start=False) ###
Fold # of CV -> 1
Error for fold 1 is 0.368839670768
Fold # of CV -> 2
Error for fold 2 is 0.314639127104
Fold # of CV -> 3
Error for fold 3 is 0.299289908198
Fold # of CV -> 4
Error for fold 4 is 0.37031742597
Fold # of CV -> 5
Error for fold 5 is 0.314993716533
Average CV error is  0.333615969715
OOS error --->  0.292313606775

### Fitting model GaussianNB(priors=None) ###
Fold # of CV -> 1
Error for fold 1 is 1.39569120634
Fold # of CV -> 2
Error for fold 2 is 1.58689959906
Fold # of CV -> 3
Error for fold 3 is 1.2151105327
Fold # of CV -> 4
Error for fold 4 is 1.68324715717
Fold # of CV -> 5
Error for fold 5 is 1.12370852994
Average CV error is  1.40093140504
OOS error --->  1.07114753643

Stacking base models using RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=9,
            verbose=0, warm_start=False)  ----> 
### Fitting model RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=9,
            verbose=0, warm_start=False) ###
Fold # of CV -> 1
Error for fold 1 is 1.01362771683
Fold # of CV -> 2
Error for fold 2 is 1.07915666945
Fold # of CV -> 3
Error for fold 3 is 1.11400300747
Fold # of CV -> 4
Error for fold 4 is 0.957715272867
Fold # of CV -> 5
Error for fold 5 is 0.791825135929
Average CV error is  0.99126556051
OOS error --->  0.219686600977
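What the stacker is trained on: for each base model, StackGen collects out-of-fold class probabilities on the training set and concatenates them column-wise into a meta-feature matrix. A rough equivalent with cross_val_predict (a sketch only; StackGen's stratified folds and fixed seed mean the exact values differ):

from sklearn.model_selection import cross_val_predict

# Out-of-fold class probabilities from each base model, stacked column-wise
meta_train = np.hstack([
    cross_val_predict(model, X_TR, y_TR, cv=5, method='predict_proba')
    for model in (KNeighborsClassifier(n_neighbors=10),
                  LogisticRegression(solver='newton-cg'),
                  GaussianNB())
])
print(meta_train.shape)  # (n_train_samples, 3 models x 3 classes)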


In [6]:
# Stacking with a Random Forest classifier (original features horizontally stacked with the meta-features)

stacked_classifier = StackGen(base_models=[KNeighborsClassifier(n_neighbors=10),
                                           LogisticRegression(solver='newton-cg'),
                                           GaussianNB()],
                              stacker=RandomForestClassifier(random_state=9),
                              classification=True,
                              n_folds=5,
                              stratified=True,
                              kf_random_state=9,
                              stack_with_orig=True,
                              save_results=0)

_ = stacked_classifier.fit_predict(X_TR, y_TR, X_TE, y_TE)  # Log loss - OOS error --->  0.160108856564


### Fitting model KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=10, p=2,
           weights='uniform') ###
Fold # of CV -> 1
Error for fold 1 is 0.815280754594
Fold # of CV -> 2
Error for fold 2 is 0.886837877026
Fold # of CV -> 3
Error for fold 3 is 0.58497288391
Fold # of CV -> 4
Error for fold 4 is 0.936253446539
Fold # of CV -> 5
Error for fold 5 is 0.810556077683
Average CV error is  0.80678020795
OOS error --->  0.339359399417

### Fitting model LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='newton-cg', tol=0.0001,
          verbose=0, warm_start=False) ###
Fold # of CV -> 1
Error for fold 1 is 0.368839670768
Fold # of CV -> 2
Error for fold 2 is 0.314639127104
Fold # of CV -> 3
Error for fold 3 is 0.299289908198
Fold # of CV -> 4
Error for fold 4 is 0.37031742597
Fold # of CV -> 5
Error for fold 5 is 0.314993716533
Average CV error is  0.333615969715
OOS error --->  0.292313606775

### Fitting model GaussianNB(priors=None) ###
Fold # of CV -> 1
Error for fold 1 is 1.39569120634
Fold # of CV -> 2
Error for fold 2 is 1.58689959906
Fold # of CV -> 3
Error for fold 3 is 1.2151105327
Fold # of CV -> 4
Error for fold 4 is 1.68324715717
Fold # of CV -> 5
Error for fold 5 is 1.12370852994
Average CV error is  1.40093140504
OOS error --->  1.07114753643

Stacking base models using RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=9,
            verbose=0, warm_start=False)  ----> 
### Fitting model RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=9,
            verbose=0, warm_start=False) ###
Fold # of CV -> 1
Error for fold 1 is 0.433796725218
Fold # of CV -> 2
Error for fold 2 is 0.505127592684
Fold # of CV -> 3
Error for fold 3 is 0.406357108506
Fold # of CV -> 4
Error for fold 4 is 0.240421699915
Fold # of CV -> 5
Error for fold 5 is 0.408243265768
Average CV error is  0.398789278418
OOS error --->  0.160108856564
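With stack_with_orig=True the stacker instead sees the original features horizontally concatenated with those meta-features. Conceptually (reusing meta_train from the sketch above):

# Original 21 features side by side with the 9 meta-feature columns
stacker_input = np.hstack([X_TR, meta_train])
print(stacker_input.shape)  # (n_train_samples, 21 + 9)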


In [7]:
# To save results to disk, set save_results to 1, 2, or 3 (see the docs for the save types)

stacked_classifier = StackGen(base_models=[KNeighborsClassifier(n_neighbors=10),
                                           LogisticRegression(solver='newton-cg'),
                                           GaussianNB()],
                              stacker=RandomForestClassifier(random_state=9),
                              classification=True,
                              n_folds=5,
                              stratified=True,
                              kf_random_state=9,
                              stack_with_orig=True,
                              save_results=1)

_ = stacked_classifier.fit_predict(X_TR, y_TR, X_TE, y_TE)  # Log loss - OOS error --->  0.160108856564


### Fitting model KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=10, p=2,
           weights='uniform') ###
Fold # of CV -> 1
Error for fold 1 is 0.815280754594
Fold # of CV -> 2
Error for fold 2 is 0.886837877026
Fold # of CV -> 3
Error for fold 3 is 0.58497288391
Fold # of CV -> 4
Error for fold 4 is 0.936253446539
Fold # of CV -> 5
Error for fold 5 is 0.810556077683
Average CV error is  0.80678020795
OOS error --->  0.339359399417

### Fitting model LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='newton-cg', tol=0.0001,
          verbose=0, warm_start=False) ###
Fold # of CV -> 1
Error for fold 1 is 0.368839670768
Fold # of CV -> 2
Error for fold 2 is 0.314639127104
Fold # of CV -> 3
Error for fold 3 is 0.299289908198
Fold # of CV -> 4
Error for fold 4 is 0.37031742597
Fold # of CV -> 5
Error for fold 5 is 0.314993716533
Average CV error is  0.333615969715
OOS error --->  0.292313606775

### Fitting model GaussianNB(priors=None) ###
Fold # of CV -> 1
Error for fold 1 is 1.39569120634
Fold # of CV -> 2
Error for fold 2 is 1.58689959906
Fold # of CV -> 3
Error for fold 3 is 1.2151105327
Fold # of CV -> 4
Error for fold 4 is 1.68324715717
Fold # of CV -> 5
Error for fold 5 is 1.12370852994
Average CV error is  1.40093140504
OOS error --->  1.07114753643

Stacking base models using RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=9,
            verbose=0, warm_start=False)  ----> 
### Fitting model RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=9,
            verbose=0, warm_start=False) ###
Fold # of CV -> 1
Error for fold 1 is 0.433796725218
Fold # of CV -> 2
Error for fold 2 is 0.505127592684
Fold # of CV -> 3
Error for fold 3 is 0.406357108506
Fold # of CV -> 4
Error for fold 4 is 0.240421699915
Fold # of CV -> 5
Error for fold 5 is 0.408243265768
Average CV error is  0.398789278418
OOS error --->  0.160108856564

Results have been saved to disk!

In [8]:
# Load saved results from disk using joblib.
# Results are stored under the following naming pattern:
# StackGenResults_[list of base models]_stk-[stacking model]_[yyyy-mm-dd_hh-mm]_[savetype].pkl

from sklearn.externals import joblib  # in newer scikit-learn releases, use `import joblib` instead
saved_results = joblib.load('StackGenResults_KNeighborsClassifier_LogisticRegression_GaussianNB_stk-RandomForestClassifier_2017-07-09_19-58_savetype1.pkl')

In [9]:
saved_results


Out[9]:
[{'RandomForestClassifier_stacker_oos_predictions': array([[ 1.  ,  0.  ,  0.  ],
         [ 1.  ,  0.  ,  0.  ],
         [ 0.22,  0.72,  0.06],
         ..., 
         [ 1.  ,  0.  ,  0.  ],
         [ 0.02,  0.08,  0.9 ],
         [ 1.  ,  0.  ,  0.  ]])}]
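The saved array holds the stacker's class probabilities for each test row, one column per class. Assuming the columns follow the label ordering used during training (an assumption, not documented here), hard predictions can be recovered with argmax:

probs = saved_results[0]['RandomForestClassifier_stacker_oos_predictions']
pred_class_index = np.argmax(probs, axis=1)  # index of the highest-probability class per row (class order assumed)
print(pred_class_index[:10])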

In [ ]: