In [1]:
from sklearn import datasets
from sklearn.utils.validation import check_random_state
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from rgf.sklearn import RGFClassifier
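Note: RGFClassifier comes from the rgf_python package, a scikit-learn-compatible wrapper around the original C++ RGF implementation. A minimal setup sketch, assuming the PyPI package name rgf_python (not part of the original notebook):

# One-time setup from a shell (assumption: installing from PyPI):
#   pip install rgf_python
# The scikit-learn-style estimator then imports as in the cell above:
#   from rgf.sklearn import RGFClassifier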
In [2]:
iris = datasets.load_iris()
rng = check_random_state(0)
perm = rng.permutation(iris.target.size)
iris.data = iris.data[perm]
iris.target = iris.target[perm]
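The permutation above shuffles the 150 Iris samples with a fixed seed so the cross-validation folds below are reproducible. An equivalent sketch using sklearn.utils.shuffle (not used in the original notebook):

from sklearn.utils import shuffle

# Shuffle features and labels in unison, with the same fixed seed as above.
iris.data, iris.target = shuffle(iris.data, iris.target, random_state=0)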
In [3]:
rgf = RGFClassifier(max_leaf=400,
                    algorithm="RGF_Sib",
                    test_interval=100,
                    verbose=True)
gb = GradientBoostingClassifier(n_estimators=20,
                                learning_rate=0.01,
                                subsample=0.6,
                                random_state=rng)
n_folds = 3
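Here max_leaf=400 caps the total number of leaves in the forest, test_interval=100 makes RGF checkpoint a model every 100 leaves (the seq#=1..4 lines in the logs below), algorithm="RGF_Sib" selects RGF with sibling (min-penalty) regularization, and verbose=True produces the training logs that follow. Both estimators implement the standard scikit-learn interface, so hyperparameters can be inspected or changed with the usual get_params/set_params; a quick sketch (not shown in the original notebook):

# Inspect the full hyperparameter dict of either estimator.
print(rgf.get_params())   # e.g. {'algorithm': 'RGF_Sib', 'max_leaf': 400, ...}
# Hypothetical change: grow a larger forest before cross-validating.
# rgf.set_params(max_leaf=800)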
In [4]:
rgf_scores = cross_val_score(rgf,
                             iris.data,
                             iris.target,
                             cv=StratifiedKFold(n_folds))
gb_scores = cross_val_score(gb,
                            iris.data,
                            iris.target,
                            cv=StratifiedKFold(n_folds))
"train":
algorithm=RGF_Sib
train_x_fn=temp/train.data.x
train_y_fn=temp/train.data.y
Log:ON
model_fn_prefix=temp/rgf_classifier_c0
--------------------
Sat May 27 12:13:09 2017: Reading training data ...
Sat May 27 12:13:09 2017: Start ... #train=99
--------------------
Forest-level:
loss=Log
max_leaf_forest=400
max_tree=200
opt_interval=100
test_interval=100
num_tree_search=1
Verbose:ON
memory_policy=Generous
Turning on Force_to_refresh_all
-------------
Training data: 4x99, nonzero_ratio=1; managed as dense data.
-------------
Optimization:
loss=Log
num_iteration_opt=5
reg_L2=0.1
opt_stepsize=0.5
max_delta=1
Tree-level: min_pop=10
Node split: reg_L2=0.1
--------------------
Sat May 27 12:13:09 2017: Calling optimizer with 50 trees and 100 leaves
Sat May 27 12:13:09 2017: Writing model: seq#=1
Sat May 27 12:13:09 2017: Calling optimizer with 100 trees and 200 leaves
Sat May 27 12:13:09 2017: Writing model: seq#=2
Sat May 27 12:13:09 2017: Calling optimizer with 150 trees and 300 leaves
Sat May 27 12:13:09 2017: Writing model: seq#=3
Sat May 27 12:13:09 2017: AzRgforest: #leaf reached max
Sat May 27 12:13:09 2017: Calling optimizer with 200 trees and 400 leaves
Sat May 27 12:13:09 2017: Writing model: seq#=4
Generated 4 model file(s):
temp/rgf_classifier_c0-01
temp/rgf_classifier_c0-02
temp/rgf_classifier_c0-03
temp/rgf_classifier_c0-04
Sat May 27 12:13:09 2017: Done ...
elapsed: 0.109
None
"train":
algorithm=RGF_Sib
train_x_fn=temp/train.data.x
train_y_fn=temp/train.data.y
Log:ON
model_fn_prefix=temp/rgf_classifier_c1
--------------------
Sat May 27 12:13:09 2017: Reading training data ...
Sat May 27 12:13:09 2017: Start ... #train=99
--------------------
Forest-level:
loss=Log
max_leaf_forest=400
max_tree=200
opt_interval=100
test_interval=100
num_tree_search=1
Verbose:ON
memory_policy=Generous
Turning on Force_to_refresh_all
-------------
Training data: 4x99, nonzero_ratio=1; managed as dense data.
-------------
Optimization:
loss=Log
num_iteration_opt=5
reg_L2=0.1
opt_stepsize=0.5
max_delta=1
Tree-level: min_pop=10
Node split: reg_L2=0.1
--------------------
Sat May 27 12:13:09 2017: Calling optimizer with 29 trees and 101 leaves
Sat May 27 12:13:09 2017: Writing model: seq#=1
Sat May 27 12:13:09 2017: Calling optimizer with 57 trees and 200 leaves
Sat May 27 12:13:09 2017: Writing model: seq#=2
Sat May 27 12:13:09 2017: Calling optimizer with 85 trees and 300 leaves
Sat May 27 12:13:09 2017: Writing model: seq#=3
Sat May 27 12:13:09 2017: AzRgforest: #leaf reached max
Sat May 27 12:13:09 2017: Calling optimizer with 115 trees and 401 leaves
Sat May 27 12:13:09 2017: Writing model: seq#=4
Generated 4 model file(s):
temp/rgf_classifier_c1-01
temp/rgf_classifier_c1-02
temp/rgf_classifier_c1-03
temp/rgf_classifier_c1-04
Sat May 27 12:13:09 2017: Done ...
elapsed: 0.094
None
"train":
algorithm=RGF_Sib
train_x_fn=temp/train.data.x
train_y_fn=temp/train.data.y
Log:ON
model_fn_prefix=temp/rgf_classifier_c2
--------------------
Sat May 27 12:13:09 2017: Reading training data ...
Sat May 27 12:13:09 2017: Start ... #train=99
--------------------
Forest-level:
loss=Log
max_leaf_forest=400
max_tree=200
opt_interval=100
test_interval=100
num_tree_search=1
Verbose:ON
memory_policy=Generous
Turning on Force_to_refresh_all
-------------
Training data: 4x99, nonzero_ratio=1; managed as dense data.
-------------
Optimization:
loss=Log
num_iteration_opt=5
reg_L2=0.1
opt_stepsize=0.5
max_delta=1
Tree-level: min_pop=10
Node split: reg_L2=0.1
--------------------
Sat May 27 12:13:09 2017: Calling optimizer with 34 trees and 100 leaves
Sat May 27 12:13:09 2017: Writing model: seq#=1
Sat May 27 12:13:09 2017: Calling optimizer with 71 trees and 200 leaves
Sat May 27 12:13:09 2017: Writing model: seq#=2
Sat May 27 12:13:09 2017: Calling optimizer with 112 trees and 300 leaves
Sat May 27 12:13:09 2017: Writing model: seq#=3
Sat May 27 12:13:09 2017: AzRgforest: #leaf reached max
Sat May 27 12:13:09 2017: Calling optimizer with 152 trees and 400 leaves
Sat May 27 12:13:09 2017: Writing model: seq#=4
Generated 4 model file(s):
temp/rgf_classifier_c2-01
temp/rgf_classifier_c2-02
temp/rgf_classifier_c2-03
temp/rgf_classifier_c2-04
Sat May 27 12:13:09 2017: Done ...
elapsed: 0.109
None
"predict":
model_fn=temp\rgf_classifier_c0-04
test_x_fn=temp/test.data.x
prediction_fn=temp/predictions.txt
Log:ON
--------------------
Sat May 27 12:13:09 2017: Reading test data ...
Sat May 27 12:13:09 2017: Predicting ...
elapsed: 0
temp/predictions.txt: temp\rgf_classifier_c0-04,#leaf=400,#tree=200
Sat May 27 12:13:10 2017: Done ...
None
"predict":
model_fn=temp\rgf_classifier_c1-04
test_x_fn=temp/test.data.x
prediction_fn=temp/predictions.txt
Log:ON
--------------------
Sat May 27 12:13:10 2017: Reading test data ...
Sat May 27 12:13:10 2017: Predicting ...
elapsed: 0
temp/predictions.txt: temp\rgf_classifier_c1-04,#leaf=401,#tree=115
Sat May 27 12:13:10 2017: Done ...
None
"predict":
model_fn=temp\rgf_classifier_c2-04
test_x_fn=temp/test.data.x
prediction_fn=temp/predictions.txt
Log:ON
--------------------
Sat May 27 12:13:10 2017: Reading test data ...
Sat May 27 12:13:10 2017: Predicting ...
elapsed: 0
temp/predictions.txt: temp\rgf_classifier_c2-04,#leaf=400,#tree=152
Sat May 27 12:13:10 2017: Done ...
None
"train":
algorithm=RGF_Sib
train_x_fn=temp/train.data.x
train_y_fn=temp/train.data.y
Log:ON
model_fn_prefix=temp/rgf_classifier_c0
--------------------
Sat May 27 12:13:10 2017: Reading training data ...
Sat May 27 12:13:10 2017: Start ... #train=99
--------------------
Forest-level:
loss=Log
max_leaf_forest=400
max_tree=200
opt_interval=100
test_interval=100
num_tree_search=1
Verbose:ON
memory_policy=Generous
Turning on Force_to_refresh_all
-------------
Training data: 4x99, nonzero_ratio=1; managed as dense data.
-------------
Optimization:
loss=Log
num_iteration_opt=5
reg_L2=0.1
opt_stepsize=0.5
max_delta=1
Tree-level: min_pop=10
Node split: reg_L2=0.1
--------------------
Sat May 27 12:13:10 2017: Calling optimizer with 50 trees and 100 leaves
Sat May 27 12:13:10 2017: Writing model: seq#=1
Sat May 27 12:13:10 2017: Calling optimizer with 100 trees and 200 leaves
Sat May 27 12:13:10 2017: Writing model: seq#=2
Sat May 27 12:13:10 2017: Calling optimizer with 150 trees and 300 leaves
Sat May 27 12:13:10 2017: Writing model: seq#=3
Sat May 27 12:13:10 2017: AzRgforest: #leaf reached max
Sat May 27 12:13:10 2017: Calling optimizer with 200 trees and 400 leaves
Sat May 27 12:13:10 2017: Writing model: seq#=4
Generated 4 model file(s):
temp/rgf_classifier_c0-01
temp/rgf_classifier_c0-02
temp/rgf_classifier_c0-03
temp/rgf_classifier_c0-04
Sat May 27 12:13:10 2017: Done ...
elapsed: 0.109
None
"train":
algorithm=RGF_Sib
train_x_fn=temp/train.data.x
train_y_fn=temp/train.data.y
Log:ON
model_fn_prefix=temp/rgf_classifier_c1
--------------------
Sat May 27 12:13:10 2017: Reading training data ...
Sat May 27 12:13:10 2017: Start ... #train=99
--------------------
Forest-level:
loss=Log
max_leaf_forest=400
max_tree=200
opt_interval=100
test_interval=100
num_tree_search=1
Verbose:ON
memory_policy=Generous
Turning on Force_to_refresh_all
-------------
Training data: 4x99, nonzero_ratio=1; managed as dense data.
-------------
Optimization:
loss=Log
num_iteration_opt=5
reg_L2=0.1
opt_stepsize=0.5
max_delta=1
Tree-level: min_pop=10
Node split: reg_L2=0.1
--------------------
Sat May 27 12:13:10 2017: Calling optimizer with 28 trees and 100 leaves
Sat May 27 12:13:10 2017: Writing model: seq#=1
Sat May 27 12:13:10 2017: Calling optimizer with 57 trees and 200 leaves
Sat May 27 12:13:10 2017: Writing model: seq#=2
Sat May 27 12:13:10 2017: Calling optimizer with 86 trees and 300 leaves
Sat May 27 12:13:10 2017: Writing model: seq#=3
Sat May 27 12:13:10 2017: AzRgforest: #leaf reached max
Sat May 27 12:13:10 2017: Calling optimizer with 115 trees and 400 leaves
Sat May 27 12:13:10 2017: Writing model: seq#=4
Generated 4 model file(s):
temp/rgf_classifier_c1-01
temp/rgf_classifier_c1-02
temp/rgf_classifier_c1-03
temp/rgf_classifier_c1-04
Sat May 27 12:13:10 2017: Done ...
elapsed: 0.062
None
"train":
algorithm=RGF_Sib
train_x_fn=temp/train.data.x
train_y_fn=temp/train.data.y
Log:ON
model_fn_prefix=temp/rgf_classifier_c2
--------------------
Sat May 27 12:13:10 2017: Reading training data ...
Sat May 27 12:13:10 2017: Start ... #train=99
--------------------
Forest-level:
loss=Log
max_leaf_forest=400
max_tree=200
opt_interval=100
test_interval=100
num_tree_search=1
Verbose:ON
memory_policy=Generous
Turning on Force_to_refresh_all
-------------
Training data: 4x99, nonzero_ratio=1; managed as dense data.
-------------
Optimization:
loss=Log
num_iteration_opt=5
reg_L2=0.1
opt_stepsize=0.5
max_delta=1
Tree-level: min_pop=10
Node split: reg_L2=0.1
--------------------
Sat May 27 12:13:10 2017: Calling optimizer with 41 trees and 101 leaves
Sat May 27 12:13:10 2017: Writing model: seq#=1
Sat May 27 12:13:10 2017: Calling optimizer with 83 trees and 200 leaves
Sat May 27 12:13:10 2017: Writing model: seq#=2
Sat May 27 12:13:10 2017: Calling optimizer with 124 trees and 300 leaves
Sat May 27 12:13:10 2017: Writing model: seq#=3
Sat May 27 12:13:10 2017: AzRgforest: #leaf reached max
Sat May 27 12:13:10 2017: Calling optimizer with 165 trees and 400 leaves
Sat May 27 12:13:10 2017: Writing model: seq#=4
Generated 4 model file(s):
temp/rgf_classifier_c2-01
temp/rgf_classifier_c2-02
temp/rgf_classifier_c2-03
temp/rgf_classifier_c2-04
Sat May 27 12:13:10 2017: Done ...
elapsed: 0.094
None
"predict":
model_fn=temp\rgf_classifier_c0-04
test_x_fn=temp/test.data.x
prediction_fn=temp/predictions.txt
Log:ON
--------------------
Sat May 27 12:13:10 2017: Reading test data ...
Sat May 27 12:13:10 2017: Predicting ...
elapsed: 0
temp/predictions.txt: temp\rgf_classifier_c0-04,#leaf=400,#tree=200
Sat May 27 12:13:10 2017: Done ...
None
"predict":
model_fn=temp\rgf_classifier_c1-04
test_x_fn=temp/test.data.x
prediction_fn=temp/predictions.txt
Log:ON
--------------------
Sat May 27 12:13:10 2017: Reading test data ...
Sat May 27 12:13:10 2017: Predicting ...
elapsed: 0
temp/predictions.txt: temp\rgf_classifier_c1-04,#leaf=400,#tree=115
Sat May 27 12:13:10 2017: Done ...
None
"predict":
model_fn=temp\rgf_classifier_c2-04
test_x_fn=temp/test.data.x
prediction_fn=temp/predictions.txt
Log:ON
--------------------
Sat May 27 12:13:11 2017: Reading test data ...
Sat May 27 12:13:11 2017: Predicting ...
elapsed: 0
temp/predictions.txt: temp\rgf_classifier_c2-04,#leaf=400,#tree=165
Sat May 27 12:13:11 2017: Done ...
None
"train":
algorithm=RGF_Sib
train_x_fn=temp/train.data.x
train_y_fn=temp/train.data.y
Log:ON
model_fn_prefix=temp/rgf_classifier_c0
--------------------
Sat May 27 12:13:11 2017: Reading training data ...
Sat May 27 12:13:11 2017: Start ... #train=102
--------------------
Forest-level:
loss=Log
max_leaf_forest=400
max_tree=200
opt_interval=100
test_interval=100
num_tree_search=1
Verbose:ON
memory_policy=Generous
Turning on Force_to_refresh_all
-------------
Training data: 4x102, nonzero_ratio=1; managed as dense data.
-------------
Optimization:
loss=Log
num_iteration_opt=5
reg_L2=0.1
opt_stepsize=0.5
max_delta=1
Tree-level: min_pop=10
Node split: reg_L2=0.1
--------------------
Sat May 27 12:13:11 2017: Calling optimizer with 50 trees and 100 leaves
Sat May 27 12:13:11 2017: Writing model: seq#=1
Sat May 27 12:13:11 2017: Calling optimizer with 100 trees and 200 leaves
Sat May 27 12:13:11 2017: Writing model: seq#=2
Sat May 27 12:13:11 2017: Calling optimizer with 150 trees and 300 leaves
Sat May 27 12:13:11 2017: Writing model: seq#=3
Sat May 27 12:13:11 2017: AzRgforest: #leaf reached max
Sat May 27 12:13:11 2017: Calling optimizer with 200 trees and 400 leaves
Sat May 27 12:13:11 2017: Writing model: seq#=4
Generated 4 model file(s):
temp/rgf_classifier_c0-01
temp/rgf_classifier_c0-02
temp/rgf_classifier_c0-03
temp/rgf_classifier_c0-04
Sat May 27 12:13:11 2017: Done ...
elapsed: 0.078
None
"train":
algorithm=RGF_Sib
train_x_fn=temp/train.data.x
train_y_fn=temp/train.data.y
Log:ON
model_fn_prefix=temp/rgf_classifier_c1
--------------------
Sat May 27 12:13:11 2017: Reading training data ...
Sat May 27 12:13:11 2017: Start ... #train=102
--------------------
Forest-level:
loss=Log
max_leaf_forest=400
max_tree=200
opt_interval=100
test_interval=100
num_tree_search=1
Verbose:ON
memory_policy=Generous
Turning on Force_to_refresh_all
-------------
Training data: 4x102, nonzero_ratio=1; managed as dense data.
-------------
Optimization:
loss=Log
num_iteration_opt=5
reg_L2=0.1
opt_stepsize=0.5
max_delta=1
Tree-level: min_pop=10
Node split: reg_L2=0.1
--------------------
Sat May 27 12:13:11 2017: Calling optimizer with 33 trees and 100 leaves
Sat May 27 12:13:11 2017: Writing model: seq#=1
Sat May 27 12:13:11 2017: Calling optimizer with 63 trees and 200 leaves
Sat May 27 12:13:11 2017: Writing model: seq#=2
Sat May 27 12:13:11 2017: Calling optimizer with 94 trees and 301 leaves
Sat May 27 12:13:11 2017: Writing model: seq#=3
Sat May 27 12:13:11 2017: AzRgforest: #leaf reached max
Sat May 27 12:13:11 2017: Calling optimizer with 125 trees and 400 leaves
Sat May 27 12:13:11 2017: Writing model: seq#=4
Generated 4 model file(s):
temp/rgf_classifier_c1-01
temp/rgf_classifier_c1-02
temp/rgf_classifier_c1-03
temp/rgf_classifier_c1-04
Sat May 27 12:13:11 2017: Done ...
elapsed: 0.062
None
"train":
algorithm=RGF_Sib
train_x_fn=temp/train.data.x
train_y_fn=temp/train.data.y
Log:ON
model_fn_prefix=temp/rgf_classifier_c2
--------------------
Sat May 27 12:13:11 2017: Reading training data ...
Sat May 27 12:13:11 2017: Start ... #train=102
--------------------
Forest-level:
loss=Log
max_leaf_forest=400
max_tree=200
opt_interval=100
test_interval=100
num_tree_search=1
Verbose:ON
memory_policy=Generous
Turning on Force_to_refresh_all
-------------
Training data: 4x102, nonzero_ratio=1; managed as dense data.
-------------
Optimization:
loss=Log
num_iteration_opt=5
reg_L2=0.1
opt_stepsize=0.5
max_delta=1
Tree-level: min_pop=10
Node split: reg_L2=0.1
--------------------
Sat May 27 12:13:11 2017: Calling optimizer with 45 trees and 101 leaves
Sat May 27 12:13:11 2017: Writing model: seq#=1
Sat May 27 12:13:11 2017: Calling optimizer with 86 trees and 200 leaves
Sat May 27 12:13:11 2017: Writing model: seq#=2
Sat May 27 12:13:11 2017: Calling optimizer with 127 trees and 301 leaves
Sat May 27 12:13:11 2017: Writing model: seq#=3
Sat May 27 12:13:11 2017: AzRgforest: #leaf reached max
Sat May 27 12:13:11 2017: Calling optimizer with 169 trees and 401 leaves
Sat May 27 12:13:11 2017: Writing model: seq#=4
Generated 4 model file(s):
temp/rgf_classifier_c2-01
temp/rgf_classifier_c2-02
temp/rgf_classifier_c2-03
temp/rgf_classifier_c2-04
Sat May 27 12:13:11 2017: Done ...
elapsed: 0.078
None
"predict":
model_fn=temp\rgf_classifier_c0-04
test_x_fn=temp/test.data.x
prediction_fn=temp/predictions.txt
Log:ON
--------------------
Sat May 27 12:13:11 2017: Reading test data ...
Sat May 27 12:13:11 2017: Predicting ...
elapsed: 0
temp/predictions.txt: temp\rgf_classifier_c0-04,#leaf=400,#tree=200
Sat May 27 12:13:11 2017: Done ...
None
"predict":
model_fn=temp\rgf_classifier_c1-04
test_x_fn=temp/test.data.x
prediction_fn=temp/predictions.txt
Log:ON
--------------------
Sat May 27 12:13:11 2017: Reading test data ...
Sat May 27 12:13:11 2017: Predicting ...
elapsed: 0.015
temp/predictions.txt: temp\rgf_classifier_c1-04,#leaf=400,#tree=125
Sat May 27 12:13:11 2017: Done ...
None
"predict":
model_fn=temp\rgf_classifier_c2-04
test_x_fn=temp/test.data.x
prediction_fn=temp/predictions.txt
Log:ON
--------------------
Sat May 27 12:13:11 2017: Reading test data ...
Sat May 27 12:13:11 2017: Predicting ...
elapsed: 0
temp/predictions.txt: temp\rgf_classifier_c2-04,#leaf=401,#tree=169
Sat May 27 12:13:11 2017: Done ...
None
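The verbose output above is what cross_val_score triggers under the hood: for each of the 3 stratified folds, rgf_python trains one one-vs-rest binary RGF model per Iris class (the rgf_classifier_c0/_c1/_c2 prefixes) and predicts with the final checkpoint (-04, the model written when the 400-leaf cap was reached). A sketch of the equivalent explicit fold loop, assuming accuracy scoring as cross_val_score uses by default for classifiers:

import numpy as np
from sklearn.base import clone
from sklearn.metrics import accuracy_score

# Reproduce cross_val_score's fold loop by hand.
scores = []
for train_idx, test_idx in StratifiedKFold(n_folds).split(iris.data, iris.target):
    model = clone(rgf).fit(iris.data[train_idx], iris.target[train_idx])
    scores.append(accuracy_score(iris.target[test_idx],
                                 model.predict(iris.data[test_idx])))
print(np.mean(scores))  # should match the averaged rgf_scores below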
In [5]:
rgf_score = sum(rgf_scores)/n_folds
print('RGF Classifier score: {0:.5f}'.format(rgf_score))
gb_score = sum(gb_scores)/n_folds
print('Gradient Boosting Classifier score: {0:.5f}'.format(gb_score))
RGF Classifier score: 0.95997
Gradient Boosting Classifier score: 0.95997
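Because cross_val_score returns a NumPy array, the manual sum(...)/n_folds above can also be written with .mean(); printing the per-fold scores is useful when two models tie on the average, as they do here. A short sketch:

print('RGF per-fold scores:', rgf_scores)            # three accuracies, one per fold
print('RGF mean: {0:.5f}'.format(rgf_scores.mean()))
print('GB mean:  {0:.5f}'.format(gb_scores.mean()))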
Content source: StrikerRUS/rgf_python