notebook.community

Edit and run



In [35]:

    
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix



In [2]:

    
resp = load_breast_cancer()



In [7]:

    
data = resp['data']
target = resp['target']
target_names = resp['target_names']



In [11]:

    
X_train, X_test, y_train, y_test = train_test_split(data,target, test_size=0.2)



In [15]:

    
# see dims
print('X_train:',X_train.shape)
print('X_test:',X_test.shape)
print('y_train:', y_train.shape)
print('y_test:', y_test.shape)









    



X_train: (455, 30)
X_test: (114, 30)
y_train: (455,)
y_test: (114,)



In [18]:

    
# instantiate RF model
rf = RandomForestClassifier()



In [22]:

    
# set params
parameters = {
    'n_estimators': [1,5,10,30,50],
    'criterion': ['gini', 'entropy'],
    'max_depth': [1, 10, 50, 500]
}



In [26]:

    
# use gridsearch
GridSearch = GridSearchCV(rf, parameters, scoring='accuracy')
GridSearch.fit(X_train, y_train)









    Out[26]:





GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [1, 5, 10, 30, 50], 'criterion': ['gini', 'entropy'], 'max_depth': [1, 10, 50, 500]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)



In [27]:

    
# these are best params returned by GridSearch
GridSearch.best_params_









    Out[27]:





{'criterion': 'gini', 'max_depth': 10, 'n_estimators': 30}



In [40]:

    
# use best params model to predict using X_test
predicted = GridSearch.predict(X_test)
print("Accuracy of test set: %.2f%%" % (100*accuracy_score(y_test, predicted)))









    



Accuracy of test set: 95.61%