In [35]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
In [2]:
# Load scikit-learn's bundled breast cancer dataset. The returned object is
# read by key ('data', 'target', 'target_names') in the following cell.
resp = load_breast_cancer()
In [7]:
# Pull the feature matrix, the label vector and the label names out of the
# loaded dataset object in a single unpacking statement.
data, target, target_names = (
    resp[field] for field in ('data', 'target', 'target_names')
)
In [11]:
# Hold out 20% of the samples for the final evaluation.
# - random_state pins the shuffle so that Restart & Run All reproduces the
#   same split (and therefore the same scores) every time.
# - stratify=target keeps the class proportions the same in the train and
#   test partitions, which matters for a classification target.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=42,
    stratify=target,
)
In [15]:
# Report the shape of every partition, one line per array.
for split_label, split_array in (
    ('X_train:', X_train),
    ('X_test:', X_test),
    ('y_train:', y_train),
    ('y_test:', y_test),
):
    print(split_label, split_array.shape)
In [18]:
# Base random forest estimator whose hyperparameters are tuned by the grid
# search below. A fixed random_state makes the bootstrap sampling and feature
# selection repeatable, so the search picks the same parameters on every run.
rf = RandomForestClassifier(random_state=42)
In [22]:
# Candidate values for each hyperparameter explored by the grid search.
tree_counts = [1, 5, 10, 30, 50]
split_criteria = ['gini', 'entropy']
depth_limits = [1, 10, 50, 500]

# Search space: every combination of the candidate lists above is evaluated.
parameters = {
    'n_estimators': tree_counts,
    'criterion': split_criteria,
    'max_depth': depth_limits,
}
In [26]:
# Evaluate every combination in `parameters` with cross-validation, ranking
# the candidates by accuracy.
GridSearch = GridSearchCV(
    estimator=rf,
    param_grid=parameters,
    scoring='accuracy',
)
# fit() stays the final expression of the cell so the fitted search object is
# still echoed as the cell's output.
GridSearch.fit(X_train, y_train)
Out[26]:
In [27]:
# Display the hyperparameter combination the grid search selected, i.e. the
# candidate with the best cross-validated accuracy on the training split.
GridSearch.best_params_
Out[27]:
In [40]:
# Predict the held-out test split with the model chosen by the grid search,
# then report the share of correct predictions as a percentage.
predicted = GridSearch.predict(X_test)
test_accuracy = accuracy_score(y_test, predicted)
print("Accuracy of test set: %.2f%%" % (100*test_accuracy))