In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [2]:
from sklearn.datasets import load_breast_cancer
# Load scikit-learn's bundled breast-cancer dataset; later cells read its
# 'data', 'target' and 'feature_names' entries.
cancer = load_breast_cancer()
In [3]:
# Show which entries the loaded dataset object exposes.
cancer.keys()
Out[3]:
In [4]:
# Names used as the column labels of the feature frame built next.
cancer['feature_names']
Out[4]:
In [5]:
# Feature matrix as a labelled DataFrame (one column per feature name).
df_in = pd.DataFrame(
    data=cancer.data,
    columns=cancer.feature_names,
)
In [6]:
# Class labels as a single-column DataFrame named 'Cancer'.
df_target = pd.DataFrame({'Cancer': cancer.target})
In [7]:
# Preview the first rows of the feature frame.
df_in.head()
Out[7]:
In [8]:
from sklearn.model_selection import train_test_split
In [9]:
# 70/30 hold-out split with a fixed seed. The single-column target frame is
# flattened to a 1-D array so the estimators receive a plain label vector.
X_train, X_test, y_train, y_test = train_test_split(
    df_in,
    df_target.values.ravel(),
    test_size=0.30,
    random_state=101,
)
In [10]:
from sklearn.svm import SVC
In [11]:
# Baseline RBF-kernel SVM trained on the raw, unscaled features.
# gamma is pinned to 'auto' (1 / n_features), which was the library default
# before scikit-learn 0.22. The narrative that follows (every sample predicted
# as one class) describes that setting; newer releases default to 'scale' and
# would silently produce a different baseline.
model = SVC(kernel='rbf', gamma='auto')
In [12]:
# Train the baseline SVM on the training split.
model.fit(X_train,y_train)
Out[12]:
In [13]:
# Predicted labels for the held-out split, used by the reports below.
predictions = model.predict(X_test)
In [14]:
from sklearn.metrics import classification_report,confusion_matrix
In [15]:
# Confusion matrix of true vs. predicted labels for the baseline SVM.
print(confusion_matrix(y_test,predictions))
In [16]:
# Per-class precision / recall / F1 for the baseline SVM.
print(classification_report(y_test,predictions))
Woah! Notice that we are classifying everything into a single class! This means our model needs to have its parameters adjusted (it may also help to normalize the data).
We can search for parameters using a GridSearch!
Finding the right parameters (like what C or gamma values to use) is a tricky task! But luckily, we can be a little lazy and just try a bunch of combinations and see what works best! This idea of creating a 'grid' of parameters and trying out all the possible combinations is called a grid search. The method is common enough that Scikit-learn has this functionality built in with GridSearchCV! The CV stands for cross-validation, which is the procedure used to score each parameter combination on held-out folds of the training data.
GridSearchCV takes a dictionary that describes the parameters that should be tried and a model to train. The grid of parameters is defined as a dictionary, where the keys are the parameters and the values are the settings to be tested.
In [17]:
# Search space for the SVM: every C value is paired with every gamma value,
# always with the RBF kernel (5 x 5 = 25 candidate settings).
param_grid = dict(
    C=[0.1, 1, 10, 100, 1000],
    gamma=[1, 0.1, 0.01, 0.001, 0.0001],
    kernel=['rbf'],
)
In [18]:
from sklearn.model_selection import GridSearchCV
In [19]:
# Exhaustive cross-validated search over param_grid. refit=True retrains the
# best setting on the full training split; verbose=3 logs each fit.
grid = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    refit=True,
    verbose=3,
)
In [20]:
# Run the search: fits one SVC per parameter combination and CV fold.
grid.fit(X_train,y_train)
Out[20]:
In [21]:
# Parameter combination that scored best during the search.
grid.best_params_
Out[21]:
In [22]:
# The SVC refit with the best parameters (available because refit=True).
grid.best_estimator_
Out[22]:
In [23]:
# Held-out predictions from the tuned model selected by the grid search.
grid_predictions = grid.predict(X_test)
In [24]:
# Confusion matrix for the tuned SVM on the held-out split.
print(confusion_matrix(y_test,grid_predictions))
In [25]:
# Per-class precision / recall / F1 for the tuned SVM.
print(classification_report(y_test,grid_predictions))
In [26]:
from sklearn.tree import DecisionTreeClassifier
In [27]:
# Default-parameter decision tree. random_state is fixed because the
# classifier permutes features at every split, so tied splits (and therefore
# the fitted tree and its rendering) can otherwise differ between runs.
dtree = DecisionTreeClassifier(random_state=101)
In [28]:
# Grow the tree on the training split.
dtree.fit(X_train,y_train)
Out[28]:
In [29]:
# Held-out predictions from the first decision tree (dtree).
treepredictop = dtree.predict(X_test)
In [30]:
# Per-class precision / recall / F1 for the first decision tree.
print(classification_report(y_test,treepredictop))
In [31]:
# Keep the training-frame column labels; they are passed as feature_names
# when the trees are exported for plotting.
features = X_train.columns
features
Out[31]:
In [32]:
# Helpers for rendering the fitted trees as images.
# StringIO comes from the standard library: the vendored
# sklearn.externals.six module has been removed from scikit-learn, so the
# old import raises ImportError on current releases.
from io import StringIO

from IPython.display import Image as image
from sklearn.tree import export_graphviz
import pydot
from PIL import Image
In [33]:
# Serialise the first tree to Graphviz DOT text in an in-memory buffer.
dot_data = StringIO()
export_graphviz(
    decision_tree=dtree,
    out_file=dot_data,
    feature_names=features,
    filled=True,
    rounded=True,
)
In [34]:
# Parse the buffered DOT text; the result is indexed like a list in the next cell.
graph = pydot.graph_from_dot_data(dot_data.getvalue())
In [35]:
# Render the first parsed graph to PNG bytes and display it inline.
# NOTE(review): create_png presumably needs the Graphviz `dot` binary on PATH — confirm.
image(graph[0].create_png())
Out[35]:
In [36]:
# Second tree, pre-pruned: a node is only split if it holds at least 50
# samples. random_state is fixed so the fitted tree (and its plot) is
# reproducible, since tied candidate splits are otherwise broken randomly.
dtree2 = DecisionTreeClassifier(min_samples_split=50, random_state=101)
In [37]:
# Grow the pre-pruned tree on the training split.
dtree2.fit(X_train, y_train)
Out[37]:
In [38]:
# Held-out predictions from the pre-pruned tree (dtree2).
tree2predictop = dtree2.predict(X_test)
In [39]:
# Evaluate dtree2 with its own predictions. The original passed
# `treepredictop` (the first tree's output), so it re-printed the first
# tree's report instead of the pruned tree's.
print(classification_report(y_test, tree2predictop))
In [40]:
# Serialise the pre-pruned tree to Graphviz DOT text in a fresh buffer.
dot_data = StringIO()
export_graphviz(
    decision_tree=dtree2,
    out_file=dot_data,
    feature_names=features,
    filled=True,
    rounded=True,
)
In [41]:
# Parse the DOT text for the pre-pruned tree (rebinds `graph`).
graph = pydot.graph_from_dot_data(dot_data.getvalue())
In [42]:
# Render the first parsed graph to PNG bytes and display it inline.
image(graph[0].create_png())
Out[42]:
In [43]:
# Reload the dataset as plain (X, y) arrays for the ensemble models.
# NOTE: this rebinds X_train / X_test / y_train / y_test from the earlier
# DataFrame-based split.
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
X, y = load_breast_cancer(return_X_y=True)
# Seed the shuffle so the ensemble scores are reproducible on re-run
# (same seed as the first split in this notebook).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)
In [44]:
from sklearn.ensemble import RandomForestClassifier
# 50-tree random forest. Bootstrap sampling and per-split feature sub-sampling
# are random, so random_state is fixed to make the reported score repeatable.
model = RandomForestClassifier(n_estimators=50, min_samples_split=2, random_state=101)
In [45]:
# Train the forest; the trailing ';' suppresses the estimator repr in the output.
model.fit(X_train, y_train);
In [46]:
# Mean accuracy of the random forest on the held-out split.
model.score(X_test, y_test)
Out[46]:
In [47]:
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost with 100 boosting rounds. random_state is fixed so the base
# learners are seeded consistently and the reported score is repeatable.
# NOTE: this rebinds `model`, replacing the random forest.
model = AdaBoostClassifier(n_estimators=100, random_state=101)
In [48]:
# Train the boosted ensemble; the trailing ';' suppresses the estimator repr.
model.fit(X_train, y_train);
In [49]:
# Mean accuracy of the AdaBoost model on the held-out split.
model.score(X_test, y_test)
Out[49]:
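Note: `sklearn.ensemble` also provides regressor counterparts of these models (e.g. `RandomForestRegressor` and `AdaBoostRegressor`) for regression problems.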