In [ ]:
# Make print() a function so the print(...) calls in this notebook behave the
# same on Python 2 and Python 3.
from __future__ import print_function
from sklearn import __version__ as sklearn_version
# Report which scikit-learn release is being used.
print('Sklearn version:', sklearn_version)
In [ ]:
from sklearn import datasets
# Load the Iris dataset through datasets.load_iris() and keep it in `iris`
# for the following cells.
iris = datasets.load_iris()
# Print the description text that ships with the dataset.
print(iris.DESCR)
In [ ]:
# Preview the data: the first ten entries of iris.data, followed by the
# complete iris.target array.
print(iris.data[:10])
print(iris.target)
In [ ]:
# Shuffle the samples, then split them into train and test subsets
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

# Fixed random_state values keep both the shuffle and the split reproducible.
X, y = shuffle(iris.data, iris.target, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=0)

# Show the shape of every subset as a quick sanity check.
print(*(subset.shape for subset in (X_train, X_test, y_train, y_test)))
In [ ]:
# Linear model: logistic regression
from sklearn.linear_model import LogisticRegression
# Define the classifier, leaving every constructor argument at its default
clf_logistic = LogisticRegression()
# Fit the classifier on the training subset produced by the split above
clf_logistic.fit(X_train, y_train)
In [ ]:
# Measure how well the logistic-regression model does on the held-out test set
from sklearn.metrics import accuracy_score

# Predicted labels for the test samples
y_test_pred = clf_logistic.predict(X_test)

# Accuracy of the predictions against the true test labels
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)
print('Accuracy test: ', test_accuracy)
In [ ]:
from sklearn import tree
# roc_auc_score is required by the last step of this exercise, but the notebook
# only imports it in a later cell; import it here so the cell works when the
# notebook is executed from top to bottom.
from sklearn.metrics import roc_auc_score
# Exercise: replace each `...` placeholder below with working code.
# Define classifier (use max_depth=3)
clf_tree = ...
# Fit over train data
...
# Evaluate test accuracy with accuracy_score
print('Tree accuracy test: ', ...)
# Evaluate ROC area with roc_auc_score
# Hint: the ROC cell at the end of this notebook shows the inputs used there
# (binarized test labels and predicted class probabilities).
print('Tree average ROC area: ', ...)
In [ ]:
# Configure model: a LinearSVC created with its default constructor arguments
from sklearn import svm
clf_svc = svm.LinearSVC()
# Exercise: fit clf_svc over the training data (replace the placeholder below)
...
# Exercise: compute the accuracy score over the test data (replace the
# placeholder below)
...
In [ ]:
# ROC area of the logistic-regression classifier
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score

# Predicted probabilities for the test samples; show the first five rows
y_test_proba = clf_logistic.predict_proba(X_test)
print(y_test_proba[:5])

# Recode the multiclass labels as binary indicator columns. The binarizer is
# fitted on the training labels and then applied to the test labels.
lb = preprocessing.LabelBinarizer().fit(y_train)
print('Test classes: ', lb.classes_)
y_test_bin = lb.transform(y_test)
print(y_test_bin[:5])

# Score the binarized labels against the predicted probabilities
print('Average ROC area: ',
      roc_auc_score(y_true=y_test_bin, y_score=y_test_proba))