Sklearn basic example

Fit a simple classification model to the Iris dataset


In [ ]:
# Make print() behave as a function on Python 2 kernels as well.
from __future__ import print_function

import sklearn

# Report the installed scikit-learn release used to run this notebook.
sklearn_version = sklearn.__version__
print('Sklearn version:', sklearn_version)

Load data


In [ ]:
from sklearn import datasets

# Load the iris dataset object; later cells read its .data and .target fields.
iris = datasets.load_iris()

# Show the description text that ships with the dataset.
iris_description = iris.DESCR
print(iris_description)

In [ ]:
# Preview the first ten entries of iris.data, then all of iris.target.
first_rows = iris.data[:10]
all_targets = iris.target
print(first_rows)
print(all_targets)

In [ ]:
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

# Randomize the sample order with a fixed seed so the run is repeatable.
X, y = shuffle(iris.data, iris.target, random_state=0)

# Hold out 33% of the shuffled samples as the test set (seeded as well).
split_parts = train_test_split(X, y, test_size=0.33, random_state=0)
X_train, X_test, y_train, y_test = split_parts

# Sanity check: shapes of the four resulting arrays.
print(
    X_train.shape,
    X_test.shape,
    y_train.shape,
    y_test.shape,
)

Linear model


In [ ]:
from sklearn.linear_model import LogisticRegression

# Build the logistic-regression classifier with default settings and train it
# on the training split in one expression (fit() hands back the estimator).
clf_logistic = LogisticRegression().fit(X_train, y_train)

# Leave the fitted estimator as the cell's last expression so it is displayed.
clf_logistic

In [ ]:
from sklearn.metrics import accuracy_score

# Class predictions of the fitted logistic model on the held-out samples.
y_test_pred = clf_logistic.predict(X_test)

# Compare the predictions against the true test labels and report the score.
logistic_test_accuracy = accuracy_score(y_test, y_test_pred)
print('Accuracy test: ', logistic_test_accuracy)

Decision tree

- Build a second model, a decision tree, to compare with the previous linear model
- Print Accuracy and ROC area

In [ ]:
# Exercise cell: every `...` below is a placeholder for the reader to replace.
from sklearn import tree

# Define classifier (use max_depth=3), built from the `tree` module imported above
clf_tree = ...

# Fit over train data (X_train and y_train come from the train/test split cell)
...

# Evaluate test accuracy with accuracy_score
# (accuracy_score is imported in the logistic-regression evaluation cell)
print('Tree accuracy test: ', ...)

# Evaluate ROC area with roc_auc_score
# NOTE: roc_auc_score is imported, and the binarized labels y_test_bin are built,
# in the "ROC area" cell further down this notebook; run that cell first (or
# import/build them here) so this line works on a top-to-bottom run.
print('Tree average ROC area: ', ...)

Test another classifier

- Based on the Sklearn algorithm cheat-sheet http://scikit-learn.org/stable/tutorial/machine_learning_map/
- Linear SVC: http://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC 

In [ ]:
# Exercise cell: the bare `...` lines below are placeholders for the reader to replace.
# Configure model
from sklearn import svm
clf_svc = svm.LinearSVC()  # no arguments passed, so the estimator's defaults apply

# Fit over train (X_train and y_train come from the train/test split cell)
...

# Accuracy score over test (accuracy_score is imported in an earlier cell)
...

ROC area


In [ ]:
# ROC area of the logistic model on the test split
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score

# Class probabilities predicted for the test samples; preview the first five.
y_test_proba = clf_logistic.predict_proba(X_test)
print(y_test_proba[:5])

# Recode the multiclass labels into binary labels, using a binarizer
# fitted on the training labels.
lb = preprocessing.LabelBinarizer()
lb.fit(y_train)
print('Test classes: ', lb.classes_)

# Apply the fitted binarizer to the test labels; preview the first five rows.
y_test_bin = lb.transform(y_test)
print(y_test_bin[:5])

# Score the predicted probabilities against the binarized test labels.
average_roc_area = roc_auc_score(y_test_bin, y_test_proba)
print('Average ROC area: ', average_roc_area)