In [ ]:
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

Classification

Get some data to play with


In [ ]:
# Load scikit-learn's bundled handwritten-digits dataset (small, ships with
# the library — no download required).
from sklearn.datasets import load_digits
digits = load_digits()
# `digits` is a Bunch (dict-like) object; the bare last expression displays
# its available fields (presumably 'data', 'target', 'images', ... — confirm
# from the rendered output).
digits.keys()

In [ ]:
# Shape of the raw image stack: (n_images, height, width) — presumably
# (1797, 8, 8) for this dataset; verify against the output.
digits.images.shape

In [ ]:
# Raw pixel values of the first sample: a small 2-D array of grayscale
# intensities.
print(digits.images[0])

In [ ]:
# Render the first sample as an image with a greyscale colormap.
plt.matshow(digits.images[0], cmap=plt.cm.Greys)

In [ ]:
# The flattened feature matrix sklearn estimators consume:
# (n_samples, n_features) — each image unrolled into one row vector.
digits.data.shape

In [ ]:
# One class label per sample, so this should match n_samples above.
digits.target.shape

In [ ]:
# The labels themselves: integer digit classes.
digits.target

Data is always a numpy array (or sparse matrix) of shape (n_samples, n_features)

Split the data to get going


In [ ]:
# NOTE: sklearn.cross_validation was deprecated in scikit-learn 0.18 and
# removed in 0.20; train_test_split now lives in sklearn.model_selection
# (identical call signature).
from sklearn.model_selection import train_test_split
# Hold out a test set (default 25%); fixing random_state makes the split —
# and every downstream score — reproducible under Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    digits.data, digits.target, random_state=42)

Really Simple API

0) Import your model class


In [ ]:
from sklearn.svm import LinearSVC

1) Instantiate an object and set the parameters


In [ ]:
# Instantiate a linear support-vector classifier. C is the regularization
# parameter (per the sklearn convention, smaller C means stronger
# regularization — confirm against the LinearSVC docs).
svm = LinearSVC(C=0.1)

2) Fit the model


In [ ]:
# Learn the model parameters from the training split only.
svm.fit(X_train, y_train)

3) Apply / evaluate


In [ ]:
# Predictions on the training data next to the true labels — a quick
# eyeball check that the model learned something.
print(svm.predict(X_train))
print(y_train)

In [ ]:
# Mean accuracy on the data the model was trained on (optimistic estimate).
svm.score(X_train, y_train)

In [ ]:
# Mean accuracy on held-out data — the honest generalization estimate.
svm.score(X_test, y_test)

And again — the same four steps (import, instantiate, fit, evaluate) with a different model: a random forest


In [ ]:
from sklearn.ensemble import RandomForestClassifier

In [ ]:
# A forest of 50 trees. Random forests are stochastic (bootstrap sampling and
# random feature selection), so pin random_state to make the fitted model and
# its scores reproducible across kernel restarts.
rf = RandomForestClassifier(n_estimators=50, random_state=42)

In [ ]:
# Same estimator API as the SVM: fit on the training split.
rf.fit(X_train, y_train)

In [ ]:
# Predicted class labels for the held-out samples.
rf.predict(X_test)

In [ ]:
# Mean accuracy on held-out data, for comparison with the SVM above.
rf.score(X_test, y_test)

Classifier Comparison

Labels Can Be Anything


In [ ]:
numbers = np.array(["zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"])

In [ ]:
# sklearn accepts arbitrary label types: fancy-index the name array with the
# integer targets to get string labels, then fit on those directly.
y_train_string = numbers[y_train]
svm.fit(X_train, y_train_string)

In [ ]:
# Predictions now come back as strings, matching the training label type.
svm.predict(X_test)