In [1]:
import sklearn
In [2]:
from sklearn import datasets
In [3]:
from sklearn.cross_validation import cross_val_score
In [4]:
from sklearn import svm, metrics
In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import RandomizedPCA
from sklearn.ensemble import RandomForestClassifier
In [6]:
import pylab as pl
In [7]:
# Record which scikit-learn version is installed — the APIs used below
# (sklearn.cross_validation, RandomizedPCA, grid_search) are pre-0.18 names.
print sklearn.__version__
In [8]:
# Report the plotting library version.
# FIX: the original line was `print pylab.__version__`, which raises
# NameError — pylab was imported under the alias `pl` (see the earlier
# cell), so the bare name `pylab` is never bound. matplotlib, which pylab
# wraps, carries the canonical version string.
import matplotlib
print(matplotlib.__version__)
In [9]:
# Plotting setup.
# NOTE(review): `import pylab as pl` duplicates the import in an earlier
# cell — harmless, but one copy could be dropped.
import pylab as pl
import numpy as np
# Some nice default configuration for plots
pl.rcParams['figure.figsize'] = 10, 7.5
pl.rcParams['axes.grid'] = True
# Grayscale colormap by default — the digit images below are grayscale.
pl.gray()
In [10]:
from sklearn.datasets import load_digits
In [11]:
# Load the 8x8 handwritten-digits dataset bundled with scikit-learn.
digits = load_digits()
In [12]:
# Display the full Bunch object (data, target, etc.).
digits
Out[12]:
In [13]:
# X: flattened 64-pixel feature vectors; y: integer digit labels.
X = digits.data
y = digits.target
In [14]:
X.shape
Out[14]:
In [15]:
y.shape
Out[15]:
In [45]:
# NOTE(review): execution counts jump here (In[15] -> In[45]) — cells were
# run out of order; re-run top-to-bottom before trusting saved outputs.
n_samples, n_features = X.shape
In [46]:
n_samples, n_features
Out[46]:
In [48]:
# Show six randomly chosen digit images side by side with their true labels.
# (Indentation reconstructed — the export flattened the loop body.)
sample_indices = np.random.permutation(X.shape[0])[:6]
for plot_pos, sample_idx in enumerate(sample_indices):
    pl.subplot(1, 6, plot_pos + 1)
    pl.imshow(X[sample_idx].reshape((8, 8)), interpolation='nearest')
    pl.title("true class: %d" % y[sample_idx])
    pl.xticks(())
    pl.yticks(())
In [19]:
# Scratch cells exploring np.random.permutation on various sequences.
# NOTE(review): the export flattened loop bodies — each `print i,j` line
# belongs indented inside the `for` statement directly above it.
for i, j in enumerate(np.random.permutation([x for x in range(12)])):
print i,j
In [20]:
# Permutation also works on a list of floats (exponential decay values here).
print np.random.permutation([np.exp(-x) for x in np.arange(1,5,.5)])
In [21]:
for i,j in enumerate(np.random.permutation([np.exp(-x) for x in np.arange(1,5,.5)])):
print i,j
In [24]:
for i,j in enumerate(np.random.permutation([x*x for x in np.arange(1,5,.5)])):
print i,j
In [25]:
# A permuted range reshaped into a 5x2 array.
np.random.permutation(10).reshape((5,2))
Out[25]:
In [26]:
X.shape, y.shape
Out[26]:
In [27]:
# Project the 64-dimensional digits onto 2 principal components for plotting.
# NOTE(review): RandomizedPCA was removed in sklearn 0.20 — modern code uses
# PCA(n_components=2, svd_solver='randomized').
%time x_pca= RandomizedPCA(n_components=2).fit_transform(X)
In [28]:
x_pca.shape
Out[28]:
In [29]:
from itertools import cycle
In [30]:
# Seven matplotlib single-letter color codes, cycled over the 10 digit classes.
colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']
In [42]:
# Scatter of the 2D PCA projection, one color per digit class.
# NOTE(review): loop bodies lost their indentation in this export; the
# pl.scatter line belongs inside the `for`. With only 7 colors cycled over
# 10 classes, digits 7-9 reuse 'b', 'g', 'r'.
for i, c in zip(np.unique(y), cycle(colors)):
pl.scatter(x_pca[y==i, 0], x_pca[y==i, 1], c=c, label=str(i), alpha=.8)
pl.legend(loc='upper right')
In [65]:
# Debug cell: inspect the (class, color) pairing and the boolean mask y==i.
for i, c in zip(np.unique(y), cycle(colors)):
print i, c
print y==i
#print x_pca[y==i]
In [60]:
y
Out[60]:
In [61]:
y[:20]
Out[61]:
In [40]:
#printing all 0
pl.scatter(x_pca[y==0,0], x_pca[y==0,1], label='0', c='b', alpha=.5)
pl.legend(loc='upper right')
Out[40]:
In [77]:
from time import sleep  # imported by the original cell; its use is commented out
from itertools import cycle

# Plot the PCA projection with one scatter call per digit class.
# FIX: the original copy-pasted ten pl.scatter calls, each passing label=1,
# so every legend entry would have read "1". A loop with per-class labels
# replaces them; cycling the 7 colors reproduces the original assignment
# exactly (classes 7-9 reuse 'b', 'g', 'r').
colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']
for digit, color in zip(np.unique(y), cycle(colors)):
    pl.scatter(x_pca[y == digit, 0], x_pca[y == digit, 1],
               label=str(digit), c=color, alpha=0.5)
pl.legend(loc='upper right')
Out[77]:
In [80]:
x_pca.shape
Out[80]:
In [81]:
x_pca
Out[81]:
In [83]:
from sklearn import svm
In [85]:
# NOTE(review): fitting and scoring on the SAME data — this accuracy is an
# optimistic estimate, not a generalization measure. The proper train/test
# split follows below.
%time svm.SVC().fit(X,y).score(X,y)
Out[85]:
In [86]:
# NOTE(review): sklearn.cross_validation was renamed sklearn.model_selection
# in 0.18; this import fails on modern scikit-learn.
from sklearn.cross_validation import train_test_split
In [87]:
# Hold out a third of the data for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.33, random_state=0)
In [88]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape
Out[88]:
In [99]:
# RBF-kernel SVC with default C and gamma, scored on the held-out test set.
svc = svm.SVC(kernel='rbf').fit(X_train, y_train)
svc_score = svc.score(X_test, y_test)
print svc_score
In [95]:
print svc
In [96]:
# Same kernel with tuned hyperparameters (C=100, gamma=0.001) for comparison.
svc2 = svm.SVC(kernel='rbf', C=100, gamma=.001).fit(X_train, y_train)
In [97]:
svc2
Out[97]:
In [100]:
svc2_score = svc2.score(X_test, y_test)
In [101]:
print svc2_score
In [125]:
# NOTE(review): `train` and `test` are NOT defined at this point on a fresh
# kernel — they leak from the ShuffleSplit loop in the cell BELOW (this cell
# is In[125], the loop is In[103]). Fails under Restart & Run All.
X[train].shape, X[test].shape
Out[125]:
In [103]:
# Manual cross-validation: 3 random 90/10 splits, refit an RBF SVC on each.
# NOTE(review): indentation was lost in this export — the `random_state=0`
# line is the continuation of the ShuffleSplit call, and the six lines after
# the `for` belong inside the loop body.
from sklearn.cross_validation import ShuffleSplit
cv = ShuffleSplit(n_samples, n_iter=3, test_size=0.1,
random_state=0)
for cv_index, (train, test) in enumerate(cv):
print("# Cross Validation Iteration #%d" % cv_index)
print("train indices: {0}...".format(train[:10]))
print("test indices: {0}...".format(test[:10]))
svc = svm.SVC(kernel="rbf", C=1, gamma=0.001).fit(X[train], y[train])
print("train score: {0:.3f}, test score: {1:.3f}\n".format(
svc.score(X[train], y[train]), svc.score(X[test], y[test])))
In [121]:
# 10-fold shuffle-split CV of a degree-3 polynomial SVC, two parallel jobs.
# NOTE(review): continuation line `random_state=0` lost its indentation in
# this export.
svc = svm.SVC(kernel="poly", degree=3, C=100, gamma=0.001)
cv = ShuffleSplit(n_samples, n_iter=10, test_size=0.1,
random_state=0)
test_scores = cross_val_score(svc, X, y, cv=cv, n_jobs=2)
print test_scores
print "meanscore=" + str(test_scores.mean())
In [105]:
n_samples
Out[105]:
In [127]:
# 10 gamma candidates, log-spaced from 1e-7 to 1e-1.
np.logspace(-7, -1, 10)
Out[127]:
In [129]:
# Validation-curve data: train/test accuracy of SVC(C=10) over a log-spaced
# grid of gamma values, averaged over 5 random 500/500 splits.
# (Nested-loop indentation reconstructed — the export flattened it.)
n_gammas = 10
n_iter = 5
cv = ShuffleSplit(n_samples, n_iter=n_iter, train_size=500, test_size=500,
                  random_state=0)
train_scores = np.zeros((n_gammas, n_iter))
test_scores = np.zeros((n_gammas, n_iter))
gammas = np.logspace(-7, -1, n_gammas)
# `train`/`test` keep their original names: a cell above (In[125]) relies on
# these loop variables leaking into the global namespace.
for gamma_idx, gamma in enumerate(gammas):
    for split_idx, (train, test) in enumerate(cv):
        clf = svm.SVC(C=10, gamma=gamma).fit(X[train], y[train])
        train_scores[gamma_idx, split_idx] = clf.score(X[train], y[train])
        test_scores[gamma_idx, split_idx] = clf.score(X[test], y[test])
In [130]:
cv
Out[130]:
In [131]:
type(cv)
Out[131]:
In [134]:
cv.train_size
Out[134]:
In [138]:
# Plot one train curve (thick blue, lw=5) and one test curve (thin green,
# lw=1) per CV split, against gamma on a log x-axis.
# NOTE(review): the two pl.semilogx lines belong indented inside the `for`;
# the export flattened the loop body.
for i in range(n_iter):
pl.semilogx(gammas, train_scores[:, i], alpha=0.4, lw=5, c='b')
pl.semilogx(gammas, test_scores[:, i], alpha=0.4, lw=1, c='g')
pl.ylabel("score for SVC(C=10, gamma=gamma)")
pl.xlabel("gamma")
Out[138]:
In [139]:
# NOTE(review): sklearn.grid_search moved to sklearn.model_selection in 0.18.
from sklearn.grid_search import GridSearchCV
In [144]:
from pprint import pprint
# 4x5 = 20 (C, gamma) combinations to search.
svc_params = {
'C': np.logspace(-1, 2, 4),
'gamma': np.logspace(-4, 0,5),
}
pprint(svc_params)
In [172]:
# Subsample the training set to keep the grid search fast.
n_subsamples = 1000
X_small_train, y_small_train = X_train[:n_subsamples], y_train[:n_subsamples]
In [173]:
X_small_train.shape, y_small_train.shape
Out[173]:
In [174]:
# Exhaustive search over svc_params using all cores.
# NOTE(review): cv=14 folds on 1000 samples is unusually high — presumably
# intentional, but worth confirming; cv=5 or 10 is the common choice.
gs_svc = GridSearchCV(svm.SVC(), svc_params, cv=14, n_jobs=-1)
%time _ = gs_svc.fit(X_small_train, y_small_train)
In [175]:
gs_svc.best_estimator_
Out[175]:
In [176]:
gs_svc.best_params_, gs_svc.best_score_
Out[176]:
In [177]:
# FIX: these helpers use `sem`, which the notebook only imported in a LATER
# cell (In[178]) — on a fresh kernel this cell raised NameError. Import it
# here so the cell is self-contained. (Indentation also reconstructed; the
# export had flattened both function bodies.)
from scipy.stats import sem


def display_scores(params, scores, append_star=False):
    """Format the mean score +/- standard error for one parameter setting.

    Parameters
    ----------
    params : dict
        Parameter-name -> value mapping for this grid point.
    scores : array-like of float
        Cross-validation scores for this grid point.
    append_star : bool, optional
        If True, append " *" to mark a model whose score interval overlaps
        the best model's.

    Returns
    -------
    str
        A single report line, e.g. "C=1:<TAB>0.900 (+/-0.058)".
    """
    params = ", ".join("{0}={1}".format(k, v)
                       for k, v in params.items())
    line = "{0}:\t{1:.3f} (+/-{2:.3f})".format(
        params, np.mean(scores), sem(scores))
    if append_star:
        line += " *"
    return line


def display_grid_scores(grid_scores, top=None):
    """Print a report of grid-search scores, best mean score first.

    Parameters
    ----------
    grid_scores : iterable of (params, mean_score, scores) tuples
        As produced by GridSearchCV.grid_scores_ (sklearn < 0.18).
    top : int or None, optional
        If given, only the `top` best entries are printed.
    """
    grid_scores = sorted(grid_scores, key=lambda x: x[1], reverse=True)
    if top is not None:
        grid_scores = grid_scores[:top]

    # Star every model whose mean + 2 stderr reaches within 2 stderr of the
    # best model's mean (i.e. its score interval overlaps the best).
    _, best_mean, best_scores = grid_scores[0]
    threshold = best_mean - 2 * sem(best_scores)

    for params, mean_score, scores in grid_scores:
        append_star = mean_score + 2 * sem(scores) > threshold
        print(display_scores(params, scores, append_star=append_star))
In [178]:
# NOTE(review): this import is also required by display_scores /
# display_grid_scores in the PREVIOUS cell — on a fresh kernel this cell
# must run before they are called. The `def` body below lost its
# indentation in this export.
from scipy.stats import sem #standard error mean
# Returns the formatted string (despite the original docstring saying
# "Print"); defined here but not called in this notebook.
def mean_score(scores):
"""Return the empirical mean score and standard error of the mean."""
return ("Mean score: {0:.3f} (+/-{1:.3f})").format(
np.mean(scores), sem(scores))
display_grid_scores(gs_svc.grid_scores_, top=20)
In [1]:
import pandas as pd
In [ ]: