The following are excerpts from the scikit-learn tutorial.
In [31]:
from sklearn import datasets
iris = datasets.load_iris()
digits = datasets.load_digits()
In [102]:
iris.data[10]
Out[102]:
In [103]:
iris.target
Out[103]:
In [32]:
print(digits.data)
In [33]:
digits.target
Out[33]:
In [34]:
digits.images[0]
Out[34]:
In [35]:
from sklearn import svm
clf = svm.SVC(gamma=0.001, C=100.)
In [104]:
clf.fit(digits.data[:-1], digits.target[:-1])
Out[104]:
In [37]:
clf.predict(digits.data[-1:])  # predict expects a 2-D array, hence the slice
Out[37]:
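To see whether the prediction for the held-out image is plausible, it can be compared against the true label that was left out of training (a small check added to the transcript, not part of the original output):
In [ ]:
# Prediction for the last image versus its ground-truth label.
print(clf.predict(digits.data[-1:]))
print(digits.target[-1])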
In [38]:
from sklearn import svm
from sklearn import datasets
clf = svm.SVC()
iris = datasets.load_iris()
X, y = iris.data, iris.target
clf.fit(X, y)
Out[38]:
In [39]:
import pickle
s = pickle.dumps(clf)
clf2 = pickle.loads(s)
clf2.predict(X[:1])  # predict expects a 2-D array
Out[39]:
In [40]:
y[0]
Out[40]:
In [41]:
from sklearn.externals import joblib
joblib.dump(clf, 'filename.pkl')
Out[41]:
In [42]:
clf = joblib.load('filename.pkl')
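The model reloaded from disk behaves like the pickled original; for instance (an added check, reusing the iris X defined above):
In [ ]:
# The reloaded classifier predicts exactly as the original one.
clf.predict(X[:1])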
In [43]:
from sklearn import datasets
iris = datasets.load_iris()
data = iris.data
data.shape
Out[43]:
In [105]:
digits = datasets.load_digits()
digits.images.shape
import pylab as pl
pl.imshow(digits.images[-1], cmap=pl.cm.gray_r)
pl.show()
In [45]:
data = digits.images.reshape((digits.images.shape[0], -1))
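As a sanity check (an addition, assuming the standard digits layout), the reshaped array should coincide with digits.data, which already holds the flattened 8x8 images:
In [ ]:
import numpy as np
print(data.shape)                          # expected: (n_samples, 64)
print(np.array_equal(data, digits.data))   # the two layouts should match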
In [53]:
import numpy as np
from sklearn import datasets
iris = datasets.load_iris()
iris_X = iris.data
iris_y = iris.target
np.unique(iris_y)
Out[53]:
In [54]:
# Split the iris data into train and test sets
# using a random permutation of the sample indices
np.random.seed(0)
indices = np.random.permutation(len(iris_X))
iris_X_train = iris_X[indices[:-10]]
iris_y_train = iris_y[indices[:-10]]
iris_X_test = iris_X[indices[-10:]]
iris_y_test = iris_y[indices[-10:]]
# Create and fit a nearest-neighbor classifier
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(iris_X_train, iris_y_train)
Out[54]:
In [55]:
knn.predict(iris_X_test)
Out[55]:
In [56]:
iris_y_test
Out[56]:
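Rather than comparing the two arrays by eye, the classifier's score method reports the mean accuracy on the held-out samples directly (a small addition to the transcript):
In [ ]:
# Fraction of correctly classified test samples.
knn.score(iris_X_test, iris_y_test)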
In [57]:
diabetes = datasets.load_diabetes()
diabetes_X_train = diabetes.data[:-20]
diabetes_X_test = diabetes.data[-20:]
diabetes_y_train = diabetes.target[:-20]
diabetes_y_test = diabetes.target[-20:]
In [58]:
from sklearn import linear_model
regr = linear_model.LinearRegression()
regr.fit(diabetes_X_train, diabetes_y_train)
Out[58]:
In [59]:
print(regr.coef_)
In [60]:
np.mean((regr.predict(diabetes_X_test)-diabetes_y_test)**2)
Out[60]:
In [61]:
regr.score(diabetes_X_test, diabetes_y_test)
Out[61]:
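score returns the coefficient of determination R^2 = 1 - sum((y - y_hat)^2) / sum((y - mean(y))^2); it can be reproduced by hand from the arrays defined above (an added sketch):
In [ ]:
# Recompute R^2 manually; it should match regr.score(...) above.
y_pred = regr.predict(diabetes_X_test)
ss_res = np.sum((diabetes_y_test - y_pred) ** 2)
ss_tot = np.sum((diabetes_y_test - diabetes_y_test.mean()) ** 2)
print(1 - ss_res / ss_tot)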
In [62]:
X = np.c_[ .5, 1].T
y = [.5, 1]
test = np.c_[ 0, 2].T
regr = linear_model.LinearRegression()
import pylab as pl
pl.figure()
np.random.seed(0)
for _ in range(6):
    this_X = .1 * np.random.normal(size=(2, 1)) + X
    regr.fit(this_X, y)
    pl.plot(test, regr.predict(test))
    pl.scatter(this_X, y, s=3)
pl.show()
In [66]:
regr = linear_model.Ridge(alpha=.1)
pl.figure()
np.random.seed(0)
for _ in range(6):
    this_X = .1 * np.random.normal(size=(2, 1)) + X
    regr.fit(this_X, y)
    pl.plot(test, regr.predict(test))
    pl.scatter(this_X, y, s=3)
pl.show()
In [67]:
alphas = np.logspace(-4, -1, 6)
from __future__ import print_function
print([regr.set_params(alpha=alpha)
           .fit(diabetes_X_train, diabetes_y_train)
           .score(diabetes_X_test, diabetes_y_test) for alpha in alphas])
In [68]:
regr = linear_model.Lasso()
scores = [regr.set_params(alpha=alpha)
              .fit(diabetes_X_train, diabetes_y_train)
              .score(diabetes_X_test, diabetes_y_test)
          for alpha in alphas]
best_alpha = alphas[scores.index(max(scores))]
regr.alpha = best_alpha
regr.fit(diabetes_X_train, diabetes_y_train)
Out[68]:
In [69]:
print(regr.coef_)
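The lasso penalty sets some coefficients exactly to zero, performing feature selection; a quick way to see how sparse the fitted model is (an added check):
In [ ]:
# Count the coefficients that the Lasso zeroed out.
print(np.sum(regr.coef_ == 0), "of", regr.coef_.size, "coefficients are exactly zero")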
In [70]:
logistic = linear_model.LogisticRegression(C=1e5)
In [71]:
logistic.fit(iris_X_train, iris_y_train)
Out[71]:
In [72]:
from sklearn import svm
svc = svm.SVC(kernel='linear')
svc.fit(iris_X_train, iris_y_train)
Out[72]:
In [73]:
svc = svm.SVC(kernel='rbf')
In [74]:
from sklearn import datasets, svm
digits = datasets.load_digits()
X_digits = digits.data
y_digits = digits.target
svc = svm.SVC(C=1, kernel='linear')
svc.fit(X_digits[:-100], y_digits[:-100]).score(X_digits[-100:], y_digits[-100:])
Out[74]:
In [75]:
import numpy as np
X_folds = np.array_split(X_digits, 3)
y_folds = np.array_split(y_digits, 3)
scores = list()
for k in range(3):
    # We use 'list' to copy, in order to 'pop' later on
    X_train = list(X_folds)
    X_test = X_train.pop(k)
    X_train = np.concatenate(X_train)
    y_train = list(y_folds)
    y_test = y_train.pop(k)
    y_train = np.concatenate(y_train)
    scores.append(svc.fit(X_train, y_train).score(X_test, y_test))
print(scores)
In [77]:
from sklearn import cross_validation
k_fold = cross_validation.KFold(n=6, n_folds=3)
for train_indices, test_indices in k_fold:
    print('Train: %s | test: %s' % (train_indices, test_indices))
In [78]:
kfold = cross_validation.KFold(len(X_digits), n_folds=3)
[svc.fit(X_digits[train], y_digits[train]).score(X_digits[test], y_digits[test])
 for train, test in kfold]
Out[78]:
In [79]:
cross_validation.cross_val_score(svc, X_digits, y_digits, cv=kfold, n_jobs=-1)
Out[79]:
In [80]:
from sklearn.grid_search import GridSearchCV
gammas = np.logspace(-6, -1, 10)
clf = GridSearchCV(estimator=svc, param_grid=dict(gamma=gammas),
                   n_jobs=-1)
clf.fit(X_digits[:1000], y_digits[:1000])
Out[80]:
In [81]:
clf.best_score_
Out[81]:
In [82]:
clf.best_estimator_.gamma == 1e-6
Out[82]:
In [83]:
# Prediction performance on the test set is not as good as on the training set
clf.score(X_digits[1000:], y_digits[1000:])
Out[83]:
In [84]:
cross_validation.cross_val_score(clf, X_digits, y_digits)
Out[84]:
In [85]:
from sklearn import linear_model, datasets
lasso = linear_model.LassoCV()
diabetes = datasets.load_diabetes()
X_diabetes = diabetes.data
y_diabetes = diabetes.target
lasso.fit(X_diabetes, y_diabetes)
Out[85]:
In [86]:
# The estimator automatically chose its regularization parameter:
lasso.alpha_
Out[86]:
In [87]:
from sklearn import cluster, datasets
iris = datasets.load_iris()
X_iris = iris.data
y_iris = iris.target
k_means = cluster.KMeans(n_clusters=3)
k_means.fit(X_iris)
Out[87]:
In [88]:
print(k_means.labels_[::10])
In [89]:
print(y_iris[::10])
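Cluster labels are arbitrary integers, so they cannot be compared element-wise with y_iris; a permutation-invariant measure such as the adjusted Rand index is better suited (an added example using sklearn.metrics):
In [ ]:
# Label-permutation-invariant agreement between the clustering and the true classes.
from sklearn import metrics
print(metrics.adjusted_rand_score(y_iris, k_means.labels_))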
In [90]:
import scipy as sp
try:
    lena = sp.lena()
except AttributeError:
    from scipy import misc
    lena = misc.lena()
X = lena.reshape((-1, 1))  # We need an (n_sample, n_feature) array
k_means = cluster.KMeans(n_clusters=5, n_init=1)
k_means.fit(X)
Out[90]:
In [91]:
values = k_means.cluster_centers_.squeeze()
labels = k_means.labels_
lena_compressed = np.choose(labels, values)
lena_compressed.shape = lena.shape
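np.choose replaces every pixel's cluster label by the corresponding cluster centre, i.e. a vector quantization of the image down to at most n_clusters grey levels (a quick check of that claim):
In [ ]:
# The quantized image should contain no more than 5 distinct grey values.
print(np.unique(lena_compressed).size)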
In [93]:
from sklearn.feature_extraction.image import grid_to_graph
from sklearn.cluster import AgglomerativeClustering
###############################################################################
# Generate data
lena = sp.misc.lena()
# Downsample the image by a factor of 4
lena = lena[::2, ::2] + lena[1::2, ::2] + lena[::2, 1::2] + lena[1::2, 1::2]
X = np.reshape(lena, (-1, 1))
###############################################################################
# Define the structure A of the data. Pixels connected to their neighbors.
connectivity = grid_to_graph(*lena.shape)
###############################################################################
# Compute clustering
print("Compute structured hierarchical clustering...")
n_clusters = 15 # number of regions
ward = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward',
                               connectivity=connectivity).fit(X)
label = np.reshape(ward.labels_, lena.shape)
print("Number of pixels: ", label.size)
print("Number of clusters: ", np.unique(label).size)
In [94]:
digits = datasets.load_digits()
images = digits.images
X = np.reshape(images, (len(images), -1))
connectivity = grid_to_graph(*images[0].shape)
agglo = cluster.FeatureAgglomeration(connectivity=connectivity,
                                     n_clusters=32)
agglo.fit(X)
Out[94]:
In [95]:
X_reduced = agglo.transform(X)
X_approx = agglo.inverse_transform(X_reduced)
images_approx = np.reshape(X_approx, images.shape)
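transform projects each 64-pixel image onto the 32 agglomerated features, and inverse_transform maps it back to pixel space; the shapes make this explicit (an added check):
In [ ]:
print(X_reduced.shape)   # expected (n_samples, 32)
print(X_approx.shape)    # back to (n_samples, 64)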
In [96]:
# Create a signal with only 2 useful dimensions
x1 = np.random.normal(size=100)
x2 = np.random.normal(size=100)
x3 = x1 + x2
X = np.c_[x1, x2, x3]
from sklearn import decomposition
pca = decomposition.PCA()
pca.fit(X)
Out[96]:
In [97]:
print(pca.explained_variance_)
In [99]:
# As we can see, only the first 2 components are useful
pca.n_components = 2
X_reduced = pca.fit_transform(X)
X_reduced.shape
Out[99]:
In [100]:
# Generate sample data
time = np.linspace(0, 10, 2000)
s1 = np.sin(2 * time)                       # Signal 1: sinusoidal signal
s2 = np.sign(np.sin(3 * time))              # Signal 2: square signal
S = np.c_[s1, s2]
S += 0.2 * np.random.normal(size=S.shape)   # Add noise
S /= S.std(axis=0)                          # Standardize data
# Mix data
A = np.array([[1, 1], [0.5, 2]])            # Mixing matrix
X = np.dot(S, A.T)                          # Generate observations
# Compute ICA
ica = decomposition.FastICA()
S_ = ica.fit_transform(X)                   # Get the estimated sources
A_ = ica.mixing_.T
np.allclose(X, np.dot(S_, A_) + ica.mean_)
Out[100]:
In [1]:
categories = ['alt.atheism', 'soc.religion.christian',
'comp.graphics', 'sci.med']
In [2]:
from sklearn.datasets import fetch_20newsgroups
twenty_train = fetch_20newsgroups(subset='train',
categories=categories, shuffle=True, random_state=42)
In [3]:
twenty_train.target_names
Out[3]:
In [4]:
len(twenty_train.data)
Out[4]:
In [5]:
len(twenty_train.filenames)
Out[5]:
In [6]:
print(twenty_train.target_names[twenty_train.target[0]])
In [7]:
twenty_train.target[:10]
Out[7]:
In [8]:
for t in twenty_train.target[:10]:
    print(twenty_train.target_names[t])
In [9]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_train_counts.shape
Out[9]:
In [10]:
count_vect.vocabulary_.get(u'algorithm')
Out[10]:
In [15]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tfidf_transformer.transform(X_train_counts)
X_train_tf.shape
Out[15]:
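The fit and transform calls above can be combined, and use_idf=True (the default) additionally downweights terms that occur in many documents; a minimal variant of the same step:
In [ ]:
# Tf-idf (with inverse-document-frequency re-weighting) in one call.
tfidf_transformer = TfidfTransformer(use_idf=True)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape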
In [16]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tf, twenty_train.target)
In [18]:
docs_new = ['God is love', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)
for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, twenty_train.target_names[category]))
In [19]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB())])
In [21]:
text_clf = text_clf.fit(twenty_train.data, twenty_train.target)
In [22]:
import numpy as np
twenty_test = fetch_20newsgroups(subset='test',
categories=categories, shuffle=True, random_state=42)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)
np.mean(predicted == twenty_test.target)
Out[22]:
In [23]:
from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-3, n_iter=5))])
_ = text_clf.fit(twenty_train.data, twenty_train.target)
predicted = text_clf.predict(docs_test)
np.mean(predicted == twenty_test.target)
Out[23]:
In [24]:
from sklearn import metrics
print(metrics.classification_report(twenty_test.target, predicted,
                                    target_names=twenty_test.target_names))
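A confusion matrix gives a per-class breakdown of the same predictions (another helper from sklearn.metrics):
In [ ]:
metrics.confusion_matrix(twenty_test.target, predicted)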
In [25]:
from sklearn.grid_search import GridSearchCV
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-2, 1e-3)}
In [26]:
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
In [27]:
gs_clf = gs_clf.fit(twenty_train.data[:400], twenty_train.target[:400])
In [28]:
twenty_train.target_names[gs_clf.predict(['God is love'])[0]]
Out[28]:
In [29]:
best_parameters, score, _ = max(gs_clf.grid_scores_, key=lambda x: x[1])
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))
In [30]:
score
Out[30]: