notebook.community

Edit and run

1 Datenanalyse 4: Maschinelles Lernen

2 An introduction to machine learning with scikit-learn

3 A tutorial on statistical-learning for scientific data processing

3.0.1 Nearest Neighbour

3.0.2 Model selection

3.0.3 Unsupervised Clustering

3.1 Maschinelles Lernen mit Text

Datenanalyse 4: Maschinelles Lernen

Das Folgende sind Ausschnitte aus dem Tutorial für scikit-learn.

An introduction to machine learning with scikit-learn



In [31]:

    
from sklearn import datasets
iris = datasets.load_iris()
digits = datasets.load_digits()



In [102]:

    
iris.data[10]









    Out[102]:





array([ 5.4,  3.7,  1.5,  0.2])



In [103]:

    
iris.target









    Out[103]:





array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])



In [32]:

    
print(digits.data)









    



[[  0.   0.   5. ...,   0.   0.   0.]
 [  0.   0.   0. ...,  10.   0.   0.]
 [  0.   0.   0. ...,  16.   9.   0.]
 ..., 
 [  0.   0.   1. ...,   6.   0.   0.]
 [  0.   0.   2. ...,  12.   0.   0.]
 [  0.   0.  10. ...,  12.   1.   0.]]



In [33]:

    
digits.target









    Out[33]:





array([0, 1, 2, ..., 8, 9, 8])



In [34]:

    
digits.images[0]









    Out[34]:





array([[  0.,   0.,   5.,  13.,   9.,   1.,   0.,   0.],
       [  0.,   0.,  13.,  15.,  10.,  15.,   5.,   0.],
       [  0.,   3.,  15.,   2.,   0.,  11.,   8.,   0.],
       [  0.,   4.,  12.,   0.,   0.,   8.,   8.,   0.],
       [  0.,   5.,   8.,   0.,   0.,   9.,   8.,   0.],
       [  0.,   4.,  11.,   0.,   1.,  12.,   7.,   0.],
       [  0.,   2.,  14.,   5.,  10.,  12.,   0.,   0.],
       [  0.,   0.,   6.,  13.,  10.,   0.,   0.,   0.]])



In [35]:

    
from sklearn import svm
clf = svm.SVC(gamma=0.001, C=100.)



In [104]:

    
clf.fit(digits.data[:-1], digits.target[:-1])









    Out[104]:





GridSearchCV(cv=None,
       estimator=SVC(C=1, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params={}, iid=True, loss_func=None, n_jobs=-1,
       param_grid={'gamma': array([  1.00000e-06,   3.59381e-06,   1.29155e-05,   4.64159e-05,
         1.66810e-04,   5.99484e-04,   2.15443e-03,   7.74264e-03,
         2.78256e-02,   1.00000e-01])},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=0)



In [37]:

    
clf.predict(digits.data[-1])









    Out[37]:





array([8])



In [38]:

    
from sklearn import svm
from sklearn import datasets
clf = svm.SVC()
iris = datasets.load_iris()
X, y = iris.data, iris.target
clf.fit(X, y)









    Out[38]:





SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)



In [39]:

    
import pickle
s = pickle.dumps(clf)
clf2 = pickle.loads(s)
clf2.predict(X[0])









    Out[39]:





array([0])



In [40]:

    
y[0]









    Out[40]:





0



In [41]:

    
from sklearn.externals import joblib
joblib.dump(clf, 'filename.pkl')









    Out[41]:





['filename.pkl',
 'filename.pkl_01.npy',
 'filename.pkl_02.npy',
 'filename.pkl_03.npy',
 'filename.pkl_04.npy',
 'filename.pkl_05.npy',
 'filename.pkl_06.npy',
 'filename.pkl_07.npy',
 'filename.pkl_08.npy',
 'filename.pkl_09.npy',
 'filename.pkl_10.npy']



In [42]:

    
clf = joblib.load('filename.pkl')

A tutorial on statistical-learning for scientific data processing



In [43]:

    
from sklearn import datasets
iris = datasets.load_iris()
data = iris.data
data.shape









    Out[43]:





(150, 4)



In [105]:

    
digits = datasets.load_digits()
digits.images.shape
import pylab as pl 
pl.imshow(digits.images[-1], cmap=pl.cm.gray_r) 
pl.show()



In [45]:

    
data = digits.images.reshape((digits.images.shape[0], -1))

Nearest Neighbour



In [53]:

    
import numpy as np
from sklearn import datasets
iris = datasets.load_iris()
iris_X = iris.data
iris_y = iris.target
np.unique(iris_y)









    Out[53]:





array([0, 1, 2])



In [54]:

    
# Split iris data in train and test data
# A random permutation, to split the data randomly
np.random.seed(0)
indices = np.random.permutation(len(iris_X))
iris_X_train = iris_X[indices[:-10]]
iris_y_train = iris_y[indices[:-10]]
iris_X_test  = iris_X[indices[-10:]]
iris_y_test  = iris_y[indices[-10:]]
# Create and fit a nearest-neighbor classifier
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(iris_X_train, iris_y_train)









    Out[54]:





KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_neighbors=5, p=2, weights='uniform')



In [55]:

    
knn.predict(iris_X_test)









    Out[55]:





array([1, 2, 1, 0, 0, 0, 2, 1, 2, 0])



In [56]:

    
iris_y_test









    Out[56]:





array([1, 1, 1, 0, 0, 0, 2, 1, 2, 0])



In [57]:

    
diabetes = datasets.load_diabetes()
diabetes_X_train = diabetes.data[:-20]
diabetes_X_test  = diabetes.data[-20:]
diabetes_y_train = diabetes.target[:-20]
diabetes_y_test  = diabetes.target[-20:]



In [58]:

    
from sklearn import linear_model
regr = linear_model.LinearRegression()
regr.fit(diabetes_X_train, diabetes_y_train)









    Out[58]:





LinearRegression(copy_X=True, fit_intercept=True, normalize=False)



In [59]:

    
print(regr.coef_)









    



[  3.03499549e-01  -2.37639315e+02   5.10530605e+02   3.27736980e+02
  -8.14131709e+02   4.92814588e+02   1.02848452e+02   1.84606489e+02
   7.43519617e+02   7.60951722e+01]



In [60]:

    
np.mean((regr.predict(diabetes_X_test)-diabetes_y_test)**2)









    Out[60]:





2004.5676026898204



In [61]:

    
regr.score(diabetes_X_test, diabetes_y_test)









    Out[61]:





0.58507530226905757



In [62]:

    
X = np.c_[ .5, 1].T
y = [.5, 1]
test = np.c_[ 0, 2].T
regr = linear_model.LinearRegression()

import pylab as pl 
pl.figure() 

np.random.seed(0)
for _ in range(6): 
   this_X = .1*np.random.normal(size=(2, 1)) + X
   regr.fit(this_X, y)
   pl.plot(test, regr.predict(test)) 
   pl.scatter(this_X, y, s=3)  
pl.show()



In [66]:

    
regr = linear_model.Ridge(alpha=.1)

pl.figure() 

np.random.seed(0)
for _ in range(6): 
   this_X = .1*np.random.normal(size=(2, 1)) + X
   regr.fit(this_X, y)
   pl.plot(test, regr.predict(test)) 
   pl.scatter(this_X, y, s=3) 
pl.show()



In [67]:

    
alphas = np.logspace(-4, -1, 6)
from __future__ import print_function
print([regr.set_params(alpha=alpha
            ).fit(diabetes_X_train, diabetes_y_train,
            ).score(diabetes_X_test, diabetes_y_test) for alpha in alphas])









    



[0.58511106838835292, 0.58520730154446743, 0.58546775406984919, 0.58555120365039159, 0.58307170855541623, 0.570589994372801]



In [68]:

    
regr = linear_model.Lasso()
scores = [regr.set_params(alpha=alpha
            ).fit(diabetes_X_train, diabetes_y_train
            ).score(diabetes_X_test, diabetes_y_test)
       for alpha in alphas]
best_alpha = alphas[scores.index(max(scores))]
regr.alpha = best_alpha
regr.fit(diabetes_X_train, diabetes_y_train)









    Out[68]:





Lasso(alpha=0.025118864315095794, copy_X=True, fit_intercept=True,
   max_iter=1000, normalize=False, positive=False, precompute='auto',
   tol=0.0001, warm_start=False)



In [69]:

    
print(regr.coef_)









    



[   0.         -212.43764548  517.19478111  313.77959962 -160.8303982    -0.
 -187.19554705   69.38229038  508.66011217   71.84239008]



In [70]:

    
logistic = linear_model.LogisticRegression(C=1e5)



In [71]:

    
logistic.fit(iris_X_train, iris_y_train)









    Out[71]:





LogisticRegression(C=100000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, penalty='l2',
          random_state=None, tol=0.0001)



In [72]:

    
from sklearn import svm
svc = svm.SVC(kernel='linear')
svc.fit(iris_X_train, iris_y_train)









    Out[72]:





SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)



In [73]:

    
svc = svm.SVC(kernel='rbf')

Model selection



In [74]:

    
from sklearn import datasets, svm
digits = datasets.load_digits()
X_digits = digits.data
y_digits = digits.target
svc = svm.SVC(C=1, kernel='linear')
svc.fit(X_digits[:-100], y_digits[:-100]).score(X_digits[-100:], y_digits[-100:])









    Out[74]:





0.97999999999999998



In [75]:

    
import numpy as np
X_folds = np.array_split(X_digits, 3)
y_folds = np.array_split(y_digits, 3)
scores = list()
for k in range(3):
    # We use 'list' to copy, in order to 'pop' later on
    X_train = list(X_folds)
    X_test  = X_train.pop(k)
    X_train = np.concatenate(X_train)
    y_train = list(y_folds)
    y_test  = y_train.pop(k)
    y_train = np.concatenate(y_train)
    scores.append(svc.fit(X_train, y_train).score(X_test, y_test))
print(scores)









    



[0.93489148580968284, 0.95659432387312182, 0.93989983305509184]



In [77]:

    
from sklearn import cross_validation
k_fold = cross_validation.KFold(n=6, n_folds=3)
for train_indices, test_indices in k_fold:
     print('Train: %s | test: %s' % (train_indices, test_indices))









    



Train: [2 3 4 5] | test: [0 1]
Train: [0 1 4 5] | test: [2 3]
Train: [0 1 2 3] | test: [4 5]



In [78]:

    
>>> kfold = cross_validation.KFold(len(X_digits), n_folds=3)
>>> [svc.fit(X_digits[train], y_digits[train]).score(X_digits[test], y_digits[test])
...          for train, test in kfold]









    Out[78]:





[0.93489148580968284, 0.95659432387312182, 0.93989983305509184]



In [79]:

    
cross_validation.cross_val_score(svc, X_digits, y_digits, cv=kfold, n_jobs=-1)









    Out[79]:





array([ 0.93489149,  0.95659432,  0.93989983])



In [80]:

    
>>> from sklearn.grid_search import GridSearchCV
>>> gammas = np.logspace(-6, -1, 10)
>>> clf = GridSearchCV(estimator=svc, param_grid=dict(gamma=gammas),
...                    n_jobs=-1)
>>> clf.fit(X_digits[:1000], y_digits[:1000])









    Out[80]:





GridSearchCV(cv=None,
       estimator=SVC(C=1, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params={}, iid=True, loss_func=None, n_jobs=-1,
       param_grid={'gamma': array([  1.00000e-06,   3.59381e-06,   1.29155e-05,   4.64159e-05,
         1.66810e-04,   5.99484e-04,   2.15443e-03,   7.74264e-03,
         2.78256e-02,   1.00000e-01])},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=0)



In [81]:

    
>>> clf.best_score_









    Out[81]:





0.92400000000000004



In [82]:

    
clf.best_estimator_.gamma == 1e-6









    Out[82]:





True



In [83]:

    
>>> # Prediction performance on test set is not as good as on train set
>>> clf.score(X_digits[1000:], y_digits[1000:])









    Out[83]:





0.94228356336260977



In [84]:

    
cross_validation.cross_val_score(clf, X_digits, y_digits)









    Out[84]:





array([ 0.93521595,  0.95826377,  0.93791946])



In [85]:

    
>>> from sklearn import linear_model, datasets
>>> lasso = linear_model.LassoCV()
>>> diabetes = datasets.load_diabetes()
>>> X_diabetes = diabetes.data
>>> y_diabetes = diabetes.target
>>> lasso.fit(X_diabetes, y_diabetes)









    Out[85]:





LassoCV(alphas=None, copy_X=True, cv=None, eps=0.001, fit_intercept=True,
    max_iter=1000, n_alphas=100, n_jobs=1, normalize=False, positive=False,
    precompute='auto', tol=0.0001, verbose=False)



In [86]:

    
>>> # The estimator chose automatically its lambda:
>>> lasso.alpha_









    Out[86]:





0.012291895087486173

Unsupervised Clustering



In [87]:

    
>>> from sklearn import cluster, datasets
>>> iris = datasets.load_iris()
>>> X_iris = iris.data
>>> y_iris = iris.target

>>> k_means = cluster.KMeans(n_clusters=3)
>>> k_means.fit(X_iris)









    Out[87]:





KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=3, n_init=10,
    n_jobs=1, precompute_distances=True, random_state=None, tol=0.0001,
    verbose=0)



In [88]:

    
print(k_means.labels_[::10])









    



[1 1 1 1 1 0 0 0 0 0 2 2 2 2 2]



In [89]:

    
print(y_iris[::10])









    



[0 0 0 0 0 1 1 1 1 1 2 2 2 2 2]



In [90]:

    
>>> import scipy as sp
>>> try:
...    lena = sp.lena()
... except AttributeError:
...    from scipy import misc
...    lena = misc.lena()
>>> X = lena.reshape((-1, 1)) # We need an (n_sample, n_feature) array
>>> k_means = cluster.KMeans(n_clusters=5, n_init=1)
>>> k_means.fit(X)









    Out[90]:





KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=5, n_init=1,
    n_jobs=1, precompute_distances=True, random_state=None, tol=0.0001,
    verbose=0)



In [91]:

    
>>> values = k_means.cluster_centers_.squeeze()
>>> labels = k_means.labels_
>>> lena_compressed = np.choose(labels, values)
>>> lena_compressed.shape = lena.shape



In [93]:

    
from sklearn.feature_extraction.image import grid_to_graph
from sklearn.cluster import AgglomerativeClustering

###############################################################################
# Generate data
lena = sp.misc.lena()
# Downsample the image by a factor of 4
lena = lena[::2, ::2] + lena[1::2, ::2] + lena[::2, 1::2] + lena[1::2, 1::2]
X = np.reshape(lena, (-1, 1))

###############################################################################
# Define the structure A of the data. Pixels connected to their neighbors.
connectivity = grid_to_graph(*lena.shape)

###############################################################################
# Compute clustering
print("Compute structured hierarchical clustering...")

n_clusters = 15  # number of regions
ward = AgglomerativeClustering(n_clusters=n_clusters,
        linkage='ward', connectivity=connectivity).fit(X)
label = np.reshape(ward.labels_, lena.shape)
print("Number of pixels: ", label.size)
print("Number of clusters: ", np.unique(label).size)









    



Compute structured hierarchical clustering...
Number of pixels:  65536
Number of clusters:  15



In [94]:

    
>>> digits = datasets.load_digits()
>>> images = digits.images
>>> X = np.reshape(images, (len(images), -1))
>>> connectivity = grid_to_graph(*images[0].shape)

>>> agglo = cluster.FeatureAgglomeration(connectivity=connectivity,
...                                      n_clusters=32)
>>> agglo.fit(X)









    Out[94]:





FeatureAgglomeration(affinity='euclidean', compute_full_tree='auto',
           connectivity=<64x64 sparse matrix of type '<class 'numpy.int32'>'
	with 288 stored elements in COOrdinate format>,
           linkage='ward', memory=Memory(cachedir=None), n_clusters=32,
           n_components=None, pooling_func=<function mean at 0x03CEC540>)



In [95]:

    
>>> X_reduced = agglo.transform(X)

>>> X_approx = agglo.inverse_transform(X_reduced)
>>> images_approx = np.reshape(X_approx, images.shape)



In [96]:

    
>>> # Create a signal with only 2 useful dimensions
>>> x1 = np.random.normal(size=100)
>>> x2 = np.random.normal(size=100)
>>> x3 = x1 + x2
>>> X = np.c_[x1, x2, x3]

>>> from sklearn import decomposition
>>> pca = decomposition.PCA()
>>> pca.fit(X)









    Out[96]:





PCA(copy=True, n_components=None, whiten=False)



In [97]:

    
print(pca.explained_variance_)









    



[  3.29154194e+00   8.57774851e-01   1.09332636e-32]



In [99]:

    
# As we can see, only the 2 first components are useful
>>> pca.n_components = 2
>>> X_reduced = pca.fit_transform(X)
>>> X_reduced.shape









    Out[99]:





(100, 2)



In [100]:

    
>>> # Generate sample data
>>> time = np.linspace(0, 10, 2000)
>>> s1 = np.sin(2 * time)  # Signal 1 : sinusoidal signal
>>> s2 = np.sign(np.sin(3 * time))  # Signal 2 : square signal
>>> S = np.c_[s1, s2]
>>> S += 0.2 * np.random.normal(size=S.shape)  # Add noise
>>> S /= S.std(axis=0)  # Standardize data
>>> # Mix data
>>> A = np.array([[1, 1], [0.5, 2]])  # Mixing matrix
>>> X = np.dot(S, A.T)  # Generate observations

>>> # Compute ICA
>>> ica = decomposition.FastICA()
>>> S_ = ica.fit_transform(X)  # Get the estimated sources
>>> A_ = ica.mixing_.T
>>> np.allclose(X,  np.dot(S_, A_) + ica.mean_)









    Out[100]:





True

Maschinelles Lernen mit Text



In [1]:

    
categories = ['alt.atheism', 'soc.religion.christian',
               'comp.graphics', 'sci.med']



In [2]:

    
from sklearn.datasets import fetch_20newsgroups
twenty_train = fetch_20newsgroups(subset='train',
    categories=categories, shuffle=True, random_state=42)









    



d:\python34\lib\site-packages\sklearn\datasets\twenty_newsgroups.py:89: DeprecationWarning: The 'warn' method is deprecated, use 'warning' instead
  logger.warn("Downloading dataset from %s (14 MB)", URL)
WARNING:sklearn.datasets.twenty_newsgroups:Downloading dataset from http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz (14 MB)






    



________________________________________________________________________________
Cache loading failed
________________________________________________________________________________
decoding with 'zlib_codec' codec failed (error: Error -5 while decompressing data: incomplete or truncated stream)



In [3]:

    
twenty_train.target_names









    Out[3]:





['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']



In [4]:

    
len(twenty_train.data)









    Out[4]:





2257



In [5]:

    
len(twenty_train.filenames)









    Out[5]:





2257



In [6]:

    
print(twenty_train.target_names[twenty_train.target[0]])









    



comp.graphics



In [7]:

    
twenty_train.target[:10]









    Out[7]:





array([1, 1, 3, 3, 3, 3, 3, 2, 2, 2], dtype=int32)



In [8]:

    
for t in twenty_train.target[:10]:
    print(twenty_train.target_names[t])









    



comp.graphics
comp.graphics
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
sci.med
sci.med
sci.med



In [9]:

    
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_train_counts.shape









    Out[9]:





(2257, 35788)



In [10]:

    
count_vect.vocabulary_.get(u'algorithm')









    Out[10]:





4690



In [15]:

    
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tfidf_transformer.transform(X_train_counts)
X_train_tf.shape









    Out[15]:





(2257, 35788)



In [16]:

    
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tf, twenty_train.target)



In [18]:

    
docs_new = ['God is love', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, twenty_train.target_names[category]))









    



'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics



In [19]:

    
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),])



In [21]:

    
text_clf = text_clf.fit(twenty_train.data, twenty_train.target)



In [22]:

    
import numpy as np
twenty_test = fetch_20newsgroups(subset='test',
    categories=categories, shuffle=True, random_state=42)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)
np.mean(predicted == twenty_test.target)









    Out[22]:





0.83488681757656458



In [23]:

    
from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-3, n_iter=5)),])
_ = text_clf.fit(twenty_train.data, twenty_train.target)
predicted = text_clf.predict(docs_test)
np.mean(predicted == twenty_test.target)









    Out[23]:





0.9127829560585885



In [24]:

    
from sklearn import metrics
print(metrics.classification_report(twenty_test.target, predicted,
    target_names=twenty_test.target_names))









    



                        precision    recall  f1-score   support

           alt.atheism       0.94      0.82      0.87       319
         comp.graphics       0.88      0.98      0.92       389
               sci.med       0.95      0.89      0.92       396
soc.religion.christian       0.90      0.95      0.92       398

           avg / total       0.92      0.91      0.91      1502



In [25]:

    
from sklearn.grid_search import GridSearchCV
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-2, 1e-3),}



In [26]:

    
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)



In [27]:

    
gs_clf = gs_clf.fit(twenty_train.data[:400], twenty_train.target[:400])



In [28]:

    
twenty_train.target_names[gs_clf.predict(['God is love'])]









    Out[28]:





'soc.religion.christian'



In [29]:

    
best_parameters, score, _ = max(gs_clf.grid_scores_, key=lambda x: x[1])
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))









    



clf__alpha: 0.001
tfidf__use_idf: True
vect__ngram_range: (1, 1)



In [30]:

    
score









    Out[30]:





0.90249999999999997



In [ ]:

Table of Contents

Datenanalyse 4: Maschinelles Lernen

An introduction to machine learning with scikit-learn

A tutorial on statistical-learning for scientific data processing

Nearest Neighbour

Model selection

Unsupervised Clustering

Maschinelles Lernen mit Text