w3-practice-02--text-analys



In [2]:
from sklearn import datasets

newsgroups = datasets.fetch_20newsgroups(
    subset='all',
    categories=['alt.atheism', 'sci.space'])

In [17]:
print(newsgroups.data[0])
print(newsgroups.target)


From: 9051467f@levels.unisa.edu.au (The Desert Brat)
Subject: Re: Keith Schneider - Stealth Poster?
Organization: Cured, discharged
Lines: 24

In article <1pa0f4INNpit@gap.caltech.edu>, keith@cco.caltech.edu (Keith Allan Schneider) writes:

> But really, are you threatened by the motto, or by the people that use it?

Every time somone writes something and says it is merely describing the norm,
it is infact re-inforcing that norm upon those programmed not to think for
themselves. The motto is dangerous in itself, it tells the world that every
*true* American is god-fearing, and puts down those who do not fear gods. It
doesn't need anyone to make it dangerous, it does a good job itself by just
existing on your currency.

> keith

The Desert Brat
-- 
John J McVey, Elc&Eltnc Eng, Whyalla, Uni S Australia,    ________
9051467f@levels.unisa.edu.au      T.S.A.K.C.            \/Darwin o\
For replies, mail to whjjm@wh.whyalla.unisa.edu.au      /\________/
Disclaimer: Unisa hates my opinions.                       bb  bb
+------------------------------------------------------+-----------------------+
|"It doesn't make a rainbow any less beautiful that we | "God's name is smack  |
|understand the refractive mechanisms that chance to   | for some."            |
|produce it." - Jim Perry, perry@dsinc.com             |    - Alice In Chains  |
+------------------------------------------------------+-----------------------+

[0 0 1 ... 1 1 0]

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
data = vectorizer.fit_transform(newsgroups.data)

In [50]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.svm import SVC

grid = {'C': np.power(10.0, np.arange(-5, 6))}
cv = KFold(n_splits=5, shuffle=True, random_state=241)
clf = SVC(kernel='linear', random_state=241)
gs = GridSearchCV(clf, grid, scoring='accuracy', cv=cv)
gs.fit(data, newsgroups.target)


Out[50]:
GridSearchCV(cv=KFold(n_splits=5, random_state=241, shuffle=True),
       error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=241, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02,
       1.e+03, 1.e+04, 1.e+05])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [52]:
gs.grid_scores_


/usr/local/lib/python2.7/site-packages/sklearn/model_selection/_search.py:761: DeprecationWarning: The grid_scores_ attribute was deprecated in version 0.18 in favor of the more elaborate cv_results_ attribute. The grid_scores_ attribute will not be available from 0.20
  DeprecationWarning)
Out[52]:
[mean: 0.55263, std: 0.02812, params: {'C': 1e-05},
 mean: 0.55263, std: 0.02812, params: {'C': 0.0001},
 mean: 0.55263, std: 0.02812, params: {'C': 0.001},
 mean: 0.55263, std: 0.02812, params: {'C': 0.01},
 mean: 0.95017, std: 0.00822, params: {'C': 0.1},
 mean: 0.99328, std: 0.00455, params: {'C': 1.0},
 mean: 0.99328, std: 0.00455, params: {'C': 10.0},
 mean: 0.99328, std: 0.00455, params: {'C': 100.0},
 mean: 0.99328, std: 0.00455, params: {'C': 1000.0},
 mean: 0.99328, std: 0.00455, params: {'C': 10000.0},
 mean: 0.99328, std: 0.00455, params: {'C': 100000.0}]

In [106]:
C = gs.best_params_.get('C')
print(C)


1.0

In [110]:
svm = SVC(kernel='linear', random_state=241, C=C)
svm.fit(data, newsgroups.target)


Out[110]:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=241, shrinking=True,
  tol=0.001, verbose=False)

In [113]:
import pandas as pd
words = vectorizer.get_feature_names()
coef = pd.DataFrame(svm.coef_.data, svm.coef_.indices)

In [134]:
top_words = coef[0].map(lambda w: abs(w)).sort_values(ascending=False).head(10).index.map(lambda i: words[i])
t = top_words.sort_values()

print(t)


Index([u'atheism', u'atheists', u'bible', u'god', u'keith', u'moon',
       u'religion', u'sci', u'sky', u'space'],
      dtype='object')

In [135]:
print(",".join(t))


atheism,atheists,bible,god,keith,moon,religion,sci,sky,space