This code predicts which of 20 possible newsgroups a post belongs to. It is trained on the commonly used 20-newsgroups dataset, which is an "unusual" classification dataset in that each newsgroup is very distinctive, so model choices that do well here tend to favor this kind of data.
The code does the following: it loads the 20-newsgroups train and test splits, converts the posts to bag-of-words counts and TF-IDF weights, and compares a range of scikit-learn classifiers (multinomial naive Bayes, SGD-trained linear SVM and logistic regression, k-nearest neighbors, nearest centroid, SVC/LinearSVC, and a FeatureUnion pipeline that adds subject-line and text-statistics features), reporting accuracy, macro F1, and run time for each.
Models are optimized through: hand-tuning of the vectorizer and TF-IDF options (stop words, n-gram ranges, min_df/max_df, sublinear_tf, token patterns) and a randomized hyperparameter search with RandomizedSearchCV.
Code came from examples at: the scikit-learn documentation; the specific examples are cited inline below where they are adapted.
20 newsgroups dataset info is at http://scikit-learn.org/stable/datasets/index.html#the-20-newsgroups-text-dataset
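For orientation, here is a minimal sketch of the flow the notebook builds up (fetch the data, turn the text into TF-IDF features, fit a classifier, score on the held-out test set); it assumes scikit-learn is installed (install instructions follow below) and simply condenses the cells that come later:
from sklearn.datasets import fetch_20newsgroups
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
import numpy as np

twenty_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)
twenty_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42)
clf = Pipeline([('cvect', CountVectorizer()),     # text -> token counts
                ('tfidf', TfidfTransformer()),    # counts -> TF-IDF weights
                ('nb', MultinomialNB())])         # naive Bayes classifier
clf.fit(twenty_train.data, twenty_train.target)
predicted = clf.predict(twenty_test.data)
print('accuracy =', np.mean(predicted == twenty_test.target))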
Be sure to install the following (pip3 is the Python 3 pip; the plain pip command will also work):
pip3 install scikit-learn
pip3 install pandas
pip3 install scipy
If I missed an install and you get an import error, try doing a pip3 install <import name>. Note that the Jupyter kernel needs to be the same Python version/installation (Python 3) that you ran the pip3 installs against.
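As an optional quick check that the kernel sees the packages (matplotlib is also imported below, so install it too if it is missing):
import sys
import sklearn, pandas, scipy, matplotlib
print(sys.version)
print('sklearn', sklearn.__version__, '| pandas', pandas.__version__,
      '| scipy', scipy.__version__, '| matplotlib', matplotlib.__version__)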
In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.core.display import display, HTML
from IPython.display import Audio
import os
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
import time
display(HTML("<style>.container { width:97% !important; }</style>")) #Set width of iPython cells
In [3]:
from sklearn.datasets import fetch_20newsgroups
# You can restrict the categories to simulate fewer classes
#categories = ['alt.atheism', 'soc.religion.christian','comp.graphics', 'sci.med']
#categories = ['comp.graphics', 'sci.med']
#categories = ['alt.atheism', 'talk.religion.misc']
categories=None
twenty_train = fetch_20newsgroups(subset='train',
categories=categories, shuffle=True, random_state=42)
twenty_test = fetch_20newsgroups(subset='test',
categories=categories, shuffle=True, random_state=42)
In [4]:
twenty_train.target_names
Out[4]:
In [5]:
len(twenty_train.data)
Out[5]:
In [6]:
len(twenty_train.filenames)
Out[6]:
In [7]:
print(twenty_train.data[0])
In [8]:
twenty_train.target_names[twenty_train.target[0]]
Out[8]:
In [9]:
twenty_train.target
Out[9]:
In [10]:
len(twenty_train.target)
Out[10]:
In [11]:
twenty_train.target_names
Out[11]:
In [12]:
len(twenty_test.data)
Out[12]:
In [13]:
len(twenty_test.data) / len(twenty_train.data)
Out[13]:
In [14]:
print(twenty_test.data[10])
In [15]:
twenty_test.target_names[twenty_test.target[10]]
Out[15]:
In [16]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
print('training examples = ' + str(len(twenty_train.data)))
print('vocabulary length = ' + str(len(count_vect.vocabulary_)))
print('transformed training text matrix shape = ' + str(X_train_counts.shape))
In [17]:
# vocabulary_ is dict of word string -> word index
list(count_vect.vocabulary_.items())[:50]
Out[17]:
In [18]:
text = ['The The rain in spain.', 'The brown brown fox.']
counts_matrix = count_vect.transform(text)
type(counts_matrix)
Out[18]:
In [19]:
counts_matrix.data
Out[19]:
In [20]:
counts_matrix.indptr
Out[20]:
In [21]:
counts_matrix.indices
Out[21]:
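data, indptr, and indices are the raw CSR (compressed sparse row) storage: indptr marks where each row's entries begin and end, indices holds the column (vocabulary) indices, and data holds the counts. A small sketch of walking them directly for the counts_matrix built above, before the COO conversion below does the same thing more conveniently:
for row in range(counts_matrix.shape[0]):
    start, end = counts_matrix.indptr[row], counts_matrix.indptr[row + 1]
    for col, count in zip(counts_matrix.indices[start:end], counts_matrix.data[start:end]):
        print(row, col, count)   # (row, vocabulary index, count)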
In [22]:
from scipy.sparse import coo_matrix
coo = coo_matrix(counts_matrix)
#print(np.stack((coo.row, coo.col, coo.data)))
df = pd.DataFrame({'row':coo.row, 'column':coo.col, 'count':coo.data},
columns=['row','column', 'count'])
df
Out[22]:
In [23]:
inverse_vocabulary=np.empty(len(count_vect.vocabulary_), dtype=object)
for key,value in count_vect.vocabulary_.items():
inverse_vocabulary[value] = key
for i in coo.col:
print(i, inverse_vocabulary[i])
In [24]:
words = [inverse_vocabulary[i] for i in coo.col]
df = pd.DataFrame({'row':coo.row, 'column':coo.col, 'count':coo.data, 'word':words})
df = df[ ['row','column', 'count', 'word'] ]
df
Out[24]:
In [25]:
tfidf = TfidfTransformer()
tfidf.fit(X_train_counts) # compute weights on whole training set
tfidf_matrix = tfidf.transform(counts_matrix) # transform examples
print( 'tfidf_matrix type = ' + str(type(tfidf_matrix)) )
print( 'tfidf_matrix shape = ' + str(tfidf_matrix.shape) )
coo_tfidf = coo_matrix(tfidf_matrix)
words_tfidf = [inverse_vocabulary[i] for i in coo_tfidf.col]
df = pd.DataFrame({'row':coo_tfidf.row, 'column':coo_tfidf.col,
'value':coo_tfidf.data, 'word':words_tfidf})
df = df[ ['row','column', 'value', 'word'] ]
df
Out[25]:
In [26]:
import scipy
scipy.sparse.linalg.norm(tfidf_matrix, axis=1)
Out[26]:
Notice in the above values that every row has an L2 norm of 1.0: TfidfTransformer normalizes each document vector to unit length by default (norm='l2'), regardless of how long the original text is.
In [27]:
tfidf.idf_.shape
Out[27]:
In [28]:
words = ['the', 'very', 'car', 'vector', 'africa']
for word in words:
word_index = count_vect.vocabulary_[word]
print(word + ' = ' + str(tfidf.idf_[word_index]))
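With TfidfTransformer's defaults (smooth_idf=True), the weight printed for each word is idf(t) = ln((1 + n) / (1 + df(t))) + 1, where n is the number of training documents and df(t) is how many of them contain the word, so very common words like 'the' come out near 1 and rare words get larger weights. A sketch of checking one value by hand against tfidf.idf_, using the X_train_counts and count_vect built above:
n_docs = X_train_counts.shape[0]
word_index = count_vect.vocabulary_['africa']
df_t = (X_train_counts[:, word_index] > 0).sum()        # number of documents containing the word
idf_manual = np.log((1 + n_docs) / (1 + df_t)) + 1      # sklearn's smoothed idf formula
print(idf_manual, tfidf.idf_[word_index])               # the two values should match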
In [29]:
text_clf = Pipeline([('cvect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('sgdc', MultinomialNB()),  # note: the final step is named 'sgdc' throughout, whatever classifier it holds
                    ])
In [30]:
text_clf.fit(twenty_train.data, twenty_train.target)
Out[30]:
In [31]:
predicted = text_clf.predict(twenty_test.data)
np.mean(predicted == twenty_test.target)
Out[31]:
In [32]:
from sklearn import metrics
print(metrics.classification_report(twenty_test.target, predicted,
target_names=twenty_test.target_names))
In [33]:
df = pd.DataFrame(metrics.confusion_matrix(twenty_test.target, predicted))
df
Out[33]:
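The raw confusion-matrix DataFrame is easier to scan with class labels and a heatmap; a small matplotlib sketch using only what is already imported (predicted and twenty_test come from the cells above):
cm = metrics.confusion_matrix(twenty_test.target, predicted)
fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(cm)                                            # bright cells off the diagonal are common confusions
ax.set_xticks(range(len(twenty_test.target_names)))
ax.set_yticks(range(len(twenty_test.target_names)))
ax.set_xticklabels(twenty_test.target_names, rotation=90)
ax.set_yticklabels(twenty_test.target_names)
ax.set_xlabel('Predicted')
ax.set_ylabel('Expected')
plt.show()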
In [34]:
class QAResults:
    """Collects misclassified test examples so they can be stepped through one at a time."""
    def init(self, Y_expected, Y_predicted, X, class_labels):
        # plain init() (not __init__) so an empty object can be created first and filled in by test_pipeline
        self.Y_expected = Y_expected
        self.Y_predicted = Y_predicted
        self.X = X
        self.class_labels = class_labels
        self.next_error_index = 0
        self.errors = np.nonzero(Y_expected - Y_predicted) # returns indices of non-zero (misclassified) elements
        print(self.errors)
    def display_next(self):
        if(self.next_error_index >= self.errors[0].shape[0]):
            self.next_error_index = 0 # cycle back around
        X_index = self.errors[0][self.next_error_index]
        print('index = ', X_index )
        print('Expected = ' + self.class_labels[self.Y_expected[X_index]])
        print('Predicted = ' + self.class_labels[self.Y_predicted[X_index]])
        print('\nX['+ str(X_index) +']')
        print( self.X[X_index] )
        self.next_error_index += 1
In [35]:
def header(str):
display(HTML('<h3>'+str+'</h3>'))
tests = {}
def test_pipeline(pipeline, name=None, verbose=True, qa_test = None):
start=time.time()
pipeline.fit(twenty_train.data, twenty_train.target)
predicted = pipeline.predict(twenty_test.data)
elapsed_time = (time.time() - start)
accuracy = np.mean(predicted == twenty_test.target)
f1 = metrics.f1_score(twenty_test.target, predicted, average='macro')
print( 'F1 = %.3f \nAccuracy = %.3f\ntime = %.3f sec.' % (f1, accuracy, elapsed_time))
if(verbose):
header('Classification Report')
print(metrics.classification_report(twenty_test.target, predicted,
target_names=twenty_test.target_names, digits=3))
header('Confusion Matrix (row=expected, col=predicted)')
df = pd.DataFrame(metrics.confusion_matrix(twenty_test.target, predicted))
df.columns = twenty_test.target_names
df['Expected']=twenty_test.target_names
df.set_index('Expected',inplace=True)
display(df)
if name is not None:
tests[name]={'Name':name, 'Accuracy':accuracy, 'F1':f1, 'Time':elapsed_time,
'Details':pipeline.get_params(deep=True)}
if qa_test is not None:
qa_test.init( twenty_test.target, predicted, twenty_test.data, twenty_test.target_names)
qa_test=QAResults()
test_pipeline(text_clf, qa_test=qa_test)
In [36]:
qa_test.display_next() # re-run this cell to see next error
In [37]:
test_pipeline(Pipeline([('cvect', CountVectorizer()),
('tfidf', TfidfTransformer()), # <-- with weighting
('sgdc', SGDClassifier(loss='hinge', penalty='l2',
alpha=1e-5, random_state=42,
max_iter=40)),
]), verbose=False)
In [38]:
test_pipeline(Pipeline([('cvect', CountVectorizer()), # <-- no weighting
('sgdc', SGDClassifier(loss='hinge', penalty='l2',
alpha=1e-5, random_state=42,
max_iter=40)),
]), verbose=False)
In [39]:
test_pipeline(Pipeline([('tfidf_v', TfidfVectorizer()),
('sgdc', SGDClassifier(loss='hinge', penalty='l2',
alpha=1e-4, random_state=42,
max_iter=40 )),
]), verbose=False)
In [40]:
test_pipeline(Pipeline([('cvect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('sgdc', SGDClassifier(loss='hinge', penalty='l2',
alpha=1e-4, random_state=42,
max_iter=40 )),
]), verbose=False, name='hinge loss')
In [41]:
# hinge loss is a linear SVM
test_pipeline(Pipeline([('cvect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('sgdc', SGDClassifier(loss='hinge', penalty='l2',
alpha=1e-4, random_state=42,
max_iter=40 )),
]), verbose=False, name='hinge loss')
In [42]:
# log loss is logistic regression
test_pipeline(Pipeline([('cvect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('sgdc', SGDClassifier(loss='log', penalty='l2',
alpha=1e-6, random_state=42,
max_iter=10 )),
]), verbose=False, name='log loss')
In [43]:
test_pipeline(Pipeline([('cvect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('sgdc', SGDClassifier(loss='log', penalty='none',
alpha=1e-6, random_state=42,
max_iter=10 )),
]), verbose=False, name='log loss no regularization')
In [44]:
test_pipeline(Pipeline([('cvect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('sgdc', MultinomialNB()),
]), verbose=False, name='MultinomialNB')
In [45]:
from sklearn.neighbors import KNeighborsClassifier
test_pipeline(Pipeline([('cvect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('knn', KNeighborsClassifier(n_neighbors=5)),
]), verbose=False, name='KNN n=5')
In [46]:
from sklearn.neighbors import KNeighborsClassifier
for n in range(1,7):
print( '\nn = ' + str(n))
test_pipeline(Pipeline([('cvect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('knn', KNeighborsClassifier(n_neighbors=n)),
]), verbose=False, name='KNN n=' + str(n))
In [47]:
from sklearn.neighbors import KNeighborsClassifier
for n in range(1,7):
print( '\nn = ' + str(n))
test_pipeline(Pipeline([('cvect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('knn', KNeighborsClassifier(n_neighbors=n, weights='distance')),
]), verbose=False, name='KNN n=' + str(n) + ' distance weights')
In [48]:
from sklearn.neighbors.nearest_centroid import NearestCentroid
test_pipeline(Pipeline([('cvect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('sgdc', NearestCentroid(metric='euclidean')),
]), verbose=False, name='NearestCentroid')
In [49]:
from sklearn.linear_model import LogisticRegression
test_pipeline(Pipeline([('cvect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('sgdc', LogisticRegression(solver='sag', multi_class='multinomial', n_jobs=-1)),
]), verbose=False, name='LogisticRegression multinomial')
In [50]:
test_pipeline(Pipeline([('cvect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('sgdc', LogisticRegression(solver='sag', multi_class='ovr',n_jobs=-1)),
]), verbose=False, name='LogisticRegression ovr')
In [51]:
test_pipeline(Pipeline([('cvect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('sgdc', LogisticRegression(C=10, solver='sag', multi_class='multinomial', n_jobs=-1, max_iter=200)),
]), verbose=False, name='LogisticRegression multinomial C=10')
In [52]:
test_pipeline(Pipeline([('cvect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('sgdc', LogisticRegression(C=100, solver='sag', multi_class='multinomial', n_jobs=-1, max_iter=200)),
]), verbose=False, name='LogisticRegression multinomial C=100')
In [53]:
test_pipeline(Pipeline([('cvect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('sgdc', LogisticRegression(C=1000, solver='sag', multi_class='multinomial', n_jobs=-1, max_iter=200)),
]), verbose=False, name='LogisticRegression multinomial C=1000')
In [54]:
p = Pipeline([('cvect', CountVectorizer(stop_words='english', ngram_range=(1,2),
max_df = 0.88, min_df=1)),
('tfidf', TfidfTransformer(sublinear_tf=True)),
('sgdc', SGDClassifier(loss='hinge', penalty='l2',
alpha=4e-4, random_state=42,
max_iter=40 )),
])
test_pipeline(p, verbose=False)
In [55]:
# Adapted from https://stackoverflow.com/questions/11116697/how-to-get-most-informative-features-for-scikit-learn-classifiers
def show_most_informative_features(vectorizer, clf, class_labels, n=50):
feature_names = vectorizer.get_feature_names()
for row in range(clf.coef_.shape[0]):
coefs_with_fns = sorted(zip(clf.coef_[row], feature_names))
top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
print( '\nclass = ' + class_labels[row])
l = [[fn_1, coef_1,fn_2,coef_2] for (coef_1, fn_1), (coef_2, fn_2) in top]
df = pd.DataFrame(l, columns=['Smallest Word', 'Smallest Weight', 'Largest Word', 'Largest Weight'])
display(df)
show_most_informative_features(p.named_steps['cvect'], p.named_steps['sgdc'], twenty_train.target_names)
In [56]:
p = Pipeline([('cvect', CountVectorizer( analyzer='char', ngram_range=(5,5),
max_df = 0.88, min_df=1)),
('tfidf', TfidfTransformer(sublinear_tf=True)),
('sgdc', SGDClassifier(loss='hinge', penalty='l2',
alpha=4e-4, random_state=42,
max_iter=40 )),
])
test_pipeline(p, verbose=False)
show_most_informative_features(p.named_steps['cvect'], p.named_steps['sgdc'], twenty_train.target_names)
In [57]:
test_pipeline(Pipeline([('cvect', CountVectorizer()),
('tfidf', TfidfTransformer(use_idf=False)),
('sgdc', SGDClassifier(loss='hinge', penalty='l2',
alpha=1e-4, random_state=42,
max_iter=40 )),
]), verbose=False, name='use_idf=False')
In [58]:
test_pipeline(Pipeline([('cvect', CountVectorizer(stop_words='english')),
('tfidf', TfidfTransformer()),
('sgdc', SGDClassifier(loss='hinge', penalty='l2',
alpha=1e-4, random_state=42,
max_iter=40 )),
]), verbose=False, name='stopwords')
In [59]:
test_pipeline(Pipeline([('cvect', CountVectorizer(stop_words='english', ngram_range=(1,2),
max_df = 0.8, min_df=2)),
('tfidf', TfidfTransformer()),
('sgdc', SGDClassifier(loss='hinge', penalty='l2',
alpha=1e-4, random_state=42,
max_iter=40 )),
]), verbose=False, name='ngram_range=(1,2)')
In [60]:
test_pipeline(Pipeline([('cvect', CountVectorizer(stop_words='english')),
('tfidf', TfidfTransformer(norm=None)),
('sgdc', SGDClassifier(loss='hinge', penalty='l2',
alpha=1e-4, random_state=42,
max_iter=40 )),
]), verbose=False, name='norm = None')
In [61]:
test_pipeline(Pipeline([('cvect', CountVectorizer(stop_words='english')),
('tfidf', TfidfTransformer(sublinear_tf=True)),
('sgdc', SGDClassifier(loss='hinge', penalty='l2',
alpha=1e-4, random_state=42,
max_iter=40 )),
]), verbose=False, name='sublinear_tf=True')
In [62]:
test_pipeline(Pipeline([('cvect', CountVectorizer(stop_words='english')),
('tfidf', TfidfTransformer(norm='l1')),
('sgdc', SGDClassifier(loss='hinge', penalty='l2',
alpha=1e-4, random_state=42,
max_iter=40 )),
]), verbose=False, name='norm=l1')
In [63]:
test_pipeline(Pipeline([('cvect', CountVectorizer(stop_words='english', ngram_range=(1,3),
max_df = 0.8, min_df=2)),
('tfidf', TfidfTransformer()),
('sgdc', SGDClassifier(loss='hinge', penalty='l2',
alpha=1e-4, random_state=42,
max_iter=40 )),
]), verbose=False, name='ngram_range=(1,3)')
In [64]:
test_pipeline(Pipeline([('cvect', CountVectorizer(stop_words='english', ngram_range=(1,2),
max_df = 0.8, min_df=2)),
('tfidf', TfidfTransformer()),
('sgdc', SGDClassifier(loss='hinge', penalty='l2',
alpha=1e-4, random_state=42,
max_iter=40 )),
]), verbose=False, name = 'ngram_range=(1,2), max_df = 0.8, min_df=2')
In [65]:
test_pipeline(Pipeline([('cvect', CountVectorizer(stop_words='english', ngram_range=(1,2),
max_df = 0.8, min_df=2)),
('tfidf', TfidfTransformer()),
('sgdc', SGDClassifier(loss='hinge', penalty='l2',
alpha=1e-4, random_state=42,
max_iter=40 , n_jobs=-1)),
]), verbose=False, name='ngram_range=(1,2), max_df = 0.8, min_df=2')
In [66]:
test_pipeline(Pipeline([('cvect', CountVectorizer(stop_words='english', token_pattern="[a-zA-Z]{3,}")),
('tfidf', TfidfTransformer()),
('sgdc', SGDClassifier(loss='hinge', penalty='l2',
alpha=1e-4, random_state=42,
max_iter=40 , n_jobs=-1)),
]), verbose=False, name='no numbers')
In [67]:
test_pipeline(Pipeline([('cvect', CountVectorizer(stop_words='english', token_pattern="[a-zA-Z0-9.-]{1,}")),
('tfidf', TfidfTransformer()),
('sgdc', SGDClassifier(loss='hinge', penalty='l2',
alpha=1e-4, random_state=42,
max_iter=40 , n_jobs=-1)),
]), verbose=False, name='dots in words')
In [68]:
test_pipeline(Pipeline([('cvect', CountVectorizer(stop_words='english',
ngram_range=(1,2), min_df=3,max_df=0.8)),
('tfidf', TfidfTransformer(sublinear_tf=True)),
('sgdc', SGDClassifier(loss='hinge', penalty='l2',
alpha=1e-4, random_state=42,
max_iter=40 , n_jobs=-1)),
]), verbose=False, name='ngram_range=(1,2), min_df=3,max_df=0.8, sublinear_tf')
In [69]:
for n in range(3,8,1):
print('\nN-grams = '+ str(n))
test_pipeline(Pipeline([('cvect', CountVectorizer(analyzer='char', ngram_range=(n,n),
min_df=2, max_df=0.9)),
('tfidf', TfidfTransformer(sublinear_tf=True)),
('sgdc', SGDClassifier(loss='hinge', penalty='l2',
alpha=1e-4, random_state=42,
max_iter=40 , n_jobs=-1)),
]), verbose=False, name='char ngram ' + str(n) + ' + sublinear_tf')
In [70]:
from sklearn.svm import SVC
from sklearn.decomposition import TruncatedSVD
test_pipeline(Pipeline([('cvect', CountVectorizer(stop_words='english')),
('tfidf', TfidfTransformer()),
('svd', TruncatedSVD(n_components=300)),
('svc', SVC(kernel='linear', C=10)),
]), verbose=False, name='SVC + TruncatedSVD')
In [71]:
from sklearn.svm import SVC
from sklearn.decomposition import TruncatedSVD
test_pipeline(Pipeline([('cvect', CountVectorizer(stop_words='english')),
('tfidf', TfidfTransformer()),
('svc', SVC(kernel='linear')),
]), verbose=False, name='SVC')
In [72]:
from sklearn.svm import LinearSVC
from sklearn.decomposition import TruncatedSVD
test_pipeline(Pipeline([('cvect', CountVectorizer(stop_words='english')),
('tfidf', TfidfTransformer()),
('sgdc', LinearSVC(C=10)),
]), verbose=False, name='LinearSVC, C=10')
In [73]:
from sklearn.svm import LinearSVC
from sklearn.decomposition import TruncatedSVD
test_pipeline(Pipeline([('cvect', CountVectorizer(stop_words='english')),
('tfidf', TfidfTransformer()),
('sgdc', LinearSVC(C=1)),
]), verbose=False, name='LinearSVC, C=1')
Code adapted from: http://scikit-learn.org/stable/auto_examples/hetero_feature_union.html
In [74]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.datasets.twenty_newsgroups import strip_newsgroup_footer
from sklearn.datasets.twenty_newsgroups import strip_newsgroup_quoting
In [75]:
class ItemSelector(BaseEstimator, TransformerMixin):
"""For data grouped by feature, select subset of data at a provided key.
The data is expected to be stored in a 2D data structure, where the first
index is over features and the second is over samples. i.e.
>> len(data[key]) == n_samples
Please note that this is the opposite convention to scikit-learn feature
matrixes (where the first index corresponds to sample).
ItemSelector only requires that the collection implement getitem
(data[key]). Examples include: a dict of lists, 2D numpy array, Pandas
DataFrame, numpy record array, etc.
>> data = {'a': [1, 5, 2, 5, 2, 8],
'b': [9, 4, 1, 4, 1, 3]}
>> ds = ItemSelector(key='a')
>> data['a'] == ds.transform(data)
ItemSelector is not designed to handle data grouped by sample. (e.g. a
list of dicts). If your data is structured this way, consider a
transformer along the lines of `sklearn.feature_extraction.DictVectorizer`.
Parameters
----------
key : hashable, required
The key corresponding to the desired value in a mappable.
"""
def __init__(self, key):
self.key = key
def fit(self, x, y=None):
return self
def transform(self, data_dict):
return data_dict[self.key]
class TextStats(BaseEstimator, TransformerMixin):
"""Extract features from each document for DictVectorizer"""
def fit(self, x, y=None):
return self
def transform(self, posts):
return [{'length': len(text),
'num_sentences': text.count('.'),
'num_questions': text.count('?') ,
'num_dollars': text.count('$'),
'num_percent': text.count('%'),
'num_exclamations': text.count('!'),
}
for text in posts]
class SubjectBodyExtractor(BaseEstimator, TransformerMixin):
"""Extract the subject & body from a usenet post in a single pass.
Takes a sequence of strings and produces a dict of sequences. Keys are
`subject` and `body`.
"""
def fit(self, x, y=None):
return self
def transform(self, posts):
features = np.recarray(shape=(len(posts),),
dtype=[('subject', object), ('body', object)])
for i, text in enumerate(posts):
headers, _, bod = text.partition('\n\n')
bod = strip_newsgroup_footer(bod)
bod = strip_newsgroup_quoting(bod)
features['body'][i] = bod
prefix = 'Subject:'
sub = ''
for line in headers.split('\n'):
if line.startswith(prefix):
sub = line[len(prefix):]
break
features['subject'][i] = sub
return features
class Printer(BaseEstimator, TransformerMixin):
"""{Print inputs}"""
def __init__(self, count):
self.count = count
def fit(self, x, y=None):
return self
def transform(self, x):
if(self.count >0):
self.count-=1
print(x[0])
return x
pipeline = Pipeline([
# Extract the subject & body
('subjectbody', SubjectBodyExtractor()),
# Use FeatureUnion to combine the features from subject and body
('union', FeatureUnion(n_jobs=-1,
transformer_list=[
# Pipeline for pulling features from the post's subject line
('subject', Pipeline([
('selector', ItemSelector(key='subject')),
('tfidf', TfidfVectorizer(min_df=1)),
])),
# Pipeline for standard bag-of-words model for body
('body_bow', Pipeline([
('selector', ItemSelector(key='body')),
('tfidf', TfidfVectorizer()),
])),
# Pipeline for pulling ad hoc features from post's body
('body_stats', Pipeline([
('selector', ItemSelector(key='body')),
('stats', TextStats()), # returns a list of dicts
('cvect', DictVectorizer()), # list of dicts -> feature matrix
#('print',Printer(1)),
# scaling is needed so SGD model will have balanced feature gradients
('scale', StandardScaler(copy=False, with_mean=False, with_std=True) ),
#('print2',Printer(1)),
])),
],
# weight components in FeatureUnion
transformer_weights={
'subject': 1,
'body_bow': 1,
'body_stats': .1,
},
)),
#('print',Printer(1)),
# Use a SVC classifier on the combined features
#('svc', SVC(kernel='linear')),
('sgdc', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=5 )),
])
test_pipeline(pipeline, verbose=False, name='metadata')
In [76]:
from scipy.stats import expon as sp_expon
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
In [77]:
r = sp_uniform(loc=5,scale=2).rvs(size=1000*1000)
fig, ax = plt.subplots(1, 1, figsize=(12, 5))
ax.hist(r, bins=100)
plt.show()
In [78]:
def geometric_sample(power_min, power_max, sample_size):
dist = sp_uniform(loc=power_min, scale=power_max-power_min)
return np.power(10, dist.rvs(size=sample_size))
geometric_sample(1,6,50)
Out[78]:
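geometric_sample draws exponents uniformly and raises 10 to them, i.e. a log-uniform sample over [10^power_min, 10^power_max]. Newer SciPy releases (1.4 and later) ship this distribution directly as scipy.stats.loguniform, which RandomizedSearchCV can also sample from lazily; a rough equivalent of the call above:
from scipy.stats import loguniform   # requires SciPy >= 1.4
loguniform(1e1, 1e6).rvs(size=50, random_state=42)   # roughly geometric_sample(1, 6, 50)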
In [79]:
from sklearn.model_selection import RandomizedSearchCV
pipeline = Pipeline([('cvect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('sgdc', SGDClassifier( random_state=42 )),
])
#ngram_range=(1,2), max_df = 0.8, min_df=2
param_dist = {"cvect__stop_words": [None,'english'],
"cvect__ngram_range": [(1,1),(1,2)],
"cvect__min_df": sp_randint(1, 6),
"cvect__max_df": sp_uniform(loc=0.5, scale=0.5), # range is (loc, loc+scale)
"tfidf__sublinear_tf": [True,False],
"tfidf__norm": [None, 'l1', 'l2'],
"sgdc__max_iter": sp_randint(5, 40),
"sgdc__loss": ['hinge','log'],
"sgdc__alpha": geometric_sample(-8,-3,10000),
}
# n_iter - number of random models to evaluate
# n_jobs = -1 to run in parallel on all cores
# cv = 3 , 3-fold cross validation
# scoring='f1_macro' , averages the F1 for each target class
rs = RandomizedSearchCV(pipeline, param_distributions=param_dist,
n_iter=5, n_jobs=-1, cv=3, return_train_score=False,
verbose=1, scoring='f1_macro', random_state=42)
test_pipeline(rs, verbose=False, name='Random Parameter Search')
Audio(url='./Beep 2.wav', autoplay=True)
Out[79]:
In [80]:
#pd.get_option("display.max_columns")
pd.set_option("display.max_columns", 40)
header('Best')
display( pd.DataFrame.from_dict(rs.best_params_, orient= 'index') )
header('All Results')
df = pd.DataFrame(rs.cv_results_)
df = df.sort_values(['rank_test_score'])
display(df)
In [81]:
df = df.apply(pd.to_numeric, errors='ignore')
prefix = 'param_'
param_col = [col for col in df.columns if col.startswith(prefix) ]
for col in param_col:
name = col[len(prefix):]
header(name)
if(df[col].dtype == np.float64 or df[col].dtype == np.int64):
print( 'scatter')
df.plot(kind='scatter', x=col, y='mean_test_score', figsize=(15,10))
plt.show()
else:
mean = df[[col,'mean_test_score']].fillna(value='None').groupby(col).mean()
mean.plot(kind='bar', figsize=(10,10))
plt.show()
In [82]:
tests_df = pd.DataFrame.from_dict(tests, orient='index')
tests_df = tests_df.rename(columns={'Time': 'Time (sec.)'})
tests_df = tests_df[['F1', 'Accuracy', 'Time (sec.)', 'Details']]  # select/reorder columns by name (drops 'Name')
tests_df = tests_df.sort_values(by=['F1'], ascending=False)
display(tests_df)
header('Best Model')
display(tests_df.head(1))
print(tests_df['Details'].values[0])
In [84]:
plt.figure(figsize=(13,5))
tests_df['F1'].plot(kind='bar', ylim=(0.6,None))
Audio(url='./Beep 2.wav', autoplay=True)
Out[84]:
In [ ]: