In [1]:
'''load dataset'''
from sklearn.datasets import fetch_20newsgroups
train = fetch_20newsgroups(subset='train')
test = fetch_20newsgroups(subset='test')

In [2]:
'''feature extraction using different ngram analyzers'''
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
ngrams = [(1, 1, 2), (1, 1, 3), (1, 1, 4), (1, 2, 2), (1, 2, 3)]
train_Xs, test_Xs = [], []
from pandas import DataFrame, Series, Index
frame = DataFrame(columns=['vectorizer', 'ngram_min', 'ngram_max', 'num_feature', 'min_df',
                           'train_time', 'model', 'score'], )
string = train.data[0][:50]
from fabric.colors import red, blue
print red('Example text: %s' % string)
index = 0
for ngram_min in range(1, 4):
    for ngram_max in range(ngram_min, 4):
        for min_df in range(1, 3):
            vect = CountVectorizer(ngram_range=(ngram_min, ngram_max), min_df=min_df)
            anal = vect.build_analyzer()
            analyze_eg = str(anal(string))
            print blue('ngram min=%d max=%d min_df=%d' % (ngram_min, ngram_max, min_df))
            print '\tresult:%s\n' % (analyze_eg)
            model = LinearSVC()
            frame = frame.append(DataFrame({'vectorizer': vect,
                                            'ngram_min': ngram_min,
                                            'ngram_max': ngram_max,
                                            'min_df': min_df,
                                            'model': model,
                                            },
                                            index=Index([index])))
            index += 1

frame


Example text: From: lerxst@wam.umd.edu (where's my thing)
Subjec
ngram min=1 max=1 min_df=1
	result:[u'from', u'lerxst', u'wam', u'umd', u'edu', u'where', u'my', u'thing', u'subjec']

ngram min=1 max=1 min_df=2
	result:[u'from', u'lerxst', u'wam', u'umd', u'edu', u'where', u'my', u'thing', u'subjec']

ngram min=1 max=2 min_df=1
	result:[u'from', u'lerxst', u'wam', u'umd', u'edu', u'where', u'my', u'thing', u'subjec', u'from lerxst', u'lerxst wam', u'wam umd', u'umd edu', u'edu where', u'where my', u'my thing', u'thing subjec']

ngram min=1 max=2 min_df=2
	result:[u'from', u'lerxst', u'wam', u'umd', u'edu', u'where', u'my', u'thing', u'subjec', u'from lerxst', u'lerxst wam', u'wam umd', u'umd edu', u'edu where', u'where my', u'my thing', u'thing subjec']

ngram min=1 max=3 min_df=1
	result:[u'from', u'lerxst', u'wam', u'umd', u'edu', u'where', u'my', u'thing', u'subjec', u'from lerxst', u'lerxst wam', u'wam umd', u'umd edu', u'edu where', u'where my', u'my thing', u'thing subjec', u'from lerxst wam', u'lerxst wam umd', u'wam umd edu', u'umd edu where', u'edu where my', u'where my thing', u'my thing subjec']

ngram min=1 max=3 min_df=2
	result:[u'from', u'lerxst', u'wam', u'umd', u'edu', u'where', u'my', u'thing', u'subjec', u'from lerxst', u'lerxst wam', u'wam umd', u'umd edu', u'edu where', u'where my', u'my thing', u'thing subjec', u'from lerxst wam', u'lerxst wam umd', u'wam umd edu', u'umd edu where', u'edu where my', u'where my thing', u'my thing subjec']

ngram min=2 max=2 min_df=1
	result:[u'from lerxst', u'lerxst wam', u'wam umd', u'umd edu', u'edu where', u'where my', u'my thing', u'thing subjec']

ngram min=2 max=2 min_df=2
	result:[u'from lerxst', u'lerxst wam', u'wam umd', u'umd edu', u'edu where', u'where my', u'my thing', u'thing subjec']

ngram min=2 max=3 min_df=1
	result:[u'from lerxst', u'lerxst wam', u'wam umd', u'umd edu', u'edu where', u'where my', u'my thing', u'thing subjec', u'from lerxst wam', u'lerxst wam umd', u'wam umd edu', u'umd edu where', u'edu where my', u'where my thing', u'my thing subjec']

ngram min=2 max=3 min_df=2
	result:[u'from lerxst', u'lerxst wam', u'wam umd', u'umd edu', u'edu where', u'where my', u'my thing', u'thing subjec', u'from lerxst wam', u'lerxst wam umd', u'wam umd edu', u'umd edu where', u'edu where my', u'where my thing', u'my thing subjec']

ngram min=3 max=3 min_df=1
	result:[u'from lerxst wam', u'lerxst wam umd', u'wam umd edu', u'umd edu where', u'edu where my', u'where my thing', u'my thing subjec']

ngram min=3 max=3 min_df=2
	result:[u'from lerxst wam', u'lerxst wam umd', u'wam umd edu', u'umd edu where', u'edu where my', u'where my thing', u'my thing subjec']

Out[2]:
min_df model ngram_max ngram_min num_feature score train_time vectorizer
0 1 LinearSVC(C=1.0, class_weight=None, dual=True,... 1 1 NaN NaN NaN CountVectorizer(analyzer=word, binary=False, c...
1 2 LinearSVC(C=1.0, class_weight=None, dual=True,... 1 1 NaN NaN NaN CountVectorizer(analyzer=word, binary=False, c...
2 1 LinearSVC(C=1.0, class_weight=None, dual=True,... 2 1 NaN NaN NaN CountVectorizer(analyzer=word, binary=False, c...
3 2 LinearSVC(C=1.0, class_weight=None, dual=True,... 2 1 NaN NaN NaN CountVectorizer(analyzer=word, binary=False, c...
4 1 LinearSVC(C=1.0, class_weight=None, dual=True,... 3 1 NaN NaN NaN CountVectorizer(analyzer=word, binary=False, c...
5 2 LinearSVC(C=1.0, class_weight=None, dual=True,... 3 1 NaN NaN NaN CountVectorizer(analyzer=word, binary=False, c...
6 1 LinearSVC(C=1.0, class_weight=None, dual=True,... 2 2 NaN NaN NaN CountVectorizer(analyzer=word, binary=False, c...
7 2 LinearSVC(C=1.0, class_weight=None, dual=True,... 2 2 NaN NaN NaN CountVectorizer(analyzer=word, binary=False, c...
8 1 LinearSVC(C=1.0, class_weight=None, dual=True,... 3 2 NaN NaN NaN CountVectorizer(analyzer=word, binary=False, c...
9 2 LinearSVC(C=1.0, class_weight=None, dual=True,... 3 2 NaN NaN NaN CountVectorizer(analyzer=word, binary=False, c...
10 1 LinearSVC(C=1.0, class_weight=None, dual=True,... 3 3 NaN NaN NaN CountVectorizer(analyzer=word, binary=False, c...
11 2 LinearSVC(C=1.0, class_weight=None, dual=True,... 3 3 NaN NaN NaN CountVectorizer(analyzer=word, binary=False, c...

In [3]:
'''Run experiments'''
import time
num_train, num_test = 2000, 500
for index, exp in frame.iterrows():
    start_time = time.time()
    train_X, test_X = train.data[:num_train], test.data[:num_test]
    train_Y, test_Y = train.target[:num_train], test.target[:num_test]
    vect = exp['vectorizer']
    vect.fit(train_X)
    vect.fit(test_X)
    train_X, test_X = vect.transform(train_X), vect.transform(test_X)
    model = exp['model']
    model.fit(train_X, train_Y)
    frame.ix[index, 'num_feature'] = train_X.shape[1]
    frame.ix[index, 'train_time'] = time.time() - start_time
    frame.ix[index, 'score'] = model.score(test_X, test.target[:test_X.shape[0]])

frame


Out[3]:
min_df model ngram_max ngram_min num_feature score train_time vectorizer
0 1 LinearSVC(C=1.0, class_weight=None, dual=True,... 1 1 16316 0.614 5.340934 CountVectorizer(analyzer=word, binary=False, c...
1 2 LinearSVC(C=1.0, class_weight=None, dual=True,... 1 1 6461 0.598 4.803072 CountVectorizer(analyzer=word, binary=False, c...
2 1 LinearSVC(C=1.0, class_weight=None, dual=True,... 2 1 90265 0.636 12.13506 CountVectorizer(analyzer=word, binary=False, c...
3 2 LinearSVC(C=1.0, class_weight=None, dual=True,... 2 1 17796 0.626 7.965464 CountVectorizer(analyzer=word, binary=False, c...
4 1 LinearSVC(C=1.0, class_weight=None, dual=True,... 3 1 193684 0.642 17.5893 CountVectorizer(analyzer=word, binary=False, c...
5 2 LinearSVC(C=1.0, class_weight=None, dual=True,... 3 1 24442 0.632 12.89869 CountVectorizer(analyzer=word, binary=False, c...
6 1 LinearSVC(C=1.0, class_weight=None, dual=True,... 2 2 73949 0.552 6.732035 CountVectorizer(analyzer=word, binary=False, c...
7 2 LinearSVC(C=1.0, class_weight=None, dual=True,... 2 2 11335 0.498 5.102484 CountVectorizer(analyzer=word, binary=False, c...
8 1 LinearSVC(C=1.0, class_weight=None, dual=True,... 3 2 177368 0.548 14.7479 CountVectorizer(analyzer=word, binary=False, c...
9 2 LinearSVC(C=1.0, class_weight=None, dual=True,... 3 2 17981 0.484 9.869578 CountVectorizer(analyzer=word, binary=False, c...
10 1 LinearSVC(C=1.0, class_weight=None, dual=True,... 3 3 103419 0.472 8.157876 CountVectorizer(analyzer=word, binary=False, c...
11 2 LinearSVC(C=1.0, class_weight=None, dual=True,... 3 3 6646 0.39 5.16128 CountVectorizer(analyzer=word, binary=False, c...

In [4]:
'''Plot result'''
import matplotlib.pyplot as plt
fig, axs = plt.subplots(1, 3)
fig.set_figwidth(15)
ax = axs[0]
ax.scatter(frame['num_feature'].values, frame['score'])
ax.set_xlim(0)
ax.set_xlabel('num_feature')
ax.set_ylabel('score (accuracy)')
ax = axs[1]
ax.scatter(frame['min_df'].values, frame['score'])
ax.set_xlabel('min_df')
ax = axs[2]
ax.scatter(frame['ngram_max'].values, frame['score'])
ax.set_xlabel('ngram_max')


Out[4]:
<matplotlib.text.Text at 0x26c01550>