notebook.community

Edit and run



In [1]:

    
'''load dataset'''
from sklearn.datasets import fetch_20newsgroups
train = fetch_20newsgroups(subset='train')
test = fetch_20newsgroups(subset='test')



In [2]:

    
'''feature extraction using different ngram analyzers'''
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
ngrams = [(1, 1, 2), (1, 1, 3), (1, 1, 4), (1, 2, 2), (1, 2, 3)]
train_Xs, test_Xs = [], []
from pandas import DataFrame, Series, Index
frame = DataFrame(columns=['vectorizer', 'ngram_min', 'ngram_max', 'num_feature', 'min_df',
                           'train_time', 'model', 'score'], )
string = train.data[0][:50]
from fabric.colors import red, blue
print red('Example text: %s' % string)
index = 0
for ngram_min in range(1, 4):
    for ngram_max in range(ngram_min, 4):
        for min_df in range(1, 3):
            vect = CountVectorizer(ngram_range=(ngram_min, ngram_max), min_df=min_df)
            anal = vect.build_analyzer()
            analyze_eg = str(anal(string))
            print blue('ngram min=%d max=%d min_df=%d' % (ngram_min, ngram_max, min_df))
            print '\tresult:%s\n' % (analyze_eg)
            model = LinearSVC()
            frame = frame.append(DataFrame({'vectorizer': vect,
                                            'ngram_min': ngram_min,
                                            'ngram_max': ngram_max,
                                            'min_df': min_df,
                                            'model': model,
                                            },
                                            index=Index([index])))
            index += 1

frame









    



Example text: From: lerxst@wam.umd.edu (where's my thing)
Subjec
ngram min=1 max=1 min_df=1
	result:[u'from', u'lerxst', u'wam', u'umd', u'edu', u'where', u'my', u'thing', u'subjec']

ngram min=1 max=1 min_df=2
	result:[u'from', u'lerxst', u'wam', u'umd', u'edu', u'where', u'my', u'thing', u'subjec']

ngram min=1 max=2 min_df=1
	result:[u'from', u'lerxst', u'wam', u'umd', u'edu', u'where', u'my', u'thing', u'subjec', u'from lerxst', u'lerxst wam', u'wam umd', u'umd edu', u'edu where', u'where my', u'my thing', u'thing subjec']

ngram min=1 max=2 min_df=2
	result:[u'from', u'lerxst', u'wam', u'umd', u'edu', u'where', u'my', u'thing', u'subjec', u'from lerxst', u'lerxst wam', u'wam umd', u'umd edu', u'edu where', u'where my', u'my thing', u'thing subjec']

ngram min=1 max=3 min_df=1
	result:[u'from', u'lerxst', u'wam', u'umd', u'edu', u'where', u'my', u'thing', u'subjec', u'from lerxst', u'lerxst wam', u'wam umd', u'umd edu', u'edu where', u'where my', u'my thing', u'thing subjec', u'from lerxst wam', u'lerxst wam umd', u'wam umd edu', u'umd edu where', u'edu where my', u'where my thing', u'my thing subjec']

ngram min=1 max=3 min_df=2
	result:[u'from', u'lerxst', u'wam', u'umd', u'edu', u'where', u'my', u'thing', u'subjec', u'from lerxst', u'lerxst wam', u'wam umd', u'umd edu', u'edu where', u'where my', u'my thing', u'thing subjec', u'from lerxst wam', u'lerxst wam umd', u'wam umd edu', u'umd edu where', u'edu where my', u'where my thing', u'my thing subjec']

ngram min=2 max=2 min_df=1
	result:[u'from lerxst', u'lerxst wam', u'wam umd', u'umd edu', u'edu where', u'where my', u'my thing', u'thing subjec']

ngram min=2 max=2 min_df=2
	result:[u'from lerxst', u'lerxst wam', u'wam umd', u'umd edu', u'edu where', u'where my', u'my thing', u'thing subjec']

ngram min=2 max=3 min_df=1
	result:[u'from lerxst', u'lerxst wam', u'wam umd', u'umd edu', u'edu where', u'where my', u'my thing', u'thing subjec', u'from lerxst wam', u'lerxst wam umd', u'wam umd edu', u'umd edu where', u'edu where my', u'where my thing', u'my thing subjec']

ngram min=2 max=3 min_df=2
	result:[u'from lerxst', u'lerxst wam', u'wam umd', u'umd edu', u'edu where', u'where my', u'my thing', u'thing subjec', u'from lerxst wam', u'lerxst wam umd', u'wam umd edu', u'umd edu where', u'edu where my', u'where my thing', u'my thing subjec']

ngram min=3 max=3 min_df=1
	result:[u'from lerxst wam', u'lerxst wam umd', u'wam umd edu', u'umd edu where', u'edu where my', u'where my thing', u'my thing subjec']

ngram min=3 max=3 min_df=2
	result:[u'from lerxst wam', u'lerxst wam umd', u'wam umd edu', u'umd edu where', u'edu where my', u'where my thing', u'my thing subjec']







    Out[2]:






  
    
      
      min_df
      model
      ngram_max
      ngram_min
      num_feature
      score
      train_time
      vectorizer
    
  
  
    
      0 
       1
       LinearSVC(C=1.0, class_weight=None, dual=True,...
       1
       1
       NaN
       NaN
       NaN
       CountVectorizer(analyzer=word, binary=False, c...
    
    
      1 
       2
       LinearSVC(C=1.0, class_weight=None, dual=True,...
       1
       1
       NaN
       NaN
       NaN
       CountVectorizer(analyzer=word, binary=False, c...
    
    
      2 
       1
       LinearSVC(C=1.0, class_weight=None, dual=True,...
       2
       1
       NaN
       NaN
       NaN
       CountVectorizer(analyzer=word, binary=False, c...
    
    
      3 
       2
       LinearSVC(C=1.0, class_weight=None, dual=True,...
       2
       1
       NaN
       NaN
       NaN
       CountVectorizer(analyzer=word, binary=False, c...
    
    
      4 
       1
       LinearSVC(C=1.0, class_weight=None, dual=True,...
       3
       1
       NaN
       NaN
       NaN
       CountVectorizer(analyzer=word, binary=False, c...
    
    
      5 
       2
       LinearSVC(C=1.0, class_weight=None, dual=True,...
       3
       1
       NaN
       NaN
       NaN
       CountVectorizer(analyzer=word, binary=False, c...
    
    
      6 
       1
       LinearSVC(C=1.0, class_weight=None, dual=True,...
       2
       2
       NaN
       NaN
       NaN
       CountVectorizer(analyzer=word, binary=False, c...
    
    
      7 
       2
       LinearSVC(C=1.0, class_weight=None, dual=True,...
       2
       2
       NaN
       NaN
       NaN
       CountVectorizer(analyzer=word, binary=False, c...
    
    
      8 
       1
       LinearSVC(C=1.0, class_weight=None, dual=True,...
       3
       2
       NaN
       NaN
       NaN
       CountVectorizer(analyzer=word, binary=False, c...
    
    
      9 
       2
       LinearSVC(C=1.0, class_weight=None, dual=True,...
       3
       2
       NaN
       NaN
       NaN
       CountVectorizer(analyzer=word, binary=False, c...
    
    
      10
       1
       LinearSVC(C=1.0, class_weight=None, dual=True,...
       3
       3
       NaN
       NaN
       NaN
       CountVectorizer(analyzer=word, binary=False, c...
    
    
      11
       2
       LinearSVC(C=1.0, class_weight=None, dual=True,...
       3
       3
       NaN
       NaN
       NaN
       CountVectorizer(analyzer=word, binary=False, c...



In [3]:

    
'''Run experiments'''
import time
num_train, num_test = 2000, 500
for index, exp in frame.iterrows():
    start_time = time.time()
    train_X, test_X = train.data[:num_train], test.data[:num_test]
    train_Y, test_Y = train.target[:num_train], test.target[:num_test]
    vect = exp['vectorizer']
    vect.fit(train_X)
    vect.fit(test_X)
    train_X, test_X = vect.transform(train_X), vect.transform(test_X)
    model = exp['model']
    model.fit(train_X, train_Y)
    frame.ix[index, 'num_feature'] = train_X.shape[1]
    frame.ix[index, 'train_time'] = time.time() - start_time
    frame.ix[index, 'score'] = model.score(test_X, test.target[:test_X.shape[0]])

frame









    Out[3]:






  
    
      
      min_df
      model
      ngram_max
      ngram_min
      num_feature
      score
      train_time
      vectorizer
    
  
  
    
      0 
       1
       LinearSVC(C=1.0, class_weight=None, dual=True,...
       1
       1
        16316
       0.614
       5.340934
       CountVectorizer(analyzer=word, binary=False, c...
    
    
      1 
       2
       LinearSVC(C=1.0, class_weight=None, dual=True,...
       1
       1
         6461
       0.598
       4.803072
       CountVectorizer(analyzer=word, binary=False, c...
    
    
      2 
       1
       LinearSVC(C=1.0, class_weight=None, dual=True,...
       2
       1
        90265
       0.636
       12.13506
       CountVectorizer(analyzer=word, binary=False, c...
    
    
      3 
       2
       LinearSVC(C=1.0, class_weight=None, dual=True,...
       2
       1
        17796
       0.626
       7.965464
       CountVectorizer(analyzer=word, binary=False, c...
    
    
      4 
       1
       LinearSVC(C=1.0, class_weight=None, dual=True,...
       3
       1
       193684
       0.642
        17.5893
       CountVectorizer(analyzer=word, binary=False, c...
    
    
      5 
       2
       LinearSVC(C=1.0, class_weight=None, dual=True,...
       3
       1
        24442
       0.632
       12.89869
       CountVectorizer(analyzer=word, binary=False, c...
    
    
      6 
       1
       LinearSVC(C=1.0, class_weight=None, dual=True,...
       2
       2
        73949
       0.552
       6.732035
       CountVectorizer(analyzer=word, binary=False, c...
    
    
      7 
       2
       LinearSVC(C=1.0, class_weight=None, dual=True,...
       2
       2
        11335
       0.498
       5.102484
       CountVectorizer(analyzer=word, binary=False, c...
    
    
      8 
       1
       LinearSVC(C=1.0, class_weight=None, dual=True,...
       3
       2
       177368
       0.548
        14.7479
       CountVectorizer(analyzer=word, binary=False, c...
    
    
      9 
       2
       LinearSVC(C=1.0, class_weight=None, dual=True,...
       3
       2
        17981
       0.484
       9.869578
       CountVectorizer(analyzer=word, binary=False, c...
    
    
      10
       1
       LinearSVC(C=1.0, class_weight=None, dual=True,...
       3
       3
       103419
       0.472
       8.157876
       CountVectorizer(analyzer=word, binary=False, c...
    
    
      11
       2
       LinearSVC(C=1.0, class_weight=None, dual=True,...
       3
       3
         6646
        0.39
        5.16128
       CountVectorizer(analyzer=word, binary=False, c...



In [4]:

    
'''Plot result'''
import matplotlib.pyplot as plt
fig, axs = plt.subplots(1, 3)
fig.set_figwidth(15)
ax = axs[0]
ax.scatter(frame['num_feature'].values, frame['score'])
ax.set_xlim(0)
ax.set_xlabel('num_feature')
ax.set_ylabel('score (accuracy)')
ax = axs[1]
ax.scatter(frame['min_df'].values, frame['score'])
ax.set_xlabel('min_df')
ax = axs[2]
ax.scatter(frame['ngram_max'].values, frame['score'])
ax.set_xlabel('ngram_max')









    Out[4]:





<matplotlib.text.Text at 0x26c01550>

	min_df	model	ngram_max	ngram_min	num_feature	score	train_time	vectorizer
0	1	LinearSVC(C=1.0, class_weight=None, dual=True,...	1	1	NaN	NaN	NaN	CountVectorizer(analyzer=word, binary=False, c...
1	2	LinearSVC(C=1.0, class_weight=None, dual=True,...	1	1	NaN	NaN	NaN	CountVectorizer(analyzer=word, binary=False, c...
2	1	LinearSVC(C=1.0, class_weight=None, dual=True,...	2	1	NaN	NaN	NaN	CountVectorizer(analyzer=word, binary=False, c...
3	2	LinearSVC(C=1.0, class_weight=None, dual=True,...	2	1	NaN	NaN	NaN	CountVectorizer(analyzer=word, binary=False, c...
4	1	LinearSVC(C=1.0, class_weight=None, dual=True,...	3	1	NaN	NaN	NaN	CountVectorizer(analyzer=word, binary=False, c...
5	2	LinearSVC(C=1.0, class_weight=None, dual=True,...	3	1	NaN	NaN	NaN	CountVectorizer(analyzer=word, binary=False, c...
6	1	LinearSVC(C=1.0, class_weight=None, dual=True,...	2	2	NaN	NaN	NaN	CountVectorizer(analyzer=word, binary=False, c...
7	2	LinearSVC(C=1.0, class_weight=None, dual=True,...	2	2	NaN	NaN	NaN	CountVectorizer(analyzer=word, binary=False, c...
8	1	LinearSVC(C=1.0, class_weight=None, dual=True,...	3	2	NaN	NaN	NaN	CountVectorizer(analyzer=word, binary=False, c...
9	2	LinearSVC(C=1.0, class_weight=None, dual=True,...	3	2	NaN	NaN	NaN	CountVectorizer(analyzer=word, binary=False, c...
10	1	LinearSVC(C=1.0, class_weight=None, dual=True,...	3	3	NaN	NaN	NaN	CountVectorizer(analyzer=word, binary=False, c...
11	2	LinearSVC(C=1.0, class_weight=None, dual=True,...	3	3	NaN	NaN	NaN	CountVectorizer(analyzer=word, binary=False, c...

	min_df	model	ngram_max	ngram_min	num_feature	score	train_time	vectorizer
0	1	LinearSVC(C=1.0, class_weight=None, dual=True,...	1	1	16316	0.614	5.340934	CountVectorizer(analyzer=word, binary=False, c...
1	2	LinearSVC(C=1.0, class_weight=None, dual=True,...	1	1	6461	0.598	4.803072	CountVectorizer(analyzer=word, binary=False, c...
2	1	LinearSVC(C=1.0, class_weight=None, dual=True,...	2	1	90265	0.636	12.13506	CountVectorizer(analyzer=word, binary=False, c...
3	2	LinearSVC(C=1.0, class_weight=None, dual=True,...	2	1	17796	0.626	7.965464	CountVectorizer(analyzer=word, binary=False, c...
4	1	LinearSVC(C=1.0, class_weight=None, dual=True,...	3	1	193684	0.642	17.5893	CountVectorizer(analyzer=word, binary=False, c...
5	2	LinearSVC(C=1.0, class_weight=None, dual=True,...	3	1	24442	0.632	12.89869	CountVectorizer(analyzer=word, binary=False, c...
6	1	LinearSVC(C=1.0, class_weight=None, dual=True,...	2	2	73949	0.552	6.732035	CountVectorizer(analyzer=word, binary=False, c...
7	2	LinearSVC(C=1.0, class_weight=None, dual=True,...	2	2	11335	0.498	5.102484	CountVectorizer(analyzer=word, binary=False, c...
8	1	LinearSVC(C=1.0, class_weight=None, dual=True,...	3	2	177368	0.548	14.7479	CountVectorizer(analyzer=word, binary=False, c...
9	2	LinearSVC(C=1.0, class_weight=None, dual=True,...	3	2	17981	0.484	9.869578	CountVectorizer(analyzer=word, binary=False, c...
10	1	LinearSVC(C=1.0, class_weight=None, dual=True,...	3	3	103419	0.472	8.157876	CountVectorizer(analyzer=word, binary=False, c...
11	2	LinearSVC(C=1.0, class_weight=None, dual=True,...	3	3	6646	0.39	5.16128	CountVectorizer(analyzer=word, binary=False, c...