In [5]:
import book_classification as bc
import shelve
import pandas
import numpy
import matplotlib.pyplot as plt
from sklearn import svm, decomposition, pipeline, metrics

In [6]:
# Load the pre-built book collection from the shelve store on disk.
# NOTE(review): path is relative — assumes the notebook is run from the
# directory containing storage_new.db.
myShelf = shelve.open("storage_new.db")
try:
    aBookCollection = myShelf['aBookCollection']
finally:
    # Close explicitly instead of relying on `del` + refcounting to flush
    # and release the underlying dbm file (the original used bare `del`).
    myShelf.close()
del myShelf

In [3]:
# Build a tokenizer for splitting book text into tokens.
tokenizer = bc.BasicTokenizer()
# FIXME: this raises AttributeError — the bc module has no attribute
# `VocabularyDistribution` (see the traceback below). The class was
# presumably renamed or removed; find the current API and update, or
# delete this cell and the dependent one that follows.
vd = bc.VocabularyDistribution(aBookCollection, tokenizer)


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-3-8e6b7621fe99> in <module>()
      1 tokenizer = bc.BasicTokenizer()
----> 2 vd = bc.VocabularyDistribution(aBookCollection, tokenizer)

AttributeError: 'module' object has no attribute 'VocabularyDistribution'

In [4]:
# FIXME: `vd` is never defined because the previous cell failed with
# AttributeError (hence the NameError traceback below). Also, wrapping
# the result of as_dataframe() in pandas.Series looks suspect — if it
# really returns a DataFrame, `.plot(kind='bar')` can be called on it
# directly; verify once `vd` can be constructed.
pandas.Series(vd.as_dataframe()).plot(kind='bar')


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-4-e35667465ad1> in <module>()
----> 1 pandas.Series(vd.as_dataframe()).plot(kind='bar')

NameError: name 'vd' is not defined

In [7]:
# Keep only authors with at least 7 books, then split each author's
# books 50/50 into training and testing sets.
prolific_authors = aBookCollection.selection().exclude_authors_below(7)
training, testing = prolific_authors.selection().split_per_author_percentage(0.5)

In [8]:
# Feature pipeline: tokenize each book and split the token stream into
# fixed-size groups of 500 tokens, then compute per-word entropy features.
tokenizer = bc.BasicTokenizer()
grouper = bc.FixedGrouper(500)
extractor = bc.EntropiesExtractor(tokenizer, grouper)
# Adapter exposing the extractor through the sklearn transformer API.
matrix_extractor = bc.SklExtractor(extractor)

# class_weight="auto" reweights classes by inverse frequency to offset the
# class imbalance (author support ranges 3..47 in the report below).
# NOTE(review): "auto" is a deprecated alias of "balanced" in newer
# sklearn — confirm the installed version before upgrading.
svm_model = svm.SVC(class_weight="auto")
# Full model: entropy features -> 50-component truncated SVD -> SVM.
model = pipeline.Pipeline([('extractor', matrix_extractor), ('svd', decomposition.TruncatedSVD(50)), ('svm', bc.SklModelAdapter(svm_model))])

# Fit on the training half produced by the split above.
books, authors = training.as_arrays()
model.fit(books, authors)


Out[8]:
Pipeline(steps=[('extractor', <book_classification.sklearn_compat.SklExtractor object at 0x7fe323e9cd10>), ('svd', TruncatedSVD(algorithm='randomized', n_components=50, n_iterations=5,
       random_state=None, tol=0.0)), ('svm', <book_classification.sklearn_compat.SklModelAdapter object at 0x7fe323e9ce10>)])

In [9]:
# Predict an author for each book in the held-out test half.
# `books2`/`authors2`/`results` are read by the evaluation cells below.
books2, authors2 = testing.as_arrays()
results = model.predict(books2)

In [11]:
# Alias to the names the metrics cells below expect.
expected = authors2
predicted = results

# Heatmap of the confusion matrix. Use the explicit fig/ax interface,
# label the axes, and add a colorbar so the figure stands alone
# (the original bare plt.pcolor leaked its PolyCollection repr as output).
fig, ax = plt.subplots()
heatmap = ax.pcolor(metrics.confusion_matrix(expected, predicted))
fig.colorbar(heatmap, ax=ax)
ax.set_title("Author confusion matrix")
ax.set_xlabel("Predicted author (class index)")
ax.set_ylabel("True author (class index)")
plt.show()


Out[11]:
<matplotlib.collections.PolyCollection at 0x7fe3393bb8d0>

In [12]:
# Per-author precision / recall / F1 on the test split.
# classification_report returns a preformatted string, so print() is
# the right display here (rich display would escape the alignment).
report = metrics.classification_report(expected, predicted)
print(report)


             precision    recall  f1-score   support

Anthony Hamilton       1.00      1.00      1.00         5
Arthur Robert Harding       1.00      1.00      1.00         3
 Bret Harte       1.00      0.86      0.93        29
Charles Dickens       0.51      0.78      0.62        23
E. Raymond Hall       0.73      1.00      0.85        11
Edward E. Hale       1.00      0.75      0.86         4
Ernst Haeckel       1.00      1.00      1.00         3
F. Colburn Adams       0.00      0.00      0.00         3
Frank Harris       1.00      0.20      0.33         5
 George Ade       0.57      1.00      0.73         4
H. Irving Hancock       1.00      1.00      1.00        18
H. Rider Haggard       0.85      0.85      0.85        26
Harry Harrison       1.00      0.40      0.57         5
    Ian Hay       0.75      0.60      0.67         5
James B. Hendryx       1.00      1.00      1.00         5
Joel Chandler Harris       1.00      1.00      1.00         5
   John Hay       0.00      0.00      0.00         3
Julian Hawthorne       0.12      0.43      0.19         7
Lafcadio Hearn       1.00      0.86      0.92         7
Nathaniel Hawthorne       0.74      0.43      0.54        47
   O. Henry       1.00      0.57      0.73         7
Samuel Hopkins Adams       1.00      0.75      0.86         4
Thomas Bailey Aldrich       0.12      0.22      0.16         9
Thomas Hardy       1.00      0.85      0.92        13

avg / total       0.79      0.71      0.72       251

/home/ale/Programs/my-python3-env/lib/python3.3/site-packages/sklearn/metrics/metrics.py:1858: UserWarning: The sum of true positives and false positives are equal to zero for some labels. Precision is ill defined for those labels ['F. Colburn Adams']. The precision and recall are equal to zero for some labels. fbeta_score is ill defined for those labels ['F. Colburn Adams' 'John Hay']. 
  average=None)

In [15]:
# Raw confusion counts (rows = true author, columns = predicted author),
# printed as a plain array for exact inspection of the heatmap above.
conf_matrix = metrics.confusion_matrix(expected, predicted)
print(conf_matrix)


[[ 5  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  3  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0 25  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  0  0  2  0]
 [ 0  0  0 18  0  0  0  0  0  0  0  1  0  0  0  0  0  1  0  1  0  0  2  0]
 [ 0  0  0  0 11  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  3  0  0  0  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  3  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  2  0  0  0  0  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0]
 [ 0  0  0  2  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  1  0  0  1  0]
 [ 0  0  0  0  0  0  0  0  0  4  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0 18  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  1  0  0  0  0  0  0  0 22  0  0  0  0  0  1  0  0  0  0  2  0]
 [ 0  0  0  0  1  0  0  0  0  0  0  0  2  0  0  0  0  0  0  0  0  0  2  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  3  0  0  0  1  0  0  0  0  1  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  5  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  5  0  0  0  0  0  0  0  0]
 [ 0  0  0  2  0  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  1  0  0  0  0  0  0  0  1  0  0  0  0  2  3  0  0  0  0  0  0]
 [ 0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  6  0  0  0  0  0]
 [ 0  0  0  7  2  0  0  0  0  0  0  1  0  0  0  0  0 13  0 20  0  0  4  0]
 [ 0  0  0  0  0  0  0  0  0  2  0  0  0  0  0  0  0  0  0  1  4  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  3  0  0]
 [ 0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  3  0  3  0  0  2  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  1  0  0  0  0  0  1  0  0  0  0  0 11]]

In [16]: