In [5]:
import book_classification as bc
import shelve
import pandas
import numpy
import matplotlib.pyplot as plt
from sklearn import svm, decomposition, pipeline, metrics
In [6]:
# Load the persisted book collection from the on-disk shelve database.
# The shelf is closed explicitly in a `finally` clause so the underlying dbm
# file handle is released deterministically, even if the key lookup fails,
# instead of relying on `del` + garbage collection to do it.
# NOTE(review): shelve unpickles stored objects; only open database files
# that come from a trusted source.
myShelf = shelve.open("storage_new.db")
try:
    aBookCollection = myShelf['aBookCollection']
finally:
    myShelf.close()
# Drop the (now closed) handle from the notebook namespace, as before.
del myShelf
In [3]:
# Construct a VocabularyDistribution over the loaded collection, tokenised
# with a BasicTokenizer.
# NOTE(review): what statistics the distribution holds is defined inside
# book_classification and is not visible from this notebook — confirm there.
tokenizer = bc.BasicTokenizer()
vd = bc.VocabularyDistribution(aBookCollection, tokenizer)
In [4]:
# Wrap the vocabulary distribution in a pandas Series and draw it as a bar
# chart. The plot call stays the last expression so the cell still displays it.
# NOTE(review): as_dataframe() is passed straight to pandas.Series; its exact
# return type is not visible here — confirm it is Series-compatible.
vocabulary_series = pandas.Series(vd.as_dataframe())
vocabulary_series.plot(kind='bar')
In [7]:
# Step 1: filter the collection by author.
# presumably exclude_authors_below(7) drops authors with fewer than 7 books —
# TODO confirm against book_classification.
filtered_collection = aBookCollection.selection().exclude_authors_below(7)
# Step 2: divide the filtered collection into two parts.
# presumably .5 means half of each author's books go to each part — TODO confirm.
training, testing = filtered_collection.selection().split_per_author_percentage(.5)
In [8]:
# Build and train the classification pipeline:
#   feature extraction -> truncated SVD (50 components) -> SVM classifier.
tokenizer = bc.BasicTokenizer()
# NOTE(review): the meaning of 500 (presumably a group size in tokens) is
# defined by bc.FixedGrouper — confirm there.
grouper = bc.FixedGrouper(500)
extractor = bc.EntropiesExtractor(tokenizer, grouper)
matrix_extractor = bc.SklExtractor(extractor)
# "balanced" replaces the "auto" class-weight mode, which scikit-learn
# deprecated in 0.17 and later removed (newer releases reject "auto").
# Requires scikit-learn >= 0.17.
svm_model = svm.SVC(class_weight="balanced")
model = pipeline.Pipeline([
    ('extractor', matrix_extractor),
    # TruncatedSVD defaults to a randomized solver; pinning random_state
    # makes the reduced features (and everything downstream) reproducible.
    ('svd', decomposition.TruncatedSVD(n_components=50, random_state=0)),
    ('svm', bc.SklModelAdapter(svm_model)),
])
# assumes as_arrays() yields (books, authors) in that order, as the
# unpacking names suggest — TODO confirm.
books, authors = training.as_arrays()
# Kept as the last expression so the cell still displays the fitted pipeline.
model.fit(books, authors)
Out[8]:
In [9]:
# Unpack the testing split and run the fitted pipeline on its books.
# assumes as_arrays() yields (books, authors) in that order, as the
# unpacking names suggest — TODO confirm.
books2, authors2 = testing.as_arrays()
results = model.predict(books2)
In [11]:
# Heatmap of the confusion matrix for the testing split.
# `expected` / `predicted` are kept as notebook-level names because the
# reporting cells below read them.
expected = authors2
predicted = results
confusion = metrics.confusion_matrix(expected, predicted)

fig, ax = plt.subplots()
mesh = ax.pcolor(confusion)
# pcolor draws row 0 at the bottom; flip the y axis so the heatmap reads
# top-to-bottom in the same orientation as the printed matrix.
ax.invert_yaxis()
# confusion_matrix puts true labels on rows (y) and predictions on columns (x).
ax.set_xlabel("Predicted author (class index)")
ax.set_ylabel("True author (class index)")
ax.set_title("Confusion matrix (testing split)")
fig.colorbar(mesh, ax=ax)
plt.show()
Out[11]:
In [12]:
# Text summary of per-class classification metrics for expected vs. predicted labels.
report_text = metrics.classification_report(expected, predicted)
print(report_text)
In [15]:
# Raw confusion-matrix counts for expected vs. predicted labels.
confusion_counts = metrics.confusion_matrix(expected, predicted)
print(confusion_counts)
In [16]: