In [1]:
import book_classification as bc
import shelve
import pandas
import numpy
import matplotlib.pyplot as plt
from sklearn import svm, decomposition, cross_validation
In [2]:
# Load the stored book collection from the on-disk shelf.
# NOTE(review): shelve is backed by pickle, so only open shelf files that come
# from a trusted source -- unpickling can execute arbitrary code.
myShelf = shelve.open("storage_new.db")
try:
    aBookCollection = myShelf['aBookCollection']
finally:
    # Close explicitly instead of relying on `del` to trigger finalisation, so
    # the underlying database file is released even if the lookup raises.
    myShelf.close()
    del myShelf
print(len(aBookCollection))
In [34]:
# Filter the corpus with exclude_authors_below(4), then split it per author.
# NOTE(review): presumably 4 is a minimum books-per-author threshold and 0.7 is
# the per-author share that goes to the training side -- confirm against
# book_classification.
anotherCollection = aBookCollection.selection().exclude_authors_below(4)
print(len(anotherCollection))

train_collection, test_collection = (
    anotherCollection.selection().split_per_author_percentage(0.7)
)
collection_dataframe = anotherCollection.as_dataframe()


def label_for(book):
    """Return 'Train' if `book` is in train_collection.books(), else 'Test'."""
    return 'Train' if book in train_collection.books() else 'Test'


# Tag every row by whether its 'Object' entry belongs to the training split.
collection_dataframe['Set'] = collection_dataframe['Object'].map(label_for)
In [83]:
# plt.figsize is not part of matplotlib; it only exists when IPython's pylab
# mode has patched pyplot. Set the default figure size through rcParams so the
# cell also runs on a plain kernel.
plt.rcParams['figure.figsize'] = (12, 4)

# Row counts per author, one column per 'Set' value; computed once and reused
# for both the summary table and the stacked bar chart.
set_sizes_by_author = (
    collection_dataframe.groupby(['Set', 'Author']).size().unstack('Set')
)
print(set_sizes_by_author.describe())
set_sizes_by_author.plot(kind='bar', stacked=True)
Out[83]:
In [53]:
# plt.figsize is not part of matplotlib; it only exists when IPython's pylab
# mode has patched pyplot. rcParams gives the same default figure size on any
# kernel.
plt.rcParams['figure.figsize'] = (12, 4)

# Bar chart of the number of test-split rows per author.
test_collection.as_dataframe().groupby('Author').size().plot(kind='bar')
Out[53]:
In [59]:
# Feature extraction pipeline: tokenizer -> fixed-size grouper -> entropies.
# NOTE(review): 500 (grouper) and 50 (TruncatedSVD) are untuned magic numbers
# here; consider moving them to a configuration cell.
tokenizer = bc.BasicTokenizer()
grouper = bc.FixedGrouper(500)
extractor = bc.EntropiesExtractor(tokenizer, grouper)
# Alternative extractor kept for comparison:
# extractor = bc.FrequenciesExtractor(tokenizer)

# The original passed `training`, a name no cell in this notebook defines
# (NameError on a fresh kernel); the split cell calls it `train_collection`.
model = bc.ClassificationModel(
    train_collection,
    extractor,
    decomposition.TruncatedSVD(50),
    svm.SVC(),
)
In [60]:
# Classify the held-out split. The original passed `testing`, a name no cell
# in this notebook defines; the split cell calls it `test_collection`.
results = model.classify(test_collection)
In [61]:
from sklearn.metrics import classification_report, confusion_matrix

# The original iterated over `testing`, a name no cell in this notebook
# defines; the split cell calls it `test_collection`.
# Materialise the books once so both lists are built from the same sequence,
# in the same order.
test_books = list(test_collection.books())
expected = [book.author() for book in test_books]
predicted = [results[book] for book in test_books]

# Heat map of the confusion matrix built from (expected, predicted).
plt.pcolor(confusion_matrix(expected, predicted))
Out[61]:
In [62]:
# Text summary of the classification metrics for the test predictions.
print(classification_report(y_true=expected, y_pred=predicted))
In [63]:
# Raw confusion matrix for the same expected/predicted labels.
print(confusion_matrix(y_true=expected, y_pred=predicted))
In [ ]: