In [1]:
import book_classification as bc
import pandas
import shelve
import math
import matplotlib.pyplot as plt
In [2]:
myShelf = shelve.open("storage_new.db")
aBookCollection = myShelf['aBookCollection']
aDataFrame = aBookCollection.as_dataframe()
myShelf.close()
In [3]:
aDataFrame.iloc[:, [0, 1]].describe()
Out[3]:
In [6]:
aDataFrame.groupby('Author').size().sort_values().plot(kind='bar', figsize=(15, 6))
Out[6]:
This is the distribution of book counts over authors.
In [19]:
#aDataFrame.groupby('Author').size().plot(kind='kde', figsize=(6, 5))
aDataFrame.groupby('Author').size().hist()
Out[19]:
In [10]:
tokenizer = bc.BasicTokenizer()
aBookAnalysis = bc.BookCollectionAnalysis(aBookCollection, tokenizer)
The vocabulary size (unique words) for each book.
In [11]:
aBookAnalysis.vocabulary_size_by_book().set_index('Book').sort_values('Unique words').plot()
Out[11]:
The vocabulary size (unique words) for each author.
In [12]:
dataframe = aBookAnalysis.vocabulary_size_by_author().set_index('Author').sort_values('Unique words')
dataframe.plot(kind='bar', figsize=(15, 6))
Out[12]:
In [89]:
pandas.Series(aBookAnalysis.shared_words_by_authors()).apply(math.log10).plot(figsize=(6, 4))
Out[89]:
In [31]:
pandas.Series(aBookAnalysis.shared_words_by_books()).apply(math.log10).plot(figsize=(8, 4))
Out[31]:
These are cumulative totals: each observation is the number of words that appear in N authors or fewer (and, in the second plot, N books or fewer).
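For reference, here is a minimal sketch (independent of book_classification) of how such a cumulative series can be built from a hypothetical word-to-authors mapping:
import collections
import pandas
# hypothetical mapping: word -> set of authors that use it
word_authors = {"the": {"A", "B", "C"}, "whale": {"A"}, "ship": {"A", "B"}}
# number of words that appear in exactly N authors
exact = collections.Counter(len(authors) for authors in word_authors.values())
# cumulative total: number of words that appear in N authors or fewer
print(pandas.Series(exact).sort_index().cumsum())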
In [143]:
pandas.Series(aBookAnalysis.shared_words_by_authors()).cumsum().apply(math.log).plot()
Out[143]:
In [142]:
pandas.Series(aBookAnalysis.shared_words_by_books()).cumsum().apply(math.log).plot()
Out[142]:
In [79]:
vocabularySizes = aBookAnalysis.vocabulary_size_by_book()['Unique words'] / len(aBookAnalysis.vocabulary().total())
vocabularySizes.hist(bins=100, figsize=(10, 5))
#vocabularySizes.plot(kind='kde')
Out[79]:
In [35]:
print(vocabularySizes.sum() / len(vocabularySizes))
Let's look at the differences between word frequencies and word entropies. Note that the logarithm is applied to the frequencies, so they are on the same scale as the entropies.
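To see why the scales match: with K groups, a word concentrated in a single group has entropy 0, while a word spread uniformly over all groups has entropy log K, so entropies vary over the same logarithmic range as the log-transformed frequencies. A toy illustration (not using the library's extractors):
import math

def entropy(counts):
    # Shannon entropy (natural log) of a word's counts across groups
    total = sum(counts)
    return -sum(c / total * math.log(c / total) for c in counts if c > 0)

# two words with the same total frequency (12) but different spread over 4 groups
print(entropy([12, 0, 0, 0]))  # 0.0: concentrated word
print(entropy([3, 3, 3, 3]))   # log(4) ~ 1.39: evenly spread word
print(math.log(12))            # ~ 2.48: the shared log-frequency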
In [30]:
frequenciesExtractor = bc.FrequenciesExtractor(tokenizer)
entropiesExtractor = bc.EntropiesExtractor(tokenizer, bc.FixedGrouper(500))
frequencies = bc.CollectionHierarchialFeatures.from_book_collection(aBookCollection, frequenciesExtractor)
entropies = bc.CollectionHierarchialFeatures.from_book_collection(aBookCollection, entropiesExtractor)
In [35]:
df_input = []
# pair each word's log-frequency with its entropy, then sort by increasing frequency
for word in aBookAnalysis._vocabulary.total().keys():
    df_input.append([math.log(frequencies.total()[word]), entropies.total()[word]])
df_input.sort()
entropies_vs_frequencies = pandas.DataFrame(df_input, columns=["Frequencies", "Entropies"])
In [36]:
entropies_vs_frequencies.plot(kind='kde', figsize=(8, 8), subplots=True, sharex=False)
Out[36]:
If we plot the two distributions individually, we can't see the difference (apart from the scales). But by sorting the pairs according to one of them (in this case, the frequencies), it becomes clear that the entropies are not the same.
Moreover, the maximum grows in a similar fashion to the frequencies, but frequency can't explain the additional variation. So entropy seems to carry more information about the words than frequency does.
In [77]:
#entropies_vs_frequencies["Entropies"].plot(figsize=(12, 4))
fig = plt.figure(figsize=(12, 5))
l = len(entropies_vs_frequencies["Entropies"])
plt.axis([0, l, 0, 1])
plt.scatter(range(l), entropies_vs_frequencies["Entropies"], s=1, alpha=0.05)
Out[77]:
By zooming in on the tail (> 140000), we see that the pattern continues.
In [96]:
plt.figure(figsize=(12, 5))
l = len(entropies_vs_frequencies["Entropies"])
plt.axis([140000, l, 0, 1])
plt.scatter(range(l), entropies_vs_frequencies["Entropies"], s=1, alpha=0.2)
Out[96]:
In [97]:
plt.figure(figsize=(12, 5))
l = len(entropies_vs_frequencies["Entropies"])
plt.axis([130000, 150000, 0, 1])
plt.scatter(range(l), entropies_vs_frequencies["Entropies"], s=1, alpha=0.2)
Out[97]:
In [108]:
# TODO: get a decent density plot of x=freq,y=entr with log color map
#figure(figsize(10, 10))
#scatter(entropies_vs_frequencies["Frequencies"], entropies_vs_frequencies["Entropies"])
#figure(figsize(5, 5))
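One possible way to get the density plot mentioned in the TODO above (a sketch, not part of the original analysis) is matplotlib's hexbin with a logarithmic colour scale:
plt.figure(figsize=(8, 8))
plt.hexbin(entropies_vs_frequencies["Frequencies"],
           entropies_vs_frequencies["Entropies"],
           gridsize=100, bins='log', cmap='viridis')
plt.xlabel("log frequency")
plt.ylabel("entropy")
plt.colorbar(label="log10(count)")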
The two seem to be almost equal much of the time, but informative words (with higher frequencies) appear increasingly often.
The following distribution of differences in the entropy series (sorted by increasing frequency) shows how much the entropy varies between words with essentially the same frequency.
In [105]:
entropies_vs_frequencies["Entropies"].diff().dropna().apply(abs).hist(log=True)
Out[105]:
If we want to ignore rare words (those that appear in few books and few authors), we can make the distribution look more "Gaussian".
That analysis is in another file, so that the two can be compared.
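A minimal sketch of the kind of filtering meant here, using the log-frequency column already computed as a rough proxy for rarity (the original uses book/author counts, and the threshold below is arbitrary):
MIN_COUNT = 50  # hypothetical cutoff: drop words with fewer than 50 occurrences
frequent = entropies_vs_frequencies[
    entropies_vs_frequencies["Frequencies"] >= math.log(MIN_COUNT)]
frequent["Entropies"].diff().dropna().apply(abs).hist(log=True)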