In [1]:
import book_classification as bc
import shelve
import pandas
import numpy
import matplotlib.pyplot as plt
import math
In [2]:
myShelf = shelve.open("storage_new.db")
aBookCollection = myShelf['aBookCollection']
myShelf.close()
In [3]:
tokenizer = bc.BasicTokenizer()
aPossibleFeatureAnalyzer = bc.PossibleFeatureAnalyzer.from_documents(
    tokenizer, (book.contents for book in aBookCollection))
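bc.PossibleFeatureAnalyzer is part of the project's book_classification package, so its internals aren't shown here. Conceptually it accumulates word counts over the whole collection; a minimal sketch of that idea, assuming only that the tokenizer yields plain strings (count_words is a hypothetical helper, not the bc API):

from collections import Counter

def count_words(tokenizer, documents):
    # tally every token occurrence across the whole collection
    counts = Counter()
    for text in documents:
        counts.update(tokenizer.tokens_from(text))
    return counts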
In [4]:
aDataFrame = aPossibleFeatureAnalyzer.as_dataframe()
print(aDataFrame.describe())
countSeries = aDataFrame['Count']
print("Skewness: {}\nKurtosis: {}".format(countSeries.skew(), countSeries.kurt()))
The distribution is not well shaped, as the skewness and kurtosis above show. Below, a logarithmic box plot and density plot:
In [5]:
plt.figure(figsize=(4, 5))
plt.boxplot(countSeries.apply(numpy.log))
Out[5]:
In [6]:
plt.figure(figsize=(6, 4))
countSeries.apply(numpy.log).plot(kind='kde')
Out[6]:
We can see that the words at the extremes are either very rare words or stopwords.
In [7]:
df = aDataFrame.sort_values(by='Count')
print(df.head(5))
print(df.tail(5))
print(len(df))
In [45]:
plt.figure(figsize=(10, 5))
resolution = 100
quantiles = [x / resolution for x in range(resolution)]
plt.plot(quantiles, [math.log(countSeries.quantile(q)) for q in quantiles])
Out[45]:
In [46]:
anotherPossibleFeatureAnalyzer = aPossibleFeatureAnalyzer.prune_last_words(20).prune_less_occurrences_than(500)
anotherDataFrame = anotherPossibleFeatureAnalyzer.as_dataframe()
print(anotherDataFrame.describe())
anotherCountSeries = anotherDataFrame['Count']
print("Skewness: {}\nKurtosis: {}".format(anotherCountSeries.skew(), anotherCountSeries.kurt()))
In [47]:
df = anotherDataFrame.sort_values(by='Count')
print(df.head(5))
print(df.tail(5))
print(len(df))
In [48]:
anotherCountSeries.apply(numpy.log).plot(kind='kde')
Out[48]:
In [49]:
df['Count'].hist(log=True, bins=100)
Out[49]:
In [50]:
plt.figure(figsize=(4, 5))
plt.boxplot(df['Count'].apply(numpy.log))
Out[50]:
In [51]:
quantiles = [x / 40 for x in range(40)]
plt.plot(quantiles, [math.log(anotherCountSeries.quantile(q)) for q in quantiles])
Out[51]:
In [15]:
tokenizer = bc.BasicTokenizer()
grouper = bc.BasicGrouper(500)
entropies = {}
for book in aBookCollection:
    # token entropies per book, computed over consecutive parts of 500 tokens
    entropies[book] = bc.TokenEntropies.from_parts(
        grouper.parts_from(tokenizer.tokens_from(book.contents)))
In [16]:
import functools
# merge the per-book entropies into one collection-wide TokenEntropies
total_entropy = functools.reduce(lambda x, y: x.combine(y), entropies.values())
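bc.TokenEntropies is likewise project-specific. One plausible reading, assuming it measures how evenly each token spreads over the 500-token parts (normalized Shannon entropy, so a token concentrated in a few parts scores near 0 and a uniformly spread one near 1); token_entropy below is a hypothetical stand-in, not the bc implementation:

import math
from collections import Counter

def token_entropy(parts):
    # entropy of each token's distribution over the parts, normalized to [0, 1]
    per_part = [Counter(part) for part in parts]
    totals = Counter()
    for counts in per_part:
        totals.update(counts)
    result = {}
    for token, total in totals.items():
        probs = [counts[token] / total for counts in per_part if counts[token]]
        h = -sum(p * math.log(p) for p in probs)
        result[token] = h / math.log(len(per_part)) if len(per_part) > 1 else 0.0
    return result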
In [52]:
rows = [[word, total_entropy[word], count / anotherPossibleFeatureAnalyzer._total]
        for word, count in anotherPossibleFeatureAnalyzer._counts.items()]
dfEntropies = pandas.DataFrame(rows, columns=['Word', 'Entropy', 'Frequency'])
dfEntropies.Entropy.hist(log=True, bins=30)
Out[52]:
In [53]:
quantiles = [x / 20 for x in range(20)]
plt.plot(quantiles, [dfEntropies.Entropy.quantile(q) for q in quantiles])
Out[53]:
In [54]:
data = []
for word, count in anotherPossibleFeatureAnalyzer._counts.items():
    frequency = count / anotherPossibleFeatureAnalyzer._total
    data.append([word, frequency, total_entropy[word],
                 frequency / total_entropy[word]])
ranked = pandas.DataFrame(data, columns=['Word', 'Freq', 'Entropy', 'Both'])
ranked = ranked.sort_values(by='Both')
print(ranked.head(20))
plt.boxplot(ranked.Entropy)
Out[54]:
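The Both column divides a word's overall frequency by its combined entropy, so sorting ascending puts words whose frequency is low relative to their entropy first. The notebook doesn't go on to pick features from this ranking, but one illustrative way to turn the score into a vocabulary (the quantile cut-off is arbitrary, not part of the bc pipeline):

# hypothetical selection rule, keeping words above a score cut-off
threshold = ranked.Both.quantile(0.25)
selected_words = ranked.loc[ranked.Both >= threshold, 'Word'].tolist()
print(len(selected_words), 'words kept')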