In [1]:
    
import book_classification as bc
import shelve
import pandas
import numpy
import matplotlib.pyplot as plt
import statsmodels
    
In [2]:
    
# Load the persisted book collection from the shelve database.
# The shelf is closed explicitly: `del` alone only drops the name and leaves
# the underlying database handle open until the object happens to be
# collected.
# NOTE(review): shelve unpickles stored objects, so only open database files
# from a trusted source.
myShelf = shelve.open("storage_new.db")
try:
    aBookCollection = myShelf['aBookCollection']
finally:
    myShelf.close()
del myShelf
    
In [3]:
    
# Build the working subset by chaining the collection's own filters:
# exclude_authors_with_less_than(10) -> sample_authors(5) ->
# split_at_number_per_author(10). The final call yields a pair; only its
# first element is kept, the second is discarded into `_`.
# NOTE(review): sample_authors presumably samples at random — confirm whether
# a seed is needed for reproducible runs.
someBooks, _ = (
    aBookCollection
    .exclude_authors_with_less_than(10)
    .sample_authors(5)
    .split_at_number_per_author(10)
)

# Feature analyzer built from the selected books only.
aPossibleFeatureAnalyzer = bc.PossibleFeatureAnalyzer.from_book_collection(
    someBooks
)
    
In [4]:
    
# Frequency table for the sampled books, as produced by the analyzer's
# frequencies().dataframe_total().
freqDf = (
    aPossibleFeatureAnalyzer
    .frequencies()
    .dataframe_total()
)
    
In [5]:
    
# Kernel density estimate of the log10-transformed frequency table.
(
    freqDf
    .apply(numpy.log10)
    .plot(kind='kde')
)
    
    Out[5]:
    
In [6]:
    
import statsmodels.graphics.gofplots as gp
import scipy.stats
# Q-Q plot of the log10-transformed 'Value' column against a frozen scipy
# uniform distribution with default parameters. The return value is bound to
# `_` so the cell does not echo it.
_ = gp.qqplot(
    freqDf['Value'].apply(numpy.log10),
    dist=scipy.stats.distributions.uniform(),
)
    
    
In [7]:
    
# Entropy table for the sampled books, as produced by the analyzer's
# entropies().dataframe_total().
entrDf = (
    aPossibleFeatureAnalyzer
    .entropies()
    .dataframe_total()
)

# Kernel density estimate of the entropy table.
entrDf.plot(kind='kde')
    
    Out[7]:
    
In [8]:
    
# Q-Q plot of the entropy 'Value' column against a frozen scipy uniform
# distribution with default parameters. The return value is bound to `_` so
# the cell does not echo it.
_ = gp.qqplot(
    entrDf['Value'],
    dist=scipy.stats.distributions.uniform(),
)
    
    
In [9]:
    
# Pruned analyzer used by the cells that follow. The active call passes the
# quantile bounds (0, 1); the commented-out variant below used (0.35, 1).
# NOTE(review): (0, 1) presumably keeps the full frequency range, i.e. no
# pruning at all — confirm against prune_frequencies_quantiles.
#blah = aPossibleFeatureAnalyzer.prune_frequencies_quantiles(0.35, 1)
blah = aPossibleFeatureAnalyzer.prune_frequencies_quantiles(0, 1)
    
In [10]:
    
# Frequency table recomputed from the pruned analyzer.
freqDf2 = (
    blah
    .frequencies()
    .dataframe_total()
)
    
In [11]:
    
# Kernel density estimate of the log10-transformed pruned frequency table.
(
    freqDf2
    .apply(numpy.log10)
    .plot(kind='kde')
)
    
    Out[11]:
    
In [12]:
    
# Q-Q plot of the log10-transformed 'Value' column of the pruned frequency
# table against a frozen scipy uniform distribution with default parameters.
# The return value is bound to `_` so the cell does not echo it.
_ = gp.qqplot(
    freqDf2['Value'].apply(numpy.log10),
    dist=scipy.stats.distributions.uniform(),
)
    
    
In [13]:
    
# Entropy table recomputed from the pruned analyzer.
entrDf2 = (
    blah
    .entropies()
    .dataframe_total()
)
    
In [14]:
    
# Kernel density estimate of the pruned entropy table.
entrDf2.plot(kind='kde')
    
    Out[14]:
    
In [15]:
    
# Show the ten rows with the smallest 'Value' in the pruned entropy table.
# DataFrame.sort() has been removed from pandas; sort_values() is its
# replacement and sorts ascending by default, as sort() did.
entrDf2.sort_values('Value').head(10)
    
    Out[15]:
In [16]:
    
# Default size (width, height in inches) for figures created from here on.
# matplotlib.pyplot itself does not provide a `figsize` function; the
# original `plt.figsize(10, 6)` call only works where an old IPython pylab
# mode has patched one in. Setting the rcParam is the portable way to change
# the default figure size.
plt.rcParams['figure.figsize'] = (10, 6)

# Per-author entropy table from the pruned analyzer, drawn as histograms.
entrPnl2 = blah.entropies().dataframe_authors()
entrPnl2.hist()
    
    Out[16]:
    
In [17]:
    
# Per-author frequency table from the pruned analyzer, drawn as histograms
# with log=True passed to the histogram call.
freqPnl2 = (
    blah
    .frequencies()
    .dataframe_authors()
)
freqPnl2.hist(log=True)
    
    Out[17]:
    
In [18]:
    
# Build one quantile curve per column of the per-author entropy table.
# Each curve holds the quantiles at probabilities 0/50, 1/50, ..., 49/50
# (the 1.0 quantile is not included), computed over the column's non-missing
# values.
mydata = []
df = blah.entropies().dataframe_authors()
for col in df:
    # Drop missing values once per column instead of once per quantile.
    observed = df[col].dropna()
    # Divide by 50.0 so the probability stays fractional even under
    # Python 2 integer division, where v/50 would be 0 for every v.
    arr = [observed.quantile(v / 50.0) for v in range(50)]
    mydata.append(arr)
    
In [19]:
    
import statsmodels.api as sm
# Functional boxplot over the quantile curves collected in `mydata`
# (one curve per list in `mydata`).
sm.graphics.fboxplot(mydata)
    
    Out[19]:
    
In [ ]: