This notebook presents an analysis of data from the first million page views on my blog, Probably Overthinking It.
Copyright 2015 Allen Downey
MIT License: http://opensource.org/licenses/MIT
In [1]:
    
%matplotlib inline
    
In [7]:
    
import pandas as pd
def read_table(filename, index=5):
    """Read one HTML table from a file.

    filename: path of the HTML file to parse
    index: position of the wanted table in the list returned by
           pd.read_html (defaults to 5, the table this notebook uses)

    returns: the selected element of the list returned by pd.read_html
    """
    # The context manager closes the file handle even if parsing raises;
    # previously the handle was left open.
    with open(filename) as fp:
        tables = pd.read_html(fp)
    return tables[index]
    
In [13]:
    
# Read the table from the first HTML file and display its (rows, columns) shape.
table1 = read_table('blogger1.html')
table1.shape
    
    Out[13]:
In [14]:
    
# Read the table from the second HTML file and display its (rows, columns) shape.
table2 = read_table('blogger2.html')
table2.shape
    
    Out[14]:
In [18]:
    
# Stack the two tables vertically. ignore_index=True renumbers the rows
# 0..n-1 instead of keeping each table's own row labels.
table = pd.concat([table1, table2], ignore_index=True)
table.shape
    
    Out[18]:
In [19]:
    
import string
# Characters that convert() strips from the right-hand end of a string:
# every ASCII letter plus the space.
chars = string.ascii_letters + ' '
def convert(s):
    """Return the integer left after trailing letters and spaces are removed from `s`.

    s: string such as '12 comments'

    returns: int parsed from what remains of s
    """
    numeric_part = s.rstrip(chars)
    return int(numeric_part)
def clean(s):
    """Return the part of `s` that precedes the first occurrence of 'Edit'.

    s: string to trim

    returns: the text before the first 'Edit'; if 'Edit' does not occur,
             s is returned unchanged
    """
    # str.partition yields the whole string as the head when the marker is
    # missing. The earlier find()/slice version got -1 in that case and
    # s[:-1] silently dropped the last character.
    return s.partition('Edit')[0]
    
In [20]:
    
# Apply clean() to every entry of column 1 and store the result as 'title'.
table['title'] = table[1].apply(clean)
table.title
    
    Out[20]:
In [21]:
    
# Copy column 4 into 'plusses', replacing missing values with 0.
table['plusses'] = table[4].fillna(0)
table.plusses.head()
    
    Out[21]:
In [24]:
    
# Turn each entry of column 5 into an integer with convert() and store it as 'comments'.
table['comments'] = table[5].apply(convert)
table.comments.head()
    
    Out[24]:
In [25]:
    
# Turn each entry of column 6 into an integer with convert() and store it as 'views'.
table['views'] = table[6].apply(convert)
table.views
    
    Out[25]:
In [26]:
    
# Parse column 7 with pd.to_datetime and store the result as 'date'.
table['date'] = pd.to_datetime(table[7])
table.date.head()
    
    Out[26]:
In [34]:
    
# Keep only the rows whose view count is positive, then display the new shape.
table = table[table.views > 0]
table.shape
    
    Out[34]:
In [38]:
    
# Label the rows from len(table) down to 1, so the first row carries the
# highest number. The count is taken from the frame instead of being
# hard-coded as 115, which raised a length-mismatch error whenever the
# number of rows differed.
table.index = range(len(table), 0, -1)
table.title
    
    Out[38]:
In [ ]:
    
    
In [39]:
    
# Sort the dates in ascending order and take the difference between each
# date and the one before it; the first entry has no predecessor, so its
# difference is missing.
dates = table.date.sort_values()
diffs = dates.diff()
diffs.head()
    
    Out[39]:
In [40]:
    
# Summary statistics of the gaps between consecutive dates, with missing entries dropped first.
diffs.dropna().describe()
    
    Out[40]:
In [41]:
    
# The 20 rows with the highest view counts, showing title, views and date.
(
    table
    .sort_values(by=['views'], ascending=False)
    .loc[:, ['title', 'views', 'date']]
    .head(20)
)
    
    Out[41]:
In [56]:
    
# The 20 rows with the lowest view counts, showing title, views and date.
(
    table
    .sort_values(by=['views'], ascending=True)
    .loc[:, ['title', 'views', 'date']]
    .head(20)
)
    
    Out[56]:
In [55]:
    
import thinkstats2
import thinkplot
# Build a Cdf from the per-row view counts and plot it with complement=True
# on logarithmic x and y axes; the y axis is labelled 'CCDF'.
cdf = thinkstats2.Cdf(table.views)
# NOTE(review): PrePlot(1) presumably tells thinkplot to expect a single
# plotted series -- confirm against the thinkplot documentation.
thinkplot.PrePlot(1)
thinkplot.Cdf(cdf, complement=True)
thinkplot.Config(xlabel ='Number of page views', xscale='log', 
                 ylabel='CCDF', yscale='log', 
                 legend=False)
    
    
In [45]:
    
# The 5 rows with the most comments, showing title, comments and date.
(
    table
    .sort_values(by=['comments'], ascending=False)
    .loc[:, ['title', 'comments', 'date']]
    .head(5)
)
    
    Out[45]:
In [44]:
    
# The 5 rows with the most plusses, showing title, plusses and date.
(
    table
    .sort_values(by=['plusses'], ascending=False)
    .loc[:, ['title', 'plusses', 'date']]
    .head(5)
)
    
    Out[44]:
In [ ]:
    
    
In [ ]: