This notebook presents analysis of data from the first million page views on my blog, Probably Overthinking It.
Copyright 2015 Allen Downey
MIT License: http://opensource.org/licenses/MIT
In [1]:
%matplotlib inline
In [7]:
import pandas as pd
def read_table(filename, table_index=5):
    """Read one table out of a saved HTML page.

    filename: path of the HTML file to parse
    table_index: position of the wanted table in the list returned by
        pd.read_html; defaults to 5, the position this notebook has
        always used

    returns: the selected element of the list returned by pd.read_html
    """
    # The context manager closes the file even when parsing raises;
    # previously the handle was opened and never closed.
    with open(filename) as fp:
        tables = pd.read_html(fp)
    return tables[table_index]
In [13]:
# Load the table from 'blogger1.html' and display its shape.
table1 = read_table('blogger1.html')
table1.shape
Out[13]:
In [14]:
# Load the table from 'blogger2.html' and display its shape.
table2 = read_table('blogger2.html')
table2.shape
Out[14]:
In [18]:
# Stack the two tables into one frame; ignore_index=True discards the
# original row labels and renumbers the rows from 0.
table = pd.concat([table1, table2], ignore_index=True)
table.shape
Out[18]:
In [19]:
import string
# Characters removed from the right-hand end of a string before it is
# parsed as an integer: ASCII letters and the space character.
chars = string.ascii_letters + ' '


def convert(s):
    """Parse the integer at the start of a string such as '12 comments'.

    s: string consisting of a number optionally followed by ASCII
        letters and/or spaces

    returns: int
    """
    number_part = s.rstrip(chars)
    return int(number_part)
def clean(s):
    """Return the part of `s` that precedes the first 'Edit'.

    s: string

    returns: `s` cut off just before the first occurrence of 'Edit',
        or `s` unchanged when 'Edit' does not occur
    """
    # str.partition gives back the whole string as its first element
    # when the separator is missing. The earlier str.find version got
    # -1 in that case, so s[:-1] silently dropped the last character.
    return s.partition('Edit')[0]
In [20]:
# Build the 'title' column by running clean() over every entry of
# column 1, then display the result.
table['title'] = table[1].map(clean)
table['title']
Out[20]:
In [21]:
# Build the 'plusses' column from column 4, with missing entries
# replaced by 0.
table['plusses'] = table[4].fillna(value=0)
table['plusses'].head()
Out[21]:
In [24]:
# Build the 'comments' column by parsing each entry of column 5 with
# convert().
table['comments'] = table[5].map(convert)
table['comments'].head()
Out[24]:
In [25]:
# Build the 'views' column by parsing each entry of column 6 with
# convert(), then display the result.
table['views'] = table[6].map(convert)
table['views']
Out[25]:
In [26]:
# Build the 'date' column by converting column 7 to datetimes.
table['date'] = pd.to_datetime(table[7])
table['date'].head()
Out[26]:
In [34]:
# Keep only the rows with at least one view and display the new shape.
table = table.loc[table['views'] > 0]
table.shape
Out[34]:
In [38]:
# Label the rows len(table), ..., 2, 1 in their current order.
# The count is taken from the frame itself: the earlier hard-coded 115
# raised a length-mismatch error whenever the filtered table had any
# other number of rows.
# NOTE(review): the descending labels presumably number posts from
# newest to oldest -- confirm against the row order of the source pages.
table.index = range(len(table), 0, -1)
table.title
Out[38]:
In [ ]:
In [39]:
# Sort the 'date' column in ascending order and take the difference
# between each entry and the one before it.
dates = table['date'].sort_values()
diffs = dates.diff(periods=1)
diffs.head()
Out[39]:
In [40]:
# Summary statistics of the differences between consecutive sorted
# dates; dropna() removes the undefined first difference.
diffs.dropna().describe()
Out[40]:
In [41]:
# The 20 rows with the most views, highest first.
(
    table
    .sort_values(by='views', ascending=False)
    .loc[:, ['title', 'views', 'date']]
    .head(20)
)
Out[41]:
In [56]:
# The 20 rows with the fewest views, lowest first.
(
    table
    .sort_values(by='views', ascending=True)
    .loc[:, ['title', 'views', 'date']]
    .head(20)
)
Out[56]:
In [55]:
import thinkstats2
import thinkplot
# Build a Cdf object from the 'views' column.
cdf = thinkstats2.Cdf(table.views)
# NOTE(review): PrePlot(1) presumably prepares thinkplot for a single
# plotted line -- confirm against the thinkplot module.
thinkplot.PrePlot(1)
# complement=True together with the 'CCDF' axis label below suggests
# this draws the complementary CDF -- confirm against thinkplot.Cdf.
thinkplot.Cdf(cdf, complement=True)
# Set the axis labels, request log scaling on both axes, and pass
# legend=False.
thinkplot.Config(xlabel ='Number of page views', xscale='log',
ylabel='CCDF', yscale='log',
legend=False)
In [45]:
# The 5 rows with the most comments, highest first.
(
    table
    .sort_values(by='comments', ascending=False)
    .loc[:, ['title', 'comments', 'date']]
    .head(5)
)
Out[45]:
In [44]:
# The 5 rows with the most plusses, highest first.
(
    table
    .sort_values(by='plusses', ascending=False)
    .loc[:, ['title', 'plusses', 'date']]
    .head(5)
)
Out[44]:
In [ ]:
In [ ]: