In [1]:
%matplotlib inline
import pandas as pd
import datetime
import ast
import tldextract
In [3]:
# You will need access to D4D data.world organization. Check in slack for more info
# 150mb / 240k rows. Give it time, has to download over internet
# Load the article dump as tab-separated values; parse the 'date' column to
# datetime up front so it can be used as a DatetimeIndex in later cells.
df = pd.read_csv('https://query.data.world/s/bbokc1f08to11j19j5axvkrcv', sep='\t', parse_dates=['date'])
In [4]:
df.set_index('date', inplace=True)
df.count()
Out[4]:
In [5]:
by_year=df.groupby([pd.TimeGrouper('A')]).count()['title']
by_year
Out[5]:
In [6]:
# Line plot of article counts per year (x-axis comes from the DatetimeIndex
# produced by the annual groupby above).
by_year.plot()
Out[6]:
In [7]:
df.groupby([pd.TimeGrouper('A'),'category']).count()['title']
Out[7]:
In [8]:
df.groupby(['author']).count()['title'].sort_values(ascending=0).head(25)
Out[8]:
In [9]:
# Counter that accumulates domain frequencies across all article hrefs;
# populated as a side effect by get_tld() below.
# NOTE(review): convention puts imports in the top import cell — consider moving.
from collections import Counter
tld_counter = Counter()
In [10]:
def get_tld(hrefs):
# Quick and dirty, not thorough yet
for link in ast.literal_eval(hrefs):
top_level = tldextract.extract(link)
top_level = top_level.domain
tld_counter[top_level] += 1
In [11]:
_ = df[['hrefs']].applymap(get_tld)
In [12]:
# The 25 most frequently linked domains across all articles.
tld_counter.most_common(25)
Out[12]: