notebook.community

Edit and run



In [ ]:

    
%matplotlib inline



In [ ]:

    
from __future__ import (print_function, unicode_literals, division)



In [ ]:

    
try:
    from urllib.parse import urlparse
except ImportError:
    from urlparse import urlparse



In [ ]:

    
from itertools import islice
import sys

# package up logic in a package
# from hypothesisapi import API

# include your hypothes.is USERNAME, TOKEN as parameters in a hypothesis_settings.py file in your sys.path
# get TOKEN at https://hypothes.is/profile/developer

from hypothesis_settings import (USERNAME, TOKEN)
from Hypothes_is import Hypothesis, HypothesisAnnotation

# Hypothes.is API client built from the USERNAME / TOKEN in hypothesis_settings.
# sys.maxsize is used as the "effectively unlimited" value for max_results:
# sys.maxint was removed in Python 3 (AttributeError), whereas sys.maxsize
# exists on both Python 2.6+ and Python 3.
h_api = Hypothesis(USERNAME, TOKEN, max_results=sys.maxsize)



In [ ]:

    
import numpy as np
import pandas as pd
from pandas import (DataFrame, Series)
import matplotlib.pyplot as plt



In [ ]:

    
# reading in all the annotations
# Search parameters: sort on 'updated', descending, restricted to one user.
rdhyee_query = {'sort':'updated', 'order':'desc', 'user':'rdhyee@hypothes.is'}
# search_all is consumed fully here so the results are held in a list.
annotations = list(h_api.search_all(rdhyee_query))



In [ ]:

    
# Number of items currently held in `annotations`.
len(annotations)



In [ ]:

    
# Display the first element of `annotations` to inspect what a single record looks like.
annotations[0]



In [ ]:

    
# collect some of the stats

from itertools import islice

# Cap the search (sorted on 'updated', descending; no user filter) at 5000 results.
recent_stream = islice(h_api.search_all({'sort':'updated', 'order':'desc'}), 5000)

# Start from an empty list; any previously loaded `annotations` are replaced.
annotations = []

for (i, annot) in enumerate(recent_stream):
    # "\r" with end="" rewrites the same output line, giving a running counter.
    print("\r%d" % i , end="")
    annotations.append(annot)



In [ ]:

    
# write annotations out
import json
# Open in text mode: the json module emits str, which a binary-mode ("wb")
# handle rejects with TypeError on Python 3. json.dump streams straight to
# the file instead of building the whole string in memory first.
with open("annotations.json", "w") as f:
    json.dump(annotations, f)



In [ ]:

    
# read in annotations
import json
# Disabled by default. Uncomment the next line to load `annotations` from
# annotations.json in the working directory.
#annotations = json.loads(open("annotations.json").read())



In [ ]:

    
# Build the working DataFrame from the collected annotations.
df = pd.DataFrame(data=annotations)



In [ ]:

    
# date distribution
import dateutil.parser

# Parse each `created` value into a datetime first ...
created_at = df.created.apply(dateutil.parser.parse)
# ... then reduce it to a (year, month) pair and count how often each pair occurs.
year_month = created_at.apply(lambda d: (d.year, d.month))
s = year_month.value_counts()
s



In [ ]:

    
# Bar chart of the counts in `s`, ordered by ascending index.
# (axis-label idea left by the author: x='year/month', y='# of annotations')
counts_by_index = s.sort_index(ascending=True)
counts_by_index.plot(kind='bar', color='green')



In [ ]:

    
# Number of distinct (non-null) values in the `user` column.
df["user"].nunique()



In [ ]:

    
# The 20 most frequent `user` values with their counts.
df["user"].value_counts().head(20)



In [ ]:

    
# Count annotations per network location (element 1 of the urlparse result) of each uri.
uri_netlocs = df.uri.apply(lambda url: urlparse(url).netloc)
uri_netlocs.value_counts()



In [ ]:

    
# most annotated web pages: occurrences of each distinct uri, most frequent first
df["uri"].value_counts()



In [ ]:

    
# look for pages annotated as part of climatefeedback.org
# http://climatefeedback.org/members/early-participants.html

# Usernames of the climatefeedback.org members of interest.
climatefeedback_members = [
    'karmour', 'Alexis_b', 'drchavas', 'jgdwyer', 'emanuel', 'ed_hawkins',
    'Dkambo', 'aklocker', 'james_kossin', 'jmlauderdale', 'mashayek',
    's_perkins', 'andypitman', 'hramsay', 'kevenroy', 'martysingh',
    'alexis.tantet', 'emvincent', 'bmv', 'DonWuebbles', 'aalpert',
]

# Expand each username into its "acct:<username>@hypothes.is" account string.
acct_template = "acct:{user}@hypothes.is"
clf_accts = [acct_template.format(user=name) for name in climatefeedback_members]



In [ ]:

    
# Keep only the rows whose `user` is one of the accounts in clf_accts,
# then count how many of those rows fall on each uri.
is_clf_user = df.user.isin(clf_accts)
clf_annots = df[is_clf_user]
clf_annots.uri.value_counts()



In [ ]: