HackFSM
An open question: how does this API relate to other public APIs built on Solr?
Documentation:
In [1]:
from settings import (HACKFSM_ID, HACKFSM_KEY, HACKFSM_BASEURL)
from itertools import islice
import logging
import requests
import json
import urllib
import urlparse
from pandas import DataFrame, Series
import pandas as pd
import numpy as np
logging.basicConfig(filename='Experiment_20140325_HackFSM.log',level=logging.WARNING)
logger=logging.getLogger()
In [2]:
def query(q, fl="id"):
    """Run a single query against the HackFSM Solr endpoint.

    q  -- Solr query string (e.g. "fsmTitle:Savio")
    fl -- comma-separated list of fields to return (default: "id")

    Returns the parsed JSON response as a dict.
    """
    params = {
        'q': q,
        'fl': fl,
        'wt': 'json',
        'app_id': HACKFSM_ID,
        'app_key': HACKFSM_KEY,
    }
    url = "{base_url}?".format(base_url=HACKFSM_BASEURL) + urllib.urlencode(params)
    response = requests.get(url)
    return response.json()
In [3]:
result = query(q="fsmTitle:Savio")['response']
result
Out[3]:
In [4]:
# try again
# http://stackoverflow.com/a/5724453/7782
# http://excess.org/article/2013/02/itergen1/
class my_g(object):
    """Toy sized iterator: yields 0 .. max_count-1 and supports len().

    Demonstrates an iterator object that also reports its length, unlike a
    plain generator.
    """
    def __init__(self, max_count):
        # Materialize as a list so .pop(0) works on both Python 2 and 3
        # (Python 3's range object has no pop()); identical behavior on
        # Python 2, where range() already returned a list.
        self._remaining = list(range(max_count))
        self._len = max_count
    def __iter__(self):
        return self
    def __len__(self):
        # Note: len() stays at max_count even after items are consumed.
        return self._len
    def next(self):
        if not self._remaining:
            raise StopIteration
        return self._remaining.pop(0)
    # Python 3 iterator-protocol alias (py3 calls __next__, py2 calls next).
    __next__ = next
g=my_g(10)
print len(g)
list(g)
Out[4]:
In [5]:
class FSM(object):
    """Iterator over all documents matching a HackFSM (Solr) query.

    Transparently pages through the result set: fetches `rows` documents at
    a time and yields one doc (a dict) per iteration. len(fsm) reports the
    server's total hit count (numFound).
    """
    def __init__(self, q, fl="id", start=0, rows=30,
                 base_url=HACKFSM_BASEURL, app_id=HACKFSM_ID, app_key=HACKFSM_KEY):
        self.q = q
        self.fl = fl
        self.start = start
        self.rows = rows
        self.base_url = base_url
        self.app_id = app_id
        self.app_key = app_key
        self.cursor = start
        # numFound is unknown until the first response arrives. Initializing
        # it here fixes a bug: _call_api's short-page sanity check read
        # self.numfound, which did not exist yet during the first fetch below,
        # raising AttributeError whenever the first page was short.
        self.numfound = None
        # get the first page (also populates self.page / self.page_len)
        result = self._get_page(q, fl, self.cursor, self.rows)
        self.numfound = result['response']['numFound']
    def _check_status(self, result):
        """Raise FSMException if the Solr response header has non-zero status.

        NOTE(review): FSMException is assumed to be defined elsewhere in this
        notebook/project — confirm, otherwise this raise becomes a NameError.
        """
        if result['responseHeader']['status'] != 0:
            raise FSMException("status: " + str(result['responseHeader']['status']))
    def _get_page(self, q, fl, start, rows):
        """Fetch one page of results; cache docs on self.page / self.page_len."""
        result = self._call_api(q, fl, start, rows)
        # update current page
        self.page = result['response']['docs']
        self.page_len = len(self.page)
        return result
    def _call_api(self, q, fl, start, rows):
        """Issue one HTTP GET to the Solr endpoint and return the parsed JSON."""
        url = "{base_url}?".format(base_url=self.base_url) + \
            urllib.urlencode({'q': q,
                              'fl': fl,
                              'wt': 'json',
                              'start': start,
                              # Bug fix: Solr's page-size parameter is 'rows',
                              # not 'row'. The misspelled parameter was silently
                              # ignored, so every request fell back to the
                              # server's default page size.
                              'rows': rows,
                              'app_id': self.app_id,
                              'app_key': self.app_key})
        result = requests.get(url).json()
        self._check_status(result)
        # Warn if we received a short page that is not simply the tail of the
        # result set. Skipped on the very first call (numfound still unknown).
        if len(result['response']['docs']) < rows:
            if (self.numfound is not None and
                    start + len(result['response']['docs']) != self.numfound):
                logger.warning(
                    "url:{url}, numfound:{numfound}, start+len:{start_plus_len}".format(
                        url=url,
                        numfound=self.numfound,
                        start_plus_len=start + len(result['response']['docs'])))
        return result
    def __iter__(self):
        return self
    def __len__(self):
        # Total number of matching documents reported by the server.
        return self.numfound
    def next(self):
        """Return the next document, fetching a new page when the cache is empty."""
        if not self.page:
            # retrieve next page and check whether there's anything left
            self.cursor += self.page_len
            self._get_page(self.q, self.fl, self.cursor, self.rows)
            if self.page_len == 0:
                raise StopIteration
        return self.page.pop(0)
    # Python 3 iterator-protocol alias (py3 calls __next__, py2 calls next).
    __next__ = next
In [6]:
fsm = FSM("-fsmTeiUrl:[* TO *]", fl="id,fsmTitle,fsmImageUrl,fsmDateCreated")
In [7]:
len(fsm)
Out[7]:
In [8]:
results = list(islice(fsm,None))
results[:10]
Out[8]:
In [9]:
df = DataFrame(results)
In [10]:
len(df)
Out[10]:
In [11]:
df.fsmImageUrl
Out[11]:
In [12]:
from IPython.display import HTML
from jinja2 import Template
CSS = """
<style>
.wrap img {
margin-left: 0px;
margin-right: 0px;
display: inline-block;
width: 150px;
}
.wrap {
/* Prevent vertical gaps */
line-height: 0;
-webkit-column-count: 5;
-webkit-column-gap: 0px;
-moz-column-count: 5;
-moz-column-gap: 0px;
column-count: 5;
column-gap: 0px;
}
.wrap img {
/* Just in case there are inline attributes */
width: 100% !important;
height: auto !important;
}
</style>
"""
IMAGES_TEMPLATE = CSS + """
<div class="wrap">
{% for item in items %}<img title="{{item.fsmTitle.0}}" src="{{item.fsmImageUrl.0}}"/>{% endfor %}
</div>
"""
template = Template(IMAGES_TEMPLATE)
HTML(template.render(items=results[:10]))
Out[12]:
To programmatically differentiate records that describe images from records that describe TEI-encoded XML documents, the API permits queries that exclude records with NULL values in the "unwanted" Url field.
That is, to retrieve TEI documents only, one would query for null values in the fsmImageUrl field. To retrieve images only, one would query for null values in the fsmTeiUrl field.
NOTE: Please observe the hyphen prepended to the field names in the examples below. The hyphen (minus sign) functions here as a NOT operator.
Example that selects for TEI encoded XML documents by excluding null values of fsmImageUrl:
https://<BASE URL>/solr/fsm/select?q=-fsmImageUrl:[* TO *]&wt=json&indent=true&app_id=abcdefgh&app_key=12345678901234567890123456789012
Example that selects for images by excluding null values of fsmTeiUrl:
https://<BASE URL>/solr/fsm/select?q=-fsmTeiUrl:[* TO *]&wt=json&indent=true&app_id=abcdefgh&app_key=12345678901234567890123456789012
In [13]:
# TEI-encoded docs
len(FSM("-fsmImageUrl:[* TO *]"))
Out[13]:
In [14]:
# images
len(FSM("-fsmTeiUrl:[* TO *]", fl="id,fsmImageUrl"))
Out[14]:
In [15]:
from lxml.html import parse, fromstring
from collections import OrderedDict
api_docs_url = "http://digitalhumanities.berkeley.edu/hackfsm/api/detail"
r = requests.get(api_docs_url).content
doc = fromstring(r)
In [16]:
rows = doc.xpath('//div[@id="content"]/article/div/div/div/table[1]//tr')
headers = [col.text_content().strip() for col in rows[0].findall('td')]
headers
Out[16]:
In [17]:
fields = []
for row in rows[1:]:
field = [col.text_content().strip() for col in row.findall('td')]
fields.append(field)
fsmfields = OrderedDict(fields)
fsmfields.keys()
Out[17]:
In [18]:
fsm = FSM(q="*",fl=",".join(fsmfields.keys()))
In [19]:
len(fsm)
Out[19]:
In [20]:
df = DataFrame(list(fsm))
In [21]:
len(df)
Out[21]:
In [22]:
df.head()
Out[22]:
In [23]:
# TEI URIs
len(list(df[~df.fsmTeiUrl.isnull()].fsmTeiUrl.apply(lambda a: a[0])))
Out[23]:
In [24]:
# null dates
len(df[df.fsmDateCreated.isnull()])
Out[24]:
In [25]:
# non-null image URLs
len(df[~df.fsmImageUrl.isnull()])
Out[25]:
In [26]:
df[~df.fsmImageUrl.isnull()].id
Out[26]:
In [27]:
# distribution of number of image URLs
df[~df.fsmImageUrl.isnull()].fsmImageUrl.apply(len).value_counts()
Out[27]:
In [28]:
# let's crawl for images
results_images = list(FSM("-fsmTeiUrl:[* TO *]", fl=",".join(fsmfields.keys())))
In [29]:
len(results_images)
Out[29]:
In [30]:
df_images=DataFrame(results_images)
In [31]:
df_images[df_images.fsmImageUrl.isnull()]
Out[31]:
In [32]:
# would be interesting to see sizes of images and whether we can get at thumbnails
df_images.fsmImageUrl
Out[32]:
In [33]:
urlparse.urlparse("http://digitalassets.lib.berkeley.edu/fsm/ucb/images/brk00040569b_c.jpg").netloc
Out[33]:
In [34]:
df_images.fsmImageUrl
Out[34]:
In [35]:
# calculate hostnames for all image urls
# might be possible to do this all with pandas
netlocs = list(df_images.fsmImageUrl.dropna().apply(lambda urls: set([urlparse.urlparse(url).netloc for url in urls])))
reduce(lambda x,y: x | y, netlocs, set())
Out[35]:
In [36]:
def len2(x):
    """Return len(x), or NaN when x has no length.

    Intended for DataFrame columns whose missing cells are NaN floats: len()
    raises TypeError on floats/None, which we map to NaN. Catching TypeError
    specifically (instead of the original bare `except:`) keeps unrelated
    errors — including KeyboardInterrupt/SystemExit — visible.
    """
    try:
        return len(x)
    except TypeError:
        return np.nan
df_images.fsmImageUrl.apply(len2) == 3
Out[36]:
In [37]:
df_images[df_images.fsmImageUrl.apply(len2) == 3].head()
Out[37]:
In [38]:
df_images[df_images.fsmImageUrl.apply(len2) == 4].ix[100].fsmImageUrl
Out[38]:
In [39]:
IMAGES_TEMPLATE = """
<div class="nowrap">
{% for item in items %}<img title="{{item}}" src="{{item}}"/>{% endfor %}
</div>
"""
template = Template(IMAGES_TEMPLATE)
HTML(template.render(items=df_images[df_images.fsmImageUrl.apply(len2) == 4].ix[100].fsmImageUrl ))
Out[39]:
In [40]:
len(df[~df.fsmDateCreated.isnull()])
Out[40]:
In [41]:
s = df[~df.fsmDateCreated.isnull()].fsmDateCreated.apply(len)==2 #.astype('datetime64[ns]')
In [42]:
def first(x):
    """Return x[0], or NaN when x is not subscriptable or is empty.

    Used to pull the first entry out of list-valued DataFrame cells, where
    missing cells are NaN floats (not subscriptable -> TypeError). Catching
    the specific exceptions (instead of the original bare `except:`) keeps
    unrelated errors visible.
    """
    try:
        return x[0]
    except (TypeError, IndexError, KeyError):
        return np.nan
df['calc_date'] = pd.to_datetime(df.fsmDateCreated.apply(first), coerce=True)
In [43]:
df[~df.calc_date.isnull()].sort_index(by='calc_date').calc_date
Out[43]:
In [44]:
pd.to_datetime(df.fsmDateCreated.dropna().apply(lambda s:s[0]).astype('str'), coerce=True).dropna()
Out[44]:
In [45]:
# http://stackoverflow.com/questions/17690738/in-pandas-how-do-i-convert-a-string-of-date-strings-to-datetime-objects-and-put
date_stngs = ('2008-12-20','2008-12-21','2008-12-22','2008-12-23','Nov. 9, 1964', 'junk')
pd.to_datetime(pd.Series(date_stngs),coerce=True)
Out[45]:
In [46]:
def f(x):
    """Return set(x), or an empty set when x is not iterable (e.g. a NaN cell).

    Used to union the list-valued fsmTypeOfResource cells; missing cells are
    NaN floats, which set() rejects with TypeError. Catching TypeError
    specifically (instead of the original bare `except:`) keeps unrelated
    errors visible.
    """
    try:
        return set(x)
    except TypeError:
        return set()
reduce(lambda x,y: x | y, df.fsmTypeOfResource.apply(f), set())
Out[46]:
In [47]:
#related id
len(df.fsmRelatedIdentifier.dropna())
Out[47]:
In [48]:
df.fsmTeiUrl.dropna()
Out[48]: