HackFSM

Relationship to other public APIs based on Solr?


In [1]:
from settings import (HACKFSM_ID, HACKFSM_KEY, HACKFSM_BASEURL)
from itertools import islice

import logging
import requests
import json
import urllib
import urlparse

from pandas import DataFrame, Series
import pandas as pd
import numpy as np

logging.basicConfig(filename='Experiment_20140325_HackFSM.log',level=logging.WARNING)
logger=logging.getLogger()

In [2]:
def query(q, fl="id"):
    """Send one query to the HackFSM endpoint and return the decoded JSON.

    q  -- query string passed through as the ``q`` parameter
    fl -- comma-separated list of fields to return (defaults to ``"id"``)

    The application id and key from ``settings`` are attached to every call.
    """
    params = {'q':q,
              'fl':fl,
              'wt':'json',
              'app_id':HACKFSM_ID,
              'app_key':HACKFSM_KEY}
    query_string = urllib.urlencode(params)
    url = "{base_url}?".format(base_url=HACKFSM_BASEURL) + query_string
    return requests.get(url).json()

In [3]:
result = query(q="fsmTitle:Savio")['response']
result


Out[3]:
{u'docs': [{u'id': u'ark:/13030/ft2f59n853'},
  {u'id': u'access143'},
  {u'id': u'ark:/13030/tf2q2n99d3'},
  {u'id': u'ark:/13030/tf3p3003k7'},
  {u'id': u'ark:/13030/tf5m3nb15b'},
  {u'id': u'ark:/13030/tf267n996q'},
  {u'id': u'access326'},
  {u'id': u'access327'},
  {u'id': u'access328'},
  {u'id': u'access329'},
  {u'id': u'access330'},
  {u'id': u'access331'},
  {u'id': u'access332'},
  {u'id': u'access333'},
  {u'id': u'access334'},
  {u'id': u'access335'},
  {u'id': u'access339'},
  {u'id': u'access340'},
  {u'id': u'access341'},
  {u'id': u'access343'},
  {u'id': u'access344'},
  {u'id': u'access345'},
  {u'id': u'access346'},
  {u'id': u'access347'},
  {u'id': u'access348'},
  {u'id': u'access365'},
  {u'id': u'access366'},
  {u'id': u'access367'},
  {u'id': u'access369'},
  {u'id': u'access370'}],
 u'numFound': 124,
 u'start': 0}

Paging through results


In [4]:
# try again
# http://stackoverflow.com/a/5724453/7782
# http://excess.org/article/2013/02/itergen1/


class my_g(object):
    def __init__(self,max_count):
        self._remaining = range(max_count)
        self._len = max_count
    def __iter__(self):
        return self
    def __len__(self):
        return self._len
    def next(self):
        if not self._remaining:
            raise StopIteration
        return self._remaining.pop(0)

g=my_g(10)
print len(g)
list(g)


10
Out[4]:
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [5]:
class FSMException(Exception):
    """Raised when a HackFSM response carries a non-zero ``responseHeader`` status."""


class FSM(object):
    """Iterator over every document matching a HackFSM query, fetched page by page.

    The constructor immediately requests the first page so that ``len()``
    (the ``numFound`` value reported by the API) is available before any
    iteration happens.  Iterating pops documents off the current page and
    requests the next page whenever the current one is exhausted.
    """

    def __init__(self, q, fl="id", start=0, rows=30,
                 base_url=HACKFSM_BASEURL, app_id=HACKFSM_ID, app_key=HACKFSM_KEY):
        """Store the query settings and fetch the first page.

        q        -- query string (``q`` parameter)
        fl       -- comma-separated list of fields to return
        start    -- offset of the first document to retrieve
        rows     -- requested page size
        base_url -- endpoint URL; app_id / app_key -- API credentials
        """
        self.q = q
        self.fl = fl
        self.start = start
        self.rows = rows

        self.base_url = base_url
        self.app_id = app_id
        self.app_key = app_key

        self.cursor = start

        # These must exist before the first request: _call_api consults
        # numfound when a page comes back shorter than requested.
        self.numfound = None
        self.page = []
        self.page_len = 0

        # get the first page and the total number of matches
        result = self._get_page(q, fl, self.cursor, self.rows)
        self.numfound = result['response']['numFound']

    def _check_status(self,result):
        """Raise FSMException if the response header reports a non-zero status."""
        status = result['responseHeader']['status']
        if status != 0:
            raise FSMException("status: " + str(status))

    def _get_page(self, q, fl, start, rows):
        """Fetch one page, remember its documents as the current page, return the raw result."""
        result = self._call_api(q, fl, start, rows)

        # update current page
        self.page = result['response']['docs']
        self.page_len = len(self.page)

        return result

    def _call_api(self, q, fl, start, rows):
        """Issue one HTTP GET against the API and return the decoded JSON.

        Raises FSMException (via _check_status) on a non-zero status.  Logs a
        warning when a page is shorter than ``rows`` without reaching the
        expected total number of results.
        """
        url = "{base_url}?".format(base_url=self.base_url) + \
            urllib.urlencode({'q':q,
                              'fl':fl,
                              'wt':'json',
                              'start':start,
                              # Solr's page-size parameter is "rows"; the
                              # previous key "row" was ignored by the server.
                              'rows':rows,
                              'app_id':self.app_id,
                              'app_key':self.app_key})

        result = requests.get(url).json()
        self._check_status(result)

        docs = result['response']['docs']

        # During the very first request (issued from __init__) the total is
        # not known yet, so fall back to the total reported by this response.
        expected_total = getattr(self, 'numfound', None)
        if expected_total is None:
            expected_total = result['response']['numFound']

        # a short page is only expected at the very end of the result set
        if len(docs) < rows and start + len(docs) != expected_total:
            logger.warning("url:{url}, numfound:{numfound}, start+len{start_plus_len}".format(url=url,
                                                        numfound=expected_total,
                                                        start_plus_len=start + len(docs)))

        return result

    def __iter__(self):
        return self

    def __len__(self):
        """Total number of matches reported by the API for the query."""
        return self.numfound

    def next(self):
        """Return the next document, fetching a further page when needed."""
        if not self.page:
            # retrieve next page and check whether there's anything left
            self.cursor += self.page_len
            self._get_page(self.q, self.fl, self.cursor, self.rows)

            if self.page_len == 0:
                raise StopIteration

        return self.page.pop(0)

    # Python 3 spelling of the iterator protocol; harmless under Python 2.
    __next__ = next

In [6]:
fsm = FSM("-fsmTeiUrl:[* TO *]", fl="id,fsmTitle,fsmImageUrl,fsmDateCreated")

In [7]:
len(fsm)


Out[7]:
685

In [8]:
# exhaust the iterator: every matching record ends up in one list
results = list(fsm)
results[:10]


Out[8]:
[{u'fsmDateCreated': [u'Nov. 9, 1964'],
  u'fsmImageUrl': [u'http://nma.berkeley.edu/ark:/28722/bk0005j9r90',
   u'http://nma.berkeley.edu/ark:/28722/bk0005j9s0j'],
  u'fsmTitle': [u'Professor John Searle speaking to crowd.'],
  u'id': u'ark:/13030/ft6k40080h'},
 {u'fsmDateCreated': [u'Dec. 2, 1964'],
  u'fsmImageUrl': [u'http://nma.berkeley.edu/ark:/28722/bk0005k2842',
   u'http://nma.berkeley.edu/ark:/28722/bk0005k285m'],
  u'fsmTitle': [u'Mario Savio speaking with reporters.'],
  u'id': u'ark:/13030/tf009n97vn'},
 {u'fsmDateCreated': [u'Dec. 2, 1964'],
  u'fsmImageUrl': [u'http://nma.berkeley.edu/ark:/28722/bk0005k2c2h',
   u'http://nma.berkeley.edu/ark:/28722/bk0005k2c32'],
  u'fsmTitle': [u'Joan Baez singing in front of Sproul Hall.'],
  u'id': u'ark:/13030/tf5j49n838'},
 {u'fsmDateCreated': [u'Dec. 3, 1964'],
  u'fsmImageUrl': [u'http://nma.berkeley.edu/ark:/28722/bk0005j9z5w',
   u'http://nma.berkeley.edu/ark:/28722/bk0005j9z6f'],
  u'fsmTitle': [u'Girl student being booked on campus before being taken to jail.'],
  u'id': u'ark:/13030/ft700007tc'},
 {u'fsmDateCreated': [u'Oct. 5, 1964'],
  u'fsmImageUrl': [u'http://nma.berkeley.edu/ark:/28722/bk0005j9n7b',
   u'http://nma.berkeley.edu/ark:/28722/bk0005j9n8w'],
  u'fsmTitle': [u'Bryan Turner speaking.'],
  u'id': u'ark:/13030/ft7n39p1mr'},
 {u'fsmDateCreated': [u'Nov. 9, 1964'],
  u'fsmImageUrl': [u'http://nma.berkeley.edu/ark:/28722/bk0005k1b6q',
   u'http://nma.berkeley.edu/ark:/28722/bk0005k1b78'],
  u'fsmTitle': [u'Steve Weissman speaking to crowd.'],
  u'id': u'ark:/13030/tf8w1006vp'},
 {u'fsmDateCreated': [u'Nov. 24, 1964'],
  u'fsmImageUrl': [u'http://nma.berkeley.edu/ark:/28722/bk0005j9v37',
   u'http://nma.berkeley.edu/ark:/28722/bk0005j9v4s'],
  u'fsmTitle': [u'Professor Morris Hirsch speaking from Sproul steps.'],
  u'id': u'ark:/13030/ft9f59p3bw'},
 {u'fsmDateCreated': [u'Oct. 1, 1964'],
  u'fsmImageUrl': [u'http://nma.berkeley.edu/ark:/28722/bk0005k0v2s',
   u'http://nma.berkeley.edu/ark:/28722/bk0005k0v3b'],
  u'fsmTitle': [u'Crowd in Sproul Plaza.'],
  u'id': u'ark:/13030/tf0870010x'},
 {u'fsmDateCreated': [u'Dec. 3, 1964'],
  u'fsmImageUrl': [u'http://nma.berkeley.edu/ark:/28722/bk0005j9z1p',
   u'http://nma.berkeley.edu/ark:/28722/bk0005j9z27'],
  u'fsmTitle': [u'Crowds in Sproul Plaza'],
  u'id': u'ark:/13030/ft8199p26d'},
 {u'fsmDateCreated': [u'Dec. 2, 1964'],
  u'fsmImageUrl': [u'http://nma.berkeley.edu/ark:/28722/bk0005j9x7g',
   u'http://nma.berkeley.edu/ark:/28722/bk0005j9x81'],
  u'fsmTitle': [u'Professor David Hackett talking to his class.'],
  u'id': u'ark:/13030/ft9000102p'}]

In [9]:
df = DataFrame(results)

In [10]:
len(df)


Out[10]:
685

In [11]:
df.fsmImageUrl


Out[11]:
0     [http://nma.berkeley.edu/ark:/28722/bk0005j9r9...
1     [http://nma.berkeley.edu/ark:/28722/bk0005k284...
2     [http://nma.berkeley.edu/ark:/28722/bk0005k2c2...
3     [http://nma.berkeley.edu/ark:/28722/bk0005j9z5...
4     [http://nma.berkeley.edu/ark:/28722/bk0005j9n7...
5     [http://nma.berkeley.edu/ark:/28722/bk0005k1b6...
6     [http://nma.berkeley.edu/ark:/28722/bk0005j9v3...
7     [http://nma.berkeley.edu/ark:/28722/bk0005k0v2...
8     [http://nma.berkeley.edu/ark:/28722/bk0005j9z1...
9     [http://nma.berkeley.edu/ark:/28722/bk0005j9x7...
10    [http://nma.berkeley.edu/ark:/28722/bk0005k232...
11    [http://nma.berkeley.edu/ark:/28722/bk0005k047...
12    [http://nma.berkeley.edu/ark:/28722/bk0005k1c8...
13    [http://nma.berkeley.edu/ark:/28722/bk0005k110...
14    [http://nma.berkeley.edu/ark:/28722/bk0005k276...
...
670    [http://digitalassets.lib.berkeley.edu/fsm/ucb...
671    [http://digitalassets.lib.berkeley.edu/fsm/ucb...
672    [http://digitalassets.lib.berkeley.edu/fsm/ucb...
673    [http://digitalassets.lib.berkeley.edu/fsm/ucb...
674    [http://digitalassets.lib.berkeley.edu/fsm/ucb...
675    [http://digitalassets.lib.berkeley.edu/fsm/ucb...
676    [http://digitalassets.lib.berkeley.edu/fsm/ucb...
677    [http://digitalassets.lib.berkeley.edu/fsm/ucb...
678    [http://digitalassets.lib.berkeley.edu/fsm/ucb...
679    [http://digitalassets.lib.berkeley.edu/fsm/ucb...
680    [http://digitalassets.lib.berkeley.edu/fsm/ucb...
681    [http://digitalassets.lib.berkeley.edu/fsm/ucb...
682    [http://digitalassets.lib.berkeley.edu/fsm/ucb...
683    [http://digitalassets.lib.berkeley.edu/fsm/ucb...
684    [http://digitalassets.lib.berkeley.edu/fsm/ucb...
Name: fsmImageUrl, Length: 685, dtype: object

In [12]:
from IPython.display import HTML
from jinja2 import Template

CSS = """
<style>
  .wrap img {
    margin-left: 0px;
    margin-right: 0px;
    display: inline-block;
    width: 150px;
  }
  
.wrap {
   /* Prevent vertical gaps */
   line-height: 0;
   
   -webkit-column-count: 5;
   -webkit-column-gap:   0px;
   -moz-column-count:    5;
   -moz-column-gap:      0px;
   column-count:         5;
   column-gap:           0px;
   
}
.wrap img {
  /* Just in case there are inline attributes */
  width: 100% !important;
  height: auto !important;
}

</style>
"""

# Titles and URLs come straight from the API, so they are passed through
# Jinja's escape filter (|e) before being placed inside HTML attributes;
# otherwise a quote or angle bracket in a title would break the markup.
IMAGES_TEMPLATE = CSS + """
<div class="wrap">
 {% for item in items %}<img title="{{item.fsmTitle.0|e}}" src="{{item.fsmImageUrl.0|e}}"/>{% endfor %}
</div>
"""

template = Template(IMAGES_TEMPLATE)
HTML(template.render(items=results[:10]))


Out[12]:

DISTINGUISHING IMAGES FROM DOCUMENTS

To programmatically differentiate records that describe images from records that describe TEI-encoded XML documents, the API permits queries that keep only the records with NULL (missing) values in the "unwanted" Url field, i.e. queries that exclude every record in which that field is populated.

That is, to retrieve TEI documents only, one would query for null values in the fsmImageUrl field. To retrieve images only, one would query for null values in the fsmTeiUrl field.

NOTE: Please observe the hyphen prepended to the field names in the examples below. The hyphen (minus sign) functions here as a NOT operator.

Example that selects for TEI-encoded XML documents by excluding every record that has a value in fsmImageUrl:

 https://<BASE URL>/solr/fsm/select?q=-fsmImageUrl:[* TO *]&wt=json&indent=true&app_id=abcdefgh&app_key=12345678901234567890123456789012

Example that selects for images by excluding every record that has a value in fsmTeiUrl:

 https://<BASE URL>/solr/fsm/select?q=-fsmTeiUrl:[* TO *]&wt=json&indent=true&app_id=abcdefgh&app_key=12345678901234567890123456789012

In [13]:
# TEI-encoded docs

len(FSM("-fsmImageUrl:[* TO *]"))


Out[13]:
194

In [14]:
# images

len(FSM("-fsmTeiUrl:[* TO *]", fl="id,fsmImageUrl"))


Out[14]:
685

Studying the API parameters


In [15]:
from lxml.html import parse, fromstring
from collections import OrderedDict

api_docs_url = "http://digitalhumanities.berkeley.edu/hackfsm/api/detail"
r = requests.get(api_docs_url).content
doc = fromstring(r)

In [16]:
rows = doc.xpath('//div[@id="content"]/article/div/div/div/table[1]//tr')
headers = [col.text_content().strip() for col in rows[0].findall('td')]
headers


Out[16]:
['Field Name', 'Definitions']

In [17]:
# every row after the header row becomes one list of stripped cell texts;
# OrderedDict then expects each of those lists to be a (name, definition) pair
fields = [
    [cell.text_content().strip() for cell in tr.findall('td')]
    for tr in rows[1:]
]

fsmfields = OrderedDict(fields)
fsmfields.keys()


Out[17]:
['id',
 'fsmTitle',
 'fsmCreator',
 'fsmTypeOfResource',
 'fsmDateCreated',
 'fsmNote',
 'fsmRelatedTitle',
 'fsmIdentifier',
 'fsmRelatedIdentifier',
 'fsmPhysicalLocation',
 'fsmImageUrl',
 'fsmTeiUrl']

Study all the records


In [18]:
fsm = FSM(q="*",fl=",".join(fsmfields.keys()))

In [19]:
len(fsm)


Out[19]:
879

In [20]:
df = DataFrame(list(fsm))

In [21]:
len(df)


Out[21]:
879

In [22]:
df.head()


Out[22]:
fsmCreator fsmDateCreated fsmIdentifier fsmImageUrl fsmNote fsmPhysicalLocation fsmRelatedIdentifier fsmRelatedTitle fsmTeiUrl fsmTitle fsmTypeOfResource id
0 [Warren (Photographer)] [Nov. 9, 1964] [BANC PIC 1959.010 -- NEG pt.3 11-09-64.4] [http://nma.berkeley.edu/ark:/28722/bk0005j9r9... [Photographer] [The Bancroft Library;;, University of Califor... [http://bancroft.berkeley.edu/FSM/, BANC PIC 1... [The Free Speech Movement Digital Archive, San... NaN [Professor John Searle speaking to crowd.] [still image] ark:/13030/ft6k40080h
1 [Steven Marcus] [Dec. 2, 1964] [BANC PIC 2000.002--NEG Strip 117:36] [http://nma.berkeley.edu/ark:/28722/bk0005k284... [Photographer] [The Bancroft Library;;, University of Califor... [http://bancroft.berkeley.edu/FSM/, BANC PIC 2... [The Free Speech Movement Digital Archive, Ste... NaN [Mario Savio speaking with reporters.] [still image] ark:/13030/tf009n97vn
2 [Steven Marcus] [Dec. 2, 1964] [BANC PIC 2000.002--NEG Strip 122:42] [http://nma.berkeley.edu/ark:/28722/bk0005k2c2... [Photographer] [The Bancroft Library;;, University of Califor... [http://bancroft.berkeley.edu/FSM/, BANC PIC 2... [The Free Speech Movement Digital Archive, Ste... NaN [Joan Baez singing in front of Sproul Hall.] [still image] ark:/13030/tf5j49n838
3 [Jones (Photographer)] [Dec. 3, 1964] [BANC PIC 1959.010 -- NEG pt.3 12-03-64.2] [http://nma.berkeley.edu/ark:/28722/bk0005j9z5... [Photographer] [The Bancroft Library;;, University of Califor... [http://bancroft.berkeley.edu/FSM/, BANC PIC 1... [The Free Speech Movement Digital Archive, San... NaN [Girl student being booked on campus before be... [still image] ark:/13030/ft700007tc
4 [Ingman (Photographer)] [Oct. 5, 1964] [BANC PIC 1959.010 -- NEG pt.3 10-05-64.4] [http://nma.berkeley.edu/ark:/28722/bk0005j9n7... [Photographer] [The Bancroft Library;;, University of Califor... [http://bancroft.berkeley.edu/FSM/, BANC PIC 1... [The Free Speech Movement Digital Archive, San... NaN [Bryan Turner speaking.] [still image] ark:/13030/ft7n39p1mr

5 rows × 12 columns


In [23]:
# TEI URIs

len(list(df[~df.fsmTeiUrl.isnull()].fsmTeiUrl.apply(lambda a: a[0])))


Out[23]:
194

In [24]:
# null dates

len(df[df.fsmDateCreated.isnull()])


Out[24]:
393

In [25]:
# non-null image URLs
len(df[~df.fsmImageUrl.isnull()])


Out[25]:
685

In [26]:
df[~df.fsmImageUrl.isnull()].id


Out[26]:
0     ark:/13030/ft6k40080h
1     ark:/13030/tf009n97vn
2     ark:/13030/tf5j49n838
3     ark:/13030/ft700007tc
4     ark:/13030/ft7n39p1mr
5     ark:/13030/tf8w1006vp
6     ark:/13030/ft9f59p3bw
7     ark:/13030/tf0870010x
8     ark:/13030/ft8199p26d
9     ark:/13030/ft9000102p
10    ark:/13030/tf7n39n9qb
11    ark:/13030/ft3c6004k4
12    ark:/13030/tf8n39p05g
13    ark:/13030/tf20000235
14    ark:/13030/tf0d5n97ws
...
670    access369
671    access370
672    access371
673    access372
674    access373
675    access374
676    access375
677    access376
678    access377
679    access378
680    access379
681    access380
682    access381
683    access382
684    access383
Name: id, Length: 685, dtype: object

In [27]:
# distribution of number of image URLs

df[~df.fsmImageUrl.isnull()].fsmImageUrl.apply(len).value_counts()


Out[27]:
2    628
3     56
4      1
dtype: int64

In [28]:
# let's crawl for images

results_images = list(FSM("-fsmTeiUrl:[* TO *]", fl=",".join(fsmfields.keys())))

In [29]:
len(results_images)


Out[29]:
685

In [30]:
df_images=DataFrame(results_images)

In [31]:
df_images[df_images.fsmImageUrl.isnull()]


Out[31]:
Int64Index([], dtype='int64') Empty DataFrame

0 rows × 11 columns


In [32]:
# would be interesting to see sizes of images and whether we can get at thumbnails
df_images.fsmImageUrl


Out[32]:
0     [http://nma.berkeley.edu/ark:/28722/bk0005j9r9...
1     [http://nma.berkeley.edu/ark:/28722/bk0005k284...
2     [http://nma.berkeley.edu/ark:/28722/bk0005k2c2...
3     [http://nma.berkeley.edu/ark:/28722/bk0005j9z5...
4     [http://nma.berkeley.edu/ark:/28722/bk0005j9n7...
5     [http://nma.berkeley.edu/ark:/28722/bk0005k1b6...
6     [http://nma.berkeley.edu/ark:/28722/bk0005j9v3...
7     [http://nma.berkeley.edu/ark:/28722/bk0005k0v2...
8     [http://nma.berkeley.edu/ark:/28722/bk0005j9z1...
9     [http://nma.berkeley.edu/ark:/28722/bk0005j9x7...
10    [http://nma.berkeley.edu/ark:/28722/bk0005k232...
11    [http://nma.berkeley.edu/ark:/28722/bk0005k047...
12    [http://nma.berkeley.edu/ark:/28722/bk0005k1c8...
13    [http://nma.berkeley.edu/ark:/28722/bk0005k110...
14    [http://nma.berkeley.edu/ark:/28722/bk0005k276...
...
670    [http://digitalassets.lib.berkeley.edu/fsm/ucb...
671    [http://digitalassets.lib.berkeley.edu/fsm/ucb...
672    [http://digitalassets.lib.berkeley.edu/fsm/ucb...
673    [http://digitalassets.lib.berkeley.edu/fsm/ucb...
674    [http://digitalassets.lib.berkeley.edu/fsm/ucb...
675    [http://digitalassets.lib.berkeley.edu/fsm/ucb...
676    [http://digitalassets.lib.berkeley.edu/fsm/ucb...
677    [http://digitalassets.lib.berkeley.edu/fsm/ucb...
678    [http://digitalassets.lib.berkeley.edu/fsm/ucb...
679    [http://digitalassets.lib.berkeley.edu/fsm/ucb...
680    [http://digitalassets.lib.berkeley.edu/fsm/ucb...
681    [http://digitalassets.lib.berkeley.edu/fsm/ucb...
682    [http://digitalassets.lib.berkeley.edu/fsm/ucb...
683    [http://digitalassets.lib.berkeley.edu/fsm/ucb...
684    [http://digitalassets.lib.berkeley.edu/fsm/ucb...
Name: fsmImageUrl, Length: 685, dtype: object

In [33]:
urlparse.urlparse("http://digitalassets.lib.berkeley.edu/fsm/ucb/images/brk00040569b_c.jpg").netloc


Out[33]:
'digitalassets.lib.berkeley.edu'

In [34]:
df_images.fsmImageUrl


Out[34]:
0     [http://nma.berkeley.edu/ark:/28722/bk0005j9r9...
1     [http://nma.berkeley.edu/ark:/28722/bk0005k284...
2     [http://nma.berkeley.edu/ark:/28722/bk0005k2c2...
3     [http://nma.berkeley.edu/ark:/28722/bk0005j9z5...
4     [http://nma.berkeley.edu/ark:/28722/bk0005j9n7...
5     [http://nma.berkeley.edu/ark:/28722/bk0005k1b6...
6     [http://nma.berkeley.edu/ark:/28722/bk0005j9v3...
7     [http://nma.berkeley.edu/ark:/28722/bk0005k0v2...
8     [http://nma.berkeley.edu/ark:/28722/bk0005j9z1...
9     [http://nma.berkeley.edu/ark:/28722/bk0005j9x7...
10    [http://nma.berkeley.edu/ark:/28722/bk0005k232...
11    [http://nma.berkeley.edu/ark:/28722/bk0005k047...
12    [http://nma.berkeley.edu/ark:/28722/bk0005k1c8...
13    [http://nma.berkeley.edu/ark:/28722/bk0005k110...
14    [http://nma.berkeley.edu/ark:/28722/bk0005k276...
...
670    [http://digitalassets.lib.berkeley.edu/fsm/ucb...
671    [http://digitalassets.lib.berkeley.edu/fsm/ucb...
672    [http://digitalassets.lib.berkeley.edu/fsm/ucb...
673    [http://digitalassets.lib.berkeley.edu/fsm/ucb...
674    [http://digitalassets.lib.berkeley.edu/fsm/ucb...
675    [http://digitalassets.lib.berkeley.edu/fsm/ucb...
676    [http://digitalassets.lib.berkeley.edu/fsm/ucb...
677    [http://digitalassets.lib.berkeley.edu/fsm/ucb...
678    [http://digitalassets.lib.berkeley.edu/fsm/ucb...
679    [http://digitalassets.lib.berkeley.edu/fsm/ucb...
680    [http://digitalassets.lib.berkeley.edu/fsm/ucb...
681    [http://digitalassets.lib.berkeley.edu/fsm/ucb...
682    [http://digitalassets.lib.berkeley.edu/fsm/ucb...
683    [http://digitalassets.lib.berkeley.edu/fsm/ucb...
684    [http://digitalassets.lib.berkeley.edu/fsm/ucb...
Name: fsmImageUrl, Length: 685, dtype: object

In [35]:
# calculate hostnames for all image urls

# might be possible to do this all with pandas
# one set of hostnames per record, then the union over all records
netlocs = [set(urlparse.urlparse(url).netloc for url in urls)
           for urls in df_images.fsmImageUrl.dropna()]
set().union(*netlocs)


Out[35]:
{u'digitalassets.lib.berkeley.edu',
 u'nma.berkeley.edu',
 u'sunsite.berkeley.edu'}

In [36]:
def len2(x):
    """Return ``len(x)``, or NaN when ``x`` has no length (e.g. a float NaN cell).

    Only TypeError is absorbed; a bare ``except`` would also swallow
    KeyboardInterrupt and SystemExit.
    """
    try:
        return len(x)
    except TypeError:
        return np.nan
    
df_images.fsmImageUrl.apply(len2) == 3


Out[36]:
0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
...
670    False
671    False
672    False
673    False
674    False
675    False
676    False
677    False
678    False
679    False
680    False
681    False
682    False
683    False
684    False
Name: fsmImageUrl, Length: 685, dtype: bool

In [37]:
df_images[df_images.fsmImageUrl.apply(len2) == 3].head()


Out[37]:
fsmCreator fsmDateCreated fsmIdentifier fsmImageUrl fsmNote fsmPhysicalLocation fsmRelatedIdentifier fsmRelatedTitle fsmTitle fsmTypeOfResource id
246 [Hecker, Ron] [Dec. 7, 1964] NaN [http://sunsite.berkeley.edu/FindingAids/dynaw... NaN NaN NaN [Free Speech Movement Photographs Collection, ] [Crowd in Sproul Plaza from Student Union balc... NaN UARC PIC 24B:2:22
247 [Hecker, Ron] [Dec. 7, 1964] NaN [http://sunsite.berkeley.edu/FindingAids/dynaw... NaN NaN NaN [Free Speech Movement Photographs Collection, ] [Crowd at Greek Theater] NaN UARC PIC 24B:2:17
248 NaN [Dec. 7, 1964] NaN [http://sunsite.berkeley.edu/FindingAids/dynaw... NaN NaN NaN [Free Speech Movement Photographs Collection, ] [View from inside Sproul Hall lobby looking th... NaN UARC PIC 24B:1:26
249 [Hecker, Ron] [Dec. 1964] NaN [http://sunsite.berkeley.edu/FindingAids/dynaw... NaN NaN NaN [Free Speech Movement Photographs Collection, ] [Student Strike] NaN UARC PIC 24B:2:6
250 [Hecker, Ron] [Dec. 7, 1964] NaN [http://sunsite.berkeley.edu/FindingAids/dynaw... NaN NaN NaN [Free Speech Movement Photographs Collection, ] [Crowd at Greek Theater] NaN UARC PIC 24B:2:21

5 rows × 11 columns


In [38]:
# .loc makes the lookup explicitly label-based (row label 100); .ix mixed
# label and positional semantics and is deprecated in later pandas releases
df_images[df_images.fsmImageUrl.apply(len2) == 4].loc[100].fsmImageUrl


Out[38]:
[u'http://nma.berkeley.edu/ark:/28722/bk001532c4q',
 u'http://nma.berkeley.edu/ark:/28722/bk001532c7c',
 u'http://nma.berkeley.edu/ark:/28722/bk001532c58',
 u'http://nma.berkeley.edu/ark:/28722/bk001532c8x']

In [39]:
# URLs are escaped (|e) before being written into HTML attributes, and the
# row is looked up by label with .loc instead of the ambiguous .ix
IMAGES_TEMPLATE = """
<div class="nowrap">
 {% for item in items %}<img title="{{item|e}}" src="{{item|e}}"/>{% endfor %}
</div>
"""

template = Template(IMAGES_TEMPLATE)
HTML(template.render(items=df_images[df_images.fsmImageUrl.apply(len2) == 4].loc[100].fsmImageUrl ))


Out[39]:

Dates


In [40]:
len(df[~df.fsmDateCreated.isnull()])


Out[40]:
486

In [41]:
s = df[~df.fsmDateCreated.isnull()].fsmDateCreated.apply(len)==2 #.astype('datetime64[ns]')

In [42]:
def first(x):
    """Return ``x[0]``, or NaN when ``x`` is empty or cannot be indexed.

    Only the lookup failures (TypeError, IndexError, KeyError) are absorbed;
    a bare ``except`` would also swallow KeyboardInterrupt and SystemExit.
    """
    try:
        return x[0]
    except (TypeError, IndexError, KeyError):
        return np.nan


df['calc_date'] = pd.to_datetime(df.fsmDateCreated.apply(first), coerce=True)

In [43]:
df[~df.calc_date.isnull()].sort_index(by='calc_date').calc_date


Out[43]:
156   1964-01-01
90    1964-01-01
74    1964-01-01
14    1964-01-01
146   1964-01-01
731   1964-01-01
92    1964-01-01
167   1964-01-01
300   1964-01-01
220   1964-01-01
871   1964-01-01
203   1964-01-05
261   1964-10-01
245   1964-10-01
243   1964-10-01
...
197   1970-05-03
210   1970-05-03
23    1970-05-05
50    1970-05-05
179   1970-05-05
869   1973-01-01
129   1984-10-02
180   1984-10-02
159   1984-10-02
287   1984-10-02
289   1984-10-02
299   1984-10-02
868   1986-01-01
801   1990-01-01
867   1993-06-03
Name: calc_date, Length: 434, dtype: datetime64[ns]

In [44]:
pd.to_datetime(df.fsmDateCreated.dropna().apply(lambda s:s[0]).astype('str'), coerce=True).dropna()


Out[44]:
0    1964-11-09
1    1964-12-02
2    1964-12-02
3    1964-12-03
4    1964-10-05
5    1964-11-09
6    1964-11-24
7    1964-10-01
8    1964-12-03
9    1964-12-02
10   1964-12-03
11   1964-12-07
12   1964-11-09
13   1964-10-01
14   1964-01-01
...
863   1965-07-26
864   1965-10-13
865   1965-03-05
867   1993-06-03
868   1986-01-01
869   1973-01-01
870   1965-01-03
871   1964-01-01
872   1964-11-30
873   1964-12-04
874   1964-12-22
875   1965-01-07
876   1964-12-21
877   1965-01-09
878   1965-01-02
Name: fsmDateCreated, Length: 434, dtype: datetime64[ns]

In [45]:
# http://stackoverflow.com/questions/17690738/in-pandas-how-do-i-convert-a-string-of-date-strings-to-datetime-objects-and-put
date_stngs = ('2008-12-20','2008-12-21','2008-12-22','2008-12-23','Nov. 9, 1964', 'junk')
pd.to_datetime(pd.Series(date_stngs),coerce=True)


Out[45]:
0   2008-12-20
1   2008-12-21
2   2008-12-22
3   2008-12-23
4   1964-11-09
5          NaT
dtype: datetime64[ns]

Types of Resources


In [46]:
def f(x):
    """Return ``set(x)``, or an empty set when ``x`` is not iterable (e.g. NaN).

    Only TypeError is absorbed; a bare ``except`` would also swallow
    KeyboardInterrupt and SystemExit.
    """
    try:
        return set(x)
    except TypeError:
        return set()
    
# union of the per-record sets of resource types
set().union(*df.fsmTypeOfResource.apply(f))


Out[46]:
{u'Box 1:1',
 u'Box 1:11',
 u'Box 1:11:4',
 u'Box 1:13',
 u'Box 1:13:1',
 u'Box 1:13:4',
 u'Box 1:14',
 u'Box 1:15',
 u'Box 1:16',
 u'Box 1:17',
 u'Box 1:2',
 u'Box 1:25',
 u'Box 1:25:1',
 u'Box 1:25:4',
 u'Box 1:28',
 u'Box 1:29',
 u'Box 1:2:3',
 u'Box 1:30',
 u'Box 1:30:2',
 u'Box 1:32',
 u'Box 1:34',
 u'Box 1:34:1',
 u'Box 1:38',
 u'Box 1:39',
 u'Box 1:4',
 u'Box 1:41',
 u'Box 1:43',
 u'Box 1:44',
 u'Box 1:45',
 u'Box 1:46',
 u'Box 1:5',
 u'Box 1:6',
 u'Box 1:7',
 u'Box 1:8',
 u'Box 2:11',
 u'Box 2:11:1',
 u'Box 2:11:2',
 u'Box 2:11:3',
 u'Box 2:11:4',
 u'Box 2:11:6',
 u'Box 2:18',
 u'Box 2:18:1',
 u'Box 2:22',
 u'Box 2:22:1',
 u'Box 2:47',
 u'Box 2:47:1',
 u'Box 2:47:3',
 u'Box 2:49',
 u'Box 2:49:1',
 u'Box 2:49:2',
 u'Box 2:55',
 u'Box 2:59',
 u'Box 2:8',
 u'Box 2:8:3',
 u'Box 2:8:4',
 u'Box 3:1',
 u'Box 3:11',
 u'Box 3:14',
 u'Box 3:14:1',
 u'Box 3:15',
 u'Box 3:17',
 u'Box 3:17:2',
 u'Box 3:2',
 u'Box 3:21',
 u'Box 3:22',
 u'Box 3:23',
 u'Box 3:26',
 u'Box 3:29',
 u'Box 3:29:1',
 u'Box 3:29:2',
 u'Box 3:3',
 u'Box 3:31',
 u'Box 3:33',
 u'Box 3:34',
 u'Box 3:34:1',
 u'Box 3:36',
 u'Box 3:38',
 u'Box 3:39',
 u'Box 3:39:4',
 u'Box 3:39:5',
 u'Box 3:39:8',
 u'Box 3:40',
 u'Box 3:41',
 u'Box 3:5',
 u'Box 4:10',
 u'Box 4:5',
 u'Box 4:5:13',
 u'Box 4:5:2',
 u'Box 4:5:6',
 u'Box 4:8',
 u'Box 4:8:5',
 u'Box 4:9',
 u'Box 4:9:3',
 u'Box 70:33',
 u'Box 70:33:2',
 u'Box 70:33:4',
 u'Box 70:34',
 u'Box 70:34:1',
 u'Box 70:34:3',
 u'Box 70:34:7c',
 u'Box 70:34:8',
 u'Box 72:14',
 u'Box 72:14:1',
 u'Box 72:14:11',
 u'Box 72:14:19',
 u'Box 72:23',
 u'Box 72:23:1',
 u'Carton 1:12',
 u'Carton 1:12:2',
 u'Carton 1:12:3',
 u'Carton 1:12:4',
 u'Carton 1:12:5',
 u'Carton 1:12:6',
 u'Carton 1:12:7',
 u'Carton 1:12:8',
 u'Carton 1:14',
 u'Carton 1:15',
 u'Carton 1:9',
 u'Carton 21:14:1',
 u'Carton 21:14:7',
 u'Carton 21:16',
 u'Carton 21:2:1',
 u'Carton 2:20',
 u'Carton 2:32',
 u'Carton 3:16',
 u'Carton 3:37',
 u'Carton 3:58',
 u'Carton 3:58:4',
 u'Carton 3:58:7',
 u'Carton 4:32',
 u'Carton 4:78',
 u'Carton 4:80',
 u'agendas',
 u'articles',
 u'briefs (legal documents)',
 u'detail',
 u'fdr',
 u'fliers (printed matter)',
 u'folder',
 u'form letters',
 u'group statements',
 u'item',
 u'leaflets',
 u'letters (correspondence)',
 u'magazines (periodicals)',
 u'memorandums',
 u'minutes',
 u'miscellaneous',
 u'news bulletins',
 u'newsletters',
 u'newspapers',
 u'oral histories',
 u'pamphlets',
 u'papers (document genres)',
 u'personal statement',
 u'personal statements',
 u'progress reports',
 u'reports',
 u'still image',
 u'tables of content',
 u'text',
 u'title pages',
 u'transcripts'}

In [47]:
#related id

len(df.fsmRelatedIdentifier.dropna())


Out[47]:
236

TEI documents


In [48]:
df.fsmTeiUrl.dropna()


Out[48]:
685    [http://content.cdlib.org/xml/ark:/13030/kt5m3...
686    [http://content.cdlib.org/xml/ark:/13030/kt5s2...
687    [http://content.cdlib.org/xml/ark:/13030/kt6k4...
688    [http://content.cdlib.org/xml/ark:/13030/kt4s2...
689    [http://content.cdlib.org/xml/ark:/13030/kt1h4...
690    [http://content.cdlib.org/xml/ark:/13030/kt2w1...
691    [http://content.cdlib.org/xml/ark:/13030/kt609...
692    [http://content.cdlib.org/xml/ark:/13030/kt638...
693    [http://content.cdlib.org/xml/ark:/13030/kt777...
694    [http://content.cdlib.org/xml/ark:/13030/kt0k4...
695    [http://content.cdlib.org/xml/ark:/13030/kt6m3...
696    [http://content.cdlib.org/xml/ark:/13030/kt287...
697    [http://content.cdlib.org/xml/ark:/13030/kt3p3...
698    [http://content.cdlib.org/xml/ark:/13030/kt177...
699    [http://content.cdlib.org/xml/ark:/13030/kt1g5...
...
864    [http://content.cdlib.org/xml/ark:/13030/kt3z0...
865    [http://content.cdlib.org/xml/ark:/13030/kt5h4...
866    [http://content.cdlib.org/xml/ark:/13030/kt1v1...
867    [http://content.cdlib.org/xml/ark:/13030/kt7d5...
868    [http://content.cdlib.org/xml/ark:/13030/kt7h4...
869    [http://content.cdlib.org/xml/ark:/13030/kt919...
870    [http://content.cdlib.org/xml/ark:/13030/kt409...
871    [http://content.cdlib.org/xml/ark:/13030/kt4c6...
872    [http://content.cdlib.org/xml/ark:/13030/kt387...
873    [http://content.cdlib.org/xml/ark:/13030/kt3q2...
874    [http://content.cdlib.org/xml/ark:/13030/kt7v1...
875    [http://content.cdlib.org/xml/ark:/13030/kt038...
876    [http://content.cdlib.org/xml/ark:/13030/kt7z0...
877    [http://content.cdlib.org/xml/ark:/13030/kt500...
878    [http://content.cdlib.org/xml/ark:/13030/kt9b6...
Name: fsmTeiUrl, Length: 194, dtype: object