HackFSM
An open question: how does this API relate to other public APIs built on Solr?
Documentation:
In [1]:
from settings import (HACKFSM_ID, HACKFSM_KEY, HACKFSM_BASEURL)
from itertools import islice
import logging
import requests
import json
import urllib
import urlparse
from pandas import DataFrame, Series
import pandas as pd
import numpy as np
logging.basicConfig(filename='Experiment_20140325_HackFSM.log',level=logging.WARNING)
logger=logging.getLogger()
In [2]:
def query(q, fl="id"):
    """Run a single query against the HackFSM Solr endpoint.

    q  -- Solr query string (e.g. "fsmTitle:Savio")
    fl -- comma-separated list of fields to return (default: "id")

    Returns the parsed JSON response as a dict.
    """
    params = {
        'q': q,
        'fl': fl,
        'wt': 'json',
        'app_id': HACKFSM_ID,
        'app_key': HACKFSM_KEY,
    }
    url = "{base_url}?".format(base_url=HACKFSM_BASEURL) + urllib.urlencode(params)
    response = requests.get(url)
    return response.json()
In [3]:
result = query(q="fsmTitle:Savio")['response']
result
Out[3]:
In [4]:
# try again
# http://stackoverflow.com/a/5724453/7782
# http://excess.org/article/2013/02/itergen1/
class my_g(object):
    """Toy sized iterator: yields 0 .. max_count-1 and supports len().

    Demonstrates an iterator object that also reports its length, unlike a
    plain generator.
    """
    def __init__(self, max_count):
        # Materialize as a list so .pop(0) works on both Python 2 and 3
        # (Python 3's range object has no pop()); identical behavior on
        # Python 2, where range() already returned a list.
        self._remaining = list(range(max_count))
        self._len = max_count
    def __iter__(self):
        return self
    def __len__(self):
        # Note: len() stays at max_count even after items are consumed.
        return self._len
    def next(self):
        if not self._remaining:
            raise StopIteration
        return self._remaining.pop(0)
    # Python 3 iterator-protocol alias (py3 calls __next__, py2 calls next).
    __next__ = next
g=my_g(10)
print len(g)
list(g)
Out[4]:
In [5]:
class FSM(object):
    """Iterator over all documents matching a HackFSM (Solr) query.

    Transparently pages through the result set: fetches `rows` documents at
    a time and yields one doc (a dict) per iteration. len(fsm) reports the
    server's total hit count (numFound).
    """
    def __init__(self, q, fl="id", start=0, rows=30,
                 base_url=HACKFSM_BASEURL, app_id=HACKFSM_ID, app_key=HACKFSM_KEY):
        self.q = q
        self.fl = fl
        self.start = start
        self.rows = rows
        self.base_url = base_url
        self.app_id = app_id
        self.app_key = app_key
        self.cursor = start
        # numFound is unknown until the first response arrives. Initializing
        # it here fixes a bug: _call_api's short-page sanity check read
        # self.numfound, which did not exist yet during the first fetch below,
        # raising AttributeError whenever the first page was short.
        self.numfound = None
        # get the first page (also populates self.page / self.page_len)
        result = self._get_page(q, fl, self.cursor, self.rows)
        self.numfound = result['response']['numFound']
    def _check_status(self, result):
        """Raise FSMException if the Solr response header has non-zero status.

        NOTE(review): FSMException is assumed to be defined elsewhere in this
        notebook/project — confirm, otherwise this raise becomes a NameError.
        """
        if result['responseHeader']['status'] != 0:
            raise FSMException("status: " + str(result['responseHeader']['status']))
    def _get_page(self, q, fl, start, rows):
        """Fetch one page of results; cache docs on self.page / self.page_len."""
        result = self._call_api(q, fl, start, rows)
        # update current page
        self.page = result['response']['docs']
        self.page_len = len(self.page)
        return result
    def _call_api(self, q, fl, start, rows):
        """Issue one HTTP GET to the Solr endpoint and return the parsed JSON."""
        url = "{base_url}?".format(base_url=self.base_url) + \
            urllib.urlencode({'q': q,
                              'fl': fl,
                              'wt': 'json',
                              'start': start,
                              # Bug fix: Solr's page-size parameter is 'rows',
                              # not 'row'. The misspelled parameter was silently
                              # ignored, so every request fell back to the
                              # server's default page size.
                              'rows': rows,
                              'app_id': self.app_id,
                              'app_key': self.app_key})
        result = requests.get(url).json()
        self._check_status(result)
        # Warn if we received a short page that is not simply the tail of the
        # result set. Skipped on the very first call (numfound still unknown).
        if len(result['response']['docs']) < rows:
            if (self.numfound is not None and
                    start + len(result['response']['docs']) != self.numfound):
                logger.warning(
                    "url:{url}, numfound:{numfound}, start+len:{start_plus_len}".format(
                        url=url,
                        numfound=self.numfound,
                        start_plus_len=start + len(result['response']['docs'])))
        return result
    def __iter__(self):
        return self
    def __len__(self):
        # Total number of matching documents reported by the server.
        return self.numfound
    def next(self):
        """Return the next document, fetching a new page when the cache is empty."""
        if not self.page:
            # retrieve next page and check whether there's anything left
            self.cursor += self.page_len
            self._get_page(self.q, self.fl, self.cursor, self.rows)
            if self.page_len == 0:
                raise StopIteration
        return self.page.pop(0)
    # Python 3 iterator-protocol alias (py3 calls __next__, py2 calls next).
    __next__ = next
In [6]:
fsm = FSM("-fsmTeiUrl:[* TO *]", fl="id,fsmTitle,fsmImageUrl,fsmDateCreated")
In [7]:
len(fsm)
Out[7]:
In [8]:
results = list(islice(fsm,None))
results[:10]
Out[8]:
In [9]:
df = DataFrame(results)
In [10]:
len(df)
Out[10]:
In [11]:
df.fsmImageUrl
Out[11]:
In [12]:
from IPython.display import HTML
from jinja2 import Template
CSS = """
<style>
.wrap img {
margin-left: 0px;
margin-right: 0px;
display: inline-block;
width: 150px;
}
.wrap {
/* Prevent vertical gaps */
line-height: 0;
-webkit-column-count: 5;
-webkit-column-gap: 0px;
-moz-column-count: 5;
-moz-column-gap: 0px;
column-count: 5;
column-gap: 0px;
}
.wrap img {
/* Just in case there are inline attributes */
width: 100% !important;
height: auto !important;
}
</style>
"""
IMAGES_TEMPLATE = CSS + """
<div class="wrap">
{% for item in items %}<img title="{{item.fsmTitle.0}}" src="{{item.fsmImageUrl.0}}"/>{% endfor %}
</div>
"""
template = Template(IMAGES_TEMPLATE)
HTML(template.render(items=results[:10]))
Out[12]:
To programmatically differentiate records that describe images from records that describe TEI-encoded XML documents, the API permits queries that exclude records with NULL values in the "unwanted" Url field.
That is, to retrieve TEI documents only, one would query for null values in the fsmImageUrl field. To retrieve images only, one would query for null values in the fsmTeiUrl field.
NOTE: Please observe the hyphen prepended to the field names in the examples below. The hyphen (minus sign) functions here as a NOT operator.
Example that selects for TEI encoded XML documents by excluding null values of fsmImageUrl:
https://<BASE URL>/solr/fsm/select?q=-fsmImageUrl:[* TO *]&wt=json&indent=true&app_id=abcdefgh&app_key=12345678901234567890123456789012
Example that selects for images by excluding null values of fsmTeiUrl:
https://<BASE URL>/solr/fsm/select?q=-fsmTeiUrl:[* TO *]&wt=json&indent=true&app_id=abcdefgh&app_key=12345678901234567890123456789012
In [13]:
# TEI-encoded docs
len(FSM("-fsmImageUrl:[* TO *]"))
Out[13]:
In [14]:
# images
len(FSM("-fsmTeiUrl:[* TO *]", fl="id,fsmImageUrl"))
Out[14]:
In [15]:
from lxml.html import parse, fromstring
from collections import OrderedDict
api_docs_url = "http://digitalhumanities.berkeley.edu/hackfsm/api/detail"
r = requests.get(api_docs_url).content
doc = fromstring(r)
In [16]:
rows = doc.xpath('//div[@id="content"]/article/div/div/div/table[1]//tr')
headers = [col.text_content().strip() for col in rows[0].findall('td')]
headers
Out[16]:
In [17]:
fields = []
for row in rows[1:]:
field = [col.text_content().strip() for col in row.findall('td')]
fields.append(field)
fsmfields = OrderedDict(fields)
fsmfields.keys()
Out[17]:
In [18]:
fsm = FSM(q="*",fl=",".join(fsmfields.keys()))
In [19]:
len(fsm)
Out[19]:
In [20]:
df = DataFrame(list(fsm))
In [21]:
len(df)
Out[21]:
In [22]:
df.head()
Out[22]:
In [23]:
# TEI URIs
len(list(df[~df.fsmTeiUrl.isnull()].fsmTeiUrl.apply(lambda a: a[0])))
Out[23]:
In [24]:
# null dates
len(df[df.fsmDateCreated.isnull()])
Out[24]:
In [25]:
# non-null image URLs
len(df[~df.fsmImageUrl.isnull()])
Out[25]:
In [26]:
df[~df.fsmImageUrl.isnull()].id
Out[26]:
In [27]:
# distribution of number of image URLs
df[~df.fsmImageUrl.isnull()].fsmImageUrl.apply(len).value_counts()
Out[27]:
In [28]:
# let's crawl for images
results_images = list(FSM("-fsmTeiUrl:[* TO *]", fl=",".join(fsmfields.keys())))
In [29]:
len(results_images)
Out[29]:
In [30]:
df_images=DataFrame(results_images)
In [31]:
df_images[df_images.fsmImageUrl.isnull()]
Out[31]:
In [32]:
# would be interesting to see sizes of images and whether we can get at thumbnails
df_images.fsmImageUrl
Out[32]:
In [33]:
urlparse.urlparse("http://digitalassets.lib.berkeley.edu/fsm/ucb/images/brk00040569b_c.jpg").netloc
Out[33]:
In [34]:
df_images.fsmImageUrl
Out[34]:
In [35]:
# calculate hostnames for all image urls
# might be possible to do this all with pandas
netlocs = list(df_images.fsmImageUrl.dropna().apply(lambda urls: set([urlparse.urlparse(url).netloc for url in urls])))
reduce(lambda x,y: x | y, netlocs, set())
Out[35]:
In [36]:
def len2(x):
    """Return len(x), or NaN when x has no length.

    Intended for DataFrame columns whose missing cells are NaN floats: len()
    raises TypeError on floats/None, which we map to NaN. Catching TypeError
    specifically (instead of the original bare `except:`) keeps unrelated
    errors — including KeyboardInterrupt/SystemExit — visible.
    """
    try:
        return len(x)
    except TypeError:
        return np.nan
df_images.fsmImageUrl.apply(len2) == 3
Out[36]:
In [37]:
df_images[df_images.fsmImageUrl.apply(len2) == 3].head()
Out[37]:
In [38]:
df_images[df_images.fsmImageUrl.apply(len2) == 4].ix[100].fsmImageUrl
Out[38]:
In [39]:
IMAGES_TEMPLATE = """
<div class="nowrap">
{% for item in items %}<img title="{{item}}" src="{{item}}"/>{% endfor %}
</div>
"""
template = Template(IMAGES_TEMPLATE)
HTML(template.render(items=df_images[df_images.fsmImageUrl.apply(len2) == 4].ix[100].fsmImageUrl ))
Out[39]:
In [40]:
len(df[~df.fsmDateCreated.isnull()])
Out[40]:
In [41]:
s = df[~df.fsmDateCreated.isnull()].fsmDateCreated.apply(len)==2 #.astype('datetime64[ns]')
In [42]:
def first(x):
    """Return x[0], or NaN when x is not subscriptable or is empty.

    Used to pull the first entry out of list-valued DataFrame cells, where
    missing cells are NaN floats (not subscriptable -> TypeError). Catching
    the specific exceptions (instead of the original bare `except:`) keeps
    unrelated errors visible.
    """
    try:
        return x[0]
    except (TypeError, IndexError, KeyError):
        return np.nan
df['calc_date'] = pd.to_datetime(df.fsmDateCreated.apply(first), coerce=True)
In [43]:
df[~df.calc_date.isnull()].sort_index(by='calc_date').calc_date
Out[43]:
In [44]:
pd.to_datetime(df.fsmDateCreated.dropna().apply(lambda s:s[0]).astype('str'), coerce=True).dropna()
Out[44]:
In [45]:
# http://stackoverflow.com/questions/17690738/in-pandas-how-do-i-convert-a-string-of-date-strings-to-datetime-objects-and-put
date_stngs = ('2008-12-20','2008-12-21','2008-12-22','2008-12-23','Nov. 9, 1964', 'junk')
pd.to_datetime(pd.Series(date_stngs),coerce=True)
Out[45]:
In [46]:
def f(x):
    """Return set(x), or an empty set when x is not iterable (e.g. a NaN cell).

    Used to union the list-valued fsmTypeOfResource cells; missing cells are
    NaN floats, which set() rejects with TypeError. Catching TypeError
    specifically (instead of the original bare `except:`) keeps unrelated
    errors visible.
    """
    try:
        return set(x)
    except TypeError:
        return set()
reduce(lambda x,y: x | y, df.fsmTypeOfResource.apply(f), set())
Out[46]:
In [47]:
#related id
len(df.fsmRelatedIdentifier.dropna())
Out[47]:
In [48]:
df.fsmTeiUrl.dropna()
Out[48]: