In [14]:
import lxml.html
def PicFilter(table):
    """Yield absolute record URLs for JHSGW search results that have pictures.

    The search results table is built by awkward JavaScript on the site, so
    the raw HTML was copied by hand.  The table is split on row boundaries;
    rows labelled "Archive Record" or "Photo Record" are text-only (no
    thumbnail), so they are dropped.  The remaining rows' relative record
    links are rewritten as absolute URLs and yielded one at a time.

    Parameters
    ----------
    table : str
        Raw HTML of the results table.

    Yields
    ------
    str
        Absolute URL of one picture record page.
    """
    rows = table.split('</tr>')
    # Single pass: drop both text-only placeholder row types (originally two
    # separate list comprehensions).
    picture_rows = [row for row in rows
                    if 'Archive Record' not in row and 'Photo Record' not in row]
    fragment = ','.join(picture_rows)
    doc = lxml.html.document_fromstring(fragment)
    hrefs = doc.xpath('//a/@href')
    # Record links carry an "id=" query fragment; javascript: popups do not.
    links = [href for href in hrefs if "id=" in href]
    print("here are " + str(len(links)) + " pictures")
    base = 'http://jhsgw.pastperfect-online.com/31288cgi/mweb.exe?request=record;id='
    for link in links:
        yield link.replace('?request=record;id=', base)
    
#tableNE1 = '''<table class="results" cellspacing="0" cellpadding="0"><tbody><tr style="background-color:#f1f3f5"><td style="text-align:center;vertical-align:middle"><span class="resultsnote"><b>Archive Record</b></span></td><td style="text-align:left"><a href="?request=record;id=2CAA9515-C192-4438-8C71-641229205508;type=301">2010.6.1</a>, Paper, Letterhead, Kolker Poultry Company Collection<i> -- Found in <b>Scope &amp; Content</b>: ... Wholesale Poultry and Eggs/1251 Fourth Street, <span class="hilite">NE</span>/Union Market.</i></td></tr><tr><td style="text-align:center;vertical-align:middle"><span class="resultsnote"><b>Archive Record</b></span></td><td style="text-align:left"><a href="?request=record;id=B0F74535-965C-4073-96D3-454093444569;type=301">2010.6.3</a>, Paper, letterhead, Kolker Poultry Company Collection<i> -- Found in <b>Scope &amp; Content</b>: ... of Quality Poultry/1251 Fourth Street, <span class="hilite">NE</span>/Union Market/Washington DC.  Red image of hen and rooster in upper left corner.</i></td></tr><tr style="background-color:#f1f3f5"><td style="text-align:center;vertical-align:middle"><a href="javascript:var x=window.open('/31288cgi/mweb.exe?request=image&amp;hex=1984071-2.jpg','x19840712jpg','menubar,scrollbars,resizable,width=680,height=680')"><img src="/31288images/001/thumbs/1984071-2.jpg" style="width:75px;height:53px" alt="Image of 1984.07.1, Print, photographic"></a></td><td style="text-align:left"><a href="?request=record;id=774A1725-03E0-4585-8644-638492232529;type=102">1984.07.1</a>, Print, photographic, Ezras Israel Congregation Collection<i> -- Found in <b>Description</b>: ... of Ezras Israel synagogue at 8th and I Street, <span class="hilite">NE</span>.  
Girl seated near left side with long curls is Augusta Dessoff (nee Silverman).</i></td></tr><tr><td style="text-align:center;vertical-align:middle"><a href="javascript:var x=window.open('/31288cgi/mweb.exe?request=image&amp;hex=19871003.jpg','x19871003jpg','menubar,scrollbars,resizable,width=680,height=680')"><img src="/31288images/001/thumbs/19871003.jpg" style="width:75px;height:60px" alt="Image of 1987.10.03, Print, Photographic"></a></td><td style="text-align:left"><a href="?request=record;id=6B8A33DF-22BB-49B2-BF5E-219301176789;type=102">1987.10.03</a>, Print, Photographic<i> -- Found in <b>Description</b>: Asher Pomerantz in front of store on Kramer St, <span class="hilite">NE</span>, 1921.  Family lived behind store.</i></td></tr><tr style="background-color:#f1f3f5"><td style="text-align:center;vertical-align:middle"><a href="javascript:var x=window.open('/31288cgi/mweb.exe?request=image&amp;hex=1987111.JPG','x1987111JPG','menubar,scrollbars,resizable,width=680,height=680')"><img src="/31288images/002/thumbs/1987111.JPG" style="width:75px;height:60px" alt="Image of 1987.11.1, Print, Photographic"></a></td><td style="text-align:left"><a href="?request=record;id=7A68C254-7B97-46D5-9E2A-217340996450;type=102">1987.11.1</a>, Print, Photographic<i> -- Found in <b>Description</b>: ... second-and furniture store at 600 H Street, <span class="hilite">NE</span>.</i></td></tr><tr><td style="text-align:center;vertical-align:middle"><span class="resultsnote"><b>Photo Record</b></span></td><td style="text-align:left"><a href="?request=record;id=440ABABA-0771-4DF0-BADB-800003982782;type=102">1990.10.8</a>, Print, photographic<i> -- Found in <b>Description</b>: ... 
of Reliable Footwear Shoestore on H Street, <span class="hilite">NE</span>.</i></td></tr><tr style="background-color:#f1f3f5"><td style="text-align:center;vertical-align:middle"><a href="javascript:var x=window.open('/31288cgi/mweb.exe?request=image&amp;hex=1993221.JPG','x1993221JPG','menubar,scrollbars,resizable,width=680,height=680')"><img src="/31288images/001/thumbs/1993221.JPG" style="width:61px;height:75px" alt="Image of 1993.22.1, Print, Photographic"></a></td><td style="text-align:left"><a href="?request=record;id=1CB6B6C1-B0B7-43A8-841C-545322863881;type=102">1993.22.1</a>, Print, Photographic<i> -- Found in <b>Description</b>: ... Food Market at 5th and E. Capitol Street, <span class="hilite">NE</span>, with Yiddish newspaper the Jewish Daily Forward.</i></td></tr><tr><td style="text-align:center;vertical-align:middle"><a href="javascript:var x=window.open('/31288cgi/mweb.exe?request=image&amp;hex=2002116.JPG','x2002116JPG','menubar,scrollbars,resizable,width=680,height=680')"><img src="/31288images/001/thumbs/2002116.JPG" style="width:52px;height:75px" alt="Image of 2002.1.16, Print, Photographic"></a></td><td style="text-align:left"><a href="?request=record;id=27B96EDC-247C-4ACC-AA8C-113619169084;type=102">2002.1.16</a>, Print, Photographic, Hais Family Collection<i> -- Found in <b>Description</b>: ... 
in front of DGS Market at 7th and C Street, <span class="hilite">NE</span>, with mother Ida Flax Hais and another woman</i></td></tr><tr style="background-color:#f1f3f5"><td style="text-align:center;vertical-align:middle"><a href="javascript:var x=window.open('/31288cgi/mweb.exe?request=image&amp;hex=2004251.jpg','x2004251jpg','menubar,scrollbars,resizable,width=680,height=680')"><img src="/31288images/001/thumbs/2004251.jpg" style="width:50px;height:75px" alt="Image of 2004.25.1, Print, Photographic"></a></td><td style="text-align:left"><a href="?request=record;id=C6506737-9E68-4B55-BD58-278911158496;type=102">2004.25.1</a>, Print, Photographic</td></tr></tbody></table>'''



# NOTE(review): `tableNE` is never defined on a fresh kernel -- only a
# commented-out `tableNE1` exists above.  This relies on hidden kernel state
# from a deleted/edited cell; define `tableNE` before running.
NElinks = (list(PicFilter(tableNE)))


here are 139 pictures

In [11]:
import requests 
from collections import defaultdict
filtered = []

def dictionarybuild(url):
    """Fetch one photo-record page and collect its labelled fields.

    The raw page is split on the per-field label cell; for each fragment,
    the value following the ``results`` cell is extracted for any known
    field name.  The resulting record (plus the source ``url``) is
    appended to the module-level ``filtered`` list.

    NOTE(review): field detection is substring-based, so e.g. "Date" also
    matches fragments containing "Update" -- confirm against the markup.

    Parameters
    ----------
    url : str
        Record page URL to fetch.
    """
    # One loop over the field names replaces five copy-pasted blocks.
    FIELDS = ('Description', 'Title', 'Date', 'Place', 'Collection')
    page = requests.get(url)
    text = str(page.content)
    fragments = text.split("<td class=\\'fr_label\\' style=\\'vertical-align:top\\'>")

    record = defaultdict(list)
    record['url'] = str(url)  # fix: was re-assigned on every loop iteration
    for fragment in fragments:
        for field in FIELDS:
            if field in fragment:
                parts = fragment.split("</td><td class=results>")
                if len(parts) > 1:  # guard: original indexed [1] unconditionally
                    value = parts[1]
                    # Keep everything up to the next tag (find() may be -1,
                    # matching the original's slice-to-[-1] behavior).
                    record[field] = value[:value.find('<')]

    filtered.append(record)

# Build a record dict for every picture link collected above.
# Fix: `links` is a local inside PicFilter and is undefined here on a fresh
# kernel; the collected list is `NElinks`.
for link in NElinks:
    dictionarybuild(link)

In [13]:
import googlemaps
from collections import defaultdict
from pprint import pprint
NEfinal = []
# TODO: load the key from an environment variable; never hardcode real keys.
gmaps = googlemaps.Client(key='insert API key')
for rec in NE:
    place = rec['Place']
    if 'DC' not in place:
        # Fix: the original concatenated without a separator, producing
        # addresses like "1251 Fourth Street, NEWashington, DC".
        place = place + ", Washington, DC"
    geocode_result = gmaps.geocode(place)
    # Extract lat/long from the JSON response as a (lat, lng) tuple.
    location = geocode_result[0]['geometry']['location']
    rec['latlong'] = (location['lat'], location['lng'])
    # 'partial_match' is only present when Google could not match the address
    # exactly.  Test membership instead of the original bare `except:`, which
    # also hid unrelated errors (network failures, typos, KeyboardInterrupt).
    if 'partial_match' in geocode_result[0]:
        rec['geocheck'] = 'problematic geo. Need more specific address'
    else:
        rec['geocheck'] = 'this is fine'
    NEfinal.append(rec)
#pprint(NEfinal)

In [22]:
#concatenate notebooks
# Concatenate the NE and non-NE result sets.
# Fix: `pd` was never imported (only `from pandas import concat`), so this
# cell raised NameError on a fresh kernel; also DataFrame.from_csv was
# removed from pandas -- read_csv with index_col=0/parse_dates=True is the
# documented replacement with from_csv's defaults.
from pandas import concat
import pandas as pd

nonNE = pd.read_csv('exampleNotNEFinal.csv', index_col=0, parse_dates=True,
                    encoding="ISO-8859-1")
NE = pd.read_csv('exampleNE.csv', index_col=0, parse_dates=True,
                 encoding="ISO-8859-1")
pieces = [NE, nonNE]
concatenated = pd.concat(pieces)

In [24]:
# Persist the combined frame for the next stage.
# NOTE(review): the filename says "NotNE" but the frame holds BOTH sets --
# confirm this is the intended output name.
concatenated.to_csv('exampleNotNE.csv')

In [89]:
# Fix obviously bad geocodes by manually re-querying a corrected address.
place = '3rd Street NW and G Street NW, Washington, D.C.'

import googlemaps

from collections import defaultdict
from pprint import pprint

# Fix: dropped the leftover `NEfinal = []` that clobbered the list built in
# the earlier geocoding cell.
# TODO: load the key from an environment variable; never hardcode real keys.
gmaps = googlemaps.Client(key='insert API key')
geocode_result = gmaps.geocode(place)
# Extract lat/long from the JSON response.
location = geocode_result[0]['geometry']['location']
lat = location['lat']
lng = location['lng']  # fix: was misspelled `lnng`
print(lat)
print(lng)


38.8983257
-77.01522369999999

In [91]:
from math import radians, cos, sin, asin, sqrt

def haversine(lon1, lat1, lon2, lat2):
    """
    Great-circle distance in kilometers between two points on the earth,
    each given as (longitude, latitude) in decimal degrees.
    """
    EARTH_RADIUS_KM = 6371  # use 3956 for miles

    # Convert to radians once, up front.
    phi1, phi2 = radians(lat1), radians(lat2)
    lam1, lam2 = radians(lon1), radians(lon2)

    # Haversine formula: half-angle differences in latitude and longitude.
    half_dphi = (phi2 - phi1) / 2
    half_dlam = (lam2 - lam1) / 2
    chord = sin(half_dphi) ** 2 + cos(phi1) * cos(phi2) * sin(half_dlam) ** 2
    central_angle = 2 * asin(sqrt(chord))

    return EARTH_RADIUS_KM * central_angle


# Reference point: the U.S. Capitol.
capitol_lat = 38.889931 
capitol_long = -77.009003
# NOTE(review): `lon1`/`lat1` are parameters local to haversine() and are not
# defined at this scope on a fresh kernel -- this call relies on hidden kernel
# state (Out[91] = 10.639... came from whatever values were live).  Pass
# explicit coordinates instead.
haversine(lon1, lat1, capitol_long, capitol_lat)


Out[91]:
10.63961669975575

In [94]:
# Fix: DataFrame.from_csv was removed from pandas; read_csv with
# index_col=0/parse_dates=True is the replacement with from_csv's defaults.
# (`pd` is imported here so this cell runs standalone on a fresh kernel.)
import pandas as pd

almostfinal = pd.read_csv('09252016_JHSGW_fixed_almost_final.csv',
                          index_col=0, parse_dates=True,
                          encoding="ISO-8859-1")

# NOTE(review): `all` shadows the builtin; kept unchanged because the cells
# below depend on the name, but consider renaming (e.g. `records`).
all = almostfinal.to_dict('records')

In [95]:
# Peek at the first record to sanity-check the parsed fields.
all[0]


Out[95]:
{'Place': '5404 Temple Hills Road, Temple Hills, MD',
 'latitude': 38.812175,
 'Title': nan,
 'Collection': 'Congregation Shaare Tikvah',
 'Description': nan,
 'longitude': -76.9374,
 'Date': '1967-2002',
 'url': 'http://www.jhsgw.org/exhibitions/online/goldberg/photographs/congregation-shaare-tikvah?slide=5'}

In [100]:
# Annotate every record with its distance (km) from the U.S. Capitol.
for record in all:
    record['capitol_distance'] = haversine(
        record['longitude'], record['latitude'], capitol_long, capitol_lat
    )

In [103]:
# Back to a DataFrame now that every record carries capitol_distance.
final_frame = pd.DataFrame(all)

In [111]:
# Export the final enriched dataset.
final_frame.to_csv('09252016final_final.csv')

In [ ]: