In [14]:
import lxml.html
"""take table from jhsgw search results, which are written in some annoying javascript,
so I had to just copy the tables from the HTML. Then filter for results with pictures
"""
def PicFilter(table, base_url='http://jhsgw.pastperfect-online.com/31288cgi/mweb.exe'):
    """Yield absolute record URLs for the search-result rows that have a picture.

    The pasted results table is split on ``</tr>``; rows containing the text
    'Archive Record' or 'Photo Record' are dropped, and the remaining markup is
    parsed so that every ``<a href>`` containing ``id=`` can be collected.

    Parameters
    ----------
    table : str
        HTML of a results table copied from the search page.
    base_url : str, optional
        Prefix put in front of the relative ``?request=record;id=`` links.
        Defaults to the URL that was previously hard-coded.

    Yields
    ------
    str
        One link per kept row, with the relative record query made absolute.

    Notes
    -----
    This is a generator: the "here are N pictures" line is printed when the
    first item is requested, not when the function is called.
    """
    rows = table.split('</tr>')
    kept_rows = [
        row for row in rows
        if 'Archive Record' not in row and 'Photo Record' not in row
    ]
    markup = ','.join(kept_rows)

    # Nothing left to parse (empty paste): report zero instead of letting the
    # parser raise on an empty document.
    if not markup.strip():
        print("here are 0 pictures")
        return

    try:
        document = lxml.html.document_fromstring(markup)
    except lxml.etree.ParserError:
        # Raised when the leftover markup contains no parsable element, e.g.
        # only the trailing closing tags remain after every row was filtered.
        print("here are 0 pictures")
        return

    hrefs = document.xpath('//a/@href')
    links = [href for href in hrefs if "id=" in href]
    print("here are " + str(len(links)) + " pictures")

    record_query = '?request=record;id='
    for href in links:
        yield href.replace(record_query, base_url + record_query)
#tableNE1 = '''<table class="results" cellspacing="0" cellpadding="0"><tbody><tr style="background-color:#f1f3f5"><td style="text-align:center;vertical-align:middle"><span class="resultsnote"><b>Archive Record</b></span></td><td style="text-align:left"><a href="?request=record;id=2CAA9515-C192-4438-8C71-641229205508;type=301">2010.6.1</a>, Paper, Letterhead, Kolker Poultry Company Collection<i> -- Found in <b>Scope & Content</b>: ... Wholesale Poultry and Eggs/1251 Fourth Street, <span class="hilite">NE</span>/Union Market.</i></td></tr><tr><td style="text-align:center;vertical-align:middle"><span class="resultsnote"><b>Archive Record</b></span></td><td style="text-align:left"><a href="?request=record;id=B0F74535-965C-4073-96D3-454093444569;type=301">2010.6.3</a>, Paper, letterhead, Kolker Poultry Company Collection<i> -- Found in <b>Scope & Content</b>: ... of Quality Poultry/1251 Fourth Street, <span class="hilite">NE</span>/Union Market/Washington DC. Red image of hen and rooster in upper left corner.</i></td></tr><tr style="background-color:#f1f3f5"><td style="text-align:center;vertical-align:middle"><a href="javascript:var x=window.open('/31288cgi/mweb.exe?request=image&hex=1984071-2.jpg','x19840712jpg','menubar,scrollbars,resizable,width=680,height=680')"><img src="/31288images/001/thumbs/1984071-2.jpg" style="width:75px;height:53px" alt="Image of 1984.07.1, Print, photographic"></a></td><td style="text-align:left"><a href="?request=record;id=774A1725-03E0-4585-8644-638492232529;type=102">1984.07.1</a>, Print, photographic, Ezras Israel Congregation Collection<i> -- Found in <b>Description</b>: ... of Ezras Israel synagogue at 8th and I Street, <span class="hilite">NE</span>. 
Girl seated near left side with long curls is Augusta Dessoff (nee Silverman).</i></td></tr><tr><td style="text-align:center;vertical-align:middle"><a href="javascript:var x=window.open('/31288cgi/mweb.exe?request=image&hex=19871003.jpg','x19871003jpg','menubar,scrollbars,resizable,width=680,height=680')"><img src="/31288images/001/thumbs/19871003.jpg" style="width:75px;height:60px" alt="Image of 1987.10.03, Print, Photographic"></a></td><td style="text-align:left"><a href="?request=record;id=6B8A33DF-22BB-49B2-BF5E-219301176789;type=102">1987.10.03</a>, Print, Photographic<i> -- Found in <b>Description</b>: Asher Pomerantz in front of store on Kramer St, <span class="hilite">NE</span>, 1921. Family lived behind store.</i></td></tr><tr style="background-color:#f1f3f5"><td style="text-align:center;vertical-align:middle"><a href="javascript:var x=window.open('/31288cgi/mweb.exe?request=image&hex=1987111.JPG','x1987111JPG','menubar,scrollbars,resizable,width=680,height=680')"><img src="/31288images/002/thumbs/1987111.JPG" style="width:75px;height:60px" alt="Image of 1987.11.1, Print, Photographic"></a></td><td style="text-align:left"><a href="?request=record;id=7A68C254-7B97-46D5-9E2A-217340996450;type=102">1987.11.1</a>, Print, Photographic<i> -- Found in <b>Description</b>: ... second-and furniture store at 600 H Street, <span class="hilite">NE</span>.</i></td></tr><tr><td style="text-align:center;vertical-align:middle"><span class="resultsnote"><b>Photo Record</b></span></td><td style="text-align:left"><a href="?request=record;id=440ABABA-0771-4DF0-BADB-800003982782;type=102">1990.10.8</a>, Print, photographic<i> -- Found in <b>Description</b>: ... 
of Reliable Footwear Shoestore on H Street, <span class="hilite">NE</span>.</i></td></tr><tr style="background-color:#f1f3f5"><td style="text-align:center;vertical-align:middle"><a href="javascript:var x=window.open('/31288cgi/mweb.exe?request=image&hex=1993221.JPG','x1993221JPG','menubar,scrollbars,resizable,width=680,height=680')"><img src="/31288images/001/thumbs/1993221.JPG" style="width:61px;height:75px" alt="Image of 1993.22.1, Print, Photographic"></a></td><td style="text-align:left"><a href="?request=record;id=1CB6B6C1-B0B7-43A8-841C-545322863881;type=102">1993.22.1</a>, Print, Photographic<i> -- Found in <b>Description</b>: ... Food Market at 5th and E. Capitol Street, <span class="hilite">NE</span>, with Yiddish newspaper the Jewish Daily Forward.</i></td></tr><tr><td style="text-align:center;vertical-align:middle"><a href="javascript:var x=window.open('/31288cgi/mweb.exe?request=image&hex=2002116.JPG','x2002116JPG','menubar,scrollbars,resizable,width=680,height=680')"><img src="/31288images/001/thumbs/2002116.JPG" style="width:52px;height:75px" alt="Image of 2002.1.16, Print, Photographic"></a></td><td style="text-align:left"><a href="?request=record;id=27B96EDC-247C-4ACC-AA8C-113619169084;type=102">2002.1.16</a>, Print, Photographic, Hais Family Collection<i> -- Found in <b>Description</b>: ... 
in front of DGS Market at 7th and C Street, <span class="hilite">NE</span>, with mother Ida Flax Hais and another woman</i></td></tr><tr style="background-color:#f1f3f5"><td style="text-align:center;vertical-align:middle"><a href="javascript:var x=window.open('/31288cgi/mweb.exe?request=image&hex=2004251.jpg','x2004251jpg','menubar,scrollbars,resizable,width=680,height=680')"><img src="/31288images/001/thumbs/2004251.jpg" style="width:50px;height:75px" alt="Image of 2004.25.1, Print, Photographic"></a></td><td style="text-align:left"><a href="?request=record;id=C6506737-9E68-4B55-BD58-278911158496;type=102">2004.25.1</a>, Print, Photographic</td></tr></tbody></table>'''
# Materialise the generator so the picture links can be reused.
# NOTE(review): `tableNE` is not assigned in the visible cells (only a
# commented-out `tableNE1` appears above) - confirm where it is defined.
NElinks = list(PicFilter(tableNE))
In [11]:
import requests
from collections import defaultdict
# Module-level accumulator: dictionarybuild() appends one record per page here.
filtered = []

# Fields copied out of a record page, and the markup that frames each one:
# a label cell, immediately followed by the value cell.
_RECORD_FIELDS = ('Description', 'Title', 'Date', 'Place', 'Collection')
_LABEL_CELL = "<td class='fr_label' style='vertical-align:top'>"
_VALUE_CELL = "</td><td class=results>"


def _extract_fields(html):
    """Return ``{field: value}`` for each wanted label found in ``html``."""
    found = {}
    # The first chunk is everything before the first label cell (page
    # preamble), so it never holds a label/value pair and is skipped.
    for chunk in html.split(_LABEL_CELL)[1:]:
        label, separator, remainder = chunk.partition(_VALUE_CELL)
        if not separator:
            # Label cell without the expected value cell: nothing to read.
            continue
        tag_start = remainder.find('<')
        value = remainder if tag_start == -1 else remainder[:tag_start]
        # Match on the label text only, so a value that merely mentions
        # "Date" or "Title" is not stored under the wrong key.
        for field in _RECORD_FIELDS:
            if field in label:
                found[field] = value
    return found


def dictionarybuild(url):
    """Fetch one record page and collect its labelled fields.

    The page is downloaded with ``requests.get`` and decoded through
    ``Response.text`` (the previous ``str(bytes)`` call parsed the bytes
    *repr*, leaving escape artefacts in the values). The record always
    carries ``'url'``; the keys in ``_RECORD_FIELDS`` are added when the
    matching label cell is present on the page.

    Parameters
    ----------
    url : str
        Address of the record page.

    Returns
    -------
    collections.defaultdict
        The record, which is also appended to the module-level ``filtered``
        list. Looking up a missing key yields an empty list.
    """
    page = requests.get(url)
    result = defaultdict(list)
    result['url'] = str(url)
    result.update(_extract_fields(page.text))
    filtered.append(result)
    return result
#print(filtered)
# Scrape every record page; each call appends its record to `filtered`.
# NOTE(review): `links` is not a module-level name in the visible cells (it is
# local to PicFilter; the list built above is `NElinks`) - confirm its source.
for link in links:
    dictionarybuild(link)
In [13]:
import googlemaps
from collections import defaultdict
from pprint import pprint
import os

# Geocode every record's 'Place' and store the coordinates plus a quality flag.
# NOTE(review): `NE` is expected to be an iterable of dict-like records here;
# the only visible assignment of `NE` is a DataFrame in a later cell - confirm.
NEfinal = []
# Prefer a key from the environment; the placeholder is kept as the fallback so
# the cell behaves as before when the variable is not set.
gmaps = googlemaps.Client(key=os.environ.get('GOOGLE_MAPS_API_KEY', 'insert API key'))
for rec in NE:
    place = rec['Place']
    if 'DC' not in place:
        # Add the separator so "600 H Street NE" does not become
        # "600 H Street NEWashington, DC".
        place = place + ", Washington, DC"
    geocode_result = gmaps.geocode(place)
    if not geocode_result:
        # No match at all: flag the record instead of crashing on [0].
        rec['latlong'] = None
        rec['geocheck'] = 'problematic geo. Need more specific address'
        NEfinal.append(rec)
        continue
    best_match = geocode_result[0]
    # extract lat long from the JSON object as a (lat, lng) tuple
    location = best_match['geometry']['location']
    latlong = location['lat'], location['lng']
    rec['latlong'] = latlong
    # Presence of the 'partial_match' key marks the result as problematic.
    if 'partial_match' in best_match:
        rec['geocheck'] = 'problematic geo. Need more specific address'
    else:
        rec['geocheck'] = 'this is fine'
    NEfinal.append(rec)
#pprint(NEfinal)
In [22]:
# Concatenate the NE and non-NE result sets into one frame.
import pandas as pd
from pandas import concat

# DataFrame.from_csv was removed in pandas 1.0; read_csv with index_col=0 and
# parse_dates=True reproduces its defaults (first column becomes the index).
nonNE = pd.read_csv('exampleNotNEFinal.csv', index_col=0, parse_dates=True,
                    encoding="ISO-8859-1")
NE = pd.read_csv('exampleNE.csv', index_col=0, parse_dates=True,
                 encoding="ISO-8859-1")
pieces = [NE, nonNE]
concatenated = pd.concat(pieces)
In [24]:
# Write the combined frame (NE and non-NE rows together) to disk.
concatenated.to_csv('exampleNotNE.csv')
In [89]:
# Fix obviously bad geos: geocode one hand-written address and print the
# coordinates of the first hit.
import googlemaps
from collections import defaultdict
from pprint import pprint

place = '3rd Street NW and G Street NW, Washington, D.C.'
# NOTE(review): this rebinds NEfinal to an empty list and nothing in this cell
# fills it - looks like copy/paste residue; confirm before relying on NEfinal.
NEfinal = []
gmaps = googlemaps.Client(key='insert API key')
geocode_result = gmaps.geocode(place)

# The first result's geometry holds the latitude/longitude pair.
first_hit_location = geocode_result[0]['geometry']['location']
lat, lnng = first_hit_location['lat'], first_hit_location['lng']
print(lat)
print(lnng)
In [91]:
from math import radians, cos, sin, asin, sqrt
def haversine(lon1, lat1, lon2, lat2, radius=6371):
    """Great-circle distance between two points given in decimal degrees.

    Parameters
    ----------
    lon1, lat1 : float
        Longitude and latitude of the first point, in decimal degrees.
    lon2, lat2 : float
        Longitude and latitude of the second point, in decimal degrees.
    radius : float, optional
        Sphere radius; the result is in the same unit. Defaults to 6371
        (Earth radius in kilometres). Use 3956 for miles.

    Returns
    -------
    float
        Distance along the sphere's surface.
    """
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(radians, (lon1, lat1, lon2, lat2))

    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    # Rounding can push sqrt(a) a hair above 1.0 for (near-)antipodal points,
    # which would make asin raise a math domain error; clamp it.
    c = 2 * asin(min(1.0, sqrt(a)))
    return c * radius
# Reference point (decimal degrees) that distances are measured from.
capitol_lat = 38.889931
capitol_long = -77.009003
# NOTE(review): `lon1` and `lat1` appear in the visible cells only as haversine
# parameters, not as module-level variables, so this call depends on kernel
# state defined elsewhere - confirm (the previous cell produces `lnng`/`lat`).
haversine(lon1, lat1, capitol_long, capitol_lat)
Out[91]:
In [94]:
# DataFrame.from_csv was removed in pandas 1.0; read_csv with index_col=0 and
# parse_dates=True reproduces its defaults (first column becomes the index).
almostfinal = pd.read_csv('09252016_JHSGW_fixed_almost_final.csv', index_col=0,
                          parse_dates=True, encoding="ISO-8859-1")
# One dict per row. NOTE: the name `all` shadows the builtin; it is kept
# because the following cells refer to it.
all = almostfinal.to_dict('records')
In [95]:
# Show the first record as a sanity check.
all[0]
Out[95]:
In [100]:
# Store on every record its haversine distance from the capitol reference
# point (kilometres with haversine's default radius).
for rec in all:
    rec.update(
        capitol_distance=haversine(
            rec['longitude'], rec['latitude'], capitol_long, capitol_lat
        )
    )
In [103]:
# Rebuild a DataFrame from the list of record dicts.
final_frame = pd.DataFrame(all)
In [111]:
# Write the final frame to CSV.
final_frame.to_csv('09252016final_final.csv')
In [ ]: