A Something-to-Do-in-Boston Suggestion Engine
In [2]:
import xml.etree.cElementTree as ET
import pprint as pp
import re
import os
import json
import string
In [3]:
# NOTE(review): hardcoded absolute local path -- point this at your own copy
# of the Boston OpenStreetMap extract before running
filename = '/Users/excalibur/Dropbox/nanodegree/data/boston_massachusetts.osm'
In [4]:
# system beep
def finished():
    """Sound the terminal bell twice to signal that a long step is done."""
    for _ in range(2):
        os.system("printf '\a'")
In [5]:
# replace sets with lists for JSON
def set_default(obj):
    """JSON serializer fallback: turn sets into lists, reject anything else.

    Passed as `default=` to json.dump; raising TypeError for non-sets keeps
    json's normal error behavior for genuinely unserializable objects.
    """
    if not isinstance(obj, set):
        raise TypeError
    return list(obj)
In [6]:
# make a JSON file
def make_places_file(places, path='places.json'):
    """Write `places` to `path` as pretty-printed JSON, then beep.

    `path` defaults to 'places.json', matching the original hard-coded
    behavior, so existing calls are unaffected. Sets inside `places` are
    serialized as lists via set_default.
    """
    with open(path, 'w') as f:
        json.dump(places, f, default=set_default, sort_keys=True,
                  indent=2, separators=(',', ' : '))
    finished()
In [7]:
# Map each OSM tag key we care about to the key stored on a place record.
# (Replaces fourteen copy-pasted if-statements that each re-read tag.attrib.)
OSM_KEY_MAP = {
    'addr:city': 'city',
    'addr:housenumber': 'number',
    'addr:street': 'street',
    'address': 'address',
    'amenity': 'amenity',
    'cuisine': 'cuisine',
    'designation': 'designation',
    'leisure': 'leisure',
    'name': 'name',
    'note': 'note',
    'opening_hours': 'opening_hours',
    'phone': 'phone',
    'shop': 'shop',
    'website': 'website',
}

# Stream the (large) OSM file, capturing every <node> with its coordinates
# and any interesting <tag k="..." v="..."> children.
places = {}
for event, element in ET.iterparse(filename):
    if element.tag == 'node':
        place = {'lat': element.attrib['lat'], 'lon': element.attrib['lon']}
        for tag in element:
            k = tag.attrib['k']
            if k in OSM_KEY_MAP:
                place[OSM_KEY_MAP[k]] = tag.attrib['v']
        places[element.attrib['id']] = place
        # free the processed subtree so memory stays bounded on the big file
        element.clear()
finished()
In [8]:
len(places)  # how many nodes were captured from the raw OSM file
Out[8]:
In [9]:
# snapshot the raw extraction to places.json before pruning
make_places_file(places)
We don't want places with only latitude and longitude.
In [10]:
# drop places that carry only lat/lon -- nothing there worth suggesting
to_remove = [pid for pid, info in places.items() if len(info) <= 2]
for pid in to_remove:
    del places[pid]
Check to see how many places are left.
In [11]:
len(places)  # places remaining after removing lat/lon-only nodes
Out[11]:
Well, there goes most of the data, which makes sense, since it was map data.
Similarly, we don't want places that don't have some sort of helpful label.
In [12]:
# which keys appear on places that lack a 'name'?
keys = set()
for info in places.values():
    if 'name' not in info:
        keys.update(info.keys())
print(keys)
So those labels listed above exist in various combinations for places even when the name label is absent.
Since name might not be used, but something else like shop could be, eliminate places that don't have some sort of helpful label (i.e., one from the above set).
In [13]:
# drop places missing every usable label key
label_keys = ('name', 'shop', 'amenity', 'leisure', 'note')
to_remove = [pid for pid, info in places.items()
             if not any(k in info for k in label_keys)]
for pid in to_remove:
    del places[pid]
Check to see how many places are left.
In [14]:
len(places)  # places remaining after removing unlabeled nodes
Out[14]:
Our places are slowly evaporating, but that's OK.
In [15]:
def display_labels(label):
    """Print the distinct values stored under `label` across all places.

    Output is capped at 51 values so nbviewer does not truncate the page.
    """
    values = set()
    for info in places.values():
        if label in info:
            values.add(info[label])
    for shown, value in enumerate(values):
        print(value)
        # for nbviewer, stop after 51 values so output stays short
        if shown >= 50:
            break
In [17]:
# sample the distinct place names
display_labels('name')
After scanning the above list, some obvious candidates for removal appear. Schools and churches probably wouldn't be popular destinations (some folks would disagree, of course).
Names with @ seem to be cross-streets (while not a terrible idea for suggested destinations, cross-streets obviously lack certain enticing details). Similarly, any names that are merely attempts at addresses should be nixed.
In [18]:
# Names to weed out: schools/churches, cross-streets (@), and address-like
# names. The street abbreviations are wrapped in \b word boundaries so they
# only match as whole tokens -- the unanchored originals also matched inside
# ordinary names ('St' in 'Starbucks', 'Dr' in 'Drive', 'Ave' in 'Avery').
remove_regex = re.compile(r'School|Academy|Elem|Church|@|\bSt\b|\bAve\b|\bPkwy\b|\bRd\b|\bHwy\b|\bDr\b|I-')
In [19]:
# drop places whose name matches the removal regex
to_remove = [pid for pid, info in places.items()
             if 'name' in info and remove_regex.search(info['name'])]
for pid in to_remove:
    del places[pid]
In [20]:
# re-sample names after the regex-based removal
display_labels('name')
There are likely plenty of other types of names to weed out, but the current batch should be good for now.
Get a new count for the number of places that remain.
In [21]:
len(places)  # places remaining after the name-based removal
Out[21]:
Ah! Are we going to have any left when all of this is over!?
Moving on to the other keys: shop, amenity, designation, cuisine, and leisure, all seem like they can function as general labels and, thus, need not be distinguished from one another.
In [22]:
# preview distinct values for each candidate label key
for label in ['shop','amenity','designation','cuisine','leisure']:
print "\n------- " + label.upper() + " -------\n"
display_labels(label)
The above labels are good candidates for snakecasify-ing and should be added to a set for each place (since there could be unwanted overlap between them); the set should then be converted to a list for the sake of JSON.
In [23]:
# Merge shop/amenity/designation/cuisine/leisure into one snake_cased
# 'labels' list per place (built as a set first to de-duplicate overlap,
# then converted to a list for the sake of JSON).
for info in places.values():
    merged = set()
    for key in ('shop', 'amenity', 'designation', 'cuisine', 'leisure'):
        if key in info:
            # snakecasify
            merged.add(info[key].replace(',', '').lower().replace(' ', '_'))
    info['labels'] = list(merged)
In [24]:
def remove_keys(keys):
    """Delete each key in `keys` from every place that has it."""
    for info in places.values():
        for key in keys:
            info.pop(key, None)
In [25]:
# the originals are now folded into 'labels'; drop them
remove_keys(['shop','amenity','designation','cuisine','leisure'])
Save progress.
In [26]:
# save progress to places.json
make_places_file(places)
In [27]:
def check_keys():
    """Print the set of all keys currently in use across places."""
    seen = set()
    for info in places.values():
        seen.update(info.keys())
    print(seen)
In [28]:
# confirm which keys survive so far
check_keys()
In [29]:
# inspect 'note' and 'opening_hours' values before deciding their fate
for label in ['note','opening_hours']:
print "\n------- " + label.upper() + " -------\n"
display_labels(label)
For our purposes, note and opening_hours seem a little lame, chaotic, random, and infrequent. Get rid of 'em.
In [30]:
# note/opening_hours judged too chaotic and sparse to keep
remove_keys(['note','opening_hours'])
In [31]:
# inspect website values before normalizing them
for label in ['website']:
print "\n------- " + label.upper() + " -------\n"
display_labels(label)
People like websites. Fix 'em.
In [32]:
# prepend a scheme to bare domains so links actually work;
# http_regex.match only tests the start of the string, which is what we want
http_regex = re.compile(r'http://|https://')
for place in places:
if 'website' in places[place]:
if not http_regex.match(places[place]['website']):
places[place]['website'] = 'http://' + places[place]['website']
# NOTE(review): indentation was lost in this export -- the print most
# plausibly sits under the 'website' check, echoing every website after
# fixing; confirm against the original notebook before restructuring
print places[place]['website']
In [33]:
# inspect phone values before normalizing them
for label in ['phone']:
print "\n------- " + label.upper() + " -------\n"
display_labels(label)
Bleck. Those phone number strings are nutzo.
I think people still use phones for calling though. Might as well fix 'em.
In [34]:
# Normalize phone numbers to XXX-XXX-XXXX by walking each raw string
# backwards, keeping the last 10 digits and inserting dashes from the right.
phone_regex = re.compile(r'\d{3}-\d{3}-\d{4}')
digit_regex = re.compile(r'\d')  # hoisted: was recompiled for every place
for info in places.values():
    if 'phone' in info:
        if not phone_regex.match(info['phone']):
            new_phone = []
            digit_count = 0
            for ch in reversed(info['phone']):
                if digit_regex.match(ch):
                    new_phone.insert(0, ch)
                    digit_count += 1
                    # dash after the 4th and 7th digit (counting from the end)
                    if digit_count == 4 or digit_count == 7:
                        new_phone.insert(0, '-')
                    # stop once 10 digits are kept (drops country codes)
                    if digit_count > 9:
                        break
            info['phone'] = ''.join(new_phone)
        # print any that got through
        if not phone_regex.match(info['phone']):
            print("problem number: " + info['phone'])
Scanning the numbers displayed in the output window above reveals 617 357 LUCK (mangled into -161-7357) as the likely culprit.
In [35]:
# repair the mangled vanity number: 617-357-LUCK spells 617-357-5825
for info in places.values():
    if info.get('phone') == '-161-7357':
        info['phone'] = '617-357-5825'
In [36]:
# re-inspect phone values after the fix
for label in ['phone']:
print "\n------- " + label.upper() + " -------\n"
display_labels(label)
Scanning again, I noticed , Forest City Management in the displayed results.
It escaped the cleanup because phone_regex.match only checks the start of the string, and this value begins with a well-formed number. Drop it like it's hot.
In [37]:
# confirm the offending value is present before mutating it
for info in places.values():
    if info.get('phone') == '617-494-9330, Forest City Management':
        print(info['phone'])
In [38]:
# strip the trailing company name, keeping just the number
for info in places.values():
    if info.get('phone') == '617-494-9330, Forest City Management':
        info['phone'] = '617-494-9330'
Check for any other oddballs by looking for string length 12.
In [39]:
# XXX-XXX-XXXX is 12 characters; print anything that deviates
for info in places.values():
    if 'phone' in info and len(info['phone']) != 12:
        print(info['phone'])
In [40]:
# re-confirm the surviving keys
check_keys()
In [41]:
# inspect city values before normalizing them
for label in ['city']:
print "\n------- " + label.upper() + " -------\n"
display_labels(label)
Since we're using the cities (we're assuming folks will know we're in Massachusetts), toss any punctuation and references to the state.
Also, capitalize proper nouns.
In [42]:
# Trim ', MA'-style suffixes and capitalize city names.
# (The original also compiled an empty, never-used `state_regex`; removed.)
for info in places.values():
    if 'city' in info:
        # remove state: keep only the part before the first comma
        city = info['city'].split(',')[0]
        # capitalize proper nouns
        info['city'] = string.capwords(city)
In [43]:
# re-inspect city values after normalization
for label in ['city']:
print "\n------- " + label.upper() + " -------\n"
display_labels(label)
Deal with that street address that ended up in the city field. Google says it's in Cambridge.
In [44]:
# show the record whose 'city' is actually a street address
for info in places.values():
    if info.get('city') == '2067 Massachusetts Avenue':
        print(info)
In [45]:
# move the street address into 'address' and set the real city
for info in places.values():
    if info.get('city') == '2067 Massachusetts Avenue':
        info['address'] = info['city']
        info['city'] = 'Cambridge'
In [46]:
# final check on city values
for label in ['city']:
print "\n------- " + label.upper() + " -------\n"
display_labels(label)
In [47]:
# inspect number/street/address values before consolidating them
for label in ['number','street', 'address']:
print "\n------- " + label.upper() + " -------\n"
display_labels(label)
Look for unnecessary duplication, where numbers and/or streets are present when an address is as well.
In [48]:
# show places that carry an address AND redundant number/street fields
for info in places.values():
    if 'address' in info and ('number' in info or 'street' in info):
        print(info)
Remove duplication.
In [49]:
# when a full address exists, the separate number/street fields are redundant
for info in places.values():
    if 'address' in info:
        info.pop('number', None)
        info.pop('street', None)
Numbers without corresponding streets are a bit on the worthless side, so remove them too.
In [50]:
# show places with a house number but no street to attach it to
for info in places.values():
    if 'number' in info and 'street' not in info:
        print(info)
In [51]:
# a house number without a street is useless; drop it
for info in places.values():
    if 'number' in info and 'street' not in info:
        del info['number']
Make addresses from numbers and/or streets.
In [52]:
# synthesize an 'address' from number+street (or street alone) where missing
for info in places.values():
    if 'address' in info:
        continue
    if 'number' in info and 'street' in info:
        info['address'] = info['number'] + " " + info['street']
        del info['number']
        del info['street']
    elif 'street' in info:
        info['address'] = info.pop('street')
In [53]:
# inspect the consolidated addresses
for label in ['address']:
print "\n------- " + label.upper() + " -------\n"
display_labels(label)
Get rid of the city, state, and zip bits.
In [54]:
# keep only the street part of each address (drop city/state/zip after comma)
for info in places.values():
    if 'address' in info:
        info['address'] = info['address'].partition(',')[0]
In [55]:
# re-inspect addresses after trimming
for label in ['address']:
print "\n------- " + label.upper() + " -------\n"
display_labels(label)
Attempt to make street designations uniform.
Check out the last string in each address.
In [56]:
# collect the final whitespace-separated token of every address
last_string_set = set()
for info in places.values():
    if 'address' in info:
        last_string_set.add(info['address'].split(' ')[-1])
print(last_string_set)
In [57]:
# Normalize the street-designation token at the end of each address:
# capitalize it, strip punctuation, expand abbreviations, and blank out
# known junk tokens seen in the results above.
swap = { 'Rd':'Road', 'Plz':'Plaza', 'Ln':'Lane', 'Sq':'Square', 'Hwy':'Highway', 'St':'Street', 'Ave':'Avenue', 'Pkwy':'Parkway', 'Blvd':'Boulevard' }
for info in places.values():
    if 'address' in info:
        parts = info['address'].split(' ')
        # capitalize
        token = string.capwords(parts[-1])
        # remove punctuation (Python 2 str.translate signature)
        token = token.translate(None, '.,')
        # swap abbreviation for full designation
        token = swap.get(token, token)
        # remove problematic ones showing up in results list above
        if token == 'Ma' or token == '02467':
            token = ''
        parts[-1] = token
        # rejoin address string
        info['address'] = ' '.join(parts)
In [58]:
# re-collect the final token of every address to verify the normalization
last_string_set = set()
for info in places.values():
    if 'address' in info:
        last_string_set.add(info['address'].split(' ')[-1])
print(last_string_set)
In [59]:
# re-inspect addresses after designation normalization
for label in ['address']:
print "\n------- " + label.upper() + " -------\n"
display_labels(label)
Remove some others based on cursory spot checking.
In [60]:
# scrub two junk fragments spotted during the cursory checks above
for info in places.values():
    if 'address' in info:
        if 'Chestnut Hill MA' in info['address']:
            info['address'] = info['address'].replace('Chestnut Hill MA', '')
        elif '. Dorchester Education Complex' in info['address']:
            info['address'] = info['address'].replace('. Dorchester Education Complex', '')
In [61]:
# final check on addresses
for label in ['address']:
print "\n------- " + label.upper() + " -------\n"
display_labels(label)
In [62]:
# final key inventory
check_keys()
As checked during earlier investigations, lat and lon seem fine.
In [63]:
# sanity-check lat/lon values (they looked fine in earlier investigations)
for label in ['lat','lon']:
print "\n------- " + label.upper() + " -------\n"
display_labels(label)
How many places are left now?
In [64]:
len(places)  # final count of cleaned places
Out[64]:
No longer need unique ids for each place (MongoDB will provide new ones).
In [65]:
# drop the OSM node ids; MongoDB will assign its own _id to each record
new_places = [info for info in places.values()]
In [70]:
new_places[0:10]  # spot-check the first ten records
Out[70]:
In [67]:
# write the final, id-free records to places.json for import
make_places_file(new_places)
In [68]:
# bulk-load the cleaned records into MongoDB (db: bosroul, collection: places)
!mongoimport --db bosroul --collection places --file places.json --jsonArray