In [1]:
!ls -la data
In [113]:
!head data/boston_massachusetts.osm
In [153]:
!cp data/boston_massachusetts.osm data/boston_massachusetts_new.osm
In [152]:
# Path of the working copy of the OSM extract; the cells below parse this
# file and write their fixes back to it. Kept relative so the notebook does
# not depend on one particular user's home directory.
filename = 'data/boston_massachusetts_new.osm'
In [4]:
import xml.etree.cElementTree as ET
import pprint as pp
import re
import os
import json
In [5]:
# system beep
def finished():
    """Signal the end of a long-running cell by emitting the bell character twice."""
    for _ in range(2):
        os.system("printf '\a'")
In [6]:
# Survey of the file's structure: for every tag name, count how often the
# element occurs, which attributes it carries, and which tag names appear
# among its children, grandchildren and great-grandchildren.
DESCENDANT_SECTIONS = ('attributes', 'children', 'grandchildren', 'greatgrandchildren')


def _bump_count(counter, key):
    """Increment counter[key], starting from zero when the key is new."""
    counter[key] = counter.get(key, 0) + 1


tags = {}
# Elements are not cleared while iterating: an ancestor is only reported
# after its descendants, and it still has to walk them at that point.
for event, element in ET.iterparse(filename):
    if element.tag not in tags:
        tags[element.tag] = {'count': 0}
        for section in DESCENDANT_SECTIONS:
            tags[element.tag][section] = {}
    summary = tags[element.tag]
    summary['count'] += 1

    for attribute_key in element.attrib:
        _bump_count(summary['attributes'], attribute_key)

    for child in element:
        _bump_count(summary['children'], child.tag)
        for grandchild in child:
            _bump_count(summary['grandchildren'], grandchild.tag)
            for greatgrandchild in grandchild:
                # Same key as the one initialised above; a mismatched key
                # here would raise KeyError on the first great-grandchild.
                _bump_count(summary['greatgrandchildren'], greatgrandchild.tag)

# clean up unused dictionaries
for summary in tags.values():
    for section in DESCENDANT_SECTIONS:
        if not summary[section]:
            del summary[section]

pp.pprint(tags)
finished()
In [95]:
# Patterns describing values that look well-formed ("routine"). They are
# compiled once here instead of on every call.
_STRING_OF_DIGITS_REGEX = re.compile(r'^[0-9]*$')
# The decimal point is escaped: an unescaped '.' matches any character and
# would classify values such as '12a45' as valid coordinates.
_LAT_OR_LONG_REGEX = re.compile(r'^-?[0-9]*\.[0-9]*$')
_TIMESTAMP_REGEX = re.compile(r'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$')


def _keep_example(examples, value, is_routine):
    """Add value to the set examples if its length is new or it is not routine."""
    seen_lengths = set(len(example) for example in examples)
    if len(value) not in seen_lengths or not is_routine:
        examples.add(value)


def get_subdictionaries(element, sub_dictionary):
    """Record example attribute values of element inside sub_dictionary.

    sub_dictionary maps a tag name to a dictionary that maps an attribute
    name to a set of example values. Elements carrying 'k'/'v' attributes
    are treated specially: the value of 'k' becomes the dictionary key and
    the value of 'v' is the example stored under it.

    To keep the sets small, a value is stored only when
      * no stored value of the same length exists yet, or
      * it does not look routine. For 'k'/'v' pairs, routine means a string
        of digits. For every other attribute, routine means a string of
        digits, a decimal lat/long number, a timestamp of the form
        YYYY-MM-DDTHH:MM:SSZ, or any value of the 'user' attribute.

    Args:
        element: an ElementTree element whose attributes are inspected.
        sub_dictionary: the dictionary to update in place.

    Returns:
        The same sub_dictionary object. If nothing was recorded for
        element.tag, that key is removed again.
    """
    collected = sub_dictionary.setdefault(element.tag, {})

    for attribute_key, attribute_val in element.attrib.items():
        if attribute_key == 'v':
            # Handled together with the matching 'k' attribute.
            continue

        if attribute_key == 'k':
            tag_value = element.attrib.get('v')
            if tag_value is None:
                # A 'k' without a 'v' has no example value to record.
                continue
            _keep_example(collected.setdefault(attribute_val, set()),
                          tag_value,
                          bool(_STRING_OF_DIGITS_REGEX.match(tag_value)))
            continue

        # 'user' values can be almost any string, so one example per length
        # is enough for them as well.
        is_routine = bool(attribute_key == 'user' or
                          _STRING_OF_DIGITS_REGEX.match(attribute_val) or
                          _LAT_OR_LONG_REGEX.match(attribute_val) or
                          _TIMESTAMP_REGEX.match(attribute_val))
        _keep_example(collected.setdefault(attribute_key, set()),
                      attribute_val,
                      is_routine)

    # if the sub_dictionary remained empty, delete it
    if not collected:
        del sub_dictionary[element.tag]
    return sub_dictionary
In [8]:
def dictionarianize(filename, tag_to_examine):
    """Collect example attribute values for one tag name and its descendants.

    Every element in filename whose tag equals tag_to_examine contributes
    its own attributes to the 'attributes' section, those of its children
    to 'children' and those of its grandchildren to 'grandchildren'.
    Sections that end up empty are left out of the returned dictionary.
    """
    sections = ('attributes', 'children', 'grandchildren')
    tag_dictionary = {}
    for section in sections:
        tag_dictionary[section] = {}

    for _, element in ET.iterparse(filename):
        if element.tag != tag_to_examine:
            continue
        get_subdictionaries(element, tag_dictionary['attributes'])
        for child in element:
            get_subdictionaries(child, tag_dictionary['children'])
            for grandchild in child:
                get_subdictionaries(grandchild, tag_dictionary['grandchildren'])

    # drop the sections that stayed empty
    for section in sections:
        if not tag_dictionary[section]:
            del tag_dictionary[section]

    return tag_dictionary
In [9]:
def set_default(obj):
    """Fallback encoder for json.dump: turn a set into a list.

    Raises:
        TypeError: for any object that is not a set.
    """
    if not isinstance(obj, set):
        raise TypeError
    return list(obj)
In [33]:
def jsonize(tag_names):
    """Write one '<tag_name>.json' summary file per entry of tag_names.

    Each file holds the result of dictionarianize() for that tag name, read
    from the notebook-level filename. A progress line is printed after each
    file, and finished() is called once all of them are written.
    """
    dump_options = dict(default=set_default, sort_keys=True, indent=2,
                        separators=(',', ' : '))
    for tag_name in tag_names:
        output_path = '{0}.json'.format(tag_name)
        with open(output_path, 'w') as f:
            json.dump(dictionarianize(filename, tag_to_examine=tag_name), f,
                      **dump_options)
        print("{0} done".format(tag_name))
    finished()
In [92]:
# Tag names to summarise; jsonize writes one '<tag name>.json' file for each.
tag_names = ['bounds','member','nd','node','osm','relation','tag','way']
jsonize(tag_names)
`bounds` element attributes seemed fine.
`member` element attributes: `ref` values vary in length, but all are strings of digits, so they seem fine; `role` values were mostly snake_case, and any that weren't were converted to snake_case, following the examples given here (http://wiki.openstreetmap.org/wiki/Relation); `type` values seemed fine.
In [96]:
!grep member data/boston_massachusetts_new.osm -B 1 -A 1 | head
In [129]:
def get_new_root(filename):
    """Parse the XML file at filename and return its root element."""
    return ET.parse(filename).getroot()
In [175]:
# Parse the working copy and keep its root element in the notebook-level
# name `root`, which the clean-up cells below read and modify.
root = get_new_root(filename)
In [139]:
def write_tree(filename, tree_root=None):
    """Serialise an element tree to filename as XML.

    Args:
        filename: path of the file to write.
        tree_root: root element to write. When omitted, the notebook-level
            `root` is used.
    """
    if tree_root is None:
        tree_root = root
    # Only the root element is kept at notebook level, so wrap it in an
    # ElementTree here instead of relying on a separate global tree object.
    ET.ElementTree(tree_root).write(filename)
In [103]:
# write a function to snakecasify
def snakecasify(element_name, attribute_name, key_or_value):
    """Convert attribute values to snake_case and write the tree to filename.

    Only the case key_or_value == 'value' with a two-level element_name is
    handled; any other combination leaves the tree unchanged.

    Args:
        element_name: sequence of tag names; element_name[0] is looked up
            directly under the notebook-level `root`, element_name[1] under
            each of those elements.
        attribute_name: attribute whose value is lower-cased and has its
            spaces replaced by underscores.
        key_or_value: selects what to convert; only 'value' is implemented.
    """
    if key_or_value == 'value' and len(element_name) > 1:
        parent_tag, child_tag = element_name[0], element_name[1]
        for element in root.findall(parent_tag):
            for child in element.findall(child_tag):
                current_val = child.get(attribute_name)
                if current_val is None:
                    # Nothing to convert when the attribute is absent.
                    continue
                child.set(attribute_name, current_val.lower().replace(' ', '_'))
    # Only `root` is kept at notebook level, so build the ElementTree
    # wrapper here rather than relying on a global tree object.
    ET.ElementTree(root).write(filename)
In [93]:
# Normalise the `role` attribute of every relation/member element to snake_case.
snakecasify(['relation','member'], 'role', 'value')
In [94]:
# check results: regenerate member.json so the role values can be inspected
jsonize(['member'])
`nd` elements: `ref` attribute values appeared to be fine (strings of digits).
`node` elements: the `changeset`, `id`, `lat`, `lon`, `timestamp`, `uid`, `user`, and `version` attributes seemed fine.
`node` children: `tag` elements were a mess, as would be expected given the way they are used in OpenStreetMap.
Problematic tag attributes scheduled for removal: "Deed", "FIXME", "LOCATION", "ROUTES", "Town".
"abandoned:railway" : [ "station" ] is rewritten below, following the format described here: http://wiki.openstreetmap.org/wiki/Railways
In [193]:
!grep abandoned data/boston_massachusetts_new.osm -B 1 -A 1 | head
In [183]:
# Rewrite every node tag keyed 'abandoned:railway': the tag itself becomes
# railway="abandoned" and a sibling tag public_transport="station" is added.
# NOTE(review): the rewritten tags carry the key as an attribute name instead
# of a k="..." v="..." pair; the check cell below relies on that shape, so it
# is kept here -- confirm that this is the intended output format.
for element in root.findall('node'):
    for child in element.findall('tag'):
        # .get() instead of ['k']: tags rewritten by an earlier run of this
        # cell no longer have a 'k' attribute and must simply be skipped.
        if child.get('k') == 'abandoned:railway':
            child.attrib['railway'] = 'abandoned'
            del child.attrib['k']
            child.attrib.pop('v', None)
            ET.SubElement(element, 'tag', {'public_transport': 'station'})
In [191]:
# Write the modified tree back to the working copy.
write_tree(filename)
In [190]:
# Print the attributes of every node tag that carries a 'railway' or
# 'public_transport' attribute.
for element in root.findall('node'):
    for child in element.findall('tag'):
        if any(name in child.attrib for name in ('railway', 'public_transport')):
            print(child.attrib)
In [115]:
def replace_value(element_name, key, current_value, new_value):
    """Replace one attribute value throughout the file and write it back.

    The file named by the notebook-level filename is parsed, every
    element_name[1] element found under an element_name[0] element of the
    root whose attribute key equals current_value gets new_value instead,
    and the tree is written back to the same file. Nothing is replaced
    when element_name has fewer than two entries.
    """
    tree = ET.parse(filename)
    if len(element_name) > 1:
        parent_tag, child_tag = element_name[0], element_name[1]
        matching_children = [
            child
            for parent in tree.getroot().findall(parent_tag)
            for child in parent.findall(child_tag)
            if child.get(key) == current_value
        ]
        for child in matching_children:
            child.set(key, new_value)
    tree.write(filename)
In [116]:
# check results: regenerate node.json so the node values can be inspected
jsonize(['node'])
after fixes, write new regexes to omit
skip users when all strings