List data files


In [1]:
!ls -la data


total 741856
drwxr-xr-x   4 excalibur  staff        136 Feb 24 10:15 .
drwxr-xr-x  15 excalibur  staff        510 Feb 27 22:57 ..
-rw-r-----@  1 excalibur  staff  366426696 Feb 23 11:32 boston_massachusetts.osm
-rw-r-----@  1 excalibur  staff   13399735 Feb 23 11:32 boston_massachusetts.osm.pbf

Display head of data file


In [113]:
!head data/boston_massachusetts.osm


<?xml version='1.0' encoding='UTF-8'?>
<osm version="0.6" generator="Osmosis 0.43.1">
  <bounds minlon="-71.19100" minlat="42.22800" maxlon="-70.92300" maxlat="42.39900" origin="http://www.openstreetmap.org/api/0.6"/>
  <node id="26746680" version="1" timestamp="2007-03-24T19:38:02Z" uid="6817" user="lurker" changeset="244358" lat="42.3089253" lon="-71.1191797">
    <tag k="created_by" v="YahooApplet 1.0"/>
  </node>
  <node id="30730952" version="2" timestamp="2012-12-19T19:24:31Z" uid="326503" user="wambag" changeset="14335103" lat="42.3678097" lon="-71.0218711"/>
  <node id="30730953" version="2" timestamp="2012-12-19T19:24:31Z" uid="326503" user="wambag" changeset="14335103" lat="42.3677364" lon="-71.0218568"/>
  <node id="30730954" version="2" timestamp="2012-12-19T19:24:31Z" uid="326503" user="wambag" changeset="14335103" lat="42.3676084" lon="-71.0218168"/>
  <node id="30730955" version="2" timestamp="2012-12-19T19:24:32Z" uid="326503" user="wambag" changeset="14335103" lat="42.3675229" lon="-71.0218486"/>

In [153]:
!cp data/boston_massachusetts.osm data/boston_massachusetts_new.osm

Get data file name


In [152]:
# Path of the working copy of the Boston extract that every later cell reads
# from and writes back to (the copy is created by the `cp` cell above).
filename = 'data/boston_massachusetts_new.osm'

Import Python packages


In [4]:
import xml.etree.cElementTree as ET
import pprint as pp
import re
import os
import json

General Helper functions


In [5]:
# system beep
def finished():
    """Signal that a long-running cell is done by ringing the terminal bell twice.

    Each ring is produced by handing a `printf` command to the system shell.
    """
    for _ in range(2):
        os.system("printf '\a'")

Iterate over and display tags, attributes, and descendants in data file


In [6]:
# Census of the data file: for every element tag, record how many times it
# occurs, how often each attribute appears on it, and how often each tag
# appears among its children, grandchildren and great-grandchildren.
tags = {}

# Elements are not cleared while iterating: descendants have to stay attached
# so that an ancestor can still count them when it is reported itself.
for event, element in ET.iterparse(filename):

    if element.tag not in tags:
        tags[element.tag] = {
            'count': 0,
            'attributes': {},
            'children': {},
            'grandchildren': {},
            'greatgrandchildren': {},
        }
    entry = tags[element.tag]
    entry['count'] += 1

    for attribute_key in element.attrib:
        entry['attributes'][attribute_key] = entry['attributes'].get(attribute_key, 0) + 1

    for child in element:
        entry['children'][child.tag] = entry['children'].get(child.tag, 0) + 1

        for grandchild in child:
            entry['grandchildren'][grandchild.tag] = entry['grandchildren'].get(grandchild.tag, 0) + 1

            for greatgrandchild in grandchild:
                # Uses the same 'greatgrandchildren' key the entry is created
                # with; the previous 'greatgrandchild' spelling raised KeyError
                # as soon as a great-grandchild was encountered.
                entry['greatgrandchildren'][greatgrandchild.tag] = (
                    entry['greatgrandchildren'].get(greatgrandchild.tag, 0) + 1)

# clean up unused dictionaries so the printed summary only shows what exists
for entry in tags.values():
    for section in ('attributes', 'children', 'grandchildren', 'greatgrandchildren'):
        if not entry[section]:
            del entry[section]

pp.pprint(tags)
finished()


{'bounds': {'attributes': {'maxlat': 1,
                           'maxlon': 1,
                           'minlat': 1,
                           'minlon': 1,
                           'origin': 1},
            'count': 1},
 'member': {'attributes': {'ref': 8328, 'role': 8328, 'type': 8328},
            'count': 8328},
 'nd': {'attributes': {'ref': 1904147}, 'count': 1904147},
 'node': {'attributes': {'changeset': 1601437,
                         'id': 1601437,
                         'lat': 1601437,
                         'lon': 1601437,
                         'timestamp': 1601437,
                         'uid': 1601437,
                         'user': 1601437,
                         'version': 1601437},
          'children': {'tag': 274720},
          'count': 1601437},
 'osm': {'attributes': {'generator': 1, 'version': 1},
         'children': {'bounds': 1,
                      'node': 1601437,
                      'relation': 1050,
                      'way': 245626},
         'count': 1,
         'grandchildren': {'member': 8328, 'nd': 1904147, 'tag': 748353}},
 'relation': {'attributes': {'changeset': 1050,
                             'id': 1050,
                             'timestamp': 1050,
                             'uid': 1050,
                             'user': 1050,
                             'version': 1050},
              'children': {'member': 8328, 'tag': 4366},
              'count': 1050},
 'tag': {'attributes': {'k': 748353, 'v': 748353}, 'count': 748353},
 'way': {'attributes': {'changeset': 245626,
                        'id': 245626,
                        'timestamp': 245626,
                        'uid': 245626,
                        'user': 245626,
                        'version': 245626},
         'children': {'nd': 1904147, 'tag': 469267},
         'count': 245626}}

In [95]:
# Compiled once when the cell runs instead of on every call.
_STRING_OF_DIGITS_REGEX = re.compile(r'^[0-9]*$')
# The decimal point is escaped so that only a literal '.' is accepted; an
# unescaped '.' matches any character and let values such as '42x30' pass.
_LAT_OR_LONG_REGEX = re.compile(r'^-{0,1}[0-9]*\.[0-9]*$')
_TIMESTAMP_REGEX = re.compile(r'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$')


def _is_new_length(value, samples):
    """Return True when no already-collected sample has the same length as value."""
    return len(value) not in [len(sample) for sample in samples]


def get_subdictionaries(element, sub_dictionary):
    """Collect representative attribute values of `element` into `sub_dictionary`.

    `sub_dictionary` maps an element tag to a dict of ``{key: set of sample
    values}`` and is updated in place. For ``k``/``v`` pairs the key is the
    value of the ``k`` attribute and the sample is the value of ``v``; for any
    other attribute the key is the attribute name.

    A value is kept when its length has not been seen yet for that key.
    Otherwise it is kept only if it looks "problematic":
      * for ``k``/``v`` pairs: the ``v`` value is not purely digits;
      * for other attributes: the value is neither purely digits, nor in
        lat/long form, nor in timestamp form, and the attribute is not
        ``user`` (user names can be almost any string).

    A ``k`` attribute without a matching ``v`` is recorded with an empty-string
    sample rather than raising.

    Parameters:
        element: an ElementTree element (its ``tag`` and ``attrib`` are read).
        sub_dictionary: dict to update in place.

    Returns:
        The same `sub_dictionary` object. The entry for ``element.tag`` is
        removed again if it is still empty after processing.
    """
    # if the current element does not already exist in the current subdictionary,
    # create a dictionary for it
    if element.tag not in sub_dictionary:
        sub_dictionary[element.tag] = {}
    samples_by_key = sub_dictionary[element.tag]

    for attribute_key, attribute_val in element.attrib.items():

        if attribute_key == 'k':
            # the sample to store is the value of the sibling 'v' attribute;
            # fall back to '' so a missing 'v' cannot crash len()/match()
            tag_value = element.attrib.get('v', '')
            # make a set for this k value if it doesn't already exist
            samples = samples_by_key.setdefault(attribute_val, set())
            if (_is_new_length(tag_value, samples) or
                    not _STRING_OF_DIGITS_REGEX.match(tag_value)):
                samples.add(tag_value)

        elif attribute_key == 'v':
            # already consumed together with its 'k' attribute above
            continue

        elif attribute_key not in samples_by_key:
            # first value seen for this attribute: always keep it
            samples_by_key[attribute_key] = set([attribute_val])

        else:
            samples = samples_by_key[attribute_key]
            looks_well_formed = (_STRING_OF_DIGITS_REGEX.match(attribute_val) or
                                 _LAT_OR_LONG_REGEX.match(attribute_val) or
                                 _TIMESTAMP_REGEX.match(attribute_val))
            # keep one example per length of the well-formed values, but keep
            # every odd-looking one -- except for user names
            if (_is_new_length(attribute_val, samples) or
                    (not looks_well_formed and attribute_key != 'user')):
                samples.add(attribute_val)

    # if nothing was collected for this element, drop its empty entry
    if not samples_by_key:
        del sub_dictionary[element.tag]

    return sub_dictionary

In [8]:
def dictionarianize(filename, tag_to_examine):
    """Summarise every `tag_to_examine` element found in `filename`.

    The file is streamed with ``ET.iterparse``. For each element whose tag
    equals `tag_to_examine`, the element itself, each of its children and each
    of its grandchildren are passed to ``get_subdictionaries``.

    Returns:
        A dict with up to three keys -- 'attributes' (the examined elements),
        'children' and 'grandchildren' -- each holding what
        ``get_subdictionaries`` accumulated for that level. Levels for which
        nothing was collected are left out.
    """
    own_level, child_level, grandchild_level = {}, {}, {}

    for _, element in ET.iterparse(filename):
        if element.tag != tag_to_examine:
            continue

        # the examined element itself
        get_subdictionaries(element, own_level)

        # its descendants, if any
        for child in element:
            get_subdictionaries(child, child_level)
            for grandchild in child:
                get_subdictionaries(grandchild, grandchild_level)

    levels = (
        ('attributes', own_level),
        ('children', child_level),
        ('grandchildren', grandchild_level),
    )
    # only report the levels that actually collected something
    return dict((label, collected) for label, collected in levels if collected)

In [9]:
def set_default(obj):
    """Fallback serialiser for ``json.dump``: turn a set into a list.

    Raises:
        TypeError: if `obj` is anything other than a set.
    """
    if not isinstance(obj, set):
        raise TypeError
    return list(obj)

jsonize


In [33]:
def jsonize(tag_names):
    """Write one ``<tag_name>.json`` summary per entry of `tag_names`.

    Each summary is produced by ``dictionarianize`` from the notebook-level
    `filename`; sets are serialised as lists via ``set_default``. A
    "<tag_name> done" line is printed and the bell is rung after each file.
    """
    for tag_name in tag_names:
        # Build the summary before opening the output file, so that a failure
        # while parsing does not leave a truncated, empty JSON file behind.
        summary = dictionarianize(filename, tag_to_examine=tag_name)
        with open('{0}.json'.format(tag_name), 'w') as f:
            json.dump(summary, f, default=set_default, sort_keys=True, indent=2, separators=(',', ' : '))
        # call form of print: same output on Python 2, and valid on Python 3
        print("{0} done".format(tag_name))
        finished()

In [92]:
# element tags to summarise; jsonize writes one <tag>.json file per entry
tag_names = ['bounds','member','nd','node','osm','relation','tag','way']
jsonize(tag_names)


bounds done
member done
nd done
node done
osm done
relation done
tag done
way done

bounds

bounds element attributes seemed fine

member

member element attributes: ref values vary in length but are all strings of digits, so they seem fine; role values were mostly snake_case, and any that weren't were converted to snake_case to match the examples given here (http://wiki.openstreetmap.org/wiki/Relation); type values seemed fine


In [96]:
!grep member data/boston_massachusetts_new.osm -B 1 -A 1 | head


    <tag k="name" v="Sherman Lot" />
    <tag k="note" v="parking for Harvest members and some other permit holders" />
    <tag k="access" v="private" />
--
--
  <relation changeset="20567148" id="37967" timestamp="2014-02-14T21:59:51Z" uid="326503" user="wambag" version="18">
    <member ref="61317423" role="via" type="node" />
    <member ref="8615818" role="to" type="way" />
    <member ref="115591269" role="from" type="way" />
    <tag k="type" v="restriction" />

get new root from file


In [129]:
def get_new_root(filename):
    """Parse the XML file at `filename` and return the root element of the new tree."""
    return ET.parse(filename).getroot()

In [175]:
# parse the working file and keep its root element; later cells edit this tree in memory
root = get_new_root(filename)

write current tree to file


In [139]:
def write_tree(filename):
    """Serialise the tree rooted at the notebook-level `root` to `filename`.

    The edits in this notebook are made on `root`, so that element is wrapped
    in a fresh ElementTree for writing. No notebook-level `tree` is defined by
    any visible cell, and `get_new_root` does not keep the tree it parsed.
    """
    ET.ElementTree(root).write(filename)

In [103]:
# write a function to snakecasify
def snakecasify(element_name, attribute_name, key_or_value):
    """Lower-case an attribute's values and replace spaces with underscores.

    Operates on the notebook-level `root` and then writes the tree back to the
    notebook-level `filename`.

    Parameters:
        element_name: sequence of tags; the first entry selects children of
            `root`, the second selects their children. Nothing is converted
            unless at least two entries are given.
        attribute_name: name of the attribute whose value is converted.
        key_or_value: only 'value' triggers a conversion.
    """
    if key_or_value == 'value' and len(element_name) > 1:
        for element in root.findall(element_name[0]):
            for child in element.findall(element_name[1]):
                current_val = child.get(attribute_name)
                # children without the attribute are left alone instead of
                # failing on None.lower()
                if current_val is None:
                    continue
                child.set(attribute_name, current_val.lower().replace(' ', '_'))

    # Write the tree that was actually edited: wrap `root` rather than rely on
    # a notebook-level `tree`, which no visible cell defines.
    ET.ElementTree(root).write(filename)

member roles


In [93]:
# convert relation/member role values to lower case with underscores instead of spaces
snakecasify(['relation','member'], 'role', 'value')

In [94]:
# check results: regenerate member.json from the working file
jsonize(['member'])


member done

nd

nd elements' ref attribute values appeared to be fine (strings of integers)

node

node changeset, id, lat, lon, timestamp, uid, user, and version attributes seemed fine

node children: tag elements were a mess, as would be expected given the way they are used in OpenStreetMap

node children: tags

problematic tag attributes scheduled for removal: "Deed", "FIXME", "LOCATION", "ROUTES", "Town"

"abandoned:railway" : [ "station" ], following format here: http://wiki.openstreetmap.org/wiki/Railways


In [193]:
!grep abandoned data/boston_massachusetts_new.osm -B 1 -A 1 | head


    <tag k="start_date" v="1848" />
    <tag k="abandoned:railway" v="station" />
  <tag /><tag /></node>
--
--
    <tag k="name" v="Watertown Branch" />
    <tag k="railway" v="abandoned" />
    <tag k="old_railway_operator" v="B&amp;M" />
--
--

In [183]:
# Rewrite node tags keyed 'abandoned:railway': the tag loses its k/v attributes
# and gets railway="abandoned", and a sibling tag carrying
# public_transport="station" is appended to the same node.
# NOTE(review): the rewritten tags no longer use the k="..." v="..." attribute
# pair that the rest of the file uses for tags -- confirm this is intended.
for element in root.findall('node'):
    for child in element.findall('tag'):
        # .get() rather than ['k']: tags produced by this very rewrite have no
        # 'k' attribute, so indexing raised KeyError when the cell was re-run.
        if child.get('k') == 'abandoned:railway':
            child.attrib['railway'] = 'abandoned'
            del child.attrib['k']
            # tolerate a tag that has 'k' but no 'v'
            child.attrib.pop('v', None)

            ET.SubElement(element,'tag',{'public_transport' : 'station'})

In [191]:
# save the edited tree back to the working file
write_tree(filename)

In [190]:
# show the attributes of every node tag that carries a 'railway' or
# 'public_transport' attribute, to verify the rewrite
for element in root.findall('node'):
    for child in element.findall('tag'):
        if ('railway' in child.attrib) or ('public_transport' in child.attrib):
            # call form of print: same output on Python 2, and valid on Python 3
            print(child.attrib)


{'railway': 'abandoned'}
{'public_transport': 'station'}

In [115]:
def replace_value(element_name, key, current_value, new_value):
    """Replace one attribute value throughout the notebook-level `filename`.

    The file is parsed afresh, every matching element is updated, and the tree
    is written back to the same file.

    Parameters:
        element_name: sequence of tags; the first entry selects children of
            the root, the second selects their children. Nothing is replaced
            unless at least two entries are given (the file is still rewritten).
        key: attribute name to inspect.
        current_value: value of `key` to look for.
        new_value: value to store in its place.
    """
    document = ET.parse(filename)

    if len(element_name) > 1:
        parent_tag, child_tag = element_name[0], element_name[1]
        for parent in document.getroot().findall(parent_tag):
            for candidate in parent.findall(child_tag):
                if candidate.get(key) != current_value:
                    continue
                candidate.set(key, new_value)

    document.write(filename)

In [116]:
# check results: regenerate node.json from the working file
jsonize(['node'])


node done

after fixes, write new regexes to omit

skip users when all strings