In [3]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Your task is to use the iterative parsing to process the map file and
find out not only what tags are there, but also how many, to get the
feeling on how much of which data you can expect to have in the map.
Fill out the count_tags function. It should return a dictionary with the
tag name as the key and number of times this tag can be encountered in
the map as value.
Note that your code will be tested with a different data file than the 'example.osm'
"""
import xml.etree.cElementTree as ET
import pprint
def count_tags(filename):
# create tag dict
tags = {}
# open file
with open(filename, "r") as f:
# loop over iterparse elements
for event, elem in ET.iterparse(f):
# parse tag from elem
tag = str(elem).strip("<").strip(">").replace("'","").split(" ")
# check if tag is in dict
if tag[1] not in tags.keys():
# if not, add tag as new key
tags[tag[1]] = 1
# if so...
else:
#increase count of identifed tag
tags[tag[1]] += 1
return tags
def test():
tags = count_tags('example.osm')
pprint.pprint(tags)
assert tags == {'bounds': 1,
'member': 3,
'nd': 4,
'node': 20,
'osm': 1,
'relation': 1,
'tag': 7,
'way': 1}
if __name__ == "__main__":
test()
In [ ]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import xml.etree.cElementTree as ET
import pprint
import re
"""
Your task is to explore the data a bit more.
Before you process the data and add it into your database, you should check the
"k" value for each "<tag>" and see if there are any potential problems.
We have provided you with 3 regular expressions to check for certain patterns
in the tags. As we saw in the quiz earlier, we would like to change the data
model and expand the "addr:street" type of keys to a dictionary like this:
{"address": {"street": "Some value"}}
So, we have to see if we have such tags, and if we have any tags with
problematic characters.
Please complete the function 'key_type', such that we have a count of each of
four tag categories in a dictionary:
"lower", for tags that contain only lowercase letters and are valid,
"lower_colon", for otherwise valid tags with a colon in their names,
"problemchars", for tags with problematic characters, and
"other", for other tags that do not fall into the other three categories.
See the 'process_map' and 'test' functions for examples of the expected format.
"""
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
def key_type(element, keys):
if element.tag == "tag":
# loop over tags of element
for tag in element.iter("tag"):
# check for lower
if re.search(lower, tag.attrib["k"]):
# increase count
keys["lower"] += 1
# check for lower_colon
elif re.search(lower_colon, tag.attrib["k"]):
# increase count
keys["lower_colon"] += 1
# check for problemchars
elif re.search(problemchars, tag.attrib["k"]):
# increase count
keys["problemchars"] += 1
# else assign other
else:
keys["other"] += 1
return keys
def process_map(filename):
keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
for _, element in ET.iterparse(filename):
keys = key_type(element, keys)
return keys
def test():
# You can use another testfile 'map.osm' to look at your solution
# Note that the assertion below will be incorrect then.
# Note as well that the test function here is only used in the Test Run;
# when you submit, your code will be checked against a different dataset.
keys = process_map('example.osm')
pprint.pprint(keys)
assert keys == {'lower': 5, 'lower_colon': 0, 'other': 1, 'problemchars': 1}
if __name__ == "__main__":
test()
In [ ]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import xml.etree.cElementTree as ET
import pprint
import re
"""
Your task is to explore the data a bit more.
The first task is a fun one - find out how many unique users
have contributed to the map in this particular area!
The function process_map should return a set of unique user IDs ("uid")
"""
def get_user(element):
return element.get("user")
def process_map(filename):
users = set()
for _, element in ET.iterparse(filename):
# loop over tags
for tag in element.iter(element.tag):
# check if user value exists
if get_user(tag):
# get value for user and update set
users.add(get_user(tag))
return users
def test():
users = process_map('example.osm')
pprint.pprint(users)
assert len(users) == 6
if __name__ == "__main__":
test()
In [ ]:
"""
Your task in this exercise has two steps:
- audit the OSMFILE and change the variable 'mapping' to reflect the changes needed to fix
the unexpected street types to the appropriate ones in the expected list.
You have to add mappings only for the actual problems you find in this OSMFILE,
not a generalized solution, since that may and will depend on the particular area you are auditing.
- write the update_name function, to actually fix the street name.
The function takes a string with street name as an argument and should return the fixed name
We have provided a simple test so that you see what exactly is expected
"""
import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import pprint
OSMFILE = "example.osm"
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road",
"Trail", "Parkway", "Commons"]
# UPDATE THIS VARIABLE
mapping = { "St": "Street",
"St.": "Street",
"N.": "North",
"Ave": "Avenue",
"Rd.": "Road"
}
def audit_street_type(street_types, street_name):
m = street_type_re.search(street_name)
if m:
street_type = m.group()
if street_type not in expected:
street_types[street_type].add(street_name)
def is_street_name(elem):
return (elem.attrib['k'] == "addr:street")
def audit(osmfile):
osm_file = open(osmfile, "r")
street_types = defaultdict(set)
for event, elem in ET.iterparse(osm_file, events=("start",)):
if elem.tag == "node" or elem.tag == "way":
for tag in elem.iter("tag"):
if is_street_name(tag):
audit_street_type(street_types, tag.attrib['v'])
osm_file.close()
return street_types
def update_name(name, mapping):
# parse name elements
name_el = name.split(" ")
# loop over name elements
for el in name_el:
# check if name element is in mapping
if el in mapping:
# if so, replace element with mapping
name = name.replace(el, mapping[el])
return name
def test():
st_types = audit(OSMFILE)
assert len(st_types) == 3
pprint.pprint(dict(st_types))
for st_type, ways in st_types.iteritems():
for name in ways:
better_name = update_name(name, mapping)
print name, "=>", better_name
if name == "West Lexington St.":
assert better_name == "West Lexington Street"
if name == "Baldwin Rd.":
assert better_name == "Baldwin Road"
if __name__ == '__main__':
test()
In [ ]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
After auditing is complete the next step is to prepare the data to be inserted into a SQL database.
To do so you will parse the elements in the OSM XML file, transforming them from document format to
tabular format, thus making it possible to write to .csv files. These csv files can then easily be
imported to a SQL database as tables.
The process for this transformation is as follows:
- Use iterparse to iteratively step through each top level element in the XML
- Shape each element into several data structures using a custom function
- Utilize a schema and validation library to ensure the transformed data is in the correct format
- Write each data structure to the appropriate .csv files
We've already provided the code needed to load the data, perform iterative parsing and write the
output to csv files. Your task is to complete the shape_element function that will transform each
element into the correct format. To make this process easier we've already defined a schema (see
the schema.py file in the last code tab) for the .csv files and the eventual tables. Using the
cerberus library we can validate the output against this schema to ensure it is correct.
## Shape Element Function
The function should take as input an iterparse Element object and return a dictionary.
### If the element top level tag is "node":
The dictionary returned should have the format {"node": .., "node_tags": ...}
The "node" field should hold a dictionary of the following top level node attributes:
- id
- user
- uid
- version
- lat
- lon
- timestamp
- changeset
All other attributes can be ignored
The "node_tags" field should hold a list of dictionaries, one per secondary tag. Secondary tags are
child tags of node which have the tag name/type: "tag". Each dictionary should have the following
fields from the secondary tag attributes:
- id: the top level node id attribute value
- key: the full tag "k" attribute value if no colon is present or the characters after the colon if one is.
- value: the tag "v" attribute value
- type: either the characters before the colon in the tag "k" value or "regular" if a colon
is not present.
Additionally,
- if the tag "k" value contains problematic characters, the tag should be ignored
- if the tag "k" value contains a ":" the characters before the ":" should be set as the tag type
and characters after the ":" should be set as the tag key
- if there are additional ":" in the "k" value they and they should be ignored and kept as part of
the tag key. For example:
<tag k="addr:street:name" v="Lincoln"/>
should be turned into
{'id': 12345, 'key': 'street:name', 'value': 'Lincoln', 'type': 'addr'}
- If a node has no secondary tags then the "node_tags" field should just contain an empty list.
The final return value for a "node" element should look something like:
{'node': {'id': 757860928,
'user': 'uboot',
'uid': 26299,
'version': '2',
'lat': 41.9747374,
'lon': -87.6920102,
'timestamp': '2010-07-22T16:16:51Z',
'changeset': 5288876},
'node_tags': [{'id': 757860928,
'key': 'amenity',
'value': 'fast_food',
'type': 'regular'},
{'id': 757860928,
'key': 'cuisine',
'value': 'sausage',
'type': 'regular'},
{'id': 757860928,
'key': 'name',
'value': "Shelly's Tasty Freeze",
'type': 'regular'}]}
### If the element top level tag is "way":
The dictionary should have the format {"way": ..., "way_tags": ..., "way_nodes": ...}
The "way" field should hold a dictionary of the following top level way attributes:
- id
- user
- uid
- version
- timestamp
- changeset
All other attributes can be ignored
The "way_tags" field should again hold a list of dictionaries, following the exact same rules as
for "node_tags".
Additionally, the dictionary should have a field "way_nodes". "way_nodes" should hold a list of
dictionaries, one for each nd child tag. Each dictionary should have the fields:
- id: the top level element (way) id
- node_id: the ref attribute value of the nd tag
- position: the index starting at 0 of the nd tag i.e. what order the nd tag appears within
the way element
The final return value for a "way" element should look something like:
{'way': {'id': 209809850,
'user': 'chicago-buildings',
'uid': 674454,
'version': '1',
'timestamp': '2013-03-13T15:58:04Z',
'changeset': 15353317},
'way_nodes': [{'id': 209809850, 'node_id': 2199822281, 'position': 0},
{'id': 209809850, 'node_id': 2199822390, 'position': 1},
{'id': 209809850, 'node_id': 2199822392, 'position': 2},
{'id': 209809850, 'node_id': 2199822369, 'position': 3},
{'id': 209809850, 'node_id': 2199822370, 'position': 4},
{'id': 209809850, 'node_id': 2199822284, 'position': 5},
{'id': 209809850, 'node_id': 2199822281, 'position': 6}],
'way_tags': [{'id': 209809850,
'key': 'housenumber',
'type': 'addr',
'value': '1412'},
{'id': 209809850,
'key': 'street',
'type': 'addr',
'value': 'West Lexington St.'},
{'id': 209809850,
'key': 'street:name',
'type': 'addr',
'value': 'Lexington'},
{'id': '209809850',
'key': 'street:prefix',
'type': 'addr',
'value': 'West'},
{'id': 209809850,
'key': 'street:type',
'type': 'addr',
'value': 'Street'},
{'id': 209809850,
'key': 'building',
'type': 'regular',
'value': 'yes'},
{'id': 209809850,
'key': 'levels',
'type': 'building',
'value': '1'},
{'id': 209809850,
'key': 'building_id',
'type': 'chicago',
'value': '366409'}]}
"""
import csv
import codecs
import pprint
import re
import xml.etree.cElementTree as ET
import cerberus
import schema
OSM_PATH = "example.osm"
NODES_PATH = "nodes.csv"
NODE_TAGS_PATH = "nodes_tags.csv"
WAYS_PATH = "ways.csv"
WAY_NODES_PATH = "ways_nodes.csv"
WAY_TAGS_PATH = "ways_tags.csv"
LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+')
PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
SCHEMA = schema.schema
# Make sure the fields order in the csvs matches the column order in the sql table schema
NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']
def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS,
problem_chars=PROBLEMCHARS, default_tag_type='regular'):
"""Clean and shape node or way XML element to Python dict"""
node_attribs = {}
way_attribs = {}
way_nodes = []
tags = [] # Handle secondary tags the same way for both node and way elements
# YOUR CODE HERE
if element.tag == 'node':
return {'node': node_attribs, 'node_tags': tags}
elif element.tag == 'way':
return {'way': way_attribs, 'way_nodes': way_nodes, 'way_tags': tags}
# ================================================== #
# Helper Functions #
# ================================================== #
def get_element(osm_file, tags=('node', 'way', 'relation')):
"""Yield element if it is the right type of tag"""
context = ET.iterparse(osm_file, events=('start', 'end'))
_, root = next(context)
for event, elem in context:
if event == 'end' and elem.tag in tags:
yield elem
root.clear()
def validate_element(element, validator, schema=SCHEMA):
"""Raise ValidationError if element does not match schema"""
if validator.validate(element, schema) is not True:
field, errors = next(validator.errors.iteritems())
message_string = "\nElement of type '{0}' has the following errors:\n{1}"
error_string = pprint.pformat(errors)
raise Exception(message_string.format(field, error_string))
class UnicodeDictWriter(csv.DictWriter, object):
"""Extend csv.DictWriter to handle Unicode input"""
def writerow(self, row):
super(UnicodeDictWriter, self).writerow({
k: (v.encode('utf-8') if isinstance(v, unicode) else v) for k, v in row.iteritems()
})
def writerows(self, rows):
for row in rows:
self.writerow(row)
# ================================================== #
# Main Function #
# ================================================== #
def process_map(file_in, validate):
"""Iteratively process each XML element and write to csv(s)"""
with codecs.open(NODES_PATH, 'w') as nodes_file, \
codecs.open(NODE_TAGS_PATH, 'w') as nodes_tags_file, \
codecs.open(WAYS_PATH, 'w') as ways_file, \
codecs.open(WAY_NODES_PATH, 'w') as way_nodes_file, \
codecs.open(WAY_TAGS_PATH, 'w') as way_tags_file:
nodes_writer = UnicodeDictWriter(nodes_file, NODE_FIELDS)
node_tags_writer = UnicodeDictWriter(nodes_tags_file, NODE_TAGS_FIELDS)
ways_writer = UnicodeDictWriter(ways_file, WAY_FIELDS)
way_nodes_writer = UnicodeDictWriter(way_nodes_file, WAY_NODES_FIELDS)
way_tags_writer = UnicodeDictWriter(way_tags_file, WAY_TAGS_FIELDS)
nodes_writer.writeheader()
node_tags_writer.writeheader()
ways_writer.writeheader()
way_nodes_writer.writeheader()
way_tags_writer.writeheader()
validator = cerberus.Validator()
for element in get_element(file_in, tags=('node', 'way')):
el = shape_element(element)
if el:
if validate is True:
validate_element(el, validator)
if element.tag == 'node':
nodes_writer.writerow(el['node'])
node_tags_writer.writerows(el['node_tags'])
elif element.tag == 'way':
ways_writer.writerow(el['way'])
way_nodes_writer.writerows(el['way_nodes'])
way_tags_writer.writerows(el['way_tags'])
if __name__ == '__main__':
# Note: Validation is ~ 10X slower. For the project consider using a small
# sample of the map when validating.
process_map(OSM_PATH, validate=True)
In [ ]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import xml.etree.cElementTree as ET
import pprint
import re
import codecs
import json
"""
Your task is to wrangle the data and transform the shape of the data
into the model we mentioned earlier. The output should be a list of dictionaries
that look like this:
{
"id": "2406124091",
"type: "node",
"visible":"true",
"created": {
"version":"2",
"changeset":"17206049",
"timestamp":"2013-08-03T16:43:42Z",
"user":"linuxUser16",
"uid":"1219059"
},
"pos": [41.9757030, -87.6921867],
"address": {
"housenumber": "5157",
"postcode": "60625",
"street": "North Lincoln Ave"
},
"amenity": "restaurant",
"cuisine": "mexican",
"name": "La Cabana De Don Luis",
"phone": "1 (773)-271-5176"
}
You have to complete the function 'shape_element'.
We have provided a function that will parse the map file, and call the function with the element
as an argument. You should return a dictionary, containing the shaped data for that element.
We have also provided a way to save the data in a file, so that you could use
mongoimport later on to import the shaped data into MongoDB.
Note that in this exercise we do not use the 'update street name' procedures
you worked on in the previous exercise. If you are using this code in your final
project, you are strongly encouraged to use the code from previous exercise to
update the street names before you save them to JSON.
In particular the following things should be done:
- you should process only 2 types of top level tags: "node" and "way"
- all attributes of "node" and "way" should be turned into regular key/value pairs, except:
- attributes in the CREATED array should be added under a key "created"
- attributes for latitude and longitude should be added to a "pos" array,
for use in geospacial indexing. Make sure the values inside "pos" array are floats
and not strings.
- if the second level tag "k" value contains problematic characters, it should be ignored
- if the second level tag "k" value starts with "addr:", it should be added to a dictionary "address"
- if the second level tag "k" value does not start with "addr:", but contains ":", you can
process it in a way that you feel is best. For example, you might split it into a two-level
dictionary like with "addr:", or otherwise convert the ":" to create a valid key.
- if there is a second ":" that separates the type/direction of a street,
the tag should be ignored, for example:
<tag k="addr:housenumber" v="5158"/>
<tag k="addr:street" v="North Lincoln Avenue"/>
<tag k="addr:street:name" v="Lincoln"/>
<tag k="addr:street:prefix" v="North"/>
<tag k="addr:street:type" v="Avenue"/>
<tag k="amenity" v="pharmacy"/>
should be turned into:
{...
"address": {
"housenumber": 5158,
"street": "North Lincoln Avenue"
}
"amenity": "pharmacy",
...
}
- for "way" specifically:
<nd ref="305896090"/>
<nd ref="1719825889"/>
should be turned into
"node_refs": ["305896090", "1719825889"]
"""
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
CREATED = [ "version", "changeset", "timestamp", "user", "uid"]
def shape_element(element):
# create dicts
node = {}
address = {}
created = {}
# create lists
node_refs = []
pos = []
# check for valid tags
if element.tag == "node" or element.tag == "way" :
# save valid tag to node dict
node['type'] = element.tag
# check for lat and lon attributes
if 'lat' in element.attrib.keys() and 'lon' in element.attrib.keys():
# try convert lat and lon values to float
try:
lat = float(element.attrib['lat'])
lon = float(element.attrib['lon'])
# save values to pos list
pos = [lat,lon]
except:
pass
# loop over attribute items
for k, m in element.attrib.items():
# skip lat and lon elements
if k not in pos:
# check if attributes match CREATED list
if k in CREATED:
created[k] = m
else:
node[k] = m
# loop over children
for child in element:
# check second level tag nd
if child.tag == "nd":
node_refs.append(child.attrib['ref'])
# check second level address tag
if child.tag == "tag":
if child.attrib['k'].startswith("addr:"):
address[child.attrib['k']] = child.attrib['v']
# handle dirty address tags
elif not child.attrib['k'].startswith("addr:") and child.attrib['k'].count(":") == 1:
print(child.attrib['k'])
print(child.attrib['v'])
#child.attrib['k'] = child.attrib['k'].replace(":", "addr:")
address[child.attrib['k']] = child.attrib['v']
# omit tags with more than one ":"
elif child.attrib['k'].count(":") > 1:
pass
# integrate created dict into node dict
if created:
node['created'] = created
# integrate pos dict into node dict
if pos:
node['pos'] = pos
# integrate adress dict into node dict
if address:
node['address'] = address
# integrate node_refs dict into node dict
if node_refs:
node['node_refs'] = node_refs
print(node)
return node
else:
return None
def process_map(file_in, pretty = False):
# You do not need to change this file
file_out = "{0}.json".format(file_in)
data = []
with codecs.open(file_out, "w") as fo:
for _, element in ET.iterparse(file_in):
el = shape_element(element)
if el:
data.append(el)
if pretty:
fo.write(json.dumps(el, indent=2)+"\n")
else:
fo.write(json.dumps(el) + "\n")
return data
def test():
# NOTE: if you are running this code on your computer, with a larger dataset,
# call the process_map procedure with pretty=False. The pretty=True option adds
# additional spaces to the output, making it significantly larger.
data = process_map('example.osm', True)
#pprint.pprint(data)
correct_first_elem = {
"id": "261114295",
"visible": "true",
"type": "node",
"pos": [41.9730791, -87.6866303],
"created": {
"changeset": "11129782",
"user": "bbmiller",
"version": "7",
"uid": "451048",
"timestamp": "2012-03-28T18:31:23Z"
}
}
assert data[0] == correct_first_elem
assert data[-1]["address"] == {
"street": "West Lexington St.",
"housenumber": "1412"
}
assert data[-1]["node_refs"] == [ "2199822281", "2199822390", "2199822392", "2199822369",
"2199822370", "2199822284", "2199822281"]
if __name__ == "__main__":
test()