In [243]:
# file size of original OSM file: 376 MB
!ls -lh cologne_germany.osm
In [244]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
#import xml.etree.ElementTree as ET  # Use cElementTree or lxml if too slow
#
#OSM_FILE = "cologne_germany.osm"  # Replace this with your osm file
#SAMPLE_FILE = "cologne_germany_sample.osm"
#
#k = 10  # Parameter: take every k-th top level element
#
#def get_element(osm_file, tags=('node', 'way', 'relation')):
#    """Yield element if it is the right type of tag
#
#    Reference:
#    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
#    """
#    context = iter(ET.iterparse(osm_file, events=('start', 'end')))
#    _, root = next(context)
#    for event, elem in context:
#        if event == 'end' and elem.tag in tags:
#            yield elem
#            root.clear()
#
#
#with open(SAMPLE_FILE, 'wb') as output:
#    """
#    Changed output of write to byte objects in order to work with Python 3.x
#
#    Reference:
#    http://stackoverflow.com/questions/33054527/python-3-5-typeerror-a-bytes-like-object-is-required-not-str
#    """
#    output.write(b'<?xml version="1.0" encoding="UTF-8"?>\n')
#    output.write(b'<osm>\n  ')
#
#    # Write every k-th top level element
#    for i, element in enumerate(get_element(OSM_FILE)):
#        if i % k == 0:
#            output.write(ET.tostring(element, encoding='utf-8'))
#
#    output.write(b'</osm>')
In [245]:
# file size of sample OSM file: 38 MB
!ls -lh cologne_germany_sample.osm
In [1]:
# setup environment
import xml.etree.ElementTree as ET
import pprint
import re
In [247]:
def head_input(input_in, n=20):
    """
    Returns the first n (default=20) items of a dict or list
    """
    if isinstance(input_in, dict):
        return dict(list(sorted(input_in.items()))[0:n])
    elif isinstance(input_in, list):
        return input_in[0:n]
    else:
        return "Unsupported input type"
In [248]:
def get_tag_counts(file="cologne_germany_sample.osm"):
    """
    Given a valid XML input file, the function returns a dict of tags and their respective counts
    """
    # create tags dict
    tag_counts = {}
    # open file
    with open(file, "r", encoding="utf8") as f:
        # loop over file
        for event, elem in ET.iterparse(f):
            # check if tag is in dict
            if elem.tag not in tag_counts:
                # if not, add tag as new key with count 1
                tag_counts[elem.tag] = 1
            else:
                # if so, increase count of identified tag
                tag_counts[elem.tag] += 1
    return tag_counts
In [249]:
# print tag and counts of sample data
get_tag_counts()
Out[249]:
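As an aside, collections.Counter can express the same tally more compactly; a minimal sketch of that variant (not used below, just for illustration):

from collections import Counter

def get_tag_counts_counter(file="cologne_germany_sample.osm"):
    """Counter-based variant of get_tag_counts()"""
    with open(file, "r", encoding="utf8") as f:
        # Counter tallies each tag name as it streams past
        return Counter(elem.tag for _, elem in ET.iterparse(f))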
In [250]:
def get_tag_types(file="cologne_germany_sample.osm"):
    """
    Given a valid XML input file, the function returns a dict of keys and corresponding counts of the "tag" attribute
    """
    # create tags dict
    tag_types = {}
    # open file
    with open(file, "r", encoding="utf8") as f:
        # loop over file
        for event, elem in ET.iterparse(f):
            # inspect only "tag" elements
            if elem.tag == "tag":
                # loop over "tag" elements
                for tag in elem.iter("tag"):
                    # check if tag key not in tag_types dict
                    if tag.attrib["k"] not in tag_types:
                        # if not, add key with count 1
                        tag_types[tag.attrib["k"]] = 1
                    else:
                        # if so, increase count
                        tag_types[tag.attrib["k"]] += 1
    return tag_types
In [251]:
tag_types = get_tag_types()
In [252]:
# get first 20 items in tag_types dict
head_input(tag_types)
Out[252]:
In [272]:
# pretty print all items in tag_types dict (produces a long list only used for exploration)
#pprint.pprint(tag_types)
In [254]:
# print tag_types entries with more than 200 occurrences
pprint.pprint({k: v for k, v in tag_types.items() if v > 200})
In [255]:
def get_tag_key(file="cologne_germany_sample.osm", key="FIXME"):
    """
    Given a valid XML input file, the function returns a set of values for the corresponding key of the "tag" attribute
    """
    # create list of values
    tag_keys = []
    # open file
    with open(file, "r", encoding="utf8") as f:
        # loop over file
        for event, elem in ET.iterparse(f):
            # inspect only "tag" elements
            if elem.tag == "tag":
                # loop over "tag" elements
                for tag in elem.iter("tag"):
                    # check if tag key matches the requested key
                    if tag.attrib["k"] == key:
                        # if so, record its value
                        tag_keys.append(tag.attrib["v"])
                    else:
                        continue
    return set(tag_keys)
In [256]:
tag_key_fixme = get_tag_key()
Most of the FIXME notes relate to errors concerning buildings, e.g. where exactly an entrance is located
In [259]:
# pretty print FIXME notes
pprint.pprint(tag_key_fixme)
In [260]:
tag_key_fixed = get_tag_key(key="fixed")
Most of the fixed tags relate to roads and how they are categorized
In [261]:
# pretty print FIXED tags
pprint.pprint(tag_key_fixed)
The correct key to indicate districts seems to be "addr:suburb", since "addr:district" yields only the district of Cologne as a whole
In [262]:
get_tag_key(key="addr:district")
Out[262]:
In [22]:
get_tag_key(key="addr:suburb")
Out[22]:
I was wondering which information is stored with "addr:housename". Apparently no standard has been defined, since it is used for clubs, companies, opening times, addresses, etc. An opportunity for improvement? :-)
In [23]:
get_tag_key(key="addr:housename")
Out[23]:
Sanity check passed: all documents containing "addr:country" refer to Germany (DE = Deutschland = Germany)
In [62]:
get_tag_key(key="addr:country")
Out[62]:
The tag "addr:city" yields another opportunity for improvement. Strictly speaking, the only valid value is "Köln" (German for Cologne). All other values refer either to districts of Cologne or to different cities
In [63]:
get_tag_key(key="addr:city")
Out[63]:
I was curious about the quality of street names and decided to do an upfront visual check. Surprisingly, the quality of street names in Cologne's OSM data is very good: an obvious candidate for trouble is the spelling of street names in Germany (Strasse vs. Straße vs. Str.), which does not seem to be an issue at all here
In [64]:
tag_key_street = get_tag_key(key="addr:street")
In [279]:
# pretty print tag_key_street (produces a long list only used for exploration)
#pprint.pprint(tag_key_street)
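To complement the visual check, spelling variants can also be counted programmatically; a small sketch flagging "Strasse"/"Str." style endings (the suffix pattern is an assumption, "Straße" being the expected spelling):

# count street names that end in a variant spelling
suspicious = re.compile(r'(Strasse|Str\.)$')
flagged = [name for name in tag_key_street if suspicious.search(name)]
print(len(flagged), "of", len(tag_key_street), "street names use a variant spelling")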
Unfortunately, Cologne's data contains two tags for storing postal codes, "addr:postcode" and "postal_code". Both should be merged into a single tag
In [65]:
get_tag_key(key="addr:postcode")
Out[65]:
In [68]:
get_tag_key(key="postal_code")
Out[68]:
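A minimal sketch of how the two tags could be merged during cleaning, assuming we simply rename "postal_code" keys to "addr:postcode" in place:

def merge_postcode_tags(elem):
    """
    Rename "postal_code" tags of a node/way element to "addr:postcode"
    """
    for tag in elem.iter("tag"):
        if tag.attrib["k"] == "postal_code":
            tag.attrib["k"] = "addr:postcode"
    return elem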
Opening hours are a mess, lacking any standard input format
In [273]:
get_tag_key(key="opening_hours")
Out[273]:
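As a rough plausibility check, the values can be matched against the common "Mo-Fr 08:00-18:00" style and everything else flagged (the pattern below is an assumption and deliberately loose):

hours_re = re.compile(r'^([A-Z][a-z](-[A-Z][a-z])?\s+)?\d{2}:\d{2}-\d{2}:\d{2}')
opening_hours = get_tag_key(key="opening_hours")
irregular = [v for v in opening_hours if not hours_re.match(v)]
print(len(irregular), "of", len(opening_hours), "values look irregular")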
Lastly, I was curious about the meaning of "alt_name" and "information". Whereas "alt_name" seems to store alternative names for buildings, "information" seems to store data related to hiking
In [66]:
get_tag_key(key="alt_name")
Out[66]:
In [67]:
get_tag_key(key="information")
Out[67]:
Using the regular expressions defined during the course, the data is audited
In [10]:
# compile regular expressions
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n\-]')
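A quick demonstration of how the three patterns classify some made-up example keys:

for example in ["highway", "addr:street", "addr street", "Name"]:
    if lower.search(example):
        category = "lower"
    elif lower_colon.search(example):
        category = "lower_colon"
    elif problemchars.search(example):
        category = "problemchars"
    else:
        category = "other"
    print(example, "->", category)
# highway -> lower, addr:street -> lower_colon, addr street -> problemchars, Name -> other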
In [11]:
def get_audit_tags(file="cologne_germany_sample.osm"):
    """
    Given a valid XML input file, the function classifies keys and values of "tag"
    elements by the regular expressions above and returns the counts per category
    together with sets of the matching keys and values
    """
    # create counts dict
    audit_tags = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    lower_list_key = []
    lower_list_value = []
    lower_colon_list_key = []
    lower_colon_list_value = []
    other_list_key = []
    other_list_value = []
    problemchars_list_key = []
    problemchars_list_value = []
    # open file
    with open(file, "r", encoding="utf8") as f:
        # loop over file
        for _, elem in ET.iterparse(f):
            if elem.tag == "tag":
                # loop over tags of element
                for tag in elem.iter("tag"):
                    # check for lower in key
                    if re.search(lower, tag.attrib["k"]):
                        # add key to lower list
                        lower_list_key.append(tag.attrib["k"])
                        # increase count
                        audit_tags["lower"] += 1
                    # check for lower in value
                    elif re.search(lower, tag.attrib["v"]):
                        # add value to lower list
                        lower_list_value.append(tag.attrib["v"])
                        # increase count
                        audit_tags["lower"] += 1
                    # check for lower_colon in key
                    elif re.search(lower_colon, tag.attrib["k"]):
                        # add key to lower_colon list
                        lower_colon_list_key.append(tag.attrib["k"])
                        # increase count
                        audit_tags["lower_colon"] += 1
                    # check for lower_colon in value
                    elif re.search(lower_colon, tag.attrib["v"]):
                        # add value to lower_colon list
                        lower_colon_list_value.append(tag.attrib["v"])
                        # increase count
                        audit_tags["lower_colon"] += 1
                    # check for problemchars in key
                    elif re.search(problemchars, tag.attrib["k"]):
                        # add key to problemchars list
                        problemchars_list_key.append(tag.attrib["k"])
                        # increase count
                        audit_tags["problemchars"] += 1
                    # check for problemchars in value
                    elif re.search(problemchars, tag.attrib["v"]):
                        # add value to problemchars list
                        problemchars_list_value.append(tag.attrib["v"])
                        # increase count
                        audit_tags["problemchars"] += 1
                    # else assign other
                    else:
                        # add key to other list
                        other_list_key.append(tag.attrib["k"])
                        # increase count
                        audit_tags["other"] += 1
                        # add value to other list
                        other_list_value.append(tag.attrib["v"])
                        # increase count
                        audit_tags["other"] += 1
    return audit_tags, set(lower_list_key), set(lower_list_value), set(lower_colon_list_key), set(lower_colon_list_value), set(problemchars_list_key), set(problemchars_list_value), set(other_list_key), set(other_list_value)
In [12]:
problemchars_count, pchars_low_key, pchars_low_value, pchars_low_colon_key, pchars_low_colon_value, pchars_list_key, pchars_list_value, pchars_other_key, pchars_other_value = get_audit_tags()
In [14]:
# print problemchar count per category
problemchars_count
Out[14]:
In [15]:
# print lower key tags
pchars_low_key
Out[15]:
In [16]:
# print lower value tags
pchars_low_value
Out[16]:
In [17]:
# print lower_colon keys
pchars_low_colon_key
Out[17]:
In [18]:
# print lower_colon values
pchars_low_colon_value
Out[18]:
In [20]:
# print problemchars keys
pchars_list_key
Out[20]:
In [21]:
# print problemchars values
pchars_list_value
Out[21]:
In [22]:
# print other keys
pchars_other_key
Out[22]:
In [24]:
# print other values
pchars_other_value
Out[24]:
A brief summary of the problems encountered during data exploration:
- "addr:city" contains districts of Cologne and neighboring cities in addition to the only strictly valid value "Köln"
- postal codes are split across two tags, "addr:postcode" and "postal_code"
- "opening_hours" values lack a standard input format
- "addr:housename" is used inconsistently for clubs, companies, opening times, addresses, etc.
The following function is a prototype for fixing such issues in the OSM data. It accepts fixes via predefined mappings and can make use of the problematic keys identified by the get_audit_tags() function.
In [25]:
mapping = {"Köln Rath/Heumar": "Köln",
           "Köln-Nippes": "Köln",
           "51143,51145": "51143"}
In [38]:
def fix_problems(file="cologne_germany_sample.osm", mapping=mapping, problemchars=pchars_list_key):
    """
    Given a valid XML input file, the function applies fixes to keys and values of "tag"
    elements, given a mapping dict and a collection of problematic keys (e.g. pchars_list_key
    from get_audit_tags(); the compiled problemchars regex cannot be used for the membership
    test below)
    """
    fixed_dict = {}
    tag = None
    issue = None
    fix = None
    with open(file, "r", encoding="utf8") as f:
        # loop over file
        for event, elem in ET.iterparse(f):
            # only process "tag" tags
            if elem.tag == "tag":
                # loop over tags of element
                for tag in elem.iter("tag"):
                    # check if value is in mapping dict
                    if tag.attrib["v"] in mapping:
                        # record problematic value
                        issue = tag.attrib["v"]
                        # loop over key value pairs in mapping dict
                        for m in mapping:
                            # apply fixes
                            tag.attrib["v"] = tag.attrib["v"].replace(m, mapping[m])
                        # record fix
                        fix = tag.attrib["v"]
                    # check if key is in problemchars collection
                    elif tag.attrib["k"] in problemchars:
                        # record problematic key
                        issue = tag.attrib["k"]
                        # check for hyphen
                        if re.search(re.compile(r'\-'), tag.attrib["k"]):
                            # fix hyphen
                            tag.attrib["k"] = tag.attrib["k"].replace("-", "_")
                        # check for dot
                        elif re.search(re.compile(r'\.'), tag.attrib["k"]):
                            # fix dot
                            tag.attrib["k"] = tag.attrib["k"].replace(".", ":")
                        # record fixed key
                        fix = tag.attrib["k"]
                    # check if value is in problemchars collection
                    #elif tag.attrib["v"] in problemchars:
                    #    # record problematic value
                    #    issue = tag.attrib["v"]
                    #    # check for hyphen
                    #    if re.search(re.compile(r'\-'), tag.attrib["v"]):
                    #        # fix hyphen
                    #        tag.attrib["v"] = tag.attrib["v"].replace("-", "_")
                    #    # check for dot
                    #    elif re.search(re.compile(r'\.'), tag.attrib["v"]):
                    #        # fix dot
                    #        tag.attrib["v"] = tag.attrib["v"].replace(".", ":")
                    #    # record fixed value
                    #    fix = tag.attrib["v"]
                    # update fixed_dict
                    if issue and fix:
                        fixed_dict[issue] = fix
    return fixed_dict
In [39]:
test = fix_problems()
test
Out[39]:
In [239]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import xml.etree.cElementTree as ET
import pprint
import re
import codecs
import json
CREATED = ["version", "changeset", "timestamp", "user", "uid"]
def shape_element(element):
    # initialize node dict
    node = {}
    # initialize helper dicts and lists
    created = {}
    address = {}
    pos = []
    node_refs = []
    # only process "node" or "way" tags
    if element.tag == "node" or element.tag == "way":
        # add node type to node dict
        node["type"] = element.tag
        # loop over keys of element attributes
        for key in element.attrib:
            # check if attribute is id or visible
            if key == "id" or key == "visible":
                # add key and value to node dict
                node[key] = element.attrib[key]
            # check if key in created array
            elif key in CREATED:
                # add key and value to created dict
                created[key] = element.attrib[key]
            # check if attribute is lat
            elif key == "lat":
                # cast attribute to float and add to pos list
                pos.insert(0, float(element.attrib[key]))
            # check if attribute is lon
            elif key == "lon":
                # cast attribute to float and add to pos list
                pos.insert(1, float(element.attrib[key]))
            else:
                # ignore all other attributes
                continue
        # loop over children of element
        for child in element:
            # check for node references
            if child.tag == "nd":
                # add ref attribute to node_refs list
                node_refs.append(child.attrib["ref"])
            else:
                # check if child key contains problematic characters
                if re.search(problemchars, child.attrib["k"]):
                    # ignore
                    continue
                # check if child key does not contain more than 1 colon
                elif not child.attrib["k"].count(":") > 1:
                    # check if attribute key or value is in mapping
                    if child.attrib["k"] in mapping or child.attrib["v"] in mapping:
                        # loop over mapping dict
                        for m in mapping:
                            # update key according to mapping
                            child.attrib["k"] = child.attrib["k"].replace(m, mapping[m])
                            # update value according to mapping
                            child.attrib["v"] = child.attrib["v"].replace(m, mapping[m])
                    # check if attribute key starts with "addr:"
                    if child.attrib["k"].startswith("addr:"):
                        # strip the "addr:" prefix
                        clean_key = child.attrib["k"].replace("addr:", "")
                        # add key and value to address dict
                        address[clean_key] = child.attrib["v"]
                    else:
                        # add key and value to node dict
                        node[child.attrib["k"]] = child.attrib["v"]
                # if it does contain more than 1 colon
                else:
                    # ignore
                    continue
        # add helper dicts and lists to node dict
        if created:
            # add k,v for created dict
            node["created"] = created
        # if pos list contains elements
        if pos:
            # add k,v for pos list
            node["pos"] = pos
        # if address dict contains elements
        if address:
            # add k,v for address dict
            node["address"] = address
        # if node_refs list contains elements
        if node_refs:
            # add k,v for node_refs list
            node["node_refs"] = node_refs
        return node
    else:
        return None
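As a quick sanity check, shape_element() can be applied to a hand-built node (illustrative values only):

sample_xml = ('<node id="1" lat="50.94" lon="6.96" version="2" changeset="1" '
              'timestamp="2017-01-01T00:00:00Z" user="test" uid="1">'
              '<tag k="addr:city" v="Köln"/></node>')
pprint.pprint(shape_element(ET.fromstring(sample_xml)))
# expected: "type" and "id" at the top level, a "created" sub-dict,
# pos == [50.94, 6.96] and address == {"city": "Köln"}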
In [240]:
# code from case study
def process_map(file_in, pretty = False):
# You do not need to change this file
file_out = "{0}.json".format(file_in)
data = []
with codecs.open(file_out, "w") as fo:
for _, element in ET.iterparse(file_in):
el = shape_element(element)
if el:
data.append(el)
if pretty:
fo.write(json.dumps(el, indent=2)+"\n")
else:
fo.write(json.dumps(el) + "\n")
return data
In [241]:
data = process_map("cologne_germany_sample.osm", pretty=False)
In [223]:
# check if data has been processed correctly
data[-1]
Out[223]:
Using mongoimport from the UNIX shell in the virtual machine:
mongoimport --db osm --collection cologne --type json --file /vagrant/cologne_germany_sample.json
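Alternatively, the JSON lines file can be loaded from Python directly; a minimal sketch using pymongo's insert_many (file path and batch size are assumptions):

import json
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")
collection = client.osm.cologne

# stream the newline-delimited JSON file in batches to keep memory bounded
batch, batch_size = [], 10000
with open("cologne_germany_sample.json", encoding="utf8") as f:
    for line in f:
        batch.append(json.loads(line))
        if len(batch) == batch_size:
            collection.insert_many(batch)
            batch = []
if batch:
    collection.insert_many(batch)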
In [12]:
# file size of sample JSON: 41 MB
!ls -lh cologne_germany_sample.json
In [1]:
# setup pymongo, connect to MongoDB and select osm database
from pymongo import MongoClient
client = MongoClient("mongodb://localhost:27017")
db = client.osm.cologne
In [50]:
# test connection
db.find_one()
Out[50]:
In [129]:
# helper function to display cursor
def get_cursor(query):
"""
Given a valid MongoDB aggregation query, the function returns its result as a list
"""
print(list(query))
In [5]:
# number of documents in database
db.find().count()
Out[5]:
In [51]:
# number of nodes
db.find({"type": "node"}).count()
Out[51]:
In [52]:
# number of ways
db.find({"type": "way"}).count()
Out[52]:
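Note: the Cursor.count() calls above were removed in pymongo 4.x; on newer releases the equivalent queries are:

db.count_documents({})                  # all documents
db.count_documents({"type": "node"})    # nodes only
db.count_documents({"type": "way"})     # ways only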
In [61]:
# number of unique users
len(db.distinct("created.user"))
Out[61]:
In [185]:
# top 5 users by number of created documents
user_query = db.aggregate([
{"$group": {"_id": "$created.user", "count": {"$sum": 1}}},
{"$sort": {"count": -1}},
{"$limit" : 5}
])
The top 5 users created roughly 70% of the documents in the database (130536/187163)
In [186]:
get_cursor(user_query)
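To double-check the 70% figure programmatically, the top-5 counts can be summed and divided by the total (re-running the aggregation, since the cursor above is already consumed):

top5 = list(db.aggregate([
    {"$group": {"_id": "$created.user", "count": {"$sum": 1}}},
    {"$sort": {"count": -1}},
    {"$limit": 5}
]))
share = sum(doc["count"] for doc in top5) / db.find().count()
print("{:.1%}".format(share))  # roughly 70%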
In [194]:
# count number of shops
shop_query = db.aggregate([
{"$match": {"shop": {"$exists": 1}}},
{"$group": {"_id": None, "count": {"$sum": 1}}}
])
In [195]:
get_cursor(shop_query)
In [212]:
# count number of documents in the Cologne-Ehrenfeld district (the district I used to live in)
ehrenfeld_query = db.aggregate([
{"$match": {"address.postcode": "50823"}},
{"$group": {"_id": None, "count": {"$sum": 1}}}
])
In [213]:
get_cursor(ehrenfeld_query)
In [274]:
# top 5 amenities in Cologne
amenity_query = db.aggregate([
{"$match": {"amenity": {"$exists": 1}}},
{"$group": {"_id": "$amenity", "count" : {"$sum": 1}}},
{"$sort": {"count": -1}},
{"$limit" : 5}
])
Cologne seems to be a city where parking is difficult, since the top amenity is parking (with bicycle parking in fourth position). Further, apart from restaurants and benches, post boxes are also listed
In [275]:
get_cursor(amenity_query)
In [277]:
# top 3 cuisines in Cologne
cuisine_query = db.aggregate([
{"$match": {"cuisine": {"$exists": 1}}},
{"$group": {"_id": "$cuisine", "count" : {"$sum": 1}}},
{"$sort": {"count": -1}},
{"$limit" : 3}
])
Although only a few documents contain cuisine tags, the result is not unusual for Cologne. Being a city with large Italian and Turkish communities, the top three cuisines are Italian, Turkish (kebab is my proxy for Turkish here) and German.
In [278]:
get_cursor(cuisine_query)
In [228]:
# Which historic sites exist in Cologne?
historic_query = db.aggregate([
{"$match": {"historic": {"$exists": 1}}},
{"$group": {"_id": "$historic", "count" : {"$sum": 1}}},
{"$sort": {"count": -1}}
])
The most frequent historic site is labeled memorial, probably related to World War II
In [229]:
get_cursor(historic_query)
In [183]:
# Which are the top 5 shop categories in Cologne?
shop_query2 = db.aggregate([
{"$match": {"shop": {"$exists": 1}}},
{"$group": {"_id": "$shop", "count" : {"$sum": 1}}},
{"$sort": {"count": -1}},
{"$limit" : 5}
])
No surprises here: the kiosk, a.k.a. "Büdchen" (German for a small shop where you can buy alcohol late at night), is the top shop category in Cologne
In [184]:
get_cursor(shop_query2)
In [224]:
# Which leisure venues exist in Cologne?
leisure_query = db.aggregate([
{"$match": {"leisure": {"$exists": 1}}},
{"$group": {"_id": "$leisure", "count" : {"$sum": 1}}},
{"$sort": {"count": -1}}
])
After consulting Google Translate, I learned that "pitch" is considered a synonym for "playground". Thus, having pitch as the most frequently listed leisure venue in Cologne makes sense
In [225]:
get_cursor(leisure_query)
In [220]:
# How many gay venues exist in Cologne?
gay_query = db.aggregate([
{"$match": {"gay": {"$exists": 1}}},
{"$group": {"_id": None, "count" : {"$sum": 1}}},
])
Apparently only one gay venue exists in our sample, which is clearly a strange result for Cologne (Germany's gay capital).
In [221]:
get_cursor(gay_query)
After my short review of Cologne's OSM data, I am surprised by the general quality of the data, especially with regard to street names. Additionally, OSM users mark documents that need to be fixed with corresponding tags (FIXME), which makes it easy for new users to contribute. Two problems could be fixed easily (the duplicate postal code tags and the invalid city tags); furthermore, opening times are recorded without a standard input format. Considering Cologne's status as Germany's gay capital, it is surprising that only one document carries the tag "gay".