In [128]:
import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import codecs
import json
from pprint import pprint
In [129]:
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
street_type_portuguese_re = re.compile(r'^\b\S+\.?', re.IGNORECASE)

# method to audit unexpected street types in the data
def audit_street_type(street_types, street_name, expected):
    m = street_type_portuguese_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            # the collected street type is unknown
            street_types[street_type].add(street_name)

# checks whether an element is a street
def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")

def audit(osmfile, expected):
    osm_file = open(osmfile, "r")
    street_types = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):
        # look only at **node** and **way** elements
        if elem.tag == "node" or elem.tag == "way":
            # iterate over child **tag** elements and audit the ones that hold a street name
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'], expected)
    return street_types

# converts abbreviated street names to an extended, unique format (e.g., R. da Forca --> Rua da Forca)
def update_name(name, mapping):
    m = street_type_portuguese_re.search(name)
    if m:
        street_type = m.group()
        updated_street_type = mapping.get(street_type)
        if updated_street_type:
            name = name.replace(street_type, updated_street_type)
    return name
In [130]:
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

CREATED = ["version", "changeset", "timestamp", "user", "uid"]

def shape_element(element):
    node = {}
    if element.tag == "node" or element.tag == "way":
        # keep the tag name and copy all XML attributes
        node['type'] = element.tag
        node.update(element.attrib)
        # store the coordinates as a [lat, lon] pair of floats
        latitude = node.pop('lat', None)
        longitude = node.pop('lon', None)
        if latitude and longitude:
            node['pos'] = [float(latitude), float(longitude)]
        # move the creation metadata into its own sub-document
        node['created'] = {}
        for created_attr in CREATED:
            created_attr_value = node.pop(created_attr, None)
            if created_attr_value:
                node['created'][created_attr] = created_attr_value
        for tag in element.iter('tag'):
            k = tag.attrib['k']
            if problemchars.findall(k):
                pass
            elif k[:5] == 'addr:':
                address_key = k[5:]
                if lower_colon.findall(address_key):
                    pass
                else:
                    if 'address' not in node.keys():
                        node['address'] = {}
                    value = tag.attrib.get('v')
                    if address_key == 'street':
                        # 'mapping' is the global abbreviation mapping defined further below
                        value = update_name(value, mapping)
                    node['address'][address_key] = value
            else:
                node[k] = tag.attrib.get('v')
        node_refs = [tag.get('ref') for tag in element.iter('nd')]
        if node_refs:
            node['node_refs'] = node_refs
        return node
    else:
        return None

def process_map(file_in, pretty=False):
    file_out = "{0}.json".format(file_in)
    data = []
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if el:
                data.append(el)
                if pretty:
                    fo.write(json.dumps(el, indent=2) + "\n")
                else:
                    fo.write(json.dumps(el) + "\n")
    return data
In this project I have used the metro area of Porto, Portugal. If you want to run this code, please download the OSM XML data from Mapzen and extract it into the directory of this notebook.
The direct link is: https://s3.amazonaws.com/metro-extracts.mapzen.com/porto_portugal.osm.bz2
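For convenience, here is a minimal download-and-decompress sketch, assuming the Mapzen URL above is still reachable:

import urllib
import bz2

urllib.urlretrieve('https://s3.amazonaws.com/metro-extracts.mapzen.com/porto_portugal.osm.bz2',
                   'porto_portugal.osm.bz2')
# decompress in 1 MB chunks to avoid holding the whole extract in memory
src = bz2.BZ2File('porto_portugal.osm.bz2')
with open('porto_portugal.osm', 'wb') as dst:
    for chunk in iter(lambda: src.read(1024 * 1024), ''):
        dst.write(chunk)
src.close()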
In [131]:
path = 'porto_portugal.osm'
#path = 'porto_portugal_sample.osm'
In [132]:
import os
print "Dataset size is %.1fMB" % (os.path.getsize(path) / 1000000.)
In [133]:
def get_user(element):
    return element.attrib.get('user')

users = set()
for _, element in ET.iterparse(path):
    user = get_user(element)
    if user:
        users.add(user)
In [134]:
print "----------------------"
print " First 20 users"
print "----------------------"
for u in list(users)[:20]:
print u
print '...'
print "----------------------"
print "Total number of unique users: %d"%len(users)
The following function is used to count all the tags that appear in our XML document.
In [135]:
def count_tags(filename):
    # parse the whole document and count every tag, including the root
    tree = ET.parse(filename)
    root = tree.getroot()
    tags_count = {root.tag: 1}
    for child in root.findall('.//'):
        tag = child.tag
        tags_count[tag] = tags_count.get(tag, 0) + 1
    return tags_count
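Note that ET.parse loads the entire document into memory at once. A streaming alternative, sketched below with iterparse, should produce the same counts while keeping memory usage flat:

def count_tags_iter(filename):
    tags_count = {}
    for _, elem in ET.iterparse(filename):
        tags_count[elem.tag] = tags_count.get(elem.tag, 0) + 1
        elem.clear()  # release each processed element to keep memory usage flat
    return tags_count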
In [136]:
tags = count_tags(path)
%store tags
The following tags and their respective counts can be found in our data:
In [137]:
pprint(tags)
The first thing I noticed in the data is that street names are written in Portuguese, which means that the conventional parsing techniques for English street names do not apply. For Portuguese, the following guidelines help to define a first version of the parser for street addresses:
- The street type is the first word of the name (e.g., Rua da Liberdade), not the last as in English, which is why street_type_portuguese_re matches the beginning of the string.
- Common street types include Rua, Avenida, Estrada and Travessa.
- Street types frequently appear abbreviated (e.g., R. or Av.).
Using the audit method, I will try to find other Portuguese street types. This method collects all the street addresses that are not recognized by the parser, grouping them by inferred street type. It infers the street type by extracting the first word of the address.
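As a quick illustration, the first-word extraction behaves as follows (the sample street names below are made up):

for example in ['Rua da Liberdade', 'Av. dos Aliados', 'Travessa Nova']:
    print street_type_portuguese_re.search(example).group()
# prints Rua, then Av., then Travessa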
In [151]:
expected=["Rua", "Avenida", "Estrada", "Travessa"]
audit(path, expected)
Out[151]:
We can see that many street types were not being included. Let's add them.
In [152]:
expected = ["Rua", "Avenida", "Estrada", "Travessa", "Viela", "Zona", "Praceta", "Praça", "Calçada", "Largo", "Lugar", "Campo", "Ciclovia", "Caminho", "Via"]
audit(path, expected)
Out[152]:
Another issue is that some street types are written using abbreviations. E.g., sometimes Avenida appears as Av. The update_name method is going to be used to expand these abbreviated names.
In [165]:
mapping = {
    "R": "Rua",
    "R.": "Rua",
    "Praca": "Praça",
    "Av.": "Avenida",
    "Calcada": "Calçada"
}

name = 'R. da Liberdade'
print '%s => %s' % (name, update_name(name, mapping))
Portuguese has some Latin characters that can be troublesome, such as the ç in Praça. In Python 2 these show up as escape sequences like Pra\xe7a, which are hard to read. Although these characters render correctly with the right encoding, we will convert the names to UTF-8 so that Python handles them consistently.
Let's do this by updating the update_name method and the mapping dictionary.
In [166]:
def update_name(name, mapping):
    m = street_type_portuguese_re.search(name)
    if m:
        street_type = m.group()
        updated_street_type = mapping.get(street_type)
        if updated_street_type:
            # encode to UTF-8 so unicode and byte-string names are handled alike
            name = name.encode('utf-8').replace(street_type.encode('utf-8'), updated_street_type)
    return name
In [167]:
mapping = {
    "R": "Rua",
    "R.": "Rua",
    "Praca": "Praça",
    "Av.": "Avenida",
    "Calcada": "Calçada",
    u'Urbaniza\xe7\xe3o': 'Urbanização',
    u'Pra\xe7a': 'Praça',
}

name = u'Pra\xe7a da Liberdade'
updated_name = update_name(name, mapping)
print '%s => %s' % (name.encode('utf-8'), updated_name)
In [153]:
def count_cities(filename):
    # count occurrences of each distinct (lower-cased) addr:city value
    tree = ET.parse(filename)
    root = tree.getroot()
    cities_count = {}
    for child in root.findall('.//tag'):
        if child.attrib['k'] == 'addr:city':
            city = child.attrib['v'].lower()
            cities_count[city] = cities_count.get(city, 0) + 1
    return cities_count
In [154]:
cities = count_cities(path)
In [155]:
len(cities)
Out[155]:
We have 80 different city names. According to Wikipedia (https://en.wikipedia.org/wiki/Metropolitan_Area_of_Porto#Population), Porto metropolitan area has only 18 cities:
In [158]:
city_names = [
    'vila nova de gaia',
    'santo tirso',
    'trofa',
    'arouca',
    'oliveira de azeméis',
    'santa maria da feira',
    'são joão da madeira',
    'vale de cambra',
    'espinho',
    'gondomar',
    'maia',
    'matosinhos',
    'porto',
    'póvoa de varzim',
    'valongo',
    'vila do conde',
    'vila nova de gaia',
    'paredes',
    'paços de ferreira',  # outside metropolitan area
    'penafiel',  # outside metropolitan area
]
From the list above, we can explain this large number of city names by the fact that many users have given the name of the civil parish instead.
Sometimes both the civil parish and the city are written (e.g., 'canidelo - vila nova de gaia', where Vila Nova de Gaia is the city) and other times only the parish is given (e.g., 'canidelo').
In addition, cities with long names sometimes appear abbreviated. E.g., Vila Nova de Gaia appears once as V. N. Gaia.
In [159]:
pprint(cities)
In [160]:
parish_city_mapping = {
    'canidelo': 'vila nova de gaia',
    'canelas': 'vila nova de gaia',
    'mafamude': 'vila nova de gaia',
    'grijó': 'vila nova de gaia',
    'madalena': 'vila nova de gaia',
    'sandim': 'vila nova de gaia',
    'vilar de andorinho': 'vila nova de gaia',
    'baltar': 'paredes',
    'ermesinde': 'valongo',
    'custóias': 'matosinhos',
    'castêlo da maia': 'matosinhos',
    'leça da palmeira': 'matosinhos',
    'são mamede de infesta': 'matosinhos',
    'leça do balio': 'matosinhos',
    'mindelo': 'vila do conde',
    'bonfim': 'porto',
    'paranhos': 'porto',
    'ramalde': 'porto',
    'campanhã': 'porto',
    'senhora da hora': 'matosinhos',
    'valadares': 'vila nova de gaia',
    'águas santas': 'maia',
    'pedrouços': 'maia',
    'lousado': u'vila nova de famalic\xe3o',
    'alfena': 'valongo',
    'roriz': 'santo tirso',
    'lamelas': 'santo tirso',
    'baguim do monte': 'gondomar',
    'fajozes': 'vila do conde',
    's.pedro de formariz': 'vila do conde',
    'mosteiró': 'vila do conde',
    'lourosa': 'santa maria da feira',
    'fiães': 'santa maria da feira',
    'argoncilhe': 'santa maria da feira',
    'nogueira da regedoura': 'santa maria da feira',
    'ribeirão': 'trofa',
    'guidões': 'trofa',
    'gandra': 'paredes',
    'modelos': u'pa\xe7os de ferreira',
    'rio tinto': 'gondomar',
    'povoa de varzim': u'p\xf3voa de varzim',
    'lagares': 'penafiel',
    'guetim': 'espinho',
}
In [161]:
def update_city_name(name, mapping, city_names):
    # normalize to lower-case UTF-8 so lookups match the mapping keys
    name = name.lower().encode('utf-8')
    updated_name = mapping.get(name)
    if updated_name:
        return updated_name
    # handle values such as 'canidelo - vila nova de gaia' that embed the city name
    if name not in city_names:
        for city in city_names:
            if city in name:
                return city
    return name.decode('utf-8')
In [162]:
updated_cities = {}
for (city, count) in cities.iteritems():
    updated_city = update_city_name(city, parish_city_mapping, city_names)
    updated_cities[updated_city] = (updated_cities.get(updated_city) or 0) + count

print len(updated_cities)
pprint(updated_cities)
Another idea would be to retrieve the areas that have been updated recently (in the last day/week/month).
This could be achieved by filtering nodes on the timestamp attribute according to the intended time window. The main issue when dealing with date and time values is usually timezone normalization.
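A minimal sketch of such a filter, assuming OSM timestamps follow the usual 'YYYY-MM-DDTHH:MM:SSZ' UTC format and ignoring finer timezone subtleties (the function name is hypothetical):

from datetime import datetime, timedelta

def recently_updated_ids(osmfile, days=7):
    cutoff = datetime.utcnow() - timedelta(days=days)
    recent = []
    for _, elem in ET.iterparse(osmfile):
        if elem.tag in ('node', 'way') and 'timestamp' in elem.attrib:
            ts = datetime.strptime(elem.attrib['timestamp'], '%Y-%m-%dT%H:%M:%SZ')
            if ts >= cutoff:
                recent.append(elem.attrib.get('id'))
        elem.clear()
    return recent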
Another idea would be to present the places that need data, so that users willing to contribute could be aware of which places are more important. One way of doing this might be, given a pair of coordinates (longitude, latitude), to count all the documents within a range of 1 km.
The main issue would be the fact that streets in a city center are more complex than in a rural area, so the city center will need a higher density of nodes in order to be complete.
Another issue that can arise is how to define the places that are going to be listed. So far, the solution only mentions coordinates, which are continuous values. One idea might be to get a list of all the postal codes in the Porto area and use a web service (e.g., the CTT postal code service) to collect the respective coordinates.
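The 1 km count could be sketched with MongoDB's geospatial operators. This is a hypothetical sketch: it assumes pos is stored in [longitude, latitude] order (our pipeline stored [latitude, longitude], so the values would need to be swapped first) and that a geospatial index exists, e.g. porto_portugal_collection.create_index([('pos', '2d')]):

EARTH_RADIUS_KM = 6371.0

def count_documents_within(collection, lon, lat, radius_km=1.0):
    # $centerSphere expects the radius in radians, hence the division
    query = {"pos": {"$geoWithin": {"$centerSphere": [[lon, lat], radius_km / EARTH_RADIUS_KM]}}}
    return collection.find(query).count()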
In [168]:
json_data = process_map("porto_portugal.osm")
In this project I have used a MongoLab database to host my MongoDB database. After creating the database p03, I imported the JSON data into the porto collection by running the following line in a terminal:
$ mongoimport -h ds055565.mongolab.com:55565 -d p03 -c porto -u <admin_user> -p <admin_pass> --file porto_portugal.osm.json
In order to run a few queries against this database, I used PyMongo, the MongoDB Python driver.
In [169]:
from pymongo import MongoClient
client = MongoClient("mongodb://p03_guest:p03_guest@ds055565.mongolab.com:55565/p03")
db = client.p03
porto_portugal_collection = db['porto']
In [219]:
porto_portugal_collection.find().count()
Out[219]:
In [220]:
porto_portugal_collection.find({"type":"node"}).count()
Out[220]:
In [218]:
porto_portugal_collection.find({"type":"way"}).count()
Out[218]:
Unique users had already been counted using the XML parser. Now there may be fewer users, since only node and way elements (and not relations) were kept during the cleaning step.
In [176]:
distinct_users = porto_portugal_collection.distinct("created.user")
In [216]:
len(distinct_users)
Out[216]:
In [188]:
cursor = porto_portugal_collection.aggregate([
    {"$group": {"_id": "$created.user", "count": {"$sum": 1}}},
    {"$sort": {"count": -1}},
    {"$limit": 20}
])
In [189]:
users_top20 = list(cursor)
In [215]:
for user in users_top20:
    print "%30s ...... %d" % (user['_id'], user['count'])