Wrangle OpenStreetMap Data

Luis Cruz - luismirandacruz (at) gmail (dot) com


In [128]:
import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import codecs
import json
from pprint import pprint

Count tags

The following routine counts the tags for each


In [129]:
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
street_type_portuguese_re = re.compile(r'^\b\S+\.?', re.IGNORECASE)

# method to audit unexpected street types in data
def audit_street_type(street_types, street_name, expected):
    m = street_type_portuguese_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            # the collected street type is unknown
            street_types[street_type].add(street_name)

#checks whether an element is a street
def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")


def audit(osmfile, expected):
    osm_file = open(osmfile, "r")
    street_types = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):
        #look only for elements **node** and **way**
        if elem.tag == "node" or elem.tag == "way":
            #iterate over children **tag** elements and analyze the ones that have a str
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'], expected)

    return street_types


#method that converts abbreviated street names to an extensive and unique format(e.g, R. da Forca --> Rua da Forca)
def update_name(name, mapping):
    m = street_type_portuguese_re.search(name)
    if m:
        street_type = m.group()
        updated_street_type = mapping.get(street_type)
        if updated_street_type:
            name = name.replace(street_type, updated_street_type)
    return name

In [130]:
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

CREATED = [ "version", "changeset", "timestamp", "user", "uid"]

def shape_element(element):
    node = {}
    if element.tag == "node" or element.tag == "way" :
        # YOUR CODE HERE
        node['type'] = element.tag
        node.update(element.attrib)
        latitude = node.pop('lat', None)
        longitude = node.pop('lon', None)
        if latitude and longitude:
            node['pos'] = [float(latitude), float(longitude)]
        node['created'] = {}
        for created_attr in CREATED:
            created_attr_value = node.pop(created_attr, None )
            if created_attr_value:
                node['created'][created_attr] = created_attr_value
        
        for tag in  element.iter('tag'):
            k = tag.attrib['k']
            if problemchars.findall(k):
                pass
            elif k[:5] == 'addr:':
                address_key = k[5:]
                if lower_colon.findall(address_key):
                    pass
                else:
                    if 'address' not in node.keys():
                        node['address'] = {}
                    value = tag.attrib.get('v')
                    if address_key =='street':
                        value = update_name(value, mapping)
                    node['address'][address_key] = value
            else:
                node[k] = tag.attrib.get('v')
        node_refs = [tag.get('ref') for tag in element.iter('nd')]
        if node_refs:
            node['node_refs'] = node_refs
        
        return node
    else:
        return None


def process_map(file_in, pretty = False):
    # You do not need to change this file
    file_out = "{0}.json".format(file_in)
    data = []
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if el:
                data.append(el)
                if pretty:
                    fo.write(json.dumps(el, indent=2)+"\n")
                else:
                    fo.write(json.dumps(el) + "\n")
    return data

Overview of the Data

In this project I have used the metro area of Porto, Portugal. If you want to run this code, please download the OSM XML data from Map Zen and extract it in the directory of this Notebook.

The direct link is the following: https://s3.amazonaws.com/metro-extracts.mapzen.com/porto_portugal.osm.bz2


In [131]:
path = 'porto_portugal.osm'
#path = 'porto_portugal_sample.osm'

Size of the file


In [132]:
import os
print "Dataset size is %.1fMB"%(os.path.getsize(path)/1000000.)


Dataset size is 151.8MB

Number of unique users

The following users were found in our data:


In [133]:
def get_user(element):
    return element.attrib.get('user')

users = set()
for _, element in ET.iterparse(path):
    user = get_user(element)
    if user:
        users.add(user)

In [134]:
print "----------------------"
print "    First 20 users"
print "----------------------"
for u in list(users)[:20]:
    print u
print '...'
print "----------------------"
print "Total number of unique users: %d"%len(users)


----------------------
    First 20 users
----------------------
Paulo Salvador
RuiPinto_12
fribeiro
nscerqueira
jfig
Marco Vergueira
paulamir
alvieboy
elsevilla
OSMF Redaction Account
noperante
PortugalMaps
Bruno Loureiro
Ropino
AndrewBuck
xybot
meldias
Ruca x16
Lobinho
djpatricio
...
----------------------
Total number of unique users: 905

Number of different tags (including nodes and ways)

The following function is used to count all the tags that appear in our XML document.


In [135]:
def count_tags(filename):
        # YOUR CODE HERE
        tree = ET.parse(filename)
        root = tree.getroot()

        tags_count = {root.tag: 1}
        for child in root.findall('.//'):
            tag = child.tag
            tags_count[tag] = tags_count.get(tag, 0) + 1
        return tags_count

In [136]:
tags = count_tags(path)
%store tags


Stored 'tags' (dict)

The following tags and respective count can be found in our data:


In [137]:
pprint(tags)


{'bounds': 1,
 'member': 12400,
 'nd': 891696,
 'node': 682565,
 'osm': 1,
 'relation': 1214,
 'tag': 295483,
 'way': 104136}

Problems encountered in the map

The first thing a noticed in the data is that street names are written in Portuguese, which means that the conventional parsing techniques for English streets does not apply. In Portuguese the following guidelines may help to define the first version of parser for street addresses:

  • Street types appear in the first word of the address
  • Street types are commonly Rua, Avenida, Estrada, among others.

Using the method audit I will try to find out other Portuguese street types. This method basically collects all the street addresses that are not recognized by the parser, grouping them per inferred street type. It infers the street type by extracting the first word of the address.


In [151]:
expected=["Rua", "Avenida", "Estrada", "Travessa"]
audit(path, expected)


Out[151]:
defaultdict(set,
            {'25': {'25 Abril'},
             'Alameda': {u'Alameda Bas\xedlio Teles',
              'Alameda Futebol Clube de Infesta',
              u'Alameda de S\xe3o Silvestre'},
             u'Av': {u'Av Lu\xeds de Cam\xf5es'},
             'Av.': {u'Av. Men\xe9res',
              'Av. Pedra Verde',
              'Av. Principal',
              'Av. da Pedra Verde'},
             'Brito': {'Brito Capelo'},
             'Cais': {'Cais das Lavandeiras'},
             'Calcada': {'Calcada da Feira dos Dez'},
             u'Cal\xe7ada': {u'Cal\xe7ada da Cabine',
              u'Cal\xe7ada da Junqueira',
              u'Cal\xe7ada da Serra',
              u'Cal\xe7ada de Fontela',
              u'Cal\xe7ada de Valinhos',
              u'Cal\xe7ada do Arco'},
             'Caminho': {'Caminho de Vilar'},
             u'Campo': {u'Campo dos M\xe1rtires da P\xe1tria'},
             'Ciclovia': {'Ciclovia da Foz'},
             u'Costa': {u'Costa Padr\xe3o'},
             'EN': {'EN 204/5'},
             'Esplanada': {'Esplanada do Rio de Janeiro'},
             'Ladeira': {'Ladeira da Quinta Nova'},
             u'Largo': {u'Largo 1\xba de Dezembro',
              'Largo 25 de Abril',
              u'Largo Fran\xe7a Borges',
              'Largo Padre Baltazar Guedes',
              'Largo Padre Joaquim Pereira Santos',
              u'Largo Padre Lu\xeds Gonzaga Queir\xf3s',
              u'Largo Padre Sa\xfade',
              u'Largo da Esta\xe7\xe3o',
              'Largo da Feira dos Dez',
              u'Largo de Santo Ant\xf3nio',
              u'Largo de S\xe3o Bento',
              u'Largo de S\xe3o Domingos',
              u'Largo dos L\xf3ios'},
             'Lugar': {'Lugar da Igreja'},
             'Medas': {'Medas'},
             'Mercado': {'Mercado Ferreira Borges'},
             'Nacional': {'Nacional 1', 'Nacional 1/IC2'},
             'Nossa': {'Nossa Senhora do Amparo'},
             'Padre': {'Padre Ricardo Neto'},
             'Paramos': {'Paramos'},
             'Praceta': {'Praceta 5 de Outubro',
              'Praceta Escultor Alves de Sousa',
              u'Praceta Jaime Cortes\xe3o',
              u'Praceta Manuel Gon\xe7alves Ramos',
              'Praceta Parque Nascente',
              'Praceta Professor Egas Moniz',
              u'Praceta Z\xe9 Telhado',
              'Praceta da Ranha',
              'Praceta de Macau',
              'Praceta escultor Alves de Sousa'},
             u'Pra\xe7a': {u'Pra\xe7a 5 de Outubro',
              u'Pra\xe7a Cid. Salvador',
              u'Pra\xe7a Coronel Pacheco',
              u'Pra\xe7a Dom Afonso V',
              u'Pra\xe7a Dom Jo\xe3o I',
              u'Pra\xe7a Dona Filipa de Lencastre',
              u'Pra\xe7a Manuel Guedes',
              u'Pra\xe7a Marqu\xeas de Pombal',
              u'Pra\xe7a Mouzinho de Albuquerque',
              u'Pra\xe7a Vasco da Gama',
              u'Pra\xe7a da Batalha',
              u'Pra\xe7a da Liberdade',
              u'Pra\xe7a da Rep\xfablica',
              u'Pra\xe7a da Ribeira',
              u'Pra\xe7a de Carlos Alberto',
              u'Pra\xe7a do Almada',
              u'Pra\xe7a do Ex\xe9rcito Libertador',
              u'Pra\xe7a dos Poveiros'},
             u'Pra\xe7eta': {u'Pra\xe7eta Professor Sampaio'},
             u'P\xe1tio': {u'P\xe1tio das Escadas do Monte dos Judeus'},
             u'R': {u'R Escola Preparat\xf3ria'},
             'R.': {'R. Adelino Amaro da Costa', 'R. Silva Brinco'},
             'R.S.Pedro': {'R.S.Pedro de Formariz'},
             'RUA': {'RUA CONDE FERREIRA',
              'RUA Central de Gens',
              'RUA DRAGOES SANDINENSES'},
             u'Urbaniza\xe7\xe3o': {u'Urbaniza\xe7\xe3o Industrial da Carri\xe7a',
              u'Urbaniza\xe7\xe3o Industrial do Soeiro'},
             'Via': {'Via Futebol Clube do Porto', u'Via Panor\xe2mica'},
             'Viela': {'Viela da Portela',
              'Viela do Pocinho',
              'Viela dos Congregados'},
             u'Zona': {'Zona Industrial Maia I Sector X - Lote 384',
              u'Zona Industrial da Carri\xe7a, lote 19/20'},
             'da': {'da fonte'},
             'do': {'do Sol'}})

We can see that there were many street types that were not being included. Let's add it.


In [152]:
expected = ["Rua", "Avenida", "Estrada", "Travessa", "Viela", "Zona", "Praceta", "Praça", "Calçada", "Largo", "Lugar", "Campo", "Ciclovia", "Caminho", "Via"]
audit(path, expected)


/Users/luiscruz/dev/udacity_data_analyst/venv/lib/python2.7/site-packages/ipykernel/__main__.py:9: UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal
Out[152]:
defaultdict(set,
            {'25': {'25 Abril'},
             'Alameda': {u'Alameda Bas\xedlio Teles',
              'Alameda Futebol Clube de Infesta',
              u'Alameda de S\xe3o Silvestre'},
             u'Av': {u'Av Lu\xeds de Cam\xf5es'},
             'Av.': {u'Av. Men\xe9res',
              'Av. Pedra Verde',
              'Av. Principal',
              'Av. da Pedra Verde'},
             'Brito': {'Brito Capelo'},
             'Cais': {'Cais das Lavandeiras'},
             'Calcada': {'Calcada da Feira dos Dez'},
             u'Cal\xe7ada': {u'Cal\xe7ada da Cabine',
              u'Cal\xe7ada da Junqueira',
              u'Cal\xe7ada da Serra',
              u'Cal\xe7ada de Fontela',
              u'Cal\xe7ada de Valinhos',
              u'Cal\xe7ada do Arco'},
             u'Costa': {u'Costa Padr\xe3o'},
             'EN': {'EN 204/5'},
             'Esplanada': {'Esplanada do Rio de Janeiro'},
             'Ladeira': {'Ladeira da Quinta Nova'},
             'Medas': {'Medas'},
             'Mercado': {'Mercado Ferreira Borges'},
             'Nacional': {'Nacional 1', 'Nacional 1/IC2'},
             'Nossa': {'Nossa Senhora do Amparo'},
             'Padre': {'Padre Ricardo Neto'},
             'Paramos': {'Paramos'},
             u'Pra\xe7a': {u'Pra\xe7a 5 de Outubro',
              u'Pra\xe7a Cid. Salvador',
              u'Pra\xe7a Coronel Pacheco',
              u'Pra\xe7a Dom Afonso V',
              u'Pra\xe7a Dom Jo\xe3o I',
              u'Pra\xe7a Dona Filipa de Lencastre',
              u'Pra\xe7a Manuel Guedes',
              u'Pra\xe7a Marqu\xeas de Pombal',
              u'Pra\xe7a Mouzinho de Albuquerque',
              u'Pra\xe7a Vasco da Gama',
              u'Pra\xe7a da Batalha',
              u'Pra\xe7a da Liberdade',
              u'Pra\xe7a da Rep\xfablica',
              u'Pra\xe7a da Ribeira',
              u'Pra\xe7a de Carlos Alberto',
              u'Pra\xe7a do Almada',
              u'Pra\xe7a do Ex\xe9rcito Libertador',
              u'Pra\xe7a dos Poveiros'},
             u'Pra\xe7eta': {u'Pra\xe7eta Professor Sampaio'},
             u'P\xe1tio': {u'P\xe1tio das Escadas do Monte dos Judeus'},
             u'R': {u'R Escola Preparat\xf3ria'},
             'R.': {'R. Adelino Amaro da Costa', 'R. Silva Brinco'},
             'R.S.Pedro': {'R.S.Pedro de Formariz'},
             'RUA': {'RUA CONDE FERREIRA',
              'RUA Central de Gens',
              'RUA DRAGOES SANDINENSES'},
             u'Urbaniza\xe7\xe3o': {u'Urbaniza\xe7\xe3o Industrial da Carri\xe7a',
              u'Urbaniza\xe7\xe3o Industrial do Soeiro'},
             'da': {'da fonte'},
             'do': {'do Sol'}})

Another issue is that some street types are written using abreviations. E.g., Sometimes Avenida appears as Av.. The method _updatename is going to be used to update these


In [165]:
mapping = {
    "R": "Rua",
    "R.": "Rua",
    "Praca": "Praça",
    "Av.": "Avenida",
    "Calcada": "Calçada"
}
name = 'R. da Liberdade'
print '%s => %s'%(name, update_name(name, mapping))


R. da Liberdade => Rua da Liberdade

Portuguese has some latin characters that can be a trouble, such as ç in Praça. Instead, we read Pra\xe7a, which does not make sense. Although these weird characters might make sense with different encoding, we will also convert them to something Python can recognize.

Let's do this by updating update_name method and mapping dictionary.


In [166]:
def update_name(name, mapping):
    m = street_type_portuguese_re.search(name)
    if m:
        street_type = m.group()
        updated_street_type = mapping.get(street_type)
        if updated_street_type:
            name = name.encode('utf-8').replace(street_type.encode('utf-8'), updated_street_type)
    return name

In [167]:
mapping = {
    "R": "Rua",
    "R.": "Rua",
    "Praca": "Praça",
    "Av.": "Avenida",
    "Calcada": "Calçada",
    u'Urbaniza\xe7\xe3o': 'Urbanização',
    u'Pra\xe7a': 'Praça',
}
name = u'Pra\xe7a da Liberdade'
updated_name = update_name(name, mapping)
print '%s => %s'%(name.encode('utf-8'), updated_name)


Praça da Liberdade => Praça da Liberdade

City Names

Sometimes the name of cities can also be an issue. Some users may use acronyms, others might suppress some propositions when present in the name.

Let's make an audit for this. I'll start with counting the number of distinct cities.


In [153]:
def count_cities(filename):
        # YOUR CODE HERE
        tree = ET.parse(filename)
        root = tree.getroot()

        cities_count = {}
        for child in root.findall('.//tag'):
            if child.attrib['k'] == 'addr:city':
                city = child.attrib['v'].lower()
                cities_count[city] = cities_count.get(city, 0) + 1
        return cities_count

In [154]:
cities = count_cities(path)

Distinct citiy names


In [155]:
len(cities)


Out[155]:
80

We have 80 different city names. According to Wikipedia (https://en.wikipedia.org/wiki/Metropolitan_Area_of_Porto#Population), Porto metropolitan area has only 18 cities:


In [158]:
city_names = [
    'vila nova de gaia',
    'santo tirso',
    'trofa',
    'arouca',
    'oliveira de azeméis',
    'santa maria da feira',
    'são joão da madeira',
    'vale de cambra',
    'espinho',
    'gondomar',
    'maia',
    'matosinhos',
    'porto',
    'póvoa de varzim',
    'valongo',
    'vila do conde',
    'vila nova de gaia',
    'paredes',
    'paços de ferreira', # outside metropolitan area
    'penafiel',          # outside metropolitan area
]

In [ ]:

From the list above we can explain this large number of cities by the fact that many users have given the name of civil parish.

Sometimes both the civil parish and the city are written (e.g., 'canidelo - vila nova de gaia' where Vila Nova de Gaia is the city) and other times only the parish is given (e.g., canidelo).

In addition, cities with long names sometimes appear written with acronyms. E.g., Vila Nova de Gaia appears once as V. N. Gaia.


In [159]:
pprint(cities)


{'alfena': 3,
 'arcozelo, vila nova de gaia': 1,
 'argoncilhe': 7,
 'baguim do monte': 1,
 'baltar': 2,
 'bonfim': 1,
 'branzelo': 1,
 u'calend\xe1rio': 74,
 u'campanh\xe3': 1,
 'canelas': 1,
 'canelas vng': 1,
 'canidelo': 1,
 'canidelo - v. n. gaia': 1,
 'canidelo - vila nova de gaia': 1,
 'carvalhos': 2,
 u'cast\xealo da maia': 1,
 u'cust\xf3ias': 5,
 u'cust\xf3ias mts': 1,
 'ermesinde': 6,
 u'esmeriz -vila nova de famalic\xe3o': 1,
 'espinho': 7,
 'fajozes': 3,
 u'fi\xe3es': 10,
 'gandra': 2,
 'gens': 1,
 'gondomar': 3,
 u'gondomar (s\xe3o cosme)': 9,
 u'grij\xf3': 8,
 'guetim': 1,
 u'guid\xf5es': 1,
 'lagares': 2,
 'lagoa': 3,
 'lamelas': 8,
 u'le\xe7a da palmeira': 4,
 u'le\xe7a do balio': 3,
 'lisboa': 3,
 u'lob\xe3o': 1,
 'lourosa': 24,
 'lousado': 1,
 'macieira da maia': 2,
 'madalena': 1,
 'mafamude': 1,
 'maia': 848,
 'matosinhos': 9,
 u'milheir\xf3s': 3,
 'mindelo': 2,
 'modelos': 1,
 'moreira da maia': 3,
 u'mosteir\xf3': 7,
 'nogueira da regedoura': 10,
 'paranhos': 1,
 'paredes': 1,
 u'pa\xe7os de ferreira': 7,
 u'pa\xe7os de ferriera': 1,
 u'pedrou\xe7os': 1,
 'penafiel': 1,
 'porto': 249,
 'povoa de varzim': 1,
 u'p\xf3voa de varzim': 1,
 'ramalde': 6,
 'ribeiros altos - paredes': 1,
 u'ribeir\xe3o': 6,
 'rio tinto': 5,
 'roriz': 1,
 's. martinho do campo': 1,
 's.pedro de formariz': 1,
 'sandim': 1,
 'santo tirso': 13,
 'senhora da hora': 2,
 u's\xe3o  mamede de infesta': 13,
 u's\xe3o felix da mariha': 1,
 u's\xe3o mamede de infesta': 2,
 'trofa': 1,
 'valadares': 1,
 'valongo': 7,
 'vila do conde': 9,
 u'vila nova de famalic\xe3o': 5,
 'vila nova de gaia': 20,
 'vilar de andorinho': 1,
 u'\xe1guas santas': 4}

Solving the issue automatically


In [160]:
parish_city_mapping = {
    'canidelo' : 'vila nova de gaia',
    'canelas' : 'vila nova de gaia',
    'mafamude': 'vila nova de gaia',
    'grijó': 'vila nova de gaia',
    'madalena': 'vila nova de gaia',
    "sandim": 'vila nova de gaia',
    "vilar de andorinho": 'vila nova de gaia',
    'baltar': 'paredes',
    'ermesinde': 'valongo',
    "custóias": 'matosinhos',
    "castêlo da maia": 'matosinhos',
    "leça da palmeira": 'matosinhos',
    "são mamede de infesta": 'matosinhos',
    "leça do balio": 'matosinhos',
    'mindelo': 'vila do conde',
    'bonfim': 'porto',
    'paranhos': 'porto',
    'ramalde': 'porto',
    "campanhã":'porto',
    'senhora da hora': 'matosinhos',
    'valadares': 'vila nova de gaia',
    "águas santas": 'maia',
    "pedrouços": 'maia',
    'lousado': u'vila nova de famalic\xe3o',
    'alfena': 'valongo',
    'roriz': 'santo tirso',
    "lamelas": 'santo tirso',
    'baguim do monte': 'gondomar',
    'fajozes': 'vila do conde',
    "s.pedro de formariz": 'vila do conde',
    "mosteiró": 'vila do conde',
    'lourosa': 'santa maria da feira',
    'fiães': 'santa maria da feira',
    "argoncilhe": 'santa maria da feira',
    "nogueira da regedoura": 'santa maria da feira',
    "ribeirão": 'trofa',
    "guidões": 'trofa',
    "gandra": "paredes",
    "modelos": u'pa\xe7os de ferreira',
    "rio tinto": "gondomar",
    "povoa de varzim": u'p\xf3voa de varzim',
    "lagares": "penafiel",
    "guetim":"espinho",
    
}

In [161]:
def update_city_name(name, mapping, city_names):
    name = name.lower().encode('utf-8')
    updated_name = mapping.get(name)
    if updated_name:
        return updated_name
    if name not in city_names:
        for city in city_names:
            if city in name:
                return city
    return name.decode('utf-8')

In [162]:
updated_cities = {}
for (city, count) in cities.iteritems():
    updated_city = update_city_name(city, parish_city_mapping, city_names)
    updated_cities[updated_city] = (updated_cities.get(updated_city) or 0) + count
print len(updated_cities)
pprint(updated_cities)


32
{u'branzelo': 1,
 u'calend\xe1rio': 74,
 u'canelas vng': 1,
 u'canidelo - v. n. gaia': 1,
 u'carvalhos': 2,
 u'cust\xf3ias mts': 1,
 u'esmeriz -vila nova de famalic\xe3o': 1,
 u'espinho': 8,
 u'gens': 1,
 'gondomar': 18,
 u'lagoa': 3,
 u'lisboa': 3,
 u'lob\xe3o': 1,
 'maia': 858,
 'matosinhos': 26,
 u'milheir\xf3s': 3,
 'paredes': 6,
 u'pa\xe7os de ferreira': 8,
 u'pa\xe7os de ferriera': 1,
 u'penafiel': 3,
 'porto': 258,
 u'p\xf3voa de varzim': 2,
 u's. martinho do campo': 1,
 'santa maria da feira': 51,
 u'santo tirso': 22,
 u's\xe3o  mamede de infesta': 13,
 u's\xe3o felix da mariha': 1,
 'trofa': 8,
 'valongo': 16,
 'vila do conde': 22,
 u'vila nova de famalic\xe3o': 6,
 'vila nova de gaia': 37}

In [ ]:

Other ideas about the datasets

Another idea that could be done was getting the areas that have been recently updated (in the last day/week/month).

This could be achieved by filtering nodes using the attribute timestamp and according to the intended time window. Issues that may arise from dealing with data with date and time values are usually timezone normalization.

Other idea would be presenting places that need data, so that users willing to contribute could be aware of which places are more important. One way of doing this might be, given a pair of coordinates (longitude, latitude) count all the documents in a range of 1km.

The main issue would be the fact that in a city center streets are more complex than in a rural place. Thus the city center will need a higher density of nodes in order to be complete.

Another issue that can arise is how to define the places that are going to be listed. So far, the solution only mentioned coordinates, which can have continuous values. One idea might be getting a list of all the post codes in the porto area, and using a WEBService (e.g., CTT postal code) collect the respective coordinates.

Convert data to json

The following method process_map processes data with the transformations described above and stores data into a json file with the name "porto_portugal.osm.json".


In [168]:
json_data = process_map("porto_portugal.osm")

Import json data in a mongodb database

In this project I have used a mongolab database to host my MongoDB database. After creating the database p03 I have imported the json data into porto collection by running the following line in a terminal.

$ mongoimport -h ds055565.mongolab.com:55565 -d p03 -c porto -u <admin_user> -p <admin_pass> --file porto_portugal.osm.json

In order to run a few queries in this database I used the MongoDB python driver PyMongo.


In [169]:
from pymongo import MongoClient
client = MongoClient("mongodb://p03_guest:p03_guest@ds055565.mongolab.com:55565/p03")
db = client.p03
porto_portugal_collection = db['porto']

Overview of data using MongoDB

Number of documents


In [219]:
porto_portugal_collection.find().count()


Out[219]:
786701

Number of nodes


In [220]:
porto_portugal_collection.find({"type":"node"}).count()


Out[220]:
682556

Number of ways


In [218]:
porto_portugal_collection.find({"type":"way"}).count()


Out[218]:
104134

Distinct users

This had been done previously using the XML parser. Now there may be less users due to the data cleaning that was performed.


In [176]:
distinct_users = porto_portugal_collection.distinct("created.user")

In [216]:
len(distinct_users)


Out[216]:
901

Top 20 contributors


In [188]:
cursor = porto_portugal_collection.aggregate([{"$group":{"_id":"$created.user", "count":{"$sum":1}}}, {"$sort":{"count":-1}}, {"$limit":20}])

In [189]:
users_top20 = list(cursor)

In [215]:
for user in users_top20:
    print "%30s ...... %d" %(user['_id'],user['count'])


                       rtafav2 ...... 109871
                        TaedeT ...... 104018
                   Filipe Neto ...... 61419
                        zermes ...... 50238
               ViriatoLusitano ...... 40540
                        rtafav ...... 36792
                        Spec80 ...... 34780
        ViriatoLusitano_import ...... 18551
                   DanielPinto ...... 13713
              Micha Wiedenmann ...... 10685
                 José Carvalho ...... 10153
                    Luso-Delta ...... 9993
                         B_M_A ...... 9255
                   Rui Tavares ...... 8808
                   pedro903903 ...... 8567
                      jbarbosa ...... 8336
                     rsbarbosa ...... 8177
                          xpdm ...... 8109
                      Barrocas ...... 7240
                  Rui Oliveira ...... 6744

In [ ]: