Create a relative samll sample file from the whole osm file


In [24]:
import xml.etree.ElementTree as ET  # Use cElementTree or lxml if too slow

OSM_FILE = "/Users/yangrenqin/udacity/P3/san-francisco.osm"  # Replace this with your osm file
SAMPLE_FILE = "/Users/yangrenqin/udacity/P3/sample1.osm"

k = 30 # Parameter: take every k-th top level element

def get_element(osm_file, tags=('node', 'way', 'relation')):
    
    context = iter(ET.iterparse(osm_file, events=('start', 'end')))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()


with open(SAMPLE_FILE, 'w') as output:
    output.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    output.write('<osm>\n  ')

    # Write every kth top level element
    for i, element in enumerate(get_element(OSM_FILE)):
        if i % k == 0:
            output.write(ET.tostring(element, encoding='unicode'))

    output.write('</osm>')

However, since the original full size osm file is too big, I didn't use this sample file in the later part. In the whole wrangling, audit and clean process, I just used the full original osm file

Count the number of different tags


In [10]:
from collections import defaultdict

filename='/Users/yangrenqin/udacity/P3/san-francisco.osm'


def count_tags(filename):
    tags=defaultdict(int)
    for _,elem in (ET.iterparse(filename)):
        tags[elem.tag] += 1
    return tags
        
count_tags(filename)


Out[10]:
defaultdict(int,
            {'bounds': 1,
             'member': 26246,
             'nd': 1677325,
             'node': 1410191,
             'osm': 1,
             'relation': 1687,
             'tag': 949435,
             'way': 154315})

Find different "k" attribute of tags and count them


In [31]:
import re

filename='/Users/yangrenqin/udacity/P3/san-francisco.osm'

lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
lower_colons=re.compile(r'^([a-z]|_)*(:([a-z]|_)*)+$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')


def key_type(element, keys,other):
    if element.tag == "tag":
        a=lower.search(element.attrib['k'])
        b=lower_colon.search(element.attrib['k'])
        c=problemchars.search(element.attrib['k'])
        d=lower_colons.search(element.attrib['k'])
        if a:
            keys['lower'] += 1
        elif b:
            keys['lower_colon'] += 1
        elif c:
            keys['problemchars'] += 1
        elif d:
            keys['lower_colons'] += 1
        else:
            keys['other'] += 1
            other.append(element.attrib['k'])
    return keys,other



def process_map(filename):
    keys = {"lower": 0, "lower_colon": 0, "lower_colons":0, "problemchars": 0, "other": 0}
    other=[]
    for _, element in ET.iterparse(filename):
        keys,other = key_type(element, keys,other)

    return keys,other

keys,others=process_map(filename)
print(keys)


{'other': 14745, 'problemchars': 87, 'lower_colons': 2850, 'lower': 445870, 'lower_colon': 485883}

Aduit the street type and find out the wrong street type


In [11]:
import xml.etree.ElementTree as ET
from collections import defaultdict
import re

filename='/Users/yangrenqin/udacity/P3/san-francisco.osm'

expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons", "Way", "Highway", "Path", "Terrace", "Alley", "Center",
            "Circle", "Plaza", "Real"]

street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
street_types=defaultdict(set)

def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)
            
def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")


def audit(osmfile):
    osm_file = open(osmfile, "r")
    
    for event, elem in ET.iterparse(osm_file, events=("start",)):

        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])
    osm_file.close()
    return street_types

error_street_type=audit(filename)
error_street_type


Out[11]:
defaultdict(set,
            {'1': {'W Of Us 101 @ Jct Sr 1'},
             '1.3': {'ALA 260 PM 1.3'},
             '120': {'12th Street #120'},
             '12180142': {'12180142'},
             '15th': {'15th'},
             '170': {'California Street, Suite 170'},
             '2': {'San Francisco Bicycle Route 2'},
             '24th': {'24th'},
             '3.2': {'ALA 84 PM 3.2'},
             '300': {'Mission Street #300'},
             '4.5': {'SF 80 PM 4.5'},
             '41276': {'Upton St 41276'},
             '9th': {'9th'},
             'A': {'Avenue A', 'Upton St #A'},
             'Abenue': {'Columbus Abenue'},
             'Academy': {'California College Prep Academy'},
             'Alameda': {'Alameda', 'The Alameda'},
             'Alcatraz': {'Alcatraz'},
             'Arguello': {'Arguello'},
             'Ave': {' Grant Ave',
              '45th Ave',
              '7th Ave',
              '8th Ave',
              'Allerton Ave',
              'California Ave',
              'Cherry Ave',
              'Colusa Ave',
              'Corte Madera Ave',
              'Cortland Ave',
              'Day Ave',
              'Earl Ave',
              'Floribunda Ave',
              'Geneva Ave',
              'Grand Ave',
              'Grant Ave',
              'Greenwood Ave',
              'Hamilton Ave',
              'Huntington Ave',
              'Jerrold Ave',
              'Loma Vista Ave',
              'Lorton Ave',
              'Magnolia Ave',
              'Palmetto Ave',
              'Pennsylvania Ave',
              'Phelan Ave',
              'Piedmont Ave',
              'Rand Ave',
              'San Pablo Ave',
              'Shattuck Ave',
              'Tehama Ave',
              'Telegraph Ave',
              'Thorton Ave',
              'University Ave',
              'Van Ness Ave',
              'W & E Of Us 101 N Of Seminary Ave',
              'W 25th Ave',
              'West Portal Ave'},
             'Ave.': {'960 Arlington Ave.',
              'Cortland Ave.',
              'Dartmouth Ave.',
              'Edes Ave.',
              'Fairmount Ave.',
              'Grand Ave.',
              'Hamilton Ave.',
              'Jefferson Ave.',
              'Lincoln Ave.',
              'Menalto Ave.',
              'Pennsylvania Ave.',
              'Piedmont Ave.',
              'San Carlos Ave.'},
             'Bay': {'Bay'},
             'Bluxome': {'Bluxome'},
             'Blvd': {'10500 Foothill Blvd',
              '2501 East Bayshore Blvd',
              '25555 Hesperian Blvd',
              '296 Airport Blvd',
              '380 Foster City Blvd',
              '600 Leweling Blvd',
              'Airport Blvd',
              'Anza Blvd',
              'Ardenwood Blvd',
              'Arguello Blvd',
              'Cipriani Blvd',
              'Fremont Blvd',
              'Geary Blvd',
              'International Blvd',
              'Junipero Serra & Westborough Blvd',
              'Leimert Blvd',
              'Lewelling Blvd',
              'MacArthur Blvd',
              'Mission Blvd',
              'Monterey Blvd',
              'Monument Blvd',
              'N California Blvd',
              'Newark Blvd',
              'Park Blvd',
              'Sawyer Camp Trail & Hillcrest Blvd',
              'Skyline Blvd',
              'Under Ramp Sw Quad Of Us 101 / Sr 92 Ic Off 19th Ave & Fashion Island Blvd',
              'W MacArthur Blvd'},
             'Blvd,': {'Nw Quad I-280 / Sr 35 Ic @ Jct Hayne Rd, Golf Course Dr, Skyline Blvd,'},
             'Blvd.': {'East Francisco Blvd.', 'Geary Blvd.'},
             'Boulavard': {'Alemany Boulavard'},
             'Boulvard': {'Alemany Boulvard'},
             'Bradshaw': {'Bradshaw'},
             'Brannan': {'Brannan'},
             'Bridge': {'Pier 1  SM- Hayward Bridge'},
             'Bridgeway': {'Bridgeway'},
             'Broadway': {'Broadway', 'North Broadway'},
             'California': {'California'},
             'Clement': {'Clement'},
             'Clemente': {'San Clemente'},
             'Columbus': {'Columbus'},
             'Cortland': {'Cortland'},
             'Cragmont': {'Cragmont'},
             'Cres': {'Wellesley Cres'},
             'Crescent': {'Wellesley Crescent'},
             'Ct': {'Prescott Ct'},
             'Ctr': {'Tanforan Shopping Ctr'},
             'Cut': {'Short Cut'},
             'D': {'Avenue D'},
             'Dr': {'California Dr',
              'Chateau Dr',
              'Corriente Point Dr',
              'Hillside Dr',
              'Lakeside Dr',
              'Monte Verde Dr'},
             'Dr.': {'Grandview Dr.'},
             'E': {'Avenue E'},
             'East': {'Buena Vista Avenue East'},
             'Embarcadero': {'Embarcadero', 'The Embarcadero'},
             'Everett': {'Everett'},
             'F': {'Avenue F'},
             'Fillmore': {'Fillmore'},
             'G': {'G'},
             'Geary': {'Geary'},
             'H': {'Avenue H'},
             'Harrison': {'Harrison'},
             'Hwy': {'40 Shoreline Hwy', 'Redwood Hwy'},
             'I-580': {'E Of Center St @ I-580'},
             'I-580)': {'N Side Of Foothill Blvd @ John Dr (Near I-580)'},
             'Ic': {'Ne Quad I-280 / Edgewood Rd Ic',
              'Nw Quad Sr 84 / Ardenwood Blvd Ic',
              'Se Quad Sr 92 / Ralston Ic'},
             'Int': {'012 Mi Ne Of Sr 1 / Linda Mar Blvd Int',
              'Se Quad Sr 1 / Crespi Dr Int'},
             'Judah': {'Judah'},
             'King': {'King'},
             'Las': {'Alameda De Las'},
             'Leimert': {'Leimert'},
             'Leslie': {'Leslie'},
             'Lindbergh': {'Ne Quad Us 101 / 3rd Ave Off Lindbergh'},
             'Ln': {'Yerba Buena Ln'},
             'Ln.': {'Bont Ln.'},
             'Lugano': {'Via Lugano'},
             'M': {'Avenue M'},
             'Mall': {'Pacific Avenue Mall'},
             'Market': {'Market'},
             'Market/Castro': {'Market/Castro'},
             'Market/Noe': {'Market/Noe'},
             'Mission': {'Mission'},
             'Ness': {'Van Ness'},
             'North': {'Mission Bay Boulevard North'},
             'Oakridge': {'Oakridge'},
             'Ora': {'Avenue Del Ora'},
             'Palms': {'Avenue of the Palms'},
             'Park': {'South Park'},
             'Peak': {'Grizzly Peak'},
             'Pl': {'San Francisco/Oakland Bridge Toll Pl'},
             'Plz': {'Woodside Plz'},
             'Post': {'Post'},
             'Powell': {'Bay and Powell', 'Powell'},
             'Pulgas': {'Alamed de las Pulgas',
              'Alameda De Las Pulgas',
              'Alameda de Las Pulgas',
              'Alameda de las Pulgas'},
             'Rd': {'Ascot Rd',
              'Marshlands Rd',
              'Rollins Rd',
              'San Mateo Rd',
              'Se Quad I-680 / Rudgear Rd',
              'Willow Rd',
              'Ygnacio Valley Rd'},
             'Rd.': {'Alpine Rd.'},
             'Rhein': {'Stein Am Rhein'},
             'Rock': {'Indian Rock'},
             'Schwerin': {'Schwerin'},
             'Sobrante': {'Camino Sobrante'},
             'Southgate': {'Southgate'},
             'Spencer': {'E & W Of  Us 101 @ Monte Mar & Spencer'},
             'St': {' Laguna St',
              ' Valencia St',
              '12th St',
              '14th St',
              '16th St',
              '18th St',
              '19th St',
              '2nd St',
              '31st Ave & Judah St',
              '4th St',
              '6th St',
              '8th St',
              '9th St',
              'Adeline St',
              'Alice St',
              'Brannan St',
              'Bryant St',
              'Center St',
              'Cesar Chavez St St',
              'Chester St',
              'Church St',
              'Clement St',
              'Connecticut St',
              'Delancey St',
              'Diamond St',
              'Embarcadero St',
              'Fallon St',
              'Fillmore St',
              'Folsom St',
              'Franklin St',
              'Fulton St',
              'Granada St',
              'Green St',
              'Hayes St',
              'Hollis St',
              'Howard St',
              'Hyde St',
              'Jackson St',
              'Kearny St',
              'Laurel St',
              'Lyon St',
              'Madison St',
              'Main St',
              'Market St',
              'McAllister St',
              'Mendell St',
              'Mission St',
              'Missouri St',
              'Noe St',
              'Oak St',
              'Park St',
              'Peralta St',
              'Pine St',
              'Polk St',
              'Post St',
              'S Delaware St',
              'San Pablo Av & Cedar St',
              'Sansome St',
              'Sutter St',
              'Taraval St',
              'Turk St',
              'Under I-580 Btwn Fruitvale / Champion St',
              'Under I-880 @ 7th St & Linden St',
              'Union St',
              'Valencia St',
              'Washington St',
              'Webster St',
              'Williams St'},
             'St.': {'9th St.',
              'Buchanan St.',
              'California St.',
              'Divisadero St.',
              'Fillmore St.',
              'Fulton St.',
              'Haight St.',
              'Halleck St.',
              'Jennings St.',
              'Market St.',
              'Pine St.',
              'Sutter St.',
              'Valencia St.',
              'Webster St.'},
             'Steet': {'Guerrero Steet'},
             'Steps': {'Bancroft Steps'},
             'Sutter': {'Sutter'},
             'Telegraph': {'3605 Telegraph'},
             'Valencia': {'Valencia'},
             'View': {'Norwood View'},
             'Walk': {'Fountain Walk', 'Rose Walk', 'Terrace Walk'},
             'Wedemeyer': {'Wedemeyer'},
             'West': {'Buena Vista Avenue West', 'Embarcadero West'},
             'arrowhead': {'lake arrowhead'},
             'ave': {'central ave'},
             'avenue': {'Santa Cruz avenue'},
             'broadway': {'broadway'},
             'bush': {'bush'},
             'parkway': {'Bridge parkway'},
             'square': {'33 union square'},
             'st': {'26th st', 'Crane st'},
             'street': {'Vallejo street',
              'Woolsey street',
              'laguna street',
              'market street',
              'townsend street'},
             'sutter': {'sutter'}})

Mapping dictionaries which used to update and clean those amendable data


In [207]:
street_mapping = { "St": "Street",
            "St.": "Street",
            "Steet": "Street",
            "st": "Street",
            "street": "Street",
            "Ave": "Avenue",
            "Ave.": "Avenue",
            "ave": "Avenue",
            "avenue": "Avenue",
            "Rd.": "Road",
            "Rd": "Road",
            "Blvd": "Boulevard",
            "Blvd,": "Boulevard",
            "Blvd.": "Boulevard",
            "Boulavard": "Boulevard",
            "Boulvard": "Boulevard",
            "Dr": "Drive",
            "Dr.": "Drive",
            "Pl": "Plaza",
            "Plz": "Plaza",
            "square": "Square"
            }

postcode_mapping={"CA 94030": "94030",
                  "CA 94133": "94133",
                  "CA 94544": "94544",
                  "CA 94103": "94103",
                  "CA:94103": "94103"
                    }
error_postcode={'1087', '515', 'CA'}

cityname_mapping={"Berkeley, CA": "Berkeley",
                  "Fremont ": "Fremont",
                  "Oakland, CA": "Oakland",
                  "Oakland, Ca": "Oakland",
                  "San Francisco, CA": "San Francisco",
                  "San Francisco, CA 94102": "San Francisco",
                  "San Francicsco": "San Francisco",
                  "San Fransisco": "San Francisco",
                  "San Francsico": "San Francisco",
                  "Artherton": "Atherton"
                    }
error_cityname={'155', '157'}

Find out the all type of postal code


In [14]:
filename='/Users/yangrenqin/udacity/P3/san-francisco.osm'
def is_street_postcode(elem):
    return (elem.attrib['k'] == "addr:postcode")


def audit(osmfile):
    osm_file = open(osmfile, "r")
    postcode_types = set()
    for event, elem in ET.iterparse(osm_file, events=("start",)):

        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_postcode(tag):
                    postcode_types.add(tag.attrib['v'])
    osm_file.close()
    return postcode_types

postcode=audit(filename)
postcode


Out[14]:
{'1087',
 '515',
 '90214',
 '93710',
 '94002',
 '94002-3585',
 '94010',
 '94013',
 '94014',
 '94015',
 '94017',
 '94019',
 '94025',
 '94027',
 '94030',
 '94044',
 '94061',
 '94062',
 '94063',
 '94065',
 '94066',
 '94070',
 '94080',
 '94087',
 '94102',
 '94103',
 '94103-3124',
 '94104',
 '94105',
 '94107',
 '94108',
 '94109',
 '94110',
 '94111',
 '94112',
 '94112 ',
 '94113',
 '94114',
 '94115',
 '94115 ',
 '94116',
 '94117',
 '94117-9991',
 '94118',
 '94118-4504',
 '9412',
 '94121',
 '94121 ',
 '94122',
 '94123',
 '941234',
 '94124',
 '94127',
 '94129',
 '94130',
 '94131',
 '94132',
 '94133',
 '94134',
 '94143',
 '94158',
 '94166',
 '94188',
 '94213',
 '94301',
 '94301-2019',
 '94303',
 '94401',
 '94402',
 '94403',
 '94404',
 '94501',
 '94502',
 '94519',
 '94523',
 '94530',
 '94536',
 '94541',
 '94544',
 '94545',
 '94546',
 '94549',
 '94549-5506',
 '94552',
 '94555',
 '94556',
 '94560',
 '94563',
 '94577',
 '94578',
 '94579',
 '94587',
 '94595',
 '94596',
 '94597',
 '94598',
 '94601',
 '94602',
 '94603',
 '94605',
 '94606',
 '94606-3636',
 '94607',
 '94608',
 '94609',
 '94610',
 '94611',
 '94612',
 '94612-2202',
 '94613',
 '94618',
 '94619',
 '94621',
 '94702',
 '94703',
 '94704',
 '94705',
 '94706',
 '94707',
 '94708',
 '94709',
 '94710',
 '94720',
 '94720-1076',
 '94801',
 '94804',
 '94805',
 '94901',
 '94904',
 '94920',
 '94925',
 '94939',
 '94941',
 '94965',
 '95498',
 'CA',
 'CA 94030',
 'CA 94133',
 'CA 94544',
 'CA:94103'}

Find out all type of city names


In [15]:
filename='/Users/yangrenqin/udacity/P3/san-francisco.osm'

def is_city(elem):
    return (elem.attrib['k'] == "addr:city")


def audit(osmfile):
    osm_file = open(osmfile, "r")
    city_types = set()
    for event, elem in ET.iterparse(osm_file, events=("start",)):

        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_city(tag):
                    city_types.add(tag.attrib['v'])
    osm_file.close()
    return city_types

cityname=audit(filename)
cityname


Out[15]:
{'155',
 '157',
 'Alameda',
 'Albany',
 'Artherton',
 'Atherton',
 'Belmont',
 'Berkeley',
 'Berkeley, CA',
 'Brisbane',
 'Burlingame',
 'Castro Valley',
 'Daly City',
 'East Palo Alto',
 'El Cerrito',
 'Emeryville',
 'Foster City',
 'Fremont',
 'Fremont ',
 'Greenbrae',
 'Half Moon Bay',
 'Hayward',
 'Kensington',
 'Kentfield',
 'Lafayette',
 'Marin City',
 'Menlo Park',
 'Mill Valley',
 'Montara',
 'Moraga',
 'Newark',
 'Oakland',
 'Oakland, CA',
 'Oakland, Ca',
 'Orinda',
 'Pacifica',
 'Palo Alto',
 'Piedmont',
 'Pleasant Hill',
 'Redwood City',
 'Richmond',
 'San Bruno',
 'San Carlos',
 'San Francicsco',
 'San Francisco',
 'San Francisco, CA',
 'San Francisco, CA 94102',
 'San Francsico',
 'San Fransisco',
 'San Leandro',
 'San Mateo',
 'Sausalito',
 'South San Francisco',
 'Tiburon',
 'Union City',
 'Walnut Creek',
 'berkeley',
 'oakland',
 'san francisco'}

Audit, correct and write data from xml into csv files


In [218]:
import csv
import codecs
import re
import xml.etree.cElementTree as ET




OSM_PATH = "/Users/yangrenqin/udacity/P3/san-francisco.osm"

NODES_PATH = "/Users/yangrenqin/udacity/P3/nodes.csv"
NODE_TAGS_PATH = "/Users/yangrenqin/udacity/P3/nodes_tags.csv"
WAYS_PATH = "/Users/yangrenqin/udacity/P3/ways.csv"
WAY_NODES_PATH = "/Users/yangrenqin/udacity/P3/ways_nodes.csv"
WAY_TAGS_PATH = "/Users/yangrenqin/udacity/P3/ways_tags.csv"

LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+')
PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)


# Make sure the fields order in the csvs matches the column order in the sql table schema
NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']


def capitalize(a):
    b=''
    for i in a.split(' '):
    
        i=i.capitalize()+' '
        b=b+i
    a=b.strip()
    return a

def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS,
                  problem_chars=PROBLEMCHARS, default_tag_type='regular'):
    

    node_attribs = {}
    way_attribs = {}
    way_nodes = []
    tags = []  # Handle secondary tags the same way for both node and way elements

    if element.tag == 'node':
        for i in element.attrib:
            if i in node_attr_fields:
                node_attribs[i]=element.attrib[i]
        if element.getchildren() == []:
            pass
        else:
            for tag in element.iter('tag'):
                node_tags={}
                k=tag.attrib['k']
                if PROBLEMCHARS.search(k):
                    continue
                elif LOWER_COLON.search(k):
                    
                    if LOWER_COLON.search(k).group() == k:
                        if k == "addr:street":
                            m = street_type_re.search(tag.attrib['v'])
                            if m:
                                street_type = m.group()
                                if street_type in error_street_type:
                                    if street_type in street_mapping:
                                        tag.attrib['v']=tag.attrib['v'].replace(street_type,street_mapping[street_type])
                                    else:
                                        continue
                                node_tags['key']=k.split(':')[1]
                                node_tags['type']=k.split(':')[0]
                                node_tags['id']=element.attrib['id']
                                node_tags['value']=tag.attrib['v']
                                tags.append(node_tags)
                            else:
                                continue
                        if k == "addr:postcode":
                            if tag.attrib['v'] in error_postcode:
                                continue
                            else: 
                                if tag.attrib['v'] in postcode_mapping:
                                    tag.attrib['v']=postcode_mapping[tag.attrib['v']]
                                node_tags['key']=k.split(':')[1]
                                node_tags['type']=k.split(':')[0]
                                node_tags['id']=element.attrib['id']
                                node_tags['value']=tag.attrib['v']
                                tags.append(node_tags)
                        if k == "addr:city":
                            if tag.attrib['v'] in error_cityname:
                                continue
                            else: 
                                if tag.attrib['v'] in cityname_mapping:
                                    tag.attrib['v']=cityname_mapping[tag.attrib['v']]
                                node_tags['key']=k.split(':')[1]
                                node_tags['type']=k.split(':')[0]
                                node_tags['id']=element.attrib['id']
                                node_tags['value']=capitalize(tag.attrib['v'])
                                tags.append(node_tags)
                    else:
                        node_tags['key']=k.partition(':')[-1]
                        node_tags['type']=k.partition(':')[0]
                        node_tags['id']=element.attrib['id']
                        node_tags['value']=tag.attrib['v']
                        tags.append(node_tags)
                else:
                    node_tags['id']=element.attrib['id']
                    node_tags['value']=tag.attrib['v']
                    node_tags['key']=k
                    node_tags['type']=default_tag_type
                    tags.append(node_tags)
    if element.tag == 'way':
        for i in element.attrib:
            if i in way_attr_fields:
                way_attribs[i]=element.attrib[i]
        if element.getchildren() == []:
            pass
        else:
            for tag in element.iter('tag'):
                way_tags={}
                k=tag.attrib['k']
                if PROBLEMCHARS.search(k):
                    continue
                elif LOWER_COLON.search(k):
                    if LOWER_COLON.search(k).group() == k:
                        if k == "addr:street":
                            m = street_type_re.search(tag.attrib['v'])
                            if m:
                                street_type = m.group()
                                if street_type in error_street_type:
                                    if street_type in street_mapping:
                                        tag.attrib['v']=tag.attrib['v'].replace(street_type,street_mapping[street_type])
                                    else:
                                        continue
                                way_tags['key']=k.split(':')[1]
                                way_tags['type']=k.split(':')[0]
                                way_tags['id']=element.attrib['id']
                                way_tags['value']=tag.attrib['v']
                                tags.append(way_tags)
                            else:
                                continue
                        if k == "addr:postcode":
                            if tag.attrib['v'] in error_postcode:
                                continue
                            else: 
                                if tag.attrib['v'] in postcode_mapping:
                                    tag.attrib['v']=postcode_mapping[tag.attrib['v']]
                                way_tags['key']=k.split(':')[1]
                                way_tags['type']=k.split(':')[0]
                                way_tags['id']=element.attrib['id']
                                way_tags['value']=tag.attrib['v']
                                tags.append(way_tags)
                        if k == "addr:city":
                            if tag.attrib['v'] in error_cityname:
                                continue
                            else: 
                                if tag.attrib['v'] in cityname_mapping:
                                    tag.attrib['v']=cityname_mapping[tag.attrib['v']]
                                way_tags['key']=k.split(':')[1]
                                way_tags['type']=k.split(':')[0]
                                way_tags['id']=element.attrib['id']
                                way_tags['value']=capitalize(tag.attrib['v'])
                                tags.append(way_tags)
                    else:
                        way_tags['key']=k.partition(':')[-1]
                        way_tags['type']=k.partition(':')[0]
                        way_tags['id']=element.attrib['id']
                        way_tags['value']=tag.attrib['v']
                        tags.append(way_tags)
                else:
                    way_tags['id']=element.attrib['id']
                    way_tags['value']=tag.attrib['v']
                    way_tags['key']=k
                    way_tags['type']=default_tag_type
                    tags.append(way_tags)
            
            for i,nd in enumerate(element.iter('nd')):
                way_nd={}
                way_nd['id']=element.attrib['id']
                way_nd['node_id']=nd.attrib['ref']
                way_nd['position']=i
                way_nodes.append(way_nd)
                
                    
    if element.tag == 'node':
        if element.getchildren() == []:
            return {'node': node_attribs}
        else:
            return {'node': node_attribs, 'node_tags': tags}
    elif element.tag == 'way':
        if element.getchildren() == []:
            return {'way': way_attribs}
        else:
            return {'way': way_attribs, 'way_nodes': way_nodes, 'way_tags': tags}

In [219]:
def get_element(osm_file, tags=('node', 'way')):
    """Yield element if it is the right type of tag"""

    context = ET.iterparse(osm_file, events=('start', 'end'))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()
            
def is_numeric(s):
    try:
        float(s)
        return True
    except ValueError:
        return False

def keep_numeric(original):
    for i,v in original.items():
        if is_numeric(v):
            if float(v).is_integer():
                original[i]=int(float(v))
            else:
                original[i]=float(v)
    return original

def process_map(file_in):
    """Iteratively process each XML element and write to csv(s)"""

    with codecs.open(NODES_PATH, 'w') as nodes_file, codecs.open(NODE_TAGS_PATH, 'w') as nodes_tags_file, \
         codecs.open(WAYS_PATH, 'w') as ways_file,codecs.open(WAY_NODES_PATH, 'w') as way_nodes_file, \
         codecs.open(WAY_TAGS_PATH, 'w') as way_tags_file:

        nodes_writer = csv.DictWriter(nodes_file, NODE_FIELDS)
        node_tags_writer = csv.DictWriter(nodes_tags_file, NODE_TAGS_FIELDS)
        ways_writer = csv.DictWriter(ways_file, WAY_FIELDS)
        way_nodes_writer = csv.DictWriter(way_nodes_file, WAY_NODES_FIELDS)
        way_tags_writer = csv.DictWriter(way_tags_file, WAY_TAGS_FIELDS)

        nodes_writer.writeheader()
        node_tags_writer.writeheader()
        ways_writer.writeheader()
        way_nodes_writer.writeheader()
        way_tags_writer.writeheader()


        for element in get_element(file_in, tags=('node', 'way')):
            el = shape_element(element)
            if el:
                if element.tag == 'node':
                    if element.getchildren() == []:
                        nodes_writer.writerow(keep_numeric(el['node']))
                    else:
                        nodes_writer.writerow(keep_numeric(el['node']))
                        node_tags_writer.writerows([keep_numeric(i) for i in el['node_tags']])
                elif element.tag == 'way':
                    if element.getchildren() == []:
                        ways_writer.writerow(keep_numeric(el['way']))
                    else:
                        ways_writer.writerow(keep_numeric(el['way']))
                        way_nodes_writer.writerows([keep_numeric(i) for i in el['way_nodes']])
                        way_tags_writer.writerows([keep_numeric(i) for i in el['way_tags']])

In [220]:
if __name__ == '__main__':
    process_map(OSM_PATH)

Verify the result of audit and update


In [223]:
import sqlite3
import pandas as pd
db = sqlite3.connect('sanfrancisco.db')
c=db.cursor()
query="SELECT tags.value, COUNT(*) as count\
 FROM (SELECT * FROM nodes_tags\
 UNION ALL\
 SELECT * FROM ways_tags) tags\
 WHERE tags.key='street'\
 GROUP BY tags.value\
 ORDER BY count DESC;"
c.execute(query)
rows=pd.DataFrame(c.fetchall(),columns=['Street','count'])
db.close()

In [228]:
rows.head(20)


Out[228]:
Street count
0 El Camino Real 386
1 Jefferson Avenue 324
2 Roosevelt Avenue 270
3 Hudson Street 240
4 Woodside Road 223
5 Hamilton Avenue 220
6 Madison Avenue 206
7 Vera Avenue 201
8 Redwood Avenue 194
9 Kentfield Avenue 192
10 Martin Luther King Jr Way 177
11 King Street 175
12 Hoover Street 170
13 Oak Avenue 168
14 Valota Road 166
15 Hopkins Avenue 163
16 Brewster Avenue 162
17 University Avenue 157
18 Fulton Street 156
19 Bay Road 154

In [231]:
rows.tail(20)


Out[231]:
Street count
1359 West MacArthur Boulevard 1
1360 West Parnassus Court 1
1361 West Ranger Avenue 1
1362 Westmoor Avenue 1
1363 Whittle Avenue 1
1364 William Saroyan Place 1
1365 Williams Street 1
1366 Willie Mays Plaza 1
1367 Wood Street 1
1368 Woodminster Lane 1
1369 Yacht Road 1
1370 Yosemite Avenue 1
1371 Youngs Valley Road 1
1372 Zoo Avenue 1
1373 central Avenue 1
1374 market Street 1
1375 pine Street 1
1376 shattuck Avenue 1
1377 townsend Street 1
1378 ygnacio Valley Road 1

In [238]:
db = sqlite3.connect('sanfrancisco.db')
c=db.cursor()
query="SELECT tags.value, COUNT(*) as count\
 FROM (SELECT * FROM nodes_tags\
 UNION ALL\
 SELECT * FROM ways_tags) tags\
 WHERE tags.key='postcode'\
 GROUP BY tags.value\
 ORDER BY count DESC;"
c.execute(query)
rows=pd.DataFrame(c.fetchall(),columns=['Postcode','count'])
db.close()

In [239]:
rows.head(10)


Out[239]:
Postcode count
0 94063 358
1 94587 250
2 94109 200
3 94103 192
4 94061 161
5 94114 129
6 94113 111
7 94110 65
8 94102 63
9 94107 63

In [240]:
rows.tail(10)


Out[240]:
Postcode count
120 94549-5506 1
121 94552 1
122 94563 1
123 94606-3636 1
124 94612-2202 1
125 94621 1
126 94708 1
127 94720 1
128 94720-1076 1
129 95498 1

In [241]:
db = sqlite3.connect('sanfrancisco.db')
c=db.cursor()
query="SELECT tags.value, COUNT(*) as count\
 FROM (SELECT * FROM nodes_tags\
 UNION ALL\
 SELECT * FROM ways_tags) tags\
 WHERE tags.key='city'\
 GROUP BY tags.value\
 ORDER BY count DESC;"
c.execute(query)
rows=pd.DataFrame(c.fetchall(),columns=['City','count'])
db.close()

In [242]:
rows


Out[242]:
City count
0 Redwood City 23533
1 Berkeley 3358
2 Palo Alto 1651
3 San Francisco 1379
4 Union City 252
5 Burlingame 158
6 Oakland 143
7 San Mateo 38
8 Alameda 24
9 Walnut Creek 18
10 Albany 16
11 Hayward 15
12 Atherton 13
13 San Carlos 12
14 Fremont 11
15 Emeryville 8
16 Mill Valley 6
17 San Leandro 6
18 Belmont 4
19 Pacifica 4
20 Castro Valley 3
21 Daly City 3
22 Lafayette 3
23 Menlo Park 3
24 Piedmont 3
25 Richmond 3
26 Sausalito 3
27 Brisbane 2
28 Foster City 2
29 Half Moon Bay 2
30 Newark 2
31 San Bruno 2
32 East Palo Alto 1
33 El Cerrito 1
34 Greenbrae 1
35 Kensington 1
36 Kentfield 1
37 Marin City 1
38 Montara 1
39 Moraga 1
40 Orinda 1
41 Pleasant Hill 1
42 South San Francisco 1
43 Tiburon 1

Data overview part


In [270]:
db = sqlite3.connect('sanfrancisco.db')
c=db.cursor()
query="SELECT nodes_tags.value,COUNT(*) as num\
 FROM nodes_tags, (SELECT DISTINCT(id) FROM nodes_tags WHERE value='cafe') AS i\
 WHERE nodes_tags.id=i.id AND nodes_tags.key='name'\
 GROUP BY nodes_tags.value\
 ORDER BY num desc\
 LIMIT 1;"
c.execute(query)
result=c.fetchall()
db.close()

In [271]:
result


Out[271]:
[('Starbucks', 68)]

I just repeatly use this code to perform different query to database.


In [ ]: