Create a relative samll sample file from the whole osm file



In [24]:

    
import xml.etree.ElementTree as ET  # Use cElementTree or lxml if too slow

OSM_FILE = "/Users/yangrenqin/udacity/P3/san-francisco.osm"  # Replace this with your osm file
SAMPLE_FILE = "/Users/yangrenqin/udacity/P3/sample1.osm"

k = 30 # Parameter: take every k-th top level element

def get_element(osm_file, tags=('node', 'way', 'relation')):
    
    context = iter(ET.iterparse(osm_file, events=('start', 'end')))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()


with open(SAMPLE_FILE, 'w') as output:
    output.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    output.write('<osm>\n  ')

    # Write every kth top level element
    for i, element in enumerate(get_element(OSM_FILE)):
        if i % k == 0:
            output.write(ET.tostring(element, encoding='unicode'))

    output.write('</osm>')

However, since the original full size osm file is too big, I didn't use this sample file in the later part. In the whole wrangling, audit and clean process, I just used the full original osm file

Count the number of different tags



In [10]:

    
from collections import defaultdict

filename='/Users/yangrenqin/udacity/P3/san-francisco.osm'


def count_tags(filename):
    tags=defaultdict(int)
    for _,elem in (ET.iterparse(filename)):
        tags[elem.tag] += 1
    return tags
        
count_tags(filename)









    Out[10]:





defaultdict(int,
            {'bounds': 1,
             'member': 26246,
             'nd': 1677325,
             'node': 1410191,
             'osm': 1,
             'relation': 1687,
             'tag': 949435,
             'way': 154315})

Find different "k" attribute of tags and count them



In [31]:

    
import re

filename='/Users/yangrenqin/udacity/P3/san-francisco.osm'

lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
lower_colons=re.compile(r'^([a-z]|_)*(:([a-z]|_)*)+$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')


def key_type(element, keys,other):
    if element.tag == "tag":
        a=lower.search(element.attrib['k'])
        b=lower_colon.search(element.attrib['k'])
        c=problemchars.search(element.attrib['k'])
        d=lower_colons.search(element.attrib['k'])
        if a:
            keys['lower'] += 1
        elif b:
            keys['lower_colon'] += 1
        elif c:
            keys['problemchars'] += 1
        elif d:
            keys['lower_colons'] += 1
        else:
            keys['other'] += 1
            other.append(element.attrib['k'])
    return keys,other



def process_map(filename):
    keys = {"lower": 0, "lower_colon": 0, "lower_colons":0, "problemchars": 0, "other": 0}
    other=[]
    for _, element in ET.iterparse(filename):
        keys,other = key_type(element, keys,other)

    return keys,other

keys,others=process_map(filename)
print(keys)









    



{'other': 14745, 'problemchars': 87, 'lower_colons': 2850, 'lower': 445870, 'lower_colon': 485883}

Aduit the street type and find out the wrong street type



In [11]:

    
import xml.etree.ElementTree as ET
from collections import defaultdict
import re

filename='/Users/yangrenqin/udacity/P3/san-francisco.osm'

expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons", "Way", "Highway", "Path", "Terrace", "Alley", "Center",
            "Circle", "Plaza", "Real"]

street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
street_types=defaultdict(set)

def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)
            
def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")


def audit(osmfile):
    osm_file = open(osmfile, "r")
    
    for event, elem in ET.iterparse(osm_file, events=("start",)):

        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])
    osm_file.close()
    return street_types

error_street_type=audit(filename)
error_street_type









    Out[11]:





defaultdict(set,
            {'1': {'W Of Us 101 @ Jct Sr 1'},
             '1.3': {'ALA 260 PM 1.3'},
             '120': {'12th Street #120'},
             '12180142': {'12180142'},
             '15th': {'15th'},
             '170': {'California Street, Suite 170'},
             '2': {'San Francisco Bicycle Route 2'},
             '24th': {'24th'},
             '3.2': {'ALA 84 PM 3.2'},
             '300': {'Mission Street #300'},
             '4.5': {'SF 80 PM 4.5'},
             '41276': {'Upton St 41276'},
             '9th': {'9th'},
             'A': {'Avenue A', 'Upton St #A'},
             'Abenue': {'Columbus Abenue'},
             'Academy': {'California College Prep Academy'},
             'Alameda': {'Alameda', 'The Alameda'},
             'Alcatraz': {'Alcatraz'},
             'Arguello': {'Arguello'},
             'Ave': {' Grant Ave',
              '45th Ave',
              '7th Ave',
              '8th Ave',
              'Allerton Ave',
              'California Ave',
              'Cherry Ave',
              'Colusa Ave',
              'Corte Madera Ave',
              'Cortland Ave',
              'Day Ave',
              'Earl Ave',
              'Floribunda Ave',
              'Geneva Ave',
              'Grand Ave',
              'Grant Ave',
              'Greenwood Ave',
              'Hamilton Ave',
              'Huntington Ave',
              'Jerrold Ave',
              'Loma Vista Ave',
              'Lorton Ave',
              'Magnolia Ave',
              'Palmetto Ave',
              'Pennsylvania Ave',
              'Phelan Ave',
              'Piedmont Ave',
              'Rand Ave',
              'San Pablo Ave',
              'Shattuck Ave',
              'Tehama Ave',
              'Telegraph Ave',
              'Thorton Ave',
              'University Ave',
              'Van Ness Ave',
              'W & E Of Us 101 N Of Seminary Ave',
              'W 25th Ave',
              'West Portal Ave'},
             'Ave.': {'960 Arlington Ave.',
              'Cortland Ave.',
              'Dartmouth Ave.',
              'Edes Ave.',
              'Fairmount Ave.',
              'Grand Ave.',
              'Hamilton Ave.',
              'Jefferson Ave.',
              'Lincoln Ave.',
              'Menalto Ave.',
              'Pennsylvania Ave.',
              'Piedmont Ave.',
              'San Carlos Ave.'},
             'Bay': {'Bay'},
             'Bluxome': {'Bluxome'},
             'Blvd': {'10500 Foothill Blvd',
              '2501 East Bayshore Blvd',
              '25555 Hesperian Blvd',
              '296 Airport Blvd',
              '380 Foster City Blvd',
              '600 Leweling Blvd',
              'Airport Blvd',
              'Anza Blvd',
              'Ardenwood Blvd',
              'Arguello Blvd',
              'Cipriani Blvd',
              'Fremont Blvd',
              'Geary Blvd',
              'International Blvd',
              'Junipero Serra & Westborough Blvd',
              'Leimert Blvd',
              'Lewelling Blvd',
              'MacArthur Blvd',
              'Mission Blvd',
              'Monterey Blvd',
              'Monument Blvd',
              'N California Blvd',
              'Newark Blvd',
              'Park Blvd',
              'Sawyer Camp Trail & Hillcrest Blvd',
              'Skyline Blvd',
              'Under Ramp Sw Quad Of Us 101 / Sr 92 Ic Off 19th Ave & Fashion Island Blvd',
              'W MacArthur Blvd'},
             'Blvd,': {'Nw Quad I-280 / Sr 35 Ic @ Jct Hayne Rd, Golf Course Dr, Skyline Blvd,'},
             'Blvd.': {'East Francisco Blvd.', 'Geary Blvd.'},
             'Boulavard': {'Alemany Boulavard'},
             'Boulvard': {'Alemany Boulvard'},
             'Bradshaw': {'Bradshaw'},
             'Brannan': {'Brannan'},
             'Bridge': {'Pier 1  SM- Hayward Bridge'},
             'Bridgeway': {'Bridgeway'},
             'Broadway': {'Broadway', 'North Broadway'},
             'California': {'California'},
             'Clement': {'Clement'},
             'Clemente': {'San Clemente'},
             'Columbus': {'Columbus'},
             'Cortland': {'Cortland'},
             'Cragmont': {'Cragmont'},
             'Cres': {'Wellesley Cres'},
             'Crescent': {'Wellesley Crescent'},
             'Ct': {'Prescott Ct'},
             'Ctr': {'Tanforan Shopping Ctr'},
             'Cut': {'Short Cut'},
             'D': {'Avenue D'},
             'Dr': {'California Dr',
              'Chateau Dr',
              'Corriente Point Dr',
              'Hillside Dr',
              'Lakeside Dr',
              'Monte Verde Dr'},
             'Dr.': {'Grandview Dr.'},
             'E': {'Avenue E'},
             'East': {'Buena Vista Avenue East'},
             'Embarcadero': {'Embarcadero', 'The Embarcadero'},
             'Everett': {'Everett'},
             'F': {'Avenue F'},
             'Fillmore': {'Fillmore'},
             'G': {'G'},
             'Geary': {'Geary'},
             'H': {'Avenue H'},
             'Harrison': {'Harrison'},
             'Hwy': {'40 Shoreline Hwy', 'Redwood Hwy'},
             'I-580': {'E Of Center St @ I-580'},
             'I-580)': {'N Side Of Foothill Blvd @ John Dr (Near I-580)'},
             'Ic': {'Ne Quad I-280 / Edgewood Rd Ic',
              'Nw Quad Sr 84 / Ardenwood Blvd Ic',
              'Se Quad Sr 92 / Ralston Ic'},
             'Int': {'012 Mi Ne Of Sr 1 / Linda Mar Blvd Int',
              'Se Quad Sr 1 / Crespi Dr Int'},
             'Judah': {'Judah'},
             'King': {'King'},
             'Las': {'Alameda De Las'},
             'Leimert': {'Leimert'},
             'Leslie': {'Leslie'},
             'Lindbergh': {'Ne Quad Us 101 / 3rd Ave Off Lindbergh'},
             'Ln': {'Yerba Buena Ln'},
             'Ln.': {'Bont Ln.'},
             'Lugano': {'Via Lugano'},
             'M': {'Avenue M'},
             'Mall': {'Pacific Avenue Mall'},
             'Market': {'Market'},
             'Market/Castro': {'Market/Castro'},
             'Market/Noe': {'Market/Noe'},
             'Mission': {'Mission'},
             'Ness': {'Van Ness'},
             'North': {'Mission Bay Boulevard North'},
             'Oakridge': {'Oakridge'},
             'Ora': {'Avenue Del Ora'},
             'Palms': {'Avenue of the Palms'},
             'Park': {'South Park'},
             'Peak': {'Grizzly Peak'},
             'Pl': {'San Francisco/Oakland Bridge Toll Pl'},
             'Plz': {'Woodside Plz'},
             'Post': {'Post'},
             'Powell': {'Bay and Powell', 'Powell'},
             'Pulgas': {'Alamed de las Pulgas',
              'Alameda De Las Pulgas',
              'Alameda de Las Pulgas',
              'Alameda de las Pulgas'},
             'Rd': {'Ascot Rd',
              'Marshlands Rd',
              'Rollins Rd',
              'San Mateo Rd',
              'Se Quad I-680 / Rudgear Rd',
              'Willow Rd',
              'Ygnacio Valley Rd'},
             'Rd.': {'Alpine Rd.'},
             'Rhein': {'Stein Am Rhein'},
             'Rock': {'Indian Rock'},
             'Schwerin': {'Schwerin'},
             'Sobrante': {'Camino Sobrante'},
             'Southgate': {'Southgate'},
             'Spencer': {'E & W Of  Us 101 @ Monte Mar & Spencer'},
             'St': {' Laguna St',
              ' Valencia St',
              '12th St',
              '14th St',
              '16th St',
              '18th St',
              '19th St',
              '2nd St',
              '31st Ave & Judah St',
              '4th St',
              '6th St',
              '8th St',
              '9th St',
              'Adeline St',
              'Alice St',
              'Brannan St',
              'Bryant St',
              'Center St',
              'Cesar Chavez St St',
              'Chester St',
              'Church St',
              'Clement St',
              'Connecticut St',
              'Delancey St',
              'Diamond St',
              'Embarcadero St',
              'Fallon St',
              'Fillmore St',
              'Folsom St',
              'Franklin St',
              'Fulton St',
              'Granada St',
              'Green St',
              'Hayes St',
              'Hollis St',
              'Howard St',
              'Hyde St',
              'Jackson St',
              'Kearny St',
              'Laurel St',
              'Lyon St',
              'Madison St',
              'Main St',
              'Market St',
              'McAllister St',
              'Mendell St',
              'Mission St',
              'Missouri St',
              'Noe St',
              'Oak St',
              'Park St',
              'Peralta St',
              'Pine St',
              'Polk St',
              'Post St',
              'S Delaware St',
              'San Pablo Av & Cedar St',
              'Sansome St',
              'Sutter St',
              'Taraval St',
              'Turk St',
              'Under I-580 Btwn Fruitvale / Champion St',
              'Under I-880 @ 7th St & Linden St',
              'Union St',
              'Valencia St',
              'Washington St',
              'Webster St',
              'Williams St'},
             'St.': {'9th St.',
              'Buchanan St.',
              'California St.',
              'Divisadero St.',
              'Fillmore St.',
              'Fulton St.',
              'Haight St.',
              'Halleck St.',
              'Jennings St.',
              'Market St.',
              'Pine St.',
              'Sutter St.',
              'Valencia St.',
              'Webster St.'},
             'Steet': {'Guerrero Steet'},
             'Steps': {'Bancroft Steps'},
             'Sutter': {'Sutter'},
             'Telegraph': {'3605 Telegraph'},
             'Valencia': {'Valencia'},
             'View': {'Norwood View'},
             'Walk': {'Fountain Walk', 'Rose Walk', 'Terrace Walk'},
             'Wedemeyer': {'Wedemeyer'},
             'West': {'Buena Vista Avenue West', 'Embarcadero West'},
             'arrowhead': {'lake arrowhead'},
             'ave': {'central ave'},
             'avenue': {'Santa Cruz avenue'},
             'broadway': {'broadway'},
             'bush': {'bush'},
             'parkway': {'Bridge parkway'},
             'square': {'33 union square'},
             'st': {'26th st', 'Crane st'},
             'street': {'Vallejo street',
              'Woolsey street',
              'laguna street',
              'market street',
              'townsend street'},
             'sutter': {'sutter'}})

Mapping dictionaries which used to update and clean those amendable data



In [207]:

    
street_mapping = { "St": "Street",
            "St.": "Street",
            "Steet": "Street",
            "st": "Street",
            "street": "Street",
            "Ave": "Avenue",
            "Ave.": "Avenue",
            "ave": "Avenue",
            "avenue": "Avenue",
            "Rd.": "Road",
            "Rd": "Road",
            "Blvd": "Boulevard",
            "Blvd,": "Boulevard",
            "Blvd.": "Boulevard",
            "Boulavard": "Boulevard",
            "Boulvard": "Boulevard",
            "Dr": "Drive",
            "Dr.": "Drive",
            "Pl": "Plaza",
            "Plz": "Plaza",
            "square": "Square"
            }

postcode_mapping={"CA 94030": "94030",
                  "CA 94133": "94133",
                  "CA 94544": "94544",
                  "CA 94103": "94103",
                  "CA:94103": "94103"
                    }
error_postcode={'1087', '515', 'CA'}

cityname_mapping={"Berkeley, CA": "Berkeley",
                  "Fremont ": "Fremont",
                  "Oakland, CA": "Oakland",
                  "Oakland, Ca": "Oakland",
                  "San Francisco, CA": "San Francisco",
                  "San Francisco, CA 94102": "San Francisco",
                  "San Francicsco": "San Francisco",
                  "San Fransisco": "San Francisco",
                  "San Francsico": "San Francisco",
                  "Artherton": "Atherton"
                    }
error_cityname={'155', '157'}

Find out the all type of postal code



In [14]:

    
filename='/Users/yangrenqin/udacity/P3/san-francisco.osm'
def is_street_postcode(elem):
    return (elem.attrib['k'] == "addr:postcode")


def audit(osmfile):
    osm_file = open(osmfile, "r")
    postcode_types = set()
    for event, elem in ET.iterparse(osm_file, events=("start",)):

        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_postcode(tag):
                    postcode_types.add(tag.attrib['v'])
    osm_file.close()
    return postcode_types

postcode=audit(filename)
postcode









    Out[14]:





{'1087',
 '515',
 '90214',
 '93710',
 '94002',
 '94002-3585',
 '94010',
 '94013',
 '94014',
 '94015',
 '94017',
 '94019',
 '94025',
 '94027',
 '94030',
 '94044',
 '94061',
 '94062',
 '94063',
 '94065',
 '94066',
 '94070',
 '94080',
 '94087',
 '94102',
 '94103',
 '94103-3124',
 '94104',
 '94105',
 '94107',
 '94108',
 '94109',
 '94110',
 '94111',
 '94112',
 '94112 ',
 '94113',
 '94114',
 '94115',
 '94115 ',
 '94116',
 '94117',
 '94117-9991',
 '94118',
 '94118-4504',
 '9412',
 '94121',
 '94121 ',
 '94122',
 '94123',
 '941234',
 '94124',
 '94127',
 '94129',
 '94130',
 '94131',
 '94132',
 '94133',
 '94134',
 '94143',
 '94158',
 '94166',
 '94188',
 '94213',
 '94301',
 '94301-2019',
 '94303',
 '94401',
 '94402',
 '94403',
 '94404',
 '94501',
 '94502',
 '94519',
 '94523',
 '94530',
 '94536',
 '94541',
 '94544',
 '94545',
 '94546',
 '94549',
 '94549-5506',
 '94552',
 '94555',
 '94556',
 '94560',
 '94563',
 '94577',
 '94578',
 '94579',
 '94587',
 '94595',
 '94596',
 '94597',
 '94598',
 '94601',
 '94602',
 '94603',
 '94605',
 '94606',
 '94606-3636',
 '94607',
 '94608',
 '94609',
 '94610',
 '94611',
 '94612',
 '94612-2202',
 '94613',
 '94618',
 '94619',
 '94621',
 '94702',
 '94703',
 '94704',
 '94705',
 '94706',
 '94707',
 '94708',
 '94709',
 '94710',
 '94720',
 '94720-1076',
 '94801',
 '94804',
 '94805',
 '94901',
 '94904',
 '94920',
 '94925',
 '94939',
 '94941',
 '94965',
 '95498',
 'CA',
 'CA 94030',
 'CA 94133',
 'CA 94544',
 'CA:94103'}

Find out all type of city names



In [15]:

    
filename='/Users/yangrenqin/udacity/P3/san-francisco.osm'

def is_city(elem):
    return (elem.attrib['k'] == "addr:city")


def audit(osmfile):
    osm_file = open(osmfile, "r")
    city_types = set()
    for event, elem in ET.iterparse(osm_file, events=("start",)):

        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_city(tag):
                    city_types.add(tag.attrib['v'])
    osm_file.close()
    return city_types

cityname=audit(filename)
cityname









    Out[15]:





{'155',
 '157',
 'Alameda',
 'Albany',
 'Artherton',
 'Atherton',
 'Belmont',
 'Berkeley',
 'Berkeley, CA',
 'Brisbane',
 'Burlingame',
 'Castro Valley',
 'Daly City',
 'East Palo Alto',
 'El Cerrito',
 'Emeryville',
 'Foster City',
 'Fremont',
 'Fremont ',
 'Greenbrae',
 'Half Moon Bay',
 'Hayward',
 'Kensington',
 'Kentfield',
 'Lafayette',
 'Marin City',
 'Menlo Park',
 'Mill Valley',
 'Montara',
 'Moraga',
 'Newark',
 'Oakland',
 'Oakland, CA',
 'Oakland, Ca',
 'Orinda',
 'Pacifica',
 'Palo Alto',
 'Piedmont',
 'Pleasant Hill',
 'Redwood City',
 'Richmond',
 'San Bruno',
 'San Carlos',
 'San Francicsco',
 'San Francisco',
 'San Francisco, CA',
 'San Francisco, CA 94102',
 'San Francsico',
 'San Fransisco',
 'San Leandro',
 'San Mateo',
 'Sausalito',
 'South San Francisco',
 'Tiburon',
 'Union City',
 'Walnut Creek',
 'berkeley',
 'oakland',
 'san francisco'}

Audit, correct and write data from xml into csv files



In [218]:

    
import csv
import codecs
import re
import xml.etree.cElementTree as ET




OSM_PATH = "/Users/yangrenqin/udacity/P3/san-francisco.osm"

NODES_PATH = "/Users/yangrenqin/udacity/P3/nodes.csv"
NODE_TAGS_PATH = "/Users/yangrenqin/udacity/P3/nodes_tags.csv"
WAYS_PATH = "/Users/yangrenqin/udacity/P3/ways.csv"
WAY_NODES_PATH = "/Users/yangrenqin/udacity/P3/ways_nodes.csv"
WAY_TAGS_PATH = "/Users/yangrenqin/udacity/P3/ways_tags.csv"

LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+')
PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)


# Make sure the fields order in the csvs matches the column order in the sql table schema
NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']


def capitalize(a):
    b=''
    for i in a.split(' '):
    
        i=i.capitalize()+' '
        b=b+i
    a=b.strip()
    return a

def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS,
                  problem_chars=PROBLEMCHARS, default_tag_type='regular'):
    

    node_attribs = {}
    way_attribs = {}
    way_nodes = []
    tags = []  # Handle secondary tags the same way for both node and way elements

    if element.tag == 'node':
        for i in element.attrib:
            if i in node_attr_fields:
                node_attribs[i]=element.attrib[i]
        if element.getchildren() == []:
            pass
        else:
            for tag in element.iter('tag'):
                node_tags={}
                k=tag.attrib['k']
                if PROBLEMCHARS.search(k):
                    continue
                elif LOWER_COLON.search(k):
                    
                    if LOWER_COLON.search(k).group() == k:
                        if k == "addr:street":
                            m = street_type_re.search(tag.attrib['v'])
                            if m:
                                street_type = m.group()
                                if street_type in error_street_type:
                                    if street_type in street_mapping:
                                        tag.attrib['v']=tag.attrib['v'].replace(street_type,street_mapping[street_type])
                                    else:
                                        continue
                                node_tags['key']=k.split(':')[1]
                                node_tags['type']=k.split(':')[0]
                                node_tags['id']=element.attrib['id']
                                node_tags['value']=tag.attrib['v']
                                tags.append(node_tags)
                            else:
                                continue
                        if k == "addr:postcode":
                            if tag.attrib['v'] in error_postcode:
                                continue
                            else: 
                                if tag.attrib['v'] in postcode_mapping:
                                    tag.attrib['v']=postcode_mapping[tag.attrib['v']]
                                node_tags['key']=k.split(':')[1]
                                node_tags['type']=k.split(':')[0]
                                node_tags['id']=element.attrib['id']
                                node_tags['value']=tag.attrib['v']
                                tags.append(node_tags)
                        if k == "addr:city":
                            if tag.attrib['v'] in error_cityname:
                                continue
                            else: 
                                if tag.attrib['v'] in cityname_mapping:
                                    tag.attrib['v']=cityname_mapping[tag.attrib['v']]
                                node_tags['key']=k.split(':')[1]
                                node_tags['type']=k.split(':')[0]
                                node_tags['id']=element.attrib['id']
                                node_tags['value']=capitalize(tag.attrib['v'])
                                tags.append(node_tags)
                    else:
                        node_tags['key']=k.partition(':')[-1]
                        node_tags['type']=k.partition(':')[0]
                        node_tags['id']=element.attrib['id']
                        node_tags['value']=tag.attrib['v']
                        tags.append(node_tags)
                else:
                    node_tags['id']=element.attrib['id']
                    node_tags['value']=tag.attrib['v']
                    node_tags['key']=k
                    node_tags['type']=default_tag_type
                    tags.append(node_tags)
    if element.tag == 'way':
        for i in element.attrib:
            if i in way_attr_fields:
                way_attribs[i]=element.attrib[i]
        if element.getchildren() == []:
            pass
        else:
            for tag in element.iter('tag'):
                way_tags={}
                k=tag.attrib['k']
                if PROBLEMCHARS.search(k):
                    continue
                elif LOWER_COLON.search(k):
                    if LOWER_COLON.search(k).group() == k:
                        if k == "addr:street":
                            m = street_type_re.search(tag.attrib['v'])
                            if m:
                                street_type = m.group()
                                if street_type in error_street_type:
                                    if street_type in street_mapping:
                                        tag.attrib['v']=tag.attrib['v'].replace(street_type,street_mapping[street_type])
                                    else:
                                        continue
                                way_tags['key']=k.split(':')[1]
                                way_tags['type']=k.split(':')[0]
                                way_tags['id']=element.attrib['id']
                                way_tags['value']=tag.attrib['v']
                                tags.append(way_tags)
                            else:
                                continue
                        if k == "addr:postcode":
                            if tag.attrib['v'] in error_postcode:
                                continue
                            else: 
                                if tag.attrib['v'] in postcode_mapping:
                                    tag.attrib['v']=postcode_mapping[tag.attrib['v']]
                                way_tags['key']=k.split(':')[1]
                                way_tags['type']=k.split(':')[0]
                                way_tags['id']=element.attrib['id']
                                way_tags['value']=tag.attrib['v']
                                tags.append(way_tags)
                        if k == "addr:city":
                            if tag.attrib['v'] in error_cityname:
                                continue
                            else: 
                                if tag.attrib['v'] in cityname_mapping:
                                    tag.attrib['v']=cityname_mapping[tag.attrib['v']]
                                way_tags['key']=k.split(':')[1]
                                way_tags['type']=k.split(':')[0]
                                way_tags['id']=element.attrib['id']
                                way_tags['value']=capitalize(tag.attrib['v'])
                                tags.append(way_tags)
                    else:
                        way_tags['key']=k.partition(':')[-1]
                        way_tags['type']=k.partition(':')[0]
                        way_tags['id']=element.attrib['id']
                        way_tags['value']=tag.attrib['v']
                        tags.append(way_tags)
                else:
                    way_tags['id']=element.attrib['id']
                    way_tags['value']=tag.attrib['v']
                    way_tags['key']=k
                    way_tags['type']=default_tag_type
                    tags.append(way_tags)
            
            for i,nd in enumerate(element.iter('nd')):
                way_nd={}
                way_nd['id']=element.attrib['id']
                way_nd['node_id']=nd.attrib['ref']
                way_nd['position']=i
                way_nodes.append(way_nd)
                
                    
    if element.tag == 'node':
        if element.getchildren() == []:
            return {'node': node_attribs}
        else:
            return {'node': node_attribs, 'node_tags': tags}
    elif element.tag == 'way':
        if element.getchildren() == []:
            return {'way': way_attribs}
        else:
            return {'way': way_attribs, 'way_nodes': way_nodes, 'way_tags': tags}



In [219]:

    
def get_element(osm_file, tags=('node', 'way')):
    """Yield element if it is the right type of tag"""

    context = ET.iterparse(osm_file, events=('start', 'end'))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()
            
def is_numeric(s):
    try:
        float(s)
        return True
    except ValueError:
        return False

def keep_numeric(original):
    for i,v in original.items():
        if is_numeric(v):
            if float(v).is_integer():
                original[i]=int(float(v))
            else:
                original[i]=float(v)
    return original

def process_map(file_in):
    """Iteratively process each XML element and write to csv(s)"""

    with codecs.open(NODES_PATH, 'w') as nodes_file, codecs.open(NODE_TAGS_PATH, 'w') as nodes_tags_file, \
         codecs.open(WAYS_PATH, 'w') as ways_file,codecs.open(WAY_NODES_PATH, 'w') as way_nodes_file, \
         codecs.open(WAY_TAGS_PATH, 'w') as way_tags_file:

        nodes_writer = csv.DictWriter(nodes_file, NODE_FIELDS)
        node_tags_writer = csv.DictWriter(nodes_tags_file, NODE_TAGS_FIELDS)
        ways_writer = csv.DictWriter(ways_file, WAY_FIELDS)
        way_nodes_writer = csv.DictWriter(way_nodes_file, WAY_NODES_FIELDS)
        way_tags_writer = csv.DictWriter(way_tags_file, WAY_TAGS_FIELDS)

        nodes_writer.writeheader()
        node_tags_writer.writeheader()
        ways_writer.writeheader()
        way_nodes_writer.writeheader()
        way_tags_writer.writeheader()


        for element in get_element(file_in, tags=('node', 'way')):
            el = shape_element(element)
            if el:
                if element.tag == 'node':
                    if element.getchildren() == []:
                        nodes_writer.writerow(keep_numeric(el['node']))
                    else:
                        nodes_writer.writerow(keep_numeric(el['node']))
                        node_tags_writer.writerows([keep_numeric(i) for i in el['node_tags']])
                elif element.tag == 'way':
                    if element.getchildren() == []:
                        ways_writer.writerow(keep_numeric(el['way']))
                    else:
                        ways_writer.writerow(keep_numeric(el['way']))
                        way_nodes_writer.writerows([keep_numeric(i) for i in el['way_nodes']])
                        way_tags_writer.writerows([keep_numeric(i) for i in el['way_tags']])



In [220]:

    
if __name__ == '__main__':
    process_map(OSM_PATH)

Verify the result of audit and update



In [223]:

    
import sqlite3
import pandas as pd
db = sqlite3.connect('sanfrancisco.db')
c=db.cursor()
query="SELECT tags.value, COUNT(*) as count\
 FROM (SELECT * FROM nodes_tags\
 UNION ALL\
 SELECT * FROM ways_tags) tags\
 WHERE tags.key='street'\
 GROUP BY tags.value\
 ORDER BY count DESC;"
c.execute(query)
rows=pd.DataFrame(c.fetchall(),columns=['Street','count'])
db.close()



In [228]:

    
rows.head(20)









    Out[228]:






  
    
      
      Street
      count
    
  
  
    
      0
      El Camino Real
      386
    
    
      1
      Jefferson Avenue
      324
    
    
      2
      Roosevelt Avenue
      270
    
    
      3
      Hudson Street
      240
    
    
      4
      Woodside Road
      223
    
    
      5
      Hamilton Avenue
      220
    
    
      6
      Madison Avenue
      206
    
    
      7
      Vera Avenue
      201
    
    
      8
      Redwood Avenue
      194
    
    
      9
      Kentfield Avenue
      192
    
    
      10
      Martin Luther King Jr Way
      177
    
    
      11
      King Street
      175
    
    
      12
      Hoover Street
      170
    
    
      13
      Oak Avenue
      168
    
    
      14
      Valota Road
      166
    
    
      15
      Hopkins Avenue
      163
    
    
      16
      Brewster Avenue
      162
    
    
      17
      University Avenue
      157
    
    
      18
      Fulton Street
      156
    
    
      19
      Bay Road
      154



In [231]:

    
rows.tail(20)









    Out[231]:






  
    
      
      Street
      count
    
  
  
    
      1359
      West MacArthur Boulevard
      1
    
    
      1360
      West Parnassus Court
      1
    
    
      1361
      West Ranger Avenue
      1
    
    
      1362
      Westmoor Avenue
      1
    
    
      1363
      Whittle Avenue
      1
    
    
      1364
      William Saroyan Place
      1
    
    
      1365
      Williams Street
      1
    
    
      1366
      Willie Mays Plaza
      1
    
    
      1367
      Wood Street
      1
    
    
      1368
      Woodminster Lane
      1
    
    
      1369
      Yacht Road
      1
    
    
      1370
      Yosemite Avenue
      1
    
    
      1371
      Youngs Valley Road
      1
    
    
      1372
      Zoo Avenue
      1
    
    
      1373
      central Avenue
      1
    
    
      1374
      market Street
      1
    
    
      1375
      pine Street
      1
    
    
      1376
      shattuck Avenue
      1
    
    
      1377
      townsend Street
      1
    
    
      1378
      ygnacio Valley Road
      1



In [238]:

    
db = sqlite3.connect('sanfrancisco.db')
c=db.cursor()
query="SELECT tags.value, COUNT(*) as count\
 FROM (SELECT * FROM nodes_tags\
 UNION ALL\
 SELECT * FROM ways_tags) tags\
 WHERE tags.key='postcode'\
 GROUP BY tags.value\
 ORDER BY count DESC;"
c.execute(query)
rows=pd.DataFrame(c.fetchall(),columns=['Postcode','count'])
db.close()



In [239]:

    
rows.head(10)



In [240]:

    
rows.tail(10)



In [241]:

    
db = sqlite3.connect('sanfrancisco.db')
c=db.cursor()
query="SELECT tags.value, COUNT(*) as count\
 FROM (SELECT * FROM nodes_tags\
 UNION ALL\
 SELECT * FROM ways_tags) tags\
 WHERE tags.key='city'\
 GROUP BY tags.value\
 ORDER BY count DESC;"
c.execute(query)
rows=pd.DataFrame(c.fetchall(),columns=['City','count'])
db.close()



In [242]:

    
rows









    Out[242]:






  
    
      
      City
      count
    
  
  
    
      0
      Redwood City
      23533
    
    
      1
      Berkeley
      3358
    
    
      2
      Palo Alto
      1651
    
    
      3
      San Francisco
      1379
    
    
      4
      Union City
      252
    
    
      5
      Burlingame
      158
    
    
      6
      Oakland
      143
    
    
      7
      San Mateo
      38
    
    
      8
      Alameda
      24
    
    
      9
      Walnut Creek
      18
    
    
      10
      Albany
      16
    
    
      11
      Hayward
      15
    
    
      12
      Atherton
      13
    
    
      13
      San Carlos
      12
    
    
      14
      Fremont
      11
    
    
      15
      Emeryville
      8
    
    
      16
      Mill Valley
      6
    
    
      17
      San Leandro
      6
    
    
      18
      Belmont
      4
    
    
      19
      Pacifica
      4
    
    
      20
      Castro Valley
      3
    
    
      21
      Daly City
      3
    
    
      22
      Lafayette
      3
    
    
      23
      Menlo Park
      3
    
    
      24
      Piedmont
      3
    
    
      25
      Richmond
      3
    
    
      26
      Sausalito
      3
    
    
      27
      Brisbane
      2
    
    
      28
      Foster City
      2
    
    
      29
      Half Moon Bay
      2
    
    
      30
      Newark
      2
    
    
      31
      San Bruno
      2
    
    
      32
      East Palo Alto
      1
    
    
      33
      El Cerrito
      1
    
    
      34
      Greenbrae
      1
    
    
      35
      Kensington
      1
    
    
      36
      Kentfield
      1
    
    
      37
      Marin City
      1
    
    
      38
      Montara
      1
    
    
      39
      Moraga
      1
    
    
      40
      Orinda
      1
    
    
      41
      Pleasant Hill
      1
    
    
      42
      South San Francisco
      1
    
    
      43
      Tiburon
      1

Data overview part



In [270]:

    
db = sqlite3.connect('sanfrancisco.db')
c=db.cursor()
query="SELECT nodes_tags.value,COUNT(*) as num\
 FROM nodes_tags, (SELECT DISTINCT(id) FROM nodes_tags WHERE value='cafe') AS i\
 WHERE nodes_tags.id=i.id AND nodes_tags.key='name'\
 GROUP BY nodes_tags.value\
 ORDER BY num desc\
 LIMIT 1;"
c.execute(query)
result=c.fetchall()
db.close()



In [271]:

    
result









    Out[271]:





[('Starbucks', 68)]

I just repeatly use this code to perform different query to database.



In [ ]:

	Street	count
0	El Camino Real	386
1	Jefferson Avenue	324
2	Roosevelt Avenue	270
3	Hudson Street	240
4	Woodside Road	223
5	Hamilton Avenue	220
6	Madison Avenue	206
7	Vera Avenue	201
8	Redwood Avenue	194
9	Kentfield Avenue	192
10	Martin Luther King Jr Way	177
11	King Street	175
12	Hoover Street	170
13	Oak Avenue	168
14	Valota Road	166
15	Hopkins Avenue	163
16	Brewster Avenue	162
17	University Avenue	157
18	Fulton Street	156
19	Bay Road	154

	Street	count
1359	West MacArthur Boulevard	1
1360	West Parnassus Court	1
1361	West Ranger Avenue	1
1362	Westmoor Avenue	1
1363	Whittle Avenue	1
1364	William Saroyan Place	1
1365	Williams Street	1
1366	Willie Mays Plaza	1
1367	Wood Street	1
1368	Woodminster Lane	1
1369	Yacht Road	1
1370	Yosemite Avenue	1
1371	Youngs Valley Road	1
1372	Zoo Avenue	1
1373	central Avenue	1
1374	market Street	1
1375	pine Street	1
1376	shattuck Avenue	1
1377	townsend Street	1
1378	ygnacio Valley Road	1

	Postcode	count
0	94063	358
1	94587	250
2	94109	200
3	94103	192
4	94061	161
5	94114	129
6	94113	111
7	94110	65
8	94102	63
9	94107	63

	Postcode	count
120	94549-5506	1
121	94552	1
122	94563	1
123	94606-3636	1
124	94612-2202	1
125	94621	1
126	94708	1
127	94720	1
128	94720-1076	1
129	95498	1

	City	count
0	Redwood City	23533
1	Berkeley	3358
2	Palo Alto	1651
3	San Francisco	1379
4	Union City	252
5	Burlingame	158
6	Oakland	143
7	San Mateo	38
8	Alameda	24
9	Walnut Creek	18
10	Albany	16
11	Hayward	15
12	Atherton	13
13	San Carlos	12
14	Fremont	11
15	Emeryville	8
16	Mill Valley	6
17	San Leandro	6
18	Belmont	4
19	Pacifica	4
20	Castro Valley	3
21	Daly City	3
22	Lafayette	3
23	Menlo Park	3
24	Piedmont	3
25	Richmond	3
26	Sausalito	3
27	Brisbane	2
28	Foster City	2
29	Half Moon Bay	2
30	Newark	2
31	San Bruno	2
32	East Palo Alto	1
33	El Cerrito	1
34	Greenbrae	1
35	Kensington	1
36	Kentfield	1
37	Marin City	1
38	Montara	1
39	Moraga	1
40	Orinda	1
41	Pleasant Hill	1
42	South San Francisco	1
43	Tiburon	1