In [1]:
import os
import xml.etree.cElementTree as ET
import pprint
import re
import codecs
import json
import collections
import bson
import pymongo
In [2]:
# Data used - Weekly OSM Metro Extracts from:
# https://mapzen.com/metro-extracts/
# https://s3.amazonaws.com/metro-extracts.mapzen.com/albuquerque_new-mexico.osm.bz2
# https://s3.amazonaws.com/metro-extracts.mapzen.com/honolulu_hawaii.osm.bz2
# Relative paths to the two city extracts under the local 'data' directory.
DATADIR = "data"
DATAFILE = "honolulu_hawaii.osm"
DATAFILE2 = "albuquerque_new-mexico.osm"
HNL_DATA = os.path.join(DATADIR, DATAFILE)
ABQ_DATA = os.path.join(DATADIR, DATAFILE2)
In [3]:
def count_tags(filename):
    """Tally how many times each XML tag name appears in the OSM file.

    Returns a defaultdict mapping tag name -> occurrence count.
    """
    tag_counts = collections.defaultdict(int)
    # Unpack the (event, element) pairs directly instead of indexing.
    for _, elem in ET.iterparse(filename, events=("start",)):
        tag_counts[elem.tag] += 1
    return tag_counts
# Tag-name counts for each city's raw OSM file.
hnl_tags = count_tags(HNL_DATA)
abq_tags = count_tags(ABQ_DATA)
pprint.pprint(hnl_tags)
pprint.pprint(abq_tags)
In [4]:
# Key-classification regexes: all lowercase/underscore, lowercase with one
# colon, or containing characters that would be problematic as MongoDB keys.
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

def key_type(element, keys):
    """Bump the bucket in `keys` matching the 'k' attribute of a 'tag' element.

    Elements that are not 'tag' leave `keys` untouched. Returns `keys`.
    """
    if element.tag != "tag":
        return keys
    k = element.attrib['k']
    if lower.search(k):
        keys['lower'] += 1
    elif lower_colon.search(k):
        keys['lower_colon'] += 1
    elif problemchars.search(k):
        keys["problemchars"] += 1
    else:
        keys['other'] += 1
    return keys
def process_map(filename):
    """Iterate every element in the OSM file, tallying key categories
    via key_type(). Returns the counts dict."""
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)
    return keys
# Categorize all tag keys in each city's data.
hnl_all_keys = process_map(HNL_DATA)
abq_all_keys = process_map(ABQ_DATA)
print hnl_all_keys
print abq_all_keys
This shows many keys are present with colons, such as 'addr:postcode'
In [34]:
# Matches the final word of a street name (optionally ending in '.'),
# e.g. 'St.' in 'Main St.' -- this trailing token is audited as the street type.
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
# Street-type words considered already clean; anything else gets flagged.
expected = ["Street", "Avenue", "Boulevard", "Drive", "Court",
"Place", "Square", "Lane", "Road", "Trail", "Parkway", "Commons",
"Highway", "Loop", "Circle", "Walk", "Way", "Southwest", "Northeast",
"Southeast", "Northwest"]
# UPDATE THIS VARIABLE
# Normalized abbreviation (lowercase, punctuation stripped) -> full word.
# NOTE(review): "sr": "Drive" looks suspicious ('SR' conventionally means
# State Route) -- confirm against the underlying data.
mapping = { "pl": "Place",
"st": "Street",
"ave": "Avenue",
"rd": "Road",
"w": "West",
"n": "North",
"s": "South",
"e": "East",
"blvd":"Boulevard",
"sr": "Drive",
"ct": "Court",
"ne": "Northeast",
"se": "Southeast",
"nw": "Northwest",
"sw": "Southwest",
"dr": "Drive",
"sq": "Square",
"ln": "Lane",
"trl": "Trail",
"pkwy": "Parkway",
"ste": "Suite",
"lp": "Loop",
"hwy": "Highway",
"cir": "Circle"}
def audit_street_type(street_types, street_name):
    """Record street_name under its trailing street-type token when that
    token is not in `expected`; prints each flagged name as a side effect.

    street_types: defaultdict(set) mapping street type -> full street names.
    """
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            print "STREET NAME: ", street_name
            print "STREET NOT IN EXPECTED:", street_type
            street_types[street_type].add(street_name)
def is_street_name(elem):
    """True when the tag element's key is exactly 'addr:street'."""
    return elem.attrib['k'] == "addr:street"
def audit(osmfile):
    """Collect street names whose trailing street-type token is not in `expected`.

    Returns a defaultdict(set) mapping unexpected street type -> set of full
    street names found under 'node'/'way' elements.
    """
    street_types = collections.defaultdict(set)
    # FIX: pass the filename straight to iterparse. The original opened the
    # file with open() and never closed it, leaking the file handle.
    for event, elem in ET.iterparse(osmfile, events=("start",)):
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])
    return street_types
def update_name(name, mapping):
    """Expand abbreviated parts of a street name using `mapping`.

    Each whitespace-separated part is normalized (trailing ','/'.' stripped,
    lowercased) and, when found in `mapping`, replaced by its full form.
    All parts are re-capitalized and joined with single spaces.
    """
    after = []
    # Split name string to test each part of the name;
    # replacements may come anywhere in the name.
    for part in name.split(" "):
        # Normalize each address part to check against normalized dict keys.
        # FIX: the original used strip(",\.") -- '\.' is a regex-style escape
        # that is invalid in a plain string (SyntaxWarning in modern Python)
        # and also stripped literal backslashes; only ',' and '.' are intended.
        part = part.strip(",.").lower()
        # If it exists in the dict, overwrite that part with the full form.
        if part in mapping:
            part = mapping[part]
        # Capitalize each address part before adding it back.
        # .capitalize() instead of .title() so '1st' stays '1st', not '1St'.
        after.append(part.capitalize())
    # Return all pieces of the name as a string joined by a space.
    return " ".join(after)
In [35]:
# Audit street names in each city's extract.
hnl_st_types = audit(HNL_DATA)
In [36]:
abq_st_types = audit(ABQ_DATA)
In [6]:
# Mapping variables were updated: see full list above
# Preview every Honolulu street name against its cleaned version.
for st_type, ways in hnl_st_types.iteritems():
    for name in ways:
        better_name = update_name(name, mapping)
        # Show all street names in way nodes.
        print name, "=>", better_name
In [7]:
# Mapping variables were updated: see full list above
for st_type, ways in abq_st_types.iteritems():
    for name in ways:
        better_name = update_name(name, mapping)
        # Only show changed street names in way nodes
        # since there are a lot more in this city data.
        if name != better_name:
            print name, "=>", better_name
# One street would need to be cleaned a lot: 1833 8th Street Northwestalbuquerque Nm 87102
# But almost all have been cleaned pretty well.
In [8]:
# Covers cases encountered in cleaning ('96826', 'HI 96819', '96734-9998')
def check_5_digits(new_postal):
    """Return the postal code unchanged if it is 5 digits long; otherwise
    return it as a string tagged with a descriptive error prefix."""
    if len(str(new_postal)) == 5:
        return new_postal
    else:
        return "NOT 5 DIGITS:" + str(new_postal)

def correct_postal_codes(postal):
    """Normalize a raw postcode string to a 5-digit int where possible.

    Handles plain codes ('96826'), state-prefixed codes ('HI 96819') and
    ZIP+4 codes ('96734-9998'). Unparseable values come back as tagged
    error strings rather than raising.
    """
    # Try to convert numbers as intended.
    # No need to check conditions with if statements when the vast majority are valid.
    try:
        new_postal = int(postal)
        # Check for 5 digits with the helper function.
        return check_5_digits(new_postal)
    except ValueError:
        # A string with a hyphen and trailing digits, e.g. '96734-9998':
        if '-' in postal:
            # Grab the leading digits and convert to int.
            postal = int(postal.split('-')[0])
            # Check for 5 digits with the helper function.
            return check_5_digits(postal)
        # A space-separated value such as 'HI 96819': keep 5-char pieces.
        if ' ' in postal:
            postals = postal.split(' ')
            new_postals = [p for p in postals if len(p) == 5]
            try:
                # Check the first candidate for a valid 5 digit number.
                new_postals = int(new_postals[0])
                return check_5_digits(new_postals)
            # Otherwise return the error thrown and the candidates.
            except Exception as err:
                return err, new_postals
        else:
            # FIX: the original did `ValueError + ":" + str(postal)`, which
            # raises TypeError (class + str). Return a tagged string instead.
            return "ValueError:" + str(postal)
In [18]:
# Regex helpers (duplicated from the earlier audit cell so this cell can run
# on its own).
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
CREATED = [ "version", "changeset", "timestamp", "user", "uid"]
def is_address(elem):
    # True when the tag key starts with 'addr:'; implicitly returns None otherwise.
    if elem.attrib['k'][:5] == "addr:":
        return True
def shape_element(element):
    """Shape a 'node'/'way' XML element into a dict ready for MongoDB.

    Cleans postcodes (correct_postal_codes) and street names (update_name)
    before storing address fields. Returns None for other element types.
    """
    # Make an empty dictionary for the output node/element.
    node = {}
    # Process 'node' or 'way' elements only.
    if element.tag == "node" or element.tag == "way":
        address_info = {}
        nd_info = []
        # Add 'node'/'way' as 'type'
        node["type"] = element.tag
        # Add 'id' as 'id'
        node["id"] = element.attrib["id"]
        # If visible exists, add it to dict
        if "visible" in element.attrib.keys():
            node["visible"] = element.attrib["visible"]
        # Add 'lat'/'lon' if they are available
        if "lat" in element.attrib.keys():
            node["pos"] = [float(element.attrib['lat']), float(element.attrib['lon'])]
        # Add version, changeset, timestamp, uid, and user under key 'created'.
        # NOTE(review): assumes all five attributes are always present --
        # a missing one would raise KeyError; confirm against the data.
        node["created"] = {"version": element.attrib['version'],
                           "changeset": element.attrib['changeset'],
                           "timestamp": element.attrib['timestamp'],
                           "uid": element.attrib['uid'],
                           "user": element.attrib['user']}
        # Iterate through the tags of k,v pairs.
        for tag in element.iter("tag"):
            p = problemchars.search(tag.attrib['k'])
            if p:
                # Keys containing problem characters are skipped entirely.
                continue
            elif is_address(tag):
                if ":" in tag.attrib['k'][5:]:
                    # A second ':' after 'addr:' (e.g. 'addr:street:name')
                    # counts as a bad address key for this script. Skip.
                    continue
                else:
                    # Add the part after 'addr:' as a key and the value from
                    # 'v' as the value in our address_info dict,
                    # i.e. 'addr:state' will add 'state'.
                    #
                    # Check the postcodes and correct them if applicable
                    # before adding into address_info.
                    if tag.attrib['k'][5:] == 'postcode':
                        tag.attrib['v'] = correct_postal_codes(tag.attrib['v'])
                    # Find the streets, and update them with our predefined
                    # function and mapping dictionary from above.
                    if tag.attrib['k'][5:] == 'street':
                        tag.attrib['v'] = update_name(tag.attrib['v'], mapping)
                    # Add the (updated) address values in the address_info
                    # dictionary under their associated keys, i.e. 'street'.
                    address_info[tag.attrib['k'][5:]] = tag.attrib['v']
            else:
                # No 'addr:' prefix: add 'k' as a key and 'v' as a value
                # directly on the node dict.
                node[tag.attrib['k']] = tag.attrib['v']
        # If we found 'addr:' info and added it to our address_info dict,
        if address_info != {}:
            # then add that address_info dict under the key 'address'.
            node['address'] = address_info
        # Iterate through the 'nd' nodes if they exist.
        for tag2 in element.iter("nd"):
            # Add each entry to a running list.
            nd_info.append(tag2.attrib['ref'])
        # If the resulting list isn't empty,
        if nd_info != []:
            # add the list under the key 'node_refs'.
            node['node_refs'] = nd_info
        return node
    else:
        # If the element isn't 'node' or 'way', just return None.
        return None
def process_map(file_in, pretty = False):
    """Parse the OSM file, shape each element, and write one JSON record
    per line to '<file_in>.json'. Returns the list of shaped records."""
    file_out = "{0}.json".format(file_in)
    data = []
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if el:
                data.append(el)
                if pretty:
                    fo.write(json.dumps(el, indent=2)+"\n")
                else:
                    fo.write(json.dumps(el) + "\n")
    return data
In [19]:
# Write cleaned JSON for each city (no city tag in this first pass).
data = process_map("data/honolulu_hawaii.osm", False)
In [20]:
data = process_map("data/albuquerque_new-mexico.osm", False)
In [2]:
## Functions to load our database and collection in pymongo
from pymongo import MongoClient
# Function to return a database of the name specified.
# We want a database named 'project' in this case.
def get_db(db_name):
    """Connect to the local MongoDB instance and return the named database."""
    client = MongoClient("mongodb://localhost:27017")
    return client[db_name]
## Function to return the collection we want to use in MongoDB
def get_collection(db, collection):
    """Return the named collection from the given database object."""
    return db[collection]
In [50]:
## Function to insert json data into MongoDB
def insert_data(json_data, db_collection):
    """Read one JSON document per line from the file `json_data` and insert
    each into `db_collection`. Prints 'Complete.' when done."""
    with open(json_data, 'r') as f:
        ## json.loads() takes a string, while json.load() takes a file-like object.
        ## http://stackoverflow.com/questions/11568246/
        ## /loading-several-text-files-into-mongodb-using-pymongo
        # FIX: iterate the file lazily instead of readlines() (avoids loading
        # the whole dump into memory), and use insert_one():
        # Collection.insert() was deprecated in pymongo 3.0, removed in 4.0.
        for each_line in f:
            db_collection.insert_one(json.loads(each_line))
    print("Complete.")
In [3]:
def map_aggregate(db, collection, pipeline):
    """Run the aggregation pipeline against db[collection] and return its result."""
    return db[collection].aggregate(pipeline)
How do postcodes look?
We can initially see that some cleanup is needed for the zipcodes. This includes codes with a state prefix (e.g. 'HI 96819') and ZIP+4 codes (e.g. '96734-9998').
After adding an address cleaning function, all zipcodes now adhere to a 5 digit code, but a few (such as 89197 and 87100) with only 1 entry may have been entered incorrectly.
In [117]:
# Regex helpers (duplicated again so this cell can run standalone).
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
CREATED = [ "version", "changeset", "timestamp", "user", "uid"]
def is_address(elem):
    # True when the tag key starts with 'addr:'; implicitly returns None otherwise.
    if elem.attrib['k'][:5] == "addr:":
        return True
def shape_element(element, city_name):
    '''
    Function to pull information for way/node entries from .osm files and
    store as json records for loading into MongoDB.
    Returns the data record as a dictionary, or None for other element types.
    Example structure:
    {city_name: value,
     user: value,
     type: value,
     id: value,
     visible: value,
     address: {street: value,
               postcode: value,
               housenumber: value,
               housename: value},
     created: {version: value,
               changeset: value,
               timestamp: value,
               uid: value,
               user: value},
     lat: value,
     lon: value,
     node_refs: value}
    '''
    # NOTE(review): unlike the earlier shape_element, this version cleans
    # postcodes but does NOT run update_name on streets -- confirm whether
    # that omission is intentional.
    # Make an empty dictionary for the output node/element.
    node = {}
    # Process 'node' or 'way' elements only.
    if element.tag == "node" or element.tag == "way":
        ## Add in the city name to each node before writing
        ## out the json file.
        node["city_name"] = city_name
        # Create a sub-dictionary for the address info.
        address_info = {}
        nd_info = []
        # Add 'node'/'way' as 'type'
        node["type"] = element.tag
        # Add 'id' as 'id'
        node["id"] = element.attrib["id"]
        # If visible exists, add it to dict
        if "visible" in element.attrib.keys():
            node["visible"] = element.attrib["visible"]
        # Add 'lat'/'lon' if they are available
        if "lat" in element.attrib.keys():
            node["pos"] = [float(element.attrib['lat']), float(element.attrib['lon'])]
        # Add version, changeset, timestamp, uid, and user under key 'created'.
        # NOTE(review): assumes all five attributes are always present --
        # a missing one would raise KeyError; confirm against the data.
        node["created"] = {"version": element.attrib['version'],
                           "changeset": element.attrib['changeset'],
                           "timestamp": element.attrib['timestamp'],
                           "uid": element.attrib['uid'],
                           "user": element.attrib['user']}
        # Iterate through the tags of k,v pairs.
        for tag in element.iter("tag"):
            p = problemchars.search(tag.attrib['k'])
            if p:
                # Keys containing problem characters are skipped entirely.
                continue
            elif is_address(tag):
                if ":" in tag.attrib['k'][5:]:
                    # A second ':' after 'addr:' counts as a bad address key
                    # for this script. Skip.
                    continue
                else:
                    # Add the part after 'addr:' as a key and the value from
                    # 'v' as the value in our address_info dict,
                    # i.e. 'addr:state' will add 'state'.
                    #
                    # Check the postcodes and correct them if applicable
                    # before adding into address_info.
                    if tag.attrib['k'][5:] == 'postcode':
                        tag.attrib['v'] = correct_postal_codes(tag.attrib['v'])
                    address_info[tag.attrib['k'][5:]] = tag.attrib['v']
            else:
                # No 'addr:' prefix: add 'k'/'v' directly on the node dict.
                node[tag.attrib['k']] = tag.attrib['v']
        # If we found 'addr:' info and added it to our address_info dict,
        if address_info != {}:
            # then add that address_info dict under the key 'address'.
            node['address'] = address_info
        # Iterate through the 'nd' nodes if they exist.
        for tag2 in element.iter("nd"):
            # Add each entry to a running list.
            nd_info.append(tag2.attrib['ref'])
        # If the resulting list isn't empty,
        if nd_info != []:
            # add the list under the key 'node_refs'.
            node['node_refs'] = nd_info
        return node
    else:
        # If the element isn't 'node' or 'way', just return None.
        return None
def process_map(file_in, city_name, pretty = False):
    '''
    Function to write dictionary records into a json file
    for loading into MongoDB.
    Writes the dictionaries returned by shape_element out to
    '<file_in>.json', one record per line, tagging each with city_name.
    Returns the list of shaped records.
    '''
    file_out = "{0}.json".format(file_in)
    data = []
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element, city_name)
            if el:
                data.append(el)
                if pretty:
                    fo.write(json.dumps(el, indent=2)+"\n")
                else:
                    fo.write(json.dumps(el) + "\n")
    return data
In [122]:
# Write cleaned JSON (tagged with city_name) for each city.
data = process_map("data/honolulu_hawaii.osm", 'honolulu', False)
In [123]:
data = process_map("data/albuquerque_new-mexico.osm", 'albuquerque', False)
In [4]:
# Get 'project' database
db = get_db('project')
# Get 'cities' collection in the 'project' database.
# Put honolulu and albuquerque city data in this collection.
db_cities = get_collection(db, 'cities')
In [125]:
# Insert each city's line-delimited JSON into the shared collection.
hnl_json_data = 'data/honolulu_hawaii.osm.json'
insert_data(hnl_json_data, db_cities)
In [126]:
abq_json_data = 'data/albuquerque_new-mexico.osm.json'
insert_data(abq_json_data, db_cities)
In [5]:
# Bare expression: displays the Collection object's repr as cell output.
db.cities
Out[5]:
In [8]:
# Top 7 contributors across both cities combined.
# NOTE(review): with pymongo 3+, aggregate() returns a CommandCursor, so
# pprint shows the cursor object rather than the documents -- confirm the
# driver version (list(result) would display the documents themselves).
result = db.cities.aggregate([{"$match":{"created.user":{"$exists":1}}},
                              {"$group":
                               {"_id": {"City":"$city_name",
                                        "User":"$created.user"},
                                "count": {"$sum": 1}}},
                              {"$project": {'_id':0,
                                            "City":"$_id.City",
                                            "User":"$_id.User",
                                            "Count":"$count"}},
                              {"$sort": {"Count": -1}},
                              {"$limit" : 7 }])
pprint.pprint(result)
In [26]:
def make_city_pipeline(city):
    """Pipeline: top 5 contributors (by record count) for the given city."""
    stages = [
        {"$match": {"created.user": {"$exists": 1}, "city_name": city}},
        {"$group": {"_id": {"City": "$city_name", "User": "$created.user"},
                    "count": {"$sum": 1}}},
        {"$project": {'_id': 0,
                      "City": "$_id.City",
                      "User": "$_id.User",
                      "Count": "$count"}},
        {"$sort": {"Count": -1}},
        {"$limit": 5},
    ]
    return stages
# Top contributors per city.
pipeline = make_city_pipeline('honolulu')
result1 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result1)
print ""
pipeline = make_city_pipeline('albuquerque')
result2 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result2)
In [24]:
## Honolulu area and Albuquerque are represented in this dataset.
## Some other areas across Oahu are also sparsely represented
def make_city_pipeline(city):
    """Pipeline: top 10 'address.city' values within one city's extract."""
    stages = [
        {"$match": {"address.city": {"$exists": 1}, "city_name": city}},
        {"$group": {"_id": {"City": "$city_name", "City_name": "$address.city"},
                    "count": {"$sum": 1}}},
        {"$project": {'_id': 0,
                      "Name": "$_id.City_name",
                      "Count": "$count"}},
        {"$sort": {"Count": -1}},
        {"$limit": 10},
    ]
    return stages
# Reported 'address.city' values per extract.
pipeline = make_city_pipeline('honolulu')
result1 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result1)
print ""
pipeline = make_city_pipeline('albuquerque')
result2 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result2)
In [13]:
# Parking is the highest reported amenity by far in both cities.
def make_city_pipeline(city):
    """Pipeline: top 5 amenities for the given city."""
    stages = [
        {"$match": {"amenity": {"$exists": 1}, "city_name": city}},
        {"$group": {"_id": {"City": "$city_name", "Amenity": "$amenity"},
                    "count": {"$sum": 1}}},
        {"$project": {'_id': 0,
                      "City": "$_id.City",
                      "Amenity": "$_id.Amenity",
                      "Count": "$count"}},
        {"$sort": {"Count": -1}},
        {"$limit": 5},
    ]
    return stages
# Top 5 amenities per city.
pipeline = make_city_pipeline('honolulu')
result1 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result1)
print ""
pipeline = make_city_pipeline('albuquerque')
result2 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result2)
In [7]:
def make_city_pipeline(city):
    """Pipeline: the single most common postcode for the given city."""
    stages = [
        {"$match": {"address.postcode": {"$exists": 1}, "city_name": city}},
        {"$group": {"_id": {"City": "$city_name", "Zip": "$address.postcode"},
                    "count": {"$sum": 1}}},
        {"$project": {'_id': 0,
                      "City": "$_id.City",
                      "Zipcode": "$_id.Zip",
                      "Count": "$count"}},
        {"$sort": {"Count": -1}},
        {"$limit": 1},
    ]
    return stages
# Most common postcode per city.
pipeline = make_city_pipeline('honolulu')
result1 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result1)
print ""
pipeline = make_city_pipeline('albuquerque')
result2 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result2)
In [13]:
# Total nodes/records in database.
# NOTE(review): Cursor.count() was deprecated in pymongo 3.7 and removed in
# 4.0 (count_documents is the replacement) -- confirm driver version.
print "Both Cities:", db.cities.find().count()
print "Honolulu:", db.cities.find({'city_name':'honolulu'}).count()
print "Albuquerque:", db.cities.find({'city_name':'albuquerque'}).count()
In [15]:
# Number of 'node' records.
print "Both Cities:", db.cities.find({"type":"node"}).count()
print "Honolulu:", db.cities.find({"type":"node",
                                   'city_name':'honolulu'}).count()
print "Albuquerque:", db.cities.find({"type":"node",
                                      'city_name':'albuquerque'}).count()
In [16]:
# Number of 'way' records.
print "Both Cities:", db.cities.find({'type':'way'}).count()
print "Honolulu:", db.cities.find({'type':'way',
                                   'city_name':'honolulu'}).count()
print "Albuquerque:", db.cities.find({'type':'way',
                                      'city_name':'albuquerque'}).count()
In [17]:
# Number of distinct contributors.
# (The misspelled label 'Constributors' is a runtime string, kept as-is.)
print "Constributors:", len(db.cities.distinct("created.user"))
In [85]:
## Find the most popular amenities
def make_city_pipeline(city):
    """Pipeline: top 10 amenities for the given city."""
    stages = [
        {"$match": {"amenity": {"$exists": 1}, "city_name": city}},
        {"$group": {"_id": {"City": "$city_name", "Amenity": "$amenity"},
                    "count": {"$sum": 1}}},
        {"$project": {'_id': 0,
                      "City": "$_id.City",
                      "Amenity": "$_id.Amenity",
                      "Count": "$count"}},
        {"$sort": {"Count": -1}},
        {"$limit": 10},
    ]
    return stages
# Top 10 amenities per city.
pipeline = make_city_pipeline('honolulu')
result1 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result1)
print ""
pipeline = make_city_pipeline('albuquerque')
result2 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result2)
In [86]:
## Find the most popular places of worship
def make_city_pipeline(city):
    """Pipeline: places of worship per religion for the given city.

    FIX: the original $match literal spelled the "amenity" key twice
    ({"$exists": 1}, then "place_of_worship"); Python keeps only the last
    value, so the $exists clause was silently dead. It is removed here --
    the effective query is unchanged.
    """
    pipeline = [{"$match": {"amenity": "place_of_worship",
                            "city_name": city}},
                {"$group": {"_id": {"City": "$city_name",
                                    "Religion": "$religion"},
                            "count": {"$sum": 1}}},
                {"$project": {"_id": 0,
                              "City": "$_id.City",
                              "Religion": "$_id.Religion",
                              "Count": "$count"}},
                {"$sort": {"Count": -1}},
                {"$limit": 6}]
    return pipeline
# Places of worship by religion, per city.
pipeline = make_city_pipeline('honolulu')
result1 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result1)
print ""
pipeline = make_city_pipeline('albuquerque')
result2 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result2)
In [30]:
## Find the most popular restaurants
## We leave blank cuisines in to get a feel for how many restaurants
## were reported without a cuisine filled in.
def make_city_pipeline(city):
    """Pipeline: restaurant counts per cuisine for the given city.

    FIX: removed the duplicated "amenity" key in the original $match literal
    ({"$exists": 1} was silently overwritten by "restaurant"); the effective
    query is unchanged.
    """
    pipeline = [{"$match": {"amenity": "restaurant",
                            "city_name": city}},
                {"$group": {"_id": {"City": "$city_name",
                                    "Food": "$cuisine"},
                            "count": {"$sum": 1}}},
                {"$project": {"_id": 0,
                              "City": "$_id.City",
                              "Food": "$_id.Food",
                              "Count": "$count"}},
                {"$sort": {"Count": -1}},
                {"$limit": 6}]
    return pipeline
# Restaurant cuisines per city.
pipeline = make_city_pipeline('honolulu')
result1 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result1)
print ""
pipeline = make_city_pipeline('albuquerque')
result2 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result2)
In [88]:
## Find the most popular fast food joints
def make_city_pipeline(city):
    """Pipeline: fast-food counts per cuisine for the given city.

    FIX: removed the duplicated "amenity" key in the original $match literal
    ({"$exists": 1} was silently overwritten by "fast_food"); the effective
    query is unchanged.
    """
    pipeline = [{"$match": {"amenity": "fast_food",
                            "city_name": city}},
                {"$group": {"_id": {"City": "$city_name",
                                    "Food": "$cuisine"},
                            "count": {"$sum": 1}}},
                {"$project": {"_id": 0,
                              "City": "$_id.City",
                              "Food": "$_id.Food",
                              "Count": "$count"}},
                {"$sort": {"Count": -1}},
                {"$limit": 6}]
    return pipeline
# Fast-food cuisines per city.
pipeline = make_city_pipeline('honolulu')
result1 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result1)
print ""
pipeline = make_city_pipeline('albuquerque')
result2 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result2)
In [89]:
## Find the names of the most popular fast food joints
def make_city_pipeline(city):
    """Pipeline: fast-food counts per business name for the given city.

    FIX: removed the duplicated "amenity" key in the original $match literal
    ({"$exists": 1} was silently overwritten by "fast_food"); the effective
    query is unchanged.
    """
    pipeline = [{"$match": {"amenity": "fast_food",
                            "city_name": city}},
                {"$group": {"_id": {"City": "$city_name",
                                    "Name": "$name"},
                            "count": {"$sum": 1}}},
                {"$project": {"_id": 0,
                              "City": "$_id.City",
                              "Name": "$_id.Name",
                              "Count": "$count"}},
                {"$sort": {"Count": -1}},
                {"$limit": 6}]
    return pipeline
# Fast-food business names per city.
pipeline = make_city_pipeline('honolulu')
result1 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result1)
print ""
pipeline = make_city_pipeline('albuquerque')
result2 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result2)
In [90]:
## What are the most popular shops.
def make_city_pipeline(city):
    """Pipeline: top 10 shop types for the given city."""
    stages = [
        {"$match": {"shop": {"$exists": 1}, "city_name": city}},
        {"$group": {"_id": {"City": "$city_name", "Shop": "$shop"},
                    "count": {"$sum": 1}}},
        {"$project": {'_id': 0,
                      "City": "$_id.City",
                      "Shop": "$_id.Shop",
                      "Count": "$count"}},
        {"$sort": {"Count": -1}},
        {"$limit": 10},
    ]
    return stages
# Most common shop types per city.
pipeline = make_city_pipeline('honolulu')
result1 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result1)
print ""
pipeline = make_city_pipeline('albuquerque')
result2 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result2)
In [93]:
## Find the names of the most popular supermarkets
def make_city_pipeline(city):
    """Pipeline: supermarket counts per store name for the given city.

    FIX: removed the duplicated "shop" key in the original $match literal
    ({"$exists": 1} was silently overwritten by "supermarket"); the
    effective query is unchanged.
    """
    pipeline = [{"$match": {"shop": "supermarket",
                            "city_name": city}},
                {"$group": {"_id": {"City": "$city_name",
                                    "Supermarket": "$name"},
                            "count": {"$sum": 1}}},
                {"$project": {'_id': 0,
                              "City": "$_id.City",
                              "Supermarket": "$_id.Supermarket",
                              "Count": "$count"}},
                {"$sort": {"Count": -1}},
                {"$limit": 5}]
    return pipeline
# Most common supermarket names per city.
pipeline = make_city_pipeline('honolulu')
result1 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result1)
print ""
pipeline = make_city_pipeline('albuquerque')
result2 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result2)
In [94]:
## Find the names of the most popular convenience stores
def make_city_pipeline(city):
    """Pipeline: convenience-store counts per store name for the given city.

    FIX: removed the duplicated "shop" key in the original $match literal
    ({"$exists": 1} was silently overwritten by "convenience"); the
    effective query is unchanged.
    """
    pipeline = [{"$match": {"shop": "convenience",
                            "city_name": city}},
                {"$group": {"_id": {"City": "$city_name",
                                    "Name": "$name"},
                            "count": {"$sum": 1}}},
                {"$project": {'_id': 0,
                              "City": "$_id.City",
                              "Name": "$_id.Name",
                              "Count": "$count"}},
                {"$sort": {"Count": -1}},
                {"$limit": 5}]
    return pipeline
# Most common convenience-store names per city.
pipeline = make_city_pipeline('honolulu')
result1 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result1)
print ""
pipeline = make_city_pipeline('albuquerque')
result2 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result2)
In [ ]:
from IPython import utils
from IPython.core.display import HTML
import os
def css_styling():
    """Load default custom.css file from ipython profile.

    NOTE(review): relies on IPython.utils.path.get_ipython_dir and a
    'profile_custom1' profile directory; newer IPython moved
    get_ipython_dir to IPython.paths -- confirm against the installed
    version before relying on this cell.
    """
    base = utils.path.get_ipython_dir()
    styles = "<style>\n%s\n</style>" % (open(os.path.join(base,'profile_custom1/static/custom/custom.css'),'r').read())
    return HTML(styles)
css_styling()