Recall that you ended Lab 10 with the question, "Given a complaint type, what percentage of such complaints were logged in each area of NYC?"
This follow-up lab gives you several solutions. By inspecting and running these examples, you should be able to see their tradeoffs.
In [ ]:
# Open a connection to the on-disk SQLite database of ~2M NYC 311 complaint records.
import sqlite3
disk_engine = sqlite3.connect ('NYC-311-2M.db')
In [ ]:
# Authenticate with the Plotly cloud service so py.iplot() can publish charts.
# NOTE(review): `plotly.plotly` is the legacy cloud-plotting module (moved to
# `chart_studio` in Plotly 4+ — confirm which version this lab targets), and
# the placeholder credentials must be replaced with a real account to run.
import plotly.plotly as py
py.sign_in ('USERNAME', 'PASSWORD') # Connect!
In [ ]:
import pandas as pd
In [ ]:
import itertools
In [ ]:
import time # To benchmark each of these three solutions
import sys # for sys.stdout.flush ()
In [ ]:
from plotly.graph_objs import Bar, Layout
def iplot_percent_complaints_by_type_and_city (traces):
    """
    Render the given list of Plotly bar traces as a stacked bar chart of
    complaint percentages by city, publishing it to the Plotly cloud.
    """
    layout = Layout(barmode='stack',
                    xaxis={'tickangle': 40, 'autorange': False, 'range': [-0.5, 16]},
                    yaxis={'title': 'Percent of Complaints by City'},
                    margin={'b': 150},
                    title='Relative Number of 311 Complaints by City')
    figure = {'data': traces, 'layout': layout}
    return py.iplot(figure, filename='311/relative complaints by city', validate=False)
In [ ]:
# Generate a static list of the top 7 cities
query = '''
SELECT City, COUNT(*) AS NumComplaints
FROM data
WHERE City <> 'None'
GROUP BY City COLLATE NOCASE
ORDER BY -NumComplaints
LIMIT 7
'''
TOP_CITIES = pd.read_sql_query (query, disk_engine)['City']
In [ ]:
# Sanity check: show the seven cities we will analyze.
# Parenthesized print works in both Python 2 and 3, matching the style
# already used in the per-city loops below.
print (TOP_CITIES)
In [ ]:
t1a = time.time ()
# Solution 1, Part A: count ALL complaints by type. These become the
# denominators for the per-city percentages computed later.
query = '''
SELECT ComplaintType, COUNT(*) AS NumComplaints
FROM data
GROUP BY ComplaintType COLLATE NOCASE
ORDER BY -NumComplaints
'''
df = pd.read_sql_query (query, disk_engine)
t1a = time.time () - t1a
# Parenthesized print is valid in both Python 2 and 3, matching the
# style already used elsewhere in this notebook (e.g. the city loops).
print ("[+%gs] Part A" % t1a)
print (df.head ())
In [ ]:
t1b = time.time ()
# Solution 1, Part B: convert the totals into a lookup table (dictionary)
# keyed by the capitalization-normalized complaint type.
total_complaints_by_type = dict (zip ([x.capitalize () for x in df.ComplaintType],
                                      df.NumComplaints))
t1b = time.time () - t1b
# Parenthesized print for Python 2/3 consistency with the rest of the file.
print ("[+%gs] Part B" % t1b)
In [ ]:
# Print a few entries just as a sanity check
# Print a few entries of the lookup table as a sanity check.
# (Parenthesized print for Python 2/3 consistency.)
print (list (itertools.islice (total_complaints_by_type.items (), 5)))
In [ ]:
# Start the clock for Solution 1, Part C (the per-city queries below).
t1c = time.time ()
def capitalize (string_list):
    """
    Return a new list in which every string of `string_list` has
    standardized capitalization (via `str.capitalize`).
    """
    return list (map (str.capitalize, string_list))
def gather (key_list, dictionary):
    """
    Look up each key of `key_list` in `dictionary` and return the
    corresponding values, in the same order as the keys.

    Raises `KeyError` if any key is absent from the dictionary.
    """
    values = []
    for key in key_list:
        values.append (dictionary[key])
    return values
traces1 = []
# Solution 1, Part C: for each top city, count complaints by type and
# normalize by the citywide totals computed in Parts A and B.
for city in TOP_CITIES:
    print ("[+%gs] Processing %s ..." % (time.time () - t1c, city))
    sys.stdout.flush ()
    # Use a bound parameter ('?') instead of str.format: safe even if a
    # city name contains quote characters, and avoids SQLite's nonstandard
    # double-quoted string literals.
    query = '''
SELECT ComplaintType, COUNT(*) as NumComplaints
FROM data
WHERE City = ? COLLATE NOCASE
GROUP BY ComplaintType COLLATE NOCASE
ORDER BY -NumComplaints
'''
    df = pd.read_sql_query (query, disk_engine, params=(city,))
    # Normalize complaint counts
    complaint_types = capitalize (df.ComplaintType)
    totals = gather (complaint_types, total_complaints_by_type)
    percent_complaints = 100.0 * df.NumComplaints / totals
    # Add this city as a new trace
    traces1.append (Bar (x=complaint_types,
                         y=percent_complaints,
                         name=city.capitalize ()))
t1c = time.time () - t1c
print ("[+%gs] Part C" % t1c)
In [ ]:
# Check it!
print "==> Total time for Solution 1: %gs" % (t1a + t1b + t1c)
iplot_percent_complaints_by_type_and_city (traces1)
This second approach computes the total number of complaints by type, but stores it in a view (or virtual table). It then references this virtual table within the city-specific query to normalize the counts.
We mentioned views in the class slides but did not do a specific exercise using them, so it's OK if you did not think of this solution.
In [ ]:
t2a = time.time ()
# Solution 2, Part A: store the citywide totals in a VIEW (virtual table).
# The view is recomputed from `data` whenever it is referenced, so it stays
# up to date automatically.
query = '''
CREATE VIEW IF NOT EXISTS TotalComplaintsView AS
SELECT ComplaintType, COUNT(*) AS NumComplaints
FROM data
GROUP BY ComplaintType COLLATE NOCASE
ORDER BY -NumComplaints
'''
c = disk_engine.cursor ()
c.execute (query)
t2a = time.time () - t2a
# Parenthesized print for Python 2/3 consistency with the rest of the file.
print ("[+%gs] Part A" % t2a)
A nice feature of a view is that it is stored in the database and automatically kept up to date.
So, you can create it once and use it any time you need it, even after updates to the data from which the view derives.
In [ ]:
t2b = time.time ()
traces2 = []
# Solution 2, Part B: one query per city, joining against the view to
# normalize per-type counts into percentages.
for city in TOP_CITIES:
    print ("[+%gs] Processing %s ..." % (time.time () - t2b, city))
    sys.stdout.flush ()
    # Bound parameter ('?') instead of str.format: robust to quote
    # characters in city names.
    query = '''
SELECT D.ComplaintType,
(100.0 * COUNT(*) / T.NumComplaints) AS PercentComplaints
FROM data AS D, TotalComplaintsView AS T
WHERE (City = ? COLLATE NOCASE)
AND (D.ComplaintType = T.ComplaintType COLLATE NOCASE)
GROUP BY D.ComplaintType COLLATE NOCASE
ORDER BY -T.NumComplaints
'''
    df = pd.read_sql_query (query, disk_engine, params=(city,))
    traces2.append (Bar (x=capitalize (df.ComplaintType),
                         y=df.PercentComplaints,
                         name=city.capitalize ()))
t2b = time.time () - t2b
print ("[+%gs] Part B" % t2b)
In [ ]:
# Report Solution 2's total time and draw the chart.
print ("==> Total time for Solution 2: %gs" % (t2a + t2b))
iplot_percent_complaints_by_type_and_city (traces2)
This third solution introduces a new concept, namely, the idea of a subquery.
The basic idea is that, within a SELECT statement, you can reference a table generated "on-the-fly" from another SELECT statement. Notice how this solution basically merges the two queries used in the previous solutions into just a single query.
In [ ]:
t3 = time.time ()
traces3 = []
# Solution 3: a single query per city. The totals table is generated
# on the fly as a subquery (aliased T) and joined against the filtered data,
# merging the two queries of the earlier solutions into one.
for city in TOP_CITIES:
    print ("[+%gs] Processing %s ..." % (time.time () - t3, city))
    sys.stdout.flush ()
    # Bound parameter ('?') instead of str.format: robust to quote
    # characters in city names.
    query = '''
SELECT D.ComplaintType,
(100.0 * COUNT(*) / T.NumComplaints) AS PercentComplaints
FROM data AS D,
(SELECT ComplaintType, COUNT(*) AS NumComplaints
FROM data
GROUP BY ComplaintType COLLATE NOCASE) AS T
WHERE (City = ? COLLATE NOCASE)
AND (D.ComplaintType = T.ComplaintType COLLATE NOCASE)
GROUP BY D.ComplaintType COLLATE NOCASE
ORDER BY -T.NumComplaints
'''
    df = pd.read_sql_query (query, disk_engine, params=(city,))
    traces3.append (Bar (x=capitalize (df.ComplaintType),
                         y=df.PercentComplaints,
                         name=city.capitalize ()))
t3 = time.time () - t3
# Parenthesized print for Python 2/3 consistency with the rest of the file.
print ("[+%gs] Total" % t3)
In [ ]:
# Report Solution 3's total time and draw the chart.
# (Parenthesized print for Python 2/3 consistency.)
print ("==> Total time for Solution 3: %gs" % t3)
iplot_percent_complaints_by_type_and_city (traces3)
This next solution is a variation on Solution 2, except instead of creating a view, we create an actual table with the totals.
By storing the table, we can speed up Solution 2 a lot. The downside is that we now have to be careful to maintain this totals table, in the event there are updates to the underlying dataset from which it derives.
In [ ]:
t4a = time.time ()
# Solution 4, Part A: materialize the totals as a real table. Drop any
# stale copy first so the totals always reflect the current data — unlike
# a view, this table is NOT kept up to date automatically.
query = '''
DROP TABLE IF EXISTS TotalComplaints
'''
c = disk_engine.cursor ()
c.execute (query)
query = '''
CREATE TABLE TotalComplaints AS
SELECT ComplaintType, COUNT(*) AS NumComplaints
FROM data
GROUP BY ComplaintType COLLATE NOCASE
ORDER BY -NumComplaints
'''
c.execute (query)
t4a = time.time () - t4a
# Parenthesized print for Python 2/3 consistency with the rest of the file.
print ("[+%gs] Part A" % t4a)
In [ ]:
t4b = time.time ()
traces4 = []
# Solution 4, Part B: same per-city query as Solution 2, but joining
# against the pre-materialized TotalComplaints table instead of a view.
for city in TOP_CITIES:
    print ("[+%gs] Processing %s ..." % (time.time () - t4b, city))
    sys.stdout.flush ()
    # Bound parameter ('?') instead of str.format: robust to quote
    # characters in city names.
    query = '''
SELECT D.ComplaintType,
(100.0 * COUNT(*) / T.NumComplaints) AS PercentComplaints
FROM data AS D, TotalComplaints AS T
WHERE (City = ? COLLATE NOCASE)
AND (D.ComplaintType = T.ComplaintType COLLATE NOCASE)
GROUP BY D.ComplaintType COLLATE NOCASE
ORDER BY -T.NumComplaints
'''
    df = pd.read_sql_query (query, disk_engine, params=(city,))
    traces4.append (Bar (x=capitalize (df.ComplaintType),
                         y=df.PercentComplaints,
                         name=city.capitalize ()))
t4b = time.time () - t4b
print ("[+%gs] Part B" % t4b)
In [ ]:
# Report Solution 4's total time and draw the chart.
# (Parenthesized print for Python 2/3 consistency.)
print ("==> Total time for Solution 4: %gs" % (t4a + t4b))
iplot_percent_complaints_by_type_and_city (traces4)
In [ ]: