Basic analysis of the data scrpaed from the OpenDSD API.
In [1]:
import pandas as pd
import sqlite3
import matplotlib.pylab as plt
import numpy as np
import os
from lib import download_ambry_db
%matplotlib inline
In [3]:
download_ambry_db("http://s3.sandiegodata.org/library/sandiego.gov/opendsd-json-0.0.1/json/projects.db", "projects.db")
download_ambry_db("http://s3.sandiegodata.org/library/sandiego.gov/opendsd-json-0.0.1/json/approvals.db", "approvals.db")
In [57]:
# COnnect to the downloaded database and get a cursor so we can run queries.
con = sqlite3.connect("projects.db")
cur = con.cursor()
In [58]:
# Get an example work to look for the data structure.
import json
row = cur.execute('SELECT data FROM json LIMIT 1').fetchone()
d = json.loads(str(row[0]).decode('zlib'))
d.keys()
Out[58]:
In [61]:
d['Jobs']
Out[61]:
In [60]:
# Now we can iterate over the data structure and do something interesting.
for row in cur.execute('SELECT data FROM json LIMIT 10'):
d = json.loads(str(row[0]).decode('zlib'))
print d['DevelopmentId'], d['Fees']
In [59]:
# create a generator to iterate over a particular type of object
def gen_fees(cur):
for row in cur.execute('SELECT data FROM json'):
d = json.loads(str(row[0]).decode('zlib'))
if 'Fees' in d and d['Fees']:
for fee in d['Fees']:
yield fee
cur.close()
con.close()
In [44]:
# Example of looping over the generator
for i, row in enumerate(gen_fees(cur)):
if i > 10: break
print row
In [47]:
# Then, we can create a dataframe from a list comprehension
# FOr Fees, this is 287K records, so this will be slow.
df = pd.DataFrame(gen_fees(cur))
In [49]:
df.head()
Out[49]:
In [53]:
df.groupby('FeeId').sum().head()
Out[53]:
In [55]:
In [ ]: