San Diego Development OpenDSD Analysis

Basic analysis of the data scrpaed from the OpenDSD API.

Issue 215


In [1]:
import pandas as pd
import sqlite3
import matplotlib.pylab as plt
import numpy as np
import os
from lib import download_ambry_db
%matplotlib inline

In [3]:
download_ambry_db("http://s3.sandiegodata.org/library/sandiego.gov/opendsd-json-0.0.1/json/projects.db", "projects.db")   
download_ambry_db("http://s3.sandiegodata.org/library/sandiego.gov/opendsd-json-0.0.1/json/approvals.db", "approvals.db")


Already downloaded: http://s3.sandiegodata.org/library/sandiego.gov/opendsd-json-0.0.1/json/projects.db
Already extracted: projects.db
Downloading: http://s3.sandiegodata.org/library/sandiego.gov/opendsd-json-0.0.1/json/approvals.db
Extracting to: approvals.db

In [57]:
# COnnect to the downloaded database and get a cursor so we can run queries. 
con = sqlite3.connect("projects.db")
cur = con.cursor()

In [58]:
# Get an example work to look for the data structure. 
import json
row = cur.execute('SELECT data FROM json LIMIT 1').fetchone()
d = json.loads(str(row[0]).decode('zlib'))
d.keys()


Out[58]:
[u'ApplicationExpiration',
 u'Customers',
 u'ProjectManagerId',
 u'Jobs',
 u'AccountNum',
 u'Title',
 u'Invoices',
 u'ProjectId',
 u'JobOrderNum',
 u'ApplicationExpired',
 u'Header',
 u'DevelopmentId',
 u'AdminHold',
 u'ReviewCycles',
 u'Fees',
 u'Scope',
 u'ProjectManager',
 u'ApplicationDate',
 u'DevelopmentTitle']

In [61]:
d['Jobs']


Out[61]:
[{u'APN': u'438-252-19-00',
  u'ApprovalInfo': [],
  u'Approvals': [{u'ApprovalId': 2529,
    u'CompleteCancelDate': None,
    u'Depiction': u'1',
    u'FirstInspectionDate': None,
    u'IssueDate': None,
    u'IssuedBy': u'',
    u'JobId': 1574,
    u'NetChangeDU': u'',
    u'PermitHolder': u' ',
    u'Scope': u'Greater North Park   Tentative Map Waiver to convert four residential units to condominiums at 4714 Oregon Street in the MR-3000 zone of Mid City Communities Plan District in the Greater North Park Community Plan. Council District 3. Notice Cards=1.',
    u'SquareFootage': None,
    u'Status': u'Created',
    u'Type': u'Map Waiver',
    u'Valuation': u''}],
  u'Description': u'4714   OREGON ST ',
  u'JobFeesSubTotal': None,
  u'JobId': 1574,
  u'Latitude': 32.763291,
  u'Longitude': -117.135504,
  u'MapReference': u'1269-D3',
  u'NAD83Easting': None,
  u'NAD83Northing': None,
  u'ProjectId': 1410,
  u'SignOffs': [{u'DisciplineDescription': u'LDR-Engineering Review',
    u'DisciplineId': 109,
    u'SignedDate': u'Not Signed-Off'},
   {u'DisciplineDescription': u'LDR-Map Check',
    u'DisciplineId': 111,
    u'SignedDate': u'Not Signed-Off'}],
  u'SortableStreetAddress': u'OREGON ST 0000004714 ',
  u'StreetAddress': u'4714 OREGON ST '}]

In [60]:
# Now we can iterate over the data structure and do something interesting. 
for row in cur.execute('SELECT data FROM json LIMIT 10'):
    d = json.loads(str(row[0]).decode('zlib'))
    print d['DevelopmentId'], d['Fees']


1001 []
1040 [{u'Category': u'Plan Check Fees', u'InvoiceId': 107465, u'Description': u'Deposit Account', u'ProjectId': 1040, u'InvoiceStatus': u'Paid on ', u'QuantityRequired': 3000, u'QuantityPaid': 3000, u'FeeId': 1656, u'Unit': u'Dollars'}]
1041 [{u'Category': u'Plan Check Fees', u'InvoiceId': 107464, u'Description': u'Deposit Account', u'ProjectId': 1041, u'InvoiceStatus': u'Paid on ', u'QuantityRequired': 1700, u'QuantityPaid': 1700, u'FeeId': 1656, u'Unit': u'Dollars'}]
1096 [{u'Category': u'Plan Check Fees', u'InvoiceId': 138746, u'Description': u'Deposit Account', u'ProjectId': 1096, u'InvoiceStatus': u'Paid on ', u'QuantityRequired': 5500, u'QuantityPaid': 5500, u'FeeId': 1656, u'Unit': u'Dollars'}]
1102 [{u'Category': u'Plan Check Fees', u'InvoiceId': 78488, u'Description': u'Deposit Account', u'ProjectId': 1102, u'InvoiceStatus': u'Paid on ', u'QuantityRequired': 4490, u'QuantityPaid': 4490, u'FeeId': 1656, u'Unit': u'Dollars'}]
1163 [{u'Category': u'Plan Check Fees', u'InvoiceId': 79269, u'Description': u'Deposit Acct (Pre 8-2-03)', u'ProjectId': 1164, u'InvoiceStatus': u'Paid on ', u'QuantityRequired': 2500, u'QuantityPaid': 2500, u'FeeId': 1317, u'Unit': u'Each'}]
1110 []
1282 [{u'Category': u'Plan Check Fees', u'InvoiceId': 22932, u'Description': u'Deposit Account', u'ProjectId': 1287, u'InvoiceStatus': u'Paid on ', u'QuantityRequired': 10661, u'QuantityPaid': 10661, u'FeeId': 1656, u'Unit': u'Dollars'}]
1389 [{u'Category': u'Plan Check Fees', u'InvoiceId': 216989, u'Description': u'Deposit Account', u'ProjectId': 1395, u'InvoiceStatus': u'Paid on ', u'QuantityRequired': 15467, u'QuantityPaid': 15467, u'FeeId': 1656, u'Unit': u'Dollars'}]
1404 []

In [59]:
# create a generator to iterate over a particular type of object
def gen_fees(cur):

    for row in cur.execute('SELECT data FROM json'):
        d = json.loads(str(row[0]).decode('zlib'))
        if 'Fees' in d and d['Fees']:
            for fee in d['Fees']:
                yield fee
   
    cur.close()
    con.close()

In [44]:
# Example of looping over the generator
for i, row in enumerate(gen_fees(cur)):
    if i > 10: break
        
    print row


{u'Category': u'Plan Check Fees', u'InvoiceId': 107465, u'Description': u'Deposit Account', u'ProjectId': 1040, u'InvoiceStatus': u'Paid on ', u'QuantityRequired': 3000, u'QuantityPaid': 3000, u'FeeId': 1656, u'Unit': u'Dollars'}
{u'Category': u'Plan Check Fees', u'InvoiceId': 107464, u'Description': u'Deposit Account', u'ProjectId': 1041, u'InvoiceStatus': u'Paid on ', u'QuantityRequired': 1700, u'QuantityPaid': 1700, u'FeeId': 1656, u'Unit': u'Dollars'}
{u'Category': u'Plan Check Fees', u'InvoiceId': 138746, u'Description': u'Deposit Account', u'ProjectId': 1096, u'InvoiceStatus': u'Paid on ', u'QuantityRequired': 5500, u'QuantityPaid': 5500, u'FeeId': 1656, u'Unit': u'Dollars'}
{u'Category': u'Plan Check Fees', u'InvoiceId': 78488, u'Description': u'Deposit Account', u'ProjectId': 1102, u'InvoiceStatus': u'Paid on ', u'QuantityRequired': 4490, u'QuantityPaid': 4490, u'FeeId': 1656, u'Unit': u'Dollars'}
{u'Category': u'Plan Check Fees', u'InvoiceId': 79269, u'Description': u'Deposit Acct (Pre 8-2-03)', u'ProjectId': 1164, u'InvoiceStatus': u'Paid on ', u'QuantityRequired': 2500, u'QuantityPaid': 2500, u'FeeId': 1317, u'Unit': u'Each'}
{u'Category': u'Plan Check Fees', u'InvoiceId': 22932, u'Description': u'Deposit Account', u'ProjectId': 1287, u'InvoiceStatus': u'Paid on ', u'QuantityRequired': 10661, u'QuantityPaid': 10661, u'FeeId': 1656, u'Unit': u'Dollars'}
{u'Category': u'Plan Check Fees', u'InvoiceId': 216989, u'Description': u'Deposit Account', u'ProjectId': 1395, u'InvoiceStatus': u'Paid on ', u'QuantityRequired': 15467, u'QuantityPaid': 15467, u'FeeId': 1656, u'Unit': u'Dollars'}
{u'Category': u'Plan Check Fees', u'InvoiceId': 113664, u'Description': u'Deposit Account', u'ProjectId': 1606, u'InvoiceStatus': u'Requires Invoice', u'QuantityRequired': 10059, u'QuantityPaid': 3499, u'FeeId': 1656, u'Unit': u'Dollars'}
{u'Category': u'Plan Check Fees', u'InvoiceId': 235014, u'Description': u'Deposit Account Pre 8-2-2003', u'ProjectId': 1704, u'InvoiceStatus': u'Paid on ', u'QuantityRequired': 11000, u'QuantityPaid': 11000, u'FeeId': 2955, u'Unit': u'Dollars'}
{u'Category': u'Plan Check Fees', u'InvoiceId': 222608, u'Description': u'Deposit Account Pre 8-2-2003', u'ProjectId': 1706, u'InvoiceStatus': u'Paid on ', u'QuantityRequired': 7000, u'QuantityPaid': 7000, u'FeeId': 2955, u'Unit': u'Dollars'}
{u'Category': u'Plan Check Fees', u'InvoiceId': 255428, u'Description': u'Deposit Account Pre 8-2-2003', u'ProjectId': 1707, u'InvoiceStatus': u'Paid on ', u'QuantityRequired': 22500, u'QuantityPaid': 22500, u'FeeId': 2955, u'Unit': u'Dollars'}

In [47]:
# Then, we can create a dataframe from a list comprehension
# FOr Fees, this is 287K records, so this will be slow. 
df = pd.DataFrame(gen_fees(cur))

In [49]:
df.head()


Out[49]:
Category Description FeeId InvoiceId InvoiceStatus ProjectId QuantityPaid QuantityRequired Unit
0 Plan Check Fees Deposit Account 1656 107465 Paid on 1040 3000 3000 Dollars
1 Plan Check Fees Deposit Account 1656 107464 Paid on 1041 1700 1700 Dollars
2 Plan Check Fees Deposit Account 1656 138746 Paid on 1096 5500 5500 Dollars
3 Plan Check Fees Deposit Account 1656 78488 Paid on 1102 4490 4490 Dollars
4 Plan Check Fees Deposit Acct (Pre 8-2-03) 1317 79269 Paid on 1164 2500 2500 Each

In [53]:
df.groupby('FeeId').sum().head()


Out[53]:
InvoiceId ProjectId QuantityPaid QuantityRequired
FeeId
698 519810 1143739 109 308
1280 3759 6970 1 1
1281 6520 16355 2 2
1317 6341947 1253552 1214860 1464054
1400 10748236 5025918 444 443

In [55]:


In [ ]: