San Diego Development OpenDSD Analysis

Basic analysis of the data scrpaed from the OpenDSD API.



In [1]:

    
import pandas as pd
import sqlite3
import matplotlib.pylab as plt
import numpy as np
import os
from lib import download_ambry_db
%matplotlib inline



In [3]:

    
download_ambry_db("http://s3.sandiegodata.org/library/sandiego.gov/opendsd-json-0.0.1/json/projects.db", "projects.db")   
download_ambry_db("http://s3.sandiegodata.org/library/sandiego.gov/opendsd-json-0.0.1/json/approvals.db", "approvals.db")









    



Already downloaded: http://s3.sandiegodata.org/library/sandiego.gov/opendsd-json-0.0.1/json/projects.db
Already extracted: projects.db
Downloading: http://s3.sandiegodata.org/library/sandiego.gov/opendsd-json-0.0.1/json/approvals.db
Extracting to: approvals.db



In [57]:

    
# COnnect to the downloaded database and get a cursor so we can run queries. 
con = sqlite3.connect("projects.db")
cur = con.cursor()



In [58]:

    
# Get an example work to look for the data structure. 
import json
row = cur.execute('SELECT data FROM json LIMIT 1').fetchone()
d = json.loads(str(row[0]).decode('zlib'))
d.keys()









    Out[58]:





[u'ApplicationExpiration',
 u'Customers',
 u'ProjectManagerId',
 u'Jobs',
 u'AccountNum',
 u'Title',
 u'Invoices',
 u'ProjectId',
 u'JobOrderNum',
 u'ApplicationExpired',
 u'Header',
 u'DevelopmentId',
 u'AdminHold',
 u'ReviewCycles',
 u'Fees',
 u'Scope',
 u'ProjectManager',
 u'ApplicationDate',
 u'DevelopmentTitle']



In [61]:

    
d['Jobs']









    Out[61]:





[{u'APN': u'438-252-19-00',
  u'ApprovalInfo': [],
  u'Approvals': [{u'ApprovalId': 2529,
    u'CompleteCancelDate': None,
    u'Depiction': u'1',
    u'FirstInspectionDate': None,
    u'IssueDate': None,
    u'IssuedBy': u'',
    u'JobId': 1574,
    u'NetChangeDU': u'',
    u'PermitHolder': u' ',
    u'Scope': u'Greater North Park   Tentative Map Waiver to convert four residential units to condominiums at 4714 Oregon Street in the MR-3000 zone of Mid City Communities Plan District in the Greater North Park Community Plan. Council District 3. Notice Cards=1.',
    u'SquareFootage': None,
    u'Status': u'Created',
    u'Type': u'Map Waiver',
    u'Valuation': u''}],
  u'Description': u'4714   OREGON ST ',
  u'JobFeesSubTotal': None,
  u'JobId': 1574,
  u'Latitude': 32.763291,
  u'Longitude': -117.135504,
  u'MapReference': u'1269-D3',
  u'NAD83Easting': None,
  u'NAD83Northing': None,
  u'ProjectId': 1410,
  u'SignOffs': [{u'DisciplineDescription': u'LDR-Engineering Review',
    u'DisciplineId': 109,
    u'SignedDate': u'Not Signed-Off'},
   {u'DisciplineDescription': u'LDR-Map Check',
    u'DisciplineId': 111,
    u'SignedDate': u'Not Signed-Off'}],
  u'SortableStreetAddress': u'OREGON ST 0000004714 ',
  u'StreetAddress': u'4714 OREGON ST '}]



In [60]:

    
# Now we can iterate over the data structure and do something interesting. 
for row in cur.execute('SELECT data FROM json LIMIT 10'):
    d = json.loads(str(row[0]).decode('zlib'))
    print d['DevelopmentId'], d['Fees']









    



1001 []
1040 [{u'Category': u'Plan Check Fees', u'InvoiceId': 107465, u'Description': u'Deposit Account', u'ProjectId': 1040, u'InvoiceStatus': u'Paid on ', u'QuantityRequired': 3000, u'QuantityPaid': 3000, u'FeeId': 1656, u'Unit': u'Dollars'}]
1041 [{u'Category': u'Plan Check Fees', u'InvoiceId': 107464, u'Description': u'Deposit Account', u'ProjectId': 1041, u'InvoiceStatus': u'Paid on ', u'QuantityRequired': 1700, u'QuantityPaid': 1700, u'FeeId': 1656, u'Unit': u'Dollars'}]
1096 [{u'Category': u'Plan Check Fees', u'InvoiceId': 138746, u'Description': u'Deposit Account', u'ProjectId': 1096, u'InvoiceStatus': u'Paid on ', u'QuantityRequired': 5500, u'QuantityPaid': 5500, u'FeeId': 1656, u'Unit': u'Dollars'}]
1102 [{u'Category': u'Plan Check Fees', u'InvoiceId': 78488, u'Description': u'Deposit Account', u'ProjectId': 1102, u'InvoiceStatus': u'Paid on ', u'QuantityRequired': 4490, u'QuantityPaid': 4490, u'FeeId': 1656, u'Unit': u'Dollars'}]
1163 [{u'Category': u'Plan Check Fees', u'InvoiceId': 79269, u'Description': u'Deposit Acct (Pre 8-2-03)', u'ProjectId': 1164, u'InvoiceStatus': u'Paid on ', u'QuantityRequired': 2500, u'QuantityPaid': 2500, u'FeeId': 1317, u'Unit': u'Each'}]
1110 []
1282 [{u'Category': u'Plan Check Fees', u'InvoiceId': 22932, u'Description': u'Deposit Account', u'ProjectId': 1287, u'InvoiceStatus': u'Paid on ', u'QuantityRequired': 10661, u'QuantityPaid': 10661, u'FeeId': 1656, u'Unit': u'Dollars'}]
1389 [{u'Category': u'Plan Check Fees', u'InvoiceId': 216989, u'Description': u'Deposit Account', u'ProjectId': 1395, u'InvoiceStatus': u'Paid on ', u'QuantityRequired': 15467, u'QuantityPaid': 15467, u'FeeId': 1656, u'Unit': u'Dollars'}]
1404 []



In [59]:

    
# create a generator to iterate over a particular type of object
def gen_fees(cur):

    for row in cur.execute('SELECT data FROM json'):
        d = json.loads(str(row[0]).decode('zlib'))
        if 'Fees' in d and d['Fees']:
            for fee in d['Fees']:
                yield fee
   
    cur.close()
    con.close()



In [44]:

    
# Example of looping over the generator
for i, row in enumerate(gen_fees(cur)):
    if i > 10: break
        
    print row









    



{u'Category': u'Plan Check Fees', u'InvoiceId': 107465, u'Description': u'Deposit Account', u'ProjectId': 1040, u'InvoiceStatus': u'Paid on ', u'QuantityRequired': 3000, u'QuantityPaid': 3000, u'FeeId': 1656, u'Unit': u'Dollars'}
{u'Category': u'Plan Check Fees', u'InvoiceId': 107464, u'Description': u'Deposit Account', u'ProjectId': 1041, u'InvoiceStatus': u'Paid on ', u'QuantityRequired': 1700, u'QuantityPaid': 1700, u'FeeId': 1656, u'Unit': u'Dollars'}
{u'Category': u'Plan Check Fees', u'InvoiceId': 138746, u'Description': u'Deposit Account', u'ProjectId': 1096, u'InvoiceStatus': u'Paid on ', u'QuantityRequired': 5500, u'QuantityPaid': 5500, u'FeeId': 1656, u'Unit': u'Dollars'}
{u'Category': u'Plan Check Fees', u'InvoiceId': 78488, u'Description': u'Deposit Account', u'ProjectId': 1102, u'InvoiceStatus': u'Paid on ', u'QuantityRequired': 4490, u'QuantityPaid': 4490, u'FeeId': 1656, u'Unit': u'Dollars'}
{u'Category': u'Plan Check Fees', u'InvoiceId': 79269, u'Description': u'Deposit Acct (Pre 8-2-03)', u'ProjectId': 1164, u'InvoiceStatus': u'Paid on ', u'QuantityRequired': 2500, u'QuantityPaid': 2500, u'FeeId': 1317, u'Unit': u'Each'}
{u'Category': u'Plan Check Fees', u'InvoiceId': 22932, u'Description': u'Deposit Account', u'ProjectId': 1287, u'InvoiceStatus': u'Paid on ', u'QuantityRequired': 10661, u'QuantityPaid': 10661, u'FeeId': 1656, u'Unit': u'Dollars'}
{u'Category': u'Plan Check Fees', u'InvoiceId': 216989, u'Description': u'Deposit Account', u'ProjectId': 1395, u'InvoiceStatus': u'Paid on ', u'QuantityRequired': 15467, u'QuantityPaid': 15467, u'FeeId': 1656, u'Unit': u'Dollars'}
{u'Category': u'Plan Check Fees', u'InvoiceId': 113664, u'Description': u'Deposit Account', u'ProjectId': 1606, u'InvoiceStatus': u'Requires Invoice', u'QuantityRequired': 10059, u'QuantityPaid': 3499, u'FeeId': 1656, u'Unit': u'Dollars'}
{u'Category': u'Plan Check Fees', u'InvoiceId': 235014, u'Description': u'Deposit Account Pre 8-2-2003', u'ProjectId': 1704, u'InvoiceStatus': u'Paid on ', u'QuantityRequired': 11000, u'QuantityPaid': 11000, u'FeeId': 2955, u'Unit': u'Dollars'}
{u'Category': u'Plan Check Fees', u'InvoiceId': 222608, u'Description': u'Deposit Account Pre 8-2-2003', u'ProjectId': 1706, u'InvoiceStatus': u'Paid on ', u'QuantityRequired': 7000, u'QuantityPaid': 7000, u'FeeId': 2955, u'Unit': u'Dollars'}
{u'Category': u'Plan Check Fees', u'InvoiceId': 255428, u'Description': u'Deposit Account Pre 8-2-2003', u'ProjectId': 1707, u'InvoiceStatus': u'Paid on ', u'QuantityRequired': 22500, u'QuantityPaid': 22500, u'FeeId': 2955, u'Unit': u'Dollars'}



In [47]:

    
# Then, we can create a dataframe from a list comprehension
# FOr Fees, this is 287K records, so this will be slow. 
df = pd.DataFrame(gen_fees(cur))



In [49]:

    
df.head()









    Out[49]:






  
    
      
      Category
      Description
      FeeId
      InvoiceId
      InvoiceStatus
      ProjectId
      QuantityPaid
      QuantityRequired
      Unit
    
  
  
    
      0
       Plan Check Fees
                 Deposit Account
       1656
       107465
       Paid on 
       1040
       3000
       3000
       Dollars
    
    
      1
       Plan Check Fees
                 Deposit Account
       1656
       107464
       Paid on 
       1041
       1700
       1700
       Dollars
    
    
      2
       Plan Check Fees
                 Deposit Account
       1656
       138746
       Paid on 
       1096
       5500
       5500
       Dollars
    
    
      3
       Plan Check Fees
                 Deposit Account
       1656
        78488
       Paid on 
       1102
       4490
       4490
       Dollars
    
    
      4
       Plan Check Fees
       Deposit Acct (Pre 8-2-03)
       1317
        79269
       Paid on 
       1164
       2500
       2500
          Each



In [53]:

    
df.groupby('FeeId').sum().head()









    Out[53]:






  
    
      
      InvoiceId
      ProjectId
      QuantityPaid
      QuantityRequired
    
    
      FeeId
      
      
      
      
    
  
  
    
      698 
         519810
       1143739
           109
           308
    
    
      1280
           3759
          6970
             1
             1
    
    
      1281
           6520
         16355
             2
             2
    
    
      1317
        6341947
       1253552
       1214860
       1464054
    
    
      1400
       10748236
       5025918
           444
           443



In [55]:



In [ ]:

	Category	Description	FeeId	InvoiceId	InvoiceStatus	ProjectId	QuantityPaid	QuantityRequired	Unit
0	Plan Check Fees	Deposit Account	1656	107465	Paid on	1040	3000	3000	Dollars
1	Plan Check Fees	Deposit Account	1656	107464	Paid on	1041	1700	1700	Dollars
2	Plan Check Fees	Deposit Account	1656	138746	Paid on	1096	5500	5500	Dollars
3	Plan Check Fees	Deposit Account	1656	78488	Paid on	1102	4490	4490	Dollars
4	Plan Check Fees	Deposit Acct (Pre 8-2-03)	1317	79269	Paid on	1164	2500	2500	Each

	InvoiceId	ProjectId	QuantityPaid	QuantityRequired
FeeId
698	519810	1143739	109	308
1280	3759	6970	1	1
1281	6520	16355	2	2
1317	6341947	1253552	1214860	1464054
1400	10748236	5025918	444	443