In [63]:
! conda install geopandas -qy
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
from shapely.geometry import Point, Polygon
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
import zipfile
import requests
import shutil
%matplotlib inline
from downloading_funcs import addr_shape, down_extract_zip
from supp_funcs import zoneConcentration, pointInZone
import lnks
In [64]:
#Load the Basic Business License (BBL) extracts, one CSV per year (2012-2017)
BBL12_17CSV = [
    'https://opendata.arcgis.com/datasets/82ab09c9541b4eb8ba4b537e131998ce_22.csv',
    'https://opendata.arcgis.com/datasets/4c4d6b4defdf4561b737a594b6f2b0dd_23.csv',
    'https://opendata.arcgis.com/datasets/d7aa6d3a3fdc42c4b354b9e90da443b7_1.csv',
    'https://opendata.arcgis.com/datasets/a8434614d90e416b80fbdfe2cb2901d8_2.csv',
    'https://opendata.arcgis.com/datasets/714d5f8b06914b8596b34b181439e702_36.csv',
    'https://opendata.arcgis.com/datasets/c4368a66ce65455595a211d530facc54_3.csv',
]
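#Optional sanity check (a sketch, not in the original notebook): the yearly
#extracts are assumed to share one schema, so peek at the first file's columns
#before stacking them
print(pd.read_csv(BBL12_17CSV[0], nrows=5).columns.tolist())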
In [65]:
def data_pipeline(shapetype, bbl_links, supplement=None,
dex=None, ts_lst_range=None):
    #A pipeline for group_e dataframe operations: stack the BBL extracts, flag
    #active and newly issued licenses per month, and aggregate to shape level
#Test inputs --------------------------------------------------------------
if supplement:
assert isinstance(supplement, list)
assert isinstance(bbl_links, list)
if ts_lst_range:
assert isinstance(ts_lst_range, list)
        assert len(ts_lst_range) == 2 #Must be [start, end] in year.month form, e.g. [2012.01, 2016.12]
#We'll need our addresspoints and our shapefile
if not dex:
dex = addr_shape(shapetype)
    #We need a list of time_unit_of_analysis
    if ts_lst_range:
        ts_lst = [x + (i/100) for i in range(1, 13) for x in range(1980, 2025)]
        ts_lst = [x for x in ts_lst if
                  ts_lst_range[0] <= x <= ts_lst_range[1]]
    else:
        ts_lst = [x + (i/100) for i in range(1, 13) for x in range(2012, 2017)]
    ts_lst = sorted(ts_lst)
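    #Each time unit encodes year.month as a float, e.g. 2012 + 3/100 -> 2012.03
    #(March 2012); the fraction stays below 1, so the floats sort chronologically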
    #Now we need to stack our BBL data ----------------------------------------
    #Read each yearly extract, checking that the column count stays consistent
    frames = []
    col_len = None
    for i in bbl_links:
        bbl = pd.read_csv(i, encoding='utf-8', low_memory=False)
        if col_len is not None and len(bbl.columns) != col_len:
            print('Column Mismatch!')
        col_len = len(bbl.columns)
        frames.append(bbl)
    bbl_df = pd.concat(frames, ignore_index=True)
    del frames
bbl_df.LICENSE_START_DATE = pd.to_datetime(
bbl_df.LICENSE_START_DATE)
bbl_df.LICENSE_EXPIRATION_DATE = pd.to_datetime(
bbl_df.LICENSE_EXPIRATION_DATE)
bbl_df.LICENSE_ISSUE_DATE = pd.to_datetime(
bbl_df.LICENSE_ISSUE_DATE)
    bbl_df = bbl_df.sort_values('LICENSE_START_DATE')
    #Set up our time unit of analysis (year + month/100, same as ts_lst)
bbl_df['month'] = bbl_df['LICENSE_START_DATE'].dt.year + (
bbl_df['LICENSE_START_DATE'].dt.month/100
)
bbl_df['endMonth'] = bbl_df['LICENSE_EXPIRATION_DATE'].dt.year + (
bbl_df['LICENSE_EXPIRATION_DATE'].dt.month/100
)
bbl_df['issueMonth'] = bbl_df['LICENSE_ISSUE_DATE'].dt.year + (
bbl_df['LICENSE_ISSUE_DATE'].dt.month/100
)
    bbl_df['endMonth'] = bbl_df['endMonth'].fillna(max(ts_lst))
    bbl_df.loc[bbl_df['endMonth'] > max(ts_lst), 'endMonth'] = max(ts_lst)
#Sort on month
bbl_df = bbl_df.dropna(subset=['month'])
bbl_df = bbl_df.set_index(['MARADDRESSREPOSITORYID','month'])
bbl_df = bbl_df.sort_index(ascending=True)
bbl_df.reset_index(inplace=True)
bbl_df = bbl_df[bbl_df['MARADDRESSREPOSITORYID'] >= 0]
bbl_df = bbl_df.dropna(subset=['LICENSESTATUS', 'issueMonth', 'endMonth',
'MARADDRESSREPOSITORYID','month',
'LONGITUDE', 'LATITUDE'
])
#Now that we have the BBL data, let's create our flag and points data -----
#This is the addresspoints, passed from the dex param
addr_df = dex[0]
#Zip the latlongs
addr_df['geometry'] = [
Point(xy) for xy in zip(
addr_df.LONGITUDE.apply(float), addr_df.LATITUDE.apply(float)
)
]
addr_df['Points'] = addr_df['geometry'] #Duplicate, so raw retains points
addr_df['dummy_counter'] = 1 #Always one, always dropped before export
crs='EPSG:4326' #Convenience assignment of crs
    #Now we're stacking for each month ----------------------------------------
    month_frames = [] #Collect each month's frame; concatenated after the loop
    for i in ts_lst[:2]: #iterate the months (only the first two here, to keep the test run small)
#dex[1] is the designated shapefile passed from the dex param,
#and should match the shapetype defined in that param
        #Work on a copy of the dex[1] shapefile so the original is untouched
        shp_gdf = dex[1].copy()
        #Active BBL in month i
        bbl_df['inRange'] = 0
        bbl_df.loc[(bbl_df.endMonth > i) & (bbl_df.month <= i), 'inRange'] = 1
        #Issued BBL in month i
        bbl_df['isuFlag'] = 0
        bbl_df.loc[bbl_df.issueMonth == i, 'isuFlag'] = 1
#Merge BBL and MAR datasets -------------------------------------------
addr = pd.merge(addr_df, bbl_df, how='left',
left_on='ADDRESS_ID', right_on='MARADDRESSREPOSITORYID')
addr = gpd.GeoDataFrame(addr, crs=crs, geometry=addr.geometry)
addr.crs = shp_gdf.crs
raw = gpd.sjoin(shp_gdf, addr, how='left', op='intersects')
        #Percent of buildings per shape with an active license in month i,
        #which we call a 'utilization index'
        sums = raw.groupby('NAME').sum() #Sums the 0/1 flags within each shape
        numer = sums.inRange
        denom = sums.dummy_counter
        issue = sums.isuFlag
flags = []
utl_inx = pd.DataFrame(numer/denom)
utl_inx.columns = [
'Util_Indx_BBL'
]
flags.append(utl_inx)
#This is number of buildings with an active BBL in month i
bbl_count = pd.DataFrame(numer)
bbl_count.columns = [
'countBBL'
]
flags.append(bbl_count)
#This is number of buildings that were issued a BBL in month i
isu_count = pd.DataFrame(issue)
isu_count.columns = [
'countIssued'
]
flags.append(isu_count)
        for flag in flags:
            #flags are plain DataFrames indexed on NAME; merge onto the shapes
            shp_gdf = shp_gdf.merge(flag,
                                    how="left", left_on='NAME', right_index=True)
shp_gdf['month'] = i
#Head will be the list of retained columns
head = ['NAME', 'Util_Indx_BBL',
'countBBL', 'countIssued',
'month', 'geometry']
shp_gdf = shp_gdf[head]
        if supplement:
            #Supplement functions plug in here: each entry is a list of
            #[func, extra args...], applied as func(shp_gdf, raw, *args)
            for supp_func in supplement:
                shp_gdf = supp_func[0](shp_gdf, raw, *supp_func[1:])
        month_frames.append(shp_gdf) #This does the stacking
print('Merged month:', i)
del shp_gdf, addr, utl_inx #Save me some memory please!
    out_gdf = pd.concat(month_frames, ignore_index=True)
    #Can't have strings in our matrix
    out_gdf = pd.get_dummies(out_gdf, columns=['NAME'])
out_gdf = out_gdf.drop('geometry', axis=1)
out_gdf.to_csv('./data/' + shapetype + '_out.csv') #Save
return [bbl_df, addr_df, out_gdf, raw] #Remove this later, for testing now
In [66]:
dex = addr_shape('anc')
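#A sketch of the simplest call (hypothetical; 'demo_run' is not in the original
#notebook): reuse dex so the shapefile isn't re-fetched. The function returns
#[bbl_df, addr_df, out_gdf, raw], with the last element kept for testing.
demo_run = data_pipeline('anc', BBL12_17CSV, dex=dex)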
In [67]:
def metro_prox(shp_gdf, raw, bufr=None):
#Flag properties within distance "bufr" of metro stations
if not bufr:
        bufr = 1/250 #About 0.004 degrees, roughly 400m at DC's latitude; hard to say what a good buffer is.
assert isinstance(bufr, float) #buffer must be float!
#Frame up the metro buffer shapes
metro = down_extract_zip(
'https://opendata.arcgis.com/datasets/54018b7f06b943f2af278bbe415df1de_52.zip'
)
metro = gpd.read_file(metro, crs=shp_gdf.crs)
metro.geometry = metro.geometry.buffer(bufr)
    metro['bymet'] = 1 #Flags address points that fall inside a station buffer
    #Drop the station NAME so it doesn't collide with the shapes' NAME column
    metro.drop(['NAME'], axis=1, inplace=True)
#Frame up the raw address points data
pointy = raw[['NAME', 'Points', 'dummy_counter']]
pointy = gpd.GeoDataFrame(pointy, crs=metro.crs,
geometry=pointy.Points)
pointy = gpd.sjoin(pointy, metro,
how='left', op='intersects')
    sums = pointy.groupby('NAME').sum() #Sums dummy_counter and bymet per shape
    denom = sums.dummy_counter
    numer = sums.bymet
pct_metro_coverage = pd.DataFrame(numer/denom)
pct_metro_coverage.columns = [
'pct_metro_coverage'
]
    pct_metro_coverage.fillna(0, inplace=True)
shp_gdf = shp_gdf.merge(pct_metro_coverage,
how="left", left_on='NAME', right_index=True)
return shp_gdf
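#To plug metro_prox into the pipeline, pass it through the supplement
#parameter: each entry is a list of [function, extra args...], applied as
#func(shp_gdf, raw, *args). A sketch with an assumed buffer of 1/250 degrees:
sets = data_pipeline('anc', BBL12_17CSV, dex=dex,
                     supplement=[[metro_prox, 1/250]])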
In [68]:
#sets holds [bbl_df, addr_df, out_gdf, raw]; sets[1] is the address df
In [69]:
#sets[2] is the stacked output: its number of rows equals our number of shapes * number of months
In [70]:
#Construction permit extracts, one CSV per year (2012-2017)
cz1217 = [
    'https://opendata.arcgis.com/datasets/9cbe8553d4e2456ab6c140d83c7e83e0_15.csv',
    'https://opendata.arcgis.com/datasets/3d49e06d51984fa2b68f21eed21eba1f_14.csv',
    'https://opendata.arcgis.com/datasets/54b57e15f6944af8b413a5e4f88b070c_13.csv',
    'https://opendata.arcgis.com/datasets/b3283607f9b74457aff420081eec3190_29.csv',
    'https://opendata.arcgis.com/datasets/2dc1a7dbb705471eb38af39acfa16238_28.csv',
    'https://opendata.arcgis.com/datasets/585c8c3ef58c4f1ab1ddf1c759b3a8bd_39.csv',
]
In [71]:
#Stack the yearly permit extracts into one frame
constr12_17 = pd.concat([pd.read_csv(url) for url in cz1217],
                        axis=0, ignore_index=True)
In [72]:
constr12_17.head().T
Out[72]:
In [73]:
constr12_17.columns
Out[73]:
In [74]:
constr12_17.shape
Out[74]:
In [75]:
constr12_17['effective_date'] = pd.to_datetime(constr12_17['EFFECTIVEDATE'])
constr12_17['expire_date'] = pd.to_datetime(constr12_17['EXPIRATIONDATE'])
permit_length = (constr12_17.expire_date - constr12_17.effective_date)
print(permit_length)
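#permit_length is a Timedelta series; summarizing it in days gives a quick
#read on typical permit duration (a sketch, not in the original notebook)
print(permit_length.dt.days.describe())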
In [76]:
keep_status = ['Permit Expired', 'Approved (Pending Payment)',
               'Issued', 'Assigned']
#.copy() so the column assignments below don't hit a SettingWithCopyWarning
permits = constr12_17[constr12_17['STATUS'].isin(keep_status)].copy()
print(permits.STATUS.unique())
In [77]:
permits = permits.sort_values('effective_date')
permits['month'] = permits['effective_date'].dt.year + (permits['effective_date'].dt.month/100)
permits['endmonth'] = permits['expire_date'].dt.year + (permits['expire_date'].dt.month/100)
permits = permits.set_index(['month'])
permits = permits.sort_index(ascending=True)
permits.reset_index(inplace=True)
permits = permits.dropna(subset=['STATUS', 'endmonth', 'month', 'LONGITUDE', 'LATITUDE'])
In [78]:
! wget https://opendata.arcgis.com/datasets/6969dd63c5cb4d6aa32f15effb8311f3_8.geojson -O census2012
In [79]:
census = gpd.read_file('census2012')
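#The spatial join below assumes the tract polygons share the permit points'
#CRS (EPSG:4326); a quick check of what the GeoJSON declares (a sketch):
print(census.crs)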
In [80]:
census.columns
Out[80]:
In [81]:
geometry = [Point(xy) for xy in zip(permits.LONGITUDE.apply(float), permits.LATITUDE.apply(float))]
crs = 'EPSG:4326' #String form; the {'init': ...} dict style is deprecated
points = gpd.GeoDataFrame(permits, crs=crs, geometry=geometry)
fig, ax = plt.subplots()
census.plot(ax=ax, color='red')
points.plot(ax=ax, color='black', marker='.', markersize=5)
ax.set_aspect('equal')
In [82]:
geo_constr = gpd.sjoin(census, points, how='left', op='intersects')
geo_constr.head()
Out[82]:
In [83]:
geo_constr.geometry.head()
Out[83]:
In [110]:
#Permits per tract, scaled per 100,000 housing units (H0010002)
Construct2HouseRatio = pd.DataFrame(geo_constr.TRACT.value_counts()*100000/geo_constr.H0010002.sum())
Construct2HouseRatio.columns = ['Construct2HouseRatio']
print(Construct2HouseRatio)
In [128]:
tracts_housepermits = census.merge(Construct2HouseRatio, how="left", left_on='TRACT', right_index=True)
tracts_housepermits = tracts_housepermits.dropna(subset=['Construct2HouseRatio'])
tracts_housepermits.head()
Out[128]:
In [88]:
Construct = pd.DataFrame(geo_constr.TRACT.value_counts())
Construct.columns = ['Construct']
print(Construct)
In [126]:
tracts_construction = census.merge(Construct, how="left", left_on='TRACT', right_index=True)
tracts_construction.head()
Out[126]:
In [129]:
tracts_construction.corr()['Construct'].sort_values()
Out[129]:
In [ ]:
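#A quick visual check of where permits cluster (a sketch, not in the original
#notebook; tracts with no matched permits show as missing):
ax = tracts_construction.plot(column='Construct', legend=True)
ax.set_title('Construction permits by census tract, 2012-2017')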