In [63]:
! conda install geopandas -qy
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
from shapely.geometry import Point, Polygon
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
import zipfile
import requests
import shutil
%matplotlib inline
from downloading_funcs import addr_shape, down_extract_zip
from supp_funcs import zoneConcentration, pointInZone
import lnks
In [64]:
#Load the Basic Business License (BBL) extracts, one CSV per year (2012-2017)
BBL12_17CSV = [
    'https://opendata.arcgis.com/datasets/82ab09c9541b4eb8ba4b537e131998ce_22.csv',
    'https://opendata.arcgis.com/datasets/4c4d6b4defdf4561b737a594b6f2b0dd_23.csv',
    'https://opendata.arcgis.com/datasets/d7aa6d3a3fdc42c4b354b9e90da443b7_1.csv',
    'https://opendata.arcgis.com/datasets/a8434614d90e416b80fbdfe2cb2901d8_2.csv',
    'https://opendata.arcgis.com/datasets/714d5f8b06914b8596b34b181439e702_36.csv',
    'https://opendata.arcgis.com/datasets/c4368a66ce65455595a211d530facc54_3.csv',
]
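#Optional sanity check (a sketch, not in the original notebook): the yearly
#extracts are assumed to share one schema, so peek at the first file's columns
#before stacking them
print(pd.read_csv(BBL12_17CSV[0], nrows=5).columns.tolist())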
In [65]:
def data_pipeline(shapetype, bbl_links, supplement=None,
dex=None, ts_lst_range=None):
    #A pipeline for group_e dataframe operations: stack the BBL extracts, flag
    #active and newly issued licenses per month, and aggregate to shape level
#Test inputs --------------------------------------------------------------
if supplement:
assert isinstance(supplement, list)
assert isinstance(bbl_links, list)
if ts_lst_range:
assert isinstance(ts_lst_range, list)
        assert len(ts_lst_range) == 2 #Must be [start, end] in year.month form, e.g. [2012.01, 2016.12]
#We'll need our addresspoints and our shapefile
if not dex:
dex = addr_shape(shapetype)
    #We need a list of time_unit_of_analysis
    if ts_lst_range:
        ts_lst = [x + (i/100) for i in range(1, 13) for x in range(1980, 2025)]
        ts_lst = [x for x in ts_lst if
                  ts_lst_range[0] <= x <= ts_lst_range[1]]
    else:
        ts_lst = [x + (i/100) for i in range(1, 13) for x in range(2012, 2017)]
    ts_lst = sorted(ts_lst)
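    #Each time unit encodes year.month as a float, e.g. 2012 + 3/100 -> 2012.03
    #(March 2012); the fraction stays below 1, so the floats sort chronologically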
    #Now we need to stack our BBL data ----------------------------------------
    #Read each yearly extract, checking that the column count stays consistent
    frames = []
    col_len = None
    for i in bbl_links:
        bbl = pd.read_csv(i, encoding='utf-8', low_memory=False)
        if col_len is not None and len(bbl.columns) != col_len:
            print('Column Mismatch!')
        col_len = len(bbl.columns)
        frames.append(bbl)
    bbl_df = pd.concat(frames, ignore_index=True)
    del frames
bbl_df.LICENSE_START_DATE = pd.to_datetime(
bbl_df.LICENSE_START_DATE)
bbl_df.LICENSE_EXPIRATION_DATE = pd.to_datetime(
bbl_df.LICENSE_EXPIRATION_DATE)
bbl_df.LICENSE_ISSUE_DATE = pd.to_datetime(
bbl_df.LICENSE_ISSUE_DATE)
    bbl_df = bbl_df.sort_values('LICENSE_START_DATE')
    #Set up our time unit of analysis (year + month/100, same as ts_lst)
bbl_df['month'] = bbl_df['LICENSE_START_DATE'].dt.year + (
bbl_df['LICENSE_START_DATE'].dt.month/100
)
bbl_df['endMonth'] = bbl_df['LICENSE_EXPIRATION_DATE'].dt.year + (
bbl_df['LICENSE_EXPIRATION_DATE'].dt.month/100
)
bbl_df['issueMonth'] = bbl_df['LICENSE_ISSUE_DATE'].dt.year + (
bbl_df['LICENSE_ISSUE_DATE'].dt.month/100
)
    bbl_df['endMonth'] = bbl_df['endMonth'].fillna(max(ts_lst))
    bbl_df.loc[bbl_df['endMonth'] > max(ts_lst), 'endMonth'] = max(ts_lst)
#Sort on month
bbl_df = bbl_df.dropna(subset=['month'])
bbl_df = bbl_df.set_index(['MARADDRESSREPOSITORYID','month'])
bbl_df = bbl_df.sort_index(ascending=True)
bbl_df.reset_index(inplace=True)
bbl_df = bbl_df[bbl_df['MARADDRESSREPOSITORYID'] >= 0]
bbl_df = bbl_df.dropna(subset=['LICENSESTATUS', 'issueMonth', 'endMonth',
'MARADDRESSREPOSITORYID','month',
'LONGITUDE', 'LATITUDE'
])
#Now that we have the BBL data, let's create our flag and points data -----
#This is the addresspoints, passed from the dex param
addr_df = dex[0]
#Zip the latlongs
addr_df['geometry'] = [
Point(xy) for xy in zip(
addr_df.LONGITUDE.apply(float), addr_df.LATITUDE.apply(float)
)
]
addr_df['Points'] = addr_df['geometry'] #Duplicate, so raw retains points
addr_df['dummy_counter'] = 1 #Always one, always dropped before export
crs='EPSG:4326' #Convenience assignment of crs
    #Now we're stacking for each month ----------------------------------------
    month_frames = [] #Collect each month's frame; concatenated after the loop
    for i in ts_lst[:2]: #iterate the months (only the first two here, to keep the test run small)
#dex[1] is the designated shapefile passed from the dex param,
#and should match the shapetype defined in that param
        #Work on a copy of the dex[1] shapefile so the original is untouched
        shp_gdf = dex[1].copy()
        #Active BBL in month i
        bbl_df['inRange'] = 0
        bbl_df.loc[(bbl_df.endMonth > i) & (bbl_df.month <= i), 'inRange'] = 1
        #Issued BBL in month i
        bbl_df['isuFlag'] = 0
        bbl_df.loc[bbl_df.issueMonth == i, 'isuFlag'] = 1
#Merge BBL and MAR datasets -------------------------------------------
addr = pd.merge(addr_df, bbl_df, how='left',
left_on='ADDRESS_ID', right_on='MARADDRESSREPOSITORYID')
addr = gpd.GeoDataFrame(addr, crs=crs, geometry=addr.geometry)
addr.crs = shp_gdf.crs
raw = gpd.sjoin(shp_gdf, addr, how='left', op='intersects')
        #Percent of buildings per shape with an active license in month i,
        #which we call a 'utilization index'
        sums = raw.groupby('NAME').sum() #Sums the 0/1 flags within each shape
        numer = sums.inRange
        denom = sums.dummy_counter
        issue = sums.isuFlag
flags = []
utl_inx = pd.DataFrame(numer/denom)
utl_inx.columns = [
'Util_Indx_BBL'
]
flags.append(utl_inx)
#This is number of buildings with an active BBL in month i
bbl_count = pd.DataFrame(numer)
bbl_count.columns = [
'countBBL'
]
flags.append(bbl_count)
#This is number of buildings that were issued a BBL in month i
isu_count = pd.DataFrame(issue)
isu_count.columns = [
'countIssued'
]
flags.append(isu_count)
        for flag in flags:
            #flags are plain DataFrames indexed on NAME; merge onto the shapes
            shp_gdf = shp_gdf.merge(flag,
                                    how="left", left_on='NAME', right_index=True)
shp_gdf['month'] = i
#Head will be the list of retained columns
head = ['NAME', 'Util_Indx_BBL',
'countBBL', 'countIssued',
'month', 'geometry']
shp_gdf = shp_gdf[head]
        if supplement:
            #Supplement functions plug in here: each entry is a list of
            #[func, extra args...], applied as func(shp_gdf, raw, *args)
            for supp_func in supplement:
                shp_gdf = supp_func[0](shp_gdf, raw, *supp_func[1:])
        month_frames.append(shp_gdf) #This does the stacking
print('Merged month:', i)
del shp_gdf, addr, utl_inx #Save me some memory please!
    out_gdf = pd.concat(month_frames, ignore_index=True)
    #Can't have strings in our matrix
    out_gdf = pd.get_dummies(out_gdf, columns=['NAME'])
out_gdf = out_gdf.drop('geometry', axis=1)
out_gdf.to_csv('./data/' + shapetype + '_out.csv') #Save
return [bbl_df, addr_df, out_gdf, raw] #Remove this later, for testing now
In [66]:
dex = addr_shape('anc')
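#A sketch of the simplest call (hypothetical; 'demo_run' is not in the original
#notebook): reuse dex so the shapefile isn't re-fetched. The function returns
#[bbl_df, addr_df, out_gdf, raw], with the last element kept for testing.
demo_run = data_pipeline('anc', BBL12_17CSV, dex=dex)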
In [67]:
def metro_prox(shp_gdf, raw, bufr=None):
#Flag properties within distance "bufr" of metro stations
if not bufr:
        bufr = 1/250 #About 0.004 degrees, roughly 400m at DC's latitude; hard to say what a good buffer is.
assert isinstance(bufr, float) #buffer must be float!
#Frame up the metro buffer shapes
metro = down_extract_zip(
'https://opendata.arcgis.com/datasets/54018b7f06b943f2af278bbe415df1de_52.zip'
)
metro = gpd.read_file(metro, crs=shp_gdf.crs)
metro.geometry = metro.geometry.buffer(bufr)
    metro['bymet'] = 1 #Flags address points that fall inside a station buffer
    #Drop the station NAME so it doesn't collide with the shapes' NAME column
    metro.drop(['NAME'], axis=1, inplace=True)
#Frame up the raw address points data
pointy = raw[['NAME', 'Points', 'dummy_counter']]
pointy = gpd.GeoDataFrame(pointy, crs=metro.crs,
geometry=pointy.Points)
pointy = gpd.sjoin(pointy, metro,
how='left', op='intersects')
    sums = pointy.groupby('NAME').sum() #Sums dummy_counter and bymet per shape
    denom = sums.dummy_counter
    numer = sums.bymet
pct_metro_coverage = pd.DataFrame(numer/denom)
pct_metro_coverage.columns = [
'pct_metro_coverage'
]
    pct_metro_coverage.fillna(0, inplace=True)
shp_gdf = shp_gdf.merge(pct_metro_coverage,
how="left", left_on='NAME', right_index=True)
return shp_gdf
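#To plug metro_prox into the pipeline, pass it through the supplement
#parameter: each entry is a list of [function, extra args...], applied as
#func(shp_gdf, raw, *args). A sketch with an assumed buffer of 1/250 degrees:
sets = data_pipeline('anc', BBL12_17CSV, dex=dex,
                     supplement=[[metro_prox, 1/250]])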
In [68]:
#sets holds [bbl_df, addr_df, out_gdf, raw]; sets[1] is the address df
In [69]:
#sets[2] is the stacked output: its number of rows equals our number of shapes * number of months
In [70]:
#Construction permit extracts, one CSV per year (2012-2017)
cz1217 = [
    'https://opendata.arcgis.com/datasets/9cbe8553d4e2456ab6c140d83c7e83e0_15.csv',
    'https://opendata.arcgis.com/datasets/3d49e06d51984fa2b68f21eed21eba1f_14.csv',
    'https://opendata.arcgis.com/datasets/54b57e15f6944af8b413a5e4f88b070c_13.csv',
    'https://opendata.arcgis.com/datasets/b3283607f9b74457aff420081eec3190_29.csv',
    'https://opendata.arcgis.com/datasets/2dc1a7dbb705471eb38af39acfa16238_28.csv',
    'https://opendata.arcgis.com/datasets/585c8c3ef58c4f1ab1ddf1c759b3a8bd_39.csv',
]
In [71]:
#Stack the yearly permit extracts into one frame
constr12_17 = pd.concat([pd.read_csv(url) for url in cz1217],
                        axis=0, ignore_index=True)
In [72]:
constr12_17.head().T
Out[72]:
In [73]:
constr12_17.columns
Out[73]:
In [74]:
constr12_17.shape
Out[74]:
In [75]:
constr12_17['effective_date'] = pd.to_datetime(constr12_17['EFFECTIVEDATE'])
constr12_17['expire_date'] = pd.to_datetime(constr12_17['EXPIRATIONDATE'])
permit_length = (constr12_17.expire_date - constr12_17.effective_date)
print(permit_length)
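#permit_length is a Timedelta series; summarizing it in days gives a quick
#read on typical permit duration (a sketch, not in the original notebook)
print(permit_length.dt.days.describe())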
In [76]:
keep_status = ['Permit Expired', 'Approved (Pending Payment)',
               'Issued', 'Assigned']
#.copy() so the column assignments below don't hit a SettingWithCopyWarning
permits = constr12_17[constr12_17['STATUS'].isin(keep_status)].copy()
print(permits.STATUS.unique())
In [77]:
permits = permits.sort_values('effective_date')
permits['month'] = permits['effective_date'].dt.year + (permits['effective_date'].dt.month/100)
permits['endmonth'] = permits['expire_date'].dt.year + (permits['expire_date'].dt.month/100)
permits = permits.set_index(['month'])
permits = permits.sort_index(ascending=True)
permits.reset_index(inplace=True)
permits = permits.dropna(subset=['STATUS', 'endmonth', 'month', 'LONGITUDE', 'LATITUDE'])
In [78]:
! wget https://opendata.arcgis.com/datasets/6969dd63c5cb4d6aa32f15effb8311f3_8.geojson -O census2012
In [79]:
census = gpd.read_file('census2012')
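#The spatial join below assumes the tract polygons share the permit points'
#CRS (EPSG:4326); a quick check of what the GeoJSON declares (a sketch):
print(census.crs)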
In [80]:
census.columns
Out[80]:
In [81]:
geometry = [Point(xy) for xy in zip(permits.LONGITUDE.apply(float), permits.LATITUDE.apply(float))]
crs = 'EPSG:4326' #String form; the {'init': ...} dict style is deprecated
points = gpd.GeoDataFrame(permits, crs=crs, geometry=geometry)
fig, ax = plt.subplots()
census.plot(ax=ax, color='red')
points.plot(ax=ax, color='black', marker='.', markersize=5)
ax.set_aspect('equal')
In [82]:
geo_constr = gpd.sjoin(census, points, how='left', op='intersects')
geo_constr.head()
Out[82]:
In [83]:
geo_constr.geometry.head()
Out[83]:
In [110]:
#Permits per tract, scaled per 100,000 housing units (H0010002)
Construct2HouseRatio = pd.DataFrame(geo_constr.TRACT.value_counts()*100000/geo_constr.H0010002.sum())
Construct2HouseRatio.columns = ['Construct2HouseRatio']
print(Construct2HouseRatio)
In [128]:
tracts_housepermits = census.merge(Construct2HouseRatio, how="left", left_on='TRACT', right_index=True)
tracts_housepermits = tracts_housepermits.dropna(subset=['Construct2HouseRatio'])
tracts_housepermits.head()
Out[128]:
In [88]:
Construct = pd.DataFrame(geo_constr.TRACT.value_counts())
Construct.columns = ['Construct']
print(Construct)
In [126]:
tracts_construction = census.merge(Construct, how="left", left_on='TRACT', right_index=True)
tracts_construction.head()
Out[126]:
In [129]:
tracts_construction.corr()['Construct'].sort_values()
Out[129]:
In [ ]:
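#A quick visual check of where permits cluster (a sketch, not in the original
#notebook; tracts with no matched permits show as missing):
ax = tracts_construction.plot(column='Construct', legend=True)
ax.set_title('Construction permits by census tract, 2012-2017')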