In [1]:
import seaborn as sns
import metapack as mp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
import geopandas as gpd
from shapely.geometry import Point
import re
import sys
!{sys.executable} -m pip --quiet install python-slugify
%matplotlib inline
sns.set_context('notebook')
mp.jupyter.init()
In [2]:
#pkg = mp.jupyter.open_package()
pkg = mp.jupyter.open_source_package()
pkg
Out[2]:
In [3]:
df15 = pkg.reference('parking_transactions_2015').dataframe().sample(100_000)
df16 = pkg.reference('parking_transactions_2016').dataframe().sample(100_000)
df17 = pkg.reference('parking_transactions_2017').dataframe().sample(100_000)
df18 = pkg.reference('parking_transactions_2018').dataframe().sample(100_000)
In [4]:
from functools import reduce
all_dfs = [df15, df16, df17, df18]
#columns = reduce(lambda a, b: a | set(b.columns), all_dfs, set())
columns = [
'uuid',
'pole_id',
'trans_start',
'meter_expire',
'meter_type',
'pay_method',
'trans_amt',
'smartcard_id'
]
In [5]:
for df in all_dfs:
for col in columns:
if col not in list(df.columns):
df[col] = None
aligned_dfs = [ df[columns] for df in all_dfs ]
df = pd.concat(aligned_dfs)
In [6]:
df['trans_start'] = pd.to_datetime(df['trans_start'])
df['meter_expire'] = pd.to_datetime(df['meter_expire'])
df['n'] = 1
df.head()
Out[6]:
In [7]:
len(df)
Out[7]:
In [8]:
#
# Link in communities and tracts
loc = pkg.reference('locations_source').dataframe()
t = loc[ (loc.longitude != 0) & (loc.longitude != -180 ) & (loc.longitude != -117)]
gloc = gpd.GeoDataFrame(t, geometry=
[Point(x,y) for x,y in zip(t.longitude, t.latitude)])
compkg = mp.open_package('http://library.metatab.org/sandiegodata.org-geography-2018.csv')
tract_boundaries = compkg.resource('tract_boundaries').geoframe()
tracts = compkg.resource('tract_communities').dataframe()
gloc.crs = tract_boundaries.crs
t = gpd.sjoin(gloc, tract_boundaries, how='left').drop(columns='index_right')
t = t.merge(tracts[['geoid', 'community_name']], on='geoid')
cols = ['pole', 'community_name', 'geoid', 'geometry', ]
# Re-merge to get the poles that had invalid locations
locations = loc.merge(t[cols], on='pole')
max_pat = re.compile(r'(\d+) (Min|Hour) Max')
def find_max(v):
m = max_pat.search(v)
if not m:
return None
val = int(m.group(1))
if m.group(2) == 'Hour':
val *= 60
return val
locations['max_time'] = locations.config_name.apply(find_max)
cost_pat = re.compile(r'\$(\d+\.\d\d)')
def extract_cost(v):
m = cost_pat.search(v)
if not m:
return None
return float(m.group(1))
locations['per_minute_cost'] = (locations.config_name.apply(extract_cost)/60).round(4)
hours_pat = re.compile(r'(\d+)am-(\d)pm')
import json
def extract_hours(v):
m = hours_pat.search(v)
if not m:
return None
hours = [int(m.group(1)), int(m.group(2))+12]
v = v.replace(m.group(0), '')
m = hours_pat.search(v)
if not m:
return json.dumps(hours) # JSON notreally necessary; python is same
return json.dumps(hours + [int(m.group(1)), int(m.group(2))+12])
locations['hours'] = locations.config_name.apply(extract_hours)
days_pat = re.compile(r'(M|Mon)-(Fri|Sat)')
def extract_days(v):
if 'Daily' in v:
return json.dumps(['Mon','Sun'])
m = days_pat.search(v)
if not m:
return ''
s,e = m.group(1), m.group(2)
if s == 'M':
s = 'Mon'
days = [s,e]
v = v.replace(m.group(0), '')
m = days_pat.search(v)
if not m:
if 'Sat' in v:
return json.dumps(days+ ['Sat','Sat'])
else:
return json.dumps(days) # JSON not really necessary; python is same
return json.dumps(days + [m.group(1), m.group(2)])
locations['days'] = locations.config_name.apply(extract_days)
locations['area'] = locations.area.astype('category')
locations.sort_values('pole', inplace=True)
locations['pole_index'] = locations.reset_index().index + 1
locations.rename(columns={'pole':'pole_id'}, inplace=True)
# Map poles to meter types.
meter_type_merge = df[['pole_id', 'meter_type']].sort_values('pole_id').drop_duplicates()
locations = locations.merge(meter_type_merge, on='pole_id', how='left')
final_cols = ['pole_id', 'pole_index', 'config_id', 'config_name', 'max_time', 'hours', 'days', 'per_minute_cost',
'meter_type', 'zone', 'area', 'sub_area', 'community_name', 'geoid','longitude', 'latitude', 'geometry']
locations = locations[final_cols]
locations.head()
Out[8]:
In [ ]:
In [ ]: