In [1]:
# imports
import os.path
import numpy as np
import pandas as pd
In [2]:
# housekeeping: resolve the data directory (sibling "data" folder of the
# notebook's parent directory) in an OS-portable way. The trailing "" makes
# os.path.join emit a trailing separator, so filenames can be appended
# directly by string concatenation as the load cells do.
data_dir = os.path.join(os.path.dirname(os.getcwd()), "data", "")
In [3]:
# load the raw DataSF CSV exports: library usage plus the two park-score
# vintages (2005-2014 and fiscal year 2015 onward)
lib_data = pd.read_csv(os.path.join(data_dir, "Library_Usage.csv"))
park_historical_data = pd.read_csv(os.path.join(data_dir, "Park_Scores_2005-2014.csv"))
park_recent_data = pd.read_csv(os.path.join(data_dir, "Park_Evaluation_Scores_starting_Fiscal_Year_2015.csv"))
In [4]:
# let's get a sense of what our data looks like
# (head() shows the first five rows; leaving it as the cell's last expression
# renders the rich HTML table as the cell output)
lib_data.head()
Out[4]:
In [5]:
# list every column name in the library usage table
lib_data.columns
Out[5]:
In [6]:
# (rows, columns) of the full library usage table
lib_data.shape
Out[6]:
In [7]:
# summary statistics (count/mean/std/quartiles) for the numeric columns of interest
lib_data[["Total Checkouts", "Total Renewals", "Year Patron Registered"]].describe()
Out[7]:
In [8]:
# the data runs from 2003-2016; the average patron has 162 total checkouts and 60 renewals.
# note these totals accumulate over the entire history of the patron.
# we can subtract the year they were registered from 2016 and divide the total number of
# checkouts by that span to estimate checkouts per year,
# then group this by supervisor district
In [9]:
"""
Things I want to do to explore this data:
- DONE remove records without supervisor districts
- DONE group by supervisor district and explore mean
- DONE group by supervisor district to determine park service area overlap with supervisor district
- look at trends in age, checkouts, renewals by supervisor district
- match supervisor districts with park scores from other DataSF data
- see if there is correlation
- cook up a way to access this data using dash
- dockerize
"""
#lib_data.where(np.isnan(lib_data["Supervisor District"]))
Out[9]:
In [10]:
# keep only the library records that have a supervisor district assigned
# (rows with a missing "Supervisor District" value are filtered out)
lib_data_w_sd = lib_data.loc[lib_data["Supervisor District"].notna()]
In [11]:
# row count after dropping records with no supervisor district
lib_data_w_sd.shape
Out[11]:
In [12]:
# first rows of the 2005-2014 park score data
park_historical_data.head()
Out[12]:
In [13]:
# first rows of the fiscal-year-2015-onward park evaluation data
park_recent_data.head()
Out[13]:
In [14]:
# mean of each numeric column per supervisor district.
# numeric_only=True keeps this working on pandas >= 2.0, where a groupby
# .mean() raises TypeError if non-numeric columns are present.
park_recent_data.groupby(["Supervisor District"]).mean(numeric_only=True)
# this is an interesting tidbit.
Out[14]:
In [15]:
# mean of each numeric column per park service area (PSA);
# numeric_only=True keeps pandas >= 2.0 from raising on non-numeric columns
park_recent_data.groupby(["PSA"]).mean(numeric_only=True)
Out[15]:
In [16]:
# average historical Score per PSA. Selecting the column before aggregating
# avoids computing means of every other column and sidesteps the
# pandas >= 2.0 TypeError on non-numeric columns; the result is the same
# Score Series the original .mean()["Score"] produced.
park_historical_data.groupby("PSA")["Score"].mean()
Out[16]:
In [17]:
# Map each supervisor district (1-11) to the park service areas (PSAs) whose
# parks fall inside it, preserving first-seen order.
# (The exported cell had lost its indentation; reconstructed here. Row access
# uses iterrows() instead of positional .loc lookups, and int() guards the
# list index in case "Supervisor District" is stored as a float.)
supervisor_district = [[] for _ in range(11)]
for _, row in park_recent_data.iterrows():
    sd = int(row["Supervisor District"]) - 1  # districts are 1-based in the data
    psa = row["PSA"]
    if psa not in supervisor_district[sd]:
        supervisor_district[sd].append(psa)
# runtime O(n): each district holds only a handful of PSAs, so the membership
# test is effectively constant time
for i, psas in enumerate(supervisor_district):
    print("Supervisor District {0}: ".format(i + 1), psas)
In [18]:
# per-district means of the numeric library columns (checkouts, renewals, ...);
# numeric_only=True keeps pandas >= 2.0 from raising on the text columns
lib_data_w_sd.groupby(["Supervisor District"]).mean(numeric_only=True)
Out[18]:
In [19]:
# inner-join the historical and recent park data on park name; parks present
# in only one dataset are dropped. NOTE(review): any column name shared by
# both frames (other than "Park") gets suffixed _x/_y by the merge — confirm
# which side the downstream "Score" column comes from.
park_combined = park_historical_data.merge(park_recent_data, on="Park")
In [20]:
# inspect the first rows of the merged park table
park_combined.head()
Out[20]:
In [21]:
# average every numeric column per park; numeric_only=True keeps pandas >= 2.0
# from raising on the text columns of the merged frame.
grouped_by_park = park_combined.groupby(["Park"]).mean(numeric_only=True)
# rescale Score by 100 — presumably the historical scores are stored as a
# 0-1 fraction and this puts them on a 0-100 scale; TODO confirm the units
# against the raw CSV
grouped_by_park["Score"] *= 100
grouped_by_park.head()
Out[21]:
In [22]:
# district-level means of the per-park averages (each park contributes equal
# weight, regardless of how many inspection records it had)
means = grouped_by_park.groupby("Supervisor District").mean()
means
Out[22]:
In [23]:
# the supervisor district labels as a numpy array
means.index.values
Out[23]:
In [37]:
means["Score"].values
Out[37]:
In [24]:
# supervisor-district index of the library-side means, for comparison with the
# park-side index above; numeric_only=True keeps the aggregation working on
# pandas >= 2.0 (non-numeric columns otherwise raise TypeError)
lib_data_w_sd.groupby(["Supervisor District"]).mean(numeric_only=True).index
Out[24]:
In [ ]: