In [1]:
# imports
import os.path
import numpy as np
import pandas as pd

In [2]:
# housekeeping: resolve <repo parent>/data/ portably with os.path.join instead
# of hand-gluing "/" (the trailing empty component keeps the trailing separator,
# which downstream cells rely on when they build paths via string concatenation)
data_dir = os.path.join(os.path.dirname(os.getcwd()), "data", "")

In [3]:
# library data load
# load the three DataSF extracts, building each path off data_dir
# (data_dir ends with a separator, so join is equivalent to concatenation)
lib_data = pd.read_csv(os.path.join(data_dir, "Library_Usage.csv"))
park_historical_data = pd.read_csv(os.path.join(data_dir, "Park_Scores_2005-2014.csv"))
park_recent_data = pd.read_csv(os.path.join(data_dir, "Park_Evaluation_Scores_starting_Fiscal_Year_2015.csv"))

In [4]:
# let's get a sense of what our data looks like
# (one row per patron: lifetime checkout/renewal totals, age range, home library,
#  and — only for some rows — a Supervisor District; the NaNs are filtered later)
lib_data.head()


Out[4]:
Patron Type Code Patron Type Definition Total Checkouts Total Renewals Age Range Home Library Code Home Library Definition Circulation Active Month Circulation Active Year Notice Preference Code Notice Preference Definition Provided Email Address Year Patron Registered Outside of County Supervisor District
0 3 SENIOR 28 13 65 to 74 years X Main Library November 2012 z email True 2003 True NaN
1 0 ADULT 21 10 55 to 59 years X Main Library October 2015 z email True 2003 False NaN
2 0 ADULT 275 559 60 to 64 years X Main Library January 2015 z email True 2003 True NaN
3 0 ADULT 73 38 45 to 54 years M8 Mission Bay February 2016 z email True 2003 False NaN
4 0 ADULT 182 90 45 to 54 years X Main Library July 2016 z email True 2003 False NaN

In [5]:
lib_data.columns


Out[5]:
Index(['Patron Type Code', 'Patron Type Definition', 'Total Checkouts',
       'Total Renewals', 'Age Range', 'Home Library Code',
       'Home Library Definition', 'Circulation Active Month',
       'Circulation Active Year', 'Notice Preference Code',
       'Notice Preference Definition', 'Provided Email Address',
       'Year Patron Registered', 'Outside of County', 'Supervisor District'],
      dtype='object')

In [6]:
lib_data.shape


Out[6]:
(423448, 15)

In [7]:
lib_data.loc[:,["Total Checkouts", "Total Renewals", "Year Patron Registered"]].describe()


Out[7]:
Total Checkouts Total Renewals Year Patron Registered
count 423448.000000 423448.000000 423448.000000
mean 161.982097 59.657327 2010.348917
std 453.703678 225.009917 4.357374
min 0.000000 0.000000 2003.000000
25% 2.000000 0.000000 2007.000000
50% 19.000000 2.000000 2012.000000
75% 113.000000 27.000000 2014.000000
max 35907.000000 8965.000000 2016.000000

In [8]:
# the data runs from 2003-2016, average books checked out is 162, average renewals 60.
# note these are over the entire history of the patron

# we can subtract the year they were registered from 2016 and divide the total number of checkouts
# by this number to get number of checkouts per year

# group this by supervisor district

In [9]:
"""
Things I want to do to explore this data:
- DONE remove records without supervisor districts
- DONE group by supervisor district and explore mean
- DONE group by supervisor district to determine park service area overlap with supervisor district
- look at trends in age, checkouts, renewals by supervisor district
- match supervisor districts with park scores from other DataSF data
- see if there is correlation

- cook up a way to access this data using dash
- dockerize
"""

#lib_data.where(np.isnan(lib_data["Supervisor District"]))


Out[9]:
'\nThings I want to do to explore this data:\n- DONE remove records without supervisor districts\n- DONE group by supervisor district and explore mean\n- DONE group by supervisor district to determine park service area overlap with supervisor district\n- look at trends in age, checkouts, renewals by supervisor district\n- match supervisor districts with park scores from other DataSF data\n- see if there is correlation\n\n- cook up a way to access this data using dash\n- dockerize\n'

In [10]:
# only library data that has a supervisor district: keep rows where the
# "Supervisor District" field is present (same rows/index as dropna(subset=...))
lib_data_w_sd = lib_data[lib_data["Supervisor District"].notna()]

In [11]:
lib_data_w_sd.shape


Out[11]:
(313138, 15)

In [12]:
park_historical_data.head()


Out[12]:
ParkID PSA Park FQ Score
0 86 PSA4 Carl Larsen Park FY05Q3 0.795
1 13 PSA4 Junipero Serra Playground FY05Q3 0.957
2 9 PSA4 Rolph Nicol Playground FY05Q3 0.864
3 117 PSA2 Alamo Square FY05Q4 0.857
4 60 PSA6 Jose Coronado Playground FY05Q4 0.859

In [13]:
park_recent_data.head()


Out[13]:
Park Park Type Park Site Score PSA Supervisor District
0 10th Avenue-Clement Mini Park Mini Park 88.2 PSA 1 1
1 24th Street-York Mini Park Mini Park 89.4 PSA 6 9
2 Adam Rogers Park Neighborhood Park or Playground 88.4 PSA 3 10
3 Alamo Square Neighborhood Park or Playground 85.0 PSA 2 5
4 Alice Chalmers Playground Neighborhood Park or Playground 63.1 PSA 3 11

In [14]:
# average recent Park Site Score per supervisor district.
# numeric_only=True: newer pandas (>= 2.0) raises a TypeError on the string
# columns (Park, Park Type, PSA) instead of silently dropping them
park_recent_data.groupby(["Supervisor District"]).mean(numeric_only=True)
# this is an interesting tidbit.


Out[14]:
Park Site Score
Supervisor District
1 88.128571
2 87.050000
3 86.080952
4 86.012500
5 88.400000
6 85.575000
7 87.172727
8 86.030000
9 86.831579
10 81.943478
11 77.100000

In [15]:
park_recent_data.groupby(["PSA"]).mean()


Out[15]:
Park Site Score Supervisor District
PSA
GGP 86.100000 1.000000
PSA 1 87.045455 2.181818
PSA 2 87.775000 6.062500
PSA 3 79.086957 10.173913
PSA 4 84.218182 6.545455
PSA 5 86.290476 8.000000
PSA 6 86.104348 8.782609

In [16]:
park_historical_data.groupby(["PSA"]).mean()["Score"]


Out[16]:
PSA
GGP     0.889997
PSA1    0.923912
PSA2    0.901968
PSA3    0.857579
PSA4    0.883901
PSA5    0.879724
PSA6    0.925487
Name: Score, dtype: float64

In [17]:
# I want to see where the park service areas overlap the supervisor districts:
# build, per district, the list of PSAs in order of first appearance.

# derive the district count from the data instead of hardcoding 11
n_districts = int(park_recent_data["Supervisor District"].max())
supervisor_district = [[] for _ in range(n_districts)]

# iterate the two columns directly — avoids the repeated .loc[index[i], ...]
# row lookups of a positional loop
for sd_raw, psa in zip(park_recent_data["Supervisor District"], park_recent_data["PSA"]):
    sd = int(sd_raw) - 1  # district numbers are 1-based; lists are 0-based
    if psa not in supervisor_district[sd]:
        supervisor_district[sd].append(psa)

# runtime O(n): each membership list holds at most a handful of PSAs

for i, psas in enumerate(supervisor_district):
    print("Supervisor District {0}: ".format(i + 1), psas)


Supervisor District 1:  ['PSA 1', 'GGP', 'PSA 4']
Supervisor District 2:  ['PSA 1']
Supervisor District 3:  ['PSA 1', 'PSA 2']
Supervisor District 4:  ['PSA 4']
Supervisor District 5:  ['PSA 2']
Supervisor District 6:  ['PSA 2', 'PSA 6']
Supervisor District 7:  ['PSA 4', 'PSA 5']
Supervisor District 8:  ['PSA 5', 'PSA 6']
Supervisor District 9:  ['PSA 6', 'PSA 3']
Supervisor District 10:  ['PSA 3', 'PSA 2', 'PSA 6']
Supervisor District 11:  ['PSA 3', 'PSA 4', 'PSA 5']

Supervisor Districts in PSA:

  • GGP:
    • 1
  • PSA 1:
    • 1,2,3
  • PSA 2:
    • 3,5,6,10
  • PSA 3:
    • 9,10,11
  • PSA 4:
    • 1,4,7,11
  • PSA 5:
    • 7,8,11
  • PSA 6:
    • 6,8,9,10

Since supervisor districts are more granular than the PSAs, we'll work at the supervisor-district level and map each PSA's park score onto its districts as a weighted average, weighted by how many supervisor districts that PSA spans.


In [18]:
lib_data_w_sd.groupby(["Supervisor District"]).mean()


Out[18]:
Patron Type Code Total Checkouts Total Renewals Provided Email Address Year Patron Registered Outside of County
Supervisor District
1 0.984806 218.482361 84.322545 0.833651 2009.590697 0.000373
2 0.997589 135.532879 51.796956 0.894956 2010.568714 0.000851
3 1.090064 237.762087 69.529637 0.746287 2010.325313 0.001715
4 0.949755 250.812043 95.975618 0.809882 2009.346286 0.000586
5 0.962548 140.236211 57.932254 0.842467 2010.547362 0.001305
6 1.201079 159.816690 42.613649 0.745313 2011.406610 0.002754
7 0.998989 202.031986 81.073068 0.846788 2009.354842 0.001761
8 0.923858 163.039405 69.068333 0.874008 2009.840882 0.000748
9 0.891562 161.468131 54.605992 0.738643 2009.899264 0.000852
10 0.906192 138.875325 38.933371 0.661770 2010.518191 0.000434
11 0.997010 184.991030 59.617000 0.682054 2009.774905 0.000821

In [19]:
park_combined = pd.merge(park_historical_data, park_recent_data, on="Park")

In [20]:
park_combined.head()


Out[20]:
ParkID PSA_x Park FQ Score Park Type Park Site Score PSA_y Supervisor District
0 86 PSA4 Carl Larsen Park FY05Q3 0.795 Neighborhood Park or Playground 88.7 PSA 4 4
1 86 PSA4 Carl Larsen Park FY06Q1 0.813 Neighborhood Park or Playground 88.7 PSA 4 4
2 86 PSA4 Carl Larsen Park FY06Q2 0.256 Neighborhood Park or Playground 88.7 PSA 4 4
3 86 PSA4 Carl Larsen Park FY06Q4 0.648 Neighborhood Park or Playground 88.7 PSA 4 4
4 86 PSA4 Carl Larsen Park FY07Q2 0.716 Neighborhood Park or Playground 88.7 PSA 4 4

In [21]:
# per-park averages over the combined data; the historical Score is on a 0-1
# scale, so rescale to 0-100 to match the recent "Park Site Score".
# Built with .assign instead of an in-place `*= 100` so re-running this cell is
# idempotent (the original rescaled Score again on every re-run).
# numeric_only=True keeps pandas >= 2.0 from raising on the string columns.
grouped_by_park = (
    park_combined.groupby(["Park"])
    .mean(numeric_only=True)
    .assign(Score=lambda d: d["Score"] * 100)
)
grouped_by_park.head()


Out[21]:
ParkID Score Park Site Score Supervisor District
Park
10th Avenue-Clement Mini Park 156 91.044000 88.2 1
24th Street-York Mini Park 51 97.803333 89.4 9
Adam Rogers Park 46 80.734375 88.4 10
Alamo Square 117 89.654839 85.0 5
Alice Chalmers Playground 25 89.837500 63.1 11

In [22]:
# roll the per-park means up to supervisor-district level
means = grouped_by_park.groupby(["Supervisor District"]).mean()
# NOTE(review): averaging ParkID is meaningless, and district 10's ParkID mean of
# ~7314 (vs <160 elsewhere) suggests some out-of-range ParkID values — worth
# checking the source data; consider dropping ParkID before aggregating.
means


Out[22]:
ParkID Score Park Site Score
Supervisor District
1 118.153846 90.023564 88.969231
2 125.181818 93.511954 86.927273
3 150.125000 93.919523 87.537500
4 86.285714 89.512920 85.742857
5 121.785714 88.938648 88.485714
6 121.800000 90.631516 85.630000
7 64.909091 91.606806 87.172727
8 81.578947 89.319242 86.047368
9 75.823529 92.780364 86.929412
10 7314.526316 86.975325 81.605263
11 65.300000 86.169463 76.600000

In [23]:
means.index.values


Out[23]:
array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [37]:
means["Score"].values


Out[37]:
array([ 90.02356371,  93.51195438,  93.91952346,  89.5129199 ,
        88.93864833,  90.6315163 ,  91.60680563,  89.31924232,
        92.78036448,  86.97532498,  86.16946273])

In [24]:
lib_data_w_sd.groupby(["Supervisor District"]).mean().index


Out[24]:
Float64Index([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0], dtype='float64', name='Supervisor District')

In [ ]: