In [1]:
# imports
import os.path
import numpy as np
import pandas as pd

In [2]:
# housekeeping: resolve <repo parent>/data/ portably with os.path.join instead
# of hand-gluing "/" (the trailing empty component keeps the trailing separator,
# which downstream cells rely on when they build paths via string concatenation)
data_dir = os.path.join(os.path.dirname(os.getcwd()), "data", "")

In [3]:
# library data load
# load the three DataSF extracts, building each path off data_dir
# (data_dir ends with a separator, so join is equivalent to concatenation)
lib_data = pd.read_csv(os.path.join(data_dir, "Library_Usage.csv"))
park_historical_data = pd.read_csv(os.path.join(data_dir, "Park_Scores_2005-2014.csv"))
park_recent_data = pd.read_csv(os.path.join(data_dir, "Park_Evaluation_Scores_starting_Fiscal_Year_2015.csv"))

In [4]:
# let's get a sense of what our data looks like
# (one row per patron: lifetime checkout/renewal totals, age range, home library,
#  and — only for some rows — a Supervisor District; the NaNs are filtered later)
lib_data.head()


Out[4]:
Patron Type Code Patron Type Definition Total Checkouts Total Renewals Age Range Home Library Code Home Library Definition Circulation Active Month Circulation Active Year Notice Preference Code Notice Preference Definition Provided Email Address Year Patron Registered Outside of County Supervisor District
0 3 SENIOR 28 13 65 to 74 years X Main Library November 2012 z email True 2003 True NaN
1 0 ADULT 21 10 55 to 59 years X Main Library October 2015 z email True 2003 False NaN
2 0 ADULT 275 559 60 to 64 years X Main Library January 2015 z email True 2003 True NaN
3 0 ADULT 73 38 45 to 54 years M8 Mission Bay February 2016 z email True 2003 False NaN
4 0 ADULT 182 90 45 to 54 years X Main Library July 2016 z email True 2003 False NaN

In [5]:
lib_data.columns


Out[5]:
Index(['Patron Type Code', 'Patron Type Definition', 'Total Checkouts',
       'Total Renewals', 'Age Range', 'Home Library Code',
       'Home Library Definition', 'Circulation Active Month',
       'Circulation Active Year', 'Notice Preference Code',
       'Notice Preference Definition', 'Provided Email Address',
       'Year Patron Registered', 'Outside of County', 'Supervisor District'],
      dtype='object')

In [6]:
lib_data.shape


Out[6]:
(423448, 15)

In [7]:
lib_data.loc[:,["Total Checkouts", "Total Renewals", "Year Patron Registered"]].describe()


Out[7]:
Total Checkouts Total Renewals Year Patron Registered
count 423448.000000 423448.000000 423448.000000
mean 161.982097 59.657327 2010.348917
std 453.703678 225.009917 4.357374
min 0.000000 0.000000 2003.000000
25% 2.000000 0.000000 2007.000000
50% 19.000000 2.000000 2012.000000
75% 113.000000 27.000000 2014.000000
max 35907.000000 8965.000000 2016.000000

In [8]:
# the data runs from 2003-2016, average books checked out is 162, average renewals 60.
# note these are over the entire history of the patron

# we can subtract the year they were registered from 2016 and divide the total number of checkouts
# by this number to get number of checkouts per year

# group this by supervisor district

In [9]:
"""
Things I want to do to explore this data:
- DONE remove records without supervisor districts
- DONE group by supervisor district and explore mean
- DONE group by supervisor district to determine park service area overlap with supervisor district
- look at trends in age, checkouts, renewals by supervisor district
- match supervisor districts with park scores from other DataSF data
- see if there is correlation

- cook up a way to access this data using dash
- dockerize
"""

#lib_data.where(np.isnan(lib_data["Supervisor District"]))


Out[9]:
'\nThings I want to do to explore this data:\n- DONE remove records without supervisor districts\n- DONE group by supervisor district and explore mean\n- DONE group by supervisor district to determine park service area overlap with supervisor district\n- look at trends in age, checkouts, renewals by supervisor district\n- match supervisor districts with park scores from other DataSF data\n- see if there is correlation\n\n- cook up a way to access this data using dash\n- dockerize\n'

In [10]:
# only library data that has a supervisor district: keep rows where the
# "Supervisor District" field is present (same rows/index as dropna(subset=...))
lib_data_w_sd = lib_data[lib_data["Supervisor District"].notna()]

In [11]:
lib_data_w_sd.shape


Out[11]:
(313138, 15)

In [12]:
park_historical_data.head()


Out[12]:
ParkID PSA Park FQ Score
0 86 PSA4 Carl Larsen Park FY05Q3 0.795
1 13 PSA4 Junipero Serra Playground FY05Q3 0.957
2 9 PSA4 Rolph Nicol Playground FY05Q3 0.864
3 117 PSA2 Alamo Square FY05Q4 0.857
4 60 PSA6 Jose Coronado Playground FY05Q4 0.859

In [13]:
park_recent_data.head()


Out[13]:
Park Park Type Park Site Score PSA Supervisor District
0 10th Avenue-Clement Mini Park Mini Park 88.2 PSA 1 1
1 24th Street-York Mini Park Mini Park 89.4 PSA 6 9
2 Adam Rogers Park Neighborhood Park or Playground 88.4 PSA 3 10
3 Alamo Square Neighborhood Park or Playground 85.0 PSA 2 5
4 Alice Chalmers Playground Neighborhood Park or Playground 63.1 PSA 3 11

In [14]:
# average recent Park Site Score per supervisor district.
# numeric_only=True: newer pandas (>= 2.0) raises a TypeError on the string
# columns (Park, Park Type, PSA) instead of silently dropping them
park_recent_data.groupby(["Supervisor District"]).mean(numeric_only=True)
# this is an interesting tidbit.


Out[14]:
Park Site Score
Supervisor District
1 88.128571
2 87.050000
3 86.080952
4 86.012500
5 88.400000
6 85.575000
7 87.172727
8 86.030000
9 86.831579
10 81.943478
11 77.100000

In [15]:
park_recent_data.groupby(["PSA"]).mean()


Out[15]:
Park Site Score Supervisor District
PSA
GGP 86.100000 1.000000
PSA 1 87.045455 2.181818
PSA 2 87.775000 6.062500
PSA 3 79.086957 10.173913
PSA 4 84.218182 6.545455
PSA 5 86.290476 8.000000
PSA 6 86.104348 8.782609

In [16]:
park_historical_data.groupby(["PSA"]).mean()["Score"]


Out[16]:
PSA
GGP     0.889997
PSA1    0.923912
PSA2    0.901968
PSA3    0.857579
PSA4    0.883901
PSA5    0.879724
PSA6    0.925487
Name: Score, dtype: float64

In [17]:
# I want to see where the park service areas overlap the supervisor districts:
# build, per district, the list of PSAs in order of first appearance.

# derive the district count from the data instead of hardcoding 11
n_districts = int(park_recent_data["Supervisor District"].max())
supervisor_district = [[] for _ in range(n_districts)]

# iterate the two columns directly — avoids the repeated .loc[index[i], ...]
# row lookups of a positional loop
for sd_raw, psa in zip(park_recent_data["Supervisor District"], park_recent_data["PSA"]):
    sd = int(sd_raw) - 1  # district numbers are 1-based; lists are 0-based
    if psa not in supervisor_district[sd]:
        supervisor_district[sd].append(psa)

# runtime O(n): each membership list holds at most a handful of PSAs

for i, psas in enumerate(supervisor_district):
    print("Supervisor District {0}: ".format(i + 1), psas)


Supervisor District 1:  ['PSA 1', 'GGP', 'PSA 4']
Supervisor District 2:  ['PSA 1']
Supervisor District 3:  ['PSA 1', 'PSA 2']
Supervisor District 4:  ['PSA 4']
Supervisor District 5:  ['PSA 2']
Supervisor District 6:  ['PSA 2', 'PSA 6']
Supervisor District 7:  ['PSA 4', 'PSA 5']
Supervisor District 8:  ['PSA 5', 'PSA 6']
Supervisor District 9:  ['PSA 6', 'PSA 3']
Supervisor District 10:  ['PSA 3', 'PSA 2', 'PSA 6']
Supervisor District 11:  ['PSA 3', 'PSA 4', 'PSA 5']

Supervisor Districts in PSA:

  • GGP:
    • 1
  • PSA 1:
    • 1,2,3
  • PSA 2:
    • 3,5,6,10
  • PSA 3:
    • 9,10,11
  • PSA 4:
    • 1,4,7,11
  • PSA 5:
    • 7,8,11
  • PSA 6:
    • 6,8,9,10

Since supervisor districts are more granular than the PSAs, we'll work at the supervisor-district level and map each PSA's park score onto its districts as a weighted average, weighted by how many supervisor districts that PSA spans.


In [18]:
lib_data_w_sd.groupby(["Supervisor District"]).mean()


Out[18]:
Patron Type Code Total Checkouts Total Renewals Provided Email Address Year Patron Registered Outside of County
Supervisor District
1 0.984806 218.482361 84.322545 0.833651 2009.590697 0.000373
2 0.997589 135.532879 51.796956 0.894956 2010.568714 0.000851
3 1.090064 237.762087 69.529637 0.746287 2010.325313 0.001715
4 0.949755 250.812043 95.975618 0.809882 2009.346286 0.000586
5 0.962548 140.236211 57.932254 0.842467 2010.547362 0.001305
6 1.201079 159.816690 42.613649 0.745313 2011.406610 0.002754
7 0.998989 202.031986 81.073068 0.846788 2009.354842 0.001761
8 0.923858 163.039405 69.068333 0.874008 2009.840882 0.000748
9 0.891562 161.468131 54.605992 0.738643 2009.899264 0.000852
10 0.906192 138.875325 38.933371 0.661770 2010.518191 0.000434
11 0.997010 184.991030 59.617000 0.682054 2009.774905 0.000821

In [19]:
park_combined = pd.merge(park_historical_data, park_recent_data, on="Park")

In [20]:
park_combined.head()


Out[20]:
ParkID PSA_x Park FQ Score Park Type Park Site Score PSA_y Supervisor District
0 86 PSA4 Carl Larsen Park FY05Q3 0.795 Neighborhood Park or Playground 88.7 PSA 4 4
1 86 PSA4 Carl Larsen Park FY06Q1 0.813 Neighborhood Park or Playground 88.7 PSA 4 4
2 86 PSA4 Carl Larsen Park FY06Q2 0.256 Neighborhood Park or Playground 88.7 PSA 4 4
3 86 PSA4 Carl Larsen Park FY06Q4 0.648 Neighborhood Park or Playground 88.7 PSA 4 4
4 86 PSA4 Carl Larsen Park FY07Q2 0.716 Neighborhood Park or Playground 88.7 PSA 4 4

In [21]:
# per-park averages over the combined data; the historical Score is on a 0-1
# scale, so rescale to 0-100 to match the recent "Park Site Score".
# Built with .assign instead of an in-place `*= 100` so re-running this cell is
# idempotent (the original rescaled Score again on every re-run).
# numeric_only=True keeps pandas >= 2.0 from raising on the string columns.
grouped_by_park = (
    park_combined.groupby(["Park"])
    .mean(numeric_only=True)
    .assign(Score=lambda d: d["Score"] * 100)
)
grouped_by_park.head()


Out[21]:
ParkID Score Park Site Score Supervisor District
Park
10th Avenue-Clement Mini Park 156 91.044000 88.2 1
24th Street-York Mini Park 51 97.803333 89.4 9
Adam Rogers Park 46 80.734375 88.4 10
Alamo Square 117 89.654839 85.0 5
Alice Chalmers Playground 25 89.837500 63.1 11

In [22]:
# roll the per-park means up to supervisor-district level
means = grouped_by_park.groupby(["Supervisor District"]).mean()
# NOTE(review): averaging ParkID is meaningless, and district 10's ParkID mean of
# ~7314 (vs <160 elsewhere) suggests some out-of-range ParkID values — worth
# checking the source data; consider dropping ParkID before aggregating.
means


Out[22]:
ParkID Score Park Site Score
Supervisor District
1 118.153846 90.023564 88.969231
2 125.181818 93.511954 86.927273
3 150.125000 93.919523 87.537500
4 86.285714 89.512920 85.742857
5 121.785714 88.938648 88.485714
6 121.800000 90.631516 85.630000
7 64.909091 91.606806 87.172727
8 81.578947 89.319242 86.047368
9 75.823529 92.780364 86.929412
10 7314.526316 86.975325 81.605263
11 65.300000 86.169463 76.600000

In [23]:
means.index.values


Out[23]:
array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [37]:
means["Score"].values


Out[37]:
array([ 90.02356371,  93.51195438,  93.91952346,  89.5129199 ,
        88.93864833,  90.6315163 ,  91.60680563,  89.31924232,
        92.78036448,  86.97532498,  86.16946273])

In [24]:
lib_data_w_sd.groupby(["Supervisor District"]).mean().index


Out[24]:
Float64Index([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0], dtype='float64', name='Supervisor District')

In [ ]: