In [1]:
import geopandas as gpd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from shapely.geometry import Point
%matplotlib inline
In [2]:
# Load the tree data from the external GeoJSON file into a GeoDataFrame
df = gpd.read_file('../data/external/tree-canopy.geojson')
In [3]:
# Inspect which columns the raw data set provides
df.columns
Out[3]:
In [4]:
# Preview the first rows of the raw data
df.head()
Out[4]:
In [5]:
# Restrict the frame to the attributes used in the rest of the analysis
relevant_columns = [
    "block_id",
    "tree_dbh",
    "status",
    "health",
    "spc_latin",
    "spc_common",
    "sidewalk",
    "geometry",
    "tree_id",
]
df = df.loc[:, relevant_columns]
In [6]:
# The data set also records dead trees and stumps; report how many of each
for status_label in ("Dead", "Stump"):
    print(status_label + ":", len(df[df.status == status_label]))
In [7]:
# Drop every record whose status is anything other than "Alive"
is_alive = df["status"] == "Alive"
df = df[is_alive]
In [8]:
# Reproject to WGS84 (EPSG:4326). The EPSG code is passed through the `epsg`
# keyword rather than the {'init': 'epsg:4326'} dict, because the "init"
# syntax is deprecated and newer pyproj/geopandas releases warn about it.
df = df.to_crs(epsg=4326)
In [9]:
# Preview three rows after filtering and reprojection
df.head(3)
Out[9]:
tree_dbh
Diameter of the tree, measured at approximately 54" / 137cm above the ground. Data was collected for both living and dead trees; for stumps, use stump_diam.
Because standard measuring tapes are more accessible than forestry-specific measuring tapes designed to measure diameter, users originally measured tree circumference in the field. To better match other forestry datasets, this circumference value was subsequently divided by 3.14159 to transform it to diameter. Both the field measurement and processed value were rounded to the nearest whole inch.
health
Indicates the user's perception of tree health.
In [9]:
# Store the trunk diameter as 64-bit integers
df["tree_dbh"] = df["tree_dbh"].astype("int64")
In [10]:
# Summary statistics of the cleaned data
df.describe()
Out[10]:
In [11]:
# Number of trees whose diameter is below 50
len(df[df["tree_dbh"] < 50])
Out[11]:
In [12]:
# Show the trees whose recorded diameter exceeds 100
df[df["tree_dbh"] > 100]
Out[12]:
In [14]:
# Bar chart of how many trees there are per diameter value (diameters below 40).
# sort_index() orders the bars by diameter along the x-axis;
# value_counts(sort=False) on its own does not guarantee that ordering.
df.loc[df["tree_dbh"] < 40, "tree_dbh"].value_counts().sort_index().plot(kind="bar")
Out[14]:
The most common tree trunk diameter (`tree_dbh`) is 4 inches.
What is the relative benefit of increasing tree size? Does a tree twice as large bring twice as much value to public space? Or do trees have diminishing returns on size?
In [13]:
# Count how many trees there are for each diameter value.
# NOTE(review): tree_size_counts is not referenced again in the cells visible here.
tree_size_counts = df.tree_dbh.value_counts(sort=False)
In [14]:
# Size score: the square root of (diameter + 1)
df["size_score"] = np.sqrt(df["tree_dbh"] + 1)
In [15]:
# Histogram of the size score using 30 bins
df.size_score.hist(bins=30)
Out[15]:
Much better distribution. We'll use this for the score.
In [16]:
# Bar chart of how many trees fall into each health category
df.health.value_counts(sort=False).plot(kind="bar")
Out[16]:
In [17]:
# Display the rows that have no health value recorded
missing_health = df["health"].isnull()
df[missing_health]
Out[17]:
In [18]:
# Drop the rows without a health value. The explicit .copy() makes df an
# independent frame, so columns assigned to it afterwards do not trigger
# pandas' SettingWithCopyWarning.
df = df[df["health"].notnull()].copy()
In [19]:
def define_health(x):
    """Translate a tree health label into its score multiplier.

    Parameters
    ----------
    x : str
        Health label of a tree.

    Returns
    -------
    float
        1.0 for "Good", 0.8 for "Fair", 0.6 for "Poor". Any other value
        returns NaN explicitly (rather than falling through to an implicit
        None), so unmapped labels are visible as missing values.
    """
    multipliers = {"Good": 1.0, "Fair": 0.8, "Poor": 0.6}
    return multipliers.get(x, np.nan)
# Attach the multiplier that corresponds to each tree's health label
df["health_multiplier"] = df["health"].map(define_health)
In [20]:
# Tree score: health multiplier times size score
df["score"] = df["health_multiplier"] * df["size_score"]
In [21]:
# Histogram of the per-tree score using 60 bins
df.score.hist(bins=60)
Out[21]:
In [22]:
# Summary statistics, now including the score columns added above
df.describe()
Out[22]:
The equation for the tree score is as follows:
$$ S = h \sqrt{1 + \oslash_{tree}} $$
where $h$ is the health multiplier, from 0.6 to 1.0 depending on the health of the tree, and
$\oslash_{tree}$ is the diameter of the tree.
In [ ]:
# Persist the scored trees as an interim CSV file
df.to_csv('../data/interim/tree-scores.csv')
In [23]:
# Load the station table that the tree scores will be merged into
stations = pd.read_csv('../data/processed/stations.csv')
In [24]:
# Convert the stations table into buffer polygons: one point per station,
# built from its Longitude/Latitude columns, then buffered into a circle.
station_points = gpd.GeoSeries(
    [Point(xy) for xy in zip(stations.Longitude, stations.Latitude)]
)
# NOTE(review): the buffer distance is in the units of the coordinates, i.e.
# degrees of longitude/latitude here, not metres; confirm this is intended.
geometry = station_points.buffer(.0005)
# Declare the CRS when constructing the frame, using the "EPSG:4326" string
# instead of assigning the deprecated {'init': 'epsg:4326'} dict afterwards.
geo_stations = gpd.GeoDataFrame(stations, geometry=geometry, crs="EPSG:4326")
geo_stations.to_file('../data/interim/geo_stations')
In [25]:
# Spatially join the scored trees onto the station buffers (left join, so
# stations with no matching tree are kept). The join predicate is left at its
# default, "intersects": the `op=` keyword is deprecated in newer geopandas
# releases, whereas relying on the default works across versions.
station_df = gpd.sjoin(geo_stations, df, how="left")
In [31]:
Out[31]:
In [40]:
# Save for Map: export only the trees that the join matched to a station
matched_tree_labels = station_df.index_right.dropna().unique()
df.loc[matched_tree_labels, :].to_csv("../data/map/trees.csv")
In [15]:
# Preview the joined frame instead of dumping every row into the output
station_df.head()
Out[15]:
In [138]:
# Create a new dataframe with one row per station: summed score, mean score
# and number of trees. The groupby is built once and reused for all three
# aggregates instead of regrouping the frame for every column.
scores_by_station = station_df.groupby("Station_id")["score"]
station_scores = pd.DataFrame(
    {
        "score": scores_by_station.sum(),
        "score_mean": scores_by_station.mean(),
        "tree_count": scores_by_station.count(),
    },
    columns=["score", "score_mean", "tree_count"],
)
# Keep the station id as a regular column in addition to the index
station_scores["station_id"] = station_scores.index
# Replace missing aggregates (e.g. the mean of a station with no scored
# tree) with 0.0; reassigned rather than modified in place.
station_scores = station_scores.fillna(0.0)
In [139]:
# Preview the first three per-station rows
station_scores.head(3)
Out[139]:
In [141]:
# Summary statistics of the per-station aggregates
station_scores.describe()
Out[141]:
In [142]:
# Distribution of the summed tree score per station (50 bins)
station_scores["score"].hist(bins=50)
Out[142]:
In [151]:
# Distribution of the number of trees per station (20 bins)
station_scores["tree_count"].hist(bins=20)
Out[151]:
In [155]:
# Distribution of the mean tree score per station (40 bins)
station_scores["score_mean"].hist(bins=40)
Out[155]:
In [158]:
# Count the stations whose summed score is exactly zero and report the total
has_zero_score = station_scores["score"] == 0
zero_stations = len(station_scores.loc[has_zero_score])
print("Number of stations without trees:", zero_stations)
In [160]:
# Save the per-station tree scores to the processed-data folder as CSV
station_scores.to_csv("../data/processed/tree-canopy.csv")