In [1]:
import pandas as pd
high_schools = pd.read_pickle('pickle/high_schools.p')
print(len(high_schools))
high_schools.head()
Out[1]:
In [2]:
stations_locations = pd.read_pickle('pickle/stations_locations.p')
stations_locations.head()
Out[2]:
In [3]:
stations_traffic = pd.read_pickle('pickle/stations_traffic.p')
stations_traffic.head(6)
Out[3]:
Let's plot both the schools and the stations coordinates to have a glance at their geographical distribution:
In [4]:
stations = stations_locations.rename(columns={'station': 'name'})
locations = stations.loc[:, ['name', 'latitude', 'longitude']]
locations['type'] = 'station'
schools = high_schools.rename(columns={'school_name': 'name'})
locations = locations.append(schools.loc[:, ['name', 'latitude', 'longitude']])
locations['type'].fillna('school', inplace=True)
locations.tail(20)
Out[4]:
In [5]:
!pip install folium
In [6]:
import folium
all_stations_schools_map = folium.Map([40.72, -73.92], zoom_start=11, tiles='CartoDB positron',
width='60%')
for i, location in locations.iterrows():
lat = location['latitude']
long = location['longitude']
name = popup=location['name']
if location['type'] == 'school':
marker = folium.RegularPolygonMarker([lat, long], popup=name, color='RoyalBlue',
fill_color='RoyalBlue', radius=5)
else:
marker = folium.CircleMarker([lat, long], popup=name, color='FireBrick',
fill_color='FireBrick', radius=2)
marker.add_to(all_stations_schools_map)
all_stations_schools_map.save('maps/all_stations_schools.html')
all_stations_schools_map
Out[6]:
The interactive map is available here.
It seems like there is a couple of schools with no nearby stations. Let's make sure our algorithm later ignores them.
Next we will merge both subway stations data frames by the station name. Unfortunately the names are spelled differently in both data frames so we will have to do some pre-processing. We will create a join_name
column in each data frame with the pre-processed names in order to make it work.
First, we'll process the names in the stations_traffic
data frame both automatically with str.replace()
and manually with a map
. Then we'll change the names in both stations_traffic
and stations_locations
to lower case and merge them.
In [7]:
stations_traffic['station'] = stations_traffic['station'].str.replace(' AV', ' AVE')
stations_traffic['station'] = stations_traffic['station'].str.replace(' AVEE', ' AVE')
stations_traffic['station'] = stations_traffic['station'].str.replace('AVENUE', 'AVE')
stations_traffic['station'] = stations_traffic['station'].str.replace('-', ' - ')
stations_traffic['station'] = stations_traffic['station'].str.replace('/', ' - ')
stations_traffic['station'] = stations_traffic['station'].str.replace('0 ', '0th ')
stations_traffic['station'] = stations_traffic['station'].str.replace('1 ', '1st ')
stations_traffic['station'] = stations_traffic['station'].str.replace('11st', '11th')
stations_traffic['station'] = stations_traffic['station'].str.replace('2 ', '2nd ')
stations_traffic['station'] = stations_traffic['station'].str.replace('12nd', '12th')
stations_traffic['station'] = stations_traffic['station'].str.replace('3 ', '3rd ')
stations_traffic['station'] = stations_traffic['station'].str.replace('13rd', '13th')
stations_traffic['station'] = stations_traffic['station'].str.replace('4 ', '4th ')
stations_traffic['station'] = stations_traffic['station'].str.replace('5 ', '5th ')
stations_traffic['station'] = stations_traffic['station'].str.replace('6 ', '6th ')
stations_traffic['station'] = stations_traffic['station'].str.replace('7 ', '7th ')
stations_traffic['station'] = stations_traffic['station'].str.replace('8 ', '8th ')
stations_traffic['station'] = stations_traffic['station'].str.replace('9 ', '9th ')
In [8]:
station_names_map = {
'103rd ST - CORONA': '103rd St - Corona Plaza',
'116th ST - COLUMBIA': '116th St - Columbia University',
'137th ST CITY COL': '137th St - City College',
'138th - GRAND CONC': '138th St - Grand Concourse',
'149th - GRAND CONC': '149th St - Grand Concourse',
'14TH STREET': '14th St',
'14th ST - UNION SQ': 'Union Sq - 14th St',
'15th ST - PROSPECT': '15th St - Prospect Park',
'161st - YANKEE STAD': '161st St - Yankee Stadium',
'163rd ST - AMSTERDM': '163rd St - Amsterdam Av',
'174th - 175th STS': '174th-175th Sts',
'182nd - 183rd STS': '182nd-183rd Sts',
'21st ST - QNSBRIDGE': '21st St - Queensbridge',
'2nd AVE': 'Lower East Side - 2nd Ave',
'3rd AVE 138th ST': '3rd Ave - 138th St',
'34th ST - HERALD SQ': 'Herald Sq - 34th St',
'34th ST - HUDSON YD': '34th St - Hudson Yards',
'34th ST - PENN STA': '34th St - Penn Station',
'4th AVE - 9th ST': '4th Av - 9th St',
'40th ST LOWERY ST': '40th St',
'42nd ST - PORT AUTH': '42nd St - Port Authority Bus Term',
'47th - 50th STS ROCK': '47th-50th Sts - Rockefeller Ctr',
'4AV - 9th ST': '4th Av - 9th St',
'5th AVE': '5th Ave - Bryant Pk',
'59th ST COLUMBUS': '59th St - Columbus Circle',
'61st ST WOODSIDE': 'Woodside - 61st St',
'66th ST - LINCOLN': '66th St - Lincoln Ctr',
'68ST - HUNTER CO': '68th St - Hunter College',
'75th ST - ELDERTS': '75th St - Eldert Ln',
'81st ST - MUSEUM': '81st St',
'82nd ST - JACKSON H': '82nd St - Jackson Hts',
'85th ST - FOREST PK': '85th St - Forest Pky',
'90th ST - ELMHURST': '90th St - Elmhurst Av',
'AQUEDUCT N.COND': 'Aqueduct - North Conduit Av',
'AQUEDUCT RACETR': 'Aqueduct Racetrack',
'ASTORIA DITMARS': 'Astoria - Ditmars Blvd',
'ATL AVE - BARCLAY': "Atlantic Av - Barclay's Center",
"B'WAY - LAFAYETTE": 'Broadway - Lafayette St',
'BAY PKWY': 'Bay Pky',
'BEDFORD PK BLVD': 'Bedford Park Blvd',
'BEDFORD - NOSTRAN': 'Bedford - Nostrand Aves',
'BEVERLEY ROAD': 'Beverly Rd',
'BRIARWOOD': 'Briarwood - Van Wyck Blvd',
'BROADWAY JCT': 'Broadway Junction',
'BROOKLYN BRIDGE': 'Brooklyn Bridge - City Hall',
'BUSHWICK AVE': 'Bushwick - Aberdeen',
'CANARSIE - ROCKAW': 'Canarsie - Rockaway Pkwy',
'CATHEDRAL PKWY': 'Cathedral Pkwy (110th St)',
'CENTRAL PK N110': 'Central Park North (110th St)',
'CHRISTOPHER ST': 'Christopher St - Sheridan Sq',
'CLINTON - WASH AVE': 'Clinton - Washington Aves',
'CONEY IS - STILLW': 'Coney Island - Stillwell Av',
'CROWN HTS - UTICA': 'Crown Hts - Utica Ave',
'DELANCEY - ESSEX': 'Delancey St - Essex St',
"E 143rd - ST MARY'S": "E 143rd St - St Mary's St",
'EASTCHSTER - DYRE': 'Eastchester - Dyre Ave',
'EASTN PKWY - MUSM': 'Eastern Pkwy - Bklyn Museum',
'EAST 105th ST': 'E 105th St',
'FAR ROCKAWAY': 'Far Rockaway - Mott Ave',
'FLATBUSH AVE - B.C': 'Brooklyn College - Flatbush Ave',
'FLUSHING - MAIN': 'Flushing - Main St',
'FOREST HILLS 71': 'Forest Hills - 71st Av',
'FT HAMILTON PKY': 'Ft Hamilton Pkwy',
'GRAND ARMY PLAZ': 'Grand Army Plaza',
'GRAND - NEWTOWN': 'Grand Ave - Newtown',
'GRD CNTRL - 42nd ST': 'Grand Central - 42nd St',
'HARLEM 148th ST': 'Harlem - 148 St',
'HOWARD BCH JFK': 'Howard Beach - JFK Airport',
'HOYT - SCHER': 'Hoyt - Schermerhorn Sts',
'HUNTERS PT AVE': 'Hunters Point Ave',
'JAMAICA 179th ST': 'Jamaica - 179th St',
'JAMAICA CENTER': 'Jamaica Ctr - Parsons / Archer',
'JAMAICA VAN WK': 'Jamaica - Van Wyck',
'JAY ST - METROTEC': 'Jay St - MetroTech',
'JKSN HT - ROOSVLT': 'Jackson Hts - Roosevelt Av',
'KEW GARDENS': 'Kew Gardens - Union Tpke',
'KINGSTON - THROOP': 'Kingston - Throop Aves',
'KNICKERBOCKER': 'Knickerbocker Ave',
'LEXINGTON AVE - 53': 'Lexington Ave - 53rd St',
'LEXINGTON AVE - 63': 'Lexington Ave - 63rd St',
'MARBLE HILL - 225': 'Marble Hill - 225th St',
'METS - WILLETS PT': 'Mets - Willets Point',
'MORISN AVE - SNDVW': 'Morrison Av - Soundview',
'MYRTLE - WILLOUGH': 'Myrtle-Willoughby Aves',
'MYRTLE - WYCKOFF': 'Myrtle - Wyckoff Aves',
'NASSAU ST': 'Nassau Ave',
'NEREID AVE': 'Nereid Ave (238 St)',
'MYRTLE - WILLOUGH': 'Myrtle-Willoughby Aves',
'MYRTLE - WYCKOFF': 'Myrtle - Wyckoff Aves',
'NEREID AVE': 'Nereid Ave (238 St)',
'NORWOOD 205th ST': 'Norwood - 205th St',
'OZONE PK LEFFRT': 'Ozone Park - Lefferts Blvd',
'PARK PLACE': 'Park Pl',
'QUEENS PLAZA': 'Queens Plz',
'ROCKAWAY PARK B': 'Rockaway Park - Beach 116 St',
'ROOSEVELT ISLND': 'Roosevelt Island - Main St',
'SMITH - 9th ST': 'Smith - 9th Sts',
'SUTPHIN - ARCHER': 'Sutphin Blvd - Archer Av',
'SUTTER AVE - RUTLD': 'Sutter Ave - Rutland Road',
'V.CORTLANDT PK': 'Van Cortlandt Park - 242nd St',
'VERNON - JACKSON': 'Vernon Blvd - Jackson Ave',
'W 4th ST - WASH SQ': 'W 4th St - Washington Sq (Upper)',
'W 8th ST - AQUARIUM': 'W 8th St - NY Aquarium',
'WAKEFIELD - 241': 'Wakefield - 241st St',
'WEST FARMS SQ': 'West Farms Sq - E Tremont Av',
'WESTCHESTER SQ': 'Westchester Sq - E Tremont Ave',
'WHITEHALL S - FRY': 'Whitehall St',
'WORLD TRADE CTR': 'World Trade Center'
}
In [9]:
for i, station_name in stations_traffic['station'].iteritems():
if station_name in station_names_map.keys():
better_name = station_names_map.get(station_name)
stations_traffic.set_value(i, 'station', better_name)
stations_traffic.head(10)
Out[9]:
In [10]:
stations_traffic['join_name'] = stations_traffic['station'].str.lower()
stations_locations['join_name'] = stations_locations['station'].str.lower()
We are now ready to merge both dataframes. We'll use the join_name
columns and will then discard them, keeping only stations_locations
' original names.
In [11]:
merged_stations = pd.merge(stations_locations, stations_traffic, how='left', left_on='join_name',
right_on='join_name')
In [12]:
columns_of_interest = ['station_x', 'avg_daily_traffic_feb', 'latitude', 'longitude']
stations_entrances = merged_stations[columns_of_interest]
stations_entrances = stations_entrances.rename(columns={'station_x': 'station_name'})
stations_entrances.sort_values(by='avg_daily_traffic_feb', ascending=False, inplace=True)
stations_entrances.head(15)
Out[12]:
We are done with cleaning data. Now it is time to actually perform the recommendations. We will start by creating a function that tells whether two locations (i.e., a school and a station) are within 810 meters from each other. This is a simple heuristic for calculating whether they are within walking distance from each other (10-15 minute walk).
For that we will use the Harvesine formula, which calculates the distance between 2 points on a sphere using their latitude and logitude. The Earth is nearly spherical and that formula can be used to calculate distances on it correctly to within 0.5% or so (see https://en.wikipedia.org/wiki/Great-circle_distance).
In [13]:
# Adapted from http://stackoverflow.com/questions/15736995/how-can-i-quickly-estimate-the-distance-between-two-latitude-longitude-points
from math import radians, cos, sin, asin, sqrt
def within_810m(school_lat, school_lon, station_lat, station_lon):
"""
Calculates the great circle distance between two points
on the earth (specified in decimal degrees)
"""
# convert decimal degrees to radians
school_lat, school_lon, station_lat, station_lon = map(radians,
[school_lat, school_lon,
station_lat, station_lon])
# haversine formula
earth_radius_km = 6367
d_lon = station_lon - school_lon
d_lat = station_lat - school_lat
a = sin(d_lat/2)**2 + cos(school_lat) * cos(station_lat) * sin(d_lon/2)**2
c = 2 * asin(sqrt(a))
m = earth_radius_km * 1000 * c
return m <= 810
For the recommendations we will also filter out stations with fewer than 4000 average daily entries and selected a maximum of 5 stations per high school.
Have in mind that we have sorted the dataset first by the potential college students of each school and then by the average number of daily traffic for each station.
In [14]:
result = pd.DataFrame(columns=('school_name', 'potential_college_students', 'start_time',
'end_time', 'station_name', 'avg_daily_traffic_feb', 'city'))
result_locations = pd.DataFrame(columns=('latitude', 'longitude', 'type', 'name'))
k = 0
for i, school in high_schools.iterrows():
for j, entrance in stations_entrances.iterrows():
if within_810m(high_schools.ix[i,'latitude'], high_schools.ix[i,'longitude'],
stations_entrances.ix[j,'latitude'], stations_entrances.ix[j,'longitude']):
same_school = result['school_name'] == high_schools.ix[i,'school_name']
same_station = result['station_name'] == stations_entrances.ix[j,'station_name']
if len(result[(same_school) & (same_station)]) == 0:
if len(result[(same_school)]) < 5:
if stations_entrances.ix[j,'avg_daily_traffic_feb'] >= 4000:
result.loc[k] = (high_schools.ix[i,'school_name'],
high_schools.ix[i,'potential_college_students'],
high_schools.ix[i,'start_time'],
high_schools.ix[i,'end_time'],
stations_entrances.ix[j,'station_name'],
stations_entrances.ix[j,'avg_daily_traffic_feb'],
high_schools.ix[i,'city'])
result_locations.loc[(k*2)] = (high_schools.ix[i,'latitude'],
high_schools.ix[i,'longitude'],
'school',
high_schools.ix[i,'school_name']
)
result_locations.loc[k*2+1] = (stations_entrances.ix[j,'latitude'],
stations_entrances.ix[j,'longitude'],
'station',
stations_entrances.ix[j,'station_name'])
k = k + 1
Let's take a glimpse of the distribution of the coordinates of the selected schools and stations.
In [15]:
selected_schools_stations_map = folium.Map([40.729, -73.9], zoom_start=11,
tiles='CartoDB positron', width='60%')
for i, result_location in result_locations.iterrows():
lat = result_location['latitude']
long = result_location['longitude']
name = popup=result_location['name']
if result_location['type'] == 'school':
marker = folium.RegularPolygonMarker([lat, long], popup=name, color='RoyalBlue',
fill_color='RoyalBlue', radius=7)
else:
marker = folium.CircleMarker([lat, long], popup=name, color='FireBrick',
fill_color='FireBrick', radius=5)
marker.add_to(selected_schools_stations_map)
selected_schools_stations_map.save('maps/selected_schools_stations.html')
selected_schools_stations_map
Out[15]:
The interactive map is available here.
And finally, the final recommendation.
In [16]:
result
Out[16]:
As you can see, the final recommendation has 29 stations and also includes data like start and end time for each school, as well as the city in which the school/station is located.