In [72]:
import sys
import pandas as pd
print("Pandas version: ", pd.__version__)
print()
pd.set_option('display.max_rows', 500)
import numpy as np
print("Numpy version: ", np.__version__)
print()
print("system version: ", sys.version)

from datetime import datetime as dt
import calendar

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set(color_codes=True)

from IPython.display import Image

# Customizable plotting parameters: larger fonts and padding so the
# exported figures stay readable in the final presentation.
plt.rcParams["figure.figsize"] = (12,5)
plt.rcParams["xtick.labelsize"] = 15
plt.rcParams["ytick.labelsize"] = 15
plt.rcParams["axes.labelsize"] = 20
plt.rcParams['legend.fontsize'] = 15
plt.rcParams['axes.titlesize'] = 20
plt.rcParams['axes.titlepad'] = 15
plt.rcParams['axes.labelpad'] = 15

# Autosave the notebook every 120 seconds so we don't have to!
%autosave 120


Pandas version:  0.20.2

Numpy version:  1.12.1

system version:  3.6.1 |Anaconda custom (x86_64)| (default, May 11 2017, 13:04:09) 
[GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.57)]
Autosaving every 120 seconds

Obtaining the Data

The gala will be held on May 30th! We wish to give WTWY 2 weeks in order to implement our findings!

In order to use the latest results, we will be using MTA data from the 3 weeks prior to the 2-week implementation process.

Below we will clean up/standardize the headers, in addition to adding useful information to the dataframe, e.g. a unique turnstile identifier.

We will also pickle our resulting dataframe in order to ensure we can obtain the original information with speed should it be required later on.


In [14]:
# Pull the three weekly MTA turnstile files covering the 3-week analysis window.
may13 = pd.read_csv("http://web.mta.info/developers/data/nyct/turnstile/turnstile_170513.txt")
may06 = pd.read_csv("http://web.mta.info/developers/data/nyct/turnstile/turnstile_170506.txt")
april29 = pd.read_csv("http://web.mta.info/developers/data/nyct/turnstile/turnstile_170429.txt")

master_df = pd.concat([may13, may06, april29])

# Standardize headers in a single pass: strip stray whitespace and lower-case
# (the raw files pad some column names with trailing spaces).
master_df = master_df.rename(columns=lambda x: x.strip().lower())

master_df["date"] = master_df["date"].apply(lambda x: dt.strptime(x, '%m/%d/%Y')) #make datetime objects
master_df['week_day_num'] = master_df['date'].dt.dayofweek #numeric index of day of week (Monday = 0)
master_df['week_day_name'] = master_df["date"].apply(lambda x: calendar.day_name[x.dayofweek]) #names of the days of week
master_df["time"] = master_df["time"].apply(lambda x: dt.strptime(x, '%H:%M:%S').time()) #makes time objects
master_df['turnstile'] = master_df['c/a'] + " " + master_df['scp'] #identifies a unique turnstile

# Chronological order within each physical turnstile, required for the diffs later.
master_df.sort_values(["c/a", "scp", "date", "time"], inplace=True)

# drop=True discards the old index instead of adding it as an 'index' column,
# replacing the original reset_index + del two-step.
master_df.reset_index(drop=True, inplace=True)

master_df.to_pickle("master_df.pkl") #saves the clean resulting dataframe for ease of access in the future.

In [2]:
# Reload the pickled frame so later cells don't depend on the slow network fetch above.
master_df = pd.read_pickle("master_df.pkl")

In [3]:
master_df.head() #nice and clean! sanity-check the standardized headers and new columns


Out[3]:
c/a unit scp station linename division date time desc entries exits week_day_num week_day_name turnstile
0 A002 R051 02-00-00 59 ST NQR456W BMT 2017-04-22 00:00:00 REGULAR 6148025 2082120 5 Saturday A002 02-00-00
1 A002 R051 02-00-00 59 ST NQR456W BMT 2017-04-22 04:00:00 REGULAR 6148080 2082123 5 Saturday A002 02-00-00
2 A002 R051 02-00-00 59 ST NQR456W BMT 2017-04-22 08:00:00 REGULAR 6148101 2082157 5 Saturday A002 02-00-00
3 A002 R051 02-00-00 59 ST NQR456W BMT 2017-04-22 12:00:00 REGULAR 6148224 2082251 5 Saturday A002 02-00-00
4 A002 R051 02-00-00 59 ST NQR456W BMT 2017-04-22 16:00:00 REGULAR 6148524 2082316 5 Saturday A002 02-00-00

Interesting observation!

It seems that the entries and exits are accumulated (cumulative) values. Thus, we have to find the differences between subsequent readings at the turnstile level.


In [4]:
#turnstile-level deltas: groupby().diff() is the built-in equivalent of the
#original transform(lambda x: x.diff()) and avoids a Python-level lambda per group
master_df['new_entries'] = master_df.groupby('turnstile')['entries'].diff()
master_df['new_exits'] = master_df.groupby('turnstile')['exits'].diff()

#the first reading of each turnstile has no predecessor, so diff yields NaN; treat as 0.
#assigning back avoids the chained-assignment fillna(inplace=True) on a column view.
master_df['new_entries'] = master_df['new_entries'].fillna(0)
master_df['new_exits'] = master_df['new_exits'].fillna(0)

#it does not make sense to have negative values for entries or exits
#(negative deltas come from counter resets/maintenance)!
master_df = master_df[(master_df.new_entries > 0) & (master_df.new_exits > 0)]

#the total traffic at a turnstile in a given block of time (4hrs) is the sum of entries and exits!
master_df['traffic'] = master_df[['new_entries', 'new_exits']].sum(axis=1)

#since our analysis is at the turnstile level, we assume an upper bound of 5000 individuals
#through a turnstile in a block of time (filters residual counter glitches).
master_df = master_df[master_df['traffic'] <= 5000]

#3 weeks of data -> divide by 3 (weeks) then 7 (days); vectorized Series.sum()
#replaces the O(n) Python builtin sum() over the column.
average_traffic_per_day = int((master_df['traffic'].sum()/3)/7)

#let's take a look!
master_df.head()


Out[4]:
c/a unit scp station linename division date time desc entries exits week_day_num week_day_name turnstile new_entries new_exits traffic
1 A002 R051 02-00-00 59 ST NQR456W BMT 2017-04-22 04:00:00 REGULAR 6148080 2082123 5 Saturday A002 02-00-00 55.0 3.0 58.0
2 A002 R051 02-00-00 59 ST NQR456W BMT 2017-04-22 08:00:00 REGULAR 6148101 2082157 5 Saturday A002 02-00-00 21.0 34.0 55.0
3 A002 R051 02-00-00 59 ST NQR456W BMT 2017-04-22 12:00:00 REGULAR 6148224 2082251 5 Saturday A002 02-00-00 123.0 94.0 217.0
4 A002 R051 02-00-00 59 ST NQR456W BMT 2017-04-22 16:00:00 REGULAR 6148524 2082316 5 Saturday A002 02-00-00 300.0 65.0 365.0
5 A002 R051 02-00-00 59 ST NQR456W BMT 2017-04-22 20:00:00 REGULAR 6148910 2082368 5 Saturday A002 02-00-00 386.0 52.0 438.0

In [ ]:
# Headline number computed above: mean daily traffic across all turnstiles.
print("Average Traffic through turnstiles per day: ", average_traffic_per_day)

In [6]:
# Total traffic per turnstile, per station, per day of week over the 3-week window.
station_by_day = (
    master_df
    .groupby(['station', 'week_day_name', 'week_day_num', 'turnstile'])['traffic']
    .sum()
    .reset_index()
)

From NYC MTA we obtain the following top 5 stations in the city!


In [7]:
# The five busiest NYC stations, per the MTA's published ridership figures.
top_5_list = ['TIMES SQ-42 ST', 'GRD CNTRL-42 ST', '34 ST-HERALD SQ', '14 ST-UNION SQ', '34 ST-PENN STA']

In [8]:
# Keep only the rows belonging to the five busiest stations.
is_top_5 = station_by_day['station'].isin(top_5_list)
top_5_stations = station_by_day.loc[is_top_5]

In [9]:
# Aggregate total 3-week traffic for each of the top-5 stations.
top_5_grouped = (
    top_5_stations.groupby('station')['traffic'].sum().reset_index()
)

In [10]:
# Order stations busiest-first for the bar chart below (in place).
top_5_grouped.sort_values(by='traffic', ascending=False, inplace=True)

In [11]:
# drop=True discards the old index instead of adding it as an 'index' column,
# replacing the original reset_index + del two-step.
top_5_grouped.reset_index(drop=True, inplace=True)

In [12]:
# Bar chart: aggregate 3-week traffic at the five busiest stations.
ax = top_5_grouped.plot.bar(x='station')
ax.set_title("Top 5 stations - aggregate traffic over 3 weeks")
ax.set_xlabel("Stations")
ax.set_ylabel("Traffic")
plt.savefig("top_5_aggregate_traffic.png")



In [66]:
def time_bin(x):
    """Map a datetime.time to one of seven coarse time-of-day labels.

    Note: the original compared against ``datetime.time(2)`` etc., but this
    notebook only imports ``from datetime import datetime as dt`` — the bare
    name ``datetime`` is undefined, so the original raised NameError on a
    fresh kernel. Since every boundary falls on a whole hour, comparing on
    ``x.hour`` is equivalent and needs no extra import.
    """
    if x.hour < 2:
        return "00:00-01:59"
    elif x.hour < 6:
        return "02:00-05:59"
    elif x.hour < 10:
        return "06:00-09:59"
    elif x.hour < 14:
        return "10:00-13:59"
    elif x.hour < 18:
        return "14:00-17:59"
    elif x.hour < 22:
        return "18:00- 21:59"
    else:
        return "22:00-23:59"
# Label every observation with its 4-hour time-of-day bucket.
master_df["Time_Bin"] = master_df["time"].apply(time_bin)

In [67]:
# Total traffic per time-of-day bucket, for the top-5 stations only.
top_5_mask = master_df['station'].isin(top_5_list)
time_breakdown = (
    master_df[top_5_mask]
    .groupby(['station', 'Time_Bin'])['traffic']
    .sum()
    .reset_index()
)

In [71]:
# One bar chart per station: foot traffic by time-of-day bucket.
for station in top_5_list:
    station_rows = time_breakdown[time_breakdown['station'] == station]
    station_rows.set_index('Time_Bin').plot(kind='bar')
    plt.title(station)
    plt.xlabel("Time Frame")
    plt.ylabel("Foot Traffic")
    plt.tight_layout()
    plt.savefig(station + " traffic_by_time.png")



In [154]:
# Collapse turnstiles: total traffic per station per day of week.
station_day_breakdown_df = (
    station_by_day.groupby(['station', 'week_day_num']).sum().reset_index()
)

In [155]:
# One line chart per station: day-of-week traffic profile over the 3 weeks.
for station in top_5_list:
    day_profile = station_day_breakdown_df[station_day_breakdown_df['station'] == station]
    day_profile.set_index('week_day_num').plot()
    plt.title(station + " Station over 3-week period")
    plt.xlabel("Days (Monday = 0)")
    plt.ylabel("Traffic")
    plt.savefig(station + ".png")


Demographics

The demographic information for Time-Sq, Grand-Central, Herald-Sq and Penn-Station

Time-Sq, Grand-Central, Herald-Sq and Penn Station are all located in the same census tract. This is great news as we can aggregate demographic information for 4/5 top stations!

Below is an image of the census tract!


In [156]:
# Render the census-tract map for midtown (local file; must exist alongside the notebook).
Image("midtown.jpg")


Out[156]:

Let's look at some key target demographics!


In [1]:
# NOTE(review): "denisty" is a typo for "density", but later cells reference
# this exact name, so it is kept unchanged. Values are percentages.
gender_denisty_dict = {"Female": 51.7, "Male": 48.2}

# Percent of midtown population in each key age bracket.
age_density_dict = {"20-24":11.5, "25-29":15.9, "30-34":11.9, "35-39": 8.6}


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-f2e15bee093f> in <module>()
      3 age_density_dict = {"20-24":11.5, "25-29":15.9, "30-34":11.9, "35-39": 8.6}
      4 
----> 5 genderdata = pd.DataFrame.from_dict(gender_denisty_dict)

NameError: name 'pd' is not defined

In [158]:
plt.figure(figsize=(12, 6))
pd.Series(gender_denisty_dict).plot(kind="barh")
# barh puts the categories (gender) on the y-axis and the values (percentage)
# on the x-axis — the original labels were swapped (cf. the age plot below).
plt.xlabel("Percentage")
plt.ylabel("Gender")
plt.title("Gender Breakdown Midtown")
plt.tight_layout()
plt.savefig("Gender_breakdown_midtown.png")



In [179]:
plt.figure(figsize=(12, 6))
pd.Series(age_density_dict).plot(kind="barh")
plt.xlabel("Percentage")
plt.ylabel("Age group")
plt.title("Age Breakdown Midtown")
plt.tight_layout()
# explicit .png extension, consistent with every other savefig call in the notebook
plt.savefig("Age_breakdown_Midtown.png")



In [161]:
# Aggregate census-tract demographics for midtown (local CSV; covers 4 of the top-5 stations).
midtown_demo = pd.read_csv("midtown_agg_demo.csv")

In [162]:
# Remove the two blank spreadsheet-export columns in one call, then drop NaN rows.
midtown_demo = midtown_demo.drop(['Unnamed: 3', 'Unnamed: 4'], axis=1)
midtown_demo.dropna(inplace=True)

In [163]:
# "12.3%" -> 12.3: the vectorized .str accessor replaces the original pair of
# Python-level apply(lambda ...) passes (str() then strip('%') + float()).
midtown_demo['Percent'] = (
    midtown_demo['Percent'].astype(str).str.strip('%').astype(float)
)

In [164]:
def graph(col1, col2):
    """Draw a horizontal bar chart of col2 values keyed by col1 labels.

    Parameters
    ----------
    col1 : pandas.Series
        Category labels. Duplicate labels collapse to the last value,
        since the pairs are funneled through a dict.
    col2 : pandas.Series
        Numeric values (percentages); bars are sorted ascending.
    """
    # dict(zip(...)) — the intermediate list() in the original was redundant
    pd.Series(dict(zip(col1, col2))).sort_values().plot(kind='barh')
    plt.xlabel("Percent")

In [165]:
# Race breakdown chart for the midtown tract.
graph(midtown_demo['Race'], midtown_demo['Percent'])
plt.ylabel("Race")
plt.title("Race Breakdown Midtown")
plt.tight_layout()
plt.savefig("Race_breakdown_midtown.png")



In [166]:
# Asian subgroup population breakdown for the midtown tract (local CSV).
midtown_asian_pop = pd.read_csv("midtown_asian_breakdown.csv")

In [167]:
# Drop any column containing NaNs (presumably blank export columns — TODO confirm against the CSV).
midtown_asian_pop.dropna(axis=1, inplace=True)

In [168]:
# Drop rows 0, 14 and 15 — presumably header/total rows in the export; verify against the CSV.
midtown_asian_pop.drop([0,14,15], inplace=True)

In [204]:
# Asian subgroup breakdown chart for the midtown tract.
graph(midtown_asian_pop['SELECTED ASIAN SUBGROUPS'], midtown_asian_pop['Percent'])
plt.title("Asian Sub-Groups Midtown")
plt.ylabel("Sub group")
plt.tight_layout()
plt.savefig("asian breakdown midtown.png")



In [184]:
# Hispanic subgroup breakdown for the midtown tract; drop NaN-bearing export columns.
midtown_hispanic_pop = pd.read_csv("midtown_hispanic_breakdown.csv")
midtown_hispanic_pop.dropna(axis=1, inplace=True)

In [203]:
# Hispanic subgroup breakdown chart for the midtown tract.
graph(midtown_hispanic_pop['hispanic_subgroup'], midtown_hispanic_pop['Percent'])
plt.title("Hispanic Sub-Groups Midtown")
plt.ylabel("Sub group")
plt.tight_layout()
plt.savefig("hispanic breakdown midtown.png")



In [170]:
# Income-bracket distribution for the midtown tract; drop NaN-bearing export columns.
midtown_income = pd.read_csv("midtown_income.csv")
midtown_income.dropna(axis=1, inplace=True)

In [215]:
# Income-bracket distribution chart for the midtown tract.
graph(midtown_income['INCOME AND BENEFITS'], midtown_income['Percentage'])
plt.ylabel("Income Brackets")
plt.title("Income Bracket Midtown")
plt.tight_layout()
plt.savefig("Income_midtown_graph.png")


Let's look at some key stats for our last popular station - Union Sq


In [172]:
# Render the census-tract map for Union Sq (local file; must exist alongside the notebook).
Image("union.png")


Out[172]:

In [174]:
# NOTE(review): "denisty" typo kept — the plotting cell below references this exact name.
gender_denisty_dict_union = {"Female": 48.7, "Male": 51.3}

# Percent of Union Sq population in each key age bracket.
age_density_dict_union = {"20-24":8.7, "25-29":13.8, "30-34":12.6, "35-39": 9.5}

In [175]:
plt.figure(figsize=(12, 6))
pd.Series(gender_denisty_dict_union).plot(kind="barh")
# barh puts the categories (gender) on the y-axis and the values (percentage)
# on the x-axis — the original labels were swapped (cf. the age plot below).
plt.xlabel("Percentage")
plt.ylabel("Gender")
plt.title("Gender Breakdown union")
plt.tight_layout()
plt.savefig("Gender_breakdown_union.png")



In [178]:
plt.figure(figsize=(12, 6))
pd.Series(age_density_dict_union).sort_values().plot(kind="barh")
plt.xlabel("Percentage")
plt.ylabel("Age group")
plt.title("Age Breakdown Union")
plt.tight_layout()
# explicit .png extension, consistent with every other savefig call in the notebook
plt.savefig("Age_breakdown_Union.png")



In [197]:
# Aggregate census-tract demographics for Union Sq (local CSV).
union_agg_demo = pd.read_csv("union_agg_demo.csv")

In [198]:
# Drop the blank export column and row 6 — presumably a footer/total row; verify against the CSV.
del union_agg_demo['Unnamed: 3']
union_agg_demo.drop([6], inplace=True)

In [200]:
# Race breakdown chart for Union Sq.
# NOTE(review): unlike the midtown version, this chart gets no title/labels and
# is never saved — confirm whether that is intentional.
graph(union_agg_demo['RACE'], union_agg_demo['Percent'])



In [208]:
# Asian subgroup population breakdown for Union Sq (local CSV).
union_asian = pd.read_csv("union_asian_breakdown.csv")

In [209]:
# Drop NaN-bearing export columns, then chart the Union Sq Asian subgroup breakdown.
union_asian.dropna(axis=1, inplace=True)
graph(union_asian['SELECTED ASIAN SUBGROUPS'], union_asian['Percent'])
plt.title("Asian Breakdown Union")
plt.ylabel("Sub group")
plt.tight_layout()
plt.savefig("Asian breakdown union.png")



In [213]:
# Hispanic subgroup breakdown for Union Sq.
# NOTE(review): "breakdon" looks like a typo, but it must match the actual file
# name on disk — confirm before renaming.
union_hispanic = pd.read_csv("union_hispanic_breakdon.csv")

In [214]:
# Hispanic subgroup breakdown chart for Union Sq.
graph(union_hispanic['Hispanic_subgroup'], union_hispanic['Percent'])
plt.ylabel("Sub group")
plt.title(" Hispanic Breakdown Union")
plt.tight_layout()
plt.savefig("hispanic breakdown union.png")



In [216]:
# Income-bracket distribution for Union Sq (local CSV).
union_income = pd.read_csv("union_income.csv")

In [218]:
graph(union_income['INCOME AND BENEFITS'], union_income['Percent'])
plt.ylabel("Brackets")
plt.title("Income distribution Union")
plt.tight_layout()
# explicit .png extension, consistent with every other savefig call in the notebook
plt.savefig("Income distribution union.png")



In [ ]: