Project: Project 1: Benson
Date: 01/23/2017
Name: Prashant Tatineni
Team: Connor Stefan, Prashant Tatineni, Rosie Hoyem
In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Image
%matplotlib inline
In [5]:
df_input = pd.read_csv('data/raw/ACS_15_5YR_C24010_with_ann.csv')
census_tracts = []
counts = []
for index, row in df_input.iterrows():
if index != 0:
counts.append(int(row['HD01_VD43']))
county_code = row['GEO.id2'][3:5]
tract_code = row['GEO.id2'][5:]
if county_code == '61':
census_tracts.append('1' + tract_code)
if county_code == '05':
census_tracts.append('2' + tract_code)
if county_code == '47':
census_tracts.append('3' + tract_code)
if county_code == '81':
census_tracts.append('4' + tract_code)
if county_code == '85':
census_tracts.append('5' + tract_code)
df_output = pd.DataFrame({'BoroCT2010':census_tracts, 'stem_women':counts})
df_output.to_csv('data/stem_women.csv', index=False)
In [6]:
stations = pd.read_csv('data/raw/subway_station_tracts.csv')
stem_women = pd.read_csv('data/stem_women.csv')
df = pd.merge(stations, stem_women, on='BoroCT2010')
dfviz = df[df['stem_women'] > 100]
sns.barplot(x=dfviz['stem_women'], y=dfviz['name'], hue=dfviz['BoroName'], ci=None);
plt.xlabel('Women in STEM occupations living nearby',fontsize=14);
plt.ylabel('Station Name',fontsize=14);
plt.legend(loc=1);
plt.xlim(100,400);
In [7]:
dft50 = df[df['stem_women'] > 50].sort_values('stem_women', ascending=False)[['name','BoroName','BoroCT2010','stem_women']]
sorted_stations = dft50.groupby(['name','BoroName','BoroCT2010']).max().sort_values('stem_women', ascending=False).head(50)
sorted_stations.to_csv('data/top50_stations_women.csv')
In [10]:
df1 = pd.read_csv('http://web.mta.info/developers/data/nyct/turnstile/turnstile_160507.txt')
df2 = pd.read_csv('http://web.mta.info/developers/data/nyct/turnstile/turnstile_160514.txt')
df3 = pd.read_csv('http://web.mta.info/developers/data/nyct/turnstile/turnstile_160521.txt')
df4 = pd.read_csv('http://web.mta.info/developers/data/nyct/turnstile/turnstile_160528.txt')
frames = [df1, df2, df3, df4]
df = pd.concat(frames)
In [21]:
df.columns = (u'C/A', u'UNIT', u'SCP', u'STATION', u'LINENAME', u'DIVISION', u'DATE',
u'TIME', u'DESC', u'ENTRIES',
u'EXITS')
df = df.sort(['C/A','UNIT','SCP','DATE','TIME'])
df['diffEntries'] = df.ENTRIES.diff()
df['diffExits'] = df.EXITS.diff()
df['machine_change'] = (df.SCP != df.SCP.shift())
In [22]:
# Edit records where turnstile machine changes; set to 0.
df = df.reset_index(drop=True)
df.diffEntries[df.index[df.machine_change==True]] = 0
df.diffExits[df.index[df.machine_change==True]] = 0
In [23]:
# Further edit records where turnstile counter resulted in negative counts or counts > 6000; reset to 0.
df['counterReset'] = ((df.diffEntries > 6000) | (df.diffEntries < 0) | (df.diffExits > 6000) | (df.diffExits < 0))
df.diffEntries[df.index[df.counterReset==True]] = 0
df.diffExits[df.index[df.counterReset==True]] = 0
df.head()
Out[23]:
In [26]:
df['TotalVol'] = df.diffEntries+df.diffExits
topstations = df.groupby(['STATION']).sum()
topstations = topstations.sort('TotalVol', ascending=False)
topstations.to_csv('data/volumeByStation.csv')
In [28]:
census = pd.read_csv('data/top50_stations_women.csv')
In [33]:
# Top 5 stations by women residents in tech occupations
census['census_rank'] = census.index + 1
census.head()
Out[33]:
In [34]:
ts = (pd.read_csv('data/volumeByStation.csv'))[:51]
ts['ts_rank']= ts.index + 1
ts.head()
Out[34]:
In [35]:
import re
capitalizer = lambda x: x.upper()
census['name'] = census['name'].apply(capitalizer)
clean_ave = lambda x: re.sub('AVE','AV', x)
census['name'] = census['name'].apply(clean_ave)
clean_numbers1 = lambda x: re.sub(r'(\b\d+)(RD\b)',r'\1', x)
census['name'] = census['name'].apply(clean_numbers1)
clean_numbers2 = lambda x: re.sub(r'(\b\d+)(TH\b)',r'\1', x)
census['name'] = census['name'].apply(clean_numbers2)
clean_numbers3 = lambda x: re.sub(r'(\b\d+)(ST\b)',r'\1', x)
census['name'] = census['name'].apply(clean_numbers3)
clean_numbers4 = lambda x: re.sub(r'(\b\d+)(ND\b)',r'\1', x)
census['name'] = census['name'].apply(clean_numbers4)
clean_dashes = lambda x: re.sub(r'(\w*)(\s-\s)(\w*)',r'\1-\3', x)
census['name'] = census['name'].apply(clean_dashes)
remove_parans = lambda x: re.sub(r'\(.*\)',r'', x)
census['name'] = census['name'].apply(remove_parans)
census.replace('CONCOURSE', 'CONC', inplace=True)
census.replace('AVENUE', 'AV', inplace=True)
census.replace('WASHINGTON', 'WASH', inplace=True)
census.replace('JUNCTION', 'JCT', inplace=True)
census.replace('CONCOURSE', 'CONC', inplace=True)
census.replace('WOODHAVN', 'WOODHAVEN', inplace=True)
census.replace('CENTER', 'CTR', inplace=True)
census.replace('QUEENSBRIDGE', 'QNSBRIDGE', inplace=True)
census.replace('W 4 ST-WASHINGTON SQ ', 'W 4 ST-WASH SQ', inplace=True)
In [37]:
census_ts = pd.merge(census, ts, how='inner', left_on='name', right_on='STATION')
census_ts['comb_rank'] = census_ts['ts_rank'] + census_ts['census_rank']
In [38]:
census_ts.plot.scatter('stem_women', 'TotalVol');
In [39]:
final = census_ts.sort_values('comb_rank', ascending=True)
final.head()
Out[39]: