In [2]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline
In [182]:
# Subway entrances per station in 15-minute intervals during 2014.
# The CSV is almost 1 GB, so a local copy is used when one is available.
url = 'https://recursos-data.buenosaires.gob.ar/ckan2/subte-molinetes/molinetes_2014.csv'
filePath = '/home/pipe/Descargas/molinetes_2014.csv'
# Fall back to the remote file when the local copy does not exist, so the
# notebook is not tied to one machine's directory layout.
dataSource = filePath if os.path.exists(filePath) else url
rawData = pd.read_csv(
    dataSource,
    sep=';',
    usecols=[0, 1, 3, 5, 9],  # selected by position; named on the next statement
    parse_dates=[0],
    # NOTE(review): if the file writes dates day-first, dayfirst=True is needed here — confirm
)
rawData.columns = ['date', 'time', 'line', 'station', 'trips']
rawData.head()
Out[182]:
In [183]:
# Normalise station labels that are spelled inconsistently in the raw data.
# The result is assigned back to the column: replace(inplace=True) on the
# Series returned by attribute access may operate on a temporary copy and
# leave rawData itself unchanged.
rawData['station'] = rawData['station'].replace({
    'Saenz Peña': 'Saenz Pena',
    'Independencia.': 'Independencia',
})
In [184]:
# Keep working days only. weekday: Monday=0 ... Sunday=6, so 5 and 6 are dropped.
# The weekday is computed with the vectorised .dt accessor instead of a
# per-row lambda, and no temporary column is added to rawData.
# NOTE(review): relies on 'date' having been parsed as datetimes — confirm
dayOfWeek = rawData['date'].dt.weekday.rename('day')
# Row counts per weekday before and after the filter, as a sanity check
print(dayOfWeek.value_counts())
rawData = rawData[dayOfWeek < 5]
print(dayOfWeek[dayOfWeek < 5].value_counts())
In [185]:
# Overall rush-hour profile: mean of 'trips' over all rows sharing a time slot
gTime = rawData.groupby('time')['trips'].mean()
In [186]:
fig = plt.figure(figsize=(16, 6))
ax = fig.add_subplot(1, 1, 1)
# Threshold = 14th highest slot mean. .iloc makes the lookup positional;
# gTime is indexed by time label, so a bare [13] is not a position.
rushValue = gTime.sort_values(ascending=False).iloc[13]
# Rush hour: 8 to 9:30 and 17 to 18:30
# Slots below the threshold are greyed out, the rest are highlighted
colors = ['grey' if (x < rushValue) else 'firebrick' for x in gTime]
# Data is passed by keyword and drawn on the axes created above
sns.barplot(x=list(range(len(gTime))), y=gTime.values, palette=colors, ax=ax)
ax.set_xticks(range(len(gTime)))
# Trailing semicolon hides the list of tick-label objects in the cell output
ax.set_xticklabels(list(gTime.index), rotation=270);
Out[186]:
In [187]:
lines = rawData.line.unique()
# Rush-hour profile per line: mean trips for every (line, time slot) pair.
# Only 'trips' is aggregated; averaging the whole frame would also try to
# average non-numeric columns such as 'station'. The double brackets keep the
# result a DataFrame, so `.trips` still works on it.
gTimeLine = rawData.groupby(['line', 'time'])[['trips']].mean()
In [188]:
# Colour used to draw each line, keyed by the line letter
lineColors = dict(
    A='dodgerblue',
    B='red',
    C='mediumblue',
    D='forestgreen',
    E='purple',
    H='yellow',
    P='coral',
)
In [189]:
fig = plt.figure(figsize=(16, 36))
# Rush hour 8 to 9:30 and 17 to 18:30 seems to fit every line
for idLine, line in enumerate(lines):
    tripTimes = gTimeLine.loc[line].trips
    # 12th highest slot mean of this line; .iloc because the index holds
    # time labels, not integer positions
    rushValueLine = tripTimes.sort_values(ascending=False).iloc[11]
    # Slots below the threshold are grey, the rest use the line's own colour
    colors = ['grey' if (x < rushValueLine) else lineColors[line] for x in tripTimes]
    # One subplot row per line rather than a fixed 6-row grid
    ax = fig.add_subplot(len(lines), 1, idLine + 1)
    sns.barplot(x=list(range(len(tripTimes))), y=tripTimes.values, palette=colors, ax=ax)
    ax.set_title('Line ' + line)
    ax.set_xticks(range(len(tripTimes)))
    ax.set_xticklabels(list(tripTimes.index), rotation=270)
In [190]:
# Time slots that make up each rush-hour window
morningRush = ['08:00', '08:15', '08:30', '08:45', '09:00', '09:15', '09:30']
eveningRush = ['17:00', '17:15', '17:30', '17:45', '18:00', '18:15', '18:30']
# Filter first, then copy: only the selected rows are duplicated instead of
# copying the whole frame before discarding most of it.
morningRushData = rawData[rawData.time.isin(morningRush)].copy()
eveningRushData = rawData[rawData.time.isin(eveningRush)].copy()
In [191]:
# Mean trips per (line, station) within each rush-hour window.
# Only 'trips' is aggregated; the remaining columns are not numeric measures.
# The double brackets keep each result a DataFrame with a 'trips' column.
morningRushStations = morningRushData.groupby(['line', 'station'])[['trips']].mean()
eveningRushStations = eveningRushData.groupby(['line', 'station'])[['trips']].mean()
In [198]:
# Station labels present for line H in the morning rush data
morningRushStations.loc['H'].index.tolist()
Out[198]:
In [199]:
# Order in which the stations of each line are drawn, keyed by line letter.
# Labels must match the 'station' values exactly, trailing dots included.
stationOrder = {
    'A': [
        'San Pedrito', 'Flores', 'Carabobo', 'Puan', 'Primera Junta', 'Acoyte',
        'Rio de Janeiro', 'Castro Barros', 'Loria', 'Plaza Miserere', 'Alberti',
        'Pasco', 'Congreso', 'Saenz Pena', 'Lima', 'Piedras', 'Peru',
        'Plaza de Mayo',
    ],
    'B': [
        'Rosas', 'Echeverria', 'Los Incas', 'Tronador', 'Federico Lacroze',
        'Dorrego', 'Malabia', 'Angel Gallardo', 'Medrano', 'Carlos Gardel',
        'Pueyrredon', 'Pasteur', 'Callao', 'Uruguay', 'Carlos Pellegrini',
        'Florida', 'Leandro N. Alem',
    ],
    'C': [
        'Retiro', 'General San Martin', 'Lavalle', 'Diagonal Norte',
        'Avenida de Mayo', 'Mariano Moreno', 'Independencia', 'San Juan',
        'Constitucion',
    ],
    'D': [
        'Congreso de Tucuman', 'Juramento', 'Jose Hernandez', 'Olleros',
        'Ministro Carranza', 'Palermo', 'Plaza Italia', 'Scalabrini Ortiz',
        'Bulnes', 'Aguero', 'Pueyrredon.', 'Facultad de Medicina', 'Callao.',
        'Tribunales', '9 de julio', 'Catedral',
    ],
    'E': [
        'Pza. de los Virreyes', 'Varela', 'Medalla Milagrosa', 'Emilio Mitre',
        'Jose Maria Moreno', 'Avenida La Plata', 'Boedo', 'Urquiza', 'Jujuy',
        'Pichincha', 'Entre Rios', 'San Jose', 'Independencia',
        'General Belgrano', 'Bolivar',
    ],
    'H': [
        'Corrientes', 'Once', 'Venezuela', 'Humberto I', 'Inclan', 'Caseros',
        'Patricios', 'Hospitales',
    ],
}
In [201]:
# One row per line, two columns: morning rush on the left, evening on the right
fig = plt.figure(figsize=(16, 36))
rushDatasets = [('morning', morningRushStations), ('evening', eveningRushStations)]
plotID = 1
for line in lines:
    for daytime, dataset in rushDatasets:
        # reset_index() returns a new frame with 'station' as a regular
        # column, so nothing is assigned into a slice of the grouped data
        tripsByStation = dataset.loc[line].reset_index()
        ax = fig.add_subplot(len(lines), 2, plotID)
        # Bars use the line's own colour. The `colors` list left over from
        # the time-slot plots has one entry per time slot of the last line
        # plotted and does not correspond to stations.
        sns.barplot(x=tripsByStation.station, y=tripsByStation.trips,
                    color=lineColors[line], order=stationOrder[line], ax=ax)
        ax.set_title('Line ' + line + ' ' + daytime)
        ax.set_xticks(range(len(stationOrder[line])))
        ax.set_xticklabels(stationOrder[line], rotation=270)
        plotID += 1
In [204]:
# Single-line view: evening rush on line C
# (use morningRushStations instead to look at the morning rush)
dataset = eveningRushStations
# reset_index() returns a new frame with 'station' as a regular column,
# so nothing is assigned into a slice of the grouped data
tripsByStation = dataset.loc['C'].reset_index()
fig = plt.figure(figsize=(16, 6))
ax = fig.add_subplot(1, 1, 1)
# Bars use the line's own colour; the `colors` list left over from the
# time-slot plots has one entry per time slot and does not match stations
sns.barplot(x=tripsByStation.station, y=tripsByStation.trips,
            color=lineColors['C'], ax=ax)
ax.set_xticks(range(len(tripsByStation.station)))
# Trailing semicolon hides the list of tick-label objects in the cell output
ax.set_xticklabels(tripsByStation.station, rotation=270);
Out[204]:
In [203]:
#TODO: define the station order for every line
#TODO: for each station and each time slot, compute the MEAN number of trips
In [ ]: