In [1]:
import pandas as pd
In [2]:
df = pd.read_csv('passagierfrequenz.csv', delimiter=';')
Source and documentation: http://data.sbb.ch/explore/dataset/passagierfrequenz/
DTV = Durchschnittlicher täglicher Verkehr (Montag bis Sonntag) = average daily circulation (including the weekend)
DWV = Durchschnittlicher werktäglicher Verkehr (Montag bis Freitag) = average daily circulation Mo-Friday
In [3]:
print("Q1: Which Swiss railway station is the most frequented?")
print("A: The most frequented station is Zürich HB:")
df[['Station', 'DTV']].sort_values(by='DTV', ascending=False).head(1)
Out[3]:
In [4]:
print("Q2: Which stations have a higher average daily circulation on Saturday and Sunday?")
print("A: These 21 stations:")
df[df['DTV'] > df['DWV']]
Out[4]:
In [5]:
print("Q3: Print a comma-separated list of all the comments in the Comments column. Escape them with the “\"” character and don't include any empty cell.")
comments_list = df[df['Comments'] == df['Comments']]['Comments'].tolist()
print("A: The comments are:", '"' + str.join('","',comments_list) + '".')
In [6]:
print("Q4: How many rows contains another year than 2014?")
print("A: I counted", len(df[df['Year'] != 2014]), "rows containing another year than 2014.")
In [7]:
print("Q5: What is the size (rows, columns) of the data?")
print("A: There is", df.shape[0], "rows and", df.shape[1], "columns.")
In [8]:
df[df['Station'] == 'Zürich HB']
Out[8]:
In [9]:
print("Q6: How many stations have a name starting with A?")
import re
a_stations = df[df['Station'].str.match('^A')]
print("A: There is", len(a_stations), "“A stations”. Here they are:")
a_stations
Out[9]:
In [10]:
print("Q7: Which are the least frequented stations during the work days? And the full week?")
print("A(a): During the work days:")
df[['Station', 'DWV']].sort_values(by='DWV').head(10)
Out[10]:
In [11]:
print("A(b): During the full week:")
df[['Station', 'DTV']].sort_values(by='DTV').head(10)
Out[11]:
In [12]:
print("Q8: Take the most frequented and the least frequented stations. How many times more passengers has the most frequented one?")
most_freq = df[['Station', 'DTV']].sort_values(by='DTV', ascending=False).head(1)
least_freq = df[['Station', 'DTV']].sort_values(by='DTV').head(1)
most_freq[['Station', 'DTV']]
print("A:", most_freq['Station'].tolist()[0], "has", most_freq['DTV'].tolist()[0], "average daily passengers and", least_freq['Station'].tolist()[0], str(least_freq['DTV'].tolist()[0]) + ".")
ratio = most_freq['DTV'].tolist()[0] / least_freq['DTV'].tolist()[0]
print("This means that Zurich HB has", ratio, "times more daily passengers than Oron.")
In [13]:
print("Q9: Which stations have far more passengers during work days than during the full week? Group them in a subset.")
work_days = df[df['DWV'] >= 1.35 * df['DTV']]
print("A: These", len(work_days), "stations have at least 35% more passengers during the work days:")
work_days
Out[13]:
In [14]:
print("Q10: Find a crazy station name. Is its average frequency near to the mean average frequency of all stations?")
# Let's try to find a very long name...
longnames = df[df['Station'].str.match('.{25,}')]
longnames
Out[14]:
In [15]:
# … We'll pick “Geneveys-sur-Coffrane, Les”. This is an pretty long name.
meanDTV = df['DTV'].mean()
GeneveysDTV = df[df['Code'] == 'GEC']['DTV'].values
print("A: “Geneveys-sur-Coffran, Les” has an average daily frequency of", str(GeneveysDTV[0]) + ".")
print("This is far less than", str(meanDTV) + ", the mean average frequency of all stations.")
print("However, the _median_ frequency of all stations is only", str(df['DTV'].median()) + ".")
In [16]:
print("Q11: Who else than the SBB CFF FFS (Federal Railways) owns stations? Make a list of them (remove any duplicate).")
other_owner = df[(df['Owner'] != 'CFF') & (df['Owner'] != 'SBB') & (df['Owner'] != 'FFS')]
list_owners = other_owner['Owner'].tolist()
print("A:", str.join(", ", set(list_owners)))
In [17]:
print("Q12: Print how many stations each owner has.")
print("A: Here is how many stations they have:\n" + str(df['Owner'].value_counts()))
In [18]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
standard = df[(df['DWV'] > 300) & (df['DWV'] < 2300) ]
standard.plot(kind='scatter', x='DWV', y='DTV')
print("These are the stations in Q2 and Q3 and their average daily passengers during the full week vs. the work days:")
In [19]:
plt.style.use('ggplot')
least_frequented = df.sort_values(by='DWV').head(20)
least_frequented.plot(kind='barh', x='Station', y='DTV').invert_yaxis()
print("These are the 20 least frequented stations, in average daily passengers:")
In [20]:
q1_freq = df[df['DWV'] <= 340]
q2_freq = df[(df['DWV'] <= 915) & (df['DWV'] > 340)]
q3_freq = df[(df['DWV'] <= 2700) & (df['DWV'] > 915)]
plt.scatter(y=q1_freq["DWV"], x=q1_freq["DTV"], c='c', alpha=0.75, marker='1')
plt.scatter(y=q2_freq["DWV"], x=q2_freq["DTV"], c='y', alpha=0.75, marker='2')
plt.scatter(y=q3_freq["DWV"], x=q3_freq["DTV"], c='m', alpha=0.75, marker='3')
print("Q1, Q2 and Q3 of average daily circulation; x axis = DTV, y axis = DWV")
plt.xlim(-15,2500)
plt.ylim(-30,2800)
Out[20]:
In [ ]: