Source and documentation: http://data.sbb.ch/explore/dataset/equipement/
In [1]:
import pandas as pd
In [2]:
df = pd.read_csv("stations.csv", delimiter=';')
In [3]:
df.head(5)
Out[3]:
In [4]:
print("Q1: How many train station is there in Switzerland? How many of them are accessible for wheelchairs?")
accessible_perron = df[df['stepless_perron'] == 1]
print("A: There is", df.shape[0], "stations.", accessible_perron.shape[0], "of them have a stepless perron.")
#df.describe()
#df['accessible_ticket'].value_counts()
In [5]:
import math
print("Q2: What does “bats” stand for? Is there interesting data about that?")
print("A: This field contains comments in french, italian and german about the stations.")
print("The metadata says: “Comment field (Billet Automat Touch Screen?)” with a question mark.")
bats = df[df['bats'] == df['bats']] # check if equals to himself to get rid of the NaN
#bats[['station_name','bats']].head(5) # to explore the data
print("Some stations are marked as “not served” in french. The “bats” column enables us to get a non-extensive list of these:")
bats[bats['bats'] == 'non desservi'][['station_name','bats']]
Out[5]:
In [6]:
print("Q3: How many stations offer help to board and get out of the trains? What percentage of the stations does it represent?")
with_help = df['wheelchair_load'].value_counts()[1]
total = with_help + df['wheelchair_load'].value_counts()[0]
percentage = (with_help*100) / total
print("A:", with_help, "stations offer help to board and get out of trains. This represents", "%.2f" % percentage + "% of the total.")
In [7]:
print("Q4: Which are the 10 least equipped stations?")
# For our rating, we just want to know if “automat_ktu” is more than 1, and not add the actual number.
def automat(x):
if x > 0:
return 1
else:
return 0
df['rating'] = df['accessible_ticket'] + df['accessible_wc'] + df['wheelchair_load'] + df['mobilift'] + df['stepless_perron'] + df['autelca'] + df['automat_ktu'].apply(automat)
zeroRatingCount = df['didok'][df['rating'] == 0].count()
print("A: These 10 stations have zero recorded equipment. But", zeroRatingCount-10, "other station have no equipment.")
df[['station_name', 'rating']].sort_values(by='rating').head(10)
Out[7]:
In [8]:
print('Q5: Is there areas with a lot of non-equipped stations?')
zeroRating = df[df['rating'] == 0]
# posList = df['geopos'].tolist
geopos_array = []
neighbors_list = []
# 1) store all positions in an array
for index, row in zeroRating.iterrows():
geopos = row['geopos'].split(', ')
latlng = [float(item) for item in geopos]
geopos_array.append(latlng)
neighbors_list.append(row['station_name'])
# 2) store “neighbour” non-equipped stations in two new columns
# [latitude = 47.452833265 N, longitude = 8.70557069755 E]
# In Switzerland, 1 ~= 110 km for latitude, 75 km for longitude
# Let's say 50 km is enough to be "neighbour" stations (31 miles, a bit more than Columbia University - Newark)
# It's a rough calculation: the surface in which cities are "neighbour" is a square and not a circle.
latdistance = 0.4
longdistance = 0.6
df['neighbors_unequipped'] = 0
df['neighbors_names'] = ''
for index, row in zeroRating.iterrows(): # iteration 1: through the stations
neighbors_count = 0
neighbors_index = 0
neighbors_currlist = []
geopos = row['geopos'].split(', ')
latlng = [float(item) for item in geopos]
for loc in geopos_array: # iteration 2: through the array of locations
if loc != latlng: # we check if this isn't the current station location
if (abs(loc[0] - latlng[0]) < latdistance) and (abs(loc[1] - latlng[1]) < longdistance):
neighbors_count += 1
neighbors_currlist.append(neighbors_list[neighbors_index])
neighbors_index += 1
df.loc[index, 'neighbors_unequipped'] = neighbors_count
df.loc[index, 'neighbors_names'] = str.join(', ', neighbors_currlist)
df[df['neighbors_unequipped'] > 0].sort_values(by='neighbors_unequipped', ascending=False).head(10)
top_unequipped = df[df['neighbors_unequipped'] > 0].sort_values(by='neighbors_unequipped', ascending=False)
print("[We create two new columns and get this list -- text answer in the next cell]")
top_unequipped[['station_name', 'neighbors_unequipped', 'neighbors_names']].head(10)
Out[8]:
In [9]:
# 3) new for loop to get a “clean” list of different locations,
# which means we avoid printing two stations from the same area
name_list = []
reject_list = []
for index, row in top_unequipped.iterrows(): # iteration 1: through the stations
rejects = row['neighbors_names'] # we want to get the different areas
reject_list.extend(rejects.split(', '))
if row['station_name'] not in reject_list:
name_list.append("“" + row['station_name'] + "”: " + str(row['neighbors_unequipped']))
print("Answer to Q5: These places count the most unequipped stations within about 50 km:", str.join(', ', name_list))
In [10]:
print("Q6: What is the average station “rating”, based on the number of equipments?")
print("A: The average rating is", df['rating'].mean())
In [11]:
print("Q7: Merge this data with the passenger frequence. Is it a perfect match?")
dfp = pd.read_csv('passagierfrequenz.csv', delimiter=';')
dfm = df.merge(dfp, how='inner', left_on='station_name', right_on='Station')
print("A: It isn’t a perfect match. The “equipment” dataset has", df.shape[0], "rows; the “frequency” dataset has", dfp.shape[0], "rows; the merged dataset has only", dfm.shape[0], "rows.")
print("Here is the merged dataset:")
dfm
Out[11]:
In [12]:
print("Q8: Find the mismatches.")
# We do an outer join, then we find out which rows are incomplete
dfouter = df.merge(dfp, how='outer', left_on='station_name', right_on='Station')
mismatches = dfouter[dfouter['Station'] != dfouter['station_name']][['Station', 'station_name']]
print("A: Here are the", len(mismatches), "unmatched rows:")
mismatches
Out[12]:
In [13]:
print("Q9: Which are the most frequented stations with no equipment?")
print("A: These are the top 5:")
dfm[['Station', 'rating', 'DTV']][dfm['rating'] == 0].sort_values('DTV', ascending=False).head(5)
Out[13]:
In [14]:
print("Q10: Which are the most frequented stations with poor equipment (a rating of 3)?")
print("A: These are the top 5. Schlieren has more than 10,000 daily passengers.")
dfm[['Station', 'rating', 'DTV']][dfm['rating'] == 3].sort_values('DTV', ascending=False).head(5)
Out[14]:
In [15]:
print("Q11: Does any of the top 20 most frequented stations have missing equipments?")
most_frequented = dfm.sort_values('DTV', ascending=False).head(20)
most_frequented_missing = most_frequented[most_frequented['rating'] < 6]
print("These very frequented stations don't have a complete equipment:")
most_frequented_missing[['Station', 'DTV', 'rating']]
Out[15]:
In [16]:
print("Q12: How many “automat KTU” is there in all Swiss railway stations? How many stations have one?")
print("A: There is", int(df['automat_ktu'].sum()), "“automat KTU”.")
print(df['automat_ktu'].apply(automat).sum(), "stations have at least one “automat KTU”.")
In [17]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('fivethirtyeight')
df['rating'].value_counts().plot(kind='barh', label='Yolool').invert_yaxis()
plt.ylabel('Rating')
plt.xlabel('Number of stations')
print("Most railway stations (more than 250) have a rating of 1 or 2:")
In [18]:
plt.style.use('ggplot')
most_frequented.plot(kind='barh', x='Station', y='DTV').invert_yaxis()
fig = plt.gcf()
fig.set_size_inches(18, 7, forward=True)
plt.rcParams.update({'font.size': 16})
plt.rc('ytick', labelsize=14)
print("These are the most frequented stations, in average daily passengers:")
In [19]:
most_frequented.plot(kind='scatter', x='rating', y='DTV')
print("The same stations and their equipment rating:")
In [ ]: