In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import re

Population preprocessing


In [2]:
file = "../data/population_suisse_clean.xlsx"

df_pop = pd.read_excel(file)
df_pop = df_pop.reset_index()

Remove district information


In [3]:
df_pop.drop(df_pop['index'].str.startswith(">>"), inplace=True)

Remove NPZ information and parenthesis information


In [4]:
clean_name = lambda s: s.partition(' ')[2]
remove_sup = lambda s: re.sub(r'\(*\ [^)]*\)', '', s)

df_pop['index'] = df_pop['index'].apply(clean_name)
df_pop['index'] = df_pop['index'].apply(remove_sup)

In [5]:
name2pop = {}
def create_dict(row):
    name2pop.update({row['index']: row["Etat de la population"]})

In [6]:
a = df_pop.apply(create_dict, axis=1)

Capacity preprocessing


In [7]:
file = '../data/2017-01-30_out.csv'
df_train = pd.read_csv(file)


/Users/TristanO/anaconda3/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2723: DtypeWarning: Columns (8) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

Clean stop id for aggregation


In [8]:
clean_id = lambda s : s.partition(':')[0]

df_train['stop_id'] = df_train['stop_id'].apply(clean_id)

Aggregate


In [9]:
most_present = lambda x: x.value_counts().index[0]

In [10]:
df = df_train.groupby(['stop_id']).agg({'name': most_present,'id': 'count'})
df["amount_of_train"] = df.id
df = df[["amount_of_train", "name"]]
df


Out[10]:
amount_of_train name
stop_id
8014428 130 Weil am Rhein
8014429 130 Weil am Rhein-Ost
8014439 75 Riehen
8014440 205 Lorrach-Stetten
8014441 205 Lorrach Hbf
8014443 182 Lorrach Haagen/Messe
8014444 182 Lorrach Brombach/Hauingen
8014445 182 Steinen (D)
8014446 70 Maulburg
8014447 86 Schopfheim
8014448 68 Fahrnau
8014449 68 Hausen-Raitbach
8014450 68 Zell (Wiesental)
8014474 38 Waldshut
8014481 36 Erzingen (Baden)
8014482 36 Wilchingen-Hallau
8014483 36 Neunkirch
8014484 56 Beringen Bad Bf
8014485 112 Neuhausen Bad Bf
8014487 36 Herblingen
8014488 36 Trasadingen
8014554 66 Engen
8014555 66 Welschingen-Neuhausen
8014556 66 Muhlhausen (b Engen)
8014558 88 Singen (Hohentwiel)
8014570 72 Bohringen-Rickelshausen
8014571 72 Radolfzell
8014580 72 Markelfingen
8014581 72 Allensbach
8014582 72 Hegne
... ... ...
8516176 89 Klingnau
8516187 136 Zug Schutzengel
8516219 49 Mellingen Heitersberg
8516283 63 Geneve-Secheron
8516350 76 Buchrain
8516353 72 Baldegg Kloster
8517112 18 Walchwil Horndli
8517131 86 Pratteln Salina Raurica
8517336 47 Luzern Verkehrshaus
8517519 68 Stabio
8517527 68 Zug Casino
8518243 56 Beringerfeld
8518452 117 Prilly-Malley
8518459 77 Steinhausen Rigiblick
8518475 89 Mendrisio S. Martino
8518771 69 Biel/Bienne Bozingenfeld/Champ
8518924 56 Neuhausen Rheinfall
8518963 69 Solothurn Allmend
8718206 13 Mulhouse Ville
8718213 96 Saint-Louis (Haut-Rhin)
8771500 6 Pontarlier (F)
8771513 8 Frasne
8771521 2 Labergement-Ste-Marie
8771861 12 Morteau
8772202 1 Lyon-Perrache
8772319 1 Lyon-Part-Dieu
8774107 1 Culoz
8774371 1 Amberieu
8774500 51 Bellegarde (Ain)
8774538 17 Pougny-Chancy

683 rows × 2 columns


In [11]:
def get_pop(x):
    if x in name2pop:
        return name2pop[x]
    x1 = x.split(" ")[0]
    if x1 in name2pop:
        return name2pop[x1]
    x2 = x.split("-")[0]
    if x2 in name2pop:
        return name2pop[x2]

In [12]:
df['pop'] = df.name.apply(get_pop)

In [71]:
df_okay = df[~df['pop'].isnull()]

In [72]:
sum(df['pop'].isnull())


Out[72]:
283

In [73]:
df_okay.plot.scatter(x='amount_of_train', y='pop', logy=True);



In [74]:
name2pop["Prilly"]


Out[74]:
12058

In [78]:
df_okay[df_okay.amount_of_train < 3]


Out[78]:
amount_of_train name pop
stop_id
8503408 1 Rekingen AG 990.0
8506011 2 Eschlikon 4228.0
8506013 2 Aadorf 8599.0
8506014 2 Elgg 4146.0
8507478 2 Frutigen 6793.0
8507495 2 Leissigen 976.0

In [15]:
df_okay


Out[15]:
name id pop
stop_id
8014439 Riehen 75 20774.0
8014483 Neunkirch 36 2051.0
8014488 Trasadingen 36 603.0
8500020 Muttenz 172 17538.0
8500021 Pratteln 173 15601.0
8500023 Liestal 198 13956.0
8500024 Lausen 83 5044.0
8500025 Itingen 84 2069.0
8500026 Sissach 196 6469.0
8500027 Gelterkinden 118 5871.0
8500028 Tecknau 80 860.0
8500031 Diepflingen 37 685.0
8500034 Buckten 37 698.0
8500037 Trimbach 37 6434.0
8500100 Tavannes 60 3572.0
8500101 Reconvilier 60 2329.0
8500103 Sorvilier 43 265.0
8500104 Court 43 1432.0
8500105 Moutier 153 7629.0
8500113 Laufen 123 5491.0
8500114 Zwingen 86 2242.0
8500115 Grellingen 86 1810.0
8500116 Duggingen 82 1470.0
8500117 Aesch 86 10155.0
8500125 Courgenay 80 2258.0
8500126 Porrentruy 84 6798.0
8500128 Boncourt 41 1232.0
8500141 Courchavon 41 294.0
8500200 Pieterlen 69 3971.0
8500201 Lengnau 69 2651.0
... ... ... ...
8506109 Amriswil 40 12814.0
8506121 Romanshorn 40 10659.0
8506131 Kreuzlingen 31 21290.0
8506208 Uzwil 68 12695.0
8506209 Flawil 68 10323.0
8506217 Sulgen 6 3663.0
8506302 St. Gallen 165 75310.0
8506311 Rorschach 32 9100.0
8506314 St. Margrethen 40 5777.0
8507000 Bern 408 130015.0
8507100 Thun 104 43303.0
8507478 Frutigen 2 6793.0
8507483 Spiez 104 12555.0
8507495 Leissigen 2 976.0
8508005 Burgdorf 73 15998.0
8508006 Wynigen 39 2016.0
8508008 Herzogenbuchsee 73 6956.0
8508100 Langenthal 130 15316.0
8508102 Murgenthal 56 2876.0
8508103 Rothrist 57 8515.0
8509000 Chur 112 34547.0
8509002 Landquart 112 8692.0
8509003 Maienfeld 3 2740.0
8509004 Bad Ragaz 66 5705.0
8509411 Sargans 132 6075.0
8509413 Flums 4 4848.0
8509414 Walenstadt 36 5505.0
8516176 Klingnau 89 3263.0
8516350 Buchrain 76 6161.0
8517519 Stabio 68 4540.0

274 rows × 3 columns


In [ ]: