In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import re
In [2]:
file = "../data/population_suisse_clean.xlsx"
df_pop = pd.read_excel(file)
df_pop = df_pop.reset_index()
Remove district information
In [3]:
df_pop.drop(df_pop['index'].str.startswith(">>"), inplace=True)
Remove NPZ information and parenthesis information
In [4]:
clean_name = lambda s: s.partition(' ')[2]
remove_sup = lambda s: re.sub(r'\(*\ [^)]*\)', '', s)
df_pop['index'] = df_pop['index'].apply(clean_name)
df_pop['index'] = df_pop['index'].apply(remove_sup)
In [5]:
name2pop = {}
def create_dict(row):
name2pop.update({row['index']: row["Etat de la population"]})
In [6]:
a = df_pop.apply(create_dict, axis=1)
In [7]:
file = '../data/2017-01-30_out.csv'
df_train = pd.read_csv(file)
Clean stop id for aggregation
In [8]:
clean_id = lambda s : s.partition(':')[0]
df_train['stop_id'] = df_train['stop_id'].apply(clean_id)
Aggregate
In [9]:
most_present = lambda x: x.value_counts().index[0]
In [10]:
df = df_train.groupby(['stop_id']).agg({'name': most_present,'id': 'count'})
df["amount_of_train"] = df.id
df = df[["amount_of_train", "name"]]
df
Out[10]:
In [11]:
def get_pop(x):
if x in name2pop:
return name2pop[x]
x1 = x.split(" ")[0]
if x1 in name2pop:
return name2pop[x1]
x2 = x.split("-")[0]
if x2 in name2pop:
return name2pop[x2]
In [12]:
df['pop'] = df.name.apply(get_pop)
In [71]:
df_okay = df[~df['pop'].isnull()]
In [72]:
sum(df['pop'].isnull())
Out[72]:
In [73]:
df_okay.plot.scatter(x='amount_of_train', y='pop', logy=True);
In [74]:
name2pop["Prilly"]
Out[74]:
In [78]:
df_okay[df_okay.amount_of_train < 3]
Out[78]:
In [15]:
df_okay
Out[15]:
In [ ]: