https://github.com/sechilds/pydata_sept2016/blob/master/use_sample_data.ipynb
The data were collected from the City of Toronto website: https://www1.toronto.ca/wps/portal/contentonly?vgnextoid=fa6be8c5a612c510VgnVCM10000071d60f89RCRD&vgnextchannel=1a66e03bb8d1e310VgnVCM10000071d60f89RCRD
In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import sklearn as sk
In [2]:
# Load the subway & SRT log workbook; the 'Date' and 'Time' columns are
# combined and parsed into a single datetime column while reading.
# NOTE(review): nested-list `parse_dates` is deprecated in recent pandas
# releases -- confirm against the installed version.
delay_log_path = 'data/Subway & SRT Logs (Jan01_14 to June 30_17).xlsx'
date_time_columns = ['Date', 'Time']
df = pd.read_excel(delay_log_path, parse_dates=[date_time_columns])
In [8]:
# Give the nine columns short snake_case names, then index the frame by the
# combined timestamp column.
delay_columns = [
    'datetime',
    'day',
    'station',
    'code',
    'min_delay',
    'min_gap',
    'bound',
    'line',
    'vehicle',
]
df.columns = delay_columns
df = df.set_index('datetime')
# Preview the last rows to check the rename and the new index.
df.tail()
Out[8]:
In [10]:
# Report the earliest and latest timestamps present in the index.
earliest = df.index.min()
latest = df.index.max()
print('Min: {}'.format(earliest))
print('Max: {}'.format(latest))
In [17]:
# Build a date range spanning the first to the last timestamp in the log
# (pandas' default frequency, i.e. one entry per calendar day).
span_start, span_end = df.index.min(), df.index.max()
idx = pd.date_range(start=span_start, end=span_end)
In [18]:
# Load the workbook that describes the codes used in the log.
code_table_path = 'data/Subway & SRT Log Codes.xlsx'
codes = pd.read_excel(code_table_path)
In [19]:
# Rename the eight columns of the code table; the `un_*` names are
# placeholders for columns that are not used by name later in this notebook.
code_table_columns = [
    'un_0',
    'un_1',
    'code',
    'description',
    'un_4',
    'un_5',
    'srt_rmenu_code',
    'code_description_1',
]
codes.columns = code_table_columns
# Preview the last rows of the renamed table.
codes.tail()
Out[19]:
In [20]:
# Attach a human-readable description to every logged code.
#
# The code table holds two (code, description) column pairs:
# `code`/`description` and `srt_rmenu_code`/`code_description_1`.
# Previously only the first pair was used, so any code listed only in the
# second pair was left without a description.
# NOTE(review): assumes the second pair holds the SRT codes -- confirm
# against the workbook.
primary_lookup = codes.set_index('code')['description']
secondary_lookup = codes.set_index('srt_rmenu_code')['code_description_1']
code_lookup = pd.concat([primary_lookup, secondary_lookup])

# `Series.map` needs a uniquely-indexed lookup: drop blank (NaN) keys and keep
# the first description for any repeated code, so the primary pair wins.
code_lookup = code_lookup[code_lookup.index.notnull()]
code_lookup = code_lookup[~code_lookup.index.duplicated(keep='first')]

df['code_description'] = df['code'].map(code_lookup)
df.tail()
Out[20]:
In [25]:
from sklearn.model_selection import train_test_split

# Split the log into training and test sets. A fixed `random_state` makes the
# shuffle reproducible, so Restart & Run All yields the same partition (and
# the same downstream fit) every time.
df_train, df_test = train_test_split(df, random_state=42)
In [30]:
from patsy.highlevel import dmatrices

# Build the outcome and predictor design matrices from the training split.
#
# The previous formula, 'index ~ min_gap + min_gap', named an outcome
# (`index`) that is neither a column of `df_train` nor a variable defined in
# this notebook, and listed the `min_gap` predictor twice.
# NOTE(review): `min_delay` is assumed to be the intended outcome -- confirm
# the modelling target before relying on the fit.
outcome, predictors = dmatrices(
    'min_delay ~ min_gap', data=df_train, return_type='dataframe')
In [ ]:
# Preview the last five rows of the outcome design matrix.
outcome.tail(n=5)
In [ ]:
# Preview the last five rows of the predictor design matrix.
predictors.tail(n=5)
In [ ]:
# Import the estimator explicitly: `import sklearn as sk` does not guarantee
# that the `sklearn.linear_model` submodule is loaded, so `sk.linear_model`
# only resolved if something else had already imported it.
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
# `outcome` is a one-column DataFrame; scikit-learn expects a 1-D target, so
# flatten it to avoid the column-vector conversion warning.
model.fit(predictors, outcome.values.ravel())
In [ ]: