PyData Hands-On Data Science Night - 16 August 2017

https://www.meetup.com/pydatato/events/242191592/


In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import sklearn as sk

In [2]:
df = pd.read_excel('data/Subway & SRT Logs (Jan01_14 to June 30_17).xlsx',
                   parse_dates=[['Date', 'Time']])

In [8]:
df.columns = ['datetime', 'day', 'station', 'code',
              'min_delay', 'min_gap', 'bound', 'line', 'vehicle']

df = df.set_index('datetime')

df.tail()


Out[8]:
day station code min_delay min_gap bound line vehicle
datetime
2017-06-30 22:00:00 Friday YONGE UNIVERSITY LINE MUO 0 0 NaN BD 0
2017-06-30 22:30:00 Friday LAWRENCE STATION MUPAA 0 0 N YU 6031
2017-06-30 22:58:00 Friday KENNEDY BD STATION SUDP 5 9 E BD 5018
2017-06-30 23:30:00 Friday YONGE UNIVERSITY LINE MUGD 0 0 NaN YU 0
2017-06-30 23:57:00 Friday WOODBINE STATION MUIS 0 0 NaN BD 0

In [10]:
print('Min: {}'.format(df.index.min()))
print('Max: {}'.format(df.index.max()))


Min: 2014-01-01 00:21:00
Max: 2017-06-30 23:57:00

In [17]:
idx = pd.date_range(df.index.min(), df.index.max())

In [18]:
codes = pd.read_excel('data/Subway & SRT Log Codes.xlsx')

In [19]:
codes.columns = ['un_0', 'un_1', 'code', 'description',
                 'un_4', 'un_5', 'srt_rmenu_code', 'code_description_1']

codes.tail()


Out[19]:
un_0 un_1 code description un_4 un_5 srt_rmenu_code code_description_1
112 NaN 113 TUS Crew Unable to Maintain Schedule NaN NaN NaN NaN
113 NaN 114 TUSC Operator Overspeeding NaN NaN NaN NaN
114 NaN 115 TUSET Train Controls Improperly Shut Down NaN NaN NaN NaN
115 NaN 116 TUST Storm Trains NaN NaN NaN NaN
116 NaN 117 TUSUP Supervisory Error NaN NaN NaN NaN

In [20]:
df['code_description'] = df.code.map(codes.set_index('code')['description'])

df.tail()


Out[20]:
day station code min_delay min_gap bound line vehicle code_description
datetime
2017-06-30 22:00:00 Friday YONGE UNIVERSITY LINE MUO 0 0 NaN BD 0 Miscellaneous Other
2017-06-30 22:30:00 Friday LAWRENCE STATION MUPAA 0 0 N YU 6031 Passenger Assistance Alarm Activated - No Trou...
2017-06-30 22:58:00 Friday KENNEDY BD STATION SUDP 5 9 E BD 5018 Disorderly Patron
2017-06-30 23:30:00 Friday YONGE UNIVERSITY LINE MUGD 0 0 NaN YU 0 Miscellaneous General Delays
2017-06-30 23:57:00 Friday WOODBINE STATION MUIS 0 0 NaN BD 0 Injured or ill Customer (In Station) - Transpo...

In [25]:
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(df)

In [30]:
from patsy.highlevel import dmatrices

outcome, predictors = dmatrices(
    'index ~ min_gap + min_gap', data=df_train, return_type='dataframe')


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
/usr/local/lib/python3.6/site-packages/patsy/compat.py in call_and_wrap_exc(msg, origin, f, *args, **kwargs)
    116     try:
--> 117         return f(*args, **kwargs)
    118     except Exception as e:

/usr/local/lib/python3.6/site-packages/patsy/eval.py in eval(self, expr, source_name, inner_namespace)
    165         return eval(code, {}, VarLookupDict([inner_namespace]
--> 166                                             + self._namespaces))
    167 

<string> in <module>()

NameError: name 'index' is not defined

The above exception was the direct cause of the following exception:

PatsyError                                Traceback (most recent call last)
<ipython-input-30-a6bb09220182> in <module>()
      2 
      3 outcome, predictors = dmatrices(
----> 4     'index ~ min_gap + min_gap', data=df_train, return_type='dataframe')

/usr/local/lib/python3.6/site-packages/patsy/highlevel.py in dmatrices(formula_like, data, eval_env, NA_action, return_type)
    308     eval_env = EvalEnvironment.capture(eval_env, reference=1)
    309     (lhs, rhs) = _do_highlevel_design(formula_like, data, eval_env,
--> 310                                       NA_action, return_type)
    311     if lhs.shape[1] == 0:
    312         raise PatsyError("model is missing required outcome variables")

/usr/local/lib/python3.6/site-packages/patsy/highlevel.py in _do_highlevel_design(formula_like, data, eval_env, NA_action, return_type)
    163         return iter([data])
    164     design_infos = _try_incr_builders(formula_like, data_iter_maker, eval_env,
--> 165                                       NA_action)
    166     if design_infos is not None:
    167         return build_design_matrices(design_infos, data,

/usr/local/lib/python3.6/site-packages/patsy/highlevel.py in _try_incr_builders(formula_like, data_iter_maker, eval_env, NA_action)
     68                                       data_iter_maker,
     69                                       eval_env,
---> 70                                       NA_action)
     71     else:
     72         return None

/usr/local/lib/python3.6/site-packages/patsy/build.py in design_matrix_builders(termlists, data_iter_maker, eval_env, NA_action)
    694                                                    factor_states,
    695                                                    data_iter_maker,
--> 696                                                    NA_action)
    697     # Now we need the factor infos, which encapsulate the knowledge of
    698     # how to turn any given factor into a chunk of data:

/usr/local/lib/python3.6/site-packages/patsy/build.py in _examine_factor_types(factors, factor_states, data_iter_maker, NA_action)
    441     for data in data_iter_maker():
    442         for factor in list(examine_needed):
--> 443             value = factor.eval(factor_states[factor], data)
    444             if factor in cat_sniffers or guess_categorical(value):
    445                 if factor not in cat_sniffers:

/usr/local/lib/python3.6/site-packages/patsy/eval.py in eval(self, memorize_state, data)
    564         return self._eval(memorize_state["eval_code"],
    565                           memorize_state,
--> 566                           data)
    567 
    568     __getstate__ = no_pickling

/usr/local/lib/python3.6/site-packages/patsy/eval.py in _eval(self, code, memorize_state, data)
    549                                  memorize_state["eval_env"].eval,
    550                                  code,
--> 551                                  inner_namespace=inner_namespace)
    552 
    553     def memorize_chunk(self, state, which_pass, data):

/usr/local/lib/python3.6/site-packages/patsy/compat.py in call_and_wrap_exc(msg, origin, f, *args, **kwargs)
    122                                  origin)
    123             # Use 'exec' to hide this syntax from the Python 2 parser:
--> 124             exec("raise new_exc from e")
    125         else:
    126             # In python 2, we just let the original exception escape -- better

/usr/local/lib/python3.6/site-packages/patsy/compat.py in <module>()

PatsyError: Error evaluating factor: NameError: name 'index' is not defined
    index ~ min_gap + min_gap
    ^^^^^

In [ ]:
outcome.tail()

In [ ]:
predictors.tail()

In [ ]:
model = sk.linear_model.LogisticRegression()
model.fit(predictors, outcome)

In [ ]: