Features, Labels, Join


In [526]:
import pandas as pd
import numpy as np
import warnings

A simple static feature


In [527]:
def gender_feature(filename='gender_female.csv'):
    """ Reads in a CSV, drops NAs, returns the DataFrame. """
    
    df = pd.read_csv(filename)
    df = df.dropna(how='any')
    
    return df.set_index('entity_id')

In [528]:
my_gender = gender_feature()
my_gender.head()


Out[528]:
           gender_female
entity_id
309                  1.0
324                  1.0
48                   1.0
210                  0.0
180                  1.0

A temporal feature


In [529]:
def incident_aggregation(as_of_date, agg_col, date_col, time_delta, 
                         agg_funcs, filename='incidents.csv'):
    """
    Reads and aggregates a CSV file over a date range.
    
    Args:
        as_of_date (datetime): End of the aggregation window (excluded).
        agg_col (str): Name of the column for aggregation.
        date_col (str): Name of the column that gives knowledge dates for 
                        the values in agg_col.
        time_delta (pd.Timedelta): Time range. Gives the time 
                                   window preceding the as_of_date over
                                   which we aggregate.
        agg_funcs (dict): A dictionary that maps output column names to 
                          aggregation functions. The functions are applied 
                          to the groupby, and the resulting dataframe 
                          contains columns named like <key>_<timedelta>.
        filename (str): Path to the CSV that should be aggregated. The 
                       CSV must contain an entity_id column, as well as 
                       the columns given by agg_col and date_col.
        
    Returns (pd.DataFrame): A dataframe, uniquely indexed by entity_id,
                            with columns that contain the aggregations
                            from agg_funcs. 
    """
    
    # read the CSV
    df = pd.read_csv(filename)
    df[date_col] = pd.to_datetime(df[date_col])
    
    # restrict to data in aggregation window
    df = df.loc[df[date_col] < as_of_date,:]
    df = df.loc[df[date_col] >= (as_of_date-time_delta),:]
    
    # add as_of_date to the index
    df['as_of_date'] = as_of_date
    df = df.set_index(['entity_id','as_of_date'])
    
    # just some formatting for naming the columns
    nice_timedelta_str = str(time_delta).replace('00:00:00','').replace(' ','')
    agg_funcs = {k+'_'+nice_timedelta_str: v for k,v in agg_funcs.items()}
    
    # aggregate by entity_id and apply the functions
    return df[agg_col].groupby(level=[0,1]).agg(agg_funcs)

In [530]:
my_agg = incident_aggregation(pd.to_datetime('2016-01-01'),
                              'incident_type',
                              'incident_date',
                              pd.Timedelta(365,'d'),
                              {'count_neglects': lambda x: sum(x=='neglect_of_duty'),
                               'count_conduct': lambda x: sum(x=='conduct_unbecoming')})

In [531]:
my_agg.head()


Out[531]:
                      count_conduct_365days  count_neglects_365days
entity_id as_of_date
0         2016-01-01                      2                       1
1         2016-01-01                      2                       1
2         2016-01-01                      1                       0
3         2016-01-01                      1                       2
4         2016-01-01                      3                       0

Binary labels


In [532]:
def label_aggregation(as_of_date, time_delta, filename='incidents.csv'):
    """ Find if an entity has a 'discipline' or 'conduct_unbecoming' incident
        that is decided as sustained.
    Args:
        as_of_date (datetime): Beginning of the aggregation window (included).
        time_delta (pd.Timedelta): Time range. Gives the time 
                                   window following the as_of_date over
                                   which we aggregate.
        filename (str): Path to the incidents CSV, which contains
                        entity_id, incident type and date, and 
                        decision with date.
    Returns (pd.Series):
        A boolean series, uniquely indexed by entity_id and as_of_date,
        indicating whether the entity had at least one sustained 'discipline'
        or 'conduct_unbecoming' incident that fell within the time window.
    """
    
    # load the CSV
    df = pd.read_csv(filename, parse_dates=['incident_date','decision_date'])
        
    # restrict to incidents after the as_of_date
    df = df.loc[df.incident_date>=as_of_date,:]
    
    # restrict to decisions in the time window
    df = df.loc[df.decision_date<(as_of_date+time_delta),:]

    # add the as_of_date to the index
    df['as_of_date'] = as_of_date
    df = df.set_index(['entity_id','as_of_date'])
    
    # binarize
    df['adverse_incident'] = df.incident_type.isin(['discipline','conduct_unbecoming'])\
                             &(df.decision=='sustained')
    
    # aggregate and return
    return df.adverse_incident.groupby(level=[0,1]).max()

In [533]:
my_labels = label_aggregation(pd.to_datetime('2016-01-01'), pd.Timedelta(90,'d'))

In [534]:
my_labels.head()


Out[534]:
entity_id  as_of_date
2          2016-01-01     True
3          2016-01-01    False
8          2016-01-01    False
9          2016-01-01    False
10         2016-01-01    False
Name: adverse_incident, dtype: bool

Joining Labels and Features


In [535]:
my_gender['as_of_date'] = pd.to_datetime('2016-01-01')
my_gender = my_gender.set_index(['as_of_date'], append=True)

In [536]:
dataset = my_labels.to_frame().join(my_gender, how='left')\
                              .join(my_agg, how='left')
    
dataset.head()


Out[536]:
                      adverse_incident  gender_female  count_conduct_365days  count_neglects_365days
entity_id as_of_date
2         2016-01-01              True            0.0                    1.0                     0.0
3         2016-01-01             False            0.0                    1.0                     2.0
8         2016-01-01             False            1.0                    3.0                     2.0
9         2016-01-01             False            0.0                    1.0                     0.0
10        2016-01-01             False            0.0                    1.0                     2.0

But clearly, some entities are missing...

Make a table of 'active' entities for the given date.


In [537]:
def active_officers(as_of_date, filename='patrol_duty.csv'):
    """Check if an officer is on patrol duty for the as_of_date."""
    
    # read CSV
    df = pd.read_csv(filename, parse_dates=['start_date','end_date'])
    
    # check if as_of_date falls between start and end date of duty
    df['active'] = (df.start_date<=as_of_date)&(df.end_date>=as_of_date)
    
    df['as_of_date'] = as_of_date
    df = df.set_index(['entity_id','as_of_date'])
    
    return df[df.active].index

In [538]:
my_active = active_officers(pd.to_datetime('2016-01-01'))

We can now index into the dataset with our new entity list:


In [540]:
dataset.loc[my_active,:].head()


Out[540]:
                      adverse_incident  gender_female  count_conduct_365days  count_neglects_365days
entity_id as_of_date
0         2016-01-01               NaN            NaN                    NaN                     NaN
2         2016-01-01              True            0.0                    1.0                     0.0
3         2016-01-01             False            0.0                    1.0                     2.0
13        2016-01-01               NaN            NaN                    NaN                     NaN
15        2016-01-01               NaN            NaN                    NaN                     NaN

We need to coalesce / impute!
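One way to handle this (a minimal sketch, not the only sensible choice): reindex the dataset to the active entities, then assume that a missing aggregate means zero incidents in the window and a missing label means no adverse incident. The column names below are the ones created above; what to do about a missing gender_female is left open.


In [ ]:
# sketch: reindex to the active entities, then impute under the assumption
# that "no rows in the window" means zero incidents / no adverse incident
imputed = dataset.reindex(my_active)
imputed['adverse_incident'] = imputed['adverse_incident'].fillna(False)
count_cols = ['count_conduct_365days', 'count_neglects_365days']
imputed[count_cols] = imputed[count_cols].fillna(0)
imputed.head()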

A slightly nicer label fetcher


In [541]:
def label_aggregation(as_of_dates, time_delta, filename='incidents.csv'):
    """ Find if an entity has a 'discipline' or 'conduct_unbecoming' incident
        that is decided as sustained.
    Args:
        as_of_dates ([datetime]): List of beginnings of the aggregation 
                                  windows (included).
        time_delta (pd.Timedelta): Time range. Gives the time 
                                   window following the as_of_date over
                                   which we aggregate.
        filename (str): Path to the incidents CSV, which contains
                        entity_id, incident type and date, and 
                        decision with date.
    Returns (pd.Series):
        A boolean series, uniquely indexed by entity_id and as_of_date,
        indicating whether the entity had at least one sustained 'discipline'
        or 'conduct_unbecoming' incident that fell within the time window.
    """
    
    # load the CSV
    df = pd.read_csv(filename, parse_dates=['incident_date','decision_date'])
    
    if len(set(as_of_dates))!=len(as_of_dates):
        raise ValueError("As of dates need to be unique!")
        
    as_of_dates = sorted(as_of_dates)
    
    # let's be cautious here already and do a sanity check
    for idx, aod in enumerate(as_of_dates[:-1]):
        if aod+time_delta >= as_of_dates[idx+1]:
            warnings.warn("Your label windows will overlap!")
        
    dfs = []
    
    # go over all the dates
    for as_of_date in as_of_dates:
        
        # restrict to incidents after the as_of_date
        this_df = df.loc[df.incident_date>=as_of_date,:]

        # restrict to decisions in the time window
        this_df = this_df.loc[this_df.decision_date<(as_of_date+time_delta),:]

        # add the as_of_date to the index
        this_df['as_of_date'] = as_of_date
        this_df = this_df.set_index(['entity_id','as_of_date'])

        # binarize
        this_df['adverse_incident'] = this_df.incident_type.isin(['discipline','conduct_unbecoming'])\
                                      &(this_df.decision=='sustained')
            
        dfs.append(this_df.adverse_incident.groupby(level=[0,1]).max())
    
    # concat and return
    return pd.concat(dfs)

In [542]:
my_labels = label_aggregation([pd.to_datetime('2016-01-01'),
                               pd.to_datetime('2016-05-01')],
                              pd.Timedelta(90,'d'))

In [543]:
my_labels.head()


Out[543]:
entity_id  as_of_date
2          2016-01-01     True
3          2016-01-01    False
8          2016-01-01    False
9          2016-01-01    False
10         2016-01-01    False
Name: adverse_incident, dtype: bool

In [544]:
my_labels.index.levels[1]


Out[544]:
DatetimeIndex(['2016-01-01', '2016-05-01'], dtype='datetime64[ns]', name='as_of_date', freq=None)

... and a slightly nicer active-officer fetcher


In [545]:
def active_officers(as_of_dates, filename='patrol_duty.csv'):
    """Check if an officer is on patrol duty for the as_of_date."""
    
    # read CSV
    df = pd.read_csv(filename, parse_dates=['start_date','end_date'])
    
    dfs = []
    
    for as_of_date in as_of_dates:

        # check if as_of_date falls between start and end date of duty
        this_active = (df.start_date<=as_of_date)&(df.end_date>=as_of_date)

        df['as_of_date'] = as_of_date
        
        dfs.append(df[this_active].set_index(['entity_id','as_of_date']))
    
    return pd.concat(dfs).sort_index().index

In [546]:
my_active = active_officers([pd.to_datetime('2016-01-01'),
                             pd.to_datetime('2016-05-01')])

In [547]:
my_active


Out[547]:
MultiIndex(levels=[[0, 2, 3, 13, 15, 18, 21, 29, 32, 34, 35, 37, 42, 43, 48, 52, 54, 57, 61, 62, 65, 69, 73, 75, 79, 81, 107, 108, 109, 114, 115, 116, 121, 133, 137, 138, 140, 143, 145, 146, 151, 152, 153, 155, 156, 164, 171, 179, 187, 194, 195, 197, 206, 208, 210, 213, 218, 220, 223, 227, 229, 232, 243, 248, 255, 256, 259, 260, 265, 273, 287, 297, 301, 305, 307, 308, 310, 318, 327, 334, 337, 339, 343, 345, 372, 374, 380, 384, 386, 387, 390, 396, 400, 402, 403, 404, 410, 411, 412, 415, 418, 425, 427, 432, 433, 434, 440, 448, 453, 455, 463, 468, 478, 483, 494], [2016-01-01 00:00:00, 2016-05-01 00:00:00]],
           labels=[[0, 0, 1, 2, 2, 3, 3, 4, 5, 5, 6, 7, 7, 8, 8, 9, 10, 10, 11, 11, 12, 12, 13, 14, 14, 15, 15, 16, 17, 18, 18, 19, 19, 20, 21, 21, 22, 22, 23, 24, 24, 25, 26, 27, 28, 28, 29, 29, 30, 30, 31, 32, 32, 33, 33, 34, 34, 35, 36, 36, 37, 38, 38, 39, 40, 41, 42, 42, 43, 43, 44, 44, 45, 45, 46, 47, 47, 48, 49, 49, 50, 50, 51, 52, 52, 53, 54, 54, 55, 56, 56, 57, 58, 59, 60, 60, 61, 62, 63, 64, 65, 65, 66, 67, 67, 68, 68, 69, 70, 70, 71, 72, 72, 73, 73, 74, 74, 75, 76, 76, 77, 77, 78, 79, 79, 80, 80, 81, 81, 82, 83, 84, 85, 86, 86, 87, 87, 88, 88, 89, 89, 90, 91, 92, 92, 93, 94, 95, 96, 97, 97, 98, 98, 99, 100, 101, 102, 103, 104, 105, 105, 106, 106, 107, 108, 109, 109, 110, 110, 111, 111, 112, 112, 113, 113, 114, 114], [0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]],
           names=['entity_id', 'as_of_date'])

Do the Split!


In [548]:
def train_test_splitter(split_dates, label_time_delta, label_fetcher, feature_fetchers):
    """TODO: Write nice documentation. Always!"""
    
    test_as_of_dates = [aod for aod, usefor in split_dates if usefor=='test']
    train_as_of_dates = [aod for aod, usefor in split_dates if usefor=='train']
    as_of_dates = list(zip(*split_dates))[0]
    
    # TODO: check that the as_of_dates are unique
    
    # check that the train/test splits are well-separated
    if max(train_as_of_dates) + label_time_delta >= min(test_as_of_dates):
        raise ValueError("Your train and test label windows overlap!")
        
    # fetch the index of active officers - wrap it in an (empty) DataFrame, because we can't call .join on a bare Index
    actives = pd.DataFrame(index=active_officers(as_of_dates))
        
    # fetch the DF with labels
    labels = label_fetcher(as_of_dates, label_time_delta)
    
    # subset (or superset!) the labels to active entities
    dataset = actives.join(labels, how='left')
    
    # now join in all the features
    for ff in feature_fetchers:
        
        these_feats = []
        
        # first, concatenate the various as-of-dates per feature
        # TODO: this should be handled by the features, really!
        for as_of_date in as_of_dates:
                        
            # note: You could do some **kwargs magic here, and have every feature accept arbitrary arguments.
            #       Nice if you want to handle a single config dict!
            these_feats.append(ff(as_of_date))
            
        dataset = dataset.join(pd.concat(these_feats), how='left')
    
    # let's make an aux with test/train information:
    
    # small dataframe with just dates --> usefor
    only_dates = pd.DataFrame(split_dates,
                              columns=['as_of_date', 'usefor']).set_index('as_of_date')
    
    # blow up that dataframe with the dataset index (as_of_dates)
    aux = only_dates.loc[dataset.index.get_level_values(1),:]
    
    # share index
    aux.index = dataset.index
        
    return dataset, aux

With this, we can grab training/testing data with many as-of-dates:


In [549]:
# let's wrap the gender feature - it's nice to have even static features mirror the as-of-date
def nicer_gender(as_of_date):
    df = gender_feature()
    df['as_of_date'] = as_of_date
    return df.set_index('as_of_date', append=True)


my_split_dates = [(pd.to_datetime('2016-01-01'), 'train'), 
                  (pd.to_datetime('2016-04-30'), 'train'), 
                  (pd.to_datetime('2016-08-01'), 'test')]

my_feature_fetchers = [lambda aod: incident_aggregation(aod, 'incident_type', 'incident_date', 
                                                        pd.Timedelta(90,'d'), 
                                                         {'count_neglects': lambda x: sum(x=='neglect_of_duty'),
                                                          'count_conduct': lambda x: sum(x=='conduct_unbecoming')}),
                       lambda aod: incident_aggregation(aod, 'decision', 'decision_date', 
                                                        pd.Timedelta(190,'d'), 
                                                         {'count_decision_other': lambda x: sum(x=='other')}),
                       nicer_gender
                       ]

In [550]:
m, aux = train_test_splitter(my_split_dates,
                             label_time_delta=pd.Timedelta(90,'d'),
                             label_fetcher=label_aggregation,
                             feature_fetchers=my_feature_fetchers)

In [551]:
m.head()


Out[551]:
                     adverse_incident  count_conduct_90days  count_neglects_90days  count_decision_other_190days  gender_female
entity_id as_of_date
0         2016-01-01              NaN                   1.0                    0.0                           0.0            0.0
          2016-04-30            False                   NaN                    NaN                           0.0            0.0
          2016-08-01              NaN                   1.0                    2.0                           0.0            0.0
2         2016-01-01             True                   NaN                    NaN                           0.0            0.0
3         2016-01-01            False                   0.0                    1.0                           0.0            0.0

In [552]:
aux.head()


Out[552]:
                     usefor
entity_id as_of_date
0         2016-01-01  train
          2016-04-30  train
          2016-08-01   test
2         2016-01-01  train
3         2016-01-01  train

In [553]:
m.loc[aux[aux.usefor=='train'].index,:].head()


Out[553]:
                     adverse_incident  count_conduct_90days  count_neglects_90days  count_decision_other_190days  gender_female
entity_id as_of_date
0         2016-01-01              NaN                   1.0                    0.0                           0.0            0.0
          2016-04-30            False                   NaN                    NaN                           0.0            0.0
2         2016-01-01             True                   NaN                    NaN                           0.0            0.0
3         2016-01-01            False                   0.0                    1.0                           0.0            0.0
          2016-04-30              NaN                   0.0                    0.0                           0.0            0.0

TODOs:

  • imputation for labels and features
  • NaN-checker: Throw error if too many rows are NaN. And/or: Make sure all requested as-of-dates actually get returned. (A sketch of this is below.)
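For the NaN-checker, a minimal sketch of what it could look like; the function names and the 10% threshold are just placeholders for illustration:


In [ ]:
def check_nans(df, max_nan_fraction=0.1):
    """Raise if any column has a larger fraction of NaNs than we tolerate."""
    nan_fraction = df.isnull().mean()
    too_many = nan_fraction[nan_fraction > max_nan_fraction]
    if len(too_many) > 0:
        raise ValueError("Too many NaNs in columns: %s" % dict(too_many))

def check_as_of_dates(df, requested_as_of_dates):
    """Raise if any requested as_of_date is missing from the index."""
    missing = set(requested_as_of_dates) - set(df.index.get_level_values('as_of_date'))
    if missing:
        raise ValueError("Missing as_of_dates: %s" % sorted(missing))

# usage on the dataset built above:
# check_nans(m)
# check_as_of_dates(m, [aod for aod, usefor in my_split_dates])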