In [1]:
from datetime import date, datetime, timedelta
from dateutil.relativedelta import relativedelta

import pandas as pd
from sklearn.linear_model import LogisticRegression

Here's the shell for a simple temporal cross-validation loop. It's based on Rayid's magicloops.

First, we must set some parameters:


In [2]:
# Date range of the data: the first day covered and the last day for which
# both data and labels are available.
start_time, end_time = (
    datetime.strptime(day, '%Y-%m-%d') for day in ('2018-01-01', '2018-06-30')
)

# Label horizons to evaluate, in months. A value of 1 means the label is 1
# if an event takes place within the next month and 0 otherwise.
prediction_windows = [1]

# Spacing between successive scoring dates, in months. For example, police
# might predict a year ahead (prediction window) but rescore every day
# (update window).
update_window = 1

In [3]:
# Load the example data from 'temporal_CV.csv' (path is relative to the
# notebook's working directory).
# NOTE(review): the displayed output shows two columns, `entity` and
# `jail_entry` -- confirm the file schema if the CSV changes.
df = pd.read_csv('temporal_CV.csv')
# Show the whole frame; fine here because the example file is tiny.
df


Out[3]:
entity jail_entry
0 A 1/1/18
1 A 1/23/18
2 A 4/16/18
3 B 2/2/18
4 C 5/6/18
5 C 6/1/18
6 D 3/1/18

Convert the `jail_entry` column from strings to datetimes so it can be compared against the window boundaries below.


In [4]:
# Parse the month/day/two-digit-year strings (e.g. '1/23/18') into timestamps.
df['jail_entry'] = pd.to_datetime(df.jail_entry, format='%m/%d/%y')

Create an empty DataFrame for storing the results of each train/test split.


In [5]:
# Empty frame that the cross-validation loop is meant to fill: the boundaries
# of the train/test feature and label windows for a split, plus its accuracy.
# Column names match the loop's variable names exactly (the original
# 'test__label_start_time' had a doubled underscore).
results = pd.DataFrame(columns=['train_features_start_time', 'train_features_end_time',
                                'train_label_start_time', 'train_label_end_time',
                                'test_features_start_time', 'test_features_end_time',
                                'test_label_start_time', 'test_label_end_time',
                                'accuracy'])

In [ ]:
# Temporal cross-validation shell.
#
# For every prediction window, walk backward from the end of the data in
# steps of `update_window` months. Each step defines one train/test split:
#
#   train features | train labels | test labels
#   test features ----------------|
#
# All window boundaries are inclusive, and each window ends the day before
# the next one starts so that train and test labels never overlap.
for prediction_window in prediction_windows:

    # the last test labels extend to the end of our data. This is reset for
    # every prediction window so that each window walks the full date range
    # rather than starting where the previous window stopped.
    test_label_end_time = end_time

    # we'll start at the end to ensure the "freshest" possible data.
    # keep looping backward until there isn't enough time for two
    # prediction windows. (Remember, the train and test prediction
    # windows cannot overlap.)
    while test_label_end_time >= start_time + 2 * relativedelta(months=+prediction_window):

        # the prediction window equals the time between the start and end of the test label
        test_label_start_time = test_label_end_time - relativedelta(months=+prediction_window)

        # the end of the train label window and test features window should
        # precede the beginning of the test label window
        train_label_end_time = test_features_end_time = test_label_start_time - relativedelta(days=+1)

        # the prediction window also equals the time between the start and end of the train label
        train_label_start_time = train_label_end_time - relativedelta(months=+prediction_window)

        # the end of the train features should precede the beginning of the train labels
        train_features_end_time = train_label_start_time - relativedelta(days=+1)

        # for this example, we'll use all the data back to start_time
        train_features_start_time = test_features_start_time = start_time

        # only run if there's enough data for a full train label window.
        # This must be an `if`, not a `while`: nothing in the body changes
        # the condition, so a `while` here would never terminate.
        if train_label_start_time >= start_time:

            # It's safer to split the data then generate features and labels
            raw_train_X = df[(df.jail_entry >= train_features_start_time) & (df.jail_entry <= train_features_end_time)]
            raw_train_y = df[(df.jail_entry >= train_label_start_time) & (df.jail_entry <= train_label_end_time)]
            raw_test_X = df[(df.jail_entry >= test_features_start_time) & (df.jail_entry <= test_features_end_time)]
            raw_test_y = df[(df.jail_entry >= test_label_start_time) & (df.jail_entry <= test_label_end_time)]

            # TODO: create the matrices we need

            # TODO: ensure that each entity is only represented at the appropriate times
            # e.g. B should not appear before 2/2/18

            # TODO: fit on train data
            # TODO: predict on test data
            # TODO: calculate accuracy
            # TODO: write results to the results dataframe

        # step the test label window back by one update window
        test_label_end_time -= relativedelta(months=+update_window)

In [ ]:


In [ ]: