notebook.community

Edit and run



In [7]:

    
import pandas as pd
import numpy as np



In [16]:

    
# entities
# Fixed seed so that a fresh "Restart & Run All" regenerates identical fake data.
SEED = 42
np.random.seed(SEED)

n = 500  # number of synthetic entities
df = pd.DataFrame()  # empty frame; not referenced by the cells visible here, kept for compatibility
# entity ids 0..n-1 in random order
ids = np.arange(n)
np.random.shuffle(ids)



In [17]:

    
# make data for static feature
# one Bernoulli(0.6) draw per entity, stored in a single 'gender_female' column
gender = pd.DataFrame(
    data=np.random.binomial(1, 0.6, len(ids)),
    index=ids,
    columns=['gender_female'],
)
# Blank out the value for up to 5% of the entities. np.random.choice samples
# with replacement here, so repeated picks can make the actual share smaller.
# np.nan is used instead of pd.np.nan: the pd.np alias was deprecated in
# pandas 1.0 and removed in pandas 2.0.
gender.loc[np.random.choice(ids, size=int(len(ids)*.05))] = np.nan
gender.to_csv('../2_crossval/label_join/gender_female.csv', index_label='entity_id')



In [18]:

    
# temporal feature: one 'hours' value per (entity, week)
# weekly timestamps between 2014-01-01 and 2017-01-01
date_range = pd.date_range(start='2014-01-01', end='2017-01-01', freq='W')
# every combination of entity id and weekly timestamp
multiindex = pd.MultiIndex.from_product([ids, date_range])
# hours are uniform random integers in [0, 60)
timesheet = pd.DataFrame(
    {'hours': np.random.randint(low=0, high=60, size=len(multiindex))},
    index=multiindex,
)

# keep a random 80% of the rows (sampled without replacement); the rest are dropped
n_kept = int(len(timesheet)*.8)
kept_positions = np.random.choice(range(len(timesheet)), replace=False, size=n_kept)
timesheet = timesheet.iloc[kept_positions]

timesheet.to_csv('../2_crossval/label_join/timesheet.csv', index_label=['entity_id','date'])



In [29]:

    
# make an incident table with entity_id, incident_date, incident_type, decision_date, decision

# each entity has between 0 and 49 incidents (randint's upper bound is exclusive);
# the entity id is repeated in the index once per incident
incidents = pd.DataFrame(
                index=np.repeat(ids, repeats=np.random.randint(low=0, high=50, size=len(ids))),
                columns=['incident_date', 'incident_type', 'decision_date', 'decision']
                )
incidents.index.name = 'entity_id'

# incident types are random
incidents['incident_type'] = np.random.choice(
    ['discipline', 'conduct_unbecoming', 'neglect_of_duty', 'other'],
    size=len(incidents))

# incident date is random (drawn from the weekly date_range)
incidents['incident_date'] = np.random.choice(date_range, size=len(incidents))

# decision happens between 1 and 199 days later (upper bound exclusive);
# built in one vectorised call instead of one pd.Timedelta per row
timedelts = pd.to_timedelta(np.random.randint(1, 200, size=len(incidents)), unit='D')
# .to_numpy() makes the addition purely positional (the index has duplicates)
incidents['decision_date'] = incidents['incident_date'] + timedelts.to_numpy()

# make some decision dates unknown:
#   * every decision dated after 2017-01-01
#   * plus a random sample of rows, up to 10% (drawn with replacement)
# pd.NaT / np.nan replace pd.np.nan: the pd.np alias was removed in pandas 2.0
incidents.loc[incidents['decision_date'] > pd.to_datetime('2017-01-01'), 'decision_date'] = pd.NaT
unknown_rows = np.random.choice(range(len(incidents)), size=int(len(incidents)*0.1))
# look the column position up by name rather than hard-coding it
incidents.iloc[unknown_rows, incidents.columns.get_loc('decision_date')] = pd.NaT

# decisions are random, and missing wherever the decision date is missing
incidents['decision'] = np.random.choice(['dismissed','sustained','lack_of_evidence'], size=len(incidents))
incidents.loc[incidents['decision_date'].isnull(), 'decision'] = np.nan

incidents = incidents.sort_values(by='incident_date')

incidents.to_csv('../2_crossval/label_join/incidents.csv', index_label=['entity_id'])



In [20]:

    
# Show the kernel's current working directory (IPython `pwd` automagic);
# the relative '../2_crossval/label_join/' paths passed to to_csv in this
# notebook are resolved against it.
pwd









    Out[20]:





'/home/bene/DSaPP/hitchhikers-guide/tech-tutorials/model_eval/utils'



In [21]:

    
# status table that lists start and end of engagement
patrol_duty = pd.DataFrame(index=ids, columns=['start_date','end_date'])

# start date is random (drawn from the weekly date_range)
patrol_duty['start_date'] = np.random.choice(date_range, size=len(patrol_duty))

# end date is between 1 and 719 days after the start (randint's upper bound is
# exclusive); built in one vectorised call instead of one pd.Timedelta per row
timedelts = pd.to_timedelta(np.random.randint(1, 720, size=len(patrol_duty)), unit='D')
patrol_duty['end_date'] = patrol_duty['start_date'] + timedelts.to_numpy()

# end dates after 2017-01-01 are set to missing
# (pd.NaT replaces pd.np.nan: the pd.np alias was removed in pandas 2.0)
patrol_duty.loc[patrol_duty['end_date'] > pd.to_datetime('2017-01-01'), 'end_date'] = pd.NaT

# drop some rows: keep a random 80% of entities, sampled without replacement
kept_rows = np.random.choice(range(len(patrol_duty)), replace=False, size=int(len(patrol_duty)*.8))
patrol_duty = patrol_duty.iloc[kept_rows]

patrol_duty = patrol_duty.sort_index()

patrol_duty.to_csv('../2_crossval/label_join/patrol_duty.csv', index_label='entity_id')