In [7]:
import pandas as pd
import numpy as np
In [16]:
# entities
# Seed NumPy's global RNG first so that a top-to-bottom run of this notebook
# draws the same random numbers (and therefore writes the same CSVs) every time.
SEED = 0
np.random.seed(SEED)

n = 500  # number of synthetic entities
df = pd.DataFrame()
# Entity ids are the integers 0..n-1, shuffled in place into a random order.
ids = np.arange(n)
np.random.shuffle(ids)
In [17]:
# make data for static feature
# One Bernoulli(0.6) draw per entity, indexed by entity id. The values are
# stored as float from the start so the column can hold NaN without relying on
# an implicit int -> float upcast when the missing values are written below.
gender = pd.DataFrame(
    index=ids,
    data=np.random.binomial(1, 0.6, len(ids)).astype(float),
    columns=['gender_female'],
)
# Mark roughly 5% of entities as unknown. The ids are sampled with replacement,
# so repeated picks can make the actual share slightly smaller than 5%.
# (np.nan is used directly: the `pd.np` alias no longer exists in pandas >= 2.0.)
gender.loc[np.random.choice(ids, size=int(len(ids)*.05))] = np.nan
gender.to_csv('../2_crossval/label_join/gender_female.csv', index_label='entity_id')
In [18]:
# Temporal feature: one 'hours' value per (entity, week) pair.
date_range = pd.date_range(start='2014-01-01', end='2017-01-01', freq='W')
multiindex = pd.MultiIndex.from_product([ids, date_range])
# Random integer hours in [0, 60) for every (entity, week) combination.
timesheet = pd.DataFrame(
    {'hours': np.random.randint(low=0, high=60, size=len(multiindex))},
    index=multiindex,
)
# Keep a random 80% of the rows (sampled without replacement); the rest are dropped.
kept_positions = np.random.choice(
    range(len(timesheet)),
    replace=False,
    size=int(len(timesheet)*.8),
)
timesheet = timesheet.iloc[kept_positions]
timesheet.to_csv('../2_crossval/label_join/timesheet.csv', index_label=['entity_id','date'])
In [29]:
# make an incident table with entity_id, incident_date, incident_type, decision_date, decision
# each entity has between 0 and 49 incidents (randint's upper bound is exclusive)
incidents = pd.DataFrame(
    index=np.repeat(ids, repeats=np.random.randint(low=0, high=50, size=len(ids))),
    columns=['incident_date', 'incident_type', 'decision_date', 'decision']
)
incidents.index.name = 'entity_id'
# incident types are random
incidents['incident_type'] = np.random.choice(
    ['discipline', 'conduct_unbecoming', 'neglect_of_duty', 'other'],
    size=len(incidents),
)
# incident date is random (drawn from the weekly dates in date_range)
incidents['incident_date'] = np.random.choice(date_range, size=len(incidents))
# decision happens between 1 and 199 days later (upper bound exclusive)
timedelts = pd.to_timedelta(np.random.randint(1, 200, size=len(incidents)), unit='D')
incidents['decision_date'] = incidents['incident_date'] + timedelts
# make some decision dates unknown:
#  - anything that would fall after 2017-01-01 has not been decided yet
incidents.loc[incidents.decision_date > pd.to_datetime('2017-01-01'), 'decision_date'] = pd.NaT
#  - plus a random ~10% of rows (positions sampled with replacement);
#    the column position is looked up by name instead of a hard-coded index
decision_date_pos = incidents.columns.get_loc('decision_date')
unknown_rows = np.random.choice(range(len(incidents)), size=int(len(incidents)*0.1))
incidents.iloc[unknown_rows, decision_date_pos] = pd.NaT
# decisions are random, but only exist where a decision date is known
# (np.nan is used directly: the `pd.np` alias no longer exists in pandas >= 2.0)
incidents['decision'] = np.random.choice(['dismissed','sustained','lack_of_evidence'], size=len(incidents))
incidents.loc[incidents.decision_date.isnull(), 'decision'] = np.nan
incidents = incidents.sort_values(by='incident_date')
incidents.to_csv('../2_crossval/label_join/incidents.csv', index_label=['entity_id'])
In [20]:
# Show the kernel's current working directory (bare `pwd` relies on IPython's
# automagic and is not valid outside a notebook). The relative
# '../2_crossval/label_join/' CSV paths in this notebook resolve against it.
pwd
Out[20]:
In [21]:
# status table that lists start and end of engagement
patrol_duty = pd.DataFrame(index=ids, columns=['start_date','end_date'])
# each entity starts on a random date drawn from date_range
patrol_duty['start_date'] = np.random.choice(date_range, size=len(patrol_duty))
# the engagement lasts between 1 and 719 days (randint's upper bound is exclusive)
timedelts = pd.to_timedelta(np.random.randint(1, 720, size=len(patrol_duty)), unit='D')
patrol_duty['end_date'] = patrol_duty['start_date'] + timedelts
# engagements that would end after 2017-01-01 get no end date
# (pd.NaT replaces pd.np.nan: the `pd.np` alias no longer exists in pandas >= 2.0)
patrol_duty.loc[patrol_duty.end_date > pd.to_datetime('2017-01-01'), 'end_date'] = pd.NaT
# drop some rows: keep a random 80% of entities, sampled without replacement
kept_positions = np.random.choice(range(len(patrol_duty)), replace=False, size=int(len(patrol_duty)*.8))
patrol_duty = patrol_duty.iloc[kept_positions]
patrol_duty = patrol_duty.sort_index()
patrol_duty.to_csv('../2_crossval/label_join/patrol_duty.csv', index_label='entity_id')