In [1]:
%matplotlib inline
import diogenes.read as read
import diogenes.display as display
import diogenes.modify as modify
import diogenes.utils as utils
import diogenes.grid_search as grid_search
import numpy as np
The following disclaimer is included with the data by the Invisible Institute.
This dataset is compiled from three lists of allegations against Chicago Police Department officers, spanning approximately 2002 - 2008 and 2010 - 2014, produced by the City of Chicago in response to litigation and to FOIA requests.
The City of Chicago's production of this information is accompanied by a disclaimer that not all information contained in the City's database may be correct.
No independent verification of the City's records has taken place and this dataset does not purport to be an accurate reflection of either the City's database or its veracity.
In [120]:
#Record arrays
allegations = read.open_csv_url('https://raw.githubusercontent.com/jamestwhedbee/DataProjects/master/CPDB/Allegations.csv',parse_datetimes=['IncidentDate','StartDate','EndDate'])
citizens = read.open_csv_url('https://raw.githubusercontent.com/jamestwhedbee/DataProjects/master/CPDB/Citizens.csv')
officers = read.open_csv_url('https://raw.githubusercontent.com/jamestwhedbee/DataProjects/master/CPDB/Officers.csv')
We can see the column names for the three tables below.
In [121]:
#I shouldn't have to nest function calls just to get a summary of my data. This needs to be a single call.
#Most of the data isn't numeric, so we should find a way to be more helpful than this.
#Also, what is the "None" printing at the end of this?
print display.pprint_sa(display.describe_cols(allegations))
print display.pprint_sa(display.describe_cols(citizens))
print display.pprint_sa(display.describe_cols(officers))
For this analysis, we will be removing several columns for the following reasons:
We will also translate ApptDate, which specifies the number of days between the hire date and 1900-1-1, to the number of years working.
In [122]:
import datetime
#TODO: there is a typo in the "OfficerFirst" column in allegations.
#Should pass this on to Kalven at Invisible Institute along with questions about data.
allegations = utils.remove_cols(allegations,['OfficeFirst','OfficerLast','Investigator','AllegationCode','RecommendedFinding','RecommendedOutcome','FinalFinding','FinalOutcome','Beat','Add1','Add2','City'])
officers = utils.remove_cols(officers,['OfficerFirst','OfficerLast','Star'])
#Convert appointment date days since 1900-1-1 to years prior to today
def tenure(vector):
today = datetime.datetime.strftime(datetime.datetime.now(),'%Y-%m-%d')
started = np.add(np.datetime64('1900-01-01'),map(lambda x: np.timedelta64(int(x), 'D'),vector))
tenure = np.subtract(np.datetime64(today),started)
return np.divide(tenure,np.timedelta64(1,'D')) / 365
#Impute median date for missing values
officers['ApptDate'] = modify.replace_missing_vals(officers['ApptDate'], strategy='median')
tenure_days = modify.combine_cols(officers,tenure,['ApptDate'])
officers = utils.append_cols(officers,[tenure_days],['Tenure'])
For ease of use, let's join our tables.
In [123]:
master = utils.join(allegations,citizens,'left',['CRID'],['CRID'])
#Rename Race and Gender, since citizens and officers have these columns
temp_col_names = list(master.dtype.names)
gender_index = temp_col_names.index("Gender")
race_index = temp_col_names.index("Race")
temp_col_names[gender_index] = "CitizenGender"
temp_col_names[race_index] = "CitizenRace"
master.dtype.names = tuple(temp_col_names)
master = utils.join(master,officers,'left',['OfficerID'],['OfficerID'])
temp_col_names = list(master.dtype.names)
gender_index = temp_col_names.index("Gender")
race_index = temp_col_names.index("Race")
temp_col_names[gender_index] = "OfficerGender"
temp_col_names[race_index] = "OfficerRace"
master.dtype.names = tuple(temp_col_names)
There are some allegations where no officer ID was provided. For this analysis, we will discard those allegations.
In [124]:
#This is a pretty awkward way to remove nan, is there a better way I missed?
master = modify.choose_rows_where(master,[{'func': modify.row_val_between, 'col_name': 'OfficerID', 'vals': [-np.inf,np.inf]}])
Now, let's encode our data numerically
In [125]:
#Unit is interpreted as numeric, but we really want to analyze it categorically
#There should be an easier way to treat a numeric column as categorical data
master = utils.append_cols(master,master['Unit'].astype('|S10'),['UnitCat'])
master = utils.remove_cols(master,['Unit'])
master_data, master_classes = modify.label_encode(master)
For convenience, we'll build every possible categorical directive
In [126]:
#Directives
def cat_directives(array,classes):
cat_directives = {}
for column in classes:
cat_directives[column] = {v:[{'func': modify.row_val_eq, 'col_name': column, 'vals': i}] for i,v in enumerate(classes[column])}
return cat_directives
where = cat_directives(master_data,master_classes)
Now, we can build intuitive masks as combinations of our human-readable directives
In [127]:
#Masks
#Gender
female_officers = modify.where_all_are_true(master_data,where['OfficerGender']['F'])
male_officers = modify.where_all_are_true(master_data,where['OfficerGender']['M'])
female_citizens = modify.where_all_are_true(master_data,where['CitizenGender']['F'])
male_citizens = modify.where_all_are_true(master_data,where['CitizenGender']['M'])
#Race
white_officers = modify.where_all_are_true(master_data,where['OfficerRace']['White'])
black_officers = modify.where_all_are_true(master_data,where['OfficerRace']['Black'])
hispanic_officers = modify.where_all_are_true(master_data,where['OfficerRace']['Hispanic'])
white_citizens = modify.where_all_are_true(master_data,where['CitizenRace']['White'])
black_citizens = modify.where_all_are_true(master_data,where['CitizenRace']['Black'])
hispanic_citizens = modify.where_all_are_true(master_data,where['CitizenRace']['Hispanic'])
#Cross-sections
white_M_officers_black_F_citizens = modify.where_all_are_true(master_data,where['OfficerRace']['White']+
where['OfficerGender']['M']+
where['CitizenRace']['Black']+
where['CitizenGender']['F'])
Let's generate a potentially interesting new feature from our existing data, and pull out all non-numeric data
In [128]:
duration = modify.combine_cols(master_data,np.subtract,['EndDate','StartDate'])
durationDays = duration / np.timedelta64(1, 'D')
duration_data = utils.append_cols(master_data,[durationDays],['InvestigationDuration'])
numeric_data = utils.remove_cols(master_data,['StartDate','EndDate','IncidentDate'])
We understand what data we have, and we have some tools to easily slice and dice. Let's dive in and learn something.
In [129]:
#Ex 1: What percentage of allegations have a black female citizen and a white male officer?
print np.sum(white_M_officers_black_F_citizens.astype(np.float))/np.size(white_M_officers_black_F_citizens.astype(np.float))
In [130]:
#Ex 2: What is the breakdown of officers with complaints by race?
#This seems a little clunky to me
#Would be nice if plot_simple_histogram could handle categorical labels for me
display.plot_simple_histogram(master_data['OfficerRace'],verbose=False)
display.plt.xticks(range(len(master_classes['OfficerRace'])), master_classes['OfficerRace'])
Out[130]:
In [131]:
#Ex 3: What does the distribution of complaints look like?
complaint_counter = display.Counter(numeric_data['OfficerID'])
officer_list, complaint_counts = zip(*complaint_counter.items())
display.plot_simple_histogram(complaint_counts)
Out[131]:
In [132]:
#Ex 4: What can we learn from the 100 officers who receive the most complaints?
#FYI: Wikipedia says 12,244 officers total, so this is roughly the top 1% of all Chicago officers.
#Obviously, all officers do not have the same quantity and quality of interactions with citizens.
#Need to account for this fact for any real analysis.
#Median imputation makes histogram look unnatural
#Top 100 Officers
top_100 = counts.most_common(100)
top_100_officers = map(lambda x: x[0],top_100)
#We should add this to modify.py for categorical data
def row_val_in(M,col_name,boundary):
return [x in boundary for x in M[col_name]]
top_100_profile = modify.choose_rows_where(officers,[{'func': row_val_in, 'col_name': 'OfficerID', 'vals': top_100_officers}])
#Can't check this against CPDB, their allegation counts are for the whole time period
#Not just 2011 - present.
display.plot_simple_histogram(master_data['Tenure'],verbose=False)
display.plot_simple_histogram(top_100_profile['Tenure'],verbose=False)
Out[132]:
In [12]:
#Ex 5: What does the distribution of outcomes look like?
#Hastily written, possibly not useful. Just curious.
#Almost everything is unknown or no action taken
def sortedFrequencies(array,classes,col_name):
if col_name not in classes:
raise ValueError('col_name must be categorical')
counts = display.Counter(array[col_name])
total = float(sum(counts.values()))
for key in counts:
counts[key] /= total
count_dict = {}
for value in counts:
count_dict[classes[col_name][value]] = counts[value]
return sorted(count_dict.items(), key=lambda x: x[1],reverse=True)
print sortedFrequencies(numeric_data,master_classes,'Outcome')
In [133]:
#Ex 6: What has the number of complaints over time been like?
#Looks seasonal (peaking in summer), and declining over time (coud the decline just be a collection issue?)
def numpy_to_month(dt64):
ts = (dt64 - np.datetime64('1970-01-01T00:00:00Z')) / np.timedelta64(1, 's')
dt = datetime.datetime.utcfromtimestamp(ts)
d = datetime.date(dt.year, dt.month, 1) #round to month
return d
months, counts = zip(*display.Counter(map(numpy_to_month,duration_data['IncidentDate'])).items())
display.plt.plot_date(months,counts)
Out[133]:
In [134]:
#How does it look to split complaints by location?
#Very disproportionate. Locations 17,19,3,4 have almost all complaints.
display.plot_simple_histogram(numeric_data['Location'],verbose=False)
display.plt.xticks(range(len(master_classes['Location'])), master_classes['Location'])
#Unit?
#Still uneven, but more even than location.
display.plot_simple_histogram(numeric_data['UnitCat'],verbose=False)
display.plt.xticks(range(len(master_classes['UnitCat'])), master_classes['UnitCat'])
Out[134]:
In [135]:
#Are there officers getting a lot of complaints not from the high yield locations?
#What does the social network of concomitant officers look like?
In [ ]: