In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [2]:
# Load the police-killings records; the file is decoded as ISO-8859-1
# instead of pandas' default UTF-8.
police_killings = pd.read_csv(
    'police_killings.csv',
    encoding='ISO-8859-1',
)
In [3]:
# Inspect the dtype pandas inferred for each column.
police_killings.dtypes
Out[3]:
In [4]:
# Preview the first rows of the raw data.
police_killings.head()
Out[4]:
In [5]:
# Tally how many records carry each 'raceethnicity' label (most frequent first).
race_counts = (
    police_killings['raceethnicity']
    .value_counts()
)
In [6]:
# Bar chart of the number of records per race/ethnicity category.
fig, ax = plt.subplots()
positions = np.arange(race_counts.size)
ax.bar(positions, race_counts, align='center')
# One tick per bar, labelled with the category name.
ax.set_xticks(positions)
ax.set_xticklabels(race_counts.index, rotation='vertical', fontsize='large')
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set_xlabel('Race / ethnicity')
ax.set_ylabel('Number of people killed')
ax.set_title('Police killings by race/ethnicity')
plt.show()
In [7]:
# Share of each race/ethnicity category as a percentage of all counted records.
race_counts.div(race_counts.sum()).mul(100)
Out[7]:
From the charts above, we found that about half of the people killed were white. Black people account for approximately one third.
In [8]:
# Keep only the p_income values that are not the '-' placeholder.
has_income = police_killings['p_income'] != '-'
income = police_killings.loc[has_income, 'p_income']
In [9]:
# With the '-' placeholders removed, the remaining values can be cast to floats.
income = income.astype(float)
In [10]:
# Histogram of the p_income values; kde=False draws the bars only, without a density curve.
# NOTE(review): distplot is deprecated in newer seaborn releases; consider histplot when upgrading.
sns.distplot(income, kde=False)
# seaborn no longer exposes matplotlib as `sns.plt`, so show the figure through
# the pyplot module that the notebook already imports as `plt`.
plt.show()
In [11]:
# Median of the numeric p_income values.
income.median()
Out[11]:
The figure above shows that the highest police killing rates occur in states where the median income is about 22,000 dollars.
In order to analyze the killing data by state, we need to know not only the number of people who have been killed in each state, but also the population of each state.
In [12]:
# Load state_population.csv; it is merged with the per-state counts further down.
state_pop = pd.read_csv('state_population.csv')
In [13]:
# Preview the first rows of the population table.
state_pop.head()
Out[13]:
In [14]:
# Number of records per 'state_fp' value (most frequent first).
state_counts = (
    police_killings['state_fp']
    .value_counts()
)
Create a new DataFrame and combine it with the state population data, so that we can compute the killing rate.
In [15]:
# Two-column frame: the state code from the index of state_counts and the
# matching count, keeping the original index of state_counts.
states = pd.DataFrame(
    {
        'STATE': state_counts.index,
        'shooting': state_counts.values,
    },
    index=state_counts.index,
)
In [16]:
# Inner-join the per-state counts with the population table on the shared STATE key.
# NOTE: this overwrites `states`, so re-running the cell merges a second time.
states = pd.merge(states, state_pop, on='STATE', how='inner')
In [17]:
# Preview the merged frame.
states.head()
Out[17]:
Convert the unit of population to millions.
In [18]:
# Express the POPESTIMATE2015 column in millions.
states['pop_millions'] = states['POPESTIMATE2015'].div(1000000)
In [19]:
# Count per million residents for each state.
states['rate'] = states['shooting'].div(states['pop_millions'])
Order the `rate` column from highest to lowest.
In [20]:
# Per-state summary ordered by rate, highest first.
states.loc[:, ['STATE', 'shooting', 'NAME', 'pop_millions', 'rate']].sort_values(by='rate', ascending=False)
Out[20]:
From the table above, we can summarize: generally speaking, states located in the middle South have the highest police killing rates, while the rates in the Northeast seem to be the lowest.
Which states have the highest number of police killings?
In [21]:
# Same per-state summary, this time ordered by the raw count, highest first.
states.loc[:, ['STATE', 'shooting', 'NAME', 'pop_millions', 'rate']].sort_values(by='shooting', ascending=False)
Out[21]:
In [22]:
# Keep only rows where all three share_* columns hold a value instead of the
# '-' placeholder.
# .copy() makes pk an independent frame, so assigning columns to it later does
# not raise SettingWithCopyWarning or depend on police_killings.
pk = police_killings[
    (police_killings['share_white'] != '-')
    & (police_killings['share_black'] != '-')
    & (police_killings['share_hispanic'] != '-')
].copy()
In [23]:
# Convert the three share_* columns to float64 in one step.
# astype with a column mapping returns a new frame, which avoids assigning
# columns onto a filtered slice of police_killings (the source of
# SettingWithCopyWarning).
pk = pk.astype({
    'share_white': 'float64',
    'share_black': 'float64',
    'share_hispanic': 'float64',
})
We have to translate each state's name into its abbreviation, because the state table contains no abbreviations.
In [24]:
# State abbreviations hard-coded for the two groups being compared.
lowest_states_list = [
    "CT", "PA", "IA", "NY", "MA",
    "NH", "ME", "IL", "OH", "WI",
]
highest_states_list = [
    "OK", "AZ", "NE", "HI", "AK",
    "ID", "NM", "LA", "CO", "DE",
]

# Split pk into the rows whose 'state' value belongs to each group.
lowest_states = pk.loc[pk['state'].isin(lowest_states_list)]
highest_states = pk.loc[pk['state'].isin(highest_states_list)]
Compare these columns using their medians.
In [25]:
# Columns summarised for both groups of states.
columns = [
    "pop",
    "county_income",
    "share_white",
    "share_black",
    "share_hispanic",
]
In [26]:
# Median of each selected column across the rows in lowest_states.
lowest_states.loc[:, columns].median()
Out[26]:
In [27]:
# Median of each selected column across the rows in highest_states.
highest_states.loc[:, columns].median()
Out[27]:
In the states with lower county income, the police killing rate is higher. Conversely, in the states with higher county income, the police killing rate is lower.
In [28]:
# Count the records per value of the 'cause' column and draw the shares as a pie chart.
causes = police_killings['cause'].value_counts()
ax = causes.plot.pie(autopct='%.2f', title='Cause', fontsize=16, figsize=(12, 12))
# Legend entries follow the order of the counted causes.
ax.legend(labels=causes.index)
plt.show()
Clearly, the majority of deaths were caused by gunshot.
In [ ]: