In [ ]:
import pandas as pd
import numpy as np
In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# Session-wide default figure size (width, height); later cells reassign this
# rcParam whenever a plot needs a different shape.
plt.rcParams["figure.figsize"] = (20,5)
In [ ]:
import vincent
# NOTE(review): presumably prepares the notebook so vincent charts render
# inline — confirm against the vincent documentation.
vincent.core.initialize_notebook()
In [ ]:
# Note: unzip the data file before running this cell; it is shipped zipped
# because GitHub has a file size limit of 100 MB.
df = pd.read_csv("Data/database.csv")
In [ ]:
# Preview the first rows of the freshly loaded data.
df.head()
In [ ]:
# (number of rows, number of columns) of the loaded frame.
df.shape
In [ ]:
# Use the "Record ID" column as the row index from here on.
df = df.set_index("Record ID")
In [ ]:
# Preview again to confirm "Record ID" is now the index.
df.head()
In [ ]:
# Count missing (NaN) entries per column. The vectorized isnull().sum() gives
# the same per-column totals as applying Python's built-in sum to each column,
# without iterating over every cell in Python.
df.isnull().sum()
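Our dataset seems really clean: there are no missing (NaN) values, which is wonderful! Keep in mind that this check only counts NaN entries; placeholder values such as "Unknown" (which appears in the Relationship column later on) are not detected by it.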
In [ ]:
# Summary statistics as produced by DataFrame.describe() with its defaults.
df.describe()
In [ ]:
# Look at a few raw values of the "Weapon" column before plotting it.
df["Weapon"].head()
Can the scale we use for a chart affect our perception of the data? Let's try a few different configurations.
In [ ]:
# Configuration 1: wide, short canvas with a linear y-axis.
plt.rcParams["figure.figsize"] = (20, 4)
weapon_counts = df["Weapon"].value_counts()
weapon_counts.plot(kind="bar")
plt.title('Deaths Attributable to Various Weapons')
In [ ]:
# Configuration 2: square canvas, same data and linear y-axis.
plt.rcParams["figure.figsize"] = (10, 10)
weapon_counts = df["Weapon"].value_counts()
weapon_counts.plot(kind="bar")
plt.title('Deaths Attributable to Various Weapons')
In [ ]:
# Configuration 3: wide canvas with a logarithmic y-axis, so the rarely used
# weapons remain visible next to the dominant ones.
plt.rcParams["figure.figsize"] = (20, 4)
# Request the log scale through pandas (logy=True) rather than
# plt.yscale('log', nonposy='clip'): the `nonposy` keyword was renamed to
# `nonpositive` in Matplotlib and the old spelling is rejected by current
# releases. It also removes the reliance on configuring an implicit "current
# axes" before the plot exists.
df["Weapon"].value_counts().plot(kind="bar", logy=True)
plt.title('Deaths Attributable to Various Weapons')
Let's pay some attention to unsolved crimes. What are their characteristics?
In [ ]:
# Size of the subset whose "Crime Solved" value is anything other than "Yes".
df[df["Crime Solved"] != "Yes"].shape
In [ ]:
# Keep every record that is not marked as solved.
not_solved = df["Crime Solved"] != "Yes"
unsolved = df[not_solved]
In [ ]:
# Preview the unsolved subset.
unsolved.head()
In [ ]:
# Summary statistics for the unsolved subset only.
unsolved.describe()
In [ ]:
# Unsolved cases per year, ordered chronologically and drawn as a line.
plt.rcParams["figure.figsize"] = (20, 7)
unsolved_per_year = unsolved['Year'].value_counts().sort_index()
unsolved_per_year.plot(kind='line')
plt.title('Number of Unsolved Homicides: 1980 to 2014')
In [ ]:
# Full state/territory name -> two-letter postal abbreviation.
# Insertion order is kept as-is because abb_st is derived from it.
dict_states = {
    'Alaska': 'AK',
    'Alabama': 'AL',
    'Arkansas': 'AR',
    'Arizona': 'AZ',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'District of Columbia': 'DC',
    'Delaware': 'DE',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Hawaii': 'HI',
    'Iowa': 'IA',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Massachusetts': 'MA',
    'Maryland': 'MD',
    'Maine': 'ME',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Missouri': 'MO',
    'Mississippi': 'MS',
    'Montana': 'MT',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Nebraska': 'NE',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'Nevada': 'NV',
    'New York': 'NY',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Puerto Rico': 'PR',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Virginia': 'VA',
    'Vermont': 'VT',
    'Washington': 'WA',
    'Wisconsin': 'WI',
    'West Virginia': 'WV',
    'Wyoming': 'WY',
}
In [ ]:
# Abbreviations in the same order as the entries of dict_states.
abb_st = list(dict_states.values())
len(abb_st)
In [ ]:
# Unsolved handgun homicides per state, labelled with postal abbreviations.
plt.rcParams["figure.figsize"] = (20, 7)
handgun_unsolved = unsolved[unsolved["Weapon"] == "Handgun"]
# Translate each row's state name to its abbreviation *before* plotting.
# Overwriting the tick labels positionally with abb_st is wrong: the bars are
# drawn in the order the states appear in the data, not in dict_states order,
# and the number of states present need not equal len(abb_st), so bars end up
# mislabelled. Names missing from dict_states are kept unchanged so that no
# bar silently loses its label.
state_codes = handgun_unsolved["State"].map(lambda name: dict_states.get(name, name))
ax = sns.countplot(x="State", hue="Weapon", data=handgun_unsolved.assign(State=state_codes))
plt.title("Unsolved Homicides Caused By Handguns")
In [ ]:
# Number of unsolved cases per weapon.
unsolved['Weapon'].value_counts()
In [ ]:
# Change the default figure size; this applies to the plots created after this cell.
plt.rcParams["figure.figsize"] = (15,10)
In [ ]:
# Bar chart of the per-weapon counts among unsolved cases.
unsolved['Weapon'].value_counts().plot(kind='bar')
In [ ]:
# vincent bar chart of unsolved cases per state.
state_counts = unsolved['State'].value_counts()
bar = vincent.Bar(state_counts)
# Rotate the x-axis labels by 270 degrees and right-align them.
bar.x_axis_properties(label_angle=270, label_align="right")
# NOTE(review): the legend title mentions weapons, but the chart shows counts
# per state — confirm the intended wording.
bar.legend(title="Unsolved Homicides: Weapons Involved")
In [ ]:
# Group the "Weapon" values of the unsolved cases by the victim's sex.
rel = unsolved.groupby('Victim Sex')['Weapon']
In [ ]:
# Number of unsolved cases in each "Victim Sex" group, as a bar chart.
rel.size().plot(kind='bar')
A significant majority of the victims in unsolved homicides are male.
In [ ]:
# Unsolved cases per month; bars are ordered by count (value_counts order),
# not by calendar order.
unsolved["Month"].value_counts().plot(kind="bar")
In [ ]:
# Unsolved cases broken down by the "Agency Type" column.
unsolved["Agency Type"].value_counts().plot(kind="bar")
# Optional log-scale view, left disabled:
#plt.yscale('log', nonposy='clip')
In [ ]:
# Distinct "Crime Type" values present among the unsolved cases.
unsolved["Crime Type"].unique()
Where are potential serial killers hiding? In plain sight in large cities, or in small towns?
In [ ]:
# Unsolved cases recorded as "Murder or Manslaughter".
is_murder = unsolved["Crime Type"] == "Murder or Manslaughter"
pot_sk = unsolved[is_murder]
pot_sk.head()
In [ ]:
# (rows, columns) of the murder/manslaughter subset.
pot_sk.shape
In [ ]:
# The ten cities with the most unsolved murders or manslaughters.
city_counts = pot_sk["City"].value_counts()
city_counts.head(10).plot(kind="bar")
plt.title("Top 10 Cities: Unsolved Murders or Manslaughters")
Some of the smaller cities have just 1 unsolved homicide. Serial killers are defined as those having at least 3 victims, so let's be conservative and keep only the cities with more than 5 unsolved cases.
In [ ]:
# The ten cities at the tail of the ranking (fewest unsolved cases).
city_counts = pot_sk["City"].value_counts()
city_counts.tail(10).plot(kind="bar")
plt.title("Bottom 10 Cities: Unsolved Murders or Manslaughters")
# Rows belonging to cities that have more than 5 unsolved cases.
pot_sk.groupby("City").filter(lambda city_rows: len(city_rows) > 5)
In [ ]:
# Keep only rows from cities with more than 5 unsolved cases.
# NOTE(review): despite its name, `two_or_more` holds cities with MORE THAN 5
# cases; the name is kept in case other cells refer to it.
cases_by_city = pot_sk.groupby("City")
two_or_more = cases_by_city.filter(lambda city_rows: len(city_rows) > 5)
# Tail of the ranking among the remaining cities.
remaining_counts = two_or_more["City"].value_counts()
remaining_counts.tail(10).plot(kind="bar")
In [ ]:
# Distinct values of the "Relationship" column across all records.
df["Relationship"].unique()
In [ ]:
# Records whose "Relationship" is anything other than "Unknown".
has_relationship = df["Relationship"] != "Unknown"
known = df[has_relationship]
known.head()
In [ ]:
# Number of records per relationship, excluding "Unknown".
known["Relationship"].value_counts()
In [ ]:
# Relationship counts on a logarithmic y-axis so the rare categories stay visible.
plt.rcParams["figure.figsize"] = (20, 5)
# logy=True replaces plt.yscale('log', nonposy='clip'): the `nonposy` keyword
# was renamed to `nonpositive` in Matplotlib and the old spelling is rejected
# by current releases.
known["Relationship"].value_counts().plot(kind="bar", logy=True)
# Title typo fixed ("Relationsip" -> "Relationship").
plt.title("Relationship of Victim to Perpetrator")
In [ ]:
# Two sample rows, as a reminder of the available columns.
df.head(2)
In [ ]:
# Distinct values of the "Perpetrator Race" column.
df["Perpetrator Race"].unique()
In [ ]:
# All column names of the data set.
df.columns
In [ ]:
# Total "Victim Count" for every (victim race, perpetrator race) pair, restricted
# to the records in `known`. The string "sum" is pandas' built-in name for the
# same reduction as np.sum.
# NOTE(review): this sums the "Victim Count" column rather than counting
# records — confirm that column's meaning before reading the totals as
# numbers of homicides.
pd.pivot_table(
    known,
    index=["Victim Race", "Perpetrator Race"],
    values=["Victim Count"],
    aggfunc=["sum"],
)
It looks like, among the cases where the victim's relationship to the perpetrator is known, most people are killed by people from their own racial background.