In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import plotly
import us
In [2]:
# Render matplotlib figures directly in the notebook output area.
%matplotlib inline
# Default (width, height) in inches for figures that do not set their own size.
plt.rcParams["figure.figsize"] = (12, 6)
In [3]:
# Load the cleaned incident table; "Date" is parsed into datetimes so the
# `.dt` accessor can be used on it in later cells.
df = pd.read_csv(
    "../data/Mass Shootings Dataset Ver 2 clean.csv",
    parse_dates=["Date"],
    encoding="ISO-8859-1",
)
In [4]:
# Show the column labels of the loaded table.
df.columns
Out[4]:
In [5]:
# Columns whose value frequencies are charted one by one further down.
histogram_columns = [
    'Fatalities',
    'Injured',
    'Total victims',
    'Mental Health Issues',
    'Race',
    'Gender',
    'Gun law',
    'Party affiliation',
]
In [6]:
def plot_histogram(title, selector):
    """Draw a bar chart of how often each distinct value occurs in ``selector``.

    Parameters
    ----------
    title : str
        Text shown above the chart.
    selector : pandas.Series
        Column whose values are tallied with ``value_counts()``.

    The chart is drawn on the current matplotlib figure and shown
    immediately; nothing is returned.
    """
    column = selector.value_counts()
    # An empty tally has no first label to inspect, so check the length first
    # instead of letting ``column.index[0]`` raise IndexError.
    if len(column) > 0 and isinstance(column.index[0], str):
        # Put each word of a multi-word label on its own line so the labels
        # take less horizontal room under the bars.
        column.index = [c.replace(' ', '\n') for c in column.index]
    # Pass x/y by keyword: seaborn 0.12+ no longer accepts them positionally.
    sns.barplot(x=column.index, y=column.values)
    # Rotate once the bars (and their tick labels) exist, so the setting does
    # not depend on tick labels created before the plot call being reused.
    plt.xticks(rotation=45)
    plt.title(title)
    plt.show()
In [7]:
# One frequency chart per column listed in `histogram_columns`.
for column_name, selector in df[histogram_columns].items():
    plot_histogram(column_name, selector)
In [8]:
# Open a wider figure first; plot_histogram draws on the current figure.
plt.figure(figsize=(28, 6))
plot_histogram('State', df['State'])
In [9]:
# Label every incident with a season based on the day of the year.
# The cut-offs 79 / 172 / 265 / 355 fall on Mar 20, Jun 21, Sep 22 and Dec 21
# in a non-leap year (one calendar day earlier in leap years).
day_of_year = df.Date.dt.dayofyear
spring = (day_of_year >= 79) & (day_of_year < 172)
summer = (day_of_year >= 172) & (day_of_year < 265)
fall = (day_of_year >= 265) & (day_of_year < 355)
# Winter wraps around New Year, hence the OR of the two ends of the year.
winter = (day_of_year >= 355) | (day_of_year < 79)

# Rows matching no mask keep the empty-string placeholder.
df['Season'] = ''
# Write through df.loc: chained assignment (`df.Season[mask] = ...`) emits
# SettingWithCopyWarning and does not update `df` when pandas Copy-on-Write
# is active.
df.loc[spring, 'Season'] = 'Spring'
df.loc[summer, 'Season'] = 'Summer'
df.loc[winter, 'Season'] = 'Winter'
df.loc[fall, 'Season'] = 'Fall'
In [10]:
# Split incidents into two day-of-week groups. pandas numbers weekdays
# Monday=0 ... Sunday=6, so 'Workweek' is Monday-Thursday and 'Weekend' is
# Friday-Sunday.
# NOTE(review): Friday (4) is counted as weekend here - confirm that is intended.
workweek = df.Date.dt.dayofweek.isin([0, 1, 2, 3])
weekend = df.Date.dt.dayofweek.isin([4, 5, 6])

# Rows matching neither mask keep the empty-string placeholder.
df['DayType'] = ''
# Write through df.loc: chained assignment (`df.DayType[mask] = ...`) emits
# SettingWithCopyWarning and does not update `df` when pandas Copy-on-Write
# is active.
df.loc[workweek, 'DayType'] = 'Workweek'
df.loc[weekend, 'DayType'] = 'Weekend'
In [11]:
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

# US federal holidays between the earliest and latest incident date.
holidays = calendar().holidays(start=df.Date.min(), end=df.Date.max())
# True where the incident date is one of those holidays.
holiday = df.Date.isin(holidays)

df['Holiday'] = ''
# Write through df.loc: chained assignment (`df.Holiday[mask] = ...`) emits
# SettingWithCopyWarning and does not update `df` when pandas Copy-on-Write
# is active.
df.loc[holiday, 'Holiday'] = 'Holiday'
df.loc[~holiday, 'Holiday'] = 'Regular Day'
In [12]:
# Calendar breakdowns of the incident date, keyed by the chart title each one
# is plotted under in the next cell.
time = {
    'Year': df.Date.dt.year,
    'Month': df.Date.dt.month,
    'Day': df.Date.dt.day,
    # `.dt.weekday_name` was removed in pandas 1.0; `.dt.day_name()` returns
    # the same English day names.
    'Weekday': df.Date.dt.day_name(),
    'Season': df.Season,
    'DayType': df.DayType,
    'Holiday': df.Holiday
}
In [13]:
# Frequency chart for every calendar breakdown collected in `time`.
for title in time:
    selector = time[title]
    plot_histogram(title, selector)
In [14]:
# Table ordered by incident date (ascending), and the matching date array
# that the time-series plots use as their x-axis.
values_by_date = df.sort_values(by='Date')
dates = values_by_date['Date'].values
In [15]:
def plot_time_series(title, values, scale='linear'):
    """Plot ``values`` as dots against the module-level ``dates`` array.

    Parameters
    ----------
    title : str
        Text shown above the chart.
    values : array-like
        Y values, one per entry of ``dates``.
    scale : str
        Y-axis scale passed to matplotlib (``'linear'`` by default); the same
        text is also used as the y-axis label.

    The chart is drawn on the current axes and shown immediately; nothing is
    returned.
    """
    axes = plt.gca()
    axes.plot(dates, values, '.')
    axes.set_ylabel(f'{scale}')
    axes.set_yscale(scale)
    axes.set_title(title)
    plt.show()
In [16]:
# Each victim-count column is plotted twice over time: once on a linear
# y-axis and once on a logarithmic one.
time_series_columns = ['Fatalities', 'Injured', 'Total victims']
for column_name in time_series_columns:
    series_values = values_by_date[column_name].values
    for scale in ('linear', 'log'):
        plot_time_series(column_name, series_values, scale=scale)
In [17]:
def plot_states_map(title, state_names, values):
    """Draw a US choropleth that colours each state by its entry in ``values``.

    Parameters
    ----------
    title : str
        Figure title; also passed to plotly as the ``filename``.
    state_names : iterable of str
        State names, resolved to two-letter codes with ``us.states.lookup``.
    values : sequence
        One number per entry of ``state_names``, in the same order.

    Returns
    -------
    Whatever ``plotly.plotly.iplot`` returns for the figure.

    Raises
    ------
    ValueError
        If any entry of ``state_names`` cannot be resolved to a state.
    """
    state_index_codes = []
    unresolved = []
    for name in state_names:
        state = us.states.lookup(name)
        # A failed lookup is expected to yield None; collect every bad name so
        # the error lists them all instead of an opaque AttributeError on
        # `.abbr`.
        if state is None:
            unresolved.append(name)
        else:
            state_index_codes.append(state.abbr)
    if unresolved:
        raise ValueError(f'Unrecognized state name(s): {unresolved}')

    data = [
        dict(
            type='choropleth',
            locationmode='USA-states',
            locations=state_index_codes,
            z=values
        )
    ]
    layout = dict(
        title=title,
        geo=dict(
            scope='usa'
        )
    )
    figure = dict(data=data, layout=layout)
    # NOTE(review): `plotly.plotly` moved to the separate chart_studio package
    # in plotly 4 - confirm the installed plotly version still provides it.
    return plotly.plotly.iplot(figure, filename=title)
In [18]:
# Number of incidents (rows) per state. Names and counts both come from the
# same Series so each count stays attached to its own state; mixing a groupby
# index (alphabetical) with `value_counts().values` (ordered by frequency)
# would pair counts with the wrong states.
state_counts = df.groupby('State').size()
plot_states_map('Incidents count', state_counts.index, state_counts.values)
Out[18]:
In [19]:
# Per-state summary statistics for the three victim-count columns, transposed
# so that states run across the top. The columns are selected before
# aggregating so text columns are never sent through mean/std, and the
# aggregations are named by string instead of passing numpy callables.
(
    df.groupby('State')[['Fatalities', 'Injured', 'Total victims']]
    .agg(['min', 'mean', 'median', 'max', 'std', 'sum'])
    .T
)
Out[19]:
In [20]:
# Median number of fatalities per incident in each state, shown on the map.
median_fatalities = df.groupby('State')['Fatalities'].median()
plot_states_map(
    'Median Fatalities',
    median_fatalities.index,
    median_fatalities.values,
)
Out[20]:
In [21]:
# Median number of injured per incident in each state, shown on the map.
median_injured = df.groupby('State')['Injured'].median()
plot_states_map(
    'Median injured',
    median_injured.index,
    median_injured.values,
)
Out[21]:
In [22]:
# Median total victim count per incident in each state, shown on the map.
median_total_victims = df['Total victims'].groupby(df['State']).median()
plot_states_map(
    'Median Total Victims',
    median_total_victims.index,
    median_total_victims.values,
)
Out[22]:
In [23]:
# Store the calendar parts of the incident date as columns so they are
# written out with the table in the next cell.
df['Year'] = df.Date.dt.year
df['Month'] = df.Date.dt.month
df['Day'] = df.Date.dt.day
# `.dt.weekday_name` was removed in pandas 1.0; `.dt.day_name()` returns the
# same English day names.
df['Weekday'] = df.Date.dt.day_name()
In [24]:
# Save the table, including the columns added in this notebook, next to the
# input file; the row index is not written.
df.to_csv(
    "../data/Mass Shootings Dataset Ver 2 clean + time.csv",
    index=False,
    encoding="ISO-8859-1",
)