Histograms

Read data



In [1]:

    
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import plotly
import us



In [2]:

    
%matplotlib inline
plt.rcParams["figure.figsize"] = (12, 6)



In [3]:

    
df = pd.read_csv("../data/Mass Shootings Dataset Ver 2 clean.csv", encoding = "ISO-8859-1", parse_dates=["Date"])



In [4]:

    
df.columns









    Out[4]:





Index(['S#', 'Title', 'Date', 'Fatalities', 'Injured', 'Total victims',
       'Mental Health Issues', 'Race', 'Gender', 'Latitude', 'Longitude',
       'Cities', 'State', 'Gun law', 'Party affiliation', 'Population'],
      dtype='object')

Input Data



In [5]:

    
histogram_columns = ['Fatalities', 'Injured', 'Total victims', 'Mental Health Issues',
                     'Race', 'Gender', 'Gun law', 'Party affiliation']



In [6]:

    
def plot_histogram(title, selector):
    column = selector.value_counts()
    
    plt.xticks(rotation=45)
    if isinstance(column.index[0], str):
        column.index = [c.replace(' ', '\n') for c in column.index]
    
    sns.barplot(column.index, column.values)
    plt.title(title)
    plt.show()



In [7]:

    
for column_name in histogram_columns:
    selector = df[column_name]
    plot_histogram(column_name, selector)



In [8]:

    
plt.figure(figsize=(28, 6))
plot_histogram('State', df['State'])

Exploration

Dates



In [9]:

    
spring = (df.Date.dt.dayofyear >= 79) & (df.Date.dt.dayofyear < 172)
summer = (df.Date.dt.dayofyear >= 172) & (df.Date.dt.dayofyear < 265)
fall = (df.Date.dt.dayofyear >= 265) & (df.Date.dt.dayofyear < 355)
winter = (df.Date.dt.dayofyear >= 355) | (df.Date.dt.dayofyear < 79)

df['Season'] = ''

df.Season[spring] = 'Spring'
df.Season[summer] = 'Summer'
df.Season[winter] = 'Winter'
df.Season[fall] = 'Fall'









    



/home/maciej/miniconda3/envs/ed/lib/python3.6/site-packages/ipykernel_launcher.py:8: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

/home/maciej/miniconda3/envs/ed/lib/python3.6/site-packages/ipykernel_launcher.py:9: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

/home/maciej/miniconda3/envs/ed/lib/python3.6/site-packages/ipykernel_launcher.py:10: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

/home/maciej/miniconda3/envs/ed/lib/python3.6/site-packages/ipykernel_launcher.py:11: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [10]:

    
workweek = df.Date.dt.dayofweek.isin([0, 1, 2, 3])
weekend = df.Date.dt.dayofweek.isin([4, 5, 6])

df['DayType'] = ''

df.DayType[workweek] = 'Workweek'
df.DayType[weekend] = 'Weekend'









    



/home/maciej/miniconda3/envs/ed/lib/python3.6/site-packages/ipykernel_launcher.py:6: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

/home/maciej/miniconda3/envs/ed/lib/python3.6/site-packages/ipykernel_launcher.py:7: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [11]:

    
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
holidays = calendar().holidays(start=df.Date.min(), end=df.Date.max())
holiday = df.Date.isin(holidays)

df['Holiday'] = ''

df.Holiday[holiday] = 'Holiday'
df.Holiday[~holiday] = 'Regular Day'









    



/home/maciej/miniconda3/envs/ed/lib/python3.6/site-packages/ipykernel_launcher.py:7: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

/home/maciej/miniconda3/envs/ed/lib/python3.6/site-packages/ipykernel_launcher.py:8: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [12]:

    
time = {
    'Year': df.Date.dt.year,
    'Month': df.Date.dt.month,
    'Day': df.Date.dt.day,
    'Weekday': df.Date.dt.weekday_name,
    'Season': df.Season,
    'DayType': df.DayType,
    'Holiday': df.Holiday
}



In [13]:

    
for title, selector in time.items():
    plot_histogram(title, selector)

Time series



In [14]:

    
values_by_date = df.sort_values('Date')
dates = values_by_date.Date.values



In [15]:

    
def plot_time_series(title, values, scale='linear'):    
    plt.plot(dates, values, '.')
    plt.ylabel(f'{scale}')
    plt.yscale(scale)
    plt.title(title)
    plt.show()



In [16]:

    
time_series_columns = ['Fatalities', 'Injured', 'Total victims']

for column_name in time_series_columns:
    plot_time_series(column_name, values_by_date[column_name].values)
    plot_time_series(column_name, values_by_date[column_name].values, scale='log')

Map



In [17]:

    
def plot_states_map(title, state_names, values):
    state_index_codes = list(
        map(lambda name: us.states.lookup(name).abbr, state_names)
    )

    data = [
        dict(
            type='choropleth',
            locationmode='USA-states',
            locations=state_index_codes,
            z=values
        )
    ]

    layout = dict(
        title=title,
        geo=dict(
            scope='usa'
        )
    )
    
    figure = dict(data=data, layout=layout)
    return plotly.plotly.iplot(figure, filename=title)



In [18]:

    
state_counts = df.groupby('State').Fatalities.count()
plot_states_map('Incidents count', state_counts.index, df.State.value_counts().values)









    Out[18]:



In [19]:

    
df.groupby('State') \
    .agg([np.min, np.mean, np.median, np.max, np.std, np.sum]) \
    [['Fatalities', 'Injured', 'Total victims']].T









    Out[19]:







  
    
      
      State
      Alabama
      Alaska
      Arizona
      Arkansas
      California
      Colorado
      Connecticut
      Delaware
      Florida
      Georgia
      ...
      South Dakota
      Tennessee
      Texas
      Utah
      Vermont
      Virginia
      Washington
      West Virginia
      Wisconsin
      Wyoming
    
  
  
    
      Fatalities
      amin
      0.000000
      2.0
      0.000000
      0.000000
      0.000000
      3.000000
      5.000000
      1.0
      0.000000
      0.000000
      ...
      6.0
      0.000000
      0.000000
      1.000000
      2.0
      0.000000
      1.000000
      5.0
      2.000000
      1.0
    
    
      mean
      2.909091
      2.0
      3.916667
      2.500000
      5.142857
      7.166667
      14.000000
      1.0
      5.120000
      2.625000
      ...
      6.0
      1.857143
      6.250000
      3.500000
      2.0
      5.666667
      5.647059
      5.0
      4.666667
      1.0
    
    
      median
      2.000000
      2.0
      4.500000
      2.500000
      4.000000
      4.500000
      9.000000
      1.0
      3.000000
      2.000000
      ...
      6.0
      2.000000
      4.000000
      3.500000
      2.0
      2.000000
      5.000000
      5.0
      4.000000
      1.0
    
    
      amax
      11.000000
      2.0
      9.000000
      5.000000
      22.000000
      15.000000
      28.000000
      1.0
      49.000000
      10.000000
      ...
      6.0
      5.000000
      24.000000
      6.000000
      2.0
      32.000000
      13.000000
      5.0
      8.000000
      1.0
    
    
      std
      3.015113
      NaN
      2.503028
      3.535534
      4.753592
      5.036533
      12.288206
      NaN
      9.640712
      2.704934
      ...
      NaN
      1.772811
      5.766281
      3.535534
      NaN
      10.012492
      3.141281
      NaN
      2.179449
      NaN
    
    
      sum
      32.000000
      2.0
      47.000000
      5.000000
      180.000000
      43.000000
      42.000000
      1.0
      128.000000
      42.000000
      ...
      6.0
      13.000000
      125.000000
      7.000000
      2.0
      51.000000
      96.000000
      5.0
      42.000000
      1.0
    
    
      Injured
      amin
      0.000000
      2.0
      0.000000
      1.000000
      0.000000
      0.000000
      0.000000
      4.0
      0.000000
      0.000000
      ...
      0.0
      1.000000
      0.000000
      3.000000
      2.0
      0.000000
      0.000000
      0.0
      1.000000
      4.0
    
    
      mean
      2.818182
      2.0
      2.583333
      5.500000
      6.371429
      18.166667
      1.333333
      4.0
      6.080000
      3.562500
      ...
      0.0
      2.714286
      7.700000
      3.500000
      2.0
      4.666667
      2.352941
      0.0
      2.444444
      4.0
    
    
      median
      3.000000
      2.0
      1.000000
      5.500000
      3.000000
      7.000000
      2.000000
      4.0
      3.000000
      2.000000
      ...
      0.0
      3.000000
      4.000000
      3.500000
      2.0
      2.000000
      1.000000
      0.0
      2.000000
      4.0
    
    
      amax
      6.000000
      2.0
      13.000000
      10.000000
      30.000000
      70.000000
      2.000000
      4.0
      53.000000
      12.000000
      ...
      0.0
      5.000000
      32.000000
      4.000000
      2.0
      23.000000
      23.000000
      0.0
      4.000000
      4.0
    
    
      std
      1.721522
      NaN
      3.800917
      6.363961
      7.096336
      26.843373
      1.154701
      NaN
      10.551145
      3.424787
      ...
      NaN
      1.496026
      10.094814
      0.707107
      NaN
      7.071068
      5.442210
      NaN
      1.333333
      NaN
    
    
      sum
      31.000000
      2.0
      31.000000
      11.000000
      223.000000
      109.000000
      4.000000
      4.0
      152.000000
      57.000000
      ...
      0.0
      19.000000
      154.000000
      7.000000
      2.0
      42.000000
      40.000000
      0.0
      22.000000
      4.0
    
    
      Total victims
      amin
      3.000000
      4.0
      3.000000
      3.000000
      3.000000
      3.000000
      4.000000
      5.0
      3.000000
      3.000000
      ...
      5.0
      3.000000
      3.000000
      3.000000
      4.0
      3.000000
      3.000000
      4.0
      3.000000
      4.0
    
    
      mean
      5.636364
      4.0
      6.083333
      9.000000
      11.200000
      24.666667
      14.666667
      5.0
      11.000000
      5.750000
      ...
      5.0
      4.285714
      13.500000
      6.000000
      4.0
      9.888889
      7.470588
      4.0
      6.555556
      4.0
    
    
      median
      4.000000
      4.0
      4.000000
      9.000000
      8.000000
      10.500000
      11.000000
      5.0
      5.000000
      4.000000
      ...
      5.0
      4.000000
      6.500000
      6.000000
      4.0
      4.000000
      5.000000
      4.0
      7.000000
      4.0
    
    
      amax
      16.000000
      4.0
      19.000000
      15.000000
      40.000000
      82.000000
      29.000000
      5.0
      102.000000
      21.000000
      ...
      5.0
      7.000000
      48.000000
      9.000000
      4.0
      55.000000
      27.000000
      4.0
      11.000000
      4.0
    
    
      std
      3.775519
      NaN
      4.521833
      8.485281
      9.907810
      30.663768
      12.897028
      NaN
      19.527758
      4.419653
      ...
      NaN
      1.380131
      14.475024
      4.242641
      NaN
      16.959101
      6.083971
      NaN
      2.788867
      NaN
    
    
      sum
      62.000000
      4.0
      73.000000
      18.000000
      392.000000
      148.000000
      44.000000
      5.0
      275.000000
      92.000000
      ...
      5.0
      30.000000
      270.000000
      12.000000
      4.0
      89.000000
      127.000000
      4.0
      59.000000
      4.0
    
  

18 rows × 47 columns



In [20]:

    
median_fatalities = df.groupby('State').Fatalities.median()
plot_states_map('Median Fatalities', median_fatalities.index, median_fatalities.values)









    Out[20]:



In [21]:

    
median_injured = df.groupby('State').Injured.median()
plot_states_map('Median injured', median_injured.index, median_injured.values)









    Out[21]:



In [22]:

    
median_total_victims = df.groupby('State')['Total victims'].median()
plot_states_map('Median Total Victims', median_total_victims.index, median_total_victims.values)









    Out[22]:



In [23]:

    
df['Year'] = df.Date.dt.year
df['Month'] = df.Date.dt.month
df['Day'] = df.Date.dt.day
df['Weekday'] = df.Date.dt.weekday_name



In [24]:

    
df.to_csv("../data/Mass Shootings Dataset Ver 2 clean + time.csv", encoding = "ISO-8859-1", index=False)

	State	Alabama	Alaska	Arizona	Arkansas	California	Colorado	Connecticut	Delaware	Florida	Georgia	...	South Dakota	Tennessee	Texas	Utah	Vermont	Virginia	Washington	West Virginia	Wisconsin	Wyoming
Fatalities	amin	0.000000	2.0	0.000000	0.000000	0.000000	3.000000	5.000000	1.0	0.000000	0.000000	...	6.0	0.000000	0.000000	1.000000	2.0	0.000000	1.000000	5.0	2.000000	1.0
	mean	2.909091	2.0	3.916667	2.500000	5.142857	7.166667	14.000000	1.0	5.120000	2.625000	...	6.0	1.857143	6.250000	3.500000	2.0	5.666667	5.647059	5.0	4.666667	1.0
	median	2.000000	2.0	4.500000	2.500000	4.000000	4.500000	9.000000	1.0	3.000000	2.000000	...	6.0	2.000000	4.000000	3.500000	2.0	2.000000	5.000000	5.0	4.000000	1.0
	amax	11.000000	2.0	9.000000	5.000000	22.000000	15.000000	28.000000	1.0	49.000000	10.000000	...	6.0	5.000000	24.000000	6.000000	2.0	32.000000	13.000000	5.0	8.000000	1.0
	std	3.015113	NaN	2.503028	3.535534	4.753592	5.036533	12.288206	NaN	9.640712	2.704934	...	NaN	1.772811	5.766281	3.535534	NaN	10.012492	3.141281	NaN	2.179449	NaN
	sum	32.000000	2.0	47.000000	5.000000	180.000000	43.000000	42.000000	1.0	128.000000	42.000000	...	6.0	13.000000	125.000000	7.000000	2.0	51.000000	96.000000	5.0	42.000000	1.0
Injured	amin	0.000000	2.0	0.000000	1.000000	0.000000	0.000000	0.000000	4.0	0.000000	0.000000	...	0.0	1.000000	0.000000	3.000000	2.0	0.000000	0.000000	0.0	1.000000	4.0
	mean	2.818182	2.0	2.583333	5.500000	6.371429	18.166667	1.333333	4.0	6.080000	3.562500	...	0.0	2.714286	7.700000	3.500000	2.0	4.666667	2.352941	0.0	2.444444	4.0
	median	3.000000	2.0	1.000000	5.500000	3.000000	7.000000	2.000000	4.0	3.000000	2.000000	...	0.0	3.000000	4.000000	3.500000	2.0	2.000000	1.000000	0.0	2.000000	4.0
	amax	6.000000	2.0	13.000000	10.000000	30.000000	70.000000	2.000000	4.0	53.000000	12.000000	...	0.0	5.000000	32.000000	4.000000	2.0	23.000000	23.000000	0.0	4.000000	4.0
	std	1.721522	NaN	3.800917	6.363961	7.096336	26.843373	1.154701	NaN	10.551145	3.424787	...	NaN	1.496026	10.094814	0.707107	NaN	7.071068	5.442210	NaN	1.333333	NaN
	sum	31.000000	2.0	31.000000	11.000000	223.000000	109.000000	4.000000	4.0	152.000000	57.000000	...	0.0	19.000000	154.000000	7.000000	2.0	42.000000	40.000000	0.0	22.000000	4.0
Total victims	amin	3.000000	4.0	3.000000	3.000000	3.000000	3.000000	4.000000	5.0	3.000000	3.000000	...	5.0	3.000000	3.000000	3.000000	4.0	3.000000	3.000000	4.0	3.000000	4.0
	mean	5.636364	4.0	6.083333	9.000000	11.200000	24.666667	14.666667	5.0	11.000000	5.750000	...	5.0	4.285714	13.500000	6.000000	4.0	9.888889	7.470588	4.0	6.555556	4.0
	median	4.000000	4.0	4.000000	9.000000	8.000000	10.500000	11.000000	5.0	5.000000	4.000000	...	5.0	4.000000	6.500000	6.000000	4.0	4.000000	5.000000	4.0	7.000000	4.0
	amax	16.000000	4.0	19.000000	15.000000	40.000000	82.000000	29.000000	5.0	102.000000	21.000000	...	5.0	7.000000	48.000000	9.000000	4.0	55.000000	27.000000	4.0	11.000000	4.0
	std	3.775519	NaN	4.521833	8.485281	9.907810	30.663768	12.897028	NaN	19.527758	4.419653	...	NaN	1.380131	14.475024	4.242641	NaN	16.959101	6.083971	NaN	2.788867	NaN
	sum	62.000000	4.0	73.000000	18.000000	392.000000	148.000000	44.000000	5.0	275.000000	92.000000	...	5.0	30.000000	270.000000	12.000000	4.0	89.000000	127.000000	4.0	59.000000	4.0