1. Load



In [1]:

    
import pandas as pd
import numpy as np
import geopy
import us



In [2]:

    
df = pd.read_csv("../data/Mass Shootings Dataset Ver 2.csv", encoding = "ISO-8859-1")
df.replace([np.NaN], [''], inplace=True)



In [3]:

    
df.columns









    Out[3]:





Index(['S#', 'Title', 'Location', 'Date', 'Summary', 'Fatalities', 'Injured',
       'Total victims', 'Mental Health Issues', 'Race', 'Gender', 'Latitude',
       'Longitude'],
      dtype='object')

2. Process

Title



In [4]:

    
df.Title.head()









    Out[4]:





0           Las Vegas Strip mass shooting
1              San Francisco UPS shooting
2       Pennsylvania supermarket shooting
3    Florida awning manufacturer shooting
4        Rural Ohio nursing home shooting
Name: Title, dtype: object

Location



In [5]:

    
df.Location.replace(['Washington D.C.', ''],
                    ['Washington, Washington', ','],
                    inplace=True)



In [6]:

    
cities, raw_states = zip(*[value.rsplit(',', maxsplit=1) for value in df.Location.values])



In [7]:

    
df['Cities'] = cities



In [8]:

    
states = [us.states.lookup(raw_state.strip()).name if raw_state else '' for raw_state in raw_states]



In [9]:

    
df['State'] = states

examples of multiple cities per one location:



In [10]:

    
for location in df.Location.unique():
    if len(location.split(',')) != 2:
        print(location)









    



Pennsburg, Souderton, Lansdale, Harleysville, Pennsylvania
South Valley, Albuquerque, New Mexico
Nickel Mines, Lancaster, Pennsylvania
Santee, San Diego, California



In [11]:

    
df.drop('Location', axis=1, inplace=True)

Date



In [12]:

    
df.Date.head()









    Out[12]:





0    10/1/2017
1    6/14/2017
2     6/7/2017
3     6/5/2017
4    5/12/2017
Name: Date, dtype: object

Summary



In [13]:

    
df.Summary.head()









    Out[13]:





0                                                     
1    Jimmy Lam, 38, fatally shot three coworkers an...
2    Randy Stair, a 24-year-old worker at Weis groc...
3    John Robert Neumann, Jr., 45, a former employe...
4    Thomas Hartless, 43, shot and killed a former ...
Name: Summary, dtype: object

Fatalities



In [14]:

    
df.Fatalities.unique()









    Out[14]:





array([58,  3,  5, 49,  0,  1,  4,  2,  6, 14, 16, 10,  8,  9,  7, 13, 28,
       12, 11, 32, 15, 24, 22, 17])

Injured



In [15]:

    
df.Injured.unique()









    Out[15]:





array([527,   2,   0,   6,   3,  11,  53,   4,   5,   1,  14,  10,   7,
        21,   9,  13,  12,  16,   8,  15,  70,  32,  23,  24,  25,  19,
        20,  30])

Total victims



In [16]:

    
df['Total victims'].unique()









    Out[16]:





array([585,   5,   3,  11,   6,  16, 102,   4,   9,   8,   7,  17,  12,
        35,  10,  19,  15,  29,  82,  45,  18,  26,  13,  55,  14,  21,
        37,  27,  25,  43,  20,  40,  22,  48])



In [17]:

    
all(df.Fatalities.values + df.Injured.values == df['Total victims'].values)









    Out[17]:





False

Total victims inconsistency:



In [18]:

    
for f, i, t, _ in zip(df.Fatalities.values, df.Injured.values, df['Total victims'].values, range(50)):
    if f + i != t:
        print(f, '+', i, '!=', t)









    



1 + 4 != 4
0 + 3 != 4
3 + 1 != 3
2 + 2 != 3
2 + 2 != 3
2 + 2 != 3
4 + 0 != 3
4 + 0 != 3
2 + 2 != 3
5 + 0 != 4
4 + 14 != 17

possible differences:



In [19]:

    
np.unique(df.Fatalities.values + df.Injured.values - df['Total victims'].values)









    Out[19]:





array([-3, -2, -1,  0,  1,  2])

Mental Health Issues



In [20]:

    
df['Mental Health Issues'].unique()









    Out[20]:





array(['Unclear', 'Yes', 'Unknown', 'No', 'unknown'], dtype=object)



In [21]:

    
df['Mental Health Issues'].replace(['Unclear', 'Unknown', 'unknown'], 
                                   ['Unknown', 'Unknown', 'Unknown'], inplace=True)



In [22]:

    
df['Mental Health Issues'].unique()









    Out[22]:





array(['Unknown', 'Yes', 'No'], dtype=object)

Race



In [23]:

    
df.Race.unique()









    Out[23]:





array(['White', 'Asian', '', 'Black', 'Latino', 'Other', 'Unknown',
       'Black American or African American',
       'White American or European American', 'Asian American',
       'Some other race', 'Two or more races',
       'Black American or African American/Unknown',
       'White American or European American/Some other Race',
       'Native American or Alaska Native', 'white', 'black',
       'Asian American/Some other race'], dtype=object)



In [24]:

    
df.Race.replace(['', 'Other', 'Black American or African American',
                'White American or European American', 'Asian American',
                'Some other race', 'Two or more races',
                'Black American or African American/Unknown',
                'White American or European American/Some other Race',
                'Native American or Alaska Native', 'white', 'black',
                'Asian American/Some other race'],

                ['Unknown', 'Unknown', 'Black', 'White', 'Asian', 
                 'Unknown', 'Mixed', 'Black', 'White', 'Native',
                 'White', 'Black', 'Asian'], 
                
                inplace=True)



In [25]:

    
df.Race.unique()









    Out[25]:





array(['White', 'Asian', 'Unknown', 'Black', 'Latino', 'Mixed', 'Native'], dtype=object)

Gender



In [26]:

    
df.Gender.unique()









    Out[26]:





array(['M', 'Unknown', 'Male', 'M/F', 'Male/Female', 'Female'], dtype=object)



In [27]:

    
df.Gender.replace(['Male', 'Male/Female', 'Female', 'M/F'],
                  ['M', 'Unknown', 'F', 'Unknown'],
                  inplace=True)



In [28]:

    
df.Gender.unique()









    Out[28]:





array(['M', 'Unknown', 'F'], dtype=object)

Latitude



In [29]:

    
df.Latitude.head()









    Out[29]:





0    36.1813
1           
2           
3           
4           
Name: Latitude, dtype: object

Longitude



In [30]:

    
df.Longitude.head()









    Out[30]:





0   -115.134
1           
2           
3           
4           
Name: Longitude, dtype: object

Fill missing states based on coordinates



In [31]:

    
missing_state_latitudes = df.Latitude[df.State == ''].values
missing_state_longitudes = df.Longitude[df.State == ''].values



In [32]:

    
geotagger = geopy.GoogleV3()

def coordinates_to_state_name(latitude, longitude):
    try:
        query = f'{latitude}, {longitude}'
        result = geotagger.reverse(query)
        
        address_elements = result[1].address.split(',')
        state_string = address_elements[-2] 
        state_code = state_string.split()[0]
        state_name = us.states.lookup(state_code).name
        return state_name
    
    except IndexError:
        print('Missing state name', result)
    
    except Exception:
        print('Bad request', query)
    
    return ''



In [33]:

    
missing_state_names = [
    coordinates_to_state_name(latitude, longitude)
    for latitude, longitude in zip(missing_state_latitudes, missing_state_longitudes)
]









    



Bad request 39.114053000000006, -94.627464
Bad request 38.104085999999995, -122.25663700000001
Bad request 34.052234000000006, -118.24368500000001



In [34]:

    
df.State[df.State == ''] = missing_state_names









    



/home/maciej/miniconda3/envs/ed/lib/python3.6/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.

Gun laws

http://lawcenter.giffords.org/scorecard/

A - the strongest gun laws

F - the weakest gun laws



In [35]:

    
gun_laws = pd.read_csv("../data/gun_laws.csv", encoding="UTF-8")



In [36]:

    
gun_laws.columns









    Out[36]:





Index(['State', 'Gun law'], dtype='object')



In [37]:

    
df = df.merge(gun_laws, on='State')

Party affiliation

http://www.pewforum.org/religious-landscape-study/compare/party-affiliation/by/state/



In [38]:

    
party_affiliation = pd.read_csv("../data/party_affiliation.csv", encoding="UTF-8")



In [39]:

    
party_affiliation.columns









    Out[39]:





Index(['State', 'Republican lean', 'No lean', 'Democrat lean'], dtype='object')



In [40]:

    
df_pa = df.merge(party_affiliation, on='State')



In [41]:

    
republican = df_pa['Republican lean'] > df_pa['Democrat lean']
democrat = df_pa['Democrat lean'] > df_pa['Republican lean']

df['Party affiliation'] = 'Neutral'
df['Party affiliation'][democrat] = 'Democrat'
df['Party affiliation'][republican] = 'Republican'









    



/home/maciej/miniconda3/envs/ed/lib/python3.6/site-packages/ipykernel_launcher.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
/home/maciej/miniconda3/envs/ed/lib/python3.6/site-packages/ipykernel_launcher.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [42]:

    
set(party_affiliation.State) - set(df.State)









    Out[42]:





{'District of Columbia', 'New Hampshire', 'North Dakota', 'Rhode Island'}

Population



In [43]:

    
population = pd.read_csv("../data/population.csv", encoding="UTF-8")



In [44]:

    
population.columns









    Out[44]:





Index(['State', 'Population'], dtype='object')



In [45]:

    
df = df.merge(population, on='State')

3. Save



In [46]:

    
df.drop('Summary', axis=1, inplace=True)



In [47]:

    
df.to_csv("../data/Mass Shootings Dataset Ver 2 clean.csv", encoding = "ISO-8859-1", index=False)