1. Load


In [1]:
import pandas as pd
import numpy as np
import geopy
import us

In [2]:
df = pd.read_csv("../data/Mass Shootings Dataset Ver 2.csv", encoding = "ISO-8859-1")
df.replace([np.NaN], [''], inplace=True)

In [3]:
df.columns


Out[3]:
Index(['S#', 'Title', 'Location', 'Date', 'Summary', 'Fatalities', 'Injured',
       'Total victims', 'Mental Health Issues', 'Race', 'Gender', 'Latitude',
       'Longitude'],
      dtype='object')

2. Process

Title


In [4]:
df.Title.head()


Out[4]:
0           Las Vegas Strip mass shooting
1              San Francisco UPS shooting
2       Pennsylvania supermarket shooting
3    Florida awning manufacturer shooting
4        Rural Ohio nursing home shooting
Name: Title, dtype: object

Location


In [5]:
df.Location.replace(['Washington D.C.', ''],
                    ['Washington, Washington', ','],
                    inplace=True)

In [6]:
cities, raw_states = zip(*[value.rsplit(',', maxsplit=1) for value in df.Location.values])

In [7]:
df['Cities'] = cities

In [8]:
states = [us.states.lookup(raw_state.strip()).name if raw_state else '' for raw_state in raw_states]

In [9]:
df['State'] = states

examples of multiple cities per one location:


In [10]:
for location in df.Location.unique():
    if len(location.split(',')) != 2:
        print(location)


Pennsburg, Souderton, Lansdale, Harleysville, Pennsylvania
South Valley, Albuquerque, New Mexico
Nickel Mines, Lancaster, Pennsylvania
Santee, San Diego, California

In [11]:
df.drop('Location', axis=1, inplace=True)

Date


In [12]:
df.Date.head()


Out[12]:
0    10/1/2017
1    6/14/2017
2     6/7/2017
3     6/5/2017
4    5/12/2017
Name: Date, dtype: object

Summary


In [13]:
df.Summary.head()


Out[13]:
0                                                     
1    Jimmy Lam, 38, fatally shot three coworkers an...
2    Randy Stair, a 24-year-old worker at Weis groc...
3    John Robert Neumann, Jr., 45, a former employe...
4    Thomas Hartless, 43, shot and killed a former ...
Name: Summary, dtype: object

Fatalities


In [14]:
df.Fatalities.unique()


Out[14]:
array([58,  3,  5, 49,  0,  1,  4,  2,  6, 14, 16, 10,  8,  9,  7, 13, 28,
       12, 11, 32, 15, 24, 22, 17])

Injured


In [15]:
df.Injured.unique()


Out[15]:
array([527,   2,   0,   6,   3,  11,  53,   4,   5,   1,  14,  10,   7,
        21,   9,  13,  12,  16,   8,  15,  70,  32,  23,  24,  25,  19,
        20,  30])

Total victims


In [16]:
df['Total victims'].unique()


Out[16]:
array([585,   5,   3,  11,   6,  16, 102,   4,   9,   8,   7,  17,  12,
        35,  10,  19,  15,  29,  82,  45,  18,  26,  13,  55,  14,  21,
        37,  27,  25,  43,  20,  40,  22,  48])

In [17]:
all(df.Fatalities.values + df.Injured.values == df['Total victims'].values)


Out[17]:
False

Total victims inconsistency:


In [18]:
for f, i, t, _ in zip(df.Fatalities.values, df.Injured.values, df['Total victims'].values, range(50)):
    if f + i != t:
        print(f, '+', i, '!=', t)


1 + 4 != 4
0 + 3 != 4
3 + 1 != 3
2 + 2 != 3
2 + 2 != 3
2 + 2 != 3
4 + 0 != 3
4 + 0 != 3
2 + 2 != 3
5 + 0 != 4
4 + 14 != 17

possible differences:


In [19]:
np.unique(df.Fatalities.values + df.Injured.values - df['Total victims'].values)


Out[19]:
array([-3, -2, -1,  0,  1,  2])

Mental Health Issues


In [20]:
df['Mental Health Issues'].unique()


Out[20]:
array(['Unclear', 'Yes', 'Unknown', 'No', 'unknown'], dtype=object)

In [21]:
df['Mental Health Issues'].replace(['Unclear', 'Unknown', 'unknown'], 
                                   ['Unknown', 'Unknown', 'Unknown'], inplace=True)

In [22]:
df['Mental Health Issues'].unique()


Out[22]:
array(['Unknown', 'Yes', 'No'], dtype=object)

Race


In [23]:
df.Race.unique()


Out[23]:
array(['White', 'Asian', '', 'Black', 'Latino', 'Other', 'Unknown',
       'Black American or African American',
       'White American or European American', 'Asian American',
       'Some other race', 'Two or more races',
       'Black American or African American/Unknown',
       'White American or European American/Some other Race',
       'Native American or Alaska Native', 'white', 'black',
       'Asian American/Some other race'], dtype=object)

In [24]:
df.Race.replace(['', 'Other', 'Black American or African American',
                'White American or European American', 'Asian American',
                'Some other race', 'Two or more races',
                'Black American or African American/Unknown',
                'White American or European American/Some other Race',
                'Native American or Alaska Native', 'white', 'black',
                'Asian American/Some other race'],

                ['Unknown', 'Unknown', 'Black', 'White', 'Asian', 
                 'Unknown', 'Mixed', 'Black', 'White', 'Native',
                 'White', 'Black', 'Asian'], 
                
                inplace=True)

In [25]:
df.Race.unique()


Out[25]:
array(['White', 'Asian', 'Unknown', 'Black', 'Latino', 'Mixed', 'Native'], dtype=object)

Gender


In [26]:
df.Gender.unique()


Out[26]:
array(['M', 'Unknown', 'Male', 'M/F', 'Male/Female', 'Female'], dtype=object)

In [27]:
df.Gender.replace(['Male', 'Male/Female', 'Female', 'M/F'],
                  ['M', 'Unknown', 'F', 'Unknown'],
                  inplace=True)

In [28]:
df.Gender.unique()


Out[28]:
array(['M', 'Unknown', 'F'], dtype=object)

Latitude


In [29]:
df.Latitude.head()


Out[29]:
0    36.1813
1           
2           
3           
4           
Name: Latitude, dtype: object

Longitude


In [30]:
df.Longitude.head()


Out[30]:
0   -115.134
1           
2           
3           
4           
Name: Longitude, dtype: object

Fill missing states based on coordinates


In [31]:
missing_state_latitudes = df.Latitude[df.State == ''].values
missing_state_longitudes = df.Longitude[df.State == ''].values

In [32]:
geotagger = geopy.GoogleV3()

def coordinates_to_state_name(latitude, longitude):
    try:
        query = f'{latitude}, {longitude}'
        result = geotagger.reverse(query)
        
        address_elements = result[1].address.split(',')
        state_string = address_elements[-2] 
        state_code = state_string.split()[0]
        state_name = us.states.lookup(state_code).name
        return state_name
    
    except IndexError:
        print('Missing state name', result)
    
    except Exception:
        print('Bad request', query)
    
    return ''

In [33]:
missing_state_names = [
    coordinates_to_state_name(latitude, longitude)
    for latitude, longitude in zip(missing_state_latitudes, missing_state_longitudes)
]


Bad request 39.114053000000006, -94.627464
Bad request 38.104085999999995, -122.25663700000001
Bad request 34.052234000000006, -118.24368500000001

In [34]:
df.State[df.State == ''] = missing_state_names


/home/maciej/miniconda3/envs/ed/lib/python3.6/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.

Gun laws

http://lawcenter.giffords.org/scorecard/

A - the strongest gun laws

F - the weakest gun laws


In [35]:
gun_laws = pd.read_csv("../data/gun_laws.csv", encoding="UTF-8")

In [36]:
gun_laws.columns


Out[36]:
Index(['State', 'Gun law'], dtype='object')

In [37]:
df = df.merge(gun_laws, on='State')

Party affiliation


In [38]:
party_affiliation = pd.read_csv("../data/party_affiliation.csv", encoding="UTF-8")

In [39]:
party_affiliation.columns


Out[39]:
Index(['State', 'Republican lean', 'No lean', 'Democrat lean'], dtype='object')

In [40]:
df_pa = df.merge(party_affiliation, on='State')

In [41]:
republican = df_pa['Republican lean'] > df_pa['Democrat lean']
democrat = df_pa['Democrat lean'] > df_pa['Republican lean']

df['Party affiliation'] = 'Neutral'
df['Party affiliation'][democrat] = 'Democrat'
df['Party affiliation'][republican] = 'Republican'


/home/maciej/miniconda3/envs/ed/lib/python3.6/site-packages/ipykernel_launcher.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
/home/maciej/miniconda3/envs/ed/lib/python3.6/site-packages/ipykernel_launcher.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  

In [42]:
set(party_affiliation.State) - set(df.State)


Out[42]:
{'District of Columbia', 'New Hampshire', 'North Dakota', 'Rhode Island'}

Population


In [43]:
population = pd.read_csv("../data/population.csv", encoding="UTF-8")

In [44]:
population.columns


Out[44]:
Index(['State', 'Population'], dtype='object')

In [45]:
df = df.merge(population, on='State')

3. Save


In [46]:
df.drop('Summary', axis=1, inplace=True)

In [47]:
df.to_csv("../data/Mass Shootings Dataset Ver 2 clean.csv", encoding = "ISO-8859-1", index=False)