In [1]:
import pandas as pd
import numpy as np
import geopy
import us
In [2]:
df = pd.read_csv("../data/Mass Shootings Dataset Ver 2.csv", encoding = "ISO-8859-1")
df.replace([np.NaN], [''], inplace=True)
In [3]:
df.columns
Out[3]:
In [4]:
df.Title.head()
Out[4]:
In [5]:
df.Location.replace(['Washington D.C.', ''],
['Washington, Washington', ','],
inplace=True)
In [6]:
cities, raw_states = zip(*[value.rsplit(',', maxsplit=1) for value in df.Location.values])
In [7]:
df['Cities'] = cities
In [8]:
states = [us.states.lookup(raw_state.strip()).name if raw_state else '' for raw_state in raw_states]
In [9]:
df['State'] = states
examples of multiple cities per one location:
In [10]:
for location in df.Location.unique():
if len(location.split(',')) != 2:
print(location)
In [11]:
df.drop('Location', axis=1, inplace=True)
In [12]:
df.Date.head()
Out[12]:
In [13]:
df.Summary.head()
Out[13]:
In [14]:
df.Fatalities.unique()
Out[14]:
In [15]:
df.Injured.unique()
Out[15]:
In [16]:
df['Total victims'].unique()
Out[16]:
In [17]:
all(df.Fatalities.values + df.Injured.values == df['Total victims'].values)
Out[17]:
Total victims inconsistency:
In [18]:
for f, i, t, _ in zip(df.Fatalities.values, df.Injured.values, df['Total victims'].values, range(50)):
if f + i != t:
print(f, '+', i, '!=', t)
possible differences:
In [19]:
np.unique(df.Fatalities.values + df.Injured.values - df['Total victims'].values)
Out[19]:
In [20]:
df['Mental Health Issues'].unique()
Out[20]:
In [21]:
df['Mental Health Issues'].replace(['Unclear', 'Unknown', 'unknown'],
['Unknown', 'Unknown', 'Unknown'], inplace=True)
In [22]:
df['Mental Health Issues'].unique()
Out[22]:
In [23]:
df.Race.unique()
Out[23]:
In [24]:
df.Race.replace(['', 'Other', 'Black American or African American',
'White American or European American', 'Asian American',
'Some other race', 'Two or more races',
'Black American or African American/Unknown',
'White American or European American/Some other Race',
'Native American or Alaska Native', 'white', 'black',
'Asian American/Some other race'],
['Unknown', 'Unknown', 'Black', 'White', 'Asian',
'Unknown', 'Mixed', 'Black', 'White', 'Native',
'White', 'Black', 'Asian'],
inplace=True)
In [25]:
df.Race.unique()
Out[25]:
In [26]:
df.Gender.unique()
Out[26]:
In [27]:
df.Gender.replace(['Male', 'Male/Female', 'Female', 'M/F'],
['M', 'Unknown', 'F', 'Unknown'],
inplace=True)
In [28]:
df.Gender.unique()
Out[28]:
In [29]:
df.Latitude.head()
Out[29]:
In [30]:
df.Longitude.head()
Out[30]:
In [31]:
missing_state_latitudes = df.Latitude[df.State == ''].values
missing_state_longitudes = df.Longitude[df.State == ''].values
In [32]:
geotagger = geopy.GoogleV3()
def coordinates_to_state_name(latitude, longitude):
try:
query = f'{latitude}, {longitude}'
result = geotagger.reverse(query)
address_elements = result[1].address.split(',')
state_string = address_elements[-2]
state_code = state_string.split()[0]
state_name = us.states.lookup(state_code).name
return state_name
except IndexError:
print('Missing state name', result)
except Exception:
print('Bad request', query)
return ''
In [33]:
missing_state_names = [
coordinates_to_state_name(latitude, longitude)
for latitude, longitude in zip(missing_state_latitudes, missing_state_longitudes)
]
In [34]:
df.State[df.State == ''] = missing_state_names
In [35]:
gun_laws = pd.read_csv("../data/gun_laws.csv", encoding="UTF-8")
In [36]:
gun_laws.columns
Out[36]:
In [37]:
df = df.merge(gun_laws, on='State')
In [38]:
party_affiliation = pd.read_csv("../data/party_affiliation.csv", encoding="UTF-8")
In [39]:
party_affiliation.columns
Out[39]:
In [40]:
df_pa = df.merge(party_affiliation, on='State')
In [41]:
republican = df_pa['Republican lean'] > df_pa['Democrat lean']
democrat = df_pa['Democrat lean'] > df_pa['Republican lean']
df['Party affiliation'] = 'Neutral'
df['Party affiliation'][democrat] = 'Democrat'
df['Party affiliation'][republican] = 'Republican'
In [42]:
set(party_affiliation.State) - set(df.State)
Out[42]:
In [43]:
population = pd.read_csv("../data/population.csv", encoding="UTF-8")
In [44]:
population.columns
Out[44]:
In [45]:
df = df.merge(population, on='State')
In [46]:
df.drop('Summary', axis=1, inplace=True)
In [47]:
df.to_csv("../data/Mass Shootings Dataset Ver 2 clean.csv", encoding = "ISO-8859-1", index=False)