In [1]:
import pandas as pd
import missingno as msno
from matplotlib import pyplot as plt
In [2]:
# Load the 2015 and 2016 datasets from their CSV files (2015 first, then 2016).
stats2015, stats2016 = map(
    pd.read_csv,
    ("the-counted-revised-2015.csv", "the-counted-revised-2016.csv"),
)
In [3]:
# Plot a missingno bar chart for the 2015 data to see which columns are missing values.
msno.bar(stats2015)
In [4]:
# Same missing-data bar chart, this time for the 2016 data.
msno.bar(stats2016)
`streetaddress` appears to be the only column with missing data. That is good news: any of the other columns can safely be used to count records.
In [5]:
# Preview the first rows of the 2015 data to inspect its columns.
stats2015.head()
Out[5]:
In [6]:
# Preview the first rows of the 2016 data to inspect its columns.
stats2016.head()
Out[6]:
In [7]:
# Remove the superfluous 'Unnamed: 0' column from both frames
# (presumably a leftover index column from when the CSVs were written -- TODO confirm).
stats2015 = stats2015.drop('Unnamed: 0', axis='columns')
stats2016 = stats2016.drop('Unnamed: 0', axis='columns')
In [8]:
# Frequency table per year: number of (non-null) 'index' values
# within each 'armed' category.
armed2015 = stats2015['index'].groupby(stats2015['armed']).count()
armed2016 = stats2016['index'].groupby(stats2016['armed']).count()
In [9]:
# Show the 'armed' counts for both years; the trailing '\n' leaves a blank
# line between the 2015 and 2016 tables.
print('2015',armed2015,'\n')
print('2016',armed2016)
In [10]:
# Frequency table per year: number of (non-null) 'index' values
# within each 'raceethnicity' category.
race2015 = stats2015['index'].groupby(stats2015['raceethnicity']).count()
race2016 = stats2016['index'].groupby(stats2016['raceethnicity']).count()
In [11]:
# Show the 'raceethnicity' counts for both years; the trailing '\n' leaves a
# blank line between the 2015 and 2016 tables.
print('2015',race2015,'\n')
print('2016',race2016)
In [12]:
# Frequency table per year: number of (non-null) 'index' values
# within each 'classification' category.
how2015 = stats2015['index'].groupby(stats2015['classification']).count()
how2016 = stats2016['index'].groupby(stats2016['classification']).count()
In [13]:
# Show the 'classification' counts for both years; the trailing '\n' leaves a
# blank line between the 2015 and 2016 tables.
print('2015',how2015,'\n')
print('2016',how2016)
I noticed that only one record has a `raceethnicity` value of 'Other', so I decided to take a closer look at it.
In [14]:
# Display every 2015 row whose 'raceethnicity' is exactly 'Other'.
stats2015.loc[stats2015['raceethnicity'].eq('Other')]
Out[14]:
Looking through the database's sources, there seems to be no evidence that Mohamed Ibrahim belongs in the 'Other' race category. Therefore, I will change this record's `raceethnicity` value to 'Unknown'.
In [15]:
# Reclassify the record at row label 782 from 'Other' to 'Unknown'.
# `.ix` was deprecated in pandas 0.20 and removed in 1.0; `.loc` performs the
# same label-based assignment on this integer-indexed frame.
stats2015.loc[782, 'raceethnicity'] = 'Unknown'

# race2015 was aggregated before this correction, so rebuild it here;
# otherwise the race2015.json export would still carry the stale 'Other' category.
race2015 = stats2015.groupby('raceethnicity')['index'].count()
In [16]:
# Write each frequency table to its own JSON file, 2015 tables first and then 2016.
json_exports = (
    (armed2015, 'armed2015.json'),
    (race2015, 'race2015.json'),
    (how2015, 'how2015.json'),
    (armed2016, 'armed2016.json'),
    (race2016, 'race2016.json'),
    (how2016, 'how2016.json'),
)
for counts, filename in json_exports:
    counts.to_json(filename)