The first two digits designate the economic sector, the third digit designates the subsector, the fourth digit designates the industry group, the fifth digit designates the NAICS industry, and the sixth digit designates the national industry. The 5-digit NAICS code is the level at which there is comparability in code and definitions for most of the NAICS sectors across the three countries participating in NAICS (the United States, Canada, and Mexico). The 6-digit level allows for the United States, Canada, and Mexico each to have country-specific detail. A complete and valid NAICS code contains six digits.
In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')
from ggplot import *
import re
import seaborn as sns
%matplotlib inline
from textwrap import wrap
In [2]:
sns.set(style="white", context="talk")
In [26]:
def open_file():
with open("fatalities.txt", "r") as ins:
array = []
for line in ins:
array.append(line)
return array
def clean_up(array):
for line_num,line in enumerate(array):
line = array[line_num].strip("\n") # Remove newline character
if "--" in line: # Change "--" to 0
line = re.sub("--", "0", line)
array[line_num] = line
return array
def create_df(array):
industry=[]
naics=[]
total=[]
violence=[]
transportation=[]
fires=[]
falls=[]
exposure=[]
contact=[]
regex = re.compile("[A-Za-z(]")
for line_num,line in enumerate(array):
words = line.split()
linetitle = []
for w in words:
if regex.match(w[0]):
words = words[1:]
linetitle.append(w)
industry_row=' '.join(linetitle)
for w_num,w in enumerate(words):
words[w_num]=w.replace(",","") # remove commas in numbers
if len(words)==8: # NAICS code is non-blank entry
if len(words[0])==3: # NAICS code is 3-digits, meaning sub-sector level
industry.append(industry_row)
naics.append(int(words[0]))
total.append(int(words[1]))
violence.append(int(words[2]))
transportation.append(int(words[3]))
fires.append(int(words[4]))
falls.append(int(words[5]))
exposure.append(int(words[6]))
contact.append(int(words[7]))
last_industry_name = industry_row
# Create pandas dataframe object from dictionary
d={'industry':industry,
'naics': naics,
'total': total,
'violence': violence,
'transportation': transportation,
'fires': fires,
'falls': falls,
'exposure': exposure,
'contact': contact}
df = pd.DataFrame(d)
cols = ['industry','naics','total','violence','transportation','fires','falls','exposure','contact']
df = df[cols]
return df
def modify_df(df):
df['violenceP']=df.violence/df.total
df['transportationP']=df.transportation/df.total
df['firesP']=df.fires/df.total
df['fallsP']=df.falls/df.total
df['exposureP']=df.exposure/df.total
df['contactP']=df.contact/df.total
df=df.replace([np.inf, -np.inf], np.nan) # replace any inf values with nan values
return df
def create_barchart(df,column_name,num,title_string):
tempdf=df.sort_values(by=column_name,ascending=False)[0:num]
tempdff=tempdf[['industry','violence','transportation','fires','falls','exposure','contact']]
tempdff=tempdff.set_index('industry')
a=tempdff.index
ax=tempdff.plot.barh(stacked=True,figsize=(8, 6),title=title_string);
labels = [ '\n'.join(wrap(l, 37)) for l in a ]
ax.set_xlabel("# of deaths")
ax.set_yticklabels(labels)
fig=ax.get_figure()
return fig
In [48]:
if __name__ == "__main__":
array = open_file()
array = clean_up(array)
df = create_df(array)
#df = modify_df(df)
In [49]:
df.head()
Out[49]:
In [43]:
fig=create_barchart(df,'total',10,"Top 10 sub-sectors with most fatalities, 2014")
fig.savefig('total_top10.png',bbox_inches='tight', dpi=300)
In [44]:
fig=create_barchart(df,'violence',10,"Top 10 sub-sectors with most VIOLENT fatalities, 2014")
fig.savefig('violence_top10.png',bbox_inches='tight', dpi=300)
In [45]:
fig=create_barchart(df,'violenceP',10,"Top 10 subsectors with \nhighest VIOLENT fatalities by proportion, 2014")
fig.savefig('violenceP_top10.png',bbox_inches='tight', dpi=300)
In [46]:
fig=create_barchart(df[df.total>20],'violenceP',10,"Top 10 subsectors with >20 total deaths AND\nhighest VIOLENT fatalities by proportion, 2014")
fig.savefig('violenceP_gr20_top10.png',bbox_inches='tight', dpi=300)
In [ ]: