Workplace fatalities

2014 Census of Fatal Occupational Injuries (preliminary data)

Industry by event or exposure, 2014 (PDF 272K)

NAICS explanation:

The first two digits designate the economic sector, the third digit designates the subsector, the fourth digit designates the industry group, the fifth digit designates the NAICS industry, and the sixth digit designates the national industry. The 5-digit NAICS code is the level at which there is comparability in code and definitions for most of the NAICS sectors across the three countries participating in NAICS (the United States, Canada, and Mexico). The 6-digit level allows for the United States, Canada, and Mexico each to have country-specific detail. A complete and valid NAICS code contains six digits.



In [30]:

    
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')
from ggplot import *
import re
import seaborn as sns
%matplotlib inline
from textwrap import wrap



In [2]:

    
sns.set(style="white", context="talk")



In [26]:

    
def open_file():
    with open("fatalities.txt", "r") as ins:
        array = []
        for line in ins:
            array.append(line)
    return array

def clean_up(array):
    for line_num,line in enumerate(array):
        line = array[line_num].strip("\n")      # Remove newline character
        if "--" in line:                        # Change "--" to 0
            line = re.sub("--", "0", line)
        array[line_num] = line
    return array
    
def create_df(array):
    industry=[]
    naics=[]
    total=[]
    violence=[]
    transportation=[]
    fires=[]
    falls=[]
    exposure=[]
    contact=[]
    
    regex = re.compile("[A-Za-z(]")

    for line_num,line in enumerate(array):
        words = line.split()
        linetitle = []            
        for w in words:
            if regex.match(w[0]):
                words = words[1:]
                linetitle.append(w)
                industry_row=' '.join(linetitle)
        for w_num,w in enumerate(words):
            words[w_num]=w.replace(",","")      # remove commas in numbers
        if len(words)==8:                       # NAICS code is non-blank entry
            if len(words[0])==3:                # NAICS code is 3-digits, meaning sub-sector level
                industry.append(industry_row)
                naics.append(int(words[0]))
                total.append(int(words[1]))
                violence.append(int(words[2]))
                transportation.append(int(words[3]))
                fires.append(int(words[4]))
                falls.append(int(words[5]))
                exposure.append(int(words[6]))
                contact.append(int(words[7]))
                last_industry_name = industry_row
                
    # Create pandas dataframe object from dictionary
    d={'industry':industry,
       'naics': naics,
       'total': total,
       'violence': violence,
       'transportation': transportation,
       'fires': fires,
       'falls': falls,
       'exposure': exposure,
       'contact': contact}
    df = pd.DataFrame(d)
    cols = ['industry','naics','total','violence','transportation','fires','falls','exposure','contact']
    df = df[cols]
    return df

def modify_df(df):
    df['violenceP']=df.violence/df.total
    df['transportationP']=df.transportation/df.total
    df['firesP']=df.fires/df.total
    df['fallsP']=df.falls/df.total
    df['exposureP']=df.exposure/df.total
    df['contactP']=df.contact/df.total
    df=df.replace([np.inf, -np.inf], np.nan)   # replace any inf values with nan values
    return df

def create_barchart(df,column_name,num,title_string):
    tempdf=df.sort_values(by=column_name,ascending=False)[0:num]
    tempdff=tempdf[['industry','violence','transportation','fires','falls','exposure','contact']]
    tempdff=tempdff.set_index('industry')
    a=tempdff.index
    ax=tempdff.plot.barh(stacked=True,figsize=(8, 6),title=title_string);
    labels = [ '\n'.join(wrap(l, 37)) for l in a ]
    ax.set_xlabel("# of deaths")
    ax.set_yticklabels(labels)
    fig=ax.get_figure()
    return fig



In [48]:

    
if __name__ == "__main__":
    array = open_file()
    array = clean_up(array)
    df = create_df(array)
    #df = modify_df(df)



In [49]:

    
df.head()









    Out[49]:






  
    
      
      industry
      naics
      total
      violence
      transportation
      fires
      falls
      exposure
      contact
    
  
  
    
      0
      Crop production
      111
      248
      15
      126
      5
      28
      6
      68
    
    
      1
      Animal production and aquaculture
      112
      156
      22
      71
      4
      12
      3
      44
    
    
      2
      Forestry and logging
      113
      92
      0
      21
      0
      4
      0
      64
    
    
      3
      Fishing, hunting and trapping
      114
      25
      0
      19
      0
      0
      3
      0
    
    
      4
      Support activities for agriculture and forestry
      115
      43
      0
      27
      0
      0
      4
      8



In [43]:

    
fig=create_barchart(df,'total',10,"Top 10 sub-sectors with most fatalities, 2014")
fig.savefig('total_top10.png',bbox_inches='tight', dpi=300)



In [44]:

    
fig=create_barchart(df,'violence',10,"Top 10 sub-sectors with most VIOLENT fatalities, 2014")
fig.savefig('violence_top10.png',bbox_inches='tight', dpi=300)



In [45]:

    
fig=create_barchart(df,'violenceP',10,"Top 10 subsectors with \nhighest VIOLENT fatalities by proportion, 2014")
fig.savefig('violenceP_top10.png',bbox_inches='tight', dpi=300)



In [46]:

    
fig=create_barchart(df[df.total>20],'violenceP',10,"Top 10 subsectors with >20 total deaths AND\nhighest VIOLENT fatalities by proportion, 2014")
fig.savefig('violenceP_gr20_top10.png',bbox_inches='tight', dpi=300)



In [ ]:

	industry	naics	total	violence	transportation	fires	falls	exposure	contact
0	Crop production	111	248	15	126	5	28	6	68
1	Animal production and aquaculture	112	156	22	71	4	12	3	44
2	Forestry and logging	113	92	0	21	0	4	0	64
3	Fishing, hunting and trapping	114	25	0	19	0	0	3	0
4	Support activities for agriculture and forestry	115	43	0	27	0	0	4	8