In [47]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
sns.set()

In [10]:
# Load the 2013 coal production workbook. header=2 takes the column names
# from the third row of the sheet, and each record is indexed by "MSHA ID".
coal_xls_path = "../data/coalpublic2013.xls"
df = pd.read_excel(coal_xls_path, header=2, index_col="MSHA ID")

In [11]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)


Out[11]:
Year Mine Name Mine State Mine County Mine Status Mine Type Company Type Operation Type Operating Company Operating Company Address Union Code Coal Supply Region Production (short tons) Average Employees Labor Hours
MSHA ID
103381 2013 Tacoa Highwall Miner Alabama Bibb Active, men working, not producing Surface Indepedent Producer Operator Mine only Jesse Creek Mining, Llc 1615 Kent Dairy Rd, Alabaster, AL 35007 Appalachia Southern 56004 10 22392
103404 2013 Reid School Mine Alabama Blount Permanently abandoned Surface Indepedent Producer Operator Mine only Taft Coal Sales & Associates, 3000 Riverchase Galleria Ste 1, Birmingham, AL... UNIT Appalachia Southern 28807 18 28447
100759 2013 North River #1 Underground Min Alabama Fayette Active, men working, not producing Underground Indepedent Producer Operator Mine and Preparation Plant Jim Walter Resources Inc 3114 County Rd 63 S, Berry, AL 35546 UNIT Appalachia Southern 1440115 183 474784
103246 2013 Bear Creek Alabama Franklin Active Surface Indepedent Producer Operator Mine only Birmingham Coal & Coke Co., In 912 Edenton Street, Birmingham, AL 35242 Appalachia Southern 87587 13 29193
103451 2013 Knight Mine Alabama Franklin Active Surface Indepedent Producer Operator Mine only Birmingham Coal & Coke Co., In P.O. Box 354, Lynn, AL 35242 Appalachia Southern 147499 27 46393

In [12]:
# List the distinct labels in the "Company Type" column to spot bad values.
pd.unique(df['Company Type'])


Out[12]:
array([u'Indepedent Producer Operator', u'Operating Subsidiary',
       u'Contractor'], dtype=object)

In [18]:
# Correct the misspelled category label ("Indepedent" -> "Independent").
# The result is assigned back to the column rather than calling
# replace(inplace=True) on df['Company Type']: the in-place form mutates an
# intermediate Series, which is not guaranteed to propagate to df (it warns
# on recent pandas and does nothing under Copy-on-Write).
df['Company Type'] = df['Company Type'].replace(
    to_replace='Indepedent Producer Operator',
    value='Independent Producer Operator')

In [19]:
# Show the first five rows again to check the "Company Type" spelling.
df.iloc[:5]


Out[19]:
Year Mine Name Mine State Mine County Mine Status Mine Type Company Type Operation Type Operating Company Operating Company Address Union Code Coal Supply Region Production (short tons) Average Employees Labor Hours
MSHA ID
103381 2013 Tacoa Highwall Miner Alabama Bibb Active, men working, not producing Surface Independent Producer Operator Mine only Jesse Creek Mining, Llc 1615 Kent Dairy Rd, Alabaster, AL 35007 Appalachia Southern 56004 10 22392
103404 2013 Reid School Mine Alabama Blount Permanently abandoned Surface Independent Producer Operator Mine only Taft Coal Sales & Associates, 3000 Riverchase Galleria Ste 1, Birmingham, AL... UNIT Appalachia Southern 28807 18 28447
100759 2013 North River #1 Underground Min Alabama Fayette Active, men working, not producing Underground Independent Producer Operator Mine and Preparation Plant Jim Walter Resources Inc 3114 County Rd 63 S, Berry, AL 35546 UNIT Appalachia Southern 1440115 183 474784
103246 2013 Bear Creek Alabama Franklin Active Surface Independent Producer Operator Mine only Birmingham Coal & Coke Co., In 912 Edenton Street, Birmingham, AL 35242 Appalachia Southern 87587 13 29193
103451 2013 Knight Mine Alabama Franklin Active Surface Independent Producer Operator Mine only Birmingham Coal & Coke Co., In P.O. Box 354, Lynn, AL 35242 Appalachia Southern 147499 27 46393

In [24]:
# Swap spaces for underscores in every column label so the columns can be
# reached with attribute access (e.g. df.Labor_Hours). The index name is
# left as it is.
df.columns = [label.replace(" ", "_") for label in df.columns]

In [25]:
# Preview the first five rows to confirm the underscore column names.
df.head(5)


Out[25]:
Year Mine_Name Mine_State Mine_County Mine_Status Mine_Type Company_Type Operation_Type Operating_Company Operating_Company_Address Union_Code Coal_Supply_Region Production_(short_tons) Average_Employees Labor_Hours
MSHA ID
103381 2013 Tacoa Highwall Miner Alabama Bibb Active, men working, not producing Surface Independent Producer Operator Mine only Jesse Creek Mining, Llc 1615 Kent Dairy Rd, Alabaster, AL 35007 Appalachia Southern 56004 10 22392
103404 2013 Reid School Mine Alabama Blount Permanently abandoned Surface Independent Producer Operator Mine only Taft Coal Sales & Associates, 3000 Riverchase Galleria Ste 1, Birmingham, AL... UNIT Appalachia Southern 28807 18 28447
100759 2013 North River #1 Underground Min Alabama Fayette Active, men working, not producing Underground Independent Producer Operator Mine and Preparation Plant Jim Walter Resources Inc 3114 County Rd 63 S, Berry, AL 35546 UNIT Appalachia Southern 1440115 183 474784
103246 2013 Bear Creek Alabama Franklin Active Surface Independent Producer Operator Mine only Birmingham Coal & Coke Co., In 912 Edenton Street, Birmingham, AL 35242 Appalachia Southern 87587 13 29193
103451 2013 Knight Mine Alabama Franklin Active Surface Independent Producer Operator Mine only Birmingham Coal & Coke Co., In P.O. Box 354, Lynn, AL 35242 Appalachia Southern 147499 27 46393

In [26]:
# Number of rows currently in the frame.
len(df.index)


Out[26]:
1450

In [29]:
# Scatter of average employee count against labor hours, one point per row.
employees = df['Average_Employees']
hours = df['Labor_Hours']
plt.scatter(employees, hours)
plt.xlabel("Number of Employees")
plt.ylabel("Total Hours Worked")


Out[29]:
<matplotlib.text.Text at 0xd581470>

In [31]:
# Scatter plus fitted regression line of labor hours against employee count,
# saved under ../figures with a date-prefixed filename.
# x and y are passed by keyword: newer seaborn releases make them
# keyword-only and reject the positional form, while the keyword form is
# also accepted by older releases.
sns.regplot(x=df.Average_Employees, y=df.Labor_Hours)
plt.savefig("../figures/2017-09-03" + "-employees_vs_hours.png")



In [32]:
# Print every column label, one per line.
# print(...) with a single argument produces the same output on Python 2
# and Python 3, whereas the bare `print column` statement is a SyntaxError
# on Python 3.
for column in df.columns:
    print(column)


Year
Mine_Name
Mine_State
Mine_County
Mine_Status
Mine_Type
Company_Type
Operation_Type
Operating_Company
Operating_Company_Address
Union_Code
Coal_Supply_Region
Production_(short_tons)
Average_Employees
Labor_Hours

In [34]:
# Scatter of labor hours (x) against production in short tons (y).
plt.scatter(x=df['Labor_Hours'], y=df['Production_(short_tons)'])


Out[34]:
<matplotlib.collections.PathCollection at 0xe3d6e80>

In [36]:
# Histogram of raw production using the default of 10 bins.
df['Production_(short_tons)'].hist(bins=10)


Out[36]:
<matplotlib.axes._subplots.AxesSubplot at 0xe67dcc0>

In [45]:
# Keep only rows with strictly positive production; rows where production
# is <= 0 are dropped.
# .copy() makes the filtered frame independent of the original, so adding
# columns to it later does not trigger pandas' SettingWithCopyWarning.
df = df[df['Production_(short_tons)'] > 0].copy()

In [46]:
# Histogram of production again, now that non-positive rows are gone.
df['Production_(short_tons)'].hist(bins=10)


Out[46]:
<matplotlib.axes._subplots.AxesSubplot at 0xde4d128>

In [48]:
# Add a column holding the natural log of production (short tons).
production = df['Production_(short_tons)']
df['log_production'] = np.log(production)

In [51]:
# Histogram of the logged production values. The original note labels this
# a log-normal distribution, i.e. the logged values are expected to look
# roughly bell-shaped.
df['log_production'].hist(bins=10)


Out[51]:
<matplotlib.axes._subplots.AxesSubplot at 0xf9cfb38>

In [55]:
# Write the cleaned frame (including its index) to CSV for later use.
cleaned_csv_path = "../data/cleaned_coalpublic2013.csv"
df.to_csv(cleaned_csv_path)

In [ ]: