2016-04-05-BadWizard-first-look



In [65]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
sns.set();


/Users/BadWizard/anaconda3/lib/python3.4/site-packages/matplotlib/__init__.py:872: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))

In [16]:
df = pd.read_excel('../data/coalpublic2013.xls',header=2,index_col=1)
df.head()


Out[16]:
Year Mine Name Mine State Mine County Mine Status Mine Type Company Type Operation Type Operating Company Operating Company Address Union Code Coal Supply Region Production (short tons) Average Employees Labor Hours
MSHA ID
103381 2013 Tacoa Highwall Miner Alabama Bibb Active, men working, not producing Surface Indepedent Producer Operator Mine only Jesse Creek Mining, Llc 1615 Kent Dairy Rd, Alabaster, AL 35007 Appalachia Southern 56004 10 22392
103404 2013 Reid School Mine Alabama Blount Permanently abandoned Surface Indepedent Producer Operator Mine only Taft Coal Sales & Associates, 3000 Riverchase Galleria Ste 1, Birmingham, AL... UNIT Appalachia Southern 28807 18 28447
100759 2013 North River #1 Underground Min Alabama Fayette Active, men working, not producing Underground Indepedent Producer Operator Mine and Preparation Plant Jim Walter Resources Inc 3114 County Rd 63 S, Berry, AL 35546 UNIT Appalachia Southern 1440115 183 474784
103246 2013 Bear Creek Alabama Franklin Active Surface Indepedent Producer Operator Mine only Birmingham Coal & Coke Co., In 912 Edenton Street, Birmingham, AL 35242 Appalachia Southern 87587 13 29193
103451 2013 Knight Mine Alabama Franklin Active Surface Indepedent Producer Operator Mine only Birmingham Coal & Coke Co., In P.O. Box 354, Lynn, AL 35242 Appalachia Southern 147499 27 46393

In [17]:
df['Company Type'].unique()


Out[17]:
array(['Indepedent Producer Operator', 'Operating Subsidiary', 'Contractor'], dtype=object)

In [18]:
df['Company Type'].replace(to_replace = 'Indepedent Producer Operator',
                           value = 'Independent Producer Operator',
                           inplace = True)

In [19]:
df['Company Type'].head()


Out[19]:
MSHA ID
103381    Independent Producer Operator
103404    Independent Producer Operator
100759    Independent Producer Operator
103246    Independent Producer Operator
103451    Independent Producer Operator
Name: Company Type, dtype: object

In [40]:
df.rename(columns=lambda x: x.replace(" ","_"),inplace=True)

In [41]:
df.columns


Out[41]:
Index(['Year', 'Mine_Name', 'Mine_State', 'Mine_County', 'Mine_Status',
       'Mine_Type', 'Company_Type', 'Operation_Type', 'Operating_Company',
       'Operating_Company_Address', 'Union_Code', 'Coal_Supply_Region',
       'Production_(short_tons)', 'Average_Employees', 'Labor_Hours'],
      dtype='object')

In [39]:
df1 = df.copy()
df1.rename(columns = lambda x: x.replace('Mine','Yours'),inplace=True)
df1.head()


Out[39]:
Year Yours_Name Yours_State Yours_County Yours_Status Yours_Type Company_Type Operation_Type Operating_Company Operating_Company Address Union_Code Coal_Supply Region Production_(short tons) Average_Employees Labor_Hours
MSHA ID
103381 2013 Tacoa Highwall Miner Alabama Bibb Active, men working, not producing Surface Independent Producer Operator Mine only Jesse Creek Mining, Llc 1615 Kent Dairy Rd, Alabaster, AL 35007 Appalachia Southern 56004 10 22392
103404 2013 Reid School Mine Alabama Blount Permanently abandoned Surface Independent Producer Operator Mine only Taft Coal Sales & Associates, 3000 Riverchase Galleria Ste 1, Birmingham, AL... UNIT Appalachia Southern 28807 18 28447
100759 2013 North River #1 Underground Min Alabama Fayette Active, men working, not producing Underground Independent Producer Operator Mine and Preparation Plant Jim Walter Resources Inc 3114 County Rd 63 S, Berry, AL 35546 UNIT Appalachia Southern 1440115 183 474784
103246 2013 Bear Creek Alabama Franklin Active Surface Independent Producer Operator Mine only Birmingham Coal & Coke Co., In 912 Edenton Street, Birmingham, AL 35242 Appalachia Southern 87587 13 29193
103451 2013 Knight Mine Alabama Franklin Active Surface Independent Producer Operator Mine only Birmingham Coal & Coke Co., In P.O. Box 354, Lynn, AL 35242 Appalachia Southern 147499 27 46393

In [32]:
df.shape


Out[32]:
(1450, 15)

In [35]:
plt.scatter(df.Average_Employees,df.Labor_Hours)
plt.xlabel('Number of employees')
plt.ylabel('Total Hours Worked')


Out[35]:
<matplotlib.text.Text at 0x115b076a0>

In [37]:
sns.regplot(df.Average_Employees,df.Labor_Hours)
plt.savefig("../figures/2016-04-05" + "-employees_vs_hours.png")



In [42]:
for column in df.columns:
    print(column)


Year
Mine_Name
Mine_State
Mine_County
Mine_Status
Mine_Type
Company_Type
Operation_Type
Operating_Company
Operating_Company_Address
Union_Code
Coal_Supply_Region
Production_(short_tons)
Average_Employees
Labor_Hours

In [43]:
plt.scatter(df.Labor_Hours, df['Production_(short_tons)'])


Out[43]:
<matplotlib.collections.PathCollection at 0x1196c3f60>

In [44]:
df['Production_(short_tons)'].hist()


Out[44]:
<matplotlib.axes._subplots.AxesSubplot at 0x1194db3c8>

In [45]:
min(df['Production_(short_tons)'])


Out[45]:
0

In [61]:
df[df['Production_(short_tons)']==0].shape


Out[61]:
(389, 15)

In [62]:
df[df['Production_(short_tons)']>0]


Out[62]:
(1061, 15)

In [63]:
# we are removing data here !
df = df[df['Production_(short_tons)']>0]

In [66]:
df['log_production'] = np.log(df['Production_(short_tons)'])


/Users/BadWizard/anaconda3/lib/python3.4/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [67]:
df.log_production.hist()


Out[67]:
<matplotlib.axes._subplots.AxesSubplot at 0x1199add30>

In [68]:
df.to_csv('../data/cleaned_coalpublic2013.csv')

In [ ]: