Make a prediction about the coal production



In [3]:

    
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
sns.set();



In [4]:

    
df = pd.read_csv("../data/cleaned_coalpublic2013.csv", index_col='MSHA ID')
df.head()









    Out[4]:






  
    
      
      Year
      Mine_Name
      Mine_State
      Mine_County
      Mine_Status
      Mine_Type
      Company_Type
      Operation_Type
      Operating_Company
      Operating_Company_Address
      Union_Code
      Coal_Supply_Region
      Production_(short_tons)
      Average_Employees
      Labor_Hours
      log_production
    
    
      MSHA ID
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      103381
      2013
      Tacoa Highwall Miner
      Alabama
      Bibb
      Active, men working, not producing
      Surface
      Independent Producer Operator
      Mine only
      Jesse Creek Mining, Llc
      1615 Kent Dairy Rd, Alabaster, AL 35007
      
      Appalachia Southern
      56004
      10
      22392
      10.933178
    
    
      103404
      2013
      Reid School Mine
      Alabama
      Blount
      Permanently abandoned
      Surface
      Independent Producer Operator
      Mine only
      Taft Coal Sales & Associates,
      3000 Riverchase Galleria Ste 1, Birmingham, AL...
      UNIT
      Appalachia Southern
      28807
      18
      28447
      10.268374
    
    
      100759
      2013
      North River #1 Underground Min
      Alabama
      Fayette
      Active, men working, not producing
      Underground
      Independent Producer Operator
      Mine and Preparation Plant
      Jim Walter Resources Inc
      3114 County Rd 63 S, Berry, AL 35546
      UNIT
      Appalachia Southern
      1440115
      183
      474784
      14.180234
    
    
      103246
      2013
      Bear Creek
      Alabama
      Franklin
      Active
      Surface
      Independent Producer Operator
      Mine only
      Birmingham Coal & Coke Co., In
      912 Edenton Street, Birmingham, AL 35242
      
      Appalachia Southern
      87587
      13
      29193
      11.380388
    
    
      103451
      2013
      Knight Mine
      Alabama
      Franklin
      Active
      Surface
      Independent Producer Operator
      Mine only
      Birmingham Coal & Coke Co., In
      P.O. Box 354, Lynn, AL 35242
      
      Appalachia Southern
      147499
      27
      46393
      11.901577



In [5]:

    
len(df)









    Out[5]:





1061



In [6]:

    
for column in df.columns:
    print column









    



Year
Mine_Name
Mine_State
Mine_County
Mine_Status
Mine_Type
Company_Type
Operation_Type
Operating_Company
Operating_Company_Address
Union_Code
Coal_Supply_Region
Production_(short_tons)
Average_Employees
Labor_Hours
log_production



In [7]:

    
df.log_production.hist()









    Out[7]:





<matplotlib.axes._subplots.AxesSubplot at 0x10c008410>



In [8]:

    
df.Mine_Status.unique()









    Out[8]:





array(['Active, men working, not producing', 'Permanently abandoned',
       'Active', 'Temporarily closed', 'New, under construction'], dtype=object)



In [9]:

    
df[['Mine_Status', 'log_production']].groupby('Mine_Status').mean()









    Out[9]:






  
    
      
      log_production
    
    
      Mine_Status
      
    
  
  
    
      Active
      11.977453
    
    
      Active, men working, not producing
      10.499962
    
    
      New, under construction
      3.951244
    
    
      Permanently abandoned
      9.896046
    
    
      Temporarily closed
      9.162933

Predict the Production of coal mines



In [10]:

    
for column in df.columns:
    print column









    



Year
Mine_Name
Mine_State
Mine_County
Mine_Status
Mine_Type
Company_Type
Operation_Type
Operating_Company
Operating_Company_Address
Union_Code
Coal_Supply_Region
Production_(short_tons)
Average_Employees
Labor_Hours
log_production



In [11]:

    
df.Year.unique()









    Out[11]:





array([2013])



In [12]:

    
features = ['Average_Employees',
            'Labor_Hours',
           ]

categoricals = ['Mine_State',
                'Mine_County',
                'Mine_Status',
                'Mine_Type',
                'Company_Type',
                'Operation_Type',
                'Union_Code',
                'Coal_Supply_Region',
               ]

target = 'log_production'



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [14]:

    
fig = plt.subplots(figsize=(14,8))
sns.set_context('poster')
sns.violinplot(y="Mine_Status", x="log_production", data=df,
               split=True, inner="stick", )
plt.tight_layout()



In [15]:

    
fig = plt.subplots(figsize=(14,8))
sns.set_context('poster')
sns.violinplot(y="Company_Type", x="log_production", data=df, 
               split=True, inner="stick");
plt.tight_layout()



In [16]:

    
df['Company_Type'].unique()









    Out[16]:





array(['Independent Producer Operator', 'Operating Subsidiary',
       'Contractor'], dtype=object)



In [18]:

    
pd.get_dummies(df['Company_Type']).sample(50).head()









    Out[18]:






  
    
      
      Contractor
      Independent Producer Operator
      Operating Subsidiary
    
    
      MSHA ID
      
      
      
    
  
  
    
      103419
      0
      1
      0
    
    
      3601262
      0
      1
      0
    
    
      4407123
      0
      0
      1
    
    
      3609605
      0
      1
      0
    
    
      2302262
      0
      1
      0



In [19]:

    
dummy_categoricals = []
for categorical in categoricals:
    print categorical, len(df[categorical].unique())
    # Avoid the dummy variable trap!
    drop_var = sorted(df[categorical].unique())[-1]
    temp_df = pd.get_dummies(df[categorical], prefix=categorical)
    df = pd.concat([df, temp_df], axis=1)
    temp_df.drop('_'.join([categorical, str(drop_var)]), axis=1, inplace=True)
    dummy_categoricals += temp_df.columns.tolist()









    



Mine_State 29
Mine_County 164
Mine_Status 5
Mine_Type 3
Company_Type 3
Operation_Type 2
Union_Code 7
Coal_Supply_Region 8



In [20]:

    
dummy_categoricals[:10]









    Out[20]:





['Mine_State_Alabama',
 'Mine_State_Alaska',
 'Mine_State_Arizona',
 'Mine_State_Arkansas',
 'Mine_State_Colorado',
 'Mine_State_Illinois',
 'Mine_State_Indiana',
 'Mine_State_Kansas',
 'Mine_State_Kentucky (East)',
 'Mine_State_Kentucky (West)']



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:

Build our model



In [21]:

    
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestRegressor



In [22]:

    
len(dummy_categoricals)









    Out[22]:





213



In [23]:

    
train, test = train_test_split(df, test_size=0.3)



In [ ]:



In [25]:

    
rf = RandomForestRegressor(n_estimators=100, oob_score=True)



In [27]:

    
rf.fit(train[features + dummy_categoricals], train[target])









    Out[27]:





RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=1, oob_score=True, random_state=None,
           verbose=0, warm_start=False)



In [29]:

    
fig = plt.subplots(figsize=(8,8))
sns.regplot(test[target], rf.predict(test[features + dummy_categoricals]))
plt.ylabel("Predicted production")
plt.xlim(0, 22)
plt.ylim(0, 22)
plt.tight_layout()



In [30]:

    
from sklearn.metrics import explained_variance_score, r2_score, mean_squared_error



In [31]:

    
predicted = rf.predict(test[features + dummy_categoricals])
r2_score(test[target], predicted)









    Out[31]:





0.88433913539444742



In [32]:

    
explained_variance_score(test[target], predicted)









    Out[32]:





0.88455128068585942



In [33]:

    
mean_squared_error(test[target], predicted)









    Out[33]:





0.54444109401274343



In [34]:

    
rf_importances = pd.DataFrame({'name':train[features + dummy_categoricals].columns,
                               'importance':rf.feature_importances_
                              }).sort_values(by='importance', 
                                              ascending=False).reset_index(drop=True)
rf_importances.head(20)









    Out[34]:






  
    
      
      importance
      name
    
  
  
    
      0
      0.846915
      Labor_Hours
    
    
      1
      0.048470
      Average_Employees
    
    
      2
      0.009491
      Mine_Type_Surface
    
    
      3
      0.006033
      Mine_County_Campbell
    
    
      4
      0.005222
      Coal_Supply_Region_Powder River Basin
    
    
      5
      0.004416
      Mine_Status_Active
    
    
      6
      0.003813
      Mine_State_West Virginia (Southern)
    
    
      7
      0.003739
      Mine_County_Martin
    
    
      8
      0.003676
      Mine_County_Boone
    
    
      9
      0.003028
      Coal_Supply_Region_Illinois Basin
    
    
      10
      0.002612
      Coal_Supply_Region_Appalachia Central
    
    
      11
      0.002508
      Mine_County_Schuylkill
    
    
      12
      0.002293
      Mine_County_Grant
    
    
      13
      0.002258
      Mine_County_Buchanan
    
    
      14
      0.001972
      Company_Type_Independent Producer Operator
    
    
      15
      0.001959
      Operation_Type_Mine and Preparation Plant
    
    
      16
      0.001931
      Mine_State_Pennsylvania (Anthracite)
    
    
      17
      0.001727
      Mine_Status_Active, men working, not producing
    
    
      18
      0.001577
      Mine_Type_Refuse
    
    
      19
      0.001556
      Coal_Supply_Region_Appalachia Northern



In [ ]:

	Year	Mine_Name	Mine_State	Mine_County	Mine_Status	Mine_Type	Company_Type	Operation_Type	Operating_Company	Operating_Company_Address	Union_Code	Coal_Supply_Region	Production_(short_tons)	Average_Employees	Labor_Hours	log_production
MSHA ID
103381	2013	Tacoa Highwall Miner	Alabama	Bibb	Active, men working, not producing	Surface	Independent Producer Operator	Mine only	Jesse Creek Mining, Llc	1615 Kent Dairy Rd, Alabaster, AL 35007		Appalachia Southern	56004	10	22392	10.933178
103404	2013	Reid School Mine	Alabama	Blount	Permanently abandoned	Surface	Independent Producer Operator	Mine only	Taft Coal Sales & Associates,	3000 Riverchase Galleria Ste 1, Birmingham, AL...	UNIT	Appalachia Southern	28807	18	28447	10.268374
100759	2013	North River #1 Underground Min	Alabama	Fayette	Active, men working, not producing	Underground	Independent Producer Operator	Mine and Preparation Plant	Jim Walter Resources Inc	3114 County Rd 63 S, Berry, AL 35546	UNIT	Appalachia Southern	1440115	183	474784	14.180234
103246	2013	Bear Creek	Alabama	Franklin	Active	Surface	Independent Producer Operator	Mine only	Birmingham Coal & Coke Co., In	912 Edenton Street, Birmingham, AL 35242		Appalachia Southern	87587	13	29193	11.380388
103451	2013	Knight Mine	Alabama	Franklin	Active	Surface	Independent Producer Operator	Mine only	Birmingham Coal & Coke Co., In	P.O. Box 354, Lynn, AL 35242		Appalachia Southern	147499	27	46393	11.901577

	log_production
Mine_Status
Active	11.977453
Active, men working, not producing	10.499962
New, under construction	3.951244
Permanently abandoned	9.896046
Temporarily closed	9.162933

	importance	name
0	0.846915	Labor_Hours
1	0.048470	Average_Employees
2	0.009491	Mine_Type_Surface
3	0.006033	Mine_County_Campbell
4	0.005222	Coal_Supply_Region_Powder River Basin
5	0.004416	Mine_Status_Active
6	0.003813	Mine_State_West Virginia (Southern)
7	0.003739	Mine_County_Martin
8	0.003676	Mine_County_Boone
9	0.003028	Coal_Supply_Region_Illinois Basin
10	0.002612	Coal_Supply_Region_Appalachia Central
11	0.002508	Mine_County_Schuylkill
12	0.002293	Mine_County_Grant
13	0.002258	Mine_County_Buchanan
14	0.001972	Company_Type_Independent Producer Operator
15	0.001959	Operation_Type_Mine and Preparation Plant
16	0.001931	Mine_State_Pennsylvania (Anthracite)
17	0.001727	Mine_Status_Active, men working, not producing
18	0.001577	Mine_Type_Refuse
19	0.001556	Coal_Supply_Region_Appalachia Northern