Make prediction about coal prediction



In [2]:

    
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
sns.set();



In [8]:

    
df = pd.read_csv('../data/cleaned_coalpublic2013.csv',header=0,index_col='MSHA ID')
df.head()









    Out[8]:






  
    
      
      Year
      Mine_Name
      Mine_State
      Mine_County
      Mine_Status
      Mine_Type
      Company_Type
      Operation_Type
      Operating_Company
      Operating_Company_Address
      Union_Code
      Coal_Supply_Region
      Production_(short_tons)
      Average_Employees
      Labor_Hours
      log_production
    
    
      MSHA ID
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      103381
      2013
      Tacoa Highwall Miner
      Alabama
      Bibb
      Active, men working, not producing
      Surface
      Independent Producer Operator
      Mine only
      Jesse Creek Mining, Llc
      1615 Kent Dairy Rd, Alabaster, AL 35007
      
      Appalachia Southern
      56004
      10
      22392
      10.933178
    
    
      103404
      2013
      Reid School Mine
      Alabama
      Blount
      Permanently abandoned
      Surface
      Independent Producer Operator
      Mine only
      Taft Coal Sales & Associates,
      3000 Riverchase Galleria Ste 1, Birmingham, AL...
      UNIT
      Appalachia Southern
      28807
      18
      28447
      10.268374
    
    
      100759
      2013
      North River #1 Underground Min
      Alabama
      Fayette
      Active, men working, not producing
      Underground
      Independent Producer Operator
      Mine and Preparation Plant
      Jim Walter Resources Inc
      3114 County Rd 63 S, Berry, AL 35546
      UNIT
      Appalachia Southern
      1440115
      183
      474784
      14.180234
    
    
      103246
      2013
      Bear Creek
      Alabama
      Franklin
      Active
      Surface
      Independent Producer Operator
      Mine only
      Birmingham Coal & Coke Co., In
      912 Edenton Street, Birmingham, AL 35242
      
      Appalachia Southern
      87587
      13
      29193
      11.380388
    
    
      103451
      2013
      Knight Mine
      Alabama
      Franklin
      Active
      Surface
      Independent Producer Operator
      Mine only
      Birmingham Coal & Coke Co., In
      P.O. Box 354, Lynn, AL 35242
      
      Appalachia Southern
      147499
      27
      46393
      11.901577



In [9]:

    
df.shape









    Out[9]:





(1061, 16)



In [11]:

    
for column in df.columns:
    print(column)









    



Year
Mine_Name
Mine_State
Mine_County
Mine_Status
Mine_Type
Company_Type
Operation_Type
Operating_Company
Operating_Company_Address
Union_Code
Coal_Supply_Region
Production_(short_tons)
Average_Employees
Labor_Hours
log_production



In [67]:

    
df.log_production.hist()









    Out[67]:





<matplotlib.axes._subplots.AxesSubplot at 0x1199add30>



In [10]:

    
df['Mine_Status'].unique()









    Out[10]:





array(['Active, men working, not producing', 'Permanently abandoned',
       'Active', 'Temporarily closed', 'New, under construction'], dtype=object)



In [15]:

    
df[['Mine_Status','log_production']].groupby('Mine_Status').mean()









    Out[15]:






  
    
      
      log_production
    
    
      Mine_Status
      
    
  
  
    
      Active
      11.977453
    
    
      Active, men working, not producing
      10.499962
    
    
      New, under construction
      3.951244
    
    
      Permanently abandoned
      9.896046
    
    
      Temporarily closed
      9.162933

Predict the Production of coal mines



In [18]:

    
for column in df.columns:
    print(column)









    



Year
Mine_Name
Mine_State
Mine_County
Mine_Status
Mine_Type
Company_Type
Operation_Type
Operating_Company
Operating_Company_Address
Union_Code
Coal_Supply_Region
Production_(short_tons)
Average_Employees
Labor_Hours
log_production



In [19]:

    
df.Year.unique()









    Out[19]:





array([2013])



In [17]:

    
df.Union_Code.unique()









    Out[17]:





array([' ', 'UNIT', 'United Mine Workers of America', 'INTE',
       'International Union of Operation Engineers',
       'Scotia Employees Association', 'Western Energy Workers'], dtype=object)



In [44]:

    
features = ['Average_Employees',
            'Labor_Hours',
          ]

categoricals = ['Mine_State',
                'Mine_County',
                'Mine_Status',
                'Mine_Type',
                'Company_Type',
                'Operation_Type',
                'Union_Code',
                'Coal_Supply_Region',
          ]

target = 'log_production'

first, look at the interplay between each possible predictor and the target variable



In [45]:

    
df.columns









    Out[45]:





Index(['Year', 'Mine_Name', 'Mine_State', 'Mine_County', 'Mine_Status',
       'Mine_Type', 'Company_Type', 'Operation_Type', 'Operating_Company',
       'Operating_Company_Address',
       ...
       'Union_Code_United Mine Workers of America',
       'Union_Code_Western Energy Workers',
       'Coal_Supply_Region_Appalachia Central',
       'Coal_Supply_Region_Appalachia Northern',
       'Coal_Supply_Region_Appalachia Southern',
       'Coal_Supply_Region_Illinois Basin', 'Coal_Supply_Region_Interior',
       'Coal_Supply_Region_Powder River Basin',
       'Coal_Supply_Region_Uinta Region', 'Coal_Supply_Region_Western'],
      dtype='object', length=989)



In [46]:

    
fig = plt.subplots(figsize = (14,8))
sns.set_context('poster')
sns.violinplot(y='Mine_Status',x='log_production', data=df,
              split=True, inner = 'stick',)
plt.tight_layout()



In [47]:

    
fig = plt.subplots(figsize = (14,8))
sns.set_context('poster')
sns.violinplot(y='Company_Type',x='log_production', data=df,
              split=True, inner = 'stick',)
plt.tight_layout()



In [48]:

    
df.Company_Type.unique()









    Out[48]:





array(['Independent Producer Operator', 'Operating Subsidiary',
       'Contractor'], dtype=object)

create dummies

the function creates 3 new colunmns for each category



In [49]:

    
pd.get_dummies(df['Company_Type']).sample(50).head()









    Out[49]:






  
    
      
      Contractor
      Independent Producer Operator
      Operating Subsidiary
    
    
      MSHA ID
      
      
      
    
  
  
    
      1517941
      0.0
      0.0
      1.0
    
    
      1512753
      0.0
      0.0
      1.0
    
    
      1519318
      0.0
      0.0
      1.0
    
    
      3602733
      0.0
      1.0
      0.0
    
    
      4103428
      0.0
      0.0
      1.0



In [50]:

    
# turn eqch categorical variable into a dummy variable

dummy_categoricals = []
for categorical in categoricals:
    print(categorical,len(df[categorical].unique())) 
    drop_var = sorted(df[categorical].unique())[-1]
    temp_df = pd.get_dummies(df[categorical],prefix=categorical)
    df = pd.concat([df,temp_df],axis = 1)
    temp_df.drop('_'.join([categorical, str(drop_var)]), axis = 1, inplace = True)
    dummy_categoricals +=temp_df.columns.tolist()









    



Mine_State 29
Mine_County 164
Mine_Status 5
Mine_Type 3
Company_Type 3
Operation_Type 2
Union_Code 7
Coal_Supply_Region 8



In [52]:

    
dummy_categoricals[:10]









    Out[52]:





['Mine_State_Alabama',
 'Mine_State_Alaska',
 'Mine_State_Arizona',
 'Mine_State_Arkansas',
 'Mine_State_Colorado',
 'Mine_State_Illinois',
 'Mine_State_Indiana',
 'Mine_State_Kansas',
 'Mine_State_Kentucky (East)',
 'Mine_State_Kentucky (West)']

Build our model



In [53]:

    
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestRegressor



In [54]:

    
len(dummy_categoricals)









    Out[54]:





213



In [57]:

    
df.shape









    Out[57]:





(1061, 1210)



In [60]:

    
test.shape









    Out[60]:





(319, 1210)



In [55]:

    
train, test = train_test_split(df, test_size = 0.3)



In [ ]:



In [63]:

    
rf = RandomForestRegressor(n_estimators=100, oob_score=True)



In [64]:

    
rf.fit(train[features + dummy_categoricals], train[target])









    Out[64]:





RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=1, oob_score=True, random_state=None,
           verbose=0, warm_start=False)



In [67]:

    
fig = plt.subplots(figsize = (8,8))
sns.regplot(test[target],rf.predict(test[features + dummy_categoricals]))
plt.ylabel('Predicted log_production')
plt.xlim(0, 22)
plt.ylim(0, 22)
plt.tight_layout()



In [68]:

    
from sklearn.metrics import explained_variance_score, r2_score, mean_squared_error



In [70]:

    
predicted = rf.predict(test[features + dummy_categoricals])
r2_score(test[target], predicted)









    Out[70]:





0.87932195036446992



In [71]:

    
explained_variance_score(test[target], predicted)









    Out[71]:





0.88083361311273922



In [72]:

    
mean_squared_error(test[target], predicted)









    Out[72]:





0.6987063758707428



In [76]:

    
# find out the relative importance of each feature

rf_importances = pd.DataFrame({'name':train[features + dummy_categoricals].columns,
                              'importance':rf.feature_importances_
                              }).sort_values(by='importance', 
                                             ascending = False).reset_index(drop=True)
rf_importances[:20]









    Out[76]:






  
    
      
      importance
      name
    
  
  
    
      0
      0.863181
      Labor_Hours
    
    
      1
      0.030674
      Average_Employees
    
    
      2
      0.007066
      Coal_Supply_Region_Powder River Basin
    
    
      3
      0.005829
      Coal_Supply_Region_Powder River Basin
    
    
      4
      0.003798
      Mine_Type_Surface
    
    
      5
      0.003375
      Mine_Type_Surface
    
    
      6
      0.001956
      Mine_County_Boone
    
    
      7
      0.001802
      Mine_State_West Virginia (Southern)
    
    
      8
      0.001784
      Mine_State_West Virginia (Southern)
    
    
      9
      0.001746
      Mine_Status_Active
    
    
      10
      0.001709
      Mine_Status_Active
    
    
      11
      0.001623
      Coal_Supply_Region_Appalachia Central
    
    
      12
      0.001491
      Mine_County_Boone
    
    
      13
      0.001464
      Coal_Supply_Region_Appalachia Central
    
    
      14
      0.001442
      Coal_Supply_Region_Illinois Basin
    
    
      15
      0.001388
      Coal_Supply_Region_Illinois Basin
    
    
      16
      0.001369
      Mine_State_Refuse Recovery
    
    
      17
      0.001316
      Mine_County_Buchanan
    
    
      18
      0.001271
      Mine_County_Buchanan
    
    
      19
      0.001258
      Company_Type_Independent Producer Operator



In [ ]:

	Year	Mine_Name	Mine_State	Mine_County	Mine_Status	Mine_Type	Company_Type	Operation_Type	Operating_Company	Operating_Company_Address	Union_Code	Coal_Supply_Region	Production_(short_tons)	Average_Employees	Labor_Hours	log_production
MSHA ID
103381	2013	Tacoa Highwall Miner	Alabama	Bibb	Active, men working, not producing	Surface	Independent Producer Operator	Mine only	Jesse Creek Mining, Llc	1615 Kent Dairy Rd, Alabaster, AL 35007		Appalachia Southern	56004	10	22392	10.933178
103404	2013	Reid School Mine	Alabama	Blount	Permanently abandoned	Surface	Independent Producer Operator	Mine only	Taft Coal Sales & Associates,	3000 Riverchase Galleria Ste 1, Birmingham, AL...	UNIT	Appalachia Southern	28807	18	28447	10.268374
100759	2013	North River #1 Underground Min	Alabama	Fayette	Active, men working, not producing	Underground	Independent Producer Operator	Mine and Preparation Plant	Jim Walter Resources Inc	3114 County Rd 63 S, Berry, AL 35546	UNIT	Appalachia Southern	1440115	183	474784	14.180234
103246	2013	Bear Creek	Alabama	Franklin	Active	Surface	Independent Producer Operator	Mine only	Birmingham Coal & Coke Co., In	912 Edenton Street, Birmingham, AL 35242		Appalachia Southern	87587	13	29193	11.380388
103451	2013	Knight Mine	Alabama	Franklin	Active	Surface	Independent Producer Operator	Mine only	Birmingham Coal & Coke Co., In	P.O. Box 354, Lynn, AL 35242		Appalachia Southern	147499	27	46393	11.901577

	log_production
Mine_Status
Active	11.977453
Active, men working, not producing	10.499962
New, under construction	3.951244
Permanently abandoned	9.896046
Temporarily closed	9.162933

	Contractor	Independent Producer Operator	Operating Subsidiary
MSHA ID
1517941	0.0	0.0	1.0
1512753	0.0	0.0	1.0
1519318	0.0	0.0	1.0
3602733	0.0	1.0	0.0
4103428	0.0	0.0	1.0

	importance	name
0	0.863181	Labor_Hours
1	0.030674	Average_Employees
2	0.007066	Coal_Supply_Region_Powder River Basin
3	0.005829	Coal_Supply_Region_Powder River Basin
4	0.003798	Mine_Type_Surface
5	0.003375	Mine_Type_Surface
6	0.001956	Mine_County_Boone
7	0.001802	Mine_State_West Virginia (Southern)
8	0.001784	Mine_State_West Virginia (Southern)
9	0.001746	Mine_Status_Active
10	0.001709	Mine_Status_Active
11	0.001623	Coal_Supply_Region_Appalachia Central
12	0.001491	Mine_County_Boone
13	0.001464	Coal_Supply_Region_Appalachia Central
14	0.001442	Coal_Supply_Region_Illinois Basin
15	0.001388	Coal_Supply_Region_Illinois Basin
16	0.001369	Mine_State_Refuse Recovery
17	0.001316	Mine_County_Buchanan
18	0.001271	Mine_County_Buchanan
19	0.001258	Company_Type_Independent Producer Operator