Make a prediction about the coal production


In [3]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
sns.set();

In [4]:
df = pd.read_csv("../data/cleaned_coalpublic2013.csv", index_col='MSHA ID')
df.head()


Out[4]:
Year Mine_Name Mine_State Mine_County Mine_Status Mine_Type Company_Type Operation_Type Operating_Company Operating_Company_Address Union_Code Coal_Supply_Region Production_(short_tons) Average_Employees Labor_Hours log_production
MSHA ID
103381 2013 Tacoa Highwall Miner Alabama Bibb Active, men working, not producing Surface Independent Producer Operator Mine only Jesse Creek Mining, Llc 1615 Kent Dairy Rd, Alabaster, AL 35007 Appalachia Southern 56004 10 22392 10.933178
103404 2013 Reid School Mine Alabama Blount Permanently abandoned Surface Independent Producer Operator Mine only Taft Coal Sales & Associates, 3000 Riverchase Galleria Ste 1, Birmingham, AL... UNIT Appalachia Southern 28807 18 28447 10.268374
100759 2013 North River #1 Underground Min Alabama Fayette Active, men working, not producing Underground Independent Producer Operator Mine and Preparation Plant Jim Walter Resources Inc 3114 County Rd 63 S, Berry, AL 35546 UNIT Appalachia Southern 1440115 183 474784 14.180234
103246 2013 Bear Creek Alabama Franklin Active Surface Independent Producer Operator Mine only Birmingham Coal & Coke Co., In 912 Edenton Street, Birmingham, AL 35242 Appalachia Southern 87587 13 29193 11.380388
103451 2013 Knight Mine Alabama Franklin Active Surface Independent Producer Operator Mine only Birmingham Coal & Coke Co., In P.O. Box 354, Lynn, AL 35242 Appalachia Southern 147499 27 46393 11.901577

In [5]:
len(df)


Out[5]:
1061

In [6]:
for column in df.columns:
    print column


Year
Mine_Name
Mine_State
Mine_County
Mine_Status
Mine_Type
Company_Type
Operation_Type
Operating_Company
Operating_Company_Address
Union_Code
Coal_Supply_Region
Production_(short_tons)
Average_Employees
Labor_Hours
log_production

In [7]:
df.log_production.hist()


Out[7]:
<matplotlib.axes._subplots.AxesSubplot at 0x10c008410>

In [8]:
df.Mine_Status.unique()


Out[8]:
array(['Active, men working, not producing', 'Permanently abandoned',
       'Active', 'Temporarily closed', 'New, under construction'], dtype=object)

In [9]:
df[['Mine_Status', 'log_production']].groupby('Mine_Status').mean()


Out[9]:
log_production
Mine_Status
Active 11.977453
Active, men working, not producing 10.499962
New, under construction 3.951244
Permanently abandoned 9.896046
Temporarily closed 9.162933

Predict the Production of coal mines


In [10]:
for column in df.columns:
    print column


Year
Mine_Name
Mine_State
Mine_County
Mine_Status
Mine_Type
Company_Type
Operation_Type
Operating_Company
Operating_Company_Address
Union_Code
Coal_Supply_Region
Production_(short_tons)
Average_Employees
Labor_Hours
log_production

In [11]:
df.Year.unique()


Out[11]:
array([2013])

In [12]:
features = ['Average_Employees',
            'Labor_Hours',
           ]

categoricals = ['Mine_State',
                'Mine_County',
                'Mine_Status',
                'Mine_Type',
                'Company_Type',
                'Operation_Type',
                'Union_Code',
                'Coal_Supply_Region',
               ]

target = 'log_production'

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [14]:
fig = plt.subplots(figsize=(14,8))
sns.set_context('poster')
sns.violinplot(y="Mine_Status", x="log_production", data=df,
               split=True, inner="stick", )
plt.tight_layout()



In [15]:
fig = plt.subplots(figsize=(14,8))
sns.set_context('poster')
sns.violinplot(y="Company_Type", x="log_production", data=df, 
               split=True, inner="stick");
plt.tight_layout()



In [16]:
df['Company_Type'].unique()


Out[16]:
array(['Independent Producer Operator', 'Operating Subsidiary',
       'Contractor'], dtype=object)

In [18]:
pd.get_dummies(df['Company_Type']).sample(50).head()


Out[18]:
Contractor Independent Producer Operator Operating Subsidiary
MSHA ID
103419 0 1 0
3601262 0 1 0
4407123 0 0 1
3609605 0 1 0
2302262 0 1 0

In [19]:
dummy_categoricals = []
for categorical in categoricals:
    print categorical, len(df[categorical].unique())
    # Avoid the dummy variable trap!
    drop_var = sorted(df[categorical].unique())[-1]
    temp_df = pd.get_dummies(df[categorical], prefix=categorical)
    df = pd.concat([df, temp_df], axis=1)
    temp_df.drop('_'.join([categorical, str(drop_var)]), axis=1, inplace=True)
    dummy_categoricals += temp_df.columns.tolist()


Mine_State 29
Mine_County 164
Mine_Status 5
Mine_Type 3
Company_Type 3
Operation_Type 2
Union_Code 7
Coal_Supply_Region 8

In [20]:
dummy_categoricals[:10]


Out[20]:
['Mine_State_Alabama',
 'Mine_State_Alaska',
 'Mine_State_Arizona',
 'Mine_State_Arkansas',
 'Mine_State_Colorado',
 'Mine_State_Illinois',
 'Mine_State_Indiana',
 'Mine_State_Kansas',
 'Mine_State_Kentucky (East)',
 'Mine_State_Kentucky (West)']

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:

Build our model


In [21]:
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [22]:
len(dummy_categoricals)


Out[22]:
213

In [23]:
train, test = train_test_split(df, test_size=0.3)

In [ ]:


In [25]:
rf = RandomForestRegressor(n_estimators=100, oob_score=True)

In [27]:
rf.fit(train[features + dummy_categoricals], train[target])


Out[27]:
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=1, oob_score=True, random_state=None,
           verbose=0, warm_start=False)

In [29]:
fig = plt.subplots(figsize=(8,8))
sns.regplot(test[target], rf.predict(test[features + dummy_categoricals]))
plt.ylabel("Predicted production")
plt.xlim(0, 22)
plt.ylim(0, 22)
plt.tight_layout()



In [30]:
from sklearn.metrics import explained_variance_score, r2_score, mean_squared_error

In [31]:
predicted = rf.predict(test[features + dummy_categoricals])
r2_score(test[target], predicted)


Out[31]:
0.88433913539444742

In [32]:
explained_variance_score(test[target], predicted)


Out[32]:
0.88455128068585942

In [33]:
mean_squared_error(test[target], predicted)


Out[33]:
0.54444109401274343

In [34]:
rf_importances = pd.DataFrame({'name':train[features + dummy_categoricals].columns,
                               'importance':rf.feature_importances_
                              }).sort_values(by='importance', 
                                              ascending=False).reset_index(drop=True)
rf_importances.head(20)


Out[34]:
importance name
0 0.846915 Labor_Hours
1 0.048470 Average_Employees
2 0.009491 Mine_Type_Surface
3 0.006033 Mine_County_Campbell
4 0.005222 Coal_Supply_Region_Powder River Basin
5 0.004416 Mine_Status_Active
6 0.003813 Mine_State_West Virginia (Southern)
7 0.003739 Mine_County_Martin
8 0.003676 Mine_County_Boone
9 0.003028 Coal_Supply_Region_Illinois Basin
10 0.002612 Coal_Supply_Region_Appalachia Central
11 0.002508 Mine_County_Schuylkill
12 0.002293 Mine_County_Grant
13 0.002258 Mine_County_Buchanan
14 0.001972 Company_Type_Independent Producer Operator
15 0.001959 Operation_Type_Mine and Preparation Plant
16 0.001931 Mine_State_Pennsylvania (Anthracite)
17 0.001727 Mine_Status_Active, men working, not producing
18 0.001577 Mine_Type_Refuse
19 0.001556 Coal_Supply_Region_Appalachia Northern

In [ ]: