Make a prediction about the coal production


In [2]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
sns.set()

In [8]:
df = pd.read_csv("../data/cleaned_coalpublic2013.xls", index_col='MSHA ID')
df.head()
len(df)


Out[8]:
1061

In [10]:
for column in df.columns:
    print column


Year
Mine_Name
Mine_State
Mine_County
Mine_Status
Mine_Type
Company_Type
Operation_Type
Operating_Company
Operating_Company_Address
Union_Code
Coal_Supply_Region
Production_(short_tons)
Average_Employees
Labor_Hours
log_production

In [11]:
df.log_production.hist()


Out[11]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f5dfa2239d0>

In [12]:
df.Mine_Status.unique()


Out[12]:
array(['Active, men working, not producing', 'Permanently abandoned',
       'Active', 'Temporarily closed', 'New, under construction'], dtype=object)

In [15]:
df[['Mine_Status', 'log_production']].groupby('Mine_Status').mean()


Out[15]:
log_production
Mine_Status
Active 11.977453
Active, men working, not producing 10.499962
New, under construction 3.951244
Permanently abandoned 9.896046
Temporarily closed 9.162933

Predict the production of coal mines


In [26]:
features = ['Average_Employees',
            'Labor_Hours'
           ]
categoricals = ['Mine_State',
            'Mine_County',
            'Mine_Status',
            'Mine_Type',
            'Company_Type',
            'Operation_Type',
            'Union_Code',
            'Coal_Supply_Region',
           ]
target = 'log_production'

In [18]:
fig = plt.subplots(figsize=(14,8))
sns.set_context('poster')
sns.violinplot(y='Mine_Status', x='log_production', data=df,
                  split=True, inner='stick', )
plt.tight_layout()



In [19]:
fig = plt.subplots(figsize=(14,8))
sns.set_context('poster')
sns.violinplot(y='Company_Type', x='log_production', data=df,
                  split=True, inner='stick', )
plt.tight_layout()



In [20]:
df['Company_Type'].unique()


Out[20]:
array(['Independent Producer Operator', 'Operating Subsidiary',
       'Contractor'], dtype=object)

In [22]:
pd.get_dummies(df['Company_Type']).sample(50).head()


Out[22]:
Contractor Independent Producer Operator Operating Subsidiary
MSHA ID
3605018 0.0 0.0 1.0
4407154 0.0 1.0 0.0
4609105 0.0 1.0 0.0
1518662 0.0 1.0 0.0
1519645 0.0 1.0 0.0

In [27]:
dummy_categoricals =[]
for categorical in categoricals:
    print categorical, len(df[categorical].unique())
    # Avoid the dummy variable trap!
    drop_var = sorted(df[categorical].unique())[-1]
    temp_df = pd.get_dummies(df[categorical], prefix=categorical)
    df = pd.concat([df, temp_df], axis=1)
    temp_df.drop('_'.join([categorical, str(drop_var)]), axis=1, inplace=True)
    dummy_categoricals += temp_df.columns.tolist()


Mine_State 29
Mine_County 164
Mine_Status 5
Mine_Type 3
Company_Type 3
Operation_Type 2
Union_Code 7
Coal_Supply_Region 8

In [28]:
dummy_categoricals[:10]


Out[28]:
['Mine_State_Alabama',
 'Mine_State_Alaska',
 'Mine_State_Arizona',
 'Mine_State_Arkansas',
 'Mine_State_Colorado',
 'Mine_State_Illinois',
 'Mine_State_Indiana',
 'Mine_State_Kansas',
 'Mine_State_Kentucky (East)',
 'Mine_State_Kentucky (West)']

Build our Model


In [29]:
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [30]:
len(dummy_categoricals)


Out[30]:
213

In [31]:
train, test = train_test_split(df, test_size=0.3)

In [33]:
rf = RandomForestRegressor(n_estimators=100, oob_score=True)

In [34]:
rf.fit(train[features + dummy_categoricals], train[target])


Out[34]:
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=1, oob_score=True, random_state=None,
           verbose=0, warm_start=False)

In [37]:
fit = plt.subplots(figsize=(8,8))
sns.regplot(test[target], rf.predict(test[features + dummy_categoricals]))
plt.xlim(0,22)
plt.xlabel("Actual Production")
plt.ylabel("Predicted Production")
plt.ylim(0,22)
plt.tight_layout()



In [38]:
from sklearn.metrics import explained_variance_score, r2_score, mean_squared_error

In [40]:
predicted = rf.predict(test[features + dummy_categoricals])
r2_score(test[target], predicted)


Out[40]:
0.85343550681684022

In [41]:
explained_variance_score(test[target], predicted)


Out[41]:
0.85365764406281675

In [42]:
mean_squared_error(test[target], predicted)


Out[42]:
0.74334590286293978

In [44]:
rf_importances = pd.DataFrame({'name':train[features + dummy_categoricals].columns,
                              'importance':rf.feature_importances_}).sort_values(by='importance', 
                                                                                 ascending=False).reset_index(drop=True)
rf_importances.head(20)


Out[44]:
importance name
0 0.861462 Labor_Hours
1 0.038749 Average_Employees
2 0.006018 Coal_Supply_Region_Powder River Basin
3 0.004721 Coal_Supply_Region_Powder River Basin
4 0.004207 Mine_Type_Surface
5 0.003919 Mine_Type_Surface
6 0.001827 Coal_Supply_Region_Appalachia Central
7 0.001814 Mine_State_West Virginia (Southern)
8 0.001793 Company_Type_Independent Producer Operator
9 0.001672 Mine_State_West Virginia (Southern)
10 0.001626 Mine_State_West Virginia (Southern)
11 0.001531 Mine_Status_Active
12 0.001513 Mine_County_Campbell
13 0.001485 Company_Type_Independent Producer Operator
14 0.001383 Coal_Supply_Region_Illinois Basin
15 0.001381 Coal_Supply_Region_Appalachia Central
16 0.001359 Mine_County_Campbell
17 0.001351 Mine_County_Buchanan
18 0.001338 Mine_County_Indiana
19 0.001272 Mine_County_Grant

In [ ]: