In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed
# in 0.20. Prefer its replacement and only fall back on very old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import explained_variance_score, r2_score, mean_squared_error

# Apply seaborn's default plot styling to every figure in the notebook.
sns.set();
In [2]:
# Read the cleaned CSV, using the 'MSHA ID' column as the row index.
df = pd.read_csv(
    "../data/cleaned_coalpublic2013.csv",
    index_col='MSHA ID',
)
# Preview two columns as a quick check that the load worked.
df.loc[:, ['Year', 'Mine_Name']].head()
Out[2]:
In [3]:
# Column the regressor is trained to predict.
target = 'log_production'

# Numeric predictor columns.
features = ['Average_Employees', 'Labor_Hours']

# Categorical predictor columns; each is expanded into indicator (dummy)
# columns before modelling.
categoricals = [
    'Mine_State', 'Mine_County', 'Mine_Status', 'Mine_Type',
    'Company_Type', 'Operation_Type', 'Union_Code', 'Coal_Supply_Region',
]
In [4]:
# Distribution of log_production for each Company_Type, saved to ../figures.
sns.set_context('poster')
# plt.subplots returns a (Figure, Axes) pair; unpack it so `fig` is the Figure
# itself rather than a tuple, and draw/save through these explicit handles.
fig, ax = plt.subplots(figsize=(14, 8))
# NOTE(review): `split=True` is passed without a `hue` variable -- confirm it
# is intended; its effect in that case may differ between seaborn versions.
sns.violinplot(y="Company_Type", x="log_production", data=df,
               split=True, inner="stick", ax=ax);
fig.tight_layout()
fig.savefig("../figures/Coal_prediction_company_type_vs_log_production.png")
In [5]:
# Expand every categorical column into indicator (dummy) columns.
#   * `df` gains one indicator column per observed level of each categorical.
#   * `dummy_categoricals` lists the indicator columns to use as model inputs:
#     all levels except one reference level per categorical.
dummy_categoricals = []
dummy_frames = []
for categorical in categoricals:
    temp_df = pd.get_dummies(df[categorical], prefix=categorical)
    # NaN gets no indicator column from get_dummies (by default) and does not
    # sort reliably against other values, so exclude it before picking the
    # reference level.
    levels = sorted(df[categorical].dropna().unique())
    if not levels:
        # Column holds no non-missing values: nothing to encode.
        continue
    # Avoid the dummy variable trap! Keep the last level (in sorted order)
    # out of the model inputs.
    drop_var = levels[-1]
    kept = temp_df.drop('_'.join([categorical, str(drop_var)]), axis=1)
    dummy_categoricals += kept.columns.tolist()
    dummy_frames.append(temp_df)

if dummy_frames:
    all_dummies = pd.concat(dummy_frames, axis=1)
    # Re-running this cell must not stack a second copy of the indicator
    # columns onto `df`, so discard any left over from a previous run.
    stale = [col for col in all_dummies.columns if col in df.columns]
    # Single concat instead of growing `df` once per categorical.
    df = pd.concat([df.drop(stale, axis=1), all_dummies], axis=1)
In [6]:
# Hold out 30% of the rows for evaluation. The seed is fixed so the same split
# (and therefore the same scores and figures) is produced on every run.
train, test = train_test_split(df, test_size=0.3, random_state=42)
In [7]:
# Random forest of 100 trees fitted on the numeric features plus the indicator
# columns. The seed is fixed so that refitting gives the same forest.
rf = RandomForestRegressor(n_estimators=100, oob_score=True, random_state=42)
rf.fit(train[features + dummy_categoricals], train[target])
Out[7]:
In [8]:
# Scatter of actual vs. predicted target on the held-out rows, with a fitted
# regression line, saved to ../figures.
# plt.subplots returns a (Figure, Axes) pair; unpack it so `fig` is the Figure
# itself rather than a tuple.
fig, ax = plt.subplots(figsize=(8, 8))
# Pass x/y by keyword: newer seaborn releases no longer accept them
# positionally, and the keyword form also works on older releases.
sns.regplot(x=test[target],
            y=rf.predict(test[features + dummy_categoricals]),
            color='green', ax=ax)
ax.set_ylabel("Predicted production")
# Same range on both axes so the two scales are directly comparable.
ax.set_xlim(0, 22)
ax.set_ylim(0, 22)
fig.tight_layout()
fig.savefig("../figures/Coal-production-RF-prediction.png")
In [9]:
# Score the fitted forest on the held-out rows.
predicted = rf.predict(test[features + dummy_categoricals])
# Single-argument print(...) calls run on both Python 2 and Python 3; the
# original print statements are a SyntaxError on Python 3.
print("R^2 score: {}".format(r2_score(test[target], predicted)))
print("MSE: {}".format(mean_squared_error(test[target], predicted)))
In [10]:
# Pair each model input column with the importance reported by the fitted
# forest, order from most to least important, and show the top five.
rf_importances = pd.DataFrame({
    'name': train[features + dummy_categoricals].columns,
    'importance': rf.feature_importances_,
})
rf_importances = rf_importances.sort_values('importance', ascending=False)
rf_importances = rf_importances.reset_index(drop=True)
rf_importances.head(n=5)
Out[10]:
In [ ]: