In [3]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
# Apply seaborn's default plot styling globally.
sns.set()
In [5]:
# Load the cleaned 2013 coal production data, indexed by the federal
# MSHA mine identifier, and preview the first rows.
csv_path = "../data/cleaned_coalpublic2013.csv"
df = pd.read_csv(csv_path, index_col="MSHA ID")
df.head()
Out[5]:
In [8]:
# Number of mines (rows) in the dataset.
df.shape[0]
Out[8]:
In [32]:
# List every available column.
# Fix: the original used the Python-2-only `print column` statement;
# `print(column)` works on both Python 2 and Python 3.
for column in df.columns:
    print(column)
In [7]:
# log normal distribution
# Production is log-transformed, so the distribution should look roughly normal.
df["log_production"].hist()
Out[7]:
In [9]:
# Distinct operational statuses present in the data.
df["Mine_Status"].unique()
Out[9]:
In [11]:
# Mean log production per mine status (same DataFrame result as selecting
# the two columns first and then grouping).
df.groupby("Mine_Status")[["log_production"]].mean()
Out[11]:
In [13]:
# NOTE(review): this cell duplicates the earlier column listing — consider removing it.
# Fix: Python-2-only `print column` replaced with the portable `print(column)`.
for column in df.columns:
    print(column)
In [14]:
# Distinct union codes (candidate categorical feature).
df["Union_Code"].unique()
Out[14]:
In [15]:
# Confirm the dataset covers a single year (2013), so Year is not a useful feature.
df["Year"].unique()
Out[15]:
In [17]:
# Numeric predictors fed directly to the model.
features = ["Average_Employees", "Labor_Hours"]

# Categorical columns to be one-hot encoded below.
# NOTE(review): "categorials" is a typo for "categoricals", but later cells
# reference this name, so it is kept unchanged.
categorials = [
    "Mine_State",
    "Mine_County",
    "Mine_Status",
    "Mine_Type",
    "Company_Type",
    "Operation_Type",
    "Operating_Company",
    "Operating_Company_Address",
    "Union_Code",
    "Coal_Supply_Region",
]

# Prediction target: log-transformed coal production.
target = "log_production"
In [18]:
# Distribution of log production by mine status.
# Fix: plt.subplots() returns a (figure, axes) tuple; the original bound the
# whole tuple to a variable named `fig`, which is misleading. Unpack it and
# pass the axes to seaborn explicitly.
fig, ax = plt.subplots(figsize=(14, 8))
sns.set_context('poster')
sns.violinplot(y='Mine_Status', x='log_production', data=df, split=True,
               inner='stick', ax=ax)
plt.tight_layout()
In [19]:
# Distribution of log production by company type.
# Fix: unpack the (figure, axes) tuple from plt.subplots() instead of binding
# it to a variable misleadingly named `fig`, and pass the axes explicitly.
fig, ax = plt.subplots(figsize=(14, 8))
sns.set_context('poster')
sns.violinplot(y='Company_Type', x='log_production', data=df, split=True,
               inner='stick', ax=ax)
plt.tight_layout()
In [20]:
# Distinct company types in the data.
df["Company_Type"].unique()
Out[20]:
In [24]:
# Preview what one-hot encoding looks like for a single categorical column:
# sample 50 random rows, display the first five.
company_dummies = pd.get_dummies(df["Company_Type"])
company_dummies.sample(50).head()
Out[24]:
In [27]:
# One-hot encode every categorical feature and collect the generated column
# names in `dummy_categoricals`.
dummy_categoricals = []
for categorical in categorials:
    # Fix: Python-2-only `print a, b` statement replaced with print().
    print(categorical, len(df[categorical].unique()))
    # Avoid the dummy variable trap: drop one level per categorical so the
    # dummy columns are not perfectly collinear.
    drop_var = sorted(df[categorical].unique())[-1]
    temp_df = pd.get_dummies(df[categorical], prefix=categorical)
    # Fix: drop the redundant column BEFORE concatenating. The original
    # concatenated first and dropped from temp_df afterwards, which left the
    # redundant dummy column inside `df` (it was excluded from
    # dummy_categoricals, but still bloated the frame). Also avoid inplace=True.
    temp_df = temp_df.drop('_'.join([categorical, str(drop_var)]), axis=1)
    df = pd.concat([df, temp_df], axis=1)
    dummy_categoricals += temp_df.columns.tolist()
In [28]:
# Peek at the first few generated dummy column names.
preview_count = 10
dummy_categoricals[:preview_count]
Out[28]:
In [29]:
# Fix: sklearn.cross_validation was deprecated in scikit-learn 0.18 and
# removed in 0.20; train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
In [30]:
# Total number of one-hot columns fed to the model.
n_dummy_columns = len(dummy_categoricals)
n_dummy_columns
Out[30]:
In [32]:
# Hold out 30% of the mines for evaluation.
# Fix: pin random_state so the split — and every downstream metric — is
# reproducible under Restart & Run All.
train, test = train_test_split(df, test_size=0.3, random_state=42)
In [33]:
# Sanity-check the training split (first five rows).
train.head(5)
Out[33]:
In [35]:
# 100 trees; oob_score=True gives a free validation estimate from the
# out-of-bag samples.
# Fix: pin random_state so the fitted forest (and its metrics) is reproducible.
rf = RandomForestRegressor(n_estimators=100, oob_score=True, random_state=42)
In [36]:
# Fit the forest on the numeric features plus the one-hot columns.
X_train = train[features + dummy_categoricals]
y_train = train[target]
rf.fit(X_train, y_train)
Out[36]:
In [38]:
# Predicted vs. actual log production on the held-out test set.
# Fixes: unpack the (figure, axes) tuple from plt.subplots() instead of
# binding it to a variable named `fig`; pass regplot arguments by keyword
# (modern seaborn rejects positional data arguments); label the x axis so
# the figure stands alone.
fig, ax = plt.subplots(figsize=(8, 8))
sns.regplot(x=test[target], y=rf.predict(test[features + dummy_categoricals]),
            ax=ax)
ax.set_xlabel('Actual Production')
ax.set_ylabel('Predicted Production')
ax.set_xlim(0, 22)
ax.set_ylim(0, 22)
plt.tight_layout()
In [39]:
from sklearn.metrics import explained_variance_score, r2_score, mean_squared_error
In [41]:
# R^2 (coefficient of determination) on the held-out test set.
predicted = rf.predict(test[features + dummy_categoricals])
r2_score(y_true=test[target], y_pred=predicted)
Out[41]:
In [42]:
# Explained variance on the held-out test set (close to R^2 unless the
# prediction errors are biased).
explained_variance_score(y_true=test[target], y_pred=predicted)
Out[42]:
In [43]:
# Mean squared error on the held-out test set (in log-production units).
mean_squared_error(y_true=test[target], y_pred=predicted)
Out[43]:
In [44]:
# Rank features by the forest's impurity-based importance scores.
importance_df = pd.DataFrame({
    'name': train[features + dummy_categoricals].columns,
    'importance': rf.feature_importances_,
})
rf_importances = (importance_df
                  .sort_values(by='importance', ascending=False)
                  .reset_index(drop=True))
rf_importances.head(20)
Out[44]: