In [2]:
# Core analysis stack: inline plotting, pandas/numpy, seaborn styling.
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
sns.set();  # apply seaborn default figure styling; ';' suppresses the repr
In [8]:
# Load the cleaned 2013 coal production data, keyed by MSHA mine ID.
data_path = '../data/cleaned_coalpublic2013.csv'
df = pd.read_csv(data_path, header=0, index_col='MSHA ID')
df.head()
Out[8]:
In [9]:
# Dataset dimensions: (rows, columns).
df.shape
Out[9]:
In [11]:
# List every column name, one per line, to pick model inputs from.
print('\n'.join(df.columns))
In [67]:
# Quick look at the distribution of the (log-transformed) production target.
df.log_production.hist()
Out[67]:
In [10]:
# Distinct operating statuses present in the data.
df['Mine_Status'].unique()
Out[10]:
In [15]:
# Mean log production for each mine status (same result as selecting the
# two columns first and then grouping).
df.groupby('Mine_Status')[['log_production']].mean()
Out[15]:
In [18]:
# NOTE(review): duplicates the earlier column-listing cell — consider deleting.
for col in df.columns:
    print(col)
In [19]:
# Report years present (presumably only 2013, per the filename — verify).
df.Year.unique()
Out[19]:
In [17]:
# Distinct union codes — used as a categorical feature below.
df.Union_Code.unique()
Out[17]:
In [44]:
# Model inputs: two numeric features, eight categoricals (one-hot encoded
# later), and the log-transformed production target.
features = ['Average_Employees', 'Labor_Hours']

categoricals = [
    'Mine_State',
    'Mine_County',
    'Mine_Status',
    'Mine_Type',
    'Company_Type',
    'Operation_Type',
    'Union_Code',
    'Coal_Supply_Region',
]

target = 'log_production'
In [45]:
# Re-check the available columns before plotting/encoding.
df.columns
Out[45]:
In [46]:
# Distribution of log production by mine status.
# Fix: plt.subplots() returns a (figure, axes) tuple — the original bound the
# whole tuple to a name called `fig`. Unpack it and draw onto the axes
# explicitly so the plot is tied to the sized figure.
fig, ax = plt.subplots(figsize=(14, 8))
sns.set_context('poster')
sns.violinplot(y='Mine_Status', x='log_production', data=df,
               split=True, inner='stick', ax=ax)
plt.tight_layout()
In [47]:
# Distribution of log production by company type.
# Fix: unpack the (figure, axes) tuple from plt.subplots() (the original bound
# the tuple to `fig`) and pass ax= so the plot lands on the sized figure.
fig, ax = plt.subplots(figsize=(14, 8))
sns.set_context('poster')
sns.violinplot(y='Company_Type', x='log_production', data=df,
               split=True, inner='stick', ax=ax)
plt.tight_layout()
In [48]:
# Company-type levels that will be one-hot encoded.
df.Company_Type.unique()
Out[48]:
In [49]:
# Preview the one-hot encoding for a single categorical.
# Fix: seed the sample so the displayed rows are reproducible on re-run.
pd.get_dummies(df['Company_Type']).sample(50, random_state=42).head()
Out[49]:
In [50]:
# Turn each categorical variable into dummy (one-hot) columns, dropping one
# level per variable as the reference to avoid perfect multicollinearity.
# Fix: the original dropped the reference column from temp_df only AFTER
# temp_df had been concatenated into df, so the redundant dummy still entered
# df (though it was correctly excluded from dummy_categoricals). Drop first,
# then concatenate.
dummy_categoricals = []
for categorical in categoricals:
    print(categorical, len(df[categorical].unique()))
    # Reference level: the last category in sorted order.
    drop_var = sorted(df[categorical].unique())[-1]
    temp_df = pd.get_dummies(df[categorical], prefix=categorical)
    temp_df.drop('_'.join([categorical, str(drop_var)]), axis=1, inplace=True)
    df = pd.concat([df, temp_df], axis=1)
    dummy_categoricals += temp_df.columns.tolist()
In [52]:
# Spot-check the generated dummy column names.
dummy_categoricals[:10]
Out[52]:
In [53]:
# Fix: sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
In [54]:
# Total number of dummy columns fed to the model.
len(dummy_categoricals)
Out[54]:
In [57]:
# Shape after appending the dummy columns.
df.shape
Out[57]:
In [60]:
# NOTE(review): this cell references `test`, which is only created in a LATER
# cell (execution counts In[60] here vs In[55] below) — it fails under
# Restart & Run All. Move this cell below the train/test split.
test.shape
Out[60]:
In [55]:
# 70/30 train/test split.
# Fix: pin random_state so the split (and all downstream metrics) is
# reproducible across kernel restarts.
train, test = train_test_split(df, test_size=0.3, random_state=42)
In [ ]:
In [63]:
# 100 trees; oob_score gives a free validation estimate from out-of-bag rows.
# Fix: pin random_state so the fitted forest is reproducible.
rf = RandomForestRegressor(n_estimators=100, oob_score=True, random_state=42)
In [64]:
# Fit the forest on the numeric features plus the one-hot categoricals.
rf.fit(train[features + dummy_categoricals], train[target])
Out[64]:
In [67]:
# Predicted vs. actual log production on the hold-out set.
# Fixes: unpack the (figure, axes) tuple from plt.subplots() (the original
# bound it to `fig`); pass regplot data via the supported x=/y= keywords
# instead of positionally; add the missing x-axis label; address axis methods
# directly instead of relying on pyplot's implicit current axes.
fig, ax = plt.subplots(figsize=(8, 8))
sns.regplot(x=test[target], y=rf.predict(test[features + dummy_categoricals]), ax=ax)
ax.set_xlabel('Actual log_production')
ax.set_ylabel('Predicted log_production')
ax.set_xlim(0, 22)
ax.set_ylim(0, 22)
plt.tight_layout()
In [68]:
from sklearn.metrics import explained_variance_score, r2_score, mean_squared_error
In [70]:
# Hold-out predictions, reused by the metric cells below.
predicted = rf.predict(test[features + dummy_categoricals])
# Coefficient of determination on the test set.
r2_score(test[target], predicted)
Out[70]:
In [71]:
# Explained variance on the test set.
explained_variance_score(test[target], predicted)
Out[71]:
In [72]:
# Mean squared error on the test set (in squared log-production units).
mean_squared_error(test[target], predicted)
Out[72]:
In [76]:
# Rank model inputs by the forest's impurity-based feature importance.
rf_importances = (
    pd.DataFrame({
        'name': train[features + dummy_categoricals].columns,
        'importance': rf.feature_importances_,
    })
    .sort_values(by='importance', ascending=False)
    .reset_index(drop=True)
)
rf_importances[:20]
Out[76]:
In [ ]: