In [2]:
%matplotlib inline
from fastai.imports import *
from fastai.structured import *
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
In [3]:
df = pd.read_csv("Big_Mart_Train.csv")
df.head()
Out[3]:
In [4]:
# Preprocessing
## fill NA
df.Item_Weight = df.Item_Weight.fillna(np.nanmedian(df.Item_Weight))
df.Outlet_Size = df.Outlet_Size.fillna(df.Outlet_Size.mode().iloc[0])
## standardize values
df.Item_Fat_Content = df.Item_Fat_Content.replace(['low fat', 'LF'], ['Low Fat', 'Low Fat'])
df.Item_Fat_Content = df.Item_Fat_Content.replace(['reg'], ['Regular'])
## convert establishment year to outlet age (years since opening, as of 2017)
df.Outlet_Establishment_Year = 2017 - df.Outlet_Establishment_Year
## drop ids
df.drop('Item_Identifier',axis=1, inplace=True)
## label encoding on categorical data
le = LabelEncoder()
for i in df.columns:
    if df[i].dtype == 'object':
        df[i] = le.fit_transform(df[i].astype('str'))
        df[i] = df[i].astype('object')
## save target
target_reg = df['Item_Outlet_Sales'] # regression target
def reg2clf(v, threshold):
    if v > threshold:
        return 1
    return 0
target_clf = df['Item_Outlet_Sales'].apply(lambda r: reg2clf(r, df['Item_Outlet_Sales'].median())) # classification target
df.drop('Item_Outlet_Sales',axis=1, inplace=True)
df.head()
Out[4]:
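Note that reusing a single LabelEncoder means each fit_transform call overwrites the previous column's mapping, so the encodings above cannot be inverted afterwards. A minimal alternative sketch, keeping one fitted encoder per column (the encoders dict is my addition, not part of the original pipeline):

encoders = {}
for col in df.columns:
    if df[col].dtype == 'object':
        enc = LabelEncoder()
        df[col] = enc.fit_transform(df[col].astype('str'))
        encoders[col] = enc  # encoders[col].inverse_transform(...) recovers the original labels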
In [5]:
X_train, X_test, y_train, y_test = train_test_split(df, target_reg, train_size=0.75, test_size=0.25)
In [6]:
m = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
Out[6]:
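As a quick sanity check (my addition, using the metrics module imported at the top), we can compare the OOB R^2 against the held-out test score:

print('OOB R^2:', m.oob_score_)
test_preds = m.predict(X_test)
print('Test R^2:', metrics.r2_score(y_test, test_preds))
print('Test RMSE:', np.sqrt(metrics.mean_squared_error(y_test, test_preds)))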
In [12]:
%%time
preds = np.stack([t.predict(X_test) for t in m.estimators_])
np.mean(preds[:,0]), np.std(preds[:,0])
Out[12]:
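The mean and standard deviation above describe only the first test row (preds[:,0]). A sketch extending this to every row, so the least confident predictions (those where the trees disagree most) can be ranked; assumes preds from the cell above:

pred_mean = preds.mean(axis=0)  # per-row mean prediction across the 40 trees
pred_std = preds.std(axis=0)    # per-row spread: larger means the trees disagree more
uncertainty = pd.DataFrame({'pred': pred_mean, 'std': pred_std})
uncertainty.sort_values('std', ascending=False).head()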
In [26]:
fi = rf_feat_importance(m, X_train)
fi
Out[26]:
In [27]:
def plot_fi(fi):
    return fi.plot('cols', 'imp', kind='barh', figsize=(12,7), legend=False, color='g')
In [28]:
plot_fi(fi)
Out[28]:
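A common follow-up (a sketch, not part of the original run; the 0.005 cutoff is an arbitrary choice) is to keep only the features above some importance threshold, retrain, and confirm the score holds up:

to_keep = fi[fi.imp > 0.005].cols
m_keep = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True)
m_keep.fit(X_train[to_keep], y_train)
print(m_keep.oob_score_)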
In [25]:
import scipy.stats
from scipy.cluster import hierarchy as hc
corr = np.round(scipy.stats.spearmanr(df).correlation, 4) # Spearman rank correlation between all column pairs
corr_condensed = hc.distance.squareform(1-corr)
z = hc.linkage(corr_condensed, method='average')
fig = plt.figure(figsize=(16,10))
dendrogram = hc.dendrogram(z, labels=df.columns, orientation='left', leaf_font_size=16)
plt.show()
In [32]:
def split_vals(a, n):
    return a[:n], a[n:]

def get_oob(df, n_trn, y_train):
    m = RandomForestRegressor(n_estimators=30, min_samples_leaf=5, max_features=0.6, n_jobs=-1, oob_score=True)
    x, _ = split_vals(df, n_trn)
    m.fit(x, y_train)
    return m.oob_score_
In [43]:
get_oob(X_train, len(X_train), y_train)
Out[43]:
In [49]:
# remove each feature and check oob score
for col in X_train.columns:
    print(col, get_oob(X_train.drop(col, axis=1), len(X_train), y_train))
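When the dendrogram shows a tight cluster, the clustered features can also be dropped together to see whether they are truly redundant. A sketch; the feature pair below is purely illustrative, so substitute whatever actually clusters in your dendrogram:

redundant = ['Outlet_Location_Type', 'Outlet_Type']  # hypothetical cluster, for illustration only
print(get_oob(X_train.drop(redundant, axis=1), len(X_train), y_train))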
In [76]:
from treeinterpreter import treeinterpreter as ti
X_test.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)
check_idx = 0
for i in range(len(X_test)):
    if i == check_idx:
        r = X_test.values[None, i]
        print('True Value:', y_test.values[i])
        prediction, bias, contributions = ti.predict(m, r)
        print('Prediction:', prediction)
        print('Average of the Prediction (bias):', bias)
        print('Contributions:', list(zip(X_test.columns, X_test.iloc[i], contributions[0])))
        break
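treeinterpreter decomposes each prediction as bias plus the sum of the per-feature contributions; a one-line check of that identity for the row above (my addition):

assert np.allclose(prediction, bias + contributions.sum(axis=1))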
In [83]:
contribution_dct = {}
for i in range(len(X_test)):
    if i == 10:  # only interpret the first 10 rows; treeinterpreter is slow
        break
    r = X_test.values[None, i]
    prediction, bias, contributions = ti.predict(m, r)
    contribution_dct[i] = dict(zip(X_test.columns, contributions[0]))
contribution_df = pd.DataFrame(contribution_dct).T
contribution_df.head()
Out[83]:
In [87]:
avg_contribution = contribution_df.apply(np.mean)
avg_contribution
Out[87]:
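To see at a glance which features push the sampled predictions up or down on average, the mean contributions can be plotted (a small sketch, my addition):

avg_contribution.sort_values().plot(kind='barh', figsize=(10,6), color='g')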
In [88]:
X_train, X_test, y_train, y_test = train_test_split(df, target_clf, train_size=0.75, test_size=0.25)
In [89]:
m = RandomForestClassifier(n_estimators=40, min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
Out[89]:
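Again as a sanity check (my addition): compare the classifier's OOB accuracy with its held-out accuracy and confusion matrix:

print('OOB accuracy:', m.oob_score_)
clf_preds = m.predict(X_test)
print('Test accuracy:', metrics.accuracy_score(y_test, clf_preds))
print(metrics.confusion_matrix(y_test, clf_preds))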
In [91]:
%%time
preds = np.stack([t.predict(X_test) for t in m.estimators_]) # each estimator is a tree
print(preds)
np.mean(preds[:,0]), np.std(preds[:,0])
Out[91]:
In [93]:
from treeinterpreter import treeinterpreter as ti
X_test.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)
check_idx = 0
for i in range(len(X_test)):
    if i == check_idx:
        r = X_test.values[None, i]
        print('True Value:', y_test.values[i])
        prediction, bias, contributions = ti.predict(m, r) # for a classifier, this returns class probabilities
        print('Prediction:', prediction)
        print('Average of the Prediction (bias):', bias)
        print('Contributions:', list(zip(X_test.columns, X_test.iloc[i], contributions[0])))
        break
predict() returns class probabilities, which is useful. treeinterpreter is a bit slow, though, at roughly one second per row, so it is best suited to eyeballing individual predictions when you only have a few records to check.
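One mitigation for the speed: ti.predict also accepts a 2-D array, so the whole test set can be interpreted in a single call instead of row by row, which is typically much faster. A sketch (note that for a classifier, contributions gains a trailing class axis):

predictions, biases, contributions = ti.predict(m, X_test.values)
# for a classifier: contributions.shape == (n_rows, n_features, n_classes)
contribution_df_all = pd.DataFrame(contributions[:, :, 1], columns=X_test.columns)  # class-1 contributions
contribution_df_all.head()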