In [39]:
# import libraries and read iris dataset
import numpy as np
import pandas as pd
# import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
import eli5
from eli5.sklearn import PermutationImportance
# following 2 imports are for feature scaling
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
iris = datasets.load_iris()
df = pd.DataFrame(iris.data, columns = ['sepal_len', 'sepal_width', 'petal_len', 'petal_width'])
target = pd.DataFrame(iris.target, columns = ['target'])
clf = RandomForestClassifier(n_estimators=100, random_state=1)
#clf = LogisticRegression()
# clf = XGBClassifier()
In [40]:
# Function to add new features
# please add some features, for example the ratio of sepal length/width, the difference between petal length/width, an estimated petal size (petal length x width x a constant?), etc. (see the commented sketch after this function)
def add_feature(df):
    df['petal_ratio'] = df.petal_len / df.petal_width
    df['sepal_ratio'] = df.sepal_len / df.sepal_width
    df['psl_ratio'] = df.petal_len / df.sepal_len
    df['psw_ratio'] = df.petal_width / df.sepal_width
    # df['sepal_size'] = df.sepal_len * df.sepal_width
    # df['petal_size'] = df.petal_len * df.petal_width
    df['pss_ratio'] = (df.sepal_len * df.sepal_width) / (df.petal_len * df.petal_width)
    return
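# A possible sketch of the difference features mentioned in the comment above
# (illustrative names, not used by the cells below); they would go inside add_feature:
#   df['petal_diff'] = df.petal_len - df.petal_width
#   df['sepal_diff'] = df.sepal_len - df.sepal_width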
def add_stat_feature(df, df_mean, df_std):
    from scipy.stats import norm  # standard normal distribution
    # Per-class likelihood features: how close each value is to the class mean, in class std units
    for col in df.columns:
        for target in df_mean.columns:
            df_scale = df_std.at[col, target]
            if df_scale == 0:  # sanity check to avoid division by zero
                print('unexpected condition: std=0', col, target)
            #col_val = abs(df[col]-df_mean.at[col, target])  # pandas Series (array-like 1d object)
            #df[col+'_'+str(target)] = norm.sf(col_val, loc=0, scale=df_scale) * 2  # two-sided survival function
            df[col+'_'+str(target)] = norm.pdf((df[col]-df_mean.at[col, target])/df_scale, loc=0, scale=1)
    # Naive product of the per-class likelihoods as a rough class "probability" score
    for target in ['_0', '_1', '_2']:
        proba = 'proba' + target
        df[proba] = 1.0
        for col in df.columns:
            if col[-2:] == target and col != proba:  # skip the accumulator column itself
                df[proba] *= df[col]
    return
In [41]:
# call add_feature, then concatenate the data and target for analysis
add_feature(df)
df_Xy = pd.concat([df, target], axis=1)
df_mean = df_Xy.groupby('target').mean().T
df_std = df_Xy.groupby('target').std().T
add_stat_feature(df, df_mean, df_std)
In [42]:
print(df_mean, '\n\n', df_std)
In [43]:
df_Xy = pd.concat([df[['proba_0', 'proba_1', 'proba_2']], target], axis=1)
df_Xy.groupby('target').describe().T
Out[43]:
In [44]:
df_Xy.columns
Out[44]:
In [45]:
# Let's check correlation between features with heatmap
disp_feat = ['proba_0', 'proba_1', 'proba_2', 'target']
sns.heatmap(df_Xy[disp_feat].corr(), center=0, cmap='RdBu', annot=True)
Out[45]:
In [46]:
# Split data into training and validation sets
train_X, val_X, train_y, val_y = train_test_split(df, iris.target, random_state=1)
# Fit model
clf.fit(train_X, train_y)
#clf_rf = RandomForestClassifier(n_estimators=100, random_state=1).fit(train_X, train_y)
Out[46]:
In [47]:
# Show permutation importance
'''
# use xgboost.plot_importance method for xgboost
from xgboost import plot_importance
plot_importance(clf)
'''
perm = PermutationImportance(clf, random_state=1).fit(val_X, val_y)
eli5.show_weights(perm, feature_names = val_X.columns.tolist())
Out[47]:
The above result was surprising. What can we assume from it?
Please form some hypotheses from the observed output, and create or modify your strategy to reach a new finding. We have learned some tools to do that.
After finding a proper combination of features, try to find the best model for prediction. To select the hyper-parameters of an estimator, GridSearchCV and RandomizedSearchCV will help to check the combinations. Also try other estimators such as SVC, KNN, and xgboost; a sketch follows below.
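As a starting point for that search, here is a minimal sketch of GridSearchCV over the RandomForestClassifier and an SVC (the parameter grids are illustrative guesses, not tuned values):

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Grid search over a few random-forest settings
param_grid = {'n_estimators': [50, 100, 200], 'max_depth': [None, 3, 5]}
search = GridSearchCV(RandomForestClassifier(random_state=1), param_grid, cv=5)
search.fit(train_X, train_y)
print(search.best_params_, search.best_score_)

# The same pattern works for other estimators, e.g. an SVC
svc_search = GridSearchCV(SVC(gamma='scale'), {'C': [0.1, 1, 10]}, cv=5)
svc_search.fit(train_X, train_y)
print(svc_search.best_params_, svc_search.best_score_)

RandomizedSearchCV takes the same kind of arguments but samples a fixed number of parameter combinations, which is cheaper for large grids.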
In [48]:
# make a prediction and show accuracy
val_p = clf.predict(val_X)
print(val_p)
print(val_y)
print((val_y == val_p).mean())
# Probability of the prediction
print(clf.predict_proba(val_X))
In [49]:
# PDP (partial dependence plots) of the petal/sepal ratio features
from pdpbox import pdp, get_dataset, info_plots
for feature_name in ['petal_ratio', 'psw_ratio', 'pss_ratio', 'psl_ratio']:
    # Create the data that we will plot
    pdp_feat = pdp.pdp_isolate(model=clf, dataset=val_X, model_features=val_X.columns, feature=feature_name)
    # plot it
    pdp.pdp_plot(pdp_feat, feature_name)
    plt.show()
In [50]:
row_to_show = 22
err_prediction = val_X.iloc[row_to_show] # use 1 row of data here. Could use multiple rows if desired
err_prediction_array = err_prediction.values.reshape(1, -1)
print('raw data:\n', val_X.iloc[row_to_show], sep='')
clf.predict_proba(err_prediction_array)
Out[50]:
In [51]:
import shap # package used to calculate Shap values
# Create object that can calculate shap values
explainer = shap.TreeExplainer(clf)
# if LogisticRegression is used, use KernelExplainer
#explainer = shap.KernelExplainer(clf.predict_proba, train_X)
# Calculate Shap values
shap_values = explainer.shap_values(err_prediction)
shap.initjs()
#shap.force_plot(explainer.expected_value[0], shap_values[0], err_prediction)
# SHAP force plot of class=1
shap.force_plot(explainer.expected_value[1], shap_values[1], err_prediction)
Out[51]:
In [52]:
# SHAP force plot of class=2
shap.initjs()
shap.force_plot(explainer.expected_value[2], shap_values[2], err_prediction)
Out[52]:
In [53]:
val_X.columns
Out[53]:
In [54]:
# select the per-class statistical features as model inputs (the StandardScaler imported above is not applied here; see the sketch after this cell)
# val_X.columns
features = ['petal_len_0', 'petal_len_1', 'petal_len_2',
'petal_width_0', 'petal_width_1', 'petal_width_2', 'petal_ratio_0',
'petal_ratio_1', 'petal_ratio_2', 'psl_ratio_0', 'psl_ratio_1', 'psl_ratio_2',
'psw_ratio_0', 'psw_ratio_1', 'psw_ratio_2', 'pss_ratio_0',
'pss_ratio_1', 'pss_ratio_2', 'proba_0', 'proba_1', 'proba_2']
clf.fit(train_X[features], train_y)
val_p = clf.predict(val_X[features])
print(val_p)
print(val_y)
print((val_y == val_p).mean())
# Probability of the prediction
print(clf.predict_proba(val_X[features]))
# Create object that can calculate shap values
explainer = shap.TreeExplainer(clf)
# explainer = shap.KernelExplainer(clf.predict_proba, train_X[features])
# Calculate Shap values
err_prediction = val_X[features].iloc[row_to_show]
shap_values = explainer.shap_values(err_prediction)
shap.initjs()
#shap.force_plot(explainer.expected_value[0], shap_values[0], err_prediction)
# SHAP force plot of class=1
shap.force_plot(explainer.expected_value[1], shap_values[1], err_prediction)
Out[54]:
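The cell above mentions scaling but fits the raw engineered features directly; a minimal sketch of wiring in the Pipeline and StandardScaler imported at the top (an assumption about the intent, not code that was run) could be:

pipe = Pipeline([('scale', StandardScaler()),
                 ('clf', RandomForestClassifier(n_estimators=100, random_state=1))])
pipe.fit(train_X[features], train_y)
print((val_y == pipe.predict(val_X[features])).mean())

Scaling hardly changes a tree ensemble, but it matters for the LogisticRegression, SVC, and KNN alternatives mentioned above.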
In [55]:
# Re-split at random until the validation set contains at least one misclassified sample
while True:
    train_X, val_X, train_y, val_y = train_test_split(df, iris.target, test_size=0.2)  # random choice
    clf.fit(train_X[features], train_y)
    val_p = clf.predict(val_X[features])
    err_count = (val_y != val_p).sum()
    print('Error Count:', err_count)
    if err_count != 0:
        break
In [56]:
print(val_p)
print(val_y)
print(np.where(val_p != val_y))
print(clf.predict_proba(val_X[features]))
err_prediction = val_X[features].iloc[1]
shap_values = explainer.shap_values(err_prediction)
shap.initjs()
#shap.force_plot(explainer.expected_value[0], shap_values[0], err_prediction)
# SHAP force plot of class=1
shap.force_plot(explainer.expected_value[1], shap_values[1], err_prediction)
Out[56]:
In [57]:
from sklearn.model_selection import cross_val_score # or ShuffleSplit
features = ['sepal_len_0',
'sepal_len_1', 'sepal_len_2', 'sepal_width_0', 'sepal_width_1',
'sepal_width_2', 'petal_len_0', 'petal_len_1', 'petal_len_2',
'petal_width_0', 'petal_width_1', 'petal_width_2', 'petal_ratio_0',
'petal_ratio_1', 'petal_ratio_2', 'sepal_ratio_0', 'sepal_ratio_1',
'sepal_ratio_2', 'psl_ratio_0', 'psl_ratio_1', 'psl_ratio_2',
'psw_ratio_0', 'psw_ratio_1', 'psw_ratio_2', 'pss_ratio_0',
'pss_ratio_1', 'pss_ratio_2']
scores = cross_val_score(clf, df[features], iris.target, cv=5)
print(scores)
# mean +/- 2*std: roughly a 95% confidence interval
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))