In [1]:
import pandas as pd
import numpy as np
import rpy2.robjects as ro
from rpy2.robjects.packages import importr
from sklearn.model_selection import train_test_split
# Load the Titanic training set from CSV and show summary statistics.
titanic_df = pd.read_csv(filepath_or_buffer="/home/deploy/pramit/data/titanic/train.csv")
titanic_df.describe()
Out[1]:
In [2]:
# Quick data transformation and cleaning.
# Encode the two text columns as integer category codes. The encoded columns
# are added to titanic_df; the original text columns are removed only from
# the cleaned copy built below.
titanic_df["Sex"] = titanic_df["Sex"].astype('category')
titanic_df["Sex_Encoded"] = titanic_df["Sex"].cat.codes
titanic_df["Embarked"] = titanic_df["Embarked"].astype('category')
titanic_df["Embarked_Encoded"] = titanic_df["Embarked"].cat.codes
print(titanic_df.head(5))
# `.cat.codes` represents a missing category as -1, which dropna() cannot see
# once the source columns have been dropped. Filter those rows out explicitly
# so a missing value is not silently modelled as a real category.
has_categories = titanic_df["Sex"].notna() & titanic_df["Embarked"].notna()
titanic_df_clean = titanic_df[has_categories].drop(['Ticket','Cabin', 'Name', 'Sex', 'Embarked'], axis=1)
# Remove rows that still contain NaN in any remaining column.
titanic_df_clean = titanic_df_clean.dropna()
print(titanic_df_clean.head(5))
In [3]:
# Target vector: the 'Survived' column of the cleaned frame.
y = titanic_df_clean.loc[:, 'Survived']
In [4]:
# Feature frame: the cleaned data without 'Survived', with the target
# re-attached under the column name 'label'.
data = titanic_df_clean.drop(['Survived'], axis=1).assign(label=y)
print(data.head())
# Let's try building an interpretable model.
feature_labels = data.columns.tolist()
print(type(feature_labels[0]))
print(feature_labels)
In [5]:
# Split into train and test
from rpy2.robjects import r, pandas2ri
pandas2ri.activate()
# Hold out 20% of the rows with a fixed seed. Note that `data` still carries
# the target as its 'label' column, so Xtrain/Xtest contain it as well.
split_parts = train_test_split(data, y, test_size=0.20, random_state=0)
Xtrain, Xtest, ytrain, ytest = split_parts
# Report the shape of each piece, in the order it was returned.
for split_part in split_parts:
    print(split_part.shape)
#r_input_data = pandas2ri.py2ri(data)
In [6]:
# Handles to the R functions used for the conversion.
as_factor = ro.r['as.factor']
s_apply = ro.r['sapply']
frame = ro.r['data.frame']


def _factor_frame(pandas_frame):
    """Apply R's as.factor to each column via sapply and wrap the result in an R data.frame."""
    return frame(s_apply(pandas_frame, as_factor))


r_Xtrain = _factor_frame(Xtrain)
r_Xtest = _factor_frame(Xtest)
print(type(r_Xtrain))
In [7]:
# Load the R package 'sbrl' through rpy2 so its functions can be called from Python.
sbrl = importr('sbrl')
In [8]:
# Inspect the attributes of the wrapped R function `sbrl`.
print(vars(sbrl.sbrl))
In [9]:
%prun
model = sbrl.sbrl(r_Xtrain, iters=50000,
pos_sign=1, neg_sign=0, rule_minlen=1,
rule_maxlen=4, minsupport_pos=0.10, minsupport_neg=0.10, eta=1.0, nchain=40)
print(model)
In [10]:
# Score the held-out frame with R's generic `predict` on the fitted model.
result_r_score = ro.r['predict'](model, r_Xtest)
In [11]:
# Bring the R prediction result back into pandas and preview the first rows.
pandas_df = pandas2ri.ri2py_dataframe(result_r_score)
pandas_df.head(n=5)
Out[11]:
In [12]:
#from sklearn.metrics import roc_auc_score
from sklearn import metrics
# Transpose the converted predictions, then use column 1 as the score for the
# positive class when computing the ROC curve against the held-out labels.
predicted_scores_prob = pd.DataFrame(np.transpose(pandas_df.values))
fpr, tpr, thresholds = metrics.roc_curve(
    y_true=ytest,
    y_score=predicted_scores_prob[1],
    pos_label=1,
)
In [13]:
# Area under the rule-list ROC curve.
roc_auc = metrics.auc(x=fpr, y=tpr)
print(roc_auc)
In [14]:
from sklearn.ensemble import RandomForestClassifier
In [15]:
# Random-forest baseline trained on the same train/test split.
y_train = ytrain
y_test = ytest
# Xtrain/Xtest still contain the target as the 'label' column; drop it so the
# forest is not given the answer as a feature.
x_train = Xtrain.drop(['label'], axis=1)
x_test = Xtest.drop(['label'], axis=1)
# Fixed seed (same value as the split) so the baseline is reproducible.
rf_model = RandomForestClassifier(random_state=0).fit(x_train, y_train)
# One column of predicted probability per class.
rf_predict_score = pd.DataFrame(rf_model.predict_proba(x_test))
# `.head()` must be called on the frame, not on the result of print():
# `print(frame).head()` only works under Python 2's print statement and
# raises AttributeError on Python 3 because print() returns None.
print(rf_predict_score.head())
In [16]:
# ROC curve and AUC for the random-forest baseline, using column 1 of the
# predicted probabilities as the positive-class score.
rf_predict_score[0].head()  # not the last expression of the cell, so nothing is displayed
rf_fpr, rf_tpr, rf_thresholds = metrics.roc_curve(
    y_true=y_test,
    y_score=rf_predict_score[1],
    pos_label=1,
)
rf_roc_auc = metrics.auc(x=rf_fpr, y=rf_tpr)
print(rf_roc_auc)
In [17]:
# One entry per model: (legend label, false-positive rates, true-positive
# rates, AUC). The parallel lists consumed by the plotting cell are derived
# from it, in the same order.
roc_results = [
    ('Random Forest', rf_fpr, rf_tpr, rf_roc_auc),
    ('Decision Rules', fpr, tpr, roc_auc),
]
fpr_list = [entry[1] for entry in roc_results]
tpr_list = [entry[2] for entry in roc_results]
roc_auc_list = [entry[3] for entry in roc_results]
label_list = [entry[0] for entry in roc_results]
In [19]:
from itertools import cycle
import matplotlib.pyplot as plt
#plt.style.use('ggplot')
plt.style.use(['fivethirtyeight'])
%matplotlib inline
plt.figure(figsize=(10,10))
#plt.rc('legend', size=10)
plt.rc('font', size=12)
lw = 2
colors = ['aqua', 'darkorange']
for i in range(len(fpr_list)):
plt.plot(fpr_list[i], tpr_list[i], color=colors[i],
lw=lw, label='ROC curve {0} (area = {1:0.2f})'.format(label_list[i], roc_auc_list[i]), )
plt.plot([0, 1], [0, 1], color='navy', lw=3, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=18)
plt.ylabel('True Positive Rate', fontsize=18)
plt.title('Receiver operating characteristic', fontsize=18)
plt.legend(loc="lower right")
Out[19]:
In [55]:
# Show the matplotlib style names that can be passed to plt.style.use().
plt.style.available
Out[55]:
In [37]:
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
# Route bokeh output to the notebook.
output_notebook()
x_values = [1, 2, 3, 4, 5]
y_values = [6, 7, 2, 3, 6]
y_values2 = [3, 7, 2, 9, 6]
p = figure()
# Two line glyphs over the same x values, added in the original order.
for series, series_color in ((y_values2, "blue"), (y_values, "pink")):
    p.line(x=x_values, y=series, line_color=series_color)
show(p)
In [ ]: