In [26]:
import pandas as pd
import numpy as np
# Load the Titanic training CSV into a DataFrame.
titanic_df = pd.read_csv(
    filepath_or_buffer="/home/deploy/pramit/data/titanic/train.csv"
)
# Final expression of the cell: summary statistics of the numeric columns.
titanic_df.describe()
Out[26]:
In [3]:
# Quick data transformation and cleaning.
# Convert the two string-valued columns to pandas categoricals and store
# their integer codes in new '<name>_Encoded' columns.
titanic_df["Sex"] = titanic_df["Sex"].astype('category')
titanic_df["Sex_Encoded"] = titanic_df["Sex"].cat.codes
titanic_df["Embarked"] = titanic_df["Embarked"].astype('category')
# NOTE: `.cat.codes` represents a missing value as -1, not NaN, so the
# encoded column on its own can never trigger `dropna()`.
titanic_df["Embarked_Encoded"] = titanic_df["Embarked"].cat.codes
print(titanic_df.head(5))

# Remove NaN values *before* discarding the original categorical columns:
# a row whose 'Sex' or 'Embarked' is missing is then dropped here instead of
# surviving with a -1 sentinel code that would look like a real category.
# 'Ticket', 'Cabin' and 'Name' are removed first so that missing values in
# those unused columns do not cause rows to be discarded.
titanic_df_clean = (
    titanic_df
    .drop(['Ticket', 'Cabin', 'Name'], axis=1)
    .dropna()
    .drop(['Sex', 'Embarked'], axis=1)
)
print(titanic_df_clean.head(5))
In [50]:
# Target vector: the 'Survived' column of the cleaned frame (keeps the
# cleaned frame's index).
y = titanic_df_clean['Survived']
In [99]:
# Build the modelling frame: every cleaned column except the target, with the
# target re-attached as the last column under the name 'label'.
data = titanic_df_clean.drop('Survived', axis=1).assign(label=y)
print(data.head())

# Let's try building an interpretable model.
# Column names of the modelling frame (this list includes 'label').
feature_labels = data.columns.tolist()
print(type(feature_labels[0]))
print(feature_labels)
In [100]:
from rpy2.robjects import r, pandas2ri
# Enable the pandas <-> R conversion layer and convert the modelling frame
# into an R data.frame.
pandas2ri.activate()
r_data = pandas2ri.py2ri(data)

# Look up the R functions through `r`, the embedded-R handle imported at the
# top of this cell (the name `ro` is never imported in this notebook, so
# `ro.r[...]` raises NameError on a fresh kernel).
as_factor = r['as.factor']
s_apply = r['sapply']
frame = r['data.frame']

# Apply `as.factor` to every column via `sapply`, then wrap the result in a
# data.frame; `t` is the R object passed to the model in later cells.
t = frame(s_apply(r_data, as_factor))
print(type(t))
In [101]:
# Show the Python-side type of the converted R data frame.
print(type(r_data))
In [102]:
# `importr` is not imported anywhere else in this notebook, so bring it into
# scope here before loading the R package 'sbrl' as a Python-accessible object.
from rpy2.robjects.packages import importr
sbrl = importr('sbrl')
In [103]:
# Inspect the attributes of the wrapped `sbrl` function object.
print(sbrl.sbrl.__dict__)
In [146]:
%timeit
model = sbrl.sbrl(t, iters=50000,
pos_sign=1, neg_sign=0, rule_minlen=1,
rule_maxlen=3, minsupport_pos=0.10, minsupport_neg=0.10, eta=1.0, nchain=40)
print(model)
In [133]:
# Call R's `predict` on the fitted model with the same frame `t` that was used
# for fitting. Uses `r`, the embedded-R handle imported from rpy2.robjects
# earlier in the notebook (`ro` is never imported, so `ro.r` raised NameError).
result_r_frame = r.predict(model, t)
In [134]:
# Show the Python-side type of the object returned by R's `predict`.
print(type(result_r_frame))
In [135]:
# Convert the R prediction result back into a pandas DataFrame.
pandas_df = pandas2ri.ri2py_dataframe(result_r_frame)
# Final expression of the cell: preview the first rows.
pandas_df.head()
Out[135]:
In [137]:
# Transpose the converted predictions and wrap the resulting array in a new
# DataFrame with default integer row and column labels.
# NOTE(review): presumably this yields one row per sample and one column per
# class -- confirm against the shapes printed in the next cell.
predicted_scores_prob = pd.DataFrame(data=pandas_df.values.transpose())
print(type(predicted_scores_prob))
In [138]:
# Compare the shape of the target vector with the shape of the prediction
# frame before they are passed to the metric functions together.
print(y.shape)
print(predicted_scores_prob.shape)
In [139]:
# Final expression of the cell: shape of the first prediction column.
predicted_scores_prob[0].shape
Out[139]:
In [140]:
#from sklearn.metrics import roc_auc_score
from sklearn import metrics
# ROC curve of the rule-list predictions against the true labels, treating
# label 1 as the positive class.
# NOTE(review): column 1 is assumed to hold the positive-class score -- confirm.
fpr, tpr, thresholds = metrics.roc_curve(
    y_true=y,
    y_score=predicted_scores_prob[1],
    pos_label=1,
)
In [141]:
# Area under the ROC curve computed in the previous cell.
roc_auc = metrics.auc(x=fpr, y=tpr)
print(roc_auc)
In [142]:
from sklearn.ensemble import RandomForestClassifier
In [143]:
# Baseline for comparison: a random forest with default settings, fitted on
# the same frame that was handed to the rule-list model.
ytrain = data['label']
Xtrain = data.drop(['label'], axis=1)
rf_model = RandomForestClassifier().fit(Xtrain, ytrain)

# Class-probability estimates, one column per class. They are computed on the
# same rows the forest was fitted on, so the AUC below is a training-set score.
rf_predict_score = pd.DataFrame(rf_model.predict_proba(Xtrain))
# `.head()` must be called on the frame, not on the return value of print()
# (which is None under Python 3 and raised AttributeError).
print(rf_predict_score.head())

# ROC curve and AUC using column 1 as the score, with label 1 as the positive
# class.
rf_fpr, rf_tpr, rf_thresholds = metrics.roc_curve(ytrain, rf_predict_score[1], pos_label=1)
rf_roc_auc = metrics.auc(rf_fpr, rf_tpr)
print(rf_roc_auc)
In [ ]: