In [26]:
import pandas as pd
import numpy as np

# Reading the csv
titanic_df = pd.read_csv("/home/deploy/pramit/data/titanic/train.csv")
titanic_df.describe()


Out[26]:
PassengerId Survived Pclass Age SibSp Parch Fare
count 891.000000 891.000000 891.000000 714.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 29.699118 0.523008 0.381594 32.204208
std 257.353842 0.486592 0.836071 14.526497 1.102743 0.806057 49.693429
min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
25% 223.500000 0.000000 2.000000 20.125000 0.000000 0.000000 7.910400
50% 446.000000 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200
75% 668.500000 1.000000 3.000000 38.000000 1.000000 0.000000 31.000000
max 891.000000 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200

In [3]:
# Quick data transformation and cleaning ...
titanic_df["Sex"] = titanic_df["Sex"].astype('category')
titanic_df["Sex_Encoded"] = titanic_df["Sex"].cat.codes

titanic_df["Embarked"] = titanic_df["Embarked"].astype('category')
titanic_df["Embarked_Encoded"] = titanic_df["Embarked"].cat.codes
print(titanic_df.head(5))
titanic_df_clean = titanic_df.drop(['Ticket','Cabin', 'Name', 'Sex', 'Embarked'], axis=1)
# # Remove NaN values
titanic_df_clean = titanic_df_clean.dropna() 
print(titanic_df_clean.head(5))


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  Sex_Encoded  \
0      0         A/5 21171   7.2500   NaN        S            1   
1      0          PC 17599  71.2833   C85        C            0   
2      0  STON/O2. 3101282   7.9250   NaN        S            0   
3      0            113803  53.1000  C123        S            0   
4      0            373450   8.0500   NaN        S            1   

   Embarked_Encoded  
0                 2  
1                 0  
2                 2  
3                 2  
4                 2  
   PassengerId  Survived  Pclass   Age  SibSp  Parch     Fare  Sex_Encoded  \
0            1         0       3  22.0      1      0   7.2500            1   
1            2         1       1  38.0      1      0  71.2833            0   
2            3         1       3  26.0      0      0   7.9250            0   
3            4         1       1  35.0      1      0  53.1000            0   
4            5         0       3  35.0      0      0   8.0500            1   

   Embarked_Encoded  
0                 2  
1                 0  
2                 2  
3                 2  
4                 2  

In [50]:
y = titanic_df_clean['Survived']

In [99]:
data = titanic_df_clean.drop(['Survived'], axis=1)
data['label'] = y
print(data.head())
# Lets trying building an Interpretable Model
feature_labels = list(data.columns)
print(type(feature_labels[0]))
print(feature_labels)


   PassengerId  Pclass   Age  SibSp  Parch     Fare  Sex_Encoded  \
0            1       3  22.0      1      0   7.2500            1   
1            2       1  38.0      1      0  71.2833            0   
2            3       3  26.0      0      0   7.9250            0   
3            4       1  35.0      1      0  53.1000            0   
4            5       3  35.0      0      0   8.0500            1   

   Embarked_Encoded  label  
0                 2      0  
1                 0      1  
2                 2      1  
3                 2      1  
4                 2      0  
<type 'str'>
['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_Encoded', 'Embarked_Encoded', 'label']

In [100]:
from rpy2.robjects import r, pandas2ri
pandas2ri.activate()
r_data = pandas2ri.py2ri(data)

as_factor = ro.r['as.factor']
s_apply = ro.r['sapply']
frame = ro.r['data.frame']
t = frame(s_apply(r_data, as_factor))
print(type(t))


<class 'rpy2.robjects.vectors.DataFrame'>

In [101]:
print(type(r_data))


<class 'rpy2.robjects.vectors.DataFrame'>

In [102]:
sbrl = importr('sbrl')

In [103]:
print(sbrl.sbrl.__dict__)


{'__rpackagename__': 'sbrl', '__rname__': 'sbrl', '_local_env': <rpy2.rinterface.SexpEnvironment - Python:0x7fc45eaebf60 / R:0xa8ba0c8>, '_prm_translate': OrderedDict([('neg_sign', 'neg_sign'), ('minsupport_pos', 'minsupport_pos'), ('pos_sign', 'pos_sign'), ('iters', 'iters'), ('tdata', 'tdata'), ('nchain', 'nchain'), ('rule_minlen', 'rule_minlen'), ('eta', 'eta'), ('rule_maxlen', 'rule_maxlen'), ('minsupport_neg', 'minsupport_neg'), ('alpha', 'alpha'), ('lambda', 'lambda')])}

In [146]:
%timeit
model = sbrl.sbrl(t, iters=50000, 
                  pos_sign=1, neg_sign=0, rule_minlen=1, 
                  rule_maxlen=3, minsupport_pos=0.10, minsupport_neg=0.10, eta=1.0, nchain=40)
print(model)


Eclat

parameter specification:
 tidLists support minlen maxlen            target   ext
    FALSE     0.1      1      3 frequent itemsets FALSE

algorithmic control:
 sparse sort verbose
      7   -2    TRUE

Absolute minimum support count: 29 

create itemset ... 
set transactions ...[514 item(s), 290 transaction(s)] done [0.00s].
sorting and recoding items ... [12 item(s)] done [0.00s].
creating bit matrix ... [12 row(s), 290 column(s)] done [0.00s].
writing  ... [89 set(s)] done [0.00s].
Creating S4 object  ... done [0.00s].
Eclat

parameter specification:
 tidLists support minlen maxlen            target   ext
    FALSE     0.1      1      3 frequent itemsets FALSE

algorithmic control:
 sparse sort verbose
      7   -2    TRUE

Absolute minimum support count: 42 

create itemset ... 
set transactions ...[673 item(s), 424 transaction(s)] done [0.00s].
sorting and recoding items ... [11 item(s)] done [0.00s].
creating bit matrix ... [11 row(s), 424 column(s)] done [0.00s].
writing  ... [55 set(s)] done [0.00s].
Creating S4 object  ... done [0.00s].
The rules list is : 
If      {Pclass=3,Sex_Encoded=0} (rule[60]) then positive probability = 0.46153846
else if {Sex_Encoded=0} (rule[72]) then positive probability = 0.93788820
else if {Pclass=1} (rule[38]) then positive probability = 0.39805825
else if {Parch=0} (rule[10]) then positive probability = 0.12328767
else  (default rule)  then positive probability = 0.29687500


In [133]:
result_r_frame = ro.r.predict(model, t)

In [134]:
print(type(result_r_frame))


<class 'rpy2.robjects.vectors.ListVector'>

In [135]:
pandas_df = pandas2ri.ri2py_dataframe(result_r_frame)
pandas_df.head()


Out[135]:
0 1 2 3 4 5 6 7 8 9 ... 704 705 706 707 708 709 710 711 712 713
0 0.876712 0.062112 0.538462 0.062112 0.876712 0.601942 0.703125 0.538462 0.062112 0.538462 ... 0.062112 0.876712 0.538462 0.876712 0.876712 0.538462 0.876712 0.062112 0.601942 0.876712
1 0.123288 0.937888 0.461538 0.937888 0.123288 0.398058 0.296875 0.461538 0.937888 0.461538 ... 0.937888 0.123288 0.461538 0.123288 0.123288 0.461538 0.123288 0.937888 0.398058 0.123288

2 rows × 714 columns


In [137]:
predicted_scores_prob = pd.DataFrame(pandas_df.values.T)
print(type(predicted_scores_prob))


<class 'pandas.core.frame.DataFrame'>

In [138]:
print(y.shape)
print(predicted_scores_prob.shape)


(714,)
(714, 2)

In [139]:
predicted_scores_prob[0].shape


Out[139]:
(714,)

In [140]:
#from sklearn.metrics import roc_auc_score
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(y ,predicted_scores_prob[1], pos_label=1)

In [141]:
roc_auc = metrics.auc(fpr, tpr)
print(roc_auc)


0.843900455433

In [142]:
from sklearn.ensemble import RandomForestClassifier

In [143]:
ytrain = data['label']
Xtrain = data.drop(['label'], axis=1)
rf_model = RandomForestClassifier().fit(Xtrain, ytrain)
rf_predict_score = pd.DataFrame(rf_model.predict_proba(Xtrain))
print(rf_predict_score).head()

rf_predict_score[0].head()
rf_fpr, rf_tpr, rf_thresholds = metrics.roc_curve(ytrain, rf_predict_score[1], pos_label=1)
rf_roc_auc = metrics.auc(rf_fpr, rf_tpr)
print(rf_roc_auc)


     0    1
0  0.8  0.2
1  0.2  0.8
2  0.1  0.9
3  0.2  0.8
4  1.0  0.0
0.999113532856

In [ ]: