notebook.community

Edit and run



In [26]:

    
import pandas as pd
import numpy as np

# Reading the csv
titanic_df = pd.read_csv("/home/deploy/pramit/data/titanic/train.csv")
titanic_df.describe()









    Out[26]:







  
    
      
      PassengerId
      Survived
      Pclass
      Age
      SibSp
      Parch
      Fare
    
  
  
    
      count
      891.000000
      891.000000
      891.000000
      714.000000
      891.000000
      891.000000
      891.000000
    
    
      mean
      446.000000
      0.383838
      2.308642
      29.699118
      0.523008
      0.381594
      32.204208
    
    
      std
      257.353842
      0.486592
      0.836071
      14.526497
      1.102743
      0.806057
      49.693429
    
    
      min
      1.000000
      0.000000
      1.000000
      0.420000
      0.000000
      0.000000
      0.000000
    
    
      25%
      223.500000
      0.000000
      2.000000
      20.125000
      0.000000
      0.000000
      7.910400
    
    
      50%
      446.000000
      0.000000
      3.000000
      28.000000
      0.000000
      0.000000
      14.454200
    
    
      75%
      668.500000
      1.000000
      3.000000
      38.000000
      1.000000
      0.000000
      31.000000
    
    
      max
      891.000000
      1.000000
      3.000000
      80.000000
      8.000000
      6.000000
      512.329200



In [3]:

    
# Quick data transformation and cleaning ...
titanic_df["Sex"] = titanic_df["Sex"].astype('category')
titanic_df["Sex_Encoded"] = titanic_df["Sex"].cat.codes

titanic_df["Embarked"] = titanic_df["Embarked"].astype('category')
titanic_df["Embarked_Encoded"] = titanic_df["Embarked"].cat.codes
print(titanic_df.head(5))
titanic_df_clean = titanic_df.drop(['Ticket','Cabin', 'Name', 'Sex', 'Embarked'], axis=1)
# # Remove NaN values
titanic_df_clean = titanic_df_clean.dropna() 
print(titanic_df_clean.head(5))









    



   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  Sex_Encoded  \
0      0         A/5 21171   7.2500   NaN        S            1   
1      0          PC 17599  71.2833   C85        C            0   
2      0  STON/O2. 3101282   7.9250   NaN        S            0   
3      0            113803  53.1000  C123        S            0   
4      0            373450   8.0500   NaN        S            1   

   Embarked_Encoded  
0                 2  
1                 0  
2                 2  
3                 2  
4                 2  
   PassengerId  Survived  Pclass   Age  SibSp  Parch     Fare  Sex_Encoded  \
0            1         0       3  22.0      1      0   7.2500            1   
1            2         1       1  38.0      1      0  71.2833            0   
2            3         1       3  26.0      0      0   7.9250            0   
3            4         1       1  35.0      1      0  53.1000            0   
4            5         0       3  35.0      0      0   8.0500            1   

   Embarked_Encoded  
0                 2  
1                 0  
2                 2  
3                 2  
4                 2



In [50]:

    
y = titanic_df_clean['Survived']



In [99]:

    
data = titanic_df_clean.drop(['Survived'], axis=1)
data['label'] = y
print(data.head())
# Lets trying building an Interpretable Model
feature_labels = list(data.columns)
print(type(feature_labels[0]))
print(feature_labels)









    



   PassengerId  Pclass   Age  SibSp  Parch     Fare  Sex_Encoded  \
0            1       3  22.0      1      0   7.2500            1   
1            2       1  38.0      1      0  71.2833            0   
2            3       3  26.0      0      0   7.9250            0   
3            4       1  35.0      1      0  53.1000            0   
4            5       3  35.0      0      0   8.0500            1   

   Embarked_Encoded  label  
0                 2      0  
1                 0      1  
2                 2      1  
3                 2      1  
4                 2      0  
<type 'str'>
['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_Encoded', 'Embarked_Encoded', 'label']



In [100]:

    
from rpy2.robjects import r, pandas2ri
pandas2ri.activate()
r_data = pandas2ri.py2ri(data)

as_factor = ro.r['as.factor']
s_apply = ro.r['sapply']
frame = ro.r['data.frame']
t = frame(s_apply(r_data, as_factor))
print(type(t))









    



<class 'rpy2.robjects.vectors.DataFrame'>



In [101]:

    
print(type(r_data))









    



<class 'rpy2.robjects.vectors.DataFrame'>



In [102]:

    
sbrl = importr('sbrl')



In [103]:

    
print(sbrl.sbrl.__dict__)









    



{'__rpackagename__': 'sbrl', '__rname__': 'sbrl', '_local_env': <rpy2.rinterface.SexpEnvironment - Python:0x7fc45eaebf60 / R:0xa8ba0c8>, '_prm_translate': OrderedDict([('neg_sign', 'neg_sign'), ('minsupport_pos', 'minsupport_pos'), ('pos_sign', 'pos_sign'), ('iters', 'iters'), ('tdata', 'tdata'), ('nchain', 'nchain'), ('rule_minlen', 'rule_minlen'), ('eta', 'eta'), ('rule_maxlen', 'rule_maxlen'), ('minsupport_neg', 'minsupport_neg'), ('alpha', 'alpha'), ('lambda', 'lambda')])}



In [146]:

    
%timeit
model = sbrl.sbrl(t, iters=50000, 
                  pos_sign=1, neg_sign=0, rule_minlen=1, 
                  rule_maxlen=3, minsupport_pos=0.10, minsupport_neg=0.10, eta=1.0, nchain=40)
print(model)









    



Eclat

parameter specification:
 tidLists support minlen maxlen            target   ext
    FALSE     0.1      1      3 frequent itemsets FALSE

algorithmic control:
 sparse sort verbose
      7   -2    TRUE

Absolute minimum support count: 29 

create itemset ... 
set transactions ...[514 item(s), 290 transaction(s)] done [0.00s].
sorting and recoding items ... [12 item(s)] done [0.00s].
creating bit matrix ... [12 row(s), 290 column(s)] done [0.00s].
writing  ... [89 set(s)] done [0.00s].
Creating S4 object  ... done [0.00s].
Eclat

parameter specification:
 tidLists support minlen maxlen            target   ext
    FALSE     0.1      1      3 frequent itemsets FALSE

algorithmic control:
 sparse sort verbose
      7   -2    TRUE

Absolute minimum support count: 42 

create itemset ... 
set transactions ...[673 item(s), 424 transaction(s)] done [0.00s].
sorting and recoding items ... [11 item(s)] done [0.00s].
creating bit matrix ... [11 row(s), 424 column(s)] done [0.00s].
writing  ... [55 set(s)] done [0.00s].
Creating S4 object  ... done [0.00s].
The rules list is : 
If      {Pclass=3,Sex_Encoded=0} (rule[60]) then positive probability = 0.46153846
else if {Sex_Encoded=0} (rule[72]) then positive probability = 0.93788820
else if {Pclass=1} (rule[38]) then positive probability = 0.39805825
else if {Parch=0} (rule[10]) then positive probability = 0.12328767
else  (default rule)  then positive probability = 0.29687500



In [133]:

    
result_r_frame = ro.r.predict(model, t)



In [134]:

    
print(type(result_r_frame))









    



<class 'rpy2.robjects.vectors.ListVector'>



In [135]:

    
pandas_df = pandas2ri.ri2py_dataframe(result_r_frame)
pandas_df.head()









    Out[135]:







  
    
      
      0
      1
      2
      3
      4
      5
      6
      7
      8
      9
      ...
      704
      705
      706
      707
      708
      709
      710
      711
      712
      713
    
  
  
    
      0
      0.876712
      0.062112
      0.538462
      0.062112
      0.876712
      0.601942
      0.703125
      0.538462
      0.062112
      0.538462
      ...
      0.062112
      0.876712
      0.538462
      0.876712
      0.876712
      0.538462
      0.876712
      0.062112
      0.601942
      0.876712
    
    
      1
      0.123288
      0.937888
      0.461538
      0.937888
      0.123288
      0.398058
      0.296875
      0.461538
      0.937888
      0.461538
      ...
      0.937888
      0.123288
      0.461538
      0.123288
      0.123288
      0.461538
      0.123288
      0.937888
      0.398058
      0.123288
    
  

2 rows × 714 columns



In [137]:

    
predicted_scores_prob = pd.DataFrame(pandas_df.values.T)
print(type(predicted_scores_prob))









    



<class 'pandas.core.frame.DataFrame'>



In [138]:

    
print(y.shape)
print(predicted_scores_prob.shape)









    



(714,)
(714, 2)



In [139]:

    
predicted_scores_prob[0].shape









    Out[139]:





(714,)



In [140]:

    
#from sklearn.metrics import roc_auc_score
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(y ,predicted_scores_prob[1], pos_label=1)



In [141]:

    
roc_auc = metrics.auc(fpr, tpr)
print(roc_auc)









    



0.843900455433



In [142]:

    
from sklearn.ensemble import RandomForestClassifier



In [143]:

    
ytrain = data['label']
Xtrain = data.drop(['label'], axis=1)
rf_model = RandomForestClassifier().fit(Xtrain, ytrain)
rf_predict_score = pd.DataFrame(rf_model.predict_proba(Xtrain))
print(rf_predict_score).head()

rf_predict_score[0].head()
rf_fpr, rf_tpr, rf_thresholds = metrics.roc_curve(ytrain, rf_predict_score[1], pos_label=1)
rf_roc_auc = metrics.auc(rf_fpr, rf_tpr)
print(rf_roc_auc)









    



     0    1
0  0.8  0.2
1  0.2  0.8
2  0.1  0.9
3  0.2  0.8
4  1.0  0.0
0.999113532856



In [ ]:

	PassengerId	Survived	Pclass	Age	SibSp	Parch	Fare
count	891.000000	891.000000	891.000000	714.000000	891.000000	891.000000	891.000000
mean	446.000000	0.383838	2.308642	29.699118	0.523008	0.381594	32.204208
std	257.353842	0.486592	0.836071	14.526497	1.102743	0.806057	49.693429
min	1.000000	0.000000	1.000000	0.420000	0.000000	0.000000	0.000000
25%	223.500000	0.000000	2.000000	20.125000	0.000000	0.000000	7.910400
50%	446.000000	0.000000	3.000000	28.000000	0.000000	0.000000	14.454200
75%	668.500000	1.000000	3.000000	38.000000	1.000000	0.000000	31.000000
max	891.000000	1.000000	3.000000	80.000000	8.000000	6.000000	512.329200

	0	1	2	3	4	5	6	7	8	9	...	704	705	706	707	708	709	710	711	712	713
0	0.876712	0.062112	0.538462	0.062112	0.876712	0.601942	0.703125	0.538462	0.062112	0.538462	...	0.062112	0.876712	0.538462	0.876712	0.876712	0.538462	0.876712	0.062112	0.601942	0.876712
1	0.123288	0.937888	0.461538	0.937888	0.123288	0.398058	0.296875	0.461538	0.937888	0.461538	...	0.937888	0.123288	0.461538	0.123288	0.123288	0.461538	0.123288	0.937888	0.398058	0.123288