In [ ]:


In [1]:
from time import time
import psycopg2
from collections import Counter
import gc
import pandas as pd
import numpy as np
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
np.set_printoptions(suppress=True,precision=10)

In [2]:
import sys 
import os
sys.path.append(os.path.abspath("/home/scidb/HeartRatePatterns/Python"))
from LogisticRegresion import ajustLogisticRegression

In [3]:
def selectMatrix(dbname="mimic") :
    conn = psycopg2.connect("dbname="+dbname)
    cur = conn.cursor()
    select_stament = ("SELECT m.subject_id,m.word,m.counting,s.isalive "
                      " FROM matrix m LEFT JOIN subjectwords s ON m.subject_id=s.subject_id"
    )
    cur.execute(select_stament)
    select = []
    for row in cur :
        select.append((row))
    cur.close()
    conn.close()
    return select

In [4]:
def convertMatrix() :
    labels = ['subject_id', 'Word', 'Counting','isAlive']
    df = pd.DataFrame.from_records(selectMatrix(), columns=labels)
    print(len(df))
    return pd.pivot_table(df,index=["subject_id","isAlive"],columns=["Word"],values=["Counting"],
                       aggfunc={"Counting":[np.sum]},fill_value=0)

In [5]:
t0=time()
table = convertMatrix()
print("converMatrix done in %0.3fs." % (time() - t0))
print(table.shape)
gc.collect()


4569945
converMatrix done in 60.456s.
(845, 168509)
Out[5]:
14

In [6]:
survived = table.index.labels[1].tolist()
patients = table.values

In [7]:
patients_train, patients_test,survived_train, survived_test = train_test_split(patients,survived,test_size=0.2, random_state=42)

In [8]:
print(table.shape)
print(patients_train.shape)
print(patients_test.shape)


(845, 168509)
(676, 168509)
(169, 168509)

Base Model All Will Die


In [9]:
def base_rate_model(X) :
    y = np.ones(X.shape[0])
    return y

In [10]:
y_base_rate = base_rate_model(patients_test)
print("Base rate acurracy is %2.2f" % accuracy_score(survived_test,base_rate_model(patients_test)))


Base rate acurracy is 0.54

Direct Regresion Logistic


In [11]:
model = ajustLogisticRegression(patients_train,survived_train,patients_test,survived_test)


Best C is [ 0.01] with an average of 0.539149642572
Best C is [ 0.0253] with an average of 0.538613628289
Best C is [ 0.02431] with an average of 0.538520949883
Best C is [ 0.024112] with an average of 0.538566414762
Best C is [ 0.0240724] with an average of 0.538566414762
Score logit 0.538461538462
Score last 0.550295857988
Score best 0.550295857988
acurracy is 0.55

NMF + Logistic Regression accurancy


In [12]:
def generateNMF(patients_train,survived_train,patients_test,survived_test,n_components=30) :
    nmf = NMF(n_components=n_components, random_state=1,alpha=.1, l1_ratio=.5)
    patients_trainnmf = nmf.fit_transform(patients_train)
    H = nmf.components_
    patients_testnmf = nmf.transform(patients_test)
    modelnmf = ajustLogisticRegression(patients_trainnmf,survived_train,patients_testnmf,survived_test)
    return modelnmf,patients_testnmf

In [13]:
modelnmf7,pat_testnmf7 = generateNMF(patients_train,survived_train,patients_test,survived_test,n_components=7)


Best C is [ 1.] with an average of 0.501225333917
Best C is [ 2.53] with an average of 0.563893060294
Best C is [ 2.431] with an average of 0.563620782823
Best C is [ 2.4112] with an average of 0.563716340103
Best C is [ 2.40724] with an average of 0.563668561463
Score logit 0.544378698225
Score last 0.508875739645
Score best 0.508875739645
acurracy is 0.54

In [14]:
modelnmf32,pat_testnmf32 = generateNMF(patients_train,survived_train,patients_test,survived_test,n_components=32)


Best C is [ 1.] with an average of 0.535750311238
Best C is [ 2.53] with an average of 0.586559946466
Best C is [ 2.431] with an average of 0.586146081725
Best C is [ 2.4112] with an average of 0.586192111716
Best C is [ 2.40724] with an average of 0.586237576594
Score logit 0.621301775148
Score last 0.579881656805
Score best 0.579881656805
acurracy is 0.62

In [15]:
modelnmf51,pat_testnmf51 = generateNMF(patients_train,survived_train,patients_test,survived_test,n_components=51)


Best C is [ 10.] with an average of 0.52214510632
Best C is [ 10.9] with an average of 0.55781328436
Best C is [ 10.63] with an average of 0.557812410035
Best C is [ 10.576] with an average of 0.557579583796
Best C is [ 10.5652] with an average of 0.557579583796
Score logit 0.603550295858
Score last 0.585798816568
Score best 0.585798816568
acurracy is 0.60

In [16]:
modelnmf75,pat_testnmf75 = generateNMF(patients_train,survived_train,patients_test,survived_test,n_components=75)


Best C is [ 0.1] with an average of 0.557338984598
Best C is [ 0.253] with an average of 0.569762166653
Best C is [ 0.2431] with an average of 0.569636157288
Best C is [ 0.24112] with an average of 0.569725647608
Best C is [ 0.240724] with an average of 0.569909820883
Score logit 0.597633136095
Score last 0.597633136095
Score best 0.597633136095
acurracy is 0.60

AUC


In [17]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report

In [18]:
def aucModel(name,survived_test,model,patients_test):
    logit_roc_auc = roc_auc_score(survived_test,model.predict(patients_test))
    print(name+" AUC = %2.2f2f"% logit_roc_auc,end=" ")
    return logit_roc_auc

In [20]:
accuracyScore,bestScore,nmf_roc_auc = 0,0,0
for x in range(7, 100):
    modelnmfNew,pat_testnmfNew = generateNMF(patients_train,survived_train,patients_test,survived_test,n_components=x)
    newAccuracy = accuracy_score(survived_test,modelnmfNew.predict(pat_testnmfNew))
    newroc_auc = aucModel("NMF "+str(x),survived_test,modelnmfNew,pat_testnmfNew)
    if newroc_auc>nmf_roc_auc:
        modelnmf,pat_testnmf=modelnmfNew,pat_testnmfNew
        accuracyScore,bestScore,nmf_roc_auc=newAccuracy,x,newroc_auc
print("bestScore "+str(bestScore)+" accurracy is %2.2f"% accuracyScore)
print(classification_report(survived_test,modelnmf.predict(pat_testnmf)))


Best C is [ 10.] with an average of 0.504286546874
Best C is [ 8.2] with an average of 0.514017779497
Best C is [ 7.39] with an average of 0.514017779497
Best C is [ 7.228] with an average of 0.514017779497
Best C is [ 7.1956] with an average of 0.514017779497
Score logit 0.544378698225
Score last 0.556213017751
Score best 0.550295857988
acurracy is 0.56
NMF 1 AUC = 0.502f Best C is [ 1.] with an average of 0.51442207998
Best C is [ 0.55] with an average of 0.56365927443
Best C is [ 0.41806] with an average of 0.564867153843
Best C is [ 0.31357] with an average of 0.569507471637
Best C is [ 0.2968516] with an average of 0.577739951985
Score logit 0.538461538462
Score last 0.538461538462
Score best 0.538461538462
acurracy is 0.54
NMF 2 AUC = 0.502f Best C is [ 1.] with an average of 0.514373075153
Best C is [ 0.55] with an average of 0.528899988783
Best C is [ 0.41806] with an average of 0.565842697492
Best C is [ 0.397162] with an average of 0.565611619902
Best C is [ 0.3929824] with an average of 0.56547378583
Score logit 0.538461538462
Score last 0.550295857988
Score best 0.550295857988
acurracy is 0.55
NMF 3 AUC = 0.502f Best C is [ 1.] with an average of 0.505587648481
Best C is [ 2.53] with an average of 0.555758706887
Best C is [ 2.431] with an average of 0.555574533612
Best C is [ 2.4112] with an average of 0.555389795225
Best C is [ 2.40724] with an average of 0.555435260103
Score logit 0.544378698225
Score last 0.491124260355
Score best 0.491124260355
acurracy is 0.54
NMF 4 AUC = 0.502f Best C is [ 1.] with an average of 0.501225333917
Best C is [ 2.53] with an average of 0.563893060294
Best C is [ 2.431] with an average of 0.563620782823
Best C is [ 2.4112] with an average of 0.563716340103
Best C is [ 2.40724] with an average of 0.563668561463
Score logit 0.544378698225
Score last 0.508875739645
Score best 0.508875739645
acurracy is 0.54
NMF 5 AUC = 0.502f Best C is [ 1.] with an average of 0.53001078618
Best C is [ 2.53] with an average of 0.565264001242
Best C is [ 2.431] with an average of 0.565593514985
Best C is [ 2.4112] with an average of 0.565590070999
Best C is [ 2.40724] with an average of 0.565636975315
Score logit 0.550295857988
Score last 0.497041420118
Score best 0.497041420118
acurracy is 0.55
NMF 6 AUC = 0.512f Best C is [ 1.] with an average of 0.535632831869
Best C is [ 2.53] with an average of 0.569532901687
Best C is [ 2.431] with an average of 0.569065297972
Best C is [ 2.4112] with an average of 0.569251785008
Best C is [ 2.40724] with an average of 0.569298689323
Score logit 0.556213017751
Score last 0.544378698225
Score best 0.544378698225
acurracy is 0.56
NMF 7 AUC = 0.522f Best C is [ 1.] with an average of 0.520969555591
Best C is [ 2.53] with an average of 0.556289496547
Best C is [ 2.431] with an average of 0.556615512992
Best C is [ 2.4112] with an average of 0.556664421856
Best C is [ 2.40724] with an average of 0.556709886734
Score logit 0.544378698225
Score last 0.502958579882
Score best 0.502958579882
acurracy is 0.54
NMF 8 AUC = 0.502f Best C is [ 1.] with an average of 0.534933457502
Best C is [ 2.53] with an average of 0.558388515299
Best C is [ 2.431] with an average of 0.558673694388
Best C is [ 2.4112] with an average of 0.558717154717
Best C is [ 2.40724] with an average of 0.558764059033
Score logit 0.550295857988
Score last 0.520710059172
Score best 0.520710059172
acurracy is 0.55
NMF 9 AUC = 0.512f bestScore 7 accurracy is 0.56
             precision    recall  f1-score   support

          0       0.60      0.08      0.14        77
          1       0.55      0.96      0.70        92

avg / total       0.57      0.56      0.44       169


In [21]:
base_roc_auc = roc_auc_score(survived_test,base_rate_model(patients_test))
print("Base Rate AUC = %2.2f2f"% base_roc_auc)
print(classification_report(survived_test,base_rate_model(patients_test)))
logit_roc_auc = aucModel("Logistic",survived_test,model,patients_test)
nmf7_roc_auc = aucModel("NMF 7",survived_test,modelnmf7,pat_testnmf7)
nmf32_roc_auc = aucModel("NMF 32",survived_test,modelnmf32,pat_testnmf32)
nmf51_roc_auc = aucModel("NMF 51",survived_test,modelnmf51,pat_testnmf51)
nmf75_roc_auc = aucModel("NMF 75",survived_test,modelnmf75,pat_testnmf75)


Base Rate AUC = 0.502f
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        77
          1       0.54      1.00      0.70        92

avg / total       0.30      0.54      0.38       169

/usr/local/lib/python3.4/dist-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
Logistic AUC = 0.532f NMF 5 AUC = 0.502f NMF 32 AUC = 0.602f NMF 51 AUC = 0.582f NMF 75 AUC = 0.572f 

In [22]:
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

In [23]:
def roc_curveplot(name,survived_test,model,patients_test,logit_roc_auc):
    fpr, tpr, thresholds = roc_curve(survived_test,model.predict_proba(patients_test)[:,1])
    plt.plot(fpr,tpr,label='ROC '+name+' curve (AUC =%0.2f)' % logit_roc_auc)

In [24]:
plt.figure()
roc_curveplot('Logistic',survived_test,model,patients_test,logit_roc_auc)
roc_curveplot('NMF 7',survived_test,modelnmf7,pat_testnmf7,nmf7_roc_auc)
roc_curveplot('NMF 32',survived_test,modelnmf32,pat_testnmf32,nmf32_roc_auc)
roc_curveplot('NMF 51',survived_test,modelnmf51,pat_testnmf51,nmf51_roc_auc)
roc_curveplot('NMF '+str(bestScore),survived_test,modelnmf,pat_testnmf,nmf_roc_auc)
plt.plot([0,1],[0,1],'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.show()



In [ ]: