In [1]:
from time import time
import psycopg2
from collections import Counter
import gc
import pandas as pd
import numpy as np
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
np.set_printoptions(suppress=True,precision=10)
In [2]:
import sys
import os
sys.path.append(os.path.abspath("/home/scidb/HeartRatePatterns/Python"))
from LogisticRegresion import ajustLogisticRegression
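The `ajustLogisticRegression` helper comes from the project's local `LogisticRegresion` module and is not shown in this notebook. Judging from how it is called below (it receives the train/test splits and returns a fitted classifier with `predict` and `predict_proba`), a minimal sketch of the assumed behaviour is:

# Hypothetical sketch only, not the project's actual implementation:
# fit a scikit-learn LogisticRegression, report test accuracy, return the model.
from sklearn.linear_model import LogisticRegression

def ajustLogisticRegression_sketch(X_train, y_train, X_test, y_test):
    model = LogisticRegression()  # assumed default hyperparameters
    model.fit(X_train, y_train)
    print("accuracy %2.2f" % model.score(X_test, y_test))
    return model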
In [3]:
def selectMatrix(dbname="mimic"):
    # Pull the per-subject word counts joined with the survival flag.
    conn = psycopg2.connect("dbname=" + dbname)
    cur = conn.cursor()
    select_statement = ("SELECT m.subject_id,m.word,m.counting,s.isalive "
                        " FROM matrix m LEFT JOIN subjectwords s ON m.subject_id=s.subject_id")
    cur.execute(select_statement)
    select = []
    for row in cur:
        select.append(row)
    cur.close()
    conn.close()
    return select
In [4]:
def convertMatrix():
    # Load the query results into a DataFrame and pivot to one row per subject.
    labels = ['subject_id', 'Word', 'Counting', 'isAlive']
    df = pd.DataFrame.from_records(selectMatrix(), columns=labels)
    print(len(df))
    return pd.pivot_table(df, index=["subject_id", "isAlive"], columns=["Word"], values=["Counting"],
                          aggfunc={"Counting": [np.sum]}, fill_value=0)
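For reference, the pivot produces one row per (subject_id, isAlive) pair and one (Counting, sum, Word) column per distinct word. A toy example with made-up data illustrates the layout:

# Toy illustration (synthetic data) of the pivot used in convertMatrix().
toy = pd.DataFrame({"subject_id": [1, 1, 2], "isAlive": [1, 1, 0],
                    "Word": ["ab", "cd", "ab"], "Counting": [3, 1, 2]})
print(pd.pivot_table(toy, index=["subject_id", "isAlive"], columns=["Word"],
                     values=["Counting"], aggfunc={"Counting": [np.sum]}, fill_value=0))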
In [5]:
t0 = time()
table = convertMatrix()
print("convertMatrix done in %0.3fs." % (time() - t0))
print(table.shape)
gc.collect()
In [6]:
survived = table.index.labels[1].tolist()
patients = table.values
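`MultiIndex.labels` is the pre-0.24 pandas API (later renamed to `codes` and then removed). On a newer pandas, and assuming `isAlive` is coded 0/1, the same label vector can be read by level name:

# Equivalent on pandas >= 0.24, reading the isAlive index level directly.
survived = table.index.get_level_values("isAlive").tolist()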
In [7]:
patients_train, patients_test,survived_train, survived_test = train_test_split(patients,survived,test_size=0.2, random_state=42)
In [8]:
print(table.shape)
print(patients_train.shape)
print(patients_test.shape)
In [9]:
def base_rate_model(X):
    # Baseline: predict the positive class (isAlive = 1) for every patient.
    y = np.ones(X.shape[0])
    return y
In [10]:
y_base_rate = base_rate_model(patients_test)
print("Base rate accuracy is %2.2f" % accuracy_score(survived_test, y_base_rate))
In [11]:
model = ajustLogisticRegression(patients_train,survived_train,patients_test,survived_test)
In [12]:
def generateNMF(patients_train, survived_train, patients_test, survived_test, n_components=30):
    # Factor the word-count matrix into n_components topics, then fit the
    # logistic regression on the topic weights instead of the raw counts.
    nmf = NMF(n_components=n_components, random_state=1, alpha=.1, l1_ratio=.5)
    patients_trainnmf = nmf.fit_transform(patients_train)
    H = nmf.components_
    patients_testnmf = nmf.transform(patients_test)
    modelnmf = ajustLogisticRegression(patients_trainnmf, survived_train, patients_testnmf, survived_test)
    return modelnmf, patients_testnmf
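Note that in scikit-learn 1.2+ the NMF `alpha` parameter no longer exists, having been split into `alpha_W`/`alpha_H`. On a recent install the NMF line inside generateNMF would become roughly the following (the exact regularization scaling differs slightly between versions):

# Rough equivalent for scikit-learn >= 1.2, where `alpha` was removed.
nmf = NMF(n_components=n_components, random_state=1,
          alpha_W=0.1, alpha_H="same", l1_ratio=0.5)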
In [13]:
modelnmf7,pat_testnmf7 = generateNMF(patients_train,survived_train,patients_test,survived_test,n_components=7)
In [14]:
modelnmf32,pat_testnmf32 = generateNMF(patients_train,survived_train,patients_test,survived_test,n_components=32)
In [15]:
modelnmf51,pat_testnmf51 = generateNMF(patients_train,survived_train,patients_test,survived_test,n_components=51)
In [16]:
modelnmf75,pat_testnmf75 = generateNMF(patients_train,survived_train,patients_test,survived_test,n_components=75)
In [17]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
In [18]:
def aucModel(name, survived_test, model, patients_test):
    logit_roc_auc = roc_auc_score(survived_test, model.predict(patients_test))
    print(name + " AUC = %2.2f" % logit_roc_auc, end=" ")
    return logit_roc_auc
In [20]:
accuracyScore, bestScore, nmf_roc_auc = 0, 0, 0
for x in range(7, 100):
    modelnmfNew, pat_testnmfNew = generateNMF(patients_train, survived_train, patients_test, survived_test, n_components=x)
    newAccuracy = accuracy_score(survived_test, modelnmfNew.predict(pat_testnmfNew))
    newroc_auc = aucModel("NMF " + str(x), survived_test, modelnmfNew, pat_testnmfNew)
    if newroc_auc > nmf_roc_auc:
        modelnmf, pat_testnmf = modelnmfNew, pat_testnmfNew
        accuracyScore, bestScore, nmf_roc_auc = newAccuracy, x, newroc_auc
print("bestScore " + str(bestScore) + " accuracy is %2.2f" % accuracyScore)
print(classification_report(survived_test, modelnmf.predict(pat_testnmf)))
In [21]:
base_roc_auc = roc_auc_score(survived_test, base_rate_model(patients_test))
print("Base Rate AUC = %2.2f" % base_roc_auc)
print(classification_report(survived_test, base_rate_model(patients_test)))
logit_roc_auc = aucModel("Logistic",survived_test,model,patients_test)
nmf7_roc_auc = aucModel("NMF 7",survived_test,modelnmf7,pat_testnmf7)
nmf32_roc_auc = aucModel("NMF 32",survived_test,modelnmf32,pat_testnmf32)
nmf51_roc_auc = aucModel("NMF 51",survived_test,modelnmf51,pat_testnmf51)
nmf75_roc_auc = aucModel("NMF 75",survived_test,modelnmf75,pat_testnmf75)
In [22]:
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
In [23]:
def roc_curveplot(name, survived_test, model, patients_test, logit_roc_auc):
    # Plot the ROC curve of a fitted model on the held-out patients.
    fpr, tpr, thresholds = roc_curve(survived_test, model.predict_proba(patients_test)[:, 1])
    plt.plot(fpr, tpr, label='ROC ' + name + ' curve (AUC = %0.2f)' % logit_roc_auc)
In [24]:
plt.figure()
roc_curveplot('Logistic',survived_test,model,patients_test,logit_roc_auc)
roc_curveplot('NMF 7',survived_test,modelnmf7,pat_testnmf7,nmf7_roc_auc)
roc_curveplot('NMF 32',survived_test,modelnmf32,pat_testnmf32,nmf32_roc_auc)
roc_curveplot('NMF 51',survived_test,modelnmf51,pat_testnmf51,nmf51_roc_auc)
roc_curveplot('NMF '+str(bestScore),survived_test,modelnmf,pat_testnmf,nmf_roc_auc)
plt.plot([0,1],[0,1],'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.show()