In [1]:
import sys
import os
sys.path.append(os.path.abspath("/home/scidb/HeartRatePatterns/Python"))
from LogisticRegresion import ajustLogisticRegression
from PlotWords import plot_word
from Matrix import convert_matrix
from NMF import generateNMF, auc_model, find_best_NMF

In [2]:
def findpatient(message,table,beats):
    groupsize = table.shape[0]
    deads = sum(table.index.labels[1].tolist())
    percent = -1 if groupsize==0 else deads/groupsize
    if groupsize!=0:
        print(message,"groupsize:%d"%groupsize,"deads:%d"%deads,"percent:{0:.2%}".format(percent),beats)

In [3]:
import psycopg2
def selectPearson(word,dbname="mimic") :
    conn = psycopg2.connect("dbname="+dbname)
    cur = conn.cursor()
    select_statement='SELECT patient,deadpatient,p1 FROM wordspearson WHERE word = %s'
#    print(cur.mogrify(select_statement,locals()))
    cur.execute(select_statement,(word,))
    select = {}
    for row in cur :
        patient=row[0]
        select = {'word':word,"pacientes":patient,"porcentaje de muertos":"{0:.2%}".format(row[1]/patient),"p1":row[2]}
    cur.close()
    conn.close()
    return select
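
A hedged usage sketch of selectPearson; the values in the comment are illustrative, copied from the Pearson listing further below rather than freshly queried.

In [ ]:
# Illustrative only: needs the local "mimic" database and its wordspearson table.
# Expected shape of the result, e.g. for the word 'abbg':
# {'word': 'abbg', 'pacientes': 134, 'porcentaje de muertos': '86.57%', 'p1': 0.1349}
selectPearson('abbg')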

In [4]:
def print_top_words(components, feature_names, topic_index=None):
    result = []
    for topic_idx, topic in enumerate(components):
        # sort each topic's features by weight, largest first, keeping only positive weights
        features = [{"word": feature_names[i], "p1": topic[i]}
                    for i in topic.argsort()[::-1] if topic[i] > 0]
        if features and (topic_index is None or topic_index == topic_idx):
            result.append({"topic": topic_idx, "features": features})
    return result

In [5]:
def addValue(key,my_dict):
    if key in my_dict:
        my_dict[key] += 1
    else:
        my_dict[key] = 1
    return my_dict

In [6]:
from operator import itemgetter
def predict_proba(compoT,table):
    predicts = modelnmf.predict_proba(compoT)[:,1]
    beats = list(table)
    sortedBeats = []
    numPred = len(predicts)
    nvals={}
    for i in range(0,numPred):
        word = beats[i]
        sel = selectPearson(word)
        if sel=={}:
            print("palabra no encontrada en pearson:",word)
        sortedBeats.append({'word':beats[i],'predict':predicts[i],'pacientes':sel['pacientes'],
                            'porcentaje de muertos':sel['porcentaje de muertos'],"p1":sel['p1']})
        if predicts[i]>.99 : addValue("99",nvals)
        elif predicts[i]>.90 : addValue("90",nvals)
        elif predicts[i]>.80 : addValue("80",nvals)
        elif predicts[i]>.70 : addValue("70",nvals)
        else: addValue("under",nvals)
    print(nvals)
    return sorted(sortedBeats, key=itemgetter('p1'), reverse=True)

In [7]:
table = convert_matrix(with_pearson=True) #.iloc[:,:400]
survived = table.index.labels[1].tolist()  # codes of the isAlive index level
patients = table.values
table


(554, 400)
Out[7]:
aaadc aabadd aabaf aabafb aabaga aabbg aabbgb aabdab aabdad aabdb ... hag haj hbf hdd hddd hfa hfg hga hgd jag
subject_id isAlive
135 1 0 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
151 1 0 1 0 0 2 0 0 2 0 2 ... 0 0 0 0 0 0 0 0 0 0
177 1 0 0 0 0 0 0 0 1 0 7 ... 0 0 0 0 0 0 0 0 0 0
214 1 0 0 2 0 1 0 0 0 0 2 ... 0 0 0 0 0 0 0 0 0 0
263 1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
279 1 0 0 0 0 0 0 0 1 0 6 ... 0 0 0 0 0 0 0 0 0 0
283 1 0 1 0 0 0 0 0 2 0 2 ... 0 0 0 0 0 0 0 0 0 0
368 1 0 0 1 1 1 1 1 2 0 0 ... 0 0 0 0 0 0 0 0 0 0
377 1 1 1 2 1 3 0 0 8 2 16 ... 0 0 0 1 1 1 0 0 0 0
408 1 0 0 0 0 0 0 0 2 5 2 ... 0 0 0 0 0 0 0 0 0 0
462 0 0 8 2 1 0 0 0 14 4 15 ... 0 0 0 0 0 0 0 0 0 0
618 1 0 0 0 0 0 0 0 1 0 5 ... 0 0 0 0 0 0 0 0 0 0
638 1 2 0 2 2 1 1 0 2 0 3 ... 0 1 0 1 1 0 0 1 0 0
682 1 0 0 1 0 0 0 0 0 0 1 ... 3 2 0 1 1 7 0 3 0 2
736 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
743 1 0 0 0 0 0 0 0 2 0 6 ... 1 1 0 0 0 1 0 0 0 0
749 1 0 0 1 0 0 0 0 0 1 1 ... 0 0 0 0 0 0 0 0 0 0
793 1 2 2 2 1 3 0 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
886 1 0 2 1 0 0 0 0 1 0 0 ... 0 0 0 0 0 1 0 0 0 0
952 1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1004 1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1075 1 1 1 2 0 1 1 0 1 6 6 ... 0 0 0 0 0 0 0 0 1 0
1144 0 0 0 0 0 0 1 0 0 0 1 ... 0 0 0 1 0 0 0 1 0 0
1160 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1222 0 0 0 4 2 0 2 0 5 2 10 ... 0 0 0 1 0 0 0 0 0 0
1226 1 0 0 2 1 0 0 0 1 0 3 ... 0 0 0 0 0 0 0 0 0 0
1459 0 0 4 4 3 0 0 0 10 1 24 ... 0 0 0 0 0 0 0 0 0 0
1528 1 0 1 6 2 3 0 0 3 0 3 ... 0 0 0 0 0 0 0 0 0 0
1531 1 0 0 0 0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 0 0
1569 1 0 0 6 1 0 1 0 1 2 3 ... 1 0 0 0 0 0 0 2 0 3
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
23034 1 0 0 6 4 0 0 0 13 2 17 ... 1 0 0 0 0 0 0 0 1 0
23097 1 10 4 7 1 1 1 0 3 4 11 ... 0 0 0 1 1 0 0 0 0 0
23120 1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23130 1 0 0 1 0 0 0 0 2 0 3 ... 0 0 0 0 0 0 0 0 0 0
23178 1 0 0 3 3 0 2 1 2 0 14 ... 0 0 0 1 0 0 0 0 1 0
23200 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23298 0 0 0 0 0 0 0 0 1 0 1 ... 0 0 0 0 0 0 0 0 0 0
23336 1 0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23363 1 0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23384 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23401 1 0 10 2 0 0 0 0 8 2 8 ... 0 0 0 0 0 0 0 0 0 0
23451 1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23468 1 3 1 0 0 1 0 0 5 5 8 ... 1 0 0 0 0 0 0 0 0 0
23474 1 0 0 3 0 1 1 0 3 0 9 ... 0 0 0 0 0 0 0 0 1 0
23510 1 0 0 0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23944 1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
24004 1 0 0 4 0 0 0 0 4 0 2 ... 0 0 0 1 0 0 0 0 0 0
24076 1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
24129 1 0 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 0 1 0 0
24133 0 0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
24142 1 0 0 2 1 2 0 0 3 0 23 ... 0 0 0 0 0 0 0 0 0 0
24152 1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
24185 1 0 0 0 0 0 0 0 5 0 7 ... 0 0 0 0 0 0 0 0 0 0
24227 0 0 0 0 0 0 0 0 0 0 3 ... 0 0 0 0 0 0 0 0 0 0
25466 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41962 1 0 3 6 0 0 0 0 1 2 5 ... 2 0 0 8 2 4 2 1 4 1
42261 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
42410 0 0 0 0 0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 0 0
42492 0 0 0 2 1 1 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 0 0
43459 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

554 rows × 400 columns


In [8]:
from sklearn.model_selection import train_test_split
patients_train, patients_test, survived_train, survived_test = train_test_split(patients, survived, test_size=0.2, random_state=42)

In [9]:
model,acurracy = ajustLogisticRegression(patients_train,survived_train,patients_test,survived_test)
logit_roc_auc = auc_model("Logistic",model,patients_test,survived_test)


acurracy is 70.27%
Logistic AUC = 0.4632f
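
ajustLogisticRegression and auc_model come from the project's LogisticRegresion and NMF modules; the sketch below shows what they are assumed to do with scikit-learn (the function name and return values are assumptions, not the project code).

In [ ]:
# Hedged sketch: fit a logistic regression and report test accuracy and ROC AUC.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

def fit_logistic_sketch(X_train, y_train, X_test, y_test):
    clf = LogisticRegression()
    clf.fit(X_train, y_train)
    accuracy = clf.score(X_test, y_test)  # fraction of correct test predictions
    auc = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])  # area under the ROC curve
    return clf, accuracy, auc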

In [10]:
def countPatients(word,dbname="mimic") :
    conn = psycopg2.connect("dbname="+dbname)
    cur = conn.cursor()
    select_statement='''SELECT count(1),sum(isalive) FROM matrix m LEFT JOIN subjectwords s 
    ON m.subject_id=s.subject_id where m.word = %s GROUP BY m.word'''
#    print(cur.mogrify(select_statement,(word,)))
    cur.execute(select_statement,(word,))
    select = {}
    for row in cur :
        select = {"patient":row[0],"deadPatient":row[1],}
    cur.close()
    conn.close()
    return select

In [11]:
from scipy.stats.stats import pearsonr
columns = list(table.columns.values)
pearsonList = []
for i in range(len(columns)):
    pearson = pearsonr(patients[:,i],survived)
    word = columns[i]
    count = countPatients(word)
    deadPatient,patient = count['deadPatient'],count['patient']
    percent = deadPatient/patient
    pearsonList.append({'word':word,'p1':pearson[0],'p2':pearson[1],'percent':"{0:.2%}".format(percent)+" de %d" %patient})
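
pearsonr returns the pair (correlation coefficient, two-sided p-value), stored here as p1 and p2. As a sanity check, the coefficient for a single word column can be recomputed from its definition with numpy (a sketch, assuming the patients and survived variables from the cells above).

In [ ]:
# Hedged check: Pearson r for the first column, computed by hand.
import numpy as np
x = patients[:, 0].astype(float)
y = np.asarray(survived, dtype=float)
r_manual = ((x - x.mean()) * (y - y.mean())).sum() / np.sqrt(((x - x.mean())**2).sum() * ((y - y.mean())**2).sum())
print(r_manual, pearsonr(patients[:, 0], survived)[0])  # the two values should agree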

In [12]:
pearsonList = sorted(pearsonList, key=itemgetter('p1'), reverse=True)
pearsonList[:5]


Out[12]:
[{'p1': 0.14447975105937516,
  'p2': 0.00064753378305492824,
  'percent': '90.11% de 91',
  'word': 'babaaf'},
 {'p1': 0.13704871227843324,
  'p2': 0.0012218563623213852,
  'percent': '84.00% de 150',
  'word': 'babaf'},
 {'p1': 0.13674682231211277,
  'p2': 0.001252978906552512,
  'percent': '90.10% de 101',
  'word': 'ababaf'},
 {'p1': 0.13498226343655298,
  'p2': 0.001449968856339564,
  'percent': '86.57% de 134',
  'word': 'abbg'},
 {'p1': 0.13318083120123872,
  'p2': 0.001680102147764,
  'percent': '97.44% de 39',
  'word': 'fbabd'}]

In [14]:
modelnmf, bestNmf, patientnmf, accuracyScore, p1, bestScore = find_best_NMF(patients,survived)


acurracy is 78.38%
NMF 2 AUC = 0.5002f
pearson 0.0537649203283
acurracy is 78.38%
NMF 3 AUC = 0.5002f
pearson 0.0970548800057
acurracy is 78.38%
NMF 4 AUC = 0.5002f
pearson 0.0900181934643
acurracy is 78.38%
NMF 5 AUC = 0.5002f
pearson 0.0815622793441
acurracy is 78.38%
NMF 6 AUC = 0.5002f
pearson 0.0796729240204
acurracy is 78.38%
NMF 7 AUC = 0.5002f
pearson 0.127376250873
acurracy is 78.38%
NMF 8 AUC = 0.5002f
pearson 0.127157220215
acurracy is 76.58%
NMF 9 AUC = 0.4892f
pearson 0.123638781336
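
find_best_NMF sweeps the number of NMF components and keeps the best-scoring factorization; the sketch below shows what a single pass is assumed to look like with scikit-learn (the real implementation lives in the project's NMF module).

In [ ]:
# Hedged sketch of one NMF + logistic-regression pass; find_best_NMF is assumed
# to loop this over n_components and keep the best model.
from sklearn.decomposition import NMF as SklearnNMF
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

def nmf_pass_sketch(X, y, n_components, random_state=42):
    nmf = SklearnNMF(n_components=n_components, random_state=random_state)
    W = nmf.fit_transform(X)  # (n_patients, n_components) group weights
    clf = LogisticRegression().fit(W, y)
    auc = roc_auc_score(y, clf.predict_proba(W)[:, 1])
    return nmf, clf, auc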

In [ ]:
from sklearn.metrics import classification_report
print("bestScore "+str(bestScore)+" accurracy is %2.2f"% accuracyScore)
print(classification_report(survived,modelnmf.predict(patientnmf)))

In [ ]:
import numpy as np
compoT = np.transpose(bestNmf.components_)  # (n_words, n_components): each row is a word's loading on the NMF groups
print("components",bestNmf.components_.shape)
print("components Transpose",compoT.shape)

In [ ]:
sortedBeats = predict_proba(compoT,table)
beatKeys = []
for value in sortedBeats:
    beatKeys.append(value['word'])

In [ ]:
print("Top 5 HeartBeats con mayor probabilidad de muerte según la regresión logística")
print(sortedBeats[:5])
plot_word(sortedBeats[:5])

In [ ]:
from operator import itemgetter
from scipy.stats.stats import pearsonr
patients_trainnmf = bestNmf.transform(patients_train)
pearsonList = []
for i in range(bestScore):
    patientpear=patients_trainnmf[:,i]
    pearson = pearsonr(patientpear,survived_train)
  #  if(pearson[0]>0):
    pearsonList.append({'group':i,'p1':pearson[0],'p2':pearson[1]})
sortedList = sorted(pearsonList, key=itemgetter('p1'), reverse=True)
sortedList[:10]

In [ ]:
columns = list(table)
components = bestNmf.components_
topword = print_top_words(components, columns,topic_index=sortedList[0]['group'])[0]['features']
subwords = []
for subword in topword:
    if subword['p1']>0:
        subwords.append(subword['word'])
print(str(subwords[:10]))

In [ ]:
table = convert_matrix(sumvals=False,filter_words=tuple(subwords))
survived = table.index.labels[1].tolist()
patients = table.values
table

In [ ]:
patients_train, patients_test, survived_train, survived_test = train_test_split(patients, survived, test_size=0.2, random_state=42)
print(table.shape)
print(patients_train.shape)
print(patients_test.shape)

In [ ]:
model,acurracy = ajustLogisticRegression(patients_train,survived_train,patients_test,survived_test)

In [ ]:
logit_roc_auc = auc_model("Logistic",model,patients_test,survived_test)

In [ ]:
columns = list(table.columns.values)
pearsonList = []
for i in range(len(columns)):
    pearson = pearsonr(patients[:,i],survived)
    word = columns[i]
    count = countPatients(word)
    deadPatient,patient = count['deadPatient'],count['patient']
    percent = deadPatient/patient
    pearsonList.append({'word':word,'p1':pearson[0],'p2':pearson[1],'percent':"{0:.2%}".format(percent)+" de %d" %patient})

In [ ]:
pearsonList = sorted(pearsonList, key=itemgetter('p1'), reverse=True)
pearsonList[:5]
