Identification of groups using the results of the previous prediction

Imports


In [1]:
import sys 
import os
sys.path.append(os.path.abspath("/home/scidb/HeartRatePatterns/Python"))
from LogisticRegresion import ajustLogisticRegression
from PlotWords import plot_word
from Matrix import convert_matrix
from NMF import generateNMF
import pandas as pd

Number of components used to factorize the matrix, according to the previous result


In [2]:
n_components=11
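
The value 11 was selected in the previous notebook. If that scan had to be redone here, a minimal sketch (assuming the generateNMF signature used in cell In [9]; it can only run after cell In [7] defines patients and survived):

# Sketch: keep the component count with the best ROC AUC.
best_k, best_auc = None, -1.0
for k in range(2, 21):
    _, _, _, _, roc_auc = generateNMF(patients, survived, n_components=k)
    if roc_auc > best_auc:
        best_k, best_auc = k, roc_auc
print(best_k, best_auc)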

In [3]:
def findpatient(message, table, beats):
    """Print the size, death count, and death rate of a patient group."""
    groupsize = table.shape[0]
    deads = sum(table.index.labels[1].tolist())
    percent = -1 if groupsize == 0 else deads / groupsize
    if groupsize != 0:
        print(message, "groupsize:%d" % groupsize, "deads:%d" % deads,
              "percent:{0:.2%}".format(percent), beats)

In [4]:
import psycopg2
def selectPearson(word, dbname="mimic"):
    """Look up patient count, death percentage, and Pearson r for a word."""
    conn = psycopg2.connect("dbname=" + dbname)
    cur = conn.cursor()
    select_statement = 'SELECT patient,deadpatient,p1 FROM wordspearson WHERE word = %s'
    cur.execute(select_statement, (word,))
    select = {}
    for row in cur:
        patient = row[0]
        select = {'word': word, "pacientes": patient,
                  "porcentaje de muertos": "{0:.2%}".format(row[1] / patient),
                  "correlación": row[2]}
    cur.close()
    conn.close()
    return select
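
A hypothetical call, using the values that Out[12] below reports for the word 'ddd':

selectPearson('ddd')
# → {'word': 'ddd', 'pacientes': 223,
#    'porcentaje de muertos': '81.17%', 'correlación': 0.120737}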

In [5]:
def addValue(key,my_dict):
    if key in my_dict:
        my_dict[key] += 1
    else:
        my_dict[key] = 1
    return my_dict
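
addValue is a hand-rolled counter; collections.Counter from the standard library would do the same, as a sketch:

from collections import Counter
nvals = Counter()
nvals["90"] += 1  # equivalent to addValue("90", nvals)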

In [6]:
from operator import itemgetter
def predict_proba(modelnmf, compoT, table):
    """Score every word with the NMF-space classifier and attach its Pearson stats."""
    predicts = modelnmf.predict_proba(compoT)[:, 1]
    beats = list(table)
    sortedBeats = []
    nvals = {}
    for i in range(len(predicts)):
        word = beats[i]
        sel = selectPearson(word)
        if sel == {}:
            print("word not found in wordspearson:", word)
            continue
        cuantosMueren = sel['porcentaje de muertos'] + " de " + str(sel['pacientes'])
        sortedBeats.append({'word': word, 'predict': predicts[i],
                            'cuantosMueren': cuantosMueren,
                            "correlación": sel['correlación']})
        # Bucket the predicted probabilities for a quick histogram.
        if predicts[i] > .99: addValue("99", nvals)
        elif predicts[i] > .90: addValue("90", nvals)
        elif predicts[i] > .80: addValue("80", nvals)
        elif predicts[i] > .70: addValue("70", nvals)
        else: addValue("under", nvals)
    print(nvals)
    return sorted(sortedBeats, key=itemgetter('correlación'), reverse=True)

In [7]:
table = convert_matrix(len_words=(3,4,5,6))
survived = table.index.labels[1].tolist()
patients = table.values


(590, 18683)

In [8]:
from sklearn.model_selection import train_test_split
patients_train, patients_test,survived_train, survived_test = train_test_split(patients,survived,test_size=0.2, random_state=42)

In [9]:
modelnmf, nmf, patients_nmf, accuracy, roc_aucNew = generateNMF(patients, survived, n_components=n_components)
print("accuracy", accuracy)


accuracy is 77.1186%

In [10]:
import numpy as np
compoT = np.transpose(nmf.components_)
print("components",nmf.components_.shape)
print("components Transpose",compoT.shape)
print(nmf.reconstruction_err_)


components (11, 18683)
components Transpose (18683, 11)
15094.2209658
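
The logistic model was trained on patients expressed in the 11-dimensional NMF space, so scoring the transposed components_ matrix rates each individual word in that same space. reconstruction_err_ is the Frobenius norm of the factorization residual; a quick sanity check, assuming patients_nmf is the W matrix returned by generateNMF:

err = np.linalg.norm(patients - patients_nmf.dot(nmf.components_))
print(err)  # should be close to nmf.reconstruction_err_ (~15094.22)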

In [11]:
sortedBeats = predict_proba(modelnmf,compoT,table)
beatKeys = []
for value in sortedBeats:
    beatKeys.append(value['word'])


{'99': 12, '70': 18399, 'under': 175, '90': 27, '80': 70}

In [12]:
#print(sortedBeats[:10])
df = pd.DataFrame(sortedBeats)
df = df.set_index('word')
df = df.sort_values(['predict'], ascending=[False])
df


Out[12]:
correlación cuantosMueren predict
word
baba 0.013894 74.55% de 558 1.000000
abab 0.004972 74.46% de 556 1.000000
ababa -0.026421 74.09% de 548 0.999999
babab -0.045099 73.71% de 525 0.999997
ababab -0.022126 74.01% de 504 0.999994
bababa -0.022303 74.02% de 508 0.999994
ddd 0.120737 81.17% de 223 0.999562
dddd 0.151999 87.20% de 125 0.996478
eee -0.069023 57.89% de 19 0.996063
eeee -0.053322 60.00% de 15 0.995438
eeeee -0.053072 58.33% de 12 0.994826
eeeeee -0.053072 58.33% de 12 0.994236
ddddd 0.127952 88.61% de 79 0.987524
dddddd 0.123666 91.07% de 56 0.970772
adb 0.044367 75.15% de 515 0.962858
dba 0.046960 75.25% de 505 0.959284
dad 0.079473 77.30% de 348 0.953160
aabab -0.033428 73.99% de 546 0.953130
add 0.068976 76.80% de 362 0.952909
bdb 0.105833 76.54% de 486 0.950998
babaa -0.026421 74.09% de 548 0.950975
aad 0.054559 75.14% de 539 0.949040
dbb 0.061406 75.55% de 499 0.948452
dda 0.074431 77.03% de 357 0.948283
aababa -0.048417 73.74% de 537 0.945605
dbd 0.135330 80.73% de 275 0.942196
ababaa -0.047174 73.73% de 533 0.940767
bad 0.027017 74.90% de 502 0.935041
bbd 0.056602 75.57% de 483 0.934190
ddb 0.137471 80.63% de 284 0.929283
... ... ... ...
aaaada -0.007012 74.25% de 466 0.637564
aadaaa -0.018929 73.99% de 469 0.636226
aaadaa -0.018929 73.99% de 469 0.636002
bbbaa -0.048213 73.81% de 546 0.630013
adaaa 0.033267 75.05% de 493 0.628866
aadaa 0.001291 74.43% de 485 0.628864
aaada -0.004059 74.32% de 479 0.626935
daaaa 0.025576 74.90% de 494 0.625192
aabbb -0.044901 73.86% de 547 0.621917
bbbbab 0.002718 74.46% de 505 0.619443
babbbb -0.002915 74.35% de 503 0.619264
aca 0.023238 75.06% de 417 0.617627
bbabbb 0.004863 74.50% de 498 0.615471
bbbabb 0.033835 75.00% de 508 0.615454
abbbbb -0.042739 73.68% de 513 0.613046
bbbbba -0.037253 73.79% de 515 0.612743
aac 0.037606 75.49% de 412 0.612532
caa -0.016712 74.00% de 450 0.606858
cccccc 0.011110 76.00% de 50 0.590056
bbabb -0.007885 74.29% de 525 0.586286
ccccc 0.033319 78.69% de 61 0.575745
abbbb -0.048417 73.74% de 537 0.567314
bbbba -0.041096 73.83% de 535 0.566913
bbba -0.020404 74.19% de 558 0.557916
cccc 0.016729 76.25% de 80 0.540503
abbb -0.020404 74.19% de 558 0.535858
ccc 0.039625 77.54% de 138 0.474788
bbbbbb -0.008443 74.25% de 501 0.268469
bbbb -0.035759 73.94% de 541 0.207765
bbbbb -0.041396 73.75% de 522 0.203861

18683 rows × 3 columns


In [13]:
print("Top 5 HeartBeats con mayor probabilidad de muerte según la regresión logística")
plot_word(sortedBeats[:10])


Top 10 heartbeats with the highest probability of death according to the logistic regression

In [14]:
from operator import itemgetter
from scipy.stats import pearsonr
pd.set_option('display.float_format', lambda x: '%.5f' % x)

def find_pearson(value, patient, survived):
    pearsonList = []
    for i in range(value):
        patientpear = patient[:, i]
        pearson = pearsonr(patientpear, survived)
        pearsonList.append({'grupo':i,'correlación':pearson[0],'p-valor':pearson[1]})
    return sorted(pearsonList, key=itemgetter('correlación'), reverse=True)

sortedList = find_pearson(n_components,patients_nmf,survived)
df = pd.DataFrame(sortedList)
df = df.set_index('grupo')
df = df.sort_values(['correlación'], ascending=[False])
df


Out[14]:
correlación p-valor
grupo
3 0.08482 0.03943
9 0.07765 0.05945
7 0.03780 0.35936
4 0.03457 0.40195
6 -0.02033 0.62221
10 -0.02061 0.61741
1 -0.02564 0.53416
5 -0.03619 0.38028
2 -0.04026 0.32898
8 -0.06977 0.09043
0 -0.06997 0.08950

In [15]:
def print_top_words(components, feature_names, topic_index=None):
    """Collect the positively weighted words of each topic (or of one topic)."""
    result = []
    for topic_idx, topic in enumerate(components):
        # argsort()[:-1] yields feature indices in ascending weight order,
        # skipping the single highest-weight feature.
        features = [{"word": feature_names[i], "correlación": topic[i]}
                    for i in topic.argsort()[:-1] if topic[i] > 0]
        if features and (topic_index is None or topic_index == topic_idx):
            result.append({"topic": topic_idx, "features": features})
    return result
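
Because of that ordering, the ten words printed in cell In [16] are the smallest positive loadings of the selected topic. If the largest loadings were wanted instead, a sketch:

# Sketch: the n highest-weight words of one topic, in descending order.
def top_n_words(topic, feature_names, n=10):
    idx = topic.argsort()[::-1][:n]
    return [feature_names[i] for i in idx if topic[i] > 0]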

In [16]:
columns = list(table)
topword = print_top_words(nmf.components_, columns,topic_index=sortedList[0]['grupo'])[0]['features']
subwords = []
for subword in topword:
    if subword['correlación']>0:
        subwords.append(subword['word'])
print(str(subwords[:10]))


['eeabce', 'ccadb', 'gdbg', 'bdcbca', 'ebcecc', 'ggafh', 'abceee', 'dddddg', 'baacbe', 'babdfa']

In [17]:
table = convert_matrix(sumvals=False,filter_words=tuple(subwords))
survived = table.index.labels[1].tolist()
patients = table.values
table


(590, 5879)
Out[17]:
aaaaae aaaabd aaaabe aaaaci aaaadg aaaae aaaaea aaaaeb aaaaee aaaafe ... jgjj jha jhag jhb jja jjaa jjf jjha jjj jjja
subject_id isAlive
20 0 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
135 1 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
151 1 0 1 0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
177 1 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
214 1 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
263 1 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
279 1 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
283 1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
368 1 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
377 1 0 1 0 0 1 1 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
408 1 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
462 0 0 1 0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
618 1 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
638 1 0 1 0 0 1 0 0 0 0 1 ... 0 1 0 0 0 0 0 1 0 0
682 1 1 1 0 0 1 1 1 0 0 0 ... 0 1 1 0 0 0 0 0 0 0
736 0 1 0 0 0 0 1 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
743 1 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 1 1 0 0 1 1
749 1 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
793 1 0 1 1 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
886 1 0 1 0 0 1 0 0 0 0 0 ... 0 1 0 0 0 0 0 0 0 0
952 1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
974 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1004 1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1075 1 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1144 0 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1160 0 0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1222 0 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1226 1 0 1 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1459 0 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1528 1 0 1 0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
23178 1 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23193 0 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23200 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23298 0 1 1 1 0 0 1 1 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
23336 1 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23339 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23363 1 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23384 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23401 1 0 1 0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23451 1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23468 1 0 1 0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23474 1 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23510 1 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23944 1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
24004 1 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
24030 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
24076 1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
24129 1 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
24133 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
24142 1 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
24152 1 1 0 0 0 0 1 0 1 0 0 ... 0 0 0 0 0 0 0 0 1 0
24185 1 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
24227 0 0 1 1 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
25466 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41962 1 0 1 0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
42255 1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
42261 0 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
42410 0 1 0 0 0 0 1 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
42492 0 0 1 0 0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 0 0
43459 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

590 rows × 5879 columns


In [18]:
patients_train, patients_test,survived_train, survived_test = train_test_split(patients,survived,test_size=0.2, random_state=42)
print(table.shape)
print(patients_train.shape)
print(patients_test.shape)


(590, 5879)
(472, 5879)
(118, 5879)

In [23]:
model, accuracy, roc_auc = ajustLogisticRegression(patients_train, survived_train, patients_test, survived_test)
print("accuracy is {0:.4%}".format(accuracy), "roc_auc", roc_auc)


accuracy is 77.1186%
accuracy is 77.1186% roc_auc 0.5
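
A ROC AUC of 0.5 means the model ranks the two classes no better than chance, so the 77.12% accuracy is most likely just the share of the majority label in the test split. A quick check, as a sketch:

print(np.mean(survived_test))  # fraction of label 1; ~0.77 would explain the accuracy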

In [24]:
def countPatients(word, dbname="mimic"):
    """Count the patients whose series contain a word and how many of them died."""
    conn = psycopg2.connect("dbname=" + dbname)
    cur = conn.cursor()
    select_statement = '''SELECT count(1),sum(isalive) FROM matrix m LEFT JOIN subjectwords s
    ON m.subject_id=s.subject_id WHERE m.word = %s GROUP BY m.word'''
    cur.execute(select_statement, (word,))
    select = {}
    for row in cur:
        select = {"patient": row[0], "deadPatient": row[1]}
    cur.close()
    conn.close()
    return select
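
A hypothetical call, consistent with the 'adc' row of Out[26] below (89.68% of 155 patients):

countPatients('adc')
# → {'patient': 155, 'deadPatient': 139}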

In [25]:
columns = list(table.columns.values)
pearsonList = []
for i, word in enumerate(columns):
    pearson = pearsonr(patients[:, i], survived)
    count = countPatients(word)
    deadPatient, patient = count['deadPatient'], count['patient']
    percent = deadPatient / patient
    pearsonList.append({'word': word, 'correlación': pearson[0], 'p-valor': pearson[1],
                        'pacientes muertos': "{0:.2%}".format(percent) + " de %d" % patient})
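
Computing 5,879 correlations one column at a time, each with a round-trip to PostgreSQL, is slow. The Pearson r values alone could be obtained in one shot with numpy, as a sketch (columns with zero variance would produce a division by zero):

X = patients.astype(float)
y = np.asarray(survived, dtype=float)
Xc = X - X.mean(axis=0)  # center each word column
yc = y - y.mean()
r = (Xc.T @ yc) / (np.linalg.norm(Xc, axis=0) * np.linalg.norm(yc))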

In [26]:
pearsonList = sorted(pearsonList, key=itemgetter('correlación'), reverse=True)
df = pd.DataFrame(pearsonList)
df = df.set_index('word')
df = df.sort_values(['correlación'], ascending=[False])
df


Out[26]:
correlación p-valor pacientes muertos
word
adc 0.20889 0.00000 89.68% de 155
fdf 0.20378 0.00000 96.39% de 83
dbda 0.18907 0.00000 85.07% de 221
fbd 0.18370 0.00001 87.27% de 165
dff 0.18014 0.00001 93.18% de 88
ffd 0.17763 0.00001 91.26% de 103
fff 0.17753 0.00001 93.83% de 81
addaba 0.17329 0.00002 90.57% de 106
bdab 0.16941 0.00004 79.23% de 414
ddba 0.16922 0.00004 84.02% de 219
aadc 0.16713 0.00005 88.62% de 123
faff 0.16444 0.00006 100.00% de 43
ababaf 0.16343 0.00007 90.10% de 101
bdaad 0.16258 0.00007 85.89% de 163
fabdb 0.16236 0.00007 100.00% de 42
ddaba 0.16144 0.00008 86.81% de 144
fbf 0.16142 0.00008 90.00% de 100
aabdab 0.16106 0.00009 81.51% de 292
dgf 0.16033 0.00009 97.92% de 48
ffdd 0.16027 0.00009 100.00% de 41
addab 0.15951 0.00010 86.71% de 143
bdad 0.15852 0.00011 83.19% de 226
ajd 0.15816 0.00011 100.00% de 40
baadab 0.15671 0.00013 81.85% de 270
bdbda 0.15445 0.00017 87.02% de 131
ffh 0.15388 0.00018 100.00% de 38
babaaf 0.15367 0.00018 90.11% de 91
dabd 0.15354 0.00018 83.26% de 215
baaabd 0.15314 0.00019 81.25% de 288
bdabb 0.15309 0.00019 80.56% de 319
... ... ... ...
beaaa -0.13666 0.00087 51.35% de 37
aabea -0.13831 0.00076 50.00% de 34
aaabea -0.14041 0.00063 48.39% de 31
baeaab -0.14087 0.00060 0.00% de 4
caeabb -0.14087 0.00060 0.00% de 4
abaecb -0.14087 0.00060 0.00% de 4
aaceea -0.14087 0.00060 0.00% de 4
aaeebc -0.14087 0.00060 0.00% de 4
abecba -0.14087 0.00060 0.00% de 4
aecaab -0.14087 0.00060 0.00% de 4
baabec -0.14087 0.00060 0.00% de 4
cbaaae -0.14087 0.00060 0.00% de 4
ceaabb -0.14087 0.00060 0.00% de 4
becabb -0.14087 0.00060 0.00% de 4
eaaacb -0.14087 0.00060 0.00% de 4
eebcaa -0.14087 0.00060 0.00% de 4
eaa -0.14272 0.00051 60.78% de 102
ebaaaa -0.14275 0.00051 43.48% de 23
aaaeab -0.14884 0.00029 22.22% de 9
ababae -0.14884 0.00029 22.22% de 9
ecabb -0.14884 0.00029 22.22% de 9
eaaaa -0.14936 0.00027 55.56% de 63
eaaaaa -0.15003 0.00025 54.39% de 57
aeabc -0.15096 0.00023 14.29% de 7
aeabcb -0.15096 0.00023 14.29% de 7
eeecaa -0.15096 0.00023 14.29% de 7
beaaaa -0.15782 0.00012 45.16% de 31
eaaa -0.16104 0.00009 56.79% de 81
aabeaa -0.16281 0.00007 39.13% de 23
aaeab -0.16375 0.00006 28.57% de 14

5879 rows × 3 columns


In [ ]: