In [1]:
import sys 
import os
sys.path.append(os.path.abspath("/home/scidb/HeartRatePatterns/Python"))
from LogisticRegresion import ajustLogisticRegression
from PlotWords import plot_word
from Matrix import convert_matrix
from NMF import generateNMF,find_best_NMF

In [2]:
def findpatient(message, table, beats):
    """Print a one-line mortality summary for the patient group in `table`.

    `table` is expected to carry a two-level index whose second level encodes
    survival; `deads` sums its integer codes (level is named `isAlive` —
    NOTE(review): verify the polarity of that flag). Prints nothing for an
    empty group.
    """
    total = table.shape[0]
    dead_count = sum(table.index.labels[1].tolist())
    if total == 0:
        return
    ratio = dead_count / total
    print(message, "groupsize:%d" % total, "deads:%d" % dead_count,
          "percent:{0:.2%}".format(ratio), beats)

In [3]:
import psycopg2
def selectPearson(word, dbname="mimic"):
    """Fetch precomputed Pearson stats for `word` from the wordspearson table.

    Returns {} when the word is absent; otherwise a dict with the word, its
    patient count, the death percentage (formatted) and the stored p1 value.
    If the query matched several rows, the last one wins.
    """
    query = 'SELECT patient,deadpatient,p1 FROM wordspearson WHERE word = %s'
    conn = psycopg2.connect("dbname=" + dbname)
    cur = conn.cursor()
    cur.execute(query, (word,))
    result = {}
    for patient, deadpatient, p1 in cur:
        result = {'word': word,
                  "pacientes": patient,
                  "porcentaje de muertos": "{0:.2%}".format(deadpatient / patient),
                  "p1": p1}
    cur.close()
    conn.close()
    return result

In [4]:
def print_top_words(model, feature_names, topic_index=None):
    """Collect the positive-weight features of each topic in `model`.

    Parameters
    ----------
    model : iterable of 1-D arrays
        Topic/component weight vectors (e.g. ``nmf.components_``).
    feature_names : sequence of str
        Feature names aligned with the weight vectors.
    topic_index : int, optional
        If given, only that topic's entry is returned.

    Returns
    -------
    list of dict
        One ``{"topic": idx, "features": [{"word", "p1"}, ...]}`` per
        non-empty (and, if requested, matching) topic.

    Bug fix: the loop previously iterated the module-level global
    ``components`` instead of the ``model`` parameter; it only worked because
    the sole call site passed that same global positionally.
    """
    result = []
    for topic_idx, topic in enumerate(model):
        # argsort()[:-1] walks weights in ascending order and drops the single
        # largest feature; NOTE(review): possibly meant [::-1] (descending) —
        # confirm before changing, since downstream cells slice the head.
        features = [{"word": feature_names[i], "p1": topic[i]}
                    for i in topic.argsort()[:-1] if topic[i] > 0]
        if features != [] and (topic_index is None or topic_index == topic_idx):
            result.append({"topic": topic_idx, "features": features})
    return result

In [5]:
def addValue(key, my_dict):
    """Increment the counter stored under `key`, creating it at 1 if absent.

    Mutates and returns `my_dict`.
    """
    my_dict[key] = my_dict.get(key, 0) + 1
    return my_dict

In [6]:
# Build the patient-by-word count matrix: rows are a (subject_id, isAlive)
# MultiIndex, columns are heartbeat "words" (554 x 400 per the output below).
table = convert_matrix(with_pearson=True) #.iloc[:,:400]
# Integer codes of the second index level ("isAlive") — presumably 1 = alive;
# NOTE(review): MultiIndex.labels is deprecated in modern pandas (use .codes).
survived = table.index.labels[1].tolist()
# Raw count matrix as a numpy array, row-aligned with `survived`.
patients = table.values
# Last expression — rich display of the table.
table


(554, 400)
Out[6]:
aaadc aabadd aabaf aabafb aabaga aabbg aabbgb aabdab aabdad aabdb ... hag haj hbf hdd hddd hfa hfg hga hgd jag
subject_id isAlive
135 1 0 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
151 1 0 1 0 0 2 0 0 2 0 2 ... 0 0 0 0 0 0 0 0 0 0
177 1 0 0 0 0 0 0 0 1 0 7 ... 0 0 0 0 0 0 0 0 0 0
214 1 0 0 2 0 1 0 0 0 0 2 ... 0 0 0 0 0 0 0 0 0 0
263 1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
279 1 0 0 0 0 0 0 0 1 0 6 ... 0 0 0 0 0 0 0 0 0 0
283 1 0 1 0 0 0 0 0 2 0 2 ... 0 0 0 0 0 0 0 0 0 0
368 1 0 0 1 1 1 1 1 2 0 0 ... 0 0 0 0 0 0 0 0 0 0
377 1 1 1 2 1 3 0 0 8 2 16 ... 0 0 0 1 1 1 0 0 0 0
408 1 0 0 0 0 0 0 0 2 5 2 ... 0 0 0 0 0 0 0 0 0 0
462 0 0 8 2 1 0 0 0 14 4 15 ... 0 0 0 0 0 0 0 0 0 0
618 1 0 0 0 0 0 0 0 1 0 5 ... 0 0 0 0 0 0 0 0 0 0
638 1 2 0 2 2 1 1 0 2 0 3 ... 0 1 0 1 1 0 0 1 0 0
682 1 0 0 1 0 0 0 0 0 0 1 ... 3 2 0 1 1 7 0 3 0 2
736 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
743 1 0 0 0 0 0 0 0 2 0 6 ... 1 1 0 0 0 1 0 0 0 0
749 1 0 0 1 0 0 0 0 0 1 1 ... 0 0 0 0 0 0 0 0 0 0
793 1 2 2 2 1 3 0 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
886 1 0 2 1 0 0 0 0 1 0 0 ... 0 0 0 0 0 1 0 0 0 0
952 1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1004 1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1075 1 1 1 2 0 1 1 0 1 6 6 ... 0 0 0 0 0 0 0 0 1 0
1144 0 0 0 0 0 0 1 0 0 0 1 ... 0 0 0 1 0 0 0 1 0 0
1160 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1222 0 0 0 4 2 0 2 0 5 2 10 ... 0 0 0 1 0 0 0 0 0 0
1226 1 0 0 2 1 0 0 0 1 0 3 ... 0 0 0 0 0 0 0 0 0 0
1459 0 0 4 4 3 0 0 0 10 1 24 ... 0 0 0 0 0 0 0 0 0 0
1528 1 0 1 6 2 3 0 0 3 0 3 ... 0 0 0 0 0 0 0 0 0 0
1531 1 0 0 0 0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 0 0
1569 1 0 0 6 1 0 1 0 1 2 3 ... 1 0 0 0 0 0 0 2 0 3
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
23034 1 0 0 6 4 0 0 0 13 2 17 ... 1 0 0 0 0 0 0 0 1 0
23097 1 10 4 7 1 1 1 0 3 4 11 ... 0 0 0 1 1 0 0 0 0 0
23120 1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23130 1 0 0 1 0 0 0 0 2 0 3 ... 0 0 0 0 0 0 0 0 0 0
23178 1 0 0 3 3 0 2 1 2 0 14 ... 0 0 0 1 0 0 0 0 1 0
23200 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23298 0 0 0 0 0 0 0 0 1 0 1 ... 0 0 0 0 0 0 0 0 0 0
23336 1 0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23363 1 0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23384 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23401 1 0 10 2 0 0 0 0 8 2 8 ... 0 0 0 0 0 0 0 0 0 0
23451 1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23468 1 3 1 0 0 1 0 0 5 5 8 ... 1 0 0 0 0 0 0 0 0 0
23474 1 0 0 3 0 1 1 0 3 0 9 ... 0 0 0 0 0 0 0 0 1 0
23510 1 0 0 0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23944 1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
24004 1 0 0 4 0 0 0 0 4 0 2 ... 0 0 0 1 0 0 0 0 0 0
24076 1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
24129 1 0 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 0 1 0 0
24133 0 0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
24142 1 0 0 2 1 2 0 0 3 0 23 ... 0 0 0 0 0 0 0 0 0 0
24152 1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
24185 1 0 0 0 0 0 0 0 5 0 7 ... 0 0 0 0 0 0 0 0 0 0
24227 0 0 0 0 0 0 0 0 0 0 3 ... 0 0 0 0 0 0 0 0 0 0
25466 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41962 1 0 3 6 0 0 0 0 1 2 5 ... 2 0 0 8 2 4 2 1 4 1
42261 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
42410 0 0 0 0 0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 0 0
42492 0 0 0 2 1 1 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 0 0
43459 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

554 rows × 400 columns


In [7]:
from sklearn.model_selection import train_test_split
# Hold out 20% of patients for evaluation; fixed seed for reproducibility.
patients_train, patients_test,survived_train, survived_test = train_test_split(patients,survived,test_size=0.2, random_state=42)

In [8]:
# Fit a logistic regression on the train split and score it on the test split;
# presumably returns (fitted model, accuracy, ROC-AUC) — TODO confirm against
# LogisticRegresion.ajustLogisticRegression.
model,acurracy,logit_roc_auc = ajustLogisticRegression(patients_train,survived_train,patients_test,survived_test)

In [9]:
def countPatients(word, dbname="mimic"):
    """Count patients carrying `word` and how many of them died.

    Returns {} when the word never occurs; otherwise a dict with keys
    "patient" (total carriers) and "deadPatient" (sum of `isalive` —
    NOTE(review): the alias suggests deaths but the SQL sums `isalive`;
    verify the flag's polarity).
    """
    query = '''SELECT count(1),sum(isalive) FROM matrix m LEFT JOIN subjectwords s 
    ON m.subject_id=s.subject_id where m.word = %s GROUP BY m.word'''
    conn = psycopg2.connect("dbname=" + dbname)
    cur = conn.cursor()
    cur.execute(query, (word,))
    result = {}
    for total, dead in cur:
        result = {"patient": total, "deadPatient": dead}
    cur.close()
    conn.close()
    return result

In [10]:
from scipy.stats.stats import pearsonr
# For every word (column), correlate its counts with survival and look up the
# share of deceased patients among carriers of the word.
columns = list(table.columns.values)
pearsonList = []
for i in range(len(columns)):
    pearson = pearsonr(patients[:,i],survived)
    word = columns[i]
    # One DB round-trip per word (400 queries) — slow but simple.
    count = countPatients(word)
    deadPatient,patient = count['deadPatient'],count['patient']
    percent = deadPatient/patient
    pearsonList.append({'word':word,'correlación':pearson[0],'p-valor':pearson[1],'pacientes muertos':"{0:.2%}".format(percent)+" de %d" %patient})

In [11]:
from operator import itemgetter
# Order words by correlation with survival, strongest first.
pearsonList = sorted(pearsonList, key=itemgetter('correlación'), reverse=True)
# NOTE(review): bare expression below is a no-op (it is not the cell's last
# line, so nothing is displayed); safe to delete.
pearsonList
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.5f' % x)

# Present the ranking as a DataFrame indexed by word; the sort_values call
# repeats the ordering already applied to pearsonList above.
df = pd.DataFrame(pearsonList)
df = df.set_index('word')
df = df.sort_values(['correlación'], ascending=[False])
df


Out[11]:
correlación p-valor pacientes muertos
word
babaaf 0.14448 0.00065 90.11% de 91
babaf 0.13705 0.00122 84.00% de 150
ababaf 0.13675 0.00125 90.10% de 101
abbg 0.13498 0.00145 86.57% de 134
fbabd 0.13318 0.00168 97.44% de 39
abaf 0.13287 0.00172 81.18% de 287
dafaab 0.13162 0.00191 95.35% de 43
bababf 0.13056 0.00207 87.36% de 87
fbbaba 0.12763 0.00262 89.47% de 95
dbaaf 0.12759 0.00262 92.73% de 55
adbaf 0.12740 0.00266 90.67% de 75
bja 0.12554 0.00308 90.28% de 72
abafb 0.12473 0.00328 83.78% de 148
aabbg 0.12439 0.00336 88.51% de 87
abfba 0.12410 0.00344 83.65% de 159
fabd 0.12363 0.00356 90.29% de 103
dabc 0.12319 0.00368 90.91% de 77
abbga 0.12279 0.00380 88.10% de 84
bafba 0.12228 0.00395 83.67% de 147
dfaaba 0.12125 0.00426 95.00% de 40
bfaf 0.12117 0.00429 92.59% de 54
fafab 0.12010 0.00465 100.00% de 25
bfba 0.11952 0.00485 81.85% de 248
fbda 0.11952 0.00485 88.35% de 103
ffdaa 0.11891 0.00507 95.45% de 44
abbh 0.11862 0.00518 91.04% de 67
fabdb 0.11804 0.00541 100.00% de 42
dbaafa 0.11796 0.00544 94.87% de 39
baaag 0.11775 0.00552 85.71% de 119
hbf 0.11745 0.00564 100.00% de 24
... ... ... ...
aabadd 0.02390 0.57452 85.22% de 115
abdbd 0.02309 0.58767 86.26% de 131
bdaad 0.02280 0.59234 85.89% de 163
aacda 0.02241 0.59857 88.51% de 87
bdd 0.02081 0.62508 82.53% de 269
dcaa 0.01896 0.65609 87.30% de 126
dca 0.01591 0.70863 84.05% de 163
acabaa 0.01534 0.71856 82.58% de 178
bdbbd 0.01433 0.73647 85.47% de 117
dbabd 0.01384 0.74507 85.61% de 132
bdaab 0.01269 0.76564 79.17% de 336
bdda 0.01250 0.76905 81.69% de 213
dcd 0.01142 0.78846 88.73% de 71
bbdd 0.00983 0.81745 85.47% de 179
abdad 0.00928 0.82747 85.09% de 161
babdaa 0.00887 0.83490 81.01% de 258
ddbaa 0.00725 0.86489 82.93% de 164
bdbd 0.00577 0.89214 85.79% de 183
aabdad 0.00078 0.98546 84.87% de 119
dabd -0.00205 0.96162 83.26% de 215
ddabd -0.00319 0.94023 92.45% de 53
bddaa -0.00327 0.93873 83.54% de 164
bdaada -0.00378 0.92934 84.21% de 133
bbdbd -0.00753 0.85966 86.07% de 122
abdada -0.00925 0.82801 84.73% de 131
abdd -0.01762 0.67908 82.23% de 197
bdabd -0.01860 0.66230 85.71% de 119
bdbdb -0.02037 0.63238 84.68% de 124
dabda -0.02888 0.49761 83.24% de 173
daabd -0.03705 0.38412 83.53% de 170

400 rows × 3 columns


In [12]:
# Sweep NMF n_components (2..99 per the output below), logging pearson,
# accuracy, ROC-AUC and reconstruction error for each setting.
find_best_NMF(patients,survived)


{'pearson': 0.05376539303670632, 'diffErr': None, 'accuracy': 0.78378378378378377, 'roc_auc': 0.5, 'recostrucción error': 2162.3655318943565, 'n_components': 2}
{'pearson': 0.097055800137172007, 'diffErr': 440.15960104444525, 'accuracy': 0.78378378378378377, 'roc_auc': 0.5, 'recostrucción error': 1722.2059308499113, 'n_components': 3}
{'pearson': 0.090018400435492876, 'diffErr': 234.52210889286789, 'accuracy': 0.78378378378378377, 'roc_auc': 0.5, 'recostrucción error': 1487.6838219570434, 'n_components': 4}
{'pearson': 0.081563786972520536, 'diffErr': 214.44990071202324, 'accuracy': 0.78378378378378377, 'roc_auc': 0.5, 'recostrucción error': 1273.2339212450202, 'n_components': 5}
{'pearson': 0.079673449258838888, 'diffErr': 121.27194865084743, 'accuracy': 0.78378378378378377, 'roc_auc': 0.5, 'recostrucción error': 1151.9619725941727, 'n_components': 6}
{'pearson': 0.12737775530295065, 'diffErr': 71.658850984822038, 'accuracy': 0.78378378378378377, 'roc_auc': 0.5, 'recostrucción error': 1080.3031216093507, 'n_components': 7}
{'pearson': 0.12715761038199216, 'diffErr': 75.007581107189594, 'accuracy': 0.78378378378378377, 'roc_auc': 0.5, 'recostrucción error': 1005.2955405021611, 'n_components': 8}
{'pearson': 0.12364285814459262, 'diffErr': 78.577705555105695, 'accuracy': 0.7567567567567568, 'roc_auc': 0.48275862068965519, 'recostrucción error': 926.71783494705539, 'n_components': 9}
{'pearson': 0.1222106018457856, 'diffErr': 63.967251255186284, 'accuracy': 0.7567567567567568, 'roc_auc': 0.48275862068965519, 'recostrucción error': 862.75058369186911, 'n_components': 10}
{'pearson': 0.099045533691014515, 'diffErr': 51.181103957561277, 'accuracy': 0.7567567567567568, 'roc_auc': 0.48275862068965519, 'recostrucción error': 811.56947973430783, 'n_components': 11}
{'pearson': 0.099084258886736207, 'diffErr': 36.475940264779297, 'accuracy': 0.7567567567567568, 'roc_auc': 0.48275862068965519, 'recostrucción error': 775.09353946952854, 'n_components': 12}
{'pearson': 0.10246098678989352, 'diffErr': 27.017598185873226, 'accuracy': 0.7567567567567568, 'roc_auc': 0.49784482758620685, 'recostrucción error': 748.07594128365531, 'n_components': 13}
{'pearson': 0.10884052466783017, 'diffErr': 38.138504964553135, 'accuracy': 0.78378378378378377, 'roc_auc': 0.5, 'recostrucción error': 709.93743631910218, 'n_components': 14}
{'pearson': 0.090041511365150456, 'diffErr': 30.003798853342801, 'accuracy': 0.76576576576576572, 'roc_auc': 0.4885057471264368, 'recostrucción error': 679.93363746575938, 'n_components': 15}
{'pearson': 0.10204645411700471, 'diffErr': 26.314215507768722, 'accuracy': 0.7567567567567568, 'roc_auc': 0.48275862068965519, 'recostrucción error': 653.61942195799065, 'n_components': 16}
{'pearson': 0.10124549238778958, 'diffErr': 23.896084496502112, 'accuracy': 0.7567567567567568, 'roc_auc': 0.48275862068965519, 'recostrucción error': 629.72333746148854, 'n_components': 17}
{'pearson': 0.11822625137185924, 'diffErr': 22.544328158030567, 'accuracy': 0.74774774774774777, 'roc_auc': 0.47701149425287354, 'recostrucción error': 607.17900930345797, 'n_components': 18}
{'pearson': 0.11335065884627309, 'diffErr': 24.670450717502149, 'accuracy': 0.7927927927927928, 'roc_auc': 0.53591954022988508, 'recostrucción error': 582.50855858595583, 'n_components': 19}
{'pearson': 0.11115230437067979, 'diffErr': 29.813402608102365, 'accuracy': 0.7927927927927928, 'roc_auc': 0.53591954022988508, 'recostrucción error': 552.69515597785346, 'n_components': 20}
{'pearson': 0.10890175755782711, 'diffErr': 20.929019423761247, 'accuracy': 0.74774774774774777, 'roc_auc': 0.47701149425287354, 'recostrucción error': 531.76613655409221, 'n_components': 21}
{'pearson': 0.10864733451773047, 'diffErr': 17.838291790875701, 'accuracy': 0.7567567567567568, 'roc_auc': 0.48275862068965519, 'recostrucción error': 513.92784476321651, 'n_components': 22}
{'pearson': 0.10430678586646505, 'diffErr': 21.372749984657048, 'accuracy': 0.7567567567567568, 'roc_auc': 0.48275862068965519, 'recostrucción error': 492.55509477855946, 'n_components': 23}
{'pearson': 0.13529780606043829, 'diffErr': 17.744502982209156, 'accuracy': 0.7567567567567568, 'roc_auc': 0.48275862068965519, 'recostrucción error': 474.81059179635031, 'n_components': 24}
{'pearson': 0.13597996122133038, 'diffErr': 12.156754929258341, 'accuracy': 0.7567567567567568, 'roc_auc': 0.48275862068965519, 'recostrucción error': 462.65383686709197, 'n_components': 25}
{'pearson': 0.13625247830829873, 'diffErr': 10.947475664333069, 'accuracy': 0.7567567567567568, 'roc_auc': 0.48275862068965519, 'recostrucción error': 451.7063612027589, 'n_components': 26}
{'pearson': 0.1356389980344033, 'diffErr': 13.977371239191598, 'accuracy': 0.7567567567567568, 'roc_auc': 0.48275862068965519, 'recostrucción error': 437.7289899635673, 'n_components': 27}
{'pearson': 0.13113745749974789, 'diffErr': 12.321043593860225, 'accuracy': 0.7567567567567568, 'roc_auc': 0.48275862068965519, 'recostrucción error': 425.40794636970708, 'n_components': 28}
{'pearson': 0.13434291529512474, 'diffErr': 11.646934011812561, 'accuracy': 0.76576576576576572, 'roc_auc': 0.4885057471264368, 'recostrucción error': 413.76101235789451, 'n_components': 29}
{'pearson': 0.1332552104002945, 'diffErr': 8.8676138972157901, 'accuracy': 0.76576576576576572, 'roc_auc': 0.4885057471264368, 'recostrucción error': 404.89339846067872, 'n_components': 30}
{'pearson': 0.13310692291439358, 'diffErr': 11.33595210415649, 'accuracy': 0.7567567567567568, 'roc_auc': 0.47701149425287354, 'recostrucción error': 393.55744635652223, 'n_components': 31}
{'pearson': 0.12279439837122308, 'diffErr': 6.2225822373068809, 'accuracy': 0.7567567567567568, 'roc_auc': 0.48275862068965519, 'recostrucción error': 387.33486411921535, 'n_components': 32}
{'pearson': 0.12582738312735586, 'diffErr': 7.5661593329539869, 'accuracy': 0.76576576576576572, 'roc_auc': 0.49784482758620685, 'recostrucción error': 379.76870478626137, 'n_components': 33}
{'pearson': 0.1233475452935988, 'diffErr': 9.8408870973310059, 'accuracy': 0.76576576576576572, 'roc_auc': 0.4885057471264368, 'recostrucción error': 369.92781768893036, 'n_components': 34}
{'pearson': 0.12359598837132334, 'diffErr': -0.36642654632964877, 'accuracy': 0.74774774774774777, 'roc_auc': 0.47701149425287354, 'recostrucción error': 370.29424423526001, 'n_components': 35}
{'pearson': 0.13238134762429354, 'diffErr': 15.520605249025209, 'accuracy': 0.7567567567567568, 'roc_auc': 0.48275862068965519, 'recostrucción error': 354.7736389862348, 'n_components': 36}
{'pearson': 0.13145715342181577, 'diffErr': 0.92703371833732717, 'accuracy': 0.74774774774774777, 'roc_auc': 0.47701149425287354, 'recostrucción error': 353.84660526789747, 'n_components': 37}
{'pearson': 0.1234834973805936, 'diffErr': 3.9009479982794915, 'accuracy': 0.7567567567567568, 'roc_auc': 0.49784482758620685, 'recostrucción error': 349.94565726961798, 'n_components': 38}
{'pearson': 0.12034045811701181, 'diffErr': 15.409030714449443, 'accuracy': 0.77477477477477474, 'roc_auc': 0.4942528735632184, 'recostrucción error': 334.53662655516854, 'n_components': 39}
{'pearson': 0.12768512333789622, 'diffErr': 5.8330295080371002, 'accuracy': 0.76576576576576572, 'roc_auc': 0.50359195402298851, 'recostrucción error': 328.70359704713144, 'n_components': 40}
{'pearson': 0.12818467028018282, 'diffErr': 4.422483919765682, 'accuracy': 0.7567567567567568, 'roc_auc': 0.47701149425287354, 'recostrucción error': 324.28111312736576, 'n_components': 41}
{'pearson': 0.12768209391396318, 'diffErr': 5.4266172427953734, 'accuracy': 0.76576576576576572, 'roc_auc': 0.4885057471264368, 'recostrucción error': 318.85449588457038, 'n_components': 42}
{'pearson': 0.12649395324992271, 'diffErr': 5.6268662973392907, 'accuracy': 0.7567567567567568, 'roc_auc': 0.47701149425287354, 'recostrucción error': 313.22762958723109, 'n_components': 43}
{'pearson': 0.12301791179285991, 'diffErr': 2.3299603924428993, 'accuracy': 0.78378378378378377, 'roc_auc': 0.5, 'recostrucción error': 310.89766919478819, 'n_components': 44}
{'pearson': 0.12481104557606498, 'diffErr': 6.6167445011430459, 'accuracy': 0.73873873873873874, 'roc_auc': 0.47126436781609193, 'recostrucción error': 304.28092469364515, 'n_components': 45}
{'pearson': 0.11563240238329764, 'diffErr': 3.3893473240239018, 'accuracy': 0.74774774774774777, 'roc_auc': 0.47701149425287354, 'recostrucción error': 300.89157736962125, 'n_components': 46}
{'pearson': 0.13093683979132412, 'diffErr': 3.2676572390707292, 'accuracy': 0.78378378378378377, 'roc_auc': 0.5, 'recostrucción error': 297.62392013055052, 'n_components': 47}
{'pearson': 0.11123601632780356, 'diffErr': 0.59834155080989149, 'accuracy': 0.77477477477477474, 'roc_auc': 0.4942528735632184, 'recostrucción error': 297.02557857974062, 'n_components': 48}
{'pearson': 0.10898602879795428, 'diffErr': 6.677871572346362, 'accuracy': 0.72972972972972971, 'roc_auc': 0.46551724137931033, 'recostrucción error': 290.34770700739426, 'n_components': 49}
{'pearson': 0.11789478519734453, 'diffErr': 5.2444714106479182, 'accuracy': 0.74774774774774777, 'roc_auc': 0.47701149425287354, 'recostrucción error': 285.10323559674634, 'n_components': 50}
{'pearson': 0.11428020761902029, 'diffErr': 1.8280798713487343, 'accuracy': 0.78378378378378377, 'roc_auc': 0.5, 'recostrucción error': 283.27515572539761, 'n_components': 51}
{'pearson': 0.10895149035850181, 'diffErr': 2.7829809607944753, 'accuracy': 0.77477477477477474, 'roc_auc': 0.50359195402298851, 'recostrucción error': 280.49217476460313, 'n_components': 52}
{'pearson': 0.11072744402286747, 'diffErr': 4.7081903106206937, 'accuracy': 0.78378378378378377, 'roc_auc': 0.5, 'recostrucción error': 275.78398445398244, 'n_components': 53}
{'pearson': 0.11043090884173715, 'diffErr': 3.5916345080893279, 'accuracy': 0.7567567567567568, 'roc_auc': 0.47701149425287354, 'recostrucción error': 272.19234994589311, 'n_components': 54}
{'pearson': 0.10860303152906108, 'diffErr': 1.9462277631640177, 'accuracy': 0.78378378378378377, 'roc_auc': 0.5, 'recostrucción error': 270.2461221827291, 'n_components': 55}
{'pearson': 0.11042438194395338, 'diffErr': 2.2089893875227062, 'accuracy': 0.7567567567567568, 'roc_auc': 0.47701149425287354, 'recostrucción error': 268.03713279520639, 'n_components': 56}
{'pearson': 0.11056222019083801, 'diffErr': 4.2536832908987208, 'accuracy': 0.77477477477477474, 'roc_auc': 0.50933908045977017, 'recostrucción error': 263.78344950430767, 'n_components': 57}
{'pearson': 0.10855009252091609, 'diffErr': 4.4530352118738961, 'accuracy': 0.7567567567567568, 'roc_auc': 0.49784482758620685, 'recostrucción error': 259.33041429243377, 'n_components': 58}
{'pearson': 0.11413790365953326, 'diffErr': 0.063218539682168284, 'accuracy': 0.74774774774774777, 'roc_auc': 0.47126436781609193, 'recostrucción error': 259.2671957527516, 'n_components': 59}
{'pearson': 0.10783599536869071, 'diffErr': 3.5946200138592985, 'accuracy': 0.77477477477477474, 'roc_auc': 0.4942528735632184, 'recostrucción error': 255.67257573889231, 'n_components': 60}
{'pearson': 0.10786327294560029, 'diffErr': 6.1234477075406915, 'accuracy': 0.7567567567567568, 'roc_auc': 0.48275862068965519, 'recostrucción error': 249.54912803135161, 'n_components': 61}
{'pearson': 0.11153178798310334, 'diffErr': 3.4407679892554199, 'accuracy': 0.76576576576576572, 'roc_auc': 0.4885057471264368, 'recostrucción error': 246.10836004209619, 'n_components': 62}
{'pearson': 0.10929600918187392, 'diffErr': 1.3775251947712093, 'accuracy': 0.76576576576576572, 'roc_auc': 0.4885057471264368, 'recostrucción error': 244.73083484732499, 'n_components': 63}
{'pearson': 0.10800171192847335, 'diffErr': 4.7805478497244849, 'accuracy': 0.7567567567567568, 'roc_auc': 0.48275862068965519, 'recostrucción error': 239.9502869976005, 'n_components': 64}
{'pearson': 0.10919347393273963, 'diffErr': 0.57240736672630987, 'accuracy': 0.78378378378378377, 'roc_auc': 0.5, 'recostrucción error': 239.37787963087419, 'n_components': 65}
{'pearson': 0.11022478726297691, 'diffErr': 5.3297802997604151, 'accuracy': 0.74774774774774777, 'roc_auc': 0.47701149425287354, 'recostrucción error': 234.04809933111378, 'n_components': 66}
{'pearson': 0.10749507012292835, 'diffErr': 1.6786866891908971, 'accuracy': 0.73873873873873874, 'roc_auc': 0.46551724137931033, 'recostrucción error': 232.36941264192288, 'n_components': 67}
{'pearson': 0.10774247111437618, 'diffErr': 2.6082760488228303, 'accuracy': 0.74774774774774777, 'roc_auc': 0.47701149425287354, 'recostrucción error': 229.76113659310005, 'n_components': 68}
{'pearson': 0.10819457132593975, 'diffErr': 0.52110169060102862, 'accuracy': 0.73873873873873874, 'roc_auc': 0.47126436781609193, 'recostrucción error': 229.24003490249902, 'n_components': 69}
{'pearson': 0.10840061030599936, 'diffErr': 3.6988226741294739, 'accuracy': 0.74774774774774777, 'roc_auc': 0.49209770114942525, 'recostrucción error': 225.54121222836955, 'n_components': 70}
{'pearson': 0.10840795392254648, 'diffErr': 2.3544472234675027, 'accuracy': 0.77477477477477474, 'roc_auc': 0.4942528735632184, 'recostrucción error': 223.18676500490204, 'n_components': 71}
{'pearson': 0.1338026592380685, 'diffErr': 1.2494485104288913, 'accuracy': 0.78378378378378377, 'roc_auc': 0.5, 'recostrucción error': 221.93731649447315, 'n_components': 72}
{'pearson': 0.12679120690152884, 'diffErr': 2.5573378511593887, 'accuracy': 0.77477477477477474, 'roc_auc': 0.4942528735632184, 'recostrucción error': 219.37997864331376, 'n_components': 73}
{'pearson': 0.11008912759619609, 'diffErr': 3.8664836136147755, 'accuracy': 0.78378378378378377, 'roc_auc': 0.5, 'recostrucción error': 215.51349502969899, 'n_components': 74}
{'pearson': 0.12422596379221923, 'diffErr': 1.6610409430571451, 'accuracy': 0.78378378378378377, 'roc_auc': 0.5, 'recostrucción error': 213.85245408664184, 'n_components': 75}
{'pearson': 0.1156094341872473, 'diffErr': 1.6181542621540359, 'accuracy': 0.72972972972972971, 'roc_auc': 0.45977011494252873, 'recostrucción error': 212.23429982448781, 'n_components': 76}
{'pearson': 0.12336383109630233, 'diffErr': 1.825455947382153, 'accuracy': 0.76576576576576572, 'roc_auc': 0.4885057471264368, 'recostrucción error': 210.40884387710565, 'n_components': 77}
{'pearson': 0.13002024274834093, 'diffErr': 1.32705524711821, 'accuracy': 0.76576576576576572, 'roc_auc': 0.4885057471264368, 'recostrucción error': 209.08178862998744, 'n_components': 78}
{'pearson': 0.12034040339050936, 'diffErr': 3.1098238507635472, 'accuracy': 0.78378378378378377, 'roc_auc': 0.5, 'recostrucción error': 205.9719647792239, 'n_components': 79}
{'pearson': 0.13080602724505158, 'diffErr': 2.3670574227451198, 'accuracy': 0.78378378378378377, 'roc_auc': 0.5, 'recostrucción error': 203.60490735647878, 'n_components': 80}
{'pearson': 0.12931131892153361, 'diffErr': -3.0165629784734165, 'accuracy': 0.7567567567567568, 'roc_auc': 0.48275862068965519, 'recostrucción error': 206.62147033495219, 'n_components': 81}
{'pearson': 0.1288994255794626, 'diffErr': 4.6211931070489811, 'accuracy': 0.7567567567567568, 'roc_auc': 0.48275862068965519, 'recostrucción error': 202.00027722790321, 'n_components': 82}
{'pearson': 0.13298009496205385, 'diffErr': 3.4761567451215285, 'accuracy': 0.78378378378378377, 'roc_auc': 0.5, 'recostrucción error': 198.52412048278168, 'n_components': 83}
{'pearson': 0.12550266347444977, 'diffErr': -1.6362610138038463, 'accuracy': 0.76576576576576572, 'roc_auc': 0.4885057471264368, 'recostrucción error': 200.16038149658553, 'n_components': 84}
{'pearson': 0.12097303851167147, 'diffErr': 4.7402196763711686, 'accuracy': 0.73873873873873874, 'roc_auc': 0.47126436781609193, 'recostrucción error': 195.42016182021436, 'n_components': 85}
{'pearson': 0.12429863387713905, 'diffErr': -1.4432919584201329, 'accuracy': 0.78378378378378377, 'roc_auc': 0.5, 'recostrucción error': 196.86345377863449, 'n_components': 86}
{'pearson': 0.12008172124865137, 'diffErr': -0.64783866287589831, 'accuracy': 0.77477477477477474, 'roc_auc': 0.4942528735632184, 'recostrucción error': 197.51129244151039, 'n_components': 87}
{'pearson': 0.13189348624965153, 'diffErr': 6.3788565193314639, 'accuracy': 0.68468468468468469, 'roc_auc': 0.45186781609195403, 'recostrucción error': 191.13243592217893, 'n_components': 88}
{'pearson': 0.13297653068435561, 'diffErr': -2.0232294549037988, 'accuracy': 0.7567567567567568, 'roc_auc': 0.49784482758620685, 'recostrucción error': 193.15566537708273, 'n_components': 89}
{'pearson': 0.12386758796756281, 'diffErr': 3.6598432113309798, 'accuracy': 0.77477477477477474, 'roc_auc': 0.4942528735632184, 'recostrucción error': 189.49582216575175, 'n_components': 90}
{'pearson': 0.12013908694364021, 'diffErr': 2.6203760898033295, 'accuracy': 0.69369369369369371, 'roc_auc': 0.44252873563218392, 'recostrucción error': 186.87544607594842, 'n_components': 91}
{'pearson': 0.12441114724116407, 'diffErr': 0.42183184664102669, 'accuracy': 0.76576576576576572, 'roc_auc': 0.4885057471264368, 'recostrucción error': 186.45361422930739, 'n_components': 92}
{'pearson': 0.12481007672869607, 'diffErr': 3.1222785155133295, 'accuracy': 0.76576576576576572, 'roc_auc': 0.4885057471264368, 'recostrucción error': 183.33133571379406, 'n_components': 93}
{'pearson': 0.12991094277381543, 'diffErr': -1.5210975011825667, 'accuracy': 0.7567567567567568, 'roc_auc': 0.48275862068965519, 'recostrucción error': 184.85243321497663, 'n_components': 94}
{'pearson': 0.12457430516869321, 'diffErr': 3.5220662426741569, 'accuracy': 0.78378378378378377, 'roc_auc': 0.5, 'recostrucción error': 181.33036697230247, 'n_components': 95}
{'pearson': 0.12360940563989327, 'diffErr': 1.0244246568673532, 'accuracy': 0.77477477477477474, 'roc_auc': 0.4942528735632184, 'recostrucción error': 180.30594231543512, 'n_components': 96}
{'pearson': 0.12092079275121478, 'diffErr': 2.0728136977318172, 'accuracy': 0.77477477477477474, 'roc_auc': 0.50933908045977017, 'recostrucción error': 178.2331286177033, 'n_components': 97}
{'pearson': 0.11850784026965402, 'diffErr': 1.3070625365479884, 'accuracy': 0.78378378378378377, 'roc_auc': 0.5, 'recostrucción error': 176.92606608115531, 'n_components': 98}
{'pearson': 0.12562156413760861, 'diffErr': 0.68605270609234026, 'accuracy': 0.7567567567567568, 'roc_auc': 0.48275862068965519, 'recostrucción error': 176.24001337506297, 'n_components': 99}

In [13]:
import numpy as np
# Chosen component count; NOTE(review): 12 does not maximize any single metric
# in the sweep above — presumably a trade-off choice. TODO confirm rationale.
bestScore = 12
model, nmf, patients_nmf, acurracy, roc_auc = generateNMF(patients, survived, n_components=bestScore)
# components_ is (n_components, n_words); transpose to (n_words, n_components)
# so each word becomes one sample in component space for the classifier.
compoT = np.transpose(nmf.components_)
print("components",nmf.components_.shape)
print("components Transpose",compoT.shape)


components (12, 400)
components Transpose (400, 12)

In [14]:
from operator import itemgetter
def predict_proba(compoT, table, model):
    """Score every word with the classifier's death probability and join the
    stored Pearson stats.

    Parameters
    ----------
    compoT : array, shape (n_words, n_components)
        Each word's NMF component-space representation.
    table : DataFrame
        Source of the word (column) names, aligned with `compoT` rows.
    model : fitted classifier with ``predict_proba``.

    Returns
    -------
    list of dict sorted by stored correlation ('correlación'), descending.
    Also prints a histogram of probability buckets (>0.99/0.90/0.80/0.70/under).
    """
    predicts = model.predict_proba(compoT)[:, 1]
    beats = list(table)
    sortedBeats = []
    numPred = len(predicts)
    print(len(beats), numPred)
    nvals = {}
    for i in range(0, numPred):
        word = beats[i]
        sel = selectPearson(word)
        if sel == {}:
            # Bug fix: previously fell through after the warning and raised
            # KeyError on sel['pacientes']; skip unknown words instead.
            print("palabra no encontrada en pearson:", word)
            continue
        sortedBeats.append({'word': beats[i], 'predict': predicts[i], 'pacientes': sel['pacientes'],
                            'porcentaje de muertos': sel['porcentaje de muertos'], "correlación": sel['p1']})
        if predicts[i] > .99: addValue("99", nvals)
        elif predicts[i] > .90: addValue("90", nvals)
        elif predicts[i] > .80: addValue("80", nvals)
        elif predicts[i] > .70: addValue("70", nvals)
        else: addValue("under", nvals)
    print(nvals)
    return sorted(sortedBeats, key=itemgetter('correlación'), reverse=True)

In [15]:
# Rank all 400 words by stored correlation, keeping each word's predicted
# death probability; beatKeys is the ordered list of word names.
sortedBeats = predict_proba(compoT,table,model)
beatKeys = []
for value in sortedBeats:
    beatKeys.append(value['word'])


400 400
{'80': 70, '70': 110, '90': 88, 'under': 67, '99': 65}

In [16]:
# Tabulate the ranked words: correlation, patient count, death percentage and
# model-predicted death probability for each word.
df = pd.DataFrame(sortedBeats)
df = df.set_index('word')
df = df.sort_values(['correlación'], ascending=[False])
df


Out[16]:
correlación pacientes porcentaje de muertos predict
word
adc 0.20889 155 89.68% 0.99887
fdf 0.20378 83 96.39% 0.99931
dbda 0.18907 221 85.07% 0.59133
fbd 0.18370 165 87.27% 0.86346
dff 0.18014 88 93.18% 0.99897
ffd 0.17763 103 91.26% 0.99937
fff 0.17753 81 93.83% 0.99999
bdbd 0.17495 183 85.79% 0.02946
addaba 0.17329 106 90.57% 0.88175
faf 0.17169 154 87.01% 1.00000
dfdd 0.17163 60 96.67% 0.99951
afg 0.17134 105 90.48% 0.99999
haf 0.17118 66 95.45% 0.98105
bdd 0.17036 269 82.53% 0.00000
bdab 0.16941 414 79.23% 0.98765
ddba 0.16922 219 84.02% 0.99999
fabd 0.16740 103 90.29% 0.76081
bbdd 0.16738 179 85.47% 0.05326
gga 0.16714 70 94.29% 0.99989
aadc 0.16713 123 88.62% 0.98968
dbbd 0.16695 192 84.90% 0.01009
gah 0.16649 44 100.00% 0.96022
daf 0.16616 230 83.48% 1.00000
afd 0.16499 221 83.71% 0.99997
faff 0.16444 43 100.00% 0.98875
adaf 0.16388 186 84.95% 0.99992
ababaf 0.16343 101 90.10% 0.72496
adca 0.16318 106 89.62% 0.99019
agg 0.16298 62 95.16% 0.99994
bdaad 0.16258 163 85.89% 0.66851
... ... ... ... ...
bdbaad 0.12117 76 88.16% 0.65504
dbbbad 0.12117 76 88.16% 0.74107
addba 0.12114 164 82.93% 0.99819
ddbaa 0.12114 164 82.93% 0.90549
adaac 0.12110 81 87.65% 0.85661
aabaga 0.12110 81 87.65% 0.72104
daad 0.12106 307 79.48% 0.95275
ddfddd 0.12077 24 100.00% 0.92977
hbf 0.12077 24 100.00% 0.69919
fgag 0.12077 24 100.00% 0.89833
faffa 0.12077 24 100.00% 0.92578
dgadaa 0.12077 24 100.00% 0.73167
hddd 0.12077 24 100.00% 0.94726
dafga 0.12077 24 100.00% 0.76656
adgada 0.12077 24 100.00% 0.74002
aadgad 0.12077 24 100.00% 0.73676
aadadg 0.12077 24 100.00% 0.70743
afdadd 0.12077 24 100.00% 0.73075
ddd 0.12074 223 81.17% 1.00000
dgada 0.12070 31 96.77% 0.79627
jag 0.12070 31 96.77% 0.90347
gaadd 0.12070 31 96.77% 0.73365
fdff 0.12070 31 96.77% 0.88115
baabh 0.12070 31 96.77% 0.68832
dddaf 0.12070 31 96.77% 0.78696
bdabc 0.12070 31 96.77% 0.69108
agha 0.12070 31 96.77% 0.87282
aabbgb 0.12070 31 96.77% 0.68132
aabdad 0.12057 119 84.87% 0.70271
baabd 0.12056 326 79.14% 0.96409

400 rows × 4 columns


In [17]:
print("Top 5 HeartBeats con mayor probabilidad de muerte según la regresión logística")
# NOTE(review): sortedBeats is ordered by 'correlación', not by predicted
# probability, so these are the five most-correlated words despite the title
# saying "mayor probabilidad" — confirm intended ordering.
plot_word(sortedBeats[:5])


Top 5 HeartBeats con mayor probabilidad de muerte según la regresión logística

In [18]:
from operator import itemgetter
from scipy.stats.stats import pearsonr
import pandas as pd
# Show floats with 5 decimals in DataFrame output.
pd.set_option('display.float_format', lambda x: '%.5f' % x)

def find_pearson(value, patient, survived):
    """Correlate each of the first `value` columns of `patient` with survival.

    Returns a list of dicts ({'grupo', 'correlación', 'p-valor'}) sorted by
    correlation, highest first.
    """
    stats = []
    for group in range(value):
        corr, pval = pearsonr(patient[:, group], survived)
        stats.append({'grupo': group, 'correlación': corr, 'p-valor': pval})
    stats.sort(key=itemgetter('correlación'), reverse=True)
    return stats

# Correlate each of the 12 NMF group activations with survival and rank them.
sortedList = find_pearson(bestScore,patients_nmf,survived)
#sortedList = sorted(pearsonList, key=itemgetter('p1'), reverse=True)
df = pd.DataFrame(sortedList)
df = df.set_index('grupo')
# Redundant: sortedList is already ordered by correlation descending.
df = df.sort_values(['correlación'], ascending=[False])
df


Out[18]:
correlación p-valor
grupo
4 0.09908 0.01967
6 0.09041 0.03337
3 0.08380 0.04868
11 0.07882 0.06374
2 0.05747 0.17680
5 0.05044 0.23595
0 0.04251 0.31791
7 0.03834 0.36779
9 0.03613 0.39604
10 0.02537 0.55127
1 0.01667 0.69548
8 -0.02997 0.48152

In [19]:
# Extract the word features of the topic most correlated with survival.
# NOTE(review): print_top_words iterates the module-level `components`
# variable, so the assignment below must keep that exact name.
columns = list(table)
components = nmf.components_
best_topic = sortedList[0]['grupo']
topword = print_top_words(components, columns, topic_index=best_topic)[0]['features']
# Keep only the words that carry positive weight in this component.
subwords = [feature['word'] for feature in topword if feature['p1'] > 0]
print(str(subwords[:10]))


['fffa', 'hafa', 'agf', 'dabadb', 'fdfa', 'hfa', 'aaffd', 'ffda', 'fgh', 'gaj']

In [20]:
# Rebuild the patient/word matrix restricted to the words selected from the
# best NMF topic (no value summing — binary presence flags, per the 0/1
# output below).
table = convert_matrix(sumvals=False,filter_words=tuple(subwords))
# Second index level is isAlive; its codes give the survival labels.
# NOTE(review): MultiIndex.labels was removed in modern pandas (use .codes)
# — confirm the pinned pandas version supports it.
survived = table.index.labels[1].tolist()
patients = table.values
# Last expression: rich display of the filtered matrix.
table


(553, 276)
Out[20]:
aabadd aabaf aabafb aabaga aabbg aabbgb aabdab aabdad aabdb aabdbb ... gha haf hafa hag haj hbf hfa hfg hga hgd
subject_id isAlive
135 1 0 0 0 0 0 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
151 1 1 0 0 1 0 0 1 0 1 1 ... 0 0 0 0 0 0 0 0 0 0
177 1 0 0 0 0 0 0 1 0 1 1 ... 0 0 0 0 0 0 0 0 0 0
214 1 0 1 0 1 0 0 0 0 1 1 ... 0 0 0 0 0 0 0 0 0 0
263 1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
279 1 0 0 0 0 0 0 1 0 1 1 ... 0 0 0 0 0 0 0 0 0 0
283 1 1 0 0 0 0 0 1 0 1 1 ... 0 0 0 0 0 0 0 0 0 0
368 1 0 1 1 1 1 1 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
377 1 1 1 1 1 0 0 1 1 1 1 ... 0 1 1 0 0 0 1 0 0 0
408 1 0 0 0 0 0 0 1 1 1 1 ... 0 0 0 0 0 0 0 0 0 0
462 0 1 1 1 0 0 0 1 1 1 1 ... 0 0 0 0 0 0 0 0 0 0
618 1 0 0 0 0 0 0 1 0 1 1 ... 0 0 0 0 0 0 0 0 0 0
638 1 0 1 1 1 1 0 1 0 1 1 ... 0 0 0 0 1 0 0 0 1 0
682 1 0 1 0 0 0 0 0 0 1 0 ... 1 1 1 1 1 0 1 0 1 0
736 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
743 1 0 0 0 0 0 0 1 0 1 1 ... 0 1 0 1 1 0 1 0 0 0
749 1 0 1 0 0 0 0 0 1 1 1 ... 0 0 0 0 0 0 0 0 0 0
793 1 1 1 1 1 0 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
886 1 1 1 0 0 0 0 1 0 0 0 ... 1 1 1 0 0 0 1 0 0 0
1004 1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1075 1 1 1 0 1 1 0 1 1 1 1 ... 0 0 0 0 0 0 0 0 0 1
1144 0 0 0 0 0 1 0 0 0 1 0 ... 1 0 0 0 0 0 0 0 1 0
1160 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1222 0 0 1 1 0 1 0 1 1 1 1 ... 0 0 0 0 0 0 0 0 0 0
1226 1 0 1 1 0 0 0 1 0 1 1 ... 0 0 0 0 0 0 0 0 0 0
1459 0 1 1 1 0 0 0 1 1 1 1 ... 0 0 0 0 0 0 0 0 0 0
1528 1 1 1 1 1 0 0 1 0 1 1 ... 1 0 0 0 0 0 0 0 0 0
1531 1 0 0 0 0 0 0 0 0 1 0 ... 0 1 1 0 0 0 0 0 0 0
1569 1 0 1 1 0 1 0 1 1 1 1 ... 1 1 0 1 0 0 0 0 1 0
1924 1 0 1 1 0 0 0 1 0 1 1 ... 1 0 0 1 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
23034 1 0 1 1 0 0 0 1 1 1 1 ... 1 1 1 1 0 0 0 0 0 1
23097 1 1 1 1 1 1 0 1 1 1 1 ... 0 0 0 0 0 0 0 0 0 0
23120 1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23130 1 0 1 0 0 0 0 1 0 1 1 ... 0 0 0 0 0 0 0 0 0 0
23178 1 0 1 1 0 1 1 1 0 1 1 ... 0 0 0 0 0 0 0 0 0 1
23200 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23298 0 0 0 0 0 0 0 1 0 1 1 ... 0 0 0 0 0 0 0 0 0 0
23336 1 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23363 1 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23384 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23401 1 1 1 0 0 0 0 1 1 1 1 ... 0 0 0 0 0 0 0 0 0 0
23451 1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23468 1 1 0 0 1 0 0 1 1 1 1 ... 1 0 0 1 0 0 0 0 0 0
23474 1 0 1 0 1 1 0 1 0 1 1 ... 0 0 0 0 0 0 0 0 0 1
23510 1 0 0 0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23944 1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
24004 1 0 1 0 0 0 0 1 0 1 0 ... 0 0 0 0 0 0 0 0 0 0
24076 1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
24129 1 0 0 0 0 0 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
24133 0 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
24142 1 0 1 1 1 0 0 1 0 1 1 ... 0 0 0 0 0 0 0 0 0 0
24152 1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
24185 1 0 0 0 0 0 0 1 0 1 1 ... 0 0 0 0 0 0 0 0 0 0
24227 0 0 0 0 0 0 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 0
25466 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41962 1 1 1 0 0 0 0 1 1 1 0 ... 1 1 1 1 0 0 1 1 1 1
42261 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
42410 0 0 0 0 0 0 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 0
42492 0 0 1 1 1 0 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 0
43459 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

553 rows × 276 columns


In [21]:
# 80/20 train/test split with a fixed seed for reproducibility.
# NOTE(review): train_test_split (sklearn) must be imported in an earlier
# cell — not visible here; confirm the import exists.
patients_train, patients_test,survived_train, survived_test = train_test_split(patients,survived,test_size=0.2, random_state=42)
# Sanity check: the split shapes should add up to the full table.
print(table.shape)
print(patients_train.shape)
print(patients_test.shape)


(553, 276)
(442, 276)
(111, 276)

In [22]:
# Fit and evaluate the logistic regression (project helper from
# LogisticRegresion). 'acurracy' [sic] is kept as-is: later cells may
# reference this binding name.
model, acurracy, roc_auc = ajustLogisticRegression(patients_train,survived_train,patients_test,survived_test)

In [23]:
# For every word column: Pearson correlation against survival plus the
# mortality rate among patients exhibiting that word.
# NOTE(review): countPatients is defined in an earlier, unseen cell —
# presumably it queries the database for per-word patient counts; verify.
columns = list(table.columns.values)
pearsonList = []
for idx, word in enumerate(columns):
    corr, p_value = pearsonr(patients[:, idx], survived)
    counts = countPatients(word)
    dead, total = counts['deadPatient'], counts['patient']
    mortality = dead / total
    pearsonList.append({
        'word': word,
        'correlación': corr,
        'p-valor': p_value,
        'pacientes muertos': "{0:.2%}".format(mortality) + " de %d" % total,
    })

In [24]:
# Rank all words by their correlation with survival, strongest first.
df = (
    pd.DataFrame(pearsonList)
    .set_index('word')
    .sort_values(['correlación'], ascending=[False])
)
# Last expression: rich display of the ranked words.
df


Out[24]:
correlación p-valor pacientes muertos
word
adc 0.20680 0.00000 89.68% de 155
fdf 0.20472 0.00000 96.39% de 83
dbda 0.18304 0.00001 85.07% de 221
fbd 0.17969 0.00002 87.27% de 165
dff 0.17956 0.00002 93.18% de 88
fff 0.17720 0.00003 93.83% de 81
ffd 0.17615 0.00003 91.26% de 103
addaba 0.17142 0.00005 90.57% de 106
haf 0.17138 0.00005 95.45% de 66
bdbd 0.16961 0.00006 85.79% de 183
fabd 0.16536 0.00009 90.29% de 103
fabdb 0.16373 0.00011 100.00% de 42
ddba 0.16192 0.00013 84.02% de 219
bbdd 0.16174 0.00013 85.47% de 179
ffdd 0.16161 0.00014 100.00% de 41
ababaf 0.16127 0.00014 90.10% de 101
dgf 0.16115 0.00014 97.92% de 48
bdd 0.16093 0.00014 82.53% de 269
dbbd 0.16070 0.00015 84.90% de 192
ajd 0.15947 0.00017 100.00% de 40
fbf 0.15922 0.00017 90.00% de 100
daf 0.15816 0.00019 83.48% de 230
bdaad 0.15737 0.00020 85.89% de 163
afd 0.15732 0.00020 83.71% de 221
ddaba 0.15706 0.00021 86.81% de 144
ffh 0.15513 0.00025 100.00% de 38
addab 0.15506 0.00025 86.71% de 143
dfd 0.15466 0.00026 86.96% de 138
afdf 0.15300 0.00030 96.08% de 51
cafa 0.15191 0.00034 93.65% de 63
... ... ... ...
bdbaad 0.11819 0.00539 88.16% de 76
dbbbad 0.11819 0.00539 88.16% de 76
abfbab 0.11819 0.00539 88.16% de 76
abdd 0.11792 0.00549 82.23% de 197
aabaga 0.11782 0.00554 87.65% de 81
bacd 0.11780 0.00554 86.14% de 101
adbbda 0.11780 0.00554 86.14% de 101
abdab 0.11778 0.00555 79.36% de 344
abafb 0.11759 0.00563 83.78% de 148
aabaf 0.11758 0.00563 81.39% de 231
fbbd 0.11755 0.00565 86.81% de 91
bbdda 0.11727 0.00576 85.00% de 120
aabadd 0.11673 0.00599 85.22% de 115
dadaba 0.11642 0.00613 84.50% de 129
badbd 0.11626 0.00620 85.45% de 110
bdbdb 0.11574 0.00644 84.68% de 124
bafba 0.11551 0.00654 83.67% de 147
bdda 0.11548 0.00655 81.69% de 213
aabdad 0.11512 0.00673 84.87% de 119
bdaada 0.11504 0.00677 84.21% de 133
acabaa 0.11483 0.00687 82.58% de 178
agd 0.11438 0.00709 83.44% de 151
ddbaa 0.11338 0.00761 82.93% de 164
addba 0.11338 0.00761 82.93% de 164
dbabab 0.11273 0.00797 81.01% de 237
afbba 0.11253 0.00808 82.11% de 190
aabdb 0.10912 0.01023 79.07% de 344
abdbbb 0.10867 0.01055 80.30% de 264
bdaab 0.10864 0.01057 79.17% de 336
baabd 0.10392 0.01449 79.14% de 326

276 rows × 3 columns


In [ ]: