In [1]:
import pandas as pd
import psycopg2
import numpy as np
from scipy.stats.stats import pearsonr
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [2]:
import sys 
import os
sys.path.append(os.path.abspath("/home/scidb/HeartRatePatterns/Python"))
from LogisticRegresion import ajustLogisticRegression
from PlotWords import plot_word
from Matrix import convert_matrix

In [3]:
def savePearson(pearson,dbname="mimic") :
    conn = psycopg2.connect("dbname="+dbname)
    cur = conn.cursor()
    insert_statement=('INSERT INTO wordspearson(word,p1,p2,patient,deadPatient)'
                      ' SELECT unnest( %(word)s ) ,'
                      ' unnest( %(p1)s) ,'
                      ' unnest( %(p2)s) ,'
                      ' unnest( %(patient)s) ,'
                      ' unnest( %(deadPatient)s)')
    word=[r['word'] for r in pearson]
    p1=[r['p1'] for r in pearson]
    p2=[r['p2'] for r in pearson]
    patient=[r['patient'] for r in pearson]
    deadPatient=[r['deadPatient'] for r in pearson]
#    print(cur.mogrify(insert_statement,locals()))
    cur.execute(insert_statement,locals())
    conn.commit()
    cur.close()
    conn.close()

In [4]:
def selectPearson(dbname="mimic") :
    conn = psycopg2.connect("dbname="+dbname)
    cur = conn.cursor()
    select_statement='SELECT word,p1,p2,patient,deadpatient FROM wordspearson'
#    print(cur.mogrify(select_statement,locals()))
    cur.execute(select_statement)
    select = []
    for row in cur :
        patient=row[3]
        cuantosMueren ="{0:.2%}".format(row[4]/patient)+" de "+str(patient)
        select.append({"word":row[0],"p1":row[1],"p2":row[2],"cuantosMueren":cuantosMueren})
    cur.close()
    conn.close()
    return sorted(select, key=itemgetter('p1'), reverse=True)

In [5]:
def countPatients(word,dbname="mimic") :
    conn = psycopg2.connect("dbname="+dbname)
    cur = conn.cursor()
    select_statement='''SELECT count(1),sum(isalive) FROM matrix m LEFT JOIN subjectwords s 
    ON m.subject_id=s.subject_id where m.word = %s GROUP BY m.word'''
#    print(cur.mogrify(select_statement,(word,)))
    cur.execute(select_statement,(word,))
    select = {}
    for row in cur :
        select = {"patient":row[0],"deadPatient":row[1],}
    cur.close()
    conn.close()
    return select

In [6]:
def cleanPearson(dbname="mimic") :
    conn = psycopg2.connect("dbname="+dbname)
    cur = conn.cursor()
    delete_statement='DELETE FROM wordspearson'
#    print(cur.mogrify(delete_statement,locals()))
    cur.execute(delete_statement,locals())
    conn.commit()
    cur.close()
    conn.close()

In [7]:
table = convert_matrix(sumvals=False)
table


(590, 58840)
Out[7]:
aaaaaaa aaaaaaaa aaaaaaab aaaaaaac aaaaaaad aaaaaaae aaaaaaaf aaaaaaag aaaaaaah aaaaaaai ... kkb kke lab lbb lbbb lbbc lcc leb lib libb
subject_id isAlive
20 0 1 1 1 0 1 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
135 1 1 1 1 0 1 0 1 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
151 1 1 1 1 1 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
177 1 1 1 1 0 1 0 1 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
214 1 1 1 1 1 1 0 1 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
263 1 1 1 1 0 1 0 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
279 1 1 1 1 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
283 1 1 1 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
368 1 1 1 1 1 1 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
377 1 1 1 1 1 1 0 1 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
408 1 1 1 1 0 1 0 1 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
462 0 1 1 1 0 1 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
618 1 1 1 1 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
638 1 1 1 1 1 1 0 1 1 1 0 ... 0 0 0 0 0 0 0 0 0 0
682 1 1 1 1 1 1 1 1 1 1 0 ... 0 0 0 0 0 0 0 0 0 0
736 0 1 1 1 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
743 1 1 1 1 1 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
749 1 1 1 1 1 1 0 1 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
793 1 1 1 1 1 1 0 1 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
886 1 1 1 1 0 1 0 1 1 1 0 ... 0 0 0 0 0 0 0 0 0 0
952 1 1 1 1 1 1 0 1 1 1 0 ... 0 0 0 0 0 0 0 0 0 0
974 0 1 1 1 1 1 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1004 1 1 1 1 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1075 1 1 1 1 1 1 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1144 0 1 1 1 1 1 0 1 1 1 0 ... 0 0 0 0 0 0 0 0 0 0
1160 0 1 1 1 0 1 0 1 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
1222 0 1 1 1 0 1 0 1 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
1226 1 1 1 1 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1459 0 1 1 1 0 1 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1528 1 1 1 1 1 1 0 1 1 1 0 ... 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
23178 1 1 1 1 1 1 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23193 0 1 1 1 0 1 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23200 0 1 1 1 1 0 0 0 0 0 0 ... 0 1 0 1 1 0 0 1 0 0
23298 0 1 1 1 1 1 1 1 0 1 0 ... 0 0 0 0 0 0 0 0 0 0
23336 1 1 1 1 1 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23339 0 1 1 1 0 1 0 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
23363 1 1 1 1 1 1 0 1 1 1 0 ... 0 0 0 0 0 0 0 0 0 0
23384 0 1 1 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23401 1 1 1 1 0 1 0 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
23451 1 1 1 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23468 1 1 1 1 1 1 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23474 1 1 1 1 1 1 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23510 1 1 1 1 0 1 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23944 1 1 1 1 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
24004 1 1 1 1 1 1 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
24030 0 1 1 1 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
24076 1 1 1 1 0 1 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
24129 1 1 1 1 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
24133 0 1 1 1 1 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
24142 1 1 1 1 0 1 0 1 0 1 0 ... 0 0 0 0 0 0 0 0 0 0
24152 1 1 1 1 0 1 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
24185 1 1 1 1 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
24227 0 1 1 1 1 1 0 1 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
25466 0 1 1 1 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41962 1 1 1 1 0 1 0 1 1 1 0 ... 0 0 0 0 0 0 0 0 0 0
42255 1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
42261 0 1 1 1 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
42410 0 1 1 1 1 1 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
42492 0 1 1 1 1 1 0 1 1 0 1 ... 0 0 0 0 0 0 0 0 0 0
43459 0 1 1 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

590 rows × 58840 columns


In [8]:
survived = table.index.labels[1].tolist()
patients = table.values
columns = list(table.columns.values)

In [9]:
cleanPearson()
print("cleanedPearson now reading",len(columns),"columns")
pearsonList = []
for i in range(len(columns)):
    pearson = pearsonr(patients[:,i],survived)
    word = columns[i]
    count = countPatients(word)
    pearsonList.append({'word':word,'p1':pearson[0],'p2':pearson[1],'patient':count['patient'],'deadPatient':count['deadPatient']})
#    print(i,end=", ")
print(" preparedToSavePearson")
savePearson(pearsonList)
print("savedPearson")


cleanedPearson now reading 58840 columns
 preparedToSavePearson
savedPearson

In [10]:
patients_train, patients_test,survived_train, survived_test = train_test_split(patients,survived,test_size=0.2, random_state=42)

In [11]:
print(table.shape)
print(patients_train.shape)
print(patients_test.shape)


(590, 58840)
(472, 58840)
(118, 58840)

In [13]:
model,accuracy_score,logit_roc_auc = ajustLogisticRegression(patients_train,survived_train,patients_test,survived_test)

In [14]:
model


Out[14]:
LogisticRegressionCV(Cs=[0.007966, 0.0080488, 0.008131600000000001, 0.0082144, 0.008297200000000001, 0.00838],
           class_weight=None, cv=5, dual=True, fit_intercept=True,
           intercept_scaling=1.0, max_iter=100, multi_class='ovr',
           n_jobs=-1, penalty='l2', random_state=0, refit=True,
           scoring='roc_auc', solver='liblinear', tol=0.0001, verbose=0)

In [7]:
from operator import itemgetter
pearsonDict = selectPearson()

In [8]:
plot_word(pearsonDict[:10])


b"SELECT v1.r_s,v8.q_s,v1.record,a.rec_from\n    FROM rstq v1\n    INNER JOIN a on a.record=v1.record INNER JOIN rstq v2 on v1.record=v2.record  INNER JOIN rstq v3 on v2.record=v3.record  INNER JOIN rstq v4 on v3.record=v4.record  INNER JOIN rstq v5 on v4.record=v5.record  INNER JOIN rstq v6 on v5.record=v6.record  INNER JOIN rstq v7 on v6.record=v7.record  INNER JOIN rstq v8 on v7.record=v8.record  WHERE v1.centroid ='d'  AND v2.centroid='a' AND v1.id+1=v2.id  AND v3.centroid='b' AND v2.id+1=v3.id  AND v4.centroid='a' AND v3.id+1=v4.id  AND v5.centroid='a' AND v4.id+1=v5.id  AND v6.centroid='a' AND v5.id+1=v6.id  AND v7.centroid='a' AND v6.id+1=v7.id  AND v8.centroid='d' AND v7.id+1=v8.id  AND v1.r_s<v8.q_s\n    AND a.record NOT IN ('mimic2wdb/matched/s20354/s20354-2526-08-25-00-53',\n    'mimic2wdb/matched/s14584/s14584-2721-07-20-18-49',\n    'mimic2wdb/matched/s18413/s18413-3047-06-23-22-31',\n    'mimic2wdb/matched/s16032/s16032-3339-07-31-12-58')\n    LIMIT 1"
dabaaaad 89179756 89180858 mimic2wdb/matched/s19087/s19087-2560-06-15-07-53

In [9]:
df = pd.DataFrame(pearsonDict)
df = df.set_index('word')
df = df.sort_values(['p1'], ascending=[False])
df


Out[9]:
cuantosMueren p1 p2
word
adc 89.68% de 155 0.208886 3.061022e-07
fdf 96.39% de 83 0.203784 5.979812e-07
dabbaba 88.17% de 169 0.199764 1.001482e-06
dbda 85.07% de 221 0.189067 3.755808e-06
dabaaaad 95.06% de 81 0.188816 3.870746e-06
fbd 87.27% de 165 0.183705 7.089394e-06
dff 93.18% de 88 0.180136 1.071184e-05
ffd 91.26% de 103 0.177633 1.424185e-05
fff 93.83% de 81 0.177530 1.440773e-05
bdbd 85.79% de 183 0.174950 1.924153e-05
addaba 90.57% de 106 0.173294 2.311825e-05
faf 87.01% de 154 0.171685 2.758436e-05
dfdd 96.67% de 60 0.171629 2.775279e-05
afg 90.48% de 105 0.171339 2.864863e-05
haf 95.45% de 66 0.171176 2.915994e-05
bdd 82.53% de 269 0.170361 3.186366e-05
bdab 79.23% de 414 0.169413 3.530489e-05
ddba 84.02% de 219 0.169222 3.603969e-05
ababdba 86.34% de 161 0.167458 4.354558e-05
fabd 90.29% de 103 0.167401 4.381046e-05
bbdd 85.47% de 179 0.167382 4.389931e-05
gga 94.29% de 70 0.167137 4.506321e-05
aadc 88.62% de 123 0.167130 4.509729e-05
adabbaba 89.38% de 113 0.167010 4.567608e-05
dbbd 84.90% de 192 0.166946 4.598597e-05
gah 100.00% de 44 0.166489 4.827397e-05
daf 83.48% de 230 0.166158 4.999756e-05
abdabaa 84.04% de 213 0.165887 5.145518e-05
afd 83.71% de 221 0.164993 5.653954e-05
dbababa 86.45% de 155 0.164761 5.794047e-05
... ... ... ...
ababebba 0.00% de 5 -0.157634 1.206170e-04
abiab 0.00% de 5 -0.157634 1.206170e-04
abebaaaa 0.00% de 5 -0.157634 1.206170e-04
abebaaa 0.00% de 5 -0.157634 1.206170e-04
aaaeabc 0.00% de 5 -0.157634 1.206170e-04
ababaea 0.00% de 5 -0.157634 1.206170e-04
aababae 0.00% de 5 -0.157634 1.206170e-04
aaacecc 0.00% de 5 -0.157634 1.206170e-04
aaababi 0.00% de 5 -0.157634 1.206170e-04
aaaaeabc 0.00% de 5 -0.157634 1.206170e-04
aaaacea 0.00% de 5 -0.157634 1.206170e-04
aacecc 0.00% de 5 -0.157634 1.206170e-04
beaaaa 45.16% de 31 -0.157821 1.183678e-04
aaaabea 42.31% de 26 -0.157932 1.170472e-04
bbbi 51.92% de 52 -0.160181 9.314819e-05
eaaa 56.79% de 81 -0.161042 8.527587e-05
aabeaa 39.13% de 23 -0.162812 7.101815e-05
aebaaaaa 20.00% de 10 -0.163708 6.469227e-05
aaebaaa 20.00% de 10 -0.163708 6.469227e-05
aaeab 28.57% de 14 -0.163751 6.440178e-05
biabb 28.57% de 14 -0.163751 6.440178e-05
aaaaaeba 12.50% de 8 -0.166323 4.913144e-05
aaaaeba 12.50% de 8 -0.166323 4.913144e-05
bccccabc 12.50% de 8 -0.166323 4.913144e-05
aaabeaa 35.00% de 20 -0.169153 3.630863e-05
aaaaebaa 0.00% de 6 -0.172828 2.433653e-05
aababi 0.00% de 6 -0.172828 2.433653e-05
bbia 38.46% de 26 -0.176856 1.554560e-05
aebaaaa 18.18% de 11 -0.177590 1.431167e-05
aaebaaaa 0.00% de 8 -0.199907 9.834993e-07

58840 rows × 3 columns


In [ ]: