In [1]:
import psycopg2
import gc
from psycopg2.extensions import register_adapter, AsIs
from collections import Counter
import pandas as pd
import numpy as np
from time import time
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.linear_model import LogisticRegression
np.set_printoptions(suppress=True,precision=10)
In [2]:
def obtainMaxRecords(dbname="mimic") :
    """Return the greatest ``rstq.record`` of every subject not yet registered.

    The subject id is computed in SQL as the third '/'-separated component of
    ``record`` with every 's' removed. Rows whose ``centroid`` is NULL are
    ignored, and so are subjects already present in ``subjectrecord``.

    Args:
        dbname: Name of the PostgreSQL database to connect to.

    Returns:
        A list of ``{"subject_id": int, "record": <max record>}`` dicts, one
        per group returned by the query.
    """
    select_stament = ("SELECT replace(split_part(record, '/',3),'s',''),max(record) "
                      " FROM rstq "
                      " WHERE cast(replace(split_part(record, '/',3),'s','') as integer) "
                      " NOT IN (select subject_id from subjectrecord) "
                      " AND centroid IS NOT NULL"
                      " GROUP BY split_part(record, '/',3)"
                      )
    conn = psycopg2.connect("dbname="+dbname)
    try:
        cur = conn.cursor()
        try:
            cur.execute(select_stament)
            return [{"subject_id": int(row[0]), "record": row[1]}
                    for row in cur.fetchall()]
        finally:
            # The cursor is released even when the query fails.
            cur.close()
    finally:
        # Always give the connection back, whatever happened above.
        conn.close()
In [3]:
def insert(words,table,dbname="mimic") :
    """Insert one row, described by a dict, into ``table`` and commit.

    Args:
        words: Mapping of column name -> value for the new row.
        table: Name of the target table.
        dbname: Name of the PostgreSQL database to connect to.

    Note:
        ``table`` and the column names are spliced into the SQL text verbatim
        (the latter through ``AsIs``); only the values are escaped by the
        driver. Both must therefore come from trusted code, never user input.
    """
    insert_statement = 'INSERT into '+table+' (%s) values %s'
    # Freeze the column order once so names and values cannot drift apart.
    columns = list(words.keys())
    values = tuple(words[column] for column in columns)
    conn = psycopg2.connect("dbname="+dbname)
    try:
        cur = conn.cursor()
        try:
            cur.execute(insert_statement, (AsIs(','.join(columns)), values))
            conn.commit()
        finally:
            cur.close()
    finally:
        # Closing without a commit discards the failed transaction.
        conn.close()
In [4]:
def fillsubjectRecord() :
    """Store every entry returned by ``obtainMaxRecords()`` in ``subjectrecord``."""
    pending = obtainMaxRecords()
    for entry in pending:
        insert(entry, "subjectrecord")
In [5]:
def obtainSubjects(dbname="mimic") :
    """Load every ``(subject_id, record)`` pair stored in ``subjectrecord``.

    Args:
        dbname: Name of the PostgreSQL database to connect to.

    Returns:
        A list of ``{"subject_id": int, "record": <record>}`` dicts, in the
        order the database returns the rows.
    """
    select_stament = ("SELECT subject_id,record "
                      " FROM subjectrecord"
                      )
    conn = psycopg2.connect("dbname="+dbname)
    try:
        cur = conn.cursor()
        try:
            cur.execute(select_stament)
            return [{"subject_id": int(row[0]), "record": row[1]}
                    for row in cur.fetchall()]
        finally:
            cur.close()
    finally:
        # Release the connection even if the query raised.
        conn.close()
In [6]:
def patientIsAlive(patient,dbname="mimic") :
    """Look up whether ``patients.dod`` is recorded for a subject.

    Despite the function name, the flag is 1 when ``dod`` is NOT NULL and 0
    when it is NULL (``dod`` is presumably the date of death, in which case 1
    means deceased - confirm against the schema).

    Args:
        patient: Subject id. Anything ``int()`` accepts works, including numpy
            integers and numeric strings.
        dbname: Name of the PostgreSQL database to connect to.

    Returns:
        ``[]`` when the subject has no row in ``patients``, otherwise a
        one-element list ``[1]`` or ``[0]`` as described above.

    Raises:
        ValueError / TypeError: If ``patient`` cannot be converted to ``int``.
    """
    select_stament = ("SELECT dod "
                      " FROM patients WHERE subject_id = %s LIMIT 1"
                      )
    # Bind the id as a parameter instead of concatenating it into the SQL.
    # int() also turns numpy integers into a type the driver can adapt.
    params = (int(patient),)
    conn = psycopg2.connect("dbname="+dbname)
    try:
        cur = conn.cursor()
        try:
            cur.execute(select_stament, params)
            return [1 if row[0] is not None else 0 for row in cur.fetchall()]
        finally:
            cur.close()
    finally:
        conn.close()
In [7]:
def obtainWord(subject,dbname="mimic") :
    """Concatenate the centroids of one record into a single string.

    Args:
        subject: Value matched against ``rstq.record`` (converted with
            ``str()`` before being bound to the query).
        dbname: Name of the PostgreSQL database to connect to.

    Returns:
        The non-NULL ``centroid`` values of the record joined in ``r_s``
        order; ``""`` when the record has no such rows.
    """
    select_stament = ("SELECT centroid "
                      " FROM rstq WHERE record=%s ORDER BY r_s"
                      )
    conn = psycopg2.connect("dbname="+dbname)
    try:
        cur = conn.cursor()
        try:
            # The record name is bound as a parameter, so quotes inside it
            # can neither break nor alter the statement.
            cur.execute(select_stament, (str(subject),))
            return "".join(row[0] for row in cur.fetchall()
                           if row[0] is not None)
        finally:
            cur.close()
    finally:
        conn.close()
In [8]:
def deleteWord(dbname="mimic") :
    """Delete every row of ``subjectwords`` and commit.

    Args:
        dbname: Name of the PostgreSQL database to connect to.
    """
    delete_statement = "DELETE FROM subjectwords"
    conn = psycopg2.connect("dbname="+dbname)
    try:
        cur = conn.cursor()
        try:
            cur.execute(delete_statement)
            conn.commit()
        finally:
            cur.close()
    finally:
        # Runs on failure too, so a failed DELETE cannot leak the connection.
        conn.close()
In [9]:
def selectWord(dbname="mimic", min_length=1000, limit=400) :
    """Fetch the stored words that are long enough to be analysed.

    Args:
        dbname: Name of the PostgreSQL database to connect to.
        min_length: Only words strictly longer than this many characters are
            returned (default 1000, the value previously hard-coded).
        limit: Maximum number of rows returned (default 400, the value
            previously hard-coded).

    Returns:
        A list of ``[subject_id, word]`` lists ordered by ``isalive``
        descending. The order among rows with equal ``isalive`` is left to
        the database.
    """
    select_stament = ("SELECT subject_id,word"
                      " FROM subjectwords"
                      " WHERE length(word)>%s"
                      " ORDER BY isalive DESC"
                      " LIMIT %s")
    conn = psycopg2.connect("dbname="+dbname)
    try:
        cur = conn.cursor()
        try:
            cur.execute(select_stament, (int(min_length), int(limit)))
            return [[row[0], row[1]] for row in cur.fetchall()]
        finally:
            cur.close()
    finally:
        conn.close()
In [10]:
def createListOfWords() :
    """Rebuild ``subjectwords`` from the subjects listed in ``subjectrecord``.

    The table is emptied first. Then, for each subject whose status lookup
    (``patientIsAlive``) returns a row, the centroid word of its record is
    built and stored together with the status flag, provided the word is
    longer than 1000 characters. A summary line is printed at the end.
    """
    subjects = obtainSubjects()
    deleteWord()
    saved = 0
    known = 0
    for entry in subjects:
        patient_id = entry['subject_id']
        status = patientIsAlive(patient_id)
        if status == []:
            # No row in ``patients`` for this subject: nothing to label.
            continue
        known += 1
        word = obtainWord(entry['record'])
        if word is None:
            continue
        row = {'subject_id': patient_id, 'word': word, 'isalive': status[0]}
        if len(word) > 1000:
            insert(row, table='subjectwords')
            saved += 1
    print()
    print("In a list of "+str(len(subjects))+" we know the status of "+str(known)+" patients and save "+str(saved)+" wordforms of the patients with more that 1000 heartbeats")
In [11]:
def get_all_substrings(input_string,length=5):
    """Count every substring of ``input_string`` of 1..``length`` characters.

    Each occurrence (start position, size) is counted exactly once. Near the
    end of the string only the sizes that still fit are considered, so short
    suffixes are not counted repeatedly through truncated slices.

    Args:
        input_string: Sequence to scan.
        length: Maximum substring size; values below 1 yield an empty result.

    Returns:
        collections.Counter mapping substring -> number of occurrences.
    """
    counts = Counter()
    total = len(input_string)
    for start in range(total):
        # Never ask for more characters than remain after ``start``.
        longest = min(length, total - start)
        for size in range(1, longest + 1):
            counts[input_string[start:start + size]] += 1
    return counts
In [12]:
def existMatrix(word,subject,dbname="mimic") :
    """Tell whether ``matrix`` already holds a row for (subject, word).

    Args:
        word: Word to look for (converted with ``str()``).
        subject: Subject id to look for (converted with ``str()``).
        dbname: Name of the PostgreSQL database to connect to.

    Returns:
        True if at least one matching row exists, False otherwise.
    """
    select_stament = ("SELECT 1 "
                      " FROM matrix WHERE subject_id=%s AND word=%s"
                      )
    conn = psycopg2.connect("dbname="+dbname)
    try:
        cur = conn.cursor()
        try:
            # Values are bound by the driver, so a quote inside ``word``
            # cannot break or alter the statement.
            cur.execute(select_stament, (str(subject), str(word)))
            return cur.fetchone() is not None
        finally:
            cur.close()
    finally:
        conn.close()
In [13]:
def saveMatrix(matrix,dbname="mimic") :
    """Bulk-insert substring counts into ``matrix`` with one statement.

    The rows are sent as three parallel arrays that the database expands with
    ``unnest``.

    Args:
        matrix: List of dicts with the keys ``'word'``, ``'subject_id'`` and
            ``'counting'``. An empty list is a no-op.
        dbname: Name of the PostgreSQL database to connect to.
    """
    if not matrix:
        # Nothing to insert; this also avoids sending empty arrays to unnest().
        return
    insert_statement=('INSERT INTO matrix(word,subject_id,counting)'
                      ' SELECT unnest( %(word)s ) ,'
                      ' unnest( %(subject_id)s) ,'
                      ' unnest( %(counting)s)')
    # Pass exactly the three named parameters rather than locals(), which
    # would also hand the connection and cursor to the driver.
    params = {
        'word': [r['word'] for r in matrix],
        'subject_id': [r['subject_id'] for r in matrix],
        'counting': [r['counting'] for r in matrix],
    }
    conn = psycopg2.connect("dbname="+dbname)
    try:
        cur = conn.cursor()
        try:
            cur.execute(insert_statement, params)
            conn.commit()
        finally:
            cur.close()
    finally:
        conn.close()
In [14]:
def cleanMatrix(dbname="mimic") :
    """Delete every row of ``matrix`` and commit.

    Args:
        dbname: Name of the PostgreSQL database to connect to.
    """
    delete_statement = ("DELETE "
                        " FROM matrix"
                        )
    conn = psycopg2.connect("dbname="+dbname)
    try:
        cur = conn.cursor()
        try:
            cur.execute(delete_statement)
            conn.commit()
        finally:
            cur.close()
    finally:
        # Runs on failure too, so a failed DELETE cannot leak the connection.
        conn.close()
In [15]:
def fillMatrix():
    """Recompute the ``matrix`` table from the words given by ``selectWord()``.

    The table is emptied, then for each selected subject the substrings of up
    to 10 characters of its word are counted and stored in one batch. The
    final message reports how many batches (one per subject) were saved.
    """
    cleanMatrix()
    saved = 0
    for entry in selectWord():
        subject = entry[0]
        counts = get_all_substrings(entry[1], length=10)
        matrix = [
            {'word': key, 'counting': amount, 'subject_id': subject}
            for key, amount in counts.items()
        ]
        if matrix:
            saved += 1
            saveMatrix(matrix)
    print("The matrix was filled with "+str(saved)+" values.")
In [16]:
def selectMatrix(dbname="mimic") :
    """Read the whole ``matrix`` table ordered by subject.

    Args:
        dbname: Name of the PostgreSQL database to connect to.

    Returns:
        A list of ``(subject_id, word, counting)`` rows as delivered by the
        driver, ordered by ``subject_id``.
    """
    select_stament = ("SELECT subject_id,word,counting "
                      " FROM matrix ORDER BY subject_id"
                      )
    conn = psycopg2.connect("dbname="+dbname)
    try:
        cur = conn.cursor()
        try:
            cur.execute(select_stament)
            return list(cur.fetchall())
        finally:
            cur.close()
    finally:
        # Release the connection even if the query raised.
        conn.close()
In [17]:
def convertMatrix() :
    """Pivot the stored counts into a subject-by-word table.

    Prints the number of rows read from ``selectMatrix()`` and returns a
    pivot table indexed by ``subject_id`` with one column per word; missing
    (subject, word) combinations are filled with 0.
    """
    records = selectMatrix()
    frame = pd.DataFrame.from_records(
        records, columns=['subject_id', 'Word', 'Counting'])
    print(len(frame))
    pivoted = pd.pivot_table(
        frame,
        index=["subject_id"],
        columns=["Word"],
        values=["Counting"],
        aggfunc={"Counting": [np.sum]},
        fill_value=0,
    )
    return pivoted
In [18]:
# Run the three database-building stages in order, timing each one and
# collecting garbage in between.
for stage in (fillsubjectRecord, createListOfWords, fillMatrix):
    t0 = time()
    stage()
    print("done in {:0.3f}s.".format(time() - t0))
    gc.collect()

# Final stage: build the subject-by-word table used by the cells below.
t0 = time()
table = convertMatrix()
print("done in {:0.3f}s.".format(time() - t0))
gc.collect()
Out[18]:
In [19]:
# Factorise the subject-by-word table into 30 components:
# W comes from fit_transform(table), H is the fitted components_ matrix.
nmf = NMF(n_components=30, random_state=1, alpha=.1, l1_ratio=.5)
W = nmf.fit_transform(table)
H = nmf.components_
for factor in (W, H):
    print(np.shape(factor))
In [20]:
# Collect the status flag of every subject in the table, in index order.
# Each kept entry is the one-element list returned by patientIsAlive().
patients = []
for patient in table.index:
    isAlive = patientIsAlive(patient)
    if isAlive == []:
        # Subject without a row in ``patients``: no label available.
        continue
    patients.append(isAlive)
In [21]:
# Turn the list of one-element status lists into a flat 1-D label vector,
# then fit a logistic regression on the NMF factors W.
y = np.ravel(patients)
modelo_lr = LogisticRegression()
modelo_lr.fit(X=W, y=y)
Out[21]:
In [22]:
# Score on the same (W, y) the model was fitted on: this is the training
# score, not an estimate of performance on unseen patients.
modelo_lr.score(W, y)
Out[22]:
In [23]:
# Number of subjects for which a status label was collected.
print(len(patients))
In [24]:
# Share of labels equal to 1, i.e. subjects whose ``dod`` is not NULL
# according to patientIsAlive().
y.mean()
Out[24]:
In [ ]: