In [1]:
import psycopg2
import gc
from psycopg2.extensions import register_adapter, AsIs
from collections import Counter
import pandas as pd
import numpy as np
from time import time
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.linear_model import LogisticRegression
# Print NumPy floats in fixed-point notation (no scientific notation), with up to 10 decimal digits.
np.set_printoptions(suppress=True,precision=10)

In [2]:
def obtainMaxRecords(dbname="mimic") :
    """Return the greatest ``record`` per subject that is not yet in ``subjectrecord``.

    The subject id is taken from the third '/'-separated component of
    ``rstq.record`` with every 's' removed. Only rows whose ``centroid`` is
    not NULL are considered, and subjects already present in
    ``subjectrecord`` are skipped.

    Parameters
    ----------
    dbname : str
        Name of the PostgreSQL database to connect to.

    Returns
    -------
    list of dict
        One ``{"subject_id": int, "record": str}`` entry per returned row.
    """
    select_stament = ("SELECT replace(split_part(record, '/',3),'s',''),max(record) "
                      " FROM rstq "
                      " WHERE cast(replace(split_part(record, '/',3),'s','') as integer) "
                      " NOT IN (select subject_id from subjectrecord) "
                      " AND centroid IS NOT NULL"
                      " GROUP BY split_part(record, '/',3)"
    )
    conn = psycopg2.connect("dbname="+dbname)
    try :
        cur = conn.cursor()
        try :
            cur.execute(select_stament)
            return [{"subject_id":int(row[0]),"record":row[1]} for row in cur]
        finally :
            # Release the cursor even when the query or the row conversion fails.
            cur.close()
    finally :
        # Always hand the connection back, success or failure.
        conn.close()

In [3]:
def insert(words,table,dbname="mimic") :
    """Insert one row into ``table`` and commit.

    Parameters
    ----------
    words : dict
        Maps column name to the value to store. The values are passed to the
        driver as bound parameters.
    table : str
        Target table name.
    dbname : str
        Name of the PostgreSQL database to connect to.

    Notes
    -----
    ``table`` and the column names (the keys of ``words``) are spliced into
    the statement text verbatim, so they must come from trusted code, never
    from user input.
    """
    columns = list(words.keys())
    values = tuple(words[column] for column in columns)
    insert_statement = 'INSERT into '+table+' (%s) values %s'
    conn = psycopg2.connect("dbname="+dbname)
    try :
        cur = conn.cursor()
        try :
            # AsIs injects the column list unquoted; the value tuple is adapted by the driver.
            cur.execute(insert_statement, (AsIs(','.join(columns)), values))
            conn.commit()
        finally :
            cur.close()
    finally :
        # Closing without a commit discards the failed transaction.
        conn.close()

In [4]:
def fillsubjectRecord() :
    """Store every entry returned by ``obtainMaxRecords()`` as a row of ``subjectrecord``."""
    pending = obtainMaxRecords()
    for entry in pending :
        # Each entry is a column -> value dict, which is what insert() expects.
        insert(entry,"subjectrecord")

In [5]:
def obtainSubjects(dbname="mimic") :
    """Read every (subject_id, record) pair stored in ``subjectrecord``.

    Returns a list of ``{"subject_id": int, "record": ...}`` dicts, one per
    row, in the order the database returns them.
    """
    connection = psycopg2.connect("dbname="+dbname)
    cursor = connection.cursor()
    cursor.execute("SELECT subject_id,record "
                   " FROM subjectrecord")
    found = [{"subject_id":int(row[0]),"record":row[1]} for row in cursor]
    cursor.close()
    connection.close()
    return found

In [6]:
def patientIsAlive(patient,dbname="mimic") :
    """Look up whether ``patients.dod`` is set for one subject.

    Parameters
    ----------
    patient : int or int-like
        Subject identifier. It is converted with ``int()`` and sent as a bound
        query parameter, not spliced into the SQL text.
    dbname : str
        Name of the PostgreSQL database to connect to.

    Returns
    -------
    list of int
        ``[]`` when no row matches; otherwise a one-element list holding
        ``1`` if ``dod`` is not NULL and ``0`` if it is NULL.

    Raises
    ------
    ValueError, TypeError
        If ``patient`` cannot be converted to an integer.

    Notes
    -----
    NOTE(review): ``dod`` is presumably the date of death, so despite the
    function name a flag of 1 would mean a death is recorded -- confirm
    before interpreting the labels.
    """
    subject_id = int(patient)
    select_stament = ("SELECT dod "
                      " FROM patients WHERE subject_id = %s LIMIT 1"
    )
    conn = psycopg2.connect("dbname="+dbname)
    try :
        cur = conn.cursor()
        try :
            cur.execute(select_stament, (subject_id,))
            return [1 if row[0] is not None else 0 for row in cur]
        finally :
            cur.close()
    finally :
        conn.close()

In [7]:
def obtainWord(subject,dbname="mimic") :
    """Concatenate the non-NULL ``centroid`` values of one record, ordered by ``r_s``.

    Parameters
    ----------
    subject : str
        Value matched against ``rstq.record``. Despite the parameter name,
        the visible caller passes a record identifier. It is sent as a bound
        query parameter, so quotes in the value cannot alter the statement.
    dbname : str
        Name of the PostgreSQL database to connect to.

    Returns
    -------
    str
        The centroids joined without a separator; ``""`` when the record has
        no rows or only NULL centroids.
    """
    select_stament = ("SELECT centroid "
                      " FROM rstq WHERE record=%s ORDER BY r_s"
    )
    conn = psycopg2.connect("dbname="+dbname)
    try :
        cur = conn.cursor()
        try :
            cur.execute(select_stament, (str(subject),))
            # A single join avoids rebuilding the growing string once per row.
            return "".join(row[0] for row in cur if row[0] is not None)
        finally :
            cur.close()
    finally :
        conn.close()

In [8]:
def deleteWord(dbname="mimic") :
    """Delete every row of the ``subjectwords`` table and commit the change."""
    connection = psycopg2.connect("dbname="+dbname)
    cursor = connection.cursor()
    cursor.execute("DELETE FROM subjectwords")
    connection.commit()
    cursor.close()
    connection.close()

In [9]:
def selectWord(dbname="mimic",min_length=1000,limit=400) :
    """Fetch (subject_id, word) pairs for sufficiently long words.

    Rows of ``subjectwords`` whose word is longer than ``min_length``
    characters are ordered by ``isalive`` descending and truncated to
    ``limit`` rows.

    Parameters
    ----------
    dbname : str
        Name of the PostgreSQL database to connect to.
    min_length : int
        Exclusive lower bound on ``length(word)``. The default keeps the
        previously hard-coded 1000.
    limit : int
        Maximum number of rows returned. The default keeps the previously
        hard-coded 400.

    Returns
    -------
    list of list
        ``[subject_id, word]`` pairs in query order.

    Notes
    -----
    The ordering has no tie-breaker, so which rows survive ``limit`` among
    equal ``isalive`` values is left to the database.
    """
    select_stament = ("SELECT subject_id,word "
                      " FROM subjectwords "
                      " WHERE length(word)>%s "
                      " ORDER BY isalive DESC "
                      " LIMIT %s"
    )
    conn = psycopg2.connect("dbname="+dbname)
    try :
        cur = conn.cursor()
        try :
            cur.execute(select_stament, (min_length, limit))
            return [[row[0],row[1]] for row in cur]
        finally :
            cur.close()
    finally :
        conn.close()

In [10]:
def createListOfWords() :
    """Rebuild ``subjectwords`` from the records listed in ``subjectrecord``.

    The table is emptied first. Then, for every subject whose status lookup
    returns a value, the centroid word of its record is fetched and stored
    together with that status when the word is longer than 1000 characters.
    A one-line summary of the counts is printed at the end.
    """
    subjects = obtainSubjects()
    total = len(subjects)
    deleteWord()
    known = 0
    saved = 0
    for subject in subjects :
        subject_id = subject['subject_id']
        status = patientIsAlive(subject_id)
        if status == [] :
            # No row for this subject in the patients table: nothing to label.
            continue
        known += 1
        word = obtainWord(subject['record'])
        if word is None :
            continue
        row = {'subject_id':subject_id,'word':word,'isalive':status[0]}
        if len(word) <= 1000 :
            continue
        insert(row,table='subjectwords')
        saved += 1
    print()
    print("In a list of "+str(total)+" we know the status of "+str(known)+" patients and save "+str(saved)+" wordforms of the patients with more that 1000 heartbeats")

In [11]:
def get_all_substrings(input_string,length=5):
    """Count every substring of 1 to ``length`` characters in ``input_string``.

    Parameters
    ----------
    input_string : str
        Text to scan.
    length : int
        Longest substring size to count.

    Returns
    -------
    collections.Counter
        Maps each substring to the number of start positions at which it
        occurs. Empty for an empty string or a non-positive ``length``.

    Notes
    -----
    Substrings never run past the end of the text. Slicing beyond the end
    would silently truncate and count the same suffix several times, which
    inflated the counts of everything near the tail.
    """
    total = len(input_string)
    substrings = Counter()
    for start in range(total) :
        # Cap the window at the end of the text so every slice has its full size.
        last_end = min(start+length, total)
        for end in range(start+1, last_end+1) :
            substrings[input_string[start:end]] += 1
    return substrings

In [12]:
def existMatrix(word,subject,dbname="mimic") :
    """Tell whether ``matrix`` already holds a row for this (subject, word) pair.

    Parameters
    ----------
    word : str
        Value compared with the ``word`` column.
    subject : int or str
        Value compared with the ``subject_id`` column.
    dbname : str
        Name of the PostgreSQL database to connect to.

    Returns
    -------
    bool
        True when at least one matching row exists.

    Notes
    -----
    Both values are sent as bound string parameters, exactly as the quoted
    literals they replace, so a quote inside ``word`` can no longer break or
    alter the statement.
    """
    select_stament = ("SELECT 1 "
                      " FROM matrix WHERE subject_id=%s AND word=%s"
    )
    conn = psycopg2.connect("dbname="+dbname)
    try :
        cur = conn.cursor()
        try :
            cur.execute(select_stament, (str(subject), str(word)))
            # One row is enough to answer; no need to walk the whole result.
            return cur.fetchone() is not None
        finally :
            cur.close()
    finally :
        conn.close()

In [13]:
def saveMatrix(matrix,dbname="mimic") :
    """Bulk-insert substring counts into the ``matrix`` table and commit.

    Parameters
    ----------
    matrix : iterable of dict
        Each entry needs the keys ``'word'``, ``'subject_id'`` and
        ``'counting'``.
    dbname : str
        Name of the PostgreSQL database to connect to.

    Notes
    -----
    The three columns are sent as parallel arrays and expanded server-side
    with ``unnest``, so the whole batch is one statement. An empty batch
    returns immediately without opening a connection.
    """
    rows = list(matrix)
    if not rows :
        return
    insert_statement=('INSERT INTO matrix(word,subject_id,counting)'
                      ' SELECT unnest( %(word)s ) ,'
                      ' unnest( %(subject_id)s) ,'
                      ' unnest( %(counting)s)')
    # Pass exactly the values the statement names, not the whole local namespace.
    params = {
        'word': [r['word'] for r in rows],
        'subject_id': [r['subject_id'] for r in rows],
        'counting': [r['counting'] for r in rows],
    }
    conn = psycopg2.connect("dbname="+dbname)
    try :
        cur = conn.cursor()
        try :
            cur.execute(insert_statement, params)
            conn.commit()
        finally :
            cur.close()
    finally :
        conn.close()

In [14]:
def cleanMatrix(dbname="mimic") :
    """Delete every row of the ``matrix`` table and commit the change."""
    connection = psycopg2.connect("dbname="+dbname)
    cursor = connection.cursor()
    cursor.execute("DELETE "
                   " FROM matrix")
    connection.commit()
    cursor.close()
    connection.close()

In [15]:
def fillMatrix():
    """Rebuild the ``matrix`` table from the words returned by ``selectWord()``.

    The table is emptied, then for each selected (subject, word) pair the
    substrings of up to 10 characters are counted and saved as one batch.
    Finally the number of batches saved is printed.
    """
    cleanMatrix()
    saved = 0
    for entry in selectWord() :
        subject = entry[0]
        counts = get_all_substrings(entry[1],length=10)
        batch = [{'word':token,'counting':counts[token],'subject_id':subject}
                 for token in counts]
        if batch :
            saved += 1
            saveMatrix(batch)
    print("The matrix was filled with "+str(saved)+" values.")

In [16]:
def selectMatrix(dbname="mimic") :
    """Return every (subject_id, word, counting) row of ``matrix``, ordered by subject_id."""
    connection = psycopg2.connect("dbname="+dbname)
    cursor = connection.cursor()
    cursor.execute("SELECT subject_id,word,counting "
                   " FROM matrix ORDER BY subject_id")
    # Rows are kept exactly as the driver yields them.
    rows = [row for row in cursor]
    cursor.close()
    connection.close()
    return rows

In [17]:
def convertMatrix() :
    """Pivot the rows of ``matrix`` into one row per subject and one column per word.

    Prints the number of long-format rows read, then returns a pivot table
    indexed by ``subject_id`` whose cells hold the summed ``Counting`` for
    each ``Word``; missing combinations are filled with 0.
    """
    records = selectMatrix()
    df = pd.DataFrame.from_records(records, columns=['subject_id', 'Word', 'Counting'])
    print(len(df))
    pivoted = pd.pivot_table(
        df,
        values=["Counting"],
        index=["subject_id"],
        columns=["Word"],
        aggfunc={"Counting":[np.sum]},
        fill_value=0,
    )
    return pivoted

In [18]:
# Stages whose return value is not used: each one is timed, reported, and
# followed by a garbage-collection pass.
for step in (fillsubjectRecord, createListOfWords, fillMatrix) :
    t0 = time()
    step()
    print("done in %0.3fs." % (time() - t0))
    gc.collect()
# Last stage: the pivot table it returns is kept in `table` for the cells below.
t0 = time()
table = convertMatrix()
print("done in %0.3fs." % (time() - t0))
gc.collect()


done in 67.219s.

In a list of 737 we know the status of 713 patients and save 584 wordforms of the patients with more that 1000 heartbeats
done in 201.093s.
The matrix was filled with 400 values.
done in 147.987s.
1783985
done in 227.170s.
Out[18]:
14

In [19]:
# Factorise the subject-by-substring count table into 30 non-negative components:
# W holds one row per row of `table`, H one row per component.
nmf = NMF(
    n_components=30,
    random_state=1,
    alpha=.1,
    l1_ratio=.5,
)
W = nmf.fit_transform(table)
H = nmf.components_
print(W.shape)
print(H.shape)


(400, 30)
(30, 310432)

In [20]:
# One status lookup per row of the pivot table, in index order; subjects for
# which the lookup returns nothing are left out.
statuses = (patientIsAlive(patient) for patient in table.index)
patients = [status for status in statuses if status != []]

In [21]:
# Collapse the one-element status lists into a flat 1-D label vector,
# then fit a logistic regression on the NMF features.
y = np.asarray(patients).ravel()
modelo_lr = LogisticRegression()
modelo_lr.fit(X=W, y=y)


Out[21]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [22]:
# Score of the fitted classifier on (W, y) -- the same data it was trained on,
# so this is an in-sample figure, not a held-out estimate.
modelo_lr.score(W, y)


Out[22]:
0.88

In [23]:
# Number of status entries collected for the rows of `table`.
print(len(patients))


400

In [24]:
# Share of labels equal to 1 (patientIsAlive yields 1 when `dod` is not NULL);
# a classifier that always predicts the majority class scores max(mean, 1 - mean).
y.mean()


Out[24]:
0.875

In [ ]: