In [ ]:


In [1]:
from time import time
import psycopg2
from collections import Counter
import gc
import pandas as pd
import numpy as np
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
from sklearn.metrics import accuracy_score
np.set_printoptions(suppress=True,precision=10)

In [3]:
def cleanMatrix(dbname="mimic") :
    conn = psycopg2.connect("dbname="+dbname)
    cur = conn.cursor()
    select_stament = ("DELETE "
                      " FROM matrix"
    )
    cur.execute(select_stament)
    conn.commit()
    cur.close()
    conn.close()

In [4]:
def selectWord(dbname="mimic") :
    conn = psycopg2.connect("dbname="+dbname)
    cur = conn.cursor()
    select_stament = '''SELECT subject_id,word,isalive 
    FROM subjectwords 
    WHERE length(word)>1000 
    AND ((isalive=0 AND length(word)>3750) OR isalive=1)
    ''' 
    cur.execute(select_stament)
    select = []
    for row in cur :
        select.append({'subject_id':row[0],'word':row[1],'isalive':row[2]})
    conn.close()
    return select

In [5]:
def get_all_substrings(input_string,length=5):
    substrings = []
    for j in range(len(input_string)) :
        for i in range(length) :
            substrings.append(input_string[j:j+i+1])
    return Counter(substrings)

In [6]:
def saveMatrix(matrix,dbname="mimic") :
    conn = psycopg2.connect("dbname="+dbname)
    cur = conn.cursor()
    insert_statement=('INSERT INTO matrix(word,subject_id,counting)'
                      ' SELECT unnest( %(word)s ) ,'
                      ' unnest( %(subject_id)s) ,'
                      ' unnest( %(counting)s)')
    word=[r['word'] for r in matrix]
    subject_id=[r['subject_id'] for r in matrix]
    counting=[r['counting'] for r in matrix]
#    print(cur.mogrify(insert_statement,locals()))
    cur.execute(insert_statement,locals())
    conn.commit()
    cur.close()
    conn.close()

In [7]:
def fillMatrix(length):
    cleanMatrix()
    i=0
    matrix = []
    for word in selectWord() :
        subject = word['subject_id']
        subs =get_all_substrings(word['word'],length=length)
        for key in subs:
            matrix.append({'word':key,'counting':subs[key],'subject_id':subject})
    saveMatrix(matrix)
    print("The matrix was filled with "+str(len(matrix))+" values.")

In [8]:
def fixMatrix(dbname="mimic") :
    conn = psycopg2.connect("dbname="+dbname)
    cur = conn.cursor()
    select_stament = ("DELETE "
                      " FROM matrix"
                      " WHERE word in (SELECT word FROM matrix GROUP BY word HAVING sum(counting)<5)"
    )
    cur.execute(select_stament)
    select_stament = '''DELETE 
    FROM matrix 
    WHERE word IN (SELECT word FROM matrix group by word HAVING count(1)>=(
    SELECT count(distinct subject_id) FROM matrix))
    '''
    cur.execute(select_stament)
    conn.commit()
    cur.close()
    conn.close()

In [9]:
def selectMatrix(dbname="mimic") :
    conn = psycopg2.connect("dbname="+dbname)
    cur = conn.cursor()
    select_stament = ("SELECT m.subject_id,m.word,m.counting,s.isalive "
                      " FROM matrix m LEFT JOIN subjectwords s ON m.subject_id=s.subject_id"
    )
    cur.execute(select_stament)
    select = []
    for row in cur :
        select.append((row))
    cur.close()
    conn.close()
    return select

In [10]:
def convertMatrix() :
    labels = ['subject_id', 'Word', 'Counting','isAlive']
    df = pd.DataFrame.from_records(selectMatrix(), columns=labels)
    print(len(df))
    return pd.pivot_table(df,index=["subject_id","isAlive"],columns=["Word"],values=["Counting"],
                       aggfunc={"Counting":[np.sum]},fill_value=0)

In [11]:
t0 = time()
fillMatrix(length=8)
print("fillMatrix done in %0.3fs." % (time() - t0))


The matrix was filled with 1102152 values.
fillMatrix done in 80.318s.

In [12]:
gc.collect()
t0 = time()
fixMatrix()
print("fixMatrix done in %0.3fs." % (time() - t0))
gc.collect()
t0 = time()
table = convertMatrix()
print("converMatrix done in %0.3fs." % (time() - t0))
print(table.shape)
gc.collect()


fixMatrix done in 46.535s.
961128
converMatrix done in 209.236s.
(545, 28340)
Out[12]:
14

In [13]:
survived = table.index.labels[1].tolist()
patients = table.values

In [14]:
patients_train, patients_test,survived_train, survived_test = train_test_split(patients,survived,test_size=0.2, random_state=42)

In [15]:
print(table.shape)
print(patients_train.shape)
print(patients_test.shape)


(545, 28340)
(436, 28340)
(109, 28340)