Fill the Matrix

In this notebook we fill the values of the matrix


In [1]:
from time import time
import psycopg2
from collections import Counter
import gc
import pandas as pd
import numpy as np
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
from sklearn.metrics import accuracy_score
np.set_printoptions(suppress=True,precision=10)

In [2]:
def cleanMatrix(dbname="mimic") :
    conn = psycopg2.connect("dbname="+dbname)
    cur = conn.cursor()
    select_stament = ("DELETE "
                      " FROM matrix"
    )
    cur.execute(select_stament)
    conn.commit()
    cur.close()
    conn.close()

In [3]:
def selectWord(dbname="mimic") :
    conn = psycopg2.connect("dbname="+dbname)
    cur = conn.cursor()
    select_stament = '''SELECT subject_id,word,isalive 
    FROM subjectwords 
    WHERE ((isalive=0 AND length(word)>3750) OR isalive=1) 
    ''' 
#    AND length(word)>1000
    cur.execute(select_stament)
    select = []
    for row in cur :
        select.append({'subject_id':row[0],'word':row[1],'isalive':row[2]})
    conn.close()
    return select

In [4]:
def get_all_substrings(input_string,length=5):
    substrings = []
    for j in range(len(input_string)) :
        for i in range(length) :
            substrings.append(input_string[j:j+i+1])
    return Counter(substrings)

In [5]:
def saveMatrix(matrix,dbname="mimic") :
    conn = psycopg2.connect("dbname="+dbname)
    cur = conn.cursor()
    insert_statement=('INSERT INTO matrix(word,subject_id,counting)'
                      ' SELECT unnest( %(word)s ) ,'
                      ' unnest( %(subject_id)s) ,'
                      ' unnest( %(counting)s)')
    word=[r['word'] for r in matrix]
    subject_id=[r['subject_id'] for r in matrix]
    counting=[r['counting'] for r in matrix]
  #  print(cur.mogrify(insert_statement,locals()))
    cur.execute(insert_statement,locals())
    conn.commit()
    cur.close()
    conn.close()

In [6]:
def fillMatrix(length):
    cleanMatrix()
    i=0
    matrix = []
    for word in selectWord() :
        subject = word['subject_id']
        subs =get_all_substrings(word['word'],length=length)
        for key in subs:
            matrix.append({'word':key,'counting':subs[key],'subject_id':subject})
    saveMatrix(matrix)
    print("The matrix was filled with "+str(len(matrix))+" values.")

In [7]:
def fixMatrix(dbname="mimic") :
    conn = psycopg2.connect("dbname="+dbname)
    cur = conn.cursor()
    select_stament = ("DELETE "
                      " FROM matrix"
                      " WHERE word in (SELECT word FROM matrix GROUP BY word HAVING sum(counting)<5)"
    )
    cur.execute(select_stament)
    select_stament = '''DELETE 
    FROM matrix 
    WHERE word IN (SELECT word FROM matrix group by word HAVING count(1)>=(
    SELECT count(distinct subject_id) FROM matrix))
    '''
    cur.execute(select_stament)
    conn.commit()
    cur.close()
    conn.close()

In [8]:
def selectMatrix(dbname="mimic") :
    conn = psycopg2.connect("dbname="+dbname)
    cur = conn.cursor()
    select_stament = ("SELECT m.subject_id,m.word,m.counting,s.isalive "
                      " FROM matrix m LEFT JOIN subjectwords s ON m.subject_id=s.subject_id"
    )
    cur.execute(select_stament)
    select = []
    for row in cur :
        select.append((row))
    cur.close()
    conn.close()
    return select

In [9]:
def convertMatrix() :
    labels = ['subject_id', 'Word', 'Counting','isAlive']
    df = pd.DataFrame.from_records(selectMatrix(), columns=labels)
    print(len(df))
    return pd.pivot_table(df,index=["subject_id","isAlive"],columns=["Word"],values=["Counting"],
                       aggfunc={"Counting":[np.sum]},fill_value=0)

In [10]:
t0 = time()
fillMatrix(length=8)
print("fillMatrix done in %0.3fs." % (time() - t0))


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

The matrix was filled with 5732402 values.
fillMatrix done in 1165.055s.

In [11]:
gc.collect()
t0 = time()
fixMatrix()
print("fixMatrix done in %0.3fs." % (time() - t0))
gc.collect()
t0 = time()
table = convertMatrix()
print("converMatrix done in %0.3fs." % (time() - t0))
print(table.shape)
gc.collect()


fixMatrix done in 876.777s.
4569945
converMatrix done in 85.299s.
(845, 168509)
Out[11]:
14

In [12]:
survived = table.index.labels[1].tolist()
patients = table.values

In [13]:
patients_train, patients_test,survived_train, survived_test = train_test_split(patients,survived,test_size=0.2, random_state=42)

In [14]:
print(table.shape)
print(patients_train.shape)
print(patients_test.shape)


(845, 168509)
(676, 168509)
(169, 168509)

In [ ]: