Fill the Matrix

In this notebook we fill the values of the matrix


In [1]:
from time import time
import psycopg2
from collections import Counter
import gc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

np.set_printoptions(suppress=True,precision=10)

In [2]:
import sys 
import os
sys.path.append(os.path.abspath("/home/scidb/HeartRatePatterns/Python"))
from Matrix import convert_matrix

In [3]:
def cleanMatrix(dbname="mimic") :
    conn = psycopg2.connect("dbname="+dbname)
    cur = conn.cursor()
    select_stament = "DELETE FROM matrix"
    cur.execute(select_stament)
    conn.commit()
    cur.close()
    conn.close()

In [4]:
def selectWord(dbname="mimic") :
    conn = psycopg2.connect("dbname="+dbname)
    cur = conn.cursor()
    select_stament = '''SELECT subject_id,word,isalive 
    FROM subjectwords 
    WHERE (isalive=0 AND LENGTH(word)>15000) OR isalive=1 
    ''' 
    cur.execute(select_stament)
    select = []
    for row in cur :
        select.append({'subject_id':row[0],'word':row[1],'isalive':row[2]})
    conn.close()
    return select

In [5]:
def get_all_substrings(input_string,minimun=0,maximun=5):
    substrings = []
    for j in range(len(input_string)) :
        for i in range(minimun,maximun) :
            substring = input_string[j:j+i+1]
            if len(substring)>minimun :
                substrings.append(substring)
    return Counter(substrings)

In [6]:
def saveMatrix(matrix,dbname="mimic") :
    conn = psycopg2.connect("dbname="+dbname)
    cur = conn.cursor()
    insert_statement=('INSERT INTO matrix(word,subject_id,counting)'
                      ' SELECT unnest( %(word)s ) ,'
                      ' unnest( %(subject_id)s) ,'
                      ' unnest( %(counting)s)')
    word=[r['word'] for r in matrix]
    subject_id=[r['subject_id'] for r in matrix]
    counting=[r['counting'] for r in matrix]
  #  print(cur.mogrify(insert_statement,locals()))
    cur.execute(insert_statement,locals())
    conn.commit()
    cur.close()
    conn.close()

In [7]:
def fillMatrix(minimun=0,maximun=5):
    t0 = time()
    cleanMatrix()
    i,lenMatrix=0,0
    for word in selectWord() :
        matrix = []
        subject = word['subject_id']
        subs =get_all_substrings(word['word'],minimun,maximun)
        for key in subs:
            matrix.append({'word':key,'counting':subs[key],'subject_id':subject})
        if matrix!=[]:
            saveMatrix(matrix)
        lenMatrix=lenMatrix+len(matrix)
    print("The matrix was filled with "+str(lenMatrix)+" values.")
    print("fillMatrix done in %0.3fs." % (time() - t0))

In [8]:
def fixMatrix(dbname="mimic") :
    t0 = time()
    conn = psycopg2.connect("dbname="+dbname)
    cur = conn.cursor()
    def executeDelete(cur,conn,select_stament):
        cur.execute(select_stament)
        conn.commit()
    repeated_words = ("DELETE FROM matrix WHERE word "
                      "IN (SELECT word FROM matrix GROUP BY word HAVING sum(counting)<5)")
    executeDelete(cur,conn,repeated_words)
    select_stament = ("DELETE FROM matrix WHERE subject_id "
                      "IN (SELECT subject_id FROM matrix GROUP BY subject_id HAVING sum(counting)<10)"
    )
    executeDelete(cur,conn,select_stament)
    select_stament = '''DELETE FROM matrix WHERE word 
    IN (SELECT word FROM matrix group by word HAVING count(1)>=(
    SELECT count(distinct subject_id) FROM matrix)*.95)
    '''
    executeDelete(cur,conn,select_stament)
    executeDelete(cur,conn,repeated_words)
    cur.close()
    conn.close()
    print("fixMatrix done in %0.3fs." % (time() - t0))

In [9]:
def heartBeatDeads(dbname="mimic") :
    conn = psycopg2.connect("dbname="+dbname)
    cur = conn.cursor()
    select_stament = '''SELECT DISTINCT word FROM matrix 
    WHERE subject_id IN (SELECT subject_id FROM subjectwords WHERE isalive=1)'''
    cur.execute(select_stament)
    select = []
    for row in cur :
        select.append(row[0])
    cur.close()
    conn.close()
    print(len(select))
    return select

In [10]:
def deleteOnlyAlive(heardBeatDeads,dbname="mimic") :
    conn = psycopg2.connect("dbname="+dbname)
    cur = conn.cursor()
    select_stament = "DELETE FROM matrix WHERE word in %s"
#    print(cur.mogrify(select_stament,(heardBeatDeads,)))
    cur.execute(select_stament,(heardBeatDeads,))
    conn.commit()
    cur.close()
    conn.close()

In [11]:
def printSurvivor(survived):
    labels = 'Viven', 'Mueren'
    sizes = [survived.count(0),survived.count(1)]
    def make_autopct(values):
        def my_autopct(pct):
            total = sum(values)
            val = int(round(pct*total/100.0))
            return '{p:.2f}%  ({v:d})'.format(p=pct,v=val)
        return my_autopct
    plt.title("Número de pacientes "+str(sum(sizes)))
    plt.pie(sizes, labels=labels, autopct=make_autopct(sizes),
            shadow=True, startangle=90)
    plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
    plt.show()

llenado de la matriz con palabras entre un maximo y un minimo


In [12]:
fillMatrix(minimun=2,maximun=8)


The matrix was filled with 735369 values.
fillMatrix done in 73.214s.

In [13]:
fixMatrix()


fixMatrix done in 5.562s.

In [14]:
#deadBeats = heartBeatDeads()
#deleteOnlyAlive(tuple(deadBeats))

In [15]:
table = convert_matrix()
table


(590, 18683)
Out[15]:
aaaaac aaaaad aaaaae aaaaaf aaaaag aaaaah aaaaai aaaaaj aaaabb aaaabc ... kkb kke lab lbb lbbb lbbc lcc leb lib libb
subject_id isAlive
20 0 0 18 0 1 0 0 0 0 5 0 ... 0 0 0 0 0 0 0 0 0 0
135 1 0 20 0 1 2 0 0 0 55 0 ... 0 0 0 0 0 0 0 0 0 0
151 1 2 80 0 0 0 0 0 0 312 3 ... 0 0 0 0 0 0 0 0 0 0
177 1 0 11 0 9 1 0 0 0 45 0 ... 0 0 0 0 0 0 0 0 0 0
214 1 1 54 0 34 9 0 0 0 104 2 ... 0 0 0 0 0 0 0 0 0 0
263 1 0 1 0 0 2 0 0 0 4 0 ... 0 0 0 0 0 0 0 0 0 0
279 1 0 5 0 0 0 0 0 0 65 0 ... 0 0 0 0 0 0 0 0 0 0
283 1 0 0 0 0 0 0 0 0 51 0 ... 0 0 0 0 0 0 0 0 0 0
368 1 54 5 0 3 2 0 0 0 350 14 ... 0 0 0 0 0 0 0 0 0 0
377 1 10 84 0 17 7 2 0 0 133 1 ... 0 0 0 0 0 0 0 0 0 0
408 1 0 297 0 5 1 0 0 1 5 0 ... 0 0 0 0 0 0 0 0 0 0
462 0 0 15 0 4 0 0 0 0 86 0 ... 0 0 0 0 0 0 0 0 0 0
618 1 0 4 0 0 0 0 0 0 34 1 ... 0 0 0 0 0 0 0 0 0 0
638 1 1 24 0 3 1 2 0 2 116 0 ... 0 0 0 0 0 0 0 0 0 0
682 1 6 116 1 382 81 18 0 2 3 5 ... 0 0 0 0 0 0 0 0 0 0
736 0 0 0 1 0 0 0 0 0 43 0 ... 0 0 0 0 0 0 0 0 0 0
743 1 3 5 0 0 0 0 0 0 166 0 ... 0 0 0 0 0 0 0 0 0 0
749 1 1 295 0 32 4 0 0 0 26 0 ... 0 0 0 0 0 0 0 0 0 0
793 1 2 625 0 46 14 1 0 0 28 0 ... 0 0 0 0 0 0 0 0 0 0
886 1 0 572 0 69 16 1 0 4 2 0 ... 0 0 0 0 0 0 0 0 0 0
952 1 1 12 0 1 1 1 0 0 20 0 ... 0 0 0 0 0 0 0 0 0 0
974 0 1 20 0 1 0 0 0 0 104 1 ... 0 0 0 0 0 0 0 0 0 0
1004 1 0 1 0 0 0 0 0 0 5 0 ... 0 0 0 0 0 0 0 0 0 0
1075 1 8 201 0 18 0 1 0 1 117 2 ... 0 0 0 0 0 0 0 0 0 0
1144 0 10 134 0 11 3 1 0 0 424 1 ... 0 0 0 0 0 0 0 0 0 0
1160 0 0 116 0 2 2 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1222 0 0 61 0 12 2 1 0 1 146 0 ... 0 0 0 0 0 0 0 0 0 0
1226 1 1 2 0 0 0 0 0 0 81 0 ... 0 0 0 0 0 0 0 0 0 0
1459 0 0 163 0 5 0 0 0 0 273 0 ... 0 0 0 0 0 0 0 0 0 0
1528 1 1 155 0 91 12 3 0 1 47 0 ... 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
23178 1 153 59 0 5 1 0 0 0 293 15 ... 0 0 0 0 0 0 0 0 0 0
23193 0 1 15 0 2 0 0 0 0 147 1 ... 0 0 0 0 0 0 0 0 0 0
23200 0 14 0 0 0 0 0 0 0 7 5 ... 0 4 0 1 1 0 0 1 0 0
23298 0 12 18 3 4 0 2 0 0 400 61 ... 0 0 0 0 0 0 0 0 0 0
23336 1 3 29 0 0 0 1 0 0 68 3 ... 0 0 0 0 0 0 0 0 0 0
23339 0 0 1 0 0 1 0 0 0 35 0 ... 0 0 0 0 0 0 0 0 0 0
23363 1 3 45 0 6 2 1 0 0 6 0 ... 0 0 0 0 0 0 0 0 0 0
23384 0 0 0 0 0 0 0 0 0 25 0 ... 0 0 0 0 0 0 0 0 0 0
23401 1 0 154 0 1 1 0 0 0 25 0 ... 0 0 0 0 0 0 0 0 0 0
23451 1 0 0 0 0 0 0 0 0 69 0 ... 0 0 0 0 0 0 0 0 0 0
23468 1 26 75 0 2 0 0 0 0 148 23 ... 0 0 0 0 0 0 0 0 0 0
23474 1 2 7 0 5 0 0 0 0 210 2 ... 0 0 0 0 0 0 0 0 0 0
23510 1 0 5 0 1 0 0 0 0 62 1 ... 0 0 0 0 0 0 0 0 0 0
23944 1 17 0 0 0 0 0 0 0 88 30 ... 0 0 0 0 0 0 0 0 0 0
24004 1 29 92 0 21 0 1 0 1 100 5 ... 0 0 0 0 0 0 0 0 0 0
24030 0 0 2 0 0 1 0 0 0 52 0 ... 0 0 0 0 0 0 0 0 0 0
24076 1 0 51 0 1 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 0 0
24129 1 0 6 0 0 0 0 0 1 261 0 ... 0 0 0 0 0 0 0 0 0 0
24133 0 251 6 0 0 0 0 0 0 19 19 ... 0 0 0 0 0 0 0 0 0 0
24142 1 1 21 0 1 1 1 0 0 369 1 ... 0 0 0 0 0 0 0 0 0 0
24152 1 0 7 1 0 0 0 0 0 53 0 ... 0 0 0 0 0 0 0 0 0 0
24185 1 1 11 0 0 0 0 0 0 333 1 ... 0 0 0 0 0 0 0 0 0 0
24227 0 3 61 0 10 2 0 0 1 62 2 ... 0 0 0 0 0 0 0 0 0 0
25466 0 0 3 0 0 0 0 0 0 247 0 ... 0 0 0 0 0 0 0 0 0 0
41962 1 0 187 0 42 6 4 0 4 28 0 ... 0 0 0 0 0 0 0 0 0 0
42255 1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
42261 0 0 45 0 0 0 0 0 0 233 1 ... 0 0 0 0 0 0 0 0 0 0
42410 0 11 26 1 0 0 0 0 0 119 1 ... 0 0 0 0 0 0 0 0 0 0
42492 0 5 53 0 11 3 0 1 0 55 4 ... 0 0 0 0 0 0 0 0 0 0
43459 0 0 0 0 0 0 0 0 0 42 0 ... 0 0 0 0 0 0 0 0 0 0

590 rows × 18683 columns


In [16]:
printSurvivor(table.index.labels[1].tolist())



In [17]:
def findAverage(table):
    numRegs,count = 0,0
    for value in table.columns:
        total = table[value].sum()
        numRegs += total
        count +=1
    average =int(numRegs/count)
    print(numRegs,count,"average",average)
    return average

In [18]:
def findOverAverage(table):
    count=0
    underaverage = []
    average=findAverage(table)
    for value in table.columns:
        total = table[value].sum()
        if total>average:
            underaverage.append(value)
            count+=1
#            print("%s:%d" %(value,total,),end="|")
    return underaverage

In [19]:
#over_average = findOverAverage(table)
#deleteOnlyAlive(tuple(over_average))

In [20]:
table = convert_matrix()
table


(590, 18683)
Out[20]:
aaaaac aaaaad aaaaae aaaaaf aaaaag aaaaah aaaaai aaaaaj aaaabb aaaabc ... kkb kke lab lbb lbbb lbbc lcc leb lib libb
subject_id isAlive
20 0 0 18 0 1 0 0 0 0 5 0 ... 0 0 0 0 0 0 0 0 0 0
135 1 0 20 0 1 2 0 0 0 55 0 ... 0 0 0 0 0 0 0 0 0 0
151 1 2 80 0 0 0 0 0 0 312 3 ... 0 0 0 0 0 0 0 0 0 0
177 1 0 11 0 9 1 0 0 0 45 0 ... 0 0 0 0 0 0 0 0 0 0
214 1 1 54 0 34 9 0 0 0 104 2 ... 0 0 0 0 0 0 0 0 0 0
263 1 0 1 0 0 2 0 0 0 4 0 ... 0 0 0 0 0 0 0 0 0 0
279 1 0 5 0 0 0 0 0 0 65 0 ... 0 0 0 0 0 0 0 0 0 0
283 1 0 0 0 0 0 0 0 0 51 0 ... 0 0 0 0 0 0 0 0 0 0
368 1 54 5 0 3 2 0 0 0 350 14 ... 0 0 0 0 0 0 0 0 0 0
377 1 10 84 0 17 7 2 0 0 133 1 ... 0 0 0 0 0 0 0 0 0 0
408 1 0 297 0 5 1 0 0 1 5 0 ... 0 0 0 0 0 0 0 0 0 0
462 0 0 15 0 4 0 0 0 0 86 0 ... 0 0 0 0 0 0 0 0 0 0
618 1 0 4 0 0 0 0 0 0 34 1 ... 0 0 0 0 0 0 0 0 0 0
638 1 1 24 0 3 1 2 0 2 116 0 ... 0 0 0 0 0 0 0 0 0 0
682 1 6 116 1 382 81 18 0 2 3 5 ... 0 0 0 0 0 0 0 0 0 0
736 0 0 0 1 0 0 0 0 0 43 0 ... 0 0 0 0 0 0 0 0 0 0
743 1 3 5 0 0 0 0 0 0 166 0 ... 0 0 0 0 0 0 0 0 0 0
749 1 1 295 0 32 4 0 0 0 26 0 ... 0 0 0 0 0 0 0 0 0 0
793 1 2 625 0 46 14 1 0 0 28 0 ... 0 0 0 0 0 0 0 0 0 0
886 1 0 572 0 69 16 1 0 4 2 0 ... 0 0 0 0 0 0 0 0 0 0
952 1 1 12 0 1 1 1 0 0 20 0 ... 0 0 0 0 0 0 0 0 0 0
974 0 1 20 0 1 0 0 0 0 104 1 ... 0 0 0 0 0 0 0 0 0 0
1004 1 0 1 0 0 0 0 0 0 5 0 ... 0 0 0 0 0 0 0 0 0 0
1075 1 8 201 0 18 0 1 0 1 117 2 ... 0 0 0 0 0 0 0 0 0 0
1144 0 10 134 0 11 3 1 0 0 424 1 ... 0 0 0 0 0 0 0 0 0 0
1160 0 0 116 0 2 2 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1222 0 0 61 0 12 2 1 0 1 146 0 ... 0 0 0 0 0 0 0 0 0 0
1226 1 1 2 0 0 0 0 0 0 81 0 ... 0 0 0 0 0 0 0 0 0 0
1459 0 0 163 0 5 0 0 0 0 273 0 ... 0 0 0 0 0 0 0 0 0 0
1528 1 1 155 0 91 12 3 0 1 47 0 ... 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
23178 1 153 59 0 5 1 0 0 0 293 15 ... 0 0 0 0 0 0 0 0 0 0
23193 0 1 15 0 2 0 0 0 0 147 1 ... 0 0 0 0 0 0 0 0 0 0
23200 0 14 0 0 0 0 0 0 0 7 5 ... 0 4 0 1 1 0 0 1 0 0
23298 0 12 18 3 4 0 2 0 0 400 61 ... 0 0 0 0 0 0 0 0 0 0
23336 1 3 29 0 0 0 1 0 0 68 3 ... 0 0 0 0 0 0 0 0 0 0
23339 0 0 1 0 0 1 0 0 0 35 0 ... 0 0 0 0 0 0 0 0 0 0
23363 1 3 45 0 6 2 1 0 0 6 0 ... 0 0 0 0 0 0 0 0 0 0
23384 0 0 0 0 0 0 0 0 0 25 0 ... 0 0 0 0 0 0 0 0 0 0
23401 1 0 154 0 1 1 0 0 0 25 0 ... 0 0 0 0 0 0 0 0 0 0
23451 1 0 0 0 0 0 0 0 0 69 0 ... 0 0 0 0 0 0 0 0 0 0
23468 1 26 75 0 2 0 0 0 0 148 23 ... 0 0 0 0 0 0 0 0 0 0
23474 1 2 7 0 5 0 0 0 0 210 2 ... 0 0 0 0 0 0 0 0 0 0
23510 1 0 5 0 1 0 0 0 0 62 1 ... 0 0 0 0 0 0 0 0 0 0
23944 1 17 0 0 0 0 0 0 0 88 30 ... 0 0 0 0 0 0 0 0 0 0
24004 1 29 92 0 21 0 1 0 1 100 5 ... 0 0 0 0 0 0 0 0 0 0
24030 0 0 2 0 0 1 0 0 0 52 0 ... 0 0 0 0 0 0 0 0 0 0
24076 1 0 51 0 1 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 0 0
24129 1 0 6 0 0 0 0 0 1 261 0 ... 0 0 0 0 0 0 0 0 0 0
24133 0 251 6 0 0 0 0 0 0 19 19 ... 0 0 0 0 0 0 0 0 0 0
24142 1 1 21 0 1 1 1 0 0 369 1 ... 0 0 0 0 0 0 0 0 0 0
24152 1 0 7 1 0 0 0 0 0 53 0 ... 0 0 0 0 0 0 0 0 0 0
24185 1 1 11 0 0 0 0 0 0 333 1 ... 0 0 0 0 0 0 0 0 0 0
24227 0 3 61 0 10 2 0 0 1 62 2 ... 0 0 0 0 0 0 0 0 0 0
25466 0 0 3 0 0 0 0 0 0 247 0 ... 0 0 0 0 0 0 0 0 0 0
41962 1 0 187 0 42 6 4 0 4 28 0 ... 0 0 0 0 0 0 0 0 0 0
42255 1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
42261 0 0 45 0 0 0 0 0 0 233 1 ... 0 0 0 0 0 0 0 0 0 0
42410 0 11 26 1 0 0 0 0 0 119 1 ... 0 0 0 0 0 0 0 0 0 0
42492 0 5 53 0 11 3 0 1 0 55 4 ... 0 0 0 0 0 0 0 0 0 0
43459 0 0 0 0 0 0 0 0 0 42 0 ... 0 0 0 0 0 0 0 0 0 0

590 rows × 18683 columns


In [21]:
printSurvivor(table.index.labels[1].tolist())



In [ ]: