In [ ]:
In [1]:
from time import time
import psycopg2
from collections import Counter
import gc
import pandas as pd
import numpy as np
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
from sklearn.metrics import accuracy_score
np.set_printoptions(suppress=True,precision=10)
In [3]:
def cleanMatrix(dbname="mimic") :
conn = psycopg2.connect("dbname="+dbname)
cur = conn.cursor()
select_stament = ("DELETE "
" FROM matrix"
)
cur.execute(select_stament)
conn.commit()
cur.close()
conn.close()
In [4]:
def selectWord(dbname="mimic") :
conn = psycopg2.connect("dbname="+dbname)
cur = conn.cursor()
select_stament = '''SELECT subject_id,word,isalive
FROM subjectwords
WHERE length(word)>1000
AND ((isalive=0 AND length(word)>3750) OR isalive=1)
'''
cur.execute(select_stament)
select = []
for row in cur :
select.append({'subject_id':row[0],'word':row[1],'isalive':row[2]})
conn.close()
return select
In [5]:
def get_all_substrings(input_string,length=5):
substrings = []
for j in range(len(input_string)) :
for i in range(length) :
substrings.append(input_string[j:j+i+1])
return Counter(substrings)
In [6]:
def saveMatrix(matrix,dbname="mimic") :
conn = psycopg2.connect("dbname="+dbname)
cur = conn.cursor()
insert_statement=('INSERT INTO matrix(word,subject_id,counting)'
' SELECT unnest( %(word)s ) ,'
' unnest( %(subject_id)s) ,'
' unnest( %(counting)s)')
word=[r['word'] for r in matrix]
subject_id=[r['subject_id'] for r in matrix]
counting=[r['counting'] for r in matrix]
# print(cur.mogrify(insert_statement,locals()))
cur.execute(insert_statement,locals())
conn.commit()
cur.close()
conn.close()
In [7]:
def fillMatrix(length):
cleanMatrix()
i=0
matrix = []
for word in selectWord() :
subject = word['subject_id']
subs =get_all_substrings(word['word'],length=length)
for key in subs:
matrix.append({'word':key,'counting':subs[key],'subject_id':subject})
saveMatrix(matrix)
print("The matrix was filled with "+str(len(matrix))+" values.")
In [8]:
def fixMatrix(dbname="mimic") :
conn = psycopg2.connect("dbname="+dbname)
cur = conn.cursor()
select_stament = ("DELETE "
" FROM matrix"
" WHERE word in (SELECT word FROM matrix GROUP BY word HAVING sum(counting)<5)"
)
cur.execute(select_stament)
select_stament = '''DELETE
FROM matrix
WHERE word IN (SELECT word FROM matrix group by word HAVING count(1)>=(
SELECT count(distinct subject_id) FROM matrix))
'''
cur.execute(select_stament)
conn.commit()
cur.close()
conn.close()
In [9]:
def selectMatrix(dbname="mimic") :
conn = psycopg2.connect("dbname="+dbname)
cur = conn.cursor()
select_stament = ("SELECT m.subject_id,m.word,m.counting,s.isalive "
" FROM matrix m LEFT JOIN subjectwords s ON m.subject_id=s.subject_id"
)
cur.execute(select_stament)
select = []
for row in cur :
select.append((row))
cur.close()
conn.close()
return select
In [10]:
def convertMatrix() :
labels = ['subject_id', 'Word', 'Counting','isAlive']
df = pd.DataFrame.from_records(selectMatrix(), columns=labels)
print(len(df))
return pd.pivot_table(df,index=["subject_id","isAlive"],columns=["Word"],values=["Counting"],
aggfunc={"Counting":[np.sum]},fill_value=0)
In [11]:
t0 = time()
fillMatrix(length=8)
print("fillMatrix done in %0.3fs." % (time() - t0))
In [12]:
gc.collect()
t0 = time()
fixMatrix()
print("fixMatrix done in %0.3fs." % (time() - t0))
gc.collect()
t0 = time()
table = convertMatrix()
print("converMatrix done in %0.3fs." % (time() - t0))
print(table.shape)
gc.collect()
Out[12]:
In [13]:
survived = table.index.labels[1].tolist()
patients = table.values
In [14]:
patients_train, patients_test,survived_train, survived_test = train_test_split(patients,survived,test_size=0.2, random_state=42)
In [15]:
print(table.shape)
print(patients_train.shape)
print(patients_test.shape)