In [1]:
import pandas as pd
import psycopg2
import numpy as np
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
import sys 
import os
sys.path.append(os.path.abspath("/home/scidb/HeartRatePatterns/Python"))
from LogisticRegresion import ajustLogisticRegression
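
LogisticRegresion is a project-local module, so ajustLogisticRegression is not shown here. Judging from the output of cell In [13] below, it cross-validates a liblinear LogisticRegression over successively narrower grids of C and returns the fitted model together with the best C. A minimal sketch of that idea, with the signature and grid details assumed rather than taken from the module:

from sklearn.model_selection import cross_val_score

def ajustLogisticRegression_sketch(X_train, y_train, X_test,
                                   Cs=(0.0001, 0.001, 0.01, 0.1, 1, 10)):
    # X_test mirrors the real call signature but is unused in this sketch.
    # Score each candidate C with 5-fold CV, as the printed "CV averages" suggest.
    means = [cross_val_score(LogisticRegression(C=C, solver='liblinear'),
                             X_train, y_train, cv=5).mean() for C in Cs]
    best_C = Cs[int(np.argmax(means))]
    print("CV averages for values", list(Cs), "are:", means)
    print("Best C is", best_C)
    return LogisticRegression(C=best_C, solver='liblinear').fit(X_train, y_train), best_C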

In [3]:
def selectMatrix(withPearson, dbname="mimic"):
    """Fetch the word-count matrix joined with the survival label."""
    conn = psycopg2.connect("dbname=" + dbname)
    cur = conn.cursor()
    select_statement = ("SELECT m.subject_id, m.word, m.counting, s.isalive"
                        " FROM matrix m LEFT JOIN subjectwords s ON m.subject_id=s.subject_id")
    if withPearson:
        # Keep only words whose Pearson coefficient cleared the threshold.
        select_statement += " WHERE m.word IN (SELECT word FROM wordspearson WHERE p1>0.01)"
    cur.execute(select_statement)
    select = cur.fetchall()
    cur.close()
    conn.close()
    return select

In [4]:
def convertMatrix(withPearson=False):
    """Pivot the long (subject, word, count) rows into a subjects x words matrix."""
    labels = ['subject_id', 'Word', 'Counting', 'isAlive']
    df = pd.DataFrame.from_records(selectMatrix(withPearson), columns=labels)
    print(len(df))
    return pd.pivot_table(df, index=["subject_id", "isAlive"], columns=["Word"],
                          values=["Counting"], aggfunc={"Counting": [np.sum]}, fill_value=0)

In [5]:
def savePearson(pearson, dbname="mimic"):
    """Bulk-insert the per-word Pearson results via parallel unnest arrays."""
    conn = psycopg2.connect("dbname=" + dbname)
    cur = conn.cursor()
    insert_statement = ('INSERT INTO wordspearson(word,p1,p2)'
                        ' SELECT unnest( %(word)s ),'
                        ' unnest( %(p1)s ),'
                        ' unnest( %(p2)s )')
    params = {'word': [r['word'] for r in pearson],
              'p1': [r['p1'] for r in pearson],
              'p2': [r['p2'] for r in pearson]}
    cur.execute(insert_statement, params)
    conn.commit()
    cur.close()
    conn.close()
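
For reference, the insert assumes a wordspearson table shaped roughly like the following; the column types are an assumption, since the notebook never shows the DDL:

-- Hypothetical definition; adjust types to the real schema.
CREATE TABLE wordspearson (
    word text,
    p1   double precision,  -- Pearson correlation coefficient
    p2   double precision   -- two-tailed p-value
);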

In [6]:
table = convertMatrix()


961128

In [7]:
survived = table.index.labels[1].tolist()  # isAlive codes (0/1) from level 1 of the row MultiIndex
patients = table.values                    # dense subjects x words count matrix
columns = list(table.columns.values)       # tuples of ('Counting', 'sum', word)

In [8]:
pearsonList = []
for i in range(len(columns)):
    # pearsonr returns (correlation coefficient, two-tailed p-value).
    pearson = pearsonr(patients[:, i], survived)
    pearsonList.append({'word': columns[i][2], 'p1': pearson[0], 'p2': pearson[1]})
savePearson(pearsonList)

In [9]:
table = convertMatrix(True)


34946

In [10]:
survived = table.index.labels[1].tolist()
patients = table.values

In [11]:
patients_train, patients_test, survived_train, survived_test = train_test_split(
    patients, survived, test_size=0.2, random_state=42)
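
train_test_split does not stratify by default, so an imbalanced isAlive label can end up unevenly split. If that matters, stratify is a standard scikit-learn parameter; an optional variant of the call above:

patients_train, patients_test, survived_train, survived_test = train_test_split(
    patients, survived, test_size=0.2, random_state=42, stratify=survived)  # keep class ratios in both splits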

In [12]:
print(table.shape)
print(patients_train.shape)
print(patients_test.shape)


(500, 218)
(400, 218)
(100, 218)

In [13]:
logitmodelInitSAPS, best_val = ajustLogisticRegression(patients_train, survived_train, patients_test)


/usr/local/lib/python3.4/dist-packages/sklearn/svm/base.py:898: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  "the number of iterations.", ConvergenceWarning)
/usr/local/lib/python3.4/dist-packages/sklearn/svm/base.py:898: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  "the number of iterations.", ConvergenceWarning)
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.1s finished
CV averages for values [0.0001, 0.001, 0.01, 0.1, 1, 10] are:[ 0.77849832  0.74358607  0.74194313  0.73258437  0.73097569  0.68385533]
Best C is[ 0.0001]
/usr/local/lib/python3.4/dist-packages/sklearn/svm/base.py:898: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  "the number of iterations.", ConvergenceWarning)
/usr/local/lib/python3.4/dist-packages/sklearn/svm/base.py:898: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  "the number of iterations.", ConvergenceWarning)
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.0s finished
CV averages for values [0.0001, 0.00019, 0.00028000000000000003, 0.00037, 0.00046, 0.00055] are:[ 0.77849832  0.7683347   0.76560329  0.76094687  0.75321894  0.75034763]
Best C is[ 0.0001]
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.5s finished
CV averages for values [9.999999999999999e-06, 3.7e-05, 6.4e-05, 9.099999999999999e-05, 0.000118, 0.000145] are:[ 0.78420387  0.78420387  0.78168751  0.77849832  0.776614    0.77407574]
Best C is[  1.00000000e-05]
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.5s finished
CV averages for values [0.0001, 8.47e-05, 6.94e-05, 5.410000000000001e-05, 3.880000000000001e-05, 2.35e-05] are:[ 0.77849832  0.7792029   0.77983196  0.78356121  0.78420387  0.78420387]
Best C is[  3.88000000e-05]
CV averages for values [4.6450000000000004e-05, 4.339e-05, 4.033e-05, 3.727e-05, 3.4210000000000006e-05, 3.1150000000000005e-05] are:[ 0.78356121  0.78235834  0.78420387  0.78420387  0.78420387  0.78420387]
Best C is[  4.03300000e-05]
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.5s finished
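
The repeated ConvergenceWarning above means liblinear hit its default max_iter=100 before converging. The usual remedies are a larger iteration budget (a later cell tries max_iter=51200) or scaling the raw counts first; a hedged variant using standard scikit-learn pieces:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardize each word-count column, then fit with a higher iteration budget.
scaled_logit = make_pipeline(StandardScaler(),
                             LogisticRegression(solver='liblinear', max_iter=1000))
scaled_logit.fit(patients_train, survived_train)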

In [20]:
model = LogisticRegression(C=best_val, fit_intercept=True, penalty='l2',
                           dual=True, solver='liblinear', n_jobs=-1,
                           verbose=1, random_state=0)
model.fit(patients_train,survived_train)


[LibLinear]
/usr/local/lib/python3.4/dist-packages/sklearn/linear_model/logistic.py:1228: UserWarning: 'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = -1.
  " = {}.".format(self.n_jobs))
Out[20]:
LogisticRegression(C=4.0330000000000002e-05, class_weight=None, dual=True,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=-1, penalty='l2', random_state=0,
          solver='liblinear', tol=0.0001, verbose=1, warm_start=False)

In [21]:
print("Logistic Regression acurracy is %2.2f" % accuracy_score(survived_test,model.predict(patients_test)))


Logistic Regression accuracy is 0.64

In [19]:
print("Logistic Regression acurracy is %2.2f" % accuracy_score(survived_test,logitmodelInitSAPS.predict(patients_test)))


Logistic Regression accuracy is 0.64

In [16]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report

In [17]:
def aucModel(name,survived_test,model,patients_test):
    logit_roc_auc = roc_auc_score(survived_test,model.predict(patients_test))
    print(name+" AUC = %2.2f"% logit_roc_auc)
    return logit_roc_auc
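
Scoring roc_auc_score on hard 0/1 predictions collapses the ROC curve to a single operating point. Ranking by the positive-class probability is the more informative variant (predict_proba is standard scikit-learn API):

def aucModelProba(name, survived_test, model, patients_test):
    # Use the predicted probability of the positive class instead of hard labels.
    scores = model.predict_proba(patients_test)[:, 1]
    logit_roc_auc = roc_auc_score(survived_test, scores)
    print(name + " AUC = %2.2f" % logit_roc_auc)
    return logit_roc_auc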

In [18]:
model = LogisticRegression(penalty='l2', C=int(best_val),  # int() truncates C=4.033e-05 to 0!
                           class_weight="balanced", dual=True,
                           solver='liblinear', random_state=0, max_iter=51200)
model.fit(patients_train, survived_train)
print("Logistic Regression accuracy is %2.2f" % accuracy_score(survived_test, model.predict(patients_test)))
logit_roc_auc = aucModel("Logistic", survived_test, model, patients_test)
logit_roc_auc = aucModel("Logistic",survived_test,model,patients_test)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-18-174e43c07d8c> in <module>()
      2                                                 dual=True,  solver='liblinear',
      3                                                 random_state=0, max_iter=51200)
----> 4 model.fit(patients_train,survived_train)
      5 print("Logistic Regression acurracy is %2.2f" % accuracy_score(survived_test,model.predict(patients_test)))
      6 logit_roc_auc = aucModel("Logistic",survived_test,model,patients_test)

/usr/local/lib/python3.4/dist-packages/sklearn/linear_model/logistic.py in fit(self, X, y, sample_weight)
   1231                 self.class_weight, self.penalty, self.dual, self.verbose,
   1232                 self.max_iter, self.tol, self.random_state,
-> 1233                 sample_weight=sample_weight)
   1234             self.n_iter_ = np.array([n_iter_])
   1235             return self

/usr/local/lib/python3.4/dist-packages/sklearn/svm/base.py in _fit_liblinear(X, y, C, fit_intercept, intercept_scaling, class_weight, penalty, dual, verbose, max_iter, tol, random_state, multi_class, loss, epsilon, sample_weight)
    888         X, y_ind, sp.isspmatrix(X), solver_type, tol, bias, C,
    889         class_weight_, max_iter, rnd.randint(np.iinfo('i').max),
--> 890         epsilon, sample_weight)
    891     # Regarding rnd.randint(..) in the above signature:
    892     # seed for srand in range [0..INT_MAX); due to limitations in Numpy

sklearn/svm/liblinear.pyx in sklearn.svm.liblinear.train_wrap()

ValueError: b'C <= 0'
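
The ValueError comes from the int() truncation flagged above: best_val is about 4.033e-05, so int(best_val) is 0, and liblinear rejects any C <= 0. Passing best_val unchanged fixes the cell:

model = LogisticRegression(penalty='l2', C=best_val, class_weight="balanced",
                           dual=True, solver='liblinear',
                           random_state=0, max_iter=51200)
model.fit(patients_train, survived_train)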

In [ ]:
logitmodelInitSAPS

In [ ]:
print("Logistic Regression acurracy is %2.2f" % accuracy_score(survived_test,logitmodelInitSAPS.predict(patients_test)))
logit_roc_auc = aucModel("Logistic",survived_test,model,patients_test)

In [ ]: