In [1]:
import sys 
import os
sys.path.append(os.path.abspath("/home/scidb/HeartRatePatterns/Python"))
from LogisticRegresion import ajustLogisticRegression
from PlotWords import plot_word
from Matrix import convert_matrix
from NMF import generateNMF,auc_model,find_best_NMF

In [2]:
def findpatient(message, table, beats):
    """Print group size, death count and mortality rate for a patient subset.

    Parameters
    ----------
    message : str
        Prefix printed before the statistics.
    table : pandas.DataFrame
        Patient matrix whose MultiIndex second level holds the isAlive flag.
    beats : object
        Heartbeat word(s) echoed at the end of the printed line.

    Nothing is printed when the group is empty.
    """
    n_patients = table.shape[0]
    n_dead = sum(table.index.labels[1].tolist())
    if n_patients != 0:
        mortality = n_dead / n_patients
        print(message,
              "groupsize:%d" % n_patients,
              "deads:%d" % n_dead,
              "percent:{0:.2%}".format(mortality),
              beats)

In [3]:
import psycopg2
def selectPearson(word, dbname="mimic"):
    """Look up precomputed Pearson statistics for one heartbeat word.

    Queries the ``wordspearson`` table and returns a dict with the word, its
    patient count, the percentage of those patients that died and the Pearson
    coefficient ``p1``.  Returns ``{}`` when the word is not in the table.
    """
    conn = psycopg2.connect("dbname=" + dbname)
    cur = conn.cursor()
    select_statement = 'SELECT patient,deadpatient,p1 FROM wordspearson WHERE word = %s'
    cur.execute(select_statement, (word,))
    select = {}
    # At most one row is expected per word; the last row wins otherwise.
    for patient, deadpatient, p1 in cur:
        select = {'word': word,
                  "pacientes": patient,
                  "porcentaje de muertos": "{0:.2%}".format(deadpatient / patient),
                  "p1": p1}
    cur.close()
    conn.close()
    return select

In [4]:
def print_top_words(model, feature_names, topic_index=None):
    """Collect the positively weighted features of each NMF topic.

    Parameters
    ----------
    model : array-like of shape (n_topics, n_features)
        Topic/component matrix (e.g. ``nmf.components_``).
    feature_names : sequence
        Column names aligned with the second axis of ``model``.
    topic_index : int, optional
        When given, only that topic's entry is returned.

    Returns
    -------
    list of dict
        One ``{"topic": idx, "features": [...]}`` entry per (selected) topic
        with at least one strictly positive weight.
    """
    result = []
    # BUG FIX: iterate over the `model` argument instead of the notebook-global
    # `components` the original version silently depended on (the caller passes
    # the component matrix as the first argument).
    for topic_idx, topic in enumerate(model):
        # argsort()[:-1] walks weights in ascending order while dropping the
        # single largest index -- NOTE(review): possibly intended as [::-1]
        # (descending); kept as-is to preserve the recorded outputs.
        features = [{"word": feature_names[i], "p1": topic[i]}
                    for i in topic.argsort()[:-1] if topic[i] > 0]
        if features != [] and (topic_index is None or topic_index == topic_idx):
            result.append({"topic": topic_idx, "features": features})
    return result

In [5]:
def addValue(key, my_dict):
    """Increment the counter stored under ``key`` in ``my_dict`` (in place).

    A missing key starts at 1.  The mutated dict is also returned for
    convenience.
    """
    my_dict[key] = my_dict.get(key, 0) + 1
    return my_dict

In [6]:
from operator import itemgetter
def predict_proba(compoT, table):
    """Rank heartbeat words by the death probability the NMF model assigns.

    Parameters
    ----------
    compoT : ndarray of shape (n_words, n_topics)
        Transposed NMF component matrix; one row per heartbeat word.
    table : pandas.DataFrame
        Matrix whose column order matches the rows of ``compoT``.

    Returns
    -------
    list of dict
        One entry per word found in ``wordspearson`` (probability, patient
        stats, Pearson ``p1``), sorted by ``p1`` descending.

    NOTE(review): relies on the notebook-global ``modelnmf`` fitted earlier.
    """
    predicts = modelnmf.predict_proba(compoT)[:, 1]
    beats = list(table)
    sortedBeats = []
    nvals = {}
    for i in range(len(predicts)):
        word = beats[i]
        sel = selectPearson(word)
        if sel == {}:
            # BUG FIX: the original only printed the warning and then raised
            # KeyError on sel['pacientes']; skip words without Pearson stats.
            print("palabra no encontrada en pearson:", word)
            continue
        sortedBeats.append({'word': beats[i], 'predict': predicts[i], 'pacientes': sel['pacientes'],
                            'porcentaje de muertos': sel['porcentaje de muertos'], "p1": sel['p1']})
        # Histogram of probability bands for a quick overview of the scores.
        if predicts[i] > .99: addValue("99", nvals)
        elif predicts[i] > .90: addValue("90", nvals)
        elif predicts[i] > .80: addValue("80", nvals)
        elif predicts[i] > .70: addValue("70", nvals)
        else: addValue("under", nvals)
    print(nvals)
    return sorted(sortedBeats, key=itemgetter('p1'), reverse=True)

In [7]:
# Build the patient x heartbeat-word count matrix; rows are indexed by a
# (subject_id, isAlive) MultiIndex (visible in the output below).
# with_pearson=True presumably restricts columns to words with precomputed
# Pearson stats -- TODO confirm against convert_matrix.
table = convert_matrix(with_pearson=True) #.iloc[:,:400]
# Target vector: the second index level (isAlive) per patient.
# NOTE(review): MultiIndex.labels was removed in pandas 1.0; use .codes there.
survived = table.index.labels[1].tolist()
patients = table.values  # raw count matrix as ndarray
table


(554, 400)
Out[7]:
aaadc aabadd aabaf aabafb aabaga aabbg aabbgb aabdab aabdad aabdb ... hag haj hbf hdd hddd hfa hfg hga hgd jag
subject_id isAlive
135 1 0 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
151 1 0 1 0 0 2 0 0 2 0 2 ... 0 0 0 0 0 0 0 0 0 0
177 1 0 0 0 0 0 0 0 1 0 7 ... 0 0 0 0 0 0 0 0 0 0
214 1 0 0 2 0 1 0 0 0 0 2 ... 0 0 0 0 0 0 0 0 0 0
263 1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
279 1 0 0 0 0 0 0 0 1 0 6 ... 0 0 0 0 0 0 0 0 0 0
283 1 0 1 0 0 0 0 0 2 0 2 ... 0 0 0 0 0 0 0 0 0 0
368 1 0 0 1 1 1 1 1 2 0 0 ... 0 0 0 0 0 0 0 0 0 0
377 1 1 1 2 1 3 0 0 8 2 16 ... 0 0 0 1 1 1 0 0 0 0
408 1 0 0 0 0 0 0 0 2 5 2 ... 0 0 0 0 0 0 0 0 0 0
462 0 0 8 2 1 0 0 0 14 4 15 ... 0 0 0 0 0 0 0 0 0 0
618 1 0 0 0 0 0 0 0 1 0 5 ... 0 0 0 0 0 0 0 0 0 0
638 1 2 0 2 2 1 1 0 2 0 3 ... 0 1 0 1 1 0 0 1 0 0
682 1 0 0 1 0 0 0 0 0 0 1 ... 3 2 0 1 1 7 0 3 0 2
736 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
743 1 0 0 0 0 0 0 0 2 0 6 ... 1 1 0 0 0 1 0 0 0 0
749 1 0 0 1 0 0 0 0 0 1 1 ... 0 0 0 0 0 0 0 0 0 0
793 1 2 2 2 1 3 0 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
886 1 0 2 1 0 0 0 0 1 0 0 ... 0 0 0 0 0 1 0 0 0 0
952 1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1004 1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1075 1 1 1 2 0 1 1 0 1 6 6 ... 0 0 0 0 0 0 0 0 1 0
1144 0 0 0 0 0 0 1 0 0 0 1 ... 0 0 0 1 0 0 0 1 0 0
1160 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1222 0 0 0 4 2 0 2 0 5 2 10 ... 0 0 0 1 0 0 0 0 0 0
1226 1 0 0 2 1 0 0 0 1 0 3 ... 0 0 0 0 0 0 0 0 0 0
1459 0 0 4 4 3 0 0 0 10 1 24 ... 0 0 0 0 0 0 0 0 0 0
1528 1 0 1 6 2 3 0 0 3 0 3 ... 0 0 0 0 0 0 0 0 0 0
1531 1 0 0 0 0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 0 0
1569 1 0 0 6 1 0 1 0 1 2 3 ... 1 0 0 0 0 0 0 2 0 3
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
23034 1 0 0 6 4 0 0 0 13 2 17 ... 1 0 0 0 0 0 0 0 1 0
23097 1 10 4 7 1 1 1 0 3 4 11 ... 0 0 0 1 1 0 0 0 0 0
23120 1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23130 1 0 0 1 0 0 0 0 2 0 3 ... 0 0 0 0 0 0 0 0 0 0
23178 1 0 0 3 3 0 2 1 2 0 14 ... 0 0 0 1 0 0 0 0 1 0
23200 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23298 0 0 0 0 0 0 0 0 1 0 1 ... 0 0 0 0 0 0 0 0 0 0
23336 1 0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23363 1 0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23384 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23401 1 0 10 2 0 0 0 0 8 2 8 ... 0 0 0 0 0 0 0 0 0 0
23451 1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23468 1 3 1 0 0 1 0 0 5 5 8 ... 1 0 0 0 0 0 0 0 0 0
23474 1 0 0 3 0 1 1 0 3 0 9 ... 0 0 0 0 0 0 0 0 1 0
23510 1 0 0 0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23944 1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
24004 1 0 0 4 0 0 0 0 4 0 2 ... 0 0 0 1 0 0 0 0 0 0
24076 1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
24129 1 0 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 0 1 0 0
24133 0 0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
24142 1 0 0 2 1 2 0 0 3 0 23 ... 0 0 0 0 0 0 0 0 0 0
24152 1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
24185 1 0 0 0 0 0 0 0 5 0 7 ... 0 0 0 0 0 0 0 0 0 0
24227 0 0 0 0 0 0 0 0 0 0 3 ... 0 0 0 0 0 0 0 0 0 0
25466 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41962 1 0 3 6 0 0 0 0 1 2 5 ... 2 0 0 8 2 4 2 1 4 1
42261 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
42410 0 0 0 0 0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 0 0
42492 0 0 0 2 1 1 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 0 0
43459 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

554 rows × 400 columns


In [8]:
from sklearn.model_selection import train_test_split
# Hold out 20% of patients for evaluation; fixed seed for reproducibility.
patients_train, patients_test,survived_train, survived_test = train_test_split(patients,survived,test_size=0.2, random_state=42)

In [9]:
# Baseline: logistic regression on the raw word counts, plus its ROC AUC.
model,acurracy = ajustLogisticRegression(patients_train,survived_train,patients_test,survived_test)
logit_roc_auc = auc_model("Logistic",model,patients_test,survived_test)


acurracy is 70.27%
Logistic AUC = 0.4632f

In [10]:
def countPatients(word, dbname="mimic"):
    """Count patients whose signal contains ``word``.

    Joins ``matrix`` with ``subjectwords`` and returns
    ``{"patient": count, "deadPatient": sum}``; an empty dict when the word
    never occurs.

    NOTE(review): ``deadPatient`` is filled from ``sum(isalive)`` -- the
    naming looks inverted; confirm against the database schema.
    """
    conn = psycopg2.connect("dbname=" + dbname)
    cur = conn.cursor()
    select_statement = '''SELECT count(1),sum(isalive) FROM matrix m LEFT JOIN subjectwords s 
    ON m.subject_id=s.subject_id where m.word = %s GROUP BY m.word'''
    cur.execute(select_statement, (word,))
    select = {}
    for total, alive_sum in cur:
        select = {"patient": total, "deadPatient": alive_sum}
    cur.close()
    conn.close()
    return select

In [11]:
# NOTE(review): scipy.stats.stats is a deprecated import path;
# scipy.stats.pearsonr is the canonical one.
from scipy.stats.stats import pearsonr
# For every heartbeat word: Pearson correlation between its count column and
# the survival labels, plus the per-word mortality percentage from the DB.
columns = list(table.columns.values)
pearsonList = []
for i in range(len(columns)):
    pearson = pearsonr(patients[:,i],survived)
    word = columns[i]
    count = countPatients(word)
    deadPatient,patient = count['deadPatient'],count['patient']
    percent = deadPatient/patient
    pearsonList.append({'word':word,'p1':pearson[0],'p2':pearson[1],'percent':"{0:.2%}".format(percent)+" de %d" %patient})

In [12]:
# Top 5 words most positively correlated with the isAlive label.
pearsonList = sorted(pearsonList, key=itemgetter('p1'), reverse=True)
pearsonList[:5]


Out[12]:
[{'p1': 0.14447975105937516,
  'p2': 0.00064753378305492824,
  'percent': '90.11% de 91',
  'word': 'babaaf'},
 {'p1': 0.13704871227843324,
  'p2': 0.0012218563623213852,
  'percent': '84.00% de 150',
  'word': 'babaf'},
 {'p1': 0.13674682231211277,
  'p2': 0.001252978906552512,
  'percent': '90.10% de 101',
  'word': 'ababaf'},
 {'p1': 0.13498226343655298,
  'p2': 0.001449968856339564,
  'percent': '86.57% de 134',
  'word': 'abbg'},
 {'p1': 0.13318083120123872,
  'p2': 0.001680102147764,
  'percent': '97.44% de 39',
  'word': 'fbabd'}]

In [13]:
modelnmf, bestNmf, patientnmf, nmf_roc_auc, accuracyScore, p1, bestScore = find_best_NMF(patients,survived)


acurracy is 78.38%
NMF 2 AUC = 0.5002f
pearson 0.0537649203283
acurracy is 78.38%
NMF 3 AUC = 0.5002f
pearson 0.0970548800057
acurracy is 78.38%
NMF 4 AUC = 0.5002f
pearson 0.0900181934643
acurracy is 78.38%
NMF 5 AUC = 0.5002f
pearson 0.0815622793441
acurracy is 78.38%
NMF 6 AUC = 0.5002f
pearson 0.0796729240204
acurracy is 78.38%
NMF 7 AUC = 0.5002f
pearson 0.127376250873
acurracy is 78.38%
NMF 8 AUC = 0.5002f
pearson 0.127157220215
acurracy is 76.58%
NMF 9 AUC = 0.4892f
pearson 0.123638781336
acurracy is 75.68%
NMF 10 AUC = 0.4832f
pearson 0.122206562747
acurracy is 75.68%
NMF 11 AUC = 0.4832f
pearson 0.0990535043886
acurracy is 75.68%
NMF 12 AUC = 0.4832f
pearson 0.0990853644751
acurracy is 75.68%
NMF 13 AUC = 0.4982f
pearson 0.102467314415
acurracy is 77.48%
NMF 14 AUC = 0.4942f
pearson 0.108840348973
acurracy is 76.58%
NMF 15 AUC = 0.5042f
pearson 0.0900318141107
acurracy is 75.68%
NMF 16 AUC = 0.4832f
pearson 0.102026135242
acurracy is 75.68%
NMF 17 AUC = 0.4832f
pearson 0.101221840985
acurracy is 74.77%
NMF 18 AUC = 0.4772f
pearson 0.11822358176
acurracy is 79.28%
NMF 19 AUC = 0.5362f
pearson 0.113328947258
acurracy is 79.28%
NMF 20 AUC = 0.5362f
pearson 0.111086919475
acurracy is 75.68%
NMF 21 AUC = 0.4832f
pearson 0.109031244
acurracy is 76.58%
NMF 22 AUC = 0.4892f
pearson 0.108766270318
acurracy is 75.68%
NMF 23 AUC = 0.4832f
pearson 0.10435556013
acurracy is 75.68%
NMF 24 AUC = 0.4832f
pearson 0.135327484389
acurracy is 75.68%
NMF 25 AUC = 0.4832f
pearson 0.13600034719
acurracy is 75.68%
NMF 26 AUC = 0.4832f
pearson 0.136272010896
acurracy is 75.68%
NMF 27 AUC = 0.4832f
pearson 0.13563401252
acurracy is 75.68%
NMF 28 AUC = 0.4832f
pearson 0.131380704463
acurracy is 76.58%
NMF 29 AUC = 0.4892f
pearson 0.134351656817
acurracy is 76.58%
NMF 30 AUC = 0.4892f
pearson 0.133251270575
acurracy is 75.68%
NMF 31 AUC = 0.4772f
pearson 0.13313661246
acurracy is 75.68%
NMF 32 AUC = 0.4832f
pearson 0.119854339105
acurracy is 76.58%
NMF 33 AUC = 0.5042f
pearson 0.125097457917
acurracy is 76.58%
NMF 34 AUC = 0.4892f
pearson 0.123151743977
acurracy is 74.77%
NMF 35 AUC = 0.4772f
pearson 0.124337858315
acurracy is 76.58%
NMF 36 AUC = 0.4892f
pearson 0.132304542591
acurracy is 74.77%
NMF 37 AUC = 0.4772f
pearson 0.131466974887
acurracy is 76.58%
NMF 38 AUC = 0.5042f
pearson 0.122703236708
acurracy is 77.48%
NMF 39 AUC = 0.4942f
pearson 0.119992557772
acurracy is 76.58%
NMF 40 AUC = 0.5042f
pearson 0.127445175749
acurracy is 75.68%
NMF 41 AUC = 0.4772f
pearson 0.127540870402
acurracy is 76.58%
NMF 42 AUC = 0.4892f
pearson 0.127245261609
acurracy is 76.58%
NMF 43 AUC = 0.4892f
pearson 0.126060874121
acurracy is 78.38%
NMF 44 AUC = 0.5002f
pearson 0.123114201273
acurracy is 73.87%
NMF 45 AUC = 0.4712f
pearson 0.124872546882
acurracy is 74.77%
NMF 46 AUC = 0.4772f
pearson 0.116093134734
acurracy is 74.77%
NMF 47 AUC = 0.4772f
pearson 0.124067427516
acurracy is 76.58%
NMF 48 AUC = 0.4892f
pearson 0.111187228757
acurracy is 72.97%
NMF 49 AUC = 0.4662f
pearson 0.110170259788
acurracy is 74.77%
NMF 50 AUC = 0.4712f
pearson 0.118424921622
acurracy is 78.38%
NMF 51 AUC = 0.5002f
pearson 0.114400054286
acurracy is 78.38%
NMF 52 AUC = 0.5002f
pearson 0.110177310139
acurracy is 78.38%
NMF 53 AUC = 0.5002f
pearson 0.110779358577
acurracy is 75.68%
NMF 54 AUC = 0.4712f
pearson 0.111667326203
acurracy is 78.38%
NMF 55 AUC = 0.5002f
pearson 0.108685835382
acurracy is 75.68%
NMF 56 AUC = 0.4832f
pearson 0.113311191119
acurracy is 78.38%
NMF 57 AUC = 0.5002f
pearson 0.110706342321
acurracy is 75.68%
NMF 58 AUC = 0.4982f
pearson 0.108924064144
acurracy is 73.87%
NMF 59 AUC = 0.4712f
pearson 0.114398507083
acurracy is 76.58%
NMF 60 AUC = 0.4892f
pearson 0.107393419948
acurracy is 75.68%
NMF 61 AUC = 0.4832f
pearson 0.107950234053
acurracy is 76.58%
NMF 62 AUC = 0.4892f
pearson 0.111775100031
acurracy is 72.07%
NMF 63 AUC = 0.4602f
pearson 0.110912856356
acurracy is 78.38%
NMF 64 AUC = 0.5002f
pearson 0.107277636122
acurracy is 75.68%
NMF 65 AUC = 0.4832f
pearson 0.107422429947
acurracy is 72.97%
NMF 66 AUC = 0.4662f
pearson 0.110636108658
acurracy is 72.97%
NMF 67 AUC = 0.4662f
pearson 0.10741847518
acurracy is 74.77%
NMF 68 AUC = 0.4772f
pearson 0.107583130953
acurracy is 72.07%
NMF 69 AUC = 0.4602f
pearson 0.108290954679
acurracy is 74.77%
NMF 70 AUC = 0.4922f
pearson 0.107371195456
acurracy is 70.27%
NMF 71 AUC = 0.4482f
pearson 0.108299046842
acurracy is 77.48%
NMF 72 AUC = 0.4942f
pearson 0.136012247677
acurracy is 77.48%
NMF 73 AUC = 0.4942f
pearson 0.13039083113
acurracy is 78.38%
NMF 74 AUC = 0.5002f
pearson 0.11270438662
acurracy is 77.48%
NMF 75 AUC = 0.4942f
pearson 0.127820849406
acurracy is 75.68%
NMF 76 AUC = 0.4832f
pearson 0.10946394825
acurracy is 77.48%
NMF 77 AUC = 0.4942f
pearson 0.129820812781
acurracy is 78.38%
NMF 78 AUC = 0.5002f
pearson 0.131700858324
acurracy is 77.48%
NMF 79 AUC = 0.4942f
pearson 0.118870616692
acurracy is 71.17%
NMF 80 AUC = 0.4542f
pearson 0.129442036567
acurracy is 75.68%
NMF 81 AUC = 0.4832f
pearson 0.12981829406
acurracy is 76.58%
NMF 82 AUC = 0.4892f
pearson 0.129604957551
acurracy is 78.38%
NMF 83 AUC = 0.5002f
pearson 0.123392931417
acurracy is 76.58%
NMF 84 AUC = 0.4892f
pearson 0.126319487838
acurracy is 77.48%
NMF 85 AUC = 0.4942f
pearson 0.122068369518
acurracy is 77.48%
NMF 86 AUC = 0.4942f
pearson 0.13033421612
acurracy is 76.58%
NMF 87 AUC = 0.4892f
pearson 0.124920008523
acurracy is 68.47%
NMF 88 AUC = 0.4522f
pearson 0.131583371378
acurracy is 72.07%
NMF 89 AUC = 0.4752f
pearson 0.131392702517
acurracy is 76.58%
NMF 90 AUC = 0.4892f
pearson 0.131702522152
acurracy is 70.27%
NMF 91 AUC = 0.4482f
pearson 0.123126225331
acurracy is 76.58%
NMF 92 AUC = 0.5042f
pearson 0.124675432704
acurracy is 76.58%
NMF 93 AUC = 0.4892f
pearson 0.12480511772
acurracy is 75.68%
NMF 94 AUC = 0.4832f
pearson 0.122824161948
acurracy is 77.48%
NMF 95 AUC = 0.4942f
pearson 0.127540770322
acurracy is 77.48%
NMF 96 AUC = 0.4942f
pearson 0.126825536427
acurracy is 75.68%
NMF 97 AUC = 0.4832f
pearson 0.124673909615
acurracy is 75.68%
NMF 98 AUC = 0.4832f
pearson 0.122939937479
acurracy is 78.38%
NMF 99 AUC = 0.5002f
pearson 0.121235503165

In [14]:
from sklearn.metrics import classification_report
# Per-class precision/recall of the best NMF-based model on the full dataset
# (not the held-out split -- optimistic by construction).
print("bestScore "+str(bestScore)+" accurracy is %2.2f"% accuracyScore)
print(classification_report(survived,modelnmf.predict(patientnmf)))


bestScore 26 accurracy is 0.76
             precision    recall  f1-score   support

          0       0.61      0.08      0.14       136
          1       0.77      0.98      0.86       418

avg / total       0.73      0.76      0.69       554


In [16]:
import numpy as np
# Transpose the component matrix so each row corresponds to one heartbeat word.
compoT = np.transpose(bestNmf.components_)
print("components",bestNmf.components_.shape)
print("components Transpose",compoT.shape)


components (26, 400)
components Transpose (400, 26)

In [17]:
# Score every word with the NMF-based classifier; keep the sorted word list.
sortedBeats = predict_proba(compoT,table)
beatKeys = []
for value in sortedBeats:
    beatKeys.append(value['word'])


{'80': 54, '90': 115, '99': 147, '70': 44, 'under': 40}

In [18]:
# Show and plot the five words with the highest death-related Pearson p1.
print("Top 5 HeartBeats con mayor probabilidad de muerte según la regresión logística")
print(sortedBeats[:5])
plot_word(sortedBeats[:5])


Top 5 HeartBeats con mayor probabilidad de muerte según la regresión logística
[{'predict': 0.99896917839044441, 'pacientes': 155, 'word': 'adc', 'porcentaje de muertos': '89.68%', 'p1': 0.208886190046134}, {'predict': 0.99999999792459926, 'pacientes': 83, 'word': 'fdf', 'porcentaje de muertos': '96.39%', 'p1': 0.203783764475214}, {'predict': 0.99955502675564845, 'pacientes': 221, 'word': 'dbda', 'porcentaje de muertos': '85.07%', 'p1': 0.189067111670612}, {'predict': 0.99772854498501051, 'pacientes': 165, 'word': 'fbd', 'porcentaje de muertos': '87.27%', 'p1': 0.183704825193073}, {'predict': 0.99999998819892866, 'pacientes': 88, 'word': 'dff', 'porcentaje de muertos': '93.18%', 'p1': 0.180136446835082}]

In [31]:
from operator import itemgetter
from scipy.stats.stats import pearsonr
# Correlate each NMF group activation with survival.
# NOTE(review): patientnmf comes from fitting on the FULL `patients` matrix
# while survived_train only covers the training split -- pearsonr requires
# equal-length inputs. The commented-out transform and the traceback below
# suggest this cell is still being reworked; verify before trusting sortedList.
patients_trainnmf = patientnmf # bestNmf.transform(patients_train)
pearsonList = []
for i in range(bestScore):
    patientpear=patients_trainnmf[:,i]
    pearson = pearsonr(patientpear,survived_train)
  #  if(pearson[0]>0):
    pearsonList.append({'group':i,'p1':pearson[0],'p2':pearson[1]})
sortedList = sorted(pearsonList, key=itemgetter('p1'), reverse=True)
sortedList[:10]


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-31-53df11a05e4c> in <module>()
      1 from operator import itemgetter
      2 from scipy.stats.stats import pearsonr
----> 3 patients_trainnmf = bestNmf.transform(patients_train)
      4 pearsonList = []
      5 for i in range(bestScore):

/usr/local/lib/python3.4/dist-packages/sklearn/decomposition/nmf.py in transform(self, X)
   1278             alpha=self.alpha, l1_ratio=self.l1_ratio, regularization='both',
   1279             random_state=self.random_state, verbose=self.verbose,
-> 1280             shuffle=self.shuffle)
   1281 
   1282         return W

/usr/local/lib/python3.4/dist-packages/sklearn/decomposition/nmf.py in non_negative_factorization(X, W, H, n_components, init, update_H, solver, beta_loss, tol, max_iter, alpha, l1_ratio, regularization, random_state, verbose, shuffle)
   1000         _check_init(W, (n_samples, n_components), "NMF (input W)")
   1001     elif not update_H:
-> 1002         _check_init(H, (n_components, n_features), "NMF (input H)")
   1003         # 'mu' solver should not be initialized by zeros
   1004         if solver == 'mu':

/usr/local/lib/python3.4/dist-packages/sklearn/decomposition/nmf.py in _check_init(A, shape, whom)
     48     if np.shape(A) != shape:
     49         raise ValueError('Array with wrong shape passed to %s. Expected %s, '
---> 50                          'but got %s ' % (whom, shape, np.shape(A)))
     51     check_non_negative(A, whom)
     52     if np.max(A) == 0:

ValueError: Array with wrong shape passed to NMF (input H). Expected (26, 232), but got (26, 400) 

In [23]:
# Extract the words of the topic most correlated with survival and keep the
# positively weighted ones.
columns = list(table)
components = bestNmf.components_
topword = print_top_words(components, columns,topic_index=sortedList[0]['group'])[0]['features']
subwords = []
for subword in topword:
    if subword['p1']>0:
        subwords.append(subword['word'])
print(str(subwords[:10]))


['adcd', 'aadc', 'aagh', 'badc', 'fga', 'ajda', 'gfaf', 'ffda', 'abbdd', 'fgf']

In [24]:
# Rebuild the matrix restricted to the selected sub-words; sumvals=False
# presumably yields presence/absence (0/1) values, matching the output below
# -- TODO confirm against convert_matrix.
table = convert_matrix(sumvals=False,filter_words=tuple(subwords))
survived = table.index.labels[1].tolist()
patients = table.values
table


(540, 232)
Out[24]:
aabadd aabaf aabafb aabaga aabbg aabbgb aabdad aabdb aabdbb aacda ... gfd gff gha haf hag haj hbf hfa hfg hga
subject_id isAlive
135 1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
151 1 1 0 0 1 0 0 0 1 1 0 ... 0 0 0 0 0 0 0 0 0 0
177 1 0 0 0 0 0 0 0 1 1 0 ... 0 0 0 0 0 0 0 0 0 0
214 1 0 1 0 1 0 0 0 1 1 0 ... 0 0 0 0 0 0 0 0 0 0
279 1 0 0 0 0 0 0 0 1 1 1 ... 0 0 0 0 0 0 0 0 0 0
283 1 1 0 0 0 0 0 0 1 1 0 ... 0 0 0 0 0 0 0 0 0 0
368 1 0 1 1 1 1 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
377 1 1 1 1 1 0 0 1 1 1 0 ... 1 0 0 1 0 0 0 1 0 0
408 1 0 0 0 0 0 0 1 1 1 0 ... 0 0 0 0 0 0 0 0 0 0
462 0 1 1 1 0 0 0 1 1 1 0 ... 0 0 0 0 0 0 0 0 0 0
618 1 0 0 0 0 0 0 0 1 1 0 ... 0 0 0 0 0 0 0 0 0 0
638 1 0 1 1 1 1 0 0 1 1 0 ... 1 0 0 0 0 1 0 0 0 1
682 1 0 1 0 0 0 0 0 1 0 0 ... 0 1 1 1 1 1 0 1 0 1
736 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
743 1 0 0 0 0 0 0 0 1 1 0 ... 0 0 0 1 1 1 0 1 0 0
749 1 0 1 0 0 0 0 1 1 1 0 ... 0 0 0 0 0 0 0 0 0 0
793 1 1 1 1 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
886 1 1 1 0 0 0 0 0 0 0 0 ... 0 0 1 1 0 0 0 1 0 0
1004 1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1075 1 1 1 0 1 1 0 1 1 1 1 ... 0 0 0 0 0 0 0 0 0 0
1144 0 0 0 0 0 1 0 0 1 0 0 ... 0 0 1 0 0 0 0 0 0 1
1160 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1222 0 0 1 1 0 1 0 1 1 1 0 ... 0 0 0 0 0 0 0 0 0 0
1226 1 0 1 1 0 0 0 0 1 1 0 ... 0 0 0 0 0 0 0 0 0 0
1459 0 1 1 1 0 0 0 1 1 1 0 ... 0 0 0 0 0 0 0 0 0 0
1528 1 1 1 1 1 0 0 0 1 1 0 ... 0 0 1 0 0 0 0 0 0 0
1531 1 0 0 0 0 0 0 0 1 0 0 ... 0 0 0 1 0 0 0 0 0 0
1569 1 0 1 1 0 1 0 1 1 1 1 ... 1 1 1 1 1 0 0 0 0 1
1924 1 0 1 1 0 0 0 0 1 1 0 ... 1 0 1 0 1 0 0 0 0 0
2049 0 0 1 1 0 0 0 1 1 1 1 ... 0 0 1 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
21561 0 0 0 0 0 0 0 0 1 1 0 ... 0 1 1 0 0 0 0 0 0 0
23034 1 0 1 1 0 0 0 1 1 1 1 ... 0 0 1 1 1 0 0 0 0 0
23097 1 1 1 1 1 1 0 1 1 1 1 ... 0 1 0 0 0 0 0 0 0 0
23120 1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23130 1 0 1 0 0 0 0 0 1 1 0 ... 0 0 0 0 0 0 0 0 0 0
23178 1 0 1 1 0 1 1 0 1 1 1 ... 0 0 0 0 0 0 0 0 0 0
23200 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23298 0 0 0 0 0 0 0 0 1 1 0 ... 0 0 0 0 0 0 0 0 0 0
23336 1 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23363 1 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23384 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23401 1 1 1 0 0 0 0 1 1 1 0 ... 0 0 0 0 0 0 0 0 0 0
23451 1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23468 1 1 0 0 1 0 0 1 1 1 1 ... 0 0 1 0 1 0 0 0 0 0
23474 1 0 1 0 1 1 0 0 1 1 1 ... 0 1 0 0 0 0 0 0 0 0
23510 1 0 0 0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
24004 1 0 1 0 0 0 0 0 1 0 1 ... 0 0 0 0 0 0 0 0 0 0
24076 1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
24129 1 0 0 0 0 0 0 0 0 0 0 ... 1 0 0 0 0 0 0 0 0 1
24133 0 0 1 0 0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 0 0
24142 1 0 1 1 1 0 0 0 1 1 0 ... 0 0 0 0 0 0 0 0 0 0
24152 1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
24185 1 0 0 0 0 0 0 0 1 1 0 ... 0 0 0 0 0 0 0 0 0 0
24227 0 0 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
25466 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41962 1 1 1 0 0 0 0 1 1 0 0 ... 1 1 1 1 1 0 0 1 1 1
42261 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
42410 0 0 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
42492 0 0 1 1 1 0 0 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
43459 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

540 rows × 232 columns


In [25]:
# New train/test split on the reduced 232-column matrix (same seed as before).
patients_train, patients_test,survived_train, survived_test = train_test_split(patients,survived,test_size=0.2, random_state=42)
print(table.shape)
print(patients_train.shape)
print(patients_test.shape)


(540, 232)
(432, 232)
(108, 232)

In [26]:
model,acurracy = ajustLogisticRegression(patients_train,survived_train,patients_test,survived_test)


acurracy is 77.78%

In [27]:
logit_roc_auc = auc_model("Logistic",model,patients_test,survived_test)


Logistic AUC = 0.5002f

In [28]:
# Recompute per-word Pearson stats on the reduced matrix.
# NOTE(review): duplicates the loop of cell In[11]; consider extracting a
# shared helper function.
columns = list(table.columns.values)
pearsonList = []
for i in range(len(columns)):
    pearson = pearsonr(patients[:,i],survived)
    word = columns[i]
    count = countPatients(word)
    deadPatient,patient = count['deadPatient'],count['patient']
    percent = deadPatient/patient
    pearsonList.append({'word':word,'p1':pearson[0],'p2':pearson[1],'percent':"{0:.2%}".format(percent)+" de %d" %patient})

In [29]:
# Top 5 correlated words of the reduced matrix.
pearsonList = sorted(pearsonList, key=itemgetter('p1'), reverse=True)
pearsonList[:5]


Out[29]:
[{'p1': 0.20629619960424944,
  'p2': 1.3322321476546345e-06,
  'percent': '89.68% de 155',
  'word': 'adc'},
 {'p1': 0.18111123938083684,
  'p2': 2.2961628993734019e-05,
  'percent': '85.07% de 221',
  'word': 'dbda'},
 {'p1': 0.17845426123218194,
  'p2': 3.0350870577087259e-05,
  'percent': '87.27% de 165',
  'word': 'fbd'},
 {'p1': 0.17579442910616386,
  'p2': 3.9969076433626121e-05,
  'percent': '91.26% de 103',
  'word': 'ffd'},
 {'p1': 0.17161281097954326,
  'p2': 6.1117120579637452e-05,
  'percent': '95.45% de 66',
  'word': 'haf'}]

In [ ]: