Load necessary packages


In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, linear_model
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix
from sklearn.feature_selection import RFE

Load datasets


In [2]:
dataset_norm = pd.read_pickle('dataset_norm.pickle')
dataset_norm_withingroup = pd.read_pickle('dataset_norm_withingroup.pickle')

Get training data


In [3]:
dataset_norm_train = dataset_norm[(dataset_norm.training == 1) & (dataset_norm.sample == 1)]
dataset_norm_withingroup_train = dataset_norm_withingroup[(dataset_norm_withingroup.training == 1) & (dataset_norm_withingroup.sample == 1)]

dataset_norm_test = dataset_norm[(dataset_norm.training == 0) & (dataset_norm.sample == 1)]
dataset_norm_withingroup_test = dataset_norm_withingroup[(dataset_norm_withingroup.training == 0) & (dataset_norm_withingroup.sample == 1)]

dataset_norm_testfull = dataset_norm[(dataset_norm.training == 0) | (dataset_norm.sample == 0)]
dataset_norm_withingroup_testfull = dataset_norm_withingroup[(dataset_norm_withingroup.training == 0) | (dataset_norm_withingroup.sample == 0)]

Set up models


In [4]:
varlist_short = [c for c in dataset_norm_train.columns.values if c not in ['target','type_interventional','training','sample'] and 'clus_' not in c]
varlist_long = [c for c in dataset_norm_train.columns.values if c not in ['target','type_interventional','training','sample']]

In [5]:
dn_X_short = dataset_norm_train[varlist_short]
dn_X_long = dataset_norm_train[varlist_long]
dn_X_short_test = dataset_norm_test[varlist_short]
dn_X_long_test = dataset_norm_test[varlist_long]
dn_X_short_testfull = dataset_norm_testfull[varlist_short]
dn_X_long_testfull = dataset_norm_testfull[varlist_long]
dn_Y = dataset_norm_train['target']
dn_Y_test = dataset_norm_test['target']
dn_Y_testfull = dataset_norm_testfull['target']

dng_X_short = dataset_norm_withingroup_train[varlist_short]
dng_X_long = dataset_norm_withingroup_train[varlist_long]
dng_X_short_test = dataset_norm_withingroup_test[varlist_short]
dng_X_long_test = dataset_norm_withingroup_test[varlist_long]
dng_X_short_testfull = dataset_norm_withingroup_testfull[varlist_short]
dng_X_long_testfull = dataset_norm_withingroup_testfull[varlist_long]
dng_Y = dataset_norm_withingroup_train['target']
dng_Y_test = dataset_norm_withingroup_test['target']
dng_Y_testfull = dataset_norm_withingroup_testfull['target']

Logistic regression models


In [6]:
print '%s\t%s\t%s\t%s\t%s\t%s' % ('missing','clusvars','testsize','Cval','ROC','score')

for s in ['dn','dng']:
    for l in ['short','long']:
        for c in range(6):
            for t in ['test','testfull']:
                logit = linear_model.LogisticRegression(C=eval('1e' + str(c)))
                logit.fit(eval(s + '_X_' + l), eval(s + '_Y'))
                pred = logit.predict(eval(s + '_X_' + l + '_' + t))
                roc = roc_auc_score(eval(s + '_Y_' + t), pred)
                ls = logit.score(eval(s + '_X_' + l + '_' + t), eval(s + '_Y_' + t))
                print '%s\t%s\t%s\t%d\t%g\t%g' % (s,l,t,c,roc,ls)


missing	clusvars	testsize	Cval	ROC	score
dn	short	test	0	0.632862	0.632804
dn	short	testfull	0	0.634528	0.614164
dn	short	test	1	0.633308	0.633251
dn	short	testfull	1	0.634435	0.613985
dn	short	test	2	0.633308	0.633251
dn	short	testfull	2	0.634466	0.614045
dn	short	test	3	0.633308	0.633251
dn	short	testfull	3	0.634474	0.61406
dn	short	test	4	0.633308	0.633251
dn	short	testfull	4	0.634474	0.61406
dn	short	test	5	0.633308	0.633251
dn	short	testfull	5	0.634474	0.61406
dn	long	test	0	0.636642	0.636608
dn	long	testfull	0	0.639911	0.630014
dn	long	test	1	0.637762	0.637727
dn	long	testfull	1	0.640909	0.630268
dn	long	test	2	0.637762	0.637727
dn	long	testfull	2	0.640917	0.630283
dn	long	test	3	0.637762	0.637727
dn	long	testfull	3	0.640909	0.630268
dn	long	test	4	0.637762	0.637727
dn	long	testfull	4	0.640909	0.630268
dn	long	test	5	0.637762	0.637727
dn	long	testfull	5	0.640909	0.630268
dng	short	test	0	0.634237	0.634195
dng	short	testfull	0	0.633948	0.612031
dng	short	test	1	0.634906	0.634864
dng	short	testfull	1	0.634479	0.612225
dng	short	test	2	0.634906	0.634864
dng	short	testfull	2	0.634464	0.612195
dng	short	test	3	0.634906	0.634864
dng	short	testfull	3	0.634472	0.61221
dng	short	test	4	0.634906	0.634864
dng	short	testfull	4	0.634472	0.61221
dng	short	test	5	0.634906	0.634864
dng	short	testfull	5	0.634472	0.61221
dng	long	test	0	0.644924	0.644895
dng	long	testfull	0	0.64756	0.635011
dng	long	test	1	0.645588	0.645564
dng	long	testfull	1	0.646845	0.635295
dng	long	test	2	0.64581	0.645787
dng	long	testfull	2	0.646645	0.635325
dng	long	test	3	0.64581	0.645787
dng	long	testfull	3	0.646621	0.63528
dng	long	test	4	0.64581	0.645787
dng	long	testfull	4	0.646614	0.635265
dng	long	test	5	0.64581	0.645787
dng	long	testfull	5	0.646614	0.635265

Picking grouped missing value replacement, long variable list, and C=1

Determining ranked variable importance


In [6]:
logit = linear_model.LogisticRegression(C=1)
logit.fit(dng_X_long,dng_Y)
feat = RFE(estimator=logit, n_features_to_select=1, step=1)
feat.fit(dng_X_long,dng_Y)


Out[6]:
RFE(estimator=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001),
  estimator_params={}, n_features_to_select=1, step=1, verbose=0)

Testing various numbers of important variables to determine optimal number of parameters


In [8]:
pred_all = logit.predict(dng_X_long_testfull)
roc_all = roc_auc_score(dng_Y_testfull, pred_all)
ls_all = logit.score(dng_X_long_testfull, dng_Y_testfull)

print '%g\t%g' % (roc_all, ls_all)

important_vars = [v for v, n in sorted(zip(varlist_long,list(feat.ranking_)), key=lambda x: x[1])]

for v in range(50):
    X = dng_X_long[important_vars[:v+1]]
    logit_temp = linear_model.LogisticRegression(C=1)
    logit_temp.fit(X,dng_Y)
    X_test = dng_X_long_testfull[important_vars[:v+1]]
    pred = logit_temp.predict(X_test)
    roc = roc_auc_score(dng_Y_testfull, pred)
    ls = logit_temp.score(X_test, dng_Y_testfull)
    print '%d:\t%g\t%g' % (v+1, roc, ls)


0.647568	0.635026
1:	0.500959	0.964649
2:	0.521818	0.101886
3:	0.571956	0.320912
4:	0.583805	0.417165
5:	0.59517	0.482892
6:	0.595438	0.48216
7:	0.595584	0.482025
8:	0.598031	0.431329
9:	0.598815	0.430761
10:	0.601293	0.445553
11:	0.601285	0.445538
12:	0.600975	0.455773
13:	0.600944	0.455714
14:	0.600085	0.469056
15:	0.611737	0.559093
16:	0.611551	0.558316
17:	0.615805	0.649056
18:	0.617168	0.63002
19:	0.620394	0.622086
20:	0.623586	0.605337
21:	0.61985	0.631455
22:	0.636071	0.630304
23:	0.622858	0.618515
24:	0.654598	0.61986
25:	0.631676	0.608056
26:	0.631951	0.606921
27:	0.631308	0.608594
28:	0.633316	0.615811
29:	0.628076	0.594011
30:	0.630458	0.5957
31:	0.628308	0.594878
32:	0.630225	0.598584
33:	0.630386	0.598061
34:	0.637504	0.615572
35:	0.63765	0.615019
36:	0.634426	0.60837
37:	0.634465	0.608863
38:	0.640259	0.618396
39:	0.63215	0.625642
40:	0.631479	0.625179
41:	0.634495	0.623924
42:	0.635194	0.623192
43:	0.634903	0.623879
44:	0.636421	0.624313
45:	0.636699	0.624851
46:	0.638099	0.620473
47:	0.637846	0.620816
48:	0.643012	0.624552
49:	0.643412	0.624492
50:	0.645898	0.62763

Getting coefficients and printing out top 20 variables


In [9]:
coef_dict = dict(zip(varlist_long,list(logit.coef_[0])))

In [10]:
clus_dict = {
  'clus_35': 'Digestive System Neoplasms, Pancreatic Diseases, Endocrine Gland Neoplasms, Infant, Newborn, Diseases, Gonadal Disorders, Lymphatic Diseases, Hematologic Diseases, Gastrointestinal Diseases, Immunoproliferative Disorders, Neoplasms by Histologic Type, Neoplasms by Site',
  'clus_2': 'Optic Nerve Diseases, Otorhinolaryngologic Neoplasms, Ocular Motility Disorders, Neurocutaneous Syndromes, Laryngeal Diseases, Cranial Nerve Diseases',
  'clus_25': 'Immune System Processes, Purpura, Thrombocytopenic',
  'clus_1': 'Immunologic Deficiency Syndromes, Virus Diseases, Sexually Transmitted Diseases, DNA Virus Infections, Skin Diseases, Viral, Slow Virus Diseases, Hepatitis, Viral, Human, RNA Virus Infections',
  'clus_20': 'Vascular Diseases, Pathologic Processes, Heart Diseases',
  'clus_36': 'Respiratory Tract Neoplasms, Bronchial Diseases, Lung Diseases, Hypersensitivity, Respiratory Hypersensitivity',
  'clus_37': 'Urogenital Neoplasms, Genital Diseases, Male',
  'clus_17': 'Nutrition Disorders, Neurodegenerative Diseases, Anthropometry, Body Constitution, Physiological Processes, Delirium, Dementia, Amnestic, Cognitive Disorders, Neurologic Manifestations, Central Nervous System Diseases, Diagnostic Techniques and Procedures, Neurobehavioral Manifestations, Signs and Symptoms',
  'clus_12': 'Uveal Diseases, Eye Neoplasms, Retinal Diseases',
  'clus_21': 'Ear Diseases, Vision Disorders, Lens Diseases, Refractive Errors, Ocular Hypertension',
  'clus_16': 'Rheumatic Diseases, Joint Diseases, Encephalitis, Viral, Central Nervous System Viral Diseases, Connective Tissue Diseases, Poisoning, Autoimmune Diseases, Demyelinating Diseases, Autoimmune Diseases of the Nervous System, Arbovirus Infections, Neurotoxicity Syndromes',
  'clus_31': 'Neoplasms, Second Primary, Neoplastic Processes',
  'clus_7': 'Bone Diseases, Endocrine System Diseases, Dwarfism, Pituitary Diseases',
  'clus_30': 'Diabetes Mellitus, Metabolic Diseases',
  'clus_19': 'Craniocerebral Trauma, Spinal Cord Injuries, Trauma, Nervous System',
  'clus_8': 'Skin Diseases, Parasitic, Helminthiasis, Protozoan Infections',
  'clus_38': 'Female Urogenital Diseases, Urologic Diseases',
  'clus_22': 'Foot Diseases, Fasciitis, Foot Deformities',
  'clus_0': 'Substance Withdrawal Syndrome, Anxiety Disorders, Sociology, Marijuana Abuse, Substance-Related Disorders, Tobacco Use Disorder, Mood Disorders, Alcohol-Related Disorders, Cocaine-Related Disorders, Behavior, Opioid-Related Disorders',
  'clus_18': 'Musculoskeletal Abnormalities, Temporomandibular Joint Disorders, Stomatognathic System Abnormalities, Jaw Diseases',
  'clus_34': 'Respiratory Tract Infections, Nose Diseases',
  'clus_15': 'Neoplasms, Multiple Primary, Neoplastic Syndromes, Hereditary, Nervous System Malformations',
  'clus_3': 'Orbital Diseases, Eye Diseases, Hereditary, Lacrimal Apparatus Diseases, Eye Diseases, Corneal Diseases',
  'clus_32': 'Infection, Bacterial Infections',
  'clus_13': 'Eye Infections, Viral, Conjunctival Diseases, Eye Infections',
  'clus_24': 'Mental Disorders Diagnosed in Childhood, Schizophrenia and Disorders with Psychotic Features, Personality Disorders, Mental Disorders',
  'clus_29': 'Pathological Conditions, Anatomical, Biliary Tract Diseases',
  'clus_28': 'Muscular Diseases, Tendon Injuries, Neuromuscular Diseases',
  'clus_33': 'Congenital Abnormalities, Cardiovascular Abnormalities',
  'clus_27': 'Pharyngeal Diseases, Mouth Diseases, Tooth Diseases',
  'clus_5': 'Fetal Diseases, Rupture, Pregnancy Complications',
  'clus_14': 'Tumor Virus Infections, Mycoses, Neoplasms, Experimental',
  'clus_4': 'Leg Injuries, Hip Injuries, Arm Injuries, Fractures, Bone, Spinal Injuries, Back Injuries',
  'clus_11': 'Radiation Injuries, Environment, Public Health',
  'clus_10': 'Dissociative Disorders, Nervous System Diseases, Autonomic Nervous System Diseases, Somatoform Disorders',
  'clus_9': 'Biological Processes, Connective Tissue, Burns',
  'clus_26': 'Nervous System Physiological Phenomena, Psychophysiology',
  'clus_23': 'Wounds, Nonpenetrating, Wounds and Injuries',
  'clus_6': 'Genetic Variation, Cellular Structures, Ploidies, Genetic Structures'
}

In [11]:
for i in range(20):
    varname = important_vars[i]
    printvar = ('Cluster: ' + clus_dict[varname]) if 'clus_' in varname else varname
    print '%d\t%g\t%s' % (i+1, coef_dict[varname], printvar)


1	-1.18987	Cluster: Biological Processes, Connective Tissue, Burns
2	0.443029	intervention_behavior
3	0.555066	healthy_volunteers
4	0.536616	location_asia
5	0.720476	sponsor_lead_govt
6	-0.729694	Cluster: Nervous System Physiological Phenomena, Psychophysiology
7	-0.620453	Cluster: Wounds, Nonpenetrating, Wounds and Injuries
8	-0.483794	Cluster: Digestive System Neoplasms, Pancreatic Diseases, Endocrine Gland Neoplasms, Infant, Newborn, Diseases, Gonadal Disorders, Lymphatic Diseases, Hematologic Diseases, Gastrointestinal Diseases, Immunoproliferative Disorders, Neoplasms by Histologic Type, Neoplasms by Site
9	-0.457997	Cluster: Leg Injuries, Hip Injuries, Arm Injuries, Fractures, Bone, Spinal Injuries, Back Injuries
10	0.395298	Cluster: Orbital Diseases, Eye Diseases, Hereditary, Lacrimal Apparatus Diseases, Eye Diseases, Corneal Diseases
11	-0.503028	Cluster: Neoplasms, Multiple Primary, Neoplastic Syndromes, Hereditary, Nervous System Malformations
12	0.502377	gender_male
13	-0.33562	Cluster: Genetic Variation, Cellular Structures, Ploidies, Genetic Structures
14	-0.439387	design_intervention_single
15	-0.537975	design_intervention_parallel
16	-0.335301	Cluster: Dissociative Disorders, Nervous System Diseases, Autonomic Nervous System Diseases, Somatoform Disorders
17	-0.370591	location_northamerica
18	-0.666337	intervention_device
19	0.37761	design_observation_cohort
20	-0.592197	design_masking_openlabel

Confusion matrix


In [13]:
X = dng_X_long[important_vars[:21]]
logit = linear_model.LogisticRegression(C=1)
logit.fit(X,dng_Y)
X_test = dng_X_long_testfull[important_vars[:21]]
pred = logit.predict(X_test)
cm = confusion_matrix(dng_Y_testfull, pred)

In [14]:
print cm


[[ 1360   879]
 [23787 40902]]

Support Vector Machine models

Using the same final dataset as the logistic model


In [26]:
svm_clf_lin = svm.SVC(kernel='linear', probability=True)
svm_clf_lin.fit(dng_X_long,dng_Y)
pred_lin = svm_clf_lin.predict(dng_X_long_testfull)
roc_lin = roc_auc_score(dng_Y_testfull, pred_lin)
ls_lin = svm_clf_lin.score(dng_X_long_testfull, dng_Y_testfull)
print '%g\t%g' % (roc_lin, ls_lin)


0.646792	0.620607

In [27]:
svm_clf = svm.SVC(probability=True)
svm_clf.fit(dng_X_long,dng_Y)
pred_rbf = svm_clf.predict(dng_X_long_testfull)
roc_rbf = roc_auc_score(dng_Y_testfull, pred_rbf)
ls_rbf = svm_clf.score(dng_X_long_testfull, dng_Y_testfull)
print '%g\t%g' % (roc_rbf, ls_rbf)


0.672755	0.643707

In [28]:
svm_clf_poly = svm.SVC(kernel='poly', probability=True)
svm_clf_poly.fit(dng_X_long,dng_Y)
pred_poly = svm_clf_poly.predict(dng_X_long_testfull)
roc_poly = roc_auc_score(dng_Y_testfull, pred_poly)
ls_poly = svm_clf_poly.score(dng_X_long_testfull, dng_Y_testfull)
print '%g\t%g' % (roc_poly, ls_poly)


0.566426	0.883248

Draw ROC curves


In [32]:
fpr_lin, tpr_lin, thresholds_lin = roc_curve(dng_Y_testfull, svm_clf_lin.predict_proba(dng_X_long_testfull)[:, 1])
fpr_rbf, tpr_rbf, thresholds_rbf = roc_curve(dng_Y_testfull, svm_clf.predict_proba(dng_X_long_testfull)[:, 1])
fpr_poly, tpr_poly, thresholds_poly = roc_curve(dng_Y_testfull, svm_clf_poly.predict_proba(dng_X_long_testfull)[:, 1])

In [33]:
%matplotlib inline

fig = plt.figure()
fig.set_figwidth(7)
fig.set_figheight(7)

plt.plot(fpr_lin, tpr_lin, 'r', label='Linear SVM', lw=2.5)
plt.plot(fpr_rbf, tpr_rbf, 'g', label='SVM with rbf kernel', lw=2.5)
plt.plot(fpr_poly, tpr_poly, 'b', label='SVM with polynomial kernel', lw=2.5)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curves')
plt.legend(loc="lower right")

fig.show()



In [ ]: