In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, linear_model
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix
from sklearn.feature_selection import RFE
In [2]:
dataset_norm = pd.read_pickle('dataset_norm.pickle')
dataset_norm_withingroup = pd.read_pickle('dataset_norm_withingroup.pickle')
Get training data
In [3]:
dataset_norm_train = dataset_norm[(dataset_norm.training == 1) & (dataset_norm.sample == 1)]
dataset_norm_withingroup_train = dataset_norm_withingroup[(dataset_norm_withingroup.training == 1) & (dataset_norm_withingroup.sample == 1)]
dataset_norm_test = dataset_norm[(dataset_norm.training == 0) & (dataset_norm.sample == 1)]
dataset_norm_withingroup_test = dataset_norm_withingroup[(dataset_norm_withingroup.training == 0) & (dataset_norm_withingroup.sample == 1)]
dataset_norm_testfull = dataset_norm[(dataset_norm.training == 0) | (dataset_norm.sample == 0)]
dataset_norm_withingroup_testfull = dataset_norm_withingroup[(dataset_norm_withingroup.training == 0) | (dataset_norm_withingroup.sample == 0)]
In [4]:
varlist_short = [c for c in dataset_norm_train.columns.values if c not in ['target','type_interventional','training','sample'] and 'clus_' not in c]
varlist_long = [c for c in dataset_norm_train.columns.values if c not in ['target','type_interventional','training','sample']]
In [5]:
dn_X_short = dataset_norm_train[varlist_short]
dn_X_long = dataset_norm_train[varlist_long]
dn_X_short_test = dataset_norm_test[varlist_short]
dn_X_long_test = dataset_norm_test[varlist_long]
dn_X_short_testfull = dataset_norm_testfull[varlist_short]
dn_X_long_testfull = dataset_norm_testfull[varlist_long]
dn_Y = dataset_norm_train['target']
dn_Y_test = dataset_norm_test['target']
dn_Y_testfull = dataset_norm_testfull['target']
dng_X_short = dataset_norm_withingroup_train[varlist_short]
dng_X_long = dataset_norm_withingroup_train[varlist_long]
dng_X_short_test = dataset_norm_withingroup_test[varlist_short]
dng_X_long_test = dataset_norm_withingroup_test[varlist_long]
dng_X_short_testfull = dataset_norm_withingroup_testfull[varlist_short]
dng_X_long_testfull = dataset_norm_withingroup_testfull[varlist_long]
dng_Y = dataset_norm_withingroup_train['target']
dng_Y_test = dataset_norm_withingroup_test['target']
dng_Y_testfull = dataset_norm_withingroup_testfull['target']
In [6]:
print '%s\t%s\t%s\t%s\t%s\t%s' % ('missing','clusvars','testsize','Cval','ROC','score')
for s in ['dn','dng']:
for l in ['short','long']:
for c in range(6):
for t in ['test','testfull']:
logit = linear_model.LogisticRegression(C=eval('1e' + str(c)))
logit.fit(eval(s + '_X_' + l), eval(s + '_Y'))
pred = logit.predict(eval(s + '_X_' + l + '_' + t))
roc = roc_auc_score(eval(s + '_Y_' + t), pred)
ls = logit.score(eval(s + '_X_' + l + '_' + t), eval(s + '_Y_' + t))
print '%s\t%s\t%s\t%d\t%g\t%g' % (s,l,t,c,roc,ls)
Picking grouped missing value replacement, long variable list, and C=1
Determining ranked variable importance
In [6]:
logit = linear_model.LogisticRegression(C=1)
logit.fit(dng_X_long,dng_Y)
feat = RFE(estimator=logit, n_features_to_select=1, step=1)
feat.fit(dng_X_long,dng_Y)
Out[6]:
Testing various numbers of important variables to determine optimal number of parameters
In [8]:
pred_all = logit.predict(dng_X_long_testfull)
roc_all = roc_auc_score(dng_Y_testfull, pred_all)
ls_all = logit.score(dng_X_long_testfull, dng_Y_testfull)
print '%g\t%g' % (roc_all, ls_all)
important_vars = [v for v, n in sorted(zip(varlist_long,list(feat.ranking_)), key=lambda x: x[1])]
for v in range(50):
X = dng_X_long[important_vars[:v+1]]
logit_temp = linear_model.LogisticRegression(C=1)
logit_temp.fit(X,dng_Y)
X_test = dng_X_long_testfull[important_vars[:v+1]]
pred = logit_temp.predict(X_test)
roc = roc_auc_score(dng_Y_testfull, pred)
ls = logit_temp.score(X_test, dng_Y_testfull)
print '%d:\t%g\t%g' % (v+1, roc, ls)
Getting coefficients and printing out top 20 variables
In [9]:
coef_dict = dict(zip(varlist_long,list(logit.coef_[0])))
In [10]:
clus_dict = {
'clus_35': 'Digestive System Neoplasms, Pancreatic Diseases, Endocrine Gland Neoplasms, Infant, Newborn, Diseases, Gonadal Disorders, Lymphatic Diseases, Hematologic Diseases, Gastrointestinal Diseases, Immunoproliferative Disorders, Neoplasms by Histologic Type, Neoplasms by Site',
'clus_2': 'Optic Nerve Diseases, Otorhinolaryngologic Neoplasms, Ocular Motility Disorders, Neurocutaneous Syndromes, Laryngeal Diseases, Cranial Nerve Diseases',
'clus_25': 'Immune System Processes, Purpura, Thrombocytopenic',
'clus_1': 'Immunologic Deficiency Syndromes, Virus Diseases, Sexually Transmitted Diseases, DNA Virus Infections, Skin Diseases, Viral, Slow Virus Diseases, Hepatitis, Viral, Human, RNA Virus Infections',
'clus_20': 'Vascular Diseases, Pathologic Processes, Heart Diseases',
'clus_36': 'Respiratory Tract Neoplasms, Bronchial Diseases, Lung Diseases, Hypersensitivity, Respiratory Hypersensitivity',
'clus_37': 'Urogenital Neoplasms, Genital Diseases, Male',
'clus_17': 'Nutrition Disorders, Neurodegenerative Diseases, Anthropometry, Body Constitution, Physiological Processes, Delirium, Dementia, Amnestic, Cognitive Disorders, Neurologic Manifestations, Central Nervous System Diseases, Diagnostic Techniques and Procedures, Neurobehavioral Manifestations, Signs and Symptoms',
'clus_12': 'Uveal Diseases, Eye Neoplasms, Retinal Diseases',
'clus_21': 'Ear Diseases, Vision Disorders, Lens Diseases, Refractive Errors, Ocular Hypertension',
'clus_16': 'Rheumatic Diseases, Joint Diseases, Encephalitis, Viral, Central Nervous System Viral Diseases, Connective Tissue Diseases, Poisoning, Autoimmune Diseases, Demyelinating Diseases, Autoimmune Diseases of the Nervous System, Arbovirus Infections, Neurotoxicity Syndromes',
'clus_31': 'Neoplasms, Second Primary, Neoplastic Processes',
'clus_7': 'Bone Diseases, Endocrine System Diseases, Dwarfism, Pituitary Diseases',
'clus_30': 'Diabetes Mellitus, Metabolic Diseases',
'clus_19': 'Craniocerebral Trauma, Spinal Cord Injuries, Trauma, Nervous System',
'clus_8': 'Skin Diseases, Parasitic, Helminthiasis, Protozoan Infections',
'clus_38': 'Female Urogenital Diseases, Urologic Diseases',
'clus_22': 'Foot Diseases, Fasciitis, Foot Deformities',
'clus_0': 'Substance Withdrawal Syndrome, Anxiety Disorders, Sociology, Marijuana Abuse, Substance-Related Disorders, Tobacco Use Disorder, Mood Disorders, Alcohol-Related Disorders, Cocaine-Related Disorders, Behavior, Opioid-Related Disorders',
'clus_18': 'Musculoskeletal Abnormalities, Temporomandibular Joint Disorders, Stomatognathic System Abnormalities, Jaw Diseases',
'clus_34': 'Respiratory Tract Infections, Nose Diseases',
'clus_15': 'Neoplasms, Multiple Primary, Neoplastic Syndromes, Hereditary, Nervous System Malformations',
'clus_3': 'Orbital Diseases, Eye Diseases, Hereditary, Lacrimal Apparatus Diseases, Eye Diseases, Corneal Diseases',
'clus_32': 'Infection, Bacterial Infections',
'clus_13': 'Eye Infections, Viral, Conjunctival Diseases, Eye Infections',
'clus_24': 'Mental Disorders Diagnosed in Childhood, Schizophrenia and Disorders with Psychotic Features, Personality Disorders, Mental Disorders',
'clus_29': 'Pathological Conditions, Anatomical, Biliary Tract Diseases',
'clus_28': 'Muscular Diseases, Tendon Injuries, Neuromuscular Diseases',
'clus_33': 'Congenital Abnormalities, Cardiovascular Abnormalities',
'clus_27': 'Pharyngeal Diseases, Mouth Diseases, Tooth Diseases',
'clus_5': 'Fetal Diseases, Rupture, Pregnancy Complications',
'clus_14': 'Tumor Virus Infections, Mycoses, Neoplasms, Experimental',
'clus_4': 'Leg Injuries, Hip Injuries, Arm Injuries, Fractures, Bone, Spinal Injuries, Back Injuries',
'clus_11': 'Radiation Injuries, Environment, Public Health',
'clus_10': 'Dissociative Disorders, Nervous System Diseases, Autonomic Nervous System Diseases, Somatoform Disorders',
'clus_9': 'Biological Processes, Connective Tissue, Burns',
'clus_26': 'Nervous System Physiological Phenomena, Psychophysiology',
'clus_23': 'Wounds, Nonpenetrating, Wounds and Injuries',
'clus_6': 'Genetic Variation, Cellular Structures, Ploidies, Genetic Structures'
}
In [11]:
for i in range(20):
varname = important_vars[i]
printvar = ('Cluster: ' + clus_dict[varname]) if 'clus_' in varname else varname
print '%d\t%g\t%s' % (i+1, coef_dict[varname], printvar)
Confusion matrix
In [13]:
X = dng_X_long[important_vars[:21]]
logit = linear_model.LogisticRegression(C=1)
logit.fit(X,dng_Y)
X_test = dng_X_long_testfull[important_vars[:21]]
pred = logit.predict(X_test)
cm = confusion_matrix(dng_Y_testfull, pred)
In [14]:
print cm
Using the same final dataset as the logistic model
In [26]:
svm_clf_lin = svm.SVC(kernel='linear', probability=True)
svm_clf_lin.fit(dng_X_long,dng_Y)
pred_lin = svm_clf_lin.predict(dng_X_long_testfull)
roc_lin = roc_auc_score(dng_Y_testfull, pred_lin)
ls_lin = svm_clf_lin.score(dng_X_long_testfull, dng_Y_testfull)
print '%g\t%g' % (roc_lin, ls_lin)
In [27]:
svm_clf = svm.SVC(probability=True)
svm_clf.fit(dng_X_long,dng_Y)
pred_rbf = svm_clf.predict(dng_X_long_testfull)
roc_rbf = roc_auc_score(dng_Y_testfull, pred_rbf)
ls_rbf = svm_clf.score(dng_X_long_testfull, dng_Y_testfull)
print '%g\t%g' % (roc_rbf, ls_rbf)
In [28]:
svm_clf_poly = svm.SVC(kernel='poly', probability=True)
svm_clf_poly.fit(dng_X_long,dng_Y)
pred_poly = svm_clf_poly.predict(dng_X_long_testfull)
roc_poly = roc_auc_score(dng_Y_testfull, pred_poly)
ls_poly = svm_clf_poly.score(dng_X_long_testfull, dng_Y_testfull)
print '%g\t%g' % (roc_poly, ls_poly)
Draw ROC curves
In [32]:
fpr_lin, tpr_lin, thresholds_lin = roc_curve(dng_Y_testfull, svm_clf_lin.predict_proba(dng_X_long_testfull)[:, 1])
fpr_rbf, tpr_rbf, thresholds_rbf = roc_curve(dng_Y_testfull, svm_clf.predict_proba(dng_X_long_testfull)[:, 1])
fpr_poly, tpr_poly, thresholds_poly = roc_curve(dng_Y_testfull, svm_clf_poly.predict_proba(dng_X_long_testfull)[:, 1])
In [33]:
%matplotlib inline
fig = plt.figure()
fig.set_figwidth(7)
fig.set_figheight(7)
plt.plot(fpr_lin, tpr_lin, 'r', label='Linear SVM', lw=2.5)
plt.plot(fpr_rbf, tpr_rbf, 'g', label='SVM with rbf kernel', lw=2.5)
plt.plot(fpr_poly, tpr_poly, 'b', label='SVM with polynomial kernel', lw=2.5)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curves')
plt.legend(loc="lower right")
fig.show()
In [ ]: