About

Evaluate the runtimes of several different methods that can be used for feature subset selection. Our goal is to evaluate the overall runtimes of the existing approaches. Note that the information-theoretic methods use Python+C, as does the implementation of Lasso. The random forest for QIIME is based on an R package. Refer to scripts/timers.sh for the implementation details.

We essentially evaluate each command with generic parameters and redirect the runtimes to text files.


In [ ]:
%%bash
# Run the timing script from inside ../scripts/.
# The subshell keeps the directory change local, and `&&` prevents the script
# from being launched from the wrong directory if the `cd` fails.
# NOTE(review): the notebook introduction refers to scripts/timers.sh while
# this cell runs timerz.sh -- confirm which name is correct.
(cd ../scripts/ && sh timerz.sh)

In [1]:
%%bash
# Convert the "real" lines found in the given timing files into seconds.
#
# Every line containing "real" is stripped of the word "real" and of all
# whitespace, the minutes/seconds pair is rewritten as an arithmetic
# expression (e.g. "real 0m2.986s" becomes "60*0+2.986"), and bc evaluates
# it. One value is written to stdout per matching input line, in the order
# the files are given.
#
# Arguments: one or more timing files.
real_seconds() {
  cat "$@" \
    | grep real \
    | sed -e "s/real//g" \
    | tr '\t' ' ' \
    | sed -e "s/ //g" -e "s/m/ /g" -e "s/s//g" -e "s/^/60\*/g" -e "s/ /\+/g" \
    | bc -l
}

# Methods timed once per feature-set size: one .time file per run.
real_seconds ../files/jmi-*.time  > ../files/all-jmi.txt
real_seconds ../files/mim-*.time  > ../files/all-mim.txt
real_seconds ../files/cmim-*.time > ../files/all-cmim.txt
real_seconds ../files/mrmr-*.time > ../files/all-mrmr.txt

# Methods with a single timing file each.
real_seconds ../files/qiime.time > ../files/qiime.txt
real_seconds ../files/npfs.time  > ../files/npfs2.txt
real_seconds ../files/lasso.time > ../files/lasso2.txt

In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.append("../src/")
import bmu
import utils
import matplotlib.pylab as plt
import mi

# ---- configuration --------------------------------------------------------
tag = "ag-gut"
biom_fp = "../data/caporaso-gut.biom"
map_fp = "../data/caporaso-gut.txt"
n_select = 25

# ---- load the data set and compute the mutual information -----------------
data, samples, features = bmu.load_biom(biom_fp)
map_data = bmu.load_map(map_fp)
labels, label_map = utils.label_formatting(map_data, samples, "SEX", signed=False)
samples = np.array(samples)
features = np.array(features)
# NOTE(review): the +1. looks like a pseudo-count added before normalizing -- confirm.
data = utils.normalize(data+1.)
mutual_info = mi.calc_mi(data=data, labels=labels)


def plot_rf_weights(weights, ylabel, out_fp):
    """Plot the random-forest weights against their row position and save the figure.

    Parameters
    ----------
    weights : sequence of numbers, plotted in the order given.
    ylabel : label of the y-axis.
    out_fp : path of the file the figure is written to.
    """
    plt.figure()
    plt.plot(np.arange(len(weights)), weights, label='RF Weight', lw=2)
    # NOTE(review): the x-label presumes the score file is already sorted by
    # importance; nothing in this cell sorts it.
    plt.xlabel('feature number (by importance)')
    plt.ylabel(ylabel)
    plt.legend()
    plt.autoscale(tight=True)
    plt.savefig(out_fp, bbox_inches="tight")


# ---- random-forest feature weights (log and linear scale) -----------------
df = pd.read_csv("../files/qiime-dir/feature_importance_scores.txt", sep="\t")
# The log-scale figure gets its own y-label so it cannot be mistaken for the
# raw weights (same convention as the 'LOG(Runtime)' runtime figure).
plot_rf_weights(np.log(df["Mean_decrease_in_accuracy"]), 'LOG(weight)',
                "../files/rf-log-scores.pdf")
plot_rf_weights(df["Mean_decrease_in_accuracy"], 'weight',
                "../files/rf-scores.pdf")

# ---- histogram of the cross-validation probabilities ----------------------
plt.figure()
df = pd.read_csv("../files/qiime-dir/cv_probabilities.txt", sep="\t")
h = plt.hist(df["male"], 50, label='probabilities')
plt.xlabel('P(Male|X)')
plt.ylabel('Count')
plt.autoscale(tight=True)
plt.savefig("../files/rf-probs.pdf", bbox_inches="tight")


/usr/local/lib/python2.7/site-packages/pandas-0.14.0-py2.7-linux-x86_64.egg/pandas/io/excel.py:626: UserWarning: Installed openpyxl is not supported at this time. Use >=1.6.1 and <2.0.0.
  .format(openpyxl_compat.start_ver, openpyxl_compat.stop_ver))

In [18]:
%%bash
# Print the first 10 lines of the LASSO selection as LaTeX list items:
# every "<lowercase letter>__" prefix is removed, a space is inserted after
# each comma, and each line is prefixed with "\item ".
sed -e "s/[a-z]\_\_//g" -e "s/\,/\, /g" -e "s/^/\\\item /g" ../files/lasso-selected-ag-gut.txt \
  | head -n 10


\item Bacteria, Bacteroidetes, Bacteroidia, Bacteroidales, Bacteroidaceae, Bacteroides, uniformis
\item Bacteria, Bacteroidetes, Bacteroidia, Bacteroidales, Prevotellaceae, Prevotella, copri
\item Bacteria, Firmicutes, Clostridia, Clostridiales, Ruminococcaceae, , 
\item Bacteria, Firmicutes, Erysipelotrichi, Erysipelotrichales, Erysipelotrichaceae, Eubacterium, dolichum
\item Bacteria, Firmicutes, Clostridia, Clostridiales, Ruminococcaceae, , 
\item Bacteria, Bacteroidetes, Bacteroidia, Bacteroidales, Bacteroidaceae, Bacteroides, 
\item Bacteria, Proteobacteria, Gammaproteobacteria, Cardiobacteriales, , , 
\item Bacteria, Cyanobacteria, Chloroplast, Streptophyta, , , 
\item Bacteria, Proteobacteria, Gammaproteobacteria, Xanthomonadales, Sinobacteraceae, , 
\item Bacteria, Firmicutes, Clostridia, Clostridiales, Lachnospiraceae, Blautia, producta

In [4]:
def load_times(path):
    """Read one runtime in seconds per line from `path` and return a float array.

    The file is closed as soon as it has been read. Blank lines are skipped,
    so the result does not depend on whether the file ends with a newline.
    """
    with open(path) as fh:
        return np.array([float(line) for line in fh if line.strip()])


# Feature-set sizes: 100, 200, ..., 900.
X = np.array(range(100,1000,100))

# One measurement per entry of X.
mim_times = load_times("../files/all-mim.txt")
jmi_times = load_times("../files/all-jmi.txt")
cmim_times = load_times("../files/all-cmim.txt")
mrmr_times = load_times("../files/all-mrmr.txt")

# Only the first value of each file is used; it is repeated for every entry
# of X so that it shows up as a flat reference line.
qiime_times = np.ones(X.shape)*load_times("../files/qiime.txt")[0]
lasso_times = np.ones(X.shape)*load_times("../files/lasso2.txt")[0]
npfs_times = np.ones(X.shape)*load_times("../files/npfs2.txt")[0]

# (label, runtimes, line style) in plotting/legend order.
# NOTE THAT QIIME IS JUST PLOTTED AS A FUNCTION OF FEATURES FOR DEMO PURPOSES
runtime_series = [
    ("CMIM", cmim_times, dict(color="g", marker="o")),
    ("mRMR", mrmr_times, dict(color="c", marker="s")),
    ("JMI", jmi_times, dict(color="b", marker="p")),
    ("MIM", mim_times, dict(color="r", marker="x")),
    ("RF-QIIME", qiime_times, dict(color="k", marker="^")),
    ("LASSO", lasso_times, dict(color="y", marker=">")),
    ("NPFS", npfs_times, dict(color=".5", marker="o")),
]


def plot_runtimes(log_scale, ylabel, out_fp):
    """Plot every runtime series against the number of features and save it.

    Parameters
    ----------
    log_scale : if true, the natural log of the runtimes is plotted.
    ylabel : label of the y-axis.
    out_fp : path of the file the figure is written to.
    """
    plt.figure()
    for label, times, style in runtime_series:
        values = np.log(times) if log_scale else times
        # The first measurement (X[0]) is left out of the figure.
        plt.plot(X[1:], values[1:], label=label, lw=2, **style)
    plt.xlabel('# of Features')
    plt.ylabel(ylabel)
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., prop={'size':22})
    plt.savefig(out_fp, bbox_inches="tight")


plot_runtimes(True, 'LOG(Runtime) (sec)', "../files/runtime-log.pdf")
plot_runtimes(False, 'Runtime (sec)', "../files/runtime.pdf")

# Single-argument print calls behave the same under Python 2 and Python 3.
print("LASSO: "+str(lasso_times[1]))
print("RF-QIIME: "+str(qiime_times[1]))
print("NPFS: "+str(npfs_times[1]))
print("MIM: "+str(mim_times[1]))


LASSO: 192.153
RF-QIIME: 152.665
NPFS: 30.833
MIM: 2.986

In [ ]: