Evaluate the runtimes of several different methods that can be used for feature subset selection. Our goal is to evaluate the overall runtimes of the existing approaches. Note that the information-theoretic methods use Python+C, as does the implementation of Lasso. The random forest for QIIME is based on an R package. Refer to scripts/timers.sh for the implementation details.
We essentially evaluate each command with generic parameters and redirect the runtimes to text files.
In [ ]:
%%bash
# Run the timing script from inside ../scripts/ (presumably so that the
# relative paths it uses resolve from that directory -- TODO confirm).
# NOTE(review): the notebook introduction refers to scripts/timers.sh while
# this cell runs timerz.sh -- confirm which file name is correct.
cd ../scripts/
sh timerz.sh
# Switch back to the notebook directory afterwards.
cd ../notebook/
In [1]:
%%bash
# Convert timing reports into plain seconds, one value per line.
#
# Usage: real_seconds OUT_FILE TIME_FILE...
#
# Keeps only the lines containing "real" from the given files, rewrites each
# "<minutes>m<seconds>s" value as the expression "60*<minutes>+<seconds>" and
# lets bc evaluate it. The results are written to OUT_FILE (overwriting it).
# Assumes each report line looks like "real<TAB>0m1.234s" -- TODO confirm
# against the output produced by the timing script.
real_seconds() {
    local out_fp="$1"
    shift
    # The files are handed straight to cat: piping `ls` through `xargs`
    # splits file names on whitespace and adds nothing here.
    cat "$@" \
        | grep real \
        | sed -e "s/real//g" \
        | tr '\t' ' ' \
        | sed -e "s/ //g" -e "s/m/ /g" -e "s/s//g" -e "s/^/60\*/g" -e "s/ /\+/g" \
        | bc -l > "$out_fp"
}

# Methods with several timing files: concatenate them in glob (sorted) order.
for method in jmi mim cmim mrmr; do
    real_seconds "../files/all-${method}.txt" ../files/"${method}"-*.time
done

# Methods with a single timing file each.
real_seconds ../files/qiime.txt ../files/qiime.time
real_seconds ../files/npfs2.txt ../files/npfs.time
real_seconds ../files/lasso2.txt ../files/lasso.time
In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.append("../src/")
import bmu
import utils
import matplotlib.pylab as plt
import mi
# --- Dataset configuration -------------------------------------------------
tag = "ag-gut"
biom_fp = "../data/caporaso-gut.biom"
map_fp = "../data/caporaso-gut.txt"
n_select = 25

# --- Load the abundance table and the labels from the "SEX" map column ------
data, samples, features = bmu.load_biom(biom_fp)
map_data = bmu.load_map(map_fp)
labels, label_map = utils.label_formatting(map_data, samples, "SEX", signed=False)
samples = np.array(samples)
features = np.array(features)
# One is added to every entry before normalising (presumably a pseudocount so
# that no entry is zero -- TODO confirm against utils.normalize).
data = utils.normalize(data+1.)
mutual_info = mi.calc_mi(data=data, labels=labels)

# --- Random-forest feature importance scores written by QIIME ---------------
df = pd.read_csv("../files/qiime-dir/feature_importance_scores.txt", sep="\t")
rf_scores = df["Mean_decrease_in_accuracy"]
# x axis: position of each feature in the scores file.
feature_rank = np.arange(len(rf_scores))

# Log-scaled importance curve.
# NOTE(review): np.log yields -inf/NaN for scores <= 0 -- confirm the scores
# in this file are strictly positive.
plt.figure()
plt.plot(feature_rank, np.log(rf_scores), label='RF Weight', lw=2)
plt.xlabel('feature number (by importance)')
# The curve is the log of the score, so the axis says so (previously 'weight').
plt.ylabel('log(weight)')
plt.legend()
plt.autoscale(tight=True)
plt.savefig("../files/rf-log-scores.pdf", bbox_inches="tight")

# Raw importance curve.
plt.figure()
plt.plot(feature_rank, rf_scores, label='RF Weight', lw=2)
plt.xlabel('feature number (by importance)')
plt.ylabel('weight')
plt.legend()
plt.autoscale(tight=True)
plt.savefig("../files/rf-scores.pdf", bbox_inches="tight")

# --- Histogram (50 bins) of the "male" column of the CV probabilities -------
plt.figure()
df = pd.read_csv("../files/qiime-dir/cv_probabilities.txt", sep="\t")
h = plt.hist(df["male"], 50, label='probabilities')
plt.xlabel('P(Male|X)')
plt.ylabel('Count')
plt.autoscale(tight=True)
plt.savefig("../files/rf-probs.pdf", bbox_inches="tight")
In [18]:
%%bash
# Pretty-print the first ten lines of the LASSO-selected feature list as
# LaTeX list entries:
#   1. remove every "<lowercase letter>__" prefix,
#   2. put a space after each comma,
#   3. prepend "\item " to each line.
sed -e "s/[a-z]\_\_//g" -e "s/\,/\, /g" -e "s/^/\\\item /g" \
    ../files/lasso-selected-ag-gut.txt \
    | head -10
In [4]:
def _read_times(path):
    """Return the runtimes stored one per line in `path` as a float array.

    The file is closed as soon as it has been read, and blank lines are
    skipped so a missing or extra trailing newline cannot corrupt or drop a
    value. The default text mode is used instead of "U", which Python 3.11+
    rejects.
    """
    with open(path) as handle:
        return np.array([float(line) for line in handle if line.strip()])


def _plot_runtimes(series, transform, ylabel, out_fp):
    """Plot every (label, times, style) entry of `series` against X and save it.

    `transform` is applied to each runtime array before plotting. The first
    point of every curve is left out, as in the original figures.
    """
    plt.figure()
    for label, times, style in series:
        plt.plot(X[1:], transform(times)[1:], label=label, lw=2, **style)
    plt.xlabel('# of Features')
    plt.ylabel(ylabel)
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., prop={'size':22})
    plt.savefig(out_fp, bbox_inches="tight")


# Feature counts used on the x axis: 100, 200, ..., 900.
X = np.arange(100, 1000, 100)

# One runtime per entry of X for the information-theoretic methods.
mim_times = _read_times("../files/all-mim.txt")
jmi_times = _read_times("../files/all-jmi.txt")
cmim_times = _read_times("../files/all-cmim.txt")
mrmr_times = _read_times("../files/all-mrmr.txt")

# NOTE: QIIME, LASSO and NPFS each contribute a single runtime; it is
# repeated across X only so the methods appear as flat lines for comparison.
qiime_times = np.ones(X.shape) * _read_times("../files/qiime.txt")[0]
lasso_times = np.ones(X.shape) * _read_times("../files/lasso2.txt")[0]
npfs_times = np.ones(X.shape) * _read_times("../files/npfs2.txt")[0]

# Plot order and styling shared by both figures.
_runtime_series = [
    ("CMIM", cmim_times, {"c": "g", "marker": "o"}),
    ("mRMR", mrmr_times, {"c": "c", "marker": "s"}),
    ("JMI", jmi_times, {"c": "b", "marker": "p"}),
    ("MIM", mim_times, {"c": "r", "marker": "x"}),
    ("RF-QIIME", qiime_times, {"c": "k", "marker": "^"}),
    ("LASSO", lasso_times, {"c": "y", "marker": ">"}),
    ("NPFS", npfs_times, {"color": ".5", "marker": "o"}),
]

_plot_runtimes(_runtime_series, np.log, 'LOG(Runtime) (sec)',
               "../files/runtime-log.pdf")
_plot_runtimes(_runtime_series, lambda times: times, 'Runtime (sec)',
               "../files/runtime.pdf")

# Parenthesised single-argument form prints the same text under Python 2 and 3.
print("LASSO: " + str(lasso_times[1]))
print("RF-QIIME: " + str(qiime_times[1]))
print("NPFS: " + str(npfs_times[1]))
print("MIM: " + str(mim_times[1]))
In [ ]: