In [1]:
import sys
sys.path.append("../src/")
import numpy as np
import bmu
import utils
import matplotlib.pylab as plt
from sklearn.linear_model import Lasso, ElasticNet, LassoCV, ElasticNetCV
import re
In [2]:
tag = "ag-gut"
biom_fp = "../data/american-gut-mf.biom"
map_fp = "../data/american-gut-mf.txt"
n_select = 25
data, samples, features = bmu.load_biom(biom_fp)
map_data = bmu.load_map(map_fp)
labels, label_map = utils.label_formatting(map_data, samples, "SEX", signed=True)
samples = np.array(samples)
features = np.array(features)
data = utils.normalize(data+1.,scale=None)
In [9]:
lasso_mdl = Lasso(alpha=1.,
fit_intercept=True,
normalize=False,
precompute='auto',
copy_X=True,
max_iter=1000,
tol=0.0001,
warm_start=False,
positive=False)
lasso_mdl.fit(X=data, y=labels)
plt.figure()
plt.plot(range(len(features)), np.sort(lasso_mdl.coef_)[::-1], marker="o", label="positive")
plt.plot(range(len(features)), np.sort(lasso_mdl.coef_), marker="o", c="r", label="negative")
plt.xlim([0,300])
plt.title("Largest LASSO Coefficients")
plt.xlabel("arbitrary index")
plt.ylabel("Coefficient")
plt.legend()
plt.savefig("../files/"+tag+"-lasso-coeff.pdf", bbox_inches="tight")
In [11]:
enet_mdl = ElasticNet(alpha=.5,
l1_ratio=0.5,
fit_intercept=True,
normalize=False,
precompute='auto',
max_iter=1000,
copy_X=True,
tol=0.0001,
warm_start=False,
positive=False)
enet_mdl.fit(X=data, y=labels)
plt.figure()
plt.plot(range(len(features)), np.sort(enet_mdl.coef_)[::-1], marker='o', label='positive')
plt.plot(range(len(features)), np.sort(enet_mdl.coef_), marker='o', c='r', label='negative')
plt.xlim([0,300])
plt.title("Largest Elastic Net Coefficients")
plt.xlabel("arbitrary index")
plt.ylabel("Coefficient")
plt.legend()
plt.savefig("../files/"+tag+"-enet-coeff.pdf", bbox_inches="tight")
In [3]:
lasso_cv_mdl = LassoCV(cv=15,
fit_intercept=True,
normalize=False,
precompute='auto',
copy_X=True,
max_iter=1000,
tol=0.001)
lasso_cv_mdl.fit(X=data, y=labels)
m_log_alphas = -np.log10(lasso_cv_mdl.alphas_)
f = plt.figure()
plt.plot(m_log_alphas, lasso_cv_mdl.mse_path_, '-', color = '0.75' )
plt.plot(m_log_alphas, lasso_cv_mdl.mse_path_.mean(axis=-1), 'k',
label='Average across the folds', linewidth=2)
#plt.axvline(-np.log10(lasso_cv_mdl.alpha_), linestyle='--', color='k',
# label='alpha: CV estimate')
plt.legend()
plt.autoscale(tight=True)
plt.xlabel('-log(lambda)')
plt.ylabel('Mean square error')
plt.title('Mean square error on each fold: coordinate descent')
plt.savefig("../files/"+tag+"-lasso-cv-mse.pdf", bbox_inches="tight")
f = open("../files/lasso-selected-"+tag+".txt", "wb")
dels = re.compile('"| |\[|\]')
for i in np.argsort(np.abs(lasso_cv_mdl.coef_))[::-1][:n_select]:
f.write(dels.sub("", features[i])+"\n")
f.close()
In [ ]:
enet_cv_mdl = ElasticNetCV(cv=15)
enet_cv_mdl.fit(X=data, y=labels)
m_log_alphas = -np.log10(enet_cv_mdl.alphas_)
f = plt.figure()
plt.plot(m_log_alphas, enet_cv_mdl.mse_path_, '-', color = '0.75' )
plt.plot(m_log_alphas, enet_cv_mdl.mse_path_.mean(axis=-1), 'k',
label='Average across the folds', linewidth=2)
#plt.axvline(-np.log10(enet_cv_mdl.alpha_), linestyle='--', color='k',
# label='alpha: CV estimate')
plt.legend()
plt.autoscale(tight=True)
plt.xlabel('-log(alpha)')
plt.ylabel('Mean square error')
plt.title('Mean square error on each fold: coordinate descent')
plt.savefig("../files/"+tag+"-enet-cv-mse.pdf", bbox_inches="tight")
f = open("../files/enet-selected-"+tag+".txt", "wb")
dels = re.compile('"| |\[|\]')
for i in np.argsort(np.abs(lasso_cv_mdl.coef_))[::-1][:n_select]:
f.write(dels.sub("", features[i])+"\n")
f.close()
In [ ]: