In [2]:
[global]
# The "true" sparse regression coefficient
parameter: beta = [3, 1.5, 0, 0, 2, 0, 0, 0]
In [3]:
# Simulate sparse data-sets
[lasso_1, ridge_1]
depends: R_library("MASS>=7.3")
# training and testing samples
parameter: N = (40, 200)
parameter: rstd = 3
parameter: replicate = [x+1 for x in range(5)]
input: for_each = ['replicate']
output: train = f"data_{_replicate}.train.csv", test = f"data_{_replicate}.test.csv"
R: expand = "${ }"
set.seed(${_replicate})
N = sum(c(${paths(N):,}))
p = length(c(${paths(beta):,}))
X = MASS::mvrnorm(n = N, rep(0, p), 0.5^abs(outer(1:p, 1:p, FUN = "-")))
Y = X %*% c(${paths(beta):,}) + rnorm(N, mean = 0, sd = ${rstd})
Xtrain = X[1:${N[0]},]; Xtest = X[(${N[0]}+1):(${N[0]}+${N[1]}),]
Ytrain = Y[1:${N[0]}]; Ytest = Y[(${N[0]}+1):(${N[0]}+${N[1]})]
write.table(cbind(Ytrain, Xtrain), ${_output[0]:r}, row.names = F, col.names = F, sep = ',')
write.table(cbind(Ytest, Xtest), ${_output[1]:r}, row.names = F, col.names = F, sep = ',')
In [4]:
# Ridge regression model implemented in R
# Build predictor via cross-validation and make prediction
[ridge_2]
depends: R_library("glmnet>=2.0")
parameter: nfolds = 5
output: pred = f"{_input[0]:nn}.ridge.predicted.csv", coef = f"{_input[0]:nn}.ridge.coef.csv"
R: expand = "${ }"
train = read.csv(${_input['train']:r}, header = F)
test = read.csv(${_input['test']:r}, header = F)
model = glmnet::cv.glmnet(as.matrix(train[,-1]), train[,1], family = "gaussian", alpha = 0, nfolds = ${nfolds}, intercept = F)
betahat = as.vector(coef(model, s = "lambda.min")[-1])
Ypred = predict(model, as.matrix(test[,-1]), s = "lambda.min")
write.table(Ypred, ${_output['pred']:r}, row.names = F, col.names = F, sep = ',')
write.table(betahat, ${_output['coef']:r}, row.names = F, col.names = F, sep = ',')
In [5]:
# LASSO model implemented in Python
# Build predictor via cross-validation and make prediction
# Here I use numerical indices rather than variable names for input and output
# to demonstrate alternative syntax for `_input[]` and `_output[]`
[lasso_2]
depends: Py_Module("sklearn>=0.18.1"), Py_Module("numpy>=1.6.1"), Py_Module("scipy>=0.9")
parameter: nfolds = 5
output: pred = f"{_input[0]:nn}.lasso.predicted.csv", coef = f"{_input[0]:nn}.lasso.coef.csv"
python: expand = "${ }"
import numpy as np
from sklearn.linear_model import LassoCV
train = np.genfromtxt(${_input[0]:r}, delimiter = ",")
test = np.genfromtxt(${_input[1]:r}, delimiter = ",")
model = LassoCV(cv = ${nfolds}, fit_intercept = False).fit(train[:,1:], train[:,1])
Ypred = model.predict(test[:,1:])
np.savetxt(${_output[0]:r}, Ypred)
np.savetxt(${_output[1]:r}, model.coef_)
In [6]:
# Evaluate predictors by calculating mean squared error
# of prediction vs truth (first line of output)
# and of betahat vs truth (2nd line of output)
[lasso_3, ridge_3]
input: y = output_from(1)['test'], yhat = output_from(2)['pred'], coef = output_from(2)['coef']
output: f"{_input['yhat']:nn}.mse.csv"
R: expand = "${ }", stderr = False
b = c(${paths(beta):,})
Ytruth = as.matrix(read.csv(${_input['y']:r}, header = F)[,-1]) %*% b
Ypred = scan(${_input['yhat']:r})
prediction_mse = mean((Ytruth - Ypred)^2)
betahat = scan(${_input['coef']:r})
estimation_mse = mean((betahat - b) ^ 2)
cat(paste(prediction_mse, estimation_mse), file = ${_output:r})
In [7]:
# Run default core analysis
[default_1]
sos_run(['ridge', 'lasso'])
In [8]:
# Compute and report error estimates
# in HTML table format
[default_2]
depends: executable("pandoc")
input: dynamic("*.mse.csv")
import numpy as np
ridge_summary = np.mean(np.array([sum([x.strip().split() for x in open(f).readlines()], []) for f in _input if 'ridge' in str(f)], dtype = float).T, axis = 1).tolist()
lasso_summary = np.mean(np.array([sum([x.strip().split() for x in open(f).readlines()], []) for f in _input if 'lasso' in str(f)], dtype = float).T, axis = 1).tolist()
report: output = "report.md", expand = "${ }"
%% Comparison summary
| Method | Avg. Estimation Error | Avg. Prediction Error |
|:------:|:-------:|:-------:|
| LASSO | ${lasso_summary[1]} | ${lasso_summary[0]} |
| Ridge | ${ridge_summary[1]} | ${ridge_summary[0]} |
download:
https://vatlab.github.io/sos-docs/css/pandoc.css
pandoc: input = "report.md", output = "report.html", args = '{input:q} --css pandoc.css --self-contained -s --output {output:q}'
In [ ]:
%sosrun
In [ ]:
%preview report.html