Linear-model based prediction

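This script fits linear models using Lasso and Ridge regression and summarizes their prediction performance. It is written in the "outcome-oriented" style, also known as the "Make style": each step declares the files it provides, and the workflow is driven by the files requested by the final step.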


In [2]:
[global]
# Coefficient vector handed to the R scripts as b=c(...); the evaluation step
# compares estimated coefficients against it.
parameter: beta = [3, 1.5, 0, 0, 2, 0, 0, 0]
# One MSE file per data set (ids 1 through 5) and per method. The default step
# lists these as dependencies, which is what pulls in the rest of the workflow.
ridge_result = [f'data_{dataset_id}.ridge.mse.csv' for dataset_id in range(1, 6)]
lasso_result = [f'data_{dataset_id}.lasso.mse.csv' for dataset_id in range(1, 6)]

In [3]:
# Simulate sparse data-sets
# Produces data_{id}.train.csv and data_{id}.test.csv for any requested {id};
# the id is also passed to the R script as its random seed.
[simulation: provides = ["data_{id}.train.csv", "data_{id}.test.csv"]]
depends: R_library("MASS>=7.3")
parameter: N = (40, 200) # training and testing samples
parameter: rstd = 3
source='regression_modules/simulate.R'
output: f"data_{id}.train.csv", f"data_{id}.test.csv"
bash: expand = True
  # N and beta are numeric vectors, not file targets, so they are rendered as
  # comma-separated R vector literals with a plain string join rather than
  # through the paths() container.
  Rscript {source} seed={id} N="c({','.join(map(str, N))})" b="c({','.join(map(str, beta))})" rstd={rstd} oftrain="'{_output[0]}'" oftest="'{_output[1]}'"

In [4]:
# Ridge regression model implemented in R
# Fits on the training file with nfolds-fold cross-validation and writes
# test-set predictions plus the estimated coefficients
[ridge: provides = ["data_{id}.ridge.predicted.csv", "data_{id}.ridge.coef.csv"]]
depends: f"data_{id}.train.csv", f"data_{id}.test.csv", R_library("glmnet>=2.0")
parameter: nfolds = 5
output: f"data_{id}.ridge.predicted.csv", f"data_{id}.ridge.coef.csv"
ridge_script = 'regression_modules/ridge.R'
# Name the targets once so the command line below reads by role, not by index.
train_csv, test_csv = _depends[0], _depends[1]
pred_csv, coef_csv = _output[0], _output[1]
bash: expand = True
  Rscript {ridge_script} train="'{train_csv}'" test="'{test_csv}'" nfolds={nfolds} ofpred="'{pred_csv}'" ofcoef="'{coef_csv}'"

In [5]:
# LASSO model implemented in Python
# Build predictor via cross-validation and make prediction
[lasso: provides = ["data_{id}.lasso.predicted.csv", "data_{id}.lasso.coef.csv"]]
depends: f"data_{id}.train.csv", f"data_{id}.test.csv", Py_Module("sklearn>=0.18.1"), Py_Module("numpy>=1.6.1"), Py_Module("scipy>=0.9")
parameter: nfolds = 5
source='regression_modules/lasso.py'
output: f"data_{id}.lasso.predicted.csv", f"data_{id}.lasso.coef.csv"
bash: expand = True
  # Positional arguments: train file, test file, number of CV folds,
  # prediction output, coefficient output. File names are shell-quoted (:q) so
  # that an {id} containing spaces or shell metacharacters cannot split or
  # alter the command line; plain names are rendered unchanged.
  python {source} {_depends[0]:q} {_depends[1]:q} {nfolds} {_output[0]:q} {_output[1]:q}

In [6]:
# Evaluate predictors by calculating mean squared error
# of prediction vs truth (first line of output)
# and of betahat vs truth (2nd line of output)
[evaluation: provides = 'data_{id}.{method}.mse.csv']
depends: f"data_{id}.test.csv", f"data_{id}.{method}.predicted.csv",
         f"data_{id}.{method}.coef.csv"
source='regression_modules/evaluate.R'
bash: expand = True
  # beta is a numeric vector, not a file target, so it is rendered as an R
  # vector literal with a plain string join rather than through paths().
  Rscript {source} b="c({','.join(map(str, beta))})" test="'{_depends[0]}'" fpred="'{_depends[1]}'" fcoef="'{_depends[2]}'" output="'{_output}'"

In [7]:
# Download the pandoc.css stylesheet that the default step passes to pandoc
# through its --css option
[get-pandoc-css: provides = 'pandoc.css']
download:
  https://raw.githubusercontent.com/vatlab/sos-docs/master/css/pandoc.css

In [8]:
# Compute and report error estimates
# in HTML table format
[default]
depends: ridge_result, lasso_result, "pandoc.css", executable('pandoc')
import numpy as np

# For each method, read every per-data-set MSE file as a flat list of
# whitespace-separated numbers and average position-wise across data sets.
# Per the evaluation step, position 0 is the prediction error and position 1
# is the coefficient estimation error.
summaries = {}
for method, mse_files in (("ridge", ridge_result), ("lasso", lasso_result)):
    per_dataset = []
    for mse_file in mse_files:
        # Context manager so each result file is closed as soon as it is read.
        with open(mse_file) as mse_handle:
            per_dataset.append(mse_handle.read().split())
    summaries[method] = np.array(per_dataset, dtype=float).mean(axis=0).tolist()
ridge_summary = summaries["ridge"]
lasso_summary = summaries["lasso"]

report: expand = "${ }", output = "report.md"
%% Comparison summary

| Method | Avg. Estimation Error | Avg. Prediction Error |
|:------:|:-------:|:-------:|
| LASSO | ${lasso_summary[1]} | ${lasso_summary[0]} |
| Ridge | ${ridge_summary[1]} | ${ridge_summary[0]} |

pandoc: input = "report.md", output = "report.html", args = '{input:q} --css pandoc.css --self-contained -s --output {output:q}'

In [ ]:
%sosrun

In [ ]:
%preview report.html