Outcome_Oriented.sos
#!/usr/bin/env sos-runner
# fileformat=SOS1.0
# Author: Gao Wang and Bo Peng
#
# Linear-model based prediction
#
# This script fits linear models
# using Lasso and Ridge regression
# and summarizes their prediction performance.
# It is written in the "outcome-oriented" style,
# aka the "Make" style.

[global]
parameter: beta = [3, 1.5, 0, 0, 2, 0, 0, 0]
ridge_result = [f'data_{x+1}.ridge.mse.csv' for x in range(5)]
lasso_result = [f'data_{x+1}.lasso.mse.csv' for x in range(5)]

# Simulate sparse data sets
[simulation: provides = ["data_{id}.train.csv", "data_{id}.test.csv"]]
depends: R_library("MASS>=7.3")
parameter: N = (40, 200) # training and testing sample sizes
parameter: rstd = 3
output: f"data_{id}.train.csv", f"data_{id}.test.csv"
R: expand = "${ }"
  set.seed(${id})
  N = sum(c(${paths(N):,}))
  p = length(c(${paths(beta):,}))
  X = MASS::mvrnorm(n = N, rep(0, p), 0.5^abs(outer(1:p, 1:p, FUN = "-")))
  Y = X %*% c(${paths(beta):,}) + rnorm(N, mean = 0, sd = ${rstd})
  Xtrain = X[1:${N[0]},]; Xtest = X[(${N[0]}+1):(${N[0]}+${N[1]}),]
  Ytrain = Y[1:${N[0]}]; Ytest = Y[(${N[0]}+1):(${N[0]}+${N[1]})]
  write.table(cbind(Ytrain, Xtrain), ${_output[0]:r}, row.names = F, col.names = F, sep = ',')
  write.table(cbind(Ytest, Xtest), ${_output[1]:r}, row.names = F, col.names = F, sep = ',')

# Ridge regression model implemented in R
# Build predictor via cross-validation and make prediction
[ridge: provides = ["data_{id}.ridge.predicted.csv", "data_{id}.ridge.coef.csv"]]
parameter: nfolds = 5
depends: f"data_{id}.train.csv", f"data_{id}.test.csv", R_library("glmnet>=2.0")
output: f"data_{id}.ridge.predicted.csv", f"data_{id}.ridge.coef.csv"
R: expand = "${ }"
  train = read.csv(${_depends[0]:r}, header = F)
  test = read.csv(${_depends[1]:r}, header = F)
  model = glmnet::cv.glmnet(as.matrix(train[,-1]), train[,1], family = "gaussian", alpha = 0, nfolds = ${nfolds}, intercept = F)
  betahat = as.vector(coef(model, s = "lambda.min")[-1])
  Ypred = predict(model, as.matrix(test[,-1]), s = "lambda.min")
  write.table(Ypred, ${_output[0]:r}, row.names = F, col.names = F, sep = ',')
  write.table(betahat, ${_output[1]:r}, row.names = F, col.names = F, sep = ',')

# LASSO model implemented in Python
# Build predictor via cross-validation and make prediction
[lasso: provides = ["data_{id}.lasso.predicted.csv", "data_{id}.lasso.coef.csv"]]
parameter: nfolds = 5
depends: f"data_{id}.train.csv", f"data_{id}.test.csv", Py_Module("sklearn>=0.18.1"), Py_Module("numpy>=1.6.1"), Py_Module("scipy>=0.9")
output: f"data_{id}.lasso.predicted.csv", f"data_{id}.lasso.coef.csv"
python: expand = "${ }"
  import numpy as np
  from sklearn.linear_model import LassoCV
  train = np.genfromtxt(${_depends[0]:r}, delimiter = ",")
  test = np.genfromtxt(${_depends[1]:r}, delimiter = ",")
  # response is the first column; predictors are the remaining columns
  model = LassoCV(cv = ${nfolds}, fit_intercept = False).fit(train[:,1:], train[:,0])
  Ypred = model.predict(test[:,1:])
  np.savetxt(${_output[0]:r}, Ypred)
  np.savetxt(${_output[1]:r}, model.coef_)

# Evaluate predictors by calculating mean squared error
# of prediction vs truth (first value in the output)
# and of betahat vs truth (second value in the output)
[evaluation: provides = 'data_{id}.{method}.mse.csv']
depends: f"data_{id}.test.csv", f"data_{id}.{method}.predicted.csv", f"data_{id}.{method}.coef.csv"
R: expand = "${ }", stderr = False
  b = c(${paths(beta):,})
  Ytruth = as.matrix(read.csv(${_depends[0]:r}, header = F)[,-1]) %*% b
  Ypred = scan(${_depends[1]:r})
  prediction_mse = mean((Ytruth - Ypred)^2)
  betahat = scan(${_depends[2]:r})
  estimation_mse = mean((betahat - b) ^ 2)
  cat(paste(prediction_mse, estimation_mse), file = ${_output:r})
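# For reference, the evaluation step above reduces, per data set and
# method, to two numbers: mean((X_test %*% b - Ypred)^2) and
# mean((betahat - b)^2). Below is a minimal standalone Python sketch of
# the same computation (mse_pair is a hypothetical helper, not part of
# this workflow; it assumes the comma-separated files written by the
# steps above, with the response in the first column of the test file):
#
#   import numpy as np
#
#   def mse_pair(test_csv, pred_csv, coef_csv, b):
#       b = np.asarray(b, dtype=float)
#       # noiseless truth: test predictors times the true coefficients
#       Ytruth = np.genfromtxt(test_csv, delimiter=",")[:, 1:] @ b
#       Ypred = np.loadtxt(pred_csv)
#       betahat = np.loadtxt(coef_csv)
#       return np.mean((Ytruth - Ypred) ** 2), np.mean((betahat - b) ** 2)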
[get-pandoc-css: provides = 'pandoc.css']
download:
  https://raw.githubusercontent.com/vatlab/sos-docs/master/css/pandoc.css

# Compute and report error estimates
# in HTML table format
[default]
depends: ridge_result, lasso_result, "pandoc.css", executable('pandoc')
import numpy as np
ridge_summary = np.mean(np.array([sum([x.strip().split() for x in open(f).readlines()], []) for f in ridge_result], dtype = float).T, axis = 1).tolist()
lasso_summary = np.mean(np.array([sum([x.strip().split() for x in open(f).readlines()], []) for f in lasso_result], dtype = float).T, axis = 1).tolist()

report: expand = "${ }", output = "report.md"
  ## Comparison summary

  | Method | Avg. Estimation Error | Avg. Prediction Error |
  |:------:|:---------------------:|:---------------------:|
  | LASSO  | ${lasso_summary[1]}   | ${lasso_summary[0]}   |
  | Ridge  | ${ridge_summary[1]}   | ${ridge_summary[0]}   |

pandoc: input = "report.md", output = "report.html", args = '{input:q} --css pandoc.css --self-contained -s --output {output:q}'
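# Usage sketch (assumes SoS and pandoc are installed and this file is
# saved as Outcome_Oriented.sos):
#
#   sos run Outcome_Oriented.sos
#       # runs [default], which backtracks through the "provides"
#       # declarations to simulate data, fit both models, and render
#       # report.html
#
#   sos run Outcome_Oriented.sos -t data_1.lasso.mse.csv
#       # make-style generation of a single named target, if your SoS
#       # version supports the -t option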