[Figure from "Explaining and Harnessing Adversarial Examples" by Goodfellow et al.]
> train_inds = sample(1:150, 130)
> iris_train = iris[train_inds,]
> iris_test = iris[setdiff(1:150, train_inds),]
> lm_fit = lm(Sepal.Length ~ Sepal.Width+Petal.Length,
+ data=iris_train)
> sd(predict(lm_fit, iris_test) - iris_test$Sepal.Length)
[1] 0.3879703
> library(randomForest)
> iris_train$residual = lm_fit$residuals
> iris_train$fitted_values = lm_fit$fitted.values
> rf_fit = randomForest(
+ residual ~ Sepal.Width+Petal.Length+fitted_values,
+ data=iris_train)
>
> lm_pred = predict(lm_fit, iris_test)
> iris_test$fitted_values = lm_pred
> sd(predict(rf_fit, iris_test)+lm_pred -
+ iris_test$Sepal.Length)
[1] 0.3458326
> rf_fit2 = randomForest(
+ Sepal.Length ~ Sepal.Width+Petal.Length,
+ data=iris_train)
> sd(predict(rf_fit2, iris_test) - iris_test$Sepal.Length)
[1] 0.40927
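On this particular split (note the unseeded sample(), so your numbers will differ), boosting the linear model with a random forest fit to its residuals (0.346) beats both the linear model alone (0.388) and a random forest fit directly to Sepal.Length (0.409).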
The Ridge Part
The Least Absolute Shrinkage and Selection Operator (LASSO) Part
mike@Michaels-MacBook-Pro-2:~/projects/glmnet/inst/mortran
> wc -l glmnet5.m
3998 glmnet5.m
mike@Michaels-MacBook-Pro-2:~/projects/glmnet/inst/mortran
> head -n 1000 glmnet5.m | tail -n 20
if g(k).gt.ab*vp(k) < ix(k)=1; ixx=1;>
>
if(ixx.eq.1) go to :again:;
exit;
>
if nlp.gt.maxit < jerr=-m; return;>
:b: iz=1;
loop < nlp=nlp+1; dlx=0.0;
<l=1,nin; k=ia(l); gk=dot_product(y,x(:,k));
ak=a(k); u=gk+ak*xv(k); v=abs(u)-vp(k)*ab; a(k)=0.0;
if(v.gt.0.0)
a(k)=max(cl(1,k),min(cl(2,k),sign(v,u)/(xv(k)+vp(k)*dem)));
if(a(k).eq.ak) next;
del=a(k)-ak; rsq=rsq+del*(2.0*gk-del*xv(k));
y=y-del*x(:,k); dlx=max(xv(k)*del**2,dlx);
>
if(dlx.lt.thr) exit; if nlp.gt.maxit < jerr=-m; return;>
>
jz=0;
>
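These twenty lines (981 through 1000 of glmnet5.m) sit in the middle of the coordinate descent routine, and for all their terseness they are recognizably a cyclic soft-thresholding sweep over the active set. Below is a rough R transliteration, offered as a sketch only; the variable names follow my reading of the mortran (a holds the coefficients, y the partial residuals, xv the weighted sums of squares, vp the per-variable penalty factors, ab plays the role of lambda*alpha, dem of lambda*(1-alpha), and cl holds the coefficient box constraints) and are not authoritative.

# Hypothetical R transliteration of the mortran inner loop above.
# Returns updated copies, since R has no in-place update like the Fortran.
glmnet_sweep = function(a, y, x, ia, nin, xv, vp, ab, dem, cl, rsq) {
  dlx = 0
  for (l in seq_len(nin)) {          # cycle over the active set
    k  = ia[l]
    gk = sum(y * x[, k])             # gradient: dot_product(y, x(:,k))
    ak = a[k]
    u  = gk + ak * xv[k]
    v  = abs(u) - vp[k] * ab         # soft-thresholding step
    a[k] = 0
    if (v > 0)                       # survivors: rescale, then box-constrain
      a[k] = max(cl[1, k], min(cl[2, k], sign(u) * v / (xv[k] + vp[k] * dem)))
    if (a[k] != ak) {
      del = a[k] - ak
      rsq = rsq + del * (2 * gk - del * xv[k])   # track explained variance
      y   = y - del * x[, k]         # keep y as the partial residual
      dlx = max(xv[k] * del^2, dlx)  # largest coordinate change this sweep
    }
  }
  list(a = a, y = y, rsq = rsq, dlx = dlx)
}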
library(Matrix)  # for spMatrix() and the sparse crossprod() below

lewis_glm = function(X, y, lambda, alpha=1, family=binomial, maxit=500,
                     tol=1e-8, beta=spMatrix(nrow=ncol(X), ncol=1)) {
  # Plain IRLS: lambda and alpha are unused here; this is the
  # unpenalized baseline.
  converged = FALSE
  for(i in 1:maxit) {
    eta = as.vector(X %*% beta)          # linear predictor
    g = family()$linkinv(eta)            # fitted mean
    gprime = family()$mu.eta(eta)        # d mu / d eta
    z = eta + (y - g) / gprime           # working response
    W = as.vector(gprime^2 / family()$variance(g))  # working weights
    beta_old = beta
    # weighted least squares step: solve (X'WX) beta = X'Wz
    beta = solve(crossprod(X, W*X), crossprod(X, W*z))
    if(sqrt(as.vector(Matrix::crossprod(beta-beta_old))) < tol) {
      converged = TRUE
      break
    }
  }
  list(beta=beta, iterations=i, converged=converged)
}
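As a quick sanity check, the fitter should reproduce glm's coefficients on a simulated logistic problem. A minimal sketch; the data here are made up, and lambda is passed only to satisfy the signature since lewis_glm ignores it:

set.seed(1)
n = 500
X = cbind(1, matrix(rnorm(n * 2), ncol = 2))   # intercept via a column of ones
y = rbinom(n, 1, plogis(as.vector(X %*% c(-0.5, 1, -2))))
fit = lewis_glm(X, y, lambda = 0)              # lambda is unused here
# the two columns should agree to several decimal places
cbind(lewis_glm = as.vector(fit$beta),
      glm       = coef(glm(y ~ X - 1, family = binomial)))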
pirls = function(X, y, lambda, alpha=1, family=binomial, maxit=500,
                 tol=1e-8, beta=spMatrix(nrow=ncol(X), ncol=1),
                 beta_update=coordinate_descent) {
  converged = FALSE
  for(i in 1:maxit) {
    eta = as.vector(X %*% beta)          # linear predictor
    g = family()$linkinv(eta)            # fitted mean
    gprime = family()$mu.eta(eta)        # d mu / d eta
    z = eta + (y - g) / gprime           # working response
    W = as.vector(gprime^2 / family()$variance(g))  # working weights
    beta_old = beta
    # penalized weighted least squares step, delegated to beta_update()
    beta = beta_update(X, W, z, lambda, alpha, beta, maxit)
    if(sqrt(as.vector(Matrix::crossprod(beta-beta_old))) < tol) {
      converged = TRUE
      break
    }
  }
  list(beta=beta, iterations=i, converged=converged)
}
coordinate_descent = function(X, W, z, lambda, alpha, beta, maxit) {
  quad_loss = quadratic_loss(X, W, z, lambda, alpha, beta)
  for(i in 1:maxit) {
    beta_old = beta
    quad_loss_old = quad_loss
    beta = update_coordinates(X, W, z, lambda, alpha, beta)
    quad_loss = quadratic_loss(X, W, z, lambda, alpha, beta)
    # stop as soon as a full sweep fails to decrease the penalized loss
    if(quad_loss >= quad_loss_old) {
      beta = beta_old
      break
    }
  }
  # still strictly decreasing on the last allowed sweep: not converged
  if (i == maxit && quad_loss < quad_loss_old) {
    warning("Coordinate descent did not converge.")
  }
  beta
}
> summaryRprof("pirls_profile.out")
...
$by.total
total.time total.pct self.time self.pct
"beta_update" 1.06 100.00 0.00 0.00
"pirls" 1.06 100.00 0.00 0.00
"update_coordinates" 1.06 100.00 0.00 0.00
update_coordinates = function(X, W, z, lambda, alpha, beta) {
  beta_old = beta
  for (i in seq_along(beta)) {
    # numerator of the coordinate update: soft-thresholded weighted
    # covariance between predictor i and the partial residual
    beta[i] = soft_thresh_r(sum(W*X[,i]*(z - X[,-i] %*% beta_old[-i])),
                            sum(W)*lambda*alpha)
  }
  # scale every coordinate by its ridge-penalized weighted curvature
  beta / (colSums(W*X^2) + lambda*(1-alpha))
}
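Two helpers referenced above never appear in the text: soft_thresh_r and quadratic_loss. Here is a minimal sketch of what they presumably compute. The soft-thresholding operator S(x, t) = sign(x) max(|x| - t, 0) is standard; the normalization of the penalty in the objective is my guess, chosen to be consistent with the sum(W)*lambda*alpha threshold used in update_coordinates.

# Presumed definitions of the helpers used above (not shown in the text).
soft_thresh_r = function(x, t) {
  sign(x) * max(abs(x) - t, 0)    # S(x, t): shrink x toward zero by t
}

quadratic_loss = function(X, W, z, lambda, alpha, beta) {
  resid = z - as.vector(X %*% beta)
  # elastic-net-penalized weighted least squares objective; the sum(W)
  # normalization of the penalty is an assumption
  sum(W * resid^2) / 2 +
    sum(W) * lambda * ((1 - alpha) * sum(beta^2) / 2 + alpha * sum(abs(beta)))
}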
template<typename ModelMatrixType, typename WeightMatrixType,
         typename ResponseType, typename BetaMatrixType>
double update_coordinate(const unsigned int i,
    const ModelMatrixType &X, const WeightMatrixType &W, const ResponseType &z,
    const double &lambda, const double &alpha, const BetaMatrixType &beta,
    const double thresh) {
  // Copy X and beta, then drop column/row i so that X_shed * beta_shed
  // is the fit excluding coordinate i (giving the partial residual below).
  ModelMatrixType X_shed(X), X_col(X.col(i));
  BetaMatrixType beta_shed(beta);
  X_shed.shed_col(i);
  beta_shed.shed_row(i);
  // Weighted covariance of predictor i with the partial residual.
  double val = arma::mat((W*X_col).t() * (z - X_shed * beta_shed))[0];
  double ret = soft_thresh(val, thresh);
  // Coordinates surviving the soft threshold are rescaled by the
  // ridge-penalized weighted curvature.
  if (ret != 0)
    ret /= accu(W * square(X_col)) + lambda*(1-alpha);
  return ret;
}
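The c_coordinate_descent hook passed to pirls below is presumably a thin Rcpp-exported wrapper that sweeps update_coordinate over all coordinates; the wrapper itself is not shown here.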
fit_r = pirls(x, y, lambda=lambda, family=gaussian)
fit_rc = pirls(x, y, lambda=lambda, family=gaussian,
beta_update=c_coordinate_descent)
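Assuming both back ends implement the same update, the two fits should agree to numerical tolerance:

# illustrative check; fit_r and fit_rc come from the calls above
all.equal(as.vector(fit_r$beta), as.vector(fit_rc$beta), tolerance = 1e-6)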