library(ggplot2)
library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-10
library(cowplot)
## 
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggplot2':
## 
##     ggsave
set.seed(113)
Coef = rnorm(5, 0, sd=1)
X = matrix(rnorm(500, 0, sd=1), nrow = 5, ncol = 100)
Y = apply(X, 2, function(x) Coef%*%x)
df = data.frame(x = c("a", "b", "c", "d", "e"), coef = Coef, Category="True")
ggplot(df, aes(x=x, y=coef, fill=Category)) + geom_bar(stat = "identity") + theme_bw() +
  xlab("") + ylab("Coefficients")

Reg_analysis <- function(Xtrain, Ytrain, Xtest, Ytest, Ronly = FALSE) {
  # Ridge fit (alpha = 0); predictions are extracted at (essentially) zero penalty
  model = glmnet(t(Xtrain), Ytrain, alpha=0)
  Ypred = predict(model, newx = t(Xtest), s=0)

  df = data.frame(True = Ytest, Predict = as.numeric(Ypred))
  dfcoef = rbind(
    data.frame(Var = c("a", "b", "c", "d", "e"), Coef = Coef, Category="True"),
    data.frame(Var = c("a", "b", "c", "d", "e"), Coef = coef.glmnet(model, s=0)[2:6], Category="Trained")
            )
  dfcoef$Category = factor(dfcoef$Category)
  plots = list()

  # R squared on the test set (clamped at zero), using the variance of the true test values
  err = max(1 - sum((Ypred-Ytest)^2)/sum((Ytest-mean(Ytest))^2), 0)
  if (!Ronly) {
    plots[[1]] = ggplot(df, aes(y=True, x=Predict)) + geom_point() + xlab("Predicted Y") + ylab("True Y") + 
      annotate("text", x=min(df$Predict)*0.5, y=max(df$True), label=paste0("R squared = ", as.character(round(err, digits = 2)))) + coord_cartesian(xlim = c(min(Ytest), max(Ytest)))
    plots[[2]] = ggplot(dfcoef, aes(x=Var, y=Coef, fill=Category)) + geom_bar(stat = "identity", position = "dodge") +
      ylab("Coefficients")
    return(plots)
  } else{
    return(err)
  }
}
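
The helper fits a ridge model (alpha = 0) and evaluates it at s = 0, so it behaves like (near-)unpenalized least squares. It returns either two ggplot panels (predicted vs. true values, and true vs. trained coefficients) or, with Ronly = TRUE, only the R squared. A minimal usage sketch on the simulated data above, assuming a 50/50 split into training and test:

p = Reg_analysis(X[, 1:50], Y[1:50], X[, 51:100], Y[51:100])
plot_grid(plotlist = p, ncol = 2)
Reg_analysis(X[, 1:50], Y[1:50], X[, 51:100], Y[51:100], Ronly = TRUE)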

Training the model without noise but with an increasing number of dimensions (random variables)

Only 10 training samples and 200 validation samples are used.

Nfeats = c(0,2,3,4,5,6)
Rs = list()

Xtrain = matrix(rnorm(50, 0, sd=1), nrow = 5, ncol = 10)
Ytrain = apply(Xtrain, 2, function(x) Coef%*%x)
Xextra = matrix(rnorm(1000, 0, sd=1), nrow = 100, ncol = 10)

Xtest = matrix(rnorm(1000, 0, sd=1), nrow = 5, ncol = 200)
Ytest = apply(Xtest, 2, function(x) Coef%*%x)
Xextra_test = matrix(rnorm(100*200, 0, sd=1), nrow = 100, ncol=200)

for (Nfeat in Nfeats) {
  
  # seq_len(Nfeat) handles Nfeat = 0 correctly (1:0 would wrongly select row 1)
  R <- Reg_analysis(rbind(Xtrain, Xextra[seq_len(Nfeat), , drop = FALSE]), Ytrain,
                    rbind(Xtest, Xextra_test[seq_len(Nfeat), , drop = FALSE]), Ytest)[[1]] + 
    ggtitle(paste0("dimension = ", as.character(Nfeat + 5))) + theme(aspect.ratio=1)
  Rs[[length(Rs)+1]] = R
#  plot(R)
}

plot_grid(plotlist = Rs, ncol=2)
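
The same comparison can be reduced to numbers by requesting only the R squared (a sketch reusing the objects above):

sapply(Nfeats, function(Nfeat)
  Reg_analysis(rbind(Xtrain, Xextra[seq_len(Nfeat), , drop = FALSE]), Ytrain,
               rbind(Xtest, Xextra_test[seq_len(Nfeat), , drop = FALSE]), Ytest, Ronly = TRUE))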

Training the model with a varying number of training samples

Ntrains = c(10,50, seq(100,1000, by = 50))
Nfeats = c(5,10,15,20,25,30,50,100,500)
Noises = c(0,1,2,5)

Xtrain = matrix(rnorm(5 * max(Ntrains), 0, sd=1), nrow = 5, ncol = max(Ntrains))
Ytrain = apply(Xtrain, 2, function(x) Coef%*%x)
Xextra = matrix(rnorm(max(Nfeats)*max(Ntrains), 0, sd=1), nrow = max(Nfeats), ncol = max(Ntrains))

Xtest = matrix(rnorm(1000, 0, sd=1), nrow = 5, ncol = 200)
Ytest = apply(Xtest, 2, function(x) Coef%*%x)
Xextra_test = matrix(rnorm(max(Nfeats)*200, 0, sd=1), nrow = max(Nfeats), ncol=200)

df = data.frame()
for (Noise in Noises) {
  Ytrain_use = Ytrain + rnorm(max(Ntrains), 0, Noise)
  for (Ntrain in Ntrains){
    for (Nfeat in Nfeats){
      R <- Reg_analysis(rbind(Xtrain[,1:Ntrain],Xextra[1:Nfeat,1:Ntrain]), Ytrain_use[1:Ntrain], 
                        rbind(Xtest,Xextra_test[1:Nfeat, ]), Ytest, Ronly = TRUE)
      df = rbind(df, data.frame(R = R, Ntrain = Ntrain, dimension = Nfeat, Noise = Noise))
    }
  }
#  plot(R)
}

#plot_grid(plotlist = Rs, ncol=2)
df$dimension = factor(df$dimension)

for (Noise in Noises){
  plot(
    ggplot(df[df$Noise==Noise,], aes(x=Ntrain, y=R, col=dimension)) +
      geom_smooth(se=FALSE) + ggtitle(paste0("Noise level = ", as.character(Noise))) + 
      xlab("Number of training data") + ylab("Predictive performance (R squared)")
  )
}
## `geom_smooth()` using method = 'loess'
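
The four panels can also be drawn in a single call by faceting on the noise level (an alternative sketch using the same df):

ggplot(df, aes(x = Ntrain, y = R, col = dimension)) +
  geom_smooth(se = FALSE) +
  facet_wrap(~ Noise, labeller = label_both) +
  xlab("Number of training data") + ylab("Predictive performance (R squared)")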