# Load the splines package (for ns()); library() stops with an error if the
# package is missing, whereas require() only returns FALSE and can let the
# script continue into a confusing failure later on.
library(splines)
# Clear the workspace so the demo starts from a clean slate.
# NOTE(review): rm(list = ls()) is discouraged outside classroom scripts —
# it wipes the user's workspace whenever this file is source()d.
rm(list = ls())
# Fix the RNG seed so every run of the demo produces the same data and plots.
seed <- 12345
set.seed(seed)
# This script uses code made available at
# https://www.r-bloggers.com/cross-validation-for-predictive-analytics-using-r/
################################# MAIN CODE ###########################################
cat("
-------------------------------------------------------------------------------------
Welcome to the first ML demo/homework (year 2017): statistical learning / overfitting
Today we are going to study the problem of overfitting and some ways to overcome it.
Let us first generate some data and try to fit a few models on them.
")
invisible(readline(prompt = "Press [enter] to continue"))
# Simulate a noisy polynomial dataset.
#
# Draws n predictor values uniformly on [0, 100] and generates responses from
# the polynomial with coefficients `beta` (intercept first) plus Gaussian
# noise with standard deviation `sigma_eps`.
#
# Returns a data.frame with columns x (sorted ascending) and y.
gen_data <- function(n, beta, sigma_eps) {
  # Keep the original RNG call order (noise first, then predictors) so that
  # results are reproducible across versions of this script.
  noise <- rnorm(n, 0, sigma_eps)
  x_vals <- sort(runif(n, 0, 100))
  # Raw polynomial design matrix with an explicit intercept column.
  design <- cbind(1, poly(x_vals, degree = length(beta) - 1, raw = TRUE))
  data.frame(x = x_vals, y = as.numeric(design %*% beta + noise))
}
# Fit the models -------------------------------------------------------------
# For each of n_rep replicated training sets, fit one natural-spline
# regression per flexibility level df = 1, ..., n_df.
n_rep <- 100                      # number of replicated training sets
n_df <- 30                        # maximum spline degrees of freedom
df <- 1:n_df                      # flexibility levels to try
beta <- c(5, -0.1, 0.004, -3e-05) # true polynomial coefficients
n_train <- 50                     # training-set size
n_test <- 1000                    # test-set size
sigma_eps <- 0.5                  # noise standard deviation
xy <- res <- list()
# A single common test set shared by every replication.
xy_test <- gen_data(n_test, beta, sigma_eps)
for (i in seq_len(n_rep)) {
  xy[[i]] <- gen_data(n_train, beta, sigma_eps)
  x <- xy[[i]][, "x"]
  y <- xy[[i]][, "y"]
  # lapply over the df vector replaces the original apply(t(df), 2, ...)
  # trick; both produce a plain list of lm fits, indexed by df value.
  res[[i]] <- lapply(df, function(degf) lm(y ~ ns(x, df = degf)))
}
# Plot one replication: data, true curve, and three fitted splines -----------
plot_idx <- 1  # which replication to display (was `=`; use `<-` for assignment)
x <- xy[[plot_idx]]$x
# Design matrix of the true polynomial, evaluated at this replication's x.
X <- cbind(1, poly(x, degree = (length(beta) - 1), raw = TRUE))
y <- xy[[plot_idx]]$y
plot(y ~ x, col = "gray", lwd = 2)
lines(x, X %*% beta, lwd = 3, col = "black")
# NOTE(review): the indices 1, 5 and 25 below are hard-coded and assume
# n_df >= 25; adjust them together with the legend if n_df changes.
lines(x, fitted(res[[plot_idx]][[1]]), lwd = 3, col = "palegreen3")
lines(x, fitted(res[[plot_idx]][[5]]), lwd = 3, col = "darkorange")
lines(x, fitted(res[[plot_idx]][[25]]), lwd = 3, col = "steelblue")
legend(x = "topleft", legend = c("True function", "Linear fit (df = 1)", "Best model (df = 5)",
       "Overfitted model (df = 25)"), lwd = rep(3, 4),
       col = c("black", "palegreen3", "darkorange", "steelblue"),
       text.width = 32, cex = 0.85)
# Explain the first plot and pose the first set of questions.
# (Text fix: "squared sum of residuals" would mean (sum r)^2; the residual
# sum of squares is the "sum of squared residuals".)
cat("
In the plot you can see data points generated by a polynomial regression model (the
true function is plotted in black) and three piecewise-cubic spline models with different
flexibility (expressed in terms of degrees of freedom of the model - see \"?ns\").
Read the code and play with the parameters, then answer the following questions:
- How many datasets are generated and what is their size?
- How many models are fitted?
- Knowing that the variable res[[plot_idx]][[i]]$residuals contains the residuals
of the fitted model with df=i, find the model which has the smallest sum of
squared residuals. Plot it and interpret the results: is this the model which would
describe your data in the best possible way?
Let us now compute training and test errors on all our synthetically generated
datasets, and see how they change w.r.t. the increasing flexibility of our models.
")
invisible(readline(prompt = "Press [enter] to continue"))
# Compute the training and test errors for each model ------------------------
pred <- list()
mse <- te <- matrix(NA, nrow = n_df, ncol = n_rep)
for (i in seq_len(n_rep)) {
  # Training MSE = RSS / n for each fitted model (vapply pins the return type,
  # unlike the original sapply).
  mse[, i] <- vapply(res[[i]], function(obj) deviance(obj) / nobs(obj),
                     numeric(1))
  # One column of test-set predictions per df value (n_test x n_df matrix,
  # the same shape the original mapply call produced; its callback declared a
  # `degf` argument that it never used, so it is dropped here).
  pred[[i]] <- vapply(res[[i]],
                      function(obj) predict(obj, data.frame(x = xy_test$x)),
                      numeric(n_test))
  # Test MSE per df value, column by column.
  te[, i] <- apply(pred[[i]], 2, function(y_hat) mean((xy_test$y - y_hat)^2))
}
# Compute the average training and test errors over the n_rep replications.
av_mse <- rowMeans(mse)
av_te <- rowMeans(te)
# Plot the errors ------------------------------------------------------------
plot(df, av_mse, type = "l", lwd = 2, col = gray(0.4), ylab = "Prediction error",
     xlab = "Flexibility (spline's degrees of freedom [log scaled])", ylim = c(0,
         1), log = "x")
# NOTE(review): sigma_eps is the noise *standard deviation*; the irreducible
# MSE is sigma_eps^2 (= 0.25 here). Confirm whether this reference line is
# meant to mark the sd or the irreducible error.
abline(h = sigma_eps, lty = 2, lwd = 0.5)
# Per-replication curves: test errors first (pink), then training errors
# (light gray) drawn on top — the drawing order is deliberate.
for (i in seq_len(n_rep)) {
  lines(df, te[, i], col = "lightpink")
}
for (i in seq_len(n_rep)) {
  lines(df, mse[, i], col = gray(0.8))
}
# Redraw the averages on top of the individual curves.
lines(df, av_mse, lwd = 2, col = gray(0.4))
lines(df, av_te, lwd = 2, col = "darkred")
# Highlight three models: df = 1 (green), the df minimizing the average test
# error (orange), and df = 25 (blue).
# NOTE(review): the index 25 is hard-coded and assumes n_df >= 25.
points(df[1], av_mse[1], col = "palegreen3", pch = 17, cex = 1.5)
points(df[1], av_te[1], col = "palegreen3", pch = 17, cex = 1.5)
best_df <- which.min(av_te)  # hoisted: the original recomputed this 4 times
points(df[best_df], av_mse[best_df], col = "darkorange", pch = 16, cex = 1.5)
points(df[best_df], av_te[best_df], col = "darkorange", pch = 16, cex = 1.5)
points(df[25], av_mse[25], col = "steelblue", pch = 15, cex = 1.5)
points(df[25], av_te[25], col = "steelblue", pch = 15, cex = 1.5)
legend(x = "top", legend = c("Training error", "Test error"), lwd = rep(2, 2),
       col = c(gray(0.4), "darkred"), text.width = 0.3, cex = 0.85)
# Explain the error plot and pose the second set of questions.
# (Text fixes only: "stop" -> "stops", "you" -> "your", and the awkward
# "Is still df=5 the best model?" reworded, twice.)
cat("
In the plot you can see training and test errors for the previously generated
datasets, and their average (in thicker lines). The orange dots show the value
of the df parameter where the test error is lowest. As you can see, while the
training error decreases when the flexibility grows, the test error first
decreases too, but after some point (on the right of the orange dot) it becomes
larger and larger.
- What caused that \"elbow\" in the plot, where the error suddenly drops
and then almost stops decreasing in the training set, while it starts growing
in the test set?
- Try to decrease the size of your training set to 20 points and look at the
two new pictures. Explain what happened (how did the curves change? Is df=5
still the best model?) and motivate it.
- Try to increase the size of your training set first to 100, then to 1000
points. Explain what happened (how did the curves change? Is df=5
still the best model?) and motivate it.
")
invisible(readline(prompt = "Press [enter] to continue"))
# Final open-ended questions for the homework.
# (Text fix only: "significative" -> "significant".)
cat("
*** Extending the problem ***
Perform more experiments on the whole script and try to answer these questions:
- try changing the seed used to initialize the random number generator and make
sure the behavior of the script is consistent. Do you see any significant
changes or can you reach the same conclusions even with other data?
- add one more parameter to the \"beta\" variable (e.g. -5e-8), thus increasing
the degree of the polynomial generating the data points. Is there anything
different in your results? Why?
- until now, we have talked about errors without ever referring to the smallest
possible one that we can get, that is the irreducible error. How large is it?
Update this code to estimate the irreducible error from your data, and verify
it is consistent with the noise that is synthetically added to your model.
")