# Start every run from a clean workspace
rm(list = ls())

# Set the seed to the last three digits of your student ID.
# The script refuses to run while the placeholder value (111) is in place.
seed <- 111
if (seed == 111) {
  stop("\nThe random generator seed is still set to its default value.\nEdit the script and change it with the last three digits of your student ID\n\n",
       call. = FALSE)
}
set.seed(seed)

# number of points in the dataset
dataset_size <- 40
############################### FUNCTION DEFINITION ###########################################
# fitAndEvalPoly
# given the variables x,y this function does the following:
# (1) it splits the dataset in training/testing (half and half)
# (2) it fits different polynomials on the training set (from degree 1 upwards)
# (3) it shows the fits graphically
# (4) it quantitatively evaluates a (different, more dense) set of polynomial functions,
# fitted on the training dataset, w.r.t. the test dataset, showing the MSE (mean
# squared error as a function of the degree of the polynomial)
fitAndEvalPoly <- function(x, y) {
  cat("We will now divide our dataset into two subsets, training and test.\n")
  cat("We will then try to fit different models on the training set. All of them are polynomials,\n")
  cat("starting from order 1 (a linear model) up to order 11. Finally, we will evaluate these\n")
  cat("functions --in terms of Mean Squared Error-- both on the training and on the test datasets.\n\n")
  invisible(readline(prompt = "Press [enter] to continue"))

  # Half/half split: the first half of the points goes to training.
  # The points are generated i.i.d., so this is effectively a random split.
  dataset_size <- length(x)
  train <- rep(FALSE, dataset_size)
  train[seq_len(floor(dataset_size / 2))] <- TRUE

  # Fit six polynomials of increasing (odd) degree on the training set
  fit1 <- lm(y ~ poly(x, 1, raw = TRUE), subset = train)
  fit2 <- lm(y ~ poly(x, 3, raw = TRUE), subset = train)
  fit3 <- lm(y ~ poly(x, 5, raw = TRUE), subset = train)
  fit4 <- lm(y ~ poly(x, 7, raw = TRUE), subset = train)
  fit5 <- lm(y ~ poly(x, 9, raw = TRUE), subset = train)
  fit6 <- lm(y ~ poly(x, 11, raw = TRUE), subset = train)
  # alternative: replace the highest-order polynomial with a smoothing spline
  # fit6 <- smooth.spline(x[train], y[train], df=10)

  # Plot the training points...
  plot(x[train], y[train], pch = 19)
  # ...and each fitted curve, evaluated on a dense grid of x values
  xx <- seq(min(x), max(x), length = 200)
  lines(xx, predict(fit1, data.frame(x = xx)), col = "black")
  lines(xx, predict(fit2, data.frame(x = xx)), col = "red")
  lines(xx, predict(fit3, data.frame(x = xx)), col = "green")
  lines(xx, predict(fit4, data.frame(x = xx)), col = "blue")
  lines(xx, predict(fit5, data.frame(x = xx)), col = "purple")
  lines(xx, predict(fit6, data.frame(x = xx)), col = "orange")
  # lines(xx, predict(fit6, xx)$y, col="orange")   # (spline alternative)
  legend("bottomright", NULL,
         c("y ~ x", "y ~ poly(x,3)", "y ~ poly(x,5)", "y ~ poly(x,7)", "y ~ poly(x,9)", "y ~ poly(x,11)"),
         lty = c(1, 1, 1, 1, 1, 1),
         col = c("black", "red", "green", "blue", "purple", "orange"))
  cat("Note that the higher the order of the polynomial, the better the function will fit the training data.\n")
  cat("However, increasing the order will yield two effects:\n")
  cat("(1) The MSE in the training set will become smaller... \n")
  cat("(2) ... but the function will tend to overfit the data!\n\n")
  cat("As a result, the MSE in the *test* set will tend to increase instead of decreasing.\n")
  cat("In the following code, we will calculate MSE for both training and test sets,\n")
  cat("for increasing order of the polynomial function. We will then plot them as a function\n")
  cat("of the order of the polynomial.\n\n")
  invisible(readline(prompt = "Press [enter] to continue"))

  # Evaluate all polynomial degrees in MSE.steps (1 to maxOrder).
  # You can change these parameters to make the plot more readable
  # (as it might greatly vary according to the random seed you used):
  # e.g. reduce the highest order to see smaller differences, or choose
  # only odd orders to see more results in the same number of steps.
  maxOrder <- 5
  MSE.steps <- seq(1, maxOrder)
  # Preallocate the error vectors and index them by position (not by
  # degree), so they stay dense even when MSE.steps is not 1,2,3,...
  MSE.train <- numeric(length(MSE.steps))
  MSE.test <- numeric(length(MSE.steps))
  for (k in seq_along(MSE.steps)) {
    fit <- lm(y ~ poly(x, MSE.steps[k], raw = TRUE), subset = train)
    MSE.train[k] <- mean((predict(fit, data.frame(x = x[train])) - y[train])^2)
    MSE.test[k] <- mean((predict(fit, data.frame(x = x[!train])) - y[!train])^2)
    # alternative (smoothing spline):
    # fit <- smooth.spline(x[train], y[train], df=MSE.steps[k])
    # MSE.train[k] <- mean((predict(fit, x[train])$y - y[train])^2)
    # MSE.test[k] <- mean((predict(fit, x[!train])$y - y[!train])^2)
  }
  # y axis spans the range of *both* curves so neither one is clipped
  plot(MSE.steps, MSE.train,
       ylim = c(min(c(MSE.train, MSE.test)), max(c(MSE.train, MSE.test))),
       col = "blue")
  lines(MSE.steps, MSE.train, col = "blue")
  points(MSE.steps, MSE.test, col = "red")
  lines(MSE.steps, MSE.test, col = "red")
  legend("topleft", NULL, c("training MSE", "test MSE"), lty = c(1, 1), col = c("blue", "red"))
  # Report the degree itself (not the index into the vector), so the
  # output stays correct even if MSE.steps is customized
  cat("Minimum training error: ", min(MSE.train), " for degree=", MSE.steps[which.min(MSE.train)], "\n")
  cat("Minimum test error: ", min(MSE.test), " for degree=", MSE.steps[which.min(MSE.test)], "\n")
  cat("Check how the MSE changes as a function of the increasing flexibility (power of the polynomial).\n")
  cat("(note that you can modify the number and the order of the polynomials taken into account\n")
  cat(" by this evaluation, so that your plot is more readable).\n\n")
  cat("- is the training MSE always decreasing?\n")
  cat("- is the test MSE always decreasing?\n")
  cat("- is the training MSE always lower than the test MSE?\n")
  cat("For each of the above questions, can you also tell *why*?\n")
}
# fitAndEvalSpline
# Same train/test experiment as fitAndEvalPoly, but the models are
# smoothing splines of increasing flexibility (degrees of freedom)
# instead of polynomials of increasing order.
fitAndEvalSpline <- function(x, y) {
  invisible(readline(prompt = "Press [enter] to continue"))

  # Half/half split, same scheme as fitAndEvalPoly
  dataset_size <- length(x)
  train <- rep(FALSE, dataset_size)
  train[seq_len(floor(dataset_size / 2))] <- TRUE

  # Fit four smoothing splines of increasing flexibility (df)
  fit1 <- smooth.spline(x[train], y[train], df = 2)
  fit2 <- smooth.spline(x[train], y[train], df = 4)
  fit3 <- smooth.spline(x[train], y[train], df = 8)
  fit4 <- smooth.spline(x[train], y[train], df = 16)

  # Plot the training points...
  plot(x[train], y[train], pch = 19)
  # ...and each fitted curve, evaluated on a dense grid of x values
  xx <- seq(min(x), max(x), length = 200)
  lines(xx, predict(fit1, xx)$y, col = "black")
  lines(xx, predict(fit2, xx)$y, col = "red")
  lines(xx, predict(fit3, xx)$y, col = "green")
  lines(xx, predict(fit4, xx)$y, col = "blue")
  legend("bottomright", NULL, c("df=2", "df=4", "df=8", "df=16"),
         lty = c(1, 1, 1, 1), col = c("black", "red", "green", "blue"))
  cat("Note that, again, the higher the flexibility (df=degrees of freedom), the better \n")
  cat("the function will fit the training data.\n\n")
  invisible(readline(prompt = "Press [enter] to continue"))

  # Evaluate train/test MSE for every df in MSE.steps (2 to maxOrder)
  maxOrder <- 16
  MSE.steps <- seq(2, maxOrder)
  # Preallocate instead of growing the vectors with c() inside the loop
  MSE.train <- numeric(length(MSE.steps))
  MSE.test <- numeric(length(MSE.steps))
  for (k in seq_along(MSE.steps)) {
    fit <- smooth.spline(x[train], y[train], df = MSE.steps[k])
    MSE.train[k] <- mean((predict(fit, x[train])$y - y[train])^2)
    MSE.test[k] <- mean((predict(fit, x[!train])$y - y[!train])^2)
  }
  # y axis spans the range of *both* curves (as in fitAndEvalPoly),
  # so neither curve can be clipped out of the plot
  plot(MSE.steps, MSE.train,
       ylim = c(min(c(MSE.train, MSE.test)), max(c(MSE.train, MSE.test))),
       col = "blue")
  lines(MSE.steps, MSE.train, col = "blue")
  points(MSE.steps, MSE.test, col = "red")
  lines(MSE.steps, MSE.test, col = "red")
  legend("topleft", NULL, c("training MSE", "test MSE"), lty = c(1, 1), col = c("blue", "red"))
  # Report the df value itself instead of reconstructing it from the
  # vector index (which.min(...) + 1 breaks if MSE.steps is changed)
  cat("Minimum training error: ", min(MSE.train), " for df=", MSE.steps[which.min(MSE.train)], "\n")
  cat("Minimum test error: ", min(MSE.test), " for df=", MSE.steps[which.min(MSE.test)], "\n")
  cat("Check how the MSE changes as a function of the increasing flexibility (df of the spline).\n")
  cat("- is the training MSE always decreasing?\n")
  cat("- is the test MSE always decreasing?\n")
  cat("- is the training MSE always lower than the test MSE?\n")
  cat("For each of the above questions, can you also tell *why*?\n")
}
################################# MAIN CODE ###########################################
cat("\n\n-------------------------------------------------------------------------------------\n",
    "Welcome to the first PAMI homework.\n\n",
    "The aim of this homework is to show you, with some examples, the problem of\n",
    "data overfitting. To do this, we will work with simple two-variables datasets.\n",
    "In the first dataset, the predictor x is generated with a random distribution\n",
    "with mean 0 and variance 1, while the response y linearly depends on x.\n",
    sep = "")

# First dataset: x ~ N(0, 1); y depends linearly on x, plus noise
x <- rnorm(dataset_size, 0, 1)
y <- x + rnorm(dataset_size, 0, 1)
cat("The two variables are highly correlated with cor(x,y) =", cor(x, y), "\n\n")
invisible(readline(prompt = "Press [enter] to plot the dataset"))
plot(x, y, pch = 19)
fitAndEvalPoly(x, y)
cat("\n\n-------------------------------------------------------------------------------------\n",
    "Let us now fit another dataset, where the relationship between x and y is quadratic.\n",
    sep = "")
# Second dataset: the response depends on the square of the predictor
x2 <- rnorm(dataset_size, 0, 1)
y2 <- x2^2 + x2 + rnorm(dataset_size, 0, 2)
invisible(readline(prompt = "Press [enter] to plot the dataset"))
plot(x2, y2, pch = 19)
fitAndEvalPoly(x2, y2)
cat("\n\n-------------------------------------------------------------------------------------\n",
    "Let us now fit another dataset, where the relationship between x and y is cubic.\n",
    sep = "")
# Third dataset: cubic relationship (note that x and y are overwritten)
x <- rnorm(dataset_size, 0, 1)
y <- x^3 + 3 * x^2 - 2 * x + rnorm(dataset_size, 0, 2)
invisible(readline(prompt = "Press [enter] to plot the dataset"))
plot(x, y, pch = 19)
fitAndEvalPoly(x, y)
cat("\n\n-------------------------------------------------------------------------------------\n",
    "Up to now, we have only tried to fit *polynomials* and witnessed that higher order brings\n",
    "higher flexibility, but also worse cases of overfitting.\n",
    "Can we further increase flexibility? And in this case what happens?\n",
    "To verify this, we will try *smoothing splines* on one of the previous datasets\n\n",
    sep = "")
# Reuse the "quadratic dataset" (x2, y2) generated above, which was built as:
# x2 <- rnorm(dataset_size, 0, 1)
# y2 <- x2^2 + x2 + rnorm(dataset_size, 0, 2)
fitAndEvalSpline(x2, y2)