# Make sure you have all the following libs installed. If not, use install.packages()
# to get them.
library(ggplot2)
library(grid)

rm(list=ls())

# Set the seed to the last three digits of your student ID
seed = 111
if (seed == 111) {
  stop("\nThe random generator seed is still set to its default value.\nEdit the script and change it to the last three digits of your student ID\n\n",
       call. = FALSE)
}
set.seed(seed)

# Add other global variable initializations here

################################# FUNCTION DEFINITION #######################################

# Monte Carlo estimate of the ratio between the volume of the unit (hyper)sphere
# and the volume of its bounding (hyper)box [-1,1]^dim: draw N points uniformly
# in the box and count the fraction that falls inside the sphere.
vol_of_sphere <- function(dim, N) {
  m <- matrix(runif(dim*N, -1, 1), ncol=dim)
  sum(sqrt(rowSums(m^2)) <= 1) / N
}

# Draw N standard-normal points in 'dim' dimensions and return the average,
# minimum and maximum Euclidean distance from the origin.
rand_dist_origin <- function(N, dim) {
  m <- matrix(rnorm(N*dim), ncol=dim)
  v <- sqrt(rowSums(m^2))
  c(avg = mean(v), min = min(v), max = max(v))
}

# Predict with a fitted LDA model, show the confusion matrix and return the
# accuracy (used later in the homework).
perform_lda_prediction <- function(fit, data, labels) {
  pred = predict(fit, as.data.frame(data))
  class = pred$class
  print(table(class, labels))
  mean(class == labels)
}

################################# MAIN CODE ###########################################

cat("
-------------------------------------------------------------------------------------

 Welcome to the second PAMI demo/homework (year 2016): curse of dimensionality

 The \"curse of dimensionality\" is a phenomenon (or, better, a family of phenomena)
 occurring when the dimensionality of your problem, e.g. the number of features
 that characterize your observations, increases. When this happens, data tends to
 become sparse, making it difficult for any method that requires statistical
 significance to work properly.

 Let us try a few examples from http://www.joyofdata.de/blog/curse-dimensionality/
 and comment on them.

")
invisible(readline(prompt = "Press [enter] to continue"))

N = 1000000   # number of points filling the hyperbox, decrease if too slow

df <- data.frame(
  dim = 1:15,
  v   = sapply(1:15, function(dim) vol_of_sphere(dim, N))
)

ggplot(df, aes(dim, v)) +
  geom_point(colour=I("red"), size=I(5), alpha=I(0.5)) +
  geom_line(colour=I("red"), size=I(.2), alpha=I(0.5)) +
  scale_x_continuous(breaks=1:15) +
  labs(title="Ratio between volume of sphere and its containing box",
       x="#dimensions", y="volume ratio") +
  scale_y_continuous(breaks=0:10/10) +
  theme(axis.text.x = element_text(colour="black"),
        axis.text.y = element_text(colour="black"),
        axis.title.x = element_text(vjust=-0.5, size=15),
        axis.title.y = element_text(vjust=-0.2, size=15),
        plot.margin = unit(c(1,1,1,1), "cm"),
        plot.title = element_text(vjust=2, size=17))

cat("
 This first example shows how the ratio between the volume of a (hyper)sphere of
 radius 1 and the volume of its containing (hyper)box changes as the number of
 dimensions increases.

 Answer the following questions:

 Q1) How is this ratio calculated? Look at the code and explain how the volumes
     of the sphere and the hypercube are found.

 Q2) What is the actual meaning of this plot? What does it mean for a point in
     the box to fall, or not, within the sphere too?

")
invisible(readline(prompt = "Press [enter] to continue"))
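# --- Added sanity check (not in the original demo) ---
# The ratio estimated above by Monte Carlo is known in closed form: the unit
# d-ball has volume pi^(d/2) / gamma(d/2 + 1) and the box [-1,1]^d has volume
# 2^d, so the exact ratio is pi^(d/2) / (gamma(d/2 + 1) * 2^d). The helper
# exact_ratio() below is introduced here for illustration only; the two rows
# should agree up to Monte Carlo noise.
exact_ratio <- function(dim) pi^(dim/2) / (gamma(dim/2 + 1) * 2^dim)
round(rbind(monte_carlo = df$v, exact = exact_ratio(1:15)), 4)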
Modify the \"dim\" variable and comment the results of the summary() command ") # try to run the following code manually, change dim and comment the results of summary(d) dim = 2 m <- matrix(runif(dim*N,-1,1), ncol=dim) d = sqrt(rowSums(m^2)) summary(d) invisible(readline(prompt = "Press [enter] to continue")) cat(" Let us perform the same operation more consistently, generating random datasets with increasing dimensionality and checking how min, max, and average distance change. ") max_dim <- 100 N = 100000 v <- mapply(rand_dist_origin, rep(100000,max_dim-1), 2:max_dim) df <- data.frame( dim = 2:max_dim, aggregation = as.factor(c(rep("min", length(v["min",])), rep("max", length(v["max",])), rep("avg", length(v["avg",])))), v = c(v["min",], v["max",], v["avg",]) ) ggplot(df,aes(dim,v)) + geom_point(aes(colour=aggregation), size=I(3), alpha=I(0.6)) + geom_line(aes(colour=aggregation), size=I(.2), alpha=I(0.5)) + scale_x_discrete(breaks=1:10*10) + labs(title="Statistics for Distances of Std Norm Distributed Points from 0", x="dimension", y="aggregate value of distances") + scale_y_continuous(breaks=0:max(v)) + theme(axis.text.x = element_text(colour="black"), axis.text.y = element_text(colour="black"), axis.title.x = element_text(vjust=-0.5, size=15), axis.title.y = element_text(vjust=-0.2, size=15), plot.margin = unit(c(1,1,1,1), "cm"), plot.title = element_text(vjust=2, size=17)) + annotate("text", x=0, y=Inf, label="(joyofdata.de)", vjust=1.5, hjust=-.1, size=4) cat(" Q4) What happens when dimensionality increases? Note that the difference between max and min roughly remains constant... But what happens to the *relative difference* (i.e. (max-min)/min)? Try to plot it together with the other data and explain the meaning of this result. Q5) Finally, consider a classification problem (e.g. a set of 100 \"email\" observations you need to classify as either \"spam\" or \"not spam\") and explain how dimensionality might affect its results. Given the same amount of observations, is it better to have more or less features? What happens when you have many of them? How can you address potential problems due to the curse of dimensionality? ")