library(pracma)
library(MASS)
library(class)

rm(list=ls())

################################# MAIN CODE ###########################################

cat("
-------------------------------------------------------------------------------------
Welcome to the third PAMI demo/homework (year 2016): classification

In this demo we will learn to do classification (\"manually\" and using ready-made
R functions) on a very simple dataset. The purpose of the demo is both to show you
how to solve classification problems in R and to allow you to assess your knowledge
about this topic, using some questions which are similar to the ones usually asked
during the written exam. For this reason, the size of the problem is very small, so
you should be able to perform all your calculations on paper and then use the
results you get in R to verify you did everything right.

Let us first generate the dataset.
")
invisible(readline(prompt = "Press [enter] to continue"))

# prepare data
X1 = c(1,1,1.5,2,2,2,2,3,2.5,2.5,3,4,4,4,5,5)
X2 = c(1,3,2,1,2,3,4,2,2.5,5.5,3.5,2.5,3.5,4.5,3.5,4.5)
Y  = c(1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2)
m = data.frame(cbind(X1,X2,Y))
plot(m[,1:2])
points(m[Y==2,1:2], col='red')

cat("
The total number of observations is 16, evenly divided in two classes (1=black
and 2=red). Observations are 2-dimensional, but to explain the calculations
step by step we will start by taking into account just the first dimension.
Below you can see the values of the variable X1 and the labels that have been
assigned to the observations:
")
print(rbind(X1,Y))
invisible(readline(prompt = "Press [enter] to continue"))

# split data to easily access the different classes
# m1 are all the 2D points in class 1, m2 are the 2D points in class 2
m1 = m[m$Y==1,1:2]
m2 = m[m$Y==2,1:2]
n  = dim(m)[1]
n1 = dim(m1)[1]
n2 = dim(m2)[1]

# p values are the priors (the probability that an element belongs to a given class k)
p = as.matrix(c(n1/n, n2/n))

# m1_ and m2_ represent our observations in 1D (using only the X1 values)
m1_ = m1[,1]
m2_ = m2[,1]

cat("
Let us now perform LDA on this 1-dimensional dataset. If you check out the lab
material on LDA (http://davide.eynard.it/teaching/2016_PAMI/Lab10.pdf) you will
see that an observation x is classified with the class k that returns the
highest *discriminant function*

    delta_k(x) = x * mu_k/sigma^2 - mu_k^2/(2*sigma^2) + log(pi_k)

where:

    mu_k     are the estimated means of the observations for each class k
    sigma^2  is the estimated variance of the class distributions (note that
             we assume the variance is the same for all classes - we can
             approximate a common variance from the data using the pooled
             variance: see the Lab10 notes)
    pi_k     are the priors for each class k (n_k/n)

Q1) Calculate all the parameters you need for LDA classification (mu_k,
    sigmasq, and pi_k). Try to do that manually first, then write the R code
    to do that below and verify that your results are consistent.
")
invisible(readline(prompt = "Press [enter] to continue"))

# calculate means for m1_ and m2_ (COMPLETE THE CODE)
# mu1 = ...
# mu2 = ...
# one possible completion, so the demo runs end to end:
mu1 = mean(m1_)
mu2 = mean(m2_)

# calculate pooled variance (COMPLETE THE CODE)
# sigmasq = ...
# one possible completion, using the pooled variance from the Lab10 notes:
sigmasq = ((n1-1)*var(m1_) + (n2-1)*var(m2_)) / (n-2)

# pi_k values have already been calculated as p[k] above

cat("
The previous formula for the discriminant function is linear in x
(i.e. a_k * x + b_k).

Q2) Calculate now the a_k, b_k parameters that you will use within the
    discriminant functions later.
")
invisible(readline(prompt = "Press [enter] to continue"))

# COMPLETE THE CODE
# a1 = ...
# a2 = ...
# b1 = ...
# b2 = ...
# one possible completion: matching delta_k(x) = a_k*x + b_k against the
# discriminant above gives a_k = mu_k/sigma^2 and
# b_k = -mu_k^2/(2*sigma^2) + log(pi_k)
a1 = mu1/sigmasq
a2 = mu2/sigmasq
b1 = -mu1^2/(2*sigmasq) + log(p[1])
b2 = -mu2^2/(2*sigmasq) + log(p[2])
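# As a quick sanity check (not part of the original assignment), we can
# evaluate both discriminant functions at a single observation and verify that
# the larger delta picks the expected class: the leftmost point x = 1 clearly
# lies among the class-1 observations, so delta_1(1) should exceed delta_2(1).
x0 = 1
cat(sprintf("delta_1(%g) = %.4f, delta_2(%g) = %.4f\n",
            x0, a1*x0 + b1, x0, a2*x0 + b2))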
# the label k assigned to an observation x is the one that corresponds to the
# biggest delta_k(x)
cat("
Classification according to your LDA implementation returns the following labels:
")
lbls = max.col(cbind(a1 * m[,1] + b1, a2 * m[,1] + b2))
print(lbls)

cat("
... and here are both the table comparing labels and ground truth and the
classification accuracy:
")
print(table(lbls,Y))
print(mean(lbls == Y))

cat("
Q3) What is the result of your classification? Are there any wrong labels
    (and in that case can you explain why)? What is the accuracy?

Q4) What is the value of the boundary between classes 1 and 2? How do you
    calculate it?
")
invisible(readline(prompt = "Press [enter] to continue"))

cat("
Let us now perform LDA on the full 2D dataset. As some calculations (e.g.
matrix inversion) are not trivial to do manually, feel free to use the help
of R ;-)
")

# 2D case: get means (COMPLETE THE CODE)
# mu1 = ...   # note that the code is different in 2D!
# mu2 = ...
# one possible completion: in 2D each class mean is a 2-element vector
mu1 = colMeans(m1)
mu2 = colMeans(m2)

# get covariances (COMPLETE THE CODE)
# s1 = ...    # note that the code is different in 2D!
# s2 = ...
# one possible completion: in 2D each class covariance is a 2x2 matrix
s1 = cov(m1)
s2 = cov(m2)

# calculate pooled covariance and its inverse (COMPLETE THE CODE)
# S = ...
# one possible completion, analogous to the 1D pooled variance:
S  = ((n1-1)*s1 + (n2-1)*s2) / (n-2)
Si = inv(S)

# calculate deltas: delta_k(x) = x' Si mu_k - 1/2 mu_k' Si mu_k + log(pi_k)
# (note that log(pi_k) is added, not subtracted; with equal priors the sign
# does not change the argmax, but the formula below follows the definition)
deltas = cbind(as.matrix(m[,1:2]) %*% Si %*% mu1 - as.double(1/2 * t(mu1) %*% Si %*% mu1) + log(p[1]),
               as.matrix(m[,1:2]) %*% Si %*% mu2 - as.double(1/2 * t(mu2) %*% Si %*% mu2) + log(p[2]))

cat("
Classification according to your LDA implementation returns the following labels:
")
lbls = max.col(deltas)
print(lbls)

cat("
... and here are both the table comparing labels and ground truth and the
classification accuracy:
")
print(table(lbls,Y))
print(mean(lbls == Y))

cat("
Now complete the source code of this script to run LDA using the built-in
function provided in R and show that your results are consistent with it.

Q5) What is the result of your classification? Are there any wrong labels
    (and in that case can you explain why)? What is the accuracy?
")
invisible(readline(prompt = "Press [enter] to continue"))

# complete and uncomment the code below to automatically run LDA
lda.fit = lda(Y ~ X1 + X2, data=m)
# lda.pred = predict(...)
# lda.class = ...
# table(lda.class,Y)
# mean(lda.class == Y)
# one possible completion, predicting on the training data itself:
lda.pred  = predict(lda.fit, m)
lda.class = lda.pred$class
print(table(lda.class, Y))
print(mean(lda.class == Y))
invisible(readline(prompt = "Press [enter] to continue"))

cat("
Until now we have just tried to understand how well our model fits the
original data (i.e. the training data). However, we did not conclude anything
about how well it will describe *new* data! Here are some new observations
(shown as filled dots in the plot).
")
X1 = c(1.5,2,2.5,3,3,4,4,4.5)
X2 = c(3.5,5,3.5,1,4.5,1.5,5.5,3.5)
Y  = c(1,1,1,1,2,2,2,2)
m.test = data.frame(cbind(X1,X2,Y))
plot(m[,1:2])
points(m[m[,3]==2,1:2], col='red')
points(m.test[,1:2], pch=20)
points(m.test[m.test[,3]==2,1:2], pch=20, col='red')

cat("
Q6) Run the LDA predictor you trained on the previous observations to
    classify the new set of observations and comment on the result. Is it
    what you expected? Are there any misclassifications and, if so, which
    ones and for what reason? Can you draw a (rough) classification boundary
    from the results you got? Can you actually calculate it, similarly to
    what you did in the 1D case?

NOTE: to classify the new test set, you can use the lda.fit variable you
obtained during training
")
# complete the following code:
# lda.pred = predict(...)
lda.pred  = predict(lda.fit, newdata = m.test[,1:2])
lda.class = lda.pred$class
print(table(lda.class, m.test[,3]))
print(mean(lda.class == m.test[,3]))
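# A minimal sketch (again, not part of the original assignment) of how the 2D
# decision boundary asked about in Q6 could be computed and drawn: setting
# delta_1(x) = delta_2(x) with the mu1, mu2, Si and p estimated above gives
# the line  w[1]*x1 + w[2]*x2 = c0,  which we overlay on the current plot.
w  = Si %*% (mu1 - mu2)                 # normal vector of the boundary line
c0 = as.double(1/2 * (t(mu1) %*% Si %*% mu1 - t(mu2) %*% Si %*% mu2)
               - log(p[1]/p[2]))        # offset term (zero for equal priors)
# rewrite as x2 = c0/w[2] - (w[1]/w[2])*x1 and draw it as a dashed line
abline(a = c0/w[2], b = -w[1]/w[2], lty = 2)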