# Interactive clustering demo/homework on the built-in USArrests dataset.
#
# The script walks through four steps, pausing for [enter] between them:
#   1. PCA (on standardized data) to get a 2D view of the 4D dataset,
#   2. K-Means for K = 1..10 and an "elbow" plot of the within-cluster SS,
#   3. complete-linkage hierarchical clustering cut into `num_clusters` groups,
#   4. per-cluster feature means, to help interpret the clusters.
#
# Objects left in the workspace: seed, pr.out, wss, scaled, hc, num_clusters,
# hc.clusters.

# NOTE(review): this wipes the caller's global workspace. It is kept only so
# the script's start-up behaviour is unchanged; consider removing it.
rm(list = ls())

# set the seed as the last three digits of your student ID
seed <- 12345
# The seed has to be handed to the RNG to have any effect: kmeans() below
# draws random starting centers, so without this call every run differs.
set.seed(seed)

################################# MAIN CODE ###########################################

# The message strings are passed to cat() as several arguments; cat() joins
# them with a single space, so the emitted text is one continuous paragraph.
cat(
  " -------------------------------------------------------------------------------------",
  "Welcome to the third ML demo/homework (year 2017): clustering",
  "[This homework is inspired by exercise 10.9 in the ISLR book, page 416]",
  "Today we are going to perform unsupervised learning on a well-known",
  "dataset, provided by default in R, called USArrests. The dataset has",
  "information about the number of assault, murder, and rape crimes",
  "(per 100,000) and the urban population (percent) in each of the 50 US",
  "States in 1973. You can find more information about it here:",
  "https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/USArrests.html",
  "Check the dataset contents on your own and answer the following questions:",
  "- how are the different variables distributed? Do they all have the same",
  "scale (mean/variance) or not?",
  "- what could be the problem if one variable had a very different scale",
  "from the others? How would you normalize the data to avoid this issue? "
)
invisible(readline(prompt = "Press [enter] to continue"))

cat(
  " As our data is 4-dimensional, there is no easy way to plot it as it is.",
  "However, using PCA we can reduce its dimensionality to something easier",
  "to visualize (e.g. 2D). The picture (displayed with the \"biplot\" command)",
  "shows the different states plotted on the first two principal components",
  "of the data. Red arrows point towards the directions where the four",
  "features grow. "
)

# use PCA to reduce dimensionality
# (`scale.` is the full argument name; `scale` only worked by partial matching)
pr.out <- prcomp(USArrests, scale. = TRUE)
biplot(pr.out, scale = 0)
# use summary to see what amount of the variance is covered by the first two PC
#summary(pr.out)
invisible(readline(prompt = "Press [enter] to continue"))

# Same projection as the biplot, drawn as a plain scatter plot of PC1 vs PC2.
plot(pr.out$x[, 1], pr.out$x[, 2])
cat(
  " Now you can see the same plot made with the plot() function. \nFor the",
  "sake of our cluster visualization, we are going to plot our data this way.",
  "- Does the data plotted in 2D exhibit any clustering tendency?",
  "Let us now run K-Means on these data to verify whether the",
  "\"elbow method\" gives us some meaningful results.",
  "We will now run K-Means 10 times, with K=1:10, calculate the within sum",
  "of squares for each run, and finally plot the results. "
)
invisible(readline(prompt = "Press [enter] to continue"))

# Total within-cluster sum of squares for K = 1..10 (on the unscaled data).
# `tot.withinss` is the sum of the per-cluster `withinss` values.
wss <- vapply(
  seq_len(10),
  function(k) kmeans(USArrests, centers = k, nstart = 100)$tot.withinss,
  numeric(1)
)
plot(seq_len(10), wss, type = "b", xlab = "Number of Clusters",
     ylab = "Within groups sum of squares")

cat(
  " What can you say about the plot? Can you spot an elbow or not? How can",
  "you interpret this behavior, also knowing how the dataset looks like from",
  "the previous plot? Let us now run the hierarchical clustering algorithm,",
  "with complete linkage and k=3, and comment the results of the",
  "clustering. "
)
invisible(readline(prompt = "Press [enter] to continue"))

scaled <- USArrests
# right now we are not scaling our data, but we will do this later with
# the following line:
# scaled = scale(USArrests)

# Complete-linkage hierarchical clustering on Euclidean distances.
hc <- hclust(dist(scaled), method = "complete")
plot(hc)

# Cut the dendrogram into `num_clusters` groups and color the PCA scatter
# plot by cluster membership.
num_clusters <- 3
hc.clusters <- cutree(hc, k = num_clusters)
plot(pr.out$x[, 1], pr.out$x[, 2], col = hc.clusters)

# check centroid properties
# (when the script is run through source() these values are not auto-printed;
# run the three lines at the console to inspect them)
colMeans(USArrests[hc.clusters == 1, ])
colMeans(USArrests[hc.clusters == 2, ])
colMeans(USArrests[hc.clusters == 3, ])

cat(
  " Look at the code we ran, check the results (don't forget the previous",
  "plot, showing the dendrogram), and answer the following questions:",
  "- how do the clusters look like? Can you give an interpretation to them?",
  "To interpret the cluster results, look at the properties of their",
  "centroids (in the code, run the rows where cluster means are calculated)",
  "- try running the same algorithm after scaling the results with mean=0",
  "and standard deviation 1 (see code). How do results change?",
  "- run the same algorithm, on scaled data, with k=4 instead. \nHow would",
  "you interpret the data this time? Can you tell what is better between",
  "using k=3 and k=4? "
)
invisible(readline(prompt = "Press [enter] to continue"))