rm(list=ls())

# set the seed as the last three digits of your student ID
seed = 111

if (seed == 111) {
  cat ("NOTE: to run this example you need to install and load two external packages.\n")
  cat ("To install them, execute the following two commands:\n\n")
  cat ("  install.packages(\"e1071\")\n")
  cat ("  install.packages(\"clue\")\n")
  stop("\nThe random generator seed is still set to its default value.\nEdit the script and change it to the last three digits of your student ID\n\n", call. = FALSE)
}

set.seed(seed)

################################# MAIN CODE ###########################################

library(e1071)
library(clue)

cat ("Welcome to the fourth (and last!) PAMI homework. This time we are going to test\n")
cat ("some clustering algorithms on the Iris flower dataset. This is a quite classical\n")
cat ("dataset, consisting of 50 samples from each of three species of Iris. For each sample,\n")
cat ("four features were measured. You can find more information about this dataset here:\n\n")
cat ("http://en.wikipedia.org/wiki/Iris_flower_data_set\n\n")
cat ("Let us now load the dataset (which is available by default in R) and look at its contents.\n")
invisible(readline(prompt = "Press [enter] to continue"))

data(iris)
print(iris)

cat ("The first four columns hold, respectively, the length and width of the sepal and petal of\n")
cat ("each sample. The fifth column holds the ground truth, i.e. the species the sample belongs to.\n")
cat ("Let us see what the data looks like: since it is 4-dimensional, we will plot all pairs of\n")
cat ("features and use the ground truth to see how the data is distributed in the different dimensions.\n")
invisible(readline(prompt = "Press [enter] to continue"))

classes = as.numeric(factor(iris$Species, levels = c("setosa", "versicolor", "virginica")))
plot(iris[,1:4], col = classes)

cat ("As you can see, while one cluster (Iris setosa) is well separated from the others,\n")
cat ("the other two are very close to each other. This means that some of the samples,\n")
cat ("despite belonging to two different species, have very similar features.\n")
invisible(readline(prompt = "Press [enter] to continue"))

cat ("Let us first run K-Means on these data. We actually know K already, but let us use\n")
cat ("it to verify whether the \"elbow method\" gives us some reasonable results.\n")
cat ("We will now run K-Means 10 times, with K=1:10, calculate the within-cluster sum of\n")
cat ("squares for each run, and finally plot the results.\n")
invisible(readline(prompt = "Press [enter] to continue"))

wss = numeric(10)
for (i in 1:10) wss[i] = sum(kmeans(iris[,1:4], centers = i, nstart = 100)$withinss)
plot(1:10, wss, type = "b", xlab = "Number of Clusters", ylab = "Within groups sum of squares")

cat ("What can you say about the plot? Can you spot an elbow or not? How can you interpret\n")
cat ("this behavior, also knowing what the dataset looks like from the previous plot?\n")
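
# ADDED ILLUSTRATION (a minimal sketch, not part of the original exercise): to make the elbow
# plot easier to interpret, recall what kmeans()$withinss measures, namely the sum of squared
# distances of each sample to the centroid of its own cluster. The check below recomputes that
# quantity by hand for K=3 and compares it with the value reported by kmeans(); only base R and
# the kmeans() call already used above are needed.
km.check = kmeans(iris[,1:4], centers = 3, nstart = 100)
wss.manual = sum(sapply(1:3, function(j) {
  pts = as.matrix(iris[km.check$cluster == j, 1:4])   # samples assigned to cluster j
  sum(sweep(pts, 2, km.check$centers[j, ])^2)         # squared distances to centroid j
}))
cat ("Total within-cluster sum of squares, recomputed by hand:", wss.manual, "\n")
cat ("Total within-cluster sum of squares reported by kmeans():", sum(km.check$withinss), "\n")
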
cat ("Let us run K-Means again with K=3, the \"real\" number of clusters.\n")
invisible(readline(prompt = "Press [enter] to continue"))

result = kmeans(iris[,1:4], centers = 3, nstart = 100)
print(result)
plot(iris[,1:4], col = result$cluster)
cat ("\nThe NMI of the clustering performed with K-Means is ",
     cl_agreement(as.cl_partition(result$cluster), as.cl_partition(classes), method = 'NMI'), "\n")

cat ("Let us now try another clustering algorithm: hierarchical (agglomerative) clustering.\n")
invisible(readline(prompt = "Press [enter] to continue"))

hc = hclust(dist(iris[,1:4]), method = "complete")
cat ("Here is the full dendrogram...\n")
plot(hc)
invisible(readline(prompt = "Press [enter] to continue"))

cat ("... and here is how the data is clustered.\n")
hcmembers = cutree(hc, k = 3)
plot(iris[,1:4], col = hcmembers)
cat ("\nThe NMI of the clustering performed with hierarchical clustering is ",
     cl_agreement(as.cl_partition(hcmembers), as.cl_partition(classes), method = 'NMI'), "\n")

cat ("Let us now try another clustering algorithm: Fuzzy C-Means.\n")
invisible(readline(prompt = "Press [enter] to continue"))

result = cmeans(iris[,1:4], centers = 3, iter.max = 50, verbose = TRUE, method = "cmeans", m = 2)
print(result)
plot(iris[,1:4], col = result$cluster)
cat ("\nThe NMI of the clustering performed with Fuzzy C-Means is ",
     cl_agreement(as.cl_partition(result$cluster), as.cl_partition(classes), method = 'NMI'), "\n")
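
# ADDED ILLUSTRATION (a minimal sketch, not part of the original exercise): unlike K-Means,
# Fuzzy C-Means also returns a membership matrix (result$membership, one row per sample and one
# column per cluster). Samples whose largest membership degree is low are the ones the algorithm
# is least certain about; below we list the five most ambiguous samples together with their
# membership degrees, rounded to three decimals.
top.membership = apply(result$membership, 1, max)   # strongest membership degree of each sample
ambiguous = order(top.membership)[1:5]              # the five least confidently assigned samples
cat ("\nThe five most ambiguous samples according to Fuzzy C-Means:\n\n")
print(cbind(iris[ambiguous, ], round(result$membership[ambiguous, ], 3)))
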
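
# ADDED ILLUSTRATION (a minimal sketch, not part of the original exercise): besides a single
# agreement score such as the NMI, it can be instructive to cross-tabulate the cluster labels
# against the ground truth with base R's table(). Note that cluster numbers are arbitrary, so
# only the grouping pattern matters, not the label values themselves.
cat ("\nConfusion table of the Fuzzy C-Means clusters against the true species:\n\n")
print(table(Cluster = result$cluster, Species = iris$Species))
cat ("\nConfusion table of the hierarchical clusters against the true species:\n\n")
print(table(Cluster = hcmembers, Species = iris$Species))
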