# Clean the workspace (NOTE(review): rm(list = ls()) is generally discouraged
# in scripts; kept because this interactive demo assumes a fresh session).
rm(list=ls())
# set the seed as the last three digits of your student ID
seed = 12345
# BUG FIX: `seed` was defined but never used, so the random train/test split
# (optional code below) and the random digit previews were not reproducible.
set.seed(seed)
# number of points in the (optional, randomly generated) training set
trainSetSize = 3000
rotate <- function(x) t(apply(x, 2, rev))
# Draw one digit from a zip-code dataset row as a greyscale image.
#   dataset - data frame whose first column is the label and whose remaining
#             256 columns are pixel intensities (assumed in [-1, 1])
#   digitId - row index of the digit to display
#   labels  - label vector used for the plot title (defaults to column 1)
showdigit <- function(dataset,digitId,labels=dataset[,1]){
  # map pixel values from [-1, 1] to [0, 1], then invert so ink is dark
  pixels <- as.double(dataset[digitId,-1]+1)/2
  img <- rotate(matrix(1-pixels,16,16,byrow=TRUE))
  caption <- paste("Digit ID = ",digitId," (label = ",labels[digitId],")")
  image(img,axes=FALSE,col = grey(seq(0, 1, length = 256)), main=caption)
}
################################# MAIN CODE ###########################################
# Intro text for the first experiment (spam classification).
cat("
Welcome to the fourth (and last) demo/homework (year 2017): classification.
In this demo you will run classification algorithms against two different datasets.
In the first experiment, you will test your own email spam filter on the HP Labs
SPAM E-mail Database. For more information about this dataset (and for the actual
data), check out the following URLs:
[1] http://statweb.stanford.edu/~tibs/ElemStatLearn/datasets/spam.info.txt
[2] http://statweb.stanford.edu/~tibs/ElemStatLearn/datasets/spam.data
[3] http://statweb.stanford.edu/~tibs/ElemStatLearn/datasets/spam.traintest
[4] https://archive.ics.uci.edu/ml/datasets/Spambase
The dataset contains features describing 4601 email messages (of which 1813=39.4%
are spam). Note that you won't have access to the contents of the emails, but just
to their 58 attributes:
- 48 of them are frequencies of some relevant words
- 6 of them are frequencies of some relevant characters
- 3 of them relate to the use of capital letters in the message
- the last one is a nominal {0,1} class attribute of type spam
(1 = email was spam, 0 otherwise)
")
invisible(readline(prompt = "Press [enter] to continue"))
# Load the features (V1..V57 + class column V58) and the official split
# indicator file (one value per message: 1 = test, 0 = train).
spam = read.table("spam.data")
test = read.table("spam.traintest")
# Logical mask selecting the training rows (TRUE where the indicator is 0).
train = (test==0)
spamdf = as.data.frame(spam)
rm(test)
# tell something about the training and test sets (specify they can be modified)
cat("
To evaluate our classification performance we are splitting the dataset in training
and test sets. The spam dataset comes with a ready-made split (useful for comparison
with different methods), with the following characteristics:
")
# Report sizes and class balance of both halves of the split.
cat("- training set size:", length(which(train==TRUE)),
"(",length(which(spam[train,58]==1)),"spam,",length(which(spam[train,58]==0)),"non-spam)\n")
cat("- test set size:", length(which(train==FALSE)),
"(",length(which(spam[!train,58]==1)),"spam,",length(which(spam[!train,58]==0)),"non-spam)\n\n")
cat("One of your tasks will be to change this default set into a randomly generated
one (just check the source code of this demo to see how to do it).
")
# Uncomment the following two lines of code to enable random generation of the training/test sets.
# train = rep(FALSE,nrow(spam))
# train[sample(1:nrow(spam), trainSetSize)] = TRUE
# Materialize the two halves of the split as separate data frames.
spam.train = spam[train,]
spam.test = spam[!train,]
cat("
We will now run logistic regression to classify the email messages. What you will
see below are the *confusion matrix* and the *test error rate* calculated by doing
a default split at probability threshold = 0.5 (i.e. everything with predicted p>.5
will be considered as spam, everything else will be not).
")
invisible(readline(prompt = "Press [enter] to continue"))
# Fit logistic regression on all 57 predictors.  suppressWarnings() hides the
# expected "fitted probabilities numerically 0 or 1 occurred" warnings.
glm.fit = suppressWarnings(glm(V58 ~ ., data=spam.train, family=binomial))
glm.probs = predict(glm.fit, spam.test, type="response")
# Turn predicted probabilities into 0/1 labels at threshold 0.5.
glm.pred = rep(0, nrow(spam.test))
glm.pred[glm.probs>.5]=1
# Ground-truth labels for both halves of the split.
V58.test = spamdf$V58[!train]
V58.train = spamdf$V58[train]
cat("=== Classification results for logistic regression ===\n\n")
cat("Confusion matrix:\n")
# BUG FIX: wrap in print() -- a bare top-level table() is NOT auto-printed when
# this script is run via source(), so the confusion matrix never appeared.
print(table(glm.pred,V58.test, dnn=c("Spam (predicted)","Spam (ground truth)")))
cat("\nTest error rate = ", mean(glm.pred!=V58.test),"\n")
cat("======================================================\n\n")
invisible(readline(prompt = "Press [enter] to continue"))
# Homework questions about the logistic regression results.
cat("Now answer the following questions:
- look at the confusion matrix from logistic regression and interpret it.
How many messages have been correctly classified as spam? How many as
non-spam? How many false positives you have (e.g. messages classified
as spam that were not)? And how many false negatives?
- the spam experiment runs by default with a given training set (provided
by the dataset creators for an easier comparison with other methods).
Try to change it with a random one. Does anything significative change
if you do this? What if you change the default size of the set?
- if you think about the spam experiment as a *real* problem, you might
not just settle with a solution which gives you the smallest error, but
you'd probably want to also minimize the *false positives* (i.e. you do
not want to classify regular messages as spam). Play with the threshold
to reduce the number of false positives and discuss the results: how
does the general error change? How does the amount of spam you cannot
recognize anymore increase?
")
invisible(readline(prompt = "Press [enter] to continue"))
# ----------------------- classify with LDA (spam) -----------------------
library(MASS)
# Fit linear discriminant analysis on the training rows only.
lda.fit = lda(V58 ~ ., data = spamdf, subset=train)
lda.pred = predict(lda.fit, spam.test)
lda.class = lda.pred$class
cat("
Let us now compare this results with the ones you get from other methods.
")
cat("========== Classification results for LDA ===========\n\n")
cat("Confusion matrix:\n")
# BUG FIX: print() is required for the table to show up when the script is
# run via source() (bare top-level values are not auto-printed there).
print(table(lda.class,V58.test, dnn=c("Spam (predicted)","Spam (ground truth)")))
cat("\nTest error rate = ", mean(lda.class!=V58.test),"\n")
cat("======================================================\n\n")
invisible(readline(prompt = "Press [enter] to continue"))
# ----------------------- classify with QDA (spam) -----------------------
# Quadratic discriminant analysis: per-class covariance matrices.
qda.fit = qda(V58 ~ ., data = spamdf, subset=train)
qda.pred = predict(qda.fit, spam.test)
qda.class = qda.pred$class
cat("========== Classification results for QDA ===========\n\n")
cat("Confusion matrix:\n")
# BUG FIX: print() needed so the confusion matrix is displayed under source().
print(table(qda.class,V58.test, dnn=c("Spam (predicted)","Spam (ground truth)")))
cat("\nTest error rate = ", mean(qda.class!=V58.test),"\n")
cat("======================================================\n\n")
invisible(readline(prompt = "Press [enter] to continue"))
# ----------------------- classify with KNN (spam) -----------------------
library(class)
# BUG FIX: drop column 58 (the class label) from both feature matrices.
# The original passed the full data frames, so the ground-truth label itself
# was used as a predictor -- leaking the test labels into the distance
# computation and biasing the reported error.
knn.pred = knn(spam.train[,-58], spam.test[,-58], V58.train, k = 1)
cat("========== Classification results for KNN ===========\n\n")
cat("Confusion matrix:\n")
# BUG FIX: print() needed so the confusion matrix is displayed under source().
print(table(knn.pred,V58.test, dnn=c("Spam (predicted)","Spam (ground truth)")))
cat("\nTest error rate = ", mean(knn.pred!=V58.test),"\n")
cat("======================================================\n\n")
cat("
What can you conclude about the nature of the dataset after seeing the results
you got from the other classification methods?
")
invisible(readline(prompt = "Press [enter] to continue"))
# Second experiment: handwritten digit classification (zip-code data).
# STRING FIX: restored the missing "(" in "(actually the gray level)".
cat("
Let us move now to our second experiment: handwritten digit classification (dataset
info: http://statweb.stanford.edu/~tibs/ElemStatLearn/datasets/zip.info.txt).
In this case the features characterizing the digits are the digits themselves, i.e.
the colors (actually the gray level) of their pixels. To give you a grasp of it, here
are few of them:
")
invisible(readline(prompt = "Press [enter] to continue"))
# Load the digit data: column V1 is the label, V2..V257 are the 16x16 pixels.
digits.train = read.table(gzfile("zip.train.gz"))
digits.test = read.table(gzfile("zip.test.gz"))
# Preview five randomly chosen training digits (reproducible via set.seed).
randIdx = sample(1:nrow(digits.train),5)
for (i in randIdx){
showdigit(digits.train,i)
invisible(readline(prompt = "Press [enter] to continue"))
}
cat("
We will now classify these digits with LDA, QDA, and KNN (this might take a while,
please be patient...)
")
invisible(readline(prompt = "Press [enter] to continue"))
# ---------------------- classify with LDA (digits) ----------------------
lda.fit = lda(V1 ~ . , data = digits.train)
lda.pred=predict(lda.fit,digits.test)
lda.class=lda.pred$class
cat("========== Classification results for LDA ===========\n\n")
cat("Confusion matrix:\n")
# BUG FIX: print() needed so the confusion matrix is displayed under source().
print(table(lda.class,digits.test$V1))
cat("\nTest error rate = ", mean(lda.class!=digits.test$V1),"\n")
cat("======================================================\n\n")
invisible(readline(prompt = "Press [enter] to continue"))
# Uncomment to eyeball a few predictions next to the ground truth:
#digits.test$V1[1:10]
#lda.class[1:10]
# ---------------------- classify with QDA (digits) ----------------------
# add jitter before QDA to avoid exact multicolinearity: qda() fails when a
# within-class covariance matrix is singular (many pixels are constant).
digits.train.J = digits.train
digits.train.J[, -1] <- apply(digits.train[, -1], 2, jitter)
qda.fit = qda(V1 ~ . , data = digits.train.J)
qda.pred=predict(qda.fit,digits.test)
qda.class=qda.pred$class
cat("========== Classification results for QDA ===========\n\n")
cat("Confusion matrix:\n")
# BUG FIX: print() needed so the confusion matrix is displayed under source().
print(table(qda.class,digits.test$V1))
cat("\nTest error rate = ", mean(qda.class!=digits.test$V1),"\n")
cat("======================================================\n\n")
invisible(readline(prompt = "Press [enter] to continue"))
# Uncomment to eyeball a few predictions next to the ground truth:
#digits.test$V1[1:10]
#qda.class[1:10]
# ---------------------- classify with KNN (digits) ----------------------
cat("(don't panic... KNN is the slowest one ;-))\n")
# BUG FIX: drop column 1 (the digit label) from both feature matrices.  The
# original passed the full data frames, so the ground-truth label was used
# as a predictor, leaking test labels into the distance computation.
knn.pred = knn(digits.train[,-1], digits.test[,-1], digits.train$V1, k = 1)
cat("========== Classification results for KNN ===========\n\n")
cat("Confusion matrix:\n")
# BUG FIX: print() needed so the confusion matrix is displayed under source().
print(table(knn.pred,digits.test$V1))
cat("\nTest error rate = ", mean(knn.pred!=digits.test$V1),"\n")
cat("======================================================\n\n")
invisible(readline(prompt = "Press [enter] to continue"))
# Final homework questions and pointers for further exploration.
cat("
Answer the following questions:
- in the digits classification experiment, what is the digit that is misclassified
the most by QDA? Try to display some examples of this misclassified digit (all
the code you need is available in the script)
- compare the results given by the different classification approaches. What are
the best ones? Can you provide hypotheses explaining this behaviour?
")
# uncomment the following code to show a set of misclassified pics (just choose the proper
# values for val1 and val2, you can guess them from a confusion matrix)
# val1 = 0
# val2 = 0
# wrongIndices = which(digits.test$V1==val1 & qda.class==val2)
# for (i in wrongIndices){
# showdigit(digits.test,i,qda.class)
# invisible(readline(prompt = "Press [enter] to continue"))
# }
cat("
That's it! If you are interested in the comparison between other methods on this
dataset, also check:
http://blog.quantitations.com/machine%20learning/2013/02/27/comparing-classification-algorithms-for-handwritten-digits/
")