Filtering spam email is a data science application embedded in our daily lives. In this project we build a naive Bayes classifier using the CSDMC2010 SPAM corpus. The dataset contains 4327 emails, each with a response label indicating whether the email is spam or ham; there are 2949 ham and 1378 spam emails in total (label 1 marks ham and label 0 marks spam). Using our naive Bayes classifier, we hope to classify a given email as spam or ham based on its content.
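As a quick reference, naive Bayes applies Bayes' rule under the assumption that the features are conditionally independent given the class. For binary word-indicator features $x_1, \dots, x_p$ and class label $j \in \{\text{ham}, \text{spam}\}$,

$$P(Y = j \mid x_1, \dots, x_p) = \frac{P(Y = j)\prod_{k=1}^{p} P(x_k \mid Y = j)}{\sum_{j'} P(Y = j')\prod_{k=1}^{p} P(x_k \mid Y = j')},$$

and an email is assigned to the class with the larger posterior probability.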
library(stringr)
library(magrittr)
library(ggplot2)
training.directory <- "~/Desktop/USFSpring2018/Statistical_Learning/Intro-to-Machine-Learning/Data/SPAMData/TRAINING/"
# Loading training file names from the SPAM data set
training.names <- list.files(training.directory)
# Load each email using the above training.names
training.emails <- list()
setwd(training.directory)
for(i in seq_along(training.names)){
  training.emails[[i]] <- readLines(training.names[i])
}
# Training labels
labels <- read.table("~/Desktop/USFSpring2018/Statistical_Learning/Intro-to-Machine-Learning/Data/SPAMData/SPAMTrain.label")
# Computes the naive Bayes classifier given all necessary probability calculations (i.e., priors and class-conditionals)
# Posterior probabilities are normalized so that they sum to 1.
#
# args:
#   ccp - list of matrices of class-conditional probabilities, one matrix per feature;
#         rows correspond to levels of y, columns to feature values 0/1
#   pp - vector of prior probabilities of response y
#   obs - row observation vector from an X matrix with binary values
#
# returns:
#   Vector containing the posterior probability for each level of y
#   Note: An observation should be classified as the level with the highest posterior probability
compute_classifier <- function(ccp, pp, obs){
  llh0 <- numeric(0)
  llh1 <- numeric(0)
  if(typeof(ccp) == "list"){ # If there are multiple features in the X matrix
    for(i in seq_along(ccp)){
      curr_tbl <- ccp[[i]]
      llh0 <- c(llh0, curr_tbl[1, obs[i] + 1])
      llh1 <- c(llh1, curr_tbl[2, obs[i] + 1])
    }
  }else{ # Single feature: ccp is one table and obs a single value
    llh0 <- ccp[1, obs + 1]
    llh1 <- ccp[2, obs + 1]
  }
  # Likelihoods under the conditional-independence assumption
  llh0 <- prod(llh0)
  llh1 <- prod(llh1)
  num0 <- llh0 * pp[1]
  num1 <- llh1 * pp[2]
  # Normalizing posterior probabilities by the evidence P(x)
  evidence <- num0 + num1
  post0 <- num0/evidence
  post1 <- num1/evidence
  return(c(post0, post1))
}
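As a quick sanity check, here is a minimal sketch of calling compute_classifier on hand-built tables; the two features and all probability values below are made up purely for illustration.
ccp_demo <- list( # Two hypothetical binary features; rows are y = 0/1, columns are x = 0/1
  f1 = matrix(c(0.8, 0.3, 0.2, 0.7), nrow = 2,
              dimnames = list(y = c("0", "1"), x = c("0", "1"))),
  f2 = matrix(c(0.6, 0.1, 0.4, 0.9), nrow = 2,
              dimnames = list(y = c("0", "1"), x = c("0", "1")))
)
pp_demo <- c(0.5, 0.5) # Equal priors
compute_classifier(ccp_demo, pp_demo, c(1, 1))
# Both features are present, so the posterior should favor y = 1:
# num0 = 0.2*0.4*0.5 and num1 = 0.7*0.9*0.5, giving roughly c(0.113, 0.887)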
# Computes naive Bayes classification on a given binary X matrix of predictors and a
# training vector of labels.
#
# args:
#   X - an n x p data table or matrix X whose columns are binary variables and rows are observations
#   y - an n length response vector y, whose entries are factors representing labels for each observation
#
# returns:
#   list with the following objects:
#     prp - a vector of prior probabilities for each label in y
#     ccp - class-conditional probabilities P(x_k = 1 | Y = j) for k = 1,...,p and all labels j
#     post - list of P(Y = j | X = x) calculated from the algorithm for each observation
#     pred - n dimensional vector specifying the classification for each observation
#     mis - misclassification error rate of the classifier
naive_bayes <- function(X, y){
  if(!is.factor(y))
    y <- as.factor(y)
  pp <- prop.table(table(y)) # Prior probabilities
  ccp_list <- list() # Class-conditional probabilities, one table per feature
  if(ncol(X) == 1){
    ccp_list <- prop.table(table(y, X[,1]), margin = 1)
  }else{
    for(i in 1:ncol(X)){
      ccp_list[[i]] <- prop.table(table(y, X[,i]), margin = 1)
    }
  }
  ccp_list <- setNames(ccp_list, names(X))
  posteriors <- list()
  for(i in 1:nrow(X)){
    posteriors[[i]] <- compute_classifier(ccp_list, pp, unlist(X[i,]))
  }
  # Classification of each row is index - 1 (0 or 1) of the larger posterior
  preds <- factor(sapply(posteriors, which.max) - 1, levels = levels(y))
  # Calculating misclassification error rate
  mis <- sum(y != preds)/length(y)
  return(list(prp = pp,
              ccp = ccp_list,
              post = posteriors,
              pred = preds,
              mis = mis))
}
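Before applying the classifier to the email data, here is a minimal sketch of naive_bayes on a synthetic data set; the seed, sample size, and feature probabilities are made up purely for illustration.
set.seed(42)
toy_y <- rbinom(200, 1, 0.5) # Random 0/1 labels
# Two binary features that are informative about the label
toy_X <- data.frame(f1 = rbinom(200, 1, ifelse(toy_y == 1, 0.8, 0.2)),
                    f2 = rbinom(200, 1, ifelse(toy_y == 1, 0.7, 0.4)))
toy_fit <- naive_bayes(toy_X, toy_y)
toy_fit$prp # Estimated priors, roughly 0.5/0.5
toy_fit$mis # Training misclassification rate, well below 0.5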
In order to gain insight into the content of emails, we extracted the frequency of words that appear in ham and spam emails and stored the top 500 words. We then computed the average number of times each word appears in a given email, along with each word's probability of occurrence. For features, we built a matrix of binary indicators of whether a specified word appears in an email; the words used as covariates are selected further below.
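To make the encoding concrete, here is a minimal sketch of the binary indicator idea on two made-up emails and three made-up keywords.
demo_emails <- c("win a free prize now", "meeting notes attached")
demo_words <- c("free", "meeting", "prize")
# One row per email, one column per keyword; 1 if the word occurs at least once
demo_X <- t(sapply(demo_emails,
                   function(eml) as.integer(str_detect(eml, fixed(demo_words)))))
colnames(demo_X) <- demo_words
demo_X # First row: c(1, 0, 1); second row: c(0, 1, 0)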
The following code preprocesses the data for word counts.
legit <- which(labels[,1] == 1)
spam <- which(labels[,1] == 0)
# Global data frame storing the frequency of every word across emails
freq <- data.frame(word = character(), count = numeric())
addFreq <- function(index){
  # Split an email by word
  email_split <- str_split(training.emails[[index]], boundary("word"))
  # Get unique words
  uniqueWords <- unique(unlist(email_split))
  for (i in seq_along(uniqueWords)){
    # Skip empty or missing tokens
    if (is.na(uniqueWords[i]) || nchar(uniqueWords[i]) == 0)
      next
    # Count occurrences in this email; fixed() avoids regex interpretation
    curr <- sum(str_count(training.emails[[index]], fixed(uniqueWords[i])))
    if (!any(freq$word == uniqueWords[i])){
      # Word not in the data frame yet
      freq <<- rbind(freq, data.frame(word = uniqueWords[i], count = curr))
    }
    else{
      # Word already present; add to its current count
      currInd <- which(freq[,1] == uniqueWords[i])
      freq$count[currInd] <- freq$count[currInd] + curr
    }
  }
}
# Get frequency of each word in spam emails
for (i in seq_along(spam)){
  addFreq(spam[i])
}
# Order rows in descending order by count
orderedFreq <- freq[order(-freq$count),]
# Mean number of times each word appears per spam email
orderedFreq <- cbind(orderedFreq, orderedFreq$count/length(spam))
# Mean frequency w.r.t. spam
spam.Freq <- orderedFreq
# Get frequency of each word in non-spam emails
freq <- data.frame(word = character(), count = numeric())
for (i in seq_along(legit)){
  addFreq(legit[i])
}
orderedFreq <- freq[order(-freq$count),]
# Mean number of times each word appears per ham email
orderedFreq <- cbind(orderedFreq, orderedFreq$count/length(legit))
# Mean frequency w.r.t. non-spam
legit.Freq <- orderedFreq
# Remove all strings of length 1
spamDF <- spam.Freq[nchar(as.character(spam.Freq[,1])) > 1, ]
legitDF <- legit.Freq[nchar(as.character(legit.Freq[,1])) > 1, ]
# Get the number of spam emails each top word appears in;
# if a word appears in an email at least once, the count increments
numEmailsApp <- numeric(500)
for (i in 1:500) { # Run on top 500 words
  test.word <- as.character(spamDF[i,1])
  count <- 0
  for (j in seq_along(spam)) {
    email_split <- str_split(training.emails[[spam[j]]], boundary("word"))
    uniqueWords <- as.vector(unique(unlist(email_split)))
    if (test.word %in% uniqueWords) {
      count <- count + 1
    }
  }
  numEmailsApp[i] <- count
}
# Probability that a word appears in a spam email
probApp <- numEmailsApp/length(spam)
# Master data frame of top spam words and probabilities
spamMaster <- cbind(spamDF[1:500,], numEmailsApp, probApp)
colnames(spamMaster) <- c("word", "count", "mean count",
                          "num emails word appears in", "probability of appearance")
spamMaster <- spamMaster[order(-spamMaster$`probability of appearance`),]
# Get the number of ham emails each top word appears in;
# if a word appears in an email at least once, the count increments
numEmailsApp <- numeric(500)
for (i in 1:500) { # Run on top 500 words only
  test.word <- as.character(legitDF[i,1])
  count <- 0
  for (j in seq_along(legit)) {
    email_split <- str_split(training.emails[[legit[j]]], boundary("word"))
    uniqueWords <- as.vector(unique(unlist(email_split)))
    if (test.word %in% uniqueWords) {
      count <- count + 1
    }
  }
  numEmailsApp[i] <- count
}
# Probability that a word appears in a ham email
probApp <- numEmailsApp/length(legit)
# Master data frame of top ham words and probabilities
legitMaster <- cbind(legitDF[1:500,], numEmailsApp, probApp)
colnames(legitMaster) <- c("word", "count", "mean count",
                          "num emails word appears in", "probability of appearance")
legitMaster <- legitMaster[order(-legitMaster$`probability of appearance`),]
The above code is computationally expensive, so we ran it once and saved the resulting workspace to load for future testing.
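As a minimal sketch, the workspace could have been written with something like the following save() call, assuming spamMaster and legitMaster are the objects needed downstream (the path mirrors the load() call below).
# Save the expensive-to-compute master data frames for later sessions
save(spamMaster, legitMaster,
     file = "/Users/lancefernando/Desktop/USFSpring2018/Statistical_Learning/MyWork/CaseStudy1/masterData.RData")
The following code loads the workspace and creates the binary feature matrix.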
load("/Users/lancefernando/Desktop/USFSpring2018/Statistical_Learning/MyWork/CaseStudy1/masterData.RData") # Loading Rdata wksp
# Showing example of top spam word dataframe
head(spamMaster)
## word count mean count num emails word appears in
## 33 id 22156 16.07837 1378
## 26 by 11730 8.512337 1378
## 31 with 10414 7.557329 1378
## 17 from 9102 6.605225 1378
## 16 Received 7188 5.216255 1378
## 52 To 5641 4.093614 1378
## probability of appearance
## 33 1
## 26 1
## 31 1
## 17 1
## 16 1
## 52 1
# Grabbing top keywords
spamTop <- as.vector(spamMaster[,1])
legitTop <- as.vector(legitMaster[,1])
# ~~~~ Adding words to respective vectors ~~~~
spamandlegit <- intersect(spamTop, legitTop) # Words in both top spam and top ham
spamnotlegit <- setdiff(spamTop, legitTop)   # Exclusive spam words
legitnotspam <- setdiff(legitTop, spamTop)   # Exclusive ham words
# Storing words that appear in both spam and ham emails but occur
# in one or the other more than 10% of the time
wordVect <- c()
for (i in seq_along(spamandlegit)) {
  word <- spamandlegit[i]
  spam.index <- which(spamMaster$word == word)
  legit.index <- which(legitMaster$word == word)
  diff <- spamMaster$`probability of appearance`[spam.index] -
    legitMaster$`probability of appearance`[legit.index]
  if (abs(diff) > 0.1) {
    wordVect <- c(wordVect, word)
  }
}
spam_words <- spamnotlegit[1:50] # Top 50 exclusive spam words
ham_words <- legitnotspam[1:50] # Top 50 exclusive ham words
both_words <- wordVect # Words whose probability of appearance differs by more than 10% between spam and ham
# Helper function that appends to a global X matrix of features
# whose columns are dummy variables indicating whether or not specified
# words appear in the email.
#
# args:
#   eml - character vector holding the lines of a single email
#   vec - vector of strings to check for existence in the email
wordSearch <- function(eml, vec){
  email <- c()
  for(i in seq_along(vec)){
    # If the word appears in the email at least once, encode the entry as 1
    if (sum(str_count(eml, fixed(vec[i]))) >= 1){
      email <- c(email, 1)
    }
    else{
      email <- c(email, 0)
    }
  }
  # Binding to the global feature matrix X
  X <<- rbind(X, email)
  return(X)
}
vec <- c(spam_words, both_words, ham_words) # Using all features
# Global feature matrix X
X <- data.frame(NULL)
for(i in seq_along(training.emails)){
  wordSearch(training.emails[[i]], vec)
}
colnames(X) <- vec
resp <- labels$V1 %>% as.factor() # Grabbing training labels
newX <- X[,-which(names(X) %in% c("Re", "00", "on", "20"))] # Removing features that give us problems =(
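Since spam_words and ham_words contribute 50 features each and four problem columns are dropped, the 170 final features reported below imply that both_words held 74 words. A quick dimension check:
length(vec) # 174 candidate features (inferred: 50 spam + 74 both + 50 ham)
ncol(newX)  # 170 features after dropping the four problem columns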
The following code runs our naive bayes classifier algorithm on our data.
# ~~~~~~ Running classifier ~~~~~~~
out <- naive_bayes(newX, resp)
# Prior probabilities
out$prp
## y
## 0 1
## 0.3184654 0.6815346
# Class-conditional probabilities for the first 5 features
out$ccp[1:5]
## $`8859`
##
## y 0 1
## 0 0.3795356 0.6204644
## 1 0.7161750 0.2838250
##
## $align
##
## y 0 1
## 0 0.41799710 0.58200290
## 1 0.97083757 0.02916243
##
## $below
##
## y 0 1
## 0 0.45065312 0.54934688
## 1 0.96914208 0.03085792
##
## $border
##
## y 0 1
## 0 0.45137881 0.54862119
## 1 0.95049169 0.04950831
##
## $font
##
## y 0 1
## 0 0.45065312 0.54934688
## 1 0.95625636 0.04374364
# Posterior probabilities for first five emails
out$post[1:5]
## [[1]]
## 0 1
## 1.000000e+00 8.096193e-18
##
## [[2]]
## 0 1
## 0.8798274 0.1201726
##
## [[3]]
## 0 1
## 5.540544e-29 1.000000e+00
##
## [[4]]
## 0 1
## 1.000000e+00 2.283798e-18
##
## [[5]]
## 0 1
## 0.990354474 0.009645526
# Predictions for first five emails
out$pred[1:5]
## [1] 0 0 1 0 0
## Levels: 0 1
# Outputting misclassification error rate
out$mis
## [1] 0.05431015
Using our naive Bayes classifier, we achieve a misclassification error rate of 0.0543, i.e., a success rate of 94.57%, using 170 features!
We use 10-fold cross-validation to properly assess our misclassification error rate.
set.seed(1)
# Randomly assign each observation to one of 10 folds (fold sizes are only approximately equal)
kfold_idxs <- sample(x = 1:10, replace = TRUE, size = length(resp))
miss_tr <- numeric(0)
miss_te <- numeric(0)
for(i in 1:10){
  train <- which(kfold_idxs != i)
  test <- -train
  fit <- naive_bayes(newX[train,], resp[train])
  # Storing the current fold's training misclassification rate
  miss_tr <- c(miss_tr, fit$mis)
  # Calculating test misclassification
  posteriors <- list()
  X_test <- newX[test,]
  for(j in 1:nrow(X_test)){
    posteriors[[j]] <- compute_classifier(fit$ccp, fit$prp, unlist(X_test[j,]))
  }
  # Classification of each row is index - 1 (0 or 1) of the larger posterior
  preds <- factor(sapply(posteriors, which.max) - 1, levels = levels(resp))
  # Storing test misclassification error rate
  miss_te <- c(miss_te, sum(resp[test] != preds)/length(resp[test]))
}
# Plotting our error
ggplot() +
  geom_boxplot(mapping = aes(x = c(rep("Train",10), rep("Test",10)),
                             y = c(miss_tr, miss_te),
                             col = c(rep("Train",10), rep("Test",10))), show.legend = FALSE) +
  labs(x = "Partition of Data",
       y = "Misclassification Error Rate",
       title = "Train and Test Error Rate Using 10-Fold CV",
       subtitle = paste("Avg Train Error: ",
                        round(mean(miss_tr, na.rm = TRUE), 3),
                        " | Avg Test Error: ",
                        round(mean(miss_te, na.rm = TRUE), 3), sep = "")) +
  annotate(geom = "text", x = "Train", y = median(miss_tr, na.rm = TRUE),
           label = round(median(miss_tr, na.rm = TRUE), 3)) +
  annotate(geom = "text", x = "Test", y = median(miss_te, na.rm = TRUE),
           label = round(median(miss_te, na.rm = TRUE), 3))
According to the boxplot above, our average test misclassification error was 0.052, with a median of 0.054. Though these test metrics are slightly lower than our training metrics, the training errors have a much tighter spread. Cross-validating gives a more honest assessment of how well our classifier performs. Overall, using a total of 170 features, our classifier achieves a success rate of about 94.8%, correctly categorizing roughly 95 of every 100 emails!