Difference between revisions of "R: spam classification"
Jump to navigation
Jump to search
Onnowpurbo (talk | contribs) |
Onnowpurbo (talk | contribs) |
||
Line 8: | Line 8: | ||
setwd("~/Your Folder Path") | setwd("~/Your Folder Path") | ||
raw.data <- read.table("SMSSpamCollection", header=FALSE, sep="\t", quote="", stringsAsFactors=FALSE) | raw.data <- read.table("SMSSpamCollection", header=FALSE, sep="\t", quote="", stringsAsFactors=FALSE) | ||
+ | |||
+ | names(raw.data) <- c("Label", "Text") | ||
+ | table(raw.data$Label) | ||
+ | |||
+ | set.seed(1912) | ||
+ | raw.data <- raw.data[sample(nrow(raw.data)),] | ||
+ | |||
+ | sms.corpus <- corpus(raw.data$Text) | ||
+ | docvars(sms.corpus) <- raw.data$Label | ||
+ | |||
+ | spam.plot <- corpus_subset(sms.corpus, docvar1 == "spam") | ||
+ | spam.plot <- dfm(spam.plot, tolower = TRUE, removePunct = TRUE, removeTwitter = TRUE, removeNumbers = TRUE, remove=stopwords("SMART")) | ||
+ | |||
+ | spam.col <- brewer.pal(10, "BrBG") | ||
+ | spam.cloud <- textplot_wordcloud(spam.plot, min.freq = 16, color = spam.col) | ||
+ | title("Spam Wordcloud", col.main = "grey14") | ||
+ | |||
+ | ham.plot <- corpus_subset(sms.corpus, docvar1 == "ham") | ||
+ | ham.plot <- dfm(ham.plot, tolower = TRUE, removePunct = TRUE, removeTwitter = TRUE, removeNumbers = TRUE, remove=c("gt", "lt", stopwords("SMART"))) | ||
+ | ham.col <- brewer.pal(10, "BrBG") | ||
+ | textplot_wordcloud(ham.plot, min.freq = 50, colors = ham.col, fixed.asp = TRUE) | ||
+ | title("Ham Wordcloud", col.main = "grey14") | ||
+ | |||
+ | sms.dfm <- dfm(sms.corpus, tolower = TRUE) | ||
+ | sms.dfm <- dfm_trim(sms.dfm, min_count = 5, min_docfreq = 3) | ||
+ | sms.dfm <- dfm_weight(sms.dfm, type = "tfidf") | ||
+ | |||
+ | sms.raw.train <- raw.data[1:4738,] | ||
+ | sms.raw.test <- raw.data[4739:nrow(raw.data),] | ||
+ | |||
+ | sms.dfm.train <- sms.dfm[1:4738,] | ||
+ | sms.dfm.test <- sms.dfm[4739:nrow(raw.data),] | ||
+ | |||
+ | sms.classifier <- textmodel_NB(sms.dfm.train, sms.raw.train$Label) | ||
+ | |||
+ | sms.predictions <- predict(sms.classifier, newdata = sms.dfm.test) | ||
+ | table(sms.predictions$nb.predicted, sms.raw.test$Label) | ||
Revision as of 10:18, 25 November 2018
# SMS spam classification script.
#
# Reads the tab-separated "SMSSpamCollection" file, draws one word cloud for
# spam and one for ham messages, then trains a Naive Bayes classifier on a
# tf-idf weighted document-feature matrix and tabulates its predictions
# against the held-out labels.
#
# NOTE(review): the quanteda calls below (removePunct / removeTwitter /
# removeNumbers, min_count, dfm_weight(type = "tfidf"), textmodel_NB,
# min.freq / fixed.asp, $nb.predicted) are kept exactly as originally written;
# they appear to target an older quanteda release -- confirm against the
# installed version before running.

# Dependencies ----
# Install only when missing, so that re-running the script does not trigger a
# re-install (and a network round trip) every time.
if (!requireNamespace("quanteda", quietly = TRUE)) {
  install.packages("quanteda")
}
if (!requireNamespace("RColorBrewer", quietly = TRUE)) {
  install.packages("RColorBrewer")
}

library(quanteda)
library(RColorBrewer)

# Load data ----
# Placeholder path: point this at the folder that contains SMSSpamCollection.
setwd("~/Your Folder Path")

# quote = "" disables quote processing, so quote characters inside a message
# are read as ordinary text instead of string delimiters.
raw.data <- read.table(
  "SMSSpamCollection",
  header = FALSE,
  sep = "\t",
  quote = "",
  stringsAsFactors = FALSE
)

names(raw.data) <- c("Label", "Text")
table(raw.data$Label)

# Shuffle the rows with a fixed seed so the positional train/test split below
# is random with respect to file order, yet reproducible.
set.seed(1912)
raw.data <- raw.data[sample(nrow(raw.data)), ]

# Corpus ----
sms.corpus <- corpus(raw.data$Text)
# The subsets below refer to this variable as `docvar1`.
# NOTE(review): presumably that is the name quanteda gives to an unnamed
# docvars assignment -- confirm for the installed version.
docvars(sms.corpus) <- raw.data$Label

# Spam word cloud ----
spam.plot <- corpus_subset(sms.corpus, docvar1 == "spam")
spam.plot <- dfm(
  spam.plot,
  tolower = TRUE,
  removePunct = TRUE,
  removeTwitter = TRUE,
  removeNumbers = TRUE,
  remove = stopwords("SMART")
)

spam.col <- brewer.pal(10, "BrBG")
# `colors` is spelled out in full here (the original passed `color`) so that
# both word-cloud calls use the same argument name.
spam.cloud <- textplot_wordcloud(spam.plot, min.freq = 16, colors = spam.col)
title("Spam Wordcloud", col.main = "grey14")

# Ham word cloud ----
ham.plot <- corpus_subset(sms.corpus, docvar1 == "ham")
# "gt" and "lt" are dropped in addition to the SMART stop words.
# NOTE(review): presumably these are leftovers of HTML entities (&gt; / &lt;)
# in the raw messages -- verify against the data.
ham.plot <- dfm(
  ham.plot,
  tolower = TRUE,
  removePunct = TRUE,
  removeTwitter = TRUE,
  removeNumbers = TRUE,
  remove = c("gt", "lt", stopwords("SMART"))
)
ham.col <- brewer.pal(10, "BrBG")
textplot_wordcloud(ham.plot, min.freq = 50, colors = ham.col, fixed.asp = TRUE)
title("Ham Wordcloud", col.main = "grey14")

# Features ----
sms.dfm <- dfm(sms.corpus, tolower = TRUE)
sms.dfm <- dfm_trim(sms.dfm, min_count = 5, min_docfreq = 3)
sms.dfm <- dfm_weight(sms.dfm, type = "tfidf")

# Train/test split ----
# The first `train.n` shuffled rows are used for training, the rest for
# testing. The same index vectors are applied to the raw data and to the dfm
# so that labels and features stay aligned.
# NOTE(review): assumes the file has more than 4738 rows -- confirm.
train.n <- 4738
train.idx <- seq_len(train.n)
test.idx <- (train.n + 1):nrow(raw.data)

sms.raw.train <- raw.data[train.idx, ]
sms.raw.test <- raw.data[test.idx, ]

sms.dfm.train <- sms.dfm[train.idx, ]
sms.dfm.test <- sms.dfm[test.idx, ]

# Model ----
sms.classifier <- textmodel_NB(sms.dfm.train, sms.raw.train$Label)

# Evaluation ----
# Cross-tabulate predicted labels (rows) against true labels (columns).
sms.predictions <- predict(sms.classifier, newdata = sms.dfm.test)
table(sms.predictions$nb.predicted, sms.raw.test$Label)