R: tidytext: topic-modelling
# Ref: https://github.com/dgrtwo/tidy-text-mining/blob/master/06-topic-models.Rmd
library(knitr)
opts_chunk$set(message = FALSE, warning = FALSE, cache = TRUE)
options(width = 100, dplyr.width = 150)

library(ggplot2)
library(methods)
library(scales)
theme_set(theme_light())
## Latent Dirichlet allocation
library(topicmodels)

data("AssociatedPress")
AssociatedPress
# This function returns an object containing the full details of the model fit,
# such as how words are associated with topics and
# how topics are associated with documents.
ap_lda <- LDA(AssociatedPress, k = 2, control = list(seed = 1234))
ap_lda
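As a quick sanity check (not part of the original notes), topicmodels also exposes base accessors such as `terms()` and `topics()` for peeking at the fitted model before tidying it:

# illustration only: inspect the fitted model directly
terms(ap_lda, 5)        # 5 most probable terms per topic
head(topics(ap_lda))    # most likely topic (1 or 2) for the first documents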
### Word-topic probabilities
library(tidytext)

ap_topics <- tidy(ap_lda, matrix = "beta")
ap_topics
# plot the 10 terms that are most common within each topic
library(ggplot2)
library(dplyr)

ap_top_terms <- ap_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)

ap_top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()
# consider the terms that had the *greatest difference* in $\beta$ between
# topic 1 and topic 2. This can be estimated based on the log ratio of the two:
# $\log_2(\frac{\beta_2}{\beta_1})$ (a log ratio is useful because it makes the
# difference symmetrical: $\beta_2$ being twice as large leads to a log ratio
# of 1, while $\beta_1$ being twice as large results in -1).
# To constrain it to a set of especially relevant words, we can filter for
# relatively common words, such as those that have a $\beta$ greater than
# 1/1000 in at least one topic.
library(tidyr)

beta_spread <- ap_topics %>%
  mutate(topic = paste0("topic", topic)) %>%
  spread(topic, beta) %>%
  filter(topic1 > .001 | topic2 > .001) %>%
  mutate(log_ratio = log2(topic2 / topic1))

beta_spread
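A quick numeric check of the symmetry claim above (illustrative values only, not taken from the model):

# a beta twice as large in topic 2 gives +1; twice as large in topic 1 gives -1
log2(0.002 / 0.001)   # = 1
log2(0.001 / 0.002)   # = -1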
# plot the words with the greatest difference in beta between the two topics
beta_spread %>%
  group_by(direction = log_ratio > 0) %>%
  top_n(10, abs(log_ratio)) %>%
  ungroup() %>%
  mutate(term = reorder(term, log_ratio)) %>%
  ggplot(aes(term, log_ratio)) +
  geom_col() +
  labs(y = "Log2 ratio of beta in topic 2 / topic 1") +
  coord_flip()
### Document-topic probabilities
# per-document-per-topic probabilities (gamma)
ap_documents <- tidy(ap_lda, matrix = "gamma")
ap_documents
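Each gamma is the estimated proportion of a document's words generated from that topic, so the values for one document should sum to one; a small check (not in the original):

# sanity check: per-document topic proportions sum to 1
ap_documents %>%
  group_by(document) %>%
  summarize(total = sum(gamma)) %>%
  head()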
# check document 6
tidy(AssociatedPress) %>%
  filter(document == 6) %>%
  arrange(desc(count))
## Example: the great library heist
# download the books from Project Gutenberg
titles <- c("Twenty Thousand Leagues under the Sea",
            "The War of the Worlds",
            "Pride and Prejudice",
            "Great Expectations")

library(gutenbergr)

books <- gutenberg_works(title %in% titles) %>%
  gutenberg_download(meta_fields = "title")
library(stringr)

# divide into documents, each representing one chapter
by_chapter <- books %>%
  group_by(title) %>%
  mutate(chapter = cumsum(str_detect(text, regex("^chapter ", ignore_case = TRUE)))) %>%
  ungroup() %>%
  filter(chapter > 0) %>%
  unite(document, title, chapter)

# split into words
by_chapter_word <- by_chapter %>%
  unnest_tokens(word, text)

# find document-word counts
word_counts <- by_chapter_word %>%
  anti_join(stop_words) %>%
  count(document, word, sort = TRUE) %>%
  ungroup()

word_counts
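A small sanity check (not in the original) that the chapter splitting produced a sensible number of documents per book:

# how many chapter-documents each book contributes (illustration only)
word_counts %>%
  distinct(document) %>%
  separate(document, c("title", "chapter"), sep = "_", convert = TRUE) %>%
  count(title)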
### LDA on chapters
# cast the tidy word counts into a DocumentTermMatrix
chapters_dtm <- word_counts %>%
  cast_dtm(document, word, n)

chapters_dtm
# use the `LDA()` function to create a four-topic model
chapters_lda <- LDA(chapters_dtm, k = 4, control = list(seed = 1234))
chapters_lda
# examine per-topic-per-word probabilities
chapter_topics <- tidy(chapters_lda, matrix = "beta")
chapter_topics
# use dplyr's `top_n()` to find the top 5 terms within each topic
top_terms <- chapter_topics %>%
  group_by(topic) %>%
  top_n(5, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)

top_terms
# plot the top terms within each of the four topics
library(ggplot2)

top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()
### Per-document classification
# examine the per-document-per-topic probabilities (gamma)
chapters_gamma <- tidy(chapters_lda, matrix = "gamma")
chapters_gamma
# re-separate the document name into title and chapter
chapters_gamma <- chapters_gamma %>%
  separate(document, c("title", "chapter"), sep = "_", convert = TRUE)

chapters_gamma
# reorder titles in order of topic 1, topic 2, etc. before plotting
# NOTE: the reorder() call below errored here; see the workaround sketch after this block
chapters_gamma %>%
  mutate(title = reorder(title, gamma * topic)) %>%
  ggplot(aes(factor(topic), gamma)) +
  geom_boxplot() +
  facet_wrap(~ title)
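If the `reorder()` call above errors in your session, one possible workaround is to precompute a single ordering value per title first; this is a sketch, not the book's code:

# workaround sketch: compute one ordering value per title, then reorder by it
chapters_gamma %>%
  group_by(title) %>%
  mutate(title_order = mean(gamma * topic)) %>%
  ungroup() %>%
  mutate(title = reorder(title, title_order)) %>%
  ggplot(aes(factor(topic), gamma)) +
  geom_boxplot() +
  facet_wrap(~ title)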
# it looks like some chapters from Great Expectations (which should be topic 4)
# were somewhat associated with other topics
chapter_classifications <- chapters_gamma %>%
  group_by(title, chapter) %>%
  top_n(1, gamma) %>%
  ungroup()

chapter_classifications
# find the consensus topic for each book (the most common topic among its chapters),
# then see which chapters were misidentified
book_topics <- chapter_classifications %>%
  count(title, topic) %>%
  group_by(title) %>%
  top_n(1, n) %>%
  ungroup() %>%
  transmute(consensus = title, topic)

chapter_classifications %>%
  inner_join(book_topics, by = "topic") %>%
  filter(title != consensus)
### By word assignments: `augment`
# see which words in each document were assigned to which topic
assignments <- augment(chapters_lda, data = chapters_dtm)
assignments
# augment() returns a tidy data frame of book-term counts, but adds an extra column: `.topic`
assignments <- assignments %>%
  separate(document, c("title", "chapter"), sep = "_", convert = TRUE) %>%
  inner_join(book_topics, by = c(".topic" = "topic"))

assignments
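As a rough summary (not in the original), the overall share of word assignments that land back in their own book:

# hedged sketch: proportion of word assignments matching the source book
assignments %>%
  count(correct = title == consensus, wt = count) %>%
  mutate(share = n / sum(n))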
# the combination of the true book (`title`) and the book assigned to it (`consensus`)
# is useful for further exploration
library(scales)

assignments %>%
  count(title, consensus, wt = count) %>%
  group_by(title) %>%
  mutate(percent = n / sum(n)) %>%
  ggplot(aes(consensus, title, fill = percent)) +
  geom_tile() +
  scale_fill_gradient2(high = "red", label = percent_format()) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1),
        panel.grid = element_blank()) +
  labs(x = "Book words were assigned to",
       y = "Book words came from",
       fill = "% of assignments")
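The same counts can also be viewed as a plain wide table instead of a heat map; a small sketch using `spread()` from tidyr (already loaded above):

# confusion matrix as a wide table (illustration only)
assignments %>%
  count(title, consensus, wt = count) %>%
  spread(consensus, n)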
# What were the most commonly mistaken words?
wrong_words <- assignments %>%
  filter(title != consensus)

wrong_words

wrong_words %>%
  count(title, consensus, term, wt = count) %>%
  ungroup() %>%
  arrange(desc(n))
# confirm "flopson" appears only in *Great Expectations*, # even though it's assigned to the "Pride and Prejudice" cluster. word_counts %>% filter(word == "flopson")
## Alternative LDA implementations
library(dplyr)
library(tidytext)
library(stringr)
library(ggplot2)
theme_set(theme_light())
library(mallet)

# create a vector with one string per chapter
collapsed <- by_chapter_word %>%
  anti_join(stop_words, by = "word") %>%
  mutate(word = str_replace(word, "'", "")) %>%
  group_by(document) %>%
  summarize(text = paste(word, collapse = " "))

# create an empty file of "stopwords"
file.create(empty_file <- tempfile())
docs <- mallet.import(collapsed$document, collapsed$text, empty_file)

mallet_model <- MalletLDA(num.topics = 4)
mallet_model$loadDocuments(docs)
mallet_model$train(100)
# we can then use the `tidy()` and `augment()` functions described in the rest
# of the chapter in an almost identical way; this includes extracting the
# probabilities of words within each topic or topics within each document
# word-topic pairs
tidy(mallet_model)

# document-topic pairs
tidy(mallet_model, matrix = "gamma")

# the column needs to be named "term" for augment()
term_counts <- rename(word_counts, term = word)
augment(mallet_model, term_counts)