R: tidytext: topic-modelling
# Ref: https://github.com/dgrtwo/tidy-text-mining/blob/master/06-topic-models.Rmd
library(knitr)
opts_chunk$set(message = FALSE, warning = FALSE, cache = TRUE)
options(width = 100, dplyr.width = 150)
library(ggplot2)
library(methods)
library(scales)
theme_set(theme_light())
## Latent Dirichlet allocation
library(topicmodels)
data("AssociatedPress")
AssociatedPress
# LDA() returns an object containing the full details of the model fit,
# such as how words are associated with topics and
# how topics are associated with documents.
ap_lda <- LDA(AssociatedPress, k = 2, control = list(seed = 1234))
ap_lda
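# Optional peek at the fit with topicmodels' built-in accessors (a small addition,
# not part of the tidy workflow below; assumes the ap_lda object fitted above):
terms(ap_lda, 5)      # top 5 terms per topic
topics(ap_lda)[1:10]  # most likely topic for the first 10 documents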
### Word-topic probabilities
library(tidytext)
ap_topics <- tidy(ap_lda, matrix = "beta")
ap_topics
# plot the 10 terms that are most common within each topic
library(ggplot2)
library(dplyr)
ap_top_terms <- ap_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
ap_top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()
# consider the terms that had the *greatest difference* in $\beta$ between topic 1 and topic 2.
# This can be estimated based on the log ratio of the two: $\log_2(\frac{\beta_2}{\beta_1})$
# (a log ratio is useful because it makes the difference symmetrical:
# $\beta_2$ being twice as large leads to a log ratio of 1,
# while $\beta_1$ being twice as large results in -1).
# To constrain it to a set of especially relevant words,
# we can filter for relatively common words,
# such as those that have a $\beta$ greater than 1/1000 in at least one topic.
library(tidyr)
beta_spread <- ap_topics %>%
  mutate(topic = paste0("topic", topic)) %>%
  spread(topic, beta) %>%
  filter(topic1 > .001 | topic2 > .001) %>%
  mutate(log_ratio = log2(topic2 / topic1))
beta_spread
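# Quick illustrative check of the symmetry claim above (made-up beta values,
# not taken from the model):
log2(0.010 / 0.005)  # beta_2 twice as large -> log ratio of  1
log2(0.005 / 0.010)  # beta_1 twice as large -> log ratio of -1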
# plot the words with the greatest difference between the two topics
beta_spread %>%
  group_by(direction = log_ratio > 0) %>%
  top_n(10, abs(log_ratio)) %>%
  ungroup() %>%
  mutate(term = reorder(term, log_ratio)) %>%
  ggplot(aes(term, log_ratio)) +
  geom_col() +
  labs(y = "Log2 ratio of beta in topic 2 / topic 1") +
  coord_flip()
### Document-topic probabilities
ap_documents <- tidy(ap_lda, matrix = "gamma")
ap_documents
# check document 6
tidy(AssociatedPress) %>%
  filter(document == 6) %>%
  arrange(desc(count))
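# Optional follow-up (an addition, not in the original notes): count how many
# documents draw almost entirely from a single topic, using the gamma values above
ap_documents %>%
  filter(gamma > 0.99) %>%
  count(topic)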
## Example: the great library heist
# download the books from Project Gutenberg
titles <- c("Twenty Thousand Leagues under the Sea", "The War of the Worlds",
            "Pride and Prejudice", "Great Expectations")
library(gutenbergr)
books <- gutenberg_works(title %in% titles) %>%
  gutenberg_download(meta_fields = "title")
library(stringr)
# divide into documents, each representing one chapter
by_chapter <- books %>%
  group_by(title) %>%
  mutate(chapter = cumsum(str_detect(text, regex("^chapter ", ignore_case = TRUE)))) %>%
  ungroup() %>%
  filter(chapter > 0) %>%
  unite(document, title, chapter)
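# Optional sanity check (an addition, reusing the separate() call applied later in
# this example): how many chapters each book contributed
by_chapter %>%
  separate(document, c("title", "chapter"), sep = "_", convert = TRUE) %>%
  distinct(title, chapter) %>%
  count(title)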
# split into words
by_chapter_word <- by_chapter %>%
  unnest_tokens(word, text)
# find document-word counts
word_counts <- by_chapter_word %>%
  anti_join(stop_words) %>%
  count(document, word, sort = TRUE) %>%
  ungroup()
word_counts
### LDA on chapters
chapters_dtm <- word_counts %>%
  cast_dtm(document, word, n)
chapters_dtm
# use the `LDA()` function to create a four-topic model
chapters_lda <- LDA(chapters_dtm, k = 4, control = list(seed = 1234))
chapters_lda
# examine per-topic-per-word probabilities
chapter_topics <- tidy(chapters_lda, matrix = "beta")
chapter_topics
# use dplyr's `top_n()` to find the top 5 terms within each topic
top_terms <- chapter_topics %>%
  group_by(topic) %>%
  top_n(5, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
top_terms
# plot the top terms per topic
library(ggplot2)
top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()
### Per-document classification
chapters_gamma <- tidy(chapters_lda, matrix = "gamma")
chapters_gamma
# re-separate the document name into title and chapter
chapters_gamma <- chapters_gamma %>%
  separate(document, c("title", "chapter"), sep = "_", convert = TRUE)
chapters_gamma
# reorder titles in order of topic 1, topic 2, etc. before plotting
# NOTE: ERROR here
chapters_gamma %>%
  mutate(title = reorder(title, gamma * topic)) %>%
  ggplot(aes(factor(topic), gamma)) +
  geom_boxplot() +
  facet_wrap(~ title)
# it looks like some chapters from Great Expectations (which should be topic 4)
# were somewhat associated with other topics
chapter_classifications <- chapters_gamma %>%
  group_by(title, chapter) %>%
  top_n(1, gamma) %>%
  ungroup()
chapter_classifications
# find the consensus topic for each book, then check which chapters were misidentified
book_topics <- chapter_classifications %>%
  count(title, topic) %>%
  group_by(title) %>%
  top_n(1, n) %>%
  ungroup() %>%
  transmute(consensus = title, topic)
chapter_classifications %>%
  inner_join(book_topics, by = "topic") %>%
  filter(title != consensus)
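# Optional addition (not in the original notes): tally, per book, how many chapters
# land on their own book's consensus topic versus another book's
chapter_classifications %>%
  inner_join(book_topics, by = "topic") %>%
  count(title, correctly_assigned = title == consensus)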
### By word assignments: `augment`
assignments <- augment(chapters_lda, data = chapters_dtm)
assignments
# this returns a tidy data frame of book-term counts, but adds an extra column: `.topic`
assignments <- assignments %>%
  separate(document, c("title", "chapter"), sep = "_", convert = TRUE) %>%
  inner_join(book_topics, by = c(".topic" = "topic"))
assignments
# combination of the true book (`title`) and the book assigned to it (`consensus`)
# is useful for further exploration.
library(scales)
assignments %>%
  count(title, consensus, wt = count) %>%
  group_by(title) %>%
  mutate(percent = n / sum(n)) %>%
  ggplot(aes(consensus, title, fill = percent)) +
  geom_tile() +
  scale_fill_gradient2(high = "red", label = percent_format()) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1),
        panel.grid = element_blank()) +
  labs(x = "Book words were assigned to",
       y = "Book words came from",
       fill = "% of assignments")
# What were the most commonly mistaken words?
wrong_words <- assignments %>%
  filter(title != consensus)
wrong_words
wrong_words %>%
  count(title, consensus, term, wt = count) %>%
  ungroup() %>%
  arrange(desc(n))
# confirm "flopson" appears only in *Great Expectations*, # even though it's assigned to the "Pride and Prejudice" cluster. word_counts %>% filter(word == "flopson")
## Alternative LDA implementations
library(dplyr)
library(tidytext)
library(stringr)
library(ggplot2)
theme_set(theme_light())
library(mallet)
# create a vector with one string per chapter
collapsed <- by_chapter_word %>%
  anti_join(stop_words, by = "word") %>%
  mutate(word = str_replace(word, "'", "")) %>%
  group_by(document) %>%
  summarize(text = paste(word, collapse = " "))
# create an empty file of "stopwords"
file.create(empty_file <- tempfile())
docs <- mallet.import(collapsed$document, collapsed$text, empty_file)
mallet_model <- MalletLDA(num.topics = 4)
mallet_model$loadDocuments(docs)
mallet_model$train(100)
# use the `tidy()` and `augment()` functions described in the rest of the chapter
# in an almost identical way. This includes extracting the probabilities of words
# within each topic or topics within each document.
# word-topic pairs
tidy(mallet_model)
# document-topic pairs
tidy(mallet_model, matrix = "gamma")
# column needs to be named "term" for "augment"
term_counts <- rename(word_counts, term = word)
augment(mallet_model, term_counts)
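# Optional follow-up (an addition; assumes the tidied mallet output has the same
# topic/term/beta columns as the topicmodels output above): top 5 terms per mallet topic
tidy(mallet_model) %>%
  group_by(topic) %>%
  top_n(5, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)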