R: tidytext: topic-modelling

# Ref: https://github.com/dgrtwo/tidy-text-mining/blob/master/06-topic-models.Rmd
 
library(knitr)
opts_chunk$set(message = FALSE, warning = FALSE, cache = TRUE)
options(width = 100, dplyr.width = 150)
library(ggplot2)
library(methods)
library(scales)
theme_set(theme_light())
## Latent Dirichlet allocation
library(topicmodels)
data("AssociatedPress")
AssociatedPress
# This function returns an object containing the full details of the model fit,
# such as how words are associated with topics and
# how topics are associated with documents.
ap_lda <- LDA(AssociatedPress, k = 2, control = list(seed = 1234))
ap_lda
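# A quick way to inspect the fit (a minimal sketch, not in the original source):
# topicmodels provides terms() and topics() to list the most probable terms per
# topic and the most likely topic for each document.
terms(ap_lda, 10)
topics(ap_lda)[1:10]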


### Word-topic probabilities
library(tidytext)
ap_topics <- tidy(ap_lda, matrix = "beta")
ap_topics
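# Sanity check (a minimal sketch, not in the original): beta is a per-topic
# probability distribution over terms, so each topic's betas should sum to ~1.
library(dplyr)
ap_topics %>%
  group_by(topic) %>%
  summarize(total_beta = sum(beta))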
# plot
library(ggplot2)
library(dplyr)
ap_top_terms <- ap_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
ap_top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()


# consider the terms that had the *greatest difference* in $\beta$ between topic 1 and topic 2.
# This can be estimated based on the log ratio of the two: $\log_2(\frac{\beta_2}{\beta_1})$
# (a log ratio is useful because it makes the difference symmetrical:
# $\beta_2$ being twice as large leads to a log ratio of 1,
# while $\beta_1$ being twice as large results in -1).
# To constrain it to a set of especially relevant words,
# we can filter for relatively common words,
# such as those that have a $\beta$ greater than 1/1000 in at least one topic.
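# Quick numeric illustration (not in the original) of the symmetry noted above:
log2(0.002 / 0.001)  # beta_2 twice as large ->  1
log2(0.001 / 0.002)  # beta_1 twice as large -> -1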
library(tidyr)
beta_spread <- ap_topics %>%
  mutate(topic = paste0("topic", topic)) %>%
  spread(topic, beta) %>%
  filter(topic1 > .001 | topic2 > .001) %>%
  mutate(log_ratio = log2(topic2 / topic1))
beta_spread
# plot
beta_spread %>%
  group_by(direction = log_ratio > 0) %>%
  top_n(10, abs(log_ratio)) %>%
  ungroup() %>%
  mutate(term = reorder(term, log_ratio)) %>%
  ggplot(aes(term, log_ratio)) +
  geom_col() +
  labs(y = "Log2 ratio of beta in topic 2 / topic 1") +
  coord_flip()


### Document-topic probabilities
ap_documents <- tidy(ap_lda, matrix = "gamma")
ap_documents
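# Sanity check (a sketch, not in the original): gamma is a per-document
# distribution over topics, so each document's gammas should sum to ~1.
ap_documents %>%
  group_by(document) %>%
  summarize(total_gamma = sum(gamma))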
# check the most common words in document 6
tidy(AssociatedPress) %>%
  filter(document == 6) %>%
  arrange(desc(count))
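# Cross-check (a sketch, not in the original): the estimated topic mixture for
# the same document from the gamma matrix.
ap_documents %>%
  filter(document == 6)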



## Example: the great library heist
# download the books from Project Gutenberg
titles <- c("Twenty Thousand Leagues under the Sea", "The War of the Worlds",
            "Pride and Prejudice", "Great Expectations")
library(gutenbergr)
books <- gutenberg_works(title %in% titles) %>%
  gutenberg_download(meta_fields = "title")
library(stringr)
# divide into documents, each representing one chapter
by_chapter <- books %>%
  group_by(title) %>%
  mutate(chapter = cumsum(str_detect(text, regex("^chapter ", ignore_case = TRUE)))) %>%
  ungroup() %>%
  filter(chapter > 0) %>%
  unite(document, title, chapter)
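# Quick check (a sketch, not in the original): how many chapter-documents
# were created for each book.
by_chapter %>%
  distinct(document) %>%
  separate(document, c("title", "chapter"), sep = "_", convert = TRUE) %>%
  count(title)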
# split into words
by_chapter_word <- by_chapter %>%
  unnest_tokens(word, text)
# find document-word counts
word_counts <- by_chapter_word %>%
  anti_join(stop_words) %>%
  count(document, word, sort = TRUE) %>%
  ungroup()
word_counts


### LDA on chapters
chapters_dtm <- word_counts %>%
  cast_dtm(document, word, n)
chapters_dtm
# use the `LDA()` function to create a four-topic model
chapters_lda <- LDA(chapters_dtm, k = 4, control = list(seed = 1234))
chapters_lda
# examine per-topic-per-word probabilities.
chapter_topics <- tidy(chapters_lda, matrix = "beta")
chapter_topics
# use dplyr's `top_n()` to find the top 5 terms within each topic.
top_terms <- chapter_topics %>%
  group_by(topic) %>%
  top_n(5, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
top_terms
# plot
library(ggplot2)
top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()


### Per-document classification
chapters_gamma <- tidy(chapters_lda, matrix = "gamma")
chapters_gamma
# re-separate the document name into title and chapter
chapters_gamma <- chapters_gamma %>%
  separate(document, c("title", "chapter"), sep = "_", convert = TRUE)
chapters_gamma
# reorder titles in order of topic 1, topic 2, etc before plotting
# NOTE: an error occurred here
chapters_gamma %>%
  mutate(title = reorder(title, gamma * topic)) %>%
  ggplot(aes(factor(topic), gamma)) +
  geom_boxplot() +
  facet_wrap(~ title)
# It looks like some chapters from Great Expectations (which should be topic 4)
# were somewhat associated with other topics.
chapter_classifications <- chapters_gamma %>%
  group_by(title, chapter) %>%
  top_n(1, gamma) %>%
  ungroup()
chapter_classifications
book_topics <- chapter_classifications %>%
  count(title, topic) %>%
  group_by(title) %>%
  top_n(1, n) %>%
  ungroup() %>%
  transmute(consensus = title, topic)
chapter_classifications %>%
  inner_join(book_topics, by = "topic") %>%
  filter(title != consensus)


### By word assignments: `augment`
assignments <- augment(chapters_lda, data = chapters_dtm)
assignments
# returns a tidy data frame of book-term counts, but adds an extra column: `.topic`
assignments <- assignments %>%
  separate(document, c("title", "chapter"), sep = "_", convert = TRUE) %>%
  inner_join(book_topics, by = c(".topic" = "topic"))
assignments
# combination of the true book (`title`) and the book assigned to it (`consensus`)
# is useful for further exploration.
library(scales)
assignments %>%
  count(title, consensus, wt = count) %>%
  group_by(title) %>%
  mutate(percent = n / sum(n)) %>%
  ggplot(aes(consensus, title, fill = percent)) +
  geom_tile() +
  scale_fill_gradient2(high = "red", label = percent_format()) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1),
        panel.grid = element_blank()) +
  labs(x = "Book words were assigned to",
       y = "Book words came from",
       fill = "% of assignments")


# What were the most commonly mistaken words?
wrong_words <- assignments %>%
  filter(title != consensus)
wrong_words
wrong_words %>%
  count(title, consensus, term, wt = count) %>%
  ungroup() %>%
  arrange(desc(n))
# confirm that "flopson" appears only in *Great Expectations*,
# even though it's assigned to the "Pride and Prejudice" cluster.
word_counts %>%
  filter(word == "flopson")


## Alternative LDA implementations
library(dplyr)
library(tidytext)
library(stringr)
library(ggplot2)
theme_set(theme_light())
library(mallet)
# create a vector with one string per chapter
collapsed <- by_chapter_word %>%
  anti_join(stop_words, by = "word") %>%
  mutate(word = str_replace(word, "'", "")) %>%
  group_by(document) %>%
  summarize(text = paste(word, collapse = " "))
# create an empty file of "stopwords"
file.create(empty_file <- tempfile())
docs <- mallet.import(collapsed$document, collapsed$text, empty_file)
mallet_model <- MalletLDA(num.topics = 4)
mallet_model$loadDocuments(docs)
mallet_model$train(100)
# use the `tidy()` and `augment()` functions described in the rest of the chapter
# in an almost identical way. This includes extracting the probabilities of words
# within each topic or topics within each document.
# word-topic pairs
tidy(mallet_model)
# document-topic pairs
tidy(mallet_model, matrix = "gamma")
# column needs to be named "term" for "augment"
term_counts <- rename(word_counts, term = word)
augment(mallet_model, term_counts)
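# The same top-terms recipe used earlier works here too (a sketch, not in the
# original): the most probable terms per topic from the MALLET model.
tidy(mallet_model) %>%
  group_by(topic) %>%
  top_n(5, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)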

