Difference between revisions of "R: tidytext: tf-idf Jane Austen novels"

From OnnoWiki
Jump to: navigation, search
(Created page with " # Ref: https://github.com/dgrtwo/tidy-text-mining/blob/master/03-tf-idf.Rmd library(knitr) opts_chunk$set(message = FALSE, warning = FALSE, cache = TRUE) options(width =...")
 
(No difference)

Latest revision as of 04:42, 4 December 2019

# Ref: https://github.com/dgrtwo/tidy-text-mining/blob/master/03-tf-idf.Rmd
# Setup: attach the reporting and plotting packages, then configure them.
library(knitr)
library(ggplot2)

# Print width for the console and for dplyr tables.
options(dplyr.width = 100, width = 100)

# knitr chunk defaults: no messages, no warnings, cached results.
opts_chunk$set(cache = TRUE, warning = FALSE, message = FALSE)

# Default ggplot2 theme for every plot drawn after this point.
theme_set(theme_light())


library(dplyr)
library(janeaustenr)
library(tidytext)

# One row per (book, word) with its occurrence count n, most frequent first.
book_words <- austen_books() %>%
  unnest_tokens(word, text) %>%
  count(book, word, sort = TRUE)

# Total number of word occurrences per book; the denominator for n/total.
total_words <- book_words %>%
  group_by(book) %>%
  summarize(total = sum(n))

# Attach each book's total to its word rows. The join key is spelled out so
# the result does not depend on which column names the two tables happen to
# share, and no "Joining, by = ..." message is printed.
book_words <- left_join(book_words, total_words, by = "book")
book_words

# plot
# Distribution of n/total per book. bins = 30 is ggplot2's default, stated
# explicitly so the plot is unchanged but the "pick better value" message
# goes away. The x axis is capped at 0.0009; values above the cap are left
# out of the histogram.
library(ggplot2)
ggplot(book_words, aes(n/total, fill = book)) +
  geom_histogram(bins = 30, show.legend = FALSE) +
  xlim(NA, 0.0009) +
  facet_wrap(~book, ncol = 2, scales = "free_y")





## Zipf's law
# Rank every word within its book and compute its term frequency.
# row_number() is used as the rank, so this assumes book_words is still
# ordered by descending n (as produced by count(..., sort = TRUE)).
# The grouping is dropped afterwards so later steps do not silently
# operate per book.
freq_by_rank <- book_words %>%
  group_by(book) %>%
  mutate(rank = row_number(),
         `term frequency` = n/total) %>%
  ungroup()
freq_by_rank

# plot
# Rank against term frequency on log-log axes, one line per book.
freq_by_rank %>%
  ggplot(aes(rank, `term frequency`, color = book)) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) +
  scale_x_log10() +
  scale_y_log10()

# rank subset
# Keep only ranks strictly between 10 and 500 for the fit.
rank_subset <- freq_by_rank %>%
  filter(rank < 500,
         rank > 10)

# Fit log10(term frequency) as a linear function of log10(rank). The fit is
# stored so the reference line below uses the fitted coefficients instead of
# hand-copied constants, and it is still printed as before.
zipf_fit <- lm(log10(`term frequency`) ~ log10(rank), data = rank_subset)
zipf_fit

# plot
# Same log-log plot with the fitted line drawn underneath as a dashed
# gray reference.
freq_by_rank %>%
  ggplot(aes(rank, `term frequency`, color = book)) +
  geom_abline(intercept = coef(zipf_fit)[[1]],
              slope = coef(zipf_fit)[[2]],
              color = "gray50", linetype = 2) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) +
  scale_x_log10() +
  scale_y_log10()






## The `bind_tf_idf` function
# Add tf, idf and tf_idf columns, treating each book as one document and
# n as the per-document count of each word.
book_words <- book_words %>%
  bind_tf_idf(word, book, n)
book_words

# Terms with high tf-idf in Jane Austen's works.
book_words %>%
  select(-total) %>%
  arrange(desc(tf_idf))

# plot
# Top 15 words per book by tf-idf, one panel per book.
book_words %>%
  arrange(desc(tf_idf)) %>%
  # Fix the factor levels in reverse tf-idf order so that, after
  # coord_flip(), the highest-scoring words are drawn at the top.
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  group_by(book) %>%
  # Rank explicitly by tf_idf rather than by whichever column is last;
  # this also avoids the "Selecting by tf_idf" message. Ties at the
  # cut-off may return more than 15 rows for a book.
  top_n(15, tf_idf) %>%
  ungroup() %>%
  ggplot(aes(word, tf_idf, fill = book)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~book, ncol = 2, scales = "free") +
  coord_flip()


Referensi


Pranala Menarik