R: tidytext: tf-idf of Jane Austen's novels
Jump to navigation
Jump to search
# Ref: https://github.com/dgrtwo/tidy-text-mining/blob/master/03-tf-idf.Rmd
# Setup ----
# Attach the reporting and plotting packages used by the chunks in this file.
library(knitr)
library(ggplot2)

# Chunk defaults: cache results, and keep messages and warnings out of the
# rendered output.
opts_chunk$set(cache = TRUE, warning = FALSE, message = FALSE)

# Set both the console width and the dplyr.width option to 100.
options(dplyr.width = 100, width = 100)

# Make theme_light() the default theme for subsequent ggplot2 plots.
theme_set(theme_light())
# Term frequency ----
library(dplyr)
library(janeaustenr)
library(tidytext)

# One row per (book, word) pair with its count `n`, most frequent first.
book_words <- austen_books() %>%
  unnest_tokens(word, text) %>%
  count(book, word, sort = TRUE)

# Total number of word tokens in each book.
total_words <- book_words %>%
  group_by(book) %>%
  summarize(total = sum(n))

# Attach each book's total to its word counts. The join key is spelled out so
# the result does not depend on whichever column names the two tables happen
# to share, and so no "Joining, by = ..." message is emitted.
book_words <- left_join(book_words, total_words, by = "book")

book_words
# Histogram of n / total (a word's share of its book's tokens), one panel per
# book. The x axis is capped at 0.0009; each panel gets its own y scale.
library(ggplot2)

ggplot(data = book_words, mapping = aes(x = n / total, fill = book)) +
  geom_histogram(show.legend = FALSE) +
  facet_wrap(~book, scales = "free_y", ncol = 2) +
  xlim(NA, 0.0009)
## Zipf's law
# Rank every word within its book and compute its term frequency.
# row_number() numbers rows in their current order, so this assumes book_words
# is still sorted by descending n (it is built with count(sort = TRUE)).
# The grouping is dropped afterwards so later verbs operate on the whole table
# instead of silently working per book.
freq_by_rank <- book_words %>%
  group_by(book) %>%
  mutate(
    rank = row_number(),
    `term frequency` = n / total
  ) %>%
  ungroup()

freq_by_rank
# Term frequency against rank, one line per book, with both axes on a log10
# scale. The legend is suppressed.
ggplot(
  data = freq_by_rank,
  mapping = aes(x = rank, y = `term frequency`, color = book)
) +
  geom_line(show.legend = FALSE, alpha = 0.8, size = 1.1) +
  scale_x_log10() +
  scale_y_log10()
# Keep only ranks strictly between 10 and 500, then fit a straight line to
# log10(term frequency) as a function of log10(rank) on that subset.
rank_subset <- filter(freq_by_rank, rank > 10, rank < 500)

lm(log10(`term frequency`) ~ log10(rank), data = rank_subset)
# Same log-log rank/frequency plot, with a dashed gray reference line drawn
# underneath the per-book lines.
# NOTE(review): intercept -0.62 and slope -1.1 are hard-coded; presumably they
# come from the lm() fit on rank_subset -- confirm if the data changes.
ggplot(
  data = freq_by_rank,
  mapping = aes(x = rank, y = `term frequency`, color = book)
) +
  geom_abline(slope = -1.1, intercept = -0.62, linetype = 2, color = "gray50") +
  geom_line(show.legend = FALSE, alpha = 0.8, size = 1.1) +
  scale_x_log10() +
  scale_y_log10()
## The `bind_tf_idf` function
# Compute tf-idf with `word` as the term, `book` as the document and `n` as the
# count, overwriting book_words with the augmented table. Later chunks read
# the resulting tf_idf column.
book_words <- bind_tf_idf(book_words, word, book, n)

book_words
# Terms with high tf-idf in Jane Austen's works: sort by descending tf_idf and
# hide the `total` column from the printed table.
book_words %>%
  arrange(desc(tf_idf)) %>%
  select(-total)
# Bar chart of the 15 highest tf-idf words in each book, one panel per book.
book_words %>%
  arrange(desc(tf_idf)) %>%
  # Fix the factor levels in reverse tf_idf order so that, after coord_flip(),
  # the highest-scoring words are drawn at the top of the panels.
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  group_by(book) %>%
  # Name the ranking column explicitly: without `wt`, top_n() falls back to the
  # last column of the table, which silently changes if a column is added.
  top_n(15, wt = tf_idf) %>%
  ungroup() %>%
  ggplot(aes(word, tf_idf, fill = book)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~book, ncol = 2, scales = "free") +
  coord_flip()
Referensi