R: tidytext: tf-idf corpus of physics texts
Jump to navigation
Jump to search
# Ref: https://github.com/dgrtwo/tidy-text-mining/blob/master/03-tf-idf.Rmd
# Setup ----
# knitr chunk defaults: suppress messages and warnings, cache chunk results.
library(knitr)
opts_chunk$set(message = FALSE, warning = FALSE, cache = TRUE)

# Widen console output and dplyr/tibble printing to 100 characters.
options(width = 100, dplyr.width = 100)

# Packages the analysis below relies on but which were never attached:
#   dplyr    - %>%, count(), mutate(), filter(), select(), group_by(),
#              top_n(), ungroup(), anti_join(), tibble()
#   tidytext - unnest_tokens(), bind_tf_idf(), reorder_within(),
#              scale_x_reordered()
library(dplyr)
library(tidytext)

# Plotting, with a light theme as the default for every plot.
library(ggplot2)
theme_set(theme_light())
# Download the physics corpus ----
# Fetch four Project Gutenberg works by their numeric IDs and request the
# "author" metadata field, which the later steps group and facet by.
library(gutenbergr)

physics <- gutenberg_download(
  c(37729, 14725, 13476, 30155),
  meta_fields = "author"
)
# Word counts per author ----
# Tokenise the `text` column into a `word` column, then count how often each
# author uses each word, sorted with the largest counts first.
physics_words <- count(
  unnest_tokens(physics, word, text),
  author, word,
  sort = TRUE
)

# Display the resulting counts.
physics_words
# tf-idf per author and first plot ----
library(forcats)

# Compute tf-idf with each author treated as one document.
# `word` becomes a factor ordered by tf-idf; `author` gets an explicit level
# order, which is the order the facets are drawn in.
plot_physics <- physics_words %>%
  bind_tf_idf(word, author, n) %>%
  mutate(word = fct_reorder(word, tf_idf)) %>%
  mutate(author = factor(author, levels = c(
    "Galilei, Galileo",
    "Huygens, Christiaan",
    "Tesla, Nikola",
    "Einstein, Albert"
  )))

# Plot the 15 highest tf-idf words of each author as horizontal bars.
# A plain reorder(word, tf_idf) ranks every word once across all authors, so
# a word that appears in two facets cannot sit in the right place in both.
# reorder_within() orders words separately per author, and
# scale_x_reordered() strips the suffix it appends from the axis labels.
plot_physics %>%
  group_by(author) %>%
  top_n(15, tf_idf) %>%
  ungroup() %>%
  mutate(word = reorder_within(word, tf_idf, author)) %>%
  ggplot(aes(word, tf_idf, fill = author)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~author, ncol = 2, scales = "free") +
  coord_flip() +
  scale_x_reordered()
# Inspect "_k_" ----
# Show the raw text lines that contain "_k_", to see where that token
# comes from in the source documents.
library(stringr)

select(
  filter(physics, str_detect(text, "_k_")),
  text
)
# Clean up: inspect "RC" ----
# Show the raw text lines that contain "RC" before deciding which tokens to
# drop as custom stop words.
select(
  filter(physics, str_detect(text, "RC")),
  text
)
# Custom stop words and final plot ----
# Tokens to drop from the word counts before recomputing tf-idf.
mystopwords <- tibble(word = c(
  "eq", "co", "rc", "ac", "ak", "bn", "fig", "file",
  "cg", "cb", "cm", "ab", "_k", "_k_", "_x"
))

# Keep only the rows of physics_words whose word is not a custom stop word.
physics_words <- anti_join(physics_words, mystopwords, by = "word")

# Recompute tf-idf on the filtered counts, strip underscores from the
# remaining words, and keep each author's 15 highest tf-idf words.
# reorder_within() orders words separately for every author; the author
# factor levels fix the facet order.
plot_physics <- physics_words %>%
  bind_tf_idf(word, author, n) %>%
  mutate(word = str_remove_all(word, "_")) %>%
  group_by(author) %>%
  top_n(15, tf_idf) %>%
  ungroup() %>%
  mutate(word = reorder_within(word, tf_idf, author)) %>%
  mutate(author = factor(author, levels = c(
    "Galilei, Galileo",
    "Huygens, Christiaan",
    "Tesla, Nikola",
    "Einstein, Albert"
  )))

# One facet per author, horizontal bars; scale_x_reordered() cleans up the
# axis labels produced by reorder_within().
ggplot(plot_physics, aes(word, tf_idf, fill = author)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~author, ncol = 2, scales = "free") +
  coord_flip() +
  scale_x_reordered()