R: tidytext: compare text
Jump to navigation
Jump to search
# Ref: https://github.com/dgrtwo/tidy-text-mining/blob/master/01-tidy-text.Rmd
# Setup ----
# The statements below were collapsed onto a single line, which R cannot
# parse; each one now sits on its own line.

# knitr chunk defaults: hide messages and warnings, cache chunk results.
library(knitr)
opts_chunk$set(message = FALSE, warning = FALSE, cache = TRUE)

# Print up to 100 characters per line, both at the console and for tibbles.
options(width = 100, dplyr.width = 100)

# Make theme_light() the default ggplot2 theme.
library(ggplot2)
theme_set(theme_light())
# Jane Austen ----
library(janeaustenr)
library(dplyr)
library(stringr)

# Within each book, number the lines and keep a running chapter counter.
# The counter increases on every line that starts with "chapter " followed by
# a digit or one of the letters i, v, x, l, c (case-insensitive).
original_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(
      str_detect(text, regex("^chapter [\\divxlc]", ignore_case = TRUE))
    )
  ) %>%
  ungroup()

original_books

# Split the `text` column into one token per row, stored in `word`.
library(tidytext)
tidy_books <- original_books %>%
  unnest_tokens(word, text)

tidy_books

# Drop every row whose word appears in tidytext's `stop_words` table.
# The join key is spelled out so dplyr does not have to guess it (and does not
# print a "Joining with `by = ...`" message).
data(stop_words)
tidy_books <- tidy_books %>%
  anti_join(stop_words, by = "word")

# Remaining words, most frequent first.
tidy_books %>%
  count(word, sort = TRUE)
# H.G. Wells ----
library(gutenbergr)

# Download four texts by their Project Gutenberg IDs (requires network access).
hgwells <- gutenberg_download(c(35, 36, 5230, 159))

# One token per row in `word`, then drop rows whose word is in `stop_words`.
# The join key is explicit so dplyr does not have to guess it.
tidy_hgwells <- hgwells %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word")

# Remaining words, most frequent first.
tidy_hgwells %>%
  count(word, sort = TRUE)
# Brontë sisters ----
# Download five texts by their Project Gutenberg IDs (requires network access).
bronte <- gutenberg_download(c(1260, 768, 969, 9182, 767))

# One token per row in `word`, then drop rows whose word is in `stop_words`.
# The join key is explicit so dplyr does not have to guess it.
tidy_bronte <- bronte %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word")

# Remaining words, most frequent first.
tidy_bronte %>%
  count(word, sort = TRUE)
# Word frequencies per author ----
# Calculate the frequency of each word in the works of Jane Austen, the Brontë
# sisters, and H.G. Wells by binding the three tidy data frames together, then
# reshape with tidyr so each row pairs one author's proportion with Jane
# Austen's proportion for the same word.
library(tidyr)

frequency <- bind_rows(
  mutate(tidy_bronte, author = "Brontë Sisters"),
  mutate(tidy_hgwells, author = "H.G. Wells"),
  mutate(tidy_books, author = "Jane Austen")
) %>%
  # Keep only the first run of lowercase letters/apostrophes in each token;
  # tokens with no such run become NA.
  mutate(word = str_extract(word, "[a-z']+")) %>%
  count(author, word) %>%
  # Share of each word among all counted words of the same author.
  group_by(author) %>%
  mutate(proportion = n / sum(n)) %>%
  ungroup() %>%
  select(-n) %>%
  # Wide: one column of proportions per author, one row per word.
  pivot_wider(names_from = author, values_from = proportion) %>%
  # Long again for the two comparison authors only; `Jane Austen` stays as its
  # own column so it can serve as the common reference.
  pivot_longer(
    cols = c(`Brontë Sisters`, `H.G. Wells`),
    names_to = "author",
    values_to = "proportion"
  )
# Plot the comparison ----
# One facet per comparison author: that author's word proportion on the x axis,
# Jane Austen's on the y axis, both on log10 scales labelled as percentages.
library(scales)

# Expect a warning about rows with missing values being removed.
ggplot(
  frequency,
  aes(
    x = proportion,
    y = `Jane Austen`,
    # Colour encodes the absolute gap between the two proportions.
    color = abs(`Jane Austen` - proportion)
  )
) +
  # Dashed reference line (default slope 1, intercept 0): equal proportions.
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  # Label points with their word, skipping labels that would overlap.
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_color_gradient(
    limits = c(0, 0.001),
    low = "darkslategray4",
    high = "gray75"
  ) +
  facet_wrap(~author, ncol = 2) +
  theme(legend.position = "none") +
  labs(y = "Jane Austen", x = NULL)
# Correlation tests ----
# Quantify how similar the sets of word frequencies are: test the correlation
# between each comparison author's proportions and Jane Austen's.

# Brontë sisters vs. Jane Austen
cor.test(
  ~ proportion + `Jane Austen`,
  data = frequency[frequency$author == "Brontë Sisters", ]
)

# H.G. Wells vs. Jane Austen
cor.test(
  ~ proportion + `Jane Austen`,
  data = frequency[frequency$author == "H.G. Wells", ]
)
Referensi