R: tidytext: word-combinations Tokenizing by n-gram
# Ref: https://github.com/dgrtwo/tidy-text-mining/blob/master/04-word-combinations.Rmd
library(knitr)
opts_chunk$set(message = FALSE, warning = FALSE, cache = TRUE)
options(width = 100, dplyr.width = 100)

library(ggplot2)
theme_set(theme_light())
## Tokenizing by n-gram
library(dplyr)
library(tidytext)
library(janeaustenr)

austen_bigrams <- austen_books() %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)

austen_bigrams
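# Side note (not in the original chapter): a quick toy input makes the
# overlap visible -- each interior word appears in two consecutive bigrams.
tibble(text = "the quick brown fox jumps") %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)
# -> "the quick", "quick brown", "brown fox", "fox jumps"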
### Counting and filtering n-grams
austen_bigrams %>%
  count(bigram, sort = TRUE)
# remove stopwords
library(tidyr)

bigrams_separated <- austen_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")

bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)

# new bigram counts:
bigram_counts <- bigrams_filtered %>%
  count(word1, word2, sort = TRUE)

bigram_counts
# recombine word1 and word2 into a single bigram column
bigrams_united <- bigrams_filtered %>%
  unite(bigram, word1, word2, sep = " ")

bigrams_united
# trigrams
austen_books() %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word,
         !word3 %in% stop_words$word) %>%
  count(word1, word2, word3, sort = TRUE)
### Analyzing bigrams
bigrams_filtered %>%
  filter(word2 == "street") %>%
  count(book, word1, sort = TRUE)
# tf-idf of bigrams
bigram_tf_idf <- bigrams_united %>%
  count(book, bigram) %>%
  bind_tf_idf(bigram, book, n) %>%
  arrange(desc(tf_idf))

bigram_tf_idf
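# Side check (not in the chapter): what bind_tf_idf() is computing above,
# spelled out by hand with the usual definitions -- tf = bigram count over
# the book's total bigrams, idf = natural log of (#books / #books
# containing the bigram). The result should match bigram_tf_idf.
n_books <- n_distinct(bigrams_united$book)

bigrams_united %>%
  count(book, bigram) %>%
  group_by(book) %>%
  mutate(tf = n / sum(n)) %>%
  group_by(bigram) %>%
  mutate(idf = log(n_books / n_distinct(book)),
         tf_idf = tf * idf) %>%
  ungroup() %>%
  arrange(desc(tf_idf))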
# plot
library(ggplot2)

bigram_tf_idf %>%
  arrange(desc(tf_idf)) %>%
  group_by(book) %>%
  top_n(12, tf_idf) %>%
  ungroup() %>%
  mutate(bigram = reorder(bigram, tf_idf)) %>%
  ggplot(aes(bigram, tf_idf, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ book, ncol = 2, scales = "free") +
  coord_flip() +
  labs(y = "tf-idf of bigram to novel", x = "")
### Using bigrams to provide context in sentiment analysis
bigrams_separated %>%
  filter(word1 == "not") %>%
  count(word1, word2, sort = TRUE)
# AFINN sentiment lexicon
AFINN <- get_sentiments("afinn")

AFINN
# examine the most frequent words that were preceded by "not"
# and were associated with a sentiment
not_words <- bigrams_separated %>%
  filter(word1 == "not") %>%
  inner_join(AFINN, by = c(word2 = "word")) %>%
  count(word2, value, sort = TRUE)

not_words
# plot
library(ggplot2)

not_words %>%
  mutate(contribution = n * value) %>%
  arrange(desc(abs(contribution))) %>%
  head(20) %>%
  mutate(word2 = reorder(word2, contribution)) %>%
  ggplot(aes(word2, n * value, fill = n * value > 0)) +
  geom_col(show.legend = FALSE) +
  xlab("Words preceded by \"not\"") +
  ylab("Sentiment value * number of occurrences") +
  coord_flip()
# pick four common words (or more) that negate the subsequent term,
# and use the same joining and counting approach to examine all of them at once
negation_words <- c("not", "no", "never", "without")

negated_words <- bigrams_separated %>%
  filter(word1 %in% negation_words) %>%
  inner_join(AFINN, by = c(word2 = "word")) %>%
  count(word1, word2, value, sort = TRUE)
# plot
negated_words %>%
  mutate(contribution = n * value,
         word2 = reorder(paste(word2, word1, sep = "__"), contribution)) %>%
  group_by(word1) %>%
  top_n(12, abs(contribution)) %>%
  ggplot(aes(word2, contribution, fill = n * value > 0)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ word1, scales = "free") +
  scale_x_discrete(labels = function(x) gsub("__.+$", "", x)) +
  xlab("Words preceded by negation term") +
  ylab("Sentiment value * # of occurrences") +
  coord_flip()
### Visualizing a network of bigrams with ggraph
library(igraph)

# original counts
bigram_counts

# filter for only relatively common combinations
bigram_graph <- bigram_counts %>%
  filter(n > 20) %>%
  graph_from_data_frame()

bigram_graph
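# Side note (not in the chapter): graph_from_data_frame() reads the first
# two columns as an edge list (from, to) and keeps any remaining columns
# (here n) as edge attributes. A minimal sketch with made-up counts:
tibble(from = c("a", "a", "b"),
       to = c("b", "c", "c"),
       n = c(3, 1, 2)) %>%
  graph_from_data_frame()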
# plot
library(ggraph)
set.seed(2017)

ggraph(bigram_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)
# plot with directional arrows
set.seed(2016)

a <- grid::arrow(type = "closed", length = unit(.15, "inches"))

ggraph(bigram_graph, layout = "fr") +
  geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
                 arrow = a, end_cap = circle(.07, 'inches')) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
  theme_void()
### Visualizing bigrams in other texts
library(dplyr)
library(tidyr)
library(tidytext)
library(ggplot2)
library(igraph)
library(ggraph)

count_bigrams <- function(dataset) {
  dataset %>%
    unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
    separate(bigram, c("word1", "word2"), sep = " ") %>%
    filter(!word1 %in% stop_words$word,
           !word2 %in% stop_words$word) %>%
    count(word1, word2, sort = TRUE)
}

visualize_bigrams <- function(bigrams) {
  set.seed(2016)
  a <- grid::arrow(type = "closed", length = unit(.15, "inches"))

  bigrams %>%
    graph_from_data_frame() %>%
    ggraph(layout = "fr") +
    geom_edge_link(aes(edge_alpha = n), show.legend = FALSE, arrow = a) +
    geom_node_point(color = "lightblue", size = 5) +
    geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
    theme_void()
}
# the King James version is book 10 on Project Gutenberg:
library(gutenbergr)
kjv <- gutenberg_download(10)
library(stringr)

kjv_bigrams <- kjv %>%
  count_bigrams()

# filter out rare combinations, as well as digits
kjv_bigrams %>%
  filter(n > 40,
         !str_detect(word1, "\\d"),
         !str_detect(word2, "\\d")) %>%
  visualize_bigrams()
## Counting and correlating pairs of words with the widyr package
austen_section_words <- austen_books() %>%
  filter(book == "Pride & Prejudice") %>%
  mutate(section = row_number() %/% 10) %>%
  filter(section > 0) %>%
  unnest_tokens(word, text) %>%
  filter(!word %in% stop_words$word)

austen_section_words
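# Side note (not in the chapter): row_number() %/% 10 buckets consecutive
# lines into ~10-line sections; filter(section > 0) drops the first
# partial bucket (rows 1-9). For example:
(1:25) %/% 10
# -> 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2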
# word pairs
library(widyr)

# count words co-occurring within sections
word_pairs <- austen_section_words %>%
  pairwise_count(word, section, sort = TRUE)

word_pairs
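# Side note (not in the chapter): a toy sketch of what pairwise_count()
# does -- for every pair of items, count how many groups contain both.
tibble(section = c(1, 1, 1, 2, 2),
       word = c("a", "b", "c", "a", "b")) %>%
  pairwise_count(word, section, sort = TRUE)
# "a" and "b" share two sections, so that pair gets n = 2 (in both orders)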
# find the words that most often occur with Darcy
word_pairs %>%
  filter(item1 == "darcy")
### Pairwise correlation
# we need to filter for at least relatively common words first
word_cors <- austen_section_words %>%
  group_by(word) %>%
  filter(n() >= 20) %>%
  pairwise_cor(word, section, sort = TRUE)

word_cors
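# Side check (not spelled out here, but per the tidytext book this is the
# phi coefficient): for two words, build the 2x2 table of per-section
# presence and compute
#   phi = (n11 * n00 - n10 * n01) / sqrt(n1. * n0. * n.1 * n.0)
# where n11 = sections containing both words, n00 = sections with neither,
# etc. phi_coef() below is a hypothetical helper, as a sanity check; it
# should land close to the corresponding word_cors row (it may differ
# slightly if the n() >= 20 filter changes which sections pairwise_cor sees).
phi_coef <- function(words_df, w1, w2) {
  secs <- unique(words_df$section)
  x <- secs %in% words_df$section[words_df$word == w1]
  y <- secs %in% words_df$section[words_df$word == w2]
  n11 <- sum(x & y); n00 <- sum(!x & !y)
  n10 <- sum(x & !y); n01 <- sum(!x & y)
  (n11 * n00 - n10 * n01) /
    sqrt(as.numeric(sum(x)) * sum(!x) * sum(y) * sum(!y))
}

phi_coef(austen_section_words, "elizabeth", "darcy")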
# words most correlated with a word like "pounds", using a filter operation
word_cors %>%
  filter(item1 == "pounds")
# plot
word_cors %>%
  filter(item1 %in% c("elizabeth", "pounds", "married", "pride")) %>%
  group_by(item1) %>%
  top_n(6) %>%
  ungroup() %>%
  mutate(item2 = reorder(item2, correlation)) %>%
  ggplot(aes(item2, correlation)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ item1, scales = "free") +
  coord_flip()
# plot: visualize the correlations and clusters of words
set.seed(2016)

word_cors %>%
  filter(correlation > .15) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void()