R: tidytext: word combinations (tokenizing by n-gram)
# Ref: https://github.com/dgrtwo/tidy-text-mining/blob/master/04-word-combinations.Rmd
library(knitr)
opts_chunk$set(message = FALSE, warning = FALSE, cache = TRUE)
options(width = 100, dplyr.width = 100)
library(ggplot2)
theme_set(theme_light())
## Tokenizing by n-gram
library(dplyr)
library(tidytext)
library(janeaustenr)
austen_bigrams <- austen_books() %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)
austen_bigrams
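# A toy illustration (not from the book): n-gram tokenization slides a
# window over the text, so consecutive bigrams overlap by one word
tibble(text = "it is a truth universally acknowledged") %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)
# -> "it is", "is a", "a truth", "truth universally", "universally acknowledged"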
### Counting and filtering n-grams
austen_bigrams %>%
  count(bigram, sort = TRUE)
# remove stop words
library(tidyr)
bigrams_separated <- austen_bigrams %>%
separate(bigram, c("word1", "word2"), sep = " ")
bigrams_filtered <- bigrams_separated %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word)
# new bigram counts:
bigram_counts <- bigrams_filtered %>%
count(word1, word2, sort = TRUE)
bigram_counts
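# Depending on the tokenizers/tidytext version, lines with fewer than two
# words come through as NA bigrams, and NA survives the stop-word filter
# (NA %in% stop_words$word is FALSE); if "NA NA" tops the counts, drop those
# rows first (a defensive sketch, not the book's code)
bigrams_filtered %>%
  filter(!is.na(word1), !is.na(word2)) %>%
  count(word1, word2, sort = TRUE)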
# recombine the filtered words into bigrams
bigrams_united <- bigrams_filtered %>%
  unite(bigram, word1, word2, sep = " ")
bigrams_united
# trigram
austen_books() %>%
unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
filter(!word1 %in% stop_words$word,
!word2 %in% stop_words$word,
!word3 %in% stop_words$word) %>%
count(word1, word2, word3, sort = TRUE)
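# A hypothetical helper (not from the book) generalizing this pattern to
# any n; assumes a recent dplyr (>= 1.0.4) for across() and if_all()
count_ngrams <- function(dataset, n) {
  cols <- paste0("word", seq_len(n))
  dataset %>%
    unnest_tokens(ngram, text, token = "ngrams", n = n) %>%
    separate(ngram, cols, sep = " ") %>%
    filter(if_all(all_of(cols), ~ !is.na(.x) & !.x %in% stop_words$word)) %>%
    count(across(all_of(cols)), sort = TRUE)
}
# e.g. count_ngrams(austen_books(), 3) reproduces the trigram counts above
# (minus any NA rows)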
### Analyzing bigrams
# the most common "streets" mentioned in each book
bigrams_filtered %>%
  filter(word2 == "street") %>%
  count(book, word1, sort = TRUE)
# tf-idf of bigrams, treating each novel as a document
bigram_tf_idf <- bigrams_united %>%
  count(book, bigram) %>%
  bind_tf_idf(bigram, book, n) %>%
  arrange(desc(tf_idf))
bigram_tf_idf
# plot
library(ggplot2)
bigram_tf_idf %>%
arrange(desc(tf_idf)) %>%
group_by(book) %>%
top_n(12, tf_idf) %>%
ungroup() %>%
mutate(bigram = reorder(bigram, tf_idf)) %>%
ggplot(aes(bigram, tf_idf, fill = book)) +
geom_col(show.legend = FALSE) +
facet_wrap(~ book, ncol = 2, scales = "free") +
coord_flip() +
labs(y = "tf-idf of bigram to novel",
x = "")
### Using bigrams to provide context in sentiment analysis
# how often are words preceded by "not"?
bigrams_separated %>%
  filter(word1 == "not") %>%
  count(word1, word2, sort = TRUE)
# the AFINN lexicon gives each word a numeric sentiment value,
# positive or negative according to the word's direction of sentiment
AFINN <- get_sentiments("afinn")
AFINN
# examine the most frequent words that were preceded by "not"
# and were associated with a sentiment
not_words <- bigrams_separated %>%
  filter(word1 == "not") %>%
  inner_join(AFINN, by = c(word2 = "word")) %>%
  count(word2, value, sort = TRUE)
not_words
# plot
library(ggplot2)
not_words %>%
mutate(contribution = n * value) %>%
arrange(desc(abs(contribution))) %>%
head(20) %>%
mutate(word2 = reorder(word2, contribution)) %>%
ggplot(aes(word2, n * value, fill = n * value > 0)) +
geom_col(show.legend = FALSE) +
xlab("Words preceded by \"not\"") +
ylab("Sentiment value * number of occurrences") +
coord_flip()
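# A rough follow-up (a sketch, not from the book): the net sentiment a
# plain unigram analysis would misattribute because of a preceding "not"
not_words %>%
  summarize(misattributed = sum(n * value))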
# pick four common words (or more) that negate the subsequent term,
# and use the same joining and counting approach to examine all of them at once.
negation_words <- c("not", "no", "never", "without")
negated_words <- bigrams_separated %>%
filter(word1 %in% negation_words) %>%
inner_join(AFINN, by = c(word2 = "word")) %>%
count(word1, word2, value, sort = TRUE)
# plot
negated_words %>%
mutate(contribution = n * value,
word2 = reorder(paste(word2, word1, sep = "__"), contribution)) %>%
group_by(word1) %>%
top_n(12, abs(contribution)) %>%
ggplot(aes(word2, contribution, fill = n * value > 0)) +
geom_col(show.legend = FALSE) +
facet_wrap(~ word1, scales = "free") +
scale_x_discrete(labels = function(x) gsub("__.+$", "", x)) +
xlab("Words preceded by negation term") +
ylab("Sentiment value * # of occurrences") +
coord_flip()
### Visualizing a network of bigrams with ggraph
library(igraph)
# original counts
bigram_counts
# filter for only relatively common combinations
bigram_graph <- bigram_counts %>%
  filter(n > 20) %>%
  graph_from_data_frame()
bigram_graph
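# Peeking at the igraph object (a sketch, not in the book):
# graph_from_data_frame() uses the first two columns as an edge list and
# keeps n as an edge attribute; the graph is directed (word1 -> word2),
# which is why the arrows drawn below are meaningful
head(V(bigram_graph)$name)
head(E(bigram_graph)$n)
is_directed(bigram_graph)   # TRUE by default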
# plot the network
library(ggraph)
set.seed(2017)
ggraph(bigram_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)
# same plot, with directional arrows and edge transparency scaled by count
set.seed(2016)
a <- grid::arrow(type = "closed", length = unit(.15, "inches"))
ggraph(bigram_graph, layout = "fr") +
geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
arrow = a, end_cap = circle(.07, 'inches')) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
theme_void()
### Visualizing bigrams in other texts
library(dplyr)
library(tidyr)
library(tidytext)
library(ggplot2)
library(igraph)
library(ggraph)
count_bigrams <- function(dataset) {
dataset %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
separate(bigram, c("word1", "word2"), sep = " ") %>%
filter(!word1 %in% stop_words$word,
!word2 %in% stop_words$word) %>%
count(word1, word2, sort = TRUE)
}
visualize_bigrams <- function(bigrams) {
set.seed(2016)
a <- grid::arrow(type = "closed", length = unit(.15, "inches"))
bigrams %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = n), show.legend = FALSE, arrow = a) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
theme_void()
}
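# Quick sanity check (not in the original): the two helpers reproduce the
# Austen bigram graph from above, minus the end_cap styling
austen_books() %>%
  count_bigrams() %>%
  filter(n > 20) %>%
  visualize_bigrams()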
# the King James Version is book 10 on Project Gutenberg
library(gutenbergr)
kjv <- gutenberg_download(10)
library(stringr)
kjv_bigrams <- kjv %>%
count_bigrams()
# filter out rare combinations, as well as digits
kjv_bigrams %>%
filter(n > 40,
!str_detect(word1, "\\d"),
!str_detect(word2, "\\d")) %>%
visualize_bigrams()
## Counting and correlating pairs of words with the widyr package
# divide Pride & Prejudice into 10-line sections
austen_section_words <- austen_books() %>%
  filter(book == "Pride & Prejudice") %>%
  mutate(section = row_number() %/% 10) %>%
  filter(section > 0) %>%
  unnest_tokens(word, text) %>%
  filter(!word %in% stop_words$word)
austen_section_words
library(widyr)
# count words co-occurring within sections
word_pairs <- austen_section_words %>%
  pairwise_count(word, section, sort = TRUE)
word_pairs
# find the words that most often occur with "darcy"
word_pairs %>%
  filter(item1 == "darcy")
### Pairwise correlation
# pairwise_cor() finds the phi coefficient between words based on how often
# they appear in the same section; filter for relatively common words first
word_cors <- austen_section_words %>%
  group_by(word) %>%
  filter(n() >= 20) %>%
  pairwise_cor(word, section, sort = TRUE)
word_cors
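# The phi coefficient on 0/1 presence indicators is just Pearson correlation,
# so pairwise_cor() can be checked by hand for one pair (a sketch, not from
# the book); has_word() is a throwaway helper defined here
sections <- austen_section_words %>% distinct(section)
has_word <- function(w) {
  as.integer(sections$section %in%
    (austen_section_words %>% filter(word == w) %>% pull(section)))
}
cor(has_word("elizabeth"), has_word("darcy"))
# should match word_cors %>% filter(item1 == "elizabeth", item2 == "darcy"),
# up to any sections dropped by the frequency filter above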
# find the words most correlated with a word like "pounds" using filter()
word_cors %>%
  filter(item1 == "pounds")
# plot
word_cors %>%
filter(item1 %in% c("elizabeth", "pounds", "married", "pride")) %>%
group_by(item1) %>%
top_n(6, correlation) %>%
ungroup() %>%
mutate(item2 = reorder(item2, correlation)) %>%
ggplot(aes(item2, correlation)) +
geom_bar(stat = "identity") +
facet_wrap(~ item1, scales = "free") +
coord_flip()
# visualize the correlations and clusters of words with ggraph
set.seed(2016)
word_cors %>%
  filter(correlation > .15) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void()