Difference between revisions of "R: bigram"
Jump to navigation
Jump to search
Onnowpurbo (talk | contribs) (Created page with " library(dplyr) library(tidytext) library(janeaustenr) austen_bigrams <- austen_books() %>% unnest_tokens(bigram, text, token = "ngrams", n = 2) auste...") |
Onnowpurbo (talk | contribs) |
||
Line 23: | Line 23: | ||
bigram_counts <- bigrams_filtered %>% | bigram_counts <- bigrams_filtered %>% | ||
count(word1, word2, sort = TRUE) | count(word1, word2, sort = TRUE) | ||
+ | |||
+ | |||
+ | |||
+ | |||
+ | bigrams_united <- bigrams_filtered %>% | ||
+ | unite(bigram, word1, word2, sep = " ") | ||
+ | bigrams_united | ||
+ | |||
+ | |||
+ | |||
+ | |||
+ | austen_books() %>% | ||
+ | unnest_tokens(trigram, text, token = "ngrams", n = 3) %>% | ||
+ | separate(trigram, c("word1", "word2", "word3"), sep = " ") %>% | ||
+ | filter(!word1 %in% stop_words$word, | ||
+ | !word2 %in% stop_words$word, | ||
+ | !word3 %in% stop_words$word) %>% | ||
+ | count(word1, word2, word3, sort = TRUE) | ||
+ | |||
+ | |||
+ | |||
+ | bigram_tf_idf <- bigrams_united %>% | ||
+ | count(book, bigram) %>% | ||
+ | bind_tf_idf(bigram, book, n) %>% | ||
+ | arrange(desc(tf_idf)) | ||
+ | bigram_tf_idf |
Revision as of 12:39, 31 October 2018
# Packages used throughout this script: dplyr for the data verbs and pipe,
# tidytext for tokenisation and tf-idf, janeaustenr for the corpus.
# One call per line: several calls on a single line without a separator
# do not parse in R.
library(dplyr)
library(tidytext)
library(janeaustenr)
# Tokenise the Jane Austen corpus into bigrams (overlapping pairs of
# consecutive words), one bigram per row.
austen_bigrams <- austen_books() %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  # Lines too short to form a bigram can come back as NA (seen with newer
  # tidytext releases -- confirm against the installed version). Drop them
  # here so they are not counted or pasted together as "NA NA" later on.
  filter(!is.na(bigram))

# Print the result for inspection.
austen_bigrams
# Tally how often each bigram occurs, most frequent first.
count(austen_bigrams, bigram, sort = TRUE)
# tidyr supplies separate() (used here) and unite() (used further below).
library(tidyr)

# Split every bigram into its two component words.
bigrams_separated <- austen_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")

# Keep only the pairs in which neither word is a stop word.
bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)
# new bigram counts:
# (The assignment must start on its own line; when it shares a line with
# the comment above, R treats it as comment text and never runs it.)
bigram_counts <- bigrams_filtered %>%
  count(word1, word2, sort = TRUE)
# Paste the filtered word pairs back into a single space-separated
# `bigram` column.
bigrams_united <- bigrams_filtered %>%
  unite(bigram, word1, word2, sep = " ")

# Print the result for inspection.
bigrams_united
# Most common trigrams (three consecutive words) that contain no stop words.
austen_books() %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  # Lines too short to form a trigram can come back as NA (seen with newer
  # tidytext releases -- confirm against the installed version). An NA word
  # is never %in% the stop-word list, so without this step such rows would
  # pass the filter below and be counted as an (NA, NA, NA) trigram.
  filter(!is.na(trigram)) %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
  filter(
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word,
    !word3 %in% stop_words$word
  ) %>%
  count(word1, word2, word3, sort = TRUE)
# tf-idf of every bigram, with each book treated as one document:
# count bigrams per book, attach the tf / idf / tf_idf columns, then sort
# so the highest tf_idf values come first.
bigram_tf_idf <- bigrams_united %>%
  count(book, bigram) %>%
  bind_tf_idf(bigram, book, n) %>%
  arrange(desc(tf_idf))

# Print the result for inspection.
bigram_tf_idf