Difference between revisions of "R: bigram"
Jump to navigation
Jump to search
Onnowpurbo (talk | contribs) |
Onnowpurbo (talk | contribs) |
||
| Line 4: | Line 4: | ||
library(janeaustenr) | library(janeaustenr) | ||
| + | |||
| + | text <- readtext("out.txt") | ||
| + | text_bigrams <- text %>% | ||
| + | unnest_tokens(bigram, text, token = "ngrams", n = 2) | ||
| + | text_bigrams | ||
| + | library(tidyr) | ||
| + | bigrams_separated <- text_bigrams %>% | ||
| + | separate(bigram, c("word1", "word2"), sep = " ") | ||
| + | bigrams_filtered <- bigrams_separated %>% | ||
| + | filter(!word1 %in% stop_words$word) %>% | ||
| + | filter(!word2 %in% stop_words$word) | ||
| + | bigram_counts <- bigrams_filtered %>% | ||
| + | count(word1, word2, sort = TRUE) | ||
| + | |||
| + | |||
| + | |||
| + | |||
| + | # contoh dari austen book | ||
austen_bigrams <- austen_books() %>% | austen_bigrams <- austen_books() %>% | ||
unnest_tokens(bigram, text, token = "ngrams", n = 2) | unnest_tokens(bigram, text, token = "ngrams", n = 2) | ||
| Line 10: | Line 28: | ||
austen_bigrams %>% | austen_bigrams %>% | ||
count(bigram, sort = TRUE) | count(bigram, sort = TRUE) | ||
| − | |||
| − | |||
library(tidyr) | library(tidyr) | ||
Revision as of 19:59, 4 November 2018
library(dplyr)
library(tidytext)
library(janeaustenr)
# Bigram analysis of a custom text file ----

# Read the input document. readtext() belongs to the readtext package, which
# this script never attaches, so it is called through its namespace.
# NOTE(review): the result is assumed to carry a `text` column, which the
# tokenising step below relies on -- confirm against the readtext docs.
text <- readtext::readtext("out.txt")

# Tokenise the `text` column into overlapping two-word sequences (bigrams).
# Input with fewer than two words can yield an NA bigram; drop those so they
# are not carried into the counts below.
text_bigrams <- text %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  filter(!is.na(bigram))

text_bigrams

library(tidyr)

# Split each bigram into its two words so each can be tested separately.
bigrams_separated <- text_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")

# Keep only bigrams in which neither word appears in stop_words$word.
bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)

# Frequency of each remaining word pair, most frequent first.
bigram_counts <- bigrams_filtered %>%
  count(word1, word2, sort = TRUE)
# Example using the Jane Austen books ----

# Tokenise the `text` column of austen_books() into bigrams.
# Lines with fewer than two words (e.g. blank lines) can produce an NA
# bigram; remove them so NA does not show up as the most frequent "bigram".
austen_bigrams <- austen_books() %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  filter(!is.na(bigram))

austen_bigrams

# Bigram frequencies, most frequent first.
austen_bigrams %>%
  count(bigram, sort = TRUE)
library(tidyr)

# Break every Austen bigram into its two component words.
bigrams_separated <- separate(
  austen_bigrams,
  bigram,
  c("word1", "word2"),
  sep = " "
)

# Discard any pair in which either word is listed in stop_words$word.
bigrams_filtered <- filter(
  bigrams_separated,
  !word1 %in% stop_words$word,
  !word2 %in% stop_words$word
)

# New bigram counts, sorted by descending frequency.
bigram_counts <- count(bigrams_filtered, word1, word2, sort = TRUE)

# Join the two words back into a single space-separated `bigram` column.
bigrams_united <- unite(bigrams_filtered, bigram, word1, word2, sep = " ")

bigrams_united
# Most common trigrams in the Austen books that contain no stop words.
# NA trigrams (from lines with fewer than three words) are dropped first;
# otherwise they pass the stop-word filter and are counted as an NA/NA/NA row.
austen_books() %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  filter(!is.na(trigram)) %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
  filter(
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word,
    !word3 %in% stop_words$word
  ) %>%
  count(word1, word2, word3, sort = TRUE)
# tf-idf of each bigram, using `book` as the document column: count bigrams
# per book, attach the tf-idf columns, then sort by descending tf_idf.
bigram_tf_idf <- bigrams_united %>%
  count(book, bigram) %>%
  bind_tf_idf(bigram, book, n) %>%
  arrange(desc(tf_idf))

bigram_tf_idf