R: bigram
Revision as of 20:43, 4 November 2018 by Onnowpurbo (talk | contribs)
# Packages used by the bigram examples below.
library(dplyr)
library(tidytext)
library(janeaustenr)
library(tidyr)
library(igraph)
library(ggplot2)
library(ggraph)
library(readtext)
# ---- Bigram network for a user-supplied text file ----

# Read the input file; the document content is taken from the `text` column.
text <- readtext("out.txt")

# Tokenise into overlapping two-word sequences. A document with fewer than
# two words yields an NA bigram in recent tidytext/tokenizers releases, so
# those rows are dropped before any further processing.
text_bigrams <- text %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  filter(!is.na(bigram))
text_bigrams

# Split every bigram into its two component words.
bigrams_separated <- text_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")

# Stop-word list applied to both words of each bigram. The Indonesian list
# from stopwords-iso is used here; for English text use the tidytext default
# instead: stopword_list <- stop_words$word
stopword_list <- stopwords::stopwords("id", source = "stopwords-iso")

# Keep only bigrams in which neither word is a stop word.
bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stopword_list, !word2 %in% stopword_list)

# Count the remaining word pairs, most frequent first.
bigram_counts <- bigrams_filtered %>%
  count(word1, word2, sort = TRUE)

# Build a graph from the pairs seen more than 40 times: word1 -> word2,
# with the count `n` carried along as an edge attribute.
bigram_graph <- bigram_counts %>%
  filter(n > 40) %>%
  graph_from_data_frame()
bigram_graph

# Seed set right before plotting, presumably so the "fr" layout is
# reproducible between runs.
set.seed(2017)
ggraph(bigram_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)
# ---- Example using the Jane Austen novels (janeaustenr) ----

# Tokenise the novels into overlapping two-word sequences. Each input row is
# a single line of text, and lines with fewer than two words (e.g. blank
# lines) produce an NA bigram in recent tidytext/tokenizers releases; drop
# them so they do not show up in the counts or turn into "NA NA" later.
austen_bigrams <- austen_books() %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  filter(!is.na(bigram))
austen_bigrams
# Frequency of every bigram, most common first (printed, not stored).
count(austen_bigrams, bigram, sort = TRUE)
library(tidyr)

# Break each bigram into its first and second word.
bigrams_separated <- separate(
  austen_bigrams,
  bigram,
  c("word1", "word2"),
  sep = " "
)

# Retain only the pairs in which neither word is an English stop word.
bigrams_filtered <- filter(
  bigrams_separated,
  !word1 %in% stop_words$word,
  !word2 %in% stop_words$word
)
# New bigram counts, computed after stop-word removal, most frequent first.
bigram_counts <- count(bigrams_filtered, word1, word2, sort = TRUE)
# Glue the two filtered words back together into a single `bigram` column.
bigrams_united <- unite(bigrams_filtered, bigram, word1, word2, sep = " ")
bigrams_united
# Most common trigrams in the Austen novels that contain no stop words.
# Lines with fewer than three words produce an NA trigram in recent
# tidytext/tokenizers releases; they are removed before splitting so that
# rows of NA words are not counted.
austen_books() %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  filter(!is.na(trigram)) %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
  filter(
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word,
    !word3 %in% stop_words$word
  ) %>%
  count(word1, word2, word3, sort = TRUE)
# tf-idf of each bigram, treating every book as one document:
# count bigrams per book, attach tf / idf / tf_idf, then sort by descending
# tf_idf.
bigram_tf_idf <- arrange(
  bind_tf_idf(count(bigrams_united, book, bigram), bigram, book, n),
  desc(tf_idf)
)
bigram_tf_idf