Difference between revisions of "R: bigram"
Jump to navigation
Jump to search
Onnowpurbo (talk | contribs) |
Onnowpurbo (talk | contribs) |
||
| (5 intermediate revisions by the same user not shown) | |||
| Line 7: | Line 7: | ||
library(ggplot2) | library(ggplot2) | ||
library(ggraph) | library(ggraph) | ||
| − | library( | + | library(readtext) |
text <- readtext("out.txt") | text <- readtext("out.txt") | ||
| Line 15: | Line 15: | ||
bigrams_separated <- text_bigrams %>% | bigrams_separated <- text_bigrams %>% | ||
separate(bigram, c("word1", "word2"), sep = " ") | separate(bigram, c("word1", "word2"), sep = " ") | ||
| + | # | ||
| + | # stopwords default | ||
bigrams_filtered <- bigrams_separated %>% | bigrams_filtered <- bigrams_separated %>% | ||
filter(!word1 %in% stop_words$word) %>% | filter(!word1 %in% stop_words$word) %>% | ||
filter(!word2 %in% stop_words$word) | filter(!word2 %in% stop_words$word) | ||
| + | # | ||
| + | # stopwords Indonesia | ||
| + | bigrams_filtered <- bigrams_separated %>% | ||
| + | filter(!word1 %in% stopwords::stopwords("id", source = "stopwords-iso")) %>% | ||
| + | filter(!word2 %in% stopwords::stopwords("id", source = "stopwords-iso")) | ||
bigram_counts <- bigrams_filtered %>% | bigram_counts <- bigrams_filtered %>% | ||
count(word1, word2, sort = TRUE) | count(word1, word2, sort = TRUE) | ||
| + | # | ||
bigram_graph <- bigram_counts %>% | bigram_graph <- bigram_counts %>% | ||
filter(n > 40) %>% | filter(n > 40) %>% | ||
| Line 29: | Line 37: | ||
geom_node_point() + | geom_node_point() + | ||
geom_node_text(aes(label = name), vjust = 1, hjust = 1) | geom_node_text(aes(label = name), vjust = 1, hjust = 1) | ||
| − | + | # | |
| + | bigrams_united <- bigrams_filtered %>% | ||
| + | unite(bigram, word1, word2, sep = " ") | ||
| + | bigrams_united | ||
| + | # | ||
| + | bigram_tf_idf <- bigrams_united %>% | ||
| + | count(doc_id, bigram) %>% | ||
| + | bind_tf_idf(doc_id, bigram, n) %>% | ||
| + | arrange(desc(tf_idf)) | ||
| + | bigram_tf_idf | ||
| Line 51: | Line 68: | ||
bigram_counts <- bigrams_filtered %>% | bigram_counts <- bigrams_filtered %>% | ||
count(word1, word2, sort = TRUE) | count(word1, word2, sort = TRUE) | ||
| − | |||
| − | |||
| − | |||
bigrams_united <- bigrams_filtered %>% | bigrams_united <- bigrams_filtered %>% | ||
Latest revision as of 07:46, 5 November 2018
# Packages used by this script. Each library() call must be its own
# statement; several calls on one line without separators do not parse.
library(dplyr)
library(tidytext)
library(janeaustenr)
library(tidyr)
library(igraph)
library(ggplot2)
library(ggraph)
library(readtext)
# Load the input document. Later steps use its `text` and `doc_id` columns.
text <- readtext("out.txt")

# Tokenize the `text` column into two-word sequences, one bigram per row.
text_bigrams <- unnest_tokens(text, bigram, text, token = "ngrams", n = 2)
text_bigrams

# Split every bigram at the space so each word gets its own column and can
# be checked against a stop-word list individually.
bigrams_separated <- separate(
  text_bigrams,
  bigram,
  into = c("word1", "word2"),
  sep = " "
)
#
# Stop-word filtering. Two alternatives are shown and both assign to
# `bigrams_filtered`, so when the script is run top to bottom the Indonesian
# filter (alternative 2) is the one that takes effect.
#
# Alternative 1: default stop words from tidytext's `stop_words` table.
bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)
#
# Alternative 2: Indonesian stop words from the "stopwords-iso" source.
# The list is looked up once and reused instead of being rebuilt inside
# every filter() call.
stopwords_id <- stopwords::stopwords("id", source = "stopwords-iso")
bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stopwords_id) %>%
  filter(!word2 %in% stopwords_id)

# Frequency of each remaining (word1, word2) pair, most frequent first.
bigram_counts <- bigrams_filtered %>%
  count(word1, word2, sort = TRUE)
#
# Build a graph from the bigrams that occur more than 40 times; the first
# two columns of the data frame (word1, word2) become the edge endpoints.
bigram_graph <- graph_from_data_frame(filter(bigram_counts, n > 40))
bigram_graph

# Fix the RNG seed so the "fr" layout is reproducible between runs.
set.seed(2017)
bigram_graph %>%
  ggraph(layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)
#
# Re-join the filtered word pairs into a single "word1 word2" column.
bigrams_united <- bigrams_filtered %>%
  unite(bigram, word1, word2, sep = " ")
bigrams_united
#
# tf-idf of each bigram within each document, highest score first.
# bind_tf_idf() expects (term, document, n) in that order, so the bigram
# column comes first and doc_id second.
# NOTE(review): if "out.txt" is the only document read, every idf (and so
# every tf_idf) will be 0 -- confirm the input holds several documents.
bigram_tf_idf <- bigrams_united %>%
  count(doc_id, bigram) %>%
  bind_tf_idf(bigram, doc_id, n) %>%
  arrange(desc(tf_idf))
bigram_tf_idf
# Example using the Jane Austen books from the janeaustenr package.
# Tokenize the `text` column into bigrams, one per row.
austen_bigrams <- unnest_tokens(
  austen_books(),
  bigram,
  text,
  token = "ngrams",
  n = 2
)
austen_bigrams

# Most common bigrams before any stop-word filtering.
count(austen_bigrams, bigram, sort = TRUE)
library(tidyr)

# Split each Austen bigram into its two component words.
bigrams_separated <- separate(
  austen_bigrams,
  bigram,
  into = c("word1", "word2"),
  sep = " "
)

# Keep only the pairs in which neither word is a stop word.
bigrams_filtered <- filter(
  bigrams_separated,
  !word1 %in% stop_words$word,
  !word2 %in% stop_words$word
)

# New bigram counts, most frequent pair first.
bigram_counts <- count(bigrams_filtered, word1, word2, sort = TRUE)

# Glue the two words back together into one "word1 word2" column.
bigrams_united <- unite(bigrams_filtered, bigram, word1, word2, sep = " ")
bigrams_united
# Same idea with three-word sequences: tokenize into trigrams, split them
# into words, drop any trigram containing a stop word, then count.
austen_books() %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  separate(trigram, into = c("word1", "word2", "word3"), sep = " ") %>%
  filter(
    !(word1 %in% stop_words$word |
      word2 %in% stop_words$word |
      word3 %in% stop_words$word)
  ) %>%
  count(word1, word2, word3, sort = TRUE)
# tf-idf of each bigram within each book, highest score first.
# The pipeline and the expression that prints the result must be separate
# statements; fused on one line they do not parse.
bigram_tf_idf <- bigrams_united %>%
  count(book, bigram) %>%
  bind_tf_idf(bigram, book, n) %>%
  arrange(desc(tf_idf))
bigram_tf_idf