Difference between revisions of "R: bigram"
		
		
		
		
		
		Jump to navigation
		Jump to search
		
				
		
		
	
| Onnowpurbo (talk | contribs) | Onnowpurbo (talk | contribs)  | ||
| Line 17: | Line 17: | ||
|   bigram_counts <- bigrams_filtered %>% |   bigram_counts <- bigrams_filtered %>% | ||
|         count(word1, word2, sort = TRUE) |         count(word1, word2, sort = TRUE) | ||
| + |  library(igraph) | ||
| + |  library(ggplot2) | ||
| + |  library(ggraph) | ||
| + |  bigram_graph <- bigram_counts %>% | ||
| + |    filter(n > 40) %>% | ||
| + |    graph_from_data_frame() | ||
| + |  bigram_graph | ||
| + |  library(ggraph) | ||
| + |  set.seed(2017) | ||
| + |  ggraph(bigram_graph, layout = "fr") + | ||
| + |    geom_edge_link() + | ||
| + |    geom_node_point() + | ||
| + |    geom_node_text(aes(label = name), vjust = 1, hjust = 1) | ||
Revision as of 20:10, 4 November 2018
# Attach the packages used by this script. Each library() call has to be its
# own statement: several calls on one line without a separator do not parse.
library(dplyr)
library(tidytext)
library(janeaustenr)
library(readtext) # provides readtext(), which is called just below

# Read the input document. The unnest_tokens() call below takes its input from
# a column named `text`, so the result is expected to contain one.
text <- readtext("out.txt")

# Tokenise the `text` column into two-word n-grams, stored in a new `bigram`
# column.
text_bigrams <- text %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)
text_bigrams
# tidyr supplies separate(), used to break each bigram into its two words.
library(tidyr)

# Split the `bigram` column on the space into `word1` and `word2`.
bigrams_separated <- separate(
  text_bigrams,
  bigram,
  into = c("word1", "word2"),
  sep = " "
)

# Keep only the rows in which neither word appears in stop_words$word.
bigrams_filtered <- filter(
  bigrams_separated,
  !word1 %in% stop_words$word,
  !word2 %in% stop_words$word
)

# Count every (word1, word2) pair, most frequent first.
bigram_counts <- count(bigrams_filtered, word1, word2, sort = TRUE)
# Packages for building (igraph) and drawing (ggplot2 / ggraph) the network.
library(igraph)
library(ggplot2)
library(ggraph)

# Build a graph from the word pairs that occur more than 40 times.
bigram_graph <- graph_from_data_frame(filter(bigram_counts, n > 40))
bigram_graph

# Seed the RNG before drawing (presumably so the "fr" layout is reproducible
# between runs), then plot edges, nodes and a text label per node.
set.seed(2017)
ggraph(bigram_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), hjust = 1, vjust = 1)
# Example using the Jane Austen books
# Tokenise the `text` column of austen_books() into two-word n-grams.
austen_bigrams <- unnest_tokens(
  austen_books(),
  bigram,
  text,
  token = "ngrams",
  n = 2
)
austen_bigrams

# Count each bigram, most frequent first (no stop-word filtering yet).
count(austen_bigrams, bigram, sort = TRUE)
# tidyr supplies separate() and unite(), both used below.
library(tidyr)

# Split each Austen bigram on the space into `word1` and `word2`.
bigrams_separated <- separate(
  austen_bigrams,
  bigram,
  into = c("word1", "word2"),
  sep = " "
)

# Keep only the rows in which neither word appears in stop_words$word.
bigrams_filtered <- filter(
  bigrams_separated,
  !word1 %in% stop_words$word,
  !word2 %in% stop_words$word
)

# New bigram counts: tally the remaining (word1, word2) pairs, most frequent
# first.
bigram_counts <- count(bigrams_filtered, word1, word2, sort = TRUE)

# Glue the two word columns back into a single space-separated `bigram` column.
bigrams_united <- unite(bigrams_filtered, bigram, word1, word2, sep = " ")
bigrams_united
# Same approach with three-word n-grams: tokenise, split into three word
# columns, drop any trigram containing a stop word, then count the rest with
# the most frequent first. The result is printed, not stored.
austen_books() %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  separate(
    trigram,
    into = c("word1", "word2", "word3"),
    sep = " "
  ) %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) %>%
  filter(!word3 %in% stop_words$word) %>%
  count(word1, word2, word3, sort = TRUE)
# Count each bigram per book, add tf-idf columns with `book` as the document
# and `n` as the count, and sort by descending tf_idf.
bigram_tf_idf <- bigrams_united %>%
  count(book, bigram) %>%
  bind_tf_idf(bigram, book, n) %>%
  arrange(desc(tf_idf))

# Print the result. This must be a separate statement from the assignment
# above; placing both on one line without a separator is a parse error.
bigram_tf_idf