Difference between revisions of "R: bigram"

From OnnoWiki
Jump to navigation Jump to search
(Created page with " library(dplyr) library(tidytext) library(janeaustenr) austen_bigrams <- austen_books() %>% unnest_tokens(bigram, text, token = "ngrams", n = 2) auste...")
 
 
(10 intermediate revisions by the same user not shown)
Line 3: Line 3:
 
  library(tidytext)
 
  library(tidytext)
 
  library(janeaustenr)
 
  library(janeaustenr)
 +
library(tidyr)
 +
library(igraph)
 +
library(ggplot2)
 +
library(ggraph)
 +
library(readtext)
  
 +
text <- readtext("out.txt")
 +
text_bigrams <- text %>%
 +
                unnest_tokens(bigram, text, token = "ngrams", n = 2)
 +
text_bigrams
 +
bigrams_separated <- text_bigrams %>%
 +
separate(bigram, c("word1", "word2"), sep = " ")
 +
#
 +
# stopwords default
 +
bigrams_filtered <- bigrams_separated %>%
 +
        filter(!word1 %in% stop_words$word) %>%
 +
        filter(!word2 %in% stop_words$word)
 +
#
 +
# stopwords Indonesia
 +
bigrams_filtered <- bigrams_separated %>%
 +
        filter(!word1 %in% stopwords::stopwords("id", source = "stopwords-iso")) %>%
 +
        filter(!word2 %in% stopwords::stopwords("id", source = "stopwords-iso"))
 +
bigram_counts <- bigrams_filtered %>%
 +
      count(word1, word2, sort = TRUE)
 +
#
 +
bigram_graph <- bigram_counts %>%
 +
  filter(n > 40) %>%
 +
  graph_from_data_frame()
 +
bigram_graph
 +
set.seed(2017)
 +
ggraph(bigram_graph, layout = "fr") +
 +
  geom_edge_link() +
 +
  geom_node_point() +
 +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)
 +
#
 +
bigrams_united <- bigrams_filtered %>%
 +
  unite(bigram, word1, word2, sep = " ")
 +
bigrams_united
 +
#
 +
bigram_tf_idf <- bigrams_united %>%
 +
    count(doc_id, bigram) %>%
 +
    bind_tf_idf(doc_id, bigram, n) %>%
 +
    arrange(desc(tf_idf))
 +
bigram_tf_idf
 +
 +
 +
 +
# contoh dari austen book
 
  austen_bigrams <- austen_books() %>%
 
  austen_bigrams <- austen_books() %>%
 
                   unnest_tokens(bigram, text, token = "ngrams", n = 2)
 
                   unnest_tokens(bigram, text, token = "ngrams", n = 2)
Line 10: Line 57:
 
  austen_bigrams %>%
 
  austen_bigrams %>%
 
       count(bigram, sort = TRUE)
 
       count(bigram, sort = TRUE)
 
 
  
 
  library(tidyr)
 
  library(tidyr)
Line 23: Line 68:
 
  bigram_counts <- bigrams_filtered %>%
 
  bigram_counts <- bigrams_filtered %>%
 
       count(word1, word2, sort = TRUE)
 
       count(word1, word2, sort = TRUE)
 +
 +
bigrams_united <- bigrams_filtered %>%
 +
        unite(bigram, word1, word2, sep = " ")
 +
bigrams_united
 +
 +
 +
 +
 +
austen_books() %>%
 +
    unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
 +
    separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
 +
    filter(!word1 %in% stop_words$word,
 +
          !word2 %in% stop_words$word,
 +
          !word3 %in% stop_words$word) %>%
 +
    count(word1, word2, word3, sort = TRUE)
 +
 +
 +
 +
bigram_tf_idf <- bigrams_united %>%
 +
    count(book, bigram) %>%
 +
    bind_tf_idf(bigram, book, n) %>%
 +
    arrange(desc(tf_idf))
 +
bigram_tf_idf
 +
 +
 +
 +
==Pranala Menarik==
 +
 +
* [[R]]

Latest revision as of 07:46, 5 November 2018

library(dplyr)
library(tidytext)
library(janeaustenr)
library(tidyr)
library(igraph)
library(ggplot2)
library(ggraph)
library(readtext)
# Read the input file with readtext(); the result carries the `doc_id` and
# `text` columns that the steps below rely on.
text <- readtext("out.txt")

# Tokenize the `text` column into two-word n-grams, one bigram per row.
text_bigrams <- unnest_tokens(
  text,
  bigram,
  text,
  token = "ngrams",
  n = 2
)
text_bigrams

# Split every bigram at the single space into its two component words.
bigrams_separated <- separate(
  text_bigrams,
  bigram,
  c("word1", "word2"),
  sep = " "
)
#
# Stopword removal.
#
# Exactly one stopword list is active at a time. Previously the English
# (tidytext `stop_words`) filter was computed and then immediately overwritten
# by the Indonesian one, so only the Indonesian result was ever used. The
# choice is now explicit: switch lists by (un)commenting the assignments below.
#
# Indonesian stopwords from the stopwords package ("stopwords-iso" source):
stopword_list <- stopwords::stopwords("id", source = "stopwords-iso")
# English default stopwords shipped with tidytext:
# stopword_list <- stop_words$word

# Keep only the bigrams in which neither word is a stopword. The list is
# looked up once above instead of once per filter() call.
bigrams_filtered <- bigrams_separated %>%
  filter(
    !word1 %in% stopword_list,
    !word2 %in% stopword_list
  )
# Tally every (word1, word2) pair, most frequent pairs first.
bigram_counts <- count(bigrams_filtered, word1, word2, sort = TRUE)
#
# Keep only pairs that occur more than 40 times and build an igraph graph
# from the remaining rows.
bigram_graph <- graph_from_data_frame(
  filter(bigram_counts, n > 40)
)
bigram_graph
# Fix the RNG seed before drawing (presumably so the "fr" layout comes out the
# same on every run).
set.seed(2017)

# Draw the bigram network: edges as links, nodes as points, and each node
# labelled with its `name`.
ggraph(bigram_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(
    aes(label = name),
    hjust = 1,
    vjust = 1
  )
#
# Paste word1 and word2 back together into one space-separated `bigram` column.
bigrams_united <- unite(bigrams_filtered, bigram, word1, word2, sep = " ")
bigrams_united
#
# tf-idf of every bigram within every document (`doc_id`).
#
# bind_tf_idf() expects its columns in the order (term, document, n). The
# earlier call passed `doc_id` as the term and `bigram` as the document, which
# computes the statistic over the wrong grouping. The arguments are named here
# so the roles cannot be swapped again.
#
# NOTE(review): if "out.txt" yields a single document, idf is 0 for every
# bigram and so is tf_idf; a meaningful ranking needs several documents.
bigram_tf_idf <- bigrams_united %>%
  count(doc_id, bigram) %>%
  bind_tf_idf(term = bigram, document = doc_id, n = n) %>%
  arrange(desc(tf_idf))
bigram_tf_idf


# Example using the Jane Austen books (janeaustenr).
# Tokenize the `text` column of austen_books() into two-word n-grams.
austen_bigrams <- unnest_tokens(
  austen_books(),
  bigram,
  text,
  token = "ngrams",
  n = 2
)
austen_bigrams

# Most common bigrams, stopwords still included.
count(austen_bigrams, bigram, sort = TRUE)
library(tidyr)

# Split every Austen bigram at the space into its two words.
bigrams_separated <- separate(
  austen_bigrams,
  bigram,
  c("word1", "word2"),
  sep = " "
)

# Drop every bigram in which either word is in tidytext's `stop_words` list.
bigrams_filtered <- filter(
  bigrams_separated,
  !word1 %in% stop_words$word,
  !word2 %in% stop_words$word
)
# Bigram counts after stopword removal, most frequent first.
bigram_counts <- count(bigrams_filtered, word1, word2, sort = TRUE)

# Rejoin the two word columns into a single space-separated `bigram` column.
bigrams_united <- unite(bigrams_filtered, bigram, word1, word2, sep = " ")
bigrams_united



# Same idea with trigrams: tokenize into three-word n-grams, split them into
# three columns, drop any trigram containing a stopword, then count.
austen_books() %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) %>%
  filter(!word3 %in% stop_words$word) %>%
  count(word1, word2, word3, sort = TRUE)


# tf-idf of every bigram within every book: the bigram is the term and the
# book is the document. Rows are sorted by descending tf_idf.
bigram_tf_idf <- bigrams_united %>%
  count(book, bigram) %>%
  bind_tf_idf(term = bigram, document = book, n = n) %>%
  arrange(desc(tf_idf))
bigram_tf_idf


Pranala Menarik