Difference between revisions of "R: tidy text dataset - tibble"

From OnnoWiki
Jump to navigation Jump to search
Line 62: Line 62:
 
==Tidy Text gutenbergr==
 
==Tidy Text gutenbergr==
  
Sepertinya ada masalah dengan gutenbergr, karena di R versi terakhir paket tersebut sudah tidak ada ...
+
install.packages("gutenbergr")
 
 
 
  library(gutenbergr)
 
  library(gutenbergr)
 
  hgwells <- gutenberg_download(c(35, 36, 5230, 159))
 
  hgwells <- gutenberg_download(c(35, 36, 5230, 159))
Line 70: Line 69:
 
  anti_join(stop_words)
 
  anti_join(stop_words)
  
Kegiatan di atas tidak bisa dilakukan.
+
tidy_hgwells %>%
 +
    count(word, sort = TRUE)
  
 
==Pranala Menarik==
 
==Pranala Menarik==
  
 
* [[R]]
 
* [[R]]

Revision as of 10:39, 31 October 2018

Text Vector

# Four lines of verse stored as a character vector, one element per line.
text <- c(
  "Because I could not stop for Death -",
  "He kindly stopped for me -",
  "The Carriage held but just Ourselves -",
  "and Immortality"
)
text


Tidy Text Dataset

# Install dplyr only when it is missing, so re-running the script does not
# reinstall the package every time.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(dplyr)

# Put the character vector `text` into a tibble with one row per element;
# `line` numbers the rows. tibble() replaces the deprecated data_frame(), and
# seq_along() keeps `line` in step with however many elements `text` has.
text_df <- tibble(line = seq_along(text), text = text)
text_df


Tidy Text Novel

library(janeaustenr)
library(dplyr)
library(stringr)

# Annotate the rows returned by austen_books(), separately within each book,
# with a running line number and a chapter counter.
original_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    # The cumulative sum goes up by one on every line that matches the
    # chapter-heading pattern (matched case-insensitively).
    chapter = cumsum(
      str_detect(text, regex("^chapter [\\divxlc]", ignore_case = TRUE))
    )
  ) %>%
  ungroup()
original_books

Buat menjadi one-token-per-row

library(tidytext)

# Split the `text` column into tokens, producing one row per token in a new
# `word` column (one-token-per-row format).
tidy_books <- unnest_tokens(original_books, word, text)
tidy_books

Buang stopwords

# Load the stop_words dataset into the workspace.
data(stop_words)

# Drop every row whose `word` appears in stop_words. Naming the join key
# explicitly avoids the "Joining, by = ..." message and keeps the join from
# silently changing if another shared column name ever appears.
tidy_books <- tidy_books %>%
  anti_join(stop_words, by = "word")

Word Count

# Tally how often each word occurs, most frequent first.
count(tidy_books, word, sort = TRUE)

Plot

library(ggplot2)

# Bar chart of the words that occur more than 600 times.
tidy_books %>%
  count(word, sort = TRUE) %>%
  filter(n > 600) %>%
  # Reorder the words by their count so the bars are drawn in count order.
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  xlab(NULL) +
  # Flip the axes so the words run along the vertical axis.
  coord_flip()


Tidy Text gutenbergr

# Install gutenbergr only when it is missing, so re-running the script does
# not reinstall the package every time.
if (!requireNamespace("gutenbergr", quietly = TRUE)) {
  install.packages("gutenbergr")
}
library(gutenbergr)

# Download the Project Gutenberg works with these IDs.
hgwells <- gutenberg_download(c(35, 36, 5230, 159))

# Tokenise into one word per row, then drop the stop words. The join key is
# named explicitly so anti_join() does not have to guess it (and does not
# print a "Joining, by = ..." message).
tidy_hgwells <- hgwells %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word")

# Tally how often each word occurs, most frequent first.
tidy_hgwells %>%
  count(word, sort = TRUE)

Pranala Menarik