Difference between revisions of "R: tidy text dataset - tibble"
Jump to navigation
Jump to search
Onnowpurbo (talk | contribs) |
Onnowpurbo (talk | contribs) |
||
Line 58: | Line 58: | ||
xlab(NULL) + | xlab(NULL) + | ||
coord_flip() | coord_flip() | ||
+ | |||
+ | |||
+ | ==Tidy Text gutenbergr== | ||
+ | |||
+ | Sepertinya ada masalah dengan gutenbergr, karena di R versi terakhir paket ini sudah tidak tersedia ... | ||
+ | |||
+ | library(gutenbergr) | ||
+ | hgwells <- gutenberg_download(c(35, 36, 5230, 159)) | ||
+ | tidy_hgwells <- hgwells %>% | ||
+ | unnest_tokens(word, text) %>% | ||
+ | anti_join(stop_words) | ||
+ | |||
+ | Kegiatan di atas tidak bisa dilakukan. | ||
==Pranala Menarik== | ==Pranala Menarik== | ||
* [[R]] | * [[R]] |
Revision as of 10:36, 31 October 2018
Text Vector
# Four lines of verse stored as a character vector, one element per line.
text <- c(
  "Because I could not stop for Death -",
  "He kindly stopped for me -",
  "The Carriage held but just Ourselves -",
  "and Immortality"
)

# Echo the vector to the console.
text
Tidy Text Dataset
# Install dplyr only when it is missing, so re-running this snippet does not
# reinstall the package every time.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(dplyr)

# Put the character vector `text` (defined in the "Text Vector" step) into a
# tibble with one row per line. `tibble()` replaces the deprecated
# `data_frame()`; `seq_along()` keeps the line numbers in step with `text`.
text_df <- tibble(line = seq_along(text), text = text)

# Print the tibble.
text_df
Tidy Text Novel
library(janeaustenr)
library(dplyr)
library(stringr)

# Annotate every row of austen_books() with its position inside its book:
#   linenumber - running row number within the book
#   chapter    - running count of rows whose text matches the chapter-heading
#                pattern (case-insensitive "chapter " followed by a digit or
#                one of the letters i, v, x, l, c)
original_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(
      str_detect(text, regex("^chapter [\\divxlc]", ignore_case = TRUE))
    )
  ) %>%
  ungroup()  # drop the grouping so later steps work on the whole table

original_books
Buat menjadi one-token-per-row
library(tidytext)

# Restructure to one token per row: split the `text` column of
# `original_books` into tokens stored in a new `word` column.
tidy_books <- original_books %>%
  unnest_tokens(word, text)

tidy_books
Buang stopwords
# Load the `stop_words` dataset into the workspace.
data(stop_words)

# Keep only the rows whose `word` has no match in `stop_words`. The join key
# is spelled out so dplyr does not have to guess it (and print a message).
tidy_books <- tidy_books %>%
  anti_join(stop_words, by = "word")
Word Count
# Frequency table of `word` in `tidy_books`, most frequent first.
count(tidy_books, word, sort = TRUE)
Plot
library(ggplot2)

# Bar chart of the words that occur more than 600 times in `tidy_books`.
tidy_books %>%
  count(word, sort = TRUE) %>%
  filter(n > 600) %>%
  # Reorder the words by their count so the bars are drawn in frequency order.
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col() +
  xlab(NULL) +
  # Flip the axes so the words run along the vertical axis.
  coord_flip()
Tidy Text gutenbergr
Sepertinya ada masalah dengan gutenbergr, karena di R versi terakhir paket ini sudah tidak tersedia ...
library(gutenbergr)

# Download four texts by their Project Gutenberg IDs.
# NOTE(review): presumably H. G. Wells works, going by the variable name;
# confirm the IDs against the Project Gutenberg catalogue.
hgwells <- gutenberg_download(c(35, 36, 5230, 159))

# Tokenise to one word per row, then drop every word found in `stop_words`.
# Relies on dplyr and tidytext having been attached in the earlier steps.
tidy_hgwells <- hgwells %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word")
Kegiatan di atas tidak bisa dilakukan.