Difference between revisions of "R: tidy text dataset - tibble"
Jump to navigation
Jump to search
Onnowpurbo (talk | contribs) |
Onnowpurbo (talk | contribs) |
||
| Line 62: | Line 62: | ||
==Tidy Text gutenbergr== | ==Tidy Text gutenbergr== | ||
| − | + | install.packages("gutenbergr") | |
| − | |||
library(gutenbergr) | library(gutenbergr) | ||
hgwells <- gutenberg_download(c(35, 36, 5230, 159)) | hgwells <- gutenberg_download(c(35, 36, 5230, 159)) | ||
| Line 70: | Line 69: | ||
anti_join(stop_words) | anti_join(stop_words) | ||
| − | + | tidy_hgwells %>% | |
| + | count(word, sort = TRUE) | ||
==Pranala Menarik== | ==Pranala Menarik== | ||
* [[R]] | * [[R]] | ||
Revision as of 10:39, 31 October 2018
Text Vector
# Four lines of verse stored as a plain character vector, one element per line.
text <- c(
  "Because I could not stop for Death -",
  "He kindly stopped for me -",
  "The Carriage held but just Ourselves -",
  "and Immortality"
)
# Show the vector at the console.
text
Tidy Text Dataset
# Install dplyr only when it is missing, so re-running this snippet does not
# reinstall the package every time.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(dplyr)

# Put the character vector `text` into a two-column tibble: `line` numbers the
# elements and `text` holds the strings themselves.
# tibble() replaces the deprecated data_frame(); seq_along() keeps the line
# numbers in step with `text` whatever its length (1:4 for the four-line poem).
text_df <- tibble(line = seq_along(text), text = text)
text_df
Tidy Text Novel
library(janeaustenr)
library(dplyr)
library(stringr)

# Annotate the table returned by austen_books(), separately for each book:
#   linenumber - position of the row within its book
#   chapter    - running count of rows whose text matches the
#                chapter-heading pattern (matched case-insensitively)
original_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(
      str_detect(text, regex("^chapter [\\divxlc]", ignore_case = TRUE))
    )
  ) %>%
  # Remove the grouping so later steps operate on the whole table.
  ungroup()
original_books
Buat menjadi one-token-per-row
library(tidytext)

# Split the `text` column of original_books into tokens, written to a new
# `word` column (the one-token-per-row layout named in the heading).
tidy_books <- unnest_tokens(original_books, word, text)
tidy_books
Buang stopwords
# Load the stop_words dataset into the workspace.
data(stop_words)

# Drop every row of tidy_books whose word also appears in stop_words.
# The join key is named explicitly so the result does not depend on which
# column names the two tables happen to share, and no "Joining with" message
# is printed.
tidy_books <- tidy_books %>%
  anti_join(stop_words, by = "word")
Word Count
# Tally how often each word occurs in tidy_books, most frequent first.
count(tidy_books, word, sort = TRUE)
Plot
library(ggplot2)

# Horizontal bar chart of the words that occur more than 600 times.
tidy_books %>%
  count(word, sort = TRUE) %>%
  filter(n > 600) %>%
  # Order the words by their count so the bars are drawn in sorted order.
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  # No axis title for the word axis.
  xlab(NULL) +
  # Swap the axes so the words run down the side of the chart.
  coord_flip()
Tidy Text gutenbergr
# Install gutenbergr only when it is missing, so re-running this snippet does
# not reinstall the package every time.
if (!requireNamespace("gutenbergr", quietly = TRUE)) {
  install.packages("gutenbergr")
}
library(gutenbergr)

# Download the Project Gutenberg works with IDs 35, 36, 5230 and 159.
hgwells <- gutenberg_download(c(35, 36, 5230, 159))
# Tokenise the `text` column of hgwells into a `word` column, then drop every
# row whose word appears in stop_words.
# The join key is named explicitly so the result does not depend on which
# column names the two tables happen to share, and no "Joining with" message
# is printed.
tidy_hgwells <- hgwells %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word")
# Tally how often each word occurs in tidy_hgwells, most frequent first.
count(tidy_hgwells, word, sort = TRUE)