Difference between revisions of "R: tidy text dataset - tibble"
Jump to navigation
Jump to search
Onnowpurbo (talk | contribs) |
Onnowpurbo (talk | contribs) |
||
| Line 94: | Line 94: | ||
spread(author, proportion) %>% | spread(author, proportion) %>% | ||
gather(author, proportion, `Brontë Sisters`:`H.G. Wells`) | gather(author, proportion, `Brontë Sisters`:`H.G. Wells`) | ||
| + | |||
| + | Plot | ||
| + | |||
| + | library(scales) | ||
| + | # expect a warning about rows with missing values being removed | ||
| + | ggplot(frequency, aes(x = proportion, y = `Jane Austen`, | ||
| + | color = abs(`Jane Austen` - proportion))) + | ||
| + | geom_abline(color = "gray40", lty = 2) + | ||
| + | geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) + | ||
| + | geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) + | ||
| + | scale_x_log10(labels = percent_format()) + | ||
| + | scale_y_log10(labels = percent_format()) + | ||
| + | scale_color_gradient(limits = c(0, 0.001), | ||
| + | low = "darkslategray4", high = "gray75") + | ||
| + | facet_wrap(~author, ncol = 2) + | ||
| + | theme(legend.position="none") + | ||
| + | labs(y = "Jane Austen", x = NULL) | ||
==Pranala Menarik== | ==Pranala Menarik== | ||
* [[R]] | * [[R]] | ||
Revision as of 10:48, 31 October 2018
Text Vector
# Four lines of verse stored as a character vector, one element per line.
text <- c(
  "Because I could not stop for Death -",
  "He kindly stopped for me -",
  "The Carriage held but just Ourselves -",
  "and Immortality"
)
# Display the vector.
text
Tidy Text Dataset
install.packages("dplyr")
library(dplyr)
# Put the text into a tibble with one row per line of verse.
# `tibble()` replaces the deprecated `data_frame()` constructor, and
# `seq_along(text)` numbers the lines without hard-coding the vector length.
text_df <- tibble(line = seq_along(text), text = text)
text_df
Tidy Text Novel
library(janeaustenr)
library(dplyr)
library(stringr)
# Case-insensitive pattern for a line that starts with "chapter " followed by
# a digit or one of the letters i, v, x, l, c.
chapter_heading <- regex("^chapter [\\divxlc]", ignore_case = TRUE)
# Within each book, number the lines and keep a running count of the chapter
# headings seen so far, so every line is tagged with its chapter.
original_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(str_detect(text, chapter_heading))
  ) %>%
  ungroup()
original_books
Buat menjadi one-token-per-row
library(tidytext)
# Split the `text` column into tokens, one per row, stored in a new `word`
# column.
tidy_books <- unnest_tokens(original_books, word, text)
tidy_books
Buang stopwords
# Load the stop word list.
data(stop_words)
# Remove every token that appears in the stop word list. The join key is
# named explicitly so the match does not depend on which columns the two
# tables happen to share, and no "Joining, by" message is printed.
tidy_books <- tidy_books %>%
  anti_join(stop_words, by = "word")
Word Count
# Tally the occurrences of each word, most frequent first.
count(tidy_books, word, sort = TRUE)
Plot
library(ggplot2)
# Horizontal bar chart of the words that occur more than 600 times.
tidy_books %>%
  count(word, sort = TRUE) %>%
  filter(n > 600) %>%
  # Order the word factor by count so the bars are sorted by frequency.
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  labs(x = NULL) +
  coord_flip()
Tidy Text gutenbergr
install.packages("gutenbergr")
library(gutenbergr)
# Download four texts by their Gutenberg IDs (used below as the H.G. Wells
# corpus).
hgwells <- gutenberg_download(c(35, 36, 5230, 159))
# Tokenize to one word per row, then drop stop words. The join key is named
# explicitly so the match does not depend on which columns the two tables
# happen to share.
tidy_hgwells <- hgwells %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word")
# Most frequent words first.
tidy_hgwells %>%
  count(word, sort = TRUE)
# Download five texts by their Gutenberg IDs (used below as the Brontë
# Sisters corpus).
bronte <- gutenberg_download(c(1260, 768, 969, 9182, 767))
# Tokenize to one word per row, then drop stop words. The join key is named
# explicitly so the match does not depend on which columns the two tables
# happen to share.
tidy_bronte <- bronte %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word")
# Most frequent words first.
tidy_bronte %>%
  count(word, sort = TRUE)
Compare 3 texts
library(tidyr)
# Per-author word proportions, reshaped so each row holds one word, its
# proportion for Jane Austen, and its proportion for one of the other two
# authors (named in the `author` column).
frequency <- bind_rows(
  mutate(tidy_bronte, author = "Brontë Sisters"),
  mutate(tidy_hgwells, author = "H.G. Wells"),
  mutate(tidy_books, author = "Jane Austen")
) %>%
  # Keep only the first run of lowercase letters/apostrophes in each token.
  mutate(word = str_extract(word, "[a-z']+")) %>%
  count(author, word) %>%
  group_by(author) %>%
  mutate(proportion = n / sum(n)) %>%
  # Drop the grouping once the per-author proportions are computed so the
  # reshaping steps work on an ungrouped table.
  ungroup() %>%
  select(-n) %>%
  # One column per author (replaces the superseded `spread()`).
  pivot_wider(names_from = author, values_from = proportion) %>%
  # Stack the two non-Austen columns back into author/proportion pairs
  # (replaces the superseded `gather()`); `Jane Austen` stays as its own
  # column.
  pivot_longer(
    `Brontë Sisters`:`H.G. Wells`,
    names_to = "author",
    values_to = "proportion"
  )
Plot
library(scales)
# Plot each word's proportion for the other author (x) against its proportion
# for Jane Austen (y) on log scales, one panel per author, coloured by the
# absolute difference between the two proportions.
# expect a warning about rows with missing values being removed
ggplot(
  frequency,
  aes(
    x = proportion,
    y = `Jane Austen`,
    colour = abs(`Jane Austen` - proportion)
  )
) +
  # Dashed reference line.
  geom_abline(colour = "gray40", linetype = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_colour_gradient(
    limits = c(0, 0.001),
    low = "darkslategray4",
    high = "gray75"
  ) +
  facet_wrap(~author, ncol = 2) +
  labs(y = "Jane Austen", x = NULL) +
  theme(legend.position = "none")