Difference between revisions of "R: tidytext: tidytext"
Jump to navigation
Jump to search
Onnowpurbo (talk | contribs) (Created page with "# Ref: https://github.com/dgrtwo/tidy-text-mining/blob/master/01-tidy-text.Rmd ==Referensi== * https://github.com/dgrtwo/tidy-text-mining/blob/master/01-tidy-text.Rm...") |
Onnowpurbo (talk | contribs) |
||
| Line 1: | Line 1: | ||
| − | # Ref: https://github.com/dgrtwo/tidy-text-mining/blob/master/01-tidy-text.Rmd | + | # Ref: https://github.com/dgrtwo/tidy-text-mining/blob/master/01-tidy-text.Rmd |
| + | # The tidy text format {#tidytext} | ||
| + | library(knitr) | ||
| + | opts_chunk$set(message = FALSE, warning = FALSE, cache = TRUE) | ||
| + | options(width = 100, dplyr.width = 100) | ||
| + | library(ggplot2) | ||
| + | theme_set(theme_light()) | ||
| + | ## The `unnest_tokens` function | ||
| + | text <- c("Because I could not stop for Death -", | ||
| + | "He kindly stopped for me -", | ||
| + | "The Carriage held but just Ourselves -", | ||
| + | "and Immortality") | ||
| + | text | ||
| + | library(dplyr) | ||
| + | text_df <- tibble(line = 1:4, text = text) | ||
| + | text_df | ||
| + | # Within our tidy text framework, we need to both break the text | ||
| + | # into individual tokens (a process called *tokenization*) *and* | ||
| + | # transform it to a tidy data structure. | ||
| + | # To do this, we use tidytext's `unnest_tokens()` function. | ||
| + | library(tidytext) | ||
| + | text_df %>% | ||
| + | unnest_tokens(word, text) | ||
| + | |||
| + | |||
| + | |||
| + | |||
| + | ## Tidying the works of Jane Austen {#tidyausten} | ||
| + | library(janeaustenr) | ||
| + | library(dplyr) | ||
| + | library(stringr) | ||
| + | original_books <- austen_books() %>% | ||
| + | group_by(book) %>% | ||
| + | mutate(linenumber = row_number(), | ||
| + | chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]", | ||
| + | ignore_case = TRUE)))) %>% | ||
| + | ungroup() | ||
| + | original_books | ||
| + | |||
| + | |||
| + | # To work with this as a tidy dataset, | ||
| + | # we need to restructure it in the **one-token-per-row** format, | ||
| + | # which as we saw earlier is done with the `unnest_tokens()` function. | ||
| + | library(tidytext) | ||
| + | tidy_books <- original_books %>% | ||
| + | unnest_tokens(word, text) | ||
| + | tidy_books | ||
| + | |||
| + | |||
| + | # Now that the data is in one-word-per-row format, | ||
| + | # we can manipulate it with tidy tools like dplyr. | ||
| + | # Often in text analysis, we will want to remove stop words; | ||
| + | # stop words are words that are not useful for an analysis, | ||
| + | # typically extremely common words such as "the", "of", "to", and so forth in English. | ||
| + | # We can remove stop words (kept in the tidytext dataset `stop_words`) with an `anti_join()`. | ||
| + | data(stop_words) | ||
| + | tidy_books <- tidy_books %>% | ||
| + | anti_join(stop_words) | ||
| + | |||
| + | # We can also use dplyr's `count()` to find the most common words in all the books | ||
| + | # as a whole. | ||
| + | tidy_books %>% | ||
| + | count(word, sort = TRUE) | ||
| + | |||
| + | # Because we've been using tidy tools, our word counts are stored in a tidy data frame. | ||
| + | # This allows us to pipe this directly to the ggplot2 package, | ||
| + | # for example to create a visualization of the most common words | ||
| + | # (Figure \@ref(fig:plotcount)). | ||
| + | library(ggplot2) | ||
| + | tidy_books %>% | ||
| + | count(word, sort = TRUE) %>% | ||
| + | filter(n > 600) %>% | ||
| + | mutate(word = reorder(word, n)) %>% | ||
| + | ggplot(aes(word, n)) + | ||
| + | geom_col() + | ||
| + | xlab(NULL) + | ||
| + | coord_flip() | ||
Latest revision as of 09:49, 2 December 2019
# Ref: https://github.com/dgrtwo/tidy-text-mining/blob/master/01-tidy-text.Rmd
# The tidy text format {#tidytext}
# Session setup for the examples below.
# knitr is attached so that its chunk-options object `opts_chunk` is in scope.
library(knitr)
# Default chunk options: `message` and `warning` are set to FALSE and `cache`
# to TRUE for every chunk (presumably to keep rendered output quiet and avoid
# recomputing chunks when the document is knitted -- confirm against knitr docs).
opts_chunk$set(message = FALSE, warning = FALSE, cache = TRUE)
# Set both the console width and dplyr's print width to 100.
options(width = 100, dplyr.width = 100)
library(ggplot2)
# Make theme_light() the default ggplot2 theme for plots drawn afterwards.
theme_set(theme_light())
## The `unnest_tokens` function

# Sample input: a character vector holding four lines of verse,
# one string per line.
text <- c("Because I could not stop for Death -",
          "He kindly stopped for me -",
          "The Carriage held but just Ourselves -",
          "and Immortality")
text

# Put the text into a data frame so it can be handled with tidy tools;
# `line` records which element of `text` each row came from.
# (Each statement sits on its own line: R has no implicit statement separator
# within a line, so fusing them together is a parse error.)
library(dplyr)
text_df <- tibble(line = 1:4, text = text)
text_df

# Within our tidy text framework, we need to both break the text
# into individual tokens (a process called *tokenization*) *and*
# transform it to a tidy data structure.
# To do this, we use tidytext's `unnest_tokens()` function.
# The code must start on a fresh line after the comment, otherwise the `#`
# swallows it and it is never run.
library(tidytext)
text_df %>%
  unnest_tokens(word, text)
## Tidying the works of Jane Austen {#tidyausten}
# Packages used to load and annotate the novels.
library(janeaustenr)
library(dplyr)
library(stringr)

# Annotate each row of `austen_books()` within its own book:
#   * `linenumber` - position of the row inside the book (1, 2, 3, ...);
#   * `chapter`    - running count of rows whose text starts with "chapter "
#                    followed by a digit or one of the letters i/v/x/l/c
#                    (case-insensitive), so rows before the first such
#                    heading get 0.
# The grouping is dropped again at the end so later steps see a plain table.
original_books <- austen_books() %>%
  group_by(book) %>%
  mutate(linenumber = row_number()) %>%
  mutate(
    chapter = cumsum(
      str_detect(text, regex("^chapter [\\divxlc]", ignore_case = TRUE))
    )
  ) %>%
  ungroup()

original_books
# To work with this as a tidy dataset,
# we need to restructure it in the **one-token-per-row** format,
# which as we saw earlier is done with the `unnest_tokens()` function.
library(tidytext)
tidy_books <- original_books %>%
  unnest_tokens(word, text)
tidy_books

# Now that the data is in one-word-per-row format,
# we can manipulate it with tidy tools like dplyr.
# Often in text analysis, we will want to remove stop words;
# stop words are words that are not useful for an analysis,
# typically extremely common words such as "the", "of", "to", and so forth in English.
# We can remove stop words (kept in the tidytext dataset `stop_words`) with an `anti_join()`.
data(stop_words)
# The join key is spelled out so the match does not depend on whichever
# column names the two tables happen to share (and no "Joining with" message
# is emitted).
tidy_books <- tidy_books %>%
  anti_join(stop_words, by = "word")

# We can also use dplyr's `count()` to find the most common words in all the books
# as a whole.
tidy_books %>%
  count(word, sort = TRUE)

# Because we've been using tidy tools, our word counts are stored in a tidy data frame.
# This allows us to pipe this directly to the ggplot2 package,
# for example to create a visualization of the most common words
# (Figure \@ref(fig:plotcount)).
library(ggplot2)
tidy_books %>%
  count(word, sort = TRUE) %>%
  filter(n > 600) %>%
  # Reorder the words by frequency so the bars are drawn in count order.
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip()