Difference between revisions of "R: tidytext: tidytext"

From OnnoWiki
Jump to navigation Jump to search
(Created page with "# Ref: https://github.com/dgrtwo/tidy-text-mining/blob/master/01-tidy-text.Rmd ==Referensi== * https://github.com/dgrtwo/tidy-text-mining/blob/master/01-tidy-text.Rm...")
 
 
Line 1: Line 1:
# Ref: https://github.com/dgrtwo/tidy-text-mining/blob/master/01-tidy-text.Rmd
+
# Ref: https://github.com/dgrtwo/tidy-text-mining/blob/master/01-tidy-text.Rmd
  
 +
# The tidy text format {#tidytext}
 +
library(knitr)
 +
opts_chunk$set(message = FALSE, warning = FALSE, cache = TRUE)
 +
options(width = 100, dplyr.width = 100)
 +
library(ggplot2)
 +
theme_set(theme_light())
  
  
 +
## The `unnest_tokens` function
 +
text <- c("Because I could not stop for Death -",
 +
          "He kindly stopped for me -",
 +
          "The Carriage held but just Ourselves -",
 +
          "and Immortality")
 +
text
  
 +
library(dplyr)
 +
text_df <- tibble(line = 1:4, text = text)
 +
text_df
  
  
 +
# Within our tidy text framework, we need to both break the text
 +
# into individual tokens (a process called *tokenization*) *and*
 +
# transform it to a tidy data structure.
 +
# To do this, we use tidytext's `unnest_tokens()` function.
 +
library(tidytext)
 +
text_df %>%
 +
  unnest_tokens(word, text)
 +
 +
 +
 +
 +
## Tidying the works of Jane Austen {#tidyausten}
 +
library(janeaustenr)
 +
library(dplyr)
 +
library(stringr)
 +
original_books <- austen_books() %>%
 +
  group_by(book) %>%
 +
  mutate(linenumber = row_number(),
 +
          chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]",
 +
                                                  ignore_case = TRUE)))) %>%
 +
  ungroup()
 +
original_books
 +
 +
 +
# To work with this as a tidy dataset,
 +
# we need to restructure it in the **one-token-per-row** format,
 +
# which as we saw earlier is done with the `unnest_tokens()` function.
 +
library(tidytext)
 +
tidy_books <- original_books %>%
 +
  unnest_tokens(word, text)
 +
tidy_books
 +
 +
 +
# Now that the data is in one-word-per-row format,
 +
# we can manipulate it with tidy tools like dplyr.
 +
# Often in text analysis, we will want to remove stop words;
 +
# stop words are words that are not useful for an analysis,
 +
# typically extremely common words such as "the", "of", "to", and so forth in English.
 +
# We can remove stop words (kept in the tidytext dataset `stop_words`) with an `anti_join()`.
 +
data(stop_words)
 +
tidy_books <- tidy_books %>%
 +
  anti_join(stop_words)
 +
 +
# We can also use dplyr's `count()` to find the most common words in all the books
 +
# as a whole.
 +
tidy_books %>%
 +
  count(word, sort = TRUE)
 +
 +
# Because we've been using tidy tools, our word counts are stored in a tidy data frame.
 +
# This allows us to pipe this directly to the ggplot2 package,
 +
# for example to create a visualization of the most common words
 +
# (Figure \@ref(fig:plotcount)).
 +
library(ggplot2)
 +
tidy_books %>%
 +
  count(word, sort = TRUE) %>%
 +
  filter(n > 600) %>%
 +
  mutate(word = reorder(word, n)) %>%
 +
  ggplot(aes(word, n)) +
 +
  geom_col() +
 +
  xlab(NULL) +
 +
  coord_flip()
  
  

Latest revision as of 09:49, 2 December 2019

# Ref: https://github.com/dgrtwo/tidy-text-mining/blob/master/01-tidy-text.Rmd
# The tidy text format ----

# Attach the reporting and plotting packages before configuring them.
library(knitr)
library(ggplot2)

# Default knitr chunk options: suppress messages and warnings, enable caching.
opts_chunk$set(
  message = FALSE,
  warning = FALSE,
  cache = TRUE
)

# Set the console width and the `dplyr.width` option to 100.
options(
  width = 100,
  dplyr.width = 100
)

# Use theme_light() as the default ggplot2 theme from here on.
theme_set(theme_light())


## The `unnest_tokens` function ----

# Four lines of verse used as the toy example; each element is one line.
text <- c("Because I could not stop for Death -",
          "He kindly stopped for me -",
          "The Carriage held but just Ourselves -",
          "and Immortality")
text

library(dplyr)

# One row per line of verse. The line numbers are derived from `text` itself
# (instead of a hard-coded 1:4) so the `line` column always matches the number
# of elements, even if lines are added or removed above.
text_df <- tibble(line = seq_along(text), text = text)
text_df


# Tokenization splits text into individual tokens; for a tidy workflow the
# tokens must also end up in a tidy data structure. tidytext's
# `unnest_tokens()` does both: here it reads the `text` column of `text_df`
# and writes the tokens to a new `word` column.
library(tidytext)
unnest_tokens(tbl = text_df, output = word, input = text)



## Tidying the works of Jane Austen ----
library(janeaustenr)
library(dplyr)
library(stringr)

# Within each book: number the lines, then keep a running chapter counter
# that goes up by one at every line starting with "chapter " followed by a
# digit or one of the letters i, v, x, l, c (matched case-insensitively).
original_books <- austen_books() %>%
  group_by(book) %>%
  mutate(linenumber = row_number()) %>%
  mutate(
    chapter = cumsum(
      str_detect(
        string = text,
        pattern = regex("^chapter [\\divxlc]", ignore_case = TRUE)
      )
    )
  ) %>%
  ungroup()
original_books


# Restructure the books into the one-token-per-row format: `unnest_tokens()`
# splits the `text` column of `original_books` into tokens stored in `word`.
library(tidytext)
tidy_books <- unnest_tokens(
  tbl = original_books,
  output = word,
  input = text
)
tidy_books


# With the data in one-word-per-row format it can be manipulated with tidy
# tools like dplyr. Text analysis often starts by removing stop words: words
# that are not useful for an analysis, typically extremely common ones such as
# "the", "of", "to", and so forth in English. tidytext keeps them in the
# `stop_words` dataset.
data(stop_words)

# Drop every row of `tidy_books` whose word appears in `stop_words`.
# The join key is named explicitly rather than inferred from whichever column
# names the two tables happen to share; this keeps the match on `word` only
# and avoids dplyr's "Joining with `by = ...`" message.
tidy_books <- tidy_books %>%
  anti_join(stop_words, by = "word")
# Tally how often each word occurs across all the books taken together,
# sorted so the most frequent words come first.
count(tidy_books, word, sort = TRUE)
# The word counts are themselves a tidy data frame, so they can be piped
# straight into ggplot2. Keep the words counted more than 600 times, order
# the words by their count, and draw them as flipped (horizontal) columns.
library(ggplot2)
tidy_books %>%
  count(word, sort = TRUE) %>%
  filter(n > 600) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(mapping = aes(x = word, y = n)) +
  geom_col() +
  xlab(label = NULL) +
  coord_flip()


Referensi

Pranala Menarik