Difference between revisions of "R: tidy text dataset - tibble"

From OnnoWiki
Jump to navigation Jump to search
 
(5 intermediate revisions by the same user not shown)
Line 35: Line 35:
 
               unnest_tokens(word, text)
 
               unnest_tokens(word, text)
 
  tidy_books
 
  tidy_books
 +
 +
Buang stopwords
 +
 +
data(stop_words)
 +
tidy_books <- tidy_books %>%
 +
              anti_join(stop_words)
 +
 +
Word Count
 +
 +
tidy_books %>%
 +
    count(word, sort = TRUE)
 +
 +
Plot
 +
 +
library(ggplot2)
 +
tidy_books %>%
 +
    count(word, sort = TRUE) %>%
 +
filter(n > 600) %>%
 +
mutate(word = reorder(word, n)) %>%
 +
ggplot(aes(word, n)) +
 +
        geom_col() +
 +
        xlab(NULL) +
 +
        coord_flip()
 +
 +
 +
==Tidy Text gutenbergr==
 +
 +
install.packages("gutenbergr")
 +
library(gutenbergr)
 +
 +
 +
hgwells <- gutenberg_download(c(35, 36, 5230, 159))
 +
tidy_hgwells <- hgwells %>%
 +
                unnest_tokens(word, text) %>%
 +
anti_join(stop_words)
 +
tidy_hgwells %>%
 +
    count(word, sort = TRUE)
 +
 +
 +
bronte <- gutenberg_download(c(1260, 768, 969, 9182, 767))
 +
tidy_bronte <- bronte %>%
 +
                unnest_tokens(word, text) %>%
 +
anti_join(stop_words)
 +
tidy_bronte %>%
 +
    count(word, sort = TRUE)
 +
 +
Compare 3 text
 +
 +
library(tidyr)
 +
frequency <- bind_rows(mutate(tidy_bronte, author = "Brontë Sisters"),
 +
                        mutate(tidy_hgwells, author = "H.G. Wells"),
 +
                        mutate(tidy_books, author = "Jane Austen")) %>%
 +
    mutate(word = str_extract(word, "[a-z']+")) %>%
 +
    count(author, word) %>%
 +
    group_by(author) %>%
 +
    mutate(proportion = n / sum(n)) %>%
 +
    select(-n) %>%
 +
    spread(author, proportion) %>%
 +
    gather(author, proportion, `Brontë Sisters`:`H.G. Wells`)
 +
 +
Plot
 +
 +
library(scales)
 +
# expect a warning about rows with missing values being removed
 +
ggplot(frequency, aes(x = proportion, y = `Jane Austen`,
 +
                  color = abs(`Jane Austen` - proportion))) +
 +
        geom_abline(color = "gray40", lty = 2) +
 +
        geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
 +
        geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
 +
        scale_x_log10(labels = percent_format()) +
 +
        scale_y_log10(labels = percent_format()) +
 +
        scale_color_gradient(limits = c(0, 0.001),
 +
            low = "darkslategray4", high = "gray75") +
 +
            facet_wrap(~author, ncol = 2) +
 +
            theme(legend.position="none") +
 +
        labs(y = "Jane Austen", x = NULL)
 +
 +
 +
Correlation
 +
 +
cor.test(data = frequency[frequency$author == "Brontë Sisters",],
 +
          ~ proportion + `Jane Austen`)
 +
 +
 +
cor.test(data = frequency[frequency$author == "H.G. Wells",],
 +
          ~ proportion + `Jane Austen`)
  
 
==Pranala Menarik==
 
==Pranala Menarik==
  
 
* [[R]]
 
* [[R]]

Latest revision as of 10:52, 31 October 2018

Text Vector

# Four lines of verse held as a character vector, one element per line.
text <- c(
  "Because I could not stop for Death -",
  "He kindly stopped for me -",
  "The Carriage held but just Ourselves -",
  "and Immortality"
)

# Show the vector.
text


Tidy Text Dataset

# Install dplyr only when it is missing, so re-running the script does not
# reinstall the package (and hit the network) every time.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(dplyr)

# Put the character vector into a tibble: one row per line of text, with the
# line number alongside it. tibble() replaces the deprecated data_frame() and
# is re-exported by dplyr; seq_along() keeps the numbering in step with the
# vector instead of hard-coding its length.
text_df <- tibble(line = seq_along(text), text = text)

# Show the tibble.
text_df


Tidy Text Novel

library(janeaustenr)
library(dplyr)
library(stringr)

# Annotate every line returned by austen_books() with two per-book counters:
#   linenumber - position of the line within its book
#   chapter    - running count of lines that match the chapter-heading
#                pattern (case-insensitive "chapter " followed by a digit or
#                one of the letters i, v, x, l, c)
# The grouping is dropped again at the end so later steps see a plain tibble.
original_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    chapter = text %>%
      str_detect(regex("^chapter [\\divxlc]", ignore_case = TRUE)) %>%
      cumsum()
  ) %>%
  ungroup()

# Show the annotated table.
original_books

Buat menjadi one-token-per-row

library(tidytext)

# Split the `text` column into individual tokens, producing one row per token
# in a new `word` column.
tidy_books <- unnest_tokens(original_books, word, text)

# Show the tokenised table.
tidy_books

Buang stopwords

# Load the stop_words dataset into the workspace.
data(stop_words)

# Drop every token that also appears in stop_words. The join key is spelled
# out so the result does not depend on whichever column names the two tables
# happen to share, and so dplyr does not print its "Joining with" message.
tidy_books <- tidy_books %>%
  anti_join(stop_words, by = "word")

Word Count

# Tally how often each word occurs, most frequent first.
count(tidy_books, word, sort = TRUE)

Plot

library(ggplot2)

# Horizontal bar chart of the words that occur more than 600 times.
tidy_books %>%
  count(word, sort = TRUE) %>%
  filter(n > 600) %>%
  # Reorder the word factor by count so the bars are drawn in count order
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  # No axis title for the word axis
  labs(x = NULL) +
  # Flip the axes so the words run down the side of the plot
  coord_flip()


Tidy Text gutenbergr

# Install gutenbergr only when it is missing, so re-running the script does
# not reinstall the package (and hit the network) every time.
if (!requireNamespace("gutenbergr", quietly = TRUE)) {
  install.packages("gutenbergr")
}
library(gutenbergr)


# Download four works, identified by their numeric Project Gutenberg IDs.
hgwells <- gutenberg_download(c(35, 36, 5230, 159))

# One token per row, with stop words removed. The join key is explicit so the
# filter does not depend on which column names the two tables share and no
# "Joining with" message is printed.
tidy_hgwells <- hgwells %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word")

# Remaining words, most frequent first.
tidy_hgwells %>%
  count(word, sort = TRUE)


# Download five works, identified by their numeric Project Gutenberg IDs.
bronte <- gutenberg_download(c(1260, 768, 969, 9182, 767))

# One token per row, with stop words removed. The join key is explicit so the
# filter does not depend on which column names the two tables share and no
# "Joining with" message is printed.
tidy_bronte <- bronte %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word")

# Remaining words, most frequent first.
tidy_bronte %>%
  count(word, sort = TRUE)

Compare 3 texts

library(tidyr)

# Relative word frequency per author, reshaped so that each row pairs one
# word's proportion for "Brontë Sisters" or "H.G. Wells" (columns `author`
# and `proportion`) with the same word's proportion in the `Jane Austen`
# column.
#
# pivot_wider()/pivot_longer() replace the superseded spread()/gather() and
# need tidyr 1.0.0 or later. The rows come out in a different order than
# gather() produced; the contents are the same.
frequency <- bind_rows(
  mutate(tidy_bronte, author = "Brontë Sisters"),
  mutate(tidy_hgwells, author = "H.G. Wells"),
  mutate(tidy_books, author = "Jane Austen")
) %>%
  # Keep only the first run of lowercase letters/apostrophes in each token;
  # tokens without such a run become NA
  mutate(word = str_extract(word, "[a-z']+")) %>%
  count(author, word) %>%
  # Share of each word among all counted tokens of the same author
  group_by(author) %>%
  mutate(proportion = n / sum(n)) %>%
  # Drop the grouping before reshaping so it cannot leak into later steps
  ungroup() %>%
  select(-n) %>%
  # One column per author ...
  pivot_wider(names_from = author, values_from = proportion) %>%
  # ... then stack the two comparison authors back into long form, leaving
  # `Jane Austen` as its own column. Columns are named explicitly rather than
  # taken as a positional range, so column order does not matter.
  pivot_longer(
    cols = c(`Brontë Sisters`, `H.G. Wells`),
    names_to = "author",
    values_to = "proportion"
  )

Plot

library(scales)

# Scatter plot of word proportions: each comparison author on the x axis
# against Jane Austen on the y axis, one facet per comparison author.
# expect a warning about rows with missing values being removed
ggplot(
  frequency,
  aes(
    x = proportion,
    y = `Jane Austen`,
    # Colour each point by the absolute difference between the proportions
    color = abs(`Jane Austen` - proportion)
  )
) +
  # Dashed reference line
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
  # Both axes on a log10 scale, labelled as percentages
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_color_gradient(
    limits = c(0, 0.001),
    low = "darkslategray4",
    high = "gray75"
  ) +
  facet_wrap(~author, ncol = 2) +
  labs(y = "Jane Austen", x = NULL) +
  theme(legend.position = "none")


Correlation

# Correlation test between the Brontë Sisters word proportions and the
# Jane Austen proportions.
cor.test(
  ~ proportion + `Jane Austen`,
  data = subset(frequency, author == "Brontë Sisters")
)


# The same test for the H.G. Wells word proportions.
cor.test(
  ~ proportion + `Jane Austen`,
  data = subset(frequency, author == "H.G. Wells")
)

Pranala Menarik