R: tidytext: sentiment - Most common positive and negative words

From OnnoWiki
Jump to navigation Jump to search
# Ref: https://github.com/dgrtwo/tidy-text-mining/blob/master/02-sentiment-analysis.Rmd
# Session setup for the sentiment-analysis example below.
# NOTE(review): the code further down uses `%>%`, inner_join(), count(),
# get_sentiments(), tibble(), bind_rows() and stop_words, none of which are
# loaded in this snippet -- presumably dplyr and tidytext are attached
# elsewhere; confirm before running standalone.
library(knitr)
# knitr chunk defaults: hide messages and warnings, and cache chunk results.
opts_chunk$set(message = FALSE, warning = FALSE, cache = TRUE)
# Widen console output and dplyr's printed tables to 100 characters.
options(width = 100, dplyr.width = 100)
library(ggplot2)
# Make theme_light() the default theme for every ggplot2 plot that follows.
theme_set(theme_light())
# Count how often each word occurs with each Bing sentiment label.
# `tidy_books` is not defined in this snippet; it is presumably the
# one-word-per-row table built earlier in the tutorial -- TODO confirm.
bing_word_counts <- tidy_books %>%
  # Name the join key explicitly: without `by`, the join is done on every
  # column the two tables happen to share and prints a "Joining with" message.
  inner_join(get_sentiments("bing"), by = "word") %>%
  # One row per (word, sentiment) pair with its frequency `n`, largest first.
  count(word, sentiment, sort = TRUE) %>%
  # Drop any grouping carried over from the input so later steps start clean.
  ungroup()
# Print the resulting counts.
bing_word_counts
# Plot the ten most frequent words for each sentiment as horizontal bars,
# one facet per sentiment.
bing_word_counts %>%
  group_by(sentiment) %>%
  # Rank explicitly by the count column `n`. The superseded top_n(10) picked
  # its weighting column implicitly (the last column) and printed a
  # "Selecting by n" message; slice_max() keeps ties by default, as top_n() did.
  slice_max(n, n = 10) %>%
  ungroup() %>%
  # Turn `word` into a factor ordered by `n` so the bars are sorted by size.
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  # The facet strips already name the sentiment, so the fill legend is hidden.
  geom_col(show.legend = FALSE) +
  # Each facet gets its own word axis, since the two word sets differ.
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  # Flip the axes so the words are listed vertically and stay readable.
  coord_flip()


# Anomaly in the sentiment analysis: the lexicon codes "miss" as negative,
# but in Jane Austen's novels it is used as a title for young, unmarried
# women. Put a custom "miss" entry in front of the standard stop-word list.
custom_stop_words <- tibble(word = "miss", lexicon = "custom") %>%
  bind_rows(stop_words)
# Print the extended stop-word list.
custom_stop_words


Referensi

Pranala Menarik