# R: tidytext: document-term matrices - mining financial articles
# Jump to navigation
# Jump to search
# Ref: https://github.com/dgrtwo/tidy-text-mining/blob/master/05-document-term-matrices.Rmd
library(tm.plugin.webmining)
library(purrr)
# Company names to analyse; `company` and `symbol` are parallel vectors
# (element i of one corresponds to element i of the other).
company <- c(
  "Microsoft", "Apple", "Google",
  "Amazon", "Facebook", "Twitter",
  "IBM", "Yahoo", "Netflix"
)
# Ticker symbol for each entry of `company`, in the same order.
symbol <- c(
  "MSFT", "AAPL", "GOOG",
  "AMZN", "FB", "TWTR",
  "IBM", "YHOO", "NFLX"
)
# Fetch the article corpus for one ticker symbol.
#
# symbol: ticker symbol string, e.g. "MSFT"; it is prefixed with "NASDAQ:"
#         to form the query handed to GoogleFinanceSource().
# Returns whatever WebCorpus() builds from that source.
download_articles <- function(symbol) {
  query <- paste0("NASDAQ:", symbol)
  feed <- GoogleFinanceSource(query)
  WebCorpus(feed)
}
# THERE IS AN ERROR here, apparently either in the download_articles function
# or in the tibble call.
# One row per company; `corpus` is a list-column holding the result of
# download_articles() for each symbol.
stock_articles <- mutate(
  tibble(company = company, symbol = symbol),
  corpus = map(symbol, download_articles)
)
stock_articles
# tokenized
# Apply tidy() to each downloaded corpus, unnest the resulting list-column,
# then split the `text` column into one `word` per row. Only the columns
# used by the later steps are kept.
stock_tokens <- stock_articles %>%
  mutate(corpus = map(corpus, tidy)) %>%
  unnest(cols = corpus) %>%
  unnest_tokens(word, text) %>%
  select(company, datetimestamp, word, id, heading)
stock_tokens
# tf-idf
library(stringr)
# Count words per company, drop any token containing a digit, then compute
# tf-idf treating each company as one document; highest tf-idf first.
stock_tf_idf <- stock_tokens %>%
  count(company, word) %>%
  filter(!str_detect(word, "\\d+")) %>%
  bind_tf_idf(word, company, n) %>%
  arrange(-tf_idf)
# plot
# For each company keep the rows selected by top_n(8, tf_idf), order the words
# by tf-idf, and draw one horizontal bar chart per company.
stock_tf_idf %>%
  group_by(company) %>%
  top_n(n = 8, wt = tf_idf) %>%
  ungroup() %>%
  mutate(word = reorder(word, tf_idf)) %>%
  ggplot(mapping = aes(x = word, y = tf_idf, fill = company)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ company, scales = "free") +
  coord_flip() +
  xlab("Word") +
  ylab("tf-idf")
# sentiment
# Contribution of a word = sum over its (word, id) counts of n * AFINN value.
# The 12 words with the largest absolute contribution are plotted.
stock_tokens %>%
  anti_join(stop_words, by = "word") %>%
  count(word, id, sort = TRUE) %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%
  group_by(word) %>%
  summarize(contribution = sum(n * value)) %>%
  top_n(n = 12, wt = abs(contribution)) %>%
  mutate(word = reorder(word, contribution)) %>%
  ggplot(mapping = aes(x = word, y = contribution)) +
  geom_col() +
  coord_flip() +
  ylab("Frequency of word * AFINN value")
# AFINN contribution chart (repeat of the chart above). The lexicon is loaded
# with get_sentiments("afinn") because no `afinn` object is created anywhere
# in this script.
stock_tokens %>%
  anti_join(stop_words, by = "word") %>%
  count(word, id, sort = TRUE) %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%
  group_by(word) %>%
  summarize(contribution = sum(n * value)) %>%
  top_n(12, abs(contribution)) %>%
  mutate(word = reorder(word, contribution)) %>%
  ggplot(aes(word, contribution)) +
  geom_col() +
  coord_flip() +
  labs(y = "Frequency of word * AFINN value")
# Most frequent words within each Loughran sentiment category: count every
# word, keep those present in the lexicon, then take top_n(5, n) per
# sentiment and draw one facet per sentiment.
stock_tokens %>%
  count(word) %>%
  inner_join(get_sentiments("loughran"), by = "word") %>%
  group_by(sentiment) %>%
  top_n(5, n) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(mapping = aes(x = word, y = n)) +
  geom_col() +
  facet_wrap(~ sentiment, scales = "free") +
  coord_flip() +
  labs(y = "Frequency of this word in the recent financial articles")
# Loughran word-frequency chart (repeat of the chart above). The lexicon is
# loaded with get_sentiments("loughran") because no `loughran` object is
# created anywhere in this script.
stock_tokens %>%
  count(word) %>%
  inner_join(get_sentiments("loughran"), by = "word") %>%
  group_by(sentiment) %>%
  top_n(5, n) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col() +
  coord_flip() +
  facet_wrap(~ sentiment, scales = "free") +
  ylab("Frequency of this word in the recent financial articles")
# calculate sentiment
# Count Loughran-lexicon matches per (sentiment, company), then spread the
# sentiments into columns so each company is one row; missing combinations
# are filled with 0.
stock_sentiment_count <- stock_tokens %>%
  inner_join(get_sentiments("loughran"), by = "word") %>%
  count(sentiment, company) %>%
  spread(key = sentiment, value = n, fill = 0)
stock_sentiment_count
# Loughran sentiment counts per company, one column per sentiment category
# (missing combinations filled with 0). The lexicon is loaded with
# get_sentiments("loughran") because no `loughran` object is created anywhere
# in this script.
stock_sentiment_count <- stock_tokens %>%
  inner_join(get_sentiments("loughran"), by = "word") %>%
  count(sentiment, company) %>%
  spread(sentiment, n, fill = 0)
stock_sentiment_count
# Positivity score per company: (positive - negative) / (positive + negative).
# Companies are ordered by score and bars are coloured by whether score > 0.
stock_sentiment_count %>%
  mutate(
    score = (positive - negative) / (positive + negative),
    company = reorder(company, score)
  ) %>%
  ggplot(mapping = aes(x = company, y = score, fill = score > 0)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  xlab("Company") +
  ylab("Positivity score among 20 recent news articles")
# Referensi (References)