R: bigram 2

From OnnoWiki
Jump to navigation Jump to search

Sumber: http://www.rpubs.com/rgcmme/PLN-09


library(tm)
library(ggplot2)
library(reshape2)
library(wordcloud)
library(RWeka)
# Needed for a bug when calculating n-grams with weka
options(mc.cores=1)


setwd("~/Youtube")
path = "./review_polarity/txt_sentoken/"
dir = DirSource(paste(path,"pos/",sep=""), encoding = "UTF-8")
corpus = Corpus(dir)
 # check how many document loaded
length(corpus)
## [1] 1000
# access 1st entry
corpus1


Create bigram

corpus.ng = tm_map(corpus,removeWords,c(stopwords(),"s","ve"))
corpus.ng = tm_map(corpus.ng,removePunctuation)
corpus.ng = tm_map(corpus.ng,removeNumbers)
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
tdm.bigram = TermDocumentMatrix(corpus.ng,
control = list(tokenize = BigramTokenizer))
freq = sort(rowSums(as.matrix(tdm.bigram)),decreasing = TRUE)
freq.df = data.frame(word=names(freq), freq=freq)
head(freq.df, 20)
pal=brewer.pal(8,"Blues")
pal=pal[-(1:3)]
wordcloud(freq.df$word,freq.df$freq,max.words=100,random.order = F, colors=pal)
ggplot(head(freq.df,15), aes(reorder(word,freq), freq)) +
  geom_bar(stat = "identity") + coord_flip() +
  xlab("Bigrams") + ylab("Frequency") +
  ggtitle("Most frequent bigrams")


Trigram

TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
tdm.trigram = TermDocumentMatrix(corpus.ng,
control = list(tokenize = TrigramTokenizer))
freq = sort(rowSums(as.matrix(tdm.trigram)),decreasing = TRUE)
freq.df = data.frame(word=names(freq), freq=freq)
head(freq.df, 20)
wordcloud(freq.df$word,freq.df$freq,max.words=100,random.order = F, colors=pal)
ggplot(head(freq.df,15), aes(reorder(word,freq), freq)) +   
  geom_bar(stat="identity") + coord_flip() + 
  xlab("Trigrams") + ylab("Frequency") +
  ggtitle("Most frequent trigrams")






Referensi

Pranala Menarik