R: ngram word clouds

Source: http://www.rpubs.com/rgcmme/PLN-09


Load required libraries.

library(tm)
library(ggplot2)
library(reshape2)
library(wordcloud)
library(RWeka)
# Needed for a bug when calculating n-grams with weka
options(mc.cores=1)

Set the working directory to the location of the script and data.

setwd("~/Youtube")

Load corpus from local files.

Download the Sentiment polarity dataset version 2.0 from the Movie Review Data collection.
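
The dataset can also be fetched and extracted directly from R. A minimal sketch, assuming the archive review_polarity.tar.gz is downloaded from the Cornell movie-review-data page (this URL is an assumption and may change):

# Assumed download location for polarity dataset v2.0; adjust if the page moves.
url = "http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz"
download.file(url, destfile = "review_polarity.tar.gz")
# Extract into ./review_polarity so that ./review_polarity/txt_sentoken/ exists.
untar("review_polarity.tar.gz", exdir = "./review_polarity")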

Once extracted, access the positive reviews in the dataset.

path = "./review_polarity/txt_sentoken/"
dir = DirSource(paste(path,"pos/",sep=""), encoding = "UTF-8")
corpus = Corpus(dir)

Check how many documents have been loaded.

length(corpus)
## [1] 1000

Access the first document in the corpus.

corpus[[1]]

## <<PlainTextDocument (metadata: 7)>>
## films adapted from comic books have had plenty of success , whether they're about superheroes ( batman , superman , spawn ) , or geared toward kids ( casper ) or the arthouse crowd ( ghost world ) , but there's never really been a comic book like from hell before .
## for starters , it was created by alan moore ( and eddie campbell ) , who brought the medium to a whole new level in the mid '80s with a 12-part series called the watchmen .
## to say moore and campbell thoroughly researched the subject of jack the ripper would be like saying michael jackson is starting to look a little odd .
## the book ( or " graphic novel , " if you will ) is over 500 pages long and includes nearly 30 more that consist of nothing but footnotes .
## in other words , don't dismiss this film because of its source .
## if you can get past the whole comic book thing , you might find another stumbling block in from hell's directors , albert and allen hughes .
## getting the hughes brothers to direct this seems almost as ludicrous as casting carrot top in , well , anything , but riddle me this : who better to direct a film that's set in the ghetto and features really violent street crime than the mad geniuses behind menace ii society ?
## the ghetto in question is , of course , whitechapel in 1888 london's east end .
## it's a filthy , sooty place where the whores ( called " unfortunates " ) are starting to get a little nervous about this mysterious psychopath who has been carving through their profession with surgical precision .
## when the first stiff turns up , copper peter godley ( robbie coltrane , the world is not enough ) calls in inspector frederick abberline ( johnny depp , blow ) to crack the case .
## abberline , a widower , has prophetic dreams he unsuccessfully tries to quell with copious amounts of absinthe and opium .
## upon arriving in whitechapel , he befriends an unfortunate named mary kelly ( heather graham , say it isn't so ) and proceeds to investigate the horribly gruesome crimes that even the police surgeon can't stomach .
## i don't think anyone needs to be briefed on jack the ripper , so i won't go into the particulars here , other than to say moore and campbell have a unique and interesting theory about both the identity of the killer and the reasons he chooses to slay .
## in the comic , they don't bother cloaking the identity of the ripper , but screenwriters terry hayes ( vertical limit ) and rafael yglesias ( les mis ? rables ) do a good job of keeping him hidden from viewers until the very end .
## it's funny to watch the locals blindly point the finger of blame at jews and indians because , after all , an englishman could never be capable of committing such ghastly acts .
## and from hell's ending had me whistling the stonecutters song from the simpsons for days ( " who holds back the electric car/who made steve guttenberg a star ? " ) .
## don't worry - it'll all make sense when you see it .
## now onto from hell's appearance : it's certainly dark and bleak enough , and it's surprising to see how much more it looks like a tim burton film than planet of the apes did ( at times , it seems like sleepy hollow 2 ) .
## the print i saw wasn't completely finished ( both color and music had not been finalized , so no comments about marilyn manson ) , but cinematographer peter deming ( don't say a word ) ably captures the dreariness of victorian-era london and helped make the flashy killing scenes remind me of the crazy flashbacks in twin peaks , even though the violence in the film pales in comparison to that in the black-and-white comic .
## oscar winner martin childs' ( shakespeare in love ) production design turns the original prague surroundings into one creepy place .
## even the acting in from hell is solid , with the dreamy depp turning in a typically strong performance and deftly handling a british accent .
## ians holm ( joe gould's secret ) and richardson ( 102 dalmatians ) log in great supporting roles , but the big surprise here is graham .
## i cringed the first time she opened her mouth , imagining her attempt at an irish accent , but it actually wasn't half bad .
## the film , however , is all good .
## 2 : 00 - r for strong violence/gore , sexuality , language and drug content
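
Note that with newer versions of the tm package (0.6 and later) printing a document only shows its metadata header, not the text. A minimal sketch, assuming such a version, for displaying the text itself:

# Coerce the document to a character vector and print its lines.
writeLines(as.character(corpus[[1]]))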

Create a bigram wordcloud

Apply transformations to the original corpus. In this case, add the tokens "s" and "ve" (left over from contractions such as "it's" and "we've") to the stop word list, then remove punctuation and numbers.

corpus.ng = tm_map(corpus,removeWords,c(stopwords(),"s","ve"))
corpus.ng = tm_map(corpus.ng,removePunctuation)
corpus.ng = tm_map(corpus.ng,removeNumbers)

Use Weka's n-gram tokenizer to create a term-document matrix (TDM) whose terms are the bigrams that appear in the corpus.

BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
tdm.bigram = TermDocumentMatrix(corpus.ng,
                                control = list(tokenize = BigramTokenizer))

Extract the frequency of each bigram and analyse the twenty most frequent ones.

freq = sort(rowSums(as.matrix(tdm.bigram)),decreasing = TRUE)
freq.df = data.frame(word=names(freq), freq=freq)
head(freq.df, 20)
##                            word freq
## special effects special effects  171
## star wars             star wars  133
## new york               new york  131
## even though         even though  120
## one best               one best  112
## science fiction science fiction   84
## star trek             star trek   84
## high school         high school   81
## pulp fiction       pulp fiction   75
## takes place         takes place   72
## ever seen             ever seen   68
## one day                 one day   68
## supporting cast supporting cast   68
## one thing             one thing   62
## jackie chan         jackie chan   61
## first film           first film   60
## years ago             years ago   59
## much like             much like   58
## seems like           seems like   57
## motion picture   motion picture   56

Choose a nice range of blue colors for the wordcloud.

You can invoke the display.brewer.all function to see all the available palettes.
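
For example, the following sketch previews every palette (brewer.pal and display.brewer.all come from the RColorBrewer package, which wordcloud pulls in as a dependency):

# Show all ColorBrewer palettes so a suitable one can be picked by name.
library(RColorBrewer)
display.brewer.all()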

pal=brewer.pal(8,"Blues")
pal=pal[-(1:3)]

Plot the wordcloud.

wordcloud(freq.df$word,freq.df$freq,max.words=100,random.order = F, colors=pal)

Plot the most frequent bigrams in a bar graph.

ggplot(head(freq.df,15), aes(reorder(word,freq), freq)) +
  geom_bar(stat = "identity") + coord_flip() +
  xlab("Bigrams") + ylab("Frequency") +
  ggtitle("Most frequent bigrams")

Create a trigram wordcloud

To create a trigram wordcloud, the approach is the same but this time we tell the n-gram tokenizer to find trigrams.

TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
tdm.trigram = TermDocumentMatrix(corpus.ng,
                                 control = list(tokenize = TrigramTokenizer))

Extract the frequency of each trigram and analyse the twenty most frequent ones.

freq = sort(rowSums(as.matrix(tdm.trigram)),decreasing = TRUE)
freq.df = data.frame(word=names(freq), freq=freq)
head(freq.df, 20)
##                                      word freq
## saving private ryan   saving private ryan   39
## good will hunting       good will hunting   34
## new york city               new york city   29
## robert de niro             robert de niro   25
## jay silent bob             jay silent bob   22
## tommy lee jones           tommy lee jones   22
## thin red line               thin red line   21
## know last summer         know last summer   20
## one best films             one best films   20
## babe pig city               babe pig city   18
## samuel l jackson         samuel l jackson   17
## world war ii                 world war ii   16
## blair witch project   blair witch project   15
## film takes place         film takes place   15
## american history x     american history x   14
## william h macy             william h macy   13
## dusk till dawn             dusk till dawn   12
## little known facts     little known facts   12
## natural born killers natural born killers   12
## one best movies           one best movies   12

Plot the wordcloud.

wordcloud(freq.df$word,freq.df$freq,max.words=100,random.order = F, colors=pal)

Plot the most frequent trigrams in a bar graph.

ggplot(head(freq.df,15), aes(reorder(word,freq), freq)) +   
  geom_bar(stat="identity") + coord_flip() + 
  xlab("Trigrams") + ylab("Frequency") +
  ggtitle("Most frequent trigrams")




Interesting Links