Difference between revisions of "R: ngram word clouds"
Onnowpurbo (talk | contribs) (Created page with "Sumber: http://www.rpubs.com/rgcmme/PLN-09 Load required libraries. library(tm) library(ggplot2) library(reshape2) library(wordcloud) library(RWeka) # Needed for a bug whe...") |
Onnowpurbo (talk | contribs) |
||
Line 4: | Line 4: | ||
Load required libraries. | Load required libraries. | ||
− | library(tm) | + | library(tm) |
− | library(ggplot2) | + | library(ggplot2) |
− | library(reshape2) | + | library(reshape2) |
− | library(wordcloud) | + | library(wordcloud) |
− | library(RWeka) | + | library(RWeka) |
# Needed for a bug when calculating n-grams with weka | # Needed for a bug when calculating n-grams with weka | ||
− | options(mc.cores=1) | + | options(mc.cores=1) |
Set the working directory to the location of the script and data. | Set the working directory to the location of the script and data. | ||
− | setwd("~/Youtube") | + | setwd("~/Youtube") |
Load corpus from local files. | Load corpus from local files. | ||
Line 23: | Line 23: | ||
Once unzipped, access the positive reviews in the dataset. | Once unzipped, access the positive reviews in the dataset. | ||
− | path = "./review_polarity/txt_sentoken/" | + | path = "./review_polarity/txt_sentoken/" |
− | dir = DirSource(paste(path,"pos/",sep=""), encoding = "UTF-8") | + | dir = DirSource(paste(path,"pos/",sep=""), encoding = "UTF-8") |
− | corpus = Corpus(dir) | + | corpus = Corpus(dir) |
Check how many documents have been loaded. | Check how many documents have been loaded. | ||
− | length(corpus) | + | length(corpus) |
## [1] 1000 | ## [1] 1000 | ||
Line 69: | Line 69: | ||
Apply transformations to the original corpus. In this case, add to the stop words list the “’s” and “’ve” words. | Apply transformations to the original corpus. In this case, add to the stop words list the “’s” and “’ve” words. | ||
− | corpus.ng = tm_map(corpus,removeWords,c(stopwords(),"s","ve")) | + | corpus.ng = tm_map(corpus,removeWords,c(stopwords(),"s","ve")) |
− | corpus.ng = tm_map(corpus.ng,removePunctuation) | + | corpus.ng = tm_map(corpus.ng,removePunctuation) |
− | corpus.ng = tm_map(corpus.ng,removeNumbers) | + | corpus.ng = tm_map(corpus.ng,removeNumbers) |
Use Weka’s n-gram tokenizer to create a TDM that uses as terms the bigrams that appear in the corpus. | Use Weka’s n-gram tokenizer to create a TDM that uses as terms the bigrams that appear in the corpus. | ||
− | BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2)) | + | BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2)) |
− | tdm.bigram = TermDocumentMatrix(corpus.ng, | + | tdm.bigram = TermDocumentMatrix(corpus.ng, |
− | control = list(tokenize = BigramTokenizer)) | + | control = list(tokenize = BigramTokenizer)) |
Extract the frequency of each bigram and analyse the twenty most frequent ones. | Extract the frequency of each bigram and analyse the twenty most frequent ones. | ||
− | freq = sort(rowSums(as.matrix(tdm.bigram)),decreasing = TRUE) | + | freq = sort(rowSums(as.matrix(tdm.bigram)),decreasing = TRUE) |
− | freq.df = data.frame(word=names(freq), freq=freq) | + | freq.df = data.frame(word=names(freq), freq=freq) |
− | head(freq.df, 20) | + | head(freq.df, 20) |
## word freq | ## word freq | ||
Line 111: | Line 111: | ||
You can invoke the display.brewer.all function to see the whole palette | You can invoke the display.brewer.all function to see the whole palette | ||
− | pal=brewer.pal(8,"Blues") | + | pal=brewer.pal(8,"Blues") |
− | pal=pal[-(1:3)] | + | pal=pal[-(1:3)] |
Plot the wordcloud. | Plot the wordcloud. | ||
− | wordcloud(freq.df$word,freq.df$freq,max.words=100,random.order = F, colors=pal) | + | wordcloud(freq.df$word,freq.df$freq,max.words=100,random.order = F, colors=pal) |
Plot the most frequent bigrams in a bar graph. | Plot the most frequent bigrams in a bar graph. | ||
− | ggplot(head(freq.df,15), aes(reorder(word,freq), freq)) + | + | ggplot(head(freq.df,15), aes(reorder(word,freq), freq)) + |
− | + | geom_bar(stat = "identity") + coord_flip() + | |
− | + | xlab("Bigrams") + ylab("Frequency") + | |
− | + | ggtitle("Most frequent bigrams") | |
Create a trigram wordcloud | Create a trigram wordcloud | ||
Line 129: | Line 129: | ||
To create a trigram wordcloud, the approach is the same but this time we tell the n-gram tokenizer to find trigrams. | To create a trigram wordcloud, the approach is the same but this time we tell the n-gram tokenizer to find trigrams. | ||
− | TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3)) | + | TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3)) |
− | tdm.trigram = TermDocumentMatrix(corpus.ng, | + | tdm.trigram = TermDocumentMatrix(corpus.ng, |
− | control = list(tokenize = TrigramTokenizer)) | + | control = list(tokenize = TrigramTokenizer)) |
Extract the frequency of each trigram and analyse the twenty most frequent ones. | Extract the frequency of each trigram and analyse the twenty most frequent ones. | ||
− | freq = sort(rowSums(as.matrix(tdm.trigram)),decreasing = TRUE) | + | freq = sort(rowSums(as.matrix(tdm.trigram)),decreasing = TRUE) |
− | freq.df = data.frame(word=names(freq), freq=freq) | + | freq.df = data.frame(word=names(freq), freq=freq) |
− | head(freq.df, 20) | + | head(freq.df, 20) |
## word freq | ## word freq | ||
Line 163: | Line 163: | ||
Plot the wordcloud. | Plot the wordcloud. | ||
− | wordcloud(freq.df$word,freq.df$freq,max.words=100,random.order = F, colors=pal) | + | wordcloud(freq.df$word,freq.df$freq,max.words=100,random.order = F, colors=pal) |
Plot the most frequent trigrams in a bar graph. | Plot the most frequent trigrams in a bar graph. | ||
− | ggplot(head(freq.df,15), aes(reorder(word,freq), freq)) + | + | ggplot(head(freq.df,15), aes(reorder(word,freq), freq)) + |
− | + | geom_bar(stat="identity") + coord_flip() + | |
− | + | xlab("Trigrams") + ylab("Frequency") + | |
− | + | ggtitle("Most frequent trigrams") | |
Latest revision as of 12:51, 7 November 2018
Sumber: http://www.rpubs.com/rgcmme/PLN-09
Load required libraries.
# Packages used by this tutorial: text mining (tm), plotting (ggplot2),
# word clouds (wordcloud) and the Weka n-gram tokenizer (RWeka).
library(tm)
library(ggplot2)
library(reshape2)
library(wordcloud)
library(RWeka)
# Needed for a bug when calculating n-grams with Weka:
# restrict parallel processing to a single core.
options(mc.cores = 1)
Set the working directory to the location of the script and data.
# The dataset path used below ("./review_polarity/...") is relative,
# so it is resolved against this working directory.
setwd("~/Youtube")
Load corpus from local files.
Load the Sentiment polarity dataset version 2.0 from the Movie review data.
Once unzipped, access the positive reviews in the dataset.
# Root folder of the unzipped dataset, relative to the working directory.
path <- "./review_polarity/txt_sentoken/"
# Read every file in the "pos/" sub-folder as UTF-8 and build the corpus.
dir <- DirSource(paste0(path, "pos/"), encoding = "UTF-8")
corpus <- Corpus(dir)
Check how many documents have been loaded.
length(corpus)
## [1] 1000
Access the document in the first entry.
corpus[[1]]
- <<PlainTextDocument (metadata: 7)>>
- films adapted from comic books have had plenty of success , whether they're about superheroes ( batman , superman , spawn ) , or geared toward kids ( casper ) or the arthouse crowd ( ghost world ) , but there's never really been a comic book like from hell before .
- for starters , it was created by alan moore ( and eddie campbell ) , who brought the medium to a whole new level in the mid '80s with a 12-part series called the watchmen .
- to say moore and campbell thoroughly researched the subject of jack the ripper would be like saying michael jackson is starting to look a little odd .
- the book ( or " graphic novel , " if you will ) is over 500 pages long and includes nearly 30 more that consist of nothing but footnotes .
- in other words , don't dismiss this film because of its source .
- if you can get past the whole comic book thing , you might find another stumbling block in from hell's directors , albert and allen hughes .
- getting the hughes brothers to direct this seems almost as ludicrous as casting carrot top in , well , anything , but riddle me this : who better to direct a film that's set in the ghetto and features really violent street crime than the mad geniuses behind menace ii society ?
- the ghetto in question is , of course , whitechapel in 1888 london's east end .
- it's a filthy , sooty place where the whores ( called " unfortunates " ) are starting to get a little nervous about this mysterious psychopath who has been carving through their profession with surgical precision .
- when the first stiff turns up , copper peter godley ( robbie coltrane , the world is not enough ) calls in inspector frederick abberline ( johnny depp , blow ) to crack the case .
- abberline , a widower , has prophetic dreams he unsuccessfully tries to quell with copious amounts of absinthe and opium .
- upon arriving in whitechapel , he befriends an unfortunate named mary kelly ( heather graham , say it isn't so ) and proceeds to investigate the horribly gruesome crimes that even the police surgeon can't stomach .
- i don't think anyone needs to be briefed on jack the ripper , so i won't go into the particulars here , other than to say moore and campbell have a unique and interesting theory about both the identity of the killer and the reasons he chooses to slay .
- in the comic , they don't bother cloaking the identity of the ripper , but screenwriters terry hayes ( vertical limit ) and rafael yglesias ( les mis ? rables ) do a good job of keeping him hidden from viewers until the very end .
- it's funny to watch the locals blindly point the finger of blame at jews and indians because , after all , an englishman could never be capable of committing such ghastly acts .
- and from hell's ending had me whistling the stonecutters song from the simpsons for days ( " who holds back the electric car/who made steve guttenberg a star ? " ) .
- don't worry - it'll all make sense when you see it .
- now onto from hell's appearance : it's certainly dark and bleak enough , and it's surprising to see how much more it looks like a tim burton film than planet of the apes did ( at times , it seems like sleepy hollow 2 ) .
- the print i saw wasn't completely finished ( both color and music had not been finalized , so no comments about marilyn manson ) , but cinematographer peter deming ( don't say a word ) ably captures the dreariness of victorian-era london and helped make the flashy killing scenes remind me of the crazy flashbacks in twin peaks , even though the violence in the film pales in comparison to that in the black-and-white comic .
- oscar winner martin childs' ( shakespeare in love ) production design turns the original prague surroundings into one creepy place .
- even the acting in from hell is solid , with the dreamy depp turning in a typically strong performance and deftly handling a british accent .
- ians holm ( joe gould's secret ) and richardson ( 102 dalmatians ) log in great supporting roles , but the big surprise here is graham .
- i cringed the first time she opened her mouth , imagining her attempt at an irish accent , but it actually wasn't half bad .
- the film , however , is all good .
- 2 : 00 - r for strong violence/gore , sexuality , language and drug content
Create a bigram wordcloud
Apply transformations to the original corpus. In this case, also add “s” and “ve” to the stop word list so that the “’s” and “’ve” endings are removed.
# Clean-up pipeline: drop stop words (plus "s" and "ve"), then
# punctuation, then numbers. Stop words are removed first, while the
# punctuation is still in the text.
corpus.ng <- tm_map(corpus, removeWords, c(stopwords(), "s", "ve"))
corpus.ng <- tm_map(corpus.ng, removePunctuation)
corpus.ng <- tm_map(corpus.ng, removeNumbers)
Use Weka’s n-gram tokenizer to create a TDM that uses as terms the bigrams that appear in the corpus.
# Tokenizer that emits only 2-word sequences (min = max = 2).
BigramTokenizer <- function(x) {
  NGramTokenizer(x, Weka_control(min = 2, max = 2))
}

# Term-document matrix whose terms are the bigrams found in the corpus.
tdm.bigram <- TermDocumentMatrix(
  corpus.ng,
  control = list(tokenize = BigramTokenizer)
)
Extract the frequency of each bigram and analyse the twenty most frequent ones.
# Total count of each bigram across all documents, most frequent first.
freq <- sort(rowSums(as.matrix(tdm.bigram)), decreasing = TRUE)
# One row per bigram: the term itself and its total count.
freq.df <- data.frame(word = names(freq), freq = freq)
head(freq.df, 20)
- word freq
- special effects special effects 171
- star wars star wars 133
- new york new york 131
- even though even though 120
- one best one best 112
- science fiction science fiction 84
- star trek star trek 84
- high school high school 81
- pulp fiction pulp fiction 75
- takes place takes place 72
- ever seen ever seen 68
- one day one day 68
- supporting cast supporting cast 68
- one thing one thing 62
- jackie chan jackie chan 61
- first film first film 60
- years ago years ago 59
- much like much like 58
- seems like seems like 57
- motion picture motion picture 56
Choose a nice range of blue colors for the wordcloud.
You can invoke the display.brewer.all() function to see all the available palettes.
# Take 8 colours from the "Blues" palette, then drop the first three
# so that only the remaining five are used in the cloud.
pal <- brewer.pal(8, "Blues")
pal <- pal[-(1:3)]
Plot the wordcloud.
# Show at most 100 terms; random.order = FALSE plots them in decreasing
# frequency instead of a random order. TRUE/FALSE are spelled out because
# T and F are ordinary variables that can be reassigned.
wordcloud(freq.df$word, freq.df$freq, max.words = 100,
          random.order = FALSE, colors = pal)
Plot the most frequent bigrams in a bar graph.
# Horizontal bar chart of the 15 most frequent bigrams, with bars
# ordered by frequency.
ggplot(head(freq.df, 15), aes(reorder(word, freq), freq)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(x = "Bigrams", y = "Frequency", title = "Most frequent bigrams")
Create a trigram wordcloud
To create a trigram wordcloud, the approach is the same but this time we tell the n-gram tokenizer to find trigrams.
# Tokenizer that emits only 3-word sequences (min = max = 3).
TrigramTokenizer <- function(x) {
  NGramTokenizer(x, Weka_control(min = 3, max = 3))
}

# Term-document matrix whose terms are the trigrams found in the corpus.
tdm.trigram <- TermDocumentMatrix(
  corpus.ng,
  control = list(tokenize = TrigramTokenizer)
)
Extract the frequency of each trigram and analyse the twenty most frequent ones.
# Total count of each trigram across all documents, most frequent first.
# Note: this overwrites the bigram `freq` / `freq.df` objects.
freq <- sort(rowSums(as.matrix(tdm.trigram)), decreasing = TRUE)
# One row per trigram: the term itself and its total count.
freq.df <- data.frame(word = names(freq), freq = freq)
head(freq.df, 20)
- word freq
- saving private ryan saving private ryan 39
- good will hunting good will hunting 34
- new york city new york city 29
- robert de niro robert de niro 25
- jay silent bob jay silent bob 22
- tommy lee jones tommy lee jones 22
- thin red line thin red line 21
- know last summer know last summer 20
- one best films one best films 20
- babe pig city babe pig city 18
- samuel l jackson samuel l jackson 17
- world war ii world war ii 16
- blair witch project blair witch project 15
- film takes place film takes place 15
- american history x american history x 14
- william h macy william h macy 13
- dusk till dawn dusk till dawn 12
- little known facts little known facts 12
- natural born killers natural born killers 12
- one best movies one best movies 12
Plot the wordcloud.
# Show at most 100 terms; random.order = FALSE plots them in decreasing
# frequency instead of a random order. TRUE/FALSE are spelled out because
# T and F are ordinary variables that can be reassigned.
wordcloud(freq.df$word, freq.df$freq, max.words = 100,
          random.order = FALSE, colors = pal)
Plot the most frequent trigrams in a bar graph.
# Horizontal bar chart of the 15 most frequent trigrams, with bars
# ordered by frequency.
ggplot(head(freq.df, 15), aes(reorder(word, freq), freq)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(x = "Trigrams", y = "Frequency", title = "Most frequent trigrams")