Difference between revisions of "R: stopwords"

From OnnoWiki
Jump to navigation Jump to search
 
(2 intermediate revisions by the same user not shown)
Line 16: Line 16:
  
  
 
+
==Contoh 1==
  
 
  documents = c("She had toast for breakfast",
 
  documents = c("She had toast for breakfast",
Line 31: Line 31:
 
  documents
 
  documents
  
 +
 +
==Contoh 2==
 +
 +
 +
#downloading and installing the package from CRAN
 +
install.packages("tm")
 +
#loading tm
 +
library(tm)
 +
 +
#loading a text file from local computer
 +
newdata <- readlines(filepath)
 +
newdata <- readtext("filename.pdf")
 +
 +
#Load data as corpus
 +
#VectorSource() creates character vectors
 +
mydata <- Corpus(VectorSource(newdata))
 +
 +
# convert to lower case
 +
mydata <- tm_map(mydata, content_transformer(tolower))
 +
# remove non-word characters (e.g. what would be emojis)
 +
mydata<-tm_map(mydata, content_transformer(gsub), pattern="\\W",replace=" ")
 +
# remove URLs
 +
removeURL <- function(x) gsub("http[^[:space:]]*", "", x)
 +
mydata <- tm_map(mydata, content_transformer(removeURL))
 +
# remove anything other than English letters or space
 +
removeNumPunct <- function(x) gsub("[^[:alpha:][:space:]]*", "", x)
 +
mydata <- tm_map(mydata, content_transformer(removeNumPunct))
 +
# remove stopwords
 +
mydata <- tm_map(mydata, removeWords, stopwords("english"))
 +
mydata <- tm_map(mydata, removeWords, stopwords::stopwords("id", source = "stopwords-iso"))
 +
# You can create custom stop words using the code below.
 +
#myStopwords <- c(setdiff(stopwords('english'), c("r", "big")),"use", "see", "used", "via", "amp")
 +
#mydata <- tm_map(mydata, removeWords, myStopwords)
 +
# remove extra whitespace
 +
mydata <- tm_map(mydata, stripWhitespace)
 +
# Remove numbers
 +
mydata <- tm_map(mydata, removeNumbers)
 +
# Remove punctuations
 +
mydata <- tm_map(mydata, removePunctuation)
 +
 +
 +
# stemming
 +
library(SnowballC)
 +
mydata <- tm_map(mydata, stemDocument)
 +
 +
#create a term matrix and store it as dtm
 +
dtm <- TermDocumentMatrix(mydata)
  
 
==Pranala Menarik==
 
==Pranala Menarik==
  
 
* [[R]]
 
* [[R]]

Latest revision as of 13:05, 1 November 2018


# Install the released version of the stopwords package.
install.packages("stopwords")
# Or: install devtools, then use it to install stopwords from the
# quanteda/stopwords GitHub repository instead.
install.packages("devtools")
devtools::install_github("quanteda/stopwords")


# Show the first 20 stop words for language code "de" from the "snowball"
# source.
head(stopwords::stopwords("de", source = "snowball"), 20)
# Show the first 20 stop words for language code "id" from the
# "stopwords-iso" source.
head(stopwords::stopwords("id", source = "stopwords-iso"), 20)
# List the stop-word sources the package knows about.
stopwords::stopwords_getsources()
# List the languages available in each of the two sources used above.
stopwords::stopwords_getlanguages("snowball")
stopwords::stopwords_getlanguages("stopwords-iso")


Contoh 1

# Example 1: clean a small in-memory corpus with tm.
library(tm)

# Six short English sentences; each element becomes one document.
documents <- c(
  "She had toast for breakfast",
  "The coffee this morning was excellent",
  "For lunch let's all have pancakes",
  "Later in the day, there will be more talks",
  "The talks on the first day were great",
  "The second day should have good presentations too"
)

# Wrap the character vector in a tm corpus.
documents <- Corpus(VectorSource(documents))
# Lower-case first so the (lower-case) stop-word list can match.
documents <- tm_map(documents, content_transformer(tolower))
# Remove stop words BEFORE stripping punctuation: the English list contains
# apostrophe forms such as "let's", which no longer match once the
# apostrophe has been removed ("lets").
documents <- tm_map(documents, removeWords, stopwords("english"))
documents <- tm_map(documents, removePunctuation)
# Print a summary of the cleaned corpus.
documents


Contoh 2

# Example 2: load text from disk, clean it, and build a term-document matrix.

# Download and install the tm package from CRAN (only needed once).
install.packages("tm")
# Load tm, plus SnowballC, which is needed for the stemming step below.
library(tm)
library(SnowballC)

# Load the text from the local computer. The two assignments are
# alternatives (the second overwrites the first); keep the one that matches
# your input.
# NOTE: `filepath` is not defined in this snippet -- set it to the path of
# your text file before running.
newdata <- readLines(filepath) # R is case-sensitive: readLines, not readlines
# readtext() returns a data frame; the document text is in its `text`
# column, which is what VectorSource() needs.
newdata <- readtext::readtext("filename.pdf")$text

# Load the data as a corpus.
# VectorSource() treats each element of the character vector as a document.
mydata <- Corpus(VectorSource(newdata))

# Convert to lower case.
mydata <- tm_map(mydata, content_transformer(tolower))

# Remove URLs. This has to happen before any punctuation is replaced:
# the pattern relies on the URL still being a single whitespace-free token.
removeURL <- function(x) gsub("http[^[:space:]]*", "", x)
mydata <- tm_map(mydata, content_transformer(removeURL))

# Remove stop words while apostrophes are still present, so entries such as
# "don't" in the stop-word lists can match.
mydata <- tm_map(mydata, removeWords, stopwords("english"))
mydata <- tm_map(
  mydata, removeWords,
  stopwords::stopwords("id", source = "stopwords-iso")
)
# You can create custom stop words using the code below.
# myStopwords <- c(setdiff(stopwords('english'), c("r", "big")),
#                  "use", "see", "used", "via", "amp")
# mydata <- tm_map(mydata, removeWords, myStopwords)

# Replace every non-word character (e.g. what would be emojis) with a space.
mydata <- tm_map(
  mydata, content_transformer(gsub),
  pattern = "\\W", replacement = " "
)

# Remove anything other than letters or whitespace.
removeNumPunct <- function(x) gsub("[^[:alpha:][:space:]]*", "", x)
mydata <- tm_map(mydata, content_transformer(removeNumPunct))

# Remove numbers.
mydata <- tm_map(mydata, removeNumbers)
# Remove punctuation.
mydata <- tm_map(mydata, removePunctuation)
# Collapse extra whitespace last, after every removal step that can leave
# gaps behind.
mydata <- tm_map(mydata, stripWhitespace)

# Stemming.
mydata <- tm_map(mydata, stemDocument)

# Create a term-document matrix and store it as dtm.
dtm <- TermDocumentMatrix(mydata)

Pranala Menarik