Difference between revisions of "R: stopwords"
Jump to navigation
Jump to search
Onnowpurbo (talk | contribs) (Created page with " install.packages("stopwords") # atau install.packages("devtools") devtools::install_github("quanteda/stopwords") head(stopwords::stopwords("de", source = "snowball")...") |
Onnowpurbo (talk | contribs) |
||
(3 intermediate revisions by the same user not shown) | |||
Line 14: | Line 14: | ||
stopwords::stopwords_getlanguages("snowball") | stopwords::stopwords_getlanguages("snowball") | ||
stopwords::stopwords_getlanguages("stopwords-iso") | stopwords::stopwords_getlanguages("stopwords-iso") | ||
+ | |||
+ | |||
+ | ==Contoh 1== | ||
+ | |||
+ | documents = c("She had toast for breakfast", | ||
+ | "The coffee this morning was excellent", | ||
+ | "For lunch let's all have pancakes", | ||
+ | "Later in the day, there will be more talks", | ||
+ | "The talks on the first day were great", | ||
+ | "The second day should have good presentations too") | ||
+ | library(tm) | ||
+ | documents <- Corpus(VectorSource(documents)) | ||
+ | documents = tm_map(documents, content_transformer(tolower)) | ||
+ | documents = tm_map(documents, removePunctuation) | ||
+ | documents = tm_map(documents, removeWords, stopwords("english")) | ||
+ | documents | ||
+ | |||
+ | |||
+ | ==Contoh 2== | ||
+ | |||
+ | |||
+ | #downloading and installing the package from CRAN | ||
+ | install.packages("tm") | ||
+ | #loading tm | ||
+ | library(tm) | ||
+ | |||
+ | #loading a text file from local computer | ||
+ | newdata <- readlines(filepath) | ||
+ | newdata <- readtext("filename.pdf") | ||
+ | |||
+ | #Load data as corpus | ||
+ | #VectorSource() creates character vectors | ||
+ | mydata <- Corpus(VectorSource(newdata)) | ||
+ | |||
+ | # convert to lower case | ||
+ | mydata <- tm_map(mydata, content_transformer(tolower)) | ||
+ | # replace non-word characters (e.g. what would be emojis) with spaces | ||
+ | mydata<-tm_map(mydata, content_transformer(gsub), pattern="\\W",replace=" ") | ||
+ | # remove URLs | ||
+ | removeURL <- function(x) gsub("http[^[:space:]]*", "", x) | ||
+ | mydata <- tm_map(mydata, content_transformer(removeURL)) | ||
+ | # remove anything other than English letters or space | ||
+ | removeNumPunct <- function(x) gsub("[^[:alpha:][:space:]]*", "", x) | ||
+ | mydata <- tm_map(mydata, content_transformer(removeNumPunct)) | ||
+ | # remove stopwords | ||
+ | mydata <- tm_map(mydata, removeWords, stopwords("english")) | ||
+ | mydata <- tm_map(mydata, removeWords, stopwords::stopwords("id", source = "stopwords-iso")) | ||
+ | # You can create custom stop words using the code below. | ||
+ | #myStopwords <- c(setdiff(stopwords('english'), c("r", "big")),"use", "see", "used", "via", "amp") | ||
+ | #mydata <- tm_map(mydata, removeWords, myStopwords) | ||
+ | # remove extra whitespace | ||
+ | mydata <- tm_map(mydata, stripWhitespace) | ||
+ | # Remove numbers | ||
+ | mydata <- tm_map(mydata, removeNumbers) | ||
+ | # Remove punctuations | ||
+ | mydata <- tm_map(mydata, removePunctuation) | ||
+ | |||
+ | |||
+ | # stemming | ||
+ | library(SnowballC) | ||
+ | mydata <- tm_map(mydata, stemDocument) | ||
+ | |||
+ | #create a term matrix and store it as dtm | ||
+ | dtm <- TermDocumentMatrix(mydata) | ||
==Pranala Menarik== | ==Pranala Menarik== | ||
* [[R]] | * [[R]] |
Latest revision as of 13:05, 1 November 2018
# Install the stopwords package from CRAN.
install.packages("stopwords")
# Or install it from the quanteda/stopwords GitHub repository instead:
#   install.packages("devtools")
#   devtools::install_github("quanteda/stopwords")

# Show the first 20 stopwords for language code "de" (snowball source)
# and for language code "id" (stopwords-iso source).
head(stopwords::stopwords("de", source = "snowball"), 20)
head(stopwords::stopwords("id", source = "stopwords-iso"), 20)

# List the available stopword sources, then the languages offered by the
# "snowball" and "stopwords-iso" sources.
stopwords::stopwords_getsources()
stopwords::stopwords_getlanguages("snowball")
stopwords::stopwords_getlanguages("stopwords-iso")
Contoh 1
# Example 1: build a small corpus from six sentences, lower-case it,
# strip punctuation and remove English stopwords with the tm package.
library(tm)

documents <- c(
  "She had toast for breakfast",
  "The coffee this morning was excellent",
  "For lunch let's all have pancakes",
  "Later in the day, there will be more talks",
  "The talks on the first day were great",
  "The second day should have good presentations too"
)

# Wrap the character vector in a tm corpus.
documents <- Corpus(VectorSource(documents))

# Cleaning steps, applied in order.
documents <- tm_map(documents, content_transformer(tolower))
documents <- tm_map(documents, removePunctuation)
documents <- tm_map(documents, removeWords, stopwords("english"))

# Print the resulting corpus.
documents
Contoh 2
# Example 2: read a text file, clean it with tm and build a
# term-document matrix stored in `dtm`.

# Install tm from CRAN only when it is missing, so re-running the script
# does not reinstall the package every time.
if (!requireNamespace("tm", quietly = TRUE)) {
  install.packages("tm")
}
library(tm)
library(SnowballC) # loaded for the stemDocument() step below

# Load a text file from the local computer. `filepath` must be set
# beforehand to the path of the file to read. R is case-sensitive: the
# base function is readLines(), not readlines().
newdata <- readLines(filepath)
# Alternative for a PDF (requires the readtext package); use it instead of
# the readLines() call above, not in addition to it:
#   newdata <- readtext("filename.pdf")
# NOTE(review): readtext() is documented to return a data frame, so the
# text column (presumably newdata$text) would have to be passed to
# VectorSource() -- confirm before enabling.

# Load the data as a corpus. VectorSource() takes the character vector
# `newdata` as the corpus source.
mydata <- Corpus(VectorSource(newdata))

# Convert to lower case.
mydata <- tm_map(mydata, content_transformer(tolower))

# Remove URLs. This has to run BEFORE non-word characters are replaced:
# once "://", "." and "/" have been turned into spaces, the pattern below
# would only strip the bare "http" token and leave the rest of the URL.
removeURL <- function(x) gsub("http[^[:space:]]*", "", x)
mydata <- tm_map(mydata, content_transformer(removeURL))

# Replace every non-word character (e.g. what would be emojis) by a space.
# `replacement` is spelled out instead of relying on partial matching.
mydata <- tm_map(
  mydata,
  content_transformer(gsub),
  pattern = "\\W",
  replacement = " "
)

# Remove anything other than alphabetic characters or whitespace.
removeNumPunct <- function(x) gsub("[^[:alpha:][:space:]]*", "", x)
mydata <- tm_map(mydata, content_transformer(removeNumPunct))

# Remove stopwords: the "english" list via stopwords() and the "id" list
# from the stopwords package (stopwords-iso source).
mydata <- tm_map(mydata, removeWords, stopwords("english"))
mydata <- tm_map(
  mydata,
  removeWords,
  stopwords::stopwords("id", source = "stopwords-iso")
)
# You can create custom stop words using the code below.
#myStopwords <- c(setdiff(stopwords('english'), c("r", "big")),"use", "see", "used", "via", "amp")
#mydata <- tm_map(mydata, removeWords, myStopwords)

# Remove extra whitespace.
mydata <- tm_map(mydata, stripWhitespace)
# Remove numbers.
mydata <- tm_map(mydata, removeNumbers)
# Remove punctuation.
mydata <- tm_map(mydata, removePunctuation)

# Stemming.
mydata <- tm_map(mydata, stemDocument)

# Create a term-document matrix and store it as dtm.
dtm <- TermDocumentMatrix(mydata)