Difference between revisions of "Python: Siapkan stopwords Indonesia"
Jump to navigation
Jump to search
Onnowpurbo (talk | contribs) (Created page with "==Download ID-Stopwords== sudo apt install git git clone https://github.com/masdevid/ID-Stopwords.git File ada di ID-Stopwords/id.stopwords.02.01.2016.txt ==Mencari S...") |
Onnowpurbo (talk | contribs) |
||
Line 1: | Line 1: | ||
+ | ini sudah tidak perlu di siapkan. | ||
+ | NLTK sudah siap dengan stopwords indonesian | ||
+ | |||
+ | ~/nltk_data/corpora/stopwords/indonesian | ||
+ | |||
+ | |||
==Download ID-Stopwords== | ==Download ID-Stopwords== | ||
Latest revision as of 11:13, 30 October 2018
Ini sudah tidak perlu disiapkan. NLTK sudah siap dengan stopwords indonesian.
~/nltk_data/corpora/stopwords/indonesian
Download ID-Stopwords
sudo apt install git
git clone https://github.com/masdevid/ID-Stopwords.git
File ada di
ID-Stopwords/id.stopwords.02.01.2016.txt
Mencari Stopwords Sendiri
"""List infrequent, non-stopword words found in a text file.

Usage: python script.py -i INPUT_FILE

The input is lowercased and split into ``\\w+`` tokens. A token is printed
when it consists only of ASCII letters, is longer than one character, occurs
fewer than three times, and is in neither the English nor the Indonesian
NLTK stopword list.
"""
# NOTE: os and string are carried over from the original listing even though
# this script no longer uses them.
import argparse
import os
import os.path
import re
import string
from collections import Counter

_LETTERS_ONLY = re.compile('^[a-zA-Z]+$')

# Words must occur fewer than this many times to be reported.
_FREQUENCY_CUTOFF = 3


def hanya_huruf(input):
    """Return True when *input* consists solely of ASCII letters."""
    return _LETTERS_ONLY.match(input) is not None


def parse_args():
    """Parse the command line and return the resulting namespace.

    The only option is ``-i/--infile``, the path of the text file to scan.
    """
    parser = argparse.ArgumentParser()
    # The wiki markup swallowed the empty-string literal, leaving ``default=,``.
    parser.add_argument('-i', '--infile', default='', help='input filename')
    return parser.parse_args()


def load_stopwords():
    """Return the combined English and Indonesian NLTK stopwords as a set."""
    # Imported here so the pure-Python helpers stay usable without NLTK.
    import nltk

    known = set(nltk.corpus.stopwords.words('english'))
    # The NLTK corpus file is named "indonesian", not "indonesia".
    known.update(nltk.corpus.stopwords.words('indonesian'))
    return known


def candidate_words(text, stopwords):
    """Return the infrequent non-stopword words of *text*.

    *text* is lowercased and tokenised with ``\\w+``. *stopwords* is any
    container supporting ``in``. Words are returned in the order the
    frequency counter yields them.
    """
    frequencies = Counter(re.findall(r'\w+', text.lower()))
    selected = []
    for word, count in frequencies.items():
        if count >= _FREQUENCY_CUTOFF or len(word) <= 1:
            continue
        # Tokens are already lowercase, so the excluded name must be too;
        # the original compared against 'Iing', which could never match.
        if word == 'iing' or not hanya_huruf(word):
            continue
        if word in stopwords:
            continue
        selected.append(word)
    return selected


def main():
    """Read the input file and print one candidate word per line."""
    args = parse_args()
    with open(args.infile) as handle:
        text = handle.read()
    # Load the stopword lists once instead of once per word.
    for word in candidate_words(text, load_stopwords()):
        print(word)


if __name__ == '__main__':
    main()