Difference between revisions of "Python: cari-stopwords-common.py"
Jump to navigation
Jump to search
Onnowpurbo (talk | contribs) (Created page with " import os,nltk,os.path,re,string import argparse from nltk.stem.porter import PorterStemmer from collections import Counter import re ps=PorterStemmer() def hanya_h...") |
Onnowpurbo (talk | contribs) |
||
| Line 21: | Line 21: | ||
args = parse_args() | args = parse_args() | ||
infile = args.infile | infile = args.infile | ||
| + | |||
| + | words = re.findall(r'\w+', open(infile).read().lower()) | ||
| + | wordcommon = Counter(words).most_common(400) | ||
| − | + | for word, val in wordcommon: | |
| − | + | if hanya_huruf(word) and len(word)>1 and word!='Iing' : | |
| − | + | word = word.strip(string.punctuation).lower() | |
| + | if word not in nltk.corpus.stopwords.words('english'): | ||
| + | if word not in nltk.corpus.stopwords.words('indonesia'): | ||
| + | print word | ||
| + | |||
| + | ~ | ||
Latest revision as of 10:07, 5 February 2017
import os,nltk,os.path,re,string
import argparse
from nltk.stem.porter import PorterStemmer
from collections import Counter
import re
# Module-level Porter stemmer instance; nothing in the visible part of this
# script references it afterwards.
ps=PorterStemmer()
def hanya_huruf(input):
    r"""Return True when *input* consists solely of ASCII letters.

    "hanya huruf" is Indonesian for "letters only".

    The pattern is anchored with ``\Z`` instead of ``$`` because ``$`` also
    matches just before a trailing newline, which made a value such as
    ``"abc\n"`` pass the check.

    Args:
        input: The string to test. The name shadows the builtin but is kept
            unchanged so that keyword callers keep working.

    Returns:
        bool: True for a non-empty string made only of ``a-z`` / ``A-Z``,
        False otherwise.
    """
    return re.match(r'^[a-zA-Z]+\Z', input) is not None
def parse_args(argv=None):
    """Parse the command-line options of the script.

    Args:
        argv: Optional list of argument strings. When None (the default),
            argparse reads the arguments from ``sys.argv[1:]``, which is what
            a call without arguments has always done.

    Returns:
        argparse.Namespace: Namespace with a single attribute ``infile``,
        the input filename (empty string when ``-i/--infile`` is not given).
    """
    parser = argparse.ArgumentParser()
    # The default is the empty string; the original literal was lost
    # (`default=,`), which made the whole file a syntax error.
    parser.add_argument('-i', '--infile', default='', help='input filename')
    return parser.parse_args(argv)
# Script body: print the most frequent words of the input file that are
# neither English nor Indonesian stopwords, one word per line.
args = parse_args()
infile = args.infile

# Read the whole file through a context manager so the handle is closed as
# soon as the text has been tokenised.
with open(infile) as handle:
    words = re.findall(r'\w+', handle.read().lower())

# Only the 400 most frequent tokens are considered.
wordcommon = Counter(words).most_common(400)

# Load each stopword list once, as a set, instead of re-reading it from the
# corpus for every candidate word inside the loop.
# NOTE(review): the stock NLTK stopwords corpus names its file 'indonesian';
# 'indonesia' presumably refers to a locally installed list - confirm.
stopwords_english = set(nltk.corpus.stopwords.words('english'))
stopwords_indonesia = set(nltk.corpus.stopwords.words('indonesia'))

for word, _count in wordcommon:
    # NOTE(review): every token is already lower-cased, so the comparison
    # with 'Iing' can never exclude anything - confirm whether 'iing' was
    # the word meant to be skipped.
    if hanya_huruf(word) and len(word) > 1 and word != 'Iing':
        # No further strip()/lower() is needed here: the token is already
        # lower-case and hanya_huruf() has confirmed it holds only letters.
        if word not in stopwords_english and word not in stopwords_indonesia:
            # Function-call form works on both Python 2 and Python 3.
            print(word)