Difference between revisions of "Python: cari-stopwords-common.py"

From OnnoWiki
Jump to navigation Jump to search
(Created page with " import os,nltk,os.path,re,string import argparse from nltk.stem.porter import PorterStemmer from collections import Counter import re ps=PorterStemmer() def hanya_h...")
 
 
Line 21: Line 21:
 
  args = parse_args()
 
  args = parse_args()
 
  infile = args.infile
 
  infile = args.infile
 +
 
 +
words = re.findall(r'\w+', open(infile).read().lower())
 +
wordcommon = Counter(words).most_common(400)
 
   
 
   
  words = re.findall(r'\w+', open(infile).read().lower())
+
  for word, val in wordcommon:
wordcommon = Counter(words).most_common(300)
+
    if hanya_huruf(word) and len(word)>1 and word!='Iing' :
print wordcommon
+
        word = word.strip(string.punctuation).lower()
 +
        if word not in nltk.corpus.stopwords.words('english'):
 +
          if word not in nltk.corpus.stopwords.words('indonesia'):
 +
              print word
 +
 
 +
~

Latest revision as of 10:07, 5 February 2017

import os,nltk,os.path,re,string
import argparse
from nltk.stem.porter import PorterStemmer
from collections import Counter
import re

ps=PorterStemmer()

def hanya_huruf( input ):
    """Return True when *input* consists solely of ASCII letters.

    "hanya huruf" is Indonesian for "letters only".

    Args:
        input: The string to test.

    Returns:
        True if every character is in a-z or A-Z and the string is
        non-empty; False otherwise (including for the empty string).
    """
    # \Z anchors at the true end of the string. '$' would also match just
    # before a trailing newline and so wrongly accept a word followed by one.
    return re.match(r'[a-zA-Z]+\Z', input) is not None

def parse_args():
    """Parse the command-line options of the script.

    Returns:
        An argparse.Namespace whose ``infile`` attribute holds the value
        given with -i/--infile, or the empty string when the option is
        omitted.
    """
    parser = argparse.ArgumentParser()
    # The default must be an explicit empty-string literal; a bare
    # ``default=,`` is a syntax error and prevents the module from loading.
    parser.add_argument('-i', '--infile', default='', help='input filename')
    return parser.parse_args()

# Script body: print the most frequent words of the input file that are
# neither English nor Indonesian stopwords, one per line.
args = parse_args()
infile = args.infile

# Read the whole input once; the context manager closes the file handle
# instead of leaving it open for the rest of the run.
with open(infile) as source:
    words = re.findall(r'\w+', source.read().lower())
wordcommon = Counter(words).most_common(400)

# Fetch each stopword list a single time and keep it as a set, rather than
# asking NLTK for the list again on every loop iteration and scanning it
# linearly for each membership test.
# NOTE(review): stock NLTK names its Indonesian list 'indonesian'; the
# 'indonesia' fileid is kept unchanged on the assumption that a custom list
# with that name is installed -- confirm.
stopwords_english = set(nltk.corpus.stopwords.words('english'))
stopwords_indonesia = set(nltk.corpus.stopwords.words('indonesia'))

for word, _count in wordcommon:
    # NOTE(review): the text was lower-cased above, so the comparison with
    # 'Iing' (capital I) can never be equal; presumably 'iing' was meant --
    # confirm before changing, since it would alter the output.
    if not (hanya_huruf(word) and len(word) > 1 and word != 'Iing'):
        continue
    word = word.strip(string.punctuation).lower()
    if word in stopwords_english or word in stopwords_indonesia:
        continue
    # Parenthesised single-argument form prints identically on Python 2
    # and is also valid on Python 3.
    print(word)

~