Python: cari-stopwords.py

From OnnoWiki
Jump to navigation Jump to search

Install dulu

pip install nltk
python -m nltk.downloader stopwords

Source code

import os,nltk,os.path,re,string
import argparse
from nltk.stem.porter import PorterStemmer

# Shared Porter stemmer instance; the main loop below calls ps.stem() on every accepted word.
ps=PorterStemmer() 

def hanya_huruf(input):
    """Return True when *input* is made up only of ASCII letters.

    ("hanya huruf" is Indonesian for "letters only".)

    The whole string must match ``^[a-zA-Z]+$``: one or more characters from
    a-z / A-Z. An empty string, digits, punctuation or non-ASCII letters all
    yield False. Because ``$`` also matches just before a final newline, a
    single trailing ``"\\n"`` is tolerated.
    """
    return re.match('^[a-zA-Z]+$', input) is not None

def parse_args(argv=None):
    """Parse the command-line options of the script.

    Args:
        argv: Optional list of argument strings to parse. When None (the
            default), argparse reads them from ``sys.argv[1:]``, which is
            what a plain ``parse_args()`` call does.

    Returns:
        An ``argparse.Namespace`` whose ``infile`` attribute holds the value
        given with ``-i`` / ``--infile``, or ``''`` when the option is absent.
    """
    parser = argparse.ArgumentParser()
    # The default had lost its value ("default=,"), which is a syntax error;
    # an empty string is restored as the fallback.
    parser.add_argument('-i', '--infile', default='', help='input filename')
    return parser.parse_args(argv)

# --- Main script: print the unique stemmed non-stop-words of the input file ---

args = parse_args()
infile = args.infile

# Read the whole input; the context manager closes the file even if
# read() raises.
with open(infile, 'r') as source:
    fcontent = source.read()

# Whitespace-separated tokens of the input text.
fs = fcontent.split()

# NOTE(review): stock NLTK appears to name its Indonesian list 'indonesian';
# the id below is kept exactly as written in case a custom corpus file is
# installed under this name. Confirm before changing.
INDONESIAN_STOPWORDS_ID = 'indonesia'

# Load both stop-word lists once, into a set. Calling stopwords.words() for
# every token repeats the same work, and list membership is a linear scan.
stop_words = set(nltk.corpus.stopwords.words('english'))
stop_words.update(nltk.corpus.stopwords.words(INDONESIAN_STOPWORDS_ID))

wordlist = []   # unique stems, in order of first appearance
seen = set()    # same stems, for O(1) duplicate checks

for word in fs:
    # Keep only purely alphabetic tokens of 2..14 characters; 'Iing' is a
    # hard-coded exclusion (presumably a name in the author's data).
    if not (hanya_huruf(word) and 1 < len(word) < 15 and word != 'Iing'):
        continue

    # Echo the raw token before it is normalised.
    print(word)

    # strip() is effectively a no-op here because the token is already
    # letters-only; it is kept so the normalisation pipeline is unchanged.
    word = ps.stem(word.strip(string.punctuation).lower())

    if word in stop_words or word in seen:
        continue

    seen.add(word)
    wordlist.append(word)
    print(word)