Python: File Stemming dengan Sastrawi

From OnnoWiki
Jump to navigation Jump to search
import sys, getopt
import argparse
import os,nltk,os.path,re,string
import argparse
import Sastrawi

from nltk.stem.porter import PorterStemmer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

def parse_args():
    """Parse the command-line options of the stemming script.

    Recognised options:
        -i / --infile   name of the file to read
        -o / --outfile  name of the file to write

    Returns:
        argparse.Namespace: has the string attributes ``infile`` and
        ``outfile``; each is the empty string when its option is omitted.
    """
    parser = argparse.ArgumentParser()
    # Both options are optional; an empty-string default keeps the
    # attributes present (and of type str) even when nothing is passed.
    parser.add_argument('-i', '--infile', default='', help='input filename')
    parser.add_argument('-o', '--outfile', default='', help='output filename')
    return parser.parse_args()

def hanya_huruf(input):
    """Report whether ``input`` matches the letters-only pattern.

    ("hanya huruf" is Indonesian for "letters only".)

    Args:
        input: the string to test.

    Returns:
        bool: True when ``re.match`` finds ``^[a-zA-Z]+$`` in ``input``,
        False otherwise (including for the empty string).
    """
    return re.match('^[a-zA-Z]+$', input) is not None

def main():
    """Stem the words of the input file and write them to the output file.

    The file named by ``--infile`` is read in full and split on whitespace.
    A token is kept only when it passes ``hanya_huruf``, is between 2 and 19
    characters long, and is not the literal ``'Iing'``. Each kept token is
    lower-cased and stemmed with the Sastrawi stemmer; if the resulting stem
    is in neither the NLTK English nor the NLTK Indonesian stopword list it
    is written to the ``--outfile`` file followed by a single space.
    """
    args = parse_args()

    # ``with`` guarantees the handle is closed even if reading fails.
    with open(args.infile, "r") as source:
        tokens = source.read().split()

    stemmer = StemmerFactory().create_stemmer()

    # Load both stopword lists once; a set makes each membership test O(1)
    # instead of re-reading and scanning the lists for every word.
    stopwords = set(nltk.corpus.stopwords.words('english'))
    stopwords.update(nltk.corpus.stopwords.words('indonesian'))

    with open(args.outfile, "w") as sink:
        for word in tokens:
            if not (hanya_huruf(word) and 1 < len(word) < 20
                    and word != 'Iing'):
                continue
            # NOTE(review): hanya_huruf() has already rejected any token that
            # contains punctuation (e.g. "makan,"), so this strip() cannot
            # remove anything. The order is kept to preserve the existing
            # output; confirm whether stripping was meant to happen before
            # the letters-only filter.
            word = word.strip(string.punctuation).lower()
            word = stemmer.stem(word)
            if word not in stopwords:
                sink.write(word)
                # The separator belongs to the word just written, so skipped
                # stopwords leave no stray spaces behind.
                sink.write(" ")
# Run the stemming pipeline.
# NOTE(review): this call is unguarded, so it also runs when the file is
# imported; consider wrapping it in `if __name__ == "__main__":`.
main()