Python: File Stemming dengan Sastrawi

From OnnoWiki
Revision as of 11:17, 30 October 2018 by Onnowpurbo (talk | contribs) (Created page with " import sys, getopt import argparse import os,nltk,os.path,re,string import argparse import Sastrawi from nltk.stem.porter import PorterStemmer from Sastrawi.Stemmer.S...")
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)
Jump to navigation Jump to search
import sys, getopt
import argparse
import os,nltk,os.path,re,string
import argparse
import Sastrawi

from nltk.stem.porter import PorterStemmer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

def parse_args():
    """Parse the command-line options of the stemming script.

    Returns:
        argparse.Namespace with two string attributes:
        ``infile`` (``-i`` / ``--infile``) and ``outfile``
        (``-o`` / ``--outfile``). Both default to the empty string when
        the option is not given.
    """
    parser = argparse.ArgumentParser()
    # The defaults must be explicit empty strings; a bare ``default=,`` is
    # a syntax error.
    parser.add_argument('-i', '--infile', default='', help='input filename')
    parser.add_argument('-o', '--outfile', default='', help='output filename')
    return parser.parse_args()

def hanya_huruf(input):
    """Tell whether *input* consists only of ASCII letters.

    ("hanya huruf" is Indonesian for "letters only".)

    Returns:
        True when the regular expression ``^[a-zA-Z]+$`` matches *input*,
        False otherwise (including for the empty string).
    """
    return re.match('^[a-zA-Z]+$', input) is not None

def main():
    """Stem the words of the input file and write them to the output file.

    The file named by ``--infile`` is read completely and split on
    whitespace. A token is kept only if ``hanya_huruf`` accepts it, its
    length is between 2 and 19 characters, and it is not the literal
    ``'Iing'``. Each kept token is stripped of surrounding punctuation,
    lower-cased and stemmed with the Sastrawi stemmer. Stems that appear in
    neither the NLTK ``english`` nor the NLTK ``indonesian`` stopword list
    are written to the file named by ``--outfile``, each followed by a
    single space.
    """
    args = parse_args()

    # Context managers guarantee the handles are closed even on error.
    with open(args.infile, "r") as f:
        words = f.read().split()

    stemmer = StemmerFactory().create_stemmer()

    # The stopword lists do not change between words, so load them once
    # into a set instead of re-reading both lists for every token.
    stopwords = set(nltk.corpus.stopwords.words('english'))
    stopwords.update(nltk.corpus.stopwords.words('indonesian'))

    with open(args.outfile, "w") as f:
        for word in words:
            if not (hanya_huruf(word) and 1 < len(word) < 20
                    and word != 'Iing'):
                continue
            word = stemmer.stem(word.strip(string.punctuation).lower())
            if word not in stopwords:
                # One trailing space separates consecutive stems.
                f.write(word)
                f.write(" ")
# Run the program only when executed as a script, not when imported.
if __name__ == "__main__":
    main()