Difference between revisions of "Python: Siapkan stopwords Indonesia"

From OnnoWiki
Jump to navigation Jump to search
(Created page with "==Download ID-Stopwords== sudo apt install git git clone https://github.com/masdevid/ID-Stopwords.git File ada di ID-Stopwords/id.stopwords.02.01.2016.txt ==Mencari S...")
 
 
Line 1: Line 1:
 +
Ini sudah tidak perlu disiapkan.
 +
NLTK sudah siap dengan stopwords indonesian
 +
 +
~/nltk_data/corpora/stopwords/indonesian
 +
 +
 
==Download ID-Stopwords==
 
==Download ID-Stopwords==
  

Latest revision as of 11:13, 30 October 2018

Ini sudah tidak perlu disiapkan. NLTK sudah siap dengan stopwords indonesian

~/nltk_data/corpora/stopwords/indonesian


Download ID-Stopwords

sudo apt install git
git clone https://github.com/masdevid/ID-Stopwords.git

File ada di

ID-Stopwords/id.stopwords.02.01.2016.txt


Mencari Stopwords Sendiri

import os,nltk,os.path,re,string
import argparse, re
from collections import Counter

def hanya_huruf(input):
    """Return True if *input* consists solely of ASCII letters.

    Args:
        input: the string to check.

    Returns:
        True when every character is in a-z or A-Z and the string is
        non-empty; False otherwise.
    """
    # \Z anchors at the true end of the string. A plain `$` would also
    # match just before a trailing newline and wrongly accept "abc\n".
    return re.match(r'^[a-zA-Z]+\Z', input) is not None

def parse_args():
    """Parse the script's command-line options.

    Returns:
        argparse.Namespace whose ``infile`` attribute holds the value of
        ``-i/--infile`` (an empty string when the option is omitted).
    """
    parser = argparse.ArgumentParser()
    # The default value was missing (`default=,` is a syntax error).
    # NOTE(review): presumably the original was the empty string '' and the
    # wiki markup swallowed the two quotes -- confirm.
    parser.add_argument('-i', '--infile', default='', help='input filename')
    return parser.parse_args()

# Script body: print every word of the input file that occurs fewer than
# three times, is made of letters only, is longer than one character, and
# is not an English or Indonesian NLTK stopword.
args = parse_args()
infile = args.infile

# Read the whole input once; the context manager closes the file handle.
with open(infile) as f:
    words = re.findall(r'\w+', f.read().lower())
wordfreqs = Counter(words)

# Load both stopword lists once, as a set, instead of re-reading them for
# every candidate word. The NLTK fileid is 'indonesian' (not 'indonesia').
known_stopwords = set(nltk.corpus.stopwords.words('english'))
known_stopwords.update(nltk.corpus.stopwords.words('indonesian'))

for word, count in wordfreqs.items():
    # Words are already lower-cased, so the excluded name must be compared
    # in lower case too; comparing against 'Iing' could never match.
    if hanya_huruf(word) and len(word) > 1 and word != 'iing' and count < 3:
        # No punctuation stripping / lower-casing needed here: the word is
        # already lower-case and letters-only at this point.
        if word not in known_stopwords:
            print(word)


Pranala Menarik