Difference between revisions of "NLTK: membuang kata yang tidak penting dan jarang dipakai"
Jump to navigation
Jump to search
Onnowpurbo (talk | contribs) |
Onnowpurbo (talk | contribs) |
||
Line 26: | Line 26: | ||
parser = argparse.ArgumentParser() | parser = argparse.ArgumentParser() | ||
parser.add_argument('-i', '--infile', default='', help='input filename') | parser.add_argument('-i', '--infile', default='', help='input filename') | ||
− | return parser.parse_args() | + | return parser.parse_args() |
args = parse_args() | args = parse_args() | ||
Line 34: | Line 34: | ||
wordfreqs = Counter(words) | wordfreqs = Counter(words) | ||
for word, count in wordfreqs.items(): | for word, count in wordfreqs.items(): | ||
− | if count < 10: | + | if hanya_huruf(word) and len(word)>1 and word!='Iing' and count<10 : |
− | + | word = word.strip(string.punctuation).lower() | |
− | + | if word not in nltk.corpus.stopwords.words('english'): | |
+ | if word not in nltk.corpus.stopwords.words('indonesia'): | ||
+ | print word | ||
Cara pakainya | Cara pakainya | ||
python cari-stopwords-freqs.py -i hasiltwitsearch.txt > tambahan-untuk-stopwords-indonesia.txt | python cari-stopwords-freqs.py -i hasiltwitsearch.txt > tambahan-untuk-stopwords-indonesia.txt |
Latest revision as of 10:10, 5 February 2017
Untuk membuang kata-kata yang tidak penting, langkah-langkahnya:
- cetak kata yang frekuensi penggunaannya kecil, misalnya <10 (batas yang dipakai script di bawah)
- masukkan kata-kata tersebut ke dalam daftar stopwords di corpus, misalnya:
~/nltk_data/corpora/stopwords/indonesia
Cara yang sederhana adalah menggunakan script Python berikut:
"""Print rarely used words as candidates for an additional stopword list.

The input text is lower-cased and split into words; every word that occurs
fewer than RARE_THRESHOLD times, is made of ASCII letters only, is longer
than one character and is not already an NLTK stopword is printed on its
own line.

Usage:
    python cari-stopwords-freqs.py -i hasiltwitsearch.txt > tambahan-untuk-stopwords-indonesia.txt
"""
import argparse
import io
import re
import string
from collections import Counter

# A word is reported only when it occurs fewer than this many times.
RARE_THRESHOLD = 10

# Stopword lists consulted by default. 'indonesia' is the user-maintained
# list at ~/nltk_data/corpora/stopwords/indonesia that this script feeds.
STOPWORD_LISTS = ('english', 'indonesia')

# Compiled once instead of on every call to hanya_huruf().
_HANYA_HURUF = re.compile('^[a-zA-Z]+$')


def hanya_huruf(input):
    """Return True when *input* consists only of ASCII letters (a-z, A-Z).

    The parameter name shadows the builtin ``input``; it is kept unchanged
    so existing keyword callers keep working.
    """
    return _HANYA_HURUF.match(input) is not None


def parse_args(argv=None):
    """Parse the command line and return the argparse namespace.

    Args:
        argv: optional list of argument strings; when None, argparse reads
            sys.argv[1:] exactly as the original zero-argument call did.

    Returns:
        Namespace with ``infile`` (input filename, '' when not given).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--infile', default='', help='input filename')
    return parser.parse_args(argv)


def count_words(text):
    """Return a Counter of the lower-cased ``\\w+`` tokens found in *text*."""
    return Counter(re.findall(r'\w+', text.lower()))


def load_stopwords(languages=STOPWORD_LISTS):
    """Return the union of the NLTK stopword lists named in *languages*.

    The lists are read once and merged into a set, so each membership test
    is a hash lookup instead of re-reading the corpus for every word.
    """
    # Imported here so the pure helpers above stay usable (and testable)
    # on machines where NLTK is not installed.
    from nltk.corpus import stopwords

    known = set()
    for language in languages:
        known.update(stopwords.words(language))
    return known


def rare_words(wordfreqs, stopwords, threshold=RARE_THRESHOLD):
    """Yield the stopword candidates found in *wordfreqs*.

    Args:
        wordfreqs: mapping of word -> number of occurrences.
        stopwords: container of words that must not be reported.
        threshold: only words with a count strictly below this are yielded.

    Yields:
        Each qualifying word, in the iteration order of *wordfreqs*.
    """
    for word, count in wordfreqs.items():
        # NOTE(review): count_words() lower-cases its input, so the
        # comparison with 'Iing' can never be false for words coming from
        # main(); it is kept verbatim because the intent is unclear
        # (possibly 'iing' was meant) - confirm with the author.
        if not (hanya_huruf(word) and len(word) > 1
                and word != 'Iing' and count < threshold):
            continue
        word = word.strip(string.punctuation).lower()
        if word not in stopwords:
            yield word


def main():
    """Read the input file and print one candidate word per line."""
    args = parse_args()
    if not args.infile:
        # Without this check open('') fails with an unhelpful traceback.
        raise SystemExit('error: no input file given, use -i FILENAME')

    # io.open behaves the same on Python 2 and 3. Undecodable bytes are
    # replaced rather than raising; only ASCII-letter words are ever
    # printed, so the replacement characters never reach the output.
    with io.open(args.infile, encoding='utf-8', errors='replace') as handle:
        wordfreqs = count_words(handle.read())

    for word in rare_words(wordfreqs, load_stopwords()):
        print(word)


if __name__ == '__main__':
    main()
Cara pakainya
python cari-stopwords-freqs.py -i hasiltwitsearch.txt > tambahan-untuk-stopwords-indonesia.txt