Difference between revisions of "Python: NLTK stopwords"
Jump to navigation
Jump to search
Onnowpurbo (talk | contribs) |
Onnowpurbo (talk | contribs) |
||
| Line 86: | Line 86: | ||
00:03 | 00:03 | ||
00:04 | 00:04 | ||
| + | |||
| + | |||
| + | ==Jika sudah ada stopword== | ||
| + | |||
| + | misalnya, | ||
| + | |||
| + | |||
| + | rm ~/nltk_data/corpora/stopwords/indonesia | ||
| + | touch ~/nltk_data/corpora/stopwords/indonesia | ||
| + | cat indonesia-id1 >> ~/nltk_data/corpora/stopwords/indonesia | ||
| + | cat indonesia-angka >> ~/nltk_data/corpora/stopwords/indonesia | ||
| + | cat indonesia-jam >> ~/nltk_data/corpora/stopwords/indonesia | ||
| + | cat indonesia-politik >> ~/nltk_data/corpora/stopwords/indonesia | ||
| + | cat indonesia-stemmped >> ~/nltk_data/corpora/stopwords/indonesia | ||
Revision as of 17:09, 31 January 2017
Directory
~/nltk_data/corpora/stopwords
Script untuk scan apakah stopwords yang kita inginkan bekerja dengan baik
import os,nltk,os.path,re,string
import argparse
from nltk.stem.porter import PorterStemmer
# Shared Porter stemmer instance; the scan loop calls ps.stem() on every token
# before comparing it against the stopword lists.
ps=PorterStemmer()
def parse_args():
    """Parse the command-line options of the stopword scan script.

    Returns:
        argparse.Namespace: namespace with an ``infile`` attribute holding the
        path passed via ``-i``/``--infile`` (empty string when omitted).
    """
    parser = argparse.ArgumentParser()
    # The default value was missing (``default=,``), which is a syntax error.
    # An empty string keeps the option optional.
    parser.add_argument('-i', '--infile', default='', help='input filename')
    return parser.parse_args()
# Read the whole input file named on the command line and split it into
# whitespace-separated tokens.
args = parse_args()
infile = args.infile
# The context manager closes the file even if read() raises.
with open(infile, 'r') as infile_handle:
    fcontent = infile_handle.read()
fs = fcontent.split()

# Fetch each stopword list once, as a set, instead of asking NLTK for the
# full list on every loop iteration.
english_stopwords = set(nltk.corpus.stopwords.words('english'))
indonesia_stopwords = set(nltk.corpus.stopwords.words('indonesia'))

# wordlist keeps the surviving words in first-seen order; seen_words mirrors
# it as a set so the duplicate check does not scan the list.
wordlist = []
seen_words = set()
for word in fs:
    # Normalise: strip surrounding punctuation, lowercase, then stem.
    word = ps.stem(word.strip(string.punctuation).lower())
    # Tokens of 15 characters or more are ignored.
    if len(word) >= 15:
        continue
    if word in english_stopwords or word in indonesia_stopwords:
        continue
    if word not in seen_words:
        seen_words.add(word)
        wordlist.append(word)
        # Print each word that slipped past both stopword lists exactly once.
        print(word)
Masukkan kata-kata yang tidak ingin ada dalam teks ke dalam file
~/nltk_data/corpora/stopwords/indonesia
contoh
saya punya sendiri kami kamu anda dia mereka jika yang itu siapa dengan a b c d e f .. .. 1 2 3 4 5 .. .. 01/1/2017 02/1/2017 03/1/2017 04/1/2017 05/1/2017 .. .. 00:00 00:01 00:02 00:03 00:04
Jika sudah ada stopword
misalnya,
rm ~/nltk_data/corpora/stopwords/indonesia
touch ~/nltk_data/corpora/stopwords/indonesia
cat indonesia-id1 >> ~/nltk_data/corpora/stopwords/indonesia
cat indonesia-angka >> ~/nltk_data/corpora/stopwords/indonesia
cat indonesia-jam >> ~/nltk_data/corpora/stopwords/indonesia
cat indonesia-politik >> ~/nltk_data/corpora/stopwords/indonesia
cat indonesia-stemmped >> ~/nltk_data/corpora/stopwords/indonesia