Difference between revisions of "Python: NLTK stopwords"
Jump to navigation
Jump to search
Onnowpurbo (talk | contribs) (Created page with "Directory ~/nltk_data/corpora/stopwords") |
Onnowpurbo (talk | contribs) |
||
| Line 2: | Line 2: | ||
~/nltk_data/corpora/stopwords | ~/nltk_data/corpora/stopwords | ||
| + | |||
| + | |||
| + | Script untuk scan apakah stopwords yang kita inginkan bekerja dengan baik | ||
| + | |||
| + | import os,nltk,os.path,re,string | ||
| + | import argparse | ||
| + | from nltk.stem.porter import PorterStemmer | ||
| + | |||
| + | ps=PorterStemmer() | ||
| + | |||
| + | def parse_args(): | ||
| + | parser = argparse.ArgumentParser() | ||
| + | parser.add_argument('-i', '--infile', default='', help='input filename') | ||
| + | return parser.parse_args() | ||
| + | |||
| + | args = parse_args() | ||
| + | infile = args.infile | ||
| + | |||
| + | filename = open(infile,'r') | ||
| + | fcontent=filename.read() | ||
| + | filename.close() | ||
| + | |||
| + | fs = fcontent.split() | ||
| + | wordlist=[] | ||
| + | |||
| + | for word in fs: | ||
| + | word = ps.stem(word.strip(string.punctuation).lower()) | ||
| + | if word not in nltk.corpus.stopwords.words('english') and len(word)<15: | ||
| + | if word not in nltk.corpus.stopwords.words('indonesia') and len(word)<15: | ||
| + | if word not in wordlist: | ||
| + | wordlist.append(word) | ||
| + | print( word ) | ||
| + | else: | ||
| + | pass | ||
| + | else: | ||
| + | pass | ||
| + | |||
| + | |||
| + | Masukan kata2 yang tidak ingin ada dalam text ke dalam file | ||
| + | |||
| + | ~/nltk_data/corpora/stopwords/indonesia | ||
| + | |||
| + | contoh | ||
| + | |||
| + | saya | ||
| + | punya | ||
| + | sendiri | ||
| + | kami | ||
| + | kamu | ||
| + | anda | ||
| + | dia | ||
| + | mereka | ||
| + | jika | ||
| + | yang | ||
| + | itu | ||
| + | siapa | ||
| + | dengan | ||
| + | a | ||
| + | b | ||
| + | c | ||
| + | d | ||
| + | e | ||
| + | f | ||
| + | .. | ||
| + | .. | ||
| + | 1 | ||
| + | 2 | ||
| + | 3 | ||
| + | 4 | ||
| + | 5 | ||
| + | .. | ||
| + | .. | ||
| + | 01/1/2017 | ||
| + | 02/1/2017 | ||
| + | 03/1/2017 | ||
| + | 04/1/2017 | ||
| + | 05/1/2017 | ||
| + | .. | ||
| + | .. | ||
| + | 00:00 | ||
| + | 00:01 | ||
| + | 00:02 | ||
| + | 00:03 | ||
| + | 00:04 | ||
Revision as of 06:52, 31 January 2017
Directory
~/nltk_data/corpora/stopwords
Script untuk scan apakah stopwords yang kita inginkan bekerja dengan baik
import os,nltk,os.path,re,string
import argparse
from nltk.stem.porter import PorterStemmer
# Shared Porter stemmer; the scan loop calls ps.stem() on every token
# before comparing it against the stop-word lists.
ps=PorterStemmer()
def parse_args(argv=None):
    """Parse the command-line options of the stop-word scan script.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``, so
            existing ``parse_args()`` callers behave as before.

    Returns:
        argparse.Namespace whose ``infile`` attribute holds the value of
        ``-i``/``--infile``, or ``''`` when the option is not given.
    """
    parser = argparse.ArgumentParser()
    # The default has to be an explicit empty string: a bare ``default=``
    # (what the rendered page showed) is a syntax error.
    parser.add_argument('-i', '--infile', default='', help='input filename')
    return parser.parse_args(argv)
# --- Scan an input text and print every distinct, non-stop-word stem ---
args = parse_args()
infile = args.infile

# Read the whole input; the context manager closes the file even if
# reading fails part-way.
with open(infile, 'r') as infile_handle:
    fcontent = infile_handle.read()

fs = fcontent.split()
wordlist = []

# Load both stop-word lists once, as a single set. Calling
# nltk.corpus.stopwords.words() inside the loop would repeat the lookup
# for every token and do a linear list search each time.
stop_words = set(nltk.corpus.stopwords.words('english'))
stop_words.update(nltk.corpus.stopwords.words('indonesia'))

# Mirrors `wordlist` for O(1) duplicate checks; `wordlist` keeps the
# first-seen order.
seen = set()

for word in fs:
    # Normalise: drop surrounding punctuation, lower-case, then stem.
    word = ps.stem(word.strip(string.punctuation).lower())
    # Skip tokens that were nothing but punctuation (empty after strip)
    # and over-long tokens (15 characters or more).
    if not word or len(word) >= 15:
        continue
    if word in stop_words or word in seen:
        continue
    seen.add(word)
    wordlist.append(word)
    print(word)
Masukkan kata-kata yang tidak ingin ada dalam teks ke dalam file
~/nltk_data/corpora/stopwords/indonesia
contoh
saya
punya
sendiri
kami
kamu
anda
dia
mereka
jika
yang
itu
siapa
dengan
a
b
c
d
e
f
..
..
1
2
3
4
5
..
..
01/1/2017
02/1/2017
03/1/2017
04/1/2017
05/1/2017
..
..
00:00
00:01
00:02
00:03
00:04