Difference between revisions of "Python: NLTK stopwords"
Jump to navigation
Jump to search
Onnowpurbo (talk | contribs) (Created page with "Directory ~/nltk_data/corpora/stopwords") |
Onnowpurbo (talk | contribs) |
||
(6 intermediate revisions by the same user not shown) | |||
Line 2: | Line 2: | ||
~/nltk_data/corpora/stopwords | ~/nltk_data/corpora/stopwords | ||
+ | |||
+ | |||
+ | Script untuk scan apakah stopwords yang kita inginkan bekerja dengan baik | ||
+ | |||
+ | import os,nltk,os.path,re,string | ||
+ | import argparse | ||
+ | from nltk.stem.porter import PorterStemmer | ||
+ | |||
+ | ps=PorterStemmer() | ||
+ | |||
+ | def parse_args(): | ||
+ | parser = argparse.ArgumentParser() | ||
+ | parser.add_argument('-i', '--infile', default='', help='input filename') | ||
+ | return parser.parse_args() | ||
+ | |||
+ | args = parse_args() | ||
+ | infile = args.infile | ||
+ | |||
+ | filename = open(infile,'r') | ||
+ | fcontent=filename.read() | ||
+ | filename.close() | ||
+ | |||
+ | fs = fcontent.split() | ||
+ | wordlist=[] | ||
+ | |||
+ | for word in fs: | ||
+ | word = ps.stem(word.strip(string.punctuation).lower()) | ||
+ | if word not in nltk.corpus.stopwords.words('english') and len(word)<15: | ||
+ | if word not in nltk.corpus.stopwords.words('indonesia') and len(word)<15: | ||
+ | if word not in wordlist: | ||
+ | wordlist.append(word) | ||
+ | print( word ) | ||
+ | else: | ||
+ | pass | ||
+ | else: | ||
+ | pass | ||
+ | |||
+ | |||
+ | Masukan kata2 yang tidak ingin ada dalam text ke dalam file | ||
+ | |||
+ | ~/nltk_data/corpora/stopwords/indonesia | ||
+ | |||
+ | contoh | ||
+ | |||
+ | saya | ||
+ | punya | ||
+ | sendiri | ||
+ | kami | ||
+ | kamu | ||
+ | anda | ||
+ | dia | ||
+ | mereka | ||
+ | jika | ||
+ | yang | ||
+ | itu | ||
+ | siapa | ||
+ | dengan | ||
+ | a | ||
+ | b | ||
+ | c | ||
+ | d | ||
+ | e | ||
+ | f | ||
+ | .. | ||
+ | .. | ||
+ | 1 | ||
+ | 2 | ||
+ | 3 | ||
+ | 4 | ||
+ | 5 | ||
+ | .. | ||
+ | .. | ||
+ | 01/1/2017 | ||
+ | 02/1/2017 | ||
+ | 03/1/2017 | ||
+ | 04/1/2017 | ||
+ | 05/1/2017 | ||
+ | .. | ||
+ | .. | ||
+ | 00:00 | ||
+ | 00:01 | ||
+ | 00:02 | ||
+ | 00:03 | ||
+ | 00:04 | ||
+ | |||
+ | |||
+ | ==Jika sudah ada stopword== | ||
+ | |||
+ | misalnya, | ||
+ | |||
+ | rm ~/nltk_data/corpora/stopwords/indonesia | ||
+ | touch ~/nltk_data/corpora/stopwords/indonesia | ||
+ | cat indonesia-id1 >> ~/nltk_data/corpora/stopwords/indonesia | ||
+ | cat indonesia-angka >> ~/nltk_data/corpora/stopwords/indonesia | ||
+ | cat indonesia-jam >> ~/nltk_data/corpora/stopwords/indonesia | ||
+ | cat indonesia-politik >> ~/nltk_data/corpora/stopwords/indonesia | ||
+ | cat indonesia-common >> ~/nltk_data/corpora/stopwords/indonesia | ||
+ | cat indonesia-1common >> ~/nltk_data/corpora/stopwords/indonesia | ||
+ | cat indonesia-tambahan >> ~/nltk_data/corpora/stopwords/indonesia |
Latest revision as of 05:25, 5 February 2017
Directory
~/nltk_data/corpora/stopwords
Script untuk scan apakah stopwords yang kita inginkan bekerja dengan baik
"""Print every unique, stemmed, non-stopword term found in a text file.

Usage: python <this script> -i INPUT_FILE

Used to check that the English and the custom ``indonesia`` NLTK stopword
lists filter the words they are expected to filter: whatever is printed is
what survived both lists.
"""
import argparse
import os
import os.path
import re
import string

# Terms of this many characters or more are dropped.
MAX_WORD_LENGTH = 15


def parse_args():
    """Parse the command line; ``-i/--infile`` names the text file to scan."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--infile', default='', help='input filename')
    return parser.parse_args()


def unique_terms(tokens, stopwords, stem, max_length=MAX_WORD_LENGTH):
    """Yield each distinct term of *tokens* that survives the stopword filter.

    Args:
        tokens: iterable of whitespace-separated words.
        stopwords: container of words to drop (a ``set`` gives O(1) lookups).
        stem: callable mapping a lower-cased word to its stem.
        max_length: terms with ``len(term) >= max_length`` are dropped.

    Yields:
        Terms in order of first appearance, each one only once.
    """
    seen = set()
    for token in tokens:
        # Surrounding punctuation is removed and the word lower-cased BEFORE
        # stemming; the stopword lookup is done on the stem, so a stopword
        # only filters tokens whose stem is equal to it.
        term = stem(token.strip(string.punctuation).lower())
        if not term:
            # Token consisted of punctuation only; nothing worth reporting.
            continue
        if len(term) >= max_length or term in stopwords or term in seen:
            continue
        seen.add(term)
        yield term


def main():
    """Read the input file and print the terms that are not stopwords."""
    # Imported here so the helpers above stay usable without nltk installed.
    import nltk
    from nltk.stem.porter import PorterStemmer

    args = parse_args()
    with open(args.infile, 'r') as infile:
        tokens = infile.read().split()

    # Load both stopword lists once instead of once per token.
    stopwords = set(nltk.corpus.stopwords.words('english'))
    stopwords.update(nltk.corpus.stopwords.words('indonesia'))

    for term in unique_terms(tokens, stopwords, PorterStemmer().stem):
        print(term)


if __name__ == '__main__':
    main()
Masukkan kata-kata yang tidak ingin ada dalam teks ke dalam file
~/nltk_data/corpora/stopwords/indonesia
contoh
saya
punya
sendiri
kami
kamu
anda
dia
mereka
jika
yang
itu
siapa
dengan
a
b
c
d
e
f
..
..
1
2
3
4
5
..
..
01/1/2017
02/1/2017
03/1/2017
04/1/2017
05/1/2017
..
..
00:00
00:01
00:02
00:03
00:04
Jika sudah ada stopword
misalnya,
# Rebuild the custom "indonesia" stopword file from the separate word lists
# in the current directory.
# -f: do not fail when there is no previous stopword file to remove.
rm -f ~/nltk_data/corpora/stopwords/indonesia
touch ~/nltk_data/corpora/stopwords/indonesia
cat indonesia-id1 >> ~/nltk_data/corpora/stopwords/indonesia
cat indonesia-angka >> ~/nltk_data/corpora/stopwords/indonesia
cat indonesia-jam >> ~/nltk_data/corpora/stopwords/indonesia
cat indonesia-politik >> ~/nltk_data/corpora/stopwords/indonesia
cat indonesia-common >> ~/nltk_data/corpora/stopwords/indonesia
cat indonesia-1common >> ~/nltk_data/corpora/stopwords/indonesia
cat indonesia-tambahan >> ~/nltk_data/corpora/stopwords/indonesia