Difference between revisions of "Python: NLTK stopwords"
		
		
		
		
		
		Jump to navigation
		Jump to search
		
				
		
		
	
| Onnowpurbo (talk | contribs)  (Created page with "Directory   ~/nltk_data/corpora/stopwords") | Onnowpurbo (talk | contribs)  | ||
| Line 2: | Line 2: | ||
|   ~/nltk_data/corpora/stopwords |   ~/nltk_data/corpora/stopwords | ||
| + | |||
| + | |||
| + | Script untuk scan apakah stopwords yang kita inginkan bekerja dengan baik | ||
| + | |||
| + |  import os,nltk,os.path,re,string | ||
| + |  import argparse | ||
| + |  from nltk.stem.porter import PorterStemmer | ||
| + | |||
| + |  ps=PorterStemmer() | ||
| + | |||
| + |  def parse_args(): | ||
| + |      parser = argparse.ArgumentParser() | ||
| + |      parser.add_argument('-i', '--infile', default='', help='input filename') | ||
| + |      return parser.parse_args() | ||
| + | |||
| + |  args = parse_args() | ||
| + |  infile = args.infile | ||
| + | |||
| + |  filename = open(infile,'r') | ||
| + |  fcontent=filename.read() | ||
| + |  filename.close() | ||
| + | |||
| + |  fs = fcontent.split() | ||
| + |  wordlist=[] | ||
| + | |||
| + |  for word in fs: | ||
| + |      word = ps.stem(word.strip(string.punctuation).lower()) | ||
| + |      if word not in nltk.corpus.stopwords.words('english') and len(word)<15: | ||
| + |         if word not in nltk.corpus.stopwords.words('indonesia') and len(word)<15: | ||
| + |             if word not in wordlist: | ||
| + |                 wordlist.append(word) | ||
| + |                 print( word ) | ||
| + |             else: | ||
| + |                 pass | ||
| + |         else: | ||
| + |             pass | ||
| + | |||
| + | |||
| + | Masukan kata2 yang tidak ingin ada dalam text ke dalam file | ||
| + | |||
| + |  ~/nltk_data/corpora/stopwords/indonesia | ||
| + | |||
| + | contoh | ||
| + | |||
| + |  saya | ||
| + |  punya | ||
| + |  sendiri | ||
| + |  kami | ||
| + |  kamu | ||
| + |  anda | ||
| + |  dia | ||
| + |  mereka | ||
| + |  jika | ||
| + |  yang | ||
| + |  itu | ||
| + |  siapa | ||
| + |  dengan | ||
| + |  a | ||
| + |  b | ||
| + |  c | ||
| + |  d | ||
| + |  e | ||
| + |  f | ||
| + |  .. | ||
| + |  .. | ||
| + |  1 | ||
| + |  2 | ||
| + |  3 | ||
| + |  4 | ||
| + |  5 | ||
| + |  .. | ||
| + |  .. | ||
| + |  01/1/2017 | ||
| + |  02/1/2017 | ||
| + |  03/1/2017 | ||
| + |  04/1/2017 | ||
| + |  05/1/2017 | ||
| + |  .. | ||
| + |  .. | ||
| + |  00:00 | ||
| + |  00:01 | ||
| + |  00:02 | ||
| + |  00:03 | ||
| + |  00:04 | ||
Revision as of 06:52, 31 January 2017
Directory
~/nltk_data/corpora/stopwords
Skrip untuk memeriksa apakah stopwords yang kita inginkan bekerja dengan baik:
import os,nltk,os.path,re,string
import argparse
from nltk.stem.porter import PorterStemmer
# Module-level Porter stemmer; the scan loop below calls ps.stem() on every
# token before comparing it against the stopword lists.
ps=PorterStemmer()
def parse_args(argv=None):
    """Parse the command-line options of the stopword-check script.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``, which is
            what a plain ``parse_args()`` call does.

    Returns:
        argparse.Namespace with a single attribute ``infile``: the value
        given via ``-i``/``--infile``, or ``''`` when the option is omitted.
    """
    parser = argparse.ArgumentParser()
    # The default has to be an explicit empty string literal; a bare
    # ``default=`` followed by a comma does not parse.
    parser.add_argument('-i', '--infile', default='', help='input filename')
    return parser.parse_args(argv)
# --- Stopword scan -----------------------------------------------------------
# Reads the file given with -i/--infile, stems every whitespace-separated
# token, and prints each distinct stem that is shorter than 15 characters and
# is found in neither the 'english' nor the 'indonesia' stopword list.
args = parse_args()
infile = args.infile
if not infile:
    # The option defaults to '', and open('') only yields an opaque
    # FileNotFoundError; stop with an actionable message instead.
    raise SystemExit('no input file given; use -i/--infile')

# The context manager closes the handle even if read() raises.
with open(infile, 'r') as fh:
    fcontent = fh.read()

fs = fcontent.split()
wordlist = []

# Load each stopword list once, as a set: calling stopwords.words() inside the
# loop rebuilds the list for every token and then scans it linearly.
english_stopwords = set(nltk.corpus.stopwords.words('english'))
# 'indonesia' is the fileid this script expects under the NLTK stopwords
# corpus directory (~/nltk_data/corpora/stopwords).
indonesia_stopwords = set(nltk.corpus.stopwords.words('indonesia'))

# Mirror of wordlist for O(1) duplicate checks; wordlist keeps the order in
# which stems were first seen.
seen = set()

for word in fs:
    # Normalise: drop surrounding punctuation, lowercase, then stem.
    word = ps.stem(word.strip(string.punctuation).lower())
    if len(word) >= 15:
        continue
    if word in english_stopwords or word in indonesia_stopwords:
        continue
    if word in seen:
        continue
    seen.add(word)
    wordlist.append(word)
    print(word)
Masukkan kata-kata yang tidak ingin ada dalam teks ke dalam file
~/nltk_data/corpora/stopwords/indonesia
contoh
saya
punya
sendiri
kami
kamu
anda
dia
mereka
jika
yang
itu
siapa
dengan
a
b
c
d
e
f
..
..
1
2
3
4
5
..
..
01/1/2017
02/1/2017
03/1/2017
04/1/2017
05/1/2017
..
..
00:00
00:01
00:02
00:03
00:04