Difference between revisions of "Python: NLTK cleaning text"

From OnnoWiki
Jump to navigation Jump to search
(Created page with " Cleaning Text 01 May 2016 Python Data Wrangling Create some raw text # Create a list of three strings. incoming_reports = ["We are attacking on their left flank but a...")
 
 
Line 6: Line 6:
 
Create some raw text
 
Create some raw text
  
# Create a list of three strings.
+
# Create a list of three strings.
incoming_reports = ["We are attacking on their left flank but are losing many men.",  
+
incoming_reports = ["We are attacking on their left flank but are losing many men.",  
              "We cannot see the enemy army. Nothing else to report.",  
+
                "We cannot see the enemy army. Nothing else to report.",  
              "We are ready to attack but are waiting for your orders."]
+
                "We are ready to attack but are waiting for your orders."]
  
 
Seperate by word
 
Seperate by word
  
# import word tokenizer
+
# import word tokenizer
from nltk.tokenize import word_tokenize
+
from nltk.tokenize import word_tokenize
 +
 +
# Apply word_tokenize to each element of the list called incoming_reports
 +
tokenized_reports = [word_tokenize(report) for report in incoming_reports]
 +
 +
# View tokenized_reports
 +
tokenized_reports
 +
 +
[['We',
 +
  'are',
 +
  'attacking',
 +
  'on',
 +
  'their',
 +
  'left',
 +
  'flank',
 +
  'but',
 +
  'are',
 +
  'losing',
 +
  'many',
 +
  'men',
 +
  '.'],
 +
  ['We',
 +
  'can',
 +
  'not',
 +
  'see',
 +
  'the',
 +
  'enemy',
 +
  'army',
 +
  '.',
 +
  'Nothing',
 +
  'else',
 +
  'to',
 +
  'report',
 +
  '.'],
 +
  ['We',
 +
  'are',
 +
  'ready',
 +
  'to',
 +
  'attack',
 +
  'but',
 +
  'are',
 +
  'waiting',
 +
  'for',
 +
  'your',
 +
  'orders',
 +
  '.']]
 +
 +
# Import regex
 +
import re
 +
 +
# Import string
 +
import string
 +
 
 +
 +
regex = re.compile('[%s]' % re.escape(string.punctuation)) #see documentation here: http://docs.python.org/2/library/string.html
 +
 +
tokenized_reports_no_punctuation = []
 +
 +
for review in tokenized_reports:
 +
 +
    new_review = []
 +
    for token in review:
 +
        new_token = regex.sub(u'', token)
 +
        if not new_token == u'':
 +
            new_review.append(new_token)
 +
 +
    tokenized_reports_no_punctuation.append(new_review)
 +
 +
tokenized_reports_no_punctuation
  
# Apply word_tokenize to each element of the list called incoming_reports
+
[['We',
tokenized_reports = [word_tokenize(report) for report in incoming_reports]
+
  'are',
 
+
  'attacking',
# View tokenized_reports
+
  'on',
tokenized_reports
+
  'their',
 
+
  'left',
[['We',
+
  'flank',
  'are',
+
  'but',
  'attacking',
+
  'are',
  'on',
+
  'losing',
  'their',
+
  'many',
  'left',
+
  'men'],
  'flank',
+
   ['We',
  'but',
+
  'can',
  'are',
+
  'not',
  'losing',
+
  'see',
  'many',
+
  'the',
  'men',
+
  'enemy',
   '.'],
+
  'army',
['We',
+
  'Nothing',
  'can',
+
  'else',
  'not',
+
  'to',
  'see',
+
  'report'],
  'the',
+
   ['We',
  'enemy',
+
  'are',
  'army',
+
  'ready',
  '.',
+
  'to',
  'Nothing',
+
  'attack',
  'else',
+
  'but',
  'to',
+
  'are',
  'report',
+
  'waiting',
  '.'],
+
  'for',
['We',
+
  'your',
  'are',
+
  'orders']]  
  'ready',
 
  'to',
 
   'attack',
 
  'but',
 
  'are',
 
  'waiting',
 
  'for',
 
  'your',
 
  'orders',
 
  '.']]
 
 
 
# Import regex
 
import re
 
 
 
# Import string
 
import string
 
 
 
 
 
regex = re.compile('[%s]' % re.escape(string.punctuation)) #see documentation here: http://docs.python.org/2/library/string.html
 
 
 
tokenized_reports_no_punctuation = []
 
 
 
for review in tokenized_reports:
 
 
 
    new_review = []
 
    for token in review:
 
        new_token = regex.sub(u'', token)
 
        if not new_token == u'':
 
            new_review.append(new_token)
 
 
 
    tokenized_reports_no_punctuation.append(new_review)
 
 
 
tokenized_reports_no_punctuation
 
 
 
[['We',
 
  'are',
 
  'attacking',
 
  'on',
 
  'their',
 
  'left',
 
  'flank',
 
  'but',
 
  'are',
 
  'losing',
 
  'many',
 
  'men'],
 
['We',
 
  'can',
 
  'not',
 
  'see',
 
  'the',
 
  'enemy',
 
  'army',
 
  'Nothing',
 
  'else',
 
  'to',
 
  'report'],
 
['We',
 
  'are',
 
  'ready',
 
  'to',
 
  'attack',
 
  'but',
 
  'are',
 
  'waiting',
 
  'for',
 
  'your',
 
  'orders']]
 
  
 
Remove filler words
 
Remove filler words
 +
 +
from nltk.corpus import stopwords
 +
 +
tokenized_reports_no_stopwords = []
 +
for report in tokenized_reports_no_punctuation:
 +
    new_term_vector = []
 +
    for word in report:
 +
        if not word in stopwords.words('english'):
 +
            new_term_vector.append(word)
 +
    tokenized_reports_no_stopwords.append(new_term_vector)
 +
 +
tokenized_reports_no_stopwords
  
from nltk.corpus import stopwords
+
[['We', 'attacking', 'left', 'flank', 'losing', 'many', 'men'],
 
+
  ['We', 'see', 'enemy', 'army', 'Nothing', 'else', 'report'],
tokenized_reports_no_stopwords = []
+
  ['We', 'ready', 'attack', 'waiting', 'orders']]
for report in tokenized_reports_no_punctuation:
 
    new_term_vector = []
 
    for word in report:
 
        if not word in stopwords.words('english'):
 
            new_term_vector.append(word)
 
    tokenized_reports_no_stopwords.append(new_term_vector)
 
 
 
tokenized_reports_no_stopwords
 
 
 
[['We', 'attacking', 'left', 'flank', 'losing', 'many', 'men'],
 
['We', 'see', 'enemy', 'army', 'Nothing', 'else', 'report'],
 
['We', 'ready', 'attack', 'waiting', 'orders']]
 
  
  

Latest revision as of 15:49, 5 February 2017

Cleaning Text

   01 May 2016 Python Data Wrangling 

Create some raw text

# Sample raw text: three short field reports, one string per report.
incoming_reports = [
    "We are attacking on their left flank but are losing many men.",
    "We cannot see the enemy army. Nothing else to report.",
    "We are ready to attack but are waiting for your orders.",
]

Separate by word

# import word tokenizer
from nltk.tokenize import word_tokenize

# Tokenize every report: word_tokenize is mapped over incoming_reports,
# producing one list of tokens per report (a list of lists).
tokenized_reports = list(map(word_tokenize, incoming_reports))

# Display the result (notebook-style bare expression).
tokenized_reports

[['We',
  'are',
  'attacking',
  'on',
  'their',
  'left',
  'flank',
  'but',
  'are',
  'losing',
  'many',
  'men',
  '.'],
 ['We',
  'can',
  'not',
  'see',
  'the',
  'enemy',
  'army',
  '.',
  'Nothing',
  'else',
  'to',
  'report',
  '.'],
 ['We',
  'are',
  'ready',
  'to',
  'attack',
  'but',
  'are',
  'waiting',
  'for',
  'your',
  'orders',
  '.']] 

# Import regex
import re

# Import string
import string
  

# Character class that matches any single character of string.punctuation.
# re.escape keeps characters that are special inside a class (such as ']'
# and '\') literal. See the `string` module documentation for the exact set.
regex = re.compile('[%s]' % re.escape(string.punctuation))

# One punctuation-free token list per report, in the same order as
# tokenized_reports.
tokenized_reports_no_punctuation = []

for review in tokenized_reports:

    new_review = []
    for token in review:
        # Delete every punctuation character from the token. The replacement
        # must be the empty string u'' (the quotes had been lost, leaving a
        # bare, undefined name `u`).
        new_token = regex.sub(u'', token)
        # A token made only of punctuation (e.g. '.') is now empty: drop it.
        if new_token:
            new_review.append(new_token)

    tokenized_reports_no_punctuation.append(new_review)

# Display the result (notebook-style bare expression).
tokenized_reports_no_punctuation
[['We',
  'are',
  'attacking',
  'on',
  'their',
  'left',
  'flank',
  'but',
  'are',
  'losing',
  'many',
  'men'],
 ['We',
  'can',
  'not',
  'see',
  'the',
  'enemy',
  'army',
  'Nothing',
  'else',
  'to',
  'report'],
 ['We',
  'are',
  'ready',
  'to',
  'attack',
  'but',
  'are',
  'waiting',
  'for',
  'your',
  'orders']] 

Remove filler words

from nltk.corpus import stopwords

# Fetch the English stop-word list once, instead of once per word, and keep
# it in a set so each membership test is a hash lookup rather than a scan.
english_stopwords = set(stopwords.words('english'))

# One filtered token list per report, in the same order as
# tokenized_reports_no_punctuation.
tokenized_reports_no_stopwords = []
for report in tokenized_reports_no_punctuation:
    # Keep only the words that are not in the stop-word set. The match is
    # exact (case-sensitive), so a capitalised token such as 'We' is kept,
    # as the sample output below shows.
    new_term_vector = [word for word in report if word not in english_stopwords]
    tokenized_reports_no_stopwords.append(new_term_vector)

# Display the result (notebook-style bare expression).
tokenized_reports_no_stopwords
[['We', 'attacking', 'left', 'flank', 'losing', 'many', 'men'],
 ['We', 'see', 'enemy', 'army', 'Nothing', 'else', 'report'],
 ['We', 'ready', 'attack', 'waiting', 'orders']]



Referensi