Python: Python for Mining Data From Twitter


Twitter is increasingly being used for business and personal purposes. The Twitter API also makes it possible to mine the data (tweets) and find interesting information. In this post we will look at how to get data from Twitter, prepare it for analysis, and then cluster tweets using the Python programming language.

In our example Python script we will extract tweets that contain the hashtag “deep learning”. The data obtained from this search will then be used for further processing and data mining.

The script can be divided into the following three sections, briefly described below.

1. Accessing Twitter API

First the script establishes a connection to Twitter, and the credentials are checked by the Twitter service. This requires providing access tokens: CONSUMER_KEY, CONSUMER_SECRET, OAUTH_TOKEN, OAUTH_TOKEN_SECRET. Refer to [1] for how to obtain this information from a Twitter account.
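
Here is the corresponding connection code from the full source at the end of this post (the token values are placeholders):

import twitter

CONSUMER_KEY = "xxxxxxxxxxxxxxx"
CONSUMER_SECRET = "xxxxxxxxxxxx"
OAUTH_TOKEN = "xxxxxxxxxxxxxx"
OAUTH_TOKEN_SECRET = "xxxxxxxxxx"

# Authenticate with OAuth and create the Twitter API client
auth = twitter.oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET, CONSUMER_KEY, CONSUMER_SECRET)
twitter_api = twitter.Twitter(auth=auth)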

2. Searching for Tweets

Once the access token information is verified, the search for tweets related to a particular hashtag (“deep learning” in our example) is performed, and if it is successful we get data back. The Python script then iterates through 5 more batches of results by following the cursor. All results are saved in the json data structure statuses.
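
Here is the corresponding search code from the full source at the end of this post:

q = '#deep learning'
count = 100

# Search for tweets containing '#deep learning'
search_results = twitter_api.search.tweets(q=q, count=count)
statuses = search_results['statuses']

# Iterate through 5 more batches of results by following the cursor
for _ in range(5):
    print ("Length of statuses", len(statuses))
    try:
        next_results = search_results['search_metadata']['next_results']
    except KeyError:
        break
    # Build the query parameters for the next page from next_results
    kwargs = dict([kv.split('=') for kv in next_results[1:].split("&")])
    search_results = twitter_api.search.tweets(**kwargs)
    statuses += search_results['statuses']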

Now we extract data such as hashtags, urls, texts and the created-at date. The date is useful if we want to do trending over time.

In the next step we prepare data for trending in the format: date word. This allows us to view how the usage of a specific word in the tweets changes over time. Here is a code example of getting the urls and date data; the hashtags and trending preparation follow right after:

urls = [ urls['url']
    for status in statuses
       for urls in status['entities']['urls'] ]
created_ats = [ status['created_at']
    for status in statuses
        ]
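
The hashtags and tweet texts are extracted in the same way, and the date/word pairs for trending come from a short loop over the texts (both taken from the full source at the end of this post):

hashtags = [ hashtag['text'].lower()
    for status in statuses
       for hashtag in status['entities']['hashtags'] ]

texts = [ status['text']
    for status in statuses
        ]

# Preparing data for trending in the format: date word
# Note: w is not cleaned of #, ? characters
i = 0
for x in created_ats:
    for w in texts[i].split(" "):
        if len(w) >= 2:
            print (x[4:10], x[26:31], " ", w)
    i = i + 1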

3. Clustering Tweets

Now we prepare the tweet data for clustering. We convert the text data into a bag-of-words representation. This is called vectorization, which is the general process of turning a collection of text documents into numerical feature vectors. [2]


vectorizer = CountVectorizer(analyzer = "word", \
                             tokenizer = None,       \
                             preprocessor = None,    \
                             stop_words='english',   \
                             max_features = 5000)

train_data_features = vectorizer.fit_transform(texts)
train_data_features = train_data_features.toarray()
print (train_data_features.shape)
print (train_data_features)

This will print something like this:
[[0 0 0 ..., 0 1 1]
 [0 0 1 ..., 0 0 0]
 [0 0 0 ..., 0 1 1]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]

vocab = vectorizer.get_feature_names()
print (vocab)
dist = np.sum(train_data_features, axis=0)

#For each, print the vocabulary word and the number of times it appears in the training set

for tag, count in zip(vocab, dist):
    print (count, tag)


This will print something like this:
3 ai
1 alexandria
2 algorithms
1 amp
2 analytics
1 applications
1 applied

Now we are ready to do clustering. We choose the Birch clustering algorithm. [3] Below is the code snippet for this. We specify 6 clusters.

brc = Birch(branching_factor=50, n_clusters=6, threshold=0.5,  compute_labels=True)
brc.fit(train_data_features)

clustering_result=brc.predict(train_data_features)
print ("\nClustering_result:\n")
print (clustering_result)


Below is an example of the printout (each tweet gets a number; this number is the cluster associated with that tweet, and the number of clusters is 6):

Clustering_result:

[0 0 0 0 0 4 0 0 3 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 5 0 0 0 1 4 1 1 1
2 2]

In the next step we output some data and build a plot of hashtag frequency.
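
Here is the counting and plotting code from the full source: hashtag occurrences are counted in a dictionary, the (count, hashtag) pairs are sorted, and matplotlib plots the counts (cast to int, since np.array turns the pairs into strings):

# Count hashtag frequencies
wordcounts = {}
for term in hashtags:
    wordcounts[term] = wordcounts.get(term, 0) + 1

items = [(v, k) for k, v in wordcounts.items()]
xnum = [i for i in range(len(items))]

plt.figure()
plt.title("Frequency of Hashtags")

myarray = np.array(sorted(items, reverse=True))

plt.xticks(xnum, myarray[:,1], rotation='vertical')
plt.plot(xnum, myarray[:,0].astype(int))
plt.show()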

[Figure: Frequency of Hashtags]

Source Code

Thus we have explored Python coding for mining Twitter data. We looked at different tasks such as searching for tweets, extracting different data from the search results, preparing data for trending, converting text results into numerical form, clustering, and plotting the frequency of hashtags. Below is the source code for all of this. In the future we plan to add more functionality. There are many possible ways to mine Twitter data; some interesting ideas from the web can be found in [4].


import twitter
import json
import matplotlib.pyplot as plt
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import Birch

CONSUMER_KEY ="xxxxxxxxxxxxxxx"
CONSUMER_SECRET ="xxxxxxxxxxxx"
OAUTH_TOKEN = "xxxxxxxxxxxxxx"
OAUTH_TOKEN_SECRET = "xxxxxxxxxx"
auth = twitter.oauth.OAuth (OAUTH_TOKEN, OAUTH_TOKEN_SECRET, CONSUMER_KEY, CONSUMER_SECRET)

twitter_api= twitter.Twitter(auth=auth)
q='#deep learning'
count=100

# Do search for tweets containing '#deep learning'
search_results = twitter_api.search.tweets (q=q, count=count)

statuses=search_results['statuses']

# Iterate through 5 more batches of results by following the cursor
for _ in range(5):
    print ("Length of statuses", len(statuses))
    try:
        next_results = search_results['search_metadata']['next_results']
    except KeyError:
        break
    # Create a dictionary of query parameters from next_results
    kwargs = dict([kv.split('=') for kv in next_results[1:].split("&")])

    search_results = twitter_api.search.tweets(**kwargs)
    statuses += search_results['statuses']

# Show one sample search result by slicing the list
print (json.dumps(statuses[0], indent=10))

# Extracting data such as hashtags, urls, texts and created at date
hashtags = [ hashtag['text'].lower()
    for status in statuses
       for hashtag in status['entities']['hashtags'] ]

urls = [ urls['url']
    for status in statuses
       for urls in status['entities']['urls'] ]

texts = [ status['text']
    for status in statuses
        ]

created_ats = [ status['created_at']
    for status in statuses
        ]

# Preparing data for trending in the format: date word
# Note: in the below loop w is not cleaned from #,? characters
i=0
print ("===============================\n")
for x in created_ats:
    for w in texts[i].split(" "):
        if len(w) >= 2:
            print (x[4:10], x[26:31], " ", w)
    i = i + 1

# Prepare tweets data for clustering
# Converting text data into bag of words model

vectorizer = CountVectorizer(analyzer = "word", \
                             tokenizer = None,  \
                             preprocessor = None,  \
                             stop_words='english', \
                             max_features = 5000)  

train_data_features = vectorizer.fit_transform(texts)
train_data_features = train_data_features.toarray()

print (train_data_features.shape)
print (train_data_features)
vocab = vectorizer.get_feature_names()
print (vocab)

dist = np.sum(train_data_features, axis=0)

# For each, print the vocabulary word and the number of times it 
# appears in the training set
for tag, count in zip(vocab, dist):
    print (count, tag)

# Clustering data

brc = Birch(branching_factor=50, n_clusters=6, threshold=0.5,  compute_labels=True)
brc.fit(train_data_features) 

clustering_result=brc.predict(train_data_features)
print ("\nClustering_result:\n")
print (clustering_result)

# Outputting some data
print (json.dumps(hashtags[0:50], indent=1))
print (json.dumps(urls[0:50], indent=1))
print (json.dumps(texts[0:50], indent=1))
print (json.dumps(created_ats[0:50], indent=1)) 
 
with open("data.txt", "a") as myfile:
     for w in hashtags: 
           myfile.write(str(w.encode('ascii', 'ignore')))
           myfile.write("\n") 

# count of word frequencies
wordcounts = {}
for term in hashtags:
    wordcounts[term] = wordcounts.get(term, 0) + 1

items = [(v, k) for k, v in wordcounts.items()]

print (len(items))

xnum=[i for i in range(len(items))]
for count, word in sorted(items, reverse=True):
    print("%5d %s" % (count, word))

for x in created_ats:
  print (x)
  print (x[4:10])
  print (x[26:31])
  print (x[4:7])
plt.figure()
plt.title("Frequency of Hashtags")

myarray = np.array(sorted(items, reverse=True))

print (myarray[:,0])
print (myarray[:,1])
plt.xticks(xnum, myarray[:,1],rotation='vertical')
plt.plot (xnum, myarray[:,0].astype(int))   # cast counts back to int, since np.array stores them as strings
plt.show()




References