Difference between revisions of "Python: Mining Twitter for GamerGate: Visualization"
Onnowpurbo (talk | contribs) (Created page with "In the previous posting, I went over how to connect to Twitter’s streaming API using a connector app and the Tweepy Python library, as well as a quick overview of how to con...") |
Onnowpurbo (talk | contribs) |
||
(3 intermediate revisions by the same user not shown) | |||
Line 1: | Line 1: | ||
− | + | Disini kita akan extrak semua informasi yang dibutuhkan menggunakan NetworkX untuk membuat directed graph dan mem-visualisasikan menggunakan Gephi untuk melihat siapa me-retweet siapa, mencatat umur tweet dalam hari, jumlah follower setiap user agar dikemudian hari dapat di filter jika kita menginginkannya. | |
− | + | Instalasi NetworkX, | |
− | + | sudo pip install networkx | |
− | + | jangan lupa instalasi Gephi. | |
− | + | Asumsinya tweet berhasil dikumpulkan di “gamergate.txt”. Script berikut akan mengambil data dari text file dan memasukan ke frame data yang baru, | |
− | + | ||
− | + | import json | |
− | + | import re | |
− | + | import pandas as pd | |
− | + | from time import gmtime, mktime, strptime | |
− | + | ||
− | + | tweets_data = [] | |
− | + | tweets_file = open(tweets_data_path, "r") | |
− | + | for line in tweets_file: | |
− | + | try: | |
− | + | tweet = json.loads(line) | |
− | + | tweets_data.append(tweet) | |
− | + | except: | |
− | + | continue | |
− | + | # | |
− | + | # Clean out limit messages, etc. | |
− | + | # | |
− | + | for tweet in tweets_data: | |
− | + | try: | |
− | + | user = tweet['user'] | |
− | + | except: | |
− | + | tweets_data.remove(tweet) | |
− | + | ||
− | + | for tweet in tweets_data: | |
− | + | try: | |
− | + | user = tweet['text'] | |
− | + | except: | |
− | + | tweets_data.remove(tweet) | |
− | + | ||
− | + | # | |
− | + | # See how many we wound up with | |
− | + | # | |
− | + | print len(tweets_data) | |
− | + | ||
− | + | # | |
− | + | # Pull the data we're interested in out of the Twitter data we captured | |
− | + | # | |
− | + | rows_list = [] | |
− | + | now = mktime(gmtime()) | |
− | + | for tweet in tweets_data: | |
− | + | author = "" | |
− | + | rtauthor = "" | |
− | + | age = rtage = followers = rtfollowers = 0 | |
− | + | # | |
− | + | # If it was a retweet, get both the original author and the retweeter, save the original author's | |
+ | # follower count and age | ||
+ | # | ||
+ | try: | ||
+ | author = tweet['user']['screen_name'] | ||
+ | rtauthor = tweet['retweeted_status']['user']['screen_name'] | ||
+ | rtage = int(now - mktime(strptime(tweet['retweeted_status']['user']['created_at'], "%a %b %d %H:%M:%S +0000 %Y")))/(60*60*24) | ||
+ | rtfollowers = tweet['retweeted_status']['user']['followers_count'] | ||
+ | except: | ||
+ | # | ||
+ | # Otherwise, just get the original author | ||
+ | # | ||
try: | try: | ||
− | + | author = tweet['user']['screen_name'] | |
− | + | except: | |
− | + | continue | |
− | + | # | |
− | + | # If this was a reply, save the screen name being replied to | |
− | + | # | |
− | + | reply_to = "" | |
− | + | if (tweet['in_reply_to_screen_name'] != None): | |
− | + | reply_to = tweet['in_reply_to_screen_name'] | |
− | + | # | |
− | + | # Calculate the age, in days, of this Twitter ID | |
− | + | # | |
− | + | age = int(now - mktime(strptime(tweet['user']['created_at'], "%a %b %d %H:%M:%S +0000 %Y")))/(60*60*24) | |
− | + | # | |
− | + | # Grab this ID's follower count and the text of the tweet | |
− | + | # | |
− | + | followers = tweet['user']['followers_count'] | |
− | + | text = tweet['text'] | |
− | + | dict1 = {} | |
− | + | # | |
− | + | # Construct a row, add it to our list | |
− | + | # | |
− | + | dict1.update({'author': author, 'reply_to': reply_to, 'age': age, 'followers': followers, 'retweet_of': rtauthor, 'rtfollowers': rtfollowers, 'rtage': rtage, 'text': text}) | |
− | + | rows_list.append(dict1) | |
− | + | ||
− | + | # | |
− | + | # When we've processed all the tweets, build the DataFrame from the rows | |
− | + | # we've collected | |
− | + | # | |
− | + | tweets = pd.DataFrame(rows_list) | |
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
Here’s a script that will iterate through the dataframe, row by row, and construct a directed graph of who’s retweeting whom. Each directed edge represented the relationship “is retweeted by”, the higher the weight of an edge, the more person B is getting retweeted by person A. Each node represents an individual ID on Twitter, and has attributes to track the number of followers and the age of the ID in days. | Here’s a script that will iterate through the dataframe, row by row, and construct a directed graph of who’s retweeting whom. Each directed edge represented the relationship “is retweeted by”, the higher the weight of an edge, the more person B is getting retweeted by person A. Each node represents an individual ID on Twitter, and has attributes to track the number of followers and the age of the ID in days. | ||
− | + | import networkx as nx | |
− | + | ||
− | + | # | |
− | + | # Create a new directed graph | |
− | + | # | |
− | + | J = nx.DiGraph() | |
− | + | # | |
− | + | # Iterate through the rows of our dataframe | |
− | + | # | |
− | + | for index, row in tweets.iterrows(): | |
− | + | # | |
− | + | # Gather the data out of the row | |
− | + | # | |
− | + | this_user_id = row['author'] | |
− | + | author = row['retweet_of'] | |
− | + | followers = row['followers'] | |
− | + | age = row['age'] | |
− | + | rtfollowers = row['rtfollowers'] | |
− | + | rtage = row['rtage'] | |
− | + | # | |
− | + | # Is the sender of this tweet in our network? | |
− | + | # | |
− | + | if not this_user_id in J: | |
− | + | J.add_node(this_user_id, attr_dict={ | |
− | + | 'followers': row['followers'], | |
− | + | 'age': row['age'], | |
− | + | }) | |
− | + | # | |
− | + | # If this is a retweet, is the original author a node? | |
− | + | # | |
− | + | if author != "" and not author in J: | |
− | + | J.add_node(author, attr_dict={ | |
− | + | 'followers': row['rtfollowers'], | |
− | + | 'age': row['rtage'], | |
− | + | }) | |
− | + | # | |
− | + | # If this is a retweet, add an edge between the two nodes. | |
− | + | # | |
− | + | if author != "": | |
− | + | if J.has_edge(author, this_user_id): | |
− | + | J[author][this_user_id]['weight'] += 1 | |
− | + | else: | |
− | + | J.add_weighted_edges_from([(author, this_user_id, 1.0)]) | |
− | + | ||
− | + | nx.write_gexf(J, 'ggrtages.gexf') | |
The last thing we did was to save out a GEFX file we can then read into Gephi. Start Gephi up, and open our file; we called ours “ggrtages.gexf”. | The last thing we did was to save out a GEFX file we can then read into Gephi. Start Gephi up, and open our file; we called ours “ggrtages.gexf”. | ||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
Latest revision as of 13:35, 29 January 2017
Di sini kita akan mengekstrak semua informasi yang dibutuhkan, menggunakan NetworkX untuk membuat directed graph, dan memvisualisasikannya menggunakan Gephi untuk melihat siapa me-retweet siapa. Kita juga mencatat umur akun dalam hari dan jumlah follower setiap user agar di kemudian hari dapat difilter jika kita menginginkannya.
Instalasi NetworkX,
sudo pip install networkx
jangan lupa instalasi Gephi.
Asumsinya tweet berhasil dikumpulkan di “gamergate.txt”. Script berikut akan mengambil data dari text file tersebut dan memasukkannya ke DataFrame yang baru,
import json
import re
import pandas as pd
from calendar import timegm
from time import gmtime, mktime, strptime

# File of captured tweets, one JSON document per line. The article collects
# them in "gamergate.txt".
tweets_data_path = "gamergate.txt"

# Layout of Twitter's `created_at` strings; they are always expressed in UTC.
TWITTER_TIME_FORMAT = "%a %b %d %H:%M:%S +0000 %Y"
SECONDS_PER_DAY = 60 * 60 * 24


def parse_tweets(lines):
    """Parse captured stream lines and keep only real tweets.

    `lines` is any iterable of strings (an open file works). Lines that are
    not valid JSON are skipped. Stream messages that are not tweets (limit
    notices, etc.) are recognised by the absence of a 'user' or 'text' key
    and are dropped as well.

    Returns a new list of tweet dicts; nothing is removed from a list while
    it is being iterated, so consecutive non-tweet messages are all dropped.
    """
    parsed = []
    for line in lines:
        try:
            message = json.loads(line)
        except ValueError:
            # Blank or truncated line: not a JSON document.
            continue
        if isinstance(message, dict) and 'user' in message and 'text' in message:
            parsed.append(message)
    return parsed


def account_age_days(created_at, now):
    """Return the whole number of days between `created_at` and `now`.

    `created_at` is a Twitter timestamp string in TWITTER_TIME_FORMAT and
    `now` is seconds since the epoch (UTC). `timegm` is used because the
    timestamp is UTC; `mktime` would interpret it as local time.
    """
    created = timegm(strptime(created_at, TWITTER_TIME_FORMAT))
    return int(now - created) // SECONDS_PER_DAY


def extract_row(tweet, now):
    """Build one DataFrame row (a dict) from a tweet.

    Returns None when the tweet has no author screen name. Raises KeyError
    if the author's 'created_at' / 'followers_count' or the tweet's 'text'
    is missing.
    """
    try:
        user = tweet['user']
        author = user['screen_name']
    except (KeyError, TypeError):
        return None

    #
    # If it was a retweet, record the original author together with that
    # account's age and follower count. For an ordinary tweet the lookup of
    # 'retweeted_status' fails and the defaults below are kept.
    #
    rtauthor = ""
    rtage = rtfollowers = 0
    try:
        rtuser = tweet['retweeted_status']['user']
        rtauthor = rtuser['screen_name']
        rtage = account_age_days(rtuser['created_at'], now)
        rtfollowers = rtuser['followers_count']
    except (KeyError, TypeError, ValueError):
        pass

    #
    # If this was a reply, save the screen name being replied to.
    # A missing or null field both mean "not a reply".
    #
    reply_to = tweet.get('in_reply_to_screen_name') or ""

    return {
        'author': author,
        'reply_to': reply_to,
        'age': account_age_days(user['created_at'], now),
        'followers': user['followers_count'],
        'retweet_of': rtauthor,
        'rtfollowers': rtfollowers,
        'rtage': rtage,
        'text': tweet['text'],
    }


def build_rows(tweets_data, now=None):
    """Turn tweets into a list of row dicts, skipping tweets without an author.

    `now` (epoch seconds, UTC) defaults to the current time; it is sampled
    once so that every age in the result uses the same reference instant.
    """
    if now is None:
        now = timegm(gmtime())
    rows = []
    for tweet in tweets_data:
        row = extract_row(tweet, now)
        if row is not None:
            rows.append(row)
    return rows


if __name__ == "__main__":
    # The context manager closes the capture file once it has been read.
    with open(tweets_data_path, "r") as tweets_file:
        tweets_data = parse_tweets(tweets_file)

    #
    # See how many we wound up with
    #
    print(len(tweets_data))

    #
    # Pull the data we're interested in out of the Twitter data we captured,
    # then build the DataFrame from the rows we've collected
    #
    rows_list = build_rows(tweets_data)
    tweets = pd.DataFrame(rows_list)
Here’s a script that will iterate through the dataframe, row by row, and construct a directed graph of who’s retweeting whom. Each directed edge represents the relationship “is retweeted by”: it runs from the original author to the account that retweeted them, and the higher the weight of an edge, the more often that author has been retweeted by that account. Each node represents an individual ID on Twitter, and has attributes to track the number of followers and the age of the ID in days.
import networkx as nx


def build_retweet_graph(tweets):
    """Build the "is retweeted by" graph from the tweets DataFrame.

    Each row must provide the columns 'author', 'retweet_of', 'followers',
    'age', 'rtfollowers' and 'rtage'. Every screen name becomes a node
    carrying 'followers' and 'age' attributes, taken from the first row in
    which that name is seen. Each retweet adds (or strengthens) an edge from
    the original author to the retweeter; the edge 'weight' counts retweets.

    Returns the resulting networkx.DiGraph.
    """
    graph = nx.DiGraph()
    #
    # Iterate through the rows of our dataframe
    #
    for _, row in tweets.iterrows():
        retweeter = row['author']
        original_author = row['retweet_of']
        #
        # Is the sender of this tweet in our network?
        # Attributes are passed as keyword arguments: the attr_dict=
        # parameter was removed in NetworkX 2.0.
        #
        if retweeter not in graph:
            graph.add_node(retweeter, followers=row['followers'], age=row['age'])
        #
        # An empty 'retweet_of' means this was not a retweet: no edge to add.
        #
        if original_author == "":
            continue
        #
        # If this is a retweet, is the original author a node?
        #
        if original_author not in graph:
            graph.add_node(original_author,
                           followers=row['rtfollowers'], age=row['rtage'])
        #
        # Add an edge between the two nodes, or bump its weight if this
        # user has retweeted this author before.
        #
        if graph.has_edge(original_author, retweeter):
            graph[original_author][retweeter]['weight'] += 1
        else:
            graph.add_edge(original_author, retweeter, weight=1.0)
    return graph


#
# Create the directed graph from the `tweets` DataFrame built by the previous
# script, then save it as a GEXF file that Gephi can open.
#
J = build_retweet_graph(tweets)
nx.write_gexf(J, 'ggrtages.gexf')
The last thing we did was to save out a GEXF file we can then read into Gephi. Start Gephi up, and open our file; we called ours “ggrtages.gexf”.