In [90]:
# Note that the honeypot dataset contains more than just tweets, hence the different loader name.
from loader import load_michigan_tweets, load_political_tweets, load_honeypot_data, load_michigan_unretreived_tweet_ids
import datetime
from dateutil.parser import parse
from collections import Counter
import numpy as np
import pandas
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
import gensim
from gensim import corpora
import snap
import matplotlib.pyplot as plt
from sklearn.cluster import SpectralClustering
from sklearn import metrics
import seaborn
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
import scipy.spatial.distance as ssd
from scipy.cluster.hierarchy import dendrogram, linkage
nltk.download('stopwords')
# "import snap" compiles to another library, not SNAP, in Python 3.
Out[90]:
In [ ]:
# loading the honeypot data takes a minute or three.
michigan_tweets = load_michigan_tweets() # a list of dictionaries
print("Loaded Michigan tweets.")
political_general_tweets, political_keyword_tweets = load_political_tweets() # dataframes
print("Loaded Political tweets.")
# a mix of dataframes and lists
content_polluters, content_polluters_tweets, content_polluters_followings_every_hour_since_collected_at, legitimate_users, legitimate_users_tweets, legitimate_users_followings_every_hour_since_collected_at = load_honeypot_data()
print("Loaded Honeypot data.")
In [6]:
unretrieved_michigan_tweet_ids = load_michigan_unretreived_tweet_ids()
print(len(unretrieved_michigan_tweet_ids))
print(len(michigan_tweets))
print(len(michigan_tweets) + len(unretrieved_michigan_tweet_ids)) # to-do: figure out why this is 142,281 tweets instead of the expected 142,249 tweets
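In [ ]:
# A quick diagnostic sketch for the count discrepancy noted above (not from the original
# notebook). It reads the raw 'id' field directly and assumes the unretrieved IDs have the
# same type as tweet['id']; if the types differ, the overlap check will simply report 0.
retrieved_ids = [t['id'] for t in michigan_tweets]
print(len(retrieved_ids) - len(set(retrieved_ids)))                   # duplicate IDs among retrieved tweets
print(len(set(retrieved_ids) & set(unretrieved_michigan_tweet_ids)))  # IDs counted in both collections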
In [7]:
sample_tweet = michigan_tweets[0]
# print(sample_tweet)
In [8]:
# Some helper functions.
# Setting: User A retweets a tweet by user B.
# The timestamp of user A's retweet.
def get_timestamp(tweet):
return tweet['created_at']
# The content of user A's tweet / retweet.
def get_content(tweet):
return tweet['text']
# A list of URLs present in the content of user A's tweet / retweet.
def get_urls(tweet):
urls = set()
for url in tweet['entities']['urls']:
urls.add(url['expanded_url'])
if 'retweeted_status' in tweet and 'entities' in tweet['retweeted_status']:
for url in tweet['retweeted_status']['entities']['urls']:
urls.add(url['expanded_url'])
return urls if urls else None
# The hashtags present in user A's tweet / retweet.
def get_hashtags(tweet):
hashtags = set()
for hashtag in tweet['entities']['hashtags']:
hashtags.add(hashtag['text'])
return hashtags
# The tweet ID of user A's retweet.
def get_tweet_id(tweet):
return tweet['id']
# The tweet ID of user B's tweet.
def get_derived_tweet_id(tweet):
if 'retweeted_status' not in tweet:
return None
return tweet['retweeted_status']['id']
# User A's user ID.
def get_user_id(tweet):
return tweet['user']['id']
# User B's user ID.
def get_derived_user_id(tweet):
if 'retweeted_status' not in tweet:
return None
return tweet['retweeted_status']['user']['id']
# User A's display name.
def get_display_name(tweet):
return tweet['user']['name']
# User A's Twitter handle.
def get_handle(tweet):
return tweet['user']['screen_name']
# The number of followers of User A.
def get_num_followers(tweet):
return tweet['user']['followers_count']
# The number of friends of User A.
def get_num_friends(tweet):
return tweet['user']['friends_count']
# The number of posts of User A.
def get_num_posts(tweet):
return tweet['user']['statuses_count']
# The number of times the tweet has been retweeted (Twitter's retweet_count), not the number of retweets User A has performed.
def get_num_retweets(tweet):
return tweet['retweet_count']
# A tuple: (the tweet ID of User A's retweet, the tweet ID of User B's tweet).
# Returns None if the tweet passed in as an argument is not a retweet.
def get_tweet_to_derived_tweet_edge(tweet):
if not get_derived_tweet_id(tweet):
return None
return (get_tweet_id(tweet), get_derived_tweet_id(tweet))
In [9]:
# Usage examples
get_timestamp(sample_tweet), \
get_content(sample_tweet), \
get_urls(sample_tweet), \
get_hashtags(sample_tweet), \
get_tweet_id(sample_tweet), \
get_derived_tweet_id(sample_tweet), \
get_user_id(sample_tweet), \
get_derived_user_id(sample_tweet), \
get_display_name(sample_tweet), \
get_handle(sample_tweet), \
get_num_followers(sample_tweet), \
get_num_friends(sample_tweet), \
get_num_posts(sample_tweet), \
get_num_retweets(sample_tweet), \
get_tweet_to_derived_tweet_edge(sample_tweet)
Out[9]:
In [10]:
# Hashtags used by the Oxford paper.
pro_trump_hashtags = set(['AmericaFirst','benghazi','CrookedHillary','DrainTheSwamp','lockherup','maga3x','MAGA','MakeAmericaGreatAgain','NeverHillary','PodestaEmails','projectveritas','riggedelection','tcot','Trump2016','Trump','TrumpPence16','TrumpTrain','VoterFraud','votetrump','wakeupamerica'])
pro_hillary_hashtags = set(['Clinton','ClintonKaine16','democrats','dems','dnc','dumptrump','factcheck','hillary2016','Hillary','HillaryClinton','hillarysupporter','hrc','ImWithHer','LastTimeTrumpPaidTaxes','NeverTrump','OHHillYes','p2','strongertogether','trumptape','uniteblue'])
neutral_hashtags = set(['Election2016','Elections2016','uselections','uselection','earlyvote','iVoted','Potus'])
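In [ ]:
# A hedged sketch (not part of the original analysis) of how these hashtag sets could label
# a tweet's leaning. Case-insensitive matching is an assumption; ties and empty overlaps fall
# through to 'mixed' / None.
_pro_trump_lower = set(ht.lower() for ht in pro_trump_hashtags)
_pro_hillary_lower = set(ht.lower() for ht in pro_hillary_hashtags)

def hashtag_leaning(tweet):
    tags = set(ht.lower() for ht in get_hashtags(tweet))
    trump_hits = len(tags & _pro_trump_lower)
    hillary_hits = len(tags & _pro_hillary_lower)
    if trump_hits > hillary_hits:
        return 'pro_trump'
    if hillary_hits > trump_hits:
        return 'pro_hillary'
    if trump_hits:  # equal and nonzero
        return 'mixed'
    return None

print(hashtag_leaning(sample_tweet))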
In [44]:
# Populating some data structures.
# For now, we're lumping everything together; later, we can distinguish between Trump, Hillary, and Neutral.
tweet_id_to_user_id = {}
tweet_id_to_content_and_urls = {}
all_hashtags = set()
user_id_to_handle = {}
edges = [] # list, not set, because we want to allow multiple edges
timestamps = []
dates = []
formatted_time = []
for t in michigan_tweets:
user_id = get_user_id(t)
tweet_id = get_tweet_id(t)
derived_tweet_id = get_derived_tweet_id(t)
derived_user_id = get_derived_user_id(t)
tweet_id_to_user_id[tweet_id] = user_id
if derived_tweet_id and derived_tweet_id not in tweet_id_to_user_id:
tweet_id_to_user_id[derived_tweet_id] = derived_user_id
tweet_id_to_content_and_urls[tweet_id] = get_content(t), get_urls(t)
all_hashtags.update(get_hashtags(t))
user_id_to_handle[user_id] = get_handle(t)
timestamp = get_timestamp(t)
timestamps.append(timestamp)
dates.append(str(parse(timestamp).date()))
formatted_time.append(parse(timestamp).strftime("%m-%d-%H"))
all_hashtags = list(all_hashtags)
hashtag_occurrences = []
for t in michigan_tweets:
edge = get_tweet_to_derived_tweet_edge(t)
if edge:
edges.append((tweet_id_to_user_id[edge[0]], tweet_id_to_user_id[edge[1]]))
hashtags = get_hashtags(t)
hashtag_occurrences.append([1 if ht in hashtags else 0 for ht in all_hashtags])
In [32]:
print('There are a total of %d hashtags in the dataset.' % len(all_hashtags))
In [30]:
print('The average edge weight between connected users (i.e., one of whom has retweeted the other at least once) is %f.' % (len(edges) / float(len(set(edges)))))
# The reason this is not equal to one is because user A may retweet user B multiple times.
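In [ ]:
# A small illustration (not used below) of collapsing the multi-edges into weighted
# user -> user edges; the mean weight reproduces the average printed above.
edge_weights = Counter(edges)  # (user A, user B) -> number of times A retweeted B
print(sum(edge_weights.values()) / float(len(edge_weights)))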
In [45]:
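# Pairwise correlations between the hashtag indicator columns; cm has shape (num hashtags, num hashtags).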
cm = np.corrcoef(np.array(hashtag_occurrences), rowvar=False)
In [91]:
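# Hierarchical clustering of hashtags; given a square 2-D array, scipy's linkage treats each row of cm as an observation vector.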
Z = linkage(cm)
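In [ ]:
%matplotlib inline
# An alternative sketch (an assumption, not from the original notebook): treat 1 - correlation
# as a distance, condense it, and inspect the dendrogram. nan_to_num guards against NaNs that
# np.corrcoef can emit for constant columns.
dist = 1.0 - np.nan_to_num(cm)
np.fill_diagonal(dist, 0.0)
Z_dist = linkage(ssd.squareform(dist, checks=False), method='average')
_ = dendrogram(Z_dist, no_labels=True)
plt.show()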
In [97]:
%matplotlib inline
print(cm.shape)
plt.imshow(cm)
plt.colorbar()
plt.show()
In [76]:
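# Standardize the correlation columns and cluster the hashtags (rows of cm) by their correlation profiles with DBSCAN.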
X = StandardScaler().fit_transform(cm)
db = DBSCAN(eps=0.3, min_samples=10).fit(X)
# seaborn.clustermap(cm, metric="correlation")
In [77]:
%matplotlib inline
# Code from http://scikit-learn.org/stable/auto_examples/cluster/plot_dbscan.html.
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
print('Estimated number of clusters: %d' % n_clusters_)
# Black removed and is used for noise instead.
unique_labels = set(labels)
colors = [plt.cm.Spectral(each)
for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
if k == -1:
# Black used for noise.
col = [0, 0, 0, 1]
class_member_mask = (labels == k)
xy = X[class_member_mask & core_samples_mask]
plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
markeredgecolor='k', markersize=14)
xy = X[class_member_mask & ~core_samples_mask]
plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
markeredgecolor='k', markersize=6)
plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()
In [78]:
Counter(labels)
Out[78]:
In [12]:
%matplotlib inline
date_counts = Counter(dates)
histogram = pandas.DataFrame.from_dict(date_counts, orient='index')
ax = histogram.plot(kind='bar')  # use "ax" for the Axes so we don't shadow matplotlib.pyplot
ax.legend(["Number of Tweets"])
formatted_time_counts = Counter(formatted_time)
histogram = pandas.DataFrame.from_dict(formatted_time_counts, orient='index')
every_n = 12 # two x-axis ticks per day
ax = histogram.plot(kind='bar')
tick_locations = ax.xaxis.get_ticklocs()
tick_labels = [label.get_text() for label in ax.xaxis.get_ticklabels()]
_ = ax.xaxis.set_ticks(tick_locations[::every_n])
_ = ax.xaxis.set_ticklabels(tick_labels[::every_n])
ax.legend(["Number of Tweets"])
# The first graph shows the number of tweets per day.
# The second graph shows the number of tweets per hour. The axis ticks are 12 hours apart, but the data is hourly.
Out[12]:
In [95]:
# Setting up labeled_sites_to_type, which is a map from site to manual categorization by opensources.
labeled_sites = set()
labeled_types = []
labeled_sites_to_type = {}
# Data from https://github.com/BigMcLargeHuge/opensources/blob/master/sources/sources.csv.
with open('data/opensources.csv', 'r') as f:
for line in f:
content = line.split(',')
labeled_sites.add(content[0])
labeled_types.append(content[1])
labeled_sites_to_type[content[0]] = content[1]
labeled_types_frequencies = Counter(labeled_types)
labeled_types = set(labeled_types)
print(labeled_types_frequencies)
print('')
print(len(labeled_sites))
print('')
print(list(labeled_sites)[:10])
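In [ ]:
# A hedged sketch (an assumption, not part of the original notebook) of how labeled_sites_to_type
# could be joined against the URLs captured per tweet: extract each URL's domain, strip a leading
# "www.", and look it up. This assumes the CSV's first column holds bare domains (e.g. example.com).
try:
    from urllib.parse import urlparse   # Python 3
except ImportError:
    from urlparse import urlparse       # Python 2

def opensources_label(url):
    domain = urlparse(url).netloc.lower()
    if domain.startswith('www.'):
        domain = domain[len('www.'):]
    return labeled_sites_to_type.get(domain)

label_counts = Counter()
for text, urls in tweet_id_to_content_and_urls.values():
    for url in (urls or []):
        label = opensources_label(url)
        if label:
            label_counts[label] += 1
print(label_counts)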
In [14]:
# Helper function for topic modeling: build a gensim Dictionary over the cleaned tweet texts.
def word_tokens_and_ids_from_tweets(tweet_ids):
tweets_contents = [tweet_id_to_content_and_urls[ti][0] for ti in tweet_ids]
punctuation = set(string.punctuation)
stop_words = stopwords.words('english')
def clean(contents):
lower = [i for i in contents.lower().split()]
stop_free = ' '.join([i for i in lower if i not in stop_words])
punc_free = ''.join([ch for ch in stop_free if ch not in punctuation])
return punc_free.split()
tweets_contents_cleaned = [clean(tc) for tc in tweets_contents]
dictionary = corpora.Dictionary(tweets_contents_cleaned)
return dictionary
dictionary = word_tokens_and_ids_from_tweets(tweet_id_to_content_and_urls.keys()) # tweet_id_to_content_and_urls's keys exclude derived tweets
In [15]:
def vector_from_tweet(tweet_id):
content = tweet_id_to_content_and_urls[tweet_id][0]
return dictionary.doc2bow(content.lower().split())
print(vector_from_tweet(793269953927536640))
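In [ ]:
# A minimal sketch of the topic model these helpers build toward: fit LDA on the bag-of-words
# vectors and print a few words per topic. The num_topics and passes values are arbitrary
# placeholders, not choices from the original notebook.
corpus = [vector_from_tweet(ti) for ti in tweet_id_to_content_and_urls.keys()]
lda = gensim.models.LdaModel(corpus, num_topics=10, id2word=dictionary, passes=1)
for topic_id, topic_words in lda.print_topics(num_topics=10, num_words=8):
    print('%d: %s' % (topic_id, topic_words))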
In [ ]: