In [58]:
import tweepy
import time
import pandas as pd
from py2neo import Graph, Node, Relationship
from igraph import Graph as IGraph
import igraph
from IPython.core.display import HTML
from ipywidgets import FloatProgress, Label
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn
%matplotlib inline
plt.rcParams['figure.figsize'] = (15, 8)
OAUTH_KEY = ''
OAUTH_SECRET = ''
ACCESS_TOKEN = ''
ACCESS_TOKEN_SECRET = ''
NEO4J_USER = ''
NEO4J_SECRET = ''
In [24]:
seeds = ['volya_belousova', 'egor4rgurev', 'kirillfrolovdw', 'ilyazhuchhj']
In [25]:
auth = tweepy.OAuthHandler(OAUTH_KEY, OAUTH_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
graph = Graph(user=NEO4J_USER, password=NEO4J_SECRET)
In [26]:
def get_followers_by_id(account_id):
    # Collect all follower ids for the given account id, one cursor page at a time
    ids = []
    for page in tweepy.Cursor(api.followers_ids, user_id=account_id).pages():
        print("FOLLOWERS: Next page for %s" % account_id)
        ids.extend(page)
    return ids

def get_friends_by_id(account_id):
    # Collect all friend ids (accounts this user follows) for the given account id
    ids = []
    for page in tweepy.Cursor(api.friends_ids, user_id=account_id).pages():
        print("FRIENDS: Next page for %s" % account_id)
        ids.extend(page)
    return ids

def get_friends(account):
    # Same as above, but looked up by screen name instead of numeric id
    ids = []
    for page in tweepy.Cursor(api.friends_ids, screen_name=account).pages():
        print("Next page for %s" % account)
        ids.extend(page)
    return ids

def chunks(l, n):
    # Split a list into consecutive chunks of at most n items
    for i in range(0, len(l), n):
        yield l[i:i + n]
In [27]:
friend_ids = {}
for account in seeds:
    friend_ids[account] = get_friends(account)

# Friends that each pair of seed accounts has in common
commons = {}
for first in seeds:
    for second in seeds:
        if first != second:
            commons[(first, second)] = list(set(friend_ids[first]) & set(friend_ids[second]))
In [28]:
# Union of the friend lists of all seed accounts
all_users = friend_ids[seeds[0]]
for name in seeds:
    all_users = list(set(all_users) | set(friend_ids[name]))
In [29]:
display("Common users: {0}".format(len(all_users)))
html = ["<table width=100%>"]
html.append('<tr><td></td>')
for name in seeds:
html.append('<td>{0}</td>'.format(name))
html.append('</tr>')
for first in seeds:
html.append('<tr><td>{0}</td>'.format(first))
for second in seeds:
if first != second:
html.append('<td>{0}</td>'.format(len(commons[(first,second)])))
else:
html.append('<td>x</td>')
html.append("</tr>")
html.append('</table>')
HTML(''.join(html))
Out[29]:
In [30]:
graph.run("CREATE CONSTRAINT ON (u:UserRes) ASSERT u.id IS UNIQUE")
processed_users = []
Out[30]:
In [31]:
for user_id in all_users:
    if user_id not in processed_users:
        user = Node("UserRes", id=user_id)
        graph.merge(user)
        try:
            # Incoming edges: accounts from our set that follow this user
            for friend_id in get_followers_by_id(user_id):
                if friend_id in all_users:
                    friend = Node("UserRes", id=friend_id)
                    graph.merge(friend)
                    graph.merge(Relationship(friend, "FRIEND_OF", user))
            # Outgoing edges: accounts from our set that this user follows
            for friend_id in get_friends_by_id(user_id):
                if friend_id in all_users:
                    friend = Node("UserRes", id=friend_id)
                    graph.merge(friend)
                    graph.merge(Relationship(user, "FRIEND_OF", friend))
        except tweepy.TweepError:
            print("User {0} has protected followers/friends".format(user_id))
        processed_users.append(user_id)
        print(float(len(processed_users)) / float(len(all_users)) * 100.0)
In [32]:
query = """
MATCH (user1:UserRes)-[:FRIEND_OF]->(user2:UserRes),
(user2:UserRes)-[:FRIEND_OF]->(user1)
RETURN user1.id, user2.id
"""
data = graph.run(query)
ig = IGraph.TupleList(data, weights=False)
# Each mutual follow appears twice in the query result; collapse duplicate edges,
# summing their widths
ig.es["width"] = 1
ig.simplify(combine_edges={"width": "sum"})
Out[32]:
In [33]:
dendrogram = ig.community_fastgreedy()
clusters = dendrogram.as_clustering()
In [34]:
print("Found %d clusters" % len(clusters))
In [40]:
# Vertex "name" holds the Twitter user id returned by the Cypher query
nodes = [{"id": node.index, "name": node["name"]} for node in ig.vs]
for node in nodes:
    node["cluster"] = clusters.membership[node["id"]]
nodes_df = pd.DataFrame(nodes)
edges = [{"source": x[0], "target": x[1]} for x in ig.get_edgelist()]
edges_df = pd.DataFrame(edges)
# Number of edges per source vertex
edges_counts = edges_df.groupby('source').count().reset_index().rename(columns={'target': 'count'})
In [37]:
nodes_df.groupby('cluster').count()
Out[37]:
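The full groupby table is easier to read when the clusters are ranked by size; a small sketch added for illustration (not part of the original run):
In [ ]:
# Top clusters by number of member vertices
nodes_df['cluster'].value_counts().head()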
We have only two clusters with a significant user count. Let's check the first one.
In [50]:
first_cluster = nodes_df[nodes_df["cluster"] == 0][["id", "name"]]
In [55]:
first_cluster_counts = first_cluster.set_index('id').join(edges_counts.set_index('source')).reset_index()
In [59]:
first_cluster_counts["count"].hist()
Out[59]:
In [73]:
for group in range(20):
    start = group * 100
    stop = (group + 1) * 100
    users_slice = first_cluster_counts[(first_cluster_counts["count"] > start) & (first_cluster_counts["count"] < stop)]
    print("Users with counts from %d to %d: %d" % (start, stop, users_slice.count()[0]))
    display(users_slice[:10])
It looks like most bot accounts have a friend count between 1200 and 1900.
Let's filter on that range.
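Before applying the filter, a quick sanity check of that claim (a sketch added for illustration, not part of the original run) is to compute what share of the first cluster falls in that band:
In [ ]:
# Fraction of first-cluster accounts whose count lies in the 1200-1900 band
in_band = first_cluster_counts["count"].between(1200, 1900)
print("Share of first cluster in the 1200-1900 band: %.1f%%" % (in_band.mean() * 100))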
In [74]:
filtered_bots = first_cluster_counts[(first_cluster_counts["count"] > 1200) & (first_cluster_counts["count"] < 1900)]
In [75]:
print("We found %s bots in first approximation" % filtered_bots.count()[0])
Now collect full profile information for these accounts and look for correlations.
In [98]:
first_cluster_bots = []
# The vertex "name" field is the numeric Twitter user id;
# lookup_users accepts at most 100 ids per call, hence the chunking
for group in chunks(filtered_bots["name"].values, 100):
    for user in api.lookup_users(user_ids=list(group)):
        first_cluster_bots.append(user)
In [104]:
locations = [user.location for user in first_cluster_bots]
In [119]:
first_cluster_bots[0].favourites_count
Out[119]:
In [121]:
possible_bot_users = pd.DataFrame([{'name': user.name, 'id': user.id, 'location': user.location, 'screen_name': user.screen_name, 'followers': user.followers_count, 'friends': user.friends_count, 'created_at': user.created_at, 'favorites': user.favourites_count} for user in first_cluster_bots])
In [123]:
possible_bot_users.hist()
Out[123]:
In [142]:
possible_bot_users[["id", "location"]].groupby('location').count().plot(kind='bar')
Out[142]:
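If the bar chart is hard to read, the same information can be had numerically with value_counts; a sketch added for illustration (not part of the original run):
In [ ]:
# Most frequent profile locations among the suspected bots
possible_bot_users['location'].value_counts().head()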
OK, we have two dominant location values: Moscow and New York. Let's split the dataset.
In [145]:
moscow_users = possible_bot_users[possible_bot_users["location"] == u'Москва']
In [147]:
moscow_users.hist()
Out[147]:
In [162]:
moscow_users[:10]
Out[162]:
Now let's check the NY users.
In [155]:
ny_users = possible_bot_users[possible_bot_users["location"] == u'New York, USA']
In [160]:
ny_users.hist()
Out[160]:
In [161]:
ny_users[:10]
Out[161]:
In [163]:
print("Moscow bots: %d, NY bots: %d, Total: %d" % (moscow_users.count()[0], ny_users.count()[0], moscow_users.count()[0] + ny_users.count()[0]))
Now export the Moscow and NY users to CSV.
In [167]:
pd.concat([ny_users, moscow_users]).to_csv("./moscow_ny_bots.csv", encoding='utf8')