Install: pip install "graphistry[igraph]"
Note: "pip install igraph" installs the wrong package. If you are installing manually, use "pip install python-igraph" instead.
In [1]:
from __future__ import print_function
from io import open
import pandas as pd
import igraph # Install Igraph with pip install python-igraph
import graphistry
#graphistry.register(key='MY_API_KEY', server='labs.graphistry.com') #https://www.graphistry.com/api-request
In [2]:
# Read the character list. The parsing below expects every line to look like
# "<label> <id>: <name>": the integer id is the second space-separated token
# before the first ':', and the name is everything after the first ': ' up to
# the newline.
with open('../../data/characters.txt', encoding="latin-1") as f:
    lines = list(f)

heroes = pd.DataFrame(
    [
        (
            int(line.split(':')[0].split(' ')[1]),
            line.split(': ', 1)[1].split('\n')[0],
        )
        for line in lines
    ],
    columns=['hero_id', 'hero_name'],
)
print('#Heroes:', len(heroes))
heroes.head(3)
Out[2]:
In [3]:
# Read the comic list. The parsing below expects every line to look like
# "<label> <id>: <name>": the integer id is the second space-separated token
# before the first ':', and the name is everything after the first ': ' up to
# the newline.
with open('../../data/comics.txt', encoding="latin-1") as f:
    lines = list(f)

comics = pd.DataFrame(
    [
        (
            int(line.split(':')[0].split(' ')[1]),
            line.split(': ', 1)[1].split('\n')[0],
        )
        for line in lines
    ],
    columns=['comic_id', 'comic_name'],
)
print('#Comics: ', len(comics))
comics.head(3)
Out[3]:
In [4]:
# Keep only the lines after the first len(heroes) + len(comics) + 2 lines of
# the file.
# NOTE(review): presumably the file starts with the hero and comic listings
# plus two header lines -- confirm against the data file.
with open('../../data/appearances.txt', encoding="latin-1") as f:
    lines = list(f)[len(heroes) + len(comics) + 2:]
def expand(line):
    """Turn one whitespace-separated line of integers into (first, other) pairs.

    The first integer on the line is paired with every following integer; the
    caller loads the pairs into a frame with columns ['hero', 'comic'].

    Args:
        line: text such as "1 6487 6488\n".

    Returns:
        A list of (hero, comic) int tuples; empty when the line is blank or
        holds only the leading id.

    Raises:
        ValueError: if a token is not an integer.
    """
    # split() with no argument collapses runs of whitespace and drops the
    # trailing newline, so a trailing space or a blank line no longer yields
    # an empty token that int() rejects.
    parts = [int(token) for token in line.split()]
    if not parts:
        return []
    hero, comic_ids = parts[0], parts[1:]
    return [(hero, comic) for comic in comic_ids]
# Flatten the per-line pair lists from expand() into one (hero, comic) row each.
appearences = pd.DataFrame(
    [pair for line in lines for pair in expand(line)],
    columns=['hero', 'comic'],
)
appearences.head(3)
Out[4]:
In [6]:
# You may need to install numexpr: pip install numexpr

# Self-join the appearances on comic to pair up heroes sharing a comic, keep
# only comics present in `comics`, and keep one orientation of each pair
# (hero_x > hero_y), which also drops self-pairs.
coappearences = (
    appearences
    .merge(appearences, on='comic')
    .merge(comics, left_on='comic', right_on='comic_id')
    .loc[:, ['hero_x', 'hero_y']]
    .query('hero_x > hero_y')
)

# One row per hero pair, with `counts` = number of rows that pair had in
# `coappearences` (aligned on the (hero_x, hero_y) index).
unique_coappearences = (
    coappearences
    .drop_duplicates(['hero_x', 'hero_y'])
    .set_index(['hero_x', 'hero_y'])
    .assign(counts=coappearences.groupby(['hero_x', 'hero_y']).size())
    .reset_index()
)
print('#edges', len(unique_coappearences))
unique_coappearences.head(3)
Out[6]:
In [7]:
# Base plotter: edges run from column hero_x to column hero_y, and the
# `counts` column is bound as the edge title.
g = graphistry.bind(
    source='hero_x',
    destination='hero_y',
    edge_title='counts',
)
In [8]:
# Plot the co-appearance edge table using the bindings configured on `g`.
g.plot(unique_coappearences)
Out[8]:
In [9]:
# Here we are using two dataframes, one for edges and one for nodes.
# The node table is keyed by hero_id, with hero_name bound as the point title.
g2 = g.bind(
    node='hero_id',
    point_title='hero_name',
)
In [10]:
# Plot again, this time also passing the heroes table so the node bindings
# set on `g2` (hero_id / hero_name) have data to draw from.
g2.plot(unique_coappearences, heroes)
Out[10]:
In [11]:
#Warning: slow
# Convert the edge table to an undirected igraph graph, run Infomap community
# detection on it, then convert the graph back to edge/node dataframes.
ig = g2.pandas2igraph(unique_coappearences, directed=False)
clusters = ig.community_infomap()
i_edges, i_nodes = g2.igraph2pandas(ig)
print('#clusters', len(set(clusters.membership)))
In [12]:
# Attach each node's cluster id to its row. Both the membership list and
# i_nodes are joined on their positional row number ('denseid'), then hero
# names are added by hero_id.
nodes_colored = (
    pd.DataFrame({'cluster': clusters.membership})
    .reset_index()
    .rename(columns={'index': 'denseid'})
    .merge(
        i_nodes.reset_index().rename(columns={'index': 'denseid'}),
        on='denseid',
    )
    .merge(heroes, left_on='hero_id', right_on='hero_id')
)
print('#colored nodes', len(nodes_colored))
nodes_colored.head(3)
Out[12]:
In [13]:
# Fold the cluster ids into 9 color buckets (cluster id modulo 9).
# Computed column-wise: a row-wise apply(axis=1) is much slower and returns a
# DataFrame instead of a Series when the frame is empty, which breaks the
# column assignment.
nodes_colored['color'] = nodes_colored['cluster'] % 9
# Per color bucket, count the distinct values in every other column.
nodes_colored.pivot_table(index=['color'], aggfunc=lambda x: len(x.unique()))
Out[13]:
In [15]:
# Extend the node-aware plotter: point color comes from the `color` column and
# edge weight from the `counts` column.
g3 = g2.bind(
    point_color='color',
    edge_weight='counts',
)
In [16]:
# Plot the full edge table with the cluster-colored node table.
g3.plot(unique_coappearences, nodes_colored)
Out[16]:
In [17]:
# Count distinct values per cluster; the distinct hero_id count is used as the
# cluster size. Keep only clusters with more than 100 heroes.
big_clusters = (
    nodes_colored
    .pivot_table(index=['cluster'], aggfunc=lambda x: len(x.unique()))
    .rename(columns={'hero_id': 'cluster_size'})
    .query('cluster_size > 100')
    .reset_index()
    .loc[:, ['cluster', 'cluster_size']]
)
print('# big clusters', len(big_clusters))
big_clusters.head(3)
Out[17]:
In [18]:
# Inner join on cluster: keeps only nodes that belong to one of the big
# clusters and adds that cluster's size to each row.
good_nodes = pd.merge(nodes_colored, big_clusters, on='cluster')
print('# nodes', len(good_nodes))
good_nodes.head(3)
Out[18]:
In [19]:
# Keep only edges whose two endpoints are both in good_nodes: the two inner
# joins drop any edge with an endpoint missing from that table.
good_edges = (
    unique_coappearences
    .merge(good_nodes, left_on='hero_x', right_on='hero_id')
    .merge(good_nodes, left_on='hero_y', right_on='hero_id')
    .loc[:, ['hero_x', 'hero_y', 'counts']]
)
print('# edges', len(good_edges))
good_edges.head(3)
Out[19]:
In [20]:
# Plot only the nodes in big clusters and the edges between them.
g3.plot(good_edges, good_nodes)
Out[20]:
In [21]:
#label edges whether they stay inside a cluster or connect nodes in different clusters
# Look up the cluster of each endpoint (cluster_x for hero_x, cluster_y for
# hero_y), then derive the flag and the weight column-wise. The row-wise
# apply(axis=1) used before is slow and returns a DataFrame on an empty frame,
# so it failed whenever no cluster passed the size filter.
good_edges2 = (
    good_edges
    .merge(
        good_nodes[['cluster', 'hero_id']].rename(columns={'cluster': 'cluster_x'}),
        left_on='hero_x', right_on='hero_id')
    .merge(
        good_nodes[['cluster', 'hero_id']].rename(columns={'cluster': 'cluster_y'}),
        left_on='hero_y', right_on='hero_id')
    .assign(is_inner=lambda df: df['cluster_x'] == df['cluster_y'])
    #bind to edge_weight: 10 for edges inside a cluster, 8 for edges between clusters
    .assign(weight=lambda df: df['is_inner'].map({True: 10, False: 8}))
    .loc[:, ['hero_x', 'hero_y', 'counts', 'is_inner', 'weight']]
)
good_edges2[:3]
Out[21]:
In [22]:
# Re-bind edge weight to the `weight` column (overriding the `counts` binding
# on g3 for this plot only) and plot the labelled edges.
g3.bind(edge_weight='weight').plot(good_edges2, good_nodes)
Out[22]:
In [23]:
# Per-vertex shell index as reported by igraph; print how many distinct
# values occur.
shells = ig.shell_index()
print('#shells', len(set(shells)))
In [24]:
# Attach each node's shell value to the colored node table, joining on the
# positional row number ('denseid').
nodes_shelled = (
    pd.DataFrame({'shell': shells})
    .reset_index()
    .rename(columns={'index': 'denseid'})
    .merge(nodes_colored, on='denseid')
)
print('#shelled nodes', len(nodes_shelled))
nodes_shelled.head(3)
Out[24]:
In [25]:
# Plot the full edge table with the node table that carries the `shell` column.
# NOTE(review): g3 still binds point_color to `color`; `shell` is not bound to
# any visual attribute here -- confirm whether that is intended.
g3.plot(unique_coappearences, nodes_shelled)
Out[25]:
In [ ]: