In [7]:
import pandas as pd
import os, sys
import csv
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
In [17]:
# Google Group names are the part of each mbox filename before the first '.'.
# Hidden files (e.g. '.DS_Store') produce an empty stem and are skipped, the set
# drops repeated stems, and sorting makes the concatenation order reproducible
# (os.listdir returns entries in arbitrary order).
googlegroups = sorted({
    group.split('.')[0]
    for group in os.listdir('../data/mbox/')
    if group.split('.')[0]
})

# Load the per-group edge list for every group found above.
gg_dfs = []
for group in googlegroups:
    gg_df = pd.read_csv('../data/edges/' + group + '-edges.csv')
    gg_dfs.append(gg_df)  # previously appended `gg_sf`, an undefined name

# Stack everything into one edge list. The per-file row index carries no
# information, so it is reset here and left out of the CSV; otherwise it is
# read back later as a stray 'Unnamed: 0' column.
gg_edgelist = pd.concat(gg_dfs, ignore_index=True)
gg_edgelist.to_csv('../data/network/plots_edgelist.csv', index=False)
In [ ]:
# Reload the combined edge list from disk as the working DataFrame.
edges = pd.read_csv(
    filepath_or_buffer='../data/network/plots_edgelist.csv',
)
In [ ]:
def disambiguate(user, names):
    """Collapse an alias onto the first name of its alias group.

    Parameters
    ----------
    user
        The value to look up.
    names
        A sequence of names treated as one alias group; its first element is
        the replacement value.

    Returns
    -------
    ``names[0]`` when ``user`` is a member of ``names``, otherwise ``user``
    unchanged (this includes the case of an empty ``names``).
    """
    if user in names:
        return names[0]
    return user
# Each inner list is one alias group; disambiguate() rewrites any member of a
# group to that group's first entry.
copies = [['ANONYMIZED']]

# Apply every alias group to both endpoints of each edge.
for names in copies:
    edges['source'] = edges['source'].apply(disambiguate, args=(names,))
    edges['target'] = edges['target'].apply(disambiguate, args=(names,))
In [ ]:
# Write the current `edges` frame to plots_edgelist.csv without the row index.
edges.to_csv(
    path_or_buf='../data/network/plots_edgelist.csv',
    index=False,
)
In [ ]:
# NOTE(review): `Graph` is not imported in any cell visible here; it looks like
# python-igraph's Graph class -- confirm and add the import to the first cell.

# csv.DictReader expects an iterable of lines (an open file). Given the path
# string itself it iterates over the characters of the path and never reads
# the file, so the file is opened explicitly here.
with open('../data/network/plots_edgelist.csv', newline='') as edge_file:
    reader = csv.DictReader(edge_file)
    # Materialise the rows while the file is still open; the reader is lazy.
    edge_rows = list(reader)

# One directed edge per CSV row; vertex names are taken from the 'source' and
# 'target' columns, and any other columns become edge attributes.
g = Graph.DictList(edges=edge_rows, directed=True, edge_foreign_keys=('source','target'), vertices=None)

# Give every row weight 1, then merge parallel edges by summing their weights
# so that 'weight' counts how many rows linked each source -> target pair.
g.es['weight'] = 1
g.simplify(combine_edges={'weight':'sum'})
In [ ]:
# Per-vertex metrics, stored as vertex attributes on `g`.

# Degree counts.
g.vs['degree'] = g.degree()
g.vs['in'] = g.indegree()
g.vs['out'] = g.outdegree()

# Coreness overall and by direction. The mode is given as a string so the
# cell does not depend on bare IN / OUT constants, which are not imported in
# any cell shown here.
g.vs['core'] = g.coreness()
g.vs['core-in'] = g.coreness(mode='in')
g.vs['core-out'] = g.coreness(mode='out')

# Weighted centrality measures.
# NOTE(review): closeness and betweenness presumably treat 'weight' as an edge
# length, so heavier edges would count as "farther" -- confirm this is intended.
g.vs['closeness'] = g.closeness(weights='weight')
g.vs['betweenness'] = g.betweenness(weights='weight')
g.vs['clustering_coef'] = g.transitivity_local_undirected(weights='weight')
g.vs['eigen_centrality'] = g.eigenvector_centrality(weights='weight')

# knn() returns a pair; only its first element is stored per vertex.
g.vs['avg_neighborhood'] = g.knn(weights='weight')[0]
g.vs['neighborhood'] = g.neighborhood_size()

g.vs['pagerank'] = g.pagerank(weights='weight')
g.vs['hub'] = g.hub_score(weights='weight')
# Was weights='weights': the only edge attribute assigned in this notebook is
# 'weight', which every other call here already uses.
g.vs['authority'] = g.authority_score(weights='weight')
In [ ]:
# Dump one CSV row per vertex with one column per vertex attribute.
# newline='' hands line-ending control to the csv module, as its documentation
# requires; without it, extra blank rows can appear on some platforms.
with open('vertex_attributes.csv', 'w', newline='') as csvfile:
    fieldnames = g.vertex_attributes()
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    # v.attributes() is a dict keyed by attribute name, matching `fieldnames`.
    writer.writerows(v.attributes() for v in g.vs)