In [ ]:
from __future__ import print_function, unicode_literals
from dbpedia_utils import iter_entities_from
from collections import defaultdict, Counter
import pandas as pd
import numpy as np
import json
import gzip
import dbpedia_config
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [ ]:
source_folder = dbpedia_config.DATA_FOLDER
target_folder = dbpedia_config.TARGET_FOLDER
main_language = dbpedia_config.MAIN_LANGUAGE
links_file = '{0}/page_links_{1}.nt.bz2'.format(source_folder, main_language)
First, we load the person data, so that we process only the biographies present in our dataset.
In [ ]:
gender_data = pd.read_csv('{0}/person_data_{1}.csv.gz'.format(target_folder, main_language), encoding='utf-8', index_col='uri').loc[:,('gender', 'birth_year')].reset_index().copy()
gender_data.head()
At this point you can decide whether you want to process the entire biography network or, for instance, consider only biographies of people born before or after 1900. It's up to you :)
In this notebook, we analyze the biographies of people born before 1900.
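For instance, a sketch of the complementary cohort filter (people born in 1900 or later), left commented out because it is not used in this notebook:
In [ ]:
# hypothetical alternative (not used below): keep the post-1900 cohort instead
# gender_data = gender_data[gender_data.birth_year >= 1900].reset_index(drop=True)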
In [ ]:
gender_data = gender_data[(gender_data.birth_year < 1900) & (gender_data.birth_year > 0)].reset_index(drop=True)
gender_data.shape
Note that, instead of indexing by URI as we do in the other notebooks, this time we index by position, because vertices in the graph are identified by integers.
Hence, we build a map from URIs to integer indexes.
In [ ]:
gender_data.head()
In [ ]:
uri_to_i = {row.uri: idx for idx, row in gender_data.iterrows()}
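If you later need to translate vertex indexes back to URIs (for instance, when inspecting ranked vertices by hand), a hypothetical helper, not used below, is the inverted map:
In [ ]:
# hypothetical helper (not used below): reverse lookup from vertex index to URI
i_to_uri = {i: uri for uri, i in uri_to_i.items()}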
In [ ]:
i_to_gender = {idx: row.gender[0] for idx, row in gender_data.iterrows()}
We use the graph-tool library to create our graph.
In [ ]:
import graph_tool
def build_graph(dataframe, uri_to_i):
    graph_data = {'graph': graph_tool.Graph(), 'id_map': {}, 'g_map': {}, 'pk_to_i': {}}
    graph = graph_data['graph']

    print('dataframe shape', dataframe.shape)
    graph.add_vertex(dataframe.shape[0])

    for entity in iter_entities_from(links_file):
        resource = entity['resource']

        if resource in uri_to_i:
            #print(resource)
            links = []
            for other in entity['wikiPageWikiLink']:
                if other in uri_to_i:
                    #print('->', other)
                    links.append(other)

            src = uri_to_i[resource]
            for link in links:
                #print(src, link)
                graph.add_edge(src, uri_to_i[link])

    graph_data['gender'] = graph.new_vertex_property('string')
    gender = graph_data['gender']
    print(graph.num_vertices(), graph.num_edges())

    for i, row in dataframe.iterrows():
        gender[i] = row.gender[0]

    graph.vertex_properties['gender'] = gender
    return graph_data

graph_data = build_graph(gender_data, uri_to_i)
Not all nodes have links. In our analysis, we consider only nodes connected to at least one other node; we hide the isolated ones using a vertex filter.
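A minimal sketch on a toy graph (not the biography network) of how such a boolean vertex filter hides isolated vertices without copying or modifying the graph:
In [ ]:
# toy example, assuming the same graph_tool API used in this notebook
toy = graph_tool.Graph()
toy.add_vertex(4)                  # vertices 0..3
toy.add_edge(0, 1)                 # only vertices 0 and 1 are connected
keep = toy.new_vertex_property('bool')
for v in toy.vertices():
    keep[v] = (v.in_degree() + v.out_degree()) > 0
toy.set_vertex_filter(keep)
print(toy.num_vertices())          # 2: the isolated vertices 2 and 3 are hidden
toy.clear_filters()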
In [ ]:
from __future__ import division
from cytoolz import frequencies

def count_values(graph, property_name):
    values = []
    for v in graph.vertices():
        values.append(graph.vertex_properties[property_name][v])
    return frequencies(values)

def clean_isolated_nodes(graph_data):
    graph = graph_data['graph']

    # a vertex is kept if its total (in + out) degree is greater than zero
    degree_values = graph.degree_property_map('total').get_array()
    degree_flag = degree_values.astype(bool)
    flags = graph.new_vertex_property('bool', vals=degree_flag)

    graph.set_vertex_filter(flags)
    print('to keep', np.sum(flags.get_array()), 'from', graph.num_vertices())
    graph_data['kept_frequencies'] = count_values(graph, 'gender')
    print(graph_data['kept_frequencies'])
    graph.clear_filters()

    graph.set_vertex_filter(flags, inverted=True)
    print('to delete', np.sum(flags.get_array()), 'from', graph.num_vertices())
    graph_data['deleted_frequencies'] = count_values(graph, 'gender')
    print(graph_data['deleted_frequencies'])
    graph.clear_filters()

    graph.set_vertex_filter(flags)
    #graph.purge_vertices(in_place=True)
    #graph.clear_filters()

clean_isolated_nodes(graph_data)
We can estimate how each gender is connected to itself and to the other gender.
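For example, with the ratio used below, a biography whose outgoing links point to 3 men and 1 woman contributes $1/3 \approx 0.33$ to the list of its own gender; note that the code assigns a ratio of $0$ whenever a biography links to no men, to avoid a division by zero.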
In [ ]:
from collections import defaultdict
from collections import Counter
results = {}
def gender_stats(graph_data):
    graph = graph_data['graph']
    ratios = defaultdict(list)
    edge_counts = defaultdict(lambda: defaultdict(int))

    for src in graph.vertices():
        counts = Counter()
        for dst in src.out_neighbours():
            counts[graph.vertex_properties['gender'][dst]] += 1

        if not counts:
            continue

        key = graph.vertex_properties['gender'][src]

        if counts['m'] > 0:
            ratio = counts['f'] / float(counts['m'])
        else:
            ratio = 0.0

        ratios[key].append(ratio)
        edge_counts[key]['m'] += counts['m']
        edge_counts[key]['f'] += counts['f']
        #print src_data['label'], counts

    return ratios, edge_counts

def build_ratios(graph_data, results):
    graph = graph_data['graph']
    genders = ['m', 'f']

    print('edge fractions')
    expected_counts = graph_data['kept_frequencies']
    expected_freqs = [expected_counts['m'] / graph.num_vertices() * 100.0,
                      expected_counts['f'] / graph.num_vertices() * 100.0]
    print('expected freqs', expected_freqs)

    ratios, edge_counts = gender_stats(graph_data)
    results['gender_ratios'] = dict(ratios)
    results['gender_edge_counts'] = dict(edge_counts)
    #print('ratios', ratios)
    print('edge counts', edge_counts)

build_ratios(graph_data, results)
Now we can estimate centrality measures. In particular, we consider PageRank and in-degree.
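As a reminder, PageRank follows the standard formulation (graph_tool's pagerank uses a damping factor $d = 0.85$ by default):

$$PR(v) = \frac{1 - d}{N} + d \sum_{u \to v} \frac{PR(u)}{d^{+}(u)},$$

where $N$ is the number of vertices and $d^{+}(u)$ is the out-degree of $u$. In-degree is also computed below as a simpler, purely local notion of centrality.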
In [ ]:
import graph_tool.centrality
def centrality(graph_data, results):
    graph = graph_data['graph']

    def estimate_fraction(method, name, fraction_name):
        values = method(graph)
        graph.vertex_properties[name] = values

        sorted_pr = []
        for v in graph.vertices():
            sorted_pr.append((graph.vertex_properties[name][v], v))

        print(name, graph.num_vertices(), len(sorted_pr))
        # sort by the centrality value only (ties would otherwise compare vertex objects)
        sorted_pr = sorted(sorted_pr, key=lambda pair: pair[0], reverse=True)
        #print(sorted_pr)

        f_fraction = []
        print(sorted_pr[:3])
        count_w = 0.0

        for i, (node_pr, node_id) in enumerate(sorted_pr, start=1):
            #for i, (node_id, node_data) in enumerate(graph.nodes_iter(data=True), start=1):
            if graph.vertex_properties['gender'][node_id] == 'f':
                count_w += 1.0
            f_fraction.append(count_w / i)

        results[fraction_name] = f_fraction
        results['count_w'] = count_w

    estimate_fraction(graph_tool.centrality.pagerank, 'pagerank', 'f_fraction')
    estimate_fraction(lambda x: x.degree_property_map('in'), 'in_degree', 'f_id_fraction')
    return

centrality(graph_data, results)
We can plot the fraction of women present among the top-k biographies, as a function of k, when biographies are ranked by PageRank.
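Concretely, if $v_1, v_2, \ldots$ denote the biographies sorted by decreasing PageRank, the curve below shows, for each $k$,

$$f(k) = \frac{1}{k}\,\bigl|\{\, i \le k : \mathrm{gender}(v_i) = \mathrm{f} \,\}\bigr|,$$

that is, the fraction of women among the top-$k$ biographies, together with a dashed horizontal line at the overall fraction of women in the filtered network.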
In [ ]:
def plot_fraction(graph_data, results, label, fraction='f_fraction'):
    graph = graph_data['graph']
    f_fraction = results[fraction]
    count_w = results['count_w']

    #plt.figure(figsize=(6,4))
    plt.plot(np.arange(1, len(f_fraction) + 1, 1), f_fraction, '-', alpha=0.75, label=label)
    #plt.plot(np.arange(1, len(f_fraction) + 1, 1), null_f_fraction, '-', alpha=0.75, label='Null Model')
    plt.hlines([count_w / graph.num_vertices()], 1.0, graph.num_vertices(), linestyle='dashed', alpha=0.5, linewidth=0.5)
    #plt.savefig('./results/connectivity_observedpagerank_proportion.png', dpi=100, bbox_inches='tight')

plot_fraction(graph_data, results, 'Pre-1900')
plt.xlabel('# of Biographies in the top-k results')
plt.ylabel('Fraction of Women')
plt.xscale('log')
We can also display the top biographies to find the most central people of each gender.
In [ ]:
def top_entities(dataframe, graph_data, results, name='pagerank', n=30):
    graph = graph_data['graph']
    genders = ['m', 'f']

    pagerank = graph.vertex_properties[name]
    gender = graph.vertex_properties['gender']

    sorted_pr = []
    for v in graph.vertices():
        sorted_pr.append((pagerank[v], v, gender[v]))

    print(name, graph.num_vertices(), len(sorted_pr))
    # sort by the centrality value only
    sorted_pr = sorted(sorted_pr, key=lambda t: t[0], reverse=True)

    top_pagerank = {}
    for g in genders:
        values = list(filter(lambda x: x[2] == g, sorted_pr))[0:n]
        top_pagerank[g] = [(dataframe.loc[int(x[1])].uri, x[0]) for x in values]

    results['top_pagerank'] = top_pagerank
    return top_pagerank

def plot_top_30(top_pagerank):
    people = []
    for m, w in zip(top_pagerank['m'], top_pagerank['f']):
        people.append({'woman': w[0], 'man': m[0], 'w_pr': w[1], 'm_pr': m[1]})

    df = pd.DataFrame.from_records(people, index=range(1, 31))

    plt.figure(figsize=(9, 12))
    plt.title('Top-30 Biographies per Gender')
    plt.plot(df.m_pr, -df.index + 31, 'o', label="Men")
    plt.plot(df.w_pr, -df.index + 31, '^', label="Women")
    plt.xlabel('PageRank')
    plt.yticks(np.arange(1, 31))
    #plt.yticks(df.index, [n[1]['name'] for n in graph.nodes_iter(data=True)])#, fontsize='xx-small', rotation=45)
    plt.ylim([0.5, 30.5])
    #plt.xlim([-0.1,1.1])
    #for i, (x0, x1) in enumerate(zip(bc_real, bc_base)):
    #    plt.arrow(x0, i, x1 - x0, 0, length_includes_head=True)
    plt.hlines(-df.index + 31, df.m_pr, df.w_pr, alpha=0.3, linestyle='dashed', linewidth=1)
    plt.legend(loc='lower right')

    ax1 = plt.gca()
    ax2 = ax1.twinx()
    ax2.grid(False)
    ax2.set_ylim(ax1.get_ylim())
    ax2.set_yticks(np.arange(1, 31))
    ax2.set_yticklabels(list(df.man)[::-1])
    ax1.set_yticklabels(list(df.woman)[::-1])

top_entities(gender_data, graph_data, results)
plot_top_30(results['top_pagerank'])
We want to compare our network with baseline graphs that are, by construction, not biased. For more information on why we do this, you can check our paper. In particular, we focus on three baselines: a Small World (Watts-Strogatz) graph, a randomly rewired Erdős-Rényi-like graph, and a degree-sequence-preserving rewired graph.
To do so, we first need to estimate some statistics of the observed network.
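Concretely, the cell below records the number of nodes $N$ and edges $E$, an average-degree estimate $k \approx E / N$ (as computed in the code), and the mean local clustering coefficient, which for a vertex $i$ with $k_i$ neighbors is

$$C_i = \frac{\text{number of edges among the neighbors of } i}{k_i (k_i - 1)}.$$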
In [ ]:
import graph_tool.clustering
def append_graph_stats(graph_data, results):
    graph = graph_data['graph']

    results['n_nodes'] = graph.num_vertices()
    results['n_edges'] = graph.num_edges()
    print('directed', results['n_nodes'], results['n_edges'])

    results['u_n_nodes'] = graph.num_vertices()
    results['u_n_edges'] = graph.num_edges()

    k = (results['u_n_edges'] - (results['u_n_edges'] % 2)) / results['u_n_nodes']
    results['k'] = k
    print('k', k, results['u_n_nodes'], results['u_n_edges'])

    clust_prop = graph_tool.clustering.local_clustering(graph)
    clust_coeff = clust_prop.get_array().mean().tolist()
    results['clust_coeff'] = clust_coeff
    print('coeff', results['clust_coeff'])

append_graph_stats(graph_data, results)
We want to estimate the $\beta$ (rewiring probability) parameter for the Small World graph we are going to generate, so that its clustering coefficient matches the observed one. Note that we need to use networkx because graph_tool does not have a Small World generator.
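In other words, given $N$ nodes and average degree $k$, we look for the $\beta \in [0, 1]$ whose Watts-Strogatz graph matches the observed clustering coefficient, i.e., the root of

$$g(\beta) = C_{WS}(N, k, \beta) - C_{obs},$$

which we find with Brent's method (scipy.optimize.brenth).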
In [ ]:
from scipy.optimize import brenth
import networkx as nx
def estimate_beta(graph_data, results):
    k = int(np.round(results['k']))
    print('k', k)

    # unused alternative that measures the clustering coefficient with graph_tool
    # instead of networkx; kept for reference
    def find_clustering(x):
        g_nx = nx.watts_strogatz_graph(results['n_nodes'], k, x)
        nx.write_graphml(g_nx, '{0}/temp_graph.graphml'.format(target_folder))
        g = graph_tool.Graph()
        g.load('{0}/temp_graph.graphml'.format(target_folder))
        g_prop = graph_tool.clustering.local_clustering(g)
        g_coeff = g_prop.get_array().mean().tolist()
        print(g_coeff - results['clust_coeff'])
        return g_coeff - results['clust_coeff']

    beta = brenth(lambda x: (nx.cluster.average_clustering(nx.watts_strogatz_graph(results['n_nodes'], k, x)) - results['clust_coeff']), 0.0, 1.0)
    print('beta', beta)
    results['beta'] = beta

estimate_beta(graph_data, results)
In [ ]:
import tempfile
import random
def prepare_baseline(graph, base_data):
    graph_data = {'graph': graph, 'id_map': base_data['id_map'],
                  'kept_frequencies': base_data['kept_frequencies']}
    results = {}
    build_ratios(graph_data, results)
    centrality(graph_data, results)
    append_graph_stats(graph_data, results)
    return graph_data, results

def generate_small_world(source_graph_data, source_results):
    sw_graph = nx.watts_strogatz_graph(source_results['n_nodes'], int(np.round(source_results['k'])),
                                       source_results['beta'], seed=31072010)

    handle, filename = tempfile.mkstemp(suffix='.graphml')
    nx.write_graphml(sw_graph, filename)

    g = graph_tool.Graph()
    g.load(filename)
    print(g.num_vertices(), g.num_edges())

    freqs = source_graph_data['kept_frequencies']
    shuffled = ['m'] * freqs['m'] + ['f'] * freqs['f']
    random.shuffle(shuffled)

    g.vertex_properties['gender'] = g.new_vertex_property('string')
    for i, gender in enumerate(shuffled, start=0):
        g.vertex_properties['gender'][i] = gender

    sw_data, sw_results = prepare_baseline(g, source_graph_data)
    #save_results(sw_data, sw_results, 'sw', 'Small World')
    return sw_data, sw_results

sw, sw_results = generate_small_world(graph_data, results)
We use graph_tool's generation module to rewire our network and build the baseline random networks.
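A minimal sketch on a toy graph, assuming the same graph_tool version and model names used in this notebook: the 'uncorrelated' model preserves each vertex's in- and out-degree while shuffling who links to whom, whereas 'erdos' only preserves the number of vertices and edges.
In [ ]:
# toy example (not the biography network)
import graph_tool.generation

toy = graph_tool.Graph()
toy.add_vertex(5)
for s, t in [(0, 1), (0, 2), (1, 2), (2, 3), (3, 4)]:
    toy.add_edge(s, t)

before = [(v.in_degree(), v.out_degree()) for v in toy.vertices()]
graph_tool.generation.random_rewire(toy, model='uncorrelated', n_iter=10)
after = [(v.in_degree(), v.out_degree()) for v in toy.vertices()]
print(before == after)  # True: each vertex keeps its in- and out-degree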
In [ ]:
import graph_tool.generation
def random_graph(source_graph_data, source_results, model='uncorrelated', n_iter=1):
    clone = source_graph_data['graph'].copy()
    graph_tool.generation.random_rewire(clone, model=model, n_iter=n_iter)
    base_data, base_results = prepare_baseline(clone, source_graph_data)
    return base_data, base_results

def populate_rand(rand, rand_results, model='erdos', n_iter=1):
    # unused helper kept for reference; stores a rewired baseline under the 'pre' key
    random_g_data, random_g_results = random_graph(graph_data, results, model=model, n_iter=n_iter)
    rand['pre'] = random_g_data
    rand_results.update(random_g_results)

rand, rand_results = random_graph(graph_data, results, model='erdos')
deg_seq, deg_seq_results = random_graph(graph_data, results, model='uncorrelated', n_iter=3)
And now we can plot, for each network, the fraction of women among the top-ranked biographies. In this way, we can see how biased the estimated centrality is in comparison to the unbiased baselines.
In [ ]:
def plot_distributions(loc='best'):
    plot_fraction(graph_data, results, 'PageRank (OBS)')
    plot_fraction(graph_data, results, 'In-Degree (OBS)', fraction='f_id_fraction')
    plot_fraction(sw, sw_results, 'Small World')
    plot_fraction(rand, rand_results, 'Random')
    plot_fraction(deg_seq, deg_seq_results, 'Degree Sequence')
    plt.legend(loc=loc, fontsize='x-small')

sns.set_palette('Set2')
plot_distributions()
plt.xlim([10, 1000000])
plt.ylim([0, 0.3])
plt.xscale('log')
#plt.xlabel('Sorted Top-k Biographies')
plt.ylabel('Fraction of Women')
plt.title('0 -- 1900')
In [ ]: