In [ ]:
from __future__ import print_function, unicode_literals
from dbpedia_utils import iter_entities_from
from collections import defaultdict, Counter
import pandas as pd
import numpy as np
import json
import gzip
import dbpedia_config
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [ ]:
source_folder = dbpedia_config.DATA_FOLDER
target_folder = dbpedia_config.TARGET_FOLDER
main_language = dbpedia_config.MAIN_LANGUAGE
links_file = '{0}/page_links_{1}.nt.bz2'.format(source_folder, main_language)
First, we load the person data, so that we process only the biographies present in our dataset.
In [ ]:
gender_data = pd.read_csv('{0}/person_data_{1}.csv.gz'.format(target_folder, main_language), encoding='utf-8', index_col='uri').loc[:,('gender', 'birth_year')].reset_index().copy()
gender_data.head()
At this point you can decide whether you want to process the entire biography network or, for instance, consider only biographies of people born before or after 1900. It's up to you :)
In this notebook, we analyze the biographies of people born before 1900.
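For instance, a sketch of the complementary cohort filter (people born in 1900 or later), left commented out because it is not used in this notebook:
In [ ]:
# hypothetical alternative (not used below): keep the post-1900 cohort instead
# gender_data = gender_data[gender_data.birth_year >= 1900].reset_index(drop=True)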
In [ ]:
gender_data = gender_data[(gender_data.birth_year < 1900) & (gender_data.birth_year > 0)].reset_index(drop=True)
gender_data.shape
Note that, instead of indexing by URI as we do in the other notebooks, this time we index by position, because vertices in the graph are identified by integers.
Hence, we build a map from URIs to integer indexes.
In [ ]:
gender_data.head()
In [ ]:
uri_to_i = {row.uri: idx for idx, row in gender_data.iterrows()}
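If you later need to translate vertex indexes back to URIs (for instance, when inspecting ranked vertices by hand), a hypothetical helper, not used below, is the inverted map:
In [ ]:
# hypothetical helper (not used below): reverse lookup from vertex index to URI
i_to_uri = {i: uri for uri, i in uri_to_i.items()}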
In [ ]:
i_to_gender = {idx: row.gender[0] for idx, row in gender_data.iterrows()}
We use the graph-tool library to create our graph.
In [ ]:
import graph_tool
def build_graph(dataframe, uri_to_i):
    graph_data = {'graph': graph_tool.Graph(), 'id_map': {}, 'g_map': {}, 'pk_to_i': {}}
    graph = graph_data['graph']

    print('dataframe shape', dataframe.shape)
    graph.add_vertex(dataframe.shape[0])

    for entity in iter_entities_from(links_file):
        resource = entity['resource']

        if resource in uri_to_i:
            #print(resource)
            links = []
            for other in entity['wikiPageWikiLink']:
                if other in uri_to_i:
                    #print('->', other)
                    links.append(other)

            src = uri_to_i[resource]
            for link in links:
                #print(src, link)
                graph.add_edge(src, uri_to_i[link])

    graph_data['gender'] = graph.new_vertex_property('string')
    gender = graph_data['gender']
    print(graph.num_vertices(), graph.num_edges())

    for i, row in dataframe.iterrows():
        gender[i] = row.gender[0]

    graph.vertex_properties['gender'] = gender
    return graph_data

graph_data = build_graph(gender_data, uri_to_i)
Not all nodes have links. In our analysis, we consider only nodes connected to at least one other node; we hide the isolated ones using a vertex filter.
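A minimal sketch on a toy graph (not the biography network) of how such a boolean vertex filter hides isolated vertices without copying or modifying the graph:
In [ ]:
# toy example, assuming the same graph_tool API used in this notebook
toy = graph_tool.Graph()
toy.add_vertex(4)                  # vertices 0..3
toy.add_edge(0, 1)                 # only vertices 0 and 1 are connected
keep = toy.new_vertex_property('bool')
for v in toy.vertices():
    keep[v] = (v.in_degree() + v.out_degree()) > 0
toy.set_vertex_filter(keep)
print(toy.num_vertices())          # 2: the isolated vertices 2 and 3 are hidden
toy.clear_filters()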
In [ ]:
from __future__ import division
from cytoolz import frequencies

def count_values(graph, property_name):
    values = []
    for v in graph.vertices():
        values.append(graph.vertex_properties[property_name][v])
    return frequencies(values)

def clean_isolated_nodes(graph_data):
    graph = graph_data['graph']

    # a vertex is kept if its total (in + out) degree is greater than zero
    degree_values = graph.degree_property_map('total').get_array()
    degree_flag = degree_values.astype(bool)
    flags = graph.new_vertex_property('bool', vals=degree_flag)

    graph.set_vertex_filter(flags)
    print('to keep', np.sum(flags.get_array()), 'from', graph.num_vertices())
    graph_data['kept_frequencies'] = count_values(graph, 'gender')
    print(graph_data['kept_frequencies'])
    graph.clear_filters()

    graph.set_vertex_filter(flags, inverted=True)
    print('to delete', np.sum(flags.get_array()), 'from', graph.num_vertices())
    graph_data['deleted_frequencies'] = count_values(graph, 'gender')
    print(graph_data['deleted_frequencies'])
    graph.clear_filters()

    graph.set_vertex_filter(flags)
    #graph.purge_vertices(in_place=True)
    #graph.clear_filters()

clean_isolated_nodes(graph_data)
We can estimate how each gender is connected to itself and to the other gender.
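For example, with the ratio used below, a biography whose outgoing links point to 3 men and 1 woman contributes $1/3 \approx 0.33$ to the list of its own gender; note that the code assigns a ratio of $0$ whenever a biography links to no men, to avoid a division by zero.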
In [ ]:
from collections import defaultdict
from collections import Counter
results = {}
def gender_stats(graph_data):
    graph = graph_data['graph']
    ratios = defaultdict(list)
    edge_counts = defaultdict(lambda: defaultdict(int))

    for src in graph.vertices():
        counts = Counter()
        for dst in src.out_neighbours():
            counts[graph.vertex_properties['gender'][dst]] += 1

        if not counts:
            continue

        key = graph.vertex_properties['gender'][src]

        if counts['m'] > 0:
            ratio = counts['f'] / float(counts['m'])
        else:
            ratio = 0.0

        ratios[key].append(ratio)
        edge_counts[key]['m'] += counts['m']
        edge_counts[key]['f'] += counts['f']
        #print src_data['label'], counts

    return ratios, edge_counts

def build_ratios(graph_data, results):
    graph = graph_data['graph']
    genders = ['m', 'f']

    print('edge fractions')
    expected_counts = graph_data['kept_frequencies']
    expected_freqs = [expected_counts['m'] / graph.num_vertices() * 100.0,
                      expected_counts['f'] / graph.num_vertices() * 100.0]
    print('expected freqs', expected_freqs)

    ratios, edge_counts = gender_stats(graph_data)
    results['gender_ratios'] = dict(ratios)
    results['gender_edge_counts'] = dict(edge_counts)
    #print('ratios', ratios)
    print('edge counts', edge_counts)

build_ratios(graph_data, results)
Now we can estimate centrality measures. In particular, we consider PageRank and in-degree.
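As a reminder, PageRank follows the standard formulation (graph_tool's pagerank uses a damping factor $d = 0.85$ by default):

$$PR(v) = \frac{1 - d}{N} + d \sum_{u \to v} \frac{PR(u)}{d^{+}(u)},$$

where $N$ is the number of vertices and $d^{+}(u)$ is the out-degree of $u$. In-degree is also computed below as a simpler, purely local notion of centrality.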
In [ ]:
import graph_tool.centrality
def centrality(graph_data, results):
    graph = graph_data['graph']

    def estimate_fraction(method, name, fraction_name):
        values = method(graph)
        graph.vertex_properties[name] = values

        sorted_pr = []
        for v in graph.vertices():
            sorted_pr.append((graph.vertex_properties[name][v], v))

        print(name, graph.num_vertices(), len(sorted_pr))
        # sort by the centrality value only (ties would otherwise compare vertex objects)
        sorted_pr = sorted(sorted_pr, key=lambda pair: pair[0], reverse=True)
        #print(sorted_pr)

        f_fraction = []
        print(sorted_pr[:3])
        count_w = 0.0

        for i, (node_pr, node_id) in enumerate(sorted_pr, start=1):
            #for i, (node_id, node_data) in enumerate(graph.nodes_iter(data=True), start=1):
            if graph.vertex_properties['gender'][node_id] == 'f':
                count_w += 1.0
            f_fraction.append(count_w / i)

        results[fraction_name] = f_fraction
        results['count_w'] = count_w

    estimate_fraction(graph_tool.centrality.pagerank, 'pagerank', 'f_fraction')
    estimate_fraction(lambda x: x.degree_property_map('in'), 'in_degree', 'f_id_fraction')
    return

centrality(graph_data, results)
We can plot the fraction of women present among the top-k biographies, as a function of k, when biographies are ranked by PageRank.
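Concretely, if $v_1, v_2, \ldots$ denote the biographies sorted by decreasing PageRank, the curve below shows, for each $k$,

$$f(k) = \frac{1}{k}\,\bigl|\{\, i \le k : \mathrm{gender}(v_i) = \mathrm{f} \,\}\bigr|,$$

that is, the fraction of women among the top-$k$ biographies, together with a dashed horizontal line at the overall fraction of women in the filtered network.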
In [ ]:
def plot_fraction(graph_data, results, label, fraction='f_fraction'):
    graph = graph_data['graph']
    f_fraction = results[fraction]
    count_w = results['count_w']

    #plt.figure(figsize=(6,4))
    plt.plot(np.arange(1, len(f_fraction) + 1, 1), f_fraction, '-', alpha=0.75, label=label)
    #plt.plot(np.arange(1, len(f_fraction) + 1, 1), null_f_fraction, '-', alpha=0.75, label='Null Model')
    plt.hlines([count_w / graph.num_vertices()], 1.0, graph.num_vertices(), linestyle='dashed', alpha=0.5, linewidth=0.5)
    #plt.savefig('./results/connectivity_observedpagerank_proportion.png', dpi=100, bbox_inches='tight')

plot_fraction(graph_data, results, 'Pre-1900')
plt.xlabel('# of Biographies in the top-k results')
plt.ylabel('Fraction of Women')
plt.xscale('log')
We can also display the top biographies to find the most central people of each gender.
In [ ]:
def top_entities(dataframe, graph_data, results, name='pagerank', n=30):
    graph = graph_data['graph']
    genders = ['m', 'f']

    pagerank = graph.vertex_properties[name]
    gender = graph.vertex_properties['gender']

    sorted_pr = []
    for v in graph.vertices():
        sorted_pr.append((pagerank[v], v, gender[v]))

    print(name, graph.num_vertices(), len(sorted_pr))
    # sort by the centrality value only
    sorted_pr = sorted(sorted_pr, key=lambda t: t[0], reverse=True)

    top_pagerank = {}
    for g in genders:
        values = list(filter(lambda x: x[2] == g, sorted_pr))[0:n]
        top_pagerank[g] = [(dataframe.loc[int(x[1])].uri, x[0]) for x in values]

    results['top_pagerank'] = top_pagerank
    return top_pagerank

def plot_top_30(top_pagerank):
    people = []
    for m, w in zip(top_pagerank['m'], top_pagerank['f']):
        people.append({'woman': w[0], 'man': m[0], 'w_pr': w[1], 'm_pr': m[1]})

    df = pd.DataFrame.from_records(people, index=range(1, 31))

    plt.figure(figsize=(9, 12))
    plt.title('Top-30 Biographies per Gender')
    plt.plot(df.m_pr, -df.index + 31, 'o', label="Men")
    plt.plot(df.w_pr, -df.index + 31, '^', label="Women")
    plt.xlabel('PageRank')
    plt.yticks(np.arange(1, 31))
    #plt.yticks(df.index, [n[1]['name'] for n in graph.nodes_iter(data=True)])#, fontsize='xx-small', rotation=45)
    plt.ylim([0.5, 30.5])
    #plt.xlim([-0.1,1.1])
    #for i, (x0, x1) in enumerate(zip(bc_real, bc_base)):
    #    plt.arrow(x0, i, x1 - x0, 0, length_includes_head=True)
    plt.hlines(-df.index + 31, df.m_pr, df.w_pr, alpha=0.3, linestyle='dashed', linewidth=1)
    plt.legend(loc='lower right')

    ax1 = plt.gca()
    ax2 = ax1.twinx()
    ax2.grid(False)
    ax2.set_ylim(ax1.get_ylim())
    ax2.set_yticks(np.arange(1, 31))
    ax2.set_yticklabels(list(df.man)[::-1])
    ax1.set_yticklabels(list(df.woman)[::-1])

top_entities(gender_data, graph_data, results)
plot_top_30(results['top_pagerank'])
We want to compare our network with baseline graphs that are, by construction, not biased. For more information on why we do this, you can check our paper. In particular, we focus on three baselines: a Small World (Watts-Strogatz) graph, a randomly rewired Erdős-Rényi-like graph, and a degree-sequence-preserving rewired graph.
To do so, we first need to estimate some statistics of the observed network.
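Concretely, the cell below records the number of nodes $N$ and edges $E$, an average-degree estimate $k \approx E / N$ (as computed in the code), and the mean local clustering coefficient, which for a vertex $i$ with $k_i$ neighbors is

$$C_i = \frac{\text{number of edges among the neighbors of } i}{k_i (k_i - 1)}.$$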
In [ ]:
import graph_tool.clustering
def append_graph_stats(graph_data, results):
    graph = graph_data['graph']

    results['n_nodes'] = graph.num_vertices()
    results['n_edges'] = graph.num_edges()
    print('directed', results['n_nodes'], results['n_edges'])

    results['u_n_nodes'] = graph.num_vertices()
    results['u_n_edges'] = graph.num_edges()

    k = (results['u_n_edges'] - (results['u_n_edges'] % 2)) / results['u_n_nodes']
    results['k'] = k
    print('k', k, results['u_n_nodes'], results['u_n_edges'])

    clust_prop = graph_tool.clustering.local_clustering(graph)
    clust_coeff = clust_prop.get_array().mean().tolist()
    results['clust_coeff'] = clust_coeff
    print('coeff', results['clust_coeff'])

append_graph_stats(graph_data, results)
We want to estimate the $\beta$ (rewiring probability) parameter for the Small World graph we are going to generate, so that its clustering coefficient matches the observed one. Note that we need to use networkx because graph_tool does not have a Small World generator.
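In other words, given $N$ nodes and average degree $k$, we look for the $\beta \in [0, 1]$ whose Watts-Strogatz graph matches the observed clustering coefficient, i.e., the root of

$$g(\beta) = C_{WS}(N, k, \beta) - C_{obs},$$

which we find with Brent's method (scipy.optimize.brenth).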
In [ ]:
from scipy.optimize import brenth
import networkx as nx
def estimate_beta(graph_data, results):
    k = int(np.round(results['k']))
    print('k', k)

    # unused alternative that measures the clustering coefficient with graph_tool
    # instead of networkx; kept for reference
    def find_clustering(x):
        g_nx = nx.watts_strogatz_graph(results['n_nodes'], k, x)
        nx.write_graphml(g_nx, '{0}/temp_graph.graphml'.format(target_folder))
        g = graph_tool.Graph()
        g.load('{0}/temp_graph.graphml'.format(target_folder))
        g_prop = graph_tool.clustering.local_clustering(g)
        g_coeff = g_prop.get_array().mean().tolist()
        print(g_coeff - results['clust_coeff'])
        return g_coeff - results['clust_coeff']

    beta = brenth(lambda x: (nx.cluster.average_clustering(nx.watts_strogatz_graph(results['n_nodes'], k, x)) - results['clust_coeff']), 0.0, 1.0)
    print('beta', beta)
    results['beta'] = beta

estimate_beta(graph_data, results)
In [ ]:
import tempfile
import random
def prepare_baseline(graph, base_data):
    graph_data = {'graph': graph, 'id_map': base_data['id_map'],
                  'kept_frequencies': base_data['kept_frequencies']}
    results = {}
    build_ratios(graph_data, results)
    centrality(graph_data, results)
    append_graph_stats(graph_data, results)
    return graph_data, results

def generate_small_world(source_graph_data, source_results):
    sw_graph = nx.watts_strogatz_graph(source_results['n_nodes'], int(np.round(source_results['k'])),
                                       source_results['beta'], seed=31072010)

    handle, filename = tempfile.mkstemp(suffix='.graphml')
    nx.write_graphml(sw_graph, filename)

    g = graph_tool.Graph()
    g.load(filename)
    print(g.num_vertices(), g.num_edges())

    freqs = source_graph_data['kept_frequencies']
    shuffled = ['m'] * freqs['m'] + ['f'] * freqs['f']
    random.shuffle(shuffled)

    g.vertex_properties['gender'] = g.new_vertex_property('string')
    for i, gender in enumerate(shuffled, start=0):
        g.vertex_properties['gender'][i] = gender

    sw_data, sw_results = prepare_baseline(g, source_graph_data)
    #save_results(sw_data, sw_results, 'sw', 'Small World')
    return sw_data, sw_results

sw, sw_results = generate_small_world(graph_data, results)
We use graph_tool's generation module to rewire our network and build the baseline random networks.
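A minimal sketch on a toy graph, assuming the same graph_tool version and model names used in this notebook: the 'uncorrelated' model preserves each vertex's in- and out-degree while shuffling who links to whom, whereas 'erdos' only preserves the number of vertices and edges.
In [ ]:
# toy example (not the biography network)
import graph_tool.generation

toy = graph_tool.Graph()
toy.add_vertex(5)
for s, t in [(0, 1), (0, 2), (1, 2), (2, 3), (3, 4)]:
    toy.add_edge(s, t)

before = [(v.in_degree(), v.out_degree()) for v in toy.vertices()]
graph_tool.generation.random_rewire(toy, model='uncorrelated', n_iter=10)
after = [(v.in_degree(), v.out_degree()) for v in toy.vertices()]
print(before == after)  # True: each vertex keeps its in- and out-degree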
In [ ]:
import graph_tool.generation
def random_graph(source_graph_data, source_results, model='uncorrelated', n_iter=1):
    clone = source_graph_data['graph'].copy()
    graph_tool.generation.random_rewire(clone, model=model, n_iter=n_iter)
    base_data, base_results = prepare_baseline(clone, source_graph_data)
    return base_data, base_results

def populate_rand(rand, rand_results, model='erdos', n_iter=1):
    # unused helper kept for reference; stores a rewired baseline under the 'pre' key
    random_g_data, random_g_results = random_graph(graph_data, results, model=model, n_iter=n_iter)
    rand['pre'] = random_g_data
    rand_results.update(random_g_results)

rand, rand_results = random_graph(graph_data, results, model='erdos')
deg_seq, deg_seq_results = random_graph(graph_data, results, model='uncorrelated', n_iter=3)
And now we can plot, for each network, the fraction of women among the top-ranked biographies. In this way, we can see how biased the estimated centrality is in comparison to the unbiased baselines.
In [ ]:
def plot_distributions(loc='best'):
    plot_fraction(graph_data, results, 'PageRank (OBS)')
    plot_fraction(graph_data, results, 'In-Degree (OBS)', fraction='f_id_fraction')
    plot_fraction(sw, sw_results, 'Small World')
    plot_fraction(rand, rand_results, 'Random')
    plot_fraction(deg_seq, deg_seq_results, 'Degree Sequence')
    plt.legend(loc=loc, fontsize='x-small')

sns.set_palette('Set2')
plot_distributions()
plt.xlim([10, 1000000])
plt.ylim([0, 0.3])
plt.xscale('log')
#plt.xlabel('Sorted Top-k Biographies')
plt.ylabel('Fraction of Women')
plt.title('0 -- 1900')
In [ ]: