In [815]:
# dataset: https://www.kaggle.com/rtatman/six-degrees-of-francis-bacon/data
# citation: SDFB Team, Six Degrees of Francis Bacon: Reassembling the Early Modern Social Network. www.sixdegreesoffrancisbacon.com (August 29, 2017).

In [816]:
import numpy as np
import pandas as pd

In [817]:
import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, including deprecation notices raised by the libraries used below.
# Consider narrowing the filter (category= / module=) once the noisy source
# is known.
warnings.filterwarnings('ignore')

In [818]:
# Load the people table, keyed by the SDFB person identifier.
# A forward-slash relative path is used so the notebook also runs on
# macOS/Linux; the backslash form is only treated as a directory path on
# Windows.
df_people = pd.read_csv('./data/SDFB_people.csv', index_col='SDFB Person ID')

# Shorten the index label to 'Id' for display.
df_people.index.names = ['Id']
df_people.head(n=3)


Out[818]:
ODNB ID Display Name Prefix First Name Last Name Suffix Title All Search Names Gender Historical Significance Birth Year Type Extant Birth Year Alternate Birth Year Death Year Type Extant Death Year Alternate Death Year Group List
Id
10005815 13002.0 James Hepburn NaN James Hepburn NaN NaN James Hepburn, James, James Hepburn male orientalist IN 1573 1573 AF/IN 1623 1623 []
10007183 16127.0 John Laurence NaN John Laurence NaN NaN John Laurence, John, John Laurence male writer on gardening IN 1668 1668 IN 1732 1732 []
10005953 13283.0 Joseph Hill NaN Joseph Hill NaN NaN Joseph Hill, Joseph, Joseph Hill male nonconformist minister IN 1625 1625 IN 1707 1707 []

In [819]:
# The relationship export is split across several CSV files, one per range of
# relationship IDs. Forward slashes keep the paths valid on Windows, macOS and
# Linux alike; the backslash form is only treated as a directory path on
# Windows.
files = [
    './data/SDFB_relationships_100000000_100020000.csv',
    './data/SDFB_relationships_100020001_100040000.csv',
    './data/SDFB_relationships_100040001_100060000.csv',
    './data/SDFB_relationships_100060001_100080000.csv',
    './data/SDFB_relationships_100080001_100100000.csv',
    './data/SDFB_relationships_100100001_100120000.csv',
    './data/SDFB_relationships_100120001_100140000.csv',
    './data/SDFB_relationships_100140001_100160000.csv',
    './data/SDFB_relationships_100160001_100180000.csv',
    './data/SDFB_relationships_greater_than_100180000.csv',
]

# Read every chunk with the relationship ID as its index. A comprehension is
# used so no loop variable (previously a global named `df`) is left behind in
# the notebook namespace.
dfs = [pd.read_csv(path, index_col='SDFB Relationship ID') for path in files]

# Stack the chunks into a single relationships table and shorten the index
# label to 'Id' for display.
df_relationships = pd.concat(dfs)
df_relationships.index.names = ['Id']
df_relationships.head(n=3)


Out[819]:
Person 1 ID Person 2 ID Original Confidence Maximum Confidence Start Year Type Start Day Start Month Start Year End Year Type End Month End Day End Year
Id
100000004 10000001 10012160 52 52 AF/IN NaN NaN 1509 BF/IN NaN NaN 1560
100000005 10000001 10012316 53 53 AF/IN NaN NaN 1499 BF/IN NaN NaN 1560
100000006 10000002 10006344 46 46 AF/IN NaN NaN 1647 BF/IN NaN NaN 1688

In [820]:
# Nodes: one (person_id, attributes) pair per row of df_people.
# Iterating the 'Display Name' column directly avoids a separate .loc lookup
# per person and always yields a scalar name, even if an Id were to repeat.
node_list = [
    (person_id, {'display_name': display_name})
    for person_id, display_name in df_people['Display Name'].items()
]

# Edges: (person 1, person 2, attributes) with the relationship's start and
# end year carried as edge attributes.
edge_columns = ['Person 1 ID', 'Person 2 ID', 'Start Year', 'End Year']
edge_list = [
    (person_1, person_2, {'start': start_year, 'end': end_year})
    for person_1, person_2, start_year, end_year in df_relationships[edge_columns].values
]

In [821]:
import networkx as nx
import nxviz as nz

In [822]:
# Person Id of the record examined in the rest of the notebook
# (displayed below as Francis Bacon).
bacon_person_id = 10000473

# The list-style selector keeps the result a one-row DataFrame instead of
# collapsing it to a Series.
df = df_people.loc[[bacon_person_id]]
df.head()


Out[822]:
ODNB ID Display Name Prefix First Name Last Name Suffix Title All Search Names Gender Historical Significance Birth Year Type Extant Birth Year Alternate Birth Year Death Year Type Extant Death Year Alternate Death Year Group List
Id
10000473 990.0 Francis Bacon NaN Francis Bacon NaN Viscount St. Alban Francis, Francis Bacon, Francis Bacon Viscount... male lord chancellor, politician, and philosopher IN 1561 1561 IN 1626 1626 ["Virginia Company", "Company of Mineral and B...

In [823]:
# Distinct relationship start years in ascending order. The first (smallest)
# entry is dropped -- the original note reads "skipping year #9"; presumably
# a placeholder year in the data, verify against the source files.
all_start_years = np.unique(df_relationships['Start Year'].values)
years = all_start_years[1:]

# Window of interest: three years before the recorded birth year up to
# three years after the recorded death year of the selected person.
birth_year = int(df['Extant Birth Year'].iloc[0])
death_year = int(df['Extant Death Year'].iloc[0])
start_date = birth_year - 3
end_date = death_year + 3

# Keep only the start years that fall inside the window (bounds inclusive).
in_window = np.logical_and(years >= start_date, years <= end_date)
years_to_view = years[in_window]

# Show the window being investigated.
print(start_date, "-", end_date)


1558 - 1629

In [824]:
def _edges_active_in(year):
    """Return the edges that have started by `year` and end strictly after it."""
    return [
        edge for edge in edge_list
        if (edge[2]['start'] <= year) & (edge[2]['end'] > year)
    ]


def _snapshot_for(year):
    """Build the graph for one year: every person as a node, active edges only."""
    snapshot = nx.Graph()
    snapshot.add_nodes_from(node_list)
    snapshot.add_edges_from(_edges_active_in(year))
    return snapshot


# One graph per year in the viewing window, in the same order as years_to_view.
Graphs = [_snapshot_for(year) for year in years_to_view]

In [825]:
# The first year has no predecessor to compare against, so both series
# start with a 0 entry to stay aligned with years_to_view.
edges_added = [0]
edges_removed = [0]

# Walk consecutive pairs of yearly graphs and count the edges that exist in
# one graph but not in the other.
for previous_graph, current_graph in zip(Graphs, Graphs[1:]):
    # edges present this year that were absent the year before ...
    edges_added.append(nx.difference(current_graph, previous_graph).number_of_edges())
    # edges present the year before that are gone this year ...
    edges_removed.append(nx.difference(previous_graph, current_graph).number_of_edges())

In [826]:
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's default styling to the matplotlib figures drawn below.
sns.set()

In [827]:
# Net change in edges for each year (added minus removed).
net_edge_change = np.subtract(edges_added, edges_removed).astype(float)

# Size of the network each change is measured against: the edge count of the
# previous year's graph. The first year has no predecessor, so its base is 0,
# matching the leading 0 entry of edges_added / edges_removed.
previous_edge_counts = np.array(
    [0] + [g.number_of_edges() for g in Graphs[:-1]], dtype=float
)

# Percentage growth relative to the previous year's network. The earlier
# version divided by len(edges_added) -- the number of years -- which does not
# yield a percentage. Years whose base network is empty are left at 0 rather
# than dividing by zero.
pct_added = np.divide(
    100.0 * net_edge_change,
    previous_edge_counts,
    out=np.zeros_like(net_edge_change),
    where=previous_edge_counts > 0,
)

In [828]:
# Two stacked panels: raw edge counts on top, pct_added underneath.
fig, (ax_counts, ax_pct) = plt.subplots(2, 1)

# Top panel: edges added vs. removed per year.
ax_counts.set_title('network movement during his life')
ax_counts.plot(years_to_view, edges_added, marker='.', color='blue', alpha=.5, label='added')
ax_counts.plot(years_to_view, edges_removed, marker='.', color='red', alpha=.5, label='removed')
ax_counts.legend(loc='upper right')
ax_counts.set_xlabel('year')
ax_counts.set_ylabel('#')

# Bottom panel: the pct_added series per year.
ax_pct.set_title('pct added to network during his life')
ax_pct.plot(years_to_view, pct_added, marker='.', color='blue', alpha=.5)
ax_pct.set_xlabel('year')
ax_pct.set_ylabel('%')

fig.tight_layout()
plt.show()



In [829]:
# Id of the person under study: the index label of the first row of `df`.
p_id = df.index[0]

In [851]:
# Per-year degree-centrality series for the person identified by p_id.
# Each list gets one entry per graph in Graphs (same order).
np_dcs_vs_network = []                  # p_id's score within the whole yearly graph
np_dcs_vs_connected_component = []      # p_id's score within its own connected component
np_dcs_vs_connected_component_max = []  # highest score of any node in that component

for graph in Graphs:

    # get degree centrality for entire network ...
    np_dcs_vs_network.append(nx.degree_centrality(graph)[p_id])

    # get the connected component containing p_id ...
    # nx.connected_component_subgraphs was removed in networkx 2.4, and it
    # built a subgraph for every component just to find one. Looking up the
    # node's own component works on old and new releases alike.
    connected_component = graph.subgraph(nx.node_connected_component(graph, p_id))

    # When p_id has no relationships that year its component has no edges;
    # both scores are then recorded as 0.
    max_score = 0
    score = 0
    if connected_component.number_of_edges() != 0:
        dcs = nx.degree_centrality(connected_component)

        max_score = max(dcs.values())
        score = dcs[p_id]

    np_dcs_vs_connected_component.append(score)
    np_dcs_vs_connected_component_max.append(max_score)

In [855]:
# One panel per centrality series: (title, values, line colour, line label).
centrality_panels = [
    ('bacon degree centrality for entire network', np_dcs_vs_network, 'blue', 'bacon'),
    ('bacon degree centrality for connected component', np_dcs_vs_connected_component, 'blue', 'bacon'),
    ('max degree centrality for connected component', np_dcs_vs_connected_component_max, 'cyan', 'max'),
]

# Stack the panels vertically and draw each series against the viewed years.
fig, axes = plt.subplots(len(centrality_panels), 1)
for ax, (title, scores, color, label) in zip(axes, centrality_panels):
    ax.set_title(title)
    ax.plot(years_to_view, scores, color=color, marker='.', alpha=.5, label=label)
    ax.set_xlabel('years')
    ax.set_ylabel('score')

fig.tight_layout()
plt.show()



In [ ]: