In [815]:
# dataset: https://www.kaggle.com/rtatman/six-degrees-of-francis-bacon/data
# citation: SDFB Team, Six Degrees of Francis Bacon: Reassembling the Early Modern Social Network. www.sixdegreesoffrancisbacon.com (August 29, 2017).
In [816]:
import numpy as np
import pandas as pd
In [817]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings emitted by the libraries used below.
warnings.filterwarnings('ignore')
In [818]:
# Load the people table, indexed by the 'SDFB Person ID' column; the index is
# then renamed to 'Id'.
# Forward slashes keep the relative path valid on Windows, macOS and Linux
# (the previous '.\\data\\...' form only resolved on Windows).
df_people = pd.read_csv('./data/SDFB_people.csv', index_col='SDFB Person ID')
df_people.index.names = ['Id']
df_people.head(n=3)
Out[818]:
In [819]:
# Relationship records are split across several CSV files by id range.
# Forward slashes keep the relative paths valid on Windows, macOS and Linux
# (the previous '.\\data\\...' form only resolved on Windows).
files = [
    './data/SDFB_relationships_100000000_100020000.csv',
    './data/SDFB_relationships_100020001_100040000.csv',
    './data/SDFB_relationships_100040001_100060000.csv',
    './data/SDFB_relationships_100060001_100080000.csv',
    './data/SDFB_relationships_100080001_100100000.csv',
    './data/SDFB_relationships_100100001_100120000.csv',
    './data/SDFB_relationships_100120001_100140000.csv',
    './data/SDFB_relationships_100140001_100160000.csv',
    './data/SDFB_relationships_100160001_100180000.csv',
    './data/SDFB_relationships_greater_than_100180000.csv',
]

# Read each part indexed by 'SDFB Relationship ID'. A comprehension is used so
# the generic name `df` is not rebound here (a later cell uses `df` for the
# selected person).
dfs = [pd.read_csv(file, index_col='SDFB Relationship ID') for file in files]

# Stack the parts into a single frame and rename the index to 'Id'.
df_relationships = pd.concat(dfs)
df_relationships.index.names = ['Id']
df_relationships.head(n=3)
Out[819]:
In [820]:
# Nodes: (person id, {'display_name': ...}) for every row of df_people.
# Iterating the column once replaces a per-row `.loc` lookup, and always yields
# a scalar name even if an id were to appear more than once in the index.
node_list = [
    (person_id, {'display_name': display_name})
    for person_id, display_name in df_people['Display Name'].items()
]

# Edges: (person 1 id, person 2 id, {'start': start year, 'end': end year}),
# one per relationship row. The four columns are unpacked by name rather than
# by position.
edge_columns = ['Person 1 ID', 'Person 2 ID', 'Start Year', 'End Year']
edge_list = [
    (person_1, person_2, {'start': start_year, 'end': end_year})
    for person_1, person_2, start_year, end_year
    in df_relationships.loc[:, edge_columns].values
]
In [821]:
import networkx as nx
import nxviz as nz
In [822]:
# Select the person under study as a one-row DataFrame (the list indexer keeps
# the result two-dimensional). Later plot titles refer to this person as "bacon".
focus_person_id = 10000473
df = df_people.loc[[focus_person_id]]
df.head()
Out[822]:
In [823]:
# Sorted distinct relationship start years. The first (smallest) entry is
# dropped; the original note describes this as skipping the year "9".
years = np.unique(df_relationships[['Start Year']].values)[1:]

# Window of interest: the selected person's extant birth/death years, padded
# by three years on each side.
birth_year = int(df['Extant Birth Year'].values[0])
death_year = int(df['Extant Death Year'].values[0])
start_date = birth_year - 3
end_date = death_year + 3

# Keep only the start years that fall inside the (inclusive) window.
in_window = (years >= start_date) & (years <= end_date)
years_to_view = years[in_window]

# Show the window being investigated.
print(start_date, "-", end_date)
In [824]:
def _graph_for_year(year):
    """Return a graph holding every person and the relationships active in `year`.

    A relationship counts as active when it started on or before `year` and
    ends strictly after it.
    """
    snapshot = nx.Graph()
    snapshot.add_nodes_from(node_list)
    snapshot.add_edges_from(
        edge for edge in edge_list
        if edge[2]['start'] <= year and edge[2]['end'] > year
    )
    return snapshot


# One snapshot graph per year in the viewing window, in the same order.
Graphs = [_graph_for_year(year) for year in years_to_view]
In [825]:
# Year-over-year edge churn. Both series start at 0 so that they line up with
# `years_to_view` (the first year has no predecessor to compare against).
edges_added = [0]
edges_removed = [0]
for previous, current in zip(Graphs, Graphs[1:]):
    # edges present in the later graph but not in the earlier one ...
    edges_added.append(nx.difference(current, previous).number_of_edges())
    # ... and edges present in the earlier graph but gone from the later one
    edges_removed.append(nx.difference(previous, current).number_of_edges())
In [826]:
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default styling to the matplotlib figures drawn below.
sns.set()
In [827]:
# Net edge change per year (added minus removed), divided by the number of
# entries in `edges_added`.
# NOTE(review): the divisor is the count of years, not a network size, so this
# is not a true percentage even though it is plotted as '%' — confirm intent.
net_change = np.asarray(edges_added) - np.asarray(edges_removed)
pct_added = net_change / len(edges_added)
In [828]:
# Two stacked panels: raw edge churn on top, the derived `pct_added` series below.
fig, (ax_counts, ax_pct) = plt.subplots(2, 1)

# Top panel: edges added vs. removed for each year.
ax_counts.set_title('network movement during his life')
ax_counts.plot(years_to_view, edges_added, marker='.', color='blue', alpha=.5, label='added')
ax_counts.plot(years_to_view, edges_removed, marker='.', color='red', alpha=.5, label='removed')
ax_counts.legend(loc='upper right')
ax_counts.set_xlabel('year')
ax_counts.set_ylabel('#')

# Bottom panel: the `pct_added` series for each year.
ax_pct.set_title('pct added to network during his life')
ax_pct.plot(years_to_view, pct_added, marker='.', color='blue', alpha=.5)
ax_pct.set_xlabel('year')
ax_pct.set_ylabel('%')

plt.tight_layout()
plt.show()
In [829]:
# Id (index label) of the selected person; used below to look up that person's
# scores in each yearly graph.
p_id = df.index[0]
In [851]:
# Per-year degree-centrality series for the selected person (`p_id`):
#   np_dcs_vs_network                 - score measured against the whole graph
#   np_dcs_vs_connected_component     - score within the person's own component
#   np_dcs_vs_connected_component_max - highest score of anyone in that component
np_dcs_vs_network = []
np_dcs_vs_connected_component = []
np_dcs_vs_connected_component_max = []
for graph in Graphs:
    # degree centrality relative to the entire network ...
    np_dcs_vs_network.append(nx.degree_centrality(graph)[p_id])

    # Connected component containing the person. `nx.connected_component_subgraphs`
    # was removed in networkx 2.4; `node_connected_component` + `subgraph` work on
    # both old and new releases and avoid scanning every component.
    component_nodes = nx.node_connected_component(graph, p_id)
    connected_component = graph.subgraph(component_nodes)

    # An edgeless component (the person is isolated that year) scores 0 for both
    # series instead of being passed to degree_centrality.
    score = 0
    max_score = 0
    if connected_component.number_of_edges() != 0:
        component_dcs = nx.degree_centrality(connected_component)
        score = component_dcs[p_id]
        max_score = max(component_dcs.values())

    np_dcs_vs_connected_component.append(score)
    np_dcs_vs_connected_component_max.append(max_score)
In [855]:
# Three stacked panels sharing the same layout; each row below describes one
# panel as (title, series, line colour, line label).
panels = [
    ('bacon degree centrality for entire network', np_dcs_vs_network, 'blue', 'bacon'),
    ('bacon degree centrality for connected component', np_dcs_vs_connected_component, 'blue', 'bacon'),
    ('max degree centrality for connected component', np_dcs_vs_connected_component_max, 'cyan', 'max'),
]

fig, axes = plt.subplots(len(panels), 1)
for ax, (title, scores, color, label) in zip(axes, panels):
    ax.set_title(title)
    ax.plot(years_to_view, scores, color=color, marker='.', alpha=.5, label=label)
    ax.set_xlabel('years')
    ax.set_ylabel('score')

plt.tight_layout()
plt.show()
In [ ]: