notebook.community

Edit and run



In [815]:

    
# dataset: https://www.kaggle.com/rtatman/six-degrees-of-francis-bacon/data
# citation: SDFB Team, Six Degrees of Francis Bacon: Reassembling the Early Modern Social Network. www.sixdegreesoffrancisbacon.com (August 29, 2017).



In [816]:

    
import numpy as np
import pandas as pd



In [817]:

    
import warnings
# Silence every warning category for the rest of the session. Note that this
# also hides deprecation notices raised by the libraries used further down.
warnings.filterwarnings('ignore')



In [818]:

    
# Load the people table, keyed by each person's SDFB identifier.
# The relative path uses forward slashes so it resolves on Windows, macOS and
# Linux alike (a backslash is only a path separator on Windows).
df_people = pd.read_csv('./data/SDFB_people.csv', index_col='SDFB Person ID')

# Shorten the index label to 'Id' for display.
df_people.index.names = ['Id']

# Preview the first three rows.
df_people.head(n=3)









    Out[818]:







  
    
      
      ODNB ID
      Display Name
      Prefix
      First Name
      Last Name
      Suffix
      Title
      All Search Names
      Gender
      Historical Significance
      Birth Year Type
      Extant Birth Year
      Alternate Birth Year
      Death Year Type
      Extant Death Year
      Alternate Death Year
      Group List
    
    
      Id
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      10005815
      13002.0
      James Hepburn
      NaN
      James
      Hepburn
      NaN
      NaN
      James Hepburn, James, James Hepburn
      male
      orientalist
      IN
      1573
      1573
      AF/IN
      1623
      1623
      []
    
    
      10007183
      16127.0
      John Laurence
      NaN
      John
      Laurence
      NaN
      NaN
      John Laurence, John, John Laurence
      male
      writer on gardening
      IN
      1668
      1668
      IN
      1732
      1732
      []
    
    
      10005953
      13283.0
      Joseph Hill
      NaN
      Joseph
      Hill
      NaN
      NaN
      Joseph Hill, Joseph, Joseph Hill
      male
      nonconformist minister
      IN
      1625
      1625
      IN
      1707
      1707
      []



In [819]:

    
# The relationship table ships as ten CSV files, split by relationship-ID range.
# The relative paths use forward slashes so they resolve on Windows, macOS and
# Linux alike (a backslash is only a path separator on Windows).
files = [
    './data/SDFB_relationships_100000000_100020000.csv',
    './data/SDFB_relationships_100020001_100040000.csv',
    './data/SDFB_relationships_100040001_100060000.csv',
    './data/SDFB_relationships_100060001_100080000.csv',
    './data/SDFB_relationships_100080001_100100000.csv',
    './data/SDFB_relationships_100100001_100120000.csv',
    './data/SDFB_relationships_100120001_100140000.csv',
    './data/SDFB_relationships_100140001_100160000.csv',
    './data/SDFB_relationships_100160001_100180000.csv',
    './data/SDFB_relationships_greater_than_100180000.csv'
]

# Read every part, keyed by relationship ID. A comprehension keeps the
# per-file frame out of the notebook namespace, so no stray `df` is left behind.
dfs = [pd.read_csv(file, index_col='SDFB Relationship ID') for file in files]

# Stack the parts into one table and shorten the index label to 'Id'.
df_relationships = pd.concat(dfs)
df_relationships.index.names = ['Id']

# Preview the first three rows.
df_relationships.head(n=3)









    Out[819]:







  
    
      
      Person 1 ID
      Person 2 ID
      Original Confidence
      Maximum Confidence
      Start Year Type
      Start Day
      Start Month
      Start Year
      End Year Type
      End Month
      End Day
      End Year
    
    
      Id
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      100000004
      10000001
      10012160
      52
      52
      AF/IN
      NaN
      NaN
      1509
      BF/IN
      NaN
      NaN
      1560
    
    
      100000005
      10000001
      10012316
      53
      53
      AF/IN
      NaN
      NaN
      1499
      BF/IN
      NaN
      NaN
      1560
    
    
      100000006
      10000002
      10006344
      46
      46
      AF/IN
      NaN
      NaN
      1647
      BF/IN
      NaN
      NaN
      1688



In [820]:

    
# Nodes: one (person_id, attributes) pair per row of the people table.
# Walking the 'Display Name' column once replaces a separate label lookup per
# row, and always yields a scalar name even if an index label were repeated.
node_list = [
    (person_id, {'display_name': display_name})
    for person_id, display_name in df_people['Display Name'].items()
]

# Edges: (person_1, person_2, attributes) with the relationship's start and
# end year stored as edge attributes.
edge_columns = ['Person 1 ID', 'Person 2 ID', 'Start Year', 'End Year']
edge_list = [
    (person_1, person_2, {'start': start_year, 'end': end_year})
    for person_1, person_2, start_year, end_year in df_relationships[edge_columns].values
]



In [821]:

    
import networkx as nx
import nxviz as nz



In [822]:

    
# Select the person under study by SDFB id. Passing the label inside a list
# keeps the result a one-row DataFrame rather than collapsing it to a Series.
bacon_id = 10000473
df = df_people.loc[[bacon_id]]
df.head()









    Out[822]:







  
    
      
      ODNB ID
      Display Name
      Prefix
      First Name
      Last Name
      Suffix
      Title
      All Search Names
      Gender
      Historical Significance
      Birth Year Type
      Extant Birth Year
      Alternate Birth Year
      Death Year Type
      Extant Death Year
      Alternate Death Year
      Group List
    
    
      Id
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      10000473
      990.0
      Francis Bacon
      NaN
      Francis
      Bacon
      NaN
      Viscount St. Alban
      Francis, Francis Bacon, Francis Bacon Viscount...
      male
      lord chancellor, politician, and philosopher
      IN
      1561
      1561
      IN
      1626
      1626
      ["Virginia Company", "Company of Mineral and B...



In [823]:

    
# Distinct relationship start years, sorted ascending by np.unique. The first
# (smallest) entry is dropped -- the original note calls it "year #9",
# presumably a stray value in the data; TODO confirm.
start_years = df_relationships[['Start Year']].values
years = np.unique(start_years)[1:]

# Birth and death year of the selected person (first row of `df`).
birth_year, death_year = (
    int(df[column].values[0])
    for column in ('Extant Birth Year', 'Extant Death Year')
)

# Pad the lifetime by three years on either side.
padding = 3
start_date = birth_year - padding
end_date = death_year + padding

# Keep only the start years that fall inside the padded window (inclusive).
in_window = (years >= start_date) & (years <= end_date)
years_to_view = years[in_window]

# Report the window being investigated.
print(start_date, "-", end_date)









    



1558 - 1629



In [824]:

    
# Build one snapshot graph per year in the window. Every snapshot holds all
# people as nodes; only the set of edges differs from year to year.
Graphs = []
for year in years_to_view:
    # A relationship is active when it started in or before this year and
    # ends strictly after it.
    active_edges = [
        (person_1, person_2, attrs)
        for person_1, person_2, attrs in edge_list
        if attrs['start'] <= year < attrs['end']
    ]

    snapshot = nx.Graph()
    snapshot.add_nodes_from(node_list)
    snapshot.add_edges_from(active_edges)

    Graphs.append(snapshot)



In [825]:

    
# Year-over-year churn between consecutive snapshots. The first snapshot has
# no predecessor, so both series start at 0.
edges_added = [0]
edges_removed = [0]

for previous, current in zip(Graphs, Graphs[1:]):
    # Edges present now but not in the previous snapshot ...
    edges_added.append(nx.difference(current, previous).number_of_edges())
    # ... and edges present in the previous snapshot but gone now.
    edges_removed.append(nx.difference(previous, current).number_of_edges())



In [826]:

    
import matplotlib.pyplot as plt
import seaborn as sns

# Switch on seaborn's default styling for the matplotlib figures drawn below.
sns.set()



In [827]:

    
# Net edges gained per year (added minus removed), expressed as a percentage
# of the network's size in the preceding year. The denominator is the number
# of edges in the previous snapshot -- not the number of years sampled -- so
# the value is a real growth rate that matches the '%' axis it is plotted on.
net_change = np.subtract(edges_added, edges_removed)
edge_counts = [graph.number_of_edges() for graph in Graphs]

# The first year has no predecessor, so it is measured against itself (its
# net change is 0 by construction). `or [0]` keeps the lengths aligned when
# there are no snapshots at all.
previous_counts = np.array((edge_counts[:1] or [0]) + edge_counts[:-1], dtype=float)

# A year that follows an empty network has no meaningful growth rate -> NaN.
with np.errstate(divide='ignore', invalid='ignore'):
    pct_added = np.where(previous_counts > 0, 100.0 * net_change / previous_counts, np.nan)



In [828]:

    
# Two stacked panels: raw edge churn on top, the derived rate underneath.
fig, (ax_counts, ax_pct) = plt.subplots(2, 1)

# Top panel: edges added vs. removed per year.
ax_counts.set_title('network movement during his life')
ax_counts.plot(years_to_view, edges_added, marker='.', color='blue', alpha=.5, label='added')
ax_counts.plot(years_to_view, edges_removed, marker='.', color='red', alpha=.5, label='removed')
ax_counts.legend(loc='upper right')
ax_counts.set_xlabel('year')
ax_counts.set_ylabel('#')

# Bottom panel: the pct_added series.
ax_pct.set_title('pct added to network during his life')
ax_pct.plot(years_to_view, pct_added, marker='.', color='blue', alpha=.5)
ax_pct.set_xlabel('year')
ax_pct.set_ylabel('%')

fig.tight_layout()
plt.show()



In [829]:

    
# Index label (person id) of the first row of `df`; used as the node key when
# reading per-node results out of the graphs.
p_id = df.index[0]



In [851]:

    
# Per-snapshot degree-centrality series for the person identified by `p_id`:
#   np_dcs_vs_network                 - score within the whole graph
#   np_dcs_vs_connected_component     - score within the person's own component
#   np_dcs_vs_connected_component_max - highest score of anyone in that component
np_dcs_vs_network = []
np_dcs_vs_connected_component = []
np_dcs_vs_connected_component_max = []

for graph in Graphs:

    # Degree centrality measured against the entire network ...
    dcs = nx.degree_centrality(graph)
    np_dcs_vs_network.append(dcs[p_id])

    # Component containing the person. nx.connected_component_subgraphs() was
    # removed in NetworkX 2.4; looking the component up by node works on old
    # and new releases and avoids enumerating every other component.
    component_nodes = nx.node_connected_component(graph, p_id)
    connected_component = graph.subgraph(component_nodes)

    # A component without edges (the person is isolated this year) scores 0
    # for both series rather than being passed to degree_centrality().
    max_score = 0
    score = 0
    if connected_component.number_of_edges() != 0:
        dcs = nx.degree_centrality(connected_component)

        max_score = max(dcs.values())
        score = dcs[p_id]

    np_dcs_vs_connected_component.append(score)
    np_dcs_vs_connected_component_max.append(max_score)



In [855]:

    
# One panel per centrality series: (title, values, line colour, line label).
panels = [
    ('bacon degree centrality for entire network', np_dcs_vs_network, 'blue', 'bacon'),
    ('bacon degree centrality for connected component', np_dcs_vs_connected_component, 'blue', 'bacon'),
    ('max degree centrality for connected component', np_dcs_vs_connected_component_max, 'cyan', 'max'),
]

# Stack the panels vertically; all of them share the same axis labels.
fig, axes = plt.subplots(len(panels), 1)
for ax, (title, scores, color, label) in zip(axes, panels):
    ax.set_title(title)
    ax.plot(years_to_view, scores, color=color, marker='.', alpha=.5, label=label)
    ax.set_xlabel('years')
    ax.set_ylabel('score')

fig.tight_layout()
plt.show()



In [ ]:

	ODNB ID	Display Name	Prefix	First Name	Last Name	Suffix	Title	All Search Names	Gender	Historical Significance	Birth Year Type	Extant Birth Year	Alternate Birth Year	Death Year Type	Extant Death Year	Alternate Death Year	Group List
Id
10005815	13002.0	James Hepburn	NaN	James	Hepburn	NaN	NaN	James Hepburn, James, James Hepburn	male	orientalist	IN	1573	1573	AF/IN	1623	1623	[]
10007183	16127.0	John Laurence	NaN	John	Laurence	NaN	NaN	John Laurence, John, John Laurence	male	writer on gardening	IN	1668	1668	IN	1732	1732	[]
10005953	13283.0	Joseph Hill	NaN	Joseph	Hill	NaN	NaN	Joseph Hill, Joseph, Joseph Hill	male	nonconformist minister	IN	1625	1625	IN	1707	1707	[]

	Person 1 ID	Person 2 ID	Original Confidence	Maximum Confidence	Start Year Type	Start Day	Start Month	Start Year	End Year Type	End Month	End Day	End Year
Id
100000004	10000001	10012160	52	52	AF/IN	NaN	NaN	1509	BF/IN	NaN	NaN	1560
100000005	10000001	10012316	53	53	AF/IN	NaN	NaN	1499	BF/IN	NaN	NaN	1560
100000006	10000002	10006344	46	46	AF/IN	NaN	NaN	1647	BF/IN	NaN	NaN	1688

	ODNB ID	Display Name	Prefix	First Name	Last Name	Suffix	Title	All Search Names	Gender	Historical Significance	Birth Year Type	Extant Birth Year	Alternate Birth Year	Death Year Type	Extant Death Year	Alternate Death Year	Group List
Id
10000473	990.0	Francis Bacon	NaN	Francis	Bacon	NaN	Viscount St. Alban	Francis, Francis Bacon, Francis Bacon Viscount...	male	lord chancellor, politician, and philosopher	IN	1561	1561	IN	1626	1626	["Virginia Company", "Company of Mineral and B...