In [1]:
import pandas as pd
import networkx as netx
import numpy as np
import matplotlib.pyplot as plt

import warnings
import random
import itertools

Import


In [2]:
dataset = pd.read_csv(
            ".\\data\\CA-GrQc-data.txt", 
            header=None,
            names=["source","target"],
            sep="\t", 
            comment="#")

G = netx.from_pandas_dataframe(dataset,"source","target")

In [3]:
#### remove all self loops
G.remove_edges_from(G.selfloop_edges())

Random Walk


In [4]:
def graph_walk(nodes):
    graph = netx.Graph()
    p = None
    for n in nodes:
        graph.add_node(n)
        if p != None:
            graph.add_edge(n,p)
        p = n
    return graph

def random_walk(graph, start_node = None, size = -1):
    if start_node == None:
        start_node = random.choice(list(graph.nodes()))
    
    v = start_node
    for c in itertools.count():
        if size == c:
            return
        v = random.choice(list(graph.neighbors(v)))
        yield v

In [5]:
walk = list(random_walk(G, size = 20))

In [6]:
plt.axis('off')

graph = graph_walk(walk)

pos=netx.spring_layout(graph)
netx.draw_networkx(graph, pos=pos, with_labels=True)
plt.show()


Random Jumps


In [7]:
def graph_jumps(collections):
    graph = netx.Graph()
    for collection in collections:
        p = None
        for n in collection:
            graph.add_node(n)
            if p != None:
                graph.add_edge(n,p)
            p = n
    return graph

def random_walk_with_jump(graph, start_node = None, size = -1, p = .2):
    if start_node == None:
        start_node = random.choice(list(graph.nodes()))
    
    ev = []
    
    v = start_node
    col = [v]
    for c in itertools.count():
        if size == c:
            break
        
        if random.random() < p:
            ev.append(col)
            v = random.choice(list(graph.nodes()))
            col = [v]
        else:
            v = random.choice(list(graph.neighbors(v)))
            col.append(v)
    
    ev.append(col)
    return ev

In [8]:
jumps = random_walk_with_jump(G, size = 25, p = .6)

In [9]:
plt.axis('off')

graph = graph_jumps(jumps)

pos=netx.spring_layout(graph)
netx.draw_networkx(graph, pos=pos, with_labels=True)
plt.show()


Population Sampling


In [10]:
def get_degrees(graph, nodes):
    lookup = dict(G.degree())
    
    for n in nodes:
        yield lookup[n]
        

def uniform_sample(population, size = -1, with_replacement = True):
    
    objs = list(population)
    n = len(objs)
    
    for c in itertools.count():
        if size == c:
            return
        
        i = int(random.random() * n)
        v = objs[i]
        
        if not with_replacement:
            del objs[i]
            n = n-1
        
        yield v

In [11]:
def print_sample_stats(sample):
    n = len(sample)

    sample_mu = sample.mean()
    sample_std = (((sample - sample_mu)**2).sum() / (n-1))**(1/2)
    se_mean = sample_std / (n)**(1/2)
    
    percentile = np.percentile(sample,[2.5,97.5])

    print("sample mu:", sample_mu)
    print("sample std:", sample_std)
    print("standard error:", se_mean)
    
    print()
    print("confidence")
    print("%",percentile)

In [12]:
iters = 4000
n = 500
Totals = []

for c in range(iters):
    samples = list(uniform_sample(G.nodes(), size = n, with_replacement = True))
    degrees = np.array(list(get_degrees(G,samples)))
    Totals.append(degrees.sum())

T = np.array(Totals) / n

plt.title("Sampled Average Degree")
plt.hist(x=T, bins=25)
plt.show()



In [13]:
print_sample_stats(T)


sample mu: 5.5346955
sample std: 0.350198113872
standard error: 0.00553711836065

confidence
% [ 4.866  6.238]

Stratified Sampling


In [14]:
def get_by_degree(deg, searchBy):
    for key, value in deg.items():
        if searchBy(value):
            yield key

degrees = dict(G.degree())
_max = np.array(list(degrees.values())).max()

stratas = { }
prev = -1
for c in range(4):
    cr = c+1
    _min = int(_max * (cr/4))
    fx = lambda x: x > prev and x <= _min
    stratas.update({ cr: list(get_by_degree(degrees,fx)) })
    prev = _min

i = 1
iters = 2000
Totals = [ ]
for sn in [20,20,20,20]:
    sample = stratas[i]
    
    Totals_sub = []
    for c in range(iters):
        samples = list(uniform_sample(sample, size = sn, with_replacement = True))
        degrees = np.array(list(get_degrees(G,samples)))
        Totals_sub.append(degrees.sum())
    
    Totals.append(Totals_sub)
    i = i + 1

In [15]:
i = 1
for T in Totals:
    T1 = np.array(T) / 20
    _ = plt.title("Sampled Average Degree " + str(i))
    _ = plt.hist(x=T1, bins=25)
    plt.show()

    print_sample_stats(T1)
    
    i = i + 1


sample mu: 4.0639
sample std: 0.848812129399
standard error: 0.0189800162146

confidence
% [ 2.65  5.95]
sample mu: 27.648875
sample std: 1.16632334258
standard error: 0.0260797827775

confidence
% [ 25.35  29.9 ]
sample mu: 46.1799
sample std: 1.09105393512
standard error: 0.0243967076605

confidence
% [ 44.25  48.4 ]
sample mu: 70.2976
sample std: 1.42263353788
standard error: 0.0318110529776

confidence
% [ 67.65  73.1 ]

In [16]:
n = 20

n1 = len(stratas[1])
n2 = len(stratas[2])
n3 = len(stratas[3])
n4 = len(stratas[4])

N = n1 + n2 + n3 + n4

sample_mu1 = (np.array(Totals[0])/n).mean()
sample_mu2 = (np.array(Totals[1])/n).mean()
sample_mu3 = (np.array(Totals[2])/n).mean()
sample_mu4 = (np.array(Totals[3])/n).mean()

a1 = ((n1/N)*sample_mu1)
a2 = ((n2/N)*sample_mu2) 
a3 = ((n3/N)*sample_mu3) 
a4 = ((n4/N)*sample_mu4)

print(a1, " ", a2, " ", a3, " ", a4)
print(a1 + a2 + a3 + a4)


3.8662093285   0.959957125143   0.546194925601   0.147514994277
5.51987637352

Snowball Sampling


In [17]:
def snowballin(Graph,center,prev,depth,max_depth,outers):
    if max_depth == depth:
        return outers
    
    if center in list(outers.nodes()):
        return outers
    else:
        outers.add_node(center)
        if prev != -1:
            outers.add_edge(center,prev)
        
    neighbors = list(netx.neighbors(Graph,center))
    for neighbor in neighbors:
        outers = snowballin(G,neighbor,center,depth+1,max_depth,outers)
        
    return outers

In [18]:
connected_components = netx.connected_component_subgraphs(G)
sub_graphs = sorted(connected_components, key=len, reverse=True)
cc = sub_graphs[0]
node = list(cc.nodes())[0]

V = snowballin(cc,node,-1,0,4,netx.Graph())

In [19]:
plt.axis('off')

pos=netx.spring_layout(V)
netx.draw_networkx(V, pos=pos, with_labels=True)
plt.show()



In [20]:
node


Out[20]:
3466

In [ ]: