In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
#from scipy import stats as st
import time
import glob
import ntpath
import sys      # needed for the sys.path.append in the next cell
import pickle   # used later to persist the results dictionary

In [2]:
sys.path.append('/work/eng/eliavb/all_distance_sketch/src/proto/')

In [3]:
import cover_pb2
import ranking_pb2

In [4]:
def get_node_ranks_from_gpb(ranking):
    # Flatten the NodeRanksGpb message into a list of (node_id, node_rank) tuples.
    node_ranks = []
    for node_rank in ranking.node_ranks:
        node_ranks.append((node_rank.node_id, node_rank.node_rank))
    return node_ranks

def compare_ranks_node_id(x, y):
    # Python 2 cmp-style comparator: order by rank first, break ties by node id.
    node_id_x, node_rank_x = x
    node_id_y, node_rank_y = y
    if (node_rank_x < node_rank_y):
        return -1
    if (node_rank_x > node_rank_y):
        return 1
    if (node_id_x < node_id_y):
        return -1
    if (node_id_x > node_id_y):
        return 1
    return 0

def path_leaf(path):
    # Return the file name component of a path, handling a trailing slash.
    head, tail = ntpath.split(path)
    return tail or ntpath.basename(head)
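
The cmp-style comparator above only works with Python 2's sorted(); a minimal key-based equivalent (a sketch, with sort_node_ranks introduced here for illustration) would also run under Python 3, where sorted() no longer accepts a cmp argument:

# Sketch: key-based equivalent of sorting with compare_ranks_node_id.
def sort_node_ranks(node_ranks):
    # Sort the (node_id, node_rank) tuples by rank first, breaking ties by node id.
    return sorted(node_ranks, key=lambda pair: (pair[1], pair[0]))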

In [27]:
base_dir = "/work/eng/eliavb/experiments/youtube/"

In [28]:
def get_kendall_dist(tup):
    # For one experiment id, compare the "_d" and "_r" rankings by the overlap of
    # their top-n node ids (|intersection| / n); despite the name, this is a top-n
    # overlap score rather than a true Kendall distance.
    id_, base_dir, num_elements = tup
    f_name_d = base_dir + id_ + "_d"
    f_name_r = base_dir + id_ + "_r"
    with open(f_name_d, "rb") as f_d:
        data_d = f_d.read()
    with open(f_name_r, "rb") as f_r:
        data_r = f_r.read()
    ranking_d = ranking_pb2.NodeRanksGpb()
    ranking_r = ranking_pb2.NodeRanksGpb()
    ranking_d.ParseFromString(data_d)
    ranking_r.ParseFromString(data_r)
    # Sort both rankings by (rank, node id) so the top-n prefixes are comparable.
    node_rank_d = sorted(get_node_ranks_from_gpb(ranking_d), cmp=compare_ranks_node_id)
    node_rank_r = sorted(get_node_ranks_from_gpb(ranking_r), cmp=compare_ranks_node_id)
    ids_d = [r[0] for r in node_rank_d]
    ids_r = [r[0] for r in node_rank_r]
    res = {}
    for n_ in num_elements:
        ids_d_set = set(ids_d[:n_])
        ids_r_set = set(ids_r[:n_])
        intersection = ids_d_set & ids_r_set
        overlap = float(len(intersection)) / float(min(len(ids_d_set), len(ids_r_set)))
        res[n_] = []
        res[n_].append((overlap, id_))
    return res
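
Since get_kendall_dist actually reports a top-n overlap score, a rank-correlation alternative could use scipy.stats.kendalltau (the scipy import is commented out in the first cell). A minimal sketch, assuming both rankings cover the same node ids, with kendall_tau_between_rankings introduced here for illustration:

# Sketch: Kendall's tau between two rankings of the same node ids.
from scipy import stats as st

def kendall_tau_between_rankings(node_rank_d, node_rank_r):
    # Map node id -> rank for each ranking, then correlate the rank vectors
    # over the ids common to both.
    rank_d = dict(node_rank_d)
    rank_r = dict(node_rank_r)
    common = sorted(set(rank_d) & set(rank_r))
    tau, p_value = st.kendalltau([rank_d[i] for i in common],
                                 [rank_r[i] for i in common])
    return tau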

In [7]:
b = int(time.time()) * 1000
print get_kendall_dist(("10010", base_dir, [100, 1000, 5000, 10000, 20000, 100000, 10000000]))
a = int(time.time()) * 1000
print a - b


{100000: [(1.0, '10010')], 20000: [(0.3008, '10010')], 100: [(0.09, '10010')], 1000: [(0.036, '10010')], 10000: [(0.1791, '10010')], 5000: [(0.2812, '10010')], 10000000: [(1.0, '10010')]}
2000
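
Note that int(time.time()) * 1000 truncates to whole seconds before scaling, which is why the figure above is a round 2000; a sketch of millisecond-resolution timing of the same call, if finer granularity is wanted:

# Sketch: millisecond-resolution timing of a single get_kendall_dist call.
b = time.time()
get_kendall_dist(("10010", base_dir, [100, 1000, 5000]))
print int((time.time() - b) * 1000)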

In [29]:
ids = glob.glob(base_dir + "*_[rd]")

In [30]:
ids = list(set([path_leaf(path)[:-2] for path in ids]))
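
For example, path_leaf plus the [:-2] slice turns a ranking file name into its experiment id (the path below is illustrative, reusing the "10010" id from the earlier cell):

# Sketch: recovering an experiment id from a ranking file name.
example = path_leaf(base_dir + "10010_d")   # -> "10010_d"
print example[:-2]                          # -> "10010"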

In [31]:
from multiprocessing import Pool, TimeoutError

In [32]:
pool = Pool(processes=10)
#num_nodes = [100, 1000, 5000, 10 * 1000, 20 * 1000, 30 * 1000, 40 * 1000, 70 * 1000, 100 * 1000, 10000000]
num_nodes = [100, 1000, 5000, 10 * 1000, 50 * 1000, 100 * 1000, 200 * 1000, 300 * 1000, 400 * 1000, 500 * 1000, 600 * 1000, 800 * 1000, 2000 * 1000]
b = int(time.time()) * 1000
kendall_dist = {}
for n_ in num_nodes:
    kendall_dist[n_] = []
# Fan the per-id comparisons out over the worker pool and collect the
# overlap scores per prefix size n_.
args = []
for id_ in ids:
    args.append((id_, base_dir, num_nodes))
for res in pool.imap_unordered(get_kendall_dist, args):
    for n_ in num_nodes:
        kendall_dist[n_] += res[n_]
a = int(time.time()) * 1000
pool.close()
pool.join()
print a - b


4147000
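
pandas is imported in the first cell but never used; a minimal sketch of reshaping the kendall_dist dict built above into a DataFrame, which makes per-n summaries easier to read than the raw histograms alone:

# Sketch: flatten the results dict into a DataFrame and summarize per prefix size n.
rows = []
for n_, scores in kendall_dist.items():
    for overlap, id_ in scores:
        rows.append({"n": n_, "id": id_, "overlap": overlap})
df = pd.DataFrame(rows)
print df.groupby("n")["overlap"].mean()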

In [33]:
pickle.dump(kendall_dist, open(base_dir + "results_dict", 'wb'))
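
The bare open() above leaves closing the handle to garbage collection; a minimal variant using a with block so the pickle is flushed and closed deterministically:

# Sketch: same dump, but with an explicit close of the output file.
with open(base_dir + "results_dict", 'wb') as out_f:
    pickle.dump(kendall_dist, out_f)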

In [36]:
len(kendall_dist[100])


Out[36]:
1000

In [55]:
for n in num_nodes:
    plt.hist([r[0] for r in kendall_dist[n]]);
    plt.savefig("/users/eng/eliavb/web/reverse_rank_dist/yt_" + str(n) + ".png")
    plt.close()
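
The histograms above and below use matplotlib's default, data-dependent bins; a sketch with a fixed set of bins over [0, 1] so the plots for different n are directly comparable (the 20-bin count and the "_fixed_bins" file suffix are assumptions, not part of the original run):

# Sketch: fixed bins over [0, 1] so all histograms share the same x-axis scale.
bins = np.linspace(0.0, 1.0, 21)
for n in num_nodes:
    plt.hist([r[0] for r in kendall_dist[n]], bins=bins)
    plt.savefig("/users/eng/eliavb/web/reverse_rank_dist/yt_" + str(n) + "_fixed_bins.png")
    plt.close()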

In [38]:
plt.hist([r[0] for r in kendall_dist[5000]]);



In [39]:
plt.hist([r[0] for r in kendall_dist[10000]]);



In [40]:
plt.hist([r[0] for r in kendall_dist[50000]]);



In [41]:
plt.hist([r[0] for r in kendall_dist[100000]]);



In [43]:
plt.hist([r[0] for r in kendall_dist[200000]]);



In [44]:
plt.hist([r[0] for r in kendall_dist[300 * 1000]]);



In [45]:
plt.hist([r[0] for r in kendall_dist[400 * 1000]]);



In [46]:
plt.hist([r[0] for r in kendall_dist[500 * 1000]]);



In [47]:
plt.hist([r[0] for r in kendall_dist[600 * 1000]]);



In [48]:
plt.hist([r[0] for r in kendall_dist[800 * 1000]]);



In [51]:
plt.hist([r[0] for r in kendall_dist[2000 * 1000]]);



In [23]:
import pickle

In [24]:
pickle.dump(kendall_dist, open(base_dir + "results_dict", 'wb'))

In [25]:
res = pickle.load(open('/work/eng/eliavb/experiments/slashdot/results_dict', 'rb'))

In [26]:
res.keys()


Out[26]:
[20000, 10000000, 100, 1000, 30000, 10000, 5000, 40000, 70000, 100000]

In [ ]: