In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
#from scipy import stats as st
import time
import glob
import ntpath
import sys      # needed for the sys.path.append in the next cell
import pickle   # used later to persist the results dictionary

In [2]:
sys.path.append('/work/eng/eliavb/all_distance_sketch/src/proto/')

In [3]:
import cover_pb2
import ranking_pb2

In [4]:
def get_node_ranks_from_gpb(ranking):
    # Flatten the NodeRanksGpb message into a list of (node_id, node_rank) tuples.
    node_ranks = []
    for node_rank in ranking.node_ranks:
        node_ranks.append((node_rank.node_id, node_rank.node_rank))
    return node_ranks

def compare_ranks_node_id(x, y):
    # Python 2 cmp-style comparator: order by rank first, break ties by node id.
    node_id_x, node_rank_x = x
    node_id_y, node_rank_y = y
    if (node_rank_x < node_rank_y):
        return -1
    if (node_rank_x > node_rank_y):
        return 1
    if (node_id_x < node_id_y):
        return -1
    if (node_id_x > node_id_y):
        return 1
    return 0

def path_leaf(path):
    # Return the file name component of a path, handling a trailing slash.
    head, tail = ntpath.split(path)
    return tail or ntpath.basename(head)
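
The cmp-style comparator above only works with Python 2's sorted(); a minimal key-based equivalent (a sketch, with sort_node_ranks introduced here for illustration) would also run under Python 3, where sorted() no longer accepts a cmp argument:

# Sketch: key-based equivalent of sorting with compare_ranks_node_id.
def sort_node_ranks(node_ranks):
    # Sort the (node_id, node_rank) tuples by rank first, breaking ties by node id.
    return sorted(node_ranks, key=lambda pair: (pair[1], pair[0]))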

In [27]:
base_dir = "/work/eng/eliavb/experiments/youtube/"

In [28]:
def get_kendall_dist(tup):
    # For one experiment id, compare the "_d" and "_r" rankings by the overlap of
    # their top-n node ids (|intersection| / n); despite the name, this is a top-n
    # overlap score rather than a true Kendall distance.
    id_, base_dir, num_elements = tup
    f_name_d = base_dir + id_ + "_d"
    f_name_r = base_dir + id_ + "_r"
    with open(f_name_d, "rb") as f_d:
        data_d = f_d.read()
    with open(f_name_r, "rb") as f_r:
        data_r = f_r.read()
    ranking_d = ranking_pb2.NodeRanksGpb()
    ranking_r = ranking_pb2.NodeRanksGpb()
    ranking_d.ParseFromString(data_d)
    ranking_r.ParseFromString(data_r)
    # Sort both rankings by (rank, node id) so the top-n prefixes are comparable.
    node_rank_d = sorted(get_node_ranks_from_gpb(ranking_d), cmp=compare_ranks_node_id)
    node_rank_r = sorted(get_node_ranks_from_gpb(ranking_r), cmp=compare_ranks_node_id)
    ids_d = [r[0] for r in node_rank_d]
    ids_r = [r[0] for r in node_rank_r]
    res = {}
    for n_ in num_elements:
        ids_d_set = set(ids_d[:n_])
        ids_r_set = set(ids_r[:n_])
        intersection = ids_d_set & ids_r_set
        overlap = float(len(intersection)) / float(min(len(ids_d_set), len(ids_r_set)))
        res[n_] = []
        res[n_].append((overlap, id_))
    return res
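
Since get_kendall_dist actually reports a top-n overlap score, a rank-correlation alternative could use scipy.stats.kendalltau (the scipy import is commented out in the first cell). A minimal sketch, assuming both rankings cover the same node ids, with kendall_tau_between_rankings introduced here for illustration:

# Sketch: Kendall's tau between two rankings of the same node ids.
from scipy import stats as st

def kendall_tau_between_rankings(node_rank_d, node_rank_r):
    # Map node id -> rank for each ranking, then correlate the rank vectors
    # over the ids common to both.
    rank_d = dict(node_rank_d)
    rank_r = dict(node_rank_r)
    common = sorted(set(rank_d) & set(rank_r))
    tau, p_value = st.kendalltau([rank_d[i] for i in common],
                                 [rank_r[i] for i in common])
    return tau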

In [7]:
b = int(time.time()) * 1000
print get_kendall_dist(("10010", base_dir, [100, 1000, 5000, 10000, 20000, 100000, 10000000]))
a = int(time.time()) * 1000
print a - b


{100000: [(1.0, '10010')], 20000: [(0.3008, '10010')], 100: [(0.09, '10010')], 1000: [(0.036, '10010')], 10000: [(0.1791, '10010')], 5000: [(0.2812, '10010')], 10000000: [(1.0, '10010')]}
2000
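
Note that int(time.time()) * 1000 truncates to whole seconds before scaling, which is why the figure above is a round 2000; a sketch of millisecond-resolution timing of the same call, if finer granularity is wanted:

# Sketch: millisecond-resolution timing of a single get_kendall_dist call.
b = time.time()
get_kendall_dist(("10010", base_dir, [100, 1000, 5000]))
print int((time.time() - b) * 1000)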

In [29]:
ids = glob.glob(base_dir + "*_[rd]")

In [30]:
ids = list(set([path_leaf(path)[:-2] for path in ids]))
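
For example, path_leaf plus the [:-2] slice turns a ranking file name into its experiment id (the path below is illustrative, reusing the "10010" id from the earlier cell):

# Sketch: recovering an experiment id from a ranking file name.
example = path_leaf(base_dir + "10010_d")   # -> "10010_d"
print example[:-2]                          # -> "10010"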

In [31]:
from multiprocessing import Pool, TimeoutError

In [32]:
pool = Pool(processes=10)
#num_nodes = [100, 1000, 5000, 10 * 1000, 20 * 1000, 30 * 1000, 40 * 1000, 70 * 1000, 100 * 1000, 10000000]
num_nodes = [100, 1000, 5000, 10 * 1000, 50 * 1000, 100 * 1000, 200 * 1000, 300 * 1000, 400 * 1000, 500 * 1000, 600 * 1000, 800 * 1000, 2000 * 1000]
b = int(time.time()) * 1000
kendall_dist = {}
for n_ in num_nodes:
    kendall_dist[n_] = []
# Fan the per-id comparisons out over the worker pool and collect the
# overlap scores per prefix size n_.
args = []
for id_ in ids:
    args.append((id_, base_dir, num_nodes))
for res in pool.imap_unordered(get_kendall_dist, args):
    for n_ in num_nodes:
        kendall_dist[n_] += res[n_]
a = int(time.time()) * 1000
pool.close()
pool.join()
print a - b


4147000
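
pandas is imported in the first cell but never used; a minimal sketch of reshaping the kendall_dist dict built above into a DataFrame, which makes per-n summaries easier to read than the raw histograms alone:

# Sketch: flatten the results dict into a DataFrame and summarize per prefix size n.
rows = []
for n_, scores in kendall_dist.items():
    for overlap, id_ in scores:
        rows.append({"n": n_, "id": id_, "overlap": overlap})
df = pd.DataFrame(rows)
print df.groupby("n")["overlap"].mean()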

In [33]:
pickle.dump(kendall_dist, open(base_dir + "results_dict", 'wb'))
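
The bare open() above leaves closing the handle to garbage collection; a minimal variant using a with block so the pickle is flushed and closed deterministically:

# Sketch: same dump, but with an explicit close of the output file.
with open(base_dir + "results_dict", 'wb') as out_f:
    pickle.dump(kendall_dist, out_f)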

In [36]:
len(kendall_dist[100])


Out[36]:
1000

In [55]:
for n in num_nodes:
    plt.hist([r[0] for r in kendall_dist[n]]);
    plt.savefig("/users/eng/eliavb/web/reverse_rank_dist/yt_" + str(n) + ".png")
    plt.close()
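
The histograms above and below use matplotlib's default, data-dependent bins; a sketch with a fixed set of bins over [0, 1] so the plots for different n are directly comparable (the 20-bin count and the "_fixed_bins" file suffix are assumptions, not part of the original run):

# Sketch: fixed bins over [0, 1] so all histograms share the same x-axis scale.
bins = np.linspace(0.0, 1.0, 21)
for n in num_nodes:
    plt.hist([r[0] for r in kendall_dist[n]], bins=bins)
    plt.savefig("/users/eng/eliavb/web/reverse_rank_dist/yt_" + str(n) + "_fixed_bins.png")
    plt.close()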

In [38]:
plt.hist([r[0] for r in kendall_dist[5000]]);



In [39]:
plt.hist([r[0] for r in kendall_dist[10000]]);



In [40]:
plt.hist([r[0] for r in kendall_dist[50000]]);



In [41]:
plt.hist([r[0] for r in kendall_dist[100000]]);



In [43]:
plt.hist([r[0] for r in kendall_dist[200000]]);



In [44]:
plt.hist([r[0] for r in kendall_dist[300 * 1000]]);



In [45]:
plt.hist([r[0] for r in kendall_dist[400 * 1000]]);



In [46]:
plt.hist([r[0] for r in kendall_dist[500 * 1000]]);



In [47]:
plt.hist([r[0] for r in kendall_dist[600 * 1000]]);



In [48]:
plt.hist([r[0] for r in kendall_dist[800 * 1000]]);



In [51]:
plt.hist([r[0] for r in kendall_dist[2000 * 1000]]);



In [23]:
import pickle

In [24]:
pickle.dump(kendall_dist, open(base_dir + "results_dict", 'wb'))

In [25]:
res = pickle.load(open('/work/eng/eliavb/experiments/slashdot/results_dict', 'rb'))

In [26]:
res.keys()


Out[26]:
[20000, 10000000, 100, 1000, 30000, 10000, 5000, 40000, 70000, 100000]

In [ ]: