In [1]:
import glob
import ntpath
import pickle
import sys
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
#from scipy import stats as st
In [2]:
# Extend the module search path before the cover_pb2 / ranking_pb2 imports below
# (presumably the generated protobuf modules live in this directory -- confirm).
PROTO_SRC_DIR = '/work/eng/eliavb/all_distance_sketch/src/proto/'
if PROTO_SRC_DIR not in sys.path:
    # Guarded so that re-running this cell does not pile up duplicate entries.
    sys.path.append(PROTO_SRC_DIR)
In [3]:
import cover_pb2
import ranking_pb2
In [4]:
def get_node_ranks_from_gpb(ranking):
    """Flatten a ranking message into a list of ``(node_id, node_rank)`` tuples.

    ``ranking`` is any object exposing an iterable ``node_ranks`` attribute
    whose items carry ``node_id`` and ``node_rank``. The order of the
    returned list follows the iteration order of ``ranking.node_ranks``.
    """
    return [(entry.node_id, entry.node_rank) for entry in ranking.node_ranks]
def compare_ranks_node_id(x, y):
    """Three-way comparison of two ``(node_id, node_rank)`` tuples.

    Entries are ordered by ``node_rank`` first and by ``node_id`` second.

    Returns:
        -1 if ``x`` sorts before ``y``, 1 if it sorts after, and 0 when both
        the rank and the id are equal (a comparator must report equality as 0
        to be consistent, i.e. ``cmp(x, x) == 0``).
    """
    node_id_x, node_rank_x = x
    node_id_y, node_rank_y = y
    # Primary key: rank; tie-breaker: node id.
    for left, right in ((node_rank_x, node_rank_y), (node_id_x, node_id_y)):
        if left < right:
            return -1
        if left > right:
            return 1
    return 0
def path_leaf(path):
    """Return the last component of ``path``.

    A trailing separator is tolerated: when the tail after the final
    separator is empty, the base name of the remaining head is returned.
    ``ntpath`` is used for splitting, so both ``/`` and ``\\`` are treated
    as separators.
    """
    parent, name = ntpath.split(path)
    if name:
        return name
    return ntpath.basename(parent)
In [27]:
# Directory holding the "<id>_d" / "<id>_r" ranking files. It must end with "/"
# because file names are built from it by plain string concatenation.
base_dir = "/work/eng/eliavb/experiments/youtube/"
In [28]:
def _read_sorted_node_ids(path):
    """Parse the serialized NodeRanksGpb at ``path``; return node ids ordered by (rank, id)."""
    # The context manager closes the handle even if reading or parsing fails.
    with open(path, "rb") as handle:
        payload = handle.read()
    ranking = ranking_pb2.NodeRanksGpb()
    ranking.ParseFromString(payload)
    # Entries are (node_id, node_rank): sort by rank, break ties by node id.
    ordered = sorted(get_node_ranks_from_gpb(ranking),
                     key=lambda pair: (pair[1], pair[0]))
    return [pair[0] for pair in ordered]


def get_kendall_dist(tup):
    """Compare the top-n prefixes of the two rankings stored for one id.

    Despite its name, this does not compute a Kendall distance: for each
    cutoff ``n`` it returns the overlap ratio
    ``|top_n(d) & top_n(r)| / min(|top_n(d)|, |top_n(r)|)``.

    Args:
        tup: ``(id_, base_dir, num_elements)``. The rankings are read from
            ``base_dir + id_ + "_d"`` and ``base_dir + id_ + "_r"``;
            ``num_elements`` is an iterable of cutoffs. A single tuple
            argument keeps the function usable with ``Pool.imap_unordered``.

    Returns:
        dict mapping every cutoff to a one-element list ``[(overlap, id_)]``.
        The overlap is 0.0 when either top-n set is empty, which avoids a
        ZeroDivisionError.

    Raises:
        IOError/OSError if either ranking file cannot be opened.
    """
    id_, base_dir, num_elements = tup
    ids_d = _read_sorted_node_ids(base_dir + id_ + "_d")
    ids_r = _read_sorted_node_ids(base_dir + id_ + "_r")
    res = {}
    for n_ in num_elements:
        top_d = set(ids_d[:n_])
        top_r = set(ids_r[:n_])
        smaller = min(len(top_d), len(top_r))
        if smaller:
            overlap = float(len(top_d & top_r)) / float(smaller)
        else:
            # An empty ranking (or a zero cutoff) has nothing to overlap with.
            overlap = 0.0
        res[n_] = [(overlap, id_)]
    return res
In [7]:
# Smoke test: run get_kendall_dist for one id and print the elapsed wall-clock
# time in milliseconds. Scale before truncating: int(time.time()) * 1000 would
# only ever give whole seconds.
b = int(time.time() * 1000)
print(get_kendall_dist(("10010", base_dir, [100, 1000, 5000, 10000, 20000, 100000, 10000000])))
a = int(time.time() * 1000)
print(a - b)
In [29]:
# All ranking files under base_dir, i.e. names ending in "_r" or "_d".
# "[rd]" is a character class matching one of the two letters; putting "|"
# inside the brackets would make a literal "|" match as well.
ids = glob.glob(base_dir + "*_[rd]")
In [30]:
# Reduce the file paths to unique ids by dropping the "_r" / "_d" suffix.
# Names without such a suffix are kept unchanged, so re-running this cell on
# its own output normally does not chop further characters off the ids.
ids = list({
    name[:-2] if name.endswith(("_r", "_d")) else name
    for name in (path_leaf(path) for path in ids)
})
In [31]:
from multiprocessing import Pool, TimeoutError
In [32]:
# Cutoffs (top-n sizes) at which the two rankings of every id are compared.
num_nodes = [100, 1000, 5000, 10 * 1000, 50 * 1000, 100 * 1000, 200 * 1000, 300 * 1000, 400 * 1000, 500 * 1000, 600 * 1000, 800 * 1000, 2000 * 1000]
# One accumulator list of (overlap, id) tuples per cutoff.
kendall_dist = {n_: [] for n_ in num_nodes}
# get_kendall_dist takes a single tuple so it can be mapped over a pool.
args = [(id_, base_dir, num_nodes) for id_ in ids]

pool = Pool(processes=10)
# Scale before truncating, otherwise the resolution is whole seconds.
b = int(time.time() * 1000)
try:
    # Results arrive in completion order; each carries its id, so order is irrelevant.
    for res in pool.imap_unordered(get_kendall_dist, args):
        for n_ in num_nodes:
            kendall_dist[n_] += res[n_]
    a = int(time.time() * 1000)
    pool.close()
except BaseException:
    # Error or interrupt: stop outstanding work instead of leaking workers.
    pool.terminate()
    raise
finally:
    pool.join()
print(a - b)
In [33]:
# Persist the per-cutoff results next to the input data; the context manager
# makes sure the file is flushed and closed.
with open(base_dir + "results_dict", 'wb') as out_file:
    pickle.dump(kendall_dist, out_file)
In [36]:
# Sanity check: each processed id contributes exactly one (overlap, id) entry
# per cutoff, so this is the number of ids that were processed.
len(kendall_dist[100])
Out[36]:
In [55]:
# Write one histogram of overlap ratios per cutoff to a PNG file.
for cutoff in num_nodes:
    overlaps = [entry[0] for entry in kendall_dist[cutoff]]
    out_path = "/users/eng/eliavb/web/reverse_rank_dist/yt_" + str(cutoff) + ".png"
    plt.hist(overlaps);
    plt.savefig(out_path)
    # Release the figure so figures do not accumulate across iterations.
    plt.close()
In [38]:
# Distribution of the top-5000 overlap ratios, one sample per id.
plt.hist([overlap for overlap, _id in kendall_dist[5000]]);
In [39]:
# Distribution of the top-10000 overlap ratios, one sample per id.
plt.hist([overlap for overlap, _id in kendall_dist[10000]]);
In [40]:
# Distribution of the top-50000 overlap ratios, one sample per id.
plt.hist([overlap for overlap, _id in kendall_dist[50000]]);
In [41]:
# Distribution of the top-100000 overlap ratios, one sample per id.
plt.hist([overlap for overlap, _id in kendall_dist[100000]]);
In [43]:
# Distribution of the top-200000 overlap ratios, one sample per id.
plt.hist([overlap for overlap, _id in kendall_dist[200000]]);
In [44]:
# Distribution of the top-300000 overlap ratios, one sample per id.
plt.hist([overlap for overlap, _id in kendall_dist[300000]]);
In [45]:
# Distribution of the top-400000 overlap ratios, one sample per id.
plt.hist([overlap for overlap, _id in kendall_dist[400000]]);
In [46]:
# Distribution of the top-500000 overlap ratios, one sample per id.
plt.hist([overlap for overlap, _id in kendall_dist[500000]]);
In [47]:
# Distribution of the top-600000 overlap ratios, one sample per id.
plt.hist([overlap for overlap, _id in kendall_dist[600000]]);
In [48]:
# Distribution of the top-800000 overlap ratios, one sample per id.
plt.hist([overlap for overlap, _id in kendall_dist[800000]]);
In [51]:
# Distribution of the top-2000000 overlap ratios, one sample per id.
plt.hist([overlap for overlap, _id in kendall_dist[2000000]]);
In [23]:
import pickle
In [24]:
# Persist the per-cutoff results next to the input data; the context manager
# makes sure the file is flushed and closed.
with open(base_dir + "results_dict", 'wb') as out_file:
    pickle.dump(kendall_dist, out_file)
In [25]:
# NOTE(review): this loads the slashdot results, while the cells above write the
# results under base_dir -- confirm that the different dataset is intended.
# pickle.load can execute arbitrary code embedded in the file; only load
# result files you produced yourself.
with open('/work/eng/eliavb/experiments/slashdot/results_dict', 'rb') as in_file:
    res = pickle.load(in_file)
In [26]:
# Keys of the loaded results dictionary (presumably the cutoffs it was built with -- verify).
res.keys()
Out[26]:
In [ ]: