In [11]:
import networkx as nx
import numpy as np
import random
import _pickle as pkl

from snpp.cores.joint_part_pred import iterative_approach
from snpp.cores.max_balance import faster_greedy
from snpp.cores.lowrank import partition_graph, alq_spark, predict_signs
from snpp.cores.budget_allocation import exponential_budget, \
    constant_then_exponential_budget, \
    linear_budget, \
    constant_budget
from snpp.cores.louvain import best_partition
from snpp.cores.triangle import build_edge2edges
from snpp.utils.signed_graph import fill_diagonal
from snpp.utils.edge_filter import filter_by_min_triangle_count
from snpp.utils.data import load_train_test_data

from snpp.utils.spark import sc

In [12]:
dataset = 'slashdot'
lambda_ = 0.1
k = 40
max_iter = 20
random_seed = 123456

recache_input = False

min_tri_count = 20

random.seed(random_seed)
np.random.seed(random_seed)

In [13]:
g, test_m = load_train_test_data(dataset, recache_input)

test_idx_sorted = list(map(lambda e: tuple(sorted(e)), zip(*test_m.nonzero())))
test_idx = list(zip(*test_m.nonzero()))


loading sparse matrix from data/slashdot.npz
splitting train and test...
1044it [00:00, 3465.57it/s]
converting to nx.Graph
building Graph
464917it [01:01, 7584.11it/s]
saving training graph and test data...

In [14]:
print(g.number_of_edges())
print(len(test_idx_sorted))


426039
51658

In [15]:
# make prediction based on undirectionality
if False:
    # disabled for now
    ud_preds = []
    ud_truth = []
    for i, j in test_idx_sorted:
        if g.has_edge(i, j):
            ud_preds.append((i, j, g[i][j]['sign']))
            s = test_m[i, j]
            if s == 0:
                s = test_m[j, i]
            ud_truth.append((i, j, s))

    print('made predictions on {} edges based on undirectionality'.format(len(ud_preds)))
    print("=> accuracy is {}".format(
            len(set(ud_truth).intersection(set(ud_preds))) / len(ud_truth)))

In [18]:
targets = set(test_idx_sorted)
if False:
    print('removing already predicted entries')
    idx_i, idx_j, data = map(list, zip(*ud_preds))
    targets -= set(zip(idx_i, idx_j))
    targets = set([tuple(sorted(e)) for e in targets])  # sort each edge so that i <= j                                                    
    print('=> remaining #targets {}'.format(len(targets)))

print('filtering edges with at least {} triangles'.format(min_tri_count))
filtered_targets = set(filter_by_min_triangle_count(g, targets, min_tri_count))
print('=> remaining #targets {}'.format(len(filtered_targets)))


filtering edges with at least 20 triangles
=> remaining #targets 2566

In [17]:
# %%timeit -r 1 -n 1
# start the iterative approach
part, iter_preds, status = iterative_approach(
    g,
    T=filtered_targets,
    k=k,
    graph_partition_f=partition_graph,
    graph_partition_kwargs=dict(sc=sc,
                                lambda_=lambda_, iterations=max_iter,
                                seed=random_seed),
    budget_allocation_f=constant_budget,
    budget_allocation_kwargs=dict(const=200),
    solve_maxbalance_f=faster_greedy,
    solve_maxbalance_kwargs={'edge2edges': build_edge2edges(g.copy(),
                                                            targets)},
    truth=set([(i, j, (test_m[i, j]
                       if test_m[i, j] != 0
                       else test_m[j, i]))
               for i, j in filtered_targets]),
    perform_last_partition=False)

print('dumping result...')
pkl.dump(status, open('data/{}/status.pkl'.format(dataset), 'wb'))

print('made prediction on {} edges using iterative'.format(status.pred_cnt_list[-1]))
print('=> accuracy is {}'.format(status.acc_list[-1]))


  7%|▋         | 3621/51658 [00:00<00:02, 17314.79it/s]
build edge2edges
100%|██████████| 51658/51658 [00:02<00:00, 19296.47it/s]
---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
<ipython-input-17-3d9e21c8f654> in <module>()
     18                        else test_m[j, i]))
     19                for i, j in filtered_targets]),
---> 20     perform_last_partition=False)
     21 
     22 print('dumping result...')

/home/cloud-user/code/snpp/snpp/cores/joint_part_pred.py in iterative_approach(g, T, k, graph_partition_f, budget_allocation_f, solve_maxbalance_f, graph_partition_kwargs, budget_allocation_kwargs, solve_maxbalance_kwargs, truth, perform_last_partition)
     45     # data format checking
     46     for i, j in T:
---> 47         assert i <= j
     48 
     49     if truth:

AssertionError: 

In [ ]:
print(g.number_of_edges())

In [ ]:
# %%timeit -r 1 -n 1
A = nx.to_scipy_sparse_matrix(g, nodelist=g.nodes(),
                              weight='sign', format='csr')
A = fill_diagonal(A)
# assert (A.nnz - A.shape[0]) == len(targets)

X, Y = alq_spark(A, k=k, sc=sc,
                 lambda_=lambda_, iterations=max_iter,
                 seed=random_seed)

In [ ]:
remaining_targets = targets - filtered_targets
print('predicting using lowrank method on {} edges'.format(len(remaining_targets)))
lr_preds = predict_signs(X, Y, remaining_targets, sc)

In [ ]:
lr_preds, iter_preds = set(lr_preds), set(iter_preds)
assert len(lr_preds.intersection(iter_preds)) == 0

In [ ]:
def nz_value(m, i, j):
    return (m[i, j] if m[i, j] != 0 else m[j, i])

In [ ]:
truth = set((i, j, nz_value(test_m, i, j)) for i, j in test_idx_sorted)
preds = lr_preds | iter_preds | set(ud_preds)
assert len(preds) == len(truth)
print('=> undirectionality accuracy {} ({})'.format(len(truth.intersection(set(ud_preds))) / len(ud_preds), len(ud_preds)))
print('=> iteractive accuracy {} ({})'.format(len(truth.intersection(iter_preds)) / len(iter_preds), len(iter_preds)))
print('=> lowrank accuracy {} ({})'.format(len(truth.intersection(lr_preds)) / len(lr_preds), len(lr_preds)))
print('=> final accuracy {}'.format(len(truth.intersection(preds)) / len(truth)))