In [16]:
import json
import numpy as np
from numpy import linalg as la
import os
# number of predefined topics (seed labels)
label_num = 180
# originality determines how far belief can propagate through the network
originality = 0.001
# number of propagation and mixing iterations
iter_num = 300
# tolerance = target number of groups at which greedy merging stops:
# 90: best partition (similarity loss)
# 89: the uncertainty and reliability groups merge at this level (similarity loss)
# 87: best partition (similarity + entropy loss)
tolerance = 89
with open("data/super_data_2.json", "r") as f:
super_data = json.load(f)
p_data=super_data['papers']
bayes_rank=super_data['bayes_ranks']
index_phrase=super_data['index_phrase']
#propagation stage
source=[]
for i in range(label_num):
source.append(bayes_rank[i])
# list for storing the index of all sources
source_idx=[]
# check set for preventing duplicate adding
check=set()
# register the source node of each topic
for i in range (len(source)):
check.add(source[i])
source_idx.append(source[i])
# BFS to find a valid propagation order
ordering=[]
# build the depth-1 nodes; depth measures how far a node is from the sources
depth_1=[]
for i in source_idx:
for j in p_data[i]['all_cite']:
if not int(j) in check:
depth_1.append(int(j))
check.add(int(j))
ordering.append(depth_1)
# iteratively build the remaining layers of nodes
running=True
depth=0
while running:
new_layer=[]
for i in ordering[depth]:
for j in p_data[i]['all_cite']:
if not int(j) in check:
check.add(int(j))
new_layer.append(int(j))
if(len(new_layer)==0):
running=False
else:
ordering.append(new_layer)
depth+=1
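# ordering[d] now holds the nodes at citation distance d+1 from the sources;
# updating the layers in this order pushes topic mass one extra hop outward per pass.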
# build edge weight by normalization
weights=[]
for p in p_data:
z = float(sum(p['all_cite_sim'])) + originality
vec=np.array(p['all_cite_sim'])/z
p['prop_ratio']=1-(originality/z)
weights.append(vec)
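# Note: each weight vector sums to (z - originality)/z = prop_ratio < 1, so every
# propagation hop shrinks the transferred topic mass by a factor of prop_ratio.
# For example, with originality = 0.001 and sum(all_cite_sim) = 0.5, prop_ratio is
# roughly 0.998, so a node k hops from a source keeps at most ~0.998**k of the mass.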
# convert the citation ids from strings to an int vector
for p in p_data:
int_vec=[]
for i in p['all_cite']:
int_vec.append(int(i))
p['edge_set']=int_vec
# build and initialize the topics matrix
topics=np.zeros((len(p_data),label_num),dtype=float)
for i in range (len(source)):
topics[source[i], i] = 1
for i in range(iter_num):
for layer in ordering:
for node in layer:
n_vec = p_data[node]['edge_set']
sub = topics[n_vec, :]
update = np.dot(weights[node], sub)
topics[node,:]=update
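# Each pass sets topics[node] to the similarity-weighted combination of the topic
# vectors of its citation neighbours (edge_set), with weights summing to prop_ratio,
# so belief flows outward from the seed sources and is slightly damped at every hop.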
# analyze the effect of propagation
for i in range(len(p_data)):
p_data[i]['actual_ratio']=np.sum(topics[i])
actual_ratios=[]
prop_ratios=[]
for p in p_data:
if p['prop_ratio']!=0.0:
prop_ratios.append(p['prop_ratio'])
if p['actual_ratio']!=0.0:
actual_ratios.append(p['actual_ratio'])
# prop_ratios holds the prop_ratio of every node with at least one undirected connection
print len(prop_ratios)
print sum(prop_ratios)/len(prop_ratios)
# actual_ratios contains no zeros
print len(actual_ratios)
# mean actual ratio after propagation
print sum(actual_ratios)/len(actual_ratios)
# number of nodes in network
print len(check)
# depth of the tree
print len(ordering)
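# Minimal, self-contained sketch of the propagation update above on a toy citation
# graph. The node ids, edges and similarity values here are invented for illustration,
# not taken from the data.
import numpy as np

toy_originality = 0.001
toy_edges = {2: [0, 1], 3: [1, 2]}           # node -> its citation neighbours
toy_sims  = {2: [0.6, 0.4], 3: [0.5, 0.5]}   # matching similarity scores

toy_topics = np.zeros((4, 2))
toy_topics[0, 0] = 1.0    # node 0 is the source of topic 0
toy_topics[1, 1] = 1.0    # node 1 is the source of topic 1

toy_weights = {}
for n in toy_edges:
    z = float(sum(toy_sims[n])) + toy_originality
    toy_weights[n] = np.array(toy_sims[n]) / z

for _ in range(10):                  # a few mixing passes
    for n in [2, 3]:                 # BFS order: the depth-1 node first
        toy_topics[n, :] = np.dot(toy_weights[n], toy_topics[toy_edges[n], :])

# non-source rows sum to at most their prop_ratio, never above 1
print toy_topics
print np.sum(toy_topics, axis=1)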
In [17]:
# merge stage
from Queue import PriorityQueue
from scipy.stats import entropy
#build the initial group
node_group=[-1 for i in range(len(p_data))]
group=[[] for i in range(label_num)]
for i in range(len(p_data)):
if(np.max(topics[i])!=0):
p_data[i]['prop_group']=np.argmax(topics[i])
group[np.argmax(topics[i])].append(i)
else:
p_data[i]['prop_group']=-1
# build group info
group_info={}
# keep track of the currently independent (top-level) groups
curr_group=set()
# priority queue for picking the greedy merge option
q=PriorityQueue()
# build a compressed text vector for a set of nodes;
# each node's phrase vector is normalized so that abstracts of different lengths are weighted evenly
def build_text_vector(vec):
text_vec=np.zeros(len(index_phrase))
for i in vec:
node_vec=np.zeros(len(index_phrase))
phrases=p_data[i]['phrases']
for key in phrases:
node_vec[int(key)]+=phrases[key]
node_vec/=np.sum(node_vec)
text_vec+=node_vec
text_vec/=len(vec)
return text_vec
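# The returned text_vec is the average of per-node phrase distributions, each
# normalized to sum to 1, so abstracts with many phrases do not dominate the group vector.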
# cosine similarity
def get_similarity(a,b):
return np.dot(a,b)/(la.norm(a)*la.norm(b))
# build group
for i in range(label_num):
group_info[str(i)]={}
group_info[str(i)]['nodes']=set(group[i])
group_info[str(i)]['text_vec']=build_text_vector(group[i])
group_info[str(i)]['size']=len(group[i])
group_info[str(i)]['meta_group']=str(i)
curr_group.add(str(i))
for j in group[i]:
node_group[j]=str(i)
# build connection
for key in group_info:
g=group_info[key]
nodes=g['nodes']
connected_group=set()
for node in nodes:
citation_from_node = p_data[node]['all_cite']
for c in citation_from_node:
o_group=node_group[int(c)]
if(o_group!=key):
connected_group.add(o_group)
g['connected_group']=connected_group
for cg in connected_group:
q.put([-get_similarity(group_info[key]['text_vec'],group_info[cg]['text_vec']),[key,cg]])
# get the top-level meta group for a given group
def get_meta(node):
temp=node
while(group_info[temp]['meta_group']!=temp):
# get the parent until we hit the root
temp=group_info[temp]['meta_group']
return temp
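# get_meta follows meta_group pointers until it reaches a group that is its own
# parent, i.e. a union-find style find() without path compression; any previously
# merged group resolves to the latest merged group that contains it.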
total_degree=0
for p in p_data:
total_degree+=len(p['all_cite'])
#get modularity gain
def get_modularity(a_node, b_node):
delta_q=0
count=0
for n in a_node:
a_degree=len(p_data[n]['all_cite'])
for c in p_data[n]['all_cite']:
b_degree=len(p_data[int(c)]['all_cite'])
if int(c) in b_node:
count+=1
delta_q+=1-(float(a_degree*b_degree)/total_degree)
else:
delta_q+=(-(float(a_degree*b_degree)/total_degree))
delta_q/=total_degree
return delta_q
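# delta_q approximates the standard modularity gain
#   dQ ~ (1/2m) * sum_{i in a, j in b} (A_ij - k_i*k_j / (2m)),
# with total_degree playing the role of 2m; the degree-product penalty is only
# accumulated over the citations of a's nodes, so this is a cheaper, one-sided
# approximation. merge_group below rejects merges whose gain is negative.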
#merge two groups and update the data structure
def merge_group(a, b):
# change group_info
name = a+','+b
ga=group_info[a]
gb=group_info[b]
delta_q = get_modularity(ga['nodes'],gb['nodes'])
if(delta_q < 0):
return
ga['meta_group']=name
gb['meta_group']=name
group_info[name]={}
group_info[name]['nodes']=(ga['nodes']|gb['nodes'])
group_info[name]['size']=ga['size']+gb['size']
group_info[name]['text_vec']=(ga['text_vec']*ga['size']+gb['text_vec']*gb['size'])/group_info[name]['size']
group_info[name]['meta_group']=name
curr_group.remove(a)
curr_group.remove(b)
curr_group.add(name)
# find new connections via meta_group; the stored connections could be outdated due to earlier merges
combine_connection=(ga['connected_group']|gb['connected_group'])
connected_group=set()
for i in combine_connection:
meta=get_meta(i)
if(meta!=name):
if(not meta in connected_group):
connected_group.add(meta)
group_info[name]['connected_group']=connected_group
# create new option for queue
for cg in connected_group:
n_g=group_info[name]
t_g=group_info[cg]
new_vector=(n_g['text_vec']*n_g['size']+t_g['text_vec']*t_g['size'])/(n_g['size']+t_g['size'])
e = entropy(new_vector)
penalty=(n_g['size']+t_g['size'])
similarity = get_similarity(group_info[name]['text_vec'],group_info[cg]['text_vec'])
score = - similarity/(e*penalty)
score2 = - similarity/(penalty)
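# score (with the entropy term) corresponds to the "similarity + entropy loss"
# variant noted in the tolerance comments; score2 (size penalty only) is the plain
# similarity loss and is the one actually pushed onto the queue below.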
q.put([score2,[name,cg]])
# greedily merge until we run out of options or the number of groups reaches the tolerance
while (not q.empty()):
option = q.get()
if(len(curr_group) <= tolerance):
break
tup=option[1]
if ((not tup[0] in curr_group) or (not tup[1] in curr_group)):
continue
merge_group(tup[0],tup[1])
# organize group info
group_index={}
final_group_info=[]
# produce a new index for the merged groups
count=0
for group in curr_group:
group_index[group]=count
group_info[group]['index']=count
final_group_info.append(group_info[group])
count+=1
# update the connection in meta group with integer index
for group in curr_group:
new_connection=set()
for cg in group_info[group]['connected_group']:
new_connection.add(group_info[get_meta(cg)]['index'])
group_info[group]['connected_group']=new_connection
print len(curr_group)
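# The merge loop above relies on lazy deletion: queue entries whose groups were
# already merged away are simply skipped because their names are no longer in
# curr_group. A standalone sketch of that pattern (group names and similarity
# scores below are invented for illustration):
from Queue import PriorityQueue

toy_live = set(['a', 'b', 'c'])
toy_q = PriorityQueue()
toy_q.put([-0.9, ['a', 'b']])    # most similar pair pops first (negated similarity)
toy_q.put([-0.8, ['b', 'c']])
toy_q.put([-0.1, ['a', 'c']])
while not toy_q.empty():
    _, (x, y) = toy_q.get()
    if x not in toy_live or y not in toy_live:
        continue                 # stale entry: one endpoint was merged away
    merged = x + ',' + y
    toy_live.discard(x)
    toy_live.discard(y)
    toy_live.add(merged)
    print 'merged', x, y, '->', merged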
In [18]:
from operator import itemgetter
# update each individual node's group index
for p in p_data:
p['ppm_index']=-1
for group in final_group_info:
nodes = group['nodes']
for n in nodes:
p_data[n]['ppm_index']=group_index[group['meta_group']]
#Top phrase for every group
group_phrases=[]
for i in range(len(group_index)):
top_phrase=[]
b=np.argsort(final_group_info[i]['text_vec'])[::-1]
for k in range(30):
top_phrase.append(index_phrase[str(b[k])])
final_group_info[i]['top_phrase']=top_phrase
name_str = (top_phrase[0]+', '+top_phrase[1]+' and '+top_phrase[2])
final_group_info[i]['name']=name_str
# get importer, exporter and contribution score
for group in final_group_info:
index=group['index']
#map group to node
importer={}
exporter={}
#map group to number
import_score={}
export_score={}
exchange_score={}
for cg in group['connected_group']:
importer[cg]=set()
exporter[cg]=set()
import_score[cg]=0
export_score[cg]=0
exchange_score[cg]=0
for node in group['nodes']:
for c in p_data[node]['citations']:
out_index=p_data[int(c)]['ppm_index']
if(out_index!=index):
importer[out_index].add(node)
import_score[out_index]+=1
exchange_score[out_index]+=1
for c in p_data[node]['cited_by']:
out_index=p_data[int(c)]['ppm_index']
if(out_index!=index):
exporter[out_index].add(node)
export_score[out_index]+=1
exchange_score[out_index]+=1
for cg in group['connected_group']:
importer[cg]=list(importer[cg])
exporter[cg]=list(exporter[cg])
import_list=[]
for key in import_score:
import_list.append([key, import_score[key]])
export_list=[]
for key in export_score:
export_list.append([key, export_score[key]])
exchange_list=[]
for key in exchange_score:
exchange_list.append([key, exchange_score[key]])
group['import_list']=sorted(import_list,key=itemgetter(1),reverse=True)
group['export_list']=sorted(export_list,key=itemgetter(1),reverse=True)
group['exchange_list']=sorted(exchange_list,key=itemgetter(1),reverse=True)
group['importer']=importer
group['exporter']=exporter
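# importer[cg]: nodes of this group that cite into group cg (knowledge imported from cg);
# exporter[cg]: nodes of this group cited by papers in cg (knowledge exported to cg);
# the *_list variants rank the partner groups by those citation counts.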
# turn set into list for storage
for group in final_group_info:
group['nodes']=list(group['nodes'])
group['connected_group']=list(group['connected_group'])
del group['text_vec']
del group['meta_group']
super_data['ppm_group']=final_group_info
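# Hypothetical usage of the structures built above (assumes at least one merged
# group exists): list the strongest exchange partners of the first group.
g0 = final_group_info[0]
print g0['name']
for partner_index, n_citations in g0['exchange_list'][:5]:
    print '   ', final_group_info[partner_index]['name'], n_citations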
In [19]:
import os
path = "data/super_data_3.json"
if(os.path.isfile(path)):
os.remove(path)
with open(path, "w") as f:
json.dump(super_data, f)
In [20]:
# check whether the clusters are balanced (the numbers below are the cluster counts, followed by the merged group names)
print (len(super_data['ppm_group']))
print (len(super_data['louvain_group']))
print ' '
for c in curr_group:
print c
In [ ]: