In [16]:
import json
import numpy as np
from numpy import linalg as la
import os


# number of predefined topics
label_num = 180
# originality determines how far belief can propagate through the network
originality = 0.001
# number of propagation and mixing iterations
iter_num = 300

# 90: best partition (similarity loss)
# 89: the uncertainty and reliability topics merge at this level (similarity loss)
# 87: best partition (similarity + entropy loss)
tolerance = 89

with open("data/super_data_2.json", "r") as f:
    super_data = json.load(f)


p_data=super_data['papers']
bayes_rank=super_data['bayes_ranks']
index_phrase=super_data['index_phrase']




#propagation stage
source=[]


for i in range(label_num):
    source.append(bayes_rank[i])


# list for storing the index of all sources
source_idx=[]
# check set for preventing duplicate adding
check=set()

# seed the check set and source index list with the topic sources
for i in range (len(source)):
    check.add(source[i])
    source_idx.append(source[i])

# BFS to find a valid propagation order
ordering=[]

# build the depth-1 nodes; depth measures how far a node is from the sources
depth_1=[]
for i in source_idx:
    for j in p_data[i]['all_cite']:
        if not int(j) in check:
            depth_1.append(int(j))
            check.add(int(j))
        
ordering.append(depth_1)

# build the remaining layers, one depth at a time
running=True
depth=0

while running:
    new_layer=[]
    for i in ordering[depth]:
        for j in p_data[i]['all_cite']:
            if not int(j) in check:
                check.add(int(j))
                new_layer.append(int(j))
    if(len(new_layer)==0):
        running=False
    else:
        ordering.append(new_layer)
        depth+=1
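# Illustrative sketch (toy graph, not part of the pipeline): the same layering
# on a tiny hypothetical citation graph with node 0 as the only source.
# Expected layers: [1, 2] at depth 1, then [3] at depth 2.
_demo_cites = {0: [1, 2], 1: [3], 2: [3], 3: []}
_demo_check = {0}
_demo_ordering = []
_demo_frontier = [0]
while True:
    _demo_layer = []
    for _n in _demo_frontier:
        for _c in _demo_cites[_n]:
            if _c not in _demo_check:
                _demo_check.add(_c)
                _demo_layer.append(_c)
    if len(_demo_layer) == 0:
        break
    _demo_ordering.append(_demo_layer)
    _demo_frontier = _demo_layer
assert _demo_ordering == [[1, 2], [3]]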

# build edge weights by normalizing the citation similarities
weights=[]



for p in p_data:
    z = float(sum(p['all_cite_sim'])) + originality
    vec=np.array(p['all_cite_sim'])/z
    p['prop_ratio']=1-(originality/z)
    weights.append(vec)
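# Illustrative sketch (hypothetical similarities): with all_cite_sim = [0.2, 0.3]
# and originality = 0.001, z = 0.501, so the edge weights sum to just under 1
# and the missing ~0.2% is the belief mass the paper keeps for itself.
_demo_sim = np.array([0.2, 0.3])
_demo_z = float(_demo_sim.sum()) + originality
_demo_weights = _demo_sim / _demo_z
_demo_prop_ratio = 1 - (originality / _demo_z)
assert abs(_demo_weights.sum() - _demo_prop_ratio) < 1e-12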
        
        
# convert the citation ids from strings into an integer vector
for p in p_data:
    int_vec=[]
    for i in p['all_cite']:
        int_vec.append(int(i))
    p['edge_set']=int_vec

    
# build and initialize the topics matrix
topics=np.zeros((len(p_data),label_num),dtype=float)
for i in range (len(source)):
    topics[source[i], i] = 1

    

for i in range(iter_num):
    for layer in ordering:
        for node in layer:
            n_vec = p_data[node]['edge_set']
            sub = topics[n_vec, :]
            update = np.dot(weights[node], sub)
            topics[node,:]=update
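# Illustrative sketch (hypothetical 2-topic case): each update replaces a node's
# topic row with the weight-averaged rows of its neighbours, so a node citing
# two pure-topic sources with weights [0.6, 0.3] ends up mixed; the rest of the
# mass is the share withheld by originality (exaggerated here for clarity).
_demo_topics = np.array([[1.0, 0.0],    # a pure topic-0 source
                         [0.0, 1.0]])   # a pure topic-1 source
_demo_w = np.array([0.6, 0.3])
assert np.allclose(np.dot(_demo_w, _demo_topics), [0.6, 0.3])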

# analyze the effect of propagation
for i in range(len(p_data)):
    p_data[i]['actual_ratio']=np.sum(topics[i])

actual_ratios=[]
prop_ratios=[]
for p in p_data:
    if p['prop_ratio']!=0.0:
        prop_ratios.append(p['prop_ratio'])
    if p['actual_ratio']!=0.0:
        actual_ratios.append(p['actual_ratio'])

        
# prop_ratios contains the nodes that have at least one undirected connection
print len(prop_ratios)
print sum(prop_ratios)/len(prop_ratios)
# actual_ratios contains no zeros
print len(actual_ratios)
# average actual ratio after propagation
print sum(actual_ratios)/len(actual_ratios)
# number of nodes in network
print len(check)
# depth of the tree
print len(ordering)


526
0.99147777554
526
0.988746113904
526
3

In [17]:
# merge stage
from Queue import PriorityQueue
from scipy.stats import entropy


#build the initial group
node_group=[-1 for i in range(len(p_data))]
group=[[] for i in range(label_num)]

for i in range(len(p_data)):
    if(np.max(topics[i])!=0):
        p_data[i]['prop_group']=np.argmax(topics[i])
        group[np.argmax(topics[i])].append(i)
    else:
        p_data[i]['prop_group']=-1

        
# build group info
group_info={}

# keep track of the independent groups
curr_group=set()

# priority queue for picking the greedy merge option
q=PriorityQueue()


# build text vectors from the compressed phrase counts
# each node vector is normalized first so that abstracts of different lengths weigh evenly
def build_text_vector(vec):
    text_vec=np.zeros(len(index_phrase))
    for i in vec:
        node_vec=np.zeros(len(index_phrase))
        phrases=p_data[i]['phrases']
        for key in phrases:
            node_vec[int(key)]+=phrases[key]
        node_vec/=np.sum(node_vec)
        text_vec+=node_vec
    text_vec/=len(vec)
    return text_vec


# cosine similarity
def get_similarity(a,b):
    return np.dot(a,b)/(la.norm(a)*la.norm(b))
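# Illustrative sketch (toy phrase vectors): two vectors sharing one of two
# active phrases have cosine similarity 0.5.
_demo_a = np.array([1.0, 1.0, 0.0])
_demo_b = np.array([1.0, 0.0, 1.0])
assert abs(get_similarity(_demo_a, _demo_b) - 0.5) < 1e-12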


# build group
for i in range(label_num):
    group_info[str(i)]={}
    group_info[str(i)]['nodes']=set(group[i])
    group_info[str(i)]['text_vec']=build_text_vector(group[i])
    group_info[str(i)]['size']=len(group[i])
    group_info[str(i)]['meta_group']=str(i)
    curr_group.add(str(i))
    for j in group[i]:
        node_group[j]=str(i)


# build connections between groups
for key in group_info:
    g=group_info[key]
    nodes=g['nodes']
    connected_group=set()
    for node in nodes:
        citation_from_node = p_data[node]['all_cite']
        for c in citation_from_node:
            o_group=node_group[int(c)]
            if(o_group!=key):
                connected_group.add(o_group)
    g['connected_group']=connected_group
    for cg in connected_group:
        q.put([-get_similarity(group_info[key]['text_vec'],group_info[cg]['text_vec']),[key,cg]])
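# Illustrative sketch: PriorityQueue pops the smallest key first, which is why
# the similarity is negated above; the most similar pair is served first.
_demo_q = PriorityQueue()
_demo_q.put([-0.9, ['a', 'b']])
_demo_q.put([-0.2, ['c', 'd']])
assert _demo_q.get()[1] == ['a', 'b']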
        

# get the meta group (current root) for a basic group
def get_meta(node):
    temp=node
    while(group_info[temp]['meta_group']!=temp):
        # get the parent until we hit the root
        temp=group_info[temp]['meta_group']
    return temp
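# Illustrative sketch (hypothetical names): after groups '0' and '1' merge into
# '0,1', following meta_group pointers from either child reaches the same root.
_demo_meta = {'0': '0,1', '1': '0,1', '0,1': '0,1'}
def _demo_get_meta(node):
    while _demo_meta[node] != node:
        node = _demo_meta[node]
    return node
assert _demo_get_meta('0') == '0,1' and _demo_get_meta('1') == '0,1'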

total_degree=0
for p in p_data:
    total_degree+=len(p['all_cite'])

# get the modularity gain of merging node set a into node set b
def get_modularity(a_node, b_node):    
    delta_q=0
    count=0
    for n in a_node:
        a_degree=len(p_data[n]['all_cite'])
        for c in p_data[n]['all_cite']:
            b_degree=len(p_data[int(c)]['all_cite'])
            if int(c) in b_node:
                count+=1
                delta_q+=1-(float(a_degree*b_degree)/total_degree)
            else:
                delta_q+=(-(float(a_degree*b_degree)/total_degree))
    delta_q/=total_degree
    return delta_q
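# Illustrative sketch (toy path 0-1-2, hypothetical degrees): each of a's edges
# contributes (A_ij - k_i*k_j/total_degree)/total_degree, where A_ij is 1 when
# the edge lands inside the other group. Merging {0} into {1, 2} counts node
# 0's single edge, so the gain is (1 - 1*2/4)/4 = 0.125 > 0 and such a merge
# would be accepted.
_demo_degree = {0: 1, 1: 2, 2: 1}
_demo_gain = (1 - float(_demo_degree[0] * _demo_degree[1]) / 4) / 4
assert abs(_demo_gain - 0.125) < 1e-12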

    
#merge two groups and update the data structure
def merge_group(a, b):
    
    # change group_info
    name = a+','+b

    ga=group_info[a]
    gb=group_info[b]
    
    delta_q = get_modularity(ga['nodes'],gb['nodes'])
    
    if(delta_q < 0):
        return
    
    ga['meta_group']=name
    gb['meta_group']=name
    group_info[name]={}
    group_info[name]['nodes']=(ga['nodes']|gb['nodes'])
    group_info[name]['size']=ga['size']+gb['size']
    group_info[name]['text_vec']=(ga['text_vec']*ga['size']+gb['text_vec']*gb['size'])/group_info[name]['size']
    group_info[name]['meta_group']=name
    curr_group.remove(a)
    curr_group.remove(b)
    curr_group.add(name)
    
    # find new connections via meta_group; the current connections could be out of date due to earlier merges
    combine_connection=(ga['connected_group']|gb['connected_group'])
    connected_group=set()
    
    for i in combine_connection:
        meta=get_meta(i)
        if(meta!=name):
            if(not meta in connected_group):
                connected_group.add(meta)
    
    group_info[name]['connected_group']=connected_group
    # create new merge options for the queue
    for cg in connected_group:
        
        n_g=group_info[name]
        t_g=group_info[cg]
        
        new_vector=(n_g['text_vec']*n_g['size']+t_g['text_vec']*t_g['size'])/(n_g['size']+t_g['size'])
        e = entropy(new_vector)
        penalty=(n_g['size']+t_g['size'])
        
        similarity = get_similarity(group_info[name]['text_vec'],group_info[cg]['text_vec'])
        # score uses the similarity + entropy loss; score2 uses the similarity loss alone
        score = - similarity/(e*penalty)
        # this run queues score2, matching the tolerance of 89 noted above
        score2 = - similarity/(penalty)

        
        q.put([score2,[name,cg]])
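# Illustrative sketch (hypothetical sizes): the merged text vector is the
# size-weighted average of the two group vectors, so a group of 3 nodes pulls
# the result three times harder than a group of 1.
_demo_va = np.array([1.0, 0.0])
_demo_vb = np.array([0.0, 1.0])
assert np.allclose((_demo_va * 3 + _demo_vb * 1) / 4, [0.75, 0.25])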
    
    


# greedily merge until we run out of options or reach the tolerance
while (not q.empty()):
    option = q.get()
    if(len(curr_group) <= tolerance):
        break
    tup=option[1]
    if ((not tup[0] in curr_group) or (not tup[1] in curr_group)):
        continue 
    merge_group(tup[0],tup[1])
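# Illustrative sketch: pairs whose groups were already merged away are not
# removed from the queue; they are simply skipped when popped (lazy
# invalidation), exactly like the continue above.
_demo_active = set(['a,b', 'c'])
_demo_stale_q = PriorityQueue()
_demo_stale_q.put([-0.8, ['a', 'b']])     # stale: 'a' and 'b' already merged
_demo_stale_q.put([-0.5, ['a,b', 'c']])
_demo_picked = None
while not _demo_stale_q.empty():
    _demo_pair = _demo_stale_q.get()[1]
    if _demo_pair[0] in _demo_active and _demo_pair[1] in _demo_active:
        _demo_picked = _demo_pair
        break
assert _demo_picked == ['a,b', 'c']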

    
# organize group info 
group_index={}
final_group_info=[]

# produce a new index for the merged groups
count=0
for group in curr_group:
    group_index[group]=count
    group_info[group]['index']=count
    final_group_info.append(group_info[group])
    count+=1
    
# update the connection in meta group with integer index
for group in curr_group:
    new_connection=set()
    for cg in group_info[group]['connected_group']:
        new_connection.add(group_info[get_meta(cg)]['index'])
    group_info[group]['connected_group']=new_connection
        
print len(curr_group)


89

In [18]:
from operator import itemgetter

# update each single node's group index
for p in p_data:
    p['ppm_index']=-1

for group in final_group_info:
    nodes = group['nodes']
    for n in nodes:
        p_data[n]['ppm_index']=group_index[group['meta_group']]

        
#Top phrase for every group
group_phrases=[]
for i in range(len(group_index)):
    top_phrase=[]
    b=np.argsort(final_group_info[i]['text_vec'])[::-1]
    for k in range(30):
        top_phrase.append(index_phrase[str(b[k])])
    final_group_info[i]['top_phrase']=top_phrase
    name_str = (top_phrase[0]+', '+top_phrase[1]+' and '+top_phrase[2])
    final_group_info[i]['name']=name_str
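# Illustrative sketch: np.argsort(...)[::-1] orders indices from the largest
# weight down, so b[0] above is the index of the heaviest phrase.
assert list(np.argsort(np.array([0.1, 0.7, 0.2]))[::-1]) == [1, 2, 0]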

# get importer, exporter and contribution score
for group in final_group_info:
    
    index=group['index']
    # map each connected group to the set of local nodes trading with it
    importer={}
    exporter={}
    # map each connected group to a citation count
    import_score={}
    export_score={}
    exchange_score={}

    
    for cg in group['connected_group']:
        importer[cg]=set()
        exporter[cg]=set()
        import_score[cg]=0
        export_score[cg]=0
        exchange_score[cg]=0
    
    for node in group['nodes']:
        
        for c in p_data[node]['citations']:
            out_index=p_data[int(c)]['ppm_index']
            if(out_index!=index):
                importer[out_index].add(node)
                import_score[out_index]+=1
                exchange_score[out_index]+=1
                
        for c in p_data[node]['cited_by']:
            out_index=p_data[int(c)]['ppm_index']
            if(out_index!=index):
                exporter[out_index].add(node)
                export_score[out_index]+=1
                exchange_score[out_index]+=1
                
    for cg in group['connected_group']:
        importer[cg]=list(importer[cg])
        exporter[cg]=list(exporter[cg])
    
    import_list=[]
    for key in import_score:
        import_list.append([key, import_score[key]])
    export_list=[]
    for key in export_score:
        export_list.append([key, export_score[key]])
    exchange_list=[]
    for key in exchange_score:
        exchange_list.append([key, exchange_score[key]])
    
    group['import_list']=sorted(import_list,key=itemgetter(1),reverse=True)
    group['export_list']=sorted(export_list,key=itemgetter(1),reverse=True)
    group['exchange_list']=sorted(exchange_list,key=itemgetter(1),reverse=True)
    
    group['importer']=importer
    group['exporter']=exporter
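# Illustrative sketch: sorted(..., key=itemgetter(1), reverse=True) ranks the
# [group, score] pairs by score, so the heaviest trading partner comes first.
assert sorted([[3, 2], [7, 9], [1, 5]], key=itemgetter(1), reverse=True) == [[7, 9], [1, 5], [3, 2]]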


# turn sets into lists for JSON storage
for group in final_group_info:
    group['nodes']=list(group['nodes'])
    group['connected_group']=list(group['connected_group'])
    del group['text_vec']
    del group['meta_group']

super_data['ppm_group']=final_group_info

In [19]:
import os
path = "data/super_data_3.json" 
if(os.path.isfile(path)): 
    os.remove(path) 
with open(path, "w") as f: 
    json.dump(super_data, f)
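# Optional sketch (defensive; the dump above does not need it under Python 2):
# numpy integer scalars are not JSON serializable under Python 3, so if any
# slipped into super_data, a default= hook like this hypothetical _to_native
# would convert them to native Python values.
def _to_native(obj):
    if isinstance(obj, (np.integer, np.floating)):
        return obj.item()
    raise TypeError(repr(obj) + " is not JSON serializable")
# usage: json.dump(super_data, f, default=_to_native)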

In [20]:
# check whether the clusters are balanced (each line below lists the basic clusters in one merged group)

print (len(super_data['ppm_group']))
print (len(super_data['louvain_group']))


print ' '
for c in curr_group:
    print c


89
69
 
135
39,64,178,159
166
12,62,2,74,126,154,46,52,19,49
157,56
22
23
115,77
161
119
120
121
122
125
167
118
55
79,31,164,155
168,28,11,57,172,1
18,34,33,40
130,162
114,17,138,36,6,81,0,8,20,70,152,128,53,170,43,78,139,4,124
76,80,68
88
89
111
110
113
112
82
83
86
87
84
85
129,163
177
137,148
140,144
176,51
108
109
102
103
100
101
106
107
104
105
30
141,174,153,47
132,160
133,156
61
66
175
173
127,13,14,50,26,38,147,15,25,134,7,169,58,143,3,60
29,65,37,27,9,142,158
41,59,136,54
99
98
91
90
93
92
95
94
97
96
10
117
116
150
48
44
45
5
131,69
146
35,42,63
73
72
71
165,67,179
149,151
171,24,145,123,21
16,75,32

In [ ]: