In [1]:
import sys
sys.path.append('../src/mane/prototype/')
import numpy as np
import graph as g
import pickle as p
from sklearn.preprocessing import normalize, scale
from sklearn.metrics import f1_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
In [65]:
def lg(exp_id, graph_name, index=(0,), norm=False, split=0.5, max_iter=100, C=1e9, ic=500):
    """Fit a logistic-regression community classifier on saved embeddings and print F1.

    Loads the pickled weights of experiment ``exp_id`` and the pickled graph
    ``graph_name``, builds one embedding matrix, trains on the sample returned
    by ``graph.gen_training_community(split)`` and prints macro/micro F1
    computed over every node in ``graph.nodes()``.

    Args:
        exp_id: Experiment name; weights are read from
            '../src/mane/prototype/embeddings/<exp_id>.weights'.
        graph_name: Dataset name; '<graph_name>.graph' and
            '<graph_name>.community' are read from '../src/mane/data/'.
        index: Positions in the unpickled weights that are averaged
            element-wise into the embedding. Pass None to use the unpickled
            object itself as the embedding.
        norm: If True, the embedding is passed through sklearn's ``normalize``.
        split: Forwarded to ``graph.gen_training_community`` (presumably the
            training fraction -- confirm in graph.py).
        max_iter: Forwarded to ``LogisticRegression``.
        C: Forwarded to ``LogisticRegression``.
        ic: Forwarded to ``LogisticRegression`` as ``intercept_scaling``.
            NOTE(review): the scikit-learn docs state this only has an effect
            with the liblinear solver -- confirm which solver the installed
            version uses by default.

    Returns:
        Tuple ``(y_true, y_pred, emb, predictor, xids, y_train, graph)``.

    Raises:
        ValueError: If ``index`` is an empty sequence.
    """
    weightfile = '../src/mane/prototype/embeddings/' + exp_id + '.weights'
    graphfile = '../src/mane/data/' + graph_name
    # NOTE: unpickling can execute arbitrary code; only load weight files
    # produced by this project.
    with open(weightfile, 'rb') as f:
        w = p.load(f)
    graph = g.graph_from_pickle(graphfile + '.graph', graphfile + '.community')

    if index is None:
        emb = w
    else:
        if len(index) == 0:
            raise ValueError('index must be None or a non-empty sequence of weight positions')
        # Average out of place: the previous `emb = w[i]; emb += ...; emb /= ...`
        # aliased and overwrote the loaded weight array, and in-place true
        # division fails on integer arrays.
        emb = sum(np.asarray(w[i]) for i in index) / len(index)

    if norm:
        emb = normalize(emb)

    xids, y_train = graph.gen_training_community(split)
    X = [emb[i] for i in xids]
    predictor = LogisticRegression(C=C, max_iter=max_iter,
                                   n_jobs=-1, intercept_scaling=ic).fit(X, y_train)

    # Materialise the node order once so labels and predictions line up, and
    # predict in a single batch instead of one `predict` call per node.
    # NOTE(review): scores cover every node from graph.nodes(), which
    # presumably includes the training ids -- this is not a held-out score.
    nodes = list(graph.nodes())
    y_true = [graph._communities[i] for i in nodes]
    y_pred = list(predictor.predict(np.asarray([emb[i] for i in nodes])))

    print('Experiment ', exp_id, ' ', graph_name)
    print('f1_macro (emb): ', f1_score(y_true, y_pred, average='macro'))
    print('f1_micro (emb): ', f1_score(y_true, y_pred, average='micro'))
    return y_true, y_pred, emb, predictor, xids, y_train, graph
In [14]:
# Without high regularization restriction (maybe extreme overfit)
# NOTE(review): `lg_average` is not defined in the visible cells, so this cell
# would raise NameError on a fresh kernel unless it is defined elsewhere --
# presumably an earlier variant of `lg`; confirm before re-running.
for _ in range(10):
    lg_average('BC3003', 'blogcatalog3')
In [16]:
# Without high regularization
# NOTE(review): `lg_embeddings` is not defined in the visible cells, so this
# cell would raise NameError on a fresh kernel unless it is defined elsewhere --
# presumably an earlier variant of `lg`; confirm before re-running.
for _ in range(10):
    lg_embeddings('BC3_deepwalk', 'blogcatalog3')
In [18]:
# Without high regularization restriction (maybe extreme overfit)
# NOTE(review): `lg_average` is not defined in the visible cells, so this cell
# would raise NameError on a fresh kernel unless it is defined elsewhere --
# presumably an earlier variant of `lg`; confirm before re-running.
for _ in range(10):
    lg_average('BC3012', 'blogcatalog3')
In [12]:
# Normalising the embeddings (norm=True) might improve F1 by ~1.5%.
_ = lg(exp_id='BC3_node2vec', graph_name='blogcatalog3', index=None, norm=True, C=1e10)
In [8]:
# C has a large effect on the F1 score: a bigger C scores better (more overfit).
_ = lg(exp_id='BC3_node2vec', graph_name='blogcatalog3', index=None, C=1e10)
In [7]:
# C has a large effect on the F1 score: a larger C scores better (more overfit)
# and takes longer to fit.
_ = lg(exp_id='BC3_node2vec', graph_name='blogcatalog3', index=None, C=1e15)
In [59]:
# Increase the intercept scaling (ic=20) for node2vec too -- no significant improvement.
# NOTE(review): 20 is below the current `lg` default of ic=500, so "increase"
# is presumably relative to an earlier default; confirm.
_ = lg(exp_id='BC3_node2vec', graph_name='blogcatalog3', index=None, C=1e15, ic=20)
In [17]:
# BC3_deepwalk embeddings with every `lg` setting left at its default.
_ = lg(exp_id='BC3_deepwalk', graph_name='blogcatalog3')
In [18]:
# BC3_deepwalk embeddings, C raised to 1e10.
_ = lg(exp_id='BC3_deepwalk', graph_name='blogcatalog3', C=1e10)
In [19]:
# BC3_deepwalk embeddings, C raised to 1e15.
_ = lg(exp_id='BC3_deepwalk', graph_name='blogcatalog3', C=1e15)
In [20]:
# BC3_deepwalk embeddings, normalised, C=1e15.
_ = lg(exp_id='BC3_deepwalk', graph_name='blogcatalog3', norm=True, C=1e15)
In [21]:
# BC3_deepwalk embeddings, normalised, C=1e20.
_ = lg(exp_id='BC3_deepwalk', graph_name='blogcatalog3', norm=True, C=1e20)
In [23]:
# Test increased intercept scaling (less regularization): ic = 1e10.
# The value is passed explicitly; without it `lg` falls back to its default
# ic=500 and the run would not match this note.
_ = lg('BC3_deepwalk', 'blogcatalog3', norm=True, C=1e10, ic=1e10)
In [25]:
# Test increased intercept scaling (less regularization): ic = 1e5.
# The value is passed explicitly; without it `lg` falls back to its default
# ic=500 and the run would not match this note.
_ = lg('BC3_deepwalk', 'blogcatalog3', norm=True, C=1e10, ic=1e5)
In [27]:
# Test intercept scaling (less regularization): ic = 1.
# The value is passed explicitly; without it `lg` falls back to its default
# ic=500 and the run would not match this note.
_ = lg('BC3_deepwalk', 'blogcatalog3', norm=True, C=1e10, ic=1)
In [29]:
# Test intercept scaling (less regularization): ic = 0.1.
# The value is passed explicitly; without it `lg` falls back to its default
# ic=500 and the run would not match this note.
_ = lg('BC3_deepwalk', 'blogcatalog3', norm=True, C=1e10, ic=0.1)
In [31]:
# Test increased intercept scaling (less regularization): ic = 5.
# The value is passed explicitly; without it `lg` falls back to its default
# ic=500 and the run would not match this note.
_ = lg('BC3_deepwalk', 'blogcatalog3', norm=True, C=1e10, ic=5)
In [33]:
# Test increased intercept scaling (less regularization): ic = 100.
# The value is passed explicitly; without it `lg` falls back to its default
# ic=500 and the run would not match this note.
_ = lg('BC3_deepwalk', 'blogcatalog3', norm=True, C=1e10, ic=100)
In [35]:
# Test increased intercept scaling (less regularization): ic = 1000.
# The value is passed explicitly; without it `lg` falls back to its default
# ic=500 and the run would not match this note.
_ = lg('BC3_deepwalk', 'blogcatalog3', norm=True, C=1e10, ic=1000)
In [37]:
# Intercept-scaling sweep (less regularization): ic = 500.
_ = lg(exp_id='BC3_deepwalk', graph_name='blogcatalog3', norm=True, C=1e10, ic=500)
In [38]:
# Intercept-scaling sweep (less regularization): ic = 200.
_ = lg(exp_id='BC3_deepwalk', graph_name='blogcatalog3', norm=True, C=1e10, ic=200)
In [39]:
# Intercept-scaling sweep (less regularization): ic = 300.
_ = lg(exp_id='BC3_deepwalk', graph_name='blogcatalog3', norm=True, C=1e10, ic=300)
In [40]:
# Intercept-scaling sweep (less regularization): ic = 250.
_ = lg(exp_id='BC3_deepwalk', graph_name='blogcatalog3', norm=True, C=1e10, ic=250)
In [41]:
# Intercept scaling ic = 200 on experiment BC3012 (default index, i.e. first weight entry only).
_ = lg(exp_id='BC3012', graph_name='blogcatalog3', norm=True, C=1e10, ic=200)
In [42]:
# Intercept scaling ic = 200 on experiment BC3012, averaging weight entries 0 and 1.
_ = lg(exp_id='BC3012', graph_name='blogcatalog3', index=[0, 1], norm=True, C=1e10, ic=200)
In [44]:
# Intercept scaling ic = 220 on experiment BC3012, averaging weight entries 0 and 1.
_ = lg(exp_id='BC3012', graph_name='blogcatalog3', index=[0, 1], norm=True, C=1e10, ic=220)
In [46]:
# Note: experiment BC3025 still uses a random walk.
_ = lg(exp_id='BC3025', graph_name='blogcatalog3', index=[0, 1], norm=True, C=1e10, ic=220)
In [47]:
# Check the result of experiment 24, which used a much smaller num_walk than experiment 25.
_ = lg(exp_id='BC3024', graph_name='blogcatalog3', index=[0, 1], norm=True, C=1e10, ic=220)
In [61]:
# Repeat the BC3025 evaluation ten times (C=1e15, ic=230).
for _ in range(10):
    lg(exp_id='BC3025', graph_name='blogcatalog3', index=[0, 1], norm=True, C=1e15, ic=230)
In [63]:
# Test whether balancing the class weights gives better results -- Nope!
# NOTE(review): the `lg` defined in this notebook exposes no class-weight
# option, so this call as written does not enable balancing; presumably it was
# run against an earlier version of `lg` -- confirm.
for _ in range(10):
    lg(exp_id='BC3025', graph_name='blogcatalog3', index=[0, 1], norm=True, C=1e15, ic=230)
In [64]:
# Test whether balancing the class weights gives better results -- Nope!
# NOTE: BC3012 is the motif-walk experiment.
# NOTE(review): the `lg` defined in this notebook exposes no class-weight
# option, so this call as written does not enable balancing; presumably it was
# run against an earlier version of `lg` -- confirm.
for _ in range(10):
    lg(exp_id='BC3012', graph_name='blogcatalog3', index=[0, 1], norm=True, C=1e10, ic=230)
In [66]:
# Same BC3012 configuration with class-weight balancing removed, repeated ten times.
for _ in range(10):
    lg(exp_id='BC3012', graph_name='blogcatalog3', index=[0, 1], norm=True, C=1e10, ic=230)
In [ ]: