In [1]:
import sys
sys.path.append('../src/mane/prototype/')
import numpy as np
import graph as g
import pickle as p

from sklearn.preprocessing import normalize, scale
from sklearn.metrics import f1_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

In [65]:
def lg(exp_id, graph_name, index=[0], norm=False, split=0.5, max_iter=100, C=1e9, ic=500):
    """Train a logistic regression node classifier on learned embeddings.

    exp_id:     name of the experiment whose weight file is loaded.
    graph_name: name of the pickled graph / community files.
    index:      indices of the embedding matrices to average
                (None = treat the loaded weights as a single embedding matrix).
    norm:       L2-normalize each embedding row before training.
    split:      fraction of labeled nodes used for training.
    C, ic:      inverse regularization strength and intercept scaling.
    """
    weightfile = '../src/mane/prototype/embeddings/' + exp_id + '.weights'
    graphfile = '../src/mane/data/' + graph_name
    with open(weightfile, 'rb') as f:
        w = p.load(f)
    graph = g.graph_from_pickle(graphfile + '.graph', graphfile + '.community')
    if index is None:
        emb = w
    else:
        # Average the selected embedding matrices; copy so w is not modified in place.
        emb = np.copy(w[index[0]])
        for i in index[1:]:
            emb += w[i]
        emb /= len(index)
    if norm:
        emb = normalize(emb)
    # Fit on a random `split` fraction of labeled nodes, evaluate on all nodes.
    xids, y_train = graph.gen_training_community(split)
    X = [emb[i] for i in xids]
    predictor = LogisticRegression(C=C, max_iter=max_iter, 
                                   n_jobs=-1, intercept_scaling=ic).fit(X, y_train)
    y_true = [graph._communities[i] for i in graph.nodes()]
    y_pred = [predictor.predict(emb[i].reshape(1, -1))[0] for i in graph.nodes()]
    print('Experiment ', exp_id, ' ', graph_name)
    print('f1_macro (emb): ', f1_score(y_true, y_pred, average='macro'))
    print('f1_micro (emb): ', f1_score(y_true, y_pred, average='micro'))
    return y_true, y_pred, emb, predictor, xids, y_train, graph
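
Note: several cells below call lg_average and lg_embeddings, which are not defined in this excerpt (they come from an earlier session). A minimal sketch of what they presumably do, assuming they are thin wrappers around lg that average the first two embedding matrices (lg_average) or use a single one (lg_embeddings), with the '(avg)' / '(emb)' labels coming from their own print statements:

def lg_average(exp_id, graph_name, **kwargs):
    # Hypothetical wrapper: average the first two embedding matrices.
    return lg(exp_id, graph_name, index=[0, 1], **kwargs)

def lg_embeddings(exp_id, graph_name, **kwargs):
    # Hypothetical wrapper: use only the first embedding matrix.
    return lg(exp_id, graph_name, index=[0], **kwargs)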

In [14]:
# Without a strong regularization constraint (possibly extreme overfitting)
for _ in range(10):
    lg_average('BC3003', 'blogcatalog3')


Experiment  BC3003   blogcatalog3
f1_macro (avg):  0.231464313831
f1_micro (avg):  0.269879751746
Experiment  BC3003   blogcatalog3
f1_macro (avg):  0.231905373246
f1_micro (avg):  0.26871605896
Experiment  BC3003   blogcatalog3
f1_macro (avg):  0.225459827678
f1_micro (avg):  0.26406128782
Experiment  BC3003   blogcatalog3
f1_macro (avg):  0.227966114565
f1_micro (avg):  0.263479441427
Experiment  BC3003   blogcatalog3
f1_macro (avg):  0.223431328045
f1_micro (avg):  0.26871605896
Experiment  BC3003   blogcatalog3
f1_macro (avg):  0.226662751553
f1_micro (avg):  0.261636927851
Experiment  BC3003   blogcatalog3
f1_macro (avg):  0.222029666086
f1_micro (avg):  0.26794026377
Experiment  BC3003   blogcatalog3
f1_macro (avg):  0.236412099949
f1_micro (avg):  0.268231186967
Experiment  BC3003   blogcatalog3
f1_macro (avg):  0.220325754334
f1_micro (avg):  0.26095810706
Experiment  BC3003   blogcatalog3
f1_macro (avg):  0.229073200321
f1_micro (avg):  0.267746314973

In [16]:
# Without strong regularization
for _ in range(10):
    lg_embeddings('BC3_deepwalk', 'blogcatalog3')


Experiment  BC3_deepwalk   blogcatalog3
f1_macro (emb):  0.176592490043
f1_micro (emb):  0.190942591156
Experiment  BC3_deepwalk   blogcatalog3
f1_macro (emb):  0.168755210426
f1_micro (emb):  0.186675717611
Experiment  BC3_deepwalk   blogcatalog3
f1_macro (emb):  0.165052734059
f1_micro (emb):  0.183669511249
Experiment  BC3_deepwalk   blogcatalog3
f1_macro (emb):  0.170187811286
f1_micro (emb):  0.189584949573
Experiment  BC3_deepwalk   blogcatalog3
f1_macro (emb):  0.169425552103
f1_micro (emb):  0.181536074476
Experiment  BC3_deepwalk   blogcatalog3
f1_macro (emb):  0.160817582297
f1_micro (emb):  0.184639255237
Experiment  BC3_deepwalk   blogcatalog3
f1_macro (emb):  0.168058670528
f1_micro (emb):  0.186772692009
Experiment  BC3_deepwalk   blogcatalog3
f1_macro (emb):  0.172268980869
f1_micro (emb):  0.189487975175
Experiment  BC3_deepwalk   blogcatalog3
f1_macro (emb):  0.174936647138
f1_micro (emb):  0.187257564003
Experiment  BC3_deepwalk   blogcatalog3
f1_macro (emb):  0.175188436389
f1_micro (emb):  0.186772692009

In [18]:
# Without a strong regularization constraint (possibly extreme overfitting)
for _ in range(10):
    lg_average('BC3012', 'blogcatalog3')


Experiment  BC3012   blogcatalog3
f1_macro (avg):  0.230215879861
f1_micro (avg):  0.269976726144
Experiment  BC3012   blogcatalog3
f1_macro (avg):  0.240204737171
f1_micro (avg):  0.276474010861
Experiment  BC3012   blogcatalog3
f1_macro (avg):  0.237116080153
f1_micro (avg):  0.275116369279
Experiment  BC3012   blogcatalog3
f1_macro (avg):  0.229967973073
f1_micro (avg):  0.278510473235
Experiment  BC3012   blogcatalog3
f1_macro (avg):  0.227415752099
f1_micro (avg):  0.271819239721
Experiment  BC3012   blogcatalog3
f1_macro (avg):  0.231182872668
f1_micro (avg):  0.277637703646
Experiment  BC3012   blogcatalog3
f1_macro (avg):  0.235566378466
f1_micro (avg):  0.279965089216
Experiment  BC3012   blogcatalog3
f1_macro (avg):  0.234393485211
f1_micro (avg):  0.276667959659
Experiment  BC3012   blogcatalog3
f1_macro (avg):  0.23043577756
f1_micro (avg):  0.27889837083
Experiment  BC3012   blogcatalog3
f1_macro (avg):  0.236381520751
f1_micro (avg):  0.276183087665

In [12]:
# Normalization might improve the score by ~1.5%
_ = lg('BC3_node2vec', 'blogcatalog3', None, True, C=1e10)


Experiment  BC3_node2vec   blogcatalog3
f1_macro (emb):  0.142211558894
f1_micro (emb):  0.138673390225
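
For reference, norm=True applies sklearn's normalize, which rescales each embedding row to unit L2 norm before the classifier is trained; a quick illustration:

import numpy as np
from sklearn.preprocessing import normalize

emb = np.array([[3.0, 4.0], [1.0, 0.0]])
print(normalize(emb))  # each row divided by its L2 norm
# [[ 0.6  0.8]
#  [ 1.   0. ]]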

In [8]:
# C plays a big role in the f1 score; a larger C is better (more overfitting)
_ = lg('BC3_node2vec', 'blogcatalog3', None, C=1e10)


Experiment  BC3_node2vec   blogcatalog3
f1_macro (emb):  0.136424362427
f1_micro (emb):  0.127424359969

In [7]:
# C plays a big role in the f1 score; a larger C is better (more overfitting) but takes longer to train.
_ = lg('BC3_node2vec', 'blogcatalog3', None, C=1e15)


Experiment  BC3_node2vec   blogcatalog3
f1_macro (emb):  0.139981239597
f1_micro (emb):  0.12577579519
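
Instead of re-running the cell by hand for each C, the comparison above could be written as a loop; a minimal sketch over the values tried in these cells:

# Hypothetical sweep over the inverse regularization strength C for node2vec.
for C in (1e9, 1e10, 1e15):
    _ = lg('BC3_node2vec', 'blogcatalog3', None, C=C)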

In [59]:
# Increase ic for node2vec as well; no significant improvement.
_ = lg('BC3_node2vec', 'blogcatalog3', None, C=1e15, ic=20)


Experiment  BC3_node2vec   blogcatalog3
f1_macro (emb):  0.132168967917
f1_micro (emb):  0.12810318076

In [17]:
_ = lg('BC3_deepwalk', 'blogcatalog3')


Experiment  BC3_deepwalk   blogcatalog3
f1_macro (emb):  0.159558442833
f1_micro (emb):  0.184154383243

In [18]:
_ = lg('BC3_deepwalk', 'blogcatalog3', C=1e10)


Experiment  BC3_deepwalk   blogcatalog3
f1_macro (emb):  0.173899261504
f1_micro (emb):  0.187257564003

In [19]:
_ = lg('BC3_deepwalk', 'blogcatalog3', C=1e15)


Experiment  BC3_deepwalk   blogcatalog3
f1_macro (emb):  0.178525504103
f1_micro (emb):  0.188712179984

In [20]:
_ = lg('BC3_deepwalk', 'blogcatalog3', norm=True, C=1e15)


Experiment  BC3_deepwalk   blogcatalog3
f1_macro (emb):  0.179359606732
f1_micro (emb):  0.185512024825

In [21]:
_ = lg('BC3_deepwalk', 'blogcatalog3', norm=True, C=1e20)


Experiment  BC3_deepwalk   blogcatalog3
f1_macro (emb):  0.173694083412
f1_micro (emb):  0.185512024825

In [23]:
# Test increased intercept scaling (less regularization of the intercept) = 1e10; presumably set inside lg at the time, since later cells pass it as ic.
_ = lg('BC3_deepwalk', 'blogcatalog3', norm=True, C=1e10)


Experiment  BC3_deepwalk   blogcatalog3
f1_macro (emb):  0.00484540632064
f1_micro (emb):  0.104344453064
/home/hoangnt/anaconda3/lib/python3.5/site-packages/sklearn/metrics/classification.py:1074: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)

In [25]:
# Test increased intercept scaling (less regularization of the intercept) = 1e5
_ = lg('BC3_deepwalk', 'blogcatalog3', norm=True, C=1e10)


Experiment  BC3_deepwalk   blogcatalog3
f1_macro (emb):  0.00484540632064
f1_micro (emb):  0.104344453064
/home/hoangnt/anaconda3/lib/python3.5/site-packages/sklearn/metrics/classification.py:1074: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)

In [27]:
# Test increased intercept scaling (less regularization of the intercept) = 1
_ = lg('BC3_deepwalk', 'blogcatalog3', norm=True, C=1e10)


Experiment  BC3_deepwalk   blogcatalog3
f1_macro (emb):  0.179533339032
f1_micro (emb):  0.184736229635

In [29]:
# Test increased intercept scaling (less regularization of the intercept) = 0.1
_ = lg('BC3_deepwalk', 'blogcatalog3', norm=True, C=1e10)


Experiment  BC3_deepwalk   blogcatalog3
f1_macro (emb):  0.168065879964
f1_micro (emb):  0.18357253685

In [31]:
# Test increased intercept scaling (less regularization of the intercept) = 5
_ = lg('BC3_deepwalk', 'blogcatalog3', norm=True, C=1e10)


Experiment  BC3_deepwalk   blogcatalog3
f1_macro (emb):  0.180549583389
f1_micro (emb):  0.182311869666

In [33]:
# Test increased intercept scaling (less regularization of the intercept) = 100
_ = lg('BC3_deepwalk', 'blogcatalog3', norm=True, C=1e10)


Experiment  BC3_deepwalk   blogcatalog3
f1_macro (emb):  0.183072014513
f1_micro (emb):  0.190166795966

In [35]:
# Test increased intercept scaling (less regularization of the intercept) = 1000
_ = lg('BC3_deepwalk', 'blogcatalog3', norm=True, C=1e10)


Experiment  BC3_deepwalk   blogcatalog3
f1_macro (emb):  0.16228669375
f1_micro (emb):  0.170868890613
/home/hoangnt/anaconda3/lib/python3.5/site-packages/sklearn/metrics/classification.py:1074: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)

In [37]:
# Test increased intercept scaling (less regularization of the intercept): ic = 500
_ = lg('BC3_deepwalk', 'blogcatalog3', norm=True, C=1e10, ic=500)


Experiment  BC3_deepwalk   blogcatalog3
f1_macro (emb):  0.182358768162
f1_micro (emb):  0.183184639255

In [38]:
# Test increased intercept scaling (less regularization of the intercept): ic = 200
_ = lg('BC3_deepwalk', 'blogcatalog3', norm=True, C=1e10, ic=200)


Experiment  BC3_deepwalk   blogcatalog3
f1_macro (emb):  0.19052928339
f1_micro (emb):  0.188906128782

In [39]:
# Test increased intercept scaling (less regularization of the intercept): ic = 300
_ = lg('BC3_deepwalk', 'blogcatalog3', norm=True, C=1e10, ic=300)


Experiment  BC3_deepwalk   blogcatalog3
f1_macro (emb):  0.178623753489
f1_micro (emb):  0.185415050427

In [40]:
# Test increased intercept scaling (less regularization of the intercept): ic = 250
_ = lg('BC3_deepwalk', 'blogcatalog3', norm=True, C=1e10, ic=250)


Experiment  BC3_deepwalk   blogcatalog3
f1_macro (emb):  0.186272138262
f1_micro (emb):  0.18589992242
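
With ic exposed as a keyword, the intercept-scaling sweep above can be written as a single loop; a minimal sketch over the values tried so far:

# Hypothetical sweep over intercept_scaling; values match the manual runs above.
for ic in (0.1, 1, 5, 100, 200, 250, 300, 500, 1000):
    _ = lg('BC3_deepwalk', 'blogcatalog3', norm=True, C=1e10, ic=ic)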

In [41]:
# Test increased intercept scaling (less regularization of the intercept): ic = 200
_ = lg('BC3012', 'blogcatalog3', norm=True, C=1e10, ic=200)


Experiment  BC3012   blogcatalog3
f1_macro (emb):  0.209849459982
f1_micro (emb):  0.239429790535

In [42]:
# Test increased intercept scaling (less regularization of the intercept): ic = 200
_ = lg('BC3012', 'blogcatalog3', index=[0,1], norm=True, C=1e10, ic=200)


Experiment  BC3012   blogcatalog3
f1_macro (emb):  0.232876907793
f1_micro (emb):  0.27812257564

In [44]:
# Test increased intercept scaling (less regularization of the intercept): ic = 220
_ = lg('BC3012', 'blogcatalog3', index=[0,1], norm=True, C=1e10, ic=220)


Experiment  BC3012   blogcatalog3
f1_macro (emb):  0.233082461595
f1_micro (emb):  0.27967416602

In [46]:
# Note: this is still a random walk.
_ = lg('BC3025', 'blogcatalog3', index=[0,1], norm=True, C=1e10, ic=220)


Experiment  BC3025   blogcatalog3
f1_macro (emb):  0.246718317201
f1_micro (emb):  0.282971295578

In [47]:
# Check the result of experiment 24, which uses a much smaller num_walk than experiment 25.
_ = lg('BC3024', 'blogcatalog3', index=[0,1], norm=True, C=1e10, ic=220)


Experiment  BC3024   blogcatalog3
f1_macro (emb):  0.165170496715
f1_micro (emb):  0.175426687355

In [61]:
for _ in range(10):
    lg('BC3025', 'blogcatalog3', index=[0,1], norm=True, C=1e15, ic=230)


Experiment  BC3025   blogcatalog3
f1_macro (emb):  0.231367291946
f1_micro (emb):  0.274049650892
Experiment  BC3025   blogcatalog3
f1_macro (emb):  0.240162489034
f1_micro (emb):  0.279480217223
Experiment  BC3025   blogcatalog3
f1_macro (emb):  0.234634939919
f1_micro (emb):  0.276280062064
Experiment  BC3025   blogcatalog3
f1_macro (emb):  0.257472767209
f1_micro (emb):  0.286462373933
Experiment  BC3025   blogcatalog3
f1_macro (emb):  0.243036177004
f1_micro (emb):  0.278219550039
Experiment  BC3025   blogcatalog3
f1_macro (emb):  0.250303983259
f1_micro (emb):  0.283359193173
Experiment  BC3025   blogcatalog3
f1_macro (emb):  0.24255831965
f1_micro (emb):  0.279286268425
Experiment  BC3025   blogcatalog3
f1_macro (emb):  0.242226100945
f1_micro (emb):  0.281710628394
Experiment  BC3025   blogcatalog3
f1_macro (emb):  0.246300652772
f1_micro (emb):  0.284522885958
Experiment  BC3025   blogcatalog3
f1_macro (emb):  0.241641796936
f1_micro (emb):  0.281322730799

In [63]:
# Test whether balancing the class weights gives better results - it does not.
for _ in range(10):
    lg('BC3025', 'blogcatalog3', index=[0,1], norm=True, C=1e15, ic=230)


Experiment  BC3025   blogcatalog3
f1_macro (emb):  0.224380957857
f1_micro (emb):  0.23283553142
Experiment  BC3025   blogcatalog3
f1_macro (emb):  0.223271052248
f1_micro (emb):  0.231380915438
Experiment  BC3025   blogcatalog3
f1_macro (emb):  0.221421182605
f1_micro (emb):  0.229538401862
Experiment  BC3025   blogcatalog3
f1_macro (emb):  0.223983957479
f1_micro (emb):  0.232156710628
Experiment  BC3025   blogcatalog3
f1_macro (emb):  0.230403768585
f1_micro (emb):  0.232738557021
Experiment  BC3025   blogcatalog3
f1_macro (emb):  0.22186299814
f1_micro (emb):  0.227986811482
Experiment  BC3025   blogcatalog3
f1_macro (emb):  0.215577851566
f1_micro (emb):  0.227114041893
Experiment  BC3025   blogcatalog3
f1_macro (emb):  0.21914005075
f1_micro (emb):  0.227889837083
Experiment  BC3025   blogcatalog3
f1_macro (emb):  0.229582394109
f1_micro (emb):  0.234775019395
Experiment  BC3025   blogcatalog3
f1_macro (emb):  0.222870394891
f1_micro (emb):  0.228083785881

In [64]:
# Test whether balancing the class weights gives better results - it does not.
# NOTE: BC3012 uses the motif walk.
for _ in range(10):
    lg('BC3012', 'blogcatalog3', index=[0,1], norm=True, C=1e10, ic=230)


Experiment  BC3012   blogcatalog3
f1_macro (emb):  0.214191739416
f1_micro (emb):  0.22498060512
Experiment  BC3012   blogcatalog3
f1_macro (emb):  0.22357130585
f1_micro (emb):  0.228083785881
Experiment  BC3012   blogcatalog3
f1_macro (emb):  0.218972529614
f1_micro (emb):  0.225853374709
Experiment  BC3012   blogcatalog3
f1_macro (emb):  0.224612124085
f1_micro (emb):  0.231380915438
Experiment  BC3012   blogcatalog3
f1_macro (emb):  0.21616459147
f1_micro (emb):  0.227211016292
Experiment  BC3012   blogcatalog3
f1_macro (emb):  0.215718913787
f1_micro (emb):  0.227695888285
Experiment  BC3012   blogcatalog3
f1_macro (emb):  0.210375258756
f1_micro (emb):  0.219743987587
Experiment  BC3012   blogcatalog3
f1_macro (emb):  0.214906468457
f1_micro (emb):  0.224495733126
Experiment  BC3012   blogcatalog3
f1_macro (emb):  0.212082320573
f1_micro (emb):  0.228471683476
Experiment  BC3012   blogcatalog3
f1_macro (emb):  0.220601710571
f1_micro (emb):  0.232253685027
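
For reference, the two 'balanced' runs above presumably used a classifier configured along these lines inside lg (a sketch; the lg shown at In [65] is the version after the balancing was removed):

from sklearn.linear_model import LogisticRegression

# Hypothetical balanced variant of the classifier inside lg.
predictor = LogisticRegression(C=1e10, max_iter=100, n_jobs=-1,
                               intercept_scaling=230,
                               class_weight='balanced')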

In [66]:
# Remove class weight balancing
for _ in range(10):
    lg('BC3012', 'blogcatalog3', index=[0,1], norm=True, C=1e10, ic=230)


Experiment  BC3012   blogcatalog3
f1_macro (emb):  0.238939110385
f1_micro (emb):  0.277249806051
Experiment  BC3012   blogcatalog3
f1_macro (emb):  0.243341618848
f1_micro (emb):  0.27812257564
Experiment  BC3012   blogcatalog3
f1_macro (emb):  0.238332321528
f1_micro (emb):  0.278801396431
Experiment  BC3012   blogcatalog3
f1_macro (emb):  0.235606345264
f1_micro (emb):  0.276474010861
Experiment  BC3012   blogcatalog3
f1_macro (emb):  0.23754928472
f1_micro (emb):  0.275504266874
Experiment  BC3012   blogcatalog3
f1_macro (emb):  0.235305996745
f1_micro (emb):  0.277152831652
Experiment  BC3012   blogcatalog3
f1_macro (emb):  0.233681557652
f1_micro (emb):  0.278704422033
Experiment  BC3012   blogcatalog3
f1_macro (emb):  0.230810101649
f1_micro (emb):  0.273952676493
Experiment  BC3012   blogcatalog3
f1_macro (emb):  0.240172422597
f1_micro (emb):  0.2812257564
Experiment  BC3012   blogcatalog3
f1_macro (emb):  0.234754882463
f1_micro (emb):  0.278607447634
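
Since lg returns y_true and y_pred, the repeated runs above could be summarized as a mean and standard deviation rather than compared by eye; a minimal sketch:

# Hypothetical aggregation of repeated runs into mean +/- std F1 scores.
macros, micros = [], []
for _ in range(10):
    y_true, y_pred, *_ = lg('BC3012', 'blogcatalog3', index=[0, 1],
                            norm=True, C=1e10, ic=230)
    macros.append(f1_score(y_true, y_pred, average='macro'))
    micros.append(f1_score(y_true, y_pred, average='micro'))
print('f1_macro: %.4f +/- %.4f' % (np.mean(macros), np.std(macros)))
print('f1_micro: %.4f +/- %.4f' % (np.mean(micros), np.std(micros)))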

In [ ]: