In [1]:
import sys
sys.path.append('../src/mane/prototype/')
import numpy as np
import graph as g
import pickle as p

from sklearn.preprocessing import normalize, scale, MultiLabelBinarizer
from sklearn.metrics import f1_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV

Experiment with the random-walk embedding results (BlogCatalog3 multi-label node classification)


In [2]:
# Load weight
# Load the trained embedding weight matrices (pickled object; w is indexed
# as w[0] and w[1] below, so it holds at least two matrices).
# NOTE(review): pickle.load can execute arbitrary code — only use on trusted,
# locally produced files.
with open('../src/mane/prototype/embeddings/BC3047.weights', 'rb') as f:
    w = p.load(f)
# Load the BlogCatalog3 graph together with its community (label) file.
bc = g.graph_from_pickle('../src/mane/data/blogcatalog3.graph', 
                         '../src/mane/data/blogcatalog3.community')

In [3]:
# Node embedding = element-wise mean of the two learned weight matrices,
# then L2-normalise each row so every node vector has unit norm.
avg_weights = (w[0] + w[1]) / 2
emb = normalize(avg_weights)

In [4]:
# Split node ids and their label lists into train and test sets.
# 0.5 is presumably the training fraction — TODO confirm against
# graph.get_ids_labels; not visible from this notebook.
x_train, yl_train, x_test, yl_test = bc.get_ids_labels(0.5)

In [5]:
# Multi-label setup: wrap logistic regression in a one-vs-rest scheme,
# training one independent binary classifier per community label.
base_estimator = LogisticRegression()
lg = OneVsRestClassifier(base_estimator)

In [6]:
# Training design matrix: select the embedding rows for the training node ids.
# Fancy indexing replaces the original per-element list comprehension and
# returns a 2-D ndarray in one vectorized step (assumes x_train is a sequence
# of integer row ids — the original comprehension made the same assumption).
X_train = emb[x_train]
# Binarize the label lists into an (n_train, 39) indicator matrix;
# classes are the 1-based community ids 1..39.
Y_train = MultiLabelBinarizer(classes=range(1,40)).fit_transform(yl_train)

In [7]:
# Fit one binary classifier per label on the training embeddings.
lg.fit(X_train, Y_train)


Out[7]:
OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          n_jobs=1)

In [8]:
# Sanity check: probabilities for a single node embedding — expect one score
# per class, i.e. shape (1, 39).
lg.predict_proba(emb[9566].reshape(1,-1)).shape


Out[8]:
(1, 39)

In [9]:
# Test design matrix: select embedding rows for the test node ids via fancy
# indexing instead of a per-element list comprehension (assumes x_test is a
# sequence of integer row ids, as the original comprehension did).
X_test = emb[x_test]
# Binarize the true test label lists into an (n_test, 39) indicator matrix
# over the 1-based community ids 1..39.
Y_test = MultiLabelBinarizer(classes=range(1,40)).fit_transform(yl_test)

In [10]:
# Per-class probabilities for every test node.
# The original wrapped X_test in a no-op list comprehension
# ([i for i in X_test]), which only copied the list — pass it directly.
pred = lg.predict_proba(X_test)

In [11]:
# Inspect the binarized ground-truth test label matrix.
Y_test


Out[11]:
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0]])

Get the top-k predicted classes per node, where k is that node's true number of labels


In [35]:
# 0-based indices of the 3 highest-probability classes for the first test
# node (argsort is ascending, so the last element is the most probable).
pred[0].argsort()[-3:]


Out[35]:
array([5, 6, 4])

In [12]:
# One prediction row per test node.
len(pred)


Out[12]:
5156

In [13]:
# One true label list per test node — must equal len(pred).
len(yl_test)


Out[13]:
5156

In [14]:
# Per-class probabilities (39 values) for the first test node.
pred[0]


Out[14]:
array([ 0.00398161,  0.0357896 ,  0.08584289,  0.02449758,  0.15023545,
        0.48816811,  0.02156142,  0.05960207,  0.06594667,  0.00866716,
        0.05512942,  0.0033318 ,  0.04651478,  0.03337085,  0.01032531,
        0.02674524,  0.04656247,  0.02798097,  0.08513218,  0.00963393,
        0.0087078 ,  0.00683082,  0.0105208 ,  0.25379662,  0.01822022,
        0.01270774,  0.00683734,  0.02172979,  0.01812818,  0.02239905,
        0.01263985,  0.02685956,  0.00782481,  0.00602085,  0.00532535,
        0.01669904,  0.00731791,  0.00314692,  0.0011479 ])

In [15]:
# True label list of the first test node.
yl_test[0]


Out[15]:
[6]

In [16]:
# 0-based index of the single most probable class for the first test node.
pred[0].argsort()[-1:]


Out[16]:
array([5])

In [17]:
# Will hold, for each test node, the indices of its top-k predicted classes.
num_pred = []

In [18]:
# For each test node keep the k most probable classes, where k is that node's
# true number of labels. argsort is ascending, so the last k positions are the
# top-k. These are 0-based class positions; a later cell shifts them to the
# 1-based label ids.
for node_idx, class_probs in enumerate(pred):
    n_labels = len(yl_test[node_idx])
    num_pred.append(class_probs.argsort()[-n_labels:])

In [19]:
# One top-k prediction entry per test node.
len(num_pred)


Out[19]:
5156

In [20]:
# Top-1 prediction for node 0 (still a 0-based class position).
num_pred[0]


Out[20]:
array([5])

In [21]:
# Top-2 predictions for node 1, in ascending probability order.
num_pred[1]


Out[21]:
array([19,  6])

In [22]:
# NOTE(review): off-by-one here — num_pred holds 0-based argsort positions,
# but classes=range(1,40) expects 1-based label ids, so every prediction is
# mapped one class too low (compare Out[23] vs Out[24]). The +1 shift and
# re-binarization in later cells correct this; this cell could be removed.
Y_pred = MultiLabelBinarizer(classes=range(1,40)).fit_transform(num_pred)

In [23]:
# Binarized prediction for node 0 — note the 1 is one position left of the
# ground truth shown in the next cell.
Y_pred[0]


Out[23]:
array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [24]:
# Ground truth for node 0, for comparison.
Y_test[0]


Out[24]:
array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [25]:
# argsort positions are 0-based class indices; shift by +1 so they match the
# 1-based label ids used with MultiLabelBinarizer(classes=range(1, 40)).
for node_idx, class_ids in enumerate(num_pred):
    num_pred[node_idx] = [class_id + 1 for class_id in class_ids]

In [26]:
# After the shift, node 0's prediction is a 1-based label id.
num_pred[0]


Out[26]:
[6]

In [27]:
# True label list of node 0 — now matches the shifted prediction.
yl_test[0]


Out[27]:
[6]

In [28]:
# Re-binarize using the corrected 1-based label ids.
Y_pred = MultiLabelBinarizer(classes=range(1,40)).fit_transform(num_pred)

In [29]:
# Binarized prediction for node 0 — now aligned with the ground truth.
Y_pred[0]


Out[29]:
array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [30]:
# Ground truth for node 0 — identical to the corrected prediction above.
Y_test[0]


Out[30]:
array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [31]:
# Macro-F1: unweighted mean of per-class F1 scores. Classes that are never
# predicted get F1 = 0 (hence the UndefinedMetricWarning), which pulls the
# macro average down relative to micro-F1.
f1_score(y_pred=Y_pred, y_true=Y_test, average='macro')


/home/hoangnt/anaconda3/lib/python3.5/site-packages/sklearn/metrics/classification.py:1074: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
Out[31]:
0.14045250348537816

In [32]:
# Micro-F1: F1 computed globally over all (node, class) decisions.
f1_score(y_pred=Y_pred, y_true=Y_test, average='micro')


Out[32]:
0.29844588089671298

In [33]:
# NOTE(review): exact duplicate of the earlier macro-F1 cell (same inputs,
# same result) — safe to delete.
f1_score(y_pred=Y_pred, y_true=Y_test, average='macro')


/home/hoangnt/anaconda3/lib/python3.5/site-packages/sklearn/metrics/classification.py:1074: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
Out[33]:
0.14045250348537816

In [ ]: