In [1]:
import numpy as np
import time
The CiteSeer dataset is a set of academic papers and the citation links between them. The goal is to predict the topic of each paper.
The following cells produce the feature matrix `features`, the vectors `ids` and `labels`, and (in the cell after next) the adjacency matrix `A`.
In [2]:
from util.DatasetUtil import LINQ
from sklearn import preprocessing

# Parse the CiteSeer content file into paper ids, features and raw labels.
# The context manager closes the handle even if LINQ.readContent raises
# part-way through (the explicit open()/close() pair leaked it in that case).
with open('data/citeseer/citeseer.content', 'r') as f:
    [ids, features, labels] = LINQ.readContent(f)
num_nodes = len(labels)

# transform labels into consecutive integers starting at 0
le = preprocessing.LabelEncoder()
le.fit(labels)
labels = le.transform(labels)
Read adjacency matrix. Note that a few paper-ids appear in the adjacency list, but do not have a content entry.
In [3]:
# Build the adjacency matrix from the citation list; `ids` and `num_nodes`
# come from the content file read earlier. Using `with` guarantees the file
# is closed even if LINQ.readAdjacencyMatrix raises.
with open('data/citeseer/citeseer.cites', 'r') as f:
    A = LINQ.readAdjacencyMatrix(f, num_nodes, ids)
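The following variables hold the result of a random train/test split. Note that `X_train` and `X_test` contain node indices (not feature rows), while `y_train` and `y_test` contain the corresponding labels.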
In [4]:
from sklearn import cross_validation

# NOTE(review): sklearn.cross_validation only exists in old scikit-learn
# releases; newer ones provide the same helper in sklearn.model_selection.
test_size = 0.3  # fraction of nodes held out as unlabeled test data

# Split node *indices* rather than feature rows, so the same split can be
# used to slice kernels and the adjacency matrix later on.
node_indices = np.arange(num_nodes)
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    node_indices, labels, test_size=test_size, random_state=0)
Number of nodes and features
In [5]:
# Shape of the feature matrix returned by LINQ.readContent.
print(features.shape)
Number of labels
In [6]:
# Labels were encoded as consecutive integers starting at 0, so the largest
# code plus one is the number of distinct classes.
num_labels = np.max(labels) + 1
print(num_labels)
Proportion of labels
In [7]:
# Count how often each class occurs (float counters so the division below
# is a true division under Python 2), then normalise by the node count.
r = [0.0] * num_labels
for l in labels:
    r[l] += 1
r = [count / num_nodes for count in r]
print(r)
Number of Links
In [8]:
# Count the non-zero entries of A and halve them (integer division).
# NOTE(review): halving presumably accounts for a symmetric A storing each
# link twice -- confirm against LINQ.readAdjacencyMatrix.
print((A != 0).sum(0).sum() // 2)
Create linear bag-of-words kernel BOWK (content-only) and measure duration in seconds.
In [9]:
from util import Kernel

# Build the content-only linear kernel and report how long it took.
start = time.clock()
BOWK = Kernel.LinearKernel(features)
elapsed = time.clock() - start
print(elapsed)
Train an SVM on the precomputed bag-of-words kernel, restricted to the training nodes.
In [10]:
from sklearn import svm

# Train-vs-train sub-kernel: select the training rows, then the training columns.
BOWK_train = BOWK[X_train, :][:, X_train]

# The kernel is supplied directly, hence kernel='precomputed'.
svc_params = dict(kernel='precomputed', C=1, verbose=True)
BOWK_model = svm.SVC(**svc_params).fit(BOWK_train, y_train)
Test accuracy, i.e. the fraction of correctly predicted labels in the test set.
In [11]:
from sklearn.metrics import accuracy_score

# Test-vs-train sub-kernel: rows are test nodes, columns are training nodes.
BOWK_test = BOWK[X_test, :][:, X_train]

# accuracy_score is documented as (y_true, y_pred). The arguments used to be
# passed the other way round; accuracy happens to be symmetric, but the
# documented order keeps the call correct if the metric is ever swapped.
score = accuracy_score(y_test, BOWK_model.predict(BOWK_test))
print(str(score))
Compute a CWK using 10-hop random walks and an absorbing probability of alpha=0.5, and measure the duration in seconds.
In [12]:
import CWKernel

# Label-based CWK: only the training labels are passed in, together with the
# number of classes, 10 hops and alpha=0.5. Prints the elapsed time.
start = time.clock()
CWKLabel_train, CWKLabel_test = CWKernel.CWKernel(
    A, X_train, labels[X_train], X_test, max(labels) + 1, 10, alpha=0.5)
elapsed = time.clock() - start
print(elapsed)
In [13]:
# SVM on the precomputed CWK training kernel.
svc_params = dict(kernel='precomputed', C=0.1, verbose=True, shrinking=False)
CWK_model = svm.SVC(**svc_params).fit(CWKLabel_train, y_train)
In [14]:
# accuracy_score is documented as (y_true, y_pred); pass the ground truth
# first (the arguments were previously reversed, which only works because
# accuracy is symmetric).
score = accuracy_score(y_test, CWK_model.predict(CWKLabel_test))
print(str(score))
For FCWK additionally supply a node kernel, which in our case is simply the linear bag of words kernel we created earlier.
In [15]:
# Feature-based CWK: same 10 hops and alpha=0.5, with the bag-of-words kernel
# as node kernel. Prints the elapsed time.
start = time.clock()
CWK_train, CWK_test = CWKernel.CWFeatureKernel(
    A, X_train, X_test, 10, node_kernel=BOWK, alpha=0.5)
elapsed = time.clock() - start
print(elapsed)
In [16]:
# SVM on the precomputed FCWK training kernel (rebinds CWK_model).
svc_params = dict(kernel='precomputed', C=0.1, verbose=True, shrinking=False)
CWK_model = svm.SVC(**svc_params).fit(CWK_train, y_train)
In [17]:
# accuracy_score is documented as (y_true, y_pred); pass the ground truth
# first (the arguments were previously reversed, which only works because
# accuracy is symmetric).
score = accuracy_score(y_test, CWK_model.predict(CWK_test))
print(str(score))
Cross Validate FCWK on Citeseer
In [18]:
def cross_validate(apply_classifier, labels, n_folds=3, random_state=None):
    """Run shuffled k-fold cross-validation of a precomputed-kernel SVM.

    For every fold the train/test kernels are obtained from
    ``apply_classifier``, an SVC (C=0.1) is fitted on the training kernel and
    its accuracy on the test kernel is printed. After the last fold the mean
    accuracy is printed.

    Parameters
    ----------
    apply_classifier : callable
        Called as ``apply_classifier(train, test)`` with the index arrays of
        the current fold; must return a pair ``(train_kernel, test_kernel)``.
    labels : sequence
        One label per node. Converted to a numpy array so that plain lists
        can be indexed by the fold index arrays as well.
    n_folds : int
        Number of folds.
    random_state : optional
        Forwarded to ``KFold`` to make the shuffle reproducible. The default
        ``None`` keeps the previous, unseeded behaviour.

    Returns
    -------
    None. Results are printed only.
    """
    y = np.asarray(labels)
    kf = cross_validation.KFold(len(y), n_folds, shuffle=True,
                                random_state=random_state)
    svc_params = dict(kernel='precomputed', C=0.1, verbose=True, shrinking=False)
    scores = []
    for train, test in kf:
        CWK_train, CWK_test = apply_classifier(train, test)
        CWK_model = svm.SVC(**svc_params).fit(CWK_train, y[train])
        # accuracy_score is documented as (y_true, y_pred).
        score = accuracy_score(y[test], CWK_model.predict(CWK_test))
        print("finished fold: " + str(score))
        scores.append(score)
    print(np.mean(scores))
In [19]:
def apply_FCWK_factory(A, node_kernel):
    """Create the per-fold kernel callback expected by ``cross_validate``.

    Parameters
    ----------
    A : adjacency matrix, handed unchanged to ``CWKernel.CWFeatureKernel``.
    node_kernel : node kernel handed to ``CWKernel.CWFeatureKernel``.
        Previously this argument was ignored and the notebook-global ``BOWK``
        was used instead, which silently picked up whatever kernel was
        current when a fold was evaluated.

    Returns
    -------
    callable
        ``apply_FCWK(train_ind, test_ind)`` returning the result of
        ``CWKernel.CWFeatureKernel`` for a 10-hop walk with ``alpha=0.5``.
    """
    # NOTE(review): the node count is still read from the notebook-global
    # `labels` at factory-call time, so call this after loading the dataset
    # that `A` belongs to.
    X = np.arange(len(labels))

    def apply_FCWK(train_ind, test_ind):
        # Map fold positions to node indices before calling the kernel.
        return CWKernel.CWFeatureKernel(A, X[train_ind], X[test_ind], 10,
                                        node_kernel=node_kernel,
                                        alpha=0.5)
    return apply_FCWK
In [20]:
# 3-fold cross-validation of FCWK, using the current A, BOWK and labels.
fcwk_fold_kernels = apply_FCWK_factory(A, BOWK)
cross_validate(fcwk_fold_kernels, labels, n_folds=3)
In [21]:
# Load the Cora content file; this rebinds ids, features, labels and
# num_nodes, replacing the CiteSeer data. Using `with` closes the handle even
# if LINQ.readContent raises.
with open('data/cora/cora.content', 'r') as f:
    [ids, features, labels] = LINQ.readContent(f)
num_nodes = len(labels)

# Encode the raw labels as integers, as done for CiteSeer.
le = preprocessing.LabelEncoder()
le.fit(labels)
labels = le.transform(labels)
In [22]:
# Build the Cora adjacency matrix (rebinds A). Using `with` guarantees the
# file is closed even if LINQ.readAdjacencyMatrix raises.
with open('data/cora/cora.cites', 'r') as f:
    A = LINQ.readAdjacencyMatrix(f, num_nodes, ids)
In [23]:
# Split node indices (not feature rows) 50/50 with a fixed seed.
node_indices = np.arange(num_nodes)
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    node_indices, labels, test_size=0.5, random_state=0)
Number of nodes and features
In [24]:
# Display the shape of the Cora feature matrix as the cell's output.
features.shape
Out[24]:
Number of classes
In [25]:
# Labels are integer codes from the LabelEncoder; the largest code plus one
# is used as the number of classes.
num_labels = np.max(labels) + 1
print(num_labels)
Proportion of labels
In [26]:
# Per-class counts (kept as floats so the division is a true division under
# Python 2), normalised by the number of nodes.
r = [0.0] * num_labels
for l in labels:
    r[l] += 1
r = [count / num_nodes for count in r]
print(r)
Number of links
In [27]:
# Count the non-zero entries of A and halve them (integer division).
# NOTE(review): halving presumably accounts for a symmetric A storing each
# link twice -- confirm against LINQ.readAdjacencyMatrix.
print((A != 0).sum(0).sum() // 2)
In [28]:
# Rebuild the linear bag-of-words kernel from the Cora features; this rebinds
# BOWK, replacing the CiteSeer kernel computed earlier.
BOWK = Kernel.LinearKernel(features)
In [29]:
# Re-import the CWKernel module (Python 2 builtin reload) before use.
# NOTE(review): this looks like a development leftover -- confirm whether it
# is still needed.
reload(CWKernel)

# FCWK on Cora: 10 hops, alpha=0.5, bag-of-words node kernel. Prints the
# elapsed time.
start = time.clock()
CWK_train, CWK_test = CWKernel.CWFeatureKernel(
    A, X_train, X_test, 10, node_kernel=BOWK, alpha=0.5)
elapsed = time.clock() - start
print(elapsed)
In [30]:
# SVM on the precomputed FCWK training kernel for Cora (C=1 here, unlike the
# C=0.1 used for CiteSeer).
svc_params = dict(kernel='precomputed', C=1, verbose=True, shrinking=False)
CWK_model = svm.SVC(**svc_params).fit(CWK_train, y_train)
In [31]:
# accuracy_score is documented as (y_true, y_pred); pass the ground truth
# first (the arguments were previously reversed, which only works because
# accuracy is symmetric).
score = accuracy_score(y_test, CWK_model.predict(CWK_test))
print(str(score))
Cross Validate FCWK on Cora
In [32]:
# 3-fold cross-validation of FCWK, using the current A, BOWK and labels.
fcwk_fold_kernels = apply_FCWK_factory(A, BOWK)
cross_validate(fcwk_fold_kernels, labels, n_folds=3)