In [1]:
import networkx as nx
import numpy as np
import pickle as p
from matplotlib import pyplot as plt
%matplotlib inline
# Directory holding the raw CiteSeer files opened further down:
#   'citeseer.cites'   - the citation edge list
#   'citeseer.content' - one line per paper: id, binary word flags, string label
data_loc = './../data/raw/citeseer/' # 'citeseer.cites', 'citeseer.content'
CiteSeer is a directed citation network of 3327 papers, where each link is a citation (a citing paper points to the cited paper; the order in the edge list file is reversed). Each paper has one label (6 label types in total). The labels are: Agents, AI, DB, IR, ML, and HCI.
Also, each paper has a binary feature vector of 3703 elements (word-existence indicators) describing the content of the node. The last field after each feature vector is the string label of the paper (e.g. AI or DB). Note that there are 15 nodes without a label or feature vector.
In [2]:
# Open the citation edge list for reading. The handle is deliberately kept open:
# later cells read from it, rewind it with seek(0), and finally close() it.
graph_file = open(data_loc+'citeseer.cites', 'r')
Print the first 5 lines of the graph file:
In [3]:
# Peek at the first five raw lines; repr() makes the separators and the
# trailing newline characters visible.
for _line_no in range(5):
    raw_line = graph_file.readline()
    print(repr(raw_line))
Since the node identifiers are strings, I would like to convert them to consecutive integers ranging from 0 to 3326 (one per node, 3327 nodes in total).
In [4]:
graph_file.seek(0)  # rewind: the earlier readline() calls moved the file cursor

iid = {}  # Integer id conversion dict: original string id -> dense integer id
citeseer_edgelist = []
# Stream the file line by line instead of materialising it with readlines().
for line in graph_file:
    fields = line.split()
    if not fields:
        continue  # a blank (e.g. trailing) line would otherwise break the unpacking
    i, j = fields
    # Ids are handed out in order of first appearance (i before j);
    # len(iid) is always the next unused integer, and setdefault leaves
    # already-known ids untouched.
    first = iid.setdefault(i, len(iid))
    second = iid.setdefault(j, len(iid))
    citeseer_edgelist.append((second, first)) # Correct direction of links
idx = len(iid)  # next free integer id, kept for any code that still reads `idx`
In [5]:
# One (source, target) pair was appended per parsed line of the edge file,
# so this counts lines, including any repeated pairs.
print("Number of edges:", len(citeseer_edgelist))
In [6]:
# Build the directed graph from the integer edge list; nodes are created
# implicitly from the edge endpoints.
citeseer = nx.DiGraph(citeseer_edgelist)
In [7]:
# len() of a NetworkX graph is its number of nodes.
print("Number of nodes:", len(citeseer))
In [8]:
# Number of distinct string ids seen in the edge file; compare it with the
# node count of the graph.
len(iid)
Out[8]:
In [9]:
# Done with the edge-list file: release the handle opened earlier.
graph_file.close()
In [15]:
# Prepare data arrays and labels lookup table
n_features = 3703  # length of the binary word-indicator vector of each paper
labels = {'Agents': 0, 'AI': 1, 'DB': 2, 'IR': 3, 'ML': 4, 'HCI': 5}

# Start from well-defined defaults rather than uninitialised memory
# (np.ndarray(shape=...) leaves arbitrary values in the buffer): label -1 and
# an all-zero feature row mark nodes that never show up in the content file.
citeseer_labels = np.full(len(iid), -1, dtype=int)
citeseer_features = np.zeros((len(iid), n_features), dtype=int)

# Starts as every graph node; each paper found in the content file is removed,
# so what remains are the nodes without label/features.
no_labels = set(citeseer.nodes())

# Read data: each line is "<paper id> <0/1 word flags ...> <label>"
with open(data_loc+'citeseer.content', 'r') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines
        oid, *data, label = line.split()
        node = iid[oid]
        citeseer_labels[node] = labels[label]
        # Raises ValueError if the line does not carry exactly n_features flags.
        citeseer_features[node, :] = list(map(int, data))
        no_labels.remove(node)  # KeyError here means a duplicated paper id

# Validation: re-read the file and compare it element-wise with the arrays.
with open(data_loc+'citeseer.content', 'r') as f:
    for line in f:
        if not line.strip():
            continue
        oid, *data, label = line.split()
        node = iid[oid]
        assert citeseer_labels[node] == labels[label]
        assert citeseer_labels[node] < len(labels)
        # Full row comparison is stricter than comparing only the row sums.
        assert np.array_equal(citeseer_features[node], list(map(int, data)))
print("Validation for `citeseer_labels` and `citeseer_features` passes.")
In [16]:
# Report the dimensions of the dense feature matrix and of the label vector.
for caption, array in (("Feature shape: ", citeseer_features),
                       ("Label shape: ", citeseer_labels)):
    print(caption, array.shape)
Convert the features to a sparse (CSR) format and dump the data as a pickle file:
In [17]:
from scipy.sparse import csr_matrix

# Store the dense 0/1 feature matrix in compressed-sparse-row form.
citeseer_csr_features = csr_matrix(citeseer_features)

# Bundle graph, labels and sparse features under the keys used in the dump.
citeseer_dataset = dict(
    NXGraph=citeseer,
    Labels=citeseer_labels,
    CSRFeatures=citeseer_csr_features,
)

# Pickle protocol 2 is requested explicitly for the output file.
with open('./../data/citeseer.data', 'wb') as f:
    p.dump(citeseer_dataset, f, protocol=2)
In [ ]:
# Save the integer-relabelled directed graph as a plain-text edge list.
# NOTE(review): write_edgelist defaults to data=True, so each line is expected
# to carry the edge-attribute dict after the two node ids — confirm that
# downstream readers of 'citeseer.edges' handle that.
nx.write_edgelist(citeseer, path='./../data/citeseer.edges') # delimiter is a white space
In [13]:
# Length of the label array, which was allocated with one slot per entry of `iid`.
len(citeseer_labels)
Out[13]:
In [18]:
# Largest label code stored in the array (the lookup table tops out at 5).
citeseer_labels.max()
Out[18]:
In [20]:
# Distinct label codes present; nodes without content are stored as -1.
np.unique(citeseer_labels)
Out[20]:
In [21]:
# Integer ids of the graph nodes that never appeared in 'citeseer.content'.
no_labels
Out[21]:
In [ ]: