In [12]:
#!/usr/bin/env python
# first, import the modules we need
import numpy as np
import scipy.sparse as _sparse
import time
from sklearn.preprocessing import normalize
import optparse, sys, os, copy
from collections import defaultdict
In [50]:
# this version of PageRank is extremely slow with regard to the
# element-by-element construction of the sparse web-graph matrix
# then define the customized functions
def readWebGraph(matSize, filename=os.path.join("data", "web-Google.dat")):
    mat = _sparse.lil_matrix((matSize, matSize))
    sys.stderr.write('Phase 1 - read web graph\n')
    with open(filename, "r") as f:
        for idx, line in enumerate(f):
            data = line.strip()
            if not data.startswith("#"):
                tmp = data.split('\t')
                col = int(tmp[0])
                row = int(tmp[1])
                # edge col -> row, so column j holds the out-links of node j
                mat[row, col] += 1
            if idx % 10000 == 0:
                sys.stderr.write('.')
    sys.stderr.write('\nPhase 2 - normalize column weight\n')
    # make every column sum to 1 (column-stochastic transition matrix)
    mat = normalize(mat, norm='l1', axis=0)
    # dangling nodes (columns with no out-links) get a uniform distribution
    col_nums = np.where(mat.sum(0) == 0)[1]
    N = mat.shape[0]
    if col_nums.size > 0:
        mat = mat.tolil()
        for col in col_nums:
            mat[:, col] = 1 / float(N)
    return mat.tocsc()
# find the largest node id in the graph (so we know the matrix size)
def readBiggestNodesFromGraph(filename=os.path.join("data", "web-Google.dat")):
    sys.stderr.write("Finding biggest node")
    max_node = 0
    with open(filename, "r") as f:
        for idx, line in enumerate(f):
            data = line.strip()
            if not data.startswith("#"):
                tmp = data.split("\t")
                max_node = max(int(tmp[0]), int(tmp[1]), max_node)
            if idx % 1000000 == 0:
                sys.stderr.write(".")
    return max_node
def pageRank(M_matrix, r_last=0, pTele=0.2, epsilon=10**-10, max_iter=sys.maxint):
    sys.stderr.write("Page ranking.")
    # scale the transition matrix by (1 - pTele) once, outside the loop
    M = M_matrix.multiply(1 - pTele)
    n_iter = 0
    N = M.shape[0]
    while True:
        n_iter += 1
        sys.stderr.write(".")
        if n_iter >= max_iter:
            sys.stderr.write("maximum iteration reached.")
            break
        # power-iteration step: r = (1 - pTele) * M * r_last + pTele / N
        r = M.dot(r_last) + pTele / N
        if np.absolute(r - r_last).sum() <= epsilon:
            break
        r_last = r
    return r
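The per-element lil_matrix updates in readWebGraph are the likely bottleneck noted at the top of this cell. A common alternative, sketched below under the assumption that the input is the same tab-separated edge list, is to collect all edges into plain lists and build a coo_matrix in one shot (duplicate edges are summed automatically); the normalization and dangling-column fix stay the same. readWebGraphFast is a hypothetical name, not part of the original notebook.
In [ ]:
def readWebGraphFast(matSize, filename=os.path.join("data", "web-Google.dat")):
    rows, cols = [], []
    with open(filename, "r") as f:
        for line in f:
            data = line.strip()
            if not data or data.startswith("#"):
                continue
            tmp = data.split("\t")
            # edge tmp[0] -> tmp[1] becomes entry M[tmp[1], tmp[0]], as in readWebGraph
            cols.append(int(tmp[0]))
            rows.append(int(tmp[1]))
    # coo_matrix sums duplicate (row, col) pairs when converted to CSC
    mat = _sparse.coo_matrix((np.ones(len(rows)), (rows, cols)),
                             shape=(matSize, matSize)).tocsc()
    mat = normalize(mat, norm='l1', axis=0)
    # dangling columns (nodes with no out-links) get a uniform distribution
    N = mat.shape[0]
    dangling = np.where(np.asarray(mat.sum(0)).ravel() == 0)[0]
    if dangling.size > 0:
        mat = mat.tolil()
        for col in dangling:
            mat[:, col] = 1 / float(N)
    return mat.tocsc()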
In [54]:
# first, play with a toy data set
big = readBiggestNodesFromGraph(os.path.join("data", "toy.dat"))
tmp = readWebGraph(big + 1, os.path.join("data", "toy.dat"))
# init the rank vector to uniform
r_init = normalize(np.ones((tmp.shape[0], 1)), norm='l1', axis=0)
print tmp.toarray()
print r_init
In [55]:
pRank = pageRank(tmp, r_init, 0.3)
In [56]:
print pRank * 3
print pRank[1] + pRank[2]
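A quick sanity check on the toy result, added here as a sketch: the ranks should form a probability distribution (sum to 1), and the converged vector should approximately satisfy the fixed-point equation r = (1 - pTele) * M * r + pTele / N that pageRank iterates.
In [ ]:
# sanity checks on the toy result (pTele = 0.3, matching the call above)
pTele = 0.3
residual = tmp.multiply(1 - pTele).dot(pRank) + pTele / tmp.shape[0] - pRank
print np.abs(pRank.sum() - 1.0)    # should be ~0: ranks form a distribution
print np.abs(residual).max()       # should be ~epsilon: r is a fixed point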
In [57]:
# Real case: process the full data set
start_time = time.time()
cnt = readBiggestNodesFromGraph()
mid_time = time.time()
elapsed_time1 = mid_time - start_time
print "Finished finding the biggest node - elapsed time: " + str(elapsed_time1)
In [59]:
graph = readWebGraph(cnt + 1)
end_time = time.time()
elapsed_time2 = end_time - mid_time
print "Finished creating the graph - elapsed time: " + str(elapsed_time2)
In [ ]:
r_init = normalize(np.ones((graph.shape[0], 1)), norm='l1', axis=0)
pRank = pageRank(graph, r_init, 0.2, 10**-5, 150)
print "The weight for node 99 is:"
print pRank[100]
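To see which pages dominate, the rank vector can be sorted; a small sketch (not part of the original run) that prints the ten highest-ranked node ids.
In [ ]:
# top 10 node ids by PageRank weight
ranks = np.asarray(pRank).ravel()
for node in np.argsort(ranks)[::-1][:10]:
    print "node %d: %e" % (node, ranks[node])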