In [12]:
#!/usr/bin/env python
#first import useful module
import numpy as np
import scipy.sparse as _sparse
import time
from sklearn.preprocessing import normalize

import optparse, sys, os, copy
from collections import defaultdict

In [50]:
# this version of page rank is extremely slow with regard to the 
# then define customized functions
def readWebGraph(matSize, filename = os.path.join("data", "web-Google.dat")):
    mat = _sparse.lil_matrix((matSize, matSize))
    sys.stderr.write('Phase 1 - read web graph\n')
    with open(filename, "r") as f:
        for idx, line in enumerate(f):
            data =  line.strip()
            if not data.startswith("#"):
                tmp = data.split('\t')
                col = int(tmp[0])
                row = int(tmp[1])
                mat[row, col] += 1;                
            if idx % 10000 == 0:
                sys.stderr.write('.')
                
    sys.stderr.write('\nPhase 2 - normalize column weight\n')
    
    mat = normalize(mat, norm='l1', axis=0)
    col_nums = np.where(mat.sum(0) == 0)[1]

    N =  mat.shape[0]
    if col_nums.size > 0:
         for col in np.nditer(col_nums):
            mat[:, col] = 1 / float(N)
        
    return mat.tocsc()
    

# count the largest graph node number
def readBiggestNodesFromGraph(filename = os.path.join("data", "web-Google.dat")):
    sys.stderr.write("Finding Biggest Node")
    max_node = 0
    with open(filename, "r") as f:
        for idx, line in enumerate(f):
            data =  line.strip()
            if not data.startswith("#"):
                tmp = data.split("\t")
                max_node = max(int(tmp[0]), int(tmp[1]), max_node)
            if idx % 1000000 == 0:
                sys.stderr.write(".")
                
    return max_node

def pageRank(M_matrix, r_last=0, pTele = 0.2, epsilon = 10**-10, max_iter = sys.maxint):
    sys.stderr.write("Page ranking.")
    M = M_matrix.multiply(1 - pTele)
    iter = 0
    N = M.shape[0]
    while True:
        iter += 1
        sys.stderr.write(".")
        if iter >= max_iter:
            sys.stderr.write("maximum iteration reached.")
            break
        
        r = M.dot(r_last) + pTele / N
        
        if np.absolute( r - r_last).sum() <= epsilon:
            break
        
        r_last = r
        
    return r

In [54]:
# first play with toy set
big = readBiggestNodesFromGraph(os.path.join("data", "toy.dat"))
tmp = readWebGraph(big + 1, os.path.join("data", "toy.dat"))
# init the rank vector to uniform 
r_init = normalize(np.ones((tmp.shape[0], 1)), norm='l1', axis=0)

print tmp.toarray()
print r_init


[[ 0.   0.   0. ]
 [ 0.5  0.   0. ]
 [ 0.5  1.   1. ]]
[[ 0.33333333]
 [ 0.33333333]
 [ 0.33333333]]
Finding Biggest Node.Phase 1 - read web graph
.
Phase 2 - normalize column weight

In [55]:
pRank = pageRank(tmp, r_init, 0.3)


Page ranking....

In [56]:
print pRank * 3

print pRank[1] + pRank[2]


[[ 0.3  ]
 [ 0.405]
 [ 2.295]]
[ 0.9]

In [57]:
# Real case for processing the data
start_time = time.time()
cnt = readBiggestNodesFromGraph()
mid_time = time.time()
elasped_time1 = mid_time - start_time
print "Finish finding biggest node - elasped time: " + str(elasped_time1)


Finding Biggest Node......
Finish finding biggest node - elasped time: 19.1437609196

In [59]:
graph = readWebGraph(cnt + 1)
end_time = time.time()
elasped_time2 = end_time - mid_time
print "Finish creating graph - elasped time: " + str(elasped_time2)


Phase 1 - read web graph
...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
Phase 2 - normalize column weight
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-59-e637a91e7bff> in <module>()
----> 1 graph = readWebGraph(cnt + 1)
      2 end_time = time.time()
      3 elasped_time2 = end_time - mid_time
      4 print "Finish creating graph - elasped time: " + str(elasped_time2)

<ipython-input-50-52ec3bd2877c> in readWebGraph(matSize, filename)
     22     if col_nums.size > 0:
     23          for col in np.nditer(col_nums):
---> 24              mat[:, col] = 1 / float(N)
     25 
     26     return mat.tocsc()

/Library/Python/2.7/site-packages/scipy/sparse/compressed.pyc in __setitem__(self, index, x)
    654             return
    655         i, j = self._swap((i.ravel(), j.ravel()))
--> 656         self._set_many(i, j, x.ravel())
    657 
    658     def _setdiag(self, values, k):

/Library/Python/2.7/site-packages/scipy/sparse/compressed.pyc in _set_many(self, i, j, x)
    738             j = j[mask]
    739             j[j < 0] += N
--> 740             self._insert_many(i, j, x[mask])
    741 
    742     def _insert_many(self, i, j, x):

/Library/Python/2.7/site-packages/scipy/sparse/compressed.pyc in _insert_many(self, i, j, x)
    805             # TODO: only sort where necessary
    806             self.has_sorted_indices = False
--> 807             self.sort_indices()
    808 
    809         self.check_format(full_check=False)

/Library/Python/2.7/site-packages/scipy/sparse/compressed.pyc in sort_indices(self)
   1039         if not self.has_sorted_indices:
   1040             fn = _sparsetools.csr_sort_indices
-> 1041             fn(len(self.indptr) - 1, self.indptr, self.indices, self.data)
   1042             self.has_sorted_indices = True
   1043 

KeyboardInterrupt: 

In [ ]:
r_init = normalize(np.ones((graph.shape[0], 1)), norm='l1', axis=0)
pRank = pageRank(graph, r_init, 0.2, 10**-5, 150)

print "The weight for node 99 is:"
print pRank[100]

In [80]:



---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-80-e8868cb4318b> in <module>()
----> 1 print pRank.shape()

TypeError: 'tuple' object is not callable

In [16]:


In [ ]: