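Goal: quantify the graph properties of the trip data. The shell commands below extract columns 8 and 9 of the trip CSV into a tab-separated edge list.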
# Preview the first rows of the raw trip CSV.
$ head yellow_tripdata_2016-12.csv
# Count the lines in the raw file.
$ wc -l yellow_tripdata_2016-12.csv
# Keep only comma-separated fields 8 and 9 (the two columns used as graph edges).
$ cut -d , -f 8,9 yellow_tripdata_2016-12.csv > yellow_PUL_DOL_Graph.txt
# Drop the header row (line 1).
$ cat yellow_PUL_DOL_Graph.txt | sed "1 d" > yellow_PUL_DOL_Graph_nohead.txt
# Remove the intermediate file; its name is reused for the final output below.
$ rm yellow_PUL_DOL_Graph.txt
# Turn commas into tabs to produce the final edge-list file.
$ tr "," "\t" < yellow_PUL_DOL_Graph_nohead.txt > yellow_PUL_DOL_Graph.txt
In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams["figure.figsize"] = (10,10)
import seaborn as sns
In [2]:
import snap
import os
import sys
In [3]:
# Edge-list file produced by the shell preprocessing step.
fname = "yellow_PUL_DOL_Graph.txt"
# Zero-based indices of the 1st and 2nd columns of that file.
col1, col2 = 0, 1
In [4]:
# Load the edge list into a snap.PNGraph, taking the two endpoints of each
# edge from columns col1 and col2 of the file.
G = snap.LoadEdgeList(snap.PNGraph,fname,col1,col2)
In [5]:
# Report the size of the loaded graph.  A parenthesised single-argument print
# yields the same output under Python 2 and Python 3.
print("\ngraph nodes %d, edges %d" % (G.GetNodes(), G.GetEdges()))
In [6]:
# Fill WccV with (component size, number of components) pairs for G.
WccV = snap.TIntPrV()
snap.GetWccSzCnt(G, WccV)
# Single formatted argument keeps the output identical on Python 2 and 3.
print("\n# of connected component sizes %d" % WccV.Len())
In [7]:
# List every distinct component size together with how many components
# have that size.
print("\n# of connected component sizes %d" % WccV.Len())
for comp in WccV:
    print("size %d, number of components %d" % (comp.GetVal1(), comp.GetVal2()))
In [8]:
# Extract the largest weakly connected component and report its size.
MxWcc = snap.GetMxWcc(G)
print("\nmax wcc nodes %d, edges %d" % (MxWcc.GetNodes(), MxWcc.GetEdges()))
In [9]:
# Fill InDegCntV with (in-degree, number of nodes) pairs for G.
InDegCntV = snap.TIntPrV()
snap.GetInDegCnt(G, InDegCntV)
# Single formatted argument keeps the output identical on Python 2 and 3.
print("\n# of different in-degrees %d" % InDegCntV.Len())
In [11]:
# Disabled listing of every (in-degree, node count) pair: the loop is wrapped
# in a bare string literal, so it is evaluated as a string and never executed.
'''
for item in InDegCntV:
print "in-degree %d, number of nodes %d" % (item.GetVal1(), item.GetVal2())
'''
Out[11]:
In [12]:
# Fill OutDegCntV with (out-degree, number of nodes) pairs for G.
OutDegCntV = snap.TIntPrV()
snap.GetOutDegCnt(G, OutDegCntV)
# Single formatted argument keeps the output identical on Python 2 and 3.
print("\n# of different out-degrees %d" % OutDegCntV.Len())
In [13]:
# Disabled listing of every (out-degree, node count) pair: the loop is wrapped
# in a bare string literal, so it is evaluated as a string and never executed.
'''
for item in OutDegCntV:
print "out-degree %d, number of nodes %d" % (item.GetVal1(), item.GetVal2())
'''
Out[13]:
In [14]:
# Compute PageRank for G; GetPageRank fills PRankH in place, and the table is
# later indexed by node ID to read each score.
PRankH = snap.TIntFltH()
snap.GetPageRank(G, PRankH)
PRankH is a hash table keyed by node ID, with each node's PageRank score as the value.
In [15]:
# Inspect the Python type of the PageRank table.
type(PRankH)
Out[15]:
In [16]:
# Node IDs ordered from the highest PageRank score to the lowest.
lPRank = sorted(PRankH, key=lambda node_id: PRankH[node_id], reverse=True)
In [17]:
# Show the ten node IDs with the highest PageRank scores.
print("\ntop 10 experts by PageRank")
for item in lPRank[:10]:
    print("id %7s, pagerank %.6f" % (item, PRankH[item]))
In [18]:
#for item in PRankH:
# print item
In [19]:
print(type(lPRank))
print(len(lPRank))
# Rank positions 0..N-1, and the PageRank score found at each position
# (lPRank is already ordered by descending score).
sortedPRkey = list(range(len(lPRank)))
sortedPR = [PRankH[node_id] for node_id in lPRank]
In [20]:
# Scatter the PageRank scores in rank order on a logarithmic y-axis.
axes = plt.gca()
axes.set_yscale('log')
axes.plot(sortedPR, 'o')
plt.show()
Very few nodes stand out as prominent in the PageRank plot.
In [ ]:
In [20]:
# Compute HITS scores for G; GetHits fills the hub and authority tables in
# place, and both are later indexed by node ID.
NIdHubH = snap.TIntFltH()
NIdAuthH = snap.TIntFltH()
snap.GetHits(G, NIdHubH, NIdAuthH)
In [18]:
# Node IDs ordered from the highest authority score to the lowest.
lAuth = sorted(NIdAuthH, key=lambda node_id: NIdAuthH[node_id], reverse=True)
# Parenthesised single-argument print: same output on Python 2 and 3.
print("\ntop 10 experts by Hits")
In [19]:
# Show the ten node IDs with the highest authority scores.
for item in lAuth[:10]:
    print("id %7s, authority rank %.6f" % (item, NIdAuthH[item]))
In [21]:
# Node IDs ordered from the highest hub score to the lowest.
lHub = sorted(NIdHubH, key=lambda node_id: NIdHubH[node_id], reverse=True)
In [22]:
# Show the ten node IDs with the highest hub scores.
for item in lHub[:10]:
    print("id %7s, hub rank %.6f" % (item, NIdHubH[item]))
In [26]:
# Rank positions 0..N-1, and the authority score found at each position
# (lAuth is already ordered by descending score).
sortedAuthkey = list(range(len(lAuth)))
sortedAuth = [NIdAuthH[node_id] for node_id in lAuth]
In [27]:
# Scatter the authority scores in rank order on a logarithmic y-axis.
axes = plt.gca()
axes.set_yscale('log')
axes.plot(sortedAuth, 'o')
plt.show()
In [28]:
# Rank positions 0..N-1, and the hub score found at each position
# (lHub is already ordered by descending score).
sortedHubkey = list(range(len(lHub)))
sortedHub = [NIdHubH[node_id] for node_id in lHub]
In [29]:
# Scatter the hub scores in rank order on a logarithmic y-axis.
axes = plt.gca()
axes.set_yscale('log')
axes.plot(sortedHub, 'o')
plt.show()
In [33]:
# Plot the in-degree distribution of G.  The "yellow 2016" argument matches
# the 'inDeg.yellow 2016.tab' file read back afterwards -- presumably written
# by this call; confirm against the SNAP documentation.
snap.PlotInDegDistr(G, "yellow 2016", "yellow 2016 In Degree")
In [34]:
# Load the tab-separated in-degree table, skipping its first three lines.
yellowInD = pd.read_csv('inDeg.yellow 2016.tab', sep='\t', skiprows=3)
In [35]:
# Summary statistics for the columns of the in-degree table.
yellowInD.describe()
Out[35]:
In [39]:
# Line-and-marker plot of node count against in-degree on a dedicated figure.
plt.figure(figsize=(8, 4.95))
in_degree = yellowInD['# In-degree']
node_count = yellowInD['Count']
plt.plot(in_degree, node_count, '-o')
plt.show()
In [ ]: