Objective

To quantify graph properties in the data

Data Source

  1. NYC TLC
    • Yellow 2016 December
$ head yellow_tripdata_2016-12.csv
$ wc -l yellow_tripdata_2016-12.csv
$ # Single streaming pass, no temporary files: drop the header row, keep columns 8 and 9, turn the comma into a tab.
$ tail -n +2 yellow_tripdata_2016-12.csv | cut -d , -f 8,9 | tr "," "\t" > yellow_PUL_DOL_Graph.txt
  • Do this efficiently!
  • There are 10 million lines in the data file

In [1]:
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams["figure.figsize"] = (10,10)
import seaborn as sns

In [2]:
import snap
import os
import sys

Create directed graph


In [3]:
# Tab-separated edge list written by the shell preprocessing step.
fname = "yellow_PUL_DOL_Graph.txt"
# Zero-based positions of the first and second column of that file.
col1, col2 = 0, 1

In [4]:
# Load the edge list in `fname` as a directed graph (snap.PNGraph), reading the
# two node ids of every edge from columns `col1` and `col2`.
# NOTE(review): the recorded edge count (27080) is far below the ~10 million
# input lines, so repeated node pairs presumably collapse into one edge — confirm.
G = snap.LoadEdgeList(snap.PNGraph,fname,col1,col2)

In [5]:
# Report how many nodes and edges the loaded graph has.
print("\ngraph nodes %d, edges %d" % (G.GetNodes(), G.GetEdges()))


graph nodes 262, edges 27080

In [6]:
# Fill WccV with (component size, number of components) pairs for G,
# then report how many distinct component sizes were found.
WccV = snap.TIntPrV()
snap.GetWccSzCnt(G, WccV)
print("\n# of connected component sizes %d" % WccV.Len())


# of connected component sizes 1

In [7]:
# List every component size together with the number of components of that size.
print("\n# of connected component sizes %d" % WccV.Len())
for size_count in WccV:
    print("size %d, number of components %d" % (size_count.GetVal1(), size_count.GetVal2()))


# of connected component sizes 1
size 262, number of components 1

In [8]:
# Extract the largest connected component of G and report its size.
MxWcc = snap.GetMxWcc(G)
print("\nmax wcc nodes %d, edges %d" % (MxWcc.GetNodes(), MxWcc.GetEdges()))


max wcc nodes 262, edges 27080

In [9]:
# Fill InDegCntV with the in-degree histogram of G and report how many
# distinct in-degree values occur.
InDegCntV = snap.TIntPrV()
snap.GetInDegCnt(G, InDegCntV)
print("\n# of different in-degrees %d" % InDegCntV.Len())


# of different in-degrees 120

In [11]:
# Disabled cell: the loop that would list every (in-degree, number of nodes)
# pair is wrapped in a string literal, so nothing is printed and the notebook
# only echoes the literal itself as the cell output.
'''
for item in InDegCntV:
    print "in-degree %d, number of nodes %d" % (item.GetVal1(), item.GetVal2())
'''


Out[11]:
'\nfor item in InDegCntV:\n    print "in-degree %d, number of nodes %d" % (item.GetVal1(), item.GetVal2())\n'

In [12]:
# Fill OutDegCntV with the out-degree histogram of G and report how many
# distinct out-degree values occur.
OutDegCntV = snap.TIntPrV()
snap.GetOutDegCnt(G, OutDegCntV)
print("\n# of different out-degrees %d" % OutDegCntV.Len())


# of different out-degrees 151

In [13]:
# Disabled cell: the loop that would list every (out-degree, number of nodes)
# pair is wrapped in a string literal, so nothing is printed and the notebook
# only echoes the literal itself as the cell output.
'''
for item in OutDegCntV:
    print "out-degree %d, number of nodes %d" % (item.GetVal1(), item.GetVal2())
'''


Out[13]:
'\nfor item in OutDegCntV:\n    print "out-degree %d, number of nodes %d" % (item.GetVal1(), item.GetVal2())\n'

Get Page Rank


In [14]:
# Compute PageRank on G. The scores are written into PRankH, an int -> float
# hash table that later cells index by node id.
PRankH = snap.TIntFltH()
snap.GetPageRank(G, PRankH)

The PageRank scores are stored in a hash table (`snap.TIntFltH`) keyed by node id.


In [15]:
# Show the container type that holds the PageRank scores.
type(PRankH)


Out[15]:
snap.TIntFltH

In [16]:
# Node ids ordered from highest to lowest PageRank score.
lPRank = list(PRankH)
lPRank.sort(key=lambda node_id: PRankH[node_id], reverse=True)

In [17]:
# Print the ten node ids with the highest PageRank score.
# NOTE(review): the "experts" wording looks carried over from another example;
# the ids here come from the two columns of the edge-list file — confirm the label.
print("\ntop 10 experts by PageRank")
for node_id in lPRank[:10]:
    print("id %7s, pagerank %.6f" % (node_id, PRankH[node_id]))


top 10 experts by PageRank
id      44, pagerank 0.021227
id     265, pagerank 0.014389
id      27, pagerank 0.011263
id     132, pagerank 0.010668
id     264, pagerank 0.008206
id     138, pagerank 0.006748
id     170, pagerank 0.006588
id     230, pagerank 0.005863
id      75, pagerank 0.005809
id      48, pagerank 0.005673

In [18]:
#for item in PRankH:
#    print item

In [19]:
# Sanity-check the ranking container before building the plot series.
print(type(lPRank))
print(len(lPRank))

# sortedPRkey holds the positions 0..n-1 in the ranking (not node ids);
# sortedPR holds the PageRank score found at each of those positions.
sortedPRkey = list(range(len(lPRank)))
sortedPR = [PRankH[node_id] for node_id in lPRank]


<type 'list'>
262

In [20]:
# PageRank scores in descending order on a logarithmic y-axis. An explicit
# figure/axes pair is used and the axes are labelled so the figure can be
# read on its own.
fig, ax = plt.subplots()
ax.set_yscale('log')
ax.plot(sortedPR, 'o')
ax.set_xlabel("rank position (0 = highest PageRank)")
ax.set_ylabel("PageRank (log scale)")
ax.set_title("PageRank by rank position")
plt.show()


Only a few nodes stand out with a PageRank clearly above the rest.

Hash properly

Then proceed further


In [ ]:


In [20]:
# Run HITS on G. Hub scores are written into NIdHubH and authority scores into
# NIdAuthH; both are int -> float hash tables that later cells index by node id.
NIdHubH = snap.TIntFltH()
NIdAuthH = snap.TIntFltH()
snap.GetHits(G, NIdHubH, NIdAuthH)

In [18]:
# Node ids ordered from highest to lowest authority score, followed by the
# heading for the listing printed in the next cell.
lAuth = list(NIdAuthH)
lAuth.sort(key=lambda node_id: NIdAuthH[node_id], reverse=True)
print("\ntop 10 experts by Hits")


top 10 experts by Hits

In [19]:
# Print the ten node ids with the highest authority score.
for node_id in lAuth[:10]:
    print("id %7s, authority rank %.6f" % (node_id, NIdAuthH[node_id]))


id     132, authority rank 0.083657
id     264, authority rank 0.082190
id     265, authority rank 0.081600
id     138, authority rank 0.080902
id     230, authority rank 0.079899
id      48, authority rank 0.079303
id     161, authority rank 0.078768
id     162, authority rank 0.078451
id       7, authority rank 0.078450
id     186, authority rank 0.078401

In [21]:
# Node ids ordered from highest to lowest hub score.
lHub = list(NIdHubH)
lHub.sort(key=lambda node_id: NIdHubH[node_id], reverse=True)

In [22]:
# Print the ten node ids with the highest hub score.
for node_id in lHub[:10]:
    print("id %7s, hub rank %.6f" % (node_id, NIdHubH[node_id]))


id     132, hub rank 0.110319
id     138, hub rank 0.110147
id     170, hub rank 0.110117
id      48, hub rank 0.109902
id     162, hub rank 0.109768
id     186, hub rank 0.109678
id     100, hub rank 0.109566
id      79, hub rank 0.109547
id     230, hub rank 0.109448
id     234, hub rank 0.109286

In [26]:
# sortedAuthkey holds the positions 0..n-1 in the authority ranking (not node
# ids); sortedAuth holds the authority score found at each of those positions.
sortedAuthkey = list(range(len(lAuth)))
sortedAuth = [NIdAuthH[node_id] for node_id in lAuth]

In [27]:
# Authority scores in descending order on a logarithmic y-axis. An explicit
# figure/axes pair is used and the axes are labelled so the figure can be
# read on its own.
fig, ax = plt.subplots()
ax.set_yscale('log')
ax.plot(sortedAuth, 'o')
ax.set_xlabel("rank position (0 = highest authority score)")
ax.set_ylabel("HITS authority score (log scale)")
ax.set_title("HITS authority score by rank position")
plt.show()



In [28]:
# sortedHubkey holds the positions 0..n-1 in the hub ranking (not node ids);
# sortedHub holds the hub score found at each of those positions.
sortedHubkey = list(range(len(lHub)))
sortedHub = [NIdHubH[node_id] for node_id in lHub]

In [29]:
# Hub scores in descending order on a logarithmic y-axis. An explicit
# figure/axes pair is used and the axes are labelled so the figure can be
# read on its own.
fig, ax = plt.subplots()
ax.set_yscale('log')
ax.plot(sortedHub, 'o')
ax.set_xlabel("rank position (0 = highest hub score)")
ax.set_ylabel("HITS hub score (log scale)")
ax.set_title("HITS hub score by rank position")
plt.show()


Degree Distribution


In [33]:
# Have SNAP write the in-degree distribution of G to disk.
# NOTE(review): the next cell reads 'inDeg.yellow 2016.tab', so the second
# argument presumably becomes the suffix of the output file names and the
# third the plot description — confirm against the Snap.py reference.
snap.PlotInDegDistr(G, "yellow 2016", "yellow 2016 In Degree")

In [34]:
# Read the tab-separated in-degree table; the first three lines of the file are
# skipped so that the following line supplies the column names.
yellowInD = pd.read_csv(
    'inDeg.yellow 2016.tab',
    sep='\t',
    skiprows=3,
)

In [35]:
# Summary statistics for the two columns of the in-degree table.
yellowInD.describe()


Out[35]:
# In-degree Count
count 120.000000 120.000000
mean 93.608333 2.183333
std 42.301498 1.506196
min 1.000000 1.000000
25% 59.750000 1.000000
50% 96.500000 2.000000
75% 126.250000 3.000000
max 206.000000 8.000000

In [39]:
# In-degree distribution as a connected scatter. An explicit figure/axes pair
# is used and the axes are labelled so the figure can be read on its own.
fig, ax = plt.subplots(figsize=(8, 4.95))
ax.plot(yellowInD['# In-degree'], yellowInD['Count'], '-o')
ax.set_xlabel("in-degree")
ax.set_ylabel("count")
ax.set_title("In-degree distribution")
plt.show()



In [ ]: