Using snappy27 python 2.7 environment


In [1]:
import pandas as pd

In [2]:
import networkx as nx

In [3]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams["figure.figsize"] = (10,10)
import seaborn as sns

Objective

To check if graph visualization suggests nice strutures in the data

Data Source

  1. NYC TLC
    • Yellow 2016 December
    • Empty commas at the end are shifting column headers
  2. NYC TLC
    • Yellow 2015 December

Check data


In [4]:
# tripDf = pd.read_csv('yellow_tripdata_2016-12.csv', nrows=100000)
tripDf = pd.read_csv('yellow_tripdata_2016-12.csv', nrows=100000, usecols=range(0,17))

In [5]:
tripDf.head()


Out[5]:
VendorID tpep_pickup_datetime tpep_dropoff_datetime passenger_count trip_distance RatecodeID store_and_fwd_flag PULocationID DOLocationID payment_type fare_amount extra mta_tax tip_amount tolls_amount improvement_surcharge total_amount
0 2 2016-12-31 15:15:01 2016-12-31 15:15:09 1 0.00 1 N 264 264 2 1.0 0.0 0.5 0.00 0.0 0.3 1.80
1 1 2016-12-01 00:00:01 2016-12-01 00:10:22 1 1.60 1 N 163 143 2 9.0 0.5 0.5 0.00 0.0 0.3 10.30
2 1 2016-12-01 00:00:01 2016-12-01 00:11:01 1 1.40 1 N 164 229 1 9.0 0.5 0.5 2.05 0.0 0.3 12.35
3 2 2016-12-01 00:00:02 2016-12-01 00:09:17 6 1.69 1 N 246 107 1 8.5 0.5 0.5 2.45 0.0 0.3 12.25
4 2 2016-12-01 00:00:02 2016-12-01 00:15:20 1 1.88 1 N 161 162 1 10.5 0.5 0.5 2.95 0.0 0.3 14.75

In [6]:
tripDf.describe()


Out[6]:
VendorID passenger_count trip_distance RatecodeID PULocationID DOLocationID payment_type fare_amount extra mta_tax tip_amount tolls_amount improvement_surcharge total_amount
count 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.00000 100000.000000 100000.000000 100000.000000 100000.000000
mean 1.534170 1.688080 2.816452 1.027230 160.956120 157.442760 1.315800 12.808481 0.517615 0.49820 1.856390 0.196189 0.299724 16.179009
std 0.498834 1.277311 3.266767 0.265272 66.109491 73.031908 0.481158 9.833531 0.165189 0.03289 2.261572 1.104020 0.012366 11.850111
min 1.000000 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000 -123.000000 -1.000000 -0.50000 0.000000 0.000000 -0.300000 -123.300000
25% 1.000000 1.000000 1.030000 1.000000 113.000000 97.000000 1.000000 7.000000 0.500000 0.50000 0.000000 0.000000 0.300000 9.300000
50% 2.000000 1.000000 1.750000 1.000000 161.000000 161.000000 1.000000 10.000000 0.500000 0.50000 1.500000 0.000000 0.300000 12.800000
75% 2.000000 2.000000 3.200000 1.000000 231.000000 233.000000 2.000000 15.000000 0.500000 0.50000 2.560000 0.000000 0.300000 18.500000
max 2.000000 8.000000 64.890000 6.000000 265.000000 265.000000 4.000000 350.000000 4.500000 0.50000 70.000000 36.040000 0.300000 350.800000

In [7]:
tripDf2 = pd.read_csv('yellow_tripdata_2015-12.csv', nrows=100000)

In [8]:
tripDf2.head()


Out[8]:
VendorID tpep_pickup_datetime tpep_dropoff_datetime passenger_count trip_distance pickup_longitude pickup_latitude RatecodeID store_and_fwd_flag dropoff_longitude dropoff_latitude payment_type fare_amount extra mta_tax tip_amount tolls_amount improvement_surcharge total_amount
0 2 2015-12-01 00:00:00 2015-12-01 00:05:16 5 0.96 -73.979942 40.765381 1 N -73.966309 40.763088 1 5.5 0.5 0.5 1.00 0.0 0.3 7.80
1 2 2015-12-01 00:00:00 2015-12-01 00:00:00 2 2.69 -73.972336 40.762379 1 N -73.993629 40.745998 1 21.5 0.0 0.5 3.34 0.0 0.3 25.64
2 2 2015-12-01 00:00:00 2015-12-01 00:00:00 1 2.62 -73.968849 40.764530 1 N -73.974548 40.791641 1 17.0 0.0 0.5 3.56 0.0 0.3 21.36
3 1 2015-12-01 00:00:01 2015-12-01 00:05:56 1 1.20 -73.993935 40.741684 1 N -73.997665 40.747467 1 6.5 0.5 0.5 0.20 0.0 0.3 8.00
4 1 2015-12-01 00:00:01 2015-12-01 00:09:28 2 3.00 -73.988922 40.726990 1 N -73.975594 40.696869 2 11.0 0.5 0.5 0.00 0.0 0.3 12.30

In [9]:
tripDf2.describe()


Out[9]:
VendorID passenger_count trip_distance pickup_longitude pickup_latitude RatecodeID dropoff_longitude dropoff_latitude payment_type fare_amount extra mta_tax tip_amount tolls_amount improvement_surcharge total_amount
count 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000
mean 1.528430 1.704210 2.827327 -72.875803 40.143712 1.028550 -72.926800 40.172130 1.339510 12.750026 0.604903 0.498520 1.799825 0.238597 0.299736 16.191607
std 0.499194 1.304138 3.370212 8.950561 4.930469 0.395748 8.749564 4.819822 0.487284 9.859140 0.227777 0.030787 2.271786 1.178359 0.011922 11.909361
min 1.000000 0.000000 0.000000 -77.047104 0.000000 1.000000 -74.624863 0.000000 1.000000 -120.000000 -1.000000 -0.500000 0.000000 0.000000 -0.300000 -120.300000
25% 1.000000 1.000000 1.030000 -73.992464 40.734241 1.000000 -73.991776 40.731129 1.000000 7.000000 0.500000 0.500000 0.000000 0.000000 0.300000 9.300000
50% 2.000000 1.000000 1.710000 -73.982414 40.751160 1.000000 -73.981369 40.750484 1.000000 10.000000 0.500000 0.500000 1.460000 0.000000 0.300000 12.800000
75% 2.000000 2.000000 3.100000 -73.969246 40.765747 1.000000 -73.964371 40.768700 2.000000 15.000000 0.500000 0.500000 2.550000 0.000000 0.300000 18.360000
max 2.000000 6.000000 91.200000 0.000000 42.736137 99.000000 0.000000 41.487267 4.000000 500.000000 1.500000 2.500000 115.000000 24.000000 0.300000 550.300000

On vectorizing data

  • After 2015 TLC is not publishing latitude and longitude information.
  • In a way, 2016 data is already vectorized. Taxi zones form 265 nodes. Using that for now.
  • 2015 dat needs to be geohashed or binned.
  • Also, taxi zone data needs to be geolocated.

In [10]:
del(tripDf2)

In [11]:
plt.figure(figsize=(8,4.95))
sns.distplot(tripDf['PULocationID'])
plt.show()



In [12]:
# this is just one block of the DF 
plt.figure(figsize=(8,4.95))
plt.hist(tripDf['PULocationID'], bins=100);



In [13]:
plt.figure(figsize=(8,8))
plt.scatter(tripDf['PULocationID'], tripDf['DOLocationID'], marker='.')
plt.show()


Create directed graph, unweighted


In [14]:
# df1 = tripDf.groupby(['PULocationID','DOLocationID']).size()
df1 = tripDf.groupby(['PULocationID','DOLocationID']).size().reset_index(name='count')
df1.head()


Out[14]:
PULocationID DOLocationID count
0 1 265 1
1 4 4 10
2 4 7 1
3 4 13 2
4 4 25 2

In [15]:
df1.tail()


Out[15]:
PULocationID DOLocationID count
7319 264 265 6
7320 265 117 1
7321 265 161 1
7322 265 162 1
7323 265 265 43

In [16]:
for index, row in df1.iterrows():
    if index > 5:
        break
    else:
        print index, row['PULocationID'], row['PULocationID'], row['count']


0 1 1 1
1 4 4 10
2 4 4 1
3 4 4 2
4 4 4 2
5 4 4 2

In [17]:
Guwd = nx.DiGraph()

for index, row in df1.iterrows():
    Guwd.add_edge(row['PULocationID'],row['DOLocationID'])

In [18]:
print "Number of nodes: %d" % Guwd.number_of_nodes()
print "Number of edges: %d" % Guwd.number_of_edges()


Number of nodes: 247
Number of edges: 7324

Why are there 247 nodes?

Viualize graph


In [19]:
plt.figure(figsize=(15,15))
nx.draw_spring(Guwd,  node_size=60, width=0.04)
plt.show() # display


/Users/jarumugam/anaconda/envs/snappy27/lib/python2.7/site-packages/networkx/drawing/nx_pylab.py:126: MatplotlibDeprecationWarning: pyplot.hold is deprecated.
    Future behavior will be consistent with the long-time default:
    plot commands add elements without first clearing the
    Axes and/or Figure.
  b = plt.ishold()
/Users/jarumugam/anaconda/envs/snappy27/lib/python2.7/site-packages/networkx/drawing/nx_pylab.py:138: MatplotlibDeprecationWarning: pyplot.hold is deprecated.
    Future behavior will be consistent with the long-time default:
    plot commands add elements without first clearing the
    Axes and/or Figure.
  plt.hold(b)
/Users/jarumugam/anaconda/envs/snappy27/lib/python2.7/site-packages/matplotlib/__init__.py:917: UserWarning: axes.hold is deprecated. Please remove it from your matplotlibrc and/or style files.
  warnings.warn(self.msg_depr_set % key)
/Users/jarumugam/anaconda/envs/snappy27/lib/python2.7/site-packages/matplotlib/rcsetup.py:152: UserWarning: axes.hold is deprecated, will be removed in 3.0
  warnings.warn("axes.hold is deprecated, will be removed in 3.0")

In [20]:
plt.figure(figsize=(15,15))
nx.draw_circular(Guwd,  node_size=60, width=0.04)
plt.savefig("unweighted_graph_Y16_partial.png") 
plt.show() # display


This does not revealing any structures, yet.


In [21]:
del(Guwd)

Observtation

  • This graph is unweighted
  • Further not all the data was parsed to form this graph

Create weighted graph


In [22]:
df1.head()


Out[22]:
PULocationID DOLocationID count
0 1 265 1
1 4 4 10
2 4 7 1
3 4 13 2
4 4 25 2

In [23]:
df1.tail()


Out[23]:
PULocationID DOLocationID count
7319 264 265 6
7320 265 117 1
7321 265 161 1
7322 265 162 1
7323 265 265 43

In [24]:
df1.describe()


Out[24]:
PULocationID DOLocationID count
count 7324.000000 7324.000000 7324.000000
mean 149.527307 141.323730 13.653741
std 72.290430 77.624095 33.067425
min 1.000000 1.000000 1.000000
25% 90.000000 74.000000 1.000000
50% 144.000000 143.000000 3.000000
75% 229.000000 220.000000 12.000000
max 265.000000 265.000000 1590.000000

Some of the edges repeat many times. USe repetition to create edge weights.


In [25]:
print df1.columns
print df1[:5]


Index([u'PULocationID', u'DOLocationID', u'count'], dtype='object')
   PULocationID  DOLocationID  count
0             1           265      1
1             4             4     10
2             4             7      1
3             4            13      2
4             4            25      2

In [26]:
for index, row in df1.iterrows():
    if index > 5:
        break
    else:
        print index, row['PULocationID'], row['PULocationID'], row['count']


0 1 1 1
1 4 4 10
2 4 4 1
3 4 4 2
4 4 4 2
5 4 4 2

In [27]:
Gwd = nx.DiGraph()
for index, row in df1.iterrows():
    Gwd.add_edge(row['PULocationID'],row['DOLocationID'],weight=row['count'])

In [28]:
edgewidth = [ d['weight']/400. for (u,v,d) in Gwd.edges(data=True) ] # for plotting

In [29]:
print "Number of nodes: %d" % Gwd.number_of_nodes()
print "Number of edges: %d" % Gwd.number_of_edges()


Number of nodes: 247
Number of edges: 7324

In [30]:
plt.figure(figsize=(15,15))
nx.draw_circular(Gwd, node_size=50, width=edgewidth);
plt.show() # display



In [31]:
del(Gwd)

In [32]:
Gw = nx.Graph()
for index, row in df1.iterrows():
    Gw.add_edge(row['PULocationID'],row['DOLocationID'],weight=row['count'])

In [33]:
edgewidth = [ d['weight']/100. for (u,v,d) in Gw.edges(data=True) ] # for plotting 

print "Number of nodes: %d" % Gw.number_of_nodes()
print "Number of edges: %d" % Gw.number_of_edges()


Number of nodes: 247
Number of edges: 5499

In [34]:
plt.figure(figsize=(15,15))
nx.draw_circular(Gw, node_size=50, width=edgewidth);
plt.savefig("weighted_graph_Y16_partial.png") 
plt.show() # display


  • Yes!
  • There are certain nodes and edges thar appear more prominent
  • This graph is weighted
  • Further not all the data was parsed to form this graph
  • The dataframe manipulation needs to be parsed better

Thank You!