Using snappy27 python 2.7 environment



In [1]:

    
import pandas as pd



In [2]:

    
import networkx as nx



In [3]:

    
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams["figure.figsize"] = (10,10)
import seaborn as sns

Objective

To check whether graph visualization suggests meaningful structures in the data

Data Source

NYC TLC
- Yellow 2016 December
- Empty commas at the end are shifting column headers
NYC TLC
- Yellow 2015 December

Check data



In [4]:

    
# The 2016 file has empty trailing commas that shift the column headers when
# read naively, so only the first 17 columns are loaded.
# Only the first 100,000 rows are read, so everything below is a partial sample.
tripDf = pd.read_csv(
    'yellow_tripdata_2016-12.csv',
    nrows=100000,
    usecols=range(17),
)



In [5]:

    
# Preview the first rows of the 2016 sample to check that values sit under the right headers.
tripDf.head()









    Out[5]:






  
    
      
      VendorID
      tpep_pickup_datetime
      tpep_dropoff_datetime
      passenger_count
      trip_distance
      RatecodeID
      store_and_fwd_flag
      PULocationID
      DOLocationID
      payment_type
      fare_amount
      extra
      mta_tax
      tip_amount
      tolls_amount
      improvement_surcharge
      total_amount
    
  
  
    
      0
      2
      2016-12-31 15:15:01
      2016-12-31 15:15:09
      1
      0.00
      1
      N
      264
      264
      2
      1.0
      0.0
      0.5
      0.00
      0.0
      0.3
      1.80
    
    
      1
      1
      2016-12-01 00:00:01
      2016-12-01 00:10:22
      1
      1.60
      1
      N
      163
      143
      2
      9.0
      0.5
      0.5
      0.00
      0.0
      0.3
      10.30
    
    
      2
      1
      2016-12-01 00:00:01
      2016-12-01 00:11:01
      1
      1.40
      1
      N
      164
      229
      1
      9.0
      0.5
      0.5
      2.05
      0.0
      0.3
      12.35
    
    
      3
      2
      2016-12-01 00:00:02
      2016-12-01 00:09:17
      6
      1.69
      1
      N
      246
      107
      1
      8.5
      0.5
      0.5
      2.45
      0.0
      0.3
      12.25
    
    
      4
      2
      2016-12-01 00:00:02
      2016-12-01 00:15:20
      1
      1.88
      1
      N
      161
      162
      1
      10.5
      0.5
      0.5
      2.95
      0.0
      0.3
      14.75



In [6]:

    
# Summary statistics of the 2016 sample (count, mean, std, min, quartiles, max per numeric column).
tripDf.describe()









    Out[6]:






  
    
      
      VendorID
      passenger_count
      trip_distance
      RatecodeID
      PULocationID
      DOLocationID
      payment_type
      fare_amount
      extra
      mta_tax
      tip_amount
      tolls_amount
      improvement_surcharge
      total_amount
    
  
  
    
      count
      100000.000000
      100000.000000
      100000.000000
      100000.000000
      100000.000000
      100000.000000
      100000.000000
      100000.000000
      100000.000000
      100000.00000
      100000.000000
      100000.000000
      100000.000000
      100000.000000
    
    
      mean
      1.534170
      1.688080
      2.816452
      1.027230
      160.956120
      157.442760
      1.315800
      12.808481
      0.517615
      0.49820
      1.856390
      0.196189
      0.299724
      16.179009
    
    
      std
      0.498834
      1.277311
      3.266767
      0.265272
      66.109491
      73.031908
      0.481158
      9.833531
      0.165189
      0.03289
      2.261572
      1.104020
      0.012366
      11.850111
    
    
      min
      1.000000
      0.000000
      0.000000
      1.000000
      1.000000
      1.000000
      1.000000
      -123.000000
      -1.000000
      -0.50000
      0.000000
      0.000000
      -0.300000
      -123.300000
    
    
      25%
      1.000000
      1.000000
      1.030000
      1.000000
      113.000000
      97.000000
      1.000000
      7.000000
      0.500000
      0.50000
      0.000000
      0.000000
      0.300000
      9.300000
    
    
      50%
      2.000000
      1.000000
      1.750000
      1.000000
      161.000000
      161.000000
      1.000000
      10.000000
      0.500000
      0.50000
      1.500000
      0.000000
      0.300000
      12.800000
    
    
      75%
      2.000000
      2.000000
      3.200000
      1.000000
      231.000000
      233.000000
      2.000000
      15.000000
      0.500000
      0.50000
      2.560000
      0.000000
      0.300000
      18.500000
    
    
      max
      2.000000
      8.000000
      64.890000
      6.000000
      265.000000
      265.000000
      4.000000
      350.000000
      4.500000
      0.50000
      70.000000
      36.040000
      0.300000
      350.800000



In [7]:

    
# Same-size sample (first 100,000 rows) of the 2015 file, for comparison with 2016.
tripDf2 = pd.read_csv(
    'yellow_tripdata_2015-12.csv',
    nrows=100000,
)



In [8]:

    
# Preview the first rows of the 2015 sample.
tripDf2.head()









    Out[8]:






  
    
      
      VendorID
      tpep_pickup_datetime
      tpep_dropoff_datetime
      passenger_count
      trip_distance
      pickup_longitude
      pickup_latitude
      RatecodeID
      store_and_fwd_flag
      dropoff_longitude
      dropoff_latitude
      payment_type
      fare_amount
      extra
      mta_tax
      tip_amount
      tolls_amount
      improvement_surcharge
      total_amount
    
  
  
    
      0
      2
      2015-12-01 00:00:00
      2015-12-01 00:05:16
      5
      0.96
      -73.979942
      40.765381
      1
      N
      -73.966309
      40.763088
      1
      5.5
      0.5
      0.5
      1.00
      0.0
      0.3
      7.80
    
    
      1
      2
      2015-12-01 00:00:00
      2015-12-01 00:00:00
      2
      2.69
      -73.972336
      40.762379
      1
      N
      -73.993629
      40.745998
      1
      21.5
      0.0
      0.5
      3.34
      0.0
      0.3
      25.64
    
    
      2
      2
      2015-12-01 00:00:00
      2015-12-01 00:00:00
      1
      2.62
      -73.968849
      40.764530
      1
      N
      -73.974548
      40.791641
      1
      17.0
      0.0
      0.5
      3.56
      0.0
      0.3
      21.36
    
    
      3
      1
      2015-12-01 00:00:01
      2015-12-01 00:05:56
      1
      1.20
      -73.993935
      40.741684
      1
      N
      -73.997665
      40.747467
      1
      6.5
      0.5
      0.5
      0.20
      0.0
      0.3
      8.00
    
    
      4
      1
      2015-12-01 00:00:01
      2015-12-01 00:09:28
      2
      3.00
      -73.988922
      40.726990
      1
      N
      -73.975594
      40.696869
      2
      11.0
      0.5
      0.5
      0.00
      0.0
      0.3
      12.30



In [9]:

    
# Summary statistics of the 2015 sample (count, mean, std, min, quartiles, max per numeric column).
tripDf2.describe()









    Out[9]:






  
    
      
      VendorID
      passenger_count
      trip_distance
      pickup_longitude
      pickup_latitude
      RatecodeID
      dropoff_longitude
      dropoff_latitude
      payment_type
      fare_amount
      extra
      mta_tax
      tip_amount
      tolls_amount
      improvement_surcharge
      total_amount
    
  
  
    
      count
      100000.000000
      100000.000000
      100000.000000
      100000.000000
      100000.000000
      100000.000000
      100000.000000
      100000.000000
      100000.000000
      100000.000000
      100000.000000
      100000.000000
      100000.000000
      100000.000000
      100000.000000
      100000.000000
    
    
      mean
      1.528430
      1.704210
      2.827327
      -72.875803
      40.143712
      1.028550
      -72.926800
      40.172130
      1.339510
      12.750026
      0.604903
      0.498520
      1.799825
      0.238597
      0.299736
      16.191607
    
    
      std
      0.499194
      1.304138
      3.370212
      8.950561
      4.930469
      0.395748
      8.749564
      4.819822
      0.487284
      9.859140
      0.227777
      0.030787
      2.271786
      1.178359
      0.011922
      11.909361
    
    
      min
      1.000000
      0.000000
      0.000000
      -77.047104
      0.000000
      1.000000
      -74.624863
      0.000000
      1.000000
      -120.000000
      -1.000000
      -0.500000
      0.000000
      0.000000
      -0.300000
      -120.300000
    
    
      25%
      1.000000
      1.000000
      1.030000
      -73.992464
      40.734241
      1.000000
      -73.991776
      40.731129
      1.000000
      7.000000
      0.500000
      0.500000
      0.000000
      0.000000
      0.300000
      9.300000
    
    
      50%
      2.000000
      1.000000
      1.710000
      -73.982414
      40.751160
      1.000000
      -73.981369
      40.750484
      1.000000
      10.000000
      0.500000
      0.500000
      1.460000
      0.000000
      0.300000
      12.800000
    
    
      75%
      2.000000
      2.000000
      3.100000
      -73.969246
      40.765747
      1.000000
      -73.964371
      40.768700
      2.000000
      15.000000
      0.500000
      0.500000
      2.550000
      0.000000
      0.300000
      18.360000
    
    
      max
      2.000000
      6.000000
      91.200000
      0.000000
      42.736137
      99.000000
      0.000000
      41.487267
      4.000000
      500.000000
      1.500000
      2.500000
      115.000000
      24.000000
      0.300000
      550.300000

On vectorizing data

After 2015, TLC no longer publishes latitude and longitude information.
In a way, the 2016 data is already vectorized: the taxi zones form 265 nodes. Using that for now.
The 2015 data needs to be geohashed or binned.
Also, the taxi zone data needs to be geolocated.



In [10]:

    
# Drop the 2015 sample; the cells that follow in this notebook use only the 2016 frame (tripDf).
del(tripDf2)



In [11]:

    
# Distribution of pickup zone IDs in the sample.
fig, ax = plt.subplots(figsize=(8, 4.95))
sns.distplot(tripDf['PULocationID'], ax=ax)
plt.show()



In [12]:

    
# this is just one block of the DF
# Plain histogram of pickup zone IDs, 100 bins.
fig, ax = plt.subplots(figsize=(8, 4.95))
ax.hist(tripDf['PULocationID'], bins=100);



In [13]:

    
# Pickup zone vs. drop-off zone for every trip in the sample.
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(tripDf['PULocationID'], tripDf['DOLocationID'], marker='.')
plt.show()

Create directed graph, unweighted

Groupby question



In [14]:

    
# Count trips per (pickup zone, drop-off zone) pair, then flatten the grouped
# result into a three-column edge list: PULocationID, DOLocationID, count.
pair_sizes = tripDf.groupby(['PULocationID', 'DOLocationID']).size()
df1 = pair_sizes.reset_index(name='count')
df1.head()









    Out[14]:






  
    
      
      PULocationID
      DOLocationID
      count
    
  
  
    
      0
      1
      265
      1
    
    
      1
      4
      4
      10
    
    
      2
      4
      7
      1
    
    
      3
      4
      13
      2
    
    
      4
      4
      25
      2



In [15]:

    
# Last rows of the (pickup, drop-off, count) edge list.
df1.tail()









    Out[15]:






  
    
      
      PULocationID
      DOLocationID
      count
    
  
  
    
      7319
      264
      265
      6
    
    
      7320
      265
      117
      1
    
    
      7321
      265
      161
      1
    
    
      7322
      265
      162
      1
    
    
      7323
      265
      265
      43



In [16]:

    
# Sanity check of the edge list: print index, pickup zone, drop-off zone and
# trip count for the first six rows.
# head(6) bounds the loop up front instead of iterating and breaking on the index.
for index, row in df1.head(6).iterrows():
    # Second field is the drop-off zone (DOLocationID), not the pickup zone again.
    print("%s %s %s %s" % (index, row['PULocationID'], row['DOLocationID'], row['count']))



In [17]:

    
# Unweighted directed graph: one edge per distinct (pickup zone -> drop-off zone)
# pair found in df1; trip counts are ignored here.
Guwd = nx.DiGraph()
Guwd.add_edges_from(
    (pu_zone, do_zone)
    for pu_zone, do_zone in df1[['PULocationID', 'DOLocationID']].values
)



In [18]:

    
# Report the size of the unweighted directed graph.
print("Number of nodes: %d" % Guwd.number_of_nodes())
print("Number of edges: %d" % Guwd.number_of_edges())









    



Number of nodes: 247
Number of edges: 7324

Why are there 247 nodes?

Visualize graph



In [19]:

    
# Spring (force-directed) layout of the unweighted directed graph.
plt.figure(figsize=(15, 15))
spring_pos = nx.spring_layout(Guwd)
nx.draw(Guwd, spring_pos, node_size=60, width=0.04)
plt.show()  # display









    



/Users/jarumugam/anaconda/envs/snappy27/lib/python2.7/site-packages/networkx/drawing/nx_pylab.py:126: MatplotlibDeprecationWarning: pyplot.hold is deprecated.
    Future behavior will be consistent with the long-time default:
    plot commands add elements without first clearing the
    Axes and/or Figure.
  b = plt.ishold()
/Users/jarumugam/anaconda/envs/snappy27/lib/python2.7/site-packages/networkx/drawing/nx_pylab.py:138: MatplotlibDeprecationWarning: pyplot.hold is deprecated.
    Future behavior will be consistent with the long-time default:
    plot commands add elements without first clearing the
    Axes and/or Figure.
  plt.hold(b)
/Users/jarumugam/anaconda/envs/snappy27/lib/python2.7/site-packages/matplotlib/__init__.py:917: UserWarning: axes.hold is deprecated. Please remove it from your matplotlibrc and/or style files.
  warnings.warn(self.msg_depr_set % key)
/Users/jarumugam/anaconda/envs/snappy27/lib/python2.7/site-packages/matplotlib/rcsetup.py:152: UserWarning: axes.hold is deprecated, will be removed in 3.0
  warnings.warn("axes.hold is deprecated, will be removed in 3.0")



In [20]:

    
# Circular layout of the unweighted directed graph; the figure is also written to disk.
plt.figure(figsize=(15, 15))
circular_pos = nx.circular_layout(Guwd)
nx.draw(Guwd, circular_pos, node_size=60, width=0.04)
plt.savefig("unweighted_graph_Y16_partial.png")
plt.show()  # display

This does not reveal any structure yet.



In [21]:

    
# Discard the unweighted graph; the following cells build weighted graphs from df1 instead.
del(Guwd)

Observation

This graph is unweighted.
Furthermore, not all of the data was parsed to form this graph (only the first 100,000 rows were read).

Create weighted graph



In [22]:

    
# Re-display the first rows of the (pickup, drop-off, count) edge list before adding weights.
df1.head()









    Out[22]:






  
    
      
      PULocationID
      DOLocationID
      count
    
  
  
    
      0
      1
      265
      1
    
    
      1
      4
      4
      10
    
    
      2
      4
      7
      1
    
    
      3
      4
      13
      2
    
    
      4
      4
      25
      2



In [23]:

    
# Re-display the last rows of the (pickup, drop-off, count) edge list.
df1.tail()









    Out[23]:






  
    
      
      PULocationID
      DOLocationID
      count
    
  
  
    
      7319
      264
      265
      6
    
    
      7320
      265
      117
      1
    
    
      7321
      265
      161
      1
    
    
      7322
      265
      162
      1
    
    
      7323
      265
      265
      43



In [24]:

    
# Summary statistics of the edge list; the 'count' column shows how unevenly trips are spread over pairs.
df1.describe()









    Out[24]:






  
    
      
      PULocationID
      DOLocationID
      count
    
  
  
    
      count
      7324.000000
      7324.000000
      7324.000000
    
    
      mean
      149.527307
      141.323730
      13.653741
    
    
      std
      72.290430
      77.624095
      33.067425
    
    
      min
      1.000000
      1.000000
      1.000000
    
    
      25%
      90.000000
      74.000000
      1.000000
    
    
      50%
      144.000000
      143.000000
      3.000000
    
    
      75%
      229.000000
      220.000000
      12.000000
    
    
      max
      265.000000
      265.000000
      1590.000000

Some of the edges repeat many times. Use the repetition count to create edge weights.



In [25]:

    
# Plain-text look at the edge list: column index, then the first five rows.
print(df1.columns)
print(df1.head(5))









    



Index([u'PULocationID', u'DOLocationID', u'count'], dtype='object')
   PULocationID  DOLocationID  count
0             1           265      1
1             4             4     10
2             4             7      1
3             4            13      2
4             4            25      2



In [26]:

    
# Sanity check of the edge list: print index, pickup zone, drop-off zone and
# trip count for the first six rows.
# head(6) bounds the loop up front instead of iterating and breaking on the index.
for index, row in df1.head(6).iterrows():
    # Second field is the drop-off zone (DOLocationID), not the pickup zone again.
    print("%s %s %s %s" % (index, row['PULocationID'], row['DOLocationID'], row['count']))



In [27]:

    
# Weighted directed graph: edge (pickup zone -> drop-off zone) carries the
# number of trips for that pair as its 'weight' attribute.
Gwd = nx.DiGraph()
Gwd.add_weighted_edges_from(
    df1[['PULocationID', 'DOLocationID', 'count']].values
)



In [28]:

    
# Line widths for plotting: each edge's trip count scaled down by 400.
edgewidth = []
for _u, _v, attrs in Gwd.edges(data=True):
    edgewidth.append(attrs['weight'] / 400.)



In [29]:

    
# Report the size of the weighted directed graph.
print("Number of nodes: %d" % Gwd.number_of_nodes())
print("Number of edges: %d" % Gwd.number_of_edges())









    



Number of nodes: 247
Number of edges: 7324



In [30]:

    
# Circular layout of the weighted directed graph; line width follows trip count.
plt.figure(figsize=(15, 15))
circular_pos = nx.circular_layout(Gwd)
nx.draw(Gwd, circular_pos, node_size=50, width=edgewidth);
plt.show()  # display



In [31]:

    
# Discard the weighted directed graph; the next cell builds an undirected one from df1.
del(Gwd)



In [32]:

    
# Weighted undirected graph: one edge per unordered pair of taxi zones, with
# 'weight' = total number of trips between the two zones in either direction.
#
# df1 is a directed edge list, so A->B and B->A arrive as two separate rows
# that map onto the same undirected edge. Calling add_edge a second time would
# replace the first weight with the second, so the counts are summed instead.
Gw = nx.Graph()
for pu_zone, do_zone, trips in df1[['PULocationID', 'DOLocationID', 'count']].values:
    if Gw.has_edge(pu_zone, do_zone):
        # Reverse direction already recorded: accumulate instead of overwrite.
        Gw[pu_zone][do_zone]['weight'] += trips
    else:
        Gw.add_edge(pu_zone, do_zone, weight=trips)



In [33]:

    
# Line widths for plotting: each edge's weight scaled down by 100.
edgewidth = []
for _u, _v, attrs in Gw.edges(data=True):
    edgewidth.append(attrs['weight'] / 100.)

# Report the size of the weighted undirected graph.
print("Number of nodes: %d" % Gw.number_of_nodes())
print("Number of edges: %d" % Gw.number_of_edges())









    



Number of nodes: 247
Number of edges: 5499



In [34]:

    
# Circular layout of the weighted undirected graph; the figure is also written to disk.
plt.figure(figsize=(15, 15))
circular_pos = nx.circular_layout(Gw)
nx.draw(Gw, circular_pos, node_size=50, width=edgewidth);
plt.savefig("weighted_graph_Y16_partial.png")
plt.show()  # display

Yes!
There are certain nodes and edges that appear more prominent
This graph is weighted
Furthermore, not all of the data was parsed to form this graph
The dataframe manipulation needs to be improved

	VendorID	tpep_pickup_datetime	tpep_dropoff_datetime	passenger_count	trip_distance	RatecodeID	store_and_fwd_flag	PULocationID	DOLocationID	payment_type	fare_amount	extra	mta_tax	tip_amount	improvement_surcharge	total_amount
0	2	2016-12-31 15:15:01	2016-12-31 15:15:09	1	0.00	1	N	264	264	2	1.0	0.0	0.5	0.00	0.3	1.80
1	1	2016-12-01 00:00:01	2016-12-01 00:10:22	1	1.60	1	N	163	143	2	9.0	0.5	0.5	0.00	0.3	10.30
2	1	2016-12-01 00:00:01	2016-12-01 00:11:01	1	1.40	1	N	164	229	1	9.0	0.5	0.5	2.05	0.3	12.35
3	2	2016-12-01 00:00:02	2016-12-01 00:09:17	6	1.69	1	N	246	107	1	8.5	0.5	0.5	2.45	0.3	12.25
4	2	2016-12-01 00:00:02	2016-12-01 00:15:20	1	1.88	1	N	161	162	1	10.5	0.5	0.5	2.95	0.3	14.75

	VendorID	passenger_count	trip_distance	RatecodeID	PULocationID	DOLocationID	payment_type	fare_amount	extra	mta_tax	tip_amount	tolls_amount	improvement_surcharge	total_amount
count	100000.000000	100000.000000	100000.000000	100000.000000	100000.000000	100000.000000	100000.000000	100000.000000	100000.000000	100000.00000	100000.000000	100000.000000	100000.000000	100000.000000
mean	1.534170	1.688080	2.816452	1.027230	160.956120	157.442760	1.315800	12.808481	0.517615	0.49820	1.856390	0.196189	0.299724	16.179009
std	0.498834	1.277311	3.266767	0.265272	66.109491	73.031908	0.481158	9.833531	0.165189	0.03289	2.261572	1.104020	0.012366	11.850111
min	1.000000	0.000000	0.000000	1.000000	1.000000	1.000000	1.000000	-123.000000	-1.000000	-0.50000	0.000000	0.000000	-0.300000	-123.300000
25%	1.000000	1.000000	1.030000	1.000000	113.000000	97.000000	1.000000	7.000000	0.500000	0.50000	0.000000	0.000000	0.300000	9.300000
50%	2.000000	1.000000	1.750000	1.000000	161.000000	161.000000	1.000000	10.000000	0.500000	0.50000	1.500000	0.000000	0.300000	12.800000
75%	2.000000	2.000000	3.200000	1.000000	231.000000	233.000000	2.000000	15.000000	0.500000	0.50000	2.560000	0.000000	0.300000	18.500000
max	2.000000	8.000000	64.890000	6.000000	265.000000	265.000000	4.000000	350.000000	4.500000	0.50000	70.000000	36.040000	0.300000	350.800000

	VendorID	tpep_pickup_datetime	tpep_dropoff_datetime	passenger_count	trip_distance	pickup_longitude	pickup_latitude	RatecodeID	store_and_fwd_flag	dropoff_longitude	dropoff_latitude	payment_type	fare_amount	extra	mta_tax	tip_amount	improvement_surcharge	total_amount
0	2	2015-12-01 00:00:00	2015-12-01 00:05:16	5	0.96	-73.979942	40.765381	1	N	-73.966309	40.763088	1	5.5	0.5	0.5	1.00	0.3	7.80
1	2	2015-12-01 00:00:00	2015-12-01 00:00:00	2	2.69	-73.972336	40.762379	1	N	-73.993629	40.745998	1	21.5	0.0	0.5	3.34	0.3	25.64
2	2	2015-12-01 00:00:00	2015-12-01 00:00:00	1	2.62	-73.968849	40.764530	1	N	-73.974548	40.791641	1	17.0	0.0	0.5	3.56	0.3	21.36
3	1	2015-12-01 00:00:01	2015-12-01 00:05:56	1	1.20	-73.993935	40.741684	1	N	-73.997665	40.747467	1	6.5	0.5	0.5	0.20	0.3	8.00
4	1	2015-12-01 00:00:01	2015-12-01 00:09:28	2	3.00	-73.988922	40.726990	1	N	-73.975594	40.696869	2	11.0	0.5	0.5	0.00	0.3	12.30

	PULocationID	DOLocationID	count
count	7324.000000	7324.000000	7324.000000
mean	149.527307	141.323730	13.653741
std	72.290430	77.624095	33.067425
min	1.000000	1.000000	1.000000
25%	90.000000	74.000000	1.000000
50%	144.000000	143.000000	3.000000
75%	229.000000	220.000000	12.000000
max	265.000000	265.000000	1590.000000

Using snappy27 python 2.7 environment

Objective

Data Source

Check data

On vectorizing data

Create directed graph, unweighted

Why are there 247 nodes?

Visualize graph

Observation

Create weighted graph

Thank You!