In [1]:
import pandas as pd
In [2]:
import networkx as nx
In [3]:
import matplotlib.pyplot as plt
%matplotlib inline
# Default figure size (width, height) for plots that do not pass their own figsize.
plt.rcParams["figure.figsize"] = (10,10)
import seaborn as sns
In [4]:
# Load the first 100,000 rows of the December 2016 yellow-cab trip file,
# keeping only the first 17 columns.
# NOTE(review): an earlier variant of this cell read every column; presumably
# the column restriction works around extra columns in this file -- confirm.
tripDf = pd.read_csv(
    'yellow_tripdata_2016-12.csv',
    nrows=100000,
    usecols=range(17),
)
In [5]:
# Preview the first rows of the December 2016 sample.
tripDf.head()
Out[5]:
In [6]:
# Summary statistics for the numeric columns of the December 2016 sample.
tripDf.describe()
Out[6]:
In [7]:
# Load the first 100,000 rows of the December 2015 file (all columns).
tripDf2 = pd.read_csv(
    'yellow_tripdata_2015-12.csv',
    nrows=100000,
)
In [8]:
# Preview the first rows of the December 2015 sample
# (presumably to compare its layout with the 2016 file -- confirm).
tripDf2.head()
Out[8]:
In [9]:
# Summary statistics for the numeric columns of the December 2015 sample.
tripDf2.describe()
Out[9]:
In [10]:
# Release the 2015 sample; none of the later visible cells use it.
del tripDf2
In [11]:
# Distribution of pickup location IDs in the 2016 sample.
pickup_zone_ids = tripDf['PULocationID']
plt.figure(figsize=(8, 4.95))
sns.distplot(pickup_zone_ids)
plt.show()
In [12]:
# Histogram of pickup location IDs (100 bins).
# Only the 100,000-row sample held in tripDf is plotted, not the full file.
pickup_zone_ids = tripDf['PULocationID']
plt.figure(figsize=(8, 4.95))
plt.hist(pickup_zone_ids, bins=100);
In [13]:
# Scatter of pickup location ID against drop-off location ID, one dot per trip.
pickup_zone_ids = tripDf['PULocationID']
dropoff_zone_ids = tripDf['DOLocationID']
plt.figure(figsize=(8, 8))
plt.scatter(pickup_zone_ids, dropoff_zone_ids, marker='.')
plt.show()
In [14]:
# Count trips per (pickup, drop-off) pair and flatten the result into a frame
# with columns PULocationID, DOLocationID and count.
df1 = (
    tripDf
    .groupby(['PULocationID', 'DOLocationID'])
    .size()
    .reset_index(name='count')
)
df1.head()
Out[14]:
In [15]:
# Last rows of the per-pair trip counts.
df1.tail()
Out[15]:
In [16]:
# Spot-check the first six rows of df1 as: index, pickup ID, drop-off ID, count.
# The third field is the drop-off ID (DOLocationID), so each line shows the
# full (pickup, drop-off) pair. head(6) bounds the loop instead of a manual
# index test, and the single formatted string prints the same text under
# Python 2 and Python 3.
for index, row in df1.head(6).iterrows():
    print("%s %s %s %s" % (index, row['PULocationID'], row['DOLocationID'], row['count']))
In [17]:
# Unweighted directed graph: one edge pickup -> drop-off for every
# (pickup, drop-off) pair present in df1.
Guwd = nx.DiGraph()
for row_label, od_pair in df1.iterrows():
    origin = od_pair['PULocationID']
    destination = od_pair['DOLocationID']
    Guwd.add_edge(origin, destination)
In [18]:
# Report the size of the unweighted directed graph.
node_total = Guwd.number_of_nodes()
edge_total = Guwd.number_of_edges()
print("Number of nodes: %d" % node_total)
print("Number of edges: %d" % edge_total)
In [19]:
# Draw the unweighted graph with a spring (force-directed) layout.
plt.figure(figsize=(15, 15))
spring_positions = nx.spring_layout(Guwd)
nx.draw(Guwd, spring_positions, node_size=60, width=0.04)
plt.show()  # display
In [20]:
# Draw the unweighted graph with its nodes on a circle and save the figure.
plt.figure(figsize=(15, 15))
circle_positions = nx.circular_layout(Guwd)
nx.draw(Guwd, circle_positions, node_size=60, width=0.04)
plt.savefig("unweighted_graph_Y16_partial.png")
plt.show()  # display
This does not reveal any structure yet.
In [21]:
# Release the unweighted graph; none of the later visible cells use it.
del Guwd
In [22]:
# Re-inspect the first rows of the per-pair trip counts.
df1.head()
Out[22]:
In [23]:
# Re-inspect the last rows of the per-pair trip counts.
df1.tail()
Out[23]:
In [24]:
# Summary statistics of df1, including the spread of the per-pair 'count' column.
df1.describe()
Out[24]:
Some of the edges repeat many times. Use the repetition counts as edge weights.
In [25]:
# Show the column labels of df1 followed by its first five rows.
print(df1.columns)
print(df1.head(5))
In [26]:
# Spot-check the first six rows of df1 as: index, pickup ID, drop-off ID, count.
# The third field is the drop-off ID (DOLocationID), so each line shows the
# full (pickup, drop-off) pair. head(6) bounds the loop instead of a manual
# index test, and the single formatted string prints the same text under
# Python 2 and Python 3.
for index, row in df1.head(6).iterrows():
    print("%s %s %s %s" % (index, row['PULocationID'], row['DOLocationID'], row['count']))
In [27]:
# Weighted directed graph: edge pickup -> drop-off, with the trip count of
# that pair stored as the edge's 'weight' attribute.
Gwd = nx.DiGraph()
for row_label, od_pair in df1.iterrows():
    origin = od_pair['PULocationID']
    destination = od_pair['DOLocationID']
    Gwd.add_edge(origin, destination, weight=od_pair['count'])
In [28]:
# Line width per edge for plotting: the edge weight scaled down by 400.
edgewidth = [
    attrs['weight'] / 400.
    for (tail_node, head_node, attrs) in Gwd.edges(data=True)
]
In [29]:
# Report the size of the weighted directed graph.
node_total = Gwd.number_of_nodes()
edge_total = Gwd.number_of_edges()
print("Number of nodes: %d" % node_total)
print("Number of edges: %d" % edge_total)
In [30]:
# Draw the weighted directed graph on a circle; line widths follow edgewidth.
plt.figure(figsize=(15, 15))
circle_positions = nx.circular_layout(Gwd)
nx.draw(Gwd, circle_positions, node_size=50, width=edgewidth);
plt.show()  # display
In [31]:
# Release the weighted directed graph; none of the later visible cells use it.
del Gwd
In [32]:
# Weighted undirected graph: one edge per unordered pair of location IDs.
# df1 holds directed counts, so A->B and B->A are separate rows that map to
# the same undirected edge. Calling add_edge() on an edge that already exists
# overwrites its attributes, which would keep only the count of whichever
# direction is processed last. The weights are therefore accumulated so each
# edge carries the trips of both directions.
Gw = nx.Graph()
for index, row in df1.iterrows():
    pickup, dropoff, trips = row['PULocationID'], row['DOLocationID'], row['count']
    if Gw.has_edge(pickup, dropoff):
        # Reverse direction already seen: add this direction's trips.
        Gw[pickup][dropoff]['weight'] += trips
    else:
        Gw.add_edge(pickup, dropoff, weight=trips)
In [33]:
# Line width per edge for plotting (edge weight scaled down by 100),
# followed by the size of the undirected graph.
edgewidth = [
    attrs['weight'] / 100.
    for (node_a, node_b, attrs) in Gw.edges(data=True)
]
node_total = Gw.number_of_nodes()
edge_total = Gw.number_of_edges()
print("Number of nodes: %d" % node_total)
print("Number of edges: %d" % edge_total)
In [34]:
# Draw the weighted undirected graph on a circle (line widths follow
# edgewidth) and save the figure.
plt.figure(figsize=(15, 15))
circle_positions = nx.circular_layout(Gw)
nx.draw(Gw, circle_positions, node_size=50, width=edgewidth);
plt.savefig("weighted_graph_Y16_partial.png")
plt.show()  # display