In [1]:
#enable plotting
%matplotlib inline

#import packages
import matplotlib.pyplot as plt
import numpy as np
import csv
import collections
from collections import Counter

# Default size (width, height) for the figures created below; matplotlib
# interprets figure.figsize in inches.
plt.rcParams["figure.figsize"] = (20,10)

In [2]:
#read the files
def _read_csv_columns(path):
    """Read a CSV file into per-column lists.

    Parameters
    ----------
    path : str
        Path of the CSV file; its first row is used as the header.

    Returns
    -------
    tuple
        ``(header, columns)`` where ``header`` is the list of column names
        and ``columns`` maps each column name to the list of its values
        (strings, in file order).

    Raises
    ------
    ValueError
        If the file contains no header row.
    """
    # newline="" is what the csv module asks for, so that line breaks inside
    # quoted fields are handled by the reader itself.
    with open(path, "r", newline="") as f:
        reader = csv.reader(f)
        header = next(reader, None)
        if header is None:
            raise ValueError("{} is empty: no header row found".format(path))
        columns = {column: [] for column in header}
        for row in reader:
            # zip stops at the shorter of header/row: surplus cells in a row
            # are ignored and a short row leaves the later columns untouched.
            for column, value in zip(header, row):
                columns[column].append(value)
    return header, columns


fn = "stop_times.csv"
header, data = _read_csv_columns(fn)

fn1 = "stops.csv"
header1, data1 = _read_csv_columns(fn1)

fn2 = "routes.csv"
header2, data2 = _read_csv_columns(fn2)

In [3]:
#amount of stops per trip
# Tally every entry of data['trip_id']: each count is the number of rows
# recorded for that trip. Counter does the tally directly, without a loop
# variable named `id` that would shadow the built-in id() in later cells.
trip_count = Counter(data['trip_id'])

# Ascending by count; sorted() is stable, so ties keep first-seen order.
sorted_a = sorted(trip_count.items(), key=lambda item: item[1])
x_val = np.arange(len(sorted_a))
y_val = [count for _trip, count in sorted_a]
plt.bar(x_val, y_val, align='center', width=0.6, color='r')
plt.ylabel('Amount of stops', fontsize=15)
plt.xlabel('Trips', fontsize=15)
plt.title('Distribution of amount of stops per trip', fontsize=15)
plt.show()



In [18]:
# Number of distinct trip_id values (sorted_a holds one entry per trip).
len(sorted_a)


Out[18]:
5498

In [4]:
#amount of trips per stop
# Tally every entry of data['stop_id']: each count is the number of rows
# that reference that stop. Counter does the tally directly, without a loop
# variable named `id` that would shadow the built-in id() in later cells.
stop_count = Counter(data['stop_id'])

# Ascending by count; sorted() is stable, so ties keep first-seen order.
sorted_b = sorted(stop_count.items(), key=lambda item: item[1])
x_val = np.arange(len(sorted_b))
y_val = [count for _stop, count in sorted_b]
plt.bar(x_val, y_val, align='center', width=0.6, color='g')
plt.ylabel('Amount of trips', fontsize=15)
plt.xlabel('Stops', fontsize=15)
plt.title('Distribution of amount of trips per stop', fontsize=15)
plt.show()



In [17]:
# Number of distinct stop_id values (sorted_b holds one entry per stop).
len(sorted_b)


Out[17]:
2496

In [10]:
# The 20 stop_ids with the highest counts, re-sorted ascending so the
# tallest bar ends up on the right. The stable sort keeps most_common()'s
# order among equal counts.
top_stops = Counter(stop_count).most_common(20)
top_stops.sort(key=lambda item: item[1])

# Unpack with comprehensions rather than list(zip(*...))[0], which raises
# IndexError when there is nothing to plot.
x_val = [stop for stop, _count in top_stops]
y_val = [count for _stop, count in top_stops]
x_pos = np.arange(len(x_val))
plt.bar(x_pos, y_val, align='center', width=0.6, color='b')
plt.xticks(x_pos, x_val, fontsize=8)
plt.xlabel('Stops', fontsize=15)
plt.ylabel('Amount of trips', fontsize=15)
plt.title('20 stops with most trip stops', fontsize=15)
plt.show()



In [12]:
# Number of routes that use each route_color value.
color_count = Counter(data2['route_color'])

# Ascending by count; sorted() is stable, so ties keep first-seen order.
sorted_d = sorted(color_count.items(), key=lambda item: item[1])
x_val = [color for color, _count in sorted_d]
y_val = [count for _color, count in sorted_d]
x_pos = np.arange(len(x_val))
plt.bar(x_pos, y_val, align='center', width=0.6, color='y')
plt.xticks(x_pos, x_val, fontsize=8)
plt.xlabel('Route color', fontsize=15)
plt.ylabel('Amount of routes', fontsize=15)
plt.title('Distribution of routes per color', fontsize=15)
plt.show()



In [15]:
# Number of entries in data1 that share each stop_code value.
location_count = Counter(data1['stop_code'])

# Ascending by count; sorted() is stable, so ties keep first-seen order.
sorted_e = sorted(location_count.items(), key=lambda item: item[1])
# x_val holds the stop codes; they are not drawn as tick labels.
x_val = [code for code, _count in sorted_e]
y_val = [count for _code, count in sorted_e]
x_pos = np.arange(len(x_val))
plt.bar(x_pos, y_val, align='center', width=0.5, color='c')
plt.xlabel('Stop codes', fontsize=15)
plt.ylabel('Amount of stops', fontsize=15)
plt.title('Stops per location', fontsize=15)
plt.show()



In [16]:
# Number of distinct stop_code values. Read from sorted_e rather than x_val,
# which every plotting cell reassigns, so the result does not depend on which
# cell happened to run last.
len(sorted_e)


Out[16]:
1353

In [ ]: