LIS590DV Final Project: Task One.

Group: Whale.

Yingjun Guan, Xiaoliang Jiang, Xinyu Zhang, Jialu Wang. The first task is based on the Champaign-Urbana Mass Transit District (CUMTD). The data and the corresponding documentation can be found at the data source (http://developer.cumtd.com/). The data include agency information (agency.txt), the running schedule (calendar.txt), schedule exceptions (calendar_dates.txt), stops (stops.txt), stop times (stop_times.txt), the routes of all traffic (routes.txt), the shapes of the routes - recorded point by point rather than stop by stop (shapes.txt), the daily trip schedule (trips.txt), and the fare information (fare_rules.txt and fare_attributes.txt).

1. 下面这个要留吗?

2. 佳璐的需要一些comments。。。。吗?

3. 多加些描述性文字?

4. 加地图部分?plotly怎么办办?

5. 佳璐的图有几个变黑啦

6. 姜霸霸的20号图


In [1]:
# TODO(review): decide whether to keep this cell. It is the stock matplotlib
# pie-chart demo (placeholder title below) fed with the same labels and
# fractions as the pie-chart cell that follows it.
#
# Basic pie chart with optional features: percentage auto-labelling,
# offsetting slices with "explode", a shadow, and a custom starting angle.

# Explicit pyplot import instead of `from pylab import *`, which floods the
# notebook namespace with NumPy/pyplot names and hides where names come from.
import matplotlib.pyplot as plt

# make a square figure and axes
fig = plt.figure(1, figsize=(6, 6))
ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])

# The slices are ordered and plotted counter-clockwise.
labels = '105 days', '51-100 days', '10-50days', '<10 days'
fracs = [43/289, 67/289, 139/289, 40/289]
explode = (0.05, 0.05, 0.05, 0.05)  # pull every slice slightly away from the centre

# startangle=90 rotates everything counter-clockwise by 90 degrees, so the
# first slice starts on the positive y-axis instead of the x-axis.
ax.pie(fracs, explode=explode, labels=labels,
       autopct='%1.1f%%', shadow=True, startangle=90)

# NOTE(review): placeholder title inherited from the matplotlib example.
ax.set_title('Raining Hogs and Dogs', bbox={'facecolor':'0.8', 'pad':5})

plt.show()



In [41]:
# Pie chart of the running-days buckets.
# Explicit pyplot import instead of `from pylab import *`, which floods the
# notebook namespace with NumPy/pyplot names and hides where names come from.
import matplotlib.pyplot as plt

# make a square figure and axes
fig = plt.figure(1, figsize=(6, 6))
ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])

# The slices are ordered and plotted counter-clockwise.
labels = '105 days', '51-100 days', '10-50days', '<10 days'
fracs = [43/289, 67/289, 139/289, 40/289]
explode = (0.05, 0, 0, 0)  # only the first slice is pulled out

# startangle=90 rotates everything counter-clockwise by 90 degrees, so the
# first slice starts on the positive y-axis instead of the x-axis.
ax.pie(fracs, explode=explode, labels=labels,
       autopct='%1.1f%%', shadow=True, startangle=90)

ax.set_title('Pie chart for traffic running days (out of 147)',
             bbox={'facecolor':'0.8', 'pad':5})

plt.show()


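This pie chart shows the distribution of traffic running days (out of 147), split into four ranges: 105 days, 51-100 days, 10-50 days and fewer than 10 days. (TODO: complete this description.)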

stop, stop time, routes


In [1]:
#enable plotting
%matplotlib inline

#import packages
import matplotlib.pyplot as plt
import numpy as np
import csv
import collections
from collections import Counter

# Default figure size (width, height) for every figure created after this
# cell; later cells overwrite this setting with their own sizes.
plt.rcParams["figure.figsize"] = (20,10)

In [5]:
def read_columns(path):
    """Read a CSV file into column-oriented lists.

    Parameters
    ----------
    path : str
        Path of a CSV file whose first row is the header.

    Returns
    -------
    tuple
        ``(header, columns)`` where ``header`` is the list of column names and
        ``columns`` maps each name to the list of its raw string values, in
        file order.
    """
    # newline="" is what the csv module asks for, so that quoted fields
    # containing line breaks are parsed correctly.
    with open(path, "r", newline="") as f:
        reader = csv.reader(f)
        header = next(reader)
        columns = {name: [] for name in header}
        for row in reader:
            # zip() stops at the shorter sequence, so a short row simply
            # contributes nothing to its missing trailing columns.
            for name, value in zip(header, row):
                columns[name].append(value)
    return header, columns


# Read the three tables used by the cells below. The variable names are kept
# because later cells refer to them directly.
fn = "stop_times.txt"
header, data = read_columns(fn)      # stop_times: one row per (trip, stop)

fn1 = "stops.txt"
header1, data1 = read_columns(fn1)   # stops

fn2 = "routes - routes.csv.csv"
header2, data2 = read_columns(fn2)   # routes

In [6]:
# Amount of stops per trip: count how often each trip_id occurs in the
# stop_times table.
# NOTE: `data` must still hold stop_times.txt here; a later cell reloads
# `data` from stops.txt, so this cell has to run before that one.
# Counter replaces the hand-written counting loop, whose loop variable `id`
# shadowed the builtin for the rest of the notebook.
trip_count = Counter(data['trip_id'])

a = Counter(trip_count)
# Ascending by stop count; ties keep first-seen order because sorted() is stable.
sorted_a = sorted(a.items(), key=lambda item: item[1])
x_val = np.arange(len(sorted_a))
y_val = [count for _, count in sorted_a]
plt.bar(x_val, y_val, align='center', width=0.6, color='r')
plt.ylabel('Amount of stops', fontsize=15)
plt.xlabel('Trips', fontsize=15)
plt.title('Distribution of amount of stops per trip', fontsize=15)
plt.show()


This graph shows the number of stops on each trip, with trips sorted in ascending order of stop count.


In [7]:
# Number of distinct trip_id values counted in the previous cell.
len(sorted_a)


Out[7]:
5498

In [43]:
# Amount of trips per stop: count how often each stop_id occurs in the
# stop_times table.
# NOTE: `data` must still hold stop_times.txt here; a later cell reloads
# `data` from stops.txt, so this cell has to run before that one.
# Counter replaces the hand-written counting loop, whose loop variable `id`
# shadowed the builtin for the rest of the notebook.
stop_count = Counter(data['stop_id'])   # reused by the top-20 cell below

b = Counter(stop_count)
# Ascending by count; ties keep first-seen order because sorted() is stable.
sorted_b = sorted(b.items(), key=lambda item: item[1])
x_val = np.arange(len(sorted_b))
y_val = [count for _, count in sorted_b]
plt.bar(x_val, y_val, align='center', width=0.6, color='g')
plt.ylabel('Amount of trips', fontsize=15)
plt.xlabel('Stops', fontsize=15)
plt.title('Distribution of amount of trips per stop', fontsize=15)
plt.show()


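This graph shows the number of trips serving each stop (how often each stop_id appears in stop_times.txt), with stops sorted in ascending order.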


In [36]:
# Number of distinct stop_id values counted in the previous cell.
len(sorted_b)


Out[36]:
2496

In [44]:
# The twenty stops with the highest counts, drawn from least to most busy.
busiest = Counter(stop_count).most_common(20)
busiest = sorted(busiest, key=lambda pair: pair[1])   # ascending, stable
transposed = list(zip(*busiest))                      # (stop ids, counts)
x_val = transposed[0]
y_val = transposed[1]
x_pos = np.arange(len(x_val))
plt.bar(x_pos, y_val, align='center', width=0.6, color='b')
plt.xticks(x_pos, x_val, fontsize=8)   # label each bar with its stop_id
plt.ylabel('Amount of trips', fontsize=15)
plt.title('20 stops with most trip stops', fontsize=15)
plt.show()


This graph represents the top 20 stops with the most trip stops.


In [45]:
# Tally how many routes use each route_color value.
color_count = {}
for route_color in data2['route_color']:
    color_count[route_color] = color_count.get(route_color, 0) + 1

d = Counter(color_count)
# Ascending by number of routes; ties keep first-seen order.
sorted_d = sorted(d.items(), key=lambda pair: pair[1])
x_val = [shade for shade, _ in sorted_d]
y_val = [total for _, total in sorted_d]
x_pos = np.arange(len(x_val))
plt.bar(x_pos, y_val, align='center', width=0.6, color='y')
plt.xticks(x_pos, x_val, fontsize=8)   # label each bar with its colour code
plt.ylabel('Amount of routes', fontsize=15)
plt.title('Distribution of routes per color', fontsize=15)
plt.show()


This graph represents the number of routes for each route color.


In [46]:
# Tally how many rows of the stops table share each stop_code.
location_count = {}
for stop_code in data1['stop_code']:
    location_count[stop_code] = location_count.get(stop_code, 0) + 1

e = Counter(location_count)
# Ascending by number of stops; ties keep first-seen order.
sorted_e = sorted(e.items(), key=lambda pair: pair[1])
x_val = [code for code, _ in sorted_e]   # also read by the next cell
y_val = [total for _, total in sorted_e]
x_pos = np.arange(len(x_val))
plt.bar(x_pos, y_val, align='center', width=0.5, color='c')
plt.ylabel('Amount of stops', fontsize=15)
plt.title('Stops per location', fontsize=15)
plt.show()


This graph represents the distribution of stops per location (stop_code).


In [40]:
# Number of distinct stop_code values; x_val is the list built in the previous cell.
len(x_val)


Out[40]:
1353

The graph below shows the physical path that each vehicle takes.

The data are the latitudes and longitudes of the points in shapes.txt, grouped by shape_id. The paths cover Champaign and Urbana and are denser in the campus district.


In [9]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import os
import csv

In [10]:
# Prefer a copy of shapes.txt in the working directory (where the other data
# files in this notebook are read from). Fall back to the original author's
# absolute path so that the existing setup keeps working unchanged.
shapes_path = "shapes.txt"
if not os.path.exists(shapes_path):
    shapes_path = "/Users/celine/Desktop/5DataVisual/google_transit/shapes.txt"

df = pd.read_csv(shapes_path)
# One group per shape_id; the plotting cells iterate over these groups.
df2 = df.groupby('shape_id')

In [11]:
# Draw every shape (one line per shape_id) in a grey level taken from the
# `binary` colormap, so that overlapping paths build up darker areas.
# len(df2) is the number of groups; it replaces the hard-coded 677, which was
# only right for one particular version of shapes.txt.
n_shapes = len(df2)
for s, (name, group) in enumerate(df2, start=1):
    plt.plot(group['shape_pt_lon'], group['shape_pt_lat'],
             color=plt.cm.binary(s / n_shapes), alpha=0.2)
plt.xlabel("Longitude", fontsize=15)
plt.ylabel("Latitude", fontsize=15)
plt.title("MTD routes", fontsize=15)
plt.show()



In [12]:
# Load stops.txt into `header` / `data`, replacing whatever an earlier cell
# stored under these names: one list of raw strings per column.
fn = "stops.txt"
with open(fn, "r") as f:
    reader = csv.reader(f)
    header = next(reader)
    data = {name: [] for name in header}
    for record in reader:
        # zip() pairs each cell with its column name and stops at the shorter one
        for name, cell in zip(header, record):
            data[name].append(cell)

In [13]:
class Dataset:
    """Column-oriented table backed by a dict of column name -> values.

    The filtering, ``size``, ``split`` and ``stats`` methods expect the
    columns to be NumPy arrays of equal length; call ``convert`` on each
    column first when the dataset was built from plain lists.
    """

    def __init__(self, data):
        """Wrap ``data`` (dict: column name -> values) without copying it."""
        self.data = data

    def convert(self, column, dtype):
        """Replace ``column`` in place by a NumPy array of the given ``dtype``.

        Because the dict passed to ``__init__`` is not copied, the caller's
        dict sees the converted column as well.
        """
        self.data[column] = np.array(self.data[column], dtype=dtype)

    def columns(self):
        """Return a view of the column names."""
        return self.data.keys()

    def _select(self, mask):
        """Return a new Dataset holding only the rows where ``mask`` is True."""
        return Dataset({name: values[mask] for name, values in self.data.items()})

    def filter_eq(self, column, value):
        """Return the rows whose ``column`` equals ``value``."""
        return self._select(self.data[column] == value)

    def filter_lt(self, column, value):
        """Return the rows whose ``column`` is strictly less than ``value``."""
        return self._select(self.data[column] < value)

    def filter_gt(self, column, value):
        """Return the rows whose ``column`` is strictly greater than ``value``."""
        return self._select(self.data[column] > value)

    def filter_ne(self, column, value):
        """Return the rows whose ``column`` differs from ``value``."""
        return self._select(self.data[column] != value)

    def size(self):
        """Return the number of rows (0 for a dataset without columns)."""
        # The size of the first column is used as the row count.
        for values in self.data.values():
            return values.size
        return 0

    def split(self, column):
        """Return a dict: unique value of ``column`` -> Dataset of its rows."""
        new_datasets = {}
        for split_value in np.unique(self.data[column]):
            new_datasets[split_value] = self.filter_eq(column, split_value)
        return new_datasets

    def stats(self):
        """Return ``{column: (min, max, std, mean)}`` for numeric columns.

        Integer and floating-point columns of any width are included.
        Non-numeric columns are skipped, and so are empty columns, for which
        min/max are undefined.
        """
        statistics = {}
        for key, values in self.data.items():
            dtype = values.dtype
            is_numeric = (np.issubdtype(dtype, np.integer)
                          or np.issubdtype(dtype, np.floating))
            if not is_numeric or values.size == 0:
                continue
            statistics[key] = (values.min(), values.max(), values.std(), values.mean())
        return statistics

    def compare(self, other):
        """Print min/max/std/mean of ``self`` next to those of ``other``.

        Only columns with statistics in both datasets are printed.
        """
        stats1 = self.stats()
        stats2 = other.stats()
        for column in self.columns():
            if column not in stats1 or column not in stats2:
                continue
            print("Column '{0:25s}'".format(column))
            for s1, s2 in zip(stats1[column], stats2[column]):
                print("    {0} vs {1}".format(s1, s2))

    def plot(self, x_column, y_column):
        """Scatter ``y_column`` against ``x_column`` as dots on the current axes."""
        plt.plot(self.data[x_column], self.data[y_column], '.')

In [14]:
# Show the column names read from the header row of stops.txt.
header


Out[14]:
['stop_id',
 'stop_code',
 'stop_name',
 'stop_desc',
 'stop_lat',
 'stop_lon',
 'zone_id',
 'stop_url',
 'location_type',
 'parent_station']

In [15]:
# Wrap the stops table and convert each column to a typed NumPy array.
# Dataset keeps a reference to `data` (no copy), so after this cell
# data["stop_lat"] and data["stop_lon"] are float arrays as well; the
# plotting cells below rely on that.
stopsdata = Dataset(data)
value_types = {'stop_id': 'str',        # was misspelled 'stop_ids' and never matched
               'stop_code': 'str',
               'stop_name': 'str',
               'stop_desc': 'str',
               'stop_lat': 'float',
               'stop_lon': 'float',
               'zone_id': 'float',
               'stop_url': 'str',
               'location_type': 'str',
               'parent_station': 'str'}
# list() takes a snapshot of the column names before the columns are reassigned.
for column_name in list(stopsdata.columns()):
    # Columns missing from value_types are kept as strings.
    stopsdata.convert(column_name, value_types.get(column_name, "str"))

In [16]:
# Four panels, one per walking time. Every stop is drawn as a translucent
# disc, so overlapping discs build up a coverage map.
# Marker sizes are given in points, not in degrees, so the drawn radius only
# approximates a walking distance at this figure size and axis extent.
walk_panels = [
    # (minutes, disc colour, marker size)
    (1, '#00ff80', 7),
    (2, '#80ff00', 15),
    (5, '#ffff00', 32),
    (10, '#ff0000', 65),
]

# Set the size before the first subplot creates the figure; setting it
# afterwards has no effect on the figure that already exists.
plt.rcParams["figure.figsize"] = (20, 20)

for position, (minutes, colour, size) in enumerate(walk_panels, start=1):
    plt.subplot(2, 2, position)
    plt.grid(True)
    plt.xlabel("Longitude", fontsize=15)
    plt.ylabel("Latitude", fontsize=15)
    plt.title("Walkable areas in {0} minute{1} for each stop".format(
        minutes, "" if minutes == 1 else "s"), fontsize=15)
    plt.plot(data["stop_lon"], data["stop_lat"], c=colour, marker='o',
             markersize=size, mec='none', ls='', alpha=0.05)

# Explicit show() so the cell does not end on a Line2D repr.
plt.show()


Out[16]:
[<matplotlib.lines.Line2D at 0x115327c50>]

Explanation

According to Google Maps, 0.02° of longitude at latitude 40.06°N equals 1.1 miles. So each circle here represents an area with radius r = 0.275 mile, i.e. the area that takes only 5 minutes to walk.


In [21]:
# 2-D histogram of the stop positions, shown with four different colormaps.
plt.rcParams["figure.figsize"] = (20, 15)

# stats() gives (min, max, std, mean) per numeric column.
stats = stopsdata.stats()
lon_min, lon_max = stats["stop_lon"][0], stats["stop_lon"][1]
lat_min, lat_max = stats["stop_lat"][0], stats["stop_lat"][1]

num_bins = 16
# linspace puts the first and last edge exactly on the min and max.
lon = np.linspace(lon_min, lon_max, num_bins + 1)
lat = np.linspace(lat_min, lat_max, num_bins + 1)

# histogram2d counts every stop exactly once: bins are half-open and the last
# one is closed. The previous strict '>' / '<' filters dropped any stop lying
# exactly on a bin edge, including the stops that define the extent.
# Latitude is passed first, so rows are latitude and columns are longitude.
stop_grid, _, _ = np.histogram2d(stopsdata.data["stop_lat"],
                                 stopsdata.data["stop_lon"],
                                 bins=[lat, lon])

# Same grid in four colormaps; None selects matplotlib's default colormap.
colormaps = [None, plt.cm.Blues, plt.cm.afmhot, plt.cm.BuGn]
for position, cmap in enumerate(colormaps, start=1):
    plt.subplot(2, 2, position)
    # origin="lower" puts the first row (lowest latitude) at the bottom.
    plt.imshow(stop_grid, extent=(lon_min, lon_max, lat_min, lat_max),
               origin="lower", cmap=cmap)
    plt.xlabel("Longitude", fontsize=15)
    plt.ylabel("Latitude", fontsize=15)
    plt.title("The distribution of stops", fontsize=25)
    color_bar = plt.colorbar()
    color_bar.set_label("Count")
plt.show()


Question

How can we make the plotting area a perfect square?


In [26]:
# One figure: walkable discs around every stop in four sizes, with the route
# shapes drawn on top.
plt.rcParams["figure.figsize"] = (20, 20)

# Grid, labels and title are set once. They used to be repeated before every
# layer, so only the last title survived and the four bare plt.grid() calls
# toggled the grid off again.
plt.grid(True)
plt.xlabel("Longitude", fontsize=15)
plt.ylabel("Latitude", fontsize=15)
plt.title("Walkable areas & MTD routes", fontsize=25)

# Largest disc first, so the smaller ones stay visible on top.
# NOTE(review): the walking time behind each size is unclear. The old titles
# and inline comments disagreed, and the sizing notes below this cell give
# about 11.25 size units per minute. Confirm before labelling the layers.
disc_layers = [
    ('#ffcccc', 169),
    ('#ffe5cc', 56),
    ('#ffffcc', 28),
    ('#e5ffcc', 11.2),
]
for colour, size in disc_layers:
    plt.plot(data["stop_lon"], data["stop_lat"], c=colour, marker='o',
             markersize=size, mec='none', ls='')

# Route shapes in grey levels; len(df2) is the number of shape groups and
# replaces the hard-coded 677.
n_shapes = len(df2)
for s, (name, group) in enumerate(df2, start=1):
    plt.plot(group['shape_pt_lon'], group['shape_pt_lat'],
             color=plt.cm.binary(s / n_shapes), alpha=0.2)
plt.show()


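Marker-size calibration: a marker of size 300 spans 2.65 miles, so r = 1.325 miles = 2.1324 km. At a walking speed of 80 m/min, size 300 corresponds to 2132.4 / 80 = 26.66 min, so 300 / 26.66 = 11.2528 size units = 1 min.

Longitude -88.35 to -88.15 (0.2°) spans 10.58 miles = 17.03 km; 1° of latitude = 111 km.

0.12° of latitude = 13.32 km, and 17.03 km = 1.2785 × 13.32 km, so the matching latitude span is 0.12 × 1.2785 = 0.15342°, i.e. 40.10 ± 0.07671 = 40.02329 to 40.17671.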


In [27]:
# Final figure: graded walkable discs around every stop (12 sizes, largest
# first) with the route shapes drawn on top.
plt.rcParams["figure.figsize"] = (20, 20)
plt.grid(True)
plt.xlabel("Longitude", fontsize=15)
plt.ylabel("Latitude", fontsize=15)
plt.title("Walkable areas & MTD routes", fontsize=25)
# Fixed window; the latitude limits are 40.10 +/- 0.07671.
plt.xlim(-88.339, -88.139)
plt.ylim(40.02329, 40.17671)

# Marker size per step; the sizing notes in this notebook equate 11.2528 size
# units with one minute of walking.
r = 11.2528
colorlist = ("#ffcccc", "#ffd5cc", "#ffddcc", "#ffe6cc", "#ffeecc", "#fff7cc",
             "#ffffcc", "#f7ffcc", "#eeffcc", "#e6ffcc", "#ddffcc", "#d5ffcc")

# Draw from the largest disc to the smallest, so the smaller (later) colours
# end up on top.
n_steps = len(colorlist)
for i in range(n_steps, 0, -1):
    plt.plot(data["stop_lon"], data["stop_lat"], color=colorlist[n_steps - i],
             marker='o', markersize=r * i, mec='none', ls='')

# Route shapes in grey levels; len(df2) is the number of shape groups and
# replaces the hard-coded 677.
n_shapes = len(df2)
for s, (name, group) in enumerate(df2, start=1):
    plt.plot(group['shape_pt_lon'], group['shape_pt_lat'],
             color=plt.cm.binary(s / n_shapes), alpha=0.2)
plt.show()



In [ ]: