In [11]:
import sys
sys.path.append('/usr/local/lib/python2.7/site-packages')
import json
from pprint import pprint
from cta_api import CTABustracker, CTATraintracker
from serializers import MyEncoder
bus_tracker = CTABustracker()
north_predictions = bus_tracker.get_predictions_for_stops(route_id=72, stp_id="890,944") # North Ave Bus Predictions
cali_predictions = bus_tracker.get_predictions_for_stops(route_id=52, stp_id="3183,3006") # California Bus Predictions
for prediction in north_predictions:
    print json.dumps(prediction, cls=MyEncoder, indent=4)
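The serializers.MyEncoder class isn't shown in this notebook. Since it's passed as cls to json.dumps, it is presumably a json.JSONEncoder subclass along these lines (a sketch under that assumption, not the actual implementation):

import datetime
import json

class MyEncoder(json.JSONEncoder):
    """Hypothetical stand-in for serializers.MyEncoder."""
    def default(self, obj):
        # Render datetimes as ISO 8601 strings...
        if isinstance(obj, datetime.datetime):
            return obj.isoformat()
        # ...and fall back to the attribute dict for plain
        # serializer objects like BusPrediction.
        if hasattr(obj, '__dict__'):
            return obj.__dict__
        return json.JSONEncoder.default(self, obj)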
In [2]:
import sys
sys.path.append('/usr/local/lib/python2.7/site-packages')
import json
from pprint import pprint
from cta_api import CTABustracker, CTATraintracker
from serializers import MyEncoder
bus_tracker = CTABustracker()
north_predictions = bus_tracker.get_raw_predictions_for_stops(route_id=72, stp_id="890,944") # North Ave Bus Predictions
cali_predictions = bus_tracker.get_raw_predictions_for_stops(route_id=52, stp_id="3183,3006") # California Bus Predictions
for prediction in north_predictions:
    print json.dumps(prediction, cls=MyEncoder, indent=4)
In [3]:
import sys
sys.path.append('/usr/local/lib/python2.7/site-packages')
import json
from serializers import BusPrediction
import pylab
import pandas as pd
# from cta_api import CTABustracker
# Can live query for data:
# bus_tracker = CTABustracker()
# north_predictions = bus_tracker.get_predictions_for_stops(route_id=72, stp_id="890") # North Ave Bus Predictions
# north_ave_etas = [bus_eta for bus_eta in north_predictions if 6 < bus_eta.get_requested_time_hour() < 12 and bus_eta.stop_name == 'North Ave & California']
# north_ave_etas = [float(bus.bus_eta) for bus in north_ave_etas]
df = pd.read_csv("/Users/lorenamesa/Desktop/pytennessee/bus_data.csv", index_col = [0, 1])
# print type(df)
north_ave = [row.to_dict() for index, row in df.iterrows() if row[-1] == 'North Ave & California']
for indx, bus in enumerate(north_ave):
    clean_d = { k.replace("'","").strip(): v for k, v in bus.iteritems() }
    north_ave[indx] = BusPrediction(**clean_d)
morning_north_ave = [float(prediction.bus_eta) for prediction in north_ave if 6 < prediction.get_requested_time_hour() < 12]
pylab.figure()
data = pylab.hist(morning_north_ave, bins=6)
pylab.title("North (#72) Bus ETAs From Home to Work")
pylab.xlabel("Minutes Until Pickup")
pylab.ylabel("Number of Buses")
pylab.show()
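serializers.BusPrediction also isn't shown here. From the way it's used above (keyword construction from a cleaned CSV row, plus bus_eta, stop_name, and get_requested_time_hour()), it plausibly looks something like this sketch; any field beyond those three is a guess:

class BusPrediction(object):
    """Hypothetical sketch of serializers.BusPrediction; only bus_eta,
    stop_name, and get_requested_time_hour() are used in this notebook."""
    def __init__(self, **kwargs):
        self.bus_eta = kwargs.get('bus_eta')      # minutes until arrival
        self.stop_name = kwargs.get('stop_name')
        # 'requested_time' is an assumed column name for the query time
        self.requested_time = kwargs.get('requested_time')

    def get_requested_time_hour(self):
        # Assumes an 'HH:MM' style local time string
        return int(self.requested_time.split(':')[0])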
In [15]:
import sys
sys.path.append('/usr/local/lib/python2.7/site-packages')
import json
from serializers import BusPrediction
import pylab
import pandas as pd
df = pd.read_csv("/Users/lorenamesa/Desktop/pytennessee/bus_non-labeled_data.csv", index_col = [0, 1])
# 461 Eastbound #72 records
print df[(df['delayed'] == 1) & (df['stop_name'] == 'North Ave & California')]
print df[(df['delayed'] == 0) & (df['stop_name'] == 'North Ave & California')]
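Those two frames can be collapsed into a single delay rate for the stop (a one-liner sketch, assuming delayed is coded 0/1):

north = df[df['stop_name'] == 'North Ave & California']
print 'delay rate: %.1f%%' % (100.0 * north['delayed'].mean())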
In [22]:
import sys
sys.path.append('/usr/local/lib/python2.7/site-packages')
import json
from serializers import Tweet
import pylab
import pandas as pd
import re
import operator
df = pd.read_csv("/Users/lorenamesa/Desktop/pytennessee/cta_tweet_data.csv", index_col = [0, 1])
all_tweets = {}
all_ids = set([])
def ngrams(text, n):
    tokens = text.split(' ')
    output = []
    for i in range(len(tokens) - n + 1):
        output.append(tokens[i:i+n])
    return output
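# e.g. (a quick sanity check, not part of the original data flow):
# ngrams('red line trains are delayed', 2)
# => [['red', 'line'], ['line', 'trains'], ['trains', 'are'], ['are', 'delayed']]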
# print df
for indx, row in df.iterrows():
    if pd.notnull(row[-1]) and pd.notnull(row[0]):
        row_dict = row.to_dict()
        clean_d = { k.replace("'","").strip(): v for k, v in row_dict.iteritems() }
        tweet = Tweet(**clean_d)
        # Keyed by tweet_id, so a duplicated id is only stored once
        all_tweets[tweet.tweet_id] = tweet
print all_tweets.keys()
# all_tweet_bigrams = [ngrams(re.sub(r'[^\w\s]', '', data.text), 2) for tweet_id, data in all_tweets.iteritems()]
# all_tweet_bigrams = sum(all_tweet_bigrams, [])
# bigrams_freq = {}
# for bigram in all_tweet_bigrams:
#     joined_bigram = ' '.join(bigram)
#     if joined_bigram not in bigrams_freq:
#         bigrams_freq[joined_bigram] = 1
#     else:
#         bigrams_freq[joined_bigram] += 1
# sorted_bigrams_freq = sorted(bigrams_freq.items(), key=operator.itemgetter(1), reverse=True)
# # print sorted_bigrams_freq[:10] # [('trains are', 112), ('Line trains', 100), ('are operating', 70), ('residual delays', 67), ('delays after', 67), ('operating with', 67), ('with residual', 65), ('due to', 60), ('buses are', 50), ('rerouted via', 50)]
# labels = [bigram[0] for bigram in sorted_bigrams_freq[:10]]
# data = [bigram[1] for bigram in sorted_bigrams_freq[:10]]
# pylab.title('Top 10 Bigrams in CTA Tweet Data January 2016')
# pylab.pie(data, labels=labels, autopct='%1.1f%%', shadow=True, startangle=90)
# pylab.show()
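The commented-out frequency loop above can be collapsed with collections.Counter; a sketch of the same top-10 computation, assuming all_tweet_bigrams has been built as in the commented lines:

from collections import Counter

bigram_counts = Counter(' '.join(bigram) for bigram in all_tweet_bigrams)
print bigram_counts.most_common(10)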
In [10]:
import sys
sys.path.append('/usr/local/lib/python2.7/site-packages')
import json
from serializers import Tweet
# import pandas as pd
from sklearn import metrics
from sklearn.feature_extraction import DictVectorizer
from sklearn.naive_bayes import MultinomialNB
import numpy as np
import csv
raw_routes_data = []
all_labels = []
# df = pd.read_csv("/Users/lorenamesa/Desktop/pytennessee/final_morning_training_data.csv", index_col = [0, 1])
# for indx, row in df.iterrows():
#     data = row.to_dict()
#     clean_d = { k.replace("'","").strip(): v for k, v in data.iteritems() }
#     all_labels.append(clean_d.pop('prediction'))
#     raw_routes_data.append(clean_d)
headers = ["utc_timestamp", "date_string", "prediction", "california_train", "weather", "feels_like", "tweet", "damen_train", "uber_surging", "uber_eta", "72_bus", "52_bus"]
with open("/Users/lorenamesa/Desktop/pytennessee/final_morning_training_data.csv", "r") as morning_data:
    reader = csv.reader(morning_data, dialect='excel')
    for row in reader:
        if row[0] == 'utc_timestamp':  # skip the header row
            continue
        raw_route_data = dict(zip(headers, row))
        all_labels.append(raw_route_data.pop('prediction'))
        raw_route_data.pop('date_string')
        raw_route_data.pop('utc_timestamp')
        raw_routes_data.append(raw_route_data)
# Apply vectorizer to training data
print len(raw_routes_data)
ten_percent = len(raw_routes_data) / 10
vec = DictVectorizer()
training_raw_data = raw_routes_data[ten_percent:]
training_label = all_labels[ten_percent:]
training_data = vec.fit_transform(training_raw_data).toarray()
# Train classifier
clf = MultinomialNB()
clf.fit(training_data, training_label)
# Predict
testing_label = all_labels[:ten_percent]
testing_raw_data = raw_routes_data[:ten_percent]
testing_data = vec.transform(testing_raw_data).toarray()
predictions = clf.predict(testing_data)
# Generate report for 90/10 split!
m = metrics.classification_report(testing_label, predictions)
print 'testing on 90/10 split'
print m
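A single 90/10 split is noisy at this data size; cross-validation gives a steadier estimate. A sketch, assuming the cell above has run (the module is sklearn.cross_validation on sklearn versions contemporary with this notebook, sklearn.model_selection on later ones):

from sklearn.cross_validation import cross_val_score

# Vectorize the full dataset and score with 5-fold cross-validation
# instead of the single 90/10 split above.
all_data = DictVectorizer().fit_transform(raw_routes_data).toarray()
scores = cross_val_score(MultinomialNB(), all_data, np.array(all_labels), cv=5)
print 'cv accuracy: %0.2f (+/- %0.2f)' % (scores.mean(), scores.std() * 2)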
In [12]:
import sys
sys.path.append('/usr/local/lib/python2.7/site-packages')
import json
from serializers import BusPrediction
import pylab
import pandas as pd
df = pd.read_csv("/Users/lorenamesa/Desktop/pytennessee/bus_data.csv", index_col = [0, 1])
# print type(df)
north_ave = [row.to_dict() for index, row in df.iterrows() if row[-1] == 'North Ave & California']
for indx, bus in enumerate(north_ave):
    clean_d = { k.replace("'","").strip(): v for k, v in bus.iteritems() }
    north_ave[indx] = BusPrediction(**clean_d)
morning_north_ave = [float(prediction.bus_eta) for prediction in north_ave if 6 < prediction.get_requested_time_hour() < 12]
pylab.figure()
data = pylab.hist(morning_north_ave, bins=6)
ranges = list(data[1][0:7])
categories = [] # ['1.0 to 4.5', '4.5 to 8.0', '8.0 to 11.5', '11.5 to 15.0', '15.0 to 18.5', '18.5 to 22.0']
num_categories = []
for num in xrange(len(ranges)-1):
    categories.append(str(ranges[num]) + " to " + str(ranges[num+1]))
for category in categories:
    print category
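The same labels can be built without the index arithmetic by zipping the edge list against itself shifted by one (an equivalent one-liner):

categories = ['%s to %s' % (lo, hi) for lo, hi in zip(ranges, ranges[1:])]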
In [16]:
import sys
sys.path.append('/usr/local/lib/python2.7/site-packages')
import json
# import pandas as pd
import csv
import pylab
class WorkCommutePrediction(object):
    def __init__(self, **kwargs):
        self.labeled_route = kwargs.get('prediction')
        self.utc_timestamp = kwargs.get('utc_timestamp')
        self.local_date_string = kwargs.get('date_string')
        self.weather = kwargs.get('weather')
        self.feels_like = kwargs.get('feels_like')
        self.cta_delayed = kwargs.get('tweet')
        self.north_ave_bus_eta = kwargs.get('72_bus')
        self.cali_bus_eta = kwargs.get('52_bus')
        self.damen_train_eta = kwargs.get('damen_train')
        self.cali_train_eta = kwargs.get('california_train')
        self.uberX_eta = kwargs.get('uber_eta')
        self.uberX_surging = kwargs.get('uber_surging')
# Same column headers as the training cell above, so this cell stands alone
headers = ["utc_timestamp", "date_string", "prediction", "california_train", "weather", "feels_like", "tweet", "damen_train", "uber_surging", "uber_eta", "72_bus", "52_bus"]
all_routes = {}
with open("/Users/lorenamesa/Desktop/pytennessee/final_morning_training_data.csv", "r") as morning_data:
    reader = csv.reader(morning_data, dialect='excel')
    for row in reader:
        if row[0] == 'utc_timestamp':  # skip the header row
            continue
        data = dict(zip(headers, row))
        work_commute_p = WorkCommutePrediction(**data)
        if all_routes.get(work_commute_p.labeled_route):
            all_routes[work_commute_p.labeled_route].append(work_commute_p)
        else:
            all_routes[work_commute_p.labeled_route] = [work_commute_p]
route_data = []
routes = []
north_bus_breakdown = {}
for route, data in all_routes.iteritems():
    north_bus_breakdown[route] = {}
    for item in data:
        if north_bus_breakdown[route].get(item.north_ave_bus_eta):
            north_bus_breakdown[route][item.north_ave_bus_eta] += 1
        else:
            north_bus_breakdown[route][item.north_ave_bus_eta] = 1
north_eta = ['1.0 to 6.0', '6.0 to 11.0', '11.0 to 16.0', '16.0 to 21.0', '21.0 to 26.0', '26.0 to 31.0']
pylab.figure()
pylab.title('Number of Rides Per Route By North Ave (#72) Bus ETA')
b1 = pylab.bar([1,2,3,4,5,6], [north_bus_breakdown['1'].get(item, 0) for item in north_eta], color='red', width=0.3)
b2 = pylab.bar([1,2,3,4,5,6], [north_bus_breakdown['2'].get(item, 0) for item in north_eta], color='orange', width=0.2)
b3 = pylab.bar([1,2,3,4,5,6], [north_bus_breakdown['3'].get(item, 0) for item in north_eta], color='blue', width=0.1)
pylab.legend([b1[0], b2[0], b3[0]], ['52 Bus -> Cali Train', '72 Bus -> Damen', 'UberX'])
pylab.xticks([1,2,3,4,5,6], north_eta)
pylab.xlabel('North Ave Bus ETA in Minutes')
pylab.ylabel('Number of Commutes')
pylab.show()
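One plotting caveat: the three bar series above share the same x positions and only differ in width, so narrower bars sit on top of wider ones. Offsetting each series by its width reads more like a conventional grouped bar chart (a sketch using the same data):

xs = [1, 2, 3, 4, 5, 6]
# Shift each route's bars so the three series sit side by side
b1 = pylab.bar([x - 0.2 for x in xs], [north_bus_breakdown['1'].get(item, 0) for item in north_eta], color='red', width=0.2)
b2 = pylab.bar(xs, [north_bus_breakdown['2'].get(item, 0) for item in north_eta], color='orange', width=0.2)
b3 = pylab.bar([x + 0.2 for x in xs], [north_bus_breakdown['3'].get(item, 0) for item in north_eta], color='blue', width=0.2)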