In [11]:
import sys
sys.path.append('/usr/local/lib/python2.7/site-packages')
import json
from pprint import pprint
from cta_api import CTABustracker, CTATraintracker
from serializers import MyEncoder
bus_tracker = CTABustracker()
north_predictions = bus_tracker.get_predictions_for_stops(route_id=72, stp_id="890,944") # North Ave Bus Predictions
cali_predictions = bus_tracker.get_predictions_for_stops(route_id=52, stp_id="3183,3006") # California Bus Predictions
for prediction in north_predictions:
    print json.dumps(prediction, cls=MyEncoder, indent=4)
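The serializers.MyEncoder class isn't shown in this notebook. Since it's passed as cls to json.dumps, it is presumably a json.JSONEncoder subclass along these lines (a sketch under that assumption, not the actual implementation):

import datetime
import json

class MyEncoder(json.JSONEncoder):
    """Hypothetical stand-in for serializers.MyEncoder."""
    def default(self, obj):
        # Render datetimes as ISO 8601 strings...
        if isinstance(obj, datetime.datetime):
            return obj.isoformat()
        # ...and fall back to the attribute dict for plain
        # serializer objects like BusPrediction.
        if hasattr(obj, '__dict__'):
            return obj.__dict__
        return json.JSONEncoder.default(self, obj)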
In [2]:
import sys
sys.path.append('/usr/local/lib/python2.7/site-packages')
import json
from pprint import pprint
from cta_api import CTABustracker, CTATraintracker
from serializers import MyEncoder
bus_tracker = CTABustracker()
north_predictions = bus_tracker.get_raw_predictions_for_stops(route_id=72, stp_id="890,944") # North Ave Bus Predictions
cali_predictions = bus_tracker.get_raw_predictions_for_stops(route_id=52, stp_id="3183,3006") # California Bus Predictions
for prediction in north_predictions:
    print json.dumps(prediction, cls=MyEncoder, indent=4)
In [3]:
import sys
sys.path.append('/usr/local/lib/python2.7/site-packages')
import json
from serializers import BusPrediction
import pylab
import pandas as pd
# from cta_api import CTABustracker
# Can live query for data:
# bus_tracker = CTABustracker()
# north_predictions = bus_tracker.get_predictions_for_stops(route_id=72, stp_id="890") # North Ave Bus Predictions
# north_ave_etas = [bus_eta for bus_eta in north_predictions if 6 < bus_eta.get_requested_time_hour() < 12 and bus_eta.stop_name == 'North Ave & California']
# north_ave_etas = [float(bus.bus_eta) for bus in north_ave_etas]
df = pd.read_csv("/Users/lorenamesa/Desktop/pytennessee/bus_data.csv", index_col = [0, 1])
# print type(df)
north_ave = [row.to_dict() for index, row in df.iterrows() if row[-1] == 'North Ave & California']
for indx, bus in enumerate(north_ave):
    clean_d = { k.replace("'","").strip(): v for k, v in bus.iteritems() }
    north_ave[indx] = BusPrediction(**clean_d)
morning_north_ave = [float(prediction.bus_eta) for prediction in north_ave if 6 < prediction.get_requested_time_hour() < 12]
pylab.figure()
data = pylab.hist(morning_north_ave, bins=6)
pylab.title("North (#72) Bus ETAs From Home to Work")
pylab.xlabel("Minutes Until Pickup")
pylab.ylabel("Number of Buses")
pylab.show()
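serializers.BusPrediction also isn't shown here. From the way it's used above (keyword construction from a cleaned CSV row, plus bus_eta, stop_name, and get_requested_time_hour()), it plausibly looks something like this sketch; any field beyond those three is a guess:

class BusPrediction(object):
    """Hypothetical sketch of serializers.BusPrediction; only bus_eta,
    stop_name, and get_requested_time_hour() are used in this notebook."""
    def __init__(self, **kwargs):
        self.bus_eta = kwargs.get('bus_eta')      # minutes until arrival
        self.stop_name = kwargs.get('stop_name')
        # 'requested_time' is an assumed column name for the query time
        self.requested_time = kwargs.get('requested_time')

    def get_requested_time_hour(self):
        # Assumes an 'HH:MM' style local time string
        return int(self.requested_time.split(':')[0])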
In [15]:
import sys
sys.path.append('/usr/local/lib/python2.7/site-packages')
import json
from serializers import BusPrediction
import pylab
import pandas as pd
df = pd.read_csv("/Users/lorenamesa/Desktop/pytennessee/bus_non-labeled_data.csv", index_col = [0, 1])
# 461 Eastbound #72 records
print df[(df['delayed'] == 1) & (df['stop_name'] == 'North Ave & California')]
print df[(df['delayed'] == 0) & (df['stop_name'] == 'North Ave & California')]
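Those two frames can be collapsed into a single delay rate for the stop (a one-liner sketch, assuming delayed is coded 0/1):

north = df[df['stop_name'] == 'North Ave & California']
print 'delay rate: %.1f%%' % (100.0 * north['delayed'].mean())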
In [22]:
import sys
sys.path.append('/usr/local/lib/python2.7/site-packages')
import json
from serializers import Tweet
import pylab
import pandas as pd
import re
import operator
df = pd.read_csv("/Users/lorenamesa/Desktop/pytennessee/cta_tweet_data.csv", index_col = [0, 1])
all_tweets = {}
all_ids = set([])
def ngrams(text, n):
    tokens = text.split(' ')
    output = []
    for i in range(len(tokens) - n + 1):
        output.append(tokens[i:i+n])
    return output
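# e.g. (a quick sanity check, not part of the original data flow):
# ngrams('red line trains are delayed', 2)
# => [['red', 'line'], ['line', 'trains'], ['trains', 'are'], ['are', 'delayed']]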
# print df
for indx, row in df.iterrows():
    if pd.notnull(row[-1]) and pd.notnull(row[0]):
        row_dict = row.to_dict()
        clean_d = { k.replace("'","").strip(): v for k, v in row_dict.iteritems() }
        tweet = Tweet(**clean_d)
        # Keyed by tweet_id, so a duplicated id is only stored once
        all_tweets[tweet.tweet_id] = tweet
print all_tweets.keys()
# all_tweet_bigrams = [ngrams(re.sub(r'[^\w\s]', '', data.text), 2) for tweet_id, data in all_tweets.iteritems()]
# all_tweet_bigrams = sum(all_tweet_bigrams, [])
# bigrams_freq = {}
# for bigram in all_tweet_bigrams:
#     joined_bigram = ' '.join(bigram)
#     if joined_bigram not in bigrams_freq:
#         bigrams_freq[joined_bigram] = 1
#     else:
#         bigrams_freq[joined_bigram] += 1
# sorted_bigrams_freq = sorted(bigrams_freq.items(), key=operator.itemgetter(1), reverse=True)
# # print sorted_bigrams_freq[:10] # [('trains are', 112), ('Line trains', 100), ('are operating', 70), ('residual delays', 67), ('delays after', 67), ('operating with', 67), ('with residual', 65), ('due to', 60), ('buses are', 50), ('rerouted via', 50)]
# labels = [bigram[0] for bigram in sorted_bigrams_freq[:10]]
# data = [bigram[1] for bigram in sorted_bigrams_freq[:10]]
# pylab.title('Top 10 Bigrams in CTA Tweet Data January 2016')
# pylab.pie(data, labels=labels, autopct='%1.1f%%', shadow=True, startangle=90)
# pylab.show()
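The commented-out frequency loop above can be collapsed with collections.Counter; a sketch of the same top-10 computation, assuming all_tweet_bigrams has been built as in the commented lines:

from collections import Counter

bigram_counts = Counter(' '.join(bigram) for bigram in all_tweet_bigrams)
print bigram_counts.most_common(10)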
In [10]:
import sys
sys.path.append('/usr/local/lib/python2.7/site-packages')
import json
from serializers import Tweet
# import pandas as pd
from sklearn import metrics
from sklearn.feature_extraction import DictVectorizer
from sklearn.naive_bayes import MultinomialNB
import numpy as np
import csv
raw_routes_data = []
all_labels = []
# df = pd.read_csv("/Users/lorenamesa/Desktop/pytennessee/final_morning_training_data.csv", index_col = [0, 1])
# for indx, row in df.iterrows():
#     data = row.to_dict()
#     clean_d = { k.replace("'","").strip(): v for k, v in data.iteritems() }
#     all_labels.append(clean_d.pop('prediction'))
#     raw_routes_data.append(clean_d)
headers = ["utc_timestamp", "date_string", "prediction", "california_train", "weather", "feels_like", "tweet", "damen_train", "uber_surging", "uber_eta", "72_bus", "52_bus"]
with open("/Users/lorenamesa/Desktop/pytennessee/final_morning_training_data.csv", "r") as morning_data:
    reader = csv.reader(morning_data, dialect='excel')
    for row in reader:
        if row[0] == 'utc_timestamp':  # skip the header row
            continue
        raw_route_data = dict(zip(headers, row))
        all_labels.append(raw_route_data.pop('prediction'))
        raw_route_data.pop('date_string')
        raw_route_data.pop('utc_timestamp')
        raw_routes_data.append(raw_route_data)
# Apply vectorizer to training data
print len(raw_routes_data)
ten_percent = len(raw_routes_data) / 10
vec = DictVectorizer()
training_raw_data = raw_routes_data[ten_percent:]
training_label = all_labels[ten_percent:]
training_data = vec.fit_transform(training_raw_data).toarray()
# Train classifier
clf = MultinomialNB()
clf.fit(training_data, training_label)
# Predict
testing_label = all_labels[:ten_percent]
testing_raw_data = raw_routes_data[:ten_percent]
testing_data = vec.transform(testing_raw_data).toarray()
predictions = clf.predict(testing_data)
# Generate report for 90/10 split!
m = metrics.classification_report(testing_label, predictions)
print 'testing on 90/10 split'
print m
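A single 90/10 split is noisy at this data size; cross-validation gives a steadier estimate. A sketch, assuming the cell above has run (the module is sklearn.cross_validation on sklearn versions contemporary with this notebook, sklearn.model_selection on later ones):

from sklearn.cross_validation import cross_val_score

# Vectorize the full dataset and score with 5-fold cross-validation
# instead of the single 90/10 split above.
all_data = DictVectorizer().fit_transform(raw_routes_data).toarray()
scores = cross_val_score(MultinomialNB(), all_data, np.array(all_labels), cv=5)
print 'cv accuracy: %0.2f (+/- %0.2f)' % (scores.mean(), scores.std() * 2)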
In [12]:
import sys
sys.path.append('/usr/local/lib/python2.7/site-packages')
import json
from serializers import BusPrediction
import pylab
import pandas as pd
df = pd.read_csv("/Users/lorenamesa/Desktop/pytennessee/bus_data.csv", index_col = [0, 1])
# print type(df)
north_ave = [row.to_dict() for index, row in df.iterrows() if row[-1] == 'North Ave & California']
for indx, bus in enumerate(north_ave):
    clean_d = { k.replace("'","").strip(): v for k, v in bus.iteritems() }
    north_ave[indx] = BusPrediction(**clean_d)
morning_north_ave = [float(prediction.bus_eta) for prediction in north_ave if 6 < prediction.get_requested_time_hour() < 12]
pylab.figure()
data = pylab.hist(morning_north_ave, bins=6)
ranges = list(data[1][0:7])
categories = [] # ['1.0 to 4.5', '4.5 to 8.0', '8.0 to 11.5', '11.5 to 15.0', '15.0 to 18.5', '18.5 to 22.0']
num_categories = []
for num in xrange(len(ranges)-1):
    categories.append(str(ranges[num]) + " to " + str(ranges[num+1]))
for category in categories:
    print category
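The same labels can be built without the index arithmetic by zipping the edge list against itself shifted by one (an equivalent one-liner):

categories = ['%s to %s' % (lo, hi) for lo, hi in zip(ranges, ranges[1:])]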
In [16]:
import sys
sys.path.append('/usr/local/lib/python2.7/site-packages')
import json
# import pandas as pd
import csv
import pylab
class WorkCommutePrediction(object):
    def __init__(self, **kwargs):
        self.labeled_route = kwargs.get('prediction')
        self.utc_timestamp = kwargs.get('utc_timestamp')
        self.local_date_string = kwargs.get('date_string')
        self.weather = kwargs.get('weather')
        self.feels_like = kwargs.get('feels_like')
        self.cta_delayed = kwargs.get('tweet')
        self.north_ave_bus_eta = kwargs.get('72_bus')
        self.cali_bus_eta = kwargs.get('52_bus')
        self.damen_train_eta = kwargs.get('damen_train')
        self.cali_train_eta = kwargs.get('california_train')
        self.uberX_eta = kwargs.get('uber_eta')
        self.uberX_surging = kwargs.get('uber_surging')
# Same column headers as the training cell above, so this cell stands alone
headers = ["utc_timestamp", "date_string", "prediction", "california_train", "weather", "feels_like", "tweet", "damen_train", "uber_surging", "uber_eta", "72_bus", "52_bus"]
all_routes = {}
with open("/Users/lorenamesa/Desktop/pytennessee/final_morning_training_data.csv", "r") as morning_data:
    reader = csv.reader(morning_data, dialect='excel')
    for row in reader:
        if row[0] == 'utc_timestamp':  # skip the header row
            continue
        data = dict(zip(headers, row))
        work_commute_p = WorkCommutePrediction(**data)
        if all_routes.get(work_commute_p.labeled_route):
            all_routes[work_commute_p.labeled_route].append(work_commute_p)
        else:
            all_routes[work_commute_p.labeled_route] = [work_commute_p]
route_data = []
routes = []
north_bus_breakdown = {}
for route, data in all_routes.iteritems():
    north_bus_breakdown[route] = {}
    for item in data:
        if north_bus_breakdown[route].get(item.north_ave_bus_eta):
            north_bus_breakdown[route][item.north_ave_bus_eta] += 1
        else:
            north_bus_breakdown[route][item.north_ave_bus_eta] = 1
north_eta = ['1.0 to 6.0', '6.0 to 11.0', '11.0 to 16.0', '16.0 to 21.0', '21.0 to 26.0', '26.0 to 31.0']
pylab.figure()
pylab.title('Number of Rides Per Route By North Ave (#72) Bus ETA')
b1 = pylab.bar([1,2,3,4,5,6], [north_bus_breakdown['1'].get(item, 0) for item in north_eta], color='red', width=0.3)
b2 = pylab.bar([1,2,3,4,5,6], [north_bus_breakdown['2'].get(item, 0) for item in north_eta], color='orange', width=0.2)
b3 = pylab.bar([1,2,3,4,5,6], [north_bus_breakdown['3'].get(item, 0) for item in north_eta], color='blue', width=0.1)
pylab.legend([b1[0], b2[0], b3[0]], ['52 Bus -> Cali Train', '72 Bus -> Damen', 'UberX'])
pylab.xticks([1,2,3,4,5,6], north_eta)
pylab.xlabel('North Ave Bus ETA in Minutes')
pylab.ylabel('Number of Commutes')
pylab.show()
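One plotting caveat: the three bar series above share the same x positions and only differ in width, so narrower bars sit on top of wider ones. Offsetting each series by its width reads more like a conventional grouped bar chart (a sketch using the same data):

xs = [1, 2, 3, 4, 5, 6]
# Shift each route's bars so the three series sit side by side
b1 = pylab.bar([x - 0.2 for x in xs], [north_bus_breakdown['1'].get(item, 0) for item in north_eta], color='red', width=0.2)
b2 = pylab.bar(xs, [north_bus_breakdown['2'].get(item, 0) for item in north_eta], color='orange', width=0.2)
b3 = pylab.bar([x + 0.2 for x in xs], [north_bus_breakdown['3'].get(item, 0) for item in north_eta], color='blue', width=0.2)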