In [1]:
%matplotlib inline
In [2]:
import scipy
import numpy
import numpy as np  # the cells below use both spellings
import pandas
In [153]:
import matplotlib.pyplot as pyplt
In [154]:
import seaborn as sns
In [155]:
cd /Users/dondini/Udacity/lesson_2/ud919/lesson2/twitter/
In [156]:
tweetsDF = pandas.read_json("new_gruber_tweets.json")
In [157]:
createdDF = tweetsDF.loc[:, ["created_at"]]
createdTextDF = tweetsDF.loc[:, ["created_at", "text"]]
createdTextVals = createdTextDF.values
In [158]:
tweetTimes = []
for i, row in createdDF.iterrows():
    tweetTimes.append(row["created_at"])
tweetTimes.sort()
In [159]:
timeUntilNext = []
for i in range(1, len(tweetTimes)):
    # total_seconds() keeps gaps longer than a day intact (.seconds would not)
    timeDiff = (tweetTimes[i] - tweetTimes[i-1]).total_seconds()
    timeUntilNext.append(timeDiff)
In [160]:
timeToNextSeries = pandas.Series(timeUntilNext)
In [161]:
timeToNextSeries.hist(bins=30, density=True)
Out[161]:
In [162]:
from scipy.optimize import curve_fit
In [163]:
def fitFunc(t, b):
    # exponential density with rate b; its mean is 1/b
    return b * numpy.exp(-b * t)
In [164]:
count, division = np.histogram(timeUntilNext, bins=100, density=True)
In [165]:
# fit the density to the histogram heights at the bins' left edges
fitParams, fitCov = curve_fit(fitFunc, division[:-1], count, p0=1e-4)
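# Cross-check: for an exponential distribution the maximum-likelihood rate is
# 1/(sample mean), which should land near the rate curve_fit recovers.
print(1.0 / numpy.mean(timeUntilNext), fitParams[0])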
In [166]:
fitParams
Out[166]:
In [167]:
1/fitParams[0]  # mean wait (seconds) implied by the fitted rate
Out[167]:
In [168]:
fitCov
Out[168]:
In [169]:
t = division[:-1]
timeToNextSeries.hist(bins=50, density=True, color="blue")
pyplt.plot(t, fitFunc(t, fitParams[0]), color="yellow")
Out[169]:
In [170]:
exp_diffs = []
for t in timeUntilNext:
    exp_diffs.append(t - 1/fitParams[0])
In [171]:
pandas.Series(exp_diffs).hist(bins=50)
Out[171]:
In [172]:
pandas.Series(exp_diffs).describe()
Out[172]:
In [173]:
import math
exp_diffs = []
abs_diffs = []
for t in timeUntilNext:
    exp_diffs.append(t - 1/fitParams[0])
    abs_diffs.append(math.fabs(t - 1/fitParams[0]))
In [174]:
pandas.Series(abs_diffs).hist()
Out[174]:
In [175]:
pandas.Series(abs_diffs).describe()
Out[175]:
In [176]:
def fitFunc_gen(t, a, b, c):
    return a * b * numpy.exp(-b * t) + c
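# Compared with fitFunc, a rescales the exponential and c adds a constant
# floor; with a=1 and c=0 this reduces to the one-parameter model above.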
In [177]:
fitParams_gen, fitCov_gen = curve_fit(fitFunc_gen, division[:-1], count, p0=[0, 3e-4, 0])
In [178]:
fitParams_gen
Out[178]:
In [179]:
fitCov_gen
Out[179]:
In [180]:
# rough expected wait under the generalized fit: a*(1/b) plus the offset c
(1/fitParams_gen[1])*fitParams_gen[0] + fitParams_gen[2]
Out[180]:
In [181]:
t = division[:-1]
timeToNextSeries.hist(bins=50, density=True, color="blue")
pyplt.plot(t, fitFunc(t, fitParams[0]), color="yellow")
pyplt.plot(t, fitFunc_gen(t, fitParams_gen[0], fitParams_gen[1], fitParams_gen[2]), color="red")
Out[181]:
In [182]:
expected_gen = (1/fitParams_gen[1])*fitParams_gen[0] + fitParams_gen[2]
exp_gen_diffs = []
exp_gen_abs = []
for t in timeUntilNext:
    exp_gen_diffs.append(t - expected_gen)
    exp_gen_abs.append(math.fabs(t - expected_gen))
In [183]:
pandas.Series(exp_gen_diffs).describe()
Out[183]:
In [184]:
pandas.Series(exp_gen_abs).describe()
Out[184]:
In [185]:
cd /Users/dondini/Udacity/lesson_2/ud919/lesson2/twitter/
In [186]:
tweetsDF = pandas.read_json("new_gruber_tweets.json")
In [187]:
step_size = 10
data_points = []
for v in timeUntilNext:
    bin_left_edges = np.arange(0, v, step_size)
    for l_edge in bin_left_edges:
        tempNewPoint = [l_edge, v - l_edge]
        data_points.append(tempNewPoint)
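# Each waiting gap v is unrolled into points (time already waited, time still
# left). If tweet arrivals were truly memoryless, the distribution of the
# remaining wait would look the same at every left edge.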
In [188]:
data_points.sort()
In [189]:
deltat_100 = [v[1] for v in data_points if v[0]==100]
deltat_150 = [v[1] for v in data_points if v[0]==150]
deltat_10 = [v[1] for v in data_points if v[0]==10]
In [190]:
pandas.Series(deltat_10).hist(bins=30, alpha=0.5, color="blue")
#pandas.Series(deltat_100).hist(bins=30)
d_150 = pandas.Series(deltat_150)
d_150.hist(bins=30, alpha=0.3, color="red")
Out[190]:
In [191]:
deltatToStd = []
deltaToDist = []
for i in np.arange(0, 100, 10):
    tempDeltas = [v[1] for v in data_points if v[0] == i]
    tempStd = numpy.std(tempDeltas)
    deltatToStd.append([i, tempStd])
    deltaToDist.append([i, tempDeltas])
In [192]:
numpy.std(deltat_150)
Out[192]:
In [193]:
xVals = [v[0] for v in deltatToStd]
yVals = [v[1] for v in deltatToStd]
In [194]:
p1 = pyplt.plot(xVals, yVals, label="std dev (sec)")
pyplt.legend(loc=2, prop={'size':18})
Out[194]:
In [195]:
deltaToBounds = []
for v in deltaToDist:
    topBound = numpy.percentile(v[1], 95)
    bottomBound = numpy.percentile(v[1], 5)
    deltaToBounds.append([v[0], (topBound, bottomBound)])
In [196]:
p1 = pyplt.plot(xVals, [e[1][0] for e in deltaToBounds], color="red")
p2 = pyplt.plot(xVals, [e[1][1] for e in deltaToBounds], color="red")
pyplt.fill_between(xVals, [e[1][0] for e in deltaToBounds], [e[1][1] for e in deltaToBounds], alpha=0.4, color="orange")
Out[196]:
In [197]:
dataPoints_1 = []
x = np.arange(0, 100, 10)
for j in range(100):
    points = [(i, i*2 + 3 + numpy.random.normal(scale=50.0)) for i in x]
    dataPoints_1.extend(points)
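# Synthetic check: points scattered around the known line y = 2x + 3 with
# Gaussian noise (sd 50), so the 5th-95th percentile band can be compared
# against a true model we actually know.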
In [198]:
pointToVals = []
pointToBounds = []
for i in np.arange(0, 100, 10):
    # keep only the y-values at this x so the percentiles are over y alone
    valsForDataPoint = [v[1] for v in dataPoints_1 if v[0] == i]
    pointToVals.append(valsForDataPoint)
    upperBound = numpy.percentile(valsForDataPoint, 95)
    lowerBound = numpy.percentile(valsForDataPoint, 5)
    pointToBounds.append([i, (upperBound, lowerBound)])
In [199]:
pyplt.plot(x, [v[1][0] for v in pointToBounds])
pyplt.plot(x, [v[1][1] for v in pointToBounds])
pyplt.plot(x, [i*2 + 3 for i in x], color="red", label="true model")
pyplt.fill_between(x, [v[1][0] for v in pointToBounds], [v[1][1] for v in pointToBounds], color="orange", alpha=0.4)
pyplt.legend(loc=2, prop={'size': 18})
Out[199]:
In [200]:
dataPoints_2 = []
x = np.arange(0, 100, 10)
y = np.arange(50, 150, 10)
for j in range(100):
    yVal = np.random.choice(y)
    points = [(i, i*2 + yVal*2 + 3 + numpy.random.normal(scale=50.0)) for i in x]
    dataPoints_2.extend(points)
In [201]:
avgYAtX = {}
for i in x:
    lineVals = [(i*2 + yVal*2 + 3) for yVal in np.arange(50, 150, 10)]
    avgY = sum(lineVals) / len(lineVals)
    avgYAtX[i] = avgY
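# avgYAtX marginalizes out the hidden yVal: it is the mean over the family of
# lines y = 2x + 2*yVal + 3 for yVal in 50..140, i.e. the best single-line
# estimate when the latent yVal is unknown.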
In [202]:
pointToVals = []
pointToBounds = []
for i in np.arange(0, 100, 10):
    # y-values only, as above, so the percentiles are not polluted by the x's
    valsForDataPoint = [v[1] for v in dataPoints_2 if v[0] == i]
    pointToVals.append(valsForDataPoint)
    upperBound = numpy.percentile(valsForDataPoint, 95)
    lowerBound = numpy.percentile(valsForDataPoint, 5)
    pointToBounds.append([i, (upperBound, lowerBound)])
In [203]:
pyplt.plot(x, [v[1][0] for v in pointToBounds])
pyplt.plot(x, [v[1][1] for v in pointToBounds])
pyplt.plot(x, [avgYAtX[i] for i in x], color="red", label="true model (best estimate)")
pyplt.fill_between(x, [v[1][0] for v in pointToBounds], [v[1][1] for v in pointToBounds], color="orange", alpha=0.4)
pyplt.legend(loc=2, prop={'size':18})
Out[203]:
In [204]:
import twitter_tools
from twitter_tools import *
In [205]:
tweetFile = open("/Users/dondini/Udacity/lesson_2/ud919/lesson2/twitter/new_gruber_tweets.json")
In [206]:
import json
jsonFile = json.load(tweetFile)
In [207]:
tweetFile.close()
In [208]:
gruberTweetsDF = pandas.read_json("/Users/dondini/Udacity/lesson_2/ud919/lesson2/twitter/new_gruber_tweets.json")
siracusaTweetsDF = pandas.read_json("/Users/dondini/Udacity/lesson_2/ud919/lesson2/twitter/new_siracusa_tweets.json")
armentTweetsDF = pandas.read_json("/Users/dondini/Udacity/lesson_2/ud919/lesson2/twitter/new_arment_tweets.json")
In [209]:
gruberTimeDiffs = []
gruberTweetTimes = []
gruberTimeToDiff = {}
gruberTimeToText = {}
siracusaMentionTimes = []
armentMentionTimes = []
In [210]:
gruberCreatedDF = gruberTweetsDF.loc[:, ["created_at"]]
gruberCreatedTextDF = gruberTweetsDF.loc[:, ["created_at", "text"]]
createdTextVals = gruberCreatedTextDF.values
for i, row in gruberCreatedDF.iterrows():
    gruberTweetTimes.append(row["created_at"])
In [211]:
gruberTweetTimes.sort()
In [212]:
for i in range(1, len(gruberTweetTimes)):
    timeDiff = (gruberTweetTimes[i] - gruberTweetTimes[i-1]).total_seconds()
    gruberTimeDiffs.append(timeDiff)
    gruberTimeToDiff[gruberTweetTimes[i]] = timeDiff
    gruberTimeToText[gruberTweetTimes[i]] = gruberCreatedTextDF[gruberCreatedTextDF["created_at"] == gruberTweetTimes[i]]
In [213]:
nearestMentionToTimeDiff = []
tweetIndexToNearestMention = {}
In [214]:
def findTweetFollowingTime(timeStamp, tweetTimes):
    # earliest tweet time strictly after timeStamp, or None
    returnTweetTime = None
    for t in tweetTimes:
        if t > timeStamp:
            returnTweetTime = t
            break
    return returnTweetTime

def findTweetPrecedingTime(timeStamp, tweetTimes):
    # latest tweet time strictly before timeStamp, or None
    returnTweetTime = None
    i = len(tweetTimes) - 1
    while i >= 0:
        t = tweetTimes[i]
        if t < timeStamp:
            returnTweetTime = t
            break
        i -= 1
    return returnTweetTime
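# Both lists are sorted, so the same lookups can be done in O(log n) with the
# standard bisect module; a sketch with the same semantics:
import bisect

def findTweetFollowingTimeFast(timeStamp, tweetTimes):
    i = bisect.bisect_right(tweetTimes, timeStamp)  # first element > timeStamp
    return tweetTimes[i] if i < len(tweetTimes) else None

def findTweetPrecedingTimeFast(timeStamp, tweetTimes):
    i = bisect.bisect_left(tweetTimes, timeStamp)   # first element >= timeStamp
    return tweetTimes[i - 1] if i > 0 else None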
In [215]:
siracusaTimeOfGruberMentions = []
armentTimeOfGruberMentions = []
In [216]:
for i, row in armentTweetsDF.iterrows():
    if "user_mentions" in row:
        if type(row["user_mentions"]) == list:
            if len([e for e in row["user_mentions"] if e["screen_name"] == "gruber"]) > 0:
                armentTimeOfGruberMentions.append(row["created_at"])
In [217]:
for i, row in siracusaTweetsDF.iterrows():
    if "user_mentions" in row:
        if type(row["user_mentions"]) == list:
            if len([e for e in row["user_mentions"] if e["screen_name"] == "gruber"]) > 0:
                siracusaTimeOfGruberMentions.append(row["created_at"])
In [218]:
gruberTweetTimes.sort()
siracusaTimeOfGruberMentions.sort()
armentTimeOfGruberMentions.sort()
for i in range(len(gruberTweetTimes)):
    t = gruberTweetTimes[i]
    t_next = None
    if i+1 < len(gruberTweetTimes):
        t_next = gruberTweetTimes[i+1]
    t_s = findTweetFollowingTime(t, siracusaTimeOfGruberMentions)
    t_s_prev = findTweetPrecedingTime(t, siracusaTimeOfGruberMentions)
    t_a = findTweetFollowingTime(t, armentTimeOfGruberMentions)
    t_a_prev = findTweetPrecedingTime(t, armentTimeOfGruberMentions)
    sDiff = None
    aDiff = None
    if t_s_prev is not None and t_s is not None:
        sDiff = math.fabs((t_s - t).total_seconds())
        if sDiff > math.fabs((t - t_s_prev).total_seconds()):
            sDiff = math.fabs((t - t_s_prev).total_seconds())
    if t_a_prev is not None and t_a is not None:
        aDiff = math.fabs((t_a - t).total_seconds())
        if aDiff > math.fabs((t - t_a_prev).total_seconds()):
            aDiff = math.fabs((t - t_a_prev).total_seconds())
    # closest mention by either author, if any
    closestMention = None
    if sDiff is not None:
        closestMention = sDiff
    elif aDiff is not None:
        closestMention = aDiff
    if aDiff is not None and sDiff is not None:
        closestMention = min(aDiff, sDiff)
    # guard t_next as well: the last tweet has no "time to next" to record
    if closestMention is not None and t_next is not None:
        nearestMentionToTimeDiff.append((closestMention, (t_next - t).total_seconds()))
        tweetIndexToNearestMention[i] = closestMention
In [219]:
features_list = extract_features(jsonFile)
In [220]:
featuresWithLabel = []
for i in range(len(gruberTimeDiffs)):
    timeDiff = gruberTimeDiffs[i]
    if timeDiff < 4000:
        label = "short"
    else:
        label = "long"
    featuresForTweet = features_list[i]
    nearestMention = 0
    if i in tweetIndexToNearestMention:
        nearestMention = tweetIndexToNearestMention[i]
    completeItem = []
    completeItem.append(label)
    completeItem.extend(list(featuresForTweet))
    completeItem.append(nearestMention)
    featuresWithLabel.append(completeItem)
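# Each labeled row is [label, features..., nearestMention]; "short" means the
# next tweet arrived within 4000 seconds (about 67 minutes) of this one.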
In [221]:
cd /Users/dondini/Udacity/
In [222]:
from info_gain import *
In [223]:
valsY = ["short", "long"]
binsY = None
In [224]:
joint_list = [(v[1], v[0]) for v in featuresWithLabel]
valsX = ["morning", "afternoon", "evening", "night"]
binsX = None
In [225]:
jpTable = compute_joint_prob(joint_list, valsX, valsY, None, None)
In [226]:
entropy_loss(jpTable, valsX, valsY)
Out[226]:
In [227]:
joint_list = [(v[2], v[0]) for v in featuresWithLabel]
valsX = None
binsX = [[0, .9], [1.0, 100]]
In [228]:
jpTable = compute_joint_prob(joint_list, valsX, valsY, bins1=binsX)
In [229]:
entropy_loss(jpTable, [0.0, 1.0], valsY)
Out[229]:
In [230]:
joint_list = [(v[3], v[0]) for v in featuresWithLabel]
valsX = None
binsX = [[0, .9], [1.0, 100]]
In [231]:
jpTable = compute_joint_prob(joint_list, valsX, valsY, bins1=binsX)
In [232]:
entropy_loss(jpTable, [0.0, 1.0], valsY)
Out[232]:
In [233]:
joint_list = [(v[4], v[0]) for v in featuresWithLabel]
valsX = None
binsX = [[0, 14], [14, 28], [28, 42], [42, 56], [56, 70], [70, 84], [84, 98], [98, 112], [112, 126]]
In [234]:
jpTable = compute_joint_prob(joint_list, valsX, valsY, bins1=binsX)
In [235]:
entropy_loss(jpTable, [v[4] for v in featuresWithLabel], valsY)
Out[235]:
In [236]:
joint_list = [(v[5], v[0]) for v in featuresWithLabel]
valsX = None
binsX = [[0, .9], [1.0, 100]]
In [237]:
jpTable = compute_joint_prob(joint_list, valsX, valsY, bins1=binsX)
In [238]:
entropy_loss(jpTable, [0.0, 1.0], valsY)
Out[238]:
In [239]:
joint_list = [(v[6], v[0]) for v in featuresWithLabel]
valsX = None
binsX = [[0, .9], [1.0, 100]]
In [240]:
jpTable = compute_joint_prob(joint_list, valsX, valsY, bins1=binsX)
In [241]:
entropy_loss(jpTable, [0.0, 1.0], valsY)
Out[241]:
In [242]:
joint_list = [(v[7], v[0]) for v in featuresWithLabel]
valsX = None
binsX = [[0, 1000], [1000, 5000], [5000, 8000], [8000, 10000], [10000, 20000], [20000, 30000], [30000, 60000], [60000, 80000], [80000, 100000], [100000, 800000]]
In [243]:
jpTable = compute_joint_prob(joint_list, valsX, valsY, bins1=binsX)
In [244]:
entropy_loss(jpTable, [0.0, 1.0], valsY)
Out[244]:
In [245]:
step_size = 10
knn_data_points = []
tweet_index = 0
for v in timeUntilNext:
    bin_left_edges = np.arange(0, v, step_size)
    features_for_tweet = features_list[tweet_index]
    if tweet_index in tweetIndexToNearestMention:
        for l_edge in bin_left_edges:
            newDeltaT = l_edge
            mentionDist = tweetIndexToNearestMention[tweet_index] + newDeltaT
            textLength = features_for_tweet[3]
            label = v - l_edge
            newPoint = [newDeltaT, mentionDist, textLength, label]
            knn_data_points.append(newPoint)
    tweet_index += 1
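# Each KNN point is [time already waited, distance to the nearest mention at
# that moment, tweet text length], with the remaining wait as the regression
# label.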
In [246]:
import sklearn
from sklearn.neighbors import KNeighborsRegressor
In [247]:
knn = KNeighborsRegressor(5)
In [248]:
knn_3 = KNeighborsRegressor(3)
In [249]:
len(knn_data_points)
Out[249]:
In [250]:
.70*len(knn_data_points)
Out[250]:
In [251]:
.30*len(knn_data_points)
Out[251]:
In [252]:
import random
In [253]:
trainingPoints = [random.choice(knn_data_points) for i in range(int(0.70 * len(knn_data_points)))]
In [254]:
trainingX = [(v[0], v[1], v[2]) for v in trainingPoints]
In [255]:
trainingY = [v[3] for v in trainingPoints]
In [256]:
m_3 = knn_3.fit(trainingX, trainingY)
In [257]:
m = knn.fit(trainingX, trainingY)
In [258]:
y_ = m.predict(trainingX)
In [259]:
y = m_3.predict(trainingX)
In [260]:
train_diffs = []
for i in range(len(trainingY)):
    train_diffs.append(trainingY[i] - y_[i])
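# These are residuals on the training sample itself, so they understate the
# true error; the held-out comparison follows below.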
In [261]:
pandas.Series([math.fabs(x) for x in train_diffs]).describe()
Out[261]:
In [262]:
pandas.Series(train_diffs).hist(bins=50)
Out[262]:
In [263]:
testPoints = [random.choice(knn_data_points) for i in range(int(0.30 * len(knn_data_points)))]
In [264]:
testX = [(v[0], v[1], v[2]) for v in testPoints]
In [265]:
testY = [v[3] for v in testPoints]
In [266]:
test_pred = m_3.predict(testX)
In [267]:
test_diffs = []
for i in range(len(testY)):
    test_diffs.append(math.fabs(testY[i] - test_pred[i]))
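# Caveat: both samples were drawn from knn_data_points with replacement, so
# train and test almost certainly overlap; treat these errors as optimistic.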
In [268]:
pandas.Series(test_diffs).describe()
Out[268]:
In [269]:
testSeries = pandas.Series(test_diffs)
In [270]:
testSeries.hist(bins=50)
Out[270]: