In [1]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [2]:
import scipy
import numpy
import pandas

In [153]:
import matplotlib.pyplot as pyplt

In [154]:
import seaborn as sns
Navigate to the data directory

In [155]:
cd /Users/dondini/Udacity/lesson_2/ud919/lesson2/twitter/


/Users/dondini/Udacity/lesson_2/ud919/lesson2/twitter

Form data points


In [156]:
tweetsDF = pandas.io.json.read_json("new_gruber_tweets.json")

In [157]:
createdDF = tweetsDF.ix[0:, ["created_at"]]
createdTextDF = tweetsDF.ix[0:, ["created_at", "text"]]
createdTextVals = createdTextDF.values

In [158]:
tweetTimes = []
for i,row in createdDF.iterrows():
    tweetTimes.append(row["created_at"])
tweetTimes.sort()

In [159]:
timeUntilNext = []
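# gaps (in seconds) between consecutive tweets; note that timedelta.seconds ignores any whole-day part of a gap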
for i in xrange(1, len(tweetTimes)-1):
    timeDiff = (tweetTimes[i]-tweetTimes[i-1]).seconds
    timeUntilNext.append(timeDiff)

Create initial histogram


In [160]:
timeToNextSeries = pandas.Series(timeUntilNext)

In [161]:
timeToNextSeries.hist(bins=30, normed=True)


Out[161]:
<matplotlib.axes.AxesSubplot at 0x10e9118d0>

Fit an exponential


In [162]:
from scipy.optimize import curve_fit

In [163]:
def fitFunc(t, b):
    return b*numpy.exp(-b*t)
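
The fit function above is the exponential probability density f(t) = b*exp(-b*t), whose mean is 1/b. The value 1/fitParams[0] computed below can therefore be read as the fitted mean time between tweets, in seconds.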

In [164]:
count,division = np.histogram(timeUntilNext, bins=100, normed=True)

In [165]:
fitParams, fitCov = curve_fit(fitFunc, division[0:len(division)-1], count, p0=1e-4)

In [166]:
fitParams


Out[166]:
array([ 0.00068895])

In [167]:
1/fitParams[0]


Out[167]:
1451.4883194175982

In [168]:
fitCov


Out[168]:
array([[  1.04040926e-09]])

Evaluate exponential


In [169]:
t = division[0:len(division)-1]
timeToNextSeries.hist(bins=50, normed=True, color="blue")
pyplt.plot(t, fitFunc(t, fitParams[0]), color="yellow")


Out[169]:
[<matplotlib.lines.Line2D at 0x141c15c90>]

In [170]:
exp_diffs = []
for t in timeUntilNext:
    exp_diffs.append(t-1/fitParams[0])

In [171]:
pandas.Series(exp_diffs).hist(bins=50)


Out[171]:
<matplotlib.axes.AxesSubplot at 0x10e826b90>

In [172]:
pandas.Series(exp_diffs).describe()


Out[172]:
count     3232.000000
mean      2843.046025
std      11064.424336
min      -1450.488319
25%      -1368.488319
50%      -1115.488319
75%       1032.261681
max      83356.511681
dtype: float64

Evaluate absolute differences from the fitted mean


In [173]:
import math
exp_diffs = []
abs_diffs = []
for t in timeUntilNext:
    exp_diffs.append(t-1/fitParams[0])
    abs_diffs.append(math.fabs(t-1/fitParams[0]))

In [174]:
pandas.Series(abs_diffs).hist()


Out[174]:
<matplotlib.axes.AxesSubplot at 0x10d303950>

In [175]:
pandas.Series(abs_diffs).describe()


Out[175]:
count     3232.000000
mean      4446.834434
std      10522.663236
min          0.488319
25%       1110.505840
50%       1363.488319
75%       1437.488319
max      83356.511681
dtype: float64

Observe the effect of adding an offset

Note that much of the histogram of differences lies *below* zero. Perhaps by adding an offset, we can improve the fit.

Fit more generalized exponential


In [176]:
def fitFunc_gen(t, a, b, c):
    return a*(b)*numpy.exp(-b*t)+c
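
In this more general model, a scales the amplitude of the exponential term, b is the decay rate, and c is a constant offset that allows the density to settle at a nonzero floor for large t.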

In [177]:
fitParams_gen, fitCov_gen = curve_fit(fitFunc_gen, division[0:len(division)-1], count, p0=[0, 3e-4, 0])

In [178]:
fitParams_gen


Out[178]:
array([  3.34149579e-01,   2.17218731e-03,   3.18496895e-06])

In [179]:
fitCov_gen


Out[179]:
array([[  1.10186661e-04,  -6.83152391e-07,  -1.07612154e-09],
       [ -6.83152391e-07,   4.65228418e-09,   5.59703963e-12],
       [ -1.07612154e-09,   5.59703963e-12,   4.82402257e-13]])

In [180]:
(1/fitParams_gen[1])*fitParams_gen[0]+fitParams_gen[1]


Out[180]:
153.83309518722578

Evaluate adjusted exponential


In [181]:
t = division[0:len(division)-1]
timeToNextSeries.hist(bins=50, normed=True, color="blue")
pyplt.plot(t, fitFunc(t, fitParams[0]), color="yellow")
pyplt.plot(t, fitFunc_gen(t, fitParams_gen[0], fitParams_gen[1], fitParams_gen[2]), color="red")


Out[181]:
[<matplotlib.lines.Line2D at 0x12606ba10>]

In [182]:
exp_gen_diffs = []
exp_gen_abs = []
for t in timeUntilNext:
    exp_gen_diffs.append((t-1/fitParams_gen[1])*fitParams_gen[0]+fitParams_gen[1])
    exp_gen_abs.append(math.fabs((t-1/fitParams_gen[1])*fitParams_gen[0]+fitParams_gen[1]))

In [183]:
pandas.Series(exp_gen_diffs).describe()


Out[183]:
count     3232.000000
mean      1281.188091
std       3697.172730
min       -153.494601
25%       -126.094336
50%        -41.554492
75%        676.115265
max      28184.728714
dtype: float64

In [184]:
pandas.Series(exp_gen_abs).describe()


Out[184]:
count     3232.000000
mean      1400.959394
std       3653.456236
min          0.119945
25%        107.716109
50%        139.126169
75%        676.115265
max      28184.728714
dtype: float64

Depiction of variance in the data


In [185]:
cd /Users/dondini/Udacity/lesson_2/ud919/lesson2/twitter/


/Users/dondini/Udacity/lesson_2/ud919/lesson2/twitter

In [186]:
tweetsDF = pandas.io.json.read_json("new_gruber_tweets.json")

Obtaining time-to-tweet vs. delta-t data points


In [187]:
step_size = 10
data_points = []
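# for each observed gap v, emit points (time already elapsed, time remaining until the next tweet) in 10-second steps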
for v in timeUntilNext:
    bin_left_edges = np.arange(0, v, step_size)
    
    for l_edge in bin_left_edges:
        tempNewPoint = [l_edge, v-l_edge]
        data_points.append(tempNewPoint)

In [188]:
data_points.sort()

In [189]:
deltat_100 = [v[1] for v in data_points if v[0]==100]
deltat_150 = [v[1] for v in data_points if v[0]==150]
deltat_10 = [v[1] for v in data_points if v[0]==10]

In [190]:
pandas.Series(deltat_10).hist(bins=30, alpha=0.5, color="blue")
#pandas.Series(deltat_100).hist(bins=30)
d_150 = pandas.Series(deltat_150)
pandas.Series(deltat_150).hist(bins=30, alpha=0.3, color="red")


Out[190]:
<matplotlib.axes.AxesSubplot at 0x14838fbd0>

In [191]:
deltatToStd = []
deltaToDist = []
for i in np.arange(0, 100, 10):
    tempDeltas = [v[1] for v in data_points if v[0] == i]
    tempStd = scipy.std(tempDeltas)
    deltatToStd.append([i, tempStd])
    deltaToDist.append([i, tempDeltas])

In [192]:
scipy.std(deltat_150)


Out[192]:
13381.170443594028

In [193]:
xVals = [v[0] for v in deltatToStd]
yVals = [v[1] for v in deltatToStd]

In [194]:
p1 = pyplt.plot(xVals, yVals, label="std dev (sec)")
pyplt.legend(loc=2, prop={'size':18})


Out[194]:
<matplotlib.legend.Legend at 0x148001510>

Obtain bounds of an empirical 90% interval (5th to 95th percentiles)


In [195]:
deltaToBounds = []
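# 5th and 95th percentiles of the remaining-time distribution at each delta-t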
for v in deltaToDist:
    topBound = numpy.percentile(v[1], 95)
    bottomBound = numpy.percentile(v[1], 5)
    deltaToBounds.append([v[0], (topBound, bottomBound)])

In [196]:
p1 = pyplt.plot(xVals, [e[1][0] for e in deltaToBounds], color="red")
p2 = pyplt.plot(xVals, [e[1][1] for e in deltaToBounds], color="red")
pyplt.fill_between(xVals, [e[1][0] for e in deltaToBounds], [e[1][1] for e in deltaToBounds], alpha=0.4, color="orange")


Out[196]:
<matplotlib.collections.PolyCollection at 0x1250baad0>

Impact of unmeasured features

One variable plus Gaussian noise


In [197]:
dataPoints_1 = []
x = np.arange(0, 100, 10)
for j in xrange(100):
    points = [(i, i*2 + 3 + numpy.random.normal(scale=50.0)) for i in x]
    dataPoints_1.extend(points)

In [198]:
pointToVals = []
pointToBounds = []
for i in np.arange(0, 100, 10):
    valsForDataPoint = [v for v in dataPoints_1 if v[0]==i]
    pointToVals.append(valsForDataPoint)
    upperBound = numpy.percentile(valsForDataPoint, 95)
    lowerBound = numpy.percentile(valsForDataPoint, 5)
    pointToBounds.append([i, (upperBound, lowerBound)])

In [199]:
pyplt.plot(x, [v[1][0] for v in pointToBounds])
pyplt.plot(x, [v[1][1] for v in pointToBounds])
pyplt.plot(x, [[i*2+3] for i in x], color="red", label="true model")
pyplt.fill_between(x, [v[1][0] for v in pointToBounds], [v[1][1] for v in pointToBounds], color="orange", alpha=0.4)
pyplt.legend(loc=2, prop={'size':18})


Out[199]:
<matplotlib.legend.Legend at 0x148ca22d0>

Two variables plus Gaussian noise


In [200]:
dataPoints_2 = []
x = np.arange(0, 100, 10)
y = np.arange(50, 150, 10)
for j in xrange(100):
    yVal = random.choice(y)
    points = [(i, i*2 + yVal*2 + 3 + numpy.random.normal(scale=50.0)) for i in x]
    dataPoints_2.extend(points)

In [201]:
avgYAtX = {}
for i in x:
    lineVals = [(i*2 + yVal*2 + 3 ) for yVal in np.arange(50, 150, 10)]
    avgY = reduce(lambda x,y: x+y, lineVals)/len(lineVals)
    avgYAtX[i] = avgY

In [202]:
pointToVals = []
pointToBounds = []
for i in np.arange(0, 100, 10):
    valsForDataPoint = [v for v in dataPoints_2 if v[0]==i]
    pointToVals.append(valsForDataPoint)
    upperBound = numpy.percentile(valsForDataPoint, 95)
    lowerBound = numpy.percentile(valsForDataPoint, 5)
    pointToBounds.append([i, (upperBound, lowerBound)])

In [203]:
pyplt.plot(x, [v[1][0] for v in pointToBounds])
pyplt.plot(x, [v[1][1] for v in pointToBounds])
pyplt.plot(x, [avgYAtX[i] for i in x], color="red", label="true model (best estimate)")
pyplt.fill_between(x, [v[1][0] for v in pointToBounds], [v[1][1] for v in pointToBounds], color="orange", alpha=0.4)
pyplt.legend(loc=2, prop={'size':18})


Out[203]:
<matplotlib.legend.Legend at 0x10e9cf250>

Feature selection

Proposed list of features to consider for influence upon inter-tweet time:

- mention distance
- time of day
- contains mention
- contains URL
- length of tweet (num chars)
- contains hashtags
- is_reply

We now need to form the features described above. The most non-trivial one to create is "mention distance", as defined in lecture; a conceptual sketch follows, and then we construct it from the tweet data.
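
Conceptually (a minimal sketch of the idea, not the exact code used below), the mention distance of a tweet posted at time t is the smallest absolute time gap between t and any tweet by @siracusa or @marcoarment that mentions @gruber:

def mention_distance(t, mention_times):
    # hypothetical helper: seconds from tweet time t to the closest @gruber mention,
    # looking both backwards and forwards in time; None if there are no mentions
    diffs = [abs((m - t).total_seconds()) for m in mention_times]
    return min(diffs) if diffs else None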

Forming mention distance


In [204]:
import twitter_tools
from twitter_tools import *
Obtain your own data files here!

In [205]:
tweetFile = open("/Users/dondini/Udacity/lesson_2/ud919/lesson2/twitter/new_gruber_tweets.json")

In [206]:
import json
jsonFile = json.load(tweetFile)

In [207]:
tweetFile.close()

In [208]:
gruberTweetsDF = pandas.io.json.read_json("/Users/dondini/Udacity/lesson_2/ud919/lesson2/twitter/new_gruber_tweets.json")
siracusaTweetsDF = pandas.io.json.read_json("/Users/dondini/Udacity/lesson_2/ud919/lesson2/twitter/new_siracusa_tweets.json")
armentTweetsDF = pandas.io.json.read_json("/Users/dondini/Udacity/lesson_2/ud919/lesson2/twitter/new_arment_tweets.json")

In [209]:
gruberTimeDiffs = []
gruberTweetTimes = []
gruberTimeToDiff = {}
gruberTimeToText = {}
siracusaMentionTimes = []
armentMentionTimes = []

In [210]:
gruberCreatedDF = gruberTweetsDF.ix[0:, ["created_at"]]
gruberCreatedTextDF = gruberTweetsDF.ix[0:, ["created_at", "text"]]
createdTextVals = gruberCreatedTextDF.values

for i, row in gruberCreatedDF.iterrows():
    gruberTweetTimes.append(row["created_at"])

In [211]:
gruberTweetTimes.sort()

In [212]:
for i in xrange(1, len(gruberTweetTimes)):
    timeDiff = (gruberTweetTimes[i]-gruberTweetTimes[i-1]).seconds
    gruberTimeDiffs.append(timeDiff)
    gruberTimeToDiff[gruberTweetTimes[i]] = timeDiff
    gruberTimeToText[gruberTweetTimes[i]] = gruberCreatedTextDF[ gruberCreatedTextDF["created_at"]==gruberTweetTimes[i] ]
Capture time of mentions

In [213]:
nearestMentionToTimeDiff = []
tweetIndexToNearestMention = {}

In [214]:
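# helpers: given a timestamp and a sorted (ascending) list of tweet times, return the
# nearest tweet time strictly after / strictly before it, or None if no such tweet exists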
def findTweetFollowingTime(timeStamp, tweetTimes):
    returnTweetTime = None
    
    for t in tweetTimes:
        if t>timeStamp:
            returnTweetTime = t
            break
    
    return returnTweetTime

def findTweetPreceedingTime(timeStamp, tweetTimes):
    returnTweetTime = None
    
    i = len(tweetTimes)-1
    
    while i>=0:
        t = tweetTimes[i]
        if t<timeStamp:
            returnTweetTime = t
            break
        
        i-=1
    
    return returnTweetTime

In [215]:
siracusaTimeOfGruberMentions = []
armentTimeOfGruberMentions = []

In [216]:
for i, row in armentTweetsDF.iterrows():
    if "user_mentions" in row:
        if type(row["user_mentions"]) == list:
            if len([e for e in row["user_mentions"] if e["screen_name"]=="gruber"])>0:
                armentTimeOfGruberMentions.append(row["created_at"])

In [217]:
for i, row in siracusaTweetsDF.iterrows():
    if "user_mentions" in row:
        if type(row["user_mentions"]) == list:
            if len([e for e in row["user_mentions"] if e["screen_name"]=="gruber"])>0:
                siracusaTimeOfGruberMentions.append(row["created_at"])
Mention distance for each @gruber tweet
For each @gruber tweet, let's find the tweet by either @siracusa or @marcoarment that mentions @gruber and is closest in time.

In [218]:
gruberTweetTimes.sort()
siracusaTimeOfGruberMentions.sort()
armentTimeOfGruberMentions.sort()
for i in xrange(len(gruberTweetTimes)):
    t = gruberTweetTimes[i]
    t_next = None
    if i+1<len(gruberTweetTimes):
        t_next = gruberTweetTimes[i+1]
        
        #print "t_next: %s" % t_next
        t_s = findTweetFollowingTime(t, siracusaTimeOfGruberMentions)
        t_s_prev = findTweetPreceedingTime(t, siracusaTimeOfGruberMentions)
        
        #print "t_s: %s" % t_s
        
        t_a = findTweetFollowingTime(t, armentTimeOfGruberMentions)
        t_a_prev = findTweetPreceedingTime(t, armentTimeOfGruberMentions)
        
        sDiff = None
        aDiff = None
        if t_s_prev is not None and t_s is not None: 
            sDiff = math.fabs((t_s - t).seconds)
            if sDiff >math.fabs((t-t_s_prev).seconds):
                sDiff = math.fabs((t-t_s_prev).seconds)
            
        if t_a_prev is not None and t_a is not None:
            aDiff = math.fabs((t_a - t).seconds)
            if aDiff > math.fabs((t-t_a_prev).seconds):
                aDiff = math.fabs((t - t_a_prev).seconds)
        
        closestMention = None
        
        if sDiff is not None:
            closestMention = sDiff
        elif aDiff is not None:
            closestMention = aDiff
        
        if aDiff is not None and sDiff is not None:
            if aDiff < sDiff:
                closestMention = aDiff
                
        if closestMention is not None:
            nearestMentionToTimeDiff.append((closestMention, (t_next-t).seconds))
            tweetIndexToNearestMention[i] = closestMention

Extract remaining features


In [219]:
features_list = extract_features(jsonFile)

In [220]:
featuresWithLabel = []
for i in range(len(gruberTimeDiffs)):
    timeDiff = gruberTimeDiffs[i]
    if timeDiff<4000:
        label = "short"
    else:
        label = "long"
    
    featuresForTweet = features_list[i]
    
    nearestMention = 0
    if i in tweetIndexToNearestMention:
        nearestMention = tweetIndexToNearestMention[i]
    
    completeItem = []
    completeItem.append(label)
    completeItem.extend(list(featuresForTweet))
    completeItem.append(nearestMention)
    featuresWithLabel.append(completeItem)

Evaluate features by mutual information gain

Get the info_gain library from your local path!
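
As background (this describes the standard quantity, not necessarily the exact conventions of the local info_gain module): the information gain of a feature X about the label Y is IG(Y; X) = H(Y) - H(Y | X), the reduction in uncertainty about the short/long label once X is known. Since Y is binary here, H(Y) is at most 1 bit (ln 2 nats), which bounds the attainable gain; larger values indicate a more informative feature.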

In [221]:
cd /Users/dondini/Udacity/


/Users/dondini/Udacity

In [222]:
from info_gain import *

1 Time of day


In [223]:
valsY = ["short", "long"]
binsY = None

In [224]:
joint_list = [(v[1], v[0]) for v in featuresWithLabel]
valsX = ["morning", "afternoon", "evening", "night"]
binsX = None

In [225]:
jpTable = compute_joint_prob(joint_list, valsX, valsY, None, None)

In [226]:
entropy_loss(jpTable, valsX, valsY)


Out[226]:
0.00029792328911204535

2 Contains mention


In [227]:
joint_list = [(v[2], v[0]) for v in featuresWithLabel]
valsX = None
binsX = [[0, .9], [1.0, 100]]

In [228]:
jpTable = compute_joint_prob(joint_list, valsX, valsY, bins1=binsX)

In [229]:
entropy_loss(jpTable, [0.0, 1.0], valsY)


Out[229]:
0.00018592171588510675

3 Contains URL


In [230]:
joint_list = [(v[3], v[0]) for v in featuresWithLabel]
valsX = None
binsX = [[0, .9], [1.0, 100]]

In [231]:
jpTable = compute_joint_prob(joint_list, valsX, valsY, bins1=binsX)

In [232]:
entropy_loss(jpTable, [0.0, 1.0], valsY)


Out[232]:
0.0006917841288795468

4 Length of tweet


In [233]:
joint_list = [(v[4], v[0]) for v in featuresWithLabel]
valsX = None
binsX = [[0, 14], [14, 28], [28, 42], [42, 56], [56, 70], [70, 84], [84, 98], [98, 112], [112, 126]]

In [234]:
jpTable = compute_joint_prob(joint_list, valsX, valsY, bins1=binsX)

In [235]:
entropy_loss(jpTable, [v[4] for v in featuresWithLabel], valsY)


Out[235]:
3.184467596637892

5 Contains hashtags


In [236]:
joint_list = [(v[5], v[0]) for v in featuresWithLabel]
valsX = None
binsX = [[0, .9], [1.0, 100]]

In [237]:
jpTable = compute_joint_prob(joint_list, valsX, valsY, bins1=binsX)

In [238]:
entropy_loss(jpTable, [0.0, 1.0], valsY)


Out[238]:
2.471814972664077e-05

6 Is reply


In [239]:
joint_list = [(v[6], v[0]) for v in featuresWithLabel]
valsX = None
binsX = [[0, .9], [1.0, 100]]

In [240]:
jpTable = compute_joint_prob(joint_list, valsX, valsY, bins1=binsX)

In [241]:
entropy_loss(jpTable, [0.0, 1.0], valsY)


Out[241]:
7.188753133346992e-05

7 Mention distance


In [242]:
joint_list = [(v[7], v[0]) for v in featuresWithLabel]
valsX = None
binsX = [[0, 1000], [1000, 5000], [5000, 8000], [8000, 10000], [10000, 20000], [20000, 30000], [30000, 60000], [60000, 80000], [80000, 100000], [100000, 800000]]

In [243]:
jpTable = compute_joint_prob(joint_list, valsX, valsY, bins1=binsX)

In [244]:
entropy_loss(jpTable, [0.0, 1.0], valsY)


Out[244]:
2.065038542452612

Fitting a model

Obtain data points: each point is of the form (delta-t, mention distance, text length of the last tweet), labelled with the time until the next tweet.

kNN

Form data points to be used with the kNN model

In [245]:
step_size = 10
knn_data_points = []
tweet_index = 0
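# each point: (time already waited, nearest-mention distance advanced by that wait, text length),
# labelled with the time remaining until the next tweet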
for v in timeUntilNext:
    bin_left_edges = np.arange(0, v, step_size)
    
    features_for_tweet = features_list[tweet_index]
    
    if tweet_index in tweetIndexToNearestMention:
        for l_edge in bin_left_edges:
            newDeltaT = l_edge
            mentionDist = tweetIndexToNearestMention[tweet_index] + newDeltaT
            textLength = features_for_tweet[3]
            label = v-l_edge
            
            newPoint = [newDeltaT, mentionDist, textLength, label]
            knn_data_points.append(newPoint)
    
    tweet_index+=1

In [246]:
import sklearn
from sklearn.neighbors import KNeighborsRegressor

In [247]:
knn = KNeighborsRegressor(5)

In [248]:
knn_3 = KNeighborsRegressor(3)

In [249]:
len(knn_data_points)


Out[249]:
1381258

In [250]:
.70*len(knn_data_points)


Out[250]:
966880.6

In [251]:
.30*len(knn_data_points)


Out[251]:
414377.39999999997

In [252]:
import random

In [253]:
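# draw a ~70%-sized training sample; random.choice samples with replacement, so points can
# repeat and may also appear in the test sample drawn later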
trainingPoints = [random.choice(knn_data_points) for i in xrange(966880)]

In [254]:
trainingX = [(v[0], v[1], v[2]) for v in trainingPoints]

In [255]:
trainingY = [v[3] for v in trainingPoints]

In [256]:
m_3 = knn_3.fit(trainingX, trainingY)

In [257]:
m = knn.fit(trainingX, trainingY)

Performance on training set


In [258]:
y_ = m.predict(trainingX)

In [259]:
y = m_3.predict(trainingX)

In [260]:
train_diffs = []
for i in xrange(len(trainingY)):
    train_diffs.append(trainingY[i] - y_[i])

In [261]:
pandas.Series([math.fabs(x) for x in train_diffs]).describe()


Out[261]:
count    966880.000000
mean       1876.546200
std        4720.141228
min           0.000000
25%           4.000000
50%           8.000000
75%        1062.000000
max       65938.400000
dtype: float64

In [262]:
pandas.Series(train_diffs).hist(bins=50)


Out[262]:
<matplotlib.axes.AxesSubplot at 0x12316d210>

Performance on test set


In [263]:
testPoints = [random.choice(knn_data_points) for i in xrange(414377)]

In [264]:
testX = [(v[0], v[1], v[2]) for v in testPoints]

In [265]:
testY = [v[3] for v in testPoints]

In [266]:
test_pred = m_3.predict(testX)

In [267]:
test_diffs = []
for i in xrange(len(testY)):
    test_diffs.append(math.fabs(testY[i] - test_pred[i]))

In [268]:
pandas.Series(test_diffs).describe()


//anaconda/lib/python2.7/site-packages/pandas/compat/scipy.py:68: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  score = values[idx]
//anaconda/lib/python2.7/site-packages/pandas/compat/scipy.py:68: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  score = values[idx]
Out[268]:
count    414377.000000
mean       1834.645057
std        5575.817493
min           0.000000
25%           3.333333
50%           6.666667
75%          30.000000
max       82658.000000
dtype: float64

In [269]:
testSeries = pandas.Series(test_diffs)

In [270]:
testSeries.hist(bins=50)


Out[270]:
<matplotlib.axes.AxesSubplot at 0x12316d950>
