In [1]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [2]:
import scipy
import numpy
import pandas

In [153]:
import matplotlib.pyplot as pyplt

In [154]:
import seaborn as sns
Navigate to the data directory

In [155]:
cd /Users/dondini/Udacity/lesson_2/ud919/lesson2/twitter/


/Users/dondini/Udacity/lesson_2/ud919/lesson2/twitter

Form data points


In [156]:
tweetsDF = pandas.io.json.read_json("new_gruber_tweets.json")

In [157]:
createdDF = tweetsDF.ix[0:, ["created_at"]]
createdTextDF = tweetsDF.ix[0:, ["created_at", "text"]]
createdTextVals = createdTextDF.values

In [158]:
tweetTimes = []
for i,row in createdDF.iterrows():
    tweetTimes.append(row["created_at"])
tweetTimes.sort()

In [159]:
timeUntilNext = []
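# gaps (in seconds) between consecutive tweets; note that timedelta.seconds ignores any whole-day part of a gap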
for i in xrange(1, len(tweetTimes)-1):
    timeDiff = (tweetTimes[i]-tweetTimes[i-1]).seconds
    timeUntilNext.append(timeDiff)

Create initial histogram


In [160]:
timeToNextSeries = pandas.Series(timeUntilNext)

In [161]:
timeToNextSeries.hist(bins=30, normed=True)


Out[161]:
<matplotlib.axes.AxesSubplot at 0x10e9118d0>

Fit an exponential


In [162]:
from scipy.optimize import curve_fit

In [163]:
def fitFunc(t, b):
    return b*numpy.exp(-b*t)
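
The fit function above is the exponential probability density f(t) = b*exp(-b*t), whose mean is 1/b. The value 1/fitParams[0] computed below can therefore be read as the fitted mean time between tweets, in seconds.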

In [164]:
count,division = np.histogram(timeUntilNext, bins=100, normed=True)

In [165]:
fitParams, fitCov = curve_fit(fitFunc, division[0:len(division)-1], count, p0=1e-4)

In [166]:
fitParams


Out[166]:
array([ 0.00068895])

In [167]:
1/fitParams[0]


Out[167]:
1451.4883194175982

In [168]:
fitCov


Out[168]:
array([[  1.04040926e-09]])

Evaluate exponential


In [169]:
t = division[0:len(division)-1]
timeToNextSeries.hist(bins=50, normed=True, color="blue")
pyplt.plot(t, fitFunc(t, fitParams[0]), color="yellow")


Out[169]:
[<matplotlib.lines.Line2D at 0x141c15c90>]

In [170]:
exp_diffs = []
for t in timeUntilNext:
    exp_diffs.append(t-1/fitParams[0])

In [171]:
pandas.Series(exp_diffs).hist(bins=50)


Out[171]:
<matplotlib.axes.AxesSubplot at 0x10e826b90>

In [172]:
pandas.Series(exp_diffs).describe()


Out[172]:
count     3232.000000
mean      2843.046025
std      11064.424336
min      -1450.488319
25%      -1368.488319
50%      -1115.488319
75%       1032.261681
max      83356.511681
dtype: float64

Evaluate absolute differences from the fitted mean


In [173]:
import math
exp_diffs = []
abs_diffs = []
for t in timeUntilNext:
    exp_diffs.append(t-1/fitParams[0])
    abs_diffs.append(math.fabs(t-1/fitParams[0]))

In [174]:
pandas.Series(abs_diffs).hist()


Out[174]:
<matplotlib.axes.AxesSubplot at 0x10d303950>

In [175]:
pandas.Series(abs_diffs).describe()


Out[175]:
count     3232.000000
mean      4446.834434
std      10522.663236
min          0.488319
25%       1110.505840
50%       1363.488319
75%       1437.488319
max      83356.511681
dtype: float64

Observe the effect of adding an offset

Note that much of the histogram of differences lies *below* zero. Perhaps by adding an offset, we can improve the fit.

Fit more generalized exponential


In [176]:
def fitFunc_gen(t, a, b, c):
    return a*(b)*numpy.exp(-b*t)+c
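
In this more general model, a scales the amplitude of the exponential term, b is the decay rate, and c is a constant offset that allows the density to settle at a nonzero floor for large t.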

In [177]:
fitParams_gen, fitCov_gen = curve_fit(fitFunc_gen, division[0:len(division)-1], count, p0=[0, 3e-4, 0])

In [178]:
fitParams_gen


Out[178]:
array([  3.34149579e-01,   2.17218731e-03,   3.18496895e-06])

In [179]:
fitCov_gen


Out[179]:
array([[  1.10186661e-04,  -6.83152391e-07,  -1.07612154e-09],
       [ -6.83152391e-07,   4.65228418e-09,   5.59703963e-12],
       [ -1.07612154e-09,   5.59703963e-12,   4.82402257e-13]])

In [180]:
(1/fitParams_gen[1])*fitParams_gen[0]+fitParams_gen[1]


Out[180]:
153.83309518722578

Evaluate adjusted exponential


In [181]:
t = division[0:len(division)-1]
timeToNextSeries.hist(bins=50, normed=True, color="blue")
pyplt.plot(t, fitFunc(t, fitParams[0]), color="yellow")
pyplt.plot(t, fitFunc_gen(t, fitParams_gen[0], fitParams_gen[1], fitParams_gen[2]), color="red")


Out[181]:
[<matplotlib.lines.Line2D at 0x12606ba10>]

In [182]:
exp_gen_diffs = []
exp_gen_abs = []
for t in timeUntilNext:
    exp_gen_diffs.append((t-1/fitParams_gen[1])*fitParams_gen[0]+fitParams_gen[1])
    exp_gen_abs.append(math.fabs((t-1/fitParams_gen[1])*fitParams_gen[0]+fitParams_gen[1]))

In [183]:
pandas.Series(exp_gen_diffs).describe()


Out[183]:
count     3232.000000
mean      1281.188091
std       3697.172730
min       -153.494601
25%       -126.094336
50%        -41.554492
75%        676.115265
max      28184.728714
dtype: float64

In [184]:
pandas.Series(exp_gen_abs).describe()


Out[184]:
count     3232.000000
mean      1400.959394
std       3653.456236
min          0.119945
25%        107.716109
50%        139.126169
75%        676.115265
max      28184.728714
dtype: float64

Depiction of variance in the data


In [185]:
cd /Users/dondini/Udacity/lesson_2/ud919/lesson2/twitter/


/Users/dondini/Udacity/lesson_2/ud919/lesson2/twitter

In [186]:
tweetsDF = pandas.io.json.read_json("new_gruber_tweets.json")

Obtaining time-to-tweet vs. delta-t data points


In [187]:
step_size = 10
data_points = []
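# for each observed gap v, emit points (time already elapsed, time remaining until the next tweet) in 10-second steps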
for v in timeUntilNext:
    bin_left_edges = np.arange(0, v, step_size)
    
    for l_edge in bin_left_edges:
        tempNewPoint = [l_edge, v-l_edge]
        data_points.append(tempNewPoint)

In [188]:
data_points.sort()

In [189]:
deltat_100 = [v[1] for v in data_points if v[0]==100]
deltat_150 = [v[1] for v in data_points if v[0]==150]
deltat_10 = [v[1] for v in data_points if v[0]==10]

In [190]:
pandas.Series(deltat_10).hist(bins=30, alpha=0.5, color="blue")
#pandas.Series(deltat_100).hist(bins=30)
d_150 = pandas.Series(deltat_150)
pandas.Series(deltat_150).hist(bins=30, alpha=0.3, color="red")


Out[190]:
<matplotlib.axes.AxesSubplot at 0x14838fbd0>

In [191]:
deltatToStd = []
deltaToDist = []
for i in np.arange(0, 100, 10):
    tempDeltas = [v[1] for v in data_points if v[0] == i]
    tempStd = scipy.std(tempDeltas)
    deltatToStd.append([i, tempStd])
    deltaToDist.append([i, tempDeltas])

In [192]:
scipy.std(deltat_150)


Out[192]:
13381.170443594028

In [193]:
xVals = [v[0] for v in deltatToStd]
yVals = [v[1] for v in deltatToStd]

In [194]:
p1 = pyplt.plot(xVals, yVals, label="std dev (sec)")
pyplt.legend(loc=2, prop={'size':18})


Out[194]:
<matplotlib.legend.Legend at 0x148001510>

Obtain bounds of an empirical 90% interval (5th to 95th percentiles)


In [195]:
deltaToBounds = []
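# 5th and 95th percentiles of the remaining-time distribution at each delta-t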
for v in deltaToDist:
    topBound = numpy.percentile(v[1], 95)
    bottomBound = numpy.percentile(v[1], 5)
    deltaToBounds.append([v[0], (topBound, bottomBound)])

In [196]:
p1 = pyplt.plot(xVals, [e[1][0] for e in deltaToBounds], color="red")
p2 = pyplt.plot(xVals, [e[1][1] for e in deltaToBounds], color="red")
pyplt.fill_between(xVals, [e[1][0] for e in deltaToBounds], [e[1][1] for e in deltaToBounds], alpha=0.4, color="orange")


Out[196]:
<matplotlib.collections.PolyCollection at 0x1250baad0>

Impact of unmeasured features

One variable plus Gaussian noise


In [197]:
dataPoints_1 = []
x = np.arange(0, 100, 10)
for j in xrange(100):
    points = [(i, i*2 + 3 + numpy.random.normal(scale=50.0)) for i in x]
    dataPoints_1.extend(points)

In [198]:
pointToVals = []
pointToBounds = []
for i in np.arange(0, 100, 10):
    valsForDataPoint = [v for v in dataPoints_1 if v[0]==i]
    pointToVals.append(valsForDataPoint)
    upperBound = numpy.percentile(valsForDataPoint, 95)
    lowerBound = numpy.percentile(valsForDataPoint, 5)
    pointToBounds.append([i, (upperBound, lowerBound)])

In [199]:
pyplt.plot(x, [v[1][0] for v in pointToBounds])
pyplt.plot(x, [v[1][1] for v in pointToBounds])
pyplt.plot(x, [[i*2+3] for i in x], color="red", label="true model")
pyplt.fill_between(x, [v[1][0] for v in pointToBounds], [v[1][1] for v in pointToBounds], color="orange", alpha=0.4)
pyplt.legend(loc=2, prop={'size':18})


Out[199]:
<matplotlib.legend.Legend at 0x148ca22d0>

Two variables plus Gaussian noise


In [200]:
dataPoints_2 = []
x = np.arange(0, 100, 10)
y = np.arange(50, 150, 10)
for j in xrange(100):
    yVal = random.choice(y)
    points = [(i, i*2 + yVal*2 + 3 + numpy.random.normal(scale=50.0)) for i in x]
    dataPoints_2.extend(points)

In [201]:
avgYAtX = {}
for i in x:
    lineVals = [(i*2 + yVal*2 + 3 ) for yVal in np.arange(50, 150, 10)]
    avgY = reduce(lambda x,y: x+y, lineVals)/len(lineVals)
    avgYAtX[i] = avgY

In [202]:
pointToVals = []
pointToBounds = []
for i in np.arange(0, 100, 10):
    valsForDataPoint = [v for v in dataPoints_2 if v[0]==i]
    pointToVals.append(valsForDataPoint)
    upperBound = numpy.percentile(valsForDataPoint, 95)
    lowerBound = numpy.percentile(valsForDataPoint, 5)
    pointToBounds.append([i, (upperBound, lowerBound)])

In [203]:
pyplt.plot(x, [v[1][0] for v in pointToBounds])
pyplt.plot(x, [v[1][1] for v in pointToBounds])
pyplt.plot(x, [avgYAtX[i] for i in x], color="red", label="true model (best estimate)")
pyplt.fill_between(x, [v[1][0] for v in pointToBounds], [v[1][1] for v in pointToBounds], color="orange", alpha=0.4)
pyplt.legend(loc=2, prop={'size':18})


Out[203]:
<matplotlib.legend.Legend at 0x10e9cf250>

Feature selection

Proposed list of features to consider for influence upon inter-tweet time:

- mention distance
- time of day
- contains mention
- contains URL
- length of tweet (num chars)
- contains hashtags
- is_reply

We now need to form the features described above. The most non-trivial one to create is "mention distance", as defined in lecture; a conceptual sketch follows, and then we construct it from the tweet data.
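
Conceptually (a minimal sketch of the idea, not the exact code used below), the mention distance of a tweet posted at time t is the smallest absolute time gap between t and any tweet by @siracusa or @marcoarment that mentions @gruber:

def mention_distance(t, mention_times):
    # hypothetical helper: seconds from tweet time t to the closest @gruber mention,
    # looking both backwards and forwards in time; None if there are no mentions
    diffs = [abs((m - t).total_seconds()) for m in mention_times]
    return min(diffs) if diffs else None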

Forming mention distance


In [204]:
import twitter_tools
from twitter_tools import *
Obtain your own data files here!

In [205]:
tweetFile = open("/Users/dondini/Udacity/lesson_2/ud919/lesson2/twitter/new_gruber_tweets.json")

In [206]:
import json
jsonFile = json.load(tweetFile)

In [207]:
tweetFile.close()

In [208]:
gruberTweetsDF = pandas.io.json.read_json("/Users/dondini/Udacity/lesson_2/ud919/lesson2/twitter/new_gruber_tweets.json")
siracusaTweetsDF = pandas.io.json.read_json("/Users/dondini/Udacity/lesson_2/ud919/lesson2/twitter/new_siracusa_tweets.json")
armentTweetsDF = pandas.io.json.read_json("/Users/dondini/Udacity/lesson_2/ud919/lesson2/twitter/new_arment_tweets.json")

In [209]:
gruberTimeDiffs = []
gruberTweetTimes = []
gruberTimeToDiff = {}
gruberTimeToText = {}
siracusaMentionTimes = []
armentMentionTimes = []

In [210]:
gruberCreatedDF = gruberTweetsDF.ix[0:, ["created_at"]]
gruberCreatedTextDF = gruberTweetsDF.ix[0:, ["created_at", "text"]]
createdTextVals = gruberCreatedTextDF.values

for i, row in gruberCreatedDF.iterrows():
    gruberTweetTimes.append(row["created_at"])

In [211]:
gruberTweetTimes.sort()

In [212]:
for i in xrange(1, len(gruberTweetTimes)):
    timeDiff = (gruberTweetTimes[i]-gruberTweetTimes[i-1]).seconds
    gruberTimeDiffs.append(timeDiff)
    gruberTimeToDiff[gruberTweetTimes[i]] = timeDiff
    gruberTimeToText[gruberTweetTimes[i]] = gruberCreatedTextDF[ gruberCreatedTextDF["created_at"]==gruberTweetTimes[i] ]
Capture time of mentions

In [213]:
nearestMentionToTimeDiff = []
tweetIndexToNearestMention = {}

In [214]:
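# helpers: given a timestamp and a sorted (ascending) list of tweet times, return the
# nearest tweet time strictly after / strictly before it, or None if no such tweet exists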
def findTweetFollowingTime(timeStamp, tweetTimes):
    returnTweetTime = None
    
    for t in tweetTimes:
        if t>timeStamp:
            returnTweetTime = t
            break
    
    return returnTweetTime

def findTweetPreceedingTime(timeStamp, tweetTimes):
    returnTweetTime = None
    
    i = len(tweetTimes)-1
    
    while i>=0:
        t = tweetTimes[i]
        if t<timeStamp:
            returnTweetTime = t
            break
        
        i-=1
    
    return returnTweetTime

In [215]:
siracusaTimeOfGruberMentions = []
armentTimeOfGruberMentions = []

In [216]:
for i, row in armentTweetsDF.iterrows():
    if "user_mentions" in row:
        if type(row["user_mentions"]) == list:
            if len([e for e in row["user_mentions"] if e["screen_name"]=="gruber"])>0:
                armentTimeOfGruberMentions.append(row["created_at"])

In [217]:
for i, row in siracusaTweetsDF.iterrows():
    if "user_mentions" in row:
        if type(row["user_mentions"]) == list:
            if len([e for e in row["user_mentions"] if e["screen_name"]=="gruber"])>0:
                siracusaTimeOfGruberMentions.append(row["created_at"])
Mention distance for each @gruber tweet
For each @gruber tweet, let's find the tweet by either @siracusa or @marcoarment that mentions @gruber and is closest in time.

In [218]:
gruberTweetTimes.sort()
siracusaTimeOfGruberMentions.sort()
armentTimeOfGruberMentions.sort()
for i in xrange(len(gruberTweetTimes)):
    t = gruberTweetTimes[i]
    t_next = None
    if i+1<len(gruberTweetTimes):
        t_next = gruberTweetTimes[i+1]
        
        #print "t_next: %s" % t_next
        t_s = findTweetFollowingTime(t, siracusaTimeOfGruberMentions)
        t_s_prev = findTweetPreceedingTime(t, siracusaTimeOfGruberMentions)
        
        #print "t_s: %s" % t_s
        
        t_a = findTweetFollowingTime(t, armentTimeOfGruberMentions)
        t_a_prev = findTweetPreceedingTime(t, armentTimeOfGruberMentions)
        
        sDiff = None
        aDiff = None
        if t_s_prev is not None and t_s is not None: 
            sDiff = math.fabs((t_s - t).seconds)
            if sDiff >math.fabs((t-t_s_prev).seconds):
                sDiff = math.fabs((t-t_s_prev).seconds)
            
        if t_a_prev is not None and t_a is not None:
            aDiff = math.fabs((t_a - t).seconds)
            if aDiff > math.fabs((t-t_a_prev).seconds):
                aDiff = math.fabs((t - t_a_prev).seconds)
        
        closestMention = None
        
        if sDiff is not None:
            closestMention = sDiff
        elif aDiff is not None:
            closestMention = aDiff
        
        if aDiff is not None and sDiff is not None:
            if aDiff < sDiff:
                closestMention = aDiff
                
        if closestMention is not None:
            nearestMentionToTimeDiff.append((closestMention, (t_next-t).seconds))
            tweetIndexToNearestMention[i] = closestMention

Extract remaining features


In [219]:
features_list = extract_features(jsonFile)

In [220]:
featuresWithLabel = []
for i in range(len(gruberTimeDiffs)):
    timeDiff = gruberTimeDiffs[i]
    if timeDiff<4000:
        label = "short"
    else:
        label = "long"
    
    featuresForTweet = features_list[i]
    
    nearestMention = 0
    if i in tweetIndexToNearestMention:
        nearestMention = tweetIndexToNearestMention[i]
    
    completeItem = []
    completeItem.append(label)
    completeItem.extend(list(featuresForTweet))
    completeItem.append(nearestMention)
    featuresWithLabel.append(completeItem)

Evaluate features by mutual information gain

Get the info_gain library from your local path!
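
As background (this describes the standard quantity, not necessarily the exact conventions of the local info_gain module): the information gain of a feature X about the label Y is IG(Y; X) = H(Y) - H(Y | X), the reduction in uncertainty about the short/long label once X is known. Since Y is binary here, H(Y) is at most 1 bit (ln 2 nats), which bounds the attainable gain; larger values indicate a more informative feature.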

In [221]:
cd /Users/dondini/Udacity/


/Users/dondini/Udacity

In [222]:
from info_gain import *

1 Time of day


In [223]:
valsY = ["short", "long"]
binsY = None

In [224]:
joint_list = [(v[1], v[0]) for v in featuresWithLabel]
valsX = ["morning", "afternoon", "evening", "night"]
binsX = None

In [225]:
jpTable = compute_joint_prob(joint_list, valsX, valsY, None, None)

In [226]:
entropy_loss(jpTable, valsX, valsY)


Out[226]:
0.00029792328911204535

2 Contains mention


In [227]:
joint_list = [(v[2], v[0]) for v in featuresWithLabel]
valsX = None
binsX = [[0, .9], [1.0, 100]]

In [228]:
jpTable = compute_joint_prob(joint_list, valsX, valsY, bins1=binsX)

In [229]:
entropy_loss(jpTable, [0.0, 1.0], valsY)


Out[229]:
0.00018592171588510675

3 Contains URL


In [230]:
joint_list = [(v[3], v[0]) for v in featuresWithLabel]
valsX = None
binsX = [[0, .9], [1.0, 100]]

In [231]:
jpTable = compute_joint_prob(joint_list, valsX, valsY, bins1=binsX)

In [232]:
entropy_loss(jpTable, [0.0, 1.0], valsY)


Out[232]:
0.0006917841288795468

4 Length of tweet


In [233]:
joint_list = [(v[4], v[0]) for v in featuresWithLabel]
valsX = None
binsX = [[0, 14], [14, 28], [28, 42], [42, 56], [56, 70], [70, 84], [84, 98], [98, 112], [112, 126]]

In [234]:
jpTable = compute_joint_prob(joint_list, valsX, valsY, bins1=binsX)

In [235]:
entropy_loss(jpTable, [v[4] for v in featuresWithLabel], valsY)


Out[235]:
3.184467596637892

5 Contains hashtags


In [236]:
joint_list = [(v[5], v[0]) for v in featuresWithLabel]
valsX = None
binsX = [[0, .9], [1.0, 100]]

In [237]:
jpTable = compute_joint_prob(joint_list, valsX, valsY, bins1=binsX)

In [238]:
entropy_loss(jpTable, [0.0, 1.0], valsY)


Out[238]:
2.471814972664077e-05

6 Is reply


In [239]:
joint_list = [(v[6], v[0]) for v in featuresWithLabel]
valsX = None
binsX = [[0, .9], [1.0, 100]]

In [240]:
jpTable = compute_joint_prob(joint_list, valsX, valsY, bins1=binsX)

In [241]:
entropy_loss(jpTable, [0.0, 1.0], valsY)


Out[241]:
7.188753133346992e-05

7 Mention distance


In [242]:
joint_list = [(v[7], v[0]) for v in featuresWithLabel]
valsX = None
binsX = [[0, 1000], [1000, 5000], [5000, 8000], [8000, 10000], [10000, 20000], [20000, 30000], [30000, 60000], [60000, 80000], [80000, 100000], [100000, 800000]]

In [243]:
jpTable = compute_joint_prob(joint_list, valsX, valsY, bins1=binsX)

In [244]:
entropy_loss(jpTable, [0.0, 1.0], valsY)


Out[244]:
2.065038542452612

Fitting a model

Obtain data points: each point is of the form (delta-t, mention distance, text length of the last tweet), labelled with the time until the next tweet.

kNN

Form data points to be used with the kNN model

In [245]:
step_size = 10
knn_data_points = []
tweet_index = 0
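# each point: (time already waited, nearest-mention distance advanced by that wait, text length),
# labelled with the time remaining until the next tweet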
for v in timeUntilNext:
    bin_left_edges = np.arange(0, v, step_size)
    
    features_for_tweet = features_list[tweet_index]
    
    if tweet_index in tweetIndexToNearestMention:
        for l_edge in bin_left_edges:
            newDeltaT = l_edge
            mentionDist = tweetIndexToNearestMention[tweet_index] + newDeltaT
            textLength = features_for_tweet[3]
            label = v-l_edge
            
            newPoint = [newDeltaT, mentionDist, textLength, label]
            knn_data_points.append(newPoint)
    
    tweet_index+=1

In [246]:
import sklearn
from sklearn.neighbors import KNeighborsRegressor

In [247]:
knn = KNeighborsRegressor(5)

In [248]:
knn_3 = KNeighborsRegressor(3)

In [249]:
len(knn_data_points)


Out[249]:
1381258

In [250]:
.70*len(knn_data_points)


Out[250]:
966880.6

In [251]:
.30*len(knn_data_points)


Out[251]:
414377.39999999997

In [252]:
import random

In [253]:
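# draw a ~70%-sized training sample; random.choice samples with replacement, so points can
# repeat and may also appear in the test sample drawn later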
trainingPoints = [random.choice(knn_data_points) for i in xrange(966880)]

In [254]:
trainingX = [(v[0], v[1], v[2]) for v in trainingPoints]

In [255]:
trainingY = [v[3] for v in trainingPoints]

In [256]:
m_3 = knn_3.fit(trainingX, trainingY)

In [257]:
m = knn.fit(trainingX, trainingY)

Performance on training set


In [258]:
y_ = m.predict(trainingX)

In [259]:
y = m_3.predict(trainingX)

In [260]:
train_diffs = []
for i in xrange(len(trainingY)):
    train_diffs.append(trainingY[i] - y_[i])

In [261]:
pandas.Series([math.fabs(x) for x in train_diffs]).describe()


Out[261]:
count    966880.000000
mean       1876.546200
std        4720.141228
min           0.000000
25%           4.000000
50%           8.000000
75%        1062.000000
max       65938.400000
dtype: float64

In [262]:
pandas.Series(train_diffs).hist(bins=50)


Out[262]:
<matplotlib.axes.AxesSubplot at 0x12316d210>

Performance on test set


In [263]:
testPoints = [random.choice(knn_data_points) for i in xrange(414377)]

In [264]:
testX = [(v[0], v[1], v[2]) for v in testPoints]

In [265]:
testY = [v[3] for v in testPoints]

In [266]:
test_pred = m_3.predict(testX)

In [267]:
test_diffs = []
for i in xrange(len(testY)):
    test_diffs.append(math.fabs(testY[i] - test_pred[i]))

In [268]:
pandas.Series(test_diffs).describe()


//anaconda/lib/python2.7/site-packages/pandas/compat/scipy.py:68: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  score = values[idx]
//anaconda/lib/python2.7/site-packages/pandas/compat/scipy.py:68: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  score = values[idx]
Out[268]:
count    414377.000000
mean       1834.645057
std        5575.817493
min           0.000000
25%           3.333333
50%           6.666667
75%          30.000000
max       82658.000000
dtype: float64

In [269]:
testSeries = pandas.Series(test_diffs)

In [270]:
testSeries.hist(bins=50)


Out[270]:
<matplotlib.axes.AxesSubplot at 0x12316d950>
