In [2]:
%matplotlib inline
import networkx as nx
import csv
import re
import pandas as pd
import numpy as np
try:
    import statistics
    from statistics import StatisticsError
except ImportError:
    print('ImportError: no statistics module (it requires Python 3)')
import random
import math
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
import sklearn.metrics as metrics
from sklearn.decomposition import TruncatedSVD
import itertools
from sklearn import mixture
def L1(x, y):
    '''L1 (Manhattan) distance between two equal-length vectors.'''
    dist = 0
    if len(x) == len(y):
        for i in range(len(x)):
            dist += math.fabs(x[i] - y[i])
        return dist
    else:
        print('vectors must be equal length for L1')
        return None
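In [ ]:
# Illustrative check of the L1 helper above (not part of the original analysis);
# the vectors here are made-up examples.
print(L1([1, 2, 3], [2, 2, 5]))   # |1-2| + |2-2| + |3-5| = 3.0
print(L1([1, 2], [1, 2, 3]))      # unequal lengths: prints a warning, returns None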
In [3]:
#!
# This code builds the nx.Graph of @-mentions from the raw CSV
G = nx.Graph()
m = 0  # these two counters
n = 0  # aren't important; they only track progress
# with open('training.1600000.processed.noemoticon.csv', encoding='latin-1') as f_in:
with open('training.1600000.processed.noemoticon.csv') as f_in:
    for line in f_in:
        lineX = list(csv.reader(line, skipinitialspace=True))
        G.add_node(lineX[8][0])
        if '@' in lineX[10][0]:
            m += 1
            for t in re.split(r'[^a-zA-Z\_\@]', lineX[10][0]):
                if t != '' and t[0] == '@' and t != '@':
                    G.add_edge(lineX[8][0], t[1:])
        n += 1
        if n % 100000 == 0:
            print(n)
print(nx.number_of_nodes(G))
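In [ ]:
# Illustrative sketch (not in the original run): how the regex split above pulls
# @mentions out of a tweet. The sample text is made up.
sample = '@alice thanks for the tip, cc @bob!'
mentions = [t[1:] for t in re.split(r'[^a-zA-Z\_\@]', sample)
            if t != '' and t[0] == '@' and t != '@']
print(mentions)   # ['alice', 'bob']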
In [4]:
len(G)
Out[4]:
In [5]:
len(G.edges())
Out[5]:
In [6]:
# Finding the largest connected_component
LargestCC = max(nx.connected_component_subgraphs(G), key=len) # largest connected component
print(nx.number_of_nodes(LargestCC))
# del G
In [7]:
#!
# remove self-loops from the graph; nx.k_core requires a graph without self-loops
LargestCC.remove_edges_from(LargestCC.selfloop_edges())
core7 = nx.k_core(LargestCC,7)
# del LargestCC
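In [ ]:
# Small illustrative example of what nx.k_core keeps (not part of the original
# analysis): in this made-up toy graph the triangle is the 2-core and the
# pendant node is dropped.
toy = nx.Graph([(1, 2), (2, 3), (3, 1), (3, 4)])
print(sorted(nx.k_core(toy, 2).nodes()))   # [1, 2, 3]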
In [8]:
# find the fiedler vector, and use it to partition the graph
f = nx.fiedler_vector(core7)
s = np.zeros(len(f),dtype='int')
s[f>0]=1
# these node positions are reused for every drawing of the graph below
pos = nx.spring_layout(core7)
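In [ ]:
# Illustrative sketch on a made-up toy graph (not part of the original analysis):
# the sign of the Fiedler vector splits two triangles joined by a single bridge
# edge into their natural halves, which is the same idea used on core7 above.
toy = nx.Graph([(0, 1), (1, 2), (2, 0), (3, 4), (4, 5), (5, 3), (2, 3)])
f_toy = nx.fiedler_vector(toy)
print(np.array(f_toy) > 0)   # one triangle on each side of the sign split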
In [9]:
# draw partition
colors = ['#d7191c', '#2b83ba'] # red and blue
node_colors = [colors[s[v]] for v in range(nx.number_of_nodes(core7))]
nx.draw(core7,pos=pos, node_color=node_colors,node_size=10)
In [10]:
# build the Laplacian matrix of the 7-core for spectral clustering
L = nx.laplacian_matrix(core7).todense()
w, v = np.linalg.eig(L)
v = np.array(v)
worder = np.argsort(w)
# X = v @ np.diag(w) # python 3
X = np.matmul( v , np.diag(w) )
X = X[:,worder]
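In [ ]:
# Illustrative sketch (not in the original run): plot the smallest sorted Laplacian
# eigenvalues; an elbow in this curve is one way to guide the choice of k below.
plt.plot(np.sort(np.real(w))[:20], 'o-')
plt.xlabel('index')
plt.ylabel('eigenvalue')
plt.title('Smallest Laplacian eigenvalues of the 7-core')
plt.show()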
In [11]:
# Based on the graph above, k=6 was chosen. k=4 is what we were taught to choose
# (the elbow, or 'L' shape, in the plot), but that clustering didn't look good,
# so k was increased to 6.
# This runs k-means for the next cell.
kmeans = KMeans(init='k-means++', n_clusters=6, n_init=10)
kmeans.fit_predict(X[:,1:3])
centroids = kmeans.cluster_centers_
labels = kmeans.labels_
error = kmeans.inertia_
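In [ ]:
# Optional sanity check (not part of the original analysis): the sklearn.metrics
# import above is otherwise unused, so here is one way to compare a few values of
# k on the same two spectral coordinates. Treat the scores as a rough guide only.
for k in [4, 5, 6, 7]:
    km = KMeans(init='k-means++', n_clusters=k, n_init=10)
    lab = km.fit_predict(X[:, 1:3])
    print(k, metrics.silhouette_score(X[:, 1:3], lab))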
In [12]:
colors = ['#d7191c', '#ffffbf', '#2b83ba', 'green','orange','maroon']
node_colors = [colors[labels[i]] for i in range(nx.number_of_nodes(core7))]
nx.draw(core7, pos = pos, node_color=node_colors,node_size=30)
In [13]:
# Now we switch from the graphical analysis to LSA
In [14]:
# #!
# # this reads in the tweets,
# # parsing the user ID into ID_list
# # and the tweet text into TextList
# # NOTE: ID_list built here is needed again by the user-classification cell below
# TextList = []
# ID_list = []
# n = 0
# # with open('training.1600000.processed.noemoticon.csv', encoding='latin-1') as f_in:
# with open('training.1600000.processed.noemoticon.csv') as f_in:
#     for line in f_in:
#         lineX = list(csv.reader(line, skipinitialspace=True))
#         TextList.append(lineX[10][0])
#         ID_list.append(lineX[8][0])
#         n = n + 1
#         if n % 100000 == 0:
#             print(n)
# print(n)
In [24]:
import networkx as nx
import matplotlib.pyplot as plt
deg_hist = nx.degree_histogram(G)
# plt.scatter( range(len(deg_hist)), deg_hist)
# plt.show()
fig = plt.figure()
ax = plt.gca()
ax.plot(range(len(deg_hist)),deg_hist, 'o', c='blue', alpha=0.05, markeredgecolor='none')
ax.set_yscale('log')
ax.set_xscale('log')
ax.set_xlabel('degree')
ax.set_ylabel('frequency')
ax.set_title('Degree distribution for the network of mentions on Twitter')
Out[24]:
In [15]:
"""0 - the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
1 - the id of the tweet (2087)
2 - the date of the tweet (Sat May 16 23:58:44 UTC 2009)
3 - the query. If there is no query, then this value is NO_QUERY.
4 - the user that tweeted
5 - the text of the tweet"""
cols = ['polarity','tweetID','date','Query','UserID','text']
df = pd.read_csv('training.1600000.processed.noemoticon.csv', names=cols, encoding='latin-1')
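In [ ]:
# Quick look at the parsed columns (illustrative, not in the original run).
print(df.shape)   # expected: (1600000, 6)
df[['polarity', 'UserID', 'text']].head(3)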
In [25]:
len(df['UserID'].unique())
Out[25]:
In [128]:
TextListA = list(df.text)
In [17]:
# #!
# #takes a long time
# # vectorize TextList to dtm
# # if you can get the snowball stemmer to work, that would be useful
# vectorizer = TfidfVectorizer(stop_words='english', min_df=4,max_df=0.8)
# dtm = vectorizer.fit_transform(TextList)
# # del TextList
In [18]:
#!
# takes a long time
# vectorize TextListA into the document-term matrix (dtm)
# if you can get the snowball stemmer to work, that would be useful
vectorizer = TfidfVectorizer(stop_words='english', min_df=4,max_df=0.8)
dtm = vectorizer.fit_transform(TextListA)
del TextListA
In [ ]:
# vectorizer.get_feature_names()
In [ ]:
vectorizer.get_stop_words()
In [19]:
#!
# compute svd of dtm
svd = TruncatedSVD(n_components=100, n_iter=4)
svdOutput = svd.fit_transform(dtm)
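In [ ]:
# Illustrative sketch (not part of the original analysis): how much variance the
# truncated SVD keeps, which helps judge using only the first ~15 components below.
plt.plot(np.cumsum(svd.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance ratio')
plt.show()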
In [39]:
svdOutput.shape
Out[39]:
In [20]:
#!
# this is the model I went with for LSA
gmm = mixture.GMM(n_components=10, covariance_type='full')
# gmm = mixture.GaussianMixture(n_components=5, covariance_type='full')
gmm.fit(svdOutput[:,:15])
pred = gmm.predict(svdOutput[:,:15])
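In [ ]:
# Illustrative check (not in the original run): how many tweets land in each of
# the 10 GMM components.
print(np.bincount(pred))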
In [129]:
#!
# a second LSA model: a stricter tf-idf vocabulary (min_df=10, max_df=0.5),
# a 50-component SVD, and a 5-component GMM
# (TextListA is re-created in the In [128] cell above after being deleted earlier)
vectorizer_half = TfidfVectorizer(stop_words='english', min_df=10, max_df=0.5)
dtm_half = vectorizer_half.fit_transform(TextListA)
#!
# compute svd of dtm
svd_half = TruncatedSVD(n_components=50, n_iter=4)
svdOutput_half = svd_half.fit_transform(dtm_half)
gmm5 = mixture.GMM(n_components=5, covariance_type='full')
# gmm = mixture.GaussianMixture(n_components=5, covariance_type='full')
gmm5.fit(svdOutput_half[:,:15])
y = gmm5.predict(svdOutput_half[:,:15])
In [21]:
# this converts the GMM result from classifying tweets
# into classifying users
# (ID_list comes from the commented-out parsing cell above)
ID_Pred = {}
for i in range(len(ID_list)):
    ID = ID_list[i]
    if ID in ID_Pred:
        ID_Pred[ID].append(pred[i])
    else:
        ID_Pred[ID] = [pred[i]]
In [ ]:
# # this converts the GMM result from classifying tweets
# # into classifying users
# # this also classifies all users, not just core7
# ID_Pred = {}
# for i in range(len(ID_list)):
#     ID = ID_list[i]
#     if ID in ID_Pred:
#         ID_Pred[ID].append(pred[i])
#     else:
#         ID_Pred[ID] = [pred[i]]
# colors = ['#d7191c', '#ffffbf', '#2b83ba', 'green', 'orange', 'maroon', 'black']
# node_colors = []
# for g in core7:  # classify the nodes based on their tweets
#     try:
#         try:  # if there is a unique mode among the groups, classify the user as the mode
#             X = statistics.mode(ID_Pred[g])
#             node_colors.append(colors[X])
#         except StatisticsError:  # if there is no unique mode, pick a tweet at random and use its group
#             node_colors.append(colors[ID_Pred[g][random.randint(0, len(ID_Pred[g]) - 1)]])
#     except KeyError:  # if the node never tweeted (was only tweeted at)
#         node_colors.append(colors[6])  # make it black
In [ ]:
# draw core7 colored by the LSA-based user classification, to make it easier to
# compare with the spectral clustering above
# (node_colors here is built by the commented-out classification cell above)
nx.draw(core7, pos=pos, node_color=node_colors, node_size=10)
In [ ]:
#!
# this gives the top terms of each eigenvector (SVD component) from the LSA
# the groups aren't exactly these values, but they are similar.
# you can also plot the nodes with these eigenvectors as the axes,
# which is a good way to visualize the results of LSA
terms = vectorizer.get_feature_names()
for i in range(0, 20):
    top = np.argsort(svd.components_[i])
    # argsort is ascending, so indices 60-120 are terms with strongly negative
    # loadings on this component (just past the 60 most extreme ones)
    topterms = [terms[top[f]] for f in range(60, 120)]
    print()
    print(i, topterms)
In [24]:
means = gmm.means_
In [136]:
gmm5.means_[0,:].shape
Out[136]:
In [137]:
means5 = gmm5.means_
features_half = vectorizer_half.get_feature_names()
dfs5 = []
for i in xrange(means5.shape[0]):
    vec = np.zeros(50)          # svd_half has 50 components
    vec[:15] = means5[i, :]     # the 5-component GMM was fit on the first 15 of them
    cat = svd_half.inverse_transform(vec.reshape(1, -1))
    top_feats = []
    for j in np.argsort(cat).reshape(-1):
        top_feats.append((features_half[j], cat.reshape(-1)[j]))
    mydf = pd.DataFrame(top_feats)
    mydf.columns = ['feature', 'tfidf']
    dfs5.append(mydf)
In [44]:
mean_vecs[0,:].shape
Out[44]:
In [50]:
vec = np.zeros(100)
vec[:15] = mean_vecs[0,:]
# vec
cat = svd.inverse_transform(vec.reshape(1,-1))
In [115]:
mean_vecs = np.array(means)
features = vectorizer.get_feature_names()
dfs = []
for i in xrange(10):
    vec = np.zeros(100)             # svd has 100 components
    vec[:15] = mean_vecs[i, :]      # the 10-component GMM was fit on the first 15 of them
    cat = svd.inverse_transform(vec.reshape(1, -1))
    top_feats = []
    for j in np.argsort(cat).reshape(-1):
        top_feats.append((features[j], cat.reshape(-1)[j]))
    mydf = pd.DataFrame(top_feats)
    mydf.columns = ['feature', 'tfidf']
    dfs.append(mydf)
In [22]:
# Xtr = vec_pipe.fit_transform(X)
# vec = vec_pipe.named_steps['vec']
# features = vec.get_feature_names()
def top_tfidf_feats(row, features, top_n=25):
    ''' Get top n tfidf values in row and return them with their corresponding feature names.'''
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    df = pd.DataFrame(top_feats)
    df.columns = ['feature', 'tfidf']
    return df
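In [ ]:
# Illustrative usage of top_tfidf_feats (not part of the original run): the
# top-weighted terms of a single tweet's tf-idf row. Row 0 is an arbitrary example.
features = vectorizer.get_feature_names()
row0 = dtm[0].toarray().reshape(-1)
top_tfidf_feats(row0, features, top_n=10)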
In [ ]:
def top_mean_feats(Xtr, features, grp_ids=None, min_tfidf=0.1, top_n=25):
    ''' Return the top n features that on average are most important amongst documents in rows
        identified by indices in grp_ids. '''
    if grp_ids:
        D = Xtr[grp_ids].toarray()
    else:
        D = Xtr.toarray()
    D[D < min_tfidf] = 0
    tfidf_means = np.mean(D, axis=0)
    return top_tfidf_feats(tfidf_means, features, top_n)
In [ ]:
def top_feats_by_class(Xtr, y, features, min_tfidf=0.1, top_n=25):
    ''' Return a list of dfs, where each df holds top_n features and their mean tfidf value
        calculated across documents with the same class label. '''
    dfs = []
    labels = np.unique(y)
    for label in labels:
        ids = np.where(y == label)
        feats_df = top_mean_feats(Xtr, features, ids, min_tfidf=min_tfidf, top_n=top_n)
        feats_df.label = label
        dfs.append(feats_df)
    return dfs
In [28]:
def plot_tfidf_classfeats_h(dfs):
    ''' Plot the data frames returned by the function top_feats_by_class(). '''
    fig = plt.figure(figsize=(12, 9), facecolor="w")
    x = np.arange(len(dfs[0]))
    for i, df in enumerate(dfs):
        ax = fig.add_subplot(1, len(dfs), i+1)
        ax.spines["top"].set_visible(False)
        ax.spines["right"].set_visible(False)
        ax.set_frame_on(False)
        ax.get_xaxis().tick_bottom()
        ax.get_yaxis().tick_left()
        ax.set_xlabel("Mean Tf-Idf Score", labelpad=16, fontsize=14)
        ax.set_title("label = " + str(df.label), fontsize=16)
        ax.ticklabel_format(axis='x', style='sci', scilimits=(-2, 2))
        ax.barh(x, df.tfidf, align='center', color='#3F5D7D')
        ax.set_yticks(x)
        ax.set_ylim([-1, x[-1]+1])
        yticks = ax.set_yticklabels(df.feature)
        plt.subplots_adjust(bottom=0.09, right=0.97, left=0.15, top=0.95, wspace=0.52)
    plt.show()
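In [ ]:
# Illustrative usage of top_feats_by_class / plot_tfidf_classfeats_h (not part of
# the original run). A small random sample keeps the dense .toarray() conversion
# inside top_mean_feats manageable; features_half is assumed to come from
# vectorizer_half, and y is the 5-component GMM prediction from above.
features_half = vectorizer_half.get_feature_names()
idx = np.random.choice(dtm_half.shape[0], 2000, replace=False)
dfs_sample = top_feats_by_class(dtm_half[idx], y[idx], features_half, top_n=15)
plot_tfidf_classfeats_h(dfs_sample[:3])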
In [106]:
# give each per-component table a numeric label so the plot titles work
for i in xrange(len(dfs)):
    dfs[i].label = i
In [110]:
# plot_tfidf_classfeats_h(dfs[:3])
#
In [116]:
dff = pd.concat(dfs, axis=1)
In [127]:
dff[-700:-650]
Out[127]:
In [139]:
dff5 = pd.concat(dfs5, axis=1)
In [145]:
dff5[-500:-450]
Out[145]:
In [ ]: