``````

In [2]:

%matplotlib inline
import networkx as nx
import csv
import re
import pandas as pd
import numpy as np
try:
import statistics
from statistics import StatisticsError
except ImportError:
print('ImportError: No module named statistics? (python 3)')

import random
import math
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
import sklearn.metrics as metrics
from sklearn.decomposition import TruncatedSVD
import itertools
from sklearn import mixture

def L1 (x,y):
dist = 0
if len(x)==len(y):
for i in range(len(x)):
dist += math.fabs(x[i]-y[i])
return(dist)
else:
print('vectors must be equal length for L1')
return (None)

``````
``````

ImportError: No module named statistics? (python 3)

``````
``````

In [3]:

#!
# This code here makes the nx.Graph
G=nx.Graph()

m=0 # these two counters
n=0 # arn't important

# with open('training.1600000.processed.noemoticon.csv', encoding='latin-1') as f_in:
with open('training.1600000.processed.noemoticon.csv') as f_in:
for line in f_in:
if '@' in lineX[10][0]:
m+=1
for t in re.split('[^a-zA-Z\_\@]', lineX[10][0]):
if t!='' and t[0]=='@' and t!='@':
n+=1
if n%100000==0:
print(n)
print(nx.number_of_nodes(G))

``````
``````

100000
100000
100000
100000
100000
100000
100000
200000
200000
300000
300000
400000
500000
600000
700000
889334

``````
``````

In [4]:

len(G)

``````
``````

Out[4]:

889334

``````
``````

In [5]:

len(G.edges())

``````
``````

Out[5]:

616462

``````
``````

In [6]:

# Finding the largest connected_component
LargestCC = max(nx.connected_component_subgraphs(G), key=len) # largest connected component
print(nx.number_of_nodes(LargestCC))
# del G

``````
``````

339766

``````
``````

In [7]:

#!
# removes self-loops from the graph, this is needed to get nx.k_core
LargestCC.remove_edges_from(LargestCC.selfloop_edges())

core7 = nx.k_core(LargestCC,7)
# del LargestCC

``````
``````

In [8]:

# find the fiedler vector, and use it to partition the graph

f = nx.fiedler_vector(core7)
s = np.zeros(len(f),dtype='int')
s[f>0]=1

# this is the positions we will use for each graph
pos = nx.spring_layout(core7)

``````
``````

In [9]:

# draw partition
colors = ['#d7191c', '#2b83ba'] # red and blue
node_colors = [colors[s[v]] for v in range(nx.number_of_nodes(core7))]
nx.draw(core7,pos=pos, node_color=node_colors,node_size=10)

``````
``````

``````
``````

In [10]:

# this makes the laplacian matrix to do the spectral clustering
L = nx.laplacian_matrix(core7).todense()
w, v = np.linalg.eig(L)
v = np.array(v)
worder = np.argsort(w)

# X = v @ np.diag(w) # python 3
X = np.matmul( v , np.diag(w) )

X = X[:,worder]

``````
``````

In [11]:

# based on the graph above, k=6 was chosen. k=4 was what we were taught to
# choose, because it's the "L" in the graph. Though that didn't look good,
# so I increased k to 6.
# this runs k-means for the next code
kmeans = KMeans(init='k-means++', n_clusters=6, n_init=10)
kmeans.fit_predict(X[:,1:3])
centroids = kmeans.cluster_centers_
labels = kmeans.labels_
error = kmeans.inertia_

``````
``````

In [12]:

colors = ['#d7191c', '#ffffbf', '#2b83ba', 'green','orange','maroon']
node_colors = [colors[labels[i]] for i in range(nx.number_of_nodes(core7))]
nx.draw(core7, pos = pos, node_color=node_colors,node_size=30)

``````
``````

``````
``````

In [13]:

# Now we switch from the graphical analysis to LSA

``````
``````

In [14]:

# #!
# # this reads in the tweets
# # then simply parses user ID into ID_list
# # and the tweet text into TextList
# TextList = []
# ID_list = []
# n=0
# # with open('training.1600000.processed.noemoticon.csv', encoding='latin-1') as f_in:
# with open('training.1600000.processed.noemoticon.csv') as f_in:
#     for line in f_in:
#         TextList.append(lineX[10][0])
#         ID_list.append(lineX[8][0])
#         n=n+1
#         if n%100000==0:
#             print(n)
# print(n)

``````
``````

In [24]:

import networkx as nx

import matplotlib.pyplot as plt

deg_hist = nx.degree_histogram(G)

# plt.scatter( range(len(deg_hist)), deg_hist)
# plt.show()

fig = plt.figure()
ax = plt.gca()
ax.plot(range(len(deg_hist)),deg_hist, 'o', c='blue', alpha=0.05, markeredgecolor='none')
ax.set_yscale('log')
ax.set_xscale('log')

ax.set_xlabel('degree')
ax.set_ylabel('frequency')
ax.set_title('Degree distribution for network of mentions on twitter.')

``````
``````

Out[24]:

<matplotlib.text.Text at 0x187615590>

``````
``````

In [15]:

"""0 - the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
1 - the id of the tweet (2087)
2 - the date of the tweet (Sat May 16 23:58:44 UTC 2009)
3 - the query. If there is no query, then this value is NO_QUERY.
4 - the user that tweeted
5 - the text of the tweet"""
cols = ['polarity','tweetID','date','Query','UserID','text']

``````
``````

In [25]:

len(df['UserID'].unique())

``````
``````

Out[25]:

659775

``````
``````

In [128]:

TextListA = list(df.text)

``````
``````

In [17]:

# #!
# #takes a long time
# # vectorize TextList to dtm
# # if you can get the snowball stemmer to work, that would be useful

# vectorizer = TfidfVectorizer(stop_words='english', min_df=4,max_df=0.8)
# dtm = vectorizer.fit_transform(TextList)
# # del TextList

``````
``````

In [18]:

#!
#takes a long time
# vectorize TextList to dtm
# if you can get the snowball stemmer to work, that would be useful

vectorizer = TfidfVectorizer(stop_words='english', min_df=4,max_df=0.8)
dtm = vectorizer.fit_transform(TextListA)
del TextListA

``````
``````

In [ ]:

# vectorizer.get_feature_names()

``````
``````

In [ ]:

vectorizer.get_stop_words()

``````
``````

In [19]:

#!
# compute svd of dtm
svd = TruncatedSVD(n_components=100, n_iter=4)
svdOutput = svd.fit_transform(dtm)

``````
``````

In [39]:

svdOutput.shape

``````
``````

Out[39]:

(1600000, 25)

``````
``````

In [20]:

#!
# this is the model I went with for LSA

gmm = mixture.GMM(n_components=10, covariance_type='full')
# gmm = mixture.GaussianMixture(n_components=5, covariance_type='full')
gmm.fit(svdOutput[:,:15])
pred = gmm.predict(svdOutput[:,:15])

``````
``````

In [129]:

#!
# this is the model I went with for LSA

vectorizer_half = TfidfVectorizer(stop_words='english', min_df=10,max_df=0.5)
dtm_half = vectorizer_half.fit_transform(TextListA)

#!
# compute svd of dtm
svd_half = TruncatedSVD(n_components=50, n_iter=4)
svdOutput_half = svd_half.fit_transform(dtm_half)

gmm5 = mixture.GMM(n_components=5, covariance_type='full')
# gmm = mixture.GaussianMixture(n_components=5, covariance_type='full')
gmm5.fit(svdOutput_half[:,:15])
y = gmm5.predict(svdOutput_half[:,:15])

``````
``````

In [21]:

# this converts the GMM result from classifying tweets
# into classifying users
ID_Pred = {}
for i in range(len(ID_list)):
ID = ID_list[i]
if ID in ID_Pred:
ID_Pred[ID].append(pred[i])
else:
ID_Pred[ID]=[pred[i]]

``````
``````

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-21-a0ce786569d4> in <module>()
2 # into classifying users
3 ID_Pred = {}
----> 4 for i in range(len(ID_list)):
5     ID = ID_list[i]
6     if ID in ID_Pred:

NameError: name 'ID_list' is not defined

``````
``````

In [ ]:

# # this converts the GMM result from classifying tweets
# # into classifying users
# # this also classifies all users, not just core7
# ID_Pred = {}
# for i in range(len(ID_list)):
#     ID = ID_list[i]
#     if ID in ID_Pred:
#         ID_Pred[ID].append(pred[i])
#     else:
#         ID_Pred[ID]=[pred[i]]

# colors = ['#d7191c', '#ffffbf', '#2b83ba', 'green','orange','maroon','black']
# node_colors = []
# for g in core7: # classify the nodes, based off their tweets
#     try:
#         try: # if there is only one mode of groups, classify the user as the mode
#             X = statistics.mode(ID_Pred[g])
#             node_colors.append(colors[X])
#         except StatisticsError: # if there is no mode, pick a tweet at random, and classify the user as that tweet's group
#             node_colors.append(colors[ID_Pred[g][random.randint(0,len(ID_Pred[g])-1)]])

#     except KeyError: # if the node never tweeted (was only tweeted at)
#         node_colors.append(colors[6]) # make it black

``````
``````

In [ ]:

# draw the core7 based on LSA predictions, only to make it easier to
# compare to our spectral clustering
nx.draw(core7,pos=pos, node_color=node_colors,node_size=10)

``````
``````

In [ ]:

#!
# this gives the top terms of each eigenvector for our LSA
# the groups aren't exactly these values, but it's similar.
# you can also plot the nodes , with these eigenvectors as the axis
# being a good way to visualize the results of LSA

terms = vectorizer.get_feature_names()

for i in range(0,20):
top = np.argsort(svd.components_[i])
topterms = [terms[top[f]] for f in range(60,120)]
print()
print (i,topterms)

``````
``````

In [24]:

means = gmm.means_

``````
``````

In [136]:

gmm5.means_[0,:].shape

``````
``````

Out[136]:

(15,)

``````
``````

In [137]:

means5 = gmm5.means_
# mean5_vecs = np.array(means5)
dfs5 = []
for i in xrange(means5.shape[0]):
vec = np.zeros(100)
vec[:15] = mean_vecs[i,:]
cat = svd.inverse_transform(vec.reshape(1,-1))
top_feats = []
for i in np.argsort(cat).reshape(-1):
top_feats.append((features[i], cat.reshape(-1)[i]))
mydf = pd.DataFrame(top_feats)
mydf.columns = ['feature', 'tfidf']
dfs5.append(mydf)

``````
``````

In [44]:

mean_vecs[0,:].shape

``````
``````

Out[44]:

(15,)

``````
``````

In [50]:

vec = np.zeros(100)
vec[:15] = mean_vecs[0,:]
# vec
cat = svd.inverse_transform(vec.reshape(1,-1))

``````
``````

In [115]:

mean_vecs = np.array(means)
dfs = []
for i in xrange(10):
vec = np.zeros(100)
vec[:15] = mean_vecs[i,:]
cat = svd.inverse_transform(vec.reshape(1,-1))
top_feats = []
for i in np.argsort(cat).reshape(-1):
top_feats.append((features[i], cat.reshape(-1)[i]))
mydf = pd.DataFrame(top_feats)
mydf.columns = ['feature', 'tfidf']
dfs.append(mydf)
# return df

``````
``````

In [22]:

# Xtr = vec_pipe.fit_transform(X)
# vec = vec_pipe.named_steps['vec']

# features = vec.get_feature_names()

def top_tfidf_feats(row, features, top_n=25):
''' Get top n tfidf values in row and return them with their corresponding feature names.'''
topn_ids = np.argsort(row)[::-1][:top_n]
top_feats = [(features[i], row[i]) for i in topn_ids]
df = pd.DataFrame(top_feats)
df.columns = ['feature', 'tfidf']
return df

``````
``````

In [ ]:

def top_mean_feats(Xtr, features, grp_ids=None, min_tfidf=0.1, top_n=25):
''' Return the top n features that on average are most important amongst documents in rows
indentified by indices in grp_ids. '''
if grp_ids:
D = Xtr[grp_ids].toarray()
else:
D = Xtr.toarray()

D[D < min_tfidf] = 0
tfidf_means = np.mean(D, axis=0)

``````
``````

In [ ]:

def top_feats_by_class(Xtr, y, features, min_tfidf=0.1, top_n=25):
''' Return a list of dfs, where each df holds top_n features and their mean tfidf value
calculated across documents with the same class label. '''
dfs = []
labels = np.unique(y)
for label in labels:
ids = np.where(y==label)
feats_df = top_mean_feats(Xtr, features, ids, min_tfidf=min_tfidf, top_n=top_n)
feats_df.label = label
dfs.append(feats_df)
return dfs

``````
``````

In [28]:

def plot_tfidf_classfeats_h(dfs):
''' Plot the data frames returned by the function plot_tfidf_classfeats(). '''
fig = plt.figure(figsize=(12, 9), facecolor="w")
x = np.arange(len(dfs[0]))
for i, df in enumerate(dfs):
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.set_frame_on(False)
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.set_title("label = " + str(df.label), fontsize=16)
ax.ticklabel_format(axis='x', style='sci', scilimits=(-2,2))
ax.barh(x, df.tfidf, align='center', color='#3F5D7D')
ax.set_yticks(x)
ax.set_ylim([-1, x[-1]+1])
yticks = ax.set_yticklabels(df.feature)
plt.show()

``````
``````

In [32]:

``````
``````

In [106]:

for i in xrange(len(dfs)):
dfs[i].label = i

``````
``````

In [110]:

# plot_tfidf_classfeats_h(dfs[:3])
#

``````
``````

In [116]:

dff = pd.concat(dfs, axis=1)

``````
``````

In [127]:

dff[-700:-650]

``````
``````

Out[127]:

feature
tfidf
feature
tfidf
feature
tfidf
feature
tfidf
feature
tfidf
feature
tfidf
feature
tfidf
feature
tfidf
feature
tfidf
feature
tfidf

101548
plans
0.000506
0.000421
dress
0.000200
ill
0.000435
24
0.000564
sunny
0.000404
lakers
0.000624
mileycyrus
0.000568
wondering
0.000350
tooo
0.000474

101549
alright
0.000506
needed
0.000421
simple
0.000200
forever
0.000438
demi
0.000565
meant
0.000404
different
0.000625
lakers
0.000569
planned
0.000351
doesnt
0.000474

101550
taken
0.000507
paid
0.000422
hun
0.000200
season
0.000438
record
0.000566
fb
0.000404
xo
0.000626
fact
0.000571
single
0.000354
holidays
0.000476

101551
airport
0.000507
easy
0.000422
kill
0.000201
babygirlparis
0.000438
soooo
0.000567
number
0.000404
starts
0.000626
team
0.000571
whats
0.000355
cd
0.000477

101552
paper
0.000508
doctors
0.000423
liked
0.000202
hanging
0.000439
random
0.000567
def
0.000405
save
0.000627
hates
0.000571
0.000356
drunk
0.000477

101553
card
0.000509
walking
0.000423
tummy
0.000202
question
0.000439
eh
0.000568
single
0.000406
film
0.000627
hmmm
0.000572
agree
0.000356
dogs
0.000478

101554
luck
0.000509
says
0.000423
camera
0.000202
brazil
0.000440
breakfast
0.000569
thoughts
0.000406
worried
0.000627
0.000573
email
0.000357
cried
0.000478

101555
band
0.000509
cos
0.000424
lately
0.000203
ang
0.000440
join
0.000569
tuesday
0.000406
deal
0.000632
goodbye
0.000573
isnt
0.000358
blackberry
0.000479

101556
worked
0.000510
bummer
0.000424
relax
0.000203
dark
0.000440
bet
0.000569
smell
0.000406
huh
0.000633
bummer
0.000573
0.000358
isnt
0.000479

101557
slow
0.000510
straight
0.000424
confused
0.000203
wasn
0.000441
evening
0.000570
woo
0.000406
met
0.000633
moon
0.000574
staying
0.000362
park
0.000479

101558
lately
0.000513
knew
0.000425
using
0.000203
heard
0.000441
vegas
0.000570
knows
0.000408
wtf
0.000633
chill
0.000576
meetings
0.000363
0.000479

101559
fat
0.000515
broken
0.000425
huge
0.000203
4officeautomation
0.000441
eye
0.000572
keeping
0.000408
fix
0.000633
eye
0.000576
met
0.000364
spent
0.000480

101560
nite
0.000515
calling
0.000425
played
0.000203
emailunlimited
0.000441
wine
0.000572
store
0.000408
usually
0.000633
argh
0.000576
mr
0.000364
air
0.000481

101561
books
0.000515
0.000426
son
0.000203
scary
0.000442
magic
0.000572
waking
0.000409
email
0.000634
drinking
0.000577
cut
0.000364
thanks
0.000481

101562
pay
0.000516
nyc
0.000426
shall
0.000204
sent
0.000443
concert
0.000573
green
0.000410
shame
0.000634
cd
0.000578
eyes
0.000365
plane
0.000481

101563
dnt
0.000516
throat
0.000427
goodbye
0.000204
favourite
0.000443
wear
0.000574
30
0.000411
learn
0.000634
cake
0.000579
positive
0.000366
philippines
0.000483

101564
hugs
0.000519
wouldn
0.000427
airport
0.000205
didnt
0.000445
inside
0.000575
earlier
0.000412
peace
0.000635
suppose
0.000580
throat
0.000366
staying
0.000484

101565
coz
0.000519
ate
0.000427
jus
0.000205
box
0.000447
trip
0.000575
jon
0.000414
bro
0.000635
paid
0.000583
nights
0.000367
listen
0.000487

101566
mtv
0.000520
worked
0.000427
hmmm
0.000206
peace
0.000447
named
0.000576
0.000414
0.000637
tweeps
0.000584
date
0.000367
hahah
0.000488

101567
lovely
0.000520
forget
0.000428
starts
0.000206
pc
0.000448
scary
0.000577
telling
0.000415
mate
0.000637
goodmorning
0.000584
todays
0.000367
absolutely
0.000488

101568
posted
0.000521
watched
0.000428
kid
0.000206
mood
0.000448
proud
0.000577
yummy
0.000416
ohh
0.000638
lesson
0.000585
tour
0.000368
33
0.000489

101569
inside
0.000521
finishing
0.000428
episode
0.000206
dougiemcfly
0.000449
afternoon
0.000577
hurts
0.000417
word
0.000638
kid
0.000587
parents
0.000369
england
0.000489

101570
holiday
0.000522
problem
0.000428
warm
0.000206
kid
0.000450
thx
0.000578
fast
0.000417
airport
0.000639
cancelled
0.000588
planning
0.000370
email
0.000489

101571
uk
0.000523
aw
0.000429
photo
0.000207
wear
0.000450
hahahaha
0.000579
lately
0.000419
called
0.000640
star
0.000588
0.000370
wtf
0.000489

101572
pissed
0.000524
red
0.000431
scary
0.000207
write
0.000450
shall
0.000579
shoutout
0.000422
chill
0.000641
allergies
0.000588
entire
0.000371
hes
0.000490

101573
share
0.000525
mommy
0.000431
wear
0.000208
15
0.000453
died
0.000580
walk
0.000422
sooooo
0.000641
sent
0.000588
problem
0.000372
guitar
0.000490

101574
ouch
0.000526
person
0.000431
wednesday
0.000208
lonely
0.000453
mommy
0.000580
freaking
0.000423
thoughts
0.000641
hugs
0.000592
online
0.000372
0.000491

101575
huh
0.000527
outta
0.000432
mother
0.000208
eye
0.000454
mum
0.000580
country
0.000423
team
0.000642
miles
0.000593
filled
0.000372
girlfriend
0.000491

101576
blackberry
0.000528
grandma
0.000433
exactly
0.000209
0.000454
final
0.000581
drunk
0.000424
sweetie
0.000642
pass
0.000595
hospital
0.000372
sleepy
0.000491

101577
drinking
0.000530
hmmm
0.000433
tom
0.000209
iâ
0.000456
huh
0.000581
peeps
0.000425
nearly
0.000642
0.000596
sit
0.000373
tear
0.000492

101578
save
0.000531
pack
0.000435
decided
0.000209
sky
0.000456
turned
0.000581
chris
0.000425
mmm
0.000643
0.000597
choice
0.000373
weekends
0.000492

101579
drunk
0.000532
ended
0.000436
isnt
0.000209
misses
0.000457
speak
0.000582
reminder
0.000425
tickets
0.000643
fantastic
0.000598
0.000374
bbq
0.000494

101580
fact
0.000535
tweeps
0.000437
gettin
0.000209
0.000457
bar
0.000582
plans
0.000425
voice
0.000643
pic
0.000598
spring
0.000374
0.000495

101581
normal
0.000535
clothes
0.000437
cos
0.000210
currently
0.000458
reminds
0.000583
starting
0.000428
cup
0.000645
double
0.000600
body
0.000374
study
0.000496

101582
hanging
0.000536
yep
0.000438
uk
0.000210
souljaboytellem
0.000458
test
0.000584
mac
0.000428
looked
0.000645
zoo
0.000601
0.000376
yo
0.000496

101583
0.000537
killing
0.000438
terrible
0.000210
tiny
0.000460
watchin
0.000584
quote
0.000429
question
0.000646
sucked
0.000601
sweetie
0.000377
officially
0.000496

101584
small
0.000537
8am
0.000439
fair
0.000210
angels
0.000460
sexy
0.000589
jk
0.000430
0.000648
longer
0.000602
walking
0.000377
lame
0.000496

101585
feet
0.000538
shall
0.000440
0.000210
wont
0.000461
beer
0.000590
half
0.000431
anyways
0.000648
ppl
0.000602
chance
0.000378
relaxing
0.000497

101586
isnt
0.000538
favorite
0.000440
da
0.000211
luv
0.000461
hannah
0.000593
stuck
0.000431
liked
0.000648
worth
0.000602
suck
0.000378
ohh
0.000497

101587
dress
0.000542
booo
0.000441
enjoyed
0.000211
bring
0.000462
sense
0.000594
kevin
0.000432
taste
0.000649
brothers
0.000604
quiet
0.000379
sam
0.000497

101588
changed
0.000543
100
0.000441
different
0.000211
plane
0.000462
posted
0.000596
version
0.000432
site
0.000649
date
0.000604
following
0.000380
kate
0.000498

101589
favorite
0.000545
wonder
0.000441
sims
0.000212
france
0.000463
dreams
0.000597
tickets
0.000432
interview
0.000650
mommy
0.000609
wife
0.000381
date
0.000499

101590
0.000545
double
0.000441
fix
0.000212
available
0.000463
0.000597
weekends
0.000432
eh
0.000653
die
0.000610
luv
0.000381
club
0.000500

101591
played
0.000546
prom
0.000444
exciting
0.000212
looked
0.000463
normal
0.000597
doesnt
0.000434
lazy
0.000653
song
0.000610
lord
0.000382
hmm
0.000500

101592
deal
0.000546
math
0.000445
finals
0.000213
000
0.000463
style
0.000598
fav
0.000435
0.000655
ones
0.000610
nyc
0.000383
slow
0.000500

101593
apparently
0.000546
suppose
0.000445
aren
0.000213
1st
0.000463
bed
0.000598
raining
0.000437
easy
0.000657
looked
0.000612
wont
0.000383
misses
0.000501

101594
hold
0.000546
tonite
0.000446
hospital
0.000214
pls
0.000464
hand
0.000598
goin
0.000439
thx
0.000658
boyfriend
0.000613
thx
0.000385
crap
0.000501

101595
15
0.000547
pics
0.000446
nights
0.000214
30
0.000465
fat
0.000599
lets
0.000440
ate
0.000658
gave
0.000616
picture
0.000386
333
0.000501

101596
passed
0.000548
freaking
0.000446
star
0.000214
yea
0.000467
luv
0.000599
aren
0.000440
terrible
0.000658
passed
0.000618
0.000388
update
0.000501

101597
bet
0.000550
moment
0.000446
realized
0.000215
0.000467
series
0.000599
worked
0.000440
son
0.000664
sound
0.000618
bike
0.000389
writing
0.000501

``````
``````

In [139]:

dff5 = pd.concat(dfs5, axis=1)

``````
``````

In [145]:

dff5[-500:-450]

``````
``````

Out[145]:

feature
tfidf
feature
tfidf
feature
tfidf
feature
tfidf
feature
tfidf

101748
slept
0.000726
quite
0.000596
cleaning
0.000289
mr
0.000593
busy
0.000749

101749
12
0.000729
site
0.000602
ipod
0.000289
design
0.000594
eating
0.000751

101750
fair
0.000730
post
0.000606
evening
0.000290
ng
0.000595
store
0.000754

101751
exactly
0.000731
figure
0.000607
test
0.000290
course
0.000595
goodbye
0.000756

101752
saturday
0.000733
moving
0.000607
site
0.000290
0.000596
minutes
0.000757

101753
album
0.000733
kill
0.000608
11
0.000290
air
0.000596
weather
0.000758

101754
test
0.000735
season
0.000609
congrats
0.000291
cold
0.000598
rest
0.000759

101755
broken
0.000736
relax
0.000610
upset
0.000291
park
0.000600
cd
0.000763

101756
mum
0.000737
woo
0.000610
flu
0.000291
laptop
0.000606
weeks
0.000763

101757
running
0.000738
ipod
0.000610
tour
0.000291
reason
0.000606
june
0.000764

101758
lil
0.000740
cut
0.000611
short
0.000293
david
0.000607
ipod
0.000765

101759
town
0.000741
bummed
0.000611
throat
0.000294
em
0.000609
played
0.000770

101760
worst
0.000743
decided
0.000611
figure
0.000296
hit
0.000612
air
0.000771

101761
interesting
0.000743
fair
0.000613
spent
0.000299
ones
0.000613
worth
0.000773

101762
driving
0.000746
months
0.000617
rock
0.000299
white
0.000616
computer
0.000778

101763
fail
0.000747
sims
0.000617
spending
0.000300
web
0.000616
country
0.000779

101764
red
0.000748
lame
0.000617
idk
0.000300
app
0.000618
awww
0.000780

101765
definitely
0.000753
ahhh
0.000617
horrible
0.000301
stupid
0.000619
month
0.000781

101766
joined
0.000759
lake
0.000617
interesting
0.000301
run
0.000621
wonderful
0.000783

101767
water
0.000759
wear
0.000621
la
0.000302
cutest
0.000621
light
0.000784

101768
fell
0.000761
boys
0.000621
problem
0.000302
click
0.000623
button
0.000784

101769
lots
0.000762
comes
0.000622
clean
0.000303
vid
0.000625
happen
0.000785

101770
learn
0.000763
ago
0.000624
understand
0.000303
ride
0.000625
die
0.000786

101771
past
0.000763
bbq
0.000625
xxx
0.000304
forgot
0.000626
sooo
0.000787

101772
20
0.000764
lost
0.000632
laptop
0.000304
user
0.000626
vote
0.000790

101773
nope
0.000769
hahaha
0.000633
mood
0.000304
php
0.000626
son
0.000791

101774
moon
0.000771
plan
0.000635
bought
0.000305
month
0.000627
lunch
0.000793

101775
set
0.000774
video
0.000636
past
0.000305
goodbye
0.000627
case
0.000795

101776
clean
0.000776
boyfriend
0.000637
lil
0.000306
computer
0.000628
cat
0.000795

101777
figure
0.000781
drink
0.000640
worst
0.000306
close
0.000631
starting
0.000798

101778
turn
0.000783
makes
0.000642
pool
0.000306
20
0.000632
hmm
0.000799

101779
knew
0.000784
mum
0.000642
scared
0.000307
support
0.000635
outside
0.000800

101780
bout
0.000787
change
0.000648
awards
0.000307
6shtr
0.000635
club
0.000803

101781
wtf
0.000788
dance
0.000650
felt
0.000308
shopping
0.000640
text
0.000803

101782
plan
0.000789
date
0.000656
open
0.000308
6q1om
0.000640
ppl
0.000804

101783
songs
0.000790
nights
0.000656
lonely
0.000309
bing
0.000641
da
0.000808

101784
birthday
0.000794
worst
0.000657
tinyurl
0.000310
works
0.000649
rain
0.000809

101785
close
0.000794
camp
0.000658
tweeting
0.000311
loopt
0.000651
wtf
0.000811

101786
0.000796
longer
0.000659
visit
0.000311
hear
0.000654
itunes
0.000812

101787
ahh
0.000799
sit
0.000661
0.000311
wishes
0.000655
wife
0.000815

101788
moment
0.000799
20
0.000665
eyes
0.000312
account
0.000658
close
0.000822

101789
0.000800
bike
0.000666
0.000313
early
0.000659
john
0.000827

101790
english
0.000800
broke
0.000667
album
0.000315
couldn
0.000660
click
0.000828

101791
high
0.000801
brother
0.000668
turn
0.000316
0.000662
wondering
0.000829

101792
lonely
0.000802
ahh
0.000670
cut
0.000316
mac
0.000662
till
0.000830

101793
office
0.000803
staying
0.000671
0.000316
trailer
0.000664
internet
0.000832

101794
open
0.000806
online
0.000672
fast
0.000316
sucks
0.000665
knew
0.000832

101795
months
0.000810
wondering
0.000674
enjoying
0.000317
dogbook
0.000666
cover
0.000832

101796
busy
0.000812
uni
0.000675
high
0.000317
fans
0.000667
sign
0.000832

101797
parents
0.000813
win
0.000676
reason
0.000317
group
0.000668
dinner
0.000833

``````
``````

In [ ]:

``````