Only the important stuff-Copy1



In [2]:
%matplotlib inline
import networkx as nx
import csv
import re
import pandas as pd
import numpy as np
try:
    import statistics
    from statistics import StatisticsError
except ImportError:
    print('ImportError: No module named statistics? (python 3)')

import random
import math
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
import sklearn.metrics as metrics
from sklearn.decomposition import TruncatedSVD
import itertools
from sklearn import mixture


def L1 (x,y):
    dist = 0
    if len(x)==len(y):
        for i in range(len(x)):
            dist += math.fabs(x[i]-y[i])
        return(dist)
    else:
        print('vectors must be equal length for L1')
        return (None)


ImportError: No module named statistics? (python 3)

In [3]:
#!
# This code here makes the nx.Graph
G=nx.Graph()

m=0 # these two counters 
n=0 # arn't important

# with open('training.1600000.processed.noemoticon.csv', encoding='latin-1') as f_in:
with open('training.1600000.processed.noemoticon.csv') as f_in:
    for line in f_in: 
        lineX = list(csv.reader(line, skipinitialspace=True))
        G.add_node(lineX[8][0])
        if '@' in lineX[10][0]:
            m+=1
            for t in re.split('[^a-zA-Z\_\@]', lineX[10][0]):
                if t!='' and t[0]=='@' and t!='@':
                    G.add_edge(lineX[8][0],t[1:])
                    n+=1
        if n%100000==0:
            print(n)
print(nx.number_of_nodes(G))


100000
100000
100000
100000
100000
100000
100000
200000
200000
300000
300000
400000
500000
600000
700000
889334

In [4]:
len(G)


Out[4]:
889334

In [5]:
len(G.edges())


Out[5]:
616462

In [6]:
# Finding the largest connected_component
LargestCC = max(nx.connected_component_subgraphs(G), key=len) # largest connected component
print(nx.number_of_nodes(LargestCC))
# del G


339766

In [7]:
#!
# removes self-loops from the graph, this is needed to get nx.k_core
LargestCC.remove_edges_from(LargestCC.selfloop_edges())

core7 = nx.k_core(LargestCC,7)
# del LargestCC

In [8]:
# find the fiedler vector, and use it to partition the graph

f = nx.fiedler_vector(core7)
s = np.zeros(len(f),dtype='int')
s[f>0]=1

# this is the positions we will use for each graph
pos = nx.spring_layout(core7)

In [9]:
# draw partition
colors = ['#d7191c', '#2b83ba'] # red and blue
node_colors = [colors[s[v]] for v in range(nx.number_of_nodes(core7))]
nx.draw(core7,pos=pos, node_color=node_colors,node_size=10)



In [10]:
# this makes the laplacian matrix to do the spectral clustering
L = nx.laplacian_matrix(core7).todense()
w, v = np.linalg.eig(L)
v = np.array(v)
worder = np.argsort(w)

# X = v @ np.diag(w) # python 3
X = np.matmul( v , np.diag(w) )
    
X = X[:,worder]

In [11]:
# based on the graph above, k=6 was chosen. k=4 was what we were taught to
# choose, because it's the "L" in the graph. Though that didn't look good,
# so I increased k to 6.
# this runs k-means for the next code
kmeans = KMeans(init='k-means++', n_clusters=6, n_init=10)
kmeans.fit_predict(X[:,1:3])
centroids = kmeans.cluster_centers_
labels = kmeans.labels_
error = kmeans.inertia_

In [12]:
colors = ['#d7191c', '#ffffbf', '#2b83ba', 'green','orange','maroon']
node_colors = [colors[labels[i]] for i in range(nx.number_of_nodes(core7))]
nx.draw(core7, pos = pos, node_color=node_colors,node_size=30)



In [13]:
# Now we switch from the graphical analysis to LSA

In [14]:
# #!
# # this reads in the tweets
# # then simply parses user ID into ID_list
# # and the tweet text into TextList
# TextList = []
# ID_list = []
# n=0
# # with open('training.1600000.processed.noemoticon.csv', encoding='latin-1') as f_in:
# with open('training.1600000.processed.noemoticon.csv') as f_in:
#     for line in f_in:
#         lineX = list(csv.reader(line, skipinitialspace=True))
#         TextList.append(lineX[10][0])
#         ID_list.append(lineX[8][0])
#         n=n+1
#         if n%100000==0:
#             print(n)
# print(n)

In [24]:
import networkx as nx

import matplotlib.pyplot as plt

deg_hist = nx.degree_histogram(G)

# plt.scatter( range(len(deg_hist)), deg_hist)
# plt.show()

fig = plt.figure()
ax = plt.gca()
ax.plot(range(len(deg_hist)),deg_hist, 'o', c='blue', alpha=0.05, markeredgecolor='none')
ax.set_yscale('log')
ax.set_xscale('log')

ax.set_xlabel('degree')
ax.set_ylabel('frequency')
ax.set_title('Degree distribution for network of mentions on twitter.')


Out[24]:
<matplotlib.text.Text at 0x187615590>

In [15]:
"""0 - the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
1 - the id of the tweet (2087)
2 - the date of the tweet (Sat May 16 23:58:44 UTC 2009)
3 - the query. If there is no query, then this value is NO_QUERY.
4 - the user that tweeted
5 - the text of the tweet"""
cols = ['polarity','tweetID','date','Query','UserID','text']

df = pd.read_csv('training.1600000.processed.noemoticon.csv',names=cols,encoding='latin-1')#names=m_cols ,

In [25]:
len(df['UserID'].unique())


Out[25]:
659775

In [128]:
TextListA = list(df.text)

In [17]:
# #!
# #takes a long time
# # vectorize TextList to dtm
# # if you can get the snowball stemmer to work, that would be useful

# vectorizer = TfidfVectorizer(stop_words='english', min_df=4,max_df=0.8)
# dtm = vectorizer.fit_transform(TextList)
# # del TextList

In [18]:
#!
#takes a long time
# vectorize TextList to dtm
# if you can get the snowball stemmer to work, that would be useful

vectorizer = TfidfVectorizer(stop_words='english', min_df=4,max_df=0.8)
dtm = vectorizer.fit_transform(TextListA)
del TextListA

In [ ]:
# vectorizer.get_feature_names()

In [ ]:
vectorizer.get_stop_words()

In [19]:
#!
# compute svd of dtm
svd = TruncatedSVD(n_components=100, n_iter=4)
svdOutput = svd.fit_transform(dtm)

In [39]:
svdOutput.shape


Out[39]:
(1600000, 25)

In [20]:
#!
# this is the model I went with for LSA

gmm = mixture.GMM(n_components=10, covariance_type='full')
# gmm = mixture.GaussianMixture(n_components=5, covariance_type='full')
gmm.fit(svdOutput[:,:15])
pred = gmm.predict(svdOutput[:,:15])

In [129]:
#!
# this is the model I went with for LSA

vectorizer_half = TfidfVectorizer(stop_words='english', min_df=10,max_df=0.5)
dtm_half = vectorizer_half.fit_transform(TextListA)

#!
# compute svd of dtm
svd_half = TruncatedSVD(n_components=50, n_iter=4)
svdOutput_half = svd_half.fit_transform(dtm_half)

gmm5 = mixture.GMM(n_components=5, covariance_type='full')
# gmm = mixture.GaussianMixture(n_components=5, covariance_type='full')
gmm5.fit(svdOutput_half[:,:15])
y = gmm5.predict(svdOutput_half[:,:15])

In [21]:
# this converts the GMM result from classifying tweets
# into classifying users
ID_Pred = {}
for i in range(len(ID_list)):
    ID = ID_list[i]
    if ID in ID_Pred:
        ID_Pred[ID].append(pred[i])
    else:
        ID_Pred[ID]=[pred[i]]


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-21-a0ce786569d4> in <module>()
      2 # into classifying users
      3 ID_Pred = {}
----> 4 for i in range(len(ID_list)):
      5     ID = ID_list[i]
      6     if ID in ID_Pred:

NameError: name 'ID_list' is not defined

In [ ]:
# # this converts the GMM result from classifying tweets
# # into classifying users
# # this also classifies all users, not just core7
# ID_Pred = {}
# for i in range(len(ID_list)):
#     ID = ID_list[i]
#     if ID in ID_Pred:
#         ID_Pred[ID].append(pred[i])
#     else:
#         ID_Pred[ID]=[pred[i]]
        
# colors = ['#d7191c', '#ffffbf', '#2b83ba', 'green','orange','maroon','black']
# node_colors = []
# for g in core7: # classify the nodes, based off their tweets
#     try:
#         try: # if there is only one mode of groups, classify the user as the mode
#             X = statistics.mode(ID_Pred[g])
#             node_colors.append(colors[X])
#         except StatisticsError: # if there is no mode, pick a tweet at random, and classify the user as that tweet's group
#             node_colors.append(colors[ID_Pred[g][random.randint(0,len(ID_Pred[g])-1)]])
            
#     except KeyError: # if the node never tweeted (was only tweeted at)
#         node_colors.append(colors[6]) # make it black

In [ ]:
# draw the core7 based on LSA predictions, only to make it easier to
# compare to our spectral clustering
nx.draw(core7,pos=pos, node_color=node_colors,node_size=10)

In [ ]:
#!
# this gives the top terms of each eigenvector for our LSA
# the groups aren't exactly these values, but it's similar.
# you can also plot the nodes , with these eigenvectors as the axis
# being a good way to visualize the results of LSA

terms = vectorizer.get_feature_names()

for i in range(0,20):
    top = np.argsort(svd.components_[i])
    topterms = [terms[top[f]] for f in range(60,120)]
    print()
    print (i,topterms)

In [24]:
means = gmm.means_

In [136]:
gmm5.means_[0,:].shape


Out[136]:
(15,)

In [137]:
means5 = gmm5.means_
# mean5_vecs = np.array(means5)
dfs5 = []
for i in xrange(means5.shape[0]):
    vec = np.zeros(100)
    vec[:15] = mean_vecs[i,:]
    cat = svd.inverse_transform(vec.reshape(1,-1))
    top_feats = []
    for i in np.argsort(cat).reshape(-1):
        top_feats.append((features[i], cat.reshape(-1)[i]))
    mydf = pd.DataFrame(top_feats)
    mydf.columns = ['feature', 'tfidf']
    dfs5.append(mydf)

In [44]:
mean_vecs[0,:].shape


Out[44]:
(15,)

In [50]:
vec = np.zeros(100)
vec[:15] = mean_vecs[0,:]
# vec
cat = svd.inverse_transform(vec.reshape(1,-1))

In [115]:
mean_vecs = np.array(means)
dfs = []
for i in xrange(10):
    vec = np.zeros(100)
    vec[:15] = mean_vecs[i,:]
    cat = svd.inverse_transform(vec.reshape(1,-1))
    top_feats = []
    for i in np.argsort(cat).reshape(-1):
        top_feats.append((features[i], cat.reshape(-1)[i]))
    mydf = pd.DataFrame(top_feats)
    mydf.columns = ['feature', 'tfidf']
    dfs.append(mydf)
# return df

In [22]:
# Xtr = vec_pipe.fit_transform(X)
# vec = vec_pipe.named_steps['vec']



# features = vec.get_feature_names()

def top_tfidf_feats(row, features, top_n=25):
    ''' Get top n tfidf values in row and return them with their corresponding feature names.'''
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    df = pd.DataFrame(top_feats)
    df.columns = ['feature', 'tfidf']
    return df

In [ ]:
def top_mean_feats(Xtr, features, grp_ids=None, min_tfidf=0.1, top_n=25):
    ''' Return the top n features that on average are most important amongst documents in rows
        indentified by indices in grp_ids. '''
    if grp_ids:
        D = Xtr[grp_ids].toarray()
    else:
        D = Xtr.toarray()

    D[D < min_tfidf] = 0
    tfidf_means = np.mean(D, axis=0)
    return top_tfidf_feats(tfidf_means, features, top_n)

In [ ]:
def top_feats_by_class(Xtr, y, features, min_tfidf=0.1, top_n=25):
    ''' Return a list of dfs, where each df holds top_n features and their mean tfidf value
        calculated across documents with the same class label. '''
    dfs = []
    labels = np.unique(y)
    for label in labels:
        ids = np.where(y==label)
        feats_df = top_mean_feats(Xtr, features, ids, min_tfidf=min_tfidf, top_n=top_n)
        feats_df.label = label
        dfs.append(feats_df)
    return dfs

In [28]:
def plot_tfidf_classfeats_h(dfs):
    ''' Plot the data frames returned by the function plot_tfidf_classfeats(). '''
    fig = plt.figure(figsize=(12, 9), facecolor="w")
    x = np.arange(len(dfs[0]))
    for i, df in enumerate(dfs):
        ax = fig.add_subplot(1, len(dfs), i+1)
        ax.spines["top"].set_visible(False)
        ax.spines["right"].set_visible(False)
        ax.set_frame_on(False)
        ax.get_xaxis().tick_bottom()
        ax.get_yaxis().tick_left()
        ax.set_xlabel("Mean Tf-Idf Score", labelpad=16, fontsize=14)
        ax.set_title("label = " + str(df.label), fontsize=16)
        ax.ticklabel_format(axis='x', style='sci', scilimits=(-2,2))
        ax.barh(x, df.tfidf, align='center', color='#3F5D7D')
        ax.set_yticks(x)
        ax.set_ylim([-1, x[-1]+1])
        yticks = ax.set_yticklabels(df.feature)
        plt.subplots_adjust(bottom=0.09, right=0.97, left=0.15, top=0.95, wspace=0.52)
    plt.show()

In [32]:


In [106]:
for i in xrange(len(dfs)):
    dfs[i].label = i

In [110]:
# plot_tfidf_classfeats_h(dfs[:3])
#

In [116]:
dff = pd.concat(dfs, axis=1)

In [127]:
dff[-700:-650]


Out[127]:
feature tfidf feature tfidf feature tfidf feature tfidf feature tfidf feature tfidf feature tfidf feature tfidf feature tfidf feature tfidf
101548 plans 0.000506 road 0.000421 dress 0.000200 ill 0.000435 24 0.000564 sunny 0.000404 lakers 0.000624 mileycyrus 0.000568 wondering 0.000350 tooo 0.000474
101549 alright 0.000506 needed 0.000421 simple 0.000200 forever 0.000438 demi 0.000565 meant 0.000404 different 0.000625 lakers 0.000569 planned 0.000351 doesnt 0.000474
101550 taken 0.000507 paid 0.000422 hun 0.000200 season 0.000438 record 0.000566 fb 0.000404 xo 0.000626 fact 0.000571 single 0.000354 holidays 0.000476
101551 airport 0.000507 easy 0.000422 kill 0.000201 babygirlparis 0.000438 soooo 0.000567 number 0.000404 starts 0.000626 team 0.000571 whats 0.000355 cd 0.000477
101552 paper 0.000508 doctors 0.000423 liked 0.000202 hanging 0.000439 random 0.000567 def 0.000405 save 0.000627 hates 0.000571 twitterville 0.000356 drunk 0.000477
101553 card 0.000509 walking 0.000423 tummy 0.000202 question 0.000439 eh 0.000568 single 0.000406 film 0.000627 hmmm 0.000572 agree 0.000356 dogs 0.000478
101554 luck 0.000509 says 0.000423 camera 0.000202 brazil 0.000440 breakfast 0.000569 thoughts 0.000406 worried 0.000627 facebook 0.000573 email 0.000357 cried 0.000478
101555 band 0.000509 cos 0.000424 lately 0.000203 ang 0.000440 join 0.000569 tuesday 0.000406 deal 0.000632 goodbye 0.000573 isnt 0.000358 blackberry 0.000479
101556 worked 0.000510 bummer 0.000424 relax 0.000203 dark 0.000440 bet 0.000569 smell 0.000406 huh 0.000633 bummer 0.000573 dead 0.000358 isnt 0.000479
101557 slow 0.000510 straight 0.000424 confused 0.000203 wasn 0.000441 evening 0.000570 woo 0.000406 met 0.000633 moon 0.000574 staying 0.000362 park 0.000479
101558 lately 0.000513 knew 0.000425 using 0.000203 heard 0.000441 vegas 0.000570 knows 0.000408 wtf 0.000633 chill 0.000576 meetings 0.000363 dead 0.000479
101559 fat 0.000515 broken 0.000425 huge 0.000203 4officeautomation 0.000441 eye 0.000572 keeping 0.000408 fix 0.000633 eye 0.000576 met 0.000364 spent 0.000480
101560 nite 0.000515 calling 0.000425 played 0.000203 emailunlimited 0.000441 wine 0.000572 store 0.000408 usually 0.000633 argh 0.000576 mr 0.000364 air 0.000481
101561 books 0.000515 business 0.000426 son 0.000203 scary 0.000442 magic 0.000572 waking 0.000409 email 0.000634 drinking 0.000577 cut 0.000364 thanks 0.000481
101562 pay 0.000516 nyc 0.000426 shall 0.000204 sent 0.000443 concert 0.000573 green 0.000410 shame 0.000634 cd 0.000578 eyes 0.000365 plane 0.000481
101563 dnt 0.000516 throat 0.000427 goodbye 0.000204 favourite 0.000443 wear 0.000574 30 0.000411 learn 0.000634 cake 0.000579 positive 0.000366 philippines 0.000483
101564 hugs 0.000519 wouldn 0.000427 airport 0.000205 didnt 0.000445 inside 0.000575 earlier 0.000412 peace 0.000635 suppose 0.000580 throat 0.000366 staying 0.000484
101565 coz 0.000519 ate 0.000427 jus 0.000205 box 0.000447 trip 0.000575 jon 0.000414 bro 0.000635 paid 0.000583 nights 0.000367 listen 0.000487
101566 mtv 0.000520 worked 0.000427 hmmm 0.000206 peace 0.000447 named 0.000576 sadly 0.000414 sadly 0.000637 tweeps 0.000584 date 0.000367 hahah 0.000488
101567 lovely 0.000520 forget 0.000428 starts 0.000206 pc 0.000448 scary 0.000577 telling 0.000415 mate 0.000637 goodmorning 0.000584 todays 0.000367 absolutely 0.000488
101568 posted 0.000521 watched 0.000428 kid 0.000206 mood 0.000448 proud 0.000577 yummy 0.000416 ohh 0.000638 lesson 0.000585 tour 0.000368 33 0.000489
101569 inside 0.000521 finishing 0.000428 episode 0.000206 dougiemcfly 0.000449 afternoon 0.000577 hurts 0.000417 word 0.000638 kid 0.000587 parents 0.000369 england 0.000489
101570 holiday 0.000522 problem 0.000428 warm 0.000206 kid 0.000450 thx 0.000578 fast 0.000417 airport 0.000639 cancelled 0.000588 planning 0.000370 email 0.000489
101571 uk 0.000523 aw 0.000429 photo 0.000207 wear 0.000450 hahahaha 0.000579 lately 0.000419 called 0.000640 star 0.000588 twitterland 0.000370 wtf 0.000489
101572 pissed 0.000524 red 0.000431 scary 0.000207 write 0.000450 shall 0.000579 shoutout 0.000422 chill 0.000641 allergies 0.000588 entire 0.000371 hes 0.000490
101573 share 0.000525 mommy 0.000431 wear 0.000208 15 0.000453 died 0.000580 walk 0.000422 sooooo 0.000641 sent 0.000588 problem 0.000372 guitar 0.000490
101574 ouch 0.000526 person 0.000431 wednesday 0.000208 lonely 0.000453 mommy 0.000580 freaking 0.000423 thoughts 0.000641 hugs 0.000592 online 0.000372 road 0.000491
101575 huh 0.000527 outta 0.000432 mother 0.000208 eye 0.000454 mum 0.000580 country 0.000423 team 0.000642 miles 0.000593 filled 0.000372 girlfriend 0.000491
101576 blackberry 0.000528 grandma 0.000433 exactly 0.000209 headache 0.000454 final 0.000581 drunk 0.000424 sweetie 0.000642 pass 0.000595 hospital 0.000372 sleepy 0.000491
101577 drinking 0.000530 hmmm 0.000433 tom 0.000209 0.000456 huh 0.000581 peeps 0.000425 nearly 0.000642 loads 0.000596 sit 0.000373 tear 0.000492
101578 save 0.000531 pack 0.000435 decided 0.000209 sky 0.000456 turned 0.000581 chris 0.000425 mmm 0.000643 sadly 0.000597 choice 0.000373 weekends 0.000492
101579 drunk 0.000532 ended 0.000436 isnt 0.000209 misses 0.000457 speak 0.000582 reminder 0.000425 tickets 0.000643 fantastic 0.000598 mad 0.000374 bbq 0.000494
101580 fact 0.000535 tweeps 0.000437 gettin 0.000209 ask 0.000457 bar 0.000582 plans 0.000425 voice 0.000643 pic 0.000598 spring 0.000374 headed 0.000495
101581 normal 0.000535 clothes 0.000437 cos 0.000210 currently 0.000458 reminds 0.000583 starting 0.000428 cup 0.000645 double 0.000600 body 0.000374 study 0.000496
101582 hanging 0.000536 yep 0.000438 uk 0.000210 souljaboytellem 0.000458 test 0.000584 mac 0.000428 looked 0.000645 zoo 0.000601 twitterverse 0.000376 yo 0.000496
101583 asked 0.000537 killing 0.000438 terrible 0.000210 tiny 0.000460 watchin 0.000584 quote 0.000429 question 0.000646 sucked 0.000601 sweetie 0.000377 officially 0.000496
101584 small 0.000537 8am 0.000439 fair 0.000210 angels 0.000460 sexy 0.000589 jk 0.000430 answer 0.000648 longer 0.000602 walking 0.000377 lame 0.000496
101585 feet 0.000538 shall 0.000440 lady 0.000210 wont 0.000461 beer 0.000590 half 0.000431 anyways 0.000648 ppl 0.000602 chance 0.000378 relaxing 0.000497
101586 isnt 0.000538 favorite 0.000440 da 0.000211 luv 0.000461 hannah 0.000593 stuck 0.000431 liked 0.000648 worth 0.000602 suck 0.000378 ohh 0.000497
101587 dress 0.000542 booo 0.000441 enjoyed 0.000211 bring 0.000462 sense 0.000594 kevin 0.000432 taste 0.000649 brothers 0.000604 quiet 0.000379 sam 0.000497
101588 changed 0.000543 100 0.000441 different 0.000211 plane 0.000462 posted 0.000596 version 0.000432 site 0.000649 date 0.000604 following 0.000380 kate 0.000498
101589 favorite 0.000545 wonder 0.000441 sims 0.000212 france 0.000463 dreams 0.000597 tickets 0.000432 interview 0.000650 mommy 0.000609 wife 0.000381 date 0.000499
101590 sadly 0.000545 double 0.000441 fix 0.000212 available 0.000463 sadly 0.000597 weekends 0.000432 eh 0.000653 die 0.000610 luv 0.000381 club 0.000500
101591 played 0.000546 prom 0.000444 exciting 0.000212 looked 0.000463 normal 0.000597 doesnt 0.000434 lazy 0.000653 song 0.000610 lord 0.000382 hmm 0.000500
101592 deal 0.000546 math 0.000445 finals 0.000213 000 0.000463 style 0.000598 fav 0.000435 twitterverse 0.000655 ones 0.000610 nyc 0.000383 slow 0.000500
101593 apparently 0.000546 suppose 0.000445 aren 0.000213 1st 0.000463 bed 0.000598 raining 0.000437 easy 0.000657 looked 0.000612 wont 0.000383 misses 0.000501
101594 hold 0.000546 tonite 0.000446 hospital 0.000214 pls 0.000464 hand 0.000598 goin 0.000439 thx 0.000658 boyfriend 0.000613 thx 0.000385 crap 0.000501
101595 15 0.000547 pics 0.000446 nights 0.000214 30 0.000465 fat 0.000599 lets 0.000440 ate 0.000658 gave 0.000616 picture 0.000386 333 0.000501
101596 passed 0.000548 freaking 0.000446 star 0.000214 yea 0.000467 luv 0.000599 aren 0.000440 terrible 0.000658 passed 0.000618 instead 0.000388 update 0.000501
101597 bet 0.000550 moment 0.000446 realized 0.000215 lady 0.000467 series 0.000599 worked 0.000440 son 0.000664 sound 0.000618 bike 0.000389 writing 0.000501

In [139]:
dff5 = pd.concat(dfs5, axis=1)

In [145]:
dff5[-500:-450]


Out[145]:
feature tfidf feature tfidf feature tfidf feature tfidf feature tfidf
101748 slept 0.000726 quite 0.000596 cleaning 0.000289 mr 0.000593 busy 0.000749
101749 12 0.000729 site 0.000602 ipod 0.000289 design 0.000594 eating 0.000751
101750 fair 0.000730 post 0.000606 evening 0.000290 ng 0.000595 store 0.000754
101751 exactly 0.000731 figure 0.000607 test 0.000290 course 0.000595 goodbye 0.000756
101752 saturday 0.000733 moving 0.000607 site 0.000290 comments 0.000596 minutes 0.000757
101753 album 0.000733 kill 0.000608 11 0.000290 air 0.000596 weather 0.000758
101754 test 0.000735 season 0.000609 congrats 0.000291 cold 0.000598 rest 0.000759
101755 broken 0.000736 relax 0.000610 upset 0.000291 park 0.000600 cd 0.000763
101756 mum 0.000737 woo 0.000610 flu 0.000291 laptop 0.000606 weeks 0.000763
101757 running 0.000738 ipod 0.000610 tour 0.000291 reason 0.000606 june 0.000764
101758 lil 0.000740 cut 0.000611 short 0.000293 david 0.000607 ipod 0.000765
101759 town 0.000741 bummed 0.000611 throat 0.000294 em 0.000609 played 0.000770
101760 worst 0.000743 decided 0.000611 figure 0.000296 hit 0.000612 air 0.000771
101761 interesting 0.000743 fair 0.000613 spent 0.000299 ones 0.000613 worth 0.000773
101762 driving 0.000746 months 0.000617 rock 0.000299 white 0.000616 computer 0.000778
101763 fail 0.000747 sims 0.000617 spending 0.000300 web 0.000616 country 0.000779
101764 red 0.000748 lame 0.000617 idk 0.000300 app 0.000618 awww 0.000780
101765 definitely 0.000753 ahhh 0.000617 horrible 0.000301 stupid 0.000619 month 0.000781
101766 joined 0.000759 lake 0.000617 interesting 0.000301 run 0.000621 wonderful 0.000783
101767 water 0.000759 wear 0.000621 la 0.000302 cutest 0.000621 light 0.000784
101768 fell 0.000761 boys 0.000621 problem 0.000302 click 0.000623 button 0.000784
101769 lots 0.000762 comes 0.000622 clean 0.000303 vid 0.000625 happen 0.000785
101770 learn 0.000763 ago 0.000624 understand 0.000303 ride 0.000625 die 0.000786
101771 past 0.000763 bbq 0.000625 xxx 0.000304 forgot 0.000626 sooo 0.000787
101772 20 0.000764 lost 0.000632 laptop 0.000304 user 0.000626 vote 0.000790
101773 nope 0.000769 hahaha 0.000633 mood 0.000304 php 0.000626 son 0.000791
101774 moon 0.000771 plan 0.000635 bought 0.000305 month 0.000627 lunch 0.000793
101775 set 0.000774 video 0.000636 past 0.000305 goodbye 0.000627 case 0.000795
101776 clean 0.000776 boyfriend 0.000637 lil 0.000306 computer 0.000628 cat 0.000795
101777 figure 0.000781 drink 0.000640 worst 0.000306 close 0.000631 starting 0.000798
101778 turn 0.000783 makes 0.000642 pool 0.000306 20 0.000632 hmm 0.000799
101779 knew 0.000784 mum 0.000642 scared 0.000307 support 0.000635 outside 0.000800
101780 bout 0.000787 change 0.000648 awards 0.000307 6shtr 0.000635 club 0.000803
101781 wtf 0.000788 dance 0.000650 felt 0.000308 shopping 0.000640 text 0.000803
101782 plan 0.000789 date 0.000656 open 0.000308 6q1om 0.000640 ppl 0.000804
101783 songs 0.000790 nights 0.000656 lonely 0.000309 bing 0.000641 da 0.000808
101784 birthday 0.000794 worst 0.000657 tinyurl 0.000310 works 0.000649 rain 0.000809
101785 close 0.000794 camp 0.000658 tweeting 0.000311 loopt 0.000651 wtf 0.000811
101786 answer 0.000796 longer 0.000659 visit 0.000311 hear 0.000654 itunes 0.000812
101787 ahh 0.000799 sit 0.000661 youtube 0.000311 wishes 0.000655 wife 0.000815
101788 moment 0.000799 20 0.000665 eyes 0.000312 account 0.000658 close 0.000822
101789 instead 0.000800 bike 0.000666 instead 0.000313 early 0.000659 john 0.000827
101790 english 0.000800 broke 0.000667 album 0.000315 couldn 0.000660 click 0.000828
101791 high 0.000801 brother 0.000668 turn 0.000316 asks 0.000662 wondering 0.000829
101792 lonely 0.000802 ahh 0.000670 cut 0.000316 mac 0.000662 till 0.000830
101793 office 0.000803 staying 0.000671 dead 0.000316 trailer 0.000664 internet 0.000832
101794 open 0.000806 online 0.000672 fast 0.000316 sucks 0.000665 knew 0.000832
101795 months 0.000810 wondering 0.000674 enjoying 0.000317 dogbook 0.000666 cover 0.000832
101796 busy 0.000812 uni 0.000675 high 0.000317 fans 0.000667 sign 0.000832
101797 parents 0.000813 win 0.000676 reason 0.000317 group 0.000668 dinner 0.000833

In [ ]: