In [1]:
%matplotlib inline

import os
import json
import time
import pickle
import requests
import math


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('may_june_july.csv', delimiter="|")

In [3]:
# Combine all text 
df['tags'] = df['descr'] + " " + df["title"] + " " + df["cat"]+ " " + df["primary_kw"]+ " " + df["tags"] 
# Drop unneeded columns
df.drop(['id', 'pull_cc', 'cc', 'metav', 'descr', 'title', 'primary_kw', 'cat'],
        axis=1, inplace=True)

df.head()


Out[3]:
freq impressions tags
0 2 29316 Giant man with tiny dog alert! The Mountain Fr...
1 2 17180 FYI: Ice cream sandwiches > all other sandw...
2 2 3474 "My mama always said you can tell a lot about ...
3 2 9027 Let's see if you're a true cheese whiz. Can Yo...
4 2 7247 The EPA just released first-time guidelines on...
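
Note: concatenating with + in In [3] yields NaN for any row where one of the text columns is missing, which is why tags gets back-filled in In [9] below. A defensive alternative (a sketch; assumes the five text columns should simply be space-joined) would be:

    text_cols = ['descr', 'title', 'cat', 'primary_kw', 'tags']
    df['tags'] = df[text_cols].fillna('').apply(' '.join, axis=1)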

In [4]:
# Normalize to a log distribution
# Compute Log = log2(freq * impressions)
# Add Log column
df['Log'] = df['freq'] * df['impressions']

for i in df.index:
    df.at[i, 'Log'] = math.log(df.at[i, 'Log'], 2)

# Drop unneeded column
df.drop('freq', axis=1, inplace=True)
df.drop('impressions', axis=1, inplace=True)
df.head()


Out[4]:
tags Log
0 Giant man with tiny dog alert! The Mountain Fr... 15
1 FYI: Ice cream sandwiches > all other sandw... 15
2 "My mama always said you can tell a lot about ... 12
3 Let's see if you're a true cheese whiz. Can Yo... 14
4 The EPA just released first-time guidelines on... 13
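
The loop in In [4] could also be written as one vectorized expression (in place of the loop, before freq and impressions are dropped). A sketch; note it keeps float precision, whereas the loop writes into an integer column and the values in Out[4] are truncated:

    df['Log'] = np.log2(df['freq'] * df['impressions'])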

In [5]:
data_mean = df["Log"].mean()
data_mean


Out[5]:
19.444149122193465

In [6]:
data_std = df["Log"].std()
data_std


Out[6]:
3.2282119418494077

In [7]:
plt.hist(df["Log"])
plt.show()
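
To see where the viral cutoff used in In [8] falls, one could overlay it on the histogram (a sketch):

    plt.hist(df['Log'], bins=30)
    plt.axvline(data_mean - data_std, color='red', linestyle='--')  # cutoff used in In [8]
    plt.xlabel('log2(freq * impressions)')
    plt.show()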



In [8]:
# Define viral as Log >= (mean - 1 std dev); only posts more than one std dev below the mean are labeled notviral
df['viral'] = np.where(df['Log']<data_mean-data_std, 'notviral', 'viral')
df['viral_num'] = df.viral.map({'notviral':0, 'viral':1})
df.drop('Log', axis=1, inplace=True)
df.head()


Out[8]:
tags viral viral_num
0 Giant man with tiny dog alert! The Mountain Fr... notviral 0
1 FYI: Ice cream sandwiches &gt; all other sandw... notviral 0
2 "My mama always said you can tell a lot about ... notviral 0
3 Let's see if you're a true cheese whiz. Can Yo... notviral 0
4 The EPA just released first-time guidelines on... notviral 0
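
The cutoff implied by the two cells above is data_mean - data_std ≈ 19.44 - 3.23 ≈ 16.22, and the resulting labels are heavily imbalanced (see In [12]). A quick check (sketch):

    df['viral_num'].value_counts(normalize=True)   # roughly 0.81 viral vs. 0.19 notviral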

In [9]:
# Replace missing tags with a placeholder string so the vectorizer has text for every row
df['tags'].fillna('a', inplace=True)

In [10]:
df.tail()


Out[10]:
tags viral viral_num
14292 A former Stanford swimmer who sexually assault... viral 1
14293 A former Stanford swimmer who sexually assault... viral 1
14294 A former Stanford swimmer who sexually assault... viral 1
14295 A definitive ranking of our dirtiest words. Th... viral 1
14296 Don't worry, you won't need to know Chandler B... viral 1

In [11]:
df.shape


Out[11]:
(14297, 3)

In [12]:
df.viral.value_counts()


Out[12]:
viral       11564
notviral     2733
Name: viral, dtype: int64

In [13]:
X = df.tags
y = df.viral_num
print(X.shape)
print(y.shape)


(14297,)
(14297,)

In [14]:
X.head()


Out[14]:
0    Giant man with tiny dog alert! The Mountain Fr...
1    FYI: Ice cream sandwiches &gt; all other sandw...
2    "My mama always said you can tell a lot about ...
3    Let's see if you're a true cheese whiz. Can Yo...
4    The EPA just released first-time guidelines on...
Name: tags, dtype: object

In [15]:
y.head()


Out[15]:
0    0
1    0
2    0
3    0
4    0
Name: viral_num, dtype: int64

In [16]:
# split X and y into training and testing sets
from sklearn.model_selection import train_test_split  # sklearn.cross_validation in older scikit-learn
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)


(10722,)
(3575,)
(10722,)
(3575,)
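
train_test_split uses a 25% test split by default, which is where the 10722 / 3575 shapes come from (14297 × 0.25 ≈ 3575). Writing the default out explicitly (a sketch):

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)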

In [17]:
# instantiate the vectorizer
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()

In [18]:
vect


Out[18]:
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)
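
Everything in the repr above is a default. If the model needed tuning, a few non-default settings to try might be (a sketch, not what was used here):

    vect = CountVectorizer(stop_words='english',   # drop common English stop words
                           ngram_range=(1, 2),     # unigrams and bigrams
                           min_df=2)               # ignore tokens seen in only one document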

In [19]:
# learn training data vocabulary, then use it to create a document-term matrix
# the following two steps can be combined into one: X_train_dtm = vect.fit_transform(X_train)
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)

In [20]:
X_train_dtm


Out[20]:
<10722x15480 sparse matrix of type '<class 'numpy.int64'>'
	with 266669 stored elements in Compressed Sparse Row format>

In [21]:
# transform testing data (using fitted vocabulary) into a document-term matrix
X_test_dtm = vect.transform(X_test)
X_test_dtm


Out[21]:
<3575x15480 sparse matrix of type '<class 'numpy.int64'>'
	with 87915 stored elements in Compressed Sparse Row format>

In [22]:
# import and instantiate a Multinomial Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [23]:
# train the model using X_train_dtm (timing it with an IPython "magic command")
%time nb.fit(X_train_dtm, y_train)


CPU times: user 5.83 ms, sys: 1.48 ms, total: 7.3 ms
Wall time: 6.45 ms
Out[23]:
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [24]:
# make class predictions for X_test_dtm
y_pred_class = nb.predict(X_test_dtm)

In [25]:
# calculate accuracy of class predictions
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)


Out[25]:
0.78965034965034964
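
For context, the classes are imbalanced, so it is worth comparing against the null accuracy of always predicting the majority class ('viral'); the Naive Bayes accuracy above is slightly below it (a sketch):

    y_test.value_counts(normalize=True).max()   # ≈ 0.80 (2866 of 3575 test rows are viral)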

In [26]:
# print the confusion matrix
metrics.confusion_matrix(y_test, y_pred_class)


Out[26]:
array([[ 243,  466],
       [ 286, 2580]])
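
Unpacking the confusion matrix makes the trade-off explicit: the model recovers most viral posts but misses most non-viral ones (a sketch; rates computed from the matrix above):

    tn, fp, fn, tp = metrics.confusion_matrix(y_test, y_pred_class).ravel()
    print(tp / (tp + fn))   # sensitivity / recall for 'viral'  ≈ 0.90
    print(tn / (tn + fp))   # specificity (recall for 'notviral') ≈ 0.34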

In [27]:
# print message text for the false positives (non-viral incorrectly classified as viral)
X_test[y_test < y_pred_class]


Out[27]:
2356     So much fierce, so little time. Here's What A ...
4046     Robert Czegely has been accused of "gross misc...
3116     Put your JT fandom to the test. Are These Nsyn...
1094     Hold the door. 53 Thoughts I Had Watching Seas...
1312     Identify them all! Can You Guess The Pok mon B...
3807     Three of the former Libyan dictator s top offi...
2410     Tiny home, big clean. 28 Clever Ways To Deep C...
2813     Today's characters: the Malfoys. Here Are The ...
4781     <b>These big rocks in the desert will blow you...
3312     For when you want a little bit of ~everything....
10363    Yes, I want my house to smell like fairy bread...
3895     "Next week will see the debut of Hodor's cousi...
4290     Some Bernie-or-Busters protested Clinton's spe...
1401     Underrated is an UNDERstatement. It's Time To ...
32       Michelle Carter was charged with involuntary m...
11347    Put all those geography classes to the test. C...
1578                                                     a
994      <b>I got 99 problems, and Twitter spoilers are...
857      Suck it, salad. 15 Things You'll Only Understa...
897      Spreadin' barks and smiles! 19 Excited Dogs Ju...
1156     Milk and cereal are a union that must not be b...
2515     <b>From Fred Lyon's <a rel="nofollow" href="ht...
1365                                                     a
2230     Talk to the hand 'cause the face ain't listeni...
4696     Who's gonna be the ring bearer of your dreams?...
4177     <b>Because the people deserve to know!</b> 10 ...
3832     "Do you know how many basic bitches would kill...
2364     ALL THE EMOTIONS! Here's What Happens When You...
1890     "Thank you Nintendo!!!" Pok mon Go Is Helping ...
335      The joke is that it's not. Chrissy Teigen Twee...
                               ...                        
2977     So much drama. "Real Housewives Of Beverly Hil...
4165     Woodsprites are taking over underground. This ...
5338     For when you want a little bit of ~everything....
3796     When you start having secret meetings in the t...
5332     You spent a hell of a lot of your time on late...
3689     Add these ones to your brunch bucket list! 19 ...
5930     "Arrogant white privileged fools" A Weatherman...
2096     Do you like your sausages in a dog or a roll? ...
2517     Avoca-DO make this. Honey-Lime Chicken And Avo...
7466     <b>There&#39;s only one way to find out.</b> A...
4107     One of them's an absolute stinker. Which Tom H...
535      Oh, tartar sauce! 19 Faces From "SpongeBob Squ...
130      It's complicated AF. How Will Brexit Actually ...
10130    French bulldogs and independent coffee shops a...
2490     Scream if you know the answer. Only A Real Hor...
2646     Man cannot live on trail mix alone. 29 Camping...
597                                                      a
5151     Mmmm, the crunch. Buttermilk-Fried Chicken Mak...
2477     I used to compare myself to the models in maga...
3027     Who stands out the most in a crowd? Can You Pi...
4485     A bang for your buck. How Much Do These 9 Sex ...
503      *Immediately opens a cat hotel.* 11 Dream Jobs...
423      "Secretary Clinton has won the democratic nomi...
1687     A tale of chaos and scandal at Day One of the ...
6153     Some people have asked for a rematch of the Ba...
1941     This can only mean good things for everyone's ...
7857     <b>Warning: partial nudity ahead!</b> Acclaime...
830      All men must die. But in what order? Can You A...
5776     Screw football, this guy should go to Hogwarts...
2922     Who's a good customer service representative? ...
Name: tags, dtype: object

In [28]:
# print message text for the false negatives (viral incorrectly classified as non-viral)
X_test[y_test > y_pred_class]


Out[28]:
7120     "You spent some time away..." Brace Yourselves...
11141    Mmmmmmmmmmmmmmmm. 16 Extremely Satisfying Pict...
12571    It wasn't. 17 Things That Will Make Every Indi...
5558     "You're worthy of the last French fry in the b...
10655    Worse still, literally no one is surprised. De...
14152    In 2016, it takes staggering ignorance to thin...
13227    "I am no longer okay with just being a sidekic...
11674    The school has thousands of students, no full-...
11436    "I have always said, I will work after marriag...
9034     TL;DR Poo is done with your shit. Kareena Kapo...
13466    It seems that the key is to live anywhere othe...
5811     A hot new type of food stack. People On Instag...
1175     Give those old duds new life. 18 No-Sew Ways T...
3153     The pound may be weak, but the memes are stron...
6004     She was a better Trump than Trump. Meryl Stree...
5640     Vote Leave, take back the status quo? Four Bre...
11646    Pappis for puppies. 9 Pictures Of Doggies At I...
14051    Kya re mamu! Sab changa? Yeh quiz lega kya bhe...
8708     The golden age of TV everywhere, tbh. 13 Reaso...
12271    That three-day stretch in March when it seems ...
8918     Ded. These Photos Of Katrina Kaif In A Really ...
5411     The Los Angeles County Sheriff's Department sa...
6863     Including face masks, lingerie, dresses, and s...
12611    #Blackdegreesmatter 38 Photos Of Black Graduat...
8274     Meet Senator-elect Pauline Hanson. Australia's...
3400     On Wednesday, the hashtag <a href="https://twi...
10579    Stop restoring my faith in arranged marriages,...
9864     Enrol early. Update often. Stop What You're Do...
8157     Baroness Smith, Labour MPs, and even some Tori...
4719     UK voters have sent a massive shock through th...
                               ...                        
10519    Do these policies belong to Labor, the Greens,...
14107    You're not proud, but getting drunk in freezin...
4331     Almost as soothing as eating it. ALMOST. This ...
2625     Michael Sandford, a 19-year-old from Dorking i...
12879    Could you smash 833 cans a year? Well apparent...
12130    I'm taking the rest of the day off. Rahul Khan...
14011    "Fuckin eh it's bout to go downnnn." We Ranked...
7137     At least five police officers were killed afte...
10625    "I'm bringing them home in a box". Indigenous ...
11239    "G'day mates!" Or, nah. Australian Stereotypes...
6306     Aliens DO exist, according to the former Blink...
575      For when you just don't have time to deal with...
13840    Can you not? Salman Khan Compared Himself To A...
8814     "Freedom of speech di maa di." The Censor Boar...
11983    Let the stars assign you a soulmate. Which Sou...
10725    "It's Levi-OH-sa, not Levi-oh-SAR." How Many "...
5806     They're all the same. 22 Pictures That Perfect...
406      *Mixes them all together* 21 Bartenders Share ...
9317     The PERFECT outfit... In theory. "Dhoti Dungar...
9454     He's hiding somewhere in these pictures... but...
3358     Will there be a general election? Can there be...
8572     See if you can PRUNE Mary-Kate out from the As...
9579     "I hope we'll be having those exchanges over t...
5184     "Oot and aboot." 21 Things You'll Never Hear A...
6454     <b>Temper your jealousy, teachers.</b> One can...
177      Slay, mama. 9 Stunning Eid Outfits That'll Tak...
6704     Quit wringing my heart like a washcloth, you g...
4036     "You make me happier than pretzel day." This G...
9875     What was wrong with me? This Is For Everyone W...
7065     Just too good to be muggle. 17 Real Places Tha...
Name: tags, dtype: object

In [30]:
# example false negative
X_test[6454]


Out[30]:
'<b>Temper your jealousy, teachers.</b> One can always dream. 30 Epic Examples Of Inspirational Classroom Decor DIY classrooms awesome design classroom decor classroom design cool classrooms design teacher decorations teachers'

In [31]:
# calculate predicted probabilities for X_test_dtm (poorly calibrated)
y_pred_prob = nb.predict_proba(X_test_dtm)[:, 1]
y_pred_prob


Out[31]:
array([ 0.35800925,  0.99945296,  0.99999878, ...,  0.99999947,
        0.9999996 ,  0.98753744])
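
One way to back up the "poorly calibrated" remark would be a reliability curve (a sketch; not run in this notebook):

    from sklearn.calibration import calibration_curve
    frac_positive, mean_predicted = calibration_curve(y_test, y_pred_prob, n_bins=10)
    plt.plot(mean_predicted, frac_positive, marker='o')
    plt.plot([0, 1], [0, 1], linestyle='--')   # perfect-calibration reference line
    plt.show()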

In [32]:
# calculate AUC
metrics.roc_auc_score(y_test, y_pred_prob)


Out[32]:
0.73793648012740198

In [33]:
# import and instantiate a logistic regression model
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

In [34]:
# train the model using X_train_dtm
%time logreg.fit(X_train_dtm, y_train)


CPU times: user 384 ms, sys: 5.54 ms, total: 389 ms
Wall time: 392 ms
Out[34]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [35]:
# make class predictions for X_test_dtm
y_pred_class = logreg.predict(X_test_dtm)

In [36]:
# calculate predicted probabilities for X_test_dtm (well calibrated)
y_pred_prob = logreg.predict_proba(X_test_dtm)[:, 1]
y_pred_prob


Out[36]:
array([ 0.77864268,  0.95959005,  0.98801846, ...,  0.98512148,
        0.97422148,  0.74503525])

In [37]:
# calculate accuracy
metrics.accuracy_score(y_test, y_pred_class)


Out[37]:
0.80083916083916085

In [38]:
# calculate AUC
metrics.roc_auc_score(y_test, y_pred_prob)


Out[38]:
0.72567856991703716
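
Analogous to the Naive Bayes token analysis that follows, the logistic regression weights can be inspected directly (a sketch; newer scikit-learn renames get_feature_names to get_feature_names_out):

    coef = pd.Series(logreg.coef_[0], index=vect.get_feature_names())
    coef.sort_values(ascending=False).head(10)   # tokens pushing hardest toward 'viral'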

In [39]:
# store the vocabulary of X_train
X_train_tokens = vect.get_feature_names()
len(X_train_tokens)


Out[39]:
15480

In [40]:
# examine the first 50 tokens
print(X_train_tokens[0:50])


['00', '000', '007', '00s', '03', '06', '07', '10', '100', '100000', '100m', '101', '11', '110', '11th', '12', '125', '129', '13', '14', '140', '143', '15', '150', '1500', '151', '16', '160', '16_new_food', '16th', '17', '172', '177', '1789', '17th', '18', '182', '18311', '18th', '19', '1938922913', '1955', '1960s', '1964', '1966', '1969', '1970s', '1972', '1980', '1980s']

In [41]:
# examine the last 50 tokens
print(X_train_tokens[-50:])


['zachary', 'zack', 'zackary', 'zafar', 'zafn', 'zaful', 'zag', 'zakia', 'zambia', 'zanada', 'zara', 'zaveri', 'zayn', 'zbych', 'zealand', 'zebra', 'zelda', 'zemeckis', 'zen', 'zendaya', 'zero', 'zesty', 'zeus', 'zhang', 'zhao', 'zig', 'zika', 'zinger', 'zip', 'zit', 'ziva', 'zo', 'zodiac', 'zodiacquiz', 'zoe', 'zoey', 'zombies', 'zone', 'zoo', 'zoodles', 'zooey', 'zookeeper', 'zoom', 'zootopia', 'zoren', 'zucchini', 'zuchinni', 'zuckerberg', 'zwan', 'zz']

In [42]:
# Naive Bayes counts the number of times each token appears in each class
nb.feature_count_


Out[42]:
array([[  0.,   7.,   4., ...,   1.,   0.,   1.],
       [  4.,  39.,   4., ...,   2.,   3.,   1.]])

In [43]:
# rows represent classes, columns represent tokens
nb.feature_count_.shape


Out[43]:
(2, 15480)

In [44]:
# number of times each token appears across all Non-viral Buzzes
non_viral_token_count = nb.feature_count_[0, :]
non_viral_token_count


Out[44]:
array([ 0.,  7.,  4., ...,  1.,  0.,  1.])

In [45]:
# number of times each token appears across all Viral Buzzes
viral_token_count = nb.feature_count_[1, :]
viral_token_count


Out[45]:
array([  4.,  39.,   4., ...,   2.,   3.,   1.])

In [46]:
# create a DataFrame of tokens with their separate non-viral and viral counts
tokens = pd.DataFrame({'token':X_train_tokens, 'non_viral':non_viral_token_count, 'viral':viral_token_count}).set_index('token')
tokens.head()


Out[46]:
non_viral viral
token
00 0.0 4.0
000 7.0 39.0
007 4.0 4.0
00s 27.0 207.0
03 0.0 3.0

In [47]:
# examine 20 random DataFrame rows
tokens.sample(20, random_state=6)


Out[47]:
non_viral viral
token
realest 1.0 2.0
creeps 0.0 2.0
meredith 3.0 6.0
nz 0.0 2.0
nadu 2.0 3.0
unqualified 0.0 1.0
deep 4.0 59.0
belgium 2.0 0.0
dax 4.0 9.0
motherquot 1.0 0.0
wxyz 0.0 1.0
reflection 0.0 2.0
peyton 2.0 0.0
ritchie 0.0 3.0
dear 3.0 16.0
ability 1.0 3.0
organizer 4.0 17.0
jimmy 5.0 50.0
closing 0.0 3.0
pretzels 1.0 0.0

In [48]:
# Naive Bayes counts the number of observations in each class
nb.class_count_


Out[48]:
array([ 2024.,  8698.])

In [49]:
# add 1 to non-viral and viral counts to avoid dividing by 0
tokens['non_viral'] = tokens.non_viral + 1
tokens['viral'] = tokens.viral + 1
tokens.sample(5, random_state=6)


Out[49]:
non_viral viral
token
realest 2.0 3.0
creeps 1.0 3.0
meredith 4.0 7.0
nz 1.0 3.0
nadu 3.0 4.0

In [50]:
# convert the non-viral and viral counts into frequencies
tokens['non_viral'] = tokens.non_viral / nb.class_count_[0]
tokens['viral'] = tokens.viral / nb.class_count_[1]
tokens.sample(5, random_state=6)


Out[50]:
non_viral viral
token
realest 0.000988 0.000345
creeps 0.000494 0.000345
meredith 0.001976 0.000805
nz 0.000494 0.000345
nadu 0.001482 0.000460

In [51]:
# calculate the ratio of viral-to-non-viral for each token
tokens['viral_ratio'] = tokens.viral / tokens.non_viral
tokens.sample(5, random_state=6)


Out[51]:
non_viral viral viral_ratio
token
realest 0.000988 0.000345 0.349046
creeps 0.000494 0.000345 0.698092
meredith 0.001976 0.000805 0.407220
nz 0.000494 0.000345 0.698092
nadu 0.001482 0.000460 0.310263

In [52]:
# examine the DataFrame sorted by viral_ratio
# note: use sort() instead of sort_values() for pandas 0.16.2 and earlier
tokens.sort_values('viral_ratio', ascending=False)


Out[52]:
non_viral viral viral_ratio
token
stanford 0.000494 0.014141 28.621752
petty 0.000494 0.007358 14.892619
gym 0.000494 0.007013 14.194527
jonas 0.000494 0.005633 11.402161
san 0.000494 0.004944 10.005978
yeah 0.000494 0.004599 9.307887
victoria 0.000494 0.004254 8.609795
hairy 0.000494 0.004254 8.609795
beckham 0.000494 0.004139 8.377098
disgusting 0.000494 0.004024 8.144401
spray 0.000494 0.003909 7.911704
anal 0.000494 0.003909 7.911704
alligator 0.000494 0.003909 7.911704
spears 0.000494 0.003794 7.679007
soda 0.000494 0.003794 7.679007
1ups 0.000494 0.003794 7.679007
judge 0.000494 0.003794 7.679007
pubes 0.000494 0.003679 7.446309
rob 0.000494 0.003564 7.213612
cramps 0.000494 0.003564 7.213612
eyebrow 0.000494 0.003564 7.213612
kitty 0.000494 0.003564 7.213612
diego 0.000494 0.003564 7.213612
depp 0.000988 0.006898 6.980915
chip 0.000494 0.003449 6.980915
rey 0.000494 0.003334 6.748218
fruit 0.000988 0.006668 6.748218
highlighter 0.000494 0.003334 6.748218
kendrick 0.000494 0.003334 6.748218
con 0.000988 0.006668 6.748218
... ... ... ...
pronunciation 0.012846 0.000460 0.035800
kumar 0.006423 0.000230 0.035800
nigam 0.003458 0.000115 0.033242
kabali 0.003458 0.000115 0.033242
mockumentary 0.003458 0.000115 0.033242
sonam 0.013834 0.000460 0.033242
newspaper 0.003458 0.000115 0.033242
peach 0.003458 0.000115 0.033242
osborne 0.003458 0.000115 0.033242
letters 0.003458 0.000115 0.033242
starter 0.003458 0.000115 0.033242
akshay 0.003458 0.000115 0.033242
sonu 0.003458 0.000115 0.033242
janitor 0.003458 0.000115 0.033242
brunswick 0.003458 0.000115 0.033242
pronounce 0.003458 0.000115 0.033242
snippet 0.003458 0.000115 0.033242
resting 0.003458 0.000115 0.033242
momentum 0.003458 0.000115 0.033242
gort 0.004447 0.000115 0.025855
dhoni 0.004447 0.000115 0.025855
cattrall 0.004447 0.000115 0.025855
kanan 0.004447 0.000115 0.025855
frappe 0.004447 0.000115 0.025855
lindsey 0.004447 0.000115 0.025855
gill 0.004941 0.000115 0.023270
degrassi 0.005435 0.000115 0.021154
bhatt 0.005929 0.000115 0.019391
stages 0.005929 0.000115 0.019391
hauts 0.008399 0.000115 0.013688

15480 rows × 3 columns
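
A rough cross-check against the fitted model itself: the difference of per-class log probabilities gives a similar ranking to viral_ratio, though not an identical one, since MultinomialNB normalizes by total token counts per class rather than by document counts (a sketch):

    log_ratio = pd.Series(nb.feature_log_prob_[1] - nb.feature_log_prob_[0], index=X_train_tokens)
    log_ratio.sort_values(ascending=False).head(10)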


In [54]:
# look up the viral_ratio for a given token
tokens.loc['lol', 'viral_ratio']


Out[54]:
1.3186173066605349
