In [1]:
%matplotlib inline

import os
import json
import time
import pickle
import requests
import math


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('may_june_july.csv', delimiter="|")

In [3]:
# Combine all text 
df['tags'] = df['descr'] + " " + df["title"] + " " + df["cat"]+ " " + df["primary_kw"]+ " " + df["tags"] 
# Drop unneeded columns
df.drop(['id', 'pull_cc', 'cc', 'metav', 'descr', 'title', 'primary_kw', 'cat'],
        axis=1, inplace=True)

df.head()


Out[3]:
freq impressions tags
0 2 29316 Giant man with tiny dog alert! The Mountain Fr...
1 2 17180 FYI: Ice cream sandwiches > all other sandw...
2 2 3474 "My mama always said you can tell a lot about ...
3 2 9027 Let's see if you're a true cheese whiz. Can Yo...
4 2 7247 The EPA just released first-time guidelines on...
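
Note: concatenating with + in In [3] yields NaN for any row where one of the text columns is missing, which is why tags gets back-filled in In [9] below. A defensive alternative (a sketch; assumes the five text columns should simply be space-joined) would be:

    text_cols = ['descr', 'title', 'cat', 'primary_kw', 'tags']
    df['tags'] = df[text_cols].fillna('').apply(' '.join, axis=1)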

In [4]:
# Normalize to a log distribution
# Compute Log = log2(freq * impressions)
# Add Log column
df['Log'] = df['freq'] * df['impressions']

for i in df.index:
    df.at[i, 'Log'] = math.log(df.at[i, 'Log'], 2)

# Drop unneeded column
df.drop('freq', axis=1, inplace=True)
df.drop('impressions', axis=1, inplace=True)
df.head()


Out[4]:
tags Log
0 Giant man with tiny dog alert! The Mountain Fr... 15
1 FYI: Ice cream sandwiches > all other sandw... 15
2 "My mama always said you can tell a lot about ... 12
3 Let's see if you're a true cheese whiz. Can Yo... 14
4 The EPA just released first-time guidelines on... 13
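
The loop in In [4] could also be written as one vectorized expression (in place of the loop, before freq and impressions are dropped). A sketch; note it keeps float precision, whereas the loop writes into an integer column and the values in Out[4] are truncated:

    df['Log'] = np.log2(df['freq'] * df['impressions'])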

In [5]:
data_mean = df["Log"].mean()
data_mean


Out[5]:
19.444149122193465

In [6]:
data_std = df["Log"].std()
data_std


Out[6]:
3.2282119418494077

In [7]:
plt.hist(df["Log"])
plt.show()
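
To see where the viral cutoff used in In [8] falls, one could overlay it on the histogram (a sketch):

    plt.hist(df['Log'], bins=30)
    plt.axvline(data_mean - data_std, color='red', linestyle='--')  # cutoff used in In [8]
    plt.xlabel('log2(freq * impressions)')
    plt.show()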



In [8]:
# Define viral as Log >= (mean - 1 std dev); only posts more than one std dev below the mean are labeled notviral
df['viral'] = np.where(df['Log']<data_mean-data_std, 'notviral', 'viral')
df['viral_num'] = df.viral.map({'notviral':0, 'viral':1})
df.drop('Log', axis=1, inplace=True)
df.head()


Out[8]:
tags viral viral_num
0 Giant man with tiny dog alert! The Mountain Fr... notviral 0
1 FYI: Ice cream sandwiches &gt; all other sandw... notviral 0
2 "My mama always said you can tell a lot about ... notviral 0
3 Let's see if you're a true cheese whiz. Can Yo... notviral 0
4 The EPA just released first-time guidelines on... notviral 0
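
The cutoff implied by the two cells above is data_mean - data_std ≈ 19.44 - 3.23 ≈ 16.22, and the resulting labels are heavily imbalanced (see In [12]). A quick check (sketch):

    df['viral_num'].value_counts(normalize=True)   # roughly 0.81 viral vs. 0.19 notviral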

In [9]:
# Replace missing tags with a placeholder string so the vectorizer has text for every row
df['tags'].fillna('a', inplace=True)

In [10]:
df.tail()


Out[10]:
tags viral viral_num
14292 A former Stanford swimmer who sexually assault... viral 1
14293 A former Stanford swimmer who sexually assault... viral 1
14294 A former Stanford swimmer who sexually assault... viral 1
14295 A definitive ranking of our dirtiest words. Th... viral 1
14296 Don't worry, you won't need to know Chandler B... viral 1

In [11]:
df.shape


Out[11]:
(14297, 3)

In [12]:
df.viral.value_counts()


Out[12]:
viral       11564
notviral     2733
Name: viral, dtype: int64

In [13]:
X = df.tags
y = df.viral_num
print(X.shape)
print(y.shape)


(14297,)
(14297,)

In [14]:
X.head()


Out[14]:
0    Giant man with tiny dog alert! The Mountain Fr...
1    FYI: Ice cream sandwiches &gt; all other sandw...
2    "My mama always said you can tell a lot about ...
3    Let's see if you're a true cheese whiz. Can Yo...
4    The EPA just released first-time guidelines on...
Name: tags, dtype: object

In [15]:
y.head()


Out[15]:
0    0
1    0
2    0
3    0
4    0
Name: viral_num, dtype: int64

In [16]:
# split X and y into training and testing sets
from sklearn.model_selection import train_test_split  # sklearn.cross_validation in older scikit-learn
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)


(10722,)
(3575,)
(10722,)
(3575,)
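
train_test_split uses a 25% test split by default, which is where the 10722 / 3575 shapes come from (14297 × 0.25 ≈ 3575). Writing the default out explicitly (a sketch):

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)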

In [17]:
# instantiate the vectorizer
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()

In [18]:
vect


Out[18]:
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)
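
Everything in the repr above is a default. If the model needed tuning, a few non-default settings to try might be (a sketch, not what was used here):

    vect = CountVectorizer(stop_words='english',   # drop common English stop words
                           ngram_range=(1, 2),     # unigrams and bigrams
                           min_df=2)               # ignore tokens seen in only one document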

In [19]:
# learn training data vocabulary, then use it to create a document-term matrix
# the following two steps can be combined into one: X_train_dtm = vect.fit_transform(X_train)
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)

In [20]:
X_train_dtm


Out[20]:
<10722x15480 sparse matrix of type '<class 'numpy.int64'>'
	with 266669 stored elements in Compressed Sparse Row format>

In [21]:
# transform testing data (using fitted vocabulary) into a document-term matrix
X_test_dtm = vect.transform(X_test)
X_test_dtm


Out[21]:
<3575x15480 sparse matrix of type '<class 'numpy.int64'>'
	with 87915 stored elements in Compressed Sparse Row format>

In [22]:
# import and instantiate a Multinomial Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [23]:
# train the model using X_train_dtm (timing it with an IPython "magic command")
%time nb.fit(X_train_dtm, y_train)


CPU times: user 5.83 ms, sys: 1.48 ms, total: 7.3 ms
Wall time: 6.45 ms
Out[23]:
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [24]:
# make class predictions for X_test_dtm
y_pred_class = nb.predict(X_test_dtm)

In [25]:
# calculate accuracy of class predictions
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)


Out[25]:
0.78965034965034964
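
For context, the classes are imbalanced, so it is worth comparing against the null accuracy of always predicting the majority class ('viral'); the Naive Bayes accuracy above is slightly below it (a sketch):

    y_test.value_counts(normalize=True).max()   # ≈ 0.80 (2866 of 3575 test rows are viral)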

In [26]:
# print the confusion matrix
metrics.confusion_matrix(y_test, y_pred_class)


Out[26]:
array([[ 243,  466],
       [ 286, 2580]])
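
Unpacking the confusion matrix makes the trade-off explicit: the model recovers most viral posts but misses most non-viral ones (a sketch; rates computed from the matrix above):

    tn, fp, fn, tp = metrics.confusion_matrix(y_test, y_pred_class).ravel()
    print(tp / (tp + fn))   # sensitivity / recall for 'viral'  ≈ 0.90
    print(tn / (tn + fp))   # specificity (recall for 'notviral') ≈ 0.34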

In [27]:
# print message text for the false positives (non-viral incorrectly classified as viral)
X_test[y_test < y_pred_class]


Out[27]:
2356     So much fierce, so little time. Here's What A ...
4046     Robert Czegely has been accused of "gross misc...
3116     Put your JT fandom to the test. Are These Nsyn...
1094     Hold the door. 53 Thoughts I Had Watching Seas...
1312     Identify them all! Can You Guess The Pok mon B...
3807     Three of the former Libyan dictator s top offi...
2410     Tiny home, big clean. 28 Clever Ways To Deep C...
2813     Today's characters: the Malfoys. Here Are The ...
4781     <b>These big rocks in the desert will blow you...
3312     For when you want a little bit of ~everything....
10363    Yes, I want my house to smell like fairy bread...
3895     "Next week will see the debut of Hodor's cousi...
4290     Some Bernie-or-Busters protested Clinton's spe...
1401     Underrated is an UNDERstatement. It's Time To ...
32       Michelle Carter was charged with involuntary m...
11347    Put all those geography classes to the test. C...
1578                                                     a
994      <b>I got 99 problems, and Twitter spoilers are...
857      Suck it, salad. 15 Things You'll Only Understa...
897      Spreadin' barks and smiles! 19 Excited Dogs Ju...
1156     Milk and cereal are a union that must not be b...
2515     <b>From Fred Lyon's <a rel="nofollow" href="ht...
1365                                                     a
2230     Talk to the hand 'cause the face ain't listeni...
4696     Who's gonna be the ring bearer of your dreams?...
4177     <b>Because the people deserve to know!</b> 10 ...
3832     "Do you know how many basic bitches would kill...
2364     ALL THE EMOTIONS! Here's What Happens When You...
1890     "Thank you Nintendo!!!" Pok mon Go Is Helping ...
335      The joke is that it's not. Chrissy Teigen Twee...
                               ...                        
2977     So much drama. "Real Housewives Of Beverly Hil...
4165     Woodsprites are taking over underground. This ...
5338     For when you want a little bit of ~everything....
3796     When you start having secret meetings in the t...
5332     You spent a hell of a lot of your time on late...
3689     Add these ones to your brunch bucket list! 19 ...
5930     "Arrogant white privileged fools" A Weatherman...
2096     Do you like your sausages in a dog or a roll? ...
2517     Avoca-DO make this. Honey-Lime Chicken And Avo...
7466     <b>There&#39;s only one way to find out.</b> A...
4107     One of them's an absolute stinker. Which Tom H...
535      Oh, tartar sauce! 19 Faces From "SpongeBob Squ...
130      It's complicated AF. How Will Brexit Actually ...
10130    French bulldogs and independent coffee shops a...
2490     Scream if you know the answer. Only A Real Hor...
2646     Man cannot live on trail mix alone. 29 Camping...
597                                                      a
5151     Mmmm, the crunch. Buttermilk-Fried Chicken Mak...
2477     I used to compare myself to the models in maga...
3027     Who stands out the most in a crowd? Can You Pi...
4485     A bang for your buck. How Much Do These 9 Sex ...
503      *Immediately opens a cat hotel.* 11 Dream Jobs...
423      "Secretary Clinton has won the democratic nomi...
1687     A tale of chaos and scandal at Day One of the ...
6153     Some people have asked for a rematch of the Ba...
1941     This can only mean good things for everyone's ...
7857     <b>Warning: partial nudity ahead!</b> Acclaime...
830      All men must die. But in what order? Can You A...
5776     Screw football, this guy should go to Hogwarts...
2922     Who's a good customer service representative? ...
Name: tags, dtype: object

In [28]:
# print message text for the false negatives (viral incorrectly classified as non-viral)
X_test[y_test > y_pred_class]


Out[28]:
7120     "You spent some time away..." Brace Yourselves...
11141    Mmmmmmmmmmmmmmmm. 16 Extremely Satisfying Pict...
12571    It wasn't. 17 Things That Will Make Every Indi...
5558     "You're worthy of the last French fry in the b...
10655    Worse still, literally no one is surprised. De...
14152    In 2016, it takes staggering ignorance to thin...
13227    "I am no longer okay with just being a sidekic...
11674    The school has thousands of students, no full-...
11436    "I have always said, I will work after marriag...
9034     TL;DR Poo is done with your shit. Kareena Kapo...
13466    It seems that the key is to live anywhere othe...
5811     A hot new type of food stack. People On Instag...
1175     Give those old duds new life. 18 No-Sew Ways T...
3153     The pound may be weak, but the memes are stron...
6004     She was a better Trump than Trump. Meryl Stree...
5640     Vote Leave, take back the status quo? Four Bre...
11646    Pappis for puppies. 9 Pictures Of Doggies At I...
14051    Kya re mamu! Sab changa? Yeh quiz lega kya bhe...
8708     The golden age of TV everywhere, tbh. 13 Reaso...
12271    That three-day stretch in March when it seems ...
8918     Ded. These Photos Of Katrina Kaif In A Really ...
5411     The Los Angeles County Sheriff's Department sa...
6863     Including face masks, lingerie, dresses, and s...
12611    #Blackdegreesmatter 38 Photos Of Black Graduat...
8274     Meet Senator-elect Pauline Hanson. Australia's...
3400     On Wednesday, the hashtag <a href="https://twi...
10579    Stop restoring my faith in arranged marriages,...
9864     Enrol early. Update often. Stop What You're Do...
8157     Baroness Smith, Labour MPs, and even some Tori...
4719     UK voters have sent a massive shock through th...
                               ...                        
10519    Do these policies belong to Labor, the Greens,...
14107    You're not proud, but getting drunk in freezin...
4331     Almost as soothing as eating it. ALMOST. This ...
2625     Michael Sandford, a 19-year-old from Dorking i...
12879    Could you smash 833 cans a year? Well apparent...
12130    I'm taking the rest of the day off. Rahul Khan...
14011    "Fuckin eh it's bout to go downnnn." We Ranked...
7137     At least five police officers were killed afte...
10625    "I'm bringing them home in a box". Indigenous ...
11239    "G'day mates!" Or, nah. Australian Stereotypes...
6306     Aliens DO exist, according to the former Blink...
575      For when you just don't have time to deal with...
13840    Can you not? Salman Khan Compared Himself To A...
8814     "Freedom of speech di maa di." The Censor Boar...
11983    Let the stars assign you a soulmate. Which Sou...
10725    "It's Levi-OH-sa, not Levi-oh-SAR." How Many "...
5806     They're all the same. 22 Pictures That Perfect...
406      *Mixes them all together* 21 Bartenders Share ...
9317     The PERFECT outfit... In theory. "Dhoti Dungar...
9454     He's hiding somewhere in these pictures... but...
3358     Will there be a general election? Can there be...
8572     See if you can PRUNE Mary-Kate out from the As...
9579     "I hope we'll be having those exchanges over t...
5184     "Oot and aboot." 21 Things You'll Never Hear A...
6454     <b>Temper your jealousy, teachers.</b> One can...
177      Slay, mama. 9 Stunning Eid Outfits That'll Tak...
6704     Quit wringing my heart like a washcloth, you g...
4036     "You make me happier than pretzel day." This G...
9875     What was wrong with me? This Is For Everyone W...
7065     Just too good to be muggle. 17 Real Places Tha...
Name: tags, dtype: object

In [30]:
# example false negative
X_test[6454]


Out[30]:
'<b>Temper your jealousy, teachers.</b> One can always dream. 30 Epic Examples Of Inspirational Classroom Decor DIY classrooms awesome design classroom decor classroom design cool classrooms design teacher decorations teachers'

In [31]:
# calculate predicted probabilities for X_test_dtm (poorly calibrated)
y_pred_prob = nb.predict_proba(X_test_dtm)[:, 1]
y_pred_prob


Out[31]:
array([ 0.35800925,  0.99945296,  0.99999878, ...,  0.99999947,
        0.9999996 ,  0.98753744])
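
One way to back up the "poorly calibrated" remark would be a reliability curve (a sketch; not run in this notebook):

    from sklearn.calibration import calibration_curve
    frac_positive, mean_predicted = calibration_curve(y_test, y_pred_prob, n_bins=10)
    plt.plot(mean_predicted, frac_positive, marker='o')
    plt.plot([0, 1], [0, 1], linestyle='--')   # perfect-calibration reference line
    plt.show()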

In [32]:
# calculate AUC
metrics.roc_auc_score(y_test, y_pred_prob)


Out[32]:
0.73793648012740198

In [33]:
# import and instantiate a logistic regression model
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

In [34]:
# train the model using X_train_dtm
%time logreg.fit(X_train_dtm, y_train)


CPU times: user 384 ms, sys: 5.54 ms, total: 389 ms
Wall time: 392 ms
Out[34]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [35]:
# make class predictions for X_test_dtm
y_pred_class = logreg.predict(X_test_dtm)

In [36]:
# calculate predicted probabilities for X_test_dtm (well calibrated)
y_pred_prob = logreg.predict_proba(X_test_dtm)[:, 1]
y_pred_prob


Out[36]:
array([ 0.77864268,  0.95959005,  0.98801846, ...,  0.98512148,
        0.97422148,  0.74503525])

In [37]:
# calculate accuracy
metrics.accuracy_score(y_test, y_pred_class)


Out[37]:
0.80083916083916085

In [38]:
# calculate AUC
metrics.roc_auc_score(y_test, y_pred_prob)


Out[38]:
0.72567856991703716
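
Analogous to the Naive Bayes token analysis that follows, the logistic regression weights can be inspected directly (a sketch; newer scikit-learn renames get_feature_names to get_feature_names_out):

    coef = pd.Series(logreg.coef_[0], index=vect.get_feature_names())
    coef.sort_values(ascending=False).head(10)   # tokens pushing hardest toward 'viral'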

In [39]:
# store the vocabulary of X_train
X_train_tokens = vect.get_feature_names()
len(X_train_tokens)


Out[39]:
15480

In [40]:
# examine the first 50 tokens
print(X_train_tokens[0:50])


['00', '000', '007', '00s', '03', '06', '07', '10', '100', '100000', '100m', '101', '11', '110', '11th', '12', '125', '129', '13', '14', '140', '143', '15', '150', '1500', '151', '16', '160', '16_new_food', '16th', '17', '172', '177', '1789', '17th', '18', '182', '18311', '18th', '19', '1938922913', '1955', '1960s', '1964', '1966', '1969', '1970s', '1972', '1980', '1980s']

In [41]:
# examine the last 50 tokens
print(X_train_tokens[-50:])


['zachary', 'zack', 'zackary', 'zafar', 'zafn', 'zaful', 'zag', 'zakia', 'zambia', 'zanada', 'zara', 'zaveri', 'zayn', 'zbych', 'zealand', 'zebra', 'zelda', 'zemeckis', 'zen', 'zendaya', 'zero', 'zesty', 'zeus', 'zhang', 'zhao', 'zig', 'zika', 'zinger', 'zip', 'zit', 'ziva', 'zo', 'zodiac', 'zodiacquiz', 'zoe', 'zoey', 'zombies', 'zone', 'zoo', 'zoodles', 'zooey', 'zookeeper', 'zoom', 'zootopia', 'zoren', 'zucchini', 'zuchinni', 'zuckerberg', 'zwan', 'zz']

In [42]:
# Naive Bayes counts the number of times each token appears in each class
nb.feature_count_


Out[42]:
array([[  0.,   7.,   4., ...,   1.,   0.,   1.],
       [  4.,  39.,   4., ...,   2.,   3.,   1.]])

In [43]:
# rows represent classes, columns represent tokens
nb.feature_count_.shape


Out[43]:
(2, 15480)

In [44]:
# number of times each token appears across all Non-viral Buzzes
non_viral_token_count = nb.feature_count_[0, :]
non_viral_token_count


Out[44]:
array([ 0.,  7.,  4., ...,  1.,  0.,  1.])

In [45]:
# number of times each token appears across all Viral Buzzes
viral_token_count = nb.feature_count_[1, :]
viral_token_count


Out[45]:
array([  4.,  39.,   4., ...,   2.,   3.,   1.])

In [46]:
# create a DataFrame of tokens with their separate non-viral and viral counts
tokens = pd.DataFrame({'token':X_train_tokens, 'non_viral':non_viral_token_count, 'viral':viral_token_count}).set_index('token')
tokens.head()


Out[46]:
non_viral viral
token
00 0.0 4.0
000 7.0 39.0
007 4.0 4.0
00s 27.0 207.0
03 0.0 3.0

In [47]:
# examine 20 random DataFrame rows
tokens.sample(20, random_state=6)


Out[47]:
non_viral viral
token
realest 1.0 2.0
creeps 0.0 2.0
meredith 3.0 6.0
nz 0.0 2.0
nadu 2.0 3.0
unqualified 0.0 1.0
deep 4.0 59.0
belgium 2.0 0.0
dax 4.0 9.0
motherquot 1.0 0.0
wxyz 0.0 1.0
reflection 0.0 2.0
peyton 2.0 0.0
ritchie 0.0 3.0
dear 3.0 16.0
ability 1.0 3.0
organizer 4.0 17.0
jimmy 5.0 50.0
closing 0.0 3.0
pretzels 1.0 0.0

In [48]:
# Naive Bayes counts the number of observations in each class
nb.class_count_


Out[48]:
array([ 2024.,  8698.])

In [49]:
# add 1 to non-viral and viral counts to avoid dividing by 0
tokens['non_viral'] = tokens.non_viral + 1
tokens['viral'] = tokens.viral + 1
tokens.sample(5, random_state=6)


Out[49]:
non_viral viral
token
realest 2.0 3.0
creeps 1.0 3.0
meredith 4.0 7.0
nz 1.0 3.0
nadu 3.0 4.0

In [50]:
# convert the non-viral and viral counts into frequencies
tokens['non_viral'] = tokens.non_viral / nb.class_count_[0]
tokens['viral'] = tokens.viral / nb.class_count_[1]
tokens.sample(5, random_state=6)


Out[50]:
non_viral viral
token
realest 0.000988 0.000345
creeps 0.000494 0.000345
meredith 0.001976 0.000805
nz 0.000494 0.000345
nadu 0.001482 0.000460

In [51]:
# calculate the ratio of viral-to-non-viral for each token
tokens['viral_ratio'] = tokens.viral / tokens.non_viral
tokens.sample(5, random_state=6)


Out[51]:
non_viral viral viral_ratio
token
realest 0.000988 0.000345 0.349046
creeps 0.000494 0.000345 0.698092
meredith 0.001976 0.000805 0.407220
nz 0.000494 0.000345 0.698092
nadu 0.001482 0.000460 0.310263

In [52]:
# examine the DataFrame sorted by viral_ratio
# note: use sort() instead of sort_values() for pandas 0.16.2 and earlier
tokens.sort_values('viral_ratio', ascending=False)


Out[52]:
non_viral viral viral_ratio
token
stanford 0.000494 0.014141 28.621752
petty 0.000494 0.007358 14.892619
gym 0.000494 0.007013 14.194527
jonas 0.000494 0.005633 11.402161
san 0.000494 0.004944 10.005978
yeah 0.000494 0.004599 9.307887
victoria 0.000494 0.004254 8.609795
hairy 0.000494 0.004254 8.609795
beckham 0.000494 0.004139 8.377098
disgusting 0.000494 0.004024 8.144401
spray 0.000494 0.003909 7.911704
anal 0.000494 0.003909 7.911704
alligator 0.000494 0.003909 7.911704
spears 0.000494 0.003794 7.679007
soda 0.000494 0.003794 7.679007
1ups 0.000494 0.003794 7.679007
judge 0.000494 0.003794 7.679007
pubes 0.000494 0.003679 7.446309
rob 0.000494 0.003564 7.213612
cramps 0.000494 0.003564 7.213612
eyebrow 0.000494 0.003564 7.213612
kitty 0.000494 0.003564 7.213612
diego 0.000494 0.003564 7.213612
depp 0.000988 0.006898 6.980915
chip 0.000494 0.003449 6.980915
rey 0.000494 0.003334 6.748218
fruit 0.000988 0.006668 6.748218
highlighter 0.000494 0.003334 6.748218
kendrick 0.000494 0.003334 6.748218
con 0.000988 0.006668 6.748218
... ... ... ...
pronunciation 0.012846 0.000460 0.035800
kumar 0.006423 0.000230 0.035800
nigam 0.003458 0.000115 0.033242
kabali 0.003458 0.000115 0.033242
mockumentary 0.003458 0.000115 0.033242
sonam 0.013834 0.000460 0.033242
newspaper 0.003458 0.000115 0.033242
peach 0.003458 0.000115 0.033242
osborne 0.003458 0.000115 0.033242
letters 0.003458 0.000115 0.033242
starter 0.003458 0.000115 0.033242
akshay 0.003458 0.000115 0.033242
sonu 0.003458 0.000115 0.033242
janitor 0.003458 0.000115 0.033242
brunswick 0.003458 0.000115 0.033242
pronounce 0.003458 0.000115 0.033242
snippet 0.003458 0.000115 0.033242
resting 0.003458 0.000115 0.033242
momentum 0.003458 0.000115 0.033242
gort 0.004447 0.000115 0.025855
dhoni 0.004447 0.000115 0.025855
cattrall 0.004447 0.000115 0.025855
kanan 0.004447 0.000115 0.025855
frappe 0.004447 0.000115 0.025855
lindsey 0.004447 0.000115 0.025855
gill 0.004941 0.000115 0.023270
degrassi 0.005435 0.000115 0.021154
bhatt 0.005929 0.000115 0.019391
stages 0.005929 0.000115 0.019391
hauts 0.008399 0.000115 0.013688

15480 rows × 3 columns
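
A rough cross-check against the fitted model itself: the difference of per-class log probabilities gives a similar ranking to viral_ratio, though not an identical one, since MultinomialNB normalizes by total token counts per class rather than by document counts (a sketch):

    log_ratio = pd.Series(nb.feature_log_prob_[1] - nb.feature_log_prob_[0], index=X_train_tokens)
    log_ratio.sort_values(ascending=False).head(10)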


In [54]:
# look up the viral_ratio for a given token
tokens.loc['lol', 'viral_ratio']


Out[54]:
1.3186173066605349
