In [25]:
import os
import json
import time
import pickle
import requests
import math


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random

In [120]:
# load the pipe-delimited export of May-July posts
df = pd.read_csv('may_june_july.csv', delimiter="|")

In [121]:
# Combine the text fields (positional columns 5-10) into a single 'AllText' column
df['AllText'] = ""
df['primary_kw'].fillna(" ", inplace=True)
df['tags'].fillna(" ", inplace=True)
for i, row in df.iterrows():
    cv = df.iloc[i,5]+" "+df.iloc[i,6]+" "+df.iloc[i,7]+" "+df.iloc[i,8]+" "+df.iloc[i,9]+" "+df.iloc[i,10]
    df.at[i, 'AllText'] = cv   # .at replaces the removed DataFrame.set_value

In [122]:
# Engagement score: log2 of (freq * impressions / 1000)
df['Log'] = df['freq']*df['impressions']/1000
for i, row in df.iterrows():
    cv = math.log(df.iloc[i,12], 2)   # column 12 is the 'Log' column assigned above
    df.at[i, 'Log'] = cv              # .at replaces the removed DataFrame.set_value

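Both loops above can also be written as vectorized pandas/NumPy operations. A minimal sketch, assuming positional columns 5-10 are the string fields being joined and that freq * impressions / 1000 is strictly positive:

In [ ]:
# vectorized sketch of the two loops above (assumes columns 5-10 are all strings)
df['AllText'] = df.iloc[:, 5:11].astype(str).apply(" ".join, axis=1)
df['Log'] = np.log2(df['freq'] * df['impressions'] / 1000)
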
In [123]:
data_mean = df["Log"].mean()
print(data_mean)

data_std = df["Log"].std()
print(data_std)

%matplotlib inline
plt.hist(df["Log"])
plt.show()


9.97900229397
3.21629982395

In [124]:
df.shape


Out[124]:
(14297, 13)

In [125]:
# Virality rule: posts with a Log score more than one standard deviation below the mean are 'notviral'; everything else is 'viral'
df['viral'] = np.where(df['Log']<data_mean-data_std, 'notviral', 'viral')
df['viral_num'] = df.viral.map({'notviral':0, 'viral':1})
#df.head()

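To make the labeling rule concrete: with the mean and standard deviation printed above (roughly 9.98 and 3.22), the cutoff is mean - std, or about 6.76, and anything scoring below it is tagged 'notviral'. A small sanity-check cell:

In [ ]:
# sanity check: the virality threshold and the resulting class balance
threshold = data_mean - data_std
print(threshold)                       # ~6.76 for the run shown above
print((df['Log'] < threshold).mean())  # fraction of posts labeled 'notviral'
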
In [126]:
#df.tail()

In [127]:
df.shape


Out[127]:
(14297, 15)

In [128]:
df.viral.value_counts()


Out[128]:
viral       11862
notviral     2435
Name: viral, dtype: int64

In [129]:
X = df.AllText
y = df.viral_num
print(X.shape)
print(y.shape)


(14297,)
(14297,)

In [130]:
X.head()


Out[130]:
0    Giant man with tiny dog alert! Celebrity The M...
1    FYI: Ice cream sandwiches &gt; all other sandw...
2    "My mama always said you can tell a lot about ...
3    Let's see if you're a true cheese whiz. Food C...
4    The EPA just released first-time guidelines on...
Name: AllText, dtype: object

In [131]:
y.head()


Out[131]:
0    0
1    0
2    0
3    0
4    0
Name: viral_num, dtype: int64

In [132]:
# split X and y into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)


(10722,)
(3575,)
(10722,)
(3575,)

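Because the classes are imbalanced (~83% viral), it may be worth keeping the class ratio identical in both splits. An optional variant of the split above using the stratify argument (assigned to separate names so it does not affect the results below):

In [ ]:
# optional: stratified split so train and test keep the same viral/non-viral ratio
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(X, y, random_state=1, stratify=y)
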
In [143]:
# instantiate the vectorizer
# Scenario under test: max_df=0.5 (ignore terms that appear in more than half of the documents)
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(max_df=0.5)

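Other vectorizer scenarios worth testing against the same pipeline, left commented out here since the results below were produced with max_df=0.5 only:

In [ ]:
# alternative vectorizer settings to experiment with (not the ones used below)
# vect = CountVectorizer(max_df=0.5, stop_words='english')
# vect = CountVectorizer(max_df=0.5, ngram_range=(1, 2), min_df=2)
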
In [144]:
#X_train

In [145]:
vect


Out[145]:
CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.5, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [146]:
# learn the training-data vocabulary, then use it to create a document-term matrix
# (the two calls below can be combined: X_train_dtm = vect.fit_transform(X_train))
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)

In [147]:
X_train_dtm


Out[147]:
<10722x15808 sparse matrix of type '<type 'numpy.int64'>'
	with 287918 stored elements in Compressed Sparse Row format>

In [148]:
# transform testing data (using fitted vocabulary) into a document-term matrix
X_test_dtm = vect.transform(X_test)
X_test_dtm


Out[148]:
<3575x15808 sparse matrix of type '<type 'numpy.int64'>'
	with 94428 stored elements in Compressed Sparse Row format>

In [149]:
# import and instantiate a Multinomial Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [150]:
# train the model using X_train_dtm (timing it with an IPython "magic command")
%time nb.fit(X_train_dtm, y_train)


CPU times: user 9.81 ms, sys: 2.27 ms, total: 12.1 ms
Wall time: 10.2 ms
Out[150]:
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [151]:
# make class predictions for X_test_dtm
y_pred_class = nb.predict(X_test_dtm)

In [152]:
# calculate accuracy of class predictions
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)


Out[152]:
0.81146853146853148

In [153]:
# print the confusion matrix
metrics.confusion_matrix(y_test, y_pred_class)


Out[153]:
array([[ 217,  406],
       [ 268, 2684]])

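The 81% accuracy should be read against the null accuracy (always predicting the majority class 'viral'), which is about 82.6% on this test set given the class imbalance. A short sketch comparing the two and pulling sensitivity and specificity out of the confusion matrix:

In [ ]:
# null accuracy: what we'd get by always predicting the majority class ('viral')
print(y_test.value_counts(normalize=True).max())

# sensitivity (recall on viral) and specificity (recall on non-viral)
tn, fp, fn, tp = metrics.confusion_matrix(y_test, y_pred_class).ravel()
print(tp / float(tp + fn))   # sensitivity
print(tn / float(tn + fp))   # specificity
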
In [94]:
# print message text for the false positives (non-viral incorrectly classified as viral)
X_test[y_test < y_pred_class]


Out[94]:
2356     So much fierce, so little time. Music Here's W...
4046     Robert Czegely has been accused of "gross misc...
3116     Put your JT fandom to the test. Music Are Thes...
1790     Do you read bad drawings? Animals Can You Iden...
1094     Hold the door. Culture 53 Thoughts I Had Watch...
3807     Three of the former Libyan dictator s top offi...
153      Seven police officers and two civilians were h...
2410     Tiny home, big clean. DIY 28 Clever Ways To De...
2813     Today's characters: the Malfoys. UK Here Are T...
4781     <b>These big rocks in the desert will blow you...
3312     For when you want a little bit of ~everything....
10363    Yes, I want my house to smell like fairy bread...
3895     "Next week will see the debut of Hodor's cousi...
4290     Some Bernie-or-Busters protested Clinton's spe...
1401     Underrated is an UNDERstatement. Rewind It's T...
11347    Put all those geography classes to the test. I...
1578     <b>Picture your favorite invention.</b> Now pi...
994      <b>I got 99 problems, and Twitter spoilers are...
897      Spreadin' barks and smiles! Animals 19 Excited...
1156     Milk and cereal are a union that must not be b...
2515     <b>From Fred Lyon's <a rel="nofollow" href="ht...
2230     Talk to the hand 'cause the face ain't listeni...
4177     <b>Because the people deserve to know!</b> Com...
3832     "Do you know how many basic bitches would kill...
2364     ALL THE EMOTIONS! Tasty Here's What Happens Wh...
1890     "Thank you Nintendo!!!" UK Pok mon Go Is Helpi...
335      The joke is that it's not. Celebrity Chrissy T...
793      The ugly babies know. Culture What Does Your C...
6420     Life? Shout? Bliss? Khode? I don't even know a...
1217     "Got this for son in law" Books 21 Negative  H...
                               ...                        
159      Trust your subconscious! It knows what's up. C...
1041     Contains NSFW images of some of Scotland's jui...
1989     "NO, NOT DRAGONS. FUCK THOSE DRAGONS. FOCUS!!!...
1711     And no, this is not just about tofu. Food 17 U...
1710     We'd use them everyday, TBH. Books If Snapchat...
354      One of these will <i>really</i> satisfy your s...
5792     See if you can make it through the night as on...
2977     So much drama. Australia "Real Housewives Of B...
4165     Woodsprites are taking over underground. Austr...
5385     "Directions: Pour into an oiled fish mold and ...
5338     For when you want a little bit of ~everything....
3796     When you start having secret meetings in the t...
5930     "Arrogant white privileged fools" Australia A ...
2096     Do you like your sausages in a dog or a roll? ...
535      Oh, tartar sauce! Rewind 19 Faces From "Sponge...
2490     Scream if you know the answer. UK Only A Real ...
2646     Man cannot live on trail mix alone. Food 29 Ca...
6582     Whoever said diamonds needed polish was wrong....
597      Girls' stories aren't just for girls. UK 34 Yo...
5151     Mmmm, the crunch. Tasty Buttermilk-Fried Chick...
2477     I used to compare myself to the models in maga...
3027     Who stands out the most in a crowd? Celebrity ...
4485     A bang for your buck. UK How Much Do These 9 S...
503      *Immediately opens a cat hotel.* UK 11 Dream J...
1687     A tale of chaos and scandal at Day One of the ...
1941     This can only mean good things for everyone's ...
7857     <b>Warning: partial nudity ahead!</b> Acclaime...
830      All men must die. But in what order? UK Can Yo...
5776     Screw football, this guy should go to Hogwarts...
2922     Who's a good customer service representative? ...
Name: AllText, dtype: object

In [95]:
# print message text for the false negatives (viral incorrectly classified as non-viral)
X_test[y_test > y_pred_class]


Out[95]:
11141    Mmmmmmmmmmmmmmmm. India 16 Extremely Satisfyin...
12571    It wasn't. India 17 Things That Will Make Ever...
5558     "You're worthy of the last French fry in the b...
10655    Worse still, literally no one is surprised. In...
14152    In 2016, it takes staggering ignorance to thin...
2674     Your relationship with the word "home" becomes...
13227    "I am no longer okay with just being a sidekic...
11674    The school has thousands of students, no full-...
11436    "I have always said, I will work after marriag...
9034     TL;DR Poo is done with your shit. India Kareen...
5811     A hot new type of food stack. Food People On I...
1175     Give those old duds new life. DIY 18 No-Sew Wa...
3153     The pound may be weak, but the memes are stron...
6004     She was a better Trump than Trump. USNews Mery...
5640     Vote Leave, take back the status quo? UKNews F...
8943     "She just wakes up like that." India Aishwarya...
11646    Pappis for puppies. India 9 Pictures Of Doggie...
14051    Kya re mamu! Sab changa? Yeh quiz lega kya bhe...
8708     The golden age of TV everywhere, tbh. Canada 1...
12271    That three-day stretch in March when it seems ...
8918     Ded. India These Photos Of Katrina Kaif In A R...
5411     The Los Angeles County Sheriff's Department sa...
6863     Including face masks, lingerie, dresses, and s...
12611    #Blackdegreesmatter Culture 38 Photos Of Black...
8274     Meet Senator-elect Pauline Hanson. World Austr...
3400     On Wednesday, the hashtag <a href="https://twi...
9502     Renuka Chowdhury is a former union minister of...
10579    Stop restoring my faith in arranged marriages,...
9864     Enrol early. Update often. AUNews Stop What Yo...
8157     Baroness Smith, Labour MPs, and even some Tori...
                               ...                        
13720    "Kids love singing apes." AUNews Adam Goodes A...
4398     Warning: You'll be hearing about this theory f...
5518     "Tanong mo sa mommy mo." Culture 18 Sentences ...
7217     Tradies. There's always tradies. Australia 25 ...
1885     Gesundheit, doggies. Animals 11 Dogs Caught Mi...
9284     <i>The New European</i>, published 8 July, is ...
13273    You're totally doing this. Tasty Try Out Your ...
10862    Creamy PB? Check. Decadent brownie? Check. Cri...
10519    Do these policies belong to Labor, the Greens,...
4331     Almost as soothing as eating it. ALMOST. Food ...
2625     Michael Sandford, a 19-year-old from Dorking i...
12879    Could you smash 833 cans a year? Well apparent...
4614     It s not all bad. UK 16 Beauty Brands That Are...
12130    I'm taking the rest of the day off. India Rahu...
14011    "Fuckin eh it's bout to go downnnn." Canada We...
10436    <b>Who says art has to mean anything, or requi...
10625    "I'm bringing them home in a box". AUNews Indi...
11239    "G'day mates!" Or, nah. Australia Australian S...
6306     Aliens DO exist, according to the former Blink...
8814     "Freedom of speech di maa di." India The Censo...
11983    Let the stars assign you a soulmate. India Whi...
406      *Mixes them all together* Food 21 Bartenders S...
9317     The PERFECT outfit... In theory. India "Dhoti ...
9454     He's hiding somewhere in these pictures... but...
3358     Will there be a general election? Can there be...
9579     "I hope we'll be having those exchanges over t...
5184     "Oot and aboot." Canada 21 Things You'll Never...
6454     <b>Temper your jealousy, teachers.</b> One can...
177      Slay, mama. Style 9 Stunning Eid Outfits That'...
12021    <b>Wear your lit love on your sleeve.</b> Book...
Name: AllText, dtype: object

In [96]:
# example false negative
#X_test[3]

In [97]:
# calculate predicted probabilities for X_test_dtm (poorly calibrated)
y_pred_prob = nb.predict_proba(X_test_dtm)[:, 1]
y_pred_prob


Out[97]:
array([ 0.67990059,  0.99984685,  0.99999829, ...,  0.99999919,
        0.99999972,  0.99629517])

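If calibrated probabilities matter (the comment above notes that Naive Bayes probabilities are poorly calibrated), one option is to wrap the model in CalibratedClassifierCV; a sketch under default-ish settings:

In [ ]:
# optional: recalibrate the NB probabilities with cross-validated sigmoid scaling
from sklearn.calibration import CalibratedClassifierCV
nb_cal = CalibratedClassifierCV(MultinomialNB(), method='sigmoid', cv=3)
nb_cal.fit(X_train_dtm, y_train)
y_pred_prob_cal = nb_cal.predict_proba(X_test_dtm)[:, 1]
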
In [98]:
# calculate AUC
metrics.roc_auc_score(y_test, y_pred_prob)


Out[98]:
0.73614156085381088

In [99]:
# import and instantiate a logistic regression model
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

In [100]:
# train the model using X_train_dtm
%time logreg.fit(X_train_dtm, y_train)


CPU times: user 658 ms, sys: 12 ms, total: 670 ms
Wall time: 673 ms
Out[100]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [101]:
# make class predictions for X_test_dtm
y_pred_class = logreg.predict(X_test_dtm)

In [102]:
# calculate predicted probabilities for X_test_dtm (well calibrated)
y_pred_prob = logreg.predict_proba(X_test_dtm)[:, 1]
y_pred_prob


Out[102]:
array([ 0.8093811 ,  0.97096945,  0.98082109, ...,  0.98028302,
        0.99328703,  0.87609797])

In [103]:
# calculate accuracy
metrics.accuracy_score(y_test, y_pred_class)


Out[103]:
0.82237762237762235

In [104]:
# calculate AUC
metrics.roc_auc_score(y_test, y_pred_prob)


Out[104]:
0.72488276849060629

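Logistic regression offers its own view of token importance through the learned coefficients; a quick sketch listing the tokens that push predictions most strongly toward 'viral' (this parallels the Naive Bayes token analysis below):

In [ ]:
# tokens with the largest positive (viral-leaning) logistic regression weights
coefs = pd.Series(logreg.coef_[0], index=vect.get_feature_names())
coefs.sort_values(ascending=False).head(10)
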
In [105]:
# store the vocabulary of X_train
X_train_tokens = vect.get_feature_names()
len(X_train_tokens)


Out[105]:
15808

In [106]:
# examine the first 50 tokens
print(X_train_tokens[0:50])


[u'00', u'000', u'007', u'00s', u'03', u'06', u'07', u'09', u'10', u'100', u'100000', u'100m', u'101', u'11', u'110', u'11th', u'12', u'125', u'129', u'13', u'14', u'140', u'143', u'15', u'150', u'1500', u'151', u'16', u'160', u'16_new_food', u'16th', u'17', u'172', u'177', u'1789', u'17th', u'18', u'182', u'18311', u'18th', u'19', u'1938922913', u'1950s', u'1955', u'1960s', u'1964', u'1966', u'1969', u'1970s', u'1972']

In [107]:
# examine the last 50 tokens
print(X_train_tokens[-50:])


[u'zack', u'zackary', u'zafar', u'zafn', u'zaful', u'zag', u'zakia', u'zambia', u'zanada', u'zara', u'zaveri', u'zayn', u'zbych', u'zealand', u'zebra', u'zelda', u'zemeckis', u'zen', u'zendaya', u'zero', u'zesty', u'zeus', u'zhang', u'zhao', u'zig', u'zika', u'zinger', u'zip', u'zit', u'ziva', u'zo', u'zodiac', u'zodiacquiz', u'zoe', u'zoey', u'zombies', u'zone', u'zones', u'zoo', u'zoodles', u'zooey', u'zookeeper', u'zoom', u'zootopia', u'zoren', u'zucchini', u'zuchinni', u'zuckerberg', u'zwan', u'zz']

In [108]:
# Naive Bayes counts the number of times each token appears in each class
nb.feature_count_


Out[108]:
array([[  0.,   7.,   4., ...,   1.,   0.,   1.],
       [  4.,  40.,   4., ...,   2.,   3.,   1.]])

In [109]:
# rows represent classes, columns represent tokens
nb.feature_count_.shape


Out[109]:
(2, 15808)

In [110]:
# number of times each token appears across all Non-viral Buzzes
non_viral_token_count = nb.feature_count_[0, :]
non_viral_token_count


Out[110]:
array([ 0.,  7.,  4., ...,  1.,  0.,  1.])

In [111]:
# number of times each token appears across all Viral Buzzes
viral_token_count = nb.feature_count_[1, :]
viral_token_count


Out[111]:
array([  4.,  40.,   4., ...,   2.,   3.,   1.])

In [112]:
# create a DataFrame of tokens with their separate non-viral and viral counts
tokens = pd.DataFrame({'token':X_train_tokens, 'non_viral':non_viral_token_count, 'viral':viral_token_count}).set_index('token')
tokens.head()


Out[112]:
non_viral viral
token
00 0.0 4.0
000 7.0 40.0
007 4.0 4.0
00s 18.0 216.0
03 0.0 3.0

In [113]:
# examine 20 random DataFrame rows
tokens.sample(20, random_state=6)


Out[113]:
non_viral viral
token
puff 0.0 10.0
canada 92.0 247.0
comtent 27.0 206.0
burnham 1.0 8.0
poops 2.0 1.0
surgery 5.0 14.0
crohns 0.0 2.0
strange 4.0 16.0
stray 1.0 1.0
commit 0.0 6.0
origin 1.0 2.0
rhyhorny 0.0 1.0
school 44.0 196.0
karate 0.0 2.0
praised 0.0 1.0
purely 1.0 5.0
razors 2.0 9.0
priest 2.0 0.0
screaming 0.0 2.0
network 0.0 12.0

In [114]:
# Naive Bayes counts the number of observations in each class
nb.class_count_


Out[114]:
array([ 1812.,  8910.])

In [115]:
# add 1 to non-viral and viral counts to avoid dividing by 0
tokens['non_viral'] = tokens.non_viral + 1
tokens['viral'] = tokens.viral + 1
tokens.sample(5, random_state=6)


Out[115]:
non_viral viral
token
puff 1.0 11.0
canada 93.0 248.0
comtent 28.0 207.0
burnham 2.0 9.0
poops 3.0 2.0

In [116]:
# convert the non-viral and viral counts into frequencies
tokens['non_viral'] = tokens.non_viral / nb.class_count_[0]
tokens['viral'] = tokens.viral / nb.class_count_[1]
tokens.sample(5, random_state=6)


Out[116]:
non_viral viral
token
puff 0.000552 0.001235
canada 0.051325 0.027834
comtent 0.015453 0.023232
burnham 0.001104 0.001010
poops 0.001656 0.000224

In [117]:
# calculate the ratio of viral-to-non-viral for each token
tokens['viral_ratio'] = tokens.viral / tokens.non_viral
tokens.sample(5, random_state=6)


Out[117]:
non_viral viral viral_ratio
token
puff 0.000552 0.001235 2.237037
canada 0.051325 0.027834 0.542312
comtent 0.015453 0.023232 1.503463
burnham 0.001104 0.001010 0.915152
poops 0.001656 0.000224 0.135578

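The hand-built frequencies and ratio above roughly mirror what MultinomialNB stores internally: feature_log_prob_ holds the smoothed log probability of each token given the class (normalized by the total token count per class rather than by document count, so the values differ, but the ranking is comparable). A cross-check sketch:

In [ ]:
# cross-check against the model's own smoothed per-class token probabilities
nb_ratio = np.exp(nb.feature_log_prob_[1] - nb.feature_log_prob_[0])
pd.Series(nb_ratio, index=X_train_tokens).sort_values(ascending=False).head(10)
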
In [118]:
# examine the DataFrame sorted by viral_ratio
# note: use sort() instead of sort_values() for pandas 0.16.2 and earlier
tokens.sort_values('viral_ratio', ascending=False)


Out[118]:
non_viral viral viral_ratio
token
petty 0.000552 0.007183 13.015488
stanford 0.001104 0.013917 12.608754
gym 0.000552 0.006846 12.405387
sims 0.000552 0.006510 11.795286
bra 0.000552 0.006510 11.795286
boobs 0.001104 0.012570 11.388552
yeah 0.000552 0.005051 9.151515
san 0.000552 0.004826 8.744781
spears 0.000552 0.004377 7.931313
victoria 0.000552 0.004153 7.524579
hairy 0.000552 0.004153 7.524579
crime 0.000552 0.004040 7.321212
fries 0.001656 0.012009 7.253423
chyna 0.000552 0.003928 7.117845
rob 0.000552 0.003928 7.117845
disgusting 0.000552 0.003928 7.117845
blac 0.000552 0.003928 7.117845
elba 0.000552 0.003816 6.914478
alligator 0.000552 0.003816 6.914478
idris 0.000552 0.003816 6.914478
anal 0.000552 0.003816 6.914478
spray 0.000552 0.003816 6.914478
soda 0.000552 0.003704 6.711111
1ups 0.000552 0.003704 6.711111
judge 0.000552 0.003704 6.711111
pubes 0.000552 0.003591 6.507744
eyebrow 0.000552 0.003479 6.304377
simpsons 0.000552 0.003479 6.304377
diego 0.000552 0.003479 6.304377
cramps 0.000552 0.003479 6.304377
... ... ... ...
eacute 0.003311 0.000112 0.033895
charleston 0.003311 0.000112 0.033895
fierravanti 0.003311 0.000112 0.033895
lehengas 0.003311 0.000112 0.033895
fasting 0.003311 0.000112 0.033895
dil 0.003311 0.000112 0.033895
toby 0.003311 0.000112 0.033895
gen 0.003311 0.000112 0.033895
alpha 0.003311 0.000112 0.033895
iftar 0.003311 0.000112 0.033895
ipl 0.003311 0.000112 0.033895
yang 0.003311 0.000112 0.033895
concetta 0.003311 0.000112 0.033895
rickshaw 0.003311 0.000112 0.033895
noor 0.003311 0.000112 0.033895
janitor 0.003863 0.000112 0.029052
letters 0.003863 0.000112 0.029052
starter 0.003863 0.000112 0.029052
sonu 0.003863 0.000112 0.029052
snippet 0.003863 0.000112 0.029052
nigam 0.003863 0.000112 0.029052
osborne 0.003863 0.000112 0.029052
kabali 0.003863 0.000112 0.029052
pronounce 0.003863 0.000112 0.029052
brunswick 0.003863 0.000112 0.029052
momentum 0.003863 0.000112 0.029052
resting 0.003863 0.000112 0.029052
peach 0.003863 0.000112 0.029052
dhoni 0.004967 0.000112 0.022596
frappe 0.004967 0.000112 0.022596

15808 rows × 3 columns


In [119]:
# look up the viral_ratio for a given token
tokens.loc['stanford', 'viral_ratio']


Out[119]:
12.608754208754208

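Finally, the fitted vectorizer and model can score unseen text; a sketch using a made-up headline (the string below is purely illustrative):

In [ ]:
# score a new (hypothetical) headline with the fitted vectorizer + Naive Bayes model
new_text = ["Can you guess these 90s cartoons from a single frame? Rewind"]
new_dtm = vect.transform(new_text)
print(nb.predict(new_dtm))              # predicted class: 1 = viral, 0 = notviral
print(nb.predict_proba(new_dtm)[:, 1])  # predicted probability of 'viral'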