In [25]:
import os
import json
import time
import pickle
import requests
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
In [120]:
df = pd.read_csv('may_june_july.csv', delimiter="|")
In [121]:
# Combine all text columns into a single field for vectorization
df['AllText'] = ""
df['primary_kw'] = df['primary_kw'].fillna(" ")
df['tags'] = df['tags'].fillna(" ")
for i, row in df.iterrows():
    cv = df.iloc[i, 5] + " " + df.iloc[i, 6] + " " + df.iloc[i, 7] + " " + df.iloc[i, 8] + " " + df.iloc[i, 9] + " " + df.iloc[i, 10]
    df.at[i, 'AllText'] = cv  # df.set_value() was removed from pandas; at[] is the replacement
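The row-by-row loop works, but pandas can do the same join in one vectorized pass. A sketch, assuming columns 5 through 10 are exactly the text fields being combined:
In [ ]:
# Vectorized equivalent: space-join columns 5-10 for every row at once
df['AllText'] = df.iloc[:, 5:11].astype(str).agg(' '.join, axis=1)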
In [122]:
# Virality score: log2 of (frequency x impressions / 1000)
df['Log'] = np.log2(df['freq'] * df['impressions'] / 1000)
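One caveat: any row where freq * impressions is zero yields log2(0) = -inf, which would skew the mean and standard deviation computed next. A defensive sketch, assuming such rows carry no signal and can be dropped:
In [ ]:
# Keep only rows with a finite log score
df = df[np.isfinite(df['Log'])].reset_index(drop=True)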
In [123]:
data_mean = df["Log"].mean()
print(data_mean)
data_std = df["Log"].std()
print(data_std)
%matplotlib inline
plt.hist(df["Log"])
plt.show()
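Since the next step draws the class boundary at one standard deviation below the mean, it can help to mark that cutoff on the histogram. A small sketch reusing the statistics just computed:
In [ ]:
# Redraw the histogram with the (mean - 1 std) cutoff marked
plt.hist(df["Log"])
plt.axvline(data_mean - data_std, color='red', linestyle='--', label='mean - 1 std')
plt.xlabel('log2 score')
plt.legend()
plt.show()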
In [124]:
df.shape
Out[124]:
In [125]:
# Virality cutoff at one sigma below the mean: scores under (mean - std) are notviral, the rest viral
df['viral'] = np.where(df['Log'] < data_mean - data_std, 'notviral', 'viral')
df['viral_num'] = df.viral.map({'notviral':0, 'viral':1})
#df.head()
In [126]:
#df.tail()
In [127]:
df.shape
Out[127]:
In [128]:
df.viral.value_counts()
Out[128]:
In [129]:
X = df.AllText
y = df.viral_num
print(X.shape)
print(y.shape)
In [130]:
X.head()
Out[130]:
In [131]:
y.head()
Out[131]:
In [132]:
# split X and y into training and testing sets
from sklearn.model_selection import train_test_split  # formerly sklearn.cross_validation, removed in newer scikit-learn
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
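The classes are unbalanced, so a stratified split keeps the viral/non-viral ratio the same in both halves. An optional variant (not what the cells above use):
In [ ]:
# Same split, but preserving class proportions across train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)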
In [143]:
# instantiate the vectorizer
# Test the following scenarios: max_df=0.5 (see the grid-search sketch after this cell)
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(max_df=0.5)
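The comment above suggests trying several max_df scenarios. One systematic way is a small grid search over a vectorizer-plus-classifier pipeline; a sketch, with illustrative candidate values and AUC as the (assumed) score to optimize:
In [ ]:
# Grid-search max_df on a CountVectorizer + MultinomialNB pipeline
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
pipe = make_pipeline(CountVectorizer(), MultinomialNB())
param_grid = {'countvectorizer__max_df': [0.3, 0.5, 0.7, 1.0]}
grid = GridSearchCV(pipe, param_grid, scoring='roc_auc', cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_, grid.best_score_)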
In [144]:
#X_train
In [145]:
vect
Out[145]:
In [146]:
# learn training data vocabulary, then use it to create a document-term matrix
# FOLLOWING CAN BE DONE IN SINGLE STEP: X_train_dtm = vect.fit_transform(X_train)
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
In [147]:
X_train_dtm
Out[147]:
In [148]:
# transform testing data (using fitted vocabulary) into a document-term matrix
X_test_dtm = vect.transform(X_test)
X_test_dtm
Out[148]:
In [149]:
# import and instantiate a Multinomial Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
In [150]:
# train the model using X_train_dtm (timing it with an IPython "magic command")
%time nb.fit(X_train_dtm, y_train)
Out[150]:
In [151]:
# make class predictions for X_test_dtm
y_pred_class = nb.predict(X_test_dtm)
In [152]:
# calculate accuracy of class predictions
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)
Out[152]:
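Accuracy can flatter a model when classes are imbalanced, so compare it against the null accuracy (always predicting the majority class). A quick sketch:
In [ ]:
# Null accuracy: share of the test set in the majority class
y_test.value_counts(normalize=True).max()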
In [153]:
# print the confusion matrix
metrics.confusion_matrix(y_test, y_pred_class)
Out[153]:
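The raw matrix is easy to misread, so a labeled version is worth printing. A sketch (scikit-learn orders rows and columns by sorted label, so 0 = notviral comes first):
In [ ]:
# Confusion matrix with explicit row/column labels
cm = metrics.confusion_matrix(y_test, y_pred_class)
pd.DataFrame(cm, index=['actual notviral', 'actual viral'], columns=['pred notviral', 'pred viral'])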
In [94]:
# print message text for the false positives (non-viral incorrectly classified as viral)
X_test[y_test < y_pred_class]
Out[94]:
In [95]:
# print message text for the false negatives (Viral incorrectly classified as non-viral)
X_test[y_test > y_pred_class]
Out[95]:
In [96]:
# example false negative
#X_test[3]
In [97]:
# calculate predicted probabilities for X_test_dtm (poorly calibrated)
y_pred_prob = nb.predict_proba(X_test_dtm)[:, 1]
y_pred_prob
Out[97]:
In [98]:
# calculate AUC
metrics.roc_auc_score(y_test, y_pred_prob)
Out[98]:
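A single AUC number summarizes the whole ROC curve; plotting the curve shows where the true-positive/false-positive trade-off sits. A sketch using the probabilities above:
In [ ]:
# ROC curve for the Naive Bayes probabilities
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_prob)
plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.show()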
In [99]:
# import and instantiate a logistic regression model
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
In [100]:
# train the model using X_train_dtm
%time logreg.fit(X_train_dtm, y_train)
Out[100]:
In [101]:
# make class predictions for X_test_dtm
y_pred_class = logreg.predict(X_test_dtm)
In [102]:
# calculate predicted probabilities for X_test_dtm (well calibrated)
y_pred_prob = logreg.predict_proba(X_test_dtm)[:, 1]
y_pred_prob
Out[102]:
In [103]:
# calculate accuracy
metrics.accuracy_score(y_test, y_pred_class)
Out[103]:
In [104]:
# calculate AUC
metrics.roc_auc_score(y_test, y_pred_prob)
Out[104]:
In [105]:
# store the vocabulary of X_train
X_train_tokens = vect.get_feature_names_out()  # get_feature_names() in older scikit-learn
len(X_train_tokens)
Out[105]:
In [106]:
# examine the first 50 tokens
print(X_train_tokens[0:50])
In [107]:
# examine the last 50 tokens
print(X_train_tokens[-50:])
In [108]:
# Naive Bayes counts the number of times each token appears in each class
nb.feature_count_
Out[108]:
In [109]:
# rows represent classes, columns represent tokens
nb.feature_count_.shape
Out[109]:
In [110]:
# number of times each token appears across all Non-viral Buzzes
non_viral_token_count = nb.feature_count_[0, :]
non_viral_token_count
Out[110]:
In [111]:
# number of times each token appears across all Viral Buzzes
viral_token_count = nb.feature_count_[1, :]
viral_token_count
Out[111]:
In [112]:
# create a DataFrame of tokens with their separate non-viral and viral counts
tokens = pd.DataFrame({'token':X_train_tokens, 'non_viral':non_viral_token_count, 'viral':viral_token_count}).set_index('token')
tokens.head()
Out[112]:
In [113]:
# examine 20 random DataFrame rows
tokens.sample(20, random_state=6)
Out[113]:
In [114]:
# Naive Bayes counts the number of observations in each class
nb.class_count_
Out[114]:
In [115]:
# add 1 to non-viral and viral counts to avoid dividing by 0
tokens['non_viral'] = tokens.non_viral + 1
tokens['viral'] = tokens.viral + 1
tokens.sample(5, random_state=6)
Out[115]:
In [116]:
# convert the non-viral and viral counts into frequencies
tokens['non_viral'] = tokens.non_viral / nb.class_count_[0]
tokens['viral'] = tokens.viral / nb.class_count_[1]
tokens.sample(5, random_state=6)
Out[116]:
In [117]:
# calculate the ratio of viral-to-non-viral for each token
tokens['viral_ratio'] = tokens.viral / tokens.non_viral
tokens.sample(5, random_state=6)
Out[117]:
In [118]:
# examine the DataFrame sorted by viral_ratio
# note: use sort() instead of sort_values() for pandas 0.16.2 and earlier
tokens.sort_values('viral_ratio', ascending=False)
Out[118]:
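The full sorted frame is long; the extremes are what matter. A sketch of the tokens most and least associated with virality:
In [ ]:
# Tokens with the highest and lowest viral-to-non-viral ratios
ranked = tokens.sort_values('viral_ratio', ascending=False)
print(ranked.head(10))
print(ranked.tail(10))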
In [119]:
# look up the viral_ratio for a given token
tokens.loc['stanford', 'viral_ratio']
Out[119]:
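.loc raises a KeyError for any token absent from the training vocabulary; Series.get returns None instead. A defensive sketch ('stanford' is just the example token used above):
In [ ]:
# Safe lookup: None instead of an exception for unseen tokens
tokens['viral_ratio'].get('stanford')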