NOTE: Need data with Max/Impressions; need a separator between phrases in the `primary_kw` and `tags` columns.
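Option 1: use two classes, Viral vs. Non-Viral (pick -1 std. dev. as an arbitrary cutoff). Option 2: try multiple classes: 1 Buzz (bottom quartile), 2 Buzz (middle 50%) and 3 Buzz (top quartile). Note: the code below currently assigns the three buzz classes using mean ± 1.5 std. dev. of the log score as cutoffs rather than quartiles.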
In [1]:
import os
import json
import time
import pickle
import requests
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
In [112]:
# Load the pipe-delimited CSV straight into df (no need to create an empty
# DataFrame first; read_csv returns a new one).
df = pd.read_csv('may_june_july.csv', delimiter="|")
# Optional filter on pull_cc == 'us', currently disabled. If it is re-enabled,
# keep the reset_index so the index stays 0..n-1.
#df = df[df.pull_cc == 'us']
#df = df.reset_index(drop=True)
df.head()
Out[112]:
In [113]:
# Combine all text columns into a single 'AllText' feature.

# Assign the filled columns back rather than calling fillna(inplace=True) on a
# column selection, which pandas does not guarantee will modify df itself.
df['primary_kw'] = df['primary_kw'].fillna(" ")
df['tags'] = df['tags'].fillna(" ")

# Positional indices of the columns that are concatenated. An earlier version
# used positions 5-10; position 8 was dropped (original note: "Remove metav
# and cat") and position 2 was added.
TEXT_COLUMN_POSITIONS = [2, 5, 6, 7, 9, 10]

# Join the selected columns of each row with single spaces. As with the
# previous row-by-row concatenation, a non-string value (e.g. NaN in a column
# that was not filled above) raises a TypeError instead of being silently
# turned into text.
df['AllText'] = [
    " ".join(row_values)
    for row_values in df.iloc[:, TEXT_COLUMN_POSITIONS].values
]
print(df.tail())
# Log-transform the score (original note: "Log to convert to Normal
# Distribution"). The raw score is freq * (impressions + 1) / 1000.
raw_score = df['freq'] * (df['impressions'] + 1) / 1000

# The previous per-row math.log call raised on zero or negative scores; keep
# failing loudly instead of letting np.log2 produce -inf/NaN values that would
# silently distort the mean and standard deviation computed from this column.
if (raw_score <= 0).any():
    raise ValueError(
        "log2 is undefined for non-positive scores "
        "(freq * (impressions + 1) / 1000 must be > 0)"
    )

# Vectorised base-2 logarithm, addressed by column name rather than by the
# positional index 12 the loop relied on.
df['Log'] = np.log2(raw_score)
# analyse data a bit
data_mean = df["Log"].mean()
print data_mean
data_std = df["Log"].std()
print data_std
%matplotlib inline
plt.hist(df["Log"])
plt.show()
# Assign buzz classes from the log score:
#   '1buzz' : Log <= mean - 1.5 * std
#   '3buzz' : Log >  mean + 1.5 * std
#   '2buzz' : everything in between (rows with a NaN Log also land here,
#             because both comparisons are False for NaN)
BUZZ_STD_MULTIPLIER = 1.5
low_cutoff = data_mean - BUZZ_STD_MULTIPLIER * data_std
high_cutoff = data_mean + BUZZ_STD_MULTIPLIER * data_std

# Vectorised labelling by column name; replaces the row loop that read the
# score through positional column index 12.
log_score = df['Log']
df['viral'] = np.where(
    log_score <= low_cutoff,
    '1buzz',
    np.where(log_score > high_cutoff, '3buzz', '2buzz'),
)
# Binary alternative that was tried earlier and left disabled:
#df['viral'] = np.where(df['Log']<data_mean-1*data_std, 'notviral', 'viral')

# Numeric version of the label (1, 2 or 3).
df['viral_num'] = df['viral'].map({'1buzz': 1, '2buzz': 2, '3buzz': 3})
In [114]:
from sklearn.feature_extraction.text import CountVectorizer

# Model inputs: the combined text column as features and the numeric buzz
# class as the target.
X = df['AllText']
y = df['viral_num']

# Bag-of-words vectorizer; max_df=0.1 drops terms that occur in more than 10%
# of the documents.
vect = CountVectorizer(max_df=0.1)

df.head()
Out[114]:
In [115]:
# Show the last rows of df, including the columns added by the earlier cells.
df.tail()
Out[115]:
In [116]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

# Multinomial Naive Bayes classifier chained after the count vectorizer, so
# the vectorizer is fitted together with the model on whatever data the
# pipeline is given.
nb = MultinomialNB()
pipe = make_pipeline(vect, nb)

# Display the pipeline steps.
pipe.steps
Out[116]:
In [117]:
# calculate accuracy of class predictions
# sklearn.cross_validation was removed in scikit-learn 0.20; cross_val_score
# lives in sklearn.model_selection since 0.18. Fall back to the old module
# only on installations that predate model_selection.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

# Mean accuracy over 12 cross-validation folds for the pipeline held in `pipe`.
cross_val_score(pipe, X, y, cv=12, scoring='accuracy').mean()
Out[117]:
In [118]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# Logistic regression classifier chained after the same count vectorizer.
# Note that this rebinds `pipe`, replacing the pipeline built earlier.
logreg = LogisticRegression()
pipe = make_pipeline(vect, logreg)

# Display the pipeline steps.
pipe.steps
Out[118]:
In [119]:
# Accuracy of the class predictions: score the pipeline currently bound to
# `pipe` on 12 cross-validation folds, then average the per-fold accuracies.
fold_scores = cross_val_score(pipe, X, y, cv=12, scoring='accuracy')
fold_scores.mean()
Out[119]:
In [120]:
# Lower cutoff on the log score: rows at or below this value are labelled '1buzz'.
data_mean-1.5*data_std
Out[120]:
In [121]:
# Upper cutoff on the log score: rows above this value are labelled '3buzz'.
data_mean+1.5*data_std
Out[121]:
In [122]:
# Show the mean and standard deviation of the log score again for reference.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(data_mean)
print(data_std)
In [123]:
# (rows, columns) of df after the derived columns have been added.
df.shape
Out[123]:
In [124]:
# Number of rows in each buzz class ('1buzz', '2buzz', '3buzz'), to check how
# balanced the classes are.
df['viral'].value_counts()
Out[124]:
In [ ]: