In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
df = pd.read_csv('atlas-taggings.csv')
In [3]:
df.head(10)
Out[3]:
In [4]:
articles = df[df.tagged_type == 'Article'].copy() #take a copy since tag_url gets rewritten below
In [5]:
articles.head()
Out[5]:
In [34]:
def get_tag(x):
    return x.split('/')[2]
#changing this function to get_tag_name() in the module.
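#quick sanity check of get_tag (the URL layout below is an assumed example, not taken from the data):
#get_tag('www.atlasobscura.com/categories/morbid-monday')  # -> 'morbid-monday'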
In [10]:
articles.tag_url = articles.tag_url.apply(get_tag)
articles.head()
Out[10]:
In [11]:
test = pd.get_dummies(articles.tag_url)
In [12]:
test.head()
Out[12]:
In [13]:
articles = articles.join(test)
In [14]:
articles.drop(['tag_id','tag_url','tagged_type','tagged_id'],axis=1,inplace=True)
In [15]:
articles.head()
Out[15]:
In [16]:
unique_articles = articles.groupby('tagged_url').sum() #made into func
In [17]:
unique_articles = unique_articles.reset_index()
In [18]:
unique_articles = unique_articles.set_index('tagged_url')
In [19]:
#now we need the pageviews, keyed by article URL, so we can join them to the tag data
pageviews = pd.read_csv('output_articles_performance.csv',header=None,names=['url','published','pageviews'])
pageviews.head()
#In the future I should import the module and run it here instead of grabbing its CSV output.
Out[19]:
In [20]:
pageviews.url = ['www.atlasobscura.com/articles/' + x for x in pageviews.url]
In [21]:
pageviews.head()
Out[21]:
In [22]:
pageviews.describe()
Out[22]:
In [23]:
pageviews.set_index('url',inplace=True)
In [24]:
article_set = unique_articles.join(pageviews)
In [25]:
article_set.head()
Out[25]:
In [26]:
article_set.reset_index()
Out[26]:
In [27]:
article_set['upper_quartile'] = [1 if x > 10000 else 0 for x in article_set.pageviews]
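In [ ]:
#sanity check on the 10,000 cutoff: compare it against the actual 75th percentile (not part of the original run)
article_set.pageviews.quantile(0.75)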
In [28]:
article_set.pageviews.plot(kind='hist', bins=100,title='Page View Distribution, All Content')
Out[28]:
In [29]:
article_set['published'] = pd.to_datetime(article_set['published'])
In [30]:
article_set
Out[30]:
In [31]:
article_set['year'] = pd.DatetimeIndex(article_set['published']).year
In [32]:
ax = article_set.boxplot(column='pageviews',by='year',figsize=(6,6),showfliers=False)
ax.set(title='PV distribution by year',ylabel='pageviews')
Out[32]:
In [33]:
yearly = article_set.set_index('published').resample('M').mean().plot(y='pageviews')
yearly.set(title='Mean Pageviews by Month of Article Publication')
Out[33]:
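In [ ]:
#if monthly totals (rather than the means above) are wanted, swap mean() for sum():
article_set.set_index('published').resample('M').sum().plot(y='pageviews', title='Total Pageviews by Month of Article Publication')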
In [35]:
time_series = pd.read_csv('time-series.csv')
In [36]:
type(time_series)
Out[36]:
In [37]:
time_series = time_series.drop('Unnamed: 0',axis=1)
In [38]:
time_series = time_series.T
In [39]:
time_series.columns
Out[39]:
In [40]:
time_series['total'] = time_series.sum(axis=1)
In [41]:
time_series.head()
Out[41]:
In [42]:
time_series['days_to_90p'] = [(time_series.iloc[x].expanding().sum() > time_series.iloc[x].total*.90).argmax()
                              for x in range(len(time_series))]
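#what the expanding-sum trick does, on a toy row (illustrative numbers only):
#  daily PVs [50, 30, 10, 5, 5] -> total = 100, 90% threshold = 90
#  expanding sums [50, 80, 90, 95, 100] -> the first value over 90 comes on day 3
#(the trailing 'total' column gets swept into the cumulative sum too, but the crossing happens before it)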
In [43]:
time_series.reset_index(inplace=True)
In [44]:
time_series.head(1)
Out[44]:
In [45]:
time_series['index'] = ['www.atlasobscura.com/articles/' + x for x in time_series['index']]
time_series.set_index('index',inplace=True)
time_series = time_series.join(pageviews.published)
time_series.head(5)
Out[45]:
In [46]:
time_series['published'] = pd.to_datetime(time_series.published)
In [47]:
time_series['year_pub'] = pd.DatetimeIndex(time_series['published']).year
In [48]:
time_series.boxplot(column='days_to_90p',by='year_pub')
Out[48]:
In [49]:
time_series.year_pub.value_counts(dropna=False)
Out[49]:
In [50]:
time_series[['days_to_90p','total','year_pub']].corr()
Out[50]:
In [403]:
#sum the first 31 day columns per row; fillna only those columns, since filling the whole
#frame can trip over the 'published' datetime column
time_series['30-day-PVs'] = time_series.iloc[:, 0:31].fillna(0).sum(axis=1)
In [417]:
time_series['7-day-PVs'] = [time_series.fillna(value=0).iloc[x,0:8].sum() for x in range(len(time_series))]
In [92]:
total_tagged= pd.DataFrame(data=article_set.sum(),columns = ['num_tagged'])
In [93]:
total_tagged.sort_values('num_tagged',ascending=False,inplace=True)
In [94]:
total_tagged.drop('pageviews',axis=0,inplace=True)
In [95]:
total_tagged[total_tagged.num_tagged >= 10].count()
Out[95]:
In [96]:
total_tagged[total_tagged.num_tagged <=5].index
Out[96]:
In [124]:
#tag_analysis = article_set.drop(total_tagged[total_tagged.num_tagged < 5].index,axis=1)
#Resetting tag_analysis to contain all tags so it can be filtered later however needed; that keeps things clearer.
tag_analysis = article_set.copy() #copy so later in-place edits don't touch article_set
In [98]:
print tag_analysis.shape
tag_analysis.head()
Out[98]:
In [60]:
tag_analysis.tail()
tag_analysis.to_csv('tag_analysis_ready.csv')
In [99]:
print total_tagged.shape
total_tagged.head(30)
In [100]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(interaction_only=True)
In [101]:
poly_df = pd.DataFrame(poly.fit_transform(tag_analysis.fillna(0).drop(['published','pageviews','upper_quartile','year'],axis=1)))
In [102]:
poly.n_output_features_
Out[102]:
In [103]:
total_tagged.loc['extra-mile']
Out[103]:
In [104]:
regular_features = ['places-you-can-no-longer-go','100-wonders','extra-mile','video-wonders','news','features','columns',
'found','animals','fleeting-wonders','visual','other-capitals-of-the-world','video','art','list','objects-of-intrigue',
'maps','morbid-monday','female-explorers','naturecultures']
In [125]:
total_tagged[total_tagged.num_tagged >10].shape
Out[125]:
In [304]:
interactions = pd.DataFrame()
In [305]:
#sum each pair of tag columns; the next cell turns the 2s into 1s and everything else into 0s
frequent_tags = tag_analysis.drop(['published','pageviews','upper_quartile','year'],axis=1).drop(
    total_tagged[total_tagged.num_tagged < 10].index,axis=1).columns
for item in regular_features:
    for column in frequent_tags:
        interactions[item + '_' + column] = tag_analysis[item] + tag_analysis[column]
In [306]:
def correct_values(x):
    #an article counts for the interaction only when both tags are present (a sum of 2)
    if x == 2.0:
        return 1
    else:
        return 0

for item in interactions.columns:
    interactions[item] = interactions[item].apply(correct_values)
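#an equivalent vectorized form of the two steps above (kept commented since the loop already ran):
#interactions = (interactions == 2.0).astype(int)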
In [307]:
interactions.head(2)
Out[307]:
In [308]:
tagged_total = pd.DataFrame(data =interactions.sum(),columns=['num_tagged'])
tagged_total = tagged_total.sort_values('num_tagged',ascending=False)
In [309]:
#the biggest counts are the self-pairings (e.g. 'news_news'), which just repeat the single-tag totals
identity_tags = tagged_total[0:26].index
In [310]:
interactions = interactions.drop(identity_tags,axis=1)
In [311]:
tagged_total = pd.DataFrame(data =interactions.sum(),columns=['num_tagged'])
tagged_total = tagged_total.sort_values('num_tagged',ascending=False)
tagged_total.head(10)
Out[311]:
In [312]:
#DO I WANT TO DROP THE EMPTY COLUMNS? (handled below with drop_zero_cols)
#for item in interactions.columns:
#    if interactions[item].sum() == 0:
#        interactions = interactions.drop(item,axis=1)
In [313]:
interactions.head(10)
Out[313]:
In [314]:
interactions = interactions.join(pageviews)
In [315]:
#drop empty cols
def drop_zero_cols(df):
    for item in df.columns:
        if df[item].sum() == 0:
            df = df.drop(item,axis=1)
    return df
In [316]:
interactions = drop_zero_cols(interactions.fillna(0).drop(['published','pageviews'],axis=1))
interactions = interactions.join(pageviews)
In [317]:
interactions.head(1)
Out[317]:
In [318]:
interaction_totals = pd.DataFrame(interactions.sum().sort_values(ascending=False),columns=['num_tagged'])
In [345]:
interaction_totals[interaction_totals.num_tagged < 4].shape
Out[345]:
In [346]:
interactions_analysis = interactions.drop(interaction_totals[interaction_totals.num_tagged < 4].index,axis=1)
In [347]:
interactions_analysis.head()
Out[347]:
In [348]:
#Check whether number of Aggregated stories published per day has an impact on average/total Day 0 - 1 traffic.
In [349]:
from sklearn import linear_model
from sklearn import metrics
from sklearn import cross_validation
In [350]:
interactions_analysis['upper_quartile'] = [1 if x > 10000 else 0 for x in interactions.pageviews]
In [351]:
interactions_analysis['twenty_thousand'] = [1 if x > 20000 else 0 for x in interactions.pageviews]
In [352]:
y = interactions_analysis.upper_quartile
X = interactions_analysis.drop(['pageviews','published','upper_quartile','twenty_thousand'],axis=1)
In [353]:
kf = cross_validation.KFold(len(interactions_analysis),n_folds=5)
scores = []
for train_index, test_index in kf:
    lr = linear_model.LogisticRegression().fit(X.iloc[train_index],y.iloc[train_index])
    scores.append(lr.score(X.iloc[test_index],y.iloc[test_index]))
print "average accuracy for LogisticRegression is", np.mean(scores)
print "average of the set is: ", np.mean(y)
In [354]:
interactions_lr_scores = lr.predict_proba(X)[:,1]
In [355]:
print metrics.roc_auc_score(y,interactions_lr_scores)
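In [ ]:
#optional: plot the ROC curve behind that AUC (a quick sketch, not part of the original run)
fpr, tpr, _ = metrics.roc_curve(y, interactions_lr_scores)
plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], '--')
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
plt.show()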
In [356]:
interactions_probabilities = pd.DataFrame(zip(X.columns,interactions_lr_scores),columns=['tags','probabilities'])
In [357]:
interactions_probabilities.sort_values('probabilities',ascending=False)
Out[357]:
In [475]:
interaction_totals.head(2)
Out[475]:
In [469]:
def split_tag(x):
    return x.split('_')[1]
interactions_probabilities = interactions_probabilities.reset_index()
interactions_probabilities['subtag'] = interactions_probabilities.tags.apply(split_tag)
In [477]:
interactions_probabilities = interactions_probabilities.sort_values(['tags','probabilities'],ascending=[1, 0])
In [471]:
interactions_probabilities = interactions_probabilities.set_index('tags').join(interaction_totals)
In [478]:
interactions_probabilities
Out[478]:
In [567]:
interactions_probabilities['pageviews'] = [sum(interactions['pageviews'][interactions[item]==1]) for item in interactions_probabilities.tags]
In [570]:
interactions_probabilities['mean-PVs'] = interactions_probabilities['pageviews'] // interactions_probabilities['num_tagged']
In [579]:
regular_features
Out[579]:
In [623]:
interactions_probabilities[interactions_probabilities.tags.str.contains('features')==True].sort_values('mean-PVs',
ascending = False)
Out[623]:
In [ ]:
interactions_probabilities.sort_values('probabilities',ascending = False)
In [625]:
np.mean(interactions.pageviews)
Out[625]:
In [620]:
#boxplot of pageviews for the articles carrying each regular feature
#(regular_features still uses dashes here, matching the interaction column names)
fig, axes = plt.subplots(len(regular_features), 1, figsize=(10, 40))
for i, name in enumerate(regular_features):
    cols = [c for c in interactions.columns if name in c]
    mask = interactions[cols].sum(axis=1) > 0
    interactions.loc[mask, 'pageviews'].dropna().plot(kind='box', ax=axes[i], title=name)
plt.show()
In [453]:
#doublecheck my work on pageviews vs num-published
pub_volume = tag_analysis[['published','pageviews']].copy() #copy to avoid writing onto a slice of tag_analysis
pub_volume['num_pubbed'] = 1
pub_volume['published'] = pd.to_datetime(pub_volume.published)
pub_volume = pub_volume.set_index('published')
In [454]:
pub_volume.head(10)
Out[454]:
In [455]:
pub_volume = pub_volume.resample('M').sum().dropna()
In [456]:
pub_volume['year'] = pub_volume.index.year
In [457]:
pub_volume[pub_volume.index.year >=2015].corr()
Out[457]:
In [458]:
pub_volume[pub_volume.index.year >=2015].plot(kind='scatter',x='num_pubbed',y='pageviews')
Out[458]:
In [459]:
import seaborn as sns
ax = sns.regplot(x='num_pubbed',y='pageviews',data=pub_volume)
In [446]:
#doublecheck my work on pageviews vs num-published
pub_volume = time_series[['published','7-day-PVs']].copy() #copy to avoid writing onto a slice of time_series
pub_volume['num_pubbed'] = 1
pub_volume['published'] = pd.to_datetime(pub_volume.published)
pub_volume = pub_volume.set_index('published')
In [447]:
pub_volume.head(10)
Out[447]:
In [448]:
pub_volume = pub_volume.resample('D').sum().dropna()
pub_volume['year'] = pub_volume.index.year
pub_volume[pub_volume.index.year >= 2015].corr()
Out[448]:
In [451]:
pub_volume[pub_volume.index >='2016-01-01'].plot(kind='scatter',x='num_pubbed',y='7-day-PVs',title='7-Day PVs')
Out[451]:
In [452]:
import seaborn as sns
ax = sns.regplot(x='num_pubbed',y='7-day-PVs',data=pub_volume)
In [540]:
simplereach = pd.read_csv('simplereach-tags.csv')
In [541]:
simplereach.head(1)
Out[541]:
In [542]:
simplereach = simplereach.set_index('Tag')
In [546]:
total_tagged2 = total_tagged.copy() #copy so relabelling the index below doesn't also change total_tagged
In [547]:
total_tagged2.head(4)
Out[547]:
In [548]:
total_tagged2.index = [x.replace('-',' ') for x in total_tagged.index]
simplereach = simplereach.join(total_tagged2)
In [549]:
simplereach['mean-PVs'] = simplereach['Page Views'] // simplereach['num_tagged']
simplereach['mean-shares'] = simplereach['Facebook Shares'] // simplereach['num_tagged']
In [550]:
simplereach = simplereach[['mean-PVs','mean-shares','num_tagged']]
In [622]:
simplereach[simplereach['num_tagged'] > 5].sort_values('mean-PVs',ascending=False)
In [554]:
#simplereach's Tag index uses spaces, so swap the dashes out of regular_features for this lookup
simplereach.loc[[x.replace('-',' ') for x in regular_features]].sort_values('mean-PVs',ascending=False)
Out[554]:
In [135]:
from sklearn import linear_model
In [136]:
from sklearn import metrics
In [137]:
tag_analysis.fillna(value=0,inplace=True)
In [138]:
y = tag_analysis.upper_quartile
X = tag_analysis.drop(['pageviews','published','upper_quartile'],axis=1)
In [139]:
from sklearn import cross_validation
In [140]:
kf = cross_validation.KFold(len(tag_analysis),n_folds=5)
scores = []
for train_index, test_index in kf:
    lr = linear_model.LogisticRegression().fit(X.iloc[train_index],y.iloc[train_index])
    scores.append(lr.score(X.iloc[test_index],y.iloc[test_index]))
print "average accuracy for LogisticRegression is", np.mean(scores)
print "average of the set is: ", np.mean(y)
In [141]:
lr_scores = lr.predict_proba(X)[:,1]
In [142]:
print metrics.roc_auc_score(y,lr_scores)
In [144]:
lr_scores
Out[144]:
In [145]:
coefficients = pd.DataFrame(zip(X.columns,lr.coef_[0]),columns=['tags','coefficients'])
probabilities = pd.DataFrame(zip(X.columns,lr_scores),columns=['tags','probabilities'])
In [146]:
probabilities.sort_values('probabilities',ascending=False)
Out[146]:
In [147]:
coefficients.sort_values('coefficients',ascending=False)
Out[147]:
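In [ ]:
#the coefficients are log-odds; exponentiating gives odds ratios, which are easier to compare
#(a small addition, not part of the original analysis)
coefficients['odds_ratio'] = np.exp(coefficients.coefficients)
coefficients.sort_values('odds_ratio',ascending=False).head(10)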
In [148]:
tag_analysis[tag_analysis['100-wonders'] ==1].describe()
Out[148]:
In [149]:
tag_analysis.head()
Out[149]:
In [150]:
from sklearn.grid_search import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
In [ ]:
params = {'n_neighbors': range(2, 200),
          'weights': ['distance','uniform']}
gs = GridSearchCV(estimator=KNeighborsClassifier(),param_grid=params,n_jobs=8,cv=10)
gs.fit(X,y)
print gs.best_params_
print gs.best_score_
In [160]:
print type(gs.best_estimator_)
In [161]:
knn = gs.best_estimator_.fit(X,y)
In [162]:
knn_scores = knn.predict_proba(X)[:,1]
In [163]:
print np.mean(knn_scores)
In [164]:
print np.mean(lr_scores)
In [165]:
knn_probabilities = pd.DataFrame(zip(X.columns,knn_scores),columns=['tags','probabilities'])
In [166]:
knn_probabilities.sort_values('probabilities',ascending=False)
Out[166]:
In [167]:
print 'knn', metrics.roc_auc_score(y,knn_scores)
print 'lr', metrics.roc_auc_score(y,lr_scores)
In [62]:
probabilities = probabilities.set_index('tags')
In [63]:
probabilities = probabilities.join(total_tagged)
In [64]:
probabilities.to_csv('tag-probabilities-logisticregression.csv')
In [65]:
from sklearn.ensemble import RandomForestClassifier
In [66]:
params = {'max_depth': np.arange(20,100,2),
          'min_samples_leaf': np.arange(90,200,2),
          'n_estimators': [20]} #param_grid values must be lists/arrays, so wrap the single value
gs1 = GridSearchCV(RandomForestClassifier(),param_grid=params, cv=10, scoring='roc_auc',n_jobs=8,verbose=1)
gs1.fit(X,y)
print gs1.best_params_
print gs1.best_score_
In [67]:
#reuse the best estimator found by the grid search
rf = gs1.best_estimator_
rf.fit(X,y)
probs = rf.predict_proba(X)[:,1]
#note: these two scores are computed on the training data; gs1.best_score_ above is the cross-validated AUC
print rf.score(X,y)
print metrics.roc_auc_score(y,probs)
In [69]:
probs = pd.DataFrame(zip(X.columns,probs),columns=['tags','probabilities'])
In [71]:
probs.sort_values('probabilities',ascending=False)
Out[71]:
In [144]:
tag_analysis2 = article_set.drop(total_tagged[total_tagged.num_tagged < 15].index,axis=1)
In [190]:
tag_analysis2['ten_thousand'] = [1 if x > 10000 else 0 for x in tag_analysis2.pageviews]
In [191]:
tag_analysis2.fillna(value=0,inplace=True)
y2 = tag_analysis2.ten_thousand
X2 = tag_analysis2.drop(['pageviews','published','upper_quartile','ten_thousand'],axis=1) #also drop 'published' so the datetime column doesn't go into the model
In [192]:
kf2 = cross_validation.KFold(len(tag_analysis2),n_folds=5)
scores2 = []
for train_index, test_index in kf2:
    lr2 = linear_model.LogisticRegression().fit(X2.iloc[train_index],y2.iloc[train_index])
    scores2.append(lr2.score(X2.iloc[test_index],y2.iloc[test_index]))
print "average accuracy for LogisticRegression is", np.mean(scores2)
print "average of the set is: ", np.mean(y2)
In [193]:
print tag_analysis2.shape
print y2.shape
print X2.shape
In [194]:
lr_scores2 = lr2.predict_proba(X2)[:,1]
In [195]:
lr2_probs = pd.DataFrame(zip(X2.columns,lr_scores2),columns=['tags','probabilities'])
In [196]:
lr2_probs.sort_values('probabilities',ascending=False)
Out[196]:
In [197]:
metrics.roc_auc_score(y2,lr2.predict_proba(X2)[:,1])
Out[197]:
In [198]:
lr2_probs = lr2_probs.set_index('tags')
In [199]:
lr2_probs = lr2_probs.join(total_tagged)
In [206]:
plt.figure(figsize=(10,10))
plt.scatter(lr2_probs.num_tagged,lr2_probs.probabilities)
plt.show()
In [201]:
lr2_probs = lr2_probs.sort_values('probabilities',ascending=False)
In [202]:
lr2_probs = lr2_probs.reset_index()
In [203]:
lr2_probs.to_csv('min15tags_min10000pvs.csv')
In [204]:
lr2_probs.shape
Out[204]:
In [207]:
lr2_probs
Out[207]:
In [ ]: