In [1]:
import pickle
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [2]:
# Load the pickled object from disk; later cells use it as a pandas DataFrame.
# The context manager guarantees the file handle is closed even if
# unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file --
# only load pickles that come from a trusted source.
with open("enwiki-redirects-minus-processed.pickle", 'rb') as pkl_file:
    df = pickle.load(pkl_file)
len(df)
Out[2]:
In [3]:
# Peek at the first five rows: print each row's page_text together with the
# position of the first en dash in it (str.find gives -1 when there is none).
for _, row in df[0:5].iterrows():
    text = row.page_text
    print(text, text.find("–"))
In [4]:
def is_endash_redirect(row):
    """Return True when a row looks like a hyphen-to-en-dash redirect.

    The first hyphen-minus ("-") in ``row.page_title`` and the first en dash
    ("–") in ``row.page_text`` are located. The row qualifies only when both
    exist and the en dash sits 11, 12 or 13 characters further into the text
    than the hyphen sits into the title.

    Parameters
    ----------
    row : object with ``page_text`` and ``page_title`` string attributes
        Typically a DataFrame row passed by ``df.apply(..., axis=1)``.

    Returns
    -------
    bool
        True if both dashes are present and the offset is within the
        window, otherwise False.
    """
    title_dash_loc = row.page_title.find("-")
    text_dash_loc = row.page_text.find("–")

    # str.find returns -1 when the character is absent.
    if text_dash_loc == -1 or title_dash_loc == -1:
        return False

    sep = text_dash_loc - title_dash_loc
    # The original condition was `sep > 10 or sep < 14`, which holds for every
    # integer; a bounded window requires both comparisons to hold.
    # NOTE(review): presumably the window is centred on 12 because a
    # "#REDIRECT [[" prefix is 12 characters long -- confirm.
    return 10 < sep < 14
In [5]:
# Classify every row with is_endash_redirect and store the result as a new
# column, then show how many rows fall on each side.
df['is_endash_redirect'] = df.apply(func=is_endash_redirect, axis=1)
df.is_endash_redirect.value_counts()
Out[5]:
In [6]:
# Show the first five rows, now including the is_endash_redirect column.
df.head(5)
Out[6]:
In [7]:
# Keep only the rows flagged as en-dash redirects. The column is already
# boolean, so it is used directly as the mask instead of comparing to True.
# .copy() detaches the result so later column assignments do not touch df.
total_df = df[df['is_endash_redirect']].copy()
In [8]:
# Parse the 'timestamp' column into pandas datetimes and keep the result in a
# new 'datetime' column.
parsed_timestamps = pd.to_datetime(total_df['timestamp'])
total_df['datetime'] = parsed_timestamps
In [9]:
# Use the 'datetime' column as the row index so the frame can be grouped by
# time. Re-running this cell on its own fails, because set_index moves the
# column into the index.
total_df = total_df.set_index(keys='datetime')
In [20]:
# Bucket the rows into consecutive 3-day bins (left-closed) on the frame's index.
# pd.TimeGrouper was deprecated and later removed from pandas;
# pd.Grouper(freq=...) is the supported equivalent.
gp = total_df.groupby(pd.Grouper(freq='3D', closed='left'))
In [21]:
# Count the non-null revid values in each time bin and draw them as a line
# chart on a linear y axis.
revisions_per_bin = gp['revid'].count()
ax = revisions_per_bin.plot.line(figsize=(14, 6), logy=False)
In [ ]: