June 2008: This discussion blows up.
Run get_data.py, a script that parses a list of redirects with endashes (from this Quarry query) and gets data about when each page was first edited and by whom. It takes about a day to run on Tool Labs, so it is not run in this notebook.
In [ ]:
!python get_data.py
get_data.py produces tsv and pickle files for enwiki redirects.
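For reference, here is a rough sketch of the kind of per-page lookup get_data.py presumably performs: asking the MediaWiki API for each redirect's oldest revision, to learn when it was first edited and by whom. The function name and the exact revision properties collected are assumptions, not the script's actual code, so the cell is left unexecuted.
In [ ]:
import requests

API = "https://en.wikipedia.org/w/api.php"

def first_edit(title):
    # Fetch the oldest revision of a page: id, timestamp, user, edit summary,
    # and wikitext (which contains the redirect target).
    params = {
        "action": "query",
        "prop": "revisions",
        "titles": title,
        "rvdir": "newer",   # oldest revision first
        "rvlimit": 1,
        "rvprop": "ids|timestamp|user|comment|content",
        "format": "json",
    }
    pages = requests.get(API, params=params).json()["query"]["pages"]
    return next(iter(pages.values()))["revisions"][0]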
In [1]:
!ls -lah
In [4]:
import pickle
In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [110]:
with open("enwiki-redirects-endash-processed.pickle", "rb") as pkl_file:
    df = pickle.load(pkl_file)
len(df)
Out[110]:
In [111]:
def is_endash_rename(row):
    # In a redirect created by an en dash rename, the redirect's own title has a
    # hyphen and its text points at the en dash title, e.g. "#REDIRECT [[Foo–Bar]]".
    # The "#REDIRECT [[" prefix is 12 characters, so the en dash in the text sits
    # roughly 12 characters to the right of the hyphen's position in the title.
    page_text = row.page_text
    page_title = row.page_title
    title_dash_loc = page_title.find("-")
    text_dash_loc = page_text.find("–")
    sep = text_dash_loc - title_dash_loc
    return title_dash_loc > 0 and 10 < sep < 14
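As a hypothetical illustration (this row is made up, not taken from the dataset): the hyphen in the title below is at index 4, the en dash in the redirect text is at index 16, and the offset of 12 falls inside the 10–14 window the function checks for.
In [ ]:
example = pd.Series({
    "page_title": "2007-08 NHL season",                # hyphen title (the redirect)
    "page_text": "#REDIRECT [[2007–08 NHL season]]",   # en dash target
})
is_endash_rename(example)   # offset is 16 - 4 = 12, so this returns True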
In [112]:
df['is_endash_rename'] = df.apply(is_endash_rename, axis=1)
df['is_endash_rename'].value_counts()
Out[112]:
In [113]:
total_df = df[df.is_endash_rename == True].copy()
In [114]:
total_df['datetime'] = pd.to_datetime(total_df['timestamp'])
In [115]:
total_df = total_df.set_index('datetime')
In [116]:
gp = total_df.groupby(pd.Grouper(freq='1W', closed='left'))  # pd.TimeGrouper in older pandas
In [117]:
gp.revid.count().plot(figsize=[12,8])
Out[117]:
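The same weekly counts can also be computed with resample, which reads a bit more directly; this is just an unexecuted equivalent of the groupby above.
In [ ]:
# Weekly counts of en dash rename redirects (total_df is indexed by datetime)
total_df.revid.resample('1W', closed='left').count().plot(figsize=[12, 8])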
In [53]:
# Loop-based version of the en dash filter above, building total_df row by row.
total_df = pd.DataFrame(columns=["revid", "timestamp", "user", "comment", "is_redirect", "page_title", "page_namespace", "page_text"])
for row in df.iterrows():
    page_text = row[1].page_text
    page_title = row[1].page_title
    title_dash_loc = page_title.find("-")
    text_dash_loc = page_text.find("–")
    sep = text_dash_loc - title_dash_loc
    if title_dash_loc > 0 and 10 < sep < 14:
        total_df = total_df.append(row[1])  # DataFrame.append is deprecated in newer pandas