The great endash migration

June 2008: This discussion blows up.

Run get_data.py, which is a script for parsing a list of redirects with endashes (from this quarry query) and gets data about when the page was first edited and by whom. It takes about a day to run on toollabs, so it is not run in this notebook.


In [ ]:
!python get_data.py

get_data.py produces tsv and pickle files for enwiki redirects.


In [1]:
!ls -lah


total 74M
drwxr-sr-x 3 tools.paws tools.paws 4.0K Mar  8 19:56 .
drwxr-sr-x 7 tools.paws tools.paws 4.0K Mar  8 18:09 ..
drwxr-sr-x 2 tools.paws tools.paws 4.0K Mar  8 19:56 .ipynb_checkpoints
-rw-r--r-- 1 tools.paws tools.paws  27M Mar  8 18:18 enwiki-redirects-endash-20170308.tsv
-rw-r--r-- 1 tools.paws tools.paws 8.4K Mar  8 21:44 enwiki-redirects-endash-errors.pickle
-rw-r--r-- 1 tools.paws tools.paws 7.6K Mar  8 21:44 enwiki-redirects-endash-errors.tsv
-rw-r--r-- 1 tools.paws tools.paws  11M Mar  8 21:44 enwiki-redirects-endash-processed.pickle
-rw-r--r-- 1 tools.paws tools.paws  10M Mar  8 21:44 enwiki-redirects-endash-processed.tsv
-rw-r--r-- 1 tools.paws tools.paws  27M Mar  8 18:09 enwiki-redirects-to-endash-20170308.tsv
-rw-r--r-- 1 tools.paws tools.paws 1.7K Mar  8 19:29 get_data.py
-rw-r--r-- 1 tools.paws tools.paws  25K Mar  8 19:55 process-visualize.ipynb
-rw-r--r-- 1 tools.paws tools.paws 171K Mar  8 19:56 redirects-scratch-notebook.ipynb

In [4]:
import pickle

In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [110]:
pkl_file = open("enwiki-redirects-endash-processed.pickle", 'rb')
df = pickle.load(pkl_file)
len(df)


Out[110]:
72550

In [ ]:


In [111]:
def is_endash_rename(row):
    page_text = row.page_text
    page_title = row.page_title
    
    title_dash_loc = page_title.find("-")
    text_dash_loc = page_text.find("–")
    
    sep = text_dash_loc - title_dash_loc
    
    if title_dash_loc > 0 and (sep > 10 or sep < 14):
        return True
    else:
        return False

In [112]:
df['is_endash_rename'] = df.apply(is_endash_rename, axis=1)

df['is_endash_rename'].value_counts()


Out[112]:
True     55705
False    16845
Name: is_endash_rename, dtype: int64

In [113]:
total_df = df[df.is_endash_rename == True].copy()

In [114]:
total_df['datetime'] = pd.to_datetime(total_df['timestamp'])

In [115]:
total_df = total_df.set_index('datetime')

In [116]:
gp = total_df.groupby([pd.TimeGrouper('1W', closed='left')])

In [117]:
gp.revid.count().plot(figsize=[12,8])


Out[117]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f0ad9e9b048>
/srv/paws/lib/python3.4/site-packages/matplotlib/font_manager.py:1297: UserWarning: findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans
  (prop.get_family(), self.defaultFamily[fontext]))

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:

scratch


In [53]:
total_df = pd.DataFrame(columns=["revid", "timestamp", "user", "comment", "is_redirect", "page_title", "page_namespace", "page_text"])

for row in df.iterrows():

    
    page_text = row[1].page_text
    page_title = row[1].page_title
    
    title_dash_loc = page_title.find("-")
    text_dash_loc = page_text.find("–")
    
    sep = text_dash_loc - title_dash_loc
    
    if title_dash_loc > 0 and (sep > 10 or sep < 14):
        total_df = total_df.append(row[1])


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-53-b3a2bfa99cbf> in <module>()
     13 
     14     if title_dash_loc > 0 and (sep > 10 or sep < 14):
---> 15         total_df = total_df.append(row[1])

/srv/paws/lib/python3.4/site-packages/pandas/core/frame.py in append(self, other, ignore_index, verify_integrity)
   4419                               index=index,
   4420                               columns=combined_columns)
-> 4421             other = other._convert(datetime=True, timedelta=True)
   4422             if not self.columns.equals(combined_columns):
   4423                 self = self.reindex(columns=combined_columns)

/srv/paws/lib/python3.4/site-packages/pandas/core/generic.py in _convert(self, datetime, numeric, timedelta, coerce, copy)
   3106             self._data.convert(datetime=datetime, numeric=numeric,
   3107                                timedelta=timedelta, coerce=coerce,
-> 3108                                copy=copy)).__finalize__(self)
   3109 
   3110     # TODO: Remove in 0.18 or 2017, which ever is sooner

/srv/paws/lib/python3.4/site-packages/pandas/core/internals.py in convert(self, **kwargs)
   3190 
   3191     def convert(self, **kwargs):
-> 3192         return self.apply('convert', **kwargs)
   3193 
   3194     def replace(self, **kwargs):

/srv/paws/lib/python3.4/site-packages/pandas/core/internals.py in apply(self, f, axes, filter, do_integrity_check, consolidate, **kwargs)
   3060             return self.make_empty(axes or self.axes)
   3061         bm = self.__class__(result_blocks, axes or self.axes,
-> 3062                             do_integrity_check=do_integrity_check)
   3063         bm._consolidate_inplace()
   3064         return bm

/srv/paws/lib/python3.4/site-packages/pandas/core/internals.py in __init__(self, blocks, axes, do_integrity_check, fastpath)
   2748 
   2749         for block in blocks:
-> 2750             if block.is_sparse:
   2751                 if len(block.mgr_locs) != 1:
   2752                     raise AssertionError("Sparse block refers to multiple "

KeyboardInterrupt: