In [4]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%load_ext autotime

import datetime
import multiprocessing as mp
import os
import re
import shutil
import tarfile
import time

import joblib
import numpy as np
import pandas as pd


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 9.2 ms

In [7]:
# Namespaces and years to export.
nss = ['article', 'user']
years = range(2001, 2016)

# Maps each input directory to the name used for its output directory, e.g.
# '../../data/samples/article/clean/talk_diff_2001' -> 'comments_article_2001'.
samples = {
    os.path.join('../../data/samples', namespace, 'clean', 'talk_diff_%d' % yr):
        'comments_%s_%d' % (namespace, yr)
    for namespace in nss
    for yr in years
}


time: 3.37 ms

In [8]:
# Export every sample directory listed in `samples` as a gzipped tarball.
#
# For each (input directory -> output name) pair:
#   1. find every file under the input directory whose name contains 'chunk',
#   2. read it as tab-separated UTF-8, rename the diff/timestamp columns and
#      keep only the columns in COLUMN_ORDER, in that order,
#   3. write the result into a staging directory named after `outd`,
#   4. pack the staging directory into '<outd>.tar.gz' and delete it.
#
# Every output path is anchored to FIGSHARE_DIR, which is resolved to an
# absolute path once, before the loop. The cell therefore never changes the
# kernel's working directory, so the relative input paths in `samples` resolve
# the same way on every iteration and on every re-run.
FIGSHARE_DIR = os.path.abspath('../../data/figshare')

COLUMN_RENAMES = {
    'clean_diff': 'comment',
    'diff': 'raw_comment',
    'rev_timestamp': 'timestamp',
}
COLUMN_ORDER = ['rev_id', 'comment', 'raw_comment', 'timestamp', 'page_id',
                'page_title', 'user_id', 'user_text', 'bot', 'admin']

for ind, outd in samples.items():
    print(ind)

    # Start from an empty staging directory so output from an earlier,
    # interrupted run is never mixed into the new archive.
    stage_dir = os.path.join(FIGSHARE_DIR, outd)
    shutil.rmtree(stage_dir, ignore_errors=True)
    os.makedirs(stage_dir)

    # Keep the directory each chunk was found in: os.walk descends into
    # subdirectories, so joining the bare filename to `ind` would be wrong
    # for any chunk that is not directly inside `ind`.
    chunk_paths = [
        os.path.join(root, filename)
        for root, _dirnames, filenames in os.walk(ind)
        for filename in filenames
        if 'chunk' in filename
    ]

    for chunk_path in chunk_paths:
        chunk_raw = pd.read_csv(chunk_path, sep='\t', encoding='utf-8')
        # Selecting COLUMN_ORDER raises KeyError if an expected column is
        # missing, which surfaces schema drift instead of exporting bad data.
        chunk_out = chunk_raw.rename(columns=COLUMN_RENAMES)[COLUMN_ORDER]
        chunk_out.to_csv(
            os.path.join(stage_dir, os.path.basename(chunk_path)),
            sep='\t',
            index=False,
            encoding='utf-8',  # write with the same encoding the input was read with
        )

    # arcname keeps the archive layout relative ('<outd>/<chunk file>')
    # regardless of where the staging directory lives on disk.
    archive_path = os.path.join(FIGSHARE_DIR, outd + '.tar.gz')
    with tarfile.open(archive_path, 'w:gz') as archive:
        archive.add(stage_dir, arcname=outd)

    # The staging copy is only needed to build the archive.
    shutil.rmtree(stage_dir)


../../data/samples/article/clean/talk_diff_2009
../../data/samples/article/clean/talk_diff_2004
../../data/samples/user/clean/talk_diff_2003
../../data/samples/user/clean/talk_diff_2004
../../data/samples/user/clean/talk_diff_2013
../../data/samples/article/clean/talk_diff_2015
../../data/samples/user/clean/talk_diff_2011
../../data/samples/article/clean/talk_diff_2014
../../data/samples/user/clean/talk_diff_2009
../../data/samples/user/clean/talk_diff_2015
../../data/samples/user/clean/talk_diff_2006
../../data/samples/article/clean/talk_diff_2012
../../data/samples/article/clean/talk_diff_2013
../../data/samples/article/clean/talk_diff_2011
../../data/samples/user/clean/talk_diff_2001
../../data/samples/user/clean/talk_diff_2012
../../data/samples/article/clean/talk_diff_2003
../../data/samples/article/clean/talk_diff_2007
../../data/samples/article/clean/talk_diff_2002
../../data/samples/user/clean/talk_diff_2005
../../data/samples/user/clean/talk_diff_2014
../../data/samples/article/clean/talk_diff_2006
../../data/samples/user/clean/talk_diff_2007
../../data/samples/article/clean/talk_diff_2001
../../data/samples/article/clean/talk_diff_2010
../../data/samples/user/clean/talk_diff_2010
../../data/samples/user/clean/talk_diff_2008
../../data/samples/user/clean/talk_diff_2002
../../data/samples/article/clean/talk_diff_2005
../../data/samples/article/clean/talk_diff_2008
time: 1h 43min 12s

In [ ]:


In [ ]: