In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from db_utils import query_hive_ssh
import re
import copy
import time
import numpy as np
import os

In [5]:
def _run_shell(cmd):
    """Run `cmd` via os.system, print its exit status, and raise if it is non-zero."""
    status = os.system(cmd)
    print(status)
    if status != 0:
        raise RuntimeError("Command failed with status %s: %s" % (status, cmd))


def transfer_table(params, dry = False,
                   host = 'stat1002.eqiad.wmnet',
                   remote_root = '/home/ellery/detox/data/retention',
                   local_root = '/Users/ellerywulczyn/detox/data/retention'):
    """
    Copy a Hive table's files to the local machine and merge them into one TSV.

    Unless `dry` is true, the table directory is copied from HDFS onto `host`
    (under `remote_root`) and from there rsync'ed into `<local_root>/<name>/`.
    In both modes every file found in `<local_root>/<name>/` is then parsed as a
    header-less tab-separated file, the pieces are concatenated, the columns are
    named, and the result is written to `<local_root>/<name>.tsv`.

    Parameters
    ----------
    params : dict with the keys
        'db'      -- Hive database name (used to build the HDFS warehouse path)
        'table'   -- Hive table name
        'name'    -- directory / file stem used on `host` and locally
        'columns' -- list of column names for the merged frame
    dry : bool
        If true, skip all shell commands and only rebuild the TSV from whatever
        files are already in the local directory.
    host, remote_root, local_root : str
        Intermediate host and the base directories on that host and locally.
        The defaults are the values that used to be hard-coded.

    Returns
    -------
    None. The merged table is written to disk.

    Raises
    ------
    RuntimeError
        If one of the shell commands exits with a non-zero status. Failing here
        keeps a failed remote copy from wiping the existing local copy.
    ValueError
        If no file in the local directory could be parsed.
    """
    hdfs_path = '/user/hive/warehouse/%(db)s.db/%(table)s' % params
    stat2_path = '%s/%s' % (remote_root.rstrip('/'), params['name'])
    local_dir = '%s/%s' % (local_root.rstrip('/'), params['name'])
    # Trailing slash so rsync treats the destination as a directory.
    local_path = local_dir + '/'

    if not dry:
        # transfer from HDFS to the intermediate host (old copy removed first)
        _run_shell("ssh %s 'rm -rf %s'" % (host, stat2_path))
        _run_shell("ssh %s 'hadoop fs -copyToLocal %s %s '" % (host, hdfs_path, stat2_path))
        # transfer from the intermediate host to local; only reached when the
        # remote copy succeeded, so a failure upstream leaves local data intact
        _run_shell('rm -rf %s' % local_path)
        _run_shell('rsync -avz  %s:%s/* %s' % (host, stat2_path, local_path))

    dfs = []

    # Sorted so the row order of the merged file is reproducible across runs.
    for filename in sorted(os.listdir(local_path)):
        part_path = os.path.join(local_path, filename)
        try:
            dfs.append(pd.read_csv(part_path, sep = '\t', header = None))
        except Exception as err:
            # Best effort: an unreadable part is reported (with the reason) and skipped.
            print("Error reading: ", part_path, "(%s: %s)" % (type(err).__name__, err))

    if not dfs:
        raise ValueError("No readable part files found in %s" % local_path)

    df = pd.concat(dfs)
    df.columns = params['columns']

    df.to_csv(local_dir + '.tsv', sep ='\t', index = False)

In [6]:
# Daily per-user revision counts. dry=True: no transfer is run, the TSV is
# rebuilt from the part files that are already in the local directory.
params = dict(
    db='enwiki',
    table='daily_revision_counts',
    name='daily_revision_counts',
    columns=[
        'user_id',
        'day',
        'ns',
        'n_revisions',
        'n_deleted_revisions',
        'n_identity_reverted_revisions',
        'n_productive_revisions',
    ],
)
transfer_table(params, dry=True)

In [7]:
# First-edit day per user: the Hive table `user_first_edit` is stored locally
# under the name `user_start`. Full transfer (dry=False).
params = dict(
    db='enwiki',
    table='user_first_edit',
    name='user_start',
    columns=['user_id', 'first_edit_day'],
)
transfer_table(params, dry=False)


0
0
0

In [10]:
# Mapping from user id to user name. Full transfer (dry=False).
params = dict(
    db='enwiki',
    table='user_id_to_names',
    name='user_id_to_names',
    columns=['user_id', 'user_text'],
)
transfer_table(params, dry=False)


0
0
0
Error reading:  /Users/ellerywulczyn/detox/data/retention/user_id_to_names/000131_0
Error reading:  /Users/ellerywulczyn/detox/data/retention/user_id_to_names/000900_0

In [ ]:
# User warnings table. Full transfer (dry=False).
warning_columns = [
    'rev_id', 'page_id', 'page_title', 'rev_timestamp',
    'user_id', 'user_text', 'bot', 'admin',
    'ns', 'year', 'type', 'level',
]
params = dict(
    db='enwiki',
    table='user_warnings',
    name='user_warnings',
    columns=warning_columns,
)
transfer_table(params, dry=False)