In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from db_utils import query_hive_ssh
import re
import copy
import time
import numpy as np
import os
In [5]:
def transfer_table(params, dry = False, host = 'stat1002.eqiad.wmnet',
                   remote_root = '/home/ellery/detox/data/retention',
                   local_root = '/Users/ellerywulczyn/detox/data/retention'):
    """
    Copy a Hive table to the local machine and merge its part files into one TSV.

    Steps (the first two are skipped when `dry` is true):
      1. On `host`, delete any previous copy and `hadoop fs -copyToLocal` the
         table's warehouse directory to `<remote_root>/<name>`.
      2. Delete the local `<local_root>/<name>/` directory and rsync the remote
         files into it.
      3. Read every file in `<local_root>/<name>/` as a header-less,
         tab-separated file, concatenate them, apply `params['columns']` as the
         header and write `<local_root>/<name>.tsv`.

    Parameters
    ----------
    params : dict
        Must provide 'db' and 'table' (Hive location), 'name' (directory / file
        stem used on both machines) and 'columns' (list of output column names).
    dry : bool
        If true, run only step 3 on whatever part files are already on disk.
    host : str
        Machine reached over ssh/rsync that has HDFS access.
    remote_root : str
        Directory on `host` under which the table is staged.
    local_root : str
        Local directory that holds the part-file directory and the merged TSV.

    Returns
    -------
    None. The result is the TSV written to disk.

    Raises
    ------
    ValueError
        If no file in the local directory could be parsed, or if the number of
        names in `params['columns']` does not match the data.
    """
    hdfs_path = '/user/hive/warehouse/%(db)s.db/%(table)s' % params
    name = '%(name)s' % params
    stat2_path = '%s/%s' % (remote_root.rstrip('/'), name)
    local_dir = os.path.join(local_root, name)
    # The trailing slash makes rsync treat the destination as a directory.
    local_path = local_dir + '/'

    if not dry:
        commands = [
            # transfer from HDFS to the remote staging directory
            "ssh %s 'rm -rf %s'" % (host, stat2_path),
            "ssh %s 'hadoop fs -copyToLocal %s %s '" % (host, hdfs_path, stat2_path),
            # transfer from the remote staging directory to the local machine
            'rm -rf %s' % local_path,
            'rsync -avz %s:%s/* %s' % (host, stat2_path, local_path),
        ]
        for cmd in commands:
            # Show each exit status so a failed step is visible in the cell output.
            print(os.system(cmd))

    dfs = []
    # Sorted so the row order of the merged file is reproducible across runs.
    for filename in sorted(os.listdir(local_path)):
        part_path = os.path.join(local_path, filename)
        try:
            dfs.append(pd.read_csv(part_path, sep = '\t', header = None))
        except Exception as exc:
            # Best effort: unreadable entries (e.g. empty files) are reported and
            # skipped, but KeyboardInterrupt / SystemExit are no longer swallowed.
            print("Error reading: ", part_path, "(%s)" % exc)

    if not dfs:
        # pd.concat([]) would fail with an opaque "No objects to concatenate".
        raise ValueError('No readable part files found in %s' % local_path)

    df = pd.concat(dfs, ignore_index = True)
    df.columns = params['columns']
    df.to_csv(local_dir + '.tsv', sep = '\t', index = False)
In [6]:
# enwiki.daily_revision_counts: dry run, so no remote copy is triggered; the
# combined TSV is rebuilt from the part files already in the local directory.
params = dict(
    db='enwiki',
    table='daily_revision_counts',
    name='daily_revision_counts',
    columns=[
        'user_id',
        'day',
        'ns',
        'n_revisions',
        'n_deleted_revisions',
        'n_identity_reverted_revisions',
        'n_productive_revisions',
    ],
)
transfer_table(params, dry=True)
In [7]:
# enwiki.user_first_edit: full transfer, stored locally under the name 'user_start'.
params = dict(
    db='enwiki',
    table='user_first_edit',
    name='user_start',
    columns=['user_id', 'first_edit_day'],
)
transfer_table(params, dry=False)
In [10]:
# enwiki.user_id_to_names: full transfer of the two-column table
# (user_id, user_text).
params = dict(
    db='enwiki',
    table='user_id_to_names',
    name='user_id_to_names',
    columns=['user_id', 'user_text'],
)
transfer_table(params, dry=False)
In [ ]:
# enwiki.user_warnings: full transfer; the column names are listed in the order
# in which they are applied to the header-less part files.
user_warnings_columns = [
    'rev_id', 'page_id', 'page_title', 'rev_timestamp',
    'user_id', 'user_text', 'bot', 'admin',
    'ns', 'year', 'type', 'level',
]
params = dict(
    db='enwiki',
    table='user_warnings',
    name='user_warnings',
    columns=user_warnings_columns,
)
transfer_table(params, dry=False)