In [1]:
import pandas as pd
import numpy as np
# Scrape file sizes from the FTP server via wget's .listing files:
# https://superuser.com/questions/423499/wget-only-getting-listing-file-in-every-sub-dir
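In [ ]:
# Hypothetical reconstruction of the scrape that produced results.txt below:
# walk a wget FTP mirror and pull sizes out of the .listing files wget leaves
# behind (the trick from the superuser link above). The mirror root, the
# UNIX-style LIST layout, and the exact output columns are all assumptions.
import os
entries = []
for dirpath, _, filenames in os.walk("."):  # assumed to run inside the mirror
    if ".listing" not in filenames:
        continue
    with open(os.path.join(dirpath, ".listing")) as fh:
        for raw in fh:
            fields = raw.split()
            # plain files only: "-rw-r--r-- 1 ftp ftp <bytes> <date> <name>"
            if len(fields) < 9 or not fields[0].startswith("-"):
                continue
            entries.append("%s\t%s\t%s" %
                           (fields[0], fields[4], os.path.join(dirpath, fields[8])))
with open("../data/hmp/results.txt", "w") as out:
    out.write("\n".join(entries) + "\n")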
In [2]:
# HMP 16S V3-V5 sample map, and the 690-sample HMASM shotgun list used below
# as the passed-QC set
v35_df = pd.read_csv("../data/hmp/ppAll_V35_map.txt", sep="\t", index_col=False)
hmasm_df = pd.read_csv("../data/hmp/HMASM-690.csv")
In [3]:
hmasm_df.head()
Out[3]:
In [4]:
v35_df.head()
Out[4]:
In [5]:
hmasm_set = set(hmasm_df["SRS ID"])
In [6]:
# Keep only mapped samples whose SRS ID also has HMASM shotgun data
v35_df_filtered = v35_df.loc[v35_df['SRS_SampleID'].isin(hmasm_set)]
In [7]:
v35_df_filtered.shape
Out[7]:
In [8]:
# Drop rows with a missing PSN (NaN is not finite)
v35_df_filtered = v35_df_filtered[np.isfinite(v35_df_filtered['PSN'])]
# v35_df_filtered = v35_df_filtered[np.isfinite(v35_df_filtered['SN'])]
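In [ ]:
# Equivalent, more idiomatic filter (a sketch; assumes PSN is a float column,
# which np.isfinite above already requires): drop rows with a missing PSN.
# v35_df_filtered = v35_df_filtered.dropna(subset=['PSN'])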
In [9]:
v35_df_filtered.shape
Out[9]:
In [10]:
# Expand back out to every v35 record sharing a PSN with the QC-passed samples
v35_df_filtered_psn = v35_df.loc[v35_df['PSN'].isin(v35_df_filtered['PSN'])]
In [11]:
v35_df_filtered_psn.shape
Out[11]:
In [12]:
inf = "../data/hmp/results.txt"
with open(inf) as reader:
    # File sizes are expressed in bytes; convert to GiB (1024**3 bytes)
    lines = [("ftp://" + line.split("\t")[-1][2:].rstrip(),
              int(line.split("\t")[1]) / 1024**3)
             for line in reader]
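In [ ]:
# Worked example of the parse above on one hypothetical results.txt line
# (the hostname and path are made up; real lines come from the FTP mirror):
example = "-rw-r--r--\t1073741824\t./ftp.example.org/HMASM/stool/SRS011084.tar.bz2"
("ftp://" + example.split("\t")[-1][2:].rstrip(),
 int(example.split("\t")[1]) / 1024**3)
# -> ('ftp://ftp.example.org/HMASM/stool/SRS011084.tar.bz2', 1.0)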
In [13]:
rows = []
for line, size in lines:
    srs = line.split("/")[-1].split(".")[0]  # SRS accession from the filename
    group = line.split("/")[-2]              # body site from the parent directory
    rows.append([line, srs, group, size, srs in hmasm_set])
files_tidy = pd.DataFrame(rows, columns=["ftp", "srs", "group", "file_size_gb", "passed_qc"])
files_tidy.to_csv("../data/hmp/hmp_shotgun_files.txt", sep="\t")
In [14]:
# Sample 30 shotgun files per body site; sample with replacement only when a
# group has fewer than 30 QC-passed files
files_summarized = files_tidy[files_tidy['passed_qc']].groupby("group").apply(
    lambda x: x.sample(30, replace=x.shape[0] < 30))
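In [ ]:
# Toy check of the sampling rule above (hypothetical data): a 5-row group is
# upsampled with replacement to 30 rows, a 40-row group yields 30 distinct rows.
toy = pd.DataFrame({"group": ["a"] * 5 + ["b"] * 40,
                    "srs": ["s%d" % i for i in range(45)]})
sampled = toy.groupby("group").apply(lambda x: x.sample(30, replace=x.shape[0] < 30))
sampled.groupby("group")["srs"].nunique()  # a: at most 5 unique, b: 30 unique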
In [32]:
filtered_files = files_tidy[files_tidy['srs'].isin(set(files_summarized['srs']))]
#groups_to_include = ['stool', 'subgingival_plaque', 'supragingival_plaque', 'left_retroauricular_crease', 'right_retroauricular_crease', 'tongue_dorsum']
# groups_to_include = ['left_retroauricular_crease', 'tongue_dorsum']
# filtered_files = filtered_files[filtered_files['group'].isin(groups_to_include)]
In [33]:
filtered_files['ftp'].head()
# One FTP URL per line, no index or header, for the xargs+wget step below
filtered_files['ftp'].to_csv('../data/hmp/ftpfilepaths_2', index=False, header=False)
In [28]:
# Download every file in ftpfilepaths with 16 parallel wget workers (retrying
# flaky connections), then unpack all fetched .bz2 tarballs, also 16 at a time:
#!cat ftpfilepaths | xargs -n 1 -P 16 wget -q --retry-connrefused --waitretry=1 --read-timeout=20 --timeout=15 -t 20; printf '%s\0' **/*.bz2 | xargs -r0 -n 1 -P 16 tar xf
In [36]:
# Per-body-site totals (file_size_gb in GiB; passed_qc sums to a True count)
filtered_files.groupby('group').sum(numeric_only=True)
Out[36]:
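In [ ]:
# Overall download footprint in GiB across every selected file, as a sanity
# check before launching the wget pipeline above.
filtered_files['file_size_gb'].sum()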