In [1]:
import pandas as pd
import numpy as np

# Scrape file sizes from the FTP listings; approach based on:
# https://superuser.com/questions/423499/wget-only-getting-listing-file-in-every-sub-dir
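
The file listing used later in this notebook (../data/hmp/results.txt) is not produced here. One possible way to collect it, following the wget FTP-listing trick linked above, is sketched below; the host and path are placeholders, and the step that turns the listings into results.txt is not shown.

In [ ]:
# Hedged sketch: walk the FTP tree without downloading the data, keeping the
# per-directory .listing files (which record file names and sizes).
# !wget -r -np --spider --no-remove-listing ftp://<hmp-ftp-host>/<path-to-shotgun-data>/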

In [2]:
v35_df = pd.read_csv("../data/hmp/ppAll_V35_map.txt", sep="\t", index_col=False)

hmasm_df = pd.read_csv("../data/hmp/HMASM-690.csv")

In [3]:
hmasm_df.head()


Out[3]:
SRS ID Body Site
0 SRS011061 stool
1 SRS011090 buccal_mucosa
2 SRS011098 supragingival_plaque
3 SRS011126 supragingival_plaque
4 SRS011132 anterior_nares

In [4]:
v35_df.head()


Out[4]:
SampleID RSID PSN SN NAP ExperimentAccession RunID SRS_SampleID Region BarcodeSequence LinkerPrimerSequence Sex HMPBodySubsite HMPBodySite VisitNo
0 SRS012191.SRX020679.V35 158013734.0 700013549.0 NaN 700013596.0 SRX020679 SRR048044 SRS012191 V35 TCAGCGCAAC CCGTCAATTCMTTTRAGT female Stool Gastrointestinal_tract
1 SRS011157.SRX020666.V35 158822939.0 700014954.0 NaN 700014956.0 SRX020666 SRR044955 SRS011157 V35 TCAGTCACAC CCGTCAATTCMTTTRAGT male Stool Gastrointestinal_tract
2 SRS011157.SRX020669.V35 158822939.0 700014954.0 NaN 700014956.0 SRX020669 SRR045333 SRS011157 V35 TCAGTCACAC CCGTCAATTCMTTTRAGT male Stool Gastrointestinal_tract
3 SRS011159.SRX020660.V35 158822939.0 700014965.0 NaN 700014968.0 SRX020660 SRR044847 SRS011159 V35 TCAGCGACTC CCGTCAATTCMTTTRAGT male Saliva Oral
4 SRS011159.SRX020668.V35 158822939.0 700014965.0 NaN 700014968.0 SRX020668 SRR045072 SRS011159 V35 TCAGCGACTC CCGTCAATTCMTTTRAGT male Saliva Oral

In [5]:
hmasm_set = set(hmasm_df["SRS ID"])

In [6]:
v35_df_filtered = v35_df.loc[v35_df['SRS_SampleID'].isin(hmasm_set)]
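
A quick cross-check (not part of the original run) of how many HMASM samples actually appear in the V35 mapping file:

In [ ]:
# Distinct SRS IDs in the HMASM table vs. distinct matches in the V35 map.
len(hmasm_set), v35_df.loc[v35_df['SRS_SampleID'].isin(hmasm_set), 'SRS_SampleID'].nunique()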

In [7]:
v35_df_filtered.shape


Out[7]:
(288, 14)

In [8]:
v35_df_filtered = v35_df_filtered[np.isfinite(v35_df_filtered['PSN'])]
# v35_df_filtered = v35_df_filtered[np.isfinite(v35_df_filtered['SN'])]
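
An equivalent and arguably more idiomatic way to drop the rows with missing PSN values, assuming the only non-finite entries are NaN, is dropna:

In [ ]:
# Same effect as the np.isfinite filter above when NaN is the only non-finite value.
# v35_df_filtered = v35_df_filtered.dropna(subset=['PSN'])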

In [9]:
v35_df_filtered.shape


Out[9]:
(269, 14)

In [10]:
v35_df_filtered_psn = v35_df.loc[v35_df['PSN'].isin(v35_df_filtered['PSN'])]

In [11]:
v35_df_filtered_psn.shape


Out[11]:
(269, 14)
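
The PSN-based selection has the same shape as the SRS-filtered frame, which suggests no extra runs were pulled in; a direct check (not run originally) would be:

In [ ]:
# An empty symmetric difference means both selections contain exactly the same rows.
v35_df_filtered.index.symmetric_difference(v35_df_filtered_psn.index)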

In [12]:
inf = "../data/hmp/results.txt"
with open(inf) as reader:
    # File sizes are given in bytes; 1073741824 bytes (1024**3) = 1 GiB.
    # Each tab-separated line carries the size in its second field and a relative
    # "./..." path in its last field; strip the leading "./" and prepend "ftp://"
    # to turn it into a download URL.
    lines = [("ftp://" + line.split("\t")[-1][2:].rstrip(),
              int(line.split("\t")[1]) / 1073741824)
             for line in reader]
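
A quick peek at the parsed tuples (not run in the original notebook) is an easy way to confirm those format assumptions:

In [ ]:
# First few (url, size_in_GiB) pairs and the total volume they represent.
lines[:3], sum(size for _, size in lines)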

In [13]:
rows = []
for line, size in lines:
    srs = line.split("/")[-1].split(".")[0]
    group = line.split("/")[-2]
    rows.append([line, srs, group, size, srs in hmasm_set])
files_tidy = pd.DataFrame(rows, columns=["ftp", "srs", "group", "file_size_gb", "passed_qc"])
files_tidy.to_csv("../data/hmp/hmp_shotgun_files.txt", sep="\t")
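
Each URL is assumed to end in .../<body_site>/<SRS id>.<extension>, so the body site is the second-to-last path component and the SRS ID is the file name up to its first dot. The path below is made up, purely to illustrate the split logic:

In [ ]:
# Hypothetical example URL showing what the parsing above extracts.
example = "ftp://host/some/dir/stool/SRS011061.tar.bz2"
example.split("/")[-1].split(".")[0], example.split("/")[-2]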

In [14]:
# Draw 30 files per body site from those that passed QC; sites with 30 or fewer
# files are sampled with replacement, so they can contribute duplicate rows
# (these collapse again below when filtering on unique SRS IDs).
files_summarized = files_tidy[files_tidy['passed_qc']].groupby("group").apply(lambda x: x.sample(30, replace=x.shape[0] <= 30))
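
An alternative that avoids duplicates altogether, simply capping each body site at its available number of files, could look like this (not what was run above):

In [ ]:
# Take at most 30 files per body site, without replacement.
# files_summarized = (files_tidy[files_tidy['passed_qc']]
#                     .groupby("group")
#                     .apply(lambda x: x.sample(min(len(x), 30))))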

In [32]:
filtered_files = files_tidy[files_tidy['srs'].isin(set(files_summarized['srs']))]
#groups_to_include = ['stool', 'subgingival_plaque', 'supragingival_plaque', 'left_retroauricular_crease', 'right_retroauricular_crease', 'tongue_dorsum']
# groups_to_include = ['left_retroauricular_crease', 'tongue_dorsum']
# filtered_files = filtered_files[filtered_files['group'].isin(groups_to_include)]

In [33]:
filtered_files['ftp'].head()
filtered_files['ftp'].to_csv('../data/hmp/ftpfilepaths_2', index=False)

In [28]:
#!cat ftpfilepaths | xargs -n 1 -P 16 wget -q --retry-connrefused --waitretry=1 --read-timeout=20 --timeout=15 -t 20; printf '%s\0' **/*.bz2 | xargs -r0 -n 1 -P 16 tar xf
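# The command above fans the FTP paths out to 16 parallel wget processes (quiet,
# retrying on refused connections, with read/connect timeouts and up to 20 tries
# per file), then unpacks the downloaded .bz2 tarballs, also 16 at a time.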

In [36]:
filtered_files.groupby('group').sum()


Out[36]:
file_size_gb passed_qc
group
anterior_nares 2.374736 30.0
attached_keratinized_gingiva 21.790731 6.0
buccal_mucosa 20.771632 30.0
left_retroauricular_crease 13.520481 9.0
mid_vagina 0.111247 2.0
palatine_tonsils 8.601941 6.0
posterior_fornix 4.202533 30.0
right_retroauricular_crease 23.297344 15.0
saliva 1.608737 3.0
stool 189.343498 30.0
subgingival_plaque 9.511325 7.0
supragingival_plaque 99.006217 30.0
throat 8.426970 7.0
tongue_dorsum 154.930194 30.0
vaginal_introitus 0.344422 3.0
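
Since passed_qc is boolean, its per-group sum above is simply the number of selected files per body site. A more explicit summary (a sketch; named aggregation requires pandas >= 0.25) could be:

In [ ]:
# Per-site totals: combined size in GiB and number of distinct samples.
filtered_files.groupby('group').agg(total_gb=('file_size_gb', 'sum'),
                                    n_samples=('srs', 'nunique'))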