In [1]:
import csv
with open('hmp_humann/humann-mappings_unique.tsv', 'rU') as humann_map_f:
    r = csv.reader(humann_map_f, delimiter='\t')
    humann_map = [row for row in r]
with open('hmp_mappings/unified_map.tsv', 'rU') as map_16s_f:
    r = csv.reader(map_16s_f, delimiter='\t')
    map_16s = [row for row in r]

In [2]:
print humann_map[0]


['RANDSID', 'GENDER', 'STSite', 'Parent_Specimen', 'SRS']

In [3]:
print map_16s[0]


['SampleID', 'RSID', 'PSN', 'SN', 'NAP', 'ExperimentAccession', 'RunID', 'SRS_SampleID', 'Region', 'BarcodeSequence', 'LinkerPrimerSequence', 'Sex', 'HMPBodySubsiteHMPBodySite', 'VisitNo']
  • Three options
    • humann_map[RANDSID,STSite] == map_16s[RSID,HMPBodySubsiteHMPBodySite]
    • humann_map[Parent_Specimen] == map_16s[PSN]
    • humann_map[SRS] == map_16s[NAP]

In [4]:
map_16s_matched = []
map_16s_matched.append(map_16s[0])

In [5]:
RANDSID = humann_map[0].index('RANDSID')
STSite = humann_map[0].index('STSite')
Parent_Specimen = humann_map[0].index('Parent_Specimen')
SRS = humann_map[0].index('SRS')

In [6]:
RSID = map_16s[0].index('RSID')
HMPBodySubsiteHMPBodySite = map_16s[0].index('HMPBodySubsiteHMPBodySite')
PSN = map_16s[0].index('PSN')
NAP = map_16s[0].index('NAP')

In [7]:
for row in humann_map[1:]:
    this_RANDSID = row[RANDSID]
    this_STSite = row[STSite]
    this_set = []
    for map_row in map_16s:
        if map_row[RSID] == this_RANDSID and map_row[HMPBodySubsiteHMPBodySite] == this_STSite:
            map_16s_matched.append(map_row)

In [8]:
len(map_16s_matched)


Out[8]:
2120

In [9]:
# there are no doubt some duplicates in there
# will deal with that in terminal
with open('hmp_mappings/humann_full_map_raw.tsv','wb') as f:
    w = csv.writer(f,delimiter='\t')
    for row in map_16s_matched:
        w.writerow(row)

In [10]:
# oops: forgot the other options
map_16s_matched = [map_16s[0]]

In [11]:
for row in humann_map[1:]:
    this_RANDSID = row[RANDSID]
    this_STSite = row[STSite]
    this_Parent_Specimen = row[Parent_Specimen]
    this_SRS = row[SRS]
    this_set = []
    for map_row in map_16s:
        if map_row[RSID] == this_RANDSID and map_row[HMPBodySubsiteHMPBodySite] == this_STSite:
            map_16s_matched.append(map_row)
        elif row[PSN] == this_Parent_Specimen:
            map_16s_matched.append(map_row)
        elif row[NAP] == this_SRS:
            map_16s_matched.append(map_row)

In [12]:
len(map_16s_matched)


Out[12]:
7663993

In [13]:
len(map_16s)


Out[13]:
11172

In [14]:
# Looks like there are likely duplicates
# Will deal with them through bash
with open('hmp_mappings/humann_full_map_raw.tsv','wb') as f:
    w = csv.writer(f,delimiter='\t')
    for row in map_16s_matched:
        w.writerow(row)

In [15]:
# double oops forgot this
for i in range(len(map_16s_matched)):
    row = map_16s_matched[i]
    this_end = row[0].find('.V')
    if this_end != -1:
        this_start = row[0][0:this_end]
    else:
        this_start = row[0]
    row = [this_start] + row
    map_16s_matched[i] = row

In [16]:
with open('hmp_mappings/humann_full_map_raw.tsv','wb') as f:
    w = csv.writer(f,delimiter='\t')
    for row in map_16s_matched:
        w.writerow(row)

In [ ]: