In [1]:
    
import csv
# Load the HUMAnN mapping table. Each tab-separated line becomes a list of
# strings; element 0 of the resulting list is the header line.
with open('hmp_humann/humann-mappings_unique.tsv', 'rU') as humann_handle:
    humann_map = list(csv.reader(humann_handle, delimiter='\t'))

# Load the unified 16S mapping table in the same shape.
with open('hmp_mappings/unified_map.tsv', 'rU') as unified_handle:
    map_16s = list(csv.reader(unified_handle, delimiter='\t'))
    
In [2]:
    
# Show the first row of the HUMAnN table (the column names looked up below).
print(humann_map[0])
    
    
In [3]:
    
# Show the first row of the 16S table (the column names looked up below).
print(map_16s[0])
    
    
In [4]:
    
# Seed the matched table with the first row of the 16S table so the output
# starts with the 16S column names.
map_16s_matched = [map_16s[0]]
    
In [5]:
    
# Column positions, within a HUMAnN row, of the fields used for matching.
# list.index raises ValueError if a column name is missing from the header.
humann_header = humann_map[0]
RANDSID = humann_header.index('RANDSID')
STSite = humann_header.index('STSite')
Parent_Specimen = humann_header.index('Parent_Specimen')
SRS = humann_header.index('SRS')
    
In [6]:
    
# Column positions, within a 16S mapping row, of the fields used for matching.
# These positions are only meaningful for rows of map_16s.
map_16s_header = map_16s[0]
RSID = map_16s_header.index('RSID')
HMPBodySubsiteHMPBodySite = map_16s_header.index('HMPBodySubsiteHMPBodySite')
PSN = map_16s_header.index('PSN')
NAP = map_16s_header.index('NAP')
    
In [7]:
    
# First matching pass: for every HUMAnN data row, collect each 16S mapping row
# whose RSID and HMPBodySubsiteHMPBodySite values equal that row's RANDSID and
# STSite values. Matches are appended in HUMAnN order, then 16S order; a 16S
# row is appended once for every HUMAnN row it matches, so repeats are possible.
for row in humann_map[1:]:
    this_RANDSID = row[RANDSID]
    this_STSite = row[STSite]
    # map_16s[0] is the header, which is already the first element of
    # map_16s_matched, so only the data rows are scanned.
    for map_row in map_16s[1:]:
        if map_row[RSID] == this_RANDSID and map_row[HMPBodySubsiteHMPBodySite] == this_STSite:
            map_16s_matched.append(map_row)
    
In [8]:
    
# Number of rows collected so far; the header row seeded earlier is part of the count.
len(map_16s_matched)
    
    Out[8]:
In [9]:
    
# The matched table almost certainly contains duplicate rows at this point.
# They are written out as-is and removed afterwards in the terminal.
# 'wb' is the mode the Python 2 csv module expects for output files.
with open('hmp_mappings/humann_full_map_raw.tsv','wb') as raw_out:
    csv.writer(raw_out, delimiter='\t').writerows(map_16s_matched)
    
In [10]:
    
# Start over: the first matching pass only compared RANDSID/STSite and left out
# the Parent_Specimen and SRS criteria. Reset the matched table to just the
# header row so the match can be rerun with all criteria.
map_16s_matched = [map_16s[0]]
    
In [11]:
    
# Second matching pass, using all three criteria. A 16S mapping row is collected
# for a HUMAnN data row when any one of these holds:
#   1. its RSID and HMPBodySubsiteHMPBodySite equal the HUMAnN RANDSID and STSite,
#   2. its PSN equals the HUMAnN Parent_Specimen,
#   3. its NAP equals the HUMAnN SRS.
# PSN and NAP are column positions taken from the 16S header, so they must be
# applied to map_row, never to the HUMAnN row.
# A 16S row is appended at most once per HUMAnN row, but it is appended again
# for every further HUMAnN row it matches, so the result can contain repeats.
# NOTE(review): blank or placeholder values present in both tables would
# compare equal under criteria 2 and 3 -- confirm these columns are populated.
for row in humann_map[1:]:
    this_RANDSID = row[RANDSID]
    this_STSite = row[STSite]
    this_Parent_Specimen = row[Parent_Specimen]
    this_SRS = row[SRS]
    # map_16s[0] is the header, which is already the first element of
    # map_16s_matched, so only the data rows are scanned.
    for map_row in map_16s[1:]:
        if ((map_row[RSID] == this_RANDSID
                and map_row[HMPBodySubsiteHMPBodySite] == this_STSite)
                or map_row[PSN] == this_Parent_Specimen
                or map_row[NAP] == this_SRS):
            map_16s_matched.append(map_row)
    
In [12]:
    
# Number of rows collected by the second pass; the header row is part of the count.
len(map_16s_matched)
    
    Out[12]:
In [13]:
    
# Number of rows in the full 16S table (header row included), for comparison
# with the matched count above.
len(map_16s)
    
    Out[13]:
In [14]:
    
# The matched table probably contains duplicate rows.
# They are written out as-is and removed afterwards with bash.
# 'wb' is the mode the Python 2 csv module expects for output files.
with open('hmp_mappings/humann_full_map_raw.tsv','wb') as raw_out:
    csv.writer(raw_out, delimiter='\t').writerows(map_16s_matched)
    
In [15]:
    
# Forgotten step: give every matched row (the header row included) a new
# leading column holding the row's first field cut off before the first '.V'.
# A first field without '.V' is copied unchanged.
# NOTE(review): '.V' presumably marks a suffix on the sample identifier -- confirm.
# This replaces the rows in place, so running the cell again adds another column.
for position, matched_row in enumerate(map_16s_matched):
    id_prefix = matched_row[0].partition('.V')[0]
    map_16s_matched[position] = [id_prefix] + matched_row
    
In [16]:
    
# Write the matched table, now carrying the extra leading column, over the raw
# output file. 'wb' is the mode the Python 2 csv module expects for output files.
with open('hmp_mappings/humann_full_map_raw.tsv','wb') as raw_out:
    csv.writer(raw_out, delimiter='\t').writerows(map_16s_matched)
    
In [ ]: