notebook.community

Edit and run



In [1]:

    
import csv
with open('hmp_humann/humann-mappings_unique.tsv', 'rU') as humann_map_f:
    r = csv.reader(humann_map_f, delimiter='\t')
    humann_map = [row for row in r]
with open('hmp_mappings/unified_map.tsv', 'rU') as map_16s_f:
    r = csv.reader(map_16s_f, delimiter='\t')
    map_16s = [row for row in r]



In [2]:

    
print humann_map[0]









    



['RANDSID', 'GENDER', 'STSite', 'Parent_Specimen', 'SRS']



In [3]:

    
print map_16s[0]









    



['SampleID', 'RSID', 'PSN', 'SN', 'NAP', 'ExperimentAccession', 'RunID', 'SRS_SampleID', 'Region', 'BarcodeSequence', 'LinkerPrimerSequence', 'Sex', 'HMPBodySubsiteHMPBodySite', 'VisitNo']

Three options
- humann_map[RANDSID,STSite] == map_16s[RSID,HMPBodySubsiteHMPBodySite]
- humann_map[Parent_Specimen] == map_16s[PSN]
- humann_map[SRS] == map_16s[NAP]



In [4]:

    
map_16s_matched = []
map_16s_matched.append(map_16s[0])



In [5]:

    
RANDSID = humann_map[0].index('RANDSID')
STSite = humann_map[0].index('STSite')
Parent_Specimen = humann_map[0].index('Parent_Specimen')
SRS = humann_map[0].index('SRS')



In [6]:

    
RSID = map_16s[0].index('RSID')
HMPBodySubsiteHMPBodySite = map_16s[0].index('HMPBodySubsiteHMPBodySite')
PSN = map_16s[0].index('PSN')
NAP = map_16s[0].index('NAP')



In [7]:

    
for row in humann_map[1:]:
    this_RANDSID = row[RANDSID]
    this_STSite = row[STSite]
    this_set = []
    for map_row in map_16s:
        if map_row[RSID] == this_RANDSID and map_row[HMPBodySubsiteHMPBodySite] == this_STSite:
            map_16s_matched.append(map_row)



In [8]:

    
len(map_16s_matched)









    Out[8]:





2120



In [9]:

    
# there are no doubt some duplicates in there
# will deal with that in terminal
with open('hmp_mappings/humann_full_map_raw.tsv','wb') as f:
    w = csv.writer(f,delimiter='\t')
    for row in map_16s_matched:
        w.writerow(row)



In [10]:

    
# oops: forgot the other options
map_16s_matched = [map_16s[0]]



In [11]:

    
for row in humann_map[1:]:
    this_RANDSID = row[RANDSID]
    this_STSite = row[STSite]
    this_Parent_Specimen = row[Parent_Specimen]
    this_SRS = row[SRS]
    this_set = []
    for map_row in map_16s:
        if map_row[RSID] == this_RANDSID and map_row[HMPBodySubsiteHMPBodySite] == this_STSite:
            map_16s_matched.append(map_row)
        elif row[PSN] == this_Parent_Specimen:
            map_16s_matched.append(map_row)
        elif row[NAP] == this_SRS:
            map_16s_matched.append(map_row)



In [12]:

    
len(map_16s_matched)









    Out[12]:





7663993



In [13]:

    
len(map_16s)









    Out[13]:





11172



In [14]:

    
# Looks like there are likely duplicates
# Will deal with them through bash
with open('hmp_mappings/humann_full_map_raw.tsv','wb') as f:
    w = csv.writer(f,delimiter='\t')
    for row in map_16s_matched:
        w.writerow(row)



In [15]:

    
# double oops forgot this
for i in range(len(map_16s_matched)):
    row = map_16s_matched[i]
    this_end = row[0].find('.V')
    if this_end != -1:
        this_start = row[0][0:this_end]
    else:
        this_start = row[0]
    row = [this_start] + row
    map_16s_matched[i] = row



In [16]:

    
with open('hmp_mappings/humann_full_map_raw.tsv','wb') as f:
    w = csv.writer(f,delimiter='\t')
    for row in map_16s_matched:
        w.writerow(row)



In [ ]: