In [1]:
import csv
with open('hmp_mappings/unified_map.tsv', 'rU') as map_16s_f:
    r = csv.reader(map_16s_f, delimiter='\t')
    map_16s = [row for row in r]

In [3]:
with open('hmp_metaphlan/HMP.ab.txt','rU') as metaphlan_f:
    r = csv.reader(metaphlan_f, delimiter='\t')
    metaphlan_map = r.next()

In [4]:
map_16s[0]


Out[4]:
['SampleID',
 'RSID',
 'PSN',
 'SN',
 'NAP',
 'ExperimentAccession',
 'RunID',
 'SRS_SampleID',
 'Region',
 'BarcodeSequence',
 'LinkerPrimerSequence',
 'Sex',
 'HMPBodySubsiteHMPBodySite',
 'VisitNo']

In [5]:
map_16s[1]


Out[5]:
['SRS012191.SRX020675.V13',
 '158013734',
 '700013549',
 '',
 '700013596',
 'SRX020675',
 'SRR047747',
 'SRS012191',
 'V13',
 'TCAGCGCAAC',
 'ATTACCGCGGCTGCTGG',
 'female',
 'Stool',
 'Gastrointestinal_tract',
 '1']

In [6]:
metaphlan_map[0:5]


Out[6]:
['sid', 'SRS043001', 'SRS015989', 'SRS021477', 'SRS022621']

In [7]:
map_16s_matched = [map_16s[0]]

In [8]:
SRS_SampleID = map_16s[0].index('SRS_SampleID')

In [9]:
for row in map_16s[1:]:
    if row[SRS_SampleID] in metaphlan_map[1:]:
        map_16s_matched.append(row)

In [10]:
len(map_16s_matched)


Out[10]:
442

In [11]:
len(metaphlan_map)


Out[11]:
691

In [12]:
# so there are some that aren't matched
# let's see if we can recover them from the humann files
with open('hmp_humann/04b-mpm-cop-nul-nve-nul-nve.txt','rU') as f:
    r = csv.reader(f, delimiter='\t')
    humann_f = [row for row in r]

In [15]:
humann_f[0][0:5]


Out[15]:
['ID',
 'NAME',
 'SRS011061_vs_KEGG_v54-mpm-cop-nul-nve-nul-nve',
 'SRS011090_vs_KEGG_v54-mpm-cop-nul-nve-nul-nve',
 'SRS011098_vs_KEGG_v54-mpm-cop-nul-nve-nul-nve']

In [17]:
for i in range(len(humann_f[0])):
    humann_f[0][i] = humann_f[0][i].replace('_vs_KEGG_v54-mpm-cop-nul-nve-nul-nve','')

In [18]:
humann_f[0][0:5]


Out[18]:
['ID', 'NAME', 'SRS011061', 'SRS011090', 'SRS011098']

In [20]:
i = 0
for item in humann_f[0]:
    if item in metaphlan_map:
        i += 1
print i


674

In [21]:
# okay, that looks good
RSID = map_16s[0].index('RSID')
HMPBodySubsiteHMPBodySite = map_16s[0].index('HMPBodySubsiteHMPBodySite')
PSN = map_16s[0].index('PSN')
NAP = map_16s[0].index('NAP')
for item in metaphlan_map:
    try:
        ind_humann = humann_f[0].index(item)
        this_RANDSID = humann_f[1][ind_humann]
        this_STSite = humann_f[5][ind_humann]
        this_Parent_Specimen = humann_f[6][ind_humann]
        this_SRS = humann_f[9][ind_humann]
        for map_row in map_16s[1:]:
            if map_row[RSID] == this_RANDSID and map_row[HMPBodySubsiteHMPBodySite] == this_STSite:
                map_16s_matched.append(row)
            elif row[PSN] == this_Parent_Specimen:
                map_16s_matched.append(row)
            elif row[NAP] == this_SRS:
                map_16s_matched.append(row)
    except:
        continue

In [22]:
len(map_16s_matched)


Out[22]:
2537

In [25]:
map_16s_matched[1]


Out[25]:
['SRS011271.SRX020659.V13',
 '158802708',
 '700015245',
 '',
 '700015250',
 'SRX020659',
 'SRR045051',
 'SRS011271',
 'V13',
 'TCAGCACGC',
 'ATTACCGCGGCTGCTGG',
 'male',
 'Stool',
 'Gastrointestinal_tract',
 '1']

In [26]:
for i in range(len(map_16s_matched)):
    row = map_16s_matched[i]
    this_end = row[0].find('.V')
    if this_end != -1:
        this_start = row[0][0:this_end]
    else:
        this_start = row[0]
    row = [this_start] + row
    map_16s_matched[i] = row

In [27]:
map_16s_matched[1]


Out[27]:
['SRS011271.SRX020659',
 'SRS011271.SRX020659.V13',
 '158802708',
 '700015245',
 '',
 '700015250',
 'SRX020659',
 'SRR045051',
 'SRS011271',
 'V13',
 'TCAGCACGC',
 'ATTACCGCGGCTGCTGG',
 'male',
 'Stool',
 'Gastrointestinal_tract',
 '1']

In [28]:
map_16s_matched[0][0] = 'Shortened_SampleID'

In [29]:
with open('MetaphlanMapping.tsv','wb') as f:
    w = csv.writer(f,delimiter='\t')
    for row in map_16s_matched:
        w.writerow(row)

In [ ]: