In [1]:
import csv
# Load the unified 16S mapping table into memory: one list of column strings
# per line of the tab-separated file.
with open('hmp_mappings/unified_map.tsv', 'rU') as map_handle:
    map_16s = list(csv.reader(map_handle, delimiter='\t'))
In [3]:
# Only the first line of the MetaPhlAn abundance table is read; its fields are
# kept as a list in metaphlan_map.
with open('hmp_metaphlan/HMP.ab.txt','rU') as metaphlan_handle:
    metaphlan_map = next(csv.reader(metaphlan_handle, delimiter='\t'))
In [4]:
# Display the first row of the 16S mapping table; the other cells look up
# column names in this row, i.e. they treat it as the header.
map_16s[0]
Out[4]:
In [5]:
# Display the second row of the mapping table (the first row after the header)
# to see what a data record looks like.
map_16s[1]
Out[5]:
In [6]:
# Peek at the first five fields of the MetaPhlAn header line.
metaphlan_map[:5]
Out[6]:
In [7]:
# Start the list of matched mapping rows with the mapping table's first
# (header) row so the header ends up at index 0 of the result.
map_16s_matched = [map_16s[0]]
In [8]:
# Column position of 'SRS_SampleID' in the mapping header.
# list.index raises ValueError if the header has no such column.
SRS_SampleID = map_16s[0].index('SRS_SampleID')
In [9]:
# Collect every mapping row whose SRS_SampleID also appears in the MetaPhlAn
# header.  Element 0 of the header is excluded from the comparison.
# The IDs are put into a set once, so each row costs a single hash lookup
# rather than a fresh list slice plus a linear scan.
metaphlan_ids = set(metaphlan_map[1:])
for row in map_16s[1:]:
    if row[SRS_SampleID] in metaphlan_ids:
        map_16s_matched.append(row)
In [10]:
# Number of rows collected so far; the count includes the header row that was
# placed at index 0.
len(map_16s_matched)
Out[10]:
In [11]:
# Number of fields in the MetaPhlAn header line, for comparison with the
# matched-row count.  The matching code skips element 0 of this list,
# presumably because it is a label rather than a sample ID -- TODO confirm.
len(metaphlan_map)
Out[11]:
In [12]:
# Not every MetaPhlAn sample found a mapping row, so load the HUMAnN table as
# well and see whether the missing ones can be recovered from it.
# humann_f holds the whole file as a list of rows (each a list of strings).
with open('hmp_humann/04b-mpm-cop-nul-nve-nul-nve.txt','rU') as humann_handle:
    humann_f = []
    for row in csv.reader(humann_handle, delimiter='\t'):
        humann_f.append(row)
In [15]:
# Peek at the first five column labels of the HUMAnN header row.
humann_f[0][:5]
Out[15]:
In [17]:
# Remove the pipeline suffix from every label of the HUMAnN header row, in
# place, so the labels can be compared against the MetaPhlAn header.
humann_header = humann_f[0]
for col, label in enumerate(humann_header):
    humann_header[col] = label.replace('_vs_KEGG_v54-mpm-cop-nul-nve-nul-nve', '')
In [18]:
# Check the first five HUMAnN column labels again, now with the suffix removed.
humann_f[0][:5]
Out[18]:
In [20]:
# Count how many HUMAnN header labels also occur in the MetaPhlAn header and
# print that single number.
i = sum(1 for item in humann_f[0] if item in metaphlan_map)
print(i)
In [21]:
# okay, the header overlap looks good -- use the HUMAnN table to recover
# mapping rows for MetaPhlAn samples.
#
# For every MetaPhlAn header entry that also appears in the HUMAnN header, read
# four values from that HUMAnN column and add each mapping row that agrees with
# them on (RSID and body site), or on PSN, or on NAP.
RSID = map_16s[0].index('RSID')
HMPBodySubsiteHMPBodySite = map_16s[0].index('HMPBodySubsiteHMPBodySite')
PSN = map_16s[0].index('PSN')
NAP = map_16s[0].index('NAP')

# Rows already collected, tracked by object identity so that no mapping row is
# appended (and later written out) more than once.
already_matched = set(id(matched_row) for matched_row in map_16s_matched)

for item in metaphlan_map:
    try:
        ind_humann = humann_f[0].index(item)
    except ValueError:
        # This MetaPhlAn entry has no column in the HUMAnN table; skip it.
        continue
    # Rows 1, 5, 6 and 9 of the HUMAnN table are read as per-sample metadata.
    # NOTE(review): what each row holds is inferred from the variable names
    # only -- confirm against the file.
    this_RANDSID = humann_f[1][ind_humann]
    this_STSite = humann_f[5][ind_humann]
    this_Parent_Specimen = humann_f[6][ind_humann]
    this_SRS = humann_f[9][ind_humann]
    for map_row in map_16s[1:]:
        if id(map_row) in already_matched:
            continue
        # Test and append the row currently being scanned (map_row), never a
        # loop variable left over from another cell.
        if ((map_row[RSID] == this_RANDSID
                and map_row[HMPBodySubsiteHMPBodySite] == this_STSite)
                or map_row[PSN] == this_Parent_Specimen
                or map_row[NAP] == this_SRS):
            map_16s_matched.append(map_row)
            already_matched.add(id(map_row))
In [22]:
# Row count after the HUMAnN-based recovery pass (header row included).
len(map_16s_matched)
Out[22]:
In [25]:
# Inspect the first non-header row of the matched table.
map_16s_matched[1]
Out[25]:
In [26]:
# Give every row (the header row included) a new leading column: the row's
# first field cut off just before the first '.V', or the whole field when it
# contains no '.V'.  Each list entry is replaced by a new, longer list.
# Running this cell a second time would prepend yet another column.
for idx, old_row in enumerate(map_16s_matched):
    short_id = old_row[0].partition('.V')[0]
    map_16s_matched[idx] = [short_id] + old_row
In [27]:
# Inspect the first non-header row again to check the newly prepended column.
map_16s_matched[1]
Out[27]:
In [28]:
# The header row also received a prepended (shortened) value; overwrite that
# first cell with the name of the new column.
map_16s_matched[0][0] = 'Shortened_SampleID'
In [29]:
# Write all matched rows, header first, to a tab-separated file.
# NOTE(review): binary mode 'wb' is what the csv module expects on Python 2;
# under Python 3 this would need text mode with newline='' -- confirm the
# target interpreter before porting.
with open('MetaphlanMapping.tsv','wb') as out_handle:
    csv.writer(out_handle, delimiter='\t').writerows(map_16s_matched)
In [ ]: