Fix the counts file so that it can be read with the R commands:
dt = read.csv(file="hmp1.v13.hq.otu.counts.header", header=T, sep="\t", quote="")
dt = dt[,1:27655]
I also split the file to two -on with the PPS
lines, one with the may1
lines.
In [10]:
fin = open("hmp1.v13.hq.otu.counts")
fout_may1 = open("hmp1.v13.hq.otu.counts.may1", "w")
fout_PPS = open("hmp1.v13.hq.otu.counts.PPS", "w")
line = fin.readline()
line = line.split("\t")
line = line[1:]
line[-1] = line[-1][:-1]
line.append(str(int(line[-1])+1) + '\n')
fout_may1.write('\t'.join(line))
fout_PPS.write('\t'.join(line))
PPS = 0
may1 = 0
for line in fin:
line = line.split('\t')
nap, tag = line[0].split('.')
line[0] = nap
if 'may1' == tag:
fout_may1.write('\t'.join(line))
may1 += 1
elif 'PPS' == tag:
fout_PPS.write('\t'.join(line))
PPS += 1
fout_may1.close()
fout_PPS.close()
fin.close()
print "PPS",PPS,"may1",may1
In [8]:
!ls -lhg hmp1.v13.hq.otu.counts*
Find duplicate NPS:
In [7]:
fin = open("hmp1.v13.hq.otu.counts")
fout = open('duplicates', 'w')
line = fin.readline()
fout.write(line)
ids = dict()
cnt = 0
for line in fin:
collection = line.split('\t')[0]
nps, tag = collection.split('.')
if nps in ids:
fout.write(ids[nps])
fout.write(line)
cnt += 1
else:
ids[nps] = line
if cnt == 10:
break
fin.close()
fout.close()
In [ ]:
Fix the mapping file so that each taxa will be on a different column
In [15]:
def remove_enclosure(s):
try:
return s[:s.index('(')]
except ValueError:
return s
line = '249 Root(100);Bacteria(100);"Firmicutes"(90);"Clostridia"(90);Clostridiales(90);unclassified(87);unclassified(87);\n'
otu,taxonomy = map(str.strip, line.split('\t'))
map(lambda x: x.replace('"', ''), map(remove_enclosure, taxonomy.split(';')[1:-1]))
Out[15]:
In [39]:
fin = open("hmp1.v13.hq.otu.lookup")
fout = open("hmp1.v13.hq.otu.lookup.split", "w")
header = fin.readline()
header = ["otu", "domain","kingdom","phylum","class","order","family"]#,"genus"]
fout.write('\t'.join(header) + '\n')
for line in fin:
otu,taxonomy = map(str.strip, line.split('\t'))
taxa = map(lambda x: x.replace('"', ''), map(remove_enclosure, taxonomy.split(';')[1:-1]))
taxa.insert(0, otu)
while len(taxa) < len(header):
taxa.append("NA")
if len(taxa) > len(header): print "otu %s: %d taxa, %d header" % (otu, len(taxa), len(header))
fout.write('\t'.join(taxa)+'\n')
fout.close()
fin.close()
In [41]:
!head -n10 hmp1.v13.hq.otu.lookup.split
In [ ]: