Aim :
1, Take damn.txt and rep_set_tax_assignments.txt
2, Make a data frame
# OTU NC1_c NC2_c etc K O P C F G S U
Populate with NULL if no values are available
3, Make a def to subset all the rows which have P == 'Firmicutes' etc. and Sample == NC1
it should give results like this (the result is another df)
# OTU NC1 K O P C F G S U
4, def : df to counter
input : # OTU NC1 K O P C F G S U
output : # OTU NC1_c
5. Compare two outputs
subset the df1 and df2
df1_NC1
otu1 12
otu2 34
otu3 3 etc
df2_NC1
otu1 23
otu2 4
otu3 66 etc
Make a matplotlib.pyplot plot of these two Series
In [2]:
!ls
In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import re
In [4]:
# Path of the tab-separated OTU map: one OTU per line, OTU id first, read ids after.
f1 = "damn.txt"
# Slurp the whole file; every element of `data` is one line, trailing newline included.
with open(f1, 'r') as f:
    data = list(f)
In [5]:
# This piece of code gives the counts based on groups
# For each of the first 10 lines of `data` it prints one tab-separated row:
# the OTU id, how many read ids contain "HFD", "HFS" and "NC", and their sum.
# NOTE(review): the exported notebook lost its indentation; the nesting below
# is reconstructed from the control flow - confirm against the original .ipynb.
# NOTE: the print statements are Python 2 syntax.
newdt = {}  # stays empty: its only assignment is commented out below
otus =[]  # OTU ids in the order they were read
print "OTU\t HFD_c\tHFS_c\tNC_c\tTotal"
for e,i in enumerate(data[0:10]):
    i = i.strip("\n")
    #print e, i.split("\t")[1],';'.join(i.split("\t")[2:])
    #print e, i.split("\t")[0], len(i.split("\t"))-1
    # First tab-separated field is the OTU id, the remaining fields are read ids.
    otu = i.split("\t")[0]
    #print otu
    otus.append(otu)
    readids = i.split("\t")[1:]
    #newdt[otu] = '|'.join(readids).replace(" ","")
    # Per-OTU buckets, reset for every input line.
    NCs, HFD, HFS = [], [], []
    for p,x in enumerate(readids):
        #print "Line :", e+1,p+1 ,x
        # Substring tests are checked in this order, so a read id that
        # contains "NC" is counted as NC even if it also contains "HFD"/"HFS".
        # Read ids matching none of the three are dropped from every count.
        if "NC" in x:
            NCs.append(x)
        elif "HFD" in x:
            HFD.append(x)
        elif "HFS" in x:
            HFS.append(x)
    #print HFD, HFS, NCs
    HFD_c=len(HFD)
    HFS_c=len(HFS)
    NC_c = len(NCs)
    #print otu, "HFD:" ,HFD_c, "HFS:",HFS_c,"NCs:",NC_c, "Total:", HFD_c + HFS_c + NC_c
    # Column order matches the header printed above the loop.
    print otu,"\t", HFD_c,"\t", HFS_c,"\t", NC_c,"\t", HFD_c + HFS_c + NC_c
#print newdt
#keys = [ k for k in newdt.keys() ]
#testS = pd.Series(newdt)
In [6]:
!head -4 mapping_file2.txt
In [7]:
# Use the mapping file to get the sample ids.
# header=0 makes the first row the column names, which the "#SampleID" lookup
# below requires. The previous header=False only worked because False == 0 in
# old pandas; current pandas raises TypeError when header is a bool.
mf = pd.read_csv("mapping_file2.txt", header=0, sep="\t")
# Series of sample ids, one per row of the mapping file.
SampleID = mf["#SampleID"]
In [8]:
# create several regex compile objs at once
def re_compiler(SampleID):
    """pandasSeries | list -> list

    Pair every sample id with a regex compiled from it.

    Each element of SampleID is handed to ``re.compile`` unchanged, so it is
    interpreted as a regular expression rather than as literal text.

    Returns a list of ``(sample_id, compiled_pattern)`` tuples in input order.
    """
    return [(sample, re.compile(sample)) for sample in SampleID]
In [9]:
# Compile the sample-id patterns once so they can be reused for every OTU line.
pp = re_compiler(SampleID)
# Peek at the first three (sample id, pattern) pairs as the cell output.
pp[:3]
Out[9]:
In [10]:
def line_to_counts(line, pp):
    """list, list -> dict

    Count, per sample id, how many entries of `line` its pattern matches.

    line: iterable of strings (the read ids of one OTU).
    pp:   sequence of (sample_id, compiled_pattern) pairs, as built by
          re_compiler.

    Returns a dict mapping sample_id -> number of matching entries. Sample
    ids with no match are absent from the dict. ``pattern.match`` anchors at
    the start of the string, and one entry may be counted for several sample
    ids if more than one pattern matches it.
    """
    result = {}
    for read_id in line:
        for pair in pp:
            sample_id, pattern = pair[0], pair[1]
            if pattern.match(read_id):
                result[sample_id] = result.get(sample_id, 0) + 1
    return result
In [11]:
def line_to_counts2(line, pp):
    """list, list -> list

    Return one count per pattern in `pp`, in the same order as `pp`.

    line: iterable of strings (the read ids of one OTU).
    pp:   sequence of (sample_id, compiled_pattern) pairs, as built by
          re_compiler.

    Each count is the number of entries of `line` that the pattern matches
    (``pattern.match``, i.e. anchored at the start of the string). Patterns
    that match nothing contribute 0, so the result always has len(pp) items.

    >>> import re
    >>> pp = [("NC1", re.compile("NC1")), ("HFD1", re.compile("HFD1"))]
    >>> line_to_counts2(["NC1_7", "NC1_9", "zzz"], pp)
    [2, 0]
    """
    # Tally per sample id. The earlier version tested membership against the
    # (still empty) output list instead of this dict, so every hit reset the
    # tally to 1 and no count could ever exceed 1.
    counts = {}
    for read_id in line:
        for pair in pp:
            if pair[1].match(read_id):
                counts[pair[0]] = counts.get(pair[0], 0) + 1
    # Emit the tallies in pp order, filling in 0 for samples with no hits.
    return [counts.get(pair[0], 0) for pair in pp]
In [12]:
# This piece of code gives the counts based on subgroups
otus =[]
print "OTU",[ i[0] for i in pp]
for e,i in enumerate(data[0:10]):
i = i.strip("\n")
otu = i.split("\t")[0]
print otu,
readids = i.split("\t")[1:]
print line_to_counts2(readids, pp)
In [13]:
# make a DF of OTUs counts and taxa