Aim : 
1, Take damn.txt and rep_set_tax_assignments.txt
2, Make a data frame
# OTU NC1_c NC2_c etc K O P C F G S U
Populate with NULL if no values are available
3, Make a function to subset all the rows which have P == 'Firmicutes' etc. and Sample == NC1
It should give results like this (as another df):
# OTU NC1 K O P C F G S U
4, def : df to counter
input : # OTU NC1 K O P C F G S U
output : # OTU NC1_c
5. Compare two outputs
subset the df1 and df2
df1_NC1  
otu1 12
otu2 34
otu3 3 etc 
df2_NC1  
otu1 23
otu2 4
otu3 66 etc 
Make a matplotlib.pyplot plot of these two Series
In [2]:
    
!ls
    
    
In [3]:
    
import pandas as pd
import matplotlib.pyplot as plt
import re
    
In [4]:
    
# Read the whole input file into a list of lines (trailing newlines are kept).
f1 = "damn.txt"
with open(f1, 'r') as f:
    data = list(f)
    
In [5]:
    
# This piece of code gives the counts based on groups 
newdt = {}
otus =[]
print "OTU\t HFD_c\tHFS_c\tNC_c\tTotal"
for e,i in enumerate(data[0:10]):
    i = i.strip("\n") 
    #print e, i.split("\t")[1],';'.join(i.split("\t")[2:])
    #print e, i.split("\t")[0], len(i.split("\t"))-1
    otu = i.split("\t")[0]
    #print otu
    otus.append(otu)
    readids = i.split("\t")[1:]
    #newdt[otu] =  '|'.join(readids).replace(" ","")
    NCs, HFD, HFS = [], [], []
    for p,x in enumerate(readids):
        #print "Line :", e+1,p+1 ,x
        if "NC" in x:
            NCs.append(x)
        elif "HFD" in x:
            HFD.append(x)
        elif "HFS" in x:
            HFS.append(x)
        #print HFD, HFS, NCs
    HFD_c=len(HFD)
    HFS_c=len(HFS)
    NC_c = len(NCs)
    #print otu, "HFD:" ,HFD_c, "HFS:",HFS_c,"NCs:",NC_c, "Total:", HFD_c + HFS_c + NC_c
    print otu,"\t", HFD_c,"\t", HFS_c,"\t", NC_c,"\t", HFD_c + HFS_c + NC_c
#print newdt
#keys =  [ k for k in newdt.keys() ]
#testS = pd.Series(newdt)
    
    
In [6]:
    
!head -4 mapping_file2.txt
    
    
In [7]:
    
# Use the mapping file to get the sample IDs.
# header=0 makes the first row the column names, which the "#SampleID" lookup
# below relies on. A bool is not a valid `header` value: current pandas
# raises TypeError for header=False.
mf = pd.read_csv("mapping_file2.txt", header=0, sep="\t")
SampleID = mf["#SampleID"]
    
In [8]:
    
# Build several compiled regex objects in one go.
def re_compiler(SampleID):
    """pandasSeries | list -> list
    Returns one (pattern, compiled regex) tuple per entry of SampleID,
    in the same order. Each entry is passed to re.compile unchanged.
    """
    return [(sample, re.compile(sample)) for sample in SampleID]
    
In [9]:
    
# One (sample id, compiled regex) pair per sample; show the first three.
pp = re_compiler(SampleID)
pp[:3]
    
    Out[9]:
In [10]:
    
def line_to_counts(line, pp):
    """list, list -> dict
    Returns {pattern: count} for the entries of line.

    pp is a list of (pattern, compiled regex) tuples as built by
    re_compiler. An entry is counted for every pattern whose regex matches
    at its start; patterns that never match are absent from the result.
    """
    result = {}
    for read_id in line:
        for pair in pp:
            name, regex = pair[0], pair[1]
            if not regex.match(read_id):
                continue
            result[name] = result.get(name, 0) + 1
    return result
    
In [11]:
    
def line_to_counts2(line, pp):
    """list, list -> list
    Returns one count per pattern in pp, in the order of pp.

    pp is a list of (pattern, compiled regex) tuples as built by
    re_compiler. An entry of line is counted for every pattern whose regex
    matches at its start; a pattern that never matches gets 0.

    >>> pp = [("NC1", re.compile("NC1")), ("HFD1", re.compile("HFD1"))]
    >>> line_to_counts2(["NC1_7", "NC1_9", "HFS2_1"], pp)
    [2, 0]
    """
    tally = {}
    for read_id in line:
        for p in pp:
            if p[1].match(read_id):
                # Accumulate in the tally dict. The earlier version tested
                # membership in the (still empty) output list, so every
                # count was reset to 1 instead of being incremented.
                tally[p[0]] = tally.get(p[0], 0) + 1
    # Emit the counts in pattern order, filling in 0 for unseen patterns.
    return [tally.get(p[0], 0) for p in pp]
    
In [12]:
    
# This piece of code gives the counts based on subgroups
otus =[]
print "OTU",[ i[0] for i in pp]
for e,i in enumerate(data[0:10]):
    i = i.strip("\n")
    otu = i.split("\t")[0]
    print otu,
    readids = i.split("\t")[1:]
    print line_to_counts2(readids, pp)
    
    
In [13]:
    
# make a DF of OTUs counts and taxa