Aim : 
1, Take damn.txt and rep_set_tax_assignments.txt
2, Make a data frame
# OTU NC1_c NC2_c etc K O P C F G S U
Popluate with NULL if no values are available

3, Make a def to subset all the rows which has P == 'Firmicutes' etc. and Sample == NC1
it should give the results like this (is another df)
# OTU NC1 K O P C F G S U

4, def : df to counter
input : # OTU NC1 K O P C F G S U
output : # OTU NC1_c

5. Compare two outputs
subset the df1 and df2
df1_NC1  
otu1 12
otu2 34
otu3 3 etc 

df2_NC1  
otu1 23
otu2 4
otu3 66 etc 

make a matplotlib.plt of these two Series

In [2]:
!ls


PlayBioms.py	      data_scripts.Rproj     modified_damn.csv
Playing_Pandas.ipynb  final_otu_map.txt      rep_set_tax_assignments.txt
ahist.R		      final_otu_map_mc1.txt  sofar.R
bhist.R		      final_qiime.R	     ssfinal_otu_map.txt
checkingRphyloseq.R   jiaco2roshan.R	     ssrep_set_tax_assignments.txt
damn.txt	      mapping_file2.txt

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import re

In [4]:
f1 = "damn.txt"
with open(f1, 'r')  as f:
    data = f.readlines()

In [5]:
# This piece of code gives the counts based on groups 

newdt = {}
otus =[]
print "OTU\t HFD_c\tHFS_c\tNC_c\tTotal"
for e,i in enumerate(data[0:10]):
    i = i.strip("\n") 
    #print e, i.split("\t")[1],';'.join(i.split("\t")[2:])
    #print e, i.split("\t")[0], len(i.split("\t"))-1
    otu = i.split("\t")[0]
    #print otu
    otus.append(otu)
    readids = i.split("\t")[1:]
    #newdt[otu] =  '|'.join(readids).replace(" ","")
    NCs, HFD, HFS = [], [], []
    for p,x in enumerate(readids):
        #print "Line :", e+1,p+1 ,x
        if "NC" in x:
            NCs.append(x)
        elif "HFD" in x:
            HFD.append(x)
        elif "HFS" in x:
            HFS.append(x)
        #print HFD, HFS, NCs
    HFD_c=len(HFD)
    HFS_c=len(HFS)
    NC_c = len(NCs)
    #print otu, "HFD:" ,HFD_c, "HFS:",HFS_c,"NCs:",NC_c, "Total:", HFD_c + HFS_c + NC_c
    print otu,"\t", HFD_c,"\t", HFS_c,"\t", NC_c,"\t", HFD_c + HFS_c + NC_c
#print newdt
#keys =  [ k for k in newdt.keys() ]
#testS = pd.Series(newdt)


OTU	 HFD_c	HFS_c	NC_c	Total
206494 	18 	0 	21 	39
1800048 	0 	2 	0 	2
541135 	9 	0 	65 	74
276629 	5 	2 	106 	113
1036749 	4 	8 	0 	12
259732 	2 	0 	0 	2
276195 	25 	0 	8 	33
186233 	0 	2 	0 	2
276620 	0 	0 	2 	2
373909 	24 	12 	12 	48

In [6]:
!head -4 mapping_file2.txt


#SampleID	BarcodeSequence	LinkerPrimerSequence	InputFileName	Treatment	Replicate	Description
HFD.1	GAGGGC		/anas/roshan-current/i2mc/burcelin/miseq/testV/merged_data/HFD_1.fasta	HFD	1	desc
HFD.2	TACAAG		/anas/roshan-current/i2mc/burcelin/miseq/testV/merged_data/HFD_2.fasta	HFD	2	desc
HFD.3	CGTTTC		/anas/roshan-current/i2mc/burcelin/miseq/testV/merged_data/HFD_3.fasta	HFD	3	desc

In [7]:
# Use mapping file to get the Sample Ids
mf =pd.read_csv("mapping_file2.txt" , header=False, sep="\t")
SampleID = mf["#SampleID"]

In [8]:
# create several regex complie objs at once
def re_compiler(SampleID):
    """pandasSeries | list->list
    Returns a list of tuples with pattern and re compiler obj
    """
    r =[]
    for i in SampleID:
        y = i 
        y2 = re.compile(i)
        r.append((y,y2))
    return(r)

In [9]:
pp = re_compiler(SampleID)
pp[0:3]


Out[9]:
[('HFD.1', re.compile(r'HFD.1')),
 ('HFD.2', re.compile(r'HFD.2')),
 ('HFD.3', re.compile(r'HFD.3'))]

In [10]:
def line_to_counts(line, pp):
    result = {}
    for i in line:
        for p in pp:
            if p[1].match(i):
                #print p[0], "line found: ", i
                if not p[0] in result:
                    result[p[0]] =1
                else:
                    result[p[0]] +=1
    return result

In [11]:
def line_to_counts2(line, pp):
    """list,list->list
    Returns a list of counts for each pattern in pp
    pp is a tuple of patten and re compiler 
    made from function re_compiler
    >>>lines_to_counts2(line, pp)
    [6, 10, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0]
    """
    dict1 = {}
    result=[]
    for i in line:
        for p in pp:
            if p[1].match(i):
                #print p[0], "line found: ", i
                if not p[0] in result:
                    dict1[p[0]] =1
                else:
                    dict1[p[0]] +=1
    for i in pp:
        #print i[0]
        if dict1.get(i[0]):
            #print i,result.get(i)
            result.append(dict1.get(i[0]))
        else:
            #print i,0
            result.append(0)
    return result

In [12]:
# This piece of code gives the counts based on subgroups
otus =[]
print "OTU",[ i[0] for i in pp]
for e,i in enumerate(data[0:10]):
    i = i.strip("\n")
    otu = i.split("\t")[0]
    print otu,
    readids = i.split("\t")[1:]
    print line_to_counts2(readids, pp)


OTU ['HFD.1', 'HFD.2', 'HFD.3', 'HFD.4', 'HFD.5', 'HFDS.1', 'HFDS.2', 'HFDS.3', 'HFDS.4', 'HFDS.5', 'NC.1', 'NC.2', 'NC.3', 'NC.4', 'NC.5']
206494 [1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]
1800048 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
541135 [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1]
276629 [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
1036749 [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
259732 [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
276195 [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1]
186233 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
276620 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]
373909 [1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0]

In [13]:
# make a DF of OTUs counts and taxa