Building database

I have 6 SS WPS2 genomes, so I'll concatenate sequences into a single combined fasta file

Version 1.3:

  • Functional programming
  • Combined assembled and unassembled

In [4]:
from Bio import SeqIO
import time
import os
import shutil
import pandas

In [5]:
#parameters
version = 'v1.4'
project_name = 'wps2_bl_metagenome'
e_value = 1e-20
iden = 95.0
metric = 50.0

In [2]:
!cat ../ss_genomes/AP_WPS-2_bacterium* > ../ss_genomes/all_AP_WPS-2_bacterium.fna

In [2]:


In [ ]:


In [3]:
!makeblastdb -in ../ss_genomes/all_AP_WPS-2_bacterium.fna -dbtype nucl -title "combined_ss_WPS2" -out ../blast_db/combined_ss_WPS2/combined_ss_WPS2 -parse_seqids



Building a new DB, current time: 11/13/2016 23:46:34
New DB name:   ../blast_db/combined_ss_WPS2/combined_ss_WPS2
New DB title:  combined_ss_WPS2
Sequence type: Nucleotide
Keep Linkouts: T
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 665 sequences in 0.0848749 seconds.

OK - verified

Running Blast query


In [3]:
import time
start =time.time()
!blastp -db test_pp_pmo -query 76969.assembled.faa -out assembl_contigs_vs_test_pp_pmo2.tab -evalue .00001 -outfmt 6 -num_threads 8
print "command was executed in %d seconds"%(time.time()-start)


^C

Optimizing number of threads - No real difference


In [1]:
ex_time=list()
num_thread=list()
for i in [1,2,4,6,8]:
    start =time.time()
    !blastp -db test_pp_pmo -query 76969.assembled.faa -out assembl_contigs_vs_test_pp_pmo2.tab -evalue .00001 -outfmt 6 -num_threads {i}
    elapsed=time.time()-start
    print 'number of threads: ', i
    print 'exec time: ', elapsed
    ex_time.append(elapsed)
    num_thread.append(i)


number of threads:  1
exec time:  2587.95323396
number of threads:  2
exec time:  2689.65229201
number of threads:  4
exec time:  2710.735955
number of threads:  6
exec time:  2715.41233706
number of threads:  8
exec time:  2493.76898289

In [ ]:


In [ ]:

Defining functions


In [126]:
def parse_contigs_ind(f_name):
    """
    Returns sequences index from the input files(s)
    remember to close index object after use
    """
    handle = open(f_name, "rU")
    record_dict = SeqIO.index(f_name,"fasta")
    handle.close()
    return record_dict

#returning specific sequences and overal list
def retrive_sequence(contig_lst, rec_dic):
    """
    Returns list of sequence elements from dictionary/index of SeqIO objects specific to the contig_lst parameter
    """
    contig_seqs = list()
    #record_dict = rec_dic
    #handle.close()
    for contig in contig_lst:
        contig_seqs.append(rec_dic[contig].seq.tostring())
    return contig_seqs


def filter_seq_dict(key_lst, rec_dic):
    """
    Returns filtered dictionary element from rec_dic according to sequence names passed in key_lst
    """
    return { key: rec_dic[key] for key in key_lst }

def unique_scaffold_topEval(dataframe):
#returns pandas series object
    variables = list(dataframe.columns.values)
    scaffolds=dict()
    rows=list()

    for row in dataframe.itertuples():
        
        #if row[1]=='Ga0073928_10002560':
        if row[1] not in scaffolds:
            scaffolds[row[1]]=row
        else:
            if row[11]<scaffolds[row[1]][11]:
                scaffolds[row[1]]=row
    rows=scaffolds.values()
    #variables=['quid', 'suid', 'iden', 'alen', 'mism', 'gapo', 'qsta', 'qend', 'ssta', 'send', 'eval', 'bits']
    df = pandas.DataFrame([[getattr(i,j) for j in variables] for i in rows], columns = variables)
    return df

In [ ]:


In [ ]:

Making all_WPS-2 Database [ln3] again more scalable way


In [ ]:


In [ ]:


In [3]:
iteration = 0

In [2]:
blast_db_homeDir = "../blast_db/"

In [3]:
#!mkdir blast_db_homeDir

Reading Raw input file and creating specific input file


In [21]:
#command_line file of reference sequences
reference_0 = "../ss_genomes/all_AP_WPS-2_bacterium.fna"

input_dir = "input_files"
if os.path.exists(input_dir):
    shutil.rmtree(input_dir)
try:
    os.mkdir(input_dir)
except OSError:
    raise
#returns 
records_0 = parse_contigs_ind(reference_0)
ref_out_0 = "input_files/reference0.fna"
with open(ref_out_0, "w") as handle:
    SeqIO.write(records_0.values(), handle, "fasta")
    #NO NEED TO CLOSE with statement will automatically close the file
records_0.close()

In [13]:


In [15]:

After iteration0 is done:

  • filter assembled e-20: >=95 iden, >=50 metric
  • add records to output_1

In [66]:
output_1 = "../ss_genomes/iteration1.fna"

In [83]:
iteration1_all =cutoff_contigs['1e-20_cutoff_contigs']
print len(iteration1_all) 
len((iteration1_all[iteration1_all['iden']>=95.0]))
print len((iteration1_all[iteration1_all['iden']>=95.0]))
print len((iteration1_all[iteration1_all['Metric']>=50]))


9594
2085
5658

In [89]:
iteration1_filtered = iteration1_all[(iteration1_all['iden']>=95.0)&(iteration1_all['Metric']>=50)]
len(iteration1_filtered)


Out[89]:
1828

In [101]:
names = iteration1_filtered['quid'].tolist()

In [130]:
records_0 = dict(parse_contigs_ind(input_0))
#all_records = parse_contigs_ind(assembled_contigs)
records_1 = filter_seq_dict(names, all_records)
print type(records_1)
records_1.update(records_0)


<type 'dict'>

In [131]:
len(records_0.items())


Out[131]:
665

In [132]:
len(records_1.items())


Out[132]:
2493

Outputting iteration 1


In [133]:
#records_0 = parse_contigs_ind(input_0)
output_1_0 = "../ss_genomes/iteration1_0.fna"
output_1 = "../ss_genomes/iteration1_0.fna"
with open(output_1_0, "w") as handle:
  SeqIO.write(records_0.values(), handle, "fasta")
with open(output_1, "w") as handle:
  SeqIO.write(records_1.values(), handle, "fasta")

In [ ]:

Repeating what we've done


In [110]:
type(records_1)


# output_0 = "../ss_genomes/iteration0.fna"
# with open(output_0, "w") as handle:
#   SeqIO.write(records_0.values(), handle, "fasta")


Out[110]:
dict

In [105]:
len(records_0)


Out[105]:
665

In [ ]:

adding those to the database


In [80]:
(iteration1_all[iteration1_all['iden']>=95.0])&(iteration1_all[iteration1_all['alen']>=50])


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-80-91486e773425> in <module>()
----> 1 (iteration1_all[iteration1_all['iden']>=95.0])&(iteration1_all[iteration1_all['alen']>=50])

C:\Users\Andriy\Anaconda2\lib\site-packages\pandas\core\ops.pyc in f(self, other, axis, level, fill_value)
   1058     def f(self, other, axis=default_axis, level=None, fill_value=None):
   1059         if isinstance(other, pd.DataFrame):  # Another DataFrame
-> 1060             return self._combine_frame(other, na_op, fill_value, level)
   1061         elif isinstance(other, ABCSeries):
   1062             return self._combine_series(other, na_op, fill_value, axis, level)

C:\Users\Andriy\Anaconda2\lib\site-packages\pandas\core\frame.pyc in _combine_frame(self, other, func, fill_value, level)
   3478                                                     dtype=r.dtype)
   3479 
-> 3480                 result = dict([(col, f(col)) for col in this])
   3481 
   3482             # non-unique

C:\Users\Andriy\Anaconda2\lib\site-packages\pandas\core\frame.pyc in f(col)
   3474 
   3475                 def f(col):
-> 3476                     r = _arith_op(this[col].values, other[col].values)
   3477                     return self._constructor_sliced(r, index=new_index,
   3478                                                     dtype=r.dtype)

C:\Users\Andriy\Anaconda2\lib\site-packages\pandas\core\frame.pyc in _arith_op(left, right)
   3466                 right[right_mask & mask] = fill_value
   3467 
-> 3468             return func(left, right)
   3469 
   3470         if this._is_mixed_type or other._is_mixed_type:

C:\Users\Andriy\Anaconda2\lib\site-packages\pandas\core\ops.pyc in na_op(x, y)
    995                 yrav = yrav[mask]
    996                 if np.prod(xrav.shape) and np.prod(yrav.shape):
--> 997                     result[mask] = op(xrav, yrav)
    998             elif hasattr(x, 'size'):
    999                 result = np.empty(x.size, dtype=x.dtype)

TypeError: unsupported operand type(s) for &: 'str' and 'str'

In [ ]:


In [ ]:


In [10]:
dct1


Out[10]:
{'a': 2, 'b': 3, 'c': 5, 'd': 1}

In [ ]:


In [5]:
in_db = '../ss_genomes/all_AP_WPS-2_bacterium.fna'
title = 'combined_ss_WPS2'
outfile = "../blast_db/combined_ss_WPS2/combined_ss_WPS2"

!makeblastdb -in {infile} -dbtype nucl -title "{title}" -out {outfile} -parse_seqids



Building a new DB, current time: 11/13/2016 23:59:45
New DB name:   ../blast_db/combined_ss_WPS2/combined_ss_WPS2
New DB title:  combined_ss_WPS2
Sequence type: Nucleotide
Keep Linkouts: T
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 665 sequences in 0.0594308 seconds.

Blasting assembled and unasembled contigs


In [6]:
import time
q_fname1 = './../IMG Data/76969.assembled.fna'
q_fname2 = './../IMG Data/76969.unassembled_illumina.fna'
out_name1= 'all_wps2_q_assembled76969.tab'
out_name2= 'all_wps2_q_unasembled76969.tab'
database = '../blast_db/combined_ss_WPS2/combined_ss_WPS2'
# start =time.time()
# !blastn -db {database} -query "{q_fname1}" -out {out_name1} -evalue .00001 -outfmt 6 -num_threads 8
# elapsed=time.time()-start
# print 'exec time: ', elapsed
# start =time.time()
# !blastn -db {database} -query "{q_fname2}" -out {out_name2} -evalue .00001 -outfmt 6 -num_threads 8
# elapsed=time.time()-start
# print 'exec time: ', elapsed

In [ ]:


In [ ]:


In [ ]:


In [ ]:

Parsing Blast Outputs


In [ ]:


In [ ]:

Trying pandas instead


In [2]:
del recruited_mg


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-2-e75790db1808> in <module>()
----> 1 del recruited_mg

NameError: name 'recruited_mg' is not defined

In [14]:
recruited_mg = []
blast_cols = ['quid', 'suid', 'iden', 'alen', 'mism', 'gapo', 'qsta', 'qend', 'ssta', 'send', 'eval', 'bits']
recruited_mg_1 = pandas.read_csv(out_name1 ,sep="\t", header=None)
recruited_mg_1.columns=['quid', 'suid', 'iden', 'alen', 'mism', 'gapo', 'qsta', 'qend', 'ssta', 'send', 'eval', 'bits']

recruited_mg_2 = pandas.read_csv(out_name2 ,sep="\t", header=None)
recruited_mg_2.columns=['quid', 'suid', 'iden', 'alen', 'mism', 'gapo', 'qsta', 'qend', 'ssta', 'send', 'eval', 'bits']

recruited_mg = [recruited_mg_1, recruited_mg_2]

In [15]:
len(recruited_mg[0])


Out[15]:
26364

In [16]:
len(recruited_mg[1])


Out[16]:
12145

In [ ]:


In [ ]:

Adding "Metric", "Seq_size", "Seq_nt" fields

indeces of metagenome sequences and the recruited from blast dataframe are placed in lists of the same size: No of -m (metagenome) sequences


In [100]:
#For linux use symlink
metagenome_1 ="../../pp_metagenome1/IMG Data/76969.assembled.fna"
metagenome_2 ="../../pp_metagenome1/IMG Data/76969.unassembled_illumina.fna"
#!Beware of closing all records enries eventually
all_records_mg_1=parse_contigs_ind(metagenome_1)
all_records_mg_2=parse_contigs_ind(metagenome_2)
all_records=[all_records_mg_1, all_records_mg_2]

In [131]:
start = time.time()
#wps2_eval_cutoff['Seq_nt']=retrive_sequence(contig_list, all_records)
for i in range(len(recruited_mg)):
    #cutoff_contigs[dataframe]=evalue_filter(cutoff_contigs[dataframe])
    recruited_mg[i]=unique_scaffold_topEval(recruited_mg[i])
    contig_list = recruited_mg[i]['quid'].tolist()
    recruited_mg[i]['Seq_nt']=retrive_sequence(contig_list, all_records[i])
    recruited_mg[i]['Seq_size']=recruited_mg[i]['Seq_nt'].apply(lambda x: len(x))
    recruited_mg[i]['Coverage']=recruited_mg[i]['alen'].apply(lambda x: float(x))/recruited_mg[i]['Seq_size']
    recruited_mg[i]['Metric']=recruited_mg[i]['Coverage']*recruited_mg[i]['iden']
    recruited_mg[i] = recruited_mg[i][['quid', 'suid', 'iden', 'alen','Coverage','Metric', 'mism', 'gapo', 'qsta', 'qend', 'ssta', 'send', 'eval', 'bits','Seq_size', 'Seq_nt']]
    #all_records[i].close()# keep open if multiple iterations

    
elapsed=time.time()-start
print elapsed


1.90100002289

In [91]:
# don't close just yet
# do after it is not needed
for i in range(len(recruited_mg)):
    all_records[i].close()

In [132]:
print  len(recruited_mg[0])
print len(recruited_mg[1])


11345
6297

Number of unique scaffolds found


In [ ]:


In [133]:
len(set(recruited_mg[0]['quid']))


Out[133]:
11345

In [134]:
len(set(recruited_mg[1]['quid']))


Out[134]:
6297

Evalue sorting


In [40]:
#result = df.sort(['A', 'B'], ascending=[1, 0])\

# wps2_assembled_sort_eval = wps2_assembled.sort_values(by=['eval'], ascending=False)
# wps2_assembled_sort_eval

Filtering out by evalue cutoff


In [25]:
# eval_cutoff = 1e-20
# wps2_assembled_filtered = wps2_assembled_sort_eval[wps2_assembled_sort_eval['eval']<eval_cutoff]
# wps2_assembled_filtered

Efect of evalue on # of unique scaffolds found


In [26]:
# #Doing a 
# unique_scaffolds = list()
# eval_cutoffs = [1e-10, 1e-20, 1e-30, 1e-40, 1e-50, 1e-60, 1e-70, 1e-80, 1e-90, 1e-100, 1e-110, 1e-120, 1e-130, 1e-140, 1e-150]
# #for e_val in eval_cutoffs:
# for e_val in eval_cutoffs:
#     wps2_assembled_filtered = wps2_assembled_sort_eval[wps2_assembled_sort_eval['eval']<e_val]
#     unique_scaffolds.append(len(set(wps2_assembled_filtered['quid'])))

In [ ]:


In [ ]:


In [27]:
# print eval_cutoffs
# print unique_scaffolds

In [28]:
# import matplotlib
# import matplotlib.pyplot as plt
# %matplotlib inline  
# #plt.semilogy(t, np.exp(-t/5.0))
# plt.plot()
# plt.semilogx(eval_cutoffs, unique_scaffolds, marker = 'o', basex=10)
# plt.title('Number of unique scaffolds as a function of BLAST e-Value')
# plt.xticks(eval_cutoffs)
# plt.ylabel('No. of unique scaffolds')
# plt.xlabel('BLAST e-Value')
# plt.grid(True)
# plt.ylim((0,12000))
# plt.show()

Doing some stats on the recruited data


In [41]:
#assembled

ranges = range(5, 210, 15)
unique_scaffolds = list()
eval_ranges = [10**(-x) for x in ranges]
assembled_filtered_lst = list()
#for e_val in eval_cutoffs:
for i in range(len(eval_ranges)):
    assembled_filtered = recruited_mg[0][recruited_mg[0]['eval']<eval_ranges[i]]
    assembled_filtered_lst.append(assembled_filtered)
    #unique_scaffolds.append(len(set(wps2_assembled_filtered['quid'])))
    unique_scaffolds.append(len(set(assembled_filtered_lst[i]['quid'])))

In [42]:
print ranges
print eval_ranges
print unique_scaffolds


[5, 20, 35, 50, 65, 80, 95, 110, 125, 140, 155, 170, 185, 200]
[1e-05, 1e-20, 1e-35, 1e-50, 1e-65, 1e-80, 1e-95, 1e-110, 1e-125, 1e-140, 1e-155, 1e-170, 1e-185, 1e-200]
[11341, 9594, 8348, 7455, 6718, 6131, 5669, 5281, 4927, 4592, 4236, 3938, 3724, 3724]

In [ ]:


In [43]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline  

import matplotlib.pylab as pylab
params = {'legend.fontsize': 'x-large',
          'figure.figsize': (15, 5),
         'axes.labelsize': 'x-large',
         'axes.titlesize':'x-large',
         'xtick.labelsize':'x-large',
         'ytick.labelsize':'x-large'}
pylab.rcParams.update(params)

#plt.semilogy(t, np.exp(-t/5.0))
plt.plot()
plt.semilogx(eval_ranges, unique_scaffolds, marker = 'o', basex=10)
plt.title('Number of unique assembled scaffolds as a function of BLAST e-Value', fontsize=25)
plt.xticks(eval_ranges, fontsize=18)
plt.yticks(fontsize=18)
plt.ylabel('No. of unique scaffolds', fontsize=20)
plt.xlabel('BLAST e-Value', fontsize=20)
plt.grid(True)
plt.ylim((0,12000))
fig = matplotlib.pyplot.gcf()
fig.set_size_inches(18.5, 10.5)
fig.savefig('assembled_fn_evalue.pdf', dpi=300)
plt.show()


Retrieving sequences from dataframe list


In [44]:
# dataframe_names= [str(x)+"_cutoff_contigs" for x in eval_ranges]
# dataframe_names

In [ ]:


In [33]:
# # cutoff = eval_ranges[3]
# # cutoff_contigs = wps2_assembled_filtered_lst[3]
# cutoff_contigs={}
# # #cutoff_contigs
# for i in range(len(eval_ranges)):
#     cutoff_contigs[dataframe_names[i]]=wps2_assembled_filtered_lst[i]

In [34]:
len(cutoff_contigs['1e-50_cutoff_contigs'])


Out[34]:
15813

In [35]:
print list(cutoff_contigs['1e-50_cutoff_contigs'].columns.values)


['quid', 'suid', 'iden', 'alen', 'mism', 'gapo', 'qsta', 'qend', 'ssta', 'send', 'eval', 'bits']

Adding Seq_nt and Size entries


In [45]:
# from Bio import SeqIO
# assembled_contigs = "../IMG Data/76969.assembled.fna"
# handle = open(assembled_contigs, "rU")
# record_dict = SeqIO.to_dict(SeqIO.parse(handle,"fasta"))
# handle.close()
# rec = record_dict["Ga0073928_11111377"]
# rec

In [ ]:

Indexing approach

Less memory intensive, close index after it's done: record_dict.close()


In [47]:
# start = time.time()
# all_records = parse_contigs_ind(assembled_contigs)
# print time.time()-start

In [48]:
# all_records.close()

In [50]:
# all_records['Ga0073928_10022750']

In [ ]:


In [ ]:


In [93]:


In [49]:
# all_records=parse_contigs_ind(assembled_contigs)
# start = time.time()
# #wps2_eval_cutoff['Seq_nt']=retrive_sequence(contig_list, all_records)
# for dataframe in cutoff_contigs:
#     #cutoff_contigs[dataframe]=evalue_filter(cutoff_contigs[dataframe])
#     contig_list = cutoff_contigs[dataframe]['quid'].tolist()
#     cutoff_contigs[dataframe]['Seq_nt']=retrive_sequence(contig_list, all_records)
#     cutoff_contigs[dataframe]['Seq_size']=cutoff_contigs[dataframe]['Seq_nt'].apply(lambda x: len(x))
#     cutoff_contigs[dataframe]['Coverage']=cutoff_contigs[dataframe]['alen'].apply(lambda x: float(x))/cutoff_contigs[dataframe]['Seq_size']
#     cutoff_contigs[dataframe]['Metric']=cutoff_contigs[dataframe]['Coverage']*cutoff_contigs[dataframe]['iden']
#     cutoff_contigs[dataframe] = cutoff_contigs[dataframe][['quid', 'suid', 'iden', 'alen','Coverage','Metric', 'mism', 'gapo', 'qsta', 'qend', 'ssta', 'send', 'eval', 'bits','Seq_size', 'Seq_nt']]
# elapsed=time.time()-start
# print elapsed
# all_records.close()

In [51]:
# cutoff_contigs['1e-50_cutoff_contigs']

In [52]:
# cutoff_contigs['1e-50_cutoff_contigs'].loc[7444]['Seq_nt']

In [53]:
# !cutoff_contigs['1e-50_cutoff_contigs'].loc[7444]['Seq_nt'][:30] "../IMG Data/76969.assembled.fna"

In [54]:
# #verification by manual searching and extracting the sequence
# len(''.join("""CCTGTTAGTGCTTCGGCGCCCGATACAATCTCGAGCAGCTCCTTGGTTATCGCGGCTTGA
# CGCGCGTTGTTCATAGCGATCGTCAGTTCATCGATCAATTTGCCGGCGTTTTCGGTGGCG
# TTGTTCATTGCAAGCAGCTGAGCTGCGTAGAATGACGCATCGGTCTCCAGCATCGCCGAA
# AATAGTGTGAACTCCAGATATTTCGGCAAGAGTTTTGAAAGTACGAACTCCGGCGAAGGC
# ACGATTTCGACCGCGCCGCGCGGACCCGTGGATGGAGCCGCCGCGGTTTCATGCTCGATC
# GGGACAAGCTGCCGTAGCTCGGGTCGCTGCACCATCATCGAGACATGTTTGGACGAAACG
# AGGATGATGTCGCCGATCTCTCCGGCGGTGAAATCGGCGGTGACGCTCTGCGCGAGTTCG
# TGCGCGGTTTGCAGTTTGGACGGCGCACTCAGCGGCCAGCCCGGCCGATCTCCTAGGCCC
# CAGCGGCGCACAGCGTTGCGAGCTTTTATGCCGACGGTGTAAAAACGCGCGTCCGCGTGC
# TGACGCCAGAAGCGCTCGGCCTCGCGAATGACGTTGGAGTTGAATGCGCCGGCCAG""".split()))

In [55]:
# len(cutoff_contigs['1e-50_cutoff_contigs'].loc[7444]['Seq_nt'])

In [125]:
recruited_mg[0]


Out[125]:
quid suid iden alen Coverage Metric mism gapo qsta qend ssta send eval bits Seq_size Seq_nt
0 Ga0073928_10000001 2616646159 73.97 438 0.000371 0.027420 100 13 725591 726021 10816 11246 1.000000e-37 165.0 1181591 ACAGTACCTTCGTGATCGCCGCAGTCAGCGTCGTCTTGCCGTGGTC...
1 Ga0073928_10000002 2616653076 74.33 1005 0.001313 0.097569 233 23 110425 111418 475 1465 1.000000e-109 403.0 765630 CCAGTTCAATCCCGGTGCCCATCTGCTGGGCCAGCGCTTCGGCTGC...
2 Ga0073928_10000002 2617273876 74.33 1005 0.001313 0.097569 233 23 110425 111418 34447 35437 1.000000e-109 403.0 765630 CCAGTTCAATCCCGGTGCCCATCTGCTGGGCCAGCGCTTCGGCTGC...
3 Ga0073928_10000003 2616646130 74.74 1607 0.002156 0.161123 376 29 217242 218833 2554 963 0.000000e+00 691.0 745438 GAAGGGCGGGTACCACGAACGTCTGCGCCTGCGACGCTACGCTACC...
4 Ga0073928_10000004 2616646178 88.33 60 0.000083 0.007306 7 0 56696 56755 3397 3456 4.000000e-10 73.1 725374 CGGATTGTGTTTCGACAACACCTTGGTGATGGCCGCCGTCAACGTG...
5 Ga0073928_10000004 2616653415 92.86 42 0.000058 0.005377 3 0 231872 231913 1625 1666 9.000000e-07 62.1 725374 CGGATTGTGTTTCGACAACACCTTGGTGATGGCCGCCGTCAACGTG...
6 Ga0073928_10000005 2616646216 79.71 971 0.001377 0.109752 191 5 102846 103813 6097 7064 0.000000e+00 697.0 705209 CATCTCCAACCCCCACGCCGAGCGCAACGTCATCGCCTACGCCAAC...
7 Ga0073928_10000005 2616646158 79.20 971 0.001377 0.109050 180 16 102846 103805 4551 3592 0.000000e+00 654.0 705209 CATCTCCAACCCCCACGCCGAGCGCAACGTCATCGCCTACGCCAAC...
8 Ga0073928_10000005 2616653440 73.68 1083 0.001536 0.113151 249 33 615295 616359 5839 6903 3.000000e-105 388.0 705209 CATCTCCAACCCCCACGCCGAGCGCAACGTCATCGCCTACGCCAAC...
9 Ga0073928_10000005 2616646201 74.32 767 0.001088 0.080832 163 29 474629 475375 15357 14605 8.000000e-77 294.0 705209 CATCTCCAACCCCCACGCCGAGCGCAACGTCATCGCCTACGCCAAC...
10 Ga0073928_10000005 2617273795 73.04 805 0.001142 0.083376 179 32 474629 475411 2096 1308 2.000000e-63 250.0 705209 CATCTCCAACCCCCACGCCGAGCGCAACGTCATCGCCTACGCCAAC...
11 Ga0073928_10000005 2616653450 73.63 383 0.000543 0.039989 89 9 89657 90033 20975 21351 1.000000e-29 137.0 705209 CATCTCCAACCCCCACGCCGAGCGCAACGTCATCGCCTACGCCAAC...
12 Ga0073928_10000005 2616653019 72.37 456 0.000647 0.046796 110 15 474629 475073 451 1 2.000000e-27 130.0 705209 CATCTCCAACCCCCACGCCGAGCGCAACGTCATCGCCTACGCCAAC...
13 Ga0073928_10000005 2616652971 85.00 120 0.000170 0.014464 18 0 504155 504274 8289 8170 4.000000e-25 122.0 705209 CATCTCCAACCCCCACGCCGAGCGCAACGTCATCGCCTACGCCAAC...
14 Ga0073928_10000005 2616652971 73.65 277 0.000393 0.028929 71 2 140403 140678 7917 8192 4.000000e-20 106.0 705209 CATCTCCAACCCCCACGCCGAGCGCAACGTCATCGCCTACGCCAAC...
15 Ga0073928_10000005 2616646137 89.19 74 0.000105 0.009359 8 0 583008 583081 5043 5116 3.000000e-16 93.5 705209 CATCTCCAACCCCCACGCCGAGCGCAACGTCATCGCCTACGCCAAC...
16 Ga0073928_10000005 2616646196 71.56 327 0.000464 0.033182 91 2 140404 140729 791 1116 1.000000e-14 87.9 705209 CATCTCCAACCCCCACGCCGAGCGCAACGTCATCGCCTACGCCAAC...
17 Ga0073928_10000005 2616653050 85.14 74 0.000105 0.008934 11 0 583008 583081 3492 3565 3.000000e-11 76.8 705209 CATCTCCAACCCCCACGCCGAGCGCAACGTCATCGCCTACGCCAAC...
18 Ga0073928_10000005 2616653358 85.14 74 0.000105 0.008934 11 0 583008 583081 6475 6402 3.000000e-11 76.8 705209 CATCTCCAACCCCCACGCCGAGCGCAACGTCATCGCCTACGCCAAC...
19 Ga0073928_10000006 2616653426 75.29 255 0.000384 0.028892 59 4 261289 261541 8757 8505 5.000000e-24 119.0 664508 ATCAGGTAGCGTAGCGTCGCAGGCGCAGACGTTCGTGGTACCCGCC...
20 Ga0073928_10000006 2616653389 77.69 130 0.000196 0.015199 29 0 9617 9746 4874 4745 2.000000e-12 80.5 664508 ATCAGGTAGCGTAGCGTCGCAGGCGCAGACGTTCGTGGTACCCGCC...
21 Ga0073928_10000008 2616646197 94.74 57 0.000099 0.009368 3 0 322028 322084 1727 1671 3.000000e-15 89.8 576479 CCAGAGCGCAGCGTTCGGCACGCCGATGAAGTAGAGTCCGATCGCA...
22 Ga0073928_10000011 2616652943 76.17 1385 0.002671 0.203417 312 17 297567 298942 1973 598 0.000000e+00 713.0 518616 CAGCAGCTCGCTTTGGCATCACACCTCAGCTCATCGATCAATCCCT...
23 Ga0073928_10000011 2616653384 76.17 1385 0.002671 0.203417 312 17 297567 298942 9441 10816 0.000000e+00 713.0 518616 CAGCAGCTCGCTTTGGCATCACACCTCAGCTCATCGATCAATCCCT...
24 Ga0073928_10000015 2616653351 85.33 75 0.000185 0.015791 11 0 50595 50669 4391 4317 5.000000e-12 78.7 405279 TCGAACGACAGCGCAACCTCTGTGGCAACTAGATCTTACGGACTTA...
25 Ga0073928_10000015 2617273762 85.33 75 0.000185 0.015791 11 0 50595 50669 1802 1876 5.000000e-12 78.7 405279 TCGAACGACAGCGCAACCTCTGTGGCAACTAGATCTTACGGACTTA...
26 Ga0073928_10000016 2616652926 76.55 533 0.001328 0.101686 121 4 87414 87944 4804 4274 2.000000e-75 289.0 401248 ATTGTGCTGAACCTGTTCCACACAGCACGTAGAAGTGGCGAACATT...
27 Ga0073928_10000016 2616653369 76.55 533 0.001328 0.101686 121 4 87414 87944 4345 3815 2.000000e-75 289.0 401248 ATTGTGCTGAACCTGTTCCACACAGCACGTAGAAGTGGCGAACATT...
28 Ga0073928_10000016 2617273834 76.55 533 0.001328 0.101686 121 4 87414 87944 1422 1952 2.000000e-75 289.0 401248 ATTGTGCTGAACCTGTTCCACACAGCACGTAGAAGTGGCGAACATT...
29 Ga0073928_10000016 2616646139 78.66 239 0.000596 0.046853 45 6 22246 22481 1841 2076 8.000000e-35 154.0 401248 ATTGTGCTGAACCTGTTCCACACAGCACGTAGAAGTGGCGAACATT...
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
26334 Ga0073928_11966046 2616652961 93.42 76 0.372549 34.803529 5 0 52 127 10913 10988 6.000000e-26 113.0 204 AAATCTTTGACGCCTGCAGGGGGCTTTGCTATACTCGGCGAGTTCG...
26335 Ga0073928_11966046 2617273851 93.42 76 0.372549 34.803529 5 0 52 127 1552 1477 6.000000e-26 113.0 204 AAATCTTTGACGCCTGCAGGGGGCTTTGCTATACTCGGCGAGTTCG...
26336 Ga0073928_11966217 2616653061 84.26 197 0.970443 81.769557 31 0 1 197 4918 4722 7.000000e-50 193.0 203 GCCGGGGCTTCTTCTGCGGGTACCGTCATCATCGTCCCCGCCGAAA...
26337 Ga0073928_11966217 2616653379 84.26 197 0.970443 81.769557 31 0 1 197 5320 5516 7.000000e-50 193.0 203 GCCGGGGCTTCTTCTGCGGGTACCGTCATCATCGTCCCCGCCGAAA...
26338 Ga0073928_11966217 2616646202 84.26 197 0.970443 81.769557 31 0 1 197 740 936 7.000000e-50 193.0 203 GCCGGGGCTTCTTCTGCGGGTACCGTCATCATCGTCCCCGCCGAAA...
26339 Ga0073928_11966217 2617273829 84.26 197 0.970443 81.769557 31 0 1 197 21411 21215 7.000000e-50 193.0 203 GCCGGGGCTTCTTCTGCGGGTACCGTCATCATCGTCCCCGCCGAAA...
26340 Ga0073928_11966286 2616653061 97.55 204 1.009901 98.515842 3 2 1 202 5758 5961 1.000000e-96 348.0 202 ACGCAGATCAGCTACGCTGCGTTGAATACGTTCCCGGGCCTTGTAC...
26341 Ga0073928_11966286 2616653379 97.55 204 1.009901 98.515842 3 2 1 202 4480 4277 1.000000e-96 348.0 202 ACGCAGATCAGCTACGCTGCGTTGAATACGTTCCCGGGCCTTGTAC...
26342 Ga0073928_11966286 2617273829 97.55 204 1.009901 98.515842 3 2 1 202 22251 22454 1.000000e-96 348.0 202 ACGCAGATCAGCTACGCTGCGTTGAATACGTTCCCGGGCCTTGTAC...
26343 Ga0073928_11966425 2616646099 90.77 65 0.321782 29.208168 6 0 126 190 6132 6068 4.000000e-18 87.9 202 AAGACGACGTATAGGGTGTGACACGTGACCAATGCGGTAAGATTAC...
26344 Ga0073928_11966425 2616653379 90.77 65 0.321782 29.208168 6 0 126 190 1574 1510 4.000000e-18 87.9 202 AAGACGACGTATAGGGTGTGACACGTGACCAATGCGGTAAGATTAC...
26345 Ga0073928_11966425 2617273829 90.77 65 0.321782 29.208168 6 0 126 190 25157 25221 4.000000e-18 87.9 202 AAGACGACGTATAGGGTGTGACACGTGACCAATGCGGTAAGATTAC...
26346 Ga0073928_11966434 2616653061 86.73 196 0.970297 84.153861 25 1 1 196 5352 5158 4.000000e-57 217.0 202 GCTTGTGCGGGCCCCCGTCAATTCCTTTGAGTTTTAACCTTGCGGC...
26347 Ga0073928_11966434 2616653379 86.73 196 0.970297 84.153861 25 1 1 196 4886 5080 4.000000e-57 217.0 202 GCTTGTGCGGGCCCCCGTCAATTCCTTTGAGTTTTAACCTTGCGGC...
26348 Ga0073928_11966434 2616646202 86.73 196 0.970297 84.153861 25 1 1 196 306 500 4.000000e-57 217.0 202 GCTTGTGCGGGCCCCCGTCAATTCCTTTGAGTTTTAACCTTGCGGC...
26349 Ga0073928_11966434 2617273829 86.73 196 0.970297 84.153861 25 1 1 196 21845 21651 4.000000e-57 217.0 202 GCTTGTGCGGGCCCCCGTCAATTCCTTTGAGTTTTAACCTTGCGGC...
26350 Ga0073928_11966469 2616646099 80.92 152 0.752475 60.890297 27 2 45 195 6132 5982 1.000000e-27 119.0 202 GAAGCCCCGGTAAACGGCGGCCGTAACTATAACGGTCCTAAGGTAG...
26351 Ga0073928_11966469 2616653379 80.92 152 0.752475 60.890297 27 2 45 195 1574 1424 1.000000e-27 119.0 202 GAAGCCCCGGTAAACGGCGGCCGTAACTATAACGGTCCTAAGGTAG...
26352 Ga0073928_11966469 2617273829 80.39 153 0.757426 60.889455 27 3 45 195 25157 25308 6.000000e-26 113.0 202 GAAGCCCCGGTAAACGGCGGCCGTAACTATAACGGTCCTAAGGTAG...
26353 Ga0073928_11966884 2616646202 83.50 200 0.995025 83.084577 21 11 1 197 199 9 7.000000e-45 176.0 201 AGGCGCTGCATGGCTGTCGTCAGCTCGTGCCGTGAGGTGTTGGGTT...
26354 Ga0073928_11966884 2616653061 83.00 200 0.995025 82.587065 22 10 1 197 5459 5649 3.000000e-43 171.0 201 AGGCGCTGCATGGCTGTCGTCAGCTCGTGCCGTGAGGTGTTGGGTT...
26355 Ga0073928_11966884 2616653379 83.00 200 0.995025 82.587065 22 10 1 197 4779 4589 3.000000e-43 171.0 201 AGGCGCTGCATGGCTGTCGTCAGCTCGTGCCGTGAGGTGTTGGGTT...
26356 Ga0073928_11966884 2617273829 83.00 200 0.995025 82.587065 22 10 1 197 21952 22142 3.000000e-43 171.0 201 AGGCGCTGCATGGCTGTCGTCAGCTCGTGCCGTGAGGTGTTGGGTT...
26357 Ga0073928_11966918 2616646099 90.35 114 0.570000 51.499500 9 2 89 200 6402 6515 2.000000e-36 148.0 200 ACAAGTTCGCCGAGCAACTCGTTAAGACAGTCGAACGATCGTTACT...
26358 Ga0073928_11966918 2616646099 100.00 50 0.250000 25.000000 0 0 39 88 6083 6132 8.000000e-20 93.5 200 ACAAGTTCGCCGAGCAACTCGTTAAGACAGTCGAACGATCGTTACT...
26359 Ga0073928_11966918 2616653061 90.35 114 0.570000 51.499500 9 2 89 200 8392 8279 2.000000e-36 148.0 200 ACAAGTTCGCCGAGCAACTCGTTAAGACAGTCGAACGATCGTTACT...
26360 Ga0073928_11966918 2616653379 90.35 114 0.570000 51.499500 9 2 89 200 1846 1959 2.000000e-36 148.0 200 ACAAGTTCGCCGAGCAACTCGTTAAGACAGTCGAACGATCGTTACT...
26361 Ga0073928_11966918 2616653379 100.00 50 0.250000 25.000000 0 0 39 88 1525 1574 8.000000e-20 93.5 200 ACAAGTTCGCCGAGCAACTCGTTAAGACAGTCGAACGATCGTTACT...
26362 Ga0073928_11966918 2617273829 90.35 114 0.570000 51.499500 9 2 89 200 24885 24772 2.000000e-36 148.0 200 ACAAGTTCGCCGAGCAACTCGTTAAGACAGTCGAACGATCGTTACT...
26363 Ga0073928_11966918 2617273829 100.00 50 0.250000 25.000000 0 0 39 88 25206 25157 8.000000e-20 93.5 200 ACAAGTTCGCCGAGCAACTCGTTAAGACAGTCGAACGATCGTTACT...

26364 rows × 16 columns


In [56]:
# wps2_eval_cutoff['Seq_size']=wps2_eval_cutoff['Seq_nt'].apply(lambda x: len(x))
# wps2_eval_cutoff = wps2_eval_cutoff[['quid', 'suid', 'iden', 'alen', 'mism', 'gapo', 'qsta', 'qend', 'ssta', 'send', 'eval', 'bits','Seq_size', 'Seq_nt']]
# wps2_eval_cutoff

Filtering Step


In [136]:
for i in range(len(recruited_mg)):
    recruited_mg[i]=recruited_mg[i][(recruited_mg[i]['iden']>=iden)&(recruited_mg[i]['Metric']>=metric)&(recruited_mg[i]['eval']<=e_value)]

In [144]:
print  len(recruited_mg[0])
print len(recruited_mg[1])


1955
2637

Batch export to csv and multiple FASTA


In [152]:
#wps2_eval_cutoff.to_csv(path_or_buf="wps2_eval_cutoff.csv", sep='\t')
prefix="iteration1_"+str(version)+"_"+str(e_value)+"_recruited_mg"
for i in range(len(recruited_mg)):
    records= []
    outfile1 = prefix+str(i)+".csv"
#     try:
#         os.remove(outfile1)     
#     except OSError:
#         pass
    recruited_mg[i].to_csv(outfile1, sep='\t')
    ids = recruited_mg[i]['quid'].tolist()
    #if len(ids)==len(sequences):
    for j in range(len(ids)):
        records.append(all_records[i][ids[j]])
    outfile2 = prefix+str(i)+".fasta" 
#     try:
#         os.remove(outfile2)        
#     except OSError:
#         pass
    with open(outfile2, "w") as output_handle:
        SeqIO.write(records, output_handle, "fasta")
        

    #print prefix+dataframe_name

In [ ]:


In [ ]: