In [1]:
import graphlab
import pandas
from Bio import SeqIO
In [2]:
sf=graphlab.SFrame('people-example.csv')
table1=graphlab.SFrame('table1.tsv')
hand_nt=open('1648.160_nt.fna', 'r')
hand_aa=open('1648.160.faa', 'r')
table1_nt=hand_nt.read()
table1_aa=hand_aa.read()
hand_nt.close()
hand_aa.close()
In [12]:
def returnLen(text):
return len(text)
def gc(dna):
"This function computes the GC ratio of a dna sequence"
nbases=dna.count('n')+dna.count('N')
gc_cont=float(dna.count('c')+dna.count('C')+dna.count('g')+dna.count('G'))/(len(dna)-nbases)
return gc_cont
In [30]:
sf #we can view first few lines of the table
#table1_aa
#table1_nt
lst_aa=table1_aa.split('>')
lst_nt=table1_nt.split('>')
lst_nt=lst_nt[1:]#first (No. 0) list item is empty
#dict_nt
lst_aa=lst_aa[1:]# the same for amino acids
In [3]:
SeqIO.convert("1648.160.gbk", "genbank", "1648.160_To_FastA.fna", "fasta") #output is not so friendly
Out[3]:
In [7]:
#hand_gb=open('1648.160.gbk','r')
#out_file = "1648.160_To_FastA_parced.fna"
#hand_out = open(out_file, "w")
#for seq_record in SeqIO.parse(hand_gb, "genbank") :
# print "Dealing with GenBank record %s" % seq_record.id
# hand_out.write(">%s %s\n%s\n" % (
# seq_record.id,
# seq_record.description,
# seq_record.seq))
#hand_out.close()
#hand_gb.close()
In [2]:
Banks969_Li_SFrame=graphlab.SFrame('6666666.156894.tsv')
Banks969_Li_SFrame
Banks969_Li_SFrame.show()
In [23]:
Banks969_Li_SFrame["Calculated_length_nt"]=Banks969_Li_SFrame['nucleotide_sequence'].apply(returnLen)
Banks969_Li_SFrame["Gene_GC"]=Banks969_Li_SFrame['nucleotide_sequence'].apply(gc)
Banks969_Li_SFrame["Calculated_length_aa"]=Banks969_Li_SFrame['aa_sequence'].apply(returnLen)
Banks969_Li_SFrame
Out[23]:
In [25]:
rna_Banks969_Li_SFrame=Banks969_Li_SFrame.filter_by('rna','type')
rna_Banks969_Li_SFrame
Out[25]:
In [26]:
coding_Banks969_Li_SFrame=Banks969_Li_SFrame.filter_by('peg','type')
coding_Banks969_Li_SFrame.show()
In [30]:
gene_Banks969_Li_gc_SFrame=coding_Banks969_Li_SFrame.select_columns(['feature_id','Calculated_length_nt','Gene_GC'])
gene_Banks969_Li_gc_SFrame
Out[30]:
In [31]:
gene_Banks969_Li_gc_SFrame.export_csv('gene_Banks969_Li_gc.tsv','\t')
In [33]:
gene_Banks969_Li_gc_SFrame[0]
Out[33]:
Forming a record of the following pattern: ">ID| function| [Strain]
In [56]:
coding_Banks969_Li_aa_dict={}
for item in coding_Banks969_Li_SFrame:
coding_Banks969_Li_aa_dict['>'+item['feature_id']+'| '+item['function']+'| '+'[Erysipelothrix rhusiopathiae Banks969-Li]']=item['aa_sequence']
#print item['function']
for i in range(len(coding_Banks969_Li_aa_dict)):
print coding_Banks969_Li_aa_dict[][i]
In [60]:
out_file='Banks969_Li_annotated.faa'
hand_out=open(out_file,'w')
for item in coding_Banks969_Li_aa_dict:
hand_out.write(item+'\n'+coding_Banks969_Li_aa_dict[item])
hand_out.close()
print len(coding_Banks969_Li_aa_dict), type(coding_Banks969_Li_aa_dict)
In [61]:
out_file='Banks969_Li_annotated_1.faa'
hand_out=open(out_file,'w')
coding_Banks969_Li_aa_headers=coding_Banks969_Li_aa_dict.keys()
coding_Banks969_Li_aa_sequences=coding_Banks969_Li_aa_dict.values()
for i in range(len(coding_Banks969_Li_aa_dict)):
hand_out.write(coding_Banks969_Li_aa_headers[i]+'\n'+coding_Banks969_Li_aa_sequences[i])
hand_out.close()
print len(coding_Banks969_Li_aa_dict), type(coding_Banks969_Li_aa_dict)
print len(coding_Banks969_Li_aa_headers), type(coding_Banks969_Li_aa_headers)
print len(coding_Banks969_Li_aa_sequences), type(coding_Banks969_Li_aa_sequences)
In [65]:
out_file='Banks969_Li_annotated_optimized.faa'
hand_out=open(out_file,'w')
coding_Banks969_Li_aa_headers=coding_Banks969_Li_aa_dict.keys()
coding_Banks969_Li_aa_sequences=coding_Banks969_Li_aa_dict.values()
for i in range(len(coding_Banks969_Li_aa_dict)):
if (i<(len(coding_Banks969_Li_aa_dict)-1)):
hand_out.write(coding_Banks969_Li_aa_headers[i]+'\n'+coding_Banks969_Li_aa_sequences[i]+'\n')
else:
hand_out.write(coding_Banks969_Li_aa_headers[i]+'\n'+coding_Banks969_Li_aa_sequences[i])
hand_out.close()
print len(coding_Banks969_Li_aa_dict), type(coding_Banks969_Li_aa_dict)
print len(coding_Banks969_Li_aa_headers), type(coding_Banks969_Li_aa_headers)
print len(coding_Banks969_Li_aa_sequences), type(coding_Banks969_Li_aa_sequences)
In [ ]:
In [31]:
dict_nt={}
for item in lst_nt:
lst_item=item.split('\n')
dict_nt[lst_item[0].strip()]=''.join(lst_item[1:]).strip()
#dict_nt ##looks ok but too large to display
In [33]:
dict_aa={}
for item in lst_aa:
lst_item=item.split('\n')
dict_aa[lst_item[0].strip()]=''.join(lst_item[1:]).strip()
#dict_aa ###too large to display
In [13]:
#table1_addedColumn=table1
#table1_addedColumn['Seq_nt']=table1_addedColumn['NCBI GI']#just creating an empty column
#indexed_table1=table1.to_dataframe()#converting to Dataframe and indexing
#indexed_table1.set_index('Feature ID')
In [48]:
sf=graphlab.SFrame({'Feature ID':dict_nt.keys(),'Seq_nt':dict_nt.values()})
#sf['Feature ID']
aa_SFrame=graphlab.SFrame({'Feature ID':dict_aa.keys(),'Seq_aa':dict_aa.values()})
aa_SFrame
Out[48]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [49]:
table1_merge=table1
#table1_merge['Feature ID']
In [50]:
sf
Out[50]:
In [51]:
mergedTbl1=table1.join(sf, how='left')
mergedTbl1.show()
In [52]:
In [53]:
mergedTbl1["Calculated_length_nt"]=mergedTbl1['Seq_nt'].apply(returnLen)
mergedTbl1
Out[53]:
In [ ]:
In [54]:
mergedTbl1["Gene_GC"]=mergedTbl1['Seq_nt'].apply(gc)
mergedTbl1
Out[54]:
In [55]:
mergedTbl1=mergedTbl1.join(aa_SFrame, how='left')
mergedTbl1 #OK
Out[55]:
In [56]:
mergedTbl1["Calculated_length_aa"]=mergedTbl1['Seq_aa'].apply(returnLen)
mergedTbl1
Out[56]:
In [ ]:
In [ ]:
In [58]:
graphlab.canvas.set_target('ipynb')
mergedTbl1['Gene_GC'].show()
In [ ]:
In [59]:
from Bio import SeqIO
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [5]:
sf.head()
Out[5]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [6]:
sf.tail()
Out[6]:
In [7]:
#Any data structure in GraphLab Create
sf.show()
table1.show()
In [8]:
graphlab.canvas.set_target('ipynb')
In [11]:
sf['age'].show(view="Categorical")
In [9]:
sf["Country"]
table1["Length (bp)"]
Out[9]:
In [13]:
sf['age']
Out[13]:
In [14]:
sf['age'].mean()
Out[14]:
In [15]:
sf['age'].max()
Out[15]:
In [10]:
sf["Full Name"]=sf['First Name']+' '+sf['Last Name']
In [18]:
sf
Out[18]:
In [20]:
sf['age']+2
Out[20]:
In [21]:
sf['age']=sf['age']+2
In [22]:
sf
Out[22]:
In [23]:
sfpp['Country']
In [24]:
sf['Country']
Out[24]:
In [25]:
sf['Country'].show()
In [26]:
def transform_country(country):
if country == 'USA':
return 'United States'
else:
return country
In [27]:
transform_country('Brazil')
Out[27]:
In [28]:
transform_country('USA')
Out[28]:
In [29]:
sf['Country'].apply(transform.country)
In [30]:
sf['Country'].apply(transform_country)
Out[30]:
In [31]:
sf['Country']=sf['Country'].apply(transform_country)
In [32]:
sf
Out[32]:
In [ ]: