In [1]:
!pip install biopython
In [2]:
import requests
# Download the sample GenBank file and save it in the working directory under
# the last path component of the URL (here: casein.gb).
url = 'https://raw.githubusercontent.com/Serulab/Py4Bio/master/samples/casein.gb'
input_file_name = url.split('/')[-1]
# A timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of silently saving an error page as
# the GenBank file and breaking the parsing cells further down.
response.raise_for_status()
# The context manager flushes and closes the file deterministically; the
# original bare open(...).write(...) left the handle open.
with open(input_file_name, 'wb') as downloaded_file:
    downloaded_file.write(response.content)
Out[2]:
In [0]:
from Bio import SeqIO
# Convert every record of the downloaded GenBank file into casein.fasta.
# The parser is handed straight to SeqIO.write, so the GenBank handle must
# still be open while the FASTA file is written; hence the nested blocks.
with open(input_file_name) as genbank_handle:
    with open('casein.fasta', 'w') as fasta_handle:
        SeqIO.write(SeqIO.parse(genbank_handle, 'genbank'), fasta_handle, 'fasta')
In [0]:
# Display the FASTA file produced by the conversion cell.
with open("casein.fasta") as fasta_handle:
    fasta_text = fasta_handle.read()
print(fasta_text)
In [0]:
# Split casein.fasta (generated by the conversion cell) into one FASTA file
# per record; each output file is named after the record's `name` attribute.
with open("casein.fasta") as fasta_handle:
    for record in SeqIO.parse(fasta_handle, 'fasta'):
        SeqIO.write(record, record.name, "fasta")
In [16]:
# List the working directory; the splitting cell above writes one file per
# record here, so the new files should show up in this listing.
!ls
In [17]:
# Print the file FJ429671.1 - presumably one of the per-record FASTA files
# written by the splitting cell (files are named after each record's `name`).
# NOTE(review): confirm this ID is present in casein.fasta.
!cat FJ429671.1
In [19]:
# Load every record of casein.fasta into a dictionary so that single records
# can be looked up directly by key.
record_dict = SeqIO.to_dict(
    SeqIO.parse("casein.fasta", "fasta")
)
# Any key present in record_dict can be used in place of this one.
print(record_dict["FJ429671.1"])
In [26]:
# Using the casein.fasta file generated by the GenBank-to-FASTA conversion cell.
# Keep only the records whose sequence is strictly longer than MIN_SEQ_LENGTH
# and report how many records were read and how many passed the filter.
MIN_SEQ_LENGTH = 400
sequences = SeqIO.parse("casein.fasta", 'fasta')
filtered = []
# Initialise the counter so the summary still prints (as 0) for an empty file
# instead of raising NameError on the never-assigned loop variable.
total = 0
# Count from 1: with enumerate's default start of 0, `total` ended up as the
# index of the last record, i.e. one less than the number of records read.
for total, sequence in enumerate(sequences, start=1):
    if len(sequence.seq) > MIN_SEQ_LENGTH:
        filtered.append(sequence)
print("total sequences: {}".format(total))
print("after filter sequences: {}".format(len(filtered)))