In [17]:
import os
We would need two reference genomes. One as a fasta file with each chromosome, and one that we will use exclusively for the mapping that would contain all contigs.
The use of contigs in the reference genome increases the mapping specificity.
In [11]:
species = 'Homo sapiens'
taxid = '9606'
genome = 'GRCh38.p10'
refseq, dir1, dir2, dir3 = 'GCF', '000', '001', '405'
genbank = 'GCF_000001405.36'
In [12]:
sumurl = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/{0}/{1}/{2}/{3}/{4}_{5}/{4}_{5}_assembly_report.txt'.format(
refseq, dir1, dir2, dir3, genbank, genome)
crmurl = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=%s&rettype=fasta&retmode=text'
Download from the NCBI the list of chromosome/contigs
In [13]:
! wget -q $sumurl -O chromosome_list.txt
In [14]:
! head chromosome_list.txt
In [15]:
dirname = 'genome/'
! mkdir -p $dirname
For each contig/chromosome download the corresponding FASTA file from NCBI
In [ ]:
contig = []
for line in open('chromosome_list.txt'):
if line.startswith('#'):
continue
seq_name, seq_role, assigned_molecule, _, genbank, _, refseq, _ = line.split(None, 7)
if seq_role == 'assembled-molecule':
name = 'chr%s.fasta' % assigned_molecule
else:
name = 'chr%s_%s.fasta' % (assigned_molecule, seq_name.replace('/', '-'))
contig.append(name)
outfile = os.path.join(dirname, name)
if os.path.exists(outfile) and os.path.getsize(outfile) > 10:
continue
error_code = os.system('wget "%s" --no-check-certificate -O %s' % (crmurl % (genbank), outfile))
if error_code:
error_code = os.system('wget "%s" --no-check-certificate -O %s' % (crmurl % (refseq), outfile))
if error_code:
print genbank
Concatenate all contigs/chromosomes into a single file
In [ ]:
contig_file = open('genome/Homo_sapiens_contigs.fa','w')
for molecule in contig:
for line in open('genome/' + molecule):
# replace the header of the sequence in the fasta file
if line == '\n':
continue
if line.startswith('>'):
line = '>' + molecule[3:].replace('.fasta', '') + '\n'
contig_file.write(line)
contig_file.close()
Remove all the other files (with single chromosome/contig)
In [ ]:
! rm -f genome/*.fasta
In [ ]:
! gem-indexer -t 8 -i genome/Homo_sapiens_contigs.fa -o genome/Homo_sapiens_contigs
The index file will be: genome/Homo_sapiens_contigs.gem
WARNING: in more recent versions of GEM the "-t" flag should be "-T"