In [ ]:
%%html
<link rel='stylesheet' type='text/css' href='custom.css'/>
In [ ]:
!rm data/converted-seqs.fasta data/converted-seqs.qual data/not-yasf.fna
In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):
plt.figure()
plt.imshow(cm, interpolation='nearest', cmap=cmap)
plt.title(title)
plt.colorbar()
plt.ylabel('Known taxonomy')
plt.xlabel('Predicted taxonomy')
plt.tight_layout()
plt.show()
Jai Rideout and Evan Bolyen
Caporaso Lab, Northern Arizona University
A Python bioinformatics library for:
"The first step in developing a new genetic analysis algorithm is to decide how to make the input data file format different from all pre-existing analysis data file formats." - Law's First Law
Axt BAM SAM BED bedGraph bigBed bigGenePred table bigWig Chain GenePred table GFF GTF HAL MAF Microarray Net Personal Genome SNP format PSL VCF WIG abi ace clustal embl fasta fastq genbank ig imgt nexus phred phylip pir seqxml sff stockholm swiss tab qual uniprot-xml emboss PhyolXML NexML newick CDAO MDL bcf caf gcproj scf SBML lsmat ordination qseq BIOM ASN.1 .2bit .nib ENCODE ...
Axt BAM SAM BED bedGraph bigBed bigGenePred table bigWig Chain GenePred table GFF GTF HAL MAF Microarray Net Personal Genome SNP format PSL VCF WIG abi ace clustal embl fasta fastq genbank ig imgt nexus phred phylip pir seqxml sff stockholm swiss tab qual uniprot-xml emboss PhyolXML NexML newick CDAO MDL bcf caf gcproj scf SBML lsmat ordination qseq BIOM ASN.1 .2bit .nib ENCODE ... </span>
In [ ]:
from skbio import DNA
seq1 = DNA.read('data/seqs.fasta', qual='data/seqs.qual')
seq2 = DNA.read('data/seqs.fastq', variant='illumina1.8')
seq1
In [ ]:
seq1 == seq2
In [ ]:
import skbio.io
skbio.io.sniff('data/mystery_file.gz')
In [ ]:
from skbio import TreeNode
tree1 = skbio.io.read('http://localhost:8888/files/data/newick.gz',
into=TreeNode)
print(tree1.ascii_art())
In [ ]:
import io
with io.open('data/newick.bz2', mode='rb') as open_filehandle:
tree2 = skbio.io.read(open_filehandle, into=TreeNode)
print(tree2.ascii_art())
In [ ]:
tree3 = skbio.io.read(['((a, b, c), d:15):0;'], into=TreeNode)
print(tree3.ascii_art())
In [ ]:
!cat data/yasf-seq.yml
In [ ]:
import yaml
yasf = skbio.io.create_format('yasf')
@yasf.sniffer()
def yasf_sniffer(fh):
return fh.readline().rstrip() == "#YASF", {}
@yasf.reader(DNA)
def yasf_to_dna(fh):
seq = yaml.load(fh.read())
return DNA(seq['Sequence'], metadata={
'id': seq['ID'],
'location': seq['Location'],
'description': seq['Description']
})
In [ ]:
seq = DNA.read("data/yasf-seq.yml")
seq
In [ ]:
seq.write("data/not-yasf.fna", format='fasta')
!cat data/not-yasf.fna
In [ ]:
from skbio.util._decorator import stable
@stable(as_of='0.4.0')
def add(a, b):
"""add two numbers.
Parameters
----------
a, b : int
Numbers to add.
Returns
-------
int
Sum of `a` and `b`.
"""
return a + b
In [ ]:
help(add)
In [ ]:
seq = DNA("AacgtGTggA", lowercase='exon')
seq
In [ ]:
seq.values
In [ ]:
seq.positional_metadata
In [ ]:
seq[seq.positional_metadata['exon']]
In [ ]:
aligned_seqs_fp = 'data/gg_13_8_otus/rep_set_aligned/82_otus.fasta'
taxonomy_fp = 'data/gg_13_8_otus/taxonomy/82_otu_taxonomy.txt'
In [ ]:
from skbio import DNA
fwd_primer = DNA("GTGCCAGCMGCCGCGGTAA",
metadata={'label':'fwd-primer'})
rev_primer = DNA("GGACTACHVGGGTWTCTAAT",
metadata={'label':'rev-primer'}).reverse_complement()
In [ ]:
def seq_to_regex(seq):
result = []
for base in str(seq):
if base in DNA.degenerate_chars:
result.append('[{0}]'.format(
''.join(DNA.degenerate_map[base])))
else:
result.append(base)
return ''.join(result)
regex = '({0}.*{1})'.format(seq_to_regex(fwd_primer),
seq_to_regex(rev_primer))
In [ ]:
import numpy as np
import skbio
starts = []
stops = []
for seq in skbio.io.read(aligned_seqs_fp, format='fasta',
constructor=DNA):
for match in seq.find_with_regex(regex, ignore=seq.gaps()):
starts.append(match.start)
stops.append(match.stop)
locus = slice(int(np.median(starts)), int(np.median(stops)))
locus
In [ ]:
kmer_counts = []
seq_ids = []
for seq in skbio.io.read(aligned_seqs_fp, format='fasta',
constructor=DNA):
seq_ids.append(seq.metadata['id'])
sliced_seq = seq[locus].degap()
kmer_counts.append(sliced_seq.kmer_frequencies(8))
In [ ]:
from sklearn.feature_extraction import DictVectorizer
X = DictVectorizer().fit_transform(kmer_counts)
In [ ]:
taxonomy_level = 3 # class
id_to_taxon = {}
with open(taxonomy_fp) as f:
for line in f:
id_, taxon = line.strip().split('\t')
id_to_taxon[id_] = '; '.join(taxon.split('; ')[:taxonomy_level])
y = [id_to_taxon[seq_id] for seq_id in seq_ids]
In [ ]:
from sklearn.feature_selection import SelectPercentile
X = SelectPercentile().fit_transform(X, y)
In [ ]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,
random_state=0)
In [ ]:
from sklearn.svm import SVC
y_pred = SVC(C=10, kernel='linear', degree=3,
gamma=0.001).fit(X_train, y_train).predict(X_test)
In [ ]:
from sklearn.metrics import confusion_matrix, f1_score
cm = confusion_matrix(y_test, y_pred)
cm_normalized = cm / cm.sum(axis=1)[:, np.newaxis]
plot_confusion_matrix(cm_normalized, title='Normalized confusion matrix')
print("F-score: %1.3f" % f1_score(y_test, y_pred, average='micro'))
The Caporaso Lab is hiring postdocs and developers, find us if you want to get paid to work on scikit-bio!
We're having a sprint on Saturday and Sunday!