In [1]:
from __future__ import division, print_function
from IPython.core import page
page.page = print
import warnings
warnings.simplefilter('always')
<img src="skbio-docs.png" style="width: 600px;"/ align="left">
Bioinformatics, as I see it, is the application of the tools of computer science (things like programming languages, algorithms, and databases) to address biological problems (for example, inferring the evolutionary relationship between a group of organisms based on fragments of their genomes, or understanding if or how the community of microorganisms that live in my gut changes if I modify my diet). Bioinformatics is a rapidly growing field, largely in response to the vast increase in the quantity of data that biologists now grapple with. Students from varied disciplines (e.g., biology, computer science, statistics, and biochemistry) and stages of their educational careers (undergraduate, graduate, or postdoctoral) are becoming interested in bioinformatics.
I teach bioinformatics at the undergraduate and graduate levels at Northern Arizona University. This repository contains some of the materials that I've developed in these courses, and represents an initial attempt to organize these materials in a standalone way. If you'd like to read a little more about the project, see my blog post on microbe.net.
This project is in very early development stage. It's not ready for prime-time by any means, but I fall firmly into the "publish early, publish often" mindset, hence its public availability. I am very interested in feedback in the form of email (gregcaporaso@gmail.com) or pull requests.
The code in the iab module is not sufficiently tested, documented, or optimized for production use. As code reaches those quality standards it will be ported to scikit-bio. I do not recommend using the code in the iab module outside of these notebooks. In other words, don't import iab outside of the notebooks - if you want access to the functionality in your own code, you should import skbio.
Currently, the best example of where I'm hoping to go with these materials is the multiple sequence alignment chapter.
In [2]:
from skbio.core.alignment.pairwise import local_pairwise_align_nucleotide
local_pairwise_align_nucleotide?
In [3]:
from skbio.core.alignment.pairwise import local_pairwise_align_nucleotide
s1 = "ACTAAGGCTCTCTACCCCTCTCAGAGA"
s2 = "AAAAAACTCTCTAAACTCACTAAGGCTCTCTACCCCTCTTCAGAGAAGTCGA"
r = local_pairwise_align_nucleotide(s1, s2)
print(type(r))
print(r)
In [4]:
from skbio import DNA
s1 = DNA("ACTAAGGCTCTCTACCCCTCTCAGAGA", "query")
s2 = DNA("AAAAAACTCTCTAAACTCACTAAGGCTCTCTACCCCTCTTCAGAGAAGTCGA", "target")
r = local_pairwise_align_nucleotide(s1, s2)
print(type(r))
print(r)
In [5]:
from skbio.core.alignment import local_pairwise_align_ssw
local_pairwise_align_ssw?
In [6]:
r = local_pairwise_align_ssw(s1, s2)
print(type(r))
print(r)
In [7]:
from skbio.parse.sequences import parse_fasta
from skbio import SequenceCollection
from random import choice
gg_path = "/Users/caporaso/data/gg_13_8_otus/rep_set/73_otus.fasta"
s = SequenceCollection.from_fasta_records([(i, s) for i, s in parse_fasta(gg_path) if set(s) == set('ACGT')], DNA)
In [8]:
%timeit local_pairwise_align_ssw(choice(s), choice(s), gap_open_penalty=5,\
gap_extend_penalty=2, match_score=2, mismatch_score=-3)
In [9]:
warnings.simplefilter('ignore')
%timeit local_pairwise_align_nucleotide(choice(s), choice(s), gap_open_penalty=5,\
gap_extend_penalty=2, match_score=2, mismatch_score=-3)
In [10]:
import pandas as pd
sample_md = {
'A': {'body_site': 'gut', 'subject': '1'},
'B': {'body_site': 'skin', 'subject': '1'},
'C': {'body_site': 'tongue', 'subject': '1'},
'D': {'body_site': 'gut', 'subject': '2'},
'E': {'body_site': 'tongue', 'subject': '2'},
'F': {'body_site': 'skin', 'subject': '2'}}
sample_md = pd.DataFrame.from_dict(sample_md, orient='index')
sample_md
Out[10]:
In [11]:
data = [[23, 64, 14, 0, 0, 3, 1],
[0, 3, 35, 42, 0, 12, 1],
[0, 5, 5, 0, 40, 40, 0],
[44, 35, 9, 0, 1, 0, 0],
[0, 2, 8, 0, 35, 45, 1],
[0, 0, 25, 35, 0, 19, 0]]
table = pd.DataFrame(data,
columns=['Species 1', 'Species 2', 'Species 3', 'Species 4', 'Species 5', 'Species 6', 'Species 7'],
index=list('ABCDEF'))
table
Out[11]:
In [12]:
from skbio.math.diversity.beta import pw_distances
bc_dm = pw_distances(table, table.index, "braycurtis")
print(bc_dm)
In [13]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
def scatter_3d(ord_results, df, column, color_map, title='', axis1=0,
axis2=1, axis3=2):
"""Adapted from Matplotlib Gallery:
http://matplotlib.org/examples/mplot3d/scatter3d_demo.html
"""
coord_matrix = ord_results.site.T
ids = ord_results.site_ids
colors = [color_map[df[column][id_]] for id_ in ord_results.site_ids]
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
xs = coord_matrix[axis1]
ys = coord_matrix[axis2]
zs = coord_matrix[axis3]
plot = ax.scatter(xs, ys, zs, c=colors, s=150)
ax.set_xlabel('PC %d' % (axis1 + 1))
ax.set_ylabel('PC %d' % (axis2 + 1))
ax.set_zlabel('PC %d' % (axis3 + 1))
ax.set_xticklabels([])
ax.set_yticklabels([])
ax.set_zticklabels([])
ax.set_title(title)
return fig
In [14]:
from skbio.math.stats.ordination import PCoA
bc_pc = PCoA(bc_dm).scores()
In [15]:
# This function adapted from Matplotlib Gallery's:
# http://matplotlib.org/examples/mplot3d/scatter3d_demo.html
fig = scatter_3d(bc_pc, sample_md, 'subject', {'1': 'yellow', '2': 'purple'},
'Samples colored by subject')
In [16]:
# This function adapted from Matplotlib Gallery's:
# http://matplotlib.org/examples/mplot3d/scatter3d_demo.html
fig = scatter_3d(bc_pc, sample_md, 'body_site',
{'gut': 'b', 'skin': 'r', 'tongue': 'g'},
'Samples colored by body site')
In [17]:
from skbio.math.stats.distance import ANOSIM
anosim = ANOSIM(bc_dm, sample_md, column='subject')
results = anosim(999)
print(results.statistic)
print(results.p_value < 0.05)
In [18]:
anosim = ANOSIM(bc_dm, sample_md, column='body_site')
results = anosim(999)
print(results.statistic)
print(results.p_value < 0.1)
(name of language,
spelling of phrase for ordering a beer,
phonetic phrase for ordering a beer)
[Source]
In [19]:
languages = [("Afrikaans", "'n Bier, asseblief", "A beer ah-suh-bleef"),
("Basque", "Garagardo bat, mesedez", "Gara-gardo bat mese-des"),
("Breton", "Ur banne bier am bo, mar plij", "Oor bah-ne beer am boh mar pleezh"),
("Catalan", "Una cervesa, si us plau", "Oona servayzeh see oos plow"),
("Croatian", "Jedno pivo, molim", "Yed-no pee-vo, mo-lim"),
("Czech / Slovak", "Pivo, prosím", "Pee-vo, pro-seem"),
("Danish", "Jeg vil gerne have en øl", "Yay vil geh-neh heh en url"),
("Dutch", "Een bier, alsjeblieft", "Un beer, ahls-yer-bleeft"),
("English", "One beer, please", "Wun beer, pleez"),
("Esperanto", "Unu bieron, mi petas", "Oo-noo bee-airon, mee peh-tahs"),
("Estonian", "Üks õlu, palun", "Ooks ur-loo, pah-lun"),
("Finnish", "Olut mulle, kiitos", "O-loot moolek kee-tos"),
("French", "Une bière, s'il vous plaît", "Oon bee-air, seel voo pleh"),
("German", "Ein Bier, bitte", "Ine beer, bitt-uh"),
("Hungarian", "Egy pohár sört kérek", "Edj pohar shurt kayrek"),
("Icelandic", "Einn bjór, takk", "Ay-dn byohr tahk"),
("Irish", "Beoir amháin, le do thoil", "Byohr awoyn, lyeh doh hull"),
("Italian", "Una birra, per favore", "Oo-na beer-ra, pair fa-vo-re"),
("Latin", "Cervisiam, sodes", "Ker-wi-see-am, soh-dehs"),
("Latvian", "Vienu alu, lū-dzu", "Vyeh-noo ah-loo, loo dzoo"),
("Lithuanian", "Prašau viena alaus", "Pra-shau vie-na al-lows"),
("Maltese", "Wiehed birra, jekk jghogbok", "Wee-het bir-ra yek yoh-dzbok"),
("Norwegian", "En øl, takk", "Ehn url tahk"),
("Occitan", "Una cervesa, se vos plai", "Oo-no serbeh-zo se bus ply"),
("Polish", "Jedno piwo, proszę", "Yed-no peevo proshe"),
("Portuguese", "Uma cerveja, por favor", "Oo-ma ser-vay-ja, poor fa-vohr"),
("Romansch Ladina", "Üna biera, per plaschair.", "Oo-nuh bee-air-uh per plah-chair"),
("Sardinian", "Una birra, po piaghere", "Oo-na beer-ra po pia-gehre"),
("Scots Gaelic", "Leann, mas e do thoil e", "Lyawn mahs eh doh hawl eh"),
("Slovene", "Eno pivo, prosim", "Eno pee-vo pro-seem"),
("Spanish (Lat. Am.)", "Una cerveza, por favor", "Oo-na ser-veh-sa, por fa-vor"),
("Spanish (Spain)", "Una cerveza, por favor", "Oo-na thair-veh-tha, por fa-vor"),
("Strine", "Foster's, mate", "Faw-stuhz, mayt"),
("Swedish", "En öl, tack", "Ehn irl, tahk"),
("Twi", "Mame beer baako, mi pawokyew", "Mah-me bee-ye bah-ko mee pow-che-oo"),
("Turkish", "Bir bira, lütfen", "Beer beer-ah luht-fen"),
("Welsh", "Cwrw os gwelwch in dda", "Koo-roh ohs gwel-ookh-un-thah")]
In [20]:
language_to_pron = {e[0]: e[2] for e in languages}
all_pron_chars = []
for e in language_to_pron.values():
all_pron_chars.extend(e)
all_pron_chars = set(all_pron_chars)
pron_substitution_matrix = {}
for c in all_pron_chars:
row = {}.fromkeys(all_pron_chars, -2.0)
row[c] = 5.0
pron_substitution_matrix[c] = row
In [21]:
from skbio.core.alignment.pairwise import global_pairwise_align
alignment = global_pairwise_align(language_to_pron["Swedish"],
language_to_pron["Norwegian"],
gap_open_penalty=5, gap_extend_penalty=2,
substitution_matrix=pron_substitution_matrix)
print(alignment.to_fasta())
print("Hamming distance: %1.3f" % alignment.distances()[0,1])
In [22]:
alignment = global_pairwise_align(language_to_pron["Swedish"],
language_to_pron["Icelandic"],
gap_open_penalty=5, gap_extend_penalty=2,
substitution_matrix=pron_substitution_matrix)
print(alignment.to_fasta())
print("Hamming distance: %1.3f" % alignment.distances()[0,1])
In [23]:
alignment = global_pairwise_align(language_to_pron["Spanish (Spain)"],
language_to_pron["Italian"],
gap_open_penalty=5, gap_extend_penalty=2,
substitution_matrix=pron_substitution_matrix)
print(alignment.to_fasta())
print("Hamming distance: %1.3f" % alignment.distances()[0,1])
In [24]:
warnings.simplefilter('ignore')
from skbio.core.alignment.pairwise import global_pairwise_align
from skbio import DistanceMatrix
languages = language_to_pron.keys()
distances = np.zeros((len(languages), len(languages)))
for i, language1 in enumerate(languages):
language1_phrase = language_to_pron[language1]
for j in range(i):
language2 = languages[j]
language2_phrase = language_to_pron[language2]
alignment = global_pairwise_align(language1_phrase, language2_phrase,
gap_open_penalty=5, gap_extend_penalty=2,
substitution_matrix=pron_substitution_matrix)
distances[i, j] = distances[j, i] = alignment.distances()[0,1]
dm = DistanceMatrix(distances, languages)
print(dm)
In [25]:
from scipy.cluster.hierarchy import average, dendrogram, to_tree
lm = average(dm.condensed_form())
def format_dendrogram(tip_count):
import matplotlib.pylab as plt
ax = plt.gca()
fig = plt.gcf()
height = tip_count * 0.4
if height < 3:
height = 3
fig.set_size_inches(7, height)
font = {'family' : 'normal',
'weight' : 'normal',
'size' : 18}
matplotlib.rc('font', **font)
return ax
format_dendrogram(dm.shape[0])
d = dendrogram(lm, labels=dm.ids, orientation='right',
link_color_func=lambda x: 'black')
Adam Robbins-Pianka (@adamrp) | Antonio Gonzalez (@antgonza) | Daniel McDonald (@wasade) | Evan Bolyen (@ebolyen) | Greg Caporaso (@gregcaporaso) | Jai Ram Rideout (@ElBrogrammer) | Jens Reeder (@jensreeder) | Jorge Cañardo Alastuey (@Jorge-C) | Jose Antonio Navas Molina (@josenavas) | Joshua Shorenstein (@squirrelo) | Yoshiki Vázquez Baeza (@ElDeveloper) | @charudatta-navare | John Chase (@johnchase) | Karen Schwarzberg (@karenschwarzberg) | Emily TerAvest (@teravest) | Will Van Treuren (@wdwvt1) | Zech Xu (@RNAer)
Rob Knight (@rob-knight) | Gavin Huttley (@gavin-huttley) | Micah Hamady | Sandra Smit | Cathy Lozupone (@clozupone) | Mike Robeson (@mikerobeson) | Marcin Cieslik | Peter Maxwell | Jeremy Widmann | Zongzhi Liu | Michael Dwan | Logan Knecht (@loganknecht) | Andrew Cochran | Jose Carlos Clemente (@cleme) | Damien Coy | Levi McCracken | Andrew Butterfield | Justin Kuczynski (@justin212k) | Matthew Wakefield (@genomematt)
In [26]:
import skbio
print(skbio.title)
print(skbio.art)