This example is adapted from my SciPy 2015 talk, and was updated for a BIO 290 presentation on Emperor at Northern Arizona University by TJ Owen (Spring, 2015).
In [53]:
import warnings
from StringIO import StringIO
import numpy as np
import pandas as pd
from skbio.alignment import global_pairwise_align
from skbio import DistanceMatrix
from IPython.display import FileLink
# Silence every warning for the rest of the session so the rendered notebook
# stays free of warning output.
warnings.simplefilter('ignore')

# Inline CSV: one "How do you order a beer?" phrase per language.
# The literal is assembled from one adjacent string per CSV row (Python joins
# adjacent literals), so no row can be cut in half by line wrapping and the
# resulting text is exactly the original single-line string.
# Non-ASCII characters are written as \x.. byte escapes (presumably UTF-8).
#
# NOTE: the header names 5 columns but every data row carries 6 fields (a
# leading number, then the language name, the phrase, its pronunciation, a
# location and a type). The later cells read the language name from the
# 'SampleID' column, so pandas presumably consumes the leading number as the
# index and shifts the header names one field to the right - confirm against
# the installed pandas version before "fixing" the header.
data = StringIO(
    'SampleID,Spelling,Pronunciation,GeogrpahicLocation,Type\n'
    '0,Afrikaans,"\'n Bier, asseblief",A beer ah-suh-bleef,,\n'
    '1,Basque,"Garagardo bat, mesedez",Gara-gardo bat mese-des,European,Other\n'
    '2,Breton,"Ur banne bier am bo, mar plij",Oor bah-ne beer am boh mar pleezh,European,West Germanic\n'
    '3,Catalan,"Una cervesa, si us plau",Oona servayzeh see oos plow,European,Latin\n'
    '4,Croatian,"Jedno pivo, molim","Yed-no pee-vo, mo-lim",,\n'
    '5,Czech / Slovak,"Pivo, pros\xc3\xadm","Pee-vo, pro-seem",,\n'
    '6,Danish,Jeg vil gerne have en \xc3\xb8l,Yay vil geh-neh heh en url,European,North Germanic\n'
    '7,Dutch,"Een bier, alsjeblieft","Un beer, ahls-yer-bleeft",European,West Germanic\n'
    '8,English,"One beer, please","Wun beer, pleez",European,West Germanic\n'
    '9,Esperanto,"Unu bieron, mi petas","Oo-noo bee-airon, mee peh-tahs",European,Latin\n'
    '10,Estonian,"\xc3\x9cks \xc3\xb5lu, palun","Ooks ur-loo, pah-lun",European,Uralic\n'
    '11,Finnish,"Olut mulle, kiitos",O-loot moolek kee-tos,European,Uralic\n'
    '12,French,"Une bi\xc3\xa8re, s\'il vous pla\xc3\xaet","Oon bee-air, seel voo pleh",European,Latin\n'
    '13,German,"Ein Bier, bitte","Ine beer, bitt-uh",European,West Germanic\n'
    '14,Hungarian,Egy poh\xc3\xa1r s\xc3\xb6rt k\xc3\xa9rek,Edj pohar shurt kayrek,European,Uralic\n'
    '15,Icelandic,"Einn bj\xc3\xb3r, takk",Ay-dn byohr tahk,European,North Germanic\n'
    '16,Irish,"Beoir amh\xc3\xa1in, le do thoil","Byohr awoyn, lyeh doh hull",European,West Germanic\n'
    '17,Italian,"Una birra, per favore","Oo-na beer-ra, pair fa-vo-re",European,Latin\n'
    '18,Latin,"Cervisiam, sodes","Ker-wi-see-am, soh-dehs",European,Latin\n'
    '19,Latvian,"Vienu alu, l\xc5\xab-dzu","Vyeh-noo ah-loo, loo dzoo",European,Slavic\n'
    '20,Lithuanian,Pra\xc5\xa1au viena alaus,Pra-shau vie-na al-lows,European,Slavic\n'
    '21,Maltese,"Wiehed birra, jekk jghogbok",Wee-het bir-ra yek yoh-dzbok,European,Arabic\n'
    '22,Norwegian,"En \xc3\xb8l, takk",Ehn url tahk,European,North Germanic\n'
    '23,Occitan,"Una cervesa, se vos plai",Oo-no serbeh-zo se bus ply,European,Latin\n'
    '24,Polish,"Jedno piwo, prosz\xc4\x99",Yed-no peevo proshe,European,Slavic\n'
    '25,Portuguese,"Uma cerveja, por favor","Oo-ma ser-vay-ja, poor fa-vohr",European,Latin\n'
    '26,Romansch Ladina,"\xc3\x9cna biera, per plaschair.",Oo-nuh bee-air-uh per plah-chair,European,Latin\n'
    '27,Sardinian,"Una birra, po piaghere",Oo-na beer-ra po pia-gehre,European,Latin\n'
    '28,Scots Gaelic,"Leann, mas e do thoil e",Lyawn mahs eh doh hawl eh,European,West Germanic\n'
    '29,Slovene,"Eno pivo, prosim",Eno pee-vo pro-seem,European,Slavic\n'
    '30,Spanish (Lat. Am.),"Una cerveza, por favor","Oo-na ser-veh-sa, por fa-vor",America,Latin\n'
    '31,Spanish (Spain),"Una cerveza, por favor","Oo-na thair-veh-tha, por fa-vor",European,Latin\n'
    '32,Strine,"Foster\'s, mate","Faw-stuhz, mayt",Australia,West Germanic\n'
    '33,Swedish,"En \xc3\xb6l, tack","Ehn irl, tahk",European,North Germanic\n'
    '34,Twi,"Mame beer baako, mi pawokyew",Mah-me bee-ye bah-ko mee pow-che-oo,Africa,Niger-Congo\n'
    '35,Turkish,"Bir bira, l\xc3\xbctfen",Beer beer-ah luht-fen,Middle East,Arabic\n'
    '36,Welsh,Cwrw os gwelwch in dda,Koo-roh ohs gwel-ookh-un-thah,European,West Germanic\n'
)

# Parse the table; the first line supplies the column names and field 0 of
# each row is used as the index.
df = pd.read_csv(data, header=0, index_col=0)
In [54]:
# Build a language-by-language distance matrix from pairwise global
# alignments of the pronunciation strings.

# Scoring parameters for the alignments below.
MATCH_SCORE = 5.0
MISMATCH_SCORE = -2.0
GAP_OPEN_PENALTY = 5
GAP_EXTEND_PENALTY = 2

# Language name -> pronunciation string, taken column-wise from the frame.
# If a name occurs more than once the last row wins.
language_to_pron = dict(zip(df['SampleID'], df['Pronunciation']))

# Alphabet: every distinct character occurring in any pronunciation.
all_pron_chars = {char
                  for phrase in language_to_pron.values()
                  for char in phrase}

# Nested dict substitution matrix: identical characters score MATCH_SCORE,
# every other pair scores MISMATCH_SCORE.
pron_substitution_matrix = {
    char_a: {char_b: (MATCH_SCORE if char_a == char_b else MISMATCH_SCORE)
             for char_b in all_pron_chars}
    for char_a in all_pron_chars
}

# A real list (not a dict view) is required: the ids are sliced below and
# handed to DistanceMatrix, and list() keeps the same order that keys() yields.
languages = list(language_to_pron)
distances = np.zeros((len(languages), len(languages)))

# Only the lower triangle is aligned; each result is mirrored so the matrix
# is symmetric, and the diagonal stays at the zeros it was initialised with.
for i, language1 in enumerate(languages):
    language1_phrase = language_to_pron[language1]
    for j, language2 in enumerate(languages[:i]):
        language2_phrase = language_to_pron[language2]
        alignment = global_pairwise_align(
            language1_phrase, language2_phrase,
            gap_open_penalty=GAP_OPEN_PENALTY,
            gap_extend_penalty=GAP_EXTEND_PENALTY,
            substitution_matrix=pron_substitution_matrix)
        # Entry [0, 1] of the alignment's own distance matrix is the distance
        # between its two sequences (presumably computed over the aligned,
        # gapped strings - confirm against the installed scikit-bio version).
        distances[i, j] = distances[j, i] = alignment.distances()[0, 1]

dm = DistanceMatrix(distances, languages)
dm
Out[54]:
In [55]:
df.to_csv('map.txt', sep='\t',
header=["#SampleID", "Spelling", "Pronunciation", "GeogrpahicLocation", "Type"],
na_rep="No data", index=False)
dm.write('language-dm.txt')
!principal_coordinates.py -i language-dm.txt -o language-pc.txt
!make_emperor.py -o language-plots -i language-pc.txt -m map.txt
FileLink('language-plots/index.html')
Out[55]:
In [ ]: