In [1]:
%matplotlib inline

In [2]:
import os
import csv
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from ete2 import Tree, TreeStyle, TextFace, faces, Face, NodeStyle, AttrFace
from Bio import Phylo

# Export matplotlib svg text as text, not path
mpl.rcParams['svg.fonttype'] = 'none'


No module named MySQLdb
 MySQLdb module could not be loaded

Figure 1A - 12 species tree


In [3]:
# First convert the nexus tree to a newick one
Phylo.convert('DUF_TREE_CONSENSUS.txt', 'nexus', 'DUF_TREE_CONSENSUS.tree', 'newick')
# Read in the Tree using ETE2
tree = Tree("DUF_TREE_CONSENSUS.tree")

# Do not color in the internal nodes
nstyle = NodeStyle()
nstyle["size"] = 0
nstyle["fgcolor"] = "black"

# Set style for nodes and round the time value
# Also re-format species name
for n in tree.traverse():
    n.set_style(nstyle)
    n.dist = round(n.dist, 2)
    # Remove the middle underscore
    if n.is_leaf():
        genus, species = n.name.split("_")
        n.name = genus[0]+". "+ species

def layout(node):
    if node.is_leaf():
        # Add node name to laef nodes
        N = AttrFace("name",fsize=12, ftype="Source Sans Pro", fgcolor="black", fstyle="italic")
        N.margin_left = 5
        faces.add_face_to_node(N, node, 0, position="aligned")

    
# Tree formatting
ts = TreeStyle()
ts.layout_fn = layout
ts.show_leaf_name = False
# ts.tree_width = 200
ts.draw_guiding_lines = True
ts.show_branch_length = False
ts.show_scale = False
ts.branch_vertical_margin = 15
ts.margin_right = 50
ts.draw_aligned_faces_as_table = True

tree.render("2015-05-01-DUF1220-fig1a.svg", tree_style=ts, h=2, units="in")
tree.render("%%inline", tree_style=ts,h=2, units="in")


Out[3]:
Generated with ETE http://ete.cgenomics.org Generated with ETE http://ete.cgenomics.org C. sabaeus P. anubis M. mulatta N. leucogenys G. gorilla H. sapiens P. troglodytes P. abelii C. jacchus T. syrichta M. murinus O. garnettii

Figure S1??


In [4]:
# Read in the data from the excel file and use the second sheet
xls = pd.ExcelFile('2015-03-12-figure-data.xlsx')
figure1b_data = xls.parse('figure1B', index_col=None, na_values=['NA'])

In [5]:
def label_point(x, y, val, ax):
    '''Uses the Notes information provided in the dataframe to color certain points in the plot'''
    a = pd.concat({'x': x, 'y': y, 'val': val}, axis=1)
    label_num = 1
    for i, point in a.iterrows():
        if str(point['val']) != "nan":
            if str(point['val']) == "LCA Homininae":
                ax.text(point['x']+0.08, point['y']-0.2, str(label_num), name="Open Sans", size=10)  
            else:
                ax.text(point['x']+0.08, point['y']-0.07, str(label_num), name="Open Sans", size=10)
            label_num += 1

sns.set_context("paper", font_scale=1)
sns.set_style("ticks", {"axes.linewidth": 0.5, 'xtick.major.size': 5, 'ytick.major.size': 5})
plt.figure(figsize=(2, 2), dpi=300)
            
figure1b_data["colors"] = np.where(pd.isnull(figure1b_data).any(axis=1), '#7f7f7f', '#d62728')
sns.axlabel(r'$log_{10}$'+ '(branch length)',r'$log_{10}$'+'(scaled branch length)', name="Open Sans", size=8)

fig1b = plt.scatter(figure1b_data["log10[BRANCH LENGTH (MY)]"],
            figure1b_data["log10[SCALED BRANCH LENGTH]"],
            color=figure1b_data["colors"],
            s=20)
plt.xticks(np.arange(0, 2.5, 0.5))
plt.yticks(np.arange(0, 3.5, 0.5))
label_point(figure1b_data["log10[BRANCH LENGTH (MY)]"], figure1b_data["log10[SCALED BRANCH LENGTH]"], figure1b_data["Notes"], plt)

sns.despine(offset=10, trim=True)
plt.savefig("2015-03-12-DUF1220-fig1b.svg", facecolor='w', edgecolor='w',
        orientation='portrait', papertype=None, format=None,
        transparent=False, bbox_inches=None, pad_inches=0,
        frameon=True)


 Figure 1b data - Scatter plot using square root DUF counts


In [6]:
figure1c_data = xls.parse('figure 1C', na_values=['NA'])

 Figure 1c data - Scatter plot using log DUF counts instead of square root


In [7]:
figureS4_data = xls.parse('Figure S4', na_values=['NA'])

Figure 1b+c as requested by GBE review


In [8]:
sns.set_context("paper", font_scale=1.2, rc={"lines.linewidth": 1,  'font.sans-serif': "Source Sans Pro"})
sns.set_style("ticks", {"axes.linewidth": 0.7})
plt.figure(figsize=(8, 2), dpi=300)
plt.subplots_adjust(wspace=0.6, bottom=0.3)

#Placeholder
plt.subplot(131)

#DUF1220 square root transformed vs log10
plt.subplot(132)

colors = np.where(figure1c_data["Species"] == "Homo_sapiens",'#d62728', '#7f7f7f' )


plt.scatter(figure1c_data["log10(Brain mass [mg])"],
            figure1c_data["sqrt(DUF)"],
            color=colors,
            s=20)

sns.axlabel(r'$log10$'+'(Brainmass [mg])', r'$sqrt$'+ '(DUF1220 domain count)', name="Open Sans", size=8)

#Add the first regression line including Homo
xi = np.arange(3.3,6.5)
intercept = figure1c_data.iloc[0]["with homo"]
slope = figure1c_data.iloc[1]["with homo"]
line_with_homo = slope*xi+intercept
plt.plot(xi,line_with_homo, color='#d62728', linewidth=1)
plt.text(6.3, 15, "Homo", name="Open Sans", size=10, style="italic")

#Add the second regression line excluding Homo
intercept = figure1c_data.iloc[0]["without"]
slope = figure1c_data.iloc[1]["without"]
line_without_homo = slope*xi+intercept
plt.plot(xi,line_without_homo,color='#7f7f7f', linestyle="-", linewidth=1)


plt.xticks(np.arange(3, 7.5, 1))
sns.despine(offset=5, trim=True)


#DUF1220 log transformed vs log10
plt.subplot(133)
# Only color the Homo outlier
colors = np.where(figureS4_data["Species"] == "Homo_sapiens",'#d62728', '#7f7f7f' )


plt.scatter(figureS4_data["log10(Brain mass [mg])"],
            figureS4_data["log(DUF)"],
            color=colors,
            s=20)

sns.axlabel(r'$log10$'+'(Brainmass [mg])', r'$log$'+ '(DUF1220 domain count)', name="Open Sans", size=8)

#Add the first regression line including Homo
xi = np.arange(3.3,6.5)
intercept = figureS4_data.iloc[0]["with homo"]
slope = figureS4_data.iloc[1]["with homo"]
line_with_homo = slope*xi+intercept
plt.plot(xi,line_with_homo, color='#d62728', linewidth=1)
plt.text(6.3, 2.3, "Homo", name="Open Sans", size=10, style="italic")

#Add the second regression line excluding Homo
intercept = figureS4_data.iloc[0]["without"]
slope = figureS4_data.iloc[1]["without"]
line_without_homo = slope*xi+intercept
plt.plot(xi,line_without_homo,color='#7f7f7f', linestyle="-", linewidth=1)


plt.xticks(np.arange(3, 7.5, 1))
sns.despine(offset=5, trim=True)
plt.savefig("2015-06-01-DUF1220-GBE-REV1-Fig1.svg")


Figure S1 - DUF1220 distribution across Ensembl versions


In [9]:
def read_domtblout(source):
    '''Reads in a domtblout file and counts domain annotations'''
    hits = {}
    try:
        with open(source, "r") as infile:
            for line in infile:
                # Ignore comments
                if not line.startswith("#"):
                    fields = line.split()
                    target = fields[0]
                    if target in hits:
                        hits[target] += 1
                    else:
                        hits[target] = 1
            return hits
    except IOError:
        print("File does not exit!")

def list_files(current_dir):
    '''List files in the current directory'''
    file_list = []
    for path, subdirs, files in os.walk(current_dir):  # Walk directory tree
        for name in files:
            f = os.path.join(path, name)
            file_list.append(f)
    return file_list


infolder = "2015-02-20-hmmsearch-vs-pep-all-longest-ensembl-range"

domain_counts = {"homo_sapiens": [],
                 "pan_troglodytes": [],
                 "macaca_mulatta": []}

for f in list_files(infolder):
    if f.endswith("domtblout"):
        version = int(f.split("-")[-1].split(".")[0][1:])
        species = f.split("-")[-2]
        domtblout = read_domtblout(f)
        domain_counts[species].append((version, sum(domtblout.values())))


# Sort data by version
for s, counts in domain_counts.items():
    counts.sort(key=lambda tup: tup[0])
sns.set_context("paper", font_scale=1.75)
sns.set_style("ticks", {"axes.linewidth": 1, 'xtick.major.size': 10, 'ytick.major.size': 10})
plt.figure(figsize=(6.5, 4), dpi=300)
sns.axlabel("Ensembl version","DUF1220 count", name="Open Sans")
# plt.scatter(*zip(*domain_counts["homo_sapiens"]))
plt.plot(*zip(*domain_counts["homo_sapiens"]),color='#d62728', linewidth=2.0,label='Homo sapiens', marker='o', markersize=5)
plt.plot(*zip(*domain_counts["pan_troglodytes"]),color='#1f77b4',linewidth=2.0, label='Pan troglodytes', marker='o', markersize=5)
plt.plot(*zip(*domain_counts["macaca_mulatta"]), color='#7f7f7f',linewidth=2.0, label='Macaca mulatta', marker='o', markersize=5)
plt.legend(loc='center right')
sns.despine(offset=20, trim=True)
plt.savefig("2015-03-12-DUF1220-figS1.svg", dpi=None, facecolor='w', edgecolor='w',
        orientation='portrait', papertype=None, format=None,
        transparent=False, bbox_inches=None, pad_inches=0,
        frameon=None)


Figure S2


In [10]:
figureS2_data = xls.parse('figure S2', na_values=['NA'])

In [11]:
sns.set_context("paper", font_scale=1.75)
sns.set_style("ticks", {"axes.linewidth": 0.5, 'xtick.major.size': 5, 'ytick.major.size': 5})
plt.figure(figsize=(6.5, 3), dpi=300)
onerate = figureS2_data['one-rate model']
variablerate = figureS2_data['variable rate model']

binwidth = 1
sns.distplot(onerate, color="#d62728", bins=np.arange(min(onerate), max(onerate) + binwidth, binwidth), hist_kws={"histtype": "stepfilled", "alpha": 0.5})
sns.distplot(variablerate, color="#1f77b4", bins=np.arange(min(variablerate), max(variablerate) + binwidth, binwidth), hist_kws={"histtype": "stepfilled", "alpha": 0.5})
sns.despine()
sns.axlabel("Ln(Lh)", "Frequency", name="Open Sans")
plt.yticks(np.arange(0, 1.2, 0.2))
plt.savefig("2015-03-12-DUF1220-figS2.svg", dpi=None, facecolor='w', edgecolor='w',
        orientation='portrait', papertype=None, format=None,
        transparent=False, bbox_inches=None, pad_inches=0,
        frameon=None)


Figure S3


In [16]:
figureS3_data = xls.parse('figure S3', na_values=['NA'])
figureS3_data.max()


Out[16]:
Non-directional model   -20.374685
Directional Model       -15.816922
dtype: float64

In [13]:
sns.set_context("paper", font_scale=1.75)
sns.set_style("ticks", {"axes.linewidth": 0.5, 'xtick.major.size': 5, 'ytick.major.size': 5})
plt.figure(figsize=(6.5, 3), dpi=300)
directional = figureS3_data['Non-directional model']
nondirectional = figureS3_data['Directional Model']

binwidth = 0.5
sns.distplot(directional, color="#d62728", bins=np.arange(min(directional), max(directional) + binwidth, binwidth), hist_kws={"histtype": "stepfilled", "alpha": 0.5})
sns.distplot(nondirectional, color="#1f77b4", bins=np.arange(min(nondirectional), max(nondirectional) + binwidth, binwidth), hist_kws={"histtype": "stepfilled", "alpha": 0.5})
sns.despine()
sns.axlabel("Ln(Lh)", "Frequency", name="Open Sans")
plt.yticks(np.arange(0, 0.4, 0.1))
# plt.xticks(np.arange(-32, -14, 2.5))
plt.savefig("2015-03-12-DUF1220-figS3.svg", dpi=None, facecolor='w', edgecolor='w',
        orientation='portrait', papertype=None, format=None,
        transparent=False, bbox_inches=None, pad_inches=0,
        frameon=None)


 New Figure 1 plots

Figure 1b


In [3]:
# Read in the data from the excel file and use the second sheet
brain_body = pd.read_table('Brain_Body.txt', sep=' ')
brain_body.head()


Out[3]:
(Intercept) Brain BODY
1 -8.479123 3.974616 -2.411992
2 -11.665654 5.049570 -2.945956
3 -7.473377 3.477256 -1.826193
4 -8.015054 3.717259 -1.999664
5 -8.868122 3.755876 -1.675914

In [ ]:

Figure 1c


In [4]:
# Read in the data from the excel file and use the second sheet
prenatal_postnatal = pd.read_table('Prenatal_Postnatal.txt', sep=' ')
prenatal_postnatal.head()


Out[4]:
(Intercept) NeoBrain PNBrain
1 1.260629 -1.173638 2.112175
2 1.380114 -2.454531 3.350447
3 -1.444097 -2.752561 4.284822
4 -0.671749 -1.399745 3.146457
5 1.020635 -1.609858 2.582759

In [ ]:

Figure 1d


In [5]:
# Read in the data from the excel file and use the second sheet
neo_cere_rob = pd.read_table('neocortex_cerebellum_rob.txt', sep=' ', encoding='utf-8')
neo_cere_rob.describe()


Out[5]:
(Intercept) NEOCORTEX CEREBELLUM Rob
count 10000.000000 10000.000000 10000.000000 10000.000000
mean -1.462722 6.078249 -0.288697 -5.600247
std 6.189899 3.219917 4.544803 5.344421
min -36.598714 -17.978207 -26.005144 -38.253025
25% -5.140223 4.112755 -2.941903 -8.756219
50% -1.404043 6.006825 -0.218685 -5.574457
75% 2.433224 8.027133 2.548397 -2.504178
max 32.020978 25.770728 21.304477 19.939007

In [ ]:

Figure 2 as requested by GBE revision


In [6]:
sns.set_context("paper", font_scale=1.2, rc={"lines.linewidth": 1,  'font.sans-serif': "Source Sans Pro"})
sns.set_style("ticks", {"axes.linewidth": 0.7})
# plt.figure(figsize=(6.75, 1.75), dpi=300)
plt.figure(figsize=(8, 2), dpi=300)
# plt.subplots_adjust(left=-0.15, right=0.9,
#                     bottom=0.2, top=0.9,
#                     hspace=0.3, wspace=0.3)
plt.subplots_adjust(wspace=0.4, bottom=0.3)
plt.subplot(131)
sns.kdeplot(brain_body["Brain"], color="#d62728", label="Brain")
sns.kdeplot(brain_body["BODY"], color="black", label="Body")
sns.axlabel("Posterior mean", "Density")
sns.despine(offset=0, trim=True)
plt.legend(handlelength=0.5, loc=2, fontsize=8)
plt.subplot(132)
sns.kdeplot(prenatal_postnatal["NeoBrain"], color="black", label="Prenatal")
sns.kdeplot(prenatal_postnatal["PNBrain"], color="#d62728", label="Postnatal")
sns.despine(offset=0, trim=True)
sns.axlabel("Posterior mean", "")
# plt.legend().remove()
plt.legend(handlelength=0.5, loc=2,  fontsize=8)
plt.subplot(133)
sns.kdeplot(neo_cere_rob["NEOCORTEX"], color="#d62728", label="Neocortex")
sns.kdeplot(neo_cere_rob["CEREBELLUM"], color="black", label="Cerebellum")
sns.kdeplot(neo_cere_rob["Rob"], color="#7f7f7f", label="RoB")
sns.despine(offset=0, trim=True)
sns.axlabel("Posterior mean", "")
# plt.legend().remove()
plt.legend(handlelength=0.5, loc=2,  fontsize=8)
plt.savefig("2015-06-24-DUF1220-GBE-REV1-Fig2.svg")



In [ ]: