This notebook does some of the scatter plots, regressions, and PCA analyses from:

Herculano-Houzel, Suzana. "Decreasing sleep requirement with increasing numbers of neurons as a driver for bigger brains and bodies in mammalian evolution." Proc. R. Soc. B. Vol. 282. No. 1816. The Royal Society, 2015.

The data themselves were taken from the paper (Table 1) and entered into a Google doc I downloaded the spreadsheet to a csv file in the local directory, named it hh2015.csv.


In [443]:
# Imports and setup
import warnings
warnings.simplefilter('ignore', Warning)

import numpy as np
import pandas as pd

# I created some helper functions for plotting;
# details don't matter...
import hhplot
reload(hhplot)

# select what plotting library
hhplot.set_plot_lib('bokeh')  # can also be mpl for matplotlib

# Magic functions so plots will be embedded properly
%matplotlib inline
output_notebook()


BokehJS successfully loaded.

In [444]:
%%javascript
// Super magic function to make output windows stop scrolling!
require("notebook/js/outputarea").OutputArea.auto_scroll_threshold = 9999;



In [445]:
# Load raw data; add/remove columns
def load_data():
    # D/A is computed from D and A.
    # I also prefer O instead of O/N
    data = pd.read_csv('hh2015.csv')
    data = data.drop('D/A', axis=1)
    data['O'] = data['O/N'] * data['Ncx']
    data = data.drop('O/N', axis=1)
    return data
data = load_data()

# But, to do the analyses in the paper, I need D/A and O/N...
data['D/A'] = data['Dcx'] / data['Acx']
data['O/N'] = data['O'] / data['Ncx']
data['N/A'] = data['Ncx'] / data['Acx']
data['log(sleep)'] = data['daily sleep'] # anything that's not 'daily sleep' or 'T' is plotted with log
data.keys()


Out[445]:
Index([u'Species', u'Order', u'brain mass', u'daily sleep', u'Ncx', u'Dcx',
       u'Acx', u'T', u'Mcx', u'O', u'D/A', u'O/N', u'N/A', u'log(sleep)'],
      dtype='object')

In [446]:
# Do some sanity checks on the data:

# Brain volume and brain mass should be similar...
data['Vcx'] = data['Acx'] * data['T']
p = hhplot.regress_and_plot('brain mass', 'Vcx', data=data)
hhplot.show_plot(p)

# Cortical volume and cortical mass.. looks like "mass" is really volume.
p = hhplot.regress_and_plot('Mcx', 'Vcx', data=data)
hhplot.show_plot(p)



In [448]:
# Now, recreate figure 1
p = hhplot.grid_it([['D/A', 'Mcx', 'Dcx'],
                    ['T', 'brain mass', 'N/A'],
                    ['Acx', 'Ncx', 'O/N']], data=data)
hhplot.show_plot(p)



In [449]:
# These are the two analyses in the paper.
hhplot.do_pca(['brain mass', 'Mcx', 'Acx', 'daily sleep', 'D/A', 'T'], data=data)
hhplot.do_pca(['brain mass', 'Mcx', 'Acx', 'daily sleep', 'D/A'], data=data)


(24, 6)
['brain mass', 'Mcx', 'Acx', 'daily sleep', 'D/A', 'T']
Total variance explained:  0.966849683254
Variance explained per component [ 0.88841046  0.07843922]
[[-0.42291281 -0.42980063 -0.42735234  0.33237961  0.423809   -0.40459347]
 [-0.25665853 -0.14202048 -0.15621875 -0.93406924  0.06897221 -0.1109494 ]]

(24, 5)
['brain mass', 'Mcx', 'Acx', 'daily sleep', 'D/A']
Total variance explained:  0.989750115817
Variance explained per component [ 0.89658243  0.09316769]
[[-0.46099212 -0.4678174  -0.46724397  0.36694667  0.46439893]
 [-0.28209537 -0.16357038 -0.18414055 -0.9219911   0.09844594]]


In [450]:
# How do things look if we separate D and A?
hhplot.do_pca(['brain mass', 'Mcx', 'Acx', 'daily sleep', 'Dcx', 'T'], data=data)
hhplot.do_pca(['brain mass', 'Mcx', 'Acx', 'daily sleep', 'Dcx'], data=data)
# What if we remove A and keep only D/A?
hhplot.do_pca(['brain mass', 'Mcx', 'log(sleep)', 'D/A', 'T'], data=data)
hhplot.do_pca(['brain mass', 'Mcx', 'log(sleep)', 'D/A'], data=data)


(24, 6)
['brain mass', 'Mcx', 'Acx', 'daily sleep', 'Dcx', 'T']
Total variance explained:  0.908369844091
Variance explained per component [ 0.81184258  0.09652726]
[[-0.43729427 -0.44478154 -0.44165257  0.35513558  0.334838   -0.42148351]
 [-0.26749288 -0.22428676 -0.22641041 -0.46826596 -0.74359421 -0.23382895]]

(24, 5)
['brain mass', 'Mcx', 'Acx', 'daily sleep', 'Dcx']
Total variance explained:  0.916891245433
Variance explained per component [ 0.80709015  0.10980109]
[[-0.47789833 -0.48540526 -0.48427282  0.39692714  0.37937264]
 [-0.33943839 -0.28835962 -0.30151819 -0.44458111 -0.71628536]]

(24, 5)
['brain mass', 'Mcx', 'log(sleep)', 'D/A', 'T']
Total variance explained:  0.964965474949
Variance explained per component [ 0.88402049  0.08094498]
[[-0.46150579 -0.46935473  0.3885079   0.46599661 -0.44567625]
 [-0.30025873 -0.19013259 -0.90535676  0.0485503  -0.22730222]]

(24, 4)
['brain mass', 'Mcx', 'log(sleep)', 'D/A']
Total variance explained:  0.989965121917
Variance explained per component [ 0.89317564  0.09678948]
[[-0.51210167 -0.51999246  0.44189806  0.52161846]
 [-0.37531027 -0.25164661 -0.88277689  0.12853463]]


In [451]:
# What if we include all data?
data_keys = set(list(load_data().keys()))
data_keys = list(data_keys - set(['Species', 'Order']))
hhplot.do_pca(data_keys, data=data)


(24, 8)
['Ncx', 'O', 'T', 'Dcx', 'Mcx', 'daily sleep', 'Acx', 'brain mass']
Total variance explained:  0.924977688057
Variance explained per component [ 0.83349744  0.09148025]
[[-0.36307549 -0.38059792 -0.3604959   0.26510776 -0.38525569  0.2880335
  -0.38331475 -0.38048199]
 [ 0.37977897  0.13462036  0.06985058  0.74633029  0.09342458  0.48953865
   0.1007578   0.13125896]]


In [452]:
# Daily sleep and Dcx seem to pop out; is A unnecessary?
# How about a regression of D and sleep?
p = hhplot.regress_and_plot('Dcx', 'daily sleep', data=data)
show(p)



In [453]:
# From the paper: log(D/A) and thickness correlate
fh = plt.figure(figsize=(16, 4))
fh.add_subplot(1,3,1)
regress_and_plot('D/A', 'T', data=data, lib='mpl')

data['1/Ncx'] = 1/data['Ncx']
fh.add_subplot(1,3,2)
regress_and_plot('1/Ncx', 'T', data=data, lib='mpl')

data['A/D'] = 1/data['D/A']
fh.add_subplot(1,3,3)
regress_and_plot('Ncx', 'A/D', data=data, lib='mpl')


Out[453]:
<matplotlib.axes._subplots.AxesSubplot at 0x10fb16b10>

In [454]:
# D/A vs Ncx for non-primate species
non_primate_data = data.loc[data['Order'] != 'Primate']
primate_data = data.loc[data['Order'] == 'Primate']
len(non_primate_data)


Out[454]:
16

In [455]:
# Figure 3b: primates and other species diverge
regress_and_plot('Ncx', 'D/A', data=primate_data, lib='mpl')
regress_and_plot('Ncx', 'D/A', data=non_primate_data, lib='mpl')
# For comparison
plt.figure()
regress_and_plot('Ncx', 'D/A', data=data, lib='mpl')


Out[455]:
<matplotlib.axes._subplots.AxesSubplot at 0x10cf20ad0>

In [456]:
# Now fudge things around so I can do log plots of daily sleep
# Figure 3a
non_primate_data = data.loc[data['Order'] != 'Primate']
primate_data = data.loc[data['Order'] == 'Primate']
regress_and_plot('D/A', 'log(sleep)', data=data, lib='mpl')


Out[456]:
<matplotlib.axes._subplots.AxesSubplot at 0x108e36410>

In [457]:
# Figure 3c
regress_and_plot('Ncx', 'log(sleep)', data=non_primate_data, lib='mpl')
regress_and_plot('Ncx', 'log(sleep)', data=primate_data, lib='mpl')


Out[457]:
<matplotlib.axes._subplots.AxesSubplot at 0x10f2adc50>

In [458]:
# Not reported in the paper: PCA for non-human primates
hhplot.do_pca(['brain mass', 'Mcx', 'Acx', 'daily sleep', 'D/A', 'T'], data=non_primate_data)
hhplot.do_pca(['brain mass', 'Mcx', 'Acx', 'daily sleep', 'D/A'], data=non_primate_data)


(16, 6)
['brain mass', 'Mcx', 'Acx', 'daily sleep', 'D/A', 'T']
Total variance explained:  0.977242019908
Variance explained per component [ 0.92221616  0.05502586]
[[ 0.41369355  0.42315637  0.41976207 -0.36793845 -0.41922675  0.40307118]
 [ 0.35344047  0.14387014  0.21676788  0.84690805 -0.22422299 -0.19965898]]

(16, 5)
['brain mass', 'Mcx', 'Acx', 'daily sleep', 'D/A']
Total variance explained:  0.99472930587
Variance explained per component [ 0.93080186  0.06392745]
[[ 0.45301127  0.46216505  0.46046406 -0.39809499 -0.45899617]
 [ 0.3230916   0.10920513  0.17043341  0.90563295 -0.18565416]]


In [459]:
# Not reported in the paper: PCA for primates
hhplot.do_pca(['brain mass', 'Mcx', 'Acx', 'daily sleep', 'D/A', 'T'], data=primate_data)
hhplot.do_pca(['brain mass', 'Mcx', 'Acx', 'daily sleep', 'D/A'], data=primate_data)


(8, 6)
['brain mass', 'Mcx', 'Acx', 'daily sleep', 'D/A', 'T']
Total variance explained:  0.977050550832
Variance explained per component [ 0.84949438  0.12755618]
[[ 0.43925807  0.44171973  0.4393634  -0.24158375 -0.43756884  0.41117713]
 [ 0.11272997  0.0582231   0.02371267  0.95607934 -0.10560455  0.24103882]]

(8, 5)
['brain mass', 'Mcx', 'Acx', 'daily sleep', 'D/A']
Total variance explained:  0.9967482544
Variance explained per component [ 0.85293486  0.14381339]
[[ 0.47819591  0.48119615  0.48185953 -0.28321816 -0.47684149]
 [ 0.17897783  0.11963738  0.09411535  0.95652282 -0.17280113]]


In [460]:
# Not reported in the paper: PCA with log(sleep)
hhplot.do_pca(['brain mass', 'Mcx', 'Acx', 'log(sleep)', 'D/A', 'T'], data=data)
hhplot.do_pca(['brain mass', 'Mcx', 'Acx', 'log(sleep)', 'D/A'], data=data)


(24, 6)
['brain mass', 'Mcx', 'Acx', 'log(sleep)', 'D/A', 'T']
Total variance explained:  0.967497232334
Variance explained per component [ 0.89796654  0.06953069]
[[-0.42085203 -0.42733021 -0.42513022  0.34560953  0.42286302 -0.40159392]
 [-0.26554173 -0.15589484 -0.16012749 -0.92327854  0.0162154  -0.16382116]]

(24, 5)
['brain mass', 'Mcx', 'Acx', 'log(sleep)', 'D/A']
Total variance explained:  0.990539267038
Variance explained per component [ 0.90883552  0.08170374]
[[-0.45791792 -0.46422196 -0.46392773  0.38138356  0.46273836]
 [-0.30432548 -0.18680027 -0.20184475 -0.91008257  0.05916144]]


In [461]:
# Regress log(everything) vs. linear(sleep)
cols = ['Ncx', 'O/N', 'D/A', 'N/A', 'O', 'T', 'Dcx', 'Mcx', 'Acx', 'brain mass']
res, lm = hhplot.lin_regress(cols, 'daily sleep', data=data)
res.residues_, res.intercept_, zip(cols, res.coef_)


(23, 10)
Out[461]:
(array([], dtype=float64),
 1.1516050262495412e-15,
 [('Ncx', 0.43714689460510092),
  ('O/N', -0.39446533659427624),
  ('D/A', -0.38235964236282516),
  ('N/A', 0.12251095451863142),
  ('O', 0.26985073689551275),
  ('T', 0.10306064001146915),
  ('Dcx', -0.26588781703427122),
  ('Mcx', -4.803049167236189),
  ('Acx', 0.38621495502373893),
  ('brain mass', 2.6533292545822751)])

In [462]:
# Just use quantities used by HH2015
cols = ['Ncx', 'O/N', 'T', 'D/A', 'Mcx', 'Acx', 'brain mass']
res, lm = lin_regress(cols, 'daily sleep', data)
res.residues_, res.intercept_, zip(cols, res.coef_)


(23, 7)
Out[462]:
(7.2766784877938413,
 6.6896188702023864e-16,
 [('Ncx', 1.0859753272566568),
  ('O/N', -0.31548796206225915),
  ('T', 0.10306064001146406),
  ('D/A', -1.2774549118586629),
  ('Mcx', -4.8030491672361517),
  ('Acx', -0.74321598456254856),
  ('brain mass', 2.6533292545822778)])

In [463]:
# Just use quantities used by HH2015 on log(sleep)
cols = ['Ncx', 'O/N', 'T', 'D/A', 'Mcx', 'Acx', 'brain mass']
res, lm = lin_regress(cols, 'log(sleep)', data=data)
res.residues_, res.intercept_, zip(cols, res.coef_)


(23, 7)
Out[463]:
(4.8193346624911158,
 3.2146442053102112e-16,
 [('Ncx', 1.7763245722124501),
  ('O/N', -0.23906957399225134),
  ('T', 0.051975304468025445),
  ('D/A', -1.3926271601549842),
  ('Mcx', -4.1531321510448107),
  ('Acx', -1.7962910501247906),
  ('brain mass', 2.2488330841925839)])

In [464]:
# Use only the variables from PCA

#All
cols = ['brain mass', 'Mcx', 'Acx', 'D/A', 'T']
res, lm = hhplot.lin_regress(cols, 'log(sleep)', data=data)
print res.residues_, res.intercept_, zip(cols, res.coef_)
print ''

# Primates
res, lm = hhplot.lin_regress(cols, 'log(sleep)', data=primate_data)
print res.residues_, res.intercept_, zip(cols, res.coef_)
print ''

# Non-primates
res, lm = hhplot.lin_regress(cols, 'log(sleep)', data=non_primate_data)
print res.residues_, res.intercept_, zip(cols, res.coef_)
print ''


(23, 5)
6.80645043888 1.65621113628e-16 [('brain mass', 1.947654638917822), ('Mcx', -0.62405829789987255), ('Acx', -0.62360049909944415), ('D/A', 1.2732064717293772), ('T', -0.18954099936588498)]

(8, 5)
1.06478730408 -1.3849644694e-15 [('brain mass', 13.188768749866215), ('Mcx', 0.52620480162387739), ('Acx', -12.940534063444092), ('D/A', -0.65414270008445274), ('T', -2.0937503456133997)]

(15, 5)
1.19336521983 -1.33839378614e-15 [('brain mass', 1.2089140649446593), ('Mcx', -5.4668152991762202), ('Acx', 1.4311920674289016), ('D/A', -1.8324171312552977), ('T', 0.1454411286274917)]


In [465]:
# Just regress log(D/A) and log(sleep)
cols = ['D/A']
res, lm = hhplot.lin_regress(cols, 'log(sleep)', data=data)
res.residues_, res.intercept_, zip(cols, res.coef_)


(24, 1)
Out[465]:
(9.5655913115879763, -5.6455638809538444e-17, [('D/A', 0.7755215634336039)])

In [466]:
# Am I understanding resid_? It doesn't seem to be the residual...
((np.log10(data['daily sleep']) - np.log10(data['D/A']) * 0.775521)**2).sum()


Out[466]:
17.144402713478186