In [1]:
# Thank you Damian Kao http://blog.nextgenetics.net/?e=102
from IPython.display import HTML
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
The raw code for this IPython notebook is by default hidden for easier reading.
To toggle on/off the raw code, click <a href="javascript:code_toggle()">here</a>.''')


Out[1]:
The raw code for this IPython notebook is by default hidden for easier reading. To toggle on/off the raw code, click here.

In [2]:
## Import Libraries

In [3]:
from __future__ import division
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pylab
from math import sqrt
from matplotlib import rc
import scipy.stats as stats
import pandas as pd
from numpy.linalg import norm

Load data, calculate Lifetime Stem Cell Cummulative Division (LSCD), LSCD boundry points: a and b, mean value of LSCD: x-bar and their respective $log_{10}$ values named appropriately. The interval values of LSCD stems from the fact that the estimated values of stem cell division cycle per year frequencies are interval estimates ( $d_1$ and $d_2$). The LSCD boundry points are estimated with the formula presented by T-V original paper.


In [4]:
df = pd.read_table('Voglestein.csv', header=0, index_col=0, sep=',')

y = df['Life incident rate']
lscd = df['Total_Stem_Cells']*(df['Devision_year']*80 + 2) -2 

df['lscd'] = lscd

x_bar = df['Total_Stem_Cells']*((df['d1']+df['d2'])*40 + 2) -2 
a = df['Total_Stem_Cells']*(df['d1']*80 + 2) -2 
b = df['Total_Stem_Cells']*(df['d2']*80 + 2) -2 
df['x-bar'] = x_bar
df['a'] = a
df['b'] = b


df['LIR_log'] = np.log10(y)
df['lscd_log'] = np.log10(lscd)
df['lscd_mean_log'] = np.log10(x_bar)
df['lscd_low_log'] = np.log10(a)
df['lscd_high_log'] = np.log10(b)

In [5]:
# # Save data in CSV format
# df.to_csv('Vogelestien-log.csv', sep=',', header=True, index=True)

# # Print the data in latex format table
# format_dict = dict.fromkeys(df.columns, lambda x: '{:.2e}'.format(x))
# format_dict.update(dict.fromkeys(['Devision_year', 'd1', 'd2'], lambda x: '{:f}'.format(x)))

# df.to_latex('Voglestine-Intervals-data.tex', float_format=lambda x: '{:.2e}'.format(x), index=True, header=True)

In [6]:
df.head(50)


Out[6]:
Life incident rate Total_cells_tissue Total_Stem_Cells lscd Devision_year d1 d2 x-bar a b LIR_log lscd_log lscd_mean_log lscd_low_log lscd_high_log
Cancer
AML 0.004100 3.000000e+12 135000000 1.298700e+11 12.0000 6.400 23.6600 1.625940e+11 6.939000e+10 2.557980e+11 -2.387216 11.113509 11.211105 10.841297 11.407897
BCC 0.300000 2.000000e+11 5820000000 3.550200e+12 7.6000 7.600 7.6000 3.550200e+12 3.550200e+12 3.550200e+12 -0.522879 12.550253 12.550253 12.550253 12.550253
CLL 0.005200 3.000000e+12 135000000 1.298700e+11 12.0000 6.400 23.6600 1.625940e+11 6.939000e+10 2.557980e+11 -2.283997 11.113509 11.211105 10.841297 11.407897
Colorectal 0.048000 3.000000e+10 200000000 1.168400e+12 73.0000 73.000 73.0000 1.168400e+12 1.168400e+12 1.168400e+12 -1.318759 12.067592 12.067592 12.067592 12.067592
Colorectal FAP 1.000000 3.000000e+10 200000000 1.168400e+12 73.0000 73.000 73.0000 1.168400e+12 1.168400e+12 1.168400e+12 0.000000 12.067592 12.067592 12.067592 12.067592
Colorectal Lynch 0.500000 3.000000e+10 200000000 1.168400e+12 73.0000 73.000 73.0000 1.168400e+12 1.168400e+12 1.168400e+12 -0.301030 12.067592 12.067592 12.067592 12.067592
Duodenum adenocarcinoma 0.000300 6.800000e+08 4000000 7.688000e+09 24.0000 24.000 24.0000 7.688000e+09 7.688000e+09 7.688000e+09 -3.522879 9.885813 9.885813 9.885813 9.885813
Duodenum adenocarcinoma with FAP 0.035000 6.800000e+08 4000000 7.688000e+09 24.0000 24.000 24.0000 7.688000e+09 7.688000e+09 7.688000e+09 -1.455932 9.885813 9.885813 9.885813 9.885813
Esophageal squamous cell carcinoma 0.001938 3.240000e+09 846000 1.179324e+09 17.4000 17.400 17.4000 1.179324e+09 1.179324e+09 1.179324e+09 -2.712646 9.071633 9.071633 9.071633 9.071633
Gallbladder non papillary adenocarcinoma 0.002800 1.600000e+08 1600000 7.795200e+07 0.5840 0.584 0.5840 7.795200e+07 7.795200e+07 7.795200e+07 -2.552842 7.891827 7.891827 7.891827 7.891827
Glioblastoma 0.002190 8.640000e+10 135000000 2.700000e+08 0.0000 0.000 0.0000 2.700000e+08 2.700000e+08 2.700000e+08 -2.659556 8.431364 8.431364 8.431364 8.431364
Head & neck squamous cell carcinoma 0.013800 1.670000e+10 18500000 3.185700e+10 21.5000 18.250 26.1000 3.285600e+10 2.704700e+10 3.866500e+10 -1.860121 10.503205 10.516615 10.432119 10.587318
Head & neck squamous cell carcinoma with HPV-16 0.079350 1.670000e+10 18500000 3.185700e+10 21.5000 18.250 26.1000 3.285600e+10 2.704700e+10 3.866500e+10 -1.100453 10.503205 10.516615 10.432119 10.587318
Hepatocellular carcinoma 0.007100 2.000000e+11 3010000000 2.257500e+11 0.9125 0.730 1.2167 2.404027e+11 1.818040e+11 2.990014e+11 -2.148742 11.353628 11.380939 11.259603 11.475673
Hepatocellular carcinoma with HCV 0.071000 2.000000e+11 3010000000 2.257500e+11 0.9125 0.730 1.2167 2.404027e+11 1.818040e+11 2.990014e+11 -1.148742 11.353628 11.380939 11.259603 11.475673
Lung adenocarcinoma (nonsmokers) 0.004500 4.000000e+11 1220000000 9.272000e+09 0.0700 0.070 0.0700 9.272000e+09 9.272000e+09 9.272000e+09 -2.346787 9.967173 9.967173 9.967173 9.967173
Lung adenocarcinoma (smokers) 0.081000 4.000000e+11 1220000000 9.272000e+09 0.0700 0.070 0.0700 9.272000e+09 9.272000e+09 9.272000e+09 -1.091515 9.967173 9.967173 9.967173 9.967173
Medulloblastoma 0.000110 8.500000e+10 136000000 2.720000e+08 0.0000 0.000 0.0000 2.720000e+08 2.720000e+08 2.720000e+08 -3.958607 8.434569 8.434569 8.434569 8.434569
Melanoma 0.020300 3.800000e+09 3800000000 7.615200e+11 2.4800 2.480 2.4800 7.615200e+11 7.615200e+11 7.615200e+11 -1.692504 11.881681 11.881681 11.881681 11.881681
Osteosarcoma 0.000350 1.900000e+10 4180000 3.076480e+07 0.0670 0.067 0.0670 3.076480e+07 3.076480e+07 3.076480e+07 -3.455932 7.488054 7.488054 7.488054 7.488054
Osteosarcoma of arms 0.000040 3.000000e+08 650000 4.783998e+06 0.0670 0.067 0.0670 4.783998e+06 4.783998e+06 4.783998e+06 -4.397940 6.679791 6.679791 6.679791 6.679791
Osteosarcoma of head 0.000030 3.900000e+08 860000 6.329598e+06 0.0670 0.067 0.0670 6.329598e+06 6.329598e+06 6.329598e+06 -4.519993 6.801376 6.801376 6.801376 6.801376
Osteosarcoma legs 0.000220 7.200000e+08 1590000 1.170240e+07 0.0670 0.067 0.0670 1.170240e+07 1.170240e+07 1.170240e+07 -3.657577 7.068275 7.068275 7.068275 7.068275
Osteosarcoma plevis 0.000030 2.000000e+08 450000 3.311998e+06 0.0670 0.067 0.0670 3.311998e+06 3.311998e+06 3.311998e+06 -4.522879 6.520090 6.520090 6.520090 6.520090
Ovarian germ cell 0.000411 1.100000e+07 11000000 2.200000e+07 0.0000 0.000 0.0000 2.200000e+07 2.200000e+07 2.200000e+07 -3.386158 7.342423 7.342423 7.342423 7.342423
Pancreatic ductal adenocarcinoma 0.013589 2.000000e+11 4180000000 3.427600e+11 1.0000 1.000 1.0000 3.427600e+11 3.427600e+11 3.427600e+11 -1.866813 11.534990 11.534990 11.534990 11.534990
Pancreatic endocrine 0.000194 2.950000e+09 74000000 6.068000e+09 1.0000 1.000 1.0000 6.068000e+09 6.068000e+09 6.068000e+09 -3.712198 9.783046 9.783046 9.783046 9.783046
Small intestine adenocarcinoma 0.000700 1.700000e+10 100000000 2.882000e+11 36.0000 36.000 36.0000 2.882000e+11 2.882000e+11 2.882000e+11 -3.154902 11.459694 11.459694 11.459694 11.459694
Testicular germ cell cancer 0.003700 2.160000e+10 7200000 3.355200e+09 5.8000 4.000 6.0000 2.894400e+09 2.318400e+09 3.470400e+09 -2.431798 9.525718 9.461559 9.365188 9.540380
Thyroid papillary/follicular carcinoma 0.010260 1.000000e+10 65000000 5.824000e+08 0.0870 0.069 0.1180 6.162000e+08 4.888000e+08 7.436000e+08 -1.988853 8.765221 8.789722 8.689131 8.871339
Thyroid medullary carcinoma 0.000324 1.000000e+09 6500000 5.824000e+07 0.0870 0.069 0.1180 6.162000e+07 4.888000e+07 7.436000e+07 -3.489455 7.765221 7.789722 7.689131 7.871339

Show points where there are interval estimates


In [7]:
# Take a look at top 5 entries
df[df['a']-df['b'] != 0]


Out[7]:
Life incident rate Total_cells_tissue Total_Stem_Cells lscd Devision_year d1 d2 x-bar a b LIR_log lscd_log lscd_mean_log lscd_low_log lscd_high_log
Cancer
AML 0.004100 3.000000e+12 135000000 1.298700e+11 12.0000 6.400 23.6600 1.625940e+11 6.939000e+10 2.557980e+11 -2.387216 11.113509 11.211105 10.841297 11.407897
CLL 0.005200 3.000000e+12 135000000 1.298700e+11 12.0000 6.400 23.6600 1.625940e+11 6.939000e+10 2.557980e+11 -2.283997 11.113509 11.211105 10.841297 11.407897
Head & neck squamous cell carcinoma 0.013800 1.670000e+10 18500000 3.185700e+10 21.5000 18.250 26.1000 3.285600e+10 2.704700e+10 3.866500e+10 -1.860121 10.503205 10.516615 10.432119 10.587318
Head & neck squamous cell carcinoma with HPV-16 0.079350 1.670000e+10 18500000 3.185700e+10 21.5000 18.250 26.1000 3.285600e+10 2.704700e+10 3.866500e+10 -1.100453 10.503205 10.516615 10.432119 10.587318
Hepatocellular carcinoma 0.007100 2.000000e+11 3010000000 2.257500e+11 0.9125 0.730 1.2167 2.404027e+11 1.818040e+11 2.990014e+11 -2.148742 11.353628 11.380939 11.259603 11.475673
Hepatocellular carcinoma with HCV 0.071000 2.000000e+11 3010000000 2.257500e+11 0.9125 0.730 1.2167 2.404027e+11 1.818040e+11 2.990014e+11 -1.148742 11.353628 11.380939 11.259603 11.475673
Testicular germ cell cancer 0.003700 2.160000e+10 7200000 3.355200e+09 5.8000 4.000 6.0000 2.894400e+09 2.318400e+09 3.470400e+09 -2.431798 9.525718 9.461559 9.365188 9.540380
Thyroid papillary/follicular carcinoma 0.010260 1.000000e+10 65000000 5.824000e+08 0.0870 0.069 0.1180 6.162000e+08 4.888000e+08 7.436000e+08 -1.988853 8.765221 8.789722 8.689131 8.871339
Thyroid medullary carcinoma 0.000324 1.000000e+09 6500000 5.824000e+07 0.0870 0.069 0.1180 6.162000e+07 4.888000e+07 7.436000e+07 -3.489455 7.765221 7.789722 7.689131 7.871339

Plot a few scatter diagrams to show the relationships.

X: LSCD (lifetime cummulative cell divisions in organs) and Y (cancer incidence rate).

log-log scatter plot. Compute $\rho(\log_{10}X, \log_{10}Y))$


In [8]:
xi = 3; yi = 0
rho = stats.pearsonr(np.log(df[df.columns[xi]]), np.log(df[df.columns[yi]]))
spear = stats.spearmanr(np.log(df[df.columns[xi]]), np.log(df[df.columns[yi]]))
title = "Original TV Paper. Log(x) vs Log(Y) rho = %f,  pval = %.2e, Spearman: %f" %(rho[0], rho[1], spear[0])
df.plot(xi, yi, kind='scatter', logx=True, logy=True, title=title);


log-log scatter plot. Compute $\rho(X, Y))$


In [9]:
xi = 3; yi = 0
rho = stats.pearsonr(df[df.columns[xi]], df[df.columns[yi]])
title = "TV Paper. X vs Y rho = %f pval = %.2e, Spearman: %f" %(rho[0], rho[1], stats.spearmanr(df[df.columns[xi]], df[df.columns[yi]])[0])
df.plot(xi, yi, kind='scatter', logx=True, logy=True, title=title);


Scatter plot. Compute $\rho(X, Y))$


In [10]:
xi = 3; yi = 0
rho = stats.pearsonr(df[df.columns[xi]], df[df.columns[yi]])
title = "TV Paper. X vs Y rho = %f pval = %.2e, Spearman: %f" %(rho[0], rho[1], stats.spearmanr(df[df.columns[xi]], df[df.columns[yi]])[0])
df.plot(xi, yi, kind='scatter', logx=False, logy=False, title=title);


X axis in log scale. Scatter plot. Compute $\rho(X, Y))$


In [11]:
xi = 3; yi = 0
rho = stats.pearsonr(df[df.columns[xi]], df[df.columns[yi]])
title = "TV Paper. X vs Y rho = %f pval = %.2e, Spearman: %f" %(rho[0], rho[1], stats.spearmanr(df[df.columns[xi]], df[df.columns[yi]])[0])
df.plot(xi, yi, kind='scatter', logx=True, logy=False, title=title);


Lower bound on Pearson's $\rho$ Model 2


In [12]:
def rho_lower_bound_model2(x, y, delta):
    """
    Compute lower bound on Pearson's correlation coefficient, given
    the X, Y observations and the half interval width delta = (a_i + b_i)/2
    """
    n = len(x)

    # first, calclulate the correlation
    mu_xy = np.mean(x*y)
    mu_x = np.mean(x)
    mu_y = np.mean(y)
    sigma_x = np.std(x)
    sigma_y = np.std(y)

    rho = (mu_xy - mu_x*mu_y)/(sigma_x*sigma_y)

    print "rho: %f, coeff. of determination: %f"% (rho, rho**2)
    
    # Now the lower bound
    beta = abs(np.mean(delta*y)) + abs(mu_y*norm(delta, 1)/n)
    
    alpha = ( n/(n-1) )* (norm(delta, 2)**2/n + 
                          3*norm(delta, 1)**2/n**2 + 
                          2*norm(delta*(x - mu_x), 1)/n
                          )
    
    rho_lower = (mu_xy - mu_x*mu_y - beta)/(sqrt(sigma_x**2 + alpha)*sigma_y)

    print "rho lower bound: %f, coeff. of determination lower bound: %f"% (rho_lower, rho_lower**2)

Compute lower bound for $\rho(\log(x),\log(y))$


In [13]:
x = np.log10(np.array(df['x-bar']))
y = np.log10(np.array(df['Life incident rate']))
delta = np.array(np.log10(df['b'])-np.log10(df['a']))/2

rho_lower_bound_model2(x, y, delta)


rho: 0.802479, coeff. of determination: 0.643972
rho lower bound: 0.709605, coeff. of determination lower bound: 0.503540

This time compute lower bound for $\rho(x, y)$


In [14]:
x = np.array(df['x-bar'])
y = np.array(df['Life incident rate'])
delta = np.array(df['b']-df['a'])/2

rho_lower_bound_model2(x, y, delta)


rho: 0.532764, coeff. of determination: 0.283838
rho lower bound: 0.524074, coeff. of determination lower bound: 0.274653

Remarks

  1. The lower bound seems to be much tighter in the case of $\rho(x, y)$ than $\rho(\log(x),\log(y))$
  2. The bound on the coefficient of determination $r^2 = 0.49$ means that bad-luck (stochastic errors during DNA replication) explains at least 49% of the variation in the incidence of cancer across tissue types which is still significant in my opinion. We emphasize that this variation does not explain the role of bad-luck in individuals getting cancers.