In [1]:

# Thank you Damian Kao http://blog.nextgenetics.net/?e=102
from IPython.display import HTML
HTML('''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide(); } else {$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle); </script> The raw code for this IPython notebook is by default hidden for easier reading. To toggle on/off the raw code, click <a href="javascript:code_toggle()">here</a>.''')   Out[1]: code_show=true; function code_toggle() { if (code_show){$('div.input').hide();
} else {
$('div.input').show(); } code_show = !code_show }$( document ).ready(code_toggle);

The raw code for this IPython notebook is by default hidden for easier reading.




In [2]:

## Import Libraries




In [3]:

from __future__ import division
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pylab
from math import sqrt
from matplotlib import rc
import scipy.stats as stats
import pandas as pd
from numpy.linalg import norm



## Load data, calculate Lifetime Stem Cell Cummulative Division (LSCD), LSCD boundry points: a and b, mean value of LSCD: x-bar and their respective $log_{10}$ values named appropriately. The interval values of LSCD stems from the fact that the estimated values of stem cell division cycle per year frequencies are interval estimates ( $d_1$ and $d_2$). The LSCD boundry points are estimated with the formula presented by T-V original paper.



In [4]:

y = df['Life incident rate']
lscd = df['Total_Stem_Cells']*(df['Devision_year']*80 + 2) -2

df['lscd'] = lscd

x_bar = df['Total_Stem_Cells']*((df['d1']+df['d2'])*40 + 2) -2
a = df['Total_Stem_Cells']*(df['d1']*80 + 2) -2
b = df['Total_Stem_Cells']*(df['d2']*80 + 2) -2
df['x-bar'] = x_bar
df['a'] = a
df['b'] = b

df['LIR_log'] = np.log10(y)
df['lscd_log'] = np.log10(lscd)
df['lscd_mean_log'] = np.log10(x_bar)
df['lscd_low_log'] = np.log10(a)
df['lscd_high_log'] = np.log10(b)




In [5]:

# # Save data in CSV format

# # Print the data in latex format table
# format_dict = dict.fromkeys(df.columns, lambda x: '{:.2e}'.format(x))
# format_dict.update(dict.fromkeys(['Devision_year', 'd1', 'd2'], lambda x: '{:f}'.format(x)))

# df.to_latex('Voglestine-Intervals-data.tex', float_format=lambda x: '{:.2e}'.format(x), index=True, header=True)




In [6]:




Out[6]:

Life incident rate
Total_cells_tissue
Total_Stem_Cells
lscd
Devision_year
d1
d2
x-bar
a
b
LIR_log
lscd_log
lscd_mean_log
lscd_low_log
lscd_high_log

Cancer

AML
0.004100
3.000000e+12
135000000
1.298700e+11
12.0000
6.400
23.6600
1.625940e+11
6.939000e+10
2.557980e+11
-2.387216
11.113509
11.211105
10.841297
11.407897

BCC
0.300000
2.000000e+11
5820000000
3.550200e+12
7.6000
7.600
7.6000
3.550200e+12
3.550200e+12
3.550200e+12
-0.522879
12.550253
12.550253
12.550253
12.550253

CLL
0.005200
3.000000e+12
135000000
1.298700e+11
12.0000
6.400
23.6600
1.625940e+11
6.939000e+10
2.557980e+11
-2.283997
11.113509
11.211105
10.841297
11.407897

Colorectal
0.048000
3.000000e+10
200000000
1.168400e+12
73.0000
73.000
73.0000
1.168400e+12
1.168400e+12
1.168400e+12
-1.318759
12.067592
12.067592
12.067592
12.067592

Colorectal FAP
1.000000
3.000000e+10
200000000
1.168400e+12
73.0000
73.000
73.0000
1.168400e+12
1.168400e+12
1.168400e+12
0.000000
12.067592
12.067592
12.067592
12.067592

Colorectal Lynch
0.500000
3.000000e+10
200000000
1.168400e+12
73.0000
73.000
73.0000
1.168400e+12
1.168400e+12
1.168400e+12
-0.301030
12.067592
12.067592
12.067592
12.067592

0.000300
6.800000e+08
4000000
7.688000e+09
24.0000
24.000
24.0000
7.688000e+09
7.688000e+09
7.688000e+09
-3.522879
9.885813
9.885813
9.885813
9.885813

0.035000
6.800000e+08
4000000
7.688000e+09
24.0000
24.000
24.0000
7.688000e+09
7.688000e+09
7.688000e+09
-1.455932
9.885813
9.885813
9.885813
9.885813

Esophageal squamous cell carcinoma
0.001938
3.240000e+09
846000
1.179324e+09
17.4000
17.400
17.4000
1.179324e+09
1.179324e+09
1.179324e+09
-2.712646
9.071633
9.071633
9.071633
9.071633

0.002800
1.600000e+08
1600000
7.795200e+07
0.5840
0.584
0.5840
7.795200e+07
7.795200e+07
7.795200e+07
-2.552842
7.891827
7.891827
7.891827
7.891827

Glioblastoma
0.002190
8.640000e+10
135000000
2.700000e+08
0.0000
0.000
0.0000
2.700000e+08
2.700000e+08
2.700000e+08
-2.659556
8.431364
8.431364
8.431364
8.431364

Head & neck squamous cell carcinoma
0.013800
1.670000e+10
18500000
3.185700e+10
21.5000
18.250
26.1000
3.285600e+10
2.704700e+10
3.866500e+10
-1.860121
10.503205
10.516615
10.432119
10.587318

Head & neck squamous cell carcinoma with HPV-16
0.079350
1.670000e+10
18500000
3.185700e+10
21.5000
18.250
26.1000
3.285600e+10
2.704700e+10
3.866500e+10
-1.100453
10.503205
10.516615
10.432119
10.587318

Hepatocellular carcinoma
0.007100
2.000000e+11
3010000000
2.257500e+11
0.9125
0.730
1.2167
2.404027e+11
1.818040e+11
2.990014e+11
-2.148742
11.353628
11.380939
11.259603
11.475673

Hepatocellular carcinoma with HCV
0.071000
2.000000e+11
3010000000
2.257500e+11
0.9125
0.730
1.2167
2.404027e+11
1.818040e+11
2.990014e+11
-1.148742
11.353628
11.380939
11.259603
11.475673

0.004500
4.000000e+11
1220000000
9.272000e+09
0.0700
0.070
0.0700
9.272000e+09
9.272000e+09
9.272000e+09
-2.346787
9.967173
9.967173
9.967173
9.967173

0.081000
4.000000e+11
1220000000
9.272000e+09
0.0700
0.070
0.0700
9.272000e+09
9.272000e+09
9.272000e+09
-1.091515
9.967173
9.967173
9.967173
9.967173

Medulloblastoma
0.000110
8.500000e+10
136000000
2.720000e+08
0.0000
0.000
0.0000
2.720000e+08
2.720000e+08
2.720000e+08
-3.958607
8.434569
8.434569
8.434569
8.434569

Melanoma
0.020300
3.800000e+09
3800000000
7.615200e+11
2.4800
2.480
2.4800
7.615200e+11
7.615200e+11
7.615200e+11
-1.692504
11.881681
11.881681
11.881681
11.881681

Osteosarcoma
0.000350
1.900000e+10
4180000
3.076480e+07
0.0670
0.067
0.0670
3.076480e+07
3.076480e+07
3.076480e+07
-3.455932
7.488054
7.488054
7.488054
7.488054

Osteosarcoma of arms
0.000040
3.000000e+08
650000
4.783998e+06
0.0670
0.067
0.0670
4.783998e+06
4.783998e+06
4.783998e+06
-4.397940
6.679791
6.679791
6.679791
6.679791

0.000030
3.900000e+08
860000
6.329598e+06
0.0670
0.067
0.0670
6.329598e+06
6.329598e+06
6.329598e+06
-4.519993
6.801376
6.801376
6.801376
6.801376

Osteosarcoma legs
0.000220
7.200000e+08
1590000
1.170240e+07
0.0670
0.067
0.0670
1.170240e+07
1.170240e+07
1.170240e+07
-3.657577
7.068275
7.068275
7.068275
7.068275

Osteosarcoma plevis
0.000030
2.000000e+08
450000
3.311998e+06
0.0670
0.067
0.0670
3.311998e+06
3.311998e+06
3.311998e+06
-4.522879
6.520090
6.520090
6.520090
6.520090

Ovarian germ cell
0.000411
1.100000e+07
11000000
2.200000e+07
0.0000
0.000
0.0000
2.200000e+07
2.200000e+07
2.200000e+07
-3.386158
7.342423
7.342423
7.342423
7.342423

0.013589
2.000000e+11
4180000000
3.427600e+11
1.0000
1.000
1.0000
3.427600e+11
3.427600e+11
3.427600e+11
-1.866813
11.534990
11.534990
11.534990
11.534990

Pancreatic endocrine
0.000194
2.950000e+09
74000000
6.068000e+09
1.0000
1.000
1.0000
6.068000e+09
6.068000e+09
6.068000e+09
-3.712198
9.783046
9.783046
9.783046
9.783046

0.000700
1.700000e+10
100000000
2.882000e+11
36.0000
36.000
36.0000
2.882000e+11
2.882000e+11
2.882000e+11
-3.154902
11.459694
11.459694
11.459694
11.459694

Testicular germ cell cancer
0.003700
2.160000e+10
7200000
3.355200e+09
5.8000
4.000
6.0000
2.894400e+09
2.318400e+09
3.470400e+09
-2.431798
9.525718
9.461559
9.365188
9.540380

Thyroid papillary/follicular carcinoma
0.010260
1.000000e+10
65000000
5.824000e+08
0.0870
0.069
0.1180
6.162000e+08
4.888000e+08
7.436000e+08
-1.988853
8.765221
8.789722
8.689131
8.871339

Thyroid medullary carcinoma
0.000324
1.000000e+09
6500000
5.824000e+07
0.0870
0.069
0.1180
6.162000e+07
4.888000e+07
7.436000e+07
-3.489455
7.765221
7.789722
7.689131
7.871339



## Show points where there are interval estimates



In [7]:

# Take a look at top 5 entries
df[df['a']-df['b'] != 0]




Out[7]:

Life incident rate
Total_cells_tissue
Total_Stem_Cells
lscd
Devision_year
d1
d2
x-bar
a
b
LIR_log
lscd_log
lscd_mean_log
lscd_low_log
lscd_high_log

Cancer

AML
0.004100
3.000000e+12
135000000
1.298700e+11
12.0000
6.400
23.6600
1.625940e+11
6.939000e+10
2.557980e+11
-2.387216
11.113509
11.211105
10.841297
11.407897

CLL
0.005200
3.000000e+12
135000000
1.298700e+11
12.0000
6.400
23.6600
1.625940e+11
6.939000e+10
2.557980e+11
-2.283997
11.113509
11.211105
10.841297
11.407897

Head & neck squamous cell carcinoma
0.013800
1.670000e+10
18500000
3.185700e+10
21.5000
18.250
26.1000
3.285600e+10
2.704700e+10
3.866500e+10
-1.860121
10.503205
10.516615
10.432119
10.587318

Head & neck squamous cell carcinoma with HPV-16
0.079350
1.670000e+10
18500000
3.185700e+10
21.5000
18.250
26.1000
3.285600e+10
2.704700e+10
3.866500e+10
-1.100453
10.503205
10.516615
10.432119
10.587318

Hepatocellular carcinoma
0.007100
2.000000e+11
3010000000
2.257500e+11
0.9125
0.730
1.2167
2.404027e+11
1.818040e+11
2.990014e+11
-2.148742
11.353628
11.380939
11.259603
11.475673

Hepatocellular carcinoma with HCV
0.071000
2.000000e+11
3010000000
2.257500e+11
0.9125
0.730
1.2167
2.404027e+11
1.818040e+11
2.990014e+11
-1.148742
11.353628
11.380939
11.259603
11.475673

Testicular germ cell cancer
0.003700
2.160000e+10
7200000
3.355200e+09
5.8000
4.000
6.0000
2.894400e+09
2.318400e+09
3.470400e+09
-2.431798
9.525718
9.461559
9.365188
9.540380

Thyroid papillary/follicular carcinoma
0.010260
1.000000e+10
65000000
5.824000e+08
0.0870
0.069
0.1180
6.162000e+08
4.888000e+08
7.436000e+08
-1.988853
8.765221
8.789722
8.689131
8.871339

Thyroid medullary carcinoma
0.000324
1.000000e+09
6500000
5.824000e+07
0.0870
0.069
0.1180
6.162000e+07
4.888000e+07
7.436000e+07
-3.489455
7.765221
7.789722
7.689131
7.871339



# Plot a few scatter diagrams to show the relationships.

## log-log scatter plot. Compute $\rho(\log_{10}X, \log_{10}Y))$



In [8]:

xi = 3; yi = 0
rho = stats.pearsonr(np.log(df[df.columns[xi]]), np.log(df[df.columns[yi]]))
spear = stats.spearmanr(np.log(df[df.columns[xi]]), np.log(df[df.columns[yi]]))
title = "Original TV Paper. Log(x) vs Log(Y) rho = %f,  pval = %.2e, Spearman: %f" %(rho[0], rho[1], spear[0])
df.plot(xi, yi, kind='scatter', logx=True, logy=True, title=title);






## log-log scatter plot. Compute $\rho(X, Y))$



In [9]:

xi = 3; yi = 0
rho = stats.pearsonr(df[df.columns[xi]], df[df.columns[yi]])
title = "TV Paper. X vs Y rho = %f pval = %.2e, Spearman: %f" %(rho[0], rho[1], stats.spearmanr(df[df.columns[xi]], df[df.columns[yi]])[0])
df.plot(xi, yi, kind='scatter', logx=True, logy=True, title=title);






## Scatter plot. Compute $\rho(X, Y))$



In [10]:

xi = 3; yi = 0
rho = stats.pearsonr(df[df.columns[xi]], df[df.columns[yi]])
title = "TV Paper. X vs Y rho = %f pval = %.2e, Spearman: %f" %(rho[0], rho[1], stats.spearmanr(df[df.columns[xi]], df[df.columns[yi]])[0])
df.plot(xi, yi, kind='scatter', logx=False, logy=False, title=title);






## X axis in log scale. Scatter plot. Compute $\rho(X, Y))$



In [11]:

xi = 3; yi = 0
rho = stats.pearsonr(df[df.columns[xi]], df[df.columns[yi]])
title = "TV Paper. X vs Y rho = %f pval = %.2e, Spearman: %f" %(rho[0], rho[1], stats.spearmanr(df[df.columns[xi]], df[df.columns[yi]])[0])
df.plot(xi, yi, kind='scatter', logx=True, logy=False, title=title);






# Lower bound on Pearson's $\rho$ Model 2



In [12]:

def rho_lower_bound_model2(x, y, delta):
"""
Compute lower bound on Pearson's correlation coefficient, given
the X, Y observations and the half interval width delta = (a_i + b_i)/2
"""
n = len(x)

# first, calclulate the correlation
mu_xy = np.mean(x*y)
mu_x = np.mean(x)
mu_y = np.mean(y)
sigma_x = np.std(x)
sigma_y = np.std(y)

rho = (mu_xy - mu_x*mu_y)/(sigma_x*sigma_y)

print "rho: %f, coeff. of determination: %f"% (rho, rho**2)

# Now the lower bound
beta = abs(np.mean(delta*y)) + abs(mu_y*norm(delta, 1)/n)

alpha = ( n/(n-1) )* (norm(delta, 2)**2/n +
3*norm(delta, 1)**2/n**2 +
2*norm(delta*(x - mu_x), 1)/n
)

rho_lower = (mu_xy - mu_x*mu_y - beta)/(sqrt(sigma_x**2 + alpha)*sigma_y)

print "rho lower bound: %f, coeff. of determination lower bound: %f"% (rho_lower, rho_lower**2)



## Compute lower bound for $\rho(\log(x),\log(y))$



In [13]:

x = np.log10(np.array(df['x-bar']))
y = np.log10(np.array(df['Life incident rate']))
delta = np.array(np.log10(df['b'])-np.log10(df['a']))/2

rho_lower_bound_model2(x, y, delta)




rho: 0.802479, coeff. of determination: 0.643972
rho lower bound: 0.709605, coeff. of determination lower bound: 0.503540



## This time compute lower bound for $\rho(x, y)$



In [14]:

x = np.array(df['x-bar'])
y = np.array(df['Life incident rate'])
delta = np.array(df['b']-df['a'])/2

rho_lower_bound_model2(x, y, delta)




rho: 0.532764, coeff. of determination: 0.283838
rho lower bound: 0.524074, coeff. of determination lower bound: 0.274653



# Remarks

1. The lower bound seems to be much tighter in the case of $\rho(x, y)$ than $\rho(\log(x),\log(y))$
2. The bound on the coefficient of determination $r^2 = 0.49$ means that bad-luck (stochastic errors during DNA replication) explains at least 49% of the variation in the incidence of cancer across tissue types which is still significant in my opinion. We emphasize that this variation does not explain the role of bad-luck in individuals getting cancers.