In [1]:
# Thank you Damian Kao http://blog.nextgenetics.net/?e=102
from IPython.display import HTML
HTML('''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
The raw code for this IPython notebook is by default hidden for easier reading.
To toggle on/off the raw code, click <a href="javascript:code_toggle()">here</a>.''')
Out[1]:
In [2]:
## Import Libraries
In [3]:
from __future__ import division
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pylab
from math import sqrt
from matplotlib import rc
import scipy.stats as stats
import pandas as pd
from numpy.linalg import norm
In [4]:
df = pd.read_table('Voglestein.csv', header=0, index_col=0, sep=',')
y = df['Life incident rate']
lscd = df['Total_Stem_Cells']*(df['Devision_year']*80 + 2) -2
df['lscd'] = lscd
x_bar = df['Total_Stem_Cells']*((df['d1']+df['d2'])*40 + 2) -2
a = df['Total_Stem_Cells']*(df['d1']*80 + 2) -2
b = df['Total_Stem_Cells']*(df['d2']*80 + 2) -2
df['x-bar'] = x_bar
df['a'] = a
df['b'] = b
df['LIR_log'] = np.log10(y)
df['lscd_log'] = np.log10(lscd)
df['lscd_mean_log'] = np.log10(x_bar)
df['lscd_low_log'] = np.log10(a)
df['lscd_high_log'] = np.log10(b)
In [5]:
# # Save data in CSV format
# df.to_csv('Vogelestien-log.csv', sep=',', header=True, index=True)
# # Print the data in latex format table
# format_dict = dict.fromkeys(df.columns, lambda x: '{:.2e}'.format(x))
# format_dict.update(dict.fromkeys(['Devision_year', 'd1', 'd2'], lambda x: '{:f}'.format(x)))
# df.to_latex('Voglestine-Intervals-data.tex', float_format=lambda x: '{:.2e}'.format(x), index=True, header=True)
In [6]:
df.head(50)
Out[6]:
In [7]:
# Take a look at top 5 entries
df[df['a']-df['b'] != 0]
Out[7]:
In [8]:
xi = 3; yi = 0
rho = stats.pearsonr(np.log(df[df.columns[xi]]), np.log(df[df.columns[yi]]))
spear = stats.spearmanr(np.log(df[df.columns[xi]]), np.log(df[df.columns[yi]]))
title = "Original TV Paper. Log(x) vs Log(Y) rho = %f, pval = %.2e, Spearman: %f" %(rho[0], rho[1], spear[0])
df.plot(xi, yi, kind='scatter', logx=True, logy=True, title=title);
In [9]:
xi = 3; yi = 0
rho = stats.pearsonr(df[df.columns[xi]], df[df.columns[yi]])
title = "TV Paper. X vs Y rho = %f pval = %.2e, Spearman: %f" %(rho[0], rho[1], stats.spearmanr(df[df.columns[xi]], df[df.columns[yi]])[0])
df.plot(xi, yi, kind='scatter', logx=True, logy=True, title=title);
In [10]:
xi = 3; yi = 0
rho = stats.pearsonr(df[df.columns[xi]], df[df.columns[yi]])
title = "TV Paper. X vs Y rho = %f pval = %.2e, Spearman: %f" %(rho[0], rho[1], stats.spearmanr(df[df.columns[xi]], df[df.columns[yi]])[0])
df.plot(xi, yi, kind='scatter', logx=False, logy=False, title=title);
In [11]:
xi = 3; yi = 0
rho = stats.pearsonr(df[df.columns[xi]], df[df.columns[yi]])
title = "TV Paper. X vs Y rho = %f pval = %.2e, Spearman: %f" %(rho[0], rho[1], stats.spearmanr(df[df.columns[xi]], df[df.columns[yi]])[0])
df.plot(xi, yi, kind='scatter', logx=True, logy=False, title=title);
In [12]:
def rho_lower_bound_model2(x, y, delta):
"""
Compute lower bound on Pearson's correlation coefficient, given
the X, Y observations and the half interval width delta = (a_i + b_i)/2
"""
n = len(x)
# first, calclulate the correlation
mu_xy = np.mean(x*y)
mu_x = np.mean(x)
mu_y = np.mean(y)
sigma_x = np.std(x)
sigma_y = np.std(y)
rho = (mu_xy - mu_x*mu_y)/(sigma_x*sigma_y)
print "rho: %f, coeff. of determination: %f"% (rho, rho**2)
# Now the lower bound
beta = abs(np.mean(delta*y)) + abs(mu_y*norm(delta, 1)/n)
alpha = ( n/(n-1) )* (norm(delta, 2)**2/n +
3*norm(delta, 1)**2/n**2 +
2*norm(delta*(x - mu_x), 1)/n
)
rho_lower = (mu_xy - mu_x*mu_y - beta)/(sqrt(sigma_x**2 + alpha)*sigma_y)
print "rho lower bound: %f, coeff. of determination lower bound: %f"% (rho_lower, rho_lower**2)
In [13]:
x = np.log10(np.array(df['x-bar']))
y = np.log10(np.array(df['Life incident rate']))
delta = np.array(np.log10(df['b'])-np.log10(df['a']))/2
rho_lower_bound_model2(x, y, delta)
In [14]:
x = np.array(df['x-bar'])
y = np.array(df['Life incident rate'])
delta = np.array(df['b']-df['a'])/2
rho_lower_bound_model2(x, y, delta)