In [1]:
%matplotlib inline

In [2]:
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
pd.set_option('display.max_columns', 50) # print all rows


import os
os.chdir('/Users/evanbiederstedt/Downloads/RRBS_data_files')

In [3]:
stats = pd.read_csv("all_RRBS_statistics_final.csv")

In [4]:
stats.shape


Out[4]:
(438, 15)

In [5]:
normal = stats[stats["type"]=="normal"]
CLL = stats[stats["type"]=="CLL"]

In [6]:
len(normal)


Out[6]:
336

In [7]:
len(CLL)


Out[7]:
102

In [8]:
import seaborn as sns
sns.set_style("whitegrid")
ax = sns.stripplot(x=stats["type"], y=stats["methylation"])
sns.plt.title("Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 336 *.anno files; CLL cells = 102 *.anno files")
plt.ylabel("Percent Methylation, weighted")


Out[8]:
<matplotlib.text.Text at 0x113fcb470>

In [9]:
import seaborn as sns
sns.set_style("whitegrid")
ax = sns.stripplot(x=stats["type"], y=stats["methylation"], jitter=True)
sns.plt.title("Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 336 *.anno files; CLL cells = 102 *.anno files")
plt.ylabel("Percent Methylation, weighted")


Out[9]:
<matplotlib.text.Text at 0x11414ddd8>

In [10]:
ax = sns.violinplot(x=stats["type"],  y=stats["methylation"])
sns.plt.title("Violin plot: Methylation per cell, normal vs CLL")
plt.xlabel("Normal cells = 336 *.anno files; CLL cells = 102 *.anno files")
plt.ylabel("Percent Methylation, weighted")
print("violin plot features a kernel density estimation of the underlying distribution")


violin plot features a kernel density estimation of the underlying distribution

In [11]:
ax = sns.boxplot(x=stats["type"],y=stats["methylation"], linewidth=1.5)
sns.plt.title("Box Whisker plot: Methylation per cell, normal vs CLL")
plt.xlabel("Normal cells = 336 *.anno files; CLL cells = 102 *.anno files")
plt.ylabel("Percent Methylation, weighted")
print("Box whisker plot")


Box whisker plot

In [12]:
ax = sns.boxplot(x=stats["type"],y=stats["methylation"], linewidth=1.5)
sns.plt.title("Violin plot: Methylation per cell, normal vs CLL")
plt.xlabel("Normal cells = 336 *.anno files; CLL cells = 102 *.anno files")
plt.ylabel("Percent Methylation, weighted")
print("Box whisker plot")
#plt.ylim(0.35, 0.7)


Box whisker plot

In [13]:
ax = sns.boxplot(y=stats["type"], x=stats["methylation"], linewidth=1.5)
sns.plt.title("Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 336 *.anno files; CLL cells = 102 *.anno files")
plt.ylabel("Cell Type: Normal cells vs. CLL samples")


Out[13]:
<matplotlib.text.Text at 0x11457b588>

In [14]:
ax = sns.boxplot(x=stats["type"], y=stats["methylation"], linewidth=1.5)
ax = sns.stripplot(x=stats["type"], y=stats["methylation"], color=".25", linewidth=1.5, jitter=True)
sns.plt.title("Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 336 *.anno files; CLL cells = 102 *.anno files")


Out[14]:
<matplotlib.text.Text at 0x114578898>

In [15]:
combined = stats

In [16]:
ax = sns.boxplot(y=combined["type"],  x=combined["methylation"], linewidth=1.5)
ax = sns.stripplot(y=combined["type"], x=combined["methylation"], color=".25", linewidth=1.5, jitter=True)
sns.plt.title("Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 336 *.anno files; CLL cells = 102 *.anno files")
plt.ylabel("Percent Methylation, weighted")


Out[16]:
<matplotlib.text.Text at 0x1148ab7b8>

In [17]:
ax = sns.boxplot(x=combined["type"], y=combined["methylation"], linewidth=1.5)
ax = sns.swarmplot(x=combined["type"], y=combined["methylation"], color=".25", linewidth=1.5)
sns.plt.title("Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 336 *.anno files; CLL cells = 102 *.anno files")
plt.ylabel("Percent Methylation, weighted")


Out[17]:
<matplotlib.text.Text at 0x114931e80>

In [18]:
ax = sns.boxplot(y=combined["type"], x=combined["methylation"], linewidth=1.5)
ax = sns.swarmplot(y=combined["type"], x=combined["methylation"], color=".25", linewidth=1.5)
sns.plt.title("Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 336 *.anno files; CLL cells = 102 *.anno files")
plt.ylabel("Percent Methylation, weighted")


Out[18]:
<matplotlib.text.Text at 0x114a83da0>

In [19]:
ax = sns.boxplot(y=combined["type"], x=combined["methylation"], linewidth=1.5)
ax = sns.swarmplot(y=combined["type"], x=combined["methylation"], color=".25", linewidth=1.5)
sns.plt.title("Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 336 *.anno files; CLL cells = 102 *.anno files")
plt.ylabel("Percent Methylation, weighted")


Out[19]:
<matplotlib.text.Text at 0x114bc6748>

In [20]:
combined2 = stats

In [21]:
import seaborn as sns
sns.set_style("whitegrid")
ax = sns.stripplot(x=combined2["type"], y=combined2["methylation"], hue=combined2.bio)
sns.plt.title("Biological sorting: Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 336 *.anno files; CLL cells = 102 *.anno files")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)
plt.ylabel("Percent Methylation, weighted")


Out[21]:
<matplotlib.text.Text at 0x114d08eb8>

In [22]:
import seaborn as sns
sns.set(style="whitegrid", palette="muted")
ax = sns.stripplot(x=combined2["type"], y=combined2["methylation"], hue=combined2.bio, 
                   jitter=True, palette=dict(normal_B = 'b', CD19CD27m = 'g', CD19CD27p = 'r', CLL='m', CD19p = 'c'))
sns.plt.title("Biological sorting: Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 336 *.anno files; CLL cells = 102 *.anno files")
plt.ylabel("Percent Methylation, weighted")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)


Out[22]:
<matplotlib.legend.Legend at 0x114e0d780>

In [23]:
import seaborn as sns
sns.set_style("whitegrid")
ax = sns.stripplot(x=combined2["type"], y=combined2["methylation"], hue=combined2.bio, jitter=True)
sns.plt.title("Biological sorting: Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 336 *.anno files; CLL cells = 102 *.anno files")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)
plt.ylabel("Percent Methylation, weighted")


Out[23]:
<matplotlib.text.Text at 0x114566f60>

In [24]:
ax = sns.boxplot(x=combined2["type"],y=combined2["methylation"], linewidth=1.5)
plt.ylim(0.15,0.8)
sns.set_style("whitegrid")
ax = sns.stripplot(x=combined2["type"], y=combined2["methylation"], hue=combined2.bio, jitter=True, linewidth=1.5)
sns.plt.title("Biological sorting: Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 336 *.anno files; CLL cells = 102 *.anno files")
plt.ylabel("Percent Methylation, weighted")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)


Out[24]:
<matplotlib.legend.Legend at 0x1143f6278>

In [25]:
combined2.boxplot(column = 'methylation', by='type', fontsize=13)

sns.set_style("whitegrid")
ax = sns.stripplot(x=combined2["type"], y=combined2["methylation"], hue=combined2.bio, jitter=True, linewidth=1.5)
sns.plt.title("Biological sorting: Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 336 *.anno files; CLL cells = 102 *.anno files")
plt.ylabel("Percent Methylation, weighted")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)


Out[25]:
<matplotlib.legend.Legend at 0x1143f02b0>

In [26]:
combined2.boxplot(column = 'methylation', by='type', fontsize=13)
plt.title('corrected')


Out[26]:
<matplotlib.text.Text at 0x115386b70>

In [27]:
ax = sns.violinplot(x=combined2["type"],y=combined2["methylation"], palette="Set3")
sns.set_style("whitegrid")
ax = sns.stripplot(x=combined2["type"], y=combined2["methylation"], hue=combined2.bio, jitter=True, linewidth=1.5)
sns.plt.title("Biological sorting: Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 336 *.anno files; CLL cells = 102 *.anno files")
plt.ylabel("Percent Methylation, weighted")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)


Out[27]:
<matplotlib.legend.Legend at 0x115511e80>

In [28]:
sns.set_style("whitegrid")
ax = sns.stripplot(x=combined2["type"], y=combined2["methylation"], hue=combined2.protocol, jitter=True)
sns.plt.title("Technical/protocol sorting: Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 336 *.anno files; CLL cells = 102 *.anno files")
plt.ylabel("Percent Methylation, weighted")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)


Out[28]:
<matplotlib.legend.Legend at 0x11463df98>

In [29]:
sns.set_style("whitegrid")
ax = sns.stripplot(x=combined2["type"],y=combined2["methylation"], hue=combined2.protocol, jitter=True, linewidth=1.0)
sns.plt.title("Technical/protocol sorting: Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 336 *.anno files; CLL cells = 102 *.anno files")
plt.ylabel("Percent Methylation, weighted")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)


Out[29]:
<matplotlib.legend.Legend at 0x11539eb00>

In [30]:
ax = sns.violinplot(x=combined2["type"], y=combined2["methylation"], palette="Set1")
sns.set_style("whitegrid")
ax = sns.stripplot(x=combined2["type"], y=combined2["methylation"], hue=combined2.protocol, jitter=True, linewidth=1.0)
sns.plt.title("Technical/protocol sorting: Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 336 *.anno files; CLL cells = 102 *.anno files")
plt.ylabel("Percent Methylation, weighted")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)


Out[30]:
<matplotlib.legend.Legend at 0x115992208>

In [31]:
ax = sns.boxplot(x=combined["type"], y=combined["methylation"], linewidth=1.5)
ax = sns.stripplot(x=combined["type"], y=combined["methylation"], color=".25", linewidth=1.5, jitter=True)
sns.plt.title("Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 336 *.anno files; CLL cells = 102 *.anno files")


Out[31]:
<matplotlib.text.Text at 0x115b94f60>

In [32]:
from scipy.stats import ttest_ind

cat1 = combined[combined['type']=='CLL']
cat2 = combined[combined['type']=='normal']

t, p = ttest_ind(cat1.methylation, cat2.methylation)
print(str("t-statistic is ") + str(t))
print(str("p-value is ") + str(p))


t-statistic is 9.17191822645
p-value is 1.85901019525e-18

In [33]:
import scipy.stats
z, p = scipy.stats.mannwhitneyu(cat1.methylation, cat2.methylation)
p_value = p * 2
print(str("Mann-Whitney U statistic is ") + str(z))
print(str("p-value is ") + str(p_value))


Mann-Whitney U statistic is 7190.0
p-value is 6.56146204818e-19

In [34]:
sns.boxplot(x="bio", y="methylation", hue="type", data=combined2, palette="PRGn")
sns.despine(offset=10, trim=True)



In [35]:
sns.plt.title("Biological sorting: Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 336 *.anno files; CLL cells = 102 *.anno files")
sns.boxplot(x="bio", y="methylation", data=combined2)
sns.despine(offset=10, trim=True)



In [36]:
sns.plt.title("Biological sorting: Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 336 *.anno files; CLL cells = 102 *.anno files")
sns.boxplot(x="bio", y="methylation", data=combined2)
sns.stripplot(x="bio", y="methylation", data=combined2, color=".25", linewidth=0.5, jitter=True)
sns.despine(offset=10, trim=True)



In [37]:
sns.plt.title("Biological sorting: Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 336 *.anno files; CLL cells = 102 *.anno files")
sns.violinplot(x="bio", y="methylation", data=combined2)
sns.despine(offset=10, trim=True)



In [38]:
sns.plt.title("Biological sorting: Methylation per cell, normal vs CLL, all chromosomes, weighted")
sns.stripplot(x="bio", y="methylation", data=combined2, color=".25", linewidth=0.5, jitter=True)
plt.xlabel("Normal cells = 336 *.anno files; CLL cells = 102 *.anno files")
sns.violinplot(x="bio", y="methylation", data=combined2)
sns.despine(offset=10, trim=True)



In [39]:
final_real2 = stats

In [40]:
norm = final_real2[final_real2['type'] == 'normal']
print(len(norm))
cll = final_real2[final_real2['type'] == 'CLL']
print(len(cll))


336
102

In [41]:
ax = sns.boxplot(x=final_real2["type"], y=final_real2["methylation"], linewidth=1.5)
ax = sns.stripplot(x=final_real2["type"], y=final_real2["methylation"], color=".25", linewidth=1.5, jitter=True)
sns.plt.title("Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 336 *.anno files; CLL cells = 102 *.anno files")


Out[41]:
<matplotlib.text.Text at 0x1162be860>

In [42]:
print(norm['methylation'].mean())  # mean
print(cll['methylation'].mean())

print(norm['methylation'].std())   # standard deviation
print(cll['methylation'].std())


0.527522734711752
0.5730079311403684
0.04811959837228965
0.025040087120524586

In [43]:
t, p = ttest_ind(norm.methylation, cll.methylation)
print(str("t-statistic is ") + str(t))
print(str("p-value is ") + str(p))

#
# The classic Student's t-test is a parametric test that only works when the data are normally distributed.
# A good non-parametric alternative to the t-test is the Mann-Whitney U-test (called ranksum in matlab, 
#     or Wilcoxon test, or Man-Whitney-Wilcoxon)
#
import scipy.stats
z, p = scipy.stats.mannwhitneyu(norm.methylation, cll.methylation)
p_value = p * 2
print(str("Mann-Whitney U statistic is ") + str(z))
print(str("p-value is ") + str(p_value))


t-statistic is -9.17191822645
p-value is 1.85901019525e-18
Mann-Whitney U statistic is 7190.0
p-value is 6.56146204818e-19

In [44]:
ax = sns.boxplot(x=final_real2["type"], y=final_real2["methylation"], linewidth=1.5)
ax = sns.stripplot(x=final_real2["type"], y=final_real2["methylation"], color=".25", linewidth=1.5, jitter=True)
sns.plt.title("Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 336 *.anno files; CLL cells = 102 *.anno files")
x1, x2 = 0, 1   # columns 'Sat' and 'Sun' (first column: 0, see plt.xticks())
y, h, col = final_real2["methylation"].max() + .025, .01, 'k'
plt.plot([x1, x1, x2, x2], [y, y+h, y+h, y], lw=1.5, c=col)
plt.text((x1+x2)*.5, y+h, r'p = $3.56 \times 10^{-20}$', ha='center', va='bottom', color=col)


Out[44]:
<matplotlib.text.Text at 0x11619a828>

In [45]:
ax = sns.boxplot(x=final_real2["type"], y=final_real2["PDR_total"], linewidth=1.5)
ax = sns.stripplot(x=final_real2["type"], y=final_real2["PDR_total"], color=".25", linewidth=1.5, jitter=True)
sns.plt.title("PDR per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 336 *.anno files; CLL cells = 102 *.anno files")
plt.ylabel("PDR")


Out[45]:
<matplotlib.text.Text at 0x1166bb160>

In [46]:
print(norm['PDR_total'].mean())  # mean
print(cll['PDR_total'].mean())

print(norm['PDR_total'].std())   # standard deviation
print(cll['PDR_total'].std())


0.2688760809127509
0.3919598017928172
0.08845094185985881
0.015705379901830848

In [47]:
t, p = ttest_ind(norm.PDR_total, cll.PDR_total)
print(str("t-statistic is ") + str(t))
print(str("p-value is ") + str(p))

#
# The classic Student's t-test is a parametric test that only works when the data are normally distributed.
# A good non-parametric alternative to the t-test is the Mann-Whitney U-test (called ranksum in matlab, 
#     or Wilcoxon test, or Man-Whitney-Wilcoxon)
#
import scipy.stats
z, p = scipy.stats.mannwhitneyu(norm.PDR_total, cll.PDR_total)
p_value = p * 2
print(str("Mann-Whitney U statistic is ") + str(z))
print(str("p-value is ") + str(p_value))


t-statistic is -13.9764653096
p-value is 6.09415577859e-37
Mann-Whitney U statistic is 4698.0
p-value is 1.15165046304e-28

In [48]:
ax = sns.boxplot(x=final_real2["type"], y=final_real2["PDR_total"], linewidth=1.5)
ax = sns.stripplot(x=final_real2["type"], y=final_real2["PDR_total"], color=".25", linewidth=1.5, jitter=True)
sns.plt.title("PDR per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 336 *.anno files; CLL cells = 102 *.anno files")
x1, x2 = 0, 1   # columns 'Sat' and 'Sun' (first column: 0, see plt.xticks())
y, h, col = final_real2["PDR_total"].max() + .025, .01, 'k'
plt.plot([x1, x1, x2, x2], [y, y+h, y+h, y], lw=1.5, c=col)
plt.text((x1+x2)*.5, y+h, r'p = $1.30 \times 10^{-31}$', ha='center', va='bottom', color=col)
plt.ylabel("PDR")
plt.savefig('PDR_normal_CLL_weighted.eps', format='eps', dpi=1000000)



In [49]:
ax = sns.boxplot(x=final_real2["type"], y=final_real2["methylation_unweighted"], linewidth=1.5)
ax = sns.stripplot(x=final_real2["type"], y=final_real2["methylation_unweighted"], color=".25", linewidth=1.5, jitter=True)
sns.plt.title("Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 336 *.anno files; CLL cells = 102 *.anno files")


Out[49]:
<matplotlib.text.Text at 0x116b1b748>

In [50]:
print(norm['methylation_unweighted'].mean())  # mean
print(cll['methylation_unweighted'].mean())

print(norm['methylation_unweighted'].std())   # standard deviation
print(cll['methylation_unweighted'].std())


0.6219285808204109
0.6432324104102132
0.044653193387642454
0.018052359534246706

In [51]:
t, p = ttest_ind(norm.methylation_unweighted, cll.methylation_unweighted)
print(str("t-statistic is ") + str(t))
print(str("p-value is ") + str(p))

#
# The classic Student's t-test is a parametric test that only works when the data are normally distributed.
# A good non-parametric alternative to the t-test is the Mann-Whitney U-test (called ranksum in matlab, 
#     or Wilcoxon test, or Man-Whitney-Wilcoxon)
#
import scipy.stats
z, p = scipy.stats.mannwhitneyu(norm.methylation_unweighted, cll.methylation_unweighted)
p_value = p * 2
print(str("Mann-Whitney U statistic is ") + str(z))
print(str("p-value is ") + str(p_value))


t-statistic is -4.70017147795
p-value is 3.49145510596e-06
Mann-Whitney U statistic is 10665.0
p-value is 7.53018618598e-09

In [52]:
ax = sns.boxplot(x=final_real2["type"], y=final_real2["methylation_unweighted"], linewidth=1.5)
ax = sns.stripplot(x=final_real2["type"], y=final_real2["methylation_unweighted"], color=".25", linewidth=1.5, jitter=True)
sns.plt.title("Methylation per cell, normal vs CLL, all chromosomes, unweighted")
plt.xlabel("Normal cells = 336 *.anno files; CLL cells = 102 *.anno files")
x1, x2 = 0, 1   # columns 'Sat' and 'Sun' (first column: 0, see plt.xticks())
y, h, col = final_real2["methylation_unweighted"].max() + .025, .01, 'k'
plt.plot([x1, x1, x2, x2], [y, y+h, y+h, y], lw=1.5, c=col)
plt.text((x1+x2)*.5, y+h, r'p = $1.86 \times 10^{-9}$', ha='center', va='bottom', color=col)
plt.ylabel("Methylation")


Out[52]:
<matplotlib.text.Text at 0x1167abd30>

In [53]:
ax = sns.boxplot(x=final_real2["type"], y=final_real2["PDR_unweighted"], linewidth=1.5)
ax = sns.stripplot(x=final_real2["type"], y=final_real2["PDR_unweighted"], color=".25", linewidth=1.5, jitter=True)
sns.plt.title("PDR per cell, normal vs CLL, all chromosomes, unweighted")
plt.xlabel("Normal cells = 336 *.anno files; CLL cells = 102 *.anno files")
plt.ylabel("PDR")


Out[53]:
<matplotlib.text.Text at 0x1160fad30>

In [54]:
print(norm['PDR_unweighted'].mean())  # mean
print(cll['PDR_unweighted'].mean())

print(norm['PDR_unweighted'].std())   # standard deviation
print(cll['PDR_unweighted'].std())


0.26476768934672706
0.3716003158116928
0.08587856059468897
0.011327365722304111

In [55]:
t, p = ttest_ind(norm.PDR_unweighted, cll.PDR_unweighted)
print(str("t-statistic is ") + str(t))
print(str("p-value is ") + str(p))

#
# The classic Student's t-test is a parametric test that only works when the data are normally distributed.
# A good non-parametric alternative to the t-test is the Mann-Whitney U-test (called ranksum in matlab, 
#     or Wilcoxon test, or Man-Whitney-Wilcoxon)
#
import scipy.stats
z, p = scipy.stats.mannwhitneyu(norm.PDR_unweighted, cll.PDR_unweighted)
p_value = p * 2
print(str("Mann-Whitney U statistic is ") + str(z))
print(str("p-value is ") + str(p_value))


t-statistic is -12.5209317406
p-value is 6.11369663991e-31
Mann-Whitney U statistic is 5187.0
p-value is 1.39234857141e-26

In [56]:
ax = sns.boxplot(x=final_real2["type"], y=final_real2["PDR_unweighted"], linewidth=1.5)
ax = sns.stripplot(x=final_real2["type"], y=final_real2["PDR_unweighted"], color=".25", linewidth=1.5, jitter=True)
sns.plt.title("PDR per cell, normal vs CLL, all chromosomes, unweighted")
plt.xlabel("Normal cells = 336 *.anno files; CLL cells = 102 *.anno files")
x1, x2 = 0, 1   # columns 'Sat' and 'Sun' (first column: 0, see plt.xticks())
y, h, col = final_real2["PDR_unweighted"].max() + .025, .01, 'k'
plt.plot([x1, x1, x2, x2], [y, y+h, y+h, y], lw=1.5, c=col)
plt.text((x1+x2)*.5, y+h, r'p = $1.11 \times 10^{-29}$', ha='center', va='bottom', color=col)
plt.ylabel("PDR")


Out[56]:
<matplotlib.text.Text at 0x1161609b0>

In [57]:
CD27p = final_real2[final_real2['bio'] == 'CD19CD27p']
print(len(CD27p))
CD27m = final_real2[final_real2['bio'] == 'CD19CD27m']
print(len(CD27m))


68
74

In [58]:
# CD19CD27m

CD27cells = pd.concat([CD27p, CD27m])

In [59]:
CD27cells = CD27cells.reset_index(drop=True)

In [60]:
ax = sns.boxplot(x=CD27cells["bio"], y=CD27cells["methylation"], linewidth=1.5, palette='Set2')
ax = sns.stripplot(x=CD27cells["bio"], y=CD27cells["methylation"], color=".25", linewidth=1.5, jitter=True)
sns.plt.title("Methylation per cell, CD27p vs. CD27m, all chromosomes, weighted")
plt.xlabel("CD27p = 68 *.anno files; CD27m = 74 *.anno files")
plt.ylabel("Methylation")


Out[60]:
<matplotlib.text.Text at 0x1165a49e8>

In [61]:
print(CD27p['methylation'].mean())  # mean
print(CD27m['methylation'].mean())

print(CD27p['methylation'].std())   # standard deviation
print(CD27m['methylation'].std())


0.496253603114481
0.5081689120308355
0.03135877503461438
0.03257947340226167

In [62]:
t, p = ttest_ind(CD27p.methylation, CD27m.methylation)
print(str("t-statistic is ") + str(t))
print(str("p-value is ") + str(p))

#
# The classic Student's t-test is a parametric test that only works when the data are normally distributed.
# A good non-parametric alternative to the t-test is the Mann-Whitney U-test (called ranksum in matlab, 
#     or Wilcoxon test, or Man-Whitney-Wilcoxon)
#
import scipy.stats
z, p = scipy.stats.mannwhitneyu(CD27p.methylation, CD27m.methylation)
p_value = p * 2
print(str("Mann-Whitney U statistic is ") + str(z))
print(str("p-value is ") + str(p_value))


t-statistic is -2.21649444187
p-value is 0.0282712841343
Mann-Whitney U statistic is 1960.0
p-value is 0.0232995224883

In [63]:
ax = sns.stripplot(x=CD27cells["bio"], y=CD27cells["methylation"], hue=CD27cells.protocol, linewidth=1.5, jitter=True)
sns.plt.title("Methylation per cell, CD27p vs. CD27m, all chromosomes, weighted")
plt.xlabel("CD27p = 68 *.anno files; CD27m = 74 *.anno files")
plt.ylabel("Methylation")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)


Out[63]:
<matplotlib.legend.Legend at 0x11659cfd0>

In [64]:
CD27p122 = CD27p[CD27p['protocol'] == 'NormalBCD19pCD27pcell1_22_']
CD27p2344 = CD27p[CD27p['protocol'] == 'NormalBCD19pCD27pcell23_44']
CD27p4566 = CD27p[CD27p['protocol'] == 'NormalBCD19pCD27pcell45_66']
CD27p6788 = CD27p[CD27p['protocol'] == 'NormalBCD19pCD27pcell67_88']

CD27m122 = CD27m[CD27m['protocol'] == 'NormalBCD19pCD27mcell1_22_']
CD27m2344 = CD27m[CD27m['protocol'] == 'NormalBCD19pCD27mcell23_44']
CD27m4566 = CD27m[CD27m['protocol'] == 'NormalBCD19pCD27mcell45_66']
CD27m6788 = CD27m[CD27m['protocol'] == 'NormalBCD19pCD27mcell67_88']

In [65]:
print("MEAN")
print("CD27p122")
print(CD27p122['methylation'].mean()) 
print("CD27p2344")
print(CD27p2344['methylation'].mean())
print("CD27p4566")
print(CD27p4566['methylation'].mean()) 
print("CD27p6788")
print(CD27p6788['methylation'].mean())


print("***")
print("CD27m122")
print(CD27m122['methylation'].mean()) 
print("CD27m2344")
print(CD27m2344['methylation'].mean())
print("CD27m4566")
print(CD27m4566['methylation'].mean()) 
print("CD27m6788")
print(CD27m6788['methylation'].mean())


MEAN
CD27p122
0.4972878846847126
CD27p2344
0.4851007884832205
CD27p4566
0.4834148056947074
CD27p6788
0.5134200187130699
***
CD27m122
0.5040448558871422
CD27m2344
0.5136966246462882
CD27m4566
0.5141029786434724
CD27m6788
0.5021620097019545

In [66]:
print("total")
print("CD27p122")
print(len(CD27p122['methylation']))
print("CD27p2344")
print(len(CD27p2344['methylation']))
print("CD27p4566")
print(len(CD27p4566['methylation']))
print("CD27p6788")
print(len(CD27p6788['methylation']))


print("***")
print("CD27m122")
print(len(CD27m122['methylation']))
print("CD27m2344")
print(len(CD27m2344['methylation']))
print("CD27m4566")
print(len(CD27m4566['methylation']))
print("CD27m6788")
print(len(CD27m6788['methylation']))


total
CD27p122
17
CD27p2344
22
CD27p4566
9
CD27p6788
20
***
CD27m122
18
CD27m2344
18
CD27m4566
17
CD27m6788
21

In [67]:
print("STD")
print("CD27p122")
print(CD27p122['methylation'].std()) 
print("CD27p2344")
print(CD27p2344['methylation'].std())
print("CD27p4566")
print(CD27p4566['methylation'].std()) 
print("CD27p6788")
print(CD27p6788['methylation'].std())


print("***")
print("CD27m122")
print(CD27m122['methylation'].std()) 
print("CD27m2344")
print(CD27m2344['methylation'].std())
print("CD27m4566")
print(CD27m4566['methylation'].std()) 
print("CD27m6788")
print(CD27m6788['methylation'].std())


STD
CD27p122
0.02902805368793664
CD27p2344
0.03338021715676278
CD27p4566
0.038155892534940505
CD27p6788
0.01950174730753446
***
CD27m122
0.03717250597962948
CD27m2344
0.021707573850562704
CD27m4566
0.03449159314327558
CD27m6788
0.035167704732143194

In [68]:
ax = sns.boxplot(x=CD27cells["protocol"], y=CD27cells["methylation"], linewidth=1.5, palette='Set2')
ax = sns.stripplot(x=CD27cells["protocol"], y=CD27cells["methylation"], color=".25", linewidth=1.5, jitter=True)
ax.xaxis.set_major_formatter(plt.NullFormatter())
sns.plt.title("Methylation per cell, CD27p vs. CD27m, all chromosomes, weighted")
plt.xlabel("CD27p122 CD27p2344 CD27p4566 CD27p6788 CD27m122 CD27m2344 CD27m4566 CD27m6788")
plt.ylabel("Methylation")


Out[68]:
<matplotlib.text.Text at 0x116ef67f0>

In [69]:
ax = sns.boxplot(x=CD27p["protocol"], y=CD27p["methylation"], linewidth=1.5, palette='Set2')
ax = sns.stripplot(x=CD27p["protocol"], y=CD27p["methylation"], color=".25", linewidth=1.5, jitter=True)
ax.xaxis.set_major_formatter(plt.NullFormatter())
sns.plt.title("Methylation per cell, CD27p, all chromosomes, weighted")
plt.xlabel("CD27p122 CD27p2344 CD27p4566 CD27p6788")
plt.ylabel("Methylation")


Out[69]:
<matplotlib.text.Text at 0x117253ef0>

In [70]:
ax = sns.boxplot(x=CD27m["protocol"], y=CD27m["methylation"], linewidth=1.5, palette='Set1')
ax = sns.stripplot(x=CD27m["protocol"], y=CD27m["methylation"], color=".25", linewidth=1.5, jitter=True)
ax.xaxis.set_major_formatter(plt.NullFormatter())
sns.plt.title("Methylation per cell, CD27m, all chromosomes, weighted")
plt.xlabel("CD27m122 CD27m2344 CD27m4566 CD27m6788")
plt.ylabel("Methylation")


Out[70]:
<matplotlib.text.Text at 0x116865550>

In [71]:
print("MEAN: methylation unweighted")
print("CD27p122")
print(CD27p122['methylation_unweighted'].mean()) 
print("CD27p2344")
print(CD27p2344['methylation_unweighted'].mean())
print("CD27p4566")
print(CD27p4566['methylation_unweighted'].mean()) 
print("CD27p6788")
print(CD27p6788['methylation_unweighted'].mean())


print("***")
print("CD27m122")
print(CD27m122['methylation_unweighted'].mean()) 
print("CD27m2344")
print(CD27m2344['methylation_unweighted'].mean())
print("CD27m4566")
print(CD27m4566['methylation_unweighted'].mean()) 
print("CD27m6788")
print(CD27m6788['methylation_unweighted'].mean())


MEAN: methylation unweighted
CD27p122
0.5842490785697927
CD27p2344
0.5718335489217137
CD27p4566
0.5724617957578969
CD27p6788
0.5969825013293091
***
CD27m122
0.6143624872865359
CD27m2344
0.6311509372317828
CD27m4566
0.6318364273228161
CD27m6788
0.6161925913256822

In [72]:
print("STD: methylation unweighted")
print("CD27p122")
print(CD27p122['methylation_unweighted'].std()) 
print("CD27p2344")
print(CD27p2344['methylation_unweighted'].std())
print("CD27p4566")
print(CD27p4566['methylation_unweighted'].std()) 
print("CD27p6788")
print(CD27p6788['methylation_unweighted'].std())


print("***")
print("CD27m122")
print(CD27m122['methylation_unweighted'].std()) 
print("CD27m2344")
print(CD27m2344['methylation_unweighted'].std())
print("CD27m4566")
print(CD27m4566['methylation_unweighted'].std()) 
print("CD27m6788")
print(CD27m6788['methylation_unweighted'].std())


STD: methylation unweighted
CD27p122
0.03113596155118175
CD27p2344
0.028777568411674094
CD27p4566
0.02739289849650322
CD27p6788
0.018158165863661806
***
CD27m122
0.02986529960014145
CD27m2344
0.015850992051939544
CD27m4566
0.025052451971795592
CD27m6788
0.026227446358268715

In [73]:
ax = sns.stripplot(x=CD27cells["bio"], y=CD27cells["methylation_unweighted"], hue=CD27cells.protocol, linewidth=1.5, jitter=True)
sns.plt.title("Methylation per cell, CD27p vs. CD27m, all chromosomes, unweighted")
plt.xlabel("CD27p = 68 *.anno files; CD27m = 74 *.anno files")
plt.ylabel("Methylation")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)


Out[73]:
<matplotlib.legend.Legend at 0x1167d3240>

In [74]:
ax = sns.boxplot(x=CD27cells["protocol"], y=CD27cells["methylation_unweighted"], linewidth=1.5, palette='Set2')
ax = sns.stripplot(x=CD27cells["protocol"], y=CD27cells["methylation_unweighted"], color=".25", linewidth=1.5, jitter=True)
ax.xaxis.set_major_formatter(plt.NullFormatter())
sns.plt.title("Methylation per cell, CD27p vs. CD27m, all chromosomes, unweighted")
plt.xlabel("CD27p122 CD27p2344 CD27p4566 CD27p6788 CD27m122 CD27m2344 CD27m4566 CD27m6788")
plt.ylabel("Methylation")


Out[74]:
<matplotlib.text.Text at 0x11723c828>

In [75]:
ax = sns.boxplot(x=CD27p["protocol"], y=CD27p["methylation_unweighted"], linewidth=1.5, palette='Set2')
ax = sns.stripplot(x=CD27p["protocol"], y=CD27p["methylation_unweighted"], color=".25", linewidth=1.5, jitter=True)
ax.xaxis.set_major_formatter(plt.NullFormatter())
sns.plt.title("Methylation per cell, CD27p, all chromosomes, unweighted")
plt.xlabel("CD27p122 CD27p2344 CD27p4566 CD27p6788")
plt.ylabel("Methylation")


Out[75]:
<matplotlib.text.Text at 0x1175bfda0>

In [76]:
ax = sns.boxplot(x=CD27m["protocol"], y=CD27m["methylation_unweighted"], linewidth=1.5, palette='Set1')
ax = sns.stripplot(x=CD27m["protocol"], y=CD27m["methylation_unweighted"], color=".25", linewidth=1.5, jitter=True)
ax.xaxis.set_major_formatter(plt.NullFormatter())
sns.plt.title("Methylation per cell, CD27m, all chromosomes, unweighted")
plt.xlabel("CD27m122 CD27m2344 CD27m4566 CD27m6788")
plt.ylabel("Methylation")


Out[76]:
<matplotlib.text.Text at 0x117785320>

In [77]:
print("MEAN: PDR, weighted")
print("CD27p122")
print(CD27p122['PDR_total'].mean()) 
print("CD27p2344")
print(CD27p2344['PDR_total'].mean())
print("CD27p4566")
print(CD27p4566['PDR_total'].mean()) 
print("CD27p6788")
print(CD27p6788['PDR_total'].mean())


print("***")
print("CD27m122")
print(CD27m122['PDR_total'].mean()) 
print("CD27m2344")
print(CD27m2344['PDR_total'].mean())
print("CD27m4566")
print(CD27m4566['PDR_total'].mean()) 
print("CD27m6788")
print(CD27m6788['PDR_total'].mean())

print("***")
print("*** *** *** *** *** ***")
print("***")

print("STD: PDR, weighted")
print("CD27p122")
print(CD27p122['PDR_total'].std()) 
print("CD27p2344")
print(CD27p2344['PDR_total'].std())
print("CD27p4566")
print(CD27p4566['PDR_total'].std()) 
print("CD27p6788")
print(CD27p6788['PDR_total'].std())


print("***")
print("CD27m122")
print(CD27m122['PDR_total'].std()) 
print("CD27m2344")
print(CD27m2344['PDR_total'].std())
print("CD27m4566")
print(CD27m4566['PDR_total'].std()) 
print("CD27m6788")
print(CD27m6788['PDR_total'].std())


MEAN: PDR, weighted
CD27p122
0.28566253826809807
CD27p2344
0.3110086941203543
CD27p4566
0.2800724545675633
CD27p6788
0.3089241775288547
***
CD27m122
0.17683627477104374
CD27m2344
0.17966110518768102
CD27m4566
0.17614056770444841
CD27m6788
0.1780542946983231
***
*** *** *** *** *** ***
***
STD: PDR, weighted
CD27p122
0.04972804578979002
CD27p2344
0.02870685299679014
CD27p4566
0.03660540813886372
CD27p6788
0.037033511259506297
***
CD27m122
0.013945749150551572
CD27m2344
0.01689758560575925
CD27m4566
0.004416412431237496
CD27m6788
0.013728468504302867

In [78]:
ax = sns.stripplot(x=CD27cells["bio"], y=CD27cells["PDR_total"], hue=CD27cells.protocol, linewidth=1.5, jitter=True)
sns.plt.title("PDR total per cell, CD27p vs. CD27m, all chromosomes, weighted")
plt.xlabel("CD27p = 68 *.anno files; CD27m = 74 *.anno files")
plt.ylabel("PDR total, weighted")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)


Out[78]:
<matplotlib.legend.Legend at 0x11652bcc0>

In [79]:
ax = sns.boxplot(x=CD27cells["protocol"], y=CD27cells["PDR_total"], linewidth=1.5, palette='Set2')
ax = sns.stripplot(x=CD27cells["protocol"], y=CD27cells["PDR_total"], color=".25", linewidth=1.5, jitter=True)
ax.xaxis.set_major_formatter(plt.NullFormatter())
sns.plt.title("PDR total per cell, CD27p vs. CD27m, all chromosomes, weighted")
plt.xlabel("CD27p122 CD27p2344 CD27p4566 CD27p6788 CD27m122 CD27m2344 CD27m4566 CD27m6788")
plt.ylabel("PDR total, weighted")


Out[79]:
<matplotlib.text.Text at 0x1179e4b00>

In [80]:
ax = sns.boxplot(x=CD27p["protocol"], y=CD27p["PDR_total"], linewidth=1.5, palette='Set2')
ax = sns.stripplot(x=CD27p["protocol"], y=CD27p["PDR_total"], color=".25", linewidth=1.5, jitter=True)
ax.xaxis.set_major_formatter(plt.NullFormatter())
sns.plt.title("PDR total per cell, CD27p, all chromosomes, weighted")
plt.xlabel("CD27p122 CD27p2344 CD27p4566 CD27p6788")
plt.ylabel("PDR total, weighted")


Out[80]:
<matplotlib.text.Text at 0x117d3d390>

In [81]:
ax = sns.boxplot(x=CD27m["protocol"], y=CD27m["PDR_total"], linewidth=1.5, palette='Set1')
ax = sns.stripplot(x=CD27m["protocol"], y=CD27m["PDR_total"], color=".25", linewidth=1.5, jitter=True)
ax.xaxis.set_major_formatter(plt.NullFormatter())
sns.plt.title("PDR total per cell, CD27m, all chromosomes, weighted")
plt.xlabel("CD27m122 CD27m2344 CD27m4566 CD27m6788")
plt.ylabel("PDR total, weighted")


Out[81]:
<matplotlib.text.Text at 0x117de92e8>

In [82]:
print("MEAN: PDR, unweighted")
print("CD27p122")
print(CD27p122['PDR_unweighted'].mean()) 
print("CD27p2344")
print(CD27p2344['PDR_unweighted'].mean())
print("CD27p4566")
print(CD27p4566['PDR_unweighted'].mean()) 
print("CD27p6788")
print(CD27p6788['PDR_unweighted'].mean())


print("***")
print("CD27m122")
print(CD27m122['PDR_unweighted'].mean()) 
print("CD27m2344")
print(CD27m2344['PDR_unweighted'].mean())
print("CD27m4566")
print(CD27m4566['PDR_unweighted'].mean()) 
print("CD27m6788")
print(CD27m6788['PDR_unweighted'].mean())

print("***")
print("*** *** *** *** *** ***")
print("***")

print("STD: PDR, unweighted")
print("CD27p122")
print(CD27p122['PDR_unweighted'].std()) 
print("CD27p2344")
print(CD27p2344['PDR_unweighted'].std())
print("CD27p4566")
print(CD27p4566['PDR_unweighted'].std()) 
print("CD27p6788")
print(CD27p6788['PDR_unweighted'].std())


print("***")
print("CD27m122")
print(CD27m122['PDR_unweighted'].std()) 
print("CD27m2344")
print(CD27m2344['PDR_unweighted'].std())
print("CD27m4566")
print(CD27m4566['PDR_unweighted'].std()) 
print("CD27m6788")
print(CD27m6788['PDR_unweighted'].std())


MEAN: PDR, unweighted
CD27p122
0.29137034971405584
CD27p2344
0.3214126851517178
CD27p4566
0.2883948743714171
CD27p6788
0.3130316044578697
***
CD27m122
0.1735435408737465
CD27m2344
0.17789037827235266
CD27m4566
0.17552949440422289
CD27m6788
0.1741432994133486
***
*** *** *** *** *** ***
***
STD: PDR, unweighted
CD27p122
0.05195927920891388
CD27p2344
0.028481234302547183
CD27p4566
0.03809701507879582
CD27p6788
0.03643978677526343
***
CD27m122
0.01440649217303426
CD27m2344
0.01867522478970461
CD27m4566
0.003452085800240377
CD27m6788
0.013595703739238367

In [83]:
ax = sns.stripplot(x=CD27cells["bio"], y=CD27cells["PDR_unweighted"], hue=CD27cells.protocol, linewidth=1.5, jitter=True)
sns.plt.title("PDR per cell, CD27p vs. CD27m, all chromosomes, unweighted")
plt.xlabel("CD27p = 68 *.anno files; CD27m = 74 *.anno files")
plt.ylabel("PDR total, unweighted")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)


Out[83]:
<matplotlib.legend.Legend at 0x117bc8e80>

In [84]:
ax = sns.boxplot(x=CD27cells["protocol"], y=CD27cells["PDR_unweighted"], linewidth=1.5, palette='Set2')
ax = sns.stripplot(x=CD27cells["protocol"], y=CD27cells["PDR_unweighted"], color=".25", linewidth=1.5, jitter=True)
ax.xaxis.set_major_formatter(plt.NullFormatter())
sns.plt.title("PDR per cell, CD27p vs. CD27m, all chromosomes, unweighted")
plt.xlabel("CD27p122 CD27p2344 CD27p4566 CD27p6788 CD27m122 CD27m2344 CD27m4566 CD27m6788")
plt.ylabel("PDR, unweighted")


Out[84]:
<matplotlib.text.Text at 0x117fae3c8>

In [85]:
ax = sns.boxplot(x=CD27p["protocol"], y=CD27p["PDR_unweighted"], linewidth=1.5, palette='Set2')
ax = sns.stripplot(x=CD27p["protocol"], y=CD27p["PDR_unweighted"], color=".25", linewidth=1.5, jitter=True)
ax.xaxis.set_major_formatter(plt.NullFormatter())
sns.plt.title("PDR per cell, CD27p, all chromosomes, unweighted")
plt.xlabel("CD27p122 CD27p2344 CD27p4566 CD27p6788")
plt.ylabel("PDR, unweighted")


Out[85]:
<matplotlib.text.Text at 0x11849ceb8>

In [86]:
ax = sns.boxplot(x=CD27m["protocol"], y=CD27m["PDR_unweighted"], linewidth=1.5, palette='Set1')
ax = sns.stripplot(x=CD27m["protocol"], y=CD27m["PDR_unweighted"], color=".25", linewidth=1.5, jitter=True)
ax.xaxis.set_major_formatter(plt.NullFormatter())
sns.plt.title("PDR total per cell, CD27m, all chromosomes, unweighted")
plt.xlabel("CD27m122 CD27m2344 CD27m4566 CD27m6788")
plt.ylabel("PDR, unweighted")


Out[86]:
<matplotlib.text.Text at 0x1186649b0>

In [87]:
ax = sns.boxplot(x=CD27cells["bio"], y=CD27cells["methylation"], linewidth=1.5, palette='Set2')
ax = sns.stripplot(x=CD27cells["bio"], y=CD27cells["methylation"], color=".25", linewidth=1.5, jitter=True)
sns.plt.title("Methylation per cell, CD27p vs. CD27m, all chromosomes, weighted")
plt.xlabel("CD27p = 68 *.anno files; CD27m = 74 *.anno files")
x1, x2 = 0, 1   # columns 'Sat' and 'Sun' (first column: 0, see plt.xticks())
y, h, col = CD27cells["methylation"].max() + .025, .01, 'k'
plt.plot([x1, x1, x2, x2], [y, y+h, y+h, y], lw=1.5, c=col)
plt.text((x1+x2)*.5, y+h, r'p = $0.0232$', ha='center', va='bottom', color=col)
plt.ylabel("Methylation")
plt.ylim(0.40, 0.63)


Out[87]:
(0.4, 0.63)

In [88]:
print(CD27p['methylation'].mean())  # mean
print(CD27m['methylation'].mean())

print(CD27p['methylation'].std())   # standard deviation
print(CD27m['methylation'].std())


0.496253603114481
0.5081689120308355
0.03135877503461438
0.03257947340226167

In [89]:
t, p = ttest_ind(CD27p.methylation, CD27m.methylation)
print(str("t-statistic is ") + str(t))
print(str("p-value is ") + str(p))

#
# The classic Student's t-test is a parametric test that only works when the data are normally distributed.
# A good non-parametric alternative to the t-test is the Mann-Whitney U-test (called ranksum in matlab, 
#     or Wilcoxon test, or Man-Whitney-Wilcoxon)
#
import scipy.stats
z, p = scipy.stats.mannwhitneyu(CD27p.methylation, CD27m.methylation)
p_value = p * 2
print(str("Mann-Whitney U statistic is ") + str(z))
print(str("p-value is ") + str(p_value))


t-statistic is -2.21649444187
p-value is 0.0282712841343
Mann-Whitney U statistic is 1960.0
p-value is 0.0232995224883

In [90]:
ax = sns.boxplot(x=CD27cells["bio"], y=CD27cells["PDR_total"], linewidth=1.5, palette='Set2')
ax = sns.stripplot(x=CD27cells["bio"], y=CD27cells["PDR_total"], color=".25", linewidth=1.5, jitter=True)
sns.plt.title("PDR per cell, CD27p vs. CD27m, all chromosomes, weighted")
plt.xlabel("CD27p = 68 *.anno files; CD27m = 74 *.anno files")
plt.ylabel("PDR")


Out[90]:
<matplotlib.text.Text at 0x118aec9e8>

In [91]:
print(CD27p['PDR_total'].mean())  # mean
print(CD27m['PDR_total'].mean())

print(CD27p['PDR_total'].std())   # standard deviation
print(CD27m['PDR_total'].std())


0.29996455974839153
0.17770922539050618
0.03958310133552725
0.013016495093502473

In [92]:
t, p = ttest_ind(CD27p.PDR_total, CD27m.PDR_total)
print(str("t-statistic is ") + str(t))
print(str("p-value is ") + str(p))

#
# The classic Student's t-test is a parametric test that only works when the data are normally distributed.
# A good non-parametric alternative to the t-test is the Mann-Whitney U-test (called ranksum in matlab, 
#     or Wilcoxon test, or Man-Whitney-Wilcoxon)
#
import scipy.stats
z, p = scipy.stats.mannwhitneyu(CD27p.PDR_total, CD27m.PDR_total)
p_value = p * 2
print(str("Mann-Whitney U statistic is ") + str(z))
print(str("p-value is ") + str(p_value))


t-statistic is 25.1376660782
p-value is 9.34291704379e-54
Mann-Whitney U statistic is 14.0
p-value is 1.69285882438e-24

In [93]:
ax = sns.boxplot(x=CD27cells["bio"], y=CD27cells["PDR_total"], linewidth=1.5, palette='Set2')
ax = sns.stripplot(x=CD27cells["bio"], y=CD27cells["PDR_total"], color=".25", linewidth=1.5, jitter=True)
sns.plt.title("PDR per cell, CD27p vs. CD27m, all chromosomes, weighted")
plt.xlabel("CD27p = 68 *.anno files; CD27m = 74 *.anno files")
x1, x2 = 0, 1   # columns 'Sat' and 'Sun' (first column: 0, see plt.xticks())
y, h, col = CD27cells["PDR_total"].max() + .025, .01, 'k'
plt.plot([x1, x1, x2, x2], [y, y+h, y+h, y], lw=1.5, c=col)
plt.text((x1+x2)*.5, y+h, r'p = $1.69 \times 10^{-24}$', ha='center', va='bottom', color=col)
plt.ylabel("PDR")


Out[93]:
<matplotlib.text.Text at 0x118b15b70>

In [94]:
ax = sns.boxplot(x=CD27cells["bio"], y=CD27cells["methylation_unweighted"], linewidth=1.5, palette='Set2')
ax = sns.stripplot(x=CD27cells["bio"], y=CD27cells["methylation_unweighted"], color=".25", linewidth=1.5, jitter=True)
sns.plt.title("Methylation per cell, CD27p vs. CD27m, all chromosomes, unweighted")
plt.xlabel("CD27p = 68 *.anno files; CD27m = 74 *.anno files")
plt.ylabel("Methylation")


Out[94]:
<matplotlib.text.Text at 0x118fdda20>

In [95]:
print(CD27p['methylation_unweighted'].mean())  # mean
print(CD27m['methylation_unweighted'].mean())

print(CD27p['methylation_unweighted'].std())   # standard deviation
print(CD27m['methylation_unweighted'].std())


0.5824173323584033
0.6229798016710396
0.028079368991370965
0.0256619603832431

In [96]:
t, p = ttest_ind(CD27p.methylation_unweighted, CD27m.methylation_unweighted)
print(str("t-statistic is ") + str(t))
print(str("p-value is ") + str(p))

#
# The classic Student's t-test is a parametric test that only works when the data are normally distributed.
# A good non-parametric alternative to the t-test is the Mann-Whitney U-test (called ranksum in matlab, 
#     or Wilcoxon test, or Man-Whitney-Wilcoxon)
#
import scipy.stats
z, p = scipy.stats.mannwhitneyu(CD27p.methylation_unweighted, CD27m.methylation_unweighted)
p_value = p * 2
print(str("Mann-Whitney U statistic is ") + str(z))
print(str("p-value is ") + str(p_value))


t-statistic is -8.99436165572
p-value is 1.50157472037e-15
Mann-Whitney U statistic is 648.0
p-value is 2.41619720242e-14

In [97]:
ax = sns.boxplot(x=CD27cells["bio"], y=CD27cells["methylation_unweighted"], linewidth=1.5, palette='Set2')
ax = sns.stripplot(x=CD27cells["bio"], y=CD27cells["methylation_unweighted"], color=".25", linewidth=1.5, jitter=True)
sns.plt.title("Methylation per cell, CD27p vs. CD27m, all chromosomes, unweighted")
plt.xlabel("CD27p = 68 *.anno files; CD27m = 74 *.anno files")
x1, x2 = 0, 1   # columns 'Sat' and 'Sun' (first column: 0, see plt.xticks())
y, h, col = CD27cells["methylation_unweighted"].max() + .025, .01, 'k'
plt.plot([x1, x1, x2, x2], [y, y+h, y+h, y], lw=1.5, c=col)
plt.text((x1+x2)*.5, y+h, r'p = $2.42 \times 10^{-14}$', ha='center', va='bottom', color=col)
plt.ylabel("Methylation")


Out[97]:
<matplotlib.text.Text at 0x119170e10>

In [98]:
ax = sns.boxplot(x=CD27cells["bio"], y=CD27cells["PDR_unweighted"], linewidth=1.5, palette='Set2')
ax = sns.stripplot(x=CD27cells["bio"], y=CD27cells["PDR_unweighted"], color=".25", linewidth=1.5, jitter=True)
sns.plt.title("PDR per cell, CD27p vs. CD27m, all chromosomes, unweighted")
plt.xlabel("CD27p = 68 *.anno files; CD27m = 74 *.anno files")
plt.ylabel("PDR")


Out[98]:
<matplotlib.text.Text at 0x118868d68>

In [99]:
print(CD27p['PDR_unweighted'].mean())  # mean
print(CD27m['PDR_unweighted'].mean())

print(CD27p['PDR_unweighted'].std())   # standard deviation
print(CD27m['PDR_unweighted'].std())


0.30706707319083654
0.17522731401597158
0.04045209131193274
0.013623979869060488

In [100]:
t, p = ttest_ind(CD27p.PDR_unweighted, CD27m.PDR_unweighted)
print(str("t-statistic is ") + str(t))
print(str("p-value is ") + str(p))

#
# The classic Student's t-test is a parametric test that only works when the data are normally distributed.
# A good non-parametric alternative to the t-test is the Mann-Whitney U-test (called ranksum in matlab, 
#     or Wilcoxon test, or Man-Whitney-Wilcoxon)
#
import scipy.stats
z, p = scipy.stats.mannwhitneyu(CD27p.PDR_unweighted, CD27m.PDR_unweighted)
p_value = p * 2
print(str("Mann-Whitney U statistic is ") + str(z))
print(str("p-value is ") + str(p_value))


t-statistic is 26.4578627075
p-value is 2.48784664393e-56
Mann-Whitney U statistic is 13.0
p-value is 1.62304136447e-24

In [101]:
ax = sns.boxplot(x=CD27cells["bio"], y=CD27cells["PDR_unweighted"], linewidth=1.5, palette='Set2')
ax = sns.stripplot(x=CD27cells["bio"], y=CD27cells["PDR_unweighted"], color=".25", linewidth=1.5, jitter=True)
sns.plt.title("PDR per cell, CD27p vs. CD27m, all chromosomes, unweighted")
plt.xlabel("CD27p = 68 *.anno files; CD27m = 74 *.anno files")
x1, x2 = 0, 1   # columns 'Sat' and 'Sun' (first column: 0, see plt.xticks())
y, h, col = CD27cells["PDR_unweighted"].max() + .025, .01, 'k'
plt.plot([x1, x1, x2, x2], [y, y+h, y+h, y], lw=1.5, c=col)
plt.text((x1+x2)*.5, y+h, r'p = $1.623 \times 10^{-24}$', ha='center', va='bottom', color=col)
plt.ylabel("PDR")


Out[101]:
<matplotlib.text.Text at 0x11886fb70>

In [102]:
import seaborn as sns
sns.set(style="whitegrid", palette="muted")
muted = ["#4878CF", "#6ACC65", "#D65F5F", "#B47CC7", "#C4AD66", "#77BEDB"]
newPal   = dict(normal_B = muted[5], CD19CD27m = muted[2],CD19CD27p = muted[1], CD19p = muted[3], CLL=muted[0])
ax = sns.stripplot(x=final_real2["type"], y=final_real2["methylation"], hue=final_real2.bio, jitter=True, palette=newPal)
sns.plt.title("Biological sorting: Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 336 *.anno files; CLL cells = 102 *.anno files")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)
plt.ylabel("Methylation")


Out[102]:
<matplotlib.text.Text at 0x1196555c0>

In [103]:
import seaborn as sns
sns.set(style="whitegrid", palette="muted")
muted = ["#4878CF", "#6ACC65", "#D65F5F", "#B47CC7", "#C4AD66", "#77BEDB"]
newPal   = dict(normal_B = muted[5], CD19CD27m = muted[2],CD19CD27p = muted[1], CD19p = muted[3], CLL=muted[0])
ax = sns.stripplot(x=final_real2["type"], y=final_real2["PDR_total"], hue=final_real2.bio, jitter=True, palette=newPal)
sns.plt.title("Biological sorting: PDR per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 336 *.anno files; CLL cells = 102 *.anno files")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)
plt.ylabel("PDR")


Out[103]:
<matplotlib.text.Text at 0x1196b5c18>

In [104]:
import seaborn as sns
sns.set(style="whitegrid", palette="muted")
muted = ["#4878CF", "#6ACC65", "#D65F5F", "#B47CC7", "#C4AD66", "#77BEDB"]
newPal   = dict(normal_B = muted[5], CD19CD27m = muted[2],CD19CD27p = muted[1], CD19p = muted[3], CLL=muted[0])
ax = sns.stripplot(x=final_real2["type"], y=final_real2["methylation_unweighted"], hue=final_real2.bio, jitter=True, palette=newPal)
sns.plt.title("Biological sorting: Methylation per cell, normal vs CLL, all chromosomes, unweighted")
plt.xlabel("Normal cells = 336 *.anno files; CLL cells = 102 *.anno files")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)
plt.ylabel("Methylation")


Out[104]:
<matplotlib.text.Text at 0x119923e10>

In [105]:
sns.set(style="whitegrid", palette="muted")
muted = ["#4878CF", "#6ACC65", "#D65F5F", "#B47CC7", "#C4AD66", "#77BEDB"]
newPal   = dict(normal_B = muted[5], CD19CD27m = muted[2],CD19CD27p = muted[1], CD19p = muted[3], CLL=muted[0])
ax = sns.stripplot(x=final_real2["type"], y=final_real2["PDR_unweighted"], hue=final_real2.bio, jitter=True, palette=newPal)
sns.plt.title("Biological sorting: PDR per cell, normal vs CLL, all chromosomes, unweighted")
plt.xlabel("Normal cells = 336 *.anno files; CLL cells = 102 *.anno files")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)
plt.ylabel("PDR")


Out[105]:
<matplotlib.text.Text at 0x119a95c50>

In [ ]:


In [ ]:


In [ ]:


In [ ]: