In [1]:
%matplotlib inline

In [2]:
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
pd.set_option('display.max_columns', 50) # print all rows


import os
os.chdir('/Users/evanbiederstedt/Downloads/RRBS_data_files')

In [3]:
stats = pd.read_csv("RRBS_anno_statistics_full_446files_filter50K.csv")

In [4]:
stats.shape


Out[4]:
(446, 18)

In [5]:
normal = stats[stats["type"]=="normal"]
CLL = stats[stats["type"]=="CLL"]

In [6]:
len(normal)


Out[6]:
342

In [7]:
len(CLL)


Out[7]:
104

In [8]:
import seaborn as sns
sns.set_style("whitegrid")
ax = sns.stripplot(x=stats["type"], y=stats["methylation"])
sns.plt.title("Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 342 *.anno files; CLL cells = 104 *.anno files")
plt.ylabel("Percent Methylation, weighted")


Out[8]:
<matplotlib.text.Text at 0x10714a4a8>

In [9]:
import seaborn as sns
sns.set_style("whitegrid")
ax = sns.stripplot(x=stats["type"], y=stats["methylation"], jitter=True)
sns.plt.title("Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 342 *.anno files; CLL cells = 104 *.anno files")
plt.ylabel("Percent Methylation, weighted")


Out[9]:
<matplotlib.text.Text at 0x1071ede80>

In [10]:
ax = sns.violinplot(x=stats["type"],  y=stats["methylation"])
sns.plt.title("Violin plot: Methylation per cell, normal vs CLL")
plt.xlabel("Normal cells = 342 *.anno files; CLL cells = 104 *.anno files")
plt.ylabel("Percent Methylation, weighted")
print("violin plot features a kernel density estimation of the underlying distribution")


violin plot features a kernel density estimation of the underlying distribution

In [11]:
ax = sns.boxplot(x=stats["type"],y=stats["methylation"], linewidth=1.5)
sns.plt.title("Box Whisker plot: Methylation per cell, normal vs CLL")
plt.xlabel("Normal cells = 342 *.anno files; CLL cells = 104 *.anno files")
plt.ylabel("Percent Methylation, weighted")
print("Box whisker plot")


Box whisker plot

In [12]:
ax = sns.boxplot(x=stats["type"],y=stats["methylation"], linewidth=1.5)
sns.plt.title("Violin plot: Methylation per cell, normal vs CLL")
plt.xlabel("Normal cells = 342 *.anno files; CLL cells = 104 *.anno files")
plt.ylabel("Percent Methylation, weighted")
print("Box whisker plot")
#plt.ylim(0.35, 0.7)


Box whisker plot

In [13]:
ax = sns.boxplot(y=stats["type"], x=stats["methylation"], linewidth=1.5)
sns.plt.title("Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 342 *.anno files; CLL cells = 104 *.anno files")
plt.ylabel("Cell Type: Normal cells vs. CLL samples")


Out[13]:
<matplotlib.text.Text at 0x1075af9e8>

In [14]:
ax = sns.boxplot(x=stats["type"], y=stats["methylation"], linewidth=1.5)
ax = sns.stripplot(x=stats["type"], y=stats["methylation"], color=".25", linewidth=1.5, jitter=True)
sns.plt.title("Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 342 *.anno files; CLL cells = 104 *.anno files")


Out[14]:
<matplotlib.text.Text at 0x1076ed6a0>

In [15]:
combined = stats

In [16]:
ax = sns.boxplot(y=combined["type"],  x=combined["methylation"], linewidth=1.5)
ax = sns.stripplot(y=combined["type"], x=combined["methylation"], color=".25", linewidth=1.5, jitter=True)
sns.plt.title("Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 342 *.anno files; CLL cells = 104 *.anno files")
plt.ylabel("Percent Methylation, weighted")


Out[16]:
<matplotlib.text.Text at 0x107984898>

In [17]:
ax = sns.boxplot(x=combined["type"], y=combined["methylation"], linewidth=1.5)
ax = sns.swarmplot(x=combined["type"], y=combined["methylation"], color=".25", linewidth=1.5)
sns.plt.title("Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 342 *.anno files; CLL cells = 104 *.anno files")
plt.ylabel("Percent Methylation, weighted")


Out[17]:
<matplotlib.text.Text at 0x10797e550>

In [18]:
ax = sns.boxplot(y=combined["type"], x=combined["methylation"], linewidth=1.5)
ax = sns.swarmplot(y=combined["type"], x=combined["methylation"], color=".25", linewidth=1.5)
sns.plt.title("Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 342 *.anno files; CLL cells = 104 *.anno files")
plt.ylabel("Percent Methylation, weighted")


Out[18]:
<matplotlib.text.Text at 0x107c11320>

In [19]:
ax = sns.boxplot(y=combined["type"], x=combined["methylation"], linewidth=1.5)
ax = sns.swarmplot(y=combined["type"], x=combined["methylation"], color=".25", linewidth=1.5)
sns.plt.title("Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 342 *.anno files; CLL cells = 104 *.anno files")
plt.ylabel("Percent Methylation, weighted")


Out[19]:
<matplotlib.text.Text at 0x107aa7198>

In [20]:
combined2 = stats

In [21]:
import seaborn as sns
sns.set_style("whitegrid")
ax = sns.stripplot(x=combined2["type"], y=combined2["methylation"], hue=combined2.bio)
sns.plt.title("Biological sorting: Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 342 *.anno files; CLL cells = 104 *.anno files")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)
plt.ylabel("Percent Methylation, weighted")


Out[21]:
<matplotlib.text.Text at 0x107d46748>

In [22]:
import seaborn as sns
sns.set(style="whitegrid", palette="muted")
ax = sns.stripplot(x=combined2["type"], y=combined2["methylation"], hue=combined2.bio, 
                   jitter=True, palette=dict(normal_B = 'b', CD19CD27m = 'g', CD19CD27p = 'r', CLL='m', CD19p = 'c'))
sns.plt.title("Biological sorting: Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 342 *.anno files; CLL cells = 104 *.anno files")
plt.ylabel("Percent Methylation, weighted")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)


Out[22]:
<matplotlib.legend.Legend at 0x107fae048>

In [23]:
import seaborn as sns
sns.set_style("whitegrid")
ax = sns.stripplot(x=combined2["type"], y=combined2["methylation"], hue=combined2.bio, jitter=True)
sns.plt.title("Biological sorting: Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 342 *.anno files; CLL cells = 104 *.anno files")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)
plt.ylabel("Percent Methylation, weighted")


Out[23]:
<matplotlib.text.Text at 0x1075f8ef0>

In [24]:
ax = sns.boxplot(x=combined2["type"],y=combined2["methylation"], linewidth=1.5)
plt.ylim(0.15,0.8)
sns.set_style("whitegrid")
ax = sns.stripplot(x=combined2["type"], y=combined2["methylation"], hue=combined2.bio, jitter=True, linewidth=1.5)
sns.plt.title("Biological sorting: Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 342 *.anno files; CLL cells = 104 *.anno files")
plt.ylabel("Percent Methylation, weighted")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)


Out[24]:
<matplotlib.legend.Legend at 0x10756e198>

In [25]:
combined2.boxplot(column = 'methylation', by='type', fontsize=13)

sns.set_style("whitegrid")
ax = sns.stripplot(x=combined2["type"], y=combined2["methylation"], hue=combined2.bio, jitter=True, linewidth=1.5)
sns.plt.title("Biological sorting: Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 342 *.anno files; CLL cells = 104 *.anno files")
plt.ylabel("Percent Methylation, weighted")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)


Out[25]:
<matplotlib.legend.Legend at 0x10756edd8>

In [26]:
combined2.boxplot(column = 'methylation', by='type', fontsize=13)
plt.title('corrected')


Out[26]:
<matplotlib.text.Text at 0x1083f6da0>

In [27]:
ax = sns.violinplot(x=combined2["type"],y=combined2["methylation"], palette="Set3")
sns.set_style("whitegrid")
ax = sns.stripplot(x=combined2["type"], y=combined2["methylation"], hue=combined2.bio, jitter=True, linewidth=1.5)
sns.plt.title("Biological sorting: Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 342 *.anno files; CLL cells = 104 *.anno files")
plt.ylabel("Percent Methylation, weighted")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)


Out[27]:
<matplotlib.legend.Legend at 0x10858f3c8>

In [28]:
sns.set_style("whitegrid")
ax = sns.stripplot(x=combined2["type"], y=combined2["methylation"], hue=combined2.protocol, jitter=True)
sns.plt.title("Technical/protocol sorting: Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 342 *.anno files; CLL cells = 104 *.anno files")
plt.ylabel("Percent Methylation, weighted")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)


Out[28]:
<matplotlib.legend.Legend at 0x1086f37b8>

In [29]:
sns.set_style("whitegrid")
ax = sns.stripplot(x=combined2["type"],y=combined2["methylation"], hue=combined2.protocol, jitter=True, linewidth=1.0)
sns.plt.title("Technical/protocol sorting: Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 342 *.anno files; CLL cells = 104 *.anno files")
plt.ylabel("Percent Methylation, weighted")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)


Out[29]:
<matplotlib.legend.Legend at 0x10890d240>

In [30]:
ax = sns.violinplot(x=combined2["type"], y=combined2["methylation"], palette="Set1")
sns.set_style("whitegrid")
ax = sns.stripplot(x=combined2["type"], y=combined2["methylation"], hue=combined2.protocol, jitter=True, linewidth=1.0)
sns.plt.title("Technical/protocol sorting: Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 342 *.anno files; CLL cells = 104 *.anno files")
plt.ylabel("Percent Methylation, weighted")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)


Out[30]:
<matplotlib.legend.Legend at 0x108c322e8>

In [31]:
ax = sns.boxplot(x=combined["type"], y=combined["methylation"], linewidth=1.5)
ax = sns.stripplot(x=combined["type"], y=combined["methylation"], color=".25", linewidth=1.5, jitter=True)
sns.plt.title("Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 342 *.anno files; CLL cells = 104 *.anno files")


Out[31]:
<matplotlib.text.Text at 0x108cfd940>

In [32]:
from scipy.stats import ttest_ind

cat1 = combined[combined['type']=='CLL']
cat2 = combined[combined['type']=='normal']

t, p = ttest_ind(cat1.methylation, cat2.methylation)
print(str("t-statistic is ") + str(t))
print(str("p-value is ") + str(p))


t-statistic is 9.08197244059
p-value is 3.52580344586e-18

In [33]:
import scipy.stats
z, p = scipy.stats.mannwhitneyu(cat1.methylation, cat2.methylation)
p_value = p * 2
print(str("Mann-Whitney U statistic is ") + str(z))
print(str("p-value is ") + str(p_value))


Mann-Whitney U statistic is 7598.0
p-value is 8.83828232178e-19

In [34]:
sns.boxplot(x="bio", y="methylation", hue="type", data=combined2, palette="PRGn")
sns.despine(offset=10, trim=True)



In [35]:
sns.plt.title("Biological sorting: Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 342 *.anno files; CLL cells = 104 *.anno files")
sns.boxplot(x="bio", y="methylation", data=combined2)
sns.despine(offset=10, trim=True)



In [36]:
sns.plt.title("Biological sorting: Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 342 *.anno files; CLL cells = 104 *.anno files")
sns.boxplot(x="bio", y="methylation", data=combined2)
sns.stripplot(x="bio", y="methylation", data=combined2, color=".25", linewidth=0.5, jitter=True)
sns.despine(offset=10, trim=True)



In [37]:
sns.plt.title("Biological sorting: Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 342 *.anno files; CLL cells = 104 *.anno files")
sns.violinplot(x="bio", y="methylation", data=combined2)
sns.despine(offset=10, trim=True)



In [38]:
sns.plt.title("Biological sorting: Methylation per cell, normal vs CLL, all chromosomes, weighted")
sns.stripplot(x="bio", y="methylation", data=combined2, color=".25", linewidth=0.5, jitter=True)
plt.xlabel("Normal cells = 342 *.anno files; CLL cells = 104 *.anno files")
sns.violinplot(x="bio", y="methylation", data=combined2)
sns.despine(offset=10, trim=True)



In [39]:
final_real2 = stats

In [40]:
norm = final_real2[final_real2['type'] == 'normal']
print(len(norm))
cll = final_real2[final_real2['type'] == 'CLL']
print(len(cll))


342
104

In [41]:
ax = sns.boxplot(x=final_real2["type"], y=final_real2["methylation"], linewidth=1.5)
ax = sns.stripplot(x=final_real2["type"], y=final_real2["methylation"], color=".25", linewidth=1.5, jitter=True)
sns.plt.title("Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 342 *.anno files; CLL cells = 104 *.anno files")


Out[41]:
<matplotlib.text.Text at 0x1094bc6a0>

In [42]:
print(norm['methylation'].mean())  # mean
print(cll['methylation'].mean())

print(norm['methylation'].std())   # standard deviation
print(cll['methylation'].std())


0.5264868347885444
0.5719623389234731
0.04897658622789234
0.026034869603911014

In [43]:
t, p = ttest_ind(norm.methylation, cll.methylation)
print(str("t-statistic is ") + str(t))
print(str("p-value is ") + str(p))

#
# The classic Student's t-test is a parametric test that only works when the data are normally distributed.
# A good non-parametric alternative to the t-test is the Mann-Whitney U-test (called ranksum in matlab, 
#     or Wilcoxon test, or Man-Whitney-Wilcoxon)
#
import scipy.stats
z, p = scipy.stats.mannwhitneyu(norm.methylation, cll.methylation)
p_value = p * 2
print(str("Mann-Whitney U statistic is ") + str(z))
print(str("p-value is ") + str(p_value))


t-statistic is -9.08197244059
p-value is 3.52580344586e-18
Mann-Whitney U statistic is 7598.0
p-value is 8.83828232178e-19

In [44]:
ax = sns.boxplot(x=final_real2["type"], y=final_real2["methylation"], linewidth=1.5)
ax = sns.stripplot(x=final_real2["type"], y=final_real2["methylation"], color=".25", linewidth=1.5, jitter=True)
sns.plt.title("Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 342 *.anno files; CLL cells = 104 *.anno files")
x1, x2 = 0, 1   # columns 'Sat' and 'Sun' (first column: 0, see plt.xticks())
y, h, col = final_real2["methylation"].max() + .025, .01, 'k'
plt.plot([x1, x1, x2, x2], [y, y+h, y+h, y], lw=1.5, c=col)
plt.text((x1+x2)*.5, y+h, r'p = $8.38 \times 10^{-19}$', ha='center', va='bottom', color=col)


Out[44]:
<matplotlib.text.Text at 0x1094af978>

In [45]:
ax = sns.boxplot(x=final_real2["type"], y=final_real2["PDR_total"], linewidth=1.5)
ax = sns.stripplot(x=final_real2["type"], y=final_real2["PDR_total"], color=".25", linewidth=1.5, jitter=True)
sns.plt.title("PDR per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 342 *.anno files; CLL cells = 104 *.anno files")
plt.ylabel("PDR")


Out[45]:
<matplotlib.text.Text at 0x109845748>

In [46]:
print(norm['PDR_total'].mean())  # mean
print(cll['PDR_total'].mean())

print(norm['PDR_total'].std())   # standard deviation
print(cll['PDR_total'].std())


0.26902181089091254
0.39259232468954625
0.08837966139791369
0.016211537486115344

In [47]:
t, p = ttest_ind(norm.PDR_total, cll.PDR_total)
print(str("t-statistic is ") + str(t))
print(str("p-value is ") + str(p))

#
# The classic Student's t-test is a parametric test that only works when the data are normally distributed.
# A good non-parametric alternative to the t-test is the Mann-Whitney U-test (called ranksum in matlab, 
#     or Wilcoxon test, or Man-Whitney-Wilcoxon)
#
import scipy.stats
z, p = scipy.stats.mannwhitneyu(norm.PDR_total, cll.PDR_total)
p_value = p * 2
print(str("Mann-Whitney U statistic is ") + str(z))
print(str("p-value is ") + str(p_value))


t-statistic is -14.1756622095
p-value is 6.81435276231e-38
Mann-Whitney U statistic is 4834.0
p-value is 2.31074604e-29

In [48]:
ax = sns.boxplot(x=final_real2["type"], y=final_real2["PDR_total"], linewidth=1.5)
ax = sns.stripplot(x=final_real2["type"], y=final_real2["PDR_total"], color=".25", linewidth=1.5, jitter=True)
sns.plt.title("PDR per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 342 *.anno files; CLL cells = 104 *.anno files")
x1, x2 = 0, 1   # columns 'Sat' and 'Sun' (first column: 0, see plt.xticks())
y, h, col = final_real2["PDR_total"].max() + .025, .01, 'k'
plt.plot([x1, x1, x2, x2], [y, y+h, y+h, y], lw=1.5, c=col)
plt.text((x1+x2)*.5, y+h, r'p = $2.31 \times 10^{-29}$', ha='center', va='bottom', color=col)
plt.ylabel("PDR")


Out[48]:
<matplotlib.text.Text at 0x1098c5908>

In [49]:
ax = sns.boxplot(x=final_real2["type"], y=final_real2["methylation_unweighted"], linewidth=1.5)
ax = sns.stripplot(x=final_real2["type"], y=final_real2["methylation_unweighted"], color=".25", linewidth=1.5, jitter=True)
sns.plt.title("Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 342 *.anno files; CLL cells = 104 *.anno files")


Out[49]:
<matplotlib.text.Text at 0x1099d7198>

In [50]:
print(norm['methylation_unweighted'].mean())  # mean
print(cll['methylation_unweighted'].mean())

print(norm['methylation_unweighted'].std())   # standard deviation
print(cll['methylation_unweighted'].std())


0.6211722512583407
0.64264856433203
0.04498882007207662
0.018495370947256944

In [51]:
t, p = ttest_ind(norm.methylation_unweighted, cll.methylation_unweighted)
print(str("t-statistic is ") + str(t))
print(str("p-value is ") + str(p))

#
# The classic Student's t-test is a parametric test that only works when the data are normally distributed.
# A good non-parametric alternative to the t-test is the Mann-Whitney U-test (called ranksum in matlab, 
#     or Wilcoxon test, or Man-Whitney-Wilcoxon)
#
import scipy.stats
z, p = scipy.stats.mannwhitneyu(norm.methylation_unweighted, cll.methylation_unweighted)
p_value = p * 2
print(str("Mann-Whitney U statistic is ") + str(z))
print(str("p-value is ") + str(p_value))


t-statistic is -4.74482202659
p-value is 2.8181758819e-06
Mann-Whitney U statistic is 11103.0
p-value is 6.48096351051e-09

In [52]:
ax = sns.boxplot(x=final_real2["type"], y=final_real2["methylation_unweighted"], linewidth=1.5)
ax = sns.stripplot(x=final_real2["type"], y=final_real2["methylation_unweighted"], color=".25", linewidth=1.5, jitter=True)
sns.plt.title("Methylation per cell, normal vs CLL, all chromosomes, unweighted")
plt.xlabel("Normal cells = 342 *.anno files; CLL cells = 104 *.anno files")
x1, x2 = 0, 1   # columns 'Sat' and 'Sun' (first column: 0, see plt.xticks())
y, h, col = final_real2["methylation_unweighted"].max() + .025, .01, 'k'
plt.plot([x1, x1, x2, x2], [y, y+h, y+h, y], lw=1.5, c=col)
plt.text((x1+x2)*.5, y+h, r'p = $6.48 \times 10^{-9}$', ha='center', va='bottom', color=col)
plt.ylabel("Methylation")


Out[52]:
<matplotlib.text.Text at 0x109932898>

In [53]:
ax = sns.boxplot(x=final_real2["type"], y=final_real2["PDR_unweighted"], linewidth=1.5)
ax = sns.stripplot(x=final_real2["type"], y=final_real2["PDR_unweighted"], color=".25", linewidth=1.5, jitter=True)
sns.plt.title("PDR per cell, normal vs CLL, all chromosomes, unweighted")
plt.xlabel("Normal cells = 342 *.anno files; CLL cells = 104 *.anno files")
plt.ylabel("PDR")


Out[53]:
<matplotlib.text.Text at 0x10946d0b8>

In [54]:
print(norm['PDR_unweighted'].mean())  # mean
print(cll['PDR_unweighted'].mean())

print(norm['PDR_unweighted'].std())   # standard deviation
print(cll['PDR_unweighted'].std())


0.26492034497151973
0.37191874954710946
0.08576949652248889
0.011449347699184393

In [55]:
t, p = ttest_ind(norm.PDR_unweighted, cll.PDR_unweighted)
print(str("t-statistic is ") + str(t))
print(str("p-value is ") + str(p))

#
# The classic Student's t-test is a parametric test that only works when the data are normally distributed.
# A good non-parametric alternative to the t-test is the Mann-Whitney U-test (called ranksum in matlab, 
#     or Wilcoxon test, or Man-Whitney-Wilcoxon)
#
import scipy.stats
z, p = scipy.stats.mannwhitneyu(norm.PDR_unweighted, cll.PDR_unweighted)
p_value = p * 2
print(str("Mann-Whitney U statistic is ") + str(z))
print(str("p-value is ") + str(p_value))


t-statistic is -12.6781353527
p-value is 1.18748809307e-31
Mann-Whitney U statistic is 5371.0
p-value is 4.11168526493e-27

In [56]:
ax = sns.boxplot(x=final_real2["type"], y=final_real2["PDR_unweighted"], linewidth=1.5)
ax = sns.stripplot(x=final_real2["type"], y=final_real2["PDR_unweighted"], color=".25", linewidth=1.5, jitter=True)
sns.plt.title("PDR per cell, normal vs CLL, all chromosomes, unweighted")
plt.xlabel("Normal cells = 342 *.anno files; CLL cells = 104 *.anno files")
x1, x2 = 0, 1   # columns 'Sat' and 'Sun' (first column: 0, see plt.xticks())
y, h, col = final_real2["PDR_unweighted"].max() + .025, .01, 'k'
plt.plot([x1, x1, x2, x2], [y, y+h, y+h, y], lw=1.5, c=col)
plt.text((x1+x2)*.5, y+h, r'p = $4.11 \times 10^{-27}$', ha='center', va='bottom', color=col)
plt.ylabel("PDR")


Out[56]:
<matplotlib.text.Text at 0x109bb3e80>

In [57]:
CD27p = final_real2[final_real2['bio'] == 'CD19CD27p']
print(len(CD27p))
CD27m = final_real2[final_real2['bio'] == 'CD19CD27m']
print(len(CD27m))


70
75

In [58]:
# CD19CD27m

CD27cells = pd.concat([CD27p, CD27m])

In [59]:
CD27cells = CD27cells.reset_index(drop=True)

In [60]:
ax = sns.boxplot(x=CD27cells["bio"], y=CD27cells["methylation"], linewidth=1.5, palette='Set2')
ax = sns.stripplot(x=CD27cells["bio"], y=CD27cells["methylation"], color=".25", linewidth=1.5, jitter=True)
sns.plt.title("Methylation per cell, CD27p vs. CD27m, all chromosomes, weighted")
plt.xlabel("CD27p = 68 *.anno files; CD27m = 74 *.anno files")
plt.ylabel("Methylation")


Out[60]:
<matplotlib.text.Text at 0x109f36e80>

In [61]:
print(CD27p['methylation'].mean())  # mean
print(CD27m['methylation'].mean())

print(CD27p['methylation'].std())   # standard deviation
print(CD27m['methylation'].std())


0.49450628223261744
0.5074007176737922
0.03269744280542611
0.033035401287325766

In [62]:
t, p = ttest_ind(CD27p.methylation, CD27m.methylation)
print(str("t-statistic is ") + str(t))
print(str("p-value is ") + str(p))

#
# The classic Student's t-test is a parametric test that only works when the data are normally distributed.
# A good non-parametric alternative to the t-test is the Mann-Whitney U-test (called ranksum in matlab, 
#     or Wilcoxon test, or Man-Whitney-Wilcoxon)
#
import scipy.stats
z, p = scipy.stats.mannwhitneyu(CD27p.methylation, CD27m.methylation)
p_value = p * 2
print(str("Mann-Whitney U statistic is ") + str(z))
print(str("p-value is ") + str(p_value))


t-statistic is -2.36027039083
p-value is 0.0196122014505
Mann-Whitney U statistic is 2025.0
p-value is 0.0176897250099

In [63]:
ax = sns.stripplot(x=CD27cells["bio"], y=CD27cells["methylation"], hue=CD27cells.protocol, linewidth=1.5, jitter=True)
sns.plt.title("Methylation per cell, CD27p vs. CD27m, all chromosomes, weighted")
plt.xlabel("CD27p = 68 *.anno files; CD27m = 74 *.anno files")
plt.ylabel("Methylation")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)


Out[63]:
<matplotlib.legend.Legend at 0x109f5ef60>

In [64]:
CD27p122 = CD27p[CD27p['protocol'] == 'NormalBCD19pCD27pcell1_22_']
CD27p2344 = CD27p[CD27p['protocol'] == 'NormalBCD19pCD27pcell23_44']
CD27p4566 = CD27p[CD27p['protocol'] == 'NormalBCD19pCD27pcell45_66']
CD27p6788 = CD27p[CD27p['protocol'] == 'NormalBCD19pCD27pcell67_88']

CD27m122 = CD27m[CD27m['protocol'] == 'NormalBCD19pCD27mcell1_22_']
CD27m2344 = CD27m[CD27m['protocol'] == 'NormalBCD19pCD27mcell23_44']
CD27m4566 = CD27m[CD27m['protocol'] == 'NormalBCD19pCD27mcell45_66']
CD27m6788 = CD27m[CD27m['protocol'] == 'NormalBCD19pCD27mcell67_88']

In [65]:
print("MEAN")
print("CD27p122")
print(CD27p122['methylation'].mean()) 
print("CD27p2344")
print(CD27p2344['methylation'].mean())
print("CD27p4566")
print(CD27p4566['methylation'].mean()) 
print("CD27p6788")
print(CD27p6788['methylation'].mean())


print("***")
print("CD27m122")
print(CD27m122['methylation'].mean()) 
print("CD27m2344")
print(CD27m2344['methylation'].mean())
print("CD27m4566")
print(CD27m4566['methylation'].mean()) 
print("CD27m6788")
print(CD27m6788['methylation'].mean())


MEAN
CD27p122
0.49285703341970166
CD27p2344
0.4851007884832205
CD27p4566
0.4834148056947074
CD27p6788
0.5105267884212087
***
CD27m122
0.5040448558871422
CD27m2344
0.5103733462571467
CD27m4566
0.5141029786434724
CD27m6788
0.5021620097019546

In [66]:
print("total")
print("CD27p122")
print(len(CD27p122['methylation']))
print("CD27p2344")
print(len(CD27p2344['methylation']))
print("CD27p4566")
print(len(CD27p4566['methylation']))
print("CD27p6788")
print(len(CD27p6788['methylation']))


print("***")
print("CD27m122")
print(len(CD27m122['methylation']))
print("CD27m2344")
print(len(CD27m2344['methylation']))
print("CD27m4566")
print(len(CD27m4566['methylation']))
print("CD27m6788")
print(len(CD27m6788['methylation']))


total
CD27p122
18
CD27p2344
22
CD27p4566
9
CD27p6788
21
***
CD27m122
18
CD27m2344
19
CD27m4566
17
CD27m6788
21

In [67]:
print("STD")
print("CD27p122")
print(CD27p122['methylation'].std()) 
print("CD27p2344")
print(CD27p2344['methylation'].std())
print("CD27p4566")
print(CD27p4566['methylation'].std()) 
print("CD27p6788")
print(CD27p6788['methylation'].std())


print("***")
print("CD27m122")
print(CD27m122['methylation'].std()) 
print("CD27m2344")
print(CD27m2344['methylation'].std())
print("CD27m4566")
print(CD27m4566['methylation'].std()) 
print("CD27m6788")
print(CD27m6788['methylation'].std())


STD
CD27p122
0.03385920150542658
CD27p2344
0.03338021715676278
CD27p4566
0.038155892534940505
CD27p6788
0.023175173194632043
***
CD27m122
0.03717250597962948
CD27m2344
0.025590610546114965
CD27m4566
0.03449159314327558
CD27m6788
0.035167704732143194

In [68]:
ax = sns.boxplot(x=CD27cells["protocol"], y=CD27cells["methylation"], linewidth=1.5, palette='Set2')
ax = sns.stripplot(x=CD27cells["protocol"], y=CD27cells["methylation"], color=".25", linewidth=1.5, jitter=True)
ax.xaxis.set_major_formatter(plt.NullFormatter())
sns.plt.title("Methylation per cell, CD27p vs. CD27m, all chromosomes, weighted")
plt.xlabel("CD27p122 CD27p2344 CD27p4566 CD27p6788 CD27m122 CD27m2344 CD27m4566 CD27m6788")
plt.ylabel("Methylation")


Out[68]:
<matplotlib.text.Text at 0x10a0d9438>

In [69]:
ax = sns.boxplot(x=CD27p["protocol"], y=CD27p["methylation"], linewidth=1.5, palette='Set2')
ax = sns.stripplot(x=CD27p["protocol"], y=CD27p["methylation"], color=".25", linewidth=1.5, jitter=True)
ax.xaxis.set_major_formatter(plt.NullFormatter())
sns.plt.title("Methylation per cell, CD27p, all chromosomes, weighted")
plt.xlabel("CD27p122 CD27p2344 CD27p4566 CD27p6788")
plt.ylabel("Methylation")


Out[69]:
<matplotlib.text.Text at 0x10a4742b0>

In [70]:
ax = sns.boxplot(x=CD27m["protocol"], y=CD27m["methylation"], linewidth=1.5, palette='Set1')
ax = sns.stripplot(x=CD27m["protocol"], y=CD27m["methylation"], color=".25", linewidth=1.5, jitter=True)
ax.xaxis.set_major_formatter(plt.NullFormatter())
sns.plt.title("Methylation per cell, CD27m, all chromosomes, weighted")
plt.xlabel("CD27m122 CD27m2344 CD27m4566 CD27m6788")
plt.ylabel("Methylation")


Out[70]:
<matplotlib.text.Text at 0x109704b70>

In [71]:
print("MEAN: methylation unweighted")
print("CD27p122")
print(CD27p122['methylation_unweighted'].mean()) 
print("CD27p2344")
print(CD27p2344['methylation_unweighted'].mean())
print("CD27p4566")
print(CD27p4566['methylation_unweighted'].mean()) 
print("CD27p6788")
print(CD27p6788['methylation_unweighted'].mean())


print("***")
print("CD27m122")
print(CD27m122['methylation_unweighted'].mean()) 
print("CD27m2344")
print(CD27m2344['methylation_unweighted'].mean())
print("CD27m4566")
print(CD27m4566['methylation_unweighted'].mean()) 
print("CD27m6788")
print(CD27m6788['methylation_unweighted'].mean())


MEAN: methylation unweighted
CD27p122
0.5809577160887391
CD27p2344
0.5718335489217137
CD27p4566
0.5724617957578969
CD27p6788
0.5952836402481724
***
CD27m122
0.6143624872865359
CD27m2344
0.628785630362192
CD27m4566
0.6318364273228161
CD27m6788
0.6161925913256822

In [72]:
print("STD: methylation unweighted")
print("CD27p122")
print(CD27p122['methylation_unweighted'].std()) 
print("CD27p2344")
print(CD27p2344['methylation_unweighted'].std())
print("CD27p4566")
print(CD27p4566['methylation_unweighted'].std()) 
print("CD27p6788")
print(CD27p6788['methylation_unweighted'].std())


print("***")
print("CD27m122")
print(CD27m122['methylation_unweighted'].std()) 
print("CD27m2344")
print(CD27m2344['methylation_unweighted'].std())
print("CD27m4566")
print(CD27m4566['methylation_unweighted'].std()) 
print("CD27m6788")
print(CD27m6788['methylation_unweighted'].std())


STD: methylation unweighted
CD27p122
0.03327787475880213
CD27p2344
0.028777568411674094
CD27p4566
0.02739289849650322
CD27p6788
0.019334987628355504
***
CD27m122
0.02986529960014145
CD27m2344
0.018536295516258655
CD27m4566
0.025052451971795592
CD27m6788
0.026227446358268715

In [73]:
ax = sns.stripplot(x=CD27cells["bio"], y=CD27cells["methylation_unweighted"], hue=CD27cells.protocol, linewidth=1.5, jitter=True)
sns.plt.title("Methylation per cell, CD27p vs. CD27m, all chromosomes, unweighted")
plt.xlabel("CD27p = 68 *.anno files; CD27m = 74 *.anno files")
plt.ylabel("Methylation")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)


Out[73]:
<matplotlib.legend.Legend at 0x10a46d748>

In [74]:
ax = sns.boxplot(x=CD27cells["protocol"], y=CD27cells["methylation_unweighted"], linewidth=1.5, palette='Set2')
ax = sns.stripplot(x=CD27cells["protocol"], y=CD27cells["methylation_unweighted"], color=".25", linewidth=1.5, jitter=True)
ax.xaxis.set_major_formatter(plt.NullFormatter())
sns.plt.title("Methylation per cell, CD27p vs. CD27m, all chromosomes, unweighted")
plt.xlabel("CD27p122 CD27p2344 CD27p4566 CD27p6788 CD27m122 CD27m2344 CD27m4566 CD27m6788")
plt.ylabel("Methylation")


Out[74]:
<matplotlib.text.Text at 0x109830470>

In [75]:
ax = sns.boxplot(x=CD27p["protocol"], y=CD27p["methylation_unweighted"], linewidth=1.5, palette='Set2')
ax = sns.stripplot(x=CD27p["protocol"], y=CD27p["methylation_unweighted"], color=".25", linewidth=1.5, jitter=True)
ax.xaxis.set_major_formatter(plt.NullFormatter())
sns.plt.title("Methylation per cell, CD27p, all chromosomes, unweighted")
plt.xlabel("CD27p122 CD27p2344 CD27p4566 CD27p6788")
plt.ylabel("Methylation")


Out[75]:
<matplotlib.text.Text at 0x10a88fba8>

In [76]:
ax = sns.boxplot(x=CD27m["protocol"], y=CD27m["methylation_unweighted"], linewidth=1.5, palette='Set1')
ax = sns.stripplot(x=CD27m["protocol"], y=CD27m["methylation_unweighted"], color=".25", linewidth=1.5, jitter=True)
ax.xaxis.set_major_formatter(plt.NullFormatter())
sns.plt.title("Methylation per cell, CD27m, all chromosomes, unweighted")
plt.xlabel("CD27m122 CD27m2344 CD27m4566 CD27m6788")
plt.ylabel("Methylation")


Out[76]:
<matplotlib.text.Text at 0x10aa55e48>

In [77]:
print("MEAN: PDR, weighted")
print("CD27p122")
print(CD27p122['PDR_total'].mean()) 
print("CD27p2344")
print(CD27p2344['PDR_total'].mean())
print("CD27p4566")
print(CD27p4566['PDR_total'].mean()) 
print("CD27p6788")
print(CD27p6788['PDR_total'].mean())


print("***")
print("CD27m122")
print(CD27m122['PDR_total'].mean()) 
print("CD27m2344")
print(CD27m2344['PDR_total'].mean())
print("CD27m4566")
print(CD27m4566['PDR_total'].mean()) 
print("CD27m6788")
print(CD27m6788['PDR_total'].mean())

print("***")
print("*** *** *** *** *** ***")
print("***")

print("STD: PDR, weighted")
print("CD27p122")
print(CD27p122['PDR_total'].std()) 
print("CD27p2344")
print(CD27p2344['PDR_total'].std())
print("CD27p4566")
print(CD27p4566['PDR_total'].std()) 
print("CD27p6788")
print(CD27p6788['PDR_total'].std())


print("***")
print("CD27m122")
print(CD27m122['PDR_total'].std()) 
print("CD27m2344")
print(CD27m2344['PDR_total'].std())
print("CD27m4566")
print(CD27m4566['PDR_total'].std()) 
print("CD27m6788")
print(CD27m6788['PDR_total'].std())


MEAN: PDR, weighted
CD27p122
0.2869346883599433
CD27p2344
0.3110086941203543
CD27p4566
0.2800724545675633
CD27p6788
0.3084258980341463
***
CD27m122
0.17683627477104374
CD27m2344
0.17956441493748626
CD27m4566
0.17614056770444841
CD27m6788
0.1780542946983231
***
*** *** *** *** *** ***
***
STD: PDR, weighted
CD27p122
0.04854426516146637
CD27p2344
0.02870685299679014
CD27p4566
0.03660540813886372
CD27p6788
0.03616795321758426
***
CD27m122
0.013945749150551572
CD27m2344
0.01642690906832183
CD27m4566
0.004416412431237496
CD27m6788
0.01372846850430287

In [78]:
ax = sns.stripplot(x=CD27cells["bio"], y=CD27cells["PDR_total"], hue=CD27cells.protocol, linewidth=1.5, jitter=True)
sns.plt.title("PDR total per cell, CD27p vs. CD27m, all chromosomes, weighted")
plt.xlabel("CD27p = 68 *.anno files; CD27m = 74 *.anno files")
plt.ylabel("PDR total, weighted")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)


Out[78]:
<matplotlib.legend.Legend at 0x10a8a0e10>

In [79]:
ax = sns.boxplot(x=CD27cells["protocol"], y=CD27cells["PDR_total"], linewidth=1.5, palette='Set2')
ax = sns.stripplot(x=CD27cells["protocol"], y=CD27cells["PDR_total"], color=".25", linewidth=1.5, jitter=True)
ax.xaxis.set_major_formatter(plt.NullFormatter())
sns.plt.title("PDR total per cell, CD27p vs. CD27m, all chromosomes, weighted")
plt.xlabel("CD27p122 CD27p2344 CD27p4566 CD27p6788 CD27m122 CD27m2344 CD27m4566 CD27m6788")
plt.ylabel("PDR total, weighted")


Out[79]:
<matplotlib.text.Text at 0x10ac24080>

In [80]:
ax = sns.boxplot(x=CD27p["protocol"], y=CD27p["PDR_total"], linewidth=1.5, palette='Set2')
ax = sns.stripplot(x=CD27p["protocol"], y=CD27p["PDR_total"], color=".25", linewidth=1.5, jitter=True)
ax.xaxis.set_major_formatter(plt.NullFormatter())
sns.plt.title("PDR total per cell, CD27p, all chromosomes, weighted")
plt.xlabel("CD27p122 CD27p2344 CD27p4566 CD27p6788")
plt.ylabel("PDR total, weighted")


Out[80]:
<matplotlib.text.Text at 0x10aedc7b8>

In [81]:
ax = sns.boxplot(x=CD27m["protocol"], y=CD27m["PDR_total"], linewidth=1.5, palette='Set1')
ax = sns.stripplot(x=CD27m["protocol"], y=CD27m["PDR_total"], color=".25", linewidth=1.5, jitter=True)
ax.xaxis.set_major_formatter(plt.NullFormatter())
sns.plt.title("PDR total per cell, CD27m, all chromosomes, weighted")
plt.xlabel("CD27m122 CD27m2344 CD27m4566 CD27m6788")
plt.ylabel("PDR total, weighted")


Out[81]:
<matplotlib.text.Text at 0x10b0b5f98>

In [82]:
print("MEAN: PDR, unweighted")
print("CD27p122")
print(CD27p122['PDR_unweighted'].mean()) 
print("CD27p2344")
print(CD27p2344['PDR_unweighted'].mean())
print("CD27p4566")
print(CD27p4566['PDR_unweighted'].mean()) 
print("CD27p6788")
print(CD27p6788['PDR_unweighted'].mean())


print("***")
print("CD27m122")
print(CD27m122['PDR_unweighted'].mean()) 
print("CD27m2344")
print(CD27m2344['PDR_unweighted'].mean())
print("CD27m4566")
print(CD27m4566['PDR_unweighted'].mean()) 
print("CD27m6788")
print(CD27m6788['PDR_unweighted'].mean())

print("***")
print("*** *** *** *** *** ***")
print("***")

print("STD: PDR, unweighted")
print("CD27p122")
print(CD27p122['PDR_unweighted'].std()) 
print("CD27p2344")
print(CD27p2344['PDR_unweighted'].std())
print("CD27p4566")
print(CD27p4566['PDR_unweighted'].std()) 
print("CD27p6788")
print(CD27p6788['PDR_unweighted'].std())


print("***")
print("CD27m122")
print(CD27m122['PDR_unweighted'].std()) 
print("CD27m2344")
print(CD27m2344['PDR_unweighted'].std())
print("CD27m4566")
print(CD27m4566['PDR_unweighted'].std()) 
print("CD27m6788")
print(CD27m6788['PDR_unweighted'].std())


MEAN: PDR, unweighted
CD27p122
0.2928016275899315
CD27p2344
0.3214126851517178
CD27p4566
0.2883948743714171
CD27p6788
0.31264582366755844
***
CD27m122
0.1735435408737465
CD27m2344
0.17776415764078474
CD27m4566
0.17552949440422289
CD27m6788
0.17414329941334858
***
*** *** *** *** *** ***
***
STD: PDR, unweighted
CD27p122
0.05077234389617635
CD27p2344
0.028481234302547183
CD27p4566
0.03809701507879582
CD27p6788
0.0355610815414403
***
CD27m122
0.01440649217303426
CD27m2344
0.01815739362665765
CD27m4566
0.003452085800240377
CD27m6788
0.013595703739238373

In [83]:
ax = sns.stripplot(x=CD27cells["bio"], y=CD27cells["PDR_unweighted"], hue=CD27cells.protocol, linewidth=1.5, jitter=True)
sns.plt.title("PDR per cell, CD27p vs. CD27m, all chromosomes, unweighted")
plt.xlabel("CD27p = 68 *.anno files; CD27m = 74 *.anno files")
plt.ylabel("PDR total, unweighted")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)


Out[83]:
<matplotlib.legend.Legend at 0x10ac88470>

In [84]:
ax = sns.boxplot(x=CD27cells["protocol"], y=CD27cells["PDR_unweighted"], linewidth=1.5, palette='Set2')
ax = sns.stripplot(x=CD27cells["protocol"], y=CD27cells["PDR_unweighted"], color=".25", linewidth=1.5, jitter=True)
ax.xaxis.set_major_formatter(plt.NullFormatter())
sns.plt.title("PDR per cell, CD27p vs. CD27m, all chromosomes, unweighted")
plt.xlabel("CD27p122 CD27p2344 CD27p4566 CD27p6788 CD27m122 CD27m2344 CD27m4566 CD27m6788")
plt.ylabel("PDR, unweighted")


Out[84]:
<matplotlib.text.Text at 0x10b0b5358>

In [85]:
ax = sns.boxplot(x=CD27p["protocol"], y=CD27p["PDR_unweighted"], linewidth=1.5, palette='Set2')
ax = sns.stripplot(x=CD27p["protocol"], y=CD27p["PDR_unweighted"], color=".25", linewidth=1.5, jitter=True)
ax.xaxis.set_major_formatter(plt.NullFormatter())
sns.plt.title("PDR per cell, CD27p, all chromosomes, unweighted")
plt.xlabel("CD27p122 CD27p2344 CD27p4566 CD27p6788")
plt.ylabel("PDR, unweighted")


Out[85]:
<matplotlib.text.Text at 0x10b76a898>

In [86]:
ax = sns.boxplot(x=CD27m["protocol"], y=CD27m["PDR_unweighted"], linewidth=1.5, palette='Set1')
ax = sns.stripplot(x=CD27m["protocol"], y=CD27m["PDR_unweighted"], color=".25", linewidth=1.5, jitter=True)
ax.xaxis.set_major_formatter(plt.NullFormatter())
sns.plt.title("PDR total per cell, CD27m, all chromosomes, unweighted")
plt.xlabel("CD27m122 CD27m2344 CD27m4566 CD27m6788")
plt.ylabel("PDR, unweighted")


Out[86]:
<matplotlib.text.Text at 0x10b92f1d0>

In [87]:
ax = sns.boxplot(x=CD27m["protocol"], y=CD27m["PDR_unweighted"], linewidth=1.5, palette='Set1')
ax = sns.stripplot(x=CD27m["protocol"], y=CD27m["PDR_unweighted"], color=".25", linewidth=1.5, jitter=True)
ax.xaxis.set_major_formatter(plt.NullFormatter())
sns.plt.title("PDR total per cell, CD27m, all chromosomes, unweighted")
plt.xlabel("CD27m122 CD27m2344 CD27m4566 CD27m6788")
plt.ylabel("PDR, unweighted")
plt.ylim(0.16, 0.19)


Out[87]:
(0.16, 0.19)

In [88]:
ax = sns.boxplot(x=CD27cells["bio"], y=CD27cells["methylation"], linewidth=1.5, palette='Set2')
ax = sns.stripplot(x=CD27cells["bio"], y=CD27cells["methylation"], color=".25", linewidth=1.5, jitter=True)
sns.plt.title("Methylation per cell, CD27p vs. CD27m, all chromosomes, weighted")
plt.xlabel("CD27p = 68 *.anno files; CD27m = 74 *.anno files")
x1, x2 = 0, 1   # columns 'Sat' and 'Sun' (first column: 0, see plt.xticks())
y, h, col = CD27cells["methylation"].max() + .025, .01, 'k'
plt.plot([x1, x1, x2, x2], [y, y+h, y+h, y], lw=1.5, c=col)
plt.text((x1+x2)*.5, y+h, r'p = $0.0177$', ha='center', va='bottom', color=col)
plt.ylabel("Methylation")
plt.ylim(0.40, 0.63)


Out[88]:
(0.4, 0.63)

In [89]:
print(CD27p['methylation'].mean())  # mean
print(CD27m['methylation'].mean())

print(CD27p['methylation'].std())   # standard deviation
print(CD27m['methylation'].std())


0.49450628223261744
0.5074007176737922
0.03269744280542611
0.033035401287325766

In [90]:
t, p = ttest_ind(CD27p.methylation, CD27m.methylation)
print(str("t-statistic is ") + str(t))
print(str("p-value is ") + str(p))

#
# The classic Student's t-test is a parametric test that only works when the data are normally distributed.
# A good non-parametric alternative to the t-test is the Mann-Whitney U-test (called ranksum in matlab, 
#     or Wilcoxon test, or Man-Whitney-Wilcoxon)
#
import scipy.stats
z, p = scipy.stats.mannwhitneyu(CD27p.methylation, CD27m.methylation)
p_value = p * 2
print(str("Mann-Whitney U statistic is ") + str(z))
print(str("p-value is ") + str(p_value))


t-statistic is -2.36027039083
p-value is 0.0196122014505
Mann-Whitney U statistic is 2025.0
p-value is 0.0176897250099

In [91]:
ax = sns.boxplot(x=CD27cells["bio"], y=CD27cells["PDR_total"], linewidth=1.5, palette='Set2')
ax = sns.stripplot(x=CD27cells["bio"], y=CD27cells["PDR_total"], color=".25", linewidth=1.5, jitter=True)
sns.plt.title("PDR per cell, CD27p vs. CD27m, all chromosomes, weighted")
plt.xlabel("CD27p = 68 *.anno files; CD27m = 74 *.anno files")
plt.ylabel("PDR")


Out[91]:
<matplotlib.text.Text at 0x10bec3320>

In [92]:
print(CD27p['PDR_total'].mean())  # mean
print(CD27m['PDR_total'].mean())

print(CD27p['PDR_total'].std())   # standard deviation
print(CD27m['PDR_total'].std())


0.3000658801564559
0.17771075559108584
0.03901922884098728
0.012928253443472829

In [93]:
t, p = ttest_ind(CD27p.PDR_total, CD27m.PDR_total)
print(str("t-statistic is ") + str(t))
print(str("p-value is ") + str(p))

#
# The classic Student's t-test is a parametric test that only works when the data are normally distributed.
# A good non-parametric alternative to the t-test is the Mann-Whitney U-test (called ranksum in matlab, 
#     or Wilcoxon test, or Man-Whitney-Wilcoxon)
#
import scipy.stats
z, p = scipy.stats.mannwhitneyu(CD27p.PDR_total, CD27m.PDR_total)
p_value = p * 2
print(str("Mann-Whitney U statistic is ") + str(z))
print(str("p-value is ") + str(p_value))


t-statistic is 25.6929050766
p-value is 1.90529511138e-55
Mann-Whitney U statistic is 14.0
p-value is 5.21013642477e-25

In [94]:
ax = sns.boxplot(x=CD27cells["bio"], y=CD27cells["PDR_total"], linewidth=1.5, palette='Set2')
ax = sns.stripplot(x=CD27cells["bio"], y=CD27cells["PDR_total"], color=".25", linewidth=1.5, jitter=True)
sns.plt.title("PDR per cell, CD27p vs. CD27m, all chromosomes, weighted")
plt.xlabel("CD27p = 68 *.anno files; CD27m = 74 *.anno files")
x1, x2 = 0, 1   # columns 'Sat' and 'Sun' (first column: 0, see plt.xticks())
y, h, col = CD27cells["PDR_total"].max() + .025, .01, 'k'
plt.plot([x1, x1, x2, x2], [y, y+h, y+h, y], lw=1.5, c=col)
plt.text((x1+x2)*.5, y+h, r'p = $5.21 \times 10^{-25}$', ha='center', va='bottom', color=col)
plt.ylabel("PDR")


Out[94]:
<matplotlib.text.Text at 0x10c0c9780>

In [95]:
ax = sns.boxplot(x=CD27cells["bio"], y=CD27cells["methylation_unweighted"], linewidth=1.5, palette='Set2')
ax = sns.stripplot(x=CD27cells["bio"], y=CD27cells["methylation_unweighted"], color=".25", linewidth=1.5, jitter=True)
sns.plt.title("Methylation per cell, CD27p vs. CD27m, all chromosomes, unweighted")
plt.xlabel("CD27p = 68 *.anno files; CD27m = 74 *.anno files")
plt.ylabel("Methylation")


Out[95]:
<matplotlib.text.Text at 0x10c469fd0>

In [96]:
print(CD27p['methylation_unweighted'].mean())  # mean
print(CD27m['methylation_unweighted'].mean())

print(CD27p['methylation_unweighted'].std())   # standard deviation
print(CD27m['methylation_unweighted'].std())


0.5812955653272529
0.6224895390715532
0.028610332337666085
0.02583919269177535

In [97]:
t, p = ttest_ind(CD27p.methylation_unweighted, CD27m.methylation_unweighted)
print(str("t-statistic is ") + str(t))
print(str("p-value is ") + str(p))

#
# The classic Student's t-test is a parametric test that only works when the data are normally distributed.
# A good non-parametric alternative to the t-test is the Mann-Whitney U-test (called ranksum in matlab, 
#     or Wilcoxon test, or Man-Whitney-Wilcoxon)
#
import scipy.stats
z, p = scipy.stats.mannwhitneyu(CD27p.methylation_unweighted, CD27m.methylation_unweighted)
p_value = p * 2
print(str("Mann-Whitney U statistic is ") + str(z))
print(str("p-value is ") + str(p_value))


t-statistic is -9.10910523078
p-value is 6.71657586611e-16
Mann-Whitney U statistic is 684.0
p-value is 1.61589647975e-14

In [98]:
ax = sns.boxplot(x=CD27cells["bio"], y=CD27cells["methylation_unweighted"], linewidth=1.5, palette='Set2')
ax = sns.stripplot(x=CD27cells["bio"], y=CD27cells["methylation_unweighted"], color=".25", linewidth=1.5, jitter=True)
sns.plt.title("Methylation per cell, CD27p vs. CD27m, all chromosomes, unweighted")
plt.xlabel("CD27p = 68 *.anno files; CD27m = 74 *.anno files")
x1, x2 = 0, 1   # columns 'Sat' and 'Sun' (first column: 0, see plt.xticks())
y, h, col = CD27cells["methylation_unweighted"].max() + .025, .01, 'k'
plt.plot([x1, x1, x2, x2], [y, y+h, y+h, y], lw=1.5, c=col)
plt.text((x1+x2)*.5, y+h, r'p = $1.62 \times 10^{-14}$', ha='center', va='bottom', color=col)
plt.ylabel("Methylation")


Out[98]:
<matplotlib.text.Text at 0x10c6e3748>

In [99]:
ax = sns.boxplot(x=CD27cells["bio"], y=CD27cells["PDR_unweighted"], linewidth=1.5, palette='Set2')
ax = sns.stripplot(x=CD27cells["bio"], y=CD27cells["PDR_unweighted"], color=".25", linewidth=1.5, jitter=True)
sns.plt.title("PDR per cell, CD27p vs. CD27m, all chromosomes, unweighted")
plt.xlabel("CD27p = 68 *.anno files; CD27m = 74 *.anno files")
plt.ylabel("PDR")


Out[99]:
<matplotlib.text.Text at 0x10bec4978>

In [100]:
print(CD27p['PDR_unweighted'].mean())  # mean
print(CD27m['PDR_unweighted'].mean())

print(CD27p['PDR_unweighted'].std())   # standard deviation
print(CD27m['PDR_unweighted'].std())


0.30718035051882914
0.17523084564605945
0.03988060101236509
0.013531647403441469

In [101]:
t, p = ttest_ind(CD27p.PDR_unweighted, CD27m.PDR_unweighted)
print(str("t-statistic is ") + str(t))
print(str("p-value is ") + str(p))

#
# The classic Student's t-test is a parametric test that only works when the data are normally distributed.
# A good non-parametric alternative to the t-test is the Mann-Whitney U-test (called ranksum in matlab, 
#     or Wilcoxon test, or Man-Whitney-Wilcoxon)
#
import scipy.stats
z, p = scipy.stats.mannwhitneyu(CD27p.PDR_unweighted, CD27m.PDR_unweighted)
p_value = p * 2
print(str("Mann-Whitney U statistic is ") + str(z))
print(str("p-value is ") + str(p_value))


t-statistic is 27.0398639206
p-value is 4.4126371972e-58
Mann-Whitney U statistic is 13.0
p-value is 4.99957643281e-25

In [102]:
ax = sns.boxplot(x=CD27cells["bio"], y=CD27cells["PDR_unweighted"], linewidth=1.5, palette='Set2')
ax = sns.stripplot(x=CD27cells["bio"], y=CD27cells["PDR_unweighted"], color=".25", linewidth=1.5, jitter=True)
sns.plt.title("PDR per cell, CD27p vs. CD27m, all chromosomes, unweighted")
plt.xlabel("CD27p = 68 *.anno files; CD27m = 74 *.anno files")
x1, x2 = 0, 1   # columns 'Sat' and 'Sun' (first column: 0, see plt.xticks())
y, h, col = CD27cells["PDR_unweighted"].max() + .025, .01, 'k'
plt.plot([x1, x1, x2, x2], [y, y+h, y+h, y], lw=1.5, c=col)
plt.text((x1+x2)*.5, y+h, r'p = $5.00 \times 10^{-25}$', ha='center', va='bottom', color=col)
plt.ylabel("PDR")


Out[102]:
<matplotlib.text.Text at 0x10bba0d68>

In [103]:
import seaborn as sns
sns.set(style="whitegrid", palette="muted")
muted = ["#4878CF", "#6ACC65", "#D65F5F", "#B47CC7", "#C4AD66", "#77BEDB"]
newPal   = dict(normal_B = muted[5], CD19CD27m = muted[2],CD19CD27p = muted[1], CD19p = muted[3], CLL=muted[0])
ax = sns.stripplot(x=final_real2["type"], y=final_real2["methylation"], hue=final_real2.bio, jitter=True, palette=newPal)
sns.plt.title("Biological sorting: Methylation per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 342 *.anno files; CLL cells = 104 *.anno files")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)
plt.ylabel("Methylation")


Out[103]:
<matplotlib.text.Text at 0x10c44a1d0>

In [104]:
import seaborn as sns
sns.set(style="whitegrid", palette="muted")
muted = ["#4878CF", "#6ACC65", "#D65F5F", "#B47CC7", "#C4AD66", "#77BEDB"]
newPal   = dict(normal_B = muted[5], CD19CD27m = muted[2],CD19CD27p = muted[1], CD19p = muted[3], CLL=muted[0])
ax = sns.stripplot(x=final_real2["type"], y=final_real2["PDR_total"], hue=final_real2.bio, jitter=True, palette=newPal)
sns.plt.title("Biological sorting: PDR per cell, normal vs CLL, all chromosomes, weighted")
plt.xlabel("Normal cells = 342 *.anno files; CLL cells = 104 *.anno files")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)
plt.ylabel("PDR")


Out[104]:
<matplotlib.text.Text at 0x10c44a860>

In [105]:
import seaborn as sns
sns.set(style="whitegrid", palette="muted")
muted = ["#4878CF", "#6ACC65", "#D65F5F", "#B47CC7", "#C4AD66", "#77BEDB"]
newPal   = dict(normal_B = muted[5], CD19CD27m = muted[2],CD19CD27p = muted[1], CD19p = muted[3], CLL=muted[0])
ax = sns.stripplot(x=final_real2["type"], y=final_real2["methylation_unweighted"], hue=final_real2.bio, jitter=True, palette=newPal)
sns.plt.title("Biological sorting: Methylation per cell, normal vs CLL, all chromosomes, unweighted")
plt.xlabel("Normal cells = 342 *.anno files; CLL cells = 104 *.anno files")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)
plt.ylabel("Methylation")


Out[105]:
<matplotlib.text.Text at 0x10c37d320>

In [106]:
sns.set(style="whitegrid", palette="muted")
muted = ["#4878CF", "#6ACC65", "#D65F5F", "#B47CC7", "#C4AD66", "#77BEDB"]
newPal   = dict(normal_B = muted[5], CD19CD27m = muted[2],CD19CD27p = muted[1], CD19p = muted[3], CLL=muted[0])
ax = sns.stripplot(x=final_real2["type"], y=final_real2["PDR_unweighted"], hue=final_real2.bio, jitter=True, palette=newPal)
sns.plt.title("Biological sorting: PDR per cell, normal vs CLL, all chromosomes, unweighted")
plt.xlabel("Normal cells = 342 *.anno files; CLL cells = 104 *.anno files")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)
plt.ylabel("PDR")


Out[106]:
<matplotlib.text.Text at 0x10cba2e80>

In [ ]:


In [ ]:


In [ ]:


In [ ]: