In [49]:
%matplotlib inline
In [50]:
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import os
# Work from the directory holding the RRBS CSV/TXT inputs. Set the RRBS_DATA_DIR
# environment variable to run on another machine; the original local path is the fallback.
os.chdir(os.environ.get('RRBS_DATA_DIR', '/Users/evanbiederstedt/Downloads/RRBS_data_files'))
In [51]:
# Input CSVs expected in the working directory (this bare string is also echoed as the cell output).
# Every sample is listed as a withFilter/noFilter pair except CLL_RRBS_trito_pool, listed as noFilter only.
"""
total_read_CpGs_normal_B_cellA1H1_withFilter.csv
total_read_CpGs_normal_B_cellA1H1_noFilter.csv
total_read_CpGs_pCD27mcell_withFilter.csv
total_read_CpGs_pCD27mcell_noFilter.csv
total_read_CpGs_pCD27pcell_withFilter.csv
total_read_CpGs_pCD27pcell_noFilter.csv
total_read_CpGs_NormalBCD19pcell_withFilter.csv
total_read_CpGs_NormalBCD19pcell_noFilter.csv
total_read_CpGs_CLL_cw154_withFilter.csv
total_read_CpGs_CLL_cw154_noFilter.csv
total_read_CpGs_CLL_RRBS_trito_pool_noFilter.csv
"""
Out[51]:
In [52]:
# Load the per-sample summary CSVs. Each sample is read as a (withFilter, noFilter) pair,
# in the same order as before; the trito pool only has a noFilter file.
normal_B_filtered, normal_B_unfiltered = map(pd.read_csv, (
    "total_read_CpGs_normal_B_cellA1H1_withFilter.csv",
    "total_read_CpGs_normal_B_cellA1H1_noFilter.csv",
))
CD27pcell_filtered, CD27pcell_unfiltered = map(pd.read_csv, (
    "total_read_CpGs_pCD27pcell_withFilter.csv",
    "total_read_CpGs_pCD27pcell_noFilter.csv",
))
CD27mcell_filtered, CD27mcell_unfiltered = map(pd.read_csv, (
    "total_read_CpGs_pCD27mcell_withFilter.csv",
    "total_read_CpGs_pCD27mcell_noFilter.csv",
))
NormalBCD19pcell_filtered, NormalBCD19pcell_unfiltered = map(pd.read_csv, (
    "total_read_CpGs_NormalBCD19pcell_withFilter.csv",
    "total_read_CpGs_NormalBCD19pcell_noFilter.csv",
))
trito_both = pd.read_csv("total_read_CpGs_CLL_RRBS_trito_pool_noFilter.csv")
cll_cw154_filtered, cll_cw154_unfiltered = map(pd.read_csv, (
    "total_read_CpGs_CLL_cw154_withFilter.csv",
    "total_read_CpGs_CLL_cw154_noFilter.csv",
))
In [53]:
# Remove the "Unnamed: 0" column from every loaded table
# (presumably the row index written out when the CSVs were saved -- confirm).
# errors="ignore" makes the cell safe to re-run: without it, a second execution
# raises KeyError because the column has already been dropped.
normal_B_filtered = normal_B_filtered.drop(["Unnamed: 0"], axis=1, errors="ignore")
normal_B_unfiltered = normal_B_unfiltered.drop(["Unnamed: 0"], axis=1, errors="ignore")
CD27pcell_filtered = CD27pcell_filtered.drop(["Unnamed: 0"], axis=1, errors="ignore")
CD27pcell_unfiltered = CD27pcell_unfiltered.drop(["Unnamed: 0"], axis=1, errors="ignore")
CD27mcell_filtered = CD27mcell_filtered.drop(["Unnamed: 0"], axis=1, errors="ignore")
CD27mcell_unfiltered = CD27mcell_unfiltered.drop(["Unnamed: 0"], axis=1, errors="ignore")
NormalBCD19pcell_filtered = NormalBCD19pcell_filtered.drop(["Unnamed: 0"], axis=1, errors="ignore")
NormalBCD19pcell_unfiltered = NormalBCD19pcell_unfiltered.drop(["Unnamed: 0"], axis=1, errors="ignore")
trito_both = trito_both.drop(["Unnamed: 0"], axis=1, errors="ignore")
cll_cw154_filtered = cll_cw154_filtered.drop(["Unnamed: 0"], axis=1, errors="ignore")
cll_cw154_unfiltered = cll_cw154_unfiltered.drop(["Unnamed: 0"], axis=1, errors="ignore")
In [54]:
# Sanity check: print the (rows, columns) shape of each loaded table, one per line.
print("\n".join(
    str(table.shape)
    for table in (
        normal_B_filtered,
        normal_B_unfiltered,
        CD27pcell_filtered,
        CD27pcell_unfiltered,
        CD27mcell_filtered,
        CD27mcell_unfiltered,
        NormalBCD19pcell_filtered,
        NormalBCD19pcell_unfiltered,
        trito_both,
        cll_cw154_filtered,
        cll_cw154_unfiltered,
    )
))
In [55]:
# Tables that make up the "filtered" data set.
# NOTE(review): trito_both is read from the *_noFilter.csv pool file (no withFilter
# variant is loaded) -- confirm it is meant to be part of the filtered set.
filtered_files = [
    normal_B_filtered,
    CD27pcell_filtered,
    CD27mcell_filtered,
    NormalBCD19pcell_filtered,
    trito_both,
    cll_cw154_filtered,
]
In [56]:
# Stack the filtered tables row-wise; each table keeps its own index labels.
filtered_all = pd.concat(filtered_files, axis=0)
In [57]:
# Tables that make up the "unfiltered" data set (trito_both has only a noFilter file).
unfiltered_files = [
    normal_B_unfiltered,
    CD27pcell_unfiltered,
    CD27mcell_unfiltered,
    NormalBCD19pcell_unfiltered,
    trito_both,
    cll_cw154_unfiltered,
]
In [58]:
# Stack the unfiltered tables row-wise; each table keeps its own index labels.
unfiltered_all = pd.concat(unfiltered_files, axis=0)
In [59]:
# Shapes of the combined tables: filtered first, then unfiltered.
print("\n".join(str(table.shape) for table in (filtered_all, unfiltered_all)))
In [60]:
# Keep only total_reads and renumber rows 0..n-1 so the plot x-axis is a plain row position.
unfiltered_all_pl = (
    unfiltered_all
    .loc[:, ['total_reads']]
    .reset_index(drop=True)
)
In [61]:
# Area plot of total_reads against row position for the unfiltered data.
unfiltered_all_pl.plot.area()
plt.title('Unfiltered, total reads, 516 *.anno files')
# still a few files with reads less than 10K
plt.xlabel('516 total files')
Out[61]:
In [62]:
# Keep only total_unique_CpGs and renumber rows 0..n-1 for plotting.
unfiltered_all2 = (
    unfiltered_all
    .loc[:, ['total_unique_CpGs']]
    .reset_index(drop=True)
)
In [63]:
# Area plot of total_unique_CpGs against row position for the unfiltered data.
unfiltered_all2.plot(kind='area', color='g')
# The title previously said 443 files, which is the count used for the filtered set;
# it now matches the x-axis label and the other unfiltered plots (516).
plt.title('Unfiltered, total unique CpG per cells, 516 *.anno files')
plt.xlabel('516 total files') # still a few files with reads less than 10K
Out[63]:
In [64]:
# Keep only total_reads from the filtered data and renumber rows 0..n-1 for plotting.
filtered_all_pl = (
    filtered_all
    .loc[:, ['total_reads']]
    .reset_index(drop=True)
)
In [65]:
# Area plot of total_reads against row position for the filtered data.
filtered_all_pl.plot(kind='area')
# The title previously said 516 files, which is the count used for the unfiltered set;
# it now matches the x-axis label and the other filtered plots (443).
plt.title('Filtered, total reads, 443 *.anno files')
plt.xlabel('443 total files') # still a few files with reads less than 10K
Out[65]:
In [66]:
# Keep only total_unique_CpGs from the filtered data and renumber rows 0..n-1 for plotting.
filtered_all_pl2 = (
    filtered_all
    .loc[:, ['total_unique_CpGs']]
    .reset_index(drop=True)
)
In [144]:
# Area plot of total_unique_CpGs against row position for the filtered data.
filtered_all_pl2.plot.area(color='g')
plt.title('Filtered, total unique CpG per cells, 443 *.anno files')
# still a few files with reads less than 10K
plt.xlabel('443 total files')
Out[144]:
In [ ]:
In [68]:
# Both metrics side by side for the unfiltered data, rows renumbered 0..n-1 for plotting.
unfiltered_all3 = (
    unfiltered_all
    .loc[:, ['total_reads', 'total_unique_CpGs']]
    .reset_index(drop=True)
)
In [69]:
# Stacked area plot of total_reads and total_unique_CpGs for the unfiltered data.
unfiltered_all3.plot(kind='area')
# Fixed the "Uniltered" typo, and the x-axis label that said 443 (the filtered count)
# although this plot shows the unfiltered data (516, as in the title).
plt.title('Unfiltered, total_reads & total unique CpG per cells, 516 *.anno files')
plt.xlabel('516 total files') # still a few files with reads less than 10K
Out[69]:
In [70]:
# Both metrics side by side for the filtered data, rows renumbered 0..n-1 for plotting.
filtered_all3 = (
    filtered_all
    .loc[:, ['total_reads', 'total_unique_CpGs']]
    .reset_index(drop=True)
)
In [71]:
# Stacked area plot of total_reads and total_unique_CpGs for the filtered data.
filtered_all3.plot.area()
plt.title('Filtered, total_reads & total unique CpG per cells, 443 *.anno files')
# still a few files with reads less than 10K
plt.xlabel('443 total files')
Out[71]:
In [72]:
# Read the allStats.txt table and discard its 'sample.1' and 'sample.2' columns in one pass.
df1 = (
    pd.read_table('allStats.txt')
    .drop('sample.1', axis=1)
    .drop('sample.2', axis=1)
)
In [73]:
# Preview the first rows of allStats.txt after removing 'sample.1' and 'sample.2'.
df1.head()
Out[73]:
In [74]:
# allStats.txt columns that are not used in the comparisons below.
# 'totCpG' is deliberately NOT in this list: it is kept and later renamed to
# 'allStats.txt totCpG' for the comparison plots.
unused_stats_columns = [
    'class', 'totMeth', 'totSeen', 'avSum', 'avTot',
    'rMixed', 'rTot', 'rAv', 'rAvTot',
    'bed', 'methInfoFile',
    'totReads', 'totAligned', 'totClipped', 'totUsed',
    'totMethCpG', 'totSeenCpG',
    'totalReadPairs', 'alignedReads', 'bsRate',
]
# One drop call instead of twenty chained reassignments (no intermediate copies).
# errors='ignore' keeps the cell re-runnable: a second execution used to raise
# KeyError because the columns were already gone. The trade-off is that a column
# missing from the input file is no longer reported.
df1 = df1.drop(unused_stats_columns, axis=1, errors='ignore')
In [75]:
# Preview df1 after the unused allStats.txt columns have been dropped.
df1.head()
Out[75]:
In [76]:
# (rows, columns) of the trimmed allStats table.
df1.shape
Out[76]:
In [77]:
# Rename the 'sample' column to 'filename', the key used for the merges below.
df1 = df1.rename(columns=dict(sample='filename'))
In [78]:
# Preview df1 with the 'sample' column renamed to 'filename'.
df1.head()
Out[78]:
In [79]:
# Join the unfiltered per-file counts with the allStats table on 'filename'.
# This is an inner join (the pandas default), so rows without a match on either side are dropped.
stats_evan_both_unfiltered = pd.merge(unfiltered_all, df1, how="inner", on="filename")
In [80]:
# (rows, columns) of the merged unfiltered table; the row count shows how many filenames matched.
stats_evan_both_unfiltered.shape
Out[80]:
In [81]:
# Plotting copy of the merged unfiltered table without the 'filename' key column.
stats_evan_both_unfiltered_plot = stats_evan_both_unfiltered.drop(['filename'], axis=1)
In [82]:
# Prefix the allStats.txt columns so plot legends tell them apart from this notebook's counts.
stats_evan_both_unfiltered_plot = stats_evan_both_unfiltered_plot.rename(
    columns={
        'totCpG': 'allStats.txt totCpG',
        'totalReads': 'allStats.txt total reads',
    }
)
In [83]:
# Show the column names after the rename.
stats_evan_both_unfiltered_plot.columns
Out[83]:
In [84]:
# Total reads from this notebook's tables next to the allStats.txt value, rows renumbered for plotting.
stats_evan_both_unfiltered_plot2 = (
    stats_evan_both_unfiltered_plot
    .loc[:, ['total_reads', 'allStats.txt total reads']]
    .reset_index(drop=True)
)
In [85]:
# Stacked area plot comparing the two total-read columns for the unfiltered data.
stats_evan_both_unfiltered_plot2.plot.area()
plt.title('Unfiltered, allStats.txt versus Broad URL, total reads, 516 *.anno files')
# still a few files with reads less than 10K
plt.xlabel('516 total files')
Out[85]:
In [86]:
# Unique-CpG count from this notebook's tables next to allStats.txt totCpG, rows renumbered for plotting.
stats_evan_both_unfiltered_plot3 = (
    stats_evan_both_unfiltered_plot
    .loc[:, ['total_unique_CpGs', 'allStats.txt totCpG']]
    .reset_index(drop=True)
)
In [87]:
# Stacked area plot comparing the two CpG-count columns for the unfiltered data.
stats_evan_both_unfiltered_plot3.plot.area()
plt.title('Unfiltered, allStats.txt versus Broad URL, total CpG, 516 *.anno files')
# still a few files with reads less than 10K
plt.xlabel('516 total files')
Out[87]:
In [88]:
# Join the filtered per-file counts with the allStats table on 'filename'.
# This is an inner join (the pandas default), so rows without a match on either side are dropped.
stats_evan_both_filtered = pd.merge(filtered_all, df1, how="inner", on="filename")
In [89]:
# (rows, columns) of the merged filtered table; the row count shows how many filenames matched.
stats_evan_both_filtered.shape
Out[89]:
In [90]:
# Plotting copy of the merged filtered table without the 'filename' key column.
stats_evan_both_filtered_plot = stats_evan_both_filtered.drop(['filename'], axis=1)
In [91]:
# Prefix the allStats.txt columns so plot legends tell them apart from this notebook's counts.
stats_evan_both_filtered_plot = stats_evan_both_filtered_plot.rename(
    columns={
        'totCpG': 'allStats.txt totCpG',
        'totalReads': 'allStats.txt total reads',
    }
)
In [92]:
# Show the column names after the rename.
stats_evan_both_filtered_plot.columns
Out[92]:
In [93]:
# Total reads from this notebook's tables next to the allStats.txt value, rows renumbered for plotting.
stats_evan_both_filtered_plot2 = (
    stats_evan_both_filtered_plot
    .loc[:, ['total_reads', 'allStats.txt total reads']]
    .reset_index(drop=True)
)
In [94]:
# Stacked area plot comparing the two total-read columns for the filtered data.
stats_evan_both_filtered_plot2.plot.area()
plt.title('Filtered, allStats.txt versus Broad URL, total reads, 443 *.anno files')
# still a few files with reads less than 10K
plt.xlabel('443 total files')
Out[94]:
In [ ]:
In [95]:
# Unique-CpG count from this notebook's tables next to allStats.txt totCpG, rows renumbered for plotting.
stats_evan_both_filtered_plot3 = (
    stats_evan_both_filtered_plot
    .loc[:, ['total_unique_CpGs', 'allStats.txt totCpG']]
    .reset_index(drop=True)
)
In [96]:
# Stacked area plot comparing the two CpG-count columns for the filtered data.
stats_evan_both_filtered_plot3.plot.area()
plt.title('Filtered, allStats.txt versus Broad URL, totCpG--total unique CpG per cells, 443 *.anno files')
# still a few files with reads less than 10K
plt.xlabel('443 total files')
Out[96]:
In [ ]:
In [ ]:
#
#
# The plots above show one value per *.anno file: the y-axis is the sum total number of unique CpGs, the x-axis is the file.
#
# What appears to be wanted instead is a histogram: the x-axis is the range of sum total CpG counts per file, the y-axis is the number of cells with a count in that range.
#
In [100]:
# Re-draw the per-file area plot of total_unique_CpGs for reference before the histograms.
unfiltered_all2.plot(kind='area', color='g')
# The data plotted is unfiltered_all2, so the labels now say "Unfiltered" and 516 files;
# they previously carried the filtered wording and count (443).
plt.title('Unfiltered, total unique CpG per cells, 516 *.anno files')
plt.xlabel('516 total files') # still a few files with reads less than 10K
Out[100]:
In [101]:
# Confirm the column name to pass to plt.hist below.
unfiltered_all2.columns
Out[101]:
In [108]:
# First quick look: 15-bin histogram of unique CpG counts in the unfiltered data.
plt.hist(
    unfiltered_all2['total_unique_CpGs'],
    bins=15,
)
Out[108]:
In [125]:
# Normalized 50-bin histogram of unique CpG counts in the unfiltered data.
num_bins = 50
x = unfiltered_all2['total_unique_CpGs']
# density=True replaces normed=1, which was deprecated in matplotlib 2.1 and removed in 3.1.
n, bins, patches = plt.hist(x, num_bins, density=True, facecolor='green', alpha=0.5)
# The labels and title were left over from the matplotlib histogram example
# ("Smarts", "Histogram of IQ"); they now describe the data actually plotted.
plt.xlabel('Sum total Unique CpGs, bins=50; total 516 files')
plt.ylabel('Probability density')
plt.title(r'Unfiltered sum total unique CpGs per *.anno file (normalized)')
Out[125]:
In [168]:
# Count histogram (75 bins) of unique CpG counts in the unfiltered data.
num_bins = 75
data = unfiltered_all2['total_unique_CpGs']
plt.hist(data, bins=num_bins, facecolor='green', alpha=0.75)
plt.xlabel('Sum total Unique CpGs, bins=75; total 516 files')
plt.ylabel('Number of cells')
plt.title(r'Unfiltered sum total unique CpGs per *.anno file')
Out[168]:
In [169]:
# First ten rows of the unfiltered total_reads frame.
unfiltered_all_pl[:10]
Out[169]:
In [170]:
# Count histogram (75 bins) of total reads in the unfiltered data.
num_bins = 75
data = unfiltered_all_pl['total_reads']
plt.hist(data, bins=num_bins, alpha=0.75)
plt.xlabel('total reads, bins=75; total 516 files')
plt.ylabel('Number of cells')
plt.title(r'Unfiltered total reads per *.anno file')
Out[170]:
In [171]:
# First ten rows of the filtered total_reads frame.
filtered_all_pl[:10]
Out[171]:
In [172]:
# Count histogram (75 bins) of total reads in the filtered data.
num_bins = 75
data = filtered_all_pl['total_reads']
plt.hist(data, bins=num_bins)
plt.xlabel('total reads, bins=75; total 443 files')
plt.ylabel('Number of cells')
plt.title(r'Filtered total reads per *.anno file')
Out[172]:
In [173]:
# Count histogram (75 bins) of unique CpG counts in the filtered data.
num_bins = 75
data = filtered_all_pl2['total_unique_CpGs']
plt.hist(data, bins=num_bins, facecolor='green')
plt.xlabel('Sum total Unique CpGs, bins=75; total 443 files')
plt.ylabel('Number of cells')
plt.title(r'Filtered Sum total Unique CpGs per *.anno file')
Out[173]:
In [ ]:
In [ ]:
In [ ]: