In [1]:
# Render matplotlib figures inline in the notebook's output cells.
%matplotlib inline
In [2]:
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import os
# Work inside the RRBS data directory so that the relative CSV path used when
# loading the stats table resolves. The location can be overridden with the
# RRBS_DATA_DIR environment variable (e.g. on another machine); when it is
# unset, the original hard-coded directory is used unchanged.
os.chdir(os.environ.get('RRBS_DATA_DIR', '/Users/evanbiederstedt/Downloads/RRBS_data_files'))
In [4]:
# Load the per-file statistics table; the path is relative to the current working directory.
df = pd.read_csv(filepath_or_buffer="all_file_stats.csv")
In [6]:
# (rows, columns) of the table as loaded from all_file_stats.csv.
df.shape
Out[6]:
In [8]:
# Preview the first rows of the stats table to check the available columns.
df.head() # all CpGs == total_cpg_no_filter
Out[8]:
In [33]:
#
# remove all "2cell" files
#
# index 498-----RRBS_trito_pool_2_CGTACTAG.CCCGGG
# index 502-----RRBS_trito_pool_2_CGTACTAG.CTCAGC
#
# The rows are removed by index label. Because `df` is rebound by this cell,
# running it a second time would find the labels already gone and a plain
# drop() would raise KeyError; errors="ignore" keeps the cell re-runnable.
# NOTE(review): the labels are only valid for the current row order of
# all_file_stats.csv -- confirm if the CSV is ever regenerated.
#
df = df.drop([498, 502], errors="ignore")
In [34]:
# Plain assignment: `unfiltered_all` is a second name for the same DataFrame
# object as `df` (no copy is made).
unfiltered_all = df
In [35]:
# (rows, columns) of the table used for the unfiltered plots.
unfiltered_all.shape
Out[35]:
In [ ]:
#
# Histogram layout: y-axis = number of cells (files) per bin; x-axis = range of total CpG counts per file
#
In [36]:
# Isolate the total CpG count as a one-column frame and renumber the rows
# 0..n-1, discarding the original row labels.
unfiltered_all2 = unfiltered_all[['total_cpg_no_filter']].reset_index(drop=True)
In [37]:
# Shape check: this frame holds only the total_cpg_no_filter column.
unfiltered_all2.shape
Out[37]:
In [73]:
# Area plot of the total CpG count for every row of the unfiltered table, in
# table order. The file count shown in the title and x-label is taken from the
# plotted frame so the labels cannot drift out of sync with the data.
unfiltered_all2.plot(kind='area', color='r')
plt.title('Unfiltered, total unique CpG per cells, {} *.anno files'.format(len(unfiltered_all2)))
plt.xlabel('{} total *.anno files'.format(len(unfiltered_all2)))
plt.ylabel('Total # of CpG per file')
Out[73]:
In [64]:
# Histogram (75 bins) of the total CpG count per file, before any filtering.
# The file count in the title is computed from the plotted frame rather than
# hard-coded, so it stays correct if the input table changes.
plt.hist(unfiltered_all2['total_cpg_no_filter'], bins=75)
plt.title("Histogram: total unique CpG per cells, {} *.anno files".format(len(unfiltered_all2)))
plt.ylabel("Number of files by bin")
plt.xlabel("Total number of unique CpG per file")
Out[64]:
In [65]:
# Keep only the rows whose total CpG count is strictly greater than 100,000.
filtered1 = unfiltered_all2.loc[unfiltered_all2["total_cpg_no_filter"].gt(100000)]
In [66]:
# Histogram (75 bins) of the total CpG count per file after the > 100K filter.
# The number of remaining files in the title is computed from `filtered1`
# instead of being hard-coded.
plt.hist(filtered1['total_cpg_no_filter'], bins=75, color='darkmagenta')
plt.title("Histogram: total unique CpG per cells, FILTERED by CpG > 100K, {} *.anno files".format(len(filtered1)))
plt.ylabel("Number of files by bin")
plt.xlabel("Total number of unique CpG per file")
Out[66]:
In [ ]:
In [67]:
# Keep only the rows whose total CpG count is strictly greater than 80,000.
filtered2 = unfiltered_all2.loc[unfiltered_all2["total_cpg_no_filter"].gt(80000)]
In [69]:
# Number of rows passing the > 80K filter, for comparison with the > 100K filter (filtered1).
len(filtered2) # a difference of four files
Out[69]:
In [74]:
# Histogram (75 bins) of the total CpG count per file after the > 80K filter.
# The number of remaining files in the title is computed from `filtered2`
# instead of being hard-coded.
plt.hist(filtered2['total_cpg_no_filter'], bins=75, color='firebrick')
plt.title("Histogram: total unique CpG per cells, FILTERED by CpG > 80K, {} *.anno files".format(len(filtered2)))
plt.ylabel("Number of files by bin")
plt.xlabel("Total number of unique CpG per file")
Out[74]:
In [91]:
#
# Show the files that pass the > 80K filter but not the > 100K filter
# (the four-file difference between the two filtered sets).
#
filtered438 = df.loc[df["total_cpg_no_filter"].gt(100000)]
filtered442 = df.loc[df["total_cpg_no_filter"].gt(80000)]
# Print both shapes so the row counts can be compared directly.
print(filtered438.shape)
print(filtered442.shape)
# Rows of the looser filter whose filename is absent from the stricter one.
filtered442.loc[~filtered442["filename"].isin(filtered438["filename"])]
Out[91]:
In [92]:
# Keep only the rows whose total CpG count is strictly greater than 75,000.
filtered3 = unfiltered_all2.loc[unfiltered_all2["total_cpg_no_filter"].gt(75000)]
In [93]:
# Number of rows passing the > 75K filter.
len(filtered3)
Out[93]:
In [94]:
# Histogram (75 bins) of the total CpG count per file after the > 75K filter.
# The number of remaining files in the title is computed from `filtered3`
# instead of being hard-coded.
plt.hist(filtered3['total_cpg_no_filter'], bins=75, color='darkcyan')
plt.title("Histogram: total unique CpG per cells, FILTERED by CpG > 75K, {} *.anno files".format(len(filtered3)))
plt.ylabel("Number of files by bin")
plt.xlabel("Total number of unique CpG per file")
Out[94]:
In [95]:
#
# List the 5 files difference between these two filters, >100K vs >75K
#
# Stricter filter: total CpG count strictly greater than 100,000.
filtered438 = df[df["total_cpg_no_filter"] > 100000]
# Looser filter: total CpG count strictly greater than 75,000.
filtered443 = df[df["total_cpg_no_filter"] > 75000]
print(filtered438.shape)
print(filtered443.shape)
# Rows of the looser filter whose filename does not appear in the stricter one.
filtered443[(~filtered443.filename.isin(filtered438.filename))]
Out[95]:
In [98]:
# Keep only the rows whose total CpG count is strictly greater than 50,000.
filtered4 = unfiltered_all2.loc[unfiltered_all2["total_cpg_no_filter"].gt(50000)]
In [99]:
# Number of rows passing the > 50K filter.
len(filtered4)
Out[99]:
In [100]:
# Histogram (75 bins) of the total CpG count per file after the > 50K filter.
# The number of remaining files in the title is computed from `filtered4`
# instead of being hard-coded.
plt.hist(filtered4['total_cpg_no_filter'], bins=75, color='darkkhaki')
plt.title("Histogram: total unique CpG per cells, FILTERED by CpG > 50K, {} *.anno files".format(len(filtered4)))
plt.ylabel("Number of files by bin")
plt.xlabel("Total number of unique CpG per file")
Out[100]:
In [103]:
#
# Show the files that pass the > 50K filter but not the > 100K filter
# (the 8-file difference between the two filtered sets).
#
filtered438 = df.loc[df["total_cpg_no_filter"].gt(100000)]
filtered446 = df.loc[df["total_cpg_no_filter"].gt(50000)]
# Print both shapes so the row counts can be compared directly.
print(filtered438.shape)
print(filtered446.shape)
# Rows of the looser filter whose filename is absent from the stricter one.
filtered446.loc[~filtered446["filename"].isin(filtered438["filename"])]
Out[103]:
In [ ]:
In [ ]: