In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
import seaborn
seaborn.set()
In [2]:
data = pd.read_csv("5G.rpkm.La.csv")
In [3]:
# Replace missing expression values with 0 so the log transform below never sees NaN.
# (Assignment form instead of inplace=True: no hidden mutation, re-run safe.)
data = data.fillna(0)
# Sanity check that the fill worked: count of remaining NaNs in this column (expect 0).
len(data[pd.isnull(data["5GB1_vial_wLa_TR3"])])
Out[3]:
In [4]:
# Create columns with log2 of the expression level.
# np.log2 is a ufunc, so calling it on the Series directly is vectorized —
# same values as .apply(np.log2) (zeros still become -inf; cleaned up below).
data["Log2_5GB1_vial_wLa_TR3"] = np.log2(data["5GB1_vial_wLa_TR3"])
data["Log2_5GB1_vial_woLa_TR2"] = np.log2(data["5GB1_vial_woLa_TR2"])
In [5]:
# Replace the -inf values (log2 of zero expression) with zero.
# Note: the previous form, data["col"].replace(..., inplace=True), mutates a column
# view; it raises FutureWarning and becomes a silent no-op under pandas copy-on-write.
# Explicit assignment is the supported way to write the result back.
data["Log2_5GB1_vial_wLa_TR3"] = data["Log2_5GB1_vial_wLa_TR3"].replace(float('-inf'), 0)
data["Log2_5GB1_vial_woLa_TR2"] = data["Log2_5GB1_vial_woLa_TR2"].replace(float('-inf'), 0)
In [6]:
# MA-plot quantities from the two log2 columns:
#   M = log2(control) - log2(treatment): positive = higher expression without La,
#       negative = higher expression with La.
#   A = mean of the two log2 expression levels.
log_ctrl = data["Log2_5GB1_vial_woLa_TR2"]
log_trt = data["Log2_5GB1_vial_wLa_TR3"]
data["M_Log_Diff"] = log_ctrl - log_trt
data["A_Log_Ave"] = (log_ctrl + log_trt) / 2
In [7]:
#Replace the zero values with a NaN (float value)
#data["M_Log_Diff"].replace(0, np.nan, inplace = True)
In [8]:
%matplotlib inline
plt.style.use('ggplot')
axes = plt.gca()
axes.set_ylim([0,100])
plt.hist(data["M_Log_Diff"],bins = 50, range = [-5,13]) #[np.nanmin(data["Log_Diff"]), np.nanmax(data["Log_Diff"])])
Out[8]:
In [9]:
#MA plot: log-ratio (M) against mean log expression (A)
ax = plt.gca()
ax.scatter(data["A_Log_Ave"], data["M_Log_Diff"])
ax.set_xlim([0, 20])
ax.set_ylim([-5, 15])
ax.set_ylabel('M')
ax.set_xlabel('A')
Out[9]:
In [10]:
# Density-coloured MA plot: colour each point by its 2-D kernel density estimate.
xy = np.array([data["A_Log_Ave"], data["M_Log_Diff"]])  # 2xN array of (A, M) points
density = gaussian_kde(xy)  # probability-density function fitted to the 2-D cloud
fig, ax = plt.subplots()
# density(xy) evaluates the KDE at every point, giving the per-point colour value
ax.scatter(data["A_Log_Ave"], data["M_Log_Diff"], c=density(xy), s=20, cmap=plt.cm.coolwarm)
ax.set_xlim([0, 20])
ax.set_ylim([-5, 15])
ax.set_ylabel('M')
ax.set_xlabel('A')
ax.set_title("log2(-La) - log2(+La)")
fig.savefig('MA_plot_plut_minus_La.png', bbox_inches='tight')
In [11]:
#Slice the A values into bins in order to pick out outliers within every bin
A_bins = [-1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 18]
# Label each bin "lo_hi"; the first bin spans (-1, 1] but is labelled "0_1"
A_group_names = [f"{max(lo, 0)}_{hi}" for lo, hi in zip(A_bins[:-1], A_bins[1:])]
categories = pd.cut(data["A_Log_Ave"], A_bins, labels=A_group_names)
categories
Out[11]:
In [12]:
# Attach the bin labels as a column. Reuse `categories` from the previous cell
# instead of recomputing the identical pd.cut (same inputs, same result).
data["A_Categories"] = categories
data.head()
Out[12]:
In [13]:
# Per-bin summary statistics of the M log-ratio.
grouped = data.groupby("A_Categories")["M_Log_Diff"].describe()
# NOTE(review): on modern pandas, groupby().describe() already returns a DataFrame,
# so .unstack() pivots the stats into a Series — confirm this is the intended
# display shape for the pandas version in use.
grouped.unstack()
Out[13]:
In [14]:
def get_outliers(x, n_std=3):
    """Return the elements of ``x`` lying more than ``n_std`` standard deviations
    from the mean.

    Parameters
    ----------
    x : pd.Series
        Numeric values (here: the M log-ratios of one A bin).
    n_std : float, default 3
        Cut-off in sample standard deviations (pandas ``std`` uses ddof=1).
        Default preserves the original hard-coded 3-sigma rule.

    Returns
    -------
    pd.Series
        The outlying elements with their original index (empty if none).
    """
    mu = x.mean()
    std = x.std()
    return x[(x < mu - n_std * std) | (x > mu + n_std * std)]
In [15]:
result = data.groupby("A_Categories")["M_Log_Diff"].apply(get_outliers)
In [16]:
result
Out[16]:
In [17]:
my_index = result.index.droplevel(0)
In [18]:
final = data.iloc[my_index]
In [19]:
final
Out[19]:
In [21]:
final["A_Categories"]
Out[21]:
In [22]:
# Exclude the lowest-expression bin ("0_1") from the final outlier list
# (presumably the low-A log-ratios are too noisy to trust — confirm rationale).
final_list = final.loc[final["A_Categories"] != "0_1"]
final_list
Out[22]:
In [23]:
final_list.groupby("A_Categories")["M_Log_Diff"].describe().unstack()
Out[23]:
In [24]:
final_list.count()
Out[24]:
In [ ]: