In [1]:
import pandas as pd
import numpy as np
import statsmodels.stats.multitest as ssm
import scipy.stats as ss
In [2]:
# Load the sequencing table once; the later cells all slice this frame.
DATA_PATH = 'gene_high_throughput_sequencing.csv'
data = pd.read_csv(DATA_PATH)
data.shape
Out[2]:
In [3]:
# Keep only the rows whose Diagnosis label is exactly 'normal'.
is_normal = data['Diagnosis'] == 'normal'
data_normal = data.loc[is_normal]
data_normal.shape
Out[3]:
In [4]:
# Keep only the rows whose Diagnosis label is exactly 'early neoplasia'.
is_early = data['Diagnosis'] == 'early neoplasia'
data_early = data.loc[is_early]
data_early.shape
Out[4]:
In [5]:
# Keep only the rows whose Diagnosis label is exactly 'cancer'.
is_cancer = data['Diagnosis'] == 'cancer'
data_cancer = data.loc[is_cancer]
data_cancer.shape
Out[5]:
In [6]:
# Preview the first five rows to see the column layout.
data.head(n=5)
Out[6]:
In [14]:
# Welch's t-test (equal_var=False) for every measurement column: normal vs. early neoplasia.
# NOTE(review): columns from position 2 onward are treated as measurements —
# presumably the first two columns are metadata; confirm against the CSV.
p_values_1 = [
    ss.ttest_ind(data_normal[name], data_early[name], equal_var=False).pvalue
    for name in data.columns[2:]
]

# Number of columns whose uncorrected p-value is below 0.05.
counter = sum(int(p_value < 0.05) for p_value in p_values_1)
counter
Out[14]:
In [16]:
# Welch's t-test (equal_var=False) for every measurement column: cancer vs. early neoplasia.
# NOTE(review): columns from position 2 onward are treated as measurements —
# presumably the first two columns are metadata; confirm against the CSV.
p_values_2 = [
    ss.ttest_ind(data_cancer[name], data_early[name], equal_var=False).pvalue
    for name in data.columns[2:]
]

# Number of columns whose uncorrected p-value is below 0.05.
counter = sum(int(p_value < 0.05) for p_value in p_values_2)
counter
Out[16]:
In [9]:
def fold_change(c, t):
    """Return the signed fold change between the means of two samples.

    Parameters
    ----------
    c, t : array-like
        Samples to compare. Each one is converted with ``np.array`` and
        reduced to its arithmetic mean.

    Returns
    -------
    number
        ``mean(t) / mean(c)`` when ``mean(t)`` is strictly greater than
        ``mean(c)``; otherwise ``-mean(c) / mean(t)``. Equal non-zero means
        therefore give ``-1.0``.
    """
    mean_c = np.array(c).mean()
    mean_t = np.array(t).mean()
    return mean_t / mean_c if mean_t > mean_c else -mean_c / mean_t
In [17]:
# Multiple-testing correction of the normal-vs-early p-values.
# NOTE(review): 'hommel' selects Hommel's procedure, which is not the same as
# Holm's ('holm') — confirm which one was intended.
# NOTE(review): alpha=0.025 is presumably 0.05 split across the two group
# comparisons — confirm.
# Only the rejection mask and corrected p-values are kept; slicing (rather than
# unpacking into `_`) leaves IPython's last-output variable `_` intact.
reject, p_corrected = ssm.multipletests(p_values_1, method='hommel', alpha=0.025)[:2]
In [24]:
# Count columns that are rejected by the current `reject` mask AND whose
# normal -> early fold change exceeds 1.5 in absolute value.
# `reject` must be the mask computed from p_values_1 (same column order).
counter = sum(
    1
    for idx, name in enumerate(data.columns[2:])
    if reject[idx] and abs(fold_change(data_normal[name], data_early[name])) > 1.5
)
counter
Out[24]:
In [29]:
# Multiple-testing correction of the early-vs-cancer p-values.
# NOTE(review): 'hommel' selects Hommel's procedure, which is not the same as
# Holm's ('holm') — confirm which one was intended.
# NOTE(review): alpha=0.025 is presumably 0.05 split across the two group
# comparisons — confirm.
# Only the rejection mask and corrected p-values are kept; slicing (rather than
# unpacking into `_`) leaves IPython's last-output variable `_` intact.
reject, p_corrected = ssm.multipletests(p_values_2, method='hommel', alpha=0.025)[:2]
In [30]:
# Count columns that are rejected by the current `reject` mask AND whose
# early -> cancer fold change exceeds 1.5 in absolute value.
# `reject` must be the mask computed from p_values_2 (same column order).
counter = sum(
    1
    for idx, name in enumerate(data.columns[2:])
    if reject[idx] and abs(fold_change(data_early[name], data_cancer[name])) > 1.5
)
counter
Out[30]:
In [31]:
# Benjamini-Hochberg ('fdr_bh') correction of the normal-vs-early p-values.
# NOTE(review): alpha=0.025 is presumably 0.05 split across the two group
# comparisons — confirm.
# Slicing keeps only the two results that are used and avoids rebinding `_`,
# which IPython reserves for the last cell output.
reject, p_corrected = ssm.multipletests(p_values_1, method='fdr_bh', alpha=0.025)[:2]

# Count columns that are rejected after correction AND whose normal -> early
# fold change exceeds 1.5 in absolute value.
counter = sum(
    1
    for idx, name in enumerate(data.columns[2:])
    if reject[idx] and abs(fold_change(data_normal[name], data_early[name])) > 1.5
)
counter
Out[31]:
In [32]:
# Benjamini-Hochberg ('fdr_bh') correction of the early-vs-cancer p-values.
# NOTE(review): alpha=0.025 is presumably 0.05 split across the two group
# comparisons — confirm.
# Slicing keeps only the two results that are used and avoids rebinding `_`,
# which IPython reserves for the last cell output.
reject, p_corrected = ssm.multipletests(p_values_2, method='fdr_bh', alpha=0.025)[:2]

# Count columns that are rejected after correction AND whose early -> cancer
# fold change exceeds 1.5 in absolute value.
counter = sum(
    1
    for idx, name in enumerate(data.columns[2:])
    if reject[idx] and abs(fold_change(data_early[name], data_cancer[name])) > 1.5
)
counter
Out[32]:
In [ ]: