C4W4A1



In [1]:
import pandas as pd
import numpy as np
import statsmodels.stats.multitest as ssm
import scipy.stats as ss

In [2]:
# Read the sequencing CSV into a DataFrame; the trailing expression
# displays its (rows, columns) shape.
data = pd.read_csv(filepath_or_buffer='gene_high_throughput_sequencing.csv')
data.shape


Out[2]:
(72, 15750)

In [3]:
# Keep only the rows whose Diagnosis column equals 'normal'.
data_normal = data[data['Diagnosis'] == 'normal']
data_normal.shape


Out[3]:
(24, 15750)

In [4]:
# Keep only the rows whose Diagnosis column equals 'early neoplasia'.
data_early = data[data['Diagnosis'] == 'early neoplasia']
data_early.shape


Out[4]:
(25, 15750)

In [5]:
# Keep only the rows whose Diagnosis column equals 'cancer'.
data_cancer = data[data['Diagnosis'] == 'cancer']
data_cancer.shape


Out[5]:
(23, 15750)

In [6]:
# Preview the first five rows (five is also head()'s default).
data.head(n=5)


Out[6]:
Patient_id Diagnosis LOC643837 LOC100130417 SAMD11 NOC2L KLHL17 PLEKHN1 C1orf170 HES4 ... CLIC2 RPS4Y1 ZFY PRKY USP9Y DDX3Y CD24 CYorf15B KDM5D EIF1AY
0 STT5425_Breast_001_normal normal 1.257614 2.408148 13.368622 9.494779 20.880435 12.722017 9.494779 54.349694 ... 4.761250 1.257614 1.257614 1.257614 1.257614 1.257614 23.268694 1.257614 1.257614 1.257614
1 STT5427_Breast_023_normal normal 4.567931 16.602734 42.477752 25.562376 23.221137 11.622386 14.330573 72.445474 ... 6.871902 1.815112 1.815112 1.815112 1.815112 1.815112 10.427023 1.815112 1.815112 1.815112
2 STT5430_Breast_002_normal normal 2.077597 3.978294 12.863214 13.728915 14.543176 14.141907 6.232790 57.011005 ... 7.096343 2.077597 2.077597 2.077597 2.077597 2.077597 22.344226 2.077597 2.077597 2.077597
3 STT5439_Breast_003_normal normal 2.066576 8.520713 14.466035 7.823932 8.520713 2.066576 10.870009 53.292034 ... 5.200770 2.066576 2.066576 2.066576 2.066576 2.066576 49.295538 2.066576 2.066576 2.066576
4 STT5441_Breast_004_normal normal 2.613616 3.434965 12.682222 10.543189 26.688686 12.484822 1.364917 67.140393 ... 11.227770 1.364917 1.364917 1.364917 1.364917 1.364917 23.627911 1.364917 1.364917 1.364917

5 rows × 15750 columns


In [14]:
# Welch's t-test (equal_var=False) of the 'normal' group against the
# 'early neoplasia' group, one test per column from the third column onward
# (the first two columns are skipped, as in the per-column loop this replaces).
#
# The test is run once over the 2-D block of columns (axis=0 -> one p-value
# per column) instead of calling ttest_ind separately for every column in a
# Python loop; the p-values are the same up to floating-point rounding.
gene_columns = data.columns[2:]
welch_p_normal_early = ss.ttest_ind(
    data_normal[gene_columns].to_numpy(),
    data_early[gene_columns].to_numpy(),
    axis=0,
    equal_var=False,
)[1]

# Keep the list-of-p-values shape that later cells pass to multipletests.
p_values_1 = list(welch_p_normal_early)

# Number of columns with an uncorrected p-value below 0.05
# (NaN p-values compare False and are therefore not counted).
counter = int((welch_p_normal_early < 0.05).sum())
counter


Out[14]:
1575

In [16]:
# Welch's t-test (equal_var=False) of the 'cancer' group against the
# 'early neoplasia' group, one test per column from the third column onward
# (the first two columns are skipped, as in the per-column loop this replaces).
#
# The test is run once over the 2-D block of columns (axis=0 -> one p-value
# per column) instead of calling ttest_ind separately for every column in a
# Python loop; the p-values are the same up to floating-point rounding.
gene_columns = data.columns[2:]
welch_p_cancer_early = ss.ttest_ind(
    data_cancer[gene_columns].to_numpy(),
    data_early[gene_columns].to_numpy(),
    axis=0,
    equal_var=False,
)[1]

# Keep the list-of-p-values shape that later cells pass to multipletests.
p_values_2 = list(welch_p_cancer_early)

# Number of columns with an uncorrected p-value below 0.05
# (NaN p-values compare False and are therefore not counted).
counter = int((welch_p_cancer_early < 0.05).sum())
counter


Out[16]:
3490

In [9]:
def fold_change(c, t):
    """Return the signed ratio between the means of ``c`` and ``t``.

    Both arguments are converted to NumPy arrays and averaged.

    Returns:
        ``mean(t) / mean(c)`` when ``mean(t)`` is strictly greater than
        ``mean(c)``; otherwise ``-(mean(c) / mean(t))``. Equal means
        therefore give ``-1``.
    """
    control_mean = np.asarray(c).mean()
    treatment_mean = np.asarray(t).mean()
    return (
        treatment_mean / control_mean
        if treatment_mean > control_mean
        else -(control_mean / treatment_mean)
    )

In [17]:
# Hommel multiple-testing correction of the first set of p-values.
# Only the reject mask and the corrected p-values are used afterwards.
# NOTE(review): alpha=0.025 is presumably 0.05 split over the two group
# comparisons -- confirm.
reject, p_corrected, _, _ = ssm.multipletests(
    p_values_1,
    alpha=0.025,
    method='hommel',
)

In [24]:
# Count the columns (third onward) that are flagged in `reject` AND whose
# |fold change| between the normal and early-neoplasia groups exceeds 1.5.
# `reject` is whatever the most recent multipletests cell produced.
counter = 0
for idx, name in enumerate(data.columns[2:]):
    if not reject[idx]:
        continue
    if abs(fold_change(data_normal[name], data_early[name])) > 1.5:
        counter += 1
counter


Out[24]:
2

In [29]:
# Hommel multiple-testing correction of the second set of p-values.
# This overwrites `reject` / `p_corrected` from the earlier correction.
# NOTE(review): alpha=0.025 is presumably 0.05 split over the two group
# comparisons -- confirm.
reject, p_corrected, _, _ = ssm.multipletests(
    p_values_2,
    alpha=0.025,
    method='hommel',
)

In [30]:
# Count the columns (third onward) that are flagged in `reject` AND whose
# |fold change| between the early-neoplasia and cancer groups exceeds 1.5.
# `reject` is whatever the most recent multipletests cell produced.
counter = 0
for idx, name in enumerate(data.columns[2:]):
    if not reject[idx]:
        continue
    if abs(fold_change(data_early[name], data_cancer[name])) > 1.5:
        counter += 1
counter


Out[30]:
77

In [31]:
# Benjamini-Hochberg (method='fdr_bh') correction of the first set of
# p-values, then count the columns (third onward) that are rejected AND
# whose |fold change| between the normal and early-neoplasia groups
# exceeds 1.5.
reject, p_corrected, _, _ = ssm.multipletests(
    p_values_1,
    alpha=0.025,
    method='fdr_bh',
)
counter = 0
for idx, name in enumerate(data.columns[2:]):
    if not reject[idx]:
        continue
    if abs(fold_change(data_normal[name], data_early[name])) > 1.5:
        counter += 1
counter


Out[31]:
4

In [32]:
# Benjamini-Hochberg (method='fdr_bh') correction of the second set of
# p-values, then count the columns (third onward) that are rejected AND
# whose |fold change| between the early-neoplasia and cancer groups
# exceeds 1.5.
reject, p_corrected, _, _ = ssm.multipletests(
    p_values_2,
    alpha=0.025,
    method='fdr_bh',
)
counter = 0
for idx, name in enumerate(data.columns[2:]):
    if not reject[idx]:
        continue
    if abs(fold_change(data_early[name], data_cancer[name])) > 1.5:
        counter += 1
counter


Out[32]:
524

In [ ]: