notebook.community

Edit and run



In [1]:

    
import pandas as pd
import numpy as np
import statsmodels.stats.multitest as ssm
import scipy.stats as ss



In [2]:

    
# Load the gene-expression table from the CSV file in the working directory.
data = pd.read_csv('gene_high_throughput_sequencing.csv')
# Show (n_rows, n_columns) as the cell output.
data.shape









    Out[2]:





(72, 15750)



In [3]:

    
# Subset of rows whose Diagnosis label is exactly 'normal'.
data_normal = data[data['Diagnosis'] == 'normal']
data_normal.shape









    Out[3]:





(24, 15750)



In [4]:

    
# Subset of rows whose Diagnosis label is exactly 'early neoplasia'.
data_early = data[data['Diagnosis'] == 'early neoplasia']
data_early.shape









    Out[4]:





(25, 15750)



In [5]:

    
# Subset of rows whose Diagnosis label is exactly 'cancer'.
data_cancer = data[data['Diagnosis'] == 'cancer']
data_cancer.shape









    Out[5]:





(23, 15750)



In [6]:

    
# Preview the first rows of the full table (identifier/label columns followed by per-gene columns).
data.head()









    Out[6]:






  
    
      
      Patient_id
      Diagnosis
      LOC643837
      LOC100130417
      SAMD11
      NOC2L
      KLHL17
      PLEKHN1
      C1orf170
      HES4
      ...
      CLIC2
      RPS4Y1
      ZFY
      PRKY
      USP9Y
      DDX3Y
      CD24
      CYorf15B
      KDM5D
      EIF1AY
    
  
  
    
      0
      STT5425_Breast_001_normal
      normal
      1.257614
      2.408148
      13.368622
      9.494779
      20.880435
      12.722017
      9.494779
      54.349694
      ...
      4.761250
      1.257614
      1.257614
      1.257614
      1.257614
      1.257614
      23.268694
      1.257614
      1.257614
      1.257614
    
    
      1
      STT5427_Breast_023_normal
      normal
      4.567931
      16.602734
      42.477752
      25.562376
      23.221137
      11.622386
      14.330573
      72.445474
      ...
      6.871902
      1.815112
      1.815112
      1.815112
      1.815112
      1.815112
      10.427023
      1.815112
      1.815112
      1.815112
    
    
      2
      STT5430_Breast_002_normal
      normal
      2.077597
      3.978294
      12.863214
      13.728915
      14.543176
      14.141907
      6.232790
      57.011005
      ...
      7.096343
      2.077597
      2.077597
      2.077597
      2.077597
      2.077597
      22.344226
      2.077597
      2.077597
      2.077597
    
    
      3
      STT5439_Breast_003_normal
      normal
      2.066576
      8.520713
      14.466035
      7.823932
      8.520713
      2.066576
      10.870009
      53.292034
      ...
      5.200770
      2.066576
      2.066576
      2.066576
      2.066576
      2.066576
      49.295538
      2.066576
      2.066576
      2.066576
    
    
      4
      STT5441_Breast_004_normal
      normal
      2.613616
      3.434965
      12.682222
      10.543189
      26.688686
      12.484822
      1.364917
      67.140393
      ...
      11.227770
      1.364917
      1.364917
      1.364917
      1.364917
      1.364917
      23.627911
      1.364917
      1.364917
      1.364917
    
  

5 rows × 15750 columns



In [14]:

    
# Welch's t-test (unequal variances) per gene column, normal vs. early neoplasia.
# Columns 0-1 are skipped: the preview shows them as Patient_id and Diagnosis.
p_values_1 = [
    ss.ttest_ind(data_normal[gene], data_early[gene], equal_var=False).pvalue
    for gene in data.columns[2:]
]
# Number of genes with an uncorrected p-value below 0.05.
counter = sum(int(p < 0.05) for p in p_values_1)
counter









    Out[14]:





1575



In [16]:

    
# Welch's t-test (unequal variances) per gene column, cancer vs. early neoplasia.
# Columns 0-1 are skipped: the preview shows them as Patient_id and Diagnosis.
p_values_2 = [
    ss.ttest_ind(data_cancer[gene], data_early[gene], equal_var=False).pvalue
    for gene in data.columns[2:]
]
# Number of genes with an uncorrected p-value below 0.05.
counter = sum(int(p < 0.05) for p in p_values_2)
counter









    Out[16]:





3490



In [9]:

    
def fold_change(c, t):
    """Return the signed fold change between the means of two samples.

    Parameters
    ----------
    c : array-like
        Values of the first ("control") sample.
    t : array-like
        Values of the second ("treatment") sample.

    Returns
    -------
    float
        ``mean(t) / mean(c)`` when ``mean(t)`` is strictly greater than
        ``mean(c)``; otherwise ``-mean(c) / mean(t)``. Equal means therefore
        yield ``-1.0``.
    """
    control_mean = np.asarray(c).mean()
    treatment_mean = np.asarray(t).mean()
    # Positive ratio for an increase, negated inverse ratio otherwise.
    return (
        treatment_mean / control_mean
        if treatment_mean > control_mean
        else -(control_mean / treatment_mean)
    )



In [17]:

    
# Hommel multiple-testing correction of the normal-vs-early p-values.
# NOTE(review): alpha=0.025 is presumably 0.05 split across the two group comparisons — confirm.
reject, p_corrected, _, _ = ssm.multipletests(
    p_values_1,
    alpha=0.025,
    method='hommel',
)



In [24]:

    
# Count genes that are both flagged in `reject` and have |fold change| > 1.5
# between the normal and early-neoplasia groups. `reject` is positionally
# aligned with data.columns[2:]; fold_change is evaluated only for flagged genes.
counter = sum(
    1
    for position, gene in enumerate(data.columns[2:])
    if reject[position]
    and abs(fold_change(data_normal[gene], data_early[gene])) > 1.5
)
counter









    Out[24]:





2



In [29]:

    
# Hommel multiple-testing correction of the cancer-vs-early p-values.
# This rebinds `reject` / `p_corrected`, replacing any earlier correction results.
# NOTE(review): alpha=0.025 is presumably 0.05 split across the two group comparisons — confirm.
reject, p_corrected, _, _ = ssm.multipletests(
    p_values_2,
    alpha=0.025,
    method='hommel',
)



In [30]:

    
# Count genes that are both flagged in `reject` and have |fold change| > 1.5
# between the early-neoplasia and cancer groups. `reject` is positionally
# aligned with data.columns[2:]; fold_change is evaluated only for flagged genes.
counter = sum(
    1
    for position, gene in enumerate(data.columns[2:])
    if reject[position]
    and abs(fold_change(data_early[gene], data_cancer[gene])) > 1.5
)
counter









    Out[30]:





77



In [31]:

    
# Benjamini-Hochberg (FDR) correction of the normal-vs-early p-values.
# NOTE(review): alpha=0.025 is presumably 0.05 split across the two group comparisons — confirm.
reject, p_corrected, _, _ = ssm.multipletests(
    p_values_1,
    alpha=0.025,
    method='fdr_bh',
)
# Count genes that survive the correction and have |fold change| > 1.5;
# fold_change is evaluated only for genes flagged in `reject`.
counter = sum(
    1
    for position, gene in enumerate(data.columns[2:])
    if reject[position]
    and abs(fold_change(data_normal[gene], data_early[gene])) > 1.5
)
counter









    Out[31]:





4



In [32]:

    
# Benjamini-Hochberg (FDR) correction of the cancer-vs-early p-values.
# NOTE(review): alpha=0.025 is presumably 0.05 split across the two group comparisons — confirm.
reject, p_corrected, _, _ = ssm.multipletests(
    p_values_2,
    alpha=0.025,
    method='fdr_bh',
)
# Count genes that survive the correction and have |fold change| > 1.5;
# fold_change is evaluated only for genes flagged in `reject`.
counter = sum(
    1
    for position, gene in enumerate(data.columns[2:])
    if reject[position]
    and abs(fold_change(data_early[gene], data_cancer[gene])) > 1.5
)
counter









    Out[32]:





524



In [ ]:

	Patient_id	Diagnosis	LOC643837	LOC100130417	SAMD11	NOC2L	KLHL17	PLEKHN1	C1orf170	HES4	...	CLIC2	RPS4Y1	ZFY	PRKY	USP9Y	DDX3Y	CD24	CYorf15B	KDM5D	EIF1AY
0	STT5425_Breast_001_normal	normal	1.257614	2.408148	13.368622	9.494779	20.880435	12.722017	9.494779	54.349694	...	4.761250	1.257614	1.257614	1.257614	1.257614	1.257614	23.268694	1.257614	1.257614	1.257614
1	STT5427_Breast_023_normal	normal	4.567931	16.602734	42.477752	25.562376	23.221137	11.622386	14.330573	72.445474	...	6.871902	1.815112	1.815112	1.815112	1.815112	1.815112	10.427023	1.815112	1.815112	1.815112
2	STT5430_Breast_002_normal	normal	2.077597	3.978294	12.863214	13.728915	14.543176	14.141907	6.232790	57.011005	...	7.096343	2.077597	2.077597	2.077597	2.077597	2.077597	22.344226	2.077597	2.077597	2.077597
3	STT5439_Breast_003_normal	normal	2.066576	8.520713	14.466035	7.823932	8.520713	2.066576	10.870009	53.292034	...	5.200770	2.066576	2.066576	2.066576	2.066576	2.066576	49.295538	2.066576	2.066576	2.066576
4	STT5441_Breast_004_normal	normal	2.613616	3.434965	12.682222	10.543189	26.688686	12.484822	1.364917	67.140393	...	11.227770	1.364917	1.364917	1.364917	1.364917	1.364917	23.627911	1.364917	1.364917	1.364917