Shuffle and Split 2D6



In [1]:
import numpy as np
import pandas as pd

Load the dataframe and count the number of positive and negative cases


In [2]:
isozyme2d6 = pd.read_csv('data/2d6.csv')

In [3]:
# Renaming the Activity Score column to conform to Python syntax
isozyme2d6.rename(columns={'p450-cyp2d6-ActivityScore': 'ActivityScore'}, inplace=True)

In [4]:
# Number of substances with an activity score greater than or equal to 40
n_pos = (isozyme2d6.ActivityScore >= 40).sum()
n_pos


Out[4]:
2771

In [5]:
# Number of substances with an activity score below 40
n_neg = (isozyme2d6.ActivityScore < 40).sum()
n_neg


Out[5]:
14372

Downsampling the negative cases

This section shuffles the substances with an Activity Score below 40 (negatives), counts the substances with an Activity Score of 40 or above (positives), and uses that count as a cutoff on the shuffled list of negatives. When complete, the dataset contains equal numbers of positives and negatives.
The seed value is set for the randomizer to ensure reproducibility. Different seeds will result in different negatives being included in the analysis set. All of the positives are always included in the resulting dataset.

In [6]:
# method adapted from DataRobot post about scikit-learn classification

# Downsample negative cases -- there are many more negatives than positives

indices = np.where(isozyme2d6.ActivityScore < 40)[0]
rng = np.random.RandomState(50)  # sets seed for random number generator
rng.shuffle(indices)             # different seed numbers result in different shuffle
n_pos = (isozyme2d6.ActivityScore >= 40).sum()
balanced = isozyme2d6.drop(isozyme2d6.index[indices[n_pos:]])

balanced.head(10)


Out[6]:
SID ActivityScore apol a_acc a_acid a_aro a_base a_count a_don a_heavy ... vsa_acid vsa_base vsa_don vsa_hyd vsa_other vsa_pol Weight weinerPath weinerPol zagreb
5 842618 44 70.986168 5 0 17 2 64 0 31 ... 0 0 0.000000 380.40643 11.190562 43.926376 423.56500 2659 49 164
9 842953 44 62.660240 3 0 12 1 52 1 29 ... 0 0 0.000000 340.08496 24.140093 33.813168 434.34698 2338 47 154
14 843293 0 60.107033 3 0 0 0 51 1 27 ... 0 0 9.421040 288.14563 45.765236 45.975922 384.50400 1955 44 138
18 843526 41 45.001102 3 0 11 0 36 2 22 ... 0 0 11.365152 216.60484 21.408051 41.002750 314.36499 1107 34 114
21 843662 40 65.452202 3 0 15 1 57 0 29 ... 0 0 0.000000 315.71521 12.949531 36.147465 390.51099 2477 48 162
29 844085 41 43.942272 2 0 0 0 37 2 19 ... 0 0 23.425066 159.08879 51.789574 58.910046 274.39200 616 35 96
31 844156 0 62.506241 3 0 13 0 53 0 30 ... 0 0 0.000000 287.13580 76.132065 40.700764 429.49698 2651 46 160
42 844873 41 66.039825 6 0 11 0 57 3 32 ... 0 0 41.167557 238.78426 62.202019 94.951035 459.52701 2811 52 170
44 844963 66 47.919479 3 0 17 0 40 1 23 ... 0 0 5.682576 229.00011 43.990028 16.372663 307.35300 1305 33 124
48 845384 42 66.798203 4 0 12 1 58 1 30 ... 0 0 5.682576 319.00067 23.055140 53.769775 430.54898 2998 45 158

10 rows × 188 columns
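For reference, the same shuffle-and-cut downsampling can be sketched more compactly with pandas' `sample` method. This is a sketch on a hypothetical toy frame, not the notebook's data; only the `ActivityScore` column name is taken from the notebook:

```python
import pandas as pd

# Hypothetical toy frame standing in for the 2D6 data
df = pd.DataFrame({"ActivityScore": [50, 45, 60, 10, 0, 20, 5, 30, 15, 2]})

pos = df[df.ActivityScore >= 40]                    # keep every positive
neg = df[df.ActivityScore < 40].sample(             # downsample negatives to match
    n=len(pos), random_state=50)
balanced = pd.concat([pos, neg]).sort_index()

print((balanced.ActivityScore >= 40).sum(),
      (balanced.ActivityScore < 40).sum())          # 3 3
```

As with the `np.where` approach, the seed (`random_state`) controls which negatives are kept, and every positive survives.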


In [7]:
# Demonstrate the dataset is balanced
n_pos = (balanced.ActivityScore >= 40).sum()
n_neg = (balanced.ActivityScore < 40).sum()
n_neg, n_pos


Out[7]:
(2771, 2771)

Write files for analysis


In [8]:
balanced.to_csv("data/balanced2d6.csv", index=False)

Generate Training and Test Set


In [9]:
twoD6 = pd.read_csv("data/balanced2d6.csv")

In [10]:
twoD6.head()


Out[10]:
SID ActivityScore apol a_acc a_acid a_aro a_base a_count a_don a_heavy ... vsa_acid vsa_base vsa_don vsa_hyd vsa_other vsa_pol Weight weinerPath weinerPol zagreb
0 842618 44 70.986168 5 0 17 2 64 0 31 ... 0 0 0.000000 380.40643 11.190562 43.926376 423.56500 2659 49 164
1 842953 44 62.660240 3 0 12 1 52 1 29 ... 0 0 0.000000 340.08496 24.140093 33.813168 434.34698 2338 47 154
2 843293 0 60.107033 3 0 0 0 51 1 27 ... 0 0 9.421040 288.14563 45.765236 45.975922 384.50400 1955 44 138
3 843526 41 45.001102 3 0 11 0 36 2 22 ... 0 0 11.365152 216.60484 21.408051 41.002750 314.36499 1107 34 114
4 843662 40 65.452202 3 0 15 1 57 0 29 ... 0 0 0.000000 315.71521 12.949531 36.147465 390.51099 2477 48 162

5 rows × 188 columns

Shuffle and split dataset while preserving pandas index and metadata.


In [11]:
# Method adapted to Python3 from function by boates at https://gist.github.com/boates/5127281
N = len(twoD6)

In [12]:
l = list(range(N))

In [13]:
import random  # stdlib randomizer used for the shuffle

random.seed(76)
random.shuffle(l)

In [14]:
# get splitting indices
# Here they are set to 80% training, 0% cross-validation and 20% test sets
trainLen = int(N*.8)
cvLen    = int(N*0.0)
testLen  = int(N*.2)

In [15]:
# get training, cv, and test sets
training = twoD6.iloc[l[:trainLen]]
cv       = twoD6.iloc[l[trainLen:trainLen+cvLen]]
test     = twoD6.iloc[l[trainLen+cvLen:]]  # slicing to the end absorbs any rounding remainder
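The shuffle-and-slice split above can also be sketched with pandas alone, using `sample(frac=1)` to shuffle all rows while preserving the index. A sketch on a toy frame, not the notebook's data:

```python
import pandas as pd

# Toy frame standing in for the balanced 2D6 data
df = pd.DataFrame({"x": range(10)})

shuffled = df.sample(frac=1, random_state=76)  # shuffle rows, keep original index
train_len = int(len(df) * 0.8)
training = shuffled.iloc[:train_len]
test = shuffled.iloc[train_len:]               # remainder goes to the test set

print(len(training), len(test))                # 8 2
```

Because `int()` truncates, the test slice can end up slightly larger than an exact 20% of the data, which is why the test set below has 1109 rows rather than 1108.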

In [16]:
# Examine training set
training.head()


Out[16]:
SID ActivityScore apol a_acc a_acid a_aro a_base a_count a_don a_heavy ... vsa_acid vsa_base vsa_don vsa_hyd vsa_other vsa_pol Weight weinerPath weinerPol zagreb
3289 11114095 85 25.967930 0 5 0 1 23 0 13 ... 74.321251 17.742489 17.742489 71.454041 28.375498 92.063744 207.12199 272 13 58
2488 11111502 41 52.303032 0 0 12 1 45 0 21 ... 0.000000 5.682576 5.682576 254.943600 0.000000 5.682576 278.41901 810 40 118
2462 11111413 0 35.143032 0 0 0 3 36 0 12 ... 0.000000 41.167557 41.167557 133.040590 0.000000 41.167557 174.31200 215 13 52
2821 11112630 20 32.461517 1 0 6 0 26 0 14 ... 0.000000 0.000000 0.000000 166.094760 13.166624 5.682576 204.29700 298 17 76
2293 11110827 0 64.246994 5 0 0 0 57 3 28 ... 0.000000 0.000000 0.000000 267.817900 16.917038 67.834602 408.92200 1670 70 168

5 rows × 188 columns


In [17]:
test.shape


Out[17]:
(1109, 188)

In [18]:
# Check number of actives and inactives in test set
n_pos1 = (test.ActivityScore >= 40).sum()
n_neg1 = (test.ActivityScore < 40).sum()
n_neg1, n_pos1


Out[18]:
(557, 552)

In [19]:
# Check number of actives and inactives in training set
n_pos2 = (training.ActivityScore >= 40).sum()
n_neg2 = (training.ActivityScore < 40).sum()
n_neg2, n_pos2


Out[19]:
(2214, 2219)
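The random shuffle leaves the two splits only approximately balanced (557 vs. 552 and 2214 vs. 2219 above). If an exactly balanced split is preferred, a stratified split can be sketched with a grouped sample. This is a sketch on a toy frame; the hypothetical `label` column stands in for the ActivityScore >= 40 indicator:

```python
import pandas as pd

# Toy labeled frame: 6 positives and 6 negatives
df = pd.DataFrame({"label": [1] * 6 + [0] * 6, "x": range(12)})

# Sample the same fraction within each class, so both splits stay balanced
training = df.groupby("label").sample(frac=0.8, random_state=76)
test = df.drop(training.index)

print(training.label.value_counts().to_dict(),
      test.label.value_counts().to_dict())
```

`GroupBy.sample` requires pandas 1.1 or later; scikit-learn's `train_test_split` with `stratify=` offers the same guarantee.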

Write resulting training and test set to files for use in all further analyses.


In [20]:
training.to_csv("data/training2d6.csv", index=False)
test.to_csv("data/test2d6.csv", index=False)
