In [1]:
import random

import numpy as np
import pandas as pd
In [2]:
# Load the raw 2D6 dataset from CSV into a DataFrame
isozyme2d6 = pd.read_csv('data/2d6.csv')
In [3]:
# The raw column name contains hyphens, which are not valid in a Python
# identifier, so it is renamed to allow attribute access
# (isozyme2d6.ActivityScore). The renamed frame is assigned back instead of
# mutating with inplace=True, so the cell is a plain, re-runnable assignment.
isozyme2d6 = isozyme2d6.rename(columns={'p450-cyp2d6-ActivityScore': 'ActivityScore'})
In [4]:
# Count the positives: substances whose activity score is 40 or higher
n_pos = isozyme2d6["ActivityScore"].ge(40).sum()
n_pos
Out[4]:
In [5]:
# Count the negatives: substances whose activity score is under 40
n_neg = isozyme2d6["ActivityScore"].lt(40).sum()
n_neg
Out[5]:
This section of code shuffles the order of the substances with an Activity Score below 40 (negatives). It then counts the number of substances with an Activity Score of 40 or above (positives) and uses that number as a cutoff on the shuffled list of negatives: the first that many negatives are kept and the rest are dropped. When complete, there are equal numbers of positives and negatives.
The seed value is set for the randomizer to ensure reproducibility. Different seeds will result in different negatives being included in the analysis set. All of the positives are always included in the resulting dataset.
In [6]:
# Balance the classes by downsampling the negatives, which greatly outnumber
# the positives (approach adapted from a DataRobot post about scikit-learn
# classification).
is_negative = isozyme2d6.ActivityScore < 40
indices = np.flatnonzero(is_negative.to_numpy())  # row positions of the negatives
rng = np.random.RandomState(50)  # fixed seed, so the same negatives are chosen every run
rng.shuffle(indices)  # in-place shuffle; another seed would give another order
n_pos = (isozyme2d6.ActivityScore >= 40).sum()
# Keep the first n_pos shuffled negatives and drop every negative after them.
surplus_labels = isozyme2d6.index[indices[n_pos:]]
balanced = isozyme2d6.drop(index=surplus_labels)
balanced.head(10)
Out[6]:
In [7]:
# Confirm the downsampled frame holds equal numbers of negatives and positives
balanced_scores = balanced["ActivityScore"]
n_pos = balanced_scores.ge(40).sum()
n_neg = balanced_scores.lt(40).sum()
n_neg, n_pos
Out[7]:
In [8]:
# Persist the balanced dataset; index=False leaves the row labels out of the file
balanced.to_csv("data/balanced2d6.csv", index=False)
In [9]:
# Reload the balanced data. The file was written without row labels, so the
# reloaded frame gets fresh labels 0..N-1, which the shuffled split further
# down uses to look rows up.
twoD6 = pd.read_csv("data/balanced2d6.csv")
In [10]:
# Preview the first rows of the reloaded balanced data
twoD6.head()
Out[10]:
In [11]:
# Train/test split method adapted to Python 3 from a function by boates:
# https://gist.github.com/boates/5127281
N = twoD6.shape[0]  # total number of rows in the balanced dataset
In [12]:
# Row positions 0..N-1, to be shuffled and then sliced into the data splits
l = [*range(N)]
In [13]:
# Shuffle the row positions reproducibly (requires `import random` in the
# notebook's import cell).
# The list is rebuilt here so that re-running this cell always starts from the
# ordered positions 0..N-1; shuffling an already-shuffled list with the same
# seed would otherwise produce a different split on the second run.
l = list(range(N))
random.seed(76)  # fixed seed -> the same train/test partition every run
random.shuffle(l)  # shuffles in place
In [14]:
# Split sizes: 80% training, 0% cross-validation, 20% test.
# Each size is truncated to a whole number of rows. The split itself takes the
# test set as every row after the training and cv rows, so testLen is
# informational only.
trainLen, cvLen, testLen = (int(N * fraction) for fraction in (0.8, 0.0, 0.2))
In [15]:
# Slice the shuffled row positions into training, cv, and test sets.
# .iloc replaces the .ix indexer, which was deprecated and then removed in
# pandas 1.0. `l` holds row positions 0..N-1 and twoD6 was read from CSV with
# the default 0..N-1 index, so positional selection picks exactly the rows
# the old label lookup did.
training = twoD6.iloc[l[:trainLen]]
cv = twoD6.iloc[l[trainLen:trainLen + cvLen]]
# The test set is everything left over, so no row is lost to int() truncation.
test = twoD6.iloc[l[trainLen + cvLen:]]
In [16]:
# Examine the first rows of the training set
training.head()
Out[16]:
In [17]:
# Dimensions (rows, columns) of the test set
test.shape
Out[17]:
In [18]:
# Tally the inactives (score < 40) and actives (score >= 40) in the test set
test_scores = test["ActivityScore"]
n_pos1 = test_scores.ge(40).sum()
n_neg1 = test_scores.lt(40).sum()
n_neg1, n_pos1
Out[18]:
In [19]:
# Tally the inactives (score < 40) and actives (score >= 40) in the training set
training_scores = training["ActivityScore"]
n_pos2 = training_scores.ge(40).sum()
n_neg2 = training_scores.lt(40).sum()
n_neg2, n_pos2
Out[19]:
In [20]:
# Save the training and test sets; index=False keeps the shuffled row labels
# out of the output files
training.to_csv("data/training2d6.csv", index=False)
test.to_csv("data/test2d6.csv", index=False)
In [20]: