In [1]:
import sys
# NOTE(review): absolute local path — not portable; prefer installing the
# package or appending a path relative to the notebook's location.
sys.path.append('/Users/rgerkin/Dropbox/science/olfaction-prediction') # Change to your path.
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20.
# Modern code imports ShuffleSplit from sklearn.model_selection, whose API
# differs (n_splits instead of n_iter; splits come from .split(X) rather
# than iterating the object) — porting would require changing every cell
# below that builds or consumes `splits`.
from sklearn.cross_validation import ShuffleSplit
import csv  # NOTE(review): appears unused in the visible cells.
import pandas as pd
from opc_python.utils.loading import get_CIDs
In [2]:
# Fetch the challenge's training-molecule CIDs and report the count.
training_CIDs = get_CIDs('training')
n_training_molecules = len(training_CIDs)
print("There are %d molecules in the training set from the challenge." % n_training_molecules)
In [3]:
# Hold out 20% of the molecules in each of 100 random train/test splits.
# random_state=0 pins the shuffling so identical splits come out on every run.
test_size = 0.2
splits = ShuffleSplit(len(training_CIDs), n_iter=100, test_size=test_size, random_state=0)
n_molecules = len(training_CIDs)
train_n = round(n_molecules * (1 - test_size))
test_n = round(n_molecules * test_size)
print(("With a test size of %.2f, there will be %d items in each training split "
       "and %d items in each test split" % (test_size, train_n, test_n)))
In [4]:
# Store each split as a table with columns ['index','CID'] inside a 100-item
# Panel: 'index' is the molecule's position in training_CIDs, 'CID' its ID.
# NOTE(review): pd.Panel was removed in pandas 1.0; this only runs on the
# pandas version contemporary with the notebook.
train_sets = pd.Panel(None, items=list(range(100)),
                      major_axis=list(range(train_n)), minor_axis=['index', 'CID'])
test_sets = pd.Panel(None, items=list(range(100)),
                     major_axis=list(range(test_n)), minor_axis=['index', 'CID'])
for split_num, (train_idx, test_idx) in enumerate(splits):
    # Training half of this split.
    frame = pd.DataFrame(None, index=[list(range(len(train_idx)))], columns=['index', 'CID'])
    for row, pos in enumerate(train_idx):
        frame.iloc[row] = pos, training_CIDs[pos]
    train_sets[split_num] = frame
    # Test half of this split.
    frame = pd.DataFrame(None, index=[list(range(len(test_idx)))], columns=['index', 'CID'])
    for row, pos in enumerate(test_idx):
        frame.iloc[row] = pos, training_CIDs[pos]
    test_sets[split_num] = frame
train_sets, test_sets
Out[4]:
In [5]:
# Peek at the first five rows of the third split's training table: each row
# pairs a CID with its position in the ordered list of training CIDs.
third_split_train = train_sets[2]
third_split_train.head(5)
Out[5]:
In [8]:
# This creates two files, one for the training halves and one for the test
# halves of the splits. Each has one hundred rows, one for each split.
# Note the rows contain the molecules' CIDs (the 'CID' slice is taken below),
# not their positional indices.
train_sets.minor_xs('CID').transpose().to_csv('cv_splits_train.csv',header=False,index=False)
test_sets.minor_xs('CID').transpose().to_csv('cv_splits_test.csv',header=False,index=False)
In [9]:
# Combined CID list spanning both the training and leaderboard sets, sorted.
combined_CIDs = get_CIDs('training') + get_CIDs('leaderboard')
training_leaderboard_CIDs = sorted(combined_CIDs)
In [14]:
# Same splitting scheme as above, but over the larger combined CID list and
# with a 17% hold-out. random_state=0 again makes the splits reproducible.
test_size = 0.17
splits = ShuffleSplit(len(training_leaderboard_CIDs), n_iter=100, test_size=test_size, random_state=0)
n_molecules = len(training_leaderboard_CIDs)
train_n = round(n_molecules * (1 - test_size))
test_n = round(n_molecules * test_size)
print(("With a test size of %.2f, there will be %d items in each training split "
       "and %d items in each test split" % (test_size, train_n, test_n)))
In [16]:
# As before: one ['index','CID'] table per split, now drawn from the combined
# training + leaderboard CID list.
# NOTE(review): pd.Panel was removed in pandas 1.0; this only runs on the
# pandas version contemporary with the notebook.
train_sets = pd.Panel(None, items=list(range(100)),
                      major_axis=list(range(train_n)), minor_axis=['index', 'CID'])
test_sets = pd.Panel(None, items=list(range(100)),
                     major_axis=list(range(test_n)), minor_axis=['index', 'CID'])
for split_num, (train_idx, test_idx) in enumerate(splits):
    # Training half of this split.
    frame = pd.DataFrame(None, index=[list(range(len(train_idx)))], columns=['index', 'CID'])
    for row, pos in enumerate(train_idx):
        frame.iloc[row] = pos, training_leaderboard_CIDs[pos]
    train_sets[split_num] = frame
    # Test half of this split.
    frame = pd.DataFrame(None, index=[list(range(len(test_idx)))], columns=['index', 'CID'])
    for row, pos in enumerate(test_idx):
        frame.iloc[row] = pos, training_leaderboard_CIDs[pos]
    test_sets[split_num] = frame
train_sets, test_sets
Out[16]:
In [17]:
# Peek at the first five rows of the third split's training table: each row
# pairs a CID with its position in the ordered combined CID list.
third_split_train = train_sets[2]
third_split_train.head(5)
Out[17]:
In [18]:
# This creates two files, one for the training halves and one for the test
# halves of the splits. Each has one hundred rows, one for each split.
# Note the rows contain the molecules' CIDs (the 'CID' slice is taken below),
# not their positional indices.
train_sets.minor_xs('CID').transpose().to_csv('cv_splits_train_big.csv',header=False,index=False)
test_sets.minor_xs('CID').transpose().to_csv('cv_splits_test_big.csv',header=False,index=False)