In [1]:
import sys
sys.path.append('/Users/rgerkin/Dropbox/science/olfaction-prediction') # Change to your path. 

from sklearn.cross_validation import ShuffleSplit
import csv
import pandas as pd

from opc_python.utils.loading import get_CIDs

First, use only training set CIDs


In [2]:
training_CIDs = get_CIDs('training') # The list of CIDs for the training molecules from the challenge.  
print("There are %d molecules in the training set from the challenge." % len(training_CIDs))


There are 338 molecules in the training set from the challenge.

In [3]:
test_size = 0.2
splits = ShuffleSplit(len(training_CIDs),n_iter=100,test_size=test_size,random_state=0)
# random_state = 0 means this will produce the same splits each time it is run.  
train_n = round(len(training_CIDs) * (1-test_size))
test_n = round(len(training_CIDs) * (test_size))
print(("With a test size of %.2f, there will be %d items in each training split "
       "and %d items in each test split" % (test_size,train_n,test_n)))


With a test size of 0.20, there will be 270 items in each training split and 68 items in each test split

In [4]:
train_sets = pd.Panel(None, items=list(range(100)),
                          major_axis=list(range(train_n)),minor_axis=['index','CID'])
test_sets = pd.Panel(None, items=list(range(100)),
                         major_axis=list(range(test_n)),minor_axis=['index','CID'])
for j,(train,test) in enumerate(splits):
    train_df = pd.DataFrame(None,index=[list(range(len(train)))],columns=['index','CID'])
    for i,x in enumerate(train):
        train_df.iloc[i] = x,training_CIDs[x]
    train_sets[j] = train_df
    
    test_df = pd.DataFrame(None,index=[list(range(len(test)))],columns=['index','CID'])
    for i,x in enumerate(test):
        test_df.iloc[i] = x,training_CIDs[x]
    test_sets[j] = test_df

train_sets,test_sets


Out[4]:
(<class 'pandas.core.panel.Panel'>
 Dimensions: 100 (items) x 270 (major_axis) x 2 (minor_axis)
 Items axis: 0 to 99
 Major_axis axis: 0 to 269
 Minor_axis axis: index to CID, <class 'pandas.core.panel.Panel'>
 Dimensions: 100 (items) x 68 (major_axis) x 2 (minor_axis)
 Items axis: 0 to 99
 Major_axis axis: 0 to 67
 Minor_axis axis: index to CID)

In [5]:
# For example, the first five training CIDs from the third split, 
# showing the CID and its corresponding index in the ordered list of CIDs:
train_sets[2].head()


Out[5]:
index CID
0 52 6213
1 289 104721
2 15 460
3 336 6429333
4 121 8077

In [8]:
# This creates two files.  
# One contains the training data indices and the other the test indices.  
# Each have one hundred rows, one for each split.  
train_sets.minor_xs('CID').transpose().to_csv('cv_splits_train.csv',header=False,index=False)
test_sets.minor_xs('CID').transpose().to_csv('cv_splits_test.csv',header=False,index=False)

Second, use training set CIDs and leaderboard set CIDs.


In [9]:
training_leaderboard_CIDs = sorted(get_CIDs('training')+get_CIDs('leaderboard'))

In [14]:
test_size = 0.17
splits = ShuffleSplit(len(training_leaderboard_CIDs),n_iter=100,test_size=test_size,random_state=0)
# random_state = 0 means this will produce the same splits each time it is run.  
train_n = round(len(training_leaderboard_CIDs) * (1-test_size))
test_n = round(len(training_leaderboard_CIDs) * (test_size))
print(("With a test size of %.2f, there will be %d items in each training split "
       "and %d items in each test split" % (test_size,train_n,test_n)))


With a test size of 0.17, there will be 338 items in each training split and 69 items in each test split

In [16]:
train_sets = pd.Panel(None, items=list(range(100)),
                          major_axis=list(range(train_n)),minor_axis=['index','CID'])
test_sets = pd.Panel(None, items=list(range(100)),
                         major_axis=list(range(test_n)),minor_axis=['index','CID'])
for j,(train,test) in enumerate(splits):
    train_df = pd.DataFrame(None,index=[list(range(len(train)))],columns=['index','CID'])
    for i,x in enumerate(train):
        train_df.iloc[i] = x,training_leaderboard_CIDs[x]
    train_sets[j] = train_df
    
    test_df = pd.DataFrame(None,index=[list(range(len(test)))],columns=['index','CID'])
    for i,x in enumerate(test):
        test_df.iloc[i] = x,training_leaderboard_CIDs[x]
    test_sets[j] = test_df

train_sets,test_sets


Out[16]:
(<class 'pandas.core.panel.Panel'>
 Dimensions: 100 (items) x 338 (major_axis) x 2 (minor_axis)
 Items axis: 0 to 99
 Major_axis axis: 0 to 337
 Minor_axis axis: index to CID, <class 'pandas.core.panel.Panel'>
 Dimensions: 100 (items) x 69 (major_axis) x 2 (minor_axis)
 Items axis: 0 to 99
 Major_axis axis: 0 to 68
 Minor_axis axis: index to CID)

In [17]:
# For example, the first five training CIDs from the third split, 
# showing the CID and its corresponding index in the ordered list of CIDs:
train_sets[2].head()


Out[17]:
index CID
0 235 14286
1 324 62332
2 243 15510
3 345 101604
4 281 31265

In [18]:
# This creates two files.  
# One contains the training data indices and the other the test indices.  
# Each have one hundred rows, one for each split.  
train_sets.minor_xs('CID').transpose().to_csv('cv_splits_train_big.csv',header=False,index=False)
test_sets.minor_xs('CID').transpose().to_csv('cv_splits_test_big.csv',header=False,index=False)