In [1]:

    
import sys
sys.path.append('/Users/rgerkin/Dropbox/science/olfaction-prediction') # Change to your path. 

from sklearn.cross_validation import ShuffleSplit
import csv
import pandas as pd

from opc_python.utils.loading import get_CIDs

First, use only training set CIDs



In [2]:

    
training_CIDs = get_CIDs('training') # The list of CIDs for the training molecules from the challenge.  
print("There are %d molecules in the training set from the challenge." % len(training_CIDs))









    



There are 338 molecules in the training set from the challenge.



In [3]:

    
test_size = 0.2
splits = ShuffleSplit(len(training_CIDs),n_iter=100,test_size=test_size,random_state=0)
# random_state = 0 means this will produce the same splits each time it is run.  
train_n = round(len(training_CIDs) * (1-test_size))
test_n = round(len(training_CIDs) * (test_size))
print(("With a test size of %.2f, there will be %d items in each training split "
       "and %d items in each test split" % (test_size,train_n,test_n)))









    



With a test size of 0.20, there will be 270 items in each training split and 68 items in each test split



In [4]:

    
train_sets = pd.Panel(None, items=list(range(100)),
                          major_axis=list(range(train_n)),minor_axis=['index','CID'])
test_sets = pd.Panel(None, items=list(range(100)),
                         major_axis=list(range(test_n)),minor_axis=['index','CID'])
for j,(train,test) in enumerate(splits):
    train_df = pd.DataFrame(None,index=[list(range(len(train)))],columns=['index','CID'])
    for i,x in enumerate(train):
        train_df.iloc[i] = x,training_CIDs[x]
    train_sets[j] = train_df
    
    test_df = pd.DataFrame(None,index=[list(range(len(test)))],columns=['index','CID'])
    for i,x in enumerate(test):
        test_df.iloc[i] = x,training_CIDs[x]
    test_sets[j] = test_df

train_sets,test_sets









    Out[4]:





(<class 'pandas.core.panel.Panel'>
 Dimensions: 100 (items) x 270 (major_axis) x 2 (minor_axis)
 Items axis: 0 to 99
 Major_axis axis: 0 to 269
 Minor_axis axis: index to CID, <class 'pandas.core.panel.Panel'>
 Dimensions: 100 (items) x 68 (major_axis) x 2 (minor_axis)
 Items axis: 0 to 99
 Major_axis axis: 0 to 67
 Minor_axis axis: index to CID)



In [5]:

    
# For example, the first five training CIDs from the third split, 
# showing the CID and its corresponding index in the ordered list of CIDs:
train_sets[2].head()



In [8]:

    
# This creates two files.  
# One contains the training data indices and the other the test indices.  
# Each have one hundred rows, one for each split.  
train_sets.minor_xs('CID').transpose().to_csv('cv_splits_train.csv',header=False,index=False)
test_sets.minor_xs('CID').transpose().to_csv('cv_splits_test.csv',header=False,index=False)

Second, use training set CIDs and leaderboard set CIDs.



In [9]:

    
training_leaderboard_CIDs = sorted(get_CIDs('training')+get_CIDs('leaderboard'))



In [14]:

    
test_size = 0.17
splits = ShuffleSplit(len(training_leaderboard_CIDs),n_iter=100,test_size=test_size,random_state=0)
# random_state = 0 means this will produce the same splits each time it is run.  
train_n = round(len(training_leaderboard_CIDs) * (1-test_size))
test_n = round(len(training_leaderboard_CIDs) * (test_size))
print(("With a test size of %.2f, there will be %d items in each training split "
       "and %d items in each test split" % (test_size,train_n,test_n)))









    



With a test size of 0.17, there will be 338 items in each training split and 69 items in each test split



In [16]:

    
train_sets = pd.Panel(None, items=list(range(100)),
                          major_axis=list(range(train_n)),minor_axis=['index','CID'])
test_sets = pd.Panel(None, items=list(range(100)),
                         major_axis=list(range(test_n)),minor_axis=['index','CID'])
for j,(train,test) in enumerate(splits):
    train_df = pd.DataFrame(None,index=[list(range(len(train)))],columns=['index','CID'])
    for i,x in enumerate(train):
        train_df.iloc[i] = x,training_leaderboard_CIDs[x]
    train_sets[j] = train_df
    
    test_df = pd.DataFrame(None,index=[list(range(len(test)))],columns=['index','CID'])
    for i,x in enumerate(test):
        test_df.iloc[i] = x,training_leaderboard_CIDs[x]
    test_sets[j] = test_df

train_sets,test_sets









    Out[16]:





(<class 'pandas.core.panel.Panel'>
 Dimensions: 100 (items) x 338 (major_axis) x 2 (minor_axis)
 Items axis: 0 to 99
 Major_axis axis: 0 to 337
 Minor_axis axis: index to CID, <class 'pandas.core.panel.Panel'>
 Dimensions: 100 (items) x 69 (major_axis) x 2 (minor_axis)
 Items axis: 0 to 99
 Major_axis axis: 0 to 68
 Minor_axis axis: index to CID)



In [17]:

    
# For example, the first five training CIDs from the third split, 
# showing the CID and its corresponding index in the ordered list of CIDs:
train_sets[2].head()



In [18]:

    
# This creates two files.  
# One contains the training data indices and the other the test indices.  
# Each have one hundred rows, one for each split.  
train_sets.minor_xs('CID').transpose().to_csv('cv_splits_train_big.csv',header=False,index=False)
test_sets.minor_xs('CID').transpose().to_csv('cv_splits_test_big.csv',header=False,index=False)

	index	CID
0	52	6213
1	289	104721
2	15	460
3	336	6429333
4	121	8077

	index	CID
0	235	14286
1	324	62332
2	243	15510
3	345	101604
4	281	31265