In [1]:
# Preliminaries to work with the data.   
%matplotlib inline
import os
import sys
curr_path = os.getcwd()
gerkin_path = os.path.split(curr_path)[0]
olfaction_prediction_path = os.path.split(gerkin_path)[0]
sys.path.append(olfaction_prediction_path)
import opc_python
from opc_python.utils import loading, scoring
from opc_python.gerkin import dream,params
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
training_leaderboard_CIDs = sorted(loading.get_CIDs('training')+loading.get_CIDs('leaderboard'))

In [3]:
from sklearn.cross_validation import ShuffleSplit
test_size = 0.17
splits = ShuffleSplit(len(training_leaderboard_CIDs),n_iter=250,test_size=test_size,random_state=0)
# random_state = 0 means this will produce the same splits each time it is run.  
train_n = int(round(len(training_leaderboard_CIDs) * (1-test_size)))
test_n = int(round(len(training_leaderboard_CIDs) * (test_size)))
print(("With a test size of %.2f, there will be %d items in each training split "
       "and %d items in each test split" % (test_size,train_n,test_n)))


With a test size of 0.17, there will be 338 items in each training split and 69 items in each test split

In [12]:
for train,test in splits:
    print(len(train),len(test))


337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70
337 70

In [5]:
train_sets = pd.Panel(None, items=list(range(100)),
                          major_axis=list(range(train_n)),minor_axis=['index','CID'])
test_sets = pd.Panel(None, items=list(range(100)),
                         major_axis=list(range(test_n)),minor_axis=['index','CID'])
for j,(train,test) in enumerate(splits):
    train_df = pd.DataFrame(None,index=[list(range(len(train)))],columns=['index','CID'])
    for i,x in enumerate(train):
        train_df.iloc[i] = x,training_leaderboard_CIDs[x]
    train_sets[j] = train_df
    
    test_df = pd.DataFrame(None,index=[list(range(len(test)))],columns=['index','CID'])
    for i,x in enumerate(test):
        test_df.iloc[i] = x,training_leaderboard_CIDs[x]
    test_sets[j] = test_df

train_sets,test_sets


Out[5]:
(<class 'pandas.core.panel.Panel'>
 Dimensions: 250 (items) x 338 (major_axis) x 2 (minor_axis)
 Items axis: 0 to 249
 Major_axis axis: 0 to 337
 Minor_axis axis: index to CID, <class 'pandas.core.panel.Panel'>
 Dimensions: 250 (items) x 69 (major_axis) x 2 (minor_axis)
 Items axis: 0 to 249
 Major_axis axis: 0 to 68
 Minor_axis axis: index to CID)

In [6]:
train_sets[2].tail()


Out[6]:
index CID
333 145 8093
334 48 5780
335 318 61771
336 312 61408
337 NaN NaN

In [7]:
train_sets[2].head()


Out[7]:
index CID
0 235 14286
1 324 62332
2 243 15510
3 345 101604
4 281 31265

In [8]:
test_sets[0].head()


Out[8]:
index CID
0 261 23235
1 122 7824
2 60 6276
3 232 14104
4 135 8030

In [9]:
# This creates two files.  
# One contains the training data indices and the other the test indices.  
# Each have one hundred rows, one for each split.  
train_sets.minor_xs('CID').transpose().to_csv('../../data/cv_splits_train_bigger.csv',header=False,index=False)
test_sets.minor_xs('CID').transpose().to_csv('../../data/cv_splits_test_bigger.csv',header=False,index=False)

In [10]:
train_sets.minor_xs('CID').transpose().isnull().sum().sum()


Out[10]:
250