In [1]:
# Preliminaries to work with the data.
%matplotlib inline
import os
import sys
# Add the project root (two directories up) to the path so opc_python can be imported.
curr_path = os.getcwd()
gerkin_path = os.path.split(curr_path)[0]
olfaction_prediction_path = os.path.split(gerkin_path)[0]
sys.path.append(olfaction_prediction_path)
import opc_python
from opc_python.utils import loading, scoring
from opc_python.gerkin import dream, params
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
In [2]:
# Pool the training and leaderboard compounds into one sorted list of CIDs.
training_leaderboard_CIDs = sorted(loading.get_CIDs('training') + loading.get_CIDs('leaderboard'))
In [3]:
# Note: sklearn.cross_validation was renamed to sklearn.model_selection in scikit-learn 0.18.
from sklearn.cross_validation import ShuffleSplit
test_size = 0.17
n_splits = 250
splits = ShuffleSplit(len(training_leaderboard_CIDs), n_iter=n_splits,
                      test_size=test_size, random_state=0)
# random_state=0 means this will produce the same splits each time it is run.
train_n = int(round(len(training_leaderboard_CIDs) * (1 - test_size)))
test_n = int(round(len(training_leaderboard_CIDs) * test_size))
print(("With a test size of %.2f, there will be %d items in each training split "
       "and %d items in each test split") % (test_size, train_n, test_n))
In [12]:
# Sanity check: every split should contain train_n training and test_n test indices.
for train, test in splits:
    print(len(train), len(test))
In [5]:
# Record the index and CID of every compound in each split, one Panel item per split;
# the item axis covers all n_splits splits.
train_sets = pd.Panel(None, items=list(range(n_splits)),
                      major_axis=list(range(train_n)), minor_axis=['index','CID'])
test_sets = pd.Panel(None, items=list(range(n_splits)),
                     major_axis=list(range(test_n)), minor_axis=['index','CID'])
for j, (train, test) in enumerate(splits):
    train_df = pd.DataFrame(None, index=list(range(len(train))), columns=['index','CID'])
    for i, x in enumerate(train):
        train_df.iloc[i] = x, training_leaderboard_CIDs[x]
    train_sets[j] = train_df
    test_df = pd.DataFrame(None, index=list(range(len(test))), columns=['index','CID'])
    for i, x in enumerate(test):
        test_df.iloc[i] = x, training_leaderboard_CIDs[x]
    test_sets[j] = test_df
train_sets, test_sets
Out[5]:
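pd.Panel was deprecated in pandas 0.20 and later removed, so on newer pandas the same bookkeeping can be done with a plain dict of DataFrames. A minimal sketch (train_frames and test_frames are illustrative names, not used elsewhere in this notebook):

# Sketch for newer pandas, where pd.Panel no longer exists.
train_frames, test_frames = {}, {}
for j, (train, test) in enumerate(splits):
    # Each entry maps a split number to a DataFrame of indices and CIDs.
    train_frames[j] = pd.DataFrame(
        {'index': train, 'CID': [training_leaderboard_CIDs[x] for x in train]})
    test_frames[j] = pd.DataFrame(
        {'index': test, 'CID': [training_leaderboard_CIDs[x] for x in test]})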
In [6]:
train_sets[2].tail()
Out[6]:
In [7]:
train_sets[2].head()
Out[7]:
In [8]:
test_sets[0].head()
Out[8]:
In [9]:
# This creates two files.
# One contains the CIDs for the training splits and the other the CIDs for the test splits.
# Each has one row per split.
train_sets.minor_xs('CID').transpose().to_csv('../../data/cv_splits_train_bigger.csv',
                                              header=False, index=False)
test_sets.minor_xs('CID').transpose().to_csv('../../data/cv_splits_test_bigger.csv',
                                             header=False, index=False)
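To confirm the layout on disk, a quick sketch of reading one file back (same relative path as above; rows are splits, columns are CID positions within a split):

# Sketch: each row of the CSV is one split's list of CIDs.
cv_train = pd.read_csv('../../data/cv_splits_train_bigger.csv', header=None)
print(cv_train.shape)  # expected: (number of splits, train_n)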
In [10]:
# Sanity check: no CID should be missing from any training split (expect 0).
train_sets.minor_xs('CID').transpose().isnull().sum().sum()
Out[10]: