notebook.community

Edit and run



In [1]:

    
# Preliminaries to work with the data.   
%matplotlib inline
import os
import sys
curr_path = os.getcwd()
gerkin_path = os.path.split(curr_path)[0]
olfaction_prediction_path = os.path.split(gerkin_path)[0]
sys.path.append(olfaction_prediction_path)
import opc_python
from opc_python.utils import loading, scoring
from opc_python.gerkin import dream,params
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd



In [2]:

    
training_leaderboard_CIDs = sorted(loading.get_CIDs('training')+loading.get_CIDs('leaderboard'))



In [3]:

    
from sklearn.cross_validation import ShuffleSplit
test_size = 0.17
splits = ShuffleSplit(len(training_leaderboard_CIDs),n_iter=250,test_size=test_size,random_state=0)
# random_state = 0 means this will produce the same splits each time it is run.  
train_n = int(round(len(training_leaderboard_CIDs) * (1-test_size)))
test_n = int(round(len(training_leaderboard_CIDs) * (test_size)))
print(("With a test size of %.2f, there will be %d items in each training split "
       "and %d items in each test split" % (test_size,train_n,test_n)))









    



With a test size of 0.17, there will be 338 items in each training split and 69 items in each test split



In [12]:

    
for train,test in splits:
    print(len(train),len(test))



In [5]:

    
train_sets = pd.Panel(None, items=list(range(100)),
                          major_axis=list(range(train_n)),minor_axis=['index','CID'])
test_sets = pd.Panel(None, items=list(range(100)),
                         major_axis=list(range(test_n)),minor_axis=['index','CID'])
for j,(train,test) in enumerate(splits):
    train_df = pd.DataFrame(None,index=[list(range(len(train)))],columns=['index','CID'])
    for i,x in enumerate(train):
        train_df.iloc[i] = x,training_leaderboard_CIDs[x]
    train_sets[j] = train_df
    
    test_df = pd.DataFrame(None,index=[list(range(len(test)))],columns=['index','CID'])
    for i,x in enumerate(test):
        test_df.iloc[i] = x,training_leaderboard_CIDs[x]
    test_sets[j] = test_df

train_sets,test_sets









    Out[5]:





(<class 'pandas.core.panel.Panel'>
 Dimensions: 250 (items) x 338 (major_axis) x 2 (minor_axis)
 Items axis: 0 to 249
 Major_axis axis: 0 to 337
 Minor_axis axis: index to CID, <class 'pandas.core.panel.Panel'>
 Dimensions: 250 (items) x 69 (major_axis) x 2 (minor_axis)
 Items axis: 0 to 249
 Major_axis axis: 0 to 68
 Minor_axis axis: index to CID)



In [6]:

    
train_sets[2].tail()



In [7]:

    
train_sets[2].head()



In [8]:

    
test_sets[0].head()



In [9]:

    
# This creates two files.  
# One contains the training data indices and the other the test indices.  
# Each have one hundred rows, one for each split.  
train_sets.minor_xs('CID').transpose().to_csv('../../data/cv_splits_train_bigger.csv',header=False,index=False)
test_sets.minor_xs('CID').transpose().to_csv('../../data/cv_splits_test_bigger.csv',header=False,index=False)



In [10]:

    
train_sets.minor_xs('CID').transpose().isnull().sum().sum()









    Out[10]:





250

	index	CID
333	145	8093
334	48	5780
335	318	61771
336	312	61408
337	NaN	NaN

	index	CID
0	235	14286
1	324	62332
2	243	15510
3	345	101604
4	281	31265

	index	CID
0	261	23235
1	122	7824
2	60	6276
3	232	14104
4	135	8030