In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from include.feature_lists import numeric_features
from include.dataset_fnames import generate_station_data_fname, generate_data_fname, generate_split_data_fname

In [3]:
# Resolve the path of the raw numeric training CSV from the project helper.
fname = generate_data_fname(sample_type='train', data_type='numeric')
print(fname)


d:/Kaggle_ws/Bosch/input/train_numeric.csv

In [4]:
%time response_df = pd.read_csv(fname, usecols=['Id', 'Response'])


Wall time: 16 s

In [5]:
response_df.head()


Out[5]:
Id Response
0 4 0
1 6 0
2 7 0
3 9 0
4 11 0

StratifiedKFold in use


In [16]:
from sklearn.model_selection import StratifiedKFold

In [37]:
skf = StratifiedKFold(n_splits=2, random_state=0) # does random_state works?

In [38]:
skf.get_n_splits(response_df['Id'].values, response_df['Response'].values)


Out[38]:
2

In [39]:
# Show the positional index arrays produced by each stratified fold.
# (The parenthesized print renders as a tuple on this Python 2 kernel,
# matching the recorded output below.)
for fold_train, fold_test in skf.split(response_df['Id'].values,
                                       response_df['Response'].values):
    print("TRAIN:", fold_train, "TEST:", fold_test)


('TRAIN:', array([ 587053,  587126,  587626, ..., 1183744, 1183745, 1183746], dtype=int64), 'TEST:', array([     0,      1,      2, ..., 591893, 591894, 591895], dtype=int64))
('TRAIN:', array([     0,      1,      2, ..., 591893, 591894, 591895], dtype=int64), 'TEST:', array([ 587053,  587126,  587626, ..., 1183744, 1183745, 1183746], dtype=int64))

StratifiedShuffleSplit in use


In [40]:
from sklearn.model_selection import StratifiedShuffleSplit

In [48]:
sss = StratifiedShuffleSplit(n_splits=2, test_size=0.5, random_state=35535)

In [49]:
sss.get_n_splits(response_df['Id'].values, response_df['Response'].values)


Out[49]:
2

In [51]:
# Check class balance of each stratified shuffled split.
for train_index, test_index in sss.split(response_df['Id'].values, response_df['Response'].values):
    # split() yields POSITIONAL indices, so iloc is the correct accessor.
    # loc only happened to work here because response_df carries a default
    # RangeIndex after read_csv.
    X_train, X_test = response_df.iloc[train_index], response_df.iloc[test_index]

    print(X_train['Response'].value_counts())
    print(X_test['Response'].value_counts())


0    588434
1      3439
Name: Response, dtype: int64
0    588434
1      3440
Name: Response, dtype: int64
0    588434
1      3439
Name: Response, dtype: int64
0    588434
1      3440
Name: Response, dtype: int64

train_test_split


In [52]:
from sklearn.model_selection import train_test_split

In [59]:
# NOTE(review): no stratify= argument, so this split is NOT stratified --
# the positive counts below come out 3455 vs 3424. Pass
# stratify=response_df['Response'].values to balance the classes.
X_train, X_test, y_train, y_test = train_test_split(response_df['Id'].values, response_df['Response'].values, 
                                                    test_size=0.5, random_state=35535)

In [60]:
X_train


Out[60]:
array([ 475957, 2052296, 1960599, ...,  441904, 1751259,  310403], dtype=int64)

In [61]:
y_train.sum()


Out[61]:
3455

In [62]:
y_test.sum()


Out[62]:
3424

Pandas sample


In [66]:
# Random half-split with plain pandas: sample half the rows, then take the
# complement by dropping the sampled index labels. Not stratified -- the
# class counts below differ between halves (3404 vs 3475 positives).
train=response_df.sample(frac=0.5,random_state=0)
test=response_df.drop(train.index)

In [67]:
# Class balance of each half (parenthesized print works on both Py2 and Py3
# for a single argument).
print(train['Response'].value_counts())
print(test['Response'].value_counts())


0    588470
1      3404
Name: Response, dtype: int64
0    588398
1      3475
Name: Response, dtype: int64

My way


In [6]:
# Manual stratification: separate the classes so each can be halved on its own.
n_response_df = response_df[response_df['Response'] == 0]  # negatives
p_response_df = response_df[response_df['Response'] == 1]  # positives

In [7]:
# Halve the negative class: seeded random half for train, complement for test.
n_response_df_train = n_response_df.sample(frac=0.5,random_state=0)
n_response_df_test = n_response_df.drop(n_response_df_train.index)

In [8]:
# Both negative halves should hold 588434 rows each.
print(n_response_df_train.shape)
print(n_response_df_test.shape)


(588434, 2)
(588434, 2)

In [9]:
# Halve the positive class the same way (off-by-one between halves is
# expected: 6879 positives split as 3440/3439).
p_response_df_train = p_response_df.sample(frac=0.5,random_state=0)
p_response_df_test = p_response_df.drop(p_response_df_train.index)

In [10]:
# Positive halves: 3440 train / 3439 test rows.
print(p_response_df_train.shape)
print(p_response_df_test.shape)


(3440, 2)
(3439, 2)

In [11]:
# Recombine the per-class halves and shuffle the rows (sample(frac=1)).
# DataFrame.append was deprecated and removed in pandas 2.0 -- pd.concat is
# the forward-compatible equivalent and works on old pandas too. The shuffle
# is seeded so the notebook reproduces the same row order on re-run.
response_df_train = pd.concat([n_response_df_train, p_response_df_train]).sample(frac=1, random_state=0)
response_df_test = pd.concat([n_response_df_test, p_response_df_test]).sample(frac=1, random_state=0)

In [12]:
# Combined splits: 591874 train / 591873 test rows, summing to the full set.
print(response_df_train.shape)
print(response_df_test.shape)


(591874, 2)
(591873, 2)

In [13]:
response_df_train['Response'].value_counts()


Out[13]:
0    588434
1      3440
Name: Response, dtype: int64

In [33]:
# Materialize the train Ids; a set gives O(1) membership tests per chunk below.
train_idx = response_df_train['Id'].values
print(train_idx)
train_idx_set = set(train_idx)


[  70787 2060298  420950 ...,    6601  822169   66110]

In [34]:
# Same for the test Ids: array for inspection, set for fast lookups.
test_idx = response_df_test['Id'].values
print(test_idx)
test_idx_set = set(test_idx)


[ 447284  117436 2130018 ...,  530448  834291 1069779]

In [35]:
from include.dataset_fnames import trainsplit_numeric_fname, validsplit_numeric_fname

Mixing up things


In [37]:
chunksize = 100000

use_header = True

t = 0
v = 0
for chunk in pd.read_csv(fname, iterator=True, chunksize=chunksize, dtype=object):
    print "*",
    data = pd.DataFrame()
    data = data.append(chunk, ignore_index=True)
    data['Id'] = data['Id'].astype(int)
    
    data_ids_set = set(data['Id']) # convert Ids to int
    train_ids = train_idx_set.intersection(data_ids_set)
    test_ids = test_idx_set.intersection(data_ids_set)

    train_idx = [True if data.loc[idx, 'Id'] in train_ids else False for idx in data.index]
    test_idx = [True if data.loc[idx, 'Id'] in test_ids else False for idx in data.index]
    
    train_df = data[train_idx]
    test_df = data[test_idx]
    
#     train_idx_set = train_idx_set.difference(train_ids)
#     test_idx_set = test_idx_set.difference(test_ids)
    
    train_df.to_csv(trainsplit_numeric_fname, mode="a", header=use_header, index=False)
    test_df.to_csv(validsplit_numeric_fname, mode="a", header=use_header, index=False)
    use_header = False
    t += train_df.shape[0]
    v += test_df.shape[0]
    
print t, v, t+v


* * * * * * * * * * * * 591874 591873 1183747

In [20]:
train_df = data[train_idx]

In [30]:
train_df.shape


Out[30]:
(49941, 970)

In [32]:
test_df.shape


Out[32]:
(50059, 970)

In [38]:
df = pd.read_csv(trainsplit_numeric_fname)

In [39]:
del df