In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from include.feature_lists import numeric_features
from include.dataset_fnames import generate_station_data_fname, generate_data_fname, generate_split_data_fname

In [3]:
# Resolve the path of the raw numeric training CSV from the project helper.
fname = generate_data_fname(sample_type='train', data_type='numeric')
print(fname)


d:/Kaggle_ws/Bosch/input/train_numeric.csv

In [4]:
%time response_df = pd.read_csv(fname, usecols=['Id', 'Response'])


Wall time: 16 s

In [5]:
response_df.head()


Out[5]:
Id Response
0 4 0
1 6 0
2 7 0
3 9 0
4 11 0

StratifiedKFold in use


In [16]:
from sklearn.model_selection import StratifiedKFold

In [37]:
skf = StratifiedKFold(n_splits=2, random_state=0) # does random_state works?

In [38]:
skf.get_n_splits(response_df['Id'].values, response_df['Response'].values)


Out[38]:
2

In [39]:
# Show the positional index arrays produced by each stratified fold.
# (The parenthesized print renders as a tuple on this Python 2 kernel,
# matching the recorded output below.)
for fold_train, fold_test in skf.split(response_df['Id'].values,
                                       response_df['Response'].values):
    print("TRAIN:", fold_train, "TEST:", fold_test)


('TRAIN:', array([ 587053,  587126,  587626, ..., 1183744, 1183745, 1183746], dtype=int64), 'TEST:', array([     0,      1,      2, ..., 591893, 591894, 591895], dtype=int64))
('TRAIN:', array([     0,      1,      2, ..., 591893, 591894, 591895], dtype=int64), 'TEST:', array([ 587053,  587126,  587626, ..., 1183744, 1183745, 1183746], dtype=int64))

StratifiedShuffleSplit in use


In [40]:
from sklearn.model_selection import StratifiedShuffleSplit

In [48]:
sss = StratifiedShuffleSplit(n_splits=2, test_size=0.5, random_state=35535)

In [49]:
sss.get_n_splits(response_df['Id'].values, response_df['Response'].values)


Out[49]:
2

In [51]:
# Check class balance of each stratified shuffled split.
for train_index, test_index in sss.split(response_df['Id'].values, response_df['Response'].values):
    # split() yields POSITIONAL indices, so iloc is the correct accessor.
    # loc only happened to work here because response_df carries a default
    # RangeIndex after read_csv.
    X_train, X_test = response_df.iloc[train_index], response_df.iloc[test_index]

    print(X_train['Response'].value_counts())
    print(X_test['Response'].value_counts())


0    588434
1      3439
Name: Response, dtype: int64
0    588434
1      3440
Name: Response, dtype: int64
0    588434
1      3439
Name: Response, dtype: int64
0    588434
1      3440
Name: Response, dtype: int64

train_test_split


In [52]:
from sklearn.model_selection import train_test_split

In [59]:
# NOTE(review): no stratify= argument, so this split is NOT stratified --
# the positive counts below come out 3455 vs 3424. Pass
# stratify=response_df['Response'].values to balance the classes.
X_train, X_test, y_train, y_test = train_test_split(response_df['Id'].values, response_df['Response'].values, 
                                                    test_size=0.5, random_state=35535)

In [60]:
X_train


Out[60]:
array([ 475957, 2052296, 1960599, ...,  441904, 1751259,  310403], dtype=int64)

In [61]:
y_train.sum()


Out[61]:
3455

In [62]:
y_test.sum()


Out[62]:
3424

Pandas sample


In [66]:
# Random half-split with plain pandas: sample half the rows, then take the
# complement by dropping the sampled index labels. Not stratified -- the
# class counts below differ between halves (3404 vs 3475 positives).
train=response_df.sample(frac=0.5,random_state=0)
test=response_df.drop(train.index)

In [67]:
# Class balance of each half (parenthesized print works on both Py2 and Py3
# for a single argument).
print(train['Response'].value_counts())
print(test['Response'].value_counts())


0    588470
1      3404
Name: Response, dtype: int64
0    588398
1      3475
Name: Response, dtype: int64

My way


In [6]:
# Manual stratification: separate the classes so each can be halved on its own.
n_response_df = response_df[response_df['Response'] == 0]  # negatives
p_response_df = response_df[response_df['Response'] == 1]  # positives

In [7]:
# Halve the negative class: seeded random half for train, complement for test.
n_response_df_train = n_response_df.sample(frac=0.5,random_state=0)
n_response_df_test = n_response_df.drop(n_response_df_train.index)

In [8]:
# Both negative halves should hold 588434 rows each.
print(n_response_df_train.shape)
print(n_response_df_test.shape)


(588434, 2)
(588434, 2)

In [9]:
# Halve the positive class the same way (off-by-one between halves is
# expected: 6879 positives split as 3440/3439).
p_response_df_train = p_response_df.sample(frac=0.5,random_state=0)
p_response_df_test = p_response_df.drop(p_response_df_train.index)

In [10]:
# Positive halves: 3440 train / 3439 test rows.
print(p_response_df_train.shape)
print(p_response_df_test.shape)


(3440, 2)
(3439, 2)

In [11]:
# Recombine the per-class halves and shuffle the rows (sample(frac=1)).
# DataFrame.append was deprecated and removed in pandas 2.0 -- pd.concat is
# the forward-compatible equivalent and works on old pandas too. The shuffle
# is seeded so the notebook reproduces the same row order on re-run.
response_df_train = pd.concat([n_response_df_train, p_response_df_train]).sample(frac=1, random_state=0)
response_df_test = pd.concat([n_response_df_test, p_response_df_test]).sample(frac=1, random_state=0)

In [12]:
# Combined splits: 591874 train / 591873 test rows, summing to the full set.
print(response_df_train.shape)
print(response_df_test.shape)


(591874, 2)
(591873, 2)

In [13]:
response_df_train['Response'].value_counts()


Out[13]:
0    588434
1      3440
Name: Response, dtype: int64

In [33]:
# Materialize the train Ids; a set gives O(1) membership tests per chunk below.
train_idx = response_df_train['Id'].values
print(train_idx)
train_idx_set = set(train_idx)


[  70787 2060298  420950 ...,    6601  822169   66110]

In [34]:
# Same for the test Ids: array for inspection, set for fast lookups.
test_idx = response_df_test['Id'].values
print(test_idx)
test_idx_set = set(test_idx)


[ 447284  117436 2130018 ...,  530448  834291 1069779]

In [35]:
from include.dataset_fnames import trainsplit_numeric_fname, validsplit_numeric_fname

Mixing up things


In [37]:
chunksize = 100000

use_header = True

t = 0
v = 0
for chunk in pd.read_csv(fname, iterator=True, chunksize=chunksize, dtype=object):
    print "*",
    data = pd.DataFrame()
    data = data.append(chunk, ignore_index=True)
    data['Id'] = data['Id'].astype(int)
    
    data_ids_set = set(data['Id']) # convert Ids to int
    train_ids = train_idx_set.intersection(data_ids_set)
    test_ids = test_idx_set.intersection(data_ids_set)

    train_idx = [True if data.loc[idx, 'Id'] in train_ids else False for idx in data.index]
    test_idx = [True if data.loc[idx, 'Id'] in test_ids else False for idx in data.index]
    
    train_df = data[train_idx]
    test_df = data[test_idx]
    
#     train_idx_set = train_idx_set.difference(train_ids)
#     test_idx_set = test_idx_set.difference(test_ids)
    
    train_df.to_csv(trainsplit_numeric_fname, mode="a", header=use_header, index=False)
    test_df.to_csv(validsplit_numeric_fname, mode="a", header=use_header, index=False)
    use_header = False
    t += train_df.shape[0]
    v += test_df.shape[0]
    
print t, v, t+v


* * * * * * * * * * * * 591874 591873 1183747

In [20]:
train_df = data[train_idx]

In [30]:
train_df.shape


Out[30]:
(49941, 970)

In [32]:
test_df.shape


Out[32]:
(50059, 970)

In [38]:
df = pd.read_csv(trainsplit_numeric_fname)

In [39]:
del df