In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
from include.feature_lists import numeric_features
from include.dataset_fnames import generate_station_data_fname, generate_data_fname, generate_split_data_fname
In [3]:
fname = generate_data_fname(sample_type='train', data_type='numeric')
print(fname)
In [4]:
%time response_df = pd.read_csv(fname, usecols=['Id', 'Response'])
In [5]:
response_df.head()
Out[5]:
In [16]:
from sklearn.model_selection import StratifiedKFold
In [37]:
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=0)  # random_state only takes effect when shuffle=True
In [38]:
skf.get_n_splits(response_df['Id'].values, response_df['Response'].values)
Out[38]:
In [39]:
for train_index, test_index in skf.split(response_df['Id'].values, response_df['Response'].values):
    print("TRAIN:", train_index, "TEST:", test_index)
    # X_train, X_test = response_df.iloc[train_index], response_df.iloc[test_index]
    # print(X_train['Response'].value_counts())
    # print(X_test['Response'].value_counts())
In [40]:
from sklearn.model_selection import StratifiedShuffleSplit
In [48]:
sss = StratifiedShuffleSplit(n_splits=2, test_size=0.5, random_state=35535)
In [49]:
sss.get_n_splits(response_df['Id'].values, response_df['Response'].values)
Out[49]:
In [51]:
for train_index, test_index in sss.split(response_df['Id'].values, response_df['Response'].values):
    # print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = response_df.iloc[train_index], response_df.iloc[test_index]  # split() yields positional indices
    print(X_train['Response'].value_counts())
    print(X_test['Response'].value_counts())
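As a quick check that the stratified shuffle split actually preserves the class balance, the positive rate of each half can be compared with that of the full frame (a minimal sketch, assuming response_df and sss from the cells above):
In [ ]:
# Compare the positive rate of each half against the full frame;
# split() yields positional indices, hence .iloc.
overall_rate = response_df['Response'].mean()
for train_index, test_index in sss.split(response_df['Id'].values, response_df['Response'].values):
    print(overall_rate,
          response_df.iloc[train_index]['Response'].mean(),
          response_df.iloc[test_index]['Response'].mean())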
In [52]:
from sklearn.model_selection import train_test_split
In [59]:
X_train, X_test, y_train, y_test = train_test_split(response_df['Id'].values, response_df['Response'].values,
test_size=0.5, random_state=35535)
In [60]:
X_train
Out[60]:
In [61]:
y_train.sum()
Out[61]:
In [62]:
y_test.sum()
Out[62]:
In [66]:
train = response_df.sample(frac=0.5, random_state=0)
test = response_df.drop(train.index)
In [67]:
print(train['Response'].value_counts())
print(test['Response'].value_counts())
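A plain random 50/50 sample only preserves the class ratio in expectation, which is why the next cells split the positive and negative rows separately. A minimal sketch (using train and test from the cell above) shows how far the positive rate drifts between the two halves:
In [ ]:
# Positive rate of the full frame vs. the two naive random halves;
# with a response this imbalanced the halves can drift slightly.
print(response_df['Response'].mean(), train['Response'].mean(), test['Response'].mean())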
In [6]:
n_response_df = response_df[response_df['Response'] == 0]
p_response_df = response_df[response_df['Response'] == 1]
In [7]:
n_response_df_train = n_response_df.sample(frac=0.5, random_state=0)
n_response_df_test = n_response_df.drop(n_response_df_train.index)
In [8]:
print(n_response_df_train.shape)
print(n_response_df_test.shape)
In [9]:
p_response_df_train = p_response_df.sample(frac=0.5, random_state=0)
p_response_df_test = p_response_df.drop(p_response_df_train.index)
In [10]:
print(p_response_df_train.shape)
print(p_response_df_test.shape)
In [11]:
# concatenate both classes and shuffle (DataFrame.append was removed in pandas 2.0)
response_df_train = pd.concat([n_response_df_train, p_response_df_train]).sample(frac=1)
response_df_test = pd.concat([n_response_df_test, p_response_df_test]).sample(frac=1)
In [12]:
print(response_df_train.shape)
print(response_df_test.shape)
In [13]:
response_df_train['Response'].value_counts()
Out[13]:
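The same check on the held-out half (assuming response_df_test from the cells above) should show counts that mirror the training half to within one row per class:
In [ ]:
response_df_test['Response'].value_counts()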
In [33]:
train_idx = response_df_train['Id'].values
print(train_idx)
train_idx_set = set(train_idx)
In [34]:
test_idx = response_df_test['Id'].values
print(test_idx)
test_idx_set = set(test_idx)
In [35]:
from include.dataset_fnames import trainsplit_numeric_fname, validsplit_numeric_fname
In [37]:
chunksize = 100000
use_header = True
t = 0
v = 0
for chunk in pd.read_csv(fname, iterator=True, chunksize=chunksize, dtype=object):
    print("*", end=" ")
    data = chunk.reset_index(drop=True)
    data['Id'] = data['Id'].astype(int)  # Ids were read as strings because of dtype=object
    data_ids_set = set(data['Id'])
    train_ids = train_idx_set.intersection(data_ids_set)
    test_ids = test_idx_set.intersection(data_ids_set)
    # boolean masks: which rows of this chunk belong to each split
    train_mask = data['Id'].isin(train_ids)
    test_mask = data['Id'].isin(test_ids)
    train_df = data[train_mask]
    test_df = data[test_mask]
    # train_idx_set = train_idx_set.difference(train_ids)
    # test_idx_set = test_idx_set.difference(test_ids)
    train_df.to_csv(trainsplit_numeric_fname, mode="a", header=use_header, index=False)
    test_df.to_csv(validsplit_numeric_fname, mode="a", header=use_header, index=False)
    use_header = False
    t += train_df.shape[0]
    v += test_df.shape[0]
print(t, v, t + v)
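A sanity check on the written files (a minimal sketch, assuming the Id column of each split file fits in memory): the Ids written to each file should match the planned split exactly, and the two sets should be disjoint.
In [ ]:
# Read back only the Id column from each split file and compare it with the
# planned split; the intersection of the two written sets should be empty.
train_ids_written = set(pd.read_csv(trainsplit_numeric_fname, usecols=['Id'])['Id'].astype(int))
test_ids_written = set(pd.read_csv(validsplit_numeric_fname, usecols=['Id'])['Id'].astype(int))
print(train_ids_written == train_idx_set, test_ids_written == test_idx_set)
print(len(train_ids_written & test_ids_written))  # expected: 0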
In [20]:
train_df = data[train_mask]  # re-filters only the last chunk processed above
In [30]:
train_df.shape
Out[30]:
In [32]:
test_df.shape
Out[32]:
In [38]:
df = pd.read_csv(trainsplit_numeric_fname)
In [39]:
del df