In [86]:
import numpy as np
import pandas as pd
import os
import sys
from io import StringIO
from include.dataset_fnames import INPUTDIR, train_numeric_fname
In [10]:
# The '_1' CSV is read without a header row, so column names are borrowed from
# the first rows of the full training file (presumably '_1' holds the
# positive-class rows -- confirm against the script that produced it).
pozitive_samples = os.path.join(INPUTDIR, 'train_numeric_headless_1.csv')
colnames = pd.read_csv(train_numeric_fname, nrows=2).columns.tolist()
# Only the 'Id' column is materialised.
df = pd.read_csv(pozitive_samples, usecols=['Id'], names=colnames)
In [11]:
# Preview the first rows of the Id-only frame loaded from the '_1' CSV.
df.head()
Out[11]:
In [12]:
# (rows, columns) of the Id-only frame; a single column is expected because of usecols=['Id'].
df.shape
Out[12]:
In [13]:
from sklearn.utils import resample
In [42]:
# Only the last expression of a cell is displayed, so the Id array on the first
# line is evaluated and discarded; the cell shows the row count.
df['Id'].values
df.shape[0]
Out[42]:
In [33]:
from collections import Counter
In [39]:
# Draw one bootstrap sample of the Ids (resample's defaults: with replacement,
# same size as the input) and display the fraction of distinct Ids it contains.
# A fixed seed keeps the displayed ratio reproducible on Restart & Run All.
boosted = resample(df['Id'].values, random_state=0)
counts = Counter(boosted)
float(len(counts)) / len(boosted)
Out[39]:
In [40]:
from sklearn.model_selection import KFold
# Number of folds passed to KFold(n_splits=...) in the cross-validation cells.
cv_splits = 3
In [59]:
# For every cross-validation fold, bootstrap the training Ids and print
# (1) the size of the bootstrap sample and (2) the fraction of distinct Ids in it.
kf = KFold(n_splits=cv_splits)
for train_indices, test_indices in kf.split(range(df.shape[0])):
    train_set = df.iloc[train_indices]
    test_set = df.iloc[test_indices]
    # Fixed seed so the printed numbers are reproducible across kernel restarts.
    boosted = resample(train_set['Id'].values, random_state=0)
    # Parenthesised single-argument print behaves identically on Python 2 and 3.
    print(len(boosted))
    counts = Counter(boosted)
    print(1.0 * len(counts) / len(boosted))
In [73]:
%%time
# Column names come from the first rows of the full training file; the '_0' CSV
# is read without a header row and only its 'Id' column is kept.
# NOTE: this rebinds `colnames` and `df`, replacing the frame loaded from the '_1' CSV.
negative_samples = os.path.join(INPUTDIR, 'train_numeric_headless_0.csv')
colnames = pd.read_csv(train_numeric_fname, nrows=2).columns.tolist()
df = pd.read_csv(negative_samples, usecols=['Id'], names=colnames)
In [74]:
# Summary (dtypes, non-null counts, memory usage) of the Id-only frame read from the '_0' CSV.
df.info()
In [75]:
# Preview the first rows of the Id-only frame read from the '_0' CSV.
df.head()
Out[75]:
In [102]:
%%time
def _read_rows_by_id(csv_path, wanted_ids, column_names):
    """Load the rows of a header-less CSV whose leading Id field is in ``wanted_ids``.

    The file is scanned once, line by line, and scanning stops as soon as every
    requested Id has been found. Rows are returned in file order. If an Id occurs
    more than once in the file, only its first occurrence is kept.

    Parameters
    ----------
    csv_path : str
        Path of the header-less CSV; the first comma-separated field of every
        line must be an integer Id.
    wanted_ids : iterable of int
        Ids of the rows to extract.
    column_names : list of str
        Names for all CSV columns; the first one becomes the index.

    Returns
    -------
    pandas.DataFrame
        The matching rows indexed by the first column (empty if nothing matched).

    Raises
    ------
    ValueError
        If a line does not start with an integer field.
    """
    remaining = set(wanted_ids)
    picked = []
    if remaining:
        # Binary read + one decode at the end works on both Python 2 and 3;
        # `with` guarantees the handle is closed even when we stop early.
        with open(csv_path, "rb") as src:
            for line in src:
                row_id = int(line.split(b",", 1)[0])
                # Set membership is O(1) and, unlike comparing against the head
                # of a sorted list, does not require the file to be sorted by Id.
                if row_id in remaining:
                    picked.append(line)
                    remaining.discard(row_id)
                    if not remaining:
                        break
    if not picked:
        return pd.DataFrame(columns=column_names).set_index(column_names[0])
    text = b"".join(picked).decode("utf-8")
    return pd.read_csv(StringIO(text), index_col=0, names=column_names)


# Number of chunks each training fold is cut into (hard-coded as 171 in the
# original cell; 784578 // 4586 == 171).
# NOTE(review): presumably sized so a chunk is about as large as the other
# class's training fold -- confirm.
n_chunks = 171

kf = KFold(n_splits=cv_splits)
for train_indices, test_indices in kf.split(range(df.shape[0])):
    # Shuffle the training fold; the fixed seed makes the chunk contents reproducible.
    train_set = df.iloc[train_indices].sample(frac=1.0, random_state=0)
    test_set = df.iloc[test_indices]
    print("%s %s" % (train_set.shape, test_set.shape))
    kf2 = KFold(n_splits=n_chunks)
    # Only the held-out part of each inner split is used: it is one chunk.
    for _, test_indices2 in kf2.split(range(train_set.shape[0])):
        train_set2 = train_set.iloc[test_indices2]
        print(train_set2.shape)
        sti2 = sorted(train_set2['Id'].values)
        print("Len sti2 %d" % len(sti2))
        # Pull the full feature rows for this chunk's Ids out of the big CSV.
        df1 = _read_rows_by_id(negative_samples, sti2, colnames)
        print(df1.shape)
        # NOTE(review): indentation was lost in the saved source; this break is
        # assumed to stop after the first chunk of each fold -- confirm.
        break
In [66]:
# NOTE(review): scratch cell. 784578 / 4586 evaluates to 171 under Python 2
# integer division (about 171.08 under true division), which matches the
# n_splits=171 used by the inner KFold of the chunking cell. The loop header
# on the next line looks like a leftover from editing -- confirm and remove.
for train_indices, test_indices in kf.split(range(df.shape[0])):
784578 / 4586
Out[66]:
In [83]:
# Leftover IPython help lookup for the builtin open(); it has no effect on the
# analysis and can be deleted from the finished notebook.
open?
In [ ]: