In [86]:
import numpy as np
import pandas as pd
import os
import sys
from io import StringIO


from include.dataset_fnames import INPUTDIR, train_numeric_fname

In [10]:
# Column names come from the full training file; reading two rows is enough
# to obtain its header.
colnames = pd.read_csv(train_numeric_fname, nrows=2).columns.tolist()

# Presumably the positive-class rows without a header line (going by the file
# and variable names) -- hence the explicit `names=` below.
pozitive_samples = os.path.join(INPUTDIR, 'train_numeric_headless_1.csv')

# Only the 'Id' column is loaded; every other column is skipped.
df = pd.read_csv(pozitive_samples, usecols=['Id'], names=colnames)

In [11]:
# Preview the first rows of `df` (only the 'Id' column was loaded).
df.head()


Out[11]:
Id
0 1053
1 1250
2 1350
3 1793
4 2347

In [12]:
# (rows, columns) of the Id-only frame.
df.shape


Out[12]:
(6879, 1)

In [13]:
from sklearn.utils import resample

In [42]:
# Number of rows in `df`; this is the value the cell displays.
df.shape[0]


Out[42]:
6879

In [33]:
from collections import Counter

In [39]:
# Draw one bootstrap sample of the ids (sklearn's `resample` with its default
# arguments) and tally how often each id was drawn.
boosted = resample(df['Id'].values)
counts = Counter(boosted)

# Share of distinct ids among the drawn ones. For a with-replacement sample of
# the same size as the input this is expected to be about 1 - 1/e ~= 0.632.
len(counts) / float(len(boosted))


Out[39]:
0.6265445558947521

In [40]:
from sklearn.model_selection import KFold
# Number of cross-validation folds; passed to KFold(n_splits=...) in the cells below.
cv_splits = 3

In [59]:
# For every cross-validation fold: bootstrap the ids of the training part and
# print (1) the size of the bootstrap sample and (2) the fraction of distinct
# ids it contains.
kf = KFold(n_splits=cv_splits)
for train_indices, test_indices in kf.split(range(df.shape[0])):
    train_set = df.iloc[train_indices]
    test_set = df.iloc[test_indices]

    boosted = resample(train_set['Id'].values)
    counts = Counter(boosted)

    # Single-argument print(...) produces the same output on Python 2 and is
    # also valid Python 3, unlike the bare `print x` statement.
    print(len(boosted))
    print(1.0 * len(counts) / len(boosted))


4586
0.629524640209
4586
0.629960750109
4586
0.626035761012

In [73]:
%%time
# Column names are taken from the header of the full training file.
colnames = pd.read_csv(train_numeric_fname, nrows=2).columns.tolist()

# Presumably the negative-class rows without a header line (going by the file
# and variable names).
negative_samples = os.path.join(INPUTDIR, 'train_numeric_headless_0.csv')

# NOTE: this rebinds `df`; from here on it holds the ids read from
# 'train_numeric_headless_0.csv', not those loaded earlier in the notebook.
df = pd.read_csv(negative_samples, usecols=['Id'], names=colnames)


Wall time: 13.1 s

In [74]:
# Summary of the Id-only frame: row count, dtype and memory footprint.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1176868 entries, 0 to 1176867
Data columns (total 1 columns):
Id    1176868 non-null int64
dtypes: int64(1)
memory usage: 9.0 MB

In [75]:
# Preview the first ids read from 'train_numeric_headless_0.csv'.
df.head()


Out[75]:
Id
0 4
1 6
2 7
3 9
4 11

In [102]:
%%time
def _collect_rows_by_id(csv_path, wanted_ids):
    """Return the raw CSV text of the rows whose first field is in ``wanted_ids``.

    The file is scanned once, line by line. Each requested id is matched at
    most once (its first occurrence), matching rows are returned in file
    order, and the scan stops as soon as every requested id has been found.
    Membership is tested against a set, so the result does not depend on the
    file being sorted by id.

    Parameters
    ----------
    csv_path : str
        Path of a comma-separated file whose first field is an integer id.
    wanted_ids : iterable of int
        Ids of the rows to extract.

    Returns
    -------
    unicode text
        The selected lines concatenated verbatim (empty if nothing matched).

    Raises
    ------
    ValueError
        If the first field of a scanned line is not an integer.
    """
    # io.open returns text (unicode) on both Python 2 and Python 3, which is
    # what io.StringIO requires downstream.
    from io import open as text_open

    pending = set(wanted_ids)
    if not pending:
        return u""

    picked = []
    # `with` guarantees the handle is closed, also on the early exit below.
    # newline="" keeps the original line endings untouched.
    with text_open(csv_path, "r", encoding="utf-8", newline="") as handle:
        for line in handle:
            row_id = int(line.split(u",", 1)[0])
            if row_id in pending:
                picked.append(line)
                pending.remove(row_id)
                if not pending:
                    break
    return u"".join(picked)


# 784578 // 4586 == 171: with this many chunks, each chunk of a ~784578-row
# training fold has roughly 4586 rows (the chunk shapes printed by this cell
# are (4589, 1)).
n_chunks = 171

kf = KFold(n_splits=cv_splits)
for train_indices, test_indices in kf.split(range(df.shape[0])):
    # Shuffle the training ids so the contiguous chunks below are random
    # subsets. The shuffle is unseeded, so chunk membership differs per run.
    train_set = df.iloc[train_indices].sample(frac=1.0)
    test_set = df.iloc[test_indices]
    print("%s %s" % (train_set.shape, test_set.shape))

    # Only the "test" side of the inner split is used: it partitions the
    # shuffled training fold into n_chunks disjoint chunks.
    kf2 = KFold(n_splits=n_chunks)
    for _, test_indices2 in kf2.split(range(train_set.shape[0])):
        train_set2 = train_set.iloc[test_indices2]
        print(train_set2.shape)

        sti2 = sorted(train_set2['Id'].values)
        print("Len sti2 %d" % len(sti2))

        raw = _collect_rows_by_id(negative_samples, sti2)

    # NOTE(review): `raw` is overwritten for every chunk and parsed only here,
    # after the inner loop, so `df1` holds just the last chunk; each earlier
    # chunk costs a full file scan whose result is discarded. Kept as-is --
    # confirm whether per-chunk parsing was intended.
    df1 = pd.read_csv(StringIO(raw), index_col=0, names=colnames)
    print(df1.shape)
    # Stop after the first outer fold.
    break


(784578, 1) (392290, 1)
(4589, 1)
Len sti2 4589
(4589, 969)
(784579, 1) (392289, 1)
(4589, 1)
Len sti2 4589
(4589, 969)
(784579, 1) (392289, 1)
(4589, 1)
Len sti2 4589
(4589, 969)
Wall time: 30.5 s

In [66]:
# How many chunks of 4586 rows (the per-fold bootstrap size printed earlier)
# fit into a 784578-row training fold. Floor division keeps the result an
# integer on both Python 2 and Python 3.
784578 // 4586


Out[66]:
171

In [83]:
# IPython help lookup for the built-in `open` (leftover from interactive use;
# it produces no result the rest of the notebook relies on).
open?

In [ ]: