notebook.community

Edit and run



In [86]:

    
import numpy as np
import pandas as pd
import os
import sys
from io import StringIO


from include.dataset_fnames import INPUTDIR, train_numeric_fname



In [10]:

    
# CSV that is read below without a header row (column names are supplied via `names=`).
pozitive_samples = os.path.join(INPUTDIR, 'train_numeric_headless_1.csv')
# Take the column names from the header of `train_numeric_fname`.
colnames = pd.read_csv(train_numeric_fname, nrows=2).columns.tolist()
# Only the 'Id' column is loaded into the frame.
df = pd.read_csv(pozitive_samples, usecols=['Id'], names=colnames)



In [11]:

    
# Preview the first rows of the Id-only frame read from `pozitive_samples`.
df.head()



In [12]:

    
# (rows, columns) of the frame read from `pozitive_samples`.
df.shape









    Out[12]:





(6879, 1)



In [13]:

    
from sklearn.utils import resample



In [42]:

    
# Number of rows in `df`. A cell displays only its final expression, so this
# is the single value the cell is meant to show.
df.shape[0]









    Out[42]:





6879



In [33]:

    
from collections import Counter



In [39]:

    
# Bootstrap the ids (resample() draws with replacement and keeps the input
# length by default), then show the share of distinct ids in the sample.
boosted = resample(df['Id'].values)
counts = Counter(boosted)
float(len(counts)) / len(boosted)









    Out[39]:





0.6265445558947521



In [40]:

    
from sklearn.model_selection import KFold
# Number of folds passed to KFold(n_splits=...) by the cross-validation cells.
cv_splits = 3



In [59]:

    
# For every cross-validation fold, bootstrap the ids of the training part and
# report the sample size and the share of distinct ids it contains.
kf = KFold(n_splits=cv_splits)
for train_indices, test_indices in kf.split(df):
    train_set = df.iloc[train_indices]
    test_set = df.iloc[test_indices]

    # resample() draws with replacement and keeps the input length by default.
    # No random_state is passed, so the printed fractions differ between runs.
    boosted = resample(train_set['Id'].values)
    # Single-argument print(...) gives the same output whether `print` is a
    # statement (Python 2) or a function (Python 3).
    print(len(boosted))
    counts = Counter(boosted)
    print(float(len(counts)) / len(boosted))









    



4586
0.629524640209
4586
0.629960750109
4586
0.626035761012



In [73]:

    
%%time
# From here on `df` holds the ids read from this second headerless file,
# replacing the frame that was loaded from 'train_numeric_headless_1.csv'.
negative_samples = os.path.join(INPUTDIR, 'train_numeric_headless_0.csv')
# Column names come from the header of `train_numeric_fname`.
colnames = pd.read_csv(train_numeric_fname, nrows=2).columns.tolist()
df = pd.read_csv(negative_samples, usecols=['Id'], names=colnames)









    



Wall time: 13.1 s



In [74]:

    
# Summary (index range, dtypes, memory usage) of the frame read from `negative_samples`.
df.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1176868 entries, 0 to 1176867
Data columns (total 1 columns):
Id    1176868 non-null int64
dtypes: int64(1)
memory usage: 9.0 MB



In [75]:

    
# Preview the first rows of the frame read from `negative_samples`.
df.head()



In [102]:

    
%%time
def _read_raw_rows(path, wanted_ids):
    """Return the lines of the CSV at `path` whose leading field is a wanted id.

    The file is scanned once, top to bottom; every line whose first
    comma-separated field (parsed as an int) is in `wanted_ids` is kept, in
    file order. Scanning stops as soon as every wanted id has been seen.

    Parameters
    ----------
    path : str
        CSV file whose first field on each line is an integer id.
    wanted_ids : iterable of int
        Ids to extract. Membership is tested through a set, so the result does
        not depend on the file (or the ids) being sorted.

    Returns
    -------
    text
        The matching lines concatenated into one unicode string (empty if
        nothing matched), ready to be wrapped in io.StringIO.
    """
    pending = set(int(i) for i in wanted_ids)
    kept = []
    if pending:
        # `with` closes the handle even when the scan stops early.
        with open(path, "r") as handle:
            for line in handle:
                row_id = int(line.split(',', 1)[0])
                if row_id in pending:
                    kept.append(line)
                    pending.discard(row_id)
                    if not pending:
                        break
    # Join once instead of growing a string line by line.
    text = "".join(kept)
    if isinstance(text, bytes):
        # Python 2: file lines are byte strings, but io.StringIO needs unicode.
        text = text.decode("ascii")
    return text


kf = KFold(n_splits=cv_splits)
for train_indices, test_indices in kf.split(df):
    # Shuffle the fold's training rows (no seed is set, so the order differs
    # between runs) before cutting them into chunks.
    train_set = df.iloc[train_indices].sample(frac=1.0)
    test_set = df.iloc[test_indices]
    print("%s %s" % (train_set.shape, test_set.shape))

    # 171 is the quotient 784578 / 4586 evaluated in cell In [66].
    kf2 = KFold(n_splits=171)
    # Only the "test" part of each inner split is used: it is one contiguous
    # chunk of the shuffled training rows.
    for _, test_indices2 in kf2.split(train_set):
        train_set2 = train_set.iloc[test_indices2]
        print(train_set2.shape)

        sti2 = sorted(train_set2['Id'].values)
        print("Len sti2 %d" % len(sti2))

        raw = _read_raw_rows(negative_samples, sti2)

    # NOTE(review): `raw` is overwritten by every chunk, so `df1` only holds
    # the rows of the last chunk; move the parse into the inner loop if each
    # chunk is meant to be materialised.
    df1 = pd.read_csv(StringIO(raw), index_col=0, names=colnames)
    print(df1.shape)
    # Stop after the first outer fold.
    break









    



(784578, 1) (392290, 1)
(4589, 1)
Len sti2 4589
(4589, 969)
(784579, 1) (392289, 1)
(4589, 1)
Len sti2 4589
(4589, 969)
(784579, 1) (392289, 1)
(4589, 1)
Len sti2 4589
(4589, 969)
Wall time: 30.5 s



In [66]:

    
# How many chunks of 4586 rows fit into 784578 rows. 784578 is the train_set
# row count printed by the negative-sample fold loop; 4586 is the bootstrap
# size printed by the positive-sample fold loop. `//` keeps the integer
# (floor) result under both Python 2 and Python 3.
n_negative_chunks = 784578 // 4586
n_negative_chunks









    Out[66]:





171



In [83]:

    
# IPython help lookup for the builtin `open` (interactive leftover; can be dropped before sharing).
open?



In [ ]:

	Id
0	1053
1	1250
2	1350
3	1793
4	2347