In [1]:
import numpy as np
import pandas
import scipy, scipy.spatial
import sklearn
import sys
from matplotlib import pyplot as plt
%matplotlib inline
In [2]:
# Load the training labels: one integer class label per row, space-separated.
# NOTE(review): hard-coded home-relative path — breaks on other machines;
# consider a DATA_DIR config constant.
y = pandas.read_table("~/Downloads/data/ml/label_train.txt", sep=" ", dtype='int', header=None)
# Number of feature columns in the companion data files.
ndim= 900
y.head()
Out[2]:
In [3]:
# Label-range constants for the binary split used below:
# classes [ymin, ysplit) form the "negative" group, [ysplit, ymax] the
# "positive" group.
ymin = 1
ysplit = 131
ymax = 156
In [4]:
# Class distribution: unique labels and their per-class sample counts.
np.unique(y[0], return_counts=True)
Out[4]:
In [20]:
# Total sample counts on each side of the planned binary split.
yuniq, ycount = np.unique(y[0], return_counts=True)
neg_mask = np.in1d(yuniq, range(ymin, ysplit))
pos_mask = np.in1d(yuniq, range(ysplit, ymax + 1))
print(np.sum(ycount[neg_mask]))
print(np.sum(ycount[pos_mask]))
In [5]:
import pickle
# Per-class sufficient statistics, presumably cstat[label] = (sum_x, sum_x2)
# per feature — precomputed elsewhere; verify against the producer script.
# NOTE(review): pickle.load executes arbitrary code if the file is
# untrusted — confirm the provenance of sum_features.dat.
cstat = pickle.load(open( "../data/sum_features.dat", "rb" ) )
In [6]:
### Calculate Standardized Mean Difference Between Classes
def calStandMeanDiff(y, cstat, yneg, ypos):
    """Standardized difference of per-feature means between two class groups.

    Parameters
    ----------
    y : array-like of int
        Class label of every training sample (used only to count group sizes).
    cstat : dict
        Per-class sufficient statistics: cstat[label] = (sum_x, sum_x_squared),
        each a 1-D float array over the feature dimensions.
    yneg, ypos : sequences of int
        Class labels forming the "negative" and "positive" groups.

    Returns
    -------
    rdiff : ndarray
        (mean_neg - mean_pos) / pooled_std for every feature.  Features with
        zero pooled variance yield inf/nan; callers threshold the result, so
        such entries are effectively ignored.
    """
    # Infer feature dimensionality from the stored statistics instead of
    # relying on the module-level `ndim` global (backward compatible).
    nfeat = np.asarray(cstat[yneg[0]][0]).shape[0]

    n1 = np.sum(np.in1d(y, yneg))   # samples in the negative group
    n2 = np.sum(np.in1d(y, ypos))   # samples in the positive group
    sys.stderr.write("Number of samples in NegClass: %d and PosClass: %d \n"%(n1, n2))

    # Accumulate sum(x) and sum(x^2) over the negative-group classes.
    sx = np.zeros(shape=nfeat, dtype=float)
    ssx = np.zeros(shape=nfeat, dtype=float)
    for yi in yneg:
        sx += cstat[yi][0]
        ssx += cstat[yi][1]
    r1_mean = sx / float(n1)
    tot_sx = sx.copy()
    tot_ssx = ssx.copy()

    # Same accumulation for the positive group.
    sx = np.zeros(shape=nfeat, dtype=float)
    ssx = np.zeros(shape=nfeat, dtype=float)
    for yi in ypos:
        sx += cstat[yi][0]
        ssx += cstat[yi][1]
    r2_mean = sx / float(n2)
    tot_sx += sx
    tot_ssx += ssx

    # Pooled variance over both groups via Var[X] = E[X^2] - E[X]^2.
    # BUG FIX: the original expanded form dropped the n factor on the
    # mean^2 term and reused the sum-of-squares accumulator where the
    # plain sum belonged, overestimating the variance.
    ntot = float(n1 + n2)
    tot_mean = tot_sx / ntot
    tot_var = tot_ssx / ntot - tot_mean**2

    rdiff = (r1_mean - r2_mean) / np.sqrt(tot_var)
    return (rdiff)
## unit test:
# Smoke-check on the full label set: count features whose standardized mean
# difference exceeds the 0.001 threshold used for feature selection below.
mean_test = calStandMeanDiff(y, cstat, np.arange(ymin,ysplit), np.arange(ysplit, ymax+1))
print(np.sum(mean_test > 0.001))
In [7]:
# Select discriminative features from the standardized mean differences.
rdiff = calStandMeanDiff(y, cstat, np.arange(ymin, ysplit), np.arange(ysplit, ymax + 1))
## Good Features:
# Keep every feature whose standardized mean difference clears the threshold.
goodfeatures = np.where(rdiff > 0.001)[0]
print(goodfeatures)
# Pad with arbitrary leftover features so at least 100 columns survive.
if goodfeatures.shape[0] < 100:
    remaining = np.setdiff1d(np.arange(ndim), goodfeatures, assume_unique=True)
    goodfeatures = np.concatenate((goodfeatures, remaining[:100]))
print(goodfeatures.shape)
goodfeatures
Out[7]:
In [8]:
def readRandomSample(data_fname, y, size, goodfeat=None, acc_miny=None, acc_maxy=None):
    """Read a uniform random sample of rows from a whitespace-separated file.

    Parameters
    ----------
    data_fname : str
        Path to the data file; one sample per line, integer features
        separated by whitespace.
    y : array-like of int
        Label of every row in the file, aligned with file order.
    size : int
        Number of rows to sample (without replacement).
    goodfeat : array-like of int, optional
        Feature (column) indices to keep.  Defaults to all `ndim` columns.
    acc_miny, acc_maxy : int, optional
        Only rows with acc_miny <= label <= acc_maxy are eligible.
        Default to the min/max of `y`.

    Returns
    -------
    (Xsub, ysub) : (float ndarray of shape (size, len(goodfeat)), int ndarray)
        NOTE: columns of Xsub follow the FILE's column order, not the order
        of `goodfeat` (unchanged from the original behavior).
    """
    if goodfeat is None:
        goodfeat = np.arange(ndim)  # module-level feature count
    goodfeat = np.asarray(goodfeat)
    Xsub = np.empty(shape=(size, goodfeat.shape[0]), dtype=float)
    ysub = np.zeros(shape=size, dtype=int)

    if acc_miny is None:
        acc_miny = np.min(y)
    if acc_maxy is None:
        acc_maxy = np.max(y)

    # Rows whose label falls inside the acceptable range.
    acceptable_indx = np.where((y >= acc_miny) & (y <= acc_maxy))[0]
    # >= (was >): sampling exactly all acceptable rows is legitimate.
    assert acceptable_indx.shape[0] >= size, "not enough acceptable rows to sample"
    # Sorted so the file can be consumed in a single forward pass.
    choice_indx = np.sort(np.random.choice(acceptable_indx, size, replace=False))

    # O(1) membership test per column (the original scanned goodfeat with
    # np.any for every token, O(len(goodfeat)) per column).
    feat_lookup = set(int(i) for i in goodfeat)

    with open(data_fname, 'r') as fp:
        n = 0   # current file row index
        nf = 0  # sampled rows filled so far
        for line in fp:
            if nf >= size:
                break  # all requested rows collected; skip the file tail
            if n == choice_indx[nf]:
                tokens = line.strip().split()
                ix = -1
                for i, v in enumerate(tokens):
                    if i in feat_lookup:
                        ix += 1
                        Xsub[nf, ix] = int(v)
                ysub[nf] = y[n]
                nf += 1
            n += 1
    return (Xsub, ysub)
In [23]:
## unit testing readRandomSample()
# Draw 2000 random rows restricted to the selected features and label range.
# NOTE(review): absolute local path — breaks on other machines.
gf_test = goodfeatures
Xsub, ysub = readRandomSample('/home/vahid/Downloads/data/ml/data_train.txt', y[0], \
size=2000, goodfeat=gf_test, acc_miny=ymin, acc_maxy=ymax)
print(Xsub.shape)
print(np.unique(ysub))
In [9]:
### Performance Evaluation
def evalPerformance(ytrue, ypred):
    """Precision, recall and F1-score for binary labels coded as -1 / +1.

    Parameters
    ----------
    ytrue, ypred : 1-D int arrays of equal length with values in {-1, +1}.

    Returns
    -------
    (prec, recall, f1score) : tuple of floats.
        Degenerate cases (no positive predictions, no positive truths, or
        no positives at all) return 0.0 instead of raising
        ZeroDivisionError as the original did.
    """
    tp = np.sum(ypred[np.where(ytrue == 1)[0]] == 1)
    fp = np.sum(ypred[np.where(ytrue == -1)[0]] == 1)
    tn = np.sum(ypred[np.where(ytrue == -1)[0]] == -1)
    # Everything not counted above is a false negative (pred -1, true +1).
    fn = ytrue.shape[0]-(tp+fp+tn)
    prec = tp / float(tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / float(tp + fn) if (tp + fn) > 0 else 0.0
    f1score = 2*tp / float(2*tp + fp + fn) if (2*tp + fp + fn) > 0 else 0.0
    return (prec, recall, f1score)
In [26]:
# Reload labels; in the *.lower files the class id is in the SECOND column.
y = pandas.read_table('../data/label_tr.lower.txt', sep=' ', header=None, dtype='int')
print(np.unique(y[1]))
# Sample 20k rows restricted to the selected features and label range.
Xsub, ysub = readRandomSample('../data/data_tr.lower.txt', y[1], size=20000, \
goodfeat=goodfeatures, acc_miny=ymin, acc_maxy=ymax)
print(np.unique(ysub))
assert(np.sum(ysub < ymin) == 0)
assert(np.sum(ysub > ymax) == 0)
# Binarize: labels below the split become -1, the rest +1.
ysub[np.where(ysub < ysplit)[0]] = -1
ysub[np.where(ysub >= ysplit)[0]] = 1
print(np.sum(ysub == -1), np.sum(ysub==1))
#Xsub = Xsub[:, goodfeatures]
# Z-score standardization.  NOTE(review): a zero-variance column would
# divide by zero here — confirm all selected features vary in the sample.
Xsub = (Xsub - np.mean(Xsub, axis=0)) / np.std(Xsub, axis=0)
Xsub.shape
Out[26]:
In [28]:
import sklearn.svm
ntot = Xsub.shape[0]
# Random half/half train-test split.
# BUG FIX: ntot/2 is a float on Python 3 and np.random.choice rejects a
# float size; // keeps the same value on Python 2.
tr_idx = np.random.choice(ntot, size=ntot // 2, replace=False)
ts_idx = np.setdiff1d(np.arange(ntot), tr_idx, assume_unique=True)
yts = ysub[ts_idx]
# Grid search over RBF-SVM regularization (C) and kernel width (gamma).
for c in [1.0, 5.0, 10, 20, 50]:
    for gm in [0.001, 0.01, 0.1, 1.0, 5.0, 10, 20]:
        clf = sklearn.svm.SVC(C=c, kernel='rbf', gamma=gm)
        clf.fit(Xsub[tr_idx, :], ysub[tr_idx])
        ypred = clf.predict(Xsub[ts_idx, :])
        prec, recall, f1score = evalPerformance(yts, ypred)
        print ("C=%.4f Gamma=%.4f ==> Prec:%.3f Recall:%.3f F1Score:%.3f"%(c, gm, prec, recall, f1score))
In [19]:
#y = pandas.read_table('../data/label_train.txt', sep=' ', header=None, dtype='int')
# Sample 20k rows using ALL features (goodfeat defaults to every column).
Xsub, ysub = readRandomSample('../data/data_train.txt', y[0], size=20000, \
acc_miny=ymin, acc_maxy=ymax)
print(np.unique(ysub))
assert(np.sum(ysub < ymin) == 0)
assert(np.sum(ysub > ymax) == 0)
# Binarize around the split point: below ysplit -> -1, otherwise +1.
ysub[np.where(ysub < ysplit)[0]] = -1
ysub[np.where(ysub >= ysplit)[0]] = 1
print(np.sum(ysub == -1), np.sum(ysub==1))
#Xsub = Xsub[:, goodfeatures]
# Z-score standardization.  NOTE(review): zero-variance columns would
# divide by zero here — verify.
Xsub = (Xsub - np.mean(Xsub, axis=0)) / np.std(Xsub, axis=0)
Xsub.shape
Out[19]:
In [20]:
import sklearn.ensemble
import datetime as dt
ntot = Xsub.shape[0]
# // : np.random.choice needs an integer size on Python 3 (same value on py2).
tr_idx = np.random.choice(ntot, size=ntot // 2, replace=False)
ts_idx = np.setdiff1d(np.arange(ntot), tr_idx, assume_unique=True)
yts = ysub[ts_idx]
# Sweep the forest size, timing fit and predict separately.
for n_est in [20, 50, 100, 200, 500]:
    # BUG FIX: the loop variable was ignored — n_estimators was hard-coded
    # to 50, so every iteration trained the same model.
    rfclf = sklearn.ensemble.RandomForestClassifier(n_estimators=n_est, criterion='gini')
    start_time = dt.datetime.now()
    rfclf.fit(Xsub[tr_idx, :], ysub[tr_idx])
    fit_time = dt.datetime.now()
    ypred = rfclf.predict(Xsub[ts_idx, :])
    pred_time = dt.datetime.now()
    prec, recall, f1score = evalPerformance(yts, ypred)
    print ("TrainSize %d n_est %d ==> Prec:%.3f Recall:%.3f F1Score:%.3f (fit-time %d pred-time %d)" \
    %(tr_idx.shape[0], n_est, prec, recall, f1score, (fit_time-start_time).seconds, (pred_time - fit_time).seconds))
In [2]:
# Feature count and train/cv labels plus the full cross-validation matrix.
ndim= 900
y = pandas.read_table('../data/label_tr.lower.txt', sep=' ', header=None, dtype='int')
ycv = pandas.read_table('../data/label_cv.lower.txt', sep=' ', header=None, dtype='int')
Xcv = pandas.read_table('../data/data_cv.lower.txt', sep=' ', header=None, dtype='int')
print(np.unique(y[1]))
print(np.unique(ycv[1]))
print(Xcv.shape)
# Random 30-feature subset.  NOTE(review): no RNG seed — not reproducible
# across runs.
feat_idx = np.random.choice(np.arange(ndim), 30, replace=False)
Xcv = Xcv.iloc[:, feat_idx]
print(Xcv.shape)
In [22]:
ntot_train = y.shape[0]
print(ntot_train)
# Re-read only the randomly selected feature columns.
df = pandas.read_table('../data/data_tr.lower.txt', usecols=feat_idx, nrows=ntot_train, header=None, sep=' ')
# NOTE(review): nrows=ntot_train caps the CV file at the TRAIN row count —
# confirm this truncation is intended and not a copy-paste leftover.
Xcv = pandas.read_table('../data/data_cv.lower.txt', usecols=feat_idx, nrows=ntot_train, header=None, sep=' ')
print(df.shape)
print(Xcv.shape)
In [14]:
# Random half/half split of the training frame.
# BUG FIX: ntot_train/2 is a float on Python 3; np.random.choice requires
# an integer size (// gives the identical value on Python 2).
tr_idx = np.random.choice(df.shape[0], ntot_train // 2, replace=False)
ts_idx = np.setdiff1d(np.arange(df.shape[0]), tr_idx, assume_unique=True)
Xtr = df.iloc[tr_idx, :]
print(Xtr.shape, tr_idx.shape, ts_idx.shape)
In [16]:
# KD-tree smoke test on half the training rows; query the first 10 rows
# against it.  BUG FIX: tr_idx.shape[0]/2 is a float on Python 3 and iloc
# slicing rejects float bounds — use floor division.
kdt = scipy.spatial.KDTree(Xtr.iloc[:tr_idx.shape[0] // 2, :], leafsize=1000)
qt_dist, qt_idx = kdt.query(Xtr.iloc[:10, :], k=10)
print(qt_dist)
print(qt_idx)
In [28]:
ntr = Xtr.shape[0]
# BUG FIX: ntr/2 is a float on Python 3; iloc needs an int slice bound.
nsplit = ntr // 2
kdt1 = scipy.spatial.KDTree(Xtr.iloc[:nsplit, :], leafsize=1000)
#kdt2 = scipy.spatial.KDTree(Xtr.iloc[nsplit:ntr,:], leafsize=1000)
# 10 nearest training neighbors for the first 20 CV rows.
qt1_idx = kdt1.query(Xcv[:20], k=10)[1]
#qt2_idx = kdt2.query(df.iloc[ts_idx, :])[1]
In [48]:
# Chunk the training set into 4 pieces and build a KD-tree per chunk.
# BUG FIX: np.arange(4)*ntot_train/4 produced float boundaries on Python 3;
# floor division keeps integer indices.
str_idx = np.arange(4) * ntot_train // 4
end_idx = np.arange(1, 5) * ntot_train // 4
def get_label(arr, offset=0):
    # Map KD-tree neighbor indices to training labels.  `offset` shifts
    # chunk-local indices to global row positions (default 0 keeps the
    # original call signature working).
    return (y.iloc[arr + offset, 1].values)
for i, (s, e) in enumerate(zip(str_idx, end_idx)):
    sys.stdout.write('%6d - %6d '%(s, e))
    kdt = scipy.spatial.KDTree(df.iloc[s:e, :], leafsize=1000)
    qt_idx = kdt.query(Xcv[:5], k=10)[1]
    print(qt_idx.shape)
    # BUG FIX: kdt indices are relative to the df.iloc[s:e] chunk; the
    # original looked labels up globally, which was only correct for the
    # first chunk (s == 0).  Pass the chunk start as the offset.
    pred = np.apply_along_axis(get_label, 0, qt_idx, offset=s)
    print(pred)
    np.savetxt('/tmp/preds.%d.dat'%i, pred, fmt='%d')
    #print(y.iloc[qt_idx[:,:],1].values)
In [18]:
ntot_train = y.shape[0]
# 5 training chunks; // keeps the boundaries integral on Python 3 (the
# original float boundaries break iloc slicing).
str_idx = np.arange(5) * ntot_train // 5
end_idx = np.arange(1, 6) * ntot_train // 5
def get_label(arr, offset=0):
    # Chunk-local KD-tree neighbor indices -> global training labels.
    return (y.iloc[arr + offset, 1].values)
# 100 rounds of random-subspace KNN: each round picks 30 random features,
# re-reads the data, and dumps distances/predictions per chunk to /tmp.
# NOTE(review): no RNG seed — feature subsets are not reproducible.
for n in range(100):
    feat_idx = np.random.choice(ndim, size=30, replace=False)
    df = pandas.read_table('../data/data_tr.lower.txt', usecols=feat_idx, nrows=ntot_train, header=None, sep=' ')
    Xcv = pandas.read_table('../data/data_cv.lower.txt', usecols=feat_idx, nrows=ntot_train, header=None, sep=' ')
    sys.stdout.write('\n %d %d %d ==> ' %(n, df.shape[0], Xcv.shape[0]))
    for i, (s, e) in enumerate(zip(str_idx, end_idx)):
        sys.stdout.write('%6d-%6d '%(s, e))
        kdt = scipy.spatial.KDTree(df.iloc[s:e, :], leafsize=1000)
        qt_dist, qt_idx = kdt.query(Xcv, k=10)
        # BUG FIX: offset chunk-local neighbor indices by the chunk start;
        # the original global lookup was only correct for the first chunk.
        pred = np.apply_along_axis(get_label, 0, qt_idx, offset=s)
        np.savetxt('/tmp/dists.%d.%d.dat'%(n, i), qt_dist, fmt='%.4f')
        np.savetxt('/tmp/preds.%d.%d.dat'%(n, i), pred, fmt='%d')
In [ ]: