In [1]:
import numpy as np
import pandas
import scipy, scipy.spatial
import sklearn
import sys

from matplotlib import pyplot as plt

%matplotlib inline

In [3]:
y = pandas.read_table("/home/vahid/Downloads/data/ml/label_train.txt", sep=" ", dtype='int', header=None)

y.head()


Out[3]:
0
0 161
1 163
2 56
3 119
4 138

In [3]:
np.unique(y[0], return_counts=True)


Out[3]:
(array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
         14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
         27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
         40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
         53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
         66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
         79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
         92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
        105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
        118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
        131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
        144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156,
        157, 158, 159, 160, 161, 162, 163, 164]),
 array([  1263,   1261,   1255,   1256,   1252,   1235,   1240,   1264,
          1256,   1281,   1245,   1278,   1278,   1253,   1255,   1255,
          1291,   1277,   1308,   1285,   1322,   1309,   1318,   1322,
          1327,   1339,   1361,   1361,   1335,   1396,   1359,   1393,
          1373,   1356,   1398,   1416,   1386,   1398,   1396,   1404,
          1430,   1398,   1416,   1406,   1420,   1445,   1433,   1445,
          1454,   1451,   1481,   1482,   1477,   1474,   1478,   1486,
          1512,   1492,   1557,   1557,   1548,   1530,   1574,   1582,
          1606,   1611,   1666,   1650,   1704,   1739,   1735,   1743,
          1728,   1796,   1737,   1810,   1822,   1864,   1847,   1838,
          1857,   1913,   1910,   1917,   2006,   1992,   2033,   2063,
          2072,   2063,   2096,   2128,   2134,   2206,   2215,   2212,
          2258,   2279,   2287,   2319,   2356,   2435,   2438,   2491,
          2486,   2485,   2502,   2555,   2594,   2629,   2575,   2587,
          2777,   2875,   2897,   2884,   2978,   3087,   3179,   3368,
          3388,   3421,   3409,   3453,   3536,   3586,   3615,   3696,
          3821,   3802,   3934,   4059,   4069,   4253,   4819,   4939,
          5038,   5259,   5310,   6080,   6487,   6623,   7256,   8279,
          9069,   9221,   9707,   9998,  10557,  10645,  11484,  12382,
         12858,  16548,  18562,  21943,  30679,  34092,  45439,  60513,
         64478,  65211,  92241, 130122]))

In [8]:



Out[8]:
522775

In [162]:
cv_idx = np.random.choice(y.shape[0], 100000, replace=False)

with open('/home/vahid/Downloads/data/ml/data_train.txt', 'r') as fp:
    with open('../data/data_cv.txt', 'w') as f1, open('../data/label_cv.txt', 'w') as y1, \
        open('../data/data_tr.txt', 'w') as f2, open('../data/label_tr.txt', 'w') as y2:
        n = 0
        for line in fp:
            if np.any(n == cv_idx):
                f1.write('%s'%line)
                y1.write('%d\n'%y[0][n])
            else:
                f2.write('%s'%line)
                y2.write('%d\n'%y[0][])
            n += 1

In [ ]:
Xc
pickle.dump( r, open( "../data/sum_features.dat", "wb" ) )

In [22]:
### Cal Mean and Var. of each feature for each class

ndim = 900
def calClassStat(data_fname, y):
    """ Return a dictionary: class_label:(mean_vec, var_vec, num)
        data_fname is the file containing all the features and samples
        y: class labels for each sample
    """
    clstat={}
    yuniq = np.unique(y)
    for ci in yuniq:
        clstat[ci] = [np.zeros(shape=ndim), np.zeros(shape=ndim), 0]
        clstat['all'] = [np.zeros(shape=ndim), np.zeros(shape=ndim), 0]
    
    with open(data_fname, 'r') as fp:
        n = 0
        for line in fp:
            line = line.strip().split()
            vals = np.empty(shape=ndim, dtype=int)
            for i,v in enumerate(line):
                vals[i] = int(v)            
            label = y[n]
            assert(len(line) == ndim)
            clstat[label][0] += vals
            clstat[label][1] += vals**2
            clstat[label][2] += 1
            n += 1
            
    
    for ci in yuniq:
        clstat['all'][0] += clstat[ci][0]
        clstat['all'][1] += clstat[ci][1]
        clstat['all'][2] += clstat[ci][2]
    
            
    return (clstat)

r = calClassStat('/home/vahid/Downloads/data/ml/data_train.txt', y[0])

In [125]:
import pickle

pickle.dump( r, open( "../data/sum_features.dat", "wb" ) )

In [106]:
### Calclulate Standardized Mean Difference Between Classes

def calStandMeanDiff(y, yneg, ypos):
    sx  = np.zeros(shape=ndim, dtype=float)
    ssx = np.zeros(shape=ndim, dtype=float)

    
    n1 = np.sum(np.in1d(y, yneg))
    n2 = np.sum(np.in1d(y, ypos))
    sys.stderr.write("Number of samples in NegClass: %d and PosClass: %d \n"%(n1, n2))
    
    for yi in yneg:
        sx += r[yi][0]
        ssx += r[yi][1]
    r1_mean = sx / float(n1)
    r1_var = (ssx - 2*sx*r1_mean + r1_mean**2) / float(n1)

    sx  = np.zeros(shape=ndim, dtype=float)
    ssx = np.zeros(shape=ndim, dtype=float)
    for yi in ypos:
        sx += r[yi][0]
        ssx += r[yi][1]
    r2_mean = sx / float(n2)
    r2_var = (ssx - 2*sx*r2_mean + r2_mean**2) / float(n2)

    tot_mean = r['all'][0] / float(r['all'][2])
    tot_var  = (r['all'][1] - 2*r['all'][0]*tot_mean + tot_mean**2) / float(r['all'][2])

    rdiff = (r1_mean - r2_mean) / np.sqrt(tot_var)

    return (rdiff)


## unit test:
mean_test = calStandMeanDiff(y, np.arange(1,157), np.arange(157, 165)) 
print(np.sum(mean_test > 0.1))


332
Number of samples in NegClass: 477225 and PosClass: 522775 

Classify items belonging to first half (1) Second half (-1)

Finding Good Features


In [107]:
rdiff = calStandMeanDiff(y, np.arange(1,157), np.arange(157, 165)) 


## Good Features:
goodfeatures = np.where(rdiff > 0.1)[0]

goodfeatures


Number of samples in NegClass: 477225 and PosClass: 522775 
Out[107]:
array([  1,   2,   5,   6,  14,  15,  24,  25,  35,  38,  39,  40,  42,
        43,  48,  52,  56,  58,  59,  61,  65,  69,  71,  74,  79,  80,
        81,  91,  92,  94,  97,  99, 110, 111, 112, 116, 118, 119, 122,
       127, 129, 133, 137, 138, 139, 140, 143, 145, 146, 148, 149, 151,
       152, 153, 158, 159, 161, 163, 166, 167, 168, 169, 175, 178, 180,
       184, 185, 186, 187, 188, 193, 195, 196, 198, 199, 204, 206, 208,
       210, 214, 215, 216, 217, 219, 221, 222, 223, 224, 225, 229, 234,
       237, 240, 247, 254, 255, 256, 257, 258, 261, 263, 265, 267, 272,
       273, 274, 277, 278, 280, 281, 282, 286, 288, 289, 294, 295, 300,
       307, 308, 315, 316, 318, 320, 322, 328, 329, 330, 332, 334, 336,
       338, 340, 341, 343, 344, 348, 350, 352, 353, 355, 359, 367, 372,
       374, 381, 387, 388, 389, 397, 398, 399, 400, 402, 404, 406, 407,
       408, 412, 413, 414, 416, 417, 421, 422, 425, 428, 432, 439, 440,
       443, 445, 447, 451, 457, 461, 462, 464, 466, 467, 473, 485, 489,
       490, 492, 498, 500, 501, 504, 507, 508, 510, 511, 517, 524, 526,
       527, 530, 531, 537, 539, 541, 542, 544, 547, 556, 561, 563, 564,
       566, 570, 571, 573, 576, 578, 579, 581, 582, 584, 585, 588, 589,
       594, 601, 602, 603, 605, 608, 609, 613, 617, 624, 628, 632, 633,
       634, 641, 643, 645, 646, 652, 653, 657, 658, 661, 665, 666, 667,
       673, 676, 680, 681, 686, 692, 698, 703, 705, 711, 712, 714, 716,
       717, 718, 720, 722, 725, 726, 727, 730, 732, 735, 738, 742, 746,
       747, 748, 751, 753, 757, 758, 759, 760, 762, 764, 766, 770, 772,
       773, 774, 776, 778, 779, 784, 785, 786, 790, 796, 798, 805, 806,
       811, 812, 815, 816, 817, 818, 821, 822, 825, 828, 829, 833, 841,
       849, 853, 854, 855, 861, 863, 864, 869, 871, 874, 876, 877, 878,
       879, 881, 884, 886, 890, 892, 894])

Read a Random Sample


In [149]:
def readRandomSample(data_fname, y, size, goodfeat=None, acc_miny=None, acc_maxy=None):
    """ Read a random sample
    """
    if goodfeat is None:
        goodfeat = np.arange(ndim)
    Xsub = np.empty(shape=(size,goodfeat.shape[0]), dtype=float)
    ysub = np.zeros(shape=size, dtype=int)

    if acc_miny is None:
        acc_miny = np.min(y)
    if acc_maxy is None:
        acc_maxy = np.max(y)
        
    #yuniq, ycount = np.unique(y, return_counts=True)
    #tot_acceptable = np.sum(ycount[np.where((yuniq >= acc_miny) & (yuniq <= acc_maxy))[0]])
    
    acceptable_indx = np.where((y>=acc_miny) & (y<=acc_maxy))[0]
    assert(acceptable_indx.shape[0] > size)
    choice_indx = np.sort(np.random.choice(acceptable_indx, size, replace=False))
    #print(choice_indx.shape)
    #sys.stderr.write("Total Accetables: --> %d"%(tot_acceptable))
    
    #proba = 1.0 - size/float(tot_acceptable)
    
        
    with open(data_fname, 'r') as fp:
        n = 0
        nf = 0
        for line in fp:
#            if (y[n] >= acc_miny and y[n]<=acc_maxy):
#                if np.random.uniform(low=0, high=1) > proba and nf < size:
            if nf < size:
                if n == choice_indx[nf]:
                    line = line.strip().split()
                    ix = -1
                    for i,v in enumerate(line):
                        if np.any(goodfeat == i):
                            ix += 1
                            Xsub[nf,ix] = int(v)
                    ysub[nf] = y[n]

                    nf += 1
            n += 1
    return(Xsub, ysub)

In [119]:
## unit testing readRandomSample()
gf_test = np.arange(21,35)
Xsub, ysub = readRandomSample('/home/vahid/Downloads/data/ml/data_train.txt', y[0], \
                              size=2000, goodfeat=gf_test, acc_miny=15, acc_maxy=20)

print(Xsub.shape)
print(np.unique(ysub))


(2000,)
(2000, 14)
[15 16 17 18 19 20]

In [158]:
### Performance Evaluation
def evalPerformance(ytrue, ypred):
    tp = np.sum(ypred[np.where(ytrue ==  1)[0]] == 1)
    fp = np.sum(ypred[np.where(ytrue == -1)[0]] == 1)
    tn = np.sum(ypred[np.where(ytrue == -1)[0]] == -1)
    fn = ytrue.shape[0]-(tp+fp+tn)
    sys.stderr.write('%d %d %d %d\n'%(tp,fp,tn,fn))
    prec = tp / float(tp + fp)
    recall  = tp / float(tp + fn)
    f1score = 2*tp/float(2*tp + fp + fn)

    return (prec, recall, f1score)

In [159]:
Xsub, ysub = readRandomSample('/home/vahid/Downloads/data/ml/data_train.txt', y[0], size=20000, goodfeat=goodfeatures)

ysub[np.where(ysub <= 156)[0]] = -1
ysub[np.where(ysub  > 156)[0]] =  1

#Xsub = Xsub[:, goodfeatures]
Xsub = (Xsub - np.mean(Xsub, axis=0)) / np.std(Xsub, axis=0)

Xsub.shape


Out[159]:
(20000, 332)

In [160]:
import sklearn.svm

ntot = Xsub.shape[0]
tr_idx = np.random.choice(ntot, size=ntot/2, replace=False)
ts_idx = np.setdiff1d(np.arange(ntot), tr_idx, assume_unique=True)
yts = ysub[ts_idx]

for c in [0.0001, 0.001, 0.01, 0.1, 1.0]:
    for gm in [0.001, 0.01, 0.1, 1.0]:
        clf = sklearn.svm.SVC(C=c, kernel='rbf', gamma=gm)
        clf.fit(Xsub[tr_idx, :], ysub[tr_idx])
        ypred = clf.predict(Xsub[ts_idx, :])
        prec, recall, f1score = evalPerformance(yts, ypred)
        print ("C=%.4f Gamma=%.4f  ==> Prec:%.3f  Recall:%.3f  F1Score:%.3f"%(c, gm, prec, recall, f1score))


5270 4730 0 0
5270 4730 0 0
C=0.0001 Gamma=0.0010  ==> Prec:0.527  Recall:1.000  F1Score:0.690
C=0.0001 Gamma=0.0100  ==> Prec:0.527  Recall:1.000  F1Score:0.690
C=0.0001 Gamma=0.1000  ==> Prec:0.527  Recall:1.000  F1Score:0.690
5270 4730 0 0
5270 4730 0 0
C=0.0001 Gamma=1.0000  ==> Prec:0.527  Recall:1.000  F1Score:0.690
C=0.0010 Gamma=0.0010  ==> Prec:0.527  Recall:1.000  F1Score:0.690
5270 4730 0 0
5270 4730 0 0
C=0.0010 Gamma=0.0100  ==> Prec:0.527  Recall:1.000  F1Score:0.690
C=0.0010 Gamma=0.1000  ==> Prec:0.527  Recall:1.000  F1Score:0.690
5270 4730 0 0
5270 4730 0 0
C=0.0010 Gamma=1.0000  ==> Prec:0.527  Recall:1.000  F1Score:0.690
C=0.0100 Gamma=0.0010  ==> Prec:0.662  Recall:0.827  F1Score:0.735
4360 2229 2501 910
2836 855 3875 2434
C=0.0100 Gamma=0.0100  ==> Prec:0.768  Recall:0.538  F1Score:0.633
C=0.0100 Gamma=0.1000  ==> Prec:0.527  Recall:1.000  F1Score:0.690
5270 4730 0 0
5270 4730 0 0
C=0.0100 Gamma=1.0000  ==> Prec:0.527  Recall:1.000  F1Score:0.690
C=0.1000 Gamma=0.0010  ==> Prec:0.699  Recall:0.789  F1Score:0.742
4160 1790 2940 1110
3351 1120 3610 1919
C=0.1000 Gamma=0.0100  ==> Prec:0.749  Recall:0.636  F1Score:0.688
C=0.1000 Gamma=0.1000  ==> Prec:0.834  Recall:0.152  F1Score:0.257
800 159 4571 4470
5270 4730 0 0
C=0.1000 Gamma=1.0000  ==> Prec:0.527  Recall:1.000  F1Score:0.690
C=1.0000 Gamma=0.0010  ==> Prec:0.710  Recall:0.786  F1Score:0.746
4140 1688 3042 1130
3666 1244 3486 1604
C=1.0000 Gamma=0.0100  ==> Prec:0.747  Recall:0.696  F1Score:0.720
C=1.0000 Gamma=0.1000  ==> Prec:0.825  Recall:0.261  F1Score:0.397
1378 293 4437 3892
5268 4730 0 2
C=1.0000 Gamma=1.0000  ==> Prec:0.527  Recall:1.000  F1Score:0.690

Learning Curve


In [151]:
## Picking the best C and gamma (C=1, gamma=0.1)

for n in [1000, 2000, 4000, 8000, 16000, 32000]:
    Xsub, ysub = readRandomSample('/home/vahid/Downloads/data/ml/data_train.txt', y[0], size=n, goodfeat=goodfeatures)

    ysub[np.where(ysub <= 156)[0]] = -1
    ysub[np.where(ysub  > 156)[0]] =  1

    #Xsub = Xsub[:, goodfeatures]
    #Xsub = (Xsub - np.mean(Xsub, axis=0)) / np.std(Xsub, axis=0)
    
    sys.stderr.write('\nSize = %d  ==> '%(n))
    Tp = [0,0,0,0,0]
    Fp = [0,0,0,0,0]
    Tn = [0,0,0,0,0]
    Fn = [0,0,0,0,0]
    for i in range(5):
        tr_idx = np.random.choice(n, size=n/2, replace=False)
        ts_idx = np.setdiff1d(np.arange(n), tr_idx, assume_unique=True)
        yts = ysub[ts_idx]

        clf = sklearn.svm.SVC(C=1.0, kernel='rbf', gamma=0.10)
        clf.fit(Xsub[tr_idx, :], ysub[tr_idx])
        ypred = clf.predict(Xsub[ts_idx, :])
        tp = np.sum(ypred[np.where(yts ==  1)[0]] == 1)
        fp = np.sum(ypred[np.where(yts == -1)[0]] == 1)
        tn = np.sum(ypred[np.where(yts == -1)[0]] == -1)
        Tp[i], Fp[i], Tn[i], Fn[i] = tp, fp, tn, yts.shape[0]-(tp+fp+tn)
        sys.stderr.write ("%d (%d %d %d %d)"%(i, tp, fp, tn, yts.shape[0]-(tp+fp+tn)))
        
    Tp_mean, Fp_mean, Tn_mean, Fn_mean = np.mean(Tp), np.mean(Fp), np.mean(Tn), np.mean(Fn)
    recall  = Tp_mean / (Tp_mean + Fn_mean)
    prec = Tp_mean / (Tp_mean + Fp_mean)
    f1score = 2*Tp_mean/(2*Tp_mean + Fp_mean + Fn_mean)
    sys.stderr.write('\nAverage: Prec=%.3f  Recall=%.3f   F1-score=%.3f\n'%(prec, recall, f1score))


Size = 1000  ==> 0 (269 231 0 0)1 (251 249 0 0)2 (265 235 0 0)3 (271 229 0 0)4 (263 237 0 0)
Average: Prec=0.528  Recall=1.000   F1-score=0.691

Size = 2000  ==> 0 (549 451 0 0)1 (549 451 0 0)2 (542 458 0 0)3 (539 461 0 0)4 (550 450 0 0)
Average: Prec=0.546  Recall=1.000   F1-score=0.706

Size = 4000  ==> 0 (234 71 886 809)1 (255 89 896 760)2 (276 77 910 737)3 (210 60 893 837)4 (295 97 898 710)
Average: Prec=0.763  Recall=0.248   F1-score=0.374

Size = 8000  ==> 0 (514 103 1773 1610)1 (527 106 1807 1560)2 (706 165 1779 1350)3 (573 133 1787 1507)4 (531 131 1799 1539)
Average: Prec=0.817  Recall=0.274   F1-score=0.410
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-151-0a71d38fddc2> in <module>()
      2 
      3 for n in [1000, 2000, 4000, 8000, 16000, 32000]:
----> 4     Xsub, ysub = readRandomSample('/home/vahid/Downloads/data/ml/data_train.txt', y[0], size=n, goodfeat=goodfeatures)
      5 
      6     ysub[np.where(ysub <= 156)[0]] = -1

<ipython-input-149-1a123f2c8060> in readRandomSample(data_fname, y, size, goodfeat, acc_miny, acc_maxy)
     35                     ix = -1
     36                     for i,v in enumerate(line):
---> 37                         if np.any(goodfeat == i):
     38                             ix += 1
     39                             Xsub[nf,ix] = int(v)

KeyboardInterrupt: 

Apply Logistic Regression


In [167]:
import sklearn.linear_model

# we create an instance of Neighbours Classifier and fit the data.
n=20000
Xsub, ysub = readRandomSample('/home/vahid/Downloads/data/ml/data_train.txt', y[0], size=n, goodfeat=goodfeatures)

ysub[np.where(ysub <= 156)[0]] = -1
ysub[np.where(ysub  > 156)[0]] =  1
Xsub = (Xsub - np.mean(Xsub, axis=0)) / np.std(Xsub, axis=0)

tr_idx = np.random.choice(n, size=n/2, replace=False)
ts_idx = np.setdiff1d(np.arange(n), tr_idx, assume_unique=True)
yts = ysub[ts_idx]

for c in [1.0, 10.0, 100.0, 1000.0, 10000.0]:
    logreg = sklearn.linear_model.LogisticRegression(C=c)
    logreg.fit(Xsub[tr_idx,:], ysub[tr_idx])
    ypred = logreg.predict(Xsub[ts_idx])
    prec, recall, f1score = evalPerformance(yts, ypred)
    print ("C=%.4f ==> Prec:%.3f  Recall:%.3f  F1Score:%.3f"%(c, prec, recall, f1score))


4146 1809 2912 1133
4146 1808 2913 1133
C=1.0000 Gamma=1.0000  ==> Prec:0.696  Recall:0.785  F1Score:0.738
C=10.0000 Gamma=1.0000  ==> Prec:0.696  Recall:0.785  F1Score:0.738
C=100.0000 Gamma=1.0000  ==> Prec:0.696  Recall:0.785  F1Score:0.738
4145 1808 2913 1134
4145 1808 2913 1134
C=1000.0000 Gamma=1.0000  ==> Prec:0.696  Recall:0.785  F1Score:0.738
C=10000.0000 Gamma=1.0000  ==> Prec:0.696  Recall:0.785  F1Score:0.738
4145 1808 2913 1134

Classify subclasses 156:164

positive (+1): y==163:164

negative (-1): 156 <= y <= 162

Find good features


In [110]:
rdiff2 = calStandMeanDiff(y, np.arange(157,162), np.arange(162, 165))

print(np.sum(rdiff2 > 0.1))
goodfeatures_cs2 = np.where(rdiff2 > 0.1)[0]

goodfeatures_cs2


335
Number of samples in NegClass: 235201 and PosClass: 287574 
Out[110]:
array([  1,   2,   5,   6,   7,   8,  13,  15,  16,  17,  18,  20,  24,
        25,  28,  31,  34,  35,  37,  38,  40,  43,  48,  50,  51,  52,
        56,  57,  59,  61,  65,  69,  70,  71,  72,  79,  81,  84,  87,
        90,  91,  92,  94,  97,  99, 101, 106, 110, 112, 118, 122, 127,
       129, 133, 138, 139, 141, 143, 146, 148, 149, 151, 152, 153, 155,
       158, 161, 163, 166, 167, 169, 175, 178, 179, 183, 184, 187, 188,
       189, 196, 206, 208, 210, 217, 219, 221, 222, 224, 226, 229, 230,
       237, 239, 246, 250, 253, 255, 256, 258, 261, 263, 267, 271, 272,
       274, 277, 281, 287, 288, 289, 293, 294, 295, 305, 307, 308, 311,
       313, 315, 316, 318, 320, 324, 328, 329, 330, 332, 334, 337, 338,
       342, 343, 344, 348, 350, 353, 355, 357, 358, 359, 364, 366, 367,
       372, 374, 381, 386, 388, 397, 400, 401, 402, 403, 404, 406, 407,
       408, 409, 412, 413, 414, 416, 418, 419, 422, 428, 430, 432, 437,
       440, 443, 445, 451, 457, 462, 464, 466, 467, 468, 470, 471, 473,
       476, 484, 485, 486, 490, 492, 496, 498, 499, 500, 501, 504, 507,
       510, 511, 516, 517, 524, 526, 528, 529, 530, 537, 539, 540, 541,
       547, 548, 553, 556, 561, 564, 565, 566, 572, 576, 578, 579, 581,
       582, 583, 585, 588, 590, 594, 600, 601, 602, 603, 605, 607, 608,
       609, 610, 613, 615, 617, 628, 632, 633, 634, 641, 643, 645, 648,
       651, 652, 653, 655, 656, 658, 661, 665, 666, 667, 669, 671, 673,
       680, 681, 686, 692, 700, 701, 703, 711, 712, 714, 715, 716, 718,
       719, 722, 723, 724, 726, 727, 728, 732, 735, 748, 751, 753, 757,
       758, 759, 760, 762, 764, 770, 772, 773, 774, 775, 776, 778, 784,
       785, 786, 790, 793, 794, 796, 798, 802, 806, 811, 815, 816, 817,
       822, 823, 829, 833, 840, 848, 849, 852, 853, 854, 855, 858, 863,
       864, 869, 871, 876, 884, 886, 890, 893, 894, 897])

In [67]:
Xsub, ysub = readRandomSample('/home/vahid/Downloads/data/ml/data_train.txt', y[0], size=20000, acc_miny=156, acc_maxy=164)

ysub[np.where(ysub <= 162)[0]] = -1
ysub[np.where(ysub  > 162)[0]] =  1

Xsub = Xsub[:, goodfeatures]
Xsub = (Xsub - np.mean(Xsub)) / np.std(Xsub)

Xsub.shape

ntot = Xsub.shape[0]
tr_idx = np.random.choice(ntot, size=ntot/2, replace=False)
ts_idx = np.setdiff1d(np.arange(ntot), tr_idx, assume_unique=True)
yts = ysub[ts_idx]

for c in [0.0001, 0.001, 0.01, 0.1, 1.0]:
    for gm in [0.001, 0.01, 0.1, 1.0]:
        clf = sklearn.svm.SVC(C=c, kernel='rbf', gamma=gm)
        clf.fit(Xsub[tr_idx, :], ysub[tr_idx])
        ypred = clf.predict(Xsub[ts_idx, :])
        tp = np.sum(ypred[np.where(yts ==  1)[0]] == 1)
        fp = np.sum(ypred[np.where(yts == -1)[0]] == 1)
        tn = np.sum(ypred[np.where(yts == -1)[0]] == -1)
        print ("C=%.4f Gamma=%.4f  ==> TP:%d  FP:%d  TN:%d FN:%d"%(c, gm, tp, fp, tn, yts.shape[0]-(tp+fp+tn)))


Out[67]:
array([[-0.25956771,  1.89171015, -0.25956771,  0.81607122],
       [-0.25956771, -0.25956771, -0.25956771,  0.0989786 ],
       [-0.25956771,  0.0989786 , -0.25956771, -0.25956771]])

In [135]:
a = ['1', '2', '3']

np.array(a).astype(int)


Out[135]:
array([1, 2, 3])

In [137]:
Xtest = pandas.read_table('/home/vahid/Downloads/data/ml/data_test.txt', sep=" ", usecols=goodfeatures, dtype='int', header=None)

In [138]:
Xtest.shape


Out[138]:
(262102, 332)

In [139]:
Xtest[:5]


Out[139]:
1 2 5 6 14 15 24 25 35 38 ... 876 877 878 879 881 884 886 890 892 894
0 0 0 0 0 0 0 3 2 0 0 ... 0 0 3 0 0 0 0 0 2 0
1 1 0 0 0 0 1 0 0 0 0 ... 0 0 0 0 1 1 1 1 0 0
2 5 0 4 0 4 0 0 0 0 1 ... 0 0 3 6 0 0 0 0 1 1
3 0 3 0 0 0 0 0 0 0 0 ... 2 0 3 0 0 0 0 0 0 1
4 0 9 0 0 0 1 3 0 0 0 ... 2 0 3 0 0 0 0 0 0 1

5 rows × 332 columns

Extracting the Lower Half ($y \in [1..156]$)


In [11]:
yuniq,ycount = np.unique(y[0][np.where(y[0]<=156)[0]], return_counts=True)

print(ycount)
np.sum(ycount[np.where(yuniq<=130)[0]])


[ 1263  1261  1255  1256  1252  1235  1240  1264  1256  1281  1245  1278
  1278  1253  1255  1255  1291  1277  1308  1285  1322  1309  1318  1322
  1327  1339  1361  1361  1335  1396  1359  1393  1373  1356  1398  1416
  1386  1398  1396  1404  1430  1398  1416  1406  1420  1445  1433  1445
  1454  1451  1481  1482  1477  1474  1478  1486  1512  1492  1557  1557
  1548  1530  1574  1582  1606  1611  1666  1650  1704  1739  1735  1743
  1728  1796  1737  1810  1822  1864  1847  1838  1857  1913  1910  1917
  2006  1992  2033  2063  2072  2063  2096  2128  2134  2206  2215  2212
  2258  2279  2287  2319  2356  2435  2438  2491  2486  2485  2502  2555
  2594  2629  2575  2587  2777  2875  2897  2884  2978  3087  3179  3368
  3388  3421  3409  3453  3536  3586  3615  3696  3821  3802  3934  4059
  4069  4253  4819  4939  5038  5259  5310  6080  6487  6623  7256  8279
  9069  9221  9707  9998 10557 10645 11484 12382 12858 16548 18562 21943]
Out[11]:
247846

In [12]:
with open('/home/vahid/Downloads/data/ml/data_train.txt', 'r') as fp:
    with open('../data/data_tr.lower.txt', 'w') as f1, open('../data/label_tr.lower.txt', 'w') as g1, \
         open('../data/data_cv.lower.txt', 'w') as f2, open('../data/label_cv.lower.txt', 'w') as g2:
        n = 0
        for line in fp:
            if y[0][n] <= 156:
                if np.random.uniform(low=0, high=1, size=None) > 0.1:
                    f1.write('%s'%line)
                    g1.write('%d %d\n'%(n, y[0][n]))
                else:
                    f2.write('%s'%line)
                    g2.write('%d %d\n'%(n, y[0][n]))               
            n += 1

In [4]:
ylow = pandas.read_table("../data/label_tr.lower.txt", sep=" ", dtype='int', header=None)

np.unique(ylow[1], return_counts=True)


Out[4]:
(array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
         14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
         27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
         40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
         53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
         66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
         79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
         92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
        105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
        118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
        131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
        144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156]),
 array([ 1104,  1123,  1120,  1137,  1110,  1113,  1101,  1149,  1122,
         1153,  1120,  1155,  1145,  1139,  1134,  1137,  1177,  1153,
         1173,  1135,  1190,  1165,  1196,  1205,  1209,  1211,  1226,
         1220,  1219,  1255,  1215,  1253,  1237,  1203,  1248,  1282,
         1252,  1249,  1246,  1244,  1294,  1261,  1268,  1245,  1281,
         1314,  1273,  1290,  1304,  1303,  1347,  1309,  1332,  1326,
         1323,  1345,  1384,  1320,  1384,  1400,  1388,  1405,  1424,
         1412,  1440,  1447,  1488,  1499,  1537,  1571,  1554,  1573,
         1564,  1626,  1565,  1623,  1622,  1669,  1672,  1660,  1683,
         1715,  1715,  1723,  1799,  1766,  1808,  1867,  1862,  1834,
         1893,  1914,  1931,  1970,  2020,  1988,  2059,  2047,  2066,
         2063,  2120,  2179,  2233,  2230,  2247,  2253,  2254,  2327,
         2322,  2387,  2306,  2344,  2501,  2609,  2619,  2574,  2687,
         2794,  2850,  3051,  3032,  3099,  3087,  3122,  3162,  3211,
         3243,  3314,  3445,  3415,  3536,  3667,  3659,  3790,  4291,
         4433,  4519,  4754,  4783,  5450,  5860,  5956,  6528,  7420,
         8186,  8281,  8729,  8974,  9529,  9596, 10401, 11195, 11583,
        14908, 16727, 19731]))

In [8]:
ylow = pandas.read_table("../data/label_cv.lower.txt", sep=" ", dtype='int', header=None)

np.unique(ylow[1], return_counts=True)


Out[8]:
(array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
         14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
         27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
         40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
         53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
         66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
         79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
         92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
        105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
        118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
        131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
        144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156]),
 array([ 159,  138,  135,  119,  142,  122,  139,  115,  134,  128,  125,
         123,  133,  114,  121,  118,  114,  124,  135,  150,  132,  144,
         122,  117,  118,  128,  135,  141,  116,  141,  144,  140,  136,
         153,  150,  134,  134,  149,  150,  160,  136,  137,  148,  161,
         139,  131,  160,  155,  150,  148,  134,  173,  145,  148,  155,
         141,  128,  172,  173,  157,  160,  125,  150,  170,  166,  164,
         178,  151,  167,  168,  181,  170,  164,  170,  172,  187,  200,
         195,  175,  178,  174,  198,  195,  194,  207,  226,  225,  196,
         210,  229,  203,  214,  203,  236,  195,  224,  199,  232,  221,
         256,  236,  256,  205,  261,  239,  232,  248,  228,  272,  242,
         269,  243,  276,  266,  278,  310,  291,  293,  329,  317,  356,
         322,  322,  331,  374,  375,  372,  382,  376,  387,  398,  392,
         410,  463,  528,  506,  519,  505,  527,  630,  627,  667,  728,
         859,  883,  940,  978, 1024, 1028, 1049, 1083, 1187, 1275, 1640,
        1835, 2212]))

Extracting 157-159


In [7]:
with open('/home/vahid/Downloads/data/ml/data_train.txt', 'r') as fp:
    with open('../data/data_c157-159.txt', 'w') as f1, open('../data/label_c157-159.txt', 'w') as g1:
        n = 0
        for line in fp:
            if y[0][n] >= 157 and y[0][n] <= 159:
                f1.write('%s'%line)
                g1.write('%d %d\n'%(n, y[0][n]))
            n += 1

In [ ]: