In [1]:
import numpy as np
import pandas
import scipy, scipy.spatial
import sklearn
import sys

from matplotlib import pyplot as plt

%matplotlib inline

In [2]:
y = pandas.read_table("~/Downloads/data/ml/label_train.txt", sep=" ", dtype='int', header=None)

ndim= 900
y.head()


Out[2]:
0
0 161
1 163
2 56
3 119
4 138

In [3]:
np.unique(y[0], return_counts=True)


Out[3]:
(array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
         14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
         27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
         40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
         53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
         66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
         79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
         92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
        105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
        118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
        131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
        144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156,
        157, 158, 159, 160, 161, 162, 163, 164]),
 array([  1263,   1261,   1255,   1256,   1252,   1235,   1240,   1264,
          1256,   1281,   1245,   1278,   1278,   1253,   1255,   1255,
          1291,   1277,   1308,   1285,   1322,   1309,   1318,   1322,
          1327,   1339,   1361,   1361,   1335,   1396,   1359,   1393,
          1373,   1356,   1398,   1416,   1386,   1398,   1396,   1404,
          1430,   1398,   1416,   1406,   1420,   1445,   1433,   1445,
          1454,   1451,   1481,   1482,   1477,   1474,   1478,   1486,
          1512,   1492,   1557,   1557,   1548,   1530,   1574,   1582,
          1606,   1611,   1666,   1650,   1704,   1739,   1735,   1743,
          1728,   1796,   1737,   1810,   1822,   1864,   1847,   1838,
          1857,   1913,   1910,   1917,   2006,   1992,   2033,   2063,
          2072,   2063,   2096,   2128,   2134,   2206,   2215,   2212,
          2258,   2279,   2287,   2319,   2356,   2435,   2438,   2491,
          2486,   2485,   2502,   2555,   2594,   2629,   2575,   2587,
          2777,   2875,   2897,   2884,   2978,   3087,   3179,   3368,
          3388,   3421,   3409,   3453,   3536,   3586,   3615,   3696,
          3821,   3802,   3934,   4059,   4069,   4253,   4819,   4939,
          5038,   5259,   5310,   6080,   6487,   6623,   7256,   8279,
          9069,   9221,   9707,   9998,  10557,  10645,  11484,  12382,
         12858,  16548,  18562,  21943,  30679,  34092,  45439,  60513,
         64478,  65211,  92241, 130122]))

In [4]:
yuniq,ycount = np.unique(y[0], return_counts=True)

print(np.sum(ycount[np.where(np.in1d(yuniq, range(162, 164)))[0]]))
print(np.sum(ycount[np.where(np.in1d(yuniq, range(164, 165)))[0]]))


157452
130122

In [5]:
import pickle

cstat = pickle.load(open( "../data/sum_features.dat", "rb" ) )

In [18]:
### Calclulate Standardized Mean Difference Between Classes

def calStandMeanDiff(y, cstat, yneg, ypos):
    sx  = np.zeros(shape=ndim, dtype=float)
    ssx = np.zeros(shape=ndim, dtype=float)


    n1 = np.sum(np.in1d(y, yneg))
    n2 = np.sum(np.in1d(y, ypos))
    sys.stderr.write("Number of samples in NegClass: %d and PosClass: %d \n"%(n1, n2))

    for yi in yneg:
        sx += cstat[yi][0]
        ssx += cstat[yi][1]
    r1_mean = sx / float(n1)
    r1_var = (ssx - 2*sx*r1_mean + r1_mean**2) / float(n1)

    sx  = np.zeros(shape=ndim, dtype=float)
    ssx = np.zeros(shape=ndim, dtype=float)
    for yi in ypos:
        sx += cstat[yi][0]
        ssx += cstat[yi][1]
    r2_mean = sx / float(n2)
    r2_var = (ssx - 2*sx*r2_mean + r2_mean**2) / float(n2)

    tot_mean = cstat['all'][0] / float(cstat['all'][2])
    tot_var  = (cstat['all'][1] - 2*cstat['all'][0]*tot_mean + tot_mean**2) / float(cstat['all'][2])

    rdiff = (r1_mean - r2_mean) / np.sqrt(tot_var)

    return (rdiff)


## unit test:
mean_test = calStandMeanDiff(y, cstat, np.arange(162,164), np.arange(164, 165)) 
print(np.sum(mean_test > 0.1))


342
Number of samples in NegClass: 157452 and PosClass: 130122 

Classify items belonging to first half (1) Second half (-1)

Finding Good Features


In [19]:
rdiff = calStandMeanDiff(y, cstat, np.arange(162,164), np.arange(164, 165))


## Good Features:
goodfeatures = np.where(rdiff > 0.1)[0]

goodfeatures


Number of samples in NegClass: 157452 and PosClass: 130122 
Out[19]:
array([  4,   6,   7,  11,  13,  16,  18,  20,  21,  22,  29,  31,  33,
        36,  44,  46,  50,  54,  59,  63,  64,  68,  70,  73,  75,  77,
        78,  81,  83,  86,  90,  95,  98, 101, 104, 106, 107, 108, 109,
       113, 114, 115, 117, 123, 124, 125, 126, 127, 130, 131, 134, 138,
       142, 144, 147, 150, 154, 155, 157, 160, 162, 164, 172, 176, 177,
       182, 183, 187, 189, 190, 194, 201, 203, 205, 207, 210, 211, 212,
       218, 227, 231, 235, 236, 238, 239, 241, 244, 247, 248, 249, 250,
       251, 253, 259, 260, 264, 268, 270, 273, 275, 276, 277, 280, 283,
       284, 292, 297, 298, 299, 303, 304, 305, 307, 309, 313, 317, 319,
       321, 323, 324, 326, 327, 333, 339, 342, 349, 351, 356, 357, 361,
       362, 366, 368, 370, 376, 378, 380, 383, 384, 385, 390, 391, 392,
       393, 394, 403, 406, 409, 410, 411, 415, 419, 420, 424, 426, 427,
       431, 433, 434, 435, 438, 439, 444, 446, 448, 450, 453, 454, 456,
       457, 458, 460, 463, 468, 469, 471, 472, 475, 477, 478, 479, 480,
       483, 486, 488, 489, 491, 494, 499, 503, 505, 506, 512, 513, 514,
       516, 517, 518, 519, 520, 521, 523, 525, 526, 531, 534, 535, 536,
       538, 541, 543, 546, 550, 551, 553, 554, 555, 557, 558, 559, 560,
       562, 565, 572, 573, 574, 575, 580, 581, 583, 586, 587, 591, 592,
       593, 596, 598, 599, 604, 606, 611, 616, 618, 621, 623, 627, 630,
       637, 638, 639, 642, 647, 649, 655, 657, 663, 664, 668, 670, 672,
       675, 677, 678, 682, 684, 685, 687, 688, 691, 693, 694, 695, 696,
       697, 700, 707, 709, 713, 715, 728, 729, 736, 737, 739, 745, 747,
       749, 756, 765, 768, 771, 772, 777, 783, 787, 793, 794, 795, 799,
       804, 807, 808, 809, 810, 813, 814, 817, 819, 820, 823, 826, 829,
       830, 835, 837, 838, 839, 843, 847, 850, 851, 853, 854, 856, 860,
       862, 863, 865, 866, 868, 872, 873, 874, 880, 882, 883, 888, 893,
       895, 897, 898, 899])

Read a Random Sample


In [6]:
def readRandomSample(data_fname, y, size, goodfeat=None, acc_miny=None, acc_maxy=None):
    """ Read a random sample
    """
    if goodfeat is None:
        goodfeat = np.arange(ndim)
    Xsub = np.empty(shape=(size,goodfeat.shape[0]), dtype=float)
    ysub = np.zeros(shape=size, dtype=int)

    if acc_miny is None:
        acc_miny = np.min(y)
    if acc_maxy is None:
        acc_maxy = np.max(y)
        
    #yuniq, ycount = np.unique(y, return_counts=True)
    #tot_acceptable = np.sum(ycount[np.where((yuniq >= acc_miny) & (yuniq <= acc_maxy))[0]])
    
    acceptable_indx = np.where((y>=acc_miny) & (y<=acc_maxy))[0]
    assert(acceptable_indx.shape[0] > size)
    choice_indx = np.sort(np.random.choice(acceptable_indx, size, replace=False))
    #print(choice_indx.shape)
    #sys.stderr.write("Total Accetables: --> %d"%(tot_acceptable))
    
    #proba = 1.0 - size/float(tot_acceptable)
    
        
    with open(data_fname, 'r') as fp:
        n = 0
        nf = 0
        for line in fp:
#            if (y[n] >= acc_miny and y[n]<=acc_maxy):
#                if np.random.uniform(low=0, high=1) > proba and nf < size:
            if nf < size:
                if n == choice_indx[nf]:
                    line = line.strip().split()
                    ix = -1
                    for i,v in enumerate(line):
                        if np.any(goodfeat == i):
                            ix += 1
                            Xsub[nf,ix] = int(v)
                    ysub[nf] = y[n]

                    nf += 1
            n += 1
    return(Xsub, ysub)

In [20]:
## unit testing readRandomSample()
gf_test = np.arange(18,27)
Xsub, ysub = readRandomSample('/home/vahid/Downloads/data/ml/data_train.txt', y[0], \
                              size=2000, goodfeat=gf_test, acc_miny=15, acc_maxy=20)

print(Xsub.shape)
print(np.unique(ysub))


(2000, 9)
[15 16 17 18 19 20]

In [21]:
### Performance Evaluation
def evalPerformance(ytrue, ypred):
    tp = np.sum(ypred[np.where(ytrue ==  1)[0]] == 1)
    fp = np.sum(ypred[np.where(ytrue == -1)[0]] == 1)
    tn = np.sum(ypred[np.where(ytrue == -1)[0]] == -1)
    fn = ytrue.shape[0]-(tp+fp+tn)
    #sys.stderr.write('%d %d %d %d\n'%(tp,fp,tn,fn))
    prec = tp / float(tp + fp)
    recall  = tp / float(tp + fn)
    f1score = 2*tp/float(2*tp + fp + fn)

    return (prec, recall, f1score)

In [23]:
Xsub, ysub = readRandomSample('/home/vahid/Downloads/data/ml/data_train.txt', y[0], size=200, \
                              goodfeat=goodfeatures, acc_miny=162, acc_maxy=164)

assert(np.sum(ysub < 162) == 0)
ysub[np.where(ysub < 164)[0]] = -1
ysub[np.where(ysub >= 164)[0]] =  1

print(np.sum(ysub == -1), np.sum(ysub==1))

#Xsub = Xsub[:, goodfeatures]
features_idx = np.where(np.std(Xsub, axis=0)> 0.001)[0]
print("Number of Good Features: %d"%features_idx.shape[0])

Xsub = Xsub[:,features_idx]

Xsub = (Xsub - np.mean(Xsub, axis=0)) / np.std(Xsub, axis=0)

Xsub.shape


(105, 95)
Number of Good Features: 342
Out[23]:
(200, 342)

Grid-Search (coarse)


In [13]:
import sklearn.svm

ntot = Xsub.shape[0]
tr_idx = np.random.choice(ntot, size=ntot/2, replace=False)
ts_idx = np.setdiff1d(np.arange(ntot), tr_idx, assume_unique=True)
yts = ysub[ts_idx]

for c in [0.0001, 0.001, 0.01, 0.1, 1.0]:
    for gm in [0.001, 0.01, 0.1, 1.0]:
        clf = sklearn.svm.SVC(C=c, kernel='rbf', gamma=gm)
        clf.fit(Xsub[tr_idx, :], ysub[tr_idx])
        ypred = clf.predict(Xsub[ts_idx, :])
        prec, recall, f1score = evalPerformance(yts, ypred)
        print ("C=%.4f Gamma=%.4f  ==> Prec:%.3f  Recall:%.3f  F1Score:%.3f"%(c, gm, prec, recall, f1score))


C=0.0001 Gamma=0.0010  ==> Prec:nan  Recall:0.000  F1Score:0.000
C=0.0001 Gamma=0.0100  ==> Prec:nan  Recall:0.000  F1Score:0.000
C=0.0001 Gamma=0.1000  ==> Prec:nan  Recall:0.000  F1Score:0.000
C=0.0001 Gamma=1.0000  ==> Prec:nan  Recall:0.000  F1Score:0.000
C=0.0010 Gamma=0.0010  ==> Prec:nan  Recall:0.000  F1Score:0.000
C=0.0010 Gamma=0.0100  ==> Prec:nan  Recall:0.000  F1Score:0.000
C=0.0010 Gamma=0.1000  ==> Prec:nan  Recall:0.000  F1Score:0.000
C=0.0010 Gamma=1.0000  ==> Prec:nan  Recall:0.000  F1Score:0.000
C=0.0100 Gamma=0.0010  ==> Prec:0.666  Recall:0.512  F1Score:0.579
C=0.0100 Gamma=0.0100  ==> Prec:0.648  Recall:0.181  F1Score:0.283
C=0.0100 Gamma=0.1000  ==> Prec:nan  Recall:0.000  F1Score:0.000
C=0.0100 Gamma=1.0000  ==> Prec:nan  Recall:0.000  F1Score:0.000
C=0.1000 Gamma=0.0010  ==> Prec:0.715  Recall:0.698  F1Score:0.707
C=0.1000 Gamma=0.0100  ==> Prec:0.727  Recall:0.445  F1Score:0.552
C=0.1000 Gamma=0.1000  ==> Prec:nan  Recall:0.000  F1Score:0.000
C=0.1000 Gamma=1.0000  ==> Prec:nan  Recall:0.000  F1Score:0.000
C=1.0000 Gamma=0.0010  ==> Prec:0.730  Recall:0.786  F1Score:0.757
C=1.0000 Gamma=0.0100  ==> Prec:0.786  Recall:0.601  F1Score:0.681
C=1.0000 Gamma=0.1000  ==> Prec:0.763  Recall:0.010  F1Score:0.020
C=1.0000 Gamma=1.0000  ==> Prec:nan  Recall:0.000  F1Score:0.000

In [14]:
import sklearn.svm

ntot = Xsub.shape[0]
tr_idx = np.random.choice(ntot, size=ntot/2, replace=False)
ts_idx = np.setdiff1d(np.arange(ntot), tr_idx, assume_unique=True)
yts = ysub[ts_idx]

for c in [0.2, 0.5, 1, 1.5, 2, 5, 10]:
    for gm in [0.0005, 0.0005, 0.001, 0.0015, 0.002, 0.005]:
        clf = sklearn.svm.SVC(C=c, kernel='rbf', gamma=gm)
        clf.fit(Xsub[tr_idx, :], ysub[tr_idx])
        ypred = clf.predict(Xsub[ts_idx, :])
        prec, recall, f1score = evalPerformance(yts, ypred)
        print ("C=%.4f Gamma=%.4f  ==> Prec:%.3f  Recall:%.3f  F1Score:%.3f"%(c, gm, prec, recall, f1score))


C=0.2000 Gamma=0.0005  ==> Prec:0.720  Recall:0.697  F1Score:0.708
C=0.2000 Gamma=0.0005  ==> Prec:0.720  Recall:0.697  F1Score:0.708
C=0.2000 Gamma=0.0010  ==> Prec:0.728  Recall:0.724  F1Score:0.726
C=0.2000 Gamma=0.0015  ==> Prec:0.737  Recall:0.728  F1Score:0.732
C=0.2000 Gamma=0.0020  ==> Prec:0.742  Recall:0.725  F1Score:0.733
C=0.2000 Gamma=0.0050  ==> Prec:0.752  Recall:0.644  F1Score:0.694
C=0.5000 Gamma=0.0005  ==> Prec:0.724  Recall:0.735  F1Score:0.730
C=0.5000 Gamma=0.0005  ==> Prec:0.724  Recall:0.735  F1Score:0.730
C=0.5000 Gamma=0.0010  ==> Prec:0.738  Recall:0.750  F1Score:0.744
C=0.5000 Gamma=0.0015  ==> Prec:0.746  Recall:0.757  F1Score:0.752
C=0.5000 Gamma=0.0020  ==> Prec:0.753  Recall:0.754  F1Score:0.754
C=0.5000 Gamma=0.0050  ==> Prec:0.770  Recall:0.695  F1Score:0.731
C=1.0000 Gamma=0.0005  ==> Prec:0.729  Recall:0.750  F1Score:0.739
C=1.0000 Gamma=0.0005  ==> Prec:0.729  Recall:0.750  F1Score:0.739
C=1.0000 Gamma=0.0010  ==> Prec:0.740  Recall:0.773  F1Score:0.756
C=1.0000 Gamma=0.0015  ==> Prec:0.749  Recall:0.777  F1Score:0.763
C=1.0000 Gamma=0.0020  ==> Prec:0.755  Recall:0.776  F1Score:0.765
C=1.0000 Gamma=0.0050  ==> Prec:0.774  Recall:0.728  F1Score:0.751
C=1.5000 Gamma=0.0005  ==> Prec:0.732  Recall:0.759  F1Score:0.745
C=1.5000 Gamma=0.0005  ==> Prec:0.732  Recall:0.759  F1Score:0.745
C=1.5000 Gamma=0.0010  ==> Prec:0.743  Recall:0.779  F1Score:0.760
C=1.5000 Gamma=0.0015  ==> Prec:0.750  Recall:0.787  F1Score:0.768
C=1.5000 Gamma=0.0020  ==> Prec:0.756  Recall:0.788  F1Score:0.772
C=1.5000 Gamma=0.0050  ==> Prec:0.778  Recall:0.738  F1Score:0.758
C=2.0000 Gamma=0.0005  ==> Prec:0.731  Recall:0.764  F1Score:0.747
C=2.0000 Gamma=0.0005  ==> Prec:0.731  Recall:0.764  F1Score:0.747
C=2.0000 Gamma=0.0010  ==> Prec:0.745  Recall:0.785  F1Score:0.765
C=2.0000 Gamma=0.0015  ==> Prec:0.751  Recall:0.793  F1Score:0.771
C=2.0000 Gamma=0.0020  ==> Prec:0.754  Recall:0.792  F1Score:0.773
C=2.0000 Gamma=0.0050  ==> Prec:0.780  Recall:0.733  F1Score:0.756
C=5.0000 Gamma=0.0005  ==> Prec:0.736  Recall:0.786  F1Score:0.760
C=5.0000 Gamma=0.0005  ==> Prec:0.736  Recall:0.786  F1Score:0.760
C=5.0000 Gamma=0.0010  ==> Prec:0.749  Recall:0.802  F1Score:0.775
C=5.0000 Gamma=0.0015  ==> Prec:0.750  Recall:0.800  F1Score:0.774
C=5.0000 Gamma=0.0020  ==> Prec:0.757  Recall:0.791  F1Score:0.773
C=5.0000 Gamma=0.0050  ==> Prec:0.793  Recall:0.718  F1Score:0.753
C=10.0000 Gamma=0.0005  ==> Prec:0.741  Recall:0.797  F1Score:0.768
C=10.0000 Gamma=0.0005  ==> Prec:0.741  Recall:0.797  F1Score:0.768
C=10.0000 Gamma=0.0010  ==> Prec:0.746  Recall:0.802  F1Score:0.773
C=10.0000 Gamma=0.0015  ==> Prec:0.754  Recall:0.790  F1Score:0.771
C=10.0000 Gamma=0.0020  ==> Prec:0.765  Recall:0.776  F1Score:0.770
C=10.0000 Gamma=0.0050  ==> Prec:0.794  Recall:0.713  F1Score:0.751

In [ ]: