import numpy as np
import pandas
import scipy, scipy.spatial
import sklearn
import sys

from matplotlib import pyplot as plt

%matplotlib inline

y = pandas.read_table("/home/vahid/Downloads/data/ml/label_train.txt", sep=" ", dtype='int', header=None)


0 161
1 163
2 56
3 119
4 138

np.unique(y[0], return_counts=True)

In [162]:
cv_idx = np.random.choice(y.shape[0], 100000, replace=False)

with open('/home/vahid/Downloads/data/ml/data_train.txt', 'r') as fp:
    with open('../data/data_cv.txt', 'w') as f1, open('../data/label_cv.txt', 'w') as y1, \
        open('../data/data_tr.txt', 'w') as f2, open('../data/label_tr.txt', 'w') as y2:
        n = 0
        for line in fp:
            if np.any(n == cv_idx):
            n += 1

pickle.dump( r, open( "../data/sum_features.dat", "wb" ) )

In [22]:
### Cal Mean and Var. of each feature for each class

ndim = 900
def calClassStat(data_fname, y):
    """ Return a dictionary: class_label:(mean_vec, var_vec, num)
        data_fname is the file containing all the features and samples
        y: class labels for each sample
    yuniq = np.unique(y)
    for ci in yuniq:
        clstat[ci] = [np.zeros(shape=ndim), np.zeros(shape=ndim), 0]
        clstat['all'] = [np.zeros(shape=ndim), np.zeros(shape=ndim), 0]
    with open(data_fname, 'r') as fp:
        n = 0
        for line in fp:
            line = line.strip().split()
            vals = np.empty(shape=ndim, dtype=int)
            for i,v in enumerate(line):
                vals[i] = int(v)            
            label = y[n]
            assert(len(line) == ndim)
            clstat[label][0] += vals
            clstat[label][1] += vals**2
            clstat[label][2] += 1
            n += 1
    for ci in yuniq:
        clstat['all'][0] += clstat[ci][0]
        clstat['all'][1] += clstat[ci][1]
        clstat['all'][2] += clstat[ci][2]
    return (clstat)

r = calClassStat('/home/vahid/Downloads/data/ml/data_train.txt', y[0])

In [125]:
import pickle

pickle.dump( r, open( "../data/sum_features.dat", "wb" ) )

In [106]:
### Calclulate Standardized Mean Difference Between Classes

def calStandMeanDiff(y, yneg, ypos):
    sx  = np.zeros(shape=ndim, dtype=float)
    ssx = np.zeros(shape=ndim, dtype=float)

    n1 = np.sum(np.in1d(y, yneg))
    n2 = np.sum(np.in1d(y, ypos))
    sys.stderr.write("Number of samples in NegClass: %d and PosClass: %d \n"%(n1, n2))
    for yi in yneg:
        sx += r[yi][0]
        ssx += r[yi][1]
    r1_mean = sx / float(n1)
    r1_var = (ssx - 2*sx*r1_mean + r1_mean**2) / float(n1)

    sx  = np.zeros(shape=ndim, dtype=float)
    ssx = np.zeros(shape=ndim, dtype=float)
    for yi in ypos:
        sx += r[yi][0]
        ssx += r[yi][1]
    r2_mean = sx / float(n2)
    r2_var = (ssx - 2*sx*r2_mean + r2_mean**2) / float(n2)

    tot_mean = r['all'][0] / float(r['all'][2])
    tot_var  = (r['all'][1] - 2*r['all'][0]*tot_mean + tot_mean**2) / float(r['all'][2])

    rdiff = (r1_mean - r2_mean) / np.sqrt(tot_var)

    return (rdiff)

## unit test:
mean_test = calStandMeanDiff(y, np.arange(1,157), np.arange(157, 165)) 
print(np.sum(mean_test > 0.1))

Number of samples in NegClass: 477225 and PosClass: 522775 

Classify items belonging to first half (1) Second half (-1)

Finding Good Features

In [107]:
rdiff = calStandMeanDiff(y, np.arange(1,157), np.arange(157, 165)) 

## Good Features:
goodfeatures = np.where(rdiff > 0.1)[0]


Number of samples in NegClass: 477225 and PosClass: 522775 
Read a Random Sample

In [149]:
def readRandomSample(data_fname, y, size, goodfeat=None, acc_miny=None, acc_maxy=None):
    """ Read a random sample
    if goodfeat is None:
        goodfeat = np.arange(ndim)
    Xsub = np.empty(shape=(size,goodfeat.shape[0]), dtype=float)
    ysub = np.zeros(shape=size, dtype=int)

    if acc_miny is None:
        acc_miny = np.min(y)
    if acc_maxy is None:
        acc_maxy = np.max(y)
    #yuniq, ycount = np.unique(y, return_counts=True)
    #tot_acceptable = np.sum(ycount[np.where((yuniq >= acc_miny) & (yuniq <= acc_maxy))[0]])
    acceptable_indx = np.where((y>=acc_miny) & (y<=acc_maxy))[0]
    assert(acceptable_indx.shape[0] > size)
    choice_indx = np.sort(np.random.choice(acceptable_indx, size, replace=False))
    #sys.stderr.write("Total Accetables: --> %d"%(tot_acceptable))
    #proba = 1.0 - size/float(tot_acceptable)
    with open(data_fname, 'r') as fp:
        n = 0
        nf = 0
        for line in fp:
#            if (y[n] >= acc_miny and y[n]<=acc_maxy):
#                if np.random.uniform(low=0, high=1) > proba and nf < size:
            if nf < size:
                if n == choice_indx[nf]:
                    line = line.strip().split()
                    ix = -1
                    for i,v in enumerate(line):
                        if np.any(goodfeat == i):
                            ix += 1
                            Xsub[nf,ix] = int(v)
                    ysub[nf] = y[n]

                    nf += 1
            n += 1
    return(Xsub, ysub)

In [119]:
## unit testing readRandomSample()
gf_test = np.arange(21,35)
Xsub, ysub = readRandomSample('/home/vahid/Downloads/data/ml/data_train.txt', y[0], \
                              size=2000, goodfeat=gf_test, acc_miny=15, acc_maxy=20)


(2000, 14)
[15 16 17 18 19 20]

In [158]:
### Performance Evaluation
def evalPerformance(ytrue, ypred):
    tp = np.sum(ypred[np.where(ytrue ==  1)[0]] == 1)
    fp = np.sum(ypred[np.where(ytrue == -1)[0]] == 1)
    tn = np.sum(ypred[np.where(ytrue == -1)[0]] == -1)
    fn = ytrue.shape[0]-(tp+fp+tn)
    sys.stderr.write('%d %d %d %d\n'%(tp,fp,tn,fn))
    prec = tp / float(tp + fp)
    recall  = tp / float(tp + fn)
    f1score = 2*tp/float(2*tp + fp + fn)

    return (prec, recall, f1score)

In [159]:
Xsub, ysub = readRandomSample('/home/vahid/Downloads/data/ml/data_train.txt', y[0], size=20000, goodfeat=goodfeatures)

ysub[np.where(ysub <= 156)[0]] = -1
ysub[np.where(ysub  > 156)[0]] =  1

#Xsub = Xsub[:, goodfeatures]
Xsub = (Xsub - np.mean(Xsub, axis=0)) / np.std(Xsub, axis=0)


(20000, 332)

In [160]:
import sklearn.svm

ntot = Xsub.shape[0]
tr_idx = np.random.choice(ntot, size=ntot/2, replace=False)
ts_idx = np.setdiff1d(np.arange(ntot), tr_idx, assume_unique=True)
yts = ysub[ts_idx]

for c in [0.0001, 0.001, 0.01, 0.1, 1.0]:
    for gm in [0.001, 0.01, 0.1, 1.0]:
        clf = sklearn.svm.SVC(C=c, kernel='rbf', gamma=gm)[tr_idx, :], ysub[tr_idx])
        ypred = clf.predict(Xsub[ts_idx, :])
        prec, recall, f1score = evalPerformance(yts, ypred)
        print ("C=%.4f Gamma=%.4f  ==> Prec:%.3f  Recall:%.3f  F1Score:%.3f"%(c, gm, prec, recall, f1score))

5270 4730 0 0
5270 4730 0 0
C=0.0001 Gamma=0.0010  ==> Prec:0.527  Recall:1.000  F1Score:0.690
C=0.0001 Gamma=0.0100  ==> Prec:0.527  Recall:1.000  F1Score:0.690
C=0.0001 Gamma=0.1000  ==> Prec:0.527  Recall:1.000  F1Score:0.690
5270 4730 0 0
5270 4730 0 0
C=0.0001 Gamma=1.0000  ==> Prec:0.527  Recall:1.000  F1Score:0.690
C=0.0010 Gamma=0.0010  ==> Prec:0.527  Recall:1.000  F1Score:0.690
5270 4730 0 0
5270 4730 0 0
C=0.0010 Gamma=0.0100  ==> Prec:0.527  Recall:1.000  F1Score:0.690
C=0.0010 Gamma=0.1000  ==> Prec:0.527  Recall:1.000  F1Score:0.690
5270 4730 0 0
5270 4730 0 0
C=0.0010 Gamma=1.0000  ==> Prec:0.527  Recall:1.000  F1Score:0.690
C=0.0100 Gamma=0.0010  ==> Prec:0.662  Recall:0.827  F1Score:0.735
4360 2229 2501 910
2836 855 3875 2434
C=0.0100 Gamma=0.0100  ==> Prec:0.768  Recall:0.538  F1Score:0.633
C=0.0100 Gamma=0.1000  ==> Prec:0.527  Recall:1.000  F1Score:0.690
5270 4730 0 0
5270 4730 0 0
C=0.0100 Gamma=1.0000  ==> Prec:0.527  Recall:1.000  F1Score:0.690
C=0.1000 Gamma=0.0010  ==> Prec:0.699  Recall:0.789  F1Score:0.742
4160 1790 2940 1110
3351 1120 3610 1919
C=0.1000 Gamma=0.0100  ==> Prec:0.749  Recall:0.636  F1Score:0.688
C=0.1000 Gamma=0.1000  ==> Prec:0.834  Recall:0.152  F1Score:0.257
800 159 4571 4470
5270 4730 0 0
C=0.1000 Gamma=1.0000  ==> Prec:0.527  Recall:1.000  F1Score:0.690
C=1.0000 Gamma=0.0010  ==> Prec:0.710  Recall:0.786  F1Score:0.746
4140 1688 3042 1130
3666 1244 3486 1604
C=1.0000 Gamma=0.0100  ==> Prec:0.747  Recall:0.696  F1Score:0.720
C=1.0000 Gamma=0.1000  ==> Prec:0.825  Recall:0.261  F1Score:0.397
1378 293 4437 3892
5268 4730 0 2
C=1.0000 Gamma=1.0000  ==> Prec:0.527  Recall:1.000  F1Score:0.690

Learning Curve

In [151]:
## Picking the best C and gamma (C=1, gamma=0.1)

for n in [1000, 2000, 4000, 8000, 16000, 32000]:
    Xsub, ysub = readRandomSample('/home/vahid/Downloads/data/ml/data_train.txt', y[0], size=n, goodfeat=goodfeatures)

    ysub[np.where(ysub <= 156)[0]] = -1
    ysub[np.where(ysub  > 156)[0]] =  1

    #Xsub = Xsub[:, goodfeatures]
    #Xsub = (Xsub - np.mean(Xsub, axis=0)) / np.std(Xsub, axis=0)
    sys.stderr.write('\nSize = %d  ==> '%(n))
    Tp = [0,0,0,0,0]
    Fp = [0,0,0,0,0]
    Tn = [0,0,0,0,0]
    Fn = [0,0,0,0,0]
    for i in range(5):
        tr_idx = np.random.choice(n, size=n/2, replace=False)
        ts_idx = np.setdiff1d(np.arange(n), tr_idx, assume_unique=True)
        yts = ysub[ts_idx]

        clf = sklearn.svm.SVC(C=1.0, kernel='rbf', gamma=0.10)[tr_idx, :], ysub[tr_idx])
        ypred = clf.predict(Xsub[ts_idx, :])
        tp = np.sum(ypred[np.where(yts ==  1)[0]] == 1)
        fp = np.sum(ypred[np.where(yts == -1)[0]] == 1)
        tn = np.sum(ypred[np.where(yts == -1)[0]] == -1)
        Tp[i], Fp[i], Tn[i], Fn[i] = tp, fp, tn, yts.shape[0]-(tp+fp+tn)
        sys.stderr.write ("%d (%d %d %d %d)"%(i, tp, fp, tn, yts.shape[0]-(tp+fp+tn)))
    Tp_mean, Fp_mean, Tn_mean, Fn_mean = np.mean(Tp), np.mean(Fp), np.mean(Tn), np.mean(Fn)
    recall  = Tp_mean / (Tp_mean + Fn_mean)
    prec = Tp_mean / (Tp_mean + Fp_mean)
    f1score = 2*Tp_mean/(2*Tp_mean + Fp_mean + Fn_mean)
    sys.stderr.write('\nAverage: Prec=%.3f  Recall=%.3f   F1-score=%.3f\n'%(prec, recall, f1score))

Size = 1000  ==> 0 (269 231 0 0)1 (251 249 0 0)2 (265 235 0 0)3 (271 229 0 0)4 (263 237 0 0)
Average: Prec=0.528  Recall=1.000   F1-score=0.691

Size = 2000  ==> 0 (549 451 0 0)1 (549 451 0 0)2 (542 458 0 0)3 (539 461 0 0)4 (550 450 0 0)
Average: Prec=0.546  Recall=1.000   F1-score=0.706

Size = 4000  ==> 0 (234 71 886 809)1 (255 89 896 760)2 (276 77 910 737)3 (210 60 893 837)4 (295 97 898 710)
Average: Prec=0.763  Recall=0.248   F1-score=0.374

Size = 8000  ==> 0 (514 103 1773 1610)1 (527 106 1807 1560)2 (706 165 1779 1350)3 (573 133 1787 1507)4 (531 131 1799 1539)
Average: Prec=0.817  Recall=0.274   F1-score=0.410
Apply Logistic Regression

In [167]:
import sklearn.linear_model

# we create an instance of Neighbours Classifier and fit the data.
Xsub, ysub = readRandomSample('/home/vahid/Downloads/data/ml/data_train.txt', y[0], size=n, goodfeat=goodfeatures)

ysub[np.where(ysub <= 156)[0]] = -1
ysub[np.where(ysub  > 156)[0]] =  1
Xsub = (Xsub - np.mean(Xsub, axis=0)) / np.std(Xsub, axis=0)

tr_idx = np.random.choice(n, size=n/2, replace=False)
ts_idx = np.setdiff1d(np.arange(n), tr_idx, assume_unique=True)
yts = ysub[ts_idx]

for c in [1.0, 10.0, 100.0, 1000.0, 10000.0]:
    logreg = sklearn.linear_model.LogisticRegression(C=c)[tr_idx,:], ysub[tr_idx])
    ypred = logreg.predict(Xsub[ts_idx])
    prec, recall, f1score = evalPerformance(yts, ypred)
    print ("C=%.4f ==> Prec:%.3f  Recall:%.3f  F1Score:%.3f"%(c, prec, recall, f1score))

4146 1809 2912 1133
4146 1808 2913 1133
C=1.0000 Gamma=1.0000  ==> Prec:0.696  Recall:0.785  F1Score:0.738
C=10.0000 Gamma=1.0000  ==> Prec:0.696  Recall:0.785  F1Score:0.738
C=100.0000 Gamma=1.0000  ==> Prec:0.696  Recall:0.785  F1Score:0.738
4145 1808 2913 1134
4145 1808 2913 1134
C=1000.0000 Gamma=1.0000  ==> Prec:0.696  Recall:0.785  F1Score:0.738
C=10000.0000 Gamma=1.0000  ==> Prec:0.696  Recall:0.785  F1Score:0.738
4145 1808 2913 1134

Classify subclasses 156:164

positive (+1): y==163:164

negative (-1): 156 <= y <= 162

Find good features

In [110]:
rdiff2 = calStandMeanDiff(y, np.arange(157,162), np.arange(162, 165))

print(np.sum(rdiff2 > 0.1))
goodfeatures_cs2 = np.where(rdiff2 > 0.1)[0]


Number of samples in NegClass: 235201 and PosClass: 287574 
In [67]:
Xsub, ysub = readRandomSample('/home/vahid/Downloads/data/ml/data_train.txt', y[0], size=20000, acc_miny=156, acc_maxy=164)

ysub[np.where(ysub <= 162)[0]] = -1
ysub[np.where(ysub  > 162)[0]] =  1

Xsub = Xsub[:, goodfeatures]
Xsub = (Xsub - np.mean(Xsub)) / np.std(Xsub)


ntot = Xsub.shape[0]
tr_idx = np.random.choice(ntot, size=ntot/2, replace=False)
ts_idx = np.setdiff1d(np.arange(ntot), tr_idx, assume_unique=True)
yts = ysub[ts_idx]

for c in [0.0001, 0.001, 0.01, 0.1, 1.0]:
    for gm in [0.001, 0.01, 0.1, 1.0]:
        clf = sklearn.svm.SVC(C=c, kernel='rbf', gamma=gm)[tr_idx, :], ysub[tr_idx])
        ypred = clf.predict(Xsub[ts_idx, :])
        tp = np.sum(ypred[np.where(yts ==  1)[0]] == 1)
        fp = np.sum(ypred[np.where(yts == -1)[0]] == 1)
        tn = np.sum(ypred[np.where(yts == -1)[0]] == -1)
        print ("C=%.4f Gamma=%.4f  ==> TP:%d  FP:%d  TN:%d FN:%d"%(c, gm, tp, fp, tn, yts.shape[0]-(tp+fp+tn)))

In [135]:
a = ['1', '2', '3']


array([1, 2, 3])

In [137]:
Xtest = pandas.read_table('/home/vahid/Downloads/data/ml/data_test.txt', sep=" ", usecols=goodfeatures, dtype='int', header=None)

(262102, 332)

Extracting the Lower Half ($y \in [1..156]$)

In [11]:
yuniq,ycount = np.unique(y[0][np.where(y[0]<=156)[0]], return_counts=True)


In [12]:
with open('/home/vahid/Downloads/data/ml/data_train.txt', 'r') as fp:
    with open('../data/data_tr.lower.txt', 'w') as f1, open('../data/label_tr.lower.txt', 'w') as g1, \
         open('../data/data_cv.lower.txt', 'w') as f2, open('../data/label_cv.lower.txt', 'w') as g2:
        n = 0
        for line in fp:
            if y[0][n] <= 156:
                if np.random.uniform(low=0, high=1, size=None) > 0.1:
                    g1.write('%d %d\n'%(n, y[0][n]))
                    g2.write('%d %d\n'%(n, y[0][n]))               
            n += 1

In [4]:
ylow = pandas.read_table("../data/label_tr.lower.txt", sep=" ", dtype='int', header=None)

np.unique(ylow[1], return_counts=True)

In [8]:
ylow = pandas.read_table("../data/label_cv.lower.txt", sep=" ", dtype='int', header=None)

np.unique(ylow[1], return_counts=True)

Extracting 157-159

In [7]:
with open('/home/vahid/Downloads/data/ml/data_train.txt', 'r') as fp:
    with open('../data/data_c157-159.txt', 'w') as f1, open('../data/label_c157-159.txt', 'w') as g1:
        n = 0
        for line in fp:
            if y[0][n] >= 157 and y[0][n] <= 159:
                g1.write('%d %d\n'%(n, y[0][n]))
            n += 1

