In [1]:
    
import numpy as np
import pandas
import scipy, scipy.spatial
import sklearn
import sys
from matplotlib import pyplot as plt
%matplotlib inline
    
In [27]:
    
# Number of feature columns in the training matrix; read as a global by helper functions below.
ndim = 900
# Training labels: one integer class per row, space-separated, no header line.
y = pandas.read_table("~/Downloads/data/ml/label_train.txt", header=None, sep=" ", dtype='int')
y.head()
    
    Out[27]:
In [3]:
    
# Distinct values in label column 0 together with how many rows carry each value.
np.unique(y[0], return_counts=True)
    
    Out[3]:
In [25]:
    
# Count rows per class, then total the two label groups that are compared
# later in the notebook: classes 157-161 and classes 162-164.
yuniq, ycount = np.unique(y[0], return_counts=True)
# np.isin supersedes np.in1d (deprecated since NumPy 2.0); its boolean mask
# indexes ycount directly, so no np.where round-trip is needed.
print(np.sum(ycount[np.isin(yuniq, np.arange(157, 162))]))
print(np.sum(ycount[np.isin(yuniq, np.arange(162, 165))]))
    
    
In [22]:
    
import pickle
# Load the cached statistics that calStandMeanDiff() indexes by class label.
# NOTE(review): pickle.load can execute arbitrary code; only load files you created yourself.
# The context manager closes the handle instead of leaking it.
with open("../data/sum_features.dat", "rb") as stats_file:
    cstat = pickle.load(stats_file)
    
In [125]:
    
import pickle
# Write `r` to the same file that the loading cell reads back as `cstat`.
# NOTE(review): `r` is not defined in any visible cell — confirm where it is computed.
# Closing the handle via the context manager guarantees the pickle is flushed to disk.
with open("../data/sum_features.dat", "wb") as stats_file:
    pickle.dump(r, stats_file)
    
In [29]:
    
### Calculate Standardized Mean Difference Between Classes
def calStandMeanDiff(y, cstat, yneg, ypos):
    """Per-feature standardized mean difference between two groups of classes.

    Parameters
    ----------
    y : array-like
        Class label of every sample; only used to count how many samples
        belong to each group.
    cstat : mapping
        Indexed by class label, plus the key 'all'. Element [0] of an entry is
        read as the per-feature sum and element [1] as the per-feature sum of
        squares; cstat['all'][2] is read as the total number of samples.
    yneg, ypos : iterable
        Class labels forming the negative and the positive group.

    Returns
    -------
    numpy.ndarray
        (mean of the negative group - mean of the positive group) divided by
        the standard deviation over all samples, one value per feature.
    """
    n1 = np.sum(np.isin(y, yneg))
    n2 = np.sum(np.isin(y, ypos))
    sys.stderr.write("Number of samples in NegClass: %d and PosClass: %d \n"%(n1, n2))

    # Group mean = (sum of the per-class feature sums) / (samples in the group).
    # Summing the stored vectors directly removes the need for a global `ndim`.
    neg_sum = np.sum([np.asarray(cstat[yi][0], dtype=float) for yi in yneg], axis=0)
    pos_sum = np.sum([np.asarray(cstat[yi][0], dtype=float) for yi in ypos], axis=0)
    r1_mean = neg_sum / float(n1)
    r2_mean = pos_sum / float(n2)

    n_all = float(cstat['all'][2])
    tot_mean = np.asarray(cstat['all'][0], dtype=float) / n_all
    # Var[x] = E[x^2] - E[x]^2.  The earlier expansion
    # (ssx - 2*sx*mean + mean**2) / n lost the factor n on the mean**2 term,
    # which underestimates the variance and can even make it negative.
    tot_var = np.asarray(cstat['all'][1], dtype=float) / n_all - tot_mean**2

    rdiff = (r1_mean - r2_mean) / np.sqrt(tot_var)
    return (rdiff)
## unit test: number of features whose standardized difference exceeds 0.1
mean_test = calStandMeanDiff(y, cstat, yneg=np.arange(157, 162), ypos=np.arange(162, 165))
print((mean_test > 0.1).sum())
    
    
    
In [30]:
    
# Standardized mean difference of classes 157-161 versus classes 162-164.
rdiff = calStandMeanDiff(y, cstat, yneg=np.arange(157, 162), ypos=np.arange(162, 165))
## Good Features: column indices whose difference is above 0.1
# NOTE(review): only positive differences pass this threshold; strongly
# negative ones are dropped — confirm that is intended.
goodfeatures = np.nonzero(rdiff > 0.1)[0]
goodfeatures
    
    
    Out[30]:
In [31]:
    
def readRandomSample(data_fname, y, size, goodfeat=None, acc_miny=None, acc_maxy=None):
    """Draw a random subset of rows from a whitespace-separated text matrix.

    Rows whose label lies in [acc_miny, acc_maxy] are eligible; `size` of them
    are chosen without replacement and returned in file order.

    Parameters
    ----------
    data_fname : str
        Path of the text file; line n holds the integer features of sample n.
    y : array-like
        Label of every row, indexable by row number (``y[n]``).
    size : int
        Number of rows to draw.
    goodfeat : numpy.ndarray, optional
        Column indices to keep. Defaults to ``np.arange(ndim)`` using the
        module-level ``ndim``. Kept columns appear in ascending file order.
    acc_miny, acc_maxy : optional
        Inclusive label bounds; default to the minimum / maximum of ``y``.

    Returns
    -------
    (Xsub, ysub)
        Float array of shape (size, len(goodfeat)) and the matching int labels.

    Raises
    ------
    AssertionError
        If fewer than `size` rows are eligible.
    ValueError
        If the file ends before every chosen row has been read.
    """
    if goodfeat is None:
        goodfeat = np.arange(ndim)
    Xsub = np.empty(shape=(size, goodfeat.shape[0]), dtype=float)
    ysub = np.zeros(shape=size, dtype=int)
    if acc_miny is None:
        acc_miny = np.min(y)
    if acc_maxy is None:
        acc_maxy = np.max(y)

    acceptable_indx = np.where((y >= acc_miny) & (y <= acc_maxy))[0]
    # Sampling without replacement is valid when size equals the number of
    # eligible rows, so that boundary case is accepted as well.
    assert acceptable_indx.shape[0] >= size, \
        "only %d eligible rows for a sample of %d" % (acceptable_indx.shape[0], size)
    # Sorted so the file can be consumed in a single forward pass.
    choice_indx = np.sort(np.random.choice(acceptable_indx, size, replace=False))

    # Set membership replaces a full array comparison for every value read.
    wanted_cols = set(np.asarray(goodfeat).tolist())

    nf = 0
    with open(data_fname, 'r') as fp:
        for n, line in enumerate(fp):
            if nf >= size:
                # Every requested row is collected; skip the rest of the file.
                break
            if n != choice_indx[nf]:
                continue
            kept = [int(v) for i, v in enumerate(line.split()) if i in wanted_cols]
            Xsub[nf, :len(kept)] = kept
            ysub[nf] = y[n]
            nf += 1

    if nf < size:
        # Xsub starts uninitialised, so returning here would hand back garbage rows.
        raise ValueError("%s ended after %d of the %d requested rows"
                         % (data_fname, nf, size))
    return (Xsub, ysub)
    
In [33]:
    
## unit testing readRandomSample(): 2000 rows with labels 15..20, columns 18..26
gf_test = np.arange(18, 27)
Xsub, ysub = readRandomSample(
    '/home/vahid/Downloads/data/ml/data_train.txt', y[0],
    size=2000, goodfeat=gf_test, acc_miny=15, acc_maxy=20)
# Expect a (2000, 9) matrix and only labels inside the requested range.
print(Xsub.shape)
print(np.unique(ysub))
    
    
In [37]:
    
### Performance Evaluation
def evalPerformance(ytrue, ypred):
    """Precision, recall and F1 score for labels coded as +1 / -1.

    Both arguments are numpy arrays; the result is the tuple
    (precision, recall, f1score).
    """
    pos_rows = np.nonzero(ytrue == 1)[0]
    neg_rows = np.nonzero(ytrue == -1)[0]
    pred_on_pos = ypred[pos_rows]
    pred_on_neg = ypred[neg_rows]

    true_pos = (pred_on_pos == 1).sum()
    false_pos = (pred_on_neg == 1).sum()
    true_neg = (pred_on_neg == -1).sum()
    # Whatever is left of the samples is counted as a false negative.
    false_neg = ytrue.shape[0] - (true_pos + false_pos + true_neg)

    precision = true_pos / float(true_pos + false_pos)
    sensitivity = true_pos / float(true_pos + false_neg)
    f_one = (2 * true_pos) / float(2 * true_pos + false_pos + false_neg)
    return (precision, sensitivity, f_one)
    
In [41]:
    
# Draw 20000 rows with labels 157..164, keeping only the selected feature columns.
Xsub, ysub = readRandomSample('/home/vahid/Downloads/data/ml/data_train.txt', y[0],
                              size=20000, goodfeat=goodfeatures,
                              acc_miny=157, acc_maxy=164)
assert not np.any(ysub < 157)
# Binarize the labels: below 162 becomes -1, 162 and above becomes +1.
ysub = np.where(ysub < 162, -1, 1)
print((ysub == -1).sum(), (ysub == 1).sum())
# Standardize each column to zero mean and unit standard deviation.
Xsub = (Xsub - Xsub.mean(axis=0)) / Xsub.std(axis=0)
Xsub.shape
    
    
    Out[41]:
In [38]:
    
import sklearn.svm
# Random half/half split of the sampled rows into training and test indices.
# NOTE(review): no random seed is set, so the split differs on every run.
ntot = Xsub.shape[0]
# Floor division keeps `size` an integer; `ntot/2` is a float under Python 3
# and np.random.choice rejects it with a TypeError.
tr_idx = np.random.choice(ntot, size=ntot // 2, replace=False)
ts_idx = np.setdiff1d(np.arange(ntot), tr_idx, assume_unique=True)
yts = ysub[ts_idx]
# Coarse grid over the RBF-SVM hyper-parameters C and gamma.
for c in [0.0001, 0.001, 0.01, 0.1, 1.0]:
    for gm in [0.001, 0.01, 0.1, 1.0]:
        clf = sklearn.svm.SVC(C=c, kernel='rbf', gamma=gm)
        clf.fit(Xsub[tr_idx, :], ysub[tr_idx])
        ypred = clf.predict(Xsub[ts_idx, :])
        prec, recall, f1score = evalPerformance(yts, ypred)
        print ("C=%.4f Gamma=%.4f  ==> Prec:%.3f  Recall:%.3f  F1Score:%.3f"%(c, gm, prec, recall, f1score))
    
    
In [40]:
    
import sklearn.svm
# Fresh random half/half split of the sampled rows into training and test indices.
# NOTE(review): no random seed is set, so the split differs on every run.
ntot = Xsub.shape[0]
# Floor division keeps `size` an integer; `ntot/2` is a float under Python 3
# and np.random.choice rejects it with a TypeError.
tr_idx = np.random.choice(ntot, size=ntot // 2, replace=False)
ts_idx = np.setdiff1d(np.arange(ntot), tr_idx, assume_unique=True)
yts = ysub[ts_idx]
# Finer grid over the RBF-SVM hyper-parameters C and gamma.
for c in [0.05, 0.10, 0.15, 0.25, 0.5, 0.8]:
    for gm in [0.0005, 0.001, 0.0015, 0.002, 0.005]:
        clf = sklearn.svm.SVC(C=c, kernel='rbf', gamma=gm)
        clf.fit(Xsub[tr_idx, :], ysub[tr_idx])
        ypred = clf.predict(Xsub[ts_idx, :])
        prec, recall, f1score = evalPerformance(yts, ypred)
        print ("C=%.4f Gamma=%.4f  ==> Prec:%.3f  Recall:%.3f  F1Score:%.3f"%(c, gm, prec, recall, f1score))
    
    
In [ ]: