In [1]:
import numpy as np
import pandas
import scipy, scipy.spatial
import sklearn
import sys
from matplotlib import pyplot as plt
%matplotlib inline
In [3]:
y = pandas.read_table("/home/vahid/Downloads/data/ml/label_train.txt", sep=" ", dtype='int', header=None)
y.head()
Out[3]:
In [3]:
np.unique(y[0], return_counts=True)
Out[3]:
In [162]:
## Hold out 100,000 random samples as a cross-validation set,
## streaming the (large) training file line by line.
cv_idx = np.random.choice(y.shape[0], 100000, replace=False)
cv_set = set(cv_idx)   # O(1) membership test per line
with open('/home/vahid/Downloads/data/ml/data_train.txt', 'r') as fp:
    with open('../data/data_cv.txt', 'w') as f1, open('../data/label_cv.txt', 'w') as y1, \
         open('../data/data_tr.txt', 'w') as f2, open('../data/label_tr.txt', 'w') as y2:
        n = 0
        for line in fp:
            if n in cv_set:
                f1.write('%s' % line)
                y1.write('%d\n' % y[0][n])
            else:
                f2.write('%s' % line)
                y2.write('%d\n' % y[0][n])
            n += 1
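For a dataset that fits in memory, the same hold-out split could be done in one call with scikit-learn; a minimal sketch on synthetic stand-in arrays (the real file above is streamed precisely because it is too large to load at once, and the names X_demo / y_demo are illustrative only):
In [ ]:
from sklearn.model_selection import train_test_split
# Synthetic stand-in data, same shape conventions as the real set (900 features).
X_demo = np.random.randint(0, 10, size=(1000, 900))
y_demo = np.random.randint(1, 165, size=1000)
X_tr, X_cv, y_tr, y_cv = train_test_split(X_demo, y_demo, test_size=0.1)
print(X_tr.shape, X_cv.shape)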
In [22]:
### Calculate the mean and variance of each feature for each class
ndim = 900
def calClassStat(data_fname, y):
    """ Return a dictionary: class_label -> [sum_vec, sum_sq_vec, num]
    data_fname: file containing all features, one sample per line
    y: class label for each sample
    """
    clstat = {}
    yuniq = np.unique(y)
    for ci in yuniq:
        clstat[ci] = [np.zeros(shape=ndim), np.zeros(shape=ndim), 0]
    clstat['all'] = [np.zeros(shape=ndim), np.zeros(shape=ndim), 0]
    with open(data_fname, 'r') as fp:
        n = 0
        for line in fp:
            line = line.strip().split()
            assert(len(line) == ndim)
            vals = np.array(line, dtype=int)
            label = y[n]
            clstat[label][0] += vals       # per-class sum of x
            clstat[label][1] += vals**2    # per-class sum of x^2
            clstat[label][2] += 1          # per-class sample count
            n += 1
    # aggregate the per-class sums into an overall 'all' entry
    for ci in yuniq:
        clstat['all'][0] += clstat[ci][0]
        clstat['all'][1] += clstat[ci][1]
        clstat['all'][2] += clstat[ci][2]
    return clstat
r = calClassStat('/home/vahid/Downloads/data/ml/data_train.txt', y[0])
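The per-class sums and sums of squares are enough to recover means and variances through the identity Var[x] = E[x²] − (E[x])²; a quick sanity check of that bookkeeping on toy data (names here are illustrative only):
In [ ]:
# Verify the sum / sum-of-squares bookkeeping against numpy on a toy matrix.
Xt = np.random.randint(0, 5, size=(50, 4)).astype(float)
sx, ssx, n_t = Xt.sum(axis=0), (Xt**2).sum(axis=0), Xt.shape[0]
mean_t = sx / n_t
var_t = ssx / n_t - mean_t**2          # E[x^2] - (E[x])^2
assert np.allclose(mean_t, Xt.mean(axis=0))
assert np.allclose(var_t, Xt.var(axis=0))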
In [125]:
import pickle
with open("../data/sum_features.dat", "wb") as fout:
    pickle.dump(r, fout)
In [106]:
### Calculate the standardized mean difference between classes
def calStandMeanDiff(y, yneg, ypos):
    """ Per-feature difference of means between two groups of classes,
    scaled by the overall standard deviation (uses the global sums in r).
    """
    sx = np.zeros(shape=ndim, dtype=float)
    ssx = np.zeros(shape=ndim, dtype=float)
    n1 = np.sum(np.in1d(y, yneg))
    n2 = np.sum(np.in1d(y, ypos))
    sys.stderr.write("Number of samples in NegClass: %d and PosClass: %d \n" % (n1, n2))
    for yi in yneg:
        sx += r[yi][0]
        ssx += r[yi][1]
    r1_mean = sx / float(n1)
    r1_var = ssx / float(n1) - r1_mean**2      # Var[x] = E[x^2] - (E[x])^2
    sx = np.zeros(shape=ndim, dtype=float)
    ssx = np.zeros(shape=ndim, dtype=float)
    for yi in ypos:
        sx += r[yi][0]
        ssx += r[yi][1]
    r2_mean = sx / float(n2)
    r2_var = ssx / float(n2) - r2_mean**2
    tot_mean = r['all'][0] / float(r['all'][2])
    tot_var = r['all'][1] / float(r['all'][2]) - tot_mean**2
    rdiff = (r1_mean - r2_mean) / np.sqrt(tot_var)
    return rdiff
## unit test:
mean_test = calStandMeanDiff(y, np.arange(1, 157), np.arange(157, 165))
print(np.sum(mean_test > 0.1))
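In symbols, what calStandMeanDiff returns for feature $j$ is the difference of the two group means scaled by the overall standard deviation:

$$ d_j = \frac{\mu_{1j} - \mu_{2j}}{\sigma_j}, \qquad \sigma_j^2 = \mathbb{E}[x_j^2] - \big(\mathbb{E}[x_j]\big)^2 . $$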
In [107]:
rdiff = calStandMeanDiff(y, np.arange(1,157), np.arange(157, 165))
## Good Features:
goodfeatures = np.where(rdiff > 0.1)[0]
goodfeatures
Out[107]:
In [149]:
def readRandomSample(data_fname, y, size, goodfeat=None, acc_miny=None, acc_maxy=None):
    """ Read a random sample of rows from the data file.
    Only samples whose label lies in [acc_miny, acc_maxy] are eligible;
    goodfeat (assumed sorted) selects which feature columns to keep.
    """
    if goodfeat is None:
        goodfeat = np.arange(ndim)
    Xsub = np.empty(shape=(size, goodfeat.shape[0]), dtype=float)
    ysub = np.zeros(shape=size, dtype=int)
    if acc_miny is None:
        acc_miny = np.min(y)
    if acc_maxy is None:
        acc_maxy = np.max(y)
    acceptable_indx = np.where((y >= acc_miny) & (y <= acc_maxy))[0]
    assert(acceptable_indx.shape[0] > size)
    # pre-draw the line numbers to keep, sorted so the file can be read once
    choice_indx = np.sort(np.random.choice(acceptable_indx, size, replace=False))
    with open(data_fname, 'r') as fp:
        n = 0    # current line number in the file
        nf = 0   # number of chosen samples filled so far
        for line in fp:
            if nf < size and n == choice_indx[nf]:
                vals = np.array(line.strip().split(), dtype=int)
                Xsub[nf, :] = vals[goodfeat]
                ysub[nf] = y[n]
                nf += 1
            n += 1
    return (Xsub, ysub)
In [119]:
## unit testing readRandomSample()
gf_test = np.arange(21,35)
Xsub, ysub = readRandomSample('/home/vahid/Downloads/data/ml/data_train.txt', y[0], \
size=2000, goodfeat=gf_test, acc_miny=15, acc_maxy=20)
print(Xsub.shape)
print(np.unique(ysub))
In [158]:
### Performance evaluation
def evalPerformance(ytrue, ypred):
    tp = np.sum(ypred[np.where(ytrue == 1)[0]] == 1)    # true positives
    fp = np.sum(ypred[np.where(ytrue == -1)[0]] == 1)   # false positives
    tn = np.sum(ypred[np.where(ytrue == -1)[0]] == -1)  # true negatives
    fn = ytrue.shape[0] - (tp + fp + tn)                # false negatives
    sys.stderr.write('%d %d %d %d\n' % (tp, fp, tn, fn))
    prec = tp / float(tp + fp)
    recall = tp / float(tp + fn)
    f1score = 2*tp / float(2*tp + fp + fn)
    return (prec, recall, f1score)
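As a cross-check, the same precision/recall/F1 numbers should come out of sklearn.metrics on toy labels (a sketch; any mismatch would point to a bug in evalPerformance):
In [ ]:
from sklearn.metrics import precision_score, recall_score, f1_score
yt = np.array([1, 1, -1, -1, 1, -1])
yp = np.array([1, -1, -1, 1, 1, -1])
print(evalPerformance(yt, yp))
print(precision_score(yt, yp), recall_score(yt, yp), f1_score(yt, yp))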
In [159]:
Xsub, ysub = readRandomSample('/home/vahid/Downloads/data/ml/data_train.txt', y[0], size=20000, goodfeat=goodfeatures)
ysub[np.where(ysub <= 156)[0]] = -1
ysub[np.where(ysub > 156)[0]] = 1
#Xsub = Xsub[:, goodfeatures]
Xsub = (Xsub - np.mean(Xsub, axis=0)) / np.std(Xsub, axis=0)
Xsub.shape
Out[159]:
In [160]:
import sklearn.svm
ntot = Xsub.shape[0]
tr_idx = np.random.choice(ntot, size=ntot//2, replace=False)   # integer size for Python 3
ts_idx = np.setdiff1d(np.arange(ntot), tr_idx, assume_unique=True)
yts = ysub[ts_idx]
for c in [0.0001, 0.001, 0.01, 0.1, 1.0]:
    for gm in [0.001, 0.01, 0.1, 1.0]:
        clf = sklearn.svm.SVC(C=c, kernel='rbf', gamma=gm)
        clf.fit(Xsub[tr_idx, :], ysub[tr_idx])
        ypred = clf.predict(Xsub[ts_idx, :])
        prec, recall, f1score = evalPerformance(yts, ypred)
        print("C=%.4f Gamma=%.4f ==> Prec:%.3f Recall:%.3f F1Score:%.3f" % (c, gm, prec, recall, f1score))
In [151]:
## Picking the best C and gamma (C=1, gamma=0.1) and varying the sample size
for n in [1000, 2000, 4000, 8000, 16000, 32000]:
    Xsub, ysub = readRandomSample('/home/vahid/Downloads/data/ml/data_train.txt', y[0], size=n, goodfeat=goodfeatures)
    ysub[np.where(ysub <= 156)[0]] = -1
    ysub[np.where(ysub > 156)[0]] = 1
    # Xsub = (Xsub - np.mean(Xsub, axis=0)) / np.std(Xsub, axis=0)  # standardization disabled here
    sys.stderr.write('\nSize = %d ==> ' % (n))
    Tp = [0, 0, 0, 0, 0]
    Fp = [0, 0, 0, 0, 0]
    Tn = [0, 0, 0, 0, 0]
    Fn = [0, 0, 0, 0, 0]
    for i in range(5):   # five random train/test splits per sample size
        tr_idx = np.random.choice(n, size=n//2, replace=False)
        ts_idx = np.setdiff1d(np.arange(n), tr_idx, assume_unique=True)
        yts = ysub[ts_idx]
        clf = sklearn.svm.SVC(C=1.0, kernel='rbf', gamma=0.10)
        clf.fit(Xsub[tr_idx, :], ysub[tr_idx])
        ypred = clf.predict(Xsub[ts_idx, :])
        tp = np.sum(ypred[np.where(yts == 1)[0]] == 1)
        fp = np.sum(ypred[np.where(yts == -1)[0]] == 1)
        tn = np.sum(ypred[np.where(yts == -1)[0]] == -1)
        Tp[i], Fp[i], Tn[i], Fn[i] = tp, fp, tn, yts.shape[0] - (tp + fp + tn)
        sys.stderr.write("%d (%d %d %d %d) " % (i, tp, fp, tn, yts.shape[0] - (tp + fp + tn)))
    Tp_mean, Fp_mean, Tn_mean, Fn_mean = np.mean(Tp), np.mean(Fp), np.mean(Tn), np.mean(Fn)
    recall = Tp_mean / (Tp_mean + Fn_mean)
    prec = Tp_mean / (Tp_mean + Fp_mean)
    f1score = 2*Tp_mean / (2*Tp_mean + Fp_mean + Fn_mean)
    sys.stderr.write('\nAverage: Prec=%.3f Recall=%.3f F1-score=%.3f\n' % (prec, recall, f1score))
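To turn this into a learning curve, the averaged F1 per sample size could be collected and plotted (a sketch; the f1_by_size values below are placeholders, not results from the run above):
In [ ]:
sizes = [1000, 2000, 4000, 8000, 16000, 32000]
f1_by_size = [0.60, 0.65, 0.70, 0.72, 0.74, 0.75]   # placeholder values
plt.plot(sizes, f1_by_size, marker='o')
plt.xscale('log')
plt.xlabel('training-sample size')
plt.ylabel('average F1-score')
plt.show()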
In [167]:
import sklearn.linear_model
# create a logistic-regression classifier and fit the data
n = 20000
Xsub, ysub = readRandomSample('/home/vahid/Downloads/data/ml/data_train.txt', y[0], size=n, goodfeat=goodfeatures)
ysub[np.where(ysub <= 156)[0]] = -1
ysub[np.where(ysub > 156)[0]] = 1
Xsub = (Xsub - np.mean(Xsub, axis=0)) / np.std(Xsub, axis=0)
tr_idx = np.random.choice(n, size=n//2, replace=False)
ts_idx = np.setdiff1d(np.arange(n), tr_idx, assume_unique=True)
yts = ysub[ts_idx]
for c in [1.0, 10.0, 100.0, 1000.0, 10000.0]:
    logreg = sklearn.linear_model.LogisticRegression(C=c)
    logreg.fit(Xsub[tr_idx, :], ysub[tr_idx])
    ypred = logreg.predict(Xsub[ts_idx, :])
    prec, recall, f1score = evalPerformance(yts, ypred)
    print("C=%.4f ==> Prec:%.3f Recall:%.3f F1Score:%.3f" % (c, prec, recall, f1score))
In [110]:
rdiff2 = calStandMeanDiff(y, np.arange(157,162), np.arange(162, 165))
print(np.sum(rdiff2 > 0.1))
goodfeatures_cs2 = np.where(rdiff2 > 0.1)[0]
goodfeatures_cs2
Out[110]:
In [67]:
Xsub, ysub = readRandomSample('/home/vahid/Downloads/data/ml/data_train.txt', y[0], size=20000, acc_miny=156, acc_maxy=164)
ysub[np.where(ysub <= 162)[0]] = -1
ysub[np.where(ysub > 162)[0]] = 1
Xsub = Xsub[:, goodfeatures]
Xsub = (Xsub - np.mean(Xsub, axis=0)) / np.std(Xsub, axis=0)
Xsub.shape
ntot = Xsub.shape[0]
tr_idx = np.random.choice(ntot, size=ntot//2, replace=False)
ts_idx = np.setdiff1d(np.arange(ntot), tr_idx, assume_unique=True)
yts = ysub[ts_idx]
for c in [0.0001, 0.001, 0.01, 0.1, 1.0]:
    for gm in [0.001, 0.01, 0.1, 1.0]:
        clf = sklearn.svm.SVC(C=c, kernel='rbf', gamma=gm)
        clf.fit(Xsub[tr_idx, :], ysub[tr_idx])
        ypred = clf.predict(Xsub[ts_idx, :])
        tp = np.sum(ypred[np.where(yts == 1)[0]] == 1)
        fp = np.sum(ypred[np.where(yts == -1)[0]] == 1)
        tn = np.sum(ypred[np.where(yts == -1)[0]] == -1)
        print("C=%.4f Gamma=%.4f ==> TP:%d FP:%d TN:%d FN:%d" % (c, gm, tp, fp, tn, yts.shape[0] - (tp + fp + tn)))
In [135]:
## quick check: converting a list of strings to an int array
a = ['1', '2', '3']
np.array(a).astype(int)
Out[135]:
In [137]:
Xtest = pandas.read_table('/home/vahid/Downloads/data/ml/data_test.txt', sep=" ", usecols=goodfeatures, dtype='int', header=None)
In [138]:
Xtest.shape
Out[138]:
In [139]:
Xtest[:5]
Out[139]:
In [11]:
yuniq,ycount = np.unique(y[0][np.where(y[0]<=156)[0]], return_counts=True)
print(ycount)
np.sum(ycount[np.where(yuniq<=130)[0]])
Out[11]:
In [12]:
with open('/home/vahid/Downloads/data/ml/data_train.txt', 'r') as fp:
    with open('../data/data_tr.lower.txt', 'w') as f1, open('../data/label_tr.lower.txt', 'w') as g1, \
         open('../data/data_cv.lower.txt', 'w') as f2, open('../data/label_cv.lower.txt', 'w') as g2:
        n = 0
        for line in fp:
            if y[0][n] <= 156:
                # roughly 90/10 train/CV split via a Bernoulli draw per line
                if np.random.uniform(low=0, high=1) > 0.1:
                    f1.write('%s' % line)
                    g1.write('%d %d\n' % (n, y[0][n]))
                else:
                    f2.write('%s' % line)
                    g2.write('%d %d\n' % (n, y[0][n]))
            n += 1
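The per-line Bernoulli draw yields only an approximately 90/10 split; for an exact and reproducible split, a mask over the eligible lines could be precomputed instead (a sketch; rng and mask_cv are hypothetical names):
In [ ]:
rng = np.random.RandomState(0)        # fixed seed for reproducibility
n_lower = int(np.sum(y[0] <= 156))    # number of eligible (class <= 156) samples
mask_cv = np.zeros(n_lower, dtype=bool)
mask_cv[rng.choice(n_lower, n_lower // 10, replace=False)] = True
# mask_cv[k] == True would send the k-th eligible line to the CV file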
In [4]:
ylow = pandas.read_table("../data/label_tr.lower.txt", sep=" ", dtype='int', header=None)
np.unique(ylow[1], return_counts=True)
Out[4]:
In [8]:
ylow = pandas.read_table("../data/label_cv.lower.txt", sep=" ", dtype='int', header=None)
np.unique(ylow[1], return_counts=True)
Out[8]:
In [7]:
with open('/home/vahid/Downloads/data/ml/data_train.txt', 'r') as fp:
    with open('../data/data_c157-159.txt', 'w') as f1, open('../data/label_c157-159.txt', 'w') as g1:
        n = 0
        for line in fp:
            if y[0][n] >= 157 and y[0][n] <= 159:
                f1.write('%s' % line)
                g1.write('%d %d\n' % (n, y[0][n]))
            n += 1
In [ ]: