In [1]:
import numpy as np
import pandas
import scipy, scipy.spatial
import sklearn
import sys
from matplotlib import pyplot as plt
%matplotlib inline
In [2]:
# Load the training labels: one integer class label per row, space-separated.
# NOTE(review): hard-coded home-relative path — breaks on other machines;
# consider a DATA_DIR config constant.
y = pandas.read_table("~/Downloads/data/ml/label_train.txt", sep=" ", dtype='int', header=None)
# Number of feature columns in the companion data files.
ndim= 900
y.head()
Out[2]:
In [3]:
# Label-range constants for the binary split used below:
# classes [ymin, ysplit) form the "negative" group, [ysplit, ymax] the
# "positive" group.
ymin = 1
ysplit = 131
ymax = 156
In [4]:
# Class distribution: unique labels and their per-class sample counts.
np.unique(y[0], return_counts=True)
Out[4]:
In [20]:
# Total sample counts on each side of the planned binary split.
yuniq, ycount = np.unique(y[0], return_counts=True)
neg_mask = np.in1d(yuniq, range(ymin, ysplit))
pos_mask = np.in1d(yuniq, range(ysplit, ymax + 1))
print(np.sum(ycount[neg_mask]))
print(np.sum(ycount[pos_mask]))
In [5]:
import pickle
# Per-class sufficient statistics, presumably cstat[label] = (sum_x, sum_x2)
# per feature — precomputed elsewhere; verify against the producer script.
# NOTE(review): pickle.load executes arbitrary code if the file is
# untrusted — confirm the provenance of sum_features.dat.
cstat = pickle.load(open( "../data/sum_features.dat", "rb" ) )
In [6]:
### Calculate Standardized Mean Difference Between Classes
def calStandMeanDiff(y, cstat, yneg, ypos):
    """Standardized difference of per-feature means between two class groups.

    Parameters
    ----------
    y : array-like of int
        Class label of every training sample (used only to count group sizes).
    cstat : dict
        Per-class sufficient statistics: cstat[label] = (sum_x, sum_x_squared),
        each a 1-D float array over the feature dimensions.
    yneg, ypos : sequences of int
        Class labels forming the "negative" and "positive" groups.

    Returns
    -------
    rdiff : ndarray
        (mean_neg - mean_pos) / pooled_std for every feature.  Features with
        zero pooled variance yield inf/nan; callers threshold the result, so
        such entries are effectively ignored.
    """
    # Infer feature dimensionality from the stored statistics instead of
    # relying on the module-level `ndim` global (backward compatible).
    nfeat = np.asarray(cstat[yneg[0]][0]).shape[0]

    n1 = np.sum(np.in1d(y, yneg))   # samples in the negative group
    n2 = np.sum(np.in1d(y, ypos))   # samples in the positive group
    sys.stderr.write("Number of samples in NegClass: %d and PosClass: %d \n"%(n1, n2))

    # Accumulate sum(x) and sum(x^2) over the negative-group classes.
    sx = np.zeros(shape=nfeat, dtype=float)
    ssx = np.zeros(shape=nfeat, dtype=float)
    for yi in yneg:
        sx += cstat[yi][0]
        ssx += cstat[yi][1]
    r1_mean = sx / float(n1)
    tot_sx = sx.copy()
    tot_ssx = ssx.copy()

    # Same accumulation for the positive group.
    sx = np.zeros(shape=nfeat, dtype=float)
    ssx = np.zeros(shape=nfeat, dtype=float)
    for yi in ypos:
        sx += cstat[yi][0]
        ssx += cstat[yi][1]
    r2_mean = sx / float(n2)
    tot_sx += sx
    tot_ssx += ssx

    # Pooled variance over both groups via Var[X] = E[X^2] - E[X]^2.
    # BUG FIX: the original expanded form dropped the n factor on the
    # mean^2 term and reused the sum-of-squares accumulator where the
    # plain sum belonged, overestimating the variance.
    ntot = float(n1 + n2)
    tot_mean = tot_sx / ntot
    tot_var = tot_ssx / ntot - tot_mean**2

    rdiff = (r1_mean - r2_mean) / np.sqrt(tot_var)
    return (rdiff)
## unit test:
# Smoke-check on the full label set: count features whose standardized mean
# difference exceeds the 0.001 threshold used for feature selection below.
mean_test = calStandMeanDiff(y, cstat, np.arange(ymin,ysplit), np.arange(ysplit, ymax+1))
print(np.sum(mean_test > 0.001))
In [7]:
# Select discriminative features from the standardized mean differences.
rdiff = calStandMeanDiff(y, cstat, np.arange(ymin, ysplit), np.arange(ysplit, ymax + 1))
## Good Features:
# Keep every feature whose standardized mean difference clears the threshold.
goodfeatures = np.where(rdiff > 0.001)[0]
print(goodfeatures)
# Pad with arbitrary leftover features so at least 100 columns survive.
if goodfeatures.shape[0] < 100:
    remaining = np.setdiff1d(np.arange(ndim), goodfeatures, assume_unique=True)
    goodfeatures = np.concatenate((goodfeatures, remaining[:100]))
print(goodfeatures.shape)
goodfeatures
Out[7]:
In [8]:
def readRandomSample(data_fname, y, size, goodfeat=None, acc_miny=None, acc_maxy=None):
    """Read a uniform random sample of rows from a whitespace-separated file.

    Parameters
    ----------
    data_fname : str
        Path to the data file; one sample per line, integer features
        separated by whitespace.
    y : array-like of int
        Label of every row in the file, aligned with file order.
    size : int
        Number of rows to sample (without replacement).
    goodfeat : array-like of int, optional
        Feature (column) indices to keep.  Defaults to all `ndim` columns.
    acc_miny, acc_maxy : int, optional
        Only rows with acc_miny <= label <= acc_maxy are eligible.
        Default to the min/max of `y`.

    Returns
    -------
    (Xsub, ysub) : (float ndarray of shape (size, len(goodfeat)), int ndarray)
        NOTE: columns of Xsub follow the FILE's column order, not the order
        of `goodfeat` (unchanged from the original behavior).
    """
    if goodfeat is None:
        goodfeat = np.arange(ndim)  # module-level feature count
    goodfeat = np.asarray(goodfeat)
    Xsub = np.empty(shape=(size, goodfeat.shape[0]), dtype=float)
    ysub = np.zeros(shape=size, dtype=int)

    if acc_miny is None:
        acc_miny = np.min(y)
    if acc_maxy is None:
        acc_maxy = np.max(y)

    # Rows whose label falls inside the acceptable range.
    acceptable_indx = np.where((y >= acc_miny) & (y <= acc_maxy))[0]
    # >= (was >): sampling exactly all acceptable rows is legitimate.
    assert acceptable_indx.shape[0] >= size, "not enough acceptable rows to sample"
    # Sorted so the file can be consumed in a single forward pass.
    choice_indx = np.sort(np.random.choice(acceptable_indx, size, replace=False))

    # O(1) membership test per column (the original scanned goodfeat with
    # np.any for every token, O(len(goodfeat)) per column).
    feat_lookup = set(int(i) for i in goodfeat)

    with open(data_fname, 'r') as fp:
        n = 0   # current file row index
        nf = 0  # sampled rows filled so far
        for line in fp:
            if nf >= size:
                break  # all requested rows collected; skip the file tail
            if n == choice_indx[nf]:
                tokens = line.strip().split()
                ix = -1
                for i, v in enumerate(tokens):
                    if i in feat_lookup:
                        ix += 1
                        Xsub[nf, ix] = int(v)
                ysub[nf] = y[n]
                nf += 1
            n += 1
    return (Xsub, ysub)
In [23]:
## unit testing readRandomSample()
# Draw 2000 random rows restricted to the selected features and label range.
# NOTE(review): absolute local path — breaks on other machines.
gf_test = goodfeatures
Xsub, ysub = readRandomSample('/home/vahid/Downloads/data/ml/data_train.txt', y[0], \
size=2000, goodfeat=gf_test, acc_miny=ymin, acc_maxy=ymax)
print(Xsub.shape)
print(np.unique(ysub))
In [9]:
### Performance Evaluation
def evalPerformance(ytrue, ypred):
    """Precision, recall and F1-score for binary labels coded as -1 / +1.

    Parameters
    ----------
    ytrue, ypred : 1-D int arrays of equal length with values in {-1, +1}.

    Returns
    -------
    (prec, recall, f1score) : tuple of floats.
        Degenerate cases (no positive predictions, no positive truths, or
        no positives at all) return 0.0 instead of raising
        ZeroDivisionError as the original did.
    """
    tp = np.sum(ypred[np.where(ytrue == 1)[0]] == 1)
    fp = np.sum(ypred[np.where(ytrue == -1)[0]] == 1)
    tn = np.sum(ypred[np.where(ytrue == -1)[0]] == -1)
    # Everything not counted above is a false negative (pred -1, true +1).
    fn = ytrue.shape[0]-(tp+fp+tn)
    prec = tp / float(tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / float(tp + fn) if (tp + fn) > 0 else 0.0
    f1score = 2*tp / float(2*tp + fp + fn) if (2*tp + fp + fn) > 0 else 0.0
    return (prec, recall, f1score)
In [26]:
# Reload labels; in the *.lower files the class id is in the SECOND column.
y = pandas.read_table('../data/label_tr.lower.txt', sep=' ', header=None, dtype='int')
print(np.unique(y[1]))
# Sample 20k rows restricted to the selected features and label range.
Xsub, ysub = readRandomSample('../data/data_tr.lower.txt', y[1], size=20000, \
goodfeat=goodfeatures, acc_miny=ymin, acc_maxy=ymax)
print(np.unique(ysub))
assert(np.sum(ysub < ymin) == 0)
assert(np.sum(ysub > ymax) == 0)
# Binarize: labels below the split become -1, the rest +1.
ysub[np.where(ysub < ysplit)[0]] = -1
ysub[np.where(ysub >= ysplit)[0]] = 1
print(np.sum(ysub == -1), np.sum(ysub==1))
#Xsub = Xsub[:, goodfeatures]
# Z-score standardization.  NOTE(review): a zero-variance column would
# divide by zero here — confirm all selected features vary in the sample.
Xsub = (Xsub - np.mean(Xsub, axis=0)) / np.std(Xsub, axis=0)
Xsub.shape
Out[26]:
In [28]:
import sklearn.svm
ntot = Xsub.shape[0]
# Random half/half train-test split.
# BUG FIX: ntot/2 is a float on Python 3 and np.random.choice rejects a
# float size; // keeps the same value on Python 2.
tr_idx = np.random.choice(ntot, size=ntot // 2, replace=False)
ts_idx = np.setdiff1d(np.arange(ntot), tr_idx, assume_unique=True)
yts = ysub[ts_idx]
# Grid search over RBF-SVM regularization (C) and kernel width (gamma).
for c in [1.0, 5.0, 10, 20, 50]:
    for gm in [0.001, 0.01, 0.1, 1.0, 5.0, 10, 20]:
        clf = sklearn.svm.SVC(C=c, kernel='rbf', gamma=gm)
        clf.fit(Xsub[tr_idx, :], ysub[tr_idx])
        ypred = clf.predict(Xsub[ts_idx, :])
        prec, recall, f1score = evalPerformance(yts, ypred)
        print ("C=%.4f Gamma=%.4f ==> Prec:%.3f Recall:%.3f F1Score:%.3f"%(c, gm, prec, recall, f1score))
In [19]:
#y = pandas.read_table('../data/label_train.txt', sep=' ', header=None, dtype='int')
# Sample 20k rows using ALL features (goodfeat defaults to every column).
Xsub, ysub = readRandomSample('../data/data_train.txt', y[0], size=20000, \
acc_miny=ymin, acc_maxy=ymax)
print(np.unique(ysub))
assert(np.sum(ysub < ymin) == 0)
assert(np.sum(ysub > ymax) == 0)
# Binarize around the split point: below ysplit -> -1, otherwise +1.
ysub[np.where(ysub < ysplit)[0]] = -1
ysub[np.where(ysub >= ysplit)[0]] = 1
print(np.sum(ysub == -1), np.sum(ysub==1))
#Xsub = Xsub[:, goodfeatures]
# Z-score standardization.  NOTE(review): zero-variance columns would
# divide by zero here — verify.
Xsub = (Xsub - np.mean(Xsub, axis=0)) / np.std(Xsub, axis=0)
Xsub.shape
Out[19]:
In [20]:
import sklearn.ensemble
import datetime as dt
ntot = Xsub.shape[0]
# // : np.random.choice needs an integer size on Python 3 (same value on py2).
tr_idx = np.random.choice(ntot, size=ntot // 2, replace=False)
ts_idx = np.setdiff1d(np.arange(ntot), tr_idx, assume_unique=True)
yts = ysub[ts_idx]
# Sweep the forest size, timing fit and predict separately.
for n_est in [20, 50, 100, 200, 500]:
    # BUG FIX: the loop variable was ignored — n_estimators was hard-coded
    # to 50, so every iteration trained the same model.
    rfclf = sklearn.ensemble.RandomForestClassifier(n_estimators=n_est, criterion='gini')
    start_time = dt.datetime.now()
    rfclf.fit(Xsub[tr_idx, :], ysub[tr_idx])
    fit_time = dt.datetime.now()
    ypred = rfclf.predict(Xsub[ts_idx, :])
    pred_time = dt.datetime.now()
    prec, recall, f1score = evalPerformance(yts, ypred)
    print ("TrainSize %d n_est %d ==> Prec:%.3f Recall:%.3f F1Score:%.3f (fit-time %d pred-time %d)" \
    %(tr_idx.shape[0], n_est, prec, recall, f1score, (fit_time-start_time).seconds, (pred_time - fit_time).seconds))
In [2]:
# Feature count and train/cv labels plus the full cross-validation matrix.
ndim= 900
y = pandas.read_table('../data/label_tr.lower.txt', sep=' ', header=None, dtype='int')
ycv = pandas.read_table('../data/label_cv.lower.txt', sep=' ', header=None, dtype='int')
Xcv = pandas.read_table('../data/data_cv.lower.txt', sep=' ', header=None, dtype='int')
print(np.unique(y[1]))
print(np.unique(ycv[1]))
print(Xcv.shape)
# Random 30-feature subset.  NOTE(review): no RNG seed — not reproducible
# across runs.
feat_idx = np.random.choice(np.arange(ndim), 30, replace=False)
Xcv = Xcv.iloc[:, feat_idx]
print(Xcv.shape)
In [22]:
ntot_train = y.shape[0]
print(ntot_train)
# Re-read only the randomly selected feature columns.
df = pandas.read_table('../data/data_tr.lower.txt', usecols=feat_idx, nrows=ntot_train, header=None, sep=' ')
# NOTE(review): nrows=ntot_train caps the CV file at the TRAIN row count —
# confirm this truncation is intended and not a copy-paste leftover.
Xcv = pandas.read_table('../data/data_cv.lower.txt', usecols=feat_idx, nrows=ntot_train, header=None, sep=' ')
print(df.shape)
print(Xcv.shape)
In [14]:
# Random half/half split of the training frame.
# BUG FIX: ntot_train/2 is a float on Python 3; np.random.choice requires
# an integer size (// gives the identical value on Python 2).
tr_idx = np.random.choice(df.shape[0], ntot_train // 2, replace=False)
ts_idx = np.setdiff1d(np.arange(df.shape[0]), tr_idx, assume_unique=True)
Xtr = df.iloc[tr_idx, :]
print(Xtr.shape, tr_idx.shape, ts_idx.shape)
In [16]:
# KD-tree smoke test on half the training rows; query the first 10 rows
# against it.  BUG FIX: tr_idx.shape[0]/2 is a float on Python 3 and iloc
# slicing rejects float bounds — use floor division.
kdt = scipy.spatial.KDTree(Xtr.iloc[:tr_idx.shape[0] // 2, :], leafsize=1000)
qt_dist, qt_idx = kdt.query(Xtr.iloc[:10, :], k=10)
print(qt_dist)
print(qt_idx)
In [28]:
ntr = Xtr.shape[0]
# BUG FIX: ntr/2 is a float on Python 3; iloc needs an int slice bound.
nsplit = ntr // 2
kdt1 = scipy.spatial.KDTree(Xtr.iloc[:nsplit, :], leafsize=1000)
#kdt2 = scipy.spatial.KDTree(Xtr.iloc[nsplit:ntr,:], leafsize=1000)
# 10 nearest training neighbors for the first 20 CV rows.
qt1_idx = kdt1.query(Xcv[:20], k=10)[1]
#qt2_idx = kdt2.query(df.iloc[ts_idx, :])[1]
In [48]:
# Chunk the training set into 4 pieces and build a KD-tree per chunk.
# BUG FIX: np.arange(4)*ntot_train/4 produced float boundaries on Python 3;
# floor division keeps integer indices.
str_idx = np.arange(4) * ntot_train // 4
end_idx = np.arange(1, 5) * ntot_train // 4
def get_label(arr, offset=0):
    # Map KD-tree neighbor indices to training labels.  `offset` shifts
    # chunk-local indices to global row positions (default 0 keeps the
    # original call signature working).
    return (y.iloc[arr + offset, 1].values)
for i, (s, e) in enumerate(zip(str_idx, end_idx)):
    sys.stdout.write('%6d - %6d '%(s, e))
    kdt = scipy.spatial.KDTree(df.iloc[s:e, :], leafsize=1000)
    qt_idx = kdt.query(Xcv[:5], k=10)[1]
    print(qt_idx.shape)
    # BUG FIX: kdt indices are relative to the df.iloc[s:e] chunk; the
    # original looked labels up globally, which was only correct for the
    # first chunk (s == 0).  Pass the chunk start as the offset.
    pred = np.apply_along_axis(get_label, 0, qt_idx, offset=s)
    print(pred)
    np.savetxt('/tmp/preds.%d.dat'%i, pred, fmt='%d')
    #print(y.iloc[qt_idx[:,:],1].values)
In [18]:
ntot_train = y.shape[0]
# 5 training chunks; // keeps the boundaries integral on Python 3 (the
# original float boundaries break iloc slicing).
str_idx = np.arange(5) * ntot_train // 5
end_idx = np.arange(1, 6) * ntot_train // 5
def get_label(arr, offset=0):
    # Chunk-local KD-tree neighbor indices -> global training labels.
    return (y.iloc[arr + offset, 1].values)
# 100 rounds of random-subspace KNN: each round picks 30 random features,
# re-reads the data, and dumps distances/predictions per chunk to /tmp.
# NOTE(review): no RNG seed — feature subsets are not reproducible.
for n in range(100):
    feat_idx = np.random.choice(ndim, size=30, replace=False)
    df = pandas.read_table('../data/data_tr.lower.txt', usecols=feat_idx, nrows=ntot_train, header=None, sep=' ')
    Xcv = pandas.read_table('../data/data_cv.lower.txt', usecols=feat_idx, nrows=ntot_train, header=None, sep=' ')
    sys.stdout.write('\n %d %d %d ==> ' %(n, df.shape[0], Xcv.shape[0]))
    for i, (s, e) in enumerate(zip(str_idx, end_idx)):
        sys.stdout.write('%6d-%6d '%(s, e))
        kdt = scipy.spatial.KDTree(df.iloc[s:e, :], leafsize=1000)
        qt_dist, qt_idx = kdt.query(Xcv, k=10)
        # BUG FIX: offset chunk-local neighbor indices by the chunk start;
        # the original global lookup was only correct for the first chunk.
        pred = np.apply_along_axis(get_label, 0, qt_idx, offset=s)
        np.savetxt('/tmp/dists.%d.%d.dat'%(n, i), qt_dist, fmt='%.4f')
        np.savetxt('/tmp/preds.%d.%d.dat'%(n, i), pred, fmt='%d')
In [ ]: