In [1]:
import numpy as np
import pandas
import scipy, scipy.spatial
import sklearn
import sys
from matplotlib import pyplot as plt
%matplotlib inline
In [2]:
df = pandas.read_table("../data/data_dev.txt", sep=" ", dtype='int', header=None)
df.head()
Out[2]:
In [3]:
y = pandas.read_table("../data/label_dev.txt", sep=" ", dtype='int', header=None)
y.head()
Out[3]:
In [4]:
np.random.seed(seed = 1234)
N, m = df.shape
train_idx = np.random.choice(N, size=int(0.8*N), replace=False)  # size must be an integer
test_idx = np.setdiff1d(np.arange(N), train_idx, assume_unique=True)
print(train_idx.shape, test_idx.shape)
Xtrain = df.iloc[train_idx,:]
ytrain = y.iloc[train_idx,:]
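The held-out rows in test_idx are not materialized below; a minimal sketch (Xtest/ytest are names introduced here, not used elsewhere in the notebook):
In [ ]:
# Hypothetical follow-up: keep the held-out 20% around for a final evaluation.
Xtest = df.iloc[test_idx, :]
ytest = y.iloc[test_idx, :]
print(Xtrain.shape, Xtest.shape)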
In [5]:
N, m = Xtrain.shape
yuniq = np.unique(ytrain[0])
yuniq_dict = {}
for ci in yuniq:
    # positional indices (within Xtrain/ytrain) of the rows belonging to class ci
    yuniq_dict[ci] = np.where(ytrain[0] == ci)[0]
In [6]:
cls_pos = {}
cls_neg = {}
for ci in yuniq:
    yinx = np.where(ytrain[0] == ci)[0]
    ni = yinx.shape[0]
    others = np.setdiff1d(np.arange(N), yinx, assume_unique=True)  # rows outside class ci (not used below)
    # Cap the positive count used to size the negative pool at 4000.
    ni_sel = min(ni, 4000)
    ntot = 10*ni_sel
    comb_inx = np.array([], dtype=int)  # integer dtype so these stay valid positional indices
    for cj in yuniq:
        if ci != cj:
            nj = yuniq_dict[cj].shape[0]
            # Sample each other class roughly in proportion to its frequency,
            # but never fewer than a quarter of the smaller of the two class sizes.
            nj_sel = int(max(min(ni // 4, nj // 4), int(ntot*nj/float(N))))
            cj_inx = np.random.choice(yuniq_dict[cj], size=nj_sel)
            comb_inx = np.hstack([comb_inx, cj_inx])
            #print(cj_inx)
    sys.stderr.write("%d %d \t"%(ni, comb_inx.shape[0]))
    #np.random.shuffle(comb_inx)
    cls_pos[ci] = yinx
    cls_neg[ci] = comb_inx
In [7]:
np.unique(ytrain.iloc[cls_neg[164],0].values, return_counts=True)
Out[7]:
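A quick sanity check, not in the original run, that each negative pool really excludes its own class:
In [ ]:
# Assumed check: no index sampled into cls_neg[ci] should carry the label ci.
for ci in list(yuniq)[:5]:
    assert not np.any(ytrain.iloc[cls_neg[ci], 0].values == ci)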
In [12]:
import sklearn.svm
clf = sklearn.svm.SVC(C=100, kernel='rbf', gamma=1.0)
for ci in yuniq[:1]:
    for i in range(5):
        # Bootstrap a balanced training set: positives with replacement, plus an equal number of negatives.
        tr_pos = np.random.choice(cls_pos[ci], size=cls_pos[ci].shape[0], replace=True)
        tr_neg = np.random.choice(cls_neg[ci], size=tr_pos.shape[0], replace=True)
        tr_idx = np.hstack((tr_pos, tr_neg))
        ts_idx = np.setdiff1d(np.hstack((cls_pos[ci], cls_neg[ci])), tr_idx, assume_unique=False)
        print("%d %d %d %d"%(tr_pos.shape[0], tr_neg.shape[0], tr_idx.shape[0], ts_idx.shape[0]))
        Xtr = Xtrain.iloc[tr_idx, :400]
        ytr = ytrain.iloc[tr_idx, 0].values
        ytr[np.where(ytr != ci)[0]] = -1
        clf.fit(Xtr, ytr)
        Xts = Xtrain.iloc[ts_idx, :400]
        yts = ytrain.iloc[ts_idx, 0].values
        yts[np.where(yts != ci)[0]] = -1
        ypred = clf.predict(Xts)
        print(np.sum(ytr == -1))
        # Positives in the out-of-bag sample and how many of them were predicted as class ci.
        print("Class %d ==> P=%d TP=%d"%(ci, np.sum(yts == ci), np.sum(ypred[np.where(yts == ci)] == ci)))
In [58]:
ypred[ypred==0]
Out[58]:
In [ ]:
import sklearn.cluster
dbs = sklearn.cluster.DBSCAN(eps=20, min_samples=10, algorithm='ball_tree', metric='euclidean')
yclust = dbs.fit_predict(df)
In [68]:
np.sum(yclust != -1)
Out[68]:
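To see how the non-noise points split across clusters (a small follow-up, assuming the DBSCAN cell above has been run):
In [ ]:
# Cluster sizes; the label -1 marks noise points.
labels, counts = np.unique(yclust, return_counts=True)
print(dict(zip(labels.tolist(), counts.tolist())))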
In [8]:
feat_dict = {}
for ci in yuniq:
    # cls_pos holds positions within Xtrain, so compute the per-feature variance there.
    feat_var = Xtrain.iloc[cls_pos[ci], :].var()
    med_var = feat_var.median()
    #print("%d %d"%(ci,np.sum(feat_var > med_var)))
    feat_dict[ci] = np.where(feat_var > med_var)[0]
In [9]:
feat_dict[1]
Out[9]:
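Since the filter keeps features above the median variance, each class should retain roughly half the columns; a quick check of the selected-feature counts (an added inspection, not in the original):
In [ ]:
# Number of features retained per class by the above-median-variance filter.
n_feat = np.array([feat_dict[ci].shape[0] for ci in yuniq])
print(n_feat.min(), n_feat.max(), m)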
In [11]:
import sklearn.svm
clf = sklearn.svm.SVC(C=10, kernel='rbf', gamma=1.0)
for ci in yuniq[148:164]:
    for i in range(5):
        # Train on 10% of the positives plus an equal number of sampled negatives.
        tr_pos = np.random.choice(cls_pos[ci], size=int(cls_pos[ci].shape[0]*0.1), replace=False)
        tr_neg = np.random.choice(cls_neg[ci], size=tr_pos.shape[0], replace=False)
        tr_idx = np.hstack((tr_pos, tr_neg))
        # cls_neg can contain repeated indices, so do not assume uniqueness here.
        ts_idx = np.setdiff1d(np.hstack((cls_pos[ci], cls_neg[ci])), tr_idx, assume_unique=False)
        sys.stderr.write("%d %d %d %d\t"%(tr_pos.shape[0], tr_neg.shape[0], tr_idx.shape[0], ts_idx.shape[0]))
        Xtr = Xtrain.iloc[tr_idx, feat_dict[ci]]
        ytr = ytrain.iloc[tr_idx, 0].values
        ytr[np.where(ytr != ci)[0]] = -1
        ytr[np.where(ytr == ci)[0]] = +1
        clf.fit(Xtr, ytr)
        Xts = Xtrain.iloc[ts_idx, feat_dict[ci]]
        yts = ytrain.iloc[ts_idx, 0].values
        yts[np.where(yts != ci)[0]] = -1
        yts[np.where(yts == ci)[0]] = +1
        ypred = clf.predict(Xts)
        print("Class %d ==> P=%d TP+FP=%d TP=%d TPR=%.4f"%(ci, np.sum(yts == 1), np.sum(ypred == 1),
              np.sum(ypred[np.where(yts == 1)] == 1),
              np.sum(ypred[np.where(yts == 1)] == 1)/float(np.sum(yts == 1))))
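The loop above refits clf in place, so only the last classifier survives; a minimal one-vs-rest sketch (clf_dict and predict_ovr are new names introduced here) that keeps one fitted SVC per class and assigns each row to the class with the largest decision_function score:
In [ ]:
# Hypothetical one-vs-rest combination of the per-class binary SVMs.
clf_dict = {}
for ci in yuniq[148:164]:
    tr_pos = cls_pos[ci]
    # Negatives drawn with replacement, matching the positive count, from the per-class pool.
    tr_neg = np.random.choice(cls_neg[ci], size=tr_pos.shape[0], replace=True)
    tr_idx = np.hstack((tr_pos, tr_neg))
    Xtr = Xtrain.iloc[tr_idx, feat_dict[ci]]
    ytr = np.where(ytrain.iloc[tr_idx, 0].values == ci, 1, -1)
    clf_dict[ci] = sklearn.svm.SVC(C=10, kernel='rbf', gamma=1.0).fit(Xtr, ytr)

def predict_ovr(X):
    # decision_function gives the signed margin distance: larger means more
    # confidence that the row belongs to that classifier's positive class.
    classes = sorted(clf_dict)
    scores = np.column_stack([clf_dict[ci].decision_function(X.iloc[:, feat_dict[ci]])
                              for ci in classes])
    return np.array(classes)[np.argmax(scores, axis=1)]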
In [ ]: