In [1]:
import numpy as np
import pandas
import scipy, scipy.spatial
import sklearn
import sys
from matplotlib import pyplot as plt
%matplotlib inline
In [2]:
df = pandas.read_table("../data/data_dev.txt", sep=" ", dtype='int', header=None)
df.head()
Out[2]:
In [3]:
y = pandas.read_table("../data/label_dev.txt", sep=" ", dtype='int', header=None)
y.head()
Out[3]:
In [12]:
yl, yc = np.unique(y[0], return_counts=True)
print(yl.shape)
print(np.max(yc))
yc = yc / float(np.sum(yc))   # normalize counts to a distribution

fig1 = plt.figure(1, figsize=(12, 6))
ax = fig1.add_subplot(1, 1, 1)
ax.bar(yl, yc, width=1, color='b', linewidth=0.1)
plt.setp(ax.get_xticklabels(), rotation='horizontal', fontsize=16)
plt.setp(ax.get_yticklabels(), rotation='vertical', fontsize=16)
plt.xlabel('Class Label', size=20)
plt.ylabel('Distribution', size=20)
plt.title('Class Frequency', size=20)
plt.axis([0, 166, 0, 0.14])
plt.show()
In [4]:
## 80/20 train/test split by index
np.random.seed(seed=1234)
N, m = df.shape
train_idx = np.random.choice(N, size=int(0.8 * N), replace=False)  # size must be an int
test_idx = np.setdiff1d(np.arange(N), train_idx, assume_unique=True)
print(train_idx.shape, test_idx.shape)
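scikit-learn's train_test_split gives the same 80/20 split in one call; a minimal sketch for comparison (the resulting index sets will differ from np.random.choice even with the same seed):
In [ ]:
## Sketch: equivalent 80/20 index split via scikit-learn.
from sklearn.model_selection import train_test_split

train_idx2, test_idx2 = train_test_split(np.arange(N), train_size=0.8, random_state=1234)
print(train_idx2.shape, test_idx2.shape)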
In [10]:
## K-fold cross-validation by random fold assignment (fold sizes vary slightly):
np.random.seed(seed=1234)
K = 10
ainx = np.random.randint(K, size=N)   # random fold label per sample
for i in range(K):
    test_idx = np.where(ainx == i)[0]
    train_idx = np.where(ainx != i)[0]
    print("%d %d %d" % (i, test_idx.shape[0], train_idx.shape[0]))
In [7]:
from sklearn.naive_bayes import MultinomialNB

## Sweep the smoothing parameter alpha; res is the number of correct test predictions.
for alpha in [0.01, 0.1, 0.2, 0.5, 1.0, 2.0, 10.0, 100.0]:
    clf = MultinomialNB(alpha=alpha)
    clf.fit(df.iloc[train_idx, :], y.iloc[train_idx, 0])
    ypred = clf.predict(df.iloc[test_idx, :])
    res = np.sum(ypred == y.iloc[test_idx, 0])
    print("%.2f %d" % (alpha, res))

## Same sweep with uniform class priors:
for alpha in [0.01, 0.1, 0.2, 0.5, 1.0, 2.0, 10.0, 100.0]:
    clf = MultinomialNB(alpha=alpha, fit_prior=False)
    clf.fit(df.iloc[train_idx, :], y.iloc[train_idx, 0])
    ypred = clf.predict(df.iloc[test_idx, :])
    res = np.sum(ypred == y.iloc[test_idx, 0])
    print("%.2f %d" % (alpha, res))
In [5]:
def make_SVMliteFormat(fname, X, y):
    """Write X, y to fname in SVMlight sparse format (1-based feature ids)."""
    assert (X.shape[0] == y.shape[0])
    assert (np.sum(y == 1) + np.sum(y == -1) + np.sum(y == 0) == y.shape[0])
    N, m = X.shape
    with open(fname, 'w') as fp:
        for i in range(N):
            fp.write("%d " % y[i])
            for j in range(m):
                if X.iloc[i, j] != 0:
                    fp.write("%d:%d " % (j + 1, X.iloc[i, j]))
            fp.write("\n")

## comb_inx and ci come from the per-class sampling loop below (cell In [41]).
Xsub = df.iloc[comb_inx, :]
ysub = y.iloc[comb_inx, 0].values
pos = (ysub == ci)   # record positives first: relabelling in place would
ysub[pos] = 1        # otherwise clobber class ci whenever ci != 1
ysub[~pos] = -1
make_SVMliteFormat('../data/class_splits/c_%d.txt' % ci, Xsub, ysub)
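scikit-learn also ships a writer for this format; a sketch that should produce an equivalent dump (number formatting may differ slightly, and the `_sk` filename suffix is only to avoid clobbering the file written above):
In [ ]:
## Sketch: same SVMlight-format dump via sklearn; zero_based=False gives the
## 1-based feature ids that make_SVMliteFormat writes.
from sklearn.datasets import dump_svmlight_file

dump_svmlight_file(Xsub, ysub, '../data/class_splits/c_%d_sk.txt' % ci, zero_based=False)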
In [6]:
yuniq = np.unique(y[0])
yuniq_dict = {}   # class label -> row indices of that class
for ci in yuniq:
    yuniq_dict[ci] = np.where(y[0] == ci)[0]
In [41]:
## For each class ci, build a training pool: all of ci's rows plus a small
## random sample (1-2% of ci's size) from every other class.
np.random.seed(seed=1234)
cls_dict = {}
for ci in yuniq:
    yinx = np.where(y[0] == ci)[0]
    nci = yinx.shape[0]
    comb_inx = yinx
    for cj in yuniq:
        if ci != cj:
            nsamp = int(np.random.uniform(low=0.01 * nci, high=0.02 * nci))  # size must be an int
            cj_inx = np.unique(np.random.choice(yuniq_dict[cj], size=nsamp))
            comb_inx = np.hstack([comb_inx, cj_inx])
    sys.stderr.write("%d " % comb_inx.shape[0])
    np.random.shuffle(comb_inx)
    cls_dict[ci] = comb_inx
In [58]:
import sklearn.svm

## One-vs-rest RBF SVMs: for each class ci, train on its subsampled pool
## (cls_dict[ci]) with labels {ci, -1} and report held-out accuracy.
for ci in yuniq:
    clf = sklearn.svm.SVC(C=1.0, kernel='rbf', gamma=1.0)
    tr_idx = np.random.choice(cls_dict[ci], size=int(0.8 * cls_dict[ci].shape[0]), replace=False)
    ts_idx = np.setdiff1d(cls_dict[ci], tr_idx, assume_unique=True)
    Xtr = df.iloc[tr_idx, :]
    ytr = y.iloc[tr_idx, 0].values
    ytr[np.where(ytr != ci)[0]] = -1   # negatives -> -1; positives keep label ci
    clf.fit(Xtr, ytr)
    Xts = df.iloc[ts_idx, :]
    yts = y.iloc[ts_idx, 0].values
    yts[np.where(yts != ci)[0]] = -1
    ypred = clf.predict(Xts)
    print("Class %d ==> %.4f" % (ci, np.sum(yts == ypred) / float(ypred.shape[0])))
In [50]:
## Inspect the last classifier's predictions on test rows whose true label is 1:
ypred = clf.predict(df.iloc[test_idx, :])
ri = np.where(y.iloc[test_idx, 0] == 1)
ypred[ri]
Out[50]:
In [ ]:
## RandomizedPCA was removed from scikit-learn (0.20+); use PCA with the
## randomized solver instead:
from sklearn.decomposition import PCA
rpca = PCA(svd_solver='randomized')
rpca.fit(df)
print(rpca.explained_variance_ratio_)
In [9]:
from sklearn.decomposition import PCA
pca = PCA(n_components=100)
pca.fit(df)
print(pca.explained_variance_ratio_)
In [10]:
np.sum(pca.explained_variance_ratio_)
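The fitted PCA can then project the data onto those 100 components for downstream models:
In [ ]:
## Project onto the first 100 principal components.
df_pca = pca.transform(df)
print(df_pca.shape)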
Out[10]:
In [13]:
import sys
sys.setrecursionlimit(10000)   # the pure-Python KDTree recurses deeply on large inputs

for i in range(1):
    Xtrain = df.iloc[ainx == i, :300]   # first 300 features of fold i
    print(i, Xtrain.shape)
    kdt1 = scipy.spatial.KDTree(Xtrain, leafsize=200)
    print(kdt1)
In [21]:
ytrain = y.iloc[ainx == i, 0]      # labels for fold i (= 0 from the loop above)
Xtest = df.iloc[ainx == 1, :300]   # fold 1 serves as the test set
ytest = y.iloc[ainx == 1, 0]
In [31]:
## Query the 4 nearest training neighbours for the first 10 test rows:
for j in range(10):
    q = kdt1.query(Xtest.iloc[j, :], k=4)
    print(q[0])                                     # distances
    print(ytest.iloc[j], ytrain.iloc[q[1]].values)  # true label vs neighbour labels
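Turning these queries into predictions only needs a majority vote over the neighbour labels; a sketch (ties fall to whichever label np.argmax hits first, and the full loop can be slow with the pure-Python KDTree):
In [ ]:
## Sketch: 4-NN classification by majority vote over neighbour labels.
correct = 0
for j in range(Xtest.shape[0]):
    _, nbr = kdt1.query(Xtest.iloc[j, :], k=4)
    vals, counts = np.unique(ytrain.iloc[nbr].values, return_counts=True)
    correct += int(vals[np.argmax(counts)] == ytest.iloc[j])
print(correct / float(Xtest.shape[0]))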
In [8]:
labels = np.unique(y.iloc[train_idx,0], return_counts=False)
labels
Out[8]:
In [18]:
## Per-class centers: the feature-wise median of each class's training rows.
center = np.empty(shape=(labels.shape[0], m), dtype=float)
Xtrain = df.iloc[train_idx, :]
ytrain = y.iloc[train_idx, :]
n = 0
for i in labels:
    mvec = Xtrain.iloc[np.where(ytrain[0] == i)[0], :].median().values
    center[n, :] = mvec
    n += 1
center.shape
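A nearest-centroid classifier falls out of these medians directly; a sketch assuming Euclidean distance to the median centers is the intended decision rule:
In [ ]:
## Sketch: assign each test row to the class with the closest median center.
dists = scipy.spatial.distance.cdist(df.iloc[test_idx, :], center, metric='euclidean')
ypred_nc = labels[np.argmin(dists, axis=1)]
print(np.mean(ypred_nc == y.iloc[test_idx, 0].values))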
Out[18]:
In [23]:
import vecspace

## Represent each training row by its relation to 100 random reference vectors
## (vecspace is a local module; Vectorify is its API).
ref = np.random.uniform(low=0.0, high=10.0, size=100 * 900).reshape((100, 900))
vr = vecspace.Vectorify(ref, metric='euclidean')
XtrainVec = np.empty(shape=(Xtrain.shape[0], ref.shape[0]), dtype=float)
for i in np.arange(Xtrain.shape[0]):
    XtrainVec[i, :] = vr.vectorize(Xtrain.iloc[i, :])
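vecspace is a local module, so its semantics can't be verified here; assuming Vectorify(ref, metric='euclidean').vectorize(x) returns x's distances to each reference row, the loop above collapses to a single cdist call:
In [ ]:
## Assumption: vectorize(x) = distances from x to the 100 reference rows.
## If so, this reproduces XtrainVec in one vectorized call.
XtrainVec2 = scipy.spatial.distance.cdist(Xtrain, ref, metric='euclidean')
print(XtrainVec2.shape)   # expected (n_train, 100)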
In [20]:
np.savetxt('../data/trainvec.txt', XtrainVec, fmt='%.2f', delimiter=' ')
In [ ]: