In [1]:
import numpy as np
import pandas
import scipy, scipy.spatial
import sklearn
import sys

from matplotlib import pyplot as plt

%matplotlib inline

In [2]:
df = pandas.read_table("../data/data_dev.txt", sep=" ", dtype='int', header=None)

df.head()


Out[2]:
0 1 2 3 4 5 6 7 8 9 ... 890 891 892 893 894 895 896 897 898 899
0 3 0 0 0 2 0 0 2 0 1 ... 0 0 0 0 0 0 0 3 0 2
1 0 0 0 0 0 0 0 1 5 0 ... 0 0 0 0 1 3 2 1 0 2
2 0 0 0 0 0 2 0 0 1 0 ... 0 0 0 0 1 0 0 0 3 0
3 2 0 0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 1 0 0 2 27 0 0 1 0 4 ... 0 0 0 0 0 3 0 0 0 1

5 rows × 900 columns


In [3]:
y = pandas.read_table("../data/label_dev.txt", sep=" ", dtype='int', header=None)

y.head()


Out[3]:
0
0 117
1 73
2 161
3 162
4 160

In [4]:
np.random.seed(seed = 1234)

N, m = df.shape

train_idx = np.random.choice(N, size=0.8*N, replace=False)
test_idx = np.setdiff1d(np.arange(N), train_idx, assume_unique=True)
print(train_idx.shape, test_idx.shape)

Xtrain = df.iloc[train_idx,:]
ytrain =  y.iloc[train_idx,:]


((100000,), (25000,))

In [5]:
N, m = Xtrain.shape

yuniq = np.unique(ytrain[0])

yuniq_dict = {}

for ci in yuniq:
    yuniq_dict[ci] = np.where(ytrain[0] == ci)[0]

In [6]:
cls_pos = {}
cls_neg = {}

for ci in yuniq:
    yinx = np.where(ytrain[0] == ci)[0]
    ni = yinx.shape[0]
    others = np.setdiff1d(np.arange(N), yinx, assume_unique=True)
    ni_sel = ni
    if ni > 4000:
        ni_sel = 4000
    ntot = 10*ni_sel
    comb_inx = np.array([]) #yinx
    for cj in yuniq:
        if ci != cj:
            nj = yuniq_dict[cj].shape[0]
            nj_sel = np.max([np.min([ni/4, nj/4]),int(ntot*nj/float(N))])
            
            cj_inx = np.random.choice(yuniq_dict[cj], size=nj_sel)
            comb_inx = np.hstack([comb_inx, cj_inx])
            #print(cj_inx)
            
    sys.stderr.write("%d %d   \t"%(ni,comb_inx.shape[0]))

    #np.random.shuffle(comb_inx)
    cls_pos[ci] = yinx
    cls_neg[ci] = comb_inx


132 5265   	129 5123   	120 4809   	147 5685   	108 4337   	113 4503   	115 4511   	132 5265   	130 5128   	123 4826   	105 4182   	152 5922   	139 5421   	131 5134   	130 5128   	136 5407   	136 5407   	131 5134   	134 5275   	137 5410   	255 8570   	248 8472   	149 5802   	150 5807   	125 4970   	139 5421   	137 5410   	279 9046   	133 5270   	132 5265   	138 5416   	129 5123   	141 5543   	138 5416   	142 5547   	133 5270   	144 5668   	150 5807   	249 8479   	131 5134   	296 9388   	168 6410   	152 5922   	173 6531   	295 9335   	286 9188   	163 6183   	315 9665   	290 9259   	146 5680   	162 6179   	324 9832   	185 6878   	164 6288   	169 6414   	355 10235   	169 6414   	314 9661   	192 7096   	206 7423   	187 6888   	194 7106   	187 6888   	186 6882   	291 9266   	193 7101   	192 7096   	343 10076   	203 7324   	202 7321   	197 7210   	335 9965   	229 8019   	225 7924   	216 7719   	231 8031   	244 8385   	197 7210   	211 7532   	236 8204   	212 7618   	229 8019   	221 7823   	243 8313   	233 8115   	243 8313   	251 8486   	276 9030   	242 8307   	248 8472   	260 8718   	263 8734   	246 8397   	272 8955   	263 8734   	240 8296   	281 9111   	264 8797   	295 9335   	282 9115   	310 9596   	319 9726   	335 9965   	336 10008   	307 9537   	338 10017   	344 10115   	323 9788   	307 9537   	357 10276   	353 10227   	392 10699   	380 10562   	359 10288   	372 10467   	382 10573   	406 10844   	412 10921   	421 11013   	514 11957   	482 11638   	454 11353   	532 12137   	523 12040   	590 12663   	641 13123   	669 13369   	741 13976   	801 14468   	918 15396   	941 15576   	980 15869   	1071 16497   	1062 16438   	999 15999   	1162 17086   	1162 17086   	1289 17843   	1634 19812   	1914 21336   	2225 22956   	3098 29949   	3380 32581   	4574 38108   	5956 37555   	6373 37388   	6549 37318   	9123 36288   	12981 34745   	

In [7]:
np.unique(ytrain.iloc[cls_neg[164],0].values, return_counts=True)


Out[7]:
(array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  13,  14,
         15,  16,  17,  21,  22,  24,  26,  32,  33,  35,  36,  37,  38,
         39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
         52,  53,  54,  56,  58,  59,  60,  61,  62,  63,  64,  65,  66,
         67,  68,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  80,
         81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,
         94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106,
        107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
        120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132,
        133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145,
        146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158,
        159, 160, 161, 162, 163]),
 array([  52,   51,   48,   58,   43,   45,   46,   52,   52,   49,   42,
          60,   55,   52,   52,   54,   54,   52,   53,   54,  102,   99,
          59,   60,   50,   55,   54,  111,   53,   52,   55,   51,   56,
          55,   56,   53,   57,   60,   99,   52,  118,   67,   60,   69,
         118,  114,   65,  126,  116,   58,   64,  129,   74,   65,   67,
         142,   67,  125,   76,   82,   74,   77,   74,   74,  116,   77,
          76,  137,   81,   80,   78,  134,   91,   90,   86,   92,   97,
          78,   84,   94,   84,   91,   88,   97,   93,   97,  100,  110,
          96,   99,  104,  105,   98,  108,  105,   96,  112,  105,  118,
         112,  124,  127,  134,  134,  122,  135,  137,  129,  122,  142,
         141,  156,  152,  143,  148,  152,  162,  164,  168,  205,  192,
         181,  212,  209,  236,  256,  267,  296,  320,  367,  376,  392,
         428,  424,  399,  464,  464,  515,  653,  765,  890, 1239, 1352,
        1829, 2382, 2549, 2619, 3649]))

In [12]:
import sklearn.svm
clf = sklearn.svm.SVC(C=100, kernel='rbf', gamma=1.0)

for ci in yuniq[:1]:
    for i in range(5):
        tr_pos = np.random.choice(cls_pos[ci], size=cls_pos[ci].shape[0], replace=True)
        tr_neg = np.random.choice(cls_neg[ci], size=tr_pos.shape[0], replace=True)
        tr_idx = np.hstack((tr_pos, tr_neg))
        ts_idx = np.setdiff1d(np.hstack((cls_pos[ci], cls_neg[ci])), tr_idx, assume_unique=False)
        print("%d %d %d %d"%(tr_pos.shape[0], tr_neg.shape[0], tr_idx.shape[0], ts_idx.shape[0]))
        Xtr = Xtrain.iloc[tr_idx,:400]
        ytr = ytrain.iloc[tr_idx,0].values
    
        ytr[np.where(ytr != ci)[0]] = -1
        clf.fit(Xtr, ytr)
    
        Xts = Xtrain.iloc[ts_idx,:400]
        yts = ytrain.iloc[ts_idx,0].values
        yts[np.where(yts != ci)[0]] = -1
        ypred = clf.predict(Xts)
        print(np.sum(ytr==-1))
        print("Class %d   ==>  %.4f %.4f"%(ci, np.sum(yts==ci), np.sum(ypred[np.where(yts==ci)])))


132 132 264 4877
132
Class 1   ==>  52.0000 -52.0000
132 132 264 4870
132
Class 1   ==>  44.0000 -44.0000
132 132 264 4881
132
Class 1   ==>  50.0000 -50.0000
132 132 264 4878
132
Class 1   ==>  50.0000 -50.0000
132 132 264 4874
132
Class 1   ==>  49.0000 -49.0000

In [58]:
ypred[ypred==0]


Out[58]:
array([], dtype=int64)

Clustering


In [ ]:
import sklearn.cluster

dbs = sklearn.cluster.DBSCAN(eps=20, min_samples=10, algorithm='ball_tree', metric='euclidean')

yclust = dbs.fit_predict(df)

In [68]:
np.sum(yclust != -1)


Out[68]:
0

Removing Features with Large Intra-Class Variance


In [8]:
feat_dict = {}

for ci in yuniq:
    feat_var = df.iloc[cls_pos[ci],:].var()
    med_var = feat_var.median()
    #print("%d %d"%(ci,np.sum(feat_var > med_var)))
    feat_dict[ci] = np.where(feat_var > med_var)[0]

In [9]:
feat_dict[1]


Out[9]:
array([  0,   1,   2,   3,   4,   5,   8,   9,  12,  13,  14,  17,  19,
        22,  25,  26,  28,  30,  32,  33,  38,  41,  42,  43,  45,  47,
        51,  53,  55,  57,  58,  60,  61,  62,  63,  64,  66,  67,  69,
        70,  71,  74,  75,  76,  79,  80,  82,  83,  85,  87,  88,  89,
        91,  92,  93,  97, 100, 102, 103, 105, 107, 111, 112, 113, 116,
       117, 119, 124, 128, 129, 132, 133, 136, 139, 140, 142, 145, 146,
       148, 149, 156, 158, 163, 165, 167, 171, 172, 173, 174, 176, 179,
       181, 183, 184, 188, 191, 192, 193, 196, 198, 199, 200, 204, 206,
       208, 213, 216, 218, 219, 220, 221, 222, 223, 224, 225, 226, 228,
       230, 232, 233, 234, 235, 237, 242, 243, 246, 251, 252, 254, 255,
       257, 258, 260, 261, 262, 268, 269, 271, 272, 274, 276, 278, 280,
       285, 287, 288, 289, 290, 291, 294, 296, 299, 300, 301, 302, 303,
       306, 312, 316, 317, 318, 319, 320, 321, 322, 323, 326, 330, 331,
       332, 335, 336, 342, 343, 346, 351, 355, 356, 358, 359, 360, 361,
       363, 365, 367, 369, 370, 371, 372, 374, 376, 379, 382, 384, 389,
       393, 394, 395, 396, 397, 398, 401, 402, 404, 408, 417, 418, 419,
       423, 425, 426, 428, 436, 440, 441, 442, 444, 446, 447, 449, 452,
       453, 455, 456, 459, 460, 461, 463, 464, 466, 469, 471, 473, 474,
       478, 479, 481, 482, 484, 485, 486, 488, 492, 495, 497, 498, 501,
       502, 504, 505, 507, 509, 510, 519, 522, 523, 524, 525, 528, 529,
       530, 532, 533, 534, 539, 540, 542, 544, 547, 548, 549, 557, 559,
       560, 562, 563, 567, 568, 569, 570, 573, 575, 577, 578, 581, 582,
       584, 585, 589, 590, 591, 593, 594, 598, 601, 602, 603, 605, 606,
       607, 608, 609, 612, 614, 615, 616, 617, 620, 621, 622, 623, 624,
       625, 626, 627, 631, 632, 633, 634, 635, 637, 640, 643, 644, 646,
       647, 648, 650, 653, 654, 656, 658, 660, 661, 662, 663, 667, 669,
       670, 674, 675, 678, 679, 682, 683, 685, 687, 688, 690, 693, 694,
       695, 696, 698, 699, 700, 702, 703, 704, 705, 708, 709, 713, 717,
       718, 720, 721, 724, 725, 726, 727, 730, 731, 732, 733, 734, 736,
       738, 742, 743, 746, 750, 751, 753, 754, 755, 756, 758, 759, 762,
       763, 766, 767, 769, 774, 775, 776, 781, 782, 783, 784, 785, 788,
       790, 791, 792, 797, 798, 799, 800, 801, 803, 806, 808, 809, 810,
       811, 812, 815, 818, 821, 828, 831, 832, 834, 835, 836, 839, 840,
       841, 845, 846, 848, 849, 850, 851, 852, 853, 857, 861, 870, 871,
       872, 873, 874, 875, 876, 878, 887, 889])

In [11]:
import sklearn.svm
clf = sklearn.svm.SVC(C=10, kernel='rbf', gamma=1.0)

for ci in yuniq[148:164]:
    for i in range(5):
        tr_pos = np.random.choice(cls_pos[ci], size=cls_pos[ci].shape[0]*0.1, replace=False)
        tr_neg = np.random.choice(cls_neg[ci], size=tr_pos.shape[0], replace=False)
        tr_idx = np.hstack((tr_pos, tr_neg))
        ts_idx = np.setdiff1d(np.hstack((cls_pos[ci], cls_neg[ci])), tr_idx, assume_unique=True)
        sys.stderr.write("%d %d %d %d\t"%(tr_pos.shape[0], tr_neg.shape[0], tr_idx.shape[0], ts_idx.shape[0]))
        
        Xtr = Xtrain.iloc[tr_idx,feat_dict[ci]]
        ytr = ytrain.iloc[tr_idx,0].values
    
        ytr[np.where(ytr != ci)[0]] = -1
        ytr[np.where(ytr == ci)[0]] = +1
        clf.fit(Xtr, ytr)
    
        Xts = Xtrain.iloc[ts_idx,feat_dict[ci]]
        yts = ytrain.iloc[ts_idx,0].values
        yts[np.where(yts != ci)[0]] = -1
        yts[np.where(yts == ci)[0]] = +1
        ypred = clf.predict(Xts)

        print("Class %d   ==>  P=%d TP+FP=%d TP=%d TPR=%.4f"%(ci, np.sum(yts==1), np.sum(ypred == 1), \
                                                              np.sum(ypred[np.where(yts==1)]==1), \
                                                              np.sum(ypred[np.where(yts==1)]==1)/float(np.sum(yts==1))))


1298 1298 2596 39004	1298 1298 2596 39004	
Class 164   ==>  11683.0000 39004.0000 11683.0000
Class 164   ==>  11683.0000 39003.0000 11683.0000
Class 164   ==>  11683.0000 39005.0000 11683.0000
1298 1298 2596 39005	1298 1298 2596 39000	
Class 164   ==>  11683.0000 39000.0000 11683.0000
Class 164   ==>  11683.0000 39001.0000 11683.0000
1298 1298 2596 39001	

In [ ]: