In [5]:
import numpy as np
import pandas
import scipy, scipy.spatial
import sklearn
import sys
import os, re

from matplotlib import pyplot as plt
%matplotlib inline

In [17]:
# Names of the prediction files in ../results/lower/ that look like
# "res.<digits>...pred".
# - The pattern is a raw string so that `\.` reaches the regex engine verbatim;
#   in a plain literal it is an invalid escape sequence that newer Python
#   versions warn about.
# - sorted() pins the order: os.listdir() returns entries in arbitrary order,
#   and code further down iterates this list and may stop before reaching the
#   end, so the order decides which files are actually used.
flst = sorted(f for f in os.listdir('../results/lower/') if re.match(r'res\.[0-9]+.*\.pred$', f))

len(flst)


Out[17]:
2332

In [36]:
# Both inputs are headerless, comma-separated text files; only column 0 of
# each is kept (as a pandas Series).
#   dsel -> column 0 of the "lower" test file
#   df   -> column 0 of the full test file
# NOTE(review): the second path is an absolute, machine-specific location.
read_opts = dict(sep=',', header=None)
dsel = pandas.read_table('../data/data_test.lower.txt', **read_opts)[0]
df = pandas.read_table('/home/vahid/Downloads/data/ml/data_test.txt', **read_opts)[0]

print(dsel.shape, df.shape)


((107731,), (262102,))

In [46]:
def _match_in_order(subset, full, progress_every=500):
    """Greedily locate each element of `subset` inside `full`, scanning forward only.

    Element i of `subset` is matched to the first position in `full` that
    holds an equal value and lies after the position used for the previous
    match. Because the scan never moves backwards, the returned positions
    are strictly increasing over the matched elements.

    Parameters
    ----------
    subset, full : array-like, 1-D
        Values to look up and the sequence to look them up in.
    progress_every : int
        Write "<i> <j>\\t" to stdout for every matched i that is a multiple
        of this value; 0 or None disables the progress output.

    Returns
    -------
    positions : ndarray of int
        Position in `full` for every element of `subset`; 0 where no match
        was found (same as the original behaviour).
    found : ndarray of bool
        True where a match was found, so an unmatched row can be told apart
        from a genuine match at position 0.
    """
    # Plain ndarrays: indexing them is far cheaper than per-element Series
    # label lookups inside a Python loop.
    subset = np.asarray(subset)
    full = np.asarray(full)
    n_sub, n_full = subset.shape[0], full.shape[0]

    positions = np.zeros(shape=n_sub, dtype=int)
    found = np.zeros(shape=n_sub, dtype=bool)

    cursor = 0  # first position of `full` still available for matching
    for i in range(n_sub):
        target = subset[i]
        # Explicit while loop instead of range(cursor, n_full): on Python 2
        # that range() call would build a fresh list for every i.
        j = cursor
        while j < n_full:
            if target == full[j]:
                if progress_every and i % progress_every == 0:
                    sys.stdout.write('%d %d\t' % (i, j))
                positions[i] = j
                found[i] = True
                cursor = j + 1
                break
            j += 1
    return positions, found


idx, idx_found = _match_in_order(dsel.values, df.values)

# Unmatched rows keep position 0, which is also a valid position; make the
# situation visible instead of letting it pass silently.
if not idx_found.all():
    sys.stderr.write('\nWARNING: %d of %d rows had no match; their idx entries are left at 0\n'
                     % (int((~idx_found).sum()), idx_found.shape[0]))


0 1	500 1183	1000 2385	1500 3640	2000 4865	2500 6134	3000 7326	3500 8511	4000 9701	4500 10925	5000 12111	5500 13271	6000 14489	6500 15670	7000 16892	7500 18114	8000 19360	8500 20575	9000 21726	9500 23008	10000 24190	10500 25398	11000 26558	11500 27788	12000 29010	12500 30250	13000 31454	13500 32666	14000 33933	14500 35132	15000 36382	15500 37553	16000 38825	16500 40089	17000 41255	17500 42464	18000 43640	18500 44910	19000 46140	19500 47341	20000 48557	20500 49812	21000 51078	21500 52270	22000 53515	22500 54763	23000 56062	23500 57247	24000 58462	24500 59678	25000 60887	25500 62119	26000 63361	26500 64533	27000 65806	27500 67064	28000 68253	28500 69448	29000 70658	29500 71901	30000 73118	30500 74335	31000 75592	31500 76845	32000 77968	32500 79146	33000 80332	33500 81618	34000 82808	34500 84053	35000 85227	35500 86481	36000 87703	36500 88961	37000 90167	37500 91363	38000 92566	38500 93767	39000 94927	39500 96127	40000 97301	40500 98488	41000 99747	41500 100985	42000 102157	42500 103347	43000 104580	43500 105813	44000 107002	44500 108319	45000 109587	45500 110807	46000 112062	46500 113166	47000 114371	47500 115594	48000 116836	48500 118065	49000 119295	49500 120545	50000 121740	50500 122969	51000 124226	51500 125362	52000 126596	52500 127818	53000 129058	53500 130293	54000 131518	54500 132697	55000 133902	55500 135120	56000 136312	56500 137525	57000 138733	57500 139892	58000 141182	58500 142398	59000 143592	59500 144739	60000 145923	60500 147171	61000 148448	61500 149625	62000 150832	62500 152009	63000 153238	63500 154502	64000 155720	64500 156973	65000 158210	65500 159453	66000 160718	66500 161962	67000 163204	67500 164374	68000 165610	68500 166808	69000 167983	69500 169209	70000 170386	70500 171563	71000 172854	71500 174043	72000 175264	72500 176468	73000 177597	73500 178834	74000 180080	74500 181301	75000 182511	75500 183727	76000 184915	76500 186181	77000 187397	77500 188675	78000 189917	78500 191084	79000 192340	79500 193489	80000 194714	80500 195978	81000 197183	
81500 198398	82000 199547	82500 200794	83000 201964	83500 203174	84000 204347	84500 205624	85000 206791	85500 208000	86000 209167	86500 210404	87000 211670	87500 212873	88000 214171	88500 215416	89000 216613	89500 217835	90000 219037	90500 220234	91000 221395	91500 222569	92000 223832	92500 225061	93000 226262	93500 227505	94000 228826	94500 230138	95000 231324	95500 232563	96000 233718	96500 234918	97000 236106	97500 237313	98000 238550	98500 239660	99000 240876	99500 242056	100000 243244	100500 244473	101000 245686	101500 246899	102000 248097	102500 249344	103000 250623	103500 251778	104000 253003	104500 254197	105000 255487	105500 256721	106000 257934	106500 259127	107000 260348	107500 261563	

In [50]:
# Persist the matched positions as integers, one per line, each shifted by +1
# (presumably so consumers get 1-based row numbers -- confirm with the reader
# of this file).
np.savetxt(fname='../data/index_lower.txt', X=idx + 1, fmt='%d')

In [77]:
def combKNN(ic, thresh=20000, top_n=100):
    """Accumulate, per row, how often value `ic` occurs across the prediction files.

    Files listed in the notebook-level `flst` are read one at a time from
    ../results/lower/ (space-separated, no header). For every row position
    the number of cells equal to `ic` is added to that row's running count.
    Reading stops as soon as the total count over all rows exceeds `thresh`;
    otherwise the number of distinct rows seen so far is written to stderr
    and the next file is read.

    The `top_n` rows with the highest counts are then written to
    ../results/res.c<ic>.dat as two integer columns: the row position mapped
    through the notebook-level `idx` array plus one, and the count.

    Parameters
    ----------
    ic : int
        Value to count (the driver loop passes class numbers 1..100).
    thresh : int
        Stop reading further files once the accumulated count exceeds this.
    top_n : int
        Number of highest-count rows to keep (default 100, as before).

    Returns
    -------
    None. The result is only written to disk. If no cell in any file equals
    `ic`, an empty file is written instead of raising.
    """
    votes = {}        # row position -> accumulated number of cells equal to ic
    total_votes = 0   # running sum of votes.values(); avoids re-summing per file

    for fname in flst:
        preds = pandas.read_table(os.path.join('../results/lower/', fname),
                                  sep=' ', header=None)

        # One vectorised comparison per file instead of two per row.
        per_row = preds.eq(ic).sum(axis=1).values

        # Only rows with at least one hit get an entry, visited in ascending
        # row order (same insertion order as a row-by-row scan).
        for row in np.flatnonzero(per_row):
            hits = int(per_row[row])
            row = int(row)
            votes[row] = votes.get(row, 0) + hits
            total_votes += hits

        if total_votes > thresh:
            break
        else:
            sys.stderr.write('%d\t' % len(votes))

    # np.fromiter accepts both the list (Python 2) and the view (Python 3)
    # returned by dict.keys()/values(); both iterate in the same order.
    rows = np.fromiter(votes.keys(), dtype=int, count=len(votes))
    counts = np.fromiter(votes.values(), dtype=int, count=len(votes))

    # Descending by count, keep the first top_n.
    order = (-counts).argsort()[:top_n]

    # Translate row positions through idx and shift by +1, then save.
    top = np.column_stack((idx[rows[order]] + 1, counts[order]))
    np.savetxt('../results/res.c%d.dat' % ic, top, fmt='%d')

In [ ]:
# Run combKNN once per class number 1..100, bracketing each call with a
# progress line on stderr.
first_class, last_class = 1, 100
for class_id in range(first_class, last_class + 1):
    sys.stderr.write('Class %d ==> ' % class_id)
    combKNN(class_id)
    sys.stderr.write('\tFinished!!\n')


Class 1 ==> 3235	6160	9021	12123	14889	17958		Finished!!
Class 2 ==> 2707	5435	8009	10707	12939	15289	17678		Finished!!
Class 3 ==> 2694	5465	8235	10884	13561	16184		Finished!!
Class 4 ==> 3093	6291	9510	12345	15388	17817		Finished!!
Class 5 ==> 2666	5410	8418	11059	13702	16205		Finished!!
Class 6 ==> 2832	5376	7793	10108	12237	14619	16888		Finished!!
Class 7 ==> 2977	6358	9074	11694	14079	16700		Finished!!
Class 8 ==> 2639	5457	8254	10644	13076	15560	18044		Finished!!
Class 9 ==> 2888	5622	8302	10774	13129	15467	17969		Finished!!
Class 10 ==> 2877	6044	8806	11668	14517	17243		Finished!!
Class 11 ==> 2796	5431	8522	11190	14039	16550		Finished!!
Class 12 ==> 2995	5969	8920	11585	14043	16450		Finished!!
Class 13 ==> 3029	5695	8575	11263	13615	16198		Finished!!
Class 14 ==> 3370	6428	9641	12404	15026	17968		Finished!!
Class 15 ==> 3073	6355	8984	11401	14071	16661		Finished!!
Class 16 ==> 3212	6197	8864	11553	14398	17128		Finished!!
Class 17 ==> 2835	5765	8379	10865	13335	15825		Finished!!
Class 18 ==> 2948	6140	9067	11860	14763	17547		Finished!!
Class 19 ==> 3177	6070	8874	11816	14282	17179		Finished!!
Class 20 ==> 2834	5591	8263	10718	13485	15826	18163		Finished!!
Class 21 ==> 2756	5161	7932	10611	12939	15331	17655		Finished!!
Class 22 ==> 2960	5546	8226	11008	13455	16030		Finished!!
Class 23 ==> 3043	5750	8542	11378	13688	16026		Finished!!
Class 24 ==> 3136	5859	8797	11599	14376	16936		Finished!!
Class 25 ==> 3040	6275	9318	11962	14410	16896		Finished!!
Class 26 ==> 3080	6078	9232	11837	14348	17101		Finished!!
Class 27 ==> 2976	6035	8873	11695	14421	17115		Finished!!
Class 28 ==> 2810	

In [71]:
# Look up, in idx, the values found in the first 10 rows of column 0 of `res`.
# NOTE(review): no visible cell assigns `res` at notebook scope -- the only
# `res` in view is a local variable inside combKNN -- and this cell's execution
# count (71) precedes the combKNN definition (77). It appears to depend on
# leftover kernel state and will likely raise NameError on a fresh
# Restart & Run All; confirm where `res` should come from or drop the cell.
idx[res.iloc[:10,0]]


Out[71]:
array([ 79797,  79801, 239241,  79805,     20,     22,  79812,  79814,
           33,     37])

In [ ]: