In [2]:
from scipy.spatial.distance import cosine
import numpy as np
import re

In [3]:
# Read all lines of the input file into `data`. The context manager closes
# the handle even if reading raises, which the manual open/close did not.
with open("sentences.txt") as f:
    data = f.readlines()

In [4]:
# Tokenise each raw line: lowercase it, split on every character that is not
# a-z, and drop the empty strings the split leaves between adjacent separators.
lines = [
    [token for token in re.split('[^a-z]', raw_line.lower()) if token]
    for raw_line in data
]

In [6]:
# Build the vocabulary: map each distinct token to a column index, assigned in
# order of first appearance. `len(words)` is evaluated before insertion, so it
# is always the next free index; setdefault leaves already-seen tokens alone.
words = dict()
for line in lines:
    for word in line:
        words.setdefault(word, len(words))

In [8]:
# Term-count matrix: one row per tokenised line, one column per vocabulary
# entry. The shape is derived from the data rather than hard-coded, so the cell
# keeps working if the input file changes.
matr = np.zeros((len(lines), len(words)))
for row, tokens in enumerate(lines):
    for word in tokens:
        # Each occurrence of a token bumps its count in this line's row.
        matr[row, words[word]] += 1

In [49]:
# Pair every line index with its cosine distance to the first line (row 0).
# Row 0 itself is entered with a fixed value of 1 instead of a computed
# distance -- presumably so it ends up at the tail of an ascending sort.
res_cos = [(0, 1)] + [
    (row, cosine(matr[0], matr[row])) for row in range(1, len(lines))
]

In [56]:
# Order the (index, distance) pairs by ascending distance; the sort is stable,
# so pairs with equal distance keep their relative order.
res_cos.sort(key=lambda pair: pair[1])

In [57]:
# Display the (line index, cosine distance) pairs held in res_cos.
res_cos


Out[57]:
[(6, 0.7327387580875756),
 (4, 0.77708871496985887),
 (21, 0.82503644694405864),
 (10, 0.83281653622739416),
 (12, 0.83964325485254543),
 (16, 0.84063618542208085),
 (20, 0.84275727449171223),
 (2, 0.86447381456421235),
 (13, 0.87035925528956715),
 (14, 0.87401184233025764),
 (11, 0.88047713906656067),
 (8, 0.88427248752843102),
 (19, 0.88854435748492944),
 (3, 0.89517151632780823),
 (9, 0.90550888174769317),
 (7, 0.92587506833388988),
 (5, 0.94023856953328033),
 (15, 0.94427217874246472),
 (18, 0.94427217874246472),
 (1, 0.95275444087384664),
 (17, 0.95664450152379399),
 (0, 1)]

In [59]:
# The answer is the indices of the two entries with the smallest cosine
# distance, space-separated. It is taken from the sorted res_cos rather than
# typed in by hand, so it cannot drift from the computed result.
ans = u' '.join(str(pair[0]) for pair in res_cos[:2])
# NOTE(review): the output file name spelling is kept exactly as it was;
# confirm whether "submission" was intended before renaming.
with open('sumbission-1.txt', 'w') as f:
    f.write(ans)