In [1]:
from scipy.spatial.distance import cosine
import numpy as np
import re

In [2]:
# Tokenise every line of the input file into a list of lowercase words
# (maximal runs of the letters a-z; everything else is a separator).
#
# Each line is stored as a real list rather than the result of ``filter``:
# the later cells iterate over every line more than once and call
# ``line.count(...)`` on it. On Python 3 ``filter`` returns a one-shot
# iterator, which would be exhausted after the first pass and has no
# ``count`` method. A list behaves the same on Python 2 and Python 3.
text = list()
with open("sentences_example.txt", 'r') as file_obj:
    for line in file_obj:
        text.append([w for w in re.split("[^a-z]", line.lower()) if w != ''])

In [3]:
# Gather the distinct words seen across all lines, then number them so that
# ``words`` maps a column index to the word it stands for.
words_set = {word for line in text for word in line}
words = dict(enumerate(words_set))

In [4]:
# Build the occurrence table: one row per line of ``text``, one entry per
# index in ``words``, holding how many times that word occurs in the line.
words_array = [
    [line.count(words[index]) for index in words]
    for line in text
]
words_matrix = np.array(words_array)

In [5]:
# Cosine distance from the first row of the matrix to each later row.
# ``distances[i]`` therefore belongs to row ``i + 1`` of ``words_matrix``.
distances = [
    cosine(words_matrix[0], other_row)
    for other_row in words_matrix[1:]
]


/home/ubuntuser/.local/lib/python2.7/site-packages/scipy/spatial/distance.py:505: RuntimeWarning: invalid value encountered in true_divide
  dist = 1.0 - np.dot(u, v) / (norm(u) * norm(v))

In [6]:
def two_mins(nums):
    """Return the two smallest values in ``nums`` as ``(smallest, runner_up)``.

    Duplicates count separately, so ``[1, 1, 2]`` gives ``(1, 1)``.
    Any slot that cannot be filled (fewer than two values) is left as
    ``float('inf')``. A value for which both comparisons below are false
    (for example NaN) leaves the result unchanged.
    """
    smallest = runner_up = float('inf')
    for value in nums:
        if value <= smallest:
            # New (or tied) minimum: the previous minimum is demoted.
            runner_up = smallest
            smallest = value
            continue
        if value < runner_up:
            runner_up = value
    return smallest, runner_up

In [7]:
# Find the two entries of ``distances`` with the smallest values and write
# their 1-based positions (position i in ``distances`` corresponds to row
# i + 1 of the matrix) to the submission file, separated by a space.
#
# The positions are computed before the file is opened for writing, so an
# error here does not leave behind an empty/truncated submission file.
min_1, min_2 = two_mins(distances)
pos_1 = distances.index(min_1)
# When the two smallest distances are equal, a plain ``index(min_2)`` would
# find the same position as ``pos_1`` again; start the search just after it
# so that two different entries are reported.
search_from = pos_1 + 1 if min_2 == min_1 else 0
pos_2 = distances.index(min_2, search_from)
ind_1, ind_2 = pos_1 + 1, pos_2 + 1
with open("submission-1.txt", 'w') as file_obj:
    file_obj.write(str(ind_1) + " " + str(ind_2))