In [1]:
# Read the raw question/answer corpora. Use context managers so the file
# handles are closed deterministically — the original bare open() calls
# leaked the handles until garbage collection.
with open('questions.txt', 'r') as f:
    questions = f.readlines()
with open('digital_11000_temp0.5_parsed.txt', 'r') as f:
    answers = f.readlines()
print(len(answers))


7841

In [2]:
import gensim

# Pretrained 300-dim GoogleNews word2vec binary.
# NOTE(review): absolute local path — consider a configurable DATA_DIR.
google_path = '/home/mar/code/marcel/ECO/data/GoogleNews-vectors-negative300.bin'
# Fix: the original line read `model = model = ...` — an accidental
# double assignment; a single assignment is intended.
model = gensim.models.Word2Vec.load_word2vec_format(google_path, binary=True)
print('done')


/home/mar/.virtualenvs/eco3_experiments/lib/python3.4/site-packages/gensim/utils.py:1015: UserWarning: Pattern library is not installed, lemmatization won't be available.
  warnings.warn("Pattern library is not installed, lemmatization won't be available.")
done

In [3]:
# Drop very short lines (<= 5 chars, i.e. blank/noise lines including the
# newline) and strip trailing whitespace from the rest. The original loops
# used enumerate() but never used the index; comprehensions express the
# filter directly.
questions = [line.rstrip() for line in questions if len(line) > 5]
answers = [line.rstrip() for line in answers if len(line) > 5]

In [4]:
import textblob
import random
# Score every answer with TextBlob's sentiment, a (polarity, subjectivity)
# named tuple, and sort ascending on that tuple.
sentiments = []
for answer_text in answers:
    blob = textblob.TextBlob(answer_text)
    sentiments.append((blob.string, blob.sentiment))
# Ascending sort: most NEGATIVE polarity comes first. (The original comment
# claimed "most positive" answers were printed; with reverse=False that was
# wrong — pass reverse=True to put the most positive first.)
sentiments.sort(key=lambda pair: pair[1], reverse=False)

# Inspection loop left disabled; uncomment the print to eyeball the extremes.
for i in range(10):
    pass
    #print(sentiments[i])

In [5]:
import numpy as np
import scipy.spatial.distance

def avg_feature_vector(words, model, num_features):
    """Average the word vectors of all in-vocabulary words.

    Parameters
    ----------
    words : iterable of str
        Tokens to embed.
    model : word2vec-style model
        Must support `word in model.vocab` and `model[word]` vector lookup.
    num_features : int
        Dimensionality of the model's vectors (size of the result).

    Returns
    -------
    numpy.ndarray, shape (num_features,)
        Mean vector of the in-vocabulary words, or the zero vector when
        none of the words are in the vocabulary.
    """
    # Fix: the original carried a stale comment claiming an
    # `index2word_set` input parameter that this function never had.
    feature_vec = np.zeros((num_features,), dtype="float64")
    n_words = 0

    for word in words:
        # Out-of-vocabulary tokens are silently skipped.
        if word in model.vocab:
            n_words += 1
            feature_vec = np.add(feature_vec, model[word])

    # Guard against division by zero when nothing was in the vocabulary.
    if n_words > 0:
        feature_vec = np.divide(feature_vec, n_words)
    return feature_vec

In [21]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag
from nltk.chat import eliza, iesha, rude,suntsu
import copy

def tag(string):
    """Return POS tagging for different types of input.

    sentence as a list of tokens -> list of (token, tag) pairs
    sentence as a string         -> list of (token, tag) pairs
    single word as a string      -> one (token, tag) pair

    Returns None for any other input type.
    """
    # Fix: the original compared against `unicode`, which does not exist
    # in Python 3 (this notebook runs on 3.4 per the warning traceback)
    # and raised NameError for any non-list, non-str input.
    if isinstance(string, list):
        return pos_tag(string)
    elif isinstance(string, str):
        sent = word_tokenize(string)
        if len(sent) > 1:
            return pos_tag(sent)
        else:
            # Single token: unwrap the one-element list to a bare pair.
            return pos_tag(sent)[0]

def remove_unknown_chars(text, model):
    """Drop whitespace-split tokens that are not in the model vocabulary.

    Returns the surviving tokens joined by single spaces with a TRAILING
    space when at least one token survives, or '' when none do. The
    trailing space is preserved deliberately: the result is used both for
    later .split() calls and as a dict key elsewhere in the notebook.
    """
    known = [word for word in text.split() if word in model.vocab]
    if not known:
        return ''
    # Single join instead of the original quadratic `+=` concatenation.
    return ' '.join(known) + ' '

def filter_tags(text):
    """Keep only nouns, verbs and adjectives from a whitespace-split text.

    Each token is POS-tagged individually via tag(); tokens whose tag
    falls outside the kept set are dropped. Returns the surviving tokens
    joined by single spaces.
    """
    # Penn Treebank tags for nouns, verbs and adjectives.
    kept_tags = {'NN', 'NNS', 'NNP', 'NNPS',
                 'VB', 'VBZ', 'VBD', 'VBP', 'VBN', 'VBG',
                 'JJ', 'JJR', 'JJS'}
    kept = [word for word in text.split() if tag(word)[1] in kept_tags]
    return ' '.join(kept)

In [22]:
from gensim import matutils
from numpy import array
from numpy import dot

# Build a unit-length mean word vector for every usable answer, keyed by
# the vocabulary-filtered answer text.
answer_vectors = {}
for answer in answers:
    # Vocabulary-filtered text; this cleaned string becomes the dict key.
    cleaned = remove_unknown_chars(answer, model)
    # Fix: the original took copy.copy() of this string before reusing the
    # name `a` for the token list — strings are immutable, so a second,
    # clearly-named variable suffices.
    content_words = filter_tags(cleaned).split()
    # Skip answers with no content words, or whose cleaned token count is
    # outside the 8..30 window.
    n_tokens = len(cleaned.split())
    if not content_words or n_tokens < 8 or n_tokens > 30:
        continue
    word_vectors = [model[word] for word in content_words]
    # Unit-normalized mean vector, so later dot products are cosines.
    answer_vectors[cleaned] = matutils.unitvec(array(word_vectors).mean(axis=0))
print(len(answer_vectors))


7308

In [ ]:
import language_check
tool = language_check.LanguageTool('en-US')

# Fix: in the original, random.choice(questions) was computed and then
# immediately overwritten by the hardcoded string while the print still
# claimed "Random question". Keep the override explicit and commented.
# question = random.choice(questions)  # re-enable for a random question
question = 'i like computers'  # debug override
matches = tool.check(question)
question = language_check.correct(question, matches)
print('---- Random question:')
print(question)

similarities = []
q = remove_unknown_chars(question, model)
for word in q.split():
    word_tag = tag(word)
    print(word + ' ' + word_tag[1])
q = filter_tags(q)
q_vector = [model[word] for word in q.split()]
q_v = matutils.unitvec(array(q_vector).mean(axis=0))
# Both vectors are unit-length, so the dot product is cosine similarity.
for key, value in answer_vectors.items():
    dist = dot(q_v, value)
    similarities.append((key, dist))
print(len(similarities))
# Fix: the original lambda parameter shadowed the list name `similarities`.
similarities.sort(key=lambda pair: pair[1], reverse=True)
print('---- Most similar match by n_similarity:')
for i in range(5):
    text = similarities[i]
    print(text)
    # Grammar-correct the matched answer before display.
    matches = tool.check(similarities[i][0])
    corrected = language_check.correct(similarities[i][0], matches)
    print(corrected)
print('---- Least similar match by n_similarity:')
for i in range(5):
    print(similarities[(-i) - 1])

In [ ]:
# Word-vector analogy query: words most similar to the combination
# vector('I') + vector('shaking') - vector('do') - vector('sleeping').
res = model.most_similar(positive=['I', 'shaking'], negative=['do', 'sleeping'], topn=10)
print(res)
# Cosine similarity between two individual words.
res2 = model.similarity('shaking','do')
print(res2)

In [ ]: