In [1]:
# Load the question corpus and the generated-answer corpus, one line each.
# Fixed: the original used bare open(...).readlines(), leaking the file
# handles; context managers close them deterministically.
with open('questions.txt', 'r') as f:
    questions = f.readlines()
with open('digital_11000_temp0.5_parsed.txt', 'r') as f:
    answers = f.readlines()
print(len(answers))
In [2]:
import gensim

# Pre-trained GoogleNews word2vec binary (300-dim vectors).
# NOTE(review): hardcoded absolute path — move to a config cell / env var
# if this notebook is shared.
google_path = '/home/mar/code/marcel/ECO/data/GoogleNews-vectors-negative300.bin'
# Fixed: the original had a duplicated assignment (`model = model = ...`).
# NOTE(review): `Word2Vec.load_word2vec_format` is the pre-1.0 gensim API;
# modern gensim uses `gensim.models.KeyedVectors.load_word2vec_format`.
model = gensim.models.Word2Vec.load_word2vec_format(google_path, binary=True)
print('done')
In [3]:
# Drop near-empty lines (5 chars or fewer — blanks and noise) and strip
# trailing whitespace/newlines from the survivors.
# Fixed: the original looped with enumerate() but never used the index;
# comprehensions express the filter/map directly.
questions = [line.rstrip() for line in questions if len(line) > 5]
answers = [line.rstrip() for line in answers if len(line) > 5]
In [4]:
import textblob
import random  # used by the retrieval cell below for random.choice

# Score every answer with TextBlob sentiment — a (polarity, subjectivity)
# namedtuple, so sorting on it orders primarily by polarity.
sentiments = []
for text in answers:
    blob = textblob.TextBlob(text)
    sentiments.append((blob.string, blob.sentiment))

# Fixed: the comment promised the MOST positive answers, but the original
# sorted ascending (reverse=False), putting the most NEGATIVE first.
sentiments.sort(key=lambda item: item[1], reverse=True)

# Prints most positive answers (preview left disabled, as in the original).
for i in range(10):
    pass
    # print(sentiments[i])
In [5]:
import numpy as np
import scipy.spatial.distance
def avg_feature_vector(words, model, num_features):
    """Average the word-embedding vectors of all in-vocabulary words.

    Parameters
    ----------
    words : iterable of str
        Tokens to average.
    model :
        Must support membership tests via `model.vocab` and vector
        lookup via `model[word]` (old-style gensim interface).
    num_features : int
        Dimensionality of the embedding vectors.

    Returns
    -------
    numpy.ndarray
        Shape ``(num_features,)``, dtype float64: the mean of the known
        words' vectors, or all zeros when no word is in the vocabulary.
    """
    # set(model.index2word) was deliberately moved out of this function
    # by the original author for performance reasons.
    total = np.zeros((num_features,), dtype="float64")
    known = 0
    for token in words:
        if token in model.vocab:
            known += 1
            total = np.add(total, model[token])
    # Out-of-vocabulary-only input leaves the zero vector untouched.
    return np.divide(total, known) if known > 0 else total
In [21]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag
from nltk.chat import eliza, iesha, rude,suntsu
import copy
def tag(string):
    """POS-tag a sentence or a single word.

    sentence as a list of tokens -> list of (word, tag) tuples
    sentence as a string         -> list of (word, tag) tuples
    single word as a string      -> one (word, tag) tuple

    Any other input type falls through and returns None (as in the
    original implementation).
    """
    if isinstance(string, list):
        return pos_tag(string)
    # Fixed: the original compared against `unicode`, which does not
    # exist on Python 3 and raised NameError for every str input there.
    elif isinstance(string, str):
        sent = word_tokenize(string)
        if len(sent) > 1:
            return pos_tag(sent)
        else:
            # Single token: unwrap the one-element result to a tuple.
            return pos_tag(sent)[0]
def remove_unknown_chars(text, model):
    """Drop whitespace-separated tokens of `text` missing from the
    model vocabulary.

    Returns the kept tokens joined by single spaces, with a trailing
    space after the last one (or '' when nothing survives) — matching
    the original concatenation behaviour exactly.
    """
    kept = [word for word in text.split() if word in model.vocab]
    return ''.join(word + ' ' for word in kept)
def filter_tags(text):
    """Keep only nouns, verbs and adjectives of `text`, space-joined."""
    # Penn Treebank tags: nouns, verbs and adjectives.
    keep = {'NN', 'NNS', 'NNP', 'NNPS',
            'VB', 'VBZ', 'VBD', 'VBP', 'VBN', 'VBG',
            'JJ', 'JJR', 'JJS'}
    return ' '.join(word for word in text.split() if tag(word)[1] in keep)
In [22]:
from gensim import matutils
from numpy import array
from numpy import dot
# Build a unit-normalised mean word vector for every answer whose
# in-vocabulary text is 8-30 words long; keyed by the cleaned answer text.
answer_vectors = {}
for answer in answers:
    cleaned = remove_unknown_chars(answer, model)
    # Fixed: the original took `copy.copy()` of this string — a no-op for
    # immutable str objects — before filtering; a plain name suffices.
    content_words = filter_tags(cleaned).split()
    n_words = len(cleaned.split())
    # Skip answers with no content words, or outside the length window.
    if not content_words or n_words < 8 or n_words > 30:
        continue
    vectors = [model[word] for word in content_words]
    # Unit vector of the mean, so later dot products are cosine similarities.
    answer_vectors[cleaned] = matutils.unitvec(array(vectors).mean(axis=0))
print(len(answer_vectors))
In [ ]:
import language_check

# Grammar checker used to clean both the question and retrieved answers.
tool = language_check.LanguageTool('en-US')

question = random.choice(questions)
# NOTE(review): debug leftover — this hardcoded string immediately
# overwrites the randomly chosen question above, contradicting the
# "Random question" banner printed below; remove to restore random picks.
question = 'i like computers'
matches = tool.check(question)
question = language_check.correct(question, matches)
print('---- Random question:')
print(question)

similarities = []
# Drop out-of-vocabulary tokens, then show each remaining token's POS tag.
q = remove_unknown_chars(question, model)
for word in q.split():
    word_tag = tag(word)
    print(word + ' ' + word_tag[1])

# Keep only nouns/verbs/adjectives, then build the question's
# unit-normalised mean word vector (same recipe as answer_vectors).
q = filter_tags(q)
q_vector = [model[word] for word in q.split()]
q_v = matutils.unitvec(array(q_vector).mean(axis=0))

# Both sides are unit vectors, so the dot product is cosine similarity.
for key, value in answer_vectors.items():
    dist = dot(q_v, value)
    similarities.append((key, dist))
print(len(similarities))

# Sort most-similar first.
similarities.sort(key=lambda similarities: similarities[1], reverse=True)

print('---- Most similar match by n_similarity:')
for i in range(5):
    text = similarities[i]
    print(text)
    # Grammar-correct the retrieved answer before display.
    matches = tool.check(similarities[i][0])
    corrected = language_check.correct(similarities[i][0], matches)
    print(corrected)

print('---- Least similar match by n_similarity:')
for i in range(5):
    # Walk backwards from the end of the sorted list.
    print(similarities[(-i) - 1])
In [ ]:
# Quick sanity checks on the raw word2vec model.
# Word-analogy query: vectors of `positive` minus vectors of `negative`.
analogy_hits = model.most_similar(positive=['I', 'shaking'], negative=['do', 'sleeping'], topn=10)
print(analogy_hits)
# Cosine similarity between a single pair of words.
pair_similarity = model.similarity('shaking', 'do')
print(pair_similarity)
In [ ]: