In [1]:
import gensim
from os import path
from glob import glob
import numpy as np
In [8]:
with open("/Users/Belal/Projects/jobs/i2x_job/keyword_xtract/script.txt", "r") as f:
data = f.read()
In [ ]:
# Optionally save a trimmed copy of the loaded vectors (and their vocabulary counts)
# in word2vec binary format; requires `model` from the loading cell below.
model.save_word2vec_format("shizer", fvocab="shizer_vocab.txt", binary=True)
In [11]:
# ====== KEYWORD EXTRACTION ======
# ================================
stopword = "/Users/Belal/Projects/jobs/i2x_job/keyword_xtract/stop_words/sklearn_stopwords.txt"
inputt = "/Users/Belal/Projects/jobs/i2x_job/keyword_xtract/script.txt"
# 1. Initialize a RAKE object
rake_object = Rake(stop_words_path=stopword, min_char_length=4,
                   max_words_length=4, min_keyword_frequency=3)
# 2. Run RAKE on the input text
with open(inputt, 'r') as sample_file:
    text = sample_file.read()
keywords = rake_object.run(text)
In [3]:
# ======= KEYWORD RANKING ========
# ================================
model = "/Users/Belal/Projects/jobs/i2x_job/keyword_xtract/w2v_models/GoogleNews-vectors-negative300.bin.gz"
print("loading Word2Vec model...")
model = gensim.models.KeyedVectors.load_word2vec_format(model, limit=150000, binary=True)
print("loaded model!")
In [13]:
test = "evaluation/"
test_dirs = glob(path.join(test, "*txt"))
test_docs = [doc.read() for doc in [open(test_file, "r") for test_file in test_dirs]]
test_vecs = [get_avg_feature_vecs([doc],
model=model,
num_features=model.vector_size)
for doc in test_docs]
In [14]:
from itertools import compress
# Keep only the RAKE keywords whose phrase appears in the Word2Vec vocabulary
index2word_set = set(model.index2word)
bool_split = [word[0] in index2word_set for word in keywords]
keyword_in_model = list(compress(keywords, bool_split))
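In [ ]:
# itertools.compress keeps the items whose paired boolean is True, so the cell above
# retains exactly the (phrase, score) pairs whose phrase is in the model vocabulary.
# A minimal illustration with made-up keyword tuples:
list(compress([("budget", 9.0), ("ad hoc", 4.0)], [True, False]))  # -> [('budget', 9.0)]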
In [30]:
# Sort keywords by RAKE score (descending) and keep the top 25%
sorted_keyword = sorted(keyword_in_model, key=lambda x: x[1], reverse=True)
n_keywords = int(0.25 * len(sorted_keyword))
keyword_list = sorted_keyword[0:n_keywords]
In [31]:
# Look up the Word2Vec vector for every sorted keyword (not just the top-25% keyword_list)
keyword_vecs = [model.word_vec(word[0]) for word in sorted_keyword]
In [32]:
# generating candidate words from test docs (optional)
# test_words = generate_candidate_keywords(split_sentences(test_docs[0]), stopword_pattern=stopword,
# min_char_length=2, max_words_length=2)
In [33]:
test = "../evaluation/"
test_dirs = glob(path.join(test, "*txt"))
test_docs = [doc.read() for doc in [open(test_file, "r") for test_file in test_dirs]]
test_vecs = [get_avg_feature_vecs([doc],
model=model,
num_features=model.vector_size)
for doc in test_docs]
In [34]:
# Ranking: score each keyword by its cosine similarity to the evaluation documents
from sklearn.metrics import pairwise
In [35]:
# Cosine similarity between every keyword vector and every document vector (flat list)
x = []
for vec in test_vecs:
    for key_word in keyword_vecs:
        x.append(pairwise.cosine_similarity(X=key_word.reshape(1, -1), Y=vec.reshape(1, -1)))
In [36]:
# Rebuild x as one list of per-keyword similarities for each evaluation document
x = []
for vec in test_vecs:
    x.append([pairwise.cosine_similarity(X=key_word.reshape(1, -1), Y=vec.reshape(1, -1))
              for key_word in keyword_vecs])
In [37]:
# Accumulate the per-document similarity lists element-wise
sum_keyword = np.zeros_like(x[0])
for doc in x:
    sum_keyword = sum_keyword + doc
In [48]:
len(keyword_list)
Out[48]:
In [38]:
# Keyword strings, in the same order as keyword_vecs
names_key = [k[0] for k in sorted_keyword]
In [39]:
# adding cosine similarities to get a single 'rank' for each keyword
z = np.zeros_like(x[0])
for y in x:
    z = z + y
In [40]:
# Average the summed similarities over the number of evaluation documents
newlist = z / len(x)
In [41]:
final = list(zip(names_key, newlist))
In [42]:
# Pair each keyword with its scalar averaged similarity score
fff = []
for i in range(len(names_key)):
    fff.append((names_key[i], newlist[i][0][0]))
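In [ ]:
# A vectorized sketch of the same ranking computation as the loop-based cells above:
# stack the keyword and document vectors into matrices, compute all cosine
# similarities in one call, and average across documents.
keyword_matrix = np.vstack(keyword_vecs)                        # (n_keywords, 300)
doc_matrix = np.vstack([v.reshape(1, -1) for v in test_vecs])   # (n_docs, 300)
sim = pairwise.cosine_similarity(keyword_matrix, doc_matrix)    # (n_keywords, n_docs)
avg_scores = sim.mean(axis=1)                                   # one averaged score per keyword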
In [43]:
ranked = sorted(fff, key=lambda x: x[1], reverse=True)
ranked
Out[43]:
In [294]:
# ========= SAVE OUTPUT ==========
# ================================
print("saving results")
with open("bla.txt","w") as f:
for line in ranked:
strs=" score: ".join(str(x) for x in line)
f.write(strs+"\n")
In [298]:
for line in ranked:
    strs = " score: ".join(str(x) for x in line)
    print(strs)
In [5]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Implementation of RAKE - Rapid Automatic Keyword Extraction algorithm
# as described in:
# Rose, S., D. Engel, N. Cramer, and W. Cowley (2010).
# Automatic keyword extraction from individual documents.
# In M. W. Berry and J. Kogan (Eds.), Text Mining: Applications and Theory. John Wiley and Sons, Ltd.
#
# NOTE: The original implementation (available at - https://github.com/zelandiya/RAKE-tutorial)
# has been extended and updated to work with Python 3 and to include more specific functionality
import re
import operator
import six
from six.moves import range
# Required functions for RAKE
def is_number(s):
try:
float(s) if '.' in s else int(s)
return True
except ValueError:
return False
def load_stop_words(stop_word_file):
"""
Utility function to load stop words from a file and return as a list of words
@param stop_word_file Path and file name of a file containing stop words.
@return list A list of stop words.
"""
stop_words = []
for line in open(stop_word_file):
if line.strip()[0:1] != "#":
for word in line.split(): # in case more than one per line
stop_words.append(word)
return stop_words
def separate_words(text, min_word_return_size):
"""
    Utility function to return a list of all words that have a length greater than a specified number of characters.
@param text The text that must be split in to words.
@param min_word_return_size The minimum no of characters a word must have to be included.
"""
splitter = re.compile('[^a-zA-Z0-9_\\+\\-/]')
words = []
for single_word in splitter.split(text):
current_word = single_word.strip().lower()
#leave numbers in phrase, but don't count as words, since they tend to invalidate scores of their phrases
if len(current_word) > min_word_return_size and current_word != '' and not is_number(current_word):
words.append(current_word)
return words
def split_sentences(text):
"""
Utility function to return a list of sentences.
@param text The text that must be split in to sentences.
"""
sentence_delimiters = re.compile(u'[\\[\\]\n.!?,;:\t\\-\\"\\(\\)\\\'\u2019\u2013]')
sentences = sentence_delimiters.split(text)
return sentences
def build_stop_word_regex(stop_word_file_path):
stop_word_list = load_stop_words(stop_word_file_path)
stop_word_regex_list = []
for word in stop_word_list:
word_regex = '\\b' + word + '\\b'
stop_word_regex_list.append(word_regex)
stop_word_pattern = re.compile('|'.join(stop_word_regex_list), re.IGNORECASE)
return stop_word_pattern
def generate_candidate_keywords(sentence_list, stopword_pattern, min_char_length=1, max_words_length=5):
phrase_list = []
for s in sentence_list:
tmp = re.sub(stopword_pattern, '|', s.strip())
phrases = tmp.split("|")
for phrase in phrases:
phrase = phrase.strip().lower()
if phrase != "" and is_acceptable(phrase, min_char_length, max_words_length):
phrase_list.append(phrase)
return phrase_list
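# Example: with common English stop words ("of", "over", "the", ...) removed, the
# sentence "Compatibility of systems of linear constraints over the set of natural
# numbers" yields the candidate phrases "compatibility", "systems",
# "linear constraints", "set" and "natural numbers".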
def is_acceptable(phrase, min_char_length, max_words_length):
# a phrase must have a min length in characters
if len(phrase) < min_char_length:
return 0
# a phrase must have a max number of words
words = phrase.split()
if len(words) > max_words_length:
return 0
digits = 0
alpha = 0
for i in range(0, len(phrase)):
if phrase[i].isdigit():
digits += 1
elif phrase[i].isalpha():
alpha += 1
# a phrase must have at least one alpha character
if alpha == 0:
return 0
# a phrase must have more alpha than digits characters
if digits > alpha:
return 0
return 1
def calculate_word_scores(phraseList):
word_frequency = {}
word_degree = {}
for phrase in phraseList:
word_list = separate_words(phrase, 0)
word_list_length = len(word_list)
word_list_degree = word_list_length - 1
# if word_list_degree > 3: word_list_degree = 3 #exp.
for word in word_list:
word_frequency.setdefault(word, 0)
word_frequency[word] += 1
word_degree.setdefault(word, 0)
word_degree[word] += word_list_degree # orig.
# word_degree[word] += 1/(word_list_length*1.0) #exp.
for item in word_frequency:
word_degree[item] = word_degree[item] + word_frequency[item]
# Calculate Word scores = deg(w)/freq(w)
word_score = {}
for item in word_frequency:
word_score.setdefault(item, 0)
word_score[item] = word_degree[item] / (word_frequency[item] * 1.0) #orig.
# word_score[item] = word_frequency[item]/(word_degree[item] * 1.0) #exp.
return word_score
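# Worked example: if the only candidate phrases are "linear constraints" and
# "linear diophantine equations", then "linear" has frequency 2 and degree
# (1 + 2) + 2 = 5, giving score deg/freq = 5 / 2 = 2.5, while "constraints" has
# frequency 1, degree 1 + 1 = 2 and score 2.0.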
def generate_candidate_keyword_scores(phrase_list, word_score, min_keyword_frequency=1):
keyword_candidates = {}
for phrase in phrase_list:
if min_keyword_frequency > 1:
if phrase_list.count(phrase) < min_keyword_frequency:
continue
keyword_candidates.setdefault(phrase, 0)
word_list = separate_words(phrase, 0)
candidate_score = 0
for word in word_list:
candidate_score += word_score[word]
keyword_candidates[phrase] = candidate_score
return keyword_candidates
class Rake(object):
def __init__(self, stop_words_path, min_char_length=1, max_words_length=5, min_keyword_frequency=1):
self.__stop_words_path = stop_words_path
self.__stop_words_pattern = build_stop_word_regex(stop_words_path)
self.__min_char_length = min_char_length
self.__max_words_length = max_words_length
self.__min_keyword_frequency = min_keyword_frequency
def run(self, text):
sentence_list = split_sentences(text)
phrase_list = generate_candidate_keywords(sentence_list, self.__stop_words_pattern,
self.__min_char_length, self.__max_words_length)
word_scores = calculate_word_scores(phrase_list)
keyword_candidates = generate_candidate_keyword_scores(phrase_list, word_scores, self.__min_keyword_frequency)
sorted_keywords = sorted(six.iteritems(keyword_candidates), key=operator.itemgetter(1), reverse=True)
return sorted_keywords
# Set these to True to run the RAKE self-test / debug block below
debug = False
test = False
# Testing + debugging RAKE on pre-defined text block
if test:
text = "Compatibility of systems of linear constraints over the set of natural numbers. " \
"Criteria of compatibility of a system of linear Diophantine equations, strict inequations," \
" and nonstrict inequations are considered. Upper bounds for components of a minimal set of " \
"solutions and algorithms of construction of minimal generating sets of solutions for all types" \
" of systems are given. These criteria and the corresponding algorithms for constructing a minimal" \
" supporting set of solutions can be used in solving all the considered" \
" types of systems and systems of mixed types."
# Split text into sentences
sentenceList = split_sentences(text)
stoppath = "stop_words/sklearn_stopwords.txt"
stopwordpattern = build_stop_word_regex(stoppath)
# generate candidate keywords
phraseList = generate_candidate_keywords(sentenceList, stopwordpattern)
# calculate individual word scores
wordscores = calculate_word_scores(phraseList)
# generate candidate keyword scores
keywordcandidates = generate_candidate_keyword_scores(phraseList, wordscores)
if debug:
print(keywordcandidates)
sortedKeywords = sorted(six.iteritems(keywordcandidates), key=operator.itemgetter(1), reverse=True)
if debug:
print(sortedKeywords)
totalKeywords = len(sortedKeywords)
if debug:
print(totalKeywords)
print(sortedKeywords[0:(totalKeywords // 3)])
rake = Rake("stop_words/sklearn_stopwords.txt")
keywords = rake.run(text)
print(keywords)
In [6]:
def make_feature_vec(words, model, num_features):
"""
Function to average all of the word vectors in a given paragraph
:param words:
:param model:
:param num_features:
:return:
"""
# Pre-initialize an empty numpy array (for speed)
feature_vec = np.zeros((num_features,), dtype="float32")
n_words = 0
# Index2word is a list that contains the names of the words in
# the model's vocabulary. Convert it to a set, for speed
index2word_set = set(model.index2word)
# Loop over each word in the review and, if it is in the model's
# vocabulary, add its feature vector to the total
for word in words:
if word in index2word_set:
n_words += 1
feature_vec = np.add(feature_vec, model[word])
# Divide the result by the number of words to get the average
feature_vec = np.divide(feature_vec, n_words)
return feature_vec
def get_avg_feature_vecs(reviews, model, num_features):
# Given a set of reviews (each one a list of words), calculate
# the average feature vector for each one and return a 2D numpy array
#
# Initialize a counter
counter = 0
#
# Pre-allocate a 2D numpy array, for speed
review_feature_vecs = np.zeros((len(reviews), num_features), dtype="float32")
#
# Loop through the reviews
for review in reviews:
# Print a status message
# if counter % 1000 == 0:
print("Review %d of %d" % (counter, len(reviews)))
# Call the function (defined above) that makes average feature vectors
review_feature_vecs[counter] = make_feature_vec(review, model, num_features)
# Increment the counter
counter += 1
return review_feature_vecs
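In [ ]:
# Minimal usage sketch for the helpers above (assumes `model` is the loaded
# KeyedVectors); documents are passed as lists of word tokens.
docs = ["automatic keyword extraction from text".split(),
        "ranking keywords with word vectors".split()]
vecs = get_avg_feature_vecs(docs, model=model, num_features=model.vector_size)
print(vecs.shape)  # (2, model.vector_size)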
In [ ]: