RAKE



In [1]:
import gensim
from os import path
from glob import glob
import numpy as np

In [8]:
with open("/Users/Belal/Projects/jobs/i2x_job/keyword_xtract/script.txt", "r") as f:
    data = f.read()

In [ ]:
model.save_word2vec_format("shizer", fvocab="shizer_vocab.txt", binary=True)

In [11]:
# ====== KEYWORD EXTRACTION ======
# ================================
stopword = "/Users/Belal/Projects/jobs/i2x_job/keyword_xtract/stop_words/sklearn_stopwords.txt"
inputt = "/Users/Belal/Projects/jobs/i2x_job/keyword_xtract/script.txt"

# Initialize RAKE object,
rake_object = Rake(stop_words_path=stopword, min_char_length=4,
                   max_words_length=4, min_keyword_frequency=3)

# 2. run on RAKE on a given text
sample_file = open(inputt, 'r')
text = sample_file.read()

keywords = rake_object.run(text)

In [3]:
# ======= KEYWORD RANKING ========
# ================================

model = "/Users/Belal/Projects/jobs/i2x_job/keyword_xtract/w2v_models/GoogleNews-vectors-negative300.bin.gz"

print("loading Word2Vec model...")
model = gensim.models.KeyedVectors.load_word2vec_format(model, limit=150000, binary=True)
print("loaded model!")


loading Word2Vec model...
loaded model!

In [13]:
test = "evaluation/"

test_dirs = glob(path.join(test, "*txt"))
test_docs = [doc.read() for doc in [open(test_file, "r") for test_file in test_dirs]]
test_vecs = [get_avg_feature_vecs([doc],
                                  model=model,
                                  num_features=model.vector_size)
             for doc in test_docs]

In [14]:
from itertools import compress

index2word_set = set(model.index2word)
bool_split = [word[0] in index2word_set for word in keywords]
keyword_in_model = list(compress(keywords, bool_split))

In [30]:
#sort keywords + choose how many to use
sorted_keyword = sorted(keyword_in_model, key=lambda x: x[1], reverse=True)

sorted_keyword = sorted(keyword_in_model, key=lambda x: x[1], reverse=True)
n_keywords = int(0.25*len(sorted_keyword))
keyword_list = sorted_keyword[0:n_keywords]

In [31]:
keyword_vecs = [(model.word_vec(word[0])) for word in sorted_keyword]

In [32]:
# generating candidate words from test docs (optional)
# test_words = generate_candidate_keywords(split_sentences(test_docs[0]), stopword_pattern=stopword,
#                                         min_char_length=2, max_words_length=2)

In [33]:
test = "../evaluation/"

test_dirs = glob(path.join(test, "*txt"))
test_docs = [doc.read() for doc in [open(test_file, "r") for test_file in test_dirs]]
test_vecs = [get_avg_feature_vecs([doc],
                                  model=model,
                                  num_features=model.vector_size)
             for doc in test_docs]


Review 0 of 1
Review 0 of 1
Review 0 of 1

In [34]:
#Ranking
from sklearn.metrics import pairwise

In [35]:
x=[]
for vec in test_vecs:
    for key_word in keyword_vecs:
        x.append(pairwise.cosine_similarity(X=key_word.reshape(1,-1), Y = vec.reshape(1,-1)))

In [36]:
x=[]
for vec in test_vecs:
    x.append([pairwise.cosine_similarity(X=key_word.reshape(1,-1), Y = vec.reshape(1,-1)) for key_word in keyword_vecs])

In [37]:
z=np.zeros_like(x[0])
for doc in x:
    sum_keyword = z + doc

In [48]:
len(keyword_list)


Out[48]:
21

In [38]:
names_key = [k[0] for k in sorted_keyword]

In [39]:
# adding cosine similarities to get a single 'rank' for each keyword
z=np.zeros_like(x[0])
for y in x:
    z=z+y

In [40]:
newlist = z/3

In [41]:
final = list(zip(names_key, newlist))

In [42]:
fff=[]
for i in range(len(names_key)):
    fff.append((names_key[i], newlist[i][0][0]))

In [43]:
ranked = sorted(fff, key=lambda x: x[1], reverse=True)
ranked


Out[43]:
[('china', 0.242401),
 ('type', 0.19073217),
 ('means', 0.16486825),
 ('form', 0.13493739),
 ('rice', 0.12611552),
 ('chicken', 0.12589379),
 ('sweet', 0.11941203),
 ('world', 0.1109058),
 ('development', 0.10882666),
 ('having', 0.10622595),
 ('used', 0.10187316),
 ('cultural', 0.099895574),
 ('types', 0.097578488),
 ('umami', 0.097115934),
 ('health', 0.094237708),
 ('salt', 0.092982449),
 ('times', 0.092271946),
 ('starvation', 0.089643747),
 ('hunger', 0.089590617),
 ('minerals', 0.088807322),
 ('texture', 0.088739671),
 ('people', 0.086064778),
 ('result', 0.0837484),
 ('sugar', 0.082929388),
 ('methods', 0.080974534),
 ('known', 0.079273887),
 ('packaging', 0.075944103),
 ('fruits', 0.073765509),
 ('corn', 0.073155783),
 ('according', 0.071755178),
 ('increase', 0.071639679),
 ('transportation', 0.070788026),
 ('cooked', 0.070380159),
 ('wheat', 0.070036478),
 ('investment', 0.068769686),
 ('reasons', 0.068568893),
 ('affect', 0.068433747),
 ('culture', 0.067772917),
 ('consumption', 0.066966295),
 ('caused', 0.064567439),
 ('seeds', 0.062590219),
 ('fish', 0.061663259),
 ('enhance', 0.060794968),
 ('sale', 0.060116809),
 ('food', 0.05965361),
 ('agriculture', 0.059275296),
 ('sweetness', 0.058638304),
 ('home', 0.056022521),
 ('cultures', 0.054038893),
 ('vegetables', 0.053137455),
 ('seafood', 0.052252591),
 ('diet', 0.044206943),
 ('meat', 0.040651802),
 ('prepared', 0.040598195),
 ('grow', 0.040244192),
 ('usually', 0.039673906),
 ('consumers', 0.039646834),
 ('prices', 0.039328538),
 ('meats', 0.038419973),
 ('preparation', 0.036577865),
 ('eaten', 0.036285266),
 ('ovens', 0.036008809),
 ('enjoyable', 0.033106118),
 ('considered', 0.032849845),
 ('fats', 0.032445978),
 ('flavor', 0.032211341),
 ('farmers', 0.030326007),
 ('companies', 0.02906364),
 ('humans', 0.02753544),
 ('sour', 0.026070559),
 ('cooking', 0.02463785),
 ('countries', 0.021427276),
 ('ingredients', 0.02129437),
 ('example', 0.01869305),
 ('production', 0.015212964),
 ('taste', 0.015157831),
 ('bitter', 0.010554562),
 ('plants', 0.0068413611),
 ('foods', 0.0060227271),
 ('animal', 0.001508461),
 ('contaminants', -0.0070710517),
 ('fired', -0.0070932577),
 ('safety', -0.0070954538),
 ('addition', -0.0072883219),
 ('lead', -0.011358492),
 ('animals', -0.012626085),
 ('contrast', -0.02175129)]

In [281]:
out = "bla.txt"
file.

In [294]:
# ========= SAVE OUTPUT ==========
# ================================

print("saving results")

with open("bla.txt","w") as f:
    for line in ranked:
        strs="    score: ".join(str(x) for x in line)
        f.write(strs+"\n")


saving results

In [298]:
for line in ranked:
    strs="    score: ".join(str(x) for x in line)
    print(strs)


china    score: 0.242626
type    score: 0.190607
means    score: 0.164325
form    score: 0.134714
rice    score: 0.126557
chicken    score: 0.125995
sweet    score: 0.119496
world    score: 0.112293
development    score: 0.109133
having    score: 0.107233
used    score: 0.101601
cultural    score: 0.100244
types    score: 0.0964164
umami    score: 0.0957589
health    score: 0.0931618
salt    score: 0.0921845
times    score: 0.0915946
hunger    score: 0.0897221
starvation    score: 0.0894356
texture    score: 0.0879902
minerals    score: 0.087908
people    score: 0.0856616
result    score: 0.0828769
sugar    score: 0.0823318
methods    score: 0.0808169
known    score: 0.0788293
packaging    score: 0.0769205
according    score: 0.0742263
corn    score: 0.0741341
fruits    score: 0.073292
increase    score: 0.0727475
wheat    score: 0.071175
transportation    score: 0.0708261
cooked    score: 0.0696251
investment    score: 0.0691062
reasons    score: 0.0690556
culture    score: 0.0684923
affect    score: 0.0679003
consumption    score: 0.067041
caused    score: 0.0646559
seeds    score: 0.0631239
sale    score: 0.0612348
fish    score: 0.0610483
enhance    score: 0.0608734
agriculture    score: 0.0596838
food    score: 0.0593474
home    score: 0.0585603
sweetness    score: 0.0582649
cultures    score: 0.0544249
vegetables    score: 0.0527228
seafood    score: 0.0516737
diet    score: 0.0426555
prepared    score: 0.0418323
prices    score: 0.0409436
grow    score: 0.0405165
consumers    score: 0.040495
meat    score: 0.0399547
usually    score: 0.0396568
meats    score: 0.0376271
preparation    score: 0.0364894
eaten    score: 0.0349766
ovens    score: 0.0345096
enjoyable    score: 0.0339255
considered    score: 0.03249
flavor    score: 0.0313941
fats    score: 0.0309461
farmers    score: 0.0302418
companies    score: 0.0292786
humans    score: 0.0272249
sour    score: 0.0259572
cooking    score: 0.0245886
countries    score: 0.0210818
ingredients    score: 0.0203366
example    score: 0.0191558
production    score: 0.0160983
taste    score: 0.0155592
bitter    score: 0.0120033
plants    score: 0.00684935
foods    score: 0.00524395
animal    score: 0.000727753
fired    score: -0.00596238
safety    score: -0.00715225
contaminants    score: -0.00771302
addition    score: -0.0079871
lead    score: -0.0108352
animals    score: -0.012787
contrast    score: -0.021082

#


In [5]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Implementation of RAKE - Rapid Automatic Keyword Extraction algorithm
# as described in:
# Rose, S., D. Engel, N. Cramer, and W. Cowley (2010).
# Automatic keyword extraction from individual documents.
# In M. W. Berry and J. Kogan (Eds.), Text Mining: Applications and Theory.unknown: John Wiley and Sons, Ltd.
#
# NOTE: The original implementation (available at - https://github.com/zelandiya/RAKE-tutorial)
# has been extended and updated to work with Python 3 and to include more specific functionality


import re
import operator
import six
from six.moves import range


# Required functions for RAKE
def is_number(s):
    try:
        float(s) if '.' in s else int(s)
        return True
    except ValueError:
        return False


def load_stop_words(stop_word_file):
    """
    Utility function to load stop words from a file and return as a list of words
    @param stop_word_file Path and file name of a file containing stop words.
    @return list A list of stop words.
    """
    stop_words = []
    for line in open(stop_word_file):
        if line.strip()[0:1] != "#":
            for word in line.split():  # in case more than one per line
                stop_words.append(word)
    return stop_words


def separate_words(text, min_word_return_size):
    """
    Utility function to return a list of all words that are have a length greater than a specified number of characters.
    @param text The text that must be split in to words.
    @param min_word_return_size The minimum no of characters a word must have to be included.
    """
    splitter = re.compile('[^a-zA-Z0-9_\\+\\-/]')
    words = []
    for single_word in splitter.split(text):
        current_word = single_word.strip().lower()
        #leave numbers in phrase, but don't count as words, since they tend to invalidate scores of their phrases
        if len(current_word) > min_word_return_size and current_word != '' and not is_number(current_word):
            words.append(current_word)
    return words


def split_sentences(text):
    """
    Utility function to return a list of sentences.
    @param text The text that must be split in to sentences.
    """
    sentence_delimiters = re.compile(u'[\\[\\]\n.!?,;:\t\\-\\"\\(\\)\\\'\u2019\u2013]')
    sentences = sentence_delimiters.split(text)
    return sentences


def build_stop_word_regex(stop_word_file_path):
    stop_word_list = load_stop_words(stop_word_file_path)
    stop_word_regex_list = []
    for word in stop_word_list:
        word_regex = '\\b' + word + '\\b'
        stop_word_regex_list.append(word_regex)
    stop_word_pattern = re.compile('|'.join(stop_word_regex_list), re.IGNORECASE)
    return stop_word_pattern


def generate_candidate_keywords(sentence_list, stopword_pattern, min_char_length=1, max_words_length=5):
    phrase_list = []
    for s in sentence_list:
        tmp = re.sub(stopword_pattern, '|', s.strip())
        phrases = tmp.split("|")
        for phrase in phrases:
            phrase = phrase.strip().lower()
            if phrase != "" and is_acceptable(phrase, min_char_length, max_words_length):
                phrase_list.append(phrase)
    return phrase_list


def is_acceptable(phrase, min_char_length, max_words_length):

    # a phrase must have a min length in characters
    if len(phrase) < min_char_length:
        return 0

    # a phrase must have a max number of words
    words = phrase.split()
    if len(words) > max_words_length:
        return 0

    digits = 0
    alpha = 0
    for i in range(0, len(phrase)):
        if phrase[i].isdigit():
            digits += 1
        elif phrase[i].isalpha():
            alpha += 1

    # a phrase must have at least one alpha character
    if alpha == 0:
        return 0

    # a phrase must have more alpha than digits characters
    if digits > alpha:
        return 0
    return 1


def calculate_word_scores(phraseList):
    word_frequency = {}
    word_degree = {}
    for phrase in phraseList:
        word_list = separate_words(phrase, 0)
        word_list_length = len(word_list)
        word_list_degree = word_list_length - 1
        # if word_list_degree > 3: word_list_degree = 3 #exp.
        for word in word_list:
            word_frequency.setdefault(word, 0)
            word_frequency[word] += 1
            word_degree.setdefault(word, 0)
            word_degree[word] += word_list_degree  # orig.
            # word_degree[word] += 1/(word_list_length*1.0) #exp.
    for item in word_frequency:
        word_degree[item] = word_degree[item] + word_frequency[item]

    # Calculate Word scores = deg(w)/freq(w)
    word_score = {}
    for item in word_frequency:
        word_score.setdefault(item, 0)
        word_score[item] = word_degree[item] / (word_frequency[item] * 1.0)  #orig.
    # word_score[item] = word_frequency[item]/(word_degree[item] * 1.0) #exp.
    return word_score


def generate_candidate_keyword_scores(phrase_list, word_score, min_keyword_frequency=1):
    keyword_candidates = {}

    for phrase in phrase_list:
        if min_keyword_frequency > 1:
            if phrase_list.count(phrase) < min_keyword_frequency:
                continue
        keyword_candidates.setdefault(phrase, 0)
        word_list = separate_words(phrase, 0)
        candidate_score = 0
        for word in word_list:
            candidate_score += word_score[word]
        keyword_candidates[phrase] = candidate_score
    return keyword_candidates


class Rake(object):
    def __init__(self, stop_words_path, min_char_length=1, max_words_length=5, min_keyword_frequency=1):
        self.__stop_words_path = stop_words_path
        self.__stop_words_pattern = build_stop_word_regex(stop_words_path)
        self.__min_char_length = min_char_length
        self.__max_words_length = max_words_length
        self.__min_keyword_frequency = min_keyword_frequency

    def run(self, text):
        sentence_list = split_sentences(text)
        phrase_list = generate_candidate_keywords(sentence_list, self.__stop_words_pattern,
                                                  self.__min_char_length, self.__max_words_length)
        word_scores = calculate_word_scores(phrase_list)
        keyword_candidates = generate_candidate_keyword_scores(phrase_list, word_scores, self.__min_keyword_frequency)
        sorted_keywords = sorted(six.iteritems(keyword_candidates), key=operator.itemgetter(1), reverse=True)
        return sorted_keywords


test=None
# Testing + debugging RAKE on pre-defined text block
if test:
    text = "Compatibility of systems of linear constraints over the set of natural numbers. " \
           "Criteria of compatibility of a system of linear Diophantine equations, strict inequations," \
           " and nonstrict inequations are considered. Upper bounds for components of a minimal set of " \
           "solutions and algorithms of construction of minimal generating sets of solutions for all types" \
           " of systems are given. These criteria and the corresponding algorithms for constructing a minimal" \
           " supporting set of solutions can be used in solving all the considered" \
           " types of systems and systems of mixed types."

    # Split text into sentences
    sentenceList = split_sentences(text)
    stoppath = "stop_words/sklearn_stopwords.txt"
    stopwordpattern = build_stop_word_regex(stoppath)

    # generate candidate keywords
    phraseList = generate_candidate_keywords(sentenceList, stopwordpattern)

    # calculate individual word scores
    wordscores = calculate_word_scores(phraseList)

    # generate candidate keyword scores
    keywordcandidates = generate_candidate_keyword_scores(phraseList, wordscores)
    if debug:
        print(keywordcandidates)

    sortedKeywords = sorted(six.iteritems(keywordcandidates), key=operator.itemgetter(1), reverse=True)
    if debug:
        print(sortedKeywords)

    totalKeywords = len(sortedKeywords)
    if debug:
        print(totalKeywords)
        print(sortedKeywords[0:(totalKeywords // 3)])

    rake = Rake("stop_words/sklearn_stopwords.txt")
    keywords = rake.run(text)
    print(keywords)

In [6]:
def make_feature_vec(words, model, num_features):
    """
    Function to average all of the word vectors in a given paragraph
    :param words:
    :param model:
    :param num_features:
    :return:
    """
    # Pre-initialize an empty numpy array (for speed)
    feature_vec = np.zeros((num_features,), dtype="float32")

    n_words = 0

    # Index2word is a list that contains the names of the words in
    # the model's vocabulary. Convert it to a set, for speed
    index2word_set = set(model.index2word)

    # Loop over each word in the review and, if it is in the model's
    # vocabulary, add its feature vector to the total
    for word in words:
        if word in index2word_set:
            n_words += 1
            feature_vec = np.add(feature_vec, model[word])

    # Divide the result by the number of words to get the average
    feature_vec = np.divide(feature_vec, n_words)
    return feature_vec


def get_avg_feature_vecs(reviews, model, num_features):
    # Given a set of reviews (each one a list of words), calculate
    # the average feature vector for each one and return a 2D numpy array
    #
    # Initialize a counter
    counter = 0
    #
    # Pre-allocate a 2D numpy array, for speed
    review_feature_vecs = np.zeros((len(reviews), num_features), dtype="float32")
    #
    # Loop through the reviews
    for review in reviews:
        # Print a status message
        # if counter % 1000 == 0:
        print("Review %d of %d" % (counter, len(reviews)))

        # Call the function (defined above) that makes average feature vectors
        review_feature_vecs[counter] = make_feature_vec(review, model, num_features)
        # Increment the counter
        counter += 1
    return review_feature_vecs

In [ ]: