In [3]:
import gensim
from os import path
from glob import glob
import numpy as np

In [12]:
with open("/Users/Belal/Projects/jobs/i2x_job/script.txt", "r") as f:
    data = f.read()

In [13]:
# ====== KEYWORD EXTRACTION ======
# ================================
stopword = "stop_words/sklearn_stopwords.txt"
input_path = "/Users/Belal/Projects/jobs/i2x_job/script.txt"

# 1. Initialize the RAKE object (the Rake class is defined later in this notebook)
rake_object = Rake(stop_words_path=stopword, min_char_length=5,
                   max_words_length=3, min_keyword_frequency=4)

# 2. Run RAKE on the given text
with open(input_path, "r", encoding="iso-8859-1") as sample_file:
    text = sample_file.read()

keywords = rake_object.run(text)

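`run()` returns a list of (phrase, score) tuples sorted by score, so a quick way to eyeball the result (the actual phrases depend on script.txt):

In [ ]:
# Inspect the top-scoring candidates returned by RAKE
for phrase, score in keywords[:5]:
    print("{:40s} {:.2f}".format(phrase, score))
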
In [14]:
# ======= KEYWORD RANKING ========
# ================================

model = "w2v_models/GoogleNews-vectors-negative300.bin.gz"

print("loading Word2Vec model...")
model = gensim.models.KeyedVectors.load_word2vec_format(model, binary=True)
print("loaded model!")


loading Word2Vec model...
loaded model!

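With the vectors loaded, a couple of illustrative queries confirm the model behaves as expected (the probe words here are arbitrary and must be in the GoogleNews vocabulary):

In [ ]:
# Sanity-check the embeddings with a similarity query
print(model.similarity("keyword", "phrase"))
print(model.most_similar("keyword", topn=3))
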
In [28]:
get_avg_feature_vecs([test_docs], model, 300)


Review 0 of 1
Out[28]:
array([[ 0.,  0.,  0., ...,  0.,  0.,  0.]], dtype=float32)

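The all-zero output suggests the input never matched the model's vocabulary: make_feature_vec iterates over word tokens, but [test_docs] wraps whole document strings. A sketch of the likely intended call (assuming test_docs holds raw text):

In [ ]:
# Tokenize first so individual words can be looked up in the vocabulary
tokenized = [doc.split() for doc in test_docs]
vecs = get_avg_feature_vecs(tokenized, model, model.vector_size)
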
In [29]:
test = "evaluation/"

test_dirs = glob(path.join(test, "*.txt"))

test_docs = []
for test_file in test_dirs:
    with open(test_file, "r") as f:
        test_docs.append(f.read())

test_vecs = [get_avg_feature_vecs([doc],
                                  model=model,
                                  num_features=model.vector_size)
             for doc in test_docs]


Review 0 of 1
Review 0 of 1
Review 0 of 1

In [36]:
test_vecs[0][0]


Out[36]:
array([-3193.11181641,  2132.89648438,    -5.62496948,  2498.46875   ,
       -1009.86804199,   289.55325317, -1689.80322266, -1049.11242676,
        -666.45129395,   428.36975098, -1054.78796387, -1627.27441406,
       -3691.68603516,   506.66766357, -2325.35400391,  1676.30212402,
        1861.67431641,  3143.71875   ,  -292.78210449,   120.52070618,
       -4665.56640625,  -644.64984131,  2085.19287109,   296.72491455,
       -1566.43151855,   531.07550049, -4607.33544922,  1040.11303711,
        -517.31994629,  -380.19915771,  -194.37350464,   536.14135742,
       -1413.38378906, -2052.39160156, -2510.00756836,  1775.68835449,
       -3954.99487305,  2293.77880859, -1133.16723633,  1587.45471191,
        -668.28808594,  -899.00054932,  1100.43359375,  1642.91455078,
         927.42041016,  -452.34075928,  -594.91210938, -3238.29833984,
       -2112.20092773,  1532.01831055, -3279.25292969,  4382.36621094,
        -546.29931641,  4216.60107422,   817.12548828,  2493.01635742,
       -2893.09765625, -1916.31982422,   -89.95082092, -2870.59472656,
       -2521.78393555, -1322.29003906, -3515.35864258, -1002.54699707,
        -745.57995605, -3632.70410156, -1835.30529785,  2206.83618164,
       -1080.04663086,   839.62585449,   524.00091553, -1341.78515625,
         671.80749512,  -316.80584717,  -543.19702148,  -184.9153595 ,
        2450.33666992,   278.20022583,   420.37088013, -1581.48486328,
       -2585.35839844,  -192.55581665,  -679.45861816,   -38.09625244,
        2383.36035156,  1286.43103027,  -367.4786377 ,  4080.83618164,
         335.44116211,   994.06072998,   373.13342285,   533.52026367,
        -616.59619141, -2278.50195312,   878.14978027,  1941.27661133,
       -1659.98010254,  1823.09924316,  4659.14013672,  -287.46191406,
       -1371.09692383,   -92.46264648, -1109.56201172,  -294.79873657,
       -1031.0246582 ,  2623.12084961, -1487.66625977,   996.11218262,
        -150.88015747,   306.13397217, -2551.03149414, -3313.01855469,
        -747.69958496,  -937.44317627,  1199.18640137,  2920.59570312,
        1491.57287598,  -737.69226074,  -369.22299194,  -623.11297607,
        2109.51391602,  1309.76696777,  -237.03744507,   443.25317383,
        2216.72802734, -3001.95581055, -1812.27270508,  -345.6361084 ,
        1094.48693848,  1957.01196289, -1332.32702637,  -422.36141968,
       -1446.01135254,  -848.29193115, -3019.59008789,  1340.77734375,
        -708.85180664,   630.51507568,  4411.29785156,  2567.0078125 ,
        3672.73095703, -1304.40759277,   861.9163208 ,  -705.31713867,
       -1126.24157715, -1774.40698242,  -941.3046875 ,  3006.10083008,
        -979.74328613,   580.85882568,  1252.2199707 , -5341.47558594,
        -898.62561035,  -789.81005859, -2050.00537109, -1634.41418457,
        1390.27600098,  3234.8125    ,  -441.81048584,  -571.37750244,
         399.21502686,  1594.57495117,    26.5831604 ,   123.36439514,
        -338.77526855,  -908.03027344,  3030.05859375,   749.05419922,
       -1872.34216309,  1632.21508789, -2064.81201172, -1705.51403809,
        -734.80975342, -1378.21801758,  -333.80914307,  4143.40917969,
        4635.29394531, -3869.17529297,   218.38027954, -1591.50537109,
         137.72470093,   462.16595459,   842.92565918,  -772.10235596,
        -433.66339111,  1355.53271484,  -742.76257324,  1262.32568359,
        -490.1675415 ,   680.47937012,  2164.21728516,  -954.53320312,
       -4490.44775391,  -377.88870239,  2153.08911133,   783.5425415 ,
       -1194.4498291 ,  -225.69979858,  1262.67089844, -1860.14916992,
       -2027.57531738,   383.88095093, -3710.83935547, -2379.81396484,
        1201.42333984, -1356.0826416 ,  -596.04290771,  1981.05322266,
        -753.6270752 ,  2962.23364258,  2675.63525391,  1593.25598145,
       -2656.22607422,   103.84896851, -1568.53259277,  1022.1862793 ,
        2926.7109375 , -1161.88439941, -2880.45385742,    17.77093506,
       -2163.08618164,  2449.45361328,  1457.89257812,  -261.88671875,
         161.7756958 , -1433.01269531,  1078.76147461,  1117.71679688,
         130.74960327, -2619.28393555,   431.17547607,  -914.64978027,
       -1677.76135254,   262.26898193,   171.86212158,   413.20489502,
        1032.17211914,  1611.27441406,  1573.22558594,  1789.51574707,
        1628.61328125,  1606.88354492,  4888.79394531, -1110.80175781,
        -211.71246338,  1545.95300293, -2776.79052734,  2777.21020508,
         922.58117676, -1384.19812012,  1180.73388672,   467.90731812,
        1031.99609375,  1823.17944336,  -246.91387939, -2230.44140625,
        -722.87969971,   979.25042725,   646.89404297, -1543.60107422,
        1358.41833496,  -372.14562988,  1807.40441895,  -302.50897217,
       -1569.41186523,   399.6892395 ,    96.5213623 ,   146.48069763,
        -218.46366882, -1113.54199219, -1545.00439453,  -363.00976562,
         797.65246582,   261.51367188,  1388.35131836, -1476.90148926,
       -3195.21362305, -2484.12817383, -1825.03076172,  1296.81433105,
       -1807.43554688,  2801.49829102,   -28.73397827,   125.85220337,
          99.55349731,  1147.25390625,  -119.74285889,  -304.63931274,
       -1052.06018066,  -893.42700195,  1315.87023926,   -56.41821289,
       -1788.14697266,  1814.83764648,  -620.51715088, -2996.87329102,
       -2048.20605469,  -434.65499878, -2035.68505859,  2802.39941406], dtype=float32)

In [ ]:


In [ ]:
# ========= SAVE OUTPUT ==========
# ================================

print("saving results")

# test_vecs is a list of numpy arrays; np.save serializes the stacked array
with open(args["output"], "wb") as outfile:
    np.save(outfile, np.vstack(test_vecs))

In [ ]:
phrase_list = []

# Manual walkthrough of generate_candidate_keywords, using " a " as the only stop word
for s in t1_sent[0:5]:
    tmp = re.sub(" a ", '|', s.strip())
    phrases = tmp.split("|")
    for phrase in phrases:
        phrase = phrase.strip().lower()
        if phrase != "" and is_acceptable(phrase, min_char_length=1, max_words_length=5):
            phrase_list.append(phrase)

phrase_list

In [ ]:
generate_candidate_keywords(t1_sent, stop, min_char_length=2, max_words_length=4)

In [ ]:
sentenceList = split_sentences(data)
stopwordpattern = build_stop_word_regex(stop)
phraseList = generate_candidate_keywords(sentenceList, stopwordpattern, max_words_length=2)

In [ ]:
wordscores = calculate_word_scores(phraseList)
keywordcandidates = generate_candidate_keyword_scores(phraseList, wordscores)

In [ ]:
sortedKeywords = sorted(keywordcandidates.items(), key=operator.itemgetter(1), reverse=True)
totalKeywords = len(sortedKeywords)

In [ ]:
for keyword in sortedKeywords[0:(totalKeywords // 3)]:
    print("Keyword: ", keyword[0], ", score: ", keyword[1])

In [ ]:
import gensim
import time

In [ ]:
start = time.time()
# Load Google's pre-trained Word2Vec model.
model = gensim.models.KeyedVectors.load_word2vec_format('/Users/Belal/Downloads/GoogleNews-vectors-negative300.bin.gz', binary=True)
end = time.time()
print("took {:.1f}s to load".format(end - start))

In [ ]:
# https://github.com/zelandiya/RAKE-tutorial/blob/master/rake.py

with open("scripts/script.txt", "r") as f:
    data = f.read()

stop = "/Users/Belal/Projects/jobs/i2x_job/RAKE-tutorial/stop_words/scikit_stopwords.txt"

# positional args: stop_words_path, min_char_length, max_words_length, min_keyword_frequency
rake_object = Rake(stop, 3, 1, 3)
keywords = rake_object.run(data)

In [ ]:
p = model.word_vec("Potter")
e = model.word_vec("encyclopedia")
p_e = model.word_vec("Potter_encyclopedia")

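A natural follow-up (illustrative, not from the original notebook) is to check whether the phrase vector is close to the sum of its parts:

In [ ]:
from sklearn.metrics import pairwise

# Compare the composed vector (p + e) against the dedicated phrase embedding
print(pairwise.cosine_similarity((p + e).reshape(1, -1), p_e.reshape(1, -1)))
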
In [ ]:


In [ ]:
key_vec = []
for word in keywords:
    # append keeps one vector per keyword; extend would flatten them into scalars
    key_vec.append(model.word_vec(word[0]))

In [ ]:
test_words = generate_candidate_keywords(split_sentences(t1), stopword_pattern=stop, min_char_length=2, max_words_length=2)
test2_words = generate_candidate_keywords(split_sentences(t2), stopword_pattern=stop, min_char_length=2, max_words_length=2)
test3_words = generate_candidate_keywords(split_sentences(t3), stopword_pattern=stop, min_char_length=2, max_words_length=2)

In [ ]:
from itertools import compress

bool_split = [word in model.vocab for word in test_words]
x = list(compress(test_words, bool_split))

In [ ]:
lll = []
for y in x:
    lll.append(model.word_vec(y))

In [ ]:
model.index2word

My own Doc2Vec


In [ ]:
def getAvgFeatureVecs(reviews, model, num_features):
    # Given a set of reviews (each one a list of words), calculate
    # the average feature vector for each one and return a 2D numpy array
    #
    # Initialize a counter
    counter = 0
    #
    # Preallocate a 2D numpy array, for speed
    reviewFeatureVecs = np.zeros((len(reviews), num_features), dtype="float32")
    #
    # Loop through the reviews
    for review in reviews:
        # Print a status message every 1000th review
        if counter % 1000 == 0:
            print("Review %d of %d" % (counter, len(reviews)))

        # Call the function (defined below) that makes average feature vectors
        reviewFeatureVecs[counter] = makeFeatureVec(review, model, num_features)
        # Increment the counter
        counter += 1
    return reviewFeatureVecs


def makeFeatureVec(words, model, num_features):
    # Function to average all of the word vectors in a given
    # paragraph
    #
    # Pre-initialize an empty numpy array (for speed)
    featureVec = np.zeros((num_features,), dtype="float32")
    #
    nwords = 0
    #
    # index2word is a list that contains the names of the words in
    # the model's vocabulary. Convert it to a set, for speed
    index2word_set = set(model.index2word)
    #
    # Loop over each word in the review and, if it is in the model's
    # vocabulary, add its feature vector to the total
    for word in words:
        if word in index2word_set:
            nwords += 1
            featureVec = np.add(featureVec, model[word])
    #
    # Divide the result by the number of words to get the average
    featureVec = np.divide(featureVec, nwords)
    return featureVec

In [ ]:
test_list = [test_words, test2_words, test3_words]
trainDataVecs = [getAvgFeatureVecs([test_doc], model=model, num_features=model.vector_size) for test_doc in test_list]

In [ ]:
avg_test = (trainDataVecs[0] + trainDataVecs[1] + trainDataVecs[2])/3

In [ ]:
from sklearn.metrics import pairwise

In [ ]:
y = x[0] + x[1] + x[2]

In [ ]:
x_sum = [sum(vals) for vals in zip(x[0], x[1], x[2])]

In [ ]:
x = []
for test_vec in trainDataVecs:
    for key_word in key_vec:
        x.append(pairwise.cosine_similarity(X=key_word.reshape(1, -1), Y=test_vec.reshape(1, -1)))

In [ ]:
x = []
for test_vec in trainDataVecs:
    x.append([pairwise.cosine_similarity(X=key_word.reshape(1, -1), Y=test_vec.reshape(1, -1)) for key_word in key_vec])

In [ ]:
newList = [val / 3 for val in x_sum]

In [ ]:
names_key = [k[0] for k in keywords]

In [ ]:
final = list(zip(names_key, newList))

In [ ]:
fff = []
for i in range(len(names_key)):
    fff.append((names_key[i], newList[i][0][0]))

In [ ]:
ranked = sorted(fff, key=lambda x: x[1], reverse=True)
ranked

In [ ]:
# ****************************************************************
# Calculate average feature vectors for training and testing sets,
# using the functions we defined above. Notice that we now use stop-word
# removal.

clean_train_reviews = []
for review in ttt[0:1]:
    clean_train_reviews.append(review_to_wordlist(review, remove_stopwords=True))

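Note that review_to_wordlist is not defined in this notebook; it appears to come from the Kaggle word2vec tutorial. A minimal sketch of what it presumably does (an assumption, not the original helper):

In [ ]:
import re
from nltk.corpus import stopwords  # requires the nltk stopwords corpus

def review_to_wordlist(review, remove_stopwords=False):
    # Keep letters only, lowercase, and split into tokens
    words = re.sub("[^a-zA-Z]", " ", review).lower().split()
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    return words
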
In [ ]:
print "Creating average feature vecs for test reviews"
clean_test_reviews = []
for review in test["review"]:
    clean_test_reviews.append( review_to_wordlist( review, \
        remove_stopwords=True ))

testDataVecs = getAvgFeatureVecs( clean_test_reviews, model, num_features )

#


In [9]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Implementation of RAKE - Rapid Automatic Keyword Extraction algorithm
# as described in:
# Rose, S., D. Engel, N. Cramer, and W. Cowley (2010).
# Automatic keyword extraction from individual documents.
# In M. W. Berry and J. Kogan (Eds.), Text Mining: Applications and Theory. John Wiley and Sons, Ltd.
#
# NOTE: The original implementation (available at - https://github.com/zelandiya/RAKE-tutorial)
# has been extended and updated to work with Python 3 and to include more specific functionality


import re
import operator
import six
from six.moves import range


# Required functions for RAKE
def is_number(s):
    try:
        float(s) if '.' in s else int(s)
        return True
    except ValueError:
        return False


def load_stop_words(stop_word_file):
    """
    Utility function to load stop words from a file and return as a list of words
    @param stop_word_file Path and file name of a file containing stop words.
    @return list A list of stop words.
    """
    stop_words = []
    for line in open(stop_word_file):
        if line.strip()[0:1] != "#":
            for word in line.split():  # in case more than one per line
                stop_words.append(word)
    return stop_words


def separate_words(text, min_word_return_size):
    """
    Utility function to return a list of all words that have a length greater than a specified number of characters.
    @param text The text that must be split into words.
    @param min_word_return_size The minimum number of characters a word must have to be included.
    """
    splitter = re.compile('[^a-zA-Z0-9_\\+\\-/]')
    words = []
    for single_word in splitter.split(text):
        current_word = single_word.strip().lower()
        #leave numbers in phrase, but don't count as words, since they tend to invalidate scores of their phrases
        if len(current_word) > min_word_return_size and current_word != '' and not is_number(current_word):
            words.append(current_word)
    return words


def split_sentences(text):
    """
    Utility function to return a list of sentences.
    @param text The text that must be split into sentences.
    """
    sentence_delimiters = re.compile(u'[\\[\\]\n.!?,;:\t\\-\\"\\(\\)\\\'\u2019\u2013]')
    sentences = sentence_delimiters.split(text)
    return sentences


def build_stop_word_regex(stop_word_file_path):
    stop_word_list = load_stop_words(stop_word_file_path)
    stop_word_regex_list = []
    for word in stop_word_list:
        word_regex = '\\b' + word + '\\b'
        stop_word_regex_list.append(word_regex)
    stop_word_pattern = re.compile('|'.join(stop_word_regex_list), re.IGNORECASE)
    return stop_word_pattern


def generate_candidate_keywords(sentence_list, stopword_pattern, min_char_length=1, max_words_length=5):
    phrase_list = []
    for s in sentence_list:
        tmp = re.sub(stopword_pattern, '|', s.strip())
        phrases = tmp.split("|")
        for phrase in phrases:
            phrase = phrase.strip().lower()
            if phrase != "" and is_acceptable(phrase, min_char_length, max_words_length):
                phrase_list.append(phrase)
    return phrase_list


def is_acceptable(phrase, min_char_length, max_words_length):

    # a phrase must have a min length in characters
    if len(phrase) < min_char_length:
        return 0

    # a phrase must have a max number of words
    words = phrase.split()
    if len(words) > max_words_length:
        return 0

    digits = 0
    alpha = 0
    for i in range(0, len(phrase)):
        if phrase[i].isdigit():
            digits += 1
        elif phrase[i].isalpha():
            alpha += 1

    # a phrase must have at least one alpha character
    if alpha == 0:
        return 0

    # a phrase must have more alpha characters than digits
    if digits > alpha:
        return 0
    return 1


def calculate_word_scores(phraseList):
    word_frequency = {}
    word_degree = {}
    for phrase in phraseList:
        word_list = separate_words(phrase, 0)
        word_list_length = len(word_list)
        word_list_degree = word_list_length - 1
        # if word_list_degree > 3: word_list_degree = 3 #exp.
        for word in word_list:
            word_frequency.setdefault(word, 0)
            word_frequency[word] += 1
            word_degree.setdefault(word, 0)
            word_degree[word] += word_list_degree  # orig.
            # word_degree[word] += 1/(word_list_length*1.0) #exp.
    for item in word_frequency:
        word_degree[item] = word_degree[item] + word_frequency[item]

    # Calculate Word scores = deg(w)/freq(w)
    word_score = {}
    for item in word_frequency:
        word_score.setdefault(item, 0)
        word_score[item] = word_degree[item] / (word_frequency[item] * 1.0)  #orig.
    # word_score[item] = word_frequency[item]/(word_degree[item] * 1.0) #exp.
    return word_score

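# Worked example of the deg/freq score above (illustrative): for the phrases
# ["linear constraints", "linear diophantine equations"], freq("linear") = 2
# and deg("linear") = (1 + 2) + 2 = 5, so word_score["linear"] = 5 / 2 = 2.5.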

def generate_candidate_keyword_scores(phrase_list, word_score, min_keyword_frequency=1):
    keyword_candidates = {}

    for phrase in phrase_list:
        if min_keyword_frequency > 1:
            if phrase_list.count(phrase) < min_keyword_frequency:
                continue
        keyword_candidates.setdefault(phrase, 0)
        word_list = separate_words(phrase, 0)
        candidate_score = 0
        for word in word_list:
            candidate_score += word_score[word]
        keyword_candidates[phrase] = candidate_score
    return keyword_candidates


class Rake(object):
    def __init__(self, stop_words_path, min_char_length=1, max_words_length=5, min_keyword_frequency=1):
        self.__stop_words_path = stop_words_path
        self.__stop_words_pattern = build_stop_word_regex(stop_words_path)
        self.__min_char_length = min_char_length
        self.__max_words_length = max_words_length
        self.__min_keyword_frequency = min_keyword_frequency

    def run(self, text):
        sentence_list = split_sentences(text)
        phrase_list = generate_candidate_keywords(sentence_list, self.__stop_words_pattern,
                                                  self.__min_char_length, self.__max_words_length)
        word_scores = calculate_word_scores(phrase_list)
        keyword_candidates = generate_candidate_keyword_scores(phrase_list, word_scores, self.__min_keyword_frequency)
        sorted_keywords = sorted(six.iteritems(keyword_candidates), key=operator.itemgetter(1), reverse=True)
        return sorted_keywords


test = False
debug = False
# Testing + debugging RAKE on a pre-defined text block
if test:
    text = "Compatibility of systems of linear constraints over the set of natural numbers. " \
           "Criteria of compatibility of a system of linear Diophantine equations, strict inequations," \
           " and nonstrict inequations are considered. Upper bounds for components of a minimal set of " \
           "solutions and algorithms of construction of minimal generating sets of solutions for all types" \
           " of systems are given. These criteria and the corresponding algorithms for constructing a minimal" \
           " supporting set of solutions can be used in solving all the considered" \
           " types of systems and systems of mixed types."

    # Split text into sentences
    sentenceList = split_sentences(text)
    stoppath = "stop_words/sklearn_stopwords.txt"
    stopwordpattern = build_stop_word_regex(stoppath)

    # generate candidate keywords
    phraseList = generate_candidate_keywords(sentenceList, stopwordpattern)

    # calculate individual word scores
    wordscores = calculate_word_scores(phraseList)

    # generate candidate keyword scores
    keywordcandidates = generate_candidate_keyword_scores(phraseList, wordscores)
    if debug:
        print(keywordcandidates)

    sortedKeywords = sorted(six.iteritems(keywordcandidates), key=operator.itemgetter(1), reverse=True)
    if debug:
        print(sortedKeywords)

    totalKeywords = len(sortedKeywords)
    if debug:
        print(totalKeywords)
        print(sortedKeywords[0:(totalKeywords // 3)])

    rake = Rake("stop_words/sklearn_stopwords.txt")
    keywords = rake.run(text)
    print(keywords)

In [2]:
def make_feature_vec(words, model, num_features):
    """
    Function to average all of the word vectors in a given paragraph
    :param words:
    :param model:
    :param num_features:
    :return:
    """
    # Pre-initialize an empty numpy array (for speed)
    feature_vec = np.zeros((num_features,), dtype="float32")

    n_words = 0

    # Index2word is a list that contains the names of the words in
    # the model's vocabulary. Convert it to a set, for speed
    index2word_set = set(model.index2word)

    # Loop over each word in the review and, if it is in the model's
    # vocabulary, add its feature vector to the total
    for word in words:
        if word in index2word_set:
            n_words += 1
            feature_vec = np.add(feature_vec, model[word])

    # Divide the result by the number of words to get the average
    feature_vec = np.divide(feature_vec, n_words)
    return feature_vec


def get_avg_feature_vecs(reviews, model, num_features):
    # Given a set of reviews (each one a list of words), calculate
    # the average feature vector for each one and return a 2D numpy array
    #
    # Initialize a counter
    counter = 0
    #
    # Pre-allocate a 2D numpy array, for speed
    review_feature_vecs = np.zeros((len(reviews), num_features), dtype="float32")
    #
    # Loop through the reviews
    for review in reviews:
        # Print a status message
        # if counter % 1000 == 0:
        print("Review %d of %d" % (counter, len(reviews)))

        # Call the function (defined above) that makes average feature vectors
        review_feature_vecs[counter] = make_feature_vec(review, model, num_features)
        # Increment the counter
        counter += 1
    return review_feature_vecs

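A minimal usage sketch (hypothetical input; assumes model is already loaded). Each review must be a list of tokens, not a raw string:

In [ ]:
docs = [["harry", "potter", "book"], ["linear", "algebra", "notes"]]
vecs = get_avg_feature_vecs(docs, model, model.vector_size)
print(vecs.shape)  # (2, 300) for the GoogleNews vectors
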
In [ ]: