In [57]:
# Implementation of RAKE - Rapid Automtic Keyword Exraction algorithm
# as described in:
# Rose, S., D. Engel, N. Cramer, and W. Cowley (2010).
# Automatic keyword extraction from indi-vidual documents.
# In M. W. Berry and J. Kogan (Eds.), Text Mining: Applications and Theory.unknown: John Wiley and Sons, Ltd.
#
# NOTE: The original code (from https://github.com/aneesha/RAKE)
# has been extended by a_medelyan (zelandiya)
# with a set of heuristics to decide whether a phrase is an acceptable candidate
# as well as the ability to set frequency and phrase length parameters
# important when dealing with longer documents
import re
import operator
debug = False
test = False
def is_number(s):
try:
float(s) if '.' in s else int(s)
return True
except ValueError:
return False
SmartStoplist = ["a","a's","able","about","above","according","accordingly","across","actually","after","afterwards","again","against","ain't","all","allow","allows","almost","alone","along","already","also","although","always","am","among","amongst","an","and","another","any","anybody","anyhow","anyone","anything","anyway","anyways","anywhere","apart","appear","appreciate","appropriate","are","aren't","around","as","aside","ask","asking","associated","at","available","away","awfully","b","be","became","because","become","becomes","becoming","been","before","beforehand","behind","being","believe","below","beside","besides","best","better","between","beyond","both","brief","but","by","c","c'mon","c's","came","can","can't","cannot","cant","cause","causes","certain","certainly","changes","clearly","co","com","come","comes","concerning","consequently","consider","considering","contain","containing","contains","corresponding","could","couldn't","course","currently","d","definitely","described","despite","did","didn't","different","do","does","doesn't","doing","don't","done","down","downwards","during","e","each","edu","eg","eight","either","else","elsewhere","enough","entirely","especially","et","etc","even","ever","every","everybody","everyone","everything","everywhere","ex","exactly","example","except","f","far","few","fifth","first","five","followed","following","follows","for","former","formerly","forth","four","from","further","furthermore","g","get","gets","getting","given","gives","go","goes","going","gone","got","gotten","greetings","h","had","hadn't","happens","hardly","has","hasn't","have","haven't","having","he","he's","hello","help","hence","her","here","here's","hereafter","hereby","herein","hereupon","hers","herself","hi","him","himself","his","hither","hopefully","how","howbeit","however","i","i'd","i'll","i'm","i've","ie","if","ignored","immediate","in","inasmuch","inc","indeed","indicate","indicated","indicates","inner","insofar","instead","into","inward","is","isn't","it","it'd","it'll","it's","its","itself","j","just","k","keep","keeps","kept","know","knows","known","l","last","lately","later","latter","latterly","least","less","lest","let","let's","like","liked","likely","little","look","looking","looks","ltd","m","mainly","many","may","maybe","me","mean","meanwhile","merely","might","more","moreover","most","mostly","much","must","my","myself","n","name","namely","nd","near","nearly","necessary","need","needs","neither","never","nevertheless","new","next","nine","no","nobody","non","none","noone","nor","normally","not","nothing","novel","now","nowhere","o","obviously","of","off","often","oh","ok","okay","old","on","once","one","ones","only","onto","or","other","others","otherwise","ought","our","ours","ourselves","out","outside","over","overall","own","p","particular","particularly","per","perhaps","placed","please","plus","possible","presumably","probably","provides","q","que","quite","qv","r","rather","rd","re","really","reasonably","regarding","regardless","regards","relatively","respectively","right","s","said","same","saw","say","saying","says","second","secondly","see","seeing","seem","seemed","seeming","seems","seen","self","selves","sensible","sent","serious","seriously","seven","several","shall","she","should","shouldn't","since","six","so","some","somebody","somehow","someone","something","sometime","sometimes","somewhat","somewhere","soon","sorry","specified","specify","specifying","still","sub","such","sup","sure","t","t's","take","taken","tell","tends","th","than","thank","thanks","thanx","that","that's","thats","the","their","theirs","them","themselves","then","thence","there","there's","thereafter","thereby","therefore","therein","theres","thereupon","these","they","they'd","they'll","they're","they've","think","third","this","thorough","thoroughly","those","though","three","through","throughout","thru","thus","to","together","too","took","toward","towards","tried","tries","truly","try","trying","twice","two","u","un","under","unfortunately","unless","unlikely","until","unto","up","upon","us","use","used","useful","uses","using","usually","uucp","v","value","various","very","via","viz","vs","w","want","wants","was","wasn't","way","we","we'd","we'll","we're","we've","welcome","well","went","were","weren't","what","what's","whatever","when","whence","whenever","where","where's","whereafter","whereas","whereby","wherein","whereupon","wherever","whether","which","while","whither","who","who's","whoever","whole","whom","whose","why","will","willing","wish","with","within","without","won't","wonder","would","would","wouldn't","x","y","yes","yet","you","you'd","you'll","you're","you've","your","yours","yourself","yourselves","z","zero"]
def load_stop_words(stop_word_file):
"""
Utility function to load stop words from a file and return as a list of words
@param stop_word_file Path and file name of a file containing stop words.
@return list A list of stop words.
"""
#stop_words = []
#for line in open(stop_word_file):
# if line.strip()[0:1] != "#":
# for word in line.split(): # in case more than one per line
# stop_words.append(word)
return stop_word_file
def separate_words(text, min_word_return_size):
"""
Utility function to return a list of all words that are have a length greater than a specified number of characters.
@param text The text that must be split in to words.
@param min_word_return_size The minimum no of characters a word must have to be included.
"""
splitter = re.compile('[^a-zA-Z0-9_\\+\\-/]')
words = []
for single_word in splitter.split(text):
current_word = single_word.strip().lower()
#leave numbers in phrase, but don't count as words, since they tend to invalidate scores of their phrases
if len(current_word) > min_word_return_size and current_word != '' and not is_number(current_word):
words.append(current_word)
return words
def split_sentences(text):
"""
Utility function to return a list of sentences.
@param text The text that must be split in to sentences.
"""
sentence_delimiters = re.compile(u'[\\[\\]\n.!?,;:\t\\-\\"\\(\\)\\\'\u2019\u2013]')
sentences = sentence_delimiters.split(text)
return sentences
def build_stop_word_regex(stop_word_file_path):
stop_word_list = load_stop_words(stop_word_file_path)
stop_word_regex_list = []
for word in stop_word_list:
word_regex = '\\b' + word + '\\b'
stop_word_regex_list.append(word_regex)
stop_word_pattern = re.compile('|'.join(stop_word_regex_list), re.IGNORECASE)
return stop_word_pattern
def generate_candidate_keywords(sentence_list, stopword_pattern, min_char_length=1, min_words_length=1,max_words_length=5):
phrase_list = []
for s in sentence_list:
tmp = re.sub(stopword_pattern, '|', s.strip())
phrases = tmp.split("|")
for phrase in phrases:
phrase = phrase.strip().lower()
if phrase != "" and is_acceptable(phrase, min_char_length, min_words_length, max_words_length):
phrase_list.append(phrase)
return phrase_list
def is_acceptable(phrase, min_char_length, min_words_length, max_words_length):
# a phrase must have a min length in characters
if len(phrase) < min_char_length:
return 0
# a phrase must have a min and max number of words
words = phrase.split()
if (len(words) > max_words_length) or (len(words)< min_words_length):
return 0
digits = 0
alpha = 0
for i in range(0, len(phrase)):
if phrase[i].isdigit():
digits += 1
elif phrase[i].isalpha():
alpha += 1
# a phrase must have at least one alpha character
if alpha == 0:
return 0
# a phrase must have more alpha than digits characters
if digits > alpha:
return 0
return 1
def calculate_word_scores(phraseList):
word_frequency = {}
word_degree = {}
for phrase in phraseList:
word_list = separate_words(phrase, 0)
word_list_length = len(word_list)
word_list_degree = word_list_length - 1
#if word_list_degree > 3: word_list_degree = 3 #exp.
for word in word_list:
word_frequency.setdefault(word, 0)
word_frequency[word] += 1
word_degree.setdefault(word, 0)
word_degree[word] += word_list_degree #orig.
#word_degree[word] += 1/(word_list_length*1.0) #exp.
for item in word_frequency:
word_degree[item] = word_degree[item] + word_frequency[item]
# Calculate Word scores = deg(w)/frew(w)
word_score = {}
for item in word_frequency:
word_score.setdefault(item, 0)
word_score[item] = word_degree[item] / (word_frequency[item] * 1.0) #orig.
#word_score[item] = word_frequency[item]/(word_degree[item] * 1.0) #exp.
return word_score
def generate_candidate_keyword_scores(phrase_list, word_score, min_keyword_frequency=1):
keyword_candidates = {}
for phrase in phrase_list:
if min_keyword_frequency > 1:
if phrase_list.count(phrase) < min_keyword_frequency:
continue
keyword_candidates.setdefault(phrase, 0)
word_list = separate_words(phrase, 0)
candidate_score = 0
for word in word_list:
candidate_score += word_score[word]
keyword_candidates[phrase] = candidate_score
return keyword_candidates
class Rake(object):
def __init__(self, stop_words_path, min_char_length=1, min_words_length=1, max_words_length=5, min_keyword_frequency=1):
self.__stop_words_path = stop_words_path
self.__stop_words_pattern = build_stop_word_regex(stop_words_path)
self.__min_char_length = min_char_length
self.__min_words_length = min_words_length
self.__max_words_length = max_words_length
self.__min_keyword_frequency = min_keyword_frequency
def run(self, text):
sentence_list = split_sentences(text)
phrase_list = generate_candidate_keywords(sentence_list, self.__stop_words_pattern, self.__min_char_length, self.__min_words_length, self.__max_words_length)
word_scores = calculate_word_scores(phrase_list)
keyword_candidates = generate_candidate_keyword_scores(phrase_list, word_scores, self.__min_keyword_frequency)
sorted_keywords = sorted(keyword_candidates.iteritems(), key=operator.itemgetter(1), reverse=True)
return sorted_keywords
if test:
text = "UN SG, Kofi Annan, on Tuesday presented the UN SC with six options ranging from maintaining the status quo to a full withdrawal for the UN mission monitoring the tense Ethiopia-Eritrea border. In a report to the 15-member council on the activities of the UN Mission in Ethiopia and Eritrea (UNMEE), Annan made it clear that none of the options was perfect and said that both Addis Ababa and Asmara would still have to fully implement council resolution 1640. Annan also suggested that the council set deadlines for compliance with its previous resolutions on the issue. And he warned that unless Eritrea's restrictions on UNMEE movements, including a ban on helicopter flights, were lifted rapidly, ""I would be obliged to make recommendations to the council concerning force deployment on the ground by the end of January 2006."" Annan's report was released amid a continuing stalemate in the Horn of Africa border dispute after UNMEE last month completed the withdrawal of its North American and European UN peacekeepers from Eritrea, meeting a deadline set by Asmara for their expulsions."
# Split text into sentences
sentenceList = split_sentences(text)
#stoppath = "FoxStoplist.txt" #Fox stoplist contains "numbers", so it will not find "natural numbers" like in Table 1.1
stoppath = "RAKE/SmartStoplist.txt" #SMART stoplist misses some of the lower-scoring keywords in Figure 1.5, which means that the top 1/3 cuts off one of the 4.0 score words in Table 1.1
stopwordpattern = build_stop_word_regex(stoppath)
# generate candidate keywords
phraseList = generate_candidate_keywords(sentenceList, stopwordpattern)
# calculate individual word scores
wordscores = calculate_word_scores(phraseList)
# generate candidate keyword scores
keywordcandidates = generate_candidate_keyword_scores(phraseList, wordscores)
if debug: print keywordcandidates
sortedKeywords = sorted(keywordcandidates.iteritems(), key=operator.itemgetter(1), reverse=True)
if debug: print sortedKeywords
totalKeywords = len(sortedKeywords)
if debug: print totalKeywords
print sortedKeywords[0:(totalKeywords / 3)]
def extractKeyword(text):
rake = Rake(SmartStoplist,5,2,3,1)
keywords = rake.run(text)
print keywords
In [62]:
text = "Fuel shortages due to rebel infighting threaten lives,Hundreds of private trucks used to pass through the crossing daily, carrying crudely processed oil bought from fields under the control of ISIL in eastern Syria, to sell to the rest of northern rebel-held Syria."
extractKeyword(text)