RAKE Algorithm

Rapid Automatic Keyword Extraction

Features: domain-independent and language-independent

RAKE tends to extract longer, multi-word phrases.
A candidate phrase rarely contains punctuation or stop words, so each sentence is split into candidate phrases at punctuation marks and stop words.
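
As a minimal illustration (toy stop-word list and a lower-cased, pre-tokenized sentence from the sample abstract, not the full NLTK pipeline used in the code below):

stop = set(['of', 'a', 'and', 'are', 'the'])
sentence = ("criteria of compatibility of a system of linear diophantine "
            "equations , strict inequations , and nonstrict inequations are considered .")
phrases, current = [], []
for tok in sentence.split():
    if tok in stop or tok in ',.':
        # a stop word or punctuation mark ends the current candidate phrase
        if current:
            phrases.append(' '.join(current))
            current = []
    else:
        current.append(tok)
if current:
    phrases.append(' '.join(current))
print(phrases)
# ['criteria', 'compatibility', 'system', 'linear diophantine equations',
#  'strict inequations', 'nonstrict inequations', 'considered']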

Score of each phrase:
$$wordScore(w) = wordDegree(w) / wordFrequency(w)$$

Here wordFrequency(w) counts the occurrences of w in the candidate phrases, and wordDegree(w) adds to that the number of other words co-occurring with w in those phrases. phraseScore is the sum of the wordScore of every word in the phrase.
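
A worked example from the abstract in cell In [21] below: "linear" occurs in the two candidate phrases "linear constraints" (length 2) and "linear diophantine equations" (length 3), so wordFrequency(linear) = 2, wordDegree(linear) = 2 + 3 = 5, and wordScore(linear) = 5/2 = 2.5. Likewise wordScore(diophantine) = wordScore(equations) = 3/1 = 3 and wordScore(constraints) = 2/1 = 2, giving phraseScore("linear diophantine equations") = 2.5 + 3 + 3 = 8.5 and phraseScore("linear constraints") = 2.5 + 2 = 4.5, matching the scores printed below.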

Problem with the RAKE algorithm:

  • may produce overly long phrases, since a phrase's score is the sum of its word scores (e.g. "upload multiple tracks like english" in the YouTube example below); a simple length cap is sketched after this list
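
A minimal post-filter that caps candidate length (a hypothetical helper, not part of the reference implementation; it assumes the (phrase, score) pairs returned by extract(..., incl_scores=True)):

def filter_long_phrases(scored_phrases, max_words=3):
    # drop candidate phrases longer than max_words words
    return [(phrase, score) for phrase, score in scored_phrases
            if len(phrase.split()) <= max_words]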

In [20]:
# Adapted from: github.com/aneesha/RAKE/rake.py
from __future__ import division
import operator
import nltk
import string

def isPunct(word):
  return len(word) == 1 and word in string.punctuation

def isNumeric(word):
  try:
    float(word) if '.' in word else int(word)
    return True
  except ValueError:
    return False

class RakeKeywordExtractor:

  def __init__(self):
    self.stopwords = set(nltk.corpus.stopwords.words())
    self.top_fraction = 1 # 1 = keep all candidates; set to 3 to keep only the top third by score

  def _generate_candidate_keywords(self, sentences):
    phrase_list = []
    for sentence in sentences:
      words = map(lambda x: "|" if x in self.stopwords else x,
                    nltk.word_tokenize(sentence.lower()))
      phrase = []
      for word in words:
        if word == "|" or isPunct(word):
          if len(phrase) > 0:
            phrase_list.append(phrase)
            phrase = []
        else:
          phrase.append(word)
    return phrase_list

  def _calculate_word_scores(self, phrase_list):
    word_freq = nltk.FreqDist()
    word_degree = nltk.FreqDist()
    for phrase in phrase_list:
      word_list_degree = len(filter(lambda x: not isNumeric(x), phrase)) - 1
      for word in phrase:
        word_freq[word] += 1
        word_degree[word] += word_list_degree # degree from co-occurring words in this phrase
    for word in word_freq.keys():
      word_degree[word] = word_degree[word] + word_freq[word] # plus the word's own frequency
    # word score = deg(w) / freq(w)
    word_scores = {}
    for word in word_freq.keys():
      word_scores[word] = word_degree[word] / word_freq[word]
    return word_scores

  def _calculate_phrase_scores(self, phrase_list, word_scores):
    phrase_scores = {}
    for phrase in phrase_list:
      phrase_score = 0
      for word in phrase:
        phrase_score += word_scores[word]
      phrase_scores[" ".join(phrase)] = phrase_score
    return phrase_scores
    
  def extract(self, text, incl_scores=False):
    sentences = nltk.sent_tokenize(text)
    phrase_list = self._generate_candidate_keywords(sentences)
    word_scores = self._calculate_word_scores(phrase_list)
    phrase_scores = self._calculate_phrase_scores(phrase_list, word_scores)
    sorted_phrase_scores = sorted(phrase_scores.iteritems(), 
                                  key=operator.itemgetter(1), reverse=True)
    n_phrases = len(sorted_phrase_scores)
    if incl_scores:
      return sorted_phrase_scores[0:int(n_phrases/self.top_fraction)]
    else:
      return map(lambda x: x[0],
        sorted_phrase_scores[0:int(n_phrases/self.top_fraction)])

In [21]:
def test():
  rake = RakeKeywordExtractor()
  keywords = rake.extract("""
Compatibility of systems of linear constraints over the set of natural 
numbers. Criteria of compatibility of a system of linear Diophantine 
equations, strict inequations, and nonstrict inequations are considered. 
Upper bounds for components of a minimal set of solutions and algorithms 
of construction of minimal generating sets of solutions for all types of 
systems are given. These criteria and the corresponding algorithms for 
constructing a minimal supporting set of solutions can be used in solving 
all the considered types of systems and systems of mixed types.
  """, incl_scores=True)
  print keywords
  
    
test()


[('minimal generating sets', 8.666666666666666), ('linear diophantine equations', 8.5), ('minimal supporting set', 7.666666666666666), ('minimal set', 4.666666666666666), ('linear constraints', 4.5), ('upper bounds', 4.0), ('natural numbers', 4.0), ('nonstrict inequations', 4.0), ('strict inequations', 4.0), ('mixed types', 3.666666666666667), ('corresponding algorithms', 3.5), ('considered types', 3.166666666666667), ('set', 2.0), ('types', 1.6666666666666667), ('considered', 1.5), ('algorithms', 1.5), ('constructing', 1.0), ('solutions', 1.0), ('given', 1.0), ('solving', 1.0), ('system', 1.0), ('systems', 1.0), ('criteria', 1.0), ('compatibility', 1.0), ('used', 1.0), ('construction', 1.0), ('components', 1.0)]

In [22]:
sample = "in my own language.\
As a video uploader, this means you can reach\
to people all over the world,\
irrespective of language.\
[Hiroto, Bedhead]\
You can upload multiple tracks like English and French,\
and viewers can choose the track they like.\
[Toliver, Japanese Learner]\
For example, if you enjoy using YouTube in French,"
rake = RakeKeywordExtractor()
keywords = rake.extract(sample, incl_scores=True)

print keywords


[('upload multiple tracks like english', 23.0), ('enjoy using youtube', 9.0), ('video uploader', 4.0), ('reachto people', 4.0), ('japanese learner', 4.0), ('like', 3.0), ('toliver', 1.0), ('language', 1.0), ('means', 1.0), ('bedhead', 1.0), ('choose', 1.0), ('french', 1.0), ('track', 1.0), ('hiroto', 1.0), ('irrespective', 1.0), ('world', 1.0), ('example', 1.0), ('viewers', 1.0), ('language.as', 1.0)]

In [48]:
rake = RakeKeywordExtractor()
# test_article is defined in cell In [58] below (the cells were executed out of order)
keywords = rake.extract(test_article, incl_scores=True)

print keywords


[(u'console wars got even hotter today', 33.0), (u'\u2019\u201d said zenith ceo michael ahn', 32.8), (u'playstation 4 hitting stores later', 19.0), (u'next-generation video game systems', 15.333333333333334), (u'video game world.\u201d according', 15.333333333333334), (u'system\u2019s launch titles moonchaser', 15.0), (u'electronics manufacturer zenith announced', 14.8), (u'gamespace pro press event', 13.333333333333332), (u'cris collinsworth\u2019s pigskin 2013', 12.0), (u'nine launch titles', 10.0), (u'sleek silver-and-gray box', 9.0), (u'survival-horror thriller inzomnia', 9.0), (u'stores nov. 19', 7.0), (u'console game', 6.333333333333334), (u'gamespace pro', 5.333333333333333), (u'zenith representatives', 4.8), (u'play cds', 4.0), (u'z-connect technology', 4.0), (u'finally poised', 4.0), (u'starting price', 4.0), (u'3d graphics', 4.0), (u'big waves', 4.0), (u'internet using', 4.0), (u'xbox one', 4.0), (u'double-analog-stick controllers', 4.0), (u'console', 3.0), (u'zenith', 2.8), (u'650 units', 2.0), (u'saying', 1.0), (u'already', 1.0), (u'month', 1.0), (u'player', 1.0), (u'\u201cwith', 1.0), (u'make', 1.0), (u'radiation', 1.0), (u'preordered', 1.0), (u'way', 1.0), (u'officially', 1.0), (u'\u2018move', 1.0), (u'arrives', 1.0), (u'ability', 1.0), (u'lincolnshire', 1.0), (u'release', 1.0), (u'sony', 1.0), (u'log', 1.0), (u'showcasing', 1.0), (u'microsoft', 1.0), (u'374.99', 0.0)]

TextRank Algorithm

https://github.com/davidadamojr/TextRank

TextRank is a graph-based ranking algorithm: words are added as vertices of a graph and ranked by their associations (edges) with other words.
The paper reports the best performance when only nouns and adjectives are selected as potential keywords.
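
The vertex score defined in the paper (a weighted PageRank, which nx.pagerank computes in the code below) is

$$WS(V_i) = (1 - d) + d \sum_{V_j \in In(V_i)} \frac{w_{ji}}{\sum_{V_k \in Out(V_j)} w_{jk}} WS(V_j)$$

where d is the damping factor (0.85 by default) and w_ji is the weight of the edge from V_j to V_i.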


In [56]:
"""
From this paper: https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf
External dependencies: nltk, numpy, networkx
Based on https://gist.github.com/voidfiles/1646117
"""

import nltk
import itertools
from operator import itemgetter
import networkx as nx

class TextRank:

    def _normalize(self, tagged):
        return [(item[0].replace('.', ''), item[1]) for item in tagged]

    def _unique_everseen(self, iterable, key=None):
        "List unique elements, preserving order. Remember all elements ever seen."
        # unique_everseen('AAAABBBCCDAABBB') --> A B C D
        # unique_everseen('ABBCcAD', str.lower) --> A B C D
        seen = set()
        seen_add = seen.add
        if key is None:
            for element in itertools.ifilterfalse(seen.__contains__, iterable):
                seen_add(element)
                yield element
        else:
            for element in iterable:
                k = key(element)
                if k not in seen:
                    seen_add(k)
                    yield element

    def _lDistance(self, firstString, secondString):
        "Function to find the Levenshtein distance between two words/sentences - gotten from http://rosettacode.org/wiki/Levenshtein_distance#Python"
        if len(firstString) > len(secondString):
            firstString, secondString = secondString, firstString
        distances = range(len(firstString) + 1)
        for index2, char2 in enumerate(secondString):
            newDistances = [index2 + 1]
            for index1, char1 in enumerate(firstString):
                if char1 == char2:
                    newDistances.append(distances[index1])
                else:
                    newDistances.append(1 + min((distances[index1], distances[index1+1], newDistances[-1])))
            distances = newDistances
        return distances[-1]

    def _buildGraph(self, nodes):
        "nodes - list of hashables that represents the nodes of the graph"
        gr = nx.Graph() #initialize an undirected graph
        gr.add_nodes_from(nodes)
        nodePairs = list(itertools.combinations(nodes, 2))

        #add edges to the graph (weighted by Levenshtein distance)
        for pair in nodePairs:
            firstString = pair[0]
            secondString = pair[1]
            levDistance = self._lDistance(firstString, secondString)
            gr.add_edge(firstString, secondString, weight=levDistance)

        return gr

    def extractKeyphrases(self, text):
        #tokenize the text using nltk
        wordTokens = nltk.word_tokenize(text)

        #assign POS tags to the words in the text
        tagged = nltk.pos_tag(wordTokens)
        textlist = [x[0] for x in tagged]  # full token list, used later to find adjacent words and build two-word keyphrases

        # filter words with 'NN', 'JJ', 'NNP' tags
        tagged = filter(lambda x: x[1] in ['NN', 'JJ', 'NNP'], tagged)
        tagged = self._normalize(tagged)

        unique_word_set = self._unique_everseen([x[0] for x in tagged])
        word_set_list = list(unique_word_set)

        graph = self._buildGraph(word_set_list)

        #run PageRank on the word graph, using the Levenshtein-distance edge weights
        calculated_page_rank = nx.pagerank(graph, weight='weight')

        #most important words, in descending order of importance
        keyphrases = sorted(calculated_page_rank, key=calculated_page_rank.get, reverse=True)

        #the number of keyphrases returned will be relative to the size of the text (a third of the number of vertices)
        aThird = int(len(word_set_list) / 3)
        keyphrases = keyphrases[0:aThird+1]

        #take keyphrases with multiple words into consideration, as done in the paper:
        #if two words are adjacent in the text and both are selected as keywords, join them together
        modifiedKeyphrases = set([])
        dealtWith = set([]) #keeps track of individual keywords that have been joined to form a keyphrase
        i = 0
        j = 1
        while j < len(textlist):
            firstWord = textlist[i]
            secondWord = textlist[j]
            if firstWord in keyphrases and secondWord in keyphrases:
                keyphrase = firstWord + ' ' + secondWord
                modifiedKeyphrases.add(keyphrase)
                dealtWith.add(firstWord)
                dealtWith.add(secondWord)
            else:
                if firstWord in keyphrases and firstWord not in dealtWith: 
                    modifiedKeyphrases.add(firstWord)

                #if this is the last word in the text, and it is a keyword,
                #it definitely has no chance of being a keyphrase at this point    
                if j == len(textlist)-1 and secondWord in keyphrases and secondWord not in dealtWith:
                    modifiedKeyphrases.add(secondWord)

            i = i + 1
            j = j + 1

        return modifiedKeyphrases
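
Note that this implementation weights graph edges by the Levenshtein distance between candidate words; the paper instead connects words that co-occur within a window of N tokens. A minimal sketch of window-based edge construction (a hypothetical alternative to _buildGraph, reusing the candidate word list and the full token list):

import networkx as nx

def build_cooccurrence_graph(textlist, candidates, window=2):
    # connect two candidate words whenever they appear within
    # 'window' tokens of each other in the original text
    gr = nx.Graph()
    gr.add_nodes_from(candidates)
    candidate_set = set(candidates)
    for i, word in enumerate(textlist):
        if word not in candidate_set:
            continue
        for other in textlist[i + 1:i + window]:
            if other in candidate_set and other != word:
                gr.add_edge(word, other)
    return gr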

In [58]:
test_article = "LINCOLNSHIRE, IL With next-generation video game systems such as the Xbox One and the Playstation 4 hitting stores later this month, the console wars got even hotter today as electronics manufacturer Zenith announced the release of its own console, the Gamespace Pro, which arrives in stores Nov. 19. “With its sleek silver-and-gray box, double-analog-stick controllers, ability to play CDs, and starting price of $374.99, the Gamespace Pro is our way of saying, ‘Move over, Sony and Microsoft, Zenith is now officially a player in the console game,’” said Zenith CEO Michael Ahn at a Gamespace Pro press event, showcasing the system’s launch titles MoonChaser: Radiation, Cris Collinsworth’s Pigskin 2013, and survival-horror thriller InZomnia. “With over nine launch titles, 3D graphics, and the ability to log on to the internet using our Z-Connect technology, Zenith is finally poised to make some big waves in the video game world.” According to Zenith representatives, over 650 units have already been preordered."
test_article = test_article.decode('utf-8')

# test_list = test_article.split()
textRank = TextRank()
keyphrases = textRank.extractKeyphrases(test_article)
print 'keyphrase list:'
print keyphrases


keyphrase list:
set([u'Playstation', u'survival-horror thriller', u'LINCOLNSHIRE', u'MoonChaser', u'Collinsworth\u2019s', u'Radiation', u'Z-Connect technology', u'Microsoft', u'double-analog-stick', u'silver-and-gray', u'system\u2019s', u'thriller InZomnia', u'internet', u'next-generation', u'Gamespace', u'manufacturer'])

In [76]:
def extract_candidate_chunks(text, grammar=r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'):
    # the default grammar chunks noun phrases, optionally joined by a preposition
    # (e.g. "set of natural numbers"), as keyword candidates
    import itertools, nltk, string
    
    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # tokenize, POS-tag, and chunk using regular expressions
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))
    all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
                                                    for tagged_sent in tagged_sents))
    # join constituent chunk words into a single chunked phrase
    candidates = [' '.join(word for word, pos, chunk in group).lower()
                  for key, group in itertools.groupby(all_chunks, lambda (word,pos,chunk): chunk != 'O') if key]
    print candidates
    return [cand for cand in candidates
            if cand not in stop_words and not all(char in punct for char in cand)]

def extract_candidate_words(text, good_tags=set(['JJ','JJR','JJS','NN','NNP','NNS','NNPS'])):
    import itertools, nltk, string

    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # tokenize and POS-tag words
    tagged_words = itertools.chain.from_iterable(nltk.pos_tag_sents(nltk.word_tokenize(sent)
                                                                    for sent in nltk.sent_tokenize(text)))
    # filter on certain POS tags and lowercase all words
    candidates = [word.lower() for word, tag in tagged_words
                  if tag in good_tags and word.lower() not in stop_words
                  and not all(char in punct for char in word)]
    return candidates

def score_keyphrases_by_tfidf(texts, candidates='chunks'):
    import gensim, nltk
    
    # extract candidates from each text in texts, either chunks or words
    if candidates == 'chunks':
        boc_texts = [extract_candidate_chunks(text) for text in texts]
    elif candidates == 'words':
        boc_texts = [extract_candidate_words(text) for text in texts]
    # make gensim dictionary and corpus
    dictionary = gensim.corpora.Dictionary(boc_texts)
    corpus = [dictionary.doc2bow(boc_text) for boc_text in boc_texts]
    # transform corpus with tf*idf model
    tfidf = gensim.models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    
    return corpus_tfidf, dictionary
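
A hypothetical usage sketch (mirroring the call that is commented out in cell In [82] below), assuming texts is the list of sentences built in that cell:

corpus_tfidf, dictionary = score_keyphrases_by_tfidf(texts)
for doc in corpus_tfidf:
    # show each document's top 5 candidates by tf*idf weight
    top = sorted(doc, key=lambda item: item[1], reverse=True)[:5]
    print([(dictionary[term_id], round(weight, 3)) for term_id, weight in top])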

In [82]:
text = """
Compatibility of systems of linear constraints over the set of natural 
numbers. Criteria of compatibility of a system of linear Diophantine 
equations, strict inequations, and nonstrict inequations are considered. 
Upper bounds for components of a minimal set of solutions and algorithms 
of construction of minimal generating sets of solutions for all types of 
systems are given. These criteria and the corresponding algorithms for 
constructing a minimal supporting set of solutions can be used in solving 
all the considered types of systems and systems of mixed types.
  """
from nltk.tokenize import sent_tokenize
texts = sent_tokenize(text)

extract_candidate_chunks(text)
extract_candidate_words(text)
# print score_keyphrases_by_tfidf(texts)


['compatibility of systems', 'linear constraints', 'set of natural numbers', 'criteria of compatibility', 'system of linear diophantine equations', 'strict inequations', 'nonstrict inequations', 'upper', 'components', 'minimal set of solutions', 'algorithms of construction', 'sets of solutions', 'types of systems', 'criteria', 'corresponding algorithms', 'minimal supporting set of solutions', 'types of systems', 'systems of mixed types']
Out[82]:
[]

LDA Algorithm
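
LDA (Latent Dirichlet Allocation) is a generative topic model: each document is treated as a mixture of topics and each topic as a probability distribution over words, so that

$$P(w \mid d) = \sum_{k} P(w \mid z = k) \, P(z = k \mid d)$$

Keywords for a document can then be read off as the highest-probability words of its dominant topics, as in the cells below.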


In [6]:
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words

from gensim import corpora, models
import gensim


/usr/local/lib/python2.7/dist-packages/gensim/utils.py:1015: UserWarning: Pattern library is not installed, lemmatization won't be available.
  warnings.warn("Pattern library is not installed, lemmatization won't be available.")

In [2]:
test_doc = "in my own language.\
As a video uploader, this means you can reach\
to people all over the world,\
irrespective of language.\
[Hiroto, Bedhead]\
You can upload multiple tracks like English and French,\
and viewers can choose the track they like.\
[Toliver, Japanese Learner]\
For example, if you enjoy using YouTube in French,"

In [79]:
# preprocessing
from nltk import sent_tokenize
from nltk.stem.snowball import SnowballStemmer
import re

tokenizer = RegexpTokenizer(r'\w+')
# create English stop words list
en_stop_words = get_stop_words('en')

def data_clean(text, stemmer_type='english'):
    if stemmer_type in ['english', 'porter']:
        stemmer = SnowballStemmer(stemmer_type)
    else:
        stemmer = SnowballStemmer('english')
    cleaned_texts = []
    sentences = sent_tokenize(text)
    for sentence in sentences:
        # remove bracketed speaker names such as [Hiroto, Bedhead] (adjust the pattern for the specific file)
        sentence = re.sub('(\[.*\])', '', sentence)
        tokens = tokenizer.tokenize(sentence.lower())
        # remove stop words
        stopped_tokens = [i for i in tokens if i not in en_stop_words]
        stemmed_tokens = [stemmer.stem(token) for token in stopped_tokens]
        
        cleaned_texts.append(stemmed_tokens)
                
    return cleaned_texts

In [80]:
cleaned_texts = data_clean(test_doc, 'english')

In [81]:
cleaned_texts


Out[81]:
[[u'languag',
  u'video',
  u'upload',
  u'mean',
  u'can',
  u'reachto',
  u'peopl',
  u'world',
  u'irrespect',
  u'languag'],
 [u'can',
  u'upload',
  u'multipl',
  u'track',
  u'like',
  u'english',
  u'french',
  u'viewer',
  u'can',
  u'choos',
  u'track',
  u'like'],
 [u'exampl', u'enjoy', u'use', u'youtub', u'french']]

In [82]:
def lda_trainer(texts):
    # turn our tokenized documents into an id <-> term dictionary
    dictionary = corpora.Dictionary(texts)

    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in texts]
    # generate LDA model
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=2, id2word = dictionary, passes=20)
    
    return ldamodel.print_topics(num_topics=2, num_words=4)

In [83]:
lda_trainer(cleaned_texts)


Out[83]:
[(0, u'0.101*"french" + 0.099*"use" + 0.099*"enjoy" + 0.099*"exampl"'),
 (1, u'0.110*"can" + 0.078*"track" + 0.078*"like" + 0.078*"upload"')]

In [ ]:
# nouns (or verb + noun combinations) are more important keyword candidates
# noun phrases are also good candidates

In [ ]:
# train with the entire subtitle corpus
# use tf-idf to find the most important words