Rapid Automatic Keyword Extraction (RAKE)
Features: domain-independent, language-independent.
Tends to extract longer, multi-word phrases.
A candidate phrase almost never contains punctuation or stop words, so each sentence is split into candidate phrases at punctuation marks and stop words; e.g. "Criteria of compatibility of a system of linear Diophantine equations, strict inequations" splits into the candidates "criteria", "compatibility", "system", "linear Diophantine equations", and "strict inequations".
Score of each word:
$$\text{wordScore}(w) = \text{wordDegree}(w) / \text{wordFrequency}(w)$$
phraseScore is the sum of the wordScores of the words in the phrase (a small worked example follows).
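As a quick illustration (hand-computed here, not a cell from the original notebook), the scores for two candidate phrases taken from the abstract used in the test cell below work out as follows; wordDegree counts co-occurrences within candidate phrases, including the word itself:

# illustration only: hand-compute RAKE word and phrase scores for two toy candidate phrases
phrases = [["linear", "constraints"], ["linear", "diophantine", "equations"]]
freq, degree = {}, {}
for phrase in phrases:
    for w in phrase:
        freq[w] = freq.get(w, 0) + 1
        degree[w] = degree.get(w, 0) + len(phrase)  # degree includes the word itself
word_scores = dict((w, degree[w] / float(freq[w])) for w in freq)
print(word_scores)  # linear: 5/2 = 2.5, constraints: 2.0, diophantine: 3.0, equations: 3.0
print(sum(word_scores[w] for w in ["linear", "diophantine", "equations"]))  # phraseScore = 8.5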
Problems of the RAKE algorithm: because phraseScore is a sum over the words in a phrase, longer candidate phrases tend to dominate the ranking, and a keyword containing a stop word can never be extracted.
In [20]:
# Adapted from: github.com/aneesha/RAKE/rake.py
from __future__ import division
import operator
import nltk
import string
def isPunct(word):
return len(word) == 1 and word in string.punctuation
def isNumeric(word):
try:
float(word) if '.' in word else int(word)
return True
except ValueError:
return False
class RakeKeywordExtractor:
def __init__(self):
self.stopwords = set(nltk.corpus.stopwords.words())
        self.top_fraction = 1  # 1 keeps all candidates; set to 3 to keep only the top third by score
def _generate_candidate_keywords(self, sentences):
phrase_list = []
for sentence in sentences:
words = map(lambda x: "|" if x in self.stopwords else x,
nltk.word_tokenize(sentence.lower()))
phrase = []
for word in words:
if word == "|" or isPunct(word):
if len(phrase) > 0:
phrase_list.append(phrase)
phrase = []
else:
phrase.append(word)
return phrase_list
def _calculate_word_scores(self, phrase_list):
word_freq = nltk.FreqDist()
word_degree = nltk.FreqDist()
for phrase in phrase_list:
word_list_degree = len(filter(lambda x: not isNumeric(x), phrase)) - 1
for word in phrase:
word_freq[word] += 1
word_degree[word] += word_list_degree # other words
for word in word_freq.keys():
word_degree[word] = word_degree[word] + word_freq[word] # itself
# word score = deg(w) / freq(w)
word_scores = {}
for word in word_freq.keys():
word_scores[word] = word_degree[word] / word_freq[word]
return word_scores
def _calculate_phrase_scores(self, phrase_list, word_scores):
phrase_scores = {}
for phrase in phrase_list:
phrase_score = 0
for word in phrase:
phrase_score += word_scores[word]
phrase_scores[" ".join(phrase)] = phrase_score
return phrase_scores
def extract(self, text, incl_scores=False):
sentences = nltk.sent_tokenize(text)
phrase_list = self._generate_candidate_keywords(sentences)
word_scores = self._calculate_word_scores(phrase_list)
phrase_scores = self._calculate_phrase_scores(phrase_list, word_scores)
sorted_phrase_scores = sorted(phrase_scores.iteritems(),
key=operator.itemgetter(1), reverse=True)
n_phrases = len(sorted_phrase_scores)
if incl_scores:
return sorted_phrase_scores[0:int(n_phrases/self.top_fraction)]
else:
return map(lambda x: x[0],
sorted_phrase_scores[0:int(n_phrases/self.top_fraction)])
In [21]:
def test():
rake = RakeKeywordExtractor()
keywords = rake.extract("""
Compatibility of systems of linear constraints over the set of natural
numbers. Criteria of compatibility of a system of linear Diophantine
equations, strict inequations, and nonstrict inequations are considered.
Upper bounds for components of a minimal set of solutions and algorithms
of construction of minimal generating sets of solutions for all types of
systems are given. These criteria and the corresponding algorithms for
constructing a minimal supporting set of solutions can be used in solving
all the considered types of systems and systems of mixed types.
""", incl_scores=True)
print keywords
test()
In [22]:
sample = "in my own language.\
As a video uploader, this means you can reach\
to people all over the world,\
irrespective of language.\
[Hiroto, Bedhead]\
You can upload multiple tracks like English and French,\
and viewers can choose the track they like.\
[Toliver, Japanese Learner]\
For example, if you enjoy using YouTube in French,"
rake = RakeKeywordExtractor()
keywords = rake.extract(sample, incl_scores=True)
print keywords
In [48]:
rake = RakeKeywordExtractor()
keywords = rake.extract(test_article, incl_scores=True)  # test_article is defined in cell In [58] below
print keywords
TextRank (implementation adapted from https://github.com/davidadamojr/TextRank)
A graph-based ranking algorithm: candidate words become vertices and are ranked by their associations (edges) in the graph.
It performs best when only nouns and adjectives are selected as potential keywords (a minimal sketch of the paper's co-occurrence variant follows).
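For reference, here is a minimal sketch of the keyword step as described in the TextRank paper (unweighted co-occurrence graph over a small window); the implementation below instead connects every pair of candidate words and weights edges by Levenshtein distance. The function name and parameters are illustrative, not part of the notebook:

import nltk
import networkx as nx

def textrank_keywords_sketch(text, top_n=5, window=2):
    # keep nouns and adjectives as candidate vertices
    words = nltk.word_tokenize(text)
    tagged = nltk.pos_tag(words)
    candidates = set(w for w, tag in tagged if tag.startswith('NN') or tag.startswith('JJ'))
    # connect candidates that co-occur within the window
    graph = nx.Graph()
    graph.add_nodes_from(candidates)
    for i, w in enumerate(words):
        if w not in candidates:
            continue
        for other in words[i + 1:i + window]:
            if other in candidates and other != w:
                graph.add_edge(w, other)
    # rank vertices with PageRank and return the top-scoring words
    ranks = nx.pagerank(graph)
    return sorted(ranks, key=ranks.get, reverse=True)[:top_n]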
In [56]:
"""
From this paper: https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf
External dependencies: nltk, numpy, networkx
Based on https://gist.github.com/voidfiles/1646117
"""
import nltk
import itertools
from operator import itemgetter
import networkx as nx
class TextRank:
def _normalize(self, tagged):
return [(item[0].replace('.', ''), item[1]) for item in tagged]
def _unique_everseen(self, iterable, key=None):
"List unique elements, preserving order. Remember all elements ever seen."
# unique_everseen('AAAABBBCCDAABBB') --> A B C D
# unique_everseen('ABBCcAD', str.lower) --> A B C D
seen = set()
seen_add = seen.add
if key is None:
for element in itertools.ifilterfalse(seen.__contains__, iterable):
seen_add(element)
yield element
else:
for element in iterable:
k = key(element)
if k not in seen:
seen_add(k)
yield element
def _lDistance(self, firstString, secondString):
"Function to find the Levenshtein distance between two words/sentences - gotten from http://rosettacode.org/wiki/Levenshtein_distance#Python"
if len(firstString) > len(secondString):
firstString, secondString = secondString, firstString
distances = range(len(firstString) + 1)
for index2, char2 in enumerate(secondString):
newDistances = [index2 + 1]
for index1, char1 in enumerate(firstString):
if char1 == char2:
newDistances.append(distances[index1])
else:
newDistances.append(1 + min((distances[index1], distances[index1+1], newDistances[-1])))
distances = newDistances
return distances[-1]
def _buildGraph(self, nodes):
"nodes - list of hashables that represents the nodes of the graph"
gr = nx.Graph() #initialize an undirected graph
gr.add_nodes_from(nodes)
nodePairs = list(itertools.combinations(nodes, 2))
#add edges to the graph (weighted by Levenshtein distance)
for pair in nodePairs:
firstString = pair[0]
secondString = pair[1]
levDistance = self._lDistance(firstString, secondString)
gr.add_edge(firstString, secondString, weight=levDistance)
return gr
def extractKeyphrases(self, text):
#tokenize the text using nltk
wordTokens = nltk.word_tokenize(text)
#assign POS tags to the words in the text
tagged = nltk.pos_tag(wordTokens)
textlist = [x[0] for x in tagged]
# filter words with 'NN', 'JJ', 'NNP' tags
tagged = filter(lambda x: x[1] in ['NN', 'JJ', 'NNP'], tagged)
tagged = self._normalize(tagged)
unique_word_set = self._unique_everseen([x[0] for x in tagged])
word_set_list = list(unique_word_set)
#this will be used to determine adjacent words in order to construct keyphrases with two words
graph = self._buildGraph(word_set_list)
        # run PageRank on the weighted graph (edge weights = Levenshtein distance)
        calculated_page_rank = nx.pagerank(graph, weight='weight')
        # most important words in descending order of importance
keyphrases = sorted(calculated_page_rank, key=calculated_page_rank.get, reverse=True)
#the number of keyphrases returned will be relative to the size of the text (a third of the number of vertices)
aThird = int(len(word_set_list) / 3)
keyphrases = keyphrases[0:aThird+1]
#take keyphrases with multiple words into consideration as done in the paper - if two words are adjacent in the text and are selected as keywords, join them
#together
modifiedKeyphrases = set([])
dealtWith = set([]) #keeps track of individual keywords that have been joined to form a keyphrase
i = 0
j = 1
while j < len(textlist):
firstWord = textlist[i]
secondWord = textlist[j]
if firstWord in keyphrases and secondWord in keyphrases:
keyphrase = firstWord + ' ' + secondWord
modifiedKeyphrases.add(keyphrase)
dealtWith.add(firstWord)
dealtWith.add(secondWord)
else:
if firstWord in keyphrases and firstWord not in dealtWith:
modifiedKeyphrases.add(firstWord)
#if this is the last word in the text, and it is a keyword,
#it definitely has no chance of being a keyphrase at this point
if j == len(textlist)-1 and secondWord in keyphrases and secondWord not in dealtWith:
modifiedKeyphrases.add(secondWord)
i = i + 1
j = j + 1
return modifiedKeyphrases
In [58]:
test_article = "LINCOLNSHIRE, IL With next-generation video game systems such as the Xbox One and the Playstation 4 hitting stores later this month, the console wars got even hotter today as electronics manufacturer Zenith announced the release of its own console, the Gamespace Pro, which arrives in stores Nov. 19. “With its sleek silver-and-gray box, double-analog-stick controllers, ability to play CDs, and starting price of $374.99, the Gamespace Pro is our way of saying, ‘Move over, Sony and Microsoft, Zenith is now officially a player in the console game,’” said Zenith CEO Michael Ahn at a Gamespace Pro press event, showcasing the system’s launch titles MoonChaser: Radiation, Cris Collinsworth’s Pigskin 2013, and survival-horror thriller InZomnia. “With over nine launch titles, 3D graphics, and the ability to log on to the internet using our Z-Connect technology, Zenith is finally poised to make some big waves in the video game world.” According to Zenith representatives, over 650 units have already been preordered."
test_article = test_article.decode('utf-8')  # Python 2: decode the UTF-8 byte string to unicode before tokenizing
# test_list = test_article.split()
textRank = TextRank()
keyphrases = textRank.extractKeyphrases(test_article)
print 'keyphrase list:'
print keyphrases
In [76]:
def extract_candidate_chunks(text, grammar=r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'):
import itertools, nltk, string
# exclude candidates that are stop words or entirely punctuation
punct = set(string.punctuation)
stop_words = set(nltk.corpus.stopwords.words('english'))
# tokenize, POS-tag, and chunk using regular expressions
chunker = nltk.chunk.regexp.RegexpParser(grammar)
tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))
all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
for tagged_sent in tagged_sents))
# join constituent chunk words into a single chunked phrase
candidates = [' '.join(word for word, pos, chunk in group).lower()
for key, group in itertools.groupby(all_chunks, lambda (word,pos,chunk): chunk != 'O') if key]
print candidates
return [cand for cand in candidates
if cand not in stop_words and not all(char in punct for char in cand)]
def extract_candidate_words(text, good_tags=set(['JJ','JJR','JJS','NN','NNP','NNS','NNPS'])):
import itertools, nltk, string
# exclude candidates that are stop words or entirely punctuation
punct = set(string.punctuation)
stop_words = set(nltk.corpus.stopwords.words('english'))
# tokenize and POS-tag words
tagged_words = itertools.chain.from_iterable(nltk.pos_tag_sents(nltk.word_tokenize(sent)
for sent in nltk.sent_tokenize(text)))
# filter on certain POS tags and lowercase all words
candidates = [word.lower() for word, tag in tagged_words
if tag in good_tags and word.lower() not in stop_words
and not all(char in punct for char in word)]
return candidates
def score_keyphrases_by_tfidf(texts, candidates='chunks'):
import gensim, nltk
# extract candidates from each text in texts, either chunks or words
if candidates == 'chunks':
boc_texts = [extract_candidate_chunks(text) for text in texts]
    elif candidates == 'words':
        boc_texts = [extract_candidate_words(text) for text in texts]
    else:
        raise ValueError("candidates must be 'chunks' or 'words'")
# make gensim dictionary and corpus
dictionary = gensim.corpora.Dictionary(boc_texts)
corpus = [dictionary.doc2bow(boc_text) for boc_text in boc_texts]
# transform corpus with tf*idf model
tfidf = gensim.models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
return corpus_tfidf, dictionary
In [82]:
text = """
Compatibility of systems of linear constraints over the set of natural
numbers. Criteria of compatibility of a system of linear Diophantine
equations, strict inequations, and nonstrict inequations are considered.
Upper bounds for components of a minimal set of solutions and algorithms
of construction of minimal generating sets of solutions for all types of
systems are given. These criteria and the corresponding algorithms for
constructing a minimal supporting set of solutions can be used in solving
all the considered types of systems and systems of mixed types.
"""
from nltk.tokenize import sent_tokenize
texts = sent_tokenize(text)
extract_candidate_chunks(text)
extract_candidate_words(text)
# print score_keyphrases_by_tfidf(texts)
Out[82]:
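To make the commented-out call in the cell above concrete, here is a small usage sketch (assuming the sentence list texts defined above): rank each sentence's candidate terms by their tf*idf weight and print the top few.

corpus_tfidf, dictionary = score_keyphrases_by_tfidf(texts)
for doc in corpus_tfidf:
    top_terms = sorted(doc, key=lambda item: item[1], reverse=True)[:3]
    print([(dictionary[term_id], round(weight, 3)) for term_id, weight in top_terms])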
In [6]:
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from gensim import corpora, models
import gensim
In [2]:
test_doc = "in my own language.\
As a video uploader, this means you can reach\
to people all over the world,\
irrespective of language.\
[Hiroto, Bedhead]\
You can upload multiple tracks like English and French,\
and viewers can choose the track they like.\
[Toliver, Japanese Learner]\
For example, if you enjoy using YouTube in French,"
In [79]:
# preprocessing
from nltk import sent_tokenize
from nltk.stem.snowball import SnowballStemmer
import re
tokenizer = RegexpTokenizer(r'\w+')
# create English stop words list
en_stop_words = get_stop_words('en')
def data_clean(text, stemmer_type='english'):
if stemmer_type in ['english', 'porter']:
stemmer = SnowballStemmer(stemmer_type)
else:
stemmer = SnowballStemmer('english')
cleaned_texts = []
sentences = sent_tokenize(text)
for sentence in sentences:
        # remove bracketed speaker names, e.g. "[Hiroto, Bedhead]" (adjust for the specific subtitle file)
        sentence = re.sub(r'\[.*?\]', '', sentence)  # non-greedy, so multiple bracketed names in one sentence are handled
tokens = tokenizer.tokenize(sentence.lower())
# remove stop words
stopped_tokens = [i for i in tokens if not i in en_stop_words]
stemmed_tokens = [stemmer.stem(token) for token in stopped_tokens]
cleaned_texts.append(stemmed_tokens)
return cleaned_texts
In [80]:
cleaned_texts = data_clean(test_doc, 'english')
In [81]:
cleaned_texts
Out[81]:
In [82]:
def lda_trainer(texts):
# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(texts)
# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]
# generate LDA model
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=2, id2word = dictionary, passes=20)
return ldamodel.print_topics(num_topics=2, num_words=4)
In [83]:
lda_trainer(cleaned_texts)
Out[83]:
In [ ]:
# nouns (and verb + noun combinations) are the more important keywords
# noun phrases are good candidates
In [ ]:
# train on the entire subtitle corpus
# use tf*idf to find the most important words (a small sketch follows)
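One possible way to act on these notes, reusing score_keyphrases_by_tfidf from earlier; subtitle_docs is a hypothetical list of subtitle texts (one string per video), not something defined in this notebook:

# subtitle_docs: hypothetical list of full subtitle texts, one document per video
corpus_tfidf, dictionary = score_keyphrases_by_tfidf(subtitle_docs, candidates='words')
for doc_id, doc in enumerate(corpus_tfidf):
    top_terms = sorted(doc, key=lambda item: item[1], reverse=True)[:10]
    print((doc_id, [dictionary[term_id] for term_id, weight in top_terms]))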