In [1]:
import nltk
In [7]:
import networkx as nx
In [44]:
import itertools
In [2]:
nltk.download('punkt')
Out[2]:
In [3]:
nltk.download('averaged_perceptron_tagger')
Out[3]:
In [16]:
f = open("./10.txt", encoding="utf-8")
In [17]:
fread = f.read()
In [18]:
fread
Out[18]:
In [21]:
# Not needed: read() already returns str because the file was opened with an
# explicit encoding; fread.decode('utf-8') is Python 2 idiom and fails on str.
Tokenize the text using NLTK
In [23]:
word_tokens = nltk.word_tokenize(fread)
Assign POS tags to the words in the text
In [24]:
tagged = nltk.pos_tag(word_tokens)
In [26]:
textlist = [x[0] for x in tagged]  # words in original order, used later for adjacency
In [30]:
# filter_for_tags: keep only nouns (NN), adjectives (JJ), and proper nouns (NNP)
defaulttags = ['NN', 'JJ', 'NNP']
tagged_filtered = [item for item in tagged if item[1] in defaulttags]
In [31]:
tagged_filtered
Out[31]:
Normalize: return a list of tuples with periods removed from each word (the first item of each tuple).
In [37]:
tagged_filtered_normalized = [(item[0].replace('.',''), item[1]) for item in tagged_filtered]
In [36]:
def unique_everseen(iterable, key=None):
    """List unique elements in order of appearance.

    Examples:
    unique_everseen('AAAABBBCCDAABBB') --> A B C D
    unique_everseen('ABBCcAD', str.lower) --> A B C D
    """
    seen = set()
    seen_add = seen.add
    if key is None:
        # filterfalse is lazy, so `seen` is consulted as each element
        # arrives; an eager list comprehension would be built while
        # `seen` is still empty and would let duplicates through.
        for element in itertools.filterfalse(seen.__contains__, iterable):
            seen_add(element)
            yield element
    else:
        for element in iterable:
            k = key(element)
            if k not in seen:
                seen_add(k)
                yield element
In [38]:
unique_word_set = unique_everseen([x[0] for x in tagged_filtered_normalized])
In [39]:
word_set_list = list(unique_word_set)
In [40]:
word_set_list
Out[40]:
This will be used to determine adjacent words when constructing two-word keyphrases (see the sketch at the end of this section).
In [41]:
gr = nx.Graph()
In [42]:
gr.add_nodes_from(word_set_list)
In [45]:
nodePairs = list(itertools.combinations(word_set_list, 2))
Add edges to the graph (weighted by Levenshtein distance)
In [48]:
def levenshtein_distance(first, second):
    """Return the Levenshtein distance between two strings.

    http://rosettacode.org/wiki/Levenshtein_distance#Python
    """
    if len(first) > len(second):
        first, second = second, first
    distances = range(len(first) + 1)
    for index2, char2 in enumerate(second):
        new_distances = [index2 + 1]
        for index1, char1 in enumerate(first):
            if char1 == char2:
                new_distances.append(distances[index1])
            else:
                new_distances.append(1 + min((distances[index1],
                                              distances[index1 + 1],
                                              new_distances[-1])))
        distances = new_distances
    return distances[-1]
For example,
In [51]:
example_pair = nodePairs[0]; example_pair
Out[51]:
In [52]:
levenshtein_distance( example_pair[0], example_pair[1] )
Out[52]:
In [55]:
[(index2, char2) for index2, char2 in enumerate(example_pair[1])]
Out[55]:
In [56]:
for pair in nodePairs:
    firstString = pair[0]
    secondString = pair[1]
    levDistance = levenshtein_distance(firstString, secondString)
    gr.add_edge(firstString, secondString, weight=levDistance)
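Since every pair of nodes receives an edge, the graph is complete. A quick sanity check, as a sketch (not part of the original notebook):
# A complete graph on n nodes has n*(n-1)/2 edges.
n = gr.number_of_nodes()
assert gr.number_of_edges() == n * (n - 1) // 2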
In [60]:
calculated_page_rank = nx.pagerank(gr, weight='weight')
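To inspect the scores themselves rather than just the ranking, the dict can be sorted by value. A minimal sketch (the top5 name is not in the original notebook):
# Show the five highest-scoring nodes with their PageRank values.
top5 = sorted(calculated_page_rank.items(), key=lambda kv: kv[1], reverse=True)[:5]
for word, score in top5:
    print(word, round(score, 4))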
Most important words, in descending order of importance
In [63]:
keyphrases = sorted(calculated_page_rank, key=calculated_page_rank.get, reverse=True)
The number of keyphrases returned is relative to the size of the text (one third of the number of vertices).
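A minimal sketch of that truncation step, following the one-third ratio described above (the one_third name is an assumption, not from the original notebook):
# Keep roughly the top third of the ranked vertices as keyphrase candidates.
one_third = len(word_set_list) // 3
keyphrases = keyphrases[0:one_third + 1]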
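As mentioned earlier, adjacency in the original text can then be used to merge ranked words into two-word keyphrases. A hedged sketch of one plausible implementation (the modified_keyphrases and dealt_with names are hypothetical, not the author's):
# Merge ranked words that are adjacent in the original text into two-word
# keyphrases; ranked words not absorbed into a pair stay as single words.
modified_keyphrases = set()
dealt_with = set()
for first, second in zip(textlist, textlist[1:]):
    if first in keyphrases and second in keyphrases:
        modified_keyphrases.add(first + ' ' + second)
        dealt_with.update((first, second))
for word in keyphrases:
    if word not in dealt_with:
        modified_keyphrases.add(word)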