In [1]:
import nltk

In [7]:
import networkx as nx

In [44]:
import itertools

In [2]:
nltk.download('punkt')


[nltk_data] Downloading package punkt to /home/topolo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Out[2]:
True

In [3]:
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/topolo/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
Out[3]:
True

In [16]:
f = open("./10.txt")

In [17]:
fread = f.read()

In [18]:
fread


Out[18]:
'\xe2\x80\x9cWith a surface temperature of 10,000 degrees Fahrenheit and frequent eruptions of ionized gases flowing along strong magnetic fields, the sun is the first star we\xe2\x80\x99ve seen with the right conditions to support fire organisms, and we believe there is evidence to support the theory that fire-bacteria, fire-insects, and even tiny fire-fish were once perhaps populous on the sun\xe2\x80\x99s surface.\xe2\x80\x9d Scientists cautioned that despite the exciting possibilities of fire-life on the star, there are numerous logistical, moral, and ethical questions to resolve before scientists could even begin to entertain the possibility of putting fire-people on the sun. \xef\xbb\xbfWASHINGTON In an announcement'

In [21]:
fread = fread.decode('utf-8')
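
This read-then-decode pattern is Python 2 specific. An equivalent sketch (assuming the file is UTF-8 encoded) uses io.open, which decodes while reading:

import io

# io.open returns unicode text directly, so no separate .decode() call is needed
with io.open("./10.txt", encoding="utf-8") as infile:
    fread = infile.read()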

Tokenize the text using NLTK.


In [23]:
word_tokens = nltk.word_tokenize(fread)

Assign part-of-speech (POS) tags to the words in the text.


In [24]:
tagged = nltk.pos_tag(word_tokens)

In [26]:
textlist = [x[0] for x in tagged]  # tokens in their original order, kept for the keyphrase adjacency step later

In [30]:
# filter_for_tags: keep only nouns (NN), adjectives (JJ), and proper nouns (NNP)
defaulttags = ['NN', 'JJ', 'NNP']
tagged_filtered = [item for item in tagged if item[1] in defaulttags]

In [31]:
tagged_filtered


Out[31]:
[(u'surface', 'NN'),
 (u'temperature', 'NN'),
 (u'Fahrenheit', 'NNP'),
 (u'frequent', 'JJ'),
 (u'ionized', 'JJ'),
 (u'strong', 'JJ'),
 (u'magnetic', 'JJ'),
 (u'sun', 'NN'),
 (u'first', 'JJ'),
 (u'star', 'NN'),
 (u'we\u2019ve', 'NN'),
 (u'right', 'JJ'),
 (u'fire', 'NN'),
 (u'evidence', 'NN'),
 (u'theory', 'NN'),
 (u'fire-bacteria', 'JJ'),
 (u'tiny', 'JJ'),
 (u'fire-fish', 'JJ'),
 (u'populous', 'JJ'),
 (u'sun\u2019s', 'NN'),
 (u'surface.\u201d', 'NN'),
 (u'exciting', 'JJ'),
 (u'fire-life', 'NN'),
 (u'star', 'NN'),
 (u'numerous', 'JJ'),
 (u'logistical', 'JJ'),
 (u'moral', 'JJ'),
 (u'ethical', 'JJ'),
 (u'possibility', 'NN'),
 (u'fire-people', 'NN'),
 (u'sun', 'NN'),
 (u'\ufeffWASHINGTON', 'NN'),
 (u'announcement', 'NN')]

Normalize: return a list of tuples with periods removed from the first item (the word) of each tuple.


In [37]:
tagged_filtered_normalized = [(item[0].replace('.',''), item[1]) for item in tagged_filtered]

In [36]:
def unique_everseen(iterable, key=None):
    """List unique elements, preserving order of first appearance.

    Examples:
    unique_everseen('AAAABBBCCDAABBB') --> A B C D
    unique_everseen('ABBCcAD', str.lower) --> A B C D
    """
    seen = set()
    seen_add = seen.add
    if key is None:
        # check membership lazily, element by element, so duplicates
        # seen earlier in the stream are actually filtered out
        for element in iterable:
            if element not in seen:
                seen_add(element)
                yield element
    else:
        for element in iterable:
            k = key(element)
            if k not in seen:
                seen_add(k)
                yield element

In [38]:
unique_word_set = unique_everseen([x[0] for x in tagged_filtered_normalized])

In [39]:
word_set_list = list(unique_word_set)

In [40]:
word_set_list


Out[40]:
[u'surface',
 u'temperature',
 u'Fahrenheit',
 u'frequent',
 u'ionized',
 u'strong',
 u'magnetic',
 u'sun',
 u'first',
 u'star',
 u'we\u2019ve',
 u'right',
 u'fire',
 u'evidence',
 u'theory',
 u'fire-bacteria',
 u'tiny',
 u'fire-fish',
 u'populous',
 u'sun\u2019s',
 u'surface\u201d',
 u'exciting',
 u'fire-life',
 u'numerous',
 u'logistical',
 u'moral',
 u'ethical',
 u'possibility',
 u'fire-people',
 u'\ufeffWASHINGTON',
 u'announcement']

Adjacent words in the original token list (textlist) will later be used to construct two-word keyphrases; a sketch of that step appears below.
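
That joining step is not shown in this notebook. Here is a minimal sketch, assuming keyphrases holds the ranked words computed further down; join_adjacent is a hypothetical helper name:

def join_adjacent(textlist, keyphrases):
    # Join ranked words that appear side by side in the original token
    # order into two-word keyphrases; keep other ranked words as-is.
    phrases = set()
    dealt_with = set()  # words already absorbed into a two-word phrase
    for first, second in zip(textlist, textlist[1:]):
        if first in keyphrases and second in keyphrases:
            phrases.add(first + ' ' + second)
            dealt_with.update([first, second])
        elif first in keyphrases and first not in dealt_with:
            phrases.add(first)
    # the last token never appears as `first`, so check it separately
    if textlist and textlist[-1] in keyphrases and textlist[-1] not in dealt_with:
        phrases.add(textlist[-1])
    return phrases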

Build the graph

Initialize an undirected networkx graph instance and add the unique words as its nodes.


In [41]:
gr = nx.Graph()

In [42]:
gr.add_nodes_from(word_set_list)

In [45]:
nodePairs = list(itertools.combinations(word_set_list, 2))  # every unordered pair of unique words

Add edges to the graph (weighted by Levenshtein distance).


In [48]:
def levenshtein_distance(first, second):
    """Return the Levenshtein distance between two strings.

    http://rosettacode.org/wiki/Levenshtein_distance#Python
    """
    if len(first) > len(second):
        first, second = second, first
    distances = range(len(first) + 1)
    for index2, char2 in enumerate(second):
        new_distances = [index2 + 1]
        for index1, char1 in enumerate(first):
            if char1 == char2:
                new_distances.append(distances[index1])
            else:
                new_distances.append(1 + min((distances[index1],
                                              distances[index1 + 1],
                                              new_distances[-1])))
        distances = new_distances
    return distances[-1]

For example,


In [51]:
example_pair = nodePairs[0]; example_pair


Out[51]:
(u'surface', u'temperature')

In [52]:
levenshtein_distance( example_pair[0], example_pair[1] )


Out[52]:
9

In [55]:
[( index2, char2 ) for index2, char2 in enumerate(example_pair[1])]


Out[55]:
[(0, u't'),
 (1, u'e'),
 (2, u'm'),
 (3, u'p'),
 (4, u'e'),
 (5, u'r'),
 (6, u'a'),
 (7, u't'),
 (8, u'u'),
 (9, u'r'),
 (10, u'e')]

In [56]:
for pair in nodePairs:
    firstString = pair[0]
    secondString = pair[1]
    levDistance = levenshtein_distance(firstString, secondString)
    # connect the two words, weighted by their edit distance
    gr.add_edge(firstString, secondString, weight=levDistance)

PageRank

Rank the vertices with PageRank, using an initial value of 1.0 per vertex and an error tolerance of 0.0001.


In [60]:
calculated_page_rank = nx.pagerank(gr, weight='weight')
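
The call above leaves nx.pagerank at its networkx defaults. To pass the initial value and error tolerance stated above explicitly, the standard nstart and tol arguments can be used (a sketch):

# give every vertex the same starting score (networkx normalizes nstart internally)
nstart = dict((node, 1.0) for node in gr)
calculated_page_rank = nx.pagerank(gr, tol=0.0001, nstart=nstart, weight='weight')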

Most important words in descending order of importance.


In [63]:
keyphrases = sorted(calculated_page_rank, key=calculated_page_rank.get, reverse=True)

The number of keyphrases returned will be relative to the size of the text (a third of the number of vertices); a sketch of that truncation follows.
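
The notebook stops before this step; a minimal sketch, assuming the ranked list keyphrases and the vertex list word_set_list from above:

# keep roughly the top third of the ranked words as keyphrases
one_third = len(word_set_list) // 3
keyphrases = keyphrases[:one_third]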

