In [206]:
import re
from wikipedia import search, page
import gensim
from gensim.parsing import PorterStemmer
from gensim.models import Word2Vec
from scipy.spatial.distance import cosine
from networkx import Graph
from collections import defaultdict
import pygraphviz
import networkx as nx
from networkx.drawing.nx_agraph import graphviz_layout
In [207]:
global_stemmer = PorterStemmer()
class StemmingHelper(object):
"""
Class to aid the stemming process - from word to stemmed form,
and vice versa.
    The 'original' form of a stemmed word is returned as the form in
    which it has been used most often in the text.
    """
#This reverse lookup will remember the original forms of the stemmed
#words
word_lookup = {}
@classmethod
def stem(cls, word):
"""
Stems a word and updates the reverse lookup.
"""
#Stem the word
stemmed = global_stemmer.stem(word)
#Update the word lookup
if stemmed not in cls.word_lookup:
cls.word_lookup[stemmed] = {}
cls.word_lookup[stemmed][word] = (
cls.word_lookup[stemmed].get(word, 0) + 1)
return stemmed
@classmethod
def original_form(cls, word):
"""
Returns original form of a word given the stemmed version,
as stored in the word lookup.
"""
if word in cls.word_lookup:
return max(cls.word_lookup[word].keys(),
key=lambda x: cls.word_lookup[word][x])
else:
return word
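A quick, purely illustrative check of the helper (hypothetical words, not taken from any corpus): stem() collapses inflected forms onto one key, and original_form() returns whichever surface form was seen most often.
StemmingHelper.stem('learning')    # -> 'learn'
StemmingHelper.stem('learning')
StemmingHelper.stem('learned')     # -> 'learn'
StemmingHelper.original_form('learn')    # -> 'learning' (seen twice, vs. once for 'learned')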
In [208]:
def _get_param_matrices(vocabulary, sentence_terms):
"""
Returns
=======
    1. Top 300 (or fewer, if the vocabulary is smaller) most frequent terms (list)
    2. Co-occurrence matrix with respect to the most frequent terms (dict)
    3. Dict containing Pg of the most frequent terms (dict)
    4. nw (number of terms affected) for each term (dict)
"""
    #Figure out the top n terms by raw occurrence counts
n = min(300, len(vocabulary))
topterms = list(vocabulary.keys())
topterms.sort(key = lambda x: vocabulary[x], reverse = True)
topterms = topterms[:n]
#nw maps term to the number of terms it 'affects'
#(sum of number of terms in all sentences it
#appears in)
nw = {}
    #Co-occurrence values are with respect to top terms only
co_occur = {}
    #Initially, the co-occurrence matrix is empty
for x in vocabulary:
co_occur[x] = [0 for i in range(len(topterms))]
#Iterate over list of all sentences' vocabulary dictionaries
    #Build the co-occurrence matrix
for sentence in sentence_terms:
total_terms = sum(list(sentence.values()))
#This list contains the indices of all terms from topterms,
#that are present in this sentence
top_indices = []
#Populate top_indices
top_indices = [topterms.index(x) for x in sentence
if x in topterms]
        #Update the nw dict and the co-occurrence matrix
for term in sentence:
nw[term] = nw.get(term, 0) + total_terms
for index in top_indices:
co_occur[term][index] += (sentence[term] *
sentence[topterms[index]])
    #Pg is nw[term] divided by the total number of term occurrences in the text
Pg = {}
N = sum(list(vocabulary.values()))
for x in topterms:
Pg[x] = float(nw[x])/N
return topterms, co_occur, Pg, nw
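A minimal sketch of the expected input shapes, using toy, hypothetical counts: 'vocabulary' maps each term to its total count in the text, and 'sentence_terms' holds one term-count dict per sentence.
toy_vocab = {'cat': 3, 'dog': 2, 'fish': 1}
toy_sents = [{'cat': 2, 'dog': 1}, {'cat': 1, 'dog': 1, 'fish': 1}]
toy_top, toy_co, toy_Pg, toy_nw = _get_param_matrices(toy_vocab, toy_sents)
# toy_top -> ['cat', 'dog', 'fish']; toy_nw['cat'] -> 6 (3 terms in each of the two sentences it appears in)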
In [209]:
def get_top_n_terms(vocabulary, sentence_terms, n=50):
"""
Returns the top 'n' terms from a block of text, in the form of a list,
from most important to least.
    'vocabulary' should be a dict mapping each term to the number of its
    occurrences in the entire text.
'sentence_terms' should be an iterable of dicts, each denoting the
vocabulary of the corresponding sentence.
"""
#First compute the matrices
topterms, co_occur, Pg, nw = _get_param_matrices(vocabulary,
sentence_terms)
    #This dict will map each term to its weight with respect to the
    #document
    result = {}
    #Iterate over all terms in the vocabulary; each term's weight is summed
    #over all top terms (a chi-square-style statistic)
    for term in co_occur:
        term = str(term)
        for x in Pg:
            #expected_cooccur is the expected co-occurrence of 'term' with
            #top term 'x', based on the nw value of 'term' and the Pg value of 'x'
            expected_cooccur = nw[term] * Pg[x]
            #Accumulate the normalized squared difference between actual and
            #expected co-occurrence
            result[term] = (result.get(term, 0) +
                            (co_occur[term][topterms.index(x)] -
                             expected_cooccur)**2 / float(expected_cooccur))
terms = list(result.keys())
terms.sort(key=lambda x: result[x],
reverse=True)
return terms[:n]
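The weight accumulated above is a chi-square-style statistic: a term that co-occurs with the frequent terms much more (or less) than its overall frequency would predict gets a high score. A toy call with the same hypothetical counts as above:
get_top_n_terms(toy_vocab, toy_sents, n=2)    # returns the 2 highest-weighted terms, most important first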
In [210]:
def build_mind_map(model, stemmer, root, nodes, alpha=0.2):
"""
    Returns the Mind-Map in the form of a NetworkX DiGraph instance.
'model' should be an instance of gensim.models.Word2Vec
'nodes' should be a list of terms, included in the vocabulary of
'model'.
'root' should be the node that is to be used as the root of the Mind
Map graph.
'stemmer' should be an instance of StemmingHelper.
"""
#This will be the Mind-Map
g = nx.DiGraph()
    #Ensure that every node is in the vocabulary of the Word2Vec
#model, and that the root itself is included in the given nodes
for node in nodes:
if node not in model.vocab:
raise ValueError(node + " not in model's vocabulary")
if root not in nodes:
raise ValueError("root not in nodes")
##Containers for algorithm run
#Initially, all nodes are unvisited
unvisited_nodes = set(nodes)
#Initially, no nodes are visited
visited_nodes = set([])
#The following will map visited node to its contextual vector
visited_node_vectors = {}
    #The following will map unvisited nodes to (closest_distance, parent)
#parent will obviously be a visited node
node_distances = {}
#Initialization with respect to root
current_node = root
visited_node_vectors[root] = model[root]
unvisited_nodes.remove(root)
visited_nodes.add(root)
#Build the Mind-Map in n-1 iterations
for i in range(1, len(nodes)):
#For every unvisited node 'x'
for x in unvisited_nodes:
#Compute contextual distance between current node and x
dist_from_current = cosine(visited_node_vectors[current_node],
model[x])
#Get the least contextual distance to x found until now
distance = node_distances.get(x, (100, ''))
#If current node provides a shorter path to x, update x's
#distance and parent information
if distance[0] > dist_from_current:
node_distances[x] = (dist_from_current, current_node)
#Choose next 'current' as that unvisited node, which has the
#lowest contextual distance from any of the visited nodes
next_node = min(unvisited_nodes,
key=lambda x: node_distances[x][0])
##Update all containers
parent = node_distances[next_node][1]
del node_distances[next_node]
next_node_vect = ((1 - alpha)*model[next_node] +
alpha*visited_node_vectors[parent])
visited_node_vectors[next_node] = next_node_vect
unvisited_nodes.remove(next_node)
visited_nodes.add(next_node)
#Add the link between newly selected node and its parent(from the
#visited nodes) to the NetworkX Graph instance
g.add_edge(stemmer.original_form(parent).capitalize(),
stemmer.original_form(next_node).capitalize())
#The new node becomes the current node for the next iteration
current_node = next_node
return g
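The loop above is essentially a Prim-style greedy tree construction over contextual (cosine) distances, with each new node's vector blended towards its parent (controlled by alpha) so that deeper branches stay anchored to the root's context. A minimal call sketch, assuming the pre-1.0 gensim Word2Vec API used throughout this notebook (model.vocab, model[word]) and a list of stemmed terms that all appear in the model's vocabulary:
# Sketch only -- 'model' and 'nodes' are produced by the cells further below
# mm = build_mind_map(model, StemmingHelper, nodes[0], nodes, alpha=0.2)
# mm.edges()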
In [211]:
def decorate(decorator):
'''
Decorator wrapper - decorator(func(text))
'''
def decoratorFn(func):
def wrap(text):
return decorator(func(text))
return wrap
return decoratorFn
def loop(decorator):
'''
Decorator wrapper - [decorator(item) for item in func(text)]
'''
def decoratorFn(func):
def wrap(text):
return [decorator(item) for item in func(text)]
return wrap
return decoratorFn
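decorate() post-processes a function's return value, while loop() maps a function over each item of the return value. A tiny illustrative composition (hypothetical functions, not used elsewhere in this notebook):
@loop(str.upper)
@decorate(str.split)
def words(text):
    return text.strip()

words(' hello world ')    # -> ['HELLO', 'WORLD']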
In [212]:
def removePunctuations(text):
return re.sub(r'[^a-z ]', '', text).strip()
def padConnectors(text):
return re.sub(r'[,:\(\)\{\}\[\]]{1,}', ' ', text).strip()
def removeExtraSpaces(text):
return re.sub(r' {1,}', ' ', text).strip()
def filterStopwords(text):
    #Despite the name, this only drops very short tokens (length < 3);
    #true stopword removal happens later in removeStopwords()
    return filter(lambda x: len(x) >= 3, text)
@loop(StemmingHelper.stem)
@decorate(filterStopwords)
def textToTokens(sent):
return sent.strip().split(' ')
@decorate(removePunctuations)
@decorate(removeExtraSpaces)
@decorate(padConnectors)
def cleanLine(text):
return text
@loop(textToTokens)
@loop(cleanLine)
def textToSents(text):
    return re.split(r'[\r\n.;]+', text.strip())
@decorate(textToSents)
def parser(text):
return text.strip().lower()
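End to end, parser() lower-cases the text, splits it into sentences, strips punctuation and very short tokens, and stems what remains (stopword removal is a separate step below). An illustrative run on made-up input:
parser('Machine learning explores the study of algorithms.')
# roughly: [['machin', 'learn', 'explor', 'the', 'studi', 'algorithm'], []]
# (the trailing empty list comes from the final sentence delimiter; exact stems depend on the Porter stemmer)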
In [213]:
def keepOnly(bow, vocab):
return {key:val for (key,val) in bow.items() if vocab.get(key)}
def sentToBow(sent, tf = None):
if tf is None:
tf = defaultdict(int)
for token in sent:
tf[token] += 1
return tf
@loop(sentToBow)
def sentsToBow(sents):
return sents
def sentsToGlobalBow(sents):
tf = defaultdict(int)
for sent in sents:
sentToBow(sent, tf)
return tf
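These helpers build per-sentence and whole-text term-frequency dicts, and keepOnly() restricts them to a given vocabulary (later, the Word2Vec model's vocabulary). Toy, hypothetical usage:
toy_tokens = [['learn', 'model'], ['model', 'data', 'model']]
sentsToBow(toy_tokens)          # [{'learn': 1, 'model': 1}, {'model': 2, 'data': 1}] (as defaultdicts)
sentsToGlobalBow(toy_tokens)    # {'learn': 1, 'model': 3, 'data': 1}
keepOnly({'learn': 1, 'model': 3, 'data': 1}, {'model': 1})    # {'model': 3}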
In [214]:
with open('default_stopwords.txt') as f:
stopwords = f.read().strip().splitlines()
stopwords = list(map(lambda x: StemmingHelper.stem(x), stopwords))
def removeStopwords(sents, stopwords = stopwords):
return [[token for token in sent if token not in stopwords] for sent in sents]
def wikiSource(theme = 'machine learning'):
titles = search(theme)
wikipage = page(titles[0])
return wikipage.content
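The stopword list is stemmed with the same helper so that it matches the stemmed sentence tokens; wikiSource() fetches the best-matching Wikipedia page's plain-text content as an alternative corpus. For example (illustrative; results depend on default_stopwords.txt and the live Wikipedia search):
# removeStopwords([['machin', 'learn', 'the', 'data']])    # drops 'the' if its stem is in the stopword list
# text = wikiSource('machine learning')                    # plain-text article content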
In [357]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (10, 10)
import matplotlib
def graph_draw(graph):
pos=graphviz_layout(graph)
nx.draw_networkx_nodes(graph,
pos,
node_size=2500,
node_color="w")
nx.draw_networkx_labels(graph, pos, font_size=9)
nx.draw_networkx_edges(graph, pos)
plt.show()
#graph_draw(mm)
In [ ]:
import pandas as pd
df = pd.read_csv('RSRN.csv')
raw = '\n'.join(df.text.tolist())
raw
In [ ]:
min_count = 10
size = 50
window = 4
#sents = parser(wikiSource('machine learning'))
sents = parser(raw)
sents = removeStopwords(sents)
model = Word2Vec(sents, min_count=min_count, size=size, window=window)
vocab = model.vocab
sentBow = [keepOnly(bow, vocab) for bow in sentsToBow(sents)]
globalBow = keepOnly(sentsToGlobalBow(sents), vocab)
nodes = get_top_n_terms(globalBow, sentBow, 25)
nodes
In [ ]:
mm = build_mind_map(model, StemmingHelper, 'nathan', nodes)
In [ ]:
mm.edges()
In [ ]:
[StemmingHelper.original_form(token) for token in nodes]
In [238]:
# utility functions to access mongodb
from pymongo import MongoClient
import pandas as pd
import numpy as np
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer().stem
class DataHelper(object):
"""
Class to help load our data from mongodb
"""
def __init__(self, user, project, host = 'localhost', port = 27017):
self.mongo = MongoClient(host, port)
self.pointTo(user, project)
self.clear()
def pointTo(self, user, project):
self._user = user
self._prj = project
self.clear()
return self
def clear(self):
self._vocab = None
self._tf = None
self._dtm = None
self._model = None
@property
def _collectionPandas(self):
return self._prj + '#pandas'
@property
def _collectionData(self):
return self._prj + '#data'
@property
def _collectionModels(self):
return self._prj + '#models'
@property
def bow(self):
        '''
        Return the bag-of-words representation of each document, as a pandas
        Series: each entry is a list of (token id, term frequency) tuples.
        '''
collection = self.mongo[self._user][self._collectionPandas]
_data = list(collection.find(projection={'_id': False}))
_bow = pd.DataFrame(_data, columns=_data[0].keys()).bow_lda
#_vocab = self.vocab
#self._tf = [{_vocab[key]: count for key, count in doc} for doc in _bow]
return _bow
@property
def text(self):
        '''
        Return the raw text of each document, as a pandas Series.
        '''
collection = self.mongo[self._user][self._collectionPandas]
_data = list(collection.find(projection={'_id': False}))
return pd.DataFrame(_data, columns=_data[0].keys()).text
@property
def dtm(self):
'''
return sparse matrix with documents as rows, vocabulary as columns
'''
return gensim.matutils.corpus2csc(self.bow).T.copy()
@property
def df(self):
return self.dtm.getnnz(axis = 0)
@property
def tf(self):
return np.asarray(self.dtm.sum(axis = 0))[0]
@property
def vocab(self):
'''
return vocab as a list of strings
'''
if self._vocab:
return self._vocab
collection = self.mongo[self._user][self._collectionData]
self._vocab = collection.find_one({"Type": "VocabLda"})['Value']
return self._vocab
@property
def model(self):
        '''
        Return the stored topic-model document (topic/term and doc/topic
        distributions).
        '''
if self._model:
return self._model
collection = self.mongo[self._user][self._collectionModels]
self._model = collection.find_one({},{'_id': False})
return self._model
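For orientation, the accessors above imply the following per-project MongoDB layout (a sketch inferred from this class, not a complete schema; field values are placeholders):
# <project>#pandas : one document per text, e.g. {'text': '...', 'bow_lda': [[token_id, count], ...]}
# <project>#data   : {'Type': 'VocabLda', 'Value': ['term0', 'term1', ...]}
# <project>#models : {'topic_term_dists': [[...], ...], 'doc_topic_dists': [[...], ...]}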
In [239]:
mongoHost = '0.0.0.0'
mongoPort = 27017
user = '0ef01b0b-fffc-45d1-8c50-fda9b4e8933f'
prj = 'test_prj25'
dataHelper = DataHelper(user, prj, mongoHost, mongoPort)
In [297]:
import json
modelData = dataHelper.model
with open('model.json', 'w') as fout:
json.dump({
'topic_term_dists': modelData['topic_term_dists'],
'doc_topic_dists': modelData['doc_topic_dists'],
'vocab': dataHelper.vocab,
'tf': dataHelper.tf.tolist(),
'df': dataHelper.df.tolist()
}, fout)
In [365]:
def npmax(i, l):
max_idx = np.argmax(l)
max_val = l[max_idx]
return (i, max_idx, max_val)
raw = dataHelper.text.tolist()
vocab = dataHelper.vocab
docs_topic = modelData['doc_topic_dists']
topics_term = modelData['topic_term_dists']
num_topics = len(topics_term)
min_count = 10
size = 50
window = 4
doc_summary = [npmax(i, arr) for i, arr in enumerate(docs_topic)]
def trainTopicModel(fn):
def wrap(topic, doc_summary, threshold = 0.8):
        text = [raw[docId] for docId, topicId, score in fn(topic, doc_summary, threshold=threshold)]
text = u'\n'.join(text)
sents = parser(text)
sents = removeStopwords(sents)
model = Word2Vec(sents, min_count=min_count, size=size, window=window)
model.save('word2vec-'+str(topic)+'.pickle')
return model
return wrap
@trainTopicModel
def getTopDoc(topic, doc_summary, threshold = 0.9):
return filter(lambda x: x[1] == topic and x[2] >= threshold, doc_summary)
def getTopTerms(topic, topics_term, vocab, top = 15):
terms = sorted(enumerate(topics_term[topic]), key=lambda x: x[1], reverse = True)
return [vocab[i] for i, score in terms][0:top]
In [366]:
def doTopic(i):
model = getTopDoc(i, doc_summary)
nodes = getTopTerms(i, topics_term, vocab)
nodes = list(set(model.vocab.keys()).intersection(set(nodes)))
g = build_mind_map(model, StemmingHelper, nodes[0], nodes)
with open('graph-'+str(i)+'.json','w') as fout:
json.dump({
            'edges': list(g.edges()),
            'nodes': list(g.nodes())
}, fout)
return g
graphs = [doTopic(i) for i in range(num_topics)]
In [367]:
graph_draw(graphs[0])
In [368]:
graph_draw(graphs[1])
In [369]:
graph_draw(graphs[2])
In [370]:
graph_draw(graphs[3])
In [372]:
graph_draw(graphs[4])
In [371]:
min_count = 10
size = 50
window = 4
raw = u'\n'.join(dataHelper.text)
sents = parser(raw)
sents = removeStopwords(sents)
model = Word2Vec(sents, min_count=min_count, size=size, window=window)
vocab = model.vocab
sentBow = [keepOnly(bow, vocab) for bow in sentsToBow(sents)]
globalBow = keepOnly(sentsToGlobalBow(sents), vocab)
nodes = get_top_n_terms(globalBow, sentBow, 25)
mm = build_mind_map(model, StemmingHelper, nodes[0], nodes)
#graph_draw(mm)
In [262]:
model.save('word2vec.pickle')
In [261]:
Word2Vec.load('word2vec.pickle')
Out[261]: