In [73]:
import matplotlib.pyplot as plt
In [74]:
import nltk
# Explicit import instead of `from nltk.collocations import *`: only
# BigramCollocationFinder is used from that module.
from nltk.collocations import BigramCollocationFinder
from nltk.tokenize import *

# Read the corpus and score every bigram that appears at least twice,
# ranked by pointwise mutual information (PMI).
with open('roosevelt.txt', encoding='utf-8') as wordfile:
    text = wordfile.read()

words = nltk.word_tokenize(text)
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(words)
finder.apply_freq_filter(2)  # drop bigrams occurring fewer than 2 times
# scored: list of ((word1, word2), pmi_score) pairs
scored = finder.score_ngrams(bigram_measures.pmi)
In [75]:
def clustering_coefficient(G, v):
    """Local clustering coefficient of node v in adjacency-dict graph G.

    G maps each node to a dict of its neighbours (edges are stored in
    both directions, see make_link). Returns -1.0 when v has fewer than
    two neighbours, where the coefficient is undefined — previously a
    node with zero neighbours raised ZeroDivisionError because only the
    degree-1 case was guarded.
    """
    neighbors = list(G[v].keys())
    k = len(neighbors)
    if k < 2:
        return -1.0  # undefined for isolated and degree-1 nodes
    links = 0.0
    # Every undirected link between two neighbours is visited twice by
    # this double loop, contributing 0.5 each time -> 1.0 per link.
    for w in neighbors:
        for u in neighbors:
            if u in G[w]:
                links += 0.5
    # Standard formula: 2 * E / (k * (k - 1))
    return 2.0 * links / (k * (k - 1))
In [76]:
# G is the adjacency-dict graph populated later via make_link; networkx is
# imported for the (currently disabled) export/clustering path below.
import networkx as nx
G={}
# Earlier experiment, kept disabled: build an nx.Graph of positively-scored
# bigrams instead of the plain-dict graph.
# dictionary=nx.clustering(G, scored.keys(), scored.values)
#graph=nx.Graph()
#for (x,y),score in scored:
#if score>0:
#graph.add_edge(x,y,weight=score)
In [77]:
#nx.write_graphml(graph,"bigrams.graphml",encoding='utf-8',prettyprint=True)
In [83]:
def make_link(G, node1, node2):
    """Add an undirected edge of weight 1 between node1 and node2 in G.

    G is an adjacency dict mapping node -> {neighbour: 1}; nodes missing
    from G are created on demand. Returns G so calls can be chained.
    """
    for source, target in ((node1, node2), (node2, node1)):
        G.setdefault(source, {})[target] = 1
    return G
In [84]:
for (x,y) in scored: make_link(G,x,y)
In [85]:
# Report every node whose neighbourhood is at least partly interconnected.
for node in G:
    coefficient = clustering_coefficient(G, node)
    if coefficient > 0:
        print(node + ' : ' + str(coefficient))
In [ ]: