In [73]:
import matplotlib.pyplot as plt
In [74]:
import nltk
# Explicit import instead of `from nltk.collocations import *`: only
# BigramCollocationFinder is used from that module.
from nltk.collocations import BigramCollocationFinder
from nltk.tokenize import *

# Read the corpus and score every bigram that appears at least twice,
# ranked by pointwise mutual information (PMI).
with open('roosevelt.txt', encoding='utf-8') as wordfile:
    text = wordfile.read()

words = nltk.word_tokenize(text)
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(words)
finder.apply_freq_filter(2)  # drop bigrams occurring fewer than 2 times
# scored: list of ((word1, word2), pmi_score) pairs
scored = finder.score_ngrams(bigram_measures.pmi)
In [75]:
def clustering_coefficient(G, v):
    """Local clustering coefficient of node v in adjacency-dict graph G.

    G maps each node to a dict of its neighbours (edges are stored in
    both directions, see make_link). Returns -1.0 when v has fewer than
    two neighbours, where the coefficient is undefined — previously a
    node with zero neighbours raised ZeroDivisionError because only the
    degree-1 case was guarded.
    """
    neighbors = list(G[v].keys())
    k = len(neighbors)
    if k < 2:
        return -1.0  # undefined for isolated and degree-1 nodes
    links = 0.0
    # Every undirected link between two neighbours is visited twice by
    # this double loop, contributing 0.5 each time -> 1.0 per link.
    for w in neighbors:
        for u in neighbors:
            if u in G[w]:
                links += 0.5
    # Standard formula: 2 * E / (k * (k - 1))
    return 2.0 * links / (k * (k - 1))
In [76]:
# G is the adjacency-dict graph populated later via make_link; networkx is
# imported for the (currently disabled) export/clustering path below.
import networkx as nx
G={}
# Earlier experiment, kept disabled: build an nx.Graph of positively-scored
# bigrams instead of the plain-dict graph.
# dictionary=nx.clustering(G, scored.keys(), scored.values)
#graph=nx.Graph()
#for (x,y),score in scored:
#if score>0:
#graph.add_edge(x,y,weight=score)
In [77]:
#nx.write_graphml(graph,"bigrams.graphml",encoding='utf-8',prettyprint=True)
In [83]:
def make_link(G, node1, node2):
    """Add an undirected edge of weight 1 between node1 and node2 in G.

    G is an adjacency dict mapping node -> {neighbour: 1}; nodes missing
    from G are created on demand. Returns G so calls can be chained.
    """
    for source, target in ((node1, node2), (node2, node1)):
        G.setdefault(source, {})[target] = 1
    return G
In [84]:
for (x,y) in scored: make_link(G,x,y)
In [85]:
# Report every node whose neighbourhood is at least partly interconnected.
for node in G:
    coefficient = clustering_coefficient(G, node)
    if coefficient > 0:
        print(node + ' : ' + str(coefficient))
In [ ]: