In [136]:
import nltk
from nltk.collocations import *
# Association-measure helper used by the scoring cells further down
# (they call bgm.student_t).
bgm = BigramAssocMeasures()
In [76]:
# Path of the corpus file that the next cell opens and tokenizes.
filename = "Output1.txt"
In [78]:
# Read the whole corpus and tokenize it on whitespace.
# The earlier per-line loop rebound `text` on every iteration, so only the
# tokens of the last line survived, and `text` stayed undefined for an empty
# file. Reading once keeps every token; for a single-line file the result is
# identical to the old behavior.
with open(filename, encoding="utf-8") as f:
    text = f.read().split()
In [132]:
# Second handle on NLTK's bigram association measures.
# NOTE(review): the visible scoring cells use `bgm`, so this name looks
# unused -- confirm before removing.
bigram_measures = BigramAssocMeasures()
In [133]:
# a short text for testing
# text1 = "I do not like green eggs and ham , I do not like them Sam I am!"
# text1 = text1.split()
In [157]:
# Build the bigram finder over the token list produced by the file-reading cell.
finder = BigramCollocationFinder.from_words(words=text)
In [158]:
# Drop every bigram in which either word is the token '.',
# since '.' was used to separate different movies.
# The comparison is exact: the previous `w in ('.')` was a substring test
# against the string '.' (the parentheses do not make a tuple), which would
# also have matched an empty token.
finder.apply_word_filter(lambda w: w == '.')
In [159]:
# NOTE(author): not sure if this cut-off is valid.
# Discard all bigrams that occur fewer than 50 times in the corpus.
finder.apply_freq_filter(min_freq=50)
In [160]:
# Score every bigram that survived the filters with Student's t, then show
# how many (bigram, score) pairs were produced.
scored = finder.score_ngrams(score_fn=bgm.student_t)
len(scored)
Out[160]:
In [161]:
# Peek at the first ten (bigram, score) pairs.
scored[:10]
Out[161]:
In [162]:
# one-tailed critical values (large-sample / normal approximation of t):
# 1.645 corresponds to alpha = 0.05
# 2.576 corresponds to alpha = 0.005
# 3.090 corresponds to alpha = 0.001
# filter_ngram = sorted(finder.above_score(bgm.student_t, 3.090))
# len(filter_ngram)
# filter_ngram[0:10]
In [163]:
# Keep the ten bigrams with the highest Student's t score.
bigram = finder.nbest(score_fn=bgm.student_t, n=10)
In [164]:
# Display the top-10 bigrams selected in the previous cell.
bigram
Out[164]:
In [99]:
# NOTE(review): IPython help lookup left over from exploration; drop before sharing.
finder.score_ngrams?
In [137]:
# NOTE(review): IPython help lookup left over from exploration; drop before sharing.
bgm?
In [124]:
# NOTE(review): IPython help lookup left over from exploration; drop before sharing.
bgm.student_t?
In [ ]: