In [136]:
import nltk
from nltk.collocations import *
# Bigram association-measure helper; its `student_t` scorer is what the
# later cells pass to the collocation finder for ranking.
bgm = nltk.collocations.BigramAssocMeasures()

In [76]:
# Path of the UTF-8 text file that holds the corpus to analyse.
filename = 'Output1.txt'

In [78]:
# Collect the whitespace-separated tokens of the whole file into `text`.
# Tokens are accumulated across lines: re-assigning `text` inside the loop
# would keep only the last line's tokens and would leave `text` undefined
# for an empty file.
text = []
with open(filename, encoding="utf-8") as f:
    for line in f:
        text.extend(line.split())

In [132]:
# A second BigramAssocMeasures instance.
# NOTE(review): no visible cell reads `bigram_measures` (scoring goes through
# `bgm`), so this looks redundant — confirm before removing.
bigram_measures = nltk.collocations.BigramAssocMeasures()

In [133]:
# a short text for testing
# text1 = "I do not like green eggs and ham , I do not like them Sam I am!"
# text1 = text1.split()

In [157]:
# Build the bigram collocation finder from the token list read from the file.
finder = BigramCollocationFinder.from_words(text)

In [158]:
# Filter out bigrams that contain the '.' token,
# since '.' was used to separate different movies.
# Use an equality test: `w in ('.')` is a substring check against the string
# '.', because parentheses without a trailing comma do not make a tuple
# (it would also match the empty string).
finder.apply_word_filter(lambda w: w == '.')

In [159]:
# Minimum number of occurrences in the corpus a bigram needs in order to be
# kept; rarer bigrams are ignored.
# TODO: the cutoff of 50 has not been validated (the author was unsure).
min_bigram_count = 50
finder.apply_freq_filter(min_bigram_count)

In [160]:
# Score the bigrams that survived the filters with the Student's t measure,
# giving a list of (bigram, score) pairs; the cell then shows how many
# bigrams were scored.
scored = finder.score_ngrams(bgm.student_t)
len(scored)


Out[160]:
39070

In [161]:
# Peek at the first ten (bigram, score) pairs.
scored[:10]


Out[161]:
[(('watch', 'movi'), 91.63230089913765),
 (('one', 'best'), 87.40838156193752),
 (('main', 'charact'), 85.55962909562712),
 (('special', 'effect'), 85.28024715279213),
 (('even', 'though'), 85.22403988200098),
 (('look', 'like'), 84.36034575400157),
 (('dont', 'know'), 80.68202235738728),
 (('feel', 'like'), 79.34506359764823),
 (('ive', 'seen'), 78.24016689778063),
 (('year', 'old'), 76.88897952637127)]

In [162]:
# One-tailed critical values of the t statistic (large-sample limit):
# 1.645 corresponds to alpha = 0.05
# 2.576 corresponds to alpha = 0.005
# 3.090 corresponds to alpha = 0.001
# filter_ngram = sorted(finder.above_score(bgm.student_t, 3.090))
# len(filter_ngram)
# filter_ngram[0:10]


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-162-0cdaee629387> in <module>()
      2 # 2.576 corresponds to alpha = 0.005
      3 # 3.090 corresponds to alpha = 0.001
----> 4 filter_ngram = sorted(scored.above_score(bgm.student_t, 3.090))

AttributeError: 'list' object has no attribute 'above_score'

In [163]:
# Take the ten top-ranked bigrams under the Student's t measure
# (the bigrams only, without their scores).
bigram = finder.nbest(bgm.student_t, 10)

In [164]:
# Display the selected bigrams.
bigram


Out[164]:
[('watch', 'movi'),
 ('one', 'best'),
 ('main', 'charact'),
 ('special', 'effect'),
 ('even', 'though'),
 ('look', 'like'),
 ('dont', 'know'),
 ('feel', 'like'),
 ('ive', 'seen'),
 ('year', 'old')]

In [99]:
finder.score_ngrams?

In [137]:
bgm?

In [124]:
bgm.student_t?

In [ ]: