notebook.community

Edit and run



In [136]:

    
import nltk
from nltk.collocations import *
# Bigram association measures; `bgm.student_t` is passed to the finder as the
# scoring function in the scoring cells below.
bgm = nltk.collocations.BigramAssocMeasures()



In [76]:

    
# Path of the corpus file that is opened and tokenised in the next cell.
filename = "Output1.txt"



In [78]:

    
# Read the corpus into one flat list of whitespace-separated tokens.
# Tokens are accumulated across every line: rebinding `text` inside the loop
# would keep only the final line's tokens and would leave `text` undefined
# for an empty file. For a single-line file the result is the same either way.
text = []
with open(filename, encoding="utf-8") as f:
    for line in f:
        text.extend(line.split())



In [132]:

    
# Another BigramAssocMeasures instance, built the same way as `bgm`.
# NOTE(review): the visible scoring cells use `bgm`, not this name — confirm
# whether this duplicate is still needed.
bigram_measures = nltk.collocations.BigramAssocMeasures()



In [133]:

    
# a short text for testing
# text1 = "I do not like green eggs and ham , I do not like them Sam I am!"
# text1 = text1.split()



In [157]:

    
# Build the bigram collocation finder from the token list `text`.
# The filter cells that follow modify this `finder` object in place.
finder = BigramCollocationFinder.from_words(text)



In [158]:

    
# Filter out bigrams that contain '.',
# since '.' was used to separate different movies.
# Compare with == rather than `w in ('.')`: without a trailing comma, ('.') is
# a plain string, so `in` would be a substring test (which also matches '').
finder.apply_word_filter(lambda w: w == '.')



In [159]:

    
# TODO(review): the cutoff of 50 has not been validated — confirm it is a
# sensible threshold for this corpus.
# Ignore all bigrams that occur fewer than 50 times in the corpus.
finder.apply_freq_filter(50)



In [160]:

    
# Score every bigram that survived the filters with Student's t statistic.
# `scored` is a list of ((word1, word2), score) pairs; the output of the next
# cell shows the highest scores first.
scored = finder.score_ngrams(bgm.student_t)
# Number of scored bigrams.
len(scored)









    Out[160]:





39070



In [161]:

    
# Peek at the first ten (bigram, score) pairs.
scored[:10]









    Out[161]:





[(('watch', 'movi'), 91.63230089913765),
 (('one', 'best'), 87.40838156193752),
 (('main', 'charact'), 85.55962909562712),
 (('special', 'effect'), 85.28024715279213),
 (('even', 'though'), 85.22403988200098),
 (('look', 'like'), 84.36034575400157),
 (('dont', 'know'), 80.68202235738728),
 (('feel', 'like'), 79.34506359764823),
 (('ive', 'seen'), 78.24016689778063),
 (('year', 'old'), 76.88897952637127)]



In [162]:

    
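# One-tailed critical values for the t statistic (large-sample limit):
#   1.645 corresponds to alpha = 0.05
#   2.576 corresponds to alpha = 0.005
#   3.090 corresponds to alpha = 0.001
# Disabled for now. Note that above_score() must be called on the finder,
# not on the `scored` list (a plain list has no such method).
# filter_ngram = sorted(finder.above_score(bgm.student_t, 3.090))
# len(filter_ngram)
# filter_ngram[0:10]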









    



---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-162-0cdaee629387> in <module>()
      2 # 2.576 corresponds to alpha = 0.005
      3 # 3.090 corresponds to alpha = 0.001
----> 4 filter_ngram = sorted(scored.above_score(bgm.student_t, 3.090))

AttributeError: 'list' object has no attribute 'above_score'



In [163]:

    
# Ten best bigrams under the Student's t score, as bare word pairs
# (no scores attached).
bigram = finder.nbest(bgm.student_t, 10)



In [164]:

    
# Display the list of top bigrams computed in the previous cell.
bigram









    Out[164]:





[('watch', 'movi'),
 ('one', 'best'),
 ('main', 'charact'),
 ('special', 'effect'),
 ('even', 'though'),
 ('look', 'like'),
 ('dont', 'know'),
 ('feel', 'like'),
 ('ive', 'seen'),
 ('year', 'old')]



In [99]:

    
# IPython help lookup (interactive only; trailing `?` is not plain Python).
# NOTE(review): leftover exploration — consider removing before sharing.
finder.score_ngrams?



In [137]:

    
# IPython help lookup (interactive only; trailing `?` is not plain Python).
# NOTE(review): leftover exploration — consider removing before sharing.
bgm?



In [124]:

    
# IPython help lookup (interactive only; trailing `?` is not plain Python).
# NOTE(review): leftover exploration — consider removing before sharing.
bgm.student_t?



In [ ]: