In [34]:
import nltk
from nltk.collocations import *
from sklearn.externals import joblib

In [3]:
bigram_measures = nltk.collocations.BigramAssocMeasures()

In [4]:
trigram_measures = nltk.collocations.TrigramAssocMeasures()

In [5]:
filename = "Output1.txt"

In [6]:
with open(filename) as f:
    for line in f:
        text = line

In [7]:
tokens = nltk.wordpunct_tokenize(text)

In [8]:
finder = BigramCollocationFinder.from_words(tokens)

In [9]:
notfilter = finder.score_ngrams(bigram_measures.raw_freq)

In [ ]:
#scored = finder.score_ngrams(bigram_measures.raw_freq)

In [10]:
finder.apply_freq_filter(50)

In [11]:
temp = finder.score_ngrams(bigram_measures.raw_freq)

In [16]:
finder.apply_word_filter(lambda w: w in (','))

In [19]:
res_bi = finder.score_ngrams(bigram_measures.raw_freq)

In [17]:
# bigram with frequency
res_bi_freq = sorted(finder.ngram_fd.items(), key=lambda t: (-t[1], t[0]))

In [18]:
finder_tr = TrigramCollocationFinder.from_words(tokens)

In [19]:
finder_tr.apply_freq_filter(50)

In [20]:
finder_tr.apply_word_filter(lambda w: w in (','))

In [21]:
res_tr = finder_tr.score_ngrams(trigram_measures.raw_freq)

In [23]:
# trigram with frequency
res_bi_freq = sorted(finder_tr.ngram_fd.items(), key=lambda t: (-t[1], t[0]))

In [27]:
import re
from collections import Counter

In [28]:
words = re.findall(r'\w+', text)

In [30]:
mono_gram = set(words)

In [31]:
mono_gram = list(mono_gram)

In [61]:
res_bi = [item[0] for item in res_bi]

In [62]:
res_tr = [item[0] for item in res_tr]

In [73]:
import pickle

In [81]:
with open('res_tr.pkl', 'wb') as f:
    pickle.dump(res_tr, f)

In [85]:
with open('res_bi.pkl', 'wb') as f:
    pickle.dump(res_bi, f)

In [35]:
#output bi and trigram with freq
joblib.dump(res_bi_freq, 'res_bi_freq')


Out[35]:
['res_bi_freq']

In [36]:
joblib.dump(res_tr_freq, 'res_tr_freq')


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-36-ea2d35bbea74> in <module>()
----> 1 joblib.dump(res_tr_freq, 'res_tr_freq')

NameError: name 'res_tr_freq' is not defined

In [ ]: