In [34]:
import nltk
from nltk.collocations import *
# BUG FIX: `from sklearn.externals import joblib` was deprecated in
# scikit-learn 0.21 and removed in 0.23; joblib is a standalone package.
import joblib
In [3]:
# Association-measure bundles for scoring n-grams; only `raw_freq` is
# used below, but PMI, chi-squared, etc. are available on the same objects.
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()
In [5]:
filename = "Output1.txt"

# Read the entire corpus at once.
# BUG FIX: the original `for line in f: text = line` rebound `text` on
# every iteration, so only the file's LAST line survived. For a
# single-line file the result is identical; for multi-line input the
# whole text is now analyzed.
with open(filename) as f:
    text = f.read()

# Split into word and punctuation tokens (punctuation marks become
# their own tokens, hence the comma filters applied below).
tokens = nltk.wordpunct_tokenize(text)
In [8]:
# Bigram pipeline: build finder, snapshot unfiltered scores, then filter.
finder = BigramCollocationFinder.from_words(tokens)

# Raw-frequency scores BEFORE any filtering (kept for comparison).
notfilter = finder.score_ngrams(bigram_measures.raw_freq)

# Keep only bigrams occurring at least 50 times.
finder.apply_freq_filter(50)
temp = finder.score_ngrams(bigram_measures.raw_freq)

# Drop bigrams containing a comma token.
# BUG FIX: the original predicate was `w in (',')` — the parentheses do
# NOT make a tuple, so this was a string-membership test. `w == ','`
# states the intent and behaves identically for all non-empty tokens.
finder.apply_word_filter(lambda w: w == ',')

res_bi = finder.score_ngrams(bigram_measures.raw_freq)

# Bigrams with their raw counts: most frequent first, ties alphabetical.
res_bi_freq = sorted(finder.ngram_fd.items(), key=lambda t: (-t[1], t[0]))
In [18]:
# Trigram pipeline: mirrors the bigram pipeline above.
finder_tr = TrigramCollocationFinder.from_words(tokens)

# Keep only trigrams occurring at least 50 times.
finder_tr.apply_freq_filter(50)

# Drop trigrams containing a comma token (`w == ','` replaces the
# misleading string-membership test `w in (',')`).
finder_tr.apply_word_filter(lambda w: w == ',')

res_tr = finder_tr.score_ngrams(trigram_measures.raw_freq)

# Trigrams with their raw counts: most frequent first, ties alphabetical.
# BUG FIX: this was assigned to `res_bi_freq`, silently clobbering the
# bigram frequency table and leaving `res_tr_freq` — dumped at the end
# of the file — undefined (NameError on a fresh run).
res_tr_freq = sorted(finder_tr.ngram_fd.items(), key=lambda t: (-t[1], t[0]))
In [27]:
import re
from collections import Counter

# Unigram vocabulary: every distinct \w+ token in the corpus.
# Note the ordering is arbitrary — it comes straight from a set.
words = re.findall(r'\w+', text)
mono_gram = list(set(words))
In [61]:
# Discard the raw_freq scores, keeping just the n-gram tuples.
# NOTE(review): self-referential rebinding — running this cell a second
# time would index into the n-gram tuples themselves; not idempotent
# under Restart-&-Run-All discipline.
res_bi = [ngram for ngram, _score in res_bi]
res_tr = [ngram for ngram, _score in res_tr]
In [73]:
import pickle

# Persist the filtered n-gram lists so downstream stages can reload them.
with open('res_tr.pkl', 'wb') as out_file:
    pickle.dump(res_tr, out_file)

with open('res_bi.pkl', 'wb') as out_file:
    pickle.dump(res_bi, out_file)
In [35]:
# Output the bigram (ngram, count) table with joblib.
# NOTE(review): as the file stands, `res_bi_freq` was re-assigned to the
# TRIGRAM table in the cell commented "trigram with frequency", so this
# dump actually writes trigram counts — that assignment should be named
# `res_tr_freq`.
joblib.dump(res_bi_freq, 'res_bi_freq')
Out[35]:
In [36]:
# NOTE(review): `res_tr_freq` is never defined anywhere above — the
# trigram frequency table was mistakenly assigned to `res_bi_freq` — so
# this line raises NameError on a fresh run until that assignment is
# renamed.
joblib.dump(res_tr_freq, 'res_tr_freq')
In [ ]: