In [2]:
from sklearn.externals import joblib
from collections import Counter
import nltk
import numpy as np
from numba import jit
import pickle

In [4]:
# Load the stored trigram collection and convert it to a set in one step;
# find_trigram only ever uses it for `in` membership checks.
# NOTE(review): joblib.load unpickles the file - only load trusted files.
trigram = set(joblib.load('res_tr.pkl'))

In [83]:
# Size of the trigram set used as the filter vocabulary.
len(trigram)


Out[83]:
3619

In [81]:
# Load the stored bigram collection and convert it to a set in one step;
# find_bigram only ever uses it for `in` membership checks.
# NOTE(review): joblib.load unpickles the file - only load trusted files.
bigram = set(joblib.load('res_bi.pkl'))

In [82]:
# Size of the bigram set used as the filter vocabulary.
len(bigram)


Out[82]:
39075

In [7]:
# Load the cleaned data set; the cells below use it as a DataFrame
# (column access, .shape, .to_pickle).
clean_complete_df = joblib.load('clean_complete_df')

In [77]:
def get_trigram(text):
    """Count the token trigrams of ``text``.

    The text is split with ``nltk.wordpunct_tokenize`` and the resulting
    token sequence is passed through ``nltk.trigrams``.

    Parameters
    ----------
    text : str
        The document to analyse.

    Returns
    -------
    nltk.FreqDist
        Maps each trigram produced by ``nltk.trigrams`` to the number of
        times it occurs in ``text``.
    """
    tokens = nltk.wordpunct_tokenize(text)
    return nltk.FreqDist(nltk.trigrams(tokens))


# Element-wise version for arrays / Series of documents.  otypes is given
# explicitly so that an empty input returns an empty object array instead of
# raising ValueError, and so that NumPy does not make an extra call on the
# first element just to infer the output type.
get_trigram_ = np.vectorize(get_trigram, otypes=[object])

In [75]:
def find_trigram(c):
    """Keep only the entries of ``c`` whose key is in the global ``trigram`` set.

    Parameters
    ----------
    c : mapping
        Maps trigram tuples to counts, e.g. the ``FreqDist`` returned by
        ``get_trigram``.

    Returns
    -------
    dict
        A new plain ``dict`` holding the ``(key, count)`` pairs of ``c`` whose
        key is a member of ``trigram``, in the iteration order of ``c``.
        ``c`` itself is not modified.
    """
    return {gram: count for gram, count in c.items() if gram in trigram}


# Element-wise version for arrays of count mappings.  otypes is given
# explicitly so that an empty input returns an empty object array instead of
# raising ValueError, and so that NumPy does not make an extra call on the
# first element just to infer the output type.
find_trigram_ = np.vectorize(find_trigram, otypes=[object])

In [84]:
def get_bigram(text):
    """Count the token bigrams of ``text``.

    The text is split with ``nltk.wordpunct_tokenize`` and the resulting
    token sequence is passed through ``nltk.bigrams``.

    Parameters
    ----------
    text : str
        The document to analyse.

    Returns
    -------
    nltk.FreqDist
        Maps each bigram produced by ``nltk.bigrams`` to the number of times
        it occurs in ``text``.
    """
    tokens = nltk.wordpunct_tokenize(text)
    return nltk.FreqDist(nltk.bigrams(tokens))


# Element-wise version for arrays / Series of documents.  otypes is given
# explicitly so that an empty input returns an empty object array instead of
# raising ValueError, and so that NumPy does not make an extra call on the
# first element just to infer the output type.
get_bigram_ = np.vectorize(get_bigram, otypes=[object])

In [85]:
def find_bigram(c):
    """Keep only the entries of ``c`` whose key is in the global ``bigram`` set.

    Parameters
    ----------
    c : mapping
        Maps bigram tuples to counts, e.g. the ``FreqDist`` returned by
        ``get_bigram``.

    Returns
    -------
    dict
        A new plain ``dict`` holding the ``(key, count)`` pairs of ``c`` whose
        key is a member of ``bigram``, in the iteration order of ``c``.
        ``c`` itself is not modified.
    """
    return {gram: count for gram, count in c.items() if gram in bigram}


# Element-wise version for arrays of count mappings.  otypes is given
# explicitly so that an empty input returns an empty object array instead of
# raising ValueError, and so that NumPy does not make an extra call on the
# first element just to infer the output type.
find_bigram_ = np.vectorize(find_bigram, otypes=[object])

In [86]:
# Trigram counts for every entry of the 'reviews' column (one FreqDist per row).
reviews = get_trigram_(clean_complete_df['reviews'])

In [87]:
# Keep only the review trigrams present in the `trigram` set; store as a new column.
clean_complete_df['reviews_tri'] = find_trigram_(reviews)

In [88]:
# Trigram counts for every entry of the 'content' column (one FreqDist per row).
contents = get_trigram_(clean_complete_df['content'])

In [89]:
# Keep only the content trigrams present in the `trigram` set; store as a new column.
clean_complete_df['contents_tri'] = find_trigram_(contents)

In [92]:
# Bigram counts for every entry of the 'reviews' column (one FreqDist per row).
reviews2 = get_bigram_(clean_complete_df['reviews'])

In [93]:
# Keep only the review bigrams present in the `bigram` set; store as a new column.
clean_complete_df['reviews_bi'] = find_bigram_(reviews2)

In [94]:
# Bigram counts for every entry of the 'content' column (one FreqDist per row).
contents2 = get_bigram_(clean_complete_df['content'])

In [95]:
# Keep only the content bigrams present in the `bigram` set; store as a new column.
clean_complete_df['contents_bi'] = find_bigram_(contents2)

In [102]:
# Sanity check of the frame's dimensions after adding the four n-gram columns.
clean_complete_df.shape


Out[102]:
(1549, 9)

In [104]:
# Spot-check the filtered content bigrams for the entry selected by key 1001.
# NOTE(review): presumably 1001 is an index label rather than a position - confirm.
clean_complete_df['contents_bi'][1001]


Out[104]:
{('alison', 'pill'): 3,
 ('award', 'nomin'): 1,
 ('comedydrama', 'film'): 1,
 ('comic', 'book'): 1,
 ('extern', 'link'): 1,
 ('film', 'direct'): 1,
 ('film', 'director'): 1,
 ('film', 'festiv'): 1,
 ('gael', 'garcia'): 3,
 ('garcia', 'bernal'): 3,
 ('intern', 'film'): 1,
 ('premier', 'toronto'): 1,
 ('refer', 'extern'): 1,
 ('stori', 'cast'): 1,
 ('toronto', 'intern'): 1}

In [100]:
# Spot-check the filtered content bigrams for the entry selected by key 0.
clean_complete_df['contents_bi'][0]


Out[100]:
{('2016', 'american'): 1,
 ('begin', 'investig'): 1,
 ('extern', 'link'): 1,
 ('film', 'saw'): 1,
 ('film', 'written'): 1,
 ('horror', 'film'): 1,
 ('link', 'offici'): 1,
 ('offici', 'websit'): 1,
 ('peopl', 'die'): 1,
 ('psycholog', 'horror'): 1,
 ('refer', 'extern'): 1,
 ('releas', 'film'): 1,
 ('sleep', 'paralysi'): 1,
 ('social', 'worker'): 1,
 ('world', 'premier'): 1}

In [108]:
# Spot-check the filtered review bigrams for the entry selected by key 10.
clean_complete_df['reviews_bi'][10]


Out[108]:
{('american', 'film'): 1,
 ('begin', 'end'): 1,
 ('cage', 'charact'): 1,
 ('charact', 'base'): 1,
 ('compel', 'film'): 1,
 ('complet', 'surpris'): 1,
 ('documentari', 'show'): 1,
 ('extrem', 'import'): 1,
 ('feel', 'like'): 1,
 ('film', 'guess'): 1,
 ('film', 'reveal'): 1,
 ('good', 'american'): 1,
 ('help', 'lot'): 1,
 ('inner', 'work'): 1,
 ('know', 'bit'): 1,
 ('like', 'happen'): 1,
 ('lot', 'make'): 1,
 ('make', 'sens'): 1,
 ('man', 'good'): 1,
 ('name', 'name'): 1,
 ('nicola', 'cage'): 1}

In [106]:
# Persist the frame, including the four n-gram columns added above, as a pickle file.
clean_complete_df.to_pickle('clean_complete_ngram.pickle')

In [107]:
# Read the saved file back to confirm that it loads.
# NOTE(review): the file is written with DataFrame.to_pickle but read with
# joblib.load; pandas.read_pickle is the matching reader - consider switching.
temp =  joblib.load('clean_complete_ngram.pickle')

In [109]:
# Preview the first rows of the reloaded frame.
temp.head()


Out[109]:
title year synopsis reviews content reviews_tri contents_tri reviews_bi contents_bi
0 Dead Awake 2016 investig death twin sister sleep social worker... movi new havent seen name 3 4 last yeari famil... dead awak 2016 american supernatur psycholog h... {('good', 'horror', 'film'): 1, ('good', 'job'... {('refer', 'extern', 'link'): 1, ('extern', 'l... {('right', 'right'): 1, ('best', 'thriller'): ... {('world', 'premier'): 1, ('psycholog', 'horro...
10 A Good American 2015 documentari reveal truth nsa cryptologist inno... documentari show situat idea know bit heard re... good american 2015 austrian documentari film c... {} {('new', 'york', 'time'): 1, ('refer', 'extern... {('make', 'sens'): 1, ('compel', 'film'): 1, (... {('produc', 'direct'): 1, ('score', 'film'): 1...
11 Hard Tide 2015 drug dealer who emul father success crimin car... watch rot last night tempt dont bother script ... hard tide 2015 british crime drama written dir... {('nine', 'year', 'old'): 1, ('doesnt', 'take'... {('gave', 'film', 'posit'): 1, ('recept', 'rot... {('want', 'good'): 1, ('watch', 'film'): 1, ('... {('hard', 'time'): 1, ('35', 'star'): 1, ('dru...
13 Carrie Pilby 2016 social awkward 19yearold geniu make big plan s... excit see film toronto filmfest last week enjo... carri pilbi 2016 american comedi film direct s... {('toronto', 'film', 'festiv'): 1, ('realli', ... {('acquir', 'distribut', 'right'): 1, ('right'... {('good', 'role'): 1, ('watch', 'film'): 1, ('... {('film', 'star'): 1, ('refer', 'extern'): 1, ...
14 A Dark Song 2016 griev death son woman hire occult expert lead ... writer felt job review mere whine prattl happe... dark song 2016 irish independ horror film writ... {('good', 'horror', 'film'): 1, ('time', 'writ... {('film', 'festiv', 'releas'): 1, ('london', '... {('vast', 'major'): 1, ('fantast', 'film'): 1,... {('end', 'definit'): 1, ('festiv', 'releas'): ...

In [ ]: