In [1]:
import joblib  # sklearn.externals.joblib is deprecated; use the standalone joblib package
import pandas as pd
import numpy as np
from collections import Counter
from collections import OrderedDict
import matplotlib.pyplot as plt
from scipy.spatial.distance import cosine

In [43]:
data = joblib.load('../data/clean_complete_ngram.pickle')

In [44]:
data.head(5)


Out[44]:
title year synopsis reviews content reviews_tri contents_tri reviews_bi contents_bi
0 Dead Awake 2016 investig death twin sister sleep social worker... movi new havent seen name 3 4 last yeari famil... dead awak 2016 american supernatur psycholog h... {('nightmar', 'elm', 'street'): 4, ('done', 'g... {('link', 'offici', 'websit'): 1, ('psycholog'... {('done', 'peopl'): 1, ('sleep', 'paralysi'): ... {('sleep', 'paralysi'): 1, ('social', 'worker'...
10 A Good American 2015 documentari reveal truth nsa cryptologist inno... documentari show situat idea know bit heard re... good american 2015 austrian documentari film c... {} {('refer', 'extern', 'link'): 1, ('link', 'off... {('film', 'reveal'): 1, ('complet', 'surpris')... {('film', 'produc'): 1, ('film', 'score'): 1, ...
11 Hard Tide 2015 drug dealer who emul father success crimin car... watch rot last night tempt dont bother script ... hard tide 2015 british crime drama written dir... {('8', 'year', 'old'): 1, ('doesnt', 'take', '... {('recept', 'rotten', 'tomato'): 1, ('gave', '... {('wrote', 'direct'): 1, ('writer', 'director'... {('releas', 'theatric'): 1, ('aggreg', 'report...
13 Carrie Pilby 2016 social awkward 19yearold geniu make big plan s... excit see film toronto filmfest last week enjo... carri pilbi 2016 american comedi film direct s... {('toronto', 'intern', 'film'): 1, ('new', 'yo... {('comedi', 'film', 'direct'): 1, ('releas', '... {('intrigu', 'premis'): 1, ('sweet', 'littl'):... {('film', 'produc'): 2, ('video', 'demand'): 2...
14 A Dark Song 2016 griev death son woman hire occult expert lead ... writer felt job review mere whine prattl happe... dark song 2016 irish independ horror film writ... {('actual', 'look', 'like'): 1, ('absolut', 'l... {('recept', 'film', 'receiv'): 1, ('london', '... {('two', 'line'): 1, ('open', 'door'): 1, ('re... {('approv', 'rate'): 1, ('drive', 'away'): 1, ...

In [4]:
# split into mono-grams; zip() wraps each token in a 1-tuple so the keys
# have the same shape as the tuple keys of the bi-gram and tri-gram Counters
data['reviews_mono'] = data.apply(lambda row: Counter(zip(row['reviews'].split())), axis=1)
data['content_mono'] = data.apply(lambda row: Counter(zip(row['content'].split())), axis=1)
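The zip() call is what turns each token into a 1-tuple. A standalone toy check, not part of the pipeline:

    Counter(zip("a b a".split()))
    # Counter({('a',): 2, ('b',): 1})  -- keys are 1-tuples, like ('sank',) above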

In [5]:
data['reviews_n'] = data['reviews_tri'].copy()
data['content_n'] = data['contents_tri'].copy()

In [6]:
# combine mono-grams, bi-grams and tri-grams
data['reviews_n'] = data.apply(lambda row: {**row['reviews_n'],**row['reviews_bi'], **row['reviews_mono']}, axis=1)
data['content_n'] = data.apply(lambda row: {**row['content_n'],**row['contents_bi'], **row['content_mono']}, axis=1)

In [7]:
# union of review and content n-grams; on shared keys the content_n count
# wins, but only the key set is used in the next step
data['n_gram'] = data.apply(lambda row: {**row['reviews_n'], **row['content_n']}, axis=1)
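A quick standalone reminder of the {**a, **b} merge semantics this relies on, with hypothetical toy dicts:

    a = {('x',): 2, ('y',): 1}
    b = {('x',): 5, ('z',): 3}
    {**a, **b}
    # {('x',): 5, ('y',): 1, ('z',): 3}  -- the later dict wins on shared keys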

In [8]:
# all-zeros template covering every n-gram seen in either source
data['complete'] = data.apply(lambda row: dict.fromkeys(row['n_gram'], 0), axis=1)
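dict.fromkeys keeps only the keys and gives every one the same value; a toy illustration:

    dict.fromkeys({('a',): 2, ('b',): 1}, 0)
    # {('a',): 0, ('b',): 0}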

In [9]:
def update_dict(d1, d2):
    """Return a Counter over the union of keys: d1's counts plus d2's.
    Here d2 is the all-zeros template, so this pads d1 with a zero entry
    for every n-gram it is missing."""
    d1_ = Counter(d1)
    d2_ = Counter(d2)
    d1_.update(d2_)  # Counter.update adds counts, it does not overwrite
    return d1_
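A standalone check of the padding behaviour, using hypothetical toy counts:

    counts = {('sank',): 2}
    template = {('sank',): 0, ('ship',): 0}
    update_dict(counts, template)
    # Counter({('sank',): 2, ('ship',): 0})  -- union of keys, counts kept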

In [10]:
# pad the imdb (reviews) and wiki (content) counts with the all-zeros
# template so both share exactly the same key set
data['imdb'] = data.apply(lambda row: update_dict(row['reviews_n'], row['complete']), axis=1)
data['wiki'] = data.apply(lambda row: update_dict(row['content_n'], row['complete']), axis=1)

In [11]:
# sort by key so both value vectors align position-by-position
data['imdb'] = data.apply(lambda row: OrderedDict(sorted(row['imdb'].items())), axis=1)
data['wiki'] = data.apply(lambda row: OrderedDict(sorted(row['wiki'].items())), axis=1)
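After sorting, index i in both value lists refers to the same n-gram, which is what the distance computations below assume. A minimal sketch with made-up counts:

    imdb = OrderedDict(sorted({('b',): 1, ('a',): 2}.items()))
    wiki = OrderedDict(sorted({('a',): 0, ('b',): 3}.items()))
    list(imdb.values()), list(wiki.values())
    # ([2, 1], [0, 3])  -- position 0 is ('a',) in both, position 1 is ('b',)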

In [12]:
# Euclidean distance between the aligned count vectors
data['euclidean'] = data.apply(lambda row: np.sqrt(np.sum((np.array(list(row['imdb'].values()))-np.array(list(row['wiki'].values())))**2)), axis=1)
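This is the same quantity np.linalg.norm gives for the difference vector; a self-contained sanity check with made-up vectors:

    u = np.array([2, 1, 0])
    v = np.array([0, 3, 1])
    assert np.isclose(np.linalg.norm(u - v), np.sqrt(np.sum((u - v) ** 2)))  # both 3.0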

In [14]:
# number of n-grams
data['length'] = data.apply(lambda row: len(row['complete']), axis=1)

In [15]:
# normalise by vocabulary size to make distances comparable across titles
data['euclidean_adj'] = data['euclidean'] / data['length']
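As a sanity check against the Titanic row in Out[17] below: 15546.651923 / 36804 ≈ 0.422417, which matches its euclidean_adj value.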

In [16]:
# cosine distance (1 - cosine similarity) between the aligned count vectors
data['cosine'] = data.apply(lambda row: cosine(list(row['imdb'].values()), list(row['wiki'].values())), axis=1)
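Note that scipy's cosine() is a distance, not a similarity; a standalone check of the convention:

    from scipy.spatial.distance import cosine
    cosine([1, 0], [1, 0])  # 0.0 -- identical direction
    cosine([1, 0], [0, 1])  # 1.0 -- orthogonal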

In [17]:
data.loc[data['length'] == data['length'].max()]


Out[17]:
title year synopsis reviews content reviews_tri contents_tri reviews_bi contents_bi reviews_mono ... reviews_n content_n n_gram complete imdb wiki euclidean length euclidean_adj cosine
285 Titanic 1997 distraught engag cruel millionair young woman ... convers turn favorit movi ill mention titan co... titan 1997 american epic romancedisast film di... {('look', 'like', 'he'): 2, ('make', 'audienc'... {('tomato', 'film', 'approv'): 1, ('best', 'vi... {('show', 'way'): 1, ('realli', 'strong'): 1, ... {('film', 'produc'): 1, ('molli', 'brown'): 4,... {('sank',): 167, ('absenc',): 4, ('whiten',): ... ... {('emot', 'touch'): 1, ('fragment',): 3, ('sor... {('andrew',): 8, ('sank',): 1, ('ian',): 1, ('... {('emot', 'touch'): 1, ('ian',): 1, ('notthi',... {('emot', 'touch'): 0, ('whiten',): 0, ('ameri... {('0',): 9, ('000',): 3, ('001',): 1, ('01',):... {('0',): 0, ('000',): 0, ('001',): 0, ('01',):... 15546.651923 36804 0.422417 0.298777

1 rows × 21 columns


In [29]:
data['euclidean'].hist(bins=100)
plt.show()



In [26]:
data['euclidean_adj'].hist(bins=50)
plt.show()



In [27]:
data['cosine'].hist(bins=50)
plt.show()



In [21]:
# flag titles whose combined vocabulary exceeds 4000 n-grams
data['size_flag'] = data['length'] > 4000

In [28]:
data.boxplot(by=['size_flag'], column='euclidean')
plt.show()



In [22]:
data.boxplot(by=['size_flag'], column='euclidean_adj')
plt.show()



In [23]:
data.boxplot(by=['size_flag'], column='cosine')
plt.show()


