In [1]:
from sklearn.externals import joblib
import pandas as pd
import numpy as np
from collections import Counter
from collections import OrderedDict
import matplotlib.pyplot as plt
import cosine
In [43]:
# Load the pickled dataset used by every cell below. joblib.load unpickles
# the file, so only point it at artifacts you trust.
data = joblib.load(filename='../data/clean_complete_ngram.pickle')
In [44]:
# Preview the first five rows (five is head()'s default row count).
data.head()
Out[44]:
In [4]:
# Count single tokens (mono-grams) per row. Each key is a 1-tuple such as
# ('word',), which is what zipping a single token list produces.
data['reviews_mono'] = data['reviews'].map(
    lambda text: Counter((token,) for token in text.split())
)
data['content_mono'] = data['content'].map(
    lambda text: Counter((token,) for token in text.split())
)
In [5]:
# Start the combined n-gram columns from copies of the tri-gram columns.
# Note the source column for content is spelled 'contents_tri'.
# NOTE(review): assuming these columns hold per-row dicts, copying the column
# does not copy the dict objects themselves — confirm if that matters.
data['reviews_n'] = data['reviews_tri'].copy(deep=True)
data['content_n'] = data['contents_tri'].copy(deep=True)
In [6]:
# Merge the bi-gram and mono-gram dicts into the existing n-gram dicts.
# Sources are applied left to right, so on a duplicate key the later one wins.
data['reviews_n'] = data.apply(
    lambda row: {
        gram: count
        for part in (row['reviews_n'], row['reviews_bi'], row['reviews_mono'])
        for gram, count in part.items()
    },
    axis=1,
)
data['content_n'] = data.apply(
    lambda row: {
        gram: count
        for part in (row['content_n'], row['contents_bi'], row['content_mono'])
        for gram, count in part.items()
    },
    axis=1,
)
In [7]:
# Per-row union of the review and content n-gram dicts. Where a key occurs in
# both, the value from 'content_n' is the one kept.
data['n_gram'] = data.apply(
    lambda row: {
        gram: count
        for part in (row['reviews_n'], row['content_n'])
        for gram, count in part.items()
    },
    axis=1,
)
In [8]:
# Zero-valued template per row: every key of 'n_gram', each mapped to 0.
data['complete'] = data['n_gram'].map(lambda grams: {gram: 0 for gram in grams})
In [9]:
def update_dict(d1, d2):
    """Return a new Counter holding the counts of ``d1`` plus those of ``d2``.

    Neither argument is modified. Keys that appear only in ``d2`` are added
    to the result even when their count is 0, so passing an all-zero mapping
    as ``d2`` pads ``d1`` with explicit zero entries.

    Args:
        d1: Mapping of key -> count (anything ``Counter`` accepts).
        d2: Mapping of key -> count to add on top of ``d1``.

    Returns:
        collections.Counter with the summed counts.
    """
    merged = Counter(d1)
    # update() adds counts and keeps zero-valued keys; ``+`` would drop them.
    merged.update(d2)
    return merged
In [10]:
# Build the full count dicts for imdb (from 'reviews_n') and wiki (from
# 'content_n'): each is padded with the row's zero-valued 'complete' entries.
data['imdb'] = data.apply(
    lambda row: update_dict(d1=row['reviews_n'], d2=row['complete']),
    axis=1,
)
data['wiki'] = data.apply(
    lambda row: update_dict(d1=row['content_n'], d2=row['complete']),
    axis=1,
)
In [11]:
# Rebuild each count dict as an OrderedDict in ascending key order.
data['imdb'] = data['imdb'].map(
    lambda counts: OrderedDict((gram, counts[gram]) for gram in sorted(counts))
)
data['wiki'] = data['wiki'].map(
    lambda counts: OrderedDict((gram, counts[gram]) for gram in sorted(counts))
)
In [12]:
# Euclidean distance between the 'imdb' and 'wiki' count vectors of each row.
# Values are paired by position, so both dicts must list their keys in the
# same order.
data['euclidean'] = data.apply(
    lambda row: np.sqrt(
        np.square(
            np.subtract(list(row['imdb'].values()), list(row['wiki'].values()))
        ).sum()
    ),
    axis=1,
)
In [14]:
# Number of n-gram keys per row, i.e. the size of the 'complete' dict.
data['length'] = data['complete'].map(len)
In [15]:
# Euclidean distance divided by the row's number of n-gram keys.
data['euclidean_adj'] = data['euclidean'].div(data['length'])
In [16]:
# Per-row score from `cosine`, applied to the 'imdb' and 'wiki' count values
# as two plain lists paired by position.
# NOTE(review): the notebook binds `cosine` with a plain `import cosine`, which
# makes it a module, and calling a module raises TypeError. Presumably the
# intended callable lives inside that module or was bound in a cell that is no
# longer here — confirm and import it explicitly.
# NOTE(review): whether this is a similarity or a distance cannot be told from
# this notebook — check the helper before interpreting the plots.
data['cosine'] = data.apply(lambda row: cosine(list(row['imdb'].values()), list(row['wiki'].values())), axis=1)
In [17]:
# Show the row(s) whose 'length' equals the column maximum.
data[data['length'].eq(data['length'].max())]
Out[17]:
In [29]:
# Histogram of the 'euclidean' column, labelled so the figure stands alone.
data['euclidean'].hist(bins=100)
plt.title("Distribution of 'euclidean'")
plt.xlabel('euclidean')
plt.ylabel('number of rows')
plt.show()
In [26]:
# Histogram of the 'euclidean_adj' column, labelled so the figure stands alone.
data['euclidean_adj'].hist(bins=50)
plt.title("Distribution of 'euclidean_adj'")
plt.xlabel('euclidean_adj')
plt.ylabel('number of rows')
plt.show()
In [27]:
# Histogram of the 'cosine' column, labelled so the figure stands alone.
data['cosine'].hist(bins=50)
plt.title("Distribution of 'cosine'")
plt.xlabel('cosine')
plt.ylabel('number of rows')
plt.show()
In [21]:
# Flag rows whose 'length' is strictly greater than 4000.
data['size_flag'] = data['length'].gt(4000)
In [28]:
# Box plot of 'euclidean', one box per 'size_flag' value.
data.boxplot(column='euclidean', by=['size_flag'])
plt.show()
In [22]:
# Box plot of 'euclidean_adj', one box per 'size_flag' value.
data.boxplot(column='euclidean_adj', by=['size_flag'])
plt.show()
In [23]:
# Box plot of 'cosine', one box per 'size_flag' value.
data.boxplot(column='cosine', by=['size_flag'])
plt.show()
In [ ]: