In [1]:
from sklearn.externals import joblib
from collections import Counter
import numpy as np
import pandas as pd
import re

In [2]:
# read data
# Load the pickled frame holding the cleaned texts and their bi-/tri-gram counts.
# joblib.load unpickles arbitrary objects, so only point this at a trusted file.
# NOTE(review): `sklearn.externals.joblib` is gone from recent scikit-learn
# releases; confirm the environment still provides it (or use standalone joblib).
df = joblib.load(
    '../../data/clean_complete_ngram.pickle'
)

In [3]:
# Peek at the first two rows to confirm the columns that were loaded.
df.head(n=2)


Out[3]:
title year synopsis reviews content reviews_tri contents_tri reviews_bi contents_bi
0 Dead Awake 2016 investig death twin sister sleep social worker... movi new havent seen name 3 4 last yeari famil... dead awak 2016 american supernatur psycholog h... {('first', 'time', 'saw'): 1, ('like', 'horror... {('link', 'offici', 'websit'): 1, ('extern', '... {('happen', 'that'): 1, ('time', 'year'): 1, (... {('film', 'saw'): 1, ('sleep', 'paralysi'): 1,...
10 A Good American 2015 documentari reveal truth nsa cryptologist inno... documentari show situat idea know bit heard re... good american 2015 austrian documentari film c... {} {('critic', 'recept', 'film'): 1, ('new', 'yor... {('american', 'film'): 1, ('charact', 'base'):... {('good', 'american'): 2, ('produc', 'direct')...

In [4]:
def remove_ngram(grams, text):
    """Delete every occurrence of the given n-grams from ``text``.

    Each n-gram is joined with single spaces and matched *literally* as a run
    of whole, whitespace-delimited tokens.

    Parameters
    ----------
    grams : iterable of sequences of str
        The n-grams to remove, e.g. ``[('film', 'saw'), ('new', 'york')]``.
    text : str
        Space-separated text to strip the n-grams from.

    Returns
    -------
    str
        ``text`` with each matched n-gram replaced by the empty string. The
        whitespace around a removed n-gram is left in place.
    """
    for gram in grams:
        phrase = " ".join(gram)
        if not phrase:
            # An empty n-gram has nothing to remove.
            continue
        # re.escape: tokens are data, not regex syntax -- without it a token
        # such as "c++" raises re.error and "a.b" matches unrelated text.
        # The lookarounds pin the match to token boundaries so that e.g.
        # "time year" is not carved out of "sometime yearli".
        pattern = r"(?<!\S)" + re.escape(phrase) + r"(?!\S)"
        text = re.sub(pattern, "", text)
    return text

In [5]:
# remove tri-grams
# For each row, strip the keys of its tri-gram mapping out of the matching raw
# text column and store the result in a new `cleaned_*` column.
df['cleaned_content'] = df.apply(
    lambda record: remove_ngram(list(record['contents_tri']), record['content']),
    axis=1,
)
df['cleaned_reviews'] = df.apply(
    lambda record: remove_ngram(list(record['reviews_tri']), record['reviews']),
    axis=1,
)

In [6]:
# remove bi-grams
# Works on the `cleaned_*` columns (tri-grams already stripped) and overwrites
# them, so this cell must run after the tri-gram cell.
df['cleaned_content'] = df.apply(
    lambda record: remove_ngram(list(record['contents_bi']), record['cleaned_content']),
    axis=1,
)
df['cleaned_reviews'] = df.apply(
    lambda record: remove_ngram(list(record['reviews_bi']), record['cleaned_reviews']),
    axis=1,
)

In [7]:
# split text into mono-grams
# Whatever tokens remain after n-gram removal are split on whitespace and
# tallied per row with a Counter.
df['reviews_mono'] = df.apply(
    lambda record: Counter(record['cleaned_reviews'].split()),
    axis=1,
)
df['contents_mono'] = df.apply(
    lambda record: Counter(record['cleaned_content'].split()),
    axis=1,
)

In [8]:
# Inspect the first three rows, now including the cleaned text and mono-gram columns.
df.head(n=3)


Out[8]:
title year synopsis reviews content reviews_tri contents_tri reviews_bi contents_bi cleaned_content cleaned_reviews reviews_mono contents_mono
0 Dead Awake 2016 investig death twin sister sleep social worker... movi new havent seen name 3 4 last yeari famil... dead awak 2016 american supernatur psycholog h... {('first', 'time', 'saw'): 1, ('like', 'horror... {('link', 'offici', 'websit'): 1, ('extern', '... {('happen', 'that'): 1, ('time', 'year'): 1, (... {('film', 'saw'): 1, ('sleep', 'paralysi'): 1,... dead awak supernatur written jeffrey reddick... name last yeari familiar actual sleep paral... {'movi': 3, 'bathtub': 1, 'writer': 1, 'occur'... {'grant': 2, 'target': 1, 'bowman': 2, 'myster...
10 A Good American 2015 documentari reveal truth nsa cryptologist inno... documentari show situat idea know bit heard re... good american 2015 austrian documentari film c... {} {('critic', 'recept', 'film'): 1, ('new', 'yor... {('american', 'film'): 1, ('charact', 'base'):... {('good', 'american'): 2, ('produc', 'direct')... 2015 austrian chronicl work whistleblow will... situat idea heard read deep reason documenta... {'predict': 1, 'current': 1, 'piec': 1, 'creat... {'inventor': 1, 'conspiraci': 1, 'gener': 1, '...
11 Hard Tide 2015 drug dealer who emul father success crimin car... watch rot last night tempt dont bother script ... hard tide 2015 british crime drama written dir... {('end', 'credit', 'roll'): 1, ('make', 'film'... {('film', 'festiv', 'septemb'): 1, ('film', 'p... {('movi', 'releas'): 1, ('right', 'thing'): 1,... {('threaten', 'kill'): 1, ('father', 'death'):... hard tide 2015 british robert osman nathanae... watch rot tempt clueless act cringeabl folk... {'folk': 1, 'council': 1, 'famili': 1, 'nathan... {'protest': 1, 'council': 1, '6510': 1, 'frien...

In [11]:
# Keep only the identifying columns and the n-gram count columns; copy so that
# later changes to `sub_df` cannot touch `df`.
sub_df = df[[
    'title', 'year',
    'reviews_mono', 'contents_mono',
    'reviews_bi', 'contents_bi',
    'reviews_tri', 'contents_tri',
]].copy()

In [12]:
# Show the first five rows of the reduced frame.
sub_df.head(n=5)


Out[12]:
title year reviews_mono contents_mono reviews_bi contents_bi reviews_tri contents_tri
0 Dead Awake 2016 {'movi': 3, 'bathtub': 1, 'writer': 1, 'occur'... {'grant': 2, 'target': 1, 'bowman': 2, 'myster... {('happen', 'that'): 1, ('time', 'year'): 1, (... {('film', 'saw'): 1, ('sleep', 'paralysi'): 1,... {('first', 'time', 'saw'): 1, ('like', 'horror... {('link', 'offici', 'websit'): 1, ('extern', '...
10 A Good American 2015 {'predict': 1, 'current': 1, 'piec': 1, 'creat... {'inventor': 1, 'conspiraci': 1, 'gener': 1, '... {('american', 'film'): 1, ('charact', 'base'):... {('good', 'american'): 2, ('produc', 'direct')... {} {('critic', 'recept', 'film'): 1, ('new', 'yor...
11 Hard Tide 2015 {'folk': 1, 'council': 1, 'famili': 1, 'nathan... {'protest': 1, 'council': 1, '6510': 1, 'frien... {('movi', 'releas'): 1, ('right', 'thing'): 1,... {('threaten', 'kill'): 1, ('father', 'death'):... {('end', 'credit', 'roll'): 1, ('make', 'film'... {('film', 'festiv', 'septemb'): 1, ('film', 'p...
13 Carrie Pilby 2016 {'movi': 4, 'smart': 1, 'happi': 2, 'edg': 1, ... {'due': 1, '9': 1, 'cartsoni': 1, 'writer': 1,... {('live', 'new'): 2, ('netflix', 'stream'): 1,... {('limit', 'releas'): 2, ('princip', 'photogra... {('19', 'year', 'old'): 1, ('intern', 'film', ... {('intern', 'film', 'festiv'): 2, ('releas', '...
14 A Dark Song 2016 {'claim': 2, 'cheap': 2, 'behest': 1, 'wander'... {'dilapid': 1, 'audaci': 1, 'comic': 1, 'demon... {('good', 'premis'): 1, ('make', 'sens'): 1, (... {('two', 'complet'): 1, ('film', 'festiv'): 2,... {('know', 'go', 'happen'): 1, ('featur', 'film... {('film', 'festiv', 'releas'): 1, ('recept', '...

In [14]:
# Persist the reduced frame under the relative name 'ngrams' (resolved against
# the current working directory). The recorded output lists companion
# 'ngrams_0N.npy' files next to it, so all of them have to be kept together.
joblib.dump(sub_df, 'ngrams')


Out[14]:
['ngrams', 'ngrams_01.npy', 'ngrams_02.npy', 'ngrams_03.npy', 'ngrams_04.npy']

In [ ]: