In [1]:

    
import numpy as np
import pandas as pd
import nltk
import os
from sklearn.externals import joblib
#nltk.download('stopwords')
from nltk.corpus import stopwords
import string
from string import punctuation
import re
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import PorterStemmer
ps = PorterStemmer()

combine data



In [ ]:

    
# read in data 
df1 = pd.read_csv('./../data/reviews_wiki_0_1000_v2.csv',  encoding = "ISO-8859-1")
df2 = pd.read_csv('./../data/reviews_wiki_1000_2000.csv',  encoding = "ISO-8859-1")
df3 = pd.read_csv('./../data/reviews_wiki_2000_3101.csv',  encoding = "ISO-8859-1")

# drop the first columns
df1.drop('Unnamed: 0', 1, inplace=True)
df2.drop('Unnamed: 0', 1, inplace=True)
df3.drop('Unnamed: 0', 1, inplace=True)



In [ ]:

    
# merge data
df = df1.append(df2, ignore_index=True)
df = df.append(df3, ignore_index=True)



In [ ]:

    
df.shape



In [202]:

    
# split wiki_titile
regex = re.compile('.*(\((\d{4}\s+)?film\))')



In [203]:

    
def split_film(movie_title):
    match = regex.match(movie_title)
    try: 
        res = match.group(1)
    except AttributeError:
        res = None
    return res
split_film_ = np.vectorize(split_film)



In [204]:

    
def split_year(movie_title):
    match = regex.match(movie_title)
    try: 
        res = match.group(2).strip()
    except AttributeError:
        res = None
    return res
split_year_ = np.vectorize(split_year)



In [205]:

    
# split film and year 
df['Ind_film'] = split_film_(df['wiki_title'])
df['year'] = split_year_(df['wiki_title'])



In [207]:

    
# summarise the number of wiki_titles
title_count = df.groupby(['title'])[['wiki_title']].count()
title_count.reset_index(inplace=True)
title_count = title_count.rename(index=str, columns={"title": "title", "wiki_title": "count"})



In [208]:

    
df = df.merge(title_count, how='left', on='title')



In [209]:

    
sub_df = df[(df['count'] == 1) | (df['Ind_film'] != 'None')]



In [213]:

    
# movies with year
df_w_year = sub_df[sub_df['year'] != 'None']
df_wo_year = sub_df[sub_df['year'] == 'None']



In [214]:

    
# convert string into numeric
df_w_year['year'] = pd.to_numeric(df_w_year['year'])

# two duplicate rows 
df_wo_year.drop_duplicates(inplace=True)









    



C:\Users\yanxi\Anaconda3\lib\site-packages\ipykernel\__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
C:\Users\yanxi\Anaconda3\lib\site-packages\pandas\util\decorators.py:91: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return func(*args, **kwargs)



In [215]:

    
# read in IMBD reviews
imdb = pd.read_csv('movies_reviews_all.csv')
imdb.drop(['Unnamed: 0'], axis=1, inplace=True)



In [216]:

    
sub_imdb = imdb.copy(deep=True)
sub_imdb.drop(['imdbID'], axis=1, inplace=True)



In [217]:

    
sub_imdb = pd.merge(sub_imdb, df_w_year[['title', 'year', 'content']], on=['title', 'year'], how='left')

sub_imdb = pd.merge(sub_imdb, df_wo_year[['title', 'content']], on=['title'], how='left')



In [218]:

    
def combine_cols(val1, val2):
    if pd.isnull(val1):
        return val2
    if pd.isnull(val2):
        return val1



In [219]:

    
sub_imdb['content'] = sub_imdb.apply(lambda row: combine_cols(row['content_x'], row['content_y']), axis=1)
sub_imdb.drop(['content_x', 'content_y'], axis=1, inplace=True)



In [236]:

    
# remove missing rows
df_combined = sub_imdb[(sub_imdb['reviews'].notnull())&(sub_imdb['content'].notnull())]



In [241]:

    
df_combined.shape









    Out[241]:





(1549, 5)



In [243]:

    
df_combined.to_csv('reviews_combined.csv', index=False)



In [244]:

    
joblib.dump(df_combined, 'reviews_combined')









    Out[244]:





['reviews_combined',
 'reviews_combined_01.npy',
 'reviews_combined_02.npy',
 'reviews_combined_03.npy',
 'reviews_combined_04.npy']



In [242]:

    
df_combined.head(20)









    Out[242]:






  
    
      
      title
      year
      synopsis
      reviews
      content
    
  
  
    
      0
      Dead Awake
      2016
      While investigating the death of her twin sist...
      The movie is nothing new, nothing you haven't ...
      Dead Awake is a 2016 American supernatural psy...
    
    
      10
      A Good American
      2015
      This documentary reveals the truth about an NS...
      Many documentaries show us situations that we ...
      A Good American is a 2015 Austrian documentary...
    
    
      11
      Hard Tide
      2015
      A drug dealer who's been emulating his father'...
      Watched this rot last night. If your tempted d...
      Hard Tide is a 2015 British crime drama writte...
    
    
      13
      Carrie Pilby
      2016
      A socially awkward 19-year-old genius makes bi...
      I was so excited to see this film at the Toron...
      Carrie Pilby is a 2016 American comedy film di...
    
    
      14
      A Dark Song
      2016
      Grieving the death of her son, a woman hires a...
      This writer has always felt that the job of a ...
      A Dark Song is a 2016 Irish independent horror...
    
    
      18
      Bright Star
      2009
      This drama details the passionate three-year r...
      With such high hopes for a film, a letdown is ...
      Bright Star is a 2009 British-French-Australia...
    
    
      19
      Chloe
      2009
      Suspecting her husband of infidelity, Catherin...
      This film reminded me of the 90's wave of erot...
      Chloe is a 2009 erotic thriller film directed ...
    
    
      20
      Easy Virtue
      2008
      An American widow impulsively marries a wealth...
      Easy Virtue is a very liberal adaptation of No...
      Easy Virtue is a 2008 British romantic comedy ...
    
    
      21
      Kicking and Screaming
      1995
      After graduating, four college roomies -- petr...
      This is simply the best "Big Chill" movie sinc...
      Kicking and Screaming is a 1995 film by Noah B...
    
    
      22
      Vincent N Roxxy
      2016
      In rural Louisiana, a terse loner forges a red...
      What a shitty movie. All this movie does is gi...
      Vincent N Roxxy is a 2016 American action crim...
    
    
      23
      Ali
      2001
      Boxing legend Muhammad Ali stirred controversy...
      Some people never liked Ali. He is one of thos...
      Ali is a 2001 American biographical sports dra...
    
    
      24
      Barefoot in the Park
      1967
      A pair of newlyweds -- he, a stuffed-shirt law...
      I'm no great fan of Neil Simon, but this neat ...
      Barefoot in the Park is a 1967 American comedy...
    
    
      27
      The Chase
      1966
      After Bubber Reeves escapes from prison, he fi...
      Of course it's cheesy, it's supposed to be!  I...
      The Chase is a 1966 Technicolor American drama...
    
    
      28
      Dead Poets Society
      1989
      An unconventional teacher inspires students th...
      There are certain films that get under your sk...
      Dead Poets Society is a 1989 American drama fi...
    
    
      29
      Deep Blue Sea
      1999
      Scientists conduct research on sharks, hoping ...
      After the opening scene of *Deep Blue Sea*, in...
      Deep Blue Sea is a 1999 American science ficti...
    
    
      32
      The Electric Horseman
      1979
      An alcoholic former rodeo champion who's been ...
      OK, first, to the reviewer that suggested "too...
      The Electric Horseman is a 1979 American weste...
    
    
      33
      Fracture
      2007
      Ted Crawford, who allegedly attempted to murde...
      I thought the intellectual chess game between ...
      Fracture is a 2007 American-German legal drama...
    
    
      36
      Gone Baby Gone
      2007
      When a 4-year-old girl vanishes from a rough B...
      I would consider myself to be a movie buff and...
      Gone Baby Gone is a 2007 American neo-noir mys...
    
    
      37
      Greenberg
      2010
      At a crossroads in his life, Roger house-sits ...
      ... (which, if you've read the other reviews h...
      Greenberg is a 2010 American comedy-drama film...
    
    
      39
      Hercules
      1997
      The heavenly Hercules is stripped of his immor...
      I'm shocked to see this movie with a rating be...
      Hercules is a 1997 American animated musical f...

clean stopwords, punctuations and finish stemming



In [3]:

    
df = joblib.load('/Users/yanxi/Documents/2017 Fall/Independent Study/data/reviews_combined')



In [ ]:

    
stopwords1 = [v.strip() for v in open('stop_words.txt').readlines()]
stop = stopwords.words('english')
STOPWORDS = set(stopwords1).union(set(stop))



In [ ]:

    
re_punc = '|'.join([re.escape(x) for x in string.punctuation])
# remove punc
df['synopsis'] = df['synopsis'].str.replace(re_punc,'')
df['reviews'] = df['reviews'].str.replace(re_punc,'')
df['content'] = df['content'].str.replace(re_punc,'')



In [ ]:

    
# to lower case 
df['synopsis'] = df.apply(lambda row: row['synopsis'].lower(), axis=1)
df['reviews'] = df.apply(lambda row: row['reviews'].lower(), axis=1)
df['content'] = df.apply(lambda row: row['content'].lower(), axis=1)



In [ ]:

    
# split words and remove stop words
def remove_stopwords(text):
    words = [word for word in text.split() if word not in STOPWORDS]
    text_new = " ".join(words)
    return(text_new)

remove_stopwords_ = np.vectorize(remove_stopwords)



In [ ]:

    
df['synopsis'] = remove_stopwords_(df['synopsis'])
df['reviews'] = remove_stopwords_(df['reviews'])
df['content'] = remove_stopwords_(df['content'])



In [ ]:

    
# stemming 
def stem(word):
    return ps.stem(word)
stem_ = np.vectorize(stem)



In [ ]:

    
df['synopsis'] = df.apply(lambda row: " ".join(stem_(row['synopsis'].split())), axis=1)
df['reviews'] = df.apply(lambda row: " ".join(stem_(row['reviews'].split())), axis=1)
df['content'] = df.apply(lambda row: " ".join(stem_(row['content'].split())), axis=1)



In [ ]:

    
joblib.dump(df, 'clean_complete_df')

	title	year	synopsis	reviews	content
0	Dead Awake	2016	While investigating the death of her twin sist...	The movie is nothing new, nothing you haven't ...	Dead Awake is a 2016 American supernatural psy...
10	A Good American	2015	This documentary reveals the truth about an NS...	Many documentaries show us situations that we ...	A Good American is a 2015 Austrian documentary...
11	Hard Tide	2015	A drug dealer who's been emulating his father'...	Watched this rot last night. If your tempted d...	Hard Tide is a 2015 British crime drama writte...
13	Carrie Pilby	2016	A socially awkward 19-year-old genius makes bi...	I was so excited to see this film at the Toron...	Carrie Pilby is a 2016 American comedy film di...
14	A Dark Song	2016	Grieving the death of her son, a woman hires a...	This writer has always felt that the job of a ...	A Dark Song is a 2016 Irish independent horror...
18	Bright Star	2009	This drama details the passionate three-year r...	With such high hopes for a film, a letdown is ...	Bright Star is a 2009 British-French-Australia...
19	Chloe	2009	Suspecting her husband of infidelity, Catherin...	This film reminded me of the 90's wave of erot...	Chloe is a 2009 erotic thriller film directed ...
20	Easy Virtue	2008	An American widow impulsively marries a wealth...	Easy Virtue is a very liberal adaptation of No...	Easy Virtue is a 2008 British romantic comedy ...
21	Kicking and Screaming	1995	After graduating, four college roomies -- petr...	This is simply the best "Big Chill" movie sinc...	Kicking and Screaming is a 1995 film by Noah B...
22	Vincent N Roxxy	2016	In rural Louisiana, a terse loner forges a red...	What a shitty movie. All this movie does is gi...	Vincent N Roxxy is a 2016 American action crim...
23	Ali	2001	Boxing legend Muhammad Ali stirred controversy...	Some people never liked Ali. He is one of thos...	Ali is a 2001 American biographical sports dra...
24	Barefoot in the Park	1967	A pair of newlyweds -- he, a stuffed-shirt law...	I'm no great fan of Neil Simon, but this neat ...	Barefoot in the Park is a 1967 American comedy...
27	The Chase	1966	After Bubber Reeves escapes from prison, he fi...	Of course it's cheesy, it's supposed to be! I...	The Chase is a 1966 Technicolor American drama...
28	Dead Poets Society	1989	An unconventional teacher inspires students th...	There are certain films that get under your sk...	Dead Poets Society is a 1989 American drama fi...
29	Deep Blue Sea	1999	Scientists conduct research on sharks, hoping ...	After the opening scene of Deep Blue Sea, in...	Deep Blue Sea is a 1999 American science ficti...
32	The Electric Horseman	1979	An alcoholic former rodeo champion who's been ...	OK, first, to the reviewer that suggested "too...	The Electric Horseman is a 1979 American weste...
33	Fracture	2007	Ted Crawford, who allegedly attempted to murde...	I thought the intellectual chess game between ...	Fracture is a 2007 American-German legal drama...
36	Gone Baby Gone	2007	When a 4-year-old girl vanishes from a rough B...	I would consider myself to be a movie buff and...	Gone Baby Gone is a 2007 American neo-noir mys...
37	Greenberg	2010	At a crossroads in his life, Roger house-sits ...	... (which, if you've read the other reviews h...	Greenberg is a 2010 American comedy-drama film...
39	Hercules	1997	The heavenly Hercules is stripped of his immor...	I'm shocked to see this movie with a rating be...	Hercules is a 1997 American animated musical f...