In [1]:
import numpy as np
import pandas as pd
import nltk
import os
from sklearn.externals import joblib
#nltk.download('stopwords')
from nltk.corpus import stopwords
import string
from string import punctuation
import re
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import PorterStemmer
ps = PorterStemmer()

combine data


In [ ]:
# read in data 
df1 = pd.read_csv('./../data/reviews_wiki_0_1000_v2.csv',  encoding = "ISO-8859-1")
df2 = pd.read_csv('./../data/reviews_wiki_1000_2000.csv',  encoding = "ISO-8859-1")
df3 = pd.read_csv('./../data/reviews_wiki_2000_3101.csv',  encoding = "ISO-8859-1")

# drop the first columns
df1.drop('Unnamed: 0', 1, inplace=True)
df2.drop('Unnamed: 0', 1, inplace=True)
df3.drop('Unnamed: 0', 1, inplace=True)

In [ ]:
# merge data
df = df1.append(df2, ignore_index=True)
df = df.append(df3, ignore_index=True)

In [ ]:
df.shape

In [202]:
# split wiki_titile
regex = re.compile('.*(\((\d{4}\s+)?film\))')

In [203]:
def split_film(movie_title):
    match = regex.match(movie_title)
    try: 
        res = match.group(1)
    except AttributeError:
        res = None
    return res
split_film_ = np.vectorize(split_film)

In [204]:
def split_year(movie_title):
    match = regex.match(movie_title)
    try: 
        res = match.group(2).strip()
    except AttributeError:
        res = None
    return res
split_year_ = np.vectorize(split_year)

In [205]:
# split film and year 
df['Ind_film'] = split_film_(df['wiki_title'])
df['year'] = split_year_(df['wiki_title'])

In [207]:
# summarise the number of wiki_titles
title_count = df.groupby(['title'])[['wiki_title']].count()
title_count.reset_index(inplace=True)
title_count = title_count.rename(index=str, columns={"title": "title", "wiki_title": "count"})

In [208]:
df = df.merge(title_count, how='left', on='title')

In [209]:
sub_df = df[(df['count'] == 1) | (df['Ind_film'] != 'None')]

In [213]:
# movies with year
df_w_year = sub_df[sub_df['year'] != 'None']
df_wo_year = sub_df[sub_df['year'] == 'None']

In [214]:
# convert string into numeric
df_w_year['year'] = pd.to_numeric(df_w_year['year'])

# two duplicate rows 
df_wo_year.drop_duplicates(inplace=True)


C:\Users\yanxi\Anaconda3\lib\site-packages\ipykernel\__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
C:\Users\yanxi\Anaconda3\lib\site-packages\pandas\util\decorators.py:91: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return func(*args, **kwargs)

In [215]:
# read in IMBD reviews
imdb = pd.read_csv('movies_reviews_all.csv')
imdb.drop(['Unnamed: 0'], axis=1, inplace=True)

In [216]:
sub_imdb = imdb.copy(deep=True)
sub_imdb.drop(['imdbID'], axis=1, inplace=True)

In [217]:
sub_imdb = pd.merge(sub_imdb, df_w_year[['title', 'year', 'content']], on=['title', 'year'], how='left')

sub_imdb = pd.merge(sub_imdb, df_wo_year[['title', 'content']], on=['title'], how='left')

In [218]:
def combine_cols(val1, val2):
    if pd.isnull(val1):
        return val2
    if pd.isnull(val2):
        return val1

In [219]:
sub_imdb['content'] = sub_imdb.apply(lambda row: combine_cols(row['content_x'], row['content_y']), axis=1)
sub_imdb.drop(['content_x', 'content_y'], axis=1, inplace=True)

In [236]:
# remove missing rows
df_combined = sub_imdb[(sub_imdb['reviews'].notnull())&(sub_imdb['content'].notnull())]

In [241]:
df_combined.shape


Out[241]:
(1549, 5)

In [243]:
df_combined.to_csv('reviews_combined.csv', index=False)

In [244]:
joblib.dump(df_combined, 'reviews_combined')


Out[244]:
['reviews_combined',
 'reviews_combined_01.npy',
 'reviews_combined_02.npy',
 'reviews_combined_03.npy',
 'reviews_combined_04.npy']

In [242]:
df_combined.head(20)


Out[242]:
title year synopsis reviews content
0 Dead Awake 2016 While investigating the death of her twin sist... The movie is nothing new, nothing you haven't ... Dead Awake is a 2016 American supernatural psy...
10 A Good American 2015 This documentary reveals the truth about an NS... Many documentaries show us situations that we ... A Good American is a 2015 Austrian documentary...
11 Hard Tide 2015 A drug dealer who's been emulating his father'... Watched this rot last night. If your tempted d... Hard Tide is a 2015 British crime drama writte...
13 Carrie Pilby 2016 A socially awkward 19-year-old genius makes bi... I was so excited to see this film at the Toron... Carrie Pilby is a 2016 American comedy film di...
14 A Dark Song 2016 Grieving the death of her son, a woman hires a... This writer has always felt that the job of a ... A Dark Song is a 2016 Irish independent horror...
18 Bright Star 2009 This drama details the passionate three-year r... With such high hopes for a film, a letdown is ... Bright Star is a 2009 British-French-Australia...
19 Chloe 2009 Suspecting her husband of infidelity, Catherin... This film reminded me of the 90's wave of erot... Chloe is a 2009 erotic thriller film directed ...
20 Easy Virtue 2008 An American widow impulsively marries a wealth... Easy Virtue is a very liberal adaptation of No... Easy Virtue is a 2008 British romantic comedy ...
21 Kicking and Screaming 1995 After graduating, four college roomies -- petr... This is simply the best "Big Chill" movie sinc... Kicking and Screaming is a 1995 film by Noah B...
22 Vincent N Roxxy 2016 In rural Louisiana, a terse loner forges a red... What a shitty movie. All this movie does is gi... Vincent N Roxxy is a 2016 American action crim...
23 Ali 2001 Boxing legend Muhammad Ali stirred controversy... Some people never liked Ali. He is one of thos... Ali is a 2001 American biographical sports dra...
24 Barefoot in the Park 1967 A pair of newlyweds -- he, a stuffed-shirt law... I'm no great fan of Neil Simon, but this neat ... Barefoot in the Park is a 1967 American comedy...
27 The Chase 1966 After Bubber Reeves escapes from prison, he fi... Of course it's cheesy, it's supposed to be! I... The Chase is a 1966 Technicolor American drama...
28 Dead Poets Society 1989 An unconventional teacher inspires students th... There are certain films that get under your sk... Dead Poets Society is a 1989 American drama fi...
29 Deep Blue Sea 1999 Scientists conduct research on sharks, hoping ... After the opening scene of *Deep Blue Sea*, in... Deep Blue Sea is a 1999 American science ficti...
32 The Electric Horseman 1979 An alcoholic former rodeo champion who's been ... OK, first, to the reviewer that suggested "too... The Electric Horseman is a 1979 American weste...
33 Fracture 2007 Ted Crawford, who allegedly attempted to murde... I thought the intellectual chess game between ... Fracture is a 2007 American-German legal drama...
36 Gone Baby Gone 2007 When a 4-year-old girl vanishes from a rough B... I would consider myself to be a movie buff and... Gone Baby Gone is a 2007 American neo-noir mys...
37 Greenberg 2010 At a crossroads in his life, Roger house-sits ... ... (which, if you've read the other reviews h... Greenberg is a 2010 American comedy-drama film...
39 Hercules 1997 The heavenly Hercules is stripped of his immor... I'm shocked to see this movie with a rating be... Hercules is a 1997 American animated musical f...

clean stopwords, punctuations and finish stemming


In [3]:
df = joblib.load('/Users/yanxi/Documents/2017 Fall/Independent Study/data/reviews_combined')

In [ ]:
stopwords1 = [v.strip() for v in open('stop_words.txt').readlines()]
stop = stopwords.words('english')
STOPWORDS = set(stopwords1).union(set(stop))

In [ ]:
re_punc = '|'.join([re.escape(x) for x in string.punctuation])
# remove punc
df['synopsis'] = df['synopsis'].str.replace(re_punc,'')
df['reviews'] = df['reviews'].str.replace(re_punc,'')
df['content'] = df['content'].str.replace(re_punc,'')

In [ ]:
# to lower case 
df['synopsis'] = df.apply(lambda row: row['synopsis'].lower(), axis=1)
df['reviews'] = df.apply(lambda row: row['reviews'].lower(), axis=1)
df['content'] = df.apply(lambda row: row['content'].lower(), axis=1)

In [ ]:
# split words and remove stop words
def remove_stopwords(text):
    words = [word for word in text.split() if word not in STOPWORDS]
    text_new = " ".join(words)
    return(text_new)

remove_stopwords_ = np.vectorize(remove_stopwords)

In [ ]:
df['synopsis'] = remove_stopwords_(df['synopsis'])
df['reviews'] = remove_stopwords_(df['reviews'])
df['content'] = remove_stopwords_(df['content'])

In [ ]:
# stemming 
def stem(word):
    return ps.stem(word)
stem_ = np.vectorize(stem)

In [ ]:
df['synopsis'] = df.apply(lambda row: " ".join(stem_(row['synopsis'].split())), axis=1)
df['reviews'] = df.apply(lambda row: " ".join(stem_(row['reviews'].split())), axis=1)
df['content'] = df.apply(lambda row: " ".join(stem_(row['content'].split())), axis=1)

In [ ]:
joblib.dump(df, 'clean_complete_df')