In [1]:
import pandas as pd
from nltk.corpus import stopwords
#nltk.download('stopwords')
import string
from string import punctuation
import re
import numpy as np
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [10]:
sub_df = pd.read_csv('/Users/Cecilia/Desktop/independent_study/reviews_combined.csv', encoding = "ISO-8859-1")

In [11]:
stopwords1 = [v.strip() for v in open('/Users/Cecilia/Desktop/independent_study/stop_words.txt').readlines()]

In [12]:
stop = stopwords.words('english')
STOPWORDS = set(stopwords1).union(set(stop))

In [13]:
re_punc = '|'.join([re.escape(x) for x in string.punctuation])
# remove punc
sub_df['synopsis'] = sub_df['synopsis'].str.replace(re_punc,'')
sub_df['reviews'] = sub_df['reviews'].str.replace(re_punc,'')
sub_df['content'] = sub_df['content'].str.replace(re_punc,'')

In [14]:
synopsis = sub_df['synopsis'].str.cat(sep=' . ')
review = sub_df['reviews'].str.cat(sep=' . ')
content = sub_df['content'].str.cat(sep=' . ')

In [15]:
text = synopsis + review + content

In [18]:
text = text.lower()
text = [word for word in text.split() if word not in stop]

In [19]:
def stem(word):
    return ps.stem(word)
stem_ = np.vectorize(stem)

In [20]:
text = stem_(text)

In [21]:
text1 = ' '.join(text)

In [23]:
text1[0:200]


Out[23]:
'investig death twin sister sleep social worker unwittingli open door evil supernatur entiti . documentari reveal truth nsa cryptologist whose innov surveil program elimin week 911 attack . drug dealer'

In [24]:
len(text1)


Out[24]:
131055495

In [105]:
with open("Output.txt", "w", encoding = "ISO-8859-1") as text_file:
    text_file.write(text1)

In [25]:
with open("Output1.txt", "w", encoding = "utf-8") as text_file:
    text_file.write(text1)

In [26]:
pwd


Out[26]:
'/Users/Cecilia/Desktop/independent_study/Natural_Language_Processing/n-gramming'

In [ ]: