In [1]:
# Text-preprocessing imports: pandas for the review data, NLTK for
# stopword lists and stemming, stdlib re/string for punctuation handling.
import pandas as pd
from nltk.corpus import stopwords
#nltk.download('stopwords')  # one-time corpus download; uncomment on first run
import string
from string import punctuation
import re
import numpy as np
# NOTE(review): SnowballStemmer and `punctuation` are imported but unused
# in the visible code — candidates for removal.
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import PorterStemmer
# Single Porter stemmer instance shared by the stemming helpers below.
ps = PorterStemmer()
In [10]:
# Load the combined reviews dataset.
# NOTE(review): hardcoded absolute local path — consider a configurable DATA_DIR.
# ISO-8859-1 avoids decode errors, but confirm it is the file's true encoding.
sub_df = pd.read_csv('/Users/Cecilia/Desktop/independent_study/reviews_combined.csv', encoding = "ISO-8859-1")
In [11]:
# Load the custom stopword list (one word per line), stripping whitespace.
# A context manager closes the file deterministically — the original
# open(...).readlines() left the handle to the garbage collector.
with open('/Users/Cecilia/Desktop/independent_study/stop_words.txt') as stop_file:
    stopwords1 = [line.strip() for line in stop_file]
In [12]:
# NLTK's built-in English stopword list.
stop = stopwords.words('english')
# Merge the custom list and NLTK's into one lookup set.
STOPWORDS = {word for source in (stopwords1, stop) for word in source}
In [13]:
# One regex character class covering every ASCII punctuation mark —
# equivalent to (and simpler than) the original '|'-joined alternation.
re_punc = '[' + re.escape(string.punctuation) + ']'
# Strip punctuation from the three text columns. regex=True is passed
# explicitly: pandas 2.x defaults Series.str.replace to literal matching,
# under which this pattern would match nothing and silently remove nothing.
sub_df['synopsis'] = sub_df['synopsis'].str.replace(re_punc, '', regex=True)
sub_df['reviews'] = sub_df['reviews'].str.replace(re_punc, '', regex=True)
sub_df['content'] = sub_df['content'].str.replace(re_punc, '', regex=True)
In [14]:
# Collapse each text column into one long document, separating rows
# with ' . ' so sentence boundaries survive the concatenation.
synopsis, review, content = (
    sub_df[col].str.cat(sep=' . ') for col in ('synopsis', 'reviews', 'content')
)
In [15]:
# Join the three corpora with the same ' . ' separator used between rows;
# plain '+' concatenation fused the last word of one corpus with the
# first word of the next into a single bogus token.
text = ' . '.join((synopsis, review, content))
In [18]:
# Normalize case before stopword comparison (both lists are lowercase).
text = text.lower()
# Filter against the merged STOPWORDS set (custom list + NLTK). The
# original filtered only against NLTK's `stop` list, so the custom
# stop_words.txt entries loaded earlier were never applied; a set also
# gives O(1) membership tests instead of O(n) per word.
text = [word for word in text.split() if word not in STOPWORDS]
In [19]:
def stem(word):
    """Return the Porter stem of a single word."""
    return ps.stem(word)

# np.vectorize is only a Python-level loop (no real vectorization), so a
# plain list comprehension does the same work without the numpy call
# overhead; the downstream ' '.join accepts a list just as it did the
# ndarray the original returned.
def stem_(words):
    """Stem every token in an iterable of words; returns a list of strings."""
    return [ps.stem(w) for w in words]
In [20]:
text = stem_(text)  # stem every token in the corpus
In [21]:
text1 = ' '.join(text)  # reassemble stemmed tokens into one space-separated string
In [23]:
text1[0:200]  # preview the first 200 characters of the processed corpus
Out[23]:
In [24]:
len(text1)  # total length (in characters) of the processed corpus
Out[24]:
In [105]:
# Persist the processed corpus in Latin-1.
# NOTE(review): ISO-8859-1 will raise UnicodeEncodeError if the corpus
# contains any character outside Latin-1; this also duplicates the UTF-8
# write in the next cell — confirm whether both outputs are needed.
with open("Output.txt", "w", encoding = "ISO-8859-1") as text_file:
    text_file.write(text1)
In [25]:
# Persist the processed corpus as UTF-8 (lossless for any character).
with open("Output1.txt", "w", encoding = "utf-8") as text_file:
    text_file.write(text1)
In [26]:
# IPython line magic: show the current working directory (where the
# Output files above were written).
pwd
Out[26]:
In [ ]: