notebook.community

Edit and run



In [1]:

    
import pandas as pd
from nltk.corpus import stopwords
#nltk.download('stopwords')
import string
from string import punctuation
import re
import numpy as np
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import PorterStemmer
ps = PorterStemmer()



In [10]:

    
sub_df = pd.read_csv('/Users/Cecilia/Desktop/independent_study/reviews_combined.csv', encoding = "ISO-8859-1")



In [11]:

    
stopwords1 = [v.strip() for v in open('/Users/Cecilia/Desktop/independent_study/stop_words.txt').readlines()]



In [12]:

    
stop = stopwords.words('english')
STOPWORDS = set(stopwords1).union(set(stop))



In [13]:

    
re_punc = '|'.join([re.escape(x) for x in string.punctuation])
# remove punc
sub_df['synopsis'] = sub_df['synopsis'].str.replace(re_punc,'')
sub_df['reviews'] = sub_df['reviews'].str.replace(re_punc,'')
sub_df['content'] = sub_df['content'].str.replace(re_punc,'')



In [14]:

    
synopsis = sub_df['synopsis'].str.cat(sep=' . ')
review = sub_df['reviews'].str.cat(sep=' . ')
content = sub_df['content'].str.cat(sep=' . ')



In [15]:

    
text = synopsis + review + content



In [18]:

    
text = text.lower()
text = [word for word in text.split() if word not in stop]



In [19]:

    
def stem(word):
    return ps.stem(word)
stem_ = np.vectorize(stem)



In [20]:

    
text = stem_(text)



In [21]:

    
text1 = ' '.join(text)



In [23]:

    
text1[0:200]









    Out[23]:





'investig death twin sister sleep social worker unwittingli open door evil supernatur entiti . documentari reveal truth nsa cryptologist whose innov surveil program elimin week 911 attack . drug dealer'



In [24]:

    
len(text1)









    Out[24]:





131055495



In [105]:

    
with open("Output.txt", "w", encoding = "ISO-8859-1") as text_file:
    text_file.write(text1)



In [25]:

    
with open("Output1.txt", "w", encoding = "utf-8") as text_file:
    text_file.write(text1)



In [26]:

    
pwd









    Out[26]:





'/Users/Cecilia/Desktop/independent_study/Natural_Language_Processing/n-gramming'



In [ ]: